xref: /openbmc/linux/net/core/dev.c (revision b85d4594)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/dst_metadata.h>
103 #include <net/pkt_sched.h>
104 #include <net/checksum.h>
105 #include <net/xfrm.h>
106 #include <linux/highmem.h>
107 #include <linux/init.h>
108 #include <linux/module.h>
109 #include <linux/netpoll.h>
110 #include <linux/rcupdate.h>
111 #include <linux/delay.h>
112 #include <net/iw_handler.h>
113 #include <asm/current.h>
114 #include <linux/audit.h>
115 #include <linux/dmaengine.h>
116 #include <linux/err.h>
117 #include <linux/ctype.h>
118 #include <linux/if_arp.h>
119 #include <linux/if_vlan.h>
120 #include <linux/ip.h>
121 #include <net/ip.h>
122 #include <net/mpls.h>
123 #include <linux/ipv6.h>
124 #include <linux/in.h>
125 #include <linux/jhash.h>
126 #include <linux/random.h>
127 #include <trace/events/napi.h>
128 #include <trace/events/net.h>
129 #include <trace/events/skb.h>
130 #include <linux/pci.h>
131 #include <linux/inetdevice.h>
132 #include <linux/cpu_rmap.h>
133 #include <linux/static_key.h>
134 #include <linux/hashtable.h>
135 #include <linux/vmalloc.h>
136 #include <linux/if_macvlan.h>
137 #include <linux/errqueue.h>
138 #include <linux/hrtimer.h>
139 #include <linux/netfilter_ingress.h>
140 
141 #include "net-sysfs.h"
142 
143 /* Instead of increasing this, you should create a hash table. */
144 #define MAX_GRO_SKBS 8
145 
146 /* This should be increased if a protocol with a bigger head is added. */
147 #define GRO_MAX_HEAD (MAX_HEADER + 128)
148 
149 static DEFINE_SPINLOCK(ptype_lock);
150 static DEFINE_SPINLOCK(offload_lock);
151 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
152 struct list_head ptype_all __read_mostly;	/* Taps */
153 static struct list_head offload_base __read_mostly;
154 
155 static int netif_rx_internal(struct sk_buff *skb);
156 static int call_netdevice_notifiers_info(unsigned long val,
157 					 struct net_device *dev,
158 					 struct netdev_notifier_info *info);
159 
160 /*
161  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
162  * semaphore.
163  *
164  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
165  *
166  * Writers must hold the rtnl semaphore while they loop through the
167  * dev_base_head list, and hold dev_base_lock for writing when they do the
168  * actual updates.  This allows pure readers to access the list even
169  * while a writer is preparing to update it.
170  *
171  * To put it another way, dev_base_lock is held for writing only to
172  * protect against pure readers; the rtnl semaphore provides the
173  * protection against other writers.
174  *
175  * See, for example usages, register_netdevice() and
176  * unregister_netdevice(), which must be called with the rtnl
177  * semaphore held.
178  */
179 DEFINE_RWLOCK(dev_base_lock);
180 EXPORT_SYMBOL(dev_base_lock);
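
/*
 * Example (illustrative sketch; the helper below is hypothetical): a pure
 * reader that only walks the list can rely on RCU alone, while writers such
 * as list_netdevice()/unlist_netdevice() below take the rtnl semaphore plus
 * dev_base_lock for writing.
 *
 *	static int example_count_devices(struct net *net)
 *	{
 *		struct net_device *dev;
 *		int count = 0;
 *
 *		rcu_read_lock();
 *		for_each_netdev_rcu(net, dev)
 *			count++;
 *		rcu_read_unlock();
 *		return count;
 *	}
 */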
181 
182 /* protects napi_hash addition/deletion and napi_gen_id */
183 static DEFINE_SPINLOCK(napi_hash_lock);
184 
185 static unsigned int napi_gen_id;
186 static DEFINE_HASHTABLE(napi_hash, 8);
187 
188 static seqcount_t devnet_rename_seq;
189 
190 static inline void dev_base_seq_inc(struct net *net)
191 {
192 	while (++net->dev_base_seq == 0);
193 }
194 
195 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
196 {
197 	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
198 
199 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
200 }
201 
202 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
203 {
204 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
205 }
206 
207 static inline void rps_lock(struct softnet_data *sd)
208 {
209 #ifdef CONFIG_RPS
210 	spin_lock(&sd->input_pkt_queue.lock);
211 #endif
212 }
213 
214 static inline void rps_unlock(struct softnet_data *sd)
215 {
216 #ifdef CONFIG_RPS
217 	spin_unlock(&sd->input_pkt_queue.lock);
218 #endif
219 }
220 
221 /* Device list insertion */
222 static void list_netdevice(struct net_device *dev)
223 {
224 	struct net *net = dev_net(dev);
225 
226 	ASSERT_RTNL();
227 
228 	write_lock_bh(&dev_base_lock);
229 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
230 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
231 	hlist_add_head_rcu(&dev->index_hlist,
232 			   dev_index_hash(net, dev->ifindex));
233 	write_unlock_bh(&dev_base_lock);
234 
235 	dev_base_seq_inc(net);
236 }
237 
238 /* Device list removal
239  * caller must respect an RCU grace period before freeing/reusing dev
240  */
241 static void unlist_netdevice(struct net_device *dev)
242 {
243 	ASSERT_RTNL();
244 
245 	/* Unlink dev from the device chain */
246 	write_lock_bh(&dev_base_lock);
247 	list_del_rcu(&dev->dev_list);
248 	hlist_del_rcu(&dev->name_hlist);
249 	hlist_del_rcu(&dev->index_hlist);
250 	write_unlock_bh(&dev_base_lock);
251 
252 	dev_base_seq_inc(dev_net(dev));
253 }
254 
255 /*
256  *	Our notifier list
257  */
258 
259 static RAW_NOTIFIER_HEAD(netdev_chain);
260 
261 /*
262  *	Device drivers call our routines to queue packets here. We empty the
263  *	queue in the local softnet handler.
264  */
265 
266 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
267 EXPORT_PER_CPU_SYMBOL(softnet_data);
268 
269 #ifdef CONFIG_LOCKDEP
270 /*
271  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
272  * according to dev->type
273  */
274 static const unsigned short netdev_lock_type[] =
275 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
276 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
277 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
278 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
279 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
280 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
281 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
282 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
283 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
284 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
285 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
286 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
287 	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
288 	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
289 	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
290 
291 static const char *const netdev_lock_name[] =
292 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
293 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
294 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
295 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
296 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
297 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
298 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
299 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
300 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
301 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
302 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
303 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
304 	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
305 	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
306 	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
307 
308 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
309 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
310 
311 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
312 {
313 	int i;
314 
315 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
316 		if (netdev_lock_type[i] == dev_type)
317 			return i;
318 	/* the last key is used by default */
319 	return ARRAY_SIZE(netdev_lock_type) - 1;
320 }
321 
322 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
323 						 unsigned short dev_type)
324 {
325 	int i;
326 
327 	i = netdev_lock_pos(dev_type);
328 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
329 				   netdev_lock_name[i]);
330 }
331 
332 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
333 {
334 	int i;
335 
336 	i = netdev_lock_pos(dev->type);
337 	lockdep_set_class_and_name(&dev->addr_list_lock,
338 				   &netdev_addr_lock_key[i],
339 				   netdev_lock_name[i]);
340 }
341 #else
342 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
343 						 unsigned short dev_type)
344 {
345 }
346 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
347 {
348 }
349 #endif
350 
351 /*******************************************************************************
352 
353 		Protocol management and registration routines
354 
355 *******************************************************************************/
356 
357 /*
358  *	Add a protocol ID to the list. Now that the input handler is
359  *	smarter we can dispense with all the messy stuff that used to be
360  *	here.
361  *
362  *	BEWARE!!! Protocol handlers that mangle input packets
363  *	MUST BE last in the hash buckets, and protocol dispatch
364  *	MUST start from the promiscuous ptype_all chain in net_bh.
365  *	This holds today; do not change it.
366  *	Explanation: if a packet-mangling handler were first on the
367  *	list, it could not tell that the packet is cloned and must
368  *	be copied-on-write before modification, so it would change
369  *	it and subsequent readers would see a broken packet.
370  *							--ANK (980803)
371  */
372 
373 static inline struct list_head *ptype_head(const struct packet_type *pt)
374 {
375 	if (pt->type == htons(ETH_P_ALL))
376 		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
377 	else
378 		return pt->dev ? &pt->dev->ptype_specific :
379 				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
380 }
381 
382 /**
383  *	dev_add_pack - add packet handler
384  *	@pt: packet type declaration
385  *
386  *	Add a protocol handler to the networking stack. The passed &packet_type
387  *	is linked into kernel lists and may not be freed until it has been
388  *	removed from the kernel lists.
389  *
390  *	This call does not sleep, therefore it cannot guarantee that
391  *	all CPUs currently in the middle of receiving packets will see
392  *	the new packet type (until the next received packet).
393  */
394 
395 void dev_add_pack(struct packet_type *pt)
396 {
397 	struct list_head *head = ptype_head(pt);
398 
399 	spin_lock(&ptype_lock);
400 	list_add_rcu(&pt->list, head);
401 	spin_unlock(&ptype_lock);
402 }
403 EXPORT_SYMBOL(dev_add_pack);
404 
405 /**
406  *	__dev_remove_pack	 - remove packet handler
407  *	@pt: packet type declaration
408  *
409  *	Remove a protocol handler that was previously added to the kernel
410  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
411  *	from the kernel lists and can be freed or reused once this function
412  *	returns.
413  *
414  *      The packet type might still be in use by receivers
415  *	and must not be freed until after all the CPUs have gone
416  *	through a quiescent state.
417  */
418 void __dev_remove_pack(struct packet_type *pt)
419 {
420 	struct list_head *head = ptype_head(pt);
421 	struct packet_type *pt1;
422 
423 	spin_lock(&ptype_lock);
424 
425 	list_for_each_entry(pt1, head, list) {
426 		if (pt == pt1) {
427 			list_del_rcu(&pt->list);
428 			goto out;
429 		}
430 	}
431 
432 	pr_warn("dev_remove_pack: %p not found\n", pt);
433 out:
434 	spin_unlock(&ptype_lock);
435 }
436 EXPORT_SYMBOL(__dev_remove_pack);
437 
438 /**
439  *	dev_remove_pack	 - remove packet handler
440  *	@pt: packet type declaration
441  *
442  *	Remove a protocol handler that was previously added to the kernel
443  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
444  *	from the kernel lists and can be freed or reused once this function
445  *	returns.
446  *
447  *	This call sleeps to guarantee that no CPU is looking at the packet
448  *	type after return.
449  */
450 void dev_remove_pack(struct packet_type *pt)
451 {
452 	__dev_remove_pack(pt);
453 
454 	synchronize_net();
455 }
456 EXPORT_SYMBOL(dev_remove_pack);
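
/*
 * Example (illustrative sketch; handler and variable names are hypothetical):
 * a minimal tap that receives a clone of every packet via dev_add_pack() and
 * is torn down with dev_remove_pack(), which sleeps as noted above.
 *
 *	static int example_rcv(struct sk_buff *skb, struct net_device *dev,
 *			       struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);		// consume our clone
 *		return 0;
 *	}
 *
 *	static struct packet_type example_pt __read_mostly = {
 *		.type	= cpu_to_be16(ETH_P_ALL),	// tap all protocols
 *		.func	= example_rcv,
 *	};
 *
 *	// module init:	dev_add_pack(&example_pt);
 *	// module exit:	dev_remove_pack(&example_pt);
 */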
457 
458 
459 /**
460  *	dev_add_offload - register offload handlers
461  *	@po: protocol offload declaration
462  *
463  *	Add protocol offload handlers to the networking stack. The passed
464  *	&proto_offload is linked into kernel lists and may not be freed until
465  *	it has been removed from the kernel lists.
466  *
467  *	This call does not sleep, therefore it cannot guarantee that
468  *	all CPUs currently in the middle of receiving packets will see
469  *	the new offload handlers (until the next received packet).
470  */
471 void dev_add_offload(struct packet_offload *po)
472 {
473 	struct packet_offload *elem;
474 
475 	spin_lock(&offload_lock);
476 	list_for_each_entry(elem, &offload_base, list) {
477 		if (po->priority < elem->priority)
478 			break;
479 	}
480 	list_add_rcu(&po->list, elem->list.prev);
481 	spin_unlock(&offload_lock);
482 }
483 EXPORT_SYMBOL(dev_add_offload);
484 
485 /**
486  *	__dev_remove_offload	 - remove offload handler
487  *	@po: packet offload declaration
488  *
489  *	Remove a protocol offload handler that was previously added to the
490  *	kernel offload handlers by dev_add_offload(). The passed &offload_type
491  *	is removed from the kernel lists and can be freed or reused once this
492  *	function returns.
493  *
494  *      The packet type might still be in use by receivers
495  *	and must not be freed until after all the CPUs have gone
496  *	through a quiescent state.
497  */
498 static void __dev_remove_offload(struct packet_offload *po)
499 {
500 	struct list_head *head = &offload_base;
501 	struct packet_offload *po1;
502 
503 	spin_lock(&offload_lock);
504 
505 	list_for_each_entry(po1, head, list) {
506 		if (po == po1) {
507 			list_del_rcu(&po->list);
508 			goto out;
509 		}
510 	}
511 
512 	pr_warn("dev_remove_offload: %p not found\n", po);
513 out:
514 	spin_unlock(&offload_lock);
515 }
516 
517 /**
518  *	dev_remove_offload	 - remove packet offload handler
519  *	@po: packet offload declaration
520  *
521  *	Remove a packet offload handler that was previously added to the kernel
522  *	offload handlers by dev_add_offload(). The passed &offload_type is
523  *	removed from the kernel lists and can be freed or reused once this
524  *	function returns.
525  *
526  *	This call sleeps to guarantee that no CPU is looking at the packet
527  *	type after return.
528  */
529 void dev_remove_offload(struct packet_offload *po)
530 {
531 	__dev_remove_offload(po);
532 
533 	synchronize_net();
534 }
535 EXPORT_SYMBOL(dev_remove_offload);
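
/*
 * Example (illustrative sketch; the callback names are hypothetical): a
 * protocol registering GRO/GSO callbacks, in the style of the IPv4 offload
 * registration.
 *
 *	static struct packet_offload example_offload __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.callbacks = {
 *			.gso_segment	= example_gso_segment,
 *			.gro_receive	= example_gro_receive,
 *			.gro_complete	= example_gro_complete,
 *		},
 *	};
 *
 *	// registration:	dev_add_offload(&example_offload);
 *	// teardown:		dev_remove_offload(&example_offload);
 */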
536 
537 /******************************************************************************
538 
539 		      Device Boot-time Settings Routines
540 
541 *******************************************************************************/
542 
543 /* Boot time configuration table */
544 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
545 
546 /**
547  *	netdev_boot_setup_add	- add new setup entry
548  *	@name: name of the device
549  *	@map: configured settings for the device
550  *
551  *	Adds a new setup entry to the dev_boot_setup list.  The function
552  *	returns 0 on error and 1 on success.  This is a generic routine for
553  *	all netdevices.
554  */
555 static int netdev_boot_setup_add(char *name, struct ifmap *map)
556 {
557 	struct netdev_boot_setup *s;
558 	int i;
559 
560 	s = dev_boot_setup;
561 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
562 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
563 			memset(s[i].name, 0, sizeof(s[i].name));
564 			strlcpy(s[i].name, name, IFNAMSIZ);
565 			memcpy(&s[i].map, map, sizeof(s[i].map));
566 			break;
567 		}
568 	}
569 
570 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
571 }
572 
573 /**
574  *	netdev_boot_setup_check	- check boot time settings
575  *	@dev: the netdevice
576  *
577  * 	Check the boot-time settings for the device.
578  *	Any settings found are applied to the device for use
579  *	later during device probing.
580  *	Returns 1 if settings were found, 0 otherwise.
581  */
582 int netdev_boot_setup_check(struct net_device *dev)
583 {
584 	struct netdev_boot_setup *s = dev_boot_setup;
585 	int i;
586 
587 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
588 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
589 		    !strcmp(dev->name, s[i].name)) {
590 			dev->irq 	= s[i].map.irq;
591 			dev->base_addr 	= s[i].map.base_addr;
592 			dev->mem_start 	= s[i].map.mem_start;
593 			dev->mem_end 	= s[i].map.mem_end;
594 			return 1;
595 		}
596 	}
597 	return 0;
598 }
599 EXPORT_SYMBOL(netdev_boot_setup_check);
600 
601 
602 /**
603  *	netdev_boot_base	- get address from boot time settings
604  *	@prefix: prefix for network device
605  *	@unit: id for network device
606  *
607  * 	Check the boot-time settings for the base address of the device.
608  *	Any settings found are applied to the device for use
609  *	later during device probing.
610  *	Returns 0 if no settings are found.
611  */
612 unsigned long netdev_boot_base(const char *prefix, int unit)
613 {
614 	const struct netdev_boot_setup *s = dev_boot_setup;
615 	char name[IFNAMSIZ];
616 	int i;
617 
618 	sprintf(name, "%s%d", prefix, unit);
619 
620 	/*
621 	 * If device already registered then return base of 1
622 	 * to indicate not to probe for this interface
623 	 */
624 	if (__dev_get_by_name(&init_net, name))
625 		return 1;
626 
627 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
628 		if (!strcmp(name, s[i].name))
629 			return s[i].map.base_addr;
630 	return 0;
631 }
632 
633 /*
634  * Saves at boot time configured settings for any netdevice.
635  */
636 int __init netdev_boot_setup(char *str)
637 {
638 	int ints[5];
639 	struct ifmap map;
640 
641 	str = get_options(str, ARRAY_SIZE(ints), ints);
642 	if (!str || !*str)
643 		return 0;
644 
645 	/* Save settings */
646 	memset(&map, 0, sizeof(map));
647 	if (ints[0] > 0)
648 		map.irq = ints[1];
649 	if (ints[0] > 1)
650 		map.base_addr = ints[2];
651 	if (ints[0] > 2)
652 		map.mem_start = ints[3];
653 	if (ints[0] > 3)
654 		map.mem_end = ints[4];
655 
656 	/* Add new entry to the list */
657 	return netdev_boot_setup_add(str, &map);
658 }
659 
660 __setup("netdev=", netdev_boot_setup);
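
/*
 * Boot-line format accepted above (per the documented "netdev=" parameter):
 * netdev=<irq>,<base_addr>,<mem_start>,<mem_end>,<name>, with trailing fields
 * optional.  For example, "netdev=9,0x300,eth0" would record irq 9 and I/O
 * base 0x300 to be applied when a device named "eth0" is probed.
 */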
661 
662 /*******************************************************************************
663 
664 			    Device Interface Subroutines
665 
666 *******************************************************************************/
667 
668 /**
669  *	dev_get_iflink	- get 'iflink' value of an interface
670  *	@dev: targeted interface
671  *
672  *	Indicates the ifindex the interface is linked to.
673  *	Physical interfaces have the same 'ifindex' and 'iflink' values.
674  */
675 
676 int dev_get_iflink(const struct net_device *dev)
677 {
678 	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
679 		return dev->netdev_ops->ndo_get_iflink(dev);
680 
681 	return dev->ifindex;
682 }
683 EXPORT_SYMBOL(dev_get_iflink);
684 
685 /**
686  *	dev_fill_metadata_dst - Retrieve tunnel egress information.
687  *	@dev: targeted interface
688  *	@skb: The packet.
689  *
690  *	For better visibility of tunnel traffic, OVS needs to retrieve
691  *	the egress tunnel information for a packet. The following API
692  *	allows the caller to obtain this information.
693  */
694 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
695 {
696 	struct ip_tunnel_info *info;
697 
698 	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
699 		return -EINVAL;
700 
701 	info = skb_tunnel_info_unclone(skb);
702 	if (!info)
703 		return -ENOMEM;
704 	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
705 		return -EINVAL;
706 
707 	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
708 }
709 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
710 
711 /**
712  *	__dev_get_by_name	- find a device by its name
713  *	@net: the applicable net namespace
714  *	@name: name to find
715  *
716  *	Find an interface by name. Must be called under RTNL semaphore
717  *	or @dev_base_lock. If the name is found a pointer to the device
718  *	is returned. If the name is not found then %NULL is returned. The
719  *	reference counters are not incremented so the caller must be
720  *	careful with locks.
721  */
722 
723 struct net_device *__dev_get_by_name(struct net *net, const char *name)
724 {
725 	struct net_device *dev;
726 	struct hlist_head *head = dev_name_hash(net, name);
727 
728 	hlist_for_each_entry(dev, head, name_hlist)
729 		if (!strncmp(dev->name, name, IFNAMSIZ))
730 			return dev;
731 
732 	return NULL;
733 }
734 EXPORT_SYMBOL(__dev_get_by_name);
735 
736 /**
737  *	dev_get_by_name_rcu	- find a device by its name
738  *	@net: the applicable net namespace
739  *	@name: name to find
740  *
741  *	Find an interface by name.
742  *	If the name is found a pointer to the device is returned.
743  * 	If the name is not found then %NULL is returned.
744  *	The reference counters are not incremented so the caller must be
745  *	careful with locks. The caller must hold RCU lock.
746  */
747 
748 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
749 {
750 	struct net_device *dev;
751 	struct hlist_head *head = dev_name_hash(net, name);
752 
753 	hlist_for_each_entry_rcu(dev, head, name_hlist)
754 		if (!strncmp(dev->name, name, IFNAMSIZ))
755 			return dev;
756 
757 	return NULL;
758 }
759 EXPORT_SYMBOL(dev_get_by_name_rcu);
760 
761 /**
762  *	dev_get_by_name		- find a device by its name
763  *	@net: the applicable net namespace
764  *	@name: name to find
765  *
766  *	Find an interface by name. This can be called from any
767  *	context and does its own locking. The returned handle has
768  *	the usage count incremented and the caller must use dev_put() to
769  *	release it when it is no longer needed. %NULL is returned if no
770  *	matching device is found.
771  */
772 
773 struct net_device *dev_get_by_name(struct net *net, const char *name)
774 {
775 	struct net_device *dev;
776 
777 	rcu_read_lock();
778 	dev = dev_get_by_name_rcu(net, name);
779 	if (dev)
780 		dev_hold(dev);
781 	rcu_read_unlock();
782 	return dev;
783 }
784 EXPORT_SYMBOL(dev_get_by_name);
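
/*
 * Example (illustrative sketch): the three name-lookup variants above differ
 * only in their locking contract.
 *
 *	// caller holds RTNL or dev_base_lock; no reference is taken:
 *	dev = __dev_get_by_name(net, "eth0");
 *
 *	// caller holds rcu_read_lock(); no reference is taken:
 *	dev = dev_get_by_name_rcu(net, "eth0");
 *
 *	// no locking required; drop the reference with dev_put() when done:
 *	dev = dev_get_by_name(net, "eth0");
 *	if (dev) {
 *		// ... use dev ...
 *		dev_put(dev);
 *	}
 */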
785 
786 /**
787  *	__dev_get_by_index - find a device by its ifindex
788  *	@net: the applicable net namespace
789  *	@ifindex: index of device
790  *
791  *	Search for an interface by index. Returns a pointer to the device,
792  *	or %NULL if the device is not found. The device has not
793  *	had its reference counter increased so the caller must be careful
794  *	about locking. The caller must hold either the RTNL semaphore
795  *	or @dev_base_lock.
796  */
797 
798 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
799 {
800 	struct net_device *dev;
801 	struct hlist_head *head = dev_index_hash(net, ifindex);
802 
803 	hlist_for_each_entry(dev, head, index_hlist)
804 		if (dev->ifindex == ifindex)
805 			return dev;
806 
807 	return NULL;
808 }
809 EXPORT_SYMBOL(__dev_get_by_index);
810 
811 /**
812  *	dev_get_by_index_rcu - find a device by its ifindex
813  *	@net: the applicable net namespace
814  *	@ifindex: index of device
815  *
816  *	Search for an interface by index. Returns a pointer to the device,
817  *	or %NULL if the device is not found. The device has not
818  *	had its reference counter increased so the caller must be careful
819  *	about locking. The caller must hold RCU lock.
820  */
821 
822 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
823 {
824 	struct net_device *dev;
825 	struct hlist_head *head = dev_index_hash(net, ifindex);
826 
827 	hlist_for_each_entry_rcu(dev, head, index_hlist)
828 		if (dev->ifindex == ifindex)
829 			return dev;
830 
831 	return NULL;
832 }
833 EXPORT_SYMBOL(dev_get_by_index_rcu);
834 
835 
836 /**
837  *	dev_get_by_index - find a device by its ifindex
838  *	@net: the applicable net namespace
839  *	@ifindex: index of device
840  *
841  *	Search for an interface by index. Returns a pointer to the device,
842  *	or NULL if the device is not found. The device returned has
843  *	had a reference added and the pointer is safe until the user calls
844  *	dev_put to indicate they have finished with it.
845  */
846 
847 struct net_device *dev_get_by_index(struct net *net, int ifindex)
848 {
849 	struct net_device *dev;
850 
851 	rcu_read_lock();
852 	dev = dev_get_by_index_rcu(net, ifindex);
853 	if (dev)
854 		dev_hold(dev);
855 	rcu_read_unlock();
856 	return dev;
857 }
858 EXPORT_SYMBOL(dev_get_by_index);
859 
860 /**
861  *	netdev_get_name - get a netdevice name, knowing its ifindex.
862  *	@net: network namespace
863  *	@name: a pointer to the buffer where the name will be stored.
864  *	@ifindex: the ifindex of the interface to get the name from.
865  *
866  *	The use of raw_seqcount_begin() and cond_resched() before
867  *	retrying is required as we want to give the writers a chance
868  *	to complete when CONFIG_PREEMPT is not set.
869  */
870 int netdev_get_name(struct net *net, char *name, int ifindex)
871 {
872 	struct net_device *dev;
873 	unsigned int seq;
874 
875 retry:
876 	seq = raw_seqcount_begin(&devnet_rename_seq);
877 	rcu_read_lock();
878 	dev = dev_get_by_index_rcu(net, ifindex);
879 	if (!dev) {
880 		rcu_read_unlock();
881 		return -ENODEV;
882 	}
883 
884 	strcpy(name, dev->name);
885 	rcu_read_unlock();
886 	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
887 		cond_resched();
888 		goto retry;
889 	}
890 
891 	return 0;
892 }
893 
894 /**
895  *	dev_getbyhwaddr_rcu - find a device by its hardware address
896  *	@net: the applicable net namespace
897  *	@type: media type of device
898  *	@ha: hardware address
899  *
900  *	Search for an interface by MAC address. Returns a pointer to the
901  *	device, or NULL if the device is not found.
902  *	The caller must hold RCU or RTNL.
903  *	The returned device has not had its ref count increased
904  *	and the caller must therefore be careful about locking
905  *
906  */
907 
908 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
909 				       const char *ha)
910 {
911 	struct net_device *dev;
912 
913 	for_each_netdev_rcu(net, dev)
914 		if (dev->type == type &&
915 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
916 			return dev;
917 
918 	return NULL;
919 }
920 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
921 
922 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
923 {
924 	struct net_device *dev;
925 
926 	ASSERT_RTNL();
927 	for_each_netdev(net, dev)
928 		if (dev->type == type)
929 			return dev;
930 
931 	return NULL;
932 }
933 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
934 
935 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
936 {
937 	struct net_device *dev, *ret = NULL;
938 
939 	rcu_read_lock();
940 	for_each_netdev_rcu(net, dev)
941 		if (dev->type == type) {
942 			dev_hold(dev);
943 			ret = dev;
944 			break;
945 		}
946 	rcu_read_unlock();
947 	return ret;
948 }
949 EXPORT_SYMBOL(dev_getfirstbyhwtype);
950 
951 /**
952  *	__dev_get_by_flags - find any device with given flags
953  *	@net: the applicable net namespace
954  *	@if_flags: IFF_* values
955  *	@mask: bitmask of bits in if_flags to check
956  *
957  *	Search for any interface with the given flags. Returns a pointer to
958  *	the device, or NULL if no such device is found. Must be called inside
959  *	rtnl_lock(), and result refcount is unchanged.
960  */
961 
962 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
963 				      unsigned short mask)
964 {
965 	struct net_device *dev, *ret;
966 
967 	ASSERT_RTNL();
968 
969 	ret = NULL;
970 	for_each_netdev(net, dev) {
971 		if (((dev->flags ^ if_flags) & mask) == 0) {
972 			ret = dev;
973 			break;
974 		}
975 	}
976 	return ret;
977 }
978 EXPORT_SYMBOL(__dev_get_by_flags);
979 
980 /**
981  *	dev_valid_name - check if name is okay for network device
982  *	@name: name string
983  *
984  *	Network device names need to be valid file names
985  *	to allow sysfs to work.  We also disallow any kind of
986  *	whitespace.
987  */
988 bool dev_valid_name(const char *name)
989 {
990 	if (*name == '\0')
991 		return false;
992 	if (strlen(name) >= IFNAMSIZ)
993 		return false;
994 	if (!strcmp(name, ".") || !strcmp(name, ".."))
995 		return false;
996 
997 	while (*name) {
998 		if (*name == '/' || *name == ':' || isspace(*name))
999 			return false;
1000 		name++;
1001 	}
1002 	return true;
1003 }
1004 EXPORT_SYMBOL(dev_valid_name);
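
/*
 * Examples (illustrative): dev_valid_name("eth0") and dev_valid_name("br0.42")
 * return true, while "", ".", "..", names containing '/', ':' or whitespace,
 * and names of IFNAMSIZ characters or more are all rejected.
 */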
1005 
1006 /**
1007  *	__dev_alloc_name - allocate a name for a device
1008  *	@net: network namespace to allocate the device name in
1009  *	@name: name format string
1010  *	@buf:  scratch buffer and result name string
1011  *
1012  *	Passed a format string - eg "lt%d" it will try and find a suitable
1013  *	id. It scans list of devices to build up a free map, then chooses
1014  *	the first empty slot. The caller must hold the dev_base or rtnl lock
1015  *	while allocating the name and adding the device in order to avoid
1016  *	duplicates.
1017  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1018  *	Returns the number of the unit assigned or a negative errno code.
1019  */
1020 
1021 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1022 {
1023 	int i = 0;
1024 	const char *p;
1025 	const int max_netdevices = 8*PAGE_SIZE;
1026 	unsigned long *inuse;
1027 	struct net_device *d;
1028 
1029 	p = strnchr(name, IFNAMSIZ-1, '%');
1030 	if (p) {
1031 		/*
1032 		 * Verify the string as this thing may have come from
1033 		 * the user.  There must be exactly one "%d" and no other "%"
1034 		 * characters.
1035 		 */
1036 		if (p[1] != 'd' || strchr(p + 2, '%'))
1037 			return -EINVAL;
1038 
1039 		/* Use one page as a bit array of possible slots */
1040 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1041 		if (!inuse)
1042 			return -ENOMEM;
1043 
1044 		for_each_netdev(net, d) {
1045 			if (!sscanf(d->name, name, &i))
1046 				continue;
1047 			if (i < 0 || i >= max_netdevices)
1048 				continue;
1049 
1050 			/*  avoid cases where sscanf is not exact inverse of printf */
1051 			snprintf(buf, IFNAMSIZ, name, i);
1052 			if (!strncmp(buf, d->name, IFNAMSIZ))
1053 				set_bit(i, inuse);
1054 		}
1055 
1056 		i = find_first_zero_bit(inuse, max_netdevices);
1057 		free_page((unsigned long) inuse);
1058 	}
1059 
1060 	if (buf != name)
1061 		snprintf(buf, IFNAMSIZ, name, i);
1062 	if (!__dev_get_by_name(net, buf))
1063 		return i;
1064 
1065 	/* It is possible to run out of possible slots
1066 	 * when the name is long and there isn't enough space left
1067 	 * for the digits, or if all bits are used.
1068 	 */
1069 	return -ENFILE;
1070 }
1071 
1072 /**
1073  *	dev_alloc_name - allocate a name for a device
1074  *	@dev: device
1075  *	@name: name format string
1076  *
1077  *	Passed a format string - eg "lt%d" it will try and find a suitable
1078  *	id. It scans list of devices to build up a free map, then chooses
1079  *	the first empty slot. The caller must hold the dev_base or rtnl lock
1080  *	while allocating the name and adding the device in order to avoid
1081  *	duplicates.
1082  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1083  *	Returns the number of the unit assigned or a negative errno code.
1084  */
1085 
1086 int dev_alloc_name(struct net_device *dev, const char *name)
1087 {
1088 	char buf[IFNAMSIZ];
1089 	struct net *net;
1090 	int ret;
1091 
1092 	BUG_ON(!dev_net(dev));
1093 	net = dev_net(dev);
1094 	ret = __dev_alloc_name(net, name, buf);
1095 	if (ret >= 0)
1096 		strlcpy(dev->name, buf, IFNAMSIZ);
1097 	return ret;
1098 }
1099 EXPORT_SYMBOL(dev_alloc_name);
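
/*
 * Example (illustrative sketch; "dummy%d" is just a sample format string):
 *
 *	err = dev_alloc_name(dev, "dummy%d");
 *	if (err < 0)
 *		return err;
 *	// dev->name is now e.g. "dummy0"; err holds the unit number assigned
 */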
1100 
1101 static int dev_alloc_name_ns(struct net *net,
1102 			     struct net_device *dev,
1103 			     const char *name)
1104 {
1105 	char buf[IFNAMSIZ];
1106 	int ret;
1107 
1108 	ret = __dev_alloc_name(net, name, buf);
1109 	if (ret >= 0)
1110 		strlcpy(dev->name, buf, IFNAMSIZ);
1111 	return ret;
1112 }
1113 
1114 static int dev_get_valid_name(struct net *net,
1115 			      struct net_device *dev,
1116 			      const char *name)
1117 {
1118 	BUG_ON(!net);
1119 
1120 	if (!dev_valid_name(name))
1121 		return -EINVAL;
1122 
1123 	if (strchr(name, '%'))
1124 		return dev_alloc_name_ns(net, dev, name);
1125 	else if (__dev_get_by_name(net, name))
1126 		return -EEXIST;
1127 	else if (dev->name != name)
1128 		strlcpy(dev->name, name, IFNAMSIZ);
1129 
1130 	return 0;
1131 }
1132 
1133 /**
1134  *	dev_change_name - change name of a device
1135  *	@dev: device
1136  *	@newname: name (or format string) must be at least IFNAMSIZ
1137  *
1138  *	Change the name of a device. Format strings such as "eth%d"
1139  *	can be passed for wildcarding.
1140  */
1141 int dev_change_name(struct net_device *dev, const char *newname)
1142 {
1143 	unsigned char old_assign_type;
1144 	char oldname[IFNAMSIZ];
1145 	int err = 0;
1146 	int ret;
1147 	struct net *net;
1148 
1149 	ASSERT_RTNL();
1150 	BUG_ON(!dev_net(dev));
1151 
1152 	net = dev_net(dev);
1153 	if (dev->flags & IFF_UP)
1154 		return -EBUSY;
1155 
1156 	write_seqcount_begin(&devnet_rename_seq);
1157 
1158 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1159 		write_seqcount_end(&devnet_rename_seq);
1160 		return 0;
1161 	}
1162 
1163 	memcpy(oldname, dev->name, IFNAMSIZ);
1164 
1165 	err = dev_get_valid_name(net, dev, newname);
1166 	if (err < 0) {
1167 		write_seqcount_end(&devnet_rename_seq);
1168 		return err;
1169 	}
1170 
1171 	if (oldname[0] && !strchr(oldname, '%'))
1172 		netdev_info(dev, "renamed from %s\n", oldname);
1173 
1174 	old_assign_type = dev->name_assign_type;
1175 	dev->name_assign_type = NET_NAME_RENAMED;
1176 
1177 rollback:
1178 	ret = device_rename(&dev->dev, dev->name);
1179 	if (ret) {
1180 		memcpy(dev->name, oldname, IFNAMSIZ);
1181 		dev->name_assign_type = old_assign_type;
1182 		write_seqcount_end(&devnet_rename_seq);
1183 		return ret;
1184 	}
1185 
1186 	write_seqcount_end(&devnet_rename_seq);
1187 
1188 	netdev_adjacent_rename_links(dev, oldname);
1189 
1190 	write_lock_bh(&dev_base_lock);
1191 	hlist_del_rcu(&dev->name_hlist);
1192 	write_unlock_bh(&dev_base_lock);
1193 
1194 	synchronize_rcu();
1195 
1196 	write_lock_bh(&dev_base_lock);
1197 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1198 	write_unlock_bh(&dev_base_lock);
1199 
1200 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1201 	ret = notifier_to_errno(ret);
1202 
1203 	if (ret) {
1204 		/* err >= 0 after dev_alloc_name(); otherwise it holds the first errno */
1205 		if (err >= 0) {
1206 			err = ret;
1207 			write_seqcount_begin(&devnet_rename_seq);
1208 			memcpy(dev->name, oldname, IFNAMSIZ);
1209 			memcpy(oldname, newname, IFNAMSIZ);
1210 			dev->name_assign_type = old_assign_type;
1211 			old_assign_type = NET_NAME_RENAMED;
1212 			goto rollback;
1213 		} else {
1214 			pr_err("%s: name change rollback failed: %d\n",
1215 			       dev->name, ret);
1216 		}
1217 	}
1218 
1219 	return err;
1220 }
1221 
1222 /**
1223  *	dev_set_alias - change ifalias of a device
1224  *	@dev: device
1225  *	@alias: name up to IFALIASZ
1226  *	@len: limit of bytes to copy from info
1227  *
1228  *	Set the ifalias for a device.
1229  */
1230 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1231 {
1232 	char *new_ifalias;
1233 
1234 	ASSERT_RTNL();
1235 
1236 	if (len >= IFALIASZ)
1237 		return -EINVAL;
1238 
1239 	if (!len) {
1240 		kfree(dev->ifalias);
1241 		dev->ifalias = NULL;
1242 		return 0;
1243 	}
1244 
1245 	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1246 	if (!new_ifalias)
1247 		return -ENOMEM;
1248 	dev->ifalias = new_ifalias;
1249 
1250 	strlcpy(dev->ifalias, alias, len+1);
1251 	return len;
1252 }
1253 
1254 
1255 /**
1256  *	netdev_features_change - device changes features
1257  *	@dev: device to cause notification
1258  *
1259  *	Called to indicate a device has changed features.
1260  */
1261 void netdev_features_change(struct net_device *dev)
1262 {
1263 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1264 }
1265 EXPORT_SYMBOL(netdev_features_change);
1266 
1267 /**
1268  *	netdev_state_change - device changes state
1269  *	@dev: device to cause notification
1270  *
1271  *	Called to indicate a device has changed state. This function calls
1272  *	the notifier chains for netdev_chain and sends a NEWLINK message
1273  *	to the routing socket.
1274  */
1275 void netdev_state_change(struct net_device *dev)
1276 {
1277 	if (dev->flags & IFF_UP) {
1278 		struct netdev_notifier_change_info change_info;
1279 
1280 		change_info.flags_changed = 0;
1281 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1282 					      &change_info.info);
1283 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1284 	}
1285 }
1286 EXPORT_SYMBOL(netdev_state_change);
1287 
1288 /**
1289  * 	netdev_notify_peers - notify network peers about existence of @dev
1290  * 	@dev: network device
1291  *
1292  * Generate traffic such that interested network peers are aware of
1293  * @dev, such as by generating a gratuitous ARP. This may be used when
1294  * a device wants to inform the rest of the network about some sort of
1295  * reconfiguration such as a failover event or virtual machine
1296  * migration.
1297  */
1298 void netdev_notify_peers(struct net_device *dev)
1299 {
1300 	rtnl_lock();
1301 	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1302 	rtnl_unlock();
1303 }
1304 EXPORT_SYMBOL(netdev_notify_peers);
1305 
1306 static int __dev_open(struct net_device *dev)
1307 {
1308 	const struct net_device_ops *ops = dev->netdev_ops;
1309 	int ret;
1310 
1311 	ASSERT_RTNL();
1312 
1313 	if (!netif_device_present(dev))
1314 		return -ENODEV;
1315 
1316 	/* Block netpoll from trying to do any rx path servicing.
1317 	 * If we don't do this there is a chance ndo_poll_controller
1318 	 * or ndo_poll may be running while we open the device
1319 	 */
1320 	netpoll_poll_disable(dev);
1321 
1322 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1323 	ret = notifier_to_errno(ret);
1324 	if (ret)
1325 		return ret;
1326 
1327 	set_bit(__LINK_STATE_START, &dev->state);
1328 
1329 	if (ops->ndo_validate_addr)
1330 		ret = ops->ndo_validate_addr(dev);
1331 
1332 	if (!ret && ops->ndo_open)
1333 		ret = ops->ndo_open(dev);
1334 
1335 	netpoll_poll_enable(dev);
1336 
1337 	if (ret)
1338 		clear_bit(__LINK_STATE_START, &dev->state);
1339 	else {
1340 		dev->flags |= IFF_UP;
1341 		dev_set_rx_mode(dev);
1342 		dev_activate(dev);
1343 		add_device_randomness(dev->dev_addr, dev->addr_len);
1344 	}
1345 
1346 	return ret;
1347 }
1348 
1349 /**
1350  *	dev_open	- prepare an interface for use.
1351  *	@dev:	device to open
1352  *
1353  *	Takes a device from down to up state. The device's private open
1354  *	function is invoked and then the multicast lists are loaded. Finally
1355  *	the device is moved into the up state and a %NETDEV_UP message is
1356  *	sent to the netdev notifier chain.
1357  *
1358  *	Calling this function on an active interface is a nop. On a failure
1359  *	a negative errno code is returned.
1360  */
1361 int dev_open(struct net_device *dev)
1362 {
1363 	int ret;
1364 
1365 	if (dev->flags & IFF_UP)
1366 		return 0;
1367 
1368 	ret = __dev_open(dev);
1369 	if (ret < 0)
1370 		return ret;
1371 
1372 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1373 	call_netdevice_notifiers(NETDEV_UP, dev);
1374 
1375 	return ret;
1376 }
1377 EXPORT_SYMBOL(dev_open);
1378 
1379 static int __dev_close_many(struct list_head *head)
1380 {
1381 	struct net_device *dev;
1382 
1383 	ASSERT_RTNL();
1384 	might_sleep();
1385 
1386 	list_for_each_entry(dev, head, close_list) {
1387 		/* Temporarily disable netpoll until the interface is down */
1388 		netpoll_poll_disable(dev);
1389 
1390 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1391 
1392 		clear_bit(__LINK_STATE_START, &dev->state);
1393 
1394 		/* Synchronize to scheduled poll. We cannot touch poll list, it
1395 		 * can be even on different cpu. So just clear netif_running().
1396 		 *
1397 		 * dev->stop() will invoke napi_disable() on all of its
1398 		 * napi_struct instances on this device.
1399 		 */
1400 		smp_mb__after_atomic(); /* Commit netif_running(). */
1401 	}
1402 
1403 	dev_deactivate_many(head);
1404 
1405 	list_for_each_entry(dev, head, close_list) {
1406 		const struct net_device_ops *ops = dev->netdev_ops;
1407 
1408 		/*
1409 		 *	Call the device specific close. This cannot fail.
1410 		 *	Only called if the device is UP.
1411 		 *
1412 		 *	We allow it to be called even after a DETACH hot-plug
1413 		 *	event.
1414 		 */
1415 		if (ops->ndo_stop)
1416 			ops->ndo_stop(dev);
1417 
1418 		dev->flags &= ~IFF_UP;
1419 		netpoll_poll_enable(dev);
1420 	}
1421 
1422 	return 0;
1423 }
1424 
1425 static int __dev_close(struct net_device *dev)
1426 {
1427 	int retval;
1428 	LIST_HEAD(single);
1429 
1430 	list_add(&dev->close_list, &single);
1431 	retval = __dev_close_many(&single);
1432 	list_del(&single);
1433 
1434 	return retval;
1435 }
1436 
1437 int dev_close_many(struct list_head *head, bool unlink)
1438 {
1439 	struct net_device *dev, *tmp;
1440 
1441 	/* Remove the devices that don't need to be closed */
1442 	list_for_each_entry_safe(dev, tmp, head, close_list)
1443 		if (!(dev->flags & IFF_UP))
1444 			list_del_init(&dev->close_list);
1445 
1446 	__dev_close_many(head);
1447 
1448 	list_for_each_entry_safe(dev, tmp, head, close_list) {
1449 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1450 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1451 		if (unlink)
1452 			list_del_init(&dev->close_list);
1453 	}
1454 
1455 	return 0;
1456 }
1457 EXPORT_SYMBOL(dev_close_many);
1458 
1459 /**
1460  *	dev_close - shutdown an interface.
1461  *	@dev: device to shutdown
1462  *
1463  *	This function moves an active device into down state. A
1464  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1465  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1466  *	chain.
1467  */
1468 int dev_close(struct net_device *dev)
1469 {
1470 	if (dev->flags & IFF_UP) {
1471 		LIST_HEAD(single);
1472 
1473 		list_add(&dev->close_list, &single);
1474 		dev_close_many(&single, true);
1475 		list_del(&single);
1476 	}
1477 	return 0;
1478 }
1479 EXPORT_SYMBOL(dev_close);
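
/*
 * Example (illustrative sketch): bringing an interface up and down from
 * kernel code.  Both calls must be made under the rtnl semaphore.
 *
 *	rtnl_lock();
 *	err = dev_open(dev);		// no-op if the device is already up
 *	// ... use the device ...
 *	dev_close(dev);			// always returns 0
 *	rtnl_unlock();
 */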
1480 
1481 
1482 /**
1483  *	dev_disable_lro - disable Large Receive Offload on a device
1484  *	@dev: device
1485  *
1486  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1487  *	called under RTNL.  This is needed if received packets may be
1488  *	forwarded to another interface.
1489  */
1490 void dev_disable_lro(struct net_device *dev)
1491 {
1492 	struct net_device *lower_dev;
1493 	struct list_head *iter;
1494 
1495 	dev->wanted_features &= ~NETIF_F_LRO;
1496 	netdev_update_features(dev);
1497 
1498 	if (unlikely(dev->features & NETIF_F_LRO))
1499 		netdev_WARN(dev, "failed to disable LRO!\n");
1500 
1501 	netdev_for_each_lower_dev(dev, lower_dev, iter)
1502 		dev_disable_lro(lower_dev);
1503 }
1504 EXPORT_SYMBOL(dev_disable_lro);
1505 
1506 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1507 				   struct net_device *dev)
1508 {
1509 	struct netdev_notifier_info info;
1510 
1511 	netdev_notifier_info_init(&info, dev);
1512 	return nb->notifier_call(nb, val, &info);
1513 }
1514 
1515 static int dev_boot_phase = 1;
1516 
1517 /**
1518  *	register_netdevice_notifier - register a network notifier block
1519  *	@nb: notifier
1520  *
1521  *	Register a notifier to be called when network device events occur.
1522  *	The notifier passed is linked into the kernel structures and must
1523  *	not be reused until it has been unregistered. A negative errno code
1524  *	is returned on a failure.
1525  *
1526  * 	When registered, all registration and up events are replayed
1527  *	to the new notifier so that it has a race-free view of the
1528  *	network device list.
1529  */
1530 
1531 int register_netdevice_notifier(struct notifier_block *nb)
1532 {
1533 	struct net_device *dev;
1534 	struct net_device *last;
1535 	struct net *net;
1536 	int err;
1537 
1538 	rtnl_lock();
1539 	err = raw_notifier_chain_register(&netdev_chain, nb);
1540 	if (err)
1541 		goto unlock;
1542 	if (dev_boot_phase)
1543 		goto unlock;
1544 	for_each_net(net) {
1545 		for_each_netdev(net, dev) {
1546 			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1547 			err = notifier_to_errno(err);
1548 			if (err)
1549 				goto rollback;
1550 
1551 			if (!(dev->flags & IFF_UP))
1552 				continue;
1553 
1554 			call_netdevice_notifier(nb, NETDEV_UP, dev);
1555 		}
1556 	}
1557 
1558 unlock:
1559 	rtnl_unlock();
1560 	return err;
1561 
1562 rollback:
1563 	last = dev;
1564 	for_each_net(net) {
1565 		for_each_netdev(net, dev) {
1566 			if (dev == last)
1567 				goto outroll;
1568 
1569 			if (dev->flags & IFF_UP) {
1570 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1571 							dev);
1572 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1573 			}
1574 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1575 		}
1576 	}
1577 
1578 outroll:
1579 	raw_notifier_chain_unregister(&netdev_chain, nb);
1580 	goto unlock;
1581 }
1582 EXPORT_SYMBOL(register_netdevice_notifier);
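
/*
 * Example (illustrative sketch; names are hypothetical): a minimal notifier
 * that logs NETDEV_UP events.  As noted above, registration and up events
 * for already-present devices are replayed to the callback at registration
 * time.
 *
 *	static int example_netdev_event(struct notifier_block *nb,
 *					unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block example_nb = {
 *		.notifier_call = example_netdev_event,
 *	};
 *
 *	// register_netdevice_notifier(&example_nb);
 *	// ... later: unregister_netdevice_notifier(&example_nb);
 */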
1583 
1584 /**
1585  *	unregister_netdevice_notifier - unregister a network notifier block
1586  *	@nb: notifier
1587  *
1588  *	Unregister a notifier previously registered by
1589  *	register_netdevice_notifier(). The notifier is unlinked from the
1590  *	kernel structures and may then be reused. A negative errno code
1591  *	is returned on a failure.
1592  *
1593  * 	After unregistering, unregister and down device events are synthesized
1594  *	for all devices on the device list and delivered to the removed notifier,
1595  *	removing the need for special-case cleanup code.
1596  */
1597 
1598 int unregister_netdevice_notifier(struct notifier_block *nb)
1599 {
1600 	struct net_device *dev;
1601 	struct net *net;
1602 	int err;
1603 
1604 	rtnl_lock();
1605 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1606 	if (err)
1607 		goto unlock;
1608 
1609 	for_each_net(net) {
1610 		for_each_netdev(net, dev) {
1611 			if (dev->flags & IFF_UP) {
1612 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1613 							dev);
1614 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1615 			}
1616 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1617 		}
1618 	}
1619 unlock:
1620 	rtnl_unlock();
1621 	return err;
1622 }
1623 EXPORT_SYMBOL(unregister_netdevice_notifier);
1624 
1625 /**
1626  *	call_netdevice_notifiers_info - call all network notifier blocks
1627  *	@val: value passed unmodified to notifier function
1628  *	@dev: net_device pointer passed unmodified to notifier function
1629  *	@info: notifier information data
1630  *
1631  *	Call all network notifier blocks.  Parameters and return value
1632  *	are as for raw_notifier_call_chain().
1633  */
1634 
1635 static int call_netdevice_notifiers_info(unsigned long val,
1636 					 struct net_device *dev,
1637 					 struct netdev_notifier_info *info)
1638 {
1639 	ASSERT_RTNL();
1640 	netdev_notifier_info_init(info, dev);
1641 	return raw_notifier_call_chain(&netdev_chain, val, info);
1642 }
1643 
1644 /**
1645  *	call_netdevice_notifiers - call all network notifier blocks
1646  *      @val: value passed unmodified to notifier function
1647  *      @dev: net_device pointer passed unmodified to notifier function
1648  *
1649  *	Call all network notifier blocks.  Parameters and return value
1650  *	are as for raw_notifier_call_chain().
1651  */
1652 
1653 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1654 {
1655 	struct netdev_notifier_info info;
1656 
1657 	return call_netdevice_notifiers_info(val, dev, &info);
1658 }
1659 EXPORT_SYMBOL(call_netdevice_notifiers);
1660 
1661 #ifdef CONFIG_NET_INGRESS
1662 static struct static_key ingress_needed __read_mostly;
1663 
1664 void net_inc_ingress_queue(void)
1665 {
1666 	static_key_slow_inc(&ingress_needed);
1667 }
1668 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1669 
1670 void net_dec_ingress_queue(void)
1671 {
1672 	static_key_slow_dec(&ingress_needed);
1673 }
1674 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1675 #endif
1676 
1677 static struct static_key netstamp_needed __read_mostly;
1678 #ifdef HAVE_JUMP_LABEL
1679 /* We are not allowed to call static_key_slow_dec() from irq context
1680  * If net_disable_timestamp() is called from irq context, defer the
1681  * static_key_slow_dec() calls.
1682  */
1683 static atomic_t netstamp_needed_deferred;
1684 #endif
1685 
1686 void net_enable_timestamp(void)
1687 {
1688 #ifdef HAVE_JUMP_LABEL
1689 	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1690 
1691 	if (deferred) {
1692 		while (--deferred)
1693 			static_key_slow_dec(&netstamp_needed);
1694 		return;
1695 	}
1696 #endif
1697 	static_key_slow_inc(&netstamp_needed);
1698 }
1699 EXPORT_SYMBOL(net_enable_timestamp);
1700 
1701 void net_disable_timestamp(void)
1702 {
1703 #ifdef HAVE_JUMP_LABEL
1704 	if (in_interrupt()) {
1705 		atomic_inc(&netstamp_needed_deferred);
1706 		return;
1707 	}
1708 #endif
1709 	static_key_slow_dec(&netstamp_needed);
1710 }
1711 EXPORT_SYMBOL(net_disable_timestamp);
1712 
1713 static inline void net_timestamp_set(struct sk_buff *skb)
1714 {
1715 	skb->tstamp.tv64 = 0;
1716 	if (static_key_false(&netstamp_needed))
1717 		__net_timestamp(skb);
1718 }
1719 
1720 #define net_timestamp_check(COND, SKB)			\
1721 	if (static_key_false(&netstamp_needed)) {		\
1722 		if ((COND) && !(SKB)->tstamp.tv64)	\
1723 			__net_timestamp(SKB);		\
1724 	}						\
1725 
1726 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1727 {
1728 	unsigned int len;
1729 
1730 	if (!(dev->flags & IFF_UP))
1731 		return false;
1732 
1733 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1734 	if (skb->len <= len)
1735 		return true;
1736 
1737 	/* if TSO is enabled, we don't care about the length as the packet
1738 	 * could be forwarded without being segmented before
1739 	 */
1740 	if (skb_is_gso(skb))
1741 		return true;
1742 
1743 	return false;
1744 }
1745 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1746 
1747 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1748 {
1749 	if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1750 	    unlikely(!is_skb_forwardable(dev, skb))) {
1751 		atomic_long_inc(&dev->rx_dropped);
1752 		kfree_skb(skb);
1753 		return NET_RX_DROP;
1754 	}
1755 
1756 	skb_scrub_packet(skb, true);
1757 	skb->priority = 0;
1758 	skb->protocol = eth_type_trans(skb, dev);
1759 	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1760 
1761 	return 0;
1762 }
1763 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1764 
1765 /**
1766  * dev_forward_skb - loopback an skb to another netif
1767  *
1768  * @dev: destination network device
1769  * @skb: buffer to forward
1770  *
1771  * return values:
1772  *	NET_RX_SUCCESS	(no congestion)
1773  *	NET_RX_DROP     (packet was dropped, but freed)
1774  *
1775  * dev_forward_skb can be used for injecting an skb from the
1776  * start_xmit function of one device into the receive queue
1777  * of another device.
1778  *
1779  * The receiving device may be in another namespace, so
1780  * we have to clear all information in the skb that could
1781  * impact namespace isolation.
1782  */
1783 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1784 {
1785 	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1786 }
1787 EXPORT_SYMBOL_GPL(dev_forward_skb);
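
/*
 * Example (illustrative sketch; "peer" is a hypothetical net_device pointer):
 * a virtual driver's start_xmit handler looping a packet into its peer's
 * receive path, in the style of veth-like devices.
 *
 *	static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS)
 *			dev->stats.tx_packets++;
 *		return NETDEV_TX_OK;
 *	}
 */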
1788 
1789 static inline int deliver_skb(struct sk_buff *skb,
1790 			      struct packet_type *pt_prev,
1791 			      struct net_device *orig_dev)
1792 {
1793 	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1794 		return -ENOMEM;
1795 	atomic_inc(&skb->users);
1796 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1797 }
1798 
1799 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1800 					  struct packet_type **pt,
1801 					  struct net_device *orig_dev,
1802 					  __be16 type,
1803 					  struct list_head *ptype_list)
1804 {
1805 	struct packet_type *ptype, *pt_prev = *pt;
1806 
1807 	list_for_each_entry_rcu(ptype, ptype_list, list) {
1808 		if (ptype->type != type)
1809 			continue;
1810 		if (pt_prev)
1811 			deliver_skb(skb, pt_prev, orig_dev);
1812 		pt_prev = ptype;
1813 	}
1814 	*pt = pt_prev;
1815 }
1816 
1817 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1818 {
1819 	if (!ptype->af_packet_priv || !skb->sk)
1820 		return false;
1821 
1822 	if (ptype->id_match)
1823 		return ptype->id_match(ptype, skb->sk);
1824 	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1825 		return true;
1826 
1827 	return false;
1828 }
1829 
1830 /*
1831  *	Support routine. Sends outgoing frames to any network
1832  *	taps currently in use.
1833  */
1834 
1835 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1836 {
1837 	struct packet_type *ptype;
1838 	struct sk_buff *skb2 = NULL;
1839 	struct packet_type *pt_prev = NULL;
1840 	struct list_head *ptype_list = &ptype_all;
1841 
1842 	rcu_read_lock();
1843 again:
1844 	list_for_each_entry_rcu(ptype, ptype_list, list) {
1845 		/* Never send packets back to the socket
1846 		 * they originated from - MvS (miquels@drinkel.ow.org)
1847 		 */
1848 		if (skb_loop_sk(ptype, skb))
1849 			continue;
1850 
1851 		if (pt_prev) {
1852 			deliver_skb(skb2, pt_prev, skb->dev);
1853 			pt_prev = ptype;
1854 			continue;
1855 		}
1856 
1857 		/* need to clone skb, done only once */
1858 		skb2 = skb_clone(skb, GFP_ATOMIC);
1859 		if (!skb2)
1860 			goto out_unlock;
1861 
1862 		net_timestamp_set(skb2);
1863 
1864 		/* The network header should already be set correctly
1865 		 * by the sender, so the check below is just protection
1866 		 * against buggy protocols.
1867 		 */
1868 		skb_reset_mac_header(skb2);
1869 
1870 		if (skb_network_header(skb2) < skb2->data ||
1871 		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1872 			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1873 					     ntohs(skb2->protocol),
1874 					     dev->name);
1875 			skb_reset_network_header(skb2);
1876 		}
1877 
1878 		skb2->transport_header = skb2->network_header;
1879 		skb2->pkt_type = PACKET_OUTGOING;
1880 		pt_prev = ptype;
1881 	}
1882 
1883 	if (ptype_list == &ptype_all) {
1884 		ptype_list = &dev->ptype_all;
1885 		goto again;
1886 	}
1887 out_unlock:
1888 	if (pt_prev)
1889 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1890 	rcu_read_unlock();
1891 }
1892 
1893 /**
1894  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1895  * @dev: Network device
1896  * @txq: number of queues available
1897  *
1898  * If real_num_tx_queues is changed, the tc mappings may no longer be
1899  * valid. To resolve this, verify that each tc mapping remains valid and,
1900  * if it does not, reset the mapping to TC0. With no priorities mapping to
1901  * an offset/count pair, that pair will no longer be used. In the worst
1902  * case, when TC0 itself is invalid, nothing can be done, so priority
1903  * mappings are disabled entirely. It is expected that drivers will fix
1904  * this mapping if they can before calling netif_set_real_num_tx_queues.
1905  */
1906 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1907 {
1908 	int i;
1909 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1910 
1911 	/* If TC0 is invalidated disable TC mapping */
1912 	if (tc->offset + tc->count > txq) {
1913 		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1914 		dev->num_tc = 0;
1915 		return;
1916 	}
1917 
1918 	/* Invalidated prio to tc mappings set to TC0 */
1919 	for (i = 1; i < TC_BITMASK + 1; i++) {
1920 		int q = netdev_get_prio_tc_map(dev, i);
1921 
1922 		tc = &dev->tc_to_txq[q];
1923 		if (tc->offset + tc->count > txq) {
1924 			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1925 				i, q);
1926 			netdev_set_prio_tc_map(dev, i, 0);
1927 		}
1928 	}
1929 }
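/*
 * Illustrative sketch (not part of the upstream file): how a multiqueue
 * driver might establish the prio -> tc -> txq mapping that netif_setup_tc()
 * above later re-validates.  Two traffic classes of four queues each and the
 * priority split are assumptions made up for this example.
 */
static int example_setup_two_tcs(struct net_device *dev)
{
	int i, err;

	err = netdev_set_num_tc(dev, 2);
	if (err)
		return err;

	/* TC0 covers txq 0-3, TC1 covers txq 4-7 */
	netdev_set_tc_queue(dev, 0, 4, 0);
	netdev_set_tc_queue(dev, 1, 4, 4);

	/* map priorities 0-3 to TC0 and 4-7 to TC1 */
	for (i = 0; i < 8; i++)
		netdev_set_prio_tc_map(dev, i, i < 4 ? 0 : 1);

	return 0;
}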
1930 
1931 #ifdef CONFIG_XPS
1932 static DEFINE_MUTEX(xps_map_mutex);
1933 #define xmap_dereference(P)		\
1934 	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1935 
1936 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1937 					int cpu, u16 index)
1938 {
1939 	struct xps_map *map = NULL;
1940 	int pos;
1941 
1942 	if (dev_maps)
1943 		map = xmap_dereference(dev_maps->cpu_map[cpu]);
1944 
1945 	for (pos = 0; map && pos < map->len; pos++) {
1946 		if (map->queues[pos] == index) {
1947 			if (map->len > 1) {
1948 				map->queues[pos] = map->queues[--map->len];
1949 			} else {
1950 				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1951 				kfree_rcu(map, rcu);
1952 				map = NULL;
1953 			}
1954 			break;
1955 		}
1956 	}
1957 
1958 	return map;
1959 }
1960 
1961 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1962 {
1963 	struct xps_dev_maps *dev_maps;
1964 	int cpu, i;
1965 	bool active = false;
1966 
1967 	mutex_lock(&xps_map_mutex);
1968 	dev_maps = xmap_dereference(dev->xps_maps);
1969 
1970 	if (!dev_maps)
1971 		goto out_no_maps;
1972 
1973 	for_each_possible_cpu(cpu) {
1974 		for (i = index; i < dev->num_tx_queues; i++) {
1975 			if (!remove_xps_queue(dev_maps, cpu, i))
1976 				break;
1977 		}
1978 		if (i == dev->num_tx_queues)
1979 			active = true;
1980 	}
1981 
1982 	if (!active) {
1983 		RCU_INIT_POINTER(dev->xps_maps, NULL);
1984 		kfree_rcu(dev_maps, rcu);
1985 	}
1986 
1987 	for (i = index; i < dev->num_tx_queues; i++)
1988 		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1989 					     NUMA_NO_NODE);
1990 
1991 out_no_maps:
1992 	mutex_unlock(&xps_map_mutex);
1993 }
1994 
1995 static struct xps_map *expand_xps_map(struct xps_map *map,
1996 				      int cpu, u16 index)
1997 {
1998 	struct xps_map *new_map;
1999 	int alloc_len = XPS_MIN_MAP_ALLOC;
2000 	int i, pos;
2001 
2002 	for (pos = 0; map && pos < map->len; pos++) {
2003 		if (map->queues[pos] != index)
2004 			continue;
2005 		return map;
2006 	}
2007 
2008 	/* Need to add queue to this CPU's existing map */
2009 	if (map) {
2010 		if (pos < map->alloc_len)
2011 			return map;
2012 
2013 		alloc_len = map->alloc_len * 2;
2014 	}
2015 
2016 	/* Need to allocate a new map to store this queue on this CPU */
2017 	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2018 			       cpu_to_node(cpu));
2019 	if (!new_map)
2020 		return NULL;
2021 
2022 	for (i = 0; i < pos; i++)
2023 		new_map->queues[i] = map->queues[i];
2024 	new_map->alloc_len = alloc_len;
2025 	new_map->len = pos;
2026 
2027 	return new_map;
2028 }
2029 
2030 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2031 			u16 index)
2032 {
2033 	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2034 	struct xps_map *map, *new_map;
2035 	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2036 	int cpu, numa_node_id = -2;
2037 	bool active = false;
2038 
2039 	mutex_lock(&xps_map_mutex);
2040 
2041 	dev_maps = xmap_dereference(dev->xps_maps);
2042 
2043 	/* allocate memory for queue storage */
2044 	for_each_online_cpu(cpu) {
2045 		if (!cpumask_test_cpu(cpu, mask))
2046 			continue;
2047 
2048 		if (!new_dev_maps)
2049 			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2050 		if (!new_dev_maps) {
2051 			mutex_unlock(&xps_map_mutex);
2052 			return -ENOMEM;
2053 		}
2054 
2055 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2056 				 NULL;
2057 
2058 		map = expand_xps_map(map, cpu, index);
2059 		if (!map)
2060 			goto error;
2061 
2062 		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2063 	}
2064 
2065 	if (!new_dev_maps)
2066 		goto out_no_new_maps;
2067 
2068 	for_each_possible_cpu(cpu) {
2069 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2070 			/* add queue to CPU maps */
2071 			int pos = 0;
2072 
2073 			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2074 			while ((pos < map->len) && (map->queues[pos] != index))
2075 				pos++;
2076 
2077 			if (pos == map->len)
2078 				map->queues[map->len++] = index;
2079 #ifdef CONFIG_NUMA
2080 			if (numa_node_id == -2)
2081 				numa_node_id = cpu_to_node(cpu);
2082 			else if (numa_node_id != cpu_to_node(cpu))
2083 				numa_node_id = -1;
2084 #endif
2085 		} else if (dev_maps) {
2086 			/* fill in the new device map from the old device map */
2087 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
2088 			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2089 		}
2090 
2091 	}
2092 
2093 	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2094 
2095 	/* Cleanup old maps */
2096 	if (dev_maps) {
2097 		for_each_possible_cpu(cpu) {
2098 			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2099 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
2100 			if (map && map != new_map)
2101 				kfree_rcu(map, rcu);
2102 		}
2103 
2104 		kfree_rcu(dev_maps, rcu);
2105 	}
2106 
2107 	dev_maps = new_dev_maps;
2108 	active = true;
2109 
2110 out_no_new_maps:
2111 	/* update Tx queue numa node */
2112 	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2113 				     (numa_node_id >= 0) ? numa_node_id :
2114 				     NUMA_NO_NODE);
2115 
2116 	if (!dev_maps)
2117 		goto out_no_maps;
2118 
2119 	/* removes queue from unused CPUs */
2120 	for_each_possible_cpu(cpu) {
2121 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2122 			continue;
2123 
2124 		if (remove_xps_queue(dev_maps, cpu, index))
2125 			active = true;
2126 	}
2127 
2128 	/* free map if not active */
2129 	if (!active) {
2130 		RCU_INIT_POINTER(dev->xps_maps, NULL);
2131 		kfree_rcu(dev_maps, rcu);
2132 	}
2133 
2134 out_no_maps:
2135 	mutex_unlock(&xps_map_mutex);
2136 
2137 	return 0;
2138 error:
2139 	/* remove any maps that we added */
2140 	for_each_possible_cpu(cpu) {
2141 		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2142 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2143 				 NULL;
2144 		if (new_map && new_map != map)
2145 			kfree(new_map);
2146 	}
2147 
2148 	mutex_unlock(&xps_map_mutex);
2149 
2150 	kfree(new_dev_maps);
2151 	return -ENOMEM;
2152 }
2153 EXPORT_SYMBOL(netif_set_xps_queue);
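/*
 * Illustrative sketch (not part of the upstream file): a driver that
 * allocates one Tx queue per CPU could pin each queue to "its" CPU with
 * netif_set_xps_queue().  The 1:1 queue/CPU layout is an assumption made
 * for this example; real drivers pick whatever affinity suits them.
 */
static void example_setup_xps(struct net_device *dev)
{
	int cpu;

	for_each_online_cpu(cpu) {
		if (cpu >= (int)dev->real_num_tx_queues)
			break;
		/* best effort: give up on the first failure */
		if (netif_set_xps_queue(dev, cpumask_of(cpu), cpu))
			break;
	}
}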
2154 
2155 #endif
2156 /*
2157  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2158  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2159  */
2160 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2161 {
2162 	int rc;
2163 
2164 	if (txq < 1 || txq > dev->num_tx_queues)
2165 		return -EINVAL;
2166 
2167 	if (dev->reg_state == NETREG_REGISTERED ||
2168 	    dev->reg_state == NETREG_UNREGISTERING) {
2169 		ASSERT_RTNL();
2170 
2171 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2172 						  txq);
2173 		if (rc)
2174 			return rc;
2175 
2176 		if (dev->num_tc)
2177 			netif_setup_tc(dev, txq);
2178 
2179 		if (txq < dev->real_num_tx_queues) {
2180 			qdisc_reset_all_tx_gt(dev, txq);
2181 #ifdef CONFIG_XPS
2182 			netif_reset_xps_queues_gt(dev, txq);
2183 #endif
2184 		}
2185 	}
2186 
2187 	dev->real_num_tx_queues = txq;
2188 	return 0;
2189 }
2190 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2191 
2192 #ifdef CONFIG_SYSFS
2193 /**
2194  *	netif_set_real_num_rx_queues - set actual number of RX queues used
2195  *	@dev: Network device
2196  *	@rxq: Actual number of RX queues
2197  *
2198  *	This must be called either with the rtnl_lock held or before
2199  *	registration of the net device.  Returns 0 on success, or a
2200  *	negative error code.  If called before registration, it always
2201  *	succeeds.
2202  */
2203 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2204 {
2205 	int rc;
2206 
2207 	if (rxq < 1 || rxq > dev->num_rx_queues)
2208 		return -EINVAL;
2209 
2210 	if (dev->reg_state == NETREG_REGISTERED) {
2211 		ASSERT_RTNL();
2212 
2213 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2214 						  rxq);
2215 		if (rc)
2216 			return rc;
2217 	}
2218 
2219 	dev->real_num_rx_queues = rxq;
2220 	return 0;
2221 }
2222 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2223 #endif
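/*
 * Illustrative sketch (not part of the upstream file): resizing the active
 * queue sets at runtime, e.g. from an assumed ethtool set_channels handler.
 * Both helpers above require rtnl_lock once the device is registered.
 */
static int example_set_channels(struct net_device *dev, unsigned int count)
{
	int err;

	ASSERT_RTNL();

	err = netif_set_real_num_tx_queues(dev, count);
	if (err)
		return err;

	return netif_set_real_num_rx_queues(dev, count);
}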
2224 
2225 /**
2226  * netif_get_num_default_rss_queues - default number of RSS queues
2227  *
2228  * This routine should set an upper limit on the number of RSS queues
2229  * used by default by multiqueue devices.
2230  */
2231 int netif_get_num_default_rss_queues(void)
2232 {
2233 	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2234 }
2235 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
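/*
 * Illustrative sketch (not part of the upstream file): at probe time a
 * driver would typically clamp its hardware queue count ("hw_max", an
 * assumed value) with the default RSS limit returned above.
 */
static unsigned int example_pick_num_queues(unsigned int hw_max)
{
	return min_t(unsigned int, hw_max,
		     netif_get_num_default_rss_queues());
}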
2236 
2237 static inline void __netif_reschedule(struct Qdisc *q)
2238 {
2239 	struct softnet_data *sd;
2240 	unsigned long flags;
2241 
2242 	local_irq_save(flags);
2243 	sd = this_cpu_ptr(&softnet_data);
2244 	q->next_sched = NULL;
2245 	*sd->output_queue_tailp = q;
2246 	sd->output_queue_tailp = &q->next_sched;
2247 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2248 	local_irq_restore(flags);
2249 }
2250 
2251 void __netif_schedule(struct Qdisc *q)
2252 {
2253 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2254 		__netif_reschedule(q);
2255 }
2256 EXPORT_SYMBOL(__netif_schedule);
2257 
2258 struct dev_kfree_skb_cb {
2259 	enum skb_free_reason reason;
2260 };
2261 
2262 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2263 {
2264 	return (struct dev_kfree_skb_cb *)skb->cb;
2265 }
2266 
2267 void netif_schedule_queue(struct netdev_queue *txq)
2268 {
2269 	rcu_read_lock();
2270 	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2271 		struct Qdisc *q = rcu_dereference(txq->qdisc);
2272 
2273 		__netif_schedule(q);
2274 	}
2275 	rcu_read_unlock();
2276 }
2277 EXPORT_SYMBOL(netif_schedule_queue);
2278 
2279 /**
2280  *	netif_wake_subqueue - allow sending packets on subqueue
2281  *	@dev: network device
2282  *	@queue_index: sub queue index
2283  *
2284  * Resume individual transmit queue of a device with multiple transmit queues.
2285  */
2286 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2287 {
2288 	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2289 
2290 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2291 		struct Qdisc *q;
2292 
2293 		rcu_read_lock();
2294 		q = rcu_dereference(txq->qdisc);
2295 		__netif_schedule(q);
2296 		rcu_read_unlock();
2297 	}
2298 }
2299 EXPORT_SYMBOL(netif_wake_subqueue);
2300 
2301 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2302 {
2303 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2304 		struct Qdisc *q;
2305 
2306 		rcu_read_lock();
2307 		q = rcu_dereference(dev_queue->qdisc);
2308 		__netif_schedule(q);
2309 		rcu_read_unlock();
2310 	}
2311 }
2312 EXPORT_SYMBOL(netif_tx_wake_queue);
2313 
2314 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2315 {
2316 	unsigned long flags;
2317 
2318 	if (likely(atomic_read(&skb->users) == 1)) {
2319 		smp_rmb();
2320 		atomic_set(&skb->users, 0);
2321 	} else if (likely(!atomic_dec_and_test(&skb->users))) {
2322 		return;
2323 	}
2324 	get_kfree_skb_cb(skb)->reason = reason;
2325 	local_irq_save(flags);
2326 	skb->next = __this_cpu_read(softnet_data.completion_queue);
2327 	__this_cpu_write(softnet_data.completion_queue, skb);
2328 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2329 	local_irq_restore(flags);
2330 }
2331 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2332 
2333 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2334 {
2335 	if (in_irq() || irqs_disabled())
2336 		__dev_kfree_skb_irq(skb, reason);
2337 	else
2338 		dev_kfree_skb(skb);
2339 }
2340 EXPORT_SYMBOL(__dev_kfree_skb_any);
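/*
 * Illustrative sketch (not part of the upstream file): a Tx completion
 * handler that may run in hard-irq or process context and therefore uses
 * the _any variants built on __dev_kfree_skb_any() above.  The
 * "transmitted_ok" flag is an assumed per-descriptor status bit.
 */
static void example_tx_complete(struct sk_buff *skb, bool transmitted_ok)
{
	if (transmitted_ok)
		dev_consume_skb_any(skb);	/* normal completion, not a drop */
	else
		dev_kfree_skb_any(skb);		/* counts as a dropped packet */
}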
2341 
2342 
2343 /**
2344  * netif_device_detach - mark device as removed
2345  * @dev: network device
2346  *
2347  * Mark device as removed from system and therefore no longer available.
2348  */
2349 void netif_device_detach(struct net_device *dev)
2350 {
2351 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2352 	    netif_running(dev)) {
2353 		netif_tx_stop_all_queues(dev);
2354 	}
2355 }
2356 EXPORT_SYMBOL(netif_device_detach);
2357 
2358 /**
2359  * netif_device_attach - mark device as attached
2360  * @dev: network device
2361  *
2362  * Mark device as attached to the system and restart if needed.
2363  */
2364 void netif_device_attach(struct net_device *dev)
2365 {
2366 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2367 	    netif_running(dev)) {
2368 		netif_tx_wake_all_queues(dev);
2369 		__netdev_watchdog_up(dev);
2370 	}
2371 }
2372 EXPORT_SYMBOL(netif_device_attach);
2373 
2374 /*
2375  * Returns a Tx hash based on the given packet descriptor and a Tx queue
2376  * count to be used as a distribution range.
2377  */
2378 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2379 		  unsigned int num_tx_queues)
2380 {
2381 	u32 hash;
2382 	u16 qoffset = 0;
2383 	u16 qcount = num_tx_queues;
2384 
2385 	if (skb_rx_queue_recorded(skb)) {
2386 		hash = skb_get_rx_queue(skb);
2387 		while (unlikely(hash >= num_tx_queues))
2388 			hash -= num_tx_queues;
2389 		return hash;
2390 	}
2391 
2392 	if (dev->num_tc) {
2393 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2394 		qoffset = dev->tc_to_txq[tc].offset;
2395 		qcount = dev->tc_to_txq[tc].count;
2396 	}
2397 
2398 	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2399 }
2400 EXPORT_SYMBOL(__skb_tx_hash);
2401 
2402 static void skb_warn_bad_offload(const struct sk_buff *skb)
2403 {
2404 	static const netdev_features_t null_features = 0;
2405 	struct net_device *dev = skb->dev;
2406 	const char *driver = "";
2407 
2408 	if (!net_ratelimit())
2409 		return;
2410 
2411 	if (dev && dev->dev.parent)
2412 		driver = dev_driver_string(dev->dev.parent);
2413 
2414 	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2415 	     "gso_type=%d ip_summed=%d\n",
2416 	     driver, dev ? &dev->features : &null_features,
2417 	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2418 	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2419 	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2420 }
2421 
2422 /*
2423  * Invalidate hardware checksum when packet is to be mangled, and
2424  * complete checksum manually on outgoing path.
2425  */
2426 int skb_checksum_help(struct sk_buff *skb)
2427 {
2428 	__wsum csum;
2429 	int ret = 0, offset;
2430 
2431 	if (skb->ip_summed == CHECKSUM_COMPLETE)
2432 		goto out_set_summed;
2433 
2434 	if (unlikely(skb_shinfo(skb)->gso_size)) {
2435 		skb_warn_bad_offload(skb);
2436 		return -EINVAL;
2437 	}
2438 
2439 	/* Before computing a checksum, we should make sure no frag could
2440 	 * be modified by an external entity: the checksum could be wrong.
2441 	 */
2442 	if (skb_has_shared_frag(skb)) {
2443 		ret = __skb_linearize(skb);
2444 		if (ret)
2445 			goto out;
2446 	}
2447 
2448 	offset = skb_checksum_start_offset(skb);
2449 	BUG_ON(offset >= skb_headlen(skb));
2450 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2451 
2452 	offset += skb->csum_offset;
2453 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2454 
2455 	if (skb_cloned(skb) &&
2456 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2457 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2458 		if (ret)
2459 			goto out;
2460 	}
2461 
2462 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
2463 out_set_summed:
2464 	skb->ip_summed = CHECKSUM_NONE;
2465 out:
2466 	return ret;
2467 }
2468 EXPORT_SYMBOL(skb_checksum_help);
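/*
 * Illustrative sketch (not part of the upstream file): a driver whose
 * hardware can only insert checksums for plain IPv4 packets might fall
 * back to skb_checksum_help() for everything else.  The protocol test is
 * an assumption made for this example.
 */
static int example_tx_csum(struct sk_buff *skb)
{
	if (skb->ip_summed != CHECKSUM_PARTIAL)
		return 0;			/* nothing left to do */

	if (skb->protocol != htons(ETH_P_IP))
		return skb_checksum_help(skb);	/* software fallback */

	return 0;				/* hardware will fill it in */
}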
2469 
2470 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2471 {
2472 	__be16 type = skb->protocol;
2473 
2474 	/* Tunnel gso handlers can set protocol to ethernet. */
2475 	if (type == htons(ETH_P_TEB)) {
2476 		struct ethhdr *eth;
2477 
2478 		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2479 			return 0;
2480 
2481 		eth = (struct ethhdr *)skb_mac_header(skb);
2482 		type = eth->h_proto;
2483 	}
2484 
2485 	return __vlan_get_protocol(skb, type, depth);
2486 }
2487 
2488 /**
2489  *	skb_mac_gso_segment - mac layer segmentation handler.
2490  *	@skb: buffer to segment
2491  *	@features: features for the output path (see dev->features)
2492  */
2493 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2494 				    netdev_features_t features)
2495 {
2496 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2497 	struct packet_offload *ptype;
2498 	int vlan_depth = skb->mac_len;
2499 	__be16 type = skb_network_protocol(skb, &vlan_depth);
2500 
2501 	if (unlikely(!type))
2502 		return ERR_PTR(-EINVAL);
2503 
2504 	__skb_pull(skb, vlan_depth);
2505 
2506 	rcu_read_lock();
2507 	list_for_each_entry_rcu(ptype, &offload_base, list) {
2508 		if (ptype->type == type && ptype->callbacks.gso_segment) {
2509 			segs = ptype->callbacks.gso_segment(skb, features);
2510 			break;
2511 		}
2512 	}
2513 	rcu_read_unlock();
2514 
2515 	__skb_push(skb, skb->data - skb_mac_header(skb));
2516 
2517 	return segs;
2518 }
2519 EXPORT_SYMBOL(skb_mac_gso_segment);
2520 
2521 
2522 /* openvswitch calls this on rx path, so we need a different check.
2523  */
2524 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2525 {
2526 	if (tx_path)
2527 		return skb->ip_summed != CHECKSUM_PARTIAL;
2528 	else
2529 		return skb->ip_summed == CHECKSUM_NONE;
2530 }
2531 
2532 /**
2533  *	__skb_gso_segment - Perform segmentation on skb.
2534  *	@skb: buffer to segment
2535  *	@features: features for the output path (see dev->features)
2536  *	@tx_path: whether it is called in TX path
2537  *
2538  *	This function segments the given skb and returns a list of segments.
2539  *
2540  *	It may return NULL if the skb requires no segmentation.  This is
2541  *	only possible when GSO is used for verifying header integrity.
2542  */
2543 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2544 				  netdev_features_t features, bool tx_path)
2545 {
2546 	if (unlikely(skb_needs_check(skb, tx_path))) {
2547 		int err;
2548 
2549 		skb_warn_bad_offload(skb);
2550 
2551 		err = skb_cow_head(skb, 0);
2552 		if (err < 0)
2553 			return ERR_PTR(err);
2554 	}
2555 
2556 	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2557 	SKB_GSO_CB(skb)->encap_level = 0;
2558 
2559 	skb_reset_mac_header(skb);
2560 	skb_reset_mac_len(skb);
2561 
2562 	return skb_mac_gso_segment(skb, features);
2563 }
2564 EXPORT_SYMBOL(__skb_gso_segment);
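/*
 * Illustrative sketch (not part of the upstream file): segmenting an skb
 * with the tx-path wrapper skb_gso_segment() and walking the resulting
 * list.  Error handling is reduced to the minimum for the example.
 */
static int example_segment_and_count(struct sk_buff *skb,
				     netdev_features_t features)
{
	struct sk_buff *segs, *seg;
	int n = 0;

	segs = skb_gso_segment(skb, features);
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)
		return 1;	/* no segmentation was needed */

	for (seg = segs; seg; seg = seg->next)
		n++;

	/* the caller still owns the original skb and the segment list */
	return n;
}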
2565 
2566 /* Take action when hardware reception checksum errors are detected. */
2567 #ifdef CONFIG_BUG
2568 void netdev_rx_csum_fault(struct net_device *dev)
2569 {
2570 	if (net_ratelimit()) {
2571 		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2572 		dump_stack();
2573 	}
2574 }
2575 EXPORT_SYMBOL(netdev_rx_csum_fault);
2576 #endif
2577 
2578 /* Actually, we should eliminate this check as soon as we know that:
2579  * 1. An IOMMU is present and is able to map all of the memory.
2580  * 2. No high memory really exists on this machine.
2581  */
2582 
2583 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2584 {
2585 #ifdef CONFIG_HIGHMEM
2586 	int i;
2587 	if (!(dev->features & NETIF_F_HIGHDMA)) {
2588 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2589 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2590 			if (PageHighMem(skb_frag_page(frag)))
2591 				return 1;
2592 		}
2593 	}
2594 
2595 	if (PCI_DMA_BUS_IS_PHYS) {
2596 		struct device *pdev = dev->dev.parent;
2597 
2598 		if (!pdev)
2599 			return 0;
2600 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2601 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2602 			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2603 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2604 				return 1;
2605 		}
2606 	}
2607 #endif
2608 	return 0;
2609 }
2610 
2611 /* For an MPLS offload request, verify we are testing hardware MPLS
2612  * features instead of the standard features for the netdev.
2613  */
2614 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2615 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2616 					   netdev_features_t features,
2617 					   __be16 type)
2618 {
2619 	if (eth_p_mpls(type))
2620 		features &= skb->dev->mpls_features;
2621 
2622 	return features;
2623 }
2624 #else
2625 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2626 					   netdev_features_t features,
2627 					   __be16 type)
2628 {
2629 	return features;
2630 }
2631 #endif
2632 
2633 static netdev_features_t harmonize_features(struct sk_buff *skb,
2634 	netdev_features_t features)
2635 {
2636 	int tmp;
2637 	__be16 type;
2638 
2639 	type = skb_network_protocol(skb, &tmp);
2640 	features = net_mpls_features(skb, features, type);
2641 
2642 	if (skb->ip_summed != CHECKSUM_NONE &&
2643 	    !can_checksum_protocol(features, type)) {
2644 		features &= ~NETIF_F_ALL_CSUM;
2645 	} else if (illegal_highdma(skb->dev, skb)) {
2646 		features &= ~NETIF_F_SG;
2647 	}
2648 
2649 	return features;
2650 }
2651 
2652 netdev_features_t passthru_features_check(struct sk_buff *skb,
2653 					  struct net_device *dev,
2654 					  netdev_features_t features)
2655 {
2656 	return features;
2657 }
2658 EXPORT_SYMBOL(passthru_features_check);
2659 
2660 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2661 					     struct net_device *dev,
2662 					     netdev_features_t features)
2663 {
2664 	return vlan_features_check(skb, features);
2665 }
2666 
2667 netdev_features_t netif_skb_features(struct sk_buff *skb)
2668 {
2669 	struct net_device *dev = skb->dev;
2670 	netdev_features_t features = dev->features;
2671 	u16 gso_segs = skb_shinfo(skb)->gso_segs;
2672 
2673 	if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2674 		features &= ~NETIF_F_GSO_MASK;
2675 
2676 	/* For an encapsulation offload request, verify we are testing
2677 	 * hardware encapsulation features instead of the standard
2678 	 * features for the netdev.
2679 	 */
2680 	if (skb->encapsulation)
2681 		features &= dev->hw_enc_features;
2682 
2683 	if (skb_vlan_tagged(skb))
2684 		features = netdev_intersect_features(features,
2685 						     dev->vlan_features |
2686 						     NETIF_F_HW_VLAN_CTAG_TX |
2687 						     NETIF_F_HW_VLAN_STAG_TX);
2688 
2689 	if (dev->netdev_ops->ndo_features_check)
2690 		features &= dev->netdev_ops->ndo_features_check(skb, dev,
2691 								features);
2692 	else
2693 		features &= dflt_features_check(skb, dev, features);
2694 
2695 	return harmonize_features(skb, features);
2696 }
2697 EXPORT_SYMBOL(netif_skb_features);
2698 
2699 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2700 		    struct netdev_queue *txq, bool more)
2701 {
2702 	unsigned int len;
2703 	int rc;
2704 
2705 	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2706 		dev_queue_xmit_nit(skb, dev);
2707 
2708 	len = skb->len;
2709 	trace_net_dev_start_xmit(skb, dev);
2710 	rc = netdev_start_xmit(skb, dev, txq, more);
2711 	trace_net_dev_xmit(skb, rc, dev, len);
2712 
2713 	return rc;
2714 }
2715 
2716 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2717 				    struct netdev_queue *txq, int *ret)
2718 {
2719 	struct sk_buff *skb = first;
2720 	int rc = NETDEV_TX_OK;
2721 
2722 	while (skb) {
2723 		struct sk_buff *next = skb->next;
2724 
2725 		skb->next = NULL;
2726 		rc = xmit_one(skb, dev, txq, next != NULL);
2727 		if (unlikely(!dev_xmit_complete(rc))) {
2728 			skb->next = next;
2729 			goto out;
2730 		}
2731 
2732 		skb = next;
2733 		if (netif_xmit_stopped(txq) && skb) {
2734 			rc = NETDEV_TX_BUSY;
2735 			break;
2736 		}
2737 	}
2738 
2739 out:
2740 	*ret = rc;
2741 	return skb;
2742 }
2743 
2744 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2745 					  netdev_features_t features)
2746 {
2747 	if (skb_vlan_tag_present(skb) &&
2748 	    !vlan_hw_offload_capable(features, skb->vlan_proto))
2749 		skb = __vlan_hwaccel_push_inside(skb);
2750 	return skb;
2751 }
2752 
2753 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2754 {
2755 	netdev_features_t features;
2756 
2757 	if (skb->next)
2758 		return skb;
2759 
2760 	features = netif_skb_features(skb);
2761 	skb = validate_xmit_vlan(skb, features);
2762 	if (unlikely(!skb))
2763 		goto out_null;
2764 
2765 	if (netif_needs_gso(skb, features)) {
2766 		struct sk_buff *segs;
2767 
2768 		segs = skb_gso_segment(skb, features);
2769 		if (IS_ERR(segs)) {
2770 			goto out_kfree_skb;
2771 		} else if (segs) {
2772 			consume_skb(skb);
2773 			skb = segs;
2774 		}
2775 	} else {
2776 		if (skb_needs_linearize(skb, features) &&
2777 		    __skb_linearize(skb))
2778 			goto out_kfree_skb;
2779 
2780 		/* If packet is not checksummed and device does not
2781 		 * support checksumming for this protocol, complete
2782 		 * checksumming here.
2783 		 */
2784 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
2785 			if (skb->encapsulation)
2786 				skb_set_inner_transport_header(skb,
2787 							       skb_checksum_start_offset(skb));
2788 			else
2789 				skb_set_transport_header(skb,
2790 							 skb_checksum_start_offset(skb));
2791 			if (!(features & NETIF_F_ALL_CSUM) &&
2792 			    skb_checksum_help(skb))
2793 				goto out_kfree_skb;
2794 		}
2795 	}
2796 
2797 	return skb;
2798 
2799 out_kfree_skb:
2800 	kfree_skb(skb);
2801 out_null:
2802 	return NULL;
2803 }
2804 
2805 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2806 {
2807 	struct sk_buff *next, *head = NULL, *tail;
2808 
2809 	for (; skb != NULL; skb = next) {
2810 		next = skb->next;
2811 		skb->next = NULL;
2812 
2813 		/* in case the skb won't be segmented, point to itself */
2814 		skb->prev = skb;
2815 
2816 		skb = validate_xmit_skb(skb, dev);
2817 		if (!skb)
2818 			continue;
2819 
2820 		if (!head)
2821 			head = skb;
2822 		else
2823 			tail->next = skb;
2824 		/* If skb was segmented, skb->prev points to
2825 		 * the last segment. If not, it still contains skb.
2826 		 */
2827 		tail = skb->prev;
2828 	}
2829 	return head;
2830 }
2831 
2832 static void qdisc_pkt_len_init(struct sk_buff *skb)
2833 {
2834 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
2835 
2836 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2837 
2838 	/* To get a more precise estimate of the bytes sent on the wire,
2839 	 * we add to pkt_len the header size of all segments
2840 	 */
2841 	if (shinfo->gso_size)  {
2842 		unsigned int hdr_len;
2843 		u16 gso_segs = shinfo->gso_segs;
2844 
2845 		/* mac layer + network layer */
2846 		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2847 
2848 		/* + transport layer */
2849 		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2850 			hdr_len += tcp_hdrlen(skb);
2851 		else
2852 			hdr_len += sizeof(struct udphdr);
2853 
2854 		if (shinfo->gso_type & SKB_GSO_DODGY)
2855 			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2856 						shinfo->gso_size);
2857 
2858 		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2859 	}
2860 }
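/*
 * Worked example (added for illustration, numbers assumed): a TCP GSO skb
 * carrying 2 * 1448 bytes of payload behind 54 bytes of headers
 * (14 Ethernet + 20 IPv4 + 20 TCP) has skb->len = 2950 and gso_segs = 2.
 * qdisc_pkt_len_init() above therefore reports
 * pkt_len = 2950 + (2 - 1) * 54 = 3004 bytes, i.e. two on-wire frames of
 * 1502 bytes each, rather than the raw skb->len.
 */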
2861 
2862 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2863 				 struct net_device *dev,
2864 				 struct netdev_queue *txq)
2865 {
2866 	spinlock_t *root_lock = qdisc_lock(q);
2867 	bool contended;
2868 	int rc;
2869 
2870 	qdisc_pkt_len_init(skb);
2871 	qdisc_calculate_pkt_len(skb, q);
2872 	/*
2873 	 * Heuristic to force contended enqueues to serialize on a
2874 	 * separate lock before trying to get the qdisc main lock.
2875 	 * This permits the __QDISC___STATE_RUNNING owner to get the lock more
2876 	 * often and dequeue packets faster.
2877 	 */
2878 	contended = qdisc_is_running(q);
2879 	if (unlikely(contended))
2880 		spin_lock(&q->busylock);
2881 
2882 	spin_lock(root_lock);
2883 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2884 		kfree_skb(skb);
2885 		rc = NET_XMIT_DROP;
2886 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2887 		   qdisc_run_begin(q)) {
2888 		/*
2889 		 * This is a work-conserving queue; there are no old skbs
2890 		 * waiting to be sent out; and the qdisc is not running -
2891 		 * xmit the skb directly.
2892 		 */
2893 
2894 		qdisc_bstats_update(q, skb);
2895 
2896 		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2897 			if (unlikely(contended)) {
2898 				spin_unlock(&q->busylock);
2899 				contended = false;
2900 			}
2901 			__qdisc_run(q);
2902 		} else
2903 			qdisc_run_end(q);
2904 
2905 		rc = NET_XMIT_SUCCESS;
2906 	} else {
2907 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2908 		if (qdisc_run_begin(q)) {
2909 			if (unlikely(contended)) {
2910 				spin_unlock(&q->busylock);
2911 				contended = false;
2912 			}
2913 			__qdisc_run(q);
2914 		}
2915 	}
2916 	spin_unlock(root_lock);
2917 	if (unlikely(contended))
2918 		spin_unlock(&q->busylock);
2919 	return rc;
2920 }
2921 
2922 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2923 static void skb_update_prio(struct sk_buff *skb)
2924 {
2925 	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2926 
2927 	if (!skb->priority && skb->sk && map) {
2928 		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2929 
2930 		if (prioidx < map->priomap_len)
2931 			skb->priority = map->priomap[prioidx];
2932 	}
2933 }
2934 #else
2935 #define skb_update_prio(skb)
2936 #endif
2937 
2938 DEFINE_PER_CPU(int, xmit_recursion);
2939 EXPORT_SYMBOL(xmit_recursion);
2940 
2941 #define RECURSION_LIMIT 10
2942 
2943 /**
2944  *	dev_loopback_xmit - loop back @skb
2945  *	@skb: buffer to transmit
2946  */
2947 int dev_loopback_xmit(struct sock *sk, struct sk_buff *skb)
2948 {
2949 	skb_reset_mac_header(skb);
2950 	__skb_pull(skb, skb_network_offset(skb));
2951 	skb->pkt_type = PACKET_LOOPBACK;
2952 	skb->ip_summed = CHECKSUM_UNNECESSARY;
2953 	WARN_ON(!skb_dst(skb));
2954 	skb_dst_force(skb);
2955 	netif_rx_ni(skb);
2956 	return 0;
2957 }
2958 EXPORT_SYMBOL(dev_loopback_xmit);
2959 
2960 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2961 {
2962 #ifdef CONFIG_XPS
2963 	struct xps_dev_maps *dev_maps;
2964 	struct xps_map *map;
2965 	int queue_index = -1;
2966 
2967 	rcu_read_lock();
2968 	dev_maps = rcu_dereference(dev->xps_maps);
2969 	if (dev_maps) {
2970 		map = rcu_dereference(
2971 		    dev_maps->cpu_map[skb->sender_cpu - 1]);
2972 		if (map) {
2973 			if (map->len == 1)
2974 				queue_index = map->queues[0];
2975 			else
2976 				queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
2977 									   map->len)];
2978 			if (unlikely(queue_index >= dev->real_num_tx_queues))
2979 				queue_index = -1;
2980 		}
2981 	}
2982 	rcu_read_unlock();
2983 
2984 	return queue_index;
2985 #else
2986 	return -1;
2987 #endif
2988 }
2989 
2990 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
2991 {
2992 	struct sock *sk = skb->sk;
2993 	int queue_index = sk_tx_queue_get(sk);
2994 
2995 	if (queue_index < 0 || skb->ooo_okay ||
2996 	    queue_index >= dev->real_num_tx_queues) {
2997 		int new_index = get_xps_queue(dev, skb);
2998 		if (new_index < 0)
2999 			new_index = skb_tx_hash(dev, skb);
3000 
3001 		if (queue_index != new_index && sk &&
3002 		    rcu_access_pointer(sk->sk_dst_cache))
3003 			sk_tx_queue_set(sk, new_index);
3004 
3005 		queue_index = new_index;
3006 	}
3007 
3008 	return queue_index;
3009 }
3010 
3011 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3012 				    struct sk_buff *skb,
3013 				    void *accel_priv)
3014 {
3015 	int queue_index = 0;
3016 
3017 #ifdef CONFIG_XPS
3018 	if (skb->sender_cpu == 0)
3019 		skb->sender_cpu = raw_smp_processor_id() + 1;
3020 #endif
3021 
3022 	if (dev->real_num_tx_queues != 1) {
3023 		const struct net_device_ops *ops = dev->netdev_ops;
3024 		if (ops->ndo_select_queue)
3025 			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3026 							    __netdev_pick_tx);
3027 		else
3028 			queue_index = __netdev_pick_tx(dev, skb);
3029 
3030 		if (!accel_priv)
3031 			queue_index = netdev_cap_txqueue(dev, queue_index);
3032 	}
3033 
3034 	skb_set_queue_mapping(skb, queue_index);
3035 	return netdev_get_tx_queue(dev, queue_index);
3036 }
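/*
 * Illustrative sketch (not part of the upstream file): an ndo_select_queue
 * implementation that applies a driver-specific rule and otherwise defers
 * to the core fallback (__netdev_pick_tx above, passed in by
 * netdev_pick_tx()).  The "priority >= 4 uses the last queue" rule is an
 * assumption invented for this example.
 */
static u16 example_select_queue(struct net_device *dev, struct sk_buff *skb,
				void *accel_priv,
				select_queue_fallback_t fallback)
{
	if (skb->priority >= 4)
		return dev->real_num_tx_queues - 1;

	return fallback(dev, skb);
}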
3037 
3038 /**
3039  *	__dev_queue_xmit - transmit a buffer
3040  *	@skb: buffer to transmit
3041  *	@accel_priv: private data used for L2 forwarding offload
3042  *
3043  *	Queue a buffer for transmission to a network device. The caller must
3044  *	have set the device and priority and built the buffer before calling
3045  *	this function. The function can be called from an interrupt.
3046  *
3047  *	A negative errno code is returned on a failure. A success does not
3048  *	guarantee the frame will be transmitted as it may be dropped due
3049  *	to congestion or traffic shaping.
3050  *
3051  * -----------------------------------------------------------------------------------
3052  *      I notice this method can also return errors from the queue disciplines,
3053  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3054  *      be positive.
3055  *
3056  *      Regardless of the return value, the skb is consumed, so it is currently
3057  *      difficult to retry a send to this method.  (You can bump the ref count
3058  *      before sending to hold a reference for retry if you are careful.)
3059  *
3060  *      When calling this method, interrupts MUST be enabled.  This is because
3061  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3062  *          --BLG
3063  */
3064 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3065 {
3066 	struct net_device *dev = skb->dev;
3067 	struct netdev_queue *txq;
3068 	struct Qdisc *q;
3069 	int rc = -ENOMEM;
3070 
3071 	skb_reset_mac_header(skb);
3072 
3073 	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3074 		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3075 
3076 	/* Disable soft irqs for various locks below. Also
3077 	 * stops preemption for RCU.
3078 	 */
3079 	rcu_read_lock_bh();
3080 
3081 	skb_update_prio(skb);
3082 
3083 	/* If device/qdisc don't need skb->dst, release it right now while
3084 	 * it's hot in this CPU's cache.
3085 	 */
3086 	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3087 		skb_dst_drop(skb);
3088 	else
3089 		skb_dst_force(skb);
3090 
3091 #ifdef CONFIG_NET_SWITCHDEV
3092 	/* Don't forward if offload device already forwarded */
3093 	if (skb->offload_fwd_mark &&
3094 	    skb->offload_fwd_mark == dev->offload_fwd_mark) {
3095 		consume_skb(skb);
3096 		rc = NET_XMIT_SUCCESS;
3097 		goto out;
3098 	}
3099 #endif
3100 
3101 	txq = netdev_pick_tx(dev, skb, accel_priv);
3102 	q = rcu_dereference_bh(txq->qdisc);
3103 
3104 #ifdef CONFIG_NET_CLS_ACT
3105 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3106 #endif
3107 	trace_net_dev_queue(skb);
3108 	if (q->enqueue) {
3109 		rc = __dev_xmit_skb(skb, q, dev, txq);
3110 		goto out;
3111 	}
3112 
3113 	/* The device has no queue. This is the common case for software
3114 	   devices: loopback, all sorts of tunnels...
3115 
3116 	   Really, it is unlikely that netif_tx_lock protection is necessary
3117 	   here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
3118 	   counters.)
3119 	   However, it is possible that they rely on the protection
3120 	   made by us here.
3121 
3122 	   Check this and take the lock. It is not prone to deadlocks.
3123 	   Or take the noqueue qdisc path, it is even simpler 8)
3124 	 */
3125 	if (dev->flags & IFF_UP) {
3126 		int cpu = smp_processor_id(); /* ok because BHs are off */
3127 
3128 		if (txq->xmit_lock_owner != cpu) {
3129 
3130 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
3131 				goto recursion_alert;
3132 
3133 			skb = validate_xmit_skb(skb, dev);
3134 			if (!skb)
3135 				goto drop;
3136 
3137 			HARD_TX_LOCK(dev, txq, cpu);
3138 
3139 			if (!netif_xmit_stopped(txq)) {
3140 				__this_cpu_inc(xmit_recursion);
3141 				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3142 				__this_cpu_dec(xmit_recursion);
3143 				if (dev_xmit_complete(rc)) {
3144 					HARD_TX_UNLOCK(dev, txq);
3145 					goto out;
3146 				}
3147 			}
3148 			HARD_TX_UNLOCK(dev, txq);
3149 			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3150 					     dev->name);
3151 		} else {
3152 			/* Recursion is detected! It is possible,
3153 			 * unfortunately
3154 			 */
3155 recursion_alert:
3156 			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3157 					     dev->name);
3158 		}
3159 	}
3160 
3161 	rc = -ENETDOWN;
3162 drop:
3163 	rcu_read_unlock_bh();
3164 
3165 	atomic_long_inc(&dev->tx_dropped);
3166 	kfree_skb_list(skb);
3167 	return rc;
3168 out:
3169 	rcu_read_unlock_bh();
3170 	return rc;
3171 }
3172 
3173 int dev_queue_xmit_sk(struct sock *sk, struct sk_buff *skb)
3174 {
3175 	return __dev_queue_xmit(skb, NULL);
3176 }
3177 EXPORT_SYMBOL(dev_queue_xmit_sk);
3178 
3179 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3180 {
3181 	return __dev_queue_xmit(skb, accel_priv);
3182 }
3183 EXPORT_SYMBOL(dev_queue_xmit_accel);
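/*
 * Illustrative sketch (not part of the upstream file): the minimum a caller
 * needs before handing a fully built frame (L2 header included and
 * skb->protocol already set) to dev_queue_xmit().  Building the frame
 * itself is assumed to have happened elsewhere.
 */
static int example_xmit_frame(struct net_device *dev, struct sk_buff *skb)
{
	skb->dev = dev;

	/* may return positive NET_XMIT_* codes, see the comment above */
	return dev_queue_xmit(skb);
}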
3184 
3185 
3186 /*=======================================================================
3187 			Receiver routines
3188   =======================================================================*/
3189 
3190 int netdev_max_backlog __read_mostly = 1000;
3191 EXPORT_SYMBOL(netdev_max_backlog);
3192 
3193 int netdev_tstamp_prequeue __read_mostly = 1;
3194 int netdev_budget __read_mostly = 300;
3195 int weight_p __read_mostly = 64;            /* old backlog weight */
3196 
3197 /* Called with irq disabled */
3198 static inline void ____napi_schedule(struct softnet_data *sd,
3199 				     struct napi_struct *napi)
3200 {
3201 	list_add_tail(&napi->poll_list, &sd->poll_list);
3202 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3203 }
3204 
3205 #ifdef CONFIG_RPS
3206 
3207 /* One global table that all flow-based protocols share. */
3208 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3209 EXPORT_SYMBOL(rps_sock_flow_table);
3210 u32 rps_cpu_mask __read_mostly;
3211 EXPORT_SYMBOL(rps_cpu_mask);
3212 
3213 struct static_key rps_needed __read_mostly;
3214 
3215 static struct rps_dev_flow *
3216 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3217 	    struct rps_dev_flow *rflow, u16 next_cpu)
3218 {
3219 	if (next_cpu < nr_cpu_ids) {
3220 #ifdef CONFIG_RFS_ACCEL
3221 		struct netdev_rx_queue *rxqueue;
3222 		struct rps_dev_flow_table *flow_table;
3223 		struct rps_dev_flow *old_rflow;
3224 		u32 flow_id;
3225 		u16 rxq_index;
3226 		int rc;
3227 
3228 		/* Should we steer this flow to a different hardware queue? */
3229 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3230 		    !(dev->features & NETIF_F_NTUPLE))
3231 			goto out;
3232 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3233 		if (rxq_index == skb_get_rx_queue(skb))
3234 			goto out;
3235 
3236 		rxqueue = dev->_rx + rxq_index;
3237 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
3238 		if (!flow_table)
3239 			goto out;
3240 		flow_id = skb_get_hash(skb) & flow_table->mask;
3241 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3242 							rxq_index, flow_id);
3243 		if (rc < 0)
3244 			goto out;
3245 		old_rflow = rflow;
3246 		rflow = &flow_table->flows[flow_id];
3247 		rflow->filter = rc;
3248 		if (old_rflow->filter == rflow->filter)
3249 			old_rflow->filter = RPS_NO_FILTER;
3250 	out:
3251 #endif
3252 		rflow->last_qtail =
3253 			per_cpu(softnet_data, next_cpu).input_queue_head;
3254 	}
3255 
3256 	rflow->cpu = next_cpu;
3257 	return rflow;
3258 }
3259 
3260 /*
3261  * get_rps_cpu is called from netif_receive_skb and returns the target
3262  * CPU from the RPS map of the receiving queue for a given skb.
3263  * rcu_read_lock must be held on entry.
3264  */
3265 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3266 		       struct rps_dev_flow **rflowp)
3267 {
3268 	const struct rps_sock_flow_table *sock_flow_table;
3269 	struct netdev_rx_queue *rxqueue = dev->_rx;
3270 	struct rps_dev_flow_table *flow_table;
3271 	struct rps_map *map;
3272 	int cpu = -1;
3273 	u32 tcpu;
3274 	u32 hash;
3275 
3276 	if (skb_rx_queue_recorded(skb)) {
3277 		u16 index = skb_get_rx_queue(skb);
3278 
3279 		if (unlikely(index >= dev->real_num_rx_queues)) {
3280 			WARN_ONCE(dev->real_num_rx_queues > 1,
3281 				  "%s received packet on queue %u, but number "
3282 				  "of RX queues is %u\n",
3283 				  dev->name, index, dev->real_num_rx_queues);
3284 			goto done;
3285 		}
3286 		rxqueue += index;
3287 	}
3288 
3289 	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3290 
3291 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3292 	map = rcu_dereference(rxqueue->rps_map);
3293 	if (!flow_table && !map)
3294 		goto done;
3295 
3296 	skb_reset_network_header(skb);
3297 	hash = skb_get_hash(skb);
3298 	if (!hash)
3299 		goto done;
3300 
3301 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3302 	if (flow_table && sock_flow_table) {
3303 		struct rps_dev_flow *rflow;
3304 		u32 next_cpu;
3305 		u32 ident;
3306 
3307 		/* First, check the global flow table for a match */
3308 		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3309 		if ((ident ^ hash) & ~rps_cpu_mask)
3310 			goto try_rps;
3311 
3312 		next_cpu = ident & rps_cpu_mask;
3313 
3314 		/* OK, now we know there is a match,
3315 		 * we can look at the local (per receive queue) flow table
3316 		 */
3317 		rflow = &flow_table->flows[hash & flow_table->mask];
3318 		tcpu = rflow->cpu;
3319 
3320 		/*
3321 		 * If the desired CPU (where last recvmsg was done) is
3322 		 * different from current CPU (one in the rx-queue flow
3323 		 * table entry), switch if one of the following holds:
3324 		 *   - Current CPU is unset (>= nr_cpu_ids).
3325 		 *   - Current CPU is offline.
3326 		 *   - The current CPU's queue tail has advanced beyond the
3327 		 *     last packet that was enqueued using this table entry.
3328 		 *     This guarantees that all previous packets for the flow
3329 		 *     have been dequeued, thus preserving in order delivery.
3330 		 */
3331 		if (unlikely(tcpu != next_cpu) &&
3332 		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3333 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3334 		      rflow->last_qtail)) >= 0)) {
3335 			tcpu = next_cpu;
3336 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3337 		}
3338 
3339 		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3340 			*rflowp = rflow;
3341 			cpu = tcpu;
3342 			goto done;
3343 		}
3344 	}
3345 
3346 try_rps:
3347 
3348 	if (map) {
3349 		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3350 		if (cpu_online(tcpu)) {
3351 			cpu = tcpu;
3352 			goto done;
3353 		}
3354 	}
3355 
3356 done:
3357 	return cpu;
3358 }
3359 
3360 #ifdef CONFIG_RFS_ACCEL
3361 
3362 /**
3363  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3364  * @dev: Device on which the filter was set
3365  * @rxq_index: RX queue index
3366  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3367  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3368  *
3369  * Drivers that implement ndo_rx_flow_steer() should periodically call
3370  * this function for each installed filter and remove the filters for
3371  * which it returns %true.
3372  */
3373 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3374 			 u32 flow_id, u16 filter_id)
3375 {
3376 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3377 	struct rps_dev_flow_table *flow_table;
3378 	struct rps_dev_flow *rflow;
3379 	bool expire = true;
3380 	unsigned int cpu;
3381 
3382 	rcu_read_lock();
3383 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3384 	if (flow_table && flow_id <= flow_table->mask) {
3385 		rflow = &flow_table->flows[flow_id];
3386 		cpu = ACCESS_ONCE(rflow->cpu);
3387 		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3388 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3389 			   rflow->last_qtail) <
3390 		     (int)(10 * flow_table->mask)))
3391 			expire = false;
3392 	}
3393 	rcu_read_unlock();
3394 	return expire;
3395 }
3396 EXPORT_SYMBOL(rps_may_expire_flow);
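/*
 * Illustrative sketch (not part of the upstream file): the periodic scan
 * that the comment above asks ndo_rx_flow_steer() implementations to run.
 * The driver-private filter table ("struct example_filter", "flt",
 * "flt_count") is an assumption invented for this example.
 */
struct example_filter {
	bool in_use;
	u16 rxq_index;
	u32 flow_id;
};

static void example_expire_rfs_filters(struct net_device *dev,
					struct example_filter *flt,
					unsigned int flt_count)
{
	unsigned int i;

	for (i = 0; i < flt_count; i++) {
		if (!flt[i].in_use)
			continue;
		if (rps_may_expire_flow(dev, flt[i].rxq_index,
					flt[i].flow_id, i))
			flt[i].in_use = false;	/* also remove it from hw */
	}
}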
3397 
3398 #endif /* CONFIG_RFS_ACCEL */
3399 
3400 /* Called from hardirq (IPI) context */
3401 static void rps_trigger_softirq(void *data)
3402 {
3403 	struct softnet_data *sd = data;
3404 
3405 	____napi_schedule(sd, &sd->backlog);
3406 	sd->received_rps++;
3407 }
3408 
3409 #endif /* CONFIG_RPS */
3410 
3411 /*
3412  * Check if this softnet_data structure belongs to another CPU.
3413  * If yes, queue it to our IPI list and return 1;
3414  * if not, return 0.
3415  */
3416 static int rps_ipi_queued(struct softnet_data *sd)
3417 {
3418 #ifdef CONFIG_RPS
3419 	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3420 
3421 	if (sd != mysd) {
3422 		sd->rps_ipi_next = mysd->rps_ipi_list;
3423 		mysd->rps_ipi_list = sd;
3424 
3425 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3426 		return 1;
3427 	}
3428 #endif /* CONFIG_RPS */
3429 	return 0;
3430 }
3431 
3432 #ifdef CONFIG_NET_FLOW_LIMIT
3433 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3434 #endif
3435 
3436 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3437 {
3438 #ifdef CONFIG_NET_FLOW_LIMIT
3439 	struct sd_flow_limit *fl;
3440 	struct softnet_data *sd;
3441 	unsigned int old_flow, new_flow;
3442 
3443 	if (qlen < (netdev_max_backlog >> 1))
3444 		return false;
3445 
3446 	sd = this_cpu_ptr(&softnet_data);
3447 
3448 	rcu_read_lock();
3449 	fl = rcu_dereference(sd->flow_limit);
3450 	if (fl) {
3451 		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3452 		old_flow = fl->history[fl->history_head];
3453 		fl->history[fl->history_head] = new_flow;
3454 
3455 		fl->history_head++;
3456 		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3457 
3458 		if (likely(fl->buckets[old_flow]))
3459 			fl->buckets[old_flow]--;
3460 
3461 		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3462 			fl->count++;
3463 			rcu_read_unlock();
3464 			return true;
3465 		}
3466 	}
3467 	rcu_read_unlock();
3468 #endif
3469 	return false;
3470 }
3471 
3472 /*
3473  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3474  * queue (may be a remote CPU queue).
3475  */
3476 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3477 			      unsigned int *qtail)
3478 {
3479 	struct softnet_data *sd;
3480 	unsigned long flags;
3481 	unsigned int qlen;
3482 
3483 	sd = &per_cpu(softnet_data, cpu);
3484 
3485 	local_irq_save(flags);
3486 
3487 	rps_lock(sd);
3488 	if (!netif_running(skb->dev))
3489 		goto drop;
3490 	qlen = skb_queue_len(&sd->input_pkt_queue);
3491 	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3492 		if (qlen) {
3493 enqueue:
3494 			__skb_queue_tail(&sd->input_pkt_queue, skb);
3495 			input_queue_tail_incr_save(sd, qtail);
3496 			rps_unlock(sd);
3497 			local_irq_restore(flags);
3498 			return NET_RX_SUCCESS;
3499 		}
3500 
3501 		/* Schedule NAPI for backlog device
3502 		 * We can use a non-atomic operation since we own the queue lock
3503 		 */
3504 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3505 			if (!rps_ipi_queued(sd))
3506 				____napi_schedule(sd, &sd->backlog);
3507 		}
3508 		goto enqueue;
3509 	}
3510 
3511 drop:
3512 	sd->dropped++;
3513 	rps_unlock(sd);
3514 
3515 	local_irq_restore(flags);
3516 
3517 	atomic_long_inc(&skb->dev->rx_dropped);
3518 	kfree_skb(skb);
3519 	return NET_RX_DROP;
3520 }
3521 
3522 static int netif_rx_internal(struct sk_buff *skb)
3523 {
3524 	int ret;
3525 
3526 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3527 
3528 	trace_netif_rx(skb);
3529 #ifdef CONFIG_RPS
3530 	if (static_key_false(&rps_needed)) {
3531 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3532 		int cpu;
3533 
3534 		preempt_disable();
3535 		rcu_read_lock();
3536 
3537 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3538 		if (cpu < 0)
3539 			cpu = smp_processor_id();
3540 
3541 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3542 
3543 		rcu_read_unlock();
3544 		preempt_enable();
3545 	} else
3546 #endif
3547 	{
3548 		unsigned int qtail;
3549 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3550 		put_cpu();
3551 	}
3552 	return ret;
3553 }
3554 
3555 /**
3556  *	netif_rx	-	post buffer to the network code
3557  *	@skb: buffer to post
3558  *
3559  *	This function receives a packet from a device driver and queues it for
3560  *	the upper (protocol) levels to process.  It always succeeds. The buffer
3561  *	may be dropped during processing for congestion control or by the
3562  *	protocol layers.
3563  *
3564  *	return values:
3565  *	NET_RX_SUCCESS	(no congestion)
3566  *	NET_RX_DROP     (packet was dropped)
3567  *
3568  */
3569 
3570 int netif_rx(struct sk_buff *skb)
3571 {
3572 	trace_netif_rx_entry(skb);
3573 
3574 	return netif_rx_internal(skb);
3575 }
3576 EXPORT_SYMBOL(netif_rx);
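/*
 * Illustrative sketch (not part of the upstream file): the usual driver-side
 * sequence before handing a received frame to netif_rx().  "len" is the
 * assumed length of the complete frame, Ethernet header included, that the
 * hardware placed in the skb's data area.
 */
static void example_rx_frame(struct net_device *dev, struct sk_buff *skb,
			     unsigned int len)
{
	skb_put(skb, len);
	skb->protocol = eth_type_trans(skb, dev);

	if (netif_rx(skb) == NET_RX_DROP)
		dev->stats.rx_dropped++;
}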
3577 
3578 int netif_rx_ni(struct sk_buff *skb)
3579 {
3580 	int err;
3581 
3582 	trace_netif_rx_ni_entry(skb);
3583 
3584 	preempt_disable();
3585 	err = netif_rx_internal(skb);
3586 	if (local_softirq_pending())
3587 		do_softirq();
3588 	preempt_enable();
3589 
3590 	return err;
3591 }
3592 EXPORT_SYMBOL(netif_rx_ni);
3593 
3594 static void net_tx_action(struct softirq_action *h)
3595 {
3596 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3597 
3598 	if (sd->completion_queue) {
3599 		struct sk_buff *clist;
3600 
3601 		local_irq_disable();
3602 		clist = sd->completion_queue;
3603 		sd->completion_queue = NULL;
3604 		local_irq_enable();
3605 
3606 		while (clist) {
3607 			struct sk_buff *skb = clist;
3608 			clist = clist->next;
3609 
3610 			WARN_ON(atomic_read(&skb->users));
3611 			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3612 				trace_consume_skb(skb);
3613 			else
3614 				trace_kfree_skb(skb, net_tx_action);
3615 			__kfree_skb(skb);
3616 		}
3617 	}
3618 
3619 	if (sd->output_queue) {
3620 		struct Qdisc *head;
3621 
3622 		local_irq_disable();
3623 		head = sd->output_queue;
3624 		sd->output_queue = NULL;
3625 		sd->output_queue_tailp = &sd->output_queue;
3626 		local_irq_enable();
3627 
3628 		while (head) {
3629 			struct Qdisc *q = head;
3630 			spinlock_t *root_lock;
3631 
3632 			head = head->next_sched;
3633 
3634 			root_lock = qdisc_lock(q);
3635 			if (spin_trylock(root_lock)) {
3636 				smp_mb__before_atomic();
3637 				clear_bit(__QDISC_STATE_SCHED,
3638 					  &q->state);
3639 				qdisc_run(q);
3640 				spin_unlock(root_lock);
3641 			} else {
3642 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3643 					      &q->state)) {
3644 					__netif_reschedule(q);
3645 				} else {
3646 					smp_mb__before_atomic();
3647 					clear_bit(__QDISC_STATE_SCHED,
3648 						  &q->state);
3649 				}
3650 			}
3651 		}
3652 	}
3653 }
3654 
3655 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3656     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3657 /* This hook is defined here for ATM LANE */
3658 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3659 			     unsigned char *addr) __read_mostly;
3660 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3661 #endif
3662 
3663 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3664 					 struct packet_type **pt_prev,
3665 					 int *ret, struct net_device *orig_dev)
3666 {
3667 #ifdef CONFIG_NET_CLS_ACT
3668 	struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3669 	struct tcf_result cl_res;
3670 
3671 	/* If there's at least one ingress present somewhere (so
3672 	 * we get here via enabled static key), remaining devices
3673 	 * that are not configured with an ingress qdisc will bail
3674 	 * out here.
3675 	 */
3676 	if (!cl)
3677 		return skb;
3678 	if (*pt_prev) {
3679 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3680 		*pt_prev = NULL;
3681 	}
3682 
3683 	qdisc_skb_cb(skb)->pkt_len = skb->len;
3684 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3685 	qdisc_bstats_cpu_update(cl->q, skb);
3686 
3687 	switch (tc_classify(skb, cl, &cl_res, false)) {
3688 	case TC_ACT_OK:
3689 	case TC_ACT_RECLASSIFY:
3690 		skb->tc_index = TC_H_MIN(cl_res.classid);
3691 		break;
3692 	case TC_ACT_SHOT:
3693 		qdisc_qstats_cpu_drop(cl->q);
3694 	case TC_ACT_STOLEN:
3695 	case TC_ACT_QUEUED:
3696 		kfree_skb(skb);
3697 		return NULL;
3698 	default:
3699 		break;
3700 	}
3701 #endif /* CONFIG_NET_CLS_ACT */
3702 	return skb;
3703 }
3704 
3705 /**
3706  *	netdev_rx_handler_register - register receive handler
3707  *	@dev: device to register a handler for
3708  *	@rx_handler: receive handler to register
3709  *	@rx_handler_data: data pointer that is used by rx handler
3710  *
3711  *	Register a receive handler for a device. This handler will then be
3712  *	called from __netif_receive_skb. A negative errno code is returned
3713  *	on a failure.
3714  *
3715  *	The caller must hold the rtnl_mutex.
3716  *
3717  *	For a general description of rx_handler, see enum rx_handler_result.
3718  */
3719 int netdev_rx_handler_register(struct net_device *dev,
3720 			       rx_handler_func_t *rx_handler,
3721 			       void *rx_handler_data)
3722 {
3723 	ASSERT_RTNL();
3724 
3725 	if (dev->rx_handler)
3726 		return -EBUSY;
3727 
3728 	/* Note: rx_handler_data must be set before rx_handler */
3729 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3730 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3731 
3732 	return 0;
3733 }
3734 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3735 
3736 /**
3737  *	netdev_rx_handler_unregister - unregister receive handler
3738  *	@dev: device to unregister a handler from
3739  *
3740  *	Unregister a receive handler from a device.
3741  *
3742  *	The caller must hold the rtnl_mutex.
3743  */
3744 void netdev_rx_handler_unregister(struct net_device *dev)
3745 {
3746 
3747 	ASSERT_RTNL();
3748 	RCU_INIT_POINTER(dev->rx_handler, NULL);
3749 	/* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
3750 	 * section is guaranteed to see a non-NULL rx_handler_data
3751 	 * as well.
3752 	 */
3753 	synchronize_net();
3754 	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3755 }
3756 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
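
/* Example: a minimal sketch (not taken from an in-tree user) of how a
 * stacking driver might use the two helpers above.  The names
 * my_upper_rx(), struct my_port and port_dev are placeholders only:
 *
 *	static rx_handler_result_t my_upper_rx(struct sk_buff **pskb)
 *	{
 *		struct sk_buff *skb = *pskb;
 *		struct my_port *port;
 *
 *		port = rcu_dereference(skb->dev->rx_handler_data);
 *		skb->dev = port->upper_dev;
 *		return RX_HANDLER_ANOTHER;	// re-run rx on the upper dev
 *	}
 *
 *	// both calls are made under rtnl_lock():
 *	err = netdev_rx_handler_register(port_dev, my_upper_rx, port);
 *	...
 *	netdev_rx_handler_unregister(port_dev);
 */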
3757 
3758 /*
3759  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3760  * the special handling of PFMEMALLOC skbs.
3761  */
3762 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3763 {
3764 	switch (skb->protocol) {
3765 	case htons(ETH_P_ARP):
3766 	case htons(ETH_P_IP):
3767 	case htons(ETH_P_IPV6):
3768 	case htons(ETH_P_8021Q):
3769 	case htons(ETH_P_8021AD):
3770 		return true;
3771 	default:
3772 		return false;
3773 	}
3774 }
3775 
3776 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
3777 			     int *ret, struct net_device *orig_dev)
3778 {
3779 #ifdef CONFIG_NETFILTER_INGRESS
3780 	if (nf_hook_ingress_active(skb)) {
3781 		if (*pt_prev) {
3782 			*ret = deliver_skb(skb, *pt_prev, orig_dev);
3783 			*pt_prev = NULL;
3784 		}
3785 
3786 		return nf_hook_ingress(skb);
3787 	}
3788 #endif /* CONFIG_NETFILTER_INGRESS */
3789 	return 0;
3790 }
3791 
3792 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3793 {
3794 	struct packet_type *ptype, *pt_prev;
3795 	rx_handler_func_t *rx_handler;
3796 	struct net_device *orig_dev;
3797 	bool deliver_exact = false;
3798 	int ret = NET_RX_DROP;
3799 	__be16 type;
3800 
3801 	net_timestamp_check(!netdev_tstamp_prequeue, skb);
3802 
3803 	trace_netif_receive_skb(skb);
3804 
3805 	orig_dev = skb->dev;
3806 
3807 	skb_reset_network_header(skb);
3808 	if (!skb_transport_header_was_set(skb))
3809 		skb_reset_transport_header(skb);
3810 	skb_reset_mac_len(skb);
3811 
3812 	pt_prev = NULL;
3813 
3814 another_round:
3815 	skb->skb_iif = skb->dev->ifindex;
3816 
3817 	__this_cpu_inc(softnet_data.processed);
3818 
3819 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3820 	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3821 		skb = skb_vlan_untag(skb);
3822 		if (unlikely(!skb))
3823 			goto out;
3824 	}
3825 
3826 #ifdef CONFIG_NET_CLS_ACT
3827 	if (skb->tc_verd & TC_NCLS) {
3828 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3829 		goto ncls;
3830 	}
3831 #endif
3832 
3833 	if (pfmemalloc)
3834 		goto skip_taps;
3835 
3836 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3837 		if (pt_prev)
3838 			ret = deliver_skb(skb, pt_prev, orig_dev);
3839 		pt_prev = ptype;
3840 	}
3841 
3842 	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3843 		if (pt_prev)
3844 			ret = deliver_skb(skb, pt_prev, orig_dev);
3845 		pt_prev = ptype;
3846 	}
3847 
3848 skip_taps:
3849 #ifdef CONFIG_NET_INGRESS
3850 	if (static_key_false(&ingress_needed)) {
3851 		skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3852 		if (!skb)
3853 			goto out;
3854 
3855 		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
3856 			goto out;
3857 	}
3858 #endif
3859 #ifdef CONFIG_NET_CLS_ACT
3860 	skb->tc_verd = 0;
3861 ncls:
3862 #endif
3863 	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3864 		goto drop;
3865 
3866 	if (skb_vlan_tag_present(skb)) {
3867 		if (pt_prev) {
3868 			ret = deliver_skb(skb, pt_prev, orig_dev);
3869 			pt_prev = NULL;
3870 		}
3871 		if (vlan_do_receive(&skb))
3872 			goto another_round;
3873 		else if (unlikely(!skb))
3874 			goto out;
3875 	}
3876 
3877 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3878 	if (rx_handler) {
3879 		if (pt_prev) {
3880 			ret = deliver_skb(skb, pt_prev, orig_dev);
3881 			pt_prev = NULL;
3882 		}
3883 		switch (rx_handler(&skb)) {
3884 		case RX_HANDLER_CONSUMED:
3885 			ret = NET_RX_SUCCESS;
3886 			goto out;
3887 		case RX_HANDLER_ANOTHER:
3888 			goto another_round;
3889 		case RX_HANDLER_EXACT:
3890 			deliver_exact = true;
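			/* fall through */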
3891 		case RX_HANDLER_PASS:
3892 			break;
3893 		default:
3894 			BUG();
3895 		}
3896 	}
3897 
3898 	if (unlikely(skb_vlan_tag_present(skb))) {
3899 		if (skb_vlan_tag_get_id(skb))
3900 			skb->pkt_type = PACKET_OTHERHOST;
3901 		/* Note: we might in the future use prio bits
3902 		 * and set skb->priority like in vlan_do_receive()
3903 		 * For the time being, just ignore Priority Code Point
3904 		 */
3905 		skb->vlan_tci = 0;
3906 	}
3907 
3908 	type = skb->protocol;
3909 
3910 	/* deliver only exact match when indicated */
3911 	if (likely(!deliver_exact)) {
3912 		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3913 				       &ptype_base[ntohs(type) &
3914 						   PTYPE_HASH_MASK]);
3915 	}
3916 
3917 	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3918 			       &orig_dev->ptype_specific);
3919 
3920 	if (unlikely(skb->dev != orig_dev)) {
3921 		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3922 				       &skb->dev->ptype_specific);
3923 	}
3924 
3925 	if (pt_prev) {
3926 		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3927 			goto drop;
3928 		else
3929 			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3930 	} else {
3931 drop:
3932 		atomic_long_inc(&skb->dev->rx_dropped);
3933 		kfree_skb(skb);
3934 		/* Jamal, now you will not be able to escape explaining
3935 		 * to me how you were going to use this. :-)
3936 		 */
3937 		ret = NET_RX_DROP;
3938 	}
3939 
3940 out:
3941 	return ret;
3942 }
3943 
3944 static int __netif_receive_skb(struct sk_buff *skb)
3945 {
3946 	int ret;
3947 
3948 	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3949 		unsigned long pflags = current->flags;
3950 
3951 		/*
3952 		 * PFMEMALLOC skbs are special, they should
3953 		 * - be delivered to SOCK_MEMALLOC sockets only
3954 		 * - stay away from userspace
3955 		 * - have bounded memory usage
3956 		 *
3957 		 * Use PF_MEMALLOC as this saves us from propagating the allocation
3958 		 * context down to all allocation sites.
3959 		 */
3960 		current->flags |= PF_MEMALLOC;
3961 		ret = __netif_receive_skb_core(skb, true);
3962 		tsk_restore_flags(current, pflags, PF_MEMALLOC);
3963 	} else
3964 		ret = __netif_receive_skb_core(skb, false);
3965 
3966 	return ret;
3967 }
3968 
3969 static int netif_receive_skb_internal(struct sk_buff *skb)
3970 {
3971 	int ret;
3972 
3973 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3974 
3975 	if (skb_defer_rx_timestamp(skb))
3976 		return NET_RX_SUCCESS;
3977 
3978 	rcu_read_lock();
3979 
3980 #ifdef CONFIG_RPS
3981 	if (static_key_false(&rps_needed)) {
3982 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3983 		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
3984 
3985 		if (cpu >= 0) {
3986 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3987 			rcu_read_unlock();
3988 			return ret;
3989 		}
3990 	}
3991 #endif
3992 	ret = __netif_receive_skb(skb);
3993 	rcu_read_unlock();
3994 	return ret;
3995 }
3996 
3997 /**
3998  *	netif_receive_skb - process receive buffer from network
3999  *	@skb: buffer to process
4000  *
4001  *	netif_receive_skb() is the main receive data processing function.
4002  *	It always succeeds. The buffer may be dropped during processing
4003  *	for congestion control or by the protocol layers.
4004  *
4005  *	This function may only be called from softirq context and interrupts
4006  *	should be enabled.
4007  *
4008  *	Return values (usually ignored):
4009  *	NET_RX_SUCCESS: no congestion
4010  *	NET_RX_DROP: packet was dropped
4011  */
4012 int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb)
4013 {
4014 	trace_netif_receive_skb_entry(skb);
4015 
4016 	return netif_receive_skb_internal(skb);
4017 }
4018 EXPORT_SYMBOL(netif_receive_skb_sk);
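
/* Example: a sketch of the usual call site.  A driver's receive path,
 * running in softirq context, hands each frame to the stack roughly like
 * this (my_dev and my_build_skb() are placeholders); most drivers reach
 * this function through the netif_receive_skb() wrapper:
 *
 *	skb = my_build_skb(my_dev, rx_desc);
 *	skb->protocol = eth_type_trans(skb, my_dev);
 *	netif_receive_skb(skb);
 */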
4019 
4020 /* Network device is going away, flush any packets still pending
4021  * Called with irqs disabled.
4022  */
4023 static void flush_backlog(void *arg)
4024 {
4025 	struct net_device *dev = arg;
4026 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4027 	struct sk_buff *skb, *tmp;
4028 
4029 	rps_lock(sd);
4030 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4031 		if (skb->dev == dev) {
4032 			__skb_unlink(skb, &sd->input_pkt_queue);
4033 			kfree_skb(skb);
4034 			input_queue_head_incr(sd);
4035 		}
4036 	}
4037 	rps_unlock(sd);
4038 
4039 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4040 		if (skb->dev == dev) {
4041 			__skb_unlink(skb, &sd->process_queue);
4042 			kfree_skb(skb);
4043 			input_queue_head_incr(sd);
4044 		}
4045 	}
4046 }
4047 
4048 static int napi_gro_complete(struct sk_buff *skb)
4049 {
4050 	struct packet_offload *ptype;
4051 	__be16 type = skb->protocol;
4052 	struct list_head *head = &offload_base;
4053 	int err = -ENOENT;
4054 
4055 	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4056 
4057 	if (NAPI_GRO_CB(skb)->count == 1) {
4058 		skb_shinfo(skb)->gso_size = 0;
4059 		goto out;
4060 	}
4061 
4062 	rcu_read_lock();
4063 	list_for_each_entry_rcu(ptype, head, list) {
4064 		if (ptype->type != type || !ptype->callbacks.gro_complete)
4065 			continue;
4066 
4067 		err = ptype->callbacks.gro_complete(skb, 0);
4068 		break;
4069 	}
4070 	rcu_read_unlock();
4071 
4072 	if (err) {
4073 		WARN_ON(&ptype->list == head);
4074 		kfree_skb(skb);
4075 		return NET_RX_SUCCESS;
4076 	}
4077 
4078 out:
4079 	return netif_receive_skb_internal(skb);
4080 }
4081 
4082 /* napi->gro_list contains packets ordered by age, with the
4083  * youngest packets at the head of it.
4084  * Complete skbs in reverse order to reduce latencies.
4085  */
4086 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4087 {
4088 	struct sk_buff *skb, *prev = NULL;
4089 
4090 	/* scan list and build reverse chain */
4091 	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4092 		skb->prev = prev;
4093 		prev = skb;
4094 	}
4095 
4096 	for (skb = prev; skb; skb = prev) {
4097 		skb->next = NULL;
4098 
4099 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4100 			return;
4101 
4102 		prev = skb->prev;
4103 		napi_gro_complete(skb);
4104 		napi->gro_count--;
4105 	}
4106 
4107 	napi->gro_list = NULL;
4108 }
4109 EXPORT_SYMBOL(napi_gro_flush);
4110 
4111 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4112 {
4113 	struct sk_buff *p;
4114 	unsigned int maclen = skb->dev->hard_header_len;
4115 	u32 hash = skb_get_hash_raw(skb);
4116 
4117 	for (p = napi->gro_list; p; p = p->next) {
4118 		unsigned long diffs;
4119 
4120 		NAPI_GRO_CB(p)->flush = 0;
4121 
4122 		if (hash != skb_get_hash_raw(p)) {
4123 			NAPI_GRO_CB(p)->same_flow = 0;
4124 			continue;
4125 		}
4126 
4127 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4128 		diffs |= p->vlan_tci ^ skb->vlan_tci;
4129 		if (maclen == ETH_HLEN)
4130 			diffs |= compare_ether_header(skb_mac_header(p),
4131 						      skb_mac_header(skb));
4132 		else if (!diffs)
4133 			diffs = memcmp(skb_mac_header(p),
4134 				       skb_mac_header(skb),
4135 				       maclen);
4136 		NAPI_GRO_CB(p)->same_flow = !diffs;
4137 	}
4138 }
4139 
4140 static void skb_gro_reset_offset(struct sk_buff *skb)
4141 {
4142 	const struct skb_shared_info *pinfo = skb_shinfo(skb);
4143 	const skb_frag_t *frag0 = &pinfo->frags[0];
4144 
4145 	NAPI_GRO_CB(skb)->data_offset = 0;
4146 	NAPI_GRO_CB(skb)->frag0 = NULL;
4147 	NAPI_GRO_CB(skb)->frag0_len = 0;
4148 
4149 	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4150 	    pinfo->nr_frags &&
4151 	    !PageHighMem(skb_frag_page(frag0))) {
4152 		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4153 		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4154 	}
4155 }
4156 
4157 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4158 {
4159 	struct skb_shared_info *pinfo = skb_shinfo(skb);
4160 
4161 	BUG_ON(skb->end - skb->tail < grow);
4162 
4163 	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4164 
4165 	skb->data_len -= grow;
4166 	skb->tail += grow;
4167 
4168 	pinfo->frags[0].page_offset += grow;
4169 	skb_frag_size_sub(&pinfo->frags[0], grow);
4170 
4171 	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4172 		skb_frag_unref(skb, 0);
4173 		memmove(pinfo->frags, pinfo->frags + 1,
4174 			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
4175 	}
4176 }
4177 
4178 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4179 {
4180 	struct sk_buff **pp = NULL;
4181 	struct packet_offload *ptype;
4182 	__be16 type = skb->protocol;
4183 	struct list_head *head = &offload_base;
4184 	int same_flow;
4185 	enum gro_result ret;
4186 	int grow;
4187 
4188 	if (!(skb->dev->features & NETIF_F_GRO))
4189 		goto normal;
4190 
4191 	if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4192 		goto normal;
4193 
4194 	gro_list_prepare(napi, skb);
4195 
4196 	rcu_read_lock();
4197 	list_for_each_entry_rcu(ptype, head, list) {
4198 		if (ptype->type != type || !ptype->callbacks.gro_receive)
4199 			continue;
4200 
4201 		skb_set_network_header(skb, skb_gro_offset(skb));
4202 		skb_reset_mac_len(skb);
4203 		NAPI_GRO_CB(skb)->same_flow = 0;
4204 		NAPI_GRO_CB(skb)->flush = 0;
4205 		NAPI_GRO_CB(skb)->free = 0;
4206 		NAPI_GRO_CB(skb)->udp_mark = 0;
4207 		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4208 
4209 		/* Setup for GRO checksum validation */
4210 		switch (skb->ip_summed) {
4211 		case CHECKSUM_COMPLETE:
4212 			NAPI_GRO_CB(skb)->csum = skb->csum;
4213 			NAPI_GRO_CB(skb)->csum_valid = 1;
4214 			NAPI_GRO_CB(skb)->csum_cnt = 0;
4215 			break;
4216 		case CHECKSUM_UNNECESSARY:
4217 			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4218 			NAPI_GRO_CB(skb)->csum_valid = 0;
4219 			break;
4220 		default:
4221 			NAPI_GRO_CB(skb)->csum_cnt = 0;
4222 			NAPI_GRO_CB(skb)->csum_valid = 0;
4223 		}
4224 
4225 		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4226 		break;
4227 	}
4228 	rcu_read_unlock();
4229 
4230 	if (&ptype->list == head)
4231 		goto normal;
4232 
4233 	same_flow = NAPI_GRO_CB(skb)->same_flow;
4234 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4235 
4236 	if (pp) {
4237 		struct sk_buff *nskb = *pp;
4238 
4239 		*pp = nskb->next;
4240 		nskb->next = NULL;
4241 		napi_gro_complete(nskb);
4242 		napi->gro_count--;
4243 	}
4244 
4245 	if (same_flow)
4246 		goto ok;
4247 
4248 	if (NAPI_GRO_CB(skb)->flush)
4249 		goto normal;
4250 
4251 	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4252 		struct sk_buff *nskb = napi->gro_list;
4253 
4254 		/* locate the end of the list to select the 'oldest' flow */
4255 		while (nskb->next) {
4256 			pp = &nskb->next;
4257 			nskb = *pp;
4258 		}
4259 		*pp = NULL;
4260 		nskb->next = NULL;
4261 		napi_gro_complete(nskb);
4262 	} else {
4263 		napi->gro_count++;
4264 	}
4265 	NAPI_GRO_CB(skb)->count = 1;
4266 	NAPI_GRO_CB(skb)->age = jiffies;
4267 	NAPI_GRO_CB(skb)->last = skb;
4268 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4269 	skb->next = napi->gro_list;
4270 	napi->gro_list = skb;
4271 	ret = GRO_HELD;
4272 
4273 pull:
4274 	grow = skb_gro_offset(skb) - skb_headlen(skb);
4275 	if (grow > 0)
4276 		gro_pull_from_frag0(skb, grow);
4277 ok:
4278 	return ret;
4279 
4280 normal:
4281 	ret = GRO_NORMAL;
4282 	goto pull;
4283 }
4284 
4285 struct packet_offload *gro_find_receive_by_type(__be16 type)
4286 {
4287 	struct list_head *offload_head = &offload_base;
4288 	struct packet_offload *ptype;
4289 
4290 	list_for_each_entry_rcu(ptype, offload_head, list) {
4291 		if (ptype->type != type || !ptype->callbacks.gro_receive)
4292 			continue;
4293 		return ptype;
4294 	}
4295 	return NULL;
4296 }
4297 EXPORT_SYMBOL(gro_find_receive_by_type);
4298 
4299 struct packet_offload *gro_find_complete_by_type(__be16 type)
4300 {
4301 	struct list_head *offload_head = &offload_base;
4302 	struct packet_offload *ptype;
4303 
4304 	list_for_each_entry_rcu(ptype, offload_head, list) {
4305 		if (ptype->type != type || !ptype->callbacks.gro_complete)
4306 			continue;
4307 		return ptype;
4308 	}
4309 	return NULL;
4310 }
4311 EXPORT_SYMBOL(gro_find_complete_by_type);
4312 
4313 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4314 {
4315 	switch (ret) {
4316 	case GRO_NORMAL:
4317 		if (netif_receive_skb_internal(skb))
4318 			ret = GRO_DROP;
4319 		break;
4320 
4321 	case GRO_DROP:
4322 		kfree_skb(skb);
4323 		break;
4324 
4325 	case GRO_MERGED_FREE:
4326 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4327 			kmem_cache_free(skbuff_head_cache, skb);
4328 		else
4329 			__kfree_skb(skb);
4330 		break;
4331 
4332 	case GRO_HELD:
4333 	case GRO_MERGED:
4334 		break;
4335 	}
4336 
4337 	return ret;
4338 }
4339 
4340 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4341 {
4342 	trace_napi_gro_receive_entry(skb);
4343 
4344 	skb_gro_reset_offset(skb);
4345 
4346 	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4347 }
4348 EXPORT_SYMBOL(napi_gro_receive);
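
/* Example: a sketch of how napi_gro_receive() is normally driven from a
 * driver's NAPI poll callback (my_priv and my_fetch_rx_skb() are
 * placeholders):
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct my_priv *priv = container_of(napi, struct my_priv, napi);
 *		struct sk_buff *skb;
 *		int work = 0;
 *
 *		while (work < budget && (skb = my_fetch_rx_skb(priv))) {
 *			skb->protocol = eth_type_trans(skb, priv->netdev);
 *			napi_gro_receive(napi, skb);
 *			work++;
 *		}
 *		if (work < budget)
 *			napi_complete_done(napi, work);
 *		return work;
 *	}
 */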
4349 
4350 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4351 {
4352 	if (unlikely(skb->pfmemalloc)) {
4353 		consume_skb(skb);
4354 		return;
4355 	}
4356 	__skb_pull(skb, skb_headlen(skb));
4357 	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
4358 	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4359 	skb->vlan_tci = 0;
4360 	skb->dev = napi->dev;
4361 	skb->skb_iif = 0;
4362 	skb->encapsulation = 0;
4363 	skb_shinfo(skb)->gso_type = 0;
4364 	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4365 
4366 	napi->skb = skb;
4367 }
4368 
4369 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4370 {
4371 	struct sk_buff *skb = napi->skb;
4372 
4373 	if (!skb) {
4374 		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4375 		napi->skb = skb;
4376 	}
4377 	return skb;
4378 }
4379 EXPORT_SYMBOL(napi_get_frags);
4380 
4381 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4382 				      struct sk_buff *skb,
4383 				      gro_result_t ret)
4384 {
4385 	switch (ret) {
4386 	case GRO_NORMAL:
4387 	case GRO_HELD:
4388 		__skb_push(skb, ETH_HLEN);
4389 		skb->protocol = eth_type_trans(skb, skb->dev);
4390 		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4391 			ret = GRO_DROP;
4392 		break;
4393 
4394 	case GRO_DROP:
4395 	case GRO_MERGED_FREE:
4396 		napi_reuse_skb(napi, skb);
4397 		break;
4398 
4399 	case GRO_MERGED:
4400 		break;
4401 	}
4402 
4403 	return ret;
4404 }
4405 
4406 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4407  * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
4408  * we copy the ethernet header into skb->data to have a common layout.
4409  */
4410 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4411 {
4412 	struct sk_buff *skb = napi->skb;
4413 	const struct ethhdr *eth;
4414 	unsigned int hlen = sizeof(*eth);
4415 
4416 	napi->skb = NULL;
4417 
4418 	skb_reset_mac_header(skb);
4419 	skb_gro_reset_offset(skb);
4420 
4421 	eth = skb_gro_header_fast(skb, 0);
4422 	if (unlikely(skb_gro_header_hard(skb, hlen))) {
4423 		eth = skb_gro_header_slow(skb, hlen, 0);
4424 		if (unlikely(!eth)) {
4425 			napi_reuse_skb(napi, skb);
4426 			return NULL;
4427 		}
4428 	} else {
4429 		gro_pull_from_frag0(skb, hlen);
4430 		NAPI_GRO_CB(skb)->frag0 += hlen;
4431 		NAPI_GRO_CB(skb)->frag0_len -= hlen;
4432 	}
4433 	__skb_pull(skb, hlen);
4434 
4435 	/*
4436 	 * This works because the only protocols we care about don't require
4437 	 * special handling.
4438 	 * We'll fix it up properly in napi_frags_finish()
4439 	 */
4440 	skb->protocol = eth->h_proto;
4441 
4442 	return skb;
4443 }
4444 
4445 gro_result_t napi_gro_frags(struct napi_struct *napi)
4446 {
4447 	struct sk_buff *skb = napi_frags_skb(napi);
4448 
4449 	if (!skb)
4450 		return GRO_DROP;
4451 
4452 	trace_napi_gro_frags_entry(skb);
4453 
4454 	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4455 }
4456 EXPORT_SYMBOL(napi_gro_frags);
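
/* Example: a sketch of the frag-based GRO path for a driver that receives
 * directly into pages rather than into a linear skb (page/offset/len come
 * from a hypothetical rx descriptor):
 *
 *	skb = napi_get_frags(napi);
 *	if (unlikely(!skb))
 *		return;				// drop: out of memory
 *	skb_add_rx_frag(skb, 0, page, offset, len, PAGE_SIZE);
 *	napi_gro_frags(napi);			// consumes or recycles napi->skb
 *
 * Note that no eth_type_trans() call is needed here; napi_frags_skb()
 * pulls the ethernet header itself.
 */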
4457 
4458 /* Compute the checksum from gro_offset and return the folded value
4459  * after adding in any pseudo checksum.
4460  */
4461 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4462 {
4463 	__wsum wsum;
4464 	__sum16 sum;
4465 
4466 	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4467 
4468 	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4469 	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4470 	if (likely(!sum)) {
4471 		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4472 		    !skb->csum_complete_sw)
4473 			netdev_rx_csum_fault(skb->dev);
4474 	}
4475 
4476 	NAPI_GRO_CB(skb)->csum = wsum;
4477 	NAPI_GRO_CB(skb)->csum_valid = 1;
4478 
4479 	return sum;
4480 }
4481 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4482 
4483 /*
4484  * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
4485  * Note: called with local irq disabled, but exits with local irq enabled.
4486  */
4487 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4488 {
4489 #ifdef CONFIG_RPS
4490 	struct softnet_data *remsd = sd->rps_ipi_list;
4491 
4492 	if (remsd) {
4493 		sd->rps_ipi_list = NULL;
4494 
4495 		local_irq_enable();
4496 
4497 		/* Send pending IPI's to kick RPS processing on remote cpus. */
4498 		/* Send pending IPIs to kick RPS processing on remote cpus. */
4499 			struct softnet_data *next = remsd->rps_ipi_next;
4500 
4501 			if (cpu_online(remsd->cpu))
4502 				smp_call_function_single_async(remsd->cpu,
4503 							   &remsd->csd);
4504 			remsd = next;
4505 		}
4506 	} else
4507 #endif
4508 		local_irq_enable();
4509 }
4510 
4511 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4512 {
4513 #ifdef CONFIG_RPS
4514 	return sd->rps_ipi_list != NULL;
4515 #else
4516 	return false;
4517 #endif
4518 }
4519 
4520 static int process_backlog(struct napi_struct *napi, int quota)
4521 {
4522 	int work = 0;
4523 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4524 
4525 	/* Check if we have pending IPIs; it's better to send them now
4526 	 * rather than waiting for net_rx_action() to end.
4527 	 */
4528 	if (sd_has_rps_ipi_waiting(sd)) {
4529 		local_irq_disable();
4530 		net_rps_action_and_irq_enable(sd);
4531 	}
4532 
4533 	napi->weight = weight_p;
4534 	local_irq_disable();
4535 	while (1) {
4536 		struct sk_buff *skb;
4537 
4538 		while ((skb = __skb_dequeue(&sd->process_queue))) {
4539 			rcu_read_lock();
4540 			local_irq_enable();
4541 			__netif_receive_skb(skb);
4542 			rcu_read_unlock();
4543 			local_irq_disable();
4544 			input_queue_head_incr(sd);
4545 			if (++work >= quota) {
4546 				local_irq_enable();
4547 				return work;
4548 			}
4549 		}
4550 
4551 		rps_lock(sd);
4552 		if (skb_queue_empty(&sd->input_pkt_queue)) {
4553 			/*
4554 			 * Inline a custom version of __napi_complete().
4555 			 * Only the current cpu owns and manipulates this napi,
4556 			 * and NAPI_STATE_SCHED is the only possible flag set
4557 			 * on backlog.
4558 			 * We can use a plain write instead of clear_bit(),
4559 			 * and we don't need an smp_mb() memory barrier.
4560 			 */
4561 			napi->state = 0;
4562 			rps_unlock(sd);
4563 
4564 			break;
4565 		}
4566 
4567 		skb_queue_splice_tail_init(&sd->input_pkt_queue,
4568 					   &sd->process_queue);
4569 		rps_unlock(sd);
4570 	}
4571 	local_irq_enable();
4572 
4573 	return work;
4574 }
4575 
4576 /**
4577  * __napi_schedule - schedule for receive
4578  * @n: entry to schedule
4579  *
4580  * The entry's receive function will be scheduled to run.
4581  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4582  */
4583 void __napi_schedule(struct napi_struct *n)
4584 {
4585 	unsigned long flags;
4586 
4587 	local_irq_save(flags);
4588 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4589 	local_irq_restore(flags);
4590 }
4591 EXPORT_SYMBOL(__napi_schedule);
4592 
4593 /**
4594  * __napi_schedule_irqoff - schedule for receive
4595  * @n: entry to schedule
4596  *
4597  * Variant of __napi_schedule() assuming hard irqs are masked
4598  */
4599 void __napi_schedule_irqoff(struct napi_struct *n)
4600 {
4601 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4602 }
4603 EXPORT_SYMBOL(__napi_schedule_irqoff);
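
/* Example: a sketch of the usual interrupt-handler pattern feeding the two
 * schedule helpers above.  With hard irqs already masked in the handler,
 * the _irqoff variant avoids a redundant save/restore (placeholder names):
 *
 *	static irqreturn_t my_irq(int irq, void *data)
 *	{
 *		struct my_priv *priv = data;
 *
 *		my_disable_rx_irq(priv);
 *		if (napi_schedule_prep(&priv->napi))
 *			__napi_schedule_irqoff(&priv->napi);
 *		return IRQ_HANDLED;
 *	}
 */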
4604 
4605 void __napi_complete(struct napi_struct *n)
4606 {
4607 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4608 
4609 	list_del_init(&n->poll_list);
4610 	smp_mb__before_atomic();
4611 	clear_bit(NAPI_STATE_SCHED, &n->state);
4612 }
4613 EXPORT_SYMBOL(__napi_complete);
4614 
4615 void napi_complete_done(struct napi_struct *n, int work_done)
4616 {
4617 	unsigned long flags;
4618 
4619 	/*
4620 	 * Don't let napi dequeue from the cpu poll list
4621 	 * just in case it's running on a different cpu.
4622 	 */
4623 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4624 		return;
4625 
4626 	if (n->gro_list) {
4627 		unsigned long timeout = 0;
4628 
4629 		if (work_done)
4630 			timeout = n->dev->gro_flush_timeout;
4631 
4632 		if (timeout)
4633 			hrtimer_start(&n->timer, ns_to_ktime(timeout),
4634 				      HRTIMER_MODE_REL_PINNED);
4635 		else
4636 			napi_gro_flush(n, false);
4637 	}
4638 	if (likely(list_empty(&n->poll_list))) {
4639 		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4640 	} else {
4641 		/* If n->poll_list is not empty, we need to mask irqs */
4642 		local_irq_save(flags);
4643 		__napi_complete(n);
4644 		local_irq_restore(flags);
4645 	}
4646 }
4647 EXPORT_SYMBOL(napi_complete_done);
4648 
4649 /* must be called under rcu_read_lock(), as we don't take a reference */
4650 struct napi_struct *napi_by_id(unsigned int napi_id)
4651 {
4652 	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4653 	struct napi_struct *napi;
4654 
4655 	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4656 		if (napi->napi_id == napi_id)
4657 			return napi;
4658 
4659 	return NULL;
4660 }
4661 EXPORT_SYMBOL_GPL(napi_by_id);
4662 
4663 void napi_hash_add(struct napi_struct *napi)
4664 {
4665 	if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4666 
4667 		spin_lock(&napi_hash_lock);
4668 
4669 		/* 0 is not a valid id, and we also skip an id that is taken;
4670 		 * we expect both events to be extremely rare.
4671 		 */
4672 		napi->napi_id = 0;
4673 		while (!napi->napi_id) {
4674 			napi->napi_id = ++napi_gen_id;
4675 			if (napi_by_id(napi->napi_id))
4676 				napi->napi_id = 0;
4677 		}
4678 
4679 		hlist_add_head_rcu(&napi->napi_hash_node,
4680 			&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4681 
4682 		spin_unlock(&napi_hash_lock);
4683 	}
4684 }
4685 EXPORT_SYMBOL_GPL(napi_hash_add);
4686 
4687 /* Warning: the caller is responsible for making sure an RCU grace period
4688  * is respected before freeing the memory containing @napi.
4689  */
4690 void napi_hash_del(struct napi_struct *napi)
4691 {
4692 	spin_lock(&napi_hash_lock);
4693 
4694 	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4695 		hlist_del_rcu(&napi->napi_hash_node);
4696 
4697 	spin_unlock(&napi_hash_lock);
4698 }
4699 EXPORT_SYMBOL_GPL(napi_hash_del);
4700 
4701 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4702 {
4703 	struct napi_struct *napi;
4704 
4705 	napi = container_of(timer, struct napi_struct, timer);
4706 	if (napi->gro_list)
4707 		napi_schedule(napi);
4708 
4709 	return HRTIMER_NORESTART;
4710 }
4711 
4712 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4713 		    int (*poll)(struct napi_struct *, int), int weight)
4714 {
4715 	INIT_LIST_HEAD(&napi->poll_list);
4716 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4717 	napi->timer.function = napi_watchdog;
4718 	napi->gro_count = 0;
4719 	napi->gro_list = NULL;
4720 	napi->skb = NULL;
4721 	napi->poll = poll;
4722 	if (weight > NAPI_POLL_WEIGHT)
4723 		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4724 			    weight, dev->name);
4725 	napi->weight = weight;
4726 	list_add(&napi->dev_list, &dev->napi_list);
4727 	napi->dev = dev;
4728 #ifdef CONFIG_NETPOLL
4729 	spin_lock_init(&napi->poll_lock);
4730 	napi->poll_owner = -1;
4731 #endif
4732 	set_bit(NAPI_STATE_SCHED, &napi->state);
4733 }
4734 EXPORT_SYMBOL(netif_napi_add);
4735 
4736 void napi_disable(struct napi_struct *n)
4737 {
4738 	might_sleep();
4739 	set_bit(NAPI_STATE_DISABLE, &n->state);
4740 
4741 	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4742 		msleep(1);
4743 	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
4744 		msleep(1);
4745 
4746 	hrtimer_cancel(&n->timer);
4747 
4748 	clear_bit(NAPI_STATE_DISABLE, &n->state);
4749 }
4750 EXPORT_SYMBOL(napi_disable);
4751 
4752 void netif_napi_del(struct napi_struct *napi)
4753 {
4754 	list_del_init(&napi->dev_list);
4755 	napi_free_frags(napi);
4756 
4757 	kfree_skb_list(napi->gro_list);
4758 	napi->gro_list = NULL;
4759 	napi->gro_count = 0;
4760 }
4761 EXPORT_SYMBOL(netif_napi_del);
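
/* Example: a sketch of the setup/teardown that pairs netif_napi_add(),
 * napi_disable() and netif_napi_del() (placeholder names; weight left at
 * the NAPI_POLL_WEIGHT default):
 *
 *	netif_napi_add(netdev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
 *	...
 *	napi_enable(&priv->napi);	// in ndo_open, before enabling rx irqs
 *	...
 *	napi_disable(&priv->napi);	// in ndo_stop, after masking rx irqs
 *	...
 *	netif_napi_del(&priv->napi);	// at remove time, before free_netdev()
 */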
4762 
4763 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4764 {
4765 	void *have;
4766 	int work, weight;
4767 
4768 	list_del_init(&n->poll_list);
4769 
4770 	have = netpoll_poll_lock(n);
4771 
4772 	weight = n->weight;
4773 
4774 	/* This NAPI_STATE_SCHED test is for avoiding a race
4775 	 * with netpoll's poll_napi().  Only the entity which
4776 	 * obtains the lock and sees NAPI_STATE_SCHED set will
4777 	 * actually make the ->poll() call.  Therefore we avoid
4778 	 * accidentally calling ->poll() when NAPI is not scheduled.
4779 	 */
4780 	work = 0;
4781 	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4782 		work = n->poll(n, weight);
4783 		trace_napi_poll(n);
4784 	}
4785 
4786 	WARN_ON_ONCE(work > weight);
4787 
4788 	if (likely(work < weight))
4789 		goto out_unlock;
4790 
4791 	/* Drivers must not modify the NAPI state if they
4792 	 * consume the entire weight.  In such cases this code
4793 	 * still "owns" the NAPI instance and therefore can
4794 	 * move the instance around on the list at-will.
4795 	 */
4796 	if (unlikely(napi_disable_pending(n))) {
4797 		napi_complete(n);
4798 		goto out_unlock;
4799 	}
4800 
4801 	if (n->gro_list) {
4802 		/* flush too old packets
4803 		 * If HZ < 1000, flush all packets.
4804 		 */
4805 		napi_gro_flush(n, HZ >= 1000);
4806 	}
4807 
4808 	/* Some drivers may have called napi_schedule
4809 	 * prior to exhausting their budget.
4810 	 */
4811 	if (unlikely(!list_empty(&n->poll_list))) {
4812 		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4813 			     n->dev ? n->dev->name : "backlog");
4814 		goto out_unlock;
4815 	}
4816 
4817 	list_add_tail(&n->poll_list, repoll);
4818 
4819 out_unlock:
4820 	netpoll_poll_unlock(have);
4821 
4822 	return work;
4823 }
4824 
4825 static void net_rx_action(struct softirq_action *h)
4826 {
4827 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4828 	unsigned long time_limit = jiffies + 2;
4829 	int budget = netdev_budget;
4830 	LIST_HEAD(list);
4831 	LIST_HEAD(repoll);
4832 
4833 	local_irq_disable();
4834 	list_splice_init(&sd->poll_list, &list);
4835 	local_irq_enable();
4836 
4837 	for (;;) {
4838 		struct napi_struct *n;
4839 
4840 		if (list_empty(&list)) {
4841 			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4842 				return;
4843 			break;
4844 		}
4845 
4846 		n = list_first_entry(&list, struct napi_struct, poll_list);
4847 		budget -= napi_poll(n, &repoll);
4848 
4849 		/* If softirq window is exhausted then punt.
4850 		 * Allow this to run for 2 jiffies, which will allow
4851 		 * an average latency of 1.5/HZ.
4852 		 */
4853 		if (unlikely(budget <= 0 ||
4854 			     time_after_eq(jiffies, time_limit))) {
4855 			sd->time_squeeze++;
4856 			break;
4857 		}
4858 	}
4859 
4860 	local_irq_disable();
4861 
4862 	list_splice_tail_init(&sd->poll_list, &list);
4863 	list_splice_tail(&repoll, &list);
4864 	list_splice(&list, &sd->poll_list);
4865 	if (!list_empty(&sd->poll_list))
4866 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
4867 
4868 	net_rps_action_and_irq_enable(sd);
4869 }
4870 
4871 struct netdev_adjacent {
4872 	struct net_device *dev;
4873 
4874 	/* upper master flag, there can only be one master device per list */
4875 	bool master;
4876 
4877 	/* counter for the number of times this device was added to us */
4878 	u16 ref_nr;
4879 
4880 	/* private field for the users */
4881 	void *private;
4882 
4883 	struct list_head list;
4884 	struct rcu_head rcu;
4885 };
4886 
4887 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4888 						 struct net_device *adj_dev,
4889 						 struct list_head *adj_list)
4890 {
4891 	struct netdev_adjacent *adj;
4892 
4893 	list_for_each_entry(adj, adj_list, list) {
4894 		if (adj->dev == adj_dev)
4895 			return adj;
4896 	}
4897 	return NULL;
4898 }
4899 
4900 /**
4901  * netdev_has_upper_dev - Check if device is linked to an upper device
4902  * @dev: device
4903  * @upper_dev: upper device to check
4904  *
4905  * Find out if a device is linked to the specified upper device and return true
4906  * in case it is. Note that this checks only the immediate upper device,
4907  * not the complete stack of devices. The caller must hold the RTNL lock.
4908  */
4909 bool netdev_has_upper_dev(struct net_device *dev,
4910 			  struct net_device *upper_dev)
4911 {
4912 	ASSERT_RTNL();
4913 
4914 	return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4915 }
4916 EXPORT_SYMBOL(netdev_has_upper_dev);
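
/* Example: a minimal sketch of a typical pre-link sanity check (RTNL held;
 * dev and upper_dev are whatever pair is about to be stacked).  Before
 * stacking upper_dev on top of dev, make sure dev is not already stacked
 * on top of upper_dev:
 *
 *	ASSERT_RTNL();
 *	if (netdev_has_upper_dev(upper_dev, dev))
 *		return -EBUSY;		// linking would create a loop
 */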
4917 
4918 /**
4919  * netdev_has_any_upper_dev - Check if device is linked to some device
4920  * @dev: device
4921  *
4922  * Find out if a device is linked to an upper device and return true in case
4923  * it is. The caller must hold the RTNL lock.
4924  */
4925 static bool netdev_has_any_upper_dev(struct net_device *dev)
4926 {
4927 	ASSERT_RTNL();
4928 
4929 	return !list_empty(&dev->all_adj_list.upper);
4930 }
4931 
4932 /**
4933  * netdev_master_upper_dev_get - Get master upper device
4934  * @dev: device
4935  *
4936  * Find a master upper device and return pointer to it or NULL in case
4937  * it's not there. The caller must hold the RTNL lock.
4938  */
4939 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4940 {
4941 	struct netdev_adjacent *upper;
4942 
4943 	ASSERT_RTNL();
4944 
4945 	if (list_empty(&dev->adj_list.upper))
4946 		return NULL;
4947 
4948 	upper = list_first_entry(&dev->adj_list.upper,
4949 				 struct netdev_adjacent, list);
4950 	if (likely(upper->master))
4951 		return upper->dev;
4952 	return NULL;
4953 }
4954 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4955 
4956 void *netdev_adjacent_get_private(struct list_head *adj_list)
4957 {
4958 	struct netdev_adjacent *adj;
4959 
4960 	adj = list_entry(adj_list, struct netdev_adjacent, list);
4961 
4962 	return adj->private;
4963 }
4964 EXPORT_SYMBOL(netdev_adjacent_get_private);
4965 
4966 /**
4967  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4968  * @dev: device
4969  * @iter: list_head ** of the current position
4970  *
4971  * Gets the next device from the dev's upper list, starting from iter
4972  * position. The caller must hold RCU read lock.
4973  */
4974 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4975 						 struct list_head **iter)
4976 {
4977 	struct netdev_adjacent *upper;
4978 
4979 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4980 
4981 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4982 
4983 	if (&upper->list == &dev->adj_list.upper)
4984 		return NULL;
4985 
4986 	*iter = &upper->list;
4987 
4988 	return upper->dev;
4989 }
4990 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
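
/* Example: a sketch of walking the immediate upper devices with the
 * iterator above; the walk must be done under rcu_read_lock():
 *
 *	struct list_head *iter = &dev->adj_list.upper;
 *	struct net_device *upper;
 *
 *	rcu_read_lock();
 *	while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)))
 *		pr_debug("%s is an upper of %s\n", upper->name, dev->name);
 *	rcu_read_unlock();
 */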
4991 
4992 /**
4993  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4994  * @dev: device
4995  * @iter: list_head ** of the current position
4996  *
4997  * Gets the next device from the dev's upper list, starting from iter
4998  * position. The caller must hold RCU read lock.
4999  */
5000 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5001 						     struct list_head **iter)
5002 {
5003 	struct netdev_adjacent *upper;
5004 
5005 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5006 
5007 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5008 
5009 	if (&upper->list == &dev->all_adj_list.upper)
5010 		return NULL;
5011 
5012 	*iter = &upper->list;
5013 
5014 	return upper->dev;
5015 }
5016 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5017 
5018 /**
5019  * netdev_lower_get_next_private - Get the next ->private from the
5020  *				   lower neighbour list
5021  * @dev: device
5022  * @iter: list_head ** of the current position
5023  *
5024  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5025  * list, starting from iter position. The caller must either hold the
5026  * RTNL lock or its own locking that guarantees that the neighbour lower
5027  * list will remain unchanged.
5028  */
5029 void *netdev_lower_get_next_private(struct net_device *dev,
5030 				    struct list_head **iter)
5031 {
5032 	struct netdev_adjacent *lower;
5033 
5034 	lower = list_entry(*iter, struct netdev_adjacent, list);
5035 
5036 	if (&lower->list == &dev->adj_list.lower)
5037 		return NULL;
5038 
5039 	*iter = lower->list.next;
5040 
5041 	return lower->private;
5042 }
5043 EXPORT_SYMBOL(netdev_lower_get_next_private);
5044 
5045 /**
5046  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5047  *				       lower neighbour list, RCU
5048  *				       variant
5049  * @dev: device
5050  * @iter: list_head ** of the current position
5051  *
5052  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5053  * list, starting from iter position. The caller must hold RCU read lock.
5054  */
5055 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5056 					struct list_head **iter)
5057 {
5058 	struct netdev_adjacent *lower;
5059 
5060 	WARN_ON_ONCE(!rcu_read_lock_held());
5061 
5062 	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5063 
5064 	if (&lower->list == &dev->adj_list.lower)
5065 		return NULL;
5066 
5067 	*iter = &lower->list;
5068 
5069 	return lower->private;
5070 }
5071 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5072 
5073 /**
5074  * netdev_lower_get_next - Get the next device from the lower neighbour
5075  *                         list
5076  * @dev: device
5077  * @iter: list_head ** of the current position
5078  *
5079  * Gets the next netdev_adjacent from the dev's lower neighbour
5080  * list, starting from iter position. The caller must hold the RTNL lock or
5081  * its own locking that guarantees that the neighbour lower
5082  * list will remain unchanged.
5083  */
5084 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5085 {
5086 	struct netdev_adjacent *lower;
5087 
5088 	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5089 
5090 	if (&lower->list == &dev->adj_list.lower)
5091 		return NULL;
5092 
5093 	*iter = &lower->list;
5094 
5095 	return lower->dev;
5096 }
5097 EXPORT_SYMBOL(netdev_lower_get_next);
5098 
5099 /**
5100  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5101  *				       lower neighbour list, RCU
5102  *				       variant
5103  * @dev: device
5104  *
5105  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5106  * list. The caller must hold RCU read lock.
5107  */
5108 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5109 {
5110 	struct netdev_adjacent *lower;
5111 
5112 	lower = list_first_or_null_rcu(&dev->adj_list.lower,
5113 			struct netdev_adjacent, list);
5114 	if (lower)
5115 		return lower->private;
5116 	return NULL;
5117 }
5118 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5119 
5120 /**
5121  * netdev_master_upper_dev_get_rcu - Get master upper device
5122  * @dev: device
5123  *
5124  * Find a master upper device and return pointer to it or NULL in case
5125  * it's not there. The caller must hold the RCU read lock.
5126  */
5127 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5128 {
5129 	struct netdev_adjacent *upper;
5130 
5131 	upper = list_first_or_null_rcu(&dev->adj_list.upper,
5132 				       struct netdev_adjacent, list);
5133 	if (upper && likely(upper->master))
5134 		return upper->dev;
5135 	return NULL;
5136 }
5137 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5138 
5139 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5140 			      struct net_device *adj_dev,
5141 			      struct list_head *dev_list)
5142 {
5143 	char linkname[IFNAMSIZ+7];
5144 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5145 		"upper_%s" : "lower_%s", adj_dev->name);
5146 	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5147 				 linkname);
5148 }
5149 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5150 			       char *name,
5151 			       struct list_head *dev_list)
5152 {
5153 	char linkname[IFNAMSIZ+7];
5154 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5155 		"upper_%s" : "lower_%s", name);
5156 	sysfs_remove_link(&(dev->dev.kobj), linkname);
5157 }
5158 
5159 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5160 						 struct net_device *adj_dev,
5161 						 struct list_head *dev_list)
5162 {
5163 	return (dev_list == &dev->adj_list.upper ||
5164 		dev_list == &dev->adj_list.lower) &&
5165 		net_eq(dev_net(dev), dev_net(adj_dev));
5166 }
5167 
5168 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5169 					struct net_device *adj_dev,
5170 					struct list_head *dev_list,
5171 					void *private, bool master)
5172 {
5173 	struct netdev_adjacent *adj;
5174 	int ret;
5175 
5176 	adj = __netdev_find_adj(dev, adj_dev, dev_list);
5177 
5178 	if (adj) {
5179 		adj->ref_nr++;
5180 		return 0;
5181 	}
5182 
5183 	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5184 	if (!adj)
5185 		return -ENOMEM;
5186 
5187 	adj->dev = adj_dev;
5188 	adj->master = master;
5189 	adj->ref_nr = 1;
5190 	adj->private = private;
5191 	dev_hold(adj_dev);
5192 
5193 	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5194 		 adj_dev->name, dev->name, adj_dev->name);
5195 
5196 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5197 		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5198 		if (ret)
5199 			goto free_adj;
5200 	}
5201 
5202 	/* Ensure that the master link is always the first item in the list. */
5203 	if (master) {
5204 		ret = sysfs_create_link(&(dev->dev.kobj),
5205 					&(adj_dev->dev.kobj), "master");
5206 		if (ret)
5207 			goto remove_symlinks;
5208 
5209 		list_add_rcu(&adj->list, dev_list);
5210 	} else {
5211 		list_add_tail_rcu(&adj->list, dev_list);
5212 	}
5213 
5214 	return 0;
5215 
5216 remove_symlinks:
5217 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5218 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5219 free_adj:
5220 	kfree(adj);
5221 	dev_put(adj_dev);
5222 
5223 	return ret;
5224 }
5225 
5226 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5227 					 struct net_device *adj_dev,
5228 					 struct list_head *dev_list)
5229 {
5230 	struct netdev_adjacent *adj;
5231 
5232 	adj = __netdev_find_adj(dev, adj_dev, dev_list);
5233 
5234 	if (!adj) {
5235 		pr_err("tried to remove device %s from %s\n",
5236 		       dev->name, adj_dev->name);
5237 		BUG();
5238 	}
5239 
5240 	if (adj->ref_nr > 1) {
5241 		pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5242 			 adj->ref_nr-1);
5243 		adj->ref_nr--;
5244 		return;
5245 	}
5246 
5247 	if (adj->master)
5248 		sysfs_remove_link(&(dev->dev.kobj), "master");
5249 
5250 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5251 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5252 
5253 	list_del_rcu(&adj->list);
5254 	pr_debug("dev_put for %s, because link removed from %s to %s\n",
5255 		 adj_dev->name, dev->name, adj_dev->name);
5256 	dev_put(adj_dev);
5257 	kfree_rcu(adj, rcu);
5258 }
5259 
5260 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5261 					    struct net_device *upper_dev,
5262 					    struct list_head *up_list,
5263 					    struct list_head *down_list,
5264 					    void *private, bool master)
5265 {
5266 	int ret;
5267 
5268 	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5269 					   master);
5270 	if (ret)
5271 		return ret;
5272 
5273 	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5274 					   false);
5275 	if (ret) {
5276 		__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5277 		return ret;
5278 	}
5279 
5280 	return 0;
5281 }
5282 
5283 static int __netdev_adjacent_dev_link(struct net_device *dev,
5284 				      struct net_device *upper_dev)
5285 {
5286 	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5287 						&dev->all_adj_list.upper,
5288 						&upper_dev->all_adj_list.lower,
5289 						NULL, false);
5290 }
5291 
5292 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5293 					       struct net_device *upper_dev,
5294 					       struct list_head *up_list,
5295 					       struct list_head *down_list)
5296 {
5297 	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5298 	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5299 }
5300 
5301 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5302 					 struct net_device *upper_dev)
5303 {
5304 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5305 					   &dev->all_adj_list.upper,
5306 					   &upper_dev->all_adj_list.lower);
5307 }
5308 
5309 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5310 						struct net_device *upper_dev,
5311 						void *private, bool master)
5312 {
5313 	int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5314 
5315 	if (ret)
5316 		return ret;
5317 
5318 	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5319 					       &dev->adj_list.upper,
5320 					       &upper_dev->adj_list.lower,
5321 					       private, master);
5322 	if (ret) {
5323 		__netdev_adjacent_dev_unlink(dev, upper_dev);
5324 		return ret;
5325 	}
5326 
5327 	return 0;
5328 }
5329 
5330 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5331 						   struct net_device *upper_dev)
5332 {
5333 	__netdev_adjacent_dev_unlink(dev, upper_dev);
5334 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5335 					   &dev->adj_list.upper,
5336 					   &upper_dev->adj_list.lower);
5337 }
5338 
5339 static int __netdev_upper_dev_link(struct net_device *dev,
5340 				   struct net_device *upper_dev, bool master,
5341 				   void *private)
5342 {
5343 	struct netdev_notifier_changeupper_info changeupper_info;
5344 	struct netdev_adjacent *i, *j, *to_i, *to_j;
5345 	int ret = 0;
5346 
5347 	ASSERT_RTNL();
5348 
5349 	if (dev == upper_dev)
5350 		return -EBUSY;
5351 
5352 	/* To prevent loops, check that dev is not already an upper device of upper_dev. */
5353 	if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5354 		return -EBUSY;
5355 
5356 	if (__netdev_find_adj(dev, upper_dev, &dev->adj_list.upper))
5357 		return -EEXIST;
5358 
5359 	if (master && netdev_master_upper_dev_get(dev))
5360 		return -EBUSY;
5361 
5362 	changeupper_info.upper_dev = upper_dev;
5363 	changeupper_info.master = master;
5364 	changeupper_info.linking = true;
5365 
5366 	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5367 						   master);
5368 	if (ret)
5369 		return ret;
5370 
5371 	/* Now that we linked these devs, make all the upper_dev's
5372 	 * all_adj_list.upper visible to every dev's all_adj_list.lower and
5373 	 * vice versa, and don't forget the devices themselves. All of these
5374 	 * links are non-neighbours.
5375 	 */
5376 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5377 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5378 			pr_debug("Interlinking %s with %s, non-neighbour\n",
5379 				 i->dev->name, j->dev->name);
5380 			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5381 			if (ret)
5382 				goto rollback_mesh;
5383 		}
5384 	}
5385 
5386 	/* add dev to every upper_dev's upper device */
5387 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5388 		pr_debug("linking %s's upper device %s with %s\n",
5389 			 upper_dev->name, i->dev->name, dev->name);
5390 		ret = __netdev_adjacent_dev_link(dev, i->dev);
5391 		if (ret)
5392 			goto rollback_upper_mesh;
5393 	}
5394 
5395 	/* add upper_dev to every dev's lower device */
5396 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5397 		pr_debug("linking %s's lower device %s with %s\n", dev->name,
5398 			 i->dev->name, upper_dev->name);
5399 		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5400 		if (ret)
5401 			goto rollback_lower_mesh;
5402 	}
5403 
5404 	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5405 				      &changeupper_info.info);
5406 	return 0;
5407 
5408 rollback_lower_mesh:
5409 	to_i = i;
5410 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5411 		if (i == to_i)
5412 			break;
5413 		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5414 	}
5415 
5416 	i = NULL;
5417 
5418 rollback_upper_mesh:
5419 	to_i = i;
5420 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5421 		if (i == to_i)
5422 			break;
5423 		__netdev_adjacent_dev_unlink(dev, i->dev);
5424 	}
5425 
5426 	i = j = NULL;
5427 
5428 rollback_mesh:
5429 	to_i = i;
5430 	to_j = j;
5431 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5432 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5433 			if (i == to_i && j == to_j)
5434 				break;
5435 			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5436 		}
5437 		if (i == to_i)
5438 			break;
5439 	}
5440 
5441 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5442 
5443 	return ret;
5444 }
5445 
5446 /**
5447  * netdev_upper_dev_link - Add a link to the upper device
5448  * @dev: device
5449  * @upper_dev: new upper device
5450  *
5451  * Adds a link to device which is upper to this one. The caller must hold
5452  * the RTNL lock. On a failure a negative errno code is returned.
5453  * On success the reference counts are adjusted and the function
5454  * returns zero.
5455  */
5456 int netdev_upper_dev_link(struct net_device *dev,
5457 			  struct net_device *upper_dev)
5458 {
5459 	return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5460 }
5461 EXPORT_SYMBOL(netdev_upper_dev_link);
5462 
5463 /**
5464  * netdev_master_upper_dev_link - Add a master link to the upper device
5465  * @dev: device
5466  * @upper_dev: new upper device
5467  *
5468  * Adds a link to device which is upper to this one. In this case, only
5469  * one master upper device can be linked, although other non-master devices
5470  * might be linked as well. The caller must hold the RTNL lock.
5471  * On a failure a negative errno code is returned. On success the reference
5472  * counts are adjusted and the function returns zero.
5473  */
5474 int netdev_master_upper_dev_link(struct net_device *dev,
5475 				 struct net_device *upper_dev)
5476 {
5477 	return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5478 }
5479 EXPORT_SYMBOL(netdev_master_upper_dev_link);
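
/* Example: a sketch of how a bonding-like master typically sets up and
 * tears down the relationship (RTNL held; bond_dev and slave_dev are
 * placeholders):
 *
 *	err = netdev_master_upper_dev_link(slave_dev, bond_dev);
 *	if (err)
 *		goto err_upper_link;
 *	...
 *	netdev_upper_dev_unlink(slave_dev, bond_dev);	// on release or error
 */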
5480 
5481 int netdev_master_upper_dev_link_private(struct net_device *dev,
5482 					 struct net_device *upper_dev,
5483 					 void *private)
5484 {
5485 	return __netdev_upper_dev_link(dev, upper_dev, true, private);
5486 }
5487 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5488 
5489 /**
5490  * netdev_upper_dev_unlink - Removes a link to upper device
5491  * @dev: device
5492  * @upper_dev: upper device to remove
5493  *
5494  * Removes a link to device which is upper to this one. The caller must hold
5495  * the RTNL lock.
5496  */
5497 void netdev_upper_dev_unlink(struct net_device *dev,
5498 			     struct net_device *upper_dev)
5499 {
5500 	struct netdev_notifier_changeupper_info changeupper_info;
5501 	struct netdev_adjacent *i, *j;
5502 	ASSERT_RTNL();
5503 
5504 	changeupper_info.upper_dev = upper_dev;
5505 	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5506 	changeupper_info.linking = false;
5507 
5508 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5509 
5510 	/* Here is the tricky part. We must remove all dev's lower
5511 	 * devices from all upper_dev's upper devices and vice
5512 	 * versa, to maintain the graph relationship.
5513 	 */
5514 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5515 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5516 			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5517 
5518 	/* also remove the devices themselves from the lower/upper device
5519 	 * lists
5520 	 */
5521 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5522 		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5523 
5524 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5525 		__netdev_adjacent_dev_unlink(dev, i->dev);
5526 
5527 	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5528 				      &changeupper_info.info);
5529 }
5530 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5531 
5532 /**
5533  * netdev_bonding_info_change - Dispatch event about slave change
5534  * @dev: device
5535  * @bonding_info: info to dispatch
5536  *
5537  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5538  * The caller must hold the RTNL lock.
5539  */
5540 void netdev_bonding_info_change(struct net_device *dev,
5541 				struct netdev_bonding_info *bonding_info)
5542 {
5543 	struct netdev_notifier_bonding_info	info;
5544 
5545 	memcpy(&info.bonding_info, bonding_info,
5546 	       sizeof(struct netdev_bonding_info));
5547 	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5548 				      &info.info);
5549 }
5550 EXPORT_SYMBOL(netdev_bonding_info_change);
5551 
5552 static void netdev_adjacent_add_links(struct net_device *dev)
5553 {
5554 	struct netdev_adjacent *iter;
5555 
5556 	struct net *net = dev_net(dev);
5557 
5558 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5559 		if (!net_eq(net,dev_net(iter->dev)))
5560 			continue;
5561 		netdev_adjacent_sysfs_add(iter->dev, dev,
5562 					  &iter->dev->adj_list.lower);
5563 		netdev_adjacent_sysfs_add(dev, iter->dev,
5564 					  &dev->adj_list.upper);
5565 	}
5566 
5567 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5568 		if (!net_eq(net,dev_net(iter->dev)))
5569 			continue;
5570 		netdev_adjacent_sysfs_add(iter->dev, dev,
5571 					  &iter->dev->adj_list.upper);
5572 		netdev_adjacent_sysfs_add(dev, iter->dev,
5573 					  &dev->adj_list.lower);
5574 	}
5575 }
5576 
5577 static void netdev_adjacent_del_links(struct net_device *dev)
5578 {
5579 	struct netdev_adjacent *iter;
5580 
5581 	struct net *net = dev_net(dev);
5582 
5583 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5584 		if (!net_eq(net,dev_net(iter->dev)))
5585 			continue;
5586 		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5587 					  &iter->dev->adj_list.lower);
5588 		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5589 					  &dev->adj_list.upper);
5590 	}
5591 
5592 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5593 		if (!net_eq(net,dev_net(iter->dev)))
5594 			continue;
5595 		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5596 					  &iter->dev->adj_list.upper);
5597 		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5598 					  &dev->adj_list.lower);
5599 	}
5600 }
5601 
5602 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5603 {
5604 	struct netdev_adjacent *iter;
5605 
5606 	struct net *net = dev_net(dev);
5607 
5608 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5609 		if (!net_eq(net,dev_net(iter->dev)))
5610 			continue;
5611 		netdev_adjacent_sysfs_del(iter->dev, oldname,
5612 					  &iter->dev->adj_list.lower);
5613 		netdev_adjacent_sysfs_add(iter->dev, dev,
5614 					  &iter->dev->adj_list.lower);
5615 	}
5616 
5617 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5618 		if (!net_eq(net,dev_net(iter->dev)))
5619 			continue;
5620 		netdev_adjacent_sysfs_del(iter->dev, oldname,
5621 					  &iter->dev->adj_list.upper);
5622 		netdev_adjacent_sysfs_add(iter->dev, dev,
5623 					  &iter->dev->adj_list.upper);
5624 	}
5625 }
5626 
5627 void *netdev_lower_dev_get_private(struct net_device *dev,
5628 				   struct net_device *lower_dev)
5629 {
5630 	struct netdev_adjacent *lower;
5631 
5632 	if (!lower_dev)
5633 		return NULL;
5634 	lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5635 	if (!lower)
5636 		return NULL;
5637 
5638 	return lower->private;
5639 }
5640 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5641 
5642 
5643 int dev_get_nest_level(struct net_device *dev,
5644 		       bool (*type_check)(struct net_device *dev))
5645 {
5646 	struct net_device *lower = NULL;
5647 	struct list_head *iter;
5648 	int max_nest = -1;
5649 	int nest;
5650 
5651 	ASSERT_RTNL();
5652 
5653 	netdev_for_each_lower_dev(dev, lower, iter) {
5654 		nest = dev_get_nest_level(lower, type_check);
5655 		if (max_nest < nest)
5656 			max_nest = nest;
5657 	}
5658 
5659 	if (type_check(dev))
5660 		max_nest++;
5661 
5662 	return max_nest;
5663 }
5664 EXPORT_SYMBOL(dev_get_nest_level);
5665 
5666 static void dev_change_rx_flags(struct net_device *dev, int flags)
5667 {
5668 	const struct net_device_ops *ops = dev->netdev_ops;
5669 
5670 	if (ops->ndo_change_rx_flags)
5671 		ops->ndo_change_rx_flags(dev, flags);
5672 }
5673 
5674 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5675 {
5676 	unsigned int old_flags = dev->flags;
5677 	kuid_t uid;
5678 	kgid_t gid;
5679 
5680 	ASSERT_RTNL();
5681 
5682 	dev->flags |= IFF_PROMISC;
5683 	dev->promiscuity += inc;
5684 	if (dev->promiscuity == 0) {
5685 		/*
5686 		 * Avoid overflow.
5687 		 * If inc causes an overflow, leave promiscuity untouched and return an error.
5688 		 */
5689 		if (inc < 0)
5690 			dev->flags &= ~IFF_PROMISC;
5691 		else {
5692 			dev->promiscuity -= inc;
5693 			pr_warn("%s: promiscuity counter overflow, promiscuity left unchanged. Promiscuous mode on this device may be unreliable.\n",
5694 				dev->name);
5695 			return -EOVERFLOW;
5696 		}
5697 	}
5698 	if (dev->flags != old_flags) {
5699 		pr_info("device %s %s promiscuous mode\n",
5700 			dev->name,
5701 			dev->flags & IFF_PROMISC ? "entered" : "left");
5702 		if (audit_enabled) {
5703 			current_uid_gid(&uid, &gid);
5704 			audit_log(current->audit_context, GFP_ATOMIC,
5705 				AUDIT_ANOM_PROMISCUOUS,
5706 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5707 				dev->name, (dev->flags & IFF_PROMISC),
5708 				(old_flags & IFF_PROMISC),
5709 				from_kuid(&init_user_ns, audit_get_loginuid(current)),
5710 				from_kuid(&init_user_ns, uid),
5711 				from_kgid(&init_user_ns, gid),
5712 				audit_get_sessionid(current));
5713 		}
5714 
5715 		dev_change_rx_flags(dev, IFF_PROMISC);
5716 	}
5717 	if (notify)
5718 		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
5719 	return 0;
5720 }
5721 
5722 /**
5723  *	dev_set_promiscuity	- update promiscuity count on a device
5724  *	@dev: device
5725  *	@inc: modifier
5726  *
5727  *	Add or remove promiscuity from a device. While the count in the device
5728  *	remains above zero the interface remains promiscuous. Once it hits zero
5729  *	the device reverts back to normal filtering operation. A negative inc
5730  *	value is used to drop promiscuity on the device.
5731  *	Return 0 if successful or a negative errno code on error.
5732  */
5733 int dev_set_promiscuity(struct net_device *dev, int inc)
5734 {
5735 	unsigned int old_flags = dev->flags;
5736 	int err;
5737 
5738 	err = __dev_set_promiscuity(dev, inc, true);
5739 	if (err < 0)
5740 		return err;
5741 	if (dev->flags != old_flags)
5742 		dev_set_rx_mode(dev);
5743 	return err;
5744 }
5745 EXPORT_SYMBOL(dev_set_promiscuity);
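
/*
 * A minimal usage sketch (hypothetical caller; the "sniff_*" names are
 * invented): a capture or diagnostic path that needs to see every frame
 * bumps the promiscuity count under RTNL and drops it again when done.
 */
static int __maybe_unused sniff_start(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);	/* take one promiscuity ref */
	rtnl_unlock();
	return err;
}

static void __maybe_unused sniff_stop(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);		/* drop our reference */
	rtnl_unlock();
}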
5746 
5747 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5748 {
5749 	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5750 
5751 	ASSERT_RTNL();
5752 
5753 	dev->flags |= IFF_ALLMULTI;
5754 	dev->allmulti += inc;
5755 	if (dev->allmulti == 0) {
5756 		/*
5757 		 * Avoid overflow.
5758 		 * If inc causes an overflow, leave allmulti untouched and return an error.
5759 		 */
5760 		if (inc < 0)
5761 			dev->flags &= ~IFF_ALLMULTI;
5762 		else {
5763 			dev->allmulti -= inc;
5764 			pr_warn("%s: allmulti counter overflow, allmulti left unchanged. All-multicast mode on this device may be unreliable.\n",
5765 				dev->name);
5766 			return -EOVERFLOW;
5767 		}
5768 	}
5769 	if (dev->flags ^ old_flags) {
5770 		dev_change_rx_flags(dev, IFF_ALLMULTI);
5771 		dev_set_rx_mode(dev);
5772 		if (notify)
5773 			__dev_notify_flags(dev, old_flags,
5774 					   dev->gflags ^ old_gflags);
5775 	}
5776 	return 0;
5777 }
5778 
5779 /**
5780  *	dev_set_allmulti	- update allmulti count on a device
5781  *	@dev: device
5782  *	@inc: modifier
5783  *
5784  *	Add or remove reception of all multicast frames to a device. While the
5785  *	count in the device remains above zero the interface remains listening
5786  *	to all multicast frames. Once it hits zero the device reverts back to normal
5787  *	filtering operation. A negative @inc value is used to drop the counter
5788  *	when releasing a resource needing all multicasts.
5789  *	Return 0 if successful or a negative errno code on error.
5790  */
5791 
5792 int dev_set_allmulti(struct net_device *dev, int inc)
5793 {
5794 	return __dev_set_allmulti(dev, inc, true);
5795 }
5796 EXPORT_SYMBOL(dev_set_allmulti);
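
/*
 * A minimal usage sketch (hypothetical; "upper_attach_lower" is an invented
 * name): a stacking driver that forwards multicast through a lower device
 * typically holds an allmulti reference on that device while attached.
 * The caller is assumed to already hold RTNL, as the enslave paths do.
 */
static int __maybe_unused upper_attach_lower(struct net_device *lower)
{
	int err;

	ASSERT_RTNL();
	err = dev_set_allmulti(lower, 1);
	if (err < 0)
		return err;
	/* ... driver-specific linking of upper and lower goes here ... */
	return 0;
}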
5797 
5798 /*
5799  *	Upload unicast and multicast address lists to device and
5800  *	configure RX filtering. When the device doesn't support unicast
5801  *	filtering it is put in promiscuous mode while unicast addresses
5802  *	are present.
5803  */
5804 void __dev_set_rx_mode(struct net_device *dev)
5805 {
5806 	const struct net_device_ops *ops = dev->netdev_ops;
5807 
5808 	/* dev_open will call this function so the list will stay sane. */
5809 	if (!(dev->flags & IFF_UP))
5810 		return;
5811 
5812 	if (!netif_device_present(dev))
5813 		return;
5814 
5815 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5816 		/* Unicast address changes may only happen under the rtnl,
5817 		 * therefore calling __dev_set_promiscuity here is safe.
5818 		 */
5819 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5820 			__dev_set_promiscuity(dev, 1, false);
5821 			dev->uc_promisc = true;
5822 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5823 			__dev_set_promiscuity(dev, -1, false);
5824 			dev->uc_promisc = false;
5825 		}
5826 	}
5827 
5828 	if (ops->ndo_set_rx_mode)
5829 		ops->ndo_set_rx_mode(dev);
5830 }
5831 
5832 void dev_set_rx_mode(struct net_device *dev)
5833 {
5834 	netif_addr_lock_bh(dev);
5835 	__dev_set_rx_mode(dev);
5836 	netif_addr_unlock_bh(dev);
5837 }
5838 
5839 /**
5840  *	dev_get_flags - get flags reported to userspace
5841  *	@dev: device
5842  *
5843  *	Get the combination of flag bits exported through APIs to userspace.
5844  */
5845 unsigned int dev_get_flags(const struct net_device *dev)
5846 {
5847 	unsigned int flags;
5848 
5849 	flags = (dev->flags & ~(IFF_PROMISC |
5850 				IFF_ALLMULTI |
5851 				IFF_RUNNING |
5852 				IFF_LOWER_UP |
5853 				IFF_DORMANT)) |
5854 		(dev->gflags & (IFF_PROMISC |
5855 				IFF_ALLMULTI));
5856 
5857 	if (netif_running(dev)) {
5858 		if (netif_oper_up(dev))
5859 			flags |= IFF_RUNNING;
5860 		if (netif_carrier_ok(dev))
5861 			flags |= IFF_LOWER_UP;
5862 		if (netif_dormant(dev))
5863 			flags |= IFF_DORMANT;
5864 	}
5865 
5866 	return flags;
5867 }
5868 EXPORT_SYMBOL(dev_get_flags);
5869 
5870 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5871 {
5872 	unsigned int old_flags = dev->flags;
5873 	int ret;
5874 
5875 	ASSERT_RTNL();
5876 
5877 	/*
5878 	 *	Set the flags on our device.
5879 	 */
5880 
5881 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5882 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5883 			       IFF_AUTOMEDIA)) |
5884 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5885 				    IFF_ALLMULTI));
5886 
5887 	/*
5888 	 *	Load in the correct multicast list now the flags have changed.
5889 	 */
5890 
5891 	if ((old_flags ^ flags) & IFF_MULTICAST)
5892 		dev_change_rx_flags(dev, IFF_MULTICAST);
5893 
5894 	dev_set_rx_mode(dev);
5895 
5896 	/*
5897 	 *	Have we downed the interface? We handle IFF_UP ourselves
5898 	 *	according to user attempts to set it, rather than blindly
5899 	 *	setting it.
5900 	 */
5901 
5902 	ret = 0;
5903 	if ((old_flags ^ flags) & IFF_UP)
5904 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5905 
5906 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
5907 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
5908 		unsigned int old_flags = dev->flags;
5909 
5910 		dev->gflags ^= IFF_PROMISC;
5911 
5912 		if (__dev_set_promiscuity(dev, inc, false) >= 0)
5913 			if (dev->flags != old_flags)
5914 				dev_set_rx_mode(dev);
5915 	}
5916 
5917 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5918 	   is important. Some (broken) drivers set IFF_PROMISC when
5919 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
5920 	 */
5921 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5922 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5923 
5924 		dev->gflags ^= IFF_ALLMULTI;
5925 		__dev_set_allmulti(dev, inc, false);
5926 	}
5927 
5928 	return ret;
5929 }
5930 
5931 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5932 			unsigned int gchanges)
5933 {
5934 	unsigned int changes = dev->flags ^ old_flags;
5935 
5936 	if (gchanges)
5937 		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5938 
5939 	if (changes & IFF_UP) {
5940 		if (dev->flags & IFF_UP)
5941 			call_netdevice_notifiers(NETDEV_UP, dev);
5942 		else
5943 			call_netdevice_notifiers(NETDEV_DOWN, dev);
5944 	}
5945 
5946 	if (dev->flags & IFF_UP &&
5947 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5948 		struct netdev_notifier_change_info change_info;
5949 
5950 		change_info.flags_changed = changes;
5951 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5952 					      &change_info.info);
5953 	}
5954 }
5955 
5956 /**
5957  *	dev_change_flags - change device settings
5958  *	@dev: device
5959  *	@flags: device state flags
5960  *
5961  *	Change settings on a device based on the given state flags. The
5962  *	flags are in the userspace-exported format.
5963  */
5964 int dev_change_flags(struct net_device *dev, unsigned int flags)
5965 {
5966 	int ret;
5967 	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5968 
5969 	ret = __dev_change_flags(dev, flags);
5970 	if (ret < 0)
5971 		return ret;
5972 
5973 	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5974 	__dev_notify_flags(dev, old_flags, changes);
5975 	return ret;
5976 }
5977 EXPORT_SYMBOL(dev_change_flags);
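
/*
 * A minimal usage sketch (hypothetical helper): bringing an interface
 * administratively up or down from kernel code amounts to toggling IFF_UP
 * through dev_change_flags() under RTNL; __dev_open()/__dev_close() and
 * the notifications then follow from the code above.
 */
static int __maybe_unused example_set_if_up(struct net_device *dev, bool up)
{
	unsigned int flags;
	int err;

	rtnl_lock();
	flags = dev_get_flags(dev);
	err = dev_change_flags(dev, up ? flags | IFF_UP : flags & ~IFF_UP);
	rtnl_unlock();
	return err;
}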
5978 
5979 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5980 {
5981 	const struct net_device_ops *ops = dev->netdev_ops;
5982 
5983 	if (ops->ndo_change_mtu)
5984 		return ops->ndo_change_mtu(dev, new_mtu);
5985 
5986 	dev->mtu = new_mtu;
5987 	return 0;
5988 }
5989 
5990 /**
5991  *	dev_set_mtu - Change maximum transfer unit
5992  *	@dev: device
5993  *	@new_mtu: new transfer unit
5994  *
5995  *	Change the maximum transfer size of the network device.
5996  */
5997 int dev_set_mtu(struct net_device *dev, int new_mtu)
5998 {
5999 	int err, orig_mtu;
6000 
6001 	if (new_mtu == dev->mtu)
6002 		return 0;
6003 
6004 	/*	MTU must not be negative.	 */
6005 	if (new_mtu < 0)
6006 		return -EINVAL;
6007 
6008 	if (!netif_device_present(dev))
6009 		return -ENODEV;
6010 
6011 	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6012 	err = notifier_to_errno(err);
6013 	if (err)
6014 		return err;
6015 
6016 	orig_mtu = dev->mtu;
6017 	err = __dev_set_mtu(dev, new_mtu);
6018 
6019 	if (!err) {
6020 		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6021 		err = notifier_to_errno(err);
6022 		if (err) {
6023 			/* setting mtu back and notifying everyone again,
6024 			 * so that they have a chance to revert changes.
6025 			 */
6026 			__dev_set_mtu(dev, orig_mtu);
6027 			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6028 		}
6029 	}
6030 	return err;
6031 }
6032 EXPORT_SYMBOL(dev_set_mtu);
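
/*
 * A minimal usage sketch (hypothetical; the helper name and overhead value
 * are assumptions): a tunnel-like driver that must shrink the MTU of its
 * underlying device does so under RTNL and relies on the notifier-driven
 * rollback in dev_set_mtu() above if another subsystem vetoes the change.
 */
static int __maybe_unused example_shrink_mtu(struct net_device *lower,
					     int tunnel_overhead)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(lower, lower->mtu - tunnel_overhead);
	rtnl_unlock();
	return err;
}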
6033 
6034 /**
6035  *	dev_set_group - Change group this device belongs to
6036  *	@dev: device
6037  *	@new_group: group this device should belong to
6038  */
6039 void dev_set_group(struct net_device *dev, int new_group)
6040 {
6041 	dev->group = new_group;
6042 }
6043 EXPORT_SYMBOL(dev_set_group);
6044 
6045 /**
6046  *	dev_set_mac_address - Change Media Access Control Address
6047  *	@dev: device
6048  *	@sa: new address
6049  *
6050  *	Change the hardware (MAC) address of the device
6051  */
6052 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6053 {
6054 	const struct net_device_ops *ops = dev->netdev_ops;
6055 	int err;
6056 
6057 	if (!ops->ndo_set_mac_address)
6058 		return -EOPNOTSUPP;
6059 	if (sa->sa_family != dev->type)
6060 		return -EINVAL;
6061 	if (!netif_device_present(dev))
6062 		return -ENODEV;
6063 	err = ops->ndo_set_mac_address(dev, sa);
6064 	if (err)
6065 		return err;
6066 	dev->addr_assign_type = NET_ADDR_SET;
6067 	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6068 	add_device_randomness(dev->dev_addr, dev->addr_len);
6069 	return 0;
6070 }
6071 EXPORT_SYMBOL(dev_set_mac_address);
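
/*
 * A minimal usage sketch (hypothetical helper; the address source is an
 * assumption): a new hardware address is supplied as a struct sockaddr
 * whose sa_family matches dev->type, and the call must run under RTNL.
 */
static int __maybe_unused example_set_hwaddr(struct net_device *dev,
					     const u8 *addr)
{
	struct sockaddr sa;
	int err;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, addr, dev->addr_len);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();
	return err;
}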
6072 
6073 /**
6074  *	dev_change_carrier - Change device carrier
6075  *	@dev: device
6076  *	@new_carrier: new value
6077  *
6078  *	Change device carrier
6079  */
6080 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6081 {
6082 	const struct net_device_ops *ops = dev->netdev_ops;
6083 
6084 	if (!ops->ndo_change_carrier)
6085 		return -EOPNOTSUPP;
6086 	if (!netif_device_present(dev))
6087 		return -ENODEV;
6088 	return ops->ndo_change_carrier(dev, new_carrier);
6089 }
6090 EXPORT_SYMBOL(dev_change_carrier);
6091 
6092 /**
6093  *	dev_get_phys_port_id - Get device physical port ID
6094  *	@dev: device
6095  *	@ppid: port ID
6096  *
6097  *	Get device physical port ID
6098  */
6099 int dev_get_phys_port_id(struct net_device *dev,
6100 			 struct netdev_phys_item_id *ppid)
6101 {
6102 	const struct net_device_ops *ops = dev->netdev_ops;
6103 
6104 	if (!ops->ndo_get_phys_port_id)
6105 		return -EOPNOTSUPP;
6106 	return ops->ndo_get_phys_port_id(dev, ppid);
6107 }
6108 EXPORT_SYMBOL(dev_get_phys_port_id);
6109 
6110 /**
6111  *	dev_get_phys_port_name - Get device physical port name
6112  *	@dev: device
6113  *	@name: port name
 *	@len: limit of bytes to copy to @name
6114  *
6115  *	Get device physical port name
6116  */
6117 int dev_get_phys_port_name(struct net_device *dev,
6118 			   char *name, size_t len)
6119 {
6120 	const struct net_device_ops *ops = dev->netdev_ops;
6121 
6122 	if (!ops->ndo_get_phys_port_name)
6123 		return -EOPNOTSUPP;
6124 	return ops->ndo_get_phys_port_name(dev, name, len);
6125 }
6126 EXPORT_SYMBOL(dev_get_phys_port_name);
6127 
6128 /**
6129  *	dev_change_proto_down - update protocol port state information
6130  *	@dev: device
6131  *	@proto_down: new value
6132  *
6133  *	This info can be used by switch drivers to set the phys state of the
6134  *	port.
6135  */
6136 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6137 {
6138 	const struct net_device_ops *ops = dev->netdev_ops;
6139 
6140 	if (!ops->ndo_change_proto_down)
6141 		return -EOPNOTSUPP;
6142 	if (!netif_device_present(dev))
6143 		return -ENODEV;
6144 	return ops->ndo_change_proto_down(dev, proto_down);
6145 }
6146 EXPORT_SYMBOL(dev_change_proto_down);
6147 
6148 /**
6149  *	dev_new_index	-	allocate an ifindex
6150  *	@net: the applicable net namespace
6151  *
6152  *	Returns a suitable unique value for a new device interface
6153  *	number.  The caller must hold the rtnl semaphore or the
6154  *	dev_base_lock to be sure it remains unique.
6155  */
6156 static int dev_new_index(struct net *net)
6157 {
6158 	int ifindex = net->ifindex;
6159 	for (;;) {
6160 		if (++ifindex <= 0)
6161 			ifindex = 1;
6162 		if (!__dev_get_by_index(net, ifindex))
6163 			return net->ifindex = ifindex;
6164 	}
6165 }
6166 
6167 /* Delayed registration/unregistration */
6168 static LIST_HEAD(net_todo_list);
6169 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6170 
6171 static void net_set_todo(struct net_device *dev)
6172 {
6173 	list_add_tail(&dev->todo_list, &net_todo_list);
6174 	dev_net(dev)->dev_unreg_count++;
6175 }
6176 
6177 static void rollback_registered_many(struct list_head *head)
6178 {
6179 	struct net_device *dev, *tmp;
6180 	LIST_HEAD(close_head);
6181 
6182 	BUG_ON(dev_boot_phase);
6183 	ASSERT_RTNL();
6184 
6185 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6186 		/* Some devices call this without ever having registered, to
6187 		 * unwind a failed initialization. Remove those devices and
6188 		 * proceed with the remaining ones.
6189 		 */
6190 		if (dev->reg_state == NETREG_UNINITIALIZED) {
6191 			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6192 				 dev->name, dev);
6193 
6194 			WARN_ON(1);
6195 			list_del(&dev->unreg_list);
6196 			continue;
6197 		}
6198 		dev->dismantle = true;
6199 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
6200 	}
6201 
6202 	/* If device is running, close it first. */
6203 	list_for_each_entry(dev, head, unreg_list)
6204 		list_add_tail(&dev->close_list, &close_head);
6205 	dev_close_many(&close_head, true);
6206 
6207 	list_for_each_entry(dev, head, unreg_list) {
6208 		/* And unlink it from device chain. */
6209 		unlist_netdevice(dev);
6210 
6211 		dev->reg_state = NETREG_UNREGISTERING;
6212 		on_each_cpu(flush_backlog, dev, 1);
6213 	}
6214 
6215 	synchronize_net();
6216 
6217 	list_for_each_entry(dev, head, unreg_list) {
6218 		struct sk_buff *skb = NULL;
6219 
6220 		/* Shutdown queueing discipline. */
6221 		dev_shutdown(dev);
6222 
6223 
6225 		/* Notify protocols that we are about to destroy
6226 		   this device. They should clean up all of their state.
6227 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6228 
6229 		if (!dev->rtnl_link_ops ||
6230 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6231 			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6232 						     GFP_KERNEL);
6233 
6234 		/*
6235 		 *	Flush the unicast and multicast chains
6236 		 */
6237 		dev_uc_flush(dev);
6238 		dev_mc_flush(dev);
6239 
6240 		if (dev->netdev_ops->ndo_uninit)
6241 			dev->netdev_ops->ndo_uninit(dev);
6242 
6243 		if (skb)
6244 			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6245 
6246 		/* Notifier chain MUST detach all upper devices from us. */
6247 		WARN_ON(netdev_has_any_upper_dev(dev));
6248 
6249 		/* Remove entries from kobject tree */
6250 		netdev_unregister_kobject(dev);
6251 #ifdef CONFIG_XPS
6252 		/* Remove XPS queueing entries */
6253 		netif_reset_xps_queues_gt(dev, 0);
6254 #endif
6255 	}
6256 
6257 	synchronize_net();
6258 
6259 	list_for_each_entry(dev, head, unreg_list)
6260 		dev_put(dev);
6261 }
6262 
6263 static void rollback_registered(struct net_device *dev)
6264 {
6265 	LIST_HEAD(single);
6266 
6267 	list_add(&dev->unreg_list, &single);
6268 	rollback_registered_many(&single);
6269 	list_del(&single);
6270 }
6271 
6272 static netdev_features_t netdev_fix_features(struct net_device *dev,
6273 	netdev_features_t features)
6274 {
6275 	/* Fix illegal checksum combinations */
6276 	if ((features & NETIF_F_HW_CSUM) &&
6277 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6278 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6279 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6280 	}
6281 
6282 	/* TSO requires that SG is present as well. */
6283 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6284 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6285 		features &= ~NETIF_F_ALL_TSO;
6286 	}
6287 
6288 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6289 					!(features & NETIF_F_IP_CSUM)) {
6290 		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6291 		features &= ~NETIF_F_TSO;
6292 		features &= ~NETIF_F_TSO_ECN;
6293 	}
6294 
6295 	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6296 					 !(features & NETIF_F_IPV6_CSUM)) {
6297 		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6298 		features &= ~NETIF_F_TSO6;
6299 	}
6300 
6301 	/* TSO ECN requires that TSO is present as well. */
6302 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6303 		features &= ~NETIF_F_TSO_ECN;
6304 
6305 	/* Software GSO depends on SG. */
6306 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6307 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6308 		features &= ~NETIF_F_GSO;
6309 	}
6310 
6311 	/* UFO needs SG and checksumming */
6312 	if (features & NETIF_F_UFO) {
6313 		/* maybe split UFO into V4 and V6? */
6314 		if (!((features & NETIF_F_GEN_CSUM) ||
6315 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6316 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6317 			netdev_dbg(dev,
6318 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
6319 			features &= ~NETIF_F_UFO;
6320 		}
6321 
6322 		if (!(features & NETIF_F_SG)) {
6323 			netdev_dbg(dev,
6324 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6325 			features &= ~NETIF_F_UFO;
6326 		}
6327 	}
6328 
6329 #ifdef CONFIG_NET_RX_BUSY_POLL
6330 	if (dev->netdev_ops->ndo_busy_poll)
6331 		features |= NETIF_F_BUSY_POLL;
6332 	else
6333 #endif
6334 		features &= ~NETIF_F_BUSY_POLL;
6335 
6336 	return features;
6337 }
6338 
6339 int __netdev_update_features(struct net_device *dev)
6340 {
6341 	netdev_features_t features;
6342 	int err = 0;
6343 
6344 	ASSERT_RTNL();
6345 
6346 	features = netdev_get_wanted_features(dev);
6347 
6348 	if (dev->netdev_ops->ndo_fix_features)
6349 		features = dev->netdev_ops->ndo_fix_features(dev, features);
6350 
6351 	/* driver might be less strict about feature dependencies */
6352 	features = netdev_fix_features(dev, features);
6353 
6354 	if (dev->features == features)
6355 		return 0;
6356 
6357 	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6358 		&dev->features, &features);
6359 
6360 	if (dev->netdev_ops->ndo_set_features)
6361 		err = dev->netdev_ops->ndo_set_features(dev, features);
6362 
6363 	if (unlikely(err < 0)) {
6364 		netdev_err(dev,
6365 			"set_features() failed (%d); wanted %pNF, left %pNF\n",
6366 			err, &features, &dev->features);
6367 		return -1;
6368 	}
6369 
6370 	if (!err)
6371 		dev->features = features;
6372 
6373 	return 1;
6374 }
6375 
6376 /**
6377  *	netdev_update_features - recalculate device features
6378  *	@dev: the device to check
6379  *
6380  *	Recalculate dev->features set and send notifications if it
6381  *	has changed. Should be called after driver or hardware dependent
6382  *	has changed. Should be called after driver- or hardware-dependent
6383  *	conditions that influence the features might have changed.
6384 void netdev_update_features(struct net_device *dev)
6385 {
6386 	if (__netdev_update_features(dev))
6387 		netdev_features_change(dev);
6388 }
6389 EXPORT_SYMBOL(netdev_update_features);
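
/*
 * A minimal usage sketch (hypothetical; disabling LRO is just an example
 * trigger): a driver that must turn an offload off at runtime clears the
 * bit and asks the core to re-evaluate, so netdev_fix_features() and the
 * driver's own ndo_fix_features() run again and notifications are sent.
 */
static void __maybe_unused example_disable_lro(struct net_device *dev)
{
	ASSERT_RTNL();

	dev->hw_features &= ~NETIF_F_LRO;
	dev->wanted_features &= ~NETIF_F_LRO;
	netdev_update_features(dev);
}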
6390 
6391 /**
6392  *	netdev_change_features - recalculate device features
6393  *	@dev: the device to check
6394  *
6395  *	Recalculate dev->features set and send notifications even
6396  *	if they have not changed. Should be called instead of
6397  *	netdev_update_features() if dev->vlan_features might also have
6398  *	changed, so that the changes are propagated to stacked
6399  *	VLAN devices.
6400  */
6401 void netdev_change_features(struct net_device *dev)
6402 {
6403 	__netdev_update_features(dev);
6404 	netdev_features_change(dev);
6405 }
6406 EXPORT_SYMBOL(netdev_change_features);
6407 
6408 /**
6409  *	netif_stacked_transfer_operstate -	transfer operstate
6410  *	@rootdev: the root or lower level device to transfer state from
6411  *	@dev: the device to transfer operstate to
6412  *
6413  *	Transfer operational state from root to device. This is normally
6414  *	called when a stacking relationship exists between the root
6415  *	device and the device (a leaf device).
6416  */
6417 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6418 					struct net_device *dev)
6419 {
6420 	if (rootdev->operstate == IF_OPER_DORMANT)
6421 		netif_dormant_on(dev);
6422 	else
6423 		netif_dormant_off(dev);
6424 
6425 	if (netif_carrier_ok(rootdev)) {
6426 		if (!netif_carrier_ok(dev))
6427 			netif_carrier_on(dev);
6428 	} else {
6429 		if (netif_carrier_ok(dev))
6430 			netif_carrier_off(dev);
6431 	}
6432 }
6433 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
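
/*
 * A minimal usage sketch (hypothetical notifier; a real stacked driver
 * would only act on its own lower devices): mirroring the lower device's
 * carrier/dormant state onto the upper device on NETDEV_CHANGE.
 */
static int __maybe_unused example_netdev_event(struct notifier_block *nb,
					       unsigned long event, void *ptr)
{
	struct net_device *lower = netdev_notifier_info_to_dev(ptr);
	struct net_device *upper = netdev_master_upper_dev_get(lower);

	if (upper && event == NETDEV_CHANGE)
		netif_stacked_transfer_operstate(lower, upper);
	return NOTIFY_DONE;
}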
6434 
6435 #ifdef CONFIG_SYSFS
6436 static int netif_alloc_rx_queues(struct net_device *dev)
6437 {
6438 	unsigned int i, count = dev->num_rx_queues;
6439 	struct netdev_rx_queue *rx;
6440 	size_t sz = count * sizeof(*rx);
6441 
6442 	BUG_ON(count < 1);
6443 
6444 	rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6445 	if (!rx) {
6446 		rx = vzalloc(sz);
6447 		if (!rx)
6448 			return -ENOMEM;
6449 	}
6450 	dev->_rx = rx;
6451 
6452 	for (i = 0; i < count; i++)
6453 		rx[i].dev = dev;
6454 	return 0;
6455 }
6456 #endif
6457 
6458 static void netdev_init_one_queue(struct net_device *dev,
6459 				  struct netdev_queue *queue, void *_unused)
6460 {
6461 	/* Initialize queue lock */
6462 	spin_lock_init(&queue->_xmit_lock);
6463 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6464 	queue->xmit_lock_owner = -1;
6465 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6466 	queue->dev = dev;
6467 #ifdef CONFIG_BQL
6468 	dql_init(&queue->dql, HZ);
6469 #endif
6470 }
6471 
6472 static void netif_free_tx_queues(struct net_device *dev)
6473 {
6474 	kvfree(dev->_tx);
6475 }
6476 
6477 static int netif_alloc_netdev_queues(struct net_device *dev)
6478 {
6479 	unsigned int count = dev->num_tx_queues;
6480 	struct netdev_queue *tx;
6481 	size_t sz = count * sizeof(*tx);
6482 
6483 	if (count < 1 || count > 0xffff)
6484 		return -EINVAL;
6485 
6486 	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6487 	if (!tx) {
6488 		tx = vzalloc(sz);
6489 		if (!tx)
6490 			return -ENOMEM;
6491 	}
6492 	dev->_tx = tx;
6493 
6494 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6495 	spin_lock_init(&dev->tx_global_lock);
6496 
6497 	return 0;
6498 }
6499 
6500 void netif_tx_stop_all_queues(struct net_device *dev)
6501 {
6502 	unsigned int i;
6503 
6504 	for (i = 0; i < dev->num_tx_queues; i++) {
6505 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6506 		netif_tx_stop_queue(txq);
6507 	}
6508 }
6509 EXPORT_SYMBOL(netif_tx_stop_all_queues);
6510 
6511 /**
6512  *	register_netdevice	- register a network device
6513  *	@dev: device to register
6514  *
6515  *	Take a completed network device structure and add it to the kernel
6516  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6517  *	chain. 0 is returned on success. A negative errno code is returned
6518  *	on a failure to set up the device, or if the name is a duplicate.
6519  *
6520  *	Callers must hold the rtnl semaphore. You may want
6521  *	register_netdev() instead of this.
6522  *
6523  *	BUGS:
6524  *	The locking appears insufficient to guarantee two parallel registers
6525  *	will not get the same name.
6526  */
6527 
6528 int register_netdevice(struct net_device *dev)
6529 {
6530 	int ret;
6531 	struct net *net = dev_net(dev);
6532 
6533 	BUG_ON(dev_boot_phase);
6534 	ASSERT_RTNL();
6535 
6536 	might_sleep();
6537 
6538 	/* When net_device's are persistent, this will be fatal. */
6539 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6540 	BUG_ON(!net);
6541 
6542 	spin_lock_init(&dev->addr_list_lock);
6543 	netdev_set_addr_lockdep_class(dev);
6544 
6545 	ret = dev_get_valid_name(net, dev, dev->name);
6546 	if (ret < 0)
6547 		goto out;
6548 
6549 	/* Init, if this function is available */
6550 	if (dev->netdev_ops->ndo_init) {
6551 		ret = dev->netdev_ops->ndo_init(dev);
6552 		if (ret) {
6553 			if (ret > 0)
6554 				ret = -EIO;
6555 			goto out;
6556 		}
6557 	}
6558 
6559 	if (((dev->hw_features | dev->features) &
6560 	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
6561 	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6562 	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6563 		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6564 		ret = -EINVAL;
6565 		goto err_uninit;
6566 	}
6567 
6568 	ret = -EBUSY;
6569 	if (!dev->ifindex)
6570 		dev->ifindex = dev_new_index(net);
6571 	else if (__dev_get_by_index(net, dev->ifindex))
6572 		goto err_uninit;
6573 
6574 	/* Transfer changeable features to wanted_features and enable
6575 	 * software offloads (GSO and GRO).
6576 	 */
6577 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
6578 	dev->features |= NETIF_F_SOFT_FEATURES;
6579 	dev->wanted_features = dev->features & dev->hw_features;
6580 
6581 	if (!(dev->flags & IFF_LOOPBACK))
6582 		dev->hw_features |= NETIF_F_NOCACHE_COPY;
6584 
6585 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6586 	 */
6587 	dev->vlan_features |= NETIF_F_HIGHDMA;
6588 
6589 	/* Make NETIF_F_SG inheritable to tunnel devices.
6590 	 */
6591 	dev->hw_enc_features |= NETIF_F_SG;
6592 
6593 	/* Make NETIF_F_SG inheritable to MPLS.
6594 	 */
6595 	dev->mpls_features |= NETIF_F_SG;
6596 
6597 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6598 	ret = notifier_to_errno(ret);
6599 	if (ret)
6600 		goto err_uninit;
6601 
6602 	ret = netdev_register_kobject(dev);
6603 	if (ret)
6604 		goto err_uninit;
6605 	dev->reg_state = NETREG_REGISTERED;
6606 
6607 	__netdev_update_features(dev);
6608 
6609 	/*
6610 	 *	Default initial state at registration is that the
6611 	 *	device is present.
6612 	 */
6613 
6614 	set_bit(__LINK_STATE_PRESENT, &dev->state);
6615 
6616 	linkwatch_init_dev(dev);
6617 
6618 	dev_init_scheduler(dev);
6619 	dev_hold(dev);
6620 	list_netdevice(dev);
6621 	add_device_randomness(dev->dev_addr, dev->addr_len);
6622 
6623 	/* If the device has permanent device address, driver should
6624 	 * set dev_addr and also addr_assign_type should be set to
6625 	 * NET_ADDR_PERM (default value).
6626 	 */
6627 	if (dev->addr_assign_type == NET_ADDR_PERM)
6628 		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6629 
6630 	/* Notify protocols that a new device appeared. */
6631 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6632 	ret = notifier_to_errno(ret);
6633 	if (ret) {
6634 		rollback_registered(dev);
6635 		dev->reg_state = NETREG_UNREGISTERED;
6636 	}
6637 	/*
6638 	 *	Prevent userspace races by waiting until the network
6639 	 *	device is fully set up before sending notifications.
6640 	 */
6641 	if (!dev->rtnl_link_ops ||
6642 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6643 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6644 
6645 out:
6646 	return ret;
6647 
6648 err_uninit:
6649 	if (dev->netdev_ops->ndo_uninit)
6650 		dev->netdev_ops->ndo_uninit(dev);
6651 	goto out;
6652 }
6653 EXPORT_SYMBOL(register_netdevice);
6654 
6655 /**
6656  *	init_dummy_netdev	- init a dummy network device for NAPI
6657  *	@dev: device to init
6658  *
6659  *	This takes a network device structure and initializes the minimum
6660  *	number of fields so it can be used to schedule NAPI polls without
6661  *	registering a full blown interface. This is to be used by drivers
6662  *	that need to tie several hardware interfaces to a single NAPI
6663  *	poll scheduler due to HW limitations.
6664  */
6665 int init_dummy_netdev(struct net_device *dev)
6666 {
6667 	/* Clear everything. Note we don't initialize spinlocks
6668 	 * as they aren't supposed to be taken by any of the
6669 	 * NAPI code and this dummy netdev is supposed to be
6670 	 * only ever used for NAPI polls.
6671 	 */
6672 	memset(dev, 0, sizeof(struct net_device));
6673 
6674 	/* make sure we BUG if trying to hit standard
6675 	 * register/unregister code path
6676 	 */
6677 	dev->reg_state = NETREG_DUMMY;
6678 
6679 	/* NAPI wants this */
6680 	INIT_LIST_HEAD(&dev->napi_list);
6681 
6682 	/* a dummy interface is started by default */
6683 	set_bit(__LINK_STATE_PRESENT, &dev->state);
6684 	set_bit(__LINK_STATE_START, &dev->state);
6685 
6686 	/* Note: We don't allocate pcpu_refcnt for dummy devices,
6687 	 * because users of this 'device' don't need to change
6688 	 * its refcount.
6689 	 */
6690 
6691 	return 0;
6692 }
6693 EXPORT_SYMBOL_GPL(init_dummy_netdev);
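
/*
 * A minimal usage sketch (hypothetical driver structure and names): a
 * device with one interrupt/DMA engine shared by several ports hangs its
 * NAPI context off an embedded dummy netdev instead of any real interface.
 */
struct example_hw {
	struct net_device napi_dev;	/* dummy, never registered */
	struct napi_struct napi;
};

static void __maybe_unused example_hw_init_napi(struct example_hw *hw,
						int (*poll)(struct napi_struct *, int))
{
	init_dummy_netdev(&hw->napi_dev);
	netif_napi_add(&hw->napi_dev, &hw->napi, poll, NAPI_POLL_WEIGHT);
	napi_enable(&hw->napi);
}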
6694 
6695 
6696 /**
6697  *	register_netdev	- register a network device
6698  *	@dev: device to register
6699  *
6700  *	Take a completed network device structure and add it to the kernel
6701  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6702  *	chain. 0 is returned on success. A negative errno code is returned
6703  *	on a failure to set up the device, or if the name is a duplicate.
6704  *
6705  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
6706  *	and expands the device name if you passed a format string to
6707  *	alloc_netdev.
6708  */
6709 int register_netdev(struct net_device *dev)
6710 {
6711 	int err;
6712 
6713 	rtnl_lock();
6714 	err = register_netdevice(dev);
6715 	rtnl_unlock();
6716 	return err;
6717 }
6718 EXPORT_SYMBOL(register_netdev);
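
/*
 * A minimal probe/remove sketch (hypothetical; the private struct, setup
 * callback and name format are invented - a real Ethernet driver would
 * normally use alloc_etherdev() from <linux/etherdevice.h> instead).
 */
struct example_priv {
	int placeholder;
};

static void example_setup(struct net_device *dev)
{
	dev->flags |= IFF_NOARP;
	dev->tx_queue_len = 0;	/* gets IFF_NO_QUEUE from alloc_netdev_mqs() */
}

static struct net_device * __maybe_unused example_probe(void)
{
	struct net_device *dev;

	dev = alloc_netdev(sizeof(struct example_priv), "example%d",
			   NET_NAME_UNKNOWN, example_setup);
	if (!dev)
		return NULL;

	if (register_netdev(dev)) {	/* takes and releases RTNL itself */
		free_netdev(dev);
		return NULL;
	}
	return dev;
}

static void __maybe_unused example_remove(struct net_device *dev)
{
	unregister_netdev(dev);
	free_netdev(dev);
}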
6719 
6720 int netdev_refcnt_read(const struct net_device *dev)
6721 {
6722 	int i, refcnt = 0;
6723 
6724 	for_each_possible_cpu(i)
6725 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6726 	return refcnt;
6727 }
6728 EXPORT_SYMBOL(netdev_refcnt_read);
6729 
6730 /**
6731  * netdev_wait_allrefs - wait until all references are gone.
6732  * @dev: target net_device
6733  *
6734  * This is called when unregistering network devices.
6735  *
6736  * Any protocol or device that holds a reference should register
6737  * for netdevice notification, and clean up and release the
6738  * reference when it receives an UNREGISTER event.
6739  * We can get stuck here if buggy protocols don't correctly
6740  * call dev_put.
6741  */
6742 static void netdev_wait_allrefs(struct net_device *dev)
6743 {
6744 	unsigned long rebroadcast_time, warning_time;
6745 	int refcnt;
6746 
6747 	linkwatch_forget_dev(dev);
6748 
6749 	rebroadcast_time = warning_time = jiffies;
6750 	refcnt = netdev_refcnt_read(dev);
6751 
6752 	while (refcnt != 0) {
6753 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6754 			rtnl_lock();
6755 
6756 			/* Rebroadcast unregister notification */
6757 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6758 
6759 			__rtnl_unlock();
6760 			rcu_barrier();
6761 			rtnl_lock();
6762 
6763 			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6764 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6765 				     &dev->state)) {
6766 				/* We must not have linkwatch events
6767 				 * pending on unregister. If this
6768 				 * happens, we simply run the queue
6769 				 * unscheduled, resulting in a noop
6770 				 * for this device.
6771 				 */
6772 				linkwatch_run_queue();
6773 			}
6774 
6775 			__rtnl_unlock();
6776 
6777 			rebroadcast_time = jiffies;
6778 		}
6779 
6780 		msleep(250);
6781 
6782 		refcnt = netdev_refcnt_read(dev);
6783 
6784 		if (time_after(jiffies, warning_time + 10 * HZ)) {
6785 			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6786 				 dev->name, refcnt);
6787 			warning_time = jiffies;
6788 		}
6789 	}
6790 }
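
/*
 * A minimal sketch of a well-behaved reference holder (hypothetical cache
 * and names): whoever keeps a dev_hold() reference must drop it from a
 * netdevice notifier on NETDEV_UNREGISTER, otherwise netdev_wait_allrefs()
 * above spins forever printing the "waiting for %s to become free" message.
 */
static struct net_device *example_cached_dev;	/* holds one dev_hold() ref */

static int __maybe_unused example_unregister_event(struct notifier_block *nb,
						   unsigned long event,
						   void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_UNREGISTER && dev == example_cached_dev) {
		example_cached_dev = NULL;
		dev_put(dev);		/* give the reference back */
	}
	return NOTIFY_DONE;
}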
6791 
6792 /* The sequence is:
6793  *
6794  *	rtnl_lock();
6795  *	...
6796  *	register_netdevice(x1);
6797  *	register_netdevice(x2);
6798  *	...
6799  *	unregister_netdevice(y1);
6800  *	unregister_netdevice(y2);
6801  *      ...
6802  *	rtnl_unlock();
6803  *	free_netdev(y1);
6804  *	free_netdev(y2);
6805  *
6806  * We are invoked by rtnl_unlock().
6807  * This allows us to deal with problems:
6808  * 1) We can delete sysfs objects which invoke hotplug
6809  *    without deadlocking with linkwatch via keventd.
6810  * 2) Since we run with the RTNL semaphore not held, we can sleep
6811  *    safely in order to wait for the netdev refcnt to drop to zero.
6812  *
6813  * We must not return until all unregister events added during
6814  * the interval the lock was held have been completed.
6815  */
6816 void netdev_run_todo(void)
6817 {
6818 	struct list_head list;
6819 
6820 	/* Snapshot list, allow later requests */
6821 	list_replace_init(&net_todo_list, &list);
6822 
6823 	__rtnl_unlock();
6824 
6825 
6826 	/* Wait for rcu callbacks to finish before next phase */
6827 	if (!list_empty(&list))
6828 		rcu_barrier();
6829 
6830 	while (!list_empty(&list)) {
6831 		struct net_device *dev
6832 			= list_first_entry(&list, struct net_device, todo_list);
6833 		list_del(&dev->todo_list);
6834 
6835 		rtnl_lock();
6836 		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6837 		__rtnl_unlock();
6838 
6839 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6840 			pr_err("network todo '%s' but state %d\n",
6841 			       dev->name, dev->reg_state);
6842 			dump_stack();
6843 			continue;
6844 		}
6845 
6846 		dev->reg_state = NETREG_UNREGISTERED;
6847 
6848 		netdev_wait_allrefs(dev);
6849 
6850 		/* paranoia */
6851 		BUG_ON(netdev_refcnt_read(dev));
6852 		BUG_ON(!list_empty(&dev->ptype_all));
6853 		BUG_ON(!list_empty(&dev->ptype_specific));
6854 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
6855 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6856 		WARN_ON(dev->dn_ptr);
6857 
6858 		if (dev->destructor)
6859 			dev->destructor(dev);
6860 
6861 		/* Report a network device has been unregistered */
6862 		rtnl_lock();
6863 		dev_net(dev)->dev_unreg_count--;
6864 		__rtnl_unlock();
6865 		wake_up(&netdev_unregistering_wq);
6866 
6867 		/* Free network device */
6868 		kobject_put(&dev->dev.kobj);
6869 	}
6870 }
6871 
6872 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6873  * fields in the same order, with only the type differing.
6874  */
6875 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6876 			     const struct net_device_stats *netdev_stats)
6877 {
6878 #if BITS_PER_LONG == 64
6879 	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6880 	memcpy(stats64, netdev_stats, sizeof(*stats64));
6881 #else
6882 	size_t i, n = sizeof(*stats64) / sizeof(u64);
6883 	const unsigned long *src = (const unsigned long *)netdev_stats;
6884 	u64 *dst = (u64 *)stats64;
6885 
6886 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6887 		     sizeof(*stats64) / sizeof(u64));
6888 	for (i = 0; i < n; i++)
6889 		dst[i] = src[i];
6890 #endif
6891 }
6892 EXPORT_SYMBOL(netdev_stats_to_stats64);
6893 
6894 /**
6895  *	dev_get_stats	- get network device statistics
6896  *	@dev: device to get statistics from
6897  *	@storage: place to store stats
6898  *
6899  *	Get network statistics from device. Return @storage.
6900  *	The device driver may provide its own method by setting
6901  *	dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
6902  *	otherwise the internal statistics structure is used.
6903  */
6904 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6905 					struct rtnl_link_stats64 *storage)
6906 {
6907 	const struct net_device_ops *ops = dev->netdev_ops;
6908 
6909 	if (ops->ndo_get_stats64) {
6910 		memset(storage, 0, sizeof(*storage));
6911 		ops->ndo_get_stats64(dev, storage);
6912 	} else if (ops->ndo_get_stats) {
6913 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6914 	} else {
6915 		netdev_stats_to_stats64(storage, &dev->stats);
6916 	}
6917 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6918 	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6919 	return storage;
6920 }
6921 EXPORT_SYMBOL(dev_get_stats);
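
/*
 * A minimal usage sketch (hypothetical helper): callers pass on-stack
 * storage and dev_get_stats() fills it from whichever method the driver
 * implements; the caller only has to ensure the device cannot go away
 * underneath it.
 */
static void __maybe_unused example_print_rx_stats(struct net_device *dev)
{
	struct rtnl_link_stats64 stats;

	dev_get_stats(dev, &stats);
	netdev_info(dev, "rx packets %llu, rx dropped %llu\n",
		    (unsigned long long)stats.rx_packets,
		    (unsigned long long)stats.rx_dropped);
}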
6922 
6923 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6924 {
6925 	struct netdev_queue *queue = dev_ingress_queue(dev);
6926 
6927 #ifdef CONFIG_NET_CLS_ACT
6928 	if (queue)
6929 		return queue;
6930 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6931 	if (!queue)
6932 		return NULL;
6933 	netdev_init_one_queue(dev, queue, NULL);
6934 	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
6935 	queue->qdisc_sleeping = &noop_qdisc;
6936 	rcu_assign_pointer(dev->ingress_queue, queue);
6937 #endif
6938 	return queue;
6939 }
6940 
6941 static const struct ethtool_ops default_ethtool_ops;
6942 
6943 void netdev_set_default_ethtool_ops(struct net_device *dev,
6944 				    const struct ethtool_ops *ops)
6945 {
6946 	if (dev->ethtool_ops == &default_ethtool_ops)
6947 		dev->ethtool_ops = ops;
6948 }
6949 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6950 
6951 void netdev_freemem(struct net_device *dev)
6952 {
6953 	char *addr = (char *)dev - dev->padded;
6954 
6955 	kvfree(addr);
6956 }
6957 
6958 /**
6959  *	alloc_netdev_mqs - allocate network device
6960  *	@sizeof_priv:		size of private data to allocate space for
6961  *	@name:			device name format string
6962  *	@name_assign_type: 	origin of device name
6963  *	@setup:			callback to initialize device
6964  *	@txqs:			the number of TX subqueues to allocate
6965  *	@rxqs:			the number of RX subqueues to allocate
6966  *
6967  *	Allocates a struct net_device with private data area for driver use
6968  *	and performs basic initialization.  Also allocates subqueue structs
6969  *	for each queue on the device.
6970  */
6971 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6972 		unsigned char name_assign_type,
6973 		void (*setup)(struct net_device *),
6974 		unsigned int txqs, unsigned int rxqs)
6975 {
6976 	struct net_device *dev;
6977 	size_t alloc_size;
6978 	struct net_device *p;
6979 
6980 	BUG_ON(strlen(name) >= sizeof(dev->name));
6981 
6982 	if (txqs < 1) {
6983 		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6984 		return NULL;
6985 	}
6986 
6987 #ifdef CONFIG_SYSFS
6988 	if (rxqs < 1) {
6989 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6990 		return NULL;
6991 	}
6992 #endif
6993 
6994 	alloc_size = sizeof(struct net_device);
6995 	if (sizeof_priv) {
6996 		/* ensure 32-byte alignment of private area */
6997 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6998 		alloc_size += sizeof_priv;
6999 	}
7000 	/* ensure 32-byte alignment of whole construct */
7001 	alloc_size += NETDEV_ALIGN - 1;
7002 
7003 	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7004 	if (!p)
7005 		p = vzalloc(alloc_size);
7006 	if (!p)
7007 		return NULL;
7008 
7009 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
7010 	dev->padded = (char *)dev - (char *)p;
7011 
7012 	dev->pcpu_refcnt = alloc_percpu(int);
7013 	if (!dev->pcpu_refcnt)
7014 		goto free_dev;
7015 
7016 	if (dev_addr_init(dev))
7017 		goto free_pcpu;
7018 
7019 	dev_mc_init(dev);
7020 	dev_uc_init(dev);
7021 
7022 	dev_net_set(dev, &init_net);
7023 
7024 	dev->gso_max_size = GSO_MAX_SIZE;
7025 	dev->gso_max_segs = GSO_MAX_SEGS;
7026 	dev->gso_min_segs = 0;
7027 
7028 	INIT_LIST_HEAD(&dev->napi_list);
7029 	INIT_LIST_HEAD(&dev->unreg_list);
7030 	INIT_LIST_HEAD(&dev->close_list);
7031 	INIT_LIST_HEAD(&dev->link_watch_list);
7032 	INIT_LIST_HEAD(&dev->adj_list.upper);
7033 	INIT_LIST_HEAD(&dev->adj_list.lower);
7034 	INIT_LIST_HEAD(&dev->all_adj_list.upper);
7035 	INIT_LIST_HEAD(&dev->all_adj_list.lower);
7036 	INIT_LIST_HEAD(&dev->ptype_all);
7037 	INIT_LIST_HEAD(&dev->ptype_specific);
7038 	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7039 	setup(dev);
7040 
7041 	if (!dev->tx_queue_len)
7042 		dev->priv_flags |= IFF_NO_QUEUE;
7043 
7044 	dev->num_tx_queues = txqs;
7045 	dev->real_num_tx_queues = txqs;
7046 	if (netif_alloc_netdev_queues(dev))
7047 		goto free_all;
7048 
7049 #ifdef CONFIG_SYSFS
7050 	dev->num_rx_queues = rxqs;
7051 	dev->real_num_rx_queues = rxqs;
7052 	if (netif_alloc_rx_queues(dev))
7053 		goto free_all;
7054 #endif
7055 
7056 	strcpy(dev->name, name);
7057 	dev->name_assign_type = name_assign_type;
7058 	dev->group = INIT_NETDEV_GROUP;
7059 	if (!dev->ethtool_ops)
7060 		dev->ethtool_ops = &default_ethtool_ops;
7061 
7062 	nf_hook_ingress_init(dev);
7063 
7064 	return dev;
7065 
7066 free_all:
7067 	free_netdev(dev);
7068 	return NULL;
7069 
7070 free_pcpu:
7071 	free_percpu(dev->pcpu_refcnt);
7072 free_dev:
7073 	netdev_freemem(dev);
7074 	return NULL;
7075 }
7076 EXPORT_SYMBOL(alloc_netdev_mqs);
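
/*
 * A minimal allocation sketch (hypothetical names; the setup callback is
 * supplied by the caller): a multiqueue driver asks for one TX/RX queue
 * pair per channel and reaches its private area through netdev_priv().
 */
struct example_mq_priv {
	unsigned int nchannels;
};

static struct net_device * __maybe_unused
example_alloc_mq(unsigned int nchannels, void (*setup)(struct net_device *))
{
	struct net_device *dev;
	struct example_mq_priv *priv;

	dev = alloc_netdev_mqs(sizeof(*priv), "mq%d", NET_NAME_UNKNOWN,
			       setup, nchannels, nchannels);
	if (!dev)
		return NULL;

	priv = netdev_priv(dev);
	priv->nchannels = nchannels;
	return dev;
}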
7077 
7078 /**
7079  *	free_netdev - free network device
7080  *	@dev: device
7081  *
7082  *	This function does the last stage of destroying an allocated device
7083  * 	interface. The reference to the device object is released.
7084  *	If this is the last reference then it will be freed.
7085  */
7086 void free_netdev(struct net_device *dev)
7087 {
7088 	struct napi_struct *p, *n;
7089 
7090 	netif_free_tx_queues(dev);
7091 #ifdef CONFIG_SYSFS
7092 	kvfree(dev->_rx);
7093 #endif
7094 
7095 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7096 
7097 	/* Flush device addresses */
7098 	dev_addr_flush(dev);
7099 
7100 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7101 		netif_napi_del(p);
7102 
7103 	free_percpu(dev->pcpu_refcnt);
7104 	dev->pcpu_refcnt = NULL;
7105 
7106 	/*  Compatibility with error handling in drivers */
7107 	if (dev->reg_state == NETREG_UNINITIALIZED) {
7108 		netdev_freemem(dev);
7109 		return;
7110 	}
7111 
7112 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7113 	dev->reg_state = NETREG_RELEASED;
7114 
7115 	/* will free via device release */
7116 	put_device(&dev->dev);
7117 }
7118 EXPORT_SYMBOL(free_netdev);
7119 
7120 /**
7121  *	synchronize_net -  Synchronize with packet receive processing
7122  *
7123  *	Wait for packets currently being received to be done.
7124  *	Does not block later packets from starting.
7125  */
7126 void synchronize_net(void)
7127 {
7128 	might_sleep();
7129 	if (rtnl_is_locked())
7130 		synchronize_rcu_expedited();
7131 	else
7132 		synchronize_rcu();
7133 }
7134 EXPORT_SYMBOL(synchronize_net);
7135 
7136 /**
7137  *	unregister_netdevice_queue - remove device from the kernel
7138  *	@dev: device
7139  *	@head: list
7140  *
7141  *	This function shuts down a device interface and removes it
7142  *	from the kernel tables.
7143  *	If @head is not NULL, the device is queued to be unregistered later.
7144  *
7145  *	Callers must hold the rtnl semaphore.  You may want
7146  *	unregister_netdev() instead of this.
7147  */
7148 
7149 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7150 {
7151 	ASSERT_RTNL();
7152 
7153 	if (head) {
7154 		list_move_tail(&dev->unreg_list, head);
7155 	} else {
7156 		rollback_registered(dev);
7157 		/* Finish processing unregister after unlock */
7158 		net_set_todo(dev);
7159 	}
7160 }
7161 EXPORT_SYMBOL(unregister_netdevice_queue);
7162 
7163 /**
7164  *	unregister_netdevice_many - unregister many devices
7165  *	@head: list of devices
7166  *
7167  *  Note: As most callers use a stack-allocated list_head,
7168  *  we force a list_del() to make sure the stack won't be corrupted later.
7169  */
7170 void unregister_netdevice_many(struct list_head *head)
7171 {
7172 	struct net_device *dev;
7173 
7174 	if (!list_empty(head)) {
7175 		rollback_registered_many(head);
7176 		list_for_each_entry(dev, head, unreg_list)
7177 			net_set_todo(dev);
7178 		list_del(head);
7179 	}
7180 }
7181 EXPORT_SYMBOL(unregister_netdevice_many);
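
/*
 * A minimal batching sketch (hypothetical selection by dev->group): queueing
 * several devices on one stack-allocated list and unregistering them together
 * lets rollback_registered_many() above share the expensive synchronize_net()
 * and notifier work across all of them.
 */
static void __maybe_unused example_unregister_group(struct net *net, int group)
{
	struct net_device *dev, *tmp;
	LIST_HEAD(kill_list);

	ASSERT_RTNL();
	for_each_netdev_safe(net, dev, tmp)
		if (dev->group == group)
			unregister_netdevice_queue(dev, &kill_list);
	unregister_netdevice_many(&kill_list);
}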
7182 
7183 /**
7184  *	unregister_netdev - remove device from the kernel
7185  *	@dev: device
7186  *
7187  *	This function shuts down a device interface and removes it
7188  *	from the kernel tables.
7189  *
7190  *	This is just a wrapper for unregister_netdevice that takes
7191  *	the rtnl semaphore.  In general you want to use this and not
7192  *	unregister_netdevice.
7193  */
7194 void unregister_netdev(struct net_device *dev)
7195 {
7196 	rtnl_lock();
7197 	unregister_netdevice(dev);
7198 	rtnl_unlock();
7199 }
7200 EXPORT_SYMBOL(unregister_netdev);
7201 
7202 /**
7203  *	dev_change_net_namespace - move device to a different network namespace
7204  *	@dev: device
7205  *	@net: network namespace
7206  *	@pat: If not NULL name pattern to try if the current device name
7207  *	      is already taken in the destination network namespace.
7208  *
7209  *	This function shuts down a device interface and moves it
7210  *	to a new network namespace. On success 0 is returned, on
7211  *	a failure a negative errno code is returned.
7212  *
7213  *	Callers must hold the rtnl semaphore.
7214  */
7215 
7216 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7217 {
7218 	int err;
7219 
7220 	ASSERT_RTNL();
7221 
7222 	/* Don't allow namespace local devices to be moved. */
7223 	err = -EINVAL;
7224 	if (dev->features & NETIF_F_NETNS_LOCAL)
7225 		goto out;
7226 
7227 	/* Ensure the device has been registered */
7228 	if (dev->reg_state != NETREG_REGISTERED)
7229 		goto out;
7230 
7231 	/* Get out if there is nothing to do */
7232 	err = 0;
7233 	if (net_eq(dev_net(dev), net))
7234 		goto out;
7235 
7236 	/* Pick the destination device name, and ensure
7237 	 * we can use it in the destination network namespace.
7238 	 */
7239 	err = -EEXIST;
7240 	if (__dev_get_by_name(net, dev->name)) {
7241 		/* We get here if we can't use the current device name */
7242 		if (!pat)
7243 			goto out;
7244 		if (dev_get_valid_name(net, dev, pat) < 0)
7245 			goto out;
7246 	}
7247 
7248 	/*
7249 	 * And now a mini version of register_netdevice and unregister_netdevice.
7250 	 */
7251 
7252 	/* If device is running close it first. */
7253 	dev_close(dev);
7254 
7255 	/* And unlink it from device chain */
7256 	err = -ENODEV;
7257 	unlist_netdevice(dev);
7258 
7259 	synchronize_net();
7260 
7261 	/* Shutdown queueing discipline. */
7262 	dev_shutdown(dev);
7263 
7264 	/* Notify protocols that we are about to destroy
7265 	   this device. They should clean up all of their state.
7266 
7267 	   Note that dev->reg_state stays at NETREG_REGISTERED.
7268 	   This is intentional: this way 8021q and macvlan know
7269 	   the device is just moving and can keep their slaves up.
7270 	*/
7271 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7272 	rcu_barrier();
7273 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7274 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7275 
7276 	/*
7277 	 *	Flush the unicast and multicast chains
7278 	 */
7279 	dev_uc_flush(dev);
7280 	dev_mc_flush(dev);
7281 
7282 	/* Send a netdev-removed uevent to the old namespace */
7283 	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7284 	netdev_adjacent_del_links(dev);
7285 
7286 	/* Actually switch the network namespace */
7287 	dev_net_set(dev, net);
7288 
7289 	/* If there is an ifindex conflict assign a new one */
7290 	if (__dev_get_by_index(net, dev->ifindex))
7291 		dev->ifindex = dev_new_index(net);
7292 
7293 	/* Send a netdev-add uevent to the new namespace */
7294 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7295 	netdev_adjacent_add_links(dev);
7296 
7297 	/* Fixup kobjects */
7298 	err = device_rename(&dev->dev, dev->name);
7299 	WARN_ON(err);
7300 
7301 	/* Add the device back in the hashes */
7302 	list_netdevice(dev);
7303 
7304 	/* Notify protocols that a new device appeared. */
7305 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
7306 
7307 	/*
7308 	 *	Prevent userspace races by waiting until the network
7309 	 *	device is fully set up before sending notifications.
7310 	 */
7311 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7312 
7313 	synchronize_net();
7314 	err = 0;
7315 out:
7316 	return err;
7317 }
7318 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
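
/*
 * A minimal usage sketch (hypothetical helper; the "eth%d" fallback is an
 * assumption): moving a device into another namespace under RTNL, with a
 * name pattern supplied in case the current name is already taken there.
 */
static int __maybe_unused example_move_to_ns(struct net_device *dev,
					     struct net *net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "eth%d");
	rtnl_unlock();
	return err;
}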
7319 
7320 static int dev_cpu_callback(struct notifier_block *nfb,
7321 			    unsigned long action,
7322 			    void *ocpu)
7323 {
7324 	struct sk_buff **list_skb;
7325 	struct sk_buff *skb;
7326 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
7327 	struct softnet_data *sd, *oldsd;
7328 
7329 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7330 		return NOTIFY_OK;
7331 
7332 	local_irq_disable();
7333 	cpu = smp_processor_id();
7334 	sd = &per_cpu(softnet_data, cpu);
7335 	oldsd = &per_cpu(softnet_data, oldcpu);
7336 
7337 	/* Find end of our completion_queue. */
7338 	list_skb = &sd->completion_queue;
7339 	while (*list_skb)
7340 		list_skb = &(*list_skb)->next;
7341 	/* Append completion queue from offline CPU. */
7342 	*list_skb = oldsd->completion_queue;
7343 	oldsd->completion_queue = NULL;
7344 
7345 	/* Append output queue from offline CPU. */
7346 	if (oldsd->output_queue) {
7347 		*sd->output_queue_tailp = oldsd->output_queue;
7348 		sd->output_queue_tailp = oldsd->output_queue_tailp;
7349 		oldsd->output_queue = NULL;
7350 		oldsd->output_queue_tailp = &oldsd->output_queue;
7351 	}
7352 	/* Append NAPI poll list from offline CPU, with one exception:
7353 	 * process_backlog() must be called by the CPU owning the per-cpu backlog.
7354 	 * We properly handle process_queue & input_pkt_queue later.
7355 	 */
7356 	while (!list_empty(&oldsd->poll_list)) {
7357 		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7358 							    struct napi_struct,
7359 							    poll_list);
7360 
7361 		list_del_init(&napi->poll_list);
7362 		if (napi->poll == process_backlog)
7363 			napi->state = 0;
7364 		else
7365 			____napi_schedule(sd, napi);
7366 	}
7367 
7368 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
7369 	local_irq_enable();
7370 
7371 	/* Process offline CPU's input_pkt_queue */
7372 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7373 		netif_rx_ni(skb);
7374 		input_queue_head_incr(oldsd);
7375 	}
7376 	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7377 		netif_rx_ni(skb);
7378 		input_queue_head_incr(oldsd);
7379 	}
7380 
7381 	return NOTIFY_OK;
7382 }
7383 
7384 
7385 /**
7386  *	netdev_increment_features - increment feature set by one
7387  *	@all: current feature set
7388  *	@one: new feature set
7389  *	@mask: mask feature set
7390  *
7391  *	Computes a new feature set after adding a device with feature set
7392  *	@one to the master device with current feature set @all.  Will not
7393  *	enable anything that is off in @mask. Returns the new feature set.
7394  */
7395 netdev_features_t netdev_increment_features(netdev_features_t all,
7396 	netdev_features_t one, netdev_features_t mask)
7397 {
7398 	if (mask & NETIF_F_GEN_CSUM)
7399 		mask |= NETIF_F_ALL_CSUM;
7400 	mask |= NETIF_F_VLAN_CHALLENGED;
7401 
7402 	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7403 	all &= one | ~NETIF_F_ALL_FOR_ALL;
7404 
7405 	/* If one device supports hw checksumming, set for all. */
7406 	if (all & NETIF_F_GEN_CSUM)
7407 		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7408 
7409 	return all;
7410 }
7411 EXPORT_SYMBOL(netdev_increment_features);
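
/*
 * A minimal sketch of how a master device might use this (hypothetical and
 * simplified; real bonding/team code maintains more feature sets): fold each
 * lower device's features in with netdev_increment_features() and then let
 * the core re-run its fixups and notifications.
 */
static void __maybe_unused example_compute_master_features(struct net_device *master)
{
	netdev_features_t features = NETIF_F_ALL_FOR_ALL;
	struct net_device *lower;
	struct list_head *iter;

	ASSERT_RTNL();
	netdev_for_each_lower_dev(master, lower, iter)
		features = netdev_increment_features(features,
						     lower->features,
						     master->hw_features);
	master->wanted_features = features;
	netdev_update_features(master);
}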
7412 
7413 static struct hlist_head * __net_init netdev_create_hash(void)
7414 {
7415 	int i;
7416 	struct hlist_head *hash;
7417 
7418 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7419 	if (hash != NULL)
7420 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
7421 			INIT_HLIST_HEAD(&hash[i]);
7422 
7423 	return hash;
7424 }
7425 
7426 /* Initialize per network namespace state */
7427 static int __net_init netdev_init(struct net *net)
7428 {
7429 	if (net != &init_net)
7430 		INIT_LIST_HEAD(&net->dev_base_head);
7431 
7432 	net->dev_name_head = netdev_create_hash();
7433 	if (net->dev_name_head == NULL)
7434 		goto err_name;
7435 
7436 	net->dev_index_head = netdev_create_hash();
7437 	if (net->dev_index_head == NULL)
7438 		goto err_idx;
7439 
7440 	return 0;
7441 
7442 err_idx:
7443 	kfree(net->dev_name_head);
7444 err_name:
7445 	return -ENOMEM;
7446 }
7447 
7448 /**
7449  *	netdev_drivername - network driver for the device
7450  *	@dev: network device
7451  *
7452  *	Determine network driver for device.
7453  */
7454 const char *netdev_drivername(const struct net_device *dev)
7455 {
7456 	const struct device_driver *driver;
7457 	const struct device *parent;
7458 	const char *empty = "";
7459 
7460 	parent = dev->dev.parent;
7461 	if (!parent)
7462 		return empty;
7463 
7464 	driver = parent->driver;
7465 	if (driver && driver->name)
7466 		return driver->name;
7467 	return empty;
7468 }
7469 
7470 static void __netdev_printk(const char *level, const struct net_device *dev,
7471 			    struct va_format *vaf)
7472 {
7473 	if (dev && dev->dev.parent) {
7474 		dev_printk_emit(level[1] - '0',
7475 				dev->dev.parent,
7476 				"%s %s %s%s: %pV",
7477 				dev_driver_string(dev->dev.parent),
7478 				dev_name(dev->dev.parent),
7479 				netdev_name(dev), netdev_reg_state(dev),
7480 				vaf);
7481 	} else if (dev) {
7482 		printk("%s%s%s: %pV",
7483 		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
7484 	} else {
7485 		printk("%s(NULL net_device): %pV", level, vaf);
7486 	}
7487 }
7488 
7489 void netdev_printk(const char *level, const struct net_device *dev,
7490 		   const char *format, ...)
7491 {
7492 	struct va_format vaf;
7493 	va_list args;
7494 
7495 	va_start(args, format);
7496 
7497 	vaf.fmt = format;
7498 	vaf.va = &args;
7499 
7500 	__netdev_printk(level, dev, &vaf);
7501 
7502 	va_end(args);
7503 }
7504 EXPORT_SYMBOL(netdev_printk);
7505 
7506 #define define_netdev_printk_level(func, level)			\
7507 void func(const struct net_device *dev, const char *fmt, ...)	\
7508 {								\
7509 	struct va_format vaf;					\
7510 	va_list args;						\
7511 								\
7512 	va_start(args, fmt);					\
7513 								\
7514 	vaf.fmt = fmt;						\
7515 	vaf.va = &args;						\
7516 								\
7517 	__netdev_printk(level, dev, &vaf);			\
7518 								\
7519 	va_end(args);						\
7520 }								\
7521 EXPORT_SYMBOL(func);
7522 
7523 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7524 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7525 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7526 define_netdev_printk_level(netdev_err, KERN_ERR);
7527 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7528 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7529 define_netdev_printk_level(netdev_info, KERN_INFO);
7530 
7531 static void __net_exit netdev_exit(struct net *net)
7532 {
7533 	kfree(net->dev_name_head);
7534 	kfree(net->dev_index_head);
7535 }
7536 
7537 static struct pernet_operations __net_initdata netdev_net_ops = {
7538 	.init = netdev_init,
7539 	.exit = netdev_exit,
7540 };
7541 
7542 static void __net_exit default_device_exit(struct net *net)
7543 {
7544 	struct net_device *dev, *aux;
7545 	/*
7546 	 * Push all migratable network devices back to the
7547 	 * initial network namespace
7548 	 */
7549 	rtnl_lock();
7550 	for_each_netdev_safe(net, dev, aux) {
7551 		int err;
7552 		char fb_name[IFNAMSIZ];
7553 
7554 		/* Ignore unmovable devices (e.g. loopback) */
7555 		if (dev->features & NETIF_F_NETNS_LOCAL)
7556 			continue;
7557 
7558 		/* Leave virtual devices for the generic cleanup */
7559 		if (dev->rtnl_link_ops)
7560 			continue;
7561 
7562 		/* Push remaining network devices to init_net */
7563 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7564 		err = dev_change_net_namespace(dev, &init_net, fb_name);
7565 		if (err) {
7566 			pr_emerg("%s: failed to move %s to init_net: %d\n",
7567 				 __func__, dev->name, err);
7568 			BUG();
7569 		}
7570 	}
7571 	rtnl_unlock();
7572 }
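
/*
 * Illustrative note: a driver opts its device out of the move above by
 * setting NETIF_F_NETNS_LOCAL in its setup routine, as the loopback
 * driver does:
 *
 *	dev->features |= NETIF_F_NETNS_LOCAL;
 *
 * dev_change_net_namespace() refuses to move such devices, and
 * default_device_exit() simply skips them.
 */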
7573 
7574 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7575 {
7576 	/* Return with the rtnl_lock held when there are no network
7577 	 * devices unregistering in any network namespace in net_list.
7578 	 */
7579 	struct net *net;
7580 	bool unregistering;
7581 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
7582 
7583 	add_wait_queue(&netdev_unregistering_wq, &wait);
7584 	for (;;) {
7585 		unregistering = false;
7586 		rtnl_lock();
7587 		list_for_each_entry(net, net_list, exit_list) {
7588 			if (net->dev_unreg_count > 0) {
7589 				unregistering = true;
7590 				break;
7591 			}
7592 		}
7593 		if (!unregistering)
7594 			break;
7595 		__rtnl_unlock();
7596 
7597 		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7598 	}
7599 	remove_wait_queue(&netdev_unregistering_wq, &wait);
7600 }
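
/*
 * Note: the wake-up side of this wait is the unregistration path
 * (netdev_run_todo() drops net->dev_unreg_count and then wakes
 * netdev_unregistering_wq), which is why the loop above re-checks the
 * counts each time it retakes the rtnl lock.
 */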
7601 
7602 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7603 {
7604 	/* At exit, all network devices must be removed from a network
7605 	 * namespace.  Do this in the reverse order of registration.
7606 	 * Do this across as many network namespaces as possible to
7607 	 * improve batching efficiency.
7608 	 */
7609 	struct net_device *dev;
7610 	struct net *net;
7611 	LIST_HEAD(dev_kill_list);
7612 
7613 	/* To prevent network device cleanup code from dereferencing
7614 	 * loopback devices or network devices that have been freed,
7615 	 * wait here for all pending unregistrations to complete
7616 	 * before unregistering the loopback device and allowing the
7617 	 * network namespace to be freed.
7618 	 *
7619 	 * The netdev todo list containing all network device
7620 	 * unregistrations that happen in default_device_exit_batch
7621 	 * will run in the rtnl_unlock() at the end of
7622 	 * default_device_exit_batch.
7623 	 */
7624 	rtnl_lock_unregistering(net_list);
7625 	list_for_each_entry(net, net_list, exit_list) {
7626 		for_each_netdev_reverse(net, dev) {
7627 			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7628 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7629 			else
7630 				unregister_netdevice_queue(dev, &dev_kill_list);
7631 		}
7632 	}
7633 	unregister_netdevice_many(&dev_kill_list);
7634 	rtnl_unlock();
7635 }
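
/*
 * Illustrative sketch (hypothetical driver code): the same batching is
 * available to any code tearing down many interfaces at once; queueing
 * onto a local list and unregistering in one pass avoids a full rtnl
 * round-trip per device:
 *
 *	LIST_HEAD(kill_list);
 *	struct my_priv *p;
 *
 *	rtnl_lock();
 *	list_for_each_entry(p, &my_driver_list, node)
 *		unregister_netdevice_queue(p->dev, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 *
 * where my_priv, my_driver_list and node are hypothetical driver
 * bookkeeping.
 */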
7636 
7637 static struct pernet_operations __net_initdata default_device_ops = {
7638 	.exit = default_device_exit,
7639 	.exit_batch = default_device_exit_batch,
7640 };
7641 
7642 /*
7643  *	Initialize the DEV module. At boot time this walks the device list and
7644  *	unhooks any devices that fail to initialize (normally hardware not
7645  *	present) and leaves us with a valid list of present and active devices.
7646  *
7647  */
7648 
7649 /*
7650  *       This is called single-threaded during boot, so no need
7651  *       to take the rtnl semaphore.
7652  */
7653 static int __init net_dev_init(void)
7654 {
7655 	int i, rc = -ENOMEM;
7656 
7657 	BUG_ON(!dev_boot_phase);
7658 
7659 	if (dev_proc_init())
7660 		goto out;
7661 
7662 	if (netdev_kobject_init())
7663 		goto out;
7664 
7665 	INIT_LIST_HEAD(&ptype_all);
7666 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
7667 		INIT_LIST_HEAD(&ptype_base[i]);
7668 
7669 	INIT_LIST_HEAD(&offload_base);
7670 
7671 	if (register_pernet_subsys(&netdev_net_ops))
7672 		goto out;
7673 
7674 	/*
7675 	 *	Initialise the packet receive queues.
7676 	 */
7677 
7678 	for_each_possible_cpu(i) {
7679 		struct softnet_data *sd = &per_cpu(softnet_data, i);
7680 
7681 		skb_queue_head_init(&sd->input_pkt_queue);
7682 		skb_queue_head_init(&sd->process_queue);
7683 		INIT_LIST_HEAD(&sd->poll_list);
7684 		sd->output_queue_tailp = &sd->output_queue;
7685 #ifdef CONFIG_RPS
7686 		sd->csd.func = rps_trigger_softirq;
7687 		sd->csd.info = sd;
7688 		sd->cpu = i;
7689 #endif
7690 
7691 		sd->backlog.poll = process_backlog;
7692 		sd->backlog.weight = weight_p;
7693 	}
7694 
7695 	dev_boot_phase = 0;
7696 
7697 	/* The loopback device is special: if any other network device
7698 	 * is present in a network namespace, the loopback device must
7699 	 * be present too. Since we now dynamically allocate and free
7700 	 * the loopback device, ensure this invariant is maintained by
7701 	 * keeping the loopback device as the first device on the
7702 	 * list of network devices, ensuring the loopback device is
7703 	 * the first device that appears and the last network device
7704 	 * that disappears.
7705 	 */
7706 	if (register_pernet_device(&loopback_net_ops))
7707 		goto out;
7708 
7709 	if (register_pernet_device(&default_device_ops))
7710 		goto out;
7711 
7712 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7713 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7714 
7715 	hotcpu_notifier(dev_cpu_callback, 0);
7716 	dst_subsys_init();
7717 	rc = 0;
7718 out:
7719 	return rc;
7720 }
7721 
7722 subsys_initcall(net_dev_init);
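
/*
 * Illustrative note: the per-CPU backlog initialised in net_dev_init()
 * is itself a NAPI instance (poll = process_backlog, weight = weight_p)
 * that services netif_rx()-style delivery.  NAPI drivers instead
 * register their own poll routine against their device, e.g. with a
 * hypothetical my_poll() and driver-private struct:
 *
 *	netif_napi_add(dev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
 *	...
 *	napi_enable(&priv->napi);
 */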
7723