xref: /openbmc/linux/net/core/dev.c (revision 1f9f6a78)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/pkt_sched.h>
103 #include <net/checksum.h>
104 #include <net/xfrm.h>
105 #include <linux/highmem.h>
106 #include <linux/init.h>
107 #include <linux/module.h>
108 #include <linux/netpoll.h>
109 #include <linux/rcupdate.h>
110 #include <linux/delay.h>
111 #include <net/iw_handler.h>
112 #include <asm/current.h>
113 #include <linux/audit.h>
114 #include <linux/dmaengine.h>
115 #include <linux/err.h>
116 #include <linux/ctype.h>
117 #include <linux/if_arp.h>
118 #include <linux/if_vlan.h>
119 #include <linux/ip.h>
120 #include <net/ip.h>
121 #include <net/mpls.h>
122 #include <linux/ipv6.h>
123 #include <linux/in.h>
124 #include <linux/jhash.h>
125 #include <linux/random.h>
126 #include <trace/events/napi.h>
127 #include <trace/events/net.h>
128 #include <trace/events/skb.h>
129 #include <linux/pci.h>
130 #include <linux/inetdevice.h>
131 #include <linux/cpu_rmap.h>
132 #include <linux/static_key.h>
133 #include <linux/hashtable.h>
134 #include <linux/vmalloc.h>
135 #include <linux/if_macvlan.h>
136 #include <linux/errqueue.h>
137 #include <linux/hrtimer.h>
138 
139 #include "net-sysfs.h"
140 
141 /* Instead of increasing this, you should create a hash table. */
142 #define MAX_GRO_SKBS 8
143 
144 /* This should be increased if a protocol with a bigger head is added. */
145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
146 
147 static DEFINE_SPINLOCK(ptype_lock);
148 static DEFINE_SPINLOCK(offload_lock);
149 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
150 struct list_head ptype_all __read_mostly;	/* Taps */
151 static struct list_head offload_base __read_mostly;
152 
153 static int netif_rx_internal(struct sk_buff *skb);
154 static int call_netdevice_notifiers_info(unsigned long val,
155 					 struct net_device *dev,
156 					 struct netdev_notifier_info *info);
157 
158 /*
159  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
160  * semaphore.
161  *
162  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
163  *
164  * Writers must hold the rtnl semaphore while they loop through the
165  * dev_base_head list, and hold dev_base_lock for writing when they do the
166  * actual updates.  This allows pure readers to access the list even
167  * while a writer is preparing to update it.
168  *
169  * To put it another way, dev_base_lock is held for writing only to
170  * protect against pure readers; the rtnl semaphore provides the
171  * protection against other writers.
172  *
173  * See, for example usages, register_netdevice() and
174  * unregister_netdevice(), which must be called with the rtnl
175  * semaphore held.
176  */
177 DEFINE_RWLOCK(dev_base_lock);
178 EXPORT_SYMBOL(dev_base_lock);
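/*
 * Purely illustrative sketch of the rules above (RCU readers would instead
 * pair rcu_read_lock() with for_each_netdev_rcu()):
 *
 *	// pure reader
 *	read_lock(&dev_base_lock);
 *	for_each_netdev(net, dev)
 *		;			// look, but do not modify the list
 *	read_unlock(&dev_base_lock);
 *
 *	// writer (cf. list_netdevice() below)
 *	rtnl_lock();
 *	write_lock_bh(&dev_base_lock);
 *	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 *	write_unlock_bh(&dev_base_lock);
 *	rtnl_unlock();
 */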
179 
180 /* protects napi_hash addition/deletion and napi_gen_id */
181 static DEFINE_SPINLOCK(napi_hash_lock);
182 
183 static unsigned int napi_gen_id;
184 static DEFINE_HASHTABLE(napi_hash, 8);
185 
186 static seqcount_t devnet_rename_seq;
187 
188 static inline void dev_base_seq_inc(struct net *net)
189 {
190 	while (++net->dev_base_seq == 0);
191 }
192 
193 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
194 {
195 	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
196 
197 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
198 }
199 
200 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
201 {
202 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
203 }
204 
205 static inline void rps_lock(struct softnet_data *sd)
206 {
207 #ifdef CONFIG_RPS
208 	spin_lock(&sd->input_pkt_queue.lock);
209 #endif
210 }
211 
212 static inline void rps_unlock(struct softnet_data *sd)
213 {
214 #ifdef CONFIG_RPS
215 	spin_unlock(&sd->input_pkt_queue.lock);
216 #endif
217 }
218 
219 /* Device list insertion */
220 static void list_netdevice(struct net_device *dev)
221 {
222 	struct net *net = dev_net(dev);
223 
224 	ASSERT_RTNL();
225 
226 	write_lock_bh(&dev_base_lock);
227 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
228 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
229 	hlist_add_head_rcu(&dev->index_hlist,
230 			   dev_index_hash(net, dev->ifindex));
231 	write_unlock_bh(&dev_base_lock);
232 
233 	dev_base_seq_inc(net);
234 }
235 
236 /* Device list removal
237  * caller must respect an RCU grace period before freeing/reusing dev
238  */
239 static void unlist_netdevice(struct net_device *dev)
240 {
241 	ASSERT_RTNL();
242 
243 	/* Unlink dev from the device chain */
244 	write_lock_bh(&dev_base_lock);
245 	list_del_rcu(&dev->dev_list);
246 	hlist_del_rcu(&dev->name_hlist);
247 	hlist_del_rcu(&dev->index_hlist);
248 	write_unlock_bh(&dev_base_lock);
249 
250 	dev_base_seq_inc(dev_net(dev));
251 }
252 
253 /*
254  *	Our notifier list
255  */
256 
257 static RAW_NOTIFIER_HEAD(netdev_chain);
258 
259 /*
260  *	Device drivers call our routines to queue packets here. We empty the
261  *	queue in the local softnet handler.
262  */
263 
264 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
265 EXPORT_PER_CPU_SYMBOL(softnet_data);
266 
267 #ifdef CONFIG_LOCKDEP
268 /*
269  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
270  * according to dev->type
271  */
272 static const unsigned short netdev_lock_type[] =
273 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
274 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
275 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
276 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
277 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
278 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
279 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
280 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
281 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
282 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
283 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
284 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
285 	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
286 	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
287 	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
288 
289 static const char *const netdev_lock_name[] =
290 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
291 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
292 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
293 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
294 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
295 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
296 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
297 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
298 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
299 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
300 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
301 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
302 	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
303 	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
304 	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
305 
306 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
307 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
308 
309 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
310 {
311 	int i;
312 
313 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
314 		if (netdev_lock_type[i] == dev_type)
315 			return i;
316 	/* the last key is used by default */
317 	return ARRAY_SIZE(netdev_lock_type) - 1;
318 }
319 
320 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
321 						 unsigned short dev_type)
322 {
323 	int i;
324 
325 	i = netdev_lock_pos(dev_type);
326 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
327 				   netdev_lock_name[i]);
328 }
329 
330 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
331 {
332 	int i;
333 
334 	i = netdev_lock_pos(dev->type);
335 	lockdep_set_class_and_name(&dev->addr_list_lock,
336 				   &netdev_addr_lock_key[i],
337 				   netdev_lock_name[i]);
338 }
339 #else
340 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
341 						 unsigned short dev_type)
342 {
343 }
344 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
345 {
346 }
347 #endif
348 
349 /*******************************************************************************
350 
351 		Protocol management and registration routines
352 
353 *******************************************************************************/
354 
355 /*
356  *	Add a protocol ID to the list. Now that the input handler is
357  *	smarter we can dispense with all the messy stuff that used to be
358  *	here.
359  *
360  *	BEWARE!!! Protocol handlers that mangle input packets
361  *	MUST BE last in the hash buckets, and the walk over protocol
362  *	handlers MUST start from the promiscuous ptype_all chain in
363  *	net_bh. This holds today; do not change it.
364  *	Explanation: if a packet-mangling protocol handler were first in
365  *	the list, it could not sense that the packet is cloned and should
366  *	be copied-on-write, so it would modify it in place and subsequent
367  *	readers would get a broken packet.
368  *							--ANK (980803)
369  */
370 
371 static inline struct list_head *ptype_head(const struct packet_type *pt)
372 {
373 	if (pt->type == htons(ETH_P_ALL))
374 		return &ptype_all;
375 	else
376 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
377 }
378 
379 /**
380  *	dev_add_pack - add packet handler
381  *	@pt: packet type declaration
382  *
383  *	Add a protocol handler to the networking stack. The passed &packet_type
384  *	is linked into kernel lists and may not be freed until it has been
385  *	removed from the kernel lists.
386  *
387  *	This call does not sleep, therefore it cannot
388  *	guarantee that all CPUs that are in the middle of receiving packets
389  *	will see the new packet type (until the next received packet).
390  */
391 
392 void dev_add_pack(struct packet_type *pt)
393 {
394 	struct list_head *head = ptype_head(pt);
395 
396 	spin_lock(&ptype_lock);
397 	list_add_rcu(&pt->list, head);
398 	spin_unlock(&ptype_lock);
399 }
400 EXPORT_SYMBOL(dev_add_pack);
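/*
 * Illustrative sketch of a dev_add_pack()/dev_remove_pack() user; the
 * handler my_proto_rcv and the packet_type instance are hypothetical.
 *
 *	static int my_proto_rcv(struct sk_buff *skb, struct net_device *dev,
 *				struct packet_type *pt,
 *				struct net_device *orig_dev)
 *	{
 *		// inspect or consume skb, then release it
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_ptype __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.func = my_proto_rcv,
 *	};
 *
 *	dev_add_pack(&my_ptype);	// start seeing IPv4 frames
 *	...
 *	dev_remove_pack(&my_ptype);	// must precede freeing my_ptype
 */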
401 
402 /**
403  *	__dev_remove_pack	 - remove packet handler
404  *	@pt: packet type declaration
405  *
406  *	Remove a protocol handler that was previously added to the kernel
407  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
408  *	from the kernel lists and can be freed or reused once this function
409  *	returns.
410  *
411  *	The packet type might still be in use by receivers
412  *	and must not be freed until after all the CPUs have gone
413  *	through a quiescent state.
414  */
415 void __dev_remove_pack(struct packet_type *pt)
416 {
417 	struct list_head *head = ptype_head(pt);
418 	struct packet_type *pt1;
419 
420 	spin_lock(&ptype_lock);
421 
422 	list_for_each_entry(pt1, head, list) {
423 		if (pt == pt1) {
424 			list_del_rcu(&pt->list);
425 			goto out;
426 		}
427 	}
428 
429 	pr_warn("dev_remove_pack: %p not found\n", pt);
430 out:
431 	spin_unlock(&ptype_lock);
432 }
433 EXPORT_SYMBOL(__dev_remove_pack);
434 
435 /**
436  *	dev_remove_pack	 - remove packet handler
437  *	@pt: packet type declaration
438  *
439  *	Remove a protocol handler that was previously added to the kernel
440  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
441  *	from the kernel lists and can be freed or reused once this function
442  *	returns.
443  *
444  *	This call sleeps to guarantee that no CPU is looking at the packet
445  *	type after return.
446  */
447 void dev_remove_pack(struct packet_type *pt)
448 {
449 	__dev_remove_pack(pt);
450 
451 	synchronize_net();
452 }
453 EXPORT_SYMBOL(dev_remove_pack);
454 
455 
456 /**
457  *	dev_add_offload - register offload handlers
458  *	@po: protocol offload declaration
459  *
460  *	Add protocol offload handlers to the networking stack. The passed
461  *	&proto_offload is linked into kernel lists and may not be freed until
462  *	it has been removed from the kernel lists.
463  *
464  *	This call does not sleep, therefore it cannot
465  *	guarantee that all CPUs that are in the middle of receiving packets
466  *	will see the new offload handlers (until the next received packet).
467  */
468 void dev_add_offload(struct packet_offload *po)
469 {
470 	struct list_head *head = &offload_base;
471 
472 	spin_lock(&offload_lock);
473 	list_add_rcu(&po->list, head);
474 	spin_unlock(&offload_lock);
475 }
476 EXPORT_SYMBOL(dev_add_offload);
477 
478 /**
479  *	__dev_remove_offload	 - remove offload handler
480  *	@po: packet offload declaration
481  *
482  *	Remove a protocol offload handler that was previously added to the
483  *	kernel offload handlers by dev_add_offload(). The passed &offload_type
484  *	is removed from the kernel lists and can be freed or reused once this
485  *	function returns.
486  *
487  *	The packet type might still be in use by receivers
488  *	and must not be freed until after all the CPUs have gone
489  *	through a quiescent state.
490  */
491 static void __dev_remove_offload(struct packet_offload *po)
492 {
493 	struct list_head *head = &offload_base;
494 	struct packet_offload *po1;
495 
496 	spin_lock(&offload_lock);
497 
498 	list_for_each_entry(po1, head, list) {
499 		if (po == po1) {
500 			list_del_rcu(&po->list);
501 			goto out;
502 		}
503 	}
504 
505 	pr_warn("dev_remove_offload: %p not found\n", po);
506 out:
507 	spin_unlock(&offload_lock);
508 }
509 
510 /**
511  *	dev_remove_offload	 - remove packet offload handler
512  *	@po: packet offload declaration
513  *
514  *	Remove a packet offload handler that was previously added to the kernel
515  *	offload handlers by dev_add_offload(). The passed &offload_type is
516  *	removed from the kernel lists and can be freed or reused once this
517  *	function returns.
518  *
519  *	This call sleeps to guarantee that no CPU is looking at the packet
520  *	type after return.
521  */
522 void dev_remove_offload(struct packet_offload *po)
523 {
524 	__dev_remove_offload(po);
525 
526 	synchronize_net();
527 }
528 EXPORT_SYMBOL(dev_remove_offload);
529 
530 /******************************************************************************
531 
532 		      Device Boot-time Settings Routines
533 
534 *******************************************************************************/
535 
536 /* Boot time configuration table */
537 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
538 
539 /**
540  *	netdev_boot_setup_add	- add new setup entry
541  *	@name: name of the device
542  *	@map: configured settings for the device
543  *
544  *	Adds a new setup entry to the dev_boot_setup list.  The function
545  *	returns 0 on error and 1 on success.  This is a generic routine for
546  *	all netdevices.
547  */
548 static int netdev_boot_setup_add(char *name, struct ifmap *map)
549 {
550 	struct netdev_boot_setup *s;
551 	int i;
552 
553 	s = dev_boot_setup;
554 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
555 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
556 			memset(s[i].name, 0, sizeof(s[i].name));
557 			strlcpy(s[i].name, name, IFNAMSIZ);
558 			memcpy(&s[i].map, map, sizeof(s[i].map));
559 			break;
560 		}
561 	}
562 
563 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
564 }
565 
566 /**
567  *	netdev_boot_setup_check	- check boot time settings
568  *	@dev: the netdevice
569  *
570  * 	Check boot time settings for the device.
571  *	Any settings found are applied to the device so that they can be
572  *	used later during device probing.
573  *	Returns 1 if settings were found, 0 otherwise.
574  */
575 int netdev_boot_setup_check(struct net_device *dev)
576 {
577 	struct netdev_boot_setup *s = dev_boot_setup;
578 	int i;
579 
580 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
581 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
582 		    !strcmp(dev->name, s[i].name)) {
583 			dev->irq 	= s[i].map.irq;
584 			dev->base_addr 	= s[i].map.base_addr;
585 			dev->mem_start 	= s[i].map.mem_start;
586 			dev->mem_end 	= s[i].map.mem_end;
587 			return 1;
588 		}
589 	}
590 	return 0;
591 }
592 EXPORT_SYMBOL(netdev_boot_setup_check);
593 
594 
595 /**
596  *	netdev_boot_base	- get address from boot time settings
597  *	@prefix: prefix for network device
598  *	@unit: id for network device
599  *
600  * 	Check boot time settings for the base address of the device.
601  *	The base address recorded at boot is returned so that it can be
602  *	used later during device probing.
603  *	Returns 0 if no settings are found.
604  */
605 unsigned long netdev_boot_base(const char *prefix, int unit)
606 {
607 	const struct netdev_boot_setup *s = dev_boot_setup;
608 	char name[IFNAMSIZ];
609 	int i;
610 
611 	sprintf(name, "%s%d", prefix, unit);
612 
613 	/*
614 	 * If device already registered then return base of 1
615 	 * to indicate not to probe for this interface
616 	 */
617 	if (__dev_get_by_name(&init_net, name))
618 		return 1;
619 
620 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
621 		if (!strcmp(name, s[i].name))
622 			return s[i].map.base_addr;
623 	return 0;
624 }
625 
626 /*
627  * Saves at boot time configured settings for any netdevice.
628  */
629 int __init netdev_boot_setup(char *str)
630 {
631 	int ints[5];
632 	struct ifmap map;
633 
634 	str = get_options(str, ARRAY_SIZE(ints), ints);
635 	if (!str || !*str)
636 		return 0;
637 
638 	/* Save settings */
639 	memset(&map, 0, sizeof(map));
640 	if (ints[0] > 0)
641 		map.irq = ints[1];
642 	if (ints[0] > 1)
643 		map.base_addr = ints[2];
644 	if (ints[0] > 2)
645 		map.mem_start = ints[3];
646 	if (ints[0] > 3)
647 		map.mem_end = ints[4];
648 
649 	/* Add new entry to the list */
650 	return netdev_boot_setup_add(str, &map);
651 }
652 
653 __setup("netdev=", netdev_boot_setup);
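/*
 * Example of the command line option parsed above (values are illustrative):
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * records irq=9 and base_addr=0x300 for "eth0" in dev_boot_setup, so that
 * netdev_boot_setup_check() can apply them when the device is probed.
 */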
654 
655 /*******************************************************************************
656 
657 			    Device Interface Subroutines
658 
659 *******************************************************************************/
660 
661 /**
662  *	__dev_get_by_name	- find a device by its name
663  *	@net: the applicable net namespace
664  *	@name: name to find
665  *
666  *	Find an interface by name. Must be called under RTNL semaphore
667  *	or @dev_base_lock. If the name is found a pointer to the device
668  *	is returned. If the name is not found then %NULL is returned. The
669  *	reference counters are not incremented so the caller must be
670  *	careful with locks.
671  */
672 
673 struct net_device *__dev_get_by_name(struct net *net, const char *name)
674 {
675 	struct net_device *dev;
676 	struct hlist_head *head = dev_name_hash(net, name);
677 
678 	hlist_for_each_entry(dev, head, name_hlist)
679 		if (!strncmp(dev->name, name, IFNAMSIZ))
680 			return dev;
681 
682 	return NULL;
683 }
684 EXPORT_SYMBOL(__dev_get_by_name);
685 
686 /**
687  *	dev_get_by_name_rcu	- find a device by its name
688  *	@net: the applicable net namespace
689  *	@name: name to find
690  *
691  *	Find an interface by name.
692  *	If the name is found a pointer to the device is returned.
693  * 	If the name is not found then %NULL is returned.
694  *	The reference counters are not incremented so the caller must be
695  *	careful with locks. The caller must hold RCU lock.
696  */
697 
698 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
699 {
700 	struct net_device *dev;
701 	struct hlist_head *head = dev_name_hash(net, name);
702 
703 	hlist_for_each_entry_rcu(dev, head, name_hlist)
704 		if (!strncmp(dev->name, name, IFNAMSIZ))
705 			return dev;
706 
707 	return NULL;
708 }
709 EXPORT_SYMBOL(dev_get_by_name_rcu);
710 
711 /**
712  *	dev_get_by_name		- find a device by its name
713  *	@net: the applicable net namespace
714  *	@name: name to find
715  *
716  *	Find an interface by name. This can be called from any
717  *	context and does its own locking. The returned handle has
718  *	the usage count incremented and the caller must use dev_put() to
719  *	release it when it is no longer needed. %NULL is returned if no
720  *	matching device is found.
721  */
722 
723 struct net_device *dev_get_by_name(struct net *net, const char *name)
724 {
725 	struct net_device *dev;
726 
727 	rcu_read_lock();
728 	dev = dev_get_by_name_rcu(net, name);
729 	if (dev)
730 		dev_hold(dev);
731 	rcu_read_unlock();
732 	return dev;
733 }
734 EXPORT_SYMBOL(dev_get_by_name);
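/*
 * Illustrative usage sketch (the device name is hypothetical): the
 * reference taken by dev_get_by_name() must be dropped with dev_put().
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		// ... use dev ...
 *		dev_put(dev);
 *	}
 */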
735 
736 /**
737  *	__dev_get_by_index - find a device by its ifindex
738  *	@net: the applicable net namespace
739  *	@ifindex: index of device
740  *
741  *	Search for an interface by index. Returns a pointer to the device,
742  *	or %NULL if the device is not found. The device has not
743  *	had its reference counter increased so the caller must be careful
744  *	about locking. The caller must hold either the RTNL semaphore
745  *	or @dev_base_lock.
746  */
747 
748 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
749 {
750 	struct net_device *dev;
751 	struct hlist_head *head = dev_index_hash(net, ifindex);
752 
753 	hlist_for_each_entry(dev, head, index_hlist)
754 		if (dev->ifindex == ifindex)
755 			return dev;
756 
757 	return NULL;
758 }
759 EXPORT_SYMBOL(__dev_get_by_index);
760 
761 /**
762  *	dev_get_by_index_rcu - find a device by its ifindex
763  *	@net: the applicable net namespace
764  *	@ifindex: index of device
765  *
766  *	Search for an interface by index. Returns a pointer to the device,
767  *	or %NULL if the device is not found. The device has not
768  *	had its reference counter increased so the caller must be careful
769  *	about locking. The caller must hold RCU lock.
770  */
771 
772 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
773 {
774 	struct net_device *dev;
775 	struct hlist_head *head = dev_index_hash(net, ifindex);
776 
777 	hlist_for_each_entry_rcu(dev, head, index_hlist)
778 		if (dev->ifindex == ifindex)
779 			return dev;
780 
781 	return NULL;
782 }
783 EXPORT_SYMBOL(dev_get_by_index_rcu);
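/*
 * Illustrative sketch of the RCU variant above: the lookup and any use of
 * the result must stay inside a single RCU read-side section, unless the
 * caller takes its own reference with dev_hold().
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_index_rcu(net, ifindex);
 *	if (dev)
 *		netdev_info(dev, "found ifindex %d\n", ifindex);
 *	rcu_read_unlock();
 */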
784 
785 
786 /**
787  *	dev_get_by_index - find a device by its ifindex
788  *	@net: the applicable net namespace
789  *	@ifindex: index of device
790  *
791  *	Search for an interface by index. Returns a pointer to the device,
792  *	or %NULL if the device is not found. The device returned has
793  *	had a reference added and the pointer is safe until the user calls
794  *	dev_put() to indicate they have finished with it.
795  */
796 
797 struct net_device *dev_get_by_index(struct net *net, int ifindex)
798 {
799 	struct net_device *dev;
800 
801 	rcu_read_lock();
802 	dev = dev_get_by_index_rcu(net, ifindex);
803 	if (dev)
804 		dev_hold(dev);
805 	rcu_read_unlock();
806 	return dev;
807 }
808 EXPORT_SYMBOL(dev_get_by_index);
809 
810 /**
811  *	netdev_get_name - get a netdevice name, knowing its ifindex.
812  *	@net: network namespace
813  *	@name: a pointer to the buffer where the name will be stored.
814  *	@ifindex: the ifindex of the interface to get the name from.
815  *
816  *	The use of raw_seqcount_begin() and cond_resched() before
817  *	retrying is required as we want to give the writers a chance
818  *	to complete when CONFIG_PREEMPT is not set.
819  */
820 int netdev_get_name(struct net *net, char *name, int ifindex)
821 {
822 	struct net_device *dev;
823 	unsigned int seq;
824 
825 retry:
826 	seq = raw_seqcount_begin(&devnet_rename_seq);
827 	rcu_read_lock();
828 	dev = dev_get_by_index_rcu(net, ifindex);
829 	if (!dev) {
830 		rcu_read_unlock();
831 		return -ENODEV;
832 	}
833 
834 	strcpy(name, dev->name);
835 	rcu_read_unlock();
836 	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
837 		cond_resched();
838 		goto retry;
839 	}
840 
841 	return 0;
842 }
843 
844 /**
845  *	dev_getbyhwaddr_rcu - find a device by its hardware address
846  *	@net: the applicable net namespace
847  *	@type: media type of device
848  *	@ha: hardware address
849  *
850  *	Search for an interface by MAC address. Returns a pointer to the
851  *	device, or %NULL if the device is not found.
852  *	The caller must hold RCU or RTNL.
853  *	The returned device has not had its ref count increased
854  *	and the caller must therefore be careful about locking.
855  *
856  */
857 
858 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
859 				       const char *ha)
860 {
861 	struct net_device *dev;
862 
863 	for_each_netdev_rcu(net, dev)
864 		if (dev->type == type &&
865 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
866 			return dev;
867 
868 	return NULL;
869 }
870 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
871 
872 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
873 {
874 	struct net_device *dev;
875 
876 	ASSERT_RTNL();
877 	for_each_netdev(net, dev)
878 		if (dev->type == type)
879 			return dev;
880 
881 	return NULL;
882 }
883 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
884 
885 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
886 {
887 	struct net_device *dev, *ret = NULL;
888 
889 	rcu_read_lock();
890 	for_each_netdev_rcu(net, dev)
891 		if (dev->type == type) {
892 			dev_hold(dev);
893 			ret = dev;
894 			break;
895 		}
896 	rcu_read_unlock();
897 	return ret;
898 }
899 EXPORT_SYMBOL(dev_getfirstbyhwtype);
900 
901 /**
902  *	__dev_get_by_flags - find any device with given flags
903  *	@net: the applicable net namespace
904  *	@if_flags: IFF_* values
905  *	@mask: bitmask of bits in if_flags to check
906  *
907  *	Search for any interface with the given flags. Returns a pointer to
908  *	the first matching device, or %NULL if none is found. Must be called
909  *	inside rtnl_lock(), and the result's refcount is unchanged.
910  */
911 
912 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
913 				      unsigned short mask)
914 {
915 	struct net_device *dev, *ret;
916 
917 	ASSERT_RTNL();
918 
919 	ret = NULL;
920 	for_each_netdev(net, dev) {
921 		if (((dev->flags ^ if_flags) & mask) == 0) {
922 			ret = dev;
923 			break;
924 		}
925 	}
926 	return ret;
927 }
928 EXPORT_SYMBOL(__dev_get_by_flags);
929 
930 /**
931  *	dev_valid_name - check if name is okay for network device
932  *	@name: name string
933  *
934  *	Network device names need to be valid file names
935  *	to allow sysfs to work.  We also disallow any kind of
936  *	whitespace.
937  */
938 bool dev_valid_name(const char *name)
939 {
940 	if (*name == '\0')
941 		return false;
942 	if (strlen(name) >= IFNAMSIZ)
943 		return false;
944 	if (!strcmp(name, ".") || !strcmp(name, ".."))
945 		return false;
946 
947 	while (*name) {
948 		if (*name == '/' || isspace(*name))
949 			return false;
950 		name++;
951 	}
952 	return true;
953 }
954 EXPORT_SYMBOL(dev_valid_name);
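/*
 * Examples: "eth0" and "br-lan" are accepted; "" (empty), ".", "..",
 * anything containing '/' or whitespace, and names of IFNAMSIZ characters
 * or more are rejected.
 */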
955 
956 /**
957  *	__dev_alloc_name - allocate a name for a device
958  *	@net: network namespace to allocate the device name in
959  *	@name: name format string
960  *	@buf:  scratch buffer and result name string
961  *
962  *	Passed a format string - eg "lt%d" - it will try to find a suitable
963  *	id. It scans the list of devices to build up a free map, then chooses
964  *	the first empty slot. The caller must hold the dev_base or rtnl lock
965  *	while allocating the name and adding the device in order to avoid
966  *	duplicates.
967  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
968  *	Returns the number of the unit assigned or a negative errno code.
969  */
970 
971 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
972 {
973 	int i = 0;
974 	const char *p;
975 	const int max_netdevices = 8*PAGE_SIZE;
976 	unsigned long *inuse;
977 	struct net_device *d;
978 
979 	p = strnchr(name, IFNAMSIZ-1, '%');
980 	if (p) {
981 		/*
982 		 * Verify the string as this thing may have come from
983 		 * the user.  There must be either one "%d" and no other "%"
984 		 * characters.
985 		 */
986 		if (p[1] != 'd' || strchr(p + 2, '%'))
987 			return -EINVAL;
988 
989 		/* Use one page as a bit array of possible slots */
990 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
991 		if (!inuse)
992 			return -ENOMEM;
993 
994 		for_each_netdev(net, d) {
995 			if (!sscanf(d->name, name, &i))
996 				continue;
997 			if (i < 0 || i >= max_netdevices)
998 				continue;
999 
1000 			/*  avoid cases where sscanf is not exact inverse of printf */
1001 			snprintf(buf, IFNAMSIZ, name, i);
1002 			if (!strncmp(buf, d->name, IFNAMSIZ))
1003 				set_bit(i, inuse);
1004 		}
1005 
1006 		i = find_first_zero_bit(inuse, max_netdevices);
1007 		free_page((unsigned long) inuse);
1008 	}
1009 
1010 	if (buf != name)
1011 		snprintf(buf, IFNAMSIZ, name, i);
1012 	if (!__dev_get_by_name(net, buf))
1013 		return i;
1014 
1015 	/* It is possible to run out of possible slots
1016 	 * when the name is long and there isn't enough space left
1017 	 * for the digits, or if all bits are used.
1018 	 */
1019 	return -ENFILE;
1020 }
1021 
1022 /**
1023  *	dev_alloc_name - allocate a name for a device
1024  *	@dev: device
1025  *	@name: name format string
1026  *
1027  *	Passed a format string - eg "lt%d" - it will try to find a suitable
1028  *	id. It scans the list of devices to build up a free map, then chooses
1029  *	the first empty slot. The caller must hold the dev_base or rtnl lock
1030  *	while allocating the name and adding the device in order to avoid
1031  *	duplicates.
1032  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1033  *	Returns the number of the unit assigned or a negative errno code.
1034  */
1035 
1036 int dev_alloc_name(struct net_device *dev, const char *name)
1037 {
1038 	char buf[IFNAMSIZ];
1039 	struct net *net;
1040 	int ret;
1041 
1042 	BUG_ON(!dev_net(dev));
1043 	net = dev_net(dev);
1044 	ret = __dev_alloc_name(net, name, buf);
1045 	if (ret >= 0)
1046 		strlcpy(dev->name, buf, IFNAMSIZ);
1047 	return ret;
1048 }
1049 EXPORT_SYMBOL(dev_alloc_name);
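/*
 * Illustrative sketch: a driver wanting an automatically numbered name
 * passes a format string and checks the result (the error label is
 * hypothetical).
 *
 *	err = dev_alloc_name(dev, "dummy%d");
 *	if (err < 0)
 *		goto err_out;
 *
 * On success dev->name holds the first free name matching the format
 * (dummy0, dummy1, ...) and err is the unit number that was assigned.
 */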
1050 
1051 static int dev_alloc_name_ns(struct net *net,
1052 			     struct net_device *dev,
1053 			     const char *name)
1054 {
1055 	char buf[IFNAMSIZ];
1056 	int ret;
1057 
1058 	ret = __dev_alloc_name(net, name, buf);
1059 	if (ret >= 0)
1060 		strlcpy(dev->name, buf, IFNAMSIZ);
1061 	return ret;
1062 }
1063 
1064 static int dev_get_valid_name(struct net *net,
1065 			      struct net_device *dev,
1066 			      const char *name)
1067 {
1068 	BUG_ON(!net);
1069 
1070 	if (!dev_valid_name(name))
1071 		return -EINVAL;
1072 
1073 	if (strchr(name, '%'))
1074 		return dev_alloc_name_ns(net, dev, name);
1075 	else if (__dev_get_by_name(net, name))
1076 		return -EEXIST;
1077 	else if (dev->name != name)
1078 		strlcpy(dev->name, name, IFNAMSIZ);
1079 
1080 	return 0;
1081 }
1082 
1083 /**
1084  *	dev_change_name - change name of a device
1085  *	@dev: device
1086  *	@newname: name (or format string) must be at least IFNAMSIZ
1087  *
1088  *	Change the name of a device. A format string such as "eth%d"
1089  *	can be passed for wildcarding.
1090  */
1091 int dev_change_name(struct net_device *dev, const char *newname)
1092 {
1093 	unsigned char old_assign_type;
1094 	char oldname[IFNAMSIZ];
1095 	int err = 0;
1096 	int ret;
1097 	struct net *net;
1098 
1099 	ASSERT_RTNL();
1100 	BUG_ON(!dev_net(dev));
1101 
1102 	net = dev_net(dev);
1103 	if (dev->flags & IFF_UP)
1104 		return -EBUSY;
1105 
1106 	write_seqcount_begin(&devnet_rename_seq);
1107 
1108 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1109 		write_seqcount_end(&devnet_rename_seq);
1110 		return 0;
1111 	}
1112 
1113 	memcpy(oldname, dev->name, IFNAMSIZ);
1114 
1115 	err = dev_get_valid_name(net, dev, newname);
1116 	if (err < 0) {
1117 		write_seqcount_end(&devnet_rename_seq);
1118 		return err;
1119 	}
1120 
1121 	if (oldname[0] && !strchr(oldname, '%'))
1122 		netdev_info(dev, "renamed from %s\n", oldname);
1123 
1124 	old_assign_type = dev->name_assign_type;
1125 	dev->name_assign_type = NET_NAME_RENAMED;
1126 
1127 rollback:
1128 	ret = device_rename(&dev->dev, dev->name);
1129 	if (ret) {
1130 		memcpy(dev->name, oldname, IFNAMSIZ);
1131 		dev->name_assign_type = old_assign_type;
1132 		write_seqcount_end(&devnet_rename_seq);
1133 		return ret;
1134 	}
1135 
1136 	write_seqcount_end(&devnet_rename_seq);
1137 
1138 	netdev_adjacent_rename_links(dev, oldname);
1139 
1140 	write_lock_bh(&dev_base_lock);
1141 	hlist_del_rcu(&dev->name_hlist);
1142 	write_unlock_bh(&dev_base_lock);
1143 
1144 	synchronize_rcu();
1145 
1146 	write_lock_bh(&dev_base_lock);
1147 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1148 	write_unlock_bh(&dev_base_lock);
1149 
1150 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1151 	ret = notifier_to_errno(ret);
1152 
1153 	if (ret) {
1154 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1155 		if (err >= 0) {
1156 			err = ret;
1157 			write_seqcount_begin(&devnet_rename_seq);
1158 			memcpy(dev->name, oldname, IFNAMSIZ);
1159 			memcpy(oldname, newname, IFNAMSIZ);
1160 			dev->name_assign_type = old_assign_type;
1161 			old_assign_type = NET_NAME_RENAMED;
1162 			goto rollback;
1163 		} else {
1164 			pr_err("%s: name change rollback failed: %d\n",
1165 			       dev->name, ret);
1166 		}
1167 	}
1168 
1169 	return err;
1170 }
1171 
1172 /**
1173  *	dev_set_alias - change ifalias of a device
1174  *	@dev: device
1175  *	@alias: name up to IFALIASZ
1176  *	@len: limit of bytes to copy from info
1177  *
1178  *	Set ifalias for a device.
1179  */
1180 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1181 {
1182 	char *new_ifalias;
1183 
1184 	ASSERT_RTNL();
1185 
1186 	if (len >= IFALIASZ)
1187 		return -EINVAL;
1188 
1189 	if (!len) {
1190 		kfree(dev->ifalias);
1191 		dev->ifalias = NULL;
1192 		return 0;
1193 	}
1194 
1195 	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1196 	if (!new_ifalias)
1197 		return -ENOMEM;
1198 	dev->ifalias = new_ifalias;
1199 
1200 	strlcpy(dev->ifalias, alias, len+1);
1201 	return len;
1202 }
1203 
1204 
1205 /**
1206  *	netdev_features_change - device changes features
1207  *	@dev: device to cause notification
1208  *
1209  *	Called to indicate a device has changed features.
1210  */
1211 void netdev_features_change(struct net_device *dev)
1212 {
1213 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1214 }
1215 EXPORT_SYMBOL(netdev_features_change);
1216 
1217 /**
1218  *	netdev_state_change - device changes state
1219  *	@dev: device to cause notification
1220  *
1221  *	Called to indicate a device has changed state. This function calls
1222  *	the notifier chains for netdev_chain and sends a NEWLINK message
1223  *	to the routing socket.
1224  */
1225 void netdev_state_change(struct net_device *dev)
1226 {
1227 	if (dev->flags & IFF_UP) {
1228 		struct netdev_notifier_change_info change_info;
1229 
1230 		change_info.flags_changed = 0;
1231 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1232 					      &change_info.info);
1233 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1234 	}
1235 }
1236 EXPORT_SYMBOL(netdev_state_change);
1237 
1238 /**
1239  * 	netdev_notify_peers - notify network peers about existence of @dev
1240  * 	@dev: network device
1241  *
1242  * Generate traffic such that interested network peers are aware of
1243  * @dev, such as by generating a gratuitous ARP. This may be used when
1244  * a device wants to inform the rest of the network about some sort of
1245  * reconfiguration such as a failover event or virtual machine
1246  * migration.
1247  */
1248 void netdev_notify_peers(struct net_device *dev)
1249 {
1250 	rtnl_lock();
1251 	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1252 	rtnl_unlock();
1253 }
1254 EXPORT_SYMBOL(netdev_notify_peers);
1255 
1256 static int __dev_open(struct net_device *dev)
1257 {
1258 	const struct net_device_ops *ops = dev->netdev_ops;
1259 	int ret;
1260 
1261 	ASSERT_RTNL();
1262 
1263 	if (!netif_device_present(dev))
1264 		return -ENODEV;
1265 
1266 	/* Block netpoll from trying to do any rx path servicing.
1267 	 * If we don't do this there is a chance ndo_poll_controller
1268 	 * or ndo_poll may be running while we open the device
1269 	 */
1270 	netpoll_poll_disable(dev);
1271 
1272 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1273 	ret = notifier_to_errno(ret);
1274 	if (ret)
1275 		return ret;
1276 
1277 	set_bit(__LINK_STATE_START, &dev->state);
1278 
1279 	if (ops->ndo_validate_addr)
1280 		ret = ops->ndo_validate_addr(dev);
1281 
1282 	if (!ret && ops->ndo_open)
1283 		ret = ops->ndo_open(dev);
1284 
1285 	netpoll_poll_enable(dev);
1286 
1287 	if (ret)
1288 		clear_bit(__LINK_STATE_START, &dev->state);
1289 	else {
1290 		dev->flags |= IFF_UP;
1291 		dev_set_rx_mode(dev);
1292 		dev_activate(dev);
1293 		add_device_randomness(dev->dev_addr, dev->addr_len);
1294 	}
1295 
1296 	return ret;
1297 }
1298 
1299 /**
1300  *	dev_open	- prepare an interface for use.
1301  *	@dev:	device to open
1302  *
1303  *	Takes a device from down to up state. The device's private open
1304  *	function is invoked and then the multicast lists are loaded. Finally
1305  *	the device is moved into the up state and a %NETDEV_UP message is
1306  *	sent to the netdev notifier chain.
1307  *
1308  *	Calling this function on an active interface is a nop. On a failure
1309  *	a negative errno code is returned.
1310  */
1311 int dev_open(struct net_device *dev)
1312 {
1313 	int ret;
1314 
1315 	if (dev->flags & IFF_UP)
1316 		return 0;
1317 
1318 	ret = __dev_open(dev);
1319 	if (ret < 0)
1320 		return ret;
1321 
1322 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1323 	call_netdevice_notifiers(NETDEV_UP, dev);
1324 
1325 	return ret;
1326 }
1327 EXPORT_SYMBOL(dev_open);
1328 
1329 static int __dev_close_many(struct list_head *head)
1330 {
1331 	struct net_device *dev;
1332 
1333 	ASSERT_RTNL();
1334 	might_sleep();
1335 
1336 	list_for_each_entry(dev, head, close_list) {
1337 		/* Temporarily disable netpoll until the interface is down */
1338 		netpoll_poll_disable(dev);
1339 
1340 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1341 
1342 		clear_bit(__LINK_STATE_START, &dev->state);
1343 
1344 		/* Synchronize to scheduled poll. We cannot touch the poll list; it
1345 		 * can even be on a different cpu. So just clear netif_running().
1346 		 *
1347 		 * dev->stop() will invoke napi_disable() on all of its
1348 		 * napi_struct instances on this device.
1349 		 */
1350 		smp_mb__after_atomic(); /* Commit netif_running(). */
1351 	}
1352 
1353 	dev_deactivate_many(head);
1354 
1355 	list_for_each_entry(dev, head, close_list) {
1356 		const struct net_device_ops *ops = dev->netdev_ops;
1357 
1358 		/*
1359 		 *	Call the device specific close. This cannot fail and is
1360 		 *	only done if the device is UP.
1361 		 *
1362 		 *	We allow it to be called even after a DETACH hot-plug
1363 		 *	event.
1364 		 */
1365 		if (ops->ndo_stop)
1366 			ops->ndo_stop(dev);
1367 
1368 		dev->flags &= ~IFF_UP;
1369 		netpoll_poll_enable(dev);
1370 	}
1371 
1372 	return 0;
1373 }
1374 
1375 static int __dev_close(struct net_device *dev)
1376 {
1377 	int retval;
1378 	LIST_HEAD(single);
1379 
1380 	list_add(&dev->close_list, &single);
1381 	retval = __dev_close_many(&single);
1382 	list_del(&single);
1383 
1384 	return retval;
1385 }
1386 
1387 static int dev_close_many(struct list_head *head)
1388 {
1389 	struct net_device *dev, *tmp;
1390 
1391 	/* Remove the devices that don't need to be closed */
1392 	list_for_each_entry_safe(dev, tmp, head, close_list)
1393 		if (!(dev->flags & IFF_UP))
1394 			list_del_init(&dev->close_list);
1395 
1396 	__dev_close_many(head);
1397 
1398 	list_for_each_entry_safe(dev, tmp, head, close_list) {
1399 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1400 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1401 		list_del_init(&dev->close_list);
1402 	}
1403 
1404 	return 0;
1405 }
1406 
1407 /**
1408  *	dev_close - shutdown an interface.
1409  *	@dev: device to shutdown
1410  *
1411  *	This function moves an active device into down state. A
1412  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1413  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1414  *	chain.
1415  */
1416 int dev_close(struct net_device *dev)
1417 {
1418 	if (dev->flags & IFF_UP) {
1419 		LIST_HEAD(single);
1420 
1421 		list_add(&dev->close_list, &single);
1422 		dev_close_many(&single);
1423 		list_del(&single);
1424 	}
1425 	return 0;
1426 }
1427 EXPORT_SYMBOL(dev_close);
1428 
1429 
1430 /**
1431  *	dev_disable_lro - disable Large Receive Offload on a device
1432  *	@dev: device
1433  *
1434  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1435  *	called under RTNL.  This is needed if received packets may be
1436  *	forwarded to another interface.
1437  */
1438 void dev_disable_lro(struct net_device *dev)
1439 {
1440 	struct net_device *lower_dev;
1441 	struct list_head *iter;
1442 
1443 	dev->wanted_features &= ~NETIF_F_LRO;
1444 	netdev_update_features(dev);
1445 
1446 	if (unlikely(dev->features & NETIF_F_LRO))
1447 		netdev_WARN(dev, "failed to disable LRO!\n");
1448 
1449 	netdev_for_each_lower_dev(dev, lower_dev, iter)
1450 		dev_disable_lro(lower_dev);
1451 }
1452 EXPORT_SYMBOL(dev_disable_lro);
1453 
1454 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1455 				   struct net_device *dev)
1456 {
1457 	struct netdev_notifier_info info;
1458 
1459 	netdev_notifier_info_init(&info, dev);
1460 	return nb->notifier_call(nb, val, &info);
1461 }
1462 
1463 static int dev_boot_phase = 1;
1464 
1465 /**
1466  *	register_netdevice_notifier - register a network notifier block
1467  *	@nb: notifier
1468  *
1469  *	Register a notifier to be called when network device events occur.
1470  *	The notifier passed is linked into the kernel structures and must
1471  *	not be reused until it has been unregistered. A negative errno code
1472  *	is returned on a failure.
1473  *
1474  * 	When registered, all registration and up events are replayed
1475  *	to the new notifier to allow it to have a race-free
1476  *	view of the network device list.
1477  */
1478 
1479 int register_netdevice_notifier(struct notifier_block *nb)
1480 {
1481 	struct net_device *dev;
1482 	struct net_device *last;
1483 	struct net *net;
1484 	int err;
1485 
1486 	rtnl_lock();
1487 	err = raw_notifier_chain_register(&netdev_chain, nb);
1488 	if (err)
1489 		goto unlock;
1490 	if (dev_boot_phase)
1491 		goto unlock;
1492 	for_each_net(net) {
1493 		for_each_netdev(net, dev) {
1494 			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1495 			err = notifier_to_errno(err);
1496 			if (err)
1497 				goto rollback;
1498 
1499 			if (!(dev->flags & IFF_UP))
1500 				continue;
1501 
1502 			call_netdevice_notifier(nb, NETDEV_UP, dev);
1503 		}
1504 	}
1505 
1506 unlock:
1507 	rtnl_unlock();
1508 	return err;
1509 
1510 rollback:
1511 	last = dev;
1512 	for_each_net(net) {
1513 		for_each_netdev(net, dev) {
1514 			if (dev == last)
1515 				goto outroll;
1516 
1517 			if (dev->flags & IFF_UP) {
1518 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1519 							dev);
1520 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1521 			}
1522 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1523 		}
1524 	}
1525 
1526 outroll:
1527 	raw_notifier_chain_unregister(&netdev_chain, nb);
1528 	goto unlock;
1529 }
1530 EXPORT_SYMBOL(register_netdevice_notifier);
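/*
 * Illustrative sketch of a notifier user (names are hypothetical); the
 * callback recovers the device with netdev_notifier_info_to_dev().
 * Registration would typically happen from module init, unregistration
 * from module exit.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			netdev_info(dev, "is up\n");
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);
 *	...
 *	unregister_netdevice_notifier(&my_netdev_nb);
 */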
1531 
1532 /**
1533  *	unregister_netdevice_notifier - unregister a network notifier block
1534  *	@nb: notifier
1535  *
1536  *	Unregister a notifier previously registered by
1537  *	register_netdevice_notifier(). The notifier is unlinked from the
1538  *	kernel structures and may then be reused. A negative errno code
1539  *	is returned on a failure.
1540  *
1541  * 	After unregistering, unregister and down device events are synthesized
1542  *	for all devices on the device list and sent to the removed notifier,
1543  *	removing the need for special case cleanup code.
1544  */
1545 
1546 int unregister_netdevice_notifier(struct notifier_block *nb)
1547 {
1548 	struct net_device *dev;
1549 	struct net *net;
1550 	int err;
1551 
1552 	rtnl_lock();
1553 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1554 	if (err)
1555 		goto unlock;
1556 
1557 	for_each_net(net) {
1558 		for_each_netdev(net, dev) {
1559 			if (dev->flags & IFF_UP) {
1560 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1561 							dev);
1562 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1563 			}
1564 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1565 		}
1566 	}
1567 unlock:
1568 	rtnl_unlock();
1569 	return err;
1570 }
1571 EXPORT_SYMBOL(unregister_netdevice_notifier);
1572 
1573 /**
1574  *	call_netdevice_notifiers_info - call all network notifier blocks
1575  *	@val: value passed unmodified to notifier function
1576  *	@dev: net_device pointer passed unmodified to notifier function
1577  *	@info: notifier information data
1578  *
1579  *	Call all network notifier blocks.  Parameters and return value
1580  *	are as for raw_notifier_call_chain().
1581  */
1582 
1583 static int call_netdevice_notifiers_info(unsigned long val,
1584 					 struct net_device *dev,
1585 					 struct netdev_notifier_info *info)
1586 {
1587 	ASSERT_RTNL();
1588 	netdev_notifier_info_init(info, dev);
1589 	return raw_notifier_call_chain(&netdev_chain, val, info);
1590 }
1591 
1592 /**
1593  *	call_netdevice_notifiers - call all network notifier blocks
1594  *      @val: value passed unmodified to notifier function
1595  *      @dev: net_device pointer passed unmodified to notifier function
1596  *
1597  *	Call all network notifier blocks.  Parameters and return value
1598  *	are as for raw_notifier_call_chain().
1599  */
1600 
1601 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1602 {
1603 	struct netdev_notifier_info info;
1604 
1605 	return call_netdevice_notifiers_info(val, dev, &info);
1606 }
1607 EXPORT_SYMBOL(call_netdevice_notifiers);
1608 
1609 static struct static_key netstamp_needed __read_mostly;
1610 #ifdef HAVE_JUMP_LABEL
1611 /* We are not allowed to call static_key_slow_dec() from irq context
1612  * If net_disable_timestamp() is called from irq context, defer the
1613  * static_key_slow_dec() calls.
1614  */
1615 static atomic_t netstamp_needed_deferred;
1616 #endif
1617 
1618 void net_enable_timestamp(void)
1619 {
1620 #ifdef HAVE_JUMP_LABEL
1621 	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1622 
1623 	if (deferred) {
1624 		while (--deferred)
1625 			static_key_slow_dec(&netstamp_needed);
1626 		return;
1627 	}
1628 #endif
1629 	static_key_slow_inc(&netstamp_needed);
1630 }
1631 EXPORT_SYMBOL(net_enable_timestamp);
1632 
1633 void net_disable_timestamp(void)
1634 {
1635 #ifdef HAVE_JUMP_LABEL
1636 	if (in_interrupt()) {
1637 		atomic_inc(&netstamp_needed_deferred);
1638 		return;
1639 	}
1640 #endif
1641 	static_key_slow_dec(&netstamp_needed);
1642 }
1643 EXPORT_SYMBOL(net_disable_timestamp);
1644 
1645 static inline void net_timestamp_set(struct sk_buff *skb)
1646 {
1647 	skb->tstamp.tv64 = 0;
1648 	if (static_key_false(&netstamp_needed))
1649 		__net_timestamp(skb);
1650 }
1651 
1652 #define net_timestamp_check(COND, SKB)			\
1653 	if (static_key_false(&netstamp_needed)) {		\
1654 		if ((COND) && !(SKB)->tstamp.tv64)	\
1655 			__net_timestamp(SKB);		\
1656 	}						\
1657 
1658 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1659 {
1660 	unsigned int len;
1661 
1662 	if (!(dev->flags & IFF_UP))
1663 		return false;
1664 
1665 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1666 	if (skb->len <= len)
1667 		return true;
1668 
1669 	/* if TSO is enabled, we don't care about the length as the packet
1670 	 * could be forwarded without having been segmented first
1671 	 */
1672 	if (skb_is_gso(skb))
1673 		return true;
1674 
1675 	return false;
1676 }
1677 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1678 
1679 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1680 {
1681 	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1682 		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1683 			atomic_long_inc(&dev->rx_dropped);
1684 			kfree_skb(skb);
1685 			return NET_RX_DROP;
1686 		}
1687 	}
1688 
1689 	if (unlikely(!is_skb_forwardable(dev, skb))) {
1690 		atomic_long_inc(&dev->rx_dropped);
1691 		kfree_skb(skb);
1692 		return NET_RX_DROP;
1693 	}
1694 
1695 	skb_scrub_packet(skb, true);
1696 	skb->protocol = eth_type_trans(skb, dev);
1697 
1698 	return 0;
1699 }
1700 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1701 
1702 /**
1703  * dev_forward_skb - loopback an skb to another netif
1704  *
1705  * @dev: destination network device
1706  * @skb: buffer to forward
1707  *
1708  * return values:
1709  *	NET_RX_SUCCESS	(no congestion)
1710  *	NET_RX_DROP     (packet was dropped, but freed)
1711  *
1712  * dev_forward_skb can be used for injecting an skb from the
1713  * start_xmit function of one device into the receive queue
1714  * of another device.
1715  *
1716  * The receiving device may be in another namespace, so
1717  * we have to clear all information in the skb that could
1718  * impact namespace isolation.
1719  */
1720 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1721 {
1722 	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1723 }
1724 EXPORT_SYMBOL_GPL(dev_forward_skb);
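/*
 * Illustrative sketch (the peer pointer is hypothetical): a pair-style
 * virtual driver can hand a transmitted skb straight to its peer's
 * receive path from its ndo_start_xmit implementation.
 *
 *	if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS) {
 *		// account the packet as delivered to the peer
 *	} else {
 *		// dropped; the skb has already been freed for us
 *	}
 */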
1725 
1726 static inline int deliver_skb(struct sk_buff *skb,
1727 			      struct packet_type *pt_prev,
1728 			      struct net_device *orig_dev)
1729 {
1730 	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1731 		return -ENOMEM;
1732 	atomic_inc(&skb->users);
1733 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1734 }
1735 
1736 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1737 {
1738 	if (!ptype->af_packet_priv || !skb->sk)
1739 		return false;
1740 
1741 	if (ptype->id_match)
1742 		return ptype->id_match(ptype, skb->sk);
1743 	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1744 		return true;
1745 
1746 	return false;
1747 }
1748 
1749 /*
1750  *	Support routine. Sends outgoing frames to any network
1751  *	taps currently in use.
1752  */
1753 
1754 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1755 {
1756 	struct packet_type *ptype;
1757 	struct sk_buff *skb2 = NULL;
1758 	struct packet_type *pt_prev = NULL;
1759 
1760 	rcu_read_lock();
1761 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1762 		/* Never send packets back to the socket
1763 		 * they originated from - MvS (miquels@drinkel.ow.org)
1764 		 */
1765 		if ((ptype->dev == dev || !ptype->dev) &&
1766 		    (!skb_loop_sk(ptype, skb))) {
1767 			if (pt_prev) {
1768 				deliver_skb(skb2, pt_prev, skb->dev);
1769 				pt_prev = ptype;
1770 				continue;
1771 			}
1772 
1773 			skb2 = skb_clone(skb, GFP_ATOMIC);
1774 			if (!skb2)
1775 				break;
1776 
1777 			net_timestamp_set(skb2);
1778 
1779 			/* skb->nh should be correctly
1780 			   set by sender, so that the second statement is
1781 			   just protection against buggy protocols.
1782 			 */
1783 			skb_reset_mac_header(skb2);
1784 
1785 			if (skb_network_header(skb2) < skb2->data ||
1786 			    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1787 				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1788 						     ntohs(skb2->protocol),
1789 						     dev->name);
1790 				skb_reset_network_header(skb2);
1791 			}
1792 
1793 			skb2->transport_header = skb2->network_header;
1794 			skb2->pkt_type = PACKET_OUTGOING;
1795 			pt_prev = ptype;
1796 		}
1797 	}
1798 	if (pt_prev)
1799 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1800 	rcu_read_unlock();
1801 }
1802 
1803 /**
1804  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1805  * @dev: Network device
1806  * @txq: number of queues available
1807  *
1808  * If real_num_tx_queues is changed the tc mappings may no longer be
1809  * valid. To resolve this, verify that each tc mapping remains valid and,
1810  * if not, zero the mapping. With no priorities mapping to this
1811  * offset/count pair it will no longer be used. In the worst case, if TC0
1812  * is invalid, nothing can be done, so priority mappings are disabled. It
1813  * is expected that drivers will fix this mapping if they can before
1814  * calling netif_set_real_num_tx_queues.
1815  */
1816 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1817 {
1818 	int i;
1819 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1820 
1821 	/* If TC0 is invalidated disable TC mapping */
1822 	if (tc->offset + tc->count > txq) {
1823 		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1824 		dev->num_tc = 0;
1825 		return;
1826 	}
1827 
1828 	/* Reset any invalidated prio-to-tc mappings to TC0 */
1829 	for (i = 1; i < TC_BITMASK + 1; i++) {
1830 		int q = netdev_get_prio_tc_map(dev, i);
1831 
1832 		tc = &dev->tc_to_txq[q];
1833 		if (tc->offset + tc->count > txq) {
1834 			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1835 				i, q);
1836 			netdev_set_prio_tc_map(dev, i, 0);
1837 		}
1838 	}
1839 }
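/*
 * Example (sketch, not from any particular driver): how a multiqueue driver
 * might program two traffic classes over eight TX queues before changing
 * real_num_tx_queues, so the mappings checked above stay valid.  The 4 + 4
 * queue split and the priority threshold are made up for illustration.
 *
 *	static void example_setup_tc(struct net_device *dev)
 *	{
 *		int prio;
 *
 *		netdev_set_num_tc(dev, 2);
 *		netdev_set_tc_queue(dev, 0, 4, 0);
 *		netdev_set_tc_queue(dev, 1, 4, 4);
 *		for (prio = 0; prio <= TC_BITMASK; prio++)
 *			netdev_set_prio_tc_map(dev, prio, prio < 4 ? 0 : 1);
 *	}
 */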
1840 
1841 #ifdef CONFIG_XPS
1842 static DEFINE_MUTEX(xps_map_mutex);
1843 #define xmap_dereference(P)		\
1844 	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1845 
1846 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1847 					int cpu, u16 index)
1848 {
1849 	struct xps_map *map = NULL;
1850 	int pos;
1851 
1852 	if (dev_maps)
1853 		map = xmap_dereference(dev_maps->cpu_map[cpu]);
1854 
1855 	for (pos = 0; map && pos < map->len; pos++) {
1856 		if (map->queues[pos] == index) {
1857 			if (map->len > 1) {
1858 				map->queues[pos] = map->queues[--map->len];
1859 			} else {
1860 				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1861 				kfree_rcu(map, rcu);
1862 				map = NULL;
1863 			}
1864 			break;
1865 		}
1866 	}
1867 
1868 	return map;
1869 }
1870 
1871 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1872 {
1873 	struct xps_dev_maps *dev_maps;
1874 	int cpu, i;
1875 	bool active = false;
1876 
1877 	mutex_lock(&xps_map_mutex);
1878 	dev_maps = xmap_dereference(dev->xps_maps);
1879 
1880 	if (!dev_maps)
1881 		goto out_no_maps;
1882 
1883 	for_each_possible_cpu(cpu) {
1884 		for (i = index; i < dev->num_tx_queues; i++) {
1885 			if (!remove_xps_queue(dev_maps, cpu, i))
1886 				break;
1887 		}
1888 		if (i == dev->num_tx_queues)
1889 			active = true;
1890 	}
1891 
1892 	if (!active) {
1893 		RCU_INIT_POINTER(dev->xps_maps, NULL);
1894 		kfree_rcu(dev_maps, rcu);
1895 	}
1896 
1897 	for (i = index; i < dev->num_tx_queues; i++)
1898 		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1899 					     NUMA_NO_NODE);
1900 
1901 out_no_maps:
1902 	mutex_unlock(&xps_map_mutex);
1903 }
1904 
1905 static struct xps_map *expand_xps_map(struct xps_map *map,
1906 				      int cpu, u16 index)
1907 {
1908 	struct xps_map *new_map;
1909 	int alloc_len = XPS_MIN_MAP_ALLOC;
1910 	int i, pos;
1911 
1912 	for (pos = 0; map && pos < map->len; pos++) {
1913 		if (map->queues[pos] != index)
1914 			continue;
1915 		return map;
1916 	}
1917 
1918 	/* Need to add queue to this CPU's existing map */
1919 	if (map) {
1920 		if (pos < map->alloc_len)
1921 			return map;
1922 
1923 		alloc_len = map->alloc_len * 2;
1924 	}
1925 
1926 	/* Need to allocate a new map to store the queue on this CPU */
1927 	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1928 			       cpu_to_node(cpu));
1929 	if (!new_map)
1930 		return NULL;
1931 
1932 	for (i = 0; i < pos; i++)
1933 		new_map->queues[i] = map->queues[i];
1934 	new_map->alloc_len = alloc_len;
1935 	new_map->len = pos;
1936 
1937 	return new_map;
1938 }
1939 
1940 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1941 			u16 index)
1942 {
1943 	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1944 	struct xps_map *map, *new_map;
1945 	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1946 	int cpu, numa_node_id = -2;
1947 	bool active = false;
1948 
1949 	mutex_lock(&xps_map_mutex);
1950 
1951 	dev_maps = xmap_dereference(dev->xps_maps);
1952 
1953 	/* allocate memory for queue storage */
1954 	for_each_online_cpu(cpu) {
1955 		if (!cpumask_test_cpu(cpu, mask))
1956 			continue;
1957 
1958 		if (!new_dev_maps)
1959 			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1960 		if (!new_dev_maps) {
1961 			mutex_unlock(&xps_map_mutex);
1962 			return -ENOMEM;
1963 		}
1964 
1965 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1966 				 NULL;
1967 
1968 		map = expand_xps_map(map, cpu, index);
1969 		if (!map)
1970 			goto error;
1971 
1972 		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1973 	}
1974 
1975 	if (!new_dev_maps)
1976 		goto out_no_new_maps;
1977 
1978 	for_each_possible_cpu(cpu) {
1979 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1980 			/* add queue to CPU maps */
1981 			int pos = 0;
1982 
1983 			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1984 			while ((pos < map->len) && (map->queues[pos] != index))
1985 				pos++;
1986 
1987 			if (pos == map->len)
1988 				map->queues[map->len++] = index;
1989 #ifdef CONFIG_NUMA
1990 			if (numa_node_id == -2)
1991 				numa_node_id = cpu_to_node(cpu);
1992 			else if (numa_node_id != cpu_to_node(cpu))
1993 				numa_node_id = -1;
1994 #endif
1995 		} else if (dev_maps) {
1996 			/* fill in the new device map from the old device map */
1997 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
1998 			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1999 		}
2000 
2001 	}
2002 
2003 	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2004 
2005 	/* Cleanup old maps */
2006 	if (dev_maps) {
2007 		for_each_possible_cpu(cpu) {
2008 			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2009 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
2010 			if (map && map != new_map)
2011 				kfree_rcu(map, rcu);
2012 		}
2013 
2014 		kfree_rcu(dev_maps, rcu);
2015 	}
2016 
2017 	dev_maps = new_dev_maps;
2018 	active = true;
2019 
2020 out_no_new_maps:
2021 	/* update Tx queue numa node */
2022 	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2023 				     (numa_node_id >= 0) ? numa_node_id :
2024 				     NUMA_NO_NODE);
2025 
2026 	if (!dev_maps)
2027 		goto out_no_maps;
2028 
2029 	/* remove the queue from unused CPUs */
2030 	for_each_possible_cpu(cpu) {
2031 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2032 			continue;
2033 
2034 		if (remove_xps_queue(dev_maps, cpu, index))
2035 			active = true;
2036 	}
2037 
2038 	/* free map if not active */
2039 	if (!active) {
2040 		RCU_INIT_POINTER(dev->xps_maps, NULL);
2041 		kfree_rcu(dev_maps, rcu);
2042 	}
2043 
2044 out_no_maps:
2045 	mutex_unlock(&xps_map_mutex);
2046 
2047 	return 0;
2048 error:
2049 	/* remove any maps that we added */
2050 	for_each_possible_cpu(cpu) {
2051 		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2052 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2053 				 NULL;
2054 		if (new_map && new_map != map)
2055 			kfree(new_map);
2056 	}
2057 
2058 	mutex_unlock(&xps_map_mutex);
2059 
2060 	kfree(new_dev_maps);
2061 	return -ENOMEM;
2062 }
2063 EXPORT_SYMBOL(netif_set_xps_queue);
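/*
 * Example (sketch): a driver pinning each TX queue to one CPU with
 * netif_set_xps_queue().  The one-queue-per-online-CPU layout is an
 * assumption made only for illustration.
 *
 *	static void example_set_xps(struct net_device *dev)
 *	{
 *		cpumask_var_t mask;
 *		int cpu, i = 0;
 *
 *		if (!alloc_cpumask_var(&mask, GFP_KERNEL))
 *			return;
 *		for_each_online_cpu(cpu) {
 *			if (i >= dev->real_num_tx_queues)
 *				break;
 *			cpumask_clear(mask);
 *			cpumask_set_cpu(cpu, mask);
 *			netif_set_xps_queue(dev, mask, i++);
 *		}
 *		free_cpumask_var(mask);
 *	}
 */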
2064 
2065 #endif
2066 /*
2067  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2068  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2069  */
2070 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2071 {
2072 	int rc;
2073 
2074 	if (txq < 1 || txq > dev->num_tx_queues)
2075 		return -EINVAL;
2076 
2077 	if (dev->reg_state == NETREG_REGISTERED ||
2078 	    dev->reg_state == NETREG_UNREGISTERING) {
2079 		ASSERT_RTNL();
2080 
2081 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2082 						  txq);
2083 		if (rc)
2084 			return rc;
2085 
2086 		if (dev->num_tc)
2087 			netif_setup_tc(dev, txq);
2088 
2089 		if (txq < dev->real_num_tx_queues) {
2090 			qdisc_reset_all_tx_gt(dev, txq);
2091 #ifdef CONFIG_XPS
2092 			netif_reset_xps_queues_gt(dev, txq);
2093 #endif
2094 		}
2095 	}
2096 
2097 	dev->real_num_tx_queues = txq;
2098 	return 0;
2099 }
2100 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2101 
2102 #ifdef CONFIG_SYSFS
2103 /**
2104  *	netif_set_real_num_rx_queues - set actual number of RX queues used
2105  *	@dev: Network device
2106  *	@rxq: Actual number of RX queues
2107  *
2108  *	This must be called either with the rtnl_lock held or before
2109  *	registration of the net device.  Returns 0 on success, or a
2110  *	negative error code.  If called before registration, it always
2111  *	succeeds.
2112  */
2113 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2114 {
2115 	int rc;
2116 
2117 	if (rxq < 1 || rxq > dev->num_rx_queues)
2118 		return -EINVAL;
2119 
2120 	if (dev->reg_state == NETREG_REGISTERED) {
2121 		ASSERT_RTNL();
2122 
2123 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2124 						  rxq);
2125 		if (rc)
2126 			return rc;
2127 	}
2128 
2129 	dev->real_num_rx_queues = rxq;
2130 	return 0;
2131 }
2132 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2133 #endif
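/*
 * Example (sketch): resizing the active queue sets, e.g. from a driver's
 * ethtool set_channels handler.  Once the device is registered both calls
 * below must be made under rtnl_lock(); "txq" and "rxq" are the new counts
 * requested by the user.
 *
 *	static int example_set_queue_counts(struct net_device *dev,
 *					    unsigned int txq, unsigned int rxq)
 *	{
 *		int err;
 *
 *		err = netif_set_real_num_tx_queues(dev, txq);
 *		if (err)
 *			return err;
 *		return netif_set_real_num_rx_queues(dev, rxq);
 *	}
 */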
2134 
2135 /**
2136  * netif_get_num_default_rss_queues - default number of RSS queues
2137  *
2138  * This routine should set an upper limit on the number of RSS queues
2139  * used by default by multiqueue devices.
2140  */
2141 int netif_get_num_default_rss_queues(void)
2142 {
2143 	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2144 }
2145 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
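/*
 * Example (sketch): a driver capping its requested RX queue count with the
 * default RSS limit above; EXAMPLE_MAX_HW_QUEUES is a made-up hardware limit.
 *
 *	unsigned int nq = min_t(unsigned int, EXAMPLE_MAX_HW_QUEUES,
 *				netif_get_num_default_rss_queues());
 */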
2146 
2147 static inline void __netif_reschedule(struct Qdisc *q)
2148 {
2149 	struct softnet_data *sd;
2150 	unsigned long flags;
2151 
2152 	local_irq_save(flags);
2153 	sd = this_cpu_ptr(&softnet_data);
2154 	q->next_sched = NULL;
2155 	*sd->output_queue_tailp = q;
2156 	sd->output_queue_tailp = &q->next_sched;
2157 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2158 	local_irq_restore(flags);
2159 }
2160 
2161 void __netif_schedule(struct Qdisc *q)
2162 {
2163 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2164 		__netif_reschedule(q);
2165 }
2166 EXPORT_SYMBOL(__netif_schedule);
2167 
2168 struct dev_kfree_skb_cb {
2169 	enum skb_free_reason reason;
2170 };
2171 
2172 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2173 {
2174 	return (struct dev_kfree_skb_cb *)skb->cb;
2175 }
2176 
2177 void netif_schedule_queue(struct netdev_queue *txq)
2178 {
2179 	rcu_read_lock();
2180 	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2181 		struct Qdisc *q = rcu_dereference(txq->qdisc);
2182 
2183 		__netif_schedule(q);
2184 	}
2185 	rcu_read_unlock();
2186 }
2187 EXPORT_SYMBOL(netif_schedule_queue);
2188 
2189 /**
2190  *	netif_wake_subqueue - allow sending packets on subqueue
2191  *	@dev: network device
2192  *	@queue_index: sub queue index
2193  *
2194  * Resume individual transmit queue of a device with multiple transmit queues.
2195  */
2196 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2197 {
2198 	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2199 
2200 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2201 		struct Qdisc *q;
2202 
2203 		rcu_read_lock();
2204 		q = rcu_dereference(txq->qdisc);
2205 		__netif_schedule(q);
2206 		rcu_read_unlock();
2207 	}
2208 }
2209 EXPORT_SYMBOL(netif_wake_subqueue);
2210 
2211 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2212 {
2213 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2214 		struct Qdisc *q;
2215 
2216 		rcu_read_lock();
2217 		q = rcu_dereference(dev_queue->qdisc);
2218 		__netif_schedule(q);
2219 		rcu_read_unlock();
2220 	}
2221 }
2222 EXPORT_SYMBOL(netif_tx_wake_queue);
2223 
2224 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2225 {
2226 	unsigned long flags;
2227 
2228 	if (likely(atomic_read(&skb->users) == 1)) {
2229 		smp_rmb();
2230 		atomic_set(&skb->users, 0);
2231 	} else if (likely(!atomic_dec_and_test(&skb->users))) {
2232 		return;
2233 	}
2234 	get_kfree_skb_cb(skb)->reason = reason;
2235 	local_irq_save(flags);
2236 	skb->next = __this_cpu_read(softnet_data.completion_queue);
2237 	__this_cpu_write(softnet_data.completion_queue, skb);
2238 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2239 	local_irq_restore(flags);
2240 }
2241 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2242 
2243 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2244 {
2245 	if (in_irq() || irqs_disabled())
2246 		__dev_kfree_skb_irq(skb, reason);
2247 	else
2248 		dev_kfree_skb(skb);
2249 }
2250 EXPORT_SYMBOL(__dev_kfree_skb_any);
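/*
 * Example (sketch): a TX completion handler that may run in hard-IRQ or
 * process context and therefore uses the _any variants.  Successfully sent
 * buffers are "consumed" while buffers dropped on error are "freed", which
 * keeps the consume/drop tracepoints meaningful.
 *
 *	static void example_tx_complete(struct sk_buff *skb, bool ok)
 *	{
 *		if (ok)
 *			dev_consume_skb_any(skb);
 *		else
 *			dev_kfree_skb_any(skb);
 *	}
 */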
2251 
2252 
2253 /**
2254  * netif_device_detach - mark device as removed
2255  * @dev: network device
2256  *
2257  * Mark the device as removed from the system and therefore no longer available.
2258  */
2259 void netif_device_detach(struct net_device *dev)
2260 {
2261 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2262 	    netif_running(dev)) {
2263 		netif_tx_stop_all_queues(dev);
2264 	}
2265 }
2266 EXPORT_SYMBOL(netif_device_detach);
2267 
2268 /**
2269  * netif_device_attach - mark device as attached
2270  * @dev: network device
2271  *
2272  * Mark the device as attached to the system and restart it if needed.
2273  */
2274 void netif_device_attach(struct net_device *dev)
2275 {
2276 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2277 	    netif_running(dev)) {
2278 		netif_tx_wake_all_queues(dev);
2279 		__netdev_watchdog_up(dev);
2280 	}
2281 }
2282 EXPORT_SYMBOL(netif_device_attach);
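/*
 * Example (sketch): the usual detach/attach pairing in a driver's suspend
 * and resume callbacks.  example_hw_power_down() and example_hw_init() are
 * hypothetical hardware-specific helpers.
 *
 *	static int example_suspend(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		netif_device_detach(dev);
 *		example_hw_power_down(dev);
 *		return 0;
 *	}
 *
 *	static int example_resume(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		example_hw_init(dev);
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 */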
2283 
2284 static void skb_warn_bad_offload(const struct sk_buff *skb)
2285 {
2286 	static const netdev_features_t null_features = 0;
2287 	struct net_device *dev = skb->dev;
2288 	const char *driver = "";
2289 
2290 	if (!net_ratelimit())
2291 		return;
2292 
2293 	if (dev && dev->dev.parent)
2294 		driver = dev_driver_string(dev->dev.parent);
2295 
2296 	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2297 	     "gso_type=%d ip_summed=%d\n",
2298 	     driver, dev ? &dev->features : &null_features,
2299 	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2300 	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2301 	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2302 }
2303 
2304 /*
2305  * Invalidate hardware checksum when packet is to be mangled, and
2306  * complete checksum manually on outgoing path.
2307  */
2308 int skb_checksum_help(struct sk_buff *skb)
2309 {
2310 	__wsum csum;
2311 	int ret = 0, offset;
2312 
2313 	if (skb->ip_summed == CHECKSUM_COMPLETE)
2314 		goto out_set_summed;
2315 
2316 	if (unlikely(skb_shinfo(skb)->gso_size)) {
2317 		skb_warn_bad_offload(skb);
2318 		return -EINVAL;
2319 	}
2320 
2321 	/* Before computing a checksum, we should make sure no frag could
2322 	 * be modified by an external entity: otherwise the checksum could be wrong.
2323 	 */
2324 	if (skb_has_shared_frag(skb)) {
2325 		ret = __skb_linearize(skb);
2326 		if (ret)
2327 			goto out;
2328 	}
2329 
2330 	offset = skb_checksum_start_offset(skb);
2331 	BUG_ON(offset >= skb_headlen(skb));
2332 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2333 
2334 	offset += skb->csum_offset;
2335 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2336 
2337 	if (skb_cloned(skb) &&
2338 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2339 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2340 		if (ret)
2341 			goto out;
2342 	}
2343 
2344 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
2345 out_set_summed:
2346 	skb->ip_summed = CHECKSUM_NONE;
2347 out:
2348 	return ret;
2349 }
2350 EXPORT_SYMBOL(skb_checksum_help);
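/*
 * Example (sketch): a driver transmit path falling back to a software
 * checksum when its hardware cannot checksum this particular frame.
 * example_hw_can_csum() is a hypothetical capability test.
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    !example_hw_can_csum(skb) &&
 *	    skb_checksum_help(skb))
 *		goto drop_skb;
 */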
2351 
2352 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2353 {
2354 	unsigned int vlan_depth = skb->mac_len;
2355 	__be16 type = skb->protocol;
2356 
2357 	/* Tunnel gso handlers can set protocol to ethernet. */
2358 	if (type == htons(ETH_P_TEB)) {
2359 		struct ethhdr *eth;
2360 
2361 		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2362 			return 0;
2363 
2364 		eth = (struct ethhdr *)skb_mac_header(skb);
2365 		type = eth->h_proto;
2366 	}
2367 
2368 	/* if skb->protocol is 802.1Q/AD then the header should already be
2369 	 * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
2370 	 * ETH_HLEN otherwise
2371 	 */
2372 	if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2373 		if (vlan_depth) {
2374 			if (WARN_ON(vlan_depth < VLAN_HLEN))
2375 				return 0;
2376 			vlan_depth -= VLAN_HLEN;
2377 		} else {
2378 			vlan_depth = ETH_HLEN;
2379 		}
2380 		do {
2381 			struct vlan_hdr *vh;
2382 
2383 			if (unlikely(!pskb_may_pull(skb,
2384 						    vlan_depth + VLAN_HLEN)))
2385 				return 0;
2386 
2387 			vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2388 			type = vh->h_vlan_encapsulated_proto;
2389 			vlan_depth += VLAN_HLEN;
2390 		} while (type == htons(ETH_P_8021Q) ||
2391 			 type == htons(ETH_P_8021AD));
2392 	}
2393 
2394 	*depth = vlan_depth;
2395 
2396 	return type;
2397 }
2398 
2399 /**
2400  *	skb_mac_gso_segment - mac layer segmentation handler.
2401  *	@skb: buffer to segment
2402  *	@features: features for the output path (see dev->features)
2403  */
2404 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2405 				    netdev_features_t features)
2406 {
2407 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2408 	struct packet_offload *ptype;
2409 	int vlan_depth = skb->mac_len;
2410 	__be16 type = skb_network_protocol(skb, &vlan_depth);
2411 
2412 	if (unlikely(!type))
2413 		return ERR_PTR(-EINVAL);
2414 
2415 	__skb_pull(skb, vlan_depth);
2416 
2417 	rcu_read_lock();
2418 	list_for_each_entry_rcu(ptype, &offload_base, list) {
2419 		if (ptype->type == type && ptype->callbacks.gso_segment) {
2420 			segs = ptype->callbacks.gso_segment(skb, features);
2421 			break;
2422 		}
2423 	}
2424 	rcu_read_unlock();
2425 
2426 	__skb_push(skb, skb->data - skb_mac_header(skb));
2427 
2428 	return segs;
2429 }
2430 EXPORT_SYMBOL(skb_mac_gso_segment);
2431 
2432 
2433 /* openvswitch calls this on rx path, so we need a different check.
2434  */
2435 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2436 {
2437 	if (tx_path)
2438 		return skb->ip_summed != CHECKSUM_PARTIAL;
2439 	else
2440 		return skb->ip_summed == CHECKSUM_NONE;
2441 }
2442 
2443 /**
2444  *	__skb_gso_segment - Perform segmentation on skb.
2445  *	@skb: buffer to segment
2446  *	@features: features for the output path (see dev->features)
2447  *	@tx_path: whether it is called in TX path
2448  *
2449  *	This function segments the given skb and returns a list of segments.
2450  *
2451  *	It may return NULL if the skb requires no segmentation.  This is
2452  *	only possible when GSO is used for verifying header integrity.
2453  */
2454 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2455 				  netdev_features_t features, bool tx_path)
2456 {
2457 	if (unlikely(skb_needs_check(skb, tx_path))) {
2458 		int err;
2459 
2460 		skb_warn_bad_offload(skb);
2461 
2462 		err = skb_cow_head(skb, 0);
2463 		if (err < 0)
2464 			return ERR_PTR(err);
2465 	}
2466 
2467 	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2468 	SKB_GSO_CB(skb)->encap_level = 0;
2469 
2470 	skb_reset_mac_header(skb);
2471 	skb_reset_mac_len(skb);
2472 
2473 	return skb_mac_gso_segment(skb, features);
2474 }
2475 EXPORT_SYMBOL(__skb_gso_segment);
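/*
 * Example (sketch): forcing segmentation of a GSO skb with the
 * skb_gso_segment() wrapper (which calls __skb_gso_segment() with
 * tx_path == true) and transmitting each resulting segment.
 * example_xmit_one() is a hypothetical per-segment transmit helper.
 *
 *	struct sk_buff *segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 *
 *	if (IS_ERR(segs))
 *		goto drop;
 *	if (segs) {
 *		consume_skb(skb);
 *		while (segs) {
 *			struct sk_buff *seg = segs;
 *
 *			segs = segs->next;
 *			seg->next = NULL;
 *			example_xmit_one(seg);
 *		}
 *	}
 */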
2476 
2477 /* Take action when hardware reception checksum errors are detected. */
2478 #ifdef CONFIG_BUG
2479 void netdev_rx_csum_fault(struct net_device *dev)
2480 {
2481 	if (net_ratelimit()) {
2482 		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2483 		dump_stack();
2484 	}
2485 }
2486 EXPORT_SYMBOL(netdev_rx_csum_fault);
2487 #endif
2488 
2489 /* Actually, we should eliminate this check as soon as we know that:
2490  * 1. An IOMMU is present and is able to map all of the memory.
2491  * 2. No high memory really exists on this machine.
2492  */
2493 
2494 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2495 {
2496 #ifdef CONFIG_HIGHMEM
2497 	int i;
2498 	if (!(dev->features & NETIF_F_HIGHDMA)) {
2499 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2500 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2501 			if (PageHighMem(skb_frag_page(frag)))
2502 				return 1;
2503 		}
2504 	}
2505 
2506 	if (PCI_DMA_BUS_IS_PHYS) {
2507 		struct device *pdev = dev->dev.parent;
2508 
2509 		if (!pdev)
2510 			return 0;
2511 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2512 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2513 			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2514 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2515 				return 1;
2516 		}
2517 	}
2518 #endif
2519 	return 0;
2520 }
2521 
2522 /* For an MPLS offload request, verify that we are testing the hardware MPLS
2523  * features instead of the standard features for the netdev.
2524  */
2525 #ifdef CONFIG_NET_MPLS_GSO
2526 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2527 					   netdev_features_t features,
2528 					   __be16 type)
2529 {
2530 	if (eth_p_mpls(type))
2531 		features &= skb->dev->mpls_features;
2532 
2533 	return features;
2534 }
2535 #else
2536 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2537 					   netdev_features_t features,
2538 					   __be16 type)
2539 {
2540 	return features;
2541 }
2542 #endif
2543 
2544 static netdev_features_t harmonize_features(struct sk_buff *skb,
2545 	netdev_features_t features)
2546 {
2547 	int tmp;
2548 	__be16 type;
2549 
2550 	type = skb_network_protocol(skb, &tmp);
2551 	features = net_mpls_features(skb, features, type);
2552 
2553 	if (skb->ip_summed != CHECKSUM_NONE &&
2554 	    !can_checksum_protocol(features, type)) {
2555 		features &= ~NETIF_F_ALL_CSUM;
2556 	} else if (illegal_highdma(skb->dev, skb)) {
2557 		features &= ~NETIF_F_SG;
2558 	}
2559 
2560 	return features;
2561 }
2562 
2563 netdev_features_t netif_skb_features(struct sk_buff *skb)
2564 {
2565 	const struct net_device *dev = skb->dev;
2566 	netdev_features_t features = dev->features;
2567 	u16 gso_segs = skb_shinfo(skb)->gso_segs;
2568 	__be16 protocol = skb->protocol;
2569 
2570 	if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2571 		features &= ~NETIF_F_GSO_MASK;
2572 
2573 	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2574 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2575 		protocol = veh->h_vlan_encapsulated_proto;
2576 	} else if (!vlan_tx_tag_present(skb)) {
2577 		return harmonize_features(skb, features);
2578 	}
2579 
2580 	features = netdev_intersect_features(features,
2581 					     dev->vlan_features |
2582 					     NETIF_F_HW_VLAN_CTAG_TX |
2583 					     NETIF_F_HW_VLAN_STAG_TX);
2584 
2585 	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2586 		features = netdev_intersect_features(features,
2587 						     NETIF_F_SG |
2588 						     NETIF_F_HIGHDMA |
2589 						     NETIF_F_FRAGLIST |
2590 						     NETIF_F_GEN_CSUM |
2591 						     NETIF_F_HW_VLAN_CTAG_TX |
2592 						     NETIF_F_HW_VLAN_STAG_TX);
2593 
2594 	return harmonize_features(skb, features);
2595 }
2596 EXPORT_SYMBOL(netif_skb_features);
2597 
2598 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2599 		    struct netdev_queue *txq, bool more)
2600 {
2601 	unsigned int len;
2602 	int rc;
2603 
2604 	if (!list_empty(&ptype_all))
2605 		dev_queue_xmit_nit(skb, dev);
2606 
2607 	len = skb->len;
2608 	trace_net_dev_start_xmit(skb, dev);
2609 	rc = netdev_start_xmit(skb, dev, txq, more);
2610 	trace_net_dev_xmit(skb, rc, dev, len);
2611 
2612 	return rc;
2613 }
2614 
2615 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2616 				    struct netdev_queue *txq, int *ret)
2617 {
2618 	struct sk_buff *skb = first;
2619 	int rc = NETDEV_TX_OK;
2620 
2621 	while (skb) {
2622 		struct sk_buff *next = skb->next;
2623 
2624 		skb->next = NULL;
2625 		rc = xmit_one(skb, dev, txq, next != NULL);
2626 		if (unlikely(!dev_xmit_complete(rc))) {
2627 			skb->next = next;
2628 			goto out;
2629 		}
2630 
2631 		skb = next;
2632 		if (netif_xmit_stopped(txq) && skb) {
2633 			rc = NETDEV_TX_BUSY;
2634 			break;
2635 		}
2636 	}
2637 
2638 out:
2639 	*ret = rc;
2640 	return skb;
2641 }
2642 
2643 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2644 					  netdev_features_t features)
2645 {
2646 	if (vlan_tx_tag_present(skb) &&
2647 	    !vlan_hw_offload_capable(features, skb->vlan_proto))
2648 		skb = __vlan_hwaccel_push_inside(skb);
2649 	return skb;
2650 }
2651 
2652 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2653 {
2654 	netdev_features_t features;
2655 
2656 	if (skb->next)
2657 		return skb;
2658 
2659 	features = netif_skb_features(skb);
2660 	skb = validate_xmit_vlan(skb, features);
2661 	if (unlikely(!skb))
2662 		goto out_null;
2663 
2664 	/* For an encapsulation offload request, verify that we are
2665 	 * testing the hardware encapsulation features instead of the
2666 	 * standard features for the netdev.
2667 	 */
2668 	if (skb->encapsulation)
2669 		features &= dev->hw_enc_features;
2670 
2671 	if (netif_needs_gso(dev, skb, features)) {
2672 		struct sk_buff *segs;
2673 
2674 		segs = skb_gso_segment(skb, features);
2675 		if (IS_ERR(segs)) {
2676 			segs = NULL;
2677 		} else if (segs) {
2678 			consume_skb(skb);
2679 			skb = segs;
2680 		}
2681 	} else {
2682 		if (skb_needs_linearize(skb, features) &&
2683 		    __skb_linearize(skb))
2684 			goto out_kfree_skb;
2685 
2686 		/* If packet is not checksummed and device does not
2687 		 * support checksumming for this protocol, complete
2688 		 * checksumming here.
2689 		 */
2690 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
2691 			if (skb->encapsulation)
2692 				skb_set_inner_transport_header(skb,
2693 							       skb_checksum_start_offset(skb));
2694 			else
2695 				skb_set_transport_header(skb,
2696 							 skb_checksum_start_offset(skb));
2697 			if (!(features & NETIF_F_ALL_CSUM) &&
2698 			    skb_checksum_help(skb))
2699 				goto out_kfree_skb;
2700 		}
2701 	}
2702 
2703 	return skb;
2704 
2705 out_kfree_skb:
2706 	kfree_skb(skb);
2707 out_null:
2708 	return NULL;
2709 }
2710 
2711 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2712 {
2713 	struct sk_buff *next, *head = NULL, *tail;
2714 
2715 	for (; skb != NULL; skb = next) {
2716 		next = skb->next;
2717 		skb->next = NULL;
2718 
2719 		/* in case skb won't be segmented, point to itself */
2720 		skb->prev = skb;
2721 
2722 		skb = validate_xmit_skb(skb, dev);
2723 		if (!skb)
2724 			continue;
2725 
2726 		if (!head)
2727 			head = skb;
2728 		else
2729 			tail->next = skb;
2730 		/* If skb was segmented, skb->prev points to
2731 		 * the last segment. If not, it still contains skb.
2732 		 */
2733 		tail = skb->prev;
2734 	}
2735 	return head;
2736 }
2737 
2738 static void qdisc_pkt_len_init(struct sk_buff *skb)
2739 {
2740 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
2741 
2742 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2743 
2744 	/* To get a more precise estimate of the bytes sent on the wire,
2745 	 * we add to pkt_len the header size of every segment
2746 	 */
2747 	if (shinfo->gso_size)  {
2748 		unsigned int hdr_len;
2749 		u16 gso_segs = shinfo->gso_segs;
2750 
2751 		/* mac layer + network layer */
2752 		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2753 
2754 		/* + transport layer */
2755 		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2756 			hdr_len += tcp_hdrlen(skb);
2757 		else
2758 			hdr_len += sizeof(struct udphdr);
2759 
2760 		if (shinfo->gso_type & SKB_GSO_DODGY)
2761 			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2762 						shinfo->gso_size);
2763 
2764 		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2765 	}
2766 }
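/*
 * Worked example (illustrative numbers only): a TCP GSO skb carrying ten
 * 1460-byte segments behind 54 bytes of Ethernet + IPv4 + TCP headers has
 * skb->len = 10 * 1460 + 54 = 14654, gso_segs = 10 and hdr_len = 54, so the
 * code above reports pkt_len = 14654 + (10 - 1) * 54 = 15140 bytes, i.e.
 * exactly ten 1514-byte frames as they will appear on the wire.
 */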
2767 
2768 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2769 				 struct net_device *dev,
2770 				 struct netdev_queue *txq)
2771 {
2772 	spinlock_t *root_lock = qdisc_lock(q);
2773 	bool contended;
2774 	int rc;
2775 
2776 	qdisc_pkt_len_init(skb);
2777 	qdisc_calculate_pkt_len(skb, q);
2778 	/*
2779 	 * Heuristic to force contended enqueues to serialize on a
2780 	 * separate lock before trying to get qdisc main lock.
2781 	 * This permits __QDISC___STATE_RUNNING owner to get the lock more
2782 	 * often and dequeue packets faster.
2783 	 */
2784 	contended = qdisc_is_running(q);
2785 	if (unlikely(contended))
2786 		spin_lock(&q->busylock);
2787 
2788 	spin_lock(root_lock);
2789 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2790 		kfree_skb(skb);
2791 		rc = NET_XMIT_DROP;
2792 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2793 		   qdisc_run_begin(q)) {
2794 		/*
2795 		 * This is a work-conserving queue; there are no old skbs
2796 		 * waiting to be sent out; and the qdisc is not running -
2797 		 * xmit the skb directly.
2798 		 */
2799 
2800 		qdisc_bstats_update(q, skb);
2801 
2802 		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2803 			if (unlikely(contended)) {
2804 				spin_unlock(&q->busylock);
2805 				contended = false;
2806 			}
2807 			__qdisc_run(q);
2808 		} else
2809 			qdisc_run_end(q);
2810 
2811 		rc = NET_XMIT_SUCCESS;
2812 	} else {
2813 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2814 		if (qdisc_run_begin(q)) {
2815 			if (unlikely(contended)) {
2816 				spin_unlock(&q->busylock);
2817 				contended = false;
2818 			}
2819 			__qdisc_run(q);
2820 		}
2821 	}
2822 	spin_unlock(root_lock);
2823 	if (unlikely(contended))
2824 		spin_unlock(&q->busylock);
2825 	return rc;
2826 }
2827 
2828 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2829 static void skb_update_prio(struct sk_buff *skb)
2830 {
2831 	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2832 
2833 	if (!skb->priority && skb->sk && map) {
2834 		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2835 
2836 		if (prioidx < map->priomap_len)
2837 			skb->priority = map->priomap[prioidx];
2838 	}
2839 }
2840 #else
2841 #define skb_update_prio(skb)
2842 #endif
2843 
2844 static DEFINE_PER_CPU(int, xmit_recursion);
2845 #define RECURSION_LIMIT 10
2846 
2847 /**
2848  *	dev_loopback_xmit - loop back @skb
2849  *	@skb: buffer to transmit
2850  */
2851 int dev_loopback_xmit(struct sk_buff *skb)
2852 {
2853 	skb_reset_mac_header(skb);
2854 	__skb_pull(skb, skb_network_offset(skb));
2855 	skb->pkt_type = PACKET_LOOPBACK;
2856 	skb->ip_summed = CHECKSUM_UNNECESSARY;
2857 	WARN_ON(!skb_dst(skb));
2858 	skb_dst_force(skb);
2859 	netif_rx_ni(skb);
2860 	return 0;
2861 }
2862 EXPORT_SYMBOL(dev_loopback_xmit);
2863 
2864 /**
2865  *	__dev_queue_xmit - transmit a buffer
2866  *	@skb: buffer to transmit
2867  *	@accel_priv: private data used for L2 forwarding offload
2868  *
2869  *	Queue a buffer for transmission to a network device. The caller must
2870  *	have set the device and priority and built the buffer before calling
2871  *	this function. The function can be called from an interrupt.
2872  *
2873  *	A negative errno code is returned on a failure. A success does not
2874  *	guarantee the frame will be transmitted as it may be dropped due
2875  *	to congestion or traffic shaping.
2876  *
2877  * -----------------------------------------------------------------------------------
2878  *      I notice this method can also return errors from the queue disciplines,
2879  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2880  *      be positive.
2881  *
2882  *      Regardless of the return value, the skb is consumed, so it is currently
2883  *      difficult to retry a send to this method.  (You can bump the ref count
2884  *      before sending to hold a reference for retry if you are careful.)
2885  *
2886  *      When calling this method, interrupts MUST be enabled.  This is because
2887  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2888  *          --BLG
2889  */
2890 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2891 {
2892 	struct net_device *dev = skb->dev;
2893 	struct netdev_queue *txq;
2894 	struct Qdisc *q;
2895 	int rc = -ENOMEM;
2896 
2897 	skb_reset_mac_header(skb);
2898 
2899 	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2900 		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2901 
2902 	/* Disable soft irqs for various locks below. Also
2903 	 * stops preemption for RCU.
2904 	 */
2905 	rcu_read_lock_bh();
2906 
2907 	skb_update_prio(skb);
2908 
2909 	/* If device/qdisc don't need skb->dst, release it right now while
2910 	 * it's hot in this CPU's cache.
2911 	 */
2912 	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2913 		skb_dst_drop(skb);
2914 	else
2915 		skb_dst_force(skb);
2916 
2917 	txq = netdev_pick_tx(dev, skb, accel_priv);
2918 	q = rcu_dereference_bh(txq->qdisc);
2919 
2920 #ifdef CONFIG_NET_CLS_ACT
2921 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2922 #endif
2923 	trace_net_dev_queue(skb);
2924 	if (q->enqueue) {
2925 		rc = __dev_xmit_skb(skb, q, dev, txq);
2926 		goto out;
2927 	}
2928 
2929 	/* The device has no queue. Common case for software devices:
2930 	   loopback, all sorts of tunnels...
2931 
2932 	   Really, it is unlikely that netif_tx_lock protection is necessary
2933 	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2934 	   counters.)
2935 	   However, it is possible that they rely on the protection
2936 	   provided by us here.
2937 
2938 	   Check this and shoot the lock. It is not prone to deadlocks.
2939 	   Or shoot the noqueue qdisc; it is even simpler 8)
2940 	 */
2941 	if (dev->flags & IFF_UP) {
2942 		int cpu = smp_processor_id(); /* ok because BHs are off */
2943 
2944 		if (txq->xmit_lock_owner != cpu) {
2945 
2946 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2947 				goto recursion_alert;
2948 
2949 			skb = validate_xmit_skb(skb, dev);
2950 			if (!skb)
2951 				goto drop;
2952 
2953 			HARD_TX_LOCK(dev, txq, cpu);
2954 
2955 			if (!netif_xmit_stopped(txq)) {
2956 				__this_cpu_inc(xmit_recursion);
2957 				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
2958 				__this_cpu_dec(xmit_recursion);
2959 				if (dev_xmit_complete(rc)) {
2960 					HARD_TX_UNLOCK(dev, txq);
2961 					goto out;
2962 				}
2963 			}
2964 			HARD_TX_UNLOCK(dev, txq);
2965 			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2966 					     dev->name);
2967 		} else {
2968 			/* Recursion is detected! It is possible,
2969 			 * unfortunately
2970 			 */
2971 recursion_alert:
2972 			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2973 					     dev->name);
2974 		}
2975 	}
2976 
2977 	rc = -ENETDOWN;
2978 drop:
2979 	rcu_read_unlock_bh();
2980 
2981 	atomic_long_inc(&dev->tx_dropped);
2982 	kfree_skb_list(skb);
2983 	return rc;
2984 out:
2985 	rcu_read_unlock_bh();
2986 	return rc;
2987 }
2988 
2989 int dev_queue_xmit(struct sk_buff *skb)
2990 {
2991 	return __dev_queue_xmit(skb, NULL);
2992 }
2993 EXPORT_SYMBOL(dev_queue_xmit);
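/*
 * Example (sketch): a tunnel-style sender handing a fully built frame to the
 * stack's transmit path.  dev_queue_xmit() consumes the skb even on error,
 * so the caller only interprets the return value; net_xmit_eval() maps the
 * positive NET_XMIT_* congestion codes to 0 or an error.  "lower_dev" and
 * example_count_tx_error() are hypothetical.
 *
 *	skb->dev = lower_dev;
 *	skb->protocol = htons(ETH_P_IP);
 *	err = dev_queue_xmit(skb);
 *	if (net_xmit_eval(err))
 *		example_count_tx_error(lower_dev);
 */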
2994 
2995 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
2996 {
2997 	return __dev_queue_xmit(skb, accel_priv);
2998 }
2999 EXPORT_SYMBOL(dev_queue_xmit_accel);
3000 
3001 
3002 /*=======================================================================
3003 			Receiver routines
3004   =======================================================================*/
3005 
3006 int netdev_max_backlog __read_mostly = 1000;
3007 EXPORT_SYMBOL(netdev_max_backlog);
3008 
3009 int netdev_tstamp_prequeue __read_mostly = 1;
3010 int netdev_budget __read_mostly = 300;
3011 int weight_p __read_mostly = 64;            /* old backlog weight */
3012 
3013 /* Called with irq disabled */
3014 static inline void ____napi_schedule(struct softnet_data *sd,
3015 				     struct napi_struct *napi)
3016 {
3017 	list_add_tail(&napi->poll_list, &sd->poll_list);
3018 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3019 }
3020 
3021 #ifdef CONFIG_RPS
3022 
3023 /* One global table that all flow-based protocols share. */
3024 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3025 EXPORT_SYMBOL(rps_sock_flow_table);
3026 
3027 struct static_key rps_needed __read_mostly;
3028 
3029 static struct rps_dev_flow *
3030 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3031 	    struct rps_dev_flow *rflow, u16 next_cpu)
3032 {
3033 	if (next_cpu != RPS_NO_CPU) {
3034 #ifdef CONFIG_RFS_ACCEL
3035 		struct netdev_rx_queue *rxqueue;
3036 		struct rps_dev_flow_table *flow_table;
3037 		struct rps_dev_flow *old_rflow;
3038 		u32 flow_id;
3039 		u16 rxq_index;
3040 		int rc;
3041 
3042 		/* Should we steer this flow to a different hardware queue? */
3043 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3044 		    !(dev->features & NETIF_F_NTUPLE))
3045 			goto out;
3046 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3047 		if (rxq_index == skb_get_rx_queue(skb))
3048 			goto out;
3049 
3050 		rxqueue = dev->_rx + rxq_index;
3051 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
3052 		if (!flow_table)
3053 			goto out;
3054 		flow_id = skb_get_hash(skb) & flow_table->mask;
3055 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3056 							rxq_index, flow_id);
3057 		if (rc < 0)
3058 			goto out;
3059 		old_rflow = rflow;
3060 		rflow = &flow_table->flows[flow_id];
3061 		rflow->filter = rc;
3062 		if (old_rflow->filter == rflow->filter)
3063 			old_rflow->filter = RPS_NO_FILTER;
3064 	out:
3065 #endif
3066 		rflow->last_qtail =
3067 			per_cpu(softnet_data, next_cpu).input_queue_head;
3068 	}
3069 
3070 	rflow->cpu = next_cpu;
3071 	return rflow;
3072 }
3073 
3074 /*
3075  * get_rps_cpu is called from netif_receive_skb and returns the target
3076  * CPU from the RPS map of the receiving queue for a given skb.
3077  * rcu_read_lock must be held on entry.
3078  */
3079 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3080 		       struct rps_dev_flow **rflowp)
3081 {
3082 	struct netdev_rx_queue *rxqueue;
3083 	struct rps_map *map;
3084 	struct rps_dev_flow_table *flow_table;
3085 	struct rps_sock_flow_table *sock_flow_table;
3086 	int cpu = -1;
3087 	u16 tcpu;
3088 	u32 hash;
3089 
3090 	if (skb_rx_queue_recorded(skb)) {
3091 		u16 index = skb_get_rx_queue(skb);
3092 		if (unlikely(index >= dev->real_num_rx_queues)) {
3093 			WARN_ONCE(dev->real_num_rx_queues > 1,
3094 				  "%s received packet on queue %u, but number "
3095 				  "of RX queues is %u\n",
3096 				  dev->name, index, dev->real_num_rx_queues);
3097 			goto done;
3098 		}
3099 		rxqueue = dev->_rx + index;
3100 	} else
3101 		rxqueue = dev->_rx;
3102 
3103 	map = rcu_dereference(rxqueue->rps_map);
3104 	if (map) {
3105 		if (map->len == 1 &&
3106 		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
3107 			tcpu = map->cpus[0];
3108 			if (cpu_online(tcpu))
3109 				cpu = tcpu;
3110 			goto done;
3111 		}
3112 	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3113 		goto done;
3114 	}
3115 
3116 	skb_reset_network_header(skb);
3117 	hash = skb_get_hash(skb);
3118 	if (!hash)
3119 		goto done;
3120 
3121 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3122 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3123 	if (flow_table && sock_flow_table) {
3124 		u16 next_cpu;
3125 		struct rps_dev_flow *rflow;
3126 
3127 		rflow = &flow_table->flows[hash & flow_table->mask];
3128 		tcpu = rflow->cpu;
3129 
3130 		next_cpu = sock_flow_table->ents[hash & sock_flow_table->mask];
3131 
3132 		/*
3133 		 * If the desired CPU (where last recvmsg was done) is
3134 		 * different from current CPU (one in the rx-queue flow
3135 		 * table entry), switch if one of the following holds:
3136 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
3137 		 *   - Current CPU is offline.
3138 		 *   - The current CPU's queue tail has advanced beyond the
3139 		 *     last packet that was enqueued using this table entry.
3140 		 *     This guarantees that all previous packets for the flow
3141 		 *     have been dequeued, thus preserving in order delivery.
3142 		 */
3143 		if (unlikely(tcpu != next_cpu) &&
3144 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3145 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3146 		      rflow->last_qtail)) >= 0)) {
3147 			tcpu = next_cpu;
3148 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3149 		}
3150 
3151 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3152 			*rflowp = rflow;
3153 			cpu = tcpu;
3154 			goto done;
3155 		}
3156 	}
3157 
3158 	if (map) {
3159 		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3160 		if (cpu_online(tcpu)) {
3161 			cpu = tcpu;
3162 			goto done;
3163 		}
3164 	}
3165 
3166 done:
3167 	return cpu;
3168 }
3169 
3170 #ifdef CONFIG_RFS_ACCEL
3171 
3172 /**
3173  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3174  * @dev: Device on which the filter was set
3175  * @rxq_index: RX queue index
3176  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3177  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3178  *
3179  * Drivers that implement ndo_rx_flow_steer() should periodically call
3180  * this function for each installed filter and remove the filters for
3181  * which it returns %true.
3182  */
3183 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3184 			 u32 flow_id, u16 filter_id)
3185 {
3186 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3187 	struct rps_dev_flow_table *flow_table;
3188 	struct rps_dev_flow *rflow;
3189 	bool expire = true;
3190 	int cpu;
3191 
3192 	rcu_read_lock();
3193 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3194 	if (flow_table && flow_id <= flow_table->mask) {
3195 		rflow = &flow_table->flows[flow_id];
3196 		cpu = ACCESS_ONCE(rflow->cpu);
3197 		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3198 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3199 			   rflow->last_qtail) <
3200 		     (int)(10 * flow_table->mask)))
3201 			expire = false;
3202 	}
3203 	rcu_read_unlock();
3204 	return expire;
3205 }
3206 EXPORT_SYMBOL(rps_may_expire_flow);
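/*
 * Example (sketch): the periodic expiry scan a driver implementing
 * ndo_rx_flow_steer() might run over its hardware filter table.  The
 * example_filters[] array, its fields and example_remove_hw_filter() are
 * hypothetical; the filter ID passed here is the index previously returned
 * from ndo_rx_flow_steer().
 *
 *	for (i = 0; i < EXAMPLE_NUM_FILTERS; i++) {
 *		struct example_filter *f = &example_filters[i];
 *
 *		if (f->in_use &&
 *		    rps_may_expire_flow(dev, f->rxq_index, f->flow_id, i)) {
 *			example_remove_hw_filter(dev, i);
 *			f->in_use = false;
 *		}
 *	}
 */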
3207 
3208 #endif /* CONFIG_RFS_ACCEL */
3209 
3210 /* Called from hardirq (IPI) context */
3211 static void rps_trigger_softirq(void *data)
3212 {
3213 	struct softnet_data *sd = data;
3214 
3215 	____napi_schedule(sd, &sd->backlog);
3216 	sd->received_rps++;
3217 }
3218 
3219 #endif /* CONFIG_RPS */
3220 
3221 /*
3222  * Check if this softnet_data structure belongs to another CPU.
3223  * If yes, queue it to our IPI list and return 1.
3224  * If no, return 0.
3225  */
3226 static int rps_ipi_queued(struct softnet_data *sd)
3227 {
3228 #ifdef CONFIG_RPS
3229 	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3230 
3231 	if (sd != mysd) {
3232 		sd->rps_ipi_next = mysd->rps_ipi_list;
3233 		mysd->rps_ipi_list = sd;
3234 
3235 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3236 		return 1;
3237 	}
3238 #endif /* CONFIG_RPS */
3239 	return 0;
3240 }
3241 
3242 #ifdef CONFIG_NET_FLOW_LIMIT
3243 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3244 #endif
3245 
3246 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3247 {
3248 #ifdef CONFIG_NET_FLOW_LIMIT
3249 	struct sd_flow_limit *fl;
3250 	struct softnet_data *sd;
3251 	unsigned int old_flow, new_flow;
3252 
3253 	if (qlen < (netdev_max_backlog >> 1))
3254 		return false;
3255 
3256 	sd = this_cpu_ptr(&softnet_data);
3257 
3258 	rcu_read_lock();
3259 	fl = rcu_dereference(sd->flow_limit);
3260 	if (fl) {
3261 		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3262 		old_flow = fl->history[fl->history_head];
3263 		fl->history[fl->history_head] = new_flow;
3264 
3265 		fl->history_head++;
3266 		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3267 
3268 		if (likely(fl->buckets[old_flow]))
3269 			fl->buckets[old_flow]--;
3270 
3271 		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3272 			fl->count++;
3273 			rcu_read_unlock();
3274 			return true;
3275 		}
3276 	}
3277 	rcu_read_unlock();
3278 #endif
3279 	return false;
3280 }
3281 
3282 /*
3283  * enqueue_to_backlog is called to queue an skb on a per-CPU backlog
3284  * queue (may be a remote CPU queue).
3285  */
3286 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3287 			      unsigned int *qtail)
3288 {
3289 	struct softnet_data *sd;
3290 	unsigned long flags;
3291 	unsigned int qlen;
3292 
3293 	sd = &per_cpu(softnet_data, cpu);
3294 
3295 	local_irq_save(flags);
3296 
3297 	rps_lock(sd);
3298 	qlen = skb_queue_len(&sd->input_pkt_queue);
3299 	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3300 		if (qlen) {
3301 enqueue:
3302 			__skb_queue_tail(&sd->input_pkt_queue, skb);
3303 			input_queue_tail_incr_save(sd, qtail);
3304 			rps_unlock(sd);
3305 			local_irq_restore(flags);
3306 			return NET_RX_SUCCESS;
3307 		}
3308 
3309 		/* Schedule NAPI for backlog device
3310 		 * We can use a non-atomic operation since we own the queue lock
3311 		 */
3312 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3313 			if (!rps_ipi_queued(sd))
3314 				____napi_schedule(sd, &sd->backlog);
3315 		}
3316 		goto enqueue;
3317 	}
3318 
3319 	sd->dropped++;
3320 	rps_unlock(sd);
3321 
3322 	local_irq_restore(flags);
3323 
3324 	atomic_long_inc(&skb->dev->rx_dropped);
3325 	kfree_skb(skb);
3326 	return NET_RX_DROP;
3327 }
3328 
3329 static int netif_rx_internal(struct sk_buff *skb)
3330 {
3331 	int ret;
3332 
3333 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3334 
3335 	trace_netif_rx(skb);
3336 #ifdef CONFIG_RPS
3337 	if (static_key_false(&rps_needed)) {
3338 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3339 		int cpu;
3340 
3341 		preempt_disable();
3342 		rcu_read_lock();
3343 
3344 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3345 		if (cpu < 0)
3346 			cpu = smp_processor_id();
3347 
3348 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3349 
3350 		rcu_read_unlock();
3351 		preempt_enable();
3352 	} else
3353 #endif
3354 	{
3355 		unsigned int qtail;
3356 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3357 		put_cpu();
3358 	}
3359 	return ret;
3360 }
3361 
3362 /**
3363  *	netif_rx	-	post buffer to the network code
3364  *	@skb: buffer to post
3365  *
3366  *	This function receives a packet from a device driver and queues it for
3367  *	the upper (protocol) levels to process.  It always succeeds. The buffer
3368  *	may be dropped during processing for congestion control or by the
3369  *	protocol layers.
3370  *
3371  *	return values:
3372  *	NET_RX_SUCCESS	(no congestion)
3373  *	NET_RX_DROP     (packet was dropped)
3374  *
3375  */
3376 
3377 int netif_rx(struct sk_buff *skb)
3378 {
3379 	trace_netif_rx_entry(skb);
3380 
3381 	return netif_rx_internal(skb);
3382 }
3383 EXPORT_SYMBOL(netif_rx);
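/*
 * Example (sketch): a legacy, non-NAPI driver posting a received frame from
 * its interrupt handler.  example_rx_frame() is a hypothetical helper that
 * copies the frame out of the hardware into a freshly allocated skb.
 *
 *	static irqreturn_t example_isr(int irq, void *dev_id)
 *	{
 *		struct net_device *dev = dev_id;
 *		struct sk_buff *skb = example_rx_frame(dev);
 *
 *		if (skb) {
 *			skb->protocol = eth_type_trans(skb, dev);
 *			netif_rx(skb);
 *		}
 *		return IRQ_HANDLED;
 *	}
 */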
3384 
3385 int netif_rx_ni(struct sk_buff *skb)
3386 {
3387 	int err;
3388 
3389 	trace_netif_rx_ni_entry(skb);
3390 
3391 	preempt_disable();
3392 	err = netif_rx_internal(skb);
3393 	if (local_softirq_pending())
3394 		do_softirq();
3395 	preempt_enable();
3396 
3397 	return err;
3398 }
3399 EXPORT_SYMBOL(netif_rx_ni);
3400 
3401 static void net_tx_action(struct softirq_action *h)
3402 {
3403 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3404 
3405 	if (sd->completion_queue) {
3406 		struct sk_buff *clist;
3407 
3408 		local_irq_disable();
3409 		clist = sd->completion_queue;
3410 		sd->completion_queue = NULL;
3411 		local_irq_enable();
3412 
3413 		while (clist) {
3414 			struct sk_buff *skb = clist;
3415 			clist = clist->next;
3416 
3417 			WARN_ON(atomic_read(&skb->users));
3418 			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3419 				trace_consume_skb(skb);
3420 			else
3421 				trace_kfree_skb(skb, net_tx_action);
3422 			__kfree_skb(skb);
3423 		}
3424 	}
3425 
3426 	if (sd->output_queue) {
3427 		struct Qdisc *head;
3428 
3429 		local_irq_disable();
3430 		head = sd->output_queue;
3431 		sd->output_queue = NULL;
3432 		sd->output_queue_tailp = &sd->output_queue;
3433 		local_irq_enable();
3434 
3435 		while (head) {
3436 			struct Qdisc *q = head;
3437 			spinlock_t *root_lock;
3438 
3439 			head = head->next_sched;
3440 
3441 			root_lock = qdisc_lock(q);
3442 			if (spin_trylock(root_lock)) {
3443 				smp_mb__before_atomic();
3444 				clear_bit(__QDISC_STATE_SCHED,
3445 					  &q->state);
3446 				qdisc_run(q);
3447 				spin_unlock(root_lock);
3448 			} else {
3449 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3450 					      &q->state)) {
3451 					__netif_reschedule(q);
3452 				} else {
3453 					smp_mb__before_atomic();
3454 					clear_bit(__QDISC_STATE_SCHED,
3455 						  &q->state);
3456 				}
3457 			}
3458 		}
3459 	}
3460 }
3461 
3462 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3463     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3464 /* This hook is defined here for ATM LANE */
3465 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3466 			     unsigned char *addr) __read_mostly;
3467 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3468 #endif
3469 
3470 #ifdef CONFIG_NET_CLS_ACT
3471 /* TODO: Maybe we should just force sch_ingress to be compiled in
3472  * when CONFIG_NET_CLS_ACT is? Otherwise we currently pay for some
3473  * useless instructions (a compare and two extra stores) when the
3474  * ingress scheduler is not enabled but CONFIG_NET_CLS_ACT is.
3475  * NOTE: This doesn't stop any functionality; if you don't have
3476  * the ingress scheduler, you just can't add policies on ingress.
3477  *
3478  */
3479 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3480 {
3481 	struct net_device *dev = skb->dev;
3482 	u32 ttl = G_TC_RTTL(skb->tc_verd);
3483 	int result = TC_ACT_OK;
3484 	struct Qdisc *q;
3485 
3486 	if (unlikely(MAX_RED_LOOP < ttl++)) {
3487 		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3488 				     skb->skb_iif, dev->ifindex);
3489 		return TC_ACT_SHOT;
3490 	}
3491 
3492 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3493 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3494 
3495 	q = rcu_dereference(rxq->qdisc);
3496 	if (q != &noop_qdisc) {
3497 		spin_lock(qdisc_lock(q));
3498 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3499 			result = qdisc_enqueue_root(skb, q);
3500 		spin_unlock(qdisc_lock(q));
3501 	}
3502 
3503 	return result;
3504 }
3505 
3506 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3507 					 struct packet_type **pt_prev,
3508 					 int *ret, struct net_device *orig_dev)
3509 {
3510 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3511 
3512 	if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3513 		goto out;
3514 
3515 	if (*pt_prev) {
3516 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3517 		*pt_prev = NULL;
3518 	}
3519 
3520 	switch (ing_filter(skb, rxq)) {
3521 	case TC_ACT_SHOT:
3522 	case TC_ACT_STOLEN:
3523 		kfree_skb(skb);
3524 		return NULL;
3525 	}
3526 
3527 out:
3528 	skb->tc_verd = 0;
3529 	return skb;
3530 }
3531 #endif
3532 
3533 /**
3534  *	netdev_rx_handler_register - register receive handler
3535  *	@dev: device to register a handler for
3536  *	@rx_handler: receive handler to register
3537  *	@rx_handler_data: data pointer that is used by rx handler
3538  *
3539  *	Register a receive handler for a device. This handler will then be
3540  *	called from __netif_receive_skb. A negative errno code is returned
3541  *	on a failure.
3542  *
3543  *	The caller must hold the rtnl_mutex.
3544  *
3545  *	For a general description of rx_handler, see enum rx_handler_result.
3546  */
3547 int netdev_rx_handler_register(struct net_device *dev,
3548 			       rx_handler_func_t *rx_handler,
3549 			       void *rx_handler_data)
3550 {
3551 	ASSERT_RTNL();
3552 
3553 	if (dev->rx_handler)
3554 		return -EBUSY;
3555 
3556 	/* Note: rx_handler_data must be set before rx_handler */
3557 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3558 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3559 
3560 	return 0;
3561 }
3562 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
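/*
 * Example (sketch): how a bridge/bond-like master might claim a port's
 * receive path.  example_handle_frame(), example_process() and the
 * example_port structure are hypothetical; the registration itself must be
 * done under rtnl_lock().
 *
 *	static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
 *	{
 *		struct sk_buff *skb = *pskb;
 *		struct example_port *port =
 *			rcu_dereference(skb->dev->rx_handler_data);
 *
 *		example_process(port, skb);
 *		return RX_HANDLER_CONSUMED;
 *	}
 *
 *	err = netdev_rx_handler_register(port_dev, example_handle_frame, port);
 */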
3563 
3564 /**
3565  *	netdev_rx_handler_unregister - unregister receive handler
3566  *	@dev: device to unregister a handler from
3567  *
3568  *	Unregister a receive handler from a device.
3569  *
3570  *	The caller must hold the rtnl_mutex.
3571  */
3572 void netdev_rx_handler_unregister(struct net_device *dev)
3573 {
3574 
3575 	ASSERT_RTNL();
3576 	RCU_INIT_POINTER(dev->rx_handler, NULL);
3577 	/* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
3578 	 * section is guaranteed to see a non-NULL rx_handler_data
3579 	 * as well.
3580 	 */
3581 	synchronize_net();
3582 	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3583 }
3584 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3585 
3586 /*
3587  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3588  * the special handling of PFMEMALLOC skbs.
3589  */
3590 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3591 {
3592 	switch (skb->protocol) {
3593 	case htons(ETH_P_ARP):
3594 	case htons(ETH_P_IP):
3595 	case htons(ETH_P_IPV6):
3596 	case htons(ETH_P_8021Q):
3597 	case htons(ETH_P_8021AD):
3598 		return true;
3599 	default:
3600 		return false;
3601 	}
3602 }
3603 
3604 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3605 {
3606 	struct packet_type *ptype, *pt_prev;
3607 	rx_handler_func_t *rx_handler;
3608 	struct net_device *orig_dev;
3609 	struct net_device *null_or_dev;
3610 	bool deliver_exact = false;
3611 	int ret = NET_RX_DROP;
3612 	__be16 type;
3613 
3614 	net_timestamp_check(!netdev_tstamp_prequeue, skb);
3615 
3616 	trace_netif_receive_skb(skb);
3617 
3618 	orig_dev = skb->dev;
3619 
3620 	skb_reset_network_header(skb);
3621 	if (!skb_transport_header_was_set(skb))
3622 		skb_reset_transport_header(skb);
3623 	skb_reset_mac_len(skb);
3624 
3625 	pt_prev = NULL;
3626 
3627 	rcu_read_lock();
3628 
3629 another_round:
3630 	skb->skb_iif = skb->dev->ifindex;
3631 
3632 	__this_cpu_inc(softnet_data.processed);
3633 
3634 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3635 	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3636 		skb = skb_vlan_untag(skb);
3637 		if (unlikely(!skb))
3638 			goto unlock;
3639 	}
3640 
3641 #ifdef CONFIG_NET_CLS_ACT
3642 	if (skb->tc_verd & TC_NCLS) {
3643 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3644 		goto ncls;
3645 	}
3646 #endif
3647 
3648 	if (pfmemalloc)
3649 		goto skip_taps;
3650 
3651 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3652 		if (!ptype->dev || ptype->dev == skb->dev) {
3653 			if (pt_prev)
3654 				ret = deliver_skb(skb, pt_prev, orig_dev);
3655 			pt_prev = ptype;
3656 		}
3657 	}
3658 
3659 skip_taps:
3660 #ifdef CONFIG_NET_CLS_ACT
3661 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3662 	if (!skb)
3663 		goto unlock;
3664 ncls:
3665 #endif
3666 
3667 	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3668 		goto drop;
3669 
3670 	if (vlan_tx_tag_present(skb)) {
3671 		if (pt_prev) {
3672 			ret = deliver_skb(skb, pt_prev, orig_dev);
3673 			pt_prev = NULL;
3674 		}
3675 		if (vlan_do_receive(&skb))
3676 			goto another_round;
3677 		else if (unlikely(!skb))
3678 			goto unlock;
3679 	}
3680 
3681 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3682 	if (rx_handler) {
3683 		if (pt_prev) {
3684 			ret = deliver_skb(skb, pt_prev, orig_dev);
3685 			pt_prev = NULL;
3686 		}
3687 		switch (rx_handler(&skb)) {
3688 		case RX_HANDLER_CONSUMED:
3689 			ret = NET_RX_SUCCESS;
3690 			goto unlock;
3691 		case RX_HANDLER_ANOTHER:
3692 			goto another_round;
3693 		case RX_HANDLER_EXACT:
3694 			deliver_exact = true;
3695 		case RX_HANDLER_PASS:
3696 			break;
3697 		default:
3698 			BUG();
3699 		}
3700 	}
3701 
3702 	if (unlikely(vlan_tx_tag_present(skb))) {
3703 		if (vlan_tx_tag_get_id(skb))
3704 			skb->pkt_type = PACKET_OTHERHOST;
3705 		/* Note: we might in the future use prio bits
3706 		 * and set skb->priority like in vlan_do_receive()
3707 		 * For the time being, just ignore Priority Code Point
3708 		 */
3709 		skb->vlan_tci = 0;
3710 	}
3711 
3712 	/* deliver only exact match when indicated */
3713 	null_or_dev = deliver_exact ? skb->dev : NULL;
3714 
3715 	type = skb->protocol;
3716 	list_for_each_entry_rcu(ptype,
3717 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3718 		if (ptype->type == type &&
3719 		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3720 		     ptype->dev == orig_dev)) {
3721 			if (pt_prev)
3722 				ret = deliver_skb(skb, pt_prev, orig_dev);
3723 			pt_prev = ptype;
3724 		}
3725 	}
3726 
3727 	if (pt_prev) {
3728 		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3729 			goto drop;
3730 		else
3731 			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3732 	} else {
3733 drop:
3734 		atomic_long_inc(&skb->dev->rx_dropped);
3735 		kfree_skb(skb);
3736 		/* Jamal, now you will not be able to escape explaining
3737 		 * to me how you were going to use this. :-)
3738 		 */
3739 		ret = NET_RX_DROP;
3740 	}
3741 
3742 unlock:
3743 	rcu_read_unlock();
3744 	return ret;
3745 }
3746 
3747 static int __netif_receive_skb(struct sk_buff *skb)
3748 {
3749 	int ret;
3750 
3751 	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3752 		unsigned long pflags = current->flags;
3753 
3754 		/*
3755 		 * PFMEMALLOC skbs are special, they should
3756 		 * - be delivered to SOCK_MEMALLOC sockets only
3757 		 * - stay away from userspace
3758 		 * - have bounded memory usage
3759 		 *
3760 		 * Use PF_MEMALLOC as this saves us from propagating the allocation
3761 		 * context down to all allocation sites.
3762 		 */
3763 		current->flags |= PF_MEMALLOC;
3764 		ret = __netif_receive_skb_core(skb, true);
3765 		tsk_restore_flags(current, pflags, PF_MEMALLOC);
3766 	} else
3767 		ret = __netif_receive_skb_core(skb, false);
3768 
3769 	return ret;
3770 }
3771 
3772 static int netif_receive_skb_internal(struct sk_buff *skb)
3773 {
3774 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3775 
3776 	if (skb_defer_rx_timestamp(skb))
3777 		return NET_RX_SUCCESS;
3778 
3779 #ifdef CONFIG_RPS
3780 	if (static_key_false(&rps_needed)) {
3781 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3782 		int cpu, ret;
3783 
3784 		rcu_read_lock();
3785 
3786 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3787 
3788 		if (cpu >= 0) {
3789 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3790 			rcu_read_unlock();
3791 			return ret;
3792 		}
3793 		rcu_read_unlock();
3794 	}
3795 #endif
3796 	return __netif_receive_skb(skb);
3797 }
3798 
3799 /**
3800  *	netif_receive_skb - process receive buffer from network
3801  *	@skb: buffer to process
3802  *
3803  *	netif_receive_skb() is the main receive data processing function.
3804  *	It always succeeds. The buffer may be dropped during processing
3805  *	for congestion control or by the protocol layers.
3806  *
3807  *	This function may only be called from softirq context and interrupts
3808  *	should be enabled.
3809  *
3810  *	Return values (usually ignored):
3811  *	NET_RX_SUCCESS: no congestion
3812  *	NET_RX_DROP: packet was dropped
3813  */
3814 int netif_receive_skb(struct sk_buff *skb)
3815 {
3816 	trace_netif_receive_skb_entry(skb);
3817 
3818 	return netif_receive_skb_internal(skb);
3819 }
3820 EXPORT_SYMBOL(netif_receive_skb);
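/*
 * Illustrative sketch (not part of dev.c, example only): a minimal driver
 * receive path handing one frame to the stack with netif_receive_skb().
 * my_rx_one(), "data" and "len" are hypothetical driver-side names.
 */
#if 0
static void my_rx_one(struct net_device *dev, const void *data, unsigned int len)
{
	struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);

	if (unlikely(!skb)) {
		dev->stats.rx_dropped++;
		return;
	}
	memcpy(skb_put(skb, len), data, len);		/* copy the frame payload */
	skb->protocol = eth_type_trans(skb, dev);	/* sets pkt_type, pulls MAC header */
	netif_receive_skb(skb);				/* must run in softirq context */
}
#endif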
3821 
3822 /* Network device is going away; flush any packets still pending.
3823  * Called with irqs disabled.
3824  */
3825 static void flush_backlog(void *arg)
3826 {
3827 	struct net_device *dev = arg;
3828 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3829 	struct sk_buff *skb, *tmp;
3830 
3831 	rps_lock(sd);
3832 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3833 		if (skb->dev == dev) {
3834 			__skb_unlink(skb, &sd->input_pkt_queue);
3835 			kfree_skb(skb);
3836 			input_queue_head_incr(sd);
3837 		}
3838 	}
3839 	rps_unlock(sd);
3840 
3841 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3842 		if (skb->dev == dev) {
3843 			__skb_unlink(skb, &sd->process_queue);
3844 			kfree_skb(skb);
3845 			input_queue_head_incr(sd);
3846 		}
3847 	}
3848 }
3849 
3850 static int napi_gro_complete(struct sk_buff *skb)
3851 {
3852 	struct packet_offload *ptype;
3853 	__be16 type = skb->protocol;
3854 	struct list_head *head = &offload_base;
3855 	int err = -ENOENT;
3856 
3857 	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3858 
3859 	if (NAPI_GRO_CB(skb)->count == 1) {
3860 		skb_shinfo(skb)->gso_size = 0;
3861 		goto out;
3862 	}
3863 
3864 	rcu_read_lock();
3865 	list_for_each_entry_rcu(ptype, head, list) {
3866 		if (ptype->type != type || !ptype->callbacks.gro_complete)
3867 			continue;
3868 
3869 		err = ptype->callbacks.gro_complete(skb, 0);
3870 		break;
3871 	}
3872 	rcu_read_unlock();
3873 
3874 	if (err) {
3875 		WARN_ON(&ptype->list == head);
3876 		kfree_skb(skb);
3877 		return NET_RX_SUCCESS;
3878 	}
3879 
3880 out:
3881 	return netif_receive_skb_internal(skb);
3882 }
3883 
3884 /* napi->gro_list contains packets ordered by age.
3885  * The youngest packets are at the head of it.
3886  * Complete skbs in reverse order to reduce latencies.
3887  */
3888 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3889 {
3890 	struct sk_buff *skb, *prev = NULL;
3891 
3892 	/* scan list and build reverse chain */
3893 	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3894 		skb->prev = prev;
3895 		prev = skb;
3896 	}
3897 
3898 	for (skb = prev; skb; skb = prev) {
3899 		skb->next = NULL;
3900 
3901 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3902 			return;
3903 
3904 		prev = skb->prev;
3905 		napi_gro_complete(skb);
3906 		napi->gro_count--;
3907 	}
3908 
3909 	napi->gro_list = NULL;
3910 }
3911 EXPORT_SYMBOL(napi_gro_flush);
3912 
3913 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3914 {
3915 	struct sk_buff *p;
3916 	unsigned int maclen = skb->dev->hard_header_len;
3917 	u32 hash = skb_get_hash_raw(skb);
3918 
3919 	for (p = napi->gro_list; p; p = p->next) {
3920 		unsigned long diffs;
3921 
3922 		NAPI_GRO_CB(p)->flush = 0;
3923 
3924 		if (hash != skb_get_hash_raw(p)) {
3925 			NAPI_GRO_CB(p)->same_flow = 0;
3926 			continue;
3927 		}
3928 
3929 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3930 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3931 		if (maclen == ETH_HLEN)
3932 			diffs |= compare_ether_header(skb_mac_header(p),
3933 						      skb_mac_header(skb));
3934 		else if (!diffs)
3935 			diffs = memcmp(skb_mac_header(p),
3936 				       skb_mac_header(skb),
3937 				       maclen);
3938 		NAPI_GRO_CB(p)->same_flow = !diffs;
3939 	}
3940 }
3941 
3942 static void skb_gro_reset_offset(struct sk_buff *skb)
3943 {
3944 	const struct skb_shared_info *pinfo = skb_shinfo(skb);
3945 	const skb_frag_t *frag0 = &pinfo->frags[0];
3946 
3947 	NAPI_GRO_CB(skb)->data_offset = 0;
3948 	NAPI_GRO_CB(skb)->frag0 = NULL;
3949 	NAPI_GRO_CB(skb)->frag0_len = 0;
3950 
3951 	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3952 	    pinfo->nr_frags &&
3953 	    !PageHighMem(skb_frag_page(frag0))) {
3954 		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3955 		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3956 	}
3957 }
3958 
3959 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3960 {
3961 	struct skb_shared_info *pinfo = skb_shinfo(skb);
3962 
3963 	BUG_ON(skb->end - skb->tail < grow);
3964 
3965 	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3966 
3967 	skb->data_len -= grow;
3968 	skb->tail += grow;
3969 
3970 	pinfo->frags[0].page_offset += grow;
3971 	skb_frag_size_sub(&pinfo->frags[0], grow);
3972 
3973 	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3974 		skb_frag_unref(skb, 0);
3975 		memmove(pinfo->frags, pinfo->frags + 1,
3976 			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
3977 	}
3978 }
3979 
3980 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3981 {
3982 	struct sk_buff **pp = NULL;
3983 	struct packet_offload *ptype;
3984 	__be16 type = skb->protocol;
3985 	struct list_head *head = &offload_base;
3986 	int same_flow;
3987 	enum gro_result ret;
3988 	int grow;
3989 
3990 	if (!(skb->dev->features & NETIF_F_GRO))
3991 		goto normal;
3992 
3993 	if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
3994 		goto normal;
3995 
3996 	gro_list_prepare(napi, skb);
3997 
3998 	rcu_read_lock();
3999 	list_for_each_entry_rcu(ptype, head, list) {
4000 		if (ptype->type != type || !ptype->callbacks.gro_receive)
4001 			continue;
4002 
4003 		skb_set_network_header(skb, skb_gro_offset(skb));
4004 		skb_reset_mac_len(skb);
4005 		NAPI_GRO_CB(skb)->same_flow = 0;
4006 		NAPI_GRO_CB(skb)->flush = 0;
4007 		NAPI_GRO_CB(skb)->free = 0;
4008 		NAPI_GRO_CB(skb)->udp_mark = 0;
4009 
4010 		/* Setup for GRO checksum validation */
4011 		switch (skb->ip_summed) {
4012 		case CHECKSUM_COMPLETE:
4013 			NAPI_GRO_CB(skb)->csum = skb->csum;
4014 			NAPI_GRO_CB(skb)->csum_valid = 1;
4015 			NAPI_GRO_CB(skb)->csum_cnt = 0;
4016 			break;
4017 		case CHECKSUM_UNNECESSARY:
4018 			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4019 			NAPI_GRO_CB(skb)->csum_valid = 0;
4020 			break;
4021 		default:
4022 			NAPI_GRO_CB(skb)->csum_cnt = 0;
4023 			NAPI_GRO_CB(skb)->csum_valid = 0;
4024 		}
4025 
4026 		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4027 		break;
4028 	}
4029 	rcu_read_unlock();
4030 
4031 	if (&ptype->list == head)
4032 		goto normal;
4033 
4034 	same_flow = NAPI_GRO_CB(skb)->same_flow;
4035 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4036 
4037 	if (pp) {
4038 		struct sk_buff *nskb = *pp;
4039 
4040 		*pp = nskb->next;
4041 		nskb->next = NULL;
4042 		napi_gro_complete(nskb);
4043 		napi->gro_count--;
4044 	}
4045 
4046 	if (same_flow)
4047 		goto ok;
4048 
4049 	if (NAPI_GRO_CB(skb)->flush)
4050 		goto normal;
4051 
4052 	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4053 		struct sk_buff *nskb = napi->gro_list;
4054 
4055 		/* locate the end of the list to select the 'oldest' flow */
4056 		while (nskb->next) {
4057 			pp = &nskb->next;
4058 			nskb = *pp;
4059 		}
4060 		*pp = NULL;
4061 		nskb->next = NULL;
4062 		napi_gro_complete(nskb);
4063 	} else {
4064 		napi->gro_count++;
4065 	}
4066 	NAPI_GRO_CB(skb)->count = 1;
4067 	NAPI_GRO_CB(skb)->age = jiffies;
4068 	NAPI_GRO_CB(skb)->last = skb;
4069 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4070 	skb->next = napi->gro_list;
4071 	napi->gro_list = skb;
4072 	ret = GRO_HELD;
4073 
4074 pull:
4075 	grow = skb_gro_offset(skb) - skb_headlen(skb);
4076 	if (grow > 0)
4077 		gro_pull_from_frag0(skb, grow);
4078 ok:
4079 	return ret;
4080 
4081 normal:
4082 	ret = GRO_NORMAL;
4083 	goto pull;
4084 }
4085 
4086 struct packet_offload *gro_find_receive_by_type(__be16 type)
4087 {
4088 	struct list_head *offload_head = &offload_base;
4089 	struct packet_offload *ptype;
4090 
4091 	list_for_each_entry_rcu(ptype, offload_head, list) {
4092 		if (ptype->type != type || !ptype->callbacks.gro_receive)
4093 			continue;
4094 		return ptype;
4095 	}
4096 	return NULL;
4097 }
4098 EXPORT_SYMBOL(gro_find_receive_by_type);
4099 
4100 struct packet_offload *gro_find_complete_by_type(__be16 type)
4101 {
4102 	struct list_head *offload_head = &offload_base;
4103 	struct packet_offload *ptype;
4104 
4105 	list_for_each_entry_rcu(ptype, offload_head, list) {
4106 		if (ptype->type != type || !ptype->callbacks.gro_complete)
4107 			continue;
4108 		return ptype;
4109 	}
4110 	return NULL;
4111 }
4112 EXPORT_SYMBOL(gro_find_complete_by_type);
4113 
4114 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4115 {
4116 	switch (ret) {
4117 	case GRO_NORMAL:
4118 		if (netif_receive_skb_internal(skb))
4119 			ret = GRO_DROP;
4120 		break;
4121 
4122 	case GRO_DROP:
4123 		kfree_skb(skb);
4124 		break;
4125 
4126 	case GRO_MERGED_FREE:
4127 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4128 			kmem_cache_free(skbuff_head_cache, skb);
4129 		else
4130 			__kfree_skb(skb);
4131 		break;
4132 
4133 	case GRO_HELD:
4134 	case GRO_MERGED:
4135 		break;
4136 	}
4137 
4138 	return ret;
4139 }
4140 
4141 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4142 {
4143 	trace_napi_gro_receive_entry(skb);
4144 
4145 	skb_gro_reset_offset(skb);
4146 
4147 	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4148 }
4149 EXPORT_SYMBOL(napi_gro_receive);
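/*
 * Illustrative sketch (not part of dev.c, example only): inside its NAPI
 * poll routine a GRO-capable driver hands completed skbs to
 * napi_gro_receive() instead of calling netif_receive_skb() directly.
 * "priv" and its members are hypothetical driver-private names.
 */
#if 0
	skb->protocol = eth_type_trans(skb, priv->netdev);
	napi_gro_receive(&priv->napi, skb);	/* may merge, hold or deliver the skb */
#endif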
4150 
4151 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4152 {
4153 	if (unlikely(skb->pfmemalloc)) {
4154 		consume_skb(skb);
4155 		return;
4156 	}
4157 	__skb_pull(skb, skb_headlen(skb));
4158 	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
4159 	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4160 	skb->vlan_tci = 0;
4161 	skb->dev = napi->dev;
4162 	skb->skb_iif = 0;
4163 	skb->encapsulation = 0;
4164 	skb_shinfo(skb)->gso_type = 0;
4165 	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4166 
4167 	napi->skb = skb;
4168 }
4169 
4170 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4171 {
4172 	struct sk_buff *skb = napi->skb;
4173 
4174 	if (!skb) {
4175 		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4176 		napi->skb = skb;
4177 	}
4178 	return skb;
4179 }
4180 EXPORT_SYMBOL(napi_get_frags);
4181 
4182 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4183 				      struct sk_buff *skb,
4184 				      gro_result_t ret)
4185 {
4186 	switch (ret) {
4187 	case GRO_NORMAL:
4188 	case GRO_HELD:
4189 		__skb_push(skb, ETH_HLEN);
4190 		skb->protocol = eth_type_trans(skb, skb->dev);
4191 		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4192 			ret = GRO_DROP;
4193 		break;
4194 
4195 	case GRO_DROP:
4196 	case GRO_MERGED_FREE:
4197 		napi_reuse_skb(napi, skb);
4198 		break;
4199 
4200 	case GRO_MERGED:
4201 		break;
4202 	}
4203 
4204 	return ret;
4205 }
4206 
4207 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4208  * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
4209  * we copy the Ethernet header into skb->data to have a common layout.
4210  */
4211 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4212 {
4213 	struct sk_buff *skb = napi->skb;
4214 	const struct ethhdr *eth;
4215 	unsigned int hlen = sizeof(*eth);
4216 
4217 	napi->skb = NULL;
4218 
4219 	skb_reset_mac_header(skb);
4220 	skb_gro_reset_offset(skb);
4221 
4222 	eth = skb_gro_header_fast(skb, 0);
4223 	if (unlikely(skb_gro_header_hard(skb, hlen))) {
4224 		eth = skb_gro_header_slow(skb, hlen, 0);
4225 		if (unlikely(!eth)) {
4226 			napi_reuse_skb(napi, skb);
4227 			return NULL;
4228 		}
4229 	} else {
4230 		gro_pull_from_frag0(skb, hlen);
4231 		NAPI_GRO_CB(skb)->frag0 += hlen;
4232 		NAPI_GRO_CB(skb)->frag0_len -= hlen;
4233 	}
4234 	__skb_pull(skb, hlen);
4235 
4236 	/*
4237 	 * This works because the only protocols we care about don't require
4238 	 * special handling.
4239 	 * We'll fix it up properly in napi_frags_finish()
4240 	 */
4241 	skb->protocol = eth->h_proto;
4242 
4243 	return skb;
4244 }
4245 
4246 gro_result_t napi_gro_frags(struct napi_struct *napi)
4247 {
4248 	struct sk_buff *skb = napi_frags_skb(napi);
4249 
4250 	if (!skb)
4251 		return GRO_DROP;
4252 
4253 	trace_napi_gro_frags_entry(skb);
4254 
4255 	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4256 }
4257 EXPORT_SYMBOL(napi_gro_frags);
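/*
 * Illustrative sketch (not part of dev.c, example only): the frag-based GRO
 * path.  The driver attaches its receive page to the skb returned by
 * napi_get_frags() and then calls napi_gro_frags(), which pulls the Ethernet
 * header itself.  "page", "offset" and "len" are hypothetical, and the
 * truesize accounting is driver specific.
 */
#if 0
	struct sk_buff *skb = napi_get_frags(napi);

	if (unlikely(!skb))
		return;					/* drop: no skb available */
	skb_fill_page_desc(skb, 0, page, offset, len);
	skb->len += len;
	skb->data_len += len;
	skb->truesize += PAGE_SIZE;
	napi_gro_frags(napi);				/* consumes or recycles napi->skb */
#endif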
4258 
4259 /* Compute the checksum from gro_offset and return the folded value
4260  * after adding in any pseudo checksum.
4261  */
4262 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4263 {
4264 	__wsum wsum;
4265 	__sum16 sum;
4266 
4267 	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4268 
4269 	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4270 	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4271 	if (likely(!sum)) {
4272 		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4273 		    !skb->csum_complete_sw)
4274 			netdev_rx_csum_fault(skb->dev);
4275 	}
4276 
4277 	NAPI_GRO_CB(skb)->csum = wsum;
4278 	NAPI_GRO_CB(skb)->csum_valid = 1;
4279 
4280 	return sum;
4281 }
4282 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4283 
4284 /*
4285  * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
4286  * Note: called with local irq disabled, but exits with local irq enabled.
4287  */
4288 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4289 {
4290 #ifdef CONFIG_RPS
4291 	struct softnet_data *remsd = sd->rps_ipi_list;
4292 
4293 	if (remsd) {
4294 		sd->rps_ipi_list = NULL;
4295 
4296 		local_irq_enable();
4297 
4298 		/* Send pending IPIs to kick RPS processing on remote cpus. */
4299 		while (remsd) {
4300 			struct softnet_data *next = remsd->rps_ipi_next;
4301 
4302 			if (cpu_online(remsd->cpu))
4303 				smp_call_function_single_async(remsd->cpu,
4304 							   &remsd->csd);
4305 			remsd = next;
4306 		}
4307 	} else
4308 #endif
4309 		local_irq_enable();
4310 }
4311 
4312 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4313 {
4314 #ifdef CONFIG_RPS
4315 	return sd->rps_ipi_list != NULL;
4316 #else
4317 	return false;
4318 #endif
4319 }
4320 
4321 static int process_backlog(struct napi_struct *napi, int quota)
4322 {
4323 	int work = 0;
4324 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4325 
4326 	/* Check if we have pending IPIs; it's better to send them now
4327 	 * than to wait for net_rx_action() to end.
4328 	 */
4329 	if (sd_has_rps_ipi_waiting(sd)) {
4330 		local_irq_disable();
4331 		net_rps_action_and_irq_enable(sd);
4332 	}
4333 
4334 	napi->weight = weight_p;
4335 	local_irq_disable();
4336 	while (1) {
4337 		struct sk_buff *skb;
4338 
4339 		while ((skb = __skb_dequeue(&sd->process_queue))) {
4340 			local_irq_enable();
4341 			__netif_receive_skb(skb);
4342 			local_irq_disable();
4343 			input_queue_head_incr(sd);
4344 			if (++work >= quota) {
4345 				local_irq_enable();
4346 				return work;
4347 			}
4348 		}
4349 
4350 		rps_lock(sd);
4351 		if (skb_queue_empty(&sd->input_pkt_queue)) {
4352 			/*
4353 			 * Inline a custom version of __napi_complete().
4354 			 * Only the current cpu owns and manipulates this napi,
4355 			 * and NAPI_STATE_SCHED is the only possible flag set
4356 			 * on backlog.
4357 			 * We can use a plain write instead of clear_bit(),
4358 			 * and we don't need an smp_mb() memory barrier.
4359 			 */
4360 			napi->state = 0;
4361 			rps_unlock(sd);
4362 
4363 			break;
4364 		}
4365 
4366 		skb_queue_splice_tail_init(&sd->input_pkt_queue,
4367 					   &sd->process_queue);
4368 		rps_unlock(sd);
4369 	}
4370 	local_irq_enable();
4371 
4372 	return work;
4373 }
4374 
4375 /**
4376  * __napi_schedule - schedule for receive
4377  * @n: entry to schedule
4378  *
4379  * The entry's receive function will be scheduled to run.
4380  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4381  */
4382 void __napi_schedule(struct napi_struct *n)
4383 {
4384 	unsigned long flags;
4385 
4386 	local_irq_save(flags);
4387 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4388 	local_irq_restore(flags);
4389 }
4390 EXPORT_SYMBOL(__napi_schedule);
4391 
4392 /**
4393  * __napi_schedule_irqoff - schedule for receive
4394  * @n: entry to schedule
4395  *
4396  * Variant of __napi_schedule() assuming hard irqs are masked
4397  */
4398 void __napi_schedule_irqoff(struct napi_struct *n)
4399 {
4400 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4401 }
4402 EXPORT_SYMBOL(__napi_schedule_irqoff);
4403 
4404 void __napi_complete(struct napi_struct *n)
4405 {
4406 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4407 
4408 	list_del_init(&n->poll_list);
4409 	smp_mb__before_atomic();
4410 	clear_bit(NAPI_STATE_SCHED, &n->state);
4411 }
4412 EXPORT_SYMBOL(__napi_complete);
4413 
4414 void napi_complete_done(struct napi_struct *n, int work_done)
4415 {
4416 	unsigned long flags;
4417 
4418 	/*
4419 	 * Don't let napi dequeue from the cpu poll list,
4420 	 * just in case it's running on a different cpu.
4421 	 */
4422 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4423 		return;
4424 
4425 	if (n->gro_list) {
4426 		unsigned long timeout = 0;
4427 
4428 		if (work_done)
4429 			timeout = n->dev->gro_flush_timeout;
4430 
4431 		if (timeout)
4432 			hrtimer_start(&n->timer, ns_to_ktime(timeout),
4433 				      HRTIMER_MODE_REL_PINNED);
4434 		else
4435 			napi_gro_flush(n, false);
4436 	}
4437 	if (likely(list_empty(&n->poll_list))) {
4438 		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4439 	} else {
4440 		/* If n->poll_list is not empty, we need to mask irqs */
4441 		local_irq_save(flags);
4442 		__napi_complete(n);
4443 		local_irq_restore(flags);
4444 	}
4445 }
4446 EXPORT_SYMBOL(napi_complete_done);
4447 
4448 /* must be called under rcu_read_lock(), as we dont take a reference */
4449 struct napi_struct *napi_by_id(unsigned int napi_id)
4450 {
4451 	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4452 	struct napi_struct *napi;
4453 
4454 	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4455 		if (napi->napi_id == napi_id)
4456 			return napi;
4457 
4458 	return NULL;
4459 }
4460 EXPORT_SYMBOL_GPL(napi_by_id);
4461 
4462 void napi_hash_add(struct napi_struct *napi)
4463 {
4464 	if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4465 
4466 		spin_lock(&napi_hash_lock);
4467 
4468 		/* 0 is not a valid id; we also skip an id that is already taken.
4469 		 * We expect both events to be extremely rare.
4470 		 */
4471 		napi->napi_id = 0;
4472 		while (!napi->napi_id) {
4473 			napi->napi_id = ++napi_gen_id;
4474 			if (napi_by_id(napi->napi_id))
4475 				napi->napi_id = 0;
4476 		}
4477 
4478 		hlist_add_head_rcu(&napi->napi_hash_node,
4479 			&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4480 
4481 		spin_unlock(&napi_hash_lock);
4482 	}
4483 }
4484 EXPORT_SYMBOL_GPL(napi_hash_add);
4485 
4486 /* Warning: the caller is responsible for making sure an RCU grace period
4487  * is respected before freeing the memory containing @napi.
4488  */
4489 void napi_hash_del(struct napi_struct *napi)
4490 {
4491 	spin_lock(&napi_hash_lock);
4492 
4493 	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4494 		hlist_del_rcu(&napi->napi_hash_node);
4495 
4496 	spin_unlock(&napi_hash_lock);
4497 }
4498 EXPORT_SYMBOL_GPL(napi_hash_del);
4499 
4500 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4501 {
4502 	struct napi_struct *napi;
4503 
4504 	napi = container_of(timer, struct napi_struct, timer);
4505 	if (napi->gro_list)
4506 		napi_schedule(napi);
4507 
4508 	return HRTIMER_NORESTART;
4509 }
4510 
4511 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4512 		    int (*poll)(struct napi_struct *, int), int weight)
4513 {
4514 	INIT_LIST_HEAD(&napi->poll_list);
4515 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4516 	napi->timer.function = napi_watchdog;
4517 	napi->gro_count = 0;
4518 	napi->gro_list = NULL;
4519 	napi->skb = NULL;
4520 	napi->poll = poll;
4521 	if (weight > NAPI_POLL_WEIGHT)
4522 		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4523 			    weight, dev->name);
4524 	napi->weight = weight;
4525 	list_add(&napi->dev_list, &dev->napi_list);
4526 	napi->dev = dev;
4527 #ifdef CONFIG_NETPOLL
4528 	spin_lock_init(&napi->poll_lock);
4529 	napi->poll_owner = -1;
4530 #endif
4531 	set_bit(NAPI_STATE_SCHED, &napi->state);
4532 }
4533 EXPORT_SYMBOL(netif_napi_add);
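/*
 * Illustrative sketch (not part of dev.c, example only): the canonical NAPI
 * pattern a driver is expected to follow around netif_napi_add(),
 * napi_schedule() and napi_complete_done().  struct my_adapter, my_poll(),
 * my_clean_rx() and the IRQ mask helpers are hypothetical.
 */
#if 0
static irqreturn_t my_irq_handler(int irq, void *data)
{
	struct my_adapter *priv = data;

	my_disable_rx_irq(priv);	/* mask device interrupts */
	napi_schedule(&priv->napi);	/* defer the work to softirq */
	return IRQ_HANDLED;
}

static int my_poll(struct napi_struct *napi, int budget)
{
	struct my_adapter *priv = container_of(napi, struct my_adapter, napi);
	int work_done = my_clean_rx(priv, budget);

	if (work_done < budget) {
		napi_complete_done(napi, work_done);
		my_enable_rx_irq(priv);	/* re-arm device interrupts */
	}
	return work_done;
}

/* at probe time: */
	netif_napi_add(priv->netdev, &priv->napi, my_poll, NAPI_POLL_WEIGHT);
#endif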
4534 
4535 void napi_disable(struct napi_struct *n)
4536 {
4537 	might_sleep();
4538 	set_bit(NAPI_STATE_DISABLE, &n->state);
4539 
4540 	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4541 		msleep(1);
4542 
4543 	hrtimer_cancel(&n->timer);
4544 
4545 	clear_bit(NAPI_STATE_DISABLE, &n->state);
4546 }
4547 EXPORT_SYMBOL(napi_disable);
4548 
4549 void netif_napi_del(struct napi_struct *napi)
4550 {
4551 	list_del_init(&napi->dev_list);
4552 	napi_free_frags(napi);
4553 
4554 	kfree_skb_list(napi->gro_list);
4555 	napi->gro_list = NULL;
4556 	napi->gro_count = 0;
4557 }
4558 EXPORT_SYMBOL(netif_napi_del);
4559 
4560 static void net_rx_action(struct softirq_action *h)
4561 {
4562 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4563 	unsigned long time_limit = jiffies + 2;
4564 	int budget = netdev_budget;
4565 	LIST_HEAD(list);
4566 	LIST_HEAD(repoll);
4567 	void *have;
4568 
4569 	local_irq_disable();
4570 	list_splice_init(&sd->poll_list, &list);
4571 	local_irq_enable();
4572 
4573 	while (!list_empty(&list)) {
4574 		struct napi_struct *n;
4575 		int work, weight;
4576 
4577 		/* If softirq window is exhausted then punt.
4578 		 * Allow this to run for 2 jiffies, which allows
4579 		 * an average latency of 1.5/HZ.
4580 		 */
4581 		if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4582 			goto softnet_break;
4583 
4584 
4585 		n = list_first_entry(&list, struct napi_struct, poll_list);
4586 		list_del_init(&n->poll_list);
4587 
4588 		have = netpoll_poll_lock(n);
4589 
4590 		weight = n->weight;
4591 
4592 		/* This NAPI_STATE_SCHED test is for avoiding a race
4593 		 * with netpoll's poll_napi().  Only the entity which
4594 		 * obtains the lock and sees NAPI_STATE_SCHED set will
4595 		 * actually make the ->poll() call.  Therefore we avoid
4596 		 * accidentally calling ->poll() when NAPI is not scheduled.
4597 		 */
4598 		work = 0;
4599 		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4600 			work = n->poll(n, weight);
4601 			trace_napi_poll(n);
4602 		}
4603 
4604 		WARN_ON_ONCE(work > weight);
4605 
4606 		budget -= work;
4607 
4608 		/* Drivers must not modify the NAPI state if they
4609 		 * consume the entire weight.  In such cases this code
4610 		 * still "owns" the NAPI instance and therefore can
4611 		 * move the instance around on the list at-will.
4612 		 */
4613 		if (unlikely(work == weight)) {
4614 			if (unlikely(napi_disable_pending(n))) {
4615 				napi_complete(n);
4616 			} else {
4617 				if (n->gro_list) {
4618 					/* flush too old packets
4619 					 * If HZ < 1000, flush all packets.
4620 					 */
4621 					napi_gro_flush(n, HZ >= 1000);
4622 				}
4623 				list_add_tail(&n->poll_list, &repoll);
4624 			}
4625 		}
4626 
4627 		netpoll_poll_unlock(have);
4628 	}
4629 
4630 	if (!sd_has_rps_ipi_waiting(sd) &&
4631 	    list_empty(&list) &&
4632 	    list_empty(&repoll))
4633 		return;
4634 out:
4635 	local_irq_disable();
4636 
4637 	list_splice_tail_init(&sd->poll_list, &list);
4638 	list_splice_tail(&repoll, &list);
4639 	list_splice(&list, &sd->poll_list);
4640 	if (!list_empty(&sd->poll_list))
4641 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
4642 
4643 	net_rps_action_and_irq_enable(sd);
4644 
4645 	return;
4646 
4647 softnet_break:
4648 	sd->time_squeeze++;
4649 	goto out;
4650 }
4651 
4652 struct netdev_adjacent {
4653 	struct net_device *dev;
4654 
4655 	/* upper master flag, there can only be one master device per list */
4656 	bool master;
4657 
4658 	/* counter for the number of times this device was added to us */
4659 	u16 ref_nr;
4660 
4661 	/* private field for the users */
4662 	void *private;
4663 
4664 	struct list_head list;
4665 	struct rcu_head rcu;
4666 };
4667 
4668 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4669 						 struct net_device *adj_dev,
4670 						 struct list_head *adj_list)
4671 {
4672 	struct netdev_adjacent *adj;
4673 
4674 	list_for_each_entry(adj, adj_list, list) {
4675 		if (adj->dev == adj_dev)
4676 			return adj;
4677 	}
4678 	return NULL;
4679 }
4680 
4681 /**
4682  * netdev_has_upper_dev - Check if device is linked to an upper device
4683  * @dev: device
4684  * @upper_dev: upper device to check
4685  *
4686  * Find out if a device is linked to the specified upper device and return true
4687  * in case it is. Note that this checks only the immediate upper device,
4688  * not through a complete stack of devices. The caller must hold the RTNL lock.
4689  */
4690 bool netdev_has_upper_dev(struct net_device *dev,
4691 			  struct net_device *upper_dev)
4692 {
4693 	ASSERT_RTNL();
4694 
4695 	return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4696 }
4697 EXPORT_SYMBOL(netdev_has_upper_dev);
4698 
4699 /**
4700  * netdev_has_any_upper_dev - Check if device is linked to some device
4701  * @dev: device
4702  *
4703  * Find out if a device is linked to an upper device and return true in case
4704  * it is. The caller must hold the RTNL lock.
4705  */
4706 static bool netdev_has_any_upper_dev(struct net_device *dev)
4707 {
4708 	ASSERT_RTNL();
4709 
4710 	return !list_empty(&dev->all_adj_list.upper);
4711 }
4712 
4713 /**
4714  * netdev_master_upper_dev_get - Get master upper device
4715  * @dev: device
4716  *
4717  * Find a master upper device and return pointer to it or NULL in case
4718  * it's not there. The caller must hold the RTNL lock.
4719  */
4720 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4721 {
4722 	struct netdev_adjacent *upper;
4723 
4724 	ASSERT_RTNL();
4725 
4726 	if (list_empty(&dev->adj_list.upper))
4727 		return NULL;
4728 
4729 	upper = list_first_entry(&dev->adj_list.upper,
4730 				 struct netdev_adjacent, list);
4731 	if (likely(upper->master))
4732 		return upper->dev;
4733 	return NULL;
4734 }
4735 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4736 
4737 void *netdev_adjacent_get_private(struct list_head *adj_list)
4738 {
4739 	struct netdev_adjacent *adj;
4740 
4741 	adj = list_entry(adj_list, struct netdev_adjacent, list);
4742 
4743 	return adj->private;
4744 }
4745 EXPORT_SYMBOL(netdev_adjacent_get_private);
4746 
4747 /**
4748  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4749  * @dev: device
4750  * @iter: list_head ** of the current position
4751  *
4752  * Gets the next device from the dev's upper list, starting from iter
4753  * position. The caller must hold RCU read lock.
4754  */
4755 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4756 						 struct list_head **iter)
4757 {
4758 	struct netdev_adjacent *upper;
4759 
4760 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4761 
4762 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4763 
4764 	if (&upper->list == &dev->adj_list.upper)
4765 		return NULL;
4766 
4767 	*iter = &upper->list;
4768 
4769 	return upper->dev;
4770 }
4771 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
4772 
4773 /**
4774  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4775  * @dev: device
4776  * @iter: list_head ** of the current position
4777  *
4778  * Gets the next device from the dev's upper list, starting from iter
4779  * position. The caller must hold RCU read lock.
4780  */
4781 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4782 						     struct list_head **iter)
4783 {
4784 	struct netdev_adjacent *upper;
4785 
4786 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4787 
4788 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4789 
4790 	if (&upper->list == &dev->all_adj_list.upper)
4791 		return NULL;
4792 
4793 	*iter = &upper->list;
4794 
4795 	return upper->dev;
4796 }
4797 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4798 
4799 /**
4800  * netdev_lower_get_next_private - Get the next ->private from the
4801  *				   lower neighbour list
4802  * @dev: device
4803  * @iter: list_head ** of the current position
4804  *
4805  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4806  * list, starting from iter position. The caller must either hold the
4807  * RTNL lock or use its own locking that guarantees that the neighbour lower
4808  * list will remain unchanged.
4809  */
4810 void *netdev_lower_get_next_private(struct net_device *dev,
4811 				    struct list_head **iter)
4812 {
4813 	struct netdev_adjacent *lower;
4814 
4815 	lower = list_entry(*iter, struct netdev_adjacent, list);
4816 
4817 	if (&lower->list == &dev->adj_list.lower)
4818 		return NULL;
4819 
4820 	*iter = lower->list.next;
4821 
4822 	return lower->private;
4823 }
4824 EXPORT_SYMBOL(netdev_lower_get_next_private);
4825 
4826 /**
4827  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4828  *				       lower neighbour list, RCU
4829  *				       variant
4830  * @dev: device
4831  * @iter: list_head ** of the current position
4832  *
4833  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4834  * list, starting from iter position. The caller must hold RCU read lock.
4835  */
4836 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4837 					struct list_head **iter)
4838 {
4839 	struct netdev_adjacent *lower;
4840 
4841 	WARN_ON_ONCE(!rcu_read_lock_held());
4842 
4843 	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4844 
4845 	if (&lower->list == &dev->adj_list.lower)
4846 		return NULL;
4847 
4848 	*iter = &lower->list;
4849 
4850 	return lower->private;
4851 }
4852 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4853 
4854 /**
4855  * netdev_lower_get_next - Get the next device from the lower neighbour
4856  *                         list
4857  * @dev: device
4858  * @iter: list_head ** of the current position
4859  *
4860  * Gets the next netdev_adjacent from the dev's lower neighbour
4861  * list, starting from iter position. The caller must hold the RTNL lock or
4862  * use its own locking that guarantees that the neighbour lower
4863  * list will remain unchanged.
4864  */
4865 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4866 {
4867 	struct netdev_adjacent *lower;
4868 
4869 	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4870 
4871 	if (&lower->list == &dev->adj_list.lower)
4872 		return NULL;
4873 
4874 	*iter = &lower->list;
4875 
4876 	return lower->dev;
4877 }
4878 EXPORT_SYMBOL(netdev_lower_get_next);
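/*
 * Illustrative sketch (not part of dev.c, example only): walking the
 * immediate lower devices with the netdev_for_each_lower_dev() iterator,
 * which is built on netdev_lower_get_next().  The caller holds RTNL.
 */
#if 0
	struct net_device *ldev;
	struct list_head *iter;

	netdev_for_each_lower_dev(dev, ldev, iter)
		pr_debug("%s is a lower device of %s\n", ldev->name, dev->name);
#endif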
4879 
4880 /**
4881  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4882  *				       lower neighbour list, RCU
4883  *				       variant
4884  * @dev: device
4885  *
4886  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4887  * list. The caller must hold RCU read lock.
4888  */
4889 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4890 {
4891 	struct netdev_adjacent *lower;
4892 
4893 	lower = list_first_or_null_rcu(&dev->adj_list.lower,
4894 			struct netdev_adjacent, list);
4895 	if (lower)
4896 		return lower->private;
4897 	return NULL;
4898 }
4899 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4900 
4901 /**
4902  * netdev_master_upper_dev_get_rcu - Get master upper device
4903  * @dev: device
4904  *
4905  * Find a master upper device and return pointer to it or NULL in case
4906  * it's not there. The caller must hold the RCU read lock.
4907  */
4908 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4909 {
4910 	struct netdev_adjacent *upper;
4911 
4912 	upper = list_first_or_null_rcu(&dev->adj_list.upper,
4913 				       struct netdev_adjacent, list);
4914 	if (upper && likely(upper->master))
4915 		return upper->dev;
4916 	return NULL;
4917 }
4918 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4919 
4920 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4921 			      struct net_device *adj_dev,
4922 			      struct list_head *dev_list)
4923 {
4924 	char linkname[IFNAMSIZ+7];
4925 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
4926 		"upper_%s" : "lower_%s", adj_dev->name);
4927 	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4928 				 linkname);
4929 }
4930 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4931 			       char *name,
4932 			       struct list_head *dev_list)
4933 {
4934 	char linkname[IFNAMSIZ+7];
4935 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
4936 		"upper_%s" : "lower_%s", name);
4937 	sysfs_remove_link(&(dev->dev.kobj), linkname);
4938 }
4939 
4940 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
4941 						 struct net_device *adj_dev,
4942 						 struct list_head *dev_list)
4943 {
4944 	return (dev_list == &dev->adj_list.upper ||
4945 		dev_list == &dev->adj_list.lower) &&
4946 		net_eq(dev_net(dev), dev_net(adj_dev));
4947 }
4948 
4949 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4950 					struct net_device *adj_dev,
4951 					struct list_head *dev_list,
4952 					void *private, bool master)
4953 {
4954 	struct netdev_adjacent *adj;
4955 	int ret;
4956 
4957 	adj = __netdev_find_adj(dev, adj_dev, dev_list);
4958 
4959 	if (adj) {
4960 		adj->ref_nr++;
4961 		return 0;
4962 	}
4963 
4964 	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4965 	if (!adj)
4966 		return -ENOMEM;
4967 
4968 	adj->dev = adj_dev;
4969 	adj->master = master;
4970 	adj->ref_nr = 1;
4971 	adj->private = private;
4972 	dev_hold(adj_dev);
4973 
4974 	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4975 		 adj_dev->name, dev->name, adj_dev->name);
4976 
4977 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
4978 		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
4979 		if (ret)
4980 			goto free_adj;
4981 	}
4982 
4983 	/* Ensure that master link is always the first item in list. */
4984 	if (master) {
4985 		ret = sysfs_create_link(&(dev->dev.kobj),
4986 					&(adj_dev->dev.kobj), "master");
4987 		if (ret)
4988 			goto remove_symlinks;
4989 
4990 		list_add_rcu(&adj->list, dev_list);
4991 	} else {
4992 		list_add_tail_rcu(&adj->list, dev_list);
4993 	}
4994 
4995 	return 0;
4996 
4997 remove_symlinks:
4998 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
4999 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5000 free_adj:
5001 	kfree(adj);
5002 	dev_put(adj_dev);
5003 
5004 	return ret;
5005 }
5006 
5007 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5008 					 struct net_device *adj_dev,
5009 					 struct list_head *dev_list)
5010 {
5011 	struct netdev_adjacent *adj;
5012 
5013 	adj = __netdev_find_adj(dev, adj_dev, dev_list);
5014 
5015 	if (!adj) {
5016 		pr_err("tried to remove device %s from %s\n",
5017 		       dev->name, adj_dev->name);
5018 		BUG();
5019 	}
5020 
5021 	if (adj->ref_nr > 1) {
5022 		pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5023 			 adj->ref_nr-1);
5024 		adj->ref_nr--;
5025 		return;
5026 	}
5027 
5028 	if (adj->master)
5029 		sysfs_remove_link(&(dev->dev.kobj), "master");
5030 
5031 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5032 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5033 
5034 	list_del_rcu(&adj->list);
5035 	pr_debug("dev_put for %s, because link removed from %s to %s\n",
5036 		 adj_dev->name, dev->name, adj_dev->name);
5037 	dev_put(adj_dev);
5038 	kfree_rcu(adj, rcu);
5039 }
5040 
5041 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5042 					    struct net_device *upper_dev,
5043 					    struct list_head *up_list,
5044 					    struct list_head *down_list,
5045 					    void *private, bool master)
5046 {
5047 	int ret;
5048 
5049 	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5050 					   master);
5051 	if (ret)
5052 		return ret;
5053 
5054 	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5055 					   false);
5056 	if (ret) {
5057 		__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5058 		return ret;
5059 	}
5060 
5061 	return 0;
5062 }
5063 
5064 static int __netdev_adjacent_dev_link(struct net_device *dev,
5065 				      struct net_device *upper_dev)
5066 {
5067 	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5068 						&dev->all_adj_list.upper,
5069 						&upper_dev->all_adj_list.lower,
5070 						NULL, false);
5071 }
5072 
5073 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5074 					       struct net_device *upper_dev,
5075 					       struct list_head *up_list,
5076 					       struct list_head *down_list)
5077 {
5078 	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5079 	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5080 }
5081 
5082 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5083 					 struct net_device *upper_dev)
5084 {
5085 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5086 					   &dev->all_adj_list.upper,
5087 					   &upper_dev->all_adj_list.lower);
5088 }
5089 
5090 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5091 						struct net_device *upper_dev,
5092 						void *private, bool master)
5093 {
5094 	int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5095 
5096 	if (ret)
5097 		return ret;
5098 
5099 	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5100 					       &dev->adj_list.upper,
5101 					       &upper_dev->adj_list.lower,
5102 					       private, master);
5103 	if (ret) {
5104 		__netdev_adjacent_dev_unlink(dev, upper_dev);
5105 		return ret;
5106 	}
5107 
5108 	return 0;
5109 }
5110 
5111 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5112 						   struct net_device *upper_dev)
5113 {
5114 	__netdev_adjacent_dev_unlink(dev, upper_dev);
5115 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5116 					   &dev->adj_list.upper,
5117 					   &upper_dev->adj_list.lower);
5118 }
5119 
5120 static int __netdev_upper_dev_link(struct net_device *dev,
5121 				   struct net_device *upper_dev, bool master,
5122 				   void *private)
5123 {
5124 	struct netdev_adjacent *i, *j, *to_i, *to_j;
5125 	int ret = 0;
5126 
5127 	ASSERT_RTNL();
5128 
5129 	if (dev == upper_dev)
5130 		return -EBUSY;
5131 
5132 	/* To prevent loops, check if dev is not upper device to upper_dev. */
5133 	if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5134 		return -EBUSY;
5135 
5136 	if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
5137 		return -EEXIST;
5138 
5139 	if (master && netdev_master_upper_dev_get(dev))
5140 		return -EBUSY;
5141 
5142 	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5143 						   master);
5144 	if (ret)
5145 		return ret;
5146 
5147 	/* Now that we have linked these devs, make all of upper_dev's
5148 	 * all_adj_list.upper visible to every dev's all_adj_list.lower and
5149 	 * vice versa, and don't forget the devices themselves. All of these
5150 	 * links are non-neighbours.
5151 	 */
5152 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5153 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5154 			pr_debug("Interlinking %s with %s, non-neighbour\n",
5155 				 i->dev->name, j->dev->name);
5156 			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5157 			if (ret)
5158 				goto rollback_mesh;
5159 		}
5160 	}
5161 
5162 	/* add dev to every upper_dev's upper device */
5163 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5164 		pr_debug("linking %s's upper device %s with %s\n",
5165 			 upper_dev->name, i->dev->name, dev->name);
5166 		ret = __netdev_adjacent_dev_link(dev, i->dev);
5167 		if (ret)
5168 			goto rollback_upper_mesh;
5169 	}
5170 
5171 	/* add upper_dev to every dev's lower device */
5172 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5173 		pr_debug("linking %s's lower device %s with %s\n", dev->name,
5174 			 i->dev->name, upper_dev->name);
5175 		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5176 		if (ret)
5177 			goto rollback_lower_mesh;
5178 	}
5179 
5180 	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5181 	return 0;
5182 
5183 rollback_lower_mesh:
5184 	to_i = i;
5185 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5186 		if (i == to_i)
5187 			break;
5188 		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5189 	}
5190 
5191 	i = NULL;
5192 
5193 rollback_upper_mesh:
5194 	to_i = i;
5195 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5196 		if (i == to_i)
5197 			break;
5198 		__netdev_adjacent_dev_unlink(dev, i->dev);
5199 	}
5200 
5201 	i = j = NULL;
5202 
5203 rollback_mesh:
5204 	to_i = i;
5205 	to_j = j;
5206 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5207 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5208 			if (i == to_i && j == to_j)
5209 				break;
5210 			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5211 		}
5212 		if (i == to_i)
5213 			break;
5214 	}
5215 
5216 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5217 
5218 	return ret;
5219 }
5220 
5221 /**
5222  * netdev_upper_dev_link - Add a link to the upper device
5223  * @dev: device
5224  * @upper_dev: new upper device
5225  *
5226  * Adds a link to device which is upper to this one. The caller must hold
5227  * the RTNL lock. On a failure a negative errno code is returned.
5228  * On success the reference counts are adjusted and the function
5229  * returns zero.
5230  */
5231 int netdev_upper_dev_link(struct net_device *dev,
5232 			  struct net_device *upper_dev)
5233 {
5234 	return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5235 }
5236 EXPORT_SYMBOL(netdev_upper_dev_link);
5237 
5238 /**
5239  * netdev_master_upper_dev_link - Add a master link to the upper device
5240  * @dev: device
5241  * @upper_dev: new upper device
5242  *
5243  * Adds a link to device which is upper to this one. In this case, only
5244  * one master upper device can be linked, although other non-master devices
5245  * might be linked as well. The caller must hold the RTNL lock.
5246  * On a failure a negative errno code is returned. On success the reference
5247  * counts are adjusted and the function returns zero.
5248  */
5249 int netdev_master_upper_dev_link(struct net_device *dev,
5250 				 struct net_device *upper_dev)
5251 {
5252 	return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5253 }
5254 EXPORT_SYMBOL(netdev_master_upper_dev_link);
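/*
 * Illustrative sketch (not part of dev.c, example only): how a master-type
 * driver (bonding/bridge style) could attach a slave as its lower device.
 * "slave_dev" and "master_dev" are hypothetical; full error handling and
 * the rest of the enslave sequence are omitted.
 */
#if 0
	ASSERT_RTNL();
	err = netdev_master_upper_dev_link(slave_dev, master_dev);
	if (err)
		return err;
	/* ... and on teardown: */
	netdev_upper_dev_unlink(slave_dev, master_dev);
#endif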
5255 
5256 int netdev_master_upper_dev_link_private(struct net_device *dev,
5257 					 struct net_device *upper_dev,
5258 					 void *private)
5259 {
5260 	return __netdev_upper_dev_link(dev, upper_dev, true, private);
5261 }
5262 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5263 
5264 /**
5265  * netdev_upper_dev_unlink - Removes a link to upper device
5266  * @dev: device
5267  * @upper_dev: upper device to unlink
5268  *
5269  * Removes a link to device which is upper to this one. The caller must hold
5270  * the RTNL lock.
5271  */
5272 void netdev_upper_dev_unlink(struct net_device *dev,
5273 			     struct net_device *upper_dev)
5274 {
5275 	struct netdev_adjacent *i, *j;
5276 	ASSERT_RTNL();
5277 
5278 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5279 
5280 	/* Here is the tricky part. We must remove all dev's lower
5281 	 * devices from all upper_dev's upper devices and vice
5282 	 * versa, to maintain the graph relationship.
5283 	 */
5284 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5285 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5286 			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5287 
5288 	/* also remove the devices themselves from the lower/upper device
5289 	 * lists
5290 	 */
5291 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5292 		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5293 
5294 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5295 		__netdev_adjacent_dev_unlink(dev, i->dev);
5296 
5297 	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5298 }
5299 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5300 
5301 void netdev_adjacent_add_links(struct net_device *dev)
5302 {
5303 	struct netdev_adjacent *iter;
5304 
5305 	struct net *net = dev_net(dev);
5306 
5307 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5308 		if (!net_eq(net,dev_net(iter->dev)))
5309 			continue;
5310 		netdev_adjacent_sysfs_add(iter->dev, dev,
5311 					  &iter->dev->adj_list.lower);
5312 		netdev_adjacent_sysfs_add(dev, iter->dev,
5313 					  &dev->adj_list.upper);
5314 	}
5315 
5316 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5317 		if (!net_eq(net,dev_net(iter->dev)))
5318 			continue;
5319 		netdev_adjacent_sysfs_add(iter->dev, dev,
5320 					  &iter->dev->adj_list.upper);
5321 		netdev_adjacent_sysfs_add(dev, iter->dev,
5322 					  &dev->adj_list.lower);
5323 	}
5324 }
5325 
5326 void netdev_adjacent_del_links(struct net_device *dev)
5327 {
5328 	struct netdev_adjacent *iter;
5329 
5330 	struct net *net = dev_net(dev);
5331 
5332 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5333 		if (!net_eq(net,dev_net(iter->dev)))
5334 			continue;
5335 		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5336 					  &iter->dev->adj_list.lower);
5337 		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5338 					  &dev->adj_list.upper);
5339 	}
5340 
5341 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5342 		if (!net_eq(net,dev_net(iter->dev)))
5343 			continue;
5344 		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5345 					  &iter->dev->adj_list.upper);
5346 		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5347 					  &dev->adj_list.lower);
5348 	}
5349 }
5350 
5351 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5352 {
5353 	struct netdev_adjacent *iter;
5354 
5355 	struct net *net = dev_net(dev);
5356 
5357 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5358 		if (!net_eq(net,dev_net(iter->dev)))
5359 			continue;
5360 		netdev_adjacent_sysfs_del(iter->dev, oldname,
5361 					  &iter->dev->adj_list.lower);
5362 		netdev_adjacent_sysfs_add(iter->dev, dev,
5363 					  &iter->dev->adj_list.lower);
5364 	}
5365 
5366 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5367 		if (!net_eq(net,dev_net(iter->dev)))
5368 			continue;
5369 		netdev_adjacent_sysfs_del(iter->dev, oldname,
5370 					  &iter->dev->adj_list.upper);
5371 		netdev_adjacent_sysfs_add(iter->dev, dev,
5372 					  &iter->dev->adj_list.upper);
5373 	}
5374 }
5375 
5376 void *netdev_lower_dev_get_private(struct net_device *dev,
5377 				   struct net_device *lower_dev)
5378 {
5379 	struct netdev_adjacent *lower;
5380 
5381 	if (!lower_dev)
5382 		return NULL;
5383 	lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5384 	if (!lower)
5385 		return NULL;
5386 
5387 	return lower->private;
5388 }
5389 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5390 
5391 
5392 int dev_get_nest_level(struct net_device *dev,
5393 		       bool (*type_check)(struct net_device *dev))
5394 {
5395 	struct net_device *lower = NULL;
5396 	struct list_head *iter;
5397 	int max_nest = -1;
5398 	int nest;
5399 
5400 	ASSERT_RTNL();
5401 
5402 	netdev_for_each_lower_dev(dev, lower, iter) {
5403 		nest = dev_get_nest_level(lower, type_check);
5404 		if (max_nest < nest)
5405 			max_nest = nest;
5406 	}
5407 
5408 	if (type_check(dev))
5409 		max_nest++;
5410 
5411 	return max_nest;
5412 }
5413 EXPORT_SYMBOL(dev_get_nest_level);
5414 
5415 static void dev_change_rx_flags(struct net_device *dev, int flags)
5416 {
5417 	const struct net_device_ops *ops = dev->netdev_ops;
5418 
5419 	if (ops->ndo_change_rx_flags)
5420 		ops->ndo_change_rx_flags(dev, flags);
5421 }
5422 
5423 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5424 {
5425 	unsigned int old_flags = dev->flags;
5426 	kuid_t uid;
5427 	kgid_t gid;
5428 
5429 	ASSERT_RTNL();
5430 
5431 	dev->flags |= IFF_PROMISC;
5432 	dev->promiscuity += inc;
5433 	if (dev->promiscuity == 0) {
5434 		/*
5435 		 * Avoid overflow.
5436 		 * If inc causes overflow, leave promiscuity untouched and return an error.
5437 		 */
5438 		if (inc < 0)
5439 			dev->flags &= ~IFF_PROMISC;
5440 		else {
5441 			dev->promiscuity -= inc;
5442 			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5443 				dev->name);
5444 			return -EOVERFLOW;
5445 		}
5446 	}
5447 	if (dev->flags != old_flags) {
5448 		pr_info("device %s %s promiscuous mode\n",
5449 			dev->name,
5450 			dev->flags & IFF_PROMISC ? "entered" : "left");
5451 		if (audit_enabled) {
5452 			current_uid_gid(&uid, &gid);
5453 			audit_log(current->audit_context, GFP_ATOMIC,
5454 				AUDIT_ANOM_PROMISCUOUS,
5455 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5456 				dev->name, (dev->flags & IFF_PROMISC),
5457 				(old_flags & IFF_PROMISC),
5458 				from_kuid(&init_user_ns, audit_get_loginuid(current)),
5459 				from_kuid(&init_user_ns, uid),
5460 				from_kgid(&init_user_ns, gid),
5461 				audit_get_sessionid(current));
5462 		}
5463 
5464 		dev_change_rx_flags(dev, IFF_PROMISC);
5465 	}
5466 	if (notify)
5467 		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
5468 	return 0;
5469 }
5470 
5471 /**
5472  *	dev_set_promiscuity	- update promiscuity count on a device
5473  *	@dev: device
5474  *	@inc: modifier
5475  *
5476  *	Add or remove promiscuity from a device. While the count in the device
5477  *	remains above zero the interface remains promiscuous. Once it hits zero
5478  *	the device reverts back to normal filtering operation. A negative inc
5479  *	value is used to drop promiscuity on the device.
5480  *	Return 0 if successful or a negative errno code on error.
5481  */
5482 int dev_set_promiscuity(struct net_device *dev, int inc)
5483 {
5484 	unsigned int old_flags = dev->flags;
5485 	int err;
5486 
5487 	err = __dev_set_promiscuity(dev, inc, true);
5488 	if (err < 0)
5489 		return err;
5490 	if (dev->flags != old_flags)
5491 		dev_set_rx_mode(dev);
5492 	return err;
5493 }
5494 EXPORT_SYMBOL(dev_set_promiscuity);
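/*
 * Illustrative sketch (not part of dev.c, example only): a packet-capturing
 * user brackets its lifetime with a +1/-1 pair of promiscuity updates under
 * RTNL.  dev_set_allmulti() follows the same counting convention.
 */
#if 0
	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);	/* enter promiscuous mode */
	rtnl_unlock();
	/* ... when the capture is torn down: */
	rtnl_lock();
	dev_set_promiscuity(dev, -1);		/* drop our reference */
	rtnl_unlock();
#endif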
5495 
5496 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5497 {
5498 	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5499 
5500 	ASSERT_RTNL();
5501 
5502 	dev->flags |= IFF_ALLMULTI;
5503 	dev->allmulti += inc;
5504 	if (dev->allmulti == 0) {
5505 		/*
5506 		 * Avoid overflow.
5507 		 * If inc causes overflow, untouch allmulti and return error.
5508 		 * If inc causes overflow, leave allmulti untouched and return an error.
5509 		if (inc < 0)
5510 			dev->flags &= ~IFF_ALLMULTI;
5511 		else {
5512 			dev->allmulti -= inc;
5513 			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5514 				dev->name);
5515 			return -EOVERFLOW;
5516 		}
5517 	}
5518 	if (dev->flags ^ old_flags) {
5519 		dev_change_rx_flags(dev, IFF_ALLMULTI);
5520 		dev_set_rx_mode(dev);
5521 		if (notify)
5522 			__dev_notify_flags(dev, old_flags,
5523 					   dev->gflags ^ old_gflags);
5524 	}
5525 	return 0;
5526 }
5527 
5528 /**
5529  *	dev_set_allmulti	- update allmulti count on a device
5530  *	@dev: device
5531  *	@inc: modifier
5532  *
5533  *	Add or remove reception of all multicast frames on a device. While the
5534  *	count in the device remains above zero the interface keeps listening
5535  *	to all multicast frames. Once it hits zero the device reverts back to normal
5536  *	filtering operation. A negative @inc value is used to drop the counter
5537  *	when releasing a resource needing all multicasts.
5538  *	Return 0 if successful or a negative errno code on error.
5539  */
5540 
5541 int dev_set_allmulti(struct net_device *dev, int inc)
5542 {
5543 	return __dev_set_allmulti(dev, inc, true);
5544 }
5545 EXPORT_SYMBOL(dev_set_allmulti);
5546 
5547 /*
5548  *	Upload unicast and multicast address lists to device and
5549  *	configure RX filtering. When the device doesn't support unicast
5550  *	filtering it is put in promiscuous mode while unicast addresses
5551  *	are present.
5552  */
5553 void __dev_set_rx_mode(struct net_device *dev)
5554 {
5555 	const struct net_device_ops *ops = dev->netdev_ops;
5556 
5557 	/* dev_open will call this function so the list will stay sane. */
5558 	if (!(dev->flags&IFF_UP))
5559 		return;
5560 
5561 	if (!netif_device_present(dev))
5562 		return;
5563 
5564 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5565 		/* Unicast address changes may only happen under the rtnl,
5566 		 * therefore calling __dev_set_promiscuity here is safe.
5567 		 */
5568 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5569 			__dev_set_promiscuity(dev, 1, false);
5570 			dev->uc_promisc = true;
5571 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5572 			__dev_set_promiscuity(dev, -1, false);
5573 			dev->uc_promisc = false;
5574 		}
5575 	}
5576 
5577 	if (ops->ndo_set_rx_mode)
5578 		ops->ndo_set_rx_mode(dev);
5579 }
5580 
5581 void dev_set_rx_mode(struct net_device *dev)
5582 {
5583 	netif_addr_lock_bh(dev);
5584 	__dev_set_rx_mode(dev);
5585 	netif_addr_unlock_bh(dev);
5586 }
5587 
5588 /**
5589  *	dev_get_flags - get flags reported to userspace
5590  *	@dev: device
5591  *
5592  *	Get the combination of flag bits exported through APIs to userspace.
5593  */
5594 unsigned int dev_get_flags(const struct net_device *dev)
5595 {
5596 	unsigned int flags;
5597 
5598 	flags = (dev->flags & ~(IFF_PROMISC |
5599 				IFF_ALLMULTI |
5600 				IFF_RUNNING |
5601 				IFF_LOWER_UP |
5602 				IFF_DORMANT)) |
5603 		(dev->gflags & (IFF_PROMISC |
5604 				IFF_ALLMULTI));
5605 
5606 	if (netif_running(dev)) {
5607 		if (netif_oper_up(dev))
5608 			flags |= IFF_RUNNING;
5609 		if (netif_carrier_ok(dev))
5610 			flags |= IFF_LOWER_UP;
5611 		if (netif_dormant(dev))
5612 			flags |= IFF_DORMANT;
5613 	}
5614 
5615 	return flags;
5616 }
5617 EXPORT_SYMBOL(dev_get_flags);
5618 
5619 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5620 {
5621 	unsigned int old_flags = dev->flags;
5622 	int ret;
5623 
5624 	ASSERT_RTNL();
5625 
5626 	/*
5627 	 *	Set the flags on our device.
5628 	 */
5629 
5630 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5631 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5632 			       IFF_AUTOMEDIA)) |
5633 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5634 				    IFF_ALLMULTI));
5635 
5636 	/*
5637 	 *	Load in the correct multicast list now that the flags have changed.
5638 	 */
5639 
5640 	if ((old_flags ^ flags) & IFF_MULTICAST)
5641 		dev_change_rx_flags(dev, IFF_MULTICAST);
5642 
5643 	dev_set_rx_mode(dev);
5644 
5645 	/*
5646 	 *	Have we downed the interface? We handle IFF_UP ourselves
5647 	 *	according to user attempts to set it, rather than blindly
5648 	 *	setting it.
5649 	 */
5650 
5651 	ret = 0;
5652 	if ((old_flags ^ flags) & IFF_UP)
5653 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5654 
5655 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
5656 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
5657 		unsigned int old_flags = dev->flags;
5658 
5659 		dev->gflags ^= IFF_PROMISC;
5660 
5661 		if (__dev_set_promiscuity(dev, inc, false) >= 0)
5662 			if (dev->flags != old_flags)
5663 				dev_set_rx_mode(dev);
5664 	}
5665 
5666 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5667 	   is important. Some (broken) drivers set IFF_PROMISC when
5668 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
5669 	 */
5670 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5671 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5672 
5673 		dev->gflags ^= IFF_ALLMULTI;
5674 		__dev_set_allmulti(dev, inc, false);
5675 	}
5676 
5677 	return ret;
5678 }
5679 
5680 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5681 			unsigned int gchanges)
5682 {
5683 	unsigned int changes = dev->flags ^ old_flags;
5684 
5685 	if (gchanges)
5686 		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5687 
5688 	if (changes & IFF_UP) {
5689 		if (dev->flags & IFF_UP)
5690 			call_netdevice_notifiers(NETDEV_UP, dev);
5691 		else
5692 			call_netdevice_notifiers(NETDEV_DOWN, dev);
5693 	}
5694 
5695 	if (dev->flags & IFF_UP &&
5696 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5697 		struct netdev_notifier_change_info change_info;
5698 
5699 		change_info.flags_changed = changes;
5700 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5701 					      &change_info.info);
5702 	}
5703 }
5704 
5705 /**
5706  *	dev_change_flags - change device settings
5707  *	@dev: device
5708  *	@flags: device state flags
5709  *
5710  *	Change settings on a device based on the state flags. The flags are
5711  *	in the format exported to userspace.
5712  */
5713 int dev_change_flags(struct net_device *dev, unsigned int flags)
5714 {
5715 	int ret;
5716 	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5717 
5718 	ret = __dev_change_flags(dev, flags);
5719 	if (ret < 0)
5720 		return ret;
5721 
5722 	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5723 	__dev_notify_flags(dev, old_flags, changes);
5724 	return ret;
5725 }
5726 EXPORT_SYMBOL(dev_change_flags);
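
/*
 * Illustrative sketch: flipping a userspace-visible flag with the two
 * helpers above.  dev_change_flags() takes flags in the exported format
 * returned by dev_get_flags(), and RTNL must be held because
 * __dev_change_flags() asserts it.  my_set_promisc_by_name() is a
 * hypothetical wrapper.
 */
static int my_set_promisc_by_name(struct net *net, const char *ifname, bool on)
{
	struct net_device *dev;
	unsigned int flags;
	int err;

	rtnl_lock();
	dev = __dev_get_by_name(net, ifname);
	if (!dev) {
		rtnl_unlock();
		return -ENODEV;
	}

	flags = dev_get_flags(dev);
	if (on)
		flags |= IFF_PROMISC;
	else
		flags &= ~IFF_PROMISC;

	err = dev_change_flags(dev, flags);
	rtnl_unlock();
	return err;
}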
5727 
5728 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5729 {
5730 	const struct net_device_ops *ops = dev->netdev_ops;
5731 
5732 	if (ops->ndo_change_mtu)
5733 		return ops->ndo_change_mtu(dev, new_mtu);
5734 
5735 	dev->mtu = new_mtu;
5736 	return 0;
5737 }
5738 
5739 /**
5740  *	dev_set_mtu - Change maximum transfer unit
5741  *	@dev: device
5742  *	@new_mtu: new transfer unit
5743  *
5744  *	Change the maximum transfer size of the network device.
5745  */
5746 int dev_set_mtu(struct net_device *dev, int new_mtu)
5747 {
5748 	int err, orig_mtu;
5749 
5750 	if (new_mtu == dev->mtu)
5751 		return 0;
5752 
5753 	/*	MTU must be positive.	 */
5754 	if (new_mtu < 0)
5755 		return -EINVAL;
5756 
5757 	if (!netif_device_present(dev))
5758 		return -ENODEV;
5759 
5760 	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5761 	err = notifier_to_errno(err);
5762 	if (err)
5763 		return err;
5764 
5765 	orig_mtu = dev->mtu;
5766 	err = __dev_set_mtu(dev, new_mtu);
5767 
5768 	if (!err) {
5769 		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5770 		err = notifier_to_errno(err);
5771 		if (err) {
5772 			/* setting mtu back and notifying everyone again,
5773 			 * so that they have a chance to revert changes.
5774 			 */
5775 			__dev_set_mtu(dev, orig_mtu);
5776 			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5777 		}
5778 	}
5779 	return err;
5780 }
5781 EXPORT_SYMBOL(dev_set_mtu);
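
/*
 * Illustrative sketch: an in-kernel caller changing the MTU, much as the
 * SIOCSIFMTU ioctl path does.  RTNL must be held so the PRE/POST
 * CHANGEMTU notifiers above run in the expected context.
 * my_set_mtu_by_name() is a hypothetical wrapper.
 */
static int my_set_mtu_by_name(struct net *net, const char *ifname, int mtu)
{
	struct net_device *dev;
	int err;

	rtnl_lock();
	dev = __dev_get_by_name(net, ifname);
	err = dev ? dev_set_mtu(dev, mtu) : -ENODEV;
	rtnl_unlock();

	return err;
}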
5782 
5783 /**
5784  *	dev_set_group - Change group this device belongs to
5785  *	@dev: device
5786  *	@new_group: group this device should belong to
5787  */
5788 void dev_set_group(struct net_device *dev, int new_group)
5789 {
5790 	dev->group = new_group;
5791 }
5792 EXPORT_SYMBOL(dev_set_group);
5793 
5794 /**
5795  *	dev_set_mac_address - Change Media Access Control Address
5796  *	@dev: device
5797  *	@sa: new address
5798  *
5799  *	Change the hardware (MAC) address of the device
5800  */
5801 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5802 {
5803 	const struct net_device_ops *ops = dev->netdev_ops;
5804 	int err;
5805 
5806 	if (!ops->ndo_set_mac_address)
5807 		return -EOPNOTSUPP;
5808 	if (sa->sa_family != dev->type)
5809 		return -EINVAL;
5810 	if (!netif_device_present(dev))
5811 		return -ENODEV;
5812 	err = ops->ndo_set_mac_address(dev, sa);
5813 	if (err)
5814 		return err;
5815 	dev->addr_assign_type = NET_ADDR_SET;
5816 	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5817 	add_device_randomness(dev->dev_addr, dev->addr_len);
5818 	return 0;
5819 }
5820 EXPORT_SYMBOL(dev_set_mac_address);
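
/*
 * Illustrative sketch: building the struct sockaddr that
 * dev_set_mac_address() expects.  sa_family must match dev->type and the
 * new address lives in sa_data; RTNL must be held for the
 * NETDEV_CHANGEADDR notifier.  my_change_mac() is a hypothetical helper.
 */
static int my_change_mac(struct net_device *dev, const u8 *mac)
{
	struct sockaddr sa;

	if (dev->addr_len > sizeof(sa.sa_data))
		return -EINVAL;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, mac, dev->addr_len);

	ASSERT_RTNL();
	return dev_set_mac_address(dev, &sa);
}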
5821 
5822 /**
5823  *	dev_change_carrier - Change device carrier
5824  *	@dev: device
5825  *	@new_carrier: new value
5826  *
5827  *	Change device carrier
5828  */
5829 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5830 {
5831 	const struct net_device_ops *ops = dev->netdev_ops;
5832 
5833 	if (!ops->ndo_change_carrier)
5834 		return -EOPNOTSUPP;
5835 	if (!netif_device_present(dev))
5836 		return -ENODEV;
5837 	return ops->ndo_change_carrier(dev, new_carrier);
5838 }
5839 EXPORT_SYMBOL(dev_change_carrier);
5840 
5841 /**
5842  *	dev_get_phys_port_id - Get device physical port ID
5843  *	@dev: device
5844  *	@ppid: port ID
5845  *
5846  *	Get device physical port ID
5847  */
5848 int dev_get_phys_port_id(struct net_device *dev,
5849 			 struct netdev_phys_item_id *ppid)
5850 {
5851 	const struct net_device_ops *ops = dev->netdev_ops;
5852 
5853 	if (!ops->ndo_get_phys_port_id)
5854 		return -EOPNOTSUPP;
5855 	return ops->ndo_get_phys_port_id(dev, ppid);
5856 }
5857 EXPORT_SYMBOL(dev_get_phys_port_id);
5858 
5859 /**
5860  *	dev_new_index	-	allocate an ifindex
5861  *	@net: the applicable net namespace
5862  *
5863  *	Returns a suitable unique value for a new device interface
5864  *	number.  The caller must hold the rtnl semaphore or the
5865  *	dev_base_lock to be sure it remains unique.
5866  */
5867 static int dev_new_index(struct net *net)
5868 {
5869 	int ifindex = net->ifindex;
5870 	for (;;) {
5871 		if (++ifindex <= 0)
5872 			ifindex = 1;
5873 		if (!__dev_get_by_index(net, ifindex))
5874 			return net->ifindex = ifindex;
5875 	}
5876 }
5877 
5878 /* Delayed registration/unregistration */
5879 static LIST_HEAD(net_todo_list);
5880 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5881 
5882 static void net_set_todo(struct net_device *dev)
5883 {
5884 	list_add_tail(&dev->todo_list, &net_todo_list);
5885 	dev_net(dev)->dev_unreg_count++;
5886 }
5887 
5888 static void rollback_registered_many(struct list_head *head)
5889 {
5890 	struct net_device *dev, *tmp;
5891 	LIST_HEAD(close_head);
5892 
5893 	BUG_ON(dev_boot_phase);
5894 	ASSERT_RTNL();
5895 
5896 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5897 		/* Some devices call unregister without ever having
5898 		 * registered, to unwind a failed initialization. Remove
5899 		 * those devices and proceed with the remaining.
5900 		 */
5901 		if (dev->reg_state == NETREG_UNINITIALIZED) {
5902 			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5903 				 dev->name, dev);
5904 
5905 			WARN_ON(1);
5906 			list_del(&dev->unreg_list);
5907 			continue;
5908 		}
5909 		dev->dismantle = true;
5910 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5911 	}
5912 
5913 	/* If device is running, close it first. */
5914 	list_for_each_entry(dev, head, unreg_list)
5915 		list_add_tail(&dev->close_list, &close_head);
5916 	dev_close_many(&close_head);
5917 
5918 	list_for_each_entry(dev, head, unreg_list) {
5919 		/* And unlink it from device chain. */
5920 		unlist_netdevice(dev);
5921 
5922 		dev->reg_state = NETREG_UNREGISTERING;
5923 	}
5924 
5925 	synchronize_net();
5926 
5927 	list_for_each_entry(dev, head, unreg_list) {
5928 		struct sk_buff *skb = NULL;
5929 
5930 		/* Shutdown queueing discipline. */
5931 		dev_shutdown(dev);
5932 
5933 
5934 		/* Notify protocols that we are about to destroy
5935 		   this device. They should clean all the things.
5936 		*/
5937 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5938 
5939 		if (!dev->rtnl_link_ops ||
5940 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5941 			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
5942 						     GFP_KERNEL);
5943 
5944 		/*
5945 		 *	Flush the unicast and multicast chains
5946 		 */
5947 		dev_uc_flush(dev);
5948 		dev_mc_flush(dev);
5949 
5950 		if (dev->netdev_ops->ndo_uninit)
5951 			dev->netdev_ops->ndo_uninit(dev);
5952 
5953 		if (skb)
5954 			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
5955 
5956 		/* Notifier chain MUST detach us all upper devices. */
5957 		WARN_ON(netdev_has_any_upper_dev(dev));
5958 
5959 		/* Remove entries from kobject tree */
5960 		netdev_unregister_kobject(dev);
5961 #ifdef CONFIG_XPS
5962 		/* Remove XPS queueing entries */
5963 		netif_reset_xps_queues_gt(dev, 0);
5964 #endif
5965 	}
5966 
5967 	synchronize_net();
5968 
5969 	list_for_each_entry(dev, head, unreg_list)
5970 		dev_put(dev);
5971 }
5972 
5973 static void rollback_registered(struct net_device *dev)
5974 {
5975 	LIST_HEAD(single);
5976 
5977 	list_add(&dev->unreg_list, &single);
5978 	rollback_registered_many(&single);
5979 	list_del(&single);
5980 }
5981 
5982 static netdev_features_t netdev_fix_features(struct net_device *dev,
5983 	netdev_features_t features)
5984 {
5985 	/* Fix illegal checksum combinations */
5986 	if ((features & NETIF_F_HW_CSUM) &&
5987 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5988 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5989 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5990 	}
5991 
5992 	/* TSO requires that SG is present as well. */
5993 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5994 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5995 		features &= ~NETIF_F_ALL_TSO;
5996 	}
5997 
5998 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5999 					!(features & NETIF_F_IP_CSUM)) {
6000 		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6001 		features &= ~NETIF_F_TSO;
6002 		features &= ~NETIF_F_TSO_ECN;
6003 	}
6004 
6005 	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6006 					 !(features & NETIF_F_IPV6_CSUM)) {
6007 		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6008 		features &= ~NETIF_F_TSO6;
6009 	}
6010 
6011 	/* TSO ECN requires that TSO is present as well. */
6012 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6013 		features &= ~NETIF_F_TSO_ECN;
6014 
6015 	/* Software GSO depends on SG. */
6016 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6017 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6018 		features &= ~NETIF_F_GSO;
6019 	}
6020 
6021 	/* UFO needs SG and checksumming */
6022 	if (features & NETIF_F_UFO) {
6023 		/* maybe split UFO into V4 and V6? */
6024 		if (!((features & NETIF_F_GEN_CSUM) ||
6025 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6026 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6027 			netdev_dbg(dev,
6028 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
6029 			features &= ~NETIF_F_UFO;
6030 		}
6031 
6032 		if (!(features & NETIF_F_SG)) {
6033 			netdev_dbg(dev,
6034 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6035 			features &= ~NETIF_F_UFO;
6036 		}
6037 	}
6038 
6039 #ifdef CONFIG_NET_RX_BUSY_POLL
6040 	if (dev->netdev_ops->ndo_busy_poll)
6041 		features |= NETIF_F_BUSY_POLL;
6042 	else
6043 #endif
6044 		features &= ~NETIF_F_BUSY_POLL;
6045 
6046 	return features;
6047 }
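
/*
 * Illustrative sketch: a driver-private .ndo_fix_features callback, which
 * __netdev_update_features() runs just before the generic fixups above.
 * The constraint shown (a pretend NIC that cannot do TSO unless both TX
 * and RX checksum offload stay enabled) is hypothetical.
 */
static netdev_features_t my_fix_features(struct net_device *dev,
					 netdev_features_t features)
{
	if ((features & NETIF_F_TSO) &&
	    (!(features & NETIF_F_IP_CSUM) || !(features & NETIF_F_RXCSUM))) {
		netdev_dbg(dev, "dropping TSO: checksum offload disabled\n");
		features &= ~NETIF_F_TSO;
	}

	return features;
}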
6048 
6049 int __netdev_update_features(struct net_device *dev)
6050 {
6051 	netdev_features_t features;
6052 	int err = 0;
6053 
6054 	ASSERT_RTNL();
6055 
6056 	features = netdev_get_wanted_features(dev);
6057 
6058 	if (dev->netdev_ops->ndo_fix_features)
6059 		features = dev->netdev_ops->ndo_fix_features(dev, features);
6060 
6061 	/* driver might be less strict about feature dependencies */
6062 	features = netdev_fix_features(dev, features);
6063 
6064 	if (dev->features == features)
6065 		return 0;
6066 
6067 	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6068 		&dev->features, &features);
6069 
6070 	if (dev->netdev_ops->ndo_set_features)
6071 		err = dev->netdev_ops->ndo_set_features(dev, features);
6072 
6073 	if (unlikely(err < 0)) {
6074 		netdev_err(dev,
6075 			"set_features() failed (%d); wanted %pNF, left %pNF\n",
6076 			err, &features, &dev->features);
6077 		return -1;
6078 	}
6079 
6080 	if (!err)
6081 		dev->features = features;
6082 
6083 	return 1;
6084 }
6085 
6086 /**
6087  *	netdev_update_features - recalculate device features
6088  *	@dev: the device to check
6089  *
6090  *	Recalculate dev->features set and send notifications if it
6091  *	has changed. Should be called after driver- or hardware-dependent
6092  *	conditions that influence the feature set might have changed.
6093  */
6094 void netdev_update_features(struct net_device *dev)
6095 {
6096 	if (__netdev_update_features(dev))
6097 		netdev_features_change(dev);
6098 }
6099 EXPORT_SYMBOL(netdev_update_features);
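
/*
 * Illustrative sketch: a driver re-running feature negotiation after a
 * hardware capability changed, e.g. following a firmware reload.  Must be
 * called under RTNL since __netdev_update_features() asserts it.
 * my_firmware_reloaded() and the LRO capability toggle are hypothetical.
 */
static void my_firmware_reloaded(struct net_device *dev, bool lro_supported)
{
	ASSERT_RTNL();

	if (lro_supported)
		dev->hw_features |= NETIF_F_LRO;
	else
		dev->hw_features &= ~NETIF_F_LRO;

	/* Recompute dev->features and notify if anything changed. */
	netdev_update_features(dev);
}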
6100 
6101 /**
6102  *	netdev_change_features - recalculate device features
6103  *	@dev: the device to check
6104  *
6105  *	Recalculate dev->features set and send notifications even
6106  *	if they have not changed. Should be called instead of
6107  *	netdev_update_features() if dev->vlan_features might also have
6108  *	changed, to allow the changes to be propagated to stacked
6109  *	VLAN devices.
6110  */
6111 void netdev_change_features(struct net_device *dev)
6112 {
6113 	__netdev_update_features(dev);
6114 	netdev_features_change(dev);
6115 }
6116 EXPORT_SYMBOL(netdev_change_features);
6117 
6118 /**
6119  *	netif_stacked_transfer_operstate -	transfer operstate
6120  *	@rootdev: the root or lower level device to transfer state from
6121  *	@dev: the device to transfer operstate to
6122  *
6123  *	Transfer operational state from root to device. This is normally
6124  *	called when a stacking relationship exists between the root
6125  *	device and the device (a leaf device).
6126  */
6127 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6128 					struct net_device *dev)
6129 {
6130 	if (rootdev->operstate == IF_OPER_DORMANT)
6131 		netif_dormant_on(dev);
6132 	else
6133 		netif_dormant_off(dev);
6134 
6135 	if (netif_carrier_ok(rootdev)) {
6136 		if (!netif_carrier_ok(dev))
6137 			netif_carrier_on(dev);
6138 	} else {
6139 		if (netif_carrier_ok(dev))
6140 			netif_carrier_off(dev);
6141 	}
6142 }
6143 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6144 
6145 #ifdef CONFIG_SYSFS
6146 static int netif_alloc_rx_queues(struct net_device *dev)
6147 {
6148 	unsigned int i, count = dev->num_rx_queues;
6149 	struct netdev_rx_queue *rx;
6150 
6151 	BUG_ON(count < 1);
6152 
6153 	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
6154 	if (!rx)
6155 		return -ENOMEM;
6156 
6157 	dev->_rx = rx;
6158 
6159 	for (i = 0; i < count; i++)
6160 		rx[i].dev = dev;
6161 	return 0;
6162 }
6163 #endif
6164 
6165 static void netdev_init_one_queue(struct net_device *dev,
6166 				  struct netdev_queue *queue, void *_unused)
6167 {
6168 	/* Initialize queue lock */
6169 	spin_lock_init(&queue->_xmit_lock);
6170 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6171 	queue->xmit_lock_owner = -1;
6172 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6173 	queue->dev = dev;
6174 #ifdef CONFIG_BQL
6175 	dql_init(&queue->dql, HZ);
6176 #endif
6177 }
6178 
6179 static void netif_free_tx_queues(struct net_device *dev)
6180 {
6181 	kvfree(dev->_tx);
6182 }
6183 
6184 static int netif_alloc_netdev_queues(struct net_device *dev)
6185 {
6186 	unsigned int count = dev->num_tx_queues;
6187 	struct netdev_queue *tx;
6188 	size_t sz = count * sizeof(*tx);
6189 
6190 	BUG_ON(count < 1 || count > 0xffff);
6191 
6192 	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6193 	if (!tx) {
6194 		tx = vzalloc(sz);
6195 		if (!tx)
6196 			return -ENOMEM;
6197 	}
6198 	dev->_tx = tx;
6199 
6200 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6201 	spin_lock_init(&dev->tx_global_lock);
6202 
6203 	return 0;
6204 }
6205 
6206 /**
6207  *	register_netdevice	- register a network device
6208  *	@dev: device to register
6209  *
6210  *	Take a completed network device structure and add it to the kernel
6211  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6212  *	chain. 0 is returned on success. A negative errno code is returned
6213  *	on a failure to set up the device, or if the name is a duplicate.
6214  *
6215  *	Callers must hold the rtnl semaphore. You may want
6216  *	register_netdev() instead of this.
6217  *
6218  *	BUGS:
6219  *	The locking appears insufficient to guarantee two parallel registers
6220  *	will not get the same name.
6221  */
6222 
6223 int register_netdevice(struct net_device *dev)
6224 {
6225 	int ret;
6226 	struct net *net = dev_net(dev);
6227 
6228 	BUG_ON(dev_boot_phase);
6229 	ASSERT_RTNL();
6230 
6231 	might_sleep();
6232 
6233 	/* When net_device structures are persistent, this will be fatal. */
6234 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6235 	BUG_ON(!net);
6236 
6237 	spin_lock_init(&dev->addr_list_lock);
6238 	netdev_set_addr_lockdep_class(dev);
6239 
6240 	dev->iflink = -1;
6241 
6242 	ret = dev_get_valid_name(net, dev, dev->name);
6243 	if (ret < 0)
6244 		goto out;
6245 
6246 	/* Init, if this function is available */
6247 	if (dev->netdev_ops->ndo_init) {
6248 		ret = dev->netdev_ops->ndo_init(dev);
6249 		if (ret) {
6250 			if (ret > 0)
6251 				ret = -EIO;
6252 			goto out;
6253 		}
6254 	}
6255 
6256 	if (((dev->hw_features | dev->features) &
6257 	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
6258 	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6259 	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6260 		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6261 		ret = -EINVAL;
6262 		goto err_uninit;
6263 	}
6264 
6265 	ret = -EBUSY;
6266 	if (!dev->ifindex)
6267 		dev->ifindex = dev_new_index(net);
6268 	else if (__dev_get_by_index(net, dev->ifindex))
6269 		goto err_uninit;
6270 
6271 	if (dev->iflink == -1)
6272 		dev->iflink = dev->ifindex;
6273 
6274 	/* Transfer changeable features to wanted_features and enable
6275 	 * software offloads (GSO and GRO).
6276 	 */
6277 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
6278 	dev->features |= NETIF_F_SOFT_FEATURES;
6279 	dev->wanted_features = dev->features & dev->hw_features;
6280 
6281 	if (!(dev->flags & IFF_LOOPBACK)) {
6282 		dev->hw_features |= NETIF_F_NOCACHE_COPY;
6283 	}
6284 
6285 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6286 	 */
6287 	dev->vlan_features |= NETIF_F_HIGHDMA;
6288 
6289 	/* Make NETIF_F_SG inheritable to tunnel devices.
6290 	 */
6291 	dev->hw_enc_features |= NETIF_F_SG;
6292 
6293 	/* Make NETIF_F_SG inheritable to MPLS.
6294 	 */
6295 	dev->mpls_features |= NETIF_F_SG;
6296 
6297 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6298 	ret = notifier_to_errno(ret);
6299 	if (ret)
6300 		goto err_uninit;
6301 
6302 	ret = netdev_register_kobject(dev);
6303 	if (ret)
6304 		goto err_uninit;
6305 	dev->reg_state = NETREG_REGISTERED;
6306 
6307 	__netdev_update_features(dev);
6308 
6309 	/*
6310 	 *	Default initial state at registration is that the
6311 	 *	device is present.
6312 	 */
6313 
6314 	set_bit(__LINK_STATE_PRESENT, &dev->state);
6315 
6316 	linkwatch_init_dev(dev);
6317 
6318 	dev_init_scheduler(dev);
6319 	dev_hold(dev);
6320 	list_netdevice(dev);
6321 	add_device_randomness(dev->dev_addr, dev->addr_len);
6322 
6323 	/* If the device has a permanent hardware address, the driver should
6324 	 * set dev_addr and leave addr_assign_type at NET_ADDR_PERM
6325 	 * (the default value).
6326 	 */
6327 	if (dev->addr_assign_type == NET_ADDR_PERM)
6328 		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6329 
6330 	/* Notify protocols that a new device appeared. */
6331 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6332 	ret = notifier_to_errno(ret);
6333 	if (ret) {
6334 		rollback_registered(dev);
6335 		dev->reg_state = NETREG_UNREGISTERED;
6336 	}
6337 	/*
6338 	 *	Prevent userspace races by waiting until the network
6339 	 *	device is fully set up before sending notifications.
6340 	 */
6341 	if (!dev->rtnl_link_ops ||
6342 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6343 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6344 
6345 out:
6346 	return ret;
6347 
6348 err_uninit:
6349 	if (dev->netdev_ops->ndo_uninit)
6350 		dev->netdev_ops->ndo_uninit(dev);
6351 	goto out;
6352 }
6353 EXPORT_SYMBOL(register_netdevice);
6354 
6355 /**
6356  *	init_dummy_netdev	- init a dummy network device for NAPI
6357  *	@dev: device to init
6358  *
6359  *	This takes a network device structure and initializes the minimum
6360  *	amount of fields so it can be used to schedule NAPI polls without
6361  *	registering a full blown interface. This is to be used by drivers
6362  *	that need to tie several hardware interfaces to a single NAPI
6363  *	poll scheduler due to HW limitations.
6364  */
6365 int init_dummy_netdev(struct net_device *dev)
6366 {
6367 	/* Clear everything. Note we don't initialize spinlocks
6368 	 * as they aren't supposed to be taken by any of the
6369 	 * NAPI code and this dummy netdev is supposed to be
6370 	 * only ever used for NAPI polls
6371 	 */
6372 	memset(dev, 0, sizeof(struct net_device));
6373 
6374 	/* make sure we BUG if trying to hit standard
6375 	 * register/unregister code path
6376 	 */
6377 	dev->reg_state = NETREG_DUMMY;
6378 
6379 	/* NAPI wants this */
6380 	INIT_LIST_HEAD(&dev->napi_list);
6381 
6382 	/* a dummy interface is started by default */
6383 	set_bit(__LINK_STATE_PRESENT, &dev->state);
6384 	set_bit(__LINK_STATE_START, &dev->state);
6385 
6386 	/* Note: We don't allocate pcpu_refcnt for dummy devices,
6387 	 * because users of this 'device' don't need to change
6388 	 * its refcount.
6389 	 */
6390 
6391 	return 0;
6392 }
6393 EXPORT_SYMBOL_GPL(init_dummy_netdev);
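
/*
 * Illustrative sketch: the pattern init_dummy_netdev() exists for -- a
 * driver that owns several hardware ports but wants a single NAPI
 * context.  struct my_hw and my_poll() are hypothetical; the dummy
 * net_device is never registered and is only used to anchor the NAPI
 * instance.
 */
struct my_hw {
	struct net_device napi_dev;	/* dummy, never registered */
	struct napi_struct napi;
};

static int my_poll(struct napi_struct *napi, int budget)
{
	/* ... process up to @budget packets across all ports here ... */
	napi_complete(napi);
	return 0;
}

static void my_hw_init_napi(struct my_hw *hw)
{
	init_dummy_netdev(&hw->napi_dev);
	netif_napi_add(&hw->napi_dev, &hw->napi, my_poll, NAPI_POLL_WEIGHT);
	napi_enable(&hw->napi);
}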
6394 
6395 
6396 /**
6397  *	register_netdev	- register a network device
6398  *	@dev: device to register
6399  *
6400  *	Take a completed network device structure and add it to the kernel
6401  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6402  *	chain. 0 is returned on success. A negative errno code is returned
6403  *	on a failure to set up the device, or if the name is a duplicate.
6404  *
6405  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
6406  *	and expands the device name if you passed a format string to
6407  *	alloc_netdev.
6408  */
6409 int register_netdev(struct net_device *dev)
6410 {
6411 	int err;
6412 
6413 	rtnl_lock();
6414 	err = register_netdevice(dev);
6415 	rtnl_unlock();
6416 	return err;
6417 }
6418 EXPORT_SYMBOL(register_netdev);
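
/*
 * Illustrative sketch: the usual allocate / register / unregister / free
 * life cycle around register_netdev(), assuming the <linux/etherdevice.h>
 * helpers.  The private struct, the minimal netdev_ops and the helper
 * names are hypothetical; note that free_netdev() must only be called on
 * a device that is not (or no longer) registered.
 */
struct my_eth_priv {
	int id;
};

static netdev_tx_t my_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
	dev_kfree_skb(skb);		/* pretend hardware: just drop */
	return NETDEV_TX_OK;
}

static const struct net_device_ops my_netdev_ops = {
	.ndo_start_xmit	= my_start_xmit,
};

static struct net_device *my_create_netdev(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(sizeof(struct my_eth_priv));
	if (!dev)
		return NULL;

	dev->netdev_ops = &my_netdev_ops;
	eth_hw_addr_random(dev);

	err = register_netdev(dev);	/* takes and releases RTNL itself */
	if (err) {
		free_netdev(dev);
		return NULL;
	}
	return dev;
}

static void my_destroy_netdev(struct net_device *dev)
{
	unregister_netdev(dev);		/* waits for all references */
	free_netdev(dev);
}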
6419 
6420 int netdev_refcnt_read(const struct net_device *dev)
6421 {
6422 	int i, refcnt = 0;
6423 
6424 	for_each_possible_cpu(i)
6425 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6426 	return refcnt;
6427 }
6428 EXPORT_SYMBOL(netdev_refcnt_read);
6429 
6430 /**
6431  * netdev_wait_allrefs - wait until all references are gone.
6432  * @dev: target net_device
6433  *
6434  * This is called when unregistering network devices.
6435  *
6436  * Any protocol or device that holds a reference should register
6437  * for netdevice notification, and clean up and put back the
6438  * reference if they receive an UNREGISTER event.
6439  * We can get stuck here if buggy protocols don't correctly
6440  * call dev_put.
6441  */
6442 static void netdev_wait_allrefs(struct net_device *dev)
6443 {
6444 	unsigned long rebroadcast_time, warning_time;
6445 	int refcnt;
6446 
6447 	linkwatch_forget_dev(dev);
6448 
6449 	rebroadcast_time = warning_time = jiffies;
6450 	refcnt = netdev_refcnt_read(dev);
6451 
6452 	while (refcnt != 0) {
6453 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6454 			rtnl_lock();
6455 
6456 			/* Rebroadcast unregister notification */
6457 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6458 
6459 			__rtnl_unlock();
6460 			rcu_barrier();
6461 			rtnl_lock();
6462 
6463 			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6464 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6465 				     &dev->state)) {
6466 				/* We must not have linkwatch events
6467 				 * pending on unregister. If this
6468 				 * happens, we simply run the queue
6469 				 * unscheduled, resulting in a noop
6470 				 * for this device.
6471 				 */
6472 				linkwatch_run_queue();
6473 			}
6474 
6475 			__rtnl_unlock();
6476 
6477 			rebroadcast_time = jiffies;
6478 		}
6479 
6480 		msleep(250);
6481 
6482 		refcnt = netdev_refcnt_read(dev);
6483 
6484 		if (time_after(jiffies, warning_time + 10 * HZ)) {
6485 			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6486 				 dev->name, refcnt);
6487 			warning_time = jiffies;
6488 		}
6489 	}
6490 }
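
/*
 * Illustrative sketch: the kind of notifier the comment above expects
 * from reference holders.  A hypothetical module caches one device
 * (my_cached_dev, protected by RTNL here for simplicity) and returns its
 * dev_hold() reference on NETDEV_UNREGISTER so netdev_wait_allrefs()
 * does not stall on it.
 */
static struct net_device *my_cached_dev;	/* holds one dev reference */

static int my_netdev_event(struct notifier_block *nb,
			   unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_UNREGISTER && dev == my_cached_dev) {
		my_cached_dev = NULL;
		dev_put(dev);		/* give the reference back */
	}
	return NOTIFY_DONE;
}

static struct notifier_block my_netdev_notifier = {
	.notifier_call = my_netdev_event,
};
/* Registered once via register_netdevice_notifier(&my_netdev_notifier). */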
6491 
6492 /* The sequence is:
6493  *
6494  *	rtnl_lock();
6495  *	...
6496  *	register_netdevice(x1);
6497  *	register_netdevice(x2);
6498  *	...
6499  *	unregister_netdevice(y1);
6500  *	unregister_netdevice(y2);
6501  *      ...
6502  *	rtnl_unlock();
6503  *	free_netdev(y1);
6504  *	free_netdev(y2);
6505  *
6506  * We are invoked by rtnl_unlock().
6507  * This allows us to deal with problems:
6508  * 1) We can delete sysfs objects which invoke hotplug
6509  *    without deadlocking with linkwatch via keventd.
6510  * 2) Since we run with the RTNL semaphore not held, we can sleep
6511  *    safely in order to wait for the netdev refcnt to drop to zero.
6512  *
6513  * We must not return until all unregister events added during
6514  * the interval the lock was held have been completed.
6515  */
6516 void netdev_run_todo(void)
6517 {
6518 	struct list_head list;
6519 
6520 	/* Snapshot list, allow later requests */
6521 	list_replace_init(&net_todo_list, &list);
6522 
6523 	__rtnl_unlock();
6524 
6525 
6526 	/* Wait for rcu callbacks to finish before next phase */
6527 	if (!list_empty(&list))
6528 		rcu_barrier();
6529 
6530 	while (!list_empty(&list)) {
6531 		struct net_device *dev
6532 			= list_first_entry(&list, struct net_device, todo_list);
6533 		list_del(&dev->todo_list);
6534 
6535 		rtnl_lock();
6536 		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6537 		__rtnl_unlock();
6538 
6539 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6540 			pr_err("network todo '%s' but state %d\n",
6541 			       dev->name, dev->reg_state);
6542 			dump_stack();
6543 			continue;
6544 		}
6545 
6546 		dev->reg_state = NETREG_UNREGISTERED;
6547 
6548 		on_each_cpu(flush_backlog, dev, 1);
6549 
6550 		netdev_wait_allrefs(dev);
6551 
6552 		/* paranoia */
6553 		BUG_ON(netdev_refcnt_read(dev));
6554 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
6555 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6556 		WARN_ON(dev->dn_ptr);
6557 
6558 		if (dev->destructor)
6559 			dev->destructor(dev);
6560 
6561 		/* Report a network device has been unregistered */
6562 		rtnl_lock();
6563 		dev_net(dev)->dev_unreg_count--;
6564 		__rtnl_unlock();
6565 		wake_up(&netdev_unregistering_wq);
6566 
6567 		/* Free network device */
6568 		kobject_put(&dev->dev.kobj);
6569 	}
6570 }
6571 
6572 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6573  * fields in the same order, with only the type differing.
6574  */
6575 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6576 			     const struct net_device_stats *netdev_stats)
6577 {
6578 #if BITS_PER_LONG == 64
6579 	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6580 	memcpy(stats64, netdev_stats, sizeof(*stats64));
6581 #else
6582 	size_t i, n = sizeof(*stats64) / sizeof(u64);
6583 	const unsigned long *src = (const unsigned long *)netdev_stats;
6584 	u64 *dst = (u64 *)stats64;
6585 
6586 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6587 		     sizeof(*stats64) / sizeof(u64));
6588 	for (i = 0; i < n; i++)
6589 		dst[i] = src[i];
6590 #endif
6591 }
6592 EXPORT_SYMBOL(netdev_stats_to_stats64);
6593 
6594 /**
6595  *	dev_get_stats	- get network device statistics
6596  *	@dev: device to get statistics from
6597  *	@storage: place to store stats
6598  *
6599  *	Get network statistics from device. Return @storage.
6600  *	The device driver may provide its own method by setting
6601  *	dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
6602  *	otherwise the internal statistics structure is used.
6603  */
6604 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6605 					struct rtnl_link_stats64 *storage)
6606 {
6607 	const struct net_device_ops *ops = dev->netdev_ops;
6608 
6609 	if (ops->ndo_get_stats64) {
6610 		memset(storage, 0, sizeof(*storage));
6611 		ops->ndo_get_stats64(dev, storage);
6612 	} else if (ops->ndo_get_stats) {
6613 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6614 	} else {
6615 		netdev_stats_to_stats64(storage, &dev->stats);
6616 	}
6617 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6618 	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6619 	return storage;
6620 }
6621 EXPORT_SYMBOL(dev_get_stats);
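
/*
 * Illustrative sketch: reading the aggregate counters through
 * dev_get_stats().  @storage lives on the caller's stack and is fully
 * rewritten by the call; my_log_stats() is a hypothetical helper.
 */
static void my_log_stats(struct net_device *dev)
{
	struct rtnl_link_stats64 stats;

	dev_get_stats(dev, &stats);
	netdev_info(dev, "rx %llu pkts, tx %llu pkts, rx dropped %llu\n",
		    (unsigned long long)stats.rx_packets,
		    (unsigned long long)stats.tx_packets,
		    (unsigned long long)stats.rx_dropped);
}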
6622 
6623 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6624 {
6625 	struct netdev_queue *queue = dev_ingress_queue(dev);
6626 
6627 #ifdef CONFIG_NET_CLS_ACT
6628 	if (queue)
6629 		return queue;
6630 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6631 	if (!queue)
6632 		return NULL;
6633 	netdev_init_one_queue(dev, queue, NULL);
6634 	queue->qdisc = &noop_qdisc;
6635 	queue->qdisc_sleeping = &noop_qdisc;
6636 	rcu_assign_pointer(dev->ingress_queue, queue);
6637 #endif
6638 	return queue;
6639 }
6640 
6641 static const struct ethtool_ops default_ethtool_ops;
6642 
6643 void netdev_set_default_ethtool_ops(struct net_device *dev,
6644 				    const struct ethtool_ops *ops)
6645 {
6646 	if (dev->ethtool_ops == &default_ethtool_ops)
6647 		dev->ethtool_ops = ops;
6648 }
6649 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6650 
6651 void netdev_freemem(struct net_device *dev)
6652 {
6653 	char *addr = (char *)dev - dev->padded;
6654 
6655 	kvfree(addr);
6656 }
6657 
6658 /**
6659  *	alloc_netdev_mqs - allocate network device
6660  *	@sizeof_priv:		size of private data to allocate space for
6661  *	@name:			device name format string
6662  *	@name_assign_type: 	origin of device name
6663  *	@setup:			callback to initialize device
6664  *	@txqs:			the number of TX subqueues to allocate
6665  *	@rxqs:			the number of RX subqueues to allocate
6666  *
6667  *	Allocates a struct net_device with private data area for driver use
6668  *	and performs basic initialization.  Also allocates subqueue structs
6669  *	for each queue on the device.
6670  */
6671 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6672 		unsigned char name_assign_type,
6673 		void (*setup)(struct net_device *),
6674 		unsigned int txqs, unsigned int rxqs)
6675 {
6676 	struct net_device *dev;
6677 	size_t alloc_size;
6678 	struct net_device *p;
6679 
6680 	BUG_ON(strlen(name) >= sizeof(dev->name));
6681 
6682 	if (txqs < 1) {
6683 		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6684 		return NULL;
6685 	}
6686 
6687 #ifdef CONFIG_SYSFS
6688 	if (rxqs < 1) {
6689 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6690 		return NULL;
6691 	}
6692 #endif
6693 
6694 	alloc_size = sizeof(struct net_device);
6695 	if (sizeof_priv) {
6696 		/* ensure 32-byte alignment of private area */
6697 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6698 		alloc_size += sizeof_priv;
6699 	}
6700 	/* ensure 32-byte alignment of whole construct */
6701 	alloc_size += NETDEV_ALIGN - 1;
6702 
6703 	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6704 	if (!p)
6705 		p = vzalloc(alloc_size);
6706 	if (!p)
6707 		return NULL;
6708 
6709 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
6710 	dev->padded = (char *)dev - (char *)p;
6711 
6712 	dev->pcpu_refcnt = alloc_percpu(int);
6713 	if (!dev->pcpu_refcnt)
6714 		goto free_dev;
6715 
6716 	if (dev_addr_init(dev))
6717 		goto free_pcpu;
6718 
6719 	dev_mc_init(dev);
6720 	dev_uc_init(dev);
6721 
6722 	dev_net_set(dev, &init_net);
6723 
6724 	dev->gso_max_size = GSO_MAX_SIZE;
6725 	dev->gso_max_segs = GSO_MAX_SEGS;
6726 	dev->gso_min_segs = 0;
6727 
6728 	INIT_LIST_HEAD(&dev->napi_list);
6729 	INIT_LIST_HEAD(&dev->unreg_list);
6730 	INIT_LIST_HEAD(&dev->close_list);
6731 	INIT_LIST_HEAD(&dev->link_watch_list);
6732 	INIT_LIST_HEAD(&dev->adj_list.upper);
6733 	INIT_LIST_HEAD(&dev->adj_list.lower);
6734 	INIT_LIST_HEAD(&dev->all_adj_list.upper);
6735 	INIT_LIST_HEAD(&dev->all_adj_list.lower);
6736 	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6737 	setup(dev);
6738 
6739 	dev->num_tx_queues = txqs;
6740 	dev->real_num_tx_queues = txqs;
6741 	if (netif_alloc_netdev_queues(dev))
6742 		goto free_all;
6743 
6744 #ifdef CONFIG_SYSFS
6745 	dev->num_rx_queues = rxqs;
6746 	dev->real_num_rx_queues = rxqs;
6747 	if (netif_alloc_rx_queues(dev))
6748 		goto free_all;
6749 #endif
6750 
6751 	strcpy(dev->name, name);
6752 	dev->name_assign_type = name_assign_type;
6753 	dev->group = INIT_NETDEV_GROUP;
6754 	if (!dev->ethtool_ops)
6755 		dev->ethtool_ops = &default_ethtool_ops;
6756 	return dev;
6757 
6758 free_all:
6759 	free_netdev(dev);
6760 	return NULL;
6761 
6762 free_pcpu:
6763 	free_percpu(dev->pcpu_refcnt);
6764 free_dev:
6765 	netdev_freemem(dev);
6766 	return NULL;
6767 }
6768 EXPORT_SYMBOL(alloc_netdev_mqs);
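
/*
 * Illustrative sketch: allocating a multiqueue Ethernet-style device
 * directly with alloc_netdev_mqs() rather than via the alloc_etherdev*()
 * wrappers.  The queue counts, private struct and "myeth%d" template are
 * hypothetical; ether_setup() (from <linux/etherdevice.h>) is the usual
 * setup callback for Ethernet devices.
 */
struct my_mq_priv {
	unsigned int dummy;
};

static struct net_device *my_alloc_mq_netdev(unsigned int ntx,
					     unsigned int nrx)
{
	return alloc_netdev_mqs(sizeof(struct my_mq_priv), "myeth%d",
				NET_NAME_UNKNOWN, ether_setup, ntx, nrx);
}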
6769 
6770 /**
6771  *	free_netdev - free network device
6772  *	@dev: device
6773  *
6774  *	This function does the last stage of destroying an allocated device
6775  * 	interface. The reference to the device object is released.
6776  *	If this is the last reference then it will be freed.
6777  */
6778 void free_netdev(struct net_device *dev)
6779 {
6780 	struct napi_struct *p, *n;
6781 
6782 	release_net(dev_net(dev));
6783 
6784 	netif_free_tx_queues(dev);
6785 #ifdef CONFIG_SYSFS
6786 	kfree(dev->_rx);
6787 #endif
6788 
6789 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6790 
6791 	/* Flush device addresses */
6792 	dev_addr_flush(dev);
6793 
6794 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6795 		netif_napi_del(p);
6796 
6797 	free_percpu(dev->pcpu_refcnt);
6798 	dev->pcpu_refcnt = NULL;
6799 
6800 	/*  Compatibility with error handling in drivers */
6801 	if (dev->reg_state == NETREG_UNINITIALIZED) {
6802 		netdev_freemem(dev);
6803 		return;
6804 	}
6805 
6806 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6807 	dev->reg_state = NETREG_RELEASED;
6808 
6809 	/* will free via device release */
6810 	put_device(&dev->dev);
6811 }
6812 EXPORT_SYMBOL(free_netdev);
6813 
6814 /**
6815  *	synchronize_net -  Synchronize with packet receive processing
6816  *
6817  *	Wait for packets currently being received to be done.
6818  *	Does not block later packets from starting.
6819  */
6820 void synchronize_net(void)
6821 {
6822 	might_sleep();
6823 	if (rtnl_is_locked())
6824 		synchronize_rcu_expedited();
6825 	else
6826 		synchronize_rcu();
6827 }
6828 EXPORT_SYMBOL(synchronize_net);
6829 
6830 /**
6831  *	unregister_netdevice_queue - remove device from the kernel
6832  *	@dev: device
6833  *	@head: list
6834  *
6835  *	This function shuts down a device interface and removes it
6836  *	from the kernel tables.
6837  *	If @head is not NULL, the device is queued to be unregistered later.
6838  *
6839  *	Callers must hold the rtnl semaphore.  You may want
6840  *	unregister_netdev() instead of this.
6841  */
6842 
6843 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6844 {
6845 	ASSERT_RTNL();
6846 
6847 	if (head) {
6848 		list_move_tail(&dev->unreg_list, head);
6849 	} else {
6850 		rollback_registered(dev);
6851 		/* Finish processing unregister after unlock */
6852 		net_set_todo(dev);
6853 	}
6854 }
6855 EXPORT_SYMBOL(unregister_netdevice_queue);
6856 
6857 /**
6858  *	unregister_netdevice_many - unregister many devices
6859  *	@head: list of devices
6860  *
6861  *  Note: As most callers use a stack-allocated list_head,
6862  *  we force a list_del() to make sure the stack won't be corrupted later.
6863  */
6864 void unregister_netdevice_many(struct list_head *head)
6865 {
6866 	struct net_device *dev;
6867 
6868 	if (!list_empty(head)) {
6869 		rollback_registered_many(head);
6870 		list_for_each_entry(dev, head, unreg_list)
6871 			net_set_todo(dev);
6872 		list_del(head);
6873 	}
6874 }
6875 EXPORT_SYMBOL(unregister_netdevice_many);
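
/*
 * Illustrative sketch: batching unregistrations the way rtnetlink and the
 * namespace-exit code do, with the stack-allocated list_head the note
 * above refers to.  my_destroy_group() is a hypothetical helper that
 * tears down every device in a given netdev group.
 */
static void my_destroy_group(struct net *net, int group)
{
	struct net_device *dev, *aux;
	LIST_HEAD(kill_list);

	ASSERT_RTNL();

	for_each_netdev_safe(net, dev, aux)
		if (dev->group == group)
			unregister_netdevice_queue(dev, &kill_list);

	/* One notifier/synchronize pass for the whole batch, then list_del(). */
	unregister_netdevice_many(&kill_list);
}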
6876 
6877 /**
6878  *	unregister_netdev - remove device from the kernel
6879  *	@dev: device
6880  *
6881  *	This function shuts down a device interface and removes it
6882  *	from the kernel tables.
6883  *
6884  *	This is just a wrapper for unregister_netdevice that takes
6885  *	the rtnl semaphore.  In general you want to use this and not
6886  *	unregister_netdevice.
6887  */
6888 void unregister_netdev(struct net_device *dev)
6889 {
6890 	rtnl_lock();
6891 	unregister_netdevice(dev);
6892 	rtnl_unlock();
6893 }
6894 EXPORT_SYMBOL(unregister_netdev);
6895 
6896 /**
6897  *	dev_change_net_namespace - move device to a different network namespace
6898  *	@dev: device
6899  *	@net: network namespace
6900  *	@pat: If not NULL name pattern to try if the current device name
6901  *	      is already taken in the destination network namespace.
6902  *
6903  *	This function shuts down a device interface and moves it
6904  *	to a new network namespace. On success 0 is returned, on
6905  *	a failure a negative errno code is returned.
6906  *
6907  *	Callers must hold the rtnl semaphore.
6908  */
6909 
6910 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6911 {
6912 	int err;
6913 
6914 	ASSERT_RTNL();
6915 
6916 	/* Don't allow namespace local devices to be moved. */
6917 	err = -EINVAL;
6918 	if (dev->features & NETIF_F_NETNS_LOCAL)
6919 		goto out;
6920 
6921 	/* Ensure the device has been registered */
6922 	if (dev->reg_state != NETREG_REGISTERED)
6923 		goto out;
6924 
6925 	/* Get out if there is nothing to do */
6926 	err = 0;
6927 	if (net_eq(dev_net(dev), net))
6928 		goto out;
6929 
6930 	/* Pick the destination device name, and ensure
6931 	 * we can use it in the destination network namespace.
6932 	 */
6933 	err = -EEXIST;
6934 	if (__dev_get_by_name(net, dev->name)) {
6935 		/* We get here if we can't use the current device name */
6936 		if (!pat)
6937 			goto out;
6938 		if (dev_get_valid_name(net, dev, pat) < 0)
6939 			goto out;
6940 	}
6941 
6942 	/*
6943 	 * And now a mini version of register_netdevice and unregister_netdevice.
6944 	 */
6945 
6946 	/* If the device is running, close it first. */
6947 	dev_close(dev);
6948 
6949 	/* And unlink it from device chain */
6950 	err = -ENODEV;
6951 	unlist_netdevice(dev);
6952 
6953 	synchronize_net();
6954 
6955 	/* Shutdown queueing discipline. */
6956 	dev_shutdown(dev);
6957 
6958 	/* Notify protocols that we are about to destroy
6959 	   this device. They should clean all the things.
6960 
6961 	   Note that dev->reg_state stays at NETREG_REGISTERED.
6962 	   This is wanted so that 8021q and macvlan know
6963 	   the device is just moving and can keep their slaves up.
6964 	*/
6965 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6966 	rcu_barrier();
6967 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6968 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6969 
6970 	/*
6971 	 *	Flush the unicast and multicast chains
6972 	 */
6973 	dev_uc_flush(dev);
6974 	dev_mc_flush(dev);
6975 
6976 	/* Send a netdev-removed uevent to the old namespace */
6977 	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6978 	netdev_adjacent_del_links(dev);
6979 
6980 	/* Actually switch the network namespace */
6981 	dev_net_set(dev, net);
6982 
6983 	/* If there is an ifindex conflict assign a new one */
6984 	if (__dev_get_by_index(net, dev->ifindex)) {
6985 		int iflink = (dev->iflink == dev->ifindex);
6986 		dev->ifindex = dev_new_index(net);
6987 		if (iflink)
6988 			dev->iflink = dev->ifindex;
6989 	}
6990 
6991 	/* Send a netdev-add uevent to the new namespace */
6992 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6993 	netdev_adjacent_add_links(dev);
6994 
6995 	/* Fixup kobjects */
6996 	err = device_rename(&dev->dev, dev->name);
6997 	WARN_ON(err);
6998 
6999 	/* Add the device back in the hashes */
7000 	list_netdevice(dev);
7001 
7002 	/* Notify protocols that a new device appeared. */
7003 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
7004 
7005 	/*
7006 	 *	Prevent userspace races by waiting until the network
7007 	 *	device is fully set up before sending notifications.
7008 	 */
7009 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7010 
7011 	synchronize_net();
7012 	err = 0;
7013 out:
7014 	return err;
7015 }
7016 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
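
/*
 * Illustrative sketch: moving a device into a namespace identified by an
 * open netns file descriptor, similar to what the RTM_SETLINK path does.
 * my_move_dev_to_ns_fd() is a hypothetical wrapper; get_net_ns_by_fd(),
 * put_net() and dev_change_net_namespace() are real APIs and RTNL must
 * already be held by the caller.
 */
static int my_move_dev_to_ns_fd(struct net_device *dev, int netns_fd)
{
	struct net *net;
	int err;

	net = get_net_ns_by_fd(netns_fd);
	if (IS_ERR(net))
		return PTR_ERR(net);

	ASSERT_RTNL();
	/* Fall back to a "dev%d" style name if the current one is taken. */
	err = dev_change_net_namespace(dev, net, "dev%d");

	put_net(net);
	return err;
}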
7017 
7018 static int dev_cpu_callback(struct notifier_block *nfb,
7019 			    unsigned long action,
7020 			    void *ocpu)
7021 {
7022 	struct sk_buff **list_skb;
7023 	struct sk_buff *skb;
7024 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
7025 	struct softnet_data *sd, *oldsd;
7026 
7027 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7028 		return NOTIFY_OK;
7029 
7030 	local_irq_disable();
7031 	cpu = smp_processor_id();
7032 	sd = &per_cpu(softnet_data, cpu);
7033 	oldsd = &per_cpu(softnet_data, oldcpu);
7034 
7035 	/* Find end of our completion_queue. */
7036 	list_skb = &sd->completion_queue;
7037 	while (*list_skb)
7038 		list_skb = &(*list_skb)->next;
7039 	/* Append completion queue from offline CPU. */
7040 	*list_skb = oldsd->completion_queue;
7041 	oldsd->completion_queue = NULL;
7042 
7043 	/* Append output queue from offline CPU. */
7044 	if (oldsd->output_queue) {
7045 		*sd->output_queue_tailp = oldsd->output_queue;
7046 		sd->output_queue_tailp = oldsd->output_queue_tailp;
7047 		oldsd->output_queue = NULL;
7048 		oldsd->output_queue_tailp = &oldsd->output_queue;
7049 	}
7050 	/* Append NAPI poll list from offline CPU. */
7051 	if (!list_empty(&oldsd->poll_list)) {
7052 		list_splice_init(&oldsd->poll_list, &sd->poll_list);
7053 		raise_softirq_irqoff(NET_RX_SOFTIRQ);
7054 	}
7055 
7056 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
7057 	local_irq_enable();
7058 
7059 	/* Process offline CPU's input_pkt_queue */
7060 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7061 		netif_rx_internal(skb);
7062 		input_queue_head_incr(oldsd);
7063 	}
7064 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
7065 		netif_rx_internal(skb);
7066 		input_queue_head_incr(oldsd);
7067 	}
7068 
7069 	return NOTIFY_OK;
7070 }
7071 
7072 
7073 /**
7074  *	netdev_increment_features - increment feature set by one
7075  *	@all: current feature set
7076  *	@one: new feature set
7077  *	@mask: mask feature set
7078  *
7079  *	Computes a new feature set after adding a device with feature set
7080  *	@one to the master device with current feature set @all.  Will not
7081  *	enable anything that is off in @mask. Returns the new feature set.
7082  */
7083 netdev_features_t netdev_increment_features(netdev_features_t all,
7084 	netdev_features_t one, netdev_features_t mask)
7085 {
7086 	if (mask & NETIF_F_GEN_CSUM)
7087 		mask |= NETIF_F_ALL_CSUM;
7088 	mask |= NETIF_F_VLAN_CHALLENGED;
7089 
7090 	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7091 	all &= one | ~NETIF_F_ALL_FOR_ALL;
7092 
7093 	/* If one device supports hw checksumming, set for all. */
7094 	if (all & NETIF_F_GEN_CSUM)
7095 		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7096 
7097 	return all;
7098 }
7099 EXPORT_SYMBOL(netdev_increment_features);
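
/*
 * Illustrative sketch: how an upper device (bond, bridge, team) typically
 * folds its lower devices' feature sets together with the helper above,
 * in the spirit of the bridge's feature recomputation.
 * my_recompute_features() and the choice of mask are hypothetical
 * simplifications; netdev_for_each_lower_dev() is the real iterator.
 */
static netdev_features_t my_recompute_features(struct net_device *master)
{
	struct net_device *lower;
	struct list_head *iter;
	netdev_features_t mask = master->features;
	netdev_features_t features = mask & ~NETIF_F_ONE_FOR_ALL;

	netdev_for_each_lower_dev(master, lower, iter)
		features = netdev_increment_features(features,
						     lower->features, mask);

	return features;	/* e.g. fed back via .ndo_fix_features */
}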
7100 
7101 static struct hlist_head * __net_init netdev_create_hash(void)
7102 {
7103 	int i;
7104 	struct hlist_head *hash;
7105 
7106 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7107 	if (hash != NULL)
7108 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
7109 			INIT_HLIST_HEAD(&hash[i]);
7110 
7111 	return hash;
7112 }
7113 
7114 /* Initialize per network namespace state */
7115 static int __net_init netdev_init(struct net *net)
7116 {
7117 	if (net != &init_net)
7118 		INIT_LIST_HEAD(&net->dev_base_head);
7119 
7120 	net->dev_name_head = netdev_create_hash();
7121 	if (net->dev_name_head == NULL)
7122 		goto err_name;
7123 
7124 	net->dev_index_head = netdev_create_hash();
7125 	if (net->dev_index_head == NULL)
7126 		goto err_idx;
7127 
7128 	return 0;
7129 
7130 err_idx:
7131 	kfree(net->dev_name_head);
7132 err_name:
7133 	return -ENOMEM;
7134 }
7135 
7136 /**
7137  *	netdev_drivername - network driver for the device
7138  *	@dev: network device
7139  *
7140  *	Determine network driver for device.
7141  */
7142 const char *netdev_drivername(const struct net_device *dev)
7143 {
7144 	const struct device_driver *driver;
7145 	const struct device *parent;
7146 	const char *empty = "";
7147 
7148 	parent = dev->dev.parent;
7149 	if (!parent)
7150 		return empty;
7151 
7152 	driver = parent->driver;
7153 	if (driver && driver->name)
7154 		return driver->name;
7155 	return empty;
7156 }
7157 
7158 static void __netdev_printk(const char *level, const struct net_device *dev,
7159 			    struct va_format *vaf)
7160 {
7161 	if (dev && dev->dev.parent) {
7162 		dev_printk_emit(level[1] - '0',
7163 				dev->dev.parent,
7164 				"%s %s %s%s: %pV",
7165 				dev_driver_string(dev->dev.parent),
7166 				dev_name(dev->dev.parent),
7167 				netdev_name(dev), netdev_reg_state(dev),
7168 				vaf);
7169 	} else if (dev) {
7170 		printk("%s%s%s: %pV",
7171 		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
7172 	} else {
7173 		printk("%s(NULL net_device): %pV", level, vaf);
7174 	}
7175 }
7176 
7177 void netdev_printk(const char *level, const struct net_device *dev,
7178 		   const char *format, ...)
7179 {
7180 	struct va_format vaf;
7181 	va_list args;
7182 
7183 	va_start(args, format);
7184 
7185 	vaf.fmt = format;
7186 	vaf.va = &args;
7187 
7188 	__netdev_printk(level, dev, &vaf);
7189 
7190 	va_end(args);
7191 }
7192 EXPORT_SYMBOL(netdev_printk);
7193 
7194 #define define_netdev_printk_level(func, level)			\
7195 void func(const struct net_device *dev, const char *fmt, ...)	\
7196 {								\
7197 	struct va_format vaf;					\
7198 	va_list args;						\
7199 								\
7200 	va_start(args, fmt);					\
7201 								\
7202 	vaf.fmt = fmt;						\
7203 	vaf.va = &args;						\
7204 								\
7205 	__netdev_printk(level, dev, &vaf);			\
7206 								\
7207 	va_end(args);						\
7208 }								\
7209 EXPORT_SYMBOL(func);
7210 
7211 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7212 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7213 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7214 define_netdev_printk_level(netdev_err, KERN_ERR);
7215 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7216 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7217 define_netdev_printk_level(netdev_info, KERN_INFO);
7218 
7219 static void __net_exit netdev_exit(struct net *net)
7220 {
7221 	kfree(net->dev_name_head);
7222 	kfree(net->dev_index_head);
7223 }
7224 
7225 static struct pernet_operations __net_initdata netdev_net_ops = {
7226 	.init = netdev_init,
7227 	.exit = netdev_exit,
7228 };
7229 
7230 static void __net_exit default_device_exit(struct net *net)
7231 {
7232 	struct net_device *dev, *aux;
7233 	/*
7234 	 * Push all migratable network devices back to the
7235 	 * initial network namespace
7236 	 */
7237 	rtnl_lock();
7238 	for_each_netdev_safe(net, dev, aux) {
7239 		int err;
7240 		char fb_name[IFNAMSIZ];
7241 
7242 		/* Ignore unmoveable devices (i.e. loopback) */
7243 		if (dev->features & NETIF_F_NETNS_LOCAL)
7244 			continue;
7245 
7246 		/* Leave virtual devices for the generic cleanup */
7247 		if (dev->rtnl_link_ops)
7248 			continue;
7249 
7250 		/* Push remaining network devices to init_net */
7251 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7252 		err = dev_change_net_namespace(dev, &init_net, fb_name);
7253 		if (err) {
7254 			pr_emerg("%s: failed to move %s to init_net: %d\n",
7255 				 __func__, dev->name, err);
7256 			BUG();
7257 		}
7258 	}
7259 	rtnl_unlock();
7260 }
7261 
7262 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7263 {
7264 	/* Return with the rtnl_lock held when there are no network
7265 	 * devices unregistering in any network namespace in net_list.
7266 	 */
7267 	struct net *net;
7268 	bool unregistering;
7269 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
7270 
7271 	add_wait_queue(&netdev_unregistering_wq, &wait);
7272 	for (;;) {
7273 		unregistering = false;
7274 		rtnl_lock();
7275 		list_for_each_entry(net, net_list, exit_list) {
7276 			if (net->dev_unreg_count > 0) {
7277 				unregistering = true;
7278 				break;
7279 			}
7280 		}
7281 		if (!unregistering)
7282 			break;
7283 		__rtnl_unlock();
7284 
7285 		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7286 	}
7287 	remove_wait_queue(&netdev_unregistering_wq, &wait);
7288 }
7289 
7290 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7291 {
7292 	/* At exit all network devices must be removed from a network
7293 	 * namespace.  Do this in the reverse order of registration.
7294 	 * Do this across as many network namespaces as possible to
7295 	 * improve batching efficiency.
7296 	 */
7297 	struct net_device *dev;
7298 	struct net *net;
7299 	LIST_HEAD(dev_kill_list);
7300 
7301 	/* To prevent network device cleanup code from dereferencing
7302 	 * loopback devices or network devices that have been freed,
7303 	 * wait here for all pending unregistrations to complete
7304 	 * before unregistering the loopback device and allowing the
7305 	 * network namespace to be freed.
7306 	 *
7307 	 * The netdev todo list containing all network devices
7308 	 * unregistrations that happen in default_device_exit_batch
7309 	 * will run in the rtnl_unlock() at the end of
7310 	 * default_device_exit_batch.
7311 	 */
7312 	rtnl_lock_unregistering(net_list);
7313 	list_for_each_entry(net, net_list, exit_list) {
7314 		for_each_netdev_reverse(net, dev) {
7315 			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7316 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7317 			else
7318 				unregister_netdevice_queue(dev, &dev_kill_list);
7319 		}
7320 	}
7321 	unregister_netdevice_many(&dev_kill_list);
7322 	rtnl_unlock();
7323 }
7324 
7325 static struct pernet_operations __net_initdata default_device_ops = {
7326 	.exit = default_device_exit,
7327 	.exit_batch = default_device_exit_batch,
7328 };
7329 
7330 /*
7331  *	Initialize the DEV module. At boot time this walks the device list and
7332  *	unhooks any devices that fail to initialise (normally hardware not
7333  *	present) and leaves us with a valid list of present and active devices.
7334  *
7335  */
7336 
7337 /*
7338  *       This is called single threaded during boot, so no need
7339  *       to take the rtnl semaphore.
7340  */
7341 static int __init net_dev_init(void)
7342 {
7343 	int i, rc = -ENOMEM;
7344 
7345 	BUG_ON(!dev_boot_phase);
7346 
7347 	if (dev_proc_init())
7348 		goto out;
7349 
7350 	if (netdev_kobject_init())
7351 		goto out;
7352 
7353 	INIT_LIST_HEAD(&ptype_all);
7354 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
7355 		INIT_LIST_HEAD(&ptype_base[i]);
7356 
7357 	INIT_LIST_HEAD(&offload_base);
7358 
7359 	if (register_pernet_subsys(&netdev_net_ops))
7360 		goto out;
7361 
7362 	/*
7363 	 *	Initialise the packet receive queues.
7364 	 */
7365 
7366 	for_each_possible_cpu(i) {
7367 		struct softnet_data *sd = &per_cpu(softnet_data, i);
7368 
7369 		skb_queue_head_init(&sd->input_pkt_queue);
7370 		skb_queue_head_init(&sd->process_queue);
7371 		INIT_LIST_HEAD(&sd->poll_list);
7372 		sd->output_queue_tailp = &sd->output_queue;
7373 #ifdef CONFIG_RPS
7374 		sd->csd.func = rps_trigger_softirq;
7375 		sd->csd.info = sd;
7376 		sd->cpu = i;
7377 #endif
7378 
7379 		sd->backlog.poll = process_backlog;
7380 		sd->backlog.weight = weight_p;
7381 	}
7382 
7383 	dev_boot_phase = 0;
7384 
7385 	/* The loopback device is special: if any other network device
7386 	 * is present in a network namespace, the loopback device must
7387 	 * be present. Since we now dynamically allocate and free the
7388 	 * loopback device, ensure this invariant is maintained by
7389 	 * keeping the loopback device as the first device on the
7390 	 * list of network devices.  This ensures the loopback device
7391 	 * is the first device that appears and the last network device
7392 	 * that disappears.
7393 	 */
7394 	if (register_pernet_device(&loopback_net_ops))
7395 		goto out;
7396 
7397 	if (register_pernet_device(&default_device_ops))
7398 		goto out;
7399 
7400 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7401 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7402 
7403 	hotcpu_notifier(dev_cpu_callback, 0);
7404 	dst_init();
7405 	rc = 0;
7406 out:
7407 	return rc;
7408 }
7409 
7410 subsys_initcall(net_dev_init);
7411