xref: /openbmc/linux/net/core/dev.c (revision 930beb5a)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/pkt_sched.h>
103 #include <net/checksum.h>
104 #include <net/xfrm.h>
105 #include <linux/highmem.h>
106 #include <linux/init.h>
107 #include <linux/module.h>
108 #include <linux/netpoll.h>
109 #include <linux/rcupdate.h>
110 #include <linux/delay.h>
111 #include <net/iw_handler.h>
112 #include <asm/current.h>
113 #include <linux/audit.h>
114 #include <linux/dmaengine.h>
115 #include <linux/err.h>
116 #include <linux/ctype.h>
117 #include <linux/if_arp.h>
118 #include <linux/if_vlan.h>
119 #include <linux/ip.h>
120 #include <net/ip.h>
121 #include <linux/ipv6.h>
122 #include <linux/in.h>
123 #include <linux/jhash.h>
124 #include <linux/random.h>
125 #include <trace/events/napi.h>
126 #include <trace/events/net.h>
127 #include <trace/events/skb.h>
128 #include <linux/pci.h>
129 #include <linux/inetdevice.h>
130 #include <linux/cpu_rmap.h>
131 #include <linux/static_key.h>
132 #include <linux/hashtable.h>
133 #include <linux/vmalloc.h>
134 #include <linux/if_macvlan.h>
135 
136 #include "net-sysfs.h"
137 
138 /* Instead of increasing this, you should create a hash table. */
139 #define MAX_GRO_SKBS 8
140 
141 /* This should be increased if a protocol with a bigger head is added. */
142 #define GRO_MAX_HEAD (MAX_HEADER + 128)
143 
144 static DEFINE_SPINLOCK(ptype_lock);
145 static DEFINE_SPINLOCK(offload_lock);
146 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
147 struct list_head ptype_all __read_mostly;	/* Taps */
148 static struct list_head offload_base __read_mostly;
149 
150 /*
151  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
152  * semaphore.
153  *
154  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
155  *
156  * Writers must hold the rtnl semaphore while they loop through the
157  * dev_base_head list, and hold dev_base_lock for writing when they do the
158  * actual updates.  This allows pure readers to access the list even
159  * while a writer is preparing to update it.
160  *
161  * To put it another way, dev_base_lock is held for writing only to
162  * protect against pure readers; the rtnl semaphore provides the
163  * protection against other writers.
164  *
165  * See, for example usages, register_netdevice() and
166  * unregister_netdevice(), which must be called with the rtnl
167  * semaphore held.
168  */
169 DEFINE_RWLOCK(dev_base_lock);
170 EXPORT_SYMBOL(dev_base_lock);
171 
172 /* protects napi_hash addition/deletion and napi_gen_id */
173 static DEFINE_SPINLOCK(napi_hash_lock);
174 
175 static unsigned int napi_gen_id;
176 static DEFINE_HASHTABLE(napi_hash, 8);
177 
178 static seqcount_t devnet_rename_seq;
179 
180 static inline void dev_base_seq_inc(struct net *net)
181 {
182 	while (++net->dev_base_seq == 0);
183 }
184 
185 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
186 {
187 	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
188 
189 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
190 }
191 
192 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
193 {
194 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
195 }
196 
197 static inline void rps_lock(struct softnet_data *sd)
198 {
199 #ifdef CONFIG_RPS
200 	spin_lock(&sd->input_pkt_queue.lock);
201 #endif
202 }
203 
204 static inline void rps_unlock(struct softnet_data *sd)
205 {
206 #ifdef CONFIG_RPS
207 	spin_unlock(&sd->input_pkt_queue.lock);
208 #endif
209 }
210 
211 /* Device list insertion */
212 static void list_netdevice(struct net_device *dev)
213 {
214 	struct net *net = dev_net(dev);
215 
216 	ASSERT_RTNL();
217 
218 	write_lock_bh(&dev_base_lock);
219 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
220 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
221 	hlist_add_head_rcu(&dev->index_hlist,
222 			   dev_index_hash(net, dev->ifindex));
223 	write_unlock_bh(&dev_base_lock);
224 
225 	dev_base_seq_inc(net);
226 }
227 
228 /* Device list removal
229  * caller must respect a RCU grace period before freeing/reusing dev
230  */
231 static void unlist_netdevice(struct net_device *dev)
232 {
233 	ASSERT_RTNL();
234 
235 	/* Unlink dev from the device chain */
236 	write_lock_bh(&dev_base_lock);
237 	list_del_rcu(&dev->dev_list);
238 	hlist_del_rcu(&dev->name_hlist);
239 	hlist_del_rcu(&dev->index_hlist);
240 	write_unlock_bh(&dev_base_lock);
241 
242 	dev_base_seq_inc(dev_net(dev));
243 }
244 
245 /*
246  *	Our notifier list
247  */
248 
249 static RAW_NOTIFIER_HEAD(netdev_chain);
250 
251 /*
252  *	Device drivers call our routines to queue packets here. We empty the
253  *	queue in the local softnet handler.
254  */
255 
256 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
257 EXPORT_PER_CPU_SYMBOL(softnet_data);
258 
259 #ifdef CONFIG_LOCKDEP
260 /*
261  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
262  * according to dev->type
263  */
264 static const unsigned short netdev_lock_type[] =
265 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
266 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
267 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
268 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
269 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
270 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
271 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
272 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
273 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
274 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
275 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
276 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
277 	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
278 	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
279 	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
280 
281 static const char *const netdev_lock_name[] =
282 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
283 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
284 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
285 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
286 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
287 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
288 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
289 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
290 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
291 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
292 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
293 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
294 	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
295 	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
296 	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
297 
298 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
299 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
300 
301 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
302 {
303 	int i;
304 
305 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
306 		if (netdev_lock_type[i] == dev_type)
307 			return i;
308 	/* the last key is used by default */
309 	return ARRAY_SIZE(netdev_lock_type) - 1;
310 }
311 
312 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
313 						 unsigned short dev_type)
314 {
315 	int i;
316 
317 	i = netdev_lock_pos(dev_type);
318 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
319 				   netdev_lock_name[i]);
320 }
321 
322 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
323 {
324 	int i;
325 
326 	i = netdev_lock_pos(dev->type);
327 	lockdep_set_class_and_name(&dev->addr_list_lock,
328 				   &netdev_addr_lock_key[i],
329 				   netdev_lock_name[i]);
330 }
331 #else
332 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
333 						 unsigned short dev_type)
334 {
335 }
336 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
337 {
338 }
339 #endif
340 
341 /*******************************************************************************
342 
343 		Protocol management and registration routines
344 
345 *******************************************************************************/
346 
347 /*
348  *	Add a protocol ID to the list. Now that the input handler is
349  *	smarter we can dispense with all the messy stuff that used to be
350  *	here.
351  *
352  *	BEWARE!!! Protocol handlers, mangling input packets,
353  *	MUST BE last in hash buckets and checking protocol handlers
354  *	MUST start from promiscuous ptype_all chain in net_bh.
355  *	It is true now, do not change it.
356  *	Explanation follows: if protocol handler, mangling packet, will
357  *	be the first on list, it is not able to sense, that packet
358  *	is cloned and should be copied-on-write, so that it will
359  *	change it and subsequent readers will get broken packet.
360  *							--ANK (980803)
361  */
362 
363 static inline struct list_head *ptype_head(const struct packet_type *pt)
364 {
365 	if (pt->type == htons(ETH_P_ALL))
366 		return &ptype_all;
367 	else
368 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
369 }
370 
371 /**
372  *	dev_add_pack - add packet handler
373  *	@pt: packet type declaration
374  *
375  *	Add a protocol handler to the networking stack. The passed &packet_type
376  *	is linked into kernel lists and may not be freed until it has been
377  *	removed from the kernel lists.
378  *
379  *	This call does not sleep therefore it can not
380  *	guarantee all CPU's that are in middle of receiving packets
381  *	will see the new packet type (until the next received packet).
382  */
383 
384 void dev_add_pack(struct packet_type *pt)
385 {
386 	struct list_head *head = ptype_head(pt);
387 
388 	spin_lock(&ptype_lock);
389 	list_add_rcu(&pt->list, head);
390 	spin_unlock(&ptype_lock);
391 }
392 EXPORT_SYMBOL(dev_add_pack);
393 
394 /**
395  *	__dev_remove_pack	 - remove packet handler
396  *	@pt: packet type declaration
397  *
398  *	Remove a protocol handler that was previously added to the kernel
399  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
400  *	from the kernel lists and can be freed or reused once this function
401  *	returns.
402  *
403  *      The packet type might still be in use by receivers
404  *	and must not be freed until after all the CPU's have gone
405  *	through a quiescent state.
406  */
407 void __dev_remove_pack(struct packet_type *pt)
408 {
409 	struct list_head *head = ptype_head(pt);
410 	struct packet_type *pt1;
411 
412 	spin_lock(&ptype_lock);
413 
414 	list_for_each_entry(pt1, head, list) {
415 		if (pt == pt1) {
416 			list_del_rcu(&pt->list);
417 			goto out;
418 		}
419 	}
420 
421 	pr_warn("dev_remove_pack: %p not found\n", pt);
422 out:
423 	spin_unlock(&ptype_lock);
424 }
425 EXPORT_SYMBOL(__dev_remove_pack);
426 
427 /**
428  *	dev_remove_pack	 - remove packet handler
429  *	@pt: packet type declaration
430  *
431  *	Remove a protocol handler that was previously added to the kernel
432  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
433  *	from the kernel lists and can be freed or reused once this function
434  *	returns.
435  *
436  *	This call sleeps to guarantee that no CPU is looking at the packet
437  *	type after return.
438  */
439 void dev_remove_pack(struct packet_type *pt)
440 {
441 	__dev_remove_pack(pt);
442 
443 	synchronize_net();
444 }
445 EXPORT_SYMBOL(dev_remove_pack);
446 
447 
448 /**
449  *	dev_add_offload - register offload handlers
450  *	@po: protocol offload declaration
451  *
452  *	Add protocol offload handlers to the networking stack. The passed
453  *	&proto_offload is linked into kernel lists and may not be freed until
454  *	it has been removed from the kernel lists.
455  *
456  *	This call does not sleep therefore it can not
457  *	guarantee all CPU's that are in middle of receiving packets
458  *	will see the new offload handlers (until the next received packet).
459  */
460 void dev_add_offload(struct packet_offload *po)
461 {
462 	struct list_head *head = &offload_base;
463 
464 	spin_lock(&offload_lock);
465 	list_add_rcu(&po->list, head);
466 	spin_unlock(&offload_lock);
467 }
468 EXPORT_SYMBOL(dev_add_offload);
469 
470 /**
471  *	__dev_remove_offload	 - remove offload handler
472  *	@po: packet offload declaration
473  *
474  *	Remove a protocol offload handler that was previously added to the
475  *	kernel offload handlers by dev_add_offload(). The passed &offload_type
476  *	is removed from the kernel lists and can be freed or reused once this
477  *	function returns.
478  *
479  *      The packet type might still be in use by receivers
480  *	and must not be freed until after all the CPU's have gone
481  *	through a quiescent state.
482  */
483 void __dev_remove_offload(struct packet_offload *po)
484 {
485 	struct list_head *head = &offload_base;
486 	struct packet_offload *po1;
487 
488 	spin_lock(&offload_lock);
489 
490 	list_for_each_entry(po1, head, list) {
491 		if (po == po1) {
492 			list_del_rcu(&po->list);
493 			goto out;
494 		}
495 	}
496 
497 	pr_warn("dev_remove_offload: %p not found\n", po);
498 out:
499 	spin_unlock(&offload_lock);
500 }
501 EXPORT_SYMBOL(__dev_remove_offload);
502 
503 /**
504  *	dev_remove_offload	 - remove packet offload handler
505  *	@po: packet offload declaration
506  *
507  *	Remove a packet offload handler that was previously added to the kernel
508  *	offload handlers by dev_add_offload(). The passed &offload_type is
509  *	removed from the kernel lists and can be freed or reused once this
510  *	function returns.
511  *
512  *	This call sleeps to guarantee that no CPU is looking at the packet
513  *	type after return.
514  */
515 void dev_remove_offload(struct packet_offload *po)
516 {
517 	__dev_remove_offload(po);
518 
519 	synchronize_net();
520 }
521 EXPORT_SYMBOL(dev_remove_offload);
522 
523 /******************************************************************************
524 
525 		      Device Boot-time Settings Routines
526 
527 *******************************************************************************/
528 
529 /* Boot time configuration table */
530 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
531 
532 /**
533  *	netdev_boot_setup_add	- add new setup entry
534  *	@name: name of the device
535  *	@map: configured settings for the device
536  *
537  *	Adds new setup entry to the dev_boot_setup list.  The function
538  *	returns 0 on error and 1 on success.  This is a generic routine to
539  *	all netdevices.
540  */
541 static int netdev_boot_setup_add(char *name, struct ifmap *map)
542 {
543 	struct netdev_boot_setup *s;
544 	int i;
545 
546 	s = dev_boot_setup;
547 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
548 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
549 			memset(s[i].name, 0, sizeof(s[i].name));
550 			strlcpy(s[i].name, name, IFNAMSIZ);
551 			memcpy(&s[i].map, map, sizeof(s[i].map));
552 			break;
553 		}
554 	}
555 
556 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
557 }
558 
559 /**
560  *	netdev_boot_setup_check	- check boot time settings
561  *	@dev: the netdevice
562  *
563  * 	Check boot time settings for the device.
564  *	The found settings are set for the device to be used
565  *	later in the device probing.
566  *	Returns 0 if no settings found, 1 if they are.
567  */
568 int netdev_boot_setup_check(struct net_device *dev)
569 {
570 	struct netdev_boot_setup *s = dev_boot_setup;
571 	int i;
572 
573 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
574 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
575 		    !strcmp(dev->name, s[i].name)) {
576 			dev->irq 	= s[i].map.irq;
577 			dev->base_addr 	= s[i].map.base_addr;
578 			dev->mem_start 	= s[i].map.mem_start;
579 			dev->mem_end 	= s[i].map.mem_end;
580 			return 1;
581 		}
582 	}
583 	return 0;
584 }
585 EXPORT_SYMBOL(netdev_boot_setup_check);
586 
587 
588 /**
589  *	netdev_boot_base	- get address from boot time settings
590  *	@prefix: prefix for network device
591  *	@unit: id for network device
592  *
593  * 	Check boot time settings for the base address of device.
594  *	The found settings are set for the device to be used
595  *	later in the device probing.
596  *	Returns 0 if no settings found.
597  */
598 unsigned long netdev_boot_base(const char *prefix, int unit)
599 {
600 	const struct netdev_boot_setup *s = dev_boot_setup;
601 	char name[IFNAMSIZ];
602 	int i;
603 
604 	sprintf(name, "%s%d", prefix, unit);
605 
606 	/*
607 	 * If device already registered then return base of 1
608 	 * to indicate not to probe for this interface
609 	 */
610 	if (__dev_get_by_name(&init_net, name))
611 		return 1;
612 
613 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
614 		if (!strcmp(name, s[i].name))
615 			return s[i].map.base_addr;
616 	return 0;
617 }
618 
619 /*
620  * Saves at boot time configured settings for any netdevice.
621  */
622 int __init netdev_boot_setup(char *str)
623 {
624 	int ints[5];
625 	struct ifmap map;
626 
627 	str = get_options(str, ARRAY_SIZE(ints), ints);
628 	if (!str || !*str)
629 		return 0;
630 
631 	/* Save settings */
632 	memset(&map, 0, sizeof(map));
633 	if (ints[0] > 0)
634 		map.irq = ints[1];
635 	if (ints[0] > 1)
636 		map.base_addr = ints[2];
637 	if (ints[0] > 2)
638 		map.mem_start = ints[3];
639 	if (ints[0] > 3)
640 		map.mem_end = ints[4];
641 
642 	/* Add new entry to the list */
643 	return netdev_boot_setup_add(str, &map);
644 }
645 
646 __setup("netdev=", netdev_boot_setup);
647 
648 /*******************************************************************************
649 
650 			    Device Interface Subroutines
651 
652 *******************************************************************************/
653 
654 /**
655  *	__dev_get_by_name	- find a device by its name
656  *	@net: the applicable net namespace
657  *	@name: name to find
658  *
659  *	Find an interface by name. Must be called under RTNL semaphore
660  *	or @dev_base_lock. If the name is found a pointer to the device
661  *	is returned. If the name is not found then %NULL is returned. The
662  *	reference counters are not incremented so the caller must be
663  *	careful with locks.
664  */
665 
666 struct net_device *__dev_get_by_name(struct net *net, const char *name)
667 {
668 	struct net_device *dev;
669 	struct hlist_head *head = dev_name_hash(net, name);
670 
671 	hlist_for_each_entry(dev, head, name_hlist)
672 		if (!strncmp(dev->name, name, IFNAMSIZ))
673 			return dev;
674 
675 	return NULL;
676 }
677 EXPORT_SYMBOL(__dev_get_by_name);
678 
679 /**
680  *	dev_get_by_name_rcu	- find a device by its name
681  *	@net: the applicable net namespace
682  *	@name: name to find
683  *
684  *	Find an interface by name.
685  *	If the name is found a pointer to the device is returned.
686  * 	If the name is not found then %NULL is returned.
687  *	The reference counters are not incremented so the caller must be
688  *	careful with locks. The caller must hold RCU lock.
689  */
690 
691 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
692 {
693 	struct net_device *dev;
694 	struct hlist_head *head = dev_name_hash(net, name);
695 
696 	hlist_for_each_entry_rcu(dev, head, name_hlist)
697 		if (!strncmp(dev->name, name, IFNAMSIZ))
698 			return dev;
699 
700 	return NULL;
701 }
702 EXPORT_SYMBOL(dev_get_by_name_rcu);
703 
704 /**
705  *	dev_get_by_name		- find a device by its name
706  *	@net: the applicable net namespace
707  *	@name: name to find
708  *
709  *	Find an interface by name. This can be called from any
710  *	context and does its own locking. The returned handle has
711  *	the usage count incremented and the caller must use dev_put() to
712  *	release it when it is no longer needed. %NULL is returned if no
713  *	matching device is found.
714  */
715 
716 struct net_device *dev_get_by_name(struct net *net, const char *name)
717 {
718 	struct net_device *dev;
719 
720 	rcu_read_lock();
721 	dev = dev_get_by_name_rcu(net, name);
722 	if (dev)
723 		dev_hold(dev);
724 	rcu_read_unlock();
725 	return dev;
726 }
727 EXPORT_SYMBOL(dev_get_by_name);
728 
729 /**
730  *	__dev_get_by_index - find a device by its ifindex
731  *	@net: the applicable net namespace
732  *	@ifindex: index of device
733  *
734  *	Search for an interface by index. Returns %NULL if the device
735  *	is not found or a pointer to the device. The device has not
736  *	had its reference counter increased so the caller must be careful
737  *	about locking. The caller must hold either the RTNL semaphore
738  *	or @dev_base_lock.
739  */
740 
741 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
742 {
743 	struct net_device *dev;
744 	struct hlist_head *head = dev_index_hash(net, ifindex);
745 
746 	hlist_for_each_entry(dev, head, index_hlist)
747 		if (dev->ifindex == ifindex)
748 			return dev;
749 
750 	return NULL;
751 }
752 EXPORT_SYMBOL(__dev_get_by_index);
753 
754 /**
755  *	dev_get_by_index_rcu - find a device by its ifindex
756  *	@net: the applicable net namespace
757  *	@ifindex: index of device
758  *
759  *	Search for an interface by index. Returns %NULL if the device
760  *	is not found or a pointer to the device. The device has not
761  *	had its reference counter increased so the caller must be careful
762  *	about locking. The caller must hold RCU lock.
763  */
764 
765 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
766 {
767 	struct net_device *dev;
768 	struct hlist_head *head = dev_index_hash(net, ifindex);
769 
770 	hlist_for_each_entry_rcu(dev, head, index_hlist)
771 		if (dev->ifindex == ifindex)
772 			return dev;
773 
774 	return NULL;
775 }
776 EXPORT_SYMBOL(dev_get_by_index_rcu);
777 
778 
779 /**
780  *	dev_get_by_index - find a device by its ifindex
781  *	@net: the applicable net namespace
782  *	@ifindex: index of device
783  *
784  *	Search for an interface by index. Returns NULL if the device
785  *	is not found or a pointer to the device. The device returned has
786  *	had a reference added and the pointer is safe until the user calls
787  *	dev_put to indicate they have finished with it.
788  */
789 
790 struct net_device *dev_get_by_index(struct net *net, int ifindex)
791 {
792 	struct net_device *dev;
793 
794 	rcu_read_lock();
795 	dev = dev_get_by_index_rcu(net, ifindex);
796 	if (dev)
797 		dev_hold(dev);
798 	rcu_read_unlock();
799 	return dev;
800 }
801 EXPORT_SYMBOL(dev_get_by_index);
802 
803 /**
804  *	netdev_get_name - get a netdevice name, knowing its ifindex.
805  *	@net: network namespace
806  *	@name: a pointer to the buffer where the name will be stored.
807  *	@ifindex: the ifindex of the interface to get the name from.
808  *
809  *	The use of raw_seqcount_begin() and cond_resched() before
810  *	retrying is required as we want to give the writers a chance
811  *	to complete when CONFIG_PREEMPT is not set.
812  */
813 int netdev_get_name(struct net *net, char *name, int ifindex)
814 {
815 	struct net_device *dev;
816 	unsigned int seq;
817 
818 retry:
819 	seq = raw_seqcount_begin(&devnet_rename_seq);
820 	rcu_read_lock();
821 	dev = dev_get_by_index_rcu(net, ifindex);
822 	if (!dev) {
823 		rcu_read_unlock();
824 		return -ENODEV;
825 	}
826 
827 	strcpy(name, dev->name);
828 	rcu_read_unlock();
829 	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
830 		cond_resched();
831 		goto retry;
832 	}
833 
834 	return 0;
835 }
836 
837 /**
838  *	dev_getbyhwaddr_rcu - find a device by its hardware address
839  *	@net: the applicable net namespace
840  *	@type: media type of device
841  *	@ha: hardware address
842  *
843  *	Search for an interface by MAC address. Returns NULL if the device
844  *	is not found or a pointer to the device.
845  *	The caller must hold RCU or RTNL.
846  *	The returned device has not had its ref count increased
847  *	and the caller must therefore be careful about locking
848  *
849  */
850 
851 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
852 				       const char *ha)
853 {
854 	struct net_device *dev;
855 
856 	for_each_netdev_rcu(net, dev)
857 		if (dev->type == type &&
858 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
859 			return dev;
860 
861 	return NULL;
862 }
863 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
864 
865 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
866 {
867 	struct net_device *dev;
868 
869 	ASSERT_RTNL();
870 	for_each_netdev(net, dev)
871 		if (dev->type == type)
872 			return dev;
873 
874 	return NULL;
875 }
876 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
877 
878 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
879 {
880 	struct net_device *dev, *ret = NULL;
881 
882 	rcu_read_lock();
883 	for_each_netdev_rcu(net, dev)
884 		if (dev->type == type) {
885 			dev_hold(dev);
886 			ret = dev;
887 			break;
888 		}
889 	rcu_read_unlock();
890 	return ret;
891 }
892 EXPORT_SYMBOL(dev_getfirstbyhwtype);
893 
894 /**
895  *	dev_get_by_flags_rcu - find any device with given flags
896  *	@net: the applicable net namespace
897  *	@if_flags: IFF_* values
898  *	@mask: bitmask of bits in if_flags to check
899  *
900  *	Search for any interface with the given flags. Returns NULL if a device
901  *	is not found or a pointer to the device. Must be called inside
902  *	rcu_read_lock(), and result refcount is unchanged.
903  */
904 
905 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
906 				    unsigned short mask)
907 {
908 	struct net_device *dev, *ret;
909 
910 	ret = NULL;
911 	for_each_netdev_rcu(net, dev) {
912 		if (((dev->flags ^ if_flags) & mask) == 0) {
913 			ret = dev;
914 			break;
915 		}
916 	}
917 	return ret;
918 }
919 EXPORT_SYMBOL(dev_get_by_flags_rcu);
920 
921 /**
922  *	dev_valid_name - check if name is okay for network device
923  *	@name: name string
924  *
925  *	Network device names need to be valid file names to
926  *	to allow sysfs to work.  We also disallow any kind of
927  *	whitespace.
928  */
929 bool dev_valid_name(const char *name)
930 {
931 	if (*name == '\0')
932 		return false;
933 	if (strlen(name) >= IFNAMSIZ)
934 		return false;
935 	if (!strcmp(name, ".") || !strcmp(name, ".."))
936 		return false;
937 
938 	while (*name) {
939 		if (*name == '/' || isspace(*name))
940 			return false;
941 		name++;
942 	}
943 	return true;
944 }
945 EXPORT_SYMBOL(dev_valid_name);
946 
947 /**
948  *	__dev_alloc_name - allocate a name for a device
949  *	@net: network namespace to allocate the device name in
950  *	@name: name format string
951  *	@buf:  scratch buffer and result name string
952  *
953  *	Passed a format string - eg "lt%d" it will try and find a suitable
954  *	id. It scans list of devices to build up a free map, then chooses
955  *	the first empty slot. The caller must hold the dev_base or rtnl lock
956  *	while allocating the name and adding the device in order to avoid
957  *	duplicates.
958  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
959  *	Returns the number of the unit assigned or a negative errno code.
960  */
961 
962 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
963 {
964 	int i = 0;
965 	const char *p;
966 	const int max_netdevices = 8*PAGE_SIZE;
967 	unsigned long *inuse;
968 	struct net_device *d;
969 
970 	p = strnchr(name, IFNAMSIZ-1, '%');
971 	if (p) {
972 		/*
973 		 * Verify the string as this thing may have come from
974 		 * the user.  There must be either one "%d" and no other "%"
975 		 * characters.
976 		 */
977 		if (p[1] != 'd' || strchr(p + 2, '%'))
978 			return -EINVAL;
979 
980 		/* Use one page as a bit array of possible slots */
981 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
982 		if (!inuse)
983 			return -ENOMEM;
984 
985 		for_each_netdev(net, d) {
986 			if (!sscanf(d->name, name, &i))
987 				continue;
988 			if (i < 0 || i >= max_netdevices)
989 				continue;
990 
991 			/*  avoid cases where sscanf is not exact inverse of printf */
992 			snprintf(buf, IFNAMSIZ, name, i);
993 			if (!strncmp(buf, d->name, IFNAMSIZ))
994 				set_bit(i, inuse);
995 		}
996 
997 		i = find_first_zero_bit(inuse, max_netdevices);
998 		free_page((unsigned long) inuse);
999 	}
1000 
1001 	if (buf != name)
1002 		snprintf(buf, IFNAMSIZ, name, i);
1003 	if (!__dev_get_by_name(net, buf))
1004 		return i;
1005 
1006 	/* It is possible to run out of possible slots
1007 	 * when the name is long and there isn't enough space left
1008 	 * for the digits, or if all bits are used.
1009 	 */
1010 	return -ENFILE;
1011 }
1012 
1013 /**
1014  *	dev_alloc_name - allocate a name for a device
1015  *	@dev: device
1016  *	@name: name format string
1017  *
1018  *	Passed a format string - eg "lt%d" it will try and find a suitable
1019  *	id. It scans list of devices to build up a free map, then chooses
1020  *	the first empty slot. The caller must hold the dev_base or rtnl lock
1021  *	while allocating the name and adding the device in order to avoid
1022  *	duplicates.
1023  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1024  *	Returns the number of the unit assigned or a negative errno code.
1025  */
1026 
1027 int dev_alloc_name(struct net_device *dev, const char *name)
1028 {
1029 	char buf[IFNAMSIZ];
1030 	struct net *net;
1031 	int ret;
1032 
1033 	BUG_ON(!dev_net(dev));
1034 	net = dev_net(dev);
1035 	ret = __dev_alloc_name(net, name, buf);
1036 	if (ret >= 0)
1037 		strlcpy(dev->name, buf, IFNAMSIZ);
1038 	return ret;
1039 }
1040 EXPORT_SYMBOL(dev_alloc_name);
1041 
1042 static int dev_alloc_name_ns(struct net *net,
1043 			     struct net_device *dev,
1044 			     const char *name)
1045 {
1046 	char buf[IFNAMSIZ];
1047 	int ret;
1048 
1049 	ret = __dev_alloc_name(net, name, buf);
1050 	if (ret >= 0)
1051 		strlcpy(dev->name, buf, IFNAMSIZ);
1052 	return ret;
1053 }
1054 
1055 static int dev_get_valid_name(struct net *net,
1056 			      struct net_device *dev,
1057 			      const char *name)
1058 {
1059 	BUG_ON(!net);
1060 
1061 	if (!dev_valid_name(name))
1062 		return -EINVAL;
1063 
1064 	if (strchr(name, '%'))
1065 		return dev_alloc_name_ns(net, dev, name);
1066 	else if (__dev_get_by_name(net, name))
1067 		return -EEXIST;
1068 	else if (dev->name != name)
1069 		strlcpy(dev->name, name, IFNAMSIZ);
1070 
1071 	return 0;
1072 }
1073 
1074 /**
1075  *	dev_change_name - change name of a device
1076  *	@dev: device
1077  *	@newname: name (or format string) must be at least IFNAMSIZ
1078  *
1079  *	Change name of a device, can pass format strings "eth%d".
1080  *	for wildcarding.
1081  */
1082 int dev_change_name(struct net_device *dev, const char *newname)
1083 {
1084 	char oldname[IFNAMSIZ];
1085 	int err = 0;
1086 	int ret;
1087 	struct net *net;
1088 
1089 	ASSERT_RTNL();
1090 	BUG_ON(!dev_net(dev));
1091 
1092 	net = dev_net(dev);
1093 	if (dev->flags & IFF_UP)
1094 		return -EBUSY;
1095 
1096 	write_seqcount_begin(&devnet_rename_seq);
1097 
1098 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1099 		write_seqcount_end(&devnet_rename_seq);
1100 		return 0;
1101 	}
1102 
1103 	memcpy(oldname, dev->name, IFNAMSIZ);
1104 
1105 	err = dev_get_valid_name(net, dev, newname);
1106 	if (err < 0) {
1107 		write_seqcount_end(&devnet_rename_seq);
1108 		return err;
1109 	}
1110 
1111 rollback:
1112 	ret = device_rename(&dev->dev, dev->name);
1113 	if (ret) {
1114 		memcpy(dev->name, oldname, IFNAMSIZ);
1115 		write_seqcount_end(&devnet_rename_seq);
1116 		return ret;
1117 	}
1118 
1119 	write_seqcount_end(&devnet_rename_seq);
1120 
1121 	write_lock_bh(&dev_base_lock);
1122 	hlist_del_rcu(&dev->name_hlist);
1123 	write_unlock_bh(&dev_base_lock);
1124 
1125 	synchronize_rcu();
1126 
1127 	write_lock_bh(&dev_base_lock);
1128 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1129 	write_unlock_bh(&dev_base_lock);
1130 
1131 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1132 	ret = notifier_to_errno(ret);
1133 
1134 	if (ret) {
1135 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1136 		if (err >= 0) {
1137 			err = ret;
1138 			write_seqcount_begin(&devnet_rename_seq);
1139 			memcpy(dev->name, oldname, IFNAMSIZ);
1140 			goto rollback;
1141 		} else {
1142 			pr_err("%s: name change rollback failed: %d\n",
1143 			       dev->name, ret);
1144 		}
1145 	}
1146 
1147 	return err;
1148 }
1149 
1150 /**
1151  *	dev_set_alias - change ifalias of a device
1152  *	@dev: device
1153  *	@alias: name up to IFALIASZ
1154  *	@len: limit of bytes to copy from info
1155  *
1156  *	Set ifalias for a device,
1157  */
1158 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1159 {
1160 	char *new_ifalias;
1161 
1162 	ASSERT_RTNL();
1163 
1164 	if (len >= IFALIASZ)
1165 		return -EINVAL;
1166 
1167 	if (!len) {
1168 		kfree(dev->ifalias);
1169 		dev->ifalias = NULL;
1170 		return 0;
1171 	}
1172 
1173 	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1174 	if (!new_ifalias)
1175 		return -ENOMEM;
1176 	dev->ifalias = new_ifalias;
1177 
1178 	strlcpy(dev->ifalias, alias, len+1);
1179 	return len;
1180 }
1181 
1182 
1183 /**
1184  *	netdev_features_change - device changes features
1185  *	@dev: device to cause notification
1186  *
1187  *	Called to indicate a device has changed features.
1188  */
1189 void netdev_features_change(struct net_device *dev)
1190 {
1191 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1192 }
1193 EXPORT_SYMBOL(netdev_features_change);
1194 
1195 /**
1196  *	netdev_state_change - device changes state
1197  *	@dev: device to cause notification
1198  *
1199  *	Called to indicate a device has changed state. This function calls
1200  *	the notifier chains for netdev_chain and sends a NEWLINK message
1201  *	to the routing socket.
1202  */
1203 void netdev_state_change(struct net_device *dev)
1204 {
1205 	if (dev->flags & IFF_UP) {
1206 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1207 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1208 	}
1209 }
1210 EXPORT_SYMBOL(netdev_state_change);
1211 
1212 /**
1213  * 	netdev_notify_peers - notify network peers about existence of @dev
1214  * 	@dev: network device
1215  *
1216  * Generate traffic such that interested network peers are aware of
1217  * @dev, such as by generating a gratuitous ARP. This may be used when
1218  * a device wants to inform the rest of the network about some sort of
1219  * reconfiguration such as a failover event or virtual machine
1220  * migration.
1221  */
1222 void netdev_notify_peers(struct net_device *dev)
1223 {
1224 	rtnl_lock();
1225 	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1226 	rtnl_unlock();
1227 }
1228 EXPORT_SYMBOL(netdev_notify_peers);
1229 
1230 static int __dev_open(struct net_device *dev)
1231 {
1232 	const struct net_device_ops *ops = dev->netdev_ops;
1233 	int ret;
1234 
1235 	ASSERT_RTNL();
1236 
1237 	if (!netif_device_present(dev))
1238 		return -ENODEV;
1239 
1240 	/* Block netpoll from trying to do any rx path servicing.
1241 	 * If we don't do this there is a chance ndo_poll_controller
1242 	 * or ndo_poll may be running while we open the device
1243 	 */
1244 	netpoll_rx_disable(dev);
1245 
1246 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1247 	ret = notifier_to_errno(ret);
1248 	if (ret)
1249 		return ret;
1250 
1251 	set_bit(__LINK_STATE_START, &dev->state);
1252 
1253 	if (ops->ndo_validate_addr)
1254 		ret = ops->ndo_validate_addr(dev);
1255 
1256 	if (!ret && ops->ndo_open)
1257 		ret = ops->ndo_open(dev);
1258 
1259 	netpoll_rx_enable(dev);
1260 
1261 	if (ret)
1262 		clear_bit(__LINK_STATE_START, &dev->state);
1263 	else {
1264 		dev->flags |= IFF_UP;
1265 		net_dmaengine_get();
1266 		dev_set_rx_mode(dev);
1267 		dev_activate(dev);
1268 		add_device_randomness(dev->dev_addr, dev->addr_len);
1269 	}
1270 
1271 	return ret;
1272 }
1273 
1274 /**
1275  *	dev_open	- prepare an interface for use.
1276  *	@dev:	device to open
1277  *
1278  *	Takes a device from down to up state. The device's private open
1279  *	function is invoked and then the multicast lists are loaded. Finally
1280  *	the device is moved into the up state and a %NETDEV_UP message is
1281  *	sent to the netdev notifier chain.
1282  *
1283  *	Calling this function on an active interface is a nop. On a failure
1284  *	a negative errno code is returned.
1285  */
1286 int dev_open(struct net_device *dev)
1287 {
1288 	int ret;
1289 
1290 	if (dev->flags & IFF_UP)
1291 		return 0;
1292 
1293 	ret = __dev_open(dev);
1294 	if (ret < 0)
1295 		return ret;
1296 
1297 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1298 	call_netdevice_notifiers(NETDEV_UP, dev);
1299 
1300 	return ret;
1301 }
1302 EXPORT_SYMBOL(dev_open);
1303 
1304 static int __dev_close_many(struct list_head *head)
1305 {
1306 	struct net_device *dev;
1307 
1308 	ASSERT_RTNL();
1309 	might_sleep();
1310 
1311 	list_for_each_entry(dev, head, close_list) {
1312 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1313 
1314 		clear_bit(__LINK_STATE_START, &dev->state);
1315 
1316 		/* Synchronize to scheduled poll. We cannot touch poll list, it
1317 		 * can be even on different cpu. So just clear netif_running().
1318 		 *
1319 		 * dev->stop() will invoke napi_disable() on all of it's
1320 		 * napi_struct instances on this device.
1321 		 */
1322 		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1323 	}
1324 
1325 	dev_deactivate_many(head);
1326 
1327 	list_for_each_entry(dev, head, close_list) {
1328 		const struct net_device_ops *ops = dev->netdev_ops;
1329 
1330 		/*
1331 		 *	Call the device specific close. This cannot fail.
1332 		 *	Only if device is UP
1333 		 *
1334 		 *	We allow it to be called even after a DETACH hot-plug
1335 		 *	event.
1336 		 */
1337 		if (ops->ndo_stop)
1338 			ops->ndo_stop(dev);
1339 
1340 		dev->flags &= ~IFF_UP;
1341 		net_dmaengine_put();
1342 	}
1343 
1344 	return 0;
1345 }
1346 
1347 static int __dev_close(struct net_device *dev)
1348 {
1349 	int retval;
1350 	LIST_HEAD(single);
1351 
1352 	/* Temporarily disable netpoll until the interface is down */
1353 	netpoll_rx_disable(dev);
1354 
1355 	list_add(&dev->close_list, &single);
1356 	retval = __dev_close_many(&single);
1357 	list_del(&single);
1358 
1359 	netpoll_rx_enable(dev);
1360 	return retval;
1361 }
1362 
1363 static int dev_close_many(struct list_head *head)
1364 {
1365 	struct net_device *dev, *tmp;
1366 
1367 	/* Remove the devices that don't need to be closed */
1368 	list_for_each_entry_safe(dev, tmp, head, close_list)
1369 		if (!(dev->flags & IFF_UP))
1370 			list_del_init(&dev->close_list);
1371 
1372 	__dev_close_many(head);
1373 
1374 	list_for_each_entry_safe(dev, tmp, head, close_list) {
1375 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1377 		list_del_init(&dev->close_list);
1378 	}
1379 
1380 	return 0;
1381 }
1382 
1383 /**
1384  *	dev_close - shutdown an interface.
1385  *	@dev: device to shutdown
1386  *
1387  *	This function moves an active device into down state. A
1388  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1389  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1390  *	chain.
1391  */
1392 int dev_close(struct net_device *dev)
1393 {
1394 	if (dev->flags & IFF_UP) {
1395 		LIST_HEAD(single);
1396 
1397 		/* Block netpoll rx while the interface is going down */
1398 		netpoll_rx_disable(dev);
1399 
1400 		list_add(&dev->close_list, &single);
1401 		dev_close_many(&single);
1402 		list_del(&single);
1403 
1404 		netpoll_rx_enable(dev);
1405 	}
1406 	return 0;
1407 }
1408 EXPORT_SYMBOL(dev_close);
1409 
1410 
1411 /**
1412  *	dev_disable_lro - disable Large Receive Offload on a device
1413  *	@dev: device
1414  *
1415  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1416  *	called under RTNL.  This is needed if received packets may be
1417  *	forwarded to another interface.
1418  */
1419 void dev_disable_lro(struct net_device *dev)
1420 {
1421 	/*
1422 	 * If we're trying to disable lro on a vlan device
1423 	 * use the underlying physical device instead
1424 	 */
1425 	if (is_vlan_dev(dev))
1426 		dev = vlan_dev_real_dev(dev);
1427 
1428 	/* the same for macvlan devices */
1429 	if (netif_is_macvlan(dev))
1430 		dev = macvlan_dev_real_dev(dev);
1431 
1432 	dev->wanted_features &= ~NETIF_F_LRO;
1433 	netdev_update_features(dev);
1434 
1435 	if (unlikely(dev->features & NETIF_F_LRO))
1436 		netdev_WARN(dev, "failed to disable LRO!\n");
1437 }
1438 EXPORT_SYMBOL(dev_disable_lro);
1439 
1440 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1441 				   struct net_device *dev)
1442 {
1443 	struct netdev_notifier_info info;
1444 
1445 	netdev_notifier_info_init(&info, dev);
1446 	return nb->notifier_call(nb, val, &info);
1447 }
1448 
1449 static int dev_boot_phase = 1;
1450 
1451 /**
1452  *	register_netdevice_notifier - register a network notifier block
1453  *	@nb: notifier
1454  *
1455  *	Register a notifier to be called when network device events occur.
1456  *	The notifier passed is linked into the kernel structures and must
1457  *	not be reused until it has been unregistered. A negative errno code
1458  *	is returned on a failure.
1459  *
1460  * 	When registered all registration and up events are replayed
1461  *	to the new notifier to allow device to have a race free
1462  *	view of the network device list.
1463  */
1464 
1465 int register_netdevice_notifier(struct notifier_block *nb)
1466 {
1467 	struct net_device *dev;
1468 	struct net_device *last;
1469 	struct net *net;
1470 	int err;
1471 
1472 	rtnl_lock();
1473 	err = raw_notifier_chain_register(&netdev_chain, nb);
1474 	if (err)
1475 		goto unlock;
1476 	if (dev_boot_phase)
1477 		goto unlock;
1478 	for_each_net(net) {
1479 		for_each_netdev(net, dev) {
1480 			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1481 			err = notifier_to_errno(err);
1482 			if (err)
1483 				goto rollback;
1484 
1485 			if (!(dev->flags & IFF_UP))
1486 				continue;
1487 
1488 			call_netdevice_notifier(nb, NETDEV_UP, dev);
1489 		}
1490 	}
1491 
1492 unlock:
1493 	rtnl_unlock();
1494 	return err;
1495 
1496 rollback:
1497 	last = dev;
1498 	for_each_net(net) {
1499 		for_each_netdev(net, dev) {
1500 			if (dev == last)
1501 				goto outroll;
1502 
1503 			if (dev->flags & IFF_UP) {
1504 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1505 							dev);
1506 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1507 			}
1508 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1509 		}
1510 	}
1511 
1512 outroll:
1513 	raw_notifier_chain_unregister(&netdev_chain, nb);
1514 	goto unlock;
1515 }
1516 EXPORT_SYMBOL(register_netdevice_notifier);
1517 
1518 /**
1519  *	unregister_netdevice_notifier - unregister a network notifier block
1520  *	@nb: notifier
1521  *
1522  *	Unregister a notifier previously registered by
1523  *	register_netdevice_notifier(). The notifier is unlinked into the
1524  *	kernel structures and may then be reused. A negative errno code
1525  *	is returned on a failure.
1526  *
1527  * 	After unregistering unregister and down device events are synthesized
1528  *	for all devices on the device list to the removed notifier to remove
1529  *	the need for special case cleanup code.
1530  */
1531 
1532 int unregister_netdevice_notifier(struct notifier_block *nb)
1533 {
1534 	struct net_device *dev;
1535 	struct net *net;
1536 	int err;
1537 
1538 	rtnl_lock();
1539 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1540 	if (err)
1541 		goto unlock;
1542 
1543 	for_each_net(net) {
1544 		for_each_netdev(net, dev) {
1545 			if (dev->flags & IFF_UP) {
1546 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1547 							dev);
1548 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1549 			}
1550 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1551 		}
1552 	}
1553 unlock:
1554 	rtnl_unlock();
1555 	return err;
1556 }
1557 EXPORT_SYMBOL(unregister_netdevice_notifier);
1558 
1559 /**
1560  *	call_netdevice_notifiers_info - call all network notifier blocks
1561  *	@val: value passed unmodified to notifier function
1562  *	@dev: net_device pointer passed unmodified to notifier function
1563  *	@info: notifier information data
1564  *
1565  *	Call all network notifier blocks.  Parameters and return value
1566  *	are as for raw_notifier_call_chain().
1567  */
1568 
1569 int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev,
1570 				  struct netdev_notifier_info *info)
1571 {
1572 	ASSERT_RTNL();
1573 	netdev_notifier_info_init(info, dev);
1574 	return raw_notifier_call_chain(&netdev_chain, val, info);
1575 }
1576 EXPORT_SYMBOL(call_netdevice_notifiers_info);
1577 
1578 /**
1579  *	call_netdevice_notifiers - call all network notifier blocks
1580  *      @val: value passed unmodified to notifier function
1581  *      @dev: net_device pointer passed unmodified to notifier function
1582  *
1583  *	Call all network notifier blocks.  Parameters and return value
1584  *	are as for raw_notifier_call_chain().
1585  */
1586 
1587 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1588 {
1589 	struct netdev_notifier_info info;
1590 
1591 	return call_netdevice_notifiers_info(val, dev, &info);
1592 }
1593 EXPORT_SYMBOL(call_netdevice_notifiers);
1594 
1595 static struct static_key netstamp_needed __read_mostly;
1596 #ifdef HAVE_JUMP_LABEL
1597 /* We are not allowed to call static_key_slow_dec() from irq context
1598  * If net_disable_timestamp() is called from irq context, defer the
1599  * static_key_slow_dec() calls.
1600  */
1601 static atomic_t netstamp_needed_deferred;
1602 #endif
1603 
1604 void net_enable_timestamp(void)
1605 {
1606 #ifdef HAVE_JUMP_LABEL
1607 	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1608 
1609 	if (deferred) {
1610 		while (--deferred)
1611 			static_key_slow_dec(&netstamp_needed);
1612 		return;
1613 	}
1614 #endif
1615 	static_key_slow_inc(&netstamp_needed);
1616 }
1617 EXPORT_SYMBOL(net_enable_timestamp);
1618 
1619 void net_disable_timestamp(void)
1620 {
1621 #ifdef HAVE_JUMP_LABEL
1622 	if (in_interrupt()) {
1623 		atomic_inc(&netstamp_needed_deferred);
1624 		return;
1625 	}
1626 #endif
1627 	static_key_slow_dec(&netstamp_needed);
1628 }
1629 EXPORT_SYMBOL(net_disable_timestamp);
1630 
1631 static inline void net_timestamp_set(struct sk_buff *skb)
1632 {
1633 	skb->tstamp.tv64 = 0;
1634 	if (static_key_false(&netstamp_needed))
1635 		__net_timestamp(skb);
1636 }
1637 
1638 #define net_timestamp_check(COND, SKB)			\
1639 	if (static_key_false(&netstamp_needed)) {		\
1640 		if ((COND) && !(SKB)->tstamp.tv64)	\
1641 			__net_timestamp(SKB);		\
1642 	}						\
1643 
1644 static inline bool is_skb_forwardable(struct net_device *dev,
1645 				      struct sk_buff *skb)
1646 {
1647 	unsigned int len;
1648 
1649 	if (!(dev->flags & IFF_UP))
1650 		return false;
1651 
1652 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1653 	if (skb->len <= len)
1654 		return true;
1655 
1656 	/* if TSO is enabled, we don't care about the length as the packet
1657 	 * could be forwarded without being segmented before
1658 	 */
1659 	if (skb_is_gso(skb))
1660 		return true;
1661 
1662 	return false;
1663 }
1664 
1665 /**
1666  * dev_forward_skb - loopback an skb to another netif
1667  *
1668  * @dev: destination network device
1669  * @skb: buffer to forward
1670  *
1671  * return values:
1672  *	NET_RX_SUCCESS	(no congestion)
1673  *	NET_RX_DROP     (packet was dropped, but freed)
1674  *
1675  * dev_forward_skb can be used for injecting an skb from the
1676  * start_xmit function of one device into the receive queue
1677  * of another device.
1678  *
1679  * The receiving device may be in another namespace, so
1680  * we have to clear all information in the skb that could
1681  * impact namespace isolation.
1682  */
1683 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1684 {
1685 	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1686 		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1687 			atomic_long_inc(&dev->rx_dropped);
1688 			kfree_skb(skb);
1689 			return NET_RX_DROP;
1690 		}
1691 	}
1692 
1693 	if (unlikely(!is_skb_forwardable(dev, skb))) {
1694 		atomic_long_inc(&dev->rx_dropped);
1695 		kfree_skb(skb);
1696 		return NET_RX_DROP;
1697 	}
1698 
1699 	skb_scrub_packet(skb, true);
1700 	skb->protocol = eth_type_trans(skb, dev);
1701 
1702 	return netif_rx(skb);
1703 }
1704 EXPORT_SYMBOL_GPL(dev_forward_skb);
1705 
1706 static inline int deliver_skb(struct sk_buff *skb,
1707 			      struct packet_type *pt_prev,
1708 			      struct net_device *orig_dev)
1709 {
1710 	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1711 		return -ENOMEM;
1712 	atomic_inc(&skb->users);
1713 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1714 }
1715 
1716 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1717 {
1718 	if (!ptype->af_packet_priv || !skb->sk)
1719 		return false;
1720 
1721 	if (ptype->id_match)
1722 		return ptype->id_match(ptype, skb->sk);
1723 	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1724 		return true;
1725 
1726 	return false;
1727 }
1728 
1729 /*
1730  *	Support routine. Sends outgoing frames to any network
1731  *	taps currently in use.
1732  */
1733 
1734 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1735 {
1736 	struct packet_type *ptype;
1737 	struct sk_buff *skb2 = NULL;
1738 	struct packet_type *pt_prev = NULL;
1739 
1740 	rcu_read_lock();
1741 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1742 		/* Never send packets back to the socket
1743 		 * they originated from - MvS (miquels@drinkel.ow.org)
1744 		 */
1745 		if ((ptype->dev == dev || !ptype->dev) &&
1746 		    (!skb_loop_sk(ptype, skb))) {
1747 			if (pt_prev) {
1748 				deliver_skb(skb2, pt_prev, skb->dev);
1749 				pt_prev = ptype;
1750 				continue;
1751 			}
1752 
1753 			skb2 = skb_clone(skb, GFP_ATOMIC);
1754 			if (!skb2)
1755 				break;
1756 
1757 			net_timestamp_set(skb2);
1758 
1759 			/* skb->nh should be correctly
1760 			   set by sender, so that the second statement is
1761 			   just protection against buggy protocols.
1762 			 */
1763 			skb_reset_mac_header(skb2);
1764 
1765 			if (skb_network_header(skb2) < skb2->data ||
1766 			    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1767 				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1768 						     ntohs(skb2->protocol),
1769 						     dev->name);
1770 				skb_reset_network_header(skb2);
1771 			}
1772 
1773 			skb2->transport_header = skb2->network_header;
1774 			skb2->pkt_type = PACKET_OUTGOING;
1775 			pt_prev = ptype;
1776 		}
1777 	}
1778 	if (pt_prev)
1779 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1780 	rcu_read_unlock();
1781 }
1782 
1783 /**
1784  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1785  * @dev: Network device
1786  * @txq: number of queues available
1787  *
1788  * If real_num_tx_queues is changed the tc mappings may no longer be
1789  * valid. To resolve this verify the tc mapping remains valid and if
1790  * not NULL the mapping. With no priorities mapping to this
1791  * offset/count pair it will no longer be used. In the worst case TC0
1792  * is invalid nothing can be done so disable priority mappings. If is
1793  * expected that drivers will fix this mapping if they can before
1794  * calling netif_set_real_num_tx_queues.
1795  */
1796 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1797 {
1798 	int i;
1799 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1800 
1801 	/* If TC0 is invalidated disable TC mapping */
1802 	if (tc->offset + tc->count > txq) {
1803 		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1804 		dev->num_tc = 0;
1805 		return;
1806 	}
1807 
1808 	/* Invalidated prio to tc mappings set to TC0 */
1809 	for (i = 1; i < TC_BITMASK + 1; i++) {
1810 		int q = netdev_get_prio_tc_map(dev, i);
1811 
1812 		tc = &dev->tc_to_txq[q];
1813 		if (tc->offset + tc->count > txq) {
1814 			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1815 				i, q);
1816 			netdev_set_prio_tc_map(dev, i, 0);
1817 		}
1818 	}
1819 }
1820 
1821 #ifdef CONFIG_XPS
1822 static DEFINE_MUTEX(xps_map_mutex);
1823 #define xmap_dereference(P)		\
1824 	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1825 
1826 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1827 					int cpu, u16 index)
1828 {
1829 	struct xps_map *map = NULL;
1830 	int pos;
1831 
1832 	if (dev_maps)
1833 		map = xmap_dereference(dev_maps->cpu_map[cpu]);
1834 
1835 	for (pos = 0; map && pos < map->len; pos++) {
1836 		if (map->queues[pos] == index) {
1837 			if (map->len > 1) {
1838 				map->queues[pos] = map->queues[--map->len];
1839 			} else {
1840 				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1841 				kfree_rcu(map, rcu);
1842 				map = NULL;
1843 			}
1844 			break;
1845 		}
1846 	}
1847 
1848 	return map;
1849 }
1850 
1851 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1852 {
1853 	struct xps_dev_maps *dev_maps;
1854 	int cpu, i;
1855 	bool active = false;
1856 
1857 	mutex_lock(&xps_map_mutex);
1858 	dev_maps = xmap_dereference(dev->xps_maps);
1859 
1860 	if (!dev_maps)
1861 		goto out_no_maps;
1862 
1863 	for_each_possible_cpu(cpu) {
1864 		for (i = index; i < dev->num_tx_queues; i++) {
1865 			if (!remove_xps_queue(dev_maps, cpu, i))
1866 				break;
1867 		}
1868 		if (i == dev->num_tx_queues)
1869 			active = true;
1870 	}
1871 
1872 	if (!active) {
1873 		RCU_INIT_POINTER(dev->xps_maps, NULL);
1874 		kfree_rcu(dev_maps, rcu);
1875 	}
1876 
1877 	for (i = index; i < dev->num_tx_queues; i++)
1878 		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1879 					     NUMA_NO_NODE);
1880 
1881 out_no_maps:
1882 	mutex_unlock(&xps_map_mutex);
1883 }
1884 
1885 static struct xps_map *expand_xps_map(struct xps_map *map,
1886 				      int cpu, u16 index)
1887 {
1888 	struct xps_map *new_map;
1889 	int alloc_len = XPS_MIN_MAP_ALLOC;
1890 	int i, pos;
1891 
1892 	for (pos = 0; map && pos < map->len; pos++) {
1893 		if (map->queues[pos] != index)
1894 			continue;
1895 		return map;
1896 	}
1897 
1898 	/* Need to add queue to this CPU's existing map */
1899 	if (map) {
1900 		if (pos < map->alloc_len)
1901 			return map;
1902 
1903 		alloc_len = map->alloc_len * 2;
1904 	}
1905 
1906 	/* Need to allocate new map to store queue on this CPU's map */
1907 	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1908 			       cpu_to_node(cpu));
1909 	if (!new_map)
1910 		return NULL;
1911 
1912 	for (i = 0; i < pos; i++)
1913 		new_map->queues[i] = map->queues[i];
1914 	new_map->alloc_len = alloc_len;
1915 	new_map->len = pos;
1916 
1917 	return new_map;
1918 }
1919 
1920 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1921 			u16 index)
1922 {
1923 	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1924 	struct xps_map *map, *new_map;
1925 	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1926 	int cpu, numa_node_id = -2;
1927 	bool active = false;
1928 
1929 	mutex_lock(&xps_map_mutex);
1930 
1931 	dev_maps = xmap_dereference(dev->xps_maps);
1932 
1933 	/* allocate memory for queue storage */
1934 	for_each_online_cpu(cpu) {
1935 		if (!cpumask_test_cpu(cpu, mask))
1936 			continue;
1937 
1938 		if (!new_dev_maps)
1939 			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1940 		if (!new_dev_maps) {
1941 			mutex_unlock(&xps_map_mutex);
1942 			return -ENOMEM;
1943 		}
1944 
1945 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1946 				 NULL;
1947 
1948 		map = expand_xps_map(map, cpu, index);
1949 		if (!map)
1950 			goto error;
1951 
1952 		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1953 	}
1954 
1955 	if (!new_dev_maps)
1956 		goto out_no_new_maps;
1957 
1958 	for_each_possible_cpu(cpu) {
1959 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1960 			/* add queue to CPU maps */
1961 			int pos = 0;
1962 
1963 			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1964 			while ((pos < map->len) && (map->queues[pos] != index))
1965 				pos++;
1966 
1967 			if (pos == map->len)
1968 				map->queues[map->len++] = index;
1969 #ifdef CONFIG_NUMA
1970 			if (numa_node_id == -2)
1971 				numa_node_id = cpu_to_node(cpu);
1972 			else if (numa_node_id != cpu_to_node(cpu))
1973 				numa_node_id = -1;
1974 #endif
1975 		} else if (dev_maps) {
1976 			/* fill in the new device map from the old device map */
1977 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
1978 			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1979 		}
1980 
1981 	}
1982 
1983 	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1984 
1985 	/* Cleanup old maps */
1986 	if (dev_maps) {
1987 		for_each_possible_cpu(cpu) {
1988 			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1989 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
1990 			if (map && map != new_map)
1991 				kfree_rcu(map, rcu);
1992 		}
1993 
1994 		kfree_rcu(dev_maps, rcu);
1995 	}
1996 
1997 	dev_maps = new_dev_maps;
1998 	active = true;
1999 
2000 out_no_new_maps:
2001 	/* update Tx queue numa node */
2002 	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2003 				     (numa_node_id >= 0) ? numa_node_id :
2004 				     NUMA_NO_NODE);
2005 
2006 	if (!dev_maps)
2007 		goto out_no_maps;
2008 
2009 	/* removes queue from unused CPUs */
2010 	for_each_possible_cpu(cpu) {
2011 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2012 			continue;
2013 
2014 		if (remove_xps_queue(dev_maps, cpu, index))
2015 			active = true;
2016 	}
2017 
2018 	/* free map if not active */
2019 	if (!active) {
2020 		RCU_INIT_POINTER(dev->xps_maps, NULL);
2021 		kfree_rcu(dev_maps, rcu);
2022 	}
2023 
2024 out_no_maps:
2025 	mutex_unlock(&xps_map_mutex);
2026 
2027 	return 0;
2028 error:
2029 	/* remove any maps that we added */
2030 	for_each_possible_cpu(cpu) {
2031 		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2032 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2033 				 NULL;
2034 		if (new_map && new_map != map)
2035 			kfree(new_map);
2036 	}
2037 
2038 	mutex_unlock(&xps_map_mutex);
2039 
2040 	kfree(new_dev_maps);
2041 	return -ENOMEM;
2042 }
2043 EXPORT_SYMBOL(netif_set_xps_queue);
2044 
2045 #endif
2046 /*
2047  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2048  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2049  */
2050 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2051 {
2052 	int rc;
2053 
2054 	if (txq < 1 || txq > dev->num_tx_queues)
2055 		return -EINVAL;
2056 
2057 	if (dev->reg_state == NETREG_REGISTERED ||
2058 	    dev->reg_state == NETREG_UNREGISTERING) {
2059 		ASSERT_RTNL();
2060 
2061 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2062 						  txq);
2063 		if (rc)
2064 			return rc;
2065 
2066 		if (dev->num_tc)
2067 			netif_setup_tc(dev, txq);
2068 
2069 		if (txq < dev->real_num_tx_queues) {
2070 			qdisc_reset_all_tx_gt(dev, txq);
2071 #ifdef CONFIG_XPS
2072 			netif_reset_xps_queues_gt(dev, txq);
2073 #endif
2074 		}
2075 	}
2076 
2077 	dev->real_num_tx_queues = txq;
2078 	return 0;
2079 }
2080 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2081 
2082 #ifdef CONFIG_RPS
2083 /**
2084  *	netif_set_real_num_rx_queues - set actual number of RX queues used
2085  *	@dev: Network device
2086  *	@rxq: Actual number of RX queues
2087  *
2088  *	This must be called either with the rtnl_lock held or before
2089  *	registration of the net device.  Returns 0 on success, or a
2090  *	negative error code.  If called before registration, it always
2091  *	succeeds.
2092  */
2093 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2094 {
2095 	int rc;
2096 
2097 	if (rxq < 1 || rxq > dev->num_rx_queues)
2098 		return -EINVAL;
2099 
2100 	if (dev->reg_state == NETREG_REGISTERED) {
2101 		ASSERT_RTNL();
2102 
2103 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2104 						  rxq);
2105 		if (rc)
2106 			return rc;
2107 	}
2108 
2109 	dev->real_num_rx_queues = rxq;
2110 	return 0;
2111 }
2112 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2113 #endif
2114 
2115 /**
2116  * netif_get_num_default_rss_queues - default number of RSS queues
2117  *
2118  * This routine should set an upper limit on the number of RSS queues
2119  * used by default by multiqueue devices.
2120  */
2121 int netif_get_num_default_rss_queues(void)
2122 {
2123 	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2124 }
2125 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2126 
2127 static inline void __netif_reschedule(struct Qdisc *q)
2128 {
2129 	struct softnet_data *sd;
2130 	unsigned long flags;
2131 
2132 	local_irq_save(flags);
2133 	sd = &__get_cpu_var(softnet_data);
2134 	q->next_sched = NULL;
2135 	*sd->output_queue_tailp = q;
2136 	sd->output_queue_tailp = &q->next_sched;
2137 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2138 	local_irq_restore(flags);
2139 }
2140 
2141 void __netif_schedule(struct Qdisc *q)
2142 {
2143 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2144 		__netif_reschedule(q);
2145 }
2146 EXPORT_SYMBOL(__netif_schedule);
2147 
2148 void dev_kfree_skb_irq(struct sk_buff *skb)
2149 {
2150 	if (atomic_dec_and_test(&skb->users)) {
2151 		struct softnet_data *sd;
2152 		unsigned long flags;
2153 
2154 		local_irq_save(flags);
2155 		sd = &__get_cpu_var(softnet_data);
2156 		skb->next = sd->completion_queue;
2157 		sd->completion_queue = skb;
2158 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
2159 		local_irq_restore(flags);
2160 	}
2161 }
2162 EXPORT_SYMBOL(dev_kfree_skb_irq);
2163 
2164 void dev_kfree_skb_any(struct sk_buff *skb)
2165 {
2166 	if (in_irq() || irqs_disabled())
2167 		dev_kfree_skb_irq(skb);
2168 	else
2169 		dev_kfree_skb(skb);
2170 }
2171 EXPORT_SYMBOL(dev_kfree_skb_any);
2172 
2173 
2174 /**
2175  * netif_device_detach - mark device as removed
2176  * @dev: network device
2177  *
2178  * Mark device as removed from system and therefore no longer available.
2179  */
2180 void netif_device_detach(struct net_device *dev)
2181 {
2182 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2183 	    netif_running(dev)) {
2184 		netif_tx_stop_all_queues(dev);
2185 	}
2186 }
2187 EXPORT_SYMBOL(netif_device_detach);
2188 
2189 /**
2190  * netif_device_attach - mark device as attached
2191  * @dev: network device
2192  *
2193  * Mark device as attached from system and restart if needed.
2194  */
2195 void netif_device_attach(struct net_device *dev)
2196 {
2197 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2198 	    netif_running(dev)) {
2199 		netif_tx_wake_all_queues(dev);
2200 		__netdev_watchdog_up(dev);
2201 	}
2202 }
2203 EXPORT_SYMBOL(netif_device_attach);
2204 
2205 static void skb_warn_bad_offload(const struct sk_buff *skb)
2206 {
2207 	static const netdev_features_t null_features = 0;
2208 	struct net_device *dev = skb->dev;
2209 	const char *driver = "";
2210 
2211 	if (!net_ratelimit())
2212 		return;
2213 
2214 	if (dev && dev->dev.parent)
2215 		driver = dev_driver_string(dev->dev.parent);
2216 
2217 	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2218 	     "gso_type=%d ip_summed=%d\n",
2219 	     driver, dev ? &dev->features : &null_features,
2220 	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2221 	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2222 	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2223 }
2224 
2225 /*
2226  * Invalidate hardware checksum when packet is to be mangled, and
2227  * complete checksum manually on outgoing path.
2228  */
2229 int skb_checksum_help(struct sk_buff *skb)
2230 {
2231 	__wsum csum;
2232 	int ret = 0, offset;
2233 
2234 	if (skb->ip_summed == CHECKSUM_COMPLETE)
2235 		goto out_set_summed;
2236 
2237 	if (unlikely(skb_shinfo(skb)->gso_size)) {
2238 		skb_warn_bad_offload(skb);
2239 		return -EINVAL;
2240 	}
2241 
2242 	/* Before computing a checksum, we should make sure no frag could
2243 	 * be modified by an external entity : checksum could be wrong.
2244 	 */
2245 	if (skb_has_shared_frag(skb)) {
2246 		ret = __skb_linearize(skb);
2247 		if (ret)
2248 			goto out;
2249 	}
2250 
2251 	offset = skb_checksum_start_offset(skb);
2252 	BUG_ON(offset >= skb_headlen(skb));
2253 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2254 
2255 	offset += skb->csum_offset;
2256 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2257 
2258 	if (skb_cloned(skb) &&
2259 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2260 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2261 		if (ret)
2262 			goto out;
2263 	}
2264 
2265 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
2266 out_set_summed:
2267 	skb->ip_summed = CHECKSUM_NONE;
2268 out:
2269 	return ret;
2270 }
2271 EXPORT_SYMBOL(skb_checksum_help);
2272 
2273 __be16 skb_network_protocol(struct sk_buff *skb)
2274 {
2275 	__be16 type = skb->protocol;
2276 	int vlan_depth = ETH_HLEN;
2277 
2278 	/* Tunnel gso handlers can set protocol to ethernet. */
2279 	if (type == htons(ETH_P_TEB)) {
2280 		struct ethhdr *eth;
2281 
2282 		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2283 			return 0;
2284 
2285 		eth = (struct ethhdr *)skb_mac_header(skb);
2286 		type = eth->h_proto;
2287 	}
2288 
2289 	while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2290 		struct vlan_hdr *vh;
2291 
2292 		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2293 			return 0;
2294 
2295 		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2296 		type = vh->h_vlan_encapsulated_proto;
2297 		vlan_depth += VLAN_HLEN;
2298 	}
2299 
2300 	return type;
2301 }
2302 
2303 /**
2304  *	skb_mac_gso_segment - mac layer segmentation handler.
2305  *	@skb: buffer to segment
2306  *	@features: features for the output path (see dev->features)
2307  */
2308 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2309 				    netdev_features_t features)
2310 {
2311 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2312 	struct packet_offload *ptype;
2313 	__be16 type = skb_network_protocol(skb);
2314 
2315 	if (unlikely(!type))
2316 		return ERR_PTR(-EINVAL);
2317 
2318 	__skb_pull(skb, skb->mac_len);
2319 
2320 	rcu_read_lock();
2321 	list_for_each_entry_rcu(ptype, &offload_base, list) {
2322 		if (ptype->type == type && ptype->callbacks.gso_segment) {
2323 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2324 				int err;
2325 
2326 				err = ptype->callbacks.gso_send_check(skb);
2327 				segs = ERR_PTR(err);
2328 				if (err || skb_gso_ok(skb, features))
2329 					break;
2330 				__skb_push(skb, (skb->data -
2331 						 skb_network_header(skb)));
2332 			}
2333 			segs = ptype->callbacks.gso_segment(skb, features);
2334 			break;
2335 		}
2336 	}
2337 	rcu_read_unlock();
2338 
2339 	__skb_push(skb, skb->data - skb_mac_header(skb));
2340 
2341 	return segs;
2342 }
2343 EXPORT_SYMBOL(skb_mac_gso_segment);
2344 
2345 
2346 /* openvswitch calls this on rx path, so we need a different check.
2347  */
2348 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2349 {
2350 	if (tx_path)
2351 		return skb->ip_summed != CHECKSUM_PARTIAL;
2352 	else
2353 		return skb->ip_summed == CHECKSUM_NONE;
2354 }
2355 
2356 /**
2357  *	__skb_gso_segment - Perform segmentation on skb.
2358  *	@skb: buffer to segment
2359  *	@features: features for the output path (see dev->features)
2360  *	@tx_path: whether it is called in TX path
2361  *
2362  *	This function segments the given skb and returns a list of segments.
2363  *
2364  *	It may return NULL if the skb requires no segmentation.  This is
2365  *	only possible when GSO is used for verifying header integrity.
2366  */
2367 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2368 				  netdev_features_t features, bool tx_path)
2369 {
2370 	if (unlikely(skb_needs_check(skb, tx_path))) {
2371 		int err;
2372 
2373 		skb_warn_bad_offload(skb);
2374 
2375 		if (skb_header_cloned(skb) &&
2376 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2377 			return ERR_PTR(err);
2378 	}
2379 
2380 	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2381 	SKB_GSO_CB(skb)->encap_level = 0;
2382 
2383 	skb_reset_mac_header(skb);
2384 	skb_reset_mac_len(skb);
2385 
2386 	return skb_mac_gso_segment(skb, features);
2387 }
2388 EXPORT_SYMBOL(__skb_gso_segment);
2389 
2390 /* Take action when hardware reception checksum errors are detected. */
2391 #ifdef CONFIG_BUG
2392 void netdev_rx_csum_fault(struct net_device *dev)
2393 {
2394 	if (net_ratelimit()) {
2395 		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2396 		dump_stack();
2397 	}
2398 }
2399 EXPORT_SYMBOL(netdev_rx_csum_fault);
2400 #endif
2401 
2402 /* Actually, we should eliminate this check as soon as we know, that:
2403  * 1. IOMMU is present and allows to map all the memory.
2404  * 2. No high memory really exists on this machine.
2405  */
2406 
2407 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2408 {
2409 #ifdef CONFIG_HIGHMEM
2410 	int i;
2411 	if (!(dev->features & NETIF_F_HIGHDMA)) {
2412 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2413 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2414 			if (PageHighMem(skb_frag_page(frag)))
2415 				return 1;
2416 		}
2417 	}
2418 
2419 	if (PCI_DMA_BUS_IS_PHYS) {
2420 		struct device *pdev = dev->dev.parent;
2421 
2422 		if (!pdev)
2423 			return 0;
2424 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2425 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2426 			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2427 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2428 				return 1;
2429 		}
2430 	}
2431 #endif
2432 	return 0;
2433 }
2434 
2435 struct dev_gso_cb {
2436 	void (*destructor)(struct sk_buff *skb);
2437 };
2438 
2439 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2440 
2441 static void dev_gso_skb_destructor(struct sk_buff *skb)
2442 {
2443 	struct dev_gso_cb *cb;
2444 
2445 	do {
2446 		struct sk_buff *nskb = skb->next;
2447 
2448 		skb->next = nskb->next;
2449 		nskb->next = NULL;
2450 		kfree_skb(nskb);
2451 	} while (skb->next);
2452 
2453 	cb = DEV_GSO_CB(skb);
2454 	if (cb->destructor)
2455 		cb->destructor(skb);
2456 }
2457 
2458 /**
2459  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2460  *	@skb: buffer to segment
2461  *	@features: device features as applicable to this skb
2462  *
2463  *	This function segments the given skb and stores the list of segments
2464  *	in skb->next.
2465  */
2466 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2467 {
2468 	struct sk_buff *segs;
2469 
2470 	segs = skb_gso_segment(skb, features);
2471 
2472 	/* Verifying header integrity only. */
2473 	if (!segs)
2474 		return 0;
2475 
2476 	if (IS_ERR(segs))
2477 		return PTR_ERR(segs);
2478 
2479 	skb->next = segs;
2480 	DEV_GSO_CB(skb)->destructor = skb->destructor;
2481 	skb->destructor = dev_gso_skb_destructor;
2482 
2483 	return 0;
2484 }
2485 
2486 static netdev_features_t harmonize_features(struct sk_buff *skb,
2487 	netdev_features_t features)
2488 {
2489 	if (skb->ip_summed != CHECKSUM_NONE &&
2490 	    !can_checksum_protocol(features, skb_network_protocol(skb))) {
2491 		features &= ~NETIF_F_ALL_CSUM;
2492 	} else if (illegal_highdma(skb->dev, skb)) {
2493 		features &= ~NETIF_F_SG;
2494 	}
2495 
2496 	return features;
2497 }
2498 
2499 netdev_features_t netif_skb_features(struct sk_buff *skb)
2500 {
2501 	__be16 protocol = skb->protocol;
2502 	netdev_features_t features = skb->dev->features;
2503 
2504 	if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2505 		features &= ~NETIF_F_GSO_MASK;
2506 
2507 	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2508 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2509 		protocol = veh->h_vlan_encapsulated_proto;
2510 	} else if (!vlan_tx_tag_present(skb)) {
2511 		return harmonize_features(skb, features);
2512 	}
2513 
2514 	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2515 					       NETIF_F_HW_VLAN_STAG_TX);
2516 
2517 	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2518 		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2519 				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2520 				NETIF_F_HW_VLAN_STAG_TX;
2521 
2522 	return harmonize_features(skb, features);
2523 }
2524 EXPORT_SYMBOL(netif_skb_features);
2525 
2526 /*
2527  * Returns true if either:
2528  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2529  *	2. skb is fragmented and the device does not support SG.
2530  */
2531 static inline int skb_needs_linearize(struct sk_buff *skb,
2532 				      netdev_features_t features)
2533 {
2534 	return skb_is_nonlinear(skb) &&
2535 			((skb_has_frag_list(skb) &&
2536 				!(features & NETIF_F_FRAGLIST)) ||
2537 			(skb_shinfo(skb)->nr_frags &&
2538 				!(features & NETIF_F_SG)));
2539 }
2540 
2541 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2542 			struct netdev_queue *txq)
2543 {
2544 	const struct net_device_ops *ops = dev->netdev_ops;
2545 	int rc = NETDEV_TX_OK;
2546 	unsigned int skb_len;
2547 
2548 	if (likely(!skb->next)) {
2549 		netdev_features_t features;
2550 
2551 		/*
2552 		 * If device doesn't need skb->dst, release it right now while
2553 		 * its hot in this cpu cache
2554 		 */
2555 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2556 			skb_dst_drop(skb);
2557 
2558 		features = netif_skb_features(skb);
2559 
2560 		if (vlan_tx_tag_present(skb) &&
2561 		    !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2562 			skb = __vlan_put_tag(skb, skb->vlan_proto,
2563 					     vlan_tx_tag_get(skb));
2564 			if (unlikely(!skb))
2565 				goto out;
2566 
2567 			skb->vlan_tci = 0;
2568 		}
2569 
2570 		/* If encapsulation offload request, verify we are testing
2571 		 * hardware encapsulation features instead of standard
2572 		 * features for the netdev
2573 		 */
2574 		if (skb->encapsulation)
2575 			features &= dev->hw_enc_features;
2576 
2577 		if (netif_needs_gso(skb, features)) {
2578 			if (unlikely(dev_gso_segment(skb, features)))
2579 				goto out_kfree_skb;
2580 			if (skb->next)
2581 				goto gso;
2582 		} else {
2583 			if (skb_needs_linearize(skb, features) &&
2584 			    __skb_linearize(skb))
2585 				goto out_kfree_skb;
2586 
2587 			/* If packet is not checksummed and device does not
2588 			 * support checksumming for this protocol, complete
2589 			 * checksumming here.
2590 			 */
2591 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2592 				if (skb->encapsulation)
2593 					skb_set_inner_transport_header(skb,
2594 						skb_checksum_start_offset(skb));
2595 				else
2596 					skb_set_transport_header(skb,
2597 						skb_checksum_start_offset(skb));
2598 				if (!(features & NETIF_F_ALL_CSUM) &&
2599 				     skb_checksum_help(skb))
2600 					goto out_kfree_skb;
2601 			}
2602 		}
2603 
2604 		if (!list_empty(&ptype_all))
2605 			dev_queue_xmit_nit(skb, dev);
2606 
2607 		skb_len = skb->len;
2608 			rc = ops->ndo_start_xmit(skb, dev);
2609 
2610 		trace_net_dev_xmit(skb, rc, dev, skb_len);
2611 		if (rc == NETDEV_TX_OK)
2612 			txq_trans_update(txq);
2613 		return rc;
2614 	}
2615 
2616 gso:
2617 	do {
2618 		struct sk_buff *nskb = skb->next;
2619 
2620 		skb->next = nskb->next;
2621 		nskb->next = NULL;
2622 
2623 		if (!list_empty(&ptype_all))
2624 			dev_queue_xmit_nit(nskb, dev);
2625 
2626 		skb_len = nskb->len;
2627 		rc = ops->ndo_start_xmit(nskb, dev);
2628 		trace_net_dev_xmit(nskb, rc, dev, skb_len);
2629 		if (unlikely(rc != NETDEV_TX_OK)) {
2630 			if (rc & ~NETDEV_TX_MASK)
2631 				goto out_kfree_gso_skb;
2632 			nskb->next = skb->next;
2633 			skb->next = nskb;
2634 			return rc;
2635 		}
2636 		txq_trans_update(txq);
2637 		if (unlikely(netif_xmit_stopped(txq) && skb->next))
2638 			return NETDEV_TX_BUSY;
2639 	} while (skb->next);
2640 
2641 out_kfree_gso_skb:
2642 	if (likely(skb->next == NULL)) {
2643 		skb->destructor = DEV_GSO_CB(skb)->destructor;
2644 		consume_skb(skb);
2645 		return rc;
2646 	}
2647 out_kfree_skb:
2648 	kfree_skb(skb);
2649 out:
2650 	return rc;
2651 }
2652 EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
2653 
2654 static void qdisc_pkt_len_init(struct sk_buff *skb)
2655 {
2656 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
2657 
2658 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2659 
2660 	/* To get more precise estimation of bytes sent on wire,
2661 	 * we add to pkt_len the headers size of all segments
2662 	 */
2663 	if (shinfo->gso_size)  {
2664 		unsigned int hdr_len;
2665 		u16 gso_segs = shinfo->gso_segs;
2666 
2667 		/* mac layer + network layer */
2668 		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2669 
2670 		/* + transport layer */
2671 		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2672 			hdr_len += tcp_hdrlen(skb);
2673 		else
2674 			hdr_len += sizeof(struct udphdr);
2675 
2676 		if (shinfo->gso_type & SKB_GSO_DODGY)
2677 			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2678 						shinfo->gso_size);
2679 
2680 		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2681 	}
2682 }
2683 
2684 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2685 				 struct net_device *dev,
2686 				 struct netdev_queue *txq)
2687 {
2688 	spinlock_t *root_lock = qdisc_lock(q);
2689 	bool contended;
2690 	int rc;
2691 
2692 	qdisc_pkt_len_init(skb);
2693 	qdisc_calculate_pkt_len(skb, q);
2694 	/*
2695 	 * Heuristic to force contended enqueues to serialize on a
2696 	 * separate lock before trying to get qdisc main lock.
2697 	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2698 	 * and dequeue packets faster.
2699 	 */
2700 	contended = qdisc_is_running(q);
2701 	if (unlikely(contended))
2702 		spin_lock(&q->busylock);
2703 
2704 	spin_lock(root_lock);
2705 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2706 		kfree_skb(skb);
2707 		rc = NET_XMIT_DROP;
2708 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2709 		   qdisc_run_begin(q)) {
2710 		/*
2711 		 * This is a work-conserving queue; there are no old skbs
2712 		 * waiting to be sent out; and the qdisc is not running -
2713 		 * xmit the skb directly.
2714 		 */
2715 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2716 			skb_dst_force(skb);
2717 
2718 		qdisc_bstats_update(q, skb);
2719 
2720 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2721 			if (unlikely(contended)) {
2722 				spin_unlock(&q->busylock);
2723 				contended = false;
2724 			}
2725 			__qdisc_run(q);
2726 		} else
2727 			qdisc_run_end(q);
2728 
2729 		rc = NET_XMIT_SUCCESS;
2730 	} else {
2731 		skb_dst_force(skb);
2732 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2733 		if (qdisc_run_begin(q)) {
2734 			if (unlikely(contended)) {
2735 				spin_unlock(&q->busylock);
2736 				contended = false;
2737 			}
2738 			__qdisc_run(q);
2739 		}
2740 	}
2741 	spin_unlock(root_lock);
2742 	if (unlikely(contended))
2743 		spin_unlock(&q->busylock);
2744 	return rc;
2745 }
2746 
2747 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2748 static void skb_update_prio(struct sk_buff *skb)
2749 {
2750 	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2751 
2752 	if (!skb->priority && skb->sk && map) {
2753 		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2754 
2755 		if (prioidx < map->priomap_len)
2756 			skb->priority = map->priomap[prioidx];
2757 	}
2758 }
2759 #else
2760 #define skb_update_prio(skb)
2761 #endif
2762 
2763 static DEFINE_PER_CPU(int, xmit_recursion);
2764 #define RECURSION_LIMIT 10
2765 
2766 /**
2767  *	dev_loopback_xmit - loop back @skb
2768  *	@skb: buffer to transmit
2769  */
2770 int dev_loopback_xmit(struct sk_buff *skb)
2771 {
2772 	skb_reset_mac_header(skb);
2773 	__skb_pull(skb, skb_network_offset(skb));
2774 	skb->pkt_type = PACKET_LOOPBACK;
2775 	skb->ip_summed = CHECKSUM_UNNECESSARY;
2776 	WARN_ON(!skb_dst(skb));
2777 	skb_dst_force(skb);
2778 	netif_rx_ni(skb);
2779 	return 0;
2780 }
2781 EXPORT_SYMBOL(dev_loopback_xmit);
2782 
2783 /**
2784  *	dev_queue_xmit - transmit a buffer
2785  *	@skb: buffer to transmit
2786  *
2787  *	Queue a buffer for transmission to a network device. The caller must
2788  *	have set the device and priority and built the buffer before calling
2789  *	this function. The function can be called from an interrupt.
2790  *
2791  *	A negative errno code is returned on a failure. A success does not
2792  *	guarantee the frame will be transmitted as it may be dropped due
2793  *	to congestion or traffic shaping.
2794  *
2795  * -----------------------------------------------------------------------------------
2796  *      I notice this method can also return errors from the queue disciplines,
2797  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2798  *      be positive.
2799  *
2800  *      Regardless of the return value, the skb is consumed, so it is currently
2801  *      difficult to retry a send to this method.  (You can bump the ref count
2802  *      before sending to hold a reference for retry if you are careful.)
2803  *
2804  *      When calling this method, interrupts MUST be enabled.  This is because
2805  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2806  *          --BLG
2807  */
2808 int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2809 {
2810 	struct net_device *dev = skb->dev;
2811 	struct netdev_queue *txq;
2812 	struct Qdisc *q;
2813 	int rc = -ENOMEM;
2814 
2815 	skb_reset_mac_header(skb);
2816 
2817 	/* Disable soft irqs for various locks below. Also
2818 	 * stops preemption for RCU.
2819 	 */
2820 	rcu_read_lock_bh();
2821 
2822 	skb_update_prio(skb);
2823 
2824 	txq = netdev_pick_tx(dev, skb, accel_priv);
2825 	q = rcu_dereference_bh(txq->qdisc);
2826 
2827 #ifdef CONFIG_NET_CLS_ACT
2828 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2829 #endif
2830 	trace_net_dev_queue(skb);
2831 	if (q->enqueue) {
2832 		rc = __dev_xmit_skb(skb, q, dev, txq);
2833 		goto out;
2834 	}
2835 
2836 	/* The device has no queue. Common case for software devices:
2837 	   loopback, all the sorts of tunnels...
2838 
2839 	   Really, it is unlikely that netif_tx_lock protection is necessary
2840 	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2841 	   counters.)
2842 	   However, it is possible, that they rely on protection
2843 	   made by us here.
2844 
2845 	   Check this and shot the lock. It is not prone from deadlocks.
2846 	   Either shot noqueue qdisc, it is even simpler 8)
2847 	 */
2848 	if (dev->flags & IFF_UP) {
2849 		int cpu = smp_processor_id(); /* ok because BHs are off */
2850 
2851 		if (txq->xmit_lock_owner != cpu) {
2852 
2853 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2854 				goto recursion_alert;
2855 
2856 			HARD_TX_LOCK(dev, txq, cpu);
2857 
2858 			if (!netif_xmit_stopped(txq)) {
2859 				__this_cpu_inc(xmit_recursion);
2860 				rc = dev_hard_start_xmit(skb, dev, txq);
2861 				__this_cpu_dec(xmit_recursion);
2862 				if (dev_xmit_complete(rc)) {
2863 					HARD_TX_UNLOCK(dev, txq);
2864 					goto out;
2865 				}
2866 			}
2867 			HARD_TX_UNLOCK(dev, txq);
2868 			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2869 					     dev->name);
2870 		} else {
2871 			/* Recursion is detected! It is possible,
2872 			 * unfortunately
2873 			 */
2874 recursion_alert:
2875 			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2876 					     dev->name);
2877 		}
2878 	}
2879 
2880 	rc = -ENETDOWN;
2881 	rcu_read_unlock_bh();
2882 
2883 	kfree_skb(skb);
2884 	return rc;
2885 out:
2886 	rcu_read_unlock_bh();
2887 	return rc;
2888 }
2889 
2890 int dev_queue_xmit(struct sk_buff *skb)
2891 {
2892 	return __dev_queue_xmit(skb, NULL);
2893 }
2894 EXPORT_SYMBOL(dev_queue_xmit);
2895 
2896 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
2897 {
2898 	return __dev_queue_xmit(skb, accel_priv);
2899 }
2900 EXPORT_SYMBOL(dev_queue_xmit_accel);
2901 
2902 
2903 /*=======================================================================
2904 			Receiver routines
2905   =======================================================================*/
2906 
2907 int netdev_max_backlog __read_mostly = 1000;
2908 EXPORT_SYMBOL(netdev_max_backlog);
2909 
2910 int netdev_tstamp_prequeue __read_mostly = 1;
2911 int netdev_budget __read_mostly = 300;
2912 int weight_p __read_mostly = 64;            /* old backlog weight */
2913 
2914 /* Called with irq disabled */
2915 static inline void ____napi_schedule(struct softnet_data *sd,
2916 				     struct napi_struct *napi)
2917 {
2918 	list_add_tail(&napi->poll_list, &sd->poll_list);
2919 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2920 }
2921 
2922 #ifdef CONFIG_RPS
2923 
2924 /* One global table that all flow-based protocols share. */
2925 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2926 EXPORT_SYMBOL(rps_sock_flow_table);
2927 
2928 struct static_key rps_needed __read_mostly;
2929 
2930 static struct rps_dev_flow *
2931 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2932 	    struct rps_dev_flow *rflow, u16 next_cpu)
2933 {
2934 	if (next_cpu != RPS_NO_CPU) {
2935 #ifdef CONFIG_RFS_ACCEL
2936 		struct netdev_rx_queue *rxqueue;
2937 		struct rps_dev_flow_table *flow_table;
2938 		struct rps_dev_flow *old_rflow;
2939 		u32 flow_id;
2940 		u16 rxq_index;
2941 		int rc;
2942 
2943 		/* Should we steer this flow to a different hardware queue? */
2944 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2945 		    !(dev->features & NETIF_F_NTUPLE))
2946 			goto out;
2947 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2948 		if (rxq_index == skb_get_rx_queue(skb))
2949 			goto out;
2950 
2951 		rxqueue = dev->_rx + rxq_index;
2952 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2953 		if (!flow_table)
2954 			goto out;
2955 		flow_id = skb->rxhash & flow_table->mask;
2956 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2957 							rxq_index, flow_id);
2958 		if (rc < 0)
2959 			goto out;
2960 		old_rflow = rflow;
2961 		rflow = &flow_table->flows[flow_id];
2962 		rflow->filter = rc;
2963 		if (old_rflow->filter == rflow->filter)
2964 			old_rflow->filter = RPS_NO_FILTER;
2965 	out:
2966 #endif
2967 		rflow->last_qtail =
2968 			per_cpu(softnet_data, next_cpu).input_queue_head;
2969 	}
2970 
2971 	rflow->cpu = next_cpu;
2972 	return rflow;
2973 }
2974 
2975 /*
2976  * get_rps_cpu is called from netif_receive_skb and returns the target
2977  * CPU from the RPS map of the receiving queue for a given skb.
2978  * rcu_read_lock must be held on entry.
2979  */
2980 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2981 		       struct rps_dev_flow **rflowp)
2982 {
2983 	struct netdev_rx_queue *rxqueue;
2984 	struct rps_map *map;
2985 	struct rps_dev_flow_table *flow_table;
2986 	struct rps_sock_flow_table *sock_flow_table;
2987 	int cpu = -1;
2988 	u16 tcpu;
2989 
2990 	if (skb_rx_queue_recorded(skb)) {
2991 		u16 index = skb_get_rx_queue(skb);
2992 		if (unlikely(index >= dev->real_num_rx_queues)) {
2993 			WARN_ONCE(dev->real_num_rx_queues > 1,
2994 				  "%s received packet on queue %u, but number "
2995 				  "of RX queues is %u\n",
2996 				  dev->name, index, dev->real_num_rx_queues);
2997 			goto done;
2998 		}
2999 		rxqueue = dev->_rx + index;
3000 	} else
3001 		rxqueue = dev->_rx;
3002 
3003 	map = rcu_dereference(rxqueue->rps_map);
3004 	if (map) {
3005 		if (map->len == 1 &&
3006 		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
3007 			tcpu = map->cpus[0];
3008 			if (cpu_online(tcpu))
3009 				cpu = tcpu;
3010 			goto done;
3011 		}
3012 	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3013 		goto done;
3014 	}
3015 
3016 	skb_reset_network_header(skb);
3017 	if (!skb_get_rxhash(skb))
3018 		goto done;
3019 
3020 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3021 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3022 	if (flow_table && sock_flow_table) {
3023 		u16 next_cpu;
3024 		struct rps_dev_flow *rflow;
3025 
3026 		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
3027 		tcpu = rflow->cpu;
3028 
3029 		next_cpu = sock_flow_table->ents[skb->rxhash &
3030 		    sock_flow_table->mask];
3031 
3032 		/*
3033 		 * If the desired CPU (where last recvmsg was done) is
3034 		 * different from current CPU (one in the rx-queue flow
3035 		 * table entry), switch if one of the following holds:
3036 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
3037 		 *   - Current CPU is offline.
3038 		 *   - The current CPU's queue tail has advanced beyond the
3039 		 *     last packet that was enqueued using this table entry.
3040 		 *     This guarantees that all previous packets for the flow
3041 		 *     have been dequeued, thus preserving in order delivery.
3042 		 */
3043 		if (unlikely(tcpu != next_cpu) &&
3044 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3045 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3046 		      rflow->last_qtail)) >= 0)) {
3047 			tcpu = next_cpu;
3048 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3049 		}
3050 
3051 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3052 			*rflowp = rflow;
3053 			cpu = tcpu;
3054 			goto done;
3055 		}
3056 	}
3057 
3058 	if (map) {
3059 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
3060 
3061 		if (cpu_online(tcpu)) {
3062 			cpu = tcpu;
3063 			goto done;
3064 		}
3065 	}
3066 
3067 done:
3068 	return cpu;
3069 }
3070 
3071 #ifdef CONFIG_RFS_ACCEL
3072 
3073 /**
3074  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3075  * @dev: Device on which the filter was set
3076  * @rxq_index: RX queue index
3077  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3078  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3079  *
3080  * Drivers that implement ndo_rx_flow_steer() should periodically call
3081  * this function for each installed filter and remove the filters for
3082  * which it returns %true.
3083  */
3084 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3085 			 u32 flow_id, u16 filter_id)
3086 {
3087 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3088 	struct rps_dev_flow_table *flow_table;
3089 	struct rps_dev_flow *rflow;
3090 	bool expire = true;
3091 	int cpu;
3092 
3093 	rcu_read_lock();
3094 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3095 	if (flow_table && flow_id <= flow_table->mask) {
3096 		rflow = &flow_table->flows[flow_id];
3097 		cpu = ACCESS_ONCE(rflow->cpu);
3098 		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3099 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3100 			   rflow->last_qtail) <
3101 		     (int)(10 * flow_table->mask)))
3102 			expire = false;
3103 	}
3104 	rcu_read_unlock();
3105 	return expire;
3106 }
3107 EXPORT_SYMBOL(rps_may_expire_flow);
3108 
3109 #endif /* CONFIG_RFS_ACCEL */
3110 
3111 /* Called from hardirq (IPI) context */
3112 static void rps_trigger_softirq(void *data)
3113 {
3114 	struct softnet_data *sd = data;
3115 
3116 	____napi_schedule(sd, &sd->backlog);
3117 	sd->received_rps++;
3118 }
3119 
3120 #endif /* CONFIG_RPS */
3121 
3122 /*
3123  * Check if this softnet_data structure is another cpu one
3124  * If yes, queue it to our IPI list and return 1
3125  * If no, return 0
3126  */
3127 static int rps_ipi_queued(struct softnet_data *sd)
3128 {
3129 #ifdef CONFIG_RPS
3130 	struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3131 
3132 	if (sd != mysd) {
3133 		sd->rps_ipi_next = mysd->rps_ipi_list;
3134 		mysd->rps_ipi_list = sd;
3135 
3136 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3137 		return 1;
3138 	}
3139 #endif /* CONFIG_RPS */
3140 	return 0;
3141 }
3142 
3143 #ifdef CONFIG_NET_FLOW_LIMIT
3144 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3145 #endif
3146 
3147 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3148 {
3149 #ifdef CONFIG_NET_FLOW_LIMIT
3150 	struct sd_flow_limit *fl;
3151 	struct softnet_data *sd;
3152 	unsigned int old_flow, new_flow;
3153 
3154 	if (qlen < (netdev_max_backlog >> 1))
3155 		return false;
3156 
3157 	sd = &__get_cpu_var(softnet_data);
3158 
3159 	rcu_read_lock();
3160 	fl = rcu_dereference(sd->flow_limit);
3161 	if (fl) {
3162 		new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
3163 		old_flow = fl->history[fl->history_head];
3164 		fl->history[fl->history_head] = new_flow;
3165 
3166 		fl->history_head++;
3167 		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3168 
3169 		if (likely(fl->buckets[old_flow]))
3170 			fl->buckets[old_flow]--;
3171 
3172 		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3173 			fl->count++;
3174 			rcu_read_unlock();
3175 			return true;
3176 		}
3177 	}
3178 	rcu_read_unlock();
3179 #endif
3180 	return false;
3181 }
3182 
3183 /*
3184  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3185  * queue (may be a remote CPU queue).
3186  */
3187 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3188 			      unsigned int *qtail)
3189 {
3190 	struct softnet_data *sd;
3191 	unsigned long flags;
3192 	unsigned int qlen;
3193 
3194 	sd = &per_cpu(softnet_data, cpu);
3195 
3196 	local_irq_save(flags);
3197 
3198 	rps_lock(sd);
3199 	qlen = skb_queue_len(&sd->input_pkt_queue);
3200 	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3201 		if (skb_queue_len(&sd->input_pkt_queue)) {
3202 enqueue:
3203 			__skb_queue_tail(&sd->input_pkt_queue, skb);
3204 			input_queue_tail_incr_save(sd, qtail);
3205 			rps_unlock(sd);
3206 			local_irq_restore(flags);
3207 			return NET_RX_SUCCESS;
3208 		}
3209 
3210 		/* Schedule NAPI for backlog device
3211 		 * We can use non atomic operation since we own the queue lock
3212 		 */
3213 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3214 			if (!rps_ipi_queued(sd))
3215 				____napi_schedule(sd, &sd->backlog);
3216 		}
3217 		goto enqueue;
3218 	}
3219 
3220 	sd->dropped++;
3221 	rps_unlock(sd);
3222 
3223 	local_irq_restore(flags);
3224 
3225 	atomic_long_inc(&skb->dev->rx_dropped);
3226 	kfree_skb(skb);
3227 	return NET_RX_DROP;
3228 }
3229 
3230 /**
3231  *	netif_rx	-	post buffer to the network code
3232  *	@skb: buffer to post
3233  *
3234  *	This function receives a packet from a device driver and queues it for
3235  *	the upper (protocol) levels to process.  It always succeeds. The buffer
3236  *	may be dropped during processing for congestion control or by the
3237  *	protocol layers.
3238  *
3239  *	return values:
3240  *	NET_RX_SUCCESS	(no congestion)
3241  *	NET_RX_DROP     (packet was dropped)
3242  *
3243  */
3244 
3245 int netif_rx(struct sk_buff *skb)
3246 {
3247 	int ret;
3248 
3249 	/* if netpoll wants it, pretend we never saw it */
3250 	if (netpoll_rx(skb))
3251 		return NET_RX_DROP;
3252 
3253 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3254 
3255 	trace_netif_rx(skb);
3256 #ifdef CONFIG_RPS
3257 	if (static_key_false(&rps_needed)) {
3258 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3259 		int cpu;
3260 
3261 		preempt_disable();
3262 		rcu_read_lock();
3263 
3264 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3265 		if (cpu < 0)
3266 			cpu = smp_processor_id();
3267 
3268 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3269 
3270 		rcu_read_unlock();
3271 		preempt_enable();
3272 	} else
3273 #endif
3274 	{
3275 		unsigned int qtail;
3276 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3277 		put_cpu();
3278 	}
3279 	return ret;
3280 }
3281 EXPORT_SYMBOL(netif_rx);
3282 
3283 int netif_rx_ni(struct sk_buff *skb)
3284 {
3285 	int err;
3286 
3287 	preempt_disable();
3288 	err = netif_rx(skb);
3289 	if (local_softirq_pending())
3290 		do_softirq();
3291 	preempt_enable();
3292 
3293 	return err;
3294 }
3295 EXPORT_SYMBOL(netif_rx_ni);
3296 
3297 static void net_tx_action(struct softirq_action *h)
3298 {
3299 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3300 
3301 	if (sd->completion_queue) {
3302 		struct sk_buff *clist;
3303 
3304 		local_irq_disable();
3305 		clist = sd->completion_queue;
3306 		sd->completion_queue = NULL;
3307 		local_irq_enable();
3308 
3309 		while (clist) {
3310 			struct sk_buff *skb = clist;
3311 			clist = clist->next;
3312 
3313 			WARN_ON(atomic_read(&skb->users));
3314 			trace_kfree_skb(skb, net_tx_action);
3315 			__kfree_skb(skb);
3316 		}
3317 	}
3318 
3319 	if (sd->output_queue) {
3320 		struct Qdisc *head;
3321 
3322 		local_irq_disable();
3323 		head = sd->output_queue;
3324 		sd->output_queue = NULL;
3325 		sd->output_queue_tailp = &sd->output_queue;
3326 		local_irq_enable();
3327 
3328 		while (head) {
3329 			struct Qdisc *q = head;
3330 			spinlock_t *root_lock;
3331 
3332 			head = head->next_sched;
3333 
3334 			root_lock = qdisc_lock(q);
3335 			if (spin_trylock(root_lock)) {
3336 				smp_mb__before_clear_bit();
3337 				clear_bit(__QDISC_STATE_SCHED,
3338 					  &q->state);
3339 				qdisc_run(q);
3340 				spin_unlock(root_lock);
3341 			} else {
3342 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3343 					      &q->state)) {
3344 					__netif_reschedule(q);
3345 				} else {
3346 					smp_mb__before_clear_bit();
3347 					clear_bit(__QDISC_STATE_SCHED,
3348 						  &q->state);
3349 				}
3350 			}
3351 		}
3352 	}
3353 }
3354 
3355 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3356     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3357 /* This hook is defined here for ATM LANE */
3358 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3359 			     unsigned char *addr) __read_mostly;
3360 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3361 #endif
3362 
3363 #ifdef CONFIG_NET_CLS_ACT
3364 /* TODO: Maybe we should just force sch_ingress to be compiled in
3365  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3366  * a compare and 2 stores extra right now if we dont have it on
3367  * but have CONFIG_NET_CLS_ACT
3368  * NOTE: This doesn't stop any functionality; if you dont have
3369  * the ingress scheduler, you just can't add policies on ingress.
3370  *
3371  */
3372 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3373 {
3374 	struct net_device *dev = skb->dev;
3375 	u32 ttl = G_TC_RTTL(skb->tc_verd);
3376 	int result = TC_ACT_OK;
3377 	struct Qdisc *q;
3378 
3379 	if (unlikely(MAX_RED_LOOP < ttl++)) {
3380 		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3381 				     skb->skb_iif, dev->ifindex);
3382 		return TC_ACT_SHOT;
3383 	}
3384 
3385 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3386 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3387 
3388 	q = rxq->qdisc;
3389 	if (q != &noop_qdisc) {
3390 		spin_lock(qdisc_lock(q));
3391 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3392 			result = qdisc_enqueue_root(skb, q);
3393 		spin_unlock(qdisc_lock(q));
3394 	}
3395 
3396 	return result;
3397 }
3398 
3399 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3400 					 struct packet_type **pt_prev,
3401 					 int *ret, struct net_device *orig_dev)
3402 {
3403 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3404 
3405 	if (!rxq || rxq->qdisc == &noop_qdisc)
3406 		goto out;
3407 
3408 	if (*pt_prev) {
3409 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3410 		*pt_prev = NULL;
3411 	}
3412 
3413 	switch (ing_filter(skb, rxq)) {
3414 	case TC_ACT_SHOT:
3415 	case TC_ACT_STOLEN:
3416 		kfree_skb(skb);
3417 		return NULL;
3418 	}
3419 
3420 out:
3421 	skb->tc_verd = 0;
3422 	return skb;
3423 }
3424 #endif
3425 
3426 /**
3427  *	netdev_rx_handler_register - register receive handler
3428  *	@dev: device to register a handler for
3429  *	@rx_handler: receive handler to register
3430  *	@rx_handler_data: data pointer that is used by rx handler
3431  *
3432  *	Register a receive hander for a device. This handler will then be
3433  *	called from __netif_receive_skb. A negative errno code is returned
3434  *	on a failure.
3435  *
3436  *	The caller must hold the rtnl_mutex.
3437  *
3438  *	For a general description of rx_handler, see enum rx_handler_result.
3439  */
3440 int netdev_rx_handler_register(struct net_device *dev,
3441 			       rx_handler_func_t *rx_handler,
3442 			       void *rx_handler_data)
3443 {
3444 	ASSERT_RTNL();
3445 
3446 	if (dev->rx_handler)
3447 		return -EBUSY;
3448 
3449 	/* Note: rx_handler_data must be set before rx_handler */
3450 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3451 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3452 
3453 	return 0;
3454 }
3455 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3456 
3457 /**
3458  *	netdev_rx_handler_unregister - unregister receive handler
3459  *	@dev: device to unregister a handler from
3460  *
3461  *	Unregister a receive handler from a device.
3462  *
3463  *	The caller must hold the rtnl_mutex.
3464  */
3465 void netdev_rx_handler_unregister(struct net_device *dev)
3466 {
3467 
3468 	ASSERT_RTNL();
3469 	RCU_INIT_POINTER(dev->rx_handler, NULL);
3470 	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3471 	 * section has a guarantee to see a non NULL rx_handler_data
3472 	 * as well.
3473 	 */
3474 	synchronize_net();
3475 	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3476 }
3477 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3478 
3479 /*
3480  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3481  * the special handling of PFMEMALLOC skbs.
3482  */
3483 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3484 {
3485 	switch (skb->protocol) {
3486 	case __constant_htons(ETH_P_ARP):
3487 	case __constant_htons(ETH_P_IP):
3488 	case __constant_htons(ETH_P_IPV6):
3489 	case __constant_htons(ETH_P_8021Q):
3490 	case __constant_htons(ETH_P_8021AD):
3491 		return true;
3492 	default:
3493 		return false;
3494 	}
3495 }
3496 
3497 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3498 {
3499 	struct packet_type *ptype, *pt_prev;
3500 	rx_handler_func_t *rx_handler;
3501 	struct net_device *orig_dev;
3502 	struct net_device *null_or_dev;
3503 	bool deliver_exact = false;
3504 	int ret = NET_RX_DROP;
3505 	__be16 type;
3506 
3507 	net_timestamp_check(!netdev_tstamp_prequeue, skb);
3508 
3509 	trace_netif_receive_skb(skb);
3510 
3511 	/* if we've gotten here through NAPI, check netpoll */
3512 	if (netpoll_receive_skb(skb))
3513 		goto out;
3514 
3515 	orig_dev = skb->dev;
3516 
3517 	skb_reset_network_header(skb);
3518 	if (!skb_transport_header_was_set(skb))
3519 		skb_reset_transport_header(skb);
3520 	skb_reset_mac_len(skb);
3521 
3522 	pt_prev = NULL;
3523 
3524 	rcu_read_lock();
3525 
3526 another_round:
3527 	skb->skb_iif = skb->dev->ifindex;
3528 
3529 	__this_cpu_inc(softnet_data.processed);
3530 
3531 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3532 	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3533 		skb = vlan_untag(skb);
3534 		if (unlikely(!skb))
3535 			goto unlock;
3536 	}
3537 
3538 #ifdef CONFIG_NET_CLS_ACT
3539 	if (skb->tc_verd & TC_NCLS) {
3540 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3541 		goto ncls;
3542 	}
3543 #endif
3544 
3545 	if (pfmemalloc)
3546 		goto skip_taps;
3547 
3548 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3549 		if (!ptype->dev || ptype->dev == skb->dev) {
3550 			if (pt_prev)
3551 				ret = deliver_skb(skb, pt_prev, orig_dev);
3552 			pt_prev = ptype;
3553 		}
3554 	}
3555 
3556 skip_taps:
3557 #ifdef CONFIG_NET_CLS_ACT
3558 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3559 	if (!skb)
3560 		goto unlock;
3561 ncls:
3562 #endif
3563 
3564 	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3565 		goto drop;
3566 
3567 	if (vlan_tx_tag_present(skb)) {
3568 		if (pt_prev) {
3569 			ret = deliver_skb(skb, pt_prev, orig_dev);
3570 			pt_prev = NULL;
3571 		}
3572 		if (vlan_do_receive(&skb))
3573 			goto another_round;
3574 		else if (unlikely(!skb))
3575 			goto unlock;
3576 	}
3577 
3578 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3579 	if (rx_handler) {
3580 		if (pt_prev) {
3581 			ret = deliver_skb(skb, pt_prev, orig_dev);
3582 			pt_prev = NULL;
3583 		}
3584 		switch (rx_handler(&skb)) {
3585 		case RX_HANDLER_CONSUMED:
3586 			ret = NET_RX_SUCCESS;
3587 			goto unlock;
3588 		case RX_HANDLER_ANOTHER:
3589 			goto another_round;
3590 		case RX_HANDLER_EXACT:
3591 			deliver_exact = true;
3592 		case RX_HANDLER_PASS:
3593 			break;
3594 		default:
3595 			BUG();
3596 		}
3597 	}
3598 
3599 	if (unlikely(vlan_tx_tag_present(skb))) {
3600 		if (vlan_tx_tag_get_id(skb))
3601 			skb->pkt_type = PACKET_OTHERHOST;
3602 		/* Note: we might in the future use prio bits
3603 		 * and set skb->priority like in vlan_do_receive()
3604 		 * For the time being, just ignore Priority Code Point
3605 		 */
3606 		skb->vlan_tci = 0;
3607 	}
3608 
3609 	/* deliver only exact match when indicated */
3610 	null_or_dev = deliver_exact ? skb->dev : NULL;
3611 
3612 	type = skb->protocol;
3613 	list_for_each_entry_rcu(ptype,
3614 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3615 		if (ptype->type == type &&
3616 		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3617 		     ptype->dev == orig_dev)) {
3618 			if (pt_prev)
3619 				ret = deliver_skb(skb, pt_prev, orig_dev);
3620 			pt_prev = ptype;
3621 		}
3622 	}
3623 
3624 	if (pt_prev) {
3625 		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3626 			goto drop;
3627 		else
3628 			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3629 	} else {
3630 drop:
3631 		atomic_long_inc(&skb->dev->rx_dropped);
3632 		kfree_skb(skb);
3633 		/* Jamal, now you will not able to escape explaining
3634 		 * me how you were going to use this. :-)
3635 		 */
3636 		ret = NET_RX_DROP;
3637 	}
3638 
3639 unlock:
3640 	rcu_read_unlock();
3641 out:
3642 	return ret;
3643 }
3644 
3645 static int __netif_receive_skb(struct sk_buff *skb)
3646 {
3647 	int ret;
3648 
3649 	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3650 		unsigned long pflags = current->flags;
3651 
3652 		/*
3653 		 * PFMEMALLOC skbs are special, they should
3654 		 * - be delivered to SOCK_MEMALLOC sockets only
3655 		 * - stay away from userspace
3656 		 * - have bounded memory usage
3657 		 *
3658 		 * Use PF_MEMALLOC as this saves us from propagating the allocation
3659 		 * context down to all allocation sites.
3660 		 */
3661 		current->flags |= PF_MEMALLOC;
3662 		ret = __netif_receive_skb_core(skb, true);
3663 		tsk_restore_flags(current, pflags, PF_MEMALLOC);
3664 	} else
3665 		ret = __netif_receive_skb_core(skb, false);
3666 
3667 	return ret;
3668 }
3669 
3670 /**
3671  *	netif_receive_skb - process receive buffer from network
3672  *	@skb: buffer to process
3673  *
3674  *	netif_receive_skb() is the main receive data processing function.
3675  *	It always succeeds. The buffer may be dropped during processing
3676  *	for congestion control or by the protocol layers.
3677  *
3678  *	This function may only be called from softirq context and interrupts
3679  *	should be enabled.
3680  *
3681  *	Return values (usually ignored):
3682  *	NET_RX_SUCCESS: no congestion
3683  *	NET_RX_DROP: packet was dropped
3684  */
3685 int netif_receive_skb(struct sk_buff *skb)
3686 {
3687 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3688 
3689 	if (skb_defer_rx_timestamp(skb))
3690 		return NET_RX_SUCCESS;
3691 
3692 #ifdef CONFIG_RPS
3693 	if (static_key_false(&rps_needed)) {
3694 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3695 		int cpu, ret;
3696 
3697 		rcu_read_lock();
3698 
3699 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3700 
3701 		if (cpu >= 0) {
3702 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3703 			rcu_read_unlock();
3704 			return ret;
3705 		}
3706 		rcu_read_unlock();
3707 	}
3708 #endif
3709 	return __netif_receive_skb(skb);
3710 }
3711 EXPORT_SYMBOL(netif_receive_skb);
3712 
3713 /* Network device is going away, flush any packets still pending
3714  * Called with irqs disabled.
3715  */
3716 static void flush_backlog(void *arg)
3717 {
3718 	struct net_device *dev = arg;
3719 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3720 	struct sk_buff *skb, *tmp;
3721 
3722 	rps_lock(sd);
3723 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3724 		if (skb->dev == dev) {
3725 			__skb_unlink(skb, &sd->input_pkt_queue);
3726 			kfree_skb(skb);
3727 			input_queue_head_incr(sd);
3728 		}
3729 	}
3730 	rps_unlock(sd);
3731 
3732 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3733 		if (skb->dev == dev) {
3734 			__skb_unlink(skb, &sd->process_queue);
3735 			kfree_skb(skb);
3736 			input_queue_head_incr(sd);
3737 		}
3738 	}
3739 }
3740 
3741 static int napi_gro_complete(struct sk_buff *skb)
3742 {
3743 	struct packet_offload *ptype;
3744 	__be16 type = skb->protocol;
3745 	struct list_head *head = &offload_base;
3746 	int err = -ENOENT;
3747 
3748 	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3749 
3750 	if (NAPI_GRO_CB(skb)->count == 1) {
3751 		skb_shinfo(skb)->gso_size = 0;
3752 		goto out;
3753 	}
3754 
3755 	rcu_read_lock();
3756 	list_for_each_entry_rcu(ptype, head, list) {
3757 		if (ptype->type != type || !ptype->callbacks.gro_complete)
3758 			continue;
3759 
3760 		err = ptype->callbacks.gro_complete(skb);
3761 		break;
3762 	}
3763 	rcu_read_unlock();
3764 
3765 	if (err) {
3766 		WARN_ON(&ptype->list == head);
3767 		kfree_skb(skb);
3768 		return NET_RX_SUCCESS;
3769 	}
3770 
3771 out:
3772 	return netif_receive_skb(skb);
3773 }
3774 
3775 /* napi->gro_list contains packets ordered by age.
3776  * youngest packets at the head of it.
3777  * Complete skbs in reverse order to reduce latencies.
3778  */
3779 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3780 {
3781 	struct sk_buff *skb, *prev = NULL;
3782 
3783 	/* scan list and build reverse chain */
3784 	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3785 		skb->prev = prev;
3786 		prev = skb;
3787 	}
3788 
3789 	for (skb = prev; skb; skb = prev) {
3790 		skb->next = NULL;
3791 
3792 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3793 			return;
3794 
3795 		prev = skb->prev;
3796 		napi_gro_complete(skb);
3797 		napi->gro_count--;
3798 	}
3799 
3800 	napi->gro_list = NULL;
3801 }
3802 EXPORT_SYMBOL(napi_gro_flush);
3803 
3804 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3805 {
3806 	struct sk_buff *p;
3807 	unsigned int maclen = skb->dev->hard_header_len;
3808 
3809 	for (p = napi->gro_list; p; p = p->next) {
3810 		unsigned long diffs;
3811 
3812 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3813 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3814 		if (maclen == ETH_HLEN)
3815 			diffs |= compare_ether_header(skb_mac_header(p),
3816 						      skb_gro_mac_header(skb));
3817 		else if (!diffs)
3818 			diffs = memcmp(skb_mac_header(p),
3819 				       skb_gro_mac_header(skb),
3820 				       maclen);
3821 		NAPI_GRO_CB(p)->same_flow = !diffs;
3822 		NAPI_GRO_CB(p)->flush = 0;
3823 	}
3824 }
3825 
3826 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3827 {
3828 	struct sk_buff **pp = NULL;
3829 	struct packet_offload *ptype;
3830 	__be16 type = skb->protocol;
3831 	struct list_head *head = &offload_base;
3832 	int same_flow;
3833 	enum gro_result ret;
3834 
3835 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3836 		goto normal;
3837 
3838 	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3839 		goto normal;
3840 
3841 	gro_list_prepare(napi, skb);
3842 
3843 	rcu_read_lock();
3844 	list_for_each_entry_rcu(ptype, head, list) {
3845 		if (ptype->type != type || !ptype->callbacks.gro_receive)
3846 			continue;
3847 
3848 		skb_set_network_header(skb, skb_gro_offset(skb));
3849 		skb_reset_mac_len(skb);
3850 		NAPI_GRO_CB(skb)->same_flow = 0;
3851 		NAPI_GRO_CB(skb)->flush = 0;
3852 		NAPI_GRO_CB(skb)->free = 0;
3853 
3854 		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3855 		break;
3856 	}
3857 	rcu_read_unlock();
3858 
3859 	if (&ptype->list == head)
3860 		goto normal;
3861 
3862 	same_flow = NAPI_GRO_CB(skb)->same_flow;
3863 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3864 
3865 	if (pp) {
3866 		struct sk_buff *nskb = *pp;
3867 
3868 		*pp = nskb->next;
3869 		nskb->next = NULL;
3870 		napi_gro_complete(nskb);
3871 		napi->gro_count--;
3872 	}
3873 
3874 	if (same_flow)
3875 		goto ok;
3876 
3877 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3878 		goto normal;
3879 
3880 	napi->gro_count++;
3881 	NAPI_GRO_CB(skb)->count = 1;
3882 	NAPI_GRO_CB(skb)->age = jiffies;
3883 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3884 	skb->next = napi->gro_list;
3885 	napi->gro_list = skb;
3886 	ret = GRO_HELD;
3887 
3888 pull:
3889 	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3890 		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3891 
3892 		BUG_ON(skb->end - skb->tail < grow);
3893 
3894 		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3895 
3896 		skb->tail += grow;
3897 		skb->data_len -= grow;
3898 
3899 		skb_shinfo(skb)->frags[0].page_offset += grow;
3900 		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3901 
3902 		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3903 			skb_frag_unref(skb, 0);
3904 			memmove(skb_shinfo(skb)->frags,
3905 				skb_shinfo(skb)->frags + 1,
3906 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3907 		}
3908 	}
3909 
3910 ok:
3911 	return ret;
3912 
3913 normal:
3914 	ret = GRO_NORMAL;
3915 	goto pull;
3916 }
3917 
3918 
3919 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3920 {
3921 	switch (ret) {
3922 	case GRO_NORMAL:
3923 		if (netif_receive_skb(skb))
3924 			ret = GRO_DROP;
3925 		break;
3926 
3927 	case GRO_DROP:
3928 		kfree_skb(skb);
3929 		break;
3930 
3931 	case GRO_MERGED_FREE:
3932 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3933 			kmem_cache_free(skbuff_head_cache, skb);
3934 		else
3935 			__kfree_skb(skb);
3936 		break;
3937 
3938 	case GRO_HELD:
3939 	case GRO_MERGED:
3940 		break;
3941 	}
3942 
3943 	return ret;
3944 }
3945 
3946 static void skb_gro_reset_offset(struct sk_buff *skb)
3947 {
3948 	const struct skb_shared_info *pinfo = skb_shinfo(skb);
3949 	const skb_frag_t *frag0 = &pinfo->frags[0];
3950 
3951 	NAPI_GRO_CB(skb)->data_offset = 0;
3952 	NAPI_GRO_CB(skb)->frag0 = NULL;
3953 	NAPI_GRO_CB(skb)->frag0_len = 0;
3954 
3955 	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3956 	    pinfo->nr_frags &&
3957 	    !PageHighMem(skb_frag_page(frag0))) {
3958 		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3959 		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3960 	}
3961 }
3962 
3963 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3964 {
3965 	skb_gro_reset_offset(skb);
3966 
3967 	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3968 }
3969 EXPORT_SYMBOL(napi_gro_receive);
3970 
3971 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3972 {
3973 	__skb_pull(skb, skb_headlen(skb));
3974 	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
3975 	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3976 	skb->vlan_tci = 0;
3977 	skb->dev = napi->dev;
3978 	skb->skb_iif = 0;
3979 
3980 	napi->skb = skb;
3981 }
3982 
3983 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3984 {
3985 	struct sk_buff *skb = napi->skb;
3986 
3987 	if (!skb) {
3988 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3989 		if (skb)
3990 			napi->skb = skb;
3991 	}
3992 	return skb;
3993 }
3994 EXPORT_SYMBOL(napi_get_frags);
3995 
3996 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3997 			       gro_result_t ret)
3998 {
3999 	switch (ret) {
4000 	case GRO_NORMAL:
4001 	case GRO_HELD:
4002 		skb->protocol = eth_type_trans(skb, skb->dev);
4003 
4004 		if (ret == GRO_HELD)
4005 			skb_gro_pull(skb, -ETH_HLEN);
4006 		else if (netif_receive_skb(skb))
4007 			ret = GRO_DROP;
4008 		break;
4009 
4010 	case GRO_DROP:
4011 	case GRO_MERGED_FREE:
4012 		napi_reuse_skb(napi, skb);
4013 		break;
4014 
4015 	case GRO_MERGED:
4016 		break;
4017 	}
4018 
4019 	return ret;
4020 }
4021 
4022 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4023 {
4024 	struct sk_buff *skb = napi->skb;
4025 	struct ethhdr *eth;
4026 	unsigned int hlen;
4027 	unsigned int off;
4028 
4029 	napi->skb = NULL;
4030 
4031 	skb_reset_mac_header(skb);
4032 	skb_gro_reset_offset(skb);
4033 
4034 	off = skb_gro_offset(skb);
4035 	hlen = off + sizeof(*eth);
4036 	eth = skb_gro_header_fast(skb, off);
4037 	if (skb_gro_header_hard(skb, hlen)) {
4038 		eth = skb_gro_header_slow(skb, hlen, off);
4039 		if (unlikely(!eth)) {
4040 			napi_reuse_skb(napi, skb);
4041 			skb = NULL;
4042 			goto out;
4043 		}
4044 	}
4045 
4046 	skb_gro_pull(skb, sizeof(*eth));
4047 
4048 	/*
4049 	 * This works because the only protocols we care about don't require
4050 	 * special handling.  We'll fix it up properly at the end.
4051 	 */
4052 	skb->protocol = eth->h_proto;
4053 
4054 out:
4055 	return skb;
4056 }
4057 
4058 gro_result_t napi_gro_frags(struct napi_struct *napi)
4059 {
4060 	struct sk_buff *skb = napi_frags_skb(napi);
4061 
4062 	if (!skb)
4063 		return GRO_DROP;
4064 
4065 	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4066 }
4067 EXPORT_SYMBOL(napi_gro_frags);
4068 
4069 /*
4070  * net_rps_action sends any pending IPI's for rps.
4071  * Note: called with local irq disabled, but exits with local irq enabled.
4072  */
4073 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4074 {
4075 #ifdef CONFIG_RPS
4076 	struct softnet_data *remsd = sd->rps_ipi_list;
4077 
4078 	if (remsd) {
4079 		sd->rps_ipi_list = NULL;
4080 
4081 		local_irq_enable();
4082 
4083 		/* Send pending IPI's to kick RPS processing on remote cpus. */
4084 		while (remsd) {
4085 			struct softnet_data *next = remsd->rps_ipi_next;
4086 
4087 			if (cpu_online(remsd->cpu))
4088 				__smp_call_function_single(remsd->cpu,
4089 							   &remsd->csd, 0);
4090 			remsd = next;
4091 		}
4092 	} else
4093 #endif
4094 		local_irq_enable();
4095 }
4096 
4097 static int process_backlog(struct napi_struct *napi, int quota)
4098 {
4099 	int work = 0;
4100 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4101 
4102 #ifdef CONFIG_RPS
4103 	/* Check if we have pending ipi, its better to send them now,
4104 	 * not waiting net_rx_action() end.
4105 	 */
4106 	if (sd->rps_ipi_list) {
4107 		local_irq_disable();
4108 		net_rps_action_and_irq_enable(sd);
4109 	}
4110 #endif
4111 	napi->weight = weight_p;
4112 	local_irq_disable();
4113 	while (work < quota) {
4114 		struct sk_buff *skb;
4115 		unsigned int qlen;
4116 
4117 		while ((skb = __skb_dequeue(&sd->process_queue))) {
4118 			local_irq_enable();
4119 			__netif_receive_skb(skb);
4120 			local_irq_disable();
4121 			input_queue_head_incr(sd);
4122 			if (++work >= quota) {
4123 				local_irq_enable();
4124 				return work;
4125 			}
4126 		}
4127 
4128 		rps_lock(sd);
4129 		qlen = skb_queue_len(&sd->input_pkt_queue);
4130 		if (qlen)
4131 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
4132 						   &sd->process_queue);
4133 
4134 		if (qlen < quota - work) {
4135 			/*
4136 			 * Inline a custom version of __napi_complete().
4137 			 * only current cpu owns and manipulates this napi,
4138 			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4139 			 * we can use a plain write instead of clear_bit(),
4140 			 * and we dont need an smp_mb() memory barrier.
4141 			 */
4142 			list_del(&napi->poll_list);
4143 			napi->state = 0;
4144 
4145 			quota = work + qlen;
4146 		}
4147 		rps_unlock(sd);
4148 	}
4149 	local_irq_enable();
4150 
4151 	return work;
4152 }
4153 
4154 /**
4155  * __napi_schedule - schedule for receive
4156  * @n: entry to schedule
4157  *
4158  * The entry's receive function will be scheduled to run
4159  */
4160 void __napi_schedule(struct napi_struct *n)
4161 {
4162 	unsigned long flags;
4163 
4164 	local_irq_save(flags);
4165 	____napi_schedule(&__get_cpu_var(softnet_data), n);
4166 	local_irq_restore(flags);
4167 }
4168 EXPORT_SYMBOL(__napi_schedule);
4169 
4170 void __napi_complete(struct napi_struct *n)
4171 {
4172 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4173 	BUG_ON(n->gro_list);
4174 
4175 	list_del(&n->poll_list);
4176 	smp_mb__before_clear_bit();
4177 	clear_bit(NAPI_STATE_SCHED, &n->state);
4178 }
4179 EXPORT_SYMBOL(__napi_complete);
4180 
4181 void napi_complete(struct napi_struct *n)
4182 {
4183 	unsigned long flags;
4184 
4185 	/*
4186 	 * don't let napi dequeue from the cpu poll list
4187 	 * just in case its running on a different cpu
4188 	 */
4189 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4190 		return;
4191 
4192 	napi_gro_flush(n, false);
4193 	local_irq_save(flags);
4194 	__napi_complete(n);
4195 	local_irq_restore(flags);
4196 }
4197 EXPORT_SYMBOL(napi_complete);
4198 
4199 /* must be called under rcu_read_lock(), as we dont take a reference */
4200 struct napi_struct *napi_by_id(unsigned int napi_id)
4201 {
4202 	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4203 	struct napi_struct *napi;
4204 
4205 	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4206 		if (napi->napi_id == napi_id)
4207 			return napi;
4208 
4209 	return NULL;
4210 }
4211 EXPORT_SYMBOL_GPL(napi_by_id);
4212 
4213 void napi_hash_add(struct napi_struct *napi)
4214 {
4215 	if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4216 
4217 		spin_lock(&napi_hash_lock);
4218 
4219 		/* 0 is not a valid id, we also skip an id that is taken
4220 		 * we expect both events to be extremely rare
4221 		 */
4222 		napi->napi_id = 0;
4223 		while (!napi->napi_id) {
4224 			napi->napi_id = ++napi_gen_id;
4225 			if (napi_by_id(napi->napi_id))
4226 				napi->napi_id = 0;
4227 		}
4228 
4229 		hlist_add_head_rcu(&napi->napi_hash_node,
4230 			&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4231 
4232 		spin_unlock(&napi_hash_lock);
4233 	}
4234 }
4235 EXPORT_SYMBOL_GPL(napi_hash_add);
4236 
4237 /* Warning : caller is responsible to make sure rcu grace period
4238  * is respected before freeing memory containing @napi
4239  */
4240 void napi_hash_del(struct napi_struct *napi)
4241 {
4242 	spin_lock(&napi_hash_lock);
4243 
4244 	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4245 		hlist_del_rcu(&napi->napi_hash_node);
4246 
4247 	spin_unlock(&napi_hash_lock);
4248 }
4249 EXPORT_SYMBOL_GPL(napi_hash_del);
4250 
4251 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4252 		    int (*poll)(struct napi_struct *, int), int weight)
4253 {
4254 	INIT_LIST_HEAD(&napi->poll_list);
4255 	napi->gro_count = 0;
4256 	napi->gro_list = NULL;
4257 	napi->skb = NULL;
4258 	napi->poll = poll;
4259 	if (weight > NAPI_POLL_WEIGHT)
4260 		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4261 			    weight, dev->name);
4262 	napi->weight = weight;
4263 	list_add(&napi->dev_list, &dev->napi_list);
4264 	napi->dev = dev;
4265 #ifdef CONFIG_NETPOLL
4266 	spin_lock_init(&napi->poll_lock);
4267 	napi->poll_owner = -1;
4268 #endif
4269 	set_bit(NAPI_STATE_SCHED, &napi->state);
4270 }
4271 EXPORT_SYMBOL(netif_napi_add);
4272 
4273 void netif_napi_del(struct napi_struct *napi)
4274 {
4275 	struct sk_buff *skb, *next;
4276 
4277 	list_del_init(&napi->dev_list);
4278 	napi_free_frags(napi);
4279 
4280 	for (skb = napi->gro_list; skb; skb = next) {
4281 		next = skb->next;
4282 		skb->next = NULL;
4283 		kfree_skb(skb);
4284 	}
4285 
4286 	napi->gro_list = NULL;
4287 	napi->gro_count = 0;
4288 }
4289 EXPORT_SYMBOL(netif_napi_del);
4290 
4291 static void net_rx_action(struct softirq_action *h)
4292 {
4293 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
4294 	unsigned long time_limit = jiffies + 2;
4295 	int budget = netdev_budget;
4296 	void *have;
4297 
4298 	local_irq_disable();
4299 
4300 	while (!list_empty(&sd->poll_list)) {
4301 		struct napi_struct *n;
4302 		int work, weight;
4303 
4304 		/* If softirq window is exhuasted then punt.
4305 		 * Allow this to run for 2 jiffies since which will allow
4306 		 * an average latency of 1.5/HZ.
4307 		 */
4308 		if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4309 			goto softnet_break;
4310 
4311 		local_irq_enable();
4312 
4313 		/* Even though interrupts have been re-enabled, this
4314 		 * access is safe because interrupts can only add new
4315 		 * entries to the tail of this list, and only ->poll()
4316 		 * calls can remove this head entry from the list.
4317 		 */
4318 		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4319 
4320 		have = netpoll_poll_lock(n);
4321 
4322 		weight = n->weight;
4323 
4324 		/* This NAPI_STATE_SCHED test is for avoiding a race
4325 		 * with netpoll's poll_napi().  Only the entity which
4326 		 * obtains the lock and sees NAPI_STATE_SCHED set will
4327 		 * actually make the ->poll() call.  Therefore we avoid
4328 		 * accidentally calling ->poll() when NAPI is not scheduled.
4329 		 */
4330 		work = 0;
4331 		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4332 			work = n->poll(n, weight);
4333 			trace_napi_poll(n);
4334 		}
4335 
4336 		WARN_ON_ONCE(work > weight);
4337 
4338 		budget -= work;
4339 
4340 		local_irq_disable();
4341 
4342 		/* Drivers must not modify the NAPI state if they
4343 		 * consume the entire weight.  In such cases this code
4344 		 * still "owns" the NAPI instance and therefore can
4345 		 * move the instance around on the list at-will.
4346 		 */
4347 		if (unlikely(work == weight)) {
4348 			if (unlikely(napi_disable_pending(n))) {
4349 				local_irq_enable();
4350 				napi_complete(n);
4351 				local_irq_disable();
4352 			} else {
4353 				if (n->gro_list) {
4354 					/* flush too old packets
4355 					 * If HZ < 1000, flush all packets.
4356 					 */
4357 					local_irq_enable();
4358 					napi_gro_flush(n, HZ >= 1000);
4359 					local_irq_disable();
4360 				}
4361 				list_move_tail(&n->poll_list, &sd->poll_list);
4362 			}
4363 		}
4364 
4365 		netpoll_poll_unlock(have);
4366 	}
4367 out:
4368 	net_rps_action_and_irq_enable(sd);
4369 
4370 #ifdef CONFIG_NET_DMA
4371 	/*
4372 	 * There may not be any more sk_buffs coming right now, so push
4373 	 * any pending DMA copies to hardware
4374 	 */
4375 	dma_issue_pending_all();
4376 #endif
4377 
4378 	return;
4379 
4380 softnet_break:
4381 	sd->time_squeeze++;
4382 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
4383 	goto out;
4384 }
4385 
4386 struct netdev_adjacent {
4387 	struct net_device *dev;
4388 
4389 	/* upper master flag, there can only be one master device per list */
4390 	bool master;
4391 
4392 	/* counter for the number of times this device was added to us */
4393 	u16 ref_nr;
4394 
4395 	/* private field for the users */
4396 	void *private;
4397 
4398 	struct list_head list;
4399 	struct rcu_head rcu;
4400 };
4401 
4402 static struct netdev_adjacent *__netdev_find_adj_rcu(struct net_device *dev,
4403 						     struct net_device *adj_dev,
4404 						     struct list_head *adj_list)
4405 {
4406 	struct netdev_adjacent *adj;
4407 
4408 	list_for_each_entry_rcu(adj, adj_list, list) {
4409 		if (adj->dev == adj_dev)
4410 			return adj;
4411 	}
4412 	return NULL;
4413 }
4414 
4415 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4416 						 struct net_device *adj_dev,
4417 						 struct list_head *adj_list)
4418 {
4419 	struct netdev_adjacent *adj;
4420 
4421 	list_for_each_entry(adj, adj_list, list) {
4422 		if (adj->dev == adj_dev)
4423 			return adj;
4424 	}
4425 	return NULL;
4426 }
4427 
4428 /**
4429  * netdev_has_upper_dev - Check if device is linked to an upper device
4430  * @dev: device
4431  * @upper_dev: upper device to check
4432  *
4433  * Find out if a device is linked to specified upper device and return true
4434  * in case it is. Note that this checks only immediate upper device,
4435  * not through a complete stack of devices. The caller must hold the RTNL lock.
4436  */
4437 bool netdev_has_upper_dev(struct net_device *dev,
4438 			  struct net_device *upper_dev)
4439 {
4440 	ASSERT_RTNL();
4441 
4442 	return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4443 }
4444 EXPORT_SYMBOL(netdev_has_upper_dev);
4445 
4446 /**
4447  * netdev_has_any_upper_dev - Check if device is linked to some device
4448  * @dev: device
4449  *
4450  * Find out if a device is linked to an upper device and return true in case
4451  * it is. The caller must hold the RTNL lock.
4452  */
4453 bool netdev_has_any_upper_dev(struct net_device *dev)
4454 {
4455 	ASSERT_RTNL();
4456 
4457 	return !list_empty(&dev->all_adj_list.upper);
4458 }
4459 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4460 
4461 /**
4462  * netdev_master_upper_dev_get - Get master upper device
4463  * @dev: device
4464  *
4465  * Find a master upper device and return pointer to it or NULL in case
4466  * it's not there. The caller must hold the RTNL lock.
4467  */
4468 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4469 {
4470 	struct netdev_adjacent *upper;
4471 
4472 	ASSERT_RTNL();
4473 
4474 	if (list_empty(&dev->adj_list.upper))
4475 		return NULL;
4476 
4477 	upper = list_first_entry(&dev->adj_list.upper,
4478 				 struct netdev_adjacent, list);
4479 	if (likely(upper->master))
4480 		return upper->dev;
4481 	return NULL;
4482 }
4483 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4484 
4485 void *netdev_adjacent_get_private(struct list_head *adj_list)
4486 {
4487 	struct netdev_adjacent *adj;
4488 
4489 	adj = list_entry(adj_list, struct netdev_adjacent, list);
4490 
4491 	return adj->private;
4492 }
4493 EXPORT_SYMBOL(netdev_adjacent_get_private);
4494 
4495 /**
4496  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4497  * @dev: device
4498  * @iter: list_head ** of the current position
4499  *
4500  * Gets the next device from the dev's upper list, starting from iter
4501  * position. The caller must hold RCU read lock.
4502  */
4503 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4504 						     struct list_head **iter)
4505 {
4506 	struct netdev_adjacent *upper;
4507 
4508 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4509 
4510 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4511 
4512 	if (&upper->list == &dev->all_adj_list.upper)
4513 		return NULL;
4514 
4515 	*iter = &upper->list;
4516 
4517 	return upper->dev;
4518 }
4519 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4520 
4521 /**
4522  * netdev_lower_get_next_private - Get the next ->private from the
4523  *				   lower neighbour list
4524  * @dev: device
4525  * @iter: list_head ** of the current position
4526  *
4527  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4528  * list, starting from iter position. The caller must hold either hold the
4529  * RTNL lock or its own locking that guarantees that the neighbour lower
4530  * list will remain unchainged.
4531  */
4532 void *netdev_lower_get_next_private(struct net_device *dev,
4533 				    struct list_head **iter)
4534 {
4535 	struct netdev_adjacent *lower;
4536 
4537 	lower = list_entry(*iter, struct netdev_adjacent, list);
4538 
4539 	if (&lower->list == &dev->adj_list.lower)
4540 		return NULL;
4541 
4542 	if (iter)
4543 		*iter = lower->list.next;
4544 
4545 	return lower->private;
4546 }
4547 EXPORT_SYMBOL(netdev_lower_get_next_private);
4548 
4549 /**
4550  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4551  *				       lower neighbour list, RCU
4552  *				       variant
4553  * @dev: device
4554  * @iter: list_head ** of the current position
4555  *
4556  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4557  * list, starting from iter position. The caller must hold RCU read lock.
4558  */
4559 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4560 					struct list_head **iter)
4561 {
4562 	struct netdev_adjacent *lower;
4563 
4564 	WARN_ON_ONCE(!rcu_read_lock_held());
4565 
4566 	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4567 
4568 	if (&lower->list == &dev->adj_list.lower)
4569 		return NULL;
4570 
4571 	if (iter)
4572 		*iter = &lower->list;
4573 
4574 	return lower->private;
4575 }
4576 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4577 
4578 /**
4579  * netdev_master_upper_dev_get_rcu - Get master upper device
4580  * @dev: device
4581  *
4582  * Find a master upper device and return pointer to it or NULL in case
4583  * it's not there. The caller must hold the RCU read lock.
4584  */
4585 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4586 {
4587 	struct netdev_adjacent *upper;
4588 
4589 	upper = list_first_or_null_rcu(&dev->adj_list.upper,
4590 				       struct netdev_adjacent, list);
4591 	if (upper && likely(upper->master))
4592 		return upper->dev;
4593 	return NULL;
4594 }
4595 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4596 
4597 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4598 					struct net_device *adj_dev,
4599 					struct list_head *dev_list,
4600 					void *private, bool master)
4601 {
4602 	struct netdev_adjacent *adj;
4603 	char linkname[IFNAMSIZ+7];
4604 	int ret;
4605 
4606 	adj = __netdev_find_adj(dev, adj_dev, dev_list);
4607 
4608 	if (adj) {
4609 		adj->ref_nr++;
4610 		return 0;
4611 	}
4612 
4613 	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4614 	if (!adj)
4615 		return -ENOMEM;
4616 
4617 	adj->dev = adj_dev;
4618 	adj->master = master;
4619 	adj->ref_nr = 1;
4620 	adj->private = private;
4621 	dev_hold(adj_dev);
4622 
4623 	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4624 		 adj_dev->name, dev->name, adj_dev->name);
4625 
4626 	if (dev_list == &dev->adj_list.lower) {
4627 		sprintf(linkname, "lower_%s", adj_dev->name);
4628 		ret = sysfs_create_link(&(dev->dev.kobj),
4629 					&(adj_dev->dev.kobj), linkname);
4630 		if (ret)
4631 			goto free_adj;
4632 	} else if (dev_list == &dev->adj_list.upper) {
4633 		sprintf(linkname, "upper_%s", adj_dev->name);
4634 		ret = sysfs_create_link(&(dev->dev.kobj),
4635 					&(adj_dev->dev.kobj), linkname);
4636 		if (ret)
4637 			goto free_adj;
4638 	}
4639 
4640 	/* Ensure that master link is always the first item in list. */
4641 	if (master) {
4642 		ret = sysfs_create_link(&(dev->dev.kobj),
4643 					&(adj_dev->dev.kobj), "master");
4644 		if (ret)
4645 			goto remove_symlinks;
4646 
4647 		list_add_rcu(&adj->list, dev_list);
4648 	} else {
4649 		list_add_tail_rcu(&adj->list, dev_list);
4650 	}
4651 
4652 	return 0;
4653 
4654 remove_symlinks:
4655 	if (dev_list == &dev->adj_list.lower) {
4656 		sprintf(linkname, "lower_%s", adj_dev->name);
4657 		sysfs_remove_link(&(dev->dev.kobj), linkname);
4658 	} else if (dev_list == &dev->adj_list.upper) {
4659 		sprintf(linkname, "upper_%s", adj_dev->name);
4660 		sysfs_remove_link(&(dev->dev.kobj), linkname);
4661 	}
4662 
4663 free_adj:
4664 	kfree(adj);
4665 	dev_put(adj_dev);
4666 
4667 	return ret;
4668 }
4669 
4670 void __netdev_adjacent_dev_remove(struct net_device *dev,
4671 				  struct net_device *adj_dev,
4672 				  struct list_head *dev_list)
4673 {
4674 	struct netdev_adjacent *adj;
4675 	char linkname[IFNAMSIZ+7];
4676 
4677 	adj = __netdev_find_adj(dev, adj_dev, dev_list);
4678 
4679 	if (!adj) {
4680 		pr_err("tried to remove device %s from %s\n",
4681 		       dev->name, adj_dev->name);
4682 		BUG();
4683 	}
4684 
4685 	if (adj->ref_nr > 1) {
4686 		pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
4687 			 adj->ref_nr-1);
4688 		adj->ref_nr--;
4689 		return;
4690 	}
4691 
4692 	if (adj->master)
4693 		sysfs_remove_link(&(dev->dev.kobj), "master");
4694 
4695 	if (dev_list == &dev->adj_list.lower) {
4696 		sprintf(linkname, "lower_%s", adj_dev->name);
4697 		sysfs_remove_link(&(dev->dev.kobj), linkname);
4698 	} else if (dev_list == &dev->adj_list.upper) {
4699 		sprintf(linkname, "upper_%s", adj_dev->name);
4700 		sysfs_remove_link(&(dev->dev.kobj), linkname);
4701 	}
4702 
4703 	list_del_rcu(&adj->list);
4704 	pr_debug("dev_put for %s, because link removed from %s to %s\n",
4705 		 adj_dev->name, dev->name, adj_dev->name);
4706 	dev_put(adj_dev);
4707 	kfree_rcu(adj, rcu);
4708 }
4709 
4710 int __netdev_adjacent_dev_link_lists(struct net_device *dev,
4711 				     struct net_device *upper_dev,
4712 				     struct list_head *up_list,
4713 				     struct list_head *down_list,
4714 				     void *private, bool master)
4715 {
4716 	int ret;
4717 
4718 	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
4719 					   master);
4720 	if (ret)
4721 		return ret;
4722 
4723 	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
4724 					   false);
4725 	if (ret) {
4726 		__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4727 		return ret;
4728 	}
4729 
4730 	return 0;
4731 }
4732 
4733 int __netdev_adjacent_dev_link(struct net_device *dev,
4734 			       struct net_device *upper_dev)
4735 {
4736 	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
4737 						&dev->all_adj_list.upper,
4738 						&upper_dev->all_adj_list.lower,
4739 						NULL, false);
4740 }
4741 
4742 void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
4743 					struct net_device *upper_dev,
4744 					struct list_head *up_list,
4745 					struct list_head *down_list)
4746 {
4747 	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4748 	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
4749 }
4750 
4751 void __netdev_adjacent_dev_unlink(struct net_device *dev,
4752 				  struct net_device *upper_dev)
4753 {
4754 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4755 					   &dev->all_adj_list.upper,
4756 					   &upper_dev->all_adj_list.lower);
4757 }
4758 
4759 int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
4760 					 struct net_device *upper_dev,
4761 					 void *private, bool master)
4762 {
4763 	int ret = __netdev_adjacent_dev_link(dev, upper_dev);
4764 
4765 	if (ret)
4766 		return ret;
4767 
4768 	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
4769 					       &dev->adj_list.upper,
4770 					       &upper_dev->adj_list.lower,
4771 					       private, master);
4772 	if (ret) {
4773 		__netdev_adjacent_dev_unlink(dev, upper_dev);
4774 		return ret;
4775 	}
4776 
4777 	return 0;
4778 }
4779 
4780 void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
4781 					    struct net_device *upper_dev)
4782 {
4783 	__netdev_adjacent_dev_unlink(dev, upper_dev);
4784 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4785 					   &dev->adj_list.upper,
4786 					   &upper_dev->adj_list.lower);
4787 }
4788 
4789 static int __netdev_upper_dev_link(struct net_device *dev,
4790 				   struct net_device *upper_dev, bool master,
4791 				   void *private)
4792 {
4793 	struct netdev_adjacent *i, *j, *to_i, *to_j;
4794 	int ret = 0;
4795 
4796 	ASSERT_RTNL();
4797 
4798 	if (dev == upper_dev)
4799 		return -EBUSY;
4800 
4801 	/* To prevent loops, check if dev is not upper device to upper_dev. */
4802 	if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
4803 		return -EBUSY;
4804 
4805 	if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
4806 		return -EEXIST;
4807 
4808 	if (master && netdev_master_upper_dev_get(dev))
4809 		return -EBUSY;
4810 
4811 	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
4812 						   master);
4813 	if (ret)
4814 		return ret;
4815 
4816 	/* Now that we linked these devs, make all the upper_dev's
4817 	 * all_adj_list.upper visible to every dev's all_adj_list.lower an
4818 	 * versa, and don't forget the devices itself. All of these
4819 	 * links are non-neighbours.
4820 	 */
4821 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4822 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
4823 			pr_debug("Interlinking %s with %s, non-neighbour\n",
4824 				 i->dev->name, j->dev->name);
4825 			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
4826 			if (ret)
4827 				goto rollback_mesh;
4828 		}
4829 	}
4830 
4831 	/* add dev to every upper_dev's upper device */
4832 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
4833 		pr_debug("linking %s's upper device %s with %s\n",
4834 			 upper_dev->name, i->dev->name, dev->name);
4835 		ret = __netdev_adjacent_dev_link(dev, i->dev);
4836 		if (ret)
4837 			goto rollback_upper_mesh;
4838 	}
4839 
4840 	/* add upper_dev to every dev's lower device */
4841 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4842 		pr_debug("linking %s's lower device %s with %s\n", dev->name,
4843 			 i->dev->name, upper_dev->name);
4844 		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
4845 		if (ret)
4846 			goto rollback_lower_mesh;
4847 	}
4848 
4849 	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4850 	return 0;
4851 
4852 rollback_lower_mesh:
4853 	to_i = i;
4854 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4855 		if (i == to_i)
4856 			break;
4857 		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
4858 	}
4859 
4860 	i = NULL;
4861 
4862 rollback_upper_mesh:
4863 	to_i = i;
4864 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
4865 		if (i == to_i)
4866 			break;
4867 		__netdev_adjacent_dev_unlink(dev, i->dev);
4868 	}
4869 
4870 	i = j = NULL;
4871 
4872 rollback_mesh:
4873 	to_i = i;
4874 	to_j = j;
4875 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4876 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
4877 			if (i == to_i && j == to_j)
4878 				break;
4879 			__netdev_adjacent_dev_unlink(i->dev, j->dev);
4880 		}
4881 		if (i == to_i)
4882 			break;
4883 	}
4884 
4885 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
4886 
4887 	return ret;
4888 }
4889 
4890 /**
4891  * netdev_upper_dev_link - Add a link to the upper device
4892  * @dev: device
4893  * @upper_dev: new upper device
4894  *
4895  * Adds a link to device which is upper to this one. The caller must hold
4896  * the RTNL lock. On a failure a negative errno code is returned.
4897  * On success the reference counts are adjusted and the function
4898  * returns zero.
4899  */
4900 int netdev_upper_dev_link(struct net_device *dev,
4901 			  struct net_device *upper_dev)
4902 {
4903 	return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
4904 }
4905 EXPORT_SYMBOL(netdev_upper_dev_link);
4906 
4907 /**
4908  * netdev_master_upper_dev_link - Add a master link to the upper device
4909  * @dev: device
4910  * @upper_dev: new upper device
4911  *
4912  * Adds a link to device which is upper to this one. In this case, only
4913  * one master upper device can be linked, although other non-master devices
4914  * might be linked as well. The caller must hold the RTNL lock.
4915  * On a failure a negative errno code is returned. On success the reference
4916  * counts are adjusted and the function returns zero.
4917  */
4918 int netdev_master_upper_dev_link(struct net_device *dev,
4919 				 struct net_device *upper_dev)
4920 {
4921 	return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
4922 }
4923 EXPORT_SYMBOL(netdev_master_upper_dev_link);
4924 
4925 int netdev_master_upper_dev_link_private(struct net_device *dev,
4926 					 struct net_device *upper_dev,
4927 					 void *private)
4928 {
4929 	return __netdev_upper_dev_link(dev, upper_dev, true, private);
4930 }
4931 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
4932 
4933 /**
4934  * netdev_upper_dev_unlink - Removes a link to upper device
4935  * @dev: device
4936  * @upper_dev: new upper device
4937  *
4938  * Removes a link to device which is upper to this one. The caller must hold
4939  * the RTNL lock.
4940  */
4941 void netdev_upper_dev_unlink(struct net_device *dev,
4942 			     struct net_device *upper_dev)
4943 {
4944 	struct netdev_adjacent *i, *j;
4945 	ASSERT_RTNL();
4946 
4947 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
4948 
4949 	/* Here is the tricky part. We must remove all dev's lower
4950 	 * devices from all upper_dev's upper devices and vice
4951 	 * versa, to maintain the graph relationship.
4952 	 */
4953 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
4954 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
4955 			__netdev_adjacent_dev_unlink(i->dev, j->dev);
4956 
4957 	/* remove also the devices itself from lower/upper device
4958 	 * list
4959 	 */
4960 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
4961 		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
4962 
4963 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
4964 		__netdev_adjacent_dev_unlink(dev, i->dev);
4965 
4966 	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4967 }
4968 EXPORT_SYMBOL(netdev_upper_dev_unlink);
4969 
4970 void *netdev_lower_dev_get_private_rcu(struct net_device *dev,
4971 				       struct net_device *lower_dev)
4972 {
4973 	struct netdev_adjacent *lower;
4974 
4975 	if (!lower_dev)
4976 		return NULL;
4977 	lower = __netdev_find_adj_rcu(dev, lower_dev, &dev->adj_list.lower);
4978 	if (!lower)
4979 		return NULL;
4980 
4981 	return lower->private;
4982 }
4983 EXPORT_SYMBOL(netdev_lower_dev_get_private_rcu);
4984 
4985 void *netdev_lower_dev_get_private(struct net_device *dev,
4986 				   struct net_device *lower_dev)
4987 {
4988 	struct netdev_adjacent *lower;
4989 
4990 	if (!lower_dev)
4991 		return NULL;
4992 	lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
4993 	if (!lower)
4994 		return NULL;
4995 
4996 	return lower->private;
4997 }
4998 EXPORT_SYMBOL(netdev_lower_dev_get_private);
4999 
5000 static void dev_change_rx_flags(struct net_device *dev, int flags)
5001 {
5002 	const struct net_device_ops *ops = dev->netdev_ops;
5003 
5004 	if (ops->ndo_change_rx_flags)
5005 		ops->ndo_change_rx_flags(dev, flags);
5006 }
5007 
5008 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5009 {
5010 	unsigned int old_flags = dev->flags;
5011 	kuid_t uid;
5012 	kgid_t gid;
5013 
5014 	ASSERT_RTNL();
5015 
5016 	dev->flags |= IFF_PROMISC;
5017 	dev->promiscuity += inc;
5018 	if (dev->promiscuity == 0) {
5019 		/*
5020 		 * Avoid overflow.
5021 		 * If inc causes overflow, untouch promisc and return error.
5022 		 */
5023 		if (inc < 0)
5024 			dev->flags &= ~IFF_PROMISC;
5025 		else {
5026 			dev->promiscuity -= inc;
5027 			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5028 				dev->name);
5029 			return -EOVERFLOW;
5030 		}
5031 	}
5032 	if (dev->flags != old_flags) {
5033 		pr_info("device %s %s promiscuous mode\n",
5034 			dev->name,
5035 			dev->flags & IFF_PROMISC ? "entered" : "left");
5036 		if (audit_enabled) {
5037 			current_uid_gid(&uid, &gid);
5038 			audit_log(current->audit_context, GFP_ATOMIC,
5039 				AUDIT_ANOM_PROMISCUOUS,
5040 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5041 				dev->name, (dev->flags & IFF_PROMISC),
5042 				(old_flags & IFF_PROMISC),
5043 				from_kuid(&init_user_ns, audit_get_loginuid(current)),
5044 				from_kuid(&init_user_ns, uid),
5045 				from_kgid(&init_user_ns, gid),
5046 				audit_get_sessionid(current));
5047 		}
5048 
5049 		dev_change_rx_flags(dev, IFF_PROMISC);
5050 	}
5051 	if (notify)
5052 		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
5053 	return 0;
5054 }
5055 
5056 /**
5057  *	dev_set_promiscuity	- update promiscuity count on a device
5058  *	@dev: device
5059  *	@inc: modifier
5060  *
5061  *	Add or remove promiscuity from a device. While the count in the device
5062  *	remains above zero the interface remains promiscuous. Once it hits zero
5063  *	the device reverts back to normal filtering operation. A negative inc
5064  *	value is used to drop promiscuity on the device.
5065  *	Return 0 if successful or a negative errno code on error.
5066  */
5067 int dev_set_promiscuity(struct net_device *dev, int inc)
5068 {
5069 	unsigned int old_flags = dev->flags;
5070 	int err;
5071 
5072 	err = __dev_set_promiscuity(dev, inc, true);
5073 	if (err < 0)
5074 		return err;
5075 	if (dev->flags != old_flags)
5076 		dev_set_rx_mode(dev);
5077 	return err;
5078 }
5079 EXPORT_SYMBOL(dev_set_promiscuity);
5080 
5081 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5082 {
5083 	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5084 
5085 	ASSERT_RTNL();
5086 
5087 	dev->flags |= IFF_ALLMULTI;
5088 	dev->allmulti += inc;
5089 	if (dev->allmulti == 0) {
5090 		/*
5091 		 * Avoid overflow.
5092 		 * If inc causes overflow, untouch allmulti and return error.
5093 		 */
5094 		if (inc < 0)
5095 			dev->flags &= ~IFF_ALLMULTI;
5096 		else {
5097 			dev->allmulti -= inc;
5098 			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5099 				dev->name);
5100 			return -EOVERFLOW;
5101 		}
5102 	}
5103 	if (dev->flags ^ old_flags) {
5104 		dev_change_rx_flags(dev, IFF_ALLMULTI);
5105 		dev_set_rx_mode(dev);
5106 		if (notify)
5107 			__dev_notify_flags(dev, old_flags,
5108 					   dev->gflags ^ old_gflags);
5109 	}
5110 	return 0;
5111 }
5112 
5113 /**
5114  *	dev_set_allmulti	- update allmulti count on a device
5115  *	@dev: device
5116  *	@inc: modifier
5117  *
5118  *	Add or remove reception of all multicast frames to a device. While the
5119  *	count in the device remains above zero the interface remains listening
5120  *	to all interfaces. Once it hits zero the device reverts back to normal
5121  *	filtering operation. A negative @inc value is used to drop the counter
5122  *	when releasing a resource needing all multicasts.
5123  *	Return 0 if successful or a negative errno code on error.
5124  */
5125 
5126 int dev_set_allmulti(struct net_device *dev, int inc)
5127 {
5128 	return __dev_set_allmulti(dev, inc, true);
5129 }
5130 EXPORT_SYMBOL(dev_set_allmulti);
5131 
5132 /*
5133  *	Upload unicast and multicast address lists to device and
5134  *	configure RX filtering. When the device doesn't support unicast
5135  *	filtering it is put in promiscuous mode while unicast addresses
5136  *	are present.
5137  */
5138 void __dev_set_rx_mode(struct net_device *dev)
5139 {
5140 	const struct net_device_ops *ops = dev->netdev_ops;
5141 
5142 	/* dev_open will call this function so the list will stay sane. */
5143 	if (!(dev->flags&IFF_UP))
5144 		return;
5145 
5146 	if (!netif_device_present(dev))
5147 		return;
5148 
5149 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5150 		/* Unicast addresses changes may only happen under the rtnl,
5151 		 * therefore calling __dev_set_promiscuity here is safe.
5152 		 */
5153 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5154 			__dev_set_promiscuity(dev, 1, false);
5155 			dev->uc_promisc = true;
5156 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5157 			__dev_set_promiscuity(dev, -1, false);
5158 			dev->uc_promisc = false;
5159 		}
5160 	}
5161 
5162 	if (ops->ndo_set_rx_mode)
5163 		ops->ndo_set_rx_mode(dev);
5164 }
5165 
5166 void dev_set_rx_mode(struct net_device *dev)
5167 {
5168 	netif_addr_lock_bh(dev);
5169 	__dev_set_rx_mode(dev);
5170 	netif_addr_unlock_bh(dev);
5171 }
5172 
5173 /**
5174  *	dev_get_flags - get flags reported to userspace
5175  *	@dev: device
5176  *
5177  *	Get the combination of flag bits exported through APIs to userspace.
5178  */
5179 unsigned int dev_get_flags(const struct net_device *dev)
5180 {
5181 	unsigned int flags;
5182 
5183 	flags = (dev->flags & ~(IFF_PROMISC |
5184 				IFF_ALLMULTI |
5185 				IFF_RUNNING |
5186 				IFF_LOWER_UP |
5187 				IFF_DORMANT)) |
5188 		(dev->gflags & (IFF_PROMISC |
5189 				IFF_ALLMULTI));
5190 
5191 	if (netif_running(dev)) {
5192 		if (netif_oper_up(dev))
5193 			flags |= IFF_RUNNING;
5194 		if (netif_carrier_ok(dev))
5195 			flags |= IFF_LOWER_UP;
5196 		if (netif_dormant(dev))
5197 			flags |= IFF_DORMANT;
5198 	}
5199 
5200 	return flags;
5201 }
5202 EXPORT_SYMBOL(dev_get_flags);
5203 
5204 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5205 {
5206 	unsigned int old_flags = dev->flags;
5207 	int ret;
5208 
5209 	ASSERT_RTNL();
5210 
5211 	/*
5212 	 *	Set the flags on our device.
5213 	 */
5214 
5215 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5216 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5217 			       IFF_AUTOMEDIA)) |
5218 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5219 				    IFF_ALLMULTI));
5220 
5221 	/*
5222 	 *	Load in the correct multicast list now the flags have changed.
5223 	 */
5224 
5225 	if ((old_flags ^ flags) & IFF_MULTICAST)
5226 		dev_change_rx_flags(dev, IFF_MULTICAST);
5227 
5228 	dev_set_rx_mode(dev);
5229 
5230 	/*
5231 	 *	Have we downed the interface. We handle IFF_UP ourselves
5232 	 *	according to user attempts to set it, rather than blindly
5233 	 *	setting it.
5234 	 */
5235 
5236 	ret = 0;
5237 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
5238 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5239 
5240 		if (!ret)
5241 			dev_set_rx_mode(dev);
5242 	}
5243 
5244 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
5245 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
5246 		unsigned int old_flags = dev->flags;
5247 
5248 		dev->gflags ^= IFF_PROMISC;
5249 
5250 		if (__dev_set_promiscuity(dev, inc, false) >= 0)
5251 			if (dev->flags != old_flags)
5252 				dev_set_rx_mode(dev);
5253 	}
5254 
5255 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5256 	   is important. Some (broken) drivers set IFF_PROMISC, when
5257 	   IFF_ALLMULTI is requested not asking us and not reporting.
5258 	 */
5259 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5260 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5261 
5262 		dev->gflags ^= IFF_ALLMULTI;
5263 		__dev_set_allmulti(dev, inc, false);
5264 	}
5265 
5266 	return ret;
5267 }
5268 
5269 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5270 			unsigned int gchanges)
5271 {
5272 	unsigned int changes = dev->flags ^ old_flags;
5273 
5274 	if (gchanges)
5275 		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5276 
5277 	if (changes & IFF_UP) {
5278 		if (dev->flags & IFF_UP)
5279 			call_netdevice_notifiers(NETDEV_UP, dev);
5280 		else
5281 			call_netdevice_notifiers(NETDEV_DOWN, dev);
5282 	}
5283 
5284 	if (dev->flags & IFF_UP &&
5285 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5286 		struct netdev_notifier_change_info change_info;
5287 
5288 		change_info.flags_changed = changes;
5289 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5290 					      &change_info.info);
5291 	}
5292 }
5293 
5294 /**
5295  *	dev_change_flags - change device settings
5296  *	@dev: device
5297  *	@flags: device state flags
5298  *
5299  *	Change settings on device based state flags. The flags are
5300  *	in the userspace exported format.
5301  */
5302 int dev_change_flags(struct net_device *dev, unsigned int flags)
5303 {
5304 	int ret;
5305 	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5306 
5307 	ret = __dev_change_flags(dev, flags);
5308 	if (ret < 0)
5309 		return ret;
5310 
5311 	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5312 	__dev_notify_flags(dev, old_flags, changes);
5313 	return ret;
5314 }
5315 EXPORT_SYMBOL(dev_change_flags);
5316 
5317 /**
5318  *	dev_set_mtu - Change maximum transfer unit
5319  *	@dev: device
5320  *	@new_mtu: new transfer unit
5321  *
5322  *	Change the maximum transfer size of the network device.
5323  */
5324 int dev_set_mtu(struct net_device *dev, int new_mtu)
5325 {
5326 	const struct net_device_ops *ops = dev->netdev_ops;
5327 	int err;
5328 
5329 	if (new_mtu == dev->mtu)
5330 		return 0;
5331 
5332 	/*	MTU must be positive.	 */
5333 	if (new_mtu < 0)
5334 		return -EINVAL;
5335 
5336 	if (!netif_device_present(dev))
5337 		return -ENODEV;
5338 
5339 	err = 0;
5340 	if (ops->ndo_change_mtu)
5341 		err = ops->ndo_change_mtu(dev, new_mtu);
5342 	else
5343 		dev->mtu = new_mtu;
5344 
5345 	if (!err)
5346 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5347 	return err;
5348 }
5349 EXPORT_SYMBOL(dev_set_mtu);
5350 
5351 /**
5352  *	dev_set_group - Change group this device belongs to
5353  *	@dev: device
5354  *	@new_group: group this device should belong to
5355  */
5356 void dev_set_group(struct net_device *dev, int new_group)
5357 {
5358 	dev->group = new_group;
5359 }
5360 EXPORT_SYMBOL(dev_set_group);
5361 
5362 /**
5363  *	dev_set_mac_address - Change Media Access Control Address
5364  *	@dev: device
5365  *	@sa: new address
5366  *
5367  *	Change the hardware (MAC) address of the device
5368  */
5369 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5370 {
5371 	const struct net_device_ops *ops = dev->netdev_ops;
5372 	int err;
5373 
5374 	if (!ops->ndo_set_mac_address)
5375 		return -EOPNOTSUPP;
5376 	if (sa->sa_family != dev->type)
5377 		return -EINVAL;
5378 	if (!netif_device_present(dev))
5379 		return -ENODEV;
5380 	err = ops->ndo_set_mac_address(dev, sa);
5381 	if (err)
5382 		return err;
5383 	dev->addr_assign_type = NET_ADDR_SET;
5384 	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5385 	add_device_randomness(dev->dev_addr, dev->addr_len);
5386 	return 0;
5387 }
5388 EXPORT_SYMBOL(dev_set_mac_address);
5389 
5390 /**
5391  *	dev_change_carrier - Change device carrier
5392  *	@dev: device
5393  *	@new_carrier: new value
5394  *
5395  *	Change device carrier
5396  */
5397 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5398 {
5399 	const struct net_device_ops *ops = dev->netdev_ops;
5400 
5401 	if (!ops->ndo_change_carrier)
5402 		return -EOPNOTSUPP;
5403 	if (!netif_device_present(dev))
5404 		return -ENODEV;
5405 	return ops->ndo_change_carrier(dev, new_carrier);
5406 }
5407 EXPORT_SYMBOL(dev_change_carrier);
5408 
5409 /**
5410  *	dev_get_phys_port_id - Get device physical port ID
5411  *	@dev: device
5412  *	@ppid: port ID
5413  *
5414  *	Get device physical port ID
5415  */
5416 int dev_get_phys_port_id(struct net_device *dev,
5417 			 struct netdev_phys_port_id *ppid)
5418 {
5419 	const struct net_device_ops *ops = dev->netdev_ops;
5420 
5421 	if (!ops->ndo_get_phys_port_id)
5422 		return -EOPNOTSUPP;
5423 	return ops->ndo_get_phys_port_id(dev, ppid);
5424 }
5425 EXPORT_SYMBOL(dev_get_phys_port_id);
5426 
5427 /**
5428  *	dev_new_index	-	allocate an ifindex
5429  *	@net: the applicable net namespace
5430  *
5431  *	Returns a suitable unique value for a new device interface
5432  *	number.  The caller must hold the rtnl semaphore or the
5433  *	dev_base_lock to be sure it remains unique.
5434  */
5435 static int dev_new_index(struct net *net)
5436 {
5437 	int ifindex = net->ifindex;
5438 	for (;;) {
5439 		if (++ifindex <= 0)
5440 			ifindex = 1;
5441 		if (!__dev_get_by_index(net, ifindex))
5442 			return net->ifindex = ifindex;
5443 	}
5444 }
5445 
5446 /* Delayed registration/unregisteration */
5447 static LIST_HEAD(net_todo_list);
5448 static DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5449 
5450 static void net_set_todo(struct net_device *dev)
5451 {
5452 	list_add_tail(&dev->todo_list, &net_todo_list);
5453 	dev_net(dev)->dev_unreg_count++;
5454 }
5455 
5456 static void rollback_registered_many(struct list_head *head)
5457 {
5458 	struct net_device *dev, *tmp;
5459 	LIST_HEAD(close_head);
5460 
5461 	BUG_ON(dev_boot_phase);
5462 	ASSERT_RTNL();
5463 
5464 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5465 		/* Some devices call without registering
5466 		 * for initialization unwind. Remove those
5467 		 * devices and proceed with the remaining.
5468 		 */
5469 		if (dev->reg_state == NETREG_UNINITIALIZED) {
5470 			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5471 				 dev->name, dev);
5472 
5473 			WARN_ON(1);
5474 			list_del(&dev->unreg_list);
5475 			continue;
5476 		}
5477 		dev->dismantle = true;
5478 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5479 	}
5480 
5481 	/* If device is running, close it first. */
5482 	list_for_each_entry(dev, head, unreg_list)
5483 		list_add_tail(&dev->close_list, &close_head);
5484 	dev_close_many(&close_head);
5485 
5486 	list_for_each_entry(dev, head, unreg_list) {
5487 		/* And unlink it from device chain. */
5488 		unlist_netdevice(dev);
5489 
5490 		dev->reg_state = NETREG_UNREGISTERING;
5491 	}
5492 
5493 	synchronize_net();
5494 
5495 	list_for_each_entry(dev, head, unreg_list) {
5496 		/* Shutdown queueing discipline. */
5497 		dev_shutdown(dev);
5498 
5499 
5500 		/* Notify protocols, that we are about to destroy
5501 		   this device. They should clean all the things.
5502 		*/
5503 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5504 
5505 		if (!dev->rtnl_link_ops ||
5506 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5507 			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5508 
5509 		/*
5510 		 *	Flush the unicast and multicast chains
5511 		 */
5512 		dev_uc_flush(dev);
5513 		dev_mc_flush(dev);
5514 
5515 		if (dev->netdev_ops->ndo_uninit)
5516 			dev->netdev_ops->ndo_uninit(dev);
5517 
5518 		/* Notifier chain MUST detach us all upper devices. */
5519 		WARN_ON(netdev_has_any_upper_dev(dev));
5520 
5521 		/* Remove entries from kobject tree */
5522 		netdev_unregister_kobject(dev);
5523 #ifdef CONFIG_XPS
5524 		/* Remove XPS queueing entries */
5525 		netif_reset_xps_queues_gt(dev, 0);
5526 #endif
5527 	}
5528 
5529 	synchronize_net();
5530 
5531 	list_for_each_entry(dev, head, unreg_list)
5532 		dev_put(dev);
5533 }
5534 
5535 static void rollback_registered(struct net_device *dev)
5536 {
5537 	LIST_HEAD(single);
5538 
5539 	list_add(&dev->unreg_list, &single);
5540 	rollback_registered_many(&single);
5541 	list_del(&single);
5542 }
5543 
5544 static netdev_features_t netdev_fix_features(struct net_device *dev,
5545 	netdev_features_t features)
5546 {
5547 	/* Fix illegal checksum combinations */
5548 	if ((features & NETIF_F_HW_CSUM) &&
5549 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5550 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5551 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5552 	}
5553 
5554 	/* TSO requires that SG is present as well. */
5555 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5556 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5557 		features &= ~NETIF_F_ALL_TSO;
5558 	}
5559 
5560 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5561 					!(features & NETIF_F_IP_CSUM)) {
5562 		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5563 		features &= ~NETIF_F_TSO;
5564 		features &= ~NETIF_F_TSO_ECN;
5565 	}
5566 
5567 	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5568 					 !(features & NETIF_F_IPV6_CSUM)) {
5569 		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5570 		features &= ~NETIF_F_TSO6;
5571 	}
5572 
5573 	/* TSO ECN requires that TSO is present as well. */
5574 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5575 		features &= ~NETIF_F_TSO_ECN;
5576 
5577 	/* Software GSO depends on SG. */
5578 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5579 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5580 		features &= ~NETIF_F_GSO;
5581 	}
5582 
5583 	/* UFO needs SG and checksumming */
5584 	if (features & NETIF_F_UFO) {
5585 		/* maybe split UFO into V4 and V6? */
5586 		if (!((features & NETIF_F_GEN_CSUM) ||
5587 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5588 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5589 			netdev_dbg(dev,
5590 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5591 			features &= ~NETIF_F_UFO;
5592 		}
5593 
5594 		if (!(features & NETIF_F_SG)) {
5595 			netdev_dbg(dev,
5596 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5597 			features &= ~NETIF_F_UFO;
5598 		}
5599 	}
5600 
5601 	return features;
5602 }
5603 
5604 int __netdev_update_features(struct net_device *dev)
5605 {
5606 	netdev_features_t features;
5607 	int err = 0;
5608 
5609 	ASSERT_RTNL();
5610 
5611 	features = netdev_get_wanted_features(dev);
5612 
5613 	if (dev->netdev_ops->ndo_fix_features)
5614 		features = dev->netdev_ops->ndo_fix_features(dev, features);
5615 
5616 	/* driver might be less strict about feature dependencies */
5617 	features = netdev_fix_features(dev, features);
5618 
5619 	if (dev->features == features)
5620 		return 0;
5621 
5622 	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5623 		&dev->features, &features);
5624 
5625 	if (dev->netdev_ops->ndo_set_features)
5626 		err = dev->netdev_ops->ndo_set_features(dev, features);
5627 
5628 	if (unlikely(err < 0)) {
5629 		netdev_err(dev,
5630 			"set_features() failed (%d); wanted %pNF, left %pNF\n",
5631 			err, &features, &dev->features);
5632 		return -1;
5633 	}
5634 
5635 	if (!err)
5636 		dev->features = features;
5637 
5638 	return 1;
5639 }
5640 
5641 /**
5642  *	netdev_update_features - recalculate device features
5643  *	@dev: the device to check
5644  *
5645  *	Recalculate dev->features set and send notifications if it
5646  *	has changed. Should be called after driver or hardware dependent
5647  *	conditions might have changed that influence the features.
5648  */
5649 void netdev_update_features(struct net_device *dev)
5650 {
5651 	if (__netdev_update_features(dev))
5652 		netdev_features_change(dev);
5653 }
5654 EXPORT_SYMBOL(netdev_update_features);
5655 
5656 /**
5657  *	netdev_change_features - recalculate device features
5658  *	@dev: the device to check
5659  *
5660  *	Recalculate dev->features set and send notifications even
5661  *	if they have not changed. Should be called instead of
5662  *	netdev_update_features() if also dev->vlan_features might
5663  *	have changed to allow the changes to be propagated to stacked
5664  *	VLAN devices.
5665  */
5666 void netdev_change_features(struct net_device *dev)
5667 {
5668 	__netdev_update_features(dev);
5669 	netdev_features_change(dev);
5670 }
5671 EXPORT_SYMBOL(netdev_change_features);
5672 
5673 /**
5674  *	netif_stacked_transfer_operstate -	transfer operstate
5675  *	@rootdev: the root or lower level device to transfer state from
5676  *	@dev: the device to transfer operstate to
5677  *
5678  *	Transfer operational state from root to device. This is normally
5679  *	called when a stacking relationship exists between the root
5680  *	device and the device(a leaf device).
5681  */
5682 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5683 					struct net_device *dev)
5684 {
5685 	if (rootdev->operstate == IF_OPER_DORMANT)
5686 		netif_dormant_on(dev);
5687 	else
5688 		netif_dormant_off(dev);
5689 
5690 	if (netif_carrier_ok(rootdev)) {
5691 		if (!netif_carrier_ok(dev))
5692 			netif_carrier_on(dev);
5693 	} else {
5694 		if (netif_carrier_ok(dev))
5695 			netif_carrier_off(dev);
5696 	}
5697 }
5698 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5699 
5700 #ifdef CONFIG_RPS
5701 static int netif_alloc_rx_queues(struct net_device *dev)
5702 {
5703 	unsigned int i, count = dev->num_rx_queues;
5704 	struct netdev_rx_queue *rx;
5705 
5706 	BUG_ON(count < 1);
5707 
5708 	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5709 	if (!rx)
5710 		return -ENOMEM;
5711 
5712 	dev->_rx = rx;
5713 
5714 	for (i = 0; i < count; i++)
5715 		rx[i].dev = dev;
5716 	return 0;
5717 }
5718 #endif
5719 
5720 static void netdev_init_one_queue(struct net_device *dev,
5721 				  struct netdev_queue *queue, void *_unused)
5722 {
5723 	/* Initialize queue lock */
5724 	spin_lock_init(&queue->_xmit_lock);
5725 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5726 	queue->xmit_lock_owner = -1;
5727 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5728 	queue->dev = dev;
5729 #ifdef CONFIG_BQL
5730 	dql_init(&queue->dql, HZ);
5731 #endif
5732 }
5733 
5734 static void netif_free_tx_queues(struct net_device *dev)
5735 {
5736 	if (is_vmalloc_addr(dev->_tx))
5737 		vfree(dev->_tx);
5738 	else
5739 		kfree(dev->_tx);
5740 }
5741 
5742 static int netif_alloc_netdev_queues(struct net_device *dev)
5743 {
5744 	unsigned int count = dev->num_tx_queues;
5745 	struct netdev_queue *tx;
5746 	size_t sz = count * sizeof(*tx);
5747 
5748 	BUG_ON(count < 1 || count > 0xffff);
5749 
5750 	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
5751 	if (!tx) {
5752 		tx = vzalloc(sz);
5753 		if (!tx)
5754 			return -ENOMEM;
5755 	}
5756 	dev->_tx = tx;
5757 
5758 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5759 	spin_lock_init(&dev->tx_global_lock);
5760 
5761 	return 0;
5762 }
5763 
5764 /**
5765  *	register_netdevice	- register a network device
5766  *	@dev: device to register
5767  *
5768  *	Take a completed network device structure and add it to the kernel
5769  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5770  *	chain. 0 is returned on success. A negative errno code is returned
5771  *	on a failure to set up the device, or if the name is a duplicate.
5772  *
5773  *	Callers must hold the rtnl semaphore. You may want
5774  *	register_netdev() instead of this.
5775  *
5776  *	BUGS:
5777  *	The locking appears insufficient to guarantee two parallel registers
5778  *	will not get the same name.
5779  */
5780 
5781 int register_netdevice(struct net_device *dev)
5782 {
5783 	int ret;
5784 	struct net *net = dev_net(dev);
5785 
5786 	BUG_ON(dev_boot_phase);
5787 	ASSERT_RTNL();
5788 
5789 	might_sleep();
5790 
5791 	/* When net_device's are persistent, this will be fatal. */
5792 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5793 	BUG_ON(!net);
5794 
5795 	spin_lock_init(&dev->addr_list_lock);
5796 	netdev_set_addr_lockdep_class(dev);
5797 
5798 	dev->iflink = -1;
5799 
5800 	ret = dev_get_valid_name(net, dev, dev->name);
5801 	if (ret < 0)
5802 		goto out;
5803 
5804 	/* Init, if this function is available */
5805 	if (dev->netdev_ops->ndo_init) {
5806 		ret = dev->netdev_ops->ndo_init(dev);
5807 		if (ret) {
5808 			if (ret > 0)
5809 				ret = -EIO;
5810 			goto out;
5811 		}
5812 	}
5813 
5814 	if (((dev->hw_features | dev->features) &
5815 	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
5816 	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5817 	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5818 		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5819 		ret = -EINVAL;
5820 		goto err_uninit;
5821 	}
5822 
5823 	ret = -EBUSY;
5824 	if (!dev->ifindex)
5825 		dev->ifindex = dev_new_index(net);
5826 	else if (__dev_get_by_index(net, dev->ifindex))
5827 		goto err_uninit;
5828 
5829 	if (dev->iflink == -1)
5830 		dev->iflink = dev->ifindex;
5831 
5832 	/* Transfer changeable features to wanted_features and enable
5833 	 * software offloads (GSO and GRO).
5834 	 */
5835 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5836 	dev->features |= NETIF_F_SOFT_FEATURES;
5837 	dev->wanted_features = dev->features & dev->hw_features;
5838 
5839 	/* Turn on no cache copy if HW is doing checksum */
5840 	if (!(dev->flags & IFF_LOOPBACK)) {
5841 		dev->hw_features |= NETIF_F_NOCACHE_COPY;
5842 		if (dev->features & NETIF_F_ALL_CSUM) {
5843 			dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5844 			dev->features |= NETIF_F_NOCACHE_COPY;
5845 		}
5846 	}
5847 
5848 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5849 	 */
5850 	dev->vlan_features |= NETIF_F_HIGHDMA;
5851 
5852 	/* Make NETIF_F_SG inheritable to tunnel devices.
5853 	 */
5854 	dev->hw_enc_features |= NETIF_F_SG;
5855 
5856 	/* Make NETIF_F_SG inheritable to MPLS.
5857 	 */
5858 	dev->mpls_features |= NETIF_F_SG;
5859 
5860 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5861 	ret = notifier_to_errno(ret);
5862 	if (ret)
5863 		goto err_uninit;
5864 
5865 	ret = netdev_register_kobject(dev);
5866 	if (ret)
5867 		goto err_uninit;
5868 	dev->reg_state = NETREG_REGISTERED;
5869 
5870 	__netdev_update_features(dev);
5871 
5872 	/*
5873 	 *	Default initial state at registry is that the
5874 	 *	device is present.
5875 	 */
5876 
5877 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5878 
5879 	linkwatch_init_dev(dev);
5880 
5881 	dev_init_scheduler(dev);
5882 	dev_hold(dev);
5883 	list_netdevice(dev);
5884 	add_device_randomness(dev->dev_addr, dev->addr_len);
5885 
5886 	/* If the device has permanent device address, driver should
5887 	 * set dev_addr and also addr_assign_type should be set to
5888 	 * NET_ADDR_PERM (default value).
5889 	 */
5890 	if (dev->addr_assign_type == NET_ADDR_PERM)
5891 		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5892 
5893 	/* Notify protocols, that a new device appeared. */
5894 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5895 	ret = notifier_to_errno(ret);
5896 	if (ret) {
5897 		rollback_registered(dev);
5898 		dev->reg_state = NETREG_UNREGISTERED;
5899 	}
5900 	/*
5901 	 *	Prevent userspace races by waiting until the network
5902 	 *	device is fully setup before sending notifications.
5903 	 */
5904 	if (!dev->rtnl_link_ops ||
5905 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5906 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
5907 
5908 out:
5909 	return ret;
5910 
5911 err_uninit:
5912 	if (dev->netdev_ops->ndo_uninit)
5913 		dev->netdev_ops->ndo_uninit(dev);
5914 	goto out;
5915 }
5916 EXPORT_SYMBOL(register_netdevice);
5917 
5918 /**
5919  *	init_dummy_netdev	- init a dummy network device for NAPI
5920  *	@dev: device to init
5921  *
5922  *	This takes a network device structure and initialize the minimum
5923  *	amount of fields so it can be used to schedule NAPI polls without
5924  *	registering a full blown interface. This is to be used by drivers
5925  *	that need to tie several hardware interfaces to a single NAPI
5926  *	poll scheduler due to HW limitations.
5927  */
5928 int init_dummy_netdev(struct net_device *dev)
5929 {
5930 	/* Clear everything. Note we don't initialize spinlocks
5931 	 * are they aren't supposed to be taken by any of the
5932 	 * NAPI code and this dummy netdev is supposed to be
5933 	 * only ever used for NAPI polls
5934 	 */
5935 	memset(dev, 0, sizeof(struct net_device));
5936 
5937 	/* make sure we BUG if trying to hit standard
5938 	 * register/unregister code path
5939 	 */
5940 	dev->reg_state = NETREG_DUMMY;
5941 
5942 	/* NAPI wants this */
5943 	INIT_LIST_HEAD(&dev->napi_list);
5944 
5945 	/* a dummy interface is started by default */
5946 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5947 	set_bit(__LINK_STATE_START, &dev->state);
5948 
5949 	/* Note : We dont allocate pcpu_refcnt for dummy devices,
5950 	 * because users of this 'device' dont need to change
5951 	 * its refcount.
5952 	 */
5953 
5954 	return 0;
5955 }
5956 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5957 
5958 
5959 /**
5960  *	register_netdev	- register a network device
5961  *	@dev: device to register
5962  *
5963  *	Take a completed network device structure and add it to the kernel
5964  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5965  *	chain. 0 is returned on success. A negative errno code is returned
5966  *	on a failure to set up the device, or if the name is a duplicate.
5967  *
5968  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5969  *	and expands the device name if you passed a format string to
5970  *	alloc_netdev.
5971  */
5972 int register_netdev(struct net_device *dev)
5973 {
5974 	int err;
5975 
5976 	rtnl_lock();
5977 	err = register_netdevice(dev);
5978 	rtnl_unlock();
5979 	return err;
5980 }
5981 EXPORT_SYMBOL(register_netdev);
5982 
5983 int netdev_refcnt_read(const struct net_device *dev)
5984 {
5985 	int i, refcnt = 0;
5986 
5987 	for_each_possible_cpu(i)
5988 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5989 	return refcnt;
5990 }
5991 EXPORT_SYMBOL(netdev_refcnt_read);
5992 
5993 /**
5994  * netdev_wait_allrefs - wait until all references are gone.
5995  * @dev: target net_device
5996  *
5997  * This is called when unregistering network devices.
5998  *
5999  * Any protocol or device that holds a reference should register
6000  * for netdevice notification, and cleanup and put back the
6001  * reference if they receive an UNREGISTER event.
6002  * We can get stuck here if buggy protocols don't correctly
6003  * call dev_put.
6004  */
6005 static void netdev_wait_allrefs(struct net_device *dev)
6006 {
6007 	unsigned long rebroadcast_time, warning_time;
6008 	int refcnt;
6009 
6010 	linkwatch_forget_dev(dev);
6011 
6012 	rebroadcast_time = warning_time = jiffies;
6013 	refcnt = netdev_refcnt_read(dev);
6014 
6015 	while (refcnt != 0) {
6016 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6017 			rtnl_lock();
6018 
6019 			/* Rebroadcast unregister notification */
6020 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6021 
6022 			__rtnl_unlock();
6023 			rcu_barrier();
6024 			rtnl_lock();
6025 
6026 			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6027 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6028 				     &dev->state)) {
6029 				/* We must not have linkwatch events
6030 				 * pending on unregister. If this
6031 				 * happens, we simply run the queue
6032 				 * unscheduled, resulting in a noop
6033 				 * for this device.
6034 				 */
6035 				linkwatch_run_queue();
6036 			}
6037 
6038 			__rtnl_unlock();
6039 
6040 			rebroadcast_time = jiffies;
6041 		}
6042 
6043 		msleep(250);
6044 
6045 		refcnt = netdev_refcnt_read(dev);
6046 
6047 		if (time_after(jiffies, warning_time + 10 * HZ)) {
6048 			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6049 				 dev->name, refcnt);
6050 			warning_time = jiffies;
6051 		}
6052 	}
6053 }
6054 
6055 /* The sequence is:
6056  *
6057  *	rtnl_lock();
6058  *	...
6059  *	register_netdevice(x1);
6060  *	register_netdevice(x2);
6061  *	...
6062  *	unregister_netdevice(y1);
6063  *	unregister_netdevice(y2);
6064  *      ...
6065  *	rtnl_unlock();
6066  *	free_netdev(y1);
6067  *	free_netdev(y2);
6068  *
6069  * We are invoked by rtnl_unlock().
6070  * This allows us to deal with problems:
6071  * 1) We can delete sysfs objects which invoke hotplug
6072  *    without deadlocking with linkwatch via keventd.
6073  * 2) Since we run with the RTNL semaphore not held, we can sleep
6074  *    safely in order to wait for the netdev refcnt to drop to zero.
6075  *
6076  * We must not return until all unregister events added during
6077  * the interval the lock was held have been completed.
6078  */
6079 void netdev_run_todo(void)
6080 {
6081 	struct list_head list;
6082 
6083 	/* Snapshot list, allow later requests */
6084 	list_replace_init(&net_todo_list, &list);
6085 
6086 	__rtnl_unlock();
6087 
6088 
6089 	/* Wait for rcu callbacks to finish before next phase */
6090 	if (!list_empty(&list))
6091 		rcu_barrier();
6092 
6093 	while (!list_empty(&list)) {
6094 		struct net_device *dev
6095 			= list_first_entry(&list, struct net_device, todo_list);
6096 		list_del(&dev->todo_list);
6097 
6098 		rtnl_lock();
6099 		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6100 		__rtnl_unlock();
6101 
6102 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6103 			pr_err("network todo '%s' but state %d\n",
6104 			       dev->name, dev->reg_state);
6105 			dump_stack();
6106 			continue;
6107 		}
6108 
6109 		dev->reg_state = NETREG_UNREGISTERED;
6110 
6111 		on_each_cpu(flush_backlog, dev, 1);
6112 
6113 		netdev_wait_allrefs(dev);
6114 
6115 		/* paranoia */
6116 		BUG_ON(netdev_refcnt_read(dev));
6117 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
6118 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6119 		WARN_ON(dev->dn_ptr);
6120 
6121 		if (dev->destructor)
6122 			dev->destructor(dev);
6123 
6124 		/* Report a network device has been unregistered */
6125 		rtnl_lock();
6126 		dev_net(dev)->dev_unreg_count--;
6127 		__rtnl_unlock();
6128 		wake_up(&netdev_unregistering_wq);
6129 
6130 		/* Free network device */
6131 		kobject_put(&dev->dev.kobj);
6132 	}
6133 }
6134 
6135 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6136  * fields in the same order, with only the type differing.
6137  */
6138 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6139 			     const struct net_device_stats *netdev_stats)
6140 {
6141 #if BITS_PER_LONG == 64
6142 	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6143 	memcpy(stats64, netdev_stats, sizeof(*stats64));
6144 #else
6145 	size_t i, n = sizeof(*stats64) / sizeof(u64);
6146 	const unsigned long *src = (const unsigned long *)netdev_stats;
6147 	u64 *dst = (u64 *)stats64;
6148 
6149 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6150 		     sizeof(*stats64) / sizeof(u64));
6151 	for (i = 0; i < n; i++)
6152 		dst[i] = src[i];
6153 #endif
6154 }
6155 EXPORT_SYMBOL(netdev_stats_to_stats64);
6156 
6157 /**
6158  *	dev_get_stats	- get network device statistics
6159  *	@dev: device to get statistics from
6160  *	@storage: place to store stats
6161  *
6162  *	Get network statistics from device. Return @storage.
6163  *	The device driver may provide its own method by setting
6164  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6165  *	otherwise the internal statistics structure is used.
6166  */
6167 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6168 					struct rtnl_link_stats64 *storage)
6169 {
6170 	const struct net_device_ops *ops = dev->netdev_ops;
6171 
6172 	if (ops->ndo_get_stats64) {
6173 		memset(storage, 0, sizeof(*storage));
6174 		ops->ndo_get_stats64(dev, storage);
6175 	} else if (ops->ndo_get_stats) {
6176 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6177 	} else {
6178 		netdev_stats_to_stats64(storage, &dev->stats);
6179 	}
6180 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6181 	return storage;
6182 }
6183 EXPORT_SYMBOL(dev_get_stats);
6184 
6185 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6186 {
6187 	struct netdev_queue *queue = dev_ingress_queue(dev);
6188 
6189 #ifdef CONFIG_NET_CLS_ACT
6190 	if (queue)
6191 		return queue;
6192 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6193 	if (!queue)
6194 		return NULL;
6195 	netdev_init_one_queue(dev, queue, NULL);
6196 	queue->qdisc = &noop_qdisc;
6197 	queue->qdisc_sleeping = &noop_qdisc;
6198 	rcu_assign_pointer(dev->ingress_queue, queue);
6199 #endif
6200 	return queue;
6201 }
6202 
6203 static const struct ethtool_ops default_ethtool_ops;
6204 
6205 void netdev_set_default_ethtool_ops(struct net_device *dev,
6206 				    const struct ethtool_ops *ops)
6207 {
6208 	if (dev->ethtool_ops == &default_ethtool_ops)
6209 		dev->ethtool_ops = ops;
6210 }
6211 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6212 
6213 void netdev_freemem(struct net_device *dev)
6214 {
6215 	char *addr = (char *)dev - dev->padded;
6216 
6217 	if (is_vmalloc_addr(addr))
6218 		vfree(addr);
6219 	else
6220 		kfree(addr);
6221 }
6222 
6223 /**
6224  *	alloc_netdev_mqs - allocate network device
6225  *	@sizeof_priv:	size of private data to allocate space for
6226  *	@name:		device name format string
6227  *	@setup:		callback to initialize device
6228  *	@txqs:		the number of TX subqueues to allocate
6229  *	@rxqs:		the number of RX subqueues to allocate
6230  *
6231  *	Allocates a struct net_device with private data area for driver use
6232  *	and performs basic initialization.  Also allocates subquue structs
6233  *	for each queue on the device.
6234  */
6235 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6236 		void (*setup)(struct net_device *),
6237 		unsigned int txqs, unsigned int rxqs)
6238 {
6239 	struct net_device *dev;
6240 	size_t alloc_size;
6241 	struct net_device *p;
6242 
6243 	BUG_ON(strlen(name) >= sizeof(dev->name));
6244 
6245 	if (txqs < 1) {
6246 		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6247 		return NULL;
6248 	}
6249 
6250 #ifdef CONFIG_RPS
6251 	if (rxqs < 1) {
6252 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6253 		return NULL;
6254 	}
6255 #endif
6256 
6257 	alloc_size = sizeof(struct net_device);
6258 	if (sizeof_priv) {
6259 		/* ensure 32-byte alignment of private area */
6260 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6261 		alloc_size += sizeof_priv;
6262 	}
6263 	/* ensure 32-byte alignment of whole construct */
6264 	alloc_size += NETDEV_ALIGN - 1;
6265 
6266 	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6267 	if (!p)
6268 		p = vzalloc(alloc_size);
6269 	if (!p)
6270 		return NULL;
6271 
6272 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
6273 	dev->padded = (char *)dev - (char *)p;
6274 
6275 	dev->pcpu_refcnt = alloc_percpu(int);
6276 	if (!dev->pcpu_refcnt)
6277 		goto free_dev;
6278 
6279 	if (dev_addr_init(dev))
6280 		goto free_pcpu;
6281 
6282 	dev_mc_init(dev);
6283 	dev_uc_init(dev);
6284 
6285 	dev_net_set(dev, &init_net);
6286 
6287 	dev->gso_max_size = GSO_MAX_SIZE;
6288 	dev->gso_max_segs = GSO_MAX_SEGS;
6289 
6290 	INIT_LIST_HEAD(&dev->napi_list);
6291 	INIT_LIST_HEAD(&dev->unreg_list);
6292 	INIT_LIST_HEAD(&dev->close_list);
6293 	INIT_LIST_HEAD(&dev->link_watch_list);
6294 	INIT_LIST_HEAD(&dev->adj_list.upper);
6295 	INIT_LIST_HEAD(&dev->adj_list.lower);
6296 	INIT_LIST_HEAD(&dev->all_adj_list.upper);
6297 	INIT_LIST_HEAD(&dev->all_adj_list.lower);
6298 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
6299 	setup(dev);
6300 
6301 	dev->num_tx_queues = txqs;
6302 	dev->real_num_tx_queues = txqs;
6303 	if (netif_alloc_netdev_queues(dev))
6304 		goto free_all;
6305 
6306 #ifdef CONFIG_RPS
6307 	dev->num_rx_queues = rxqs;
6308 	dev->real_num_rx_queues = rxqs;
6309 	if (netif_alloc_rx_queues(dev))
6310 		goto free_all;
6311 #endif
6312 
6313 	strcpy(dev->name, name);
6314 	dev->group = INIT_NETDEV_GROUP;
6315 	if (!dev->ethtool_ops)
6316 		dev->ethtool_ops = &default_ethtool_ops;
6317 	return dev;
6318 
6319 free_all:
6320 	free_netdev(dev);
6321 	return NULL;
6322 
6323 free_pcpu:
6324 	free_percpu(dev->pcpu_refcnt);
6325 	netif_free_tx_queues(dev);
6326 #ifdef CONFIG_RPS
6327 	kfree(dev->_rx);
6328 #endif
6329 
6330 free_dev:
6331 	netdev_freemem(dev);
6332 	return NULL;
6333 }
6334 EXPORT_SYMBOL(alloc_netdev_mqs);
6335 
6336 /**
6337  *	free_netdev - free network device
6338  *	@dev: device
6339  *
6340  *	This function does the last stage of destroying an allocated device
6341  * 	interface. The reference to the device object is released.
6342  *	If this is the last reference then it will be freed.
6343  */
6344 void free_netdev(struct net_device *dev)
6345 {
6346 	struct napi_struct *p, *n;
6347 
6348 	release_net(dev_net(dev));
6349 
6350 	netif_free_tx_queues(dev);
6351 #ifdef CONFIG_RPS
6352 	kfree(dev->_rx);
6353 #endif
6354 
6355 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6356 
6357 	/* Flush device addresses */
6358 	dev_addr_flush(dev);
6359 
6360 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6361 		netif_napi_del(p);
6362 
6363 	free_percpu(dev->pcpu_refcnt);
6364 	dev->pcpu_refcnt = NULL;
6365 
6366 	/*  Compatibility with error handling in drivers */
6367 	if (dev->reg_state == NETREG_UNINITIALIZED) {
6368 		netdev_freemem(dev);
6369 		return;
6370 	}
6371 
6372 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6373 	dev->reg_state = NETREG_RELEASED;
6374 
6375 	/* will free via device release */
6376 	put_device(&dev->dev);
6377 }
6378 EXPORT_SYMBOL(free_netdev);
6379 
6380 /**
6381  *	synchronize_net -  Synchronize with packet receive processing
6382  *
6383  *	Wait for packets currently being received to be done.
6384  *	Does not block later packets from starting.
6385  */
6386 void synchronize_net(void)
6387 {
6388 	might_sleep();
6389 	if (rtnl_is_locked())
6390 		synchronize_rcu_expedited();
6391 	else
6392 		synchronize_rcu();
6393 }
6394 EXPORT_SYMBOL(synchronize_net);
6395 
6396 /**
6397  *	unregister_netdevice_queue - remove device from the kernel
6398  *	@dev: device
6399  *	@head: list
6400  *
6401  *	This function shuts down a device interface and removes it
6402  *	from the kernel tables.
6403  *	If head not NULL, device is queued to be unregistered later.
6404  *
6405  *	Callers must hold the rtnl semaphore.  You may want
6406  *	unregister_netdev() instead of this.
6407  */
6408 
6409 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6410 {
6411 	ASSERT_RTNL();
6412 
6413 	if (head) {
6414 		list_move_tail(&dev->unreg_list, head);
6415 	} else {
6416 		rollback_registered(dev);
6417 		/* Finish processing unregister after unlock */
6418 		net_set_todo(dev);
6419 	}
6420 }
6421 EXPORT_SYMBOL(unregister_netdevice_queue);
6422 
6423 /**
6424  *	unregister_netdevice_many - unregister many devices
6425  *	@head: list of devices
6426  */
6427 void unregister_netdevice_many(struct list_head *head)
6428 {
6429 	struct net_device *dev;
6430 
6431 	if (!list_empty(head)) {
6432 		rollback_registered_many(head);
6433 		list_for_each_entry(dev, head, unreg_list)
6434 			net_set_todo(dev);
6435 	}
6436 }
6437 EXPORT_SYMBOL(unregister_netdevice_many);
6438 
6439 /**
6440  *	unregister_netdev - remove device from the kernel
6441  *	@dev: device
6442  *
6443  *	This function shuts down a device interface and removes it
6444  *	from the kernel tables.
6445  *
6446  *	This is just a wrapper for unregister_netdevice that takes
6447  *	the rtnl semaphore.  In general you want to use this and not
6448  *	unregister_netdevice.
6449  */
6450 void unregister_netdev(struct net_device *dev)
6451 {
6452 	rtnl_lock();
6453 	unregister_netdevice(dev);
6454 	rtnl_unlock();
6455 }
6456 EXPORT_SYMBOL(unregister_netdev);
6457 
6458 /**
6459  *	dev_change_net_namespace - move device to different nethost namespace
6460  *	@dev: device
6461  *	@net: network namespace
6462  *	@pat: If not NULL name pattern to try if the current device name
6463  *	      is already taken in the destination network namespace.
6464  *
6465  *	This function shuts down a device interface and moves it
6466  *	to a new network namespace. On success 0 is returned, on
6467  *	a failure a netagive errno code is returned.
6468  *
6469  *	Callers must hold the rtnl semaphore.
6470  */
6471 
6472 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6473 {
6474 	int err;
6475 
6476 	ASSERT_RTNL();
6477 
6478 	/* Don't allow namespace local devices to be moved. */
6479 	err = -EINVAL;
6480 	if (dev->features & NETIF_F_NETNS_LOCAL)
6481 		goto out;
6482 
6483 	/* Ensure the device has been registrered */
6484 	if (dev->reg_state != NETREG_REGISTERED)
6485 		goto out;
6486 
6487 	/* Get out if there is nothing todo */
6488 	err = 0;
6489 	if (net_eq(dev_net(dev), net))
6490 		goto out;
6491 
6492 	/* Pick the destination device name, and ensure
6493 	 * we can use it in the destination network namespace.
6494 	 */
6495 	err = -EEXIST;
6496 	if (__dev_get_by_name(net, dev->name)) {
6497 		/* We get here if we can't use the current device name */
6498 		if (!pat)
6499 			goto out;
6500 		if (dev_get_valid_name(net, dev, pat) < 0)
6501 			goto out;
6502 	}
6503 
6504 	/*
6505 	 * And now a mini version of register_netdevice unregister_netdevice.
6506 	 */
6507 
6508 	/* If device is running close it first. */
6509 	dev_close(dev);
6510 
6511 	/* And unlink it from device chain */
6512 	err = -ENODEV;
6513 	unlist_netdevice(dev);
6514 
6515 	synchronize_net();
6516 
6517 	/* Shutdown queueing discipline. */
6518 	dev_shutdown(dev);
6519 
6520 	/* Notify protocols, that we are about to destroy
6521 	   this device. They should clean all the things.
6522 
6523 	   Note that dev->reg_state stays at NETREG_REGISTERED.
6524 	   This is wanted because this way 8021q and macvlan know
6525 	   the device is just moving and can keep their slaves up.
6526 	*/
6527 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6528 	rcu_barrier();
6529 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6530 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6531 
6532 	/*
6533 	 *	Flush the unicast and multicast chains
6534 	 */
6535 	dev_uc_flush(dev);
6536 	dev_mc_flush(dev);
6537 
6538 	/* Send a netdev-removed uevent to the old namespace */
6539 	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6540 
6541 	/* Actually switch the network namespace */
6542 	dev_net_set(dev, net);
6543 
6544 	/* If there is an ifindex conflict assign a new one */
6545 	if (__dev_get_by_index(net, dev->ifindex)) {
6546 		int iflink = (dev->iflink == dev->ifindex);
6547 		dev->ifindex = dev_new_index(net);
6548 		if (iflink)
6549 			dev->iflink = dev->ifindex;
6550 	}
6551 
6552 	/* Send a netdev-add uevent to the new namespace */
6553 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6554 
6555 	/* Fixup kobjects */
6556 	err = device_rename(&dev->dev, dev->name);
6557 	WARN_ON(err);
6558 
6559 	/* Add the device back in the hashes */
6560 	list_netdevice(dev);
6561 
6562 	/* Notify protocols, that a new device appeared. */
6563 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6564 
6565 	/*
6566 	 *	Prevent userspace races by waiting until the network
6567 	 *	device is fully setup before sending notifications.
6568 	 */
6569 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6570 
6571 	synchronize_net();
6572 	err = 0;
6573 out:
6574 	return err;
6575 }
6576 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6577 
6578 static int dev_cpu_callback(struct notifier_block *nfb,
6579 			    unsigned long action,
6580 			    void *ocpu)
6581 {
6582 	struct sk_buff **list_skb;
6583 	struct sk_buff *skb;
6584 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6585 	struct softnet_data *sd, *oldsd;
6586 
6587 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6588 		return NOTIFY_OK;
6589 
6590 	local_irq_disable();
6591 	cpu = smp_processor_id();
6592 	sd = &per_cpu(softnet_data, cpu);
6593 	oldsd = &per_cpu(softnet_data, oldcpu);
6594 
6595 	/* Find end of our completion_queue. */
6596 	list_skb = &sd->completion_queue;
6597 	while (*list_skb)
6598 		list_skb = &(*list_skb)->next;
6599 	/* Append completion queue from offline CPU. */
6600 	*list_skb = oldsd->completion_queue;
6601 	oldsd->completion_queue = NULL;
6602 
6603 	/* Append output queue from offline CPU. */
6604 	if (oldsd->output_queue) {
6605 		*sd->output_queue_tailp = oldsd->output_queue;
6606 		sd->output_queue_tailp = oldsd->output_queue_tailp;
6607 		oldsd->output_queue = NULL;
6608 		oldsd->output_queue_tailp = &oldsd->output_queue;
6609 	}
6610 	/* Append NAPI poll list from offline CPU. */
6611 	if (!list_empty(&oldsd->poll_list)) {
6612 		list_splice_init(&oldsd->poll_list, &sd->poll_list);
6613 		raise_softirq_irqoff(NET_RX_SOFTIRQ);
6614 	}
6615 
6616 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6617 	local_irq_enable();
6618 
6619 	/* Process offline CPU's input_pkt_queue */
6620 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6621 		netif_rx(skb);
6622 		input_queue_head_incr(oldsd);
6623 	}
6624 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6625 		netif_rx(skb);
6626 		input_queue_head_incr(oldsd);
6627 	}
6628 
6629 	return NOTIFY_OK;
6630 }
6631 
6632 
6633 /**
6634  *	netdev_increment_features - increment feature set by one
6635  *	@all: current feature set
6636  *	@one: new feature set
6637  *	@mask: mask feature set
6638  *
6639  *	Computes a new feature set after adding a device with feature set
6640  *	@one to the master device with current feature set @all.  Will not
6641  *	enable anything that is off in @mask. Returns the new feature set.
6642  */
6643 netdev_features_t netdev_increment_features(netdev_features_t all,
6644 	netdev_features_t one, netdev_features_t mask)
6645 {
6646 	if (mask & NETIF_F_GEN_CSUM)
6647 		mask |= NETIF_F_ALL_CSUM;
6648 	mask |= NETIF_F_VLAN_CHALLENGED;
6649 
6650 	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6651 	all &= one | ~NETIF_F_ALL_FOR_ALL;
6652 
6653 	/* If one device supports hw checksumming, set for all. */
6654 	if (all & NETIF_F_GEN_CSUM)
6655 		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6656 
6657 	return all;
6658 }
6659 EXPORT_SYMBOL(netdev_increment_features);
6660 
6661 static struct hlist_head * __net_init netdev_create_hash(void)
6662 {
6663 	int i;
6664 	struct hlist_head *hash;
6665 
6666 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6667 	if (hash != NULL)
6668 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6669 			INIT_HLIST_HEAD(&hash[i]);
6670 
6671 	return hash;
6672 }
6673 
6674 /* Initialize per network namespace state */
6675 static int __net_init netdev_init(struct net *net)
6676 {
6677 	if (net != &init_net)
6678 		INIT_LIST_HEAD(&net->dev_base_head);
6679 
6680 	net->dev_name_head = netdev_create_hash();
6681 	if (net->dev_name_head == NULL)
6682 		goto err_name;
6683 
6684 	net->dev_index_head = netdev_create_hash();
6685 	if (net->dev_index_head == NULL)
6686 		goto err_idx;
6687 
6688 	return 0;
6689 
6690 err_idx:
6691 	kfree(net->dev_name_head);
6692 err_name:
6693 	return -ENOMEM;
6694 }
6695 
6696 /**
6697  *	netdev_drivername - network driver for the device
6698  *	@dev: network device
6699  *
6700  *	Determine network driver for device.
6701  */
6702 const char *netdev_drivername(const struct net_device *dev)
6703 {
6704 	const struct device_driver *driver;
6705 	const struct device *parent;
6706 	const char *empty = "";
6707 
6708 	parent = dev->dev.parent;
6709 	if (!parent)
6710 		return empty;
6711 
6712 	driver = parent->driver;
6713 	if (driver && driver->name)
6714 		return driver->name;
6715 	return empty;
6716 }
6717 
6718 static int __netdev_printk(const char *level, const struct net_device *dev,
6719 			   struct va_format *vaf)
6720 {
6721 	int r;
6722 
6723 	if (dev && dev->dev.parent) {
6724 		r = dev_printk_emit(level[1] - '0',
6725 				    dev->dev.parent,
6726 				    "%s %s %s: %pV",
6727 				    dev_driver_string(dev->dev.parent),
6728 				    dev_name(dev->dev.parent),
6729 				    netdev_name(dev), vaf);
6730 	} else if (dev) {
6731 		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6732 	} else {
6733 		r = printk("%s(NULL net_device): %pV", level, vaf);
6734 	}
6735 
6736 	return r;
6737 }
6738 
6739 int netdev_printk(const char *level, const struct net_device *dev,
6740 		  const char *format, ...)
6741 {
6742 	struct va_format vaf;
6743 	va_list args;
6744 	int r;
6745 
6746 	va_start(args, format);
6747 
6748 	vaf.fmt = format;
6749 	vaf.va = &args;
6750 
6751 	r = __netdev_printk(level, dev, &vaf);
6752 
6753 	va_end(args);
6754 
6755 	return r;
6756 }
6757 EXPORT_SYMBOL(netdev_printk);
6758 
6759 #define define_netdev_printk_level(func, level)			\
6760 int func(const struct net_device *dev, const char *fmt, ...)	\
6761 {								\
6762 	int r;							\
6763 	struct va_format vaf;					\
6764 	va_list args;						\
6765 								\
6766 	va_start(args, fmt);					\
6767 								\
6768 	vaf.fmt = fmt;						\
6769 	vaf.va = &args;						\
6770 								\
6771 	r = __netdev_printk(level, dev, &vaf);			\
6772 								\
6773 	va_end(args);						\
6774 								\
6775 	return r;						\
6776 }								\
6777 EXPORT_SYMBOL(func);
6778 
6779 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6780 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6781 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6782 define_netdev_printk_level(netdev_err, KERN_ERR);
6783 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6784 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6785 define_netdev_printk_level(netdev_info, KERN_INFO);
6786 
6787 static void __net_exit netdev_exit(struct net *net)
6788 {
6789 	kfree(net->dev_name_head);
6790 	kfree(net->dev_index_head);
6791 }
6792 
6793 static struct pernet_operations __net_initdata netdev_net_ops = {
6794 	.init = netdev_init,
6795 	.exit = netdev_exit,
6796 };
6797 
6798 static void __net_exit default_device_exit(struct net *net)
6799 {
6800 	struct net_device *dev, *aux;
6801 	/*
6802 	 * Push all migratable network devices back to the
6803 	 * initial network namespace
6804 	 */
6805 	rtnl_lock();
6806 	for_each_netdev_safe(net, dev, aux) {
6807 		int err;
6808 		char fb_name[IFNAMSIZ];
6809 
6810 		/* Ignore unmoveable devices (i.e. loopback) */
6811 		if (dev->features & NETIF_F_NETNS_LOCAL)
6812 			continue;
6813 
6814 		/* Leave virtual devices for the generic cleanup */
6815 		if (dev->rtnl_link_ops)
6816 			continue;
6817 
6818 		/* Push remaining network devices to init_net */
6819 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6820 		err = dev_change_net_namespace(dev, &init_net, fb_name);
6821 		if (err) {
6822 			pr_emerg("%s: failed to move %s to init_net: %d\n",
6823 				 __func__, dev->name, err);
6824 			BUG();
6825 		}
6826 	}
6827 	rtnl_unlock();
6828 }
6829 
6830 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
6831 {
6832 	/* Return with the rtnl_lock held when there are no network
6833 	 * devices unregistering in any network namespace in net_list.
6834 	 */
6835 	struct net *net;
6836 	bool unregistering;
6837 	DEFINE_WAIT(wait);
6838 
6839 	for (;;) {
6840 		prepare_to_wait(&netdev_unregistering_wq, &wait,
6841 				TASK_UNINTERRUPTIBLE);
6842 		unregistering = false;
6843 		rtnl_lock();
6844 		list_for_each_entry(net, net_list, exit_list) {
6845 			if (net->dev_unreg_count > 0) {
6846 				unregistering = true;
6847 				break;
6848 			}
6849 		}
6850 		if (!unregistering)
6851 			break;
6852 		__rtnl_unlock();
6853 		schedule();
6854 	}
6855 	finish_wait(&netdev_unregistering_wq, &wait);
6856 }
6857 
6858 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6859 {
6860 	/* At exit all network devices most be removed from a network
6861 	 * namespace.  Do this in the reverse order of registration.
6862 	 * Do this across as many network namespaces as possible to
6863 	 * improve batching efficiency.
6864 	 */
6865 	struct net_device *dev;
6866 	struct net *net;
6867 	LIST_HEAD(dev_kill_list);
6868 
6869 	/* To prevent network device cleanup code from dereferencing
6870 	 * loopback devices or network devices that have been freed
6871 	 * wait here for all pending unregistrations to complete,
6872 	 * before unregistring the loopback device and allowing the
6873 	 * network namespace be freed.
6874 	 *
6875 	 * The netdev todo list containing all network devices
6876 	 * unregistrations that happen in default_device_exit_batch
6877 	 * will run in the rtnl_unlock() at the end of
6878 	 * default_device_exit_batch.
6879 	 */
6880 	rtnl_lock_unregistering(net_list);
6881 	list_for_each_entry(net, net_list, exit_list) {
6882 		for_each_netdev_reverse(net, dev) {
6883 			if (dev->rtnl_link_ops)
6884 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6885 			else
6886 				unregister_netdevice_queue(dev, &dev_kill_list);
6887 		}
6888 	}
6889 	unregister_netdevice_many(&dev_kill_list);
6890 	list_del(&dev_kill_list);
6891 	rtnl_unlock();
6892 }
6893 
6894 static struct pernet_operations __net_initdata default_device_ops = {
6895 	.exit = default_device_exit,
6896 	.exit_batch = default_device_exit_batch,
6897 };
6898 
6899 /*
6900  *	Initialize the DEV module. At boot time this walks the device list and
6901  *	unhooks any devices that fail to initialise (normally hardware not
6902  *	present) and leaves us with a valid list of present and active devices.
6903  *
6904  */
6905 
6906 /*
6907  *       This is called single threaded during boot, so no need
6908  *       to take the rtnl semaphore.
6909  */
6910 static int __init net_dev_init(void)
6911 {
6912 	int i, rc = -ENOMEM;
6913 
6914 	BUG_ON(!dev_boot_phase);
6915 
6916 	if (dev_proc_init())
6917 		goto out;
6918 
6919 	if (netdev_kobject_init())
6920 		goto out;
6921 
6922 	INIT_LIST_HEAD(&ptype_all);
6923 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6924 		INIT_LIST_HEAD(&ptype_base[i]);
6925 
6926 	INIT_LIST_HEAD(&offload_base);
6927 
6928 	if (register_pernet_subsys(&netdev_net_ops))
6929 		goto out;
6930 
6931 	/*
6932 	 *	Initialise the packet receive queues.
6933 	 */
6934 
6935 	for_each_possible_cpu(i) {
6936 		struct softnet_data *sd = &per_cpu(softnet_data, i);
6937 
6938 		memset(sd, 0, sizeof(*sd));
6939 		skb_queue_head_init(&sd->input_pkt_queue);
6940 		skb_queue_head_init(&sd->process_queue);
6941 		sd->completion_queue = NULL;
6942 		INIT_LIST_HEAD(&sd->poll_list);
6943 		sd->output_queue = NULL;
6944 		sd->output_queue_tailp = &sd->output_queue;
6945 #ifdef CONFIG_RPS
6946 		sd->csd.func = rps_trigger_softirq;
6947 		sd->csd.info = sd;
6948 		sd->csd.flags = 0;
6949 		sd->cpu = i;
6950 #endif
6951 
6952 		sd->backlog.poll = process_backlog;
6953 		sd->backlog.weight = weight_p;
6954 		sd->backlog.gro_list = NULL;
6955 		sd->backlog.gro_count = 0;
6956 
6957 #ifdef CONFIG_NET_FLOW_LIMIT
6958 		sd->flow_limit = NULL;
6959 #endif
6960 	}
6961 
6962 	dev_boot_phase = 0;
6963 
6964 	/* The loopback device is special if any other network devices
6965 	 * is present in a network namespace the loopback device must
6966 	 * be present. Since we now dynamically allocate and free the
6967 	 * loopback device ensure this invariant is maintained by
6968 	 * keeping the loopback device as the first device on the
6969 	 * list of network devices.  Ensuring the loopback devices
6970 	 * is the first device that appears and the last network device
6971 	 * that disappears.
6972 	 */
6973 	if (register_pernet_device(&loopback_net_ops))
6974 		goto out;
6975 
6976 	if (register_pernet_device(&default_device_ops))
6977 		goto out;
6978 
6979 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6980 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6981 
6982 	hotcpu_notifier(dev_cpu_callback, 0);
6983 	dst_init();
6984 	rc = 0;
6985 out:
6986 	return rc;
6987 }
6988 
6989 subsys_initcall(net_dev_init);
6990