xref: /openbmc/linux/net/core/dev.c (revision bc000245)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/pkt_sched.h>
103 #include <net/checksum.h>
104 #include <net/xfrm.h>
105 #include <linux/highmem.h>
106 #include <linux/init.h>
107 #include <linux/module.h>
108 #include <linux/netpoll.h>
109 #include <linux/rcupdate.h>
110 #include <linux/delay.h>
111 #include <net/iw_handler.h>
112 #include <asm/current.h>
113 #include <linux/audit.h>
114 #include <linux/dmaengine.h>
115 #include <linux/err.h>
116 #include <linux/ctype.h>
117 #include <linux/if_arp.h>
118 #include <linux/if_vlan.h>
119 #include <linux/ip.h>
120 #include <net/ip.h>
121 #include <linux/ipv6.h>
122 #include <linux/in.h>
123 #include <linux/jhash.h>
124 #include <linux/random.h>
125 #include <trace/events/napi.h>
126 #include <trace/events/net.h>
127 #include <trace/events/skb.h>
128 #include <linux/pci.h>
129 #include <linux/inetdevice.h>
130 #include <linux/cpu_rmap.h>
131 #include <linux/static_key.h>
132 #include <linux/hashtable.h>
133 #include <linux/vmalloc.h>
134 #include <linux/if_macvlan.h>
135 
136 #include "net-sysfs.h"
137 
138 /* Instead of increasing this, you should create a hash table. */
139 #define MAX_GRO_SKBS 8
140 
141 /* This should be increased if a protocol with a bigger head is added. */
142 #define GRO_MAX_HEAD (MAX_HEADER + 128)
143 
144 static DEFINE_SPINLOCK(ptype_lock);
145 static DEFINE_SPINLOCK(offload_lock);
146 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
147 struct list_head ptype_all __read_mostly;	/* Taps */
148 static struct list_head offload_base __read_mostly;
149 
150 /*
151  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
152  * semaphore.
153  *
154  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
155  *
156  * Writers must hold the rtnl semaphore while they loop through the
157  * dev_base_head list, and hold dev_base_lock for writing when they do the
158  * actual updates.  This allows pure readers to access the list even
159  * while a writer is preparing to update it.
160  *
161  * To put it another way, dev_base_lock is held for writing only to
162  * protect against pure readers; the rtnl semaphore provides the
163  * protection against other writers.
164  *
165  * See, for example usages, register_netdevice() and
166  * unregister_netdevice(), which must be called with the rtnl
167  * semaphore held.
168  */
/* Taken for writing around the actual device list updates (see above). */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
171 
172 /* protects napi_hash addition/deletion and napi_gen_id */
173 static DEFINE_SPINLOCK(napi_hash_lock);
174 
175 static unsigned int napi_gen_id;
176 static DEFINE_HASHTABLE(napi_hash, 8);
177 
178 static seqcount_t devnet_rename_seq;
179 
180 static inline void dev_base_seq_inc(struct net *net)
181 {
182 	while (++net->dev_base_seq == 0);
183 }
184 
185 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
186 {
187 	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
188 
189 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
190 }
191 
192 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
193 {
194 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
195 }
196 
/*
 * Take the lock of @sd's input packet queue.  Compiles to nothing unless
 * CONFIG_RPS is set.
 */
static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spinlock_t *queue_lock = &sd->input_pkt_queue.lock;

	spin_lock(queue_lock);
#endif
}
203 
/*
 * Release the lock of @sd's input packet queue.  Compiles to nothing unless
 * CONFIG_RPS is set.
 */
static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spinlock_t *queue_lock = &sd->input_pkt_queue.lock;

	spin_unlock(queue_lock);
#endif
}
210 
211 /* Device list insertion */
212 static void list_netdevice(struct net_device *dev)
213 {
214 	struct net *net = dev_net(dev);
215 
216 	ASSERT_RTNL();
217 
218 	write_lock_bh(&dev_base_lock);
219 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
220 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
221 	hlist_add_head_rcu(&dev->index_hlist,
222 			   dev_index_hash(net, dev->ifindex));
223 	write_unlock_bh(&dev_base_lock);
224 
225 	dev_base_seq_inc(net);
226 }
227 
228 /* Device list removal
229  * caller must respect a RCU grace period before freeing/reusing dev
230  */
231 static void unlist_netdevice(struct net_device *dev)
232 {
233 	ASSERT_RTNL();
234 
235 	/* Unlink dev from the device chain */
236 	write_lock_bh(&dev_base_lock);
237 	list_del_rcu(&dev->dev_list);
238 	hlist_del_rcu(&dev->name_hlist);
239 	hlist_del_rcu(&dev->index_hlist);
240 	write_unlock_bh(&dev_base_lock);
241 
242 	dev_base_seq_inc(dev_net(dev));
243 }
244 
245 /*
246  *	Our notifier list
247  */
248 
249 static RAW_NOTIFIER_HEAD(netdev_chain);
250 
251 /*
252  *	Device drivers call our routines to queue packets here. We empty the
253  *	queue in the local softnet handler.
254  */
255 
256 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
257 EXPORT_PER_CPU_SYMBOL(softnet_data);
258 
259 #ifdef CONFIG_LOCKDEP
260 /*
261  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
262  * according to dev->type
263  */
264 static const unsigned short netdev_lock_type[] =
265 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
266 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
267 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
268 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
269 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
270 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
271 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
272 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
273 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
274 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
275 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
276 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
277 	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
278 	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
279 	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
280 
281 static const char *const netdev_lock_name[] =
282 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
283 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
284 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
285 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
286 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
287 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
288 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
289 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
290 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
291 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
292 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
293 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
294 	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
295 	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
296 	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
297 
298 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
299 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
300 
301 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
302 {
303 	int i;
304 
305 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
306 		if (netdev_lock_type[i] == dev_type)
307 			return i;
308 	/* the last key is used by default */
309 	return ARRAY_SIZE(netdev_lock_type) - 1;
310 }
311 
312 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
313 						 unsigned short dev_type)
314 {
315 	int i;
316 
317 	i = netdev_lock_pos(dev_type);
318 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
319 				   netdev_lock_name[i]);
320 }
321 
322 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
323 {
324 	int i;
325 
326 	i = netdev_lock_pos(dev->type);
327 	lockdep_set_class_and_name(&dev->addr_list_lock,
328 				   &netdev_addr_lock_key[i],
329 				   netdev_lock_name[i]);
330 }
331 #else
332 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
333 						 unsigned short dev_type)
334 {
335 }
336 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
337 {
338 }
339 #endif
340 
341 /*******************************************************************************
342 
343 		Protocol management and registration routines
344 
345 *******************************************************************************/
346 
347 /*
348  *	Add a protocol ID to the list. Now that the input handler is
349  *	smarter we can dispense with all the messy stuff that used to be
350  *	here.
351  *
 *	BEWARE!!! Protocol handlers that mangle input packets
 *	MUST BE last in their hash buckets, and the walk over protocol
 *	handlers MUST start from the promiscuous ptype_all chain in net_bh.
 *	This is true now; do not change it.
 *	Explanation: if a protocol handler that mangles the packet were
 *	first on the list, it would be unable to sense that the packet
 *	is cloned and should be copied-on-write, so it would change the
 *	packet in place and subsequent readers would get a broken packet.
360  *							--ANK (980803)
361  */
362 
363 static inline struct list_head *ptype_head(const struct packet_type *pt)
364 {
365 	if (pt->type == htons(ETH_P_ALL))
366 		return &ptype_all;
367 	else
368 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
369 }
370 
371 /**
372  *	dev_add_pack - add packet handler
373  *	@pt: packet type declaration
374  *
375  *	Add a protocol handler to the networking stack. The passed &packet_type
376  *	is linked into kernel lists and may not be freed until it has been
377  *	removed from the kernel lists.
378  *
379  *	This call does not sleep therefore it can not
380  *	guarantee all CPU's that are in middle of receiving packets
381  *	will see the new packet type (until the next received packet).
382  */
383 
384 void dev_add_pack(struct packet_type *pt)
385 {
386 	struct list_head *head = ptype_head(pt);
387 
388 	spin_lock(&ptype_lock);
389 	list_add_rcu(&pt->list, head);
390 	spin_unlock(&ptype_lock);
391 }
392 EXPORT_SYMBOL(dev_add_pack);
393 
394 /**
395  *	__dev_remove_pack	 - remove packet handler
396  *	@pt: packet type declaration
397  *
398  *	Remove a protocol handler that was previously added to the kernel
399  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
400  *	from the kernel lists and can be freed or reused once this function
401  *	returns.
402  *
403  *      The packet type might still be in use by receivers
404  *	and must not be freed until after all the CPU's have gone
405  *	through a quiescent state.
406  */
407 void __dev_remove_pack(struct packet_type *pt)
408 {
409 	struct list_head *head = ptype_head(pt);
410 	struct packet_type *pt1;
411 
412 	spin_lock(&ptype_lock);
413 
414 	list_for_each_entry(pt1, head, list) {
415 		if (pt == pt1) {
416 			list_del_rcu(&pt->list);
417 			goto out;
418 		}
419 	}
420 
421 	pr_warn("dev_remove_pack: %p not found\n", pt);
422 out:
423 	spin_unlock(&ptype_lock);
424 }
425 EXPORT_SYMBOL(__dev_remove_pack);
426 
/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Unlink a protocol handler that was previously registered with
 *	dev_add_pack() and wait with synchronize_net() afterwards.  Once
 *	this function returns, @pt is off the kernel lists and can be
 *	freed or reused.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	/* Unlink first ... */
	__dev_remove_pack(pt);

	/* ... then wait until no receiver can still be using it. */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
446 
447 
448 /**
449  *	dev_add_offload - register offload handlers
450  *	@po: protocol offload declaration
451  *
452  *	Add protocol offload handlers to the networking stack. The passed
453  *	&proto_offload is linked into kernel lists and may not be freed until
454  *	it has been removed from the kernel lists.
455  *
456  *	This call does not sleep therefore it can not
457  *	guarantee all CPU's that are in middle of receiving packets
458  *	will see the new offload handlers (until the next received packet).
459  */
460 void dev_add_offload(struct packet_offload *po)
461 {
462 	struct list_head *head = &offload_base;
463 
464 	spin_lock(&offload_lock);
465 	list_add_rcu(&po->list, head);
466 	spin_unlock(&offload_lock);
467 }
468 EXPORT_SYMBOL(dev_add_offload);
469 
470 /**
471  *	__dev_remove_offload	 - remove offload handler
472  *	@po: packet offload declaration
473  *
474  *	Remove a protocol offload handler that was previously added to the
475  *	kernel offload handlers by dev_add_offload(). The passed &offload_type
476  *	is removed from the kernel lists and can be freed or reused once this
477  *	function returns.
478  *
479  *      The packet type might still be in use by receivers
480  *	and must not be freed until after all the CPU's have gone
481  *	through a quiescent state.
482  */
483 void __dev_remove_offload(struct packet_offload *po)
484 {
485 	struct list_head *head = &offload_base;
486 	struct packet_offload *po1;
487 
488 	spin_lock(&offload_lock);
489 
490 	list_for_each_entry(po1, head, list) {
491 		if (po == po1) {
492 			list_del_rcu(&po->list);
493 			goto out;
494 		}
495 	}
496 
497 	pr_warn("dev_remove_offload: %p not found\n", po);
498 out:
499 	spin_unlock(&offload_lock);
500 }
501 EXPORT_SYMBOL(__dev_remove_offload);
502 
/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Unlink a packet offload handler that was previously registered with
 *	dev_add_offload() and wait with synchronize_net() afterwards.  Once
 *	this function returns, @po is off the kernel lists and can be
 *	freed or reused.
 *
 *	This call sleeps to guarantee that no CPU is looking at the offload
 *	handler after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	/* Unlink first ... */
	__dev_remove_offload(po);

	/* ... then wait until no receiver can still be using it. */
	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
522 
523 /******************************************************************************
524 
525 		      Device Boot-time Settings Routines
526 
527 *******************************************************************************/
528 
529 /* Boot time configuration table */
530 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
531 
532 /**
533  *	netdev_boot_setup_add	- add new setup entry
534  *	@name: name of the device
535  *	@map: configured settings for the device
536  *
537  *	Adds new setup entry to the dev_boot_setup list.  The function
538  *	returns 0 on error and 1 on success.  This is a generic routine to
539  *	all netdevices.
540  */
541 static int netdev_boot_setup_add(char *name, struct ifmap *map)
542 {
543 	struct netdev_boot_setup *s;
544 	int i;
545 
546 	s = dev_boot_setup;
547 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
548 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
549 			memset(s[i].name, 0, sizeof(s[i].name));
550 			strlcpy(s[i].name, name, IFNAMSIZ);
551 			memcpy(&s[i].map, map, sizeof(s[i].map));
552 			break;
553 		}
554 	}
555 
556 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
557 }
558 
559 /**
560  *	netdev_boot_setup_check	- check boot time settings
561  *	@dev: the netdevice
562  *
563  * 	Check boot time settings for the device.
564  *	The found settings are set for the device to be used
565  *	later in the device probing.
566  *	Returns 0 if no settings found, 1 if they are.
567  */
568 int netdev_boot_setup_check(struct net_device *dev)
569 {
570 	struct netdev_boot_setup *s = dev_boot_setup;
571 	int i;
572 
573 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
574 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
575 		    !strcmp(dev->name, s[i].name)) {
576 			dev->irq 	= s[i].map.irq;
577 			dev->base_addr 	= s[i].map.base_addr;
578 			dev->mem_start 	= s[i].map.mem_start;
579 			dev->mem_end 	= s[i].map.mem_end;
580 			return 1;
581 		}
582 	}
583 	return 0;
584 }
585 EXPORT_SYMBOL(netdev_boot_setup_check);
586 
587 
588 /**
589  *	netdev_boot_base	- get address from boot time settings
590  *	@prefix: prefix for network device
591  *	@unit: id for network device
592  *
593  * 	Check boot time settings for the base address of device.
594  *	The found settings are set for the device to be used
595  *	later in the device probing.
596  *	Returns 0 if no settings found.
597  */
598 unsigned long netdev_boot_base(const char *prefix, int unit)
599 {
600 	const struct netdev_boot_setup *s = dev_boot_setup;
601 	char name[IFNAMSIZ];
602 	int i;
603 
604 	sprintf(name, "%s%d", prefix, unit);
605 
606 	/*
607 	 * If device already registered then return base of 1
608 	 * to indicate not to probe for this interface
609 	 */
610 	if (__dev_get_by_name(&init_net, name))
611 		return 1;
612 
613 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
614 		if (!strcmp(name, s[i].name))
615 			return s[i].map.base_addr;
616 	return 0;
617 }
618 
619 /*
620  * Saves at boot time configured settings for any netdevice.
621  */
622 int __init netdev_boot_setup(char *str)
623 {
624 	int ints[5];
625 	struct ifmap map;
626 
627 	str = get_options(str, ARRAY_SIZE(ints), ints);
628 	if (!str || !*str)
629 		return 0;
630 
631 	/* Save settings */
632 	memset(&map, 0, sizeof(map));
633 	if (ints[0] > 0)
634 		map.irq = ints[1];
635 	if (ints[0] > 1)
636 		map.base_addr = ints[2];
637 	if (ints[0] > 2)
638 		map.mem_start = ints[3];
639 	if (ints[0] > 3)
640 		map.mem_end = ints[4];
641 
642 	/* Add new entry to the list */
643 	return netdev_boot_setup_add(str, &map);
644 }
645 
646 __setup("netdev=", netdev_boot_setup);
647 
648 /*******************************************************************************
649 
650 			    Device Interface Subroutines
651 
652 *******************************************************************************/
653 
654 /**
655  *	__dev_get_by_name	- find a device by its name
656  *	@net: the applicable net namespace
657  *	@name: name to find
658  *
659  *	Find an interface by name. Must be called under RTNL semaphore
660  *	or @dev_base_lock. If the name is found a pointer to the device
661  *	is returned. If the name is not found then %NULL is returned. The
662  *	reference counters are not incremented so the caller must be
663  *	careful with locks.
664  */
665 
666 struct net_device *__dev_get_by_name(struct net *net, const char *name)
667 {
668 	struct net_device *dev;
669 	struct hlist_head *head = dev_name_hash(net, name);
670 
671 	hlist_for_each_entry(dev, head, name_hlist)
672 		if (!strncmp(dev->name, name, IFNAMSIZ))
673 			return dev;
674 
675 	return NULL;
676 }
677 EXPORT_SYMBOL(__dev_get_by_name);
678 
679 /**
680  *	dev_get_by_name_rcu	- find a device by its name
681  *	@net: the applicable net namespace
682  *	@name: name to find
683  *
684  *	Find an interface by name.
685  *	If the name is found a pointer to the device is returned.
686  * 	If the name is not found then %NULL is returned.
687  *	The reference counters are not incremented so the caller must be
688  *	careful with locks. The caller must hold RCU lock.
689  */
690 
691 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
692 {
693 	struct net_device *dev;
694 	struct hlist_head *head = dev_name_hash(net, name);
695 
696 	hlist_for_each_entry_rcu(dev, head, name_hlist)
697 		if (!strncmp(dev->name, name, IFNAMSIZ))
698 			return dev;
699 
700 	return NULL;
701 }
702 EXPORT_SYMBOL(dev_get_by_name_rcu);
703 
704 /**
705  *	dev_get_by_name		- find a device by its name
706  *	@net: the applicable net namespace
707  *	@name: name to find
708  *
709  *	Find an interface by name. This can be called from any
710  *	context and does its own locking. The returned handle has
711  *	the usage count incremented and the caller must use dev_put() to
712  *	release it when it is no longer needed. %NULL is returned if no
713  *	matching device is found.
714  */
715 
716 struct net_device *dev_get_by_name(struct net *net, const char *name)
717 {
718 	struct net_device *dev;
719 
720 	rcu_read_lock();
721 	dev = dev_get_by_name_rcu(net, name);
722 	if (dev)
723 		dev_hold(dev);
724 	rcu_read_unlock();
725 	return dev;
726 }
727 EXPORT_SYMBOL(dev_get_by_name);
728 
729 /**
730  *	__dev_get_by_index - find a device by its ifindex
731  *	@net: the applicable net namespace
732  *	@ifindex: index of device
733  *
734  *	Search for an interface by index. Returns %NULL if the device
735  *	is not found or a pointer to the device. The device has not
736  *	had its reference counter increased so the caller must be careful
737  *	about locking. The caller must hold either the RTNL semaphore
738  *	or @dev_base_lock.
739  */
740 
741 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
742 {
743 	struct net_device *dev;
744 	struct hlist_head *head = dev_index_hash(net, ifindex);
745 
746 	hlist_for_each_entry(dev, head, index_hlist)
747 		if (dev->ifindex == ifindex)
748 			return dev;
749 
750 	return NULL;
751 }
752 EXPORT_SYMBOL(__dev_get_by_index);
753 
754 /**
755  *	dev_get_by_index_rcu - find a device by its ifindex
756  *	@net: the applicable net namespace
757  *	@ifindex: index of device
758  *
759  *	Search for an interface by index. Returns %NULL if the device
760  *	is not found or a pointer to the device. The device has not
761  *	had its reference counter increased so the caller must be careful
762  *	about locking. The caller must hold RCU lock.
763  */
764 
765 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
766 {
767 	struct net_device *dev;
768 	struct hlist_head *head = dev_index_hash(net, ifindex);
769 
770 	hlist_for_each_entry_rcu(dev, head, index_hlist)
771 		if (dev->ifindex == ifindex)
772 			return dev;
773 
774 	return NULL;
775 }
776 EXPORT_SYMBOL(dev_get_by_index_rcu);
777 
778 
779 /**
780  *	dev_get_by_index - find a device by its ifindex
781  *	@net: the applicable net namespace
782  *	@ifindex: index of device
783  *
784  *	Search for an interface by index. Returns NULL if the device
785  *	is not found or a pointer to the device. The device returned has
786  *	had a reference added and the pointer is safe until the user calls
787  *	dev_put to indicate they have finished with it.
788  */
789 
790 struct net_device *dev_get_by_index(struct net *net, int ifindex)
791 {
792 	struct net_device *dev;
793 
794 	rcu_read_lock();
795 	dev = dev_get_by_index_rcu(net, ifindex);
796 	if (dev)
797 		dev_hold(dev);
798 	rcu_read_unlock();
799 	return dev;
800 }
801 EXPORT_SYMBOL(dev_get_by_index);
802 
803 /**
804  *	netdev_get_name - get a netdevice name, knowing its ifindex.
805  *	@net: network namespace
806  *	@name: a pointer to the buffer where the name will be stored.
807  *	@ifindex: the ifindex of the interface to get the name from.
808  *
809  *	The use of raw_seqcount_begin() and cond_resched() before
810  *	retrying is required as we want to give the writers a chance
811  *	to complete when CONFIG_PREEMPT is not set.
812  */
813 int netdev_get_name(struct net *net, char *name, int ifindex)
814 {
815 	struct net_device *dev;
816 	unsigned int seq;
817 
818 retry:
819 	seq = raw_seqcount_begin(&devnet_rename_seq);
820 	rcu_read_lock();
821 	dev = dev_get_by_index_rcu(net, ifindex);
822 	if (!dev) {
823 		rcu_read_unlock();
824 		return -ENODEV;
825 	}
826 
827 	strcpy(name, dev->name);
828 	rcu_read_unlock();
829 	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
830 		cond_resched();
831 		goto retry;
832 	}
833 
834 	return 0;
835 }
836 
837 /**
838  *	dev_getbyhwaddr_rcu - find a device by its hardware address
839  *	@net: the applicable net namespace
840  *	@type: media type of device
841  *	@ha: hardware address
842  *
843  *	Search for an interface by MAC address. Returns NULL if the device
844  *	is not found or a pointer to the device.
845  *	The caller must hold RCU or RTNL.
846  *	The returned device has not had its ref count increased
847  *	and the caller must therefore be careful about locking
848  *
849  */
850 
851 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
852 				       const char *ha)
853 {
854 	struct net_device *dev;
855 
856 	for_each_netdev_rcu(net, dev)
857 		if (dev->type == type &&
858 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
859 			return dev;
860 
861 	return NULL;
862 }
863 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
864 
865 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
866 {
867 	struct net_device *dev;
868 
869 	ASSERT_RTNL();
870 	for_each_netdev(net, dev)
871 		if (dev->type == type)
872 			return dev;
873 
874 	return NULL;
875 }
876 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
877 
878 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
879 {
880 	struct net_device *dev, *ret = NULL;
881 
882 	rcu_read_lock();
883 	for_each_netdev_rcu(net, dev)
884 		if (dev->type == type) {
885 			dev_hold(dev);
886 			ret = dev;
887 			break;
888 		}
889 	rcu_read_unlock();
890 	return ret;
891 }
892 EXPORT_SYMBOL(dev_getfirstbyhwtype);
893 
894 /**
895  *	dev_get_by_flags_rcu - find any device with given flags
896  *	@net: the applicable net namespace
897  *	@if_flags: IFF_* values
898  *	@mask: bitmask of bits in if_flags to check
899  *
900  *	Search for any interface with the given flags. Returns NULL if a device
901  *	is not found or a pointer to the device. Must be called inside
902  *	rcu_read_lock(), and result refcount is unchanged.
903  */
904 
905 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
906 				    unsigned short mask)
907 {
908 	struct net_device *dev, *ret;
909 
910 	ret = NULL;
911 	for_each_netdev_rcu(net, dev) {
912 		if (((dev->flags ^ if_flags) & mask) == 0) {
913 			ret = dev;
914 			break;
915 		}
916 	}
917 	return ret;
918 }
919 EXPORT_SYMBOL(dev_get_by_flags_rcu);
920 
/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to allow sysfs
 *	to work.  We also disallow any kind of whitespace.
 *
 *	A name is rejected when it is empty, is "." or "..", does not fit
 *	in IFNAMSIZ bytes including the terminating NUL, or contains '/'
 *	or a whitespace character.
 *
 *	At most IFNAMSIZ bytes of @name are examined, so a buffer of that
 *	size which lacks a terminating NUL is rejected without reading
 *	beyond it.
 *
 *	Returns true if the name is acceptable, false otherwise.
 */
bool dev_valid_name(const char *name)
{
	size_t len;

	if (*name == '\0')
		return false;
	/* Each comparison stops at the first difference: at most 3 bytes */
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	for (len = 0; name[len] != '\0'; len++) {
		/* No room left for the terminating NUL: name is too long */
		if (len >= IFNAMSIZ - 1)
			return false;
		/* ctype helpers must not be handed a negative char value */
		if (name[len] == '/' || isspace((unsigned char)name[len]))
			return false;
	}
	return true;
}
945 EXPORT_SYMBOL(dev_valid_name);
946 
947 /**
948  *	__dev_alloc_name - allocate a name for a device
949  *	@net: network namespace to allocate the device name in
950  *	@name: name format string
951  *	@buf:  scratch buffer and result name string
952  *
953  *	Passed a format string - eg "lt%d" it will try and find a suitable
954  *	id. It scans list of devices to build up a free map, then chooses
955  *	the first empty slot. The caller must hold the dev_base or rtnl lock
956  *	while allocating the name and adding the device in order to avoid
957  *	duplicates.
958  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
959  *	Returns the number of the unit assigned or a negative errno code.
960  */
961 
962 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
963 {
964 	int i = 0;
965 	const char *p;
966 	const int max_netdevices = 8*PAGE_SIZE;
967 	unsigned long *inuse;
968 	struct net_device *d;
969 
970 	p = strnchr(name, IFNAMSIZ-1, '%');
971 	if (p) {
972 		/*
973 		 * Verify the string as this thing may have come from
974 		 * the user.  There must be either one "%d" and no other "%"
975 		 * characters.
976 		 */
977 		if (p[1] != 'd' || strchr(p + 2, '%'))
978 			return -EINVAL;
979 
980 		/* Use one page as a bit array of possible slots */
981 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
982 		if (!inuse)
983 			return -ENOMEM;
984 
985 		for_each_netdev(net, d) {
986 			if (!sscanf(d->name, name, &i))
987 				continue;
988 			if (i < 0 || i >= max_netdevices)
989 				continue;
990 
991 			/*  avoid cases where sscanf is not exact inverse of printf */
992 			snprintf(buf, IFNAMSIZ, name, i);
993 			if (!strncmp(buf, d->name, IFNAMSIZ))
994 				set_bit(i, inuse);
995 		}
996 
997 		i = find_first_zero_bit(inuse, max_netdevices);
998 		free_page((unsigned long) inuse);
999 	}
1000 
1001 	if (buf != name)
1002 		snprintf(buf, IFNAMSIZ, name, i);
1003 	if (!__dev_get_by_name(net, buf))
1004 		return i;
1005 
1006 	/* It is possible to run out of possible slots
1007 	 * when the name is long and there isn't enough space left
1008 	 * for the digits, or if all bits are used.
1009 	 */
1010 	return -ENFILE;
1011 }
1012 
1013 /**
1014  *	dev_alloc_name - allocate a name for a device
1015  *	@dev: device
1016  *	@name: name format string
1017  *
1018  *	Passed a format string - eg "lt%d" it will try and find a suitable
1019  *	id. It scans list of devices to build up a free map, then chooses
1020  *	the first empty slot. The caller must hold the dev_base or rtnl lock
1021  *	while allocating the name and adding the device in order to avoid
1022  *	duplicates.
1023  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1024  *	Returns the number of the unit assigned or a negative errno code.
1025  */
1026 
1027 int dev_alloc_name(struct net_device *dev, const char *name)
1028 {
1029 	char buf[IFNAMSIZ];
1030 	struct net *net;
1031 	int ret;
1032 
1033 	BUG_ON(!dev_net(dev));
1034 	net = dev_net(dev);
1035 	ret = __dev_alloc_name(net, name, buf);
1036 	if (ret >= 0)
1037 		strlcpy(dev->name, buf, IFNAMSIZ);
1038 	return ret;
1039 }
1040 EXPORT_SYMBOL(dev_alloc_name);
1041 
1042 static int dev_alloc_name_ns(struct net *net,
1043 			     struct net_device *dev,
1044 			     const char *name)
1045 {
1046 	char buf[IFNAMSIZ];
1047 	int ret;
1048 
1049 	ret = __dev_alloc_name(net, name, buf);
1050 	if (ret >= 0)
1051 		strlcpy(dev->name, buf, IFNAMSIZ);
1052 	return ret;
1053 }
1054 
1055 static int dev_get_valid_name(struct net *net,
1056 			      struct net_device *dev,
1057 			      const char *name)
1058 {
1059 	BUG_ON(!net);
1060 
1061 	if (!dev_valid_name(name))
1062 		return -EINVAL;
1063 
1064 	if (strchr(name, '%'))
1065 		return dev_alloc_name_ns(net, dev, name);
1066 	else if (__dev_get_by_name(net, name))
1067 		return -EEXIST;
1068 	else if (dev->name != name)
1069 		strlcpy(dev->name, name, IFNAMSIZ);
1070 
1071 	return 0;
1072 }
1073 
1074 /**
1075  *	dev_change_name - change name of a device
1076  *	@dev: device
1077  *	@newname: name (or format string) must be at least IFNAMSIZ
1078  *
1079  *	Change name of a device, can pass format strings "eth%d".
1080  *	for wildcarding.
1081  */
1082 int dev_change_name(struct net_device *dev, const char *newname)
1083 {
1084 	char oldname[IFNAMSIZ];
1085 	int err = 0;
1086 	int ret;
1087 	struct net *net;
1088 
1089 	ASSERT_RTNL();
1090 	BUG_ON(!dev_net(dev));
1091 
1092 	net = dev_net(dev);
1093 	if (dev->flags & IFF_UP)
1094 		return -EBUSY;
1095 
1096 	write_seqcount_begin(&devnet_rename_seq);
1097 
1098 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1099 		write_seqcount_end(&devnet_rename_seq);
1100 		return 0;
1101 	}
1102 
1103 	memcpy(oldname, dev->name, IFNAMSIZ);
1104 
1105 	err = dev_get_valid_name(net, dev, newname);
1106 	if (err < 0) {
1107 		write_seqcount_end(&devnet_rename_seq);
1108 		return err;
1109 	}
1110 
1111 rollback:
1112 	ret = device_rename(&dev->dev, dev->name);
1113 	if (ret) {
1114 		memcpy(dev->name, oldname, IFNAMSIZ);
1115 		write_seqcount_end(&devnet_rename_seq);
1116 		return ret;
1117 	}
1118 
1119 	write_seqcount_end(&devnet_rename_seq);
1120 
1121 	write_lock_bh(&dev_base_lock);
1122 	hlist_del_rcu(&dev->name_hlist);
1123 	write_unlock_bh(&dev_base_lock);
1124 
1125 	synchronize_rcu();
1126 
1127 	write_lock_bh(&dev_base_lock);
1128 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1129 	write_unlock_bh(&dev_base_lock);
1130 
1131 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1132 	ret = notifier_to_errno(ret);
1133 
1134 	if (ret) {
1135 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1136 		if (err >= 0) {
1137 			err = ret;
1138 			write_seqcount_begin(&devnet_rename_seq);
1139 			memcpy(dev->name, oldname, IFNAMSIZ);
1140 			goto rollback;
1141 		} else {
1142 			pr_err("%s: name change rollback failed: %d\n",
1143 			       dev->name, ret);
1144 		}
1145 	}
1146 
1147 	return err;
1148 }
1149 
1150 /**
1151  *	dev_set_alias - change ifalias of a device
1152  *	@dev: device
1153  *	@alias: name up to IFALIASZ
1154  *	@len: limit of bytes to copy from info
1155  *
1156  *	Set ifalias for a device,
1157  */
1158 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1159 {
1160 	char *new_ifalias;
1161 
1162 	ASSERT_RTNL();
1163 
1164 	if (len >= IFALIASZ)
1165 		return -EINVAL;
1166 
1167 	if (!len) {
1168 		kfree(dev->ifalias);
1169 		dev->ifalias = NULL;
1170 		return 0;
1171 	}
1172 
1173 	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1174 	if (!new_ifalias)
1175 		return -ENOMEM;
1176 	dev->ifalias = new_ifalias;
1177 
1178 	strlcpy(dev->ifalias, alias, len+1);
1179 	return len;
1180 }
1181 
1182 
1183 /**
1184  *	netdev_features_change - announce that a device's features changed
1185  *	@dev: device the notification is raised for
1186  *
1187  *	Raises NETDEV_FEAT_CHANGE for @dev; the notifier result is discarded.
1188  */
1189 void netdev_features_change(struct net_device *dev)
1190 {
1191 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1192 }
1193 EXPORT_SYMBOL(netdev_features_change);
1194 
1195 /**
1196  *	netdev_state_change - device changes state
1197  *	@dev: device to cause notification
1198  *
1199  *	Called to indicate a device has changed state. This function calls
1200  *	the notifier chains for netdev_chain and sends a NEWLINK message
1201  *	to the routing socket.
1202  */
1203 void netdev_state_change(struct net_device *dev)
1204 {
1205 	if (dev->flags & IFF_UP) {
1206 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1207 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1208 	}
1209 }
1210 EXPORT_SYMBOL(netdev_state_change);
1211 
1212 /**
1213  * 	netdev_notify_peers - notify network peers about existence of @dev
1214  * 	@dev: network device
1215  *
1216  * Raises NETDEV_NOTIFY_PEERS on the netdevice notifier chain so that
1217  * listeners can generate traffic (such as a gratuitous ARP) that makes
1218  * interested network peers aware of @dev, e.g. after a failover event
1219  * or virtual machine migration.  The RTNL lock is taken and released
1220  * here, around the notification.
1221  */
1222 void netdev_notify_peers(struct net_device *dev)
1223 {
1224 	rtnl_lock();
1225 	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1226 	rtnl_unlock();
1227 }
1228 EXPORT_SYMBOL(netdev_notify_peers);
1229 
1230 static int __dev_open(struct net_device *dev)
1231 {
1232 	const struct net_device_ops *ops = dev->netdev_ops;
1233 	int ret;
1234 
1235 	ASSERT_RTNL();
1236 
1237 	if (!netif_device_present(dev))
1238 		return -ENODEV;
1239 
1240 	/* Block netpoll from trying to do any rx path servicing.
1241 	 * If we don't do this there is a chance ndo_poll_controller
1242 	 * or ndo_poll may be running while we open the device
1243 	 */
1244 	netpoll_rx_disable(dev);
1245 
1246 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1247 	ret = notifier_to_errno(ret);
1248 	if (ret)
1249 		return ret;
1250 
1251 	set_bit(__LINK_STATE_START, &dev->state);
1252 
1253 	if (ops->ndo_validate_addr)
1254 		ret = ops->ndo_validate_addr(dev);
1255 
1256 	if (!ret && ops->ndo_open)
1257 		ret = ops->ndo_open(dev);
1258 
1259 	netpoll_rx_enable(dev);
1260 
1261 	if (ret)
1262 		clear_bit(__LINK_STATE_START, &dev->state);
1263 	else {
1264 		dev->flags |= IFF_UP;
1265 		net_dmaengine_get();
1266 		dev_set_rx_mode(dev);
1267 		dev_activate(dev);
1268 		add_device_randomness(dev->dev_addr, dev->addr_len);
1269 	}
1270 
1271 	return ret;
1272 }
1273 
1274 /**
1275  *	dev_open	- prepare an interface for use.
1276  *	@dev:	device to open
1277  *
1278  *	Takes a device from down to up state. The device's private open
1279  *	function is invoked and then the multicast lists are loaded. Finally
1280  *	the device is moved into the up state and a %NETDEV_UP message is
1281  *	sent to the netdev notifier chain.
1282  *
1283  *	Calling this function on an active interface is a nop. On a failure
1284  *	a negative errno code is returned.
1285  */
1286 int dev_open(struct net_device *dev)
1287 {
1288 	int ret;
1289 
1290 	if (dev->flags & IFF_UP)
1291 		return 0;
1292 
1293 	ret = __dev_open(dev);
1294 	if (ret < 0)
1295 		return ret;
1296 
1297 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1298 	call_netdevice_notifiers(NETDEV_UP, dev);
1299 
1300 	return ret;
1301 }
1302 EXPORT_SYMBOL(dev_open);
1303 
1304 static int __dev_close_many(struct list_head *head)
1305 {
1306 	struct net_device *dev;
1307 
1308 	ASSERT_RTNL();
1309 	might_sleep();
1310 
1311 	list_for_each_entry(dev, head, close_list) {
1312 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1313 
1314 		clear_bit(__LINK_STATE_START, &dev->state);
1315 
1316 		/* Synchronize to scheduled poll. We cannot touch poll list, it
1317 		 * can be even on different cpu. So just clear netif_running().
1318 		 *
1319 		 * dev->stop() will invoke napi_disable() on all of it's
1320 		 * napi_struct instances on this device.
1321 		 */
1322 		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1323 	}
1324 
1325 	dev_deactivate_many(head);
1326 
1327 	list_for_each_entry(dev, head, close_list) {
1328 		const struct net_device_ops *ops = dev->netdev_ops;
1329 
1330 		/*
1331 		 *	Call the device specific close. This cannot fail.
1332 		 *	Only if device is UP
1333 		 *
1334 		 *	We allow it to be called even after a DETACH hot-plug
1335 		 *	event.
1336 		 */
1337 		if (ops->ndo_stop)
1338 			ops->ndo_stop(dev);
1339 
1340 		dev->flags &= ~IFF_UP;
1341 		net_dmaengine_put();
1342 	}
1343 
1344 	return 0;
1345 }
1346 
1347 static int __dev_close(struct net_device *dev)
1348 {
1349 	int retval;
1350 	LIST_HEAD(single);
1351 
1352 	/* Temporarily disable netpoll until the interface is down */
1353 	netpoll_rx_disable(dev);
1354 
1355 	list_add(&dev->close_list, &single);
1356 	retval = __dev_close_many(&single);
1357 	list_del(&single);
1358 
1359 	netpoll_rx_enable(dev);
1360 	return retval;
1361 }
1362 
1363 static int dev_close_many(struct list_head *head)
1364 {
1365 	struct net_device *dev, *tmp;
1366 
1367 	/* Remove the devices that don't need to be closed */
1368 	list_for_each_entry_safe(dev, tmp, head, close_list)
1369 		if (!(dev->flags & IFF_UP))
1370 			list_del_init(&dev->close_list);
1371 
1372 	__dev_close_many(head);
1373 
1374 	list_for_each_entry_safe(dev, tmp, head, close_list) {
1375 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1377 		list_del_init(&dev->close_list);
1378 	}
1379 
1380 	return 0;
1381 }
1382 
1383 /**
1384  *	dev_close - shutdown an interface.
1385  *	@dev: device to shutdown
1386  *
1387  *	This function moves an active device into down state. A
1388  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1389  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1390  *	chain.
1391  */
1392 int dev_close(struct net_device *dev)
1393 {
1394 	if (dev->flags & IFF_UP) {
1395 		LIST_HEAD(single);
1396 
1397 		/* Block netpoll rx while the interface is going down */
1398 		netpoll_rx_disable(dev);
1399 
1400 		list_add(&dev->close_list, &single);
1401 		dev_close_many(&single);
1402 		list_del(&single);
1403 
1404 		netpoll_rx_enable(dev);
1405 	}
1406 	return 0;
1407 }
1408 EXPORT_SYMBOL(dev_close);
1409 
1410 
1411 /**
1412  *	dev_disable_lro - disable Large Receive Offload on a device
1413  *	@dev: device
1414  *
1415  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1416  *	called under RTNL.  This is needed if received packets may be
1417  *	forwarded to another interface.
1418  */
1419 void dev_disable_lro(struct net_device *dev)
1420 {
1421 	/*
1422 	 * If we're trying to disable lro on a vlan device
1423 	 * use the underlying physical device instead
1424 	 */
1425 	if (is_vlan_dev(dev))
1426 		dev = vlan_dev_real_dev(dev);
1427 
1428 	/* the same for macvlan devices */
1429 	if (netif_is_macvlan(dev))
1430 		dev = macvlan_dev_real_dev(dev);
1431 
1432 	dev->wanted_features &= ~NETIF_F_LRO;
1433 	netdev_update_features(dev);
1434 
1435 	if (unlikely(dev->features & NETIF_F_LRO))
1436 		netdev_WARN(dev, "failed to disable LRO!\n");
1437 }
1438 EXPORT_SYMBOL(dev_disable_lro);
1439 
/*
 * Deliver event @val for @dev to the single notifier block @nb (not to
 * the whole chain), wrapping @dev in an on-stack netdev_notifier_info.
 * Returns whatever @nb's notifier_call returns.
 */
1440 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1441 				   struct net_device *dev)
1442 {
1443 	struct netdev_notifier_info info;
1444 
1445 	netdev_notifier_info_init(&info, dev);
1446 	return nb->notifier_call(nb, val, &info);
1447 }
1448 
/*
 * While non-zero, register_netdevice_notifier() only links the notifier
 * and skips replaying REGISTER/UP events for existing devices.  Starts
 * at 1; where it is cleared is outside this excerpt.
 */
1449 static int dev_boot_phase = 1;
1450 
1451 /**
1452  *	register_netdevice_notifier - register a network notifier block
1453  *	@nb: notifier
1454  *
1455  *	Register a notifier to be called when network device events occur.
1456  *	The notifier passed is linked into the kernel structures and must
1457  *	not be reused until it has been unregistered. A negative errno code
1458  *	is returned on a failure.
1459  *
1460  * 	When registered all registration and up events are replayed
1461  *	to the new notifier to allow device to have a race free
1462  *	view of the network device list.
1463  */
1464 
1465 int register_netdevice_notifier(struct notifier_block *nb)
1466 {
1467 	struct net_device *dev;
1468 	struct net_device *last;
1469 	struct net *net;
1470 	int err;
1471 
1472 	rtnl_lock();
1473 	err = raw_notifier_chain_register(&netdev_chain, nb);
1474 	if (err)
1475 		goto unlock;
1476 	if (dev_boot_phase)
1477 		goto unlock;
1478 	for_each_net(net) {
1479 		for_each_netdev(net, dev) {
1480 			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1481 			err = notifier_to_errno(err);
1482 			if (err)
1483 				goto rollback;
1484 
1485 			if (!(dev->flags & IFF_UP))
1486 				continue;
1487 
1488 			call_netdevice_notifier(nb, NETDEV_UP, dev);
1489 		}
1490 	}
1491 
1492 unlock:
1493 	rtnl_unlock();
1494 	return err;
1495 
1496 rollback:
1497 	last = dev;
1498 	for_each_net(net) {
1499 		for_each_netdev(net, dev) {
1500 			if (dev == last)
1501 				goto outroll;
1502 
1503 			if (dev->flags & IFF_UP) {
1504 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1505 							dev);
1506 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1507 			}
1508 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1509 		}
1510 	}
1511 
1512 outroll:
1513 	raw_notifier_chain_unregister(&netdev_chain, nb);
1514 	goto unlock;
1515 }
1516 EXPORT_SYMBOL(register_netdevice_notifier);
1517 
1518 /**
1519  *	unregister_netdevice_notifier - unregister a network notifier block
1520  *	@nb: notifier
1521  *
1522  *	Unregister a notifier previously registered by
1523  *	register_netdevice_notifier(). The notifier is unlinked into the
1524  *	kernel structures and may then be reused. A negative errno code
1525  *	is returned on a failure.
1526  *
1527  * 	After unregistering unregister and down device events are synthesized
1528  *	for all devices on the device list to the removed notifier to remove
1529  *	the need for special case cleanup code.
1530  */
1531 
1532 int unregister_netdevice_notifier(struct notifier_block *nb)
1533 {
1534 	struct net_device *dev;
1535 	struct net *net;
1536 	int err;
1537 
1538 	rtnl_lock();
1539 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1540 	if (err)
1541 		goto unlock;
1542 
1543 	for_each_net(net) {
1544 		for_each_netdev(net, dev) {
1545 			if (dev->flags & IFF_UP) {
1546 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1547 							dev);
1548 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1549 			}
1550 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1551 		}
1552 	}
1553 unlock:
1554 	rtnl_unlock();
1555 	return err;
1556 }
1557 EXPORT_SYMBOL(unregister_netdevice_notifier);
1558 
1559 /**
1560  *	call_netdevice_notifiers_info - call all network notifier blocks
1561  *	@val: value passed unmodified to notifier function
1562  *	@dev: net_device pointer passed unmodified to notifier function
1563  *	@info: caller-provided notifier info; initialised here for @dev
1564  *
1565  *	Must be called under RTNL.  Runs every block on netdev_chain with
1566  *	@val and @info; returns what raw_notifier_call_chain() returns.
1567  */
1568 
1569 int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev,
1570 				  struct netdev_notifier_info *info)
1571 {
1572 	ASSERT_RTNL();
1573 	netdev_notifier_info_init(info, dev);
1574 	return raw_notifier_call_chain(&netdev_chain, val, info);
1575 }
1576 EXPORT_SYMBOL(call_netdevice_notifiers_info);
1577 
1578 /**
1579  *	call_netdevice_notifiers - call all network notifier blocks
1580  *      @val: value passed unmodified to notifier function
1581  *      @dev: net_device pointer passed unmodified to notifier function
1582  *
1583  *	Convenience form of call_netdevice_notifiers_info() that supplies
1584  *	an on-stack netdev_notifier_info; returns that function's result.
1585  */
1586 
1587 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1588 {
1589 	struct netdev_notifier_info info;
1590 
1591 	return call_netdevice_notifiers_info(val, dev, &info);
1592 }
1593 EXPORT_SYMBOL(call_netdevice_notifiers);
1594 
/* Static key tested by net_timestamp_set()/net_timestamp_check() before stamping */
1595 static struct static_key netstamp_needed __read_mostly;
1596 #ifdef HAVE_JUMP_LABEL
1597 /* We are not allowed to call static_key_slow_dec() from irq context
1598  * If net_disable_timestamp() is called from irq context, defer the
1599  * static_key_slow_dec() calls; net_enable_timestamp() settles them later.
1600  */
1601 static atomic_t netstamp_needed_deferred;
1602 #endif
1603 
/*
 * Take one reference on the netstamp_needed static key.
 *
 * With HAVE_JUMP_LABEL, any decrements that net_disable_timestamp()
 * deferred are collected first: this enable cancels one of them, the
 * remaining (deferred - 1) are applied now, and no increment is done.
 */
1604 void net_enable_timestamp(void)
1605 {
1606 #ifdef HAVE_JUMP_LABEL
	/* fetch and zero the deferred-decrement count in one step */
1607 	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1608 
1609 	if (deferred) {
		/* pre-decrement: one deferred dec is offset by this enable */
1610 		while (--deferred)
1611 			static_key_slow_dec(&netstamp_needed);
1612 		return;
1613 	}
1614 #endif
1615 	static_key_slow_inc(&netstamp_needed);
1616 }
1617 EXPORT_SYMBOL(net_enable_timestamp);
1618 
/*
 * Drop one reference on the netstamp_needed static key.
 *
 * With HAVE_JUMP_LABEL, a call made in interrupt context only bumps
 * netstamp_needed_deferred; the decrement itself is left for a later
 * net_enable_timestamp() call.
 */
1619 void net_disable_timestamp(void)
1620 {
1621 #ifdef HAVE_JUMP_LABEL
1622 	if (in_interrupt()) {
1623 		atomic_inc(&netstamp_needed_deferred);
1624 		return;
1625 	}
1626 #endif
1627 	static_key_slow_dec(&netstamp_needed);
1628 }
1629 EXPORT_SYMBOL(net_disable_timestamp);
1630 
/*
 * Reset @skb's timestamp to zero, then stamp it via __net_timestamp()
 * if the netstamp_needed static key is enabled.
 */
1631 static inline void net_timestamp_set(struct sk_buff *skb)
1632 {
1633 	skb->tstamp.tv64 = 0;
1634 	if (static_key_false(&netstamp_needed))
1635 		__net_timestamp(skb);
1636 }
1637 
/*
 * net_timestamp_check - stamp @SKB if @COND holds and it has no stamp yet
 *
 * Does nothing unless the netstamp_needed static key is enabled.  When
 * it is, @SKB is stamped via __net_timestamp() only if @COND is true
 * and @SKB's tstamp.tv64 is still zero.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement
 * and cannot capture an "else" that follows the macro call.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1643 
1644 static inline bool is_skb_forwardable(struct net_device *dev,
1645 				      struct sk_buff *skb)
1646 {
1647 	unsigned int len;
1648 
1649 	if (!(dev->flags & IFF_UP))
1650 		return false;
1651 
1652 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1653 	if (skb->len <= len)
1654 		return true;
1655 
1656 	/* if TSO is enabled, we don't care about the length as the packet
1657 	 * could be forwarded without being segmented before
1658 	 */
1659 	if (skb_is_gso(skb))
1660 		return true;
1661 
1662 	return false;
1663 }
1664 
1665 /**
1666  * dev_forward_skb - loopback an skb to another netif
1667  *
1668  * @dev: destination network device
1669  * @skb: buffer to forward
1670  *
1671  * return values:
1672  *	NET_RX_SUCCESS	(no congestion)
1673  *	NET_RX_DROP     (packet was dropped, but freed)
1674  *
1675  * dev_forward_skb can be used for injecting an skb from the
1676  * start_xmit function of one device into the receive queue
1677  * of another device.
1678  *
1679  * The receiving device may be in another namespace, so
1680  * we have to clear all information in the skb that could
1681  * impact namespace isolation.
1682  */
1683 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1684 {
1685 	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1686 		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1687 			atomic_long_inc(&dev->rx_dropped);
1688 			kfree_skb(skb);
1689 			return NET_RX_DROP;
1690 		}
1691 	}
1692 
1693 	if (unlikely(!is_skb_forwardable(dev, skb))) {
1694 		atomic_long_inc(&dev->rx_dropped);
1695 		kfree_skb(skb);
1696 		return NET_RX_DROP;
1697 	}
1698 
1699 	skb_scrub_packet(skb, true);
1700 	skb->protocol = eth_type_trans(skb, dev);
1701 
1702 	return netif_rx(skb);
1703 }
1704 EXPORT_SYMBOL_GPL(dev_forward_skb);
1705 
/*
 * Hand @skb to the handler of @pt_prev while keeping it for the caller.
 *
 * Returns -ENOMEM if skb_orphan_frags() fails; otherwise takes an extra
 * reference on @skb (skb->users) and returns the handler's result.
 */
1706 static inline int deliver_skb(struct sk_buff *skb,
1707 			      struct packet_type *pt_prev,
1708 			      struct net_device *orig_dev)
1709 {
1710 	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1711 		return -ENOMEM;
1712 	atomic_inc(&skb->users);
1713 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1714 }
1715 
1716 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1717 {
1718 	if (!ptype->af_packet_priv || !skb->sk)
1719 		return false;
1720 
1721 	if (ptype->id_match)
1722 		return ptype->id_match(ptype, skb->sk);
1723 	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1724 		return true;
1725 
1726 	return false;
1727 }
1728 
1729 /*
1730  *	Support routine. Sends outgoing frames to any network
1731  *	taps currently in use.
1732  */
1733 
1734 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1735 {
1736 	struct packet_type *ptype;
1737 	struct sk_buff *skb2 = NULL;
1738 	struct packet_type *pt_prev = NULL;
1739 
1740 	rcu_read_lock();
1741 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1742 		/* Never send packets back to the socket
1743 		 * they originated from - MvS (miquels@drinkel.ow.org)
1744 		 */
1745 		if ((ptype->dev == dev || !ptype->dev) &&
1746 		    (!skb_loop_sk(ptype, skb))) {
1747 			if (pt_prev) {
1748 				deliver_skb(skb2, pt_prev, skb->dev);
1749 				pt_prev = ptype;
1750 				continue;
1751 			}
1752 
1753 			skb2 = skb_clone(skb, GFP_ATOMIC);
1754 			if (!skb2)
1755 				break;
1756 
1757 			net_timestamp_set(skb2);
1758 
1759 			/* skb->nh should be correctly
1760 			   set by sender, so that the second statement is
1761 			   just protection against buggy protocols.
1762 			 */
1763 			skb_reset_mac_header(skb2);
1764 
1765 			if (skb_network_header(skb2) < skb2->data ||
1766 			    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1767 				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1768 						     ntohs(skb2->protocol),
1769 						     dev->name);
1770 				skb_reset_network_header(skb2);
1771 			}
1772 
1773 			skb2->transport_header = skb2->network_header;
1774 			skb2->pkt_type = PACKET_OUTGOING;
1775 			pt_prev = ptype;
1776 		}
1777 	}
1778 	if (pt_prev)
1779 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1780 	rcu_read_unlock();
1781 }
1782 
1783 /**
1784  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1785  * @dev: Network device
1786  * @txq: number of queues available
1787  *
1788  * If real_num_tx_queues is changed the tc mappings may no longer be
1789  * valid. To resolve this verify the tc mapping remains valid and if
1790  * not NULL the mapping. With no priorities mapping to this
1791  * offset/count pair it will no longer be used. In the worst case TC0
1792  * is invalid nothing can be done so disable priority mappings. If is
1793  * expected that drivers will fix this mapping if they can before
1794  * calling netif_set_real_num_tx_queues.
1795  */
1796 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1797 {
1798 	int i;
1799 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1800 
1801 	/* If TC0 is invalidated disable TC mapping */
1802 	if (tc->offset + tc->count > txq) {
1803 		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1804 		dev->num_tc = 0;
1805 		return;
1806 	}
1807 
1808 	/* Invalidated prio to tc mappings set to TC0 */
1809 	for (i = 1; i < TC_BITMASK + 1; i++) {
1810 		int q = netdev_get_prio_tc_map(dev, i);
1811 
1812 		tc = &dev->tc_to_txq[q];
1813 		if (tc->offset + tc->count > txq) {
1814 			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1815 				i, q);
1816 			netdev_set_prio_tc_map(dev, i, 0);
1817 		}
1818 	}
1819 }
1820 
1821 #ifdef CONFIG_XPS
/* Held by the XPS map update paths in this file while they modify the maps */
1822 static DEFINE_MUTEX(xps_map_mutex);
/* Dereference an XPS map pointer; lockdep checks that xps_map_mutex is held */
1823 #define xmap_dereference(P)		\
1824 	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1825 
1826 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1827 					int cpu, u16 index)
1828 {
1829 	struct xps_map *map = NULL;
1830 	int pos;
1831 
1832 	if (dev_maps)
1833 		map = xmap_dereference(dev_maps->cpu_map[cpu]);
1834 
1835 	for (pos = 0; map && pos < map->len; pos++) {
1836 		if (map->queues[pos] == index) {
1837 			if (map->len > 1) {
1838 				map->queues[pos] = map->queues[--map->len];
1839 			} else {
1840 				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1841 				kfree_rcu(map, rcu);
1842 				map = NULL;
1843 			}
1844 			break;
1845 		}
1846 	}
1847 
1848 	return map;
1849 }
1850 
1851 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1852 {
1853 	struct xps_dev_maps *dev_maps;
1854 	int cpu, i;
1855 	bool active = false;
1856 
1857 	mutex_lock(&xps_map_mutex);
1858 	dev_maps = xmap_dereference(dev->xps_maps);
1859 
1860 	if (!dev_maps)
1861 		goto out_no_maps;
1862 
1863 	for_each_possible_cpu(cpu) {
1864 		for (i = index; i < dev->num_tx_queues; i++) {
1865 			if (!remove_xps_queue(dev_maps, cpu, i))
1866 				break;
1867 		}
1868 		if (i == dev->num_tx_queues)
1869 			active = true;
1870 	}
1871 
1872 	if (!active) {
1873 		RCU_INIT_POINTER(dev->xps_maps, NULL);
1874 		kfree_rcu(dev_maps, rcu);
1875 	}
1876 
1877 	for (i = index; i < dev->num_tx_queues; i++)
1878 		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1879 					     NUMA_NO_NODE);
1880 
1881 out_no_maps:
1882 	mutex_unlock(&xps_map_mutex);
1883 }
1884 
1885 static struct xps_map *expand_xps_map(struct xps_map *map,
1886 				      int cpu, u16 index)
1887 {
1888 	struct xps_map *new_map;
1889 	int alloc_len = XPS_MIN_MAP_ALLOC;
1890 	int i, pos;
1891 
1892 	for (pos = 0; map && pos < map->len; pos++) {
1893 		if (map->queues[pos] != index)
1894 			continue;
1895 		return map;
1896 	}
1897 
1898 	/* Need to add queue to this CPU's existing map */
1899 	if (map) {
1900 		if (pos < map->alloc_len)
1901 			return map;
1902 
1903 		alloc_len = map->alloc_len * 2;
1904 	}
1905 
1906 	/* Need to allocate new map to store queue on this CPU's map */
1907 	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1908 			       cpu_to_node(cpu));
1909 	if (!new_map)
1910 		return NULL;
1911 
1912 	for (i = 0; i < pos; i++)
1913 		new_map->queues[i] = map->queues[i];
1914 	new_map->alloc_len = alloc_len;
1915 	new_map->len = pos;
1916 
1917 	return new_map;
1918 }
1919 
/*
 * netif_set_xps_queue - set the CPUs that map to tx queue @index
 * @dev: device owning the queue
 * @mask: CPUs that should use the queue; only online CPUs are acted on
 * @index: tx queue index
 *
 * Runs entirely under xps_map_mutex.  A new xps_dev_maps is built for
 * the online CPUs in @mask (reusing or growing their existing per-CPU
 * maps), @index is added to those maps, all other CPUs inherit their
 * old map pointer, and the result is published with
 * rcu_assign_pointer().  Replaced maps are released with kfree_rcu().
 * Afterwards @index is removed from every CPU not selected by @mask,
 * and the device map is dropped altogether if nothing stays active.
 *
 * Returns 0 on success or -ENOMEM if an allocation fails, in which
 * case maps allocated by this call are freed again.
 */
1920 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1921 			u16 index)
1922 {
1923 	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1924 	struct xps_map *map, *new_map;
1925 	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
	/* numa_node_id: -2 = no CPU seen yet, -1 = CPUs span several nodes */
1926 	int cpu, numa_node_id = -2;
1927 	bool active = false;
1928 
1929 	mutex_lock(&xps_map_mutex);
1930 
1931 	dev_maps = xmap_dereference(dev->xps_maps);
1932 
1933 	/* allocate memory for queue storage */
1934 	for_each_online_cpu(cpu) {
1935 		if (!cpumask_test_cpu(cpu, mask))
1936 			continue;
1937 
		/* the new device map is allocated lazily, on the first selected CPU */
1938 		if (!new_dev_maps)
1939 			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1940 		if (!new_dev_maps) {
1941 			mutex_unlock(&xps_map_mutex);
1942 			return -ENOMEM;
1943 		}
1944 
1945 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1946 				 NULL;
1947 
		/* may hand back the old map unchanged or a newly allocated one */
1948 		map = expand_xps_map(map, cpu, index);
1949 		if (!map)
1950 			goto error;
1951 
1952 		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1953 	}
1954 
	/* no online CPU was selected by the mask: nothing new to publish */
1955 	if (!new_dev_maps)
1956 		goto out_no_new_maps;
1957 
1958 	for_each_possible_cpu(cpu) {
1959 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1960 			/* add queue to CPU maps */
1961 			int pos = 0;
1962 
1963 			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1964 			while ((pos < map->len) && (map->queues[pos] != index))
1965 				pos++;
1966 
1967 			if (pos == map->len)
1968 				map->queues[map->len++] = index;
1969 #ifdef CONFIG_NUMA
1970 			if (numa_node_id == -2)
1971 				numa_node_id = cpu_to_node(cpu);
1972 			else if (numa_node_id != cpu_to_node(cpu))
1973 				numa_node_id = -1;
1974 #endif
1975 		} else if (dev_maps) {
1976 			/* fill in the new device map from the old device map */
1977 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
1978 			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1979 		}
1980 
1981 	}
1982 
	/* publish the fully built map */
1983 	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1984 
1985 	/* Cleanup old maps */
1986 	if (dev_maps) {
1987 		for_each_possible_cpu(cpu) {
1988 			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1989 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			/* only per-CPU maps that were replaced are released */
1990 			if (map && map != new_map)
1991 				kfree_rcu(map, rcu);
1992 		}
1993 
1994 		kfree_rcu(dev_maps, rcu);
1995 	}
1996 
1997 	dev_maps = new_dev_maps;
1998 	active = true;
1999 
2000 out_no_new_maps:
2001 	/* update Tx queue numa node */
2002 	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2003 				     (numa_node_id >= 0) ? numa_node_id :
2004 				     NUMA_NO_NODE);
2005 
2006 	if (!dev_maps)
2007 		goto out_no_maps;
2008 
2009 	/* removes queue from unused CPUs */
2010 	for_each_possible_cpu(cpu) {
2011 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2012 			continue;
2013 
2014 		if (remove_xps_queue(dev_maps, cpu, index))
2015 			active = true;
2016 	}
2017 
2018 	/* free map if not active */
2019 	if (!active) {
2020 		RCU_INIT_POINTER(dev->xps_maps, NULL);
2021 		kfree_rcu(dev_maps, rcu);
2022 	}
2023 
2024 out_no_maps:
2025 	mutex_unlock(&xps_map_mutex);
2026 
2027 	return 0;
2028 error:
2029 	/* remove any maps that we added */
2030 	for_each_possible_cpu(cpu) {
2031 		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2032 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2033 				 NULL;
		/* maps shared with the old device map must stay alive */
2034 		if (new_map && new_map != map)
2035 			kfree(new_map);
2036 	}
2037 
2038 	mutex_unlock(&xps_map_mutex);
2039 
2040 	kfree(new_dev_maps);
2041 	return -ENOMEM;
2042 }
2043 EXPORT_SYMBOL(netif_set_xps_queue);
2044 
2045 #endif
2046 /*
2047  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2048  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2049  */
2050 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2051 {
2052 	int rc;
2053 
2054 	if (txq < 1 || txq > dev->num_tx_queues)
2055 		return -EINVAL;
2056 
2057 	if (dev->reg_state == NETREG_REGISTERED ||
2058 	    dev->reg_state == NETREG_UNREGISTERING) {
2059 		ASSERT_RTNL();
2060 
2061 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2062 						  txq);
2063 		if (rc)
2064 			return rc;
2065 
2066 		if (dev->num_tc)
2067 			netif_setup_tc(dev, txq);
2068 
2069 		if (txq < dev->real_num_tx_queues) {
2070 			qdisc_reset_all_tx_gt(dev, txq);
2071 #ifdef CONFIG_XPS
2072 			netif_reset_xps_queues_gt(dev, txq);
2073 #endif
2074 		}
2075 	}
2076 
2077 	dev->real_num_tx_queues = txq;
2078 	return 0;
2079 }
2080 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2081 
2082 #ifdef CONFIG_RPS
2083 /**
2084  *	netif_set_real_num_rx_queues - set actual number of RX queues used
2085  *	@dev: Network device
2086  *	@rxq: Actual number of RX queues
2087  *
2088  *	This must be called either with the rtnl_lock held or before
2089  *	registration of the net device.  Returns 0 on success, or a
2090  *	negative error code.  If called before registration, it always
2091  *	succeeds.
2092  */
2093 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2094 {
2095 	int rc;
2096 
2097 	if (rxq < 1 || rxq > dev->num_rx_queues)
2098 		return -EINVAL;
2099 
2100 	if (dev->reg_state == NETREG_REGISTERED) {
2101 		ASSERT_RTNL();
2102 
2103 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2104 						  rxq);
2105 		if (rc)
2106 			return rc;
2107 	}
2108 
2109 	dev->real_num_rx_queues = rxq;
2110 	return 0;
2111 }
2112 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2113 #endif
2114 
2115 /**
2116  * netif_get_num_default_rss_queues - default number of RSS queues
2117  *
2118  * This routine should set an upper limit on the number of RSS queues
2119  * used by default by multiqueue devices.
2120  */
2121 int netif_get_num_default_rss_queues(void)
2122 {
2123 	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2124 }
2125 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2126 
2127 static inline void __netif_reschedule(struct Qdisc *q)
2128 {
2129 	struct softnet_data *sd;
2130 	unsigned long flags;
2131 
2132 	local_irq_save(flags);
2133 	sd = &__get_cpu_var(softnet_data);
2134 	q->next_sched = NULL;
2135 	*sd->output_queue_tailp = q;
2136 	sd->output_queue_tailp = &q->next_sched;
2137 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2138 	local_irq_restore(flags);
2139 }
2140 
2141 void __netif_schedule(struct Qdisc *q)
2142 {
2143 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2144 		__netif_reschedule(q);
2145 }
2146 EXPORT_SYMBOL(__netif_schedule);
2147 
2148 void dev_kfree_skb_irq(struct sk_buff *skb)
2149 {
2150 	if (atomic_dec_and_test(&skb->users)) {
2151 		struct softnet_data *sd;
2152 		unsigned long flags;
2153 
2154 		local_irq_save(flags);
2155 		sd = &__get_cpu_var(softnet_data);
2156 		skb->next = sd->completion_queue;
2157 		sd->completion_queue = skb;
2158 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
2159 		local_irq_restore(flags);
2160 	}
2161 }
2162 EXPORT_SYMBOL(dev_kfree_skb_irq);
2163 
/*
 * Free @skb from any context: use the deferred dev_kfree_skb_irq() path
 * when running in hard interrupt context or with interrupts disabled,
 * and dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
2172 
2173 
2174 /**
2175  * netif_device_detach - mark device as removed
2176  * @dev: network device
2177  *
2178  * Mark device as removed from system and therefore no longer available.
2179  */
2180 void netif_device_detach(struct net_device *dev)
2181 {
2182 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2183 	    netif_running(dev)) {
2184 		netif_tx_stop_all_queues(dev);
2185 	}
2186 }
2187 EXPORT_SYMBOL(netif_device_detach);
2188 
2189 /**
2190  * netif_device_attach - mark device as attached
2191  * @dev: network device
2192  *
2193  * Mark device as attached from system and restart if needed.
2194  */
2195 void netif_device_attach(struct net_device *dev)
2196 {
2197 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2198 	    netif_running(dev)) {
2199 		netif_tx_wake_all_queues(dev);
2200 		__netdev_watchdog_up(dev);
2201 	}
2202 }
2203 EXPORT_SYMBOL(netif_device_attach);
2204 
2205 static void skb_warn_bad_offload(const struct sk_buff *skb)
2206 {
2207 	static const netdev_features_t null_features = 0;
2208 	struct net_device *dev = skb->dev;
2209 	const char *driver = "";
2210 
2211 	if (!net_ratelimit())
2212 		return;
2213 
2214 	if (dev && dev->dev.parent)
2215 		driver = dev_driver_string(dev->dev.parent);
2216 
2217 	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2218 	     "gso_type=%d ip_summed=%d\n",
2219 	     driver, dev ? &dev->features : &null_features,
2220 	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2221 	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2222 	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2223 }
2224 
2225 /*
2226  * Invalidate hardware checksum when packet is to be mangled, and
2227  * complete checksum manually on outgoing path.
2228  */
2229 int skb_checksum_help(struct sk_buff *skb)
2230 {
2231 	__wsum csum;
2232 	int ret = 0, offset;
2233 
2234 	if (skb->ip_summed == CHECKSUM_COMPLETE)
2235 		goto out_set_summed;
2236 
2237 	if (unlikely(skb_shinfo(skb)->gso_size)) {
2238 		skb_warn_bad_offload(skb);
2239 		return -EINVAL;
2240 	}
2241 
2242 	/* Before computing a checksum, we should make sure no frag could
2243 	 * be modified by an external entity : checksum could be wrong.
2244 	 */
2245 	if (skb_has_shared_frag(skb)) {
2246 		ret = __skb_linearize(skb);
2247 		if (ret)
2248 			goto out;
2249 	}
2250 
2251 	offset = skb_checksum_start_offset(skb);
2252 	BUG_ON(offset >= skb_headlen(skb));
2253 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2254 
2255 	offset += skb->csum_offset;
2256 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2257 
2258 	if (skb_cloned(skb) &&
2259 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2260 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2261 		if (ret)
2262 			goto out;
2263 	}
2264 
2265 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
2266 out_set_summed:
2267 	skb->ip_summed = CHECKSUM_NONE;
2268 out:
2269 	return ret;
2270 }
2271 EXPORT_SYMBOL(skb_checksum_help);
2272 
2273 __be16 skb_network_protocol(struct sk_buff *skb)
2274 {
2275 	__be16 type = skb->protocol;
2276 	int vlan_depth = ETH_HLEN;
2277 
2278 	/* Tunnel gso handlers can set protocol to ethernet. */
2279 	if (type == htons(ETH_P_TEB)) {
2280 		struct ethhdr *eth;
2281 
2282 		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2283 			return 0;
2284 
2285 		eth = (struct ethhdr *)skb_mac_header(skb);
2286 		type = eth->h_proto;
2287 	}
2288 
2289 	while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2290 		struct vlan_hdr *vh;
2291 
2292 		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2293 			return 0;
2294 
2295 		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2296 		type = vh->h_vlan_encapsulated_proto;
2297 		vlan_depth += VLAN_HLEN;
2298 	}
2299 
2300 	return type;
2301 }
2302 
2303 /**
2304  *	skb_mac_gso_segment - mac layer segmentation handler.
2305  *	@skb: buffer to segment
2306  *	@features: features for the output path (see dev->features)
2307  */
2308 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2309 				    netdev_features_t features)
2310 {
2311 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2312 	struct packet_offload *ptype;
2313 	__be16 type = skb_network_protocol(skb);
2314 
2315 	if (unlikely(!type))
2316 		return ERR_PTR(-EINVAL);
2317 
2318 	__skb_pull(skb, skb->mac_len);
2319 
2320 	rcu_read_lock();
2321 	list_for_each_entry_rcu(ptype, &offload_base, list) {
2322 		if (ptype->type == type && ptype->callbacks.gso_segment) {
2323 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2324 				int err;
2325 
2326 				err = ptype->callbacks.gso_send_check(skb);
2327 				segs = ERR_PTR(err);
2328 				if (err || skb_gso_ok(skb, features))
2329 					break;
2330 				__skb_push(skb, (skb->data -
2331 						 skb_network_header(skb)));
2332 			}
2333 			segs = ptype->callbacks.gso_segment(skb, features);
2334 			break;
2335 		}
2336 	}
2337 	rcu_read_unlock();
2338 
2339 	__skb_push(skb, skb->data - skb_mac_header(skb));
2340 
2341 	return segs;
2342 }
2343 EXPORT_SYMBOL(skb_mac_gso_segment);
2344 
2345 
2346 /* openvswitch calls this on rx path, so we need a different check.
2347  */
2348 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2349 {
2350 	if (tx_path)
2351 		return skb->ip_summed != CHECKSUM_PARTIAL;
2352 	else
2353 		return skb->ip_summed == CHECKSUM_NONE;
2354 }
2355 
2356 /**
2357  *	__skb_gso_segment - Perform segmentation on skb.
2358  *	@skb: buffer to segment
2359  *	@features: features for the output path (see dev->features)
2360  *	@tx_path: whether it is called in TX path
2361  *
2362  *	This function segments the given skb and returns a list of segments.
2363  *
2364  *	It may return NULL if the skb requires no segmentation.  This is
2365  *	only possible when GSO is used for verifying header integrity.
2366  */
2367 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2368 				  netdev_features_t features, bool tx_path)
2369 {
2370 	if (unlikely(skb_needs_check(skb, tx_path))) {
2371 		int err;
2372 
2373 		skb_warn_bad_offload(skb);
2374 
2375 		if (skb_header_cloned(skb) &&
2376 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2377 			return ERR_PTR(err);
2378 	}
2379 
2380 	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2381 	SKB_GSO_CB(skb)->encap_level = 0;
2382 
2383 	skb_reset_mac_header(skb);
2384 	skb_reset_mac_len(skb);
2385 
2386 	return skb_mac_gso_segment(skb, features);
2387 }
2388 EXPORT_SYMBOL(__skb_gso_segment);
2389 
2390 /* Take action when hardware reception checksum errors are detected. */
2391 #ifdef CONFIG_BUG
2392 void netdev_rx_csum_fault(struct net_device *dev)
2393 {
2394 	if (net_ratelimit()) {
2395 		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2396 		dump_stack();
2397 	}
2398 }
2399 EXPORT_SYMBOL(netdev_rx_csum_fault);
2400 #endif
2401 
/* Returns 1 if @skb has a page fragment that @dev cannot DMA from, else 0.
 * Always 0 without CONFIG_HIGHMEM.
 *
 * This check should be eliminated as soon as we know that:
 * 1. an IOMMU is present and allows mapping all of memory, or
 * 2. no high memory really exists on this machine.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int idx;

	/* Device cannot address highmem pages at all. */
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (idx = 0; idx < shinfo->nr_frags; idx++) {
			if (PageHighMem(skb_frag_page(&shinfo->frags[idx])))
				return 1;
		}
	}

	/* Bus addresses are physical: compare against the DMA mask. */
	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *parent = dev->dev.parent;

		if (!parent)
			return 0;

		for (idx = 0; idx < shinfo->nr_frags; idx++) {
			skb_frag_t *frag = &shinfo->frags[idx];
			dma_addr_t addr = page_to_phys(skb_frag_page(frag));

			if (!parent->dma_mask ||
			    addr + PAGE_SIZE - 1 > *parent->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
2434 
/*
 * State kept in skb->cb by dev_gso_segment() while a list of GSO segments
 * hangs off skb->next.
 */
2435 struct dev_gso_cb {
	/* destructor the skb had before dev_gso_skb_destructor replaced it */
2436 	void (*destructor)(struct sk_buff *skb);
2437 };
2438 
/* View the control buffer of @skb as a struct dev_gso_cb. */
2439 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2440 
2441 static void dev_gso_skb_destructor(struct sk_buff *skb)
2442 {
2443 	struct dev_gso_cb *cb;
2444 
2445 	do {
2446 		struct sk_buff *nskb = skb->next;
2447 
2448 		skb->next = nskb->next;
2449 		nskb->next = NULL;
2450 		kfree_skb(nskb);
2451 	} while (skb->next);
2452 
2453 	cb = DEV_GSO_CB(skb);
2454 	if (cb->destructor)
2455 		cb->destructor(skb);
2456 }
2457 
2458 /**
2459  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2460  *	@skb: buffer to segment
2461  *	@features: device features as applicable to this skb
2462  *
2463  *	This function segments the given skb and stores the list of segments
2464  *	in skb->next.
2465  */
2466 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2467 {
2468 	struct sk_buff *segs;
2469 
2470 	segs = skb_gso_segment(skb, features);
2471 
2472 	/* Verifying header integrity only. */
2473 	if (!segs)
2474 		return 0;
2475 
2476 	if (IS_ERR(segs))
2477 		return PTR_ERR(segs);
2478 
2479 	skb->next = segs;
2480 	DEV_GSO_CB(skb)->destructor = skb->destructor;
2481 	skb->destructor = dev_gso_skb_destructor;
2482 
2483 	return 0;
2484 }
2485 
2486 static netdev_features_t harmonize_features(struct sk_buff *skb,
2487 	netdev_features_t features)
2488 {
2489 	if (skb->ip_summed != CHECKSUM_NONE &&
2490 	    !can_checksum_protocol(features, skb_network_protocol(skb))) {
2491 		features &= ~NETIF_F_ALL_CSUM;
2492 	} else if (illegal_highdma(skb->dev, skb)) {
2493 		features &= ~NETIF_F_SG;
2494 	}
2495 
2496 	return features;
2497 }
2498 
2499 netdev_features_t netif_skb_features(struct sk_buff *skb)
2500 {
2501 	__be16 protocol = skb->protocol;
2502 	netdev_features_t features = skb->dev->features;
2503 
2504 	if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2505 		features &= ~NETIF_F_GSO_MASK;
2506 
2507 	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2508 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2509 		protocol = veh->h_vlan_encapsulated_proto;
2510 	} else if (!vlan_tx_tag_present(skb)) {
2511 		return harmonize_features(skb, features);
2512 	}
2513 
2514 	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2515 					       NETIF_F_HW_VLAN_STAG_TX);
2516 
2517 	if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2518 		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2519 				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2520 				NETIF_F_HW_VLAN_STAG_TX;
2521 
2522 	return harmonize_features(skb, features);
2523 }
2524 EXPORT_SYMBOL(netif_skb_features);
2525 
2526 /*
2527  * Returns true if either:
2528  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2529  *	2. skb is fragmented and the device does not support SG.
2530  */
2531 static inline int skb_needs_linearize(struct sk_buff *skb,
2532 				      netdev_features_t features)
2533 {
2534 	return skb_is_nonlinear(skb) &&
2535 			((skb_has_frag_list(skb) &&
2536 				!(features & NETIF_F_FRAGLIST)) ||
2537 			(skb_shinfo(skb)->nr_frags &&
2538 				!(features & NETIF_F_SG)));
2539 }
2540 
2541 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2542 			struct netdev_queue *txq, void *accel_priv)
2543 {
2544 	const struct net_device_ops *ops = dev->netdev_ops;
2545 	int rc = NETDEV_TX_OK;
2546 	unsigned int skb_len;
2547 
2548 	if (likely(!skb->next)) {
2549 		netdev_features_t features;
2550 
2551 		/*
2552 		 * If device doesn't need skb->dst, release it right now while
2553 		 * its hot in this cpu cache
2554 		 */
2555 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2556 			skb_dst_drop(skb);
2557 
2558 		features = netif_skb_features(skb);
2559 
2560 		if (vlan_tx_tag_present(skb) &&
2561 		    !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2562 			skb = __vlan_put_tag(skb, skb->vlan_proto,
2563 					     vlan_tx_tag_get(skb));
2564 			if (unlikely(!skb))
2565 				goto out;
2566 
2567 			skb->vlan_tci = 0;
2568 		}
2569 
2570 		/* If encapsulation offload request, verify we are testing
2571 		 * hardware encapsulation features instead of standard
2572 		 * features for the netdev
2573 		 */
2574 		if (skb->encapsulation)
2575 			features &= dev->hw_enc_features;
2576 
2577 		if (netif_needs_gso(skb, features)) {
2578 			if (unlikely(dev_gso_segment(skb, features)))
2579 				goto out_kfree_skb;
2580 			if (skb->next)
2581 				goto gso;
2582 		} else {
2583 			if (skb_needs_linearize(skb, features) &&
2584 			    __skb_linearize(skb))
2585 				goto out_kfree_skb;
2586 
2587 			/* If packet is not checksummed and device does not
2588 			 * support checksumming for this protocol, complete
2589 			 * checksumming here.
2590 			 */
2591 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2592 				if (skb->encapsulation)
2593 					skb_set_inner_transport_header(skb,
2594 						skb_checksum_start_offset(skb));
2595 				else
2596 					skb_set_transport_header(skb,
2597 						skb_checksum_start_offset(skb));
2598 				if (!(features & NETIF_F_ALL_CSUM) &&
2599 				     skb_checksum_help(skb))
2600 					goto out_kfree_skb;
2601 			}
2602 		}
2603 
2604 		if (!list_empty(&ptype_all))
2605 			dev_queue_xmit_nit(skb, dev);
2606 
2607 		skb_len = skb->len;
2608 		if (accel_priv)
2609 			rc = ops->ndo_dfwd_start_xmit(skb, dev, accel_priv);
2610 		else
2611 			rc = ops->ndo_start_xmit(skb, dev);
2612 
2613 		trace_net_dev_xmit(skb, rc, dev, skb_len);
2614 		if (rc == NETDEV_TX_OK && txq)
2615 			txq_trans_update(txq);
2616 		return rc;
2617 	}
2618 
2619 gso:
2620 	do {
2621 		struct sk_buff *nskb = skb->next;
2622 
2623 		skb->next = nskb->next;
2624 		nskb->next = NULL;
2625 
2626 		if (!list_empty(&ptype_all))
2627 			dev_queue_xmit_nit(nskb, dev);
2628 
2629 		skb_len = nskb->len;
2630 		if (accel_priv)
2631 			rc = ops->ndo_dfwd_start_xmit(nskb, dev, accel_priv);
2632 		else
2633 			rc = ops->ndo_start_xmit(nskb, dev);
2634 		trace_net_dev_xmit(nskb, rc, dev, skb_len);
2635 		if (unlikely(rc != NETDEV_TX_OK)) {
2636 			if (rc & ~NETDEV_TX_MASK)
2637 				goto out_kfree_gso_skb;
2638 			nskb->next = skb->next;
2639 			skb->next = nskb;
2640 			return rc;
2641 		}
2642 		txq_trans_update(txq);
2643 		if (unlikely(netif_xmit_stopped(txq) && skb->next))
2644 			return NETDEV_TX_BUSY;
2645 	} while (skb->next);
2646 
2647 out_kfree_gso_skb:
2648 	if (likely(skb->next == NULL)) {
2649 		skb->destructor = DEV_GSO_CB(skb)->destructor;
2650 		consume_skb(skb);
2651 		return rc;
2652 	}
2653 out_kfree_skb:
2654 	kfree_skb(skb);
2655 out:
2656 	return rc;
2657 }
2658 EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
2659 
2660 static void qdisc_pkt_len_init(struct sk_buff *skb)
2661 {
2662 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
2663 
2664 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2665 
2666 	/* To get more precise estimation of bytes sent on wire,
2667 	 * we add to pkt_len the headers size of all segments
2668 	 */
2669 	if (shinfo->gso_size)  {
2670 		unsigned int hdr_len;
2671 		u16 gso_segs = shinfo->gso_segs;
2672 
2673 		/* mac layer + network layer */
2674 		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2675 
2676 		/* + transport layer */
2677 		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2678 			hdr_len += tcp_hdrlen(skb);
2679 		else
2680 			hdr_len += sizeof(struct udphdr);
2681 
2682 		if (shinfo->gso_type & SKB_GSO_DODGY)
2683 			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2684 						shinfo->gso_size);
2685 
2686 		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2687 	}
2688 }
2689 
/*
 * Pass @skb to qdisc @q of queue @txq on @dev, under the qdisc root lock.
 *
 * Three outcomes are possible:
 *  - the qdisc is deactivated: the skb is freed, NET_XMIT_DROP is returned;
 *  - the qdisc allows bypass, is empty and was not running: the skb is sent
 *    directly through sch_direct_xmit(), NET_XMIT_SUCCESS is returned;
 *  - otherwise the skb is enqueued and the qdisc is run if nobody else is
 *    running it; the enqueue result masked with NET_XMIT_MASK is returned.
 */
2690 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2691 				 struct net_device *dev,
2692 				 struct netdev_queue *txq)
2693 {
2694 	spinlock_t *root_lock = qdisc_lock(q);
2695 	bool contended;
2696 	int rc;
2697 
2698 	qdisc_pkt_len_init(skb);
2699 	qdisc_calculate_pkt_len(skb, q);
2700 	/*
2701 	 * Heuristic to force contended enqueues to serialize on a
2702 	 * separate lock before trying to get qdisc main lock.
2703 	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2704 	 * and dequeue packets faster.
2705 	 */
2706 	contended = qdisc_is_running(q);
2707 	if (unlikely(contended))
2708 		spin_lock(&q->busylock);
2709 
2710 	spin_lock(root_lock);
2711 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2712 		kfree_skb(skb);
2713 		rc = NET_XMIT_DROP;
2714 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2715 		   qdisc_run_begin(q)) {
2716 		/*
2717 		 * This is a work-conserving queue; there are no old skbs
2718 		 * waiting to be sent out; and the qdisc is not running -
2719 		 * xmit the skb directly.
2720 		 */
2721 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2722 			skb_dst_force(skb);
2723 
2724 		qdisc_bstats_update(q, skb);
2725 
		/* busylock is dropped before the qdisc is run, and
		 * 'contended' is cleared so it is not unlocked again below
		 */
2726 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2727 			if (unlikely(contended)) {
2728 				spin_unlock(&q->busylock);
2729 				contended = false;
2730 			}
2731 			__qdisc_run(q);
2732 		} else
2733 			qdisc_run_end(q);
2734 
2735 		rc = NET_XMIT_SUCCESS;
2736 	} else {
2737 		skb_dst_force(skb);
2738 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
		/* run the qdisc ourselves only if nobody else is running it */
2739 		if (qdisc_run_begin(q)) {
2740 			if (unlikely(contended)) {
2741 				spin_unlock(&q->busylock);
2742 				contended = false;
2743 			}
2744 			__qdisc_run(q);
2745 		}
2746 	}
2747 	spin_unlock(root_lock);
	/* still holding busylock if no branch above released it */
2748 	if (unlikely(contended))
2749 		spin_unlock(&q->busylock);
2750 	return rc;
2751 }
2752 
2753 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2754 static void skb_update_prio(struct sk_buff *skb)
2755 {
2756 	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2757 
2758 	if (!skb->priority && skb->sk && map) {
2759 		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2760 
2761 		if (prioidx < map->priomap_len)
2762 			skb->priority = map->priomap[prioidx];
2763 	}
2764 }
2765 #else
2766 #define skb_update_prio(skb)
2767 #endif
2768 
/* Per-CPU count of dev_hard_start_xmit() calls currently nested inside
 * dev_queue_xmit() on queue-less devices.
 */
2769 static DEFINE_PER_CPU(int, xmit_recursion);
/* dev_queue_xmit() reports a dead loop once xmit_recursion exceeds this. */
2770 #define RECURSION_LIMIT 10
2771 
2772 /**
2773  *	dev_loopback_xmit - loop back @skb
2774  *	@skb: buffer to transmit
2775  */
2776 int dev_loopback_xmit(struct sk_buff *skb)
2777 {
2778 	skb_reset_mac_header(skb);
2779 	__skb_pull(skb, skb_network_offset(skb));
2780 	skb->pkt_type = PACKET_LOOPBACK;
2781 	skb->ip_summed = CHECKSUM_UNNECESSARY;
2782 	WARN_ON(!skb_dst(skb));
2783 	skb_dst_force(skb);
2784 	netif_rx_ni(skb);
2785 	return 0;
2786 }
2787 EXPORT_SYMBOL(dev_loopback_xmit);
2788 
2789 /**
2790  *	dev_queue_xmit - transmit a buffer
2791  *	@skb: buffer to transmit
2792  *
2793  *	Queue a buffer for transmission to a network device. The caller must
2794  *	have set the device and priority and built the buffer before calling
2795  *	this function. The function can be called from an interrupt.
2796  *
2797  *	A negative errno code is returned on a failure. A success does not
2798  *	guarantee the frame will be transmitted as it may be dropped due
2799  *	to congestion or traffic shaping.
2800  *
2801  * -----------------------------------------------------------------------------------
2802  *      I notice this method can also return errors from the queue disciplines,
2803  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2804  *      be positive.
2805  *
2806  *      Regardless of the return value, the skb is consumed, so it is currently
2807  *      difficult to retry a send to this method.  (You can bump the ref count
2808  *      before sending to hold a reference for retry if you are careful.)
2809  *
2810  *      When calling this method, interrupts MUST be enabled.  This is because
2811  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2812  *          --BLG
2813  */
2814 int dev_queue_xmit(struct sk_buff *skb)
2815 {
2816 	struct net_device *dev = skb->dev;
2817 	struct netdev_queue *txq;
2818 	struct Qdisc *q;
2819 	int rc = -ENOMEM;
2820 
2821 	skb_reset_mac_header(skb);
2822 
2823 	/* Disable soft irqs for various locks below. Also
2824 	 * stops preemption for RCU.
2825 	 */
2826 	rcu_read_lock_bh();
2827 
2828 	skb_update_prio(skb);
2829 
	/* choose the TX queue, then look up the qdisc attached to it */
2830 	txq = netdev_pick_tx(dev, skb);
2831 	q = rcu_dereference_bh(txq->qdisc);
2832 
2833 #ifdef CONFIG_NET_CLS_ACT
2834 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2835 #endif
2836 	trace_net_dev_queue(skb);
	/* a qdisc with an enqueue method takes the packet from here */
2837 	if (q->enqueue) {
2838 		rc = __dev_xmit_skb(skb, q, dev, txq);
2839 		goto out;
2840 	}
2841 
2842 	/* The device has no queue. Common case for software devices:
2843 	   loopback, all the sorts of tunnels...
2844 
2845 	   Really, it is unlikely that netif_tx_lock protection is necessary
2846 	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2847 	   counters.)
2848 	   However, it is possible, that they rely on protection
2849 	   made by us here.
2850 
2851 	   Check this and shot the lock. It is not prone from deadlocks.
2852 	   Either shot noqueue qdisc, it is even simpler 8)
2853 	 */
2854 	if (dev->flags & IFF_UP) {
2855 		int cpu = smp_processor_id(); /* ok because BHs are off */
2856 
		/* this CPU already owning the xmit lock means we re-entered
		 * the same queue: handled as a dead loop in the else branch
		 */
2857 		if (txq->xmit_lock_owner != cpu) {
2858 
			/* bound the nesting depth of stacked virtual devices */
2859 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2860 				goto recursion_alert;
2861 
2862 			HARD_TX_LOCK(dev, txq, cpu);
2863 
2864 			if (!netif_xmit_stopped(txq)) {
2865 				__this_cpu_inc(xmit_recursion);
2866 				rc = dev_hard_start_xmit(skb, dev, txq, NULL);
2867 				__this_cpu_dec(xmit_recursion);
2868 				if (dev_xmit_complete(rc)) {
2869 					HARD_TX_UNLOCK(dev, txq);
2870 					goto out;
2871 				}
2872 			}
			/* queue stopped or transmit not completed: the skb is
			 * dropped below with -ENETDOWN
			 */
2873 			HARD_TX_UNLOCK(dev, txq);
2874 			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2875 					     dev->name);
2876 		} else {
2877 			/* Recursion is detected! It is possible,
2878 			 * unfortunately
2879 			 */
2880 recursion_alert:
2881 			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2882 					     dev->name);
2883 		}
2884 	}
2885 
	/* failure path: device down, dead loop, or packet not accepted */
2886 	rc = -ENETDOWN;
2887 	rcu_read_unlock_bh();
2888 
2889 	kfree_skb(skb);
2890 	return rc;
	/* the skb was handed to the qdisc or the driver; nothing to free here */
2891 out:
2892 	rcu_read_unlock_bh();
2893 	return rc;
2894 }
2895 EXPORT_SYMBOL(dev_queue_xmit);
2896 
2897 
2898 /*=======================================================================
2899 			Receiver routines
2900   =======================================================================*/
2901 
/* Backlog length limit; skb_flow_limit() starts acting once a queue is at
 * least half this long.
 */
2902 int netdev_max_backlog __read_mostly = 1000;
2903 EXPORT_SYMBOL(netdev_max_backlog);
2904 
/* NOTE(review): the next three tunables are not used in this part of the
 * file; presumably they are exposed as sysctls - confirm against their users.
 */
2905 int netdev_tstamp_prequeue __read_mostly = 1;
2906 int netdev_budget __read_mostly = 300;
2907 int weight_p __read_mostly = 64;            /* old backlog weight */
2908 
2909 /* Called with irq disabled */
2910 static inline void ____napi_schedule(struct softnet_data *sd,
2911 				     struct napi_struct *napi)
2912 {
2913 	list_add_tail(&napi->poll_list, &sd->poll_list);
2914 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2915 }
2916 
2917 #ifdef CONFIG_RPS
2918 
2919 /* One global table that all flow-based protocols share. */
/* get_rps_cpu() indexes it with the masked rx hash to find the CPU on
 * which the flow is wanted.
 */
2920 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2921 EXPORT_SYMBOL(rps_sock_flow_table);
2922 
/* NOTE(review): presumably enabled while any RPS map or flow table is
 * configured; its users are outside this part of the file - confirm.
 */
2923 struct static_key rps_needed __read_mostly;
2924 
/*
 * Record @next_cpu as the CPU handling the flow described by @rflow.
 *
 * When @next_cpu is a real CPU, that CPU's current input_queue_head is
 * stored in the flow's last_qtail.  With CONFIG_RFS_ACCEL the driver is
 * first asked, through ndo_rx_flow_steer(), to steer the flow to the
 * hardware queue that cpu_rmap_lookup_index() gives for @next_cpu; on
 * success the flow entry of that queue's table is used from then on.
 *
 * Returns the flow entry that was updated, which may differ from @rflow.
 */
2925 static struct rps_dev_flow *
2926 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2927 	    struct rps_dev_flow *rflow, u16 next_cpu)
2928 {
2929 	if (next_cpu != RPS_NO_CPU) {
2930 #ifdef CONFIG_RFS_ACCEL
2931 		struct netdev_rx_queue *rxqueue;
2932 		struct rps_dev_flow_table *flow_table;
2933 		struct rps_dev_flow *old_rflow;
2934 		u32 flow_id;
2935 		u16 rxq_index;
2936 		int rc;
2937 
2938 		/* Should we steer this flow to a different hardware queue? */
2939 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2940 		    !(dev->features & NETIF_F_NTUPLE))
2941 			goto out;
2942 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		/* packet already arrived on the wanted queue */
2943 		if (rxq_index == skb_get_rx_queue(skb))
2944 			goto out;
2945 
2946 		rxqueue = dev->_rx + rxq_index;
2947 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2948 		if (!flow_table)
2949 			goto out;
2950 		flow_id = skb->rxhash & flow_table->mask;
2951 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2952 							rxq_index, flow_id);
		/* a negative return means the driver did not install a filter */
2953 		if (rc < 0)
2954 			goto out;
		/* continue with the entry in the target queue's table; a
		 * non-negative rc is stored as the filter id
		 */
2955 		old_rflow = rflow;
2956 		rflow = &flow_table->flows[flow_id];
2957 		rflow->filter = rc;
		/* the old entry gives up the filter id it shares with the new one */
2958 		if (old_rflow->filter == rflow->filter)
2959 			old_rflow->filter = RPS_NO_FILTER;
2960 	out:
2961 #endif
2962 		rflow->last_qtail =
2963 			per_cpu(softnet_data, next_cpu).input_queue_head;
2964 	}
2965 
2966 	rflow->cpu = next_cpu;
2967 	return rflow;
2968 }
2969 
2970 /*
2971  * get_rps_cpu is called from netif_receive_skb and returns the target
2972  * CPU from the RPS map of the receiving queue for a given skb.
2973  * rcu_read_lock must be held on entry.
2974  */
/*
 * Returns -1 when no target CPU is selected.  *rflowp is written only
 * when the CPU was chosen through the flow tables.
 */
2975 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2976 		       struct rps_dev_flow **rflowp)
2977 {
2978 	struct netdev_rx_queue *rxqueue;
2979 	struct rps_map *map;
2980 	struct rps_dev_flow_table *flow_table;
2981 	struct rps_sock_flow_table *sock_flow_table;
2982 	int cpu = -1;
2983 	u16 tcpu;
2984 
	/* find the rx queue the packet came in on; queue 0 if not recorded */
2985 	if (skb_rx_queue_recorded(skb)) {
2986 		u16 index = skb_get_rx_queue(skb);
2987 		if (unlikely(index >= dev->real_num_rx_queues)) {
2988 			WARN_ONCE(dev->real_num_rx_queues > 1,
2989 				  "%s received packet on queue %u, but number "
2990 				  "of RX queues is %u\n",
2991 				  dev->name, index, dev->real_num_rx_queues);
2992 			goto done;
2993 		}
2994 		rxqueue = dev->_rx + index;
2995 	} else
2996 		rxqueue = dev->_rx;
2997 
2998 	map = rcu_dereference(rxqueue->rps_map);
2999 	if (map) {
		/* single-CPU map without a flow table: no hash is needed */
3000 		if (map->len == 1 &&
3001 		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
3002 			tcpu = map->cpus[0];
3003 			if (cpu_online(tcpu))
3004 				cpu = tcpu;
3005 			goto done;
3006 		}
3007 	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
		/* neither a map nor a flow table is set on this queue */
3008 		goto done;
3009 	}
3010 
3011 	skb_reset_network_header(skb);
	/* a zero hash cannot be used for steering */
3012 	if (!skb_get_rxhash(skb))
3013 		goto done;
3014 
3015 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3016 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3017 	if (flow_table && sock_flow_table) {
3018 		u16 next_cpu;
3019 		struct rps_dev_flow *rflow;
3020 
3021 		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
3022 		tcpu = rflow->cpu;
3023 
3024 		next_cpu = sock_flow_table->ents[skb->rxhash &
3025 		    sock_flow_table->mask];
3026 
3027 		/*
3028 		 * If the desired CPU (where last recvmsg was done) is
3029 		 * different from current CPU (one in the rx-queue flow
3030 		 * table entry), switch if one of the following holds:
3031 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
3032 		 *   - Current CPU is offline.
3033 		 *   - The current CPU's queue tail has advanced beyond the
3034 		 *     last packet that was enqueued using this table entry.
3035 		 *     This guarantees that all previous packets for the flow
3036 		 *     have been dequeued, thus preserving in order delivery.
3037 		 */
3038 		if (unlikely(tcpu != next_cpu) &&
3039 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3040 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3041 		      rflow->last_qtail)) >= 0)) {
3042 			tcpu = next_cpu;
3043 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3044 		}
3045 
3046 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3047 			*rflowp = rflow;
3048 			cpu = tcpu;
3049 			goto done;
3050 		}
3051 	}
3052 
	/* fall back to the RPS map: scale the 32-bit hash onto map->len slots */
3053 	if (map) {
3054 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
3055 
3056 		if (cpu_online(tcpu)) {
3057 			cpu = tcpu;
3058 			goto done;
3059 		}
3060 	}
3061 
3062 done:
3063 	return cpu;
3064 }
3065 
3066 #ifdef CONFIG_RFS_ACCEL
3067 
3068 /**
3069  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3070  * @dev: Device on which the filter was set
3071  * @rxq_index: RX queue index
3072  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3073  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3074  *
3075  * Drivers that implement ndo_rx_flow_steer() should periodically call
3076  * this function for each installed filter and remove the filters for
3077  * which it returns %true.
3078  */
3079 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3080 			 u32 flow_id, u16 filter_id)
3081 {
3082 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3083 	struct rps_dev_flow_table *flow_table;
3084 	struct rps_dev_flow *rflow;
3085 	bool expire = true;
3086 	int cpu;
3087 
3088 	rcu_read_lock();
3089 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3090 	if (flow_table && flow_id <= flow_table->mask) {
3091 		rflow = &flow_table->flows[flow_id];
3092 		cpu = ACCESS_ONCE(rflow->cpu);
3093 		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3094 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3095 			   rflow->last_qtail) <
3096 		     (int)(10 * flow_table->mask)))
3097 			expire = false;
3098 	}
3099 	rcu_read_unlock();
3100 	return expire;
3101 }
3102 EXPORT_SYMBOL(rps_may_expire_flow);
3103 
3104 #endif /* CONFIG_RFS_ACCEL */
3105 
3106 /* Called from hardirq (IPI) context */
3107 static void rps_trigger_softirq(void *data)
3108 {
3109 	struct softnet_data *sd = data;
3110 
3111 	____napi_schedule(sd, &sd->backlog);
3112 	sd->received_rps++;
3113 }
3114 
3115 #endif /* CONFIG_RPS */
3116 
/*
 * Check whether @sd belongs to another CPU.
 * If so, chain it onto this CPU's IPI list, mark the RX softirq pending
 * and return 1.  Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local_sd = &__get_cpu_var(softnet_data);

	if (sd == local_sd)
		return 0;

	sd->rps_ipi_next = local_sd->rps_ipi_list;
	local_sd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
3137 
3138 #ifdef CONFIG_NET_FLOW_LIMIT
3139 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3140 #endif
3141 
3142 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3143 {
3144 #ifdef CONFIG_NET_FLOW_LIMIT
3145 	struct sd_flow_limit *fl;
3146 	struct softnet_data *sd;
3147 	unsigned int old_flow, new_flow;
3148 
3149 	if (qlen < (netdev_max_backlog >> 1))
3150 		return false;
3151 
3152 	sd = &__get_cpu_var(softnet_data);
3153 
3154 	rcu_read_lock();
3155 	fl = rcu_dereference(sd->flow_limit);
3156 	if (fl) {
3157 		new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
3158 		old_flow = fl->history[fl->history_head];
3159 		fl->history[fl->history_head] = new_flow;
3160 
3161 		fl->history_head++;
3162 		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3163 
3164 		if (likely(fl->buckets[old_flow]))
3165 			fl->buckets[old_flow]--;
3166 
3167 		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3168 			fl->count++;
3169 			rcu_read_unlock();
3170 			return true;
3171 		}
3172 	}
3173 	rcu_read_unlock();
3174 #endif
3175 	return false;
3176 }
3177 
3178 /*
3179  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3180  * queue (may be a remote CPU queue).
3181  */
3182 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3183 			      unsigned int *qtail)
3184 {
3185 	struct softnet_data *sd;
3186 	unsigned long flags;
3187 	unsigned int qlen;
3188 
3189 	sd = &per_cpu(softnet_data, cpu);
3190 
3191 	local_irq_save(flags);
3192 
3193 	rps_lock(sd);
3194 	qlen = skb_queue_len(&sd->input_pkt_queue);
3195 	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3196 		if (skb_queue_len(&sd->input_pkt_queue)) {
3197 enqueue:
3198 			__skb_queue_tail(&sd->input_pkt_queue, skb);
3199 			input_queue_tail_incr_save(sd, qtail);
3200 			rps_unlock(sd);
3201 			local_irq_restore(flags);
3202 			return NET_RX_SUCCESS;
3203 		}
3204 
3205 		/* Schedule NAPI for backlog device
3206 		 * We can use non atomic operation since we own the queue lock
3207 		 */
3208 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3209 			if (!rps_ipi_queued(sd))
3210 				____napi_schedule(sd, &sd->backlog);
3211 		}
3212 		goto enqueue;
3213 	}
3214 
3215 	sd->dropped++;
3216 	rps_unlock(sd);
3217 
3218 	local_irq_restore(flags);
3219 
3220 	atomic_long_inc(&skb->dev->rx_dropped);
3221 	kfree_skb(skb);
3222 	return NET_RX_DROP;
3223 }
3224 
3225 /**
3226  *	netif_rx	-	post buffer to the network code
3227  *	@skb: buffer to post
3228  *
3229  *	This function receives a packet from a device driver and queues it for
3230  *	the upper (protocol) levels to process.  It always succeeds. The buffer
3231  *	may be dropped during processing for congestion control or by the
3232  *	protocol layers.
3233  *
3234  *	return values:
3235  *	NET_RX_SUCCESS	(no congestion)
3236  *	NET_RX_DROP     (packet was dropped)
3237  *
3238  */
3239 
3240 int netif_rx(struct sk_buff *skb)
3241 {
3242 	int ret;
3243 
3244 	/* if netpoll wants it, pretend we never saw it */
3245 	if (netpoll_rx(skb))
3246 		return NET_RX_DROP;
3247 
3248 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3249 
3250 	trace_netif_rx(skb);
3251 #ifdef CONFIG_RPS
3252 	if (static_key_false(&rps_needed)) {
3253 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3254 		int cpu;
3255 
3256 		preempt_disable();
3257 		rcu_read_lock();
3258 
3259 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3260 		if (cpu < 0)
3261 			cpu = smp_processor_id();
3262 
3263 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3264 
3265 		rcu_read_unlock();
3266 		preempt_enable();
3267 	} else
3268 #endif
3269 	{
3270 		unsigned int qtail;
3271 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3272 		put_cpu();
3273 	}
3274 	return ret;
3275 }
3276 EXPORT_SYMBOL(netif_rx);
3277 
/*
 * netif_rx_ni - netif_rx() followed by an immediate run of any pending
 * softirq, all with preemption disabled.  Returns what netif_rx()
 * returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3291 
3292 static void net_tx_action(struct softirq_action *h)
3293 {
3294 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3295 
3296 	if (sd->completion_queue) {
3297 		struct sk_buff *clist;
3298 
3299 		local_irq_disable();
3300 		clist = sd->completion_queue;
3301 		sd->completion_queue = NULL;
3302 		local_irq_enable();
3303 
3304 		while (clist) {
3305 			struct sk_buff *skb = clist;
3306 			clist = clist->next;
3307 
3308 			WARN_ON(atomic_read(&skb->users));
3309 			trace_kfree_skb(skb, net_tx_action);
3310 			__kfree_skb(skb);
3311 		}
3312 	}
3313 
3314 	if (sd->output_queue) {
3315 		struct Qdisc *head;
3316 
3317 		local_irq_disable();
3318 		head = sd->output_queue;
3319 		sd->output_queue = NULL;
3320 		sd->output_queue_tailp = &sd->output_queue;
3321 		local_irq_enable();
3322 
3323 		while (head) {
3324 			struct Qdisc *q = head;
3325 			spinlock_t *root_lock;
3326 
3327 			head = head->next_sched;
3328 
3329 			root_lock = qdisc_lock(q);
3330 			if (spin_trylock(root_lock)) {
3331 				smp_mb__before_clear_bit();
3332 				clear_bit(__QDISC_STATE_SCHED,
3333 					  &q->state);
3334 				qdisc_run(q);
3335 				spin_unlock(root_lock);
3336 			} else {
3337 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3338 					      &q->state)) {
3339 					__netif_reschedule(q);
3340 				} else {
3341 					smp_mb__before_clear_bit();
3342 					clear_bit(__QDISC_STATE_SCHED,
3343 						  &q->state);
3344 				}
3345 			}
3346 		}
3347 	}
3348 }
3349 
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/*
 * Hook pointer defined here on behalf of ATM LANE; only built when both
 * bridging and ATM LANE are configured (built-in or modular).
 */
int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr)
	__read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3357 
3358 #ifdef CONFIG_NET_CLS_ACT
3359 /* TODO: Maybe we should just force sch_ingress to be compiled in
3360  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3361  * a compare and 2 stores extra right now if we dont have it on
3362  * but have CONFIG_NET_CLS_ACT
3363  * NOTE: This doesn't stop any functionality; if you dont have
3364  * the ingress scheduler, you just can't add policies on ingress.
3365  *
3366  */
3367 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3368 {
3369 	struct net_device *dev = skb->dev;
3370 	u32 ttl = G_TC_RTTL(skb->tc_verd);
3371 	int result = TC_ACT_OK;
3372 	struct Qdisc *q;
3373 
3374 	if (unlikely(MAX_RED_LOOP < ttl++)) {
3375 		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3376 				     skb->skb_iif, dev->ifindex);
3377 		return TC_ACT_SHOT;
3378 	}
3379 
3380 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3381 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3382 
3383 	q = rxq->qdisc;
3384 	if (q != &noop_qdisc) {
3385 		spin_lock(qdisc_lock(q));
3386 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3387 			result = qdisc_enqueue_root(skb, q);
3388 		spin_unlock(qdisc_lock(q));
3389 	}
3390 
3391 	return result;
3392 }
3393 
3394 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3395 					 struct packet_type **pt_prev,
3396 					 int *ret, struct net_device *orig_dev)
3397 {
3398 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3399 
3400 	if (!rxq || rxq->qdisc == &noop_qdisc)
3401 		goto out;
3402 
3403 	if (*pt_prev) {
3404 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3405 		*pt_prev = NULL;
3406 	}
3407 
3408 	switch (ing_filter(skb, rxq)) {
3409 	case TC_ACT_SHOT:
3410 	case TC_ACT_STOLEN:
3411 		kfree_skb(skb);
3412 		return NULL;
3413 	}
3414 
3415 out:
3416 	skb->tc_verd = 0;
3417 	return skb;
3418 }
3419 #endif
3420 
3421 /**
3422  *	netdev_rx_handler_register - register receive handler
3423  *	@dev: device to register a handler for
3424  *	@rx_handler: receive handler to register
3425  *	@rx_handler_data: data pointer that is used by rx handler
3426  *
3427  *	Register a receive hander for a device. This handler will then be
3428  *	called from __netif_receive_skb. A negative errno code is returned
3429  *	on a failure.
3430  *
3431  *	The caller must hold the rtnl_mutex.
3432  *
3433  *	For a general description of rx_handler, see enum rx_handler_result.
3434  */
3435 int netdev_rx_handler_register(struct net_device *dev,
3436 			       rx_handler_func_t *rx_handler,
3437 			       void *rx_handler_data)
3438 {
3439 	ASSERT_RTNL();
3440 
3441 	if (dev->rx_handler)
3442 		return -EBUSY;
3443 
3444 	/* Note: rx_handler_data must be set before rx_handler */
3445 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3446 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3447 
3448 	return 0;
3449 }
3450 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3451 
3452 /**
3453  *	netdev_rx_handler_unregister - unregister receive handler
3454  *	@dev: device to unregister a handler from
3455  *
3456  *	Unregister a receive handler from a device.
3457  *
3458  *	The caller must hold the rtnl_mutex.
3459  */
3460 void netdev_rx_handler_unregister(struct net_device *dev)
3461 {
3462 
3463 	ASSERT_RTNL();
3464 	RCU_INIT_POINTER(dev->rx_handler, NULL);
3465 	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3466 	 * section has a guarantee to see a non NULL rx_handler_data
3467 	 * as well.
3468 	 */
3469 	synchronize_net();
3470 	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3471 }
3472 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3473 
3474 /*
3475  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3476  * the special handling of PFMEMALLOC skbs.
3477  */
3478 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3479 {
3480 	switch (skb->protocol) {
3481 	case __constant_htons(ETH_P_ARP):
3482 	case __constant_htons(ETH_P_IP):
3483 	case __constant_htons(ETH_P_IPV6):
3484 	case __constant_htons(ETH_P_8021Q):
3485 	case __constant_htons(ETH_P_8021AD):
3486 		return true;
3487 	default:
3488 		return false;
3489 	}
3490 }
3491 
3492 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3493 {
3494 	struct packet_type *ptype, *pt_prev;
3495 	rx_handler_func_t *rx_handler;
3496 	struct net_device *orig_dev;
3497 	struct net_device *null_or_dev;
3498 	bool deliver_exact = false;
3499 	int ret = NET_RX_DROP;
3500 	__be16 type;
3501 
3502 	net_timestamp_check(!netdev_tstamp_prequeue, skb);
3503 
3504 	trace_netif_receive_skb(skb);
3505 
3506 	/* if we've gotten here through NAPI, check netpoll */
3507 	if (netpoll_receive_skb(skb))
3508 		goto out;
3509 
3510 	orig_dev = skb->dev;
3511 
3512 	skb_reset_network_header(skb);
3513 	if (!skb_transport_header_was_set(skb))
3514 		skb_reset_transport_header(skb);
3515 	skb_reset_mac_len(skb);
3516 
3517 	pt_prev = NULL;
3518 
3519 	rcu_read_lock();
3520 
3521 another_round:
3522 	skb->skb_iif = skb->dev->ifindex;
3523 
3524 	__this_cpu_inc(softnet_data.processed);
3525 
3526 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3527 	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3528 		skb = vlan_untag(skb);
3529 		if (unlikely(!skb))
3530 			goto unlock;
3531 	}
3532 
3533 #ifdef CONFIG_NET_CLS_ACT
3534 	if (skb->tc_verd & TC_NCLS) {
3535 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3536 		goto ncls;
3537 	}
3538 #endif
3539 
3540 	if (pfmemalloc)
3541 		goto skip_taps;
3542 
3543 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3544 		if (!ptype->dev || ptype->dev == skb->dev) {
3545 			if (pt_prev)
3546 				ret = deliver_skb(skb, pt_prev, orig_dev);
3547 			pt_prev = ptype;
3548 		}
3549 	}
3550 
3551 skip_taps:
3552 #ifdef CONFIG_NET_CLS_ACT
3553 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3554 	if (!skb)
3555 		goto unlock;
3556 ncls:
3557 #endif
3558 
3559 	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3560 		goto drop;
3561 
3562 	if (vlan_tx_tag_present(skb)) {
3563 		if (pt_prev) {
3564 			ret = deliver_skb(skb, pt_prev, orig_dev);
3565 			pt_prev = NULL;
3566 		}
3567 		if (vlan_do_receive(&skb))
3568 			goto another_round;
3569 		else if (unlikely(!skb))
3570 			goto unlock;
3571 	}
3572 
3573 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3574 	if (rx_handler) {
3575 		if (pt_prev) {
3576 			ret = deliver_skb(skb, pt_prev, orig_dev);
3577 			pt_prev = NULL;
3578 		}
3579 		switch (rx_handler(&skb)) {
3580 		case RX_HANDLER_CONSUMED:
3581 			ret = NET_RX_SUCCESS;
3582 			goto unlock;
3583 		case RX_HANDLER_ANOTHER:
3584 			goto another_round;
3585 		case RX_HANDLER_EXACT:
3586 			deliver_exact = true;
3587 		case RX_HANDLER_PASS:
3588 			break;
3589 		default:
3590 			BUG();
3591 		}
3592 	}
3593 
3594 	if (unlikely(vlan_tx_tag_present(skb))) {
3595 		if (vlan_tx_tag_get_id(skb))
3596 			skb->pkt_type = PACKET_OTHERHOST;
3597 		/* Note: we might in the future use prio bits
3598 		 * and set skb->priority like in vlan_do_receive()
3599 		 * For the time being, just ignore Priority Code Point
3600 		 */
3601 		skb->vlan_tci = 0;
3602 	}
3603 
3604 	/* deliver only exact match when indicated */
3605 	null_or_dev = deliver_exact ? skb->dev : NULL;
3606 
3607 	type = skb->protocol;
3608 	list_for_each_entry_rcu(ptype,
3609 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3610 		if (ptype->type == type &&
3611 		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3612 		     ptype->dev == orig_dev)) {
3613 			if (pt_prev)
3614 				ret = deliver_skb(skb, pt_prev, orig_dev);
3615 			pt_prev = ptype;
3616 		}
3617 	}
3618 
3619 	if (pt_prev) {
3620 		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3621 			goto drop;
3622 		else
3623 			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3624 	} else {
3625 drop:
3626 		atomic_long_inc(&skb->dev->rx_dropped);
3627 		kfree_skb(skb);
3628 		/* Jamal, now you will not able to escape explaining
3629 		 * me how you were going to use this. :-)
3630 		 */
3631 		ret = NET_RX_DROP;
3632 	}
3633 
3634 unlock:
3635 	rcu_read_unlock();
3636 out:
3637 	return ret;
3638 }
3639 
3640 static int __netif_receive_skb(struct sk_buff *skb)
3641 {
3642 	int ret;
3643 
3644 	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3645 		unsigned long pflags = current->flags;
3646 
3647 		/*
3648 		 * PFMEMALLOC skbs are special, they should
3649 		 * - be delivered to SOCK_MEMALLOC sockets only
3650 		 * - stay away from userspace
3651 		 * - have bounded memory usage
3652 		 *
3653 		 * Use PF_MEMALLOC as this saves us from propagating the allocation
3654 		 * context down to all allocation sites.
3655 		 */
3656 		current->flags |= PF_MEMALLOC;
3657 		ret = __netif_receive_skb_core(skb, true);
3658 		tsk_restore_flags(current, pflags, PF_MEMALLOC);
3659 	} else
3660 		ret = __netif_receive_skb_core(skb, false);
3661 
3662 	return ret;
3663 }
3664 
3665 /**
3666  *	netif_receive_skb - process receive buffer from network
3667  *	@skb: buffer to process
3668  *
3669  *	netif_receive_skb() is the main receive data processing function.
3670  *	It always succeeds. The buffer may be dropped during processing
3671  *	for congestion control or by the protocol layers.
3672  *
3673  *	This function may only be called from softirq context and interrupts
3674  *	should be enabled.
3675  *
3676  *	Return values (usually ignored):
3677  *	NET_RX_SUCCESS: no congestion
3678  *	NET_RX_DROP: packet was dropped
3679  */
3680 int netif_receive_skb(struct sk_buff *skb)
3681 {
3682 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3683 
3684 	if (skb_defer_rx_timestamp(skb))
3685 		return NET_RX_SUCCESS;
3686 
3687 #ifdef CONFIG_RPS
3688 	if (static_key_false(&rps_needed)) {
3689 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3690 		int cpu, ret;
3691 
3692 		rcu_read_lock();
3693 
3694 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3695 
3696 		if (cpu >= 0) {
3697 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3698 			rcu_read_unlock();
3699 			return ret;
3700 		}
3701 		rcu_read_unlock();
3702 	}
3703 #endif
3704 	return __netif_receive_skb(skb);
3705 }
3706 EXPORT_SYMBOL(netif_receive_skb);
3707 
3708 /* Network device is going away, flush any packets still pending
3709  * Called with irqs disabled.
3710  */
3711 static void flush_backlog(void *arg)
3712 {
3713 	struct net_device *dev = arg;
3714 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3715 	struct sk_buff *skb, *tmp;
3716 
3717 	rps_lock(sd);
3718 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3719 		if (skb->dev == dev) {
3720 			__skb_unlink(skb, &sd->input_pkt_queue);
3721 			kfree_skb(skb);
3722 			input_queue_head_incr(sd);
3723 		}
3724 	}
3725 	rps_unlock(sd);
3726 
3727 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3728 		if (skb->dev == dev) {
3729 			__skb_unlink(skb, &sd->process_queue);
3730 			kfree_skb(skb);
3731 			input_queue_head_incr(sd);
3732 		}
3733 	}
3734 }
3735 
3736 static int napi_gro_complete(struct sk_buff *skb)
3737 {
3738 	struct packet_offload *ptype;
3739 	__be16 type = skb->protocol;
3740 	struct list_head *head = &offload_base;
3741 	int err = -ENOENT;
3742 
3743 	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3744 
3745 	if (NAPI_GRO_CB(skb)->count == 1) {
3746 		skb_shinfo(skb)->gso_size = 0;
3747 		goto out;
3748 	}
3749 
3750 	rcu_read_lock();
3751 	list_for_each_entry_rcu(ptype, head, list) {
3752 		if (ptype->type != type || !ptype->callbacks.gro_complete)
3753 			continue;
3754 
3755 		err = ptype->callbacks.gro_complete(skb);
3756 		break;
3757 	}
3758 	rcu_read_unlock();
3759 
3760 	if (err) {
3761 		WARN_ON(&ptype->list == head);
3762 		kfree_skb(skb);
3763 		return NET_RX_SUCCESS;
3764 	}
3765 
3766 out:
3767 	return netif_receive_skb(skb);
3768 }
3769 
3770 /* napi->gro_list contains packets ordered by age.
3771  * youngest packets at the head of it.
3772  * Complete skbs in reverse order to reduce latencies.
3773  */
3774 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3775 {
3776 	struct sk_buff *skb, *prev = NULL;
3777 
3778 	/* scan list and build reverse chain */
3779 	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3780 		skb->prev = prev;
3781 		prev = skb;
3782 	}
3783 
3784 	for (skb = prev; skb; skb = prev) {
3785 		skb->next = NULL;
3786 
3787 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3788 			return;
3789 
3790 		prev = skb->prev;
3791 		napi_gro_complete(skb);
3792 		napi->gro_count--;
3793 	}
3794 
3795 	napi->gro_list = NULL;
3796 }
3797 EXPORT_SYMBOL(napi_gro_flush);
3798 
3799 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3800 {
3801 	struct sk_buff *p;
3802 	unsigned int maclen = skb->dev->hard_header_len;
3803 
3804 	for (p = napi->gro_list; p; p = p->next) {
3805 		unsigned long diffs;
3806 
3807 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3808 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3809 		if (maclen == ETH_HLEN)
3810 			diffs |= compare_ether_header(skb_mac_header(p),
3811 						      skb_gro_mac_header(skb));
3812 		else if (!diffs)
3813 			diffs = memcmp(skb_mac_header(p),
3814 				       skb_gro_mac_header(skb),
3815 				       maclen);
3816 		NAPI_GRO_CB(p)->same_flow = !diffs;
3817 		NAPI_GRO_CB(p)->flush = 0;
3818 	}
3819 }
3820 
3821 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3822 {
3823 	struct sk_buff **pp = NULL;
3824 	struct packet_offload *ptype;
3825 	__be16 type = skb->protocol;
3826 	struct list_head *head = &offload_base;
3827 	int same_flow;
3828 	enum gro_result ret;
3829 
3830 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3831 		goto normal;
3832 
3833 	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3834 		goto normal;
3835 
3836 	gro_list_prepare(napi, skb);
3837 
3838 	rcu_read_lock();
3839 	list_for_each_entry_rcu(ptype, head, list) {
3840 		if (ptype->type != type || !ptype->callbacks.gro_receive)
3841 			continue;
3842 
3843 		skb_set_network_header(skb, skb_gro_offset(skb));
3844 		skb_reset_mac_len(skb);
3845 		NAPI_GRO_CB(skb)->same_flow = 0;
3846 		NAPI_GRO_CB(skb)->flush = 0;
3847 		NAPI_GRO_CB(skb)->free = 0;
3848 
3849 		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3850 		break;
3851 	}
3852 	rcu_read_unlock();
3853 
3854 	if (&ptype->list == head)
3855 		goto normal;
3856 
3857 	same_flow = NAPI_GRO_CB(skb)->same_flow;
3858 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3859 
3860 	if (pp) {
3861 		struct sk_buff *nskb = *pp;
3862 
3863 		*pp = nskb->next;
3864 		nskb->next = NULL;
3865 		napi_gro_complete(nskb);
3866 		napi->gro_count--;
3867 	}
3868 
3869 	if (same_flow)
3870 		goto ok;
3871 
3872 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3873 		goto normal;
3874 
3875 	napi->gro_count++;
3876 	NAPI_GRO_CB(skb)->count = 1;
3877 	NAPI_GRO_CB(skb)->age = jiffies;
3878 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3879 	skb->next = napi->gro_list;
3880 	napi->gro_list = skb;
3881 	ret = GRO_HELD;
3882 
3883 pull:
3884 	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3885 		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3886 
3887 		BUG_ON(skb->end - skb->tail < grow);
3888 
3889 		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3890 
3891 		skb->tail += grow;
3892 		skb->data_len -= grow;
3893 
3894 		skb_shinfo(skb)->frags[0].page_offset += grow;
3895 		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3896 
3897 		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3898 			skb_frag_unref(skb, 0);
3899 			memmove(skb_shinfo(skb)->frags,
3900 				skb_shinfo(skb)->frags + 1,
3901 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3902 		}
3903 	}
3904 
3905 ok:
3906 	return ret;
3907 
3908 normal:
3909 	ret = GRO_NORMAL;
3910 	goto pull;
3911 }
3912 
3913 
3914 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3915 {
3916 	switch (ret) {
3917 	case GRO_NORMAL:
3918 		if (netif_receive_skb(skb))
3919 			ret = GRO_DROP;
3920 		break;
3921 
3922 	case GRO_DROP:
3923 		kfree_skb(skb);
3924 		break;
3925 
3926 	case GRO_MERGED_FREE:
3927 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3928 			kmem_cache_free(skbuff_head_cache, skb);
3929 		else
3930 			__kfree_skb(skb);
3931 		break;
3932 
3933 	case GRO_HELD:
3934 	case GRO_MERGED:
3935 		break;
3936 	}
3937 
3938 	return ret;
3939 }
3940 
3941 static void skb_gro_reset_offset(struct sk_buff *skb)
3942 {
3943 	const struct skb_shared_info *pinfo = skb_shinfo(skb);
3944 	const skb_frag_t *frag0 = &pinfo->frags[0];
3945 
3946 	NAPI_GRO_CB(skb)->data_offset = 0;
3947 	NAPI_GRO_CB(skb)->frag0 = NULL;
3948 	NAPI_GRO_CB(skb)->frag0_len = 0;
3949 
3950 	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3951 	    pinfo->nr_frags &&
3952 	    !PageHighMem(skb_frag_page(frag0))) {
3953 		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3954 		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3955 	}
3956 }
3957 
3958 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3959 {
3960 	skb_gro_reset_offset(skb);
3961 
3962 	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3963 }
3964 EXPORT_SYMBOL(napi_gro_receive);
3965 
3966 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3967 {
3968 	__skb_pull(skb, skb_headlen(skb));
3969 	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
3970 	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3971 	skb->vlan_tci = 0;
3972 	skb->dev = napi->dev;
3973 	skb->skb_iif = 0;
3974 
3975 	napi->skb = skb;
3976 }
3977 
3978 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3979 {
3980 	struct sk_buff *skb = napi->skb;
3981 
3982 	if (!skb) {
3983 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3984 		if (skb)
3985 			napi->skb = skb;
3986 	}
3987 	return skb;
3988 }
3989 EXPORT_SYMBOL(napi_get_frags);
3990 
3991 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3992 			       gro_result_t ret)
3993 {
3994 	switch (ret) {
3995 	case GRO_NORMAL:
3996 	case GRO_HELD:
3997 		skb->protocol = eth_type_trans(skb, skb->dev);
3998 
3999 		if (ret == GRO_HELD)
4000 			skb_gro_pull(skb, -ETH_HLEN);
4001 		else if (netif_receive_skb(skb))
4002 			ret = GRO_DROP;
4003 		break;
4004 
4005 	case GRO_DROP:
4006 	case GRO_MERGED_FREE:
4007 		napi_reuse_skb(napi, skb);
4008 		break;
4009 
4010 	case GRO_MERGED:
4011 		break;
4012 	}
4013 
4014 	return ret;
4015 }
4016 
4017 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4018 {
4019 	struct sk_buff *skb = napi->skb;
4020 	struct ethhdr *eth;
4021 	unsigned int hlen;
4022 	unsigned int off;
4023 
4024 	napi->skb = NULL;
4025 
4026 	skb_reset_mac_header(skb);
4027 	skb_gro_reset_offset(skb);
4028 
4029 	off = skb_gro_offset(skb);
4030 	hlen = off + sizeof(*eth);
4031 	eth = skb_gro_header_fast(skb, off);
4032 	if (skb_gro_header_hard(skb, hlen)) {
4033 		eth = skb_gro_header_slow(skb, hlen, off);
4034 		if (unlikely(!eth)) {
4035 			napi_reuse_skb(napi, skb);
4036 			skb = NULL;
4037 			goto out;
4038 		}
4039 	}
4040 
4041 	skb_gro_pull(skb, sizeof(*eth));
4042 
4043 	/*
4044 	 * This works because the only protocols we care about don't require
4045 	 * special handling.  We'll fix it up properly at the end.
4046 	 */
4047 	skb->protocol = eth->h_proto;
4048 
4049 out:
4050 	return skb;
4051 }
4052 
4053 gro_result_t napi_gro_frags(struct napi_struct *napi)
4054 {
4055 	struct sk_buff *skb = napi_frags_skb(napi);
4056 
4057 	if (!skb)
4058 		return GRO_DROP;
4059 
4060 	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4061 }
4062 EXPORT_SYMBOL(napi_gro_frags);
4063 
/*
 * net_rps_action_and_irq_enable - send any IPIs pending for RPS.
 * Note: must be entered with local irqs disabled; always leaves with
 * local irqs enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (remsd) {
		/* take ownership of the list before re-enabling irqs */
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		while (remsd) {
			struct softnet_data *next = remsd->rps_ipi_next;

			if (cpu_online(remsd->cpu))
				__smp_call_function_single(remsd->cpu,
							   &remsd->csd, 0);
			remsd = next;
		}
		return;
	}
#endif
	local_irq_enable();
}
4091 
4092 static int process_backlog(struct napi_struct *napi, int quota)
4093 {
4094 	int work = 0;
4095 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4096 
4097 #ifdef CONFIG_RPS
4098 	/* Check if we have pending ipi, its better to send them now,
4099 	 * not waiting net_rx_action() end.
4100 	 */
4101 	if (sd->rps_ipi_list) {
4102 		local_irq_disable();
4103 		net_rps_action_and_irq_enable(sd);
4104 	}
4105 #endif
4106 	napi->weight = weight_p;
4107 	local_irq_disable();
4108 	while (work < quota) {
4109 		struct sk_buff *skb;
4110 		unsigned int qlen;
4111 
4112 		while ((skb = __skb_dequeue(&sd->process_queue))) {
4113 			local_irq_enable();
4114 			__netif_receive_skb(skb);
4115 			local_irq_disable();
4116 			input_queue_head_incr(sd);
4117 			if (++work >= quota) {
4118 				local_irq_enable();
4119 				return work;
4120 			}
4121 		}
4122 
4123 		rps_lock(sd);
4124 		qlen = skb_queue_len(&sd->input_pkt_queue);
4125 		if (qlen)
4126 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
4127 						   &sd->process_queue);
4128 
4129 		if (qlen < quota - work) {
4130 			/*
4131 			 * Inline a custom version of __napi_complete().
4132 			 * only current cpu owns and manipulates this napi,
4133 			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4134 			 * we can use a plain write instead of clear_bit(),
4135 			 * and we dont need an smp_mb() memory barrier.
4136 			 */
4137 			list_del(&napi->poll_list);
4138 			napi->state = 0;
4139 
4140 			quota = work + qlen;
4141 		}
4142 		rps_unlock(sd);
4143 	}
4144 	local_irq_enable();
4145 
4146 	return work;
4147 }
4148 
4149 /**
4150  * __napi_schedule - schedule for receive
4151  * @n: entry to schedule
4152  *
4153  * The entry's receive function will be scheduled to run
4154  */
4155 void __napi_schedule(struct napi_struct *n)
4156 {
4157 	unsigned long flags;
4158 
4159 	local_irq_save(flags);
4160 	____napi_schedule(&__get_cpu_var(softnet_data), n);
4161 	local_irq_restore(flags);
4162 }
4163 EXPORT_SYMBOL(__napi_schedule);
4164 
4165 void __napi_complete(struct napi_struct *n)
4166 {
4167 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4168 	BUG_ON(n->gro_list);
4169 
4170 	list_del(&n->poll_list);
4171 	smp_mb__before_clear_bit();
4172 	clear_bit(NAPI_STATE_SCHED, &n->state);
4173 }
4174 EXPORT_SYMBOL(__napi_complete);
4175 
4176 void napi_complete(struct napi_struct *n)
4177 {
4178 	unsigned long flags;
4179 
4180 	/*
4181 	 * don't let napi dequeue from the cpu poll list
4182 	 * just in case its running on a different cpu
4183 	 */
4184 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4185 		return;
4186 
4187 	napi_gro_flush(n, false);
4188 	local_irq_save(flags);
4189 	__napi_complete(n);
4190 	local_irq_restore(flags);
4191 }
4192 EXPORT_SYMBOL(napi_complete);
4193 
4194 /* must be called under rcu_read_lock(), as we dont take a reference */
4195 struct napi_struct *napi_by_id(unsigned int napi_id)
4196 {
4197 	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4198 	struct napi_struct *napi;
4199 
4200 	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4201 		if (napi->napi_id == napi_id)
4202 			return napi;
4203 
4204 	return NULL;
4205 }
4206 EXPORT_SYMBOL_GPL(napi_by_id);
4207 
4208 void napi_hash_add(struct napi_struct *napi)
4209 {
4210 	if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4211 
4212 		spin_lock(&napi_hash_lock);
4213 
4214 		/* 0 is not a valid id, we also skip an id that is taken
4215 		 * we expect both events to be extremely rare
4216 		 */
4217 		napi->napi_id = 0;
4218 		while (!napi->napi_id) {
4219 			napi->napi_id = ++napi_gen_id;
4220 			if (napi_by_id(napi->napi_id))
4221 				napi->napi_id = 0;
4222 		}
4223 
4224 		hlist_add_head_rcu(&napi->napi_hash_node,
4225 			&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4226 
4227 		spin_unlock(&napi_hash_lock);
4228 	}
4229 }
4230 EXPORT_SYMBOL_GPL(napi_hash_add);
4231 
4232 /* Warning : caller is responsible to make sure rcu grace period
4233  * is respected before freeing memory containing @napi
4234  */
4235 void napi_hash_del(struct napi_struct *napi)
4236 {
4237 	spin_lock(&napi_hash_lock);
4238 
4239 	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4240 		hlist_del_rcu(&napi->napi_hash_node);
4241 
4242 	spin_unlock(&napi_hash_lock);
4243 }
4244 EXPORT_SYMBOL_GPL(napi_hash_del);
4245 
4246 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4247 		    int (*poll)(struct napi_struct *, int), int weight)
4248 {
4249 	INIT_LIST_HEAD(&napi->poll_list);
4250 	napi->gro_count = 0;
4251 	napi->gro_list = NULL;
4252 	napi->skb = NULL;
4253 	napi->poll = poll;
4254 	if (weight > NAPI_POLL_WEIGHT)
4255 		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4256 			    weight, dev->name);
4257 	napi->weight = weight;
4258 	list_add(&napi->dev_list, &dev->napi_list);
4259 	napi->dev = dev;
4260 #ifdef CONFIG_NETPOLL
4261 	spin_lock_init(&napi->poll_lock);
4262 	napi->poll_owner = -1;
4263 #endif
4264 	set_bit(NAPI_STATE_SCHED, &napi->state);
4265 }
4266 EXPORT_SYMBOL(netif_napi_add);
4267 
4268 void netif_napi_del(struct napi_struct *napi)
4269 {
4270 	struct sk_buff *skb, *next;
4271 
4272 	list_del_init(&napi->dev_list);
4273 	napi_free_frags(napi);
4274 
4275 	for (skb = napi->gro_list; skb; skb = next) {
4276 		next = skb->next;
4277 		skb->next = NULL;
4278 		kfree_skb(skb);
4279 	}
4280 
4281 	napi->gro_list = NULL;
4282 	napi->gro_count = 0;
4283 }
4284 EXPORT_SYMBOL(netif_napi_del);
4285 
/* Poll the NAPI instances queued on this CPU's softnet_data poll_list.
 *
 * Instances are polled from the head of the list until the list is empty,
 * the budget (initialised from netdev_budget) is used up, or two jiffies
 * have passed.  When stopping early, time_squeeze is bumped and
 * NET_RX_SOFTIRQ is raised again.  @h is not used.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	void *have;

	local_irq_disable();

	while (!list_empty(&sd->poll_list)) {
		struct napi_struct *n;
		int work, weight;

		/* If the softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies, which will allow
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
			goto softnet_break;

		local_irq_enable();

		/* Even though interrupts have been re-enabled, this
		 * access is safe because interrupts can only add new
		 * entries to the tail of this list, and only ->poll()
		 * calls can remove this head entry from the list.
		 */
		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

		have = netpoll_poll_lock(n);

		weight = n->weight;

		/* This NAPI_STATE_SCHED test is for avoiding a race
		 * with netpoll's poll_napi().  Only the entity which
		 * obtains the lock and sees NAPI_STATE_SCHED set will
		 * actually make the ->poll() call.  Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
		 */
		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
			work = n->poll(n, weight);
			trace_napi_poll(n);
		}

		/* a ->poll() must never report more work than it was allowed */
		WARN_ON_ONCE(work > weight);

		budget -= work;

		local_irq_disable();

		/* Drivers must not modify the NAPI state if they
		 * consume the entire weight.  In such cases this code
		 * still "owns" the NAPI instance and therefore can
		 * move the instance around on the list at-will.
		 */
		if (unlikely(work == weight)) {
			if (unlikely(napi_disable_pending(n))) {
				local_irq_enable();
				napi_complete(n);
				local_irq_disable();
			} else {
				if (n->gro_list) {
					/* Flush packets that are too old.
					 * If HZ < 1000, flush all packets.
					 */
					local_irq_enable();
					napi_gro_flush(n, HZ >= 1000);
					local_irq_disable();
				}
				/* rotate to the tail so other instances get a turn */
				list_move_tail(&n->poll_list, &sd->poll_list);
			}
		}

		netpoll_poll_unlock(have);
	}
out:
	/* reached with interrupts disabled; the helper re-enables them */
	net_rps_action_and_irq_enable(sd);

#ifdef CONFIG_NET_DMA
	/*
	 * There may not be any more sk_buffs coming right now, so push
	 * any pending DMA copies to hardware
	 */
	dma_issue_pending_all();
#endif

	return;

softnet_break:
	/* ran out of budget or time: account for it and ask to be run again */
	sd->time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}
4380 
/* One entry of a device's upper or lower adjacency list: it records that
 * @dev is linked to the device owning the list.
 */
struct netdev_adjacent {
	/* the adjacent device; dev_hold() is taken on it when the entry is
	 * created and dropped with dev_put() when the entry is removed
	 */
	struct net_device *dev;

	/* upper master flag, there can only be one master device per list */
	bool master;

	/* counter for the number of times this device was added to us */
	u16 ref_nr;

	/* private field for the users */
	void *private;

	/* linkage into the owning adjacency list */
	struct list_head list;
	/* used by kfree_rcu() to free the entry after removal */
	struct rcu_head rcu;
};
4396 
4397 static struct netdev_adjacent *__netdev_find_adj_rcu(struct net_device *dev,
4398 						     struct net_device *adj_dev,
4399 						     struct list_head *adj_list)
4400 {
4401 	struct netdev_adjacent *adj;
4402 
4403 	list_for_each_entry_rcu(adj, adj_list, list) {
4404 		if (adj->dev == adj_dev)
4405 			return adj;
4406 	}
4407 	return NULL;
4408 }
4409 
4410 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4411 						 struct net_device *adj_dev,
4412 						 struct list_head *adj_list)
4413 {
4414 	struct netdev_adjacent *adj;
4415 
4416 	list_for_each_entry(adj, adj_list, list) {
4417 		if (adj->dev == adj_dev)
4418 			return adj;
4419 	}
4420 	return NULL;
4421 }
4422 
4423 /**
4424  * netdev_has_upper_dev - Check if device is linked to an upper device
4425  * @dev: device
4426  * @upper_dev: upper device to check
4427  *
4428  * Find out if a device is linked to specified upper device and return true
4429  * in case it is. Note that this checks only immediate upper device,
4430  * not through a complete stack of devices. The caller must hold the RTNL lock.
4431  */
4432 bool netdev_has_upper_dev(struct net_device *dev,
4433 			  struct net_device *upper_dev)
4434 {
4435 	ASSERT_RTNL();
4436 
4437 	return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4438 }
4439 EXPORT_SYMBOL(netdev_has_upper_dev);
4440 
4441 /**
4442  * netdev_has_any_upper_dev - Check if device is linked to some device
4443  * @dev: device
4444  *
4445  * Find out if a device is linked to an upper device and return true in case
4446  * it is. The caller must hold the RTNL lock.
4447  */
4448 bool netdev_has_any_upper_dev(struct net_device *dev)
4449 {
4450 	ASSERT_RTNL();
4451 
4452 	return !list_empty(&dev->all_adj_list.upper);
4453 }
4454 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4455 
4456 /**
4457  * netdev_master_upper_dev_get - Get master upper device
4458  * @dev: device
4459  *
4460  * Find a master upper device and return pointer to it or NULL in case
4461  * it's not there. The caller must hold the RTNL lock.
4462  */
4463 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4464 {
4465 	struct netdev_adjacent *upper;
4466 
4467 	ASSERT_RTNL();
4468 
4469 	if (list_empty(&dev->adj_list.upper))
4470 		return NULL;
4471 
4472 	upper = list_first_entry(&dev->adj_list.upper,
4473 				 struct netdev_adjacent, list);
4474 	if (likely(upper->master))
4475 		return upper->dev;
4476 	return NULL;
4477 }
4478 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4479 
4480 void *netdev_adjacent_get_private(struct list_head *adj_list)
4481 {
4482 	struct netdev_adjacent *adj;
4483 
4484 	adj = list_entry(adj_list, struct netdev_adjacent, list);
4485 
4486 	return adj->private;
4487 }
4488 EXPORT_SYMBOL(netdev_adjacent_get_private);
4489 
4490 /**
4491  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4492  * @dev: device
4493  * @iter: list_head ** of the current position
4494  *
4495  * Gets the next device from the dev's upper list, starting from iter
4496  * position. The caller must hold RCU read lock.
4497  */
4498 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4499 						     struct list_head **iter)
4500 {
4501 	struct netdev_adjacent *upper;
4502 
4503 	WARN_ON_ONCE(!rcu_read_lock_held());
4504 
4505 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4506 
4507 	if (&upper->list == &dev->all_adj_list.upper)
4508 		return NULL;
4509 
4510 	*iter = &upper->list;
4511 
4512 	return upper->dev;
4513 }
4514 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4515 
4516 /**
4517  * netdev_lower_get_next_private - Get the next ->private from the
4518  *				   lower neighbour list
4519  * @dev: device
4520  * @iter: list_head ** of the current position
4521  *
4522  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4523  * list, starting from iter position. The caller must hold either hold the
4524  * RTNL lock or its own locking that guarantees that the neighbour lower
4525  * list will remain unchainged.
4526  */
4527 void *netdev_lower_get_next_private(struct net_device *dev,
4528 				    struct list_head **iter)
4529 {
4530 	struct netdev_adjacent *lower;
4531 
4532 	lower = list_entry(*iter, struct netdev_adjacent, list);
4533 
4534 	if (&lower->list == &dev->adj_list.lower)
4535 		return NULL;
4536 
4537 	if (iter)
4538 		*iter = lower->list.next;
4539 
4540 	return lower->private;
4541 }
4542 EXPORT_SYMBOL(netdev_lower_get_next_private);
4543 
4544 /**
4545  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4546  *				       lower neighbour list, RCU
4547  *				       variant
4548  * @dev: device
4549  * @iter: list_head ** of the current position
4550  *
4551  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4552  * list, starting from iter position. The caller must hold RCU read lock.
4553  */
4554 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4555 					struct list_head **iter)
4556 {
4557 	struct netdev_adjacent *lower;
4558 
4559 	WARN_ON_ONCE(!rcu_read_lock_held());
4560 
4561 	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4562 
4563 	if (&lower->list == &dev->adj_list.lower)
4564 		return NULL;
4565 
4566 	if (iter)
4567 		*iter = &lower->list;
4568 
4569 	return lower->private;
4570 }
4571 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4572 
4573 /**
4574  * netdev_master_upper_dev_get_rcu - Get master upper device
4575  * @dev: device
4576  *
4577  * Find a master upper device and return pointer to it or NULL in case
4578  * it's not there. The caller must hold the RCU read lock.
4579  */
4580 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4581 {
4582 	struct netdev_adjacent *upper;
4583 
4584 	upper = list_first_or_null_rcu(&dev->adj_list.upper,
4585 				       struct netdev_adjacent, list);
4586 	if (upper && likely(upper->master))
4587 		return upper->dev;
4588 	return NULL;
4589 }
4590 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4591 
4592 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4593 					struct net_device *adj_dev,
4594 					struct list_head *dev_list,
4595 					void *private, bool master)
4596 {
4597 	struct netdev_adjacent *adj;
4598 	char linkname[IFNAMSIZ+7];
4599 	int ret;
4600 
4601 	adj = __netdev_find_adj(dev, adj_dev, dev_list);
4602 
4603 	if (adj) {
4604 		adj->ref_nr++;
4605 		return 0;
4606 	}
4607 
4608 	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4609 	if (!adj)
4610 		return -ENOMEM;
4611 
4612 	adj->dev = adj_dev;
4613 	adj->master = master;
4614 	adj->ref_nr = 1;
4615 	adj->private = private;
4616 	dev_hold(adj_dev);
4617 
4618 	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4619 		 adj_dev->name, dev->name, adj_dev->name);
4620 
4621 	if (dev_list == &dev->adj_list.lower) {
4622 		sprintf(linkname, "lower_%s", adj_dev->name);
4623 		ret = sysfs_create_link(&(dev->dev.kobj),
4624 					&(adj_dev->dev.kobj), linkname);
4625 		if (ret)
4626 			goto free_adj;
4627 	} else if (dev_list == &dev->adj_list.upper) {
4628 		sprintf(linkname, "upper_%s", adj_dev->name);
4629 		ret = sysfs_create_link(&(dev->dev.kobj),
4630 					&(adj_dev->dev.kobj), linkname);
4631 		if (ret)
4632 			goto free_adj;
4633 	}
4634 
4635 	/* Ensure that master link is always the first item in list. */
4636 	if (master) {
4637 		ret = sysfs_create_link(&(dev->dev.kobj),
4638 					&(adj_dev->dev.kobj), "master");
4639 		if (ret)
4640 			goto remove_symlinks;
4641 
4642 		list_add_rcu(&adj->list, dev_list);
4643 	} else {
4644 		list_add_tail_rcu(&adj->list, dev_list);
4645 	}
4646 
4647 	return 0;
4648 
4649 remove_symlinks:
4650 	if (dev_list == &dev->adj_list.lower) {
4651 		sprintf(linkname, "lower_%s", adj_dev->name);
4652 		sysfs_remove_link(&(dev->dev.kobj), linkname);
4653 	} else if (dev_list == &dev->adj_list.upper) {
4654 		sprintf(linkname, "upper_%s", adj_dev->name);
4655 		sysfs_remove_link(&(dev->dev.kobj), linkname);
4656 	}
4657 
4658 free_adj:
4659 	kfree(adj);
4660 	dev_put(adj_dev);
4661 
4662 	return ret;
4663 }
4664 
4665 void __netdev_adjacent_dev_remove(struct net_device *dev,
4666 				  struct net_device *adj_dev,
4667 				  struct list_head *dev_list)
4668 {
4669 	struct netdev_adjacent *adj;
4670 	char linkname[IFNAMSIZ+7];
4671 
4672 	adj = __netdev_find_adj(dev, adj_dev, dev_list);
4673 
4674 	if (!adj) {
4675 		pr_err("tried to remove device %s from %s\n",
4676 		       dev->name, adj_dev->name);
4677 		BUG();
4678 	}
4679 
4680 	if (adj->ref_nr > 1) {
4681 		pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
4682 			 adj->ref_nr-1);
4683 		adj->ref_nr--;
4684 		return;
4685 	}
4686 
4687 	if (adj->master)
4688 		sysfs_remove_link(&(dev->dev.kobj), "master");
4689 
4690 	if (dev_list == &dev->adj_list.lower) {
4691 		sprintf(linkname, "lower_%s", adj_dev->name);
4692 		sysfs_remove_link(&(dev->dev.kobj), linkname);
4693 	} else if (dev_list == &dev->adj_list.upper) {
4694 		sprintf(linkname, "upper_%s", adj_dev->name);
4695 		sysfs_remove_link(&(dev->dev.kobj), linkname);
4696 	}
4697 
4698 	list_del_rcu(&adj->list);
4699 	pr_debug("dev_put for %s, because link removed from %s to %s\n",
4700 		 adj_dev->name, dev->name, adj_dev->name);
4701 	dev_put(adj_dev);
4702 	kfree_rcu(adj, rcu);
4703 }
4704 
4705 int __netdev_adjacent_dev_link_lists(struct net_device *dev,
4706 				     struct net_device *upper_dev,
4707 				     struct list_head *up_list,
4708 				     struct list_head *down_list,
4709 				     void *private, bool master)
4710 {
4711 	int ret;
4712 
4713 	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
4714 					   master);
4715 	if (ret)
4716 		return ret;
4717 
4718 	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
4719 					   false);
4720 	if (ret) {
4721 		__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4722 		return ret;
4723 	}
4724 
4725 	return 0;
4726 }
4727 
4728 int __netdev_adjacent_dev_link(struct net_device *dev,
4729 			       struct net_device *upper_dev)
4730 {
4731 	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
4732 						&dev->all_adj_list.upper,
4733 						&upper_dev->all_adj_list.lower,
4734 						NULL, false);
4735 }
4736 
/* Reverse of __netdev_adjacent_dev_link_lists(): drop @upper_dev from
 * @up_list first, then @dev from @down_list.
 */
void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					struct net_device *upper_dev,
					struct list_head *up_list,
					struct list_head *down_list)
{
	/* upward direction: dev -> upper_dev */
	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);

	/* downward direction: upper_dev -> dev */
	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
}
4745 
4746 void __netdev_adjacent_dev_unlink(struct net_device *dev,
4747 				  struct net_device *upper_dev)
4748 {
4749 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4750 					   &dev->all_adj_list.upper,
4751 					   &upper_dev->all_adj_list.lower);
4752 }
4753 
4754 int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
4755 					 struct net_device *upper_dev,
4756 					 void *private, bool master)
4757 {
4758 	int ret = __netdev_adjacent_dev_link(dev, upper_dev);
4759 
4760 	if (ret)
4761 		return ret;
4762 
4763 	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
4764 					       &dev->adj_list.upper,
4765 					       &upper_dev->adj_list.lower,
4766 					       private, master);
4767 	if (ret) {
4768 		__netdev_adjacent_dev_unlink(dev, upper_dev);
4769 		return ret;
4770 	}
4771 
4772 	return 0;
4773 }
4774 
4775 void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
4776 					    struct net_device *upper_dev)
4777 {
4778 	__netdev_adjacent_dev_unlink(dev, upper_dev);
4779 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4780 					   &dev->adj_list.upper,
4781 					   &upper_dev->adj_list.lower);
4782 }
4783 
/* Make @upper_dev an upper device of @dev and propagate the relationship
 * to every device already below @dev and above @upper_dev.
 *
 * @master marks the neighbour link as a master link, @private is stored
 * in the neighbour entries.  RTNL must be held (asserted).
 *
 * Returns 0 on success, after sending NETDEV_CHANGEUPPER for @dev.
 * Returns -EBUSY if @dev == @upper_dev, if @dev is already among
 * @upper_dev's upper devices (a loop), or if @master is set while @dev
 * already has a master; -EEXIST if the link already exists; otherwise the
 * error of the failing link step, after rolling back the links made here.
 */
static int __netdev_upper_dev_link(struct net_device *dev,
				   struct net_device *upper_dev, bool master,
				   void *private)
{
	struct netdev_adjacent *i, *j, *to_i, *to_j;
	int ret = 0;

	ASSERT_RTNL();

	if (dev == upper_dev)
		return -EBUSY;

	/* To prevent loops, check if dev is not upper device to upper_dev. */
	if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
		return -EBUSY;

	if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
		return -EEXIST;

	if (master && netdev_master_upper_dev_get(dev))
		return -EBUSY;

	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
						   master);
	if (ret)
		return ret;

	/* Now that we linked these devs, make all the upper_dev's
	 * all_adj_list.upper visible to every dev's all_adj_list.lower and
	 * vice versa, and don't forget the devices themselves. All of these
	 * links are non-neighbours.
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
			pr_debug("Interlinking %s with %s, non-neighbour\n",
				 i->dev->name, j->dev->name);
			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
			if (ret)
				goto rollback_mesh;
		}
	}

	/* add dev to every upper_dev's upper device */
	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
		pr_debug("linking %s's upper device %s with %s\n",
			 upper_dev->name, i->dev->name, dev->name);
		ret = __netdev_adjacent_dev_link(dev, i->dev);
		if (ret)
			goto rollback_upper_mesh;
	}

	/* add upper_dev to every dev's lower device */
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		pr_debug("linking %s's lower device %s with %s\n", dev->name,
			 i->dev->name, upper_dev->name);
		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
		if (ret)
			goto rollback_lower_mesh;
	}

	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
	return 0;

	/* Each rollback stage replays its loop and stops at the entry that
	 * failed (to_i / to_j); falling through to the next stage with the
	 * iterators set to NULL makes that stage undo its whole loop.
	 */
rollback_lower_mesh:
	to_i = i;
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		if (i == to_i)
			break;
		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
	}

	i = NULL;

rollback_upper_mesh:
	to_i = i;
	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
		if (i == to_i)
			break;
		__netdev_adjacent_dev_unlink(dev, i->dev);
	}

	i = j = NULL;

rollback_mesh:
	to_i = i;
	to_j = j;
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
			if (i == to_i && j == to_j)
				break;
			__netdev_adjacent_dev_unlink(i->dev, j->dev);
		}
		if (i == to_i)
			break;
	}

	/* finally drop the neighbour link created at the top */
	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

	return ret;
}
4884 
4885 /**
4886  * netdev_upper_dev_link - Add a link to the upper device
4887  * @dev: device
4888  * @upper_dev: new upper device
4889  *
4890  * Adds a link to device which is upper to this one. The caller must hold
4891  * the RTNL lock. On a failure a negative errno code is returned.
4892  * On success the reference counts are adjusted and the function
4893  * returns zero.
4894  */
4895 int netdev_upper_dev_link(struct net_device *dev,
4896 			  struct net_device *upper_dev)
4897 {
4898 	return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
4899 }
4900 EXPORT_SYMBOL(netdev_upper_dev_link);
4901 
4902 /**
4903  * netdev_master_upper_dev_link - Add a master link to the upper device
4904  * @dev: device
4905  * @upper_dev: new upper device
4906  *
4907  * Adds a link to device which is upper to this one. In this case, only
4908  * one master upper device can be linked, although other non-master devices
4909  * might be linked as well. The caller must hold the RTNL lock.
4910  * On a failure a negative errno code is returned. On success the reference
4911  * counts are adjusted and the function returns zero.
4912  */
4913 int netdev_master_upper_dev_link(struct net_device *dev,
4914 				 struct net_device *upper_dev)
4915 {
4916 	return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
4917 }
4918 EXPORT_SYMBOL(netdev_master_upper_dev_link);
4919 
4920 int netdev_master_upper_dev_link_private(struct net_device *dev,
4921 					 struct net_device *upper_dev,
4922 					 void *private)
4923 {
4924 	return __netdev_upper_dev_link(dev, upper_dev, true, private);
4925 }
4926 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
4927 
4928 /**
4929  * netdev_upper_dev_unlink - Removes a link to upper device
4930  * @dev: device
4931  * @upper_dev: new upper device
4932  *
4933  * Removes a link to device which is upper to this one. The caller must hold
4934  * the RTNL lock.
4935  */
4936 void netdev_upper_dev_unlink(struct net_device *dev,
4937 			     struct net_device *upper_dev)
4938 {
4939 	struct netdev_adjacent *i, *j;
4940 	ASSERT_RTNL();
4941 
4942 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
4943 
4944 	/* Here is the tricky part. We must remove all dev's lower
4945 	 * devices from all upper_dev's upper devices and vice
4946 	 * versa, to maintain the graph relationship.
4947 	 */
4948 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
4949 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
4950 			__netdev_adjacent_dev_unlink(i->dev, j->dev);
4951 
4952 	/* remove also the devices itself from lower/upper device
4953 	 * list
4954 	 */
4955 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
4956 		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
4957 
4958 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
4959 		__netdev_adjacent_dev_unlink(dev, i->dev);
4960 
4961 	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4962 }
4963 EXPORT_SYMBOL(netdev_upper_dev_unlink);
4964 
4965 void *netdev_lower_dev_get_private_rcu(struct net_device *dev,
4966 				       struct net_device *lower_dev)
4967 {
4968 	struct netdev_adjacent *lower;
4969 
4970 	if (!lower_dev)
4971 		return NULL;
4972 	lower = __netdev_find_adj_rcu(dev, lower_dev, &dev->adj_list.lower);
4973 	if (!lower)
4974 		return NULL;
4975 
4976 	return lower->private;
4977 }
4978 EXPORT_SYMBOL(netdev_lower_dev_get_private_rcu);
4979 
4980 void *netdev_lower_dev_get_private(struct net_device *dev,
4981 				   struct net_device *lower_dev)
4982 {
4983 	struct netdev_adjacent *lower;
4984 
4985 	if (!lower_dev)
4986 		return NULL;
4987 	lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
4988 	if (!lower)
4989 		return NULL;
4990 
4991 	return lower->private;
4992 }
4993 EXPORT_SYMBOL(netdev_lower_dev_get_private);
4994 
4995 static void dev_change_rx_flags(struct net_device *dev, int flags)
4996 {
4997 	const struct net_device_ops *ops = dev->netdev_ops;
4998 
4999 	if (ops->ndo_change_rx_flags)
5000 		ops->ndo_change_rx_flags(dev, flags);
5001 }
5002 
5003 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5004 {
5005 	unsigned int old_flags = dev->flags;
5006 	kuid_t uid;
5007 	kgid_t gid;
5008 
5009 	ASSERT_RTNL();
5010 
5011 	dev->flags |= IFF_PROMISC;
5012 	dev->promiscuity += inc;
5013 	if (dev->promiscuity == 0) {
5014 		/*
5015 		 * Avoid overflow.
5016 		 * If inc causes overflow, untouch promisc and return error.
5017 		 */
5018 		if (inc < 0)
5019 			dev->flags &= ~IFF_PROMISC;
5020 		else {
5021 			dev->promiscuity -= inc;
5022 			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5023 				dev->name);
5024 			return -EOVERFLOW;
5025 		}
5026 	}
5027 	if (dev->flags != old_flags) {
5028 		pr_info("device %s %s promiscuous mode\n",
5029 			dev->name,
5030 			dev->flags & IFF_PROMISC ? "entered" : "left");
5031 		if (audit_enabled) {
5032 			current_uid_gid(&uid, &gid);
5033 			audit_log(current->audit_context, GFP_ATOMIC,
5034 				AUDIT_ANOM_PROMISCUOUS,
5035 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5036 				dev->name, (dev->flags & IFF_PROMISC),
5037 				(old_flags & IFF_PROMISC),
5038 				from_kuid(&init_user_ns, audit_get_loginuid(current)),
5039 				from_kuid(&init_user_ns, uid),
5040 				from_kgid(&init_user_ns, gid),
5041 				audit_get_sessionid(current));
5042 		}
5043 
5044 		dev_change_rx_flags(dev, IFF_PROMISC);
5045 	}
5046 	if (notify)
5047 		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
5048 	return 0;
5049 }
5050 
5051 /**
5052  *	dev_set_promiscuity	- update promiscuity count on a device
5053  *	@dev: device
5054  *	@inc: modifier
5055  *
5056  *	Add or remove promiscuity from a device. While the count in the device
5057  *	remains above zero the interface remains promiscuous. Once it hits zero
5058  *	the device reverts back to normal filtering operation. A negative inc
5059  *	value is used to drop promiscuity on the device.
5060  *	Return 0 if successful or a negative errno code on error.
5061  */
5062 int dev_set_promiscuity(struct net_device *dev, int inc)
5063 {
5064 	unsigned int old_flags = dev->flags;
5065 	int err;
5066 
5067 	err = __dev_set_promiscuity(dev, inc, true);
5068 	if (err < 0)
5069 		return err;
5070 	if (dev->flags != old_flags)
5071 		dev_set_rx_mode(dev);
5072 	return err;
5073 }
5074 EXPORT_SYMBOL(dev_set_promiscuity);
5075 
5076 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5077 {
5078 	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5079 
5080 	ASSERT_RTNL();
5081 
5082 	dev->flags |= IFF_ALLMULTI;
5083 	dev->allmulti += inc;
5084 	if (dev->allmulti == 0) {
5085 		/*
5086 		 * Avoid overflow.
5087 		 * If inc causes overflow, untouch allmulti and return error.
5088 		 */
5089 		if (inc < 0)
5090 			dev->flags &= ~IFF_ALLMULTI;
5091 		else {
5092 			dev->allmulti -= inc;
5093 			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5094 				dev->name);
5095 			return -EOVERFLOW;
5096 		}
5097 	}
5098 	if (dev->flags ^ old_flags) {
5099 		dev_change_rx_flags(dev, IFF_ALLMULTI);
5100 		dev_set_rx_mode(dev);
5101 		if (notify)
5102 			__dev_notify_flags(dev, old_flags,
5103 					   dev->gflags ^ old_gflags);
5104 	}
5105 	return 0;
5106 }
5107 
5108 /**
5109  *	dev_set_allmulti	- update allmulti count on a device
5110  *	@dev: device
5111  *	@inc: modifier
5112  *
5113  *	Add or remove reception of all multicast frames to a device. While the
5114  *	count in the device remains above zero the interface remains listening
5115  *	to all interfaces. Once it hits zero the device reverts back to normal
5116  *	filtering operation. A negative @inc value is used to drop the counter
5117  *	when releasing a resource needing all multicasts.
5118  *	Return 0 if successful or a negative errno code on error.
5119  */
5120 
5121 int dev_set_allmulti(struct net_device *dev, int inc)
5122 {
5123 	return __dev_set_allmulti(dev, inc, true);
5124 }
5125 EXPORT_SYMBOL(dev_set_allmulti);
5126 
5127 /*
5128  *	Upload unicast and multicast address lists to device and
5129  *	configure RX filtering. When the device doesn't support unicast
5130  *	filtering it is put in promiscuous mode while unicast addresses
5131  *	are present.
5132  */
5133 void __dev_set_rx_mode(struct net_device *dev)
5134 {
5135 	const struct net_device_ops *ops = dev->netdev_ops;
5136 
5137 	/* dev_open will call this function so the list will stay sane. */
5138 	if (!(dev->flags&IFF_UP))
5139 		return;
5140 
5141 	if (!netif_device_present(dev))
5142 		return;
5143 
5144 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5145 		/* Unicast addresses changes may only happen under the rtnl,
5146 		 * therefore calling __dev_set_promiscuity here is safe.
5147 		 */
5148 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5149 			__dev_set_promiscuity(dev, 1, false);
5150 			dev->uc_promisc = true;
5151 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5152 			__dev_set_promiscuity(dev, -1, false);
5153 			dev->uc_promisc = false;
5154 		}
5155 	}
5156 
5157 	if (ops->ndo_set_rx_mode)
5158 		ops->ndo_set_rx_mode(dev);
5159 }
5160 
/* Locked wrapper: run __dev_set_rx_mode() for @dev between
 * netif_addr_lock_bh() and netif_addr_unlock_bh().
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
5167 
5168 /**
5169  *	dev_get_flags - get flags reported to userspace
5170  *	@dev: device
5171  *
5172  *	Get the combination of flag bits exported through APIs to userspace.
5173  */
5174 unsigned int dev_get_flags(const struct net_device *dev)
5175 {
5176 	unsigned int flags;
5177 
5178 	flags = (dev->flags & ~(IFF_PROMISC |
5179 				IFF_ALLMULTI |
5180 				IFF_RUNNING |
5181 				IFF_LOWER_UP |
5182 				IFF_DORMANT)) |
5183 		(dev->gflags & (IFF_PROMISC |
5184 				IFF_ALLMULTI));
5185 
5186 	if (netif_running(dev)) {
5187 		if (netif_oper_up(dev))
5188 			flags |= IFF_RUNNING;
5189 		if (netif_carrier_ok(dev))
5190 			flags |= IFF_LOWER_UP;
5191 		if (netif_dormant(dev))
5192 			flags |= IFF_DORMANT;
5193 	}
5194 
5195 	return flags;
5196 }
5197 EXPORT_SYMBOL(dev_get_flags);
5198 
5199 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5200 {
5201 	unsigned int old_flags = dev->flags;
5202 	int ret;
5203 
5204 	ASSERT_RTNL();
5205 
5206 	/*
5207 	 *	Set the flags on our device.
5208 	 */
5209 
5210 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5211 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5212 			       IFF_AUTOMEDIA)) |
5213 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5214 				    IFF_ALLMULTI));
5215 
5216 	/*
5217 	 *	Load in the correct multicast list now the flags have changed.
5218 	 */
5219 
5220 	if ((old_flags ^ flags) & IFF_MULTICAST)
5221 		dev_change_rx_flags(dev, IFF_MULTICAST);
5222 
5223 	dev_set_rx_mode(dev);
5224 
5225 	/*
5226 	 *	Have we downed the interface. We handle IFF_UP ourselves
5227 	 *	according to user attempts to set it, rather than blindly
5228 	 *	setting it.
5229 	 */
5230 
5231 	ret = 0;
5232 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
5233 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5234 
5235 		if (!ret)
5236 			dev_set_rx_mode(dev);
5237 	}
5238 
5239 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
5240 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
5241 		unsigned int old_flags = dev->flags;
5242 
5243 		dev->gflags ^= IFF_PROMISC;
5244 
5245 		if (__dev_set_promiscuity(dev, inc, false) >= 0)
5246 			if (dev->flags != old_flags)
5247 				dev_set_rx_mode(dev);
5248 	}
5249 
5250 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5251 	   is important. Some (broken) drivers set IFF_PROMISC, when
5252 	   IFF_ALLMULTI is requested not asking us and not reporting.
5253 	 */
5254 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5255 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5256 
5257 		dev->gflags ^= IFF_ALLMULTI;
5258 		__dev_set_allmulti(dev, inc, false);
5259 	}
5260 
5261 	return ret;
5262 }
5263 
5264 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5265 			unsigned int gchanges)
5266 {
5267 	unsigned int changes = dev->flags ^ old_flags;
5268 
5269 	if (gchanges)
5270 		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5271 
5272 	if (changes & IFF_UP) {
5273 		if (dev->flags & IFF_UP)
5274 			call_netdevice_notifiers(NETDEV_UP, dev);
5275 		else
5276 			call_netdevice_notifiers(NETDEV_DOWN, dev);
5277 	}
5278 
5279 	if (dev->flags & IFF_UP &&
5280 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5281 		struct netdev_notifier_change_info change_info;
5282 
5283 		change_info.flags_changed = changes;
5284 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5285 					      &change_info.info);
5286 	}
5287 }
5288 
5289 /**
5290  *	dev_change_flags - change device settings
5291  *	@dev: device
5292  *	@flags: device state flags
5293  *
5294  *	Change settings on device based state flags. The flags are
5295  *	in the userspace exported format.
5296  */
5297 int dev_change_flags(struct net_device *dev, unsigned int flags)
5298 {
5299 	int ret;
5300 	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5301 
5302 	ret = __dev_change_flags(dev, flags);
5303 	if (ret < 0)
5304 		return ret;
5305 
5306 	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5307 	__dev_notify_flags(dev, old_flags, changes);
5308 	return ret;
5309 }
5310 EXPORT_SYMBOL(dev_change_flags);
5311 
5312 /**
5313  *	dev_set_mtu - Change maximum transfer unit
5314  *	@dev: device
5315  *	@new_mtu: new transfer unit
5316  *
5317  *	Change the maximum transfer size of the network device.
5318  */
5319 int dev_set_mtu(struct net_device *dev, int new_mtu)
5320 {
5321 	const struct net_device_ops *ops = dev->netdev_ops;
5322 	int err;
5323 
5324 	if (new_mtu == dev->mtu)
5325 		return 0;
5326 
5327 	/*	MTU must be positive.	 */
5328 	if (new_mtu < 0)
5329 		return -EINVAL;
5330 
5331 	if (!netif_device_present(dev))
5332 		return -ENODEV;
5333 
5334 	err = 0;
5335 	if (ops->ndo_change_mtu)
5336 		err = ops->ndo_change_mtu(dev, new_mtu);
5337 	else
5338 		dev->mtu = new_mtu;
5339 
5340 	if (!err)
5341 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5342 	return err;
5343 }
5344 EXPORT_SYMBOL(dev_set_mtu);
5345 
5346 /**
5347  *	dev_set_group - Change group this device belongs to
5348  *	@dev: device
5349  *	@new_group: group this device should belong to
5350  */
5351 void dev_set_group(struct net_device *dev, int new_group)
5352 {
5353 	dev->group = new_group;
5354 }
5355 EXPORT_SYMBOL(dev_set_group);
5356 
5357 /**
5358  *	dev_set_mac_address - Change Media Access Control Address
5359  *	@dev: device
5360  *	@sa: new address
5361  *
5362  *	Change the hardware (MAC) address of the device
5363  */
5364 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5365 {
5366 	const struct net_device_ops *ops = dev->netdev_ops;
5367 	int err;
5368 
5369 	if (!ops->ndo_set_mac_address)
5370 		return -EOPNOTSUPP;
5371 	if (sa->sa_family != dev->type)
5372 		return -EINVAL;
5373 	if (!netif_device_present(dev))
5374 		return -ENODEV;
5375 	err = ops->ndo_set_mac_address(dev, sa);
5376 	if (err)
5377 		return err;
5378 	dev->addr_assign_type = NET_ADDR_SET;
5379 	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5380 	add_device_randomness(dev->dev_addr, dev->addr_len);
5381 	return 0;
5382 }
5383 EXPORT_SYMBOL(dev_set_mac_address);
5384 
5385 /**
5386  *	dev_change_carrier - Change device carrier
5387  *	@dev: device
5388  *	@new_carrier: new value
5389  *
5390  *	Change device carrier
5391  */
5392 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5393 {
5394 	const struct net_device_ops *ops = dev->netdev_ops;
5395 
5396 	if (!ops->ndo_change_carrier)
5397 		return -EOPNOTSUPP;
5398 	if (!netif_device_present(dev))
5399 		return -ENODEV;
5400 	return ops->ndo_change_carrier(dev, new_carrier);
5401 }
5402 EXPORT_SYMBOL(dev_change_carrier);
5403 
5404 /**
5405  *	dev_get_phys_port_id - Get device physical port ID
5406  *	@dev: device
5407  *	@ppid: port ID
5408  *
5409  *	Get device physical port ID
5410  */
5411 int dev_get_phys_port_id(struct net_device *dev,
5412 			 struct netdev_phys_port_id *ppid)
5413 {
5414 	const struct net_device_ops *ops = dev->netdev_ops;
5415 
5416 	if (!ops->ndo_get_phys_port_id)
5417 		return -EOPNOTSUPP;
5418 	return ops->ndo_get_phys_port_id(dev, ppid);
5419 }
5420 EXPORT_SYMBOL(dev_get_phys_port_id);
5421 
5422 /**
5423  *	dev_new_index	-	allocate an ifindex
5424  *	@net: the applicable net namespace
5425  *
5426  *	Returns a suitable unique value for a new device interface
5427  *	number.  The caller must hold the rtnl semaphore or the
5428  *	dev_base_lock to be sure it remains unique.
5429  */
5430 static int dev_new_index(struct net *net)
5431 {
5432 	int ifindex = net->ifindex;
5433 	for (;;) {
5434 		if (++ifindex <= 0)
5435 			ifindex = 1;
5436 		if (!__dev_get_by_index(net, ifindex))
5437 			return net->ifindex = ifindex;
5438 	}
5439 }
5440 
5441 /* Delayed registration/unregisteration */
5442 static LIST_HEAD(net_todo_list);
5443 static DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5444 
5445 static void net_set_todo(struct net_device *dev)
5446 {
5447 	list_add_tail(&dev->todo_list, &net_todo_list);
5448 	dev_net(dev)->dev_unreg_count++;
5449 }
5450 
5451 static void rollback_registered_many(struct list_head *head)
5452 {
5453 	struct net_device *dev, *tmp;
5454 	LIST_HEAD(close_head);
5455 
5456 	BUG_ON(dev_boot_phase);
5457 	ASSERT_RTNL();
5458 
5459 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5460 		/* Some devices call without registering
5461 		 * for initialization unwind. Remove those
5462 		 * devices and proceed with the remaining.
5463 		 */
5464 		if (dev->reg_state == NETREG_UNINITIALIZED) {
5465 			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5466 				 dev->name, dev);
5467 
5468 			WARN_ON(1);
5469 			list_del(&dev->unreg_list);
5470 			continue;
5471 		}
5472 		dev->dismantle = true;
5473 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5474 	}
5475 
5476 	/* If device is running, close it first. */
5477 	list_for_each_entry(dev, head, unreg_list)
5478 		list_add_tail(&dev->close_list, &close_head);
5479 	dev_close_many(&close_head);
5480 
5481 	list_for_each_entry(dev, head, unreg_list) {
5482 		/* And unlink it from device chain. */
5483 		unlist_netdevice(dev);
5484 
5485 		dev->reg_state = NETREG_UNREGISTERING;
5486 	}
5487 
5488 	synchronize_net();
5489 
5490 	list_for_each_entry(dev, head, unreg_list) {
5491 		/* Shutdown queueing discipline. */
5492 		dev_shutdown(dev);
5493 
5494 
5495 		/* Notify protocols, that we are about to destroy
5496 		   this device. They should clean all the things.
5497 		*/
5498 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5499 
5500 		if (!dev->rtnl_link_ops ||
5501 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5502 			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5503 
5504 		/*
5505 		 *	Flush the unicast and multicast chains
5506 		 */
5507 		dev_uc_flush(dev);
5508 		dev_mc_flush(dev);
5509 
5510 		if (dev->netdev_ops->ndo_uninit)
5511 			dev->netdev_ops->ndo_uninit(dev);
5512 
5513 		/* Notifier chain MUST detach us all upper devices. */
5514 		WARN_ON(netdev_has_any_upper_dev(dev));
5515 
5516 		/* Remove entries from kobject tree */
5517 		netdev_unregister_kobject(dev);
5518 #ifdef CONFIG_XPS
5519 		/* Remove XPS queueing entries */
5520 		netif_reset_xps_queues_gt(dev, 0);
5521 #endif
5522 	}
5523 
5524 	synchronize_net();
5525 
5526 	list_for_each_entry(dev, head, unreg_list)
5527 		dev_put(dev);
5528 }
5529 
5530 static void rollback_registered(struct net_device *dev)
5531 {
5532 	LIST_HEAD(single);
5533 
5534 	list_add(&dev->unreg_list, &single);
5535 	rollback_registered_many(&single);
5536 	list_del(&single);
5537 }
5538 
5539 static netdev_features_t netdev_fix_features(struct net_device *dev,
5540 	netdev_features_t features)
5541 {
5542 	/* Fix illegal checksum combinations */
5543 	if ((features & NETIF_F_HW_CSUM) &&
5544 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5545 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5546 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5547 	}
5548 
5549 	/* TSO requires that SG is present as well. */
5550 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5551 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5552 		features &= ~NETIF_F_ALL_TSO;
5553 	}
5554 
5555 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5556 					!(features & NETIF_F_IP_CSUM)) {
5557 		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5558 		features &= ~NETIF_F_TSO;
5559 		features &= ~NETIF_F_TSO_ECN;
5560 	}
5561 
5562 	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5563 					 !(features & NETIF_F_IPV6_CSUM)) {
5564 		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5565 		features &= ~NETIF_F_TSO6;
5566 	}
5567 
5568 	/* TSO ECN requires that TSO is present as well. */
5569 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5570 		features &= ~NETIF_F_TSO_ECN;
5571 
5572 	/* Software GSO depends on SG. */
5573 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5574 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5575 		features &= ~NETIF_F_GSO;
5576 	}
5577 
5578 	/* UFO needs SG and checksumming */
5579 	if (features & NETIF_F_UFO) {
5580 		/* maybe split UFO into V4 and V6? */
5581 		if (!((features & NETIF_F_GEN_CSUM) ||
5582 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5583 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5584 			netdev_dbg(dev,
5585 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5586 			features &= ~NETIF_F_UFO;
5587 		}
5588 
5589 		if (!(features & NETIF_F_SG)) {
5590 			netdev_dbg(dev,
5591 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5592 			features &= ~NETIF_F_UFO;
5593 		}
5594 	}
5595 
5596 	return features;
5597 }
5598 
5599 int __netdev_update_features(struct net_device *dev)
5600 {
5601 	netdev_features_t features;
5602 	int err = 0;
5603 
5604 	ASSERT_RTNL();
5605 
5606 	features = netdev_get_wanted_features(dev);
5607 
5608 	if (dev->netdev_ops->ndo_fix_features)
5609 		features = dev->netdev_ops->ndo_fix_features(dev, features);
5610 
5611 	/* driver might be less strict about feature dependencies */
5612 	features = netdev_fix_features(dev, features);
5613 
5614 	if (dev->features == features)
5615 		return 0;
5616 
5617 	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5618 		&dev->features, &features);
5619 
5620 	if (dev->netdev_ops->ndo_set_features)
5621 		err = dev->netdev_ops->ndo_set_features(dev, features);
5622 
5623 	if (unlikely(err < 0)) {
5624 		netdev_err(dev,
5625 			"set_features() failed (%d); wanted %pNF, left %pNF\n",
5626 			err, &features, &dev->features);
5627 		return -1;
5628 	}
5629 
5630 	if (!err)
5631 		dev->features = features;
5632 
5633 	return 1;
5634 }
5635 
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if it
 *	has changed. Should be called after driver or hardware dependent
 *	conditions might have changed that influence the features.
 *
 *	The notification is sent whenever __netdev_update_features()
 *	returns a non-zero value.
 */
void netdev_update_features(struct net_device *dev)
{
	int status = __netdev_update_features(dev);

	if (status != 0)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5650 
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications even
 *	if they have not changed. Should be called instead of
 *	netdev_update_features() if also dev->vlan_features might
 *	have changed to allow the changes to be propagated to stacked
 *	VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* The result is deliberately ignored: always notify. */
	(void)__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5667 
5668 /**
5669  *	netif_stacked_transfer_operstate -	transfer operstate
5670  *	@rootdev: the root or lower level device to transfer state from
5671  *	@dev: the device to transfer operstate to
5672  *
5673  *	Transfer operational state from root to device. This is normally
5674  *	called when a stacking relationship exists between the root
5675  *	device and the device(a leaf device).
5676  */
5677 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5678 					struct net_device *dev)
5679 {
5680 	if (rootdev->operstate == IF_OPER_DORMANT)
5681 		netif_dormant_on(dev);
5682 	else
5683 		netif_dormant_off(dev);
5684 
5685 	if (netif_carrier_ok(rootdev)) {
5686 		if (!netif_carrier_ok(dev))
5687 			netif_carrier_on(dev);
5688 	} else {
5689 		if (netif_carrier_ok(dev))
5690 			netif_carrier_off(dev);
5691 	}
5692 }
5693 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5694 
#ifdef CONFIG_RPS
/*
 * Allocate the zeroed dev->_rx array with dev->num_rx_queues entries and
 * point every entry back at @dev. Returns 0 or -ENOMEM.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nr_queues = dev->num_rx_queues;
	struct netdev_rx_queue *queues, *q;

	BUG_ON(nr_queues < 1);

	queues = kcalloc(nr_queues, sizeof(*queues), GFP_KERNEL);
	if (!queues)
		return -ENOMEM;

	dev->_rx = queues;

	for (q = queues; q < queues + nr_queues; q++)
		q->dev = dev;

	return 0;
}
#endif
5714 
5715 static void netdev_init_one_queue(struct net_device *dev,
5716 				  struct netdev_queue *queue, void *_unused)
5717 {
5718 	/* Initialize queue lock */
5719 	spin_lock_init(&queue->_xmit_lock);
5720 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5721 	queue->xmit_lock_owner = -1;
5722 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5723 	queue->dev = dev;
5724 #ifdef CONFIG_BQL
5725 	dql_init(&queue->dql, HZ);
5726 #endif
5727 }
5728 
5729 static void netif_free_tx_queues(struct net_device *dev)
5730 {
5731 	if (is_vmalloc_addr(dev->_tx))
5732 		vfree(dev->_tx);
5733 	else
5734 		kfree(dev->_tx);
5735 }
5736 
5737 static int netif_alloc_netdev_queues(struct net_device *dev)
5738 {
5739 	unsigned int count = dev->num_tx_queues;
5740 	struct netdev_queue *tx;
5741 	size_t sz = count * sizeof(*tx);
5742 
5743 	BUG_ON(count < 1 || count > 0xffff);
5744 
5745 	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
5746 	if (!tx) {
5747 		tx = vzalloc(sz);
5748 		if (!tx)
5749 			return -ENOMEM;
5750 	}
5751 	dev->_tx = tx;
5752 
5753 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5754 	spin_lock_init(&dev->tx_global_lock);
5755 
5756 	return 0;
5757 }
5758 
5759 /**
5760  *	register_netdevice	- register a network device
5761  *	@dev: device to register
5762  *
5763  *	Take a completed network device structure and add it to the kernel
5764  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5765  *	chain. 0 is returned on success. A negative errno code is returned
5766  *	on a failure to set up the device, or if the name is a duplicate.
5767  *
5768  *	Callers must hold the rtnl semaphore. You may want
5769  *	register_netdev() instead of this.
5770  *
5771  *	BUGS:
5772  *	The locking appears insufficient to guarantee two parallel registers
5773  *	will not get the same name.
5774  */
5775 
5776 int register_netdevice(struct net_device *dev)
5777 {
5778 	int ret;
5779 	struct net *net = dev_net(dev);
5780 
5781 	BUG_ON(dev_boot_phase);
5782 	ASSERT_RTNL();
5783 
5784 	might_sleep();
5785 
5786 	/* When net_device's are persistent, this will be fatal. */
5787 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5788 	BUG_ON(!net);
5789 
5790 	spin_lock_init(&dev->addr_list_lock);
5791 	netdev_set_addr_lockdep_class(dev);
5792 
5793 	dev->iflink = -1;
5794 
5795 	ret = dev_get_valid_name(net, dev, dev->name);
5796 	if (ret < 0)
5797 		goto out;
5798 
5799 	/* Init, if this function is available */
5800 	if (dev->netdev_ops->ndo_init) {
5801 		ret = dev->netdev_ops->ndo_init(dev);
5802 		if (ret) {
5803 			if (ret > 0)
5804 				ret = -EIO;
5805 			goto out;
5806 		}
5807 	}
5808 
5809 	if (((dev->hw_features | dev->features) &
5810 	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
5811 	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5812 	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5813 		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5814 		ret = -EINVAL;
5815 		goto err_uninit;
5816 	}
5817 
5818 	ret = -EBUSY;
5819 	if (!dev->ifindex)
5820 		dev->ifindex = dev_new_index(net);
5821 	else if (__dev_get_by_index(net, dev->ifindex))
5822 		goto err_uninit;
5823 
5824 	if (dev->iflink == -1)
5825 		dev->iflink = dev->ifindex;
5826 
5827 	/* Transfer changeable features to wanted_features and enable
5828 	 * software offloads (GSO and GRO).
5829 	 */
5830 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5831 	dev->features |= NETIF_F_SOFT_FEATURES;
5832 	dev->wanted_features = dev->features & dev->hw_features;
5833 
5834 	/* Turn on no cache copy if HW is doing checksum */
5835 	if (!(dev->flags & IFF_LOOPBACK)) {
5836 		dev->hw_features |= NETIF_F_NOCACHE_COPY;
5837 		if (dev->features & NETIF_F_ALL_CSUM) {
5838 			dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5839 			dev->features |= NETIF_F_NOCACHE_COPY;
5840 		}
5841 	}
5842 
5843 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5844 	 */
5845 	dev->vlan_features |= NETIF_F_HIGHDMA;
5846 
5847 	/* Make NETIF_F_SG inheritable to tunnel devices.
5848 	 */
5849 	dev->hw_enc_features |= NETIF_F_SG;
5850 
5851 	/* Make NETIF_F_SG inheritable to MPLS.
5852 	 */
5853 	dev->mpls_features |= NETIF_F_SG;
5854 
5855 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5856 	ret = notifier_to_errno(ret);
5857 	if (ret)
5858 		goto err_uninit;
5859 
5860 	ret = netdev_register_kobject(dev);
5861 	if (ret)
5862 		goto err_uninit;
5863 	dev->reg_state = NETREG_REGISTERED;
5864 
5865 	__netdev_update_features(dev);
5866 
5867 	/*
5868 	 *	Default initial state at registry is that the
5869 	 *	device is present.
5870 	 */
5871 
5872 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5873 
5874 	linkwatch_init_dev(dev);
5875 
5876 	dev_init_scheduler(dev);
5877 	dev_hold(dev);
5878 	list_netdevice(dev);
5879 	add_device_randomness(dev->dev_addr, dev->addr_len);
5880 
5881 	/* If the device has permanent device address, driver should
5882 	 * set dev_addr and also addr_assign_type should be set to
5883 	 * NET_ADDR_PERM (default value).
5884 	 */
5885 	if (dev->addr_assign_type == NET_ADDR_PERM)
5886 		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5887 
5888 	/* Notify protocols, that a new device appeared. */
5889 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5890 	ret = notifier_to_errno(ret);
5891 	if (ret) {
5892 		rollback_registered(dev);
5893 		dev->reg_state = NETREG_UNREGISTERED;
5894 	}
5895 	/*
5896 	 *	Prevent userspace races by waiting until the network
5897 	 *	device is fully setup before sending notifications.
5898 	 */
5899 	if (!dev->rtnl_link_ops ||
5900 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5901 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
5902 
5903 out:
5904 	return ret;
5905 
5906 err_uninit:
5907 	if (dev->netdev_ops->ndo_uninit)
5908 		dev->netdev_ops->ndo_uninit(dev);
5909 	goto out;
5910 }
5911 EXPORT_SYMBOL(register_netdevice);
5912 
5913 /**
5914  *	init_dummy_netdev	- init a dummy network device for NAPI
5915  *	@dev: device to init
5916  *
5917  *	This takes a network device structure and initialize the minimum
5918  *	amount of fields so it can be used to schedule NAPI polls without
5919  *	registering a full blown interface. This is to be used by drivers
5920  *	that need to tie several hardware interfaces to a single NAPI
5921  *	poll scheduler due to HW limitations.
5922  */
5923 int init_dummy_netdev(struct net_device *dev)
5924 {
5925 	/* Clear everything. Note we don't initialize spinlocks
5926 	 * are they aren't supposed to be taken by any of the
5927 	 * NAPI code and this dummy netdev is supposed to be
5928 	 * only ever used for NAPI polls
5929 	 */
5930 	memset(dev, 0, sizeof(struct net_device));
5931 
5932 	/* make sure we BUG if trying to hit standard
5933 	 * register/unregister code path
5934 	 */
5935 	dev->reg_state = NETREG_DUMMY;
5936 
5937 	/* NAPI wants this */
5938 	INIT_LIST_HEAD(&dev->napi_list);
5939 
5940 	/* a dummy interface is started by default */
5941 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5942 	set_bit(__LINK_STATE_START, &dev->state);
5943 
5944 	/* Note : We dont allocate pcpu_refcnt for dummy devices,
5945 	 * because users of this 'device' dont need to change
5946 	 * its refcount.
5947 	 */
5948 
5949 	return 0;
5950 }
5951 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5952 
5953 
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice() that takes the rtnl
 *	semaphore for the duration of the call.
 */
int register_netdev(struct net_device *dev)
{
	int rc;

	rtnl_lock();
	rc = register_netdevice(dev);
	rtnl_unlock();

	return rc;
}
EXPORT_SYMBOL(register_netdev);
5977 
5978 int netdev_refcnt_read(const struct net_device *dev)
5979 {
5980 	int i, refcnt = 0;
5981 
5982 	for_each_possible_cpu(i)
5983 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5984 	return refcnt;
5985 }
5986 EXPORT_SYMBOL(netdev_refcnt_read);
5987 
5988 /**
5989  * netdev_wait_allrefs - wait until all references are gone.
5990  * @dev: target net_device
5991  *
5992  * This is called when unregistering network devices.
5993  *
5994  * Any protocol or device that holds a reference should register
5995  * for netdevice notification, and cleanup and put back the
5996  * reference if they receive an UNREGISTER event.
5997  * We can get stuck here if buggy protocols don't correctly
5998  * call dev_put.
5999  */
6000 static void netdev_wait_allrefs(struct net_device *dev)
6001 {
6002 	unsigned long rebroadcast_time, warning_time;
6003 	int refcnt;
6004 
6005 	linkwatch_forget_dev(dev);
6006 
6007 	rebroadcast_time = warning_time = jiffies;
6008 	refcnt = netdev_refcnt_read(dev);
6009 
6010 	while (refcnt != 0) {
6011 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6012 			rtnl_lock();
6013 
6014 			/* Rebroadcast unregister notification */
6015 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6016 
6017 			__rtnl_unlock();
6018 			rcu_barrier();
6019 			rtnl_lock();
6020 
6021 			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6022 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6023 				     &dev->state)) {
6024 				/* We must not have linkwatch events
6025 				 * pending on unregister. If this
6026 				 * happens, we simply run the queue
6027 				 * unscheduled, resulting in a noop
6028 				 * for this device.
6029 				 */
6030 				linkwatch_run_queue();
6031 			}
6032 
6033 			__rtnl_unlock();
6034 
6035 			rebroadcast_time = jiffies;
6036 		}
6037 
6038 		msleep(250);
6039 
6040 		refcnt = netdev_refcnt_read(dev);
6041 
6042 		if (time_after(jiffies, warning_time + 10 * HZ)) {
6043 			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6044 				 dev->name, refcnt);
6045 			warning_time = jiffies;
6046 		}
6047 	}
6048 }
6049 
6050 /* The sequence is:
6051  *
6052  *	rtnl_lock();
6053  *	...
6054  *	register_netdevice(x1);
6055  *	register_netdevice(x2);
6056  *	...
6057  *	unregister_netdevice(y1);
6058  *	unregister_netdevice(y2);
6059  *      ...
6060  *	rtnl_unlock();
6061  *	free_netdev(y1);
6062  *	free_netdev(y2);
6063  *
6064  * We are invoked by rtnl_unlock().
6065  * This allows us to deal with problems:
6066  * 1) We can delete sysfs objects which invoke hotplug
6067  *    without deadlocking with linkwatch via keventd.
6068  * 2) Since we run with the RTNL semaphore not held, we can sleep
6069  *    safely in order to wait for the netdev refcnt to drop to zero.
6070  *
6071  * We must not return until all unregister events added during
6072  * the interval the lock was held have been completed.
6073  */
6074 void netdev_run_todo(void)
6075 {
6076 	struct list_head list;
6077 
6078 	/* Snapshot list, allow later requests */
6079 	list_replace_init(&net_todo_list, &list);
6080 
6081 	__rtnl_unlock();
6082 
6083 
6084 	/* Wait for rcu callbacks to finish before next phase */
6085 	if (!list_empty(&list))
6086 		rcu_barrier();
6087 
6088 	while (!list_empty(&list)) {
6089 		struct net_device *dev
6090 			= list_first_entry(&list, struct net_device, todo_list);
6091 		list_del(&dev->todo_list);
6092 
6093 		rtnl_lock();
6094 		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6095 		__rtnl_unlock();
6096 
6097 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6098 			pr_err("network todo '%s' but state %d\n",
6099 			       dev->name, dev->reg_state);
6100 			dump_stack();
6101 			continue;
6102 		}
6103 
6104 		dev->reg_state = NETREG_UNREGISTERED;
6105 
6106 		on_each_cpu(flush_backlog, dev, 1);
6107 
6108 		netdev_wait_allrefs(dev);
6109 
6110 		/* paranoia */
6111 		BUG_ON(netdev_refcnt_read(dev));
6112 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
6113 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6114 		WARN_ON(dev->dn_ptr);
6115 
6116 		if (dev->destructor)
6117 			dev->destructor(dev);
6118 
6119 		/* Report a network device has been unregistered */
6120 		rtnl_lock();
6121 		dev_net(dev)->dev_unreg_count--;
6122 		__rtnl_unlock();
6123 		wake_up(&netdev_unregistering_wq);
6124 
6125 		/* Free network device */
6126 		kobject_put(&dev->dev.kobj);
6127 	}
6128 }
6129 
6130 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6131  * fields in the same order, with only the type differing.
6132  */
6133 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6134 			     const struct net_device_stats *netdev_stats)
6135 {
6136 #if BITS_PER_LONG == 64
6137 	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6138 	memcpy(stats64, netdev_stats, sizeof(*stats64));
6139 #else
6140 	size_t i, n = sizeof(*stats64) / sizeof(u64);
6141 	const unsigned long *src = (const unsigned long *)netdev_stats;
6142 	u64 *dst = (u64 *)stats64;
6143 
6144 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6145 		     sizeof(*stats64) / sizeof(u64));
6146 	for (i = 0; i < n; i++)
6147 		dst[i] = src[i];
6148 #endif
6149 }
6150 EXPORT_SYMBOL(netdev_stats_to_stats64);
6151 
6152 /**
6153  *	dev_get_stats	- get network device statistics
6154  *	@dev: device to get statistics from
6155  *	@storage: place to store stats
6156  *
6157  *	Get network statistics from device. Return @storage.
6158  *	The device driver may provide its own method by setting
6159  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6160  *	otherwise the internal statistics structure is used.
6161  */
6162 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6163 					struct rtnl_link_stats64 *storage)
6164 {
6165 	const struct net_device_ops *ops = dev->netdev_ops;
6166 
6167 	if (ops->ndo_get_stats64) {
6168 		memset(storage, 0, sizeof(*storage));
6169 		ops->ndo_get_stats64(dev, storage);
6170 	} else if (ops->ndo_get_stats) {
6171 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6172 	} else {
6173 		netdev_stats_to_stats64(storage, &dev->stats);
6174 	}
6175 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6176 	return storage;
6177 }
6178 EXPORT_SYMBOL(dev_get_stats);
6179 
/*
 * Return the ingress queue of @dev. With CONFIG_NET_CLS_ACT a missing
 * queue is allocated, initialised with noop_qdisc and published in
 * dev->ingress_queue; NULL is returned if that allocation fails. Without
 * CONFIG_NET_CLS_ACT the current value is returned as is.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			ingress->qdisc = &noop_qdisc;
			ingress->qdisc_sleeping = &noop_qdisc;
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
6197 
6198 static const struct ethtool_ops default_ethtool_ops;
6199 
6200 void netdev_set_default_ethtool_ops(struct net_device *dev,
6201 				    const struct ethtool_ops *ops)
6202 {
6203 	if (dev->ethtool_ops == &default_ethtool_ops)
6204 		dev->ethtool_ops = ops;
6205 }
6206 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6207 
6208 void netdev_freemem(struct net_device *dev)
6209 {
6210 	char *addr = (char *)dev - dev->padded;
6211 
6212 	if (is_vmalloc_addr(addr))
6213 		vfree(addr);
6214 	else
6215 		kfree(addr);
6216 }
6217 
6218 /**
6219  *	alloc_netdev_mqs - allocate network device
6220  *	@sizeof_priv:	size of private data to allocate space for
6221  *	@name:		device name format string
6222  *	@setup:		callback to initialize device
6223  *	@txqs:		the number of TX subqueues to allocate
6224  *	@rxqs:		the number of RX subqueues to allocate
6225  *
6226  *	Allocates a struct net_device with private data area for driver use
6227  *	and performs basic initialization.  Also allocates subquue structs
6228  *	for each queue on the device.
6229  */
6230 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6231 		void (*setup)(struct net_device *),
6232 		unsigned int txqs, unsigned int rxqs)
6233 {
6234 	struct net_device *dev;
6235 	size_t alloc_size;
6236 	struct net_device *p;
6237 
6238 	BUG_ON(strlen(name) >= sizeof(dev->name));
6239 
6240 	if (txqs < 1) {
6241 		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6242 		return NULL;
6243 	}
6244 
6245 #ifdef CONFIG_RPS
6246 	if (rxqs < 1) {
6247 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6248 		return NULL;
6249 	}
6250 #endif
6251 
6252 	alloc_size = sizeof(struct net_device);
6253 	if (sizeof_priv) {
6254 		/* ensure 32-byte alignment of private area */
6255 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6256 		alloc_size += sizeof_priv;
6257 	}
6258 	/* ensure 32-byte alignment of whole construct */
6259 	alloc_size += NETDEV_ALIGN - 1;
6260 
6261 	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6262 	if (!p)
6263 		p = vzalloc(alloc_size);
6264 	if (!p)
6265 		return NULL;
6266 
6267 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
6268 	dev->padded = (char *)dev - (char *)p;
6269 
6270 	dev->pcpu_refcnt = alloc_percpu(int);
6271 	if (!dev->pcpu_refcnt)
6272 		goto free_dev;
6273 
6274 	if (dev_addr_init(dev))
6275 		goto free_pcpu;
6276 
6277 	dev_mc_init(dev);
6278 	dev_uc_init(dev);
6279 
6280 	dev_net_set(dev, &init_net);
6281 
6282 	dev->gso_max_size = GSO_MAX_SIZE;
6283 	dev->gso_max_segs = GSO_MAX_SEGS;
6284 
6285 	INIT_LIST_HEAD(&dev->napi_list);
6286 	INIT_LIST_HEAD(&dev->unreg_list);
6287 	INIT_LIST_HEAD(&dev->close_list);
6288 	INIT_LIST_HEAD(&dev->link_watch_list);
6289 	INIT_LIST_HEAD(&dev->adj_list.upper);
6290 	INIT_LIST_HEAD(&dev->adj_list.lower);
6291 	INIT_LIST_HEAD(&dev->all_adj_list.upper);
6292 	INIT_LIST_HEAD(&dev->all_adj_list.lower);
6293 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
6294 	setup(dev);
6295 
6296 	dev->num_tx_queues = txqs;
6297 	dev->real_num_tx_queues = txqs;
6298 	if (netif_alloc_netdev_queues(dev))
6299 		goto free_all;
6300 
6301 #ifdef CONFIG_RPS
6302 	dev->num_rx_queues = rxqs;
6303 	dev->real_num_rx_queues = rxqs;
6304 	if (netif_alloc_rx_queues(dev))
6305 		goto free_all;
6306 #endif
6307 
6308 	strcpy(dev->name, name);
6309 	dev->group = INIT_NETDEV_GROUP;
6310 	if (!dev->ethtool_ops)
6311 		dev->ethtool_ops = &default_ethtool_ops;
6312 	return dev;
6313 
6314 free_all:
6315 	free_netdev(dev);
6316 	return NULL;
6317 
6318 free_pcpu:
6319 	free_percpu(dev->pcpu_refcnt);
6320 	netif_free_tx_queues(dev);
6321 #ifdef CONFIG_RPS
6322 	kfree(dev->_rx);
6323 #endif
6324 
6325 free_dev:
6326 	netdev_freemem(dev);
6327 	return NULL;
6328 }
6329 EXPORT_SYMBOL(alloc_netdev_mqs);
6330 
6331 /**
6332  *	free_netdev - free network device
6333  *	@dev: device
6334  *
6335  *	This function does the last stage of destroying an allocated device
6336  * 	interface. The reference to the device object is released.
6337  *	If this is the last reference then it will be freed.
6338  */
6339 void free_netdev(struct net_device *dev)
6340 {
6341 	struct napi_struct *p, *n;
6342 
6343 	release_net(dev_net(dev));
6344 
6345 	netif_free_tx_queues(dev);
6346 #ifdef CONFIG_RPS
6347 	kfree(dev->_rx);
6348 #endif
6349 
6350 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6351 
6352 	/* Flush device addresses */
6353 	dev_addr_flush(dev);
6354 
6355 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6356 		netif_napi_del(p);
6357 
6358 	free_percpu(dev->pcpu_refcnt);
6359 	dev->pcpu_refcnt = NULL;
6360 
6361 	/*  Compatibility with error handling in drivers */
6362 	if (dev->reg_state == NETREG_UNINITIALIZED) {
6363 		netdev_freemem(dev);
6364 		return;
6365 	}
6366 
6367 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6368 	dev->reg_state = NETREG_RELEASED;
6369 
6370 	/* will free via device release */
6371 	put_device(&dev->dev);
6372 }
6373 EXPORT_SYMBOL(free_netdev);
6374 
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait until packets that are being received right now have been
 *	dealt with.  Packets that start being received afterwards are not
 *	held back.
 *
 *	May sleep.  The expedited RCU grace period is used when the rtnl
 *	lock is held, the regular one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6390 
6391 /**
6392  *	unregister_netdevice_queue - remove device from the kernel
6393  *	@dev: device
6394  *	@head: list
6395  *
6396  *	This function shuts down a device interface and removes it
6397  *	from the kernel tables.
6398  *	If head not NULL, device is queued to be unregistered later.
6399  *
6400  *	Callers must hold the rtnl semaphore.  You may want
6401  *	unregister_netdev() instead of this.
6402  */
6403 
6404 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6405 {
6406 	ASSERT_RTNL();
6407 
6408 	if (head) {
6409 		list_move_tail(&dev->unreg_list, head);
6410 	} else {
6411 		rollback_registered(dev);
6412 		/* Finish processing unregister after unlock */
6413 		net_set_todo(dev);
6414 	}
6415 }
6416 EXPORT_SYMBOL(unregister_netdevice_queue);
6417 
6418 /**
6419  *	unregister_netdevice_many - unregister many devices
6420  *	@head: list of devices
6421  */
6422 void unregister_netdevice_many(struct list_head *head)
6423 {
6424 	struct net_device *dev;
6425 
6426 	if (!list_empty(head)) {
6427 		rollback_registered_many(head);
6428 		list_for_each_entry(dev, head, unreg_list)
6429 			net_set_todo(dev);
6430 	}
6431 }
6432 EXPORT_SYMBOL(unregister_netdevice_many);
6433 
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shut down a device interface and remove it from the kernel
 *	tables.
 *
 *	Wrapper around unregister_netdevice() that acquires the rtnl
 *	semaphore before the call and releases it afterwards.  In general
 *	this is the function to use rather than unregister_netdevice().
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6452 
6453 /**
6454  *	dev_change_net_namespace - move device to different nethost namespace
6455  *	@dev: device
6456  *	@net: network namespace
6457  *	@pat: If not NULL name pattern to try if the current device name
6458  *	      is already taken in the destination network namespace.
6459  *
6460  *	This function shuts down a device interface and moves it
6461  *	to a new network namespace. On success 0 is returned, on
6462  *	a failure a netagive errno code is returned.
6463  *
6464  *	Callers must hold the rtnl semaphore.
6465  */
6466 
6467 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6468 {
6469 	int err;
6470 
6471 	ASSERT_RTNL();
6472 
6473 	/* Don't allow namespace local devices to be moved. */
6474 	err = -EINVAL;
6475 	if (dev->features & NETIF_F_NETNS_LOCAL)
6476 		goto out;
6477 
6478 	/* Ensure the device has been registrered */
6479 	if (dev->reg_state != NETREG_REGISTERED)
6480 		goto out;
6481 
6482 	/* Get out if there is nothing todo */
6483 	err = 0;
6484 	if (net_eq(dev_net(dev), net))
6485 		goto out;
6486 
6487 	/* Pick the destination device name, and ensure
6488 	 * we can use it in the destination network namespace.
6489 	 */
6490 	err = -EEXIST;
6491 	if (__dev_get_by_name(net, dev->name)) {
6492 		/* We get here if we can't use the current device name */
6493 		if (!pat)
6494 			goto out;
6495 		if (dev_get_valid_name(net, dev, pat) < 0)
6496 			goto out;
6497 	}
6498 
6499 	/*
6500 	 * And now a mini version of register_netdevice unregister_netdevice.
6501 	 */
6502 
6503 	/* If device is running close it first. */
6504 	dev_close(dev);
6505 
6506 	/* And unlink it from device chain */
6507 	err = -ENODEV;
6508 	unlist_netdevice(dev);
6509 
6510 	synchronize_net();
6511 
6512 	/* Shutdown queueing discipline. */
6513 	dev_shutdown(dev);
6514 
6515 	/* Notify protocols, that we are about to destroy
6516 	   this device. They should clean all the things.
6517 
6518 	   Note that dev->reg_state stays at NETREG_REGISTERED.
6519 	   This is wanted because this way 8021q and macvlan know
6520 	   the device is just moving and can keep their slaves up.
6521 	*/
6522 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6523 	rcu_barrier();
6524 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6525 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6526 
6527 	/*
6528 	 *	Flush the unicast and multicast chains
6529 	 */
6530 	dev_uc_flush(dev);
6531 	dev_mc_flush(dev);
6532 
6533 	/* Send a netdev-removed uevent to the old namespace */
6534 	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6535 
6536 	/* Actually switch the network namespace */
6537 	dev_net_set(dev, net);
6538 
6539 	/* If there is an ifindex conflict assign a new one */
6540 	if (__dev_get_by_index(net, dev->ifindex)) {
6541 		int iflink = (dev->iflink == dev->ifindex);
6542 		dev->ifindex = dev_new_index(net);
6543 		if (iflink)
6544 			dev->iflink = dev->ifindex;
6545 	}
6546 
6547 	/* Send a netdev-add uevent to the new namespace */
6548 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6549 
6550 	/* Fixup kobjects */
6551 	err = device_rename(&dev->dev, dev->name);
6552 	WARN_ON(err);
6553 
6554 	/* Add the device back in the hashes */
6555 	list_netdevice(dev);
6556 
6557 	/* Notify protocols, that a new device appeared. */
6558 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6559 
6560 	/*
6561 	 *	Prevent userspace races by waiting until the network
6562 	 *	device is fully setup before sending notifications.
6563 	 */
6564 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6565 
6566 	synchronize_net();
6567 	err = 0;
6568 out:
6569 	return err;
6570 }
6571 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6572 
6573 static int dev_cpu_callback(struct notifier_block *nfb,
6574 			    unsigned long action,
6575 			    void *ocpu)
6576 {
6577 	struct sk_buff **list_skb;
6578 	struct sk_buff *skb;
6579 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6580 	struct softnet_data *sd, *oldsd;
6581 
6582 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6583 		return NOTIFY_OK;
6584 
6585 	local_irq_disable();
6586 	cpu = smp_processor_id();
6587 	sd = &per_cpu(softnet_data, cpu);
6588 	oldsd = &per_cpu(softnet_data, oldcpu);
6589 
6590 	/* Find end of our completion_queue. */
6591 	list_skb = &sd->completion_queue;
6592 	while (*list_skb)
6593 		list_skb = &(*list_skb)->next;
6594 	/* Append completion queue from offline CPU. */
6595 	*list_skb = oldsd->completion_queue;
6596 	oldsd->completion_queue = NULL;
6597 
6598 	/* Append output queue from offline CPU. */
6599 	if (oldsd->output_queue) {
6600 		*sd->output_queue_tailp = oldsd->output_queue;
6601 		sd->output_queue_tailp = oldsd->output_queue_tailp;
6602 		oldsd->output_queue = NULL;
6603 		oldsd->output_queue_tailp = &oldsd->output_queue;
6604 	}
6605 	/* Append NAPI poll list from offline CPU. */
6606 	if (!list_empty(&oldsd->poll_list)) {
6607 		list_splice_init(&oldsd->poll_list, &sd->poll_list);
6608 		raise_softirq_irqoff(NET_RX_SOFTIRQ);
6609 	}
6610 
6611 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6612 	local_irq_enable();
6613 
6614 	/* Process offline CPU's input_pkt_queue */
6615 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6616 		netif_rx(skb);
6617 		input_queue_head_incr(oldsd);
6618 	}
6619 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6620 		netif_rx(skb);
6621 		input_queue_head_incr(oldsd);
6622 	}
6623 
6624 	return NOTIFY_OK;
6625 }
6626 
6627 
6628 /**
6629  *	netdev_increment_features - increment feature set by one
6630  *	@all: current feature set
6631  *	@one: new feature set
6632  *	@mask: mask feature set
6633  *
6634  *	Computes a new feature set after adding a device with feature set
6635  *	@one to the master device with current feature set @all.  Will not
6636  *	enable anything that is off in @mask. Returns the new feature set.
6637  */
6638 netdev_features_t netdev_increment_features(netdev_features_t all,
6639 	netdev_features_t one, netdev_features_t mask)
6640 {
6641 	if (mask & NETIF_F_GEN_CSUM)
6642 		mask |= NETIF_F_ALL_CSUM;
6643 	mask |= NETIF_F_VLAN_CHALLENGED;
6644 
6645 	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6646 	all &= one | ~NETIF_F_ALL_FOR_ALL;
6647 
6648 	/* If one device supports hw checksumming, set for all. */
6649 	if (all & NETIF_F_GEN_CSUM)
6650 		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6651 
6652 	return all;
6653 }
6654 EXPORT_SYMBOL(netdev_increment_features);
6655 
6656 static struct hlist_head * __net_init netdev_create_hash(void)
6657 {
6658 	int i;
6659 	struct hlist_head *hash;
6660 
6661 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6662 	if (hash != NULL)
6663 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6664 			INIT_HLIST_HEAD(&hash[i]);
6665 
6666 	return hash;
6667 }
6668 
6669 /* Initialize per network namespace state */
6670 static int __net_init netdev_init(struct net *net)
6671 {
6672 	if (net != &init_net)
6673 		INIT_LIST_HEAD(&net->dev_base_head);
6674 
6675 	net->dev_name_head = netdev_create_hash();
6676 	if (net->dev_name_head == NULL)
6677 		goto err_name;
6678 
6679 	net->dev_index_head = netdev_create_hash();
6680 	if (net->dev_index_head == NULL)
6681 		goto err_idx;
6682 
6683 	return 0;
6684 
6685 err_idx:
6686 	kfree(net->dev_name_head);
6687 err_name:
6688 	return -ENOMEM;
6689 }
6690 
6691 /**
6692  *	netdev_drivername - network driver for the device
6693  *	@dev: network device
6694  *
6695  *	Determine network driver for device.
6696  */
6697 const char *netdev_drivername(const struct net_device *dev)
6698 {
6699 	const struct device_driver *driver;
6700 	const struct device *parent;
6701 	const char *empty = "";
6702 
6703 	parent = dev->dev.parent;
6704 	if (!parent)
6705 		return empty;
6706 
6707 	driver = parent->driver;
6708 	if (driver && driver->name)
6709 		return driver->name;
6710 	return empty;
6711 }
6712 
6713 static int __netdev_printk(const char *level, const struct net_device *dev,
6714 			   struct va_format *vaf)
6715 {
6716 	int r;
6717 
6718 	if (dev && dev->dev.parent) {
6719 		r = dev_printk_emit(level[1] - '0',
6720 				    dev->dev.parent,
6721 				    "%s %s %s: %pV",
6722 				    dev_driver_string(dev->dev.parent),
6723 				    dev_name(dev->dev.parent),
6724 				    netdev_name(dev), vaf);
6725 	} else if (dev) {
6726 		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6727 	} else {
6728 		r = printk("%s(NULL net_device): %pV", level, vaf);
6729 	}
6730 
6731 	return r;
6732 }
6733 
6734 int netdev_printk(const char *level, const struct net_device *dev,
6735 		  const char *format, ...)
6736 {
6737 	struct va_format vaf;
6738 	va_list args;
6739 	int r;
6740 
6741 	va_start(args, format);
6742 
6743 	vaf.fmt = format;
6744 	vaf.va = &args;
6745 
6746 	r = __netdev_printk(level, dev, &vaf);
6747 
6748 	va_end(args);
6749 
6750 	return r;
6751 }
6752 EXPORT_SYMBOL(netdev_printk);
6753 
6754 #define define_netdev_printk_level(func, level)			\
6755 int func(const struct net_device *dev, const char *fmt, ...)	\
6756 {								\
6757 	int r;							\
6758 	struct va_format vaf;					\
6759 	va_list args;						\
6760 								\
6761 	va_start(args, fmt);					\
6762 								\
6763 	vaf.fmt = fmt;						\
6764 	vaf.va = &args;						\
6765 								\
6766 	r = __netdev_printk(level, dev, &vaf);			\
6767 								\
6768 	va_end(args);						\
6769 								\
6770 	return r;						\
6771 }								\
6772 EXPORT_SYMBOL(func);
6773 
6774 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6775 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6776 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6777 define_netdev_printk_level(netdev_err, KERN_ERR);
6778 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6779 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6780 define_netdev_printk_level(netdev_info, KERN_INFO);
6781 
6782 static void __net_exit netdev_exit(struct net *net)
6783 {
6784 	kfree(net->dev_name_head);
6785 	kfree(net->dev_index_head);
6786 }
6787 
6788 static struct pernet_operations __net_initdata netdev_net_ops = {
6789 	.init = netdev_init,
6790 	.exit = netdev_exit,
6791 };
6792 
6793 static void __net_exit default_device_exit(struct net *net)
6794 {
6795 	struct net_device *dev, *aux;
6796 	/*
6797 	 * Push all migratable network devices back to the
6798 	 * initial network namespace
6799 	 */
6800 	rtnl_lock();
6801 	for_each_netdev_safe(net, dev, aux) {
6802 		int err;
6803 		char fb_name[IFNAMSIZ];
6804 
6805 		/* Ignore unmoveable devices (i.e. loopback) */
6806 		if (dev->features & NETIF_F_NETNS_LOCAL)
6807 			continue;
6808 
6809 		/* Leave virtual devices for the generic cleanup */
6810 		if (dev->rtnl_link_ops)
6811 			continue;
6812 
6813 		/* Push remaining network devices to init_net */
6814 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6815 		err = dev_change_net_namespace(dev, &init_net, fb_name);
6816 		if (err) {
6817 			pr_emerg("%s: failed to move %s to init_net: %d\n",
6818 				 __func__, dev->name, err);
6819 			BUG();
6820 		}
6821 	}
6822 	rtnl_unlock();
6823 }
6824 
6825 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
6826 {
6827 	/* Return with the rtnl_lock held when there are no network
6828 	 * devices unregistering in any network namespace in net_list.
6829 	 */
6830 	struct net *net;
6831 	bool unregistering;
6832 	DEFINE_WAIT(wait);
6833 
6834 	for (;;) {
6835 		prepare_to_wait(&netdev_unregistering_wq, &wait,
6836 				TASK_UNINTERRUPTIBLE);
6837 		unregistering = false;
6838 		rtnl_lock();
6839 		list_for_each_entry(net, net_list, exit_list) {
6840 			if (net->dev_unreg_count > 0) {
6841 				unregistering = true;
6842 				break;
6843 			}
6844 		}
6845 		if (!unregistering)
6846 			break;
6847 		__rtnl_unlock();
6848 		schedule();
6849 	}
6850 	finish_wait(&netdev_unregistering_wq, &wait);
6851 }
6852 
6853 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6854 {
6855 	/* At exit all network devices most be removed from a network
6856 	 * namespace.  Do this in the reverse order of registration.
6857 	 * Do this across as many network namespaces as possible to
6858 	 * improve batching efficiency.
6859 	 */
6860 	struct net_device *dev;
6861 	struct net *net;
6862 	LIST_HEAD(dev_kill_list);
6863 
6864 	/* To prevent network device cleanup code from dereferencing
6865 	 * loopback devices or network devices that have been freed
6866 	 * wait here for all pending unregistrations to complete,
6867 	 * before unregistring the loopback device and allowing the
6868 	 * network namespace be freed.
6869 	 *
6870 	 * The netdev todo list containing all network devices
6871 	 * unregistrations that happen in default_device_exit_batch
6872 	 * will run in the rtnl_unlock() at the end of
6873 	 * default_device_exit_batch.
6874 	 */
6875 	rtnl_lock_unregistering(net_list);
6876 	list_for_each_entry(net, net_list, exit_list) {
6877 		for_each_netdev_reverse(net, dev) {
6878 			if (dev->rtnl_link_ops)
6879 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6880 			else
6881 				unregister_netdevice_queue(dev, &dev_kill_list);
6882 		}
6883 	}
6884 	unregister_netdevice_many(&dev_kill_list);
6885 	list_del(&dev_kill_list);
6886 	rtnl_unlock();
6887 }
6888 
6889 static struct pernet_operations __net_initdata default_device_ops = {
6890 	.exit = default_device_exit,
6891 	.exit_batch = default_device_exit_batch,
6892 };
6893 
6894 /*
6895  *	Initialize the DEV module. At boot time this walks the device list and
6896  *	unhooks any devices that fail to initialise (normally hardware not
6897  *	present) and leaves us with a valid list of present and active devices.
6898  *
6899  */
6900 
6901 /*
6902  *       This is called single threaded during boot, so no need
6903  *       to take the rtnl semaphore.
6904  */
6905 static int __init net_dev_init(void)
6906 {
6907 	int i, rc = -ENOMEM;
6908 
6909 	BUG_ON(!dev_boot_phase);
6910 
6911 	if (dev_proc_init())
6912 		goto out;
6913 
6914 	if (netdev_kobject_init())
6915 		goto out;
6916 
6917 	INIT_LIST_HEAD(&ptype_all);
6918 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6919 		INIT_LIST_HEAD(&ptype_base[i]);
6920 
6921 	INIT_LIST_HEAD(&offload_base);
6922 
6923 	if (register_pernet_subsys(&netdev_net_ops))
6924 		goto out;
6925 
6926 	/*
6927 	 *	Initialise the packet receive queues.
6928 	 */
6929 
6930 	for_each_possible_cpu(i) {
6931 		struct softnet_data *sd = &per_cpu(softnet_data, i);
6932 
6933 		memset(sd, 0, sizeof(*sd));
6934 		skb_queue_head_init(&sd->input_pkt_queue);
6935 		skb_queue_head_init(&sd->process_queue);
6936 		sd->completion_queue = NULL;
6937 		INIT_LIST_HEAD(&sd->poll_list);
6938 		sd->output_queue = NULL;
6939 		sd->output_queue_tailp = &sd->output_queue;
6940 #ifdef CONFIG_RPS
6941 		sd->csd.func = rps_trigger_softirq;
6942 		sd->csd.info = sd;
6943 		sd->csd.flags = 0;
6944 		sd->cpu = i;
6945 #endif
6946 
6947 		sd->backlog.poll = process_backlog;
6948 		sd->backlog.weight = weight_p;
6949 		sd->backlog.gro_list = NULL;
6950 		sd->backlog.gro_count = 0;
6951 
6952 #ifdef CONFIG_NET_FLOW_LIMIT
6953 		sd->flow_limit = NULL;
6954 #endif
6955 	}
6956 
6957 	dev_boot_phase = 0;
6958 
6959 	/* The loopback device is special if any other network devices
6960 	 * is present in a network namespace the loopback device must
6961 	 * be present. Since we now dynamically allocate and free the
6962 	 * loopback device ensure this invariant is maintained by
6963 	 * keeping the loopback device as the first device on the
6964 	 * list of network devices.  Ensuring the loopback devices
6965 	 * is the first device that appears and the last network device
6966 	 * that disappears.
6967 	 */
6968 	if (register_pernet_device(&loopback_net_ops))
6969 		goto out;
6970 
6971 	if (register_pernet_device(&default_device_ops))
6972 		goto out;
6973 
6974 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6975 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6976 
6977 	hotcpu_notifier(dev_cpu_callback, 0);
6978 	dst_init();
6979 	rc = 0;
6980 out:
6981 	return rc;
6982 }
6983 
6984 subsys_initcall(net_dev_init);
6985