xref: /openbmc/linux/net/core/dev.c (revision 9c7dafbfab1554705f85523fead578aa1a3d338c)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/proc_fs.h>
101 #include <linux/seq_file.h>
102 #include <linux/stat.h>
103 #include <net/dst.h>
104 #include <net/pkt_sched.h>
105 #include <net/checksum.h>
106 #include <net/xfrm.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/kmod.h>
110 #include <linux/module.h>
111 #include <linux/netpoll.h>
112 #include <linux/rcupdate.h>
113 #include <linux/delay.h>
114 #include <net/wext.h>
115 #include <net/iw_handler.h>
116 #include <asm/current.h>
117 #include <linux/audit.h>
118 #include <linux/dmaengine.h>
119 #include <linux/err.h>
120 #include <linux/ctype.h>
121 #include <linux/if_arp.h>
122 #include <linux/if_vlan.h>
123 #include <linux/ip.h>
124 #include <net/ip.h>
125 #include <linux/ipv6.h>
126 #include <linux/in.h>
127 #include <linux/jhash.h>
128 #include <linux/random.h>
129 #include <trace/events/napi.h>
130 #include <trace/events/net.h>
131 #include <trace/events/skb.h>
132 #include <linux/pci.h>
133 #include <linux/inetdevice.h>
134 #include <linux/cpu_rmap.h>
135 #include <linux/net_tstamp.h>
136 #include <linux/static_key.h>
137 #include <net/flow_keys.h>
138 
139 #include "net-sysfs.h"
140 
141 /* Instead of increasing this, you should create a hash table. */
142 #define MAX_GRO_SKBS 8
143 
144 /* This should be increased if a protocol with a bigger header is added. */
145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
146 
147 /*
148  *	The list of packet types we will receive (as opposed to discard)
149  *	and the routines to invoke.
150  *
151  *	Why 16? Because with 16 the only overlap we get on a hash of the
152  *	low nibble of the protocol value is RARP/SNAP/X.25.
153  *
154  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
155  *             sure which should go first, but I bet it won't make much
156  *             difference if we are running VLANs.  The good news is that
157  *             this protocol won't be in the list unless compiled in, so
158  *             the average user (w/out VLANs) will not be adversely affected.
159  *             --BLG
160  *
161  *		0800	IP
162  *		8100    802.1Q VLAN
163  *		0001	802.3
164  *		0002	AX.25
165  *		0004	802.2
166  *		8035	RARP
167  *		0005	SNAP
168  *		0805	X.25
169  *		0806	ARP
170  *		8137	IPX
171  *		0009	Localtalk
172  *		86DD	IPv6
173  */
174 
175 #define PTYPE_HASH_SIZE	(16)
176 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
177 
178 static DEFINE_SPINLOCK(ptype_lock);
179 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
180 static struct list_head ptype_all __read_mostly;	/* Taps */
181 
182 /*
183  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
184  * semaphore.
185  *
186  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
187  *
188  * Writers must hold the rtnl semaphore while they loop through the
189  * dev_base_head list, and hold dev_base_lock for writing when they do the
190  * actual updates.  This allows pure readers to access the list even
191  * while a writer is preparing to update it.
192  *
193  * To put it another way, dev_base_lock is held for writing only to
194  * protect against pure readers; the rtnl semaphore provides the
195  * protection against other writers.
196  *
197  * See, for example usages, register_netdevice() and
198  * unregister_netdevice(), which must be called with the rtnl
199  * semaphore held.
200  */
201 DEFINE_RWLOCK(dev_base_lock);
202 EXPORT_SYMBOL(dev_base_lock);
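/*
 * Illustrative sketch (not part of the original file): a pure reader can
 * walk the device list either under dev_base_lock or under RCU, where
 * "net" is the namespace of interest:
 *
 *	struct net_device *dev;
 *
 *	read_lock(&dev_base_lock);
 *	for_each_netdev(net, dev)
 *		pr_info("%s\n", dev->name);
 *	read_unlock(&dev_base_lock);
 *
 * or, equivalently for read-only access:
 *
 *	rcu_read_lock();
 *	for_each_netdev_rcu(net, dev)
 *		pr_info("%s\n", dev->name);
 *	rcu_read_unlock();
 */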
203 
204 static inline void dev_base_seq_inc(struct net *net)
205 {
206 	while (++net->dev_base_seq == 0);
207 }
208 
209 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
210 {
211 	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
212 
213 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
214 }
215 
216 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
217 {
218 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
219 }
220 
221 static inline void rps_lock(struct softnet_data *sd)
222 {
223 #ifdef CONFIG_RPS
224 	spin_lock(&sd->input_pkt_queue.lock);
225 #endif
226 }
227 
228 static inline void rps_unlock(struct softnet_data *sd)
229 {
230 #ifdef CONFIG_RPS
231 	spin_unlock(&sd->input_pkt_queue.lock);
232 #endif
233 }
234 
235 /* Device list insertion */
236 static int list_netdevice(struct net_device *dev)
237 {
238 	struct net *net = dev_net(dev);
239 
240 	ASSERT_RTNL();
241 
242 	write_lock_bh(&dev_base_lock);
243 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
244 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
245 	hlist_add_head_rcu(&dev->index_hlist,
246 			   dev_index_hash(net, dev->ifindex));
247 	write_unlock_bh(&dev_base_lock);
248 
249 	dev_base_seq_inc(net);
250 
251 	return 0;
252 }
253 
254 /* Device list removal
255  * caller must respect a RCU grace period before freeing/reusing dev
256  */
257 static void unlist_netdevice(struct net_device *dev)
258 {
259 	ASSERT_RTNL();
260 
261 	/* Unlink dev from the device chain */
262 	write_lock_bh(&dev_base_lock);
263 	list_del_rcu(&dev->dev_list);
264 	hlist_del_rcu(&dev->name_hlist);
265 	hlist_del_rcu(&dev->index_hlist);
266 	write_unlock_bh(&dev_base_lock);
267 
268 	dev_base_seq_inc(dev_net(dev));
269 }
270 
271 /*
272  *	Our notifier list
273  */
274 
275 static RAW_NOTIFIER_HEAD(netdev_chain);
276 
277 /*
278  *	Device drivers call our routines to queue packets here. We empty the
279  *	queue in the local softnet handler.
280  */
281 
282 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
283 EXPORT_PER_CPU_SYMBOL(softnet_data);
284 
285 #ifdef CONFIG_LOCKDEP
286 /*
287  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
288  * according to dev->type
289  */
290 static const unsigned short netdev_lock_type[] =
291 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
292 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
293 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
294 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
295 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
296 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
297 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
298 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
299 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
300 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
301 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
302 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
303 	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
304 	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
305 	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
306 
307 static const char *const netdev_lock_name[] =
308 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
309 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
310 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
311 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
312 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
313 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
314 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
315 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
316 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
317 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
318 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
319 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
320 	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
321 	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
322 	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
323 
324 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
325 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
326 
327 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
328 {
329 	int i;
330 
331 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
332 		if (netdev_lock_type[i] == dev_type)
333 			return i;
334 	/* the last key is used by default */
335 	return ARRAY_SIZE(netdev_lock_type) - 1;
336 }
337 
338 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
339 						 unsigned short dev_type)
340 {
341 	int i;
342 
343 	i = netdev_lock_pos(dev_type);
344 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
345 				   netdev_lock_name[i]);
346 }
347 
348 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
349 {
350 	int i;
351 
352 	i = netdev_lock_pos(dev->type);
353 	lockdep_set_class_and_name(&dev->addr_list_lock,
354 				   &netdev_addr_lock_key[i],
355 				   netdev_lock_name[i]);
356 }
357 #else
358 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
359 						 unsigned short dev_type)
360 {
361 }
362 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
363 {
364 }
365 #endif
366 
367 /*******************************************************************************
368 
369 		Protocol management and registration routines
370 
371 *******************************************************************************/
372 
373 /*
374  *	Add a protocol ID to the list. Now that the input handler is
375  *	smarter we can dispense with all the messy stuff that used to be
376  *	here.
377  *
378  *	BEWARE!!! Protocol handlers that mangle input packets
379  *	MUST BE last in the hash buckets, and checking of protocol handlers
380  *	MUST start from the promiscuous ptype_all chain in net_bh.
381  *	That is true now; do not change it.
382  *	Explanation: if a protocol handler that mangles packets were
383  *	first on the list, it could not sense that the packet is
384  *	cloned and should be copied-on-write, so it would modify the
385  *	packet and subsequent readers would get a broken packet.
386  *							--ANK (980803)
387  */
388 
389 static inline struct list_head *ptype_head(const struct packet_type *pt)
390 {
391 	if (pt->type == htons(ETH_P_ALL))
392 		return &ptype_all;
393 	else
394 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
395 }
396 
397 /**
398  *	dev_add_pack - add packet handler
399  *	@pt: packet type declaration
400  *
401  *	Add a protocol handler to the networking stack. The passed &packet_type
402  *	is linked into kernel lists and may not be freed until it has been
403  *	removed from the kernel lists.
404  *
405  *	This call does not sleep, therefore it cannot
406  *	guarantee that all CPUs that are in the middle of receiving
407  *	packets will see the new packet type (until the next received packet).
408  */
409 
410 void dev_add_pack(struct packet_type *pt)
411 {
412 	struct list_head *head = ptype_head(pt);
413 
414 	spin_lock(&ptype_lock);
415 	list_add_rcu(&pt->list, head);
416 	spin_unlock(&ptype_lock);
417 }
418 EXPORT_SYMBOL(dev_add_pack);
419 
420 /**
421  *	__dev_remove_pack	 - remove packet handler
422  *	@pt: packet type declaration
423  *
424  *	Remove a protocol handler that was previously added to the kernel
425  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
426  *	from the kernel lists and can be freed or reused once this function
427  *	returns.
428  *
429  *      The packet type might still be in use by receivers
430  *	and must not be freed until after all the CPUs have gone
431  *	through a quiescent state.
432  */
433 void __dev_remove_pack(struct packet_type *pt)
434 {
435 	struct list_head *head = ptype_head(pt);
436 	struct packet_type *pt1;
437 
438 	spin_lock(&ptype_lock);
439 
440 	list_for_each_entry(pt1, head, list) {
441 		if (pt == pt1) {
442 			list_del_rcu(&pt->list);
443 			goto out;
444 		}
445 	}
446 
447 	pr_warn("dev_remove_pack: %p not found\n", pt);
448 out:
449 	spin_unlock(&ptype_lock);
450 }
451 EXPORT_SYMBOL(__dev_remove_pack);
452 
453 /**
454  *	dev_remove_pack	 - remove packet handler
455  *	@pt: packet type declaration
456  *
457  *	Remove a protocol handler that was previously added to the kernel
458  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
459  *	from the kernel lists and can be freed or reused once this function
460  *	returns.
461  *
462  *	This call sleeps to guarantee that no CPU is looking at the packet
463  *	type after return.
464  */
465 void dev_remove_pack(struct packet_type *pt)
466 {
467 	__dev_remove_pack(pt);
468 
469 	synchronize_net();
470 }
471 EXPORT_SYMBOL(dev_remove_pack);
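/*
 * Illustrative sketch (not part of the original file): a protocol module
 * typically registers and removes a handler like this.  The names
 * my_proto_rcv and my_ptype are hypothetical.
 *
 *	static int my_proto_rcv(struct sk_buff *skb, struct net_device *dev,
 *				struct packet_type *pt,
 *				struct net_device *orig_dev)
 *	{
 *		// consume or deliver the packet
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_ptype __read_mostly = {
 *		.type	= cpu_to_be16(ETH_P_IP),	// or ETH_P_ALL for a tap
 *		.func	= my_proto_rcv,
 *	};
 *
 *	dev_add_pack(&my_ptype);	// e.g. in module init
 *	...
 *	dev_remove_pack(&my_ptype);	// e.g. in module exit
 */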
472 
473 /******************************************************************************
474 
475 		      Device Boot-time Settings Routines
476 
477 *******************************************************************************/
478 
479 /* Boot time configuration table */
480 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
481 
482 /**
483  *	netdev_boot_setup_add	- add new setup entry
484  *	@name: name of the device
485  *	@map: configured settings for the device
486  *
487  *	Adds a new setup entry to the dev_boot_setup list.  The function
488  *	returns 0 on error and 1 on success.  This is a generic routine for
489  *	all netdevices.
490  */
491 static int netdev_boot_setup_add(char *name, struct ifmap *map)
492 {
493 	struct netdev_boot_setup *s;
494 	int i;
495 
496 	s = dev_boot_setup;
497 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
498 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
499 			memset(s[i].name, 0, sizeof(s[i].name));
500 			strlcpy(s[i].name, name, IFNAMSIZ);
501 			memcpy(&s[i].map, map, sizeof(s[i].map));
502 			break;
503 		}
504 	}
505 
506 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
507 }
508 
509 /**
510  *	netdev_boot_setup_check	- check boot time settings
511  *	@dev: the netdevice
512  *
513  * 	Check boot time settings for the device.
514  *	Any settings found are applied to the device for use
515  *	later during device probing.
516  *	Returns 1 if settings were found, 0 otherwise.
517  */
518 int netdev_boot_setup_check(struct net_device *dev)
519 {
520 	struct netdev_boot_setup *s = dev_boot_setup;
521 	int i;
522 
523 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
524 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
525 		    !strcmp(dev->name, s[i].name)) {
526 			dev->irq 	= s[i].map.irq;
527 			dev->base_addr 	= s[i].map.base_addr;
528 			dev->mem_start 	= s[i].map.mem_start;
529 			dev->mem_end 	= s[i].map.mem_end;
530 			return 1;
531 		}
532 	}
533 	return 0;
534 }
535 EXPORT_SYMBOL(netdev_boot_setup_check);
536 
537 
538 /**
539  *	netdev_boot_base	- get address from boot time settings
540  *	@prefix: prefix for network device
541  *	@unit: id for network device
542  *
543  * 	Check boot time settings for the base address of the device.
544  *	Any settings found are applied to the device for use
545  *	later during device probing.
546  *	Returns 0 if no settings are found.
547  */
548 unsigned long netdev_boot_base(const char *prefix, int unit)
549 {
550 	const struct netdev_boot_setup *s = dev_boot_setup;
551 	char name[IFNAMSIZ];
552 	int i;
553 
554 	sprintf(name, "%s%d", prefix, unit);
555 
556 	/*
557 	 * If device already registered then return base of 1
558 	 * to indicate not to probe for this interface
559 	 */
560 	if (__dev_get_by_name(&init_net, name))
561 		return 1;
562 
563 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
564 		if (!strcmp(name, s[i].name))
565 			return s[i].map.base_addr;
566 	return 0;
567 }
568 
569 /*
570  * Saves boot-time configured settings for any netdevice.
571  */
572 int __init netdev_boot_setup(char *str)
573 {
574 	int ints[5];
575 	struct ifmap map;
576 
577 	str = get_options(str, ARRAY_SIZE(ints), ints);
578 	if (!str || !*str)
579 		return 0;
580 
581 	/* Save settings */
582 	memset(&map, 0, sizeof(map));
583 	if (ints[0] > 0)
584 		map.irq = ints[1];
585 	if (ints[0] > 1)
586 		map.base_addr = ints[2];
587 	if (ints[0] > 2)
588 		map.mem_start = ints[3];
589 	if (ints[0] > 3)
590 		map.mem_end = ints[4];
591 
592 	/* Add new entry to the list */
593 	return netdev_boot_setup_add(str, &map);
594 }
595 
596 __setup("netdev=", netdev_boot_setup);
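/*
 * Illustrative example (not part of the original file): with the handler
 * above, a kernel command line entry such as
 *
 *	netdev=5,0x340,0,0,eth0
 *
 * records irq=5 and base_addr=0x340 for "eth0" in dev_boot_setup, to be
 * picked up later by netdev_boot_setup_check() during device probing.
 */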
597 
598 /*******************************************************************************
599 
600 			    Device Interface Subroutines
601 
602 *******************************************************************************/
603 
604 /**
605  *	__dev_get_by_name	- find a device by its name
606  *	@net: the applicable net namespace
607  *	@name: name to find
608  *
609  *	Find an interface by name. Must be called under RTNL semaphore
610  *	or @dev_base_lock. If the name is found a pointer to the device
611  *	is returned. If the name is not found then %NULL is returned. The
612  *	reference counters are not incremented so the caller must be
613  *	careful with locks.
614  */
615 
616 struct net_device *__dev_get_by_name(struct net *net, const char *name)
617 {
618 	struct hlist_node *p;
619 	struct net_device *dev;
620 	struct hlist_head *head = dev_name_hash(net, name);
621 
622 	hlist_for_each_entry(dev, p, head, name_hlist)
623 		if (!strncmp(dev->name, name, IFNAMSIZ))
624 			return dev;
625 
626 	return NULL;
627 }
628 EXPORT_SYMBOL(__dev_get_by_name);
629 
630 /**
631  *	dev_get_by_name_rcu	- find a device by its name
632  *	@net: the applicable net namespace
633  *	@name: name to find
634  *
635  *	Find an interface by name.
636  *	If the name is found a pointer to the device is returned.
637  * 	If the name is not found then %NULL is returned.
638  *	The reference counters are not incremented so the caller must be
639  *	careful with locks. The caller must hold RCU lock.
640  */
641 
642 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
643 {
644 	struct hlist_node *p;
645 	struct net_device *dev;
646 	struct hlist_head *head = dev_name_hash(net, name);
647 
648 	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
649 		if (!strncmp(dev->name, name, IFNAMSIZ))
650 			return dev;
651 
652 	return NULL;
653 }
654 EXPORT_SYMBOL(dev_get_by_name_rcu);
655 
656 /**
657  *	dev_get_by_name		- find a device by its name
658  *	@net: the applicable net namespace
659  *	@name: name to find
660  *
661  *	Find an interface by name. This can be called from any
662  *	context and does its own locking. The returned handle has
663  *	the usage count incremented and the caller must use dev_put() to
664  *	release it when it is no longer needed. %NULL is returned if no
665  *	matching device is found.
666  */
667 
668 struct net_device *dev_get_by_name(struct net *net, const char *name)
669 {
670 	struct net_device *dev;
671 
672 	rcu_read_lock();
673 	dev = dev_get_by_name_rcu(net, name);
674 	if (dev)
675 		dev_hold(dev);
676 	rcu_read_unlock();
677 	return dev;
678 }
679 EXPORT_SYMBOL(dev_get_by_name);
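/*
 * Illustrative sketch (not part of the original file): a caller that cannot
 * stay inside RCU or RTNL while using the device takes a reference and
 * drops it when done:
 *
 *	struct net_device *dev = dev_get_by_name(net, "eth0");
 *
 *	if (dev) {
 *		// ... use dev ...
 *		dev_put(dev);
 *	}
 */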
680 
681 /**
682  *	__dev_get_by_index - find a device by its ifindex
683  *	@net: the applicable net namespace
684  *	@ifindex: index of device
685  *
686  *	Search for an interface by index. Returns a pointer to the device,
687  *	or %NULL if the device is not found. The device has not
688  *	had its reference counter increased so the caller must be careful
689  *	about locking. The caller must hold either the RTNL semaphore
690  *	or @dev_base_lock.
691  */
692 
693 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
694 {
695 	struct hlist_node *p;
696 	struct net_device *dev;
697 	struct hlist_head *head = dev_index_hash(net, ifindex);
698 
699 	hlist_for_each_entry(dev, p, head, index_hlist)
700 		if (dev->ifindex == ifindex)
701 			return dev;
702 
703 	return NULL;
704 }
705 EXPORT_SYMBOL(__dev_get_by_index);
706 
707 /**
708  *	dev_get_by_index_rcu - find a device by its ifindex
709  *	@net: the applicable net namespace
710  *	@ifindex: index of device
711  *
712  *	Search for an interface by index. Returns a pointer to the device,
713  *	or %NULL if the device is not found. The device has not
714  *	had its reference counter increased so the caller must be careful
715  *	about locking. The caller must hold RCU lock.
716  */
717 
718 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
719 {
720 	struct hlist_node *p;
721 	struct net_device *dev;
722 	struct hlist_head *head = dev_index_hash(net, ifindex);
723 
724 	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
725 		if (dev->ifindex == ifindex)
726 			return dev;
727 
728 	return NULL;
729 }
730 EXPORT_SYMBOL(dev_get_by_index_rcu);
731 
732 
733 /**
734  *	dev_get_by_index - find a device by its ifindex
735  *	@net: the applicable net namespace
736  *	@ifindex: index of device
737  *
738  *	Search for an interface by index. Returns a pointer to the device,
739  *	or NULL if the device is not found. The device returned has
740  *	had a reference added and the pointer is safe until the user calls
741  *	dev_put to indicate they have finished with it.
742  */
743 
744 struct net_device *dev_get_by_index(struct net *net, int ifindex)
745 {
746 	struct net_device *dev;
747 
748 	rcu_read_lock();
749 	dev = dev_get_by_index_rcu(net, ifindex);
750 	if (dev)
751 		dev_hold(dev);
752 	rcu_read_unlock();
753 	return dev;
754 }
755 EXPORT_SYMBOL(dev_get_by_index);
756 
757 /**
758  *	dev_getbyhwaddr_rcu - find a device by its hardware address
759  *	@net: the applicable net namespace
760  *	@type: media type of device
761  *	@ha: hardware address
762  *
763  *	Search for an interface by MAC address. Returns a pointer to the
764  *	device, or NULL if the device is not found.
765  *	The caller must hold RCU or RTNL.
766  *	The returned device has not had its ref count increased
767  *	and the caller must therefore be careful about locking.
768  *
769  */
770 
771 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
772 				       const char *ha)
773 {
774 	struct net_device *dev;
775 
776 	for_each_netdev_rcu(net, dev)
777 		if (dev->type == type &&
778 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
779 			return dev;
780 
781 	return NULL;
782 }
783 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
784 
785 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
786 {
787 	struct net_device *dev;
788 
789 	ASSERT_RTNL();
790 	for_each_netdev(net, dev)
791 		if (dev->type == type)
792 			return dev;
793 
794 	return NULL;
795 }
796 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
797 
798 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
799 {
800 	struct net_device *dev, *ret = NULL;
801 
802 	rcu_read_lock();
803 	for_each_netdev_rcu(net, dev)
804 		if (dev->type == type) {
805 			dev_hold(dev);
806 			ret = dev;
807 			break;
808 		}
809 	rcu_read_unlock();
810 	return ret;
811 }
812 EXPORT_SYMBOL(dev_getfirstbyhwtype);
813 
814 /**
815  *	dev_get_by_flags_rcu - find any device with given flags
816  *	@net: the applicable net namespace
817  *	@if_flags: IFF_* values
818  *	@mask: bitmask of bits in if_flags to check
819  *
820  *	Search for any interface with the given flags. Returns a pointer to
821  *	the first matching device, or NULL if none is found. Must be called
822  *	inside rcu_read_lock(); the result's refcount is unchanged.
823  */
824 
825 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
826 				    unsigned short mask)
827 {
828 	struct net_device *dev, *ret;
829 
830 	ret = NULL;
831 	for_each_netdev_rcu(net, dev) {
832 		if (((dev->flags ^ if_flags) & mask) == 0) {
833 			ret = dev;
834 			break;
835 		}
836 	}
837 	return ret;
838 }
839 EXPORT_SYMBOL(dev_get_by_flags_rcu);
840 
841 /**
842  *	dev_valid_name - check if name is okay for network device
843  *	@name: name string
844  *
845  *	Network device names need to be valid file names
846  *	to allow sysfs to work.  We also disallow any kind of
847  *	whitespace.
848  */
849 bool dev_valid_name(const char *name)
850 {
851 	if (*name == '\0')
852 		return false;
853 	if (strlen(name) >= IFNAMSIZ)
854 		return false;
855 	if (!strcmp(name, ".") || !strcmp(name, ".."))
856 		return false;
857 
858 	while (*name) {
859 		if (*name == '/' || isspace(*name))
860 			return false;
861 		name++;
862 	}
863 	return true;
864 }
865 EXPORT_SYMBOL(dev_valid_name);
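/*
 * Examples (illustrative, not part of the original file):
 *
 *	dev_valid_name("eth0")   -> true
 *	dev_valid_name("eth%d")  -> true  (format strings are resolved later)
 *	dev_valid_name("")       -> false (empty)
 *	dev_valid_name("a/b")    -> false ('/' would break sysfs paths)
 *	dev_valid_name("my eth") -> false (whitespace)
 *	dev_valid_name("..")     -> false (reserved directory name)
 */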
866 
867 /**
868  *	__dev_alloc_name - allocate a name for a device
869  *	@net: network namespace to allocate the device name in
870  *	@name: name format string
871  *	@buf:  scratch buffer and result name string
872  *
873  *	Passed a format string - eg "lt%d" - it will try to find a suitable
874  *	id. It scans the list of devices to build up a free map, then chooses
875  *	the first empty slot. The caller must hold the dev_base or rtnl lock
876  *	while allocating the name and adding the device in order to avoid
877  *	duplicates.
878  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
879  *	Returns the number of the unit assigned or a negative errno code.
880  */
881 
882 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
883 {
884 	int i = 0;
885 	const char *p;
886 	const int max_netdevices = 8*PAGE_SIZE;
887 	unsigned long *inuse;
888 	struct net_device *d;
889 
890 	p = strnchr(name, IFNAMSIZ-1, '%');
891 	if (p) {
892 		/*
893 		 * Verify the string as this thing may have come from
894 		 * the user.  There must be either one "%d" and no other "%"
895 		 * characters.
896 		 */
897 		if (p[1] != 'd' || strchr(p + 2, '%'))
898 			return -EINVAL;
899 
900 		/* Use one page as a bit array of possible slots */
901 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
902 		if (!inuse)
903 			return -ENOMEM;
904 
905 		for_each_netdev(net, d) {
906 			if (!sscanf(d->name, name, &i))
907 				continue;
908 			if (i < 0 || i >= max_netdevices)
909 				continue;
910 
911 			/*  avoid cases where sscanf is not exact inverse of printf */
912 			snprintf(buf, IFNAMSIZ, name, i);
913 			if (!strncmp(buf, d->name, IFNAMSIZ))
914 				set_bit(i, inuse);
915 		}
916 
917 		i = find_first_zero_bit(inuse, max_netdevices);
918 		free_page((unsigned long) inuse);
919 	}
920 
921 	if (buf != name)
922 		snprintf(buf, IFNAMSIZ, name, i);
923 	if (!__dev_get_by_name(net, buf))
924 		return i;
925 
926 	/* It is possible to run out of possible slots
927 	 * when the name is long and there isn't enough space left
928 	 * for the digits, or if all bits are used.
929 	 */
930 	return -ENFILE;
931 }
932 
933 /**
934  *	dev_alloc_name - allocate a name for a device
935  *	@dev: device
936  *	@name: name format string
937  *
938  *	Passed a format string - eg "lt%d" - it will try to find a suitable
939  *	id. It scans the list of devices to build up a free map, then chooses
940  *	the first empty slot. The caller must hold the dev_base or rtnl lock
941  *	while allocating the name and adding the device in order to avoid
942  *	duplicates.
943  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
944  *	Returns the number of the unit assigned or a negative errno code.
945  */
946 
947 int dev_alloc_name(struct net_device *dev, const char *name)
948 {
949 	char buf[IFNAMSIZ];
950 	struct net *net;
951 	int ret;
952 
953 	BUG_ON(!dev_net(dev));
954 	net = dev_net(dev);
955 	ret = __dev_alloc_name(net, name, buf);
956 	if (ret >= 0)
957 		strlcpy(dev->name, buf, IFNAMSIZ);
958 	return ret;
959 }
960 EXPORT_SYMBOL(dev_alloc_name);
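/*
 * Illustrative sketch (not part of the original file): a driver that wants
 * an automatically numbered name before registration might do:
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *	if (err < 0)
 *		goto fail;	// hypothetical error label
 *	// dev->name now holds e.g. "eth0" and err is the assigned unit number
 */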
961 
962 static int dev_get_valid_name(struct net_device *dev, const char *name)
963 {
964 	struct net *net;
965 
966 	BUG_ON(!dev_net(dev));
967 	net = dev_net(dev);
968 
969 	if (!dev_valid_name(name))
970 		return -EINVAL;
971 
972 	if (strchr(name, '%'))
973 		return dev_alloc_name(dev, name);
974 	else if (__dev_get_by_name(net, name))
975 		return -EEXIST;
976 	else if (dev->name != name)
977 		strlcpy(dev->name, name, IFNAMSIZ);
978 
979 	return 0;
980 }
981 
982 /**
983  *	dev_change_name - change name of a device
984  *	@dev: device
985  *	@newname: name (or format string) must be at least IFNAMSIZ
986  *
987  *	Change the name of a device; a format string such as "eth%d"
988  *	can be passed for wildcarding.
989  */
990 int dev_change_name(struct net_device *dev, const char *newname)
991 {
992 	char oldname[IFNAMSIZ];
993 	int err = 0;
994 	int ret;
995 	struct net *net;
996 
997 	ASSERT_RTNL();
998 	BUG_ON(!dev_net(dev));
999 
1000 	net = dev_net(dev);
1001 	if (dev->flags & IFF_UP)
1002 		return -EBUSY;
1003 
1004 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1005 		return 0;
1006 
1007 	memcpy(oldname, dev->name, IFNAMSIZ);
1008 
1009 	err = dev_get_valid_name(dev, newname);
1010 	if (err < 0)
1011 		return err;
1012 
1013 rollback:
1014 	ret = device_rename(&dev->dev, dev->name);
1015 	if (ret) {
1016 		memcpy(dev->name, oldname, IFNAMSIZ);
1017 		return ret;
1018 	}
1019 
1020 	write_lock_bh(&dev_base_lock);
1021 	hlist_del_rcu(&dev->name_hlist);
1022 	write_unlock_bh(&dev_base_lock);
1023 
1024 	synchronize_rcu();
1025 
1026 	write_lock_bh(&dev_base_lock);
1027 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1028 	write_unlock_bh(&dev_base_lock);
1029 
1030 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1031 	ret = notifier_to_errno(ret);
1032 
1033 	if (ret) {
1034 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1035 		if (err >= 0) {
1036 			err = ret;
1037 			memcpy(dev->name, oldname, IFNAMSIZ);
1038 			goto rollback;
1039 		} else {
1040 			pr_err("%s: name change rollback failed: %d\n",
1041 			       dev->name, ret);
1042 		}
1043 	}
1044 
1045 	return err;
1046 }
1047 
1048 /**
1049  *	dev_set_alias - change ifalias of a device
1050  *	@dev: device
1051  *	@alias: name up to IFALIASZ
1052  *	@len: limit of bytes to copy from @alias
1053  *
1054  *	Set ifalias for a device.
1055  */
1056 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1057 {
1058 	ASSERT_RTNL();
1059 
1060 	if (len >= IFALIASZ)
1061 		return -EINVAL;
1062 
1063 	if (!len) {
1064 		if (dev->ifalias) {
1065 			kfree(dev->ifalias);
1066 			dev->ifalias = NULL;
1067 		}
1068 		return 0;
1069 	}
1070 
1071 	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1072 	if (!dev->ifalias)
1073 		return -ENOMEM;
1074 
1075 	strlcpy(dev->ifalias, alias, len+1);
1076 	return len;
1077 }
1078 
1079 
1080 /**
1081  *	netdev_features_change - device changes features
1082  *	@dev: device to cause notification
1083  *
1084  *	Called to indicate a device has changed features.
1085  */
1086 void netdev_features_change(struct net_device *dev)
1087 {
1088 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1089 }
1090 EXPORT_SYMBOL(netdev_features_change);
1091 
1092 /**
1093  *	netdev_state_change - device changes state
1094  *	@dev: device to cause notification
1095  *
1096  *	Called to indicate a device has changed state. This function calls
1097  *	the notifier chains for netdev_chain and sends a NEWLINK message
1098  *	to the routing socket.
1099  */
1100 void netdev_state_change(struct net_device *dev)
1101 {
1102 	if (dev->flags & IFF_UP) {
1103 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1104 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1105 	}
1106 }
1107 EXPORT_SYMBOL(netdev_state_change);
1108 
1109 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1110 {
1111 	return call_netdevice_notifiers(event, dev);
1112 }
1113 EXPORT_SYMBOL(netdev_bonding_change);
1114 
1115 /**
1116  *	dev_load 	- load a network module
1117  *	@net: the applicable net namespace
1118  *	@name: name of interface
1119  *
1120  *	If a network interface is not present and the process has suitable
1121  *	privileges, this function loads the module. If module loading is not
1122  *	available in this kernel then it becomes a nop.
1123  */
1124 
1125 void dev_load(struct net *net, const char *name)
1126 {
1127 	struct net_device *dev;
1128 	int no_module;
1129 
1130 	rcu_read_lock();
1131 	dev = dev_get_by_name_rcu(net, name);
1132 	rcu_read_unlock();
1133 
1134 	no_module = !dev;
1135 	if (no_module && capable(CAP_NET_ADMIN))
1136 		no_module = request_module("netdev-%s", name);
1137 	if (no_module && capable(CAP_SYS_MODULE)) {
1138 		if (!request_module("%s", name))
1139 			pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1140 				name);
1141 	}
1142 }
1143 EXPORT_SYMBOL(dev_load);
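/*
 * Illustrative note (not part of the original file): for dev_load() to find
 * a driver via the preferred CAP_NET_ADMIN path, the module should declare
 * a "netdev-" alias, e.g.:
 *
 *	MODULE_ALIAS("netdev-mydev0");
 *
 * or the administrator can provide one in modprobe configuration:
 *
 *	alias netdev-mydev0 mydriver
 *
 * The interface name "mydev0" and module "mydriver" are hypothetical.
 */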
1144 
1145 static int __dev_open(struct net_device *dev)
1146 {
1147 	const struct net_device_ops *ops = dev->netdev_ops;
1148 	int ret;
1149 
1150 	ASSERT_RTNL();
1151 
1152 	if (!netif_device_present(dev))
1153 		return -ENODEV;
1154 
1155 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1156 	ret = notifier_to_errno(ret);
1157 	if (ret)
1158 		return ret;
1159 
1160 	set_bit(__LINK_STATE_START, &dev->state);
1161 
1162 	if (ops->ndo_validate_addr)
1163 		ret = ops->ndo_validate_addr(dev);
1164 
1165 	if (!ret && ops->ndo_open)
1166 		ret = ops->ndo_open(dev);
1167 
1168 	if (ret)
1169 		clear_bit(__LINK_STATE_START, &dev->state);
1170 	else {
1171 		dev->flags |= IFF_UP;
1172 		net_dmaengine_get();
1173 		dev_set_rx_mode(dev);
1174 		dev_activate(dev);
1175 		add_device_randomness(dev->dev_addr, dev->addr_len);
1176 	}
1177 
1178 	return ret;
1179 }
1180 
1181 /**
1182  *	dev_open	- prepare an interface for use.
1183  *	@dev:	device to open
1184  *
1185  *	Takes a device from down to up state. The device's private open
1186  *	function is invoked and then the multicast lists are loaded. Finally
1187  *	the device is moved into the up state and a %NETDEV_UP message is
1188  *	sent to the netdev notifier chain.
1189  *
1190  *	Calling this function on an active interface is a nop. On a failure
1191  *	a negative errno code is returned.
1192  */
1193 int dev_open(struct net_device *dev)
1194 {
1195 	int ret;
1196 
1197 	if (dev->flags & IFF_UP)
1198 		return 0;
1199 
1200 	ret = __dev_open(dev);
1201 	if (ret < 0)
1202 		return ret;
1203 
1204 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1205 	call_netdevice_notifiers(NETDEV_UP, dev);
1206 
1207 	return ret;
1208 }
1209 EXPORT_SYMBOL(dev_open);
1210 
1211 static int __dev_close_many(struct list_head *head)
1212 {
1213 	struct net_device *dev;
1214 
1215 	ASSERT_RTNL();
1216 	might_sleep();
1217 
1218 	list_for_each_entry(dev, head, unreg_list) {
1219 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1220 
1221 		clear_bit(__LINK_STATE_START, &dev->state);
1222 
1223 		/* Synchronize to scheduled poll. We cannot touch the poll list; it
1224 		 * can even be on a different cpu. So just clear netif_running().
1225 		 *
1226 		 * dev->stop() will invoke napi_disable() on all of its
1227 		 * napi_struct instances on this device.
1228 		 */
1229 		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1230 	}
1231 
1232 	dev_deactivate_many(head);
1233 
1234 	list_for_each_entry(dev, head, unreg_list) {
1235 		const struct net_device_ops *ops = dev->netdev_ops;
1236 
1237 		/*
1238 		 *	Call the device specific close. This cannot fail.
1239 		 *	It is only called if the device is UP.
1240 		 *
1241 		 *	We allow it to be called even after a DETACH hot-plug
1242 		 *	event.
1243 		 */
1244 		if (ops->ndo_stop)
1245 			ops->ndo_stop(dev);
1246 
1247 		dev->flags &= ~IFF_UP;
1248 		net_dmaengine_put();
1249 	}
1250 
1251 	return 0;
1252 }
1253 
1254 static int __dev_close(struct net_device *dev)
1255 {
1256 	int retval;
1257 	LIST_HEAD(single);
1258 
1259 	list_add(&dev->unreg_list, &single);
1260 	retval = __dev_close_many(&single);
1261 	list_del(&single);
1262 	return retval;
1263 }
1264 
1265 static int dev_close_many(struct list_head *head)
1266 {
1267 	struct net_device *dev, *tmp;
1268 	LIST_HEAD(tmp_list);
1269 
1270 	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1271 		if (!(dev->flags & IFF_UP))
1272 			list_move(&dev->unreg_list, &tmp_list);
1273 
1274 	__dev_close_many(head);
1275 
1276 	list_for_each_entry(dev, head, unreg_list) {
1277 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1278 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1279 	}
1280 
1281 	/* rollback_registered_many needs the complete original list */
1282 	list_splice(&tmp_list, head);
1283 	return 0;
1284 }
1285 
1286 /**
1287  *	dev_close - shutdown an interface.
1288  *	@dev: device to shutdown
1289  *
1290  *	This function moves an active device into down state. A
1291  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1292  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1293  *	chain.
1294  */
1295 int dev_close(struct net_device *dev)
1296 {
1297 	if (dev->flags & IFF_UP) {
1298 		LIST_HEAD(single);
1299 
1300 		list_add(&dev->unreg_list, &single);
1301 		dev_close_many(&single);
1302 		list_del(&single);
1303 	}
1304 	return 0;
1305 }
1306 EXPORT_SYMBOL(dev_close);
1307 
1308 
1309 /**
1310  *	dev_disable_lro - disable Large Receive Offload on a device
1311  *	@dev: device
1312  *
1313  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1314  *	called under RTNL.  This is needed if received packets may be
1315  *	forwarded to another interface.
1316  */
1317 void dev_disable_lro(struct net_device *dev)
1318 {
1319 	/*
1320 	 * If we're trying to disable lro on a vlan device
1321 	 * use the underlying physical device instead
1322 	 */
1323 	if (is_vlan_dev(dev))
1324 		dev = vlan_dev_real_dev(dev);
1325 
1326 	dev->wanted_features &= ~NETIF_F_LRO;
1327 	netdev_update_features(dev);
1328 
1329 	if (unlikely(dev->features & NETIF_F_LRO))
1330 		netdev_WARN(dev, "failed to disable LRO!\n");
1331 }
1332 EXPORT_SYMBOL(dev_disable_lro);
1333 
1334 
1335 static int dev_boot_phase = 1;
1336 
1337 /**
1338  *	register_netdevice_notifier - register a network notifier block
1339  *	@nb: notifier
1340  *
1341  *	Register a notifier to be called when network device events occur.
1342  *	The notifier passed is linked into the kernel structures and must
1343  *	not be reused until it has been unregistered. A negative errno code
1344  *	is returned on a failure.
1345  *
1346  * 	When registered, all registration and up events are replayed
1347  *	to the new notifier to allow the caller to have a race-free
1348  *	view of the network device list.
1349  */
1350 
1351 int register_netdevice_notifier(struct notifier_block *nb)
1352 {
1353 	struct net_device *dev;
1354 	struct net_device *last;
1355 	struct net *net;
1356 	int err;
1357 
1358 	rtnl_lock();
1359 	err = raw_notifier_chain_register(&netdev_chain, nb);
1360 	if (err)
1361 		goto unlock;
1362 	if (dev_boot_phase)
1363 		goto unlock;
1364 	for_each_net(net) {
1365 		for_each_netdev(net, dev) {
1366 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1367 			err = notifier_to_errno(err);
1368 			if (err)
1369 				goto rollback;
1370 
1371 			if (!(dev->flags & IFF_UP))
1372 				continue;
1373 
1374 			nb->notifier_call(nb, NETDEV_UP, dev);
1375 		}
1376 	}
1377 
1378 unlock:
1379 	rtnl_unlock();
1380 	return err;
1381 
1382 rollback:
1383 	last = dev;
1384 	for_each_net(net) {
1385 		for_each_netdev(net, dev) {
1386 			if (dev == last)
1387 				goto outroll;
1388 
1389 			if (dev->flags & IFF_UP) {
1390 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1391 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1392 			}
1393 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1394 			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1395 		}
1396 	}
1397 
1398 outroll:
1399 	raw_notifier_chain_unregister(&netdev_chain, nb);
1400 	goto unlock;
1401 }
1402 EXPORT_SYMBOL(register_netdevice_notifier);
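/*
 * Illustrative sketch (not part of the original file): a subsystem that
 * wants to track device events registers a notifier_block like this.  The
 * names my_netdev_event and my_nb are hypothetical.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *			pr_info("%s is up\n", dev->name);
 *			break;
 *		case NETDEV_UNREGISTER:
 *			pr_info("%s is going away\n", dev->name);
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_nb);	// replays REGISTER/UP events
 *	...
 *	unregister_netdevice_notifier(&my_nb);
 */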
1403 
1404 /**
1405  *	unregister_netdevice_notifier - unregister a network notifier block
1406  *	@nb: notifier
1407  *
1408  *	Unregister a notifier previously registered by
1409  *	register_netdevice_notifier(). The notifier is unlinked from the
1410  *	kernel structures and may then be reused. A negative errno code
1411  *	is returned on a failure.
1412  *
1413  * 	After unregistering, unregister and down device events are synthesized
1414  *	for all devices on the device list and sent to the removed notifier,
1415  *	removing the need for special-case cleanup code.
1416  */
1417 
1418 int unregister_netdevice_notifier(struct notifier_block *nb)
1419 {
1420 	struct net_device *dev;
1421 	struct net *net;
1422 	int err;
1423 
1424 	rtnl_lock();
1425 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1426 	if (err)
1427 		goto unlock;
1428 
1429 	for_each_net(net) {
1430 		for_each_netdev(net, dev) {
1431 			if (dev->flags & IFF_UP) {
1432 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1433 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1434 			}
1435 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1436 			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1437 		}
1438 	}
1439 unlock:
1440 	rtnl_unlock();
1441 	return err;
1442 }
1443 EXPORT_SYMBOL(unregister_netdevice_notifier);
1444 
1445 /**
1446  *	call_netdevice_notifiers - call all network notifier blocks
1447  *      @val: value passed unmodified to notifier function
1448  *      @dev: net_device pointer passed unmodified to notifier function
1449  *
1450  *	Call all network notifier blocks.  Parameters and return value
1451  *	are as for raw_notifier_call_chain().
1452  */
1453 
1454 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1455 {
1456 	ASSERT_RTNL();
1457 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1458 }
1459 EXPORT_SYMBOL(call_netdevice_notifiers);
1460 
1461 static struct static_key netstamp_needed __read_mostly;
1462 #ifdef HAVE_JUMP_LABEL
1463 /* We are not allowed to call static_key_slow_dec() from irq context.
1464  * If net_disable_timestamp() is called from irq context, defer the
1465  * static_key_slow_dec() calls.
1466  */
1467 static atomic_t netstamp_needed_deferred;
1468 #endif
1469 
1470 void net_enable_timestamp(void)
1471 {
1472 #ifdef HAVE_JUMP_LABEL
1473 	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1474 
1475 	if (deferred) {
1476 		while (--deferred)
1477 			static_key_slow_dec(&netstamp_needed);
1478 		return;
1479 	}
1480 #endif
1481 	WARN_ON(in_interrupt());
1482 	static_key_slow_inc(&netstamp_needed);
1483 }
1484 EXPORT_SYMBOL(net_enable_timestamp);
1485 
1486 void net_disable_timestamp(void)
1487 {
1488 #ifdef HAVE_JUMP_LABEL
1489 	if (in_interrupt()) {
1490 		atomic_inc(&netstamp_needed_deferred);
1491 		return;
1492 	}
1493 #endif
1494 	static_key_slow_dec(&netstamp_needed);
1495 }
1496 EXPORT_SYMBOL(net_disable_timestamp);
1497 
1498 static inline void net_timestamp_set(struct sk_buff *skb)
1499 {
1500 	skb->tstamp.tv64 = 0;
1501 	if (static_key_false(&netstamp_needed))
1502 		__net_timestamp(skb);
1503 }
1504 
1505 #define net_timestamp_check(COND, SKB)			\
1506 	if (static_key_false(&netstamp_needed)) {		\
1507 		if ((COND) && !(SKB)->tstamp.tv64)	\
1508 			__net_timestamp(SKB);		\
1509 	}						\
1510 
1511 static int net_hwtstamp_validate(struct ifreq *ifr)
1512 {
1513 	struct hwtstamp_config cfg;
1514 	enum hwtstamp_tx_types tx_type;
1515 	enum hwtstamp_rx_filters rx_filter;
1516 	int tx_type_valid = 0;
1517 	int rx_filter_valid = 0;
1518 
1519 	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1520 		return -EFAULT;
1521 
1522 	if (cfg.flags) /* reserved for future extensions */
1523 		return -EINVAL;
1524 
1525 	tx_type = cfg.tx_type;
1526 	rx_filter = cfg.rx_filter;
1527 
1528 	switch (tx_type) {
1529 	case HWTSTAMP_TX_OFF:
1530 	case HWTSTAMP_TX_ON:
1531 	case HWTSTAMP_TX_ONESTEP_SYNC:
1532 		tx_type_valid = 1;
1533 		break;
1534 	}
1535 
1536 	switch (rx_filter) {
1537 	case HWTSTAMP_FILTER_NONE:
1538 	case HWTSTAMP_FILTER_ALL:
1539 	case HWTSTAMP_FILTER_SOME:
1540 	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1541 	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1542 	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1543 	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1544 	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1545 	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1546 	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1547 	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1548 	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1549 	case HWTSTAMP_FILTER_PTP_V2_EVENT:
1550 	case HWTSTAMP_FILTER_PTP_V2_SYNC:
1551 	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1552 		rx_filter_valid = 1;
1553 		break;
1554 	}
1555 
1556 	if (!tx_type_valid || !rx_filter_valid)
1557 		return -ERANGE;
1558 
1559 	return 0;
1560 }
1561 
1562 static inline bool is_skb_forwardable(struct net_device *dev,
1563 				      struct sk_buff *skb)
1564 {
1565 	unsigned int len;
1566 
1567 	if (!(dev->flags & IFF_UP))
1568 		return false;
1569 
1570 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1571 	if (skb->len <= len)
1572 		return true;
1573 
1574 	/* if TSO is enabled, we don't care about the length as the packet
1575 	 * could be forwarded without being segmented before
1576 	 */
1577 	if (skb_is_gso(skb))
1578 		return true;
1579 
1580 	return false;
1581 }
1582 
1583 /**
1584  * dev_forward_skb - loopback an skb to another netif
1585  *
1586  * @dev: destination network device
1587  * @skb: buffer to forward
1588  *
1589  * return values:
1590  *	NET_RX_SUCCESS	(no congestion)
1591  *	NET_RX_DROP     (packet was dropped, but freed)
1592  *
1593  * dev_forward_skb can be used for injecting an skb from the
1594  * start_xmit function of one device into the receive queue
1595  * of another device.
1596  *
1597  * The receiving device may be in another namespace, so
1598  * we have to clear all information in the skb that could
1599  * impact namespace isolation.
1600  */
1601 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1602 {
1603 	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1604 		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1605 			atomic_long_inc(&dev->rx_dropped);
1606 			kfree_skb(skb);
1607 			return NET_RX_DROP;
1608 		}
1609 	}
1610 
1611 	skb_orphan(skb);
1612 	nf_reset(skb);
1613 
1614 	if (unlikely(!is_skb_forwardable(dev, skb))) {
1615 		atomic_long_inc(&dev->rx_dropped);
1616 		kfree_skb(skb);
1617 		return NET_RX_DROP;
1618 	}
1619 	skb->skb_iif = 0;
1620 	skb->dev = dev;
1621 	skb_dst_drop(skb);
1622 	skb->tstamp.tv64 = 0;
1623 	skb->pkt_type = PACKET_HOST;
1624 	skb->protocol = eth_type_trans(skb, dev);
1625 	skb->mark = 0;
1626 	secpath_reset(skb);
1627 	nf_reset(skb);
1628 	return netif_rx(skb);
1629 }
1630 EXPORT_SYMBOL_GPL(dev_forward_skb);
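/*
 * Illustrative sketch (not part of the original file): a pair device such
 * as veth conceptually hands a packet to its peer from ndo_start_xmit this
 * way.  The names my_xmit and my_get_peer are hypothetical.
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = my_get_peer(dev);
 *
 *		if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
 *			dev->stats.tx_dropped++;	// skb already freed on drop
 *		return NETDEV_TX_OK;
 *	}
 */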
1631 
1632 static inline int deliver_skb(struct sk_buff *skb,
1633 			      struct packet_type *pt_prev,
1634 			      struct net_device *orig_dev)
1635 {
1636 	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1637 		return -ENOMEM;
1638 	atomic_inc(&skb->users);
1639 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1640 }
1641 
1642 /*
1643  *	Support routine. Sends outgoing frames to any network
1644  *	taps currently in use.
1645  */
1646 
1647 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1648 {
1649 	struct packet_type *ptype;
1650 	struct sk_buff *skb2 = NULL;
1651 	struct packet_type *pt_prev = NULL;
1652 
1653 	rcu_read_lock();
1654 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1655 		/* Never send packets back to the socket
1656 		 * they originated from - MvS (miquels@drinkel.ow.org)
1657 		 */
1658 		if ((ptype->dev == dev || !ptype->dev) &&
1659 		    (ptype->af_packet_priv == NULL ||
1660 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1661 			if (pt_prev) {
1662 				deliver_skb(skb2, pt_prev, skb->dev);
1663 				pt_prev = ptype;
1664 				continue;
1665 			}
1666 
1667 			skb2 = skb_clone(skb, GFP_ATOMIC);
1668 			if (!skb2)
1669 				break;
1670 
1671 			net_timestamp_set(skb2);
1672 
1673 			/* skb->nh should be correctly
1674 			   set by the sender, so the check below is
1675 			   just protection against buggy protocols.
1676 			 */
1677 			skb_reset_mac_header(skb2);
1678 
1679 			if (skb_network_header(skb2) < skb2->data ||
1680 			    skb2->network_header > skb2->tail) {
1681 				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1682 						     ntohs(skb2->protocol),
1683 						     dev->name);
1684 				skb_reset_network_header(skb2);
1685 			}
1686 
1687 			skb2->transport_header = skb2->network_header;
1688 			skb2->pkt_type = PACKET_OUTGOING;
1689 			pt_prev = ptype;
1690 		}
1691 	}
1692 	if (pt_prev)
1693 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1694 	rcu_read_unlock();
1695 }
1696 
1697 /**
1698  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1699  * @dev: Network device
1700  * @txq: number of queues available
1701  *
1702  * If real_num_tx_queues is changed the tc mappings may no longer be
1703  * valid. To resolve this verify the tc mapping remains valid and if
1704  * not NULL the mapping. With no priorities mapping to this
1705  * offset/count pair it will no longer be used. In the worst case TC0
1706  * is invalid nothing can be done so disable priority mappings. If is
1707  * expected that drivers will fix this mapping if they can before
1708  * calling netif_set_real_num_tx_queues.
1709  */
1710 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1711 {
1712 	int i;
1713 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1714 
1715 	/* If TC0 is invalidated disable TC mapping */
1716 	if (tc->offset + tc->count > txq) {
1717 		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1718 		dev->num_tc = 0;
1719 		return;
1720 	}
1721 
1722 	/* Invalidated prio to tc mappings set to TC0 */
1723 	for (i = 1; i < TC_BITMASK + 1; i++) {
1724 		int q = netdev_get_prio_tc_map(dev, i);
1725 
1726 		tc = &dev->tc_to_txq[q];
1727 		if (tc->offset + tc->count > txq) {
1728 			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1729 				i, q);
1730 			netdev_set_prio_tc_map(dev, i, 0);
1731 		}
1732 	}
1733 }
1734 
1735 /*
1736  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1737  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1738  */
1739 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1740 {
1741 	int rc;
1742 
1743 	if (txq < 1 || txq > dev->num_tx_queues)
1744 		return -EINVAL;
1745 
1746 	if (dev->reg_state == NETREG_REGISTERED ||
1747 	    dev->reg_state == NETREG_UNREGISTERING) {
1748 		ASSERT_RTNL();
1749 
1750 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1751 						  txq);
1752 		if (rc)
1753 			return rc;
1754 
1755 		if (dev->num_tc)
1756 			netif_setup_tc(dev, txq);
1757 
1758 		if (txq < dev->real_num_tx_queues)
1759 			qdisc_reset_all_tx_gt(dev, txq);
1760 	}
1761 
1762 	dev->real_num_tx_queues = txq;
1763 	return 0;
1764 }
1765 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
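/*
 * Illustrative sketch (not part of the original file): a multiqueue driver
 * that has negotiated a smaller number of hardware queues at runtime would
 * shrink its visible TX queue count under RTNL, e.g.:
 *
 *	rtnl_lock();
 *	err = netif_set_real_num_tx_queues(dev, hw_queues);
 *	rtnl_unlock();
 *
 * where err and hw_queues are hypothetical and hw_queues lies between 1
 * and dev->num_tx_queues.
 */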
1766 
1767 #ifdef CONFIG_RPS
1768 /**
1769  *	netif_set_real_num_rx_queues - set actual number of RX queues used
1770  *	@dev: Network device
1771  *	@rxq: Actual number of RX queues
1772  *
1773  *	This must be called either with the rtnl_lock held or before
1774  *	registration of the net device.  Returns 0 on success, or a
1775  *	negative error code.  If called before registration, it always
1776  *	succeeds.
1777  */
1778 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1779 {
1780 	int rc;
1781 
1782 	if (rxq < 1 || rxq > dev->num_rx_queues)
1783 		return -EINVAL;
1784 
1785 	if (dev->reg_state == NETREG_REGISTERED) {
1786 		ASSERT_RTNL();
1787 
1788 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1789 						  rxq);
1790 		if (rc)
1791 			return rc;
1792 	}
1793 
1794 	dev->real_num_rx_queues = rxq;
1795 	return 0;
1796 }
1797 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1798 #endif
1799 
1800 /**
1801  * netif_get_num_default_rss_queues - default number of RSS queues
1802  *
1803  * This routine should set an upper limit on the number of RSS queues
1804  * used by default by multiqueue devices.
1805  */
1806 int netif_get_num_default_rss_queues(void)
1807 {
1808 	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
1809 }
1810 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
1811 
1812 static inline void __netif_reschedule(struct Qdisc *q)
1813 {
1814 	struct softnet_data *sd;
1815 	unsigned long flags;
1816 
1817 	local_irq_save(flags);
1818 	sd = &__get_cpu_var(softnet_data);
1819 	q->next_sched = NULL;
1820 	*sd->output_queue_tailp = q;
1821 	sd->output_queue_tailp = &q->next_sched;
1822 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1823 	local_irq_restore(flags);
1824 }
1825 
1826 void __netif_schedule(struct Qdisc *q)
1827 {
1828 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1829 		__netif_reschedule(q);
1830 }
1831 EXPORT_SYMBOL(__netif_schedule);
1832 
1833 void dev_kfree_skb_irq(struct sk_buff *skb)
1834 {
1835 	if (atomic_dec_and_test(&skb->users)) {
1836 		struct softnet_data *sd;
1837 		unsigned long flags;
1838 
1839 		local_irq_save(flags);
1840 		sd = &__get_cpu_var(softnet_data);
1841 		skb->next = sd->completion_queue;
1842 		sd->completion_queue = skb;
1843 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1844 		local_irq_restore(flags);
1845 	}
1846 }
1847 EXPORT_SYMBOL(dev_kfree_skb_irq);
1848 
1849 void dev_kfree_skb_any(struct sk_buff *skb)
1850 {
1851 	if (in_irq() || irqs_disabled())
1852 		dev_kfree_skb_irq(skb);
1853 	else
1854 		dev_kfree_skb(skb);
1855 }
1856 EXPORT_SYMBOL(dev_kfree_skb_any);
1857 
1858 
1859 /**
1860  * netif_device_detach - mark device as removed
1861  * @dev: network device
1862  *
1863  * Mark device as removed from the system and therefore no longer available.
1864  */
1865 void netif_device_detach(struct net_device *dev)
1866 {
1867 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1868 	    netif_running(dev)) {
1869 		netif_tx_stop_all_queues(dev);
1870 	}
1871 }
1872 EXPORT_SYMBOL(netif_device_detach);
1873 
1874 /**
1875  * netif_device_attach - mark device as attached
1876  * @dev: network device
1877  *
1878  * Mark device as attached to the system and restart it if needed.
1879  */
1880 void netif_device_attach(struct net_device *dev)
1881 {
1882 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1883 	    netif_running(dev)) {
1884 		netif_tx_wake_all_queues(dev);
1885 		__netdev_watchdog_up(dev);
1886 	}
1887 }
1888 EXPORT_SYMBOL(netif_device_attach);
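
/*
 * Illustrative sketch: the usual suspend/resume pairing of
 * netif_device_detach() and netif_device_attach() in a hypothetical driver.
 * example_suspend() and example_resume() are invented names; the hardware
 * handling is only sketched in comments.
 */
static void __maybe_unused example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* stops all TX queues if running */
	/* Hypothetical: quiesce and power down the hardware here. */
}

static void __maybe_unused example_resume(struct net_device *dev)
{
	/* Hypothetical: re-initialize the hardware here. */
	netif_device_attach(dev);	/* wakes queues and the watchdog */
}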
1889 
1890 static void skb_warn_bad_offload(const struct sk_buff *skb)
1891 {
1892 	static const netdev_features_t null_features = 0;
1893 	struct net_device *dev = skb->dev;
1894 	const char *driver = "";
1895 
1896 	if (dev && dev->dev.parent)
1897 		driver = dev_driver_string(dev->dev.parent);
1898 
1899 	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
1900 	     "gso_type=%d ip_summed=%d\n",
1901 	     driver, dev ? &dev->features : &null_features,
1902 	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
1903 	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
1904 	     skb_shinfo(skb)->gso_type, skb->ip_summed);
1905 }
1906 
1907 /*
1908  * Invalidate hardware checksum when packet is to be mangled, and
1909  * complete checksum manually on outgoing path.
1910  */
1911 int skb_checksum_help(struct sk_buff *skb)
1912 {
1913 	__wsum csum;
1914 	int ret = 0, offset;
1915 
1916 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1917 		goto out_set_summed;
1918 
1919 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1920 		skb_warn_bad_offload(skb);
1921 		return -EINVAL;
1922 	}
1923 
1924 	offset = skb_checksum_start_offset(skb);
1925 	BUG_ON(offset >= skb_headlen(skb));
1926 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1927 
1928 	offset += skb->csum_offset;
1929 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1930 
1931 	if (skb_cloned(skb) &&
1932 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1933 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1934 		if (ret)
1935 			goto out;
1936 	}
1937 
1938 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1939 out_set_summed:
1940 	skb->ip_summed = CHECKSUM_NONE;
1941 out:
1942 	return ret;
1943 }
1944 EXPORT_SYMBOL(skb_checksum_help);
1945 
1946 /**
1947  *	skb_gso_segment - Perform segmentation on skb.
1948  *	@skb: buffer to segment
1949  *	@features: features for the output path (see dev->features)
1950  *
1951  *	This function segments the given skb and returns a list of segments.
1952  *
1953  *	It may return NULL if the skb requires no segmentation.  This is
1954  *	only possible when GSO is used for verifying header integrity.
1955  */
1956 struct sk_buff *skb_gso_segment(struct sk_buff *skb,
1957 	netdev_features_t features)
1958 {
1959 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1960 	struct packet_type *ptype;
1961 	__be16 type = skb->protocol;
1962 	int vlan_depth = ETH_HLEN;
1963 	int err;
1964 
1965 	while (type == htons(ETH_P_8021Q)) {
1966 		struct vlan_hdr *vh;
1967 
1968 		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1969 			return ERR_PTR(-EINVAL);
1970 
1971 		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1972 		type = vh->h_vlan_encapsulated_proto;
1973 		vlan_depth += VLAN_HLEN;
1974 	}
1975 
1976 	skb_reset_mac_header(skb);
1977 	skb->mac_len = skb->network_header - skb->mac_header;
1978 	__skb_pull(skb, skb->mac_len);
1979 
1980 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1981 		skb_warn_bad_offload(skb);
1982 
1983 		if (skb_header_cloned(skb) &&
1984 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1985 			return ERR_PTR(err);
1986 	}
1987 
1988 	rcu_read_lock();
1989 	list_for_each_entry_rcu(ptype,
1990 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1991 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1992 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1993 				err = ptype->gso_send_check(skb);
1994 				segs = ERR_PTR(err);
1995 				if (err || skb_gso_ok(skb, features))
1996 					break;
1997 				__skb_push(skb, (skb->data -
1998 						 skb_network_header(skb)));
1999 			}
2000 			segs = ptype->gso_segment(skb, features);
2001 			break;
2002 		}
2003 	}
2004 	rcu_read_unlock();
2005 
2006 	__skb_push(skb, skb->data - skb_mac_header(skb));
2007 
2008 	return segs;
2009 }
2010 EXPORT_SYMBOL(skb_gso_segment);
2011 
2012 /* Take action when hardware reception checksum errors are detected. */
2013 #ifdef CONFIG_BUG
2014 void netdev_rx_csum_fault(struct net_device *dev)
2015 {
2016 	if (net_ratelimit()) {
2017 		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2018 		dump_stack();
2019 	}
2020 }
2021 EXPORT_SYMBOL(netdev_rx_csum_fault);
2022 #endif
2023 
2024 /* Actually, we should eliminate this check as soon as we know that:
2025  * 1. IOMMU is present and can map all the memory.
2026  * 2. No high memory really exists on this machine.
2027  */
2028 
2029 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2030 {
2031 #ifdef CONFIG_HIGHMEM
2032 	int i;
2033 	if (!(dev->features & NETIF_F_HIGHDMA)) {
2034 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2035 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2036 			if (PageHighMem(skb_frag_page(frag)))
2037 				return 1;
2038 		}
2039 	}
2040 
2041 	if (PCI_DMA_BUS_IS_PHYS) {
2042 		struct device *pdev = dev->dev.parent;
2043 
2044 		if (!pdev)
2045 			return 0;
2046 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2047 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2048 			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2049 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2050 				return 1;
2051 		}
2052 	}
2053 #endif
2054 	return 0;
2055 }
2056 
2057 struct dev_gso_cb {
2058 	void (*destructor)(struct sk_buff *skb);
2059 };
2060 
2061 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2062 
2063 static void dev_gso_skb_destructor(struct sk_buff *skb)
2064 {
2065 	struct dev_gso_cb *cb;
2066 
2067 	do {
2068 		struct sk_buff *nskb = skb->next;
2069 
2070 		skb->next = nskb->next;
2071 		nskb->next = NULL;
2072 		kfree_skb(nskb);
2073 	} while (skb->next);
2074 
2075 	cb = DEV_GSO_CB(skb);
2076 	if (cb->destructor)
2077 		cb->destructor(skb);
2078 }
2079 
2080 /**
2081  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2082  *	@skb: buffer to segment
2083  *	@features: device features as applicable to this skb
2084  *
2085  *	This function segments the given skb and stores the list of segments
2086  *	in skb->next.
2087  */
2088 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2089 {
2090 	struct sk_buff *segs;
2091 
2092 	segs = skb_gso_segment(skb, features);
2093 
2094 	/* Verifying header integrity only. */
2095 	if (!segs)
2096 		return 0;
2097 
2098 	if (IS_ERR(segs))
2099 		return PTR_ERR(segs);
2100 
2101 	skb->next = segs;
2102 	DEV_GSO_CB(skb)->destructor = skb->destructor;
2103 	skb->destructor = dev_gso_skb_destructor;
2104 
2105 	return 0;
2106 }
2107 
2108 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2109 {
2110 	return ((features & NETIF_F_GEN_CSUM) ||
2111 		((features & NETIF_F_V4_CSUM) &&
2112 		 protocol == htons(ETH_P_IP)) ||
2113 		((features & NETIF_F_V6_CSUM) &&
2114 		 protocol == htons(ETH_P_IPV6)) ||
2115 		((features & NETIF_F_FCOE_CRC) &&
2116 		 protocol == htons(ETH_P_FCOE)));
2117 }
2118 
2119 static netdev_features_t harmonize_features(struct sk_buff *skb,
2120 	__be16 protocol, netdev_features_t features)
2121 {
2122 	if (!can_checksum_protocol(features, protocol)) {
2123 		features &= ~NETIF_F_ALL_CSUM;
2124 		features &= ~NETIF_F_SG;
2125 	} else if (illegal_highdma(skb->dev, skb)) {
2126 		features &= ~NETIF_F_SG;
2127 	}
2128 
2129 	return features;
2130 }
2131 
2132 netdev_features_t netif_skb_features(struct sk_buff *skb)
2133 {
2134 	__be16 protocol = skb->protocol;
2135 	netdev_features_t features = skb->dev->features;
2136 
2137 	if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2138 		features &= ~NETIF_F_GSO_MASK;
2139 
2140 	if (protocol == htons(ETH_P_8021Q)) {
2141 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2142 		protocol = veh->h_vlan_encapsulated_proto;
2143 	} else if (!vlan_tx_tag_present(skb)) {
2144 		return harmonize_features(skb, protocol, features);
2145 	}
2146 
2147 	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2148 
2149 	if (protocol != htons(ETH_P_8021Q)) {
2150 		return harmonize_features(skb, protocol, features);
2151 	} else {
2152 		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2153 				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2154 		return harmonize_features(skb, protocol, features);
2155 	}
2156 }
2157 EXPORT_SYMBOL(netif_skb_features);
2158 
2159 /*
2160  * Returns true if either:
2161  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2162  *	2. skb is fragmented and the device does not support SG, or if
2163  *	   at least one of the fragments is in highmem and the device does not
2164  *	   support DMA from it.
2165  */
2166 static inline int skb_needs_linearize(struct sk_buff *skb,
2167 				      int features)
2168 {
2169 	return skb_is_nonlinear(skb) &&
2170 			((skb_has_frag_list(skb) &&
2171 				!(features & NETIF_F_FRAGLIST)) ||
2172 			(skb_shinfo(skb)->nr_frags &&
2173 				!(features & NETIF_F_SG)));
2174 }
2175 
2176 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2177 			struct netdev_queue *txq)
2178 {
2179 	const struct net_device_ops *ops = dev->netdev_ops;
2180 	int rc = NETDEV_TX_OK;
2181 	unsigned int skb_len;
2182 
2183 	if (likely(!skb->next)) {
2184 		netdev_features_t features;
2185 
2186 		/*
2187 		 * If device doesn't need skb->dst, release it right now while
2188 		 * it's hot in this cpu cache
2189 		 */
2190 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2191 			skb_dst_drop(skb);
2192 
2193 		if (!list_empty(&ptype_all))
2194 			dev_queue_xmit_nit(skb, dev);
2195 
2196 		features = netif_skb_features(skb);
2197 
2198 		if (vlan_tx_tag_present(skb) &&
2199 		    !(features & NETIF_F_HW_VLAN_TX)) {
2200 			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2201 			if (unlikely(!skb))
2202 				goto out;
2203 
2204 			skb->vlan_tci = 0;
2205 		}
2206 
2207 		if (netif_needs_gso(skb, features)) {
2208 			if (unlikely(dev_gso_segment(skb, features)))
2209 				goto out_kfree_skb;
2210 			if (skb->next)
2211 				goto gso;
2212 		} else {
2213 			if (skb_needs_linearize(skb, features) &&
2214 			    __skb_linearize(skb))
2215 				goto out_kfree_skb;
2216 
2217 			/* If packet is not checksummed and device does not
2218 			 * support checksumming for this protocol, complete
2219 			 * checksumming here.
2220 			 */
2221 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2222 				skb_set_transport_header(skb,
2223 					skb_checksum_start_offset(skb));
2224 				if (!(features & NETIF_F_ALL_CSUM) &&
2225 				     skb_checksum_help(skb))
2226 					goto out_kfree_skb;
2227 			}
2228 		}
2229 
2230 		skb_len = skb->len;
2231 		rc = ops->ndo_start_xmit(skb, dev);
2232 		trace_net_dev_xmit(skb, rc, dev, skb_len);
2233 		if (rc == NETDEV_TX_OK)
2234 			txq_trans_update(txq);
2235 		return rc;
2236 	}
2237 
2238 gso:
2239 	do {
2240 		struct sk_buff *nskb = skb->next;
2241 
2242 		skb->next = nskb->next;
2243 		nskb->next = NULL;
2244 
2245 		/*
2246 		 * If device doesn't need nskb->dst, release it right now while
2247 		 * it's hot in this cpu cache
2248 		 */
2249 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2250 			skb_dst_drop(nskb);
2251 
2252 		skb_len = nskb->len;
2253 		rc = ops->ndo_start_xmit(nskb, dev);
2254 		trace_net_dev_xmit(nskb, rc, dev, skb_len);
2255 		if (unlikely(rc != NETDEV_TX_OK)) {
2256 			if (rc & ~NETDEV_TX_MASK)
2257 				goto out_kfree_gso_skb;
2258 			nskb->next = skb->next;
2259 			skb->next = nskb;
2260 			return rc;
2261 		}
2262 		txq_trans_update(txq);
2263 		if (unlikely(netif_xmit_stopped(txq) && skb->next))
2264 			return NETDEV_TX_BUSY;
2265 	} while (skb->next);
2266 
2267 out_kfree_gso_skb:
2268 	if (likely(skb->next == NULL))
2269 		skb->destructor = DEV_GSO_CB(skb)->destructor;
2270 out_kfree_skb:
2271 	kfree_skb(skb);
2272 out:
2273 	return rc;
2274 }
2275 
2276 static u32 hashrnd __read_mostly;
2277 
2278 /*
2279  * Returns a Tx hash based on the given packet descriptor and the number
2280  * of Tx queues to be used as a distribution range.
2281  */
2282 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2283 		  unsigned int num_tx_queues)
2284 {
2285 	u32 hash;
2286 	u16 qoffset = 0;
2287 	u16 qcount = num_tx_queues;
2288 
2289 	if (skb_rx_queue_recorded(skb)) {
2290 		hash = skb_get_rx_queue(skb);
2291 		while (unlikely(hash >= num_tx_queues))
2292 			hash -= num_tx_queues;
2293 		return hash;
2294 	}
2295 
2296 	if (dev->num_tc) {
2297 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2298 		qoffset = dev->tc_to_txq[tc].offset;
2299 		qcount = dev->tc_to_txq[tc].count;
2300 	}
2301 
2302 	if (skb->sk && skb->sk->sk_hash)
2303 		hash = skb->sk->sk_hash;
2304 	else
2305 		hash = (__force u16) skb->protocol;
2306 	hash = jhash_1word(hash, hashrnd);
2307 
2308 	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2309 }
2310 EXPORT_SYMBOL(__skb_tx_hash);
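
/*
 * Illustrative sketch: a hypothetical ndo_select_queue() implementation that
 * simply falls back to the stack's hash-based distribution described above.
 * example_select_queue() is an invented name; skb_tx_hash() is the wrapper
 * that calls __skb_tx_hash() with dev->real_num_tx_queues as the range.
 */
static u16 __maybe_unused example_select_queue(struct net_device *dev,
					       struct sk_buff *skb)
{
	return skb_tx_hash(dev, skb);
}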
2311 
2312 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2313 {
2314 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2315 		net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
2316 				     dev->name, queue_index,
2317 				     dev->real_num_tx_queues);
2318 		return 0;
2319 	}
2320 	return queue_index;
2321 }
2322 
2323 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2324 {
2325 #ifdef CONFIG_XPS
2326 	struct xps_dev_maps *dev_maps;
2327 	struct xps_map *map;
2328 	int queue_index = -1;
2329 
2330 	rcu_read_lock();
2331 	dev_maps = rcu_dereference(dev->xps_maps);
2332 	if (dev_maps) {
2333 		map = rcu_dereference(
2334 		    dev_maps->cpu_map[raw_smp_processor_id()]);
2335 		if (map) {
2336 			if (map->len == 1)
2337 				queue_index = map->queues[0];
2338 			else {
2339 				u32 hash;
2340 				if (skb->sk && skb->sk->sk_hash)
2341 					hash = skb->sk->sk_hash;
2342 				else
2343 					hash = (__force u16) skb->protocol ^
2344 					    skb->rxhash;
2345 				hash = jhash_1word(hash, hashrnd);
2346 				queue_index = map->queues[
2347 				    ((u64)hash * map->len) >> 32];
2348 			}
2349 			if (unlikely(queue_index >= dev->real_num_tx_queues))
2350 				queue_index = -1;
2351 		}
2352 	}
2353 	rcu_read_unlock();
2354 
2355 	return queue_index;
2356 #else
2357 	return -1;
2358 #endif
2359 }
2360 
2361 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2362 					struct sk_buff *skb)
2363 {
2364 	int queue_index;
2365 	const struct net_device_ops *ops = dev->netdev_ops;
2366 
2367 	if (dev->real_num_tx_queues == 1)
2368 		queue_index = 0;
2369 	else if (ops->ndo_select_queue) {
2370 		queue_index = ops->ndo_select_queue(dev, skb);
2371 		queue_index = dev_cap_txqueue(dev, queue_index);
2372 	} else {
2373 		struct sock *sk = skb->sk;
2374 		queue_index = sk_tx_queue_get(sk);
2375 
2376 		if (queue_index < 0 || skb->ooo_okay ||
2377 		    queue_index >= dev->real_num_tx_queues) {
2378 			int old_index = queue_index;
2379 
2380 			queue_index = get_xps_queue(dev, skb);
2381 			if (queue_index < 0)
2382 				queue_index = skb_tx_hash(dev, skb);
2383 
2384 			if (queue_index != old_index && sk) {
2385 				struct dst_entry *dst =
2386 				    rcu_dereference_check(sk->sk_dst_cache, 1);
2387 
2388 				if (dst && skb_dst(skb) == dst)
2389 					sk_tx_queue_set(sk, queue_index);
2390 			}
2391 		}
2392 	}
2393 
2394 	skb_set_queue_mapping(skb, queue_index);
2395 	return netdev_get_tx_queue(dev, queue_index);
2396 }
2397 
2398 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2399 				 struct net_device *dev,
2400 				 struct netdev_queue *txq)
2401 {
2402 	spinlock_t *root_lock = qdisc_lock(q);
2403 	bool contended;
2404 	int rc;
2405 
2406 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2407 	qdisc_calculate_pkt_len(skb, q);
2408 	/*
2409 	 * Heuristic to force contended enqueues to serialize on a
2410 	 * separate lock before trying to get qdisc main lock.
2411 	 * separate lock before trying to get the qdisc main lock.
2412 	 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2413 	 */
2414 	contended = qdisc_is_running(q);
2415 	if (unlikely(contended))
2416 		spin_lock(&q->busylock);
2417 
2418 	spin_lock(root_lock);
2419 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2420 		kfree_skb(skb);
2421 		rc = NET_XMIT_DROP;
2422 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2423 		   qdisc_run_begin(q)) {
2424 		/*
2425 		 * This is a work-conserving queue; there are no old skbs
2426 		 * waiting to be sent out; and the qdisc is not running -
2427 		 * xmit the skb directly.
2428 		 */
2429 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2430 			skb_dst_force(skb);
2431 
2432 		qdisc_bstats_update(q, skb);
2433 
2434 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2435 			if (unlikely(contended)) {
2436 				spin_unlock(&q->busylock);
2437 				contended = false;
2438 			}
2439 			__qdisc_run(q);
2440 		} else
2441 			qdisc_run_end(q);
2442 
2443 		rc = NET_XMIT_SUCCESS;
2444 	} else {
2445 		skb_dst_force(skb);
2446 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2447 		if (qdisc_run_begin(q)) {
2448 			if (unlikely(contended)) {
2449 				spin_unlock(&q->busylock);
2450 				contended = false;
2451 			}
2452 			__qdisc_run(q);
2453 		}
2454 	}
2455 	spin_unlock(root_lock);
2456 	if (unlikely(contended))
2457 		spin_unlock(&q->busylock);
2458 	return rc;
2459 }
2460 
2461 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2462 static void skb_update_prio(struct sk_buff *skb)
2463 {
2464 	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2465 
2466 	if (!skb->priority && skb->sk && map) {
2467 		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2468 
2469 		if (prioidx < map->priomap_len)
2470 			skb->priority = map->priomap[prioidx];
2471 	}
2472 }
2473 #else
2474 #define skb_update_prio(skb)
2475 #endif
2476 
2477 static DEFINE_PER_CPU(int, xmit_recursion);
2478 #define RECURSION_LIMIT 10
2479 
2480 /**
2481  *	dev_loopback_xmit - loop back @skb
2482  *	@skb: buffer to transmit
2483  */
2484 int dev_loopback_xmit(struct sk_buff *skb)
2485 {
2486 	skb_reset_mac_header(skb);
2487 	__skb_pull(skb, skb_network_offset(skb));
2488 	skb->pkt_type = PACKET_LOOPBACK;
2489 	skb->ip_summed = CHECKSUM_UNNECESSARY;
2490 	WARN_ON(!skb_dst(skb));
2491 	skb_dst_force(skb);
2492 	netif_rx_ni(skb);
2493 	return 0;
2494 }
2495 EXPORT_SYMBOL(dev_loopback_xmit);
2496 
2497 /**
2498  *	dev_queue_xmit - transmit a buffer
2499  *	@skb: buffer to transmit
2500  *
2501  *	Queue a buffer for transmission to a network device. The caller must
2502  *	have set the device and priority and built the buffer before calling
2503  *	this function. The function can be called from an interrupt.
2504  *
2505  *	A negative errno code is returned on a failure. A success does not
2506  *	guarantee the frame will be transmitted as it may be dropped due
2507  *	to congestion or traffic shaping.
2508  *
2509  * -----------------------------------------------------------------------------------
2510  *      I notice this method can also return errors from the queue disciplines,
2511  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2512  *      be positive.
2513  *
2514  *      Regardless of the return value, the skb is consumed, so it is currently
2515  *      difficult to retry a send to this method.  (You can bump the ref count
2516  *      before sending to hold a reference for retry if you are careful.)
2517  *
2518  *      When calling this method, interrupts MUST be enabled.  This is because
2519  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2520  *          --BLG
2521  */
2522 int dev_queue_xmit(struct sk_buff *skb)
2523 {
2524 	struct net_device *dev = skb->dev;
2525 	struct netdev_queue *txq;
2526 	struct Qdisc *q;
2527 	int rc = -ENOMEM;
2528 
2529 	/* Disable soft irqs for various locks below. Also
2530 	 * stops preemption for RCU.
2531 	 */
2532 	rcu_read_lock_bh();
2533 
2534 	skb_update_prio(skb);
2535 
2536 	txq = dev_pick_tx(dev, skb);
2537 	q = rcu_dereference_bh(txq->qdisc);
2538 
2539 #ifdef CONFIG_NET_CLS_ACT
2540 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2541 #endif
2542 	trace_net_dev_queue(skb);
2543 	if (q->enqueue) {
2544 		rc = __dev_xmit_skb(skb, q, dev, txq);
2545 		goto out;
2546 	}
2547 
2548 	/* The device has no queue. This is the common case for software
2549 	   devices: loopback, all sorts of tunnels...
2550 
2551 	   Really, it is unlikely that netif_tx_lock protection is necessary
2552 	   here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
2553 	   counters.)
2554 	   However, it is possible that they rely on the protection
2555 	   we provide here.
2556 
2557 	   Check this and take the lock; it is not prone to deadlocks.
2558 	   Either that, or just lock the noqueue qdisc - it is even simpler 8)
2559 	 */
2560 	if (dev->flags & IFF_UP) {
2561 		int cpu = smp_processor_id(); /* ok because BHs are off */
2562 
2563 		if (txq->xmit_lock_owner != cpu) {
2564 
2565 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2566 				goto recursion_alert;
2567 
2568 			HARD_TX_LOCK(dev, txq, cpu);
2569 
2570 			if (!netif_xmit_stopped(txq)) {
2571 				__this_cpu_inc(xmit_recursion);
2572 				rc = dev_hard_start_xmit(skb, dev, txq);
2573 				__this_cpu_dec(xmit_recursion);
2574 				if (dev_xmit_complete(rc)) {
2575 					HARD_TX_UNLOCK(dev, txq);
2576 					goto out;
2577 				}
2578 			}
2579 			HARD_TX_UNLOCK(dev, txq);
2580 			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2581 					     dev->name);
2582 		} else {
2583 			/* Recursion is detected! It is possible,
2584 			 * unfortunately
2585 			 */
2586 recursion_alert:
2587 			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2588 					     dev->name);
2589 		}
2590 	}
2591 
2592 	rc = -ENETDOWN;
2593 	rcu_read_unlock_bh();
2594 
2595 	kfree_skb(skb);
2596 	return rc;
2597 out:
2598 	rcu_read_unlock_bh();
2599 	return rc;
2600 }
2601 EXPORT_SYMBOL(dev_queue_xmit);
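
/*
 * Illustrative sketch: sending a fully built skb as described in the comment
 * above, holding an extra reference because dev_queue_xmit() always consumes
 * the skb it is given.  example_xmit_with_ref() is an invented name.
 */
static int __maybe_unused example_xmit_with_ref(struct sk_buff *skb)
{
	int rc;

	skb_get(skb);			/* hold an extra reference */
	rc = dev_queue_xmit(skb);	/* consumes one reference; may return a
					 * negative errno or a positive
					 * NET_XMIT_* code
					 */
	if (rc != NET_XMIT_SUCCESS)
		pr_debug("xmit of %p returned %d\n", skb, rc);
	kfree_skb(skb);			/* drop our extra reference */
	return rc;
}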
2602 
2603 
2604 /*=======================================================================
2605 			Receiver routines
2606   =======================================================================*/
2607 
2608 int netdev_max_backlog __read_mostly = 1000;
2609 int netdev_tstamp_prequeue __read_mostly = 1;
2610 int netdev_budget __read_mostly = 300;
2611 int weight_p __read_mostly = 64;            /* old backlog weight */
2612 
2613 /* Called with irq disabled */
2614 static inline void ____napi_schedule(struct softnet_data *sd,
2615 				     struct napi_struct *napi)
2616 {
2617 	list_add_tail(&napi->poll_list, &sd->poll_list);
2618 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2619 }
2620 
2621 /*
2622  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2623  * and src/dst port numbers.  Sets rxhash in the skb to a non-zero hash value
2624  * on success; zero indicates no valid hash.  Also sets l4_rxhash in the skb
2625  * if the hash is a canonical 4-tuple hash over transport ports.
2626  */
2627 void __skb_get_rxhash(struct sk_buff *skb)
2628 {
2629 	struct flow_keys keys;
2630 	u32 hash;
2631 
2632 	if (!skb_flow_dissect(skb, &keys))
2633 		return;
2634 
2635 	if (keys.ports) {
2636 		if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
2637 			swap(keys.port16[0], keys.port16[1]);
2638 		skb->l4_rxhash = 1;
2639 	}
2640 
2641 	/* get a consistent hash (same value on both flow directions) */
2642 	if ((__force u32)keys.dst < (__force u32)keys.src)
2643 		swap(keys.dst, keys.src);
2644 
2645 	hash = jhash_3words((__force u32)keys.dst,
2646 			    (__force u32)keys.src,
2647 			    (__force u32)keys.ports, hashrnd);
2648 	if (!hash)
2649 		hash = 1;
2650 
2651 	skb->rxhash = hash;
2652 }
2653 EXPORT_SYMBOL(__skb_get_rxhash);
2654 
2655 #ifdef CONFIG_RPS
2656 
2657 /* One global table that all flow-based protocols share. */
2658 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2659 EXPORT_SYMBOL(rps_sock_flow_table);
2660 
2661 struct static_key rps_needed __read_mostly;
2662 
2663 static struct rps_dev_flow *
2664 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2665 	    struct rps_dev_flow *rflow, u16 next_cpu)
2666 {
2667 	if (next_cpu != RPS_NO_CPU) {
2668 #ifdef CONFIG_RFS_ACCEL
2669 		struct netdev_rx_queue *rxqueue;
2670 		struct rps_dev_flow_table *flow_table;
2671 		struct rps_dev_flow *old_rflow;
2672 		u32 flow_id;
2673 		u16 rxq_index;
2674 		int rc;
2675 
2676 		/* Should we steer this flow to a different hardware queue? */
2677 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2678 		    !(dev->features & NETIF_F_NTUPLE))
2679 			goto out;
2680 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2681 		if (rxq_index == skb_get_rx_queue(skb))
2682 			goto out;
2683 
2684 		rxqueue = dev->_rx + rxq_index;
2685 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2686 		if (!flow_table)
2687 			goto out;
2688 		flow_id = skb->rxhash & flow_table->mask;
2689 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2690 							rxq_index, flow_id);
2691 		if (rc < 0)
2692 			goto out;
2693 		old_rflow = rflow;
2694 		rflow = &flow_table->flows[flow_id];
2695 		rflow->filter = rc;
2696 		if (old_rflow->filter == rflow->filter)
2697 			old_rflow->filter = RPS_NO_FILTER;
2698 	out:
2699 #endif
2700 		rflow->last_qtail =
2701 			per_cpu(softnet_data, next_cpu).input_queue_head;
2702 	}
2703 
2704 	rflow->cpu = next_cpu;
2705 	return rflow;
2706 }
2707 
2708 /*
2709  * get_rps_cpu is called from netif_receive_skb and returns the target
2710  * CPU from the RPS map of the receiving queue for a given skb.
2711  * rcu_read_lock must be held on entry.
2712  */
2713 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2714 		       struct rps_dev_flow **rflowp)
2715 {
2716 	struct netdev_rx_queue *rxqueue;
2717 	struct rps_map *map;
2718 	struct rps_dev_flow_table *flow_table;
2719 	struct rps_sock_flow_table *sock_flow_table;
2720 	int cpu = -1;
2721 	u16 tcpu;
2722 
2723 	if (skb_rx_queue_recorded(skb)) {
2724 		u16 index = skb_get_rx_queue(skb);
2725 		if (unlikely(index >= dev->real_num_rx_queues)) {
2726 			WARN_ONCE(dev->real_num_rx_queues > 1,
2727 				  "%s received packet on queue %u, but number "
2728 				  "of RX queues is %u\n",
2729 				  dev->name, index, dev->real_num_rx_queues);
2730 			goto done;
2731 		}
2732 		rxqueue = dev->_rx + index;
2733 	} else
2734 		rxqueue = dev->_rx;
2735 
2736 	map = rcu_dereference(rxqueue->rps_map);
2737 	if (map) {
2738 		if (map->len == 1 &&
2739 		    !rcu_access_pointer(rxqueue->rps_flow_table)) {
2740 			tcpu = map->cpus[0];
2741 			if (cpu_online(tcpu))
2742 				cpu = tcpu;
2743 			goto done;
2744 		}
2745 	} else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2746 		goto done;
2747 	}
2748 
2749 	skb_reset_network_header(skb);
2750 	if (!skb_get_rxhash(skb))
2751 		goto done;
2752 
2753 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2754 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2755 	if (flow_table && sock_flow_table) {
2756 		u16 next_cpu;
2757 		struct rps_dev_flow *rflow;
2758 
2759 		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2760 		tcpu = rflow->cpu;
2761 
2762 		next_cpu = sock_flow_table->ents[skb->rxhash &
2763 		    sock_flow_table->mask];
2764 
2765 		/*
2766 		 * If the desired CPU (where last recvmsg was done) is
2767 		 * different from current CPU (one in the rx-queue flow
2768 		 * table entry), switch if one of the following holds:
2769 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2770 		 *   - Current CPU is offline.
2771 		 *   - The current CPU's queue tail has advanced beyond the
2772 		 *     last packet that was enqueued using this table entry.
2773 		 *     This guarantees that all previous packets for the flow
2774 		 *     have been dequeued, thus preserving in order delivery.
2775 		 */
2776 		if (unlikely(tcpu != next_cpu) &&
2777 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2778 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2779 		      rflow->last_qtail)) >= 0))
2780 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2781 
2782 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2783 			*rflowp = rflow;
2784 			cpu = tcpu;
2785 			goto done;
2786 		}
2787 	}
2788 
2789 	if (map) {
2790 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2791 
2792 		if (cpu_online(tcpu)) {
2793 			cpu = tcpu;
2794 			goto done;
2795 		}
2796 	}
2797 
2798 done:
2799 	return cpu;
2800 }
2801 
2802 #ifdef CONFIG_RFS_ACCEL
2803 
2804 /**
2805  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2806  * @dev: Device on which the filter was set
2807  * @rxq_index: RX queue index
2808  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2809  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2810  *
2811  * Drivers that implement ndo_rx_flow_steer() should periodically call
2812  * this function for each installed filter and remove the filters for
2813  * which it returns %true.
2814  */
2815 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2816 			 u32 flow_id, u16 filter_id)
2817 {
2818 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2819 	struct rps_dev_flow_table *flow_table;
2820 	struct rps_dev_flow *rflow;
2821 	bool expire = true;
2822 	int cpu;
2823 
2824 	rcu_read_lock();
2825 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2826 	if (flow_table && flow_id <= flow_table->mask) {
2827 		rflow = &flow_table->flows[flow_id];
2828 		cpu = ACCESS_ONCE(rflow->cpu);
2829 		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2830 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2831 			   rflow->last_qtail) <
2832 		     (int)(10 * flow_table->mask)))
2833 			expire = false;
2834 	}
2835 	rcu_read_unlock();
2836 	return expire;
2837 }
2838 EXPORT_SYMBOL(rps_may_expire_flow);
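
/*
 * Illustrative sketch: the periodic expiry scan described above, as a
 * hypothetical driver might run it from a timer or service task.
 * "struct example_filter" and the example_* names are invented; only
 * rps_may_expire_flow() and the ndo_rx_flow_steer() contract it refers to
 * are real.
 */
struct example_filter {
	u32 flow_id;		/* flow_id passed to ndo_rx_flow_steer() */
	u16 filter_id;		/* value returned from ndo_rx_flow_steer() */
	u16 rxq_index;		/* RX queue the filter steers to */
	bool in_use;
};

static void __maybe_unused example_expire_filters(struct net_device *dev,
						  struct example_filter *tbl,
						  unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (!tbl[i].in_use)
			continue;
		if (rps_may_expire_flow(dev, tbl[i].rxq_index,
					tbl[i].flow_id, tbl[i].filter_id)) {
			/* Hypothetical: remove the hardware filter here. */
			tbl[i].in_use = false;
		}
	}
}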
2839 
2840 #endif /* CONFIG_RFS_ACCEL */
2841 
2842 /* Called from hardirq (IPI) context */
2843 static void rps_trigger_softirq(void *data)
2844 {
2845 	struct softnet_data *sd = data;
2846 
2847 	____napi_schedule(sd, &sd->backlog);
2848 	sd->received_rps++;
2849 }
2850 
2851 #endif /* CONFIG_RPS */
2852 
2853 /*
2854  * Check whether this softnet_data structure belongs to another CPU.
2855  * If so, queue it on our IPI list and return 1;
2856  * if not, return 0.
2857  */
2858 static int rps_ipi_queued(struct softnet_data *sd)
2859 {
2860 #ifdef CONFIG_RPS
2861 	struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2862 
2863 	if (sd != mysd) {
2864 		sd->rps_ipi_next = mysd->rps_ipi_list;
2865 		mysd->rps_ipi_list = sd;
2866 
2867 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2868 		return 1;
2869 	}
2870 #endif /* CONFIG_RPS */
2871 	return 0;
2872 }
2873 
2874 /*
2875  * enqueue_to_backlog is called to queue an skb to a per-CPU backlog
2876  * queue (which may be a remote CPU's queue).
2877  */
2878 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2879 			      unsigned int *qtail)
2880 {
2881 	struct softnet_data *sd;
2882 	unsigned long flags;
2883 
2884 	sd = &per_cpu(softnet_data, cpu);
2885 
2886 	local_irq_save(flags);
2887 
2888 	rps_lock(sd);
2889 	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2890 		if (skb_queue_len(&sd->input_pkt_queue)) {
2891 enqueue:
2892 			__skb_queue_tail(&sd->input_pkt_queue, skb);
2893 			input_queue_tail_incr_save(sd, qtail);
2894 			rps_unlock(sd);
2895 			local_irq_restore(flags);
2896 			return NET_RX_SUCCESS;
2897 		}
2898 
2899 		/* Schedule NAPI for the backlog device.
2900 		 * We can use a non-atomic operation since we own the queue lock.
2901 		 */
2902 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2903 			if (!rps_ipi_queued(sd))
2904 				____napi_schedule(sd, &sd->backlog);
2905 		}
2906 		goto enqueue;
2907 	}
2908 
2909 	sd->dropped++;
2910 	rps_unlock(sd);
2911 
2912 	local_irq_restore(flags);
2913 
2914 	atomic_long_inc(&skb->dev->rx_dropped);
2915 	kfree_skb(skb);
2916 	return NET_RX_DROP;
2917 }
2918 
2919 /**
2920  *	netif_rx	-	post buffer to the network code
2921  *	@skb: buffer to post
2922  *
2923  *	This function receives a packet from a device driver and queues it for
2924  *	the upper (protocol) levels to process.  It always succeeds. The buffer
2925  *	may be dropped during processing for congestion control or by the
2926  *	protocol layers.
2927  *
2928  *	return values:
2929  *	NET_RX_SUCCESS	(no congestion)
2930  *	NET_RX_DROP     (packet was dropped)
2931  *
2932  */
2933 
2934 int netif_rx(struct sk_buff *skb)
2935 {
2936 	int ret;
2937 
2938 	/* if netpoll wants it, pretend we never saw it */
2939 	if (netpoll_rx(skb))
2940 		return NET_RX_DROP;
2941 
2942 	net_timestamp_check(netdev_tstamp_prequeue, skb);
2943 
2944 	trace_netif_rx(skb);
2945 #ifdef CONFIG_RPS
2946 	if (static_key_false(&rps_needed)) {
2947 		struct rps_dev_flow voidflow, *rflow = &voidflow;
2948 		int cpu;
2949 
2950 		preempt_disable();
2951 		rcu_read_lock();
2952 
2953 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2954 		if (cpu < 0)
2955 			cpu = smp_processor_id();
2956 
2957 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2958 
2959 		rcu_read_unlock();
2960 		preempt_enable();
2961 	} else
2962 #endif
2963 	{
2964 		unsigned int qtail;
2965 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2966 		put_cpu();
2967 	}
2968 	return ret;
2969 }
2970 EXPORT_SYMBOL(netif_rx);
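
/*
 * Illustrative sketch: the minimal receive path of a hypothetical non-NAPI
 * driver handing a frame to netif_rx() as described above.
 * example_rx_frame() is an invented name; "data"/"len" stand for a frame
 * already copied out of the hardware.
 */
static void __maybe_unused example_rx_frame(struct net_device *dev,
					    const void *data, unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}
	memcpy(skb_put(skb, len), data, len);
	skb->protocol = eth_type_trans(skb, dev);
	netif_rx(skb);			/* queue for protocol processing */
}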
2971 
2972 int netif_rx_ni(struct sk_buff *skb)
2973 {
2974 	int err;
2975 
2976 	preempt_disable();
2977 	err = netif_rx(skb);
2978 	if (local_softirq_pending())
2979 		do_softirq();
2980 	preempt_enable();
2981 
2982 	return err;
2983 }
2984 EXPORT_SYMBOL(netif_rx_ni);
2985 
2986 static void net_tx_action(struct softirq_action *h)
2987 {
2988 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2989 
2990 	if (sd->completion_queue) {
2991 		struct sk_buff *clist;
2992 
2993 		local_irq_disable();
2994 		clist = sd->completion_queue;
2995 		sd->completion_queue = NULL;
2996 		local_irq_enable();
2997 
2998 		while (clist) {
2999 			struct sk_buff *skb = clist;
3000 			clist = clist->next;
3001 
3002 			WARN_ON(atomic_read(&skb->users));
3003 			trace_kfree_skb(skb, net_tx_action);
3004 			__kfree_skb(skb);
3005 		}
3006 	}
3007 
3008 	if (sd->output_queue) {
3009 		struct Qdisc *head;
3010 
3011 		local_irq_disable();
3012 		head = sd->output_queue;
3013 		sd->output_queue = NULL;
3014 		sd->output_queue_tailp = &sd->output_queue;
3015 		local_irq_enable();
3016 
3017 		while (head) {
3018 			struct Qdisc *q = head;
3019 			spinlock_t *root_lock;
3020 
3021 			head = head->next_sched;
3022 
3023 			root_lock = qdisc_lock(q);
3024 			if (spin_trylock(root_lock)) {
3025 				smp_mb__before_clear_bit();
3026 				clear_bit(__QDISC_STATE_SCHED,
3027 					  &q->state);
3028 				qdisc_run(q);
3029 				spin_unlock(root_lock);
3030 			} else {
3031 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3032 					      &q->state)) {
3033 					__netif_reschedule(q);
3034 				} else {
3035 					smp_mb__before_clear_bit();
3036 					clear_bit(__QDISC_STATE_SCHED,
3037 						  &q->state);
3038 				}
3039 			}
3040 		}
3041 	}
3042 }
3043 
3044 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3045     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3046 /* This hook is defined here for ATM LANE */
3047 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3048 			     unsigned char *addr) __read_mostly;
3049 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3050 #endif
3051 
3052 #ifdef CONFIG_NET_CLS_ACT
3053 /* TODO: Maybe we should just force sch_ingress to be compiled in
3054  * when CONFIG_NET_CLS_ACT is? Otherwise we pay some useless instructions
3055  * (a compare and two extra stores) right now if we don't have it built
3056  * but do have CONFIG_NET_CLS_ACT.
3057  * NOTE: This doesn't stop any functionality; if you don't have
3058  * the ingress scheduler, you just can't add policies on ingress.
3059  *
3060  */
3061 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3062 {
3063 	struct net_device *dev = skb->dev;
3064 	u32 ttl = G_TC_RTTL(skb->tc_verd);
3065 	int result = TC_ACT_OK;
3066 	struct Qdisc *q;
3067 
3068 	if (unlikely(MAX_RED_LOOP < ttl++)) {
3069 		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3070 				     skb->skb_iif, dev->ifindex);
3071 		return TC_ACT_SHOT;
3072 	}
3073 
3074 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3075 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3076 
3077 	q = rxq->qdisc;
3078 	if (q != &noop_qdisc) {
3079 		spin_lock(qdisc_lock(q));
3080 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3081 			result = qdisc_enqueue_root(skb, q);
3082 		spin_unlock(qdisc_lock(q));
3083 	}
3084 
3085 	return result;
3086 }
3087 
3088 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3089 					 struct packet_type **pt_prev,
3090 					 int *ret, struct net_device *orig_dev)
3091 {
3092 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3093 
3094 	if (!rxq || rxq->qdisc == &noop_qdisc)
3095 		goto out;
3096 
3097 	if (*pt_prev) {
3098 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3099 		*pt_prev = NULL;
3100 	}
3101 
3102 	switch (ing_filter(skb, rxq)) {
3103 	case TC_ACT_SHOT:
3104 	case TC_ACT_STOLEN:
3105 		kfree_skb(skb);
3106 		return NULL;
3107 	}
3108 
3109 out:
3110 	skb->tc_verd = 0;
3111 	return skb;
3112 }
3113 #endif
3114 
3115 /**
3116  *	netdev_rx_handler_register - register receive handler
3117  *	@dev: device to register a handler for
3118  *	@rx_handler: receive handler to register
3119  *	@rx_handler_data: data pointer that is used by rx handler
3120  *
3121  *	Register a receive handler for a device. This handler will then be
3122  *	called from __netif_receive_skb. A negative errno code is returned
3123  *	on a failure.
3124  *
3125  *	The caller must hold the rtnl_mutex.
3126  *
3127  *	For a general description of rx_handler, see enum rx_handler_result.
3128  */
3129 int netdev_rx_handler_register(struct net_device *dev,
3130 			       rx_handler_func_t *rx_handler,
3131 			       void *rx_handler_data)
3132 {
3133 	ASSERT_RTNL();
3134 
3135 	if (dev->rx_handler)
3136 		return -EBUSY;
3137 
3138 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3139 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3140 
3141 	return 0;
3142 }
3143 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
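
/*
 * Illustrative sketch: a hypothetical rx_handler and its registration under
 * rtnl_lock, as the comment above requires.  example_handle_frame() and
 * example_port_attach() are invented names; a real bridge/bonding-style
 * handler would do its work where the comments are.
 */
static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;

	if (skb->pkt_type == PACKET_LOOPBACK)
		return RX_HANDLER_PASS;

	/* Hypothetical: inspect or redirect the frame here; private data is
	 * reachable via rcu_dereference(skb->dev->rx_handler_data).
	 */
	return RX_HANDLER_PASS;		/* let __netif_receive_skb continue */
}

static int __maybe_unused example_port_attach(struct net_device *port_dev,
					      void *port_priv)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(port_dev, example_handle_frame,
					 port_priv);
	rtnl_unlock();
	return err;
}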
3144 
3145 /**
3146  *	netdev_rx_handler_unregister - unregister receive handler
3147  *	@dev: device to unregister a handler from
3148  *
3149  *	Unregister a receive handler from a device.
3150  *
3151  *	The caller must hold the rtnl_mutex.
3152  */
3153 void netdev_rx_handler_unregister(struct net_device *dev)
3154 {
3155 
3156 	ASSERT_RTNL();
3157 	RCU_INIT_POINTER(dev->rx_handler, NULL);
3158 	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3159 }
3160 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3161 
3162 /*
3163  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3164  * the special handling of PFMEMALLOC skbs.
3165  */
3166 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3167 {
3168 	switch (skb->protocol) {
3169 	case __constant_htons(ETH_P_ARP):
3170 	case __constant_htons(ETH_P_IP):
3171 	case __constant_htons(ETH_P_IPV6):
3172 	case __constant_htons(ETH_P_8021Q):
3173 		return true;
3174 	default:
3175 		return false;
3176 	}
3177 }
3178 
3179 static int __netif_receive_skb(struct sk_buff *skb)
3180 {
3181 	struct packet_type *ptype, *pt_prev;
3182 	rx_handler_func_t *rx_handler;
3183 	struct net_device *orig_dev;
3184 	struct net_device *null_or_dev;
3185 	bool deliver_exact = false;
3186 	int ret = NET_RX_DROP;
3187 	__be16 type;
3188 	unsigned long pflags = current->flags;
3189 
3190 	net_timestamp_check(!netdev_tstamp_prequeue, skb);
3191 
3192 	trace_netif_receive_skb(skb);
3193 
3194 	/*
3195 	 * PFMEMALLOC skbs are special, they should
3196 	 * - be delivered to SOCK_MEMALLOC sockets only
3197 	 * - stay away from userspace
3198 	 * - have bounded memory usage
3199 	 *
3200 	 * Use PF_MEMALLOC as this saves us from propagating the allocation
3201 	 * context down to all allocation sites.
3202 	 */
3203 	if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3204 		current->flags |= PF_MEMALLOC;
3205 
3206 	/* if we've gotten here through NAPI, check netpoll */
3207 	if (netpoll_receive_skb(skb))
3208 		goto out;
3209 
3210 	orig_dev = skb->dev;
3211 
3212 	skb_reset_network_header(skb);
3213 	skb_reset_transport_header(skb);
3214 	skb_reset_mac_len(skb);
3215 
3216 	pt_prev = NULL;
3217 
3218 	rcu_read_lock();
3219 
3220 another_round:
3221 	skb->skb_iif = skb->dev->ifindex;
3222 
3223 	__this_cpu_inc(softnet_data.processed);
3224 
3225 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3226 		skb = vlan_untag(skb);
3227 		if (unlikely(!skb))
3228 			goto unlock;
3229 	}
3230 
3231 #ifdef CONFIG_NET_CLS_ACT
3232 	if (skb->tc_verd & TC_NCLS) {
3233 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3234 		goto ncls;
3235 	}
3236 #endif
3237 
3238 	if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3239 		goto skip_taps;
3240 
3241 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3242 		if (!ptype->dev || ptype->dev == skb->dev) {
3243 			if (pt_prev)
3244 				ret = deliver_skb(skb, pt_prev, orig_dev);
3245 			pt_prev = ptype;
3246 		}
3247 	}
3248 
3249 skip_taps:
3250 #ifdef CONFIG_NET_CLS_ACT
3251 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3252 	if (!skb)
3253 		goto unlock;
3254 ncls:
3255 #endif
3256 
3257 	if (sk_memalloc_socks() && skb_pfmemalloc(skb)
3258 				&& !skb_pfmemalloc_protocol(skb))
3259 		goto drop;
3260 
3261 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3262 	if (vlan_tx_tag_present(skb)) {
3263 		if (pt_prev) {
3264 			ret = deliver_skb(skb, pt_prev, orig_dev);
3265 			pt_prev = NULL;
3266 		}
3267 		if (vlan_do_receive(&skb, !rx_handler))
3268 			goto another_round;
3269 		else if (unlikely(!skb))
3270 			goto unlock;
3271 	}
3272 
3273 	if (rx_handler) {
3274 		if (pt_prev) {
3275 			ret = deliver_skb(skb, pt_prev, orig_dev);
3276 			pt_prev = NULL;
3277 		}
3278 		switch (rx_handler(&skb)) {
3279 		case RX_HANDLER_CONSUMED:
3280 			goto unlock;
3281 		case RX_HANDLER_ANOTHER:
3282 			goto another_round;
3283 		case RX_HANDLER_EXACT:
3284 			deliver_exact = true;
3285 		case RX_HANDLER_PASS:
3286 			break;
3287 		default:
3288 			BUG();
3289 		}
3290 	}
3291 
3292 	/* deliver only exact match when indicated */
3293 	null_or_dev = deliver_exact ? skb->dev : NULL;
3294 
3295 	type = skb->protocol;
3296 	list_for_each_entry_rcu(ptype,
3297 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3298 		if (ptype->type == type &&
3299 		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3300 		     ptype->dev == orig_dev)) {
3301 			if (pt_prev)
3302 				ret = deliver_skb(skb, pt_prev, orig_dev);
3303 			pt_prev = ptype;
3304 		}
3305 	}
3306 
3307 	if (pt_prev) {
3308 		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3309 			ret = -ENOMEM;
3310 		else
3311 			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3312 	} else {
3313 drop:
3314 		atomic_long_inc(&skb->dev->rx_dropped);
3315 		kfree_skb(skb);
3316 		/* Jamal, now you will not be able to escape explaining
3317 		 * to me how you were going to use this. :-)
3318 		 */
3319 		ret = NET_RX_DROP;
3320 	}
3321 
3322 unlock:
3323 	rcu_read_unlock();
3324 out:
3325 	tsk_restore_flags(current, pflags, PF_MEMALLOC);
3326 	return ret;
3327 }
3328 
3329 /**
3330  *	netif_receive_skb - process receive buffer from network
3331  *	@skb: buffer to process
3332  *
3333  *	netif_receive_skb() is the main receive data processing function.
3334  *	It always succeeds. The buffer may be dropped during processing
3335  *	for congestion control or by the protocol layers.
3336  *
3337  *	This function may only be called from softirq context and interrupts
3338  *	should be enabled.
3339  *
3340  *	Return values (usually ignored):
3341  *	NET_RX_SUCCESS: no congestion
3342  *	NET_RX_DROP: packet was dropped
3343  */
3344 int netif_receive_skb(struct sk_buff *skb)
3345 {
3346 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3347 
3348 	if (skb_defer_rx_timestamp(skb))
3349 		return NET_RX_SUCCESS;
3350 
3351 #ifdef CONFIG_RPS
3352 	if (static_key_false(&rps_needed)) {
3353 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3354 		int cpu, ret;
3355 
3356 		rcu_read_lock();
3357 
3358 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3359 
3360 		if (cpu >= 0) {
3361 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3362 			rcu_read_unlock();
3363 			return ret;
3364 		}
3365 		rcu_read_unlock();
3366 	}
3367 #endif
3368 	return __netif_receive_skb(skb);
3369 }
3370 EXPORT_SYMBOL(netif_receive_skb);
3371 
3372 /* Network device is going away; flush any packets still pending.
3373  * Called with irqs disabled.
3374  */
3375 static void flush_backlog(void *arg)
3376 {
3377 	struct net_device *dev = arg;
3378 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3379 	struct sk_buff *skb, *tmp;
3380 
3381 	rps_lock(sd);
3382 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3383 		if (skb->dev == dev) {
3384 			__skb_unlink(skb, &sd->input_pkt_queue);
3385 			kfree_skb(skb);
3386 			input_queue_head_incr(sd);
3387 		}
3388 	}
3389 	rps_unlock(sd);
3390 
3391 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3392 		if (skb->dev == dev) {
3393 			__skb_unlink(skb, &sd->process_queue);
3394 			kfree_skb(skb);
3395 			input_queue_head_incr(sd);
3396 		}
3397 	}
3398 }
3399 
3400 static int napi_gro_complete(struct sk_buff *skb)
3401 {
3402 	struct packet_type *ptype;
3403 	__be16 type = skb->protocol;
3404 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3405 	int err = -ENOENT;
3406 
3407 	if (NAPI_GRO_CB(skb)->count == 1) {
3408 		skb_shinfo(skb)->gso_size = 0;
3409 		goto out;
3410 	}
3411 
3412 	rcu_read_lock();
3413 	list_for_each_entry_rcu(ptype, head, list) {
3414 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3415 			continue;
3416 
3417 		err = ptype->gro_complete(skb);
3418 		break;
3419 	}
3420 	rcu_read_unlock();
3421 
3422 	if (err) {
3423 		WARN_ON(&ptype->list == head);
3424 		kfree_skb(skb);
3425 		return NET_RX_SUCCESS;
3426 	}
3427 
3428 out:
3429 	return netif_receive_skb(skb);
3430 }
3431 
3432 inline void napi_gro_flush(struct napi_struct *napi)
3433 {
3434 	struct sk_buff *skb, *next;
3435 
3436 	for (skb = napi->gro_list; skb; skb = next) {
3437 		next = skb->next;
3438 		skb->next = NULL;
3439 		napi_gro_complete(skb);
3440 	}
3441 
3442 	napi->gro_count = 0;
3443 	napi->gro_list = NULL;
3444 }
3445 EXPORT_SYMBOL(napi_gro_flush);
3446 
3447 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3448 {
3449 	struct sk_buff **pp = NULL;
3450 	struct packet_type *ptype;
3451 	__be16 type = skb->protocol;
3452 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3453 	int same_flow;
3454 	int mac_len;
3455 	enum gro_result ret;
3456 
3457 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3458 		goto normal;
3459 
3460 	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3461 		goto normal;
3462 
3463 	rcu_read_lock();
3464 	list_for_each_entry_rcu(ptype, head, list) {
3465 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3466 			continue;
3467 
3468 		skb_set_network_header(skb, skb_gro_offset(skb));
3469 		mac_len = skb->network_header - skb->mac_header;
3470 		skb->mac_len = mac_len;
3471 		NAPI_GRO_CB(skb)->same_flow = 0;
3472 		NAPI_GRO_CB(skb)->flush = 0;
3473 		NAPI_GRO_CB(skb)->free = 0;
3474 
3475 		pp = ptype->gro_receive(&napi->gro_list, skb);
3476 		break;
3477 	}
3478 	rcu_read_unlock();
3479 
3480 	if (&ptype->list == head)
3481 		goto normal;
3482 
3483 	same_flow = NAPI_GRO_CB(skb)->same_flow;
3484 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3485 
3486 	if (pp) {
3487 		struct sk_buff *nskb = *pp;
3488 
3489 		*pp = nskb->next;
3490 		nskb->next = NULL;
3491 		napi_gro_complete(nskb);
3492 		napi->gro_count--;
3493 	}
3494 
3495 	if (same_flow)
3496 		goto ok;
3497 
3498 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3499 		goto normal;
3500 
3501 	napi->gro_count++;
3502 	NAPI_GRO_CB(skb)->count = 1;
3503 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3504 	skb->next = napi->gro_list;
3505 	napi->gro_list = skb;
3506 	ret = GRO_HELD;
3507 
3508 pull:
3509 	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3510 		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3511 
3512 		BUG_ON(skb->end - skb->tail < grow);
3513 
3514 		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3515 
3516 		skb->tail += grow;
3517 		skb->data_len -= grow;
3518 
3519 		skb_shinfo(skb)->frags[0].page_offset += grow;
3520 		skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3521 
3522 		if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3523 			skb_frag_unref(skb, 0);
3524 			memmove(skb_shinfo(skb)->frags,
3525 				skb_shinfo(skb)->frags + 1,
3526 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3527 		}
3528 	}
3529 
3530 ok:
3531 	return ret;
3532 
3533 normal:
3534 	ret = GRO_NORMAL;
3535 	goto pull;
3536 }
3537 EXPORT_SYMBOL(dev_gro_receive);
3538 
3539 static inline gro_result_t
3540 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3541 {
3542 	struct sk_buff *p;
3543 	unsigned int maclen = skb->dev->hard_header_len;
3544 
3545 	for (p = napi->gro_list; p; p = p->next) {
3546 		unsigned long diffs;
3547 
3548 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3549 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3550 		if (maclen == ETH_HLEN)
3551 			diffs |= compare_ether_header(skb_mac_header(p),
3552 						      skb_gro_mac_header(skb));
3553 		else if (!diffs)
3554 			diffs = memcmp(skb_mac_header(p),
3555 				       skb_gro_mac_header(skb),
3556 				       maclen);
3557 		NAPI_GRO_CB(p)->same_flow = !diffs;
3558 		NAPI_GRO_CB(p)->flush = 0;
3559 	}
3560 
3561 	return dev_gro_receive(napi, skb);
3562 }
3563 
3564 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3565 {
3566 	switch (ret) {
3567 	case GRO_NORMAL:
3568 		if (netif_receive_skb(skb))
3569 			ret = GRO_DROP;
3570 		break;
3571 
3572 	case GRO_DROP:
3573 		kfree_skb(skb);
3574 		break;
3575 
3576 	case GRO_MERGED_FREE:
3577 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3578 			kmem_cache_free(skbuff_head_cache, skb);
3579 		else
3580 			__kfree_skb(skb);
3581 		break;
3582 
3583 	case GRO_HELD:
3584 	case GRO_MERGED:
3585 		break;
3586 	}
3587 
3588 	return ret;
3589 }
3590 EXPORT_SYMBOL(napi_skb_finish);
3591 
3592 void skb_gro_reset_offset(struct sk_buff *skb)
3593 {
3594 	NAPI_GRO_CB(skb)->data_offset = 0;
3595 	NAPI_GRO_CB(skb)->frag0 = NULL;
3596 	NAPI_GRO_CB(skb)->frag0_len = 0;
3597 
3598 	if (skb->mac_header == skb->tail &&
3599 	    !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
3600 		NAPI_GRO_CB(skb)->frag0 =
3601 			skb_frag_address(&skb_shinfo(skb)->frags[0]);
3602 		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
3603 	}
3604 }
3605 EXPORT_SYMBOL(skb_gro_reset_offset);
3606 
3607 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3608 {
3609 	skb_gro_reset_offset(skb);
3610 
3611 	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3612 }
3613 EXPORT_SYMBOL(napi_gro_receive);
3614 
3615 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3616 {
3617 	__skb_pull(skb, skb_headlen(skb));
3618 	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
3619 	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3620 	skb->vlan_tci = 0;
3621 	skb->dev = napi->dev;
3622 	skb->skb_iif = 0;
3623 
3624 	napi->skb = skb;
3625 }
3626 
3627 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3628 {
3629 	struct sk_buff *skb = napi->skb;
3630 
3631 	if (!skb) {
3632 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3633 		if (skb)
3634 			napi->skb = skb;
3635 	}
3636 	return skb;
3637 }
3638 EXPORT_SYMBOL(napi_get_frags);
3639 
3640 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3641 			       gro_result_t ret)
3642 {
3643 	switch (ret) {
3644 	case GRO_NORMAL:
3645 	case GRO_HELD:
3646 		skb->protocol = eth_type_trans(skb, skb->dev);
3647 
3648 		if (ret == GRO_HELD)
3649 			skb_gro_pull(skb, -ETH_HLEN);
3650 		else if (netif_receive_skb(skb))
3651 			ret = GRO_DROP;
3652 		break;
3653 
3654 	case GRO_DROP:
3655 	case GRO_MERGED_FREE:
3656 		napi_reuse_skb(napi, skb);
3657 		break;
3658 
3659 	case GRO_MERGED:
3660 		break;
3661 	}
3662 
3663 	return ret;
3664 }
3665 EXPORT_SYMBOL(napi_frags_finish);
3666 
3667 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3668 {
3669 	struct sk_buff *skb = napi->skb;
3670 	struct ethhdr *eth;
3671 	unsigned int hlen;
3672 	unsigned int off;
3673 
3674 	napi->skb = NULL;
3675 
3676 	skb_reset_mac_header(skb);
3677 	skb_gro_reset_offset(skb);
3678 
3679 	off = skb_gro_offset(skb);
3680 	hlen = off + sizeof(*eth);
3681 	eth = skb_gro_header_fast(skb, off);
3682 	if (skb_gro_header_hard(skb, hlen)) {
3683 		eth = skb_gro_header_slow(skb, hlen, off);
3684 		if (unlikely(!eth)) {
3685 			napi_reuse_skb(napi, skb);
3686 			skb = NULL;
3687 			goto out;
3688 		}
3689 	}
3690 
3691 	skb_gro_pull(skb, sizeof(*eth));
3692 
3693 	/*
3694 	 * This works because the only protocols we care about don't require
3695 	 * special handling.  We'll fix it up properly at the end.
3696 	 */
3697 	skb->protocol = eth->h_proto;
3698 
3699 out:
3700 	return skb;
3701 }
3702 
3703 gro_result_t napi_gro_frags(struct napi_struct *napi)
3704 {
3705 	struct sk_buff *skb = napi_frags_skb(napi);
3706 
3707 	if (!skb)
3708 		return GRO_DROP;
3709 
3710 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3711 }
3712 EXPORT_SYMBOL(napi_gro_frags);
3713 
3714 /*
3715  * net_rps_action sends any pending IPIs for RPS.
3716  * Note: called with local irq disabled, but exits with local irq enabled.
3717  */
3718 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3719 {
3720 #ifdef CONFIG_RPS
3721 	struct softnet_data *remsd = sd->rps_ipi_list;
3722 
3723 	if (remsd) {
3724 		sd->rps_ipi_list = NULL;
3725 
3726 		local_irq_enable();
3727 
3728 		/* Send pending IPIs to kick RPS processing on remote cpus. */
3729 		while (remsd) {
3730 			struct softnet_data *next = remsd->rps_ipi_next;
3731 
3732 			if (cpu_online(remsd->cpu))
3733 				__smp_call_function_single(remsd->cpu,
3734 							   &remsd->csd, 0);
3735 			remsd = next;
3736 		}
3737 	} else
3738 #endif
3739 		local_irq_enable();
3740 }
3741 
3742 static int process_backlog(struct napi_struct *napi, int quota)
3743 {
3744 	int work = 0;
3745 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3746 
3747 #ifdef CONFIG_RPS
3748 	/* Check if we have pending IPIs; it's better to send them now
3749 	 * than to wait for net_rx_action() to end.
3750 	 */
3751 	if (sd->rps_ipi_list) {
3752 		local_irq_disable();
3753 		net_rps_action_and_irq_enable(sd);
3754 	}
3755 #endif
3756 	napi->weight = weight_p;
3757 	local_irq_disable();
3758 	while (work < quota) {
3759 		struct sk_buff *skb;
3760 		unsigned int qlen;
3761 
3762 		while ((skb = __skb_dequeue(&sd->process_queue))) {
3763 			local_irq_enable();
3764 			__netif_receive_skb(skb);
3765 			local_irq_disable();
3766 			input_queue_head_incr(sd);
3767 			if (++work >= quota) {
3768 				local_irq_enable();
3769 				return work;
3770 			}
3771 		}
3772 
3773 		rps_lock(sd);
3774 		qlen = skb_queue_len(&sd->input_pkt_queue);
3775 		if (qlen)
3776 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3777 						   &sd->process_queue);
3778 
3779 		if (qlen < quota - work) {
3780 			/*
3781 			 * Inline a custom version of __napi_complete().
3782 			 * Only the current cpu owns and manipulates this napi,
3783 			 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3784 			 * so we can use a plain write instead of clear_bit()
3785 			 * and we don't need an smp_mb() memory barrier.
3786 			 */
3787 			list_del(&napi->poll_list);
3788 			napi->state = 0;
3789 
3790 			quota = work + qlen;
3791 		}
3792 		rps_unlock(sd);
3793 	}
3794 	local_irq_enable();
3795 
3796 	return work;
3797 }
3798 
3799 /**
3800  * __napi_schedule - schedule for receive
3801  * @n: entry to schedule
3802  *
3803  * The entry's receive function will be scheduled to run
3804  */
3805 void __napi_schedule(struct napi_struct *n)
3806 {
3807 	unsigned long flags;
3808 
3809 	local_irq_save(flags);
3810 	____napi_schedule(&__get_cpu_var(softnet_data), n);
3811 	local_irq_restore(flags);
3812 }
3813 EXPORT_SYMBOL(__napi_schedule);
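/*
 * Illustrative sketch (not part of the original file): drivers normally
 * reach __napi_schedule() through the napi_schedule() wrapper from their
 * interrupt handler, after masking further RX interrupts.  "adap" and
 * my_disable_rx_irq() are hypothetical driver-side names.
 *
 *	if (napi_schedule_prep(&adap->napi)) {
 *		my_disable_rx_irq(adap);
 *		__napi_schedule(&adap->napi);
 *	}
 */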
3814 
3815 void __napi_complete(struct napi_struct *n)
3816 {
3817 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3818 	BUG_ON(n->gro_list);
3819 
3820 	list_del(&n->poll_list);
3821 	smp_mb__before_clear_bit();
3822 	clear_bit(NAPI_STATE_SCHED, &n->state);
3823 }
3824 EXPORT_SYMBOL(__napi_complete);
3825 
3826 void napi_complete(struct napi_struct *n)
3827 {
3828 	unsigned long flags;
3829 
3830 	/*
3831 	 * don't let napi dequeue from the cpu poll list
3832 	 * just in case it's running on a different cpu
3833 	 */
3834 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3835 		return;
3836 
3837 	napi_gro_flush(n);
3838 	local_irq_save(flags);
3839 	__napi_complete(n);
3840 	local_irq_restore(flags);
3841 }
3842 EXPORT_SYMBOL(napi_complete);
3843 
3844 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3845 		    int (*poll)(struct napi_struct *, int), int weight)
3846 {
3847 	INIT_LIST_HEAD(&napi->poll_list);
3848 	napi->gro_count = 0;
3849 	napi->gro_list = NULL;
3850 	napi->skb = NULL;
3851 	napi->poll = poll;
3852 	napi->weight = weight;
3853 	list_add(&napi->dev_list, &dev->napi_list);
3854 	napi->dev = dev;
3855 #ifdef CONFIG_NETPOLL
3856 	spin_lock_init(&napi->poll_lock);
3857 	napi->poll_owner = -1;
3858 #endif
3859 	set_bit(NAPI_STATE_SCHED, &napi->state);
3860 }
3861 EXPORT_SYMBOL(netif_napi_add);
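/*
 * Illustrative sketch (not part of the original file): the usual driver
 * pairing of netif_napi_add() with a poll callback that completes NAPI
 * once it runs out of work.  "struct my_adapter", my_clean_rx() and
 * my_enable_rx_irq() are hypothetical.
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct my_adapter *adap =
 *			container_of(napi, struct my_adapter, napi);
 *		int work_done = my_clean_rx(adap, budget);
 *
 *		if (work_done < budget) {
 *			napi_complete(napi);
 *			my_enable_rx_irq(adap);
 *		}
 *		return work_done;
 *	}
 *
 *	netif_napi_add(adap->netdev, &adap->napi, my_poll, 64);
 *	napi_enable(&adap->napi);
 */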
3862 
3863 void netif_napi_del(struct napi_struct *napi)
3864 {
3865 	struct sk_buff *skb, *next;
3866 
3867 	list_del_init(&napi->dev_list);
3868 	napi_free_frags(napi);
3869 
3870 	for (skb = napi->gro_list; skb; skb = next) {
3871 		next = skb->next;
3872 		skb->next = NULL;
3873 		kfree_skb(skb);
3874 	}
3875 
3876 	napi->gro_list = NULL;
3877 	napi->gro_count = 0;
3878 }
3879 EXPORT_SYMBOL(netif_napi_del);
3880 
3881 static void net_rx_action(struct softirq_action *h)
3882 {
3883 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3884 	unsigned long time_limit = jiffies + 2;
3885 	int budget = netdev_budget;
3886 	void *have;
3887 
3888 	local_irq_disable();
3889 
3890 	while (!list_empty(&sd->poll_list)) {
3891 		struct napi_struct *n;
3892 		int work, weight;
3893 
3894 		/* If the softirq window is exhausted then punt.
3895 		 * Allow this to run for 2 jiffies, which allows
3896 		 * an average latency of 1.5/HZ.
3897 		 */
3898 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3899 			goto softnet_break;
3900 
3901 		local_irq_enable();
3902 
3903 		/* Even though interrupts have been re-enabled, this
3904 		 * access is safe because interrupts can only add new
3905 		 * entries to the tail of this list, and only ->poll()
3906 		 * calls can remove this head entry from the list.
3907 		 */
3908 		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3909 
3910 		have = netpoll_poll_lock(n);
3911 
3912 		weight = n->weight;
3913 
3914 		/* This NAPI_STATE_SCHED test is for avoiding a race
3915 		 * with netpoll's poll_napi().  Only the entity which
3916 		 * obtains the lock and sees NAPI_STATE_SCHED set will
3917 		 * actually make the ->poll() call.  Therefore we avoid
3918 		 * accidentally calling ->poll() when NAPI is not scheduled.
3919 		 */
3920 		work = 0;
3921 		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3922 			work = n->poll(n, weight);
3923 			trace_napi_poll(n);
3924 		}
3925 
3926 		WARN_ON_ONCE(work > weight);
3927 
3928 		budget -= work;
3929 
3930 		local_irq_disable();
3931 
3932 		/* Drivers must not modify the NAPI state if they
3933 		 * consume the entire weight.  In such cases this code
3934 		 * still "owns" the NAPI instance and therefore can
3935 		 * move the instance around on the list at-will.
3936 		 */
3937 		if (unlikely(work == weight)) {
3938 			if (unlikely(napi_disable_pending(n))) {
3939 				local_irq_enable();
3940 				napi_complete(n);
3941 				local_irq_disable();
3942 			} else
3943 				list_move_tail(&n->poll_list, &sd->poll_list);
3944 		}
3945 
3946 		netpoll_poll_unlock(have);
3947 	}
3948 out:
3949 	net_rps_action_and_irq_enable(sd);
3950 
3951 #ifdef CONFIG_NET_DMA
3952 	/*
3953 	 * There may not be any more sk_buffs coming right now, so push
3954 	 * any pending DMA copies to hardware
3955 	 */
3956 	dma_issue_pending_all();
3957 #endif
3958 
3959 	return;
3960 
3961 softnet_break:
3962 	sd->time_squeeze++;
3963 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3964 	goto out;
3965 }
3966 
3967 static gifconf_func_t *gifconf_list[NPROTO];
3968 
3969 /**
3970  *	register_gifconf	-	register a SIOCGIF handler
3971  *	@family: Address family
3972  *	@gifconf: Function handler
3973  *
3974  *	Register protocol dependent address dumping routines. The handler
3975  *	that is passed must not be freed or reused until it has been replaced
3976  *	by another handler.
3977  */
3978 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3979 {
3980 	if (family >= NPROTO)
3981 		return -EINVAL;
3982 	gifconf_list[family] = gifconf;
3983 	return 0;
3984 }
3985 EXPORT_SYMBOL(register_gifconf);
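/*
 * Illustrative sketch (not part of the original file): an address family
 * registers its SIOCGIFCONF helper once at init time.  The handler has
 * the gifconf_func_t signature used by dev_ifconf() below: it fills the
 * user buffer for @dev and returns the number of bytes written, or the
 * space needed when the buffer pointer is NULL.  my_af_gifconf() is
 * hypothetical.
 *
 *	static int my_af_gifconf(struct net_device *dev,
 *				 char __user *buf, int len)
 *	{
 *		... write one struct ifreq per address owned by dev ...
 *	}
 *
 *	register_gifconf(PF_INET, my_af_gifconf);
 */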
3986 
3987 
3988 /*
3989  *	Map an interface index to its name (SIOCGIFNAME)
3990  */
3991 
3992 /*
3993  *	We need this ioctl for efficient implementation of the
3994  *	if_indextoname() function required by the IPv6 API.  Without
3995  *	it, we would have to search all the interfaces to find a
3996  *	match.  --pb
3997  */
3998 
3999 static int dev_ifname(struct net *net, struct ifreq __user *arg)
4000 {
4001 	struct net_device *dev;
4002 	struct ifreq ifr;
4003 
4004 	/*
4005 	 *	Fetch the caller's info block.
4006 	 */
4007 
4008 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4009 		return -EFAULT;
4010 
4011 	rcu_read_lock();
4012 	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4013 	if (!dev) {
4014 		rcu_read_unlock();
4015 		return -ENODEV;
4016 	}
4017 
4018 	strcpy(ifr.ifr_name, dev->name);
4019 	rcu_read_unlock();
4020 
4021 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4022 		return -EFAULT;
4023 	return 0;
4024 }
4025 
4026 /*
4027  *	Perform a SIOCGIFCONF call. This structure will change
4028  *	size eventually, and there is nothing I can do about it.
4029  *	Thus we will need a 'compatibility mode'.
4030  */
4031 
4032 static int dev_ifconf(struct net *net, char __user *arg)
4033 {
4034 	struct ifconf ifc;
4035 	struct net_device *dev;
4036 	char __user *pos;
4037 	int len;
4038 	int total;
4039 	int i;
4040 
4041 	/*
4042 	 *	Fetch the caller's info block.
4043 	 */
4044 
4045 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4046 		return -EFAULT;
4047 
4048 	pos = ifc.ifc_buf;
4049 	len = ifc.ifc_len;
4050 
4051 	/*
4052 	 *	Loop over the interfaces, and write an info block for each.
4053 	 */
4054 
4055 	total = 0;
4056 	for_each_netdev(net, dev) {
4057 		for (i = 0; i < NPROTO; i++) {
4058 			if (gifconf_list[i]) {
4059 				int done;
4060 				if (!pos)
4061 					done = gifconf_list[i](dev, NULL, 0);
4062 				else
4063 					done = gifconf_list[i](dev, pos + total,
4064 							       len - total);
4065 				if (done < 0)
4066 					return -EFAULT;
4067 				total += done;
4068 			}
4069 		}
4070 	}
4071 
4072 	/*
4073 	 *	All done.  Write the updated control block back to the caller.
4074 	 */
4075 	ifc.ifc_len = total;
4076 
4077 	/*
4078 	 * 	Both BSD and Solaris return 0 here, so we do too.
4079 	 */
4080 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4081 }
4082 
4083 #ifdef CONFIG_PROC_FS
4084 
4085 #define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
4086 
4087 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4088 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4089 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4090 
4091 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4092 {
4093 	struct net *net = seq_file_net(seq);
4094 	struct net_device *dev;
4095 	struct hlist_node *p;
4096 	struct hlist_head *h;
4097 	unsigned int count = 0, offset = get_offset(*pos);
4098 
4099 	h = &net->dev_name_head[get_bucket(*pos)];
4100 	hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4101 		if (++count == offset)
4102 			return dev;
4103 	}
4104 
4105 	return NULL;
4106 }
4107 
4108 static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4109 {
4110 	struct net_device *dev;
4111 	unsigned int bucket;
4112 
4113 	do {
4114 		dev = dev_from_same_bucket(seq, pos);
4115 		if (dev)
4116 			return dev;
4117 
4118 		bucket = get_bucket(*pos) + 1;
4119 		*pos = set_bucket_offset(bucket, 1);
4120 	} while (bucket < NETDEV_HASHENTRIES);
4121 
4122 	return NULL;
4123 }
4124 
4125 /*
4126  *	This is invoked by the /proc filesystem handler to display a device
4127  *	in detail.
4128  */
4129 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4130 	__acquires(RCU)
4131 {
4132 	rcu_read_lock();
4133 	if (!*pos)
4134 		return SEQ_START_TOKEN;
4135 
4136 	if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4137 		return NULL;
4138 
4139 	return dev_from_bucket(seq, pos);
4140 }
4141 
4142 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4143 {
4144 	++*pos;
4145 	return dev_from_bucket(seq, pos);
4146 }
4147 
4148 void dev_seq_stop(struct seq_file *seq, void *v)
4149 	__releases(RCU)
4150 {
4151 	rcu_read_unlock();
4152 }
4153 
4154 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4155 {
4156 	struct rtnl_link_stats64 temp;
4157 	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4158 
4159 	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4160 		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4161 		   dev->name, stats->rx_bytes, stats->rx_packets,
4162 		   stats->rx_errors,
4163 		   stats->rx_dropped + stats->rx_missed_errors,
4164 		   stats->rx_fifo_errors,
4165 		   stats->rx_length_errors + stats->rx_over_errors +
4166 		    stats->rx_crc_errors + stats->rx_frame_errors,
4167 		   stats->rx_compressed, stats->multicast,
4168 		   stats->tx_bytes, stats->tx_packets,
4169 		   stats->tx_errors, stats->tx_dropped,
4170 		   stats->tx_fifo_errors, stats->collisions,
4171 		   stats->tx_carrier_errors +
4172 		    stats->tx_aborted_errors +
4173 		    stats->tx_window_errors +
4174 		    stats->tx_heartbeat_errors,
4175 		   stats->tx_compressed);
4176 }
4177 
4178 /*
4179  *	Called from the PROCfs module. This now uses the new arbitrary sized
4180  *	/proc/net interface to create /proc/net/dev
4181  */
4182 static int dev_seq_show(struct seq_file *seq, void *v)
4183 {
4184 	if (v == SEQ_START_TOKEN)
4185 		seq_puts(seq, "Inter-|   Receive                            "
4186 			      "                    |  Transmit\n"
4187 			      " face |bytes    packets errs drop fifo frame "
4188 			      "compressed multicast|bytes    packets errs "
4189 			      "drop fifo colls carrier compressed\n");
4190 	else
4191 		dev_seq_printf_stats(seq, v);
4192 	return 0;
4193 }
4194 
4195 static struct softnet_data *softnet_get_online(loff_t *pos)
4196 {
4197 	struct softnet_data *sd = NULL;
4198 
4199 	while (*pos < nr_cpu_ids)
4200 		if (cpu_online(*pos)) {
4201 			sd = &per_cpu(softnet_data, *pos);
4202 			break;
4203 		} else
4204 			++*pos;
4205 	return sd;
4206 }
4207 
4208 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4209 {
4210 	return softnet_get_online(pos);
4211 }
4212 
4213 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4214 {
4215 	++*pos;
4216 	return softnet_get_online(pos);
4217 }
4218 
4219 static void softnet_seq_stop(struct seq_file *seq, void *v)
4220 {
4221 }
4222 
4223 static int softnet_seq_show(struct seq_file *seq, void *v)
4224 {
4225 	struct softnet_data *sd = v;
4226 
4227 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4228 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4229 		   0, 0, 0, 0, /* was fastroute */
4230 		   sd->cpu_collision, sd->received_rps);
4231 	return 0;
4232 }
4233 
4234 static const struct seq_operations dev_seq_ops = {
4235 	.start = dev_seq_start,
4236 	.next  = dev_seq_next,
4237 	.stop  = dev_seq_stop,
4238 	.show  = dev_seq_show,
4239 };
4240 
4241 static int dev_seq_open(struct inode *inode, struct file *file)
4242 {
4243 	return seq_open_net(inode, file, &dev_seq_ops,
4244 			    sizeof(struct seq_net_private));
4245 }
4246 
4247 static const struct file_operations dev_seq_fops = {
4248 	.owner	 = THIS_MODULE,
4249 	.open    = dev_seq_open,
4250 	.read    = seq_read,
4251 	.llseek  = seq_lseek,
4252 	.release = seq_release_net,
4253 };
4254 
4255 static const struct seq_operations softnet_seq_ops = {
4256 	.start = softnet_seq_start,
4257 	.next  = softnet_seq_next,
4258 	.stop  = softnet_seq_stop,
4259 	.show  = softnet_seq_show,
4260 };
4261 
4262 static int softnet_seq_open(struct inode *inode, struct file *file)
4263 {
4264 	return seq_open(file, &softnet_seq_ops);
4265 }
4266 
4267 static const struct file_operations softnet_seq_fops = {
4268 	.owner	 = THIS_MODULE,
4269 	.open    = softnet_seq_open,
4270 	.read    = seq_read,
4271 	.llseek  = seq_lseek,
4272 	.release = seq_release,
4273 };
4274 
4275 static void *ptype_get_idx(loff_t pos)
4276 {
4277 	struct packet_type *pt = NULL;
4278 	loff_t i = 0;
4279 	int t;
4280 
4281 	list_for_each_entry_rcu(pt, &ptype_all, list) {
4282 		if (i == pos)
4283 			return pt;
4284 		++i;
4285 	}
4286 
4287 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4288 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4289 			if (i == pos)
4290 				return pt;
4291 			++i;
4292 		}
4293 	}
4294 	return NULL;
4295 }
4296 
4297 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4298 	__acquires(RCU)
4299 {
4300 	rcu_read_lock();
4301 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4302 }
4303 
4304 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4305 {
4306 	struct packet_type *pt;
4307 	struct list_head *nxt;
4308 	int hash;
4309 
4310 	++*pos;
4311 	if (v == SEQ_START_TOKEN)
4312 		return ptype_get_idx(0);
4313 
4314 	pt = v;
4315 	nxt = pt->list.next;
4316 	if (pt->type == htons(ETH_P_ALL)) {
4317 		if (nxt != &ptype_all)
4318 			goto found;
4319 		hash = 0;
4320 		nxt = ptype_base[0].next;
4321 	} else
4322 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4323 
4324 	while (nxt == &ptype_base[hash]) {
4325 		if (++hash >= PTYPE_HASH_SIZE)
4326 			return NULL;
4327 		nxt = ptype_base[hash].next;
4328 	}
4329 found:
4330 	return list_entry(nxt, struct packet_type, list);
4331 }
4332 
4333 static void ptype_seq_stop(struct seq_file *seq, void *v)
4334 	__releases(RCU)
4335 {
4336 	rcu_read_unlock();
4337 }
4338 
4339 static int ptype_seq_show(struct seq_file *seq, void *v)
4340 {
4341 	struct packet_type *pt = v;
4342 
4343 	if (v == SEQ_START_TOKEN)
4344 		seq_puts(seq, "Type Device      Function\n");
4345 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4346 		if (pt->type == htons(ETH_P_ALL))
4347 			seq_puts(seq, "ALL ");
4348 		else
4349 			seq_printf(seq, "%04x", ntohs(pt->type));
4350 
4351 		seq_printf(seq, " %-8s %pF\n",
4352 			   pt->dev ? pt->dev->name : "", pt->func);
4353 	}
4354 
4355 	return 0;
4356 }
4357 
4358 static const struct seq_operations ptype_seq_ops = {
4359 	.start = ptype_seq_start,
4360 	.next  = ptype_seq_next,
4361 	.stop  = ptype_seq_stop,
4362 	.show  = ptype_seq_show,
4363 };
4364 
4365 static int ptype_seq_open(struct inode *inode, struct file *file)
4366 {
4367 	return seq_open_net(inode, file, &ptype_seq_ops,
4368 			sizeof(struct seq_net_private));
4369 }
4370 
4371 static const struct file_operations ptype_seq_fops = {
4372 	.owner	 = THIS_MODULE,
4373 	.open    = ptype_seq_open,
4374 	.read    = seq_read,
4375 	.llseek  = seq_lseek,
4376 	.release = seq_release_net,
4377 };
4378 
4379 
4380 static int __net_init dev_proc_net_init(struct net *net)
4381 {
4382 	int rc = -ENOMEM;
4383 
4384 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4385 		goto out;
4386 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4387 		goto out_dev;
4388 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4389 		goto out_softnet;
4390 
4391 	if (wext_proc_init(net))
4392 		goto out_ptype;
4393 	rc = 0;
4394 out:
4395 	return rc;
4396 out_ptype:
4397 	proc_net_remove(net, "ptype");
4398 out_softnet:
4399 	proc_net_remove(net, "softnet_stat");
4400 out_dev:
4401 	proc_net_remove(net, "dev");
4402 	goto out;
4403 }
4404 
4405 static void __net_exit dev_proc_net_exit(struct net *net)
4406 {
4407 	wext_proc_exit(net);
4408 
4409 	proc_net_remove(net, "ptype");
4410 	proc_net_remove(net, "softnet_stat");
4411 	proc_net_remove(net, "dev");
4412 }
4413 
4414 static struct pernet_operations __net_initdata dev_proc_ops = {
4415 	.init = dev_proc_net_init,
4416 	.exit = dev_proc_net_exit,
4417 };
4418 
4419 static int __init dev_proc_init(void)
4420 {
4421 	return register_pernet_subsys(&dev_proc_ops);
4422 }
4423 #else
4424 #define dev_proc_init() 0
4425 #endif	/* CONFIG_PROC_FS */
4426 
4427 
4428 /**
4429  *	netdev_set_master	-	set up master pointer
4430  *	@slave: slave device
4431  *	@master: new master device
4432  *
4433  *	Changes the master device of the slave. Pass %NULL to break the
4434  *	bonding. The caller must hold the RTNL semaphore. On a failure
4435  *	a negative errno code is returned. On success the reference counts
4436  *	are adjusted and the function returns zero.
4437  */
4438 int netdev_set_master(struct net_device *slave, struct net_device *master)
4439 {
4440 	struct net_device *old = slave->master;
4441 
4442 	ASSERT_RTNL();
4443 
4444 	if (master) {
4445 		if (old)
4446 			return -EBUSY;
4447 		dev_hold(master);
4448 	}
4449 
4450 	slave->master = master;
4451 
4452 	if (old)
4453 		dev_put(old);
4454 	return 0;
4455 }
4456 EXPORT_SYMBOL(netdev_set_master);
4457 
4458 /**
4459  *	netdev_set_bond_master	-	set up bonding master/slave pair
4460  *	@slave: slave device
4461  *	@master: new master device
4462  *
4463  *	Changes the master device of the slave. Pass %NULL to break the
4464  *	bonding. The caller must hold the RTNL semaphore. On a failure
4465  *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4466  *	to the routing socket and the function returns zero.
4467  */
4468 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4469 {
4470 	int err;
4471 
4472 	ASSERT_RTNL();
4473 
4474 	err = netdev_set_master(slave, master);
4475 	if (err)
4476 		return err;
4477 	if (master)
4478 		slave->flags |= IFF_SLAVE;
4479 	else
4480 		slave->flags &= ~IFF_SLAVE;
4481 
4482 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4483 	return 0;
4484 }
4485 EXPORT_SYMBOL(netdev_set_bond_master);
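/*
 * Illustrative sketch (not part of the original file): a bonding-style
 * driver attaches and later detaches a slave with this helper, always
 * under RTNL.  bond_dev and slave_dev are hypothetical.
 *
 *	ASSERT_RTNL();
 *	err = netdev_set_bond_master(slave_dev, bond_dev);
 *	...
 *	netdev_set_bond_master(slave_dev, NULL);
 */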
4486 
4487 static void dev_change_rx_flags(struct net_device *dev, int flags)
4488 {
4489 	const struct net_device_ops *ops = dev->netdev_ops;
4490 
4491 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4492 		ops->ndo_change_rx_flags(dev, flags);
4493 }
4494 
4495 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4496 {
4497 	unsigned int old_flags = dev->flags;
4498 	uid_t uid;
4499 	gid_t gid;
4500 
4501 	ASSERT_RTNL();
4502 
4503 	dev->flags |= IFF_PROMISC;
4504 	dev->promiscuity += inc;
4505 	if (dev->promiscuity == 0) {
4506 		/*
4507 		 * Avoid overflow.
4508 		 * If inc causes overflow, untouch promisc and return error.
4509 		 */
4510 		if (inc < 0)
4511 			dev->flags &= ~IFF_PROMISC;
4512 		else {
4513 			dev->promiscuity -= inc;
4514 			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4515 				dev->name);
4516 			return -EOVERFLOW;
4517 		}
4518 	}
4519 	if (dev->flags != old_flags) {
4520 		pr_info("device %s %s promiscuous mode\n",
4521 			dev->name,
4522 			dev->flags & IFF_PROMISC ? "entered" : "left");
4523 		if (audit_enabled) {
4524 			current_uid_gid(&uid, &gid);
4525 			audit_log(current->audit_context, GFP_ATOMIC,
4526 				AUDIT_ANOM_PROMISCUOUS,
4527 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4528 				dev->name, (dev->flags & IFF_PROMISC),
4529 				(old_flags & IFF_PROMISC),
4530 				audit_get_loginuid(current),
4531 				uid, gid,
4532 				audit_get_sessionid(current));
4533 		}
4534 
4535 		dev_change_rx_flags(dev, IFF_PROMISC);
4536 	}
4537 	return 0;
4538 }
4539 
4540 /**
4541  *	dev_set_promiscuity	- update promiscuity count on a device
4542  *	@dev: device
4543  *	@inc: modifier
4544  *
4545  *	Add or remove promiscuity from a device. While the count in the device
4546  *	remains above zero the interface remains promiscuous. Once it hits zero
4547  *	the device reverts back to normal filtering operation. A negative inc
4548  *	value is used to drop promiscuity on the device.
4549  *	Return 0 if successful or a negative errno code on error.
4550  */
4551 int dev_set_promiscuity(struct net_device *dev, int inc)
4552 {
4553 	unsigned int old_flags = dev->flags;
4554 	int err;
4555 
4556 	err = __dev_set_promiscuity(dev, inc);
4557 	if (err < 0)
4558 		return err;
4559 	if (dev->flags != old_flags)
4560 		dev_set_rx_mode(dev);
4561 	return err;
4562 }
4563 EXPORT_SYMBOL(dev_set_promiscuity);
4564 
4565 /**
4566  *	dev_set_allmulti	- update allmulti count on a device
4567  *	@dev: device
4568  *	@inc: modifier
4569  *
4570  *	Add or remove reception of all multicast frames to a device. While the
4571  *	count in the device remains above zero the interface remains listening
4572  *	to all multicast frames. Once it hits zero the device reverts back to normal
4573  *	filtering operation. A negative @inc value is used to drop the counter
4574  *	when releasing a resource needing all multicasts.
4575  *	Return 0 if successful or a negative errno code on error.
4576  */
4577 
4578 int dev_set_allmulti(struct net_device *dev, int inc)
4579 {
4580 	unsigned int old_flags = dev->flags;
4581 
4582 	ASSERT_RTNL();
4583 
4584 	dev->flags |= IFF_ALLMULTI;
4585 	dev->allmulti += inc;
4586 	if (dev->allmulti == 0) {
4587 		/*
4588 		 * Avoid overflow.
4589 		 * If inc causes overflow, untouch allmulti and return error.
4590 		 */
4591 		if (inc < 0)
4592 			dev->flags &= ~IFF_ALLMULTI;
4593 		else {
4594 			dev->allmulti -= inc;
4595 			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4596 				dev->name);
4597 			return -EOVERFLOW;
4598 		}
4599 	}
4600 	if (dev->flags ^ old_flags) {
4601 		dev_change_rx_flags(dev, IFF_ALLMULTI);
4602 		dev_set_rx_mode(dev);
4603 	}
4604 	return 0;
4605 }
4606 EXPORT_SYMBOL(dev_set_allmulti);
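/*
 * Illustrative sketch (not part of the original file): a stacked device
 * (bridge, bond, team) pushes its own promiscuity/allmulti needs down to
 * a lower device with these counting helpers, and undoes them with a
 * negative increment.  port_dev is hypothetical.
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(port_dev, 1);
 *	if (!err)
 *		err = dev_set_allmulti(port_dev, 1);
 *	...
 *	dev_set_allmulti(port_dev, -1);
 *	dev_set_promiscuity(port_dev, -1);
 *	rtnl_unlock();
 */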
4607 
4608 /*
4609  *	Upload unicast and multicast address lists to device and
4610  *	configure RX filtering. When the device doesn't support unicast
4611  *	filtering it is put in promiscuous mode while unicast addresses
4612  *	are present.
4613  */
4614 void __dev_set_rx_mode(struct net_device *dev)
4615 {
4616 	const struct net_device_ops *ops = dev->netdev_ops;
4617 
4618 	/* dev_open will call this function so the list will stay sane. */
4619 	if (!(dev->flags&IFF_UP))
4620 		return;
4621 
4622 	if (!netif_device_present(dev))
4623 		return;
4624 
4625 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4626 		/* Unicast address changes may only happen under the rtnl,
4627 		 * therefore calling __dev_set_promiscuity here is safe.
4628 		 */
4629 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4630 			__dev_set_promiscuity(dev, 1);
4631 			dev->uc_promisc = true;
4632 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4633 			__dev_set_promiscuity(dev, -1);
4634 			dev->uc_promisc = false;
4635 		}
4636 	}
4637 
4638 	if (ops->ndo_set_rx_mode)
4639 		ops->ndo_set_rx_mode(dev);
4640 }
4641 
4642 void dev_set_rx_mode(struct net_device *dev)
4643 {
4644 	netif_addr_lock_bh(dev);
4645 	__dev_set_rx_mode(dev);
4646 	netif_addr_unlock_bh(dev);
4647 }
4648 
4649 /**
4650  *	dev_get_flags - get flags reported to userspace
4651  *	@dev: device
4652  *
4653  *	Get the combination of flag bits exported through APIs to userspace.
4654  */
4655 unsigned int dev_get_flags(const struct net_device *dev)
4656 {
4657 	unsigned int flags;
4658 
4659 	flags = (dev->flags & ~(IFF_PROMISC |
4660 				IFF_ALLMULTI |
4661 				IFF_RUNNING |
4662 				IFF_LOWER_UP |
4663 				IFF_DORMANT)) |
4664 		(dev->gflags & (IFF_PROMISC |
4665 				IFF_ALLMULTI));
4666 
4667 	if (netif_running(dev)) {
4668 		if (netif_oper_up(dev))
4669 			flags |= IFF_RUNNING;
4670 		if (netif_carrier_ok(dev))
4671 			flags |= IFF_LOWER_UP;
4672 		if (netif_dormant(dev))
4673 			flags |= IFF_DORMANT;
4674 	}
4675 
4676 	return flags;
4677 }
4678 EXPORT_SYMBOL(dev_get_flags);
4679 
4680 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4681 {
4682 	unsigned int old_flags = dev->flags;
4683 	int ret;
4684 
4685 	ASSERT_RTNL();
4686 
4687 	/*
4688 	 *	Set the flags on our device.
4689 	 */
4690 
4691 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4692 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4693 			       IFF_AUTOMEDIA)) |
4694 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4695 				    IFF_ALLMULTI));
4696 
4697 	/*
4698 	 *	Load in the correct multicast list now the flags have changed.
4699 	 */
4700 
4701 	if ((old_flags ^ flags) & IFF_MULTICAST)
4702 		dev_change_rx_flags(dev, IFF_MULTICAST);
4703 
4704 	dev_set_rx_mode(dev);
4705 
4706 	/*
4707 	 *	Have we downed the interface? We handle IFF_UP ourselves
4708 	 *	according to user attempts to set it, rather than blindly
4709 	 *	setting it.
4710 	 */
4711 
4712 	ret = 0;
4713 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4714 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4715 
4716 		if (!ret)
4717 			dev_set_rx_mode(dev);
4718 	}
4719 
4720 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4721 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4722 
4723 		dev->gflags ^= IFF_PROMISC;
4724 		dev_set_promiscuity(dev, inc);
4725 	}
4726 
4727 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4728 	   is important. Some (broken) drivers set IFF_PROMISC when
4729 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
4730 	 */
4731 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4732 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4733 
4734 		dev->gflags ^= IFF_ALLMULTI;
4735 		dev_set_allmulti(dev, inc);
4736 	}
4737 
4738 	return ret;
4739 }
4740 
4741 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4742 {
4743 	unsigned int changes = dev->flags ^ old_flags;
4744 
4745 	if (changes & IFF_UP) {
4746 		if (dev->flags & IFF_UP)
4747 			call_netdevice_notifiers(NETDEV_UP, dev);
4748 		else
4749 			call_netdevice_notifiers(NETDEV_DOWN, dev);
4750 	}
4751 
4752 	if (dev->flags & IFF_UP &&
4753 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4754 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4755 }
4756 
4757 /**
4758  *	dev_change_flags - change device settings
4759  *	@dev: device
4760  *	@flags: device state flags
4761  *
4762  *	Change settings on device based state flags. The flags are
4763  *	in the userspace exported format.
4764  */
4765 int dev_change_flags(struct net_device *dev, unsigned int flags)
4766 {
4767 	int ret;
4768 	unsigned int changes, old_flags = dev->flags;
4769 
4770 	ret = __dev_change_flags(dev, flags);
4771 	if (ret < 0)
4772 		return ret;
4773 
4774 	changes = old_flags ^ dev->flags;
4775 	if (changes)
4776 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4777 
4778 	__dev_notify_flags(dev, old_flags);
4779 	return ret;
4780 }
4781 EXPORT_SYMBOL(dev_change_flags);
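/*
 * Illustrative sketch (not part of the original file): bringing an
 * interface up from inside the kernel the same way the SIOCSIFFLAGS
 * path below does, by changing the userspace-visible flags under RTNL.
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);
 *	rtnl_unlock();
 */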
4782 
4783 /**
4784  *	dev_set_mtu - Change maximum transfer unit
4785  *	@dev: device
4786  *	@new_mtu: new transfer unit
4787  *
4788  *	Change the maximum transfer size of the network device.
4789  */
4790 int dev_set_mtu(struct net_device *dev, int new_mtu)
4791 {
4792 	const struct net_device_ops *ops = dev->netdev_ops;
4793 	int err;
4794 
4795 	if (new_mtu == dev->mtu)
4796 		return 0;
4797 
4798 	/*	MTU must be positive.	 */
4799 	if (new_mtu < 0)
4800 		return -EINVAL;
4801 
4802 	if (!netif_device_present(dev))
4803 		return -ENODEV;
4804 
4805 	err = 0;
4806 	if (ops->ndo_change_mtu)
4807 		err = ops->ndo_change_mtu(dev, new_mtu);
4808 	else
4809 		dev->mtu = new_mtu;
4810 
4811 	if (!err && dev->flags & IFF_UP)
4812 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4813 	return err;
4814 }
4815 EXPORT_SYMBOL(dev_set_mtu);
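/*
 * Illustrative sketch (not part of the original file): callers change
 * the MTU under RTNL and let dev_set_mtu() handle the driver's
 * ndo_change_mtu and the NETDEV_CHANGEMTU notifier.
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);
 *	rtnl_unlock();
 */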
4816 
4817 /**
4818  *	dev_set_group - Change group this device belongs to
4819  *	@dev: device
4820  *	@new_group: group this device should belong to
4821  */
4822 void dev_set_group(struct net_device *dev, int new_group)
4823 {
4824 	dev->group = new_group;
4825 }
4826 EXPORT_SYMBOL(dev_set_group);
4827 
4828 /**
4829  *	dev_set_mac_address - Change Media Access Control Address
4830  *	@dev: device
4831  *	@sa: new address
4832  *
4833  *	Change the hardware (MAC) address of the device
4834  */
4835 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4836 {
4837 	const struct net_device_ops *ops = dev->netdev_ops;
4838 	int err;
4839 
4840 	if (!ops->ndo_set_mac_address)
4841 		return -EOPNOTSUPP;
4842 	if (sa->sa_family != dev->type)
4843 		return -EINVAL;
4844 	if (!netif_device_present(dev))
4845 		return -ENODEV;
4846 	err = ops->ndo_set_mac_address(dev, sa);
4847 	if (!err)
4848 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4849 	add_device_randomness(dev->dev_addr, dev->addr_len);
4850 	return err;
4851 }
4852 EXPORT_SYMBOL(dev_set_mac_address);
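/*
 * Illustrative sketch (not part of the original file): an in-kernel
 * caller mirrors the SIOCSIFHWADDR handling below; the sockaddr family
 * must match dev->type.  new_addr is a hypothetical buffer of
 * dev->addr_len bytes.
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_addr, dev->addr_len);
 *	rtnl_lock();
 *	err = dev_set_mac_address(dev, &sa);
 *	rtnl_unlock();
 */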
4853 
4854 /*
4855  *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4856  */
4857 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4858 {
4859 	int err;
4860 	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4861 
4862 	if (!dev)
4863 		return -ENODEV;
4864 
4865 	switch (cmd) {
4866 	case SIOCGIFFLAGS:	/* Get interface flags */
4867 		ifr->ifr_flags = (short) dev_get_flags(dev);
4868 		return 0;
4869 
4870 	case SIOCGIFMETRIC:	/* Get the metric on the interface
4871 				   (currently unused) */
4872 		ifr->ifr_metric = 0;
4873 		return 0;
4874 
4875 	case SIOCGIFMTU:	/* Get the MTU of a device */
4876 		ifr->ifr_mtu = dev->mtu;
4877 		return 0;
4878 
4879 	case SIOCGIFHWADDR:
4880 		if (!dev->addr_len)
4881 			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4882 		else
4883 			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4884 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4885 		ifr->ifr_hwaddr.sa_family = dev->type;
4886 		return 0;
4887 
4888 	case SIOCGIFSLAVE:
4889 		err = -EINVAL;
4890 		break;
4891 
4892 	case SIOCGIFMAP:
4893 		ifr->ifr_map.mem_start = dev->mem_start;
4894 		ifr->ifr_map.mem_end   = dev->mem_end;
4895 		ifr->ifr_map.base_addr = dev->base_addr;
4896 		ifr->ifr_map.irq       = dev->irq;
4897 		ifr->ifr_map.dma       = dev->dma;
4898 		ifr->ifr_map.port      = dev->if_port;
4899 		return 0;
4900 
4901 	case SIOCGIFINDEX:
4902 		ifr->ifr_ifindex = dev->ifindex;
4903 		return 0;
4904 
4905 	case SIOCGIFTXQLEN:
4906 		ifr->ifr_qlen = dev->tx_queue_len;
4907 		return 0;
4908 
4909 	default:
4910 		/* dev_ioctl() should ensure this case
4911 		 * is never reached
4912 		 */
4913 		WARN_ON(1);
4914 		err = -ENOTTY;
4915 		break;
4916 
4917 	}
4918 	return err;
4919 }
4920 
4921 /*
4922  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4923  */
4924 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4925 {
4926 	int err;
4927 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4928 	const struct net_device_ops *ops;
4929 
4930 	if (!dev)
4931 		return -ENODEV;
4932 
4933 	ops = dev->netdev_ops;
4934 
4935 	switch (cmd) {
4936 	case SIOCSIFFLAGS:	/* Set interface flags */
4937 		return dev_change_flags(dev, ifr->ifr_flags);
4938 
4939 	case SIOCSIFMETRIC:	/* Set the metric on the interface
4940 				   (currently unused) */
4941 		return -EOPNOTSUPP;
4942 
4943 	case SIOCSIFMTU:	/* Set the MTU of a device */
4944 		return dev_set_mtu(dev, ifr->ifr_mtu);
4945 
4946 	case SIOCSIFHWADDR:
4947 		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4948 
4949 	case SIOCSIFHWBROADCAST:
4950 		if (ifr->ifr_hwaddr.sa_family != dev->type)
4951 			return -EINVAL;
4952 		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4953 		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4954 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4955 		return 0;
4956 
4957 	case SIOCSIFMAP:
4958 		if (ops->ndo_set_config) {
4959 			if (!netif_device_present(dev))
4960 				return -ENODEV;
4961 			return ops->ndo_set_config(dev, &ifr->ifr_map);
4962 		}
4963 		return -EOPNOTSUPP;
4964 
4965 	case SIOCADDMULTI:
4966 		if (!ops->ndo_set_rx_mode ||
4967 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4968 			return -EINVAL;
4969 		if (!netif_device_present(dev))
4970 			return -ENODEV;
4971 		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4972 
4973 	case SIOCDELMULTI:
4974 		if (!ops->ndo_set_rx_mode ||
4975 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4976 			return -EINVAL;
4977 		if (!netif_device_present(dev))
4978 			return -ENODEV;
4979 		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4980 
4981 	case SIOCSIFTXQLEN:
4982 		if (ifr->ifr_qlen < 0)
4983 			return -EINVAL;
4984 		dev->tx_queue_len = ifr->ifr_qlen;
4985 		return 0;
4986 
4987 	case SIOCSIFNAME:
4988 		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4989 		return dev_change_name(dev, ifr->ifr_newname);
4990 
4991 	case SIOCSHWTSTAMP:
4992 		err = net_hwtstamp_validate(ifr);
4993 		if (err)
4994 			return err;
4995 		/* fall through */
4996 
4997 	/*
4998 	 *	Unknown or private ioctl
4999 	 */
5000 	default:
5001 		if ((cmd >= SIOCDEVPRIVATE &&
5002 		    cmd <= SIOCDEVPRIVATE + 15) ||
5003 		    cmd == SIOCBONDENSLAVE ||
5004 		    cmd == SIOCBONDRELEASE ||
5005 		    cmd == SIOCBONDSETHWADDR ||
5006 		    cmd == SIOCBONDSLAVEINFOQUERY ||
5007 		    cmd == SIOCBONDINFOQUERY ||
5008 		    cmd == SIOCBONDCHANGEACTIVE ||
5009 		    cmd == SIOCGMIIPHY ||
5010 		    cmd == SIOCGMIIREG ||
5011 		    cmd == SIOCSMIIREG ||
5012 		    cmd == SIOCBRADDIF ||
5013 		    cmd == SIOCBRDELIF ||
5014 		    cmd == SIOCSHWTSTAMP ||
5015 		    cmd == SIOCWANDEV) {
5016 			err = -EOPNOTSUPP;
5017 			if (ops->ndo_do_ioctl) {
5018 				if (netif_device_present(dev))
5019 					err = ops->ndo_do_ioctl(dev, ifr, cmd);
5020 				else
5021 					err = -ENODEV;
5022 			}
5023 		} else
5024 			err = -EINVAL;
5025 
5026 	}
5027 	return err;
5028 }
5029 
5030 /*
5031  *	This function handles all "interface"-type I/O control requests. The actual
5032  *	'doing' part of this is dev_ifsioc above.
5033  */
5034 
5035 /**
5036  *	dev_ioctl	-	network device ioctl
5037  *	@net: the applicable net namespace
5038  *	@cmd: command to issue
5039  *	@arg: pointer to a struct ifreq in user space
5040  *
5041  *	Issue ioctl functions to devices. This is normally called by the
5042  *	user space syscall interfaces but can sometimes be useful for
5043  *	other purposes. The return value is the return from the syscall if
5044  *	positive or a negative errno code on error.
5045  */
5046 
5047 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5048 {
5049 	struct ifreq ifr;
5050 	int ret;
5051 	char *colon;
5052 
5053 	/* One special case: SIOCGIFCONF takes an ifconf argument
5054 	   and requires a shared lock, because it sleeps while writing
5055 	   to user space.
5056 	 */
5057 
5058 	if (cmd == SIOCGIFCONF) {
5059 		rtnl_lock();
5060 		ret = dev_ifconf(net, (char __user *) arg);
5061 		rtnl_unlock();
5062 		return ret;
5063 	}
5064 	if (cmd == SIOCGIFNAME)
5065 		return dev_ifname(net, (struct ifreq __user *)arg);
5066 
5067 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5068 		return -EFAULT;
5069 
5070 	ifr.ifr_name[IFNAMSIZ-1] = 0;
5071 
5072 	colon = strchr(ifr.ifr_name, ':');
5073 	if (colon)
5074 		*colon = 0;
5075 
5076 	/*
5077 	 *	See which interface the caller is talking about.
5078 	 */
5079 
5080 	switch (cmd) {
5081 	/*
5082 	 *	These ioctl calls:
5083 	 *	- can be done by all.
5084 	 *	- atomic and do not require locking.
5085 	 *	- return a value
5086 	 */
5087 	case SIOCGIFFLAGS:
5088 	case SIOCGIFMETRIC:
5089 	case SIOCGIFMTU:
5090 	case SIOCGIFHWADDR:
5091 	case SIOCGIFSLAVE:
5092 	case SIOCGIFMAP:
5093 	case SIOCGIFINDEX:
5094 	case SIOCGIFTXQLEN:
5095 		dev_load(net, ifr.ifr_name);
5096 		rcu_read_lock();
5097 		ret = dev_ifsioc_locked(net, &ifr, cmd);
5098 		rcu_read_unlock();
5099 		if (!ret) {
5100 			if (colon)
5101 				*colon = ':';
5102 			if (copy_to_user(arg, &ifr,
5103 					 sizeof(struct ifreq)))
5104 				ret = -EFAULT;
5105 		}
5106 		return ret;
5107 
5108 	case SIOCETHTOOL:
5109 		dev_load(net, ifr.ifr_name);
5110 		rtnl_lock();
5111 		ret = dev_ethtool(net, &ifr);
5112 		rtnl_unlock();
5113 		if (!ret) {
5114 			if (colon)
5115 				*colon = ':';
5116 			if (copy_to_user(arg, &ifr,
5117 					 sizeof(struct ifreq)))
5118 				ret = -EFAULT;
5119 		}
5120 		return ret;
5121 
5122 	/*
5123 	 *	These ioctl calls:
5124 	 *	- require superuser power.
5125 	 *	- require strict serialization.
5126 	 *	- return a value
5127 	 */
5128 	case SIOCGMIIPHY:
5129 	case SIOCGMIIREG:
5130 	case SIOCSIFNAME:
5131 		if (!capable(CAP_NET_ADMIN))
5132 			return -EPERM;
5133 		dev_load(net, ifr.ifr_name);
5134 		rtnl_lock();
5135 		ret = dev_ifsioc(net, &ifr, cmd);
5136 		rtnl_unlock();
5137 		if (!ret) {
5138 			if (colon)
5139 				*colon = ':';
5140 			if (copy_to_user(arg, &ifr,
5141 					 sizeof(struct ifreq)))
5142 				ret = -EFAULT;
5143 		}
5144 		return ret;
5145 
5146 	/*
5147 	 *	These ioctl calls:
5148 	 *	- require superuser power.
5149 	 *	- require strict serialization.
5150 	 *	- do not return a value
5151 	 */
5152 	case SIOCSIFFLAGS:
5153 	case SIOCSIFMETRIC:
5154 	case SIOCSIFMTU:
5155 	case SIOCSIFMAP:
5156 	case SIOCSIFHWADDR:
5157 	case SIOCSIFSLAVE:
5158 	case SIOCADDMULTI:
5159 	case SIOCDELMULTI:
5160 	case SIOCSIFHWBROADCAST:
5161 	case SIOCSIFTXQLEN:
5162 	case SIOCSMIIREG:
5163 	case SIOCBONDENSLAVE:
5164 	case SIOCBONDRELEASE:
5165 	case SIOCBONDSETHWADDR:
5166 	case SIOCBONDCHANGEACTIVE:
5167 	case SIOCBRADDIF:
5168 	case SIOCBRDELIF:
5169 	case SIOCSHWTSTAMP:
5170 		if (!capable(CAP_NET_ADMIN))
5171 			return -EPERM;
5172 		/* fall through */
5173 	case SIOCBONDSLAVEINFOQUERY:
5174 	case SIOCBONDINFOQUERY:
5175 		dev_load(net, ifr.ifr_name);
5176 		rtnl_lock();
5177 		ret = dev_ifsioc(net, &ifr, cmd);
5178 		rtnl_unlock();
5179 		return ret;
5180 
5181 	case SIOCGIFMEM:
5182 		/* Get the per device memory space. We can add this but
5183 		 * currently do not support it */
5184 	case SIOCSIFMEM:
5185 		/* Set the per device memory buffer space.
5186 		 * Not applicable in our case */
5187 	case SIOCSIFLINK:
5188 		return -ENOTTY;
5189 
5190 	/*
5191 	 *	Unknown or private ioctl.
5192 	 */
5193 	default:
5194 		if (cmd == SIOCWANDEV ||
5195 		    (cmd >= SIOCDEVPRIVATE &&
5196 		     cmd <= SIOCDEVPRIVATE + 15)) {
5197 			dev_load(net, ifr.ifr_name);
5198 			rtnl_lock();
5199 			ret = dev_ifsioc(net, &ifr, cmd);
5200 			rtnl_unlock();
5201 			if (!ret && copy_to_user(arg, &ifr,
5202 						 sizeof(struct ifreq)))
5203 				ret = -EFAULT;
5204 			return ret;
5205 		}
5206 		/* Take care of Wireless Extensions */
5207 		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5208 			return wext_handle_ioctl(net, &ifr, cmd, arg);
5209 		return -ENOTTY;
5210 	}
5211 }
5212 
5213 
5214 /**
5215  *	dev_new_index	-	allocate an ifindex
5216  *	@net: the applicable net namespace
5217  *
5218  *	Returns a suitable unique value for a new device interface
5219  *	number.  The caller must hold the rtnl semaphore or the
5220  *	dev_base_lock to be sure it remains unique.
5221  */
5222 static int dev_new_index(struct net *net)
5223 {
5224 	static int ifindex;
5225 	for (;;) {
5226 		if (++ifindex <= 0)
5227 			ifindex = 1;
5228 		if (!__dev_get_by_index(net, ifindex))
5229 			return ifindex;
5230 	}
5231 }
5232 
5233 /* Delayed registration/unregisteration */
5234 static LIST_HEAD(net_todo_list);
5235 
5236 static void net_set_todo(struct net_device *dev)
5237 {
5238 	list_add_tail(&dev->todo_list, &net_todo_list);
5239 }
5240 
5241 static void rollback_registered_many(struct list_head *head)
5242 {
5243 	struct net_device *dev, *tmp;
5244 
5245 	BUG_ON(dev_boot_phase);
5246 	ASSERT_RTNL();
5247 
5248 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5249 		/* Some devices call this without ever having been registered,
5250 		 * as part of initialization unwind. Remove those
5251 		 * devices and proceed with the remaining ones.
5252 		 */
5253 		if (dev->reg_state == NETREG_UNINITIALIZED) {
5254 			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5255 				 dev->name, dev);
5256 
5257 			WARN_ON(1);
5258 			list_del(&dev->unreg_list);
5259 			continue;
5260 		}
5261 		dev->dismantle = true;
5262 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5263 	}
5264 
5265 	/* If device is running, close it first. */
5266 	dev_close_many(head);
5267 
5268 	list_for_each_entry(dev, head, unreg_list) {
5269 		/* And unlink it from device chain. */
5270 		unlist_netdevice(dev);
5271 
5272 		dev->reg_state = NETREG_UNREGISTERING;
5273 	}
5274 
5275 	synchronize_net();
5276 
5277 	list_for_each_entry(dev, head, unreg_list) {
5278 		/* Shutdown queueing discipline. */
5279 		dev_shutdown(dev);
5280 
5281 
5282 		/* Notify protocols that we are about to destroy
5283 		   this device. They should clean up all their state.
5284 		*/
5285 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5286 
5287 		if (!dev->rtnl_link_ops ||
5288 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5289 			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5290 
5291 		/*
5292 		 *	Flush the unicast and multicast chains
5293 		 */
5294 		dev_uc_flush(dev);
5295 		dev_mc_flush(dev);
5296 
5297 		if (dev->netdev_ops->ndo_uninit)
5298 			dev->netdev_ops->ndo_uninit(dev);
5299 
5300 		/* Notifier chain MUST detach us from master device. */
5301 		WARN_ON(dev->master);
5302 
5303 		/* Remove entries from kobject tree */
5304 		netdev_unregister_kobject(dev);
5305 	}
5306 
5307 	/* Process any work delayed until the end of the batch */
5308 	dev = list_first_entry(head, struct net_device, unreg_list);
5309 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5310 
5311 	synchronize_net();
5312 
5313 	list_for_each_entry(dev, head, unreg_list)
5314 		dev_put(dev);
5315 }
5316 
5317 static void rollback_registered(struct net_device *dev)
5318 {
5319 	LIST_HEAD(single);
5320 
5321 	list_add(&dev->unreg_list, &single);
5322 	rollback_registered_many(&single);
5323 	list_del(&single);
5324 }
5325 
5326 static netdev_features_t netdev_fix_features(struct net_device *dev,
5327 	netdev_features_t features)
5328 {
5329 	/* Fix illegal checksum combinations */
5330 	if ((features & NETIF_F_HW_CSUM) &&
5331 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5332 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5333 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5334 	}
5335 
5336 	/* Fix illegal SG+CSUM combinations. */
5337 	if ((features & NETIF_F_SG) &&
5338 	    !(features & NETIF_F_ALL_CSUM)) {
5339 		netdev_dbg(dev,
5340 			"Dropping NETIF_F_SG since no checksum feature.\n");
5341 		features &= ~NETIF_F_SG;
5342 	}
5343 
5344 	/* TSO requires that SG is present as well. */
5345 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5346 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5347 		features &= ~NETIF_F_ALL_TSO;
5348 	}
5349 
5350 	/* TSO ECN requires that TSO is present as well. */
5351 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5352 		features &= ~NETIF_F_TSO_ECN;
5353 
5354 	/* Software GSO depends on SG. */
5355 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5356 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5357 		features &= ~NETIF_F_GSO;
5358 	}
5359 
5360 	/* UFO needs SG and checksumming */
5361 	if (features & NETIF_F_UFO) {
5362 		/* maybe split UFO into V4 and V6? */
5363 		if (!((features & NETIF_F_GEN_CSUM) ||
5364 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5365 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5366 			netdev_dbg(dev,
5367 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5368 			features &= ~NETIF_F_UFO;
5369 		}
5370 
5371 		if (!(features & NETIF_F_SG)) {
5372 			netdev_dbg(dev,
5373 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5374 			features &= ~NETIF_F_UFO;
5375 		}
5376 	}
5377 
5378 	return features;
5379 }
5380 
5381 int __netdev_update_features(struct net_device *dev)
5382 {
5383 	netdev_features_t features;
5384 	int err = 0;
5385 
5386 	ASSERT_RTNL();
5387 
5388 	features = netdev_get_wanted_features(dev);
5389 
5390 	if (dev->netdev_ops->ndo_fix_features)
5391 		features = dev->netdev_ops->ndo_fix_features(dev, features);
5392 
5393 	/* driver might be less strict about feature dependencies */
5394 	features = netdev_fix_features(dev, features);
5395 
5396 	if (dev->features == features)
5397 		return 0;
5398 
5399 	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5400 		&dev->features, &features);
5401 
5402 	if (dev->netdev_ops->ndo_set_features)
5403 		err = dev->netdev_ops->ndo_set_features(dev, features);
5404 
5405 	if (unlikely(err < 0)) {
5406 		netdev_err(dev,
5407 			"set_features() failed (%d); wanted %pNF, left %pNF\n",
5408 			err, &features, &dev->features);
5409 		return -1;
5410 	}
5411 
5412 	if (!err)
5413 		dev->features = features;
5414 
5415 	return 1;
5416 }
5417 
5418 /**
5419  *	netdev_update_features - recalculate device features
5420  *	@dev: the device to check
5421  *
5422  *	Recalculate dev->features set and send notifications if it
5423  *	has changed. Should be called after driver or hardware dependent
5424  *	conditions might have changed that influence the features.
5425  */
5426 void netdev_update_features(struct net_device *dev)
5427 {
5428 	if (__netdev_update_features(dev))
5429 		netdev_features_change(dev);
5430 }
5431 EXPORT_SYMBOL(netdev_update_features);
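/*
 * Illustrative sketch (not part of the original file): when a
 * driver-internal condition that feeds its ndo_fix_features() callback
 * changes, the driver asks the core to recompute, under RTNL.
 *
 *	rtnl_lock();
 *	netdev_update_features(dev);
 *	rtnl_unlock();
 */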
5432 
5433 /**
5434  *	netdev_change_features - recalculate device features
5435  *	@dev: the device to check
5436  *
5437  *	Recalculate dev->features set and send notifications even
5438  *	if they have not changed. Should be called instead of
5439  *	netdev_update_features() if also dev->vlan_features might
5440  *	have changed to allow the changes to be propagated to stacked
5441  *	VLAN devices.
5442  */
5443 void netdev_change_features(struct net_device *dev)
5444 {
5445 	__netdev_update_features(dev);
5446 	netdev_features_change(dev);
5447 }
5448 EXPORT_SYMBOL(netdev_change_features);
5449 
5450 /**
5451  *	netif_stacked_transfer_operstate -	transfer operstate
5452  *	@rootdev: the root or lower level device to transfer state from
5453  *	@dev: the device to transfer operstate to
5454  *
5455  *	Transfer operational state from root to device. This is normally
5456  *	called when a stacking relationship exists between the root
5457  *	device and the device (a leaf device).
5458  */
5459 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5460 					struct net_device *dev)
5461 {
5462 	if (rootdev->operstate == IF_OPER_DORMANT)
5463 		netif_dormant_on(dev);
5464 	else
5465 		netif_dormant_off(dev);
5466 
5467 	if (netif_carrier_ok(rootdev)) {
5468 		if (!netif_carrier_ok(dev))
5469 			netif_carrier_on(dev);
5470 	} else {
5471 		if (netif_carrier_ok(dev))
5472 			netif_carrier_off(dev);
5473 	}
5474 }
5475 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5476 
5477 #ifdef CONFIG_RPS
5478 static int netif_alloc_rx_queues(struct net_device *dev)
5479 {
5480 	unsigned int i, count = dev->num_rx_queues;
5481 	struct netdev_rx_queue *rx;
5482 
5483 	BUG_ON(count < 1);
5484 
5485 	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5486 	if (!rx) {
5487 		pr_err("netdev: Unable to allocate %u rx queues\n", count);
5488 		return -ENOMEM;
5489 	}
5490 	dev->_rx = rx;
5491 
5492 	for (i = 0; i < count; i++)
5493 		rx[i].dev = dev;
5494 	return 0;
5495 }
5496 #endif
5497 
5498 static void netdev_init_one_queue(struct net_device *dev,
5499 				  struct netdev_queue *queue, void *_unused)
5500 {
5501 	/* Initialize queue lock */
5502 	spin_lock_init(&queue->_xmit_lock);
5503 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5504 	queue->xmit_lock_owner = -1;
5505 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5506 	queue->dev = dev;
5507 #ifdef CONFIG_BQL
5508 	dql_init(&queue->dql, HZ);
5509 #endif
5510 }
5511 
5512 static int netif_alloc_netdev_queues(struct net_device *dev)
5513 {
5514 	unsigned int count = dev->num_tx_queues;
5515 	struct netdev_queue *tx;
5516 
5517 	BUG_ON(count < 1);
5518 
5519 	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5520 	if (!tx) {
5521 		pr_err("netdev: Unable to allocate %u tx queues\n", count);
5522 		return -ENOMEM;
5523 	}
5524 	dev->_tx = tx;
5525 
5526 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5527 	spin_lock_init(&dev->tx_global_lock);
5528 
5529 	return 0;
5530 }
5531 
5532 /**
5533  *	register_netdevice	- register a network device
5534  *	@dev: device to register
5535  *
5536  *	Take a completed network device structure and add it to the kernel
5537  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5538  *	chain. 0 is returned on success. A negative errno code is returned
5539  *	on a failure to set up the device, or if the name is a duplicate.
5540  *
5541  *	Callers must hold the rtnl semaphore. You may want
5542  *	register_netdev() instead of this.
5543  *
5544  *	BUGS:
5545  *	The locking appears insufficient to guarantee two parallel registers
5546  *	will not get the same name.
5547  */
5548 
5549 int register_netdevice(struct net_device *dev)
5550 {
5551 	int ret;
5552 	struct net *net = dev_net(dev);
5553 
5554 	BUG_ON(dev_boot_phase);
5555 	ASSERT_RTNL();
5556 
5557 	might_sleep();
5558 
5559 	/* When net_devices are persistent, this will be fatal. */
5560 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5561 	BUG_ON(!net);
5562 
5563 	spin_lock_init(&dev->addr_list_lock);
5564 	netdev_set_addr_lockdep_class(dev);
5565 
5566 	dev->iflink = -1;
5567 
5568 	ret = dev_get_valid_name(dev, dev->name);
5569 	if (ret < 0)
5570 		goto out;
5571 
5572 	/* Init, if this function is available */
5573 	if (dev->netdev_ops->ndo_init) {
5574 		ret = dev->netdev_ops->ndo_init(dev);
5575 		if (ret) {
5576 			if (ret > 0)
5577 				ret = -EIO;
5578 			goto out;
5579 		}
5580 	}
5581 
5582 	ret = -EBUSY;
5583 	if (!dev->ifindex)
5584 		dev->ifindex = dev_new_index(net);
5585 	else if (__dev_get_by_index(net, dev->ifindex))
5586 		goto err_uninit;
5587 
5588 	if (dev->iflink == -1)
5589 		dev->iflink = dev->ifindex;
5590 
5591 	/* Transfer changeable features to wanted_features and enable
5592 	 * software offloads (GSO and GRO).
5593 	 */
5594 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5595 	dev->features |= NETIF_F_SOFT_FEATURES;
5596 	dev->wanted_features = dev->features & dev->hw_features;
5597 
5598 	/* Turn on no cache copy if HW is doing checksum */
5599 	if (!(dev->flags & IFF_LOOPBACK)) {
5600 		dev->hw_features |= NETIF_F_NOCACHE_COPY;
5601 		if (dev->features & NETIF_F_ALL_CSUM) {
5602 			dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5603 			dev->features |= NETIF_F_NOCACHE_COPY;
5604 		}
5605 	}
5606 
5607 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5608 	 */
5609 	dev->vlan_features |= NETIF_F_HIGHDMA;
5610 
5611 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5612 	ret = notifier_to_errno(ret);
5613 	if (ret)
5614 		goto err_uninit;
5615 
5616 	ret = netdev_register_kobject(dev);
5617 	if (ret)
5618 		goto err_uninit;
5619 	dev->reg_state = NETREG_REGISTERED;
5620 
5621 	__netdev_update_features(dev);
5622 
5623 	/*
5624 	 *	Default initial state at registry is that the
5625 	 *	device is present.
5626 	 */
5627 
5628 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5629 
5630 	dev_init_scheduler(dev);
5631 	dev_hold(dev);
5632 	list_netdevice(dev);
5633 	add_device_randomness(dev->dev_addr, dev->addr_len);
5634 
5635 	/* Notify protocols, that a new device appeared. */
5636 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5637 	ret = notifier_to_errno(ret);
5638 	if (ret) {
5639 		rollback_registered(dev);
5640 		dev->reg_state = NETREG_UNREGISTERED;
5641 	}
5642 	/*
5643 	 *	Prevent userspace races by waiting until the network
5644 	 *	device is fully setup before sending notifications.
5645 	 */
5646 	if (!dev->rtnl_link_ops ||
5647 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5648 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5649 
5650 out:
5651 	return ret;
5652 
5653 err_uninit:
5654 	if (dev->netdev_ops->ndo_uninit)
5655 		dev->netdev_ops->ndo_uninit(dev);
5656 	goto out;
5657 }
5658 EXPORT_SYMBOL(register_netdevice);
5659 
5660 /**
5661  *	init_dummy_netdev	- init a dummy network device for NAPI
5662  *	@dev: device to init
5663  *
5664  *	This takes a network device structure and initializes the minimum
5665  *	number of fields so it can be used to schedule NAPI polls without
5666  *	registering a full blown interface. This is to be used by drivers
5667  *	that need to tie several hardware interfaces to a single NAPI
5668  *	poll scheduler due to HW limitations.
5669  */
5670 int init_dummy_netdev(struct net_device *dev)
5671 {
5672 	/* Clear everything. Note we don't initialize spinlocks
5673 	 * as they aren't supposed to be taken by any of the
5674 	 * NAPI code and this dummy netdev is supposed to be
5675 	 * only ever used for NAPI polls
5676 	 */
5677 	memset(dev, 0, sizeof(struct net_device));
5678 
5679 	/* make sure we BUG if trying to hit standard
5680 	 * register/unregister code path
5681 	 */
5682 	dev->reg_state = NETREG_DUMMY;
5683 
5684 	/* NAPI wants this */
5685 	INIT_LIST_HEAD(&dev->napi_list);
5686 
5687 	/* a dummy interface is started by default */
5688 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5689 	set_bit(__LINK_STATE_START, &dev->state);
5690 
5691 	/* Note: We don't allocate pcpu_refcnt for dummy devices,
5692 	 * because users of this 'device' don't need to change
5693 	 * its refcount.
5694 	 */
5695 
5696 	return 0;
5697 }
5698 EXPORT_SYMBOL_GPL(init_dummy_netdev);
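/*
 * Illustrative sketch (not part of the original file): a driver that owns
 * several hardware channels but wants one poll context can hang its NAPI
 * instance off a dummy netdev.  "priv" is a hypothetical driver structure
 * embedding both.
 *
 *	init_dummy_netdev(&priv->dummy_dev);
 *	netif_napi_add(&priv->dummy_dev, &priv->napi, my_poll, 64);
 *	napi_enable(&priv->napi);
 */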
5699 
5700 
5701 /**
5702  *	register_netdev	- register a network device
5703  *	@dev: device to register
5704  *
5705  *	Take a completed network device structure and add it to the kernel
5706  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5707  *	chain. 0 is returned on success. A negative errno code is returned
5708  *	on a failure to set up the device, or if the name is a duplicate.
5709  *
5710  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5711  *	and expands the device name if you passed a format string to
5712  *	alloc_netdev.
5713  */
5714 int register_netdev(struct net_device *dev)
5715 {
5716 	int err;
5717 
5718 	rtnl_lock();
5719 	err = register_netdevice(dev);
5720 	rtnl_unlock();
5721 	return err;
5722 }
5723 EXPORT_SYMBOL(register_netdev);
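/*
 * Illustrative sketch (not part of the original file): the usual probe
 * sequence allocates the device, fills in netdev_ops and then calls
 * register_netdev(), which takes RTNL internally.  struct my_priv and
 * my_netdev_ops are hypothetical.
 *
 *	dev = alloc_etherdev(sizeof(struct my_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &my_netdev_ops;
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */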
5724 
5725 int netdev_refcnt_read(const struct net_device *dev)
5726 {
5727 	int i, refcnt = 0;
5728 
5729 	for_each_possible_cpu(i)
5730 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5731 	return refcnt;
5732 }
5733 EXPORT_SYMBOL(netdev_refcnt_read);
5734 
5735 /**
5736  * netdev_wait_allrefs - wait until all references are gone.
5737  *
5738  * This is called when unregistering network devices.
5739  *
5740  * Any protocol or device that holds a reference should register
5741  * for netdevice notification, and cleanup and put back the
5742  * reference if they receive an UNREGISTER event.
5743  * We can get stuck here if buggy protocols don't correctly
5744  * call dev_put.
5745  */
5746 static void netdev_wait_allrefs(struct net_device *dev)
5747 {
5748 	unsigned long rebroadcast_time, warning_time;
5749 	int refcnt;
5750 
5751 	linkwatch_forget_dev(dev);
5752 
5753 	rebroadcast_time = warning_time = jiffies;
5754 	refcnt = netdev_refcnt_read(dev);
5755 
5756 	while (refcnt != 0) {
5757 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5758 			rtnl_lock();
5759 
5760 			/* Rebroadcast unregister notification */
5761 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5762 			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5763 			 * should have already handled it the first time */
5764 
5765 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5766 				     &dev->state)) {
5767 				/* We must not have linkwatch events
5768 				 * pending on unregister. If this
5769 				 * happens, we simply run the queue
5770 				 * unscheduled, resulting in a noop
5771 				 * for this device.
5772 				 */
5773 				linkwatch_run_queue();
5774 			}
5775 
5776 			__rtnl_unlock();
5777 
5778 			rebroadcast_time = jiffies;
5779 		}
5780 
5781 		msleep(250);
5782 
5783 		refcnt = netdev_refcnt_read(dev);
5784 
5785 		if (time_after(jiffies, warning_time + 10 * HZ)) {
5786 			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5787 				 dev->name, refcnt);
5788 			warning_time = jiffies;
5789 		}
5790 	}
5791 }
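
/*
 * Editorial sketch (not part of the original source): the kind of notifier
 * the comment above asks for.  A subsystem that took a long-lived reference
 * with dev_hold() registers for netdevice events and drops it again on
 * NETDEV_UNREGISTER, so the loop above can terminate.  foo_is_tracking()
 * and foo_forget() are hypothetical; note that in this kernel version the
 * notifier payload pointer is the struct net_device itself.
 */
static int foo_netdev_event(struct notifier_block *nb,
			    unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_UNREGISTER && foo_is_tracking(dev)) {
		foo_forget(dev);	/* drop our private state */
		dev_put(dev);		/* balance the earlier dev_hold() */
	}
	return NOTIFY_DONE;
}

static struct notifier_block foo_netdev_notifier = {
	.notifier_call = foo_netdev_event,
};
/* registered at init time with register_netdevice_notifier(&foo_netdev_notifier) */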
5792 
5793 /* The sequence is:
5794  *
5795  *	rtnl_lock();
5796  *	...
5797  *	register_netdevice(x1);
5798  *	register_netdevice(x2);
5799  *	...
5800  *	unregister_netdevice(y1);
5801  *	unregister_netdevice(y2);
5802  *      ...
5803  *	rtnl_unlock();
5804  *	free_netdev(y1);
5805  *	free_netdev(y2);
5806  *
5807  * We are invoked by rtnl_unlock().
5808  * This allows us to deal with problems:
5809  * 1) We can delete sysfs objects which invoke hotplug
5810  *    without deadlocking with linkwatch via keventd.
5811  * 2) Since we run with the RTNL semaphore not held, we can sleep
5812  *    safely in order to wait for the netdev refcnt to drop to zero.
5813  *
5814  * We must not return until all unregister events added during
5815  * the interval the lock was held have been completed.
5816  */
5817 void netdev_run_todo(void)
5818 {
5819 	struct list_head list;
5820 
5821 	/* Snapshot list, allow later requests */
5822 	list_replace_init(&net_todo_list, &list);
5823 
5824 	__rtnl_unlock();
5825 
5826 	/* Wait for rcu callbacks to finish before attempting to drain
5827 	 * the device list.  This usually avoids a 250ms wait.
5828 	 */
5829 	if (!list_empty(&list))
5830 		rcu_barrier();
5831 
5832 	while (!list_empty(&list)) {
5833 		struct net_device *dev
5834 			= list_first_entry(&list, struct net_device, todo_list);
5835 		list_del(&dev->todo_list);
5836 
5837 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5838 			pr_err("network todo '%s' but state %d\n",
5839 			       dev->name, dev->reg_state);
5840 			dump_stack();
5841 			continue;
5842 		}
5843 
5844 		dev->reg_state = NETREG_UNREGISTERED;
5845 
5846 		on_each_cpu(flush_backlog, dev, 1);
5847 
5848 		netdev_wait_allrefs(dev);
5849 
5850 		/* paranoia */
5851 		BUG_ON(netdev_refcnt_read(dev));
5852 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
5853 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5854 		WARN_ON(dev->dn_ptr);
5855 
5856 		if (dev->destructor)
5857 			dev->destructor(dev);
5858 
5859 		/* Free network device */
5860 		kobject_put(&dev->dev.kobj);
5861 	}
5862 }
5863 
5864 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5865  * fields in the same order, with only the type differing.
5866  */
5867 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5868 			     const struct net_device_stats *netdev_stats)
5869 {
5870 #if BITS_PER_LONG == 64
5871 	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5872 	memcpy(stats64, netdev_stats, sizeof(*stats64));
5873 #else
5874 	size_t i, n = sizeof(*stats64) / sizeof(u64);
5875 	const unsigned long *src = (const unsigned long *)netdev_stats;
5876 	u64 *dst = (u64 *)stats64;
5877 
5878 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5879 		     sizeof(*stats64) / sizeof(u64));
5880 	for (i = 0; i < n; i++)
5881 		dst[i] = src[i];
5882 #endif
5883 }
5884 EXPORT_SYMBOL(netdev_stats_to_stats64);
5885 
5886 /**
5887  *	dev_get_stats	- get network device statistics
5888  *	@dev: device to get statistics from
5889  *	@storage: place to store stats
5890  *
5891  *	Get network statistics from device. Return @storage.
5892  *	The device driver may provide its own method by setting
5893  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5894  *	otherwise the internal statistics structure is used.
5895  */
5896 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5897 					struct rtnl_link_stats64 *storage)
5898 {
5899 	const struct net_device_ops *ops = dev->netdev_ops;
5900 
5901 	if (ops->ndo_get_stats64) {
5902 		memset(storage, 0, sizeof(*storage));
5903 		ops->ndo_get_stats64(dev, storage);
5904 	} else if (ops->ndo_get_stats) {
5905 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5906 	} else {
5907 		netdev_stats_to_stats64(storage, &dev->stats);
5908 	}
5909 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5910 	return storage;
5911 }
5912 EXPORT_SYMBOL(dev_get_stats);
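
/*
 * Editorial sketch (not part of the original source): a typical caller
 * keeps the 64-bit snapshot on the stack and lets dev_get_stats() choose
 * whichever driver method is available.  foo_dump_stats() is hypothetical.
 */
static void foo_dump_stats(struct net_device *dev)
{
	struct rtnl_link_stats64 stats;

	dev_get_stats(dev, &stats);
	netdev_info(dev, "rx %llu tx %llu dropped %llu\n",
		    (unsigned long long)stats.rx_packets,
		    (unsigned long long)stats.tx_packets,
		    (unsigned long long)stats.rx_dropped);
}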
5913 
5914 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5915 {
5916 	struct netdev_queue *queue = dev_ingress_queue(dev);
5917 
5918 #ifdef CONFIG_NET_CLS_ACT
5919 	if (queue)
5920 		return queue;
5921 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5922 	if (!queue)
5923 		return NULL;
5924 	netdev_init_one_queue(dev, queue, NULL);
5925 	queue->qdisc = &noop_qdisc;
5926 	queue->qdisc_sleeping = &noop_qdisc;
5927 	rcu_assign_pointer(dev->ingress_queue, queue);
5928 #endif
5929 	return queue;
5930 }
5931 
5932 /**
5933  *	alloc_netdev_mqs - allocate network device
5934  *	@sizeof_priv:	size of private data to allocate space for
5935  *	@name:		device name format string
5936  *	@setup:		callback to initialize device
5937  *	@txqs:		the number of TX subqueues to allocate
5938  *	@rxqs:		the number of RX subqueues to allocate
5939  *
5940  *	Allocates a struct net_device with private data area for driver use
5941  *	and performs basic initialization.  Also allocates subqueue structs
5942  *	for each queue on the device.
5943  */
5944 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5945 		void (*setup)(struct net_device *),
5946 		unsigned int txqs, unsigned int rxqs)
5947 {
5948 	struct net_device *dev;
5949 	size_t alloc_size;
5950 	struct net_device *p;
5951 
5952 	BUG_ON(strlen(name) >= sizeof(dev->name));
5953 
5954 	if (txqs < 1) {
5955 		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5956 		return NULL;
5957 	}
5958 
5959 #ifdef CONFIG_RPS
5960 	if (rxqs < 1) {
5961 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5962 		return NULL;
5963 	}
5964 #endif
5965 
5966 	alloc_size = sizeof(struct net_device);
5967 	if (sizeof_priv) {
5968 		/* ensure 32-byte alignment of private area */
5969 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5970 		alloc_size += sizeof_priv;
5971 	}
5972 	/* ensure 32-byte alignment of whole construct */
5973 	alloc_size += NETDEV_ALIGN - 1;
5974 
5975 	p = kzalloc(alloc_size, GFP_KERNEL);
5976 	if (!p) {
5977 		pr_err("alloc_netdev: Unable to allocate device\n");
5978 		return NULL;
5979 	}
5980 
5981 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5982 	dev->padded = (char *)dev - (char *)p;
5983 
5984 	dev->pcpu_refcnt = alloc_percpu(int);
5985 	if (!dev->pcpu_refcnt)
5986 		goto free_p;
5987 
5988 	if (dev_addr_init(dev))
5989 		goto free_pcpu;
5990 
5991 	dev_mc_init(dev);
5992 	dev_uc_init(dev);
5993 
5994 	dev_net_set(dev, &init_net);
5995 
5996 	dev->gso_max_size = GSO_MAX_SIZE;
5997 	dev->gso_max_segs = GSO_MAX_SEGS;
5998 
5999 	INIT_LIST_HEAD(&dev->napi_list);
6000 	INIT_LIST_HEAD(&dev->unreg_list);
6001 	INIT_LIST_HEAD(&dev->link_watch_list);
6002 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
6003 	setup(dev);
6004 
6005 	dev->num_tx_queues = txqs;
6006 	dev->real_num_tx_queues = txqs;
6007 	if (netif_alloc_netdev_queues(dev))
6008 		goto free_all;
6009 
6010 #ifdef CONFIG_RPS
6011 	dev->num_rx_queues = rxqs;
6012 	dev->real_num_rx_queues = rxqs;
6013 	if (netif_alloc_rx_queues(dev))
6014 		goto free_all;
6015 #endif
6016 
6017 	strcpy(dev->name, name);
6018 	dev->group = INIT_NETDEV_GROUP;
6019 	return dev;
6020 
6021 free_all:
6022 	free_netdev(dev);
6023 	return NULL;
6024 
6025 free_pcpu:
6026 	free_percpu(dev->pcpu_refcnt);
6027 	kfree(dev->_tx);
6028 #ifdef CONFIG_RPS
6029 	kfree(dev->_rx);
6030 #endif
6031 
6032 free_p:
6033 	kfree(p);
6034 	return NULL;
6035 }
6036 EXPORT_SYMBOL(alloc_netdev_mqs);
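
/*
 * Editorial sketch (not part of the original source): allocating a
 * multiqueue Ethernet-style device with this helper.  ether_setup() is the
 * usual setup callback; the "foo%d" template and the queue count are
 * illustrative only, and no private area is requested here.
 */
static struct net_device *foo_alloc_mq(unsigned int queues)
{
	return alloc_netdev_mqs(0, "foo%d", ether_setup, queues, queues);
}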
6037 
6038 /**
6039  *	free_netdev - free network device
6040  *	@dev: device
6041  *
6042  *	This function does the last stage of destroying an allocated device
6043  * 	interface. The reference to the device object is released.
6044  *	If this is the last reference then it will be freed.
6045  */
6046 void free_netdev(struct net_device *dev)
6047 {
6048 	struct napi_struct *p, *n;
6049 
6050 	release_net(dev_net(dev));
6051 
6052 	kfree(dev->_tx);
6053 #ifdef CONFIG_RPS
6054 	kfree(dev->_rx);
6055 #endif
6056 
6057 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6058 
6059 	/* Flush device addresses */
6060 	dev_addr_flush(dev);
6061 
6062 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6063 		netif_napi_del(p);
6064 
6065 	free_percpu(dev->pcpu_refcnt);
6066 	dev->pcpu_refcnt = NULL;
6067 
6068 	/*  Compatibility with error handling in drivers */
6069 	if (dev->reg_state == NETREG_UNINITIALIZED) {
6070 		kfree((char *)dev - dev->padded);
6071 		return;
6072 	}
6073 
6074 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6075 	dev->reg_state = NETREG_RELEASED;
6076 
6077 	/* will free via device release */
6078 	put_device(&dev->dev);
6079 }
6080 EXPORT_SYMBOL(free_netdev);
6081 
6082 /**
6083  *	synchronize_net -  Synchronize with packet receive processing
6084  *
6085  *	Wait for packets currently being received to be done.
6086  *	Does not block later packets from starting.
6087  */
6088 void synchronize_net(void)
6089 {
6090 	might_sleep();
6091 	if (rtnl_is_locked())
6092 		synchronize_rcu_expedited();
6093 	else
6094 		synchronize_rcu();
6095 }
6096 EXPORT_SYMBOL(synchronize_net);
6097 
6098 /**
6099  *	unregister_netdevice_queue - remove device from the kernel
6100  *	@dev: device
6101  *	@head: list
6102  *
6103  *	This function shuts down a device interface and removes it
6104  *	from the kernel tables.
6105  *	If head is not NULL, the device is queued to be unregistered later.
6106  *
6107  *	Callers must hold the rtnl semaphore.  You may want
6108  *	unregister_netdev() instead of this.
6109  */
6110 
6111 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6112 {
6113 	ASSERT_RTNL();
6114 
6115 	if (head) {
6116 		list_move_tail(&dev->unreg_list, head);
6117 	} else {
6118 		rollback_registered(dev);
6119 		/* Finish processing unregister after unlock */
6120 		net_set_todo(dev);
6121 	}
6122 }
6123 EXPORT_SYMBOL(unregister_netdevice_queue);
6124 
6125 /**
6126  *	unregister_netdevice_many - unregister many devices
6127  *	@head: list of devices
6128  */
6129 void unregister_netdevice_many(struct list_head *head)
6130 {
6131 	struct net_device *dev;
6132 
6133 	if (!list_empty(head)) {
6134 		rollback_registered_many(head);
6135 		list_for_each_entry(dev, head, unreg_list)
6136 			net_set_todo(dev);
6137 	}
6138 }
6139 EXPORT_SYMBOL(unregister_netdevice_many);
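
/*
 * Editorial sketch (not part of the original source): the batching pattern
 * these two helpers enable under a single rtnl_lock() section; each device
 * is still freed by its owner later, as described by the sequence comment
 * above netdev_run_todo().  foo_destroy_group() is hypothetical.
 */
static void foo_destroy_group(struct net_device *devs[], int n)
{
	LIST_HEAD(kill_list);
	int i;

	ASSERT_RTNL();
	for (i = 0; i < n; i++)
		unregister_netdevice_queue(devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);
}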
6140 
6141 /**
6142  *	unregister_netdev - remove device from the kernel
6143  *	@dev: device
6144  *
6145  *	This function shuts down a device interface and removes it
6146  *	from the kernel tables.
6147  *
6148  *	This is just a wrapper for unregister_netdevice that takes
6149  *	the rtnl semaphore.  In general you want to use this and not
6150  *	unregister_netdevice.
6151  */
6152 void unregister_netdev(struct net_device *dev)
6153 {
6154 	rtnl_lock();
6155 	unregister_netdevice(dev);
6156 	rtnl_unlock();
6157 }
6158 EXPORT_SYMBOL(unregister_netdev);
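
/*
 * Editorial sketch (not part of the original source): the usual driver
 * teardown order.  unregister_netdev() does not return until
 * netdev_run_todo()/netdev_wait_allrefs() have seen the last reference
 * drop, after which free_netdev() releases the structure.  foo_remove()
 * is hypothetical.
 */
static void foo_remove(struct net_device *dev)
{
	unregister_netdev(dev);		/* takes rtnl, waits for references */
	free_netdev(dev);		/* final release of the device */
}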
6159 
6160 /**
6161  *	dev_change_net_namespace - move device to a different network namespace
6162  *	@dev: device
6163  *	@net: network namespace
6164  *	@pat: if not NULL, name pattern to try if the current device name
6165  *	      is already taken in the destination network namespace.
6166  *
6167  *	This function shuts down a device interface and moves it
6168  *	to a new network namespace. On success 0 is returned, on
6169  *	a failure a negative errno code is returned.
6170  *
6171  *	Callers must hold the rtnl semaphore.
6172  */
6173 
6174 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6175 {
6176 	int err;
6177 
6178 	ASSERT_RTNL();
6179 
6180 	/* Don't allow namespace local devices to be moved. */
6181 	err = -EINVAL;
6182 	if (dev->features & NETIF_F_NETNS_LOCAL)
6183 		goto out;
6184 
6185 	/* Ensure the device has been registered */
6186 	err = -EINVAL;
6187 	if (dev->reg_state != NETREG_REGISTERED)
6188 		goto out;
6189 
6190 	/* Get out if there is nothing to do */
6191 	err = 0;
6192 	if (net_eq(dev_net(dev), net))
6193 		goto out;
6194 
6195 	/* Pick the destination device name, and ensure
6196 	 * we can use it in the destination network namespace.
6197 	 */
6198 	err = -EEXIST;
6199 	if (__dev_get_by_name(net, dev->name)) {
6200 		/* We get here if we can't use the current device name */
6201 		if (!pat)
6202 			goto out;
6203 		if (dev_get_valid_name(dev, pat) < 0)
6204 			goto out;
6205 	}
6206 
6207 	/*
6208 	 * And now a mini version of register_netdevice and unregister_netdevice.
6209 	 */
6210 
6211 	/* If device is running close it first. */
6212 	dev_close(dev);
6213 
6214 	/* And unlink it from device chain */
6215 	err = -ENODEV;
6216 	unlist_netdevice(dev);
6217 
6218 	synchronize_net();
6219 
6220 	/* Shutdown queueing discipline. */
6221 	dev_shutdown(dev);
6222 
6223 	/* Notify protocols that we are about to destroy
6224 	   this device. They should clean up all of their state.
6225 
6226 	   Note that dev->reg_state stays at NETREG_REGISTERED.
6227 	   This is intentional: this way 8021q and macvlan know
6228 	   the device is just moving and can keep their slaves up.
6229 	*/
6230 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6231 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6232 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6233 
6234 	/*
6235 	 *	Flush the unicast and multicast chains
6236 	 */
6237 	dev_uc_flush(dev);
6238 	dev_mc_flush(dev);
6239 
6240 	/* Actually switch the network namespace */
6241 	dev_net_set(dev, net);
6242 
6243 	/* If there is an ifindex conflict assign a new one */
6244 	if (__dev_get_by_index(net, dev->ifindex)) {
6245 		int iflink = (dev->iflink == dev->ifindex);
6246 		dev->ifindex = dev_new_index(net);
6247 		if (iflink)
6248 			dev->iflink = dev->ifindex;
6249 	}
6250 
6251 	/* Fixup kobjects */
6252 	err = device_rename(&dev->dev, dev->name);
6253 	WARN_ON(err);
6254 
6255 	/* Add the device back in the hashes */
6256 	list_netdevice(dev);
6257 
6258 	/* Notify protocols that a new device appeared. */
6259 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6260 
6261 	/*
6262 	 *	Prevent userspace races by waiting until the network
6263 	 *	device is fully setup before sending notifications.
6264 	 */
6265 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6266 
6267 	synchronize_net();
6268 	err = 0;
6269 out:
6270 	return err;
6271 }
6272 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6273 
6274 static int dev_cpu_callback(struct notifier_block *nfb,
6275 			    unsigned long action,
6276 			    void *ocpu)
6277 {
6278 	struct sk_buff **list_skb;
6279 	struct sk_buff *skb;
6280 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6281 	struct softnet_data *sd, *oldsd;
6282 
6283 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6284 		return NOTIFY_OK;
6285 
6286 	local_irq_disable();
6287 	cpu = smp_processor_id();
6288 	sd = &per_cpu(softnet_data, cpu);
6289 	oldsd = &per_cpu(softnet_data, oldcpu);
6290 
6291 	/* Find end of our completion_queue. */
6292 	list_skb = &sd->completion_queue;
6293 	while (*list_skb)
6294 		list_skb = &(*list_skb)->next;
6295 	/* Append completion queue from offline CPU. */
6296 	*list_skb = oldsd->completion_queue;
6297 	oldsd->completion_queue = NULL;
6298 
6299 	/* Append output queue from offline CPU. */
6300 	if (oldsd->output_queue) {
6301 		*sd->output_queue_tailp = oldsd->output_queue;
6302 		sd->output_queue_tailp = oldsd->output_queue_tailp;
6303 		oldsd->output_queue = NULL;
6304 		oldsd->output_queue_tailp = &oldsd->output_queue;
6305 	}
6306 	/* Append NAPI poll list from offline CPU. */
6307 	if (!list_empty(&oldsd->poll_list)) {
6308 		list_splice_init(&oldsd->poll_list, &sd->poll_list);
6309 		raise_softirq_irqoff(NET_RX_SOFTIRQ);
6310 	}
6311 
6312 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6313 	local_irq_enable();
6314 
6315 	/* Process offline CPU's input_pkt_queue */
6316 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6317 		netif_rx(skb);
6318 		input_queue_head_incr(oldsd);
6319 	}
6320 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6321 		netif_rx(skb);
6322 		input_queue_head_incr(oldsd);
6323 	}
6324 
6325 	return NOTIFY_OK;
6326 }
6327 
6328 
6329 /**
6330  *	netdev_increment_features - increment feature set by one
6331  *	@all: current feature set
6332  *	@one: new feature set
6333  *	@mask: mask feature set
6334  *
6335  *	Computes a new feature set after adding a device with feature set
6336  *	@one to the master device with current feature set @all.  Will not
6337  *	enable anything that is off in @mask. Returns the new feature set.
6338  */
6339 netdev_features_t netdev_increment_features(netdev_features_t all,
6340 	netdev_features_t one, netdev_features_t mask)
6341 {
6342 	if (mask & NETIF_F_GEN_CSUM)
6343 		mask |= NETIF_F_ALL_CSUM;
6344 	mask |= NETIF_F_VLAN_CHALLENGED;
6345 
6346 	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6347 	all &= one | ~NETIF_F_ALL_FOR_ALL;
6348 
6349 	/* If one device supports hw checksumming, set for all. */
6350 	if (all & NETIF_F_GEN_CSUM)
6351 		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6352 
6353 	return all;
6354 }
6355 EXPORT_SYMBOL(netdev_increment_features);
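
/*
 * Editorial sketch (not part of the original source), loosely modeled on
 * how aggregating drivers such as bonding fold slave feature sets into the
 * master's: seed with the ALL_FOR_ALL bits, then fold in each slave with
 * netdev_increment_features().  foo_compute_features() is hypothetical.
 */
static netdev_features_t foo_compute_features(struct net_device *master,
					       struct net_device *slaves[],
					       int n)
{
	netdev_features_t mask = master->vlan_features;
	netdev_features_t features = (mask & ~NETIF_F_ONE_FOR_ALL) |
				     NETIF_F_ALL_FOR_ALL;
	int i;

	for (i = 0; i < n; i++)
		features = netdev_increment_features(features,
						     slaves[i]->features,
						     mask);
	return features;
}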
6356 
6357 static struct hlist_head *netdev_create_hash(void)
6358 {
6359 	int i;
6360 	struct hlist_head *hash;
6361 
6362 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6363 	if (hash != NULL)
6364 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6365 			INIT_HLIST_HEAD(&hash[i]);
6366 
6367 	return hash;
6368 }
6369 
6370 /* Initialize per network namespace state */
6371 static int __net_init netdev_init(struct net *net)
6372 {
6373 	if (net != &init_net)
6374 		INIT_LIST_HEAD(&net->dev_base_head);
6375 
6376 	net->dev_name_head = netdev_create_hash();
6377 	if (net->dev_name_head == NULL)
6378 		goto err_name;
6379 
6380 	net->dev_index_head = netdev_create_hash();
6381 	if (net->dev_index_head == NULL)
6382 		goto err_idx;
6383 
6384 	return 0;
6385 
6386 err_idx:
6387 	kfree(net->dev_name_head);
6388 err_name:
6389 	return -ENOMEM;
6390 }
6391 
6392 /**
6393  *	netdev_drivername - network driver for the device
6394  *	@dev: network device
6395  *
6396  *	Determine network driver for device.
6397  */
6398 const char *netdev_drivername(const struct net_device *dev)
6399 {
6400 	const struct device_driver *driver;
6401 	const struct device *parent;
6402 	const char *empty = "";
6403 
6404 	parent = dev->dev.parent;
6405 	if (!parent)
6406 		return empty;
6407 
6408 	driver = parent->driver;
6409 	if (driver && driver->name)
6410 		return driver->name;
6411 	return empty;
6412 }
6413 
6414 int __netdev_printk(const char *level, const struct net_device *dev,
6415 			   struct va_format *vaf)
6416 {
6417 	int r;
6418 
6419 	if (dev && dev->dev.parent)
6420 		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6421 			       netdev_name(dev), vaf);
6422 	else if (dev)
6423 		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6424 	else
6425 		r = printk("%s(NULL net_device): %pV", level, vaf);
6426 
6427 	return r;
6428 }
6429 EXPORT_SYMBOL(__netdev_printk);
6430 
6431 int netdev_printk(const char *level, const struct net_device *dev,
6432 		  const char *format, ...)
6433 {
6434 	struct va_format vaf;
6435 	va_list args;
6436 	int r;
6437 
6438 	va_start(args, format);
6439 
6440 	vaf.fmt = format;
6441 	vaf.va = &args;
6442 
6443 	r = __netdev_printk(level, dev, &vaf);
6444 	va_end(args);
6445 
6446 	return r;
6447 }
6448 EXPORT_SYMBOL(netdev_printk);
6449 
6450 #define define_netdev_printk_level(func, level)			\
6451 int func(const struct net_device *dev, const char *fmt, ...)	\
6452 {								\
6453 	int r;							\
6454 	struct va_format vaf;					\
6455 	va_list args;						\
6456 								\
6457 	va_start(args, fmt);					\
6458 								\
6459 	vaf.fmt = fmt;						\
6460 	vaf.va = &args;						\
6461 								\
6462 	r = __netdev_printk(level, dev, &vaf);			\
6463 	va_end(args);						\
6464 								\
6465 	return r;						\
6466 }								\
6467 EXPORT_SYMBOL(func);
6468 
6469 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6470 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6471 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6472 define_netdev_printk_level(netdev_err, KERN_ERR);
6473 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6474 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6475 define_netdev_printk_level(netdev_info, KERN_INFO);
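
/*
 * Editorial sketch (not part of the original source): the per-level
 * wrappers generated above behave like printk() but prefix the driver and
 * device name automatically.  foo_report_link() is hypothetical.
 */
static void foo_report_link(struct net_device *dev, bool link_up)
{
	if (link_up)
		netdev_info(dev, "link is up\n");
	else
		netdev_warn(dev, "link is down\n");
}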
6476 
6477 static void __net_exit netdev_exit(struct net *net)
6478 {
6479 	kfree(net->dev_name_head);
6480 	kfree(net->dev_index_head);
6481 }
6482 
6483 static struct pernet_operations __net_initdata netdev_net_ops = {
6484 	.init = netdev_init,
6485 	.exit = netdev_exit,
6486 };
6487 
6488 static void __net_exit default_device_exit(struct net *net)
6489 {
6490 	struct net_device *dev, *aux;
6491 	/*
6492 	 * Push all migratable network devices back to the
6493 	 * initial network namespace
6494 	 */
6495 	rtnl_lock();
6496 	for_each_netdev_safe(net, dev, aux) {
6497 		int err;
6498 		char fb_name[IFNAMSIZ];
6499 
6500 		/* Ignore unmovable devices (e.g. loopback) */
6501 		if (dev->features & NETIF_F_NETNS_LOCAL)
6502 			continue;
6503 
6504 		/* Leave virtual devices for the generic cleanup */
6505 		if (dev->rtnl_link_ops)
6506 			continue;
6507 
6508 		/* Push remaining network devices to init_net */
6509 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6510 		err = dev_change_net_namespace(dev, &init_net, fb_name);
6511 		if (err) {
6512 			pr_emerg("%s: failed to move %s to init_net: %d\n",
6513 				 __func__, dev->name, err);
6514 			BUG();
6515 		}
6516 	}
6517 	rtnl_unlock();
6518 }
6519 
6520 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6521 {
6522 	/* At exit, all network devices must be removed from a network
6523 	 * namespace.  Do this in the reverse order of registration.
6524 	 * Do this across as many network namespaces as possible to
6525 	 * improve batching efficiency.
6526 	 */
6527 	struct net_device *dev;
6528 	struct net *net;
6529 	LIST_HEAD(dev_kill_list);
6530 
6531 	rtnl_lock();
6532 	list_for_each_entry(net, net_list, exit_list) {
6533 		for_each_netdev_reverse(net, dev) {
6534 			if (dev->rtnl_link_ops)
6535 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6536 			else
6537 				unregister_netdevice_queue(dev, &dev_kill_list);
6538 		}
6539 	}
6540 	unregister_netdevice_many(&dev_kill_list);
6541 	list_del(&dev_kill_list);
6542 	rtnl_unlock();
6543 }
6544 
6545 static struct pernet_operations __net_initdata default_device_ops = {
6546 	.exit = default_device_exit,
6547 	.exit_batch = default_device_exit_batch,
6548 };
6549 
6550 /*
6551  *	Initialize the DEV module. At boot time this walks the device list and
6552  *	unhooks any devices that fail to initialise (normally hardware not
6553  *	present) and leaves us with a valid list of present and active devices.
6554  *
6555  */
6556 
6557 /*
6558  *       This is called single threaded during boot, so no need
6559  *       to take the rtnl semaphore.
6560  */
6561 static int __init net_dev_init(void)
6562 {
6563 	int i, rc = -ENOMEM;
6564 
6565 	BUG_ON(!dev_boot_phase);
6566 
6567 	if (dev_proc_init())
6568 		goto out;
6569 
6570 	if (netdev_kobject_init())
6571 		goto out;
6572 
6573 	INIT_LIST_HEAD(&ptype_all);
6574 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6575 		INIT_LIST_HEAD(&ptype_base[i]);
6576 
6577 	if (register_pernet_subsys(&netdev_net_ops))
6578 		goto out;
6579 
6580 	/*
6581 	 *	Initialise the packet receive queues.
6582 	 */
6583 
6584 	for_each_possible_cpu(i) {
6585 		struct softnet_data *sd = &per_cpu(softnet_data, i);
6586 
6587 		memset(sd, 0, sizeof(*sd));
6588 		skb_queue_head_init(&sd->input_pkt_queue);
6589 		skb_queue_head_init(&sd->process_queue);
6590 		sd->completion_queue = NULL;
6591 		INIT_LIST_HEAD(&sd->poll_list);
6592 		sd->output_queue = NULL;
6593 		sd->output_queue_tailp = &sd->output_queue;
6594 #ifdef CONFIG_RPS
6595 		sd->csd.func = rps_trigger_softirq;
6596 		sd->csd.info = sd;
6597 		sd->csd.flags = 0;
6598 		sd->cpu = i;
6599 #endif
6600 
6601 		sd->backlog.poll = process_backlog;
6602 		sd->backlog.weight = weight_p;
6603 		sd->backlog.gro_list = NULL;
6604 		sd->backlog.gro_count = 0;
6605 	}
6606 
6607 	dev_boot_phase = 0;
6608 
6609 	/* The loopback device is special: if any other network device
6610 	 * is present in a network namespace, the loopback device must
6611 	 * be present too. Since we now dynamically allocate and free the
6612 	 * loopback device, ensure this invariant is maintained by
6613 	 * keeping the loopback device as the first device on the
6614 	 * list of network devices.  This ensures the loopback device
6615 	 * is the first device that appears and the last network device
6616 	 * that disappears.
6617 	 */
6618 	if (register_pernet_device(&loopback_net_ops))
6619 		goto out;
6620 
6621 	if (register_pernet_device(&default_device_ops))
6622 		goto out;
6623 
6624 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6625 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6626 
6627 	hotcpu_notifier(dev_cpu_callback, 0);
6628 	dst_init();
6629 	dev_mcast_init();
6630 	rc = 0;
6631 out:
6632 	return rc;
6633 }
6634 
6635 subsys_initcall(net_dev_init);
6636 
6637 static int __init initialize_hashrnd(void)
6638 {
6639 	get_random_bytes(&hashrnd, sizeof(hashrnd));
6640 	return 0;
6641 }
6642 
6643 late_initcall_sync(initialize_hashrnd);
6644 
6645