xref: /openbmc/linux/net/core/dev.c (revision 565d76cb)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/hash.h>
83 #include <linux/slab.h>
84 #include <linux/sched.h>
85 #include <linux/mutex.h>
86 #include <linux/string.h>
87 #include <linux/mm.h>
88 #include <linux/socket.h>
89 #include <linux/sockios.h>
90 #include <linux/errno.h>
91 #include <linux/interrupt.h>
92 #include <linux/if_ether.h>
93 #include <linux/netdevice.h>
94 #include <linux/etherdevice.h>
95 #include <linux/ethtool.h>
96 #include <linux/notifier.h>
97 #include <linux/skbuff.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/stat.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/kmod.h>
111 #include <linux/module.h>
112 #include <linux/netpoll.h>
113 #include <linux/rcupdate.h>
114 #include <linux/delay.h>
115 #include <net/wext.h>
116 #include <net/iw_handler.h>
117 #include <asm/current.h>
118 #include <linux/audit.h>
119 #include <linux/dmaengine.h>
120 #include <linux/err.h>
121 #include <linux/ctype.h>
122 #include <linux/if_arp.h>
123 #include <linux/if_vlan.h>
124 #include <linux/ip.h>
125 #include <net/ip.h>
126 #include <linux/ipv6.h>
127 #include <linux/in.h>
128 #include <linux/jhash.h>
129 #include <linux/random.h>
130 #include <trace/events/napi.h>
131 #include <trace/events/net.h>
132 #include <trace/events/skb.h>
133 #include <linux/pci.h>
134 #include <linux/inetdevice.h>
135 #include <linux/cpu_rmap.h>
136 
137 #include "net-sysfs.h"
138 
139 /* Instead of increasing this, you should create a hash table. */
140 #define MAX_GRO_SKBS 8
141 
142 /* This should be increased if a protocol with a bigger head is added. */
143 #define GRO_MAX_HEAD (MAX_HEADER + 128)
144 
145 /*
146  *	The list of packet types we will receive (as opposed to discard)
147  *	and the routines to invoke.
148  *
149  *	Why 16? Because with 16 the only overlap we get on a hash of the
150  *	low nibble of the protocol value is RARP/SNAP/X.25.
151  *
152  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
153  *             sure which should go first, but I bet it won't make much
154  *             difference if we are running VLANs.  The good news is that
155  *             this protocol won't be in the list unless compiled in, so
156  *             the average user (w/out VLANs) will not be adversely affected.
157  *             --BLG
158  *
159  *		0800	IP
160  *		8100    802.1Q VLAN
161  *		0001	802.3
162  *		0002	AX.25
163  *		0004	802.2
164  *		8035	RARP
165  *		0005	SNAP
166  *		0805	X.25
167  *		0806	ARP
168  *		8137	IPX
169  *		0009	Localtalk
170  *		86DD	IPv6
171  */
172 
173 #define PTYPE_HASH_SIZE	(16)
174 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
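
/*
 * Worked example of the hashing above (illustrative, not part of the
 * original file): ETH_P_IP is 0x0800, so IP handlers land in bucket
 * 0x0800 & PTYPE_HASH_MASK = 0, while RARP (0x8035), SNAP (0x0005) and
 * X.25 (0x0805) all share the low nibble 5 and land in bucket 5 -- the
 * overlap mentioned in the comment above.
 */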
175 
176 static DEFINE_SPINLOCK(ptype_lock);
177 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
178 static struct list_head ptype_all __read_mostly;	/* Taps */
179 
180 /*
181  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
182  * semaphore.
183  *
184  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
185  *
186  * Writers must hold the rtnl semaphore while they loop through the
187  * dev_base_head list, and hold dev_base_lock for writing when they do the
188  * actual updates.  This allows pure readers to access the list even
189  * while a writer is preparing to update it.
190  *
191  * To put it another way, dev_base_lock is held for writing only to
192  * protect against pure readers; the rtnl semaphore provides the
193  * protection against other writers.
194  *
195  * See, for example usages, register_netdevice() and
196  * unregister_netdevice(), which must be called with the rtnl
197  * semaphore held.
198  */
199 DEFINE_RWLOCK(dev_base_lock);
200 EXPORT_SYMBOL(dev_base_lock);
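
/*
 * Minimal sketch of a pure reader under the locking scheme described
 * above (illustrative, not part of this file); my_count_running() is a
 * hypothetical helper.  A pure reader needs only rcu_read_lock(), not
 * dev_base_lock:
 *
 *	static int my_count_running(struct net *net)
 *	{
 *		struct net_device *dev;
 *		int n = 0;
 *
 *		rcu_read_lock();
 *		for_each_netdev_rcu(net, dev)
 *			if (netif_running(dev))
 *				n++;
 *		rcu_read_unlock();
 *		return n;
 *	}
 */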
201 
202 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
203 {
204 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
205 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
206 }
207 
208 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
209 {
210 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
211 }
212 
213 static inline void rps_lock(struct softnet_data *sd)
214 {
215 #ifdef CONFIG_RPS
216 	spin_lock(&sd->input_pkt_queue.lock);
217 #endif
218 }
219 
220 static inline void rps_unlock(struct softnet_data *sd)
221 {
222 #ifdef CONFIG_RPS
223 	spin_unlock(&sd->input_pkt_queue.lock);
224 #endif
225 }
226 
227 /* Device list insertion */
228 static int list_netdevice(struct net_device *dev)
229 {
230 	struct net *net = dev_net(dev);
231 
232 	ASSERT_RTNL();
233 
234 	write_lock_bh(&dev_base_lock);
235 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
236 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
237 	hlist_add_head_rcu(&dev->index_hlist,
238 			   dev_index_hash(net, dev->ifindex));
239 	write_unlock_bh(&dev_base_lock);
240 	return 0;
241 }
242 
243 /* Device list removal
244  * caller must respect an RCU grace period before freeing/reusing dev
245  */
246 static void unlist_netdevice(struct net_device *dev)
247 {
248 	ASSERT_RTNL();
249 
250 	/* Unlink dev from the device chain */
251 	write_lock_bh(&dev_base_lock);
252 	list_del_rcu(&dev->dev_list);
253 	hlist_del_rcu(&dev->name_hlist);
254 	hlist_del_rcu(&dev->index_hlist);
255 	write_unlock_bh(&dev_base_lock);
256 }
257 
258 /*
259  *	Our notifier list
260  */
261 
262 static RAW_NOTIFIER_HEAD(netdev_chain);
263 
264 /*
265  *	Device drivers call our routines to queue packets here. We empty the
266  *	queue in the local softnet handler.
267  */
268 
269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270 EXPORT_PER_CPU_SYMBOL(softnet_data);
271 
272 #ifdef CONFIG_LOCKDEP
273 /*
274  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275  * according to dev->type
276  */
277 static const unsigned short netdev_lock_type[] =
278 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
291 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
292 	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
293 	 ARPHRD_VOID, ARPHRD_NONE};
294 
295 static const char *const netdev_lock_name[] =
296 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
297 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
298 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
299 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
300 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
301 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
302 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
303 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
304 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
305 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
306 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
307 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
308 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
309 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
310 	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
311 	 "_xmit_VOID", "_xmit_NONE"};
312 
313 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
314 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
315 
316 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
317 {
318 	int i;
319 
320 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
321 		if (netdev_lock_type[i] == dev_type)
322 			return i;
323 	/* the last key is used by default */
324 	return ARRAY_SIZE(netdev_lock_type) - 1;
325 }
326 
327 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
328 						 unsigned short dev_type)
329 {
330 	int i;
331 
332 	i = netdev_lock_pos(dev_type);
333 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
334 				   netdev_lock_name[i]);
335 }
336 
337 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
338 {
339 	int i;
340 
341 	i = netdev_lock_pos(dev->type);
342 	lockdep_set_class_and_name(&dev->addr_list_lock,
343 				   &netdev_addr_lock_key[i],
344 				   netdev_lock_name[i]);
345 }
346 #else
347 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
348 						 unsigned short dev_type)
349 {
350 }
351 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
352 {
353 }
354 #endif
355 
356 /*******************************************************************************
357 
358 		Protocol management and registration routines
359 
360 *******************************************************************************/
361 
362 /*
363  *	Add a protocol ID to the list. Now that the input handler is
364  *	smarter we can dispense with all the messy stuff that used to be
365  *	here.
366  *
367  *	BEWARE!!! Protocol handlers, mangling input packets,
368  *	MUST BE last in hash buckets and checking protocol handlers
369  *	MUST start from promiscuous ptype_all chain in net_bh.
370  *	This is true now; do not change it.
371  *	Explanation: if a protocol handler that mangles packets were
372  *	first in the list, it could not detect that the packet is cloned
373  *	and must be copied on write; it would modify the data in place
374  *	and subsequent readers would see a corrupted packet.
375  *							--ANK (980803)
376  */
377 
378 static inline struct list_head *ptype_head(const struct packet_type *pt)
379 {
380 	if (pt->type == htons(ETH_P_ALL))
381 		return &ptype_all;
382 	else
383 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
384 }
385 
386 /**
387  *	dev_add_pack - add packet handler
388  *	@pt: packet type declaration
389  *
390  *	Add a protocol handler to the networking stack. The passed &packet_type
391  *	is linked into kernel lists and may not be freed until it has been
392  *	removed from the kernel lists.
393  *
394  *	This call does not sleep, therefore it cannot guarantee that
395  *	all CPUs that are in the middle of receiving packets will see
396  *	the new packet type (until the next received packet).
397  */
398 
399 void dev_add_pack(struct packet_type *pt)
400 {
401 	struct list_head *head = ptype_head(pt);
402 
403 	spin_lock(&ptype_lock);
404 	list_add_rcu(&pt->list, head);
405 	spin_unlock(&ptype_lock);
406 }
407 EXPORT_SYMBOL(dev_add_pack);
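
/*
 * Illustrative usage sketch (not part of this file): a hypothetical module
 * registering a handler for a made-up ethertype.  The names and the
 * 0x88b5 value are for illustration only; a handler owns the skb it is
 * given and must free it.
 *
 *	static int my_pkt_rcv(struct sk_buff *skb, struct net_device *dev,
 *			      struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		// ... inspect skb here ...
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_ptype __read_mostly = {
 *		.type	= cpu_to_be16(0x88b5),	// made-up value
 *		.func	= my_pkt_rcv,
 *	};
 *
 *	dev_add_pack(&my_ptype);
 */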
408 
409 /**
410  *	__dev_remove_pack	 - remove packet handler
411  *	@pt: packet type declaration
412  *
413  *	Remove a protocol handler that was previously added to the kernel
414  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
415  *	from the kernel lists and can be freed or reused once this function
416  *	returns.
417  *
418  *      The packet type might still be in use by receivers
419  *	and must not be freed until after all the CPUs have gone
420  *	through a quiescent state.
421  */
422 void __dev_remove_pack(struct packet_type *pt)
423 {
424 	struct list_head *head = ptype_head(pt);
425 	struct packet_type *pt1;
426 
427 	spin_lock(&ptype_lock);
428 
429 	list_for_each_entry(pt1, head, list) {
430 		if (pt == pt1) {
431 			list_del_rcu(&pt->list);
432 			goto out;
433 		}
434 	}
435 
436 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
437 out:
438 	spin_unlock(&ptype_lock);
439 }
440 EXPORT_SYMBOL(__dev_remove_pack);
441 
442 /**
443  *	dev_remove_pack	 - remove packet handler
444  *	@pt: packet type declaration
445  *
446  *	Remove a protocol handler that was previously added to the kernel
447  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
448  *	from the kernel lists and can be freed or reused once this function
449  *	returns.
450  *
451  *	This call sleeps to guarantee that no CPU is looking at the packet
452  *	type after return.
453  */
454 void dev_remove_pack(struct packet_type *pt)
455 {
456 	__dev_remove_pack(pt);
457 
458 	synchronize_net();
459 }
460 EXPORT_SYMBOL(dev_remove_pack);
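
/*
 * Matching teardown for the sketch above (illustrative): on module exit
 * call dev_remove_pack(&my_ptype); the synchronize_net() it performs
 * guarantees that no CPU is still executing my_pkt_rcv() afterwards, so
 * the packet_type and the module text may then be freed.
 */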
461 
462 /******************************************************************************
463 
464 		      Device Boot-time Settings Routines
465 
466 *******************************************************************************/
467 
468 /* Boot time configuration table */
469 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
470 
471 /**
472  *	netdev_boot_setup_add	- add new setup entry
473  *	@name: name of the device
474  *	@map: configured settings for the device
475  *
476  *	Adds a new setup entry to the dev_boot_setup list.  The function
477  *	returns 0 on error and 1 on success.  This is a generic routine
478  *	for all netdevices.
479  */
480 static int netdev_boot_setup_add(char *name, struct ifmap *map)
481 {
482 	struct netdev_boot_setup *s;
483 	int i;
484 
485 	s = dev_boot_setup;
486 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
487 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
488 			memset(s[i].name, 0, sizeof(s[i].name));
489 			strlcpy(s[i].name, name, IFNAMSIZ);
490 			memcpy(&s[i].map, map, sizeof(s[i].map));
491 			break;
492 		}
493 	}
494 
495 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
496 }
497 
498 /**
499  *	netdev_boot_setup_check	- check boot time settings
500  *	@dev: the netdevice
501  *
502  * 	Check boot time settings for the device.
503  *	Any settings found are applied to the device so they can be
504  *	used later during device probing.
505  *	Returns 1 if settings were found, 0 otherwise.
506  */
507 int netdev_boot_setup_check(struct net_device *dev)
508 {
509 	struct netdev_boot_setup *s = dev_boot_setup;
510 	int i;
511 
512 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
513 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
514 		    !strcmp(dev->name, s[i].name)) {
515 			dev->irq 	= s[i].map.irq;
516 			dev->base_addr 	= s[i].map.base_addr;
517 			dev->mem_start 	= s[i].map.mem_start;
518 			dev->mem_end 	= s[i].map.mem_end;
519 			return 1;
520 		}
521 	}
522 	return 0;
523 }
524 EXPORT_SYMBOL(netdev_boot_setup_check);
525 
526 
527 /**
528  *	netdev_boot_base	- get address from boot time settings
529  *	@prefix: prefix for network device
530  *	@unit: id for network device
531  *
532  * 	Check boot time settings for the base address of the device.
533  *	Returns the configured base address, 1 if the device is already
534  *	registered (to indicate that it should not be probed), or 0 if
535  *	no settings were found.
536  */
537 unsigned long netdev_boot_base(const char *prefix, int unit)
538 {
539 	const struct netdev_boot_setup *s = dev_boot_setup;
540 	char name[IFNAMSIZ];
541 	int i;
542 
543 	sprintf(name, "%s%d", prefix, unit);
544 
545 	/*
546 	 * If device already registered then return base of 1
547 	 * to indicate not to probe for this interface
548 	 */
549 	if (__dev_get_by_name(&init_net, name))
550 		return 1;
551 
552 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
553 		if (!strcmp(name, s[i].name))
554 			return s[i].map.base_addr;
555 	return 0;
556 }
557 
558 /*
559  * Saves at boot time configured settings for any netdevice.
560  */
561 int __init netdev_boot_setup(char *str)
562 {
563 	int ints[5];
564 	struct ifmap map;
565 
566 	str = get_options(str, ARRAY_SIZE(ints), ints);
567 	if (!str || !*str)
568 		return 0;
569 
570 	/* Save settings */
571 	memset(&map, 0, sizeof(map));
572 	if (ints[0] > 0)
573 		map.irq = ints[1];
574 	if (ints[0] > 1)
575 		map.base_addr = ints[2];
576 	if (ints[0] > 2)
577 		map.mem_start = ints[3];
578 	if (ints[0] > 3)
579 		map.mem_end = ints[4];
580 
581 	/* Add new entry to the list */
582 	return netdev_boot_setup_add(str, &map);
583 }
584 
585 __setup("netdev=", netdev_boot_setup);
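
/*
 * Illustrative example (all values are made up): booting with
 *
 *	netdev=9,0x300,0xd0000,0xd4000,eth0
 *
 * is parsed by get_options() into ints[0] = 4 followed by irq = 9,
 * base_addr = 0x300, mem_start = 0xd0000 and mem_end = 0xd4000, and the
 * remaining string "eth0" is passed as the name to
 * netdev_boot_setup_add().
 */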
586 
587 /*******************************************************************************
588 
589 			    Device Interface Subroutines
590 
591 *******************************************************************************/
592 
593 /**
594  *	__dev_get_by_name	- find a device by its name
595  *	@net: the applicable net namespace
596  *	@name: name to find
597  *
598  *	Find an interface by name. Must be called under the RTNL semaphore
599  *	or @dev_base_lock. If the name is found a pointer to the device
600  *	is returned. If the name is not found then %NULL is returned. The
601  *	reference counters are not incremented so the caller must be
602  *	careful with locks.
603  */
604 
605 struct net_device *__dev_get_by_name(struct net *net, const char *name)
606 {
607 	struct hlist_node *p;
608 	struct net_device *dev;
609 	struct hlist_head *head = dev_name_hash(net, name);
610 
611 	hlist_for_each_entry(dev, p, head, name_hlist)
612 		if (!strncmp(dev->name, name, IFNAMSIZ))
613 			return dev;
614 
615 	return NULL;
616 }
617 EXPORT_SYMBOL(__dev_get_by_name);
618 
619 /**
620  *	dev_get_by_name_rcu	- find a device by its name
621  *	@net: the applicable net namespace
622  *	@name: name to find
623  *
624  *	Find an interface by name.
625  *	If the name is found a pointer to the device is returned.
626  * 	If the name is not found then %NULL is returned.
627  *	The reference counters are not incremented so the caller must be
628  *	careful with locks. The caller must hold RCU lock.
629  */
630 
631 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
632 {
633 	struct hlist_node *p;
634 	struct net_device *dev;
635 	struct hlist_head *head = dev_name_hash(net, name);
636 
637 	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
638 		if (!strncmp(dev->name, name, IFNAMSIZ))
639 			return dev;
640 
641 	return NULL;
642 }
643 EXPORT_SYMBOL(dev_get_by_name_rcu);
644 
645 /**
646  *	dev_get_by_name		- find a device by its name
647  *	@net: the applicable net namespace
648  *	@name: name to find
649  *
650  *	Find an interface by name. This can be called from any
651  *	context and does its own locking. The returned handle has
652  *	the usage count incremented and the caller must use dev_put() to
653  *	release it when it is no longer needed. %NULL is returned if no
654  *	matching device is found.
655  */
656 
657 struct net_device *dev_get_by_name(struct net *net, const char *name)
658 {
659 	struct net_device *dev;
660 
661 	rcu_read_lock();
662 	dev = dev_get_by_name_rcu(net, name);
663 	if (dev)
664 		dev_hold(dev);
665 	rcu_read_unlock();
666 	return dev;
667 }
668 EXPORT_SYMBOL(dev_get_by_name);
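
/*
 * Illustrative usage sketch (the device name is hypothetical):
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		// ... use dev ...
 *		dev_put(dev);
 *	}
 */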
669 
670 /**
671  *	__dev_get_by_index - find a device by its ifindex
672  *	@net: the applicable net namespace
673  *	@ifindex: index of device
674  *
675  *	Search for an interface by index. Returns a pointer to the device,
676  *	or %NULL if the device is not found. The device has not
677  *	had its reference counter increased so the caller must be careful
678  *	about locking. The caller must hold either the RTNL semaphore
679  *	or @dev_base_lock.
680  */
681 
682 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
683 {
684 	struct hlist_node *p;
685 	struct net_device *dev;
686 	struct hlist_head *head = dev_index_hash(net, ifindex);
687 
688 	hlist_for_each_entry(dev, p, head, index_hlist)
689 		if (dev->ifindex == ifindex)
690 			return dev;
691 
692 	return NULL;
693 }
694 EXPORT_SYMBOL(__dev_get_by_index);
695 
696 /**
697  *	dev_get_by_index_rcu - find a device by its ifindex
698  *	@net: the applicable net namespace
699  *	@ifindex: index of device
700  *
701  *	Search for an interface by index. Returns a pointer to the device,
702  *	or %NULL if the device is not found. The device has not
703  *	had its reference counter increased so the caller must be careful
704  *	about locking. The caller must hold RCU lock.
705  */
706 
707 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
708 {
709 	struct hlist_node *p;
710 	struct net_device *dev;
711 	struct hlist_head *head = dev_index_hash(net, ifindex);
712 
713 	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
714 		if (dev->ifindex == ifindex)
715 			return dev;
716 
717 	return NULL;
718 }
719 EXPORT_SYMBOL(dev_get_by_index_rcu);
720 
721 
722 /**
723  *	dev_get_by_index - find a device by its ifindex
724  *	@net: the applicable net namespace
725  *	@ifindex: index of device
726  *
727  *	Search for an interface by index. Returns a pointer to the device,
728  *	or NULL if the device is not found. The device returned has
729  *	had a reference added and the pointer is safe until the user calls
730  *	dev_put to indicate they have finished with it.
731  */
732 
733 struct net_device *dev_get_by_index(struct net *net, int ifindex)
734 {
735 	struct net_device *dev;
736 
737 	rcu_read_lock();
738 	dev = dev_get_by_index_rcu(net, ifindex);
739 	if (dev)
740 		dev_hold(dev);
741 	rcu_read_unlock();
742 	return dev;
743 }
744 EXPORT_SYMBOL(dev_get_by_index);
745 
746 /**
747  *	dev_getbyhwaddr_rcu - find a device by its hardware address
748  *	@net: the applicable net namespace
749  *	@type: media type of device
750  *	@ha: hardware address
751  *
752  *	Search for an interface by MAC address. Returns a pointer to the
753  *	device, or NULL if the device is not found.
754  *	The caller must hold RCU or RTNL.
755  *	The returned device has not had its ref count increased
756  *	and the caller must therefore be careful about locking
757  *
758  */
759 
760 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
761 				       const char *ha)
762 {
763 	struct net_device *dev;
764 
765 	for_each_netdev_rcu(net, dev)
766 		if (dev->type == type &&
767 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
768 			return dev;
769 
770 	return NULL;
771 }
772 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
773 
774 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
775 {
776 	struct net_device *dev;
777 
778 	ASSERT_RTNL();
779 	for_each_netdev(net, dev)
780 		if (dev->type == type)
781 			return dev;
782 
783 	return NULL;
784 }
785 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
786 
787 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
788 {
789 	struct net_device *dev, *ret = NULL;
790 
791 	rcu_read_lock();
792 	for_each_netdev_rcu(net, dev)
793 		if (dev->type == type) {
794 			dev_hold(dev);
795 			ret = dev;
796 			break;
797 		}
798 	rcu_read_unlock();
799 	return ret;
800 }
801 EXPORT_SYMBOL(dev_getfirstbyhwtype);
802 
803 /**
804  *	dev_get_by_flags_rcu - find any device with given flags
805  *	@net: the applicable net namespace
806  *	@if_flags: IFF_* values
807  *	@mask: bitmask of bits in if_flags to check
808  *
809  *	Search for any interface with the given flags. Returns a pointer to
810  *	the first matching device, or NULL if none is found. Must be called
811  *	inside rcu_read_lock(), and result refcount is unchanged.
812  */
813 
814 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
815 				    unsigned short mask)
816 {
817 	struct net_device *dev, *ret;
818 
819 	ret = NULL;
820 	for_each_netdev_rcu(net, dev) {
821 		if (((dev->flags ^ if_flags) & mask) == 0) {
822 			ret = dev;
823 			break;
824 		}
825 	}
826 	return ret;
827 }
828 EXPORT_SYMBOL(dev_get_by_flags_rcu);
829 
830 /**
831  *	dev_valid_name - check if name is okay for network device
832  *	@name: name string
833  *
834  *	Network device names need to be valid file names to
835  *	allow sysfs to work.  We also disallow any kind of
836  *	whitespace.
837  */
838 int dev_valid_name(const char *name)
839 {
840 	if (*name == '\0')
841 		return 0;
842 	if (strlen(name) >= IFNAMSIZ)
843 		return 0;
844 	if (!strcmp(name, ".") || !strcmp(name, ".."))
845 		return 0;
846 
847 	while (*name) {
848 		if (*name == '/' || isspace(*name))
849 			return 0;
850 		name++;
851 	}
852 	return 1;
853 }
854 EXPORT_SYMBOL(dev_valid_name);
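
/*
 * For example (illustrative): "eth0" and "wlan-1" are accepted, while "",
 * ".", "..", "a/b", "eth 0" and any name of IFNAMSIZ characters or more
 * are all rejected.
 */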
855 
856 /**
857  *	__dev_alloc_name - allocate a name for a device
858  *	@net: network namespace to allocate the device name in
859  *	@name: name format string
860  *	@buf:  scratch buffer and result name string
861  *
862  *	Passed a format string - eg "lt%d" - it will try to find a suitable
863  *	id. It scans the list of devices to build up a free map, then chooses
864  *	the first empty slot. The caller must hold the dev_base or rtnl lock
865  *	while allocating the name and adding the device in order to avoid
866  *	duplicates.
867  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
868  *	Returns the number of the unit assigned or a negative errno code.
869  */
870 
871 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
872 {
873 	int i = 0;
874 	const char *p;
875 	const int max_netdevices = 8*PAGE_SIZE;
876 	unsigned long *inuse;
877 	struct net_device *d;
878 
879 	p = strnchr(name, IFNAMSIZ-1, '%');
880 	if (p) {
881 		/*
882 		 * Verify the string as this thing may have come from
883 		 * the user.  There must be exactly one "%d" and no other "%"
884 		 * characters.
885 		 */
886 		if (p[1] != 'd' || strchr(p + 2, '%'))
887 			return -EINVAL;
888 
889 		/* Use one page as a bit array of possible slots */
890 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
891 		if (!inuse)
892 			return -ENOMEM;
893 
894 		for_each_netdev(net, d) {
895 			if (!sscanf(d->name, name, &i))
896 				continue;
897 			if (i < 0 || i >= max_netdevices)
898 				continue;
899 
900 			/*  avoid cases where sscanf is not exact inverse of printf */
901 			snprintf(buf, IFNAMSIZ, name, i);
902 			if (!strncmp(buf, d->name, IFNAMSIZ))
903 				set_bit(i, inuse);
904 		}
905 
906 		i = find_first_zero_bit(inuse, max_netdevices);
907 		free_page((unsigned long) inuse);
908 	}
909 
910 	if (buf != name)
911 		snprintf(buf, IFNAMSIZ, name, i);
912 	if (!__dev_get_by_name(net, buf))
913 		return i;
914 
915 	/* It is possible to run out of possible slots
916 	 * when the name is long and there isn't enough space left
917 	 * for the digits, or if all bits are used.
918 	 */
919 	return -ENFILE;
920 }
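
/*
 * Illustrative example: with "eth0" and "eth2" already registered,
 * __dev_alloc_name(net, "eth%d", buf) writes "eth1" into buf and returns
 * 1.  With a literal name such as "dummy0" (no '%'), it returns 0 if the
 * name is free and -ENFILE if it is already taken.
 */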
921 
922 /**
923  *	dev_alloc_name - allocate a name for a device
924  *	@dev: device
925  *	@name: name format string
926  *
927  *	Passed a format string - eg "lt%d" - it will try to find a suitable
928  *	id. It scans the list of devices to build up a free map, then chooses
929  *	the first empty slot. The caller must hold the dev_base or rtnl lock
930  *	while allocating the name and adding the device in order to avoid
931  *	duplicates.
932  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
933  *	Returns the number of the unit assigned or a negative errno code.
934  */
935 
936 int dev_alloc_name(struct net_device *dev, const char *name)
937 {
938 	char buf[IFNAMSIZ];
939 	struct net *net;
940 	int ret;
941 
942 	BUG_ON(!dev_net(dev));
943 	net = dev_net(dev);
944 	ret = __dev_alloc_name(net, name, buf);
945 	if (ret >= 0)
946 		strlcpy(dev->name, buf, IFNAMSIZ);
947 	return ret;
948 }
949 EXPORT_SYMBOL(dev_alloc_name);
950 
951 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
952 {
953 	struct net *net;
954 
955 	BUG_ON(!dev_net(dev));
956 	net = dev_net(dev);
957 
958 	if (!dev_valid_name(name))
959 		return -EINVAL;
960 
961 	if (fmt && strchr(name, '%'))
962 		return dev_alloc_name(dev, name);
963 	else if (__dev_get_by_name(net, name))
964 		return -EEXIST;
965 	else if (dev->name != name)
966 		strlcpy(dev->name, name, IFNAMSIZ);
967 
968 	return 0;
969 }
970 
971 /**
972  *	dev_change_name - change name of a device
973  *	@dev: device
974  *	@newname: name (or format string) must be at least IFNAMSIZ
975  *
976  *	Change the name of a device. A format string such as "eth%d"
977  *	can be passed for wildcarding.
978  */
979 int dev_change_name(struct net_device *dev, const char *newname)
980 {
981 	char oldname[IFNAMSIZ];
982 	int err = 0;
983 	int ret;
984 	struct net *net;
985 
986 	ASSERT_RTNL();
987 	BUG_ON(!dev_net(dev));
988 
989 	net = dev_net(dev);
990 	if (dev->flags & IFF_UP)
991 		return -EBUSY;
992 
993 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
994 		return 0;
995 
996 	memcpy(oldname, dev->name, IFNAMSIZ);
997 
998 	err = dev_get_valid_name(dev, newname, 1);
999 	if (err < 0)
1000 		return err;
1001 
1002 rollback:
1003 	ret = device_rename(&dev->dev, dev->name);
1004 	if (ret) {
1005 		memcpy(dev->name, oldname, IFNAMSIZ);
1006 		return ret;
1007 	}
1008 
1009 	write_lock_bh(&dev_base_lock);
1010 	hlist_del(&dev->name_hlist);
1011 	write_unlock_bh(&dev_base_lock);
1012 
1013 	synchronize_rcu();
1014 
1015 	write_lock_bh(&dev_base_lock);
1016 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1017 	write_unlock_bh(&dev_base_lock);
1018 
1019 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1020 	ret = notifier_to_errno(ret);
1021 
1022 	if (ret) {
1023 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1024 		if (err >= 0) {
1025 			err = ret;
1026 			memcpy(dev->name, oldname, IFNAMSIZ);
1027 			goto rollback;
1028 		} else {
1029 			printk(KERN_ERR
1030 			       "%s: name change rollback failed: %d.\n",
1031 			       dev->name, ret);
1032 		}
1033 	}
1034 
1035 	return err;
1036 }
1037 
1038 /**
1039  *	dev_set_alias - change ifalias of a device
1040  *	@dev: device
1041  *	@alias: name up to IFALIASZ
1042  *	@len: limit of bytes to copy from info
1043  *
1044  *	Set ifalias for a device,
1045  */
1046 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1047 {
1048 	ASSERT_RTNL();
1049 
1050 	if (len >= IFALIASZ)
1051 		return -EINVAL;
1052 
1053 	if (!len) {
1054 		if (dev->ifalias) {
1055 			kfree(dev->ifalias);
1056 			dev->ifalias = NULL;
1057 		}
1058 		return 0;
1059 	}
1060 
1061 	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1062 	if (!dev->ifalias)
1063 		return -ENOMEM;
1064 
1065 	strlcpy(dev->ifalias, alias, len+1);
1066 	return len;
1067 }
1068 
1069 
1070 /**
1071  *	netdev_features_change - device changes features
1072  *	@dev: device to cause notification
1073  *
1074  *	Called to indicate a device has changed features.
1075  */
1076 void netdev_features_change(struct net_device *dev)
1077 {
1078 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1079 }
1080 EXPORT_SYMBOL(netdev_features_change);
1081 
1082 /**
1083  *	netdev_state_change - device changes state
1084  *	@dev: device to cause notification
1085  *
1086  *	Called to indicate a device has changed state. This function calls
1087  *	the notifier chains for netdev_chain and sends a NEWLINK message
1088  *	to the routing socket.
1089  */
1090 void netdev_state_change(struct net_device *dev)
1091 {
1092 	if (dev->flags & IFF_UP) {
1093 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1094 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1095 	}
1096 }
1097 EXPORT_SYMBOL(netdev_state_change);
1098 
1099 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1100 {
1101 	return call_netdevice_notifiers(event, dev);
1102 }
1103 EXPORT_SYMBOL(netdev_bonding_change);
1104 
1105 /**
1106  *	dev_load 	- load a network module
1107  *	@net: the applicable net namespace
1108  *	@name: name of interface
1109  *
1110  *	If a network interface is not present and the process has suitable
1111  *	privileges, this function loads the module. If module loading is not
1112  *	available in this kernel then it becomes a nop.
1113  */
1114 
1115 void dev_load(struct net *net, const char *name)
1116 {
1117 	struct net_device *dev;
1118 	int no_module;
1119 
1120 	rcu_read_lock();
1121 	dev = dev_get_by_name_rcu(net, name);
1122 	rcu_read_unlock();
1123 
1124 	no_module = !dev;
1125 	if (no_module && capable(CAP_NET_ADMIN))
1126 		no_module = request_module("netdev-%s", name);
1127 	if (no_module && capable(CAP_SYS_MODULE)) {
1128 		if (!request_module("%s", name))
1129 			pr_err("Loading kernel module for a network device "
1130 "with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1131 "instead\n", name);
1132 	}
1133 }
1134 EXPORT_SYMBOL(dev_load);
1135 
1136 static int __dev_open(struct net_device *dev)
1137 {
1138 	const struct net_device_ops *ops = dev->netdev_ops;
1139 	int ret;
1140 
1141 	ASSERT_RTNL();
1142 
1143 	/*
1144 	 *	Is it even present?
1145 	 */
1146 	if (!netif_device_present(dev))
1147 		return -ENODEV;
1148 
1149 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1150 	ret = notifier_to_errno(ret);
1151 	if (ret)
1152 		return ret;
1153 
1154 	/*
1155 	 *	Call device private open method
1156 	 */
1157 	set_bit(__LINK_STATE_START, &dev->state);
1158 
1159 	if (ops->ndo_validate_addr)
1160 		ret = ops->ndo_validate_addr(dev);
1161 
1162 	if (!ret && ops->ndo_open)
1163 		ret = ops->ndo_open(dev);
1164 
1165 	/*
1166 	 *	If it went open OK then:
1167 	 */
1168 
1169 	if (ret)
1170 		clear_bit(__LINK_STATE_START, &dev->state);
1171 	else {
1172 		/*
1173 		 *	Set the flags.
1174 		 */
1175 		dev->flags |= IFF_UP;
1176 
1177 		/*
1178 		 *	Enable NET_DMA
1179 		 */
1180 		net_dmaengine_get();
1181 
1182 		/*
1183 		 *	Initialize multicasting status
1184 		 */
1185 		dev_set_rx_mode(dev);
1186 
1187 		/*
1188 		 *	Wakeup transmit queue engine
1189 		 */
1190 		dev_activate(dev);
1191 	}
1192 
1193 	return ret;
1194 }
1195 
1196 /**
1197  *	dev_open	- prepare an interface for use.
1198  *	@dev:	device to open
1199  *
1200  *	Takes a device from down to up state. The device's private open
1201  *	function is invoked and then the multicast lists are loaded. Finally
1202  *	the device is moved into the up state and a %NETDEV_UP message is
1203  *	sent to the netdev notifier chain.
1204  *
1205  *	Calling this function on an active interface is a nop. On a failure
1206  *	a negative errno code is returned.
1207  */
1208 int dev_open(struct net_device *dev)
1209 {
1210 	int ret;
1211 
1212 	/*
1213 	 *	Is it already up?
1214 	 */
1215 	if (dev->flags & IFF_UP)
1216 		return 0;
1217 
1218 	/*
1219 	 *	Open device
1220 	 */
1221 	ret = __dev_open(dev);
1222 	if (ret < 0)
1223 		return ret;
1224 
1225 	/*
1226 	 *	... and announce new interface.
1227 	 */
1228 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1229 	call_netdevice_notifiers(NETDEV_UP, dev);
1230 
1231 	return ret;
1232 }
1233 EXPORT_SYMBOL(dev_open);
1234 
1235 static int __dev_close_many(struct list_head *head)
1236 {
1237 	struct net_device *dev;
1238 
1239 	ASSERT_RTNL();
1240 	might_sleep();
1241 
1242 	list_for_each_entry(dev, head, unreg_list) {
1243 		/*
1244 		 *	Tell people we are going down, so that they can
1245 		 *	prepare for its death while the device is still operating.
1246 		 */
1247 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1248 
1249 		clear_bit(__LINK_STATE_START, &dev->state);
1250 
1251 		/* Synchronize to the scheduled poll. We cannot touch the poll
1252 		 * list; it may even be on a different cpu. So just clear
1253 		 * netif_running().
1254 		 * dev->stop() will invoke napi_disable() on all of its
1255 		 * napi_struct instances on this device.
1256 		 */
1257 		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1258 	}
1259 
1260 	dev_deactivate_many(head);
1261 
1262 	list_for_each_entry(dev, head, unreg_list) {
1263 		const struct net_device_ops *ops = dev->netdev_ops;
1264 
1265 		/*
1266 		 *	Call the device specific close. This cannot fail.
1267 		 *	Only if device is UP
1268 		 *
1269 		 *	We allow it to be called even after a DETACH hot-plug
1270 		 *	event.
1271 		 */
1272 		if (ops->ndo_stop)
1273 			ops->ndo_stop(dev);
1274 
1275 		/*
1276 		 *	Device is now down.
1277 		 */
1278 
1279 		dev->flags &= ~IFF_UP;
1280 
1281 		/*
1282 		 *	Shutdown NET_DMA
1283 		 */
1284 		net_dmaengine_put();
1285 	}
1286 
1287 	return 0;
1288 }
1289 
1290 static int __dev_close(struct net_device *dev)
1291 {
1292 	int retval;
1293 	LIST_HEAD(single);
1294 
1295 	list_add(&dev->unreg_list, &single);
1296 	retval = __dev_close_many(&single);
1297 	list_del(&single);
1298 	return retval;
1299 }
1300 
1301 static int dev_close_many(struct list_head *head)
1302 {
1303 	struct net_device *dev, *tmp;
1304 	LIST_HEAD(tmp_list);
1305 
1306 	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1307 		if (!(dev->flags & IFF_UP))
1308 			list_move(&dev->unreg_list, &tmp_list);
1309 
1310 	__dev_close_many(head);
1311 
1312 	/*
1313 	 * Tell people we are down
1314 	 */
1315 	list_for_each_entry(dev, head, unreg_list) {
1316 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1317 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1318 	}
1319 
1320 	/* rollback_registered_many needs the complete original list */
1321 	list_splice(&tmp_list, head);
1322 	return 0;
1323 }
1324 
1325 /**
1326  *	dev_close - shutdown an interface.
1327  *	@dev: device to shutdown
1328  *
1329  *	This function moves an active device into down state. A
1330  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1331  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1332  *	chain.
1333  */
1334 int dev_close(struct net_device *dev)
1335 {
1336 	LIST_HEAD(single);
1337 
1338 	list_add(&dev->unreg_list, &single);
1339 	dev_close_many(&single);
1340 	list_del(&single);
1341 	return 0;
1342 }
1343 EXPORT_SYMBOL(dev_close);
1344 
1345 
1346 /**
1347  *	dev_disable_lro - disable Large Receive Offload on a device
1348  *	@dev: device
1349  *
1350  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1351  *	called under RTNL.  This is needed if received packets may be
1352  *	forwarded to another interface.
1353  */
1354 void dev_disable_lro(struct net_device *dev)
1355 {
1356 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1357 	    dev->ethtool_ops->set_flags) {
1358 		u32 flags = dev->ethtool_ops->get_flags(dev);
1359 		if (flags & ETH_FLAG_LRO) {
1360 			flags &= ~ETH_FLAG_LRO;
1361 			dev->ethtool_ops->set_flags(dev, flags);
1362 		}
1363 	}
1364 	WARN_ON(dev->features & NETIF_F_LRO);
1365 }
1366 EXPORT_SYMBOL(dev_disable_lro);
1367 
1368 
1369 static int dev_boot_phase = 1;
1370 
1371 /*
1372  *	Device change register/unregister. These are not inline or static
1373  *	as we export them to the world.
1374  */
1375 
1376 /**
1377  *	register_netdevice_notifier - register a network notifier block
1378  *	@nb: notifier
1379  *
1380  *	Register a notifier to be called when network device events occur.
1381  *	The notifier passed is linked into the kernel structures and must
1382  *	not be reused until it has been unregistered. A negative errno code
1383  *	is returned on a failure.
1384  *
1385  * 	When registered, all registration and up events are replayed
1386  *	to the new notifier so that it has a race-free
1387  *	view of the network device list.
1388  */
1389 
1390 int register_netdevice_notifier(struct notifier_block *nb)
1391 {
1392 	struct net_device *dev;
1393 	struct net_device *last;
1394 	struct net *net;
1395 	int err;
1396 
1397 	rtnl_lock();
1398 	err = raw_notifier_chain_register(&netdev_chain, nb);
1399 	if (err)
1400 		goto unlock;
1401 	if (dev_boot_phase)
1402 		goto unlock;
1403 	for_each_net(net) {
1404 		for_each_netdev(net, dev) {
1405 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1406 			err = notifier_to_errno(err);
1407 			if (err)
1408 				goto rollback;
1409 
1410 			if (!(dev->flags & IFF_UP))
1411 				continue;
1412 
1413 			nb->notifier_call(nb, NETDEV_UP, dev);
1414 		}
1415 	}
1416 
1417 unlock:
1418 	rtnl_unlock();
1419 	return err;
1420 
1421 rollback:
1422 	last = dev;
1423 	for_each_net(net) {
1424 		for_each_netdev(net, dev) {
1425 			if (dev == last)
1426 				break;
1427 
1428 			if (dev->flags & IFF_UP) {
1429 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1430 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1431 			}
1432 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1433 			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1434 		}
1435 	}
1436 
1437 	raw_notifier_chain_unregister(&netdev_chain, nb);
1438 	goto unlock;
1439 }
1440 EXPORT_SYMBOL(register_netdevice_notifier);
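
/*
 * Illustrative sketch of a notifier (names are hypothetical; in this
 * kernel the notifier's data pointer is the struct net_device itself):
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UP)
 *			printk(KERN_INFO "%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);
 *	...
 *	unregister_netdevice_notifier(&my_netdev_nb);
 */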
1441 
1442 /**
1443  *	unregister_netdevice_notifier - unregister a network notifier block
1444  *	@nb: notifier
1445  *
1446  *	Unregister a notifier previously registered by
1447  *	register_netdevice_notifier(). The notifier is unlinked from the
1448  *	kernel structures and may then be reused. A negative errno code
1449  *	is returned on a failure.
1450  */
1451 
1452 int unregister_netdevice_notifier(struct notifier_block *nb)
1453 {
1454 	int err;
1455 
1456 	rtnl_lock();
1457 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1458 	rtnl_unlock();
1459 	return err;
1460 }
1461 EXPORT_SYMBOL(unregister_netdevice_notifier);
1462 
1463 /**
1464  *	call_netdevice_notifiers - call all network notifier blocks
1465  *      @val: value passed unmodified to notifier function
1466  *      @dev: net_device pointer passed unmodified to notifier function
1467  *
1468  *	Call all network notifier blocks.  Parameters and return value
1469  *	are as for raw_notifier_call_chain().
1470  */
1471 
1472 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1473 {
1474 	ASSERT_RTNL();
1475 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1476 }
1477 
1478 /* When > 0 there are consumers of rx skb time stamps */
1479 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1480 
1481 void net_enable_timestamp(void)
1482 {
1483 	atomic_inc(&netstamp_needed);
1484 }
1485 EXPORT_SYMBOL(net_enable_timestamp);
1486 
1487 void net_disable_timestamp(void)
1488 {
1489 	atomic_dec(&netstamp_needed);
1490 }
1491 EXPORT_SYMBOL(net_disable_timestamp);
1492 
1493 static inline void net_timestamp_set(struct sk_buff *skb)
1494 {
1495 	if (atomic_read(&netstamp_needed))
1496 		__net_timestamp(skb);
1497 	else
1498 		skb->tstamp.tv64 = 0;
1499 }
1500 
1501 static inline void net_timestamp_check(struct sk_buff *skb)
1502 {
1503 	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1504 		__net_timestamp(skb);
1505 }
1506 
1507 /**
1508  * dev_forward_skb - loopback an skb to another netif
1509  *
1510  * @dev: destination network device
1511  * @skb: buffer to forward
1512  *
1513  * return values:
1514  *	NET_RX_SUCCESS	(no congestion)
1515  *	NET_RX_DROP     (packet was dropped, but freed)
1516  *
1517  * dev_forward_skb can be used for injecting an skb from the
1518  * start_xmit function of one device into the receive queue
1519  * of another device.
1520  *
1521  * The receiving device may be in another namespace, so
1522  * we have to clear all information in the skb that could
1523  * impact namespace isolation.
1524  */
1525 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1526 {
1527 	skb_orphan(skb);
1528 	nf_reset(skb);
1529 
1530 	if (unlikely(!(dev->flags & IFF_UP) ||
1531 		     (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
1532 		atomic_long_inc(&dev->rx_dropped);
1533 		kfree_skb(skb);
1534 		return NET_RX_DROP;
1535 	}
1536 	skb_set_dev(skb, dev);
1537 	skb->tstamp.tv64 = 0;
1538 	skb->pkt_type = PACKET_HOST;
1539 	skb->protocol = eth_type_trans(skb, dev);
1540 	return netif_rx(skb);
1541 }
1542 EXPORT_SYMBOL_GPL(dev_forward_skb);
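
/*
 * Illustrative sketch (not part of this file) of the typical use from a
 * virtual driver's start_xmit; my_get_peer() stands in for whatever
 * driver-specific way the paired device is found.
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = my_get_peer(dev);	// hypothetical
 *
 *		if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS)
 *			dev->stats.tx_packets++;
 *		return NETDEV_TX_OK;
 *	}
 */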
1543 
1544 static inline int deliver_skb(struct sk_buff *skb,
1545 			      struct packet_type *pt_prev,
1546 			      struct net_device *orig_dev)
1547 {
1548 	atomic_inc(&skb->users);
1549 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1550 }
1551 
1552 /*
1553  *	Support routine. Sends outgoing frames to any network
1554  *	taps currently in use.
1555  */
1556 
1557 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1558 {
1559 	struct packet_type *ptype;
1560 	struct sk_buff *skb2 = NULL;
1561 	struct packet_type *pt_prev = NULL;
1562 
1563 	rcu_read_lock();
1564 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1565 		/* Never send packets back to the socket
1566 		 * they originated from - MvS (miquels@drinkel.ow.org)
1567 		 */
1568 		if ((ptype->dev == dev || !ptype->dev) &&
1569 		    (ptype->af_packet_priv == NULL ||
1570 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1571 			if (pt_prev) {
1572 				deliver_skb(skb2, pt_prev, skb->dev);
1573 				pt_prev = ptype;
1574 				continue;
1575 			}
1576 
1577 			skb2 = skb_clone(skb, GFP_ATOMIC);
1578 			if (!skb2)
1579 				break;
1580 
1581 			net_timestamp_set(skb2);
1582 
1583 			/* The network header should be correctly
1584 			   set by the sender, so the check below is
1585 			   just protection against buggy protocols.
1586 			 */
1587 			skb_reset_mac_header(skb2);
1588 
1589 			if (skb_network_header(skb2) < skb2->data ||
1590 			    skb2->network_header > skb2->tail) {
1591 				if (net_ratelimit())
1592 					printk(KERN_CRIT "protocol %04x is "
1593 					       "buggy, dev %s\n",
1594 					       ntohs(skb2->protocol),
1595 					       dev->name);
1596 				skb_reset_network_header(skb2);
1597 			}
1598 
1599 			skb2->transport_header = skb2->network_header;
1600 			skb2->pkt_type = PACKET_OUTGOING;
1601 			pt_prev = ptype;
1602 		}
1603 	}
1604 	if (pt_prev)
1605 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1606 	rcu_read_unlock();
1607 }
1608 
1609 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1610  * @dev: Network device
1611  * @txq: number of queues available
1612  *
1613  * If real_num_tx_queues is changed the tc mappings may no longer be
1614  * valid. To resolve this, verify that each tc mapping remains valid and,
1615  * if not, reset the affected priority mappings to TC0; an offset/count
1616  * pair with no priorities mapping to it is simply no longer used. In the
1617  * worst case, when TC0 itself is invalid, nothing can be done, so priority
1618  * mappings are disabled entirely. It is expected that drivers will fix
1619  * this mapping if they can before calling netif_set_real_num_tx_queues.
1620  */
1621 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1622 {
1623 	int i;
1624 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1625 
1626 	/* If TC0 is invalidated disable TC mapping */
1627 	if (tc->offset + tc->count > txq) {
1628 		pr_warning("Number of in use tx queues changed "
1629 			   "invalidating tc mappings. Priority "
1630 			   "traffic classification disabled!\n");
1631 		dev->num_tc = 0;
1632 		return;
1633 	}
1634 
1635 	/* Invalidated prio to tc mappings set to TC0 */
1636 	for (i = 1; i < TC_BITMASK + 1; i++) {
1637 		int q = netdev_get_prio_tc_map(dev, i);
1638 
1639 		tc = &dev->tc_to_txq[q];
1640 		if (tc->offset + tc->count > txq) {
1641 			pr_warning("Number of in use tx queues "
1642 				   "changed. Priority %i to tc "
1643 				   "mapping %i is no longer valid "
1644 				   "setting map to 0\n",
1645 				   i, q);
1646 			netdev_set_prio_tc_map(dev, i, 0);
1647 		}
1648 	}
1649 }
1650 
1651 /*
1652  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1653  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1654  */
1655 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1656 {
1657 	int rc;
1658 
1659 	if (txq < 1 || txq > dev->num_tx_queues)
1660 		return -EINVAL;
1661 
1662 	if (dev->reg_state == NETREG_REGISTERED ||
1663 	    dev->reg_state == NETREG_UNREGISTERING) {
1664 		ASSERT_RTNL();
1665 
1666 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1667 						  txq);
1668 		if (rc)
1669 			return rc;
1670 
1671 		if (dev->num_tc)
1672 			netif_setup_tc(dev, txq);
1673 
1674 		if (txq < dev->real_num_tx_queues)
1675 			qdisc_reset_all_tx_gt(dev, txq);
1676 	}
1677 
1678 	dev->real_num_tx_queues = txq;
1679 	return 0;
1680 }
1681 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
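
/*
 * Illustrative example: a driver that allocated its net_device with 8 tx
 * queues but then detects only 4 usable hardware rings can call, under
 * rtnl_lock(),
 *
 *	netif_set_real_num_tx_queues(dev, 4);
 *
 * any skbs already queued to queues 4-7 are flushed by
 * qdisc_reset_all_tx_gt() above.
 */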
1682 
1683 #ifdef CONFIG_RPS
1684 /**
1685  *	netif_set_real_num_rx_queues - set actual number of RX queues used
1686  *	@dev: Network device
1687  *	@rxq: Actual number of RX queues
1688  *
1689  *	This must be called either with the rtnl_lock held or before
1690  *	registration of the net device.  Returns 0 on success, or a
1691  *	negative error code.  If called before registration, it always
1692  *	succeeds.
1693  */
1694 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1695 {
1696 	int rc;
1697 
1698 	if (rxq < 1 || rxq > dev->num_rx_queues)
1699 		return -EINVAL;
1700 
1701 	if (dev->reg_state == NETREG_REGISTERED) {
1702 		ASSERT_RTNL();
1703 
1704 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1705 						  rxq);
1706 		if (rc)
1707 			return rc;
1708 	}
1709 
1710 	dev->real_num_rx_queues = rxq;
1711 	return 0;
1712 }
1713 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1714 #endif
1715 
1716 static inline void __netif_reschedule(struct Qdisc *q)
1717 {
1718 	struct softnet_data *sd;
1719 	unsigned long flags;
1720 
1721 	local_irq_save(flags);
1722 	sd = &__get_cpu_var(softnet_data);
1723 	q->next_sched = NULL;
1724 	*sd->output_queue_tailp = q;
1725 	sd->output_queue_tailp = &q->next_sched;
1726 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1727 	local_irq_restore(flags);
1728 }
1729 
1730 void __netif_schedule(struct Qdisc *q)
1731 {
1732 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1733 		__netif_reschedule(q);
1734 }
1735 EXPORT_SYMBOL(__netif_schedule);
1736 
1737 void dev_kfree_skb_irq(struct sk_buff *skb)
1738 {
1739 	if (atomic_dec_and_test(&skb->users)) {
1740 		struct softnet_data *sd;
1741 		unsigned long flags;
1742 
1743 		local_irq_save(flags);
1744 		sd = &__get_cpu_var(softnet_data);
1745 		skb->next = sd->completion_queue;
1746 		sd->completion_queue = skb;
1747 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1748 		local_irq_restore(flags);
1749 	}
1750 }
1751 EXPORT_SYMBOL(dev_kfree_skb_irq);
1752 
1753 void dev_kfree_skb_any(struct sk_buff *skb)
1754 {
1755 	if (in_irq() || irqs_disabled())
1756 		dev_kfree_skb_irq(skb);
1757 	else
1758 		dev_kfree_skb(skb);
1759 }
1760 EXPORT_SYMBOL(dev_kfree_skb_any);
1761 
1762 
1763 /**
1764  * netif_device_detach - mark device as removed
1765  * @dev: network device
1766  *
1767  * Mark device as removed from system and therefore no longer available.
1768  */
1769 void netif_device_detach(struct net_device *dev)
1770 {
1771 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1772 	    netif_running(dev)) {
1773 		netif_tx_stop_all_queues(dev);
1774 	}
1775 }
1776 EXPORT_SYMBOL(netif_device_detach);
1777 
1778 /**
1779  * netif_device_attach - mark device as attached
1780  * @dev: network device
1781  *
1782  * Mark device as attached to the system and restart it if needed.
1783  */
1784 void netif_device_attach(struct net_device *dev)
1785 {
1786 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1787 	    netif_running(dev)) {
1788 		netif_tx_wake_all_queues(dev);
1789 		__netdev_watchdog_up(dev);
1790 	}
1791 }
1792 EXPORT_SYMBOL(netif_device_attach);
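
/*
 * Illustrative sketch (hypothetical PCI driver): netif_device_detach() is
 * typically called from a suspend handler before the hardware is powered
 * down, and netif_device_attach() from resume after reinitialisation, so
 * the stack stops handing the device packets while it is unavailable.
 *
 *	static int my_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *netdev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(netdev);
 *		// ... stop hardware, save state ...
 *		return 0;
 *	}
 */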
1793 
1794 /**
1795  * skb_dev_set -- assign a new device to a buffer
1796  * @skb: buffer for the new device
1797  * @dev: network device
1798  *
1799  * If an skb is owned by a device already, we have to reset
1800  * all data private to the namespace a device belongs to
1801  * before assigning it a new device.
1802  */
1803 #ifdef CONFIG_NET_NS
1804 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1805 {
1806 	skb_dst_drop(skb);
1807 	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1808 		secpath_reset(skb);
1809 		nf_reset(skb);
1810 		skb_init_secmark(skb);
1811 		skb->mark = 0;
1812 		skb->priority = 0;
1813 		skb->nf_trace = 0;
1814 		skb->ipvs_property = 0;
1815 #ifdef CONFIG_NET_SCHED
1816 		skb->tc_index = 0;
1817 #endif
1818 	}
1819 	skb->dev = dev;
1820 }
1821 EXPORT_SYMBOL(skb_set_dev);
1822 #endif /* CONFIG_NET_NS */
1823 
1824 /*
1825  * Invalidate hardware checksum when packet is to be mangled, and
1826  * complete checksum manually on outgoing path.
1827  */
1828 int skb_checksum_help(struct sk_buff *skb)
1829 {
1830 	__wsum csum;
1831 	int ret = 0, offset;
1832 
1833 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1834 		goto out_set_summed;
1835 
1836 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1837 		/* Let GSO fix up the checksum. */
1838 		goto out_set_summed;
1839 	}
1840 
1841 	offset = skb_checksum_start_offset(skb);
1842 	BUG_ON(offset >= skb_headlen(skb));
1843 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1844 
1845 	offset += skb->csum_offset;
1846 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1847 
1848 	if (skb_cloned(skb) &&
1849 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1850 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1851 		if (ret)
1852 			goto out;
1853 	}
1854 
1855 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1856 out_set_summed:
1857 	skb->ip_summed = CHECKSUM_NONE;
1858 out:
1859 	return ret;
1860 }
1861 EXPORT_SYMBOL(skb_checksum_help);
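
/*
 * Illustrative use (hypothetical driver): if the hardware cannot checksum
 * this particular frame, resolve the checksum in software before handing
 * the skb to the device; my_hw_can_csum() is a made-up capability check.
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    !my_hw_can_csum(skb) && skb_checksum_help(skb))
 *		goto drop;
 */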
1862 
1863 /**
1864  *	skb_gso_segment - Perform segmentation on skb.
1865  *	@skb: buffer to segment
1866  *	@features: features for the output path (see dev->features)
1867  *
1868  *	This function segments the given skb and returns a list of segments.
1869  *
1870  *	It may return NULL if the skb requires no segmentation.  This is
1871  *	only possible when GSO is used for verifying header integrity.
1872  */
1873 struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1874 {
1875 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1876 	struct packet_type *ptype;
1877 	__be16 type = skb->protocol;
1878 	int vlan_depth = ETH_HLEN;
1879 	int err;
1880 
1881 	while (type == htons(ETH_P_8021Q)) {
1882 		struct vlan_hdr *vh;
1883 
1884 		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1885 			return ERR_PTR(-EINVAL);
1886 
1887 		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1888 		type = vh->h_vlan_encapsulated_proto;
1889 		vlan_depth += VLAN_HLEN;
1890 	}
1891 
1892 	skb_reset_mac_header(skb);
1893 	skb->mac_len = skb->network_header - skb->mac_header;
1894 	__skb_pull(skb, skb->mac_len);
1895 
1896 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1897 		struct net_device *dev = skb->dev;
1898 		struct ethtool_drvinfo info = {};
1899 
1900 		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1901 			dev->ethtool_ops->get_drvinfo(dev, &info);
1902 
1903 		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1904 		     info.driver, dev ? dev->features : 0L,
1905 		     skb->sk ? skb->sk->sk_route_caps : 0L,
1906 		     skb->len, skb->data_len, skb->ip_summed);
1907 
1908 		if (skb_header_cloned(skb) &&
1909 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1910 			return ERR_PTR(err);
1911 	}
1912 
1913 	rcu_read_lock();
1914 	list_for_each_entry_rcu(ptype,
1915 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1916 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1917 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1918 				err = ptype->gso_send_check(skb);
1919 				segs = ERR_PTR(err);
1920 				if (err || skb_gso_ok(skb, features))
1921 					break;
1922 				__skb_push(skb, (skb->data -
1923 						 skb_network_header(skb)));
1924 			}
1925 			segs = ptype->gso_segment(skb, features);
1926 			break;
1927 		}
1928 	}
1929 	rcu_read_unlock();
1930 
1931 	__skb_push(skb, skb->data - skb_mac_header(skb));
1932 
1933 	return segs;
1934 }
1935 EXPORT_SYMBOL(skb_gso_segment);
1936 
1937 /* Take action when hardware reception checksum errors are detected. */
1938 #ifdef CONFIG_BUG
1939 void netdev_rx_csum_fault(struct net_device *dev)
1940 {
1941 	if (net_ratelimit()) {
1942 		printk(KERN_ERR "%s: hw csum failure.\n",
1943 			dev ? dev->name : "<unknown>");
1944 		dump_stack();
1945 	}
1946 }
1947 EXPORT_SYMBOL(netdev_rx_csum_fault);
1948 #endif
1949 
1950 /* Actually, we should eliminate this check as soon as we know that:
1951  * 1. An IOMMU is present and allows mapping all the memory.
1952  * 2. No high memory really exists on this machine.
1953  */
1954 
1955 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1956 {
1957 #ifdef CONFIG_HIGHMEM
1958 	int i;
1959 	if (!(dev->features & NETIF_F_HIGHDMA)) {
1960 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1961 			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1962 				return 1;
1963 	}
1964 
1965 	if (PCI_DMA_BUS_IS_PHYS) {
1966 		struct device *pdev = dev->dev.parent;
1967 
1968 		if (!pdev)
1969 			return 0;
1970 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1971 			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1972 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1973 				return 1;
1974 		}
1975 	}
1976 #endif
1977 	return 0;
1978 }
1979 
1980 struct dev_gso_cb {
1981 	void (*destructor)(struct sk_buff *skb);
1982 };
1983 
1984 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1985 
1986 static void dev_gso_skb_destructor(struct sk_buff *skb)
1987 {
1988 	struct dev_gso_cb *cb;
1989 
1990 	do {
1991 		struct sk_buff *nskb = skb->next;
1992 
1993 		skb->next = nskb->next;
1994 		nskb->next = NULL;
1995 		kfree_skb(nskb);
1996 	} while (skb->next);
1997 
1998 	cb = DEV_GSO_CB(skb);
1999 	if (cb->destructor)
2000 		cb->destructor(skb);
2001 }
2002 
2003 /**
2004  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
2005  *	@skb: buffer to segment
2006  *	@features: device features as applicable to this skb
2007  *
2008  *	This function segments the given skb and stores the list of segments
2009  *	in skb->next.
2010  */
2011 static int dev_gso_segment(struct sk_buff *skb, int features)
2012 {
2013 	struct sk_buff *segs;
2014 
2015 	segs = skb_gso_segment(skb, features);
2016 
2017 	/* Verifying header integrity only. */
2018 	if (!segs)
2019 		return 0;
2020 
2021 	if (IS_ERR(segs))
2022 		return PTR_ERR(segs);
2023 
2024 	skb->next = segs;
2025 	DEV_GSO_CB(skb)->destructor = skb->destructor;
2026 	skb->destructor = dev_gso_skb_destructor;
2027 
2028 	return 0;
2029 }
2030 
2031 /*
2032  * Try to orphan skb early, right before transmission by the device.
2033  * We cannot orphan the skb if a tx timestamp is requested or the sk reference
2034  * is needed at the driver level for other reasons, e.g. see net/can/raw.c
2035  */
2036 static inline void skb_orphan_try(struct sk_buff *skb)
2037 {
2038 	struct sock *sk = skb->sk;
2039 
2040 	if (sk && !skb_shinfo(skb)->tx_flags) {
2041 		/* skb_tx_hash() won't be able to get the sk.
2042 		 * We copy sk_hash into skb->rxhash
2043 		 */
2044 		if (!skb->rxhash)
2045 			skb->rxhash = sk->sk_hash;
2046 		skb_orphan(skb);
2047 	}
2048 }
2049 
2050 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2051 {
2052 	return ((features & NETIF_F_GEN_CSUM) ||
2053 		((features & NETIF_F_V4_CSUM) &&
2054 		 protocol == htons(ETH_P_IP)) ||
2055 		((features & NETIF_F_V6_CSUM) &&
2056 		 protocol == htons(ETH_P_IPV6)) ||
2057 		((features & NETIF_F_FCOE_CRC) &&
2058 		 protocol == htons(ETH_P_FCOE)));
2059 }
2060 
2061 static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2062 {
2063 	if (!can_checksum_protocol(features, protocol)) {
2064 		features &= ~NETIF_F_ALL_CSUM;
2065 		features &= ~NETIF_F_SG;
2066 	} else if (illegal_highdma(skb->dev, skb)) {
2067 		features &= ~NETIF_F_SG;
2068 	}
2069 
2070 	return features;
2071 }
2072 
2073 u32 netif_skb_features(struct sk_buff *skb)
2074 {
2075 	__be16 protocol = skb->protocol;
2076 	u32 features = skb->dev->features;
2077 
2078 	if (protocol == htons(ETH_P_8021Q)) {
2079 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2080 		protocol = veh->h_vlan_encapsulated_proto;
2081 	} else if (!vlan_tx_tag_present(skb)) {
2082 		return harmonize_features(skb, protocol, features);
2083 	}
2084 
2085 	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2086 
2087 	if (protocol != htons(ETH_P_8021Q)) {
2088 		return harmonize_features(skb, protocol, features);
2089 	} else {
2090 		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2091 				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2092 		return harmonize_features(skb, protocol, features);
2093 	}
2094 }
2095 EXPORT_SYMBOL(netif_skb_features);
2096 
2097 /*
2098  * Returns true if either:
2099  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2100  *	2. skb is fragmented and the device does not support SG, or if
2101  *	   at least one of the fragments is in highmem and the device does not
2102  *	   support DMA from it.
2103  */
2104 static inline int skb_needs_linearize(struct sk_buff *skb,
2105 				      int features)
2106 {
2107 	return skb_is_nonlinear(skb) &&
2108 			((skb_has_frag_list(skb) &&
2109 				!(features & NETIF_F_FRAGLIST)) ||
2110 			(skb_shinfo(skb)->nr_frags &&
2111 				!(features & NETIF_F_SG)));
2112 }
2113 
2114 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2115 			struct netdev_queue *txq)
2116 {
2117 	const struct net_device_ops *ops = dev->netdev_ops;
2118 	int rc = NETDEV_TX_OK;
2119 
2120 	if (likely(!skb->next)) {
2121 		u32 features;
2122 
2123 		/*
2124 		 * If the device doesn't need skb->dst, release it right now while
2125 		 * it's hot in this CPU's cache.
2126 		 */
2127 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2128 			skb_dst_drop(skb);
2129 
2130 		if (!list_empty(&ptype_all))
2131 			dev_queue_xmit_nit(skb, dev);
2132 
2133 		skb_orphan_try(skb);
2134 
2135 		features = netif_skb_features(skb);
2136 
2137 		if (vlan_tx_tag_present(skb) &&
2138 		    !(features & NETIF_F_HW_VLAN_TX)) {
2139 			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2140 			if (unlikely(!skb))
2141 				goto out;
2142 
2143 			skb->vlan_tci = 0;
2144 		}
2145 
2146 		if (netif_needs_gso(skb, features)) {
2147 			if (unlikely(dev_gso_segment(skb, features)))
2148 				goto out_kfree_skb;
2149 			if (skb->next)
2150 				goto gso;
2151 		} else {
2152 			if (skb_needs_linearize(skb, features) &&
2153 			    __skb_linearize(skb))
2154 				goto out_kfree_skb;
2155 
2156 			/* If the packet is not checksummed and the device does
2157 			 * not support checksumming for this protocol, complete
2158 			 * the checksum here.
2159 			 */
2160 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2161 				skb_set_transport_header(skb,
2162 					skb_checksum_start_offset(skb));
2163 				if (!(features & NETIF_F_ALL_CSUM) &&
2164 				     skb_checksum_help(skb))
2165 					goto out_kfree_skb;
2166 			}
2167 		}
2168 
2169 		rc = ops->ndo_start_xmit(skb, dev);
2170 		trace_net_dev_xmit(skb, rc);
2171 		if (rc == NETDEV_TX_OK)
2172 			txq_trans_update(txq);
2173 		return rc;
2174 	}
2175 
2176 gso:
2177 	do {
2178 		struct sk_buff *nskb = skb->next;
2179 
2180 		skb->next = nskb->next;
2181 		nskb->next = NULL;
2182 
2183 		/*
2184 		 * If the device doesn't need nskb->dst, release it right now while
2185 		 * it's hot in this CPU's cache.
2186 		 */
2187 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2188 			skb_dst_drop(nskb);
2189 
2190 		rc = ops->ndo_start_xmit(nskb, dev);
2191 		trace_net_dev_xmit(nskb, rc);
2192 		if (unlikely(rc != NETDEV_TX_OK)) {
2193 			if (rc & ~NETDEV_TX_MASK)
2194 				goto out_kfree_gso_skb;
2195 			nskb->next = skb->next;
2196 			skb->next = nskb;
2197 			return rc;
2198 		}
2199 		txq_trans_update(txq);
2200 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2201 			return NETDEV_TX_BUSY;
2202 	} while (skb->next);
2203 
2204 out_kfree_gso_skb:
2205 	if (likely(skb->next == NULL))
2206 		skb->destructor = DEV_GSO_CB(skb)->destructor;
2207 out_kfree_skb:
2208 	kfree_skb(skb);
2209 out:
2210 	return rc;
2211 }
2212 
2213 static u32 hashrnd __read_mostly;
2214 
2215 /*
2216  * Returns a Tx hash based on the given packet descriptor and the number of
2217  * Tx queues to be used as a distribution range.
2218  */
2219 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2220 		  unsigned int num_tx_queues)
2221 {
2222 	u32 hash;
2223 	u16 qoffset = 0;
2224 	u16 qcount = num_tx_queues;
2225 
2226 	if (skb_rx_queue_recorded(skb)) {
2227 		hash = skb_get_rx_queue(skb);
2228 		while (unlikely(hash >= num_tx_queues))
2229 			hash -= num_tx_queues;
2230 		return hash;
2231 	}
2232 
2233 	if (dev->num_tc) {
2234 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2235 		qoffset = dev->tc_to_txq[tc].offset;
2236 		qcount = dev->tc_to_txq[tc].count;
2237 	}
2238 
2239 	if (skb->sk && skb->sk->sk_hash)
2240 		hash = skb->sk->sk_hash;
2241 	else
2242 		hash = (__force u16) skb->protocol ^ skb->rxhash;
2243 	hash = jhash_1word(hash, hashrnd);
2244 
2245 	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2246 }
2247 EXPORT_SYMBOL(__skb_tx_hash);
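
/*
 * Illustrative usage sketch (not part of this file): a multiqueue driver
 * that is happy with the stack's default distribution can simply return
 * skb_tx_hash() (the inline wrapper around __skb_tx_hash() that passes
 * dev->real_num_tx_queues) from its ndo_select_queue() callback.
 *
 *	static u16 example_select_queue(struct net_device *dev,
 *					struct sk_buff *skb)
 *	{
 *		return skb_tx_hash(dev, skb);
 *	}
 */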
2248 
2249 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2250 {
2251 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2252 		if (net_ratelimit()) {
2253 			pr_warning("%s selects TX queue %d, but "
2254 				"real number of TX queues is %d\n",
2255 				dev->name, queue_index, dev->real_num_tx_queues);
2256 		}
2257 		return 0;
2258 	}
2259 	return queue_index;
2260 }
2261 
2262 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2263 {
2264 #ifdef CONFIG_XPS
2265 	struct xps_dev_maps *dev_maps;
2266 	struct xps_map *map;
2267 	int queue_index = -1;
2268 
2269 	rcu_read_lock();
2270 	dev_maps = rcu_dereference(dev->xps_maps);
2271 	if (dev_maps) {
2272 		map = rcu_dereference(
2273 		    dev_maps->cpu_map[raw_smp_processor_id()]);
2274 		if (map) {
2275 			if (map->len == 1)
2276 				queue_index = map->queues[0];
2277 			else {
2278 				u32 hash;
2279 				if (skb->sk && skb->sk->sk_hash)
2280 					hash = skb->sk->sk_hash;
2281 				else
2282 					hash = (__force u16) skb->protocol ^
2283 					    skb->rxhash;
2284 				hash = jhash_1word(hash, hashrnd);
2285 				queue_index = map->queues[
2286 				    ((u64)hash * map->len) >> 32];
2287 			}
2288 			if (unlikely(queue_index >= dev->real_num_tx_queues))
2289 				queue_index = -1;
2290 		}
2291 	}
2292 	rcu_read_unlock();
2293 
2294 	return queue_index;
2295 #else
2296 	return -1;
2297 #endif
2298 }
2299 
2300 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2301 					struct sk_buff *skb)
2302 {
2303 	int queue_index;
2304 	const struct net_device_ops *ops = dev->netdev_ops;
2305 
2306 	if (dev->real_num_tx_queues == 1)
2307 		queue_index = 0;
2308 	else if (ops->ndo_select_queue) {
2309 		queue_index = ops->ndo_select_queue(dev, skb);
2310 		queue_index = dev_cap_txqueue(dev, queue_index);
2311 	} else {
2312 		struct sock *sk = skb->sk;
2313 		queue_index = sk_tx_queue_get(sk);
2314 
2315 		if (queue_index < 0 || skb->ooo_okay ||
2316 		    queue_index >= dev->real_num_tx_queues) {
2317 			int old_index = queue_index;
2318 
2319 			queue_index = get_xps_queue(dev, skb);
2320 			if (queue_index < 0)
2321 				queue_index = skb_tx_hash(dev, skb);
2322 
2323 			if (queue_index != old_index && sk) {
2324 				struct dst_entry *dst =
2325 				    rcu_dereference_check(sk->sk_dst_cache, 1);
2326 
2327 				if (dst && skb_dst(skb) == dst)
2328 					sk_tx_queue_set(sk, queue_index);
2329 			}
2330 		}
2331 	}
2332 
2333 	skb_set_queue_mapping(skb, queue_index);
2334 	return netdev_get_tx_queue(dev, queue_index);
2335 }
2336 
2337 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2338 				 struct net_device *dev,
2339 				 struct netdev_queue *txq)
2340 {
2341 	spinlock_t *root_lock = qdisc_lock(q);
2342 	bool contended;
2343 	int rc;
2344 
2345 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2346 	qdisc_calculate_pkt_len(skb, q);
2347 	/*
2348 	 * Heuristic to force contended enqueues to serialize on a
2349 	 * separate lock before trying to get the qdisc main lock.
2350 	 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2351 	 * and dequeue packets faster.
2352 	 */
2353 	contended = qdisc_is_running(q);
2354 	if (unlikely(contended))
2355 		spin_lock(&q->busylock);
2356 
2357 	spin_lock(root_lock);
2358 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2359 		kfree_skb(skb);
2360 		rc = NET_XMIT_DROP;
2361 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2362 		   qdisc_run_begin(q)) {
2363 		/*
2364 		 * This is a work-conserving queue; there are no old skbs
2365 		 * waiting to be sent out; and the qdisc is not running -
2366 		 * xmit the skb directly.
2367 		 */
2368 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2369 			skb_dst_force(skb);
2370 
2371 		qdisc_bstats_update(q, skb);
2372 
2373 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2374 			if (unlikely(contended)) {
2375 				spin_unlock(&q->busylock);
2376 				contended = false;
2377 			}
2378 			__qdisc_run(q);
2379 		} else
2380 			qdisc_run_end(q);
2381 
2382 		rc = NET_XMIT_SUCCESS;
2383 	} else {
2384 		skb_dst_force(skb);
2385 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2386 		if (qdisc_run_begin(q)) {
2387 			if (unlikely(contended)) {
2388 				spin_unlock(&q->busylock);
2389 				contended = false;
2390 			}
2391 			__qdisc_run(q);
2392 		}
2393 	}
2394 	spin_unlock(root_lock);
2395 	if (unlikely(contended))
2396 		spin_unlock(&q->busylock);
2397 	return rc;
2398 }
2399 
2400 static DEFINE_PER_CPU(int, xmit_recursion);
2401 #define RECURSION_LIMIT 10
2402 
2403 /**
2404  *	dev_queue_xmit - transmit a buffer
2405  *	@skb: buffer to transmit
2406  *
2407  *	Queue a buffer for transmission to a network device. The caller must
2408  *	have set the device and priority and built the buffer before calling
2409  *	this function. The function can be called from an interrupt.
2410  *
2411  *	A negative errno code is returned on a failure. A success does not
2412  *	guarantee the frame will be transmitted as it may be dropped due
2413  *	to congestion or traffic shaping.
2414  *
2415  * -----------------------------------------------------------------------------------
2416  *      I notice this method can also return errors from the queue disciplines,
2417  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2418  *      be positive.
2419  *
2420  *      Regardless of the return value, the skb is consumed, so it is currently
2421  *      difficult to retry a send to this method.  (You can bump the ref count
2422  *      before sending to hold a reference for retry if you are careful.)
2423  *
2424  *      When calling this method, interrupts MUST be enabled.  This is because
2425  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2426  *          --BLG
2427  */
2428 int dev_queue_xmit(struct sk_buff *skb)
2429 {
2430 	struct net_device *dev = skb->dev;
2431 	struct netdev_queue *txq;
2432 	struct Qdisc *q;
2433 	int rc = -ENOMEM;
2434 
2435 	/* Disable soft irqs for various locks below. Also
2436 	 * stops preemption for RCU.
2437 	 */
2438 	rcu_read_lock_bh();
2439 
2440 	txq = dev_pick_tx(dev, skb);
2441 	q = rcu_dereference_bh(txq->qdisc);
2442 
2443 #ifdef CONFIG_NET_CLS_ACT
2444 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2445 #endif
2446 	trace_net_dev_queue(skb);
2447 	if (q->enqueue) {
2448 		rc = __dev_xmit_skb(skb, q, dev, txq);
2449 		goto out;
2450 	}
2451 
2452 	/* The device has no queue. Common case for software devices:
2453 	   loopback, all sorts of tunnels...
2454 
2455 	   Really, it is unlikely that netif_tx_lock protection is necessary
2456 	   here.  (E.g. loopback and IP tunnels are clean, ignoring the
2457 	   statistics counters.)
2458 	   However, it is possible that they rely on the protection
2459 	   we provide here.
2460 
2461 	   Check this and take the lock; it is not prone to deadlocks.
2462 	   Or use the noqueue qdisc, which is even simpler 8)
2463 	 */
2464 	if (dev->flags & IFF_UP) {
2465 		int cpu = smp_processor_id(); /* ok because BHs are off */
2466 
2467 		if (txq->xmit_lock_owner != cpu) {
2468 
2469 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2470 				goto recursion_alert;
2471 
2472 			HARD_TX_LOCK(dev, txq, cpu);
2473 
2474 			if (!netif_tx_queue_stopped(txq)) {
2475 				__this_cpu_inc(xmit_recursion);
2476 				rc = dev_hard_start_xmit(skb, dev, txq);
2477 				__this_cpu_dec(xmit_recursion);
2478 				if (dev_xmit_complete(rc)) {
2479 					HARD_TX_UNLOCK(dev, txq);
2480 					goto out;
2481 				}
2482 			}
2483 			HARD_TX_UNLOCK(dev, txq);
2484 			if (net_ratelimit())
2485 				printk(KERN_CRIT "Virtual device %s asks to "
2486 				       "queue packet!\n", dev->name);
2487 		} else {
2488 			/* Recursion is detected! It is possible,
2489 			 * unfortunately
2490 			 */
2491 recursion_alert:
2492 			if (net_ratelimit())
2493 				printk(KERN_CRIT "Dead loop on virtual device "
2494 				       "%s, fix it urgently!\n", dev->name);
2495 		}
2496 	}
2497 
2498 	rc = -ENETDOWN;
2499 	rcu_read_unlock_bh();
2500 
2501 	kfree_skb(skb);
2502 	return rc;
2503 out:
2504 	rcu_read_unlock_bh();
2505 	return rc;
2506 }
2507 EXPORT_SYMBOL(dev_queue_xmit);
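
/*
 * Illustrative usage sketch (not part of this file): minimal transmit path
 * for kernel code that has already built an skb.  Per the comment above,
 * the skb is consumed whether or not the call succeeds.
 *
 *	static void example_send(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		skb->dev = dev;
 *		if (dev_queue_xmit(skb) != NET_XMIT_SUCCESS)
 *			example_count_drop(dev);  // dropped, shaped or congested (hypothetical stat)
 *	}
 */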
2508 
2509 
2510 /*=======================================================================
2511 			Receiver routines
2512   =======================================================================*/
2513 
2514 int netdev_max_backlog __read_mostly = 1000;
2515 int netdev_tstamp_prequeue __read_mostly = 1;
2516 int netdev_budget __read_mostly = 300;
2517 int weight_p __read_mostly = 64;            /* old backlog weight */
2518 
2519 /* Called with irq disabled */
2520 static inline void ____napi_schedule(struct softnet_data *sd,
2521 				     struct napi_struct *napi)
2522 {
2523 	list_add_tail(&napi->poll_list, &sd->poll_list);
2524 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2525 }
2526 
2527 /*
2528  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2529  * and src/dst port numbers. Returns a non-zero hash number on success
2530  * and 0 on failure.
2531  */
2532 __u32 __skb_get_rxhash(struct sk_buff *skb)
2533 {
2534 	int nhoff, hash = 0, poff;
2535 	struct ipv6hdr *ip6;
2536 	struct iphdr *ip;
2537 	u8 ip_proto;
2538 	u32 addr1, addr2, ihl;
2539 	union {
2540 		u32 v32;
2541 		u16 v16[2];
2542 	} ports;
2543 
2544 	nhoff = skb_network_offset(skb);
2545 
2546 	switch (skb->protocol) {
2547 	case __constant_htons(ETH_P_IP):
2548 		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2549 			goto done;
2550 
2551 		ip = (struct iphdr *) (skb->data + nhoff);
2552 		if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2553 			ip_proto = 0;
2554 		else
2555 			ip_proto = ip->protocol;
2556 		addr1 = (__force u32) ip->saddr;
2557 		addr2 = (__force u32) ip->daddr;
2558 		ihl = ip->ihl;
2559 		break;
2560 	case __constant_htons(ETH_P_IPV6):
2561 		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2562 			goto done;
2563 
2564 		ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2565 		ip_proto = ip6->nexthdr;
2566 		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2567 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2568 		ihl = (40 >> 2);
2569 		break;
2570 	default:
2571 		goto done;
2572 	}
2573 
2574 	ports.v32 = 0;
2575 	poff = proto_ports_offset(ip_proto);
2576 	if (poff >= 0) {
2577 		nhoff += ihl * 4 + poff;
2578 		if (pskb_may_pull(skb, nhoff + 4)) {
2579 			ports.v32 = * (__force u32 *) (skb->data + nhoff);
2580 			if (ports.v16[1] < ports.v16[0])
2581 				swap(ports.v16[0], ports.v16[1]);
2582 		}
2583 	}
2584 
2585 	/* get a consistent hash (same value on both flow directions) */
2586 	if (addr2 < addr1)
2587 		swap(addr1, addr2);
2588 
2589 	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2590 	if (!hash)
2591 		hash = 1;
2592 
2593 done:
2594 	return hash;
2595 }
2596 EXPORT_SYMBOL(__skb_get_rxhash);
2597 
2598 #ifdef CONFIG_RPS
2599 
2600 /* One global table that all flow-based protocols share. */
2601 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2602 EXPORT_SYMBOL(rps_sock_flow_table);
2603 
2604 static struct rps_dev_flow *
2605 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2606 	    struct rps_dev_flow *rflow, u16 next_cpu)
2607 {
2608 	u16 tcpu;
2609 
2610 	tcpu = rflow->cpu = next_cpu;
2611 	if (tcpu != RPS_NO_CPU) {
2612 #ifdef CONFIG_RFS_ACCEL
2613 		struct netdev_rx_queue *rxqueue;
2614 		struct rps_dev_flow_table *flow_table;
2615 		struct rps_dev_flow *old_rflow;
2616 		u32 flow_id;
2617 		u16 rxq_index;
2618 		int rc;
2619 
2620 		/* Should we steer this flow to a different hardware queue? */
2621 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2622 		    !(dev->features & NETIF_F_NTUPLE))
2623 			goto out;
2624 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2625 		if (rxq_index == skb_get_rx_queue(skb))
2626 			goto out;
2627 
2628 		rxqueue = dev->_rx + rxq_index;
2629 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2630 		if (!flow_table)
2631 			goto out;
2632 		flow_id = skb->rxhash & flow_table->mask;
2633 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2634 							rxq_index, flow_id);
2635 		if (rc < 0)
2636 			goto out;
2637 		old_rflow = rflow;
2638 		rflow = &flow_table->flows[flow_id];
2639 		rflow->cpu = next_cpu;
2640 		rflow->filter = rc;
2641 		if (old_rflow->filter == rflow->filter)
2642 			old_rflow->filter = RPS_NO_FILTER;
2643 	out:
2644 #endif
2645 		rflow->last_qtail =
2646 			per_cpu(softnet_data, tcpu).input_queue_head;
2647 	}
2648 
2649 	return rflow;
2650 }
2651 
2652 /*
2653  * get_rps_cpu is called from netif_receive_skb and returns the target
2654  * CPU from the RPS map of the receiving queue for a given skb.
2655  * rcu_read_lock must be held on entry.
2656  */
2657 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2658 		       struct rps_dev_flow **rflowp)
2659 {
2660 	struct netdev_rx_queue *rxqueue;
2661 	struct rps_map *map;
2662 	struct rps_dev_flow_table *flow_table;
2663 	struct rps_sock_flow_table *sock_flow_table;
2664 	int cpu = -1;
2665 	u16 tcpu;
2666 
2667 	if (skb_rx_queue_recorded(skb)) {
2668 		u16 index = skb_get_rx_queue(skb);
2669 		if (unlikely(index >= dev->real_num_rx_queues)) {
2670 			WARN_ONCE(dev->real_num_rx_queues > 1,
2671 				  "%s received packet on queue %u, but number "
2672 				  "of RX queues is %u\n",
2673 				  dev->name, index, dev->real_num_rx_queues);
2674 			goto done;
2675 		}
2676 		rxqueue = dev->_rx + index;
2677 	} else
2678 		rxqueue = dev->_rx;
2679 
2680 	map = rcu_dereference(rxqueue->rps_map);
2681 	if (map) {
2682 		if (map->len == 1 &&
2683 		    !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2684 			tcpu = map->cpus[0];
2685 			if (cpu_online(tcpu))
2686 				cpu = tcpu;
2687 			goto done;
2688 		}
2689 	} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2690 		goto done;
2691 	}
2692 
2693 	skb_reset_network_header(skb);
2694 	if (!skb_get_rxhash(skb))
2695 		goto done;
2696 
2697 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2698 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2699 	if (flow_table && sock_flow_table) {
2700 		u16 next_cpu;
2701 		struct rps_dev_flow *rflow;
2702 
2703 		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2704 		tcpu = rflow->cpu;
2705 
2706 		next_cpu = sock_flow_table->ents[skb->rxhash &
2707 		    sock_flow_table->mask];
2708 
2709 		/*
2710 		 * If the desired CPU (where last recvmsg was done) is
2711 		 * different from current CPU (one in the rx-queue flow
2712 		 * table entry), switch if one of the following holds:
2713 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2714 		 *   - Current CPU is offline.
2715 		 *   - The current CPU's queue tail has advanced beyond the
2716 		 *     last packet that was enqueued using this table entry.
2717 		 *     This guarantees that all previous packets for the flow
2718 		 *     have been dequeued, thus preserving in-order delivery.
2719 		 */
2720 		if (unlikely(tcpu != next_cpu) &&
2721 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2722 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2723 		      rflow->last_qtail)) >= 0))
2724 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2725 
2726 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2727 			*rflowp = rflow;
2728 			cpu = tcpu;
2729 			goto done;
2730 		}
2731 	}
2732 
2733 	if (map) {
2734 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2735 
2736 		if (cpu_online(tcpu)) {
2737 			cpu = tcpu;
2738 			goto done;
2739 		}
2740 	}
2741 
2742 done:
2743 	return cpu;
2744 }
2745 
2746 #ifdef CONFIG_RFS_ACCEL
2747 
2748 /**
2749  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2750  * @dev: Device on which the filter was set
2751  * @rxq_index: RX queue index
2752  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2753  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2754  *
2755  * Drivers that implement ndo_rx_flow_steer() should periodically call
2756  * this function for each installed filter and remove the filters for
2757  * which it returns %true.
2758  */
2759 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2760 			 u32 flow_id, u16 filter_id)
2761 {
2762 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2763 	struct rps_dev_flow_table *flow_table;
2764 	struct rps_dev_flow *rflow;
2765 	bool expire = true;
2766 	int cpu;
2767 
2768 	rcu_read_lock();
2769 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2770 	if (flow_table && flow_id <= flow_table->mask) {
2771 		rflow = &flow_table->flows[flow_id];
2772 		cpu = ACCESS_ONCE(rflow->cpu);
2773 		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2774 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2775 			   rflow->last_qtail) <
2776 		     (int)(10 * flow_table->mask)))
2777 			expire = false;
2778 	}
2779 	rcu_read_unlock();
2780 	return expire;
2781 }
2782 EXPORT_SYMBOL(rps_may_expire_flow);
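
/*
 * Illustrative usage sketch (not part of this file): a driver implementing
 * ndo_rx_flow_steer() would periodically walk the filters it has installed
 * and remove those for which rps_may_expire_flow() returns true.  The
 * filter bookkeeping (example_filter, example_filter_list(), ...) is
 * invented for the example.
 *
 *	static void example_expire_filters(struct net_device *dev, u16 rxq)
 *	{
 *		struct example_filter *f, *tmp;
 *
 *		list_for_each_entry_safe(f, tmp,
 *					 example_filter_list(dev, rxq), list) {
 *			if (rps_may_expire_flow(dev, rxq, f->flow_id,
 *						f->filter_id))
 *				example_remove_filter(dev, rxq, f);
 *		}
 *	}
 */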
2783 
2784 #endif /* CONFIG_RFS_ACCEL */
2785 
2786 /* Called from hardirq (IPI) context */
2787 static void rps_trigger_softirq(void *data)
2788 {
2789 	struct softnet_data *sd = data;
2790 
2791 	____napi_schedule(sd, &sd->backlog);
2792 	sd->received_rps++;
2793 }
2794 
2795 #endif /* CONFIG_RPS */
2796 
2797 /*
2798  * Check if this softnet_data structure belongs to another CPU.
2799  * If so, queue it on our IPI list and return 1;
2800  * if not, return 0.
2801  */
2802 static int rps_ipi_queued(struct softnet_data *sd)
2803 {
2804 #ifdef CONFIG_RPS
2805 	struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2806 
2807 	if (sd != mysd) {
2808 		sd->rps_ipi_next = mysd->rps_ipi_list;
2809 		mysd->rps_ipi_list = sd;
2810 
2811 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2812 		return 1;
2813 	}
2814 #endif /* CONFIG_RPS */
2815 	return 0;
2816 }
2817 
2818 /*
2819  * enqueue_to_backlog is called to queue an skb to a per-CPU backlog
2820  * queue (may be a remote CPU queue).
2821  */
2822 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2823 			      unsigned int *qtail)
2824 {
2825 	struct softnet_data *sd;
2826 	unsigned long flags;
2827 
2828 	sd = &per_cpu(softnet_data, cpu);
2829 
2830 	local_irq_save(flags);
2831 
2832 	rps_lock(sd);
2833 	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2834 		if (skb_queue_len(&sd->input_pkt_queue)) {
2835 enqueue:
2836 			__skb_queue_tail(&sd->input_pkt_queue, skb);
2837 			input_queue_tail_incr_save(sd, qtail);
2838 			rps_unlock(sd);
2839 			local_irq_restore(flags);
2840 			return NET_RX_SUCCESS;
2841 		}
2842 
2843 		/* Schedule NAPI for the backlog device.
2844 		 * We can use a non-atomic operation since we own the queue lock.
2845 		 */
2846 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2847 			if (!rps_ipi_queued(sd))
2848 				____napi_schedule(sd, &sd->backlog);
2849 		}
2850 		goto enqueue;
2851 	}
2852 
2853 	sd->dropped++;
2854 	rps_unlock(sd);
2855 
2856 	local_irq_restore(flags);
2857 
2858 	atomic_long_inc(&skb->dev->rx_dropped);
2859 	kfree_skb(skb);
2860 	return NET_RX_DROP;
2861 }
2862 
2863 /**
2864  *	netif_rx	-	post buffer to the network code
2865  *	@skb: buffer to post
2866  *
2867  *	This function receives a packet from a device driver and queues it for
2868  *	the upper (protocol) levels to process.  It always succeeds. The buffer
2869  *	may be dropped during processing for congestion control or by the
2870  *	protocol layers.
2871  *
2872  *	return values:
2873  *	NET_RX_SUCCESS	(no congestion)
2874  *	NET_RX_DROP     (packet was dropped)
2875  *
2876  */
2877 
2878 int netif_rx(struct sk_buff *skb)
2879 {
2880 	int ret;
2881 
2882 	/* if netpoll wants it, pretend we never saw it */
2883 	if (netpoll_rx(skb))
2884 		return NET_RX_DROP;
2885 
2886 	if (netdev_tstamp_prequeue)
2887 		net_timestamp_check(skb);
2888 
2889 	trace_netif_rx(skb);
2890 #ifdef CONFIG_RPS
2891 	{
2892 		struct rps_dev_flow voidflow, *rflow = &voidflow;
2893 		int cpu;
2894 
2895 		preempt_disable();
2896 		rcu_read_lock();
2897 
2898 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2899 		if (cpu < 0)
2900 			cpu = smp_processor_id();
2901 
2902 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2903 
2904 		rcu_read_unlock();
2905 		preempt_enable();
2906 	}
2907 #else
2908 	{
2909 		unsigned int qtail;
2910 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2911 		put_cpu();
2912 	}
2913 #endif
2914 	return ret;
2915 }
2916 EXPORT_SYMBOL(netif_rx);
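
/*
 * Illustrative usage sketch (not part of this file): a non-NAPI driver's
 * receive path copies each frame into a fresh skb, sets the protocol and
 * hands it to the stack with netif_rx().  example_rx_one() is hypothetical.
 *
 *	static void example_rx_one(struct net_device *dev, void *data, int len)
 *	{
 *		struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);
 *
 *		if (!skb) {
 *			dev->stats.rx_dropped++;
 *			return;
 *		}
 *		memcpy(skb_put(skb, len), data, len);
 *		skb->protocol = eth_type_trans(skb, dev);
 *		netif_rx(skb);
 *	}
 */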
2917 
2918 int netif_rx_ni(struct sk_buff *skb)
2919 {
2920 	int err;
2921 
2922 	preempt_disable();
2923 	err = netif_rx(skb);
2924 	if (local_softirq_pending())
2925 		do_softirq();
2926 	preempt_enable();
2927 
2928 	return err;
2929 }
2930 EXPORT_SYMBOL(netif_rx_ni);
2931 
2932 static void net_tx_action(struct softirq_action *h)
2933 {
2934 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2935 
2936 	if (sd->completion_queue) {
2937 		struct sk_buff *clist;
2938 
2939 		local_irq_disable();
2940 		clist = sd->completion_queue;
2941 		sd->completion_queue = NULL;
2942 		local_irq_enable();
2943 
2944 		while (clist) {
2945 			struct sk_buff *skb = clist;
2946 			clist = clist->next;
2947 
2948 			WARN_ON(atomic_read(&skb->users));
2949 			trace_kfree_skb(skb, net_tx_action);
2950 			__kfree_skb(skb);
2951 		}
2952 	}
2953 
2954 	if (sd->output_queue) {
2955 		struct Qdisc *head;
2956 
2957 		local_irq_disable();
2958 		head = sd->output_queue;
2959 		sd->output_queue = NULL;
2960 		sd->output_queue_tailp = &sd->output_queue;
2961 		local_irq_enable();
2962 
2963 		while (head) {
2964 			struct Qdisc *q = head;
2965 			spinlock_t *root_lock;
2966 
2967 			head = head->next_sched;
2968 
2969 			root_lock = qdisc_lock(q);
2970 			if (spin_trylock(root_lock)) {
2971 				smp_mb__before_clear_bit();
2972 				clear_bit(__QDISC_STATE_SCHED,
2973 					  &q->state);
2974 				qdisc_run(q);
2975 				spin_unlock(root_lock);
2976 			} else {
2977 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2978 					      &q->state)) {
2979 					__netif_reschedule(q);
2980 				} else {
2981 					smp_mb__before_clear_bit();
2982 					clear_bit(__QDISC_STATE_SCHED,
2983 						  &q->state);
2984 				}
2985 			}
2986 		}
2987 	}
2988 }
2989 
2990 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2991     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2992 /* This hook is defined here for ATM LANE */
2993 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2994 			     unsigned char *addr) __read_mostly;
2995 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2996 #endif
2997 
2998 #ifdef CONFIG_NET_CLS_ACT
2999 /* TODO: Maybe we should just force sch_ingress to be compiled in
3000  * when CONFIG_NET_CLS_ACT is?  Otherwise we pay for a few useless
3001  * instructions (a compare and two extra stores) when sch_ingress is
3002  * not built but CONFIG_NET_CLS_ACT is.
3003  * NOTE: This doesn't remove any functionality; if you don't have
3004  * the ingress scheduler, you just can't add policies on ingress.
3005  *
3006  */
3007 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3008 {
3009 	struct net_device *dev = skb->dev;
3010 	u32 ttl = G_TC_RTTL(skb->tc_verd);
3011 	int result = TC_ACT_OK;
3012 	struct Qdisc *q;
3013 
3014 	if (unlikely(MAX_RED_LOOP < ttl++)) {
3015 		if (net_ratelimit())
3016 			pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
3017 			       skb->skb_iif, dev->ifindex);
3018 		return TC_ACT_SHOT;
3019 	}
3020 
3021 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3022 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3023 
3024 	q = rxq->qdisc;
3025 	if (q != &noop_qdisc) {
3026 		spin_lock(qdisc_lock(q));
3027 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3028 			result = qdisc_enqueue_root(skb, q);
3029 		spin_unlock(qdisc_lock(q));
3030 	}
3031 
3032 	return result;
3033 }
3034 
3035 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3036 					 struct packet_type **pt_prev,
3037 					 int *ret, struct net_device *orig_dev)
3038 {
3039 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3040 
3041 	if (!rxq || rxq->qdisc == &noop_qdisc)
3042 		goto out;
3043 
3044 	if (*pt_prev) {
3045 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3046 		*pt_prev = NULL;
3047 	}
3048 
3049 	switch (ing_filter(skb, rxq)) {
3050 	case TC_ACT_SHOT:
3051 	case TC_ACT_STOLEN:
3052 		kfree_skb(skb);
3053 		return NULL;
3054 	}
3055 
3056 out:
3057 	skb->tc_verd = 0;
3058 	return skb;
3059 }
3060 #endif
3061 
3062 /**
3063  *	netdev_rx_handler_register - register receive handler
3064  *	@dev: device to register a handler for
3065  *	@rx_handler: receive handler to register
3066  *	@rx_handler_data: data pointer that is used by rx handler
3067  *
3068  *	Register a receive handler for a device. This handler will then be
3069  *	called from __netif_receive_skb. A negative errno code is returned
3070  *	on a failure.
3071  *
3072  *	The caller must hold the rtnl_mutex.
3073  *
3074  *	For a general description of rx_handler, see enum rx_handler_result.
3075  */
3076 int netdev_rx_handler_register(struct net_device *dev,
3077 			       rx_handler_func_t *rx_handler,
3078 			       void *rx_handler_data)
3079 {
3080 	ASSERT_RTNL();
3081 
3082 	if (dev->rx_handler)
3083 		return -EBUSY;
3084 
3085 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3086 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3087 
3088 	return 0;
3089 }
3090 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
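
/*
 * Illustrative usage sketch (not part of this file): how a stacking driver
 * (bridge-, bonding- or macvlan-like) might install an rx_handler.  The
 * example_port structure and helpers are hypothetical.
 *
 *	static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
 *	{
 *		struct example_port *port =
 *			rcu_dereference((*pskb)->dev->rx_handler_data);
 *
 *		example_note_frame(port, *pskb);	// hypothetical
 *		return RX_HANDLER_PASS;			// let normal RX continue
 *	}
 *
 *	static int example_add_port(struct net_device *dev,
 *				    struct example_port *port)
 *	{
 *		int err;
 *
 *		rtnl_lock();
 *		err = netdev_rx_handler_register(dev, example_handle_frame,
 *						 port);
 *		rtnl_unlock();
 *		return err;
 *	}
 */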
3091 
3092 /**
3093  *	netdev_rx_handler_unregister - unregister receive handler
3094  *	@dev: device to unregister a handler from
3095  *
3096  *	Unregister a receive handler from a device.
3097  *
3098  *	The caller must hold the rtnl_mutex.
3099  */
3100 void netdev_rx_handler_unregister(struct net_device *dev)
3101 {
3102 
3103 	ASSERT_RTNL();
3104 	rcu_assign_pointer(dev->rx_handler, NULL);
3105 	rcu_assign_pointer(dev->rx_handler_data, NULL);
3106 }
3107 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3108 
3109 static void vlan_on_bond_hook(struct sk_buff *skb)
3110 {
3111 	/*
3112 	 * Make sure ARP frames received on VLAN interfaces stacked on
3113 	 * bonding interfaces still make their way to any base bonding
3114 	 * device that may have registered for a specific ptype.
3115 	 */
3116 	if (skb->dev->priv_flags & IFF_802_1Q_VLAN &&
3117 	    vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING &&
3118 	    skb->protocol == htons(ETH_P_ARP)) {
3119 		struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
3120 
3121 		if (!skb2)
3122 			return;
3123 		skb2->dev = vlan_dev_real_dev(skb->dev);
3124 		netif_rx(skb2);
3125 	}
3126 }
3127 
3128 static int __netif_receive_skb(struct sk_buff *skb)
3129 {
3130 	struct packet_type *ptype, *pt_prev;
3131 	rx_handler_func_t *rx_handler;
3132 	struct net_device *orig_dev;
3133 	struct net_device *null_or_dev;
3134 	bool deliver_exact = false;
3135 	int ret = NET_RX_DROP;
3136 	__be16 type;
3137 
3138 	if (!netdev_tstamp_prequeue)
3139 		net_timestamp_check(skb);
3140 
3141 	trace_netif_receive_skb(skb);
3142 
3143 	/* if we've gotten here through NAPI, check netpoll */
3144 	if (netpoll_receive_skb(skb))
3145 		return NET_RX_DROP;
3146 
3147 	if (!skb->skb_iif)
3148 		skb->skb_iif = skb->dev->ifindex;
3149 	orig_dev = skb->dev;
3150 
3151 	skb_reset_network_header(skb);
3152 	skb_reset_transport_header(skb);
3153 	skb->mac_len = skb->network_header - skb->mac_header;
3154 
3155 	pt_prev = NULL;
3156 
3157 	rcu_read_lock();
3158 
3159 another_round:
3160 
3161 	__this_cpu_inc(softnet_data.processed);
3162 
3163 #ifdef CONFIG_NET_CLS_ACT
3164 	if (skb->tc_verd & TC_NCLS) {
3165 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3166 		goto ncls;
3167 	}
3168 #endif
3169 
3170 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3171 		if (!ptype->dev || ptype->dev == skb->dev) {
3172 			if (pt_prev)
3173 				ret = deliver_skb(skb, pt_prev, orig_dev);
3174 			pt_prev = ptype;
3175 		}
3176 	}
3177 
3178 #ifdef CONFIG_NET_CLS_ACT
3179 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3180 	if (!skb)
3181 		goto out;
3182 ncls:
3183 #endif
3184 
3185 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3186 	if (rx_handler) {
3187 		if (pt_prev) {
3188 			ret = deliver_skb(skb, pt_prev, orig_dev);
3189 			pt_prev = NULL;
3190 		}
3191 		switch (rx_handler(&skb)) {
3192 		case RX_HANDLER_CONSUMED:
3193 			goto out;
3194 		case RX_HANDLER_ANOTHER:
3195 			goto another_round;
3196 		case RX_HANDLER_EXACT:
3197 			deliver_exact = true;
3198 		case RX_HANDLER_PASS:
3199 			break;
3200 		default:
3201 			BUG();
3202 		}
3203 	}
3204 
3205 	if (vlan_tx_tag_present(skb)) {
3206 		if (pt_prev) {
3207 			ret = deliver_skb(skb, pt_prev, orig_dev);
3208 			pt_prev = NULL;
3209 		}
3210 		if (vlan_hwaccel_do_receive(&skb)) {
3211 			ret = __netif_receive_skb(skb);
3212 			goto out;
3213 		} else if (unlikely(!skb))
3214 			goto out;
3215 	}
3216 
3217 	vlan_on_bond_hook(skb);
3218 
3219 	/* deliver only exact match when indicated */
3220 	null_or_dev = deliver_exact ? skb->dev : NULL;
3221 
3222 	type = skb->protocol;
3223 	list_for_each_entry_rcu(ptype,
3224 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3225 		if (ptype->type == type &&
3226 		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3227 		     ptype->dev == orig_dev)) {
3228 			if (pt_prev)
3229 				ret = deliver_skb(skb, pt_prev, orig_dev);
3230 			pt_prev = ptype;
3231 		}
3232 	}
3233 
3234 	if (pt_prev) {
3235 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3236 	} else {
3237 		atomic_long_inc(&skb->dev->rx_dropped);
3238 		kfree_skb(skb);
3239 		/* Jamal, now you will not be able to escape explaining
3240 		 * to me how you were going to use this. :-)
3241 		 */
3242 		ret = NET_RX_DROP;
3243 	}
3244 
3245 out:
3246 	rcu_read_unlock();
3247 	return ret;
3248 }
3249 
3250 /**
3251  *	netif_receive_skb - process receive buffer from network
3252  *	@skb: buffer to process
3253  *
3254  *	netif_receive_skb() is the main receive data processing function.
3255  *	It always succeeds. The buffer may be dropped during processing
3256  *	for congestion control or by the protocol layers.
3257  *
3258  *	This function may only be called from softirq context and interrupts
3259  *	should be enabled.
3260  *
3261  *	Return values (usually ignored):
3262  *	NET_RX_SUCCESS: no congestion
3263  *	NET_RX_DROP: packet was dropped
3264  */
3265 int netif_receive_skb(struct sk_buff *skb)
3266 {
3267 	if (netdev_tstamp_prequeue)
3268 		net_timestamp_check(skb);
3269 
3270 	if (skb_defer_rx_timestamp(skb))
3271 		return NET_RX_SUCCESS;
3272 
3273 #ifdef CONFIG_RPS
3274 	{
3275 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3276 		int cpu, ret;
3277 
3278 		rcu_read_lock();
3279 
3280 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3281 
3282 		if (cpu >= 0) {
3283 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3284 			rcu_read_unlock();
3285 		} else {
3286 			rcu_read_unlock();
3287 			ret = __netif_receive_skb(skb);
3288 		}
3289 
3290 		return ret;
3291 	}
3292 #else
3293 	return __netif_receive_skb(skb);
3294 #endif
3295 }
3296 EXPORT_SYMBOL(netif_receive_skb);
3297 
3298 /* Network device is going away; flush any packets still pending.
3299  * Called with irqs disabled.
3300  */
3301 static void flush_backlog(void *arg)
3302 {
3303 	struct net_device *dev = arg;
3304 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3305 	struct sk_buff *skb, *tmp;
3306 
3307 	rps_lock(sd);
3308 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3309 		if (skb->dev == dev) {
3310 			__skb_unlink(skb, &sd->input_pkt_queue);
3311 			kfree_skb(skb);
3312 			input_queue_head_incr(sd);
3313 		}
3314 	}
3315 	rps_unlock(sd);
3316 
3317 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3318 		if (skb->dev == dev) {
3319 			__skb_unlink(skb, &sd->process_queue);
3320 			kfree_skb(skb);
3321 			input_queue_head_incr(sd);
3322 		}
3323 	}
3324 }
3325 
3326 static int napi_gro_complete(struct sk_buff *skb)
3327 {
3328 	struct packet_type *ptype;
3329 	__be16 type = skb->protocol;
3330 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3331 	int err = -ENOENT;
3332 
3333 	if (NAPI_GRO_CB(skb)->count == 1) {
3334 		skb_shinfo(skb)->gso_size = 0;
3335 		goto out;
3336 	}
3337 
3338 	rcu_read_lock();
3339 	list_for_each_entry_rcu(ptype, head, list) {
3340 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3341 			continue;
3342 
3343 		err = ptype->gro_complete(skb);
3344 		break;
3345 	}
3346 	rcu_read_unlock();
3347 
3348 	if (err) {
3349 		WARN_ON(&ptype->list == head);
3350 		kfree_skb(skb);
3351 		return NET_RX_SUCCESS;
3352 	}
3353 
3354 out:
3355 	return netif_receive_skb(skb);
3356 }
3357 
3358 inline void napi_gro_flush(struct napi_struct *napi)
3359 {
3360 	struct sk_buff *skb, *next;
3361 
3362 	for (skb = napi->gro_list; skb; skb = next) {
3363 		next = skb->next;
3364 		skb->next = NULL;
3365 		napi_gro_complete(skb);
3366 	}
3367 
3368 	napi->gro_count = 0;
3369 	napi->gro_list = NULL;
3370 }
3371 EXPORT_SYMBOL(napi_gro_flush);
3372 
3373 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3374 {
3375 	struct sk_buff **pp = NULL;
3376 	struct packet_type *ptype;
3377 	__be16 type = skb->protocol;
3378 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3379 	int same_flow;
3380 	int mac_len;
3381 	enum gro_result ret;
3382 
3383 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3384 		goto normal;
3385 
3386 	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3387 		goto normal;
3388 
3389 	rcu_read_lock();
3390 	list_for_each_entry_rcu(ptype, head, list) {
3391 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3392 			continue;
3393 
3394 		skb_set_network_header(skb, skb_gro_offset(skb));
3395 		mac_len = skb->network_header - skb->mac_header;
3396 		skb->mac_len = mac_len;
3397 		NAPI_GRO_CB(skb)->same_flow = 0;
3398 		NAPI_GRO_CB(skb)->flush = 0;
3399 		NAPI_GRO_CB(skb)->free = 0;
3400 
3401 		pp = ptype->gro_receive(&napi->gro_list, skb);
3402 		break;
3403 	}
3404 	rcu_read_unlock();
3405 
3406 	if (&ptype->list == head)
3407 		goto normal;
3408 
3409 	same_flow = NAPI_GRO_CB(skb)->same_flow;
3410 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3411 
3412 	if (pp) {
3413 		struct sk_buff *nskb = *pp;
3414 
3415 		*pp = nskb->next;
3416 		nskb->next = NULL;
3417 		napi_gro_complete(nskb);
3418 		napi->gro_count--;
3419 	}
3420 
3421 	if (same_flow)
3422 		goto ok;
3423 
3424 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3425 		goto normal;
3426 
3427 	napi->gro_count++;
3428 	NAPI_GRO_CB(skb)->count = 1;
3429 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3430 	skb->next = napi->gro_list;
3431 	napi->gro_list = skb;
3432 	ret = GRO_HELD;
3433 
3434 pull:
3435 	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3436 		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3437 
3438 		BUG_ON(skb->end - skb->tail < grow);
3439 
3440 		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3441 
3442 		skb->tail += grow;
3443 		skb->data_len -= grow;
3444 
3445 		skb_shinfo(skb)->frags[0].page_offset += grow;
3446 		skb_shinfo(skb)->frags[0].size -= grow;
3447 
3448 		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3449 			put_page(skb_shinfo(skb)->frags[0].page);
3450 			memmove(skb_shinfo(skb)->frags,
3451 				skb_shinfo(skb)->frags + 1,
3452 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3453 		}
3454 	}
3455 
3456 ok:
3457 	return ret;
3458 
3459 normal:
3460 	ret = GRO_NORMAL;
3461 	goto pull;
3462 }
3463 EXPORT_SYMBOL(dev_gro_receive);
3464 
3465 static inline gro_result_t
3466 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3467 {
3468 	struct sk_buff *p;
3469 
3470 	for (p = napi->gro_list; p; p = p->next) {
3471 		unsigned long diffs;
3472 
3473 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3474 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3475 		diffs |= compare_ether_header(skb_mac_header(p),
3476 					      skb_gro_mac_header(skb));
3477 		NAPI_GRO_CB(p)->same_flow = !diffs;
3478 		NAPI_GRO_CB(p)->flush = 0;
3479 	}
3480 
3481 	return dev_gro_receive(napi, skb);
3482 }
3483 
3484 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3485 {
3486 	switch (ret) {
3487 	case GRO_NORMAL:
3488 		if (netif_receive_skb(skb))
3489 			ret = GRO_DROP;
3490 		break;
3491 
3492 	case GRO_DROP:
3493 	case GRO_MERGED_FREE:
3494 		kfree_skb(skb);
3495 		break;
3496 
3497 	case GRO_HELD:
3498 	case GRO_MERGED:
3499 		break;
3500 	}
3501 
3502 	return ret;
3503 }
3504 EXPORT_SYMBOL(napi_skb_finish);
3505 
3506 void skb_gro_reset_offset(struct sk_buff *skb)
3507 {
3508 	NAPI_GRO_CB(skb)->data_offset = 0;
3509 	NAPI_GRO_CB(skb)->frag0 = NULL;
3510 	NAPI_GRO_CB(skb)->frag0_len = 0;
3511 
3512 	if (skb->mac_header == skb->tail &&
3513 	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3514 		NAPI_GRO_CB(skb)->frag0 =
3515 			page_address(skb_shinfo(skb)->frags[0].page) +
3516 			skb_shinfo(skb)->frags[0].page_offset;
3517 		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3518 	}
3519 }
3520 EXPORT_SYMBOL(skb_gro_reset_offset);
3521 
3522 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3523 {
3524 	skb_gro_reset_offset(skb);
3525 
3526 	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3527 }
3528 EXPORT_SYMBOL(napi_gro_receive);
3529 
3530 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3531 {
3532 	__skb_pull(skb, skb_headlen(skb));
3533 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3534 	skb->vlan_tci = 0;
3535 	skb->dev = napi->dev;
3536 	skb->skb_iif = 0;
3537 
3538 	napi->skb = skb;
3539 }
3540 
3541 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3542 {
3543 	struct sk_buff *skb = napi->skb;
3544 
3545 	if (!skb) {
3546 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3547 		if (skb)
3548 			napi->skb = skb;
3549 	}
3550 	return skb;
3551 }
3552 EXPORT_SYMBOL(napi_get_frags);
3553 
3554 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3555 			       gro_result_t ret)
3556 {
3557 	switch (ret) {
3558 	case GRO_NORMAL:
3559 	case GRO_HELD:
3560 		skb->protocol = eth_type_trans(skb, skb->dev);
3561 
3562 		if (ret == GRO_HELD)
3563 			skb_gro_pull(skb, -ETH_HLEN);
3564 		else if (netif_receive_skb(skb))
3565 			ret = GRO_DROP;
3566 		break;
3567 
3568 	case GRO_DROP:
3569 	case GRO_MERGED_FREE:
3570 		napi_reuse_skb(napi, skb);
3571 		break;
3572 
3573 	case GRO_MERGED:
3574 		break;
3575 	}
3576 
3577 	return ret;
3578 }
3579 EXPORT_SYMBOL(napi_frags_finish);
3580 
3581 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3582 {
3583 	struct sk_buff *skb = napi->skb;
3584 	struct ethhdr *eth;
3585 	unsigned int hlen;
3586 	unsigned int off;
3587 
3588 	napi->skb = NULL;
3589 
3590 	skb_reset_mac_header(skb);
3591 	skb_gro_reset_offset(skb);
3592 
3593 	off = skb_gro_offset(skb);
3594 	hlen = off + sizeof(*eth);
3595 	eth = skb_gro_header_fast(skb, off);
3596 	if (skb_gro_header_hard(skb, hlen)) {
3597 		eth = skb_gro_header_slow(skb, hlen, off);
3598 		if (unlikely(!eth)) {
3599 			napi_reuse_skb(napi, skb);
3600 			skb = NULL;
3601 			goto out;
3602 		}
3603 	}
3604 
3605 	skb_gro_pull(skb, sizeof(*eth));
3606 
3607 	/*
3608 	 * This works because the only protocols we care about don't require
3609 	 * special handling.  We'll fix it up properly at the end.
3610 	 */
3611 	skb->protocol = eth->h_proto;
3612 
3613 out:
3614 	return skb;
3615 }
3616 EXPORT_SYMBOL(napi_frags_skb);
3617 
3618 gro_result_t napi_gro_frags(struct napi_struct *napi)
3619 {
3620 	struct sk_buff *skb = napi_frags_skb(napi);
3621 
3622 	if (!skb)
3623 		return GRO_DROP;
3624 
3625 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3626 }
3627 EXPORT_SYMBOL(napi_gro_frags);
3628 
3629 /*
3630  * net_rps_action sends any pending IPI's for rps.
3631  * Note: called with local irq disabled, but exits with local irq enabled.
3632  */
3633 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3634 {
3635 #ifdef CONFIG_RPS
3636 	struct softnet_data *remsd = sd->rps_ipi_list;
3637 
3638 	if (remsd) {
3639 		sd->rps_ipi_list = NULL;
3640 
3641 		local_irq_enable();
3642 
3643 		/* Send pending IPI's to kick RPS processing on remote cpus. */
3644 		while (remsd) {
3645 			struct softnet_data *next = remsd->rps_ipi_next;
3646 
3647 			if (cpu_online(remsd->cpu))
3648 				__smp_call_function_single(remsd->cpu,
3649 							   &remsd->csd, 0);
3650 			remsd = next;
3651 		}
3652 	} else
3653 #endif
3654 		local_irq_enable();
3655 }
3656 
3657 static int process_backlog(struct napi_struct *napi, int quota)
3658 {
3659 	int work = 0;
3660 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3661 
3662 #ifdef CONFIG_RPS
3663 	/* Check if we have pending IPIs; it's better to send them now
3664 	 * than to wait for net_rx_action() to end.
3665 	 */
3666 	if (sd->rps_ipi_list) {
3667 		local_irq_disable();
3668 		net_rps_action_and_irq_enable(sd);
3669 	}
3670 #endif
3671 	napi->weight = weight_p;
3672 	local_irq_disable();
3673 	while (work < quota) {
3674 		struct sk_buff *skb;
3675 		unsigned int qlen;
3676 
3677 		while ((skb = __skb_dequeue(&sd->process_queue))) {
3678 			local_irq_enable();
3679 			__netif_receive_skb(skb);
3680 			local_irq_disable();
3681 			input_queue_head_incr(sd);
3682 			if (++work >= quota) {
3683 				local_irq_enable();
3684 				return work;
3685 			}
3686 		}
3687 
3688 		rps_lock(sd);
3689 		qlen = skb_queue_len(&sd->input_pkt_queue);
3690 		if (qlen)
3691 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3692 						   &sd->process_queue);
3693 
3694 		if (qlen < quota - work) {
3695 			/*
3696 			 * Inline a custom version of __napi_complete().
3697 			 * Only the current CPU owns and manipulates this napi,
3698 			 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3699 			 * so we can use a plain write instead of clear_bit(),
3700 			 * and we don't need an smp_mb() memory barrier.
3701 			 */
3702 			list_del(&napi->poll_list);
3703 			napi->state = 0;
3704 
3705 			quota = work + qlen;
3706 		}
3707 		rps_unlock(sd);
3708 	}
3709 	local_irq_enable();
3710 
3711 	return work;
3712 }
3713 
3714 /**
3715  * __napi_schedule - schedule for receive
3716  * @n: entry to schedule
3717  *
3718  * The entry's receive function will be scheduled to run
3719  */
3720 void __napi_schedule(struct napi_struct *n)
3721 {
3722 	unsigned long flags;
3723 
3724 	local_irq_save(flags);
3725 	____napi_schedule(&__get_cpu_var(softnet_data), n);
3726 	local_irq_restore(flags);
3727 }
3728 EXPORT_SYMBOL(__napi_schedule);
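
/*
 * Illustrative usage sketch (not part of this file): device interrupt
 * handlers normally go through the napi_schedule() wrapper, which performs
 * the NAPI_STATE_SCHED test via napi_schedule_prep() and then ends up in
 * __napi_schedule().  The example_priv structure and the IRQ-masking helper
 * are hypothetical.
 *
 *	static irqreturn_t example_interrupt(int irq, void *dev_id)
 *	{
 *		struct example_priv *priv = dev_id;
 *
 *		if (napi_schedule_prep(&priv->napi)) {
 *			example_disable_rx_irq(priv);	// hypothetical
 *			__napi_schedule(&priv->napi);
 *		}
 *		return IRQ_HANDLED;
 *	}
 */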
3729 
3730 void __napi_complete(struct napi_struct *n)
3731 {
3732 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3733 	BUG_ON(n->gro_list);
3734 
3735 	list_del(&n->poll_list);
3736 	smp_mb__before_clear_bit();
3737 	clear_bit(NAPI_STATE_SCHED, &n->state);
3738 }
3739 EXPORT_SYMBOL(__napi_complete);
3740 
3741 void napi_complete(struct napi_struct *n)
3742 {
3743 	unsigned long flags;
3744 
3745 	/*
3746 	 * don't let napi dequeue from the cpu poll list
3747 	 * just in case it's running on a different cpu
3748 	 */
3749 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3750 		return;
3751 
3752 	napi_gro_flush(n);
3753 	local_irq_save(flags);
3754 	__napi_complete(n);
3755 	local_irq_restore(flags);
3756 }
3757 EXPORT_SYMBOL(napi_complete);
3758 
3759 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3760 		    int (*poll)(struct napi_struct *, int), int weight)
3761 {
3762 	INIT_LIST_HEAD(&napi->poll_list);
3763 	napi->gro_count = 0;
3764 	napi->gro_list = NULL;
3765 	napi->skb = NULL;
3766 	napi->poll = poll;
3767 	napi->weight = weight;
3768 	list_add(&napi->dev_list, &dev->napi_list);
3769 	napi->dev = dev;
3770 #ifdef CONFIG_NETPOLL
3771 	spin_lock_init(&napi->poll_lock);
3772 	napi->poll_owner = -1;
3773 #endif
3774 	set_bit(NAPI_STATE_SCHED, &napi->state);
3775 }
3776 EXPORT_SYMBOL(netif_napi_add);
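
/*
 * Illustrative usage sketch (not part of this file): the usual NAPI
 * lifecycle around netif_napi_add().  The poll routine consumes at most
 * @budget packets, feeds them to the stack (via napi_gro_receive() here)
 * and calls napi_complete() only when it did less work than allowed.  The
 * example_* names are hypothetical.
 *
 *	static int example_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct example_priv *priv =
 *			container_of(napi, struct example_priv, napi);
 *		struct sk_buff *skb;
 *		int work = 0;
 *
 *		while (work < budget && (skb = example_next_rx(priv))) {
 *			napi_gro_receive(napi, skb);
 *			work++;
 *		}
 *		if (work < budget) {
 *			napi_complete(napi);
 *			example_enable_rx_irq(priv);	// hypothetical
 *		}
 *		return work;
 *	}
 *
 *	In the probe routine, typically with a weight of 64:
 *		netif_napi_add(dev, &priv->napi, example_poll, 64);
 */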
3777 
3778 void netif_napi_del(struct napi_struct *napi)
3779 {
3780 	struct sk_buff *skb, *next;
3781 
3782 	list_del_init(&napi->dev_list);
3783 	napi_free_frags(napi);
3784 
3785 	for (skb = napi->gro_list; skb; skb = next) {
3786 		next = skb->next;
3787 		skb->next = NULL;
3788 		kfree_skb(skb);
3789 	}
3790 
3791 	napi->gro_list = NULL;
3792 	napi->gro_count = 0;
3793 }
3794 EXPORT_SYMBOL(netif_napi_del);
3795 
3796 static void net_rx_action(struct softirq_action *h)
3797 {
3798 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3799 	unsigned long time_limit = jiffies + 2;
3800 	int budget = netdev_budget;
3801 	void *have;
3802 
3803 	local_irq_disable();
3804 
3805 	while (!list_empty(&sd->poll_list)) {
3806 		struct napi_struct *n;
3807 		int work, weight;
3808 
3809 		/* If the softirq window is exhausted then punt.
3810 		 * Allow this to run for 2 jiffies, which allows
3811 		 * an average latency of 1.5/HZ.
3812 		 */
3813 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3814 			goto softnet_break;
3815 
3816 		local_irq_enable();
3817 
3818 		/* Even though interrupts have been re-enabled, this
3819 		 * access is safe because interrupts can only add new
3820 		 * entries to the tail of this list, and only ->poll()
3821 		 * calls can remove this head entry from the list.
3822 		 */
3823 		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3824 
3825 		have = netpoll_poll_lock(n);
3826 
3827 		weight = n->weight;
3828 
3829 		/* This NAPI_STATE_SCHED test is for avoiding a race
3830 		 * with netpoll's poll_napi().  Only the entity which
3831 		 * obtains the lock and sees NAPI_STATE_SCHED set will
3832 		 * actually make the ->poll() call.  Therefore we avoid
3833 		 * accidentally calling ->poll() when NAPI is not scheduled.
3834 		 */
3835 		work = 0;
3836 		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3837 			work = n->poll(n, weight);
3838 			trace_napi_poll(n);
3839 		}
3840 
3841 		WARN_ON_ONCE(work > weight);
3842 
3843 		budget -= work;
3844 
3845 		local_irq_disable();
3846 
3847 		/* Drivers must not modify the NAPI state if they
3848 		 * consume the entire weight.  In such cases this code
3849 		 * still "owns" the NAPI instance and therefore can
3850 		 * move the instance around on the list at-will.
3851 		 */
3852 		if (unlikely(work == weight)) {
3853 			if (unlikely(napi_disable_pending(n))) {
3854 				local_irq_enable();
3855 				napi_complete(n);
3856 				local_irq_disable();
3857 			} else
3858 				list_move_tail(&n->poll_list, &sd->poll_list);
3859 		}
3860 
3861 		netpoll_poll_unlock(have);
3862 	}
3863 out:
3864 	net_rps_action_and_irq_enable(sd);
3865 
3866 #ifdef CONFIG_NET_DMA
3867 	/*
3868 	 * There may not be any more sk_buffs coming right now, so push
3869 	 * any pending DMA copies to hardware
3870 	 */
3871 	dma_issue_pending_all();
3872 #endif
3873 
3874 	return;
3875 
3876 softnet_break:
3877 	sd->time_squeeze++;
3878 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3879 	goto out;
3880 }
3881 
3882 static gifconf_func_t *gifconf_list[NPROTO];
3883 
3884 /**
3885  *	register_gifconf	-	register a SIOCGIF handler
3886  *	@family: Address family
3887  *	@gifconf: Function handler
3888  *
3889  *	Register protocol dependent address dumping routines. The handler
3890  *	that is passed must not be freed or reused until it has been replaced
3891  *	by another handler.
3892  */
3893 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3894 {
3895 	if (family >= NPROTO)
3896 		return -EINVAL;
3897 	gifconf_list[family] = gifconf;
3898 	return 0;
3899 }
3900 EXPORT_SYMBOL(register_gifconf);
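
/*
 * Example (a sketch): an address family registers its SIOCGIFCONF dumper at
 * init time, much as IPv4 does with register_gifconf(PF_INET, inet_gifconf).
 * my_gifconf() and PF_MYPROTO below are hypothetical.
 *
 *	static int my_gifconf(struct net_device *dev, char __user *buf, int len)
 *	{
 *		// When buf is NULL, return the space needed for dev's records;
 *		// otherwise copy up to len bytes of ifreq data and return the
 *		// number of bytes written (or a negative error).
 *		return 0;
 *	}
 *
 *	register_gifconf(PF_MYPROTO, my_gifconf);
 */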
3901 
3902 
3903 /*
3904  *	Map an interface index to its name (SIOCGIFNAME)
3905  */
3906 
3907 /*
3908  *	We need this ioctl for efficient implementation of the
3909  *	if_indextoname() function required by the IPv6 API.  Without
3910  *	it, we would have to search all the interfaces to find a
3911  *	match.  --pb
3912  */
3913 
3914 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3915 {
3916 	struct net_device *dev;
3917 	struct ifreq ifr;
3918 
3919 	/*
3920 	 *	Fetch the caller's info block.
3921 	 */
3922 
3923 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3924 		return -EFAULT;
3925 
3926 	rcu_read_lock();
3927 	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3928 	if (!dev) {
3929 		rcu_read_unlock();
3930 		return -ENODEV;
3931 	}
3932 
3933 	strcpy(ifr.ifr_name, dev->name);
3934 	rcu_read_unlock();
3935 
3936 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3937 		return -EFAULT;
3938 	return 0;
3939 }
3940 
3941 /*
3942  *	Perform a SIOCGIFCONF call. This structure will change
3943  *	size eventually, and there is nothing I can do about it.
3944  *	Thus we will need a 'compatibility mode'.
3945  */
3946 
3947 static int dev_ifconf(struct net *net, char __user *arg)
3948 {
3949 	struct ifconf ifc;
3950 	struct net_device *dev;
3951 	char __user *pos;
3952 	int len;
3953 	int total;
3954 	int i;
3955 
3956 	/*
3957 	 *	Fetch the caller's info block.
3958 	 */
3959 
3960 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3961 		return -EFAULT;
3962 
3963 	pos = ifc.ifc_buf;
3964 	len = ifc.ifc_len;
3965 
3966 	/*
3967 	 *	Loop over the interfaces, and write an info block for each.
3968 	 */
3969 
3970 	total = 0;
3971 	for_each_netdev(net, dev) {
3972 		for (i = 0; i < NPROTO; i++) {
3973 			if (gifconf_list[i]) {
3974 				int done;
3975 				if (!pos)
3976 					done = gifconf_list[i](dev, NULL, 0);
3977 				else
3978 					done = gifconf_list[i](dev, pos + total,
3979 							       len - total);
3980 				if (done < 0)
3981 					return -EFAULT;
3982 				total += done;
3983 			}
3984 		}
3985 	}
3986 
3987 	/*
3988 	 *	All done.  Write the updated control block back to the caller.
3989 	 */
3990 	ifc.ifc_len = total;
3991 
3992 	/*
3993 	 * 	Both BSD and Solaris return 0 here, so we do too.
3994 	 */
3995 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3996 }
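
/*
 * Example (userspace sketch): the two-pass SIOCGIFCONF idiom this handler
 * supports - call once with ifc_buf == NULL to learn the required length,
 * then again with a buffer of that size.  fd is any open socket; error
 * handling is omitted.
 *
 *	struct ifconf ifc;
 *	struct ifreq *ifr;
 *	int i;
 *
 *	ifc.ifc_buf = NULL;
 *	ifc.ifc_len = 0;
 *	ioctl(fd, SIOCGIFCONF, &ifc);		// ifc.ifc_len = bytes needed
 *	ifc.ifc_buf = malloc(ifc.ifc_len);
 *	ioctl(fd, SIOCGIFCONF, &ifc);		// fills the ifc_req array
 *	ifr = ifc.ifc_req;
 *	for (i = 0; i < ifc.ifc_len / sizeof(struct ifreq); i++)
 *		printf("%s\n", ifr[i].ifr_name);
 */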
3997 
3998 #ifdef CONFIG_PROC_FS
3999 /*
4000  *	This is invoked by the /proc filesystem handler to display a device
4001  *	in detail.
4002  */
4003 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4004 	__acquires(RCU)
4005 {
4006 	struct net *net = seq_file_net(seq);
4007 	loff_t off;
4008 	struct net_device *dev;
4009 
4010 	rcu_read_lock();
4011 	if (!*pos)
4012 		return SEQ_START_TOKEN;
4013 
4014 	off = 1;
4015 	for_each_netdev_rcu(net, dev)
4016 		if (off++ == *pos)
4017 			return dev;
4018 
4019 	return NULL;
4020 }
4021 
4022 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4023 {
4024 	struct net_device *dev = v;
4025 
4026 	if (v == SEQ_START_TOKEN)
4027 		dev = first_net_device_rcu(seq_file_net(seq));
4028 	else
4029 		dev = next_net_device_rcu(dev);
4030 
4031 	++*pos;
4032 	return dev;
4033 }
4034 
4035 void dev_seq_stop(struct seq_file *seq, void *v)
4036 	__releases(RCU)
4037 {
4038 	rcu_read_unlock();
4039 }
4040 
4041 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4042 {
4043 	struct rtnl_link_stats64 temp;
4044 	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4045 
4046 	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4047 		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4048 		   dev->name, stats->rx_bytes, stats->rx_packets,
4049 		   stats->rx_errors,
4050 		   stats->rx_dropped + stats->rx_missed_errors,
4051 		   stats->rx_fifo_errors,
4052 		   stats->rx_length_errors + stats->rx_over_errors +
4053 		    stats->rx_crc_errors + stats->rx_frame_errors,
4054 		   stats->rx_compressed, stats->multicast,
4055 		   stats->tx_bytes, stats->tx_packets,
4056 		   stats->tx_errors, stats->tx_dropped,
4057 		   stats->tx_fifo_errors, stats->collisions,
4058 		   stats->tx_carrier_errors +
4059 		    stats->tx_aborted_errors +
4060 		    stats->tx_window_errors +
4061 		    stats->tx_heartbeat_errors,
4062 		   stats->tx_compressed);
4063 }
4064 
4065 /*
4066  *	Called from the PROCfs module. This now uses the new arbitrary sized
4067  *	/proc/net interface to create /proc/net/dev
4068  */
4069 static int dev_seq_show(struct seq_file *seq, void *v)
4070 {
4071 	if (v == SEQ_START_TOKEN)
4072 		seq_puts(seq, "Inter-|   Receive                            "
4073 			      "                    |  Transmit\n"
4074 			      " face |bytes    packets errs drop fifo frame "
4075 			      "compressed multicast|bytes    packets errs "
4076 			      "drop fifo colls carrier compressed\n");
4077 	else
4078 		dev_seq_printf_stats(seq, v);
4079 	return 0;
4080 }
4081 
4082 static struct softnet_data *softnet_get_online(loff_t *pos)
4083 {
4084 	struct softnet_data *sd = NULL;
4085 
4086 	while (*pos < nr_cpu_ids)
4087 		if (cpu_online(*pos)) {
4088 			sd = &per_cpu(softnet_data, *pos);
4089 			break;
4090 		} else
4091 			++*pos;
4092 	return sd;
4093 }
4094 
4095 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4096 {
4097 	return softnet_get_online(pos);
4098 }
4099 
4100 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4101 {
4102 	++*pos;
4103 	return softnet_get_online(pos);
4104 }
4105 
4106 static void softnet_seq_stop(struct seq_file *seq, void *v)
4107 {
4108 }
4109 
4110 static int softnet_seq_show(struct seq_file *seq, void *v)
4111 {
4112 	struct softnet_data *sd = v;
4113 
4114 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4115 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4116 		   0, 0, 0, 0, /* was fastroute */
4117 		   sd->cpu_collision, sd->received_rps);
4118 	return 0;
4119 }
4120 
4121 static const struct seq_operations dev_seq_ops = {
4122 	.start = dev_seq_start,
4123 	.next  = dev_seq_next,
4124 	.stop  = dev_seq_stop,
4125 	.show  = dev_seq_show,
4126 };
4127 
4128 static int dev_seq_open(struct inode *inode, struct file *file)
4129 {
4130 	return seq_open_net(inode, file, &dev_seq_ops,
4131 			    sizeof(struct seq_net_private));
4132 }
4133 
4134 static const struct file_operations dev_seq_fops = {
4135 	.owner	 = THIS_MODULE,
4136 	.open    = dev_seq_open,
4137 	.read    = seq_read,
4138 	.llseek  = seq_lseek,
4139 	.release = seq_release_net,
4140 };
4141 
4142 static const struct seq_operations softnet_seq_ops = {
4143 	.start = softnet_seq_start,
4144 	.next  = softnet_seq_next,
4145 	.stop  = softnet_seq_stop,
4146 	.show  = softnet_seq_show,
4147 };
4148 
4149 static int softnet_seq_open(struct inode *inode, struct file *file)
4150 {
4151 	return seq_open(file, &softnet_seq_ops);
4152 }
4153 
4154 static const struct file_operations softnet_seq_fops = {
4155 	.owner	 = THIS_MODULE,
4156 	.open    = softnet_seq_open,
4157 	.read    = seq_read,
4158 	.llseek  = seq_lseek,
4159 	.release = seq_release,
4160 };
4161 
4162 static void *ptype_get_idx(loff_t pos)
4163 {
4164 	struct packet_type *pt = NULL;
4165 	loff_t i = 0;
4166 	int t;
4167 
4168 	list_for_each_entry_rcu(pt, &ptype_all, list) {
4169 		if (i == pos)
4170 			return pt;
4171 		++i;
4172 	}
4173 
4174 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4175 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4176 			if (i == pos)
4177 				return pt;
4178 			++i;
4179 		}
4180 	}
4181 	return NULL;
4182 }
4183 
4184 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4185 	__acquires(RCU)
4186 {
4187 	rcu_read_lock();
4188 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4189 }
4190 
4191 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4192 {
4193 	struct packet_type *pt;
4194 	struct list_head *nxt;
4195 	int hash;
4196 
4197 	++*pos;
4198 	if (v == SEQ_START_TOKEN)
4199 		return ptype_get_idx(0);
4200 
4201 	pt = v;
4202 	nxt = pt->list.next;
4203 	if (pt->type == htons(ETH_P_ALL)) {
4204 		if (nxt != &ptype_all)
4205 			goto found;
4206 		hash = 0;
4207 		nxt = ptype_base[0].next;
4208 	} else
4209 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4210 
4211 	while (nxt == &ptype_base[hash]) {
4212 		if (++hash >= PTYPE_HASH_SIZE)
4213 			return NULL;
4214 		nxt = ptype_base[hash].next;
4215 	}
4216 found:
4217 	return list_entry(nxt, struct packet_type, list);
4218 }
4219 
4220 static void ptype_seq_stop(struct seq_file *seq, void *v)
4221 	__releases(RCU)
4222 {
4223 	rcu_read_unlock();
4224 }
4225 
4226 static int ptype_seq_show(struct seq_file *seq, void *v)
4227 {
4228 	struct packet_type *pt = v;
4229 
4230 	if (v == SEQ_START_TOKEN)
4231 		seq_puts(seq, "Type Device      Function\n");
4232 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4233 		if (pt->type == htons(ETH_P_ALL))
4234 			seq_puts(seq, "ALL ");
4235 		else
4236 			seq_printf(seq, "%04x", ntohs(pt->type));
4237 
4238 		seq_printf(seq, " %-8s %pF\n",
4239 			   pt->dev ? pt->dev->name : "", pt->func);
4240 	}
4241 
4242 	return 0;
4243 }
4244 
4245 static const struct seq_operations ptype_seq_ops = {
4246 	.start = ptype_seq_start,
4247 	.next  = ptype_seq_next,
4248 	.stop  = ptype_seq_stop,
4249 	.show  = ptype_seq_show,
4250 };
4251 
4252 static int ptype_seq_open(struct inode *inode, struct file *file)
4253 {
4254 	return seq_open_net(inode, file, &ptype_seq_ops,
4255 			sizeof(struct seq_net_private));
4256 }
4257 
4258 static const struct file_operations ptype_seq_fops = {
4259 	.owner	 = THIS_MODULE,
4260 	.open    = ptype_seq_open,
4261 	.read    = seq_read,
4262 	.llseek  = seq_lseek,
4263 	.release = seq_release_net,
4264 };
4265 
4266 
4267 static int __net_init dev_proc_net_init(struct net *net)
4268 {
4269 	int rc = -ENOMEM;
4270 
4271 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4272 		goto out;
4273 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4274 		goto out_dev;
4275 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4276 		goto out_softnet;
4277 
4278 	if (wext_proc_init(net))
4279 		goto out_ptype;
4280 	rc = 0;
4281 out:
4282 	return rc;
4283 out_ptype:
4284 	proc_net_remove(net, "ptype");
4285 out_softnet:
4286 	proc_net_remove(net, "softnet_stat");
4287 out_dev:
4288 	proc_net_remove(net, "dev");
4289 	goto out;
4290 }
4291 
4292 static void __net_exit dev_proc_net_exit(struct net *net)
4293 {
4294 	wext_proc_exit(net);
4295 
4296 	proc_net_remove(net, "ptype");
4297 	proc_net_remove(net, "softnet_stat");
4298 	proc_net_remove(net, "dev");
4299 }
4300 
4301 static struct pernet_operations __net_initdata dev_proc_ops = {
4302 	.init = dev_proc_net_init,
4303 	.exit = dev_proc_net_exit,
4304 };
4305 
4306 static int __init dev_proc_init(void)
4307 {
4308 	return register_pernet_subsys(&dev_proc_ops);
4309 }
4310 #else
4311 #define dev_proc_init() 0
4312 #endif	/* CONFIG_PROC_FS */
4313 
4314 
4315 /**
4316  *	netdev_set_master	-	set up master pointer
4317  *	@slave: slave device
4318  *	@master: new master device
4319  *
4320  *	Changes the master device of the slave. Pass %NULL to break the
4321  *	bonding. The caller must hold the RTNL semaphore. On a failure
4322  *	a negative errno code is returned. On success the reference counts
4323  *	are adjusted and the function returns zero.
4324  */
4325 int netdev_set_master(struct net_device *slave, struct net_device *master)
4326 {
4327 	struct net_device *old = slave->master;
4328 
4329 	ASSERT_RTNL();
4330 
4331 	if (master) {
4332 		if (old)
4333 			return -EBUSY;
4334 		dev_hold(master);
4335 	}
4336 
4337 	slave->master = master;
4338 
4339 	if (old) {
4340 		synchronize_net();
4341 		dev_put(old);
4342 	}
4343 	return 0;
4344 }
4345 EXPORT_SYMBOL(netdev_set_master);
4346 
4347 /**
4348  *	netdev_set_bond_master	-	set up bonding master/slave pair
4349  *	@slave: slave device
4350  *	@master: new master device
4351  *
4352  *	Changes the master device of the slave. Pass %NULL to break the
4353  *	bonding. The caller must hold the RTNL semaphore. On a failure
4354  *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4355  *	to the routing socket and the function returns zero.
4356  */
4357 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4358 {
4359 	int err;
4360 
4361 	ASSERT_RTNL();
4362 
4363 	err = netdev_set_master(slave, master);
4364 	if (err)
4365 		return err;
4366 	if (master)
4367 		slave->flags |= IFF_SLAVE;
4368 	else
4369 		slave->flags &= ~IFF_SLAVE;
4370 
4371 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4372 	return 0;
4373 }
4374 EXPORT_SYMBOL(netdev_set_bond_master);
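
/*
 * Example (a sketch): how a bonding-style driver attaches and later releases
 * a slave.  bond_dev and slave_dev are hypothetical pointers; the RTNL
 * semaphore must already be held by the caller.
 *
 *	err = netdev_set_bond_master(slave_dev, bond_dev);	// enslave
 *	...
 *	err = netdev_set_bond_master(slave_dev, NULL);		// release
 */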
4375 
4376 static void dev_change_rx_flags(struct net_device *dev, int flags)
4377 {
4378 	const struct net_device_ops *ops = dev->netdev_ops;
4379 
4380 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4381 		ops->ndo_change_rx_flags(dev, flags);
4382 }
4383 
4384 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4385 {
4386 	unsigned short old_flags = dev->flags;
4387 	uid_t uid;
4388 	gid_t gid;
4389 
4390 	ASSERT_RTNL();
4391 
4392 	dev->flags |= IFF_PROMISC;
4393 	dev->promiscuity += inc;
4394 	if (dev->promiscuity == 0) {
4395 		/*
4396 		 * Avoid overflow.
4397 		 * If inc causes overflow, leave promiscuity untouched and return an error.
4398 		 */
4399 		if (inc < 0)
4400 			dev->flags &= ~IFF_PROMISC;
4401 		else {
4402 			dev->promiscuity -= inc;
4403 			printk(KERN_WARNING "%s: promiscuity touches roof, "
4404 				"set promiscuity failed, promiscuity feature "
4405 				"of device might be broken.\n", dev->name);
4406 			return -EOVERFLOW;
4407 		}
4408 	}
4409 	if (dev->flags != old_flags) {
4410 		printk(KERN_INFO "device %s %s promiscuous mode\n",
4411 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4412 							       "left");
4413 		if (audit_enabled) {
4414 			current_uid_gid(&uid, &gid);
4415 			audit_log(current->audit_context, GFP_ATOMIC,
4416 				AUDIT_ANOM_PROMISCUOUS,
4417 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4418 				dev->name, (dev->flags & IFF_PROMISC),
4419 				(old_flags & IFF_PROMISC),
4420 				audit_get_loginuid(current),
4421 				uid, gid,
4422 				audit_get_sessionid(current));
4423 		}
4424 
4425 		dev_change_rx_flags(dev, IFF_PROMISC);
4426 	}
4427 	return 0;
4428 }
4429 
4430 /**
4431  *	dev_set_promiscuity	- update promiscuity count on a device
4432  *	@dev: device
4433  *	@inc: modifier
4434  *
4435  *	Add or remove promiscuity from a device. While the count in the device
4436  *	remains above zero the interface remains promiscuous. Once it hits zero
4437  *	the device reverts back to normal filtering operation. A negative inc
4438  *	value is used to drop promiscuity on the device.
4439  *	Return 0 if successful or a negative errno code on error.
4440  */
4441 int dev_set_promiscuity(struct net_device *dev, int inc)
4442 {
4443 	unsigned short old_flags = dev->flags;
4444 	int err;
4445 
4446 	err = __dev_set_promiscuity(dev, inc);
4447 	if (err < 0)
4448 		return err;
4449 	if (dev->flags != old_flags)
4450 		dev_set_rx_mode(dev);
4451 	return err;
4452 }
4453 EXPORT_SYMBOL(dev_set_promiscuity);
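
/*
 * Example (a sketch): a packet-capture style user temporarily forcing the
 * device into promiscuous mode.  RTNL must be held because
 * __dev_set_promiscuity() asserts it.
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);	// take a promiscuity reference
 *	rtnl_unlock();
 *	...
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);		// drop it again when done
 *	rtnl_unlock();
 */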
4454 
4455 /**
4456  *	dev_set_allmulti	- update allmulti count on a device
4457  *	@dev: device
4458  *	@inc: modifier
4459  *
4460  *	Add or remove reception of all multicast frames to a device. While the
4461  *	count in the device remains above zero the interface remains listening
4462  *	to all multicast frames. Once it hits zero the device reverts to normal
4463  *	filtering operation. A negative @inc value is used to drop the counter
4464  *	when releasing a resource needing all multicasts.
4465  *	Return 0 if successful or a negative errno code on error.
4466  */
4467 
4468 int dev_set_allmulti(struct net_device *dev, int inc)
4469 {
4470 	unsigned short old_flags = dev->flags;
4471 
4472 	ASSERT_RTNL();
4473 
4474 	dev->flags |= IFF_ALLMULTI;
4475 	dev->allmulti += inc;
4476 	if (dev->allmulti == 0) {
4477 		/*
4478 		 * Avoid overflow.
4479 		 * If inc causes overflow, leave allmulti untouched and return an error.
4480 		 */
4481 		if (inc < 0)
4482 			dev->flags &= ~IFF_ALLMULTI;
4483 		else {
4484 			dev->allmulti -= inc;
4485 			printk(KERN_WARNING "%s: allmulti touches roof, "
4486 				"set allmulti failed, allmulti feature of "
4487 				"device might be broken.\n", dev->name);
4488 			return -EOVERFLOW;
4489 		}
4490 	}
4491 	if (dev->flags ^ old_flags) {
4492 		dev_change_rx_flags(dev, IFF_ALLMULTI);
4493 		dev_set_rx_mode(dev);
4494 	}
4495 	return 0;
4496 }
4497 EXPORT_SYMBOL(dev_set_allmulti);
4498 
4499 /*
4500  *	Upload unicast and multicast address lists to device and
4501  *	configure RX filtering. When the device doesn't support unicast
4502  *	filtering it is put in promiscuous mode while unicast addresses
4503  *	are present.
4504  */
4505 void __dev_set_rx_mode(struct net_device *dev)
4506 {
4507 	const struct net_device_ops *ops = dev->netdev_ops;
4508 
4509 	/* dev_open will call this function so the list will stay sane. */
4510 	if (!(dev->flags&IFF_UP))
4511 		return;
4512 
4513 	if (!netif_device_present(dev))
4514 		return;
4515 
4516 	if (ops->ndo_set_rx_mode)
4517 		ops->ndo_set_rx_mode(dev);
4518 	else {
4519 		/* Unicast address changes may only happen under the rtnl,
4520 		 * therefore calling __dev_set_promiscuity here is safe.
4521 		 */
4522 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4523 			__dev_set_promiscuity(dev, 1);
4524 			dev->uc_promisc = 1;
4525 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4526 			__dev_set_promiscuity(dev, -1);
4527 			dev->uc_promisc = 0;
4528 		}
4529 
4530 		if (ops->ndo_set_multicast_list)
4531 			ops->ndo_set_multicast_list(dev);
4532 	}
4533 }
4534 
4535 void dev_set_rx_mode(struct net_device *dev)
4536 {
4537 	netif_addr_lock_bh(dev);
4538 	__dev_set_rx_mode(dev);
4539 	netif_addr_unlock_bh(dev);
4540 }
4541 
4542 /**
4543  *	dev_get_flags - get flags reported to userspace
4544  *	@dev: device
4545  *
4546  *	Get the combination of flag bits exported through APIs to userspace.
4547  */
4548 unsigned dev_get_flags(const struct net_device *dev)
4549 {
4550 	unsigned flags;
4551 
4552 	flags = (dev->flags & ~(IFF_PROMISC |
4553 				IFF_ALLMULTI |
4554 				IFF_RUNNING |
4555 				IFF_LOWER_UP |
4556 				IFF_DORMANT)) |
4557 		(dev->gflags & (IFF_PROMISC |
4558 				IFF_ALLMULTI));
4559 
4560 	if (netif_running(dev)) {
4561 		if (netif_oper_up(dev))
4562 			flags |= IFF_RUNNING;
4563 		if (netif_carrier_ok(dev))
4564 			flags |= IFF_LOWER_UP;
4565 		if (netif_dormant(dev))
4566 			flags |= IFF_DORMANT;
4567 	}
4568 
4569 	return flags;
4570 }
4571 EXPORT_SYMBOL(dev_get_flags);
4572 
4573 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4574 {
4575 	int old_flags = dev->flags;
4576 	int ret;
4577 
4578 	ASSERT_RTNL();
4579 
4580 	/*
4581 	 *	Set the flags on our device.
4582 	 */
4583 
4584 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4585 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4586 			       IFF_AUTOMEDIA)) |
4587 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4588 				    IFF_ALLMULTI));
4589 
4590 	/*
4591 	 *	Load in the correct multicast list now the flags have changed.
4592 	 */
4593 
4594 	if ((old_flags ^ flags) & IFF_MULTICAST)
4595 		dev_change_rx_flags(dev, IFF_MULTICAST);
4596 
4597 	dev_set_rx_mode(dev);
4598 
4599 	/*
4600 	 *	Have we downed the interface? We handle IFF_UP ourselves
4601 	 *	according to user attempts to set it, rather than blindly
4602 	 *	setting it.
4603 	 */
4604 
4605 	ret = 0;
4606 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4607 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4608 
4609 		if (!ret)
4610 			dev_set_rx_mode(dev);
4611 	}
4612 
4613 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4614 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4615 
4616 		dev->gflags ^= IFF_PROMISC;
4617 		dev_set_promiscuity(dev, inc);
4618 	}
4619 
4620 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4621 	   is important. Some (broken) drivers set IFF_PROMISC when
4622 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
4623 	 */
4624 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4625 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4626 
4627 		dev->gflags ^= IFF_ALLMULTI;
4628 		dev_set_allmulti(dev, inc);
4629 	}
4630 
4631 	return ret;
4632 }
4633 
4634 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4635 {
4636 	unsigned int changes = dev->flags ^ old_flags;
4637 
4638 	if (changes & IFF_UP) {
4639 		if (dev->flags & IFF_UP)
4640 			call_netdevice_notifiers(NETDEV_UP, dev);
4641 		else
4642 			call_netdevice_notifiers(NETDEV_DOWN, dev);
4643 	}
4644 
4645 	if (dev->flags & IFF_UP &&
4646 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4647 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4648 }
4649 
4650 /**
4651  *	dev_change_flags - change device settings
4652  *	@dev: device
4653  *	@flags: device state flags
4654  *
4655  *	Change settings on a device based on the supplied state flags.
4656  *	The flags are in the userspace exported format.
4657  */
4658 int dev_change_flags(struct net_device *dev, unsigned flags)
4659 {
4660 	int ret, changes;
4661 	int old_flags = dev->flags;
4662 
4663 	ret = __dev_change_flags(dev, flags);
4664 	if (ret < 0)
4665 		return ret;
4666 
4667 	changes = old_flags ^ dev->flags;
4668 	if (changes)
4669 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4670 
4671 	__dev_notify_flags(dev, old_flags);
4672 	return ret;
4673 }
4674 EXPORT_SYMBOL(dev_change_flags);
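
/*
 * Example (a sketch): bringing an interface up from inside the kernel, the
 * rough equivalent of the SIOCSIFFLAGS path used by "ifconfig ... up".
 * RTNL must be held around the call.
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);
 *	rtnl_unlock();
 */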
4675 
4676 /**
4677  *	dev_set_mtu - Change maximum transfer unit
4678  *	@dev: device
4679  *	@new_mtu: new transfer unit
4680  *
4681  *	Change the maximum transfer size of the network device.
4682  */
4683 int dev_set_mtu(struct net_device *dev, int new_mtu)
4684 {
4685 	const struct net_device_ops *ops = dev->netdev_ops;
4686 	int err;
4687 
4688 	if (new_mtu == dev->mtu)
4689 		return 0;
4690 
4691 	/*	MTU must be positive.	 */
4692 	if (new_mtu < 0)
4693 		return -EINVAL;
4694 
4695 	if (!netif_device_present(dev))
4696 		return -ENODEV;
4697 
4698 	err = 0;
4699 	if (ops->ndo_change_mtu)
4700 		err = ops->ndo_change_mtu(dev, new_mtu);
4701 	else
4702 		dev->mtu = new_mtu;
4703 
4704 	if (!err && dev->flags & IFF_UP)
4705 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4706 	return err;
4707 }
4708 EXPORT_SYMBOL(dev_set_mtu);
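
/*
 * Example (a sketch): changing the MTU from another kernel subsystem; the
 * value 9000 is only illustrative.  As on the SIOCSIFMTU path, the caller
 * is expected to hold RTNL.
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);
 *	rtnl_unlock();
 */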
4709 
4710 /**
4711  *	dev_set_group - Change group this device belongs to
4712  *	@dev: device
4713  *	@new_group: group this device should belong to
4714  */
4715 void dev_set_group(struct net_device *dev, int new_group)
4716 {
4717 	dev->group = new_group;
4718 }
4719 EXPORT_SYMBOL(dev_set_group);
4720 
4721 /**
4722  *	dev_set_mac_address - Change Media Access Control Address
4723  *	@dev: device
4724  *	@sa: new address
4725  *
4726  *	Change the hardware (MAC) address of the device
4727  */
4728 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4729 {
4730 	const struct net_device_ops *ops = dev->netdev_ops;
4731 	int err;
4732 
4733 	if (!ops->ndo_set_mac_address)
4734 		return -EOPNOTSUPP;
4735 	if (sa->sa_family != dev->type)
4736 		return -EINVAL;
4737 	if (!netif_device_present(dev))
4738 		return -ENODEV;
4739 	err = ops->ndo_set_mac_address(dev, sa);
4740 	if (!err)
4741 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4742 	return err;
4743 }
4744 EXPORT_SYMBOL(dev_set_mac_address);
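
/*
 * Example (a sketch): setting a new hardware address on an Ethernet device.
 * new_mac is a hypothetical u8[ETH_ALEN] buffer; RTNL must be held, as it
 * is on the SIOCSIFHWADDR path below.
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;			// ARPHRD_ETHER for Ethernet
 *	memcpy(sa.sa_data, new_mac, dev->addr_len);
 *	rtnl_lock();
 *	err = dev_set_mac_address(dev, &sa);
 *	rtnl_unlock();
 */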
4745 
4746 /*
4747  *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4748  */
4749 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4750 {
4751 	int err;
4752 	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4753 
4754 	if (!dev)
4755 		return -ENODEV;
4756 
4757 	switch (cmd) {
4758 	case SIOCGIFFLAGS:	/* Get interface flags */
4759 		ifr->ifr_flags = (short) dev_get_flags(dev);
4760 		return 0;
4761 
4762 	case SIOCGIFMETRIC:	/* Get the metric on the interface
4763 				   (currently unused) */
4764 		ifr->ifr_metric = 0;
4765 		return 0;
4766 
4767 	case SIOCGIFMTU:	/* Get the MTU of a device */
4768 		ifr->ifr_mtu = dev->mtu;
4769 		return 0;
4770 
4771 	case SIOCGIFHWADDR:
4772 		if (!dev->addr_len)
4773 			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4774 		else
4775 			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4776 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4777 		ifr->ifr_hwaddr.sa_family = dev->type;
4778 		return 0;
4779 
4780 	case SIOCGIFSLAVE:
4781 		err = -EINVAL;
4782 		break;
4783 
4784 	case SIOCGIFMAP:
4785 		ifr->ifr_map.mem_start = dev->mem_start;
4786 		ifr->ifr_map.mem_end   = dev->mem_end;
4787 		ifr->ifr_map.base_addr = dev->base_addr;
4788 		ifr->ifr_map.irq       = dev->irq;
4789 		ifr->ifr_map.dma       = dev->dma;
4790 		ifr->ifr_map.port      = dev->if_port;
4791 		return 0;
4792 
4793 	case SIOCGIFINDEX:
4794 		ifr->ifr_ifindex = dev->ifindex;
4795 		return 0;
4796 
4797 	case SIOCGIFTXQLEN:
4798 		ifr->ifr_qlen = dev->tx_queue_len;
4799 		return 0;
4800 
4801 	default:
4802 		/* dev_ioctl() should ensure this case
4803 		 * is never reached
4804 		 */
4805 		WARN_ON(1);
4806 		err = -EINVAL;
4807 		break;
4808 
4809 	}
4810 	return err;
4811 }
4812 
4813 /*
4814  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4815  */
4816 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4817 {
4818 	int err;
4819 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4820 	const struct net_device_ops *ops;
4821 
4822 	if (!dev)
4823 		return -ENODEV;
4824 
4825 	ops = dev->netdev_ops;
4826 
4827 	switch (cmd) {
4828 	case SIOCSIFFLAGS:	/* Set interface flags */
4829 		return dev_change_flags(dev, ifr->ifr_flags);
4830 
4831 	case SIOCSIFMETRIC:	/* Set the metric on the interface
4832 				   (currently unused) */
4833 		return -EOPNOTSUPP;
4834 
4835 	case SIOCSIFMTU:	/* Set the MTU of a device */
4836 		return dev_set_mtu(dev, ifr->ifr_mtu);
4837 
4838 	case SIOCSIFHWADDR:
4839 		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4840 
4841 	case SIOCSIFHWBROADCAST:
4842 		if (ifr->ifr_hwaddr.sa_family != dev->type)
4843 			return -EINVAL;
4844 		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4845 		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4846 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4847 		return 0;
4848 
4849 	case SIOCSIFMAP:
4850 		if (ops->ndo_set_config) {
4851 			if (!netif_device_present(dev))
4852 				return -ENODEV;
4853 			return ops->ndo_set_config(dev, &ifr->ifr_map);
4854 		}
4855 		return -EOPNOTSUPP;
4856 
4857 	case SIOCADDMULTI:
4858 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4859 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4860 			return -EINVAL;
4861 		if (!netif_device_present(dev))
4862 			return -ENODEV;
4863 		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4864 
4865 	case SIOCDELMULTI:
4866 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4867 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4868 			return -EINVAL;
4869 		if (!netif_device_present(dev))
4870 			return -ENODEV;
4871 		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4872 
4873 	case SIOCSIFTXQLEN:
4874 		if (ifr->ifr_qlen < 0)
4875 			return -EINVAL;
4876 		dev->tx_queue_len = ifr->ifr_qlen;
4877 		return 0;
4878 
4879 	case SIOCSIFNAME:
4880 		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4881 		return dev_change_name(dev, ifr->ifr_newname);
4882 
4883 	/*
4884 	 *	Unknown or private ioctl
4885 	 */
4886 	default:
4887 		if ((cmd >= SIOCDEVPRIVATE &&
4888 		    cmd <= SIOCDEVPRIVATE + 15) ||
4889 		    cmd == SIOCBONDENSLAVE ||
4890 		    cmd == SIOCBONDRELEASE ||
4891 		    cmd == SIOCBONDSETHWADDR ||
4892 		    cmd == SIOCBONDSLAVEINFOQUERY ||
4893 		    cmd == SIOCBONDINFOQUERY ||
4894 		    cmd == SIOCBONDCHANGEACTIVE ||
4895 		    cmd == SIOCGMIIPHY ||
4896 		    cmd == SIOCGMIIREG ||
4897 		    cmd == SIOCSMIIREG ||
4898 		    cmd == SIOCBRADDIF ||
4899 		    cmd == SIOCBRDELIF ||
4900 		    cmd == SIOCSHWTSTAMP ||
4901 		    cmd == SIOCWANDEV) {
4902 			err = -EOPNOTSUPP;
4903 			if (ops->ndo_do_ioctl) {
4904 				if (netif_device_present(dev))
4905 					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4906 				else
4907 					err = -ENODEV;
4908 			}
4909 		} else
4910 			err = -EINVAL;
4911 
4912 	}
4913 	return err;
4914 }
4915 
4916 /*
4917  *	This function handles all "interface"-type I/O control requests. The actual
4918  *	'doing' part of this is dev_ifsioc above.
4919  */
4920 
4921 /**
4922  *	dev_ioctl	-	network device ioctl
4923  *	@net: the applicable net namespace
4924  *	@cmd: command to issue
4925  *	@arg: pointer to a struct ifreq in user space
4926  *
4927  *	Issue ioctl functions to devices. This is normally called by the
4928  *	user space syscall interfaces but can sometimes be useful for
4929  *	other purposes. The return value is the return from the syscall if
4930  *	positive or a negative errno code on error.
4931  */
4932 
4933 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4934 {
4935 	struct ifreq ifr;
4936 	int ret;
4937 	char *colon;
4938 
4939 	/* One special case: SIOCGIFCONF takes ifconf argument
4940 	   and requires shared lock, because it sleeps writing
4941 	   to user space.
4942 	 */
4943 
4944 	if (cmd == SIOCGIFCONF) {
4945 		rtnl_lock();
4946 		ret = dev_ifconf(net, (char __user *) arg);
4947 		rtnl_unlock();
4948 		return ret;
4949 	}
4950 	if (cmd == SIOCGIFNAME)
4951 		return dev_ifname(net, (struct ifreq __user *)arg);
4952 
4953 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4954 		return -EFAULT;
4955 
4956 	ifr.ifr_name[IFNAMSIZ-1] = 0;
4957 
4958 	colon = strchr(ifr.ifr_name, ':');
4959 	if (colon)
4960 		*colon = 0;
4961 
4962 	/*
4963 	 *	See which interface the caller is talking about.
4964 	 */
4965 
4966 	switch (cmd) {
4967 	/*
4968 	 *	These ioctl calls:
4969 	 *	- can be done by all.
4970 	 *	- atomic and do not require locking.
4971 	 *	- return a value
4972 	 */
4973 	case SIOCGIFFLAGS:
4974 	case SIOCGIFMETRIC:
4975 	case SIOCGIFMTU:
4976 	case SIOCGIFHWADDR:
4977 	case SIOCGIFSLAVE:
4978 	case SIOCGIFMAP:
4979 	case SIOCGIFINDEX:
4980 	case SIOCGIFTXQLEN:
4981 		dev_load(net, ifr.ifr_name);
4982 		rcu_read_lock();
4983 		ret = dev_ifsioc_locked(net, &ifr, cmd);
4984 		rcu_read_unlock();
4985 		if (!ret) {
4986 			if (colon)
4987 				*colon = ':';
4988 			if (copy_to_user(arg, &ifr,
4989 					 sizeof(struct ifreq)))
4990 				ret = -EFAULT;
4991 		}
4992 		return ret;
4993 
4994 	case SIOCETHTOOL:
4995 		dev_load(net, ifr.ifr_name);
4996 		rtnl_lock();
4997 		ret = dev_ethtool(net, &ifr);
4998 		rtnl_unlock();
4999 		if (!ret) {
5000 			if (colon)
5001 				*colon = ':';
5002 			if (copy_to_user(arg, &ifr,
5003 					 sizeof(struct ifreq)))
5004 				ret = -EFAULT;
5005 		}
5006 		return ret;
5007 
5008 	/*
5009 	 *	These ioctl calls:
5010 	 *	- require superuser power.
5011 	 *	- require strict serialization.
5012 	 *	- return a value
5013 	 */
5014 	case SIOCGMIIPHY:
5015 	case SIOCGMIIREG:
5016 	case SIOCSIFNAME:
5017 		if (!capable(CAP_NET_ADMIN))
5018 			return -EPERM;
5019 		dev_load(net, ifr.ifr_name);
5020 		rtnl_lock();
5021 		ret = dev_ifsioc(net, &ifr, cmd);
5022 		rtnl_unlock();
5023 		if (!ret) {
5024 			if (colon)
5025 				*colon = ':';
5026 			if (copy_to_user(arg, &ifr,
5027 					 sizeof(struct ifreq)))
5028 				ret = -EFAULT;
5029 		}
5030 		return ret;
5031 
5032 	/*
5033 	 *	These ioctl calls:
5034 	 *	- require superuser power.
5035 	 *	- require strict serialization.
5036 	 *	- do not return a value
5037 	 */
5038 	case SIOCSIFFLAGS:
5039 	case SIOCSIFMETRIC:
5040 	case SIOCSIFMTU:
5041 	case SIOCSIFMAP:
5042 	case SIOCSIFHWADDR:
5043 	case SIOCSIFSLAVE:
5044 	case SIOCADDMULTI:
5045 	case SIOCDELMULTI:
5046 	case SIOCSIFHWBROADCAST:
5047 	case SIOCSIFTXQLEN:
5048 	case SIOCSMIIREG:
5049 	case SIOCBONDENSLAVE:
5050 	case SIOCBONDRELEASE:
5051 	case SIOCBONDSETHWADDR:
5052 	case SIOCBONDCHANGEACTIVE:
5053 	case SIOCBRADDIF:
5054 	case SIOCBRDELIF:
5055 	case SIOCSHWTSTAMP:
5056 		if (!capable(CAP_NET_ADMIN))
5057 			return -EPERM;
5058 		/* fall through */
5059 	case SIOCBONDSLAVEINFOQUERY:
5060 	case SIOCBONDINFOQUERY:
5061 		dev_load(net, ifr.ifr_name);
5062 		rtnl_lock();
5063 		ret = dev_ifsioc(net, &ifr, cmd);
5064 		rtnl_unlock();
5065 		return ret;
5066 
5067 	case SIOCGIFMEM:
5068 		/* Get the per device memory space. We can add this but
5069 		 * currently do not support it */
5070 	case SIOCSIFMEM:
5071 		/* Set the per device memory buffer space.
5072 		 * Not applicable in our case */
5073 	case SIOCSIFLINK:
5074 		return -EINVAL;
5075 
5076 	/*
5077 	 *	Unknown or private ioctl.
5078 	 */
5079 	default:
5080 		if (cmd == SIOCWANDEV ||
5081 		    (cmd >= SIOCDEVPRIVATE &&
5082 		     cmd <= SIOCDEVPRIVATE + 15)) {
5083 			dev_load(net, ifr.ifr_name);
5084 			rtnl_lock();
5085 			ret = dev_ifsioc(net, &ifr, cmd);
5086 			rtnl_unlock();
5087 			if (!ret && copy_to_user(arg, &ifr,
5088 						 sizeof(struct ifreq)))
5089 				ret = -EFAULT;
5090 			return ret;
5091 		}
5092 		/* Take care of Wireless Extensions */
5093 		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5094 			return wext_handle_ioctl(net, &ifr, cmd, arg);
5095 		return -EINVAL;
5096 	}
5097 }
5098 
5099 
5100 /**
5101  *	dev_new_index	-	allocate an ifindex
5102  *	@net: the applicable net namespace
5103  *
5104  *	Returns a suitable unique value for a new device interface
5105  *	number.  The caller must hold the rtnl semaphore or the
5106  *	dev_base_lock to be sure it remains unique.
5107  */
5108 static int dev_new_index(struct net *net)
5109 {
5110 	static int ifindex;
5111 	for (;;) {
5112 		if (++ifindex <= 0)
5113 			ifindex = 1;
5114 		if (!__dev_get_by_index(net, ifindex))
5115 			return ifindex;
5116 	}
5117 }
5118 
5119 /* Delayed registration/unregistration */
5120 static LIST_HEAD(net_todo_list);
5121 
5122 static void net_set_todo(struct net_device *dev)
5123 {
5124 	list_add_tail(&dev->todo_list, &net_todo_list);
5125 }
5126 
5127 static void rollback_registered_many(struct list_head *head)
5128 {
5129 	struct net_device *dev, *tmp;
5130 
5131 	BUG_ON(dev_boot_phase);
5132 	ASSERT_RTNL();
5133 
5134 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5135 		/* Some devices call unregister without ever having
5136 		 * registered, to unwind a failed initialization. Remove
5137 		 * those devices and proceed with the remaining ones.
5138 		 */
5139 		if (dev->reg_state == NETREG_UNINITIALIZED) {
5140 			pr_debug("unregister_netdevice: device %s/%p never "
5141 				 "was registered\n", dev->name, dev);
5142 
5143 			WARN_ON(1);
5144 			list_del(&dev->unreg_list);
5145 			continue;
5146 		}
5147 
5148 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5149 	}
5150 
5151 	/* If device is running, close it first. */
5152 	dev_close_many(head);
5153 
5154 	list_for_each_entry(dev, head, unreg_list) {
5155 		/* And unlink it from device chain. */
5156 		unlist_netdevice(dev);
5157 
5158 		dev->reg_state = NETREG_UNREGISTERING;
5159 	}
5160 
5161 	synchronize_net();
5162 
5163 	list_for_each_entry(dev, head, unreg_list) {
5164 		/* Shutdown queueing discipline. */
5165 		dev_shutdown(dev);
5166 
5167 
5168 		/* Notify protocols that we are about to destroy
5169 		   this device. They should clean up all of their state.
5170 		*/
5171 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5172 
5173 		if (!dev->rtnl_link_ops ||
5174 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5175 			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5176 
5177 		/*
5178 		 *	Flush the unicast and multicast chains
5179 		 */
5180 		dev_uc_flush(dev);
5181 		dev_mc_flush(dev);
5182 
5183 		if (dev->netdev_ops->ndo_uninit)
5184 			dev->netdev_ops->ndo_uninit(dev);
5185 
5186 		/* Notifier chain MUST detach us from master device. */
5187 		WARN_ON(dev->master);
5188 
5189 		/* Remove entries from kobject tree */
5190 		netdev_unregister_kobject(dev);
5191 	}
5192 
5193 	/* Process any work delayed until the end of the batch */
5194 	dev = list_first_entry(head, struct net_device, unreg_list);
5195 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5196 
5197 	rcu_barrier();
5198 
5199 	list_for_each_entry(dev, head, unreg_list)
5200 		dev_put(dev);
5201 }
5202 
5203 static void rollback_registered(struct net_device *dev)
5204 {
5205 	LIST_HEAD(single);
5206 
5207 	list_add(&dev->unreg_list, &single);
5208 	rollback_registered_many(&single);
5209 	list_del(&single);
5210 }
5211 
5212 u32 netdev_fix_features(struct net_device *dev, u32 features)
5213 {
5214 	/* Fix illegal checksum combinations */
5215 	if ((features & NETIF_F_HW_CSUM) &&
5216 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5217 		netdev_info(dev, "mixed HW and IP checksum settings.\n");
5218 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5219 	}
5220 
5221 	if ((features & NETIF_F_NO_CSUM) &&
5222 	    (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5223 		netdev_info(dev, "mixed no checksumming and other settings.\n");
5224 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5225 	}
5226 
5227 	/* Fix illegal SG+CSUM combinations. */
5228 	if ((features & NETIF_F_SG) &&
5229 	    !(features & NETIF_F_ALL_CSUM)) {
5230 		netdev_info(dev,
5231 			    "Dropping NETIF_F_SG since no checksum feature.\n");
5232 		features &= ~NETIF_F_SG;
5233 	}
5234 
5235 	/* TSO requires that SG is present as well. */
5236 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5237 		netdev_info(dev, "Dropping NETIF_F_TSO since no SG feature.\n");
5238 		features &= ~NETIF_F_TSO;
5239 	}
5240 
5241 	/* Software GSO depends on SG. */
5242 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5243 		netdev_info(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5244 		features &= ~NETIF_F_GSO;
5245 	}
5246 
5247 	/* UFO needs SG and checksumming */
5248 	if (features & NETIF_F_UFO) {
5249 		/* maybe split UFO into V4 and V6? */
5250 		if (!((features & NETIF_F_GEN_CSUM) ||
5251 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5252 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5253 			netdev_info(dev,
5254 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5255 			features &= ~NETIF_F_UFO;
5256 		}
5257 
5258 		if (!(features & NETIF_F_SG)) {
5259 			netdev_info(dev,
5260 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5261 			features &= ~NETIF_F_UFO;
5262 		}
5263 	}
5264 
5265 	return features;
5266 }
5267 EXPORT_SYMBOL(netdev_fix_features);
5268 
5269 void netdev_update_features(struct net_device *dev)
5270 {
5271 	u32 features;
5272 	int err = 0;
5273 
5274 	features = netdev_get_wanted_features(dev);
5275 
5276 	if (dev->netdev_ops->ndo_fix_features)
5277 		features = dev->netdev_ops->ndo_fix_features(dev, features);
5278 
5279 	/* driver might be less strict about feature dependencies */
5280 	features = netdev_fix_features(dev, features);
5281 
5282 	if (dev->features == features)
5283 		return;
5284 
5285 	netdev_info(dev, "Features changed: 0x%08x -> 0x%08x\n",
5286 		dev->features, features);
5287 
5288 	if (dev->netdev_ops->ndo_set_features)
5289 		err = dev->netdev_ops->ndo_set_features(dev, features);
5290 
5291 	if (!err)
5292 		dev->features = features;
5293 	else if (err < 0)
5294 		netdev_err(dev,
5295 			"set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5296 			err, features, dev->features);
5297 }
5298 EXPORT_SYMBOL(netdev_update_features);
5299 
5300 /**
5301  *	netif_stacked_transfer_operstate -	transfer operstate
5302  *	@rootdev: the root or lower level device to transfer state from
5303  *	@dev: the device to transfer operstate to
5304  *
5305  *	Transfer operational state from root to device. This is normally
5306  *	called when a stacking relationship exists between the root
5307  *	device and the device (a leaf device).
5308  */
5309 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5310 					struct net_device *dev)
5311 {
5312 	if (rootdev->operstate == IF_OPER_DORMANT)
5313 		netif_dormant_on(dev);
5314 	else
5315 		netif_dormant_off(dev);
5316 
5317 	if (netif_carrier_ok(rootdev)) {
5318 		if (!netif_carrier_ok(dev))
5319 			netif_carrier_on(dev);
5320 	} else {
5321 		if (netif_carrier_ok(dev))
5322 			netif_carrier_off(dev);
5323 	}
5324 }
5325 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5326 
5327 #ifdef CONFIG_RPS
5328 static int netif_alloc_rx_queues(struct net_device *dev)
5329 {
5330 	unsigned int i, count = dev->num_rx_queues;
5331 	struct netdev_rx_queue *rx;
5332 
5333 	BUG_ON(count < 1);
5334 
5335 	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5336 	if (!rx) {
5337 		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5338 		return -ENOMEM;
5339 	}
5340 	dev->_rx = rx;
5341 
5342 	for (i = 0; i < count; i++)
5343 		rx[i].dev = dev;
5344 	return 0;
5345 }
5346 #endif
5347 
5348 static void netdev_init_one_queue(struct net_device *dev,
5349 				  struct netdev_queue *queue, void *_unused)
5350 {
5351 	/* Initialize queue lock */
5352 	spin_lock_init(&queue->_xmit_lock);
5353 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5354 	queue->xmit_lock_owner = -1;
5355 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5356 	queue->dev = dev;
5357 }
5358 
5359 static int netif_alloc_netdev_queues(struct net_device *dev)
5360 {
5361 	unsigned int count = dev->num_tx_queues;
5362 	struct netdev_queue *tx;
5363 
5364 	BUG_ON(count < 1);
5365 
5366 	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5367 	if (!tx) {
5368 		pr_err("netdev: Unable to allocate %u tx queues.\n",
5369 		       count);
5370 		return -ENOMEM;
5371 	}
5372 	dev->_tx = tx;
5373 
5374 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5375 	spin_lock_init(&dev->tx_global_lock);
5376 
5377 	return 0;
5378 }
5379 
5380 /**
5381  *	register_netdevice	- register a network device
5382  *	@dev: device to register
5383  *
5384  *	Take a completed network device structure and add it to the kernel
5385  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5386  *	chain. 0 is returned on success. A negative errno code is returned
5387  *	on a failure to set up the device, or if the name is a duplicate.
5388  *
5389  *	Callers must hold the rtnl semaphore. You may want
5390  *	register_netdev() instead of this.
5391  *
5392  *	BUGS:
5393  *	The locking appears insufficient to guarantee two parallel registers
5394  *	will not get the same name.
5395  */
5396 
5397 int register_netdevice(struct net_device *dev)
5398 {
5399 	int ret;
5400 	struct net *net = dev_net(dev);
5401 
5402 	BUG_ON(dev_boot_phase);
5403 	ASSERT_RTNL();
5404 
5405 	might_sleep();
5406 
5407 	/* When net_devices are persistent, this will be fatal. */
5408 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5409 	BUG_ON(!net);
5410 
5411 	spin_lock_init(&dev->addr_list_lock);
5412 	netdev_set_addr_lockdep_class(dev);
5413 
5414 	dev->iflink = -1;
5415 
5416 	/* Init, if this function is available */
5417 	if (dev->netdev_ops->ndo_init) {
5418 		ret = dev->netdev_ops->ndo_init(dev);
5419 		if (ret) {
5420 			if (ret > 0)
5421 				ret = -EIO;
5422 			goto out;
5423 		}
5424 	}
5425 
5426 	ret = dev_get_valid_name(dev, dev->name, 0);
5427 	if (ret)
5428 		goto err_uninit;
5429 
5430 	dev->ifindex = dev_new_index(net);
5431 	if (dev->iflink == -1)
5432 		dev->iflink = dev->ifindex;
5433 
5434 	/* Transfer changeable features to wanted_features and enable
5435 	 * software offloads (GSO and GRO).
5436 	 */
5437 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5438 	dev->features |= NETIF_F_SOFT_FEATURES;
5439 	dev->wanted_features = dev->features & dev->hw_features;
5440 
5441 	/* Avoid warning from netdev_fix_features() for GSO without SG */
5442 	if (!(dev->wanted_features & NETIF_F_SG)) {
5443 		dev->wanted_features &= ~NETIF_F_GSO;
5444 		dev->features &= ~NETIF_F_GSO;
5445 	}
5446 
5447 	/* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5448 	 * vlan_dev_init() will do the dev->features check, so these features
5449 	 * are enabled only if supported by underlying device.
5450 	 */
5451 	dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5452 
5453 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5454 	ret = notifier_to_errno(ret);
5455 	if (ret)
5456 		goto err_uninit;
5457 
5458 	ret = netdev_register_kobject(dev);
5459 	if (ret)
5460 		goto err_uninit;
5461 	dev->reg_state = NETREG_REGISTERED;
5462 
5463 	netdev_update_features(dev);
5464 
5465 	/*
5466 	 *	Default initial state at registration is that the
5467 	 *	device is present.
5468 	 */
5469 
5470 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5471 
5472 	dev_init_scheduler(dev);
5473 	dev_hold(dev);
5474 	list_netdevice(dev);
5475 
5476 	/* Notify protocols, that a new device appeared. */
5477 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5478 	ret = notifier_to_errno(ret);
5479 	if (ret) {
5480 		rollback_registered(dev);
5481 		dev->reg_state = NETREG_UNREGISTERED;
5482 	}
5483 	/*
5484 	 *	Prevent userspace races by waiting until the network
5485 	 *	device is fully setup before sending notifications.
5486 	 */
5487 	if (!dev->rtnl_link_ops ||
5488 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5489 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5490 
5491 out:
5492 	return ret;
5493 
5494 err_uninit:
5495 	if (dev->netdev_ops->ndo_uninit)
5496 		dev->netdev_ops->ndo_uninit(dev);
5497 	goto out;
5498 }
5499 EXPORT_SYMBOL(register_netdevice);
5500 
5501 /**
5502  *	init_dummy_netdev	- init a dummy network device for NAPI
5503  *	@dev: device to init
5504  *
5505  *	This takes a network device structure and initializes the minimum
5506  *	amount of fields so it can be used to schedule NAPI polls without
5507  *	registering a full-blown interface. This is to be used by drivers
5508  *	that need to tie several hardware interfaces to a single NAPI
5509  *	poll scheduler due to HW limitations.
5510  */
5511 int init_dummy_netdev(struct net_device *dev)
5512 {
5513 	/* Clear everything. Note we don't initialize spinlocks
5514 	 * as they aren't supposed to be taken by any of the
5515 	 * NAPI code and this dummy netdev is supposed to be
5516 	 * used only for NAPI polls.
5517 	 */
5518 	memset(dev, 0, sizeof(struct net_device));
5519 
5520 	/* make sure we BUG if trying to hit standard
5521 	 * register/unregister code path
5522 	 */
5523 	dev->reg_state = NETREG_DUMMY;
5524 
5525 	/* NAPI wants this */
5526 	INIT_LIST_HEAD(&dev->napi_list);
5527 
5528 	/* a dummy interface is started by default */
5529 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5530 	set_bit(__LINK_STATE_START, &dev->state);
5531 
5532 	/* Note: We don't allocate pcpu_refcnt for dummy devices,
5533 	 * because users of this 'device' don't need to change
5534 	 * its refcount.
5535 	 */
5536 
5537 	return 0;
5538 }
5539 EXPORT_SYMBOL_GPL(init_dummy_netdev);
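
/*
 * Example (a sketch): a driver that has to funnel several hardware channels
 * through one NAPI context can anchor its napi_struct on a dummy netdev.
 * my_adapter and my_poll() are hypothetical.
 *
 *	init_dummy_netdev(&ap->dummy_dev);
 *	netif_napi_add(&ap->dummy_dev, &ap->napi, my_poll, 64);
 */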
5540 
5541 
5542 /**
5543  *	register_netdev	- register a network device
5544  *	@dev: device to register
5545  *
5546  *	Take a completed network device structure and add it to the kernel
5547  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5548  *	chain. 0 is returned on success. A negative errno code is returned
5549  *	on a failure to set up the device, or if the name is a duplicate.
5550  *
5551  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5552  *	and expands the device name if you passed a format string to
5553  *	alloc_netdev.
5554  */
5555 int register_netdev(struct net_device *dev)
5556 {
5557 	int err;
5558 
5559 	rtnl_lock();
5560 
5561 	/*
5562 	 * If the name is a format string the caller wants us to do a
5563 	 * name allocation.
5564 	 */
5565 	if (strchr(dev->name, '%')) {
5566 		err = dev_alloc_name(dev, dev->name);
5567 		if (err < 0)
5568 			goto out;
5569 	}
5570 
5571 	err = register_netdevice(dev);
5572 out:
5573 	rtnl_unlock();
5574 	return err;
5575 }
5576 EXPORT_SYMBOL(register_netdev);
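
/*
 * Example (a sketch): the usual probe-time sequence around register_netdev()
 * for an Ethernet driver.  my_priv and my_netdev_ops are hypothetical; error
 * handling is abbreviated.
 *
 *	netdev = alloc_etherdev(sizeof(struct my_priv));
 *	if (!netdev)
 *		return -ENOMEM;
 *	netdev->netdev_ops = &my_netdev_ops;
 *	err = register_netdev(netdev);
 *	if (err) {
 *		free_netdev(netdev);
 *		return err;
 *	}
 *	...
 *	unregister_netdev(netdev);	// on remove
 *	free_netdev(netdev);
 */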
5577 
5578 int netdev_refcnt_read(const struct net_device *dev)
5579 {
5580 	int i, refcnt = 0;
5581 
5582 	for_each_possible_cpu(i)
5583 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5584 	return refcnt;
5585 }
5586 EXPORT_SYMBOL(netdev_refcnt_read);
5587 
5588 /*
5589  * netdev_wait_allrefs - wait until all references are gone.
5590  *
5591  * This is called when unregistering network devices.
5592  *
5593  * Any protocol or device that holds a reference should register
5594  * for netdevice notification, and clean up and put back the
5595  * reference if they receive an UNREGISTER event.
5596  * We can get stuck here if buggy protocols don't correctly
5597  * call dev_put.
5598  */
5599 static void netdev_wait_allrefs(struct net_device *dev)
5600 {
5601 	unsigned long rebroadcast_time, warning_time;
5602 	int refcnt;
5603 
5604 	linkwatch_forget_dev(dev);
5605 
5606 	rebroadcast_time = warning_time = jiffies;
5607 	refcnt = netdev_refcnt_read(dev);
5608 
5609 	while (refcnt != 0) {
5610 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5611 			rtnl_lock();
5612 
5613 			/* Rebroadcast unregister notification */
5614 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5615 			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5616 			 * should have already handled it the first time */
5617 
5618 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5619 				     &dev->state)) {
5620 				/* We must not have linkwatch events
5621 				 * pending on unregister. If this
5622 				 * happens, we simply run the queue
5623 				 * unscheduled, resulting in a noop
5624 				 * for this device.
5625 				 */
5626 				linkwatch_run_queue();
5627 			}
5628 
5629 			__rtnl_unlock();
5630 
5631 			rebroadcast_time = jiffies;
5632 		}
5633 
5634 		msleep(250);
5635 
5636 		refcnt = netdev_refcnt_read(dev);
5637 
5638 		if (time_after(jiffies, warning_time + 10 * HZ)) {
5639 			printk(KERN_EMERG "unregister_netdevice: "
5640 			       "waiting for %s to become free. Usage "
5641 			       "count = %d\n",
5642 			       dev->name, refcnt);
5643 			warning_time = jiffies;
5644 		}
5645 	}
5646 }
5647 
5648 /* The sequence is:
5649  *
5650  *	rtnl_lock();
5651  *	...
5652  *	register_netdevice(x1);
5653  *	register_netdevice(x2);
5654  *	...
5655  *	unregister_netdevice(y1);
5656  *	unregister_netdevice(y2);
5657  *      ...
5658  *	rtnl_unlock();
5659  *	free_netdev(y1);
5660  *	free_netdev(y2);
5661  *
5662  * We are invoked by rtnl_unlock().
5663  * This allows us to deal with problems:
5664  * 1) We can delete sysfs objects which invoke hotplug
5665  *    without deadlocking with linkwatch via keventd.
5666  * 2) Since we run with the RTNL semaphore not held, we can sleep
5667  *    safely in order to wait for the netdev refcnt to drop to zero.
5668  *
5669  * We must not return until all unregister events added during
5670  * the interval the lock was held have been completed.
5671  */
5672 void netdev_run_todo(void)
5673 {
5674 	struct list_head list;
5675 
5676 	/* Snapshot list, allow later requests */
5677 	list_replace_init(&net_todo_list, &list);
5678 
5679 	__rtnl_unlock();
5680 
5681 	while (!list_empty(&list)) {
5682 		struct net_device *dev
5683 			= list_first_entry(&list, struct net_device, todo_list);
5684 		list_del(&dev->todo_list);
5685 
5686 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5687 			printk(KERN_ERR "network todo '%s' but state %d\n",
5688 			       dev->name, dev->reg_state);
5689 			dump_stack();
5690 			continue;
5691 		}
5692 
5693 		dev->reg_state = NETREG_UNREGISTERED;
5694 
5695 		on_each_cpu(flush_backlog, dev, 1);
5696 
5697 		netdev_wait_allrefs(dev);
5698 
5699 		/* paranoia */
5700 		BUG_ON(netdev_refcnt_read(dev));
5701 		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5702 		WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5703 		WARN_ON(dev->dn_ptr);
5704 
5705 		if (dev->destructor)
5706 			dev->destructor(dev);
5707 
5708 		/* Free network device */
5709 		kobject_put(&dev->dev.kobj);
5710 	}
5711 }
5712 
5713 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5714  * fields in the same order, with only the type differing.
5715  */
5716 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5717 				    const struct net_device_stats *netdev_stats)
5718 {
5719 #if BITS_PER_LONG == 64
5720 	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5721 	memcpy(stats64, netdev_stats, sizeof(*stats64));
5722 #else
5723 	size_t i, n = sizeof(*stats64) / sizeof(u64);
5724 	const unsigned long *src = (const unsigned long *)netdev_stats;
5725 	u64 *dst = (u64 *)stats64;
5726 
5727 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5728 		     sizeof(*stats64) / sizeof(u64));
5729 	for (i = 0; i < n; i++)
5730 		dst[i] = src[i];
5731 #endif
5732 }
5733 
5734 /**
5735  *	dev_get_stats	- get network device statistics
5736  *	@dev: device to get statistics from
5737  *	@storage: place to store stats
5738  *
5739  *	Get network statistics from device. Return @storage.
5740  *	The device driver may provide its own method by setting
5741  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5742  *	otherwise the internal statistics structure is used.
5743  */
5744 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5745 					struct rtnl_link_stats64 *storage)
5746 {
5747 	const struct net_device_ops *ops = dev->netdev_ops;
5748 
5749 	if (ops->ndo_get_stats64) {
5750 		memset(storage, 0, sizeof(*storage));
5751 		ops->ndo_get_stats64(dev, storage);
5752 	} else if (ops->ndo_get_stats) {
5753 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5754 	} else {
5755 		netdev_stats_to_stats64(storage, &dev->stats);
5756 	}
5757 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5758 	return storage;
5759 }
5760 EXPORT_SYMBOL(dev_get_stats);
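
/*
 * Example (a sketch): a driver supplying 64-bit statistics through
 * ndo_get_stats64, which dev_get_stats() prefers when present.  The my_priv
 * counters are hypothetical.
 *
 *	static struct rtnl_link_stats64 *
 *	my_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 *	{
 *		struct my_priv *priv = netdev_priv(dev);
 *
 *		stats->rx_packets = priv->rx_packets;
 *		stats->rx_bytes   = priv->rx_bytes;
 *		stats->tx_packets = priv->tx_packets;
 *		stats->tx_bytes   = priv->tx_bytes;
 *		return stats;
 *	}
 */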
5761 
5762 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5763 {
5764 	struct netdev_queue *queue = dev_ingress_queue(dev);
5765 
5766 #ifdef CONFIG_NET_CLS_ACT
5767 	if (queue)
5768 		return queue;
5769 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5770 	if (!queue)
5771 		return NULL;
5772 	netdev_init_one_queue(dev, queue, NULL);
5773 	queue->qdisc = &noop_qdisc;
5774 	queue->qdisc_sleeping = &noop_qdisc;
5775 	rcu_assign_pointer(dev->ingress_queue, queue);
5776 #endif
5777 	return queue;
5778 }
5779 
5780 /**
5781  *	alloc_netdev_mqs - allocate network device
5782  *	@sizeof_priv:	size of private data to allocate space for
5783  *	@name:		device name format string
5784  *	@setup:		callback to initialize device
5785  *	@txqs:		the number of TX subqueues to allocate
5786  *	@rxqs:		the number of RX subqueues to allocate
5787  *
5788  *	Allocates a struct net_device with private data area for driver use
5789  *	and performs basic initialization.  Also allocates subqueue structs
5790  *	for each queue on the device.
5791  */
5792 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5793 		void (*setup)(struct net_device *),
5794 		unsigned int txqs, unsigned int rxqs)
5795 {
5796 	struct net_device *dev;
5797 	size_t alloc_size;
5798 	struct net_device *p;
5799 
5800 	BUG_ON(strlen(name) >= sizeof(dev->name));
5801 
5802 	if (txqs < 1) {
5803 		pr_err("alloc_netdev: Unable to allocate device "
5804 		       "with zero queues.\n");
5805 		return NULL;
5806 	}
5807 
5808 #ifdef CONFIG_RPS
5809 	if (rxqs < 1) {
5810 		pr_err("alloc_netdev: Unable to allocate device "
5811 		       "with zero RX queues.\n");
5812 		return NULL;
5813 	}
5814 #endif
5815 
5816 	alloc_size = sizeof(struct net_device);
5817 	if (sizeof_priv) {
5818 		/* ensure 32-byte alignment of private area */
5819 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5820 		alloc_size += sizeof_priv;
5821 	}
5822 	/* ensure 32-byte alignment of whole construct */
5823 	alloc_size += NETDEV_ALIGN - 1;
5824 
5825 	p = kzalloc(alloc_size, GFP_KERNEL);
5826 	if (!p) {
5827 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5828 		return NULL;
5829 	}
5830 
5831 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5832 	dev->padded = (char *)dev - (char *)p;
5833 
5834 	dev->pcpu_refcnt = alloc_percpu(int);
5835 	if (!dev->pcpu_refcnt)
5836 		goto free_p;
5837 
5838 	if (dev_addr_init(dev))
5839 		goto free_pcpu;
5840 
5841 	dev_mc_init(dev);
5842 	dev_uc_init(dev);
5843 
5844 	dev_net_set(dev, &init_net);
5845 
5846 	dev->gso_max_size = GSO_MAX_SIZE;
5847 
5848 	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5849 	dev->ethtool_ntuple_list.count = 0;
5850 	INIT_LIST_HEAD(&dev->napi_list);
5851 	INIT_LIST_HEAD(&dev->unreg_list);
5852 	INIT_LIST_HEAD(&dev->link_watch_list);
5853 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5854 	setup(dev);
5855 
5856 	dev->num_tx_queues = txqs;
5857 	dev->real_num_tx_queues = txqs;
5858 	if (netif_alloc_netdev_queues(dev))
5859 		goto free_all;
5860 
5861 #ifdef CONFIG_RPS
5862 	dev->num_rx_queues = rxqs;
5863 	dev->real_num_rx_queues = rxqs;
5864 	if (netif_alloc_rx_queues(dev))
5865 		goto free_all;
5866 #endif
5867 
5868 	strcpy(dev->name, name);
5869 	dev->group = INIT_NETDEV_GROUP;
5870 	return dev;
5871 
5872 free_all:
5873 	free_netdev(dev);
5874 	return NULL;
5875 
5876 free_pcpu:
5877 	free_percpu(dev->pcpu_refcnt);
5878 	kfree(dev->_tx);
5879 #ifdef CONFIG_RPS
5880 	kfree(dev->_rx);
5881 #endif
5882 
5883 free_p:
5884 	kfree(p);
5885 	return NULL;
5886 }
5887 EXPORT_SYMBOL(alloc_netdev_mqs);
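
/*
 * Illustrative sketch, not part of the original file: a minimal way a
 * multiqueue Ethernet driver might call alloc_netdev_mqs().  struct foo_priv,
 * foo_alloc() and the queue counts are hypothetical; ether_setup() and
 * netdev_priv() are the usual helpers from <linux/etherdevice.h> and
 * <linux/netdevice.h>.
 *
 *	struct foo_priv {
 *		spinlock_t lock;
 *	};
 *
 *	static struct net_device *foo_alloc(void)
 *	{
 *		struct net_device *dev;
 *		struct foo_priv *priv;
 *
 *		// "foo%d" is resolved to a free name at register time
 *		dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
 *				       ether_setup, 8, 8);
 *		if (!dev)
 *			return NULL;
 *
 *		priv = netdev_priv(dev);
 *		spin_lock_init(&priv->lock);
 *		return dev;
 *	}
 */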
5888 
5889 /**
5890  *	free_netdev - free network device
5891  *	@dev: device
5892  *
5893  *	This function does the last stage of destroying an allocated device
5894  * 	interface. The reference to the device object is released.
5895  *	If this is the last reference then it will be freed.
5896  */
5897 void free_netdev(struct net_device *dev)
5898 {
5899 	struct napi_struct *p, *n;
5900 
5901 	release_net(dev_net(dev));
5902 
5903 	kfree(dev->_tx);
5904 #ifdef CONFIG_RPS
5905 	kfree(dev->_rx);
5906 #endif
5907 
5908 	kfree(rcu_dereference_raw(dev->ingress_queue));
5909 
5910 	/* Flush device addresses */
5911 	dev_addr_flush(dev);
5912 
5913 	/* Clear ethtool n-tuple list */
5914 	ethtool_ntuple_flush(dev);
5915 
5916 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5917 		netif_napi_del(p);
5918 
5919 	free_percpu(dev->pcpu_refcnt);
5920 	dev->pcpu_refcnt = NULL;
5921 
5922 	/*  Compatibility with error handling in drivers */
5923 	if (dev->reg_state == NETREG_UNINITIALIZED) {
5924 		kfree((char *)dev - dev->padded);
5925 		return;
5926 	}
5927 
5928 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5929 	dev->reg_state = NETREG_RELEASED;
5930 
5931 	/* will free via device release */
5932 	put_device(&dev->dev);
5933 }
5934 EXPORT_SYMBOL(free_netdev);
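
/*
 * Illustrative sketch, not part of the original file: the error-handling
 * pattern free_netdev() is written to cope with.  If register_netdev()
 * fails, reg_state is still NETREG_UNINITIALIZED and free_netdev() simply
 * kfree()s the allocation; after a successful unregister it instead drops
 * the final device reference.  foo_probe() and foo_setup() are hypothetical.
 *
 *	static int foo_probe(void)
 *	{
 *		struct net_device *dev;
 *		int err;
 *
 *		dev = alloc_netdev(0, "foo%d", foo_setup);
 *		if (!dev)
 *			return -ENOMEM;
 *
 *		err = register_netdev(dev);
 *		if (err) {
 *			free_netdev(dev);	// device was never registered
 *			return err;
 *		}
 *		return 0;
 *	}
 */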
5935 
5936 /**
5937  *	synchronize_net -  Synchronize with packet receive processing
5938  *
5939  *	Wait for packets currently being received to be done.
5940  *	Does not block later packets from starting.
5941  */
5942 void synchronize_net(void)
5943 {
5944 	might_sleep();
5945 	synchronize_rcu();
5946 }
5947 EXPORT_SYMBOL(synchronize_net);
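
/*
 * Illustrative sketch, not part of the original file: the pattern this
 * helper exists for.  A writer unpublishes an RCU-protected object that the
 * receive path may be reading, waits for those readers to drain with
 * synchronize_net(), and only then frees the object.  struct foo_rule,
 * foo_active_rule and foo_rule_unpublish() are hypothetical.
 *
 *	struct foo_rule {
 *		int match;
 *	};
 *
 *	static struct foo_rule __rcu *foo_active_rule;
 *
 *	static void foo_rule_unpublish(struct foo_rule *rule)
 *	{
 *		rcu_assign_pointer(foo_active_rule, NULL);
 *		synchronize_net();	// packets in flight have now finished
 *		kfree(rule);
 *	}
 */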
5948 
5949 /**
5950  *	unregister_netdevice_queue - remove device from the kernel
5951  *	@dev: device
5952  *	@head: list
5953  *
5954  *	This function shuts down a device interface and removes it
5955  *	from the kernel tables.
5956  *	If @head is not NULL, the device is queued to be unregistered later.
5957  *
5958  *	Callers must hold the rtnl semaphore.  You may want
5959  *	unregister_netdev() instead of this.
5960  */
5961 
5962 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5963 {
5964 	ASSERT_RTNL();
5965 
5966 	if (head) {
5967 		list_move_tail(&dev->unreg_list, head);
5968 	} else {
5969 		rollback_registered(dev);
5970 		/* Finish processing unregister after unlock */
5971 		net_set_todo(dev);
5972 	}
5973 }
5974 EXPORT_SYMBOL(unregister_netdevice_queue);
5975 
5976 /**
5977  *	unregister_netdevice_many - unregister many devices
5978  *	@head: list of devices
5979  */
5980 void unregister_netdevice_many(struct list_head *head)
5981 {
5982 	struct net_device *dev;
5983 
5984 	if (!list_empty(head)) {
5985 		rollback_registered_many(head);
5986 		list_for_each_entry(dev, head, unreg_list)
5987 			net_set_todo(dev);
5988 	}
5989 }
5990 EXPORT_SYMBOL(unregister_netdevice_many);
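
/*
 * Illustrative sketch, not part of the original file: how a caller that
 * owns several devices can batch their removal, which is the point of the
 * unregister_netdevice_queue()/unregister_netdevice_many() pair.
 * foo_link_ops and foo_destroy_all() are hypothetical.
 *
 *	static void foo_destroy_all(struct net *net)
 *	{
 *		struct net_device *dev;
 *		LIST_HEAD(kill_list);
 *
 *		rtnl_lock();
 *		for_each_netdev(net, dev) {
 *			if (dev->rtnl_link_ops == &foo_link_ops)
 *				unregister_netdevice_queue(dev, &kill_list);
 *		}
 *		unregister_netdevice_many(&kill_list);
 *		rtnl_unlock();
 *	}
 */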
5991 
5992 /**
5993  *	unregister_netdev - remove device from the kernel
5994  *	@dev: device
5995  *
5996  *	This function shuts down a device interface and removes it
5997  *	from the kernel tables.
5998  *
5999  *	This is just a wrapper for unregister_netdevice that takes
6000  *	the rtnl semaphore.  In general you want to use this and not
6001  *	unregister_netdevice.
6002  */
6003 void unregister_netdev(struct net_device *dev)
6004 {
6005 	rtnl_lock();
6006 	unregister_netdevice(dev);
6007 	rtnl_unlock();
6008 }
6009 EXPORT_SYMBOL(unregister_netdev);
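
/*
 * Illustrative sketch, not part of the original file: the usual teardown
 * order in a driver's module exit path, using the locked wrapper above and
 * then releasing the allocation.  foo_dev and foo_exit() are hypothetical.
 *
 *	static struct net_device *foo_dev;
 *
 *	static void __exit foo_exit(void)
 *	{
 *		unregister_netdev(foo_dev);	// takes and releases the rtnl lock
 *		free_netdev(foo_dev);		// drops the last reference
 *	}
 */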
6010 
6011 /**
6012  *	dev_change_net_namespace - move device to a different network namespace
6013  *	@dev: device
6014  *	@net: network namespace
6015  *	@pat: If not NULL name pattern to try if the current device name
6016  *	      is already taken in the destination network namespace.
6017  *
6018  *	This function shuts down a device interface and moves it
6019  *	to a new network namespace. On success 0 is returned; on
6020  *	failure a negative errno code is returned.
6021  *
6022  *	Callers must hold the rtnl semaphore.
6023  */
6024 
6025 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6026 {
6027 	int err;
6028 
6029 	ASSERT_RTNL();
6030 
6031 	/* Don't allow namespace local devices to be moved. */
6032 	err = -EINVAL;
6033 	if (dev->features & NETIF_F_NETNS_LOCAL)
6034 		goto out;
6035 
6036 	/* Ensure the device has been registered */
6037 	err = -EINVAL;
6038 	if (dev->reg_state != NETREG_REGISTERED)
6039 		goto out;
6040 
6041 	/* Get out if there is nothing to do */
6042 	err = 0;
6043 	if (net_eq(dev_net(dev), net))
6044 		goto out;
6045 
6046 	/* Pick the destination device name, and ensure
6047 	 * we can use it in the destination network namespace.
6048 	 */
6049 	err = -EEXIST;
6050 	if (__dev_get_by_name(net, dev->name)) {
6051 		/* We get here if we can't use the current device name */
6052 		if (!pat)
6053 			goto out;
6054 		if (dev_get_valid_name(dev, pat, 1))
6055 			goto out;
6056 	}
6057 
6058 	/*
6059 	 * And now a mini version of register_netdevice and unregister_netdevice.
6060 	 */
6061 
6062 	/* If device is running close it first. */
6063 	dev_close(dev);
6064 
6065 	/* And unlink it from device chain */
6066 	err = -ENODEV;
6067 	unlist_netdevice(dev);
6068 
6069 	synchronize_net();
6070 
6071 	/* Shutdown queueing discipline. */
6072 	dev_shutdown(dev);
6073 
6074 	/* Notify protocols that we are about to destroy
6075 	   this device. They should clean up all of their state.
6076 
6077 	   Note that dev->reg_state stays at NETREG_REGISTERED.
6078 	   This is intentional: this way 8021q and macvlan know
6079 	   the device is just moving and can keep their slaves up.
6080 	*/
6081 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6082 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6083 
6084 	/*
6085 	 *	Flush the unicast and multicast chains
6086 	 */
6087 	dev_uc_flush(dev);
6088 	dev_mc_flush(dev);
6089 
6090 	/* Actually switch the network namespace */
6091 	dev_net_set(dev, net);
6092 
6093 	/* If there is an ifindex conflict assign a new one */
6094 	if (__dev_get_by_index(net, dev->ifindex)) {
6095 		int iflink = (dev->iflink == dev->ifindex);
6096 		dev->ifindex = dev_new_index(net);
6097 		if (iflink)
6098 			dev->iflink = dev->ifindex;
6099 	}
6100 
6101 	/* Fixup kobjects */
6102 	err = device_rename(&dev->dev, dev->name);
6103 	WARN_ON(err);
6104 
6105 	/* Add the device back in the hashes */
6106 	list_netdevice(dev);
6107 
6108 	/* Notify protocols that a new device appeared. */
6109 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6110 
6111 	/*
6112 	 *	Prevent userspace races by waiting until the network
6113 	 *	device is fully setup before sending notifications.
6114 	 */
6115 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6116 
6117 	synchronize_net();
6118 	err = 0;
6119 out:
6120 	return err;
6121 }
6122 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
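
/*
 * Illustrative sketch, not part of the original file: a caller moving a
 * device into another namespace, much as the rtnetlink code does when
 * userspace sets IFLA_NET_NS_PID.  Looking up @net (for instance with
 * get_net_ns_by_pid()) and releasing it again are the caller's problem;
 * foo_move_dev() is a hypothetical name.
 *
 *	static int foo_move_dev(struct net_device *dev, struct net *net)
 *	{
 *		int err;
 *
 *		rtnl_lock();
 *		// fall back to "eth%d" if dev->name is taken in @net
 *		err = dev_change_net_namespace(dev, net, "eth%d");
 *		rtnl_unlock();
 *		return err;
 *	}
 */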
6123 
6124 static int dev_cpu_callback(struct notifier_block *nfb,
6125 			    unsigned long action,
6126 			    void *ocpu)
6127 {
6128 	struct sk_buff **list_skb;
6129 	struct sk_buff *skb;
6130 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6131 	struct softnet_data *sd, *oldsd;
6132 
6133 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6134 		return NOTIFY_OK;
6135 
6136 	local_irq_disable();
6137 	cpu = smp_processor_id();
6138 	sd = &per_cpu(softnet_data, cpu);
6139 	oldsd = &per_cpu(softnet_data, oldcpu);
6140 
6141 	/* Find end of our completion_queue. */
6142 	list_skb = &sd->completion_queue;
6143 	while (*list_skb)
6144 		list_skb = &(*list_skb)->next;
6145 	/* Append completion queue from offline CPU. */
6146 	*list_skb = oldsd->completion_queue;
6147 	oldsd->completion_queue = NULL;
6148 
6149 	/* Append output queue from offline CPU. */
6150 	if (oldsd->output_queue) {
6151 		*sd->output_queue_tailp = oldsd->output_queue;
6152 		sd->output_queue_tailp = oldsd->output_queue_tailp;
6153 		oldsd->output_queue = NULL;
6154 		oldsd->output_queue_tailp = &oldsd->output_queue;
6155 	}
6156 
6157 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6158 	local_irq_enable();
6159 
6160 	/* Process offline CPU's input_pkt_queue */
6161 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6162 		netif_rx(skb);
6163 		input_queue_head_incr(oldsd);
6164 	}
6165 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6166 		netif_rx(skb);
6167 		input_queue_head_incr(oldsd);
6168 	}
6169 
6170 	return NOTIFY_OK;
6171 }
6172 
6173 
6174 /**
6175  *	netdev_increment_features - increment feature set by one
6176  *	@all: current feature set
6177  *	@one: new feature set
6178  *	@mask: mask feature set
6179  *
6180  *	Computes a new feature set after adding a device with feature set
6181  *	@one to the master device with current feature set @all.  Will not
6182  *	enable anything that is off in @mask. Returns the new feature set.
6183  */
6184 u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6185 {
6186 	/* If device needs checksumming, downgrade to it. */
6187 	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
6188 		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6189 	else if (mask & NETIF_F_ALL_CSUM) {
6190 		/* If one device supports v4/v6 checksumming, set for all. */
6191 		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6192 		    !(all & NETIF_F_GEN_CSUM)) {
6193 			all &= ~NETIF_F_ALL_CSUM;
6194 			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6195 		}
6196 
6197 		/* If one device supports hw checksumming, set for all. */
6198 		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6199 			all &= ~NETIF_F_ALL_CSUM;
6200 			all |= NETIF_F_HW_CSUM;
6201 		}
6202 	}
6203 
6204 	one |= NETIF_F_ALL_CSUM;
6205 
6206 	one |= all & NETIF_F_ONE_FOR_ALL;
6207 	all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6208 	all |= one & mask & NETIF_F_ONE_FOR_ALL;
6209 
6210 	return all;
6211 }
6212 EXPORT_SYMBOL(netdev_increment_features);
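
/*
 * Illustrative sketch, not part of the original file: how a master device
 * (bridge- or bonding-like) typically folds its slaves' feature sets
 * together with this helper, starting from its own feature mask.
 * struct foo_slave and foo_recompute_features() are hypothetical.
 *
 *	struct foo_slave {
 *		struct list_head list;
 *		struct net_device *dev;
 *	};
 *
 *	static void foo_recompute_features(struct net_device *master,
 *					   struct list_head *slaves)
 *	{
 *		u32 features = master->features;
 *		struct foo_slave *s;
 *
 *		list_for_each_entry(s, slaves, list)
 *			features = netdev_increment_features(features,
 *							     s->dev->features,
 *							     master->features);
 *		master->features = features;
 *	}
 */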
6213 
6214 static struct hlist_head *netdev_create_hash(void)
6215 {
6216 	int i;
6217 	struct hlist_head *hash;
6218 
6219 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6220 	if (hash != NULL)
6221 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6222 			INIT_HLIST_HEAD(&hash[i]);
6223 
6224 	return hash;
6225 }
6226 
6227 /* Initialize per network namespace state */
6228 static int __net_init netdev_init(struct net *net)
6229 {
6230 	INIT_LIST_HEAD(&net->dev_base_head);
6231 
6232 	net->dev_name_head = netdev_create_hash();
6233 	if (net->dev_name_head == NULL)
6234 		goto err_name;
6235 
6236 	net->dev_index_head = netdev_create_hash();
6237 	if (net->dev_index_head == NULL)
6238 		goto err_idx;
6239 
6240 	return 0;
6241 
6242 err_idx:
6243 	kfree(net->dev_name_head);
6244 err_name:
6245 	return -ENOMEM;
6246 }
6247 
6248 /**
6249  *	netdev_drivername - network driver for the device
6250  *	@dev: network device
6251  *	@buffer: buffer for resulting name
6252  *	@len: size of buffer
6253  *
6254  *	Determine network driver for device.
6255  */
6256 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6257 {
6258 	const struct device_driver *driver;
6259 	const struct device *parent;
6260 
6261 	if (len <= 0 || !buffer)
6262 		return buffer;
6263 	buffer[0] = 0;
6264 
6265 	parent = dev->dev.parent;
6266 
6267 	if (!parent)
6268 		return buffer;
6269 
6270 	driver = parent->driver;
6271 	if (driver && driver->name)
6272 		strlcpy(buffer, driver->name, len);
6273 	return buffer;
6274 }
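
/*
 * Illustrative sketch, not part of the original file: the kind of caller
 * this helper serves.  The transmit watchdog, for example, uses it to name
 * the responsible driver in its timeout message.  foo_report_timeout() is
 * a hypothetical name.
 *
 *	static void foo_report_timeout(struct net_device *dev)
 *	{
 *		char drivername[64];
 *
 *		printk(KERN_WARNING "%s (%s): transmit queue timed out\n",
 *		       dev->name,
 *		       netdev_drivername(dev, drivername, sizeof(drivername)));
 *	}
 */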
6275 
6276 static int __netdev_printk(const char *level, const struct net_device *dev,
6277 			   struct va_format *vaf)
6278 {
6279 	int r;
6280 
6281 	if (dev && dev->dev.parent)
6282 		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6283 			       netdev_name(dev), vaf);
6284 	else if (dev)
6285 		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6286 	else
6287 		r = printk("%s(NULL net_device): %pV", level, vaf);
6288 
6289 	return r;
6290 }
6291 
6292 int netdev_printk(const char *level, const struct net_device *dev,
6293 		  const char *format, ...)
6294 {
6295 	struct va_format vaf;
6296 	va_list args;
6297 	int r;
6298 
6299 	va_start(args, format);
6300 
6301 	vaf.fmt = format;
6302 	vaf.va = &args;
6303 
6304 	r = __netdev_printk(level, dev, &vaf);
6305 	va_end(args);
6306 
6307 	return r;
6308 }
6309 EXPORT_SYMBOL(netdev_printk);
6310 
6311 #define define_netdev_printk_level(func, level)			\
6312 int func(const struct net_device *dev, const char *fmt, ...)	\
6313 {								\
6314 	int r;							\
6315 	struct va_format vaf;					\
6316 	va_list args;						\
6317 								\
6318 	va_start(args, fmt);					\
6319 								\
6320 	vaf.fmt = fmt;						\
6321 	vaf.va = &args;						\
6322 								\
6323 	r = __netdev_printk(level, dev, &vaf);			\
6324 	va_end(args);						\
6325 								\
6326 	return r;						\
6327 }								\
6328 EXPORT_SYMBOL(func);
6329 
6330 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6331 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6332 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6333 define_netdev_printk_level(netdev_err, KERN_ERR);
6334 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6335 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6336 define_netdev_printk_level(netdev_info, KERN_INFO);
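
/*
 * Illustrative sketch, not part of the original file: drivers use the
 * wrappers defined above instead of bare printk() so that every message is
 * prefixed with the parent device and the interface name.  foo_open() and
 * foo_hw_start() are hypothetical.
 *
 *	static int foo_open(struct net_device *dev)
 *	{
 *		int err = foo_hw_start(dev);	// hypothetical hardware init
 *
 *		if (err) {
 *			netdev_err(dev, "failed to start hardware: %d\n", err);
 *			return err;
 *		}
 *		netdev_info(dev, "%u tx queue(s) enabled\n",
 *			    dev->real_num_tx_queues);
 *		return 0;
 *	}
 */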
6337 
6338 static void __net_exit netdev_exit(struct net *net)
6339 {
6340 	kfree(net->dev_name_head);
6341 	kfree(net->dev_index_head);
6342 }
6343 
6344 static struct pernet_operations __net_initdata netdev_net_ops = {
6345 	.init = netdev_init,
6346 	.exit = netdev_exit,
6347 };
6348 
6349 static void __net_exit default_device_exit(struct net *net)
6350 {
6351 	struct net_device *dev, *aux;
6352 	/*
6353 	 * Push all migratable network devices back to the
6354 	 * initial network namespace
6355 	 */
6356 	rtnl_lock();
6357 	for_each_netdev_safe(net, dev, aux) {
6358 		int err;
6359 		char fb_name[IFNAMSIZ];
6360 
6361 		/* Ignore unmoveable devices (i.e. loopback) */
6362 		if (dev->features & NETIF_F_NETNS_LOCAL)
6363 			continue;
6364 
6365 		/* Leave virtual devices for the generic cleanup */
6366 		if (dev->rtnl_link_ops)
6367 			continue;
6368 
6369 		/* Push remaining network devices to init_net */
6370 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6371 		err = dev_change_net_namespace(dev, &init_net, fb_name);
6372 		if (err) {
6373 			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6374 				__func__, dev->name, err);
6375 			BUG();
6376 		}
6377 	}
6378 	rtnl_unlock();
6379 }
6380 
6381 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6382 {
6383 	/* At exit all network devices must be removed from a network
6384 	 * namespace.  Do this in the reverse order of registration.
6385 	 * Do this across as many network namespaces as possible to
6386 	 * improve batching efficiency.
6387 	 */
6388 	struct net_device *dev;
6389 	struct net *net;
6390 	LIST_HEAD(dev_kill_list);
6391 
6392 	rtnl_lock();
6393 	list_for_each_entry(net, net_list, exit_list) {
6394 		for_each_netdev_reverse(net, dev) {
6395 			if (dev->rtnl_link_ops)
6396 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6397 			else
6398 				unregister_netdevice_queue(dev, &dev_kill_list);
6399 		}
6400 	}
6401 	unregister_netdevice_many(&dev_kill_list);
6402 	list_del(&dev_kill_list);
6403 	rtnl_unlock();
6404 }
6405 
6406 static struct pernet_operations __net_initdata default_device_ops = {
6407 	.exit = default_device_exit,
6408 	.exit_batch = default_device_exit_batch,
6409 };
6410 
6411 /*
6412  *	Initialize the DEV module. At boot time this walks the device list and
6413  *	unhooks any devices that fail to initialise (normally hardware not
6414  *	present) and leaves us with a valid list of present and active devices.
6415  *
6416  */
6417 
6418 /*
6419  *       This is called single threaded during boot, so no need
6420  *       to take the rtnl semaphore.
6421  */
6422 static int __init net_dev_init(void)
6423 {
6424 	int i, rc = -ENOMEM;
6425 
6426 	BUG_ON(!dev_boot_phase);
6427 
6428 	if (dev_proc_init())
6429 		goto out;
6430 
6431 	if (netdev_kobject_init())
6432 		goto out;
6433 
6434 	INIT_LIST_HEAD(&ptype_all);
6435 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6436 		INIT_LIST_HEAD(&ptype_base[i]);
6437 
6438 	if (register_pernet_subsys(&netdev_net_ops))
6439 		goto out;
6440 
6441 	/*
6442 	 *	Initialise the packet receive queues.
6443 	 */
6444 
6445 	for_each_possible_cpu(i) {
6446 		struct softnet_data *sd = &per_cpu(softnet_data, i);
6447 
6448 		memset(sd, 0, sizeof(*sd));
6449 		skb_queue_head_init(&sd->input_pkt_queue);
6450 		skb_queue_head_init(&sd->process_queue);
6451 		sd->completion_queue = NULL;
6452 		INIT_LIST_HEAD(&sd->poll_list);
6453 		sd->output_queue = NULL;
6454 		sd->output_queue_tailp = &sd->output_queue;
6455 #ifdef CONFIG_RPS
6456 		sd->csd.func = rps_trigger_softirq;
6457 		sd->csd.info = sd;
6458 		sd->csd.flags = 0;
6459 		sd->cpu = i;
6460 #endif
6461 
6462 		sd->backlog.poll = process_backlog;
6463 		sd->backlog.weight = weight_p;
6464 		sd->backlog.gro_list = NULL;
6465 		sd->backlog.gro_count = 0;
6466 	}
6467 
6468 	dev_boot_phase = 0;
6469 
6470 	/* The loopback device is special: if any other network device
6471 	 * is present in a network namespace, the loopback device must
6472 	 * be present as well. Since we now dynamically allocate and
6473 	 * free the loopback device, ensure this invariant is maintained
6474 	 * by keeping the loopback device as the first device on the
6475 	 * list of network devices.  This ensures the loopback device
6476 	 * is the first device that appears and the last network device
6477 	 * that disappears.
6478 	 */
6479 	if (register_pernet_device(&loopback_net_ops))
6480 		goto out;
6481 
6482 	if (register_pernet_device(&default_device_ops))
6483 		goto out;
6484 
6485 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6486 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6487 
6488 	hotcpu_notifier(dev_cpu_callback, 0);
6489 	dst_init();
6490 	dev_mcast_init();
6491 	rc = 0;
6492 out:
6493 	return rc;
6494 }
6495 
6496 subsys_initcall(net_dev_init);
6497 
6498 static int __init initialize_hashrnd(void)
6499 {
6500 	get_random_bytes(&hashrnd, sizeof(hashrnd));
6501 	return 0;
6502 }
6503 
6504 late_initcall_sync(initialize_hashrnd);
6505 
6506