xref: /openbmc/linux/net/core/dev.c (revision df2634f43f5106947f3735a0b61a6527a4b278cd)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/hash.h>
83 #include <linux/slab.h>
84 #include <linux/sched.h>
85 #include <linux/mutex.h>
86 #include <linux/string.h>
87 #include <linux/mm.h>
88 #include <linux/socket.h>
89 #include <linux/sockios.h>
90 #include <linux/errno.h>
91 #include <linux/interrupt.h>
92 #include <linux/if_ether.h>
93 #include <linux/netdevice.h>
94 #include <linux/etherdevice.h>
95 #include <linux/ethtool.h>
96 #include <linux/notifier.h>
97 #include <linux/skbuff.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/stat.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/kmod.h>
111 #include <linux/module.h>
112 #include <linux/netpoll.h>
113 #include <linux/rcupdate.h>
114 #include <linux/delay.h>
115 #include <net/wext.h>
116 #include <net/iw_handler.h>
117 #include <asm/current.h>
118 #include <linux/audit.h>
119 #include <linux/dmaengine.h>
120 #include <linux/err.h>
121 #include <linux/ctype.h>
122 #include <linux/if_arp.h>
123 #include <linux/if_vlan.h>
124 #include <linux/ip.h>
125 #include <net/ip.h>
126 #include <linux/ipv6.h>
127 #include <linux/in.h>
128 #include <linux/jhash.h>
129 #include <linux/random.h>
130 #include <trace/events/napi.h>
131 #include <trace/events/net.h>
132 #include <trace/events/skb.h>
133 #include <linux/pci.h>
134 #include <linux/inetdevice.h>
135 
136 #include "net-sysfs.h"
137 
138 /* Instead of increasing this, you should create a hash table. */
139 #define MAX_GRO_SKBS 8
140 
141 /* This should be increased if a protocol with a bigger head is added. */
142 #define GRO_MAX_HEAD (MAX_HEADER + 128)
143 
144 /*
145  *	The list of packet types we will receive (as opposed to discard)
146  *	and the routines to invoke.
147  *
148  *	Why 16? Because with 16 the only overlap we get on a hash of the
149  *	low nibble of the protocol value is RARP/SNAP/X.25.
150  *
151  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
152  *             sure which should go first, but I bet it won't make much
153  *             difference if we are running VLANs.  The good news is that
154  *             this protocol won't be in the list unless compiled in, so
155  *             the average user (w/out VLANs) will not be adversely affected.
156  *             --BLG
157  *
158  *		0800	IP
159  *		8100    802.1Q VLAN
160  *		0001	802.3
161  *		0002	AX.25
162  *		0004	802.2
163  *		8035	RARP
164  *		0005	SNAP
165  *		0805	X.25
166  *		0806	ARP
167  *		8137	IPX
168  *		0009	Localtalk
169  *		86DD	IPv6
170  */
171 
172 #define PTYPE_HASH_SIZE	(16)
173 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
174 
175 static DEFINE_SPINLOCK(ptype_lock);
176 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
177 static struct list_head ptype_all __read_mostly;	/* Taps */
178 
179 /*
180  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
181  * semaphore.
182  *
183  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
184  *
185  * Writers must hold the rtnl semaphore while they loop through the
186  * dev_base_head list, and hold dev_base_lock for writing when they do the
187  * actual updates.  This allows pure readers to access the list even
188  * while a writer is preparing to update it.
189  *
190  * To put it another way, dev_base_lock is held for writing only to
191  * protect against pure readers; the rtnl semaphore provides the
192  * protection against other writers.
193  *
194  * For example usages, see register_netdevice() and
195  * unregister_netdevice(), which must be called with the rtnl
196  * semaphore held.
197  */
198 DEFINE_RWLOCK(dev_base_lock);
199 EXPORT_SYMBOL(dev_base_lock);
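
/*
 * Illustrative sketch (not part of this file): the pure-reader pattern the
 * locking comment above describes.  A hypothetical helper that only walks
 * the device list can rely on RCU alone and never takes dev_base_lock:
 *
 *	static int count_devices(struct net *net)
 *	{
 *		struct net_device *dev;
 *		int count = 0;
 *
 *		rcu_read_lock();
 *		for_each_netdev_rcu(net, dev)
 *			count++;
 *		rcu_read_unlock();
 *		return count;
 *	}
 */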
200 
201 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
202 {
203 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
204 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
205 }
206 
207 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
208 {
209 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
210 }
211 
212 static inline void rps_lock(struct softnet_data *sd)
213 {
214 #ifdef CONFIG_RPS
215 	spin_lock(&sd->input_pkt_queue.lock);
216 #endif
217 }
218 
219 static inline void rps_unlock(struct softnet_data *sd)
220 {
221 #ifdef CONFIG_RPS
222 	spin_unlock(&sd->input_pkt_queue.lock);
223 #endif
224 }
225 
226 /* Device list insertion */
227 static int list_netdevice(struct net_device *dev)
228 {
229 	struct net *net = dev_net(dev);
230 
231 	ASSERT_RTNL();
232 
233 	write_lock_bh(&dev_base_lock);
234 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
235 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
236 	hlist_add_head_rcu(&dev->index_hlist,
237 			   dev_index_hash(net, dev->ifindex));
238 	write_unlock_bh(&dev_base_lock);
239 	return 0;
240 }
241 
242 /* Device list removal
243  * caller must respect a RCU grace period before freeing/reusing dev
244  */
245 static void unlist_netdevice(struct net_device *dev)
246 {
247 	ASSERT_RTNL();
248 
249 	/* Unlink dev from the device chain */
250 	write_lock_bh(&dev_base_lock);
251 	list_del_rcu(&dev->dev_list);
252 	hlist_del_rcu(&dev->name_hlist);
253 	hlist_del_rcu(&dev->index_hlist);
254 	write_unlock_bh(&dev_base_lock);
255 }
256 
257 /*
258  *	Our notifier list
259  */
260 
261 static RAW_NOTIFIER_HEAD(netdev_chain);
262 
263 /*
264  *	Device drivers call our routines to queue packets here. We empty the
265  *	queue in the local softnet handler.
266  */
267 
268 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
269 EXPORT_PER_CPU_SYMBOL(softnet_data);
270 
271 #ifdef CONFIG_LOCKDEP
272 /*
273  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
274  * according to dev->type
275  */
276 static const unsigned short netdev_lock_type[] =
277 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
278 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
279 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
280 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
281 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
282 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
283 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
284 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
285 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
286 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
287 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
288 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
289 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
290 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
291 	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
292 	 ARPHRD_VOID, ARPHRD_NONE};
293 
294 static const char *const netdev_lock_name[] =
295 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
296 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
297 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
298 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
299 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
300 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
301 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
302 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
303 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
304 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
305 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
306 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
307 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
308 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
309 	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
310 	 "_xmit_VOID", "_xmit_NONE"};
311 
312 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
313 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
314 
315 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
316 {
317 	int i;
318 
319 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
320 		if (netdev_lock_type[i] == dev_type)
321 			return i;
322 	/* the last key is used by default */
323 	return ARRAY_SIZE(netdev_lock_type) - 1;
324 }
325 
326 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
327 						 unsigned short dev_type)
328 {
329 	int i;
330 
331 	i = netdev_lock_pos(dev_type);
332 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
333 				   netdev_lock_name[i]);
334 }
335 
336 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
337 {
338 	int i;
339 
340 	i = netdev_lock_pos(dev->type);
341 	lockdep_set_class_and_name(&dev->addr_list_lock,
342 				   &netdev_addr_lock_key[i],
343 				   netdev_lock_name[i]);
344 }
345 #else
346 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
347 						 unsigned short dev_type)
348 {
349 }
350 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
351 {
352 }
353 #endif
354 
355 /*******************************************************************************
356 
357 		Protocol management and registration routines
358 
359 *******************************************************************************/
360 
361 /*
362  *	Add a protocol ID to the list. Now that the input handler is
363  *	smarter we can dispense with all the messy stuff that used to be
364  *	here.
365  *
366  *	BEWARE!!! Protocol handlers that mangle input packets
367  *	MUST BE last in the hash buckets, and checking protocol handlers
368  *	MUST start from the promiscuous ptype_all chain in net_bh.
369  *	This is true now; do not change it.
370  *	Explanation: if a protocol handler that mangles packets were
371  *	first on the list, it could not sense that the packet is cloned
372  *	and should be copied-on-write, so it would change it in place
373  *	and subsequent readers would get a broken packet.
374  *							--ANK (980803)
375  */
376 
377 static inline struct list_head *ptype_head(const struct packet_type *pt)
378 {
379 	if (pt->type == htons(ETH_P_ALL))
380 		return &ptype_all;
381 	else
382 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
383 }
384 
385 /**
386  *	dev_add_pack - add packet handler
387  *	@pt: packet type declaration
388  *
389  *	Add a protocol handler to the networking stack. The passed &packet_type
390  *	is linked into kernel lists and may not be freed until it has been
391  *	removed from the kernel lists.
392  *
393  *	This call does not sleep, therefore it cannot
394  *	guarantee that all CPUs in the middle of receiving packets
395  *	will see the new packet type (until the next received packet).
396  */
397 
398 void dev_add_pack(struct packet_type *pt)
399 {
400 	struct list_head *head = ptype_head(pt);
401 
402 	spin_lock(&ptype_lock);
403 	list_add_rcu(&pt->list, head);
404 	spin_unlock(&ptype_lock);
405 }
406 EXPORT_SYMBOL(dev_add_pack);
407 
408 /**
409  *	__dev_remove_pack	 - remove packet handler
410  *	@pt: packet type declaration
411  *
412  *	Remove a protocol handler that was previously added to the kernel
413  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
414  *	from the kernel lists and can be freed or reused once this function
415  *	returns.
416  *
417  *      The packet type might still be in use by receivers
418  *	and must not be freed until after all the CPUs have gone
419  *	through a quiescent state.
420  */
421 void __dev_remove_pack(struct packet_type *pt)
422 {
423 	struct list_head *head = ptype_head(pt);
424 	struct packet_type *pt1;
425 
426 	spin_lock(&ptype_lock);
427 
428 	list_for_each_entry(pt1, head, list) {
429 		if (pt == pt1) {
430 			list_del_rcu(&pt->list);
431 			goto out;
432 		}
433 	}
434 
435 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
436 out:
437 	spin_unlock(&ptype_lock);
438 }
439 EXPORT_SYMBOL(__dev_remove_pack);
440 
441 /**
442  *	dev_remove_pack	 - remove packet handler
443  *	@pt: packet type declaration
444  *
445  *	Remove a protocol handler that was previously added to the kernel
446  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
447  *	from the kernel lists and can be freed or reused once this function
448  *	returns.
449  *
450  *	This call sleeps to guarantee that no CPU is looking at the packet
451  *	type after return.
452  */
453 void dev_remove_pack(struct packet_type *pt)
454 {
455 	__dev_remove_pack(pt);
456 
457 	synchronize_net();
458 }
459 EXPORT_SYMBOL(dev_remove_pack);
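
/*
 * Illustrative sketch (not part of this file): a minimal packet handler
 * registration.  The names my_rcv and my_ptype are made up for the example.
 *
 *	static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *			  struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_ptype __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.func = my_rcv,
 *	};
 *
 *	dev_add_pack(&my_ptype);
 *	...
 *	dev_remove_pack(&my_ptype);	(sleeps, so my_ptype may be freed after)
 */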
460 
461 /******************************************************************************
462 
463 		      Device Boot-time Settings Routines
464 
465 *******************************************************************************/
466 
467 /* Boot time configuration table */
468 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
469 
470 /**
471  *	netdev_boot_setup_add	- add new setup entry
472  *	@name: name of the device
473  *	@map: configured settings for the device
474  *
475  *	Adds new setup entry to the dev_boot_setup list.  The function
476  *	returns 0 on error and 1 on success.  This is a generic routine for
477  *	all netdevices.
478  */
479 static int netdev_boot_setup_add(char *name, struct ifmap *map)
480 {
481 	struct netdev_boot_setup *s;
482 	int i;
483 
484 	s = dev_boot_setup;
485 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
486 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
487 			memset(s[i].name, 0, sizeof(s[i].name));
488 			strlcpy(s[i].name, name, IFNAMSIZ);
489 			memcpy(&s[i].map, map, sizeof(s[i].map));
490 			break;
491 		}
492 	}
493 
494 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
495 }
496 
497 /**
498  *	netdev_boot_setup_check	- check boot time settings
499  *	@dev: the netdevice
500  *
501  * 	Check boot time settings for the device.
502  *	Any settings found are applied to the device so they can be used
503  *	later during device probing.
504  *	Returns 0 if no settings are found, 1 if they are.
505  */
506 int netdev_boot_setup_check(struct net_device *dev)
507 {
508 	struct netdev_boot_setup *s = dev_boot_setup;
509 	int i;
510 
511 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
512 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
513 		    !strcmp(dev->name, s[i].name)) {
514 			dev->irq 	= s[i].map.irq;
515 			dev->base_addr 	= s[i].map.base_addr;
516 			dev->mem_start 	= s[i].map.mem_start;
517 			dev->mem_end 	= s[i].map.mem_end;
518 			return 1;
519 		}
520 	}
521 	return 0;
522 }
523 EXPORT_SYMBOL(netdev_boot_setup_check);
524 
525 
526 /**
527  *	netdev_boot_base	- get address from boot time settings
528  *	@prefix: prefix for network device
529  *	@unit: id for network device
530  *
531  * 	Check the boot time settings for the base address of the device.
532  *	The address found is returned so it can be used later during
533  *	device probing.
534  *	Returns 0 if no settings are found.
535  */
536 unsigned long netdev_boot_base(const char *prefix, int unit)
537 {
538 	const struct netdev_boot_setup *s = dev_boot_setup;
539 	char name[IFNAMSIZ];
540 	int i;
541 
542 	sprintf(name, "%s%d", prefix, unit);
543 
544 	/*
545 	 * If device already registered then return base of 1
546 	 * to indicate not to probe for this interface
547 	 */
548 	if (__dev_get_by_name(&init_net, name))
549 		return 1;
550 
551 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
552 		if (!strcmp(name, s[i].name))
553 			return s[i].map.base_addr;
554 	return 0;
555 }
556 
557 /*
558  * Saves the settings configured at boot time for any netdevice.
559  */
560 int __init netdev_boot_setup(char *str)
561 {
562 	int ints[5];
563 	struct ifmap map;
564 
565 	str = get_options(str, ARRAY_SIZE(ints), ints);
566 	if (!str || !*str)
567 		return 0;
568 
569 	/* Save settings */
570 	memset(&map, 0, sizeof(map));
571 	if (ints[0] > 0)
572 		map.irq = ints[1];
573 	if (ints[0] > 1)
574 		map.base_addr = ints[2];
575 	if (ints[0] > 2)
576 		map.mem_start = ints[3];
577 	if (ints[0] > 3)
578 		map.mem_end = ints[4];
579 
580 	/* Add new entry to the list */
581 	return netdev_boot_setup_add(str, &map);
582 }
583 
584 __setup("netdev=", netdev_boot_setup);
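
/*
 * Example (following the parsing above): a kernel command line entry of
 * the form
 *
 *	netdev=<irq>,<base_addr>,<mem_start>,<mem_end>,<name>
 *
 * e.g. "netdev=9,0x300,0,0,eth0", records IRQ 9 and I/O base 0x300 for eth0
 * so that netdev_boot_setup_check() can apply them when eth0 is probed.
 */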
585 
586 /*******************************************************************************
587 
588 			    Device Interface Subroutines
589 
590 *******************************************************************************/
591 
592 /**
593  *	__dev_get_by_name	- find a device by its name
594  *	@net: the applicable net namespace
595  *	@name: name to find
596  *
597  *	Find an interface by name. Must be called under RTNL semaphore
598  *	or @dev_base_lock. If the name is found a pointer to the device
599  *	is returned. If the name is not found then %NULL is returned. The
600  *	reference counters are not incremented so the caller must be
601  *	careful with locks.
602  */
603 
604 struct net_device *__dev_get_by_name(struct net *net, const char *name)
605 {
606 	struct hlist_node *p;
607 	struct net_device *dev;
608 	struct hlist_head *head = dev_name_hash(net, name);
609 
610 	hlist_for_each_entry(dev, p, head, name_hlist)
611 		if (!strncmp(dev->name, name, IFNAMSIZ))
612 			return dev;
613 
614 	return NULL;
615 }
616 EXPORT_SYMBOL(__dev_get_by_name);
617 
618 /**
619  *	dev_get_by_name_rcu	- find a device by its name
620  *	@net: the applicable net namespace
621  *	@name: name to find
622  *
623  *	Find an interface by name.
624  *	If the name is found a pointer to the device is returned.
625  * 	If the name is not found then %NULL is returned.
626  *	The reference counters are not incremented so the caller must be
627  *	careful with locks. The caller must hold RCU lock.
628  */
629 
630 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
631 {
632 	struct hlist_node *p;
633 	struct net_device *dev;
634 	struct hlist_head *head = dev_name_hash(net, name);
635 
636 	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
637 		if (!strncmp(dev->name, name, IFNAMSIZ))
638 			return dev;
639 
640 	return NULL;
641 }
642 EXPORT_SYMBOL(dev_get_by_name_rcu);
643 
644 /**
645  *	dev_get_by_name		- find a device by its name
646  *	@net: the applicable net namespace
647  *	@name: name to find
648  *
649  *	Find an interface by name. This can be called from any
650  *	context and does its own locking. The returned handle has
651  *	the usage count incremented and the caller must use dev_put() to
652  *	release it when it is no longer needed. %NULL is returned if no
653  *	matching device is found.
654  */
655 
656 struct net_device *dev_get_by_name(struct net *net, const char *name)
657 {
658 	struct net_device *dev;
659 
660 	rcu_read_lock();
661 	dev = dev_get_by_name_rcu(net, name);
662 	if (dev)
663 		dev_hold(dev);
664 	rcu_read_unlock();
665 	return dev;
666 }
667 EXPORT_SYMBOL(dev_get_by_name);
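
/*
 * Illustrative sketch (not part of this file): a caller that needs the
 * device beyond a single RCU section uses this variant and drops the
 * reference with dev_put() when finished.
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		...
 *		dev_put(dev);
 *	}
 */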
668 
669 /**
670  *	__dev_get_by_index - find a device by its ifindex
671  *	@net: the applicable net namespace
672  *	@ifindex: index of device
673  *
674  *	Search for an interface by index. Returns %NULL if the device
675  *	is not found or a pointer to the device. The device has not
676  *	had its reference counter increased so the caller must be careful
677  *	about locking. The caller must hold either the RTNL semaphore
678  *	or @dev_base_lock.
679  */
680 
681 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
682 {
683 	struct hlist_node *p;
684 	struct net_device *dev;
685 	struct hlist_head *head = dev_index_hash(net, ifindex);
686 
687 	hlist_for_each_entry(dev, p, head, index_hlist)
688 		if (dev->ifindex == ifindex)
689 			return dev;
690 
691 	return NULL;
692 }
693 EXPORT_SYMBOL(__dev_get_by_index);
694 
695 /**
696  *	dev_get_by_index_rcu - find a device by its ifindex
697  *	@net: the applicable net namespace
698  *	@ifindex: index of device
699  *
700  *	Search for an interface by index. Returns %NULL if the device
701  *	is not found or a pointer to the device. The device has not
702  *	had its reference counter increased so the caller must be careful
703  *	about locking. The caller must hold RCU lock.
704  */
705 
706 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
707 {
708 	struct hlist_node *p;
709 	struct net_device *dev;
710 	struct hlist_head *head = dev_index_hash(net, ifindex);
711 
712 	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
713 		if (dev->ifindex == ifindex)
714 			return dev;
715 
716 	return NULL;
717 }
718 EXPORT_SYMBOL(dev_get_by_index_rcu);
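
/*
 * Illustrative sketch (not part of this file): the _rcu lookups are meant
 * for short sections done entirely under rcu_read_lock(), with no reference
 * taken on the result.
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_index_rcu(net, ifindex);
 *	if (dev)
 *		... use dev, without sleeping or keeping it past the unlock ...
 *	rcu_read_unlock();
 */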
719 
720 
721 /**
722  *	dev_get_by_index - find a device by its ifindex
723  *	@net: the applicable net namespace
724  *	@ifindex: index of device
725  *
726  *	Search for an interface by index. Returns NULL if the device
727  *	is not found or a pointer to the device. The device returned has
728  *	had a reference added and the pointer is safe until the user calls
729  *	dev_put to indicate they have finished with it.
730  */
731 
732 struct net_device *dev_get_by_index(struct net *net, int ifindex)
733 {
734 	struct net_device *dev;
735 
736 	rcu_read_lock();
737 	dev = dev_get_by_index_rcu(net, ifindex);
738 	if (dev)
739 		dev_hold(dev);
740 	rcu_read_unlock();
741 	return dev;
742 }
743 EXPORT_SYMBOL(dev_get_by_index);
744 
745 /**
746  *	dev_getbyhwaddr_rcu - find a device by its hardware address
747  *	@net: the applicable net namespace
748  *	@type: media type of device
749  *	@ha: hardware address
750  *
751  *	Search for an interface by MAC address. Returns NULL if the device
752  *	is not found or a pointer to the device.
753  *	The caller must hold RCU or RTNL.
754  *	The returned device has not had its ref count increased
755  *	and the caller must therefore be careful about locking
756  *
757  */
758 
759 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
760 				       const char *ha)
761 {
762 	struct net_device *dev;
763 
764 	for_each_netdev_rcu(net, dev)
765 		if (dev->type == type &&
766 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
767 			return dev;
768 
769 	return NULL;
770 }
771 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
772 
773 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
774 {
775 	struct net_device *dev;
776 
777 	ASSERT_RTNL();
778 	for_each_netdev(net, dev)
779 		if (dev->type == type)
780 			return dev;
781 
782 	return NULL;
783 }
784 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
785 
786 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
787 {
788 	struct net_device *dev, *ret = NULL;
789 
790 	rcu_read_lock();
791 	for_each_netdev_rcu(net, dev)
792 		if (dev->type == type) {
793 			dev_hold(dev);
794 			ret = dev;
795 			break;
796 		}
797 	rcu_read_unlock();
798 	return ret;
799 }
800 EXPORT_SYMBOL(dev_getfirstbyhwtype);
801 
802 /**
803  *	dev_get_by_flags_rcu - find any device with given flags
804  *	@net: the applicable net namespace
805  *	@if_flags: IFF_* values
806  *	@mask: bitmask of bits in if_flags to check
807  *
808  *	Search for any interface with the given flags. Returns NULL if a device
809  *	is not found or a pointer to the device. Must be called inside
810  *	rcu_read_lock(), and result refcount is unchanged.
811  */
812 
813 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
814 				    unsigned short mask)
815 {
816 	struct net_device *dev, *ret;
817 
818 	ret = NULL;
819 	for_each_netdev_rcu(net, dev) {
820 		if (((dev->flags ^ if_flags) & mask) == 0) {
821 			ret = dev;
822 			break;
823 		}
824 	}
825 	return ret;
826 }
827 EXPORT_SYMBOL(dev_get_by_flags_rcu);
828 
829 /**
830  *	dev_valid_name - check if name is okay for network device
831  *	@name: name string
832  *
833  *	Network device names need to be valid file names
834  *	to allow sysfs to work.  We also disallow any kind of
835  *	whitespace.
836  */
837 int dev_valid_name(const char *name)
838 {
839 	if (*name == '\0')
840 		return 0;
841 	if (strlen(name) >= IFNAMSIZ)
842 		return 0;
843 	if (!strcmp(name, ".") || !strcmp(name, ".."))
844 		return 0;
845 
846 	while (*name) {
847 		if (*name == '/' || isspace(*name))
848 			return 0;
849 		name++;
850 	}
851 	return 1;
852 }
853 EXPORT_SYMBOL(dev_valid_name);
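
/*
 * Examples: "eth0", "br-lan" and "wlan%d" pass the checks above, while "",
 * ".", "..", "a/b", "my dev" and any name of IFNAMSIZ (16) characters or
 * more are rejected.
 */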
854 
855 /**
856  *	__dev_alloc_name - allocate a name for a device
857  *	@net: network namespace to allocate the device name in
858  *	@name: name format string
859  *	@buf:  scratch buffer and result name string
860  *
861  *	Passed a format string - e.g. "lt%d" - it will try to find a suitable
862  *	id. It scans the list of devices to build up a free map, then chooses
863  *	the first empty slot. The caller must hold the dev_base or rtnl lock
864  *	while allocating the name and adding the device in order to avoid
865  *	duplicates.
866  *	Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
867  *	Returns the number of the unit assigned or a negative errno code.
868  */
869 
870 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
871 {
872 	int i = 0;
873 	const char *p;
874 	const int max_netdevices = 8*PAGE_SIZE;
875 	unsigned long *inuse;
876 	struct net_device *d;
877 
878 	p = strnchr(name, IFNAMSIZ-1, '%');
879 	if (p) {
880 		/*
881 		 * Verify the string as this thing may have come from
882 		 * the user.  There must be either one "%d" and no other "%"
883 		 * characters.
884 		 */
885 		if (p[1] != 'd' || strchr(p + 2, '%'))
886 			return -EINVAL;
887 
888 		/* Use one page as a bit array of possible slots */
889 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
890 		if (!inuse)
891 			return -ENOMEM;
892 
893 		for_each_netdev(net, d) {
894 			if (!sscanf(d->name, name, &i))
895 				continue;
896 			if (i < 0 || i >= max_netdevices)
897 				continue;
898 
899 			/*  avoid cases where sscanf is not exact inverse of printf */
900 			snprintf(buf, IFNAMSIZ, name, i);
901 			if (!strncmp(buf, d->name, IFNAMSIZ))
902 				set_bit(i, inuse);
903 		}
904 
905 		i = find_first_zero_bit(inuse, max_netdevices);
906 		free_page((unsigned long) inuse);
907 	}
908 
909 	if (buf != name)
910 		snprintf(buf, IFNAMSIZ, name, i);
911 	if (!__dev_get_by_name(net, buf))
912 		return i;
913 
914 	/* It is possible to run out of possible slots
915 	 * when the name is long and there isn't enough space left
916 	 * for the digits, or if all bits are used.
917 	 */
918 	return -ENFILE;
919 }
920 
921 /**
922  *	dev_alloc_name - allocate a name for a device
923  *	@dev: device
924  *	@name: name format string
925  *
926  *	Passed a format string - e.g. "lt%d" - it will try to find a suitable
927  *	id. It scans the list of devices to build up a free map, then chooses
928  *	the first empty slot. The caller must hold the dev_base or rtnl lock
929  *	while allocating the name and adding the device in order to avoid
930  *	duplicates.
931  *	Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
932  *	Returns the number of the unit assigned or a negative errno code.
933  */
934 
935 int dev_alloc_name(struct net_device *dev, const char *name)
936 {
937 	char buf[IFNAMSIZ];
938 	struct net *net;
939 	int ret;
940 
941 	BUG_ON(!dev_net(dev));
942 	net = dev_net(dev);
943 	ret = __dev_alloc_name(net, name, buf);
944 	if (ret >= 0)
945 		strlcpy(dev->name, buf, IFNAMSIZ);
946 	return ret;
947 }
948 EXPORT_SYMBOL(dev_alloc_name);
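
/*
 * Illustrative example: with eth0 and eth2 already registered in the
 * namespace, dev_alloc_name(dev, "eth%d") stores "eth1" in dev->name and
 * returns 1.  Passing a literal name with no "%d" degenerates into a plain
 * uniqueness check (0 on success, -ENFILE if the name is already taken).
 */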
949 
950 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
951 {
952 	struct net *net;
953 
954 	BUG_ON(!dev_net(dev));
955 	net = dev_net(dev);
956 
957 	if (!dev_valid_name(name))
958 		return -EINVAL;
959 
960 	if (fmt && strchr(name, '%'))
961 		return dev_alloc_name(dev, name);
962 	else if (__dev_get_by_name(net, name))
963 		return -EEXIST;
964 	else if (dev->name != name)
965 		strlcpy(dev->name, name, IFNAMSIZ);
966 
967 	return 0;
968 }
969 
970 /**
971  *	dev_change_name - change name of a device
972  *	@dev: device
973  *	@newname: name (or format string) must be at least IFNAMSIZ
974  *
975  *	Change the name of a device. A format string such as "eth%d"
976  *	can be passed for wildcarding.
977  */
978 int dev_change_name(struct net_device *dev, const char *newname)
979 {
980 	char oldname[IFNAMSIZ];
981 	int err = 0;
982 	int ret;
983 	struct net *net;
984 
985 	ASSERT_RTNL();
986 	BUG_ON(!dev_net(dev));
987 
988 	net = dev_net(dev);
989 	if (dev->flags & IFF_UP)
990 		return -EBUSY;
991 
992 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
993 		return 0;
994 
995 	memcpy(oldname, dev->name, IFNAMSIZ);
996 
997 	err = dev_get_valid_name(dev, newname, 1);
998 	if (err < 0)
999 		return err;
1000 
1001 rollback:
1002 	ret = device_rename(&dev->dev, dev->name);
1003 	if (ret) {
1004 		memcpy(dev->name, oldname, IFNAMSIZ);
1005 		return ret;
1006 	}
1007 
1008 	write_lock_bh(&dev_base_lock);
1009 	hlist_del(&dev->name_hlist);
1010 	write_unlock_bh(&dev_base_lock);
1011 
1012 	synchronize_rcu();
1013 
1014 	write_lock_bh(&dev_base_lock);
1015 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1016 	write_unlock_bh(&dev_base_lock);
1017 
1018 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1019 	ret = notifier_to_errno(ret);
1020 
1021 	if (ret) {
1022 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1023 		if (err >= 0) {
1024 			err = ret;
1025 			memcpy(dev->name, oldname, IFNAMSIZ);
1026 			goto rollback;
1027 		} else {
1028 			printk(KERN_ERR
1029 			       "%s: name change rollback failed: %d.\n",
1030 			       dev->name, ret);
1031 		}
1032 	}
1033 
1034 	return err;
1035 }
1036 
1037 /**
1038  *	dev_set_alias - change ifalias of a device
1039  *	@dev: device
1040  *	@alias: name up to IFALIASZ
1041  *	@len: limit of bytes to copy from info
1042  *
1043  *	Set the ifalias for a device.
1044  */
1045 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1046 {
1047 	ASSERT_RTNL();
1048 
1049 	if (len >= IFALIASZ)
1050 		return -EINVAL;
1051 
1052 	if (!len) {
1053 		if (dev->ifalias) {
1054 			kfree(dev->ifalias);
1055 			dev->ifalias = NULL;
1056 		}
1057 		return 0;
1058 	}
1059 
1060 	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1061 	if (!dev->ifalias)
1062 		return -ENOMEM;
1063 
1064 	strlcpy(dev->ifalias, alias, len+1);
1065 	return len;
1066 }
1067 
1068 
1069 /**
1070  *	netdev_features_change - device changes features
1071  *	@dev: device to cause notification
1072  *
1073  *	Called to indicate a device has changed features.
1074  */
1075 void netdev_features_change(struct net_device *dev)
1076 {
1077 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1078 }
1079 EXPORT_SYMBOL(netdev_features_change);
1080 
1081 /**
1082  *	netdev_state_change - device changes state
1083  *	@dev: device to cause notification
1084  *
1085  *	Called to indicate a device has changed state. This function calls
1086  *	the notifier chains for netdev_chain and sends a NEWLINK message
1087  *	to the routing socket.
1088  */
1089 void netdev_state_change(struct net_device *dev)
1090 {
1091 	if (dev->flags & IFF_UP) {
1092 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1093 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1094 	}
1095 }
1096 EXPORT_SYMBOL(netdev_state_change);
1097 
1098 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1099 {
1100 	return call_netdevice_notifiers(event, dev);
1101 }
1102 EXPORT_SYMBOL(netdev_bonding_change);
1103 
1104 /**
1105  *	dev_load 	- load a network module
1106  *	@net: the applicable net namespace
1107  *	@name: name of interface
1108  *
1109  *	If a network interface is not present and the process has suitable
1110  *	privileges, this function loads the module. If module loading is not
1111  *	available in this kernel then it becomes a nop.
1112  */
1113 
1114 void dev_load(struct net *net, const char *name)
1115 {
1116 	struct net_device *dev;
1117 
1118 	rcu_read_lock();
1119 	dev = dev_get_by_name_rcu(net, name);
1120 	rcu_read_unlock();
1121 
1122 	if (!dev && capable(CAP_NET_ADMIN))
1123 		request_module("%s", name);
1124 }
1125 EXPORT_SYMBOL(dev_load);
1126 
1127 static int __dev_open(struct net_device *dev)
1128 {
1129 	const struct net_device_ops *ops = dev->netdev_ops;
1130 	int ret;
1131 
1132 	ASSERT_RTNL();
1133 
1134 	/*
1135 	 *	Is it even present?
1136 	 */
1137 	if (!netif_device_present(dev))
1138 		return -ENODEV;
1139 
1140 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1141 	ret = notifier_to_errno(ret);
1142 	if (ret)
1143 		return ret;
1144 
1145 	/*
1146 	 *	Call device private open method
1147 	 */
1148 	set_bit(__LINK_STATE_START, &dev->state);
1149 
1150 	if (ops->ndo_validate_addr)
1151 		ret = ops->ndo_validate_addr(dev);
1152 
1153 	if (!ret && ops->ndo_open)
1154 		ret = ops->ndo_open(dev);
1155 
1156 	/*
1157 	 *	If it went open OK then:
1158 	 */
1159 
1160 	if (ret)
1161 		clear_bit(__LINK_STATE_START, &dev->state);
1162 	else {
1163 		/*
1164 		 *	Set the flags.
1165 		 */
1166 		dev->flags |= IFF_UP;
1167 
1168 		/*
1169 		 *	Enable NET_DMA
1170 		 */
1171 		net_dmaengine_get();
1172 
1173 		/*
1174 		 *	Initialize multicasting status
1175 		 */
1176 		dev_set_rx_mode(dev);
1177 
1178 		/*
1179 		 *	Wakeup transmit queue engine
1180 		 */
1181 		dev_activate(dev);
1182 	}
1183 
1184 	return ret;
1185 }
1186 
1187 /**
1188  *	dev_open	- prepare an interface for use.
1189  *	@dev:	device to open
1190  *
1191  *	Takes a device from down to up state. The device's private open
1192  *	function is invoked and then the multicast lists are loaded. Finally
1193  *	the device is moved into the up state and a %NETDEV_UP message is
1194  *	sent to the netdev notifier chain.
1195  *
1196  *	Calling this function on an active interface is a nop. On a failure
1197  *	a negative errno code is returned.
1198  */
1199 int dev_open(struct net_device *dev)
1200 {
1201 	int ret;
1202 
1203 	/*
1204 	 *	Is it already up?
1205 	 */
1206 	if (dev->flags & IFF_UP)
1207 		return 0;
1208 
1209 	/*
1210 	 *	Open device
1211 	 */
1212 	ret = __dev_open(dev);
1213 	if (ret < 0)
1214 		return ret;
1215 
1216 	/*
1217 	 *	... and announce new interface.
1218 	 */
1219 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1220 	call_netdevice_notifiers(NETDEV_UP, dev);
1221 
1222 	return ret;
1223 }
1224 EXPORT_SYMBOL(dev_open);
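
/*
 * Illustrative sketch (not part of this file): in-kernel callers bring an
 * interface up under the RTNL lock, for example:
 *
 *	rtnl_lock();
 *	dev = __dev_get_by_name(net, "eth0");
 *	if (dev)
 *		err = dev_open(dev);
 *	rtnl_unlock();
 */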
1225 
1226 static int __dev_close_many(struct list_head *head)
1227 {
1228 	struct net_device *dev;
1229 
1230 	ASSERT_RTNL();
1231 	might_sleep();
1232 
1233 	list_for_each_entry(dev, head, unreg_list) {
1234 		/*
1235 		 *	Tell people we are going down, so that they can
1236 		 *	prepare for it while the device is still operating.
1237 		 */
1238 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1239 
1240 		clear_bit(__LINK_STATE_START, &dev->state);
1241 
1242 		/* Synchronize to the scheduled poll. We cannot touch the poll list;
1243 		 * it may even be on a different CPU. So just clear netif_running().
1244 		 *
1245 		 * dev->stop() will invoke napi_disable() on all of its
1246 		 * napi_struct instances on this device.
1247 		 */
1248 		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1249 	}
1250 
1251 	dev_deactivate_many(head);
1252 
1253 	list_for_each_entry(dev, head, unreg_list) {
1254 		const struct net_device_ops *ops = dev->netdev_ops;
1255 
1256 		/*
1257 		 *	Call the device specific close. This cannot fail.
1258 		 *	It is only done if the device is UP.
1259 		 *
1260 		 *	We allow it to be called even after a DETACH hot-plug
1261 		 *	event.
1262 		 */
1263 		if (ops->ndo_stop)
1264 			ops->ndo_stop(dev);
1265 
1266 		/*
1267 		 *	Device is now down.
1268 		 */
1269 
1270 		dev->flags &= ~IFF_UP;
1271 
1272 		/*
1273 		 *	Shutdown NET_DMA
1274 		 */
1275 		net_dmaengine_put();
1276 	}
1277 
1278 	return 0;
1279 }
1280 
1281 static int __dev_close(struct net_device *dev)
1282 {
1283 	int retval;
1284 	LIST_HEAD(single);
1285 
1286 	list_add(&dev->unreg_list, &single);
1287 	retval = __dev_close_many(&single);
1288 	list_del(&single);
1289 	return retval;
1290 }
1291 
1292 int dev_close_many(struct list_head *head)
1293 {
1294 	struct net_device *dev, *tmp;
1295 	LIST_HEAD(tmp_list);
1296 
1297 	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1298 		if (!(dev->flags & IFF_UP))
1299 			list_move(&dev->unreg_list, &tmp_list);
1300 
1301 	__dev_close_many(head);
1302 
1303 	/*
1304 	 * Tell people we are down
1305 	 */
1306 	list_for_each_entry(dev, head, unreg_list) {
1307 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1308 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1309 	}
1310 
1311 	/* rollback_registered_many needs the complete original list */
1312 	list_splice(&tmp_list, head);
1313 	return 0;
1314 }
1315 
1316 /**
1317  *	dev_close - shutdown an interface.
1318  *	@dev: device to shutdown
1319  *
1320  *	This function moves an active device into down state. A
1321  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1322  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1323  *	chain.
1324  */
1325 int dev_close(struct net_device *dev)
1326 {
1327 	LIST_HEAD(single);
1328 
1329 	list_add(&dev->unreg_list, &single);
1330 	dev_close_many(&single);
1331 	list_del(&single);
1332 	return 0;
1333 }
1334 EXPORT_SYMBOL(dev_close);
1335 
1336 
1337 /**
1338  *	dev_disable_lro - disable Large Receive Offload on a device
1339  *	@dev: device
1340  *
1341  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1342  *	called under RTNL.  This is needed if received packets may be
1343  *	forwarded to another interface.
1344  */
1345 void dev_disable_lro(struct net_device *dev)
1346 {
1347 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1348 	    dev->ethtool_ops->set_flags) {
1349 		u32 flags = dev->ethtool_ops->get_flags(dev);
1350 		if (flags & ETH_FLAG_LRO) {
1351 			flags &= ~ETH_FLAG_LRO;
1352 			dev->ethtool_ops->set_flags(dev, flags);
1353 		}
1354 	}
1355 	WARN_ON(dev->features & NETIF_F_LRO);
1356 }
1357 EXPORT_SYMBOL(dev_disable_lro);
1358 
1359 
1360 static int dev_boot_phase = 1;
1361 
1362 /*
1363  *	Device change register/unregister. These are not inline or static
1364  *	as we export them to the world.
1365  */
1366 
1367 /**
1368  *	register_netdevice_notifier - register a network notifier block
1369  *	@nb: notifier
1370  *
1371  *	Register a notifier to be called when network device events occur.
1372  *	The notifier passed is linked into the kernel structures and must
1373  *	not be reused until it has been unregistered. A negative errno code
1374  *	is returned on a failure.
1375  *
1376  * 	When registered, all registration and up events are replayed
1377  *	to the new notifier to allow it to have a race-free
1378  *	view of the network device list.
1379  */
1380 
1381 int register_netdevice_notifier(struct notifier_block *nb)
1382 {
1383 	struct net_device *dev;
1384 	struct net_device *last;
1385 	struct net *net;
1386 	int err;
1387 
1388 	rtnl_lock();
1389 	err = raw_notifier_chain_register(&netdev_chain, nb);
1390 	if (err)
1391 		goto unlock;
1392 	if (dev_boot_phase)
1393 		goto unlock;
1394 	for_each_net(net) {
1395 		for_each_netdev(net, dev) {
1396 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1397 			err = notifier_to_errno(err);
1398 			if (err)
1399 				goto rollback;
1400 
1401 			if (!(dev->flags & IFF_UP))
1402 				continue;
1403 
1404 			nb->notifier_call(nb, NETDEV_UP, dev);
1405 		}
1406 	}
1407 
1408 unlock:
1409 	rtnl_unlock();
1410 	return err;
1411 
1412 rollback:
1413 	last = dev;
1414 	for_each_net(net) {
1415 		for_each_netdev(net, dev) {
1416 			if (dev == last)
1417 				break;
1418 
1419 			if (dev->flags & IFF_UP) {
1420 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1421 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1422 			}
1423 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1424 			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1425 		}
1426 	}
1427 
1428 	raw_notifier_chain_unregister(&netdev_chain, nb);
1429 	goto unlock;
1430 }
1431 EXPORT_SYMBOL(register_netdevice_notifier);
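
/*
 * Illustrative sketch (not part of this file): a typical notifier block.
 * The callback name my_netdev_event is made up; note that in this kernel
 * the notifier's data pointer is the struct net_device itself.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UP)
 *			printk(KERN_INFO "%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_nb);
 */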
1432 
1433 /**
1434  *	unregister_netdevice_notifier - unregister a network notifier block
1435  *	@nb: notifier
1436  *
1437  *	Unregister a notifier previously registered by
1438  *	register_netdevice_notifier(). The notifier is unlinked from the
1439  *	kernel structures and may then be reused. A negative errno code
1440  *	is returned on a failure.
1441  */
1442 
1443 int unregister_netdevice_notifier(struct notifier_block *nb)
1444 {
1445 	int err;
1446 
1447 	rtnl_lock();
1448 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1449 	rtnl_unlock();
1450 	return err;
1451 }
1452 EXPORT_SYMBOL(unregister_netdevice_notifier);
1453 
1454 /**
1455  *	call_netdevice_notifiers - call all network notifier blocks
1456  *      @val: value passed unmodified to notifier function
1457  *      @dev: net_device pointer passed unmodified to notifier function
1458  *
1459  *	Call all network notifier blocks.  Parameters and return value
1460  *	are as for raw_notifier_call_chain().
1461  */
1462 
1463 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1464 {
1465 	ASSERT_RTNL();
1466 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1467 }
1468 
1469 /* When > 0 there are consumers of rx skb time stamps */
1470 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1471 
1472 void net_enable_timestamp(void)
1473 {
1474 	atomic_inc(&netstamp_needed);
1475 }
1476 EXPORT_SYMBOL(net_enable_timestamp);
1477 
1478 void net_disable_timestamp(void)
1479 {
1480 	atomic_dec(&netstamp_needed);
1481 }
1482 EXPORT_SYMBOL(net_disable_timestamp);
1483 
1484 static inline void net_timestamp_set(struct sk_buff *skb)
1485 {
1486 	if (atomic_read(&netstamp_needed))
1487 		__net_timestamp(skb);
1488 	else
1489 		skb->tstamp.tv64 = 0;
1490 }
1491 
1492 static inline void net_timestamp_check(struct sk_buff *skb)
1493 {
1494 	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1495 		__net_timestamp(skb);
1496 }
1497 
1498 /**
1499  * dev_forward_skb - loopback an skb to another netif
1500  *
1501  * @dev: destination network device
1502  * @skb: buffer to forward
1503  *
1504  * return values:
1505  *	NET_RX_SUCCESS	(no congestion)
1506  *	NET_RX_DROP     (packet was dropped, but freed)
1507  *
1508  * dev_forward_skb can be used for injecting an skb from the
1509  * start_xmit function of one device into the receive queue
1510  * of another device.
1511  *
1512  * The receiving device may be in another namespace, so
1513  * we have to clear all information in the skb that could
1514  * impact namespace isolation.
1515  */
1516 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1517 {
1518 	skb_orphan(skb);
1519 	nf_reset(skb);
1520 
1521 	if (unlikely(!(dev->flags & IFF_UP) ||
1522 		     (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
1523 		atomic_long_inc(&dev->rx_dropped);
1524 		kfree_skb(skb);
1525 		return NET_RX_DROP;
1526 	}
1527 	skb_set_dev(skb, dev);
1528 	skb->tstamp.tv64 = 0;
1529 	skb->pkt_type = PACKET_HOST;
1530 	skb->protocol = eth_type_trans(skb, dev);
1531 	return netif_rx(skb);
1532 }
1533 EXPORT_SYMBOL_GPL(dev_forward_skb);
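
/*
 * Illustrative note: pair-device drivers such as veth use dev_forward_skb()
 * from their ndo_start_xmit to hand a frame straight to the peer device's
 * receive path, roughly (get_peer() is a hypothetical, driver-specific
 * lookup):
 *
 *	struct net_device *rcv = get_peer(dev);
 *
 *	dev_forward_skb(rcv, skb);
 *	return NETDEV_TX_OK;
 */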
1534 
1535 static inline int deliver_skb(struct sk_buff *skb,
1536 			      struct packet_type *pt_prev,
1537 			      struct net_device *orig_dev)
1538 {
1539 	atomic_inc(&skb->users);
1540 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1541 }
1542 
1543 /*
1544  *	Support routine. Sends outgoing frames to any network
1545  *	taps currently in use.
1546  */
1547 
1548 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1549 {
1550 	struct packet_type *ptype;
1551 	struct sk_buff *skb2 = NULL;
1552 	struct packet_type *pt_prev = NULL;
1553 
1554 	rcu_read_lock();
1555 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1556 		/* Never send packets back to the socket
1557 		 * they originated from - MvS (miquels@drinkel.ow.org)
1558 		 */
1559 		if ((ptype->dev == dev || !ptype->dev) &&
1560 		    (ptype->af_packet_priv == NULL ||
1561 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1562 			if (pt_prev) {
1563 				deliver_skb(skb2, pt_prev, skb->dev);
1564 				pt_prev = ptype;
1565 				continue;
1566 			}
1567 
1568 			skb2 = skb_clone(skb, GFP_ATOMIC);
1569 			if (!skb2)
1570 				break;
1571 
1572 			net_timestamp_set(skb2);
1573 
1574 			/* skb->nh should be correctly
1575 			   set by the sender, so the second statement is
1576 			   just protection against buggy protocols.
1577 			 */
1578 			skb_reset_mac_header(skb2);
1579 
1580 			if (skb_network_header(skb2) < skb2->data ||
1581 			    skb2->network_header > skb2->tail) {
1582 				if (net_ratelimit())
1583 					printk(KERN_CRIT "protocol %04x is "
1584 					       "buggy, dev %s\n",
1585 					       ntohs(skb2->protocol),
1586 					       dev->name);
1587 				skb_reset_network_header(skb2);
1588 			}
1589 
1590 			skb2->transport_header = skb2->network_header;
1591 			skb2->pkt_type = PACKET_OUTGOING;
1592 			pt_prev = ptype;
1593 		}
1594 	}
1595 	if (pt_prev)
1596 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1597 	rcu_read_unlock();
1598 }
1599 
1600 /*
1601  * Routine to help set real_num_tx_queues. To avoid skbs being mapped to queues
1602  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1603  */
1604 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1605 {
1606 	int rc;
1607 
1608 	if (txq < 1 || txq > dev->num_tx_queues)
1609 		return -EINVAL;
1610 
1611 	if (dev->reg_state == NETREG_REGISTERED) {
1612 		ASSERT_RTNL();
1613 
1614 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1615 						  txq);
1616 		if (rc)
1617 			return rc;
1618 
1619 		if (txq < dev->real_num_tx_queues)
1620 			qdisc_reset_all_tx_gt(dev, txq);
1621 	}
1622 
1623 	dev->real_num_tx_queues = txq;
1624 	return 0;
1625 }
1626 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1627 
1628 #ifdef CONFIG_RPS
1629 /**
1630  *	netif_set_real_num_rx_queues - set actual number of RX queues used
1631  *	@dev: Network device
1632  *	@rxq: Actual number of RX queues
1633  *
1634  *	This must be called either with the rtnl_lock held or before
1635  *	registration of the net device.  Returns 0 on success, or a
1636  *	negative error code.  If called before registration, it always
1637  *	succeeds.
1638  */
1639 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1640 {
1641 	int rc;
1642 
1643 	if (rxq < 1 || rxq > dev->num_rx_queues)
1644 		return -EINVAL;
1645 
1646 	if (dev->reg_state == NETREG_REGISTERED) {
1647 		ASSERT_RTNL();
1648 
1649 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1650 						  rxq);
1651 		if (rc)
1652 			return rc;
1653 	}
1654 
1655 	dev->real_num_rx_queues = rxq;
1656 	return 0;
1657 }
1658 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1659 #endif
1660 
1661 static inline void __netif_reschedule(struct Qdisc *q)
1662 {
1663 	struct softnet_data *sd;
1664 	unsigned long flags;
1665 
1666 	local_irq_save(flags);
1667 	sd = &__get_cpu_var(softnet_data);
1668 	q->next_sched = NULL;
1669 	*sd->output_queue_tailp = q;
1670 	sd->output_queue_tailp = &q->next_sched;
1671 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1672 	local_irq_restore(flags);
1673 }
1674 
1675 void __netif_schedule(struct Qdisc *q)
1676 {
1677 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1678 		__netif_reschedule(q);
1679 }
1680 EXPORT_SYMBOL(__netif_schedule);
1681 
1682 void dev_kfree_skb_irq(struct sk_buff *skb)
1683 {
1684 	if (atomic_dec_and_test(&skb->users)) {
1685 		struct softnet_data *sd;
1686 		unsigned long flags;
1687 
1688 		local_irq_save(flags);
1689 		sd = &__get_cpu_var(softnet_data);
1690 		skb->next = sd->completion_queue;
1691 		sd->completion_queue = skb;
1692 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1693 		local_irq_restore(flags);
1694 	}
1695 }
1696 EXPORT_SYMBOL(dev_kfree_skb_irq);
1697 
1698 void dev_kfree_skb_any(struct sk_buff *skb)
1699 {
1700 	if (in_irq() || irqs_disabled())
1701 		dev_kfree_skb_irq(skb);
1702 	else
1703 		dev_kfree_skb(skb);
1704 }
1705 EXPORT_SYMBOL(dev_kfree_skb_any);
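
/*
 * Illustrative sketch (not part of this file): a driver's TX-completion
 * handler, which may run in hard interrupt context, frees transmitted skbs
 * with the _irq/_any variants rather than dev_kfree_skb():
 *
 *	while ((skb = fetch_completed_tx(priv)) != NULL)
 *		dev_kfree_skb_irq(skb);
 *
 * (fetch_completed_tx() and priv are hypothetical driver internals.)
 */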
1706 
1707 
1708 /**
1709  * netif_device_detach - mark device as removed
1710  * @dev: network device
1711  *
1712  * Mark device as removed from system and therefore no longer available.
1713  * Mark the device as removed from the system and therefore no longer available.
1714 void netif_device_detach(struct net_device *dev)
1715 {
1716 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1717 	    netif_running(dev)) {
1718 		netif_tx_stop_all_queues(dev);
1719 	}
1720 }
1721 EXPORT_SYMBOL(netif_device_detach);
1722 
1723 /**
1724  * netif_device_attach - mark device as attached
1725  * @dev: network device
1726  *
1727  * Mark the device as attached to the system and restart it if needed.
1728  */
1729 void netif_device_attach(struct net_device *dev)
1730 {
1731 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1732 	    netif_running(dev)) {
1733 		netif_tx_wake_all_queues(dev);
1734 		__netdev_watchdog_up(dev);
1735 	}
1736 }
1737 EXPORT_SYMBOL(netif_device_attach);
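
/*
 * Illustrative sketch (not part of this file): a typical PCI driver
 * suspend/resume pairing.  my_suspend/my_resume are made-up names.
 *
 *	static int my_suspend(struct pci_dev *pdev, pm_message_t state)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		netif_device_detach(dev);
 *		... quiesce the hardware ...
 *		return 0;
 *	}
 *
 *	static int my_resume(struct pci_dev *pdev)
 *	{
 *		struct net_device *dev = pci_get_drvdata(pdev);
 *
 *		... re-init the hardware ...
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 */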
1738 
1739 /**
1740  * skb_dev_set -- assign a new device to a buffer
1741  * @skb: buffer for the new device
1742  * @dev: network device
1743  *
1744  * If an skb is owned by a device already, we have to reset
1745  * all data private to the namespace a device belongs to
1746  * before assigning it a new device.
1747  */
1748 #ifdef CONFIG_NET_NS
1749 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1750 {
1751 	skb_dst_drop(skb);
1752 	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1753 		secpath_reset(skb);
1754 		nf_reset(skb);
1755 		skb_init_secmark(skb);
1756 		skb->mark = 0;
1757 		skb->priority = 0;
1758 		skb->nf_trace = 0;
1759 		skb->ipvs_property = 0;
1760 #ifdef CONFIG_NET_SCHED
1761 		skb->tc_index = 0;
1762 #endif
1763 	}
1764 	skb->dev = dev;
1765 }
1766 EXPORT_SYMBOL(skb_set_dev);
1767 #endif /* CONFIG_NET_NS */
1768 
1769 /*
1770  * Invalidate hardware checksum when packet is to be mangled, and
1771  * complete checksum manually on outgoing path.
1772  */
1773 int skb_checksum_help(struct sk_buff *skb)
1774 {
1775 	__wsum csum;
1776 	int ret = 0, offset;
1777 
1778 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1779 		goto out_set_summed;
1780 
1781 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1782 		/* Let GSO fix up the checksum. */
1783 		goto out_set_summed;
1784 	}
1785 
1786 	offset = skb_checksum_start_offset(skb);
1787 	BUG_ON(offset >= skb_headlen(skb));
1788 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1789 
1790 	offset += skb->csum_offset;
1791 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1792 
1793 	if (skb_cloned(skb) &&
1794 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1795 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1796 		if (ret)
1797 			goto out;
1798 	}
1799 
1800 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1801 out_set_summed:
1802 	skb->ip_summed = CHECKSUM_NONE;
1803 out:
1804 	return ret;
1805 }
1806 EXPORT_SYMBOL(skb_checksum_help);
1807 
1808 /**
1809  *	skb_gso_segment - Perform segmentation on skb.
1810  *	@skb: buffer to segment
1811  *	@features: features for the output path (see dev->features)
1812  *
1813  *	This function segments the given skb and returns a list of segments.
1814  *
1815  *	It may return NULL if the skb requires no segmentation.  This is
1816  *	only possible when GSO is used for verifying header integrity.
1817  */
1818 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1819 {
1820 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1821 	struct packet_type *ptype;
1822 	__be16 type = skb->protocol;
1823 	int vlan_depth = ETH_HLEN;
1824 	int err;
1825 
1826 	while (type == htons(ETH_P_8021Q)) {
1827 		struct vlan_hdr *vh;
1828 
1829 		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1830 			return ERR_PTR(-EINVAL);
1831 
1832 		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1833 		type = vh->h_vlan_encapsulated_proto;
1834 		vlan_depth += VLAN_HLEN;
1835 	}
1836 
1837 	skb_reset_mac_header(skb);
1838 	skb->mac_len = skb->network_header - skb->mac_header;
1839 	__skb_pull(skb, skb->mac_len);
1840 
1841 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1842 		struct net_device *dev = skb->dev;
1843 		struct ethtool_drvinfo info = {};
1844 
1845 		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1846 			dev->ethtool_ops->get_drvinfo(dev, &info);
1847 
1848 		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1849 		     info.driver, dev ? dev->features : 0L,
1850 		     skb->sk ? skb->sk->sk_route_caps : 0L,
1851 		     skb->len, skb->data_len, skb->ip_summed);
1852 
1853 		if (skb_header_cloned(skb) &&
1854 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1855 			return ERR_PTR(err);
1856 	}
1857 
1858 	rcu_read_lock();
1859 	list_for_each_entry_rcu(ptype,
1860 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1861 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1862 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1863 				err = ptype->gso_send_check(skb);
1864 				segs = ERR_PTR(err);
1865 				if (err || skb_gso_ok(skb, features))
1866 					break;
1867 				__skb_push(skb, (skb->data -
1868 						 skb_network_header(skb)));
1869 			}
1870 			segs = ptype->gso_segment(skb, features);
1871 			break;
1872 		}
1873 	}
1874 	rcu_read_unlock();
1875 
1876 	__skb_push(skb, skb->data - skb_mac_header(skb));
1877 
1878 	return segs;
1879 }
1880 EXPORT_SYMBOL(skb_gso_segment);
1881 
1882 /* Take action when hardware reception checksum errors are detected. */
1883 #ifdef CONFIG_BUG
1884 void netdev_rx_csum_fault(struct net_device *dev)
1885 {
1886 	if (net_ratelimit()) {
1887 		printk(KERN_ERR "%s: hw csum failure.\n",
1888 			dev ? dev->name : "<unknown>");
1889 		dump_stack();
1890 	}
1891 }
1892 EXPORT_SYMBOL(netdev_rx_csum_fault);
1893 #endif
1894 
1895 /* Actually, we should eliminate this check as soon as we know that:
1896  * 1. An IOMMU is present and is able to map all of memory.
1897  * 2. No high memory really exists on this machine.
1898  */
1899 
1900 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1901 {
1902 #ifdef CONFIG_HIGHMEM
1903 	int i;
1904 	if (!(dev->features & NETIF_F_HIGHDMA)) {
1905 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1906 			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1907 				return 1;
1908 	}
1909 
1910 	if (PCI_DMA_BUS_IS_PHYS) {
1911 		struct device *pdev = dev->dev.parent;
1912 
1913 		if (!pdev)
1914 			return 0;
1915 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1916 			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1917 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1918 				return 1;
1919 		}
1920 	}
1921 #endif
1922 	return 0;
1923 }
1924 
1925 struct dev_gso_cb {
1926 	void (*destructor)(struct sk_buff *skb);
1927 };
1928 
1929 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1930 
1931 static void dev_gso_skb_destructor(struct sk_buff *skb)
1932 {
1933 	struct dev_gso_cb *cb;
1934 
1935 	do {
1936 		struct sk_buff *nskb = skb->next;
1937 
1938 		skb->next = nskb->next;
1939 		nskb->next = NULL;
1940 		kfree_skb(nskb);
1941 	} while (skb->next);
1942 
1943 	cb = DEV_GSO_CB(skb);
1944 	if (cb->destructor)
1945 		cb->destructor(skb);
1946 }
1947 
1948 /**
1949  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1950  *	@skb: buffer to segment
1951  *	@features: device features as applicable to this skb
1952  *
1953  *	This function segments the given skb and stores the list of segments
1954  *	in skb->next.
1955  */
1956 static int dev_gso_segment(struct sk_buff *skb, int features)
1957 {
1958 	struct sk_buff *segs;
1959 
1960 	segs = skb_gso_segment(skb, features);
1961 
1962 	/* Verifying header integrity only. */
1963 	if (!segs)
1964 		return 0;
1965 
1966 	if (IS_ERR(segs))
1967 		return PTR_ERR(segs);
1968 
1969 	skb->next = segs;
1970 	DEV_GSO_CB(skb)->destructor = skb->destructor;
1971 	skb->destructor = dev_gso_skb_destructor;
1972 
1973 	return 0;
1974 }
1975 
1976 /*
1977  * Try to orphan skb early, right before transmission by the device.
1978  * We cannot orphan skb if tx timestamp is requested or the sk-reference
1979  * is needed at driver level for other reasons, e.g. see net/can/raw.c
1980  */
1981 static inline void skb_orphan_try(struct sk_buff *skb)
1982 {
1983 	struct sock *sk = skb->sk;
1984 
1985 	if (sk && !skb_shinfo(skb)->tx_flags) {
1986 		/* skb_tx_hash() won't be able to get sk.
1987 		 * We copy sk_hash into skb->rxhash.
1988 		 */
1989 		if (!skb->rxhash)
1990 			skb->rxhash = sk->sk_hash;
1991 		skb_orphan(skb);
1992 	}
1993 }
1994 
1995 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1996 {
1997 	return ((features & NETIF_F_GEN_CSUM) ||
1998 		((features & NETIF_F_V4_CSUM) &&
1999 		 protocol == htons(ETH_P_IP)) ||
2000 		((features & NETIF_F_V6_CSUM) &&
2001 		 protocol == htons(ETH_P_IPV6)) ||
2002 		((features & NETIF_F_FCOE_CRC) &&
2003 		 protocol == htons(ETH_P_FCOE)));
2004 }
2005 
2006 static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features)
2007 {
2008 	if (!can_checksum_protocol(features, protocol)) {
2009 		features &= ~NETIF_F_ALL_CSUM;
2010 		features &= ~NETIF_F_SG;
2011 	} else if (illegal_highdma(skb->dev, skb)) {
2012 		features &= ~NETIF_F_SG;
2013 	}
2014 
2015 	return features;
2016 }
2017 
2018 int netif_skb_features(struct sk_buff *skb)
2019 {
2020 	__be16 protocol = skb->protocol;
2021 	int features = skb->dev->features;
2022 
2023 	if (protocol == htons(ETH_P_8021Q)) {
2024 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2025 		protocol = veh->h_vlan_encapsulated_proto;
2026 	} else if (!vlan_tx_tag_present(skb)) {
2027 		return harmonize_features(skb, protocol, features);
2028 	}
2029 
2030 	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2031 
2032 	if (protocol != htons(ETH_P_8021Q)) {
2033 		return harmonize_features(skb, protocol, features);
2034 	} else {
2035 		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2036 				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2037 		return harmonize_features(skb, protocol, features);
2038 	}
2039 }
2040 EXPORT_SYMBOL(netif_skb_features);
2041 
2042 /*
2043  * Returns true if either:
2044  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2045  *	2. skb is fragmented and the device does not support SG, or if
2046  *	   at least one of the fragments is in highmem and the device
2047  *	   does not support DMA from it.
2048  */
2049 static inline int skb_needs_linearize(struct sk_buff *skb,
2050 				      int features)
2051 {
2052 	return skb_is_nonlinear(skb) &&
2053 			((skb_has_frag_list(skb) &&
2054 				!(features & NETIF_F_FRAGLIST)) ||
2055 			(skb_shinfo(skb)->nr_frags &&
2056 				!(features & NETIF_F_SG)));
2057 }
2058 
2059 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2060 			struct netdev_queue *txq)
2061 {
2062 	const struct net_device_ops *ops = dev->netdev_ops;
2063 	int rc = NETDEV_TX_OK;
2064 
2065 	if (likely(!skb->next)) {
2066 		int features;
2067 
2068 		/*
2069 		 * If the device doesn't need skb->dst, release it right now
2070 		 * while it's hot in this CPU's cache.
2071 		 */
2072 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2073 			skb_dst_drop(skb);
2074 
2075 		if (!list_empty(&ptype_all))
2076 			dev_queue_xmit_nit(skb, dev);
2077 
2078 		skb_orphan_try(skb);
2079 
2080 		features = netif_skb_features(skb);
2081 
2082 		if (vlan_tx_tag_present(skb) &&
2083 		    !(features & NETIF_F_HW_VLAN_TX)) {
2084 			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2085 			if (unlikely(!skb))
2086 				goto out;
2087 
2088 			skb->vlan_tci = 0;
2089 		}
2090 
2091 		if (netif_needs_gso(skb, features)) {
2092 			if (unlikely(dev_gso_segment(skb, features)))
2093 				goto out_kfree_skb;
2094 			if (skb->next)
2095 				goto gso;
2096 		} else {
2097 			if (skb_needs_linearize(skb, features) &&
2098 			    __skb_linearize(skb))
2099 				goto out_kfree_skb;
2100 
2101 			/* If packet is not checksummed and device does not
2102 			 * support checksumming for this protocol, complete
2103 			 * checksumming here.
2104 			 */
2105 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2106 				skb_set_transport_header(skb,
2107 					skb_checksum_start_offset(skb));
2108 				if (!(features & NETIF_F_ALL_CSUM) &&
2109 				     skb_checksum_help(skb))
2110 					goto out_kfree_skb;
2111 			}
2112 		}
2113 
2114 		rc = ops->ndo_start_xmit(skb, dev);
2115 		trace_net_dev_xmit(skb, rc);
2116 		if (rc == NETDEV_TX_OK)
2117 			txq_trans_update(txq);
2118 		return rc;
2119 	}
2120 
2121 gso:
2122 	do {
2123 		struct sk_buff *nskb = skb->next;
2124 
2125 		skb->next = nskb->next;
2126 		nskb->next = NULL;
2127 
2128 		/*
2129 		 * If the device doesn't need nskb->dst, release it right now
2130 		 * while it's hot in this CPU's cache.
2131 		 */
2132 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2133 			skb_dst_drop(nskb);
2134 
2135 		rc = ops->ndo_start_xmit(nskb, dev);
2136 		trace_net_dev_xmit(nskb, rc);
2137 		if (unlikely(rc != NETDEV_TX_OK)) {
2138 			if (rc & ~NETDEV_TX_MASK)
2139 				goto out_kfree_gso_skb;
2140 			nskb->next = skb->next;
2141 			skb->next = nskb;
2142 			return rc;
2143 		}
2144 		txq_trans_update(txq);
2145 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2146 			return NETDEV_TX_BUSY;
2147 	} while (skb->next);
2148 
2149 out_kfree_gso_skb:
2150 	if (likely(skb->next == NULL))
2151 		skb->destructor = DEV_GSO_CB(skb)->destructor;
2152 out_kfree_skb:
2153 	kfree_skb(skb);
2154 out:
2155 	return rc;
2156 }
2157 
2158 static u32 hashrnd __read_mostly;
2159 
2160 /*
2161  * Returns a Tx hash based on the given packet descriptor and the number of
2162  * Tx queues to be used as a distribution range.
2163  */
2164 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2165 		  unsigned int num_tx_queues)
2166 {
2167 	u32 hash;
2168 
2169 	if (skb_rx_queue_recorded(skb)) {
2170 		hash = skb_get_rx_queue(skb);
2171 		while (unlikely(hash >= num_tx_queues))
2172 			hash -= num_tx_queues;
2173 		return hash;
2174 	}
2175 
2176 	if (skb->sk && skb->sk->sk_hash)
2177 		hash = skb->sk->sk_hash;
2178 	else
2179 		hash = (__force u16) skb->protocol ^ skb->rxhash;
2180 	hash = jhash_1word(hash, hashrnd);
2181 
2182 	return (u16) (((u64) hash * num_tx_queues) >> 32);
2183 }
2184 EXPORT_SYMBOL(__skb_tx_hash);
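/*
 * Editor's note (worked example, not part of the original file): the final
 * multiply-and-shift maps the 32-bit hash onto [0, num_tx_queues) without
 * a modulo.  With num_tx_queues = 8:
 *
 *	hash = 0x80000000  ->  ((u64)0x80000000 * 8) >> 32 = 4
 *	hash = 0xffffffff  ->  ((u64)0xffffffff * 8) >> 32 = 7
 *
 * i.e. the 32-bit hash space is split into num_tx_queues equal-sized bands.
 */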
2185 
2186 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2187 {
2188 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2189 		if (net_ratelimit()) {
2190 			pr_warning("%s selects TX queue %d, but "
2191 				"real number of TX queues is %d\n",
2192 				dev->name, queue_index, dev->real_num_tx_queues);
2193 		}
2194 		return 0;
2195 	}
2196 	return queue_index;
2197 }
2198 
2199 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2200 {
2201 #ifdef CONFIG_XPS
2202 	struct xps_dev_maps *dev_maps;
2203 	struct xps_map *map;
2204 	int queue_index = -1;
2205 
2206 	rcu_read_lock();
2207 	dev_maps = rcu_dereference(dev->xps_maps);
2208 	if (dev_maps) {
2209 		map = rcu_dereference(
2210 		    dev_maps->cpu_map[raw_smp_processor_id()]);
2211 		if (map) {
2212 			if (map->len == 1)
2213 				queue_index = map->queues[0];
2214 			else {
2215 				u32 hash;
2216 				if (skb->sk && skb->sk->sk_hash)
2217 					hash = skb->sk->sk_hash;
2218 				else
2219 					hash = (__force u16) skb->protocol ^
2220 					    skb->rxhash;
2221 				hash = jhash_1word(hash, hashrnd);
2222 				queue_index = map->queues[
2223 				    ((u64)hash * map->len) >> 32];
2224 			}
2225 			if (unlikely(queue_index >= dev->real_num_tx_queues))
2226 				queue_index = -1;
2227 		}
2228 	}
2229 	rcu_read_unlock();
2230 
2231 	return queue_index;
2232 #else
2233 	return -1;
2234 #endif
2235 }
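/*
 * Editor's note (assumption about the admin interface, stated for context):
 * the per-queue XPS CPU map consulted above is normally configured from
 * user space by writing a CPU mask to
 * /sys/class/net/<dev>/queues/tx-<n>/xps_cpus.
 */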
2236 
2237 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2238 					struct sk_buff *skb)
2239 {
2240 	int queue_index;
2241 	const struct net_device_ops *ops = dev->netdev_ops;
2242 
2243 	if (dev->real_num_tx_queues == 1)
2244 		queue_index = 0;
2245 	else if (ops->ndo_select_queue) {
2246 		queue_index = ops->ndo_select_queue(dev, skb);
2247 		queue_index = dev_cap_txqueue(dev, queue_index);
2248 	} else {
2249 		struct sock *sk = skb->sk;
2250 		queue_index = sk_tx_queue_get(sk);
2251 
2252 		if (queue_index < 0 || skb->ooo_okay ||
2253 		    queue_index >= dev->real_num_tx_queues) {
2254 			int old_index = queue_index;
2255 
2256 			queue_index = get_xps_queue(dev, skb);
2257 			if (queue_index < 0)
2258 				queue_index = skb_tx_hash(dev, skb);
2259 
2260 			if (queue_index != old_index && sk) {
2261 				struct dst_entry *dst =
2262 				    rcu_dereference_check(sk->sk_dst_cache, 1);
2263 
2264 				if (dst && skb_dst(skb) == dst)
2265 					sk_tx_queue_set(sk, queue_index);
2266 			}
2267 		}
2268 	}
2269 
2270 	skb_set_queue_mapping(skb, queue_index);
2271 	return netdev_get_tx_queue(dev, queue_index);
2272 }
2273 
2274 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2275 				 struct net_device *dev,
2276 				 struct netdev_queue *txq)
2277 {
2278 	spinlock_t *root_lock = qdisc_lock(q);
2279 	bool contended = qdisc_is_running(q);
2280 	int rc;
2281 
2282 	/*
2283 	 * Heuristic to force contended enqueues to serialize on a
2284 	 * separate lock before trying to get the qdisc's main lock.
2285 	 * This permits the __QDISC_STATE_RUNNING owner to get the lock more
2286 	 * often and dequeue packets faster.
2287 	 */
2288 	if (unlikely(contended))
2289 		spin_lock(&q->busylock);
2290 
2291 	spin_lock(root_lock);
2292 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2293 		kfree_skb(skb);
2294 		rc = NET_XMIT_DROP;
2295 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2296 		   qdisc_run_begin(q)) {
2297 		/*
2298 		 * This is a work-conserving queue; there are no old skbs
2299 		 * waiting to be sent out; and the qdisc is not running -
2300 		 * xmit the skb directly.
2301 		 */
2302 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2303 			skb_dst_force(skb);
2304 
2305 		qdisc_skb_cb(skb)->pkt_len = skb->len;
2306 		qdisc_bstats_update(q, skb);
2307 
2308 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2309 			if (unlikely(contended)) {
2310 				spin_unlock(&q->busylock);
2311 				contended = false;
2312 			}
2313 			__qdisc_run(q);
2314 		} else
2315 			qdisc_run_end(q);
2316 
2317 		rc = NET_XMIT_SUCCESS;
2318 	} else {
2319 		skb_dst_force(skb);
2320 		rc = qdisc_enqueue_root(skb, q);
2321 		if (qdisc_run_begin(q)) {
2322 			if (unlikely(contended)) {
2323 				spin_unlock(&q->busylock);
2324 				contended = false;
2325 			}
2326 			__qdisc_run(q);
2327 		}
2328 	}
2329 	spin_unlock(root_lock);
2330 	if (unlikely(contended))
2331 		spin_unlock(&q->busylock);
2332 	return rc;
2333 }
2334 
2335 static DEFINE_PER_CPU(int, xmit_recursion);
2336 #define RECURSION_LIMIT 10
2337 
2338 /**
2339  *	dev_queue_xmit - transmit a buffer
2340  *	@skb: buffer to transmit
2341  *
2342  *	Queue a buffer for transmission to a network device. The caller must
2343  *	have set the device and priority and built the buffer before calling
2344  *	this function. The function can be called from an interrupt.
2345  *
2346  *	A negative errno code is returned on a failure. A success does not
2347  *	guarantee the frame will be transmitted as it may be dropped due
2348  *	to congestion or traffic shaping.
2349  *
2350  * -----------------------------------------------------------------------------------
2351  *      I notice this method can also return errors from the queue disciplines,
2352  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2353  *      be positive.
2354  *
2355  *      Regardless of the return value, the skb is consumed, so it is currently
2356  *      difficult to retry a send to this method.  (You can bump the ref count
2357  *      before sending to hold a reference for retry if you are careful.)
2358  *
2359  *      When calling this method, interrupts MUST be enabled.  This is because
2360  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2361  *          --BLG
2362  */
2363 int dev_queue_xmit(struct sk_buff *skb)
2364 {
2365 	struct net_device *dev = skb->dev;
2366 	struct netdev_queue *txq;
2367 	struct Qdisc *q;
2368 	int rc = -ENOMEM;
2369 
2370 	/* Disable soft irqs for various locks below. Also
2371 	 * stops preemption for RCU.
2372 	 */
2373 	rcu_read_lock_bh();
2374 
2375 	txq = dev_pick_tx(dev, skb);
2376 	q = rcu_dereference_bh(txq->qdisc);
2377 
2378 #ifdef CONFIG_NET_CLS_ACT
2379 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2380 #endif
2381 	trace_net_dev_queue(skb);
2382 	if (q->enqueue) {
2383 		rc = __dev_xmit_skb(skb, q, dev, txq);
2384 		goto out;
2385 	}
2386 
2387 	/* The device has no queue. Common case for software devices:
2388 	   loopback, all sorts of tunnels...
2389 
2390 	   Really, it is unlikely that netif_tx_lock protection is necessary
2391 	   here.  (e.g. loopback and IP tunnels are clean, ignoring statistics
2392 	   counters.)
2393 	   However, it is possible that they rely on the protection
2394 	   we provide here.
2395 
2396 	   Check this and take the lock. It is not prone to deadlocks.
2397 	   Or take the noqueue qdisc path instead, which is even simpler 8)
2398 	 */
2399 	if (dev->flags & IFF_UP) {
2400 		int cpu = smp_processor_id(); /* ok because BHs are off */
2401 
2402 		if (txq->xmit_lock_owner != cpu) {
2403 
2404 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2405 				goto recursion_alert;
2406 
2407 			HARD_TX_LOCK(dev, txq, cpu);
2408 
2409 			if (!netif_tx_queue_stopped(txq)) {
2410 				__this_cpu_inc(xmit_recursion);
2411 				rc = dev_hard_start_xmit(skb, dev, txq);
2412 				__this_cpu_dec(xmit_recursion);
2413 				if (dev_xmit_complete(rc)) {
2414 					HARD_TX_UNLOCK(dev, txq);
2415 					goto out;
2416 				}
2417 			}
2418 			HARD_TX_UNLOCK(dev, txq);
2419 			if (net_ratelimit())
2420 				printk(KERN_CRIT "Virtual device %s asks to "
2421 				       "queue packet!\n", dev->name);
2422 		} else {
2423 			/* Recursion is detected! It is possible,
2424 			 * unfortunately.
2425 			 */
2426 recursion_alert:
2427 			if (net_ratelimit())
2428 				printk(KERN_CRIT "Dead loop on virtual device "
2429 				       "%s, fix it urgently!\n", dev->name);
2430 		}
2431 	}
2432 
2433 	rc = -ENETDOWN;
2434 	rcu_read_unlock_bh();
2435 
2436 	kfree_skb(skb);
2437 	return rc;
2438 out:
2439 	rcu_read_unlock_bh();
2440 	return rc;
2441 }
2442 EXPORT_SYMBOL(dev_queue_xmit);
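/*
 * Editor's sketch (not part of the original file): a minimal in-kernel
 * sender following the rules in the comment above - build the frame, set
 * skb->dev and skb->priority, then hand the skb off.  build_my_frame() is
 * a hypothetical helper.
 *
 *	struct sk_buff *skb = build_my_frame(dev);
 *
 *	if (skb) {
 *		skb->dev = dev;
 *		skb->priority = TC_PRIO_CONTROL;
 *		dev_queue_xmit(skb);
 *	}
 *
 * Whatever the return value, the skb has been consumed, so a failed send
 * cannot be retried with the same pointer.
 */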
2443 
2444 
2445 /*=======================================================================
2446 			Receiver routines
2447   =======================================================================*/
2448 
2449 int netdev_max_backlog __read_mostly = 1000;
2450 int netdev_tstamp_prequeue __read_mostly = 1;
2451 int netdev_budget __read_mostly = 300;
2452 int weight_p __read_mostly = 64;            /* old backlog weight */
2453 
2454 /* Called with irq disabled */
2455 static inline void ____napi_schedule(struct softnet_data *sd,
2456 				     struct napi_struct *napi)
2457 {
2458 	list_add_tail(&napi->poll_list, &sd->poll_list);
2459 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2460 }
2461 
2462 /*
2463  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2464  * and src/dst port numbers. Returns a non-zero hash number on success
2465  * and 0 on failure.
2466  */
2467 __u32 __skb_get_rxhash(struct sk_buff *skb)
2468 {
2469 	int nhoff, hash = 0, poff;
2470 	struct ipv6hdr *ip6;
2471 	struct iphdr *ip;
2472 	u8 ip_proto;
2473 	u32 addr1, addr2, ihl;
2474 	union {
2475 		u32 v32;
2476 		u16 v16[2];
2477 	} ports;
2478 
2479 	nhoff = skb_network_offset(skb);
2480 
2481 	switch (skb->protocol) {
2482 	case __constant_htons(ETH_P_IP):
2483 		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2484 			goto done;
2485 
2486 		ip = (struct iphdr *) (skb->data + nhoff);
2487 		if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2488 			ip_proto = 0;
2489 		else
2490 			ip_proto = ip->protocol;
2491 		addr1 = (__force u32) ip->saddr;
2492 		addr2 = (__force u32) ip->daddr;
2493 		ihl = ip->ihl;
2494 		break;
2495 	case __constant_htons(ETH_P_IPV6):
2496 		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2497 			goto done;
2498 
2499 		ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2500 		ip_proto = ip6->nexthdr;
2501 		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2502 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2503 		ihl = (40 >> 2);
2504 		break;
2505 	default:
2506 		goto done;
2507 	}
2508 
2509 	ports.v32 = 0;
2510 	poff = proto_ports_offset(ip_proto);
2511 	if (poff >= 0) {
2512 		nhoff += ihl * 4 + poff;
2513 		if (pskb_may_pull(skb, nhoff + 4)) {
2514 			ports.v32 = * (__force u32 *) (skb->data + nhoff);
2515 			if (ports.v16[1] < ports.v16[0])
2516 				swap(ports.v16[0], ports.v16[1]);
2517 		}
2518 	}
2519 
2520 	/* get a consistent hash (same value on both flow directions) */
2521 	if (addr2 < addr1)
2522 		swap(addr1, addr2);
2523 
2524 	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2525 	if (!hash)
2526 		hash = 1;
2527 
2528 done:
2529 	return hash;
2530 }
2531 EXPORT_SYMBOL(__skb_get_rxhash);
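/*
 * Editor's note (illustration, not part of the original file): because the
 * address pair and the port pair are each sorted before hashing, both
 * directions of a flow feed identical inputs to jhash_3words():
 *
 *	A:p1 -> B:p2  and  B:p2 -> A:p1
 *
 * both hash (min(A,B), max(A,B), sorted port pair), which is what makes the
 * rxhash consistent across flow directions as the comment above states.
 */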
2532 
2533 #ifdef CONFIG_RPS
2534 
2535 /* One global table that all flow-based protocols share. */
2536 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2537 EXPORT_SYMBOL(rps_sock_flow_table);
2538 
2539 /*
2540  * get_rps_cpu is called from netif_receive_skb and returns the target
2541  * CPU from the RPS map of the receiving queue for a given skb.
2542  * rcu_read_lock must be held on entry.
2543  */
2544 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2545 		       struct rps_dev_flow **rflowp)
2546 {
2547 	struct netdev_rx_queue *rxqueue;
2548 	struct rps_map *map;
2549 	struct rps_dev_flow_table *flow_table;
2550 	struct rps_sock_flow_table *sock_flow_table;
2551 	int cpu = -1;
2552 	u16 tcpu;
2553 
2554 	if (skb_rx_queue_recorded(skb)) {
2555 		u16 index = skb_get_rx_queue(skb);
2556 		if (unlikely(index >= dev->real_num_rx_queues)) {
2557 			WARN_ONCE(dev->real_num_rx_queues > 1,
2558 				  "%s received packet on queue %u, but number "
2559 				  "of RX queues is %u\n",
2560 				  dev->name, index, dev->real_num_rx_queues);
2561 			goto done;
2562 		}
2563 		rxqueue = dev->_rx + index;
2564 	} else
2565 		rxqueue = dev->_rx;
2566 
2567 	map = rcu_dereference(rxqueue->rps_map);
2568 	if (map) {
2569 		if (map->len == 1 &&
2570 		    !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2571 			tcpu = map->cpus[0];
2572 			if (cpu_online(tcpu))
2573 				cpu = tcpu;
2574 			goto done;
2575 		}
2576 	} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2577 		goto done;
2578 	}
2579 
2580 	skb_reset_network_header(skb);
2581 	if (!skb_get_rxhash(skb))
2582 		goto done;
2583 
2584 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2585 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2586 	if (flow_table && sock_flow_table) {
2587 		u16 next_cpu;
2588 		struct rps_dev_flow *rflow;
2589 
2590 		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2591 		tcpu = rflow->cpu;
2592 
2593 		next_cpu = sock_flow_table->ents[skb->rxhash &
2594 		    sock_flow_table->mask];
2595 
2596 		/*
2597 		 * If the desired CPU (where last recvmsg was done) is
2598 		 * different from current CPU (one in the rx-queue flow
2599 		 * table entry), switch if one of the following holds:
2600 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2601 		 *   - Current CPU is offline.
2602 		 *   - The current CPU's queue tail has advanced beyond the
2603 		 *     last packet that was enqueued using this table entry.
2604 		 *     This guarantees that all previous packets for the flow
2605 		 *     have been dequeued, thus preserving in order delivery.
2606 		 */
2607 		if (unlikely(tcpu != next_cpu) &&
2608 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2609 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2610 		      rflow->last_qtail)) >= 0)) {
2611 			tcpu = rflow->cpu = next_cpu;
2612 			if (tcpu != RPS_NO_CPU)
2613 				rflow->last_qtail = per_cpu(softnet_data,
2614 				    tcpu).input_queue_head;
2615 		}
2616 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2617 			*rflowp = rflow;
2618 			cpu = tcpu;
2619 			goto done;
2620 		}
2621 	}
2622 
2623 	if (map) {
2624 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2625 
2626 		if (cpu_online(tcpu)) {
2627 			cpu = tcpu;
2628 			goto done;
2629 		}
2630 	}
2631 
2632 done:
2633 	return cpu;
2634 }
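/*
 * Editor's note (assumption about the admin interface, stated for context):
 * the rps_map consulted above is normally populated from user space by
 * writing a CPU mask to /sys/class/net/<dev>/queues/rx-<n>/rps_cpus, with
 * the per-queue flow table sized via rps_flow_cnt alongside it.
 */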
2635 
2636 /* Called from hardirq (IPI) context */
2637 static void rps_trigger_softirq(void *data)
2638 {
2639 	struct softnet_data *sd = data;
2640 
2641 	____napi_schedule(sd, &sd->backlog);
2642 	sd->received_rps++;
2643 }
2644 
2645 #endif /* CONFIG_RPS */
2646 
2647 /*
2648  * Check if this softnet_data structure belongs to another CPU.
2649  * If yes, queue it to our IPI list and return 1.
2650  * If no, return 0.
2651  */
2652 static int rps_ipi_queued(struct softnet_data *sd)
2653 {
2654 #ifdef CONFIG_RPS
2655 	struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2656 
2657 	if (sd != mysd) {
2658 		sd->rps_ipi_next = mysd->rps_ipi_list;
2659 		mysd->rps_ipi_list = sd;
2660 
2661 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2662 		return 1;
2663 	}
2664 #endif /* CONFIG_RPS */
2665 	return 0;
2666 }
2667 
2668 /*
2669  * enqueue_to_backlog is called to queue an skb on a per-CPU backlog
2670  * queue (may be a remote CPU queue).
2671  */
2672 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2673 			      unsigned int *qtail)
2674 {
2675 	struct softnet_data *sd;
2676 	unsigned long flags;
2677 
2678 	sd = &per_cpu(softnet_data, cpu);
2679 
2680 	local_irq_save(flags);
2681 
2682 	rps_lock(sd);
2683 	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2684 		if (skb_queue_len(&sd->input_pkt_queue)) {
2685 enqueue:
2686 			__skb_queue_tail(&sd->input_pkt_queue, skb);
2687 			input_queue_tail_incr_save(sd, qtail);
2688 			rps_unlock(sd);
2689 			local_irq_restore(flags);
2690 			return NET_RX_SUCCESS;
2691 		}
2692 
2693 		/* Schedule NAPI for the backlog device.
2694 		 * We can use a non-atomic operation since we own the queue lock.
2695 		 */
2696 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2697 			if (!rps_ipi_queued(sd))
2698 				____napi_schedule(sd, &sd->backlog);
2699 		}
2700 		goto enqueue;
2701 	}
2702 
2703 	sd->dropped++;
2704 	rps_unlock(sd);
2705 
2706 	local_irq_restore(flags);
2707 
2708 	atomic_long_inc(&skb->dev->rx_dropped);
2709 	kfree_skb(skb);
2710 	return NET_RX_DROP;
2711 }
2712 
2713 /**
2714  *	netif_rx	-	post buffer to the network code
2715  *	@skb: buffer to post
2716  *
2717  *	This function receives a packet from a device driver and queues it for
2718  *	the upper (protocol) levels to process.  It always succeeds. The buffer
2719  *	may be dropped during processing for congestion control or by the
2720  *	protocol layers.
2721  *
2722  *	return values:
2723  *	NET_RX_SUCCESS	(no congestion)
2724  *	NET_RX_DROP     (packet was dropped)
2725  *
2726  */
2727 
2728 int netif_rx(struct sk_buff *skb)
2729 {
2730 	int ret;
2731 
2732 	/* if netpoll wants it, pretend we never saw it */
2733 	if (netpoll_rx(skb))
2734 		return NET_RX_DROP;
2735 
2736 	if (netdev_tstamp_prequeue)
2737 		net_timestamp_check(skb);
2738 
2739 	trace_netif_rx(skb);
2740 #ifdef CONFIG_RPS
2741 	{
2742 		struct rps_dev_flow voidflow, *rflow = &voidflow;
2743 		int cpu;
2744 
2745 		preempt_disable();
2746 		rcu_read_lock();
2747 
2748 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2749 		if (cpu < 0)
2750 			cpu = smp_processor_id();
2751 
2752 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2753 
2754 		rcu_read_unlock();
2755 		preempt_enable();
2756 	}
2757 #else
2758 	{
2759 		unsigned int qtail;
2760 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2761 		put_cpu();
2762 	}
2763 #endif
2764 	return ret;
2765 }
2766 EXPORT_SYMBOL(netif_rx);
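/*
 * Editor's sketch (not part of the original file): the classic non-NAPI
 * receive path in a driver's interrupt handler - copy the frame out of the
 * hardware, set the protocol, and feed it to the backlog via netif_rx().
 * my_hw_copy_frame() is a hypothetical device-specific helper.
 *
 *	struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);
 *
 *	if (skb) {
 *		my_hw_copy_frame(priv, skb_put(skb, len), len);
 *		skb->protocol = eth_type_trans(skb, dev);
 *		netif_rx(skb);
 *	}
 */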
2767 
2768 int netif_rx_ni(struct sk_buff *skb)
2769 {
2770 	int err;
2771 
2772 	preempt_disable();
2773 	err = netif_rx(skb);
2774 	if (local_softirq_pending())
2775 		do_softirq();
2776 	preempt_enable();
2777 
2778 	return err;
2779 }
2780 EXPORT_SYMBOL(netif_rx_ni);
2781 
2782 static void net_tx_action(struct softirq_action *h)
2783 {
2784 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2785 
2786 	if (sd->completion_queue) {
2787 		struct sk_buff *clist;
2788 
2789 		local_irq_disable();
2790 		clist = sd->completion_queue;
2791 		sd->completion_queue = NULL;
2792 		local_irq_enable();
2793 
2794 		while (clist) {
2795 			struct sk_buff *skb = clist;
2796 			clist = clist->next;
2797 
2798 			WARN_ON(atomic_read(&skb->users));
2799 			trace_kfree_skb(skb, net_tx_action);
2800 			__kfree_skb(skb);
2801 		}
2802 	}
2803 
2804 	if (sd->output_queue) {
2805 		struct Qdisc *head;
2806 
2807 		local_irq_disable();
2808 		head = sd->output_queue;
2809 		sd->output_queue = NULL;
2810 		sd->output_queue_tailp = &sd->output_queue;
2811 		local_irq_enable();
2812 
2813 		while (head) {
2814 			struct Qdisc *q = head;
2815 			spinlock_t *root_lock;
2816 
2817 			head = head->next_sched;
2818 
2819 			root_lock = qdisc_lock(q);
2820 			if (spin_trylock(root_lock)) {
2821 				smp_mb__before_clear_bit();
2822 				clear_bit(__QDISC_STATE_SCHED,
2823 					  &q->state);
2824 				qdisc_run(q);
2825 				spin_unlock(root_lock);
2826 			} else {
2827 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2828 					      &q->state)) {
2829 					__netif_reschedule(q);
2830 				} else {
2831 					smp_mb__before_clear_bit();
2832 					clear_bit(__QDISC_STATE_SCHED,
2833 						  &q->state);
2834 				}
2835 			}
2836 		}
2837 	}
2838 }
2839 
2840 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2841     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2842 /* This hook is defined here for ATM LANE */
2843 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2844 			     unsigned char *addr) __read_mostly;
2845 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2846 #endif
2847 
2848 #ifdef CONFIG_NET_CLS_ACT
2849 /* TODO: Maybe we should just force sch_ingress to be compiled in
2850  * when CONFIG_NET_CLS_ACT is?  Otherwise we execute some useless
2851  * instructions (a compare and two extra stores) right now if we don't
2852  * have it on but do have CONFIG_NET_CLS_ACT.
2853  * NOTE: This doesn't stop any functionality; if you don't have
2854  * the ingress scheduler, you just can't add policies on ingress.
2855  *
2856  */
2857 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2858 {
2859 	struct net_device *dev = skb->dev;
2860 	u32 ttl = G_TC_RTTL(skb->tc_verd);
2861 	int result = TC_ACT_OK;
2862 	struct Qdisc *q;
2863 
2864 	if (unlikely(MAX_RED_LOOP < ttl++)) {
2865 		if (net_ratelimit())
2866 			pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
2867 			       skb->skb_iif, dev->ifindex);
2868 		return TC_ACT_SHOT;
2869 	}
2870 
2871 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2872 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2873 
2874 	q = rxq->qdisc;
2875 	if (q != &noop_qdisc) {
2876 		spin_lock(qdisc_lock(q));
2877 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2878 			result = qdisc_enqueue_root(skb, q);
2879 		spin_unlock(qdisc_lock(q));
2880 	}
2881 
2882 	return result;
2883 }
2884 
2885 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2886 					 struct packet_type **pt_prev,
2887 					 int *ret, struct net_device *orig_dev)
2888 {
2889 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
2890 
2891 	if (!rxq || rxq->qdisc == &noop_qdisc)
2892 		goto out;
2893 
2894 	if (*pt_prev) {
2895 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2896 		*pt_prev = NULL;
2897 	}
2898 
2899 	switch (ing_filter(skb, rxq)) {
2900 	case TC_ACT_SHOT:
2901 	case TC_ACT_STOLEN:
2902 		kfree_skb(skb);
2903 		return NULL;
2904 	}
2905 
2906 out:
2907 	skb->tc_verd = 0;
2908 	return skb;
2909 }
2910 #endif
2911 
2912 /**
2913  *	netdev_rx_handler_register - register receive handler
2914  *	@dev: device to register a handler for
2915  *	@rx_handler: receive handler to register
2916  *	@rx_handler_data: data pointer that is used by rx handler
2917  *
2918  *	Register a receive handler for a device. This handler will then be
2919  *	called from __netif_receive_skb. A negative errno code is returned
2920  *	on a failure.
2921  *
2922  *	The caller must hold the rtnl_mutex.
2923  */
2924 int netdev_rx_handler_register(struct net_device *dev,
2925 			       rx_handler_func_t *rx_handler,
2926 			       void *rx_handler_data)
2927 {
2928 	ASSERT_RTNL();
2929 
2930 	if (dev->rx_handler)
2931 		return -EBUSY;
2932 
2933 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
2934 	rcu_assign_pointer(dev->rx_handler, rx_handler);
2935 
2936 	return 0;
2937 }
2938 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
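/*
 * Editor's sketch (not part of the original file): how a bridge- or
 * macvlan-like user attaches its handler to a port device, holding the
 * rtnl_mutex as required above.  my_rx_handler() and port are
 * hypothetical.
 *
 *	rtnl_lock();
 *	err = netdev_rx_handler_register(port->dev, my_rx_handler, port);
 *	rtnl_unlock();
 *	if (err)
 *		return err;
 *
 * Teardown goes through netdev_rx_handler_unregister(), again under RTNL.
 */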
2939 
2940 /**
2941  *	netdev_rx_handler_unregister - unregister receive handler
2942  *	@dev: device to unregister a handler from
2943  *
2944  *	Unregister a receive handler from a device.
2945  *
2946  *	The caller must hold the rtnl_mutex.
2947  */
2948 void netdev_rx_handler_unregister(struct net_device *dev)
2949 {
2950 
2951 	ASSERT_RTNL();
2952 	rcu_assign_pointer(dev->rx_handler, NULL);
2953 	rcu_assign_pointer(dev->rx_handler_data, NULL);
2954 }
2955 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2956 
2957 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2958 					      struct net_device *master)
2959 {
2960 	if (skb->pkt_type == PACKET_HOST) {
2961 		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2962 
2963 		memcpy(dest, master->dev_addr, ETH_ALEN);
2964 	}
2965 }
2966 
2967 /* On bonding slaves other than the currently active slave, suppress
2968  * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2969  * ARP on active-backup slaves with arp_validate enabled.
2970  */
2971 int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2972 {
2973 	struct net_device *dev = skb->dev;
2974 
2975 	if (master->priv_flags & IFF_MASTER_ARPMON)
2976 		dev->last_rx = jiffies;
2977 
2978 	if ((master->priv_flags & IFF_MASTER_ALB) &&
2979 	    (master->priv_flags & IFF_BRIDGE_PORT)) {
2980 		/* Do address unmangle. The local destination address
2981 		 * will always be the one the master has. This provides the
2982 		 * right functionality in a bridge.
2983 		 */
2984 		skb_bond_set_mac_by_master(skb, master);
2985 	}
2986 
2987 	if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2988 		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2989 		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
2990 			return 0;
2991 
2992 		if (master->priv_flags & IFF_MASTER_ALB) {
2993 			if (skb->pkt_type != PACKET_BROADCAST &&
2994 			    skb->pkt_type != PACKET_MULTICAST)
2995 				return 0;
2996 		}
2997 		if (master->priv_flags & IFF_MASTER_8023AD &&
2998 		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2999 			return 0;
3000 
3001 		return 1;
3002 	}
3003 	return 0;
3004 }
3005 EXPORT_SYMBOL(__skb_bond_should_drop);
3006 
3007 static int __netif_receive_skb(struct sk_buff *skb)
3008 {
3009 	struct packet_type *ptype, *pt_prev;
3010 	rx_handler_func_t *rx_handler;
3011 	struct net_device *orig_dev;
3012 	struct net_device *master;
3013 	struct net_device *null_or_orig;
3014 	struct net_device *orig_or_bond;
3015 	int ret = NET_RX_DROP;
3016 	__be16 type;
3017 
3018 	if (!netdev_tstamp_prequeue)
3019 		net_timestamp_check(skb);
3020 
3021 	trace_netif_receive_skb(skb);
3022 
3023 	/* if we've gotten here through NAPI, check netpoll */
3024 	if (netpoll_receive_skb(skb))
3025 		return NET_RX_DROP;
3026 
3027 	if (!skb->skb_iif)
3028 		skb->skb_iif = skb->dev->ifindex;
3029 
3030 	/*
3031 	 * bonding note: skbs received on inactive slaves should only
3032 	 * be delivered to packet handlers that are exact matches.  Also
3033 	 * the deliver_no_wcard flag will be set.  If packet handlers
3034 	 * are sensitive to duplicate packets, these skbs will need to
3035 	 * be dropped at the handler.
3036 	 */
3037 	null_or_orig = NULL;
3038 	orig_dev = skb->dev;
3039 	master = ACCESS_ONCE(orig_dev->master);
3040 	if (skb->deliver_no_wcard)
3041 		null_or_orig = orig_dev;
3042 	else if (master) {
3043 		if (skb_bond_should_drop(skb, master)) {
3044 			skb->deliver_no_wcard = 1;
3045 			null_or_orig = orig_dev; /* deliver only exact match */
3046 		} else
3047 			skb->dev = master;
3048 	}
3049 
3050 	__this_cpu_inc(softnet_data.processed);
3051 	skb_reset_network_header(skb);
3052 	skb_reset_transport_header(skb);
3053 	skb->mac_len = skb->network_header - skb->mac_header;
3054 
3055 	pt_prev = NULL;
3056 
3057 	rcu_read_lock();
3058 
3059 #ifdef CONFIG_NET_CLS_ACT
3060 	if (skb->tc_verd & TC_NCLS) {
3061 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3062 		goto ncls;
3063 	}
3064 #endif
3065 
3066 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3067 		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
3068 		    ptype->dev == orig_dev) {
3069 			if (pt_prev)
3070 				ret = deliver_skb(skb, pt_prev, orig_dev);
3071 			pt_prev = ptype;
3072 		}
3073 	}
3074 
3075 #ifdef CONFIG_NET_CLS_ACT
3076 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3077 	if (!skb)
3078 		goto out;
3079 ncls:
3080 #endif
3081 
3082 	/* Handle special case of bridge or macvlan */
3083 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3084 	if (rx_handler) {
3085 		if (pt_prev) {
3086 			ret = deliver_skb(skb, pt_prev, orig_dev);
3087 			pt_prev = NULL;
3088 		}
3089 		skb = rx_handler(skb);
3090 		if (!skb)
3091 			goto out;
3092 	}
3093 
3094 	if (vlan_tx_tag_present(skb)) {
3095 		if (pt_prev) {
3096 			ret = deliver_skb(skb, pt_prev, orig_dev);
3097 			pt_prev = NULL;
3098 		}
3099 		if (vlan_hwaccel_do_receive(&skb)) {
3100 			ret = __netif_receive_skb(skb);
3101 			goto out;
3102 		} else if (unlikely(!skb))
3103 			goto out;
3104 	}
3105 
3106 	/*
3107 	 * Make sure frames received on VLAN interfaces stacked on
3108 	 * bonding interfaces still make their way to any base bonding
3109 	 * device that may have registered for a specific ptype.  The
3110 	 * handler may have to adjust skb->dev and orig_dev.
3111 	 */
3112 	orig_or_bond = orig_dev;
3113 	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
3114 	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
3115 		orig_or_bond = vlan_dev_real_dev(skb->dev);
3116 	}
3117 
3118 	type = skb->protocol;
3119 	list_for_each_entry_rcu(ptype,
3120 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3121 		if (ptype->type == type && (ptype->dev == null_or_orig ||
3122 		     ptype->dev == skb->dev || ptype->dev == orig_dev ||
3123 		     ptype->dev == orig_or_bond)) {
3124 			if (pt_prev)
3125 				ret = deliver_skb(skb, pt_prev, orig_dev);
3126 			pt_prev = ptype;
3127 		}
3128 	}
3129 
3130 	if (pt_prev) {
3131 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3132 	} else {
3133 		atomic_long_inc(&skb->dev->rx_dropped);
3134 		kfree_skb(skb);
3135 		/* Jamal, now you will not be able to escape explaining
3136 		 * to me how you were going to use this. :-)
3137 		 */
3138 		ret = NET_RX_DROP;
3139 	}
3140 
3141 out:
3142 	rcu_read_unlock();
3143 	return ret;
3144 }
3145 
3146 /**
3147  *	netif_receive_skb - process receive buffer from network
3148  *	@skb: buffer to process
3149  *
3150  *	netif_receive_skb() is the main receive data processing function.
3151  *	It always succeeds. The buffer may be dropped during processing
3152  *	for congestion control or by the protocol layers.
3153  *
3154  *	This function may only be called from softirq context and interrupts
3155  *	should be enabled.
3156  *
3157  *	Return values (usually ignored):
3158  *	NET_RX_SUCCESS: no congestion
3159  *	NET_RX_DROP: packet was dropped
3160  */
3161 int netif_receive_skb(struct sk_buff *skb)
3162 {
3163 	if (netdev_tstamp_prequeue)
3164 		net_timestamp_check(skb);
3165 
3166 	if (skb_defer_rx_timestamp(skb))
3167 		return NET_RX_SUCCESS;
3168 
3169 #ifdef CONFIG_RPS
3170 	{
3171 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3172 		int cpu, ret;
3173 
3174 		rcu_read_lock();
3175 
3176 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3177 
3178 		if (cpu >= 0) {
3179 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3180 			rcu_read_unlock();
3181 		} else {
3182 			rcu_read_unlock();
3183 			ret = __netif_receive_skb(skb);
3184 		}
3185 
3186 		return ret;
3187 	}
3188 #else
3189 	return __netif_receive_skb(skb);
3190 #endif
3191 }
3192 EXPORT_SYMBOL(netif_receive_skb);
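/*
 * Editor's sketch (not part of the original file): a NAPI driver calls
 * netif_receive_skb() from its ->poll() callback, i.e. from softirq
 * context as required above.  my_hw_next_rx() is a hypothetical helper
 * that pulls the next completed frame off the device.
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		int work = 0;
 *		struct sk_buff *skb;
 *
 *		while (work < budget && (skb = my_hw_next_rx(napi->dev))) {
 *			netif_receive_skb(skb);
 *			work++;
 *		}
 *		if (work < budget)
 *			napi_complete(napi);
 *		return work;
 *	}
 */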
3193 
3194 /* Network device is going away, flush any packets still pending.
3195  * Called with irqs disabled.
3196  */
3197 static void flush_backlog(void *arg)
3198 {
3199 	struct net_device *dev = arg;
3200 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3201 	struct sk_buff *skb, *tmp;
3202 
3203 	rps_lock(sd);
3204 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3205 		if (skb->dev == dev) {
3206 			__skb_unlink(skb, &sd->input_pkt_queue);
3207 			kfree_skb(skb);
3208 			input_queue_head_incr(sd);
3209 		}
3210 	}
3211 	rps_unlock(sd);
3212 
3213 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3214 		if (skb->dev == dev) {
3215 			__skb_unlink(skb, &sd->process_queue);
3216 			kfree_skb(skb);
3217 			input_queue_head_incr(sd);
3218 		}
3219 	}
3220 }
3221 
3222 static int napi_gro_complete(struct sk_buff *skb)
3223 {
3224 	struct packet_type *ptype;
3225 	__be16 type = skb->protocol;
3226 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3227 	int err = -ENOENT;
3228 
3229 	if (NAPI_GRO_CB(skb)->count == 1) {
3230 		skb_shinfo(skb)->gso_size = 0;
3231 		goto out;
3232 	}
3233 
3234 	rcu_read_lock();
3235 	list_for_each_entry_rcu(ptype, head, list) {
3236 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3237 			continue;
3238 
3239 		err = ptype->gro_complete(skb);
3240 		break;
3241 	}
3242 	rcu_read_unlock();
3243 
3244 	if (err) {
3245 		WARN_ON(&ptype->list == head);
3246 		kfree_skb(skb);
3247 		return NET_RX_SUCCESS;
3248 	}
3249 
3250 out:
3251 	return netif_receive_skb(skb);
3252 }
3253 
3254 inline void napi_gro_flush(struct napi_struct *napi)
3255 {
3256 	struct sk_buff *skb, *next;
3257 
3258 	for (skb = napi->gro_list; skb; skb = next) {
3259 		next = skb->next;
3260 		skb->next = NULL;
3261 		napi_gro_complete(skb);
3262 	}
3263 
3264 	napi->gro_count = 0;
3265 	napi->gro_list = NULL;
3266 }
3267 EXPORT_SYMBOL(napi_gro_flush);
3268 
3269 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3270 {
3271 	struct sk_buff **pp = NULL;
3272 	struct packet_type *ptype;
3273 	__be16 type = skb->protocol;
3274 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3275 	int same_flow;
3276 	int mac_len;
3277 	enum gro_result ret;
3278 
3279 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3280 		goto normal;
3281 
3282 	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3283 		goto normal;
3284 
3285 	rcu_read_lock();
3286 	list_for_each_entry_rcu(ptype, head, list) {
3287 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3288 			continue;
3289 
3290 		skb_set_network_header(skb, skb_gro_offset(skb));
3291 		mac_len = skb->network_header - skb->mac_header;
3292 		skb->mac_len = mac_len;
3293 		NAPI_GRO_CB(skb)->same_flow = 0;
3294 		NAPI_GRO_CB(skb)->flush = 0;
3295 		NAPI_GRO_CB(skb)->free = 0;
3296 
3297 		pp = ptype->gro_receive(&napi->gro_list, skb);
3298 		break;
3299 	}
3300 	rcu_read_unlock();
3301 
3302 	if (&ptype->list == head)
3303 		goto normal;
3304 
3305 	same_flow = NAPI_GRO_CB(skb)->same_flow;
3306 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3307 
3308 	if (pp) {
3309 		struct sk_buff *nskb = *pp;
3310 
3311 		*pp = nskb->next;
3312 		nskb->next = NULL;
3313 		napi_gro_complete(nskb);
3314 		napi->gro_count--;
3315 	}
3316 
3317 	if (same_flow)
3318 		goto ok;
3319 
3320 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3321 		goto normal;
3322 
3323 	napi->gro_count++;
3324 	NAPI_GRO_CB(skb)->count = 1;
3325 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3326 	skb->next = napi->gro_list;
3327 	napi->gro_list = skb;
3328 	ret = GRO_HELD;
3329 
3330 pull:
3331 	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3332 		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3333 
3334 		BUG_ON(skb->end - skb->tail < grow);
3335 
3336 		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3337 
3338 		skb->tail += grow;
3339 		skb->data_len -= grow;
3340 
3341 		skb_shinfo(skb)->frags[0].page_offset += grow;
3342 		skb_shinfo(skb)->frags[0].size -= grow;
3343 
3344 		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3345 			put_page(skb_shinfo(skb)->frags[0].page);
3346 			memmove(skb_shinfo(skb)->frags,
3347 				skb_shinfo(skb)->frags + 1,
3348 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3349 		}
3350 	}
3351 
3352 ok:
3353 	return ret;
3354 
3355 normal:
3356 	ret = GRO_NORMAL;
3357 	goto pull;
3358 }
3359 EXPORT_SYMBOL(dev_gro_receive);
3360 
3361 static inline gro_result_t
3362 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3363 {
3364 	struct sk_buff *p;
3365 
3366 	for (p = napi->gro_list; p; p = p->next) {
3367 		unsigned long diffs;
3368 
3369 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3370 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3371 		diffs |= compare_ether_header(skb_mac_header(p),
3372 					      skb_gro_mac_header(skb));
3373 		NAPI_GRO_CB(p)->same_flow = !diffs;
3374 		NAPI_GRO_CB(p)->flush = 0;
3375 	}
3376 
3377 	return dev_gro_receive(napi, skb);
3378 }
3379 
3380 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3381 {
3382 	switch (ret) {
3383 	case GRO_NORMAL:
3384 		if (netif_receive_skb(skb))
3385 			ret = GRO_DROP;
3386 		break;
3387 
3388 	case GRO_DROP:
3389 	case GRO_MERGED_FREE:
3390 		kfree_skb(skb);
3391 		break;
3392 
3393 	case GRO_HELD:
3394 	case GRO_MERGED:
3395 		break;
3396 	}
3397 
3398 	return ret;
3399 }
3400 EXPORT_SYMBOL(napi_skb_finish);
3401 
3402 void skb_gro_reset_offset(struct sk_buff *skb)
3403 {
3404 	NAPI_GRO_CB(skb)->data_offset = 0;
3405 	NAPI_GRO_CB(skb)->frag0 = NULL;
3406 	NAPI_GRO_CB(skb)->frag0_len = 0;
3407 
3408 	if (skb->mac_header == skb->tail &&
3409 	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3410 		NAPI_GRO_CB(skb)->frag0 =
3411 			page_address(skb_shinfo(skb)->frags[0].page) +
3412 			skb_shinfo(skb)->frags[0].page_offset;
3413 		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3414 	}
3415 }
3416 EXPORT_SYMBOL(skb_gro_reset_offset);
3417 
3418 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3419 {
3420 	skb_gro_reset_offset(skb);
3421 
3422 	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3423 }
3424 EXPORT_SYMBOL(napi_gro_receive);
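/*
 * Editor's note (not part of the original file): a GRO-capable driver
 * simply substitutes napi_gro_receive(napi, skb) for the
 * netif_receive_skb(skb) call in a poll loop like the sketch above;
 * dev_gro_receive() then either merges the skb into napi->gro_list or
 * falls back to the normal receive path.
 */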
3425 
3426 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3427 {
3428 	__skb_pull(skb, skb_headlen(skb));
3429 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3430 	skb->vlan_tci = 0;
3431 	skb->dev = napi->dev;
3432 	skb->skb_iif = 0;
3433 
3434 	napi->skb = skb;
3435 }
3436 
3437 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3438 {
3439 	struct sk_buff *skb = napi->skb;
3440 
3441 	if (!skb) {
3442 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3443 		if (skb)
3444 			napi->skb = skb;
3445 	}
3446 	return skb;
3447 }
3448 EXPORT_SYMBOL(napi_get_frags);
3449 
3450 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3451 			       gro_result_t ret)
3452 {
3453 	switch (ret) {
3454 	case GRO_NORMAL:
3455 	case GRO_HELD:
3456 		skb->protocol = eth_type_trans(skb, skb->dev);
3457 
3458 		if (ret == GRO_HELD)
3459 			skb_gro_pull(skb, -ETH_HLEN);
3460 		else if (netif_receive_skb(skb))
3461 			ret = GRO_DROP;
3462 		break;
3463 
3464 	case GRO_DROP:
3465 	case GRO_MERGED_FREE:
3466 		napi_reuse_skb(napi, skb);
3467 		break;
3468 
3469 	case GRO_MERGED:
3470 		break;
3471 	}
3472 
3473 	return ret;
3474 }
3475 EXPORT_SYMBOL(napi_frags_finish);
3476 
3477 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3478 {
3479 	struct sk_buff *skb = napi->skb;
3480 	struct ethhdr *eth;
3481 	unsigned int hlen;
3482 	unsigned int off;
3483 
3484 	napi->skb = NULL;
3485 
3486 	skb_reset_mac_header(skb);
3487 	skb_gro_reset_offset(skb);
3488 
3489 	off = skb_gro_offset(skb);
3490 	hlen = off + sizeof(*eth);
3491 	eth = skb_gro_header_fast(skb, off);
3492 	if (skb_gro_header_hard(skb, hlen)) {
3493 		eth = skb_gro_header_slow(skb, hlen, off);
3494 		if (unlikely(!eth)) {
3495 			napi_reuse_skb(napi, skb);
3496 			skb = NULL;
3497 			goto out;
3498 		}
3499 	}
3500 
3501 	skb_gro_pull(skb, sizeof(*eth));
3502 
3503 	/*
3504 	 * This works because the only protocols we care about don't require
3505 	 * special handling.  We'll fix it up properly at the end.
3506 	 */
3507 	skb->protocol = eth->h_proto;
3508 
3509 out:
3510 	return skb;
3511 }
3512 EXPORT_SYMBOL(napi_frags_skb);
3513 
3514 gro_result_t napi_gro_frags(struct napi_struct *napi)
3515 {
3516 	struct sk_buff *skb = napi_frags_skb(napi);
3517 
3518 	if (!skb)
3519 		return GRO_DROP;
3520 
3521 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3522 }
3523 EXPORT_SYMBOL(napi_gro_frags);
3524 
3525 /*
3526  * net_rps_action sends any pending IPI's for rps.
3527  * Note: called with local irq disabled, but exits with local irq enabled.
3528  */
3529 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3530 {
3531 #ifdef CONFIG_RPS
3532 	struct softnet_data *remsd = sd->rps_ipi_list;
3533 
3534 	if (remsd) {
3535 		sd->rps_ipi_list = NULL;
3536 
3537 		local_irq_enable();
3538 
3539 		/* Send pending IPI's to kick RPS processing on remote cpus. */
3540 		while (remsd) {
3541 			struct softnet_data *next = remsd->rps_ipi_next;
3542 
3543 			if (cpu_online(remsd->cpu))
3544 				__smp_call_function_single(remsd->cpu,
3545 							   &remsd->csd, 0);
3546 			remsd = next;
3547 		}
3548 	} else
3549 #endif
3550 		local_irq_enable();
3551 }
3552 
3553 static int process_backlog(struct napi_struct *napi, int quota)
3554 {
3555 	int work = 0;
3556 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3557 
3558 #ifdef CONFIG_RPS
3559 	/* Check if we have pending IPIs; it's better to send them now
3560 	 * rather than waiting for net_rx_action() to end.
3561 	 */
3562 	if (sd->rps_ipi_list) {
3563 		local_irq_disable();
3564 		net_rps_action_and_irq_enable(sd);
3565 	}
3566 #endif
3567 	napi->weight = weight_p;
3568 	local_irq_disable();
3569 	while (work < quota) {
3570 		struct sk_buff *skb;
3571 		unsigned int qlen;
3572 
3573 		while ((skb = __skb_dequeue(&sd->process_queue))) {
3574 			local_irq_enable();
3575 			__netif_receive_skb(skb);
3576 			local_irq_disable();
3577 			input_queue_head_incr(sd);
3578 			if (++work >= quota) {
3579 				local_irq_enable();
3580 				return work;
3581 			}
3582 		}
3583 
3584 		rps_lock(sd);
3585 		qlen = skb_queue_len(&sd->input_pkt_queue);
3586 		if (qlen)
3587 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3588 						   &sd->process_queue);
3589 
3590 		if (qlen < quota - work) {
3591 			/*
3592 			 * Inline a custom version of __napi_complete().
3593 			 * Only the current CPU owns and manipulates this napi,
3594 			 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3595 			 * so we can use a plain write instead of clear_bit(),
3596 			 * and we don't need an smp_mb() memory barrier.
3597 			 */
3598 			list_del(&napi->poll_list);
3599 			napi->state = 0;
3600 
3601 			quota = work + qlen;
3602 		}
3603 		rps_unlock(sd);
3604 	}
3605 	local_irq_enable();
3606 
3607 	return work;
3608 }
3609 
3610 /**
3611  * __napi_schedule - schedule for receive
3612  * @n: entry to schedule
3613  *
3614  * The entry's receive function will be scheduled to run.
3615  */
3616 void __napi_schedule(struct napi_struct *n)
3617 {
3618 	unsigned long flags;
3619 
3620 	local_irq_save(flags);
3621 	____napi_schedule(&__get_cpu_var(softnet_data), n);
3622 	local_irq_restore(flags);
3623 }
3624 EXPORT_SYMBOL(__napi_schedule);
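/*
 * Editor's sketch (not part of the original file): the usual interrupt-side
 * counterpart - claim the NAPI instance, mask the device's RX interrupt,
 * and let the softirq do the work.  my_hw_irq_disable() is a hypothetical
 * helper.
 *
 *	static irqreturn_t my_isr(int irq, void *data)
 *	{
 *		struct my_priv *priv = data;
 *
 *		if (napi_schedule_prep(&priv->napi)) {
 *			my_hw_irq_disable(priv);
 *			__napi_schedule(&priv->napi);
 *		}
 *		return IRQ_HANDLED;
 *	}
 */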
3625 
3626 void __napi_complete(struct napi_struct *n)
3627 {
3628 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3629 	BUG_ON(n->gro_list);
3630 
3631 	list_del(&n->poll_list);
3632 	smp_mb__before_clear_bit();
3633 	clear_bit(NAPI_STATE_SCHED, &n->state);
3634 }
3635 EXPORT_SYMBOL(__napi_complete);
3636 
3637 void napi_complete(struct napi_struct *n)
3638 {
3639 	unsigned long flags;
3640 
3641 	/*
3642 	 * Don't let napi dequeue from the CPU poll list
3643 	 * just in case it's running on a different CPU.
3644 	 */
3645 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3646 		return;
3647 
3648 	napi_gro_flush(n);
3649 	local_irq_save(flags);
3650 	__napi_complete(n);
3651 	local_irq_restore(flags);
3652 }
3653 EXPORT_SYMBOL(napi_complete);
3654 
3655 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3656 		    int (*poll)(struct napi_struct *, int), int weight)
3657 {
3658 	INIT_LIST_HEAD(&napi->poll_list);
3659 	napi->gro_count = 0;
3660 	napi->gro_list = NULL;
3661 	napi->skb = NULL;
3662 	napi->poll = poll;
3663 	napi->weight = weight;
3664 	list_add(&napi->dev_list, &dev->napi_list);
3665 	napi->dev = dev;
3666 #ifdef CONFIG_NETPOLL
3667 	spin_lock_init(&napi->poll_lock);
3668 	napi->poll_owner = -1;
3669 #endif
3670 	set_bit(NAPI_STATE_SCHED, &napi->state);
3671 }
3672 EXPORT_SYMBOL(netif_napi_add);
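/*
 * Editor's sketch (not part of the original file): drivers register their
 * poll callback once at setup time, typically with the conventional
 * weight of 64 (matching weight_p above); my_poll() is the hypothetical
 * handler from the earlier sketches.
 *
 *	netif_napi_add(dev, &priv->napi, my_poll, 64);
 *
 * napi_enable()/napi_disable() then bracket the periods in which the
 * ->poll() callback may be scheduled.
 */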
3673 
3674 void netif_napi_del(struct napi_struct *napi)
3675 {
3676 	struct sk_buff *skb, *next;
3677 
3678 	list_del_init(&napi->dev_list);
3679 	napi_free_frags(napi);
3680 
3681 	for (skb = napi->gro_list; skb; skb = next) {
3682 		next = skb->next;
3683 		skb->next = NULL;
3684 		kfree_skb(skb);
3685 	}
3686 
3687 	napi->gro_list = NULL;
3688 	napi->gro_count = 0;
3689 }
3690 EXPORT_SYMBOL(netif_napi_del);
3691 
3692 static void net_rx_action(struct softirq_action *h)
3693 {
3694 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3695 	unsigned long time_limit = jiffies + 2;
3696 	int budget = netdev_budget;
3697 	void *have;
3698 
3699 	local_irq_disable();
3700 
3701 	while (!list_empty(&sd->poll_list)) {
3702 		struct napi_struct *n;
3703 		int work, weight;
3704 
3705 		/* If the softirq window is exhausted then punt.
3706 		 * Allow this to run for 2 jiffies, which will allow
3707 		 * an average latency of 1.5/HZ.
3708 		 */
3709 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3710 			goto softnet_break;
3711 
3712 		local_irq_enable();
3713 
3714 		/* Even though interrupts have been re-enabled, this
3715 		 * access is safe because interrupts can only add new
3716 		 * entries to the tail of this list, and only ->poll()
3717 		 * calls can remove this head entry from the list.
3718 		 */
3719 		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3720 
3721 		have = netpoll_poll_lock(n);
3722 
3723 		weight = n->weight;
3724 
3725 		/* This NAPI_STATE_SCHED test is for avoiding a race
3726 		 * with netpoll's poll_napi().  Only the entity which
3727 		 * obtains the lock and sees NAPI_STATE_SCHED set will
3728 		 * actually make the ->poll() call.  Therefore we avoid
3729 		 * accidentally calling ->poll() when NAPI is not scheduled.
3730 		 */
3731 		work = 0;
3732 		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3733 			work = n->poll(n, weight);
3734 			trace_napi_poll(n);
3735 		}
3736 
3737 		WARN_ON_ONCE(work > weight);
3738 
3739 		budget -= work;
3740 
3741 		local_irq_disable();
3742 
3743 		/* Drivers must not modify the NAPI state if they
3744 		 * consume the entire weight.  In such cases this code
3745 		 * still "owns" the NAPI instance and therefore can
3746 		 * move the instance around on the list at-will.
3747 		 */
3748 		if (unlikely(work == weight)) {
3749 			if (unlikely(napi_disable_pending(n))) {
3750 				local_irq_enable();
3751 				napi_complete(n);
3752 				local_irq_disable();
3753 			} else
3754 				list_move_tail(&n->poll_list, &sd->poll_list);
3755 		}
3756 
3757 		netpoll_poll_unlock(have);
3758 	}
3759 out:
3760 	net_rps_action_and_irq_enable(sd);
3761 
3762 #ifdef CONFIG_NET_DMA
3763 	/*
3764 	 * There may not be any more sk_buffs coming right now, so push
3765 	 * any pending DMA copies to hardware
3766 	 */
3767 	dma_issue_pending_all();
3768 #endif
3769 
3770 	return;
3771 
3772 softnet_break:
3773 	sd->time_squeeze++;
3774 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3775 	goto out;
3776 }
3777 
3778 static gifconf_func_t *gifconf_list[NPROTO];
3779 
3780 /**
3781  *	register_gifconf	-	register a SIOCGIF handler
3782  *	@family: Address family
3783  *	@gifconf: Function handler
3784  *
3785  *	Register protocol dependent address dumping routines. The handler
3786  *	that is passed must not be freed or reused until it has been replaced
3787  *	by another handler.
3788  */
3789 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3790 {
3791 	if (family >= NPROTO)
3792 		return -EINVAL;
3793 	gifconf_list[family] = gifconf;
3794 	return 0;
3795 }
3796 EXPORT_SYMBOL(register_gifconf);
3797 
3798 
3799 /*
3800  *	Map an interface index to its name (SIOCGIFNAME)
3801  */
3802 
3803 /*
3804  *	We need this ioctl for efficient implementation of the
3805  *	if_indextoname() function required by the IPv6 API.  Without
3806  *	it, we would have to search all the interfaces to find a
3807  *	match.  --pb
3808  */
3809 
3810 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3811 {
3812 	struct net_device *dev;
3813 	struct ifreq ifr;
3814 
3815 	/*
3816 	 *	Fetch the caller's info block.
3817 	 */
3818 
3819 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3820 		return -EFAULT;
3821 
3822 	rcu_read_lock();
3823 	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3824 	if (!dev) {
3825 		rcu_read_unlock();
3826 		return -ENODEV;
3827 	}
3828 
3829 	strcpy(ifr.ifr_name, dev->name);
3830 	rcu_read_unlock();
3831 
3832 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3833 		return -EFAULT;
3834 	return 0;
3835 }
3836 
3837 /*
3838  *	Perform a SIOCGIFCONF call. This structure will change
3839  *	size eventually, and there is nothing I can do about it.
3840  *	Thus we will need a 'compatibility mode'.
3841  */
3842 
3843 static int dev_ifconf(struct net *net, char __user *arg)
3844 {
3845 	struct ifconf ifc;
3846 	struct net_device *dev;
3847 	char __user *pos;
3848 	int len;
3849 	int total;
3850 	int i;
3851 
3852 	/*
3853 	 *	Fetch the caller's info block.
3854 	 */
3855 
3856 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3857 		return -EFAULT;
3858 
3859 	pos = ifc.ifc_buf;
3860 	len = ifc.ifc_len;
3861 
3862 	/*
3863 	 *	Loop over the interfaces, and write an info block for each.
3864 	 */
3865 
3866 	total = 0;
3867 	for_each_netdev(net, dev) {
3868 		for (i = 0; i < NPROTO; i++) {
3869 			if (gifconf_list[i]) {
3870 				int done;
3871 				if (!pos)
3872 					done = gifconf_list[i](dev, NULL, 0);
3873 				else
3874 					done = gifconf_list[i](dev, pos + total,
3875 							       len - total);
3876 				if (done < 0)
3877 					return -EFAULT;
3878 				total += done;
3879 			}
3880 		}
3881 	}
3882 
3883 	/*
3884 	 *	All done.  Write the updated control block back to the caller.
3885 	 */
3886 	ifc.ifc_len = total;
3887 
3888 	/*
3889 	 * 	Both BSD and Solaris return 0 here, so we do too.
3890 	 */
3891 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3892 }
3893 
3894 #ifdef CONFIG_PROC_FS
3895 /*
3896  *	This is invoked by the /proc filesystem handler to display a device
3897  *	in detail.
3898  */
3899 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3900 	__acquires(RCU)
3901 {
3902 	struct net *net = seq_file_net(seq);
3903 	loff_t off;
3904 	struct net_device *dev;
3905 
3906 	rcu_read_lock();
3907 	if (!*pos)
3908 		return SEQ_START_TOKEN;
3909 
3910 	off = 1;
3911 	for_each_netdev_rcu(net, dev)
3912 		if (off++ == *pos)
3913 			return dev;
3914 
3915 	return NULL;
3916 }
3917 
3918 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3919 {
3920 	struct net_device *dev = (v == SEQ_START_TOKEN) ?
3921 				  first_net_device(seq_file_net(seq)) :
3922 				  next_net_device((struct net_device *)v);
3923 
3924 	++*pos;
3925 	return rcu_dereference(dev);
3926 }
3927 
3928 void dev_seq_stop(struct seq_file *seq, void *v)
3929 	__releases(RCU)
3930 {
3931 	rcu_read_unlock();
3932 }
3933 
3934 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3935 {
3936 	struct rtnl_link_stats64 temp;
3937 	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
3938 
3939 	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3940 		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
3941 		   dev->name, stats->rx_bytes, stats->rx_packets,
3942 		   stats->rx_errors,
3943 		   stats->rx_dropped + stats->rx_missed_errors,
3944 		   stats->rx_fifo_errors,
3945 		   stats->rx_length_errors + stats->rx_over_errors +
3946 		    stats->rx_crc_errors + stats->rx_frame_errors,
3947 		   stats->rx_compressed, stats->multicast,
3948 		   stats->tx_bytes, stats->tx_packets,
3949 		   stats->tx_errors, stats->tx_dropped,
3950 		   stats->tx_fifo_errors, stats->collisions,
3951 		   stats->tx_carrier_errors +
3952 		    stats->tx_aborted_errors +
3953 		    stats->tx_window_errors +
3954 		    stats->tx_heartbeat_errors,
3955 		   stats->tx_compressed);
3956 }
3957 
3958 /*
3959  *	Called from the PROCfs module. This now uses the new arbitrary sized
3960  *	/proc/net interface to create /proc/net/dev
3961  */
3962 static int dev_seq_show(struct seq_file *seq, void *v)
3963 {
3964 	if (v == SEQ_START_TOKEN)
3965 		seq_puts(seq, "Inter-|   Receive                            "
3966 			      "                    |  Transmit\n"
3967 			      " face |bytes    packets errs drop fifo frame "
3968 			      "compressed multicast|bytes    packets errs "
3969 			      "drop fifo colls carrier compressed\n");
3970 	else
3971 		dev_seq_printf_stats(seq, v);
3972 	return 0;
3973 }
3974 
3975 static struct softnet_data *softnet_get_online(loff_t *pos)
3976 {
3977 	struct softnet_data *sd = NULL;
3978 
3979 	while (*pos < nr_cpu_ids)
3980 		if (cpu_online(*pos)) {
3981 			sd = &per_cpu(softnet_data, *pos);
3982 			break;
3983 		} else
3984 			++*pos;
3985 	return sd;
3986 }
3987 
3988 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3989 {
3990 	return softnet_get_online(pos);
3991 }
3992 
3993 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3994 {
3995 	++*pos;
3996 	return softnet_get_online(pos);
3997 }
3998 
3999 static void softnet_seq_stop(struct seq_file *seq, void *v)
4000 {
4001 }
4002 
4003 static int softnet_seq_show(struct seq_file *seq, void *v)
4004 {
4005 	struct softnet_data *sd = v;
4006 
4007 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4008 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4009 		   0, 0, 0, 0, /* was fastroute */
4010 		   sd->cpu_collision, sd->received_rps);
4011 	return 0;
4012 }
4013 
4014 static const struct seq_operations dev_seq_ops = {
4015 	.start = dev_seq_start,
4016 	.next  = dev_seq_next,
4017 	.stop  = dev_seq_stop,
4018 	.show  = dev_seq_show,
4019 };
4020 
4021 static int dev_seq_open(struct inode *inode, struct file *file)
4022 {
4023 	return seq_open_net(inode, file, &dev_seq_ops,
4024 			    sizeof(struct seq_net_private));
4025 }
4026 
4027 static const struct file_operations dev_seq_fops = {
4028 	.owner	 = THIS_MODULE,
4029 	.open    = dev_seq_open,
4030 	.read    = seq_read,
4031 	.llseek  = seq_lseek,
4032 	.release = seq_release_net,
4033 };
4034 
4035 static const struct seq_operations softnet_seq_ops = {
4036 	.start = softnet_seq_start,
4037 	.next  = softnet_seq_next,
4038 	.stop  = softnet_seq_stop,
4039 	.show  = softnet_seq_show,
4040 };
4041 
4042 static int softnet_seq_open(struct inode *inode, struct file *file)
4043 {
4044 	return seq_open(file, &softnet_seq_ops);
4045 }
4046 
4047 static const struct file_operations softnet_seq_fops = {
4048 	.owner	 = THIS_MODULE,
4049 	.open    = softnet_seq_open,
4050 	.read    = seq_read,
4051 	.llseek  = seq_lseek,
4052 	.release = seq_release,
4053 };
4054 
4055 static void *ptype_get_idx(loff_t pos)
4056 {
4057 	struct packet_type *pt = NULL;
4058 	loff_t i = 0;
4059 	int t;
4060 
4061 	list_for_each_entry_rcu(pt, &ptype_all, list) {
4062 		if (i == pos)
4063 			return pt;
4064 		++i;
4065 	}
4066 
4067 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4068 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4069 			if (i == pos)
4070 				return pt;
4071 			++i;
4072 		}
4073 	}
4074 	return NULL;
4075 }
4076 
4077 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4078 	__acquires(RCU)
4079 {
4080 	rcu_read_lock();
4081 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4082 }
4083 
4084 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4085 {
4086 	struct packet_type *pt;
4087 	struct list_head *nxt;
4088 	int hash;
4089 
4090 	++*pos;
4091 	if (v == SEQ_START_TOKEN)
4092 		return ptype_get_idx(0);
4093 
4094 	pt = v;
4095 	nxt = pt->list.next;
4096 	if (pt->type == htons(ETH_P_ALL)) {
4097 		if (nxt != &ptype_all)
4098 			goto found;
4099 		hash = 0;
4100 		nxt = ptype_base[0].next;
4101 	} else
4102 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4103 
4104 	while (nxt == &ptype_base[hash]) {
4105 		if (++hash >= PTYPE_HASH_SIZE)
4106 			return NULL;
4107 		nxt = ptype_base[hash].next;
4108 	}
4109 found:
4110 	return list_entry(nxt, struct packet_type, list);
4111 }
4112 
4113 static void ptype_seq_stop(struct seq_file *seq, void *v)
4114 	__releases(RCU)
4115 {
4116 	rcu_read_unlock();
4117 }
4118 
4119 static int ptype_seq_show(struct seq_file *seq, void *v)
4120 {
4121 	struct packet_type *pt = v;
4122 
4123 	if (v == SEQ_START_TOKEN)
4124 		seq_puts(seq, "Type Device      Function\n");
4125 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4126 		if (pt->type == htons(ETH_P_ALL))
4127 			seq_puts(seq, "ALL ");
4128 		else
4129 			seq_printf(seq, "%04x", ntohs(pt->type));
4130 
4131 		seq_printf(seq, " %-8s %pF\n",
4132 			   pt->dev ? pt->dev->name : "", pt->func);
4133 	}
4134 
4135 	return 0;
4136 }
4137 
4138 static const struct seq_operations ptype_seq_ops = {
4139 	.start = ptype_seq_start,
4140 	.next  = ptype_seq_next,
4141 	.stop  = ptype_seq_stop,
4142 	.show  = ptype_seq_show,
4143 };
4144 
4145 static int ptype_seq_open(struct inode *inode, struct file *file)
4146 {
4147 	return seq_open_net(inode, file, &ptype_seq_ops,
4148 			sizeof(struct seq_net_private));
4149 }
4150 
4151 static const struct file_operations ptype_seq_fops = {
4152 	.owner	 = THIS_MODULE,
4153 	.open    = ptype_seq_open,
4154 	.read    = seq_read,
4155 	.llseek  = seq_lseek,
4156 	.release = seq_release_net,
4157 };
4158 
4159 
4160 static int __net_init dev_proc_net_init(struct net *net)
4161 {
4162 	int rc = -ENOMEM;
4163 
4164 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4165 		goto out;
4166 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4167 		goto out_dev;
4168 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4169 		goto out_softnet;
4170 
4171 	if (wext_proc_init(net))
4172 		goto out_ptype;
4173 	rc = 0;
4174 out:
4175 	return rc;
4176 out_ptype:
4177 	proc_net_remove(net, "ptype");
4178 out_softnet:
4179 	proc_net_remove(net, "softnet_stat");
4180 out_dev:
4181 	proc_net_remove(net, "dev");
4182 	goto out;
4183 }
4184 
4185 static void __net_exit dev_proc_net_exit(struct net *net)
4186 {
4187 	wext_proc_exit(net);
4188 
4189 	proc_net_remove(net, "ptype");
4190 	proc_net_remove(net, "softnet_stat");
4191 	proc_net_remove(net, "dev");
4192 }
4193 
4194 static struct pernet_operations __net_initdata dev_proc_ops = {
4195 	.init = dev_proc_net_init,
4196 	.exit = dev_proc_net_exit,
4197 };
4198 
4199 static int __init dev_proc_init(void)
4200 {
4201 	return register_pernet_subsys(&dev_proc_ops);
4202 }
4203 #else
4204 #define dev_proc_init() 0
4205 #endif	/* CONFIG_PROC_FS */
4206 
4207 
4208 /**
4209  *	netdev_set_master	-	set up master/slave pair
4210  *	@slave: slave device
4211  *	@master: new master device
4212  *
4213  *	Changes the master device of the slave. Pass %NULL to break the
4214  *	bonding. The caller must hold the RTNL semaphore. On a failure
4215  *	a negative errno code is returned. On success the reference counts
4216  *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4217  *	function returns zero.
4218  */
4219 int netdev_set_master(struct net_device *slave, struct net_device *master)
4220 {
4221 	struct net_device *old = slave->master;
4222 
4223 	ASSERT_RTNL();
4224 
4225 	if (master) {
4226 		if (old)
4227 			return -EBUSY;
4228 		dev_hold(master);
4229 	}
4230 
4231 	slave->master = master;
4232 
4233 	if (old) {
4234 		synchronize_net();
4235 		dev_put(old);
4236 	}
4237 	if (master)
4238 		slave->flags |= IFF_SLAVE;
4239 	else
4240 		slave->flags &= ~IFF_SLAVE;
4241 
4242 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4243 	return 0;
4244 }
4245 EXPORT_SYMBOL(netdev_set_master);
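/*
 * Usage sketch (illustrative only, not compiled here): the bonding driver
 * uses this under rtnl_lock() when enslaving and releasing interfaces,
 * roughly:
 *
 *	err = netdev_set_master(slave_dev, bond_dev);	/@ enslave @/
 *	...
 *	netdev_set_master(slave_dev, NULL);		/@ release @/
 *
 * (/@ @/ marks nested comments in this sketch.)  A non-NULL master fails
 * with -EBUSY if the slave already has one.
 */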
4246 
4247 static void dev_change_rx_flags(struct net_device *dev, int flags)
4248 {
4249 	const struct net_device_ops *ops = dev->netdev_ops;
4250 
4251 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4252 		ops->ndo_change_rx_flags(dev, flags);
4253 }
4254 
4255 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4256 {
4257 	unsigned short old_flags = dev->flags;
4258 	uid_t uid;
4259 	gid_t gid;
4260 
4261 	ASSERT_RTNL();
4262 
4263 	dev->flags |= IFF_PROMISC;
4264 	dev->promiscuity += inc;
4265 	if (dev->promiscuity == 0) {
4266 		/*
4267 		 * Avoid overflow.
4268 		 * If inc causes overflow, leave promisc untouched and return an error.
4269 		 */
4270 		if (inc < 0)
4271 			dev->flags &= ~IFF_PROMISC;
4272 		else {
4273 			dev->promiscuity -= inc;
4274 			printk(KERN_WARNING "%s: promiscuity touches roof, "
4275 				"set promiscuity failed, promiscuity feature "
4276 				"of device might be broken.\n", dev->name);
4277 			return -EOVERFLOW;
4278 		}
4279 	}
4280 	if (dev->flags != old_flags) {
4281 		printk(KERN_INFO "device %s %s promiscuous mode\n",
4282 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4283 							       "left");
4284 		if (audit_enabled) {
4285 			current_uid_gid(&uid, &gid);
4286 			audit_log(current->audit_context, GFP_ATOMIC,
4287 				AUDIT_ANOM_PROMISCUOUS,
4288 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4289 				dev->name, (dev->flags & IFF_PROMISC),
4290 				(old_flags & IFF_PROMISC),
4291 				audit_get_loginuid(current),
4292 				uid, gid,
4293 				audit_get_sessionid(current));
4294 		}
4295 
4296 		dev_change_rx_flags(dev, IFF_PROMISC);
4297 	}
4298 	return 0;
4299 }
4300 
4301 /**
4302  *	dev_set_promiscuity	- update promiscuity count on a device
4303  *	@dev: device
4304  *	@inc: modifier
4305  *
4306  *	Add or remove promiscuity from a device. While the count in the device
4307  *	remains above zero the interface remains promiscuous. Once it hits zero
4308  *	the device reverts back to normal filtering operation. A negative inc
4309  *	value is used to drop promiscuity on the device.
4310  *	Return 0 if successful or a negative errno code on error.
4311  */
4312 int dev_set_promiscuity(struct net_device *dev, int inc)
4313 {
4314 	unsigned short old_flags = dev->flags;
4315 	int err;
4316 
4317 	err = __dev_set_promiscuity(dev, inc);
4318 	if (err < 0)
4319 		return err;
4320 	if (dev->flags != old_flags)
4321 		dev_set_rx_mode(dev);
4322 	return err;
4323 }
4324 EXPORT_SYMBOL(dev_set_promiscuity);
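/*
 * Usage sketch (illustrative only): a capture or bridging path that needs
 * to see all frames typically pairs the calls under rtnl_lock():
 *
 *	dev_set_promiscuity(dev, 1);	take a promiscuity reference
 *	...
 *	dev_set_promiscuity(dev, -1);	drop it again when done
 *
 * The device only leaves promiscuous mode once every +1 has been balanced
 * by a matching -1.
 */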
4325 
4326 /**
4327  *	dev_set_allmulti	- update allmulti count on a device
4328  *	@dev: device
4329  *	@inc: modifier
4330  *
4331  *	Add or remove reception of all multicast frames to a device. While the
4332  *	count in the device remains above zero the interface remains listening
4333  *	to all multicast frames. Once it hits zero the device reverts back to normal
4334  *	filtering operation. A negative @inc value is used to drop the counter
4335  *	when releasing a resource needing all multicasts.
4336  *	Return 0 if successful or a negative errno code on error.
4337  */
4338 
4339 int dev_set_allmulti(struct net_device *dev, int inc)
4340 {
4341 	unsigned short old_flags = dev->flags;
4342 
4343 	ASSERT_RTNL();
4344 
4345 	dev->flags |= IFF_ALLMULTI;
4346 	dev->allmulti += inc;
4347 	if (dev->allmulti == 0) {
4348 		/*
4349 		 * Avoid overflow.
4350 		 * If inc causes overflow, leave allmulti untouched and return an error.
4351 		 */
4352 		if (inc < 0)
4353 			dev->flags &= ~IFF_ALLMULTI;
4354 		else {
4355 			dev->allmulti -= inc;
4356 			printk(KERN_WARNING "%s: allmulti touches roof, "
4357 				"set allmulti failed, allmulti feature of "
4358 				"device might be broken.\n", dev->name);
4359 			return -EOVERFLOW;
4360 		}
4361 	}
4362 	if (dev->flags ^ old_flags) {
4363 		dev_change_rx_flags(dev, IFF_ALLMULTI);
4364 		dev_set_rx_mode(dev);
4365 	}
4366 	return 0;
4367 }
4368 EXPORT_SYMBOL(dev_set_allmulti);
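/*
 * Usage sketch (illustrative only): a virtual device or routing helper
 * that must receive every multicast frame balances the calls the same way
 * as promiscuity, under rtnl_lock():
 *
 *	dev_set_allmulti(dev, 1);
 *	...
 *	dev_set_allmulti(dev, -1);
 */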
4369 
4370 /*
4371  *	Upload unicast and multicast address lists to device and
4372  *	configure RX filtering. When the device doesn't support unicast
4373  *	filtering it is put in promiscuous mode while unicast addresses
4374  *	are present.
4375  */
4376 void __dev_set_rx_mode(struct net_device *dev)
4377 {
4378 	const struct net_device_ops *ops = dev->netdev_ops;
4379 
4380 	/* dev_open will call this function so the list will stay sane. */
4381 	if (!(dev->flags&IFF_UP))
4382 		return;
4383 
4384 	if (!netif_device_present(dev))
4385 		return;
4386 
4387 	if (ops->ndo_set_rx_mode)
4388 		ops->ndo_set_rx_mode(dev);
4389 	else {
4390 		/* Unicast addresses changes may only happen under the rtnl,
4391 		 * therefore calling __dev_set_promiscuity here is safe.
4392 		 */
4393 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4394 			__dev_set_promiscuity(dev, 1);
4395 			dev->uc_promisc = 1;
4396 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4397 			__dev_set_promiscuity(dev, -1);
4398 			dev->uc_promisc = 0;
4399 		}
4400 
4401 		if (ops->ndo_set_multicast_list)
4402 			ops->ndo_set_multicast_list(dev);
4403 	}
4404 }
4405 
4406 void dev_set_rx_mode(struct net_device *dev)
4407 {
4408 	netif_addr_lock_bh(dev);
4409 	__dev_set_rx_mode(dev);
4410 	netif_addr_unlock_bh(dev);
4411 }
4412 
4413 /**
4414  *	dev_get_flags - get flags reported to userspace
4415  *	@dev: device
4416  *
4417  *	Get the combination of flag bits exported through APIs to userspace.
4418  */
4419 unsigned dev_get_flags(const struct net_device *dev)
4420 {
4421 	unsigned flags;
4422 
4423 	flags = (dev->flags & ~(IFF_PROMISC |
4424 				IFF_ALLMULTI |
4425 				IFF_RUNNING |
4426 				IFF_LOWER_UP |
4427 				IFF_DORMANT)) |
4428 		(dev->gflags & (IFF_PROMISC |
4429 				IFF_ALLMULTI));
4430 
4431 	if (netif_running(dev)) {
4432 		if (netif_oper_up(dev))
4433 			flags |= IFF_RUNNING;
4434 		if (netif_carrier_ok(dev))
4435 			flags |= IFF_LOWER_UP;
4436 		if (netif_dormant(dev))
4437 			flags |= IFF_DORMANT;
4438 	}
4439 
4440 	return flags;
4441 }
4442 EXPORT_SYMBOL(dev_get_flags);
4443 
4444 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4445 {
4446 	int old_flags = dev->flags;
4447 	int ret;
4448 
4449 	ASSERT_RTNL();
4450 
4451 	/*
4452 	 *	Set the flags on our device.
4453 	 */
4454 
4455 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4456 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4457 			       IFF_AUTOMEDIA)) |
4458 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4459 				    IFF_ALLMULTI));
4460 
4461 	/*
4462 	 *	Load in the correct multicast list now the flags have changed.
4463 	 */
4464 
4465 	if ((old_flags ^ flags) & IFF_MULTICAST)
4466 		dev_change_rx_flags(dev, IFF_MULTICAST);
4467 
4468 	dev_set_rx_mode(dev);
4469 
4470 	/*
4471 	 *	Have we downed the interface? We handle IFF_UP ourselves
4472 	 *	according to user attempts to set it, rather than blindly
4473 	 *	setting it.
4474 	 */
4475 
4476 	ret = 0;
4477 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4478 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4479 
4480 		if (!ret)
4481 			dev_set_rx_mode(dev);
4482 	}
4483 
4484 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4485 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4486 
4487 		dev->gflags ^= IFF_PROMISC;
4488 		dev_set_promiscuity(dev, inc);
4489 	}
4490 
4491 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4492 	   is important. Some (broken) drivers set IFF_PROMISC when
4493 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
4494 	 */
4495 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4496 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4497 
4498 		dev->gflags ^= IFF_ALLMULTI;
4499 		dev_set_allmulti(dev, inc);
4500 	}
4501 
4502 	return ret;
4503 }
4504 
4505 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4506 {
4507 	unsigned int changes = dev->flags ^ old_flags;
4508 
4509 	if (changes & IFF_UP) {
4510 		if (dev->flags & IFF_UP)
4511 			call_netdevice_notifiers(NETDEV_UP, dev);
4512 		else
4513 			call_netdevice_notifiers(NETDEV_DOWN, dev);
4514 	}
4515 
4516 	if (dev->flags & IFF_UP &&
4517 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4518 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4519 }
4520 
4521 /**
4522  *	dev_change_flags - change device settings
4523  *	@dev: device
4524  *	@flags: device state flags
4525  *
4526  *	Change settings on device based state flags. The flags are
4527  *	in the userspace exported format.
4528  */
4529 int dev_change_flags(struct net_device *dev, unsigned flags)
4530 {
4531 	int ret, changes;
4532 	int old_flags = dev->flags;
4533 
4534 	ret = __dev_change_flags(dev, flags);
4535 	if (ret < 0)
4536 		return ret;
4537 
4538 	changes = old_flags ^ dev->flags;
4539 	if (changes)
4540 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4541 
4542 	__dev_notify_flags(dev, old_flags);
4543 	return ret;
4544 }
4545 EXPORT_SYMBOL(dev_change_flags);
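/*
 * Usage sketch (illustrative only): bringing an interface up or down from
 * kernel code mirrors what the SIOCSIFFLAGS path does, under rtnl_lock():
 *
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);	bring up
 *	err = dev_change_flags(dev, dev->flags & ~IFF_UP);	take down
 *
 * Notifiers and an RTM_NEWLINK message are generated only for bits that
 * actually changed.
 */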
4546 
4547 /**
4548  *	dev_set_mtu - Change maximum transfer unit
4549  *	@dev: device
4550  *	@new_mtu: new transfer unit
4551  *
4552  *	Change the maximum transfer size of the network device.
4553  */
4554 int dev_set_mtu(struct net_device *dev, int new_mtu)
4555 {
4556 	const struct net_device_ops *ops = dev->netdev_ops;
4557 	int err;
4558 
4559 	if (new_mtu == dev->mtu)
4560 		return 0;
4561 
4562 	/*	MTU must be positive.	 */
4563 	if (new_mtu < 0)
4564 		return -EINVAL;
4565 
4566 	if (!netif_device_present(dev))
4567 		return -ENODEV;
4568 
4569 	err = 0;
4570 	if (ops->ndo_change_mtu)
4571 		err = ops->ndo_change_mtu(dev, new_mtu);
4572 	else
4573 		dev->mtu = new_mtu;
4574 
4575 	if (!err && dev->flags & IFF_UP)
4576 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4577 	return err;
4578 }
4579 EXPORT_SYMBOL(dev_set_mtu);
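/*
 * Usage sketch (illustrative only, the MTU value is hypothetical): callers
 * hold the RTNL lock, as the SIOCSIFMTU path does:
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);
 *	rtnl_unlock();
 *
 * The driver's ndo_change_mtu() may reject the value; otherwise dev->mtu
 * is updated and NETDEV_CHANGEMTU is broadcast if the device is up.
 */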
4580 
4581 /**
4582  *	dev_set_mac_address - Change Media Access Control Address
4583  *	@dev: device
4584  *	@sa: new address
4585  *
4586  *	Change the hardware (MAC) address of the device
4587  */
4588 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4589 {
4590 	const struct net_device_ops *ops = dev->netdev_ops;
4591 	int err;
4592 
4593 	if (!ops->ndo_set_mac_address)
4594 		return -EOPNOTSUPP;
4595 	if (sa->sa_family != dev->type)
4596 		return -EINVAL;
4597 	if (!netif_device_present(dev))
4598 		return -ENODEV;
4599 	err = ops->ndo_set_mac_address(dev, sa);
4600 	if (!err)
4601 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4602 	return err;
4603 }
4604 EXPORT_SYMBOL(dev_set_mac_address);
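/*
 * Usage sketch (illustrative only; "new_mac" is a hypothetical buffer of
 * dev->addr_len bytes):
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;	must match, e.g. ARPHRD_ETHER
 *	memcpy(sa.sa_data, new_mac, dev->addr_len);
 *	rtnl_lock();
 *	err = dev_set_mac_address(dev, &sa);
 *	rtnl_unlock();
 */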
4605 
4606 /*
4607  *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4608  */
4609 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4610 {
4611 	int err;
4612 	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4613 
4614 	if (!dev)
4615 		return -ENODEV;
4616 
4617 	switch (cmd) {
4618 	case SIOCGIFFLAGS:	/* Get interface flags */
4619 		ifr->ifr_flags = (short) dev_get_flags(dev);
4620 		return 0;
4621 
4622 	case SIOCGIFMETRIC:	/* Get the metric on the interface
4623 				   (currently unused) */
4624 		ifr->ifr_metric = 0;
4625 		return 0;
4626 
4627 	case SIOCGIFMTU:	/* Get the MTU of a device */
4628 		ifr->ifr_mtu = dev->mtu;
4629 		return 0;
4630 
4631 	case SIOCGIFHWADDR:
4632 		if (!dev->addr_len)
4633 			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4634 		else
4635 			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4636 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4637 		ifr->ifr_hwaddr.sa_family = dev->type;
4638 		return 0;
4639 
4640 	case SIOCGIFSLAVE:
4641 		err = -EINVAL;
4642 		break;
4643 
4644 	case SIOCGIFMAP:
4645 		ifr->ifr_map.mem_start = dev->mem_start;
4646 		ifr->ifr_map.mem_end   = dev->mem_end;
4647 		ifr->ifr_map.base_addr = dev->base_addr;
4648 		ifr->ifr_map.irq       = dev->irq;
4649 		ifr->ifr_map.dma       = dev->dma;
4650 		ifr->ifr_map.port      = dev->if_port;
4651 		return 0;
4652 
4653 	case SIOCGIFINDEX:
4654 		ifr->ifr_ifindex = dev->ifindex;
4655 		return 0;
4656 
4657 	case SIOCGIFTXQLEN:
4658 		ifr->ifr_qlen = dev->tx_queue_len;
4659 		return 0;
4660 
4661 	default:
4662 		/* dev_ioctl() should ensure this case
4663 		 * is never reached
4664 		 */
4665 		WARN_ON(1);
4666 		err = -EINVAL;
4667 		break;
4668 
4669 	}
4670 	return err;
4671 }
4672 
4673 /*
4674  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4675  */
4676 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4677 {
4678 	int err;
4679 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4680 	const struct net_device_ops *ops;
4681 
4682 	if (!dev)
4683 		return -ENODEV;
4684 
4685 	ops = dev->netdev_ops;
4686 
4687 	switch (cmd) {
4688 	case SIOCSIFFLAGS:	/* Set interface flags */
4689 		return dev_change_flags(dev, ifr->ifr_flags);
4690 
4691 	case SIOCSIFMETRIC:	/* Set the metric on the interface
4692 				   (currently unused) */
4693 		return -EOPNOTSUPP;
4694 
4695 	case SIOCSIFMTU:	/* Set the MTU of a device */
4696 		return dev_set_mtu(dev, ifr->ifr_mtu);
4697 
4698 	case SIOCSIFHWADDR:
4699 		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4700 
4701 	case SIOCSIFHWBROADCAST:
4702 		if (ifr->ifr_hwaddr.sa_family != dev->type)
4703 			return -EINVAL;
4704 		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4705 		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4706 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4707 		return 0;
4708 
4709 	case SIOCSIFMAP:
4710 		if (ops->ndo_set_config) {
4711 			if (!netif_device_present(dev))
4712 				return -ENODEV;
4713 			return ops->ndo_set_config(dev, &ifr->ifr_map);
4714 		}
4715 		return -EOPNOTSUPP;
4716 
4717 	case SIOCADDMULTI:
4718 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4719 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4720 			return -EINVAL;
4721 		if (!netif_device_present(dev))
4722 			return -ENODEV;
4723 		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4724 
4725 	case SIOCDELMULTI:
4726 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4727 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4728 			return -EINVAL;
4729 		if (!netif_device_present(dev))
4730 			return -ENODEV;
4731 		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4732 
4733 	case SIOCSIFTXQLEN:
4734 		if (ifr->ifr_qlen < 0)
4735 			return -EINVAL;
4736 		dev->tx_queue_len = ifr->ifr_qlen;
4737 		return 0;
4738 
4739 	case SIOCSIFNAME:
4740 		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4741 		return dev_change_name(dev, ifr->ifr_newname);
4742 
4743 	/*
4744 	 *	Unknown or private ioctl
4745 	 */
4746 	default:
4747 		if ((cmd >= SIOCDEVPRIVATE &&
4748 		    cmd <= SIOCDEVPRIVATE + 15) ||
4749 		    cmd == SIOCBONDENSLAVE ||
4750 		    cmd == SIOCBONDRELEASE ||
4751 		    cmd == SIOCBONDSETHWADDR ||
4752 		    cmd == SIOCBONDSLAVEINFOQUERY ||
4753 		    cmd == SIOCBONDINFOQUERY ||
4754 		    cmd == SIOCBONDCHANGEACTIVE ||
4755 		    cmd == SIOCGMIIPHY ||
4756 		    cmd == SIOCGMIIREG ||
4757 		    cmd == SIOCSMIIREG ||
4758 		    cmd == SIOCBRADDIF ||
4759 		    cmd == SIOCBRDELIF ||
4760 		    cmd == SIOCSHWTSTAMP ||
4761 		    cmd == SIOCWANDEV) {
4762 			err = -EOPNOTSUPP;
4763 			if (ops->ndo_do_ioctl) {
4764 				if (netif_device_present(dev))
4765 					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4766 				else
4767 					err = -ENODEV;
4768 			}
4769 		} else
4770 			err = -EINVAL;
4771 
4772 	}
4773 	return err;
4774 }
4775 
4776 /*
4777  *	This function handles all "interface"-type I/O control requests. The actual
4778  *	'doing' part of this is dev_ifsioc above.
4779  */
4780 
4781 /**
4782  *	dev_ioctl	-	network device ioctl
4783  *	@net: the applicable net namespace
4784  *	@cmd: command to issue
4785  *	@arg: pointer to a struct ifreq in user space
4786  *
4787  *	Issue ioctl functions to devices. This is normally called by the
4788  *	user space syscall interfaces but can sometimes be useful for
4789  *	other purposes. The return value is the return from the syscall if
4790  *	positive or a negative errno code on error.
4791  */
4792 
4793 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4794 {
4795 	struct ifreq ifr;
4796 	int ret;
4797 	char *colon;
4798 
4799 	/* One special case: SIOCGIFCONF takes ifconf argument
4800 	   and requires shared lock, because it sleeps writing
4801 	   to user space.
4802 	 */
4803 
4804 	if (cmd == SIOCGIFCONF) {
4805 		rtnl_lock();
4806 		ret = dev_ifconf(net, (char __user *) arg);
4807 		rtnl_unlock();
4808 		return ret;
4809 	}
4810 	if (cmd == SIOCGIFNAME)
4811 		return dev_ifname(net, (struct ifreq __user *)arg);
4812 
4813 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4814 		return -EFAULT;
4815 
4816 	ifr.ifr_name[IFNAMSIZ-1] = 0;
4817 
4818 	colon = strchr(ifr.ifr_name, ':');
4819 	if (colon)
4820 		*colon = 0;
4821 
4822 	/*
4823 	 *	See which interface the caller is talking about.
4824 	 */
4825 
4826 	switch (cmd) {
4827 	/*
4828 	 *	These ioctl calls:
4829 	 *	- can be done by all.
4830 	 *	- atomic and do not require locking.
4831 	 *	- return a value
4832 	 */
4833 	case SIOCGIFFLAGS:
4834 	case SIOCGIFMETRIC:
4835 	case SIOCGIFMTU:
4836 	case SIOCGIFHWADDR:
4837 	case SIOCGIFSLAVE:
4838 	case SIOCGIFMAP:
4839 	case SIOCGIFINDEX:
4840 	case SIOCGIFTXQLEN:
4841 		dev_load(net, ifr.ifr_name);
4842 		rcu_read_lock();
4843 		ret = dev_ifsioc_locked(net, &ifr, cmd);
4844 		rcu_read_unlock();
4845 		if (!ret) {
4846 			if (colon)
4847 				*colon = ':';
4848 			if (copy_to_user(arg, &ifr,
4849 					 sizeof(struct ifreq)))
4850 				ret = -EFAULT;
4851 		}
4852 		return ret;
4853 
4854 	case SIOCETHTOOL:
4855 		dev_load(net, ifr.ifr_name);
4856 		rtnl_lock();
4857 		ret = dev_ethtool(net, &ifr);
4858 		rtnl_unlock();
4859 		if (!ret) {
4860 			if (colon)
4861 				*colon = ':';
4862 			if (copy_to_user(arg, &ifr,
4863 					 sizeof(struct ifreq)))
4864 				ret = -EFAULT;
4865 		}
4866 		return ret;
4867 
4868 	/*
4869 	 *	These ioctl calls:
4870 	 *	- require superuser power.
4871 	 *	- require strict serialization.
4872 	 *	- return a value
4873 	 */
4874 	case SIOCGMIIPHY:
4875 	case SIOCGMIIREG:
4876 	case SIOCSIFNAME:
4877 		if (!capable(CAP_NET_ADMIN))
4878 			return -EPERM;
4879 		dev_load(net, ifr.ifr_name);
4880 		rtnl_lock();
4881 		ret = dev_ifsioc(net, &ifr, cmd);
4882 		rtnl_unlock();
4883 		if (!ret) {
4884 			if (colon)
4885 				*colon = ':';
4886 			if (copy_to_user(arg, &ifr,
4887 					 sizeof(struct ifreq)))
4888 				ret = -EFAULT;
4889 		}
4890 		return ret;
4891 
4892 	/*
4893 	 *	These ioctl calls:
4894 	 *	- require superuser power.
4895 	 *	- require strict serialization.
4896 	 *	- do not return a value
4897 	 */
4898 	case SIOCSIFFLAGS:
4899 	case SIOCSIFMETRIC:
4900 	case SIOCSIFMTU:
4901 	case SIOCSIFMAP:
4902 	case SIOCSIFHWADDR:
4903 	case SIOCSIFSLAVE:
4904 	case SIOCADDMULTI:
4905 	case SIOCDELMULTI:
4906 	case SIOCSIFHWBROADCAST:
4907 	case SIOCSIFTXQLEN:
4908 	case SIOCSMIIREG:
4909 	case SIOCBONDENSLAVE:
4910 	case SIOCBONDRELEASE:
4911 	case SIOCBONDSETHWADDR:
4912 	case SIOCBONDCHANGEACTIVE:
4913 	case SIOCBRADDIF:
4914 	case SIOCBRDELIF:
4915 	case SIOCSHWTSTAMP:
4916 		if (!capable(CAP_NET_ADMIN))
4917 			return -EPERM;
4918 		/* fall through */
4919 	case SIOCBONDSLAVEINFOQUERY:
4920 	case SIOCBONDINFOQUERY:
4921 		dev_load(net, ifr.ifr_name);
4922 		rtnl_lock();
4923 		ret = dev_ifsioc(net, &ifr, cmd);
4924 		rtnl_unlock();
4925 		return ret;
4926 
4927 	case SIOCGIFMEM:
4928 		/* Get the per device memory space. We can add this but
4929 		 * currently do not support it */
4930 	case SIOCSIFMEM:
4931 		/* Set the per device memory buffer space.
4932 		 * Not applicable in our case */
4933 	case SIOCSIFLINK:
4934 		return -EINVAL;
4935 
4936 	/*
4937 	 *	Unknown or private ioctl.
4938 	 */
4939 	default:
4940 		if (cmd == SIOCWANDEV ||
4941 		    (cmd >= SIOCDEVPRIVATE &&
4942 		     cmd <= SIOCDEVPRIVATE + 15)) {
4943 			dev_load(net, ifr.ifr_name);
4944 			rtnl_lock();
4945 			ret = dev_ifsioc(net, &ifr, cmd);
4946 			rtnl_unlock();
4947 			if (!ret && copy_to_user(arg, &ifr,
4948 						 sizeof(struct ifreq)))
4949 				ret = -EFAULT;
4950 			return ret;
4951 		}
4952 		/* Take care of Wireless Extensions */
4953 		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4954 			return wext_handle_ioctl(net, &ifr, cmd, arg);
4955 		return -EINVAL;
4956 	}
4957 }
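/*
 * For reference, the matching user space side is an ioctl() on any socket
 * with a struct ifreq argument (illustrative sketch only; "eth0" is an
 * example name):
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ);
 *	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *		printf("mtu %d\n", ifr.ifr_mtu);
 */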
4958 
4959 
4960 /**
4961  *	dev_new_index	-	allocate an ifindex
4962  *	@net: the applicable net namespace
4963  *
4964  *	Returns a suitable unique value for a new device interface
4965  *	number.  The caller must hold the rtnl semaphore or the
4966  *	dev_base_lock to be sure it remains unique.
4967  */
4968 static int dev_new_index(struct net *net)
4969 {
4970 	static int ifindex;
4971 	for (;;) {
4972 		if (++ifindex <= 0)
4973 			ifindex = 1;
4974 		if (!__dev_get_by_index(net, ifindex))
4975 			return ifindex;
4976 	}
4977 }
4978 
4979 /* Delayed registration/unregisteration */
4980 static LIST_HEAD(net_todo_list);
4981 
4982 static void net_set_todo(struct net_device *dev)
4983 {
4984 	list_add_tail(&dev->todo_list, &net_todo_list);
4985 }
4986 
4987 static void rollback_registered_many(struct list_head *head)
4988 {
4989 	struct net_device *dev, *tmp;
4990 
4991 	BUG_ON(dev_boot_phase);
4992 	ASSERT_RTNL();
4993 
4994 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4995 		/* Some devices call us without having registered,
4996 		 * as part of initialization unwind. Remove those
4997 		 * devices and proceed with the remaining ones.
4998 		 */
4999 		if (dev->reg_state == NETREG_UNINITIALIZED) {
5000 			pr_debug("unregister_netdevice: device %s/%p never "
5001 				 "was registered\n", dev->name, dev);
5002 
5003 			WARN_ON(1);
5004 			list_del(&dev->unreg_list);
5005 			continue;
5006 		}
5007 
5008 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5009 	}
5010 
5011 	/* If device is running, close it first. */
5012 	dev_close_many(head);
5013 
5014 	list_for_each_entry(dev, head, unreg_list) {
5015 		/* And unlink it from device chain. */
5016 		unlist_netdevice(dev);
5017 
5018 		dev->reg_state = NETREG_UNREGISTERING;
5019 	}
5020 
5021 	synchronize_net();
5022 
5023 	list_for_each_entry(dev, head, unreg_list) {
5024 		/* Shutdown queueing discipline. */
5025 		dev_shutdown(dev);
5026 
5027 
5028 		/* Notify protocols that we are about to destroy
5029 		   this device. They should clean all the things.
5030 		*/
5031 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5032 
5033 		if (!dev->rtnl_link_ops ||
5034 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5035 			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5036 
5037 		/*
5038 		 *	Flush the unicast and multicast chains
5039 		 */
5040 		dev_uc_flush(dev);
5041 		dev_mc_flush(dev);
5042 
5043 		if (dev->netdev_ops->ndo_uninit)
5044 			dev->netdev_ops->ndo_uninit(dev);
5045 
5046 		/* Notifier chain MUST detach us from master device. */
5047 		WARN_ON(dev->master);
5048 
5049 		/* Remove entries from kobject tree */
5050 		netdev_unregister_kobject(dev);
5051 	}
5052 
5053 	/* Process any work delayed until the end of the batch */
5054 	dev = list_first_entry(head, struct net_device, unreg_list);
5055 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5056 
5057 	rcu_barrier();
5058 
5059 	list_for_each_entry(dev, head, unreg_list)
5060 		dev_put(dev);
5061 }
5062 
5063 static void rollback_registered(struct net_device *dev)
5064 {
5065 	LIST_HEAD(single);
5066 
5067 	list_add(&dev->unreg_list, &single);
5068 	rollback_registered_many(&single);
5069 	list_del(&single);
5070 }
5071 
5072 unsigned long netdev_fix_features(unsigned long features, const char *name)
5073 {
5074 	/* Fix illegal SG+CSUM combinations. */
5075 	if ((features & NETIF_F_SG) &&
5076 	    !(features & NETIF_F_ALL_CSUM)) {
5077 		if (name)
5078 			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
5079 			       "checksum feature.\n", name);
5080 		features &= ~NETIF_F_SG;
5081 	}
5082 
5083 	/* TSO requires that SG is present as well. */
5084 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5085 		if (name)
5086 			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
5087 			       "SG feature.\n", name);
5088 		features &= ~NETIF_F_TSO;
5089 	}
5090 
5091 	if (features & NETIF_F_UFO) {
5092 		/* maybe split UFO into V4 and V6? */
5093 		if (!((features & NETIF_F_GEN_CSUM) ||
5094 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5095 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5096 			if (name)
5097 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5098 				       "since no checksum offload features.\n",
5099 				       name);
5100 			features &= ~NETIF_F_UFO;
5101 		}
5102 
5103 		if (!(features & NETIF_F_SG)) {
5104 			if (name)
5105 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5106 				       "since no NETIF_F_SG feature.\n", name);
5107 			features &= ~NETIF_F_UFO;
5108 		}
5109 	}
5110 
5111 	return features;
5112 }
5113 EXPORT_SYMBOL(netdev_fix_features);
5114 
5115 /**
5116  *	netif_stacked_transfer_operstate -	transfer operstate
5117  *	@rootdev: the root or lower level device to transfer state from
5118  *	@dev: the device to transfer operstate to
5119  *
5120  *	Transfer operational state from root to device. This is normally
5121  *	called when a stacking relationship exists between the root
5122  *	device and the device (a leaf device).
5123  */
5124 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5125 					struct net_device *dev)
5126 {
5127 	if (rootdev->operstate == IF_OPER_DORMANT)
5128 		netif_dormant_on(dev);
5129 	else
5130 		netif_dormant_off(dev);
5131 
5132 	if (netif_carrier_ok(rootdev)) {
5133 		if (!netif_carrier_ok(dev))
5134 			netif_carrier_on(dev);
5135 	} else {
5136 		if (netif_carrier_ok(dev))
5137 			netif_carrier_off(dev);
5138 	}
5139 }
5140 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5141 
5142 #ifdef CONFIG_RPS
5143 static int netif_alloc_rx_queues(struct net_device *dev)
5144 {
5145 	unsigned int i, count = dev->num_rx_queues;
5146 	struct netdev_rx_queue *rx;
5147 
5148 	BUG_ON(count < 1);
5149 
5150 	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5151 	if (!rx) {
5152 		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5153 		return -ENOMEM;
5154 	}
5155 	dev->_rx = rx;
5156 
5157 	for (i = 0; i < count; i++)
5158 		rx[i].dev = dev;
5159 	return 0;
5160 }
5161 #endif
5162 
5163 static void netdev_init_one_queue(struct net_device *dev,
5164 				  struct netdev_queue *queue, void *_unused)
5165 {
5166 	/* Initialize queue lock */
5167 	spin_lock_init(&queue->_xmit_lock);
5168 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5169 	queue->xmit_lock_owner = -1;
5170 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5171 	queue->dev = dev;
5172 }
5173 
5174 static int netif_alloc_netdev_queues(struct net_device *dev)
5175 {
5176 	unsigned int count = dev->num_tx_queues;
5177 	struct netdev_queue *tx;
5178 
5179 	BUG_ON(count < 1);
5180 
5181 	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5182 	if (!tx) {
5183 		pr_err("netdev: Unable to allocate %u tx queues.\n",
5184 		       count);
5185 		return -ENOMEM;
5186 	}
5187 	dev->_tx = tx;
5188 
5189 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5190 	spin_lock_init(&dev->tx_global_lock);
5191 
5192 	return 0;
5193 }
5194 
5195 /**
5196  *	register_netdevice	- register a network device
5197  *	@dev: device to register
5198  *
5199  *	Take a completed network device structure and add it to the kernel
5200  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5201  *	chain. 0 is returned on success. A negative errno code is returned
5202  *	on a failure to set up the device, or if the name is a duplicate.
5203  *
5204  *	Callers must hold the rtnl semaphore. You may want
5205  *	register_netdev() instead of this.
5206  *
5207  *	BUGS:
5208  *	The locking appears insufficient to guarantee two parallel registers
5209  *	will not get the same name.
5210  */
5211 
5212 int register_netdevice(struct net_device *dev)
5213 {
5214 	int ret;
5215 	struct net *net = dev_net(dev);
5216 
5217 	BUG_ON(dev_boot_phase);
5218 	ASSERT_RTNL();
5219 
5220 	might_sleep();
5221 
5222 	/* When net_device's are persistent, this will be fatal. */
5223 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5224 	BUG_ON(!net);
5225 
5226 	spin_lock_init(&dev->addr_list_lock);
5227 	netdev_set_addr_lockdep_class(dev);
5228 
5229 	dev->iflink = -1;
5230 
5231 	/* Init, if this function is available */
5232 	if (dev->netdev_ops->ndo_init) {
5233 		ret = dev->netdev_ops->ndo_init(dev);
5234 		if (ret) {
5235 			if (ret > 0)
5236 				ret = -EIO;
5237 			goto out;
5238 		}
5239 	}
5240 
5241 	ret = dev_get_valid_name(dev, dev->name, 0);
5242 	if (ret)
5243 		goto err_uninit;
5244 
5245 	dev->ifindex = dev_new_index(net);
5246 	if (dev->iflink == -1)
5247 		dev->iflink = dev->ifindex;
5248 
5249 	/* Fix illegal checksum combinations */
5250 	if ((dev->features & NETIF_F_HW_CSUM) &&
5251 	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5252 		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5253 		       dev->name);
5254 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5255 	}
5256 
5257 	if ((dev->features & NETIF_F_NO_CSUM) &&
5258 	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5259 		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5260 		       dev->name);
5261 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5262 	}
5263 
5264 	dev->features = netdev_fix_features(dev->features, dev->name);
5265 
5266 	/* Enable software GSO if SG is supported. */
5267 	if (dev->features & NETIF_F_SG)
5268 		dev->features |= NETIF_F_GSO;
5269 
5270 	/* Enable GRO and NETIF_F_HIGHDMA for vlans by default;
5271 	 * vlan_dev_init() will do the dev->features check, so these features
5272 	 * are enabled only if supported by the underlying device.
5273 	 */
5274 	dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5275 
5276 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5277 	ret = notifier_to_errno(ret);
5278 	if (ret)
5279 		goto err_uninit;
5280 
5281 	ret = netdev_register_kobject(dev);
5282 	if (ret)
5283 		goto err_uninit;
5284 	dev->reg_state = NETREG_REGISTERED;
5285 
5286 	/*
5287 	 *	Default initial state at registration is that the
5288 	 *	device is present.
5289 	 */
5290 
5291 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5292 
5293 	dev_init_scheduler(dev);
5294 	dev_hold(dev);
5295 	list_netdevice(dev);
5296 
5297 	/* Notify protocols, that a new device appeared. */
5298 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5299 	ret = notifier_to_errno(ret);
5300 	if (ret) {
5301 		rollback_registered(dev);
5302 		dev->reg_state = NETREG_UNREGISTERED;
5303 	}
5304 	/*
5305 	 *	Prevent userspace races by waiting until the network
5306 	 *	device is fully setup before sending notifications.
5307 	 */
5308 	if (!dev->rtnl_link_ops ||
5309 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5310 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5311 
5312 out:
5313 	return ret;
5314 
5315 err_uninit:
5316 	if (dev->netdev_ops->ndo_uninit)
5317 		dev->netdev_ops->ndo_uninit(dev);
5318 	goto out;
5319 }
5320 EXPORT_SYMBOL(register_netdevice);
5321 
5322 /**
5323  *	init_dummy_netdev	- init a dummy network device for NAPI
5324  *	@dev: device to init
5325  *
5326  *	This takes a network device structure and initialize the minimum
5327  *	amount of fields so it can be used to schedule NAPI polls without
5328  *	registering a full blown interface. This is to be used by drivers
5329  *	that need to tie several hardware interfaces to a single NAPI
5330  *	poll scheduler due to HW limitations.
5331  */
5332 int init_dummy_netdev(struct net_device *dev)
5333 {
5334 	/* Clear everything. Note we don't initialize spinlocks
5335 	 * as they aren't supposed to be taken by any of the
5336 	 * NAPI code and this dummy netdev is supposed to be
5337 	 * only ever used for NAPI polls
5338 	 */
5339 	memset(dev, 0, sizeof(struct net_device));
5340 
5341 	/* make sure we BUG if trying to hit standard
5342 	 * register/unregister code path
5343 	 */
5344 	dev->reg_state = NETREG_DUMMY;
5345 
5346 	/* NAPI wants this */
5347 	INIT_LIST_HEAD(&dev->napi_list);
5348 
5349 	/* a dummy interface is started by default */
5350 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5351 	set_bit(__LINK_STATE_START, &dev->state);
5352 
5353 	/* Note: We don't allocate pcpu_refcnt for dummy devices,
5354 	 * because users of this 'device' don't need to change
5355 	 * its refcount.
5356 	 */
5357 
5358 	return 0;
5359 }
5360 EXPORT_SYMBOL_GPL(init_dummy_netdev);
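/*
 * Usage sketch (illustrative only; "priv" and my_poll() are hypothetical
 * driver names): a driver with several hardware queues but one interrupt
 * can attach its NAPI context to a dummy netdev instead of a real one:
 *
 *	init_dummy_netdev(&priv->dummy_dev);
 *	netif_napi_add(&priv->dummy_dev, &priv->napi, my_poll, 64);
 *	...
 *	napi_enable(&priv->napi);
 *
 * The dummy device must never be passed to register_netdev().
 */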
5361 
5362 
5363 /**
5364  *	register_netdev	- register a network device
5365  *	@dev: device to register
5366  *
5367  *	Take a completed network device structure and add it to the kernel
5368  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5369  *	chain. 0 is returned on success. A negative errno code is returned
5370  *	on a failure to set up the device, or if the name is a duplicate.
5371  *
5372  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5373  *	and expands the device name if you passed a format string to
5374  *	alloc_netdev.
5375  */
5376 int register_netdev(struct net_device *dev)
5377 {
5378 	int err;
5379 
5380 	rtnl_lock();
5381 
5382 	/*
5383 	 * If the name is a format string the caller wants us to do a
5384 	 * name allocation.
5385 	 */
5386 	if (strchr(dev->name, '%')) {
5387 		err = dev_alloc_name(dev, dev->name);
5388 		if (err < 0)
5389 			goto out;
5390 	}
5391 
5392 	err = register_netdevice(dev);
5393 out:
5394 	rtnl_unlock();
5395 	return err;
5396 }
5397 EXPORT_SYMBOL(register_netdev);
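/*
 * Typical driver probe/remove flow (illustrative sketch only, error
 * handling trimmed; "struct my_priv" and my_netdev_ops are hypothetical):
 *
 *	dev = alloc_etherdev(sizeof(struct my_priv));
 *	dev->netdev_ops = &my_netdev_ops;
 *	err = register_netdev(dev);
 *	...
 *	unregister_netdev(dev);
 *	free_netdev(dev);
 */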
5398 
5399 int netdev_refcnt_read(const struct net_device *dev)
5400 {
5401 	int i, refcnt = 0;
5402 
5403 	for_each_possible_cpu(i)
5404 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5405 	return refcnt;
5406 }
5407 EXPORT_SYMBOL(netdev_refcnt_read);
5408 
5409 /*
5410  * netdev_wait_allrefs - wait until all references are gone.
5411  *
5412  * This is called when unregistering network devices.
5413  *
5414  * Any protocol or device that holds a reference should register
5415  * for netdevice notification, and cleanup and put back the
5416  * reference if they receive an UNREGISTER event.
5417  * We can get stuck here if buggy protocols don't correctly
5418  * call dev_put.
5419  */
5420 static void netdev_wait_allrefs(struct net_device *dev)
5421 {
5422 	unsigned long rebroadcast_time, warning_time;
5423 	int refcnt;
5424 
5425 	linkwatch_forget_dev(dev);
5426 
5427 	rebroadcast_time = warning_time = jiffies;
5428 	refcnt = netdev_refcnt_read(dev);
5429 
5430 	while (refcnt != 0) {
5431 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5432 			rtnl_lock();
5433 
5434 			/* Rebroadcast unregister notification */
5435 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5436 			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5437 			 * should have already handled it the first time */
5438 
5439 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5440 				     &dev->state)) {
5441 				/* We must not have linkwatch events
5442 				 * pending on unregister. If this
5443 				 * happens, we simply run the queue
5444 				 * unscheduled, resulting in a noop
5445 				 * for this device.
5446 				 */
5447 				linkwatch_run_queue();
5448 			}
5449 
5450 			__rtnl_unlock();
5451 
5452 			rebroadcast_time = jiffies;
5453 		}
5454 
5455 		msleep(250);
5456 
5457 		refcnt = netdev_refcnt_read(dev);
5458 
5459 		if (time_after(jiffies, warning_time + 10 * HZ)) {
5460 			printk(KERN_EMERG "unregister_netdevice: "
5461 			       "waiting for %s to become free. Usage "
5462 			       "count = %d\n",
5463 			       dev->name, refcnt);
5464 			warning_time = jiffies;
5465 		}
5466 	}
5467 }
5468 
5469 /* The sequence is:
5470  *
5471  *	rtnl_lock();
5472  *	...
5473  *	register_netdevice(x1);
5474  *	register_netdevice(x2);
5475  *	...
5476  *	unregister_netdevice(y1);
5477  *	unregister_netdevice(y2);
5478  *      ...
5479  *	rtnl_unlock();
5480  *	free_netdev(y1);
5481  *	free_netdev(y2);
5482  *
5483  * We are invoked by rtnl_unlock().
5484  * This allows us to deal with problems:
5485  * 1) We can delete sysfs objects which invoke hotplug
5486  *    without deadlocking with linkwatch via keventd.
5487  * 2) Since we run with the RTNL semaphore not held, we can sleep
5488  *    safely in order to wait for the netdev refcnt to drop to zero.
5489  *
5490  * We must not return until all unregister events added during
5491  * the interval the lock was held have been completed.
5492  */
5493 void netdev_run_todo(void)
5494 {
5495 	struct list_head list;
5496 
5497 	/* Snapshot list, allow later requests */
5498 	list_replace_init(&net_todo_list, &list);
5499 
5500 	__rtnl_unlock();
5501 
5502 	while (!list_empty(&list)) {
5503 		struct net_device *dev
5504 			= list_first_entry(&list, struct net_device, todo_list);
5505 		list_del(&dev->todo_list);
5506 
5507 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5508 			printk(KERN_ERR "network todo '%s' but state %d\n",
5509 			       dev->name, dev->reg_state);
5510 			dump_stack();
5511 			continue;
5512 		}
5513 
5514 		dev->reg_state = NETREG_UNREGISTERED;
5515 
5516 		on_each_cpu(flush_backlog, dev, 1);
5517 
5518 		netdev_wait_allrefs(dev);
5519 
5520 		/* paranoia */
5521 		BUG_ON(netdev_refcnt_read(dev));
5522 		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5523 		WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5524 		WARN_ON(dev->dn_ptr);
5525 
5526 		if (dev->destructor)
5527 			dev->destructor(dev);
5528 
5529 		/* Free network device */
5530 		kobject_put(&dev->dev.kobj);
5531 	}
5532 }
5533 
5534 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5535  * fields in the same order, with only the type differing.
5536  */
5537 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5538 				    const struct net_device_stats *netdev_stats)
5539 {
5540 #if BITS_PER_LONG == 64
5541 	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5542 	memcpy(stats64, netdev_stats, sizeof(*stats64));
5543 #else
5544 	size_t i, n = sizeof(*stats64) / sizeof(u64);
5545 	const unsigned long *src = (const unsigned long *)netdev_stats;
5546 	u64 *dst = (u64 *)stats64;
5547 
5548 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5549 		     sizeof(*stats64) / sizeof(u64));
5550 	for (i = 0; i < n; i++)
5551 		dst[i] = src[i];
5552 #endif
5553 }
5554 
5555 /**
5556  *	dev_get_stats	- get network device statistics
5557  *	@dev: device to get statistics from
5558  *	@storage: place to store stats
5559  *
5560  *	Get network statistics from device. Return @storage.
5561  *	The device driver may provide its own method by setting
5562  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5563  *	otherwise the internal statistics structure is used.
5564  */
5565 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5566 					struct rtnl_link_stats64 *storage)
5567 {
5568 	const struct net_device_ops *ops = dev->netdev_ops;
5569 
5570 	if (ops->ndo_get_stats64) {
5571 		memset(storage, 0, sizeof(*storage));
5572 		ops->ndo_get_stats64(dev, storage);
5573 	} else if (ops->ndo_get_stats) {
5574 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5575 	} else {
5576 		netdev_stats_to_stats64(storage, &dev->stats);
5577 	}
5578 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5579 	return storage;
5580 }
5581 EXPORT_SYMBOL(dev_get_stats);
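/*
 * Driver side sketch (illustrative only; my_get_stats64() is hypothetical):
 * a 64-bit aware driver fills the caller supplied structure from its
 * ndo_get_stats64 hook, e.g.
 *
 *	static struct rtnl_link_stats64 *
 *	my_get_stats64(struct net_device *dev,
 *		       struct rtnl_link_stats64 *storage)
 *	{
 *		storage->rx_packets = ...;	read from hardware counters
 *		storage->tx_packets = ...;
 *		return storage;
 *	}
 *
 * Drivers without it fall back to ndo_get_stats() or dev->stats above.
 */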
5582 
5583 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5584 {
5585 	struct netdev_queue *queue = dev_ingress_queue(dev);
5586 
5587 #ifdef CONFIG_NET_CLS_ACT
5588 	if (queue)
5589 		return queue;
5590 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5591 	if (!queue)
5592 		return NULL;
5593 	netdev_init_one_queue(dev, queue, NULL);
5594 	queue->qdisc = &noop_qdisc;
5595 	queue->qdisc_sleeping = &noop_qdisc;
5596 	rcu_assign_pointer(dev->ingress_queue, queue);
5597 #endif
5598 	return queue;
5599 }
5600 
5601 /**
5602  *	alloc_netdev_mqs - allocate network device
5603  *	@sizeof_priv:	size of private data to allocate space for
5604  *	@name:		device name format string
5605  *	@setup:		callback to initialize device
5606  *	@txqs:		the number of TX subqueues to allocate
5607  *	@rxqs:		the number of RX subqueues to allocate
5608  *
5609  *	Allocates a struct net_device with private data area for driver use
5610  *	and performs basic initialization.  Also allocates subqueue structs
5611  *	for each queue on the device.
5612  */
5613 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5614 		void (*setup)(struct net_device *),
5615 		unsigned int txqs, unsigned int rxqs)
5616 {
5617 	struct net_device *dev;
5618 	size_t alloc_size;
5619 	struct net_device *p;
5620 
5621 	BUG_ON(strlen(name) >= sizeof(dev->name));
5622 
5623 	if (txqs < 1) {
5624 		pr_err("alloc_netdev: Unable to allocate device "
5625 		       "with zero queues.\n");
5626 		return NULL;
5627 	}
5628 
5629 #ifdef CONFIG_RPS
5630 	if (rxqs < 1) {
5631 		pr_err("alloc_netdev: Unable to allocate device "
5632 		       "with zero RX queues.\n");
5633 		return NULL;
5634 	}
5635 #endif
5636 
5637 	alloc_size = sizeof(struct net_device);
5638 	if (sizeof_priv) {
5639 		/* ensure 32-byte alignment of private area */
5640 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5641 		alloc_size += sizeof_priv;
5642 	}
5643 	/* ensure 32-byte alignment of whole construct */
5644 	alloc_size += NETDEV_ALIGN - 1;
5645 
5646 	p = kzalloc(alloc_size, GFP_KERNEL);
5647 	if (!p) {
5648 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5649 		return NULL;
5650 	}
5651 
5652 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5653 	dev->padded = (char *)dev - (char *)p;
5654 
5655 	dev->pcpu_refcnt = alloc_percpu(int);
5656 	if (!dev->pcpu_refcnt)
5657 		goto free_p;
5658 
5659 	if (dev_addr_init(dev))
5660 		goto free_pcpu;
5661 
5662 	dev_mc_init(dev);
5663 	dev_uc_init(dev);
5664 
5665 	dev_net_set(dev, &init_net);
5666 
5667 	dev->gso_max_size = GSO_MAX_SIZE;
5668 
5669 	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5670 	dev->ethtool_ntuple_list.count = 0;
5671 	INIT_LIST_HEAD(&dev->napi_list);
5672 	INIT_LIST_HEAD(&dev->unreg_list);
5673 	INIT_LIST_HEAD(&dev->link_watch_list);
5674 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5675 	setup(dev);
5676 
5677 	dev->num_tx_queues = txqs;
5678 	dev->real_num_tx_queues = txqs;
5679 	if (netif_alloc_netdev_queues(dev))
5680 		goto free_all;
5681 
5682 #ifdef CONFIG_RPS
5683 	dev->num_rx_queues = rxqs;
5684 	dev->real_num_rx_queues = rxqs;
5685 	if (netif_alloc_rx_queues(dev))
5686 		goto free_all;
5687 #endif
5688 
5689 	strcpy(dev->name, name);
5690 	return dev;
5691 
5692 free_all:
5693 	free_netdev(dev);
5694 	return NULL;
5695 
5696 free_pcpu:
5697 	free_percpu(dev->pcpu_refcnt);
5698 	kfree(dev->_tx);
5699 #ifdef CONFIG_RPS
5700 	kfree(dev->_rx);
5701 #endif
5702 
5703 free_p:
5704 	kfree(p);
5705 	return NULL;
5706 }
5707 EXPORT_SYMBOL(alloc_netdev_mqs);
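/*
 * Usage sketch (illustrative only; the queue counts and "struct my_priv"
 * are hypothetical): an Ethernet driver wanting four TX and four RX queues
 * could do
 *
 *	dev = alloc_netdev_mqs(sizeof(struct my_priv), "eth%d",
 *			       ether_setup, 4, 4);
 *
 * Single queue callers normally go through the alloc_netdev() /
 * alloc_etherdev() wrappers instead.
 */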
5708 
5709 /**
5710  *	free_netdev - free network device
5711  *	@dev: device
5712  *
5713  *	This function does the last stage of destroying an allocated device
5714  * 	interface. The reference to the device object is released.
5715  *	If this is the last reference then it will be freed.
5716  */
5717 void free_netdev(struct net_device *dev)
5718 {
5719 	struct napi_struct *p, *n;
5720 
5721 	release_net(dev_net(dev));
5722 
5723 	kfree(dev->_tx);
5724 #ifdef CONFIG_RPS
5725 	kfree(dev->_rx);
5726 #endif
5727 
5728 	kfree(rcu_dereference_raw(dev->ingress_queue));
5729 
5730 	/* Flush device addresses */
5731 	dev_addr_flush(dev);
5732 
5733 	/* Clear ethtool n-tuple list */
5734 	ethtool_ntuple_flush(dev);
5735 
5736 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5737 		netif_napi_del(p);
5738 
5739 	free_percpu(dev->pcpu_refcnt);
5740 	dev->pcpu_refcnt = NULL;
5741 
5742 	/*  Compatibility with error handling in drivers */
5743 	if (dev->reg_state == NETREG_UNINITIALIZED) {
5744 		kfree((char *)dev - dev->padded);
5745 		return;
5746 	}
5747 
5748 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5749 	dev->reg_state = NETREG_RELEASED;
5750 
5751 	/* will free via device release */
5752 	put_device(&dev->dev);
5753 }
5754 EXPORT_SYMBOL(free_netdev);
5755 
5756 /**
5757  *	synchronize_net -  Synchronize with packet receive processing
5758  *
5759  *	Wait for packets currently being received to be done.
5760  *	Does not block later packets from starting.
5761  */
5762 void synchronize_net(void)
5763 {
5764 	might_sleep();
5765 	synchronize_rcu();
5766 }
5767 EXPORT_SYMBOL(synchronize_net);
5768 
5769 /**
5770  *	unregister_netdevice_queue - remove device from the kernel
5771  *	@dev: device
5772  *	@head: list
5773  *
5774  *	This function shuts down a device interface and removes it
5775  *	from the kernel tables.
5776  *	If head not NULL, device is queued to be unregistered later.
5777  *
5778  *	Callers must hold the rtnl semaphore.  You may want
5779  *	unregister_netdev() instead of this.
5780  */
5781 
5782 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5783 {
5784 	ASSERT_RTNL();
5785 
5786 	if (head) {
5787 		list_move_tail(&dev->unreg_list, head);
5788 	} else {
5789 		rollback_registered(dev);
5790 		/* Finish processing unregister after unlock */
5791 		net_set_todo(dev);
5792 	}
5793 }
5794 EXPORT_SYMBOL(unregister_netdevice_queue);
5795 
5796 /**
5797  *	unregister_netdevice_many - unregister many devices
5798  *	@head: list of devices
5799  */
5800 void unregister_netdevice_many(struct list_head *head)
5801 {
5802 	struct net_device *dev;
5803 
5804 	if (!list_empty(head)) {
5805 		rollback_registered_many(head);
5806 		list_for_each_entry(dev, head, unreg_list)
5807 			net_set_todo(dev);
5808 	}
5809 }
5810 EXPORT_SYMBOL(unregister_netdevice_many);
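
/*
 * Usage sketch (illustrative): with the RTNL held, several devices can
 * be queued on a local list and torn down in a single batch, which is
 * essentially what default_device_exit_batch() below does.  dev1 and
 * dev2 stand for any registered devices.
 *
 *	LIST_HEAD(list);
 *
 *	rtnl_lock();
 *	unregister_netdevice_queue(dev1, &list);
 *	unregister_netdevice_queue(dev2, &list);
 *	unregister_netdevice_many(&list);
 *	rtnl_unlock();
 */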
5811 
5812 /**
5813  *	unregister_netdev - remove device from the kernel
5814  *	@dev: device
5815  *
5816  *	This function shuts down a device interface and removes it
5817  *	from the kernel tables.
5818  *
5819  *	This is just a wrapper for unregister_netdevice that takes
5820  *	the rtnl semaphore.  In general you want to use this and not
5821  *	unregister_netdevice.
5822  */
5823 void unregister_netdev(struct net_device *dev)
5824 {
5825 	rtnl_lock();
5826 	unregister_netdevice(dev);
5827 	rtnl_unlock();
5828 }
5829 EXPORT_SYMBOL(unregister_netdev);
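
/*
 * Usage sketch (illustrative): a typical driver removal path uses this
 * wrapper so it does not have to take the RTNL itself; foo_dev is a
 * hypothetical device pointer.
 *
 *	static void __exit foo_cleanup(void)
 *	{
 *		unregister_netdev(foo_dev);
 *		free_netdev(foo_dev);
 *	}
 */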
5830 
5831 /**
5832  *	dev_change_net_namespace - move device to a different network namespace
5833  *	@dev: device
5834  *	@net: network namespace
5835  *	@pat: If not NULL, name pattern to try if the current device name
5836  *	      is already taken in the destination network namespace.
5837  *
5838  *	This function shuts down a device interface and moves it
5839  *	to a new network namespace. On success 0 is returned; on
5840  *	failure a negative errno code is returned.
5841  *
5842  *	Callers must hold the rtnl semaphore.
5843  */
5844 
5845 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5846 {
5847 	int err;
5848 
5849 	ASSERT_RTNL();
5850 
5851 	/* Don't allow namespace local devices to be moved. */
5852 	err = -EINVAL;
5853 	if (dev->features & NETIF_F_NETNS_LOCAL)
5854 		goto out;
5855 
5856 	/* Ensure the device has been registered */
5857 	err = -EINVAL;
5858 	if (dev->reg_state != NETREG_REGISTERED)
5859 		goto out;
5860 
5861 	/* Get out if there is nothing to do */
5862 	err = 0;
5863 	if (net_eq(dev_net(dev), net))
5864 		goto out;
5865 
5866 	/* Pick the destination device name, and ensure
5867 	 * we can use it in the destination network namespace.
5868 	 */
5869 	err = -EEXIST;
5870 	if (__dev_get_by_name(net, dev->name)) {
5871 		/* We get here if we can't use the current device name */
5872 		if (!pat)
5873 			goto out;
5874 		if (dev_get_valid_name(dev, pat, 1))
5875 			goto out;
5876 	}
5877 
5878 	/*
5879 	 * And now a mini version of register_netdevice and unregister_netdevice.
5880 	 */
5881 
5882 	/* If the device is running, close it first. */
5883 	dev_close(dev);
5884 
5885 	/* And unlink it from the device chain */
5886 	err = -ENODEV;
5887 	unlist_netdevice(dev);
5888 
5889 	synchronize_net();
5890 
5891 	/* Shutdown queueing discipline. */
5892 	dev_shutdown(dev);
5893 
5894 	/* Notify protocols that we are about to destroy this device.
5895 	   They should clean up all of their state.
5896 
5897 	   Note that dev->reg_state stays at NETREG_REGISTERED.
5898 	   This is deliberate: it lets 8021q and macvlan know that
5899 	   the device is only moving, so they can keep their slaves up.
5900 	*/
5901 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5902 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5903 
5904 	/*
5905 	 *	Flush the unicast and multicast chains
5906 	 */
5907 	dev_uc_flush(dev);
5908 	dev_mc_flush(dev);
5909 
5910 	/* Actually switch the network namespace */
5911 	dev_net_set(dev, net);
5912 
5913 	/* If there is an ifindex conflict, assign a new one */
5914 	if (__dev_get_by_index(net, dev->ifindex)) {
5915 		int iflink = (dev->iflink == dev->ifindex);
5916 		dev->ifindex = dev_new_index(net);
5917 		if (iflink)
5918 			dev->iflink = dev->ifindex;
5919 	}
5920 
5921 	/* Fixup kobjects */
5922 	err = device_rename(&dev->dev, dev->name);
5923 	WARN_ON(err);
5924 
5925 	/* Add the device back in the hashes */
5926 	list_netdevice(dev);
5927 
5928 	/* Notify protocols that a new device appeared. */
5929 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
5930 
5931 	/*
5932 	 *	Prevent userspace races by waiting until the network
5933 	 *	device is fully setup before sending notifications.
5934 	 */
5935 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5936 
5937 	synchronize_net();
5938 	err = 0;
5939 out:
5940 	return err;
5941 }
5942 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
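
/*
 * Usage sketch (illustrative): with the RTNL held, a device can be
 * pushed into another namespace, supplying a fallback name pattern in
 * case its current name is already taken there (compare
 * default_device_exit() below).  target_net is a hypothetical
 * struct net pointer.
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, target_net, "eth%d");
 *	rtnl_unlock();
 */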
5943 
5944 static int dev_cpu_callback(struct notifier_block *nfb,
5945 			    unsigned long action,
5946 			    void *ocpu)
5947 {
5948 	struct sk_buff **list_skb;
5949 	struct sk_buff *skb;
5950 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
5951 	struct softnet_data *sd, *oldsd;
5952 
5953 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5954 		return NOTIFY_OK;
5955 
5956 	local_irq_disable();
5957 	cpu = smp_processor_id();
5958 	sd = &per_cpu(softnet_data, cpu);
5959 	oldsd = &per_cpu(softnet_data, oldcpu);
5960 
5961 	/* Find end of our completion_queue. */
5962 	list_skb = &sd->completion_queue;
5963 	while (*list_skb)
5964 		list_skb = &(*list_skb)->next;
5965 	/* Append completion queue from offline CPU. */
5966 	*list_skb = oldsd->completion_queue;
5967 	oldsd->completion_queue = NULL;
5968 
5969 	/* Append output queue from offline CPU. */
5970 	if (oldsd->output_queue) {
5971 		*sd->output_queue_tailp = oldsd->output_queue;
5972 		sd->output_queue_tailp = oldsd->output_queue_tailp;
5973 		oldsd->output_queue = NULL;
5974 		oldsd->output_queue_tailp = &oldsd->output_queue;
5975 	}
5976 
5977 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
5978 	local_irq_enable();
5979 
5980 	/* Process offline CPU's input_pkt_queue */
5981 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
5982 		netif_rx(skb);
5983 		input_queue_head_incr(oldsd);
5984 	}
5985 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
5986 		netif_rx(skb);
5987 		input_queue_head_incr(oldsd);
5988 	}
5989 
5990 	return NOTIFY_OK;
5991 }
5992 
5993 
5994 /**
5995  *	netdev_increment_features - increment feature set by one
5996  *	@all: current feature set
5997  *	@one: new feature set
5998  *	@mask: mask feature set
5999  *
6000  *	Computes a new feature set after adding a device with feature set
6001  *	@one to the master device with current feature set @all.  Will not
6002  *	enable anything that is off in @mask. Returns the new feature set.
6003  */
6004 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
6005 					unsigned long mask)
6006 {
6007 	/* If device needs checksumming, downgrade to it. */
6008 	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
6009 		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6010 	else if (mask & NETIF_F_ALL_CSUM) {
6011 		/* If one device supports v4/v6 checksumming, set for all. */
6012 		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6013 		    !(all & NETIF_F_GEN_CSUM)) {
6014 			all &= ~NETIF_F_ALL_CSUM;
6015 			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6016 		}
6017 
6018 		/* If one device supports hw checksumming, set for all. */
6019 		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6020 			all &= ~NETIF_F_ALL_CSUM;
6021 			all |= NETIF_F_HW_CSUM;
6022 		}
6023 	}
6024 
6025 	one |= NETIF_F_ALL_CSUM;
6026 
6027 	one |= all & NETIF_F_ONE_FOR_ALL;
6028 	all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6029 	all |= one & mask & NETIF_F_ONE_FOR_ALL;
6030 
6031 	return all;
6032 }
6033 EXPORT_SYMBOL(netdev_increment_features);
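
/*
 * Usage sketch (illustrative): a master driver (bonding-style) might
 * fold each slave's feature set into the one it advertises.  master,
 * slave, slaves and mask are placeholders; real callers choose a mask
 * that suits them.
 *
 *	unsigned long features = master->features;
 *
 *	list_for_each_entry(slave, &slaves, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     mask);
 *	master->features = features;
 */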
6034 
6035 static struct hlist_head *netdev_create_hash(void)
6036 {
6037 	int i;
6038 	struct hlist_head *hash;
6039 
6040 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6041 	if (hash != NULL)
6042 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6043 			INIT_HLIST_HEAD(&hash[i]);
6044 
6045 	return hash;
6046 }
6047 
6048 /* Initialize per network namespace state */
6049 static int __net_init netdev_init(struct net *net)
6050 {
6051 	INIT_LIST_HEAD(&net->dev_base_head);
6052 
6053 	net->dev_name_head = netdev_create_hash();
6054 	if (net->dev_name_head == NULL)
6055 		goto err_name;
6056 
6057 	net->dev_index_head = netdev_create_hash();
6058 	if (net->dev_index_head == NULL)
6059 		goto err_idx;
6060 
6061 	return 0;
6062 
6063 err_idx:
6064 	kfree(net->dev_name_head);
6065 err_name:
6066 	return -ENOMEM;
6067 }
6068 
6069 /**
6070  *	netdev_drivername - network driver for the device
6071  *	@dev: network device
6072  *	@buffer: buffer for resulting name
6073  *	@len: size of buffer
6074  *
6075  *	Determine network driver for device.
6076  */
6077 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6078 {
6079 	const struct device_driver *driver;
6080 	const struct device *parent;
6081 
6082 	if (len <= 0 || !buffer)
6083 		return buffer;
6084 	buffer[0] = 0;
6085 
6086 	parent = dev->dev.parent;
6087 
6088 	if (!parent)
6089 		return buffer;
6090 
6091 	driver = parent->driver;
6092 	if (driver && driver->name)
6093 		strlcpy(buffer, driver->name, len);
6094 	return buffer;
6095 }
6096 
6097 static int __netdev_printk(const char *level, const struct net_device *dev,
6098 			   struct va_format *vaf)
6099 {
6100 	int r;
6101 
6102 	if (dev && dev->dev.parent)
6103 		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6104 			       netdev_name(dev), vaf);
6105 	else if (dev)
6106 		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6107 	else
6108 		r = printk("%s(NULL net_device): %pV", level, vaf);
6109 
6110 	return r;
6111 }
6112 
6113 int netdev_printk(const char *level, const struct net_device *dev,
6114 		  const char *format, ...)
6115 {
6116 	struct va_format vaf;
6117 	va_list args;
6118 	int r;
6119 
6120 	va_start(args, format);
6121 
6122 	vaf.fmt = format;
6123 	vaf.va = &args;
6124 
6125 	r = __netdev_printk(level, dev, &vaf);
6126 	va_end(args);
6127 
6128 	return r;
6129 }
6130 EXPORT_SYMBOL(netdev_printk);
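
/*
 * Usage sketch (illustrative): netdev_printk() takes an explicit level;
 * the generated netdev_err()/netdev_info() helpers below are the more
 * common spelling.  ring and inuse are placeholders.
 *
 *	netdev_printk(KERN_DEBUG, dev, "ring %d: %u descriptors in use\n",
 *		      ring, inuse);
 */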
6131 
6132 #define define_netdev_printk_level(func, level)			\
6133 int func(const struct net_device *dev, const char *fmt, ...)	\
6134 {								\
6135 	int r;							\
6136 	struct va_format vaf;					\
6137 	va_list args;						\
6138 								\
6139 	va_start(args, fmt);					\
6140 								\
6141 	vaf.fmt = fmt;						\
6142 	vaf.va = &args;						\
6143 								\
6144 	r = __netdev_printk(level, dev, &vaf);			\
6145 	va_end(args);						\
6146 								\
6147 	return r;						\
6148 }								\
6149 EXPORT_SYMBOL(func);
6150 
6151 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6152 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6153 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6154 define_netdev_printk_level(netdev_err, KERN_ERR);
6155 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6156 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6157 define_netdev_printk_level(netdev_info, KERN_INFO);
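
/*
 * Usage sketch (illustrative): the helpers prefix each message with
 * driver/device information (via dev_printk() when a parent device is
 * present) and the interface name.  speed and duplex are placeholders.
 *
 *	netdev_info(dev, "link up, %u Mbps, %s duplex\n",
 *		    speed, duplex ? "full" : "half");
 *	netdev_err(dev, "failed to allocate RX buffers\n");
 */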
6158 
6159 static void __net_exit netdev_exit(struct net *net)
6160 {
6161 	kfree(net->dev_name_head);
6162 	kfree(net->dev_index_head);
6163 }
6164 
6165 static struct pernet_operations __net_initdata netdev_net_ops = {
6166 	.init = netdev_init,
6167 	.exit = netdev_exit,
6168 };
6169 
6170 static void __net_exit default_device_exit(struct net *net)
6171 {
6172 	struct net_device *dev, *aux;
6173 	/*
6174 	 * Push all migratable network devices back to the
6175 	 * initial network namespace
6176 	 */
6177 	rtnl_lock();
6178 	for_each_netdev_safe(net, dev, aux) {
6179 		int err;
6180 		char fb_name[IFNAMSIZ];
6181 
6182 		/* Ignore unmovable devices (e.g. loopback) */
6183 		if (dev->features & NETIF_F_NETNS_LOCAL)
6184 			continue;
6185 
6186 		/* Leave virtual devices for the generic cleanup */
6187 		if (dev->rtnl_link_ops)
6188 			continue;
6189 
6190 		/* Push remaining network devices to init_net */
6191 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6192 		err = dev_change_net_namespace(dev, &init_net, fb_name);
6193 		if (err) {
6194 			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6195 				__func__, dev->name, err);
6196 			BUG();
6197 		}
6198 	}
6199 	rtnl_unlock();
6200 }
6201 
6202 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6203 {
6204 	/* At exit all network devices must be removed from a network
6205 	 * namespace.  Do this in the reverse order of registration.
6206 	 * Do this across as many network namespaces as possible to
6207 	 * improve batching efficiency.
6208 	 */
6209 	struct net_device *dev;
6210 	struct net *net;
6211 	LIST_HEAD(dev_kill_list);
6212 
6213 	rtnl_lock();
6214 	list_for_each_entry(net, net_list, exit_list) {
6215 		for_each_netdev_reverse(net, dev) {
6216 			if (dev->rtnl_link_ops)
6217 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6218 			else
6219 				unregister_netdevice_queue(dev, &dev_kill_list);
6220 		}
6221 	}
6222 	unregister_netdevice_many(&dev_kill_list);
6223 	list_del(&dev_kill_list);
6224 	rtnl_unlock();
6225 }
6226 
6227 static struct pernet_operations __net_initdata default_device_ops = {
6228 	.exit = default_device_exit,
6229 	.exit_batch = default_device_exit_batch,
6230 };
6231 
6232 /*
6233  *	Initialize the DEV module. At boot time this walks the device list and
6234  *	unhooks any devices that fail to initialise (normally hardware not
6235  *	present) and leaves us with a valid list of present and active devices.
6236  *
6237  */
6238 
6239 /*
6240  *       This is called single threaded during boot, so no need
6241  *       to take the rtnl semaphore.
6242  */
6243 static int __init net_dev_init(void)
6244 {
6245 	int i, rc = -ENOMEM;
6246 
6247 	BUG_ON(!dev_boot_phase);
6248 
6249 	if (dev_proc_init())
6250 		goto out;
6251 
6252 	if (netdev_kobject_init())
6253 		goto out;
6254 
6255 	INIT_LIST_HEAD(&ptype_all);
6256 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6257 		INIT_LIST_HEAD(&ptype_base[i]);
6258 
6259 	if (register_pernet_subsys(&netdev_net_ops))
6260 		goto out;
6261 
6262 	/*
6263 	 *	Initialise the packet receive queues.
6264 	 */
6265 
6266 	for_each_possible_cpu(i) {
6267 		struct softnet_data *sd = &per_cpu(softnet_data, i);
6268 
6269 		memset(sd, 0, sizeof(*sd));
6270 		skb_queue_head_init(&sd->input_pkt_queue);
6271 		skb_queue_head_init(&sd->process_queue);
6272 		sd->completion_queue = NULL;
6273 		INIT_LIST_HEAD(&sd->poll_list);
6274 		sd->output_queue = NULL;
6275 		sd->output_queue_tailp = &sd->output_queue;
6276 #ifdef CONFIG_RPS
6277 		sd->csd.func = rps_trigger_softirq;
6278 		sd->csd.info = sd;
6279 		sd->csd.flags = 0;
6280 		sd->cpu = i;
6281 #endif
6282 
6283 		sd->backlog.poll = process_backlog;
6284 		sd->backlog.weight = weight_p;
6285 		sd->backlog.gro_list = NULL;
6286 		sd->backlog.gro_count = 0;
6287 	}
6288 
6289 	dev_boot_phase = 0;
6290 
6291 	/* The loopback device is special: if any other network device
6292 	 * is present in a network namespace, the loopback device must
6293 	 * be present too.  Since we now dynamically allocate and free
6294 	 * the loopback device, maintain this invariant by keeping the
6295 	 * loopback device as the first device on the list of network
6296 	 * devices.  This ensures the loopback device is the first
6297 	 * device that appears and the last network device that
6298 	 * disappears.
6299 	 */
6300 	if (register_pernet_device(&loopback_net_ops))
6301 		goto out;
6302 
6303 	if (register_pernet_device(&default_device_ops))
6304 		goto out;
6305 
6306 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6307 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6308 
6309 	hotcpu_notifier(dev_cpu_callback, 0);
6310 	dst_init();
6311 	dev_mcast_init();
6312 	rc = 0;
6313 out:
6314 	return rc;
6315 }
6316 
6317 subsys_initcall(net_dev_init);
6318 
6319 static int __init initialize_hashrnd(void)
6320 {
6321 	get_random_bytes(&hashrnd, sizeof(hashrnd));
6322 	return 0;
6323 }
6324 
6325 late_initcall_sync(initialize_hashrnd);
6326 
6327