xref: /openbmc/linux/net/core/dev.c (revision 1268afe676ee9431a229fc68a2efb0dad4d5852f)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/hash.h>
83 #include <linux/slab.h>
84 #include <linux/sched.h>
85 #include <linux/mutex.h>
86 #include <linux/string.h>
87 #include <linux/mm.h>
88 #include <linux/socket.h>
89 #include <linux/sockios.h>
90 #include <linux/errno.h>
91 #include <linux/interrupt.h>
92 #include <linux/if_ether.h>
93 #include <linux/netdevice.h>
94 #include <linux/etherdevice.h>
95 #include <linux/ethtool.h>
96 #include <linux/notifier.h>
97 #include <linux/skbuff.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/stat.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/kmod.h>
111 #include <linux/module.h>
112 #include <linux/netpoll.h>
113 #include <linux/rcupdate.h>
114 #include <linux/delay.h>
115 #include <net/wext.h>
116 #include <net/iw_handler.h>
117 #include <asm/current.h>
118 #include <linux/audit.h>
119 #include <linux/dmaengine.h>
120 #include <linux/err.h>
121 #include <linux/ctype.h>
122 #include <linux/if_arp.h>
123 #include <linux/if_vlan.h>
124 #include <linux/ip.h>
125 #include <net/ip.h>
126 #include <linux/ipv6.h>
127 #include <linux/in.h>
128 #include <linux/jhash.h>
129 #include <linux/random.h>
130 #include <trace/events/napi.h>
131 #include <trace/events/net.h>
132 #include <trace/events/skb.h>
133 #include <linux/pci.h>
134 #include <linux/inetdevice.h>
135 
136 #include "net-sysfs.h"
137 
138 /* Instead of increasing this, you should create a hash table. */
139 #define MAX_GRO_SKBS 8
140 
141 /* This should be increased if a protocol with a bigger head is added. */
142 #define GRO_MAX_HEAD (MAX_HEADER + 128)
143 
144 /*
145  *	The list of packet types we will receive (as opposed to discard)
146  *	and the routines to invoke.
147  *
148  *	Why 16. Because with 16 the only overlap we get on a hash of the
149  *	low nibble of the protocol value is RARP/SNAP/X.25.
150  *
151  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
152  *             sure which should go first, but I bet it won't make much
153  *             difference if we are running VLANs.  The good news is that
154  *             this protocol won't be in the list unless compiled in, so
155  *             the average user (w/out VLANs) will not be adversely affected.
156  *             --BLG
157  *
158  *		0800	IP
159  *		8100    802.1Q VLAN
160  *		0001	802.3
161  *		0002	AX.25
162  *		0004	802.2
163  *		8035	RARP
164  *		0005	SNAP
165  *		0805	X.25
166  *		0806	ARP
167  *		8137	IPX
168  *		0009	Localtalk
169  *		86DD	IPv6
170  */
171 
172 #define PTYPE_HASH_SIZE	(16)
173 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
174 
175 static DEFINE_SPINLOCK(ptype_lock);
176 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
177 static struct list_head ptype_all __read_mostly;	/* Taps */
178 
179 /*
180  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
181  * semaphore.
182  *
183  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
184  *
185  * Writers must hold the rtnl semaphore while they loop through the
186  * dev_base_head list, and hold dev_base_lock for writing when they do the
187  * actual updates.  This allows pure readers to access the list even
188  * while a writer is preparing to update it.
189  *
190  * To put it another way, dev_base_lock is held for writing only to
191  * protect against pure readers; the rtnl semaphore provides the
192  * protection against other writers.
193  *
194  * See, for example usages, register_netdevice() and
195  * unregister_netdevice(), which must be called with the rtnl
196  * semaphore held.
197  */
198 DEFINE_RWLOCK(dev_base_lock);
199 EXPORT_SYMBOL(dev_base_lock);
200 
201 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
202 {
203 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
204 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
205 }
206 
207 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
208 {
209 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
210 }
211 
212 static inline void rps_lock(struct softnet_data *sd)
213 {
214 #ifdef CONFIG_RPS
215 	spin_lock(&sd->input_pkt_queue.lock);
216 #endif
217 }
218 
219 static inline void rps_unlock(struct softnet_data *sd)
220 {
221 #ifdef CONFIG_RPS
222 	spin_unlock(&sd->input_pkt_queue.lock);
223 #endif
224 }
225 
226 /* Device list insertion */
227 static int list_netdevice(struct net_device *dev)
228 {
229 	struct net *net = dev_net(dev);
230 
231 	ASSERT_RTNL();
232 
233 	write_lock_bh(&dev_base_lock);
234 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
235 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
236 	hlist_add_head_rcu(&dev->index_hlist,
237 			   dev_index_hash(net, dev->ifindex));
238 	write_unlock_bh(&dev_base_lock);
239 	return 0;
240 }
241 
242 /* Device list removal
243  * caller must respect a RCU grace period before freeing/reusing dev
244  */
245 static void unlist_netdevice(struct net_device *dev)
246 {
247 	ASSERT_RTNL();
248 
249 	/* Unlink dev from the device chain */
250 	write_lock_bh(&dev_base_lock);
251 	list_del_rcu(&dev->dev_list);
252 	hlist_del_rcu(&dev->name_hlist);
253 	hlist_del_rcu(&dev->index_hlist);
254 	write_unlock_bh(&dev_base_lock);
255 }
256 
257 /*
258  *	Our notifier list
259  */
260 
261 static RAW_NOTIFIER_HEAD(netdev_chain);
262 
263 /*
264  *	Device drivers call our routines to queue packets here. We empty the
265  *	queue in the local softnet handler.
266  */
267 
268 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
269 EXPORT_PER_CPU_SYMBOL(softnet_data);
270 
271 #ifdef CONFIG_LOCKDEP
272 /*
273  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
274  * according to dev->type
275  */
276 static const unsigned short netdev_lock_type[] =
277 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
278 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
279 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
280 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
281 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
282 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
283 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
284 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
285 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
286 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
287 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
288 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
289 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
290 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
291 	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
292 	 ARPHRD_VOID, ARPHRD_NONE};
293 
294 static const char *const netdev_lock_name[] =
295 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
296 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
297 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
298 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
299 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
300 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
301 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
302 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
303 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
304 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
305 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
306 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
307 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
308 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
309 	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
310 	 "_xmit_VOID", "_xmit_NONE"};
311 
312 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
313 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
314 
315 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
316 {
317 	int i;
318 
319 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
320 		if (netdev_lock_type[i] == dev_type)
321 			return i;
322 	/* the last key is used by default */
323 	return ARRAY_SIZE(netdev_lock_type) - 1;
324 }
325 
326 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
327 						 unsigned short dev_type)
328 {
329 	int i;
330 
331 	i = netdev_lock_pos(dev_type);
332 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
333 				   netdev_lock_name[i]);
334 }
335 
336 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
337 {
338 	int i;
339 
340 	i = netdev_lock_pos(dev->type);
341 	lockdep_set_class_and_name(&dev->addr_list_lock,
342 				   &netdev_addr_lock_key[i],
343 				   netdev_lock_name[i]);
344 }
345 #else
346 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
347 						 unsigned short dev_type)
348 {
349 }
350 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
351 {
352 }
353 #endif
354 
355 /*******************************************************************************
356 
357 		Protocol management and registration routines
358 
359 *******************************************************************************/
360 
361 /*
362  *	Add a protocol ID to the list. Now that the input handler is
363  *	smarter we can dispense with all the messy stuff that used to be
364  *	here.
365  *
366  *	BEWARE!!! Protocol handlers, mangling input packets,
367  *	MUST BE last in hash buckets and checking protocol handlers
368  *	MUST start from promiscuous ptype_all chain in net_bh.
369  *	It is true now, do not change it.
370  *	Explanation follows: if protocol handler, mangling packet, will
371  *	be the first on list, it is not able to sense, that packet
372  *	is cloned and should be copied-on-write, so that it will
373  *	change it and subsequent readers will get broken packet.
374  *							--ANK (980803)
375  */
376 
377 static inline struct list_head *ptype_head(const struct packet_type *pt)
378 {
379 	if (pt->type == htons(ETH_P_ALL))
380 		return &ptype_all;
381 	else
382 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
383 }
384 
385 /**
386  *	dev_add_pack - add packet handler
387  *	@pt: packet type declaration
388  *
389  *	Add a protocol handler to the networking stack. The passed &packet_type
390  *	is linked into kernel lists and may not be freed until it has been
391  *	removed from the kernel lists.
392  *
393  *	This call does not sleep therefore it can not
394  *	guarantee all CPU's that are in middle of receiving packets
395  *	will see the new packet type (until the next received packet).
396  */
397 
398 void dev_add_pack(struct packet_type *pt)
399 {
400 	struct list_head *head = ptype_head(pt);
401 
402 	spin_lock(&ptype_lock);
403 	list_add_rcu(&pt->list, head);
404 	spin_unlock(&ptype_lock);
405 }
406 EXPORT_SYMBOL(dev_add_pack);
407 
408 /**
409  *	__dev_remove_pack	 - remove packet handler
410  *	@pt: packet type declaration
411  *
412  *	Remove a protocol handler that was previously added to the kernel
413  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
414  *	from the kernel lists and can be freed or reused once this function
415  *	returns.
416  *
417  *      The packet type might still be in use by receivers
418  *	and must not be freed until after all the CPU's have gone
419  *	through a quiescent state.
420  */
421 void __dev_remove_pack(struct packet_type *pt)
422 {
423 	struct list_head *head = ptype_head(pt);
424 	struct packet_type *pt1;
425 
426 	spin_lock(&ptype_lock);
427 
428 	list_for_each_entry(pt1, head, list) {
429 		if (pt == pt1) {
430 			list_del_rcu(&pt->list);
431 			goto out;
432 		}
433 	}
434 
435 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
436 out:
437 	spin_unlock(&ptype_lock);
438 }
439 EXPORT_SYMBOL(__dev_remove_pack);
440 
441 /**
442  *	dev_remove_pack	 - remove packet handler
443  *	@pt: packet type declaration
444  *
445  *	Remove a protocol handler that was previously added to the kernel
446  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
447  *	from the kernel lists and can be freed or reused once this function
448  *	returns.
449  *
450  *	This call sleeps to guarantee that no CPU is looking at the packet
451  *	type after return.
452  */
453 void dev_remove_pack(struct packet_type *pt)
454 {
455 	__dev_remove_pack(pt);
456 
457 	synchronize_net();
458 }
459 EXPORT_SYMBOL(dev_remove_pack);
460 
461 /******************************************************************************
462 
463 		      Device Boot-time Settings Routines
464 
465 *******************************************************************************/
466 
467 /* Boot time configuration table */
468 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
469 
470 /**
471  *	netdev_boot_setup_add	- add new setup entry
472  *	@name: name of the device
473  *	@map: configured settings for the device
474  *
475  *	Adds new setup entry to the dev_boot_setup list.  The function
476  *	returns 0 on error and 1 on success.  This is a generic routine to
477  *	all netdevices.
478  */
479 static int netdev_boot_setup_add(char *name, struct ifmap *map)
480 {
481 	struct netdev_boot_setup *s;
482 	int i;
483 
484 	s = dev_boot_setup;
485 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
486 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
487 			memset(s[i].name, 0, sizeof(s[i].name));
488 			strlcpy(s[i].name, name, IFNAMSIZ);
489 			memcpy(&s[i].map, map, sizeof(s[i].map));
490 			break;
491 		}
492 	}
493 
494 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
495 }
496 
497 /**
498  *	netdev_boot_setup_check	- check boot time settings
499  *	@dev: the netdevice
500  *
501  * 	Check boot time settings for the device.
502  *	The found settings are set for the device to be used
503  *	later in the device probing.
504  *	Returns 0 if no settings found, 1 if they are.
505  */
506 int netdev_boot_setup_check(struct net_device *dev)
507 {
508 	struct netdev_boot_setup *s = dev_boot_setup;
509 	int i;
510 
511 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
512 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
513 		    !strcmp(dev->name, s[i].name)) {
514 			dev->irq 	= s[i].map.irq;
515 			dev->base_addr 	= s[i].map.base_addr;
516 			dev->mem_start 	= s[i].map.mem_start;
517 			dev->mem_end 	= s[i].map.mem_end;
518 			return 1;
519 		}
520 	}
521 	return 0;
522 }
523 EXPORT_SYMBOL(netdev_boot_setup_check);
524 
525 
526 /**
527  *	netdev_boot_base	- get address from boot time settings
528  *	@prefix: prefix for network device
529  *	@unit: id for network device
530  *
531  * 	Check boot time settings for the base address of device.
532  *	The found settings are set for the device to be used
533  *	later in the device probing.
534  *	Returns 0 if no settings found.
535  */
536 unsigned long netdev_boot_base(const char *prefix, int unit)
537 {
538 	const struct netdev_boot_setup *s = dev_boot_setup;
539 	char name[IFNAMSIZ];
540 	int i;
541 
542 	sprintf(name, "%s%d", prefix, unit);
543 
544 	/*
545 	 * If device already registered then return base of 1
546 	 * to indicate not to probe for this interface
547 	 */
548 	if (__dev_get_by_name(&init_net, name))
549 		return 1;
550 
551 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
552 		if (!strcmp(name, s[i].name))
553 			return s[i].map.base_addr;
554 	return 0;
555 }
556 
557 /*
558  * Saves at boot time configured settings for any netdevice.
559  */
560 int __init netdev_boot_setup(char *str)
561 {
562 	int ints[5];
563 	struct ifmap map;
564 
565 	str = get_options(str, ARRAY_SIZE(ints), ints);
566 	if (!str || !*str)
567 		return 0;
568 
569 	/* Save settings */
570 	memset(&map, 0, sizeof(map));
571 	if (ints[0] > 0)
572 		map.irq = ints[1];
573 	if (ints[0] > 1)
574 		map.base_addr = ints[2];
575 	if (ints[0] > 2)
576 		map.mem_start = ints[3];
577 	if (ints[0] > 3)
578 		map.mem_end = ints[4];
579 
580 	/* Add new entry to the list */
581 	return netdev_boot_setup_add(str, &map);
582 }
583 
584 __setup("netdev=", netdev_boot_setup);
585 
586 /*******************************************************************************
587 
588 			    Device Interface Subroutines
589 
590 *******************************************************************************/
591 
592 /**
593  *	__dev_get_by_name	- find a device by its name
594  *	@net: the applicable net namespace
595  *	@name: name to find
596  *
597  *	Find an interface by name. Must be called under RTNL semaphore
598  *	or @dev_base_lock. If the name is found a pointer to the device
599  *	is returned. If the name is not found then %NULL is returned. The
600  *	reference counters are not incremented so the caller must be
601  *	careful with locks.
602  */
603 
604 struct net_device *__dev_get_by_name(struct net *net, const char *name)
605 {
606 	struct hlist_node *p;
607 	struct net_device *dev;
608 	struct hlist_head *head = dev_name_hash(net, name);
609 
610 	hlist_for_each_entry(dev, p, head, name_hlist)
611 		if (!strncmp(dev->name, name, IFNAMSIZ))
612 			return dev;
613 
614 	return NULL;
615 }
616 EXPORT_SYMBOL(__dev_get_by_name);
617 
618 /**
619  *	dev_get_by_name_rcu	- find a device by its name
620  *	@net: the applicable net namespace
621  *	@name: name to find
622  *
623  *	Find an interface by name.
624  *	If the name is found a pointer to the device is returned.
625  * 	If the name is not found then %NULL is returned.
626  *	The reference counters are not incremented so the caller must be
627  *	careful with locks. The caller must hold RCU lock.
628  */
629 
630 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
631 {
632 	struct hlist_node *p;
633 	struct net_device *dev;
634 	struct hlist_head *head = dev_name_hash(net, name);
635 
636 	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
637 		if (!strncmp(dev->name, name, IFNAMSIZ))
638 			return dev;
639 
640 	return NULL;
641 }
642 EXPORT_SYMBOL(dev_get_by_name_rcu);
643 
644 /**
645  *	dev_get_by_name		- find a device by its name
646  *	@net: the applicable net namespace
647  *	@name: name to find
648  *
649  *	Find an interface by name. This can be called from any
650  *	context and does its own locking. The returned handle has
651  *	the usage count incremented and the caller must use dev_put() to
652  *	release it when it is no longer needed. %NULL is returned if no
653  *	matching device is found.
654  */
655 
656 struct net_device *dev_get_by_name(struct net *net, const char *name)
657 {
658 	struct net_device *dev;
659 
660 	rcu_read_lock();
661 	dev = dev_get_by_name_rcu(net, name);
662 	if (dev)
663 		dev_hold(dev);
664 	rcu_read_unlock();
665 	return dev;
666 }
667 EXPORT_SYMBOL(dev_get_by_name);
668 
669 /**
670  *	__dev_get_by_index - find a device by its ifindex
671  *	@net: the applicable net namespace
672  *	@ifindex: index of device
673  *
674  *	Search for an interface by index. Returns %NULL if the device
675  *	is not found or a pointer to the device. The device has not
676  *	had its reference counter increased so the caller must be careful
677  *	about locking. The caller must hold either the RTNL semaphore
678  *	or @dev_base_lock.
679  */
680 
681 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
682 {
683 	struct hlist_node *p;
684 	struct net_device *dev;
685 	struct hlist_head *head = dev_index_hash(net, ifindex);
686 
687 	hlist_for_each_entry(dev, p, head, index_hlist)
688 		if (dev->ifindex == ifindex)
689 			return dev;
690 
691 	return NULL;
692 }
693 EXPORT_SYMBOL(__dev_get_by_index);
694 
695 /**
696  *	dev_get_by_index_rcu - find a device by its ifindex
697  *	@net: the applicable net namespace
698  *	@ifindex: index of device
699  *
700  *	Search for an interface by index. Returns %NULL if the device
701  *	is not found or a pointer to the device. The device has not
702  *	had its reference counter increased so the caller must be careful
703  *	about locking. The caller must hold RCU lock.
704  */
705 
706 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
707 {
708 	struct hlist_node *p;
709 	struct net_device *dev;
710 	struct hlist_head *head = dev_index_hash(net, ifindex);
711 
712 	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
713 		if (dev->ifindex == ifindex)
714 			return dev;
715 
716 	return NULL;
717 }
718 EXPORT_SYMBOL(dev_get_by_index_rcu);
719 
720 
721 /**
722  *	dev_get_by_index - find a device by its ifindex
723  *	@net: the applicable net namespace
724  *	@ifindex: index of device
725  *
726  *	Search for an interface by index. Returns NULL if the device
727  *	is not found or a pointer to the device. The device returned has
728  *	had a reference added and the pointer is safe until the user calls
729  *	dev_put to indicate they have finished with it.
730  */
731 
732 struct net_device *dev_get_by_index(struct net *net, int ifindex)
733 {
734 	struct net_device *dev;
735 
736 	rcu_read_lock();
737 	dev = dev_get_by_index_rcu(net, ifindex);
738 	if (dev)
739 		dev_hold(dev);
740 	rcu_read_unlock();
741 	return dev;
742 }
743 EXPORT_SYMBOL(dev_get_by_index);
744 
745 /**
746  *	dev_getbyhwaddr_rcu - find a device by its hardware address
747  *	@net: the applicable net namespace
748  *	@type: media type of device
749  *	@ha: hardware address
750  *
751  *	Search for an interface by MAC address. Returns NULL if the device
752  *	is not found or a pointer to the device. The caller must hold RCU
753  *	The returned device has not had its ref count increased
754  *	and the caller must therefore be careful about locking
755  *
756  */
757 
758 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
759 				       const char *ha)
760 {
761 	struct net_device *dev;
762 
763 	for_each_netdev_rcu(net, dev)
764 		if (dev->type == type &&
765 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
766 			return dev;
767 
768 	return NULL;
769 }
770 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
771 
772 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
773 {
774 	struct net_device *dev;
775 
776 	ASSERT_RTNL();
777 	for_each_netdev(net, dev)
778 		if (dev->type == type)
779 			return dev;
780 
781 	return NULL;
782 }
783 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
784 
785 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
786 {
787 	struct net_device *dev, *ret = NULL;
788 
789 	rcu_read_lock();
790 	for_each_netdev_rcu(net, dev)
791 		if (dev->type == type) {
792 			dev_hold(dev);
793 			ret = dev;
794 			break;
795 		}
796 	rcu_read_unlock();
797 	return ret;
798 }
799 EXPORT_SYMBOL(dev_getfirstbyhwtype);
800 
801 /**
802  *	dev_get_by_flags_rcu - find any device with given flags
803  *	@net: the applicable net namespace
804  *	@if_flags: IFF_* values
805  *	@mask: bitmask of bits in if_flags to check
806  *
807  *	Search for any interface with the given flags. Returns NULL if a device
808  *	is not found or a pointer to the device. Must be called inside
809  *	rcu_read_lock(), and result refcount is unchanged.
810  */
811 
812 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
813 				    unsigned short mask)
814 {
815 	struct net_device *dev, *ret;
816 
817 	ret = NULL;
818 	for_each_netdev_rcu(net, dev) {
819 		if (((dev->flags ^ if_flags) & mask) == 0) {
820 			ret = dev;
821 			break;
822 		}
823 	}
824 	return ret;
825 }
826 EXPORT_SYMBOL(dev_get_by_flags_rcu);
827 
828 /**
829  *	dev_valid_name - check if name is okay for network device
830  *	@name: name string
831  *
832  *	Network device names need to be valid file names to
833  *	to allow sysfs to work.  We also disallow any kind of
834  *	whitespace.
835  */
836 int dev_valid_name(const char *name)
837 {
838 	if (*name == '\0')
839 		return 0;
840 	if (strlen(name) >= IFNAMSIZ)
841 		return 0;
842 	if (!strcmp(name, ".") || !strcmp(name, ".."))
843 		return 0;
844 
845 	while (*name) {
846 		if (*name == '/' || isspace(*name))
847 			return 0;
848 		name++;
849 	}
850 	return 1;
851 }
852 EXPORT_SYMBOL(dev_valid_name);
853 
854 /**
855  *	__dev_alloc_name - allocate a name for a device
856  *	@net: network namespace to allocate the device name in
857  *	@name: name format string
858  *	@buf:  scratch buffer and result name string
859  *
860  *	Passed a format string - eg "lt%d" it will try and find a suitable
861  *	id. It scans list of devices to build up a free map, then chooses
862  *	the first empty slot. The caller must hold the dev_base or rtnl lock
863  *	while allocating the name and adding the device in order to avoid
864  *	duplicates.
865  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
866  *	Returns the number of the unit assigned or a negative errno code.
867  */
868 
869 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
870 {
871 	int i = 0;
872 	const char *p;
873 	const int max_netdevices = 8*PAGE_SIZE;
874 	unsigned long *inuse;
875 	struct net_device *d;
876 
877 	p = strnchr(name, IFNAMSIZ-1, '%');
878 	if (p) {
879 		/*
880 		 * Verify the string as this thing may have come from
881 		 * the user.  There must be either one "%d" and no other "%"
882 		 * characters.
883 		 */
884 		if (p[1] != 'd' || strchr(p + 2, '%'))
885 			return -EINVAL;
886 
887 		/* Use one page as a bit array of possible slots */
888 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
889 		if (!inuse)
890 			return -ENOMEM;
891 
892 		for_each_netdev(net, d) {
893 			if (!sscanf(d->name, name, &i))
894 				continue;
895 			if (i < 0 || i >= max_netdevices)
896 				continue;
897 
898 			/*  avoid cases where sscanf is not exact inverse of printf */
899 			snprintf(buf, IFNAMSIZ, name, i);
900 			if (!strncmp(buf, d->name, IFNAMSIZ))
901 				set_bit(i, inuse);
902 		}
903 
904 		i = find_first_zero_bit(inuse, max_netdevices);
905 		free_page((unsigned long) inuse);
906 	}
907 
908 	if (buf != name)
909 		snprintf(buf, IFNAMSIZ, name, i);
910 	if (!__dev_get_by_name(net, buf))
911 		return i;
912 
913 	/* It is possible to run out of possible slots
914 	 * when the name is long and there isn't enough space left
915 	 * for the digits, or if all bits are used.
916 	 */
917 	return -ENFILE;
918 }
919 
920 /**
921  *	dev_alloc_name - allocate a name for a device
922  *	@dev: device
923  *	@name: name format string
924  *
925  *	Passed a format string - eg "lt%d" it will try and find a suitable
926  *	id. It scans list of devices to build up a free map, then chooses
927  *	the first empty slot. The caller must hold the dev_base or rtnl lock
928  *	while allocating the name and adding the device in order to avoid
929  *	duplicates.
930  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
931  *	Returns the number of the unit assigned or a negative errno code.
932  */
933 
934 int dev_alloc_name(struct net_device *dev, const char *name)
935 {
936 	char buf[IFNAMSIZ];
937 	struct net *net;
938 	int ret;
939 
940 	BUG_ON(!dev_net(dev));
941 	net = dev_net(dev);
942 	ret = __dev_alloc_name(net, name, buf);
943 	if (ret >= 0)
944 		strlcpy(dev->name, buf, IFNAMSIZ);
945 	return ret;
946 }
947 EXPORT_SYMBOL(dev_alloc_name);
948 
949 static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
950 {
951 	struct net *net;
952 
953 	BUG_ON(!dev_net(dev));
954 	net = dev_net(dev);
955 
956 	if (!dev_valid_name(name))
957 		return -EINVAL;
958 
959 	if (fmt && strchr(name, '%'))
960 		return dev_alloc_name(dev, name);
961 	else if (__dev_get_by_name(net, name))
962 		return -EEXIST;
963 	else if (dev->name != name)
964 		strlcpy(dev->name, name, IFNAMSIZ);
965 
966 	return 0;
967 }
968 
969 /**
970  *	dev_change_name - change name of a device
971  *	@dev: device
972  *	@newname: name (or format string) must be at least IFNAMSIZ
973  *
974  *	Change name of a device, can pass format strings "eth%d".
975  *	for wildcarding.
976  */
977 int dev_change_name(struct net_device *dev, const char *newname)
978 {
979 	char oldname[IFNAMSIZ];
980 	int err = 0;
981 	int ret;
982 	struct net *net;
983 
984 	ASSERT_RTNL();
985 	BUG_ON(!dev_net(dev));
986 
987 	net = dev_net(dev);
988 	if (dev->flags & IFF_UP)
989 		return -EBUSY;
990 
991 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
992 		return 0;
993 
994 	memcpy(oldname, dev->name, IFNAMSIZ);
995 
996 	err = dev_get_valid_name(dev, newname, 1);
997 	if (err < 0)
998 		return err;
999 
1000 rollback:
1001 	ret = device_rename(&dev->dev, dev->name);
1002 	if (ret) {
1003 		memcpy(dev->name, oldname, IFNAMSIZ);
1004 		return ret;
1005 	}
1006 
1007 	write_lock_bh(&dev_base_lock);
1008 	hlist_del(&dev->name_hlist);
1009 	write_unlock_bh(&dev_base_lock);
1010 
1011 	synchronize_rcu();
1012 
1013 	write_lock_bh(&dev_base_lock);
1014 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1015 	write_unlock_bh(&dev_base_lock);
1016 
1017 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1018 	ret = notifier_to_errno(ret);
1019 
1020 	if (ret) {
1021 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1022 		if (err >= 0) {
1023 			err = ret;
1024 			memcpy(dev->name, oldname, IFNAMSIZ);
1025 			goto rollback;
1026 		} else {
1027 			printk(KERN_ERR
1028 			       "%s: name change rollback failed: %d.\n",
1029 			       dev->name, ret);
1030 		}
1031 	}
1032 
1033 	return err;
1034 }
1035 
1036 /**
1037  *	dev_set_alias - change ifalias of a device
1038  *	@dev: device
1039  *	@alias: name up to IFALIASZ
1040  *	@len: limit of bytes to copy from info
1041  *
1042  *	Set ifalias for a device,
1043  */
1044 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1045 {
1046 	ASSERT_RTNL();
1047 
1048 	if (len >= IFALIASZ)
1049 		return -EINVAL;
1050 
1051 	if (!len) {
1052 		if (dev->ifalias) {
1053 			kfree(dev->ifalias);
1054 			dev->ifalias = NULL;
1055 		}
1056 		return 0;
1057 	}
1058 
1059 	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1060 	if (!dev->ifalias)
1061 		return -ENOMEM;
1062 
1063 	strlcpy(dev->ifalias, alias, len+1);
1064 	return len;
1065 }
1066 
1067 
1068 /**
1069  *	netdev_features_change - device changes features
1070  *	@dev: device to cause notification
1071  *
1072  *	Called to indicate a device has changed features.
1073  */
1074 void netdev_features_change(struct net_device *dev)
1075 {
1076 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1077 }
1078 EXPORT_SYMBOL(netdev_features_change);
1079 
1080 /**
1081  *	netdev_state_change - device changes state
1082  *	@dev: device to cause notification
1083  *
1084  *	Called to indicate a device has changed state. This function calls
1085  *	the notifier chains for netdev_chain and sends a NEWLINK message
1086  *	to the routing socket.
1087  */
1088 void netdev_state_change(struct net_device *dev)
1089 {
1090 	if (dev->flags & IFF_UP) {
1091 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1092 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1093 	}
1094 }
1095 EXPORT_SYMBOL(netdev_state_change);
1096 
1097 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1098 {
1099 	return call_netdevice_notifiers(event, dev);
1100 }
1101 EXPORT_SYMBOL(netdev_bonding_change);
1102 
1103 /**
1104  *	dev_load 	- load a network module
1105  *	@net: the applicable net namespace
1106  *	@name: name of interface
1107  *
1108  *	If a network interface is not present and the process has suitable
1109  *	privileges this function loads the module. If module loading is not
1110  *	available in this kernel then it becomes a nop.
1111  */
1112 
1113 void dev_load(struct net *net, const char *name)
1114 {
1115 	struct net_device *dev;
1116 
1117 	rcu_read_lock();
1118 	dev = dev_get_by_name_rcu(net, name);
1119 	rcu_read_unlock();
1120 
1121 	if (!dev && capable(CAP_NET_ADMIN))
1122 		request_module("%s", name);
1123 }
1124 EXPORT_SYMBOL(dev_load);
1125 
1126 static int __dev_open(struct net_device *dev)
1127 {
1128 	const struct net_device_ops *ops = dev->netdev_ops;
1129 	int ret;
1130 
1131 	ASSERT_RTNL();
1132 
1133 	/*
1134 	 *	Is it even present?
1135 	 */
1136 	if (!netif_device_present(dev))
1137 		return -ENODEV;
1138 
1139 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1140 	ret = notifier_to_errno(ret);
1141 	if (ret)
1142 		return ret;
1143 
1144 	/*
1145 	 *	Call device private open method
1146 	 */
1147 	set_bit(__LINK_STATE_START, &dev->state);
1148 
1149 	if (ops->ndo_validate_addr)
1150 		ret = ops->ndo_validate_addr(dev);
1151 
1152 	if (!ret && ops->ndo_open)
1153 		ret = ops->ndo_open(dev);
1154 
1155 	/*
1156 	 *	If it went open OK then:
1157 	 */
1158 
1159 	if (ret)
1160 		clear_bit(__LINK_STATE_START, &dev->state);
1161 	else {
1162 		/*
1163 		 *	Set the flags.
1164 		 */
1165 		dev->flags |= IFF_UP;
1166 
1167 		/*
1168 		 *	Enable NET_DMA
1169 		 */
1170 		net_dmaengine_get();
1171 
1172 		/*
1173 		 *	Initialize multicasting status
1174 		 */
1175 		dev_set_rx_mode(dev);
1176 
1177 		/*
1178 		 *	Wakeup transmit queue engine
1179 		 */
1180 		dev_activate(dev);
1181 	}
1182 
1183 	return ret;
1184 }
1185 
1186 /**
1187  *	dev_open	- prepare an interface for use.
1188  *	@dev:	device to open
1189  *
1190  *	Takes a device from down to up state. The device's private open
1191  *	function is invoked and then the multicast lists are loaded. Finally
1192  *	the device is moved into the up state and a %NETDEV_UP message is
1193  *	sent to the netdev notifier chain.
1194  *
1195  *	Calling this function on an active interface is a nop. On a failure
1196  *	a negative errno code is returned.
1197  */
1198 int dev_open(struct net_device *dev)
1199 {
1200 	int ret;
1201 
1202 	/*
1203 	 *	Is it already up?
1204 	 */
1205 	if (dev->flags & IFF_UP)
1206 		return 0;
1207 
1208 	/*
1209 	 *	Open device
1210 	 */
1211 	ret = __dev_open(dev);
1212 	if (ret < 0)
1213 		return ret;
1214 
1215 	/*
1216 	 *	... and announce new interface.
1217 	 */
1218 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1219 	call_netdevice_notifiers(NETDEV_UP, dev);
1220 
1221 	return ret;
1222 }
1223 EXPORT_SYMBOL(dev_open);
1224 
1225 static int __dev_close_many(struct list_head *head)
1226 {
1227 	struct net_device *dev;
1228 
1229 	ASSERT_RTNL();
1230 	might_sleep();
1231 
1232 	list_for_each_entry(dev, head, unreg_list) {
1233 		/*
1234 		 *	Tell people we are going down, so that they can
1235 		 *	prepare to death, when device is still operating.
1236 		 */
1237 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1238 
1239 		clear_bit(__LINK_STATE_START, &dev->state);
1240 
1241 		/* Synchronize to scheduled poll. We cannot touch poll list, it
1242 		 * can be even on different cpu. So just clear netif_running().
1243 		 *
1244 		 * dev->stop() will invoke napi_disable() on all of it's
1245 		 * napi_struct instances on this device.
1246 		 */
1247 		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1248 	}
1249 
1250 	dev_deactivate_many(head);
1251 
1252 	list_for_each_entry(dev, head, unreg_list) {
1253 		const struct net_device_ops *ops = dev->netdev_ops;
1254 
1255 		/*
1256 		 *	Call the device specific close. This cannot fail.
1257 		 *	Only if device is UP
1258 		 *
1259 		 *	We allow it to be called even after a DETACH hot-plug
1260 		 *	event.
1261 		 */
1262 		if (ops->ndo_stop)
1263 			ops->ndo_stop(dev);
1264 
1265 		/*
1266 		 *	Device is now down.
1267 		 */
1268 
1269 		dev->flags &= ~IFF_UP;
1270 
1271 		/*
1272 		 *	Shutdown NET_DMA
1273 		 */
1274 		net_dmaengine_put();
1275 	}
1276 
1277 	return 0;
1278 }
1279 
1280 static int __dev_close(struct net_device *dev)
1281 {
1282 	LIST_HEAD(single);
1283 
1284 	list_add(&dev->unreg_list, &single);
1285 	return __dev_close_many(&single);
1286 }
1287 
1288 int dev_close_many(struct list_head *head)
1289 {
1290 	struct net_device *dev, *tmp;
1291 	LIST_HEAD(tmp_list);
1292 
1293 	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1294 		if (!(dev->flags & IFF_UP))
1295 			list_move(&dev->unreg_list, &tmp_list);
1296 
1297 	__dev_close_many(head);
1298 
1299 	/*
1300 	 * Tell people we are down
1301 	 */
1302 	list_for_each_entry(dev, head, unreg_list) {
1303 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1304 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1305 	}
1306 
1307 	/* rollback_registered_many needs the complete original list */
1308 	list_splice(&tmp_list, head);
1309 	return 0;
1310 }
1311 
1312 /**
1313  *	dev_close - shutdown an interface.
1314  *	@dev: device to shutdown
1315  *
1316  *	This function moves an active device into down state. A
1317  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1318  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1319  *	chain.
1320  */
1321 int dev_close(struct net_device *dev)
1322 {
1323 	LIST_HEAD(single);
1324 
1325 	list_add(&dev->unreg_list, &single);
1326 	dev_close_many(&single);
1327 
1328 	return 0;
1329 }
1330 EXPORT_SYMBOL(dev_close);
1331 
1332 
1333 /**
1334  *	dev_disable_lro - disable Large Receive Offload on a device
1335  *	@dev: device
1336  *
1337  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1338  *	called under RTNL.  This is needed if received packets may be
1339  *	forwarded to another interface.
1340  */
1341 void dev_disable_lro(struct net_device *dev)
1342 {
1343 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1344 	    dev->ethtool_ops->set_flags) {
1345 		u32 flags = dev->ethtool_ops->get_flags(dev);
1346 		if (flags & ETH_FLAG_LRO) {
1347 			flags &= ~ETH_FLAG_LRO;
1348 			dev->ethtool_ops->set_flags(dev, flags);
1349 		}
1350 	}
1351 	WARN_ON(dev->features & NETIF_F_LRO);
1352 }
1353 EXPORT_SYMBOL(dev_disable_lro);
1354 
1355 
1356 static int dev_boot_phase = 1;
1357 
1358 /*
1359  *	Device change register/unregister. These are not inline or static
1360  *	as we export them to the world.
1361  */
1362 
1363 /**
1364  *	register_netdevice_notifier - register a network notifier block
1365  *	@nb: notifier
1366  *
1367  *	Register a notifier to be called when network device events occur.
1368  *	The notifier passed is linked into the kernel structures and must
1369  *	not be reused until it has been unregistered. A negative errno code
1370  *	is returned on a failure.
1371  *
1372  * 	When registered all registration and up events are replayed
1373  *	to the new notifier to allow device to have a race free
1374  *	view of the network device list.
1375  */
1376 
1377 int register_netdevice_notifier(struct notifier_block *nb)
1378 {
1379 	struct net_device *dev;
1380 	struct net_device *last;
1381 	struct net *net;
1382 	int err;
1383 
1384 	rtnl_lock();
1385 	err = raw_notifier_chain_register(&netdev_chain, nb);
1386 	if (err)
1387 		goto unlock;
1388 	if (dev_boot_phase)
1389 		goto unlock;
1390 	for_each_net(net) {
1391 		for_each_netdev(net, dev) {
1392 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1393 			err = notifier_to_errno(err);
1394 			if (err)
1395 				goto rollback;
1396 
1397 			if (!(dev->flags & IFF_UP))
1398 				continue;
1399 
1400 			nb->notifier_call(nb, NETDEV_UP, dev);
1401 		}
1402 	}
1403 
1404 unlock:
1405 	rtnl_unlock();
1406 	return err;
1407 
1408 rollback:
1409 	last = dev;
1410 	for_each_net(net) {
1411 		for_each_netdev(net, dev) {
1412 			if (dev == last)
1413 				break;
1414 
1415 			if (dev->flags & IFF_UP) {
1416 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1417 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1418 			}
1419 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1420 			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1421 		}
1422 	}
1423 
1424 	raw_notifier_chain_unregister(&netdev_chain, nb);
1425 	goto unlock;
1426 }
1427 EXPORT_SYMBOL(register_netdevice_notifier);
1428 
1429 /**
1430  *	unregister_netdevice_notifier - unregister a network notifier block
1431  *	@nb: notifier
1432  *
1433  *	Unregister a notifier previously registered by
1434  *	register_netdevice_notifier(). The notifier is unlinked into the
1435  *	kernel structures and may then be reused. A negative errno code
1436  *	is returned on a failure.
1437  */
1438 
1439 int unregister_netdevice_notifier(struct notifier_block *nb)
1440 {
1441 	int err;
1442 
1443 	rtnl_lock();
1444 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1445 	rtnl_unlock();
1446 	return err;
1447 }
1448 EXPORT_SYMBOL(unregister_netdevice_notifier);
1449 
1450 /**
1451  *	call_netdevice_notifiers - call all network notifier blocks
1452  *      @val: value passed unmodified to notifier function
1453  *      @dev: net_device pointer passed unmodified to notifier function
1454  *
1455  *	Call all network notifier blocks.  Parameters and return value
1456  *	are as for raw_notifier_call_chain().
1457  */
1458 
1459 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1460 {
1461 	ASSERT_RTNL();
1462 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1463 }
1464 
1465 /* When > 0 there are consumers of rx skb time stamps */
1466 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1467 
1468 void net_enable_timestamp(void)
1469 {
1470 	atomic_inc(&netstamp_needed);
1471 }
1472 EXPORT_SYMBOL(net_enable_timestamp);
1473 
1474 void net_disable_timestamp(void)
1475 {
1476 	atomic_dec(&netstamp_needed);
1477 }
1478 EXPORT_SYMBOL(net_disable_timestamp);
1479 
1480 static inline void net_timestamp_set(struct sk_buff *skb)
1481 {
1482 	if (atomic_read(&netstamp_needed))
1483 		__net_timestamp(skb);
1484 	else
1485 		skb->tstamp.tv64 = 0;
1486 }
1487 
1488 static inline void net_timestamp_check(struct sk_buff *skb)
1489 {
1490 	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1491 		__net_timestamp(skb);
1492 }
1493 
1494 /**
1495  * dev_forward_skb - loopback an skb to another netif
1496  *
1497  * @dev: destination network device
1498  * @skb: buffer to forward
1499  *
1500  * return values:
1501  *	NET_RX_SUCCESS	(no congestion)
1502  *	NET_RX_DROP     (packet was dropped, but freed)
1503  *
1504  * dev_forward_skb can be used for injecting an skb from the
1505  * start_xmit function of one device into the receive queue
1506  * of another device.
1507  *
1508  * The receiving device may be in another namespace, so
1509  * we have to clear all information in the skb that could
1510  * impact namespace isolation.
1511  */
1512 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1513 {
1514 	skb_orphan(skb);
1515 	nf_reset(skb);
1516 
1517 	if (unlikely(!(dev->flags & IFF_UP) ||
1518 		     (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN)))) {
1519 		atomic_long_inc(&dev->rx_dropped);
1520 		kfree_skb(skb);
1521 		return NET_RX_DROP;
1522 	}
1523 	skb_set_dev(skb, dev);
1524 	skb->tstamp.tv64 = 0;
1525 	skb->pkt_type = PACKET_HOST;
1526 	skb->protocol = eth_type_trans(skb, dev);
1527 	return netif_rx(skb);
1528 }
1529 EXPORT_SYMBOL_GPL(dev_forward_skb);
1530 
1531 static inline int deliver_skb(struct sk_buff *skb,
1532 			      struct packet_type *pt_prev,
1533 			      struct net_device *orig_dev)
1534 {
1535 	atomic_inc(&skb->users);
1536 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1537 }
1538 
1539 /*
1540  *	Support routine. Sends outgoing frames to any network
1541  *	taps currently in use.
1542  */
1543 
1544 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1545 {
1546 	struct packet_type *ptype;
1547 	struct sk_buff *skb2 = NULL;
1548 	struct packet_type *pt_prev = NULL;
1549 
1550 	rcu_read_lock();
1551 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1552 		/* Never send packets back to the socket
1553 		 * they originated from - MvS (miquels@drinkel.ow.org)
1554 		 */
1555 		if ((ptype->dev == dev || !ptype->dev) &&
1556 		    (ptype->af_packet_priv == NULL ||
1557 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1558 			if (pt_prev) {
1559 				deliver_skb(skb2, pt_prev, skb->dev);
1560 				pt_prev = ptype;
1561 				continue;
1562 			}
1563 
1564 			skb2 = skb_clone(skb, GFP_ATOMIC);
1565 			if (!skb2)
1566 				break;
1567 
1568 			net_timestamp_set(skb2);
1569 
1570 			/* skb->nh should be correctly
1571 			   set by sender, so that the second statement is
1572 			   just protection against buggy protocols.
1573 			 */
1574 			skb_reset_mac_header(skb2);
1575 
1576 			if (skb_network_header(skb2) < skb2->data ||
1577 			    skb2->network_header > skb2->tail) {
1578 				if (net_ratelimit())
1579 					printk(KERN_CRIT "protocol %04x is "
1580 					       "buggy, dev %s\n",
1581 					       ntohs(skb2->protocol),
1582 					       dev->name);
1583 				skb_reset_network_header(skb2);
1584 			}
1585 
1586 			skb2->transport_header = skb2->network_header;
1587 			skb2->pkt_type = PACKET_OUTGOING;
1588 			pt_prev = ptype;
1589 		}
1590 	}
1591 	if (pt_prev)
1592 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1593 	rcu_read_unlock();
1594 }
1595 
1596 /*
1597  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1598  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1599  */
1600 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1601 {
1602 	int rc;
1603 
1604 	if (txq < 1 || txq > dev->num_tx_queues)
1605 		return -EINVAL;
1606 
1607 	if (dev->reg_state == NETREG_REGISTERED) {
1608 		ASSERT_RTNL();
1609 
1610 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1611 						  txq);
1612 		if (rc)
1613 			return rc;
1614 
1615 		if (txq < dev->real_num_tx_queues)
1616 			qdisc_reset_all_tx_gt(dev, txq);
1617 	}
1618 
1619 	dev->real_num_tx_queues = txq;
1620 	return 0;
1621 }
1622 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1623 
1624 #ifdef CONFIG_RPS
1625 /**
1626  *	netif_set_real_num_rx_queues - set actual number of RX queues used
1627  *	@dev: Network device
1628  *	@rxq: Actual number of RX queues
1629  *
1630  *	This must be called either with the rtnl_lock held or before
1631  *	registration of the net device.  Returns 0 on success, or a
1632  *	negative error code.  If called before registration, it always
1633  *	succeeds.
1634  */
1635 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1636 {
1637 	int rc;
1638 
1639 	if (rxq < 1 || rxq > dev->num_rx_queues)
1640 		return -EINVAL;
1641 
1642 	if (dev->reg_state == NETREG_REGISTERED) {
1643 		ASSERT_RTNL();
1644 
1645 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1646 						  rxq);
1647 		if (rc)
1648 			return rc;
1649 	}
1650 
1651 	dev->real_num_rx_queues = rxq;
1652 	return 0;
1653 }
1654 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1655 #endif
1656 
1657 static inline void __netif_reschedule(struct Qdisc *q)
1658 {
1659 	struct softnet_data *sd;
1660 	unsigned long flags;
1661 
1662 	local_irq_save(flags);
1663 	sd = &__get_cpu_var(softnet_data);
1664 	q->next_sched = NULL;
1665 	*sd->output_queue_tailp = q;
1666 	sd->output_queue_tailp = &q->next_sched;
1667 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1668 	local_irq_restore(flags);
1669 }
1670 
1671 void __netif_schedule(struct Qdisc *q)
1672 {
1673 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1674 		__netif_reschedule(q);
1675 }
1676 EXPORT_SYMBOL(__netif_schedule);
1677 
1678 void dev_kfree_skb_irq(struct sk_buff *skb)
1679 {
1680 	if (atomic_dec_and_test(&skb->users)) {
1681 		struct softnet_data *sd;
1682 		unsigned long flags;
1683 
1684 		local_irq_save(flags);
1685 		sd = &__get_cpu_var(softnet_data);
1686 		skb->next = sd->completion_queue;
1687 		sd->completion_queue = skb;
1688 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1689 		local_irq_restore(flags);
1690 	}
1691 }
1692 EXPORT_SYMBOL(dev_kfree_skb_irq);
1693 
1694 void dev_kfree_skb_any(struct sk_buff *skb)
1695 {
1696 	if (in_irq() || irqs_disabled())
1697 		dev_kfree_skb_irq(skb);
1698 	else
1699 		dev_kfree_skb(skb);
1700 }
1701 EXPORT_SYMBOL(dev_kfree_skb_any);
1702 
1703 
1704 /**
1705  * netif_device_detach - mark device as removed
1706  * @dev: network device
1707  *
1708  * Mark device as removed from system and therefore no longer available.
1709  */
1710 void netif_device_detach(struct net_device *dev)
1711 {
1712 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1713 	    netif_running(dev)) {
1714 		netif_tx_stop_all_queues(dev);
1715 	}
1716 }
1717 EXPORT_SYMBOL(netif_device_detach);
1718 
1719 /**
1720  * netif_device_attach - mark device as attached
1721  * @dev: network device
1722  *
1723  * Mark device as attached from system and restart if needed.
1724  */
1725 void netif_device_attach(struct net_device *dev)
1726 {
1727 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1728 	    netif_running(dev)) {
1729 		netif_tx_wake_all_queues(dev);
1730 		__netdev_watchdog_up(dev);
1731 	}
1732 }
1733 EXPORT_SYMBOL(netif_device_attach);
1734 
1735 /**
1736  * skb_dev_set -- assign a new device to a buffer
1737  * @skb: buffer for the new device
1738  * @dev: network device
1739  *
1740  * If an skb is owned by a device already, we have to reset
1741  * all data private to the namespace a device belongs to
1742  * before assigning it a new device.
1743  */
1744 #ifdef CONFIG_NET_NS
1745 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1746 {
1747 	skb_dst_drop(skb);
1748 	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1749 		secpath_reset(skb);
1750 		nf_reset(skb);
1751 		skb_init_secmark(skb);
1752 		skb->mark = 0;
1753 		skb->priority = 0;
1754 		skb->nf_trace = 0;
1755 		skb->ipvs_property = 0;
1756 #ifdef CONFIG_NET_SCHED
1757 		skb->tc_index = 0;
1758 #endif
1759 	}
1760 	skb->dev = dev;
1761 }
1762 EXPORT_SYMBOL(skb_set_dev);
1763 #endif /* CONFIG_NET_NS */
1764 
1765 /*
1766  * Invalidate hardware checksum when packet is to be mangled, and
1767  * complete checksum manually on outgoing path.
1768  */
1769 int skb_checksum_help(struct sk_buff *skb)
1770 {
1771 	__wsum csum;
1772 	int ret = 0, offset;
1773 
1774 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1775 		goto out_set_summed;
1776 
1777 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1778 		/* Let GSO fix up the checksum. */
1779 		goto out_set_summed;
1780 	}
1781 
1782 	offset = skb_checksum_start_offset(skb);
1783 	BUG_ON(offset >= skb_headlen(skb));
1784 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1785 
1786 	offset += skb->csum_offset;
1787 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1788 
1789 	if (skb_cloned(skb) &&
1790 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1791 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1792 		if (ret)
1793 			goto out;
1794 	}
1795 
1796 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1797 out_set_summed:
1798 	skb->ip_summed = CHECKSUM_NONE;
1799 out:
1800 	return ret;
1801 }
1802 EXPORT_SYMBOL(skb_checksum_help);
1803 
1804 /**
1805  *	skb_gso_segment - Perform segmentation on skb.
1806  *	@skb: buffer to segment
1807  *	@features: features for the output path (see dev->features)
1808  *
1809  *	This function segments the given skb and returns a list of segments.
1810  *
1811  *	It may return NULL if the skb requires no segmentation.  This is
1812  *	only possible when GSO is used for verifying header integrity.
1813  */
1814 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1815 {
1816 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1817 	struct packet_type *ptype;
1818 	__be16 type = skb->protocol;
1819 	int vlan_depth = ETH_HLEN;
1820 	int err;
1821 
1822 	while (type == htons(ETH_P_8021Q)) {
1823 		struct vlan_hdr *vh;
1824 
1825 		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1826 			return ERR_PTR(-EINVAL);
1827 
1828 		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1829 		type = vh->h_vlan_encapsulated_proto;
1830 		vlan_depth += VLAN_HLEN;
1831 	}
1832 
1833 	skb_reset_mac_header(skb);
1834 	skb->mac_len = skb->network_header - skb->mac_header;
1835 	__skb_pull(skb, skb->mac_len);
1836 
1837 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1838 		struct net_device *dev = skb->dev;
1839 		struct ethtool_drvinfo info = {};
1840 
1841 		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1842 			dev->ethtool_ops->get_drvinfo(dev, &info);
1843 
1844 		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1845 		     info.driver, dev ? dev->features : 0L,
1846 		     skb->sk ? skb->sk->sk_route_caps : 0L,
1847 		     skb->len, skb->data_len, skb->ip_summed);
1848 
1849 		if (skb_header_cloned(skb) &&
1850 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1851 			return ERR_PTR(err);
1852 	}
1853 
1854 	rcu_read_lock();
1855 	list_for_each_entry_rcu(ptype,
1856 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1857 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1858 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1859 				err = ptype->gso_send_check(skb);
1860 				segs = ERR_PTR(err);
1861 				if (err || skb_gso_ok(skb, features))
1862 					break;
1863 				__skb_push(skb, (skb->data -
1864 						 skb_network_header(skb)));
1865 			}
1866 			segs = ptype->gso_segment(skb, features);
1867 			break;
1868 		}
1869 	}
1870 	rcu_read_unlock();
1871 
1872 	__skb_push(skb, skb->data - skb_mac_header(skb));
1873 
1874 	return segs;
1875 }
1876 EXPORT_SYMBOL(skb_gso_segment);
1877 
1878 /* Take action when hardware reception checksum errors are detected. */
1879 #ifdef CONFIG_BUG
1880 void netdev_rx_csum_fault(struct net_device *dev)
1881 {
1882 	if (net_ratelimit()) {
1883 		printk(KERN_ERR "%s: hw csum failure.\n",
1884 			dev ? dev->name : "<unknown>");
1885 		dump_stack();
1886 	}
1887 }
1888 EXPORT_SYMBOL(netdev_rx_csum_fault);
1889 #endif
1890 
1891 /* Actually, we should eliminate this check as soon as we know, that:
1892  * 1. IOMMU is present and allows to map all the memory.
1893  * 2. No high memory really exists on this machine.
1894  */
1895 
1896 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1897 {
1898 #ifdef CONFIG_HIGHMEM
1899 	int i;
1900 	if (!(dev->features & NETIF_F_HIGHDMA)) {
1901 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1902 			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1903 				return 1;
1904 	}
1905 
1906 	if (PCI_DMA_BUS_IS_PHYS) {
1907 		struct device *pdev = dev->dev.parent;
1908 
1909 		if (!pdev)
1910 			return 0;
1911 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1912 			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1913 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1914 				return 1;
1915 		}
1916 	}
1917 #endif
1918 	return 0;
1919 }
1920 
1921 struct dev_gso_cb {
1922 	void (*destructor)(struct sk_buff *skb);
1923 };
1924 
1925 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1926 
1927 static void dev_gso_skb_destructor(struct sk_buff *skb)
1928 {
1929 	struct dev_gso_cb *cb;
1930 
1931 	do {
1932 		struct sk_buff *nskb = skb->next;
1933 
1934 		skb->next = nskb->next;
1935 		nskb->next = NULL;
1936 		kfree_skb(nskb);
1937 	} while (skb->next);
1938 
1939 	cb = DEV_GSO_CB(skb);
1940 	if (cb->destructor)
1941 		cb->destructor(skb);
1942 }
1943 
1944 /**
1945  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1946  *	@skb: buffer to segment
1947  *	@features: device features as applicable to this skb
1948  *
1949  *	This function segments the given skb and stores the list of segments
1950  *	in skb->next.
1951  */
1952 static int dev_gso_segment(struct sk_buff *skb, int features)
1953 {
1954 	struct sk_buff *segs;
1955 
1956 	segs = skb_gso_segment(skb, features);
1957 
1958 	/* Verifying header integrity only. */
1959 	if (!segs)
1960 		return 0;
1961 
1962 	if (IS_ERR(segs))
1963 		return PTR_ERR(segs);
1964 
1965 	skb->next = segs;
1966 	DEV_GSO_CB(skb)->destructor = skb->destructor;
1967 	skb->destructor = dev_gso_skb_destructor;
1968 
1969 	return 0;
1970 }
1971 
1972 /*
1973  * Try to orphan skb early, right before transmission by the device.
1974  * We cannot orphan skb if tx timestamp is requested or the sk-reference
1975  * is needed on driver level for other reasons, e.g. see net/can/raw.c
1976  */
1977 static inline void skb_orphan_try(struct sk_buff *skb)
1978 {
1979 	struct sock *sk = skb->sk;
1980 
1981 	if (sk && !skb_shinfo(skb)->tx_flags) {
1982 		/* skb_tx_hash() wont be able to get sk.
1983 		 * We copy sk_hash into skb->rxhash
1984 		 */
1985 		if (!skb->rxhash)
1986 			skb->rxhash = sk->sk_hash;
1987 		skb_orphan(skb);
1988 	}
1989 }
1990 
1991 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1992 {
1993 	return ((features & NETIF_F_GEN_CSUM) ||
1994 		((features & NETIF_F_V4_CSUM) &&
1995 		 protocol == htons(ETH_P_IP)) ||
1996 		((features & NETIF_F_V6_CSUM) &&
1997 		 protocol == htons(ETH_P_IPV6)) ||
1998 		((features & NETIF_F_FCOE_CRC) &&
1999 		 protocol == htons(ETH_P_FCOE)));
2000 }
2001 
2002 static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features)
2003 {
2004 	if (!can_checksum_protocol(features, protocol)) {
2005 		features &= ~NETIF_F_ALL_CSUM;
2006 		features &= ~NETIF_F_SG;
2007 	} else if (illegal_highdma(skb->dev, skb)) {
2008 		features &= ~NETIF_F_SG;
2009 	}
2010 
2011 	return features;
2012 }
2013 
2014 int netif_skb_features(struct sk_buff *skb)
2015 {
2016 	__be16 protocol = skb->protocol;
2017 	int features = skb->dev->features;
2018 
2019 	if (protocol == htons(ETH_P_8021Q)) {
2020 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2021 		protocol = veh->h_vlan_encapsulated_proto;
2022 	} else if (!vlan_tx_tag_present(skb)) {
2023 		return harmonize_features(skb, protocol, features);
2024 	}
2025 
2026 	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2027 
2028 	if (protocol != htons(ETH_P_8021Q)) {
2029 		return harmonize_features(skb, protocol, features);
2030 	} else {
2031 		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2032 				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2033 		return harmonize_features(skb, protocol, features);
2034 	}
2035 }
2036 EXPORT_SYMBOL(netif_skb_features);
2037 
2038 /*
2039  * Returns true if either:
2040  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2041  *	2. skb is fragmented and the device does not support SG, or if
2042  *	   at least one of fragments is in highmem and device does not
2043  *	   support DMA from it.
2044  */
2045 static inline int skb_needs_linearize(struct sk_buff *skb,
2046 				      int features)
2047 {
2048 	return skb_is_nonlinear(skb) &&
2049 			((skb_has_frag_list(skb) &&
2050 				!(features & NETIF_F_FRAGLIST)) ||
2051 			(skb_shinfo(skb)->nr_frags &&
2052 				!(features & NETIF_F_SG)));
2053 }
2054 
2055 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2056 			struct netdev_queue *txq)
2057 {
2058 	const struct net_device_ops *ops = dev->netdev_ops;
2059 	int rc = NETDEV_TX_OK;
2060 
2061 	if (likely(!skb->next)) {
2062 		int features;
2063 
2064 		/*
2065 		 * If device doesnt need skb->dst, release it right now while
2066 		 * its hot in this cpu cache
2067 		 */
2068 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2069 			skb_dst_drop(skb);
2070 
2071 		if (!list_empty(&ptype_all))
2072 			dev_queue_xmit_nit(skb, dev);
2073 
2074 		skb_orphan_try(skb);
2075 
2076 		features = netif_skb_features(skb);
2077 
2078 		if (vlan_tx_tag_present(skb) &&
2079 		    !(features & NETIF_F_HW_VLAN_TX)) {
2080 			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2081 			if (unlikely(!skb))
2082 				goto out;
2083 
2084 			skb->vlan_tci = 0;
2085 		}
2086 
2087 		if (netif_needs_gso(skb, features)) {
2088 			if (unlikely(dev_gso_segment(skb, features)))
2089 				goto out_kfree_skb;
2090 			if (skb->next)
2091 				goto gso;
2092 		} else {
2093 			if (skb_needs_linearize(skb, features) &&
2094 			    __skb_linearize(skb))
2095 				goto out_kfree_skb;
2096 
2097 			/* If packet is not checksummed and device does not
2098 			 * support checksumming for this protocol, complete
2099 			 * checksumming here.
2100 			 */
2101 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2102 				skb_set_transport_header(skb,
2103 					skb_checksum_start_offset(skb));
2104 				if (!(features & NETIF_F_ALL_CSUM) &&
2105 				     skb_checksum_help(skb))
2106 					goto out_kfree_skb;
2107 			}
2108 		}
2109 
2110 		rc = ops->ndo_start_xmit(skb, dev);
2111 		trace_net_dev_xmit(skb, rc);
2112 		if (rc == NETDEV_TX_OK)
2113 			txq_trans_update(txq);
2114 		return rc;
2115 	}
2116 
2117 gso:
2118 	do {
2119 		struct sk_buff *nskb = skb->next;
2120 
2121 		skb->next = nskb->next;
2122 		nskb->next = NULL;
2123 
2124 		/*
2125 		 * If device doesnt need nskb->dst, release it right now while
2126 		 * its hot in this cpu cache
2127 		 */
2128 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2129 			skb_dst_drop(nskb);
2130 
2131 		rc = ops->ndo_start_xmit(nskb, dev);
2132 		trace_net_dev_xmit(nskb, rc);
2133 		if (unlikely(rc != NETDEV_TX_OK)) {
2134 			if (rc & ~NETDEV_TX_MASK)
2135 				goto out_kfree_gso_skb;
2136 			nskb->next = skb->next;
2137 			skb->next = nskb;
2138 			return rc;
2139 		}
2140 		txq_trans_update(txq);
2141 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2142 			return NETDEV_TX_BUSY;
2143 	} while (skb->next);
2144 
2145 out_kfree_gso_skb:
2146 	if (likely(skb->next == NULL))
2147 		skb->destructor = DEV_GSO_CB(skb)->destructor;
2148 out_kfree_skb:
2149 	kfree_skb(skb);
2150 out:
2151 	return rc;
2152 }
2153 
2154 static u32 hashrnd __read_mostly;
2155 
2156 /*
2157  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2158  * to be used as a distribution range.
2159  */
2160 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2161 		  unsigned int num_tx_queues)
2162 {
2163 	u32 hash;
2164 
2165 	if (skb_rx_queue_recorded(skb)) {
2166 		hash = skb_get_rx_queue(skb);
2167 		while (unlikely(hash >= num_tx_queues))
2168 			hash -= num_tx_queues;
2169 		return hash;
2170 	}
2171 
2172 	if (skb->sk && skb->sk->sk_hash)
2173 		hash = skb->sk->sk_hash;
2174 	else
2175 		hash = (__force u16) skb->protocol ^ skb->rxhash;
2176 	hash = jhash_1word(hash, hashrnd);
2177 
2178 	return (u16) (((u64) hash * num_tx_queues) >> 32);
2179 }
2180 EXPORT_SYMBOL(__skb_tx_hash);
2181 
2182 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2183 {
2184 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2185 		if (net_ratelimit()) {
2186 			pr_warning("%s selects TX queue %d, but "
2187 				"real number of TX queues is %d\n",
2188 				dev->name, queue_index, dev->real_num_tx_queues);
2189 		}
2190 		return 0;
2191 	}
2192 	return queue_index;
2193 }
2194 
2195 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2196 {
2197 #ifdef CONFIG_XPS
2198 	struct xps_dev_maps *dev_maps;
2199 	struct xps_map *map;
2200 	int queue_index = -1;
2201 
2202 	rcu_read_lock();
2203 	dev_maps = rcu_dereference(dev->xps_maps);
2204 	if (dev_maps) {
2205 		map = rcu_dereference(
2206 		    dev_maps->cpu_map[raw_smp_processor_id()]);
2207 		if (map) {
2208 			if (map->len == 1)
2209 				queue_index = map->queues[0];
2210 			else {
2211 				u32 hash;
2212 				if (skb->sk && skb->sk->sk_hash)
2213 					hash = skb->sk->sk_hash;
2214 				else
2215 					hash = (__force u16) skb->protocol ^
2216 					    skb->rxhash;
2217 				hash = jhash_1word(hash, hashrnd);
2218 				queue_index = map->queues[
2219 				    ((u64)hash * map->len) >> 32];
2220 			}
2221 			if (unlikely(queue_index >= dev->real_num_tx_queues))
2222 				queue_index = -1;
2223 		}
2224 	}
2225 	rcu_read_unlock();
2226 
2227 	return queue_index;
2228 #else
2229 	return -1;
2230 #endif
2231 }
2232 
2233 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2234 					struct sk_buff *skb)
2235 {
2236 	int queue_index;
2237 	const struct net_device_ops *ops = dev->netdev_ops;
2238 
2239 	if (dev->real_num_tx_queues == 1)
2240 		queue_index = 0;
2241 	else if (ops->ndo_select_queue) {
2242 		queue_index = ops->ndo_select_queue(dev, skb);
2243 		queue_index = dev_cap_txqueue(dev, queue_index);
2244 	} else {
2245 		struct sock *sk = skb->sk;
2246 		queue_index = sk_tx_queue_get(sk);
2247 
2248 		if (queue_index < 0 || skb->ooo_okay ||
2249 		    queue_index >= dev->real_num_tx_queues) {
2250 			int old_index = queue_index;
2251 
2252 			queue_index = get_xps_queue(dev, skb);
2253 			if (queue_index < 0)
2254 				queue_index = skb_tx_hash(dev, skb);
2255 
2256 			if (queue_index != old_index && sk) {
2257 				struct dst_entry *dst =
2258 				    rcu_dereference_check(sk->sk_dst_cache, 1);
2259 
2260 				if (dst && skb_dst(skb) == dst)
2261 					sk_tx_queue_set(sk, queue_index);
2262 			}
2263 		}
2264 	}
2265 
2266 	skb_set_queue_mapping(skb, queue_index);
2267 	return netdev_get_tx_queue(dev, queue_index);
2268 }
2269 
2270 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2271 				 struct net_device *dev,
2272 				 struct netdev_queue *txq)
2273 {
2274 	spinlock_t *root_lock = qdisc_lock(q);
2275 	bool contended = qdisc_is_running(q);
2276 	int rc;
2277 
2278 	/*
2279 	 * Heuristic to force contended enqueues to serialize on a
2280 	 * separate lock before trying to get qdisc main lock.
2281 	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2282 	 * and dequeue packets faster.
2283 	 */
2284 	if (unlikely(contended))
2285 		spin_lock(&q->busylock);
2286 
2287 	spin_lock(root_lock);
2288 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2289 		kfree_skb(skb);
2290 		rc = NET_XMIT_DROP;
2291 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2292 		   qdisc_run_begin(q)) {
2293 		/*
2294 		 * This is a work-conserving queue; there are no old skbs
2295 		 * waiting to be sent out; and the qdisc is not running -
2296 		 * xmit the skb directly.
2297 		 */
2298 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2299 			skb_dst_force(skb);
2300 
2301 		qdisc_skb_cb(skb)->pkt_len = skb->len;
2302 		qdisc_bstats_update(q, skb);
2303 
2304 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2305 			if (unlikely(contended)) {
2306 				spin_unlock(&q->busylock);
2307 				contended = false;
2308 			}
2309 			__qdisc_run(q);
2310 		} else
2311 			qdisc_run_end(q);
2312 
2313 		rc = NET_XMIT_SUCCESS;
2314 	} else {
2315 		skb_dst_force(skb);
2316 		rc = qdisc_enqueue_root(skb, q);
2317 		if (qdisc_run_begin(q)) {
2318 			if (unlikely(contended)) {
2319 				spin_unlock(&q->busylock);
2320 				contended = false;
2321 			}
2322 			__qdisc_run(q);
2323 		}
2324 	}
2325 	spin_unlock(root_lock);
2326 	if (unlikely(contended))
2327 		spin_unlock(&q->busylock);
2328 	return rc;
2329 }
2330 
2331 static DEFINE_PER_CPU(int, xmit_recursion);
2332 #define RECURSION_LIMIT 10
2333 
2334 /**
2335  *	dev_queue_xmit - transmit a buffer
2336  *	@skb: buffer to transmit
2337  *
2338  *	Queue a buffer for transmission to a network device. The caller must
2339  *	have set the device and priority and built the buffer before calling
2340  *	this function. The function can be called from an interrupt.
2341  *
2342  *	A negative errno code is returned on a failure. A success does not
2343  *	guarantee the frame will be transmitted as it may be dropped due
2344  *	to congestion or traffic shaping.
2345  *
2346  * -----------------------------------------------------------------------------------
2347  *      I notice this method can also return errors from the queue disciplines,
2348  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2349  *      be positive.
2350  *
2351  *      Regardless of the return value, the skb is consumed, so it is currently
2352  *      difficult to retry a send to this method.  (You can bump the ref count
2353  *      before sending to hold a reference for retry if you are careful.)
2354  *
2355  *      When calling this method, interrupts MUST be enabled.  This is because
2356  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2357  *          --BLG
2358  */
2359 int dev_queue_xmit(struct sk_buff *skb)
2360 {
2361 	struct net_device *dev = skb->dev;
2362 	struct netdev_queue *txq;
2363 	struct Qdisc *q;
2364 	int rc = -ENOMEM;
2365 
2366 	/* Disable soft irqs for various locks below. Also
2367 	 * stops preemption for RCU.
2368 	 */
2369 	rcu_read_lock_bh();
2370 
2371 	txq = dev_pick_tx(dev, skb);
2372 	q = rcu_dereference_bh(txq->qdisc);
2373 
2374 #ifdef CONFIG_NET_CLS_ACT
2375 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2376 #endif
2377 	trace_net_dev_queue(skb);
2378 	if (q->enqueue) {
2379 		rc = __dev_xmit_skb(skb, q, dev, txq);
2380 		goto out;
2381 	}
2382 
2383 	/* The device has no queue. Common case for software devices:
2384 	   loopback, all the sorts of tunnels...
2385 
2386 	   Really, it is unlikely that netif_tx_lock protection is necessary
2387 	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2388 	   counters.)
2389 	   However, it is possible, that they rely on protection
2390 	   made by us here.
2391 
2392 	   Check this and shot the lock. It is not prone from deadlocks.
2393 	   Either shot noqueue qdisc, it is even simpler 8)
2394 	 */
2395 	if (dev->flags & IFF_UP) {
2396 		int cpu = smp_processor_id(); /* ok because BHs are off */
2397 
2398 		if (txq->xmit_lock_owner != cpu) {
2399 
2400 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2401 				goto recursion_alert;
2402 
2403 			HARD_TX_LOCK(dev, txq, cpu);
2404 
2405 			if (!netif_tx_queue_stopped(txq)) {
2406 				__this_cpu_inc(xmit_recursion);
2407 				rc = dev_hard_start_xmit(skb, dev, txq);
2408 				__this_cpu_dec(xmit_recursion);
2409 				if (dev_xmit_complete(rc)) {
2410 					HARD_TX_UNLOCK(dev, txq);
2411 					goto out;
2412 				}
2413 			}
2414 			HARD_TX_UNLOCK(dev, txq);
2415 			if (net_ratelimit())
2416 				printk(KERN_CRIT "Virtual device %s asks to "
2417 				       "queue packet!\n", dev->name);
2418 		} else {
2419 			/* Recursion is detected! It is possible,
2420 			 * unfortunately
2421 			 */
2422 recursion_alert:
2423 			if (net_ratelimit())
2424 				printk(KERN_CRIT "Dead loop on virtual device "
2425 				       "%s, fix it urgently!\n", dev->name);
2426 		}
2427 	}
2428 
2429 	rc = -ENETDOWN;
2430 	rcu_read_unlock_bh();
2431 
2432 	kfree_skb(skb);
2433 	return rc;
2434 out:
2435 	rcu_read_unlock_bh();
2436 	return rc;
2437 }
2438 EXPORT_SYMBOL(dev_queue_xmit);
2439 
2440 
2441 /*=======================================================================
2442 			Receiver routines
2443   =======================================================================*/
2444 
2445 int netdev_max_backlog __read_mostly = 1000;
2446 int netdev_tstamp_prequeue __read_mostly = 1;
2447 int netdev_budget __read_mostly = 300;
2448 int weight_p __read_mostly = 64;            /* old backlog weight */
2449 
2450 /* Called with irq disabled */
2451 static inline void ____napi_schedule(struct softnet_data *sd,
2452 				     struct napi_struct *napi)
2453 {
2454 	list_add_tail(&napi->poll_list, &sd->poll_list);
2455 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2456 }
2457 
2458 /*
2459  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2460  * and src/dst port numbers. Returns a non-zero hash number on success
2461  * and 0 on failure.
2462  */
2463 __u32 __skb_get_rxhash(struct sk_buff *skb)
2464 {
2465 	int nhoff, hash = 0, poff;
2466 	struct ipv6hdr *ip6;
2467 	struct iphdr *ip;
2468 	u8 ip_proto;
2469 	u32 addr1, addr2, ihl;
2470 	union {
2471 		u32 v32;
2472 		u16 v16[2];
2473 	} ports;
2474 
2475 	nhoff = skb_network_offset(skb);
2476 
2477 	switch (skb->protocol) {
2478 	case __constant_htons(ETH_P_IP):
2479 		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2480 			goto done;
2481 
2482 		ip = (struct iphdr *) (skb->data + nhoff);
2483 		if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2484 			ip_proto = 0;
2485 		else
2486 			ip_proto = ip->protocol;
2487 		addr1 = (__force u32) ip->saddr;
2488 		addr2 = (__force u32) ip->daddr;
2489 		ihl = ip->ihl;
2490 		break;
2491 	case __constant_htons(ETH_P_IPV6):
2492 		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2493 			goto done;
2494 
2495 		ip6 = (struct ipv6hdr *) (skb->data + nhoff);
2496 		ip_proto = ip6->nexthdr;
2497 		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2498 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2499 		ihl = (40 >> 2);
2500 		break;
2501 	default:
2502 		goto done;
2503 	}
2504 
2505 	ports.v32 = 0;
2506 	poff = proto_ports_offset(ip_proto);
2507 	if (poff >= 0) {
2508 		nhoff += ihl * 4 + poff;
2509 		if (pskb_may_pull(skb, nhoff + 4)) {
2510 			ports.v32 = * (__force u32 *) (skb->data + nhoff);
2511 			if (ports.v16[1] < ports.v16[0])
2512 				swap(ports.v16[0], ports.v16[1]);
2513 		}
2514 	}
2515 
2516 	/* get a consistent hash (same value on both flow directions) */
2517 	if (addr2 < addr1)
2518 		swap(addr1, addr2);
2519 
2520 	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2521 	if (!hash)
2522 		hash = 1;
2523 
2524 done:
2525 	return hash;
2526 }
2527 EXPORT_SYMBOL(__skb_get_rxhash);
2528 
2529 #ifdef CONFIG_RPS
2530 
2531 /* One global table that all flow-based protocols share. */
2532 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2533 EXPORT_SYMBOL(rps_sock_flow_table);
2534 
2535 /*
2536  * get_rps_cpu is called from netif_receive_skb and returns the target
2537  * CPU from the RPS map of the receiving queue for a given skb.
2538  * rcu_read_lock must be held on entry.
2539  */
2540 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2541 		       struct rps_dev_flow **rflowp)
2542 {
2543 	struct netdev_rx_queue *rxqueue;
2544 	struct rps_map *map;
2545 	struct rps_dev_flow_table *flow_table;
2546 	struct rps_sock_flow_table *sock_flow_table;
2547 	int cpu = -1;
2548 	u16 tcpu;
2549 
2550 	if (skb_rx_queue_recorded(skb)) {
2551 		u16 index = skb_get_rx_queue(skb);
2552 		if (unlikely(index >= dev->real_num_rx_queues)) {
2553 			WARN_ONCE(dev->real_num_rx_queues > 1,
2554 				  "%s received packet on queue %u, but number "
2555 				  "of RX queues is %u\n",
2556 				  dev->name, index, dev->real_num_rx_queues);
2557 			goto done;
2558 		}
2559 		rxqueue = dev->_rx + index;
2560 	} else
2561 		rxqueue = dev->_rx;
2562 
2563 	map = rcu_dereference(rxqueue->rps_map);
2564 	if (map) {
2565 		if (map->len == 1) {
2566 			tcpu = map->cpus[0];
2567 			if (cpu_online(tcpu))
2568 				cpu = tcpu;
2569 			goto done;
2570 		}
2571 	} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2572 		goto done;
2573 	}
2574 
2575 	skb_reset_network_header(skb);
2576 	if (!skb_get_rxhash(skb))
2577 		goto done;
2578 
2579 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2580 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2581 	if (flow_table && sock_flow_table) {
2582 		u16 next_cpu;
2583 		struct rps_dev_flow *rflow;
2584 
2585 		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2586 		tcpu = rflow->cpu;
2587 
2588 		next_cpu = sock_flow_table->ents[skb->rxhash &
2589 		    sock_flow_table->mask];
2590 
2591 		/*
2592 		 * If the desired CPU (where last recvmsg was done) is
2593 		 * different from current CPU (one in the rx-queue flow
2594 		 * table entry), switch if one of the following holds:
2595 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2596 		 *   - Current CPU is offline.
2597 		 *   - The current CPU's queue tail has advanced beyond the
2598 		 *     last packet that was enqueued using this table entry.
2599 		 *     This guarantees that all previous packets for the flow
2600 		 *     have been dequeued, thus preserving in order delivery.
2601 		 */
2602 		if (unlikely(tcpu != next_cpu) &&
2603 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2604 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2605 		      rflow->last_qtail)) >= 0)) {
2606 			tcpu = rflow->cpu = next_cpu;
2607 			if (tcpu != RPS_NO_CPU)
2608 				rflow->last_qtail = per_cpu(softnet_data,
2609 				    tcpu).input_queue_head;
2610 		}
2611 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2612 			*rflowp = rflow;
2613 			cpu = tcpu;
2614 			goto done;
2615 		}
2616 	}
2617 
2618 	if (map) {
2619 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2620 
2621 		if (cpu_online(tcpu)) {
2622 			cpu = tcpu;
2623 			goto done;
2624 		}
2625 	}
2626 
2627 done:
2628 	return cpu;
2629 }
2630 
2631 /* Called from hardirq (IPI) context */
2632 static void rps_trigger_softirq(void *data)
2633 {
2634 	struct softnet_data *sd = data;
2635 
2636 	____napi_schedule(sd, &sd->backlog);
2637 	sd->received_rps++;
2638 }
2639 
2640 #endif /* CONFIG_RPS */
2641 
2642 /*
2643  * Check if this softnet_data structure is another cpu one
2644  * If yes, queue it to our IPI list and return 1
2645  * If no, return 0
2646  */
2647 static int rps_ipi_queued(struct softnet_data *sd)
2648 {
2649 #ifdef CONFIG_RPS
2650 	struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2651 
2652 	if (sd != mysd) {
2653 		sd->rps_ipi_next = mysd->rps_ipi_list;
2654 		mysd->rps_ipi_list = sd;
2655 
2656 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2657 		return 1;
2658 	}
2659 #endif /* CONFIG_RPS */
2660 	return 0;
2661 }
2662 
2663 /*
2664  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2665  * queue (may be a remote CPU queue).
2666  */
2667 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2668 			      unsigned int *qtail)
2669 {
2670 	struct softnet_data *sd;
2671 	unsigned long flags;
2672 
2673 	sd = &per_cpu(softnet_data, cpu);
2674 
2675 	local_irq_save(flags);
2676 
2677 	rps_lock(sd);
2678 	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2679 		if (skb_queue_len(&sd->input_pkt_queue)) {
2680 enqueue:
2681 			__skb_queue_tail(&sd->input_pkt_queue, skb);
2682 			input_queue_tail_incr_save(sd, qtail);
2683 			rps_unlock(sd);
2684 			local_irq_restore(flags);
2685 			return NET_RX_SUCCESS;
2686 		}
2687 
2688 		/* Schedule NAPI for backlog device
2689 		 * We can use non atomic operation since we own the queue lock
2690 		 */
2691 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2692 			if (!rps_ipi_queued(sd))
2693 				____napi_schedule(sd, &sd->backlog);
2694 		}
2695 		goto enqueue;
2696 	}
2697 
2698 	sd->dropped++;
2699 	rps_unlock(sd);
2700 
2701 	local_irq_restore(flags);
2702 
2703 	atomic_long_inc(&skb->dev->rx_dropped);
2704 	kfree_skb(skb);
2705 	return NET_RX_DROP;
2706 }
2707 
2708 /**
2709  *	netif_rx	-	post buffer to the network code
2710  *	@skb: buffer to post
2711  *
2712  *	This function receives a packet from a device driver and queues it for
2713  *	the upper (protocol) levels to process.  It always succeeds. The buffer
2714  *	may be dropped during processing for congestion control or by the
2715  *	protocol layers.
2716  *
2717  *	return values:
2718  *	NET_RX_SUCCESS	(no congestion)
2719  *	NET_RX_DROP     (packet was dropped)
2720  *
2721  */
2722 
2723 int netif_rx(struct sk_buff *skb)
2724 {
2725 	int ret;
2726 
2727 	/* if netpoll wants it, pretend we never saw it */
2728 	if (netpoll_rx(skb))
2729 		return NET_RX_DROP;
2730 
2731 	if (netdev_tstamp_prequeue)
2732 		net_timestamp_check(skb);
2733 
2734 	trace_netif_rx(skb);
2735 #ifdef CONFIG_RPS
2736 	{
2737 		struct rps_dev_flow voidflow, *rflow = &voidflow;
2738 		int cpu;
2739 
2740 		preempt_disable();
2741 		rcu_read_lock();
2742 
2743 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2744 		if (cpu < 0)
2745 			cpu = smp_processor_id();
2746 
2747 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2748 
2749 		rcu_read_unlock();
2750 		preempt_enable();
2751 	}
2752 #else
2753 	{
2754 		unsigned int qtail;
2755 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2756 		put_cpu();
2757 	}
2758 #endif
2759 	return ret;
2760 }
2761 EXPORT_SYMBOL(netif_rx);
2762 
2763 int netif_rx_ni(struct sk_buff *skb)
2764 {
2765 	int err;
2766 
2767 	preempt_disable();
2768 	err = netif_rx(skb);
2769 	if (local_softirq_pending())
2770 		do_softirq();
2771 	preempt_enable();
2772 
2773 	return err;
2774 }
2775 EXPORT_SYMBOL(netif_rx_ni);
2776 
2777 static void net_tx_action(struct softirq_action *h)
2778 {
2779 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2780 
2781 	if (sd->completion_queue) {
2782 		struct sk_buff *clist;
2783 
2784 		local_irq_disable();
2785 		clist = sd->completion_queue;
2786 		sd->completion_queue = NULL;
2787 		local_irq_enable();
2788 
2789 		while (clist) {
2790 			struct sk_buff *skb = clist;
2791 			clist = clist->next;
2792 
2793 			WARN_ON(atomic_read(&skb->users));
2794 			trace_kfree_skb(skb, net_tx_action);
2795 			__kfree_skb(skb);
2796 		}
2797 	}
2798 
2799 	if (sd->output_queue) {
2800 		struct Qdisc *head;
2801 
2802 		local_irq_disable();
2803 		head = sd->output_queue;
2804 		sd->output_queue = NULL;
2805 		sd->output_queue_tailp = &sd->output_queue;
2806 		local_irq_enable();
2807 
2808 		while (head) {
2809 			struct Qdisc *q = head;
2810 			spinlock_t *root_lock;
2811 
2812 			head = head->next_sched;
2813 
2814 			root_lock = qdisc_lock(q);
2815 			if (spin_trylock(root_lock)) {
2816 				smp_mb__before_clear_bit();
2817 				clear_bit(__QDISC_STATE_SCHED,
2818 					  &q->state);
2819 				qdisc_run(q);
2820 				spin_unlock(root_lock);
2821 			} else {
2822 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2823 					      &q->state)) {
2824 					__netif_reschedule(q);
2825 				} else {
2826 					smp_mb__before_clear_bit();
2827 					clear_bit(__QDISC_STATE_SCHED,
2828 						  &q->state);
2829 				}
2830 			}
2831 		}
2832 	}
2833 }
2834 
2835 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2836     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2837 /* This hook is defined here for ATM LANE */
2838 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2839 			     unsigned char *addr) __read_mostly;
2840 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2841 #endif
2842 
2843 #ifdef CONFIG_NET_CLS_ACT
2844 /* TODO: Maybe we should just force sch_ingress to be compiled in
2845  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2846  * a compare and 2 stores extra right now if we dont have it on
2847  * but have CONFIG_NET_CLS_ACT
2848  * NOTE: This doesnt stop any functionality; if you dont have
2849  * the ingress scheduler, you just cant add policies on ingress.
2850  *
2851  */
2852 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
2853 {
2854 	struct net_device *dev = skb->dev;
2855 	u32 ttl = G_TC_RTTL(skb->tc_verd);
2856 	int result = TC_ACT_OK;
2857 	struct Qdisc *q;
2858 
2859 	if (unlikely(MAX_RED_LOOP < ttl++)) {
2860 		if (net_ratelimit())
2861 			pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
2862 			       skb->skb_iif, dev->ifindex);
2863 		return TC_ACT_SHOT;
2864 	}
2865 
2866 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2867 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2868 
2869 	q = rxq->qdisc;
2870 	if (q != &noop_qdisc) {
2871 		spin_lock(qdisc_lock(q));
2872 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2873 			result = qdisc_enqueue_root(skb, q);
2874 		spin_unlock(qdisc_lock(q));
2875 	}
2876 
2877 	return result;
2878 }
2879 
2880 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2881 					 struct packet_type **pt_prev,
2882 					 int *ret, struct net_device *orig_dev)
2883 {
2884 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
2885 
2886 	if (!rxq || rxq->qdisc == &noop_qdisc)
2887 		goto out;
2888 
2889 	if (*pt_prev) {
2890 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2891 		*pt_prev = NULL;
2892 	}
2893 
2894 	switch (ing_filter(skb, rxq)) {
2895 	case TC_ACT_SHOT:
2896 	case TC_ACT_STOLEN:
2897 		kfree_skb(skb);
2898 		return NULL;
2899 	}
2900 
2901 out:
2902 	skb->tc_verd = 0;
2903 	return skb;
2904 }
2905 #endif
2906 
2907 /**
2908  *	netdev_rx_handler_register - register receive handler
2909  *	@dev: device to register a handler for
2910  *	@rx_handler: receive handler to register
2911  *	@rx_handler_data: data pointer that is used by rx handler
2912  *
2913  *	Register a receive hander for a device. This handler will then be
2914  *	called from __netif_receive_skb. A negative errno code is returned
2915  *	on a failure.
2916  *
2917  *	The caller must hold the rtnl_mutex.
2918  */
2919 int netdev_rx_handler_register(struct net_device *dev,
2920 			       rx_handler_func_t *rx_handler,
2921 			       void *rx_handler_data)
2922 {
2923 	ASSERT_RTNL();
2924 
2925 	if (dev->rx_handler)
2926 		return -EBUSY;
2927 
2928 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
2929 	rcu_assign_pointer(dev->rx_handler, rx_handler);
2930 
2931 	return 0;
2932 }
2933 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
2934 
2935 /**
2936  *	netdev_rx_handler_unregister - unregister receive handler
2937  *	@dev: device to unregister a handler from
2938  *
2939  *	Unregister a receive hander from a device.
2940  *
2941  *	The caller must hold the rtnl_mutex.
2942  */
2943 void netdev_rx_handler_unregister(struct net_device *dev)
2944 {
2945 
2946 	ASSERT_RTNL();
2947 	rcu_assign_pointer(dev->rx_handler, NULL);
2948 	rcu_assign_pointer(dev->rx_handler_data, NULL);
2949 }
2950 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2951 
2952 static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2953 					      struct net_device *master)
2954 {
2955 	if (skb->pkt_type == PACKET_HOST) {
2956 		u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2957 
2958 		memcpy(dest, master->dev_addr, ETH_ALEN);
2959 	}
2960 }
2961 
2962 /* On bonding slaves other than the currently active slave, suppress
2963  * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2964  * ARP on active-backup slaves with arp_validate enabled.
2965  */
2966 int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2967 {
2968 	struct net_device *dev = skb->dev;
2969 
2970 	if (master->priv_flags & IFF_MASTER_ARPMON)
2971 		dev->last_rx = jiffies;
2972 
2973 	if ((master->priv_flags & IFF_MASTER_ALB) &&
2974 	    (master->priv_flags & IFF_BRIDGE_PORT)) {
2975 		/* Do address unmangle. The local destination address
2976 		 * will be always the one master has. Provides the right
2977 		 * functionality in a bridge.
2978 		 */
2979 		skb_bond_set_mac_by_master(skb, master);
2980 	}
2981 
2982 	if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2983 		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2984 		    skb->protocol == __cpu_to_be16(ETH_P_ARP))
2985 			return 0;
2986 
2987 		if (master->priv_flags & IFF_MASTER_ALB) {
2988 			if (skb->pkt_type != PACKET_BROADCAST &&
2989 			    skb->pkt_type != PACKET_MULTICAST)
2990 				return 0;
2991 		}
2992 		if (master->priv_flags & IFF_MASTER_8023AD &&
2993 		    skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2994 			return 0;
2995 
2996 		return 1;
2997 	}
2998 	return 0;
2999 }
3000 EXPORT_SYMBOL(__skb_bond_should_drop);
3001 
3002 static int __netif_receive_skb(struct sk_buff *skb)
3003 {
3004 	struct packet_type *ptype, *pt_prev;
3005 	rx_handler_func_t *rx_handler;
3006 	struct net_device *orig_dev;
3007 	struct net_device *master;
3008 	struct net_device *null_or_orig;
3009 	struct net_device *orig_or_bond;
3010 	int ret = NET_RX_DROP;
3011 	__be16 type;
3012 
3013 	if (!netdev_tstamp_prequeue)
3014 		net_timestamp_check(skb);
3015 
3016 	trace_netif_receive_skb(skb);
3017 
3018 	/* if we've gotten here through NAPI, check netpoll */
3019 	if (netpoll_receive_skb(skb))
3020 		return NET_RX_DROP;
3021 
3022 	if (!skb->skb_iif)
3023 		skb->skb_iif = skb->dev->ifindex;
3024 
3025 	/*
3026 	 * bonding note: skbs received on inactive slaves should only
3027 	 * be delivered to pkt handlers that are exact matches.  Also
3028 	 * the deliver_no_wcard flag will be set.  If packet handlers
3029 	 * are sensitive to duplicate packets these skbs will need to
3030 	 * be dropped at the handler.
3031 	 */
3032 	null_or_orig = NULL;
3033 	orig_dev = skb->dev;
3034 	master = ACCESS_ONCE(orig_dev->master);
3035 	if (skb->deliver_no_wcard)
3036 		null_or_orig = orig_dev;
3037 	else if (master) {
3038 		if (skb_bond_should_drop(skb, master)) {
3039 			skb->deliver_no_wcard = 1;
3040 			null_or_orig = orig_dev; /* deliver only exact match */
3041 		} else
3042 			skb->dev = master;
3043 	}
3044 
3045 	__this_cpu_inc(softnet_data.processed);
3046 	skb_reset_network_header(skb);
3047 	skb_reset_transport_header(skb);
3048 	skb->mac_len = skb->network_header - skb->mac_header;
3049 
3050 	pt_prev = NULL;
3051 
3052 	rcu_read_lock();
3053 
3054 #ifdef CONFIG_NET_CLS_ACT
3055 	if (skb->tc_verd & TC_NCLS) {
3056 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3057 		goto ncls;
3058 	}
3059 #endif
3060 
3061 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3062 		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
3063 		    ptype->dev == orig_dev) {
3064 			if (pt_prev)
3065 				ret = deliver_skb(skb, pt_prev, orig_dev);
3066 			pt_prev = ptype;
3067 		}
3068 	}
3069 
3070 #ifdef CONFIG_NET_CLS_ACT
3071 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3072 	if (!skb)
3073 		goto out;
3074 ncls:
3075 #endif
3076 
3077 	/* Handle special case of bridge or macvlan */
3078 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3079 	if (rx_handler) {
3080 		if (pt_prev) {
3081 			ret = deliver_skb(skb, pt_prev, orig_dev);
3082 			pt_prev = NULL;
3083 		}
3084 		skb = rx_handler(skb);
3085 		if (!skb)
3086 			goto out;
3087 	}
3088 
3089 	if (vlan_tx_tag_present(skb)) {
3090 		if (pt_prev) {
3091 			ret = deliver_skb(skb, pt_prev, orig_dev);
3092 			pt_prev = NULL;
3093 		}
3094 		if (vlan_hwaccel_do_receive(&skb)) {
3095 			ret = __netif_receive_skb(skb);
3096 			goto out;
3097 		} else if (unlikely(!skb))
3098 			goto out;
3099 	}
3100 
3101 	/*
3102 	 * Make sure frames received on VLAN interfaces stacked on
3103 	 * bonding interfaces still make their way to any base bonding
3104 	 * device that may have registered for a specific ptype.  The
3105 	 * handler may have to adjust skb->dev and orig_dev.
3106 	 */
3107 	orig_or_bond = orig_dev;
3108 	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
3109 	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
3110 		orig_or_bond = vlan_dev_real_dev(skb->dev);
3111 	}
3112 
3113 	type = skb->protocol;
3114 	list_for_each_entry_rcu(ptype,
3115 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3116 		if (ptype->type == type && (ptype->dev == null_or_orig ||
3117 		     ptype->dev == skb->dev || ptype->dev == orig_dev ||
3118 		     ptype->dev == orig_or_bond)) {
3119 			if (pt_prev)
3120 				ret = deliver_skb(skb, pt_prev, orig_dev);
3121 			pt_prev = ptype;
3122 		}
3123 	}
3124 
3125 	if (pt_prev) {
3126 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3127 	} else {
3128 		atomic_long_inc(&skb->dev->rx_dropped);
3129 		kfree_skb(skb);
3130 		/* Jamal, now you will not able to escape explaining
3131 		 * me how you were going to use this. :-)
3132 		 */
3133 		ret = NET_RX_DROP;
3134 	}
3135 
3136 out:
3137 	rcu_read_unlock();
3138 	return ret;
3139 }
3140 
3141 /**
3142  *	netif_receive_skb - process receive buffer from network
3143  *	@skb: buffer to process
3144  *
3145  *	netif_receive_skb() is the main receive data processing function.
3146  *	It always succeeds. The buffer may be dropped during processing
3147  *	for congestion control or by the protocol layers.
3148  *
3149  *	This function may only be called from softirq context and interrupts
3150  *	should be enabled.
3151  *
3152  *	Return values (usually ignored):
3153  *	NET_RX_SUCCESS: no congestion
3154  *	NET_RX_DROP: packet was dropped
3155  */
3156 int netif_receive_skb(struct sk_buff *skb)
3157 {
3158 	if (netdev_tstamp_prequeue)
3159 		net_timestamp_check(skb);
3160 
3161 	if (skb_defer_rx_timestamp(skb))
3162 		return NET_RX_SUCCESS;
3163 
3164 #ifdef CONFIG_RPS
3165 	{
3166 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3167 		int cpu, ret;
3168 
3169 		rcu_read_lock();
3170 
3171 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3172 
3173 		if (cpu >= 0) {
3174 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3175 			rcu_read_unlock();
3176 		} else {
3177 			rcu_read_unlock();
3178 			ret = __netif_receive_skb(skb);
3179 		}
3180 
3181 		return ret;
3182 	}
3183 #else
3184 	return __netif_receive_skb(skb);
3185 #endif
3186 }
3187 EXPORT_SYMBOL(netif_receive_skb);
3188 
3189 /* Network device is going away, flush any packets still pending
3190  * Called with irqs disabled.
3191  */
3192 static void flush_backlog(void *arg)
3193 {
3194 	struct net_device *dev = arg;
3195 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3196 	struct sk_buff *skb, *tmp;
3197 
3198 	rps_lock(sd);
3199 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3200 		if (skb->dev == dev) {
3201 			__skb_unlink(skb, &sd->input_pkt_queue);
3202 			kfree_skb(skb);
3203 			input_queue_head_incr(sd);
3204 		}
3205 	}
3206 	rps_unlock(sd);
3207 
3208 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3209 		if (skb->dev == dev) {
3210 			__skb_unlink(skb, &sd->process_queue);
3211 			kfree_skb(skb);
3212 			input_queue_head_incr(sd);
3213 		}
3214 	}
3215 }
3216 
3217 static int napi_gro_complete(struct sk_buff *skb)
3218 {
3219 	struct packet_type *ptype;
3220 	__be16 type = skb->protocol;
3221 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3222 	int err = -ENOENT;
3223 
3224 	if (NAPI_GRO_CB(skb)->count == 1) {
3225 		skb_shinfo(skb)->gso_size = 0;
3226 		goto out;
3227 	}
3228 
3229 	rcu_read_lock();
3230 	list_for_each_entry_rcu(ptype, head, list) {
3231 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3232 			continue;
3233 
3234 		err = ptype->gro_complete(skb);
3235 		break;
3236 	}
3237 	rcu_read_unlock();
3238 
3239 	if (err) {
3240 		WARN_ON(&ptype->list == head);
3241 		kfree_skb(skb);
3242 		return NET_RX_SUCCESS;
3243 	}
3244 
3245 out:
3246 	return netif_receive_skb(skb);
3247 }
3248 
3249 inline void napi_gro_flush(struct napi_struct *napi)
3250 {
3251 	struct sk_buff *skb, *next;
3252 
3253 	for (skb = napi->gro_list; skb; skb = next) {
3254 		next = skb->next;
3255 		skb->next = NULL;
3256 		napi_gro_complete(skb);
3257 	}
3258 
3259 	napi->gro_count = 0;
3260 	napi->gro_list = NULL;
3261 }
3262 EXPORT_SYMBOL(napi_gro_flush);
3263 
3264 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3265 {
3266 	struct sk_buff **pp = NULL;
3267 	struct packet_type *ptype;
3268 	__be16 type = skb->protocol;
3269 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3270 	int same_flow;
3271 	int mac_len;
3272 	enum gro_result ret;
3273 
3274 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3275 		goto normal;
3276 
3277 	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3278 		goto normal;
3279 
3280 	rcu_read_lock();
3281 	list_for_each_entry_rcu(ptype, head, list) {
3282 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3283 			continue;
3284 
3285 		skb_set_network_header(skb, skb_gro_offset(skb));
3286 		mac_len = skb->network_header - skb->mac_header;
3287 		skb->mac_len = mac_len;
3288 		NAPI_GRO_CB(skb)->same_flow = 0;
3289 		NAPI_GRO_CB(skb)->flush = 0;
3290 		NAPI_GRO_CB(skb)->free = 0;
3291 
3292 		pp = ptype->gro_receive(&napi->gro_list, skb);
3293 		break;
3294 	}
3295 	rcu_read_unlock();
3296 
3297 	if (&ptype->list == head)
3298 		goto normal;
3299 
3300 	same_flow = NAPI_GRO_CB(skb)->same_flow;
3301 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3302 
3303 	if (pp) {
3304 		struct sk_buff *nskb = *pp;
3305 
3306 		*pp = nskb->next;
3307 		nskb->next = NULL;
3308 		napi_gro_complete(nskb);
3309 		napi->gro_count--;
3310 	}
3311 
3312 	if (same_flow)
3313 		goto ok;
3314 
3315 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3316 		goto normal;
3317 
3318 	napi->gro_count++;
3319 	NAPI_GRO_CB(skb)->count = 1;
3320 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3321 	skb->next = napi->gro_list;
3322 	napi->gro_list = skb;
3323 	ret = GRO_HELD;
3324 
3325 pull:
3326 	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3327 		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3328 
3329 		BUG_ON(skb->end - skb->tail < grow);
3330 
3331 		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3332 
3333 		skb->tail += grow;
3334 		skb->data_len -= grow;
3335 
3336 		skb_shinfo(skb)->frags[0].page_offset += grow;
3337 		skb_shinfo(skb)->frags[0].size -= grow;
3338 
3339 		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3340 			put_page(skb_shinfo(skb)->frags[0].page);
3341 			memmove(skb_shinfo(skb)->frags,
3342 				skb_shinfo(skb)->frags + 1,
3343 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3344 		}
3345 	}
3346 
3347 ok:
3348 	return ret;
3349 
3350 normal:
3351 	ret = GRO_NORMAL;
3352 	goto pull;
3353 }
3354 EXPORT_SYMBOL(dev_gro_receive);
3355 
3356 static inline gro_result_t
3357 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3358 {
3359 	struct sk_buff *p;
3360 
3361 	for (p = napi->gro_list; p; p = p->next) {
3362 		unsigned long diffs;
3363 
3364 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3365 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3366 		diffs |= compare_ether_header(skb_mac_header(p),
3367 					      skb_gro_mac_header(skb));
3368 		NAPI_GRO_CB(p)->same_flow = !diffs;
3369 		NAPI_GRO_CB(p)->flush = 0;
3370 	}
3371 
3372 	return dev_gro_receive(napi, skb);
3373 }
3374 
3375 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3376 {
3377 	switch (ret) {
3378 	case GRO_NORMAL:
3379 		if (netif_receive_skb(skb))
3380 			ret = GRO_DROP;
3381 		break;
3382 
3383 	case GRO_DROP:
3384 	case GRO_MERGED_FREE:
3385 		kfree_skb(skb);
3386 		break;
3387 
3388 	case GRO_HELD:
3389 	case GRO_MERGED:
3390 		break;
3391 	}
3392 
3393 	return ret;
3394 }
3395 EXPORT_SYMBOL(napi_skb_finish);
3396 
3397 void skb_gro_reset_offset(struct sk_buff *skb)
3398 {
3399 	NAPI_GRO_CB(skb)->data_offset = 0;
3400 	NAPI_GRO_CB(skb)->frag0 = NULL;
3401 	NAPI_GRO_CB(skb)->frag0_len = 0;
3402 
3403 	if (skb->mac_header == skb->tail &&
3404 	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3405 		NAPI_GRO_CB(skb)->frag0 =
3406 			page_address(skb_shinfo(skb)->frags[0].page) +
3407 			skb_shinfo(skb)->frags[0].page_offset;
3408 		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3409 	}
3410 }
3411 EXPORT_SYMBOL(skb_gro_reset_offset);
3412 
3413 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3414 {
3415 	skb_gro_reset_offset(skb);
3416 
3417 	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3418 }
3419 EXPORT_SYMBOL(napi_gro_receive);
3420 
3421 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3422 {
3423 	__skb_pull(skb, skb_headlen(skb));
3424 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3425 	skb->vlan_tci = 0;
3426 
3427 	napi->skb = skb;
3428 }
3429 
3430 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3431 {
3432 	struct sk_buff *skb = napi->skb;
3433 
3434 	if (!skb) {
3435 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3436 		if (skb)
3437 			napi->skb = skb;
3438 	}
3439 	return skb;
3440 }
3441 EXPORT_SYMBOL(napi_get_frags);
3442 
3443 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3444 			       gro_result_t ret)
3445 {
3446 	switch (ret) {
3447 	case GRO_NORMAL:
3448 	case GRO_HELD:
3449 		skb->protocol = eth_type_trans(skb, skb->dev);
3450 
3451 		if (ret == GRO_HELD)
3452 			skb_gro_pull(skb, -ETH_HLEN);
3453 		else if (netif_receive_skb(skb))
3454 			ret = GRO_DROP;
3455 		break;
3456 
3457 	case GRO_DROP:
3458 	case GRO_MERGED_FREE:
3459 		napi_reuse_skb(napi, skb);
3460 		break;
3461 
3462 	case GRO_MERGED:
3463 		break;
3464 	}
3465 
3466 	return ret;
3467 }
3468 EXPORT_SYMBOL(napi_frags_finish);
3469 
3470 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3471 {
3472 	struct sk_buff *skb = napi->skb;
3473 	struct ethhdr *eth;
3474 	unsigned int hlen;
3475 	unsigned int off;
3476 
3477 	napi->skb = NULL;
3478 
3479 	skb_reset_mac_header(skb);
3480 	skb_gro_reset_offset(skb);
3481 
3482 	off = skb_gro_offset(skb);
3483 	hlen = off + sizeof(*eth);
3484 	eth = skb_gro_header_fast(skb, off);
3485 	if (skb_gro_header_hard(skb, hlen)) {
3486 		eth = skb_gro_header_slow(skb, hlen, off);
3487 		if (unlikely(!eth)) {
3488 			napi_reuse_skb(napi, skb);
3489 			skb = NULL;
3490 			goto out;
3491 		}
3492 	}
3493 
3494 	skb_gro_pull(skb, sizeof(*eth));
3495 
3496 	/*
3497 	 * This works because the only protocols we care about don't require
3498 	 * special handling.  We'll fix it up properly at the end.
3499 	 */
3500 	skb->protocol = eth->h_proto;
3501 
3502 out:
3503 	return skb;
3504 }
3505 EXPORT_SYMBOL(napi_frags_skb);
3506 
3507 gro_result_t napi_gro_frags(struct napi_struct *napi)
3508 {
3509 	struct sk_buff *skb = napi_frags_skb(napi);
3510 
3511 	if (!skb)
3512 		return GRO_DROP;
3513 
3514 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3515 }
3516 EXPORT_SYMBOL(napi_gro_frags);
3517 
3518 /*
3519  * net_rps_action sends any pending IPI's for rps.
3520  * Note: called with local irq disabled, but exits with local irq enabled.
3521  */
3522 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3523 {
3524 #ifdef CONFIG_RPS
3525 	struct softnet_data *remsd = sd->rps_ipi_list;
3526 
3527 	if (remsd) {
3528 		sd->rps_ipi_list = NULL;
3529 
3530 		local_irq_enable();
3531 
3532 		/* Send pending IPI's to kick RPS processing on remote cpus. */
3533 		while (remsd) {
3534 			struct softnet_data *next = remsd->rps_ipi_next;
3535 
3536 			if (cpu_online(remsd->cpu))
3537 				__smp_call_function_single(remsd->cpu,
3538 							   &remsd->csd, 0);
3539 			remsd = next;
3540 		}
3541 	} else
3542 #endif
3543 		local_irq_enable();
3544 }
3545 
3546 static int process_backlog(struct napi_struct *napi, int quota)
3547 {
3548 	int work = 0;
3549 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3550 
3551 #ifdef CONFIG_RPS
3552 	/* Check if we have pending ipi, its better to send them now,
3553 	 * not waiting net_rx_action() end.
3554 	 */
3555 	if (sd->rps_ipi_list) {
3556 		local_irq_disable();
3557 		net_rps_action_and_irq_enable(sd);
3558 	}
3559 #endif
3560 	napi->weight = weight_p;
3561 	local_irq_disable();
3562 	while (work < quota) {
3563 		struct sk_buff *skb;
3564 		unsigned int qlen;
3565 
3566 		while ((skb = __skb_dequeue(&sd->process_queue))) {
3567 			local_irq_enable();
3568 			__netif_receive_skb(skb);
3569 			local_irq_disable();
3570 			input_queue_head_incr(sd);
3571 			if (++work >= quota) {
3572 				local_irq_enable();
3573 				return work;
3574 			}
3575 		}
3576 
3577 		rps_lock(sd);
3578 		qlen = skb_queue_len(&sd->input_pkt_queue);
3579 		if (qlen)
3580 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3581 						   &sd->process_queue);
3582 
3583 		if (qlen < quota - work) {
3584 			/*
3585 			 * Inline a custom version of __napi_complete().
3586 			 * only current cpu owns and manipulates this napi,
3587 			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3588 			 * we can use a plain write instead of clear_bit(),
3589 			 * and we dont need an smp_mb() memory barrier.
3590 			 */
3591 			list_del(&napi->poll_list);
3592 			napi->state = 0;
3593 
3594 			quota = work + qlen;
3595 		}
3596 		rps_unlock(sd);
3597 	}
3598 	local_irq_enable();
3599 
3600 	return work;
3601 }
3602 
3603 /**
3604  * __napi_schedule - schedule for receive
3605  * @n: entry to schedule
3606  *
3607  * The entry's receive function will be scheduled to run
3608  */
3609 void __napi_schedule(struct napi_struct *n)
3610 {
3611 	unsigned long flags;
3612 
3613 	local_irq_save(flags);
3614 	____napi_schedule(&__get_cpu_var(softnet_data), n);
3615 	local_irq_restore(flags);
3616 }
3617 EXPORT_SYMBOL(__napi_schedule);
3618 
3619 void __napi_complete(struct napi_struct *n)
3620 {
3621 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3622 	BUG_ON(n->gro_list);
3623 
3624 	list_del(&n->poll_list);
3625 	smp_mb__before_clear_bit();
3626 	clear_bit(NAPI_STATE_SCHED, &n->state);
3627 }
3628 EXPORT_SYMBOL(__napi_complete);
3629 
3630 void napi_complete(struct napi_struct *n)
3631 {
3632 	unsigned long flags;
3633 
3634 	/*
3635 	 * don't let napi dequeue from the cpu poll list
3636 	 * just in case its running on a different cpu
3637 	 */
3638 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3639 		return;
3640 
3641 	napi_gro_flush(n);
3642 	local_irq_save(flags);
3643 	__napi_complete(n);
3644 	local_irq_restore(flags);
3645 }
3646 EXPORT_SYMBOL(napi_complete);
3647 
3648 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3649 		    int (*poll)(struct napi_struct *, int), int weight)
3650 {
3651 	INIT_LIST_HEAD(&napi->poll_list);
3652 	napi->gro_count = 0;
3653 	napi->gro_list = NULL;
3654 	napi->skb = NULL;
3655 	napi->poll = poll;
3656 	napi->weight = weight;
3657 	list_add(&napi->dev_list, &dev->napi_list);
3658 	napi->dev = dev;
3659 #ifdef CONFIG_NETPOLL
3660 	spin_lock_init(&napi->poll_lock);
3661 	napi->poll_owner = -1;
3662 #endif
3663 	set_bit(NAPI_STATE_SCHED, &napi->state);
3664 }
3665 EXPORT_SYMBOL(netif_napi_add);
3666 
3667 void netif_napi_del(struct napi_struct *napi)
3668 {
3669 	struct sk_buff *skb, *next;
3670 
3671 	list_del_init(&napi->dev_list);
3672 	napi_free_frags(napi);
3673 
3674 	for (skb = napi->gro_list; skb; skb = next) {
3675 		next = skb->next;
3676 		skb->next = NULL;
3677 		kfree_skb(skb);
3678 	}
3679 
3680 	napi->gro_list = NULL;
3681 	napi->gro_count = 0;
3682 }
3683 EXPORT_SYMBOL(netif_napi_del);
3684 
3685 static void net_rx_action(struct softirq_action *h)
3686 {
3687 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3688 	unsigned long time_limit = jiffies + 2;
3689 	int budget = netdev_budget;
3690 	void *have;
3691 
3692 	local_irq_disable();
3693 
3694 	while (!list_empty(&sd->poll_list)) {
3695 		struct napi_struct *n;
3696 		int work, weight;
3697 
3698 		/* If softirq window is exhuasted then punt.
3699 		 * Allow this to run for 2 jiffies since which will allow
3700 		 * an average latency of 1.5/HZ.
3701 		 */
3702 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3703 			goto softnet_break;
3704 
3705 		local_irq_enable();
3706 
3707 		/* Even though interrupts have been re-enabled, this
3708 		 * access is safe because interrupts can only add new
3709 		 * entries to the tail of this list, and only ->poll()
3710 		 * calls can remove this head entry from the list.
3711 		 */
3712 		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3713 
3714 		have = netpoll_poll_lock(n);
3715 
3716 		weight = n->weight;
3717 
3718 		/* This NAPI_STATE_SCHED test is for avoiding a race
3719 		 * with netpoll's poll_napi().  Only the entity which
3720 		 * obtains the lock and sees NAPI_STATE_SCHED set will
3721 		 * actually make the ->poll() call.  Therefore we avoid
3722 		 * accidently calling ->poll() when NAPI is not scheduled.
3723 		 */
3724 		work = 0;
3725 		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3726 			work = n->poll(n, weight);
3727 			trace_napi_poll(n);
3728 		}
3729 
3730 		WARN_ON_ONCE(work > weight);
3731 
3732 		budget -= work;
3733 
3734 		local_irq_disable();
3735 
3736 		/* Drivers must not modify the NAPI state if they
3737 		 * consume the entire weight.  In such cases this code
3738 		 * still "owns" the NAPI instance and therefore can
3739 		 * move the instance around on the list at-will.
3740 		 */
3741 		if (unlikely(work == weight)) {
3742 			if (unlikely(napi_disable_pending(n))) {
3743 				local_irq_enable();
3744 				napi_complete(n);
3745 				local_irq_disable();
3746 			} else
3747 				list_move_tail(&n->poll_list, &sd->poll_list);
3748 		}
3749 
3750 		netpoll_poll_unlock(have);
3751 	}
3752 out:
3753 	net_rps_action_and_irq_enable(sd);
3754 
3755 #ifdef CONFIG_NET_DMA
3756 	/*
3757 	 * There may not be any more sk_buffs coming right now, so push
3758 	 * any pending DMA copies to hardware
3759 	 */
3760 	dma_issue_pending_all();
3761 #endif
3762 
3763 	return;
3764 
3765 softnet_break:
3766 	sd->time_squeeze++;
3767 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3768 	goto out;
3769 }
3770 
3771 static gifconf_func_t *gifconf_list[NPROTO];
3772 
3773 /**
3774  *	register_gifconf	-	register a SIOCGIF handler
3775  *	@family: Address family
3776  *	@gifconf: Function handler
3777  *
3778  *	Register protocol dependent address dumping routines. The handler
3779  *	that is passed must not be freed or reused until it has been replaced
3780  *	by another handler.
3781  */
3782 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3783 {
3784 	if (family >= NPROTO)
3785 		return -EINVAL;
3786 	gifconf_list[family] = gifconf;
3787 	return 0;
3788 }
3789 EXPORT_SYMBOL(register_gifconf);
3790 
3791 
3792 /*
3793  *	Map an interface index to its name (SIOCGIFNAME)
3794  */
3795 
3796 /*
3797  *	We need this ioctl for efficient implementation of the
3798  *	if_indextoname() function required by the IPv6 API.  Without
3799  *	it, we would have to search all the interfaces to find a
3800  *	match.  --pb
3801  */
3802 
3803 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3804 {
3805 	struct net_device *dev;
3806 	struct ifreq ifr;
3807 
3808 	/*
3809 	 *	Fetch the caller's info block.
3810 	 */
3811 
3812 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3813 		return -EFAULT;
3814 
3815 	rcu_read_lock();
3816 	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3817 	if (!dev) {
3818 		rcu_read_unlock();
3819 		return -ENODEV;
3820 	}
3821 
3822 	strcpy(ifr.ifr_name, dev->name);
3823 	rcu_read_unlock();
3824 
3825 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3826 		return -EFAULT;
3827 	return 0;
3828 }
3829 
3830 /*
3831  *	Perform a SIOCGIFCONF call. This structure will change
3832  *	size eventually, and there is nothing I can do about it.
3833  *	Thus we will need a 'compatibility mode'.
3834  */
3835 
3836 static int dev_ifconf(struct net *net, char __user *arg)
3837 {
3838 	struct ifconf ifc;
3839 	struct net_device *dev;
3840 	char __user *pos;
3841 	int len;
3842 	int total;
3843 	int i;
3844 
3845 	/*
3846 	 *	Fetch the caller's info block.
3847 	 */
3848 
3849 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3850 		return -EFAULT;
3851 
3852 	pos = ifc.ifc_buf;
3853 	len = ifc.ifc_len;
3854 
3855 	/*
3856 	 *	Loop over the interfaces, and write an info block for each.
3857 	 */
3858 
3859 	total = 0;
3860 	for_each_netdev(net, dev) {
3861 		for (i = 0; i < NPROTO; i++) {
3862 			if (gifconf_list[i]) {
3863 				int done;
3864 				if (!pos)
3865 					done = gifconf_list[i](dev, NULL, 0);
3866 				else
3867 					done = gifconf_list[i](dev, pos + total,
3868 							       len - total);
3869 				if (done < 0)
3870 					return -EFAULT;
3871 				total += done;
3872 			}
3873 		}
3874 	}
3875 
3876 	/*
3877 	 *	All done.  Write the updated control block back to the caller.
3878 	 */
3879 	ifc.ifc_len = total;
3880 
3881 	/*
3882 	 * 	Both BSD and Solaris return 0 here, so we do too.
3883 	 */
3884 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3885 }
3886 
3887 #ifdef CONFIG_PROC_FS
3888 /*
3889  *	This is invoked by the /proc filesystem handler to display a device
3890  *	in detail.
3891  */
3892 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3893 	__acquires(RCU)
3894 {
3895 	struct net *net = seq_file_net(seq);
3896 	loff_t off;
3897 	struct net_device *dev;
3898 
3899 	rcu_read_lock();
3900 	if (!*pos)
3901 		return SEQ_START_TOKEN;
3902 
3903 	off = 1;
3904 	for_each_netdev_rcu(net, dev)
3905 		if (off++ == *pos)
3906 			return dev;
3907 
3908 	return NULL;
3909 }
3910 
3911 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3912 {
3913 	struct net_device *dev = (v == SEQ_START_TOKEN) ?
3914 				  first_net_device(seq_file_net(seq)) :
3915 				  next_net_device((struct net_device *)v);
3916 
3917 	++*pos;
3918 	return rcu_dereference(dev);
3919 }
3920 
3921 void dev_seq_stop(struct seq_file *seq, void *v)
3922 	__releases(RCU)
3923 {
3924 	rcu_read_unlock();
3925 }
3926 
3927 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3928 {
3929 	struct rtnl_link_stats64 temp;
3930 	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
3931 
3932 	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3933 		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
3934 		   dev->name, stats->rx_bytes, stats->rx_packets,
3935 		   stats->rx_errors,
3936 		   stats->rx_dropped + stats->rx_missed_errors,
3937 		   stats->rx_fifo_errors,
3938 		   stats->rx_length_errors + stats->rx_over_errors +
3939 		    stats->rx_crc_errors + stats->rx_frame_errors,
3940 		   stats->rx_compressed, stats->multicast,
3941 		   stats->tx_bytes, stats->tx_packets,
3942 		   stats->tx_errors, stats->tx_dropped,
3943 		   stats->tx_fifo_errors, stats->collisions,
3944 		   stats->tx_carrier_errors +
3945 		    stats->tx_aborted_errors +
3946 		    stats->tx_window_errors +
3947 		    stats->tx_heartbeat_errors,
3948 		   stats->tx_compressed);
3949 }
3950 
3951 /*
3952  *	Called from the PROCfs module. This now uses the new arbitrary sized
3953  *	/proc/net interface to create /proc/net/dev
3954  */
3955 static int dev_seq_show(struct seq_file *seq, void *v)
3956 {
3957 	if (v == SEQ_START_TOKEN)
3958 		seq_puts(seq, "Inter-|   Receive                            "
3959 			      "                    |  Transmit\n"
3960 			      " face |bytes    packets errs drop fifo frame "
3961 			      "compressed multicast|bytes    packets errs "
3962 			      "drop fifo colls carrier compressed\n");
3963 	else
3964 		dev_seq_printf_stats(seq, v);
3965 	return 0;
3966 }
3967 
3968 static struct softnet_data *softnet_get_online(loff_t *pos)
3969 {
3970 	struct softnet_data *sd = NULL;
3971 
3972 	while (*pos < nr_cpu_ids)
3973 		if (cpu_online(*pos)) {
3974 			sd = &per_cpu(softnet_data, *pos);
3975 			break;
3976 		} else
3977 			++*pos;
3978 	return sd;
3979 }
3980 
3981 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3982 {
3983 	return softnet_get_online(pos);
3984 }
3985 
3986 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3987 {
3988 	++*pos;
3989 	return softnet_get_online(pos);
3990 }
3991 
3992 static void softnet_seq_stop(struct seq_file *seq, void *v)
3993 {
3994 }
3995 
3996 static int softnet_seq_show(struct seq_file *seq, void *v)
3997 {
3998 	struct softnet_data *sd = v;
3999 
4000 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4001 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4002 		   0, 0, 0, 0, /* was fastroute */
4003 		   sd->cpu_collision, sd->received_rps);
4004 	return 0;
4005 }
4006 
4007 static const struct seq_operations dev_seq_ops = {
4008 	.start = dev_seq_start,
4009 	.next  = dev_seq_next,
4010 	.stop  = dev_seq_stop,
4011 	.show  = dev_seq_show,
4012 };
4013 
4014 static int dev_seq_open(struct inode *inode, struct file *file)
4015 {
4016 	return seq_open_net(inode, file, &dev_seq_ops,
4017 			    sizeof(struct seq_net_private));
4018 }
4019 
4020 static const struct file_operations dev_seq_fops = {
4021 	.owner	 = THIS_MODULE,
4022 	.open    = dev_seq_open,
4023 	.read    = seq_read,
4024 	.llseek  = seq_lseek,
4025 	.release = seq_release_net,
4026 };
4027 
4028 static const struct seq_operations softnet_seq_ops = {
4029 	.start = softnet_seq_start,
4030 	.next  = softnet_seq_next,
4031 	.stop  = softnet_seq_stop,
4032 	.show  = softnet_seq_show,
4033 };
4034 
4035 static int softnet_seq_open(struct inode *inode, struct file *file)
4036 {
4037 	return seq_open(file, &softnet_seq_ops);
4038 }
4039 
4040 static const struct file_operations softnet_seq_fops = {
4041 	.owner	 = THIS_MODULE,
4042 	.open    = softnet_seq_open,
4043 	.read    = seq_read,
4044 	.llseek  = seq_lseek,
4045 	.release = seq_release,
4046 };
4047 
4048 static void *ptype_get_idx(loff_t pos)
4049 {
4050 	struct packet_type *pt = NULL;
4051 	loff_t i = 0;
4052 	int t;
4053 
4054 	list_for_each_entry_rcu(pt, &ptype_all, list) {
4055 		if (i == pos)
4056 			return pt;
4057 		++i;
4058 	}
4059 
4060 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4061 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4062 			if (i == pos)
4063 				return pt;
4064 			++i;
4065 		}
4066 	}
4067 	return NULL;
4068 }
4069 
4070 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4071 	__acquires(RCU)
4072 {
4073 	rcu_read_lock();
4074 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4075 }
4076 
4077 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4078 {
4079 	struct packet_type *pt;
4080 	struct list_head *nxt;
4081 	int hash;
4082 
4083 	++*pos;
4084 	if (v == SEQ_START_TOKEN)
4085 		return ptype_get_idx(0);
4086 
4087 	pt = v;
4088 	nxt = pt->list.next;
4089 	if (pt->type == htons(ETH_P_ALL)) {
4090 		if (nxt != &ptype_all)
4091 			goto found;
4092 		hash = 0;
4093 		nxt = ptype_base[0].next;
4094 	} else
4095 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4096 
4097 	while (nxt == &ptype_base[hash]) {
4098 		if (++hash >= PTYPE_HASH_SIZE)
4099 			return NULL;
4100 		nxt = ptype_base[hash].next;
4101 	}
4102 found:
4103 	return list_entry(nxt, struct packet_type, list);
4104 }
4105 
4106 static void ptype_seq_stop(struct seq_file *seq, void *v)
4107 	__releases(RCU)
4108 {
4109 	rcu_read_unlock();
4110 }
4111 
4112 static int ptype_seq_show(struct seq_file *seq, void *v)
4113 {
4114 	struct packet_type *pt = v;
4115 
4116 	if (v == SEQ_START_TOKEN)
4117 		seq_puts(seq, "Type Device      Function\n");
4118 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4119 		if (pt->type == htons(ETH_P_ALL))
4120 			seq_puts(seq, "ALL ");
4121 		else
4122 			seq_printf(seq, "%04x", ntohs(pt->type));
4123 
4124 		seq_printf(seq, " %-8s %pF\n",
4125 			   pt->dev ? pt->dev->name : "", pt->func);
4126 	}
4127 
4128 	return 0;
4129 }
4130 
4131 static const struct seq_operations ptype_seq_ops = {
4132 	.start = ptype_seq_start,
4133 	.next  = ptype_seq_next,
4134 	.stop  = ptype_seq_stop,
4135 	.show  = ptype_seq_show,
4136 };
4137 
4138 static int ptype_seq_open(struct inode *inode, struct file *file)
4139 {
4140 	return seq_open_net(inode, file, &ptype_seq_ops,
4141 			sizeof(struct seq_net_private));
4142 }
4143 
4144 static const struct file_operations ptype_seq_fops = {
4145 	.owner	 = THIS_MODULE,
4146 	.open    = ptype_seq_open,
4147 	.read    = seq_read,
4148 	.llseek  = seq_lseek,
4149 	.release = seq_release_net,
4150 };
4151 
4152 
4153 static int __net_init dev_proc_net_init(struct net *net)
4154 {
4155 	int rc = -ENOMEM;
4156 
4157 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4158 		goto out;
4159 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4160 		goto out_dev;
4161 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4162 		goto out_softnet;
4163 
4164 	if (wext_proc_init(net))
4165 		goto out_ptype;
4166 	rc = 0;
4167 out:
4168 	return rc;
4169 out_ptype:
4170 	proc_net_remove(net, "ptype");
4171 out_softnet:
4172 	proc_net_remove(net, "softnet_stat");
4173 out_dev:
4174 	proc_net_remove(net, "dev");
4175 	goto out;
4176 }
4177 
4178 static void __net_exit dev_proc_net_exit(struct net *net)
4179 {
4180 	wext_proc_exit(net);
4181 
4182 	proc_net_remove(net, "ptype");
4183 	proc_net_remove(net, "softnet_stat");
4184 	proc_net_remove(net, "dev");
4185 }
4186 
4187 static struct pernet_operations __net_initdata dev_proc_ops = {
4188 	.init = dev_proc_net_init,
4189 	.exit = dev_proc_net_exit,
4190 };
4191 
4192 static int __init dev_proc_init(void)
4193 {
4194 	return register_pernet_subsys(&dev_proc_ops);
4195 }
4196 #else
4197 #define dev_proc_init() 0
4198 #endif	/* CONFIG_PROC_FS */
4199 
4200 
4201 /**
4202  *	netdev_set_master	-	set up master/slave pair
4203  *	@slave: slave device
4204  *	@master: new master device
4205  *
4206  *	Changes the master device of the slave. Pass %NULL to break the
4207  *	bonding. The caller must hold the RTNL semaphore. On a failure
4208  *	a negative errno code is returned. On success the reference counts
4209  *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4210  *	function returns zero.
4211  */
4212 int netdev_set_master(struct net_device *slave, struct net_device *master)
4213 {
4214 	struct net_device *old = slave->master;
4215 
4216 	ASSERT_RTNL();
4217 
4218 	if (master) {
4219 		if (old)
4220 			return -EBUSY;
4221 		dev_hold(master);
4222 	}
4223 
4224 	slave->master = master;
4225 
4226 	if (old) {
4227 		synchronize_net();
4228 		dev_put(old);
4229 	}
4230 	if (master)
4231 		slave->flags |= IFF_SLAVE;
4232 	else
4233 		slave->flags &= ~IFF_SLAVE;
4234 
4235 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4236 	return 0;
4237 }
4238 EXPORT_SYMBOL(netdev_set_master);
4239 
4240 static void dev_change_rx_flags(struct net_device *dev, int flags)
4241 {
4242 	const struct net_device_ops *ops = dev->netdev_ops;
4243 
4244 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4245 		ops->ndo_change_rx_flags(dev, flags);
4246 }
4247 
4248 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4249 {
4250 	unsigned short old_flags = dev->flags;
4251 	uid_t uid;
4252 	gid_t gid;
4253 
4254 	ASSERT_RTNL();
4255 
4256 	dev->flags |= IFF_PROMISC;
4257 	dev->promiscuity += inc;
4258 	if (dev->promiscuity == 0) {
4259 		/*
4260 		 * Avoid overflow.
4261 		 * If inc causes overflow, untouch promisc and return error.
4262 		 */
4263 		if (inc < 0)
4264 			dev->flags &= ~IFF_PROMISC;
4265 		else {
4266 			dev->promiscuity -= inc;
4267 			printk(KERN_WARNING "%s: promiscuity touches roof, "
4268 				"set promiscuity failed, promiscuity feature "
4269 				"of device might be broken.\n", dev->name);
4270 			return -EOVERFLOW;
4271 		}
4272 	}
4273 	if (dev->flags != old_flags) {
4274 		printk(KERN_INFO "device %s %s promiscuous mode\n",
4275 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4276 							       "left");
4277 		if (audit_enabled) {
4278 			current_uid_gid(&uid, &gid);
4279 			audit_log(current->audit_context, GFP_ATOMIC,
4280 				AUDIT_ANOM_PROMISCUOUS,
4281 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4282 				dev->name, (dev->flags & IFF_PROMISC),
4283 				(old_flags & IFF_PROMISC),
4284 				audit_get_loginuid(current),
4285 				uid, gid,
4286 				audit_get_sessionid(current));
4287 		}
4288 
4289 		dev_change_rx_flags(dev, IFF_PROMISC);
4290 	}
4291 	return 0;
4292 }
4293 
4294 /**
4295  *	dev_set_promiscuity	- update promiscuity count on a device
4296  *	@dev: device
4297  *	@inc: modifier
4298  *
4299  *	Add or remove promiscuity from a device. While the count in the device
4300  *	remains above zero the interface remains promiscuous. Once it hits zero
4301  *	the device reverts back to normal filtering operation. A negative inc
4302  *	value is used to drop promiscuity on the device.
4303  *	Return 0 if successful or a negative errno code on error.
4304  */
4305 int dev_set_promiscuity(struct net_device *dev, int inc)
4306 {
4307 	unsigned short old_flags = dev->flags;
4308 	int err;
4309 
4310 	err = __dev_set_promiscuity(dev, inc);
4311 	if (err < 0)
4312 		return err;
4313 	if (dev->flags != old_flags)
4314 		dev_set_rx_mode(dev);
4315 	return err;
4316 }
4317 EXPORT_SYMBOL(dev_set_promiscuity);
4318 
4319 /**
4320  *	dev_set_allmulti	- update allmulti count on a device
4321  *	@dev: device
4322  *	@inc: modifier
4323  *
4324  *	Add or remove reception of all multicast frames to a device. While the
4325  *	count in the device remains above zero the interface remains listening
4326  *	to all interfaces. Once it hits zero the device reverts back to normal
4327  *	filtering operation. A negative @inc value is used to drop the counter
4328  *	when releasing a resource needing all multicasts.
4329  *	Return 0 if successful or a negative errno code on error.
4330  */
4331 
4332 int dev_set_allmulti(struct net_device *dev, int inc)
4333 {
4334 	unsigned short old_flags = dev->flags;
4335 
4336 	ASSERT_RTNL();
4337 
4338 	dev->flags |= IFF_ALLMULTI;
4339 	dev->allmulti += inc;
4340 	if (dev->allmulti == 0) {
4341 		/*
4342 		 * Avoid overflow.
4343 		 * If inc causes overflow, untouch allmulti and return error.
4344 		 */
4345 		if (inc < 0)
4346 			dev->flags &= ~IFF_ALLMULTI;
4347 		else {
4348 			dev->allmulti -= inc;
4349 			printk(KERN_WARNING "%s: allmulti touches roof, "
4350 				"set allmulti failed, allmulti feature of "
4351 				"device might be broken.\n", dev->name);
4352 			return -EOVERFLOW;
4353 		}
4354 	}
4355 	if (dev->flags ^ old_flags) {
4356 		dev_change_rx_flags(dev, IFF_ALLMULTI);
4357 		dev_set_rx_mode(dev);
4358 	}
4359 	return 0;
4360 }
4361 EXPORT_SYMBOL(dev_set_allmulti);
4362 
4363 /*
4364  *	Upload unicast and multicast address lists to device and
4365  *	configure RX filtering. When the device doesn't support unicast
4366  *	filtering it is put in promiscuous mode while unicast addresses
4367  *	are present.
4368  */
4369 void __dev_set_rx_mode(struct net_device *dev)
4370 {
4371 	const struct net_device_ops *ops = dev->netdev_ops;
4372 
4373 	/* dev_open will call this function so the list will stay sane. */
4374 	if (!(dev->flags&IFF_UP))
4375 		return;
4376 
4377 	if (!netif_device_present(dev))
4378 		return;
4379 
4380 	if (ops->ndo_set_rx_mode)
4381 		ops->ndo_set_rx_mode(dev);
4382 	else {
4383 		/* Unicast addresses changes may only happen under the rtnl,
4384 		 * therefore calling __dev_set_promiscuity here is safe.
4385 		 */
4386 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4387 			__dev_set_promiscuity(dev, 1);
4388 			dev->uc_promisc = 1;
4389 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4390 			__dev_set_promiscuity(dev, -1);
4391 			dev->uc_promisc = 0;
4392 		}
4393 
4394 		if (ops->ndo_set_multicast_list)
4395 			ops->ndo_set_multicast_list(dev);
4396 	}
4397 }
4398 
4399 void dev_set_rx_mode(struct net_device *dev)
4400 {
4401 	netif_addr_lock_bh(dev);
4402 	__dev_set_rx_mode(dev);
4403 	netif_addr_unlock_bh(dev);
4404 }
4405 
4406 /**
4407  *	dev_get_flags - get flags reported to userspace
4408  *	@dev: device
4409  *
4410  *	Get the combination of flag bits exported through APIs to userspace.
4411  */
4412 unsigned dev_get_flags(const struct net_device *dev)
4413 {
4414 	unsigned flags;
4415 
4416 	flags = (dev->flags & ~(IFF_PROMISC |
4417 				IFF_ALLMULTI |
4418 				IFF_RUNNING |
4419 				IFF_LOWER_UP |
4420 				IFF_DORMANT)) |
4421 		(dev->gflags & (IFF_PROMISC |
4422 				IFF_ALLMULTI));
4423 
4424 	if (netif_running(dev)) {
4425 		if (netif_oper_up(dev))
4426 			flags |= IFF_RUNNING;
4427 		if (netif_carrier_ok(dev))
4428 			flags |= IFF_LOWER_UP;
4429 		if (netif_dormant(dev))
4430 			flags |= IFF_DORMANT;
4431 	}
4432 
4433 	return flags;
4434 }
4435 EXPORT_SYMBOL(dev_get_flags);
4436 
4437 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4438 {
4439 	int old_flags = dev->flags;
4440 	int ret;
4441 
4442 	ASSERT_RTNL();
4443 
4444 	/*
4445 	 *	Set the flags on our device.
4446 	 */
4447 
4448 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4449 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4450 			       IFF_AUTOMEDIA)) |
4451 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4452 				    IFF_ALLMULTI));
4453 
4454 	/*
4455 	 *	Load in the correct multicast list now the flags have changed.
4456 	 */
4457 
4458 	if ((old_flags ^ flags) & IFF_MULTICAST)
4459 		dev_change_rx_flags(dev, IFF_MULTICAST);
4460 
4461 	dev_set_rx_mode(dev);
4462 
4463 	/*
4464 	 *	Have we downed the interface. We handle IFF_UP ourselves
4465 	 *	according to user attempts to set it, rather than blindly
4466 	 *	setting it.
4467 	 */
4468 
4469 	ret = 0;
4470 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4471 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4472 
4473 		if (!ret)
4474 			dev_set_rx_mode(dev);
4475 	}
4476 
4477 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4478 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4479 
4480 		dev->gflags ^= IFF_PROMISC;
4481 		dev_set_promiscuity(dev, inc);
4482 	}
4483 
4484 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4485 	   is important. Some (broken) drivers set IFF_PROMISC, when
4486 	   IFF_ALLMULTI is requested not asking us and not reporting.
4487 	 */
4488 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4489 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4490 
4491 		dev->gflags ^= IFF_ALLMULTI;
4492 		dev_set_allmulti(dev, inc);
4493 	}
4494 
4495 	return ret;
4496 }
4497 
4498 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4499 {
4500 	unsigned int changes = dev->flags ^ old_flags;
4501 
4502 	if (changes & IFF_UP) {
4503 		if (dev->flags & IFF_UP)
4504 			call_netdevice_notifiers(NETDEV_UP, dev);
4505 		else
4506 			call_netdevice_notifiers(NETDEV_DOWN, dev);
4507 	}
4508 
4509 	if (dev->flags & IFF_UP &&
4510 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4511 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4512 }
4513 
4514 /**
4515  *	dev_change_flags - change device settings
4516  *	@dev: device
4517  *	@flags: device state flags
4518  *
4519  *	Change settings on device based state flags. The flags are
4520  *	in the userspace exported format.
4521  */
4522 int dev_change_flags(struct net_device *dev, unsigned flags)
4523 {
4524 	int ret, changes;
4525 	int old_flags = dev->flags;
4526 
4527 	ret = __dev_change_flags(dev, flags);
4528 	if (ret < 0)
4529 		return ret;
4530 
4531 	changes = old_flags ^ dev->flags;
4532 	if (changes)
4533 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4534 
4535 	__dev_notify_flags(dev, old_flags);
4536 	return ret;
4537 }
4538 EXPORT_SYMBOL(dev_change_flags);
4539 
4540 /**
4541  *	dev_set_mtu - Change maximum transfer unit
4542  *	@dev: device
4543  *	@new_mtu: new transfer unit
4544  *
4545  *	Change the maximum transfer size of the network device.
4546  */
4547 int dev_set_mtu(struct net_device *dev, int new_mtu)
4548 {
4549 	const struct net_device_ops *ops = dev->netdev_ops;
4550 	int err;
4551 
4552 	if (new_mtu == dev->mtu)
4553 		return 0;
4554 
4555 	/*	MTU must be positive.	 */
4556 	if (new_mtu < 0)
4557 		return -EINVAL;
4558 
4559 	if (!netif_device_present(dev))
4560 		return -ENODEV;
4561 
4562 	err = 0;
4563 	if (ops->ndo_change_mtu)
4564 		err = ops->ndo_change_mtu(dev, new_mtu);
4565 	else
4566 		dev->mtu = new_mtu;
4567 
4568 	if (!err && dev->flags & IFF_UP)
4569 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4570 	return err;
4571 }
4572 EXPORT_SYMBOL(dev_set_mtu);
4573 
4574 /**
4575  *	dev_set_mac_address - Change Media Access Control Address
4576  *	@dev: device
4577  *	@sa: new address
4578  *
4579  *	Change the hardware (MAC) address of the device
4580  */
4581 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4582 {
4583 	const struct net_device_ops *ops = dev->netdev_ops;
4584 	int err;
4585 
4586 	if (!ops->ndo_set_mac_address)
4587 		return -EOPNOTSUPP;
4588 	if (sa->sa_family != dev->type)
4589 		return -EINVAL;
4590 	if (!netif_device_present(dev))
4591 		return -ENODEV;
4592 	err = ops->ndo_set_mac_address(dev, sa);
4593 	if (!err)
4594 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4595 	return err;
4596 }
4597 EXPORT_SYMBOL(dev_set_mac_address);
4598 
4599 /*
4600  *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4601  */
4602 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4603 {
4604 	int err;
4605 	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4606 
4607 	if (!dev)
4608 		return -ENODEV;
4609 
4610 	switch (cmd) {
4611 	case SIOCGIFFLAGS:	/* Get interface flags */
4612 		ifr->ifr_flags = (short) dev_get_flags(dev);
4613 		return 0;
4614 
4615 	case SIOCGIFMETRIC:	/* Get the metric on the interface
4616 				   (currently unused) */
4617 		ifr->ifr_metric = 0;
4618 		return 0;
4619 
4620 	case SIOCGIFMTU:	/* Get the MTU of a device */
4621 		ifr->ifr_mtu = dev->mtu;
4622 		return 0;
4623 
4624 	case SIOCGIFHWADDR:
4625 		if (!dev->addr_len)
4626 			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4627 		else
4628 			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4629 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4630 		ifr->ifr_hwaddr.sa_family = dev->type;
4631 		return 0;
4632 
4633 	case SIOCGIFSLAVE:
4634 		err = -EINVAL;
4635 		break;
4636 
4637 	case SIOCGIFMAP:
4638 		ifr->ifr_map.mem_start = dev->mem_start;
4639 		ifr->ifr_map.mem_end   = dev->mem_end;
4640 		ifr->ifr_map.base_addr = dev->base_addr;
4641 		ifr->ifr_map.irq       = dev->irq;
4642 		ifr->ifr_map.dma       = dev->dma;
4643 		ifr->ifr_map.port      = dev->if_port;
4644 		return 0;
4645 
4646 	case SIOCGIFINDEX:
4647 		ifr->ifr_ifindex = dev->ifindex;
4648 		return 0;
4649 
4650 	case SIOCGIFTXQLEN:
4651 		ifr->ifr_qlen = dev->tx_queue_len;
4652 		return 0;
4653 
4654 	default:
4655 		/* dev_ioctl() should ensure this case
4656 		 * is never reached
4657 		 */
4658 		WARN_ON(1);
4659 		err = -EINVAL;
4660 		break;
4661 
4662 	}
4663 	return err;
4664 }
4665 
4666 /*
4667  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4668  */
4669 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4670 {
4671 	int err;
4672 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4673 	const struct net_device_ops *ops;
4674 
4675 	if (!dev)
4676 		return -ENODEV;
4677 
4678 	ops = dev->netdev_ops;
4679 
4680 	switch (cmd) {
4681 	case SIOCSIFFLAGS:	/* Set interface flags */
4682 		return dev_change_flags(dev, ifr->ifr_flags);
4683 
4684 	case SIOCSIFMETRIC:	/* Set the metric on the interface
4685 				   (currently unused) */
4686 		return -EOPNOTSUPP;
4687 
4688 	case SIOCSIFMTU:	/* Set the MTU of a device */
4689 		return dev_set_mtu(dev, ifr->ifr_mtu);
4690 
4691 	case SIOCSIFHWADDR:
4692 		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4693 
4694 	case SIOCSIFHWBROADCAST:
4695 		if (ifr->ifr_hwaddr.sa_family != dev->type)
4696 			return -EINVAL;
4697 		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4698 		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4699 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4700 		return 0;
4701 
4702 	case SIOCSIFMAP:
4703 		if (ops->ndo_set_config) {
4704 			if (!netif_device_present(dev))
4705 				return -ENODEV;
4706 			return ops->ndo_set_config(dev, &ifr->ifr_map);
4707 		}
4708 		return -EOPNOTSUPP;
4709 
4710 	case SIOCADDMULTI:
4711 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4712 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4713 			return -EINVAL;
4714 		if (!netif_device_present(dev))
4715 			return -ENODEV;
4716 		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4717 
4718 	case SIOCDELMULTI:
4719 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4720 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4721 			return -EINVAL;
4722 		if (!netif_device_present(dev))
4723 			return -ENODEV;
4724 		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4725 
4726 	case SIOCSIFTXQLEN:
4727 		if (ifr->ifr_qlen < 0)
4728 			return -EINVAL;
4729 		dev->tx_queue_len = ifr->ifr_qlen;
4730 		return 0;
4731 
4732 	case SIOCSIFNAME:
4733 		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4734 		return dev_change_name(dev, ifr->ifr_newname);
4735 
4736 	/*
4737 	 *	Unknown or private ioctl
4738 	 */
4739 	default:
4740 		if ((cmd >= SIOCDEVPRIVATE &&
4741 		    cmd <= SIOCDEVPRIVATE + 15) ||
4742 		    cmd == SIOCBONDENSLAVE ||
4743 		    cmd == SIOCBONDRELEASE ||
4744 		    cmd == SIOCBONDSETHWADDR ||
4745 		    cmd == SIOCBONDSLAVEINFOQUERY ||
4746 		    cmd == SIOCBONDINFOQUERY ||
4747 		    cmd == SIOCBONDCHANGEACTIVE ||
4748 		    cmd == SIOCGMIIPHY ||
4749 		    cmd == SIOCGMIIREG ||
4750 		    cmd == SIOCSMIIREG ||
4751 		    cmd == SIOCBRADDIF ||
4752 		    cmd == SIOCBRDELIF ||
4753 		    cmd == SIOCSHWTSTAMP ||
4754 		    cmd == SIOCWANDEV) {
4755 			err = -EOPNOTSUPP;
4756 			if (ops->ndo_do_ioctl) {
4757 				if (netif_device_present(dev))
4758 					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4759 				else
4760 					err = -ENODEV;
4761 			}
4762 		} else
4763 			err = -EINVAL;
4764 
4765 	}
4766 	return err;
4767 }
4768 
4769 /*
4770  *	This function handles all "interface"-type I/O control requests. The actual
4771  *	'doing' part of this is dev_ifsioc above.
4772  */
4773 
4774 /**
4775  *	dev_ioctl	-	network device ioctl
4776  *	@net: the applicable net namespace
4777  *	@cmd: command to issue
4778  *	@arg: pointer to a struct ifreq in user space
4779  *
4780  *	Issue ioctl functions to devices. This is normally called by the
4781  *	user space syscall interfaces but can sometimes be useful for
4782  *	other purposes. The return value is the return from the syscall if
4783  *	positive or a negative errno code on error.
4784  */
4785 
4786 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4787 {
4788 	struct ifreq ifr;
4789 	int ret;
4790 	char *colon;
4791 
4792 	/* One special case: SIOCGIFCONF takes ifconf argument
4793 	   and requires shared lock, because it sleeps writing
4794 	   to user space.
4795 	 */
4796 
4797 	if (cmd == SIOCGIFCONF) {
4798 		rtnl_lock();
4799 		ret = dev_ifconf(net, (char __user *) arg);
4800 		rtnl_unlock();
4801 		return ret;
4802 	}
4803 	if (cmd == SIOCGIFNAME)
4804 		return dev_ifname(net, (struct ifreq __user *)arg);
4805 
4806 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4807 		return -EFAULT;
4808 
4809 	ifr.ifr_name[IFNAMSIZ-1] = 0;
4810 
4811 	colon = strchr(ifr.ifr_name, ':');
4812 	if (colon)
4813 		*colon = 0;
4814 
4815 	/*
4816 	 *	See which interface the caller is talking about.
4817 	 */
4818 
4819 	switch (cmd) {
4820 	/*
4821 	 *	These ioctl calls:
4822 	 *	- can be done by all.
4823 	 *	- atomic and do not require locking.
4824 	 *	- return a value
4825 	 */
4826 	case SIOCGIFFLAGS:
4827 	case SIOCGIFMETRIC:
4828 	case SIOCGIFMTU:
4829 	case SIOCGIFHWADDR:
4830 	case SIOCGIFSLAVE:
4831 	case SIOCGIFMAP:
4832 	case SIOCGIFINDEX:
4833 	case SIOCGIFTXQLEN:
4834 		dev_load(net, ifr.ifr_name);
4835 		rcu_read_lock();
4836 		ret = dev_ifsioc_locked(net, &ifr, cmd);
4837 		rcu_read_unlock();
4838 		if (!ret) {
4839 			if (colon)
4840 				*colon = ':';
4841 			if (copy_to_user(arg, &ifr,
4842 					 sizeof(struct ifreq)))
4843 				ret = -EFAULT;
4844 		}
4845 		return ret;
4846 
4847 	case SIOCETHTOOL:
4848 		dev_load(net, ifr.ifr_name);
4849 		rtnl_lock();
4850 		ret = dev_ethtool(net, &ifr);
4851 		rtnl_unlock();
4852 		if (!ret) {
4853 			if (colon)
4854 				*colon = ':';
4855 			if (copy_to_user(arg, &ifr,
4856 					 sizeof(struct ifreq)))
4857 				ret = -EFAULT;
4858 		}
4859 		return ret;
4860 
4861 	/*
4862 	 *	These ioctl calls:
4863 	 *	- require superuser power.
4864 	 *	- require strict serialization.
4865 	 *	- return a value
4866 	 */
4867 	case SIOCGMIIPHY:
4868 	case SIOCGMIIREG:
4869 	case SIOCSIFNAME:
4870 		if (!capable(CAP_NET_ADMIN))
4871 			return -EPERM;
4872 		dev_load(net, ifr.ifr_name);
4873 		rtnl_lock();
4874 		ret = dev_ifsioc(net, &ifr, cmd);
4875 		rtnl_unlock();
4876 		if (!ret) {
4877 			if (colon)
4878 				*colon = ':';
4879 			if (copy_to_user(arg, &ifr,
4880 					 sizeof(struct ifreq)))
4881 				ret = -EFAULT;
4882 		}
4883 		return ret;
4884 
4885 	/*
4886 	 *	These ioctl calls:
4887 	 *	- require superuser power.
4888 	 *	- require strict serialization.
4889 	 *	- do not return a value
4890 	 */
4891 	case SIOCSIFFLAGS:
4892 	case SIOCSIFMETRIC:
4893 	case SIOCSIFMTU:
4894 	case SIOCSIFMAP:
4895 	case SIOCSIFHWADDR:
4896 	case SIOCSIFSLAVE:
4897 	case SIOCADDMULTI:
4898 	case SIOCDELMULTI:
4899 	case SIOCSIFHWBROADCAST:
4900 	case SIOCSIFTXQLEN:
4901 	case SIOCSMIIREG:
4902 	case SIOCBONDENSLAVE:
4903 	case SIOCBONDRELEASE:
4904 	case SIOCBONDSETHWADDR:
4905 	case SIOCBONDCHANGEACTIVE:
4906 	case SIOCBRADDIF:
4907 	case SIOCBRDELIF:
4908 	case SIOCSHWTSTAMP:
4909 		if (!capable(CAP_NET_ADMIN))
4910 			return -EPERM;
4911 		/* fall through */
4912 	case SIOCBONDSLAVEINFOQUERY:
4913 	case SIOCBONDINFOQUERY:
4914 		dev_load(net, ifr.ifr_name);
4915 		rtnl_lock();
4916 		ret = dev_ifsioc(net, &ifr, cmd);
4917 		rtnl_unlock();
4918 		return ret;
4919 
4920 	case SIOCGIFMEM:
4921 		/* Get the per device memory space. We can add this but
4922 		 * currently do not support it */
4923 	case SIOCSIFMEM:
4924 		/* Set the per device memory buffer space.
4925 		 * Not applicable in our case */
4926 	case SIOCSIFLINK:
4927 		return -EINVAL;
4928 
4929 	/*
4930 	 *	Unknown or private ioctl.
4931 	 */
4932 	default:
4933 		if (cmd == SIOCWANDEV ||
4934 		    (cmd >= SIOCDEVPRIVATE &&
4935 		     cmd <= SIOCDEVPRIVATE + 15)) {
4936 			dev_load(net, ifr.ifr_name);
4937 			rtnl_lock();
4938 			ret = dev_ifsioc(net, &ifr, cmd);
4939 			rtnl_unlock();
4940 			if (!ret && copy_to_user(arg, &ifr,
4941 						 sizeof(struct ifreq)))
4942 				ret = -EFAULT;
4943 			return ret;
4944 		}
4945 		/* Take care of Wireless Extensions */
4946 		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4947 			return wext_handle_ioctl(net, &ifr, cmd, arg);
4948 		return -EINVAL;
4949 	}
4950 }
4951 
4952 
4953 /**
4954  *	dev_new_index	-	allocate an ifindex
4955  *	@net: the applicable net namespace
4956  *
4957  *	Returns a suitable unique value for a new device interface
4958  *	number.  The caller must hold the rtnl semaphore or the
4959  *	dev_base_lock to be sure it remains unique.
4960  */
4961 static int dev_new_index(struct net *net)
4962 {
4963 	static int ifindex;
4964 	for (;;) {
4965 		if (++ifindex <= 0)
4966 			ifindex = 1;
4967 		if (!__dev_get_by_index(net, ifindex))
4968 			return ifindex;
4969 	}
4970 }
4971 
4972 /* Delayed registration/unregisteration */
4973 static LIST_HEAD(net_todo_list);
4974 
4975 static void net_set_todo(struct net_device *dev)
4976 {
4977 	list_add_tail(&dev->todo_list, &net_todo_list);
4978 }
4979 
4980 static void rollback_registered_many(struct list_head *head)
4981 {
4982 	struct net_device *dev, *tmp;
4983 
4984 	BUG_ON(dev_boot_phase);
4985 	ASSERT_RTNL();
4986 
4987 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4988 		/* Some devices call without registering
4989 		 * for initialization unwind. Remove those
4990 		 * devices and proceed with the remaining.
4991 		 */
4992 		if (dev->reg_state == NETREG_UNINITIALIZED) {
4993 			pr_debug("unregister_netdevice: device %s/%p never "
4994 				 "was registered\n", dev->name, dev);
4995 
4996 			WARN_ON(1);
4997 			list_del(&dev->unreg_list);
4998 			continue;
4999 		}
5000 
5001 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5002 	}
5003 
5004 	/* If device is running, close it first. */
5005 	dev_close_many(head);
5006 
5007 	list_for_each_entry(dev, head, unreg_list) {
5008 		/* And unlink it from device chain. */
5009 		unlist_netdevice(dev);
5010 
5011 		dev->reg_state = NETREG_UNREGISTERING;
5012 	}
5013 
5014 	synchronize_net();
5015 
5016 	list_for_each_entry(dev, head, unreg_list) {
5017 		/* Shutdown queueing discipline. */
5018 		dev_shutdown(dev);
5019 
5020 
5021 		/* Notify protocols, that we are about to destroy
5022 		   this device. They should clean all the things.
5023 		*/
5024 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5025 
5026 		if (!dev->rtnl_link_ops ||
5027 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5028 			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5029 
5030 		/*
5031 		 *	Flush the unicast and multicast chains
5032 		 */
5033 		dev_uc_flush(dev);
5034 		dev_mc_flush(dev);
5035 
5036 		if (dev->netdev_ops->ndo_uninit)
5037 			dev->netdev_ops->ndo_uninit(dev);
5038 
5039 		/* Notifier chain MUST detach us from master device. */
5040 		WARN_ON(dev->master);
5041 
5042 		/* Remove entries from kobject tree */
5043 		netdev_unregister_kobject(dev);
5044 	}
5045 
5046 	/* Process any work delayed until the end of the batch */
5047 	dev = list_first_entry(head, struct net_device, unreg_list);
5048 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5049 
5050 	rcu_barrier();
5051 
5052 	list_for_each_entry(dev, head, unreg_list)
5053 		dev_put(dev);
5054 }
5055 
5056 static void rollback_registered(struct net_device *dev)
5057 {
5058 	LIST_HEAD(single);
5059 
5060 	list_add(&dev->unreg_list, &single);
5061 	rollback_registered_many(&single);
5062 }
5063 
5064 unsigned long netdev_fix_features(unsigned long features, const char *name)
5065 {
5066 	/* Fix illegal SG+CSUM combinations. */
5067 	if ((features & NETIF_F_SG) &&
5068 	    !(features & NETIF_F_ALL_CSUM)) {
5069 		if (name)
5070 			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
5071 			       "checksum feature.\n", name);
5072 		features &= ~NETIF_F_SG;
5073 	}
5074 
5075 	/* TSO requires that SG is present as well. */
5076 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5077 		if (name)
5078 			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
5079 			       "SG feature.\n", name);
5080 		features &= ~NETIF_F_TSO;
5081 	}
5082 
5083 	if (features & NETIF_F_UFO) {
5084 		/* maybe split UFO into V4 and V6? */
5085 		if (!((features & NETIF_F_GEN_CSUM) ||
5086 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5087 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5088 			if (name)
5089 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5090 				       "since no checksum offload features.\n",
5091 				       name);
5092 			features &= ~NETIF_F_UFO;
5093 		}
5094 
5095 		if (!(features & NETIF_F_SG)) {
5096 			if (name)
5097 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5098 				       "since no NETIF_F_SG feature.\n", name);
5099 			features &= ~NETIF_F_UFO;
5100 		}
5101 	}
5102 
5103 	return features;
5104 }
5105 EXPORT_SYMBOL(netdev_fix_features);
5106 
5107 /**
5108  *	netif_stacked_transfer_operstate -	transfer operstate
5109  *	@rootdev: the root or lower level device to transfer state from
5110  *	@dev: the device to transfer operstate to
5111  *
5112  *	Transfer operational state from root to device. This is normally
5113  *	called when a stacking relationship exists between the root
5114  *	device and the device(a leaf device).
5115  */
5116 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5117 					struct net_device *dev)
5118 {
5119 	if (rootdev->operstate == IF_OPER_DORMANT)
5120 		netif_dormant_on(dev);
5121 	else
5122 		netif_dormant_off(dev);
5123 
5124 	if (netif_carrier_ok(rootdev)) {
5125 		if (!netif_carrier_ok(dev))
5126 			netif_carrier_on(dev);
5127 	} else {
5128 		if (netif_carrier_ok(dev))
5129 			netif_carrier_off(dev);
5130 	}
5131 }
5132 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5133 
5134 #ifdef CONFIG_RPS
5135 static int netif_alloc_rx_queues(struct net_device *dev)
5136 {
5137 	unsigned int i, count = dev->num_rx_queues;
5138 	struct netdev_rx_queue *rx;
5139 
5140 	BUG_ON(count < 1);
5141 
5142 	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5143 	if (!rx) {
5144 		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5145 		return -ENOMEM;
5146 	}
5147 	dev->_rx = rx;
5148 
5149 	for (i = 0; i < count; i++)
5150 		rx[i].dev = dev;
5151 	return 0;
5152 }
5153 #endif
5154 
5155 static void netdev_init_one_queue(struct net_device *dev,
5156 				  struct netdev_queue *queue, void *_unused)
5157 {
5158 	/* Initialize queue lock */
5159 	spin_lock_init(&queue->_xmit_lock);
5160 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5161 	queue->xmit_lock_owner = -1;
5162 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5163 	queue->dev = dev;
5164 }
5165 
5166 static int netif_alloc_netdev_queues(struct net_device *dev)
5167 {
5168 	unsigned int count = dev->num_tx_queues;
5169 	struct netdev_queue *tx;
5170 
5171 	BUG_ON(count < 1);
5172 
5173 	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5174 	if (!tx) {
5175 		pr_err("netdev: Unable to allocate %u tx queues.\n",
5176 		       count);
5177 		return -ENOMEM;
5178 	}
5179 	dev->_tx = tx;
5180 
5181 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5182 	spin_lock_init(&dev->tx_global_lock);
5183 
5184 	return 0;
5185 }
5186 
5187 /**
5188  *	register_netdevice	- register a network device
5189  *	@dev: device to register
5190  *
5191  *	Take a completed network device structure and add it to the kernel
5192  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5193  *	chain. 0 is returned on success. A negative errno code is returned
5194  *	on a failure to set up the device, or if the name is a duplicate.
5195  *
5196  *	Callers must hold the rtnl semaphore. You may want
5197  *	register_netdev() instead of this.
5198  *
5199  *	BUGS:
5200  *	The locking appears insufficient to guarantee two parallel registers
5201  *	will not get the same name.
5202  */
5203 
5204 int register_netdevice(struct net_device *dev)
5205 {
5206 	int ret;
5207 	struct net *net = dev_net(dev);
5208 
5209 	BUG_ON(dev_boot_phase);
5210 	ASSERT_RTNL();
5211 
5212 	might_sleep();
5213 
5214 	/* When net_device's are persistent, this will be fatal. */
5215 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5216 	BUG_ON(!net);
5217 
5218 	spin_lock_init(&dev->addr_list_lock);
5219 	netdev_set_addr_lockdep_class(dev);
5220 
5221 	dev->iflink = -1;
5222 
5223 	/* Init, if this function is available */
5224 	if (dev->netdev_ops->ndo_init) {
5225 		ret = dev->netdev_ops->ndo_init(dev);
5226 		if (ret) {
5227 			if (ret > 0)
5228 				ret = -EIO;
5229 			goto out;
5230 		}
5231 	}
5232 
5233 	ret = dev_get_valid_name(dev, dev->name, 0);
5234 	if (ret)
5235 		goto err_uninit;
5236 
5237 	dev->ifindex = dev_new_index(net);
5238 	if (dev->iflink == -1)
5239 		dev->iflink = dev->ifindex;
5240 
5241 	/* Fix illegal checksum combinations */
5242 	if ((dev->features & NETIF_F_HW_CSUM) &&
5243 	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5244 		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5245 		       dev->name);
5246 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5247 	}
5248 
5249 	if ((dev->features & NETIF_F_NO_CSUM) &&
5250 	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5251 		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5252 		       dev->name);
5253 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5254 	}
5255 
5256 	dev->features = netdev_fix_features(dev->features, dev->name);
5257 
5258 	/* Enable software GSO if SG is supported. */
5259 	if (dev->features & NETIF_F_SG)
5260 		dev->features |= NETIF_F_GSO;
5261 
5262 	/* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5263 	 * vlan_dev_init() will do the dev->features check, so these features
5264 	 * are enabled only if supported by underlying device.
5265 	 */
5266 	dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA);
5267 
5268 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5269 	ret = notifier_to_errno(ret);
5270 	if (ret)
5271 		goto err_uninit;
5272 
5273 	ret = netdev_register_kobject(dev);
5274 	if (ret)
5275 		goto err_uninit;
5276 	dev->reg_state = NETREG_REGISTERED;
5277 
5278 	/*
5279 	 *	Default initial state at registry is that the
5280 	 *	device is present.
5281 	 */
5282 
5283 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5284 
5285 	dev_init_scheduler(dev);
5286 	dev_hold(dev);
5287 	list_netdevice(dev);
5288 
5289 	/* Notify protocols, that a new device appeared. */
5290 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5291 	ret = notifier_to_errno(ret);
5292 	if (ret) {
5293 		rollback_registered(dev);
5294 		dev->reg_state = NETREG_UNREGISTERED;
5295 	}
5296 	/*
5297 	 *	Prevent userspace races by waiting until the network
5298 	 *	device is fully setup before sending notifications.
5299 	 */
5300 	if (!dev->rtnl_link_ops ||
5301 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5302 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5303 
5304 out:
5305 	return ret;
5306 
5307 err_uninit:
5308 	if (dev->netdev_ops->ndo_uninit)
5309 		dev->netdev_ops->ndo_uninit(dev);
5310 	goto out;
5311 }
5312 EXPORT_SYMBOL(register_netdevice);
5313 
5314 /**
5315  *	init_dummy_netdev	- init a dummy network device for NAPI
5316  *	@dev: device to init
5317  *
5318  *	This takes a network device structure and initialize the minimum
5319  *	amount of fields so it can be used to schedule NAPI polls without
5320  *	registering a full blown interface. This is to be used by drivers
5321  *	that need to tie several hardware interfaces to a single NAPI
5322  *	poll scheduler due to HW limitations.
5323  */
5324 int init_dummy_netdev(struct net_device *dev)
5325 {
5326 	/* Clear everything. Note we don't initialize spinlocks
5327 	 * are they aren't supposed to be taken by any of the
5328 	 * NAPI code and this dummy netdev is supposed to be
5329 	 * only ever used for NAPI polls
5330 	 */
5331 	memset(dev, 0, sizeof(struct net_device));
5332 
5333 	/* make sure we BUG if trying to hit standard
5334 	 * register/unregister code path
5335 	 */
5336 	dev->reg_state = NETREG_DUMMY;
5337 
5338 	/* NAPI wants this */
5339 	INIT_LIST_HEAD(&dev->napi_list);
5340 
5341 	/* a dummy interface is started by default */
5342 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5343 	set_bit(__LINK_STATE_START, &dev->state);
5344 
5345 	/* Note : We dont allocate pcpu_refcnt for dummy devices,
5346 	 * because users of this 'device' dont need to change
5347 	 * its refcount.
5348 	 */
5349 
5350 	return 0;
5351 }
5352 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5353 
5354 
5355 /**
5356  *	register_netdev	- register a network device
5357  *	@dev: device to register
5358  *
5359  *	Take a completed network device structure and add it to the kernel
5360  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5361  *	chain. 0 is returned on success. A negative errno code is returned
5362  *	on a failure to set up the device, or if the name is a duplicate.
5363  *
5364  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5365  *	and expands the device name if you passed a format string to
5366  *	alloc_netdev.
5367  */
5368 int register_netdev(struct net_device *dev)
5369 {
5370 	int err;
5371 
5372 	rtnl_lock();
5373 
5374 	/*
5375 	 * If the name is a format string the caller wants us to do a
5376 	 * name allocation.
5377 	 */
5378 	if (strchr(dev->name, '%')) {
5379 		err = dev_alloc_name(dev, dev->name);
5380 		if (err < 0)
5381 			goto out;
5382 	}
5383 
5384 	err = register_netdevice(dev);
5385 out:
5386 	rtnl_unlock();
5387 	return err;
5388 }
5389 EXPORT_SYMBOL(register_netdev);
5390 
5391 int netdev_refcnt_read(const struct net_device *dev)
5392 {
5393 	int i, refcnt = 0;
5394 
5395 	for_each_possible_cpu(i)
5396 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5397 	return refcnt;
5398 }
5399 EXPORT_SYMBOL(netdev_refcnt_read);
5400 
5401 /*
5402  * netdev_wait_allrefs - wait until all references are gone.
5403  *
5404  * This is called when unregistering network devices.
5405  *
5406  * Any protocol or device that holds a reference should register
5407  * for netdevice notification, and cleanup and put back the
5408  * reference if they receive an UNREGISTER event.
5409  * We can get stuck here if buggy protocols don't correctly
5410  * call dev_put.
5411  */
5412 static void netdev_wait_allrefs(struct net_device *dev)
5413 {
5414 	unsigned long rebroadcast_time, warning_time;
5415 	int refcnt;
5416 
5417 	linkwatch_forget_dev(dev);
5418 
5419 	rebroadcast_time = warning_time = jiffies;
5420 	refcnt = netdev_refcnt_read(dev);
5421 
5422 	while (refcnt != 0) {
5423 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5424 			rtnl_lock();
5425 
5426 			/* Rebroadcast unregister notification */
5427 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5428 			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5429 			 * should have already handle it the first time */
5430 
5431 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5432 				     &dev->state)) {
5433 				/* We must not have linkwatch events
5434 				 * pending on unregister. If this
5435 				 * happens, we simply run the queue
5436 				 * unscheduled, resulting in a noop
5437 				 * for this device.
5438 				 */
5439 				linkwatch_run_queue();
5440 			}
5441 
5442 			__rtnl_unlock();
5443 
5444 			rebroadcast_time = jiffies;
5445 		}
5446 
5447 		msleep(250);
5448 
5449 		refcnt = netdev_refcnt_read(dev);
5450 
5451 		if (time_after(jiffies, warning_time + 10 * HZ)) {
5452 			printk(KERN_EMERG "unregister_netdevice: "
5453 			       "waiting for %s to become free. Usage "
5454 			       "count = %d\n",
5455 			       dev->name, refcnt);
5456 			warning_time = jiffies;
5457 		}
5458 	}
5459 }
5460 
5461 /* The sequence is:
5462  *
5463  *	rtnl_lock();
5464  *	...
5465  *	register_netdevice(x1);
5466  *	register_netdevice(x2);
5467  *	...
5468  *	unregister_netdevice(y1);
5469  *	unregister_netdevice(y2);
5470  *      ...
5471  *	rtnl_unlock();
5472  *	free_netdev(y1);
5473  *	free_netdev(y2);
5474  *
5475  * We are invoked by rtnl_unlock().
5476  * This allows us to deal with problems:
5477  * 1) We can delete sysfs objects which invoke hotplug
5478  *    without deadlocking with linkwatch via keventd.
5479  * 2) Since we run with the RTNL semaphore not held, we can sleep
5480  *    safely in order to wait for the netdev refcnt to drop to zero.
5481  *
5482  * We must not return until all unregister events added during
5483  * the interval the lock was held have been completed.
5484  */
5485 void netdev_run_todo(void)
5486 {
5487 	struct list_head list;
5488 
5489 	/* Snapshot list, allow later requests */
5490 	list_replace_init(&net_todo_list, &list);
5491 
5492 	__rtnl_unlock();
5493 
5494 	while (!list_empty(&list)) {
5495 		struct net_device *dev
5496 			= list_first_entry(&list, struct net_device, todo_list);
5497 		list_del(&dev->todo_list);
5498 
5499 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5500 			printk(KERN_ERR "network todo '%s' but state %d\n",
5501 			       dev->name, dev->reg_state);
5502 			dump_stack();
5503 			continue;
5504 		}
5505 
5506 		dev->reg_state = NETREG_UNREGISTERED;
5507 
5508 		on_each_cpu(flush_backlog, dev, 1);
5509 
5510 		netdev_wait_allrefs(dev);
5511 
5512 		/* paranoia */
5513 		BUG_ON(netdev_refcnt_read(dev));
5514 		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5515 		WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5516 		WARN_ON(dev->dn_ptr);
5517 
5518 		if (dev->destructor)
5519 			dev->destructor(dev);
5520 
5521 		/* Free network device */
5522 		kobject_put(&dev->dev.kobj);
5523 	}
5524 }
5525 
5526 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5527  * fields in the same order, with only the type differing.
5528  */
5529 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5530 				    const struct net_device_stats *netdev_stats)
5531 {
5532 #if BITS_PER_LONG == 64
5533         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5534         memcpy(stats64, netdev_stats, sizeof(*stats64));
5535 #else
5536 	size_t i, n = sizeof(*stats64) / sizeof(u64);
5537 	const unsigned long *src = (const unsigned long *)netdev_stats;
5538 	u64 *dst = (u64 *)stats64;
5539 
5540 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5541 		     sizeof(*stats64) / sizeof(u64));
5542 	for (i = 0; i < n; i++)
5543 		dst[i] = src[i];
5544 #endif
5545 }
5546 
5547 /**
5548  *	dev_get_stats	- get network device statistics
5549  *	@dev: device to get statistics from
5550  *	@storage: place to store stats
5551  *
5552  *	Get network statistics from device. Return @storage.
5553  *	The device driver may provide its own method by setting
5554  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5555  *	otherwise the internal statistics structure is used.
5556  */
5557 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5558 					struct rtnl_link_stats64 *storage)
5559 {
5560 	const struct net_device_ops *ops = dev->netdev_ops;
5561 
5562 	if (ops->ndo_get_stats64) {
5563 		memset(storage, 0, sizeof(*storage));
5564 		ops->ndo_get_stats64(dev, storage);
5565 	} else if (ops->ndo_get_stats) {
5566 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5567 	} else {
5568 		netdev_stats_to_stats64(storage, &dev->stats);
5569 	}
5570 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5571 	return storage;
5572 }
5573 EXPORT_SYMBOL(dev_get_stats);
5574 
5575 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5576 {
5577 	struct netdev_queue *queue = dev_ingress_queue(dev);
5578 
5579 #ifdef CONFIG_NET_CLS_ACT
5580 	if (queue)
5581 		return queue;
5582 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5583 	if (!queue)
5584 		return NULL;
5585 	netdev_init_one_queue(dev, queue, NULL);
5586 	queue->qdisc = &noop_qdisc;
5587 	queue->qdisc_sleeping = &noop_qdisc;
5588 	rcu_assign_pointer(dev->ingress_queue, queue);
5589 #endif
5590 	return queue;
5591 }
5592 
5593 /**
5594  *	alloc_netdev_mqs - allocate network device
5595  *	@sizeof_priv:	size of private data to allocate space for
5596  *	@name:		device name format string
5597  *	@setup:		callback to initialize device
5598  *	@txqs:		the number of TX subqueues to allocate
5599  *	@rxqs:		the number of RX subqueues to allocate
5600  *
5601  *	Allocates a struct net_device with private data area for driver use
5602  *	and performs basic initialization.  Also allocates subquue structs
5603  *	for each queue on the device.
5604  */
5605 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5606 		void (*setup)(struct net_device *),
5607 		unsigned int txqs, unsigned int rxqs)
5608 {
5609 	struct net_device *dev;
5610 	size_t alloc_size;
5611 	struct net_device *p;
5612 
5613 	BUG_ON(strlen(name) >= sizeof(dev->name));
5614 
5615 	if (txqs < 1) {
5616 		pr_err("alloc_netdev: Unable to allocate device "
5617 		       "with zero queues.\n");
5618 		return NULL;
5619 	}
5620 
5621 #ifdef CONFIG_RPS
5622 	if (rxqs < 1) {
5623 		pr_err("alloc_netdev: Unable to allocate device "
5624 		       "with zero RX queues.\n");
5625 		return NULL;
5626 	}
5627 #endif
5628 
5629 	alloc_size = sizeof(struct net_device);
5630 	if (sizeof_priv) {
5631 		/* ensure 32-byte alignment of private area */
5632 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5633 		alloc_size += sizeof_priv;
5634 	}
5635 	/* ensure 32-byte alignment of whole construct */
5636 	alloc_size += NETDEV_ALIGN - 1;
5637 
5638 	p = kzalloc(alloc_size, GFP_KERNEL);
5639 	if (!p) {
5640 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5641 		return NULL;
5642 	}
5643 
5644 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5645 	dev->padded = (char *)dev - (char *)p;
5646 
5647 	dev->pcpu_refcnt = alloc_percpu(int);
5648 	if (!dev->pcpu_refcnt)
5649 		goto free_p;
5650 
5651 	if (dev_addr_init(dev))
5652 		goto free_pcpu;
5653 
5654 	dev_mc_init(dev);
5655 	dev_uc_init(dev);
5656 
5657 	dev_net_set(dev, &init_net);
5658 
5659 	dev->num_tx_queues = txqs;
5660 	dev->real_num_tx_queues = txqs;
5661 	if (netif_alloc_netdev_queues(dev))
5662 		goto free_pcpu;
5663 
5664 #ifdef CONFIG_RPS
5665 	dev->num_rx_queues = rxqs;
5666 	dev->real_num_rx_queues = rxqs;
5667 	if (netif_alloc_rx_queues(dev))
5668 		goto free_pcpu;
5669 #endif
5670 
5671 	dev->gso_max_size = GSO_MAX_SIZE;
5672 
5673 	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5674 	dev->ethtool_ntuple_list.count = 0;
5675 	INIT_LIST_HEAD(&dev->napi_list);
5676 	INIT_LIST_HEAD(&dev->unreg_list);
5677 	INIT_LIST_HEAD(&dev->link_watch_list);
5678 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5679 	setup(dev);
5680 	strcpy(dev->name, name);
5681 	return dev;
5682 
5683 free_pcpu:
5684 	free_percpu(dev->pcpu_refcnt);
5685 	kfree(dev->_tx);
5686 #ifdef CONFIG_RPS
5687 	kfree(dev->_rx);
5688 #endif
5689 
5690 free_p:
5691 	kfree(p);
5692 	return NULL;
5693 }
5694 EXPORT_SYMBOL(alloc_netdev_mqs);
5695 
5696 /**
5697  *	free_netdev - free network device
5698  *	@dev: device
5699  *
5700  *	This function does the last stage of destroying an allocated device
5701  * 	interface. The reference to the device object is released.
5702  *	If this is the last reference then it will be freed.
5703  */
5704 void free_netdev(struct net_device *dev)
5705 {
5706 	struct napi_struct *p, *n;
5707 
5708 	release_net(dev_net(dev));
5709 
5710 	kfree(dev->_tx);
5711 #ifdef CONFIG_RPS
5712 	kfree(dev->_rx);
5713 #endif
5714 
5715 	kfree(rcu_dereference_raw(dev->ingress_queue));
5716 
5717 	/* Flush device addresses */
5718 	dev_addr_flush(dev);
5719 
5720 	/* Clear ethtool n-tuple list */
5721 	ethtool_ntuple_flush(dev);
5722 
5723 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5724 		netif_napi_del(p);
5725 
5726 	free_percpu(dev->pcpu_refcnt);
5727 	dev->pcpu_refcnt = NULL;
5728 
5729 	/*  Compatibility with error handling in drivers */
5730 	if (dev->reg_state == NETREG_UNINITIALIZED) {
5731 		kfree((char *)dev - dev->padded);
5732 		return;
5733 	}
5734 
5735 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5736 	dev->reg_state = NETREG_RELEASED;
5737 
5738 	/* will free via device release */
5739 	put_device(&dev->dev);
5740 }
5741 EXPORT_SYMBOL(free_netdev);
5742 
5743 /**
5744  *	synchronize_net -  Synchronize with packet receive processing
5745  *
5746  *	Wait for packets currently being received to be done.
5747  *	Does not block later packets from starting.
5748  */
5749 void synchronize_net(void)
5750 {
5751 	might_sleep();
5752 	synchronize_rcu();
5753 }
5754 EXPORT_SYMBOL(synchronize_net);
5755 
5756 /**
5757  *	unregister_netdevice_queue - remove device from the kernel
5758  *	@dev: device
5759  *	@head: list
5760  *
5761  *	This function shuts down a device interface and removes it
5762  *	from the kernel tables.
5763  *	If head not NULL, device is queued to be unregistered later.
5764  *
5765  *	Callers must hold the rtnl semaphore.  You may want
5766  *	unregister_netdev() instead of this.
5767  */
5768 
5769 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5770 {
5771 	ASSERT_RTNL();
5772 
5773 	if (head) {
5774 		list_move_tail(&dev->unreg_list, head);
5775 	} else {
5776 		rollback_registered(dev);
5777 		/* Finish processing unregister after unlock */
5778 		net_set_todo(dev);
5779 	}
5780 }
5781 EXPORT_SYMBOL(unregister_netdevice_queue);
5782 
5783 /**
5784  *	unregister_netdevice_many - unregister many devices
5785  *	@head: list of devices
5786  */
5787 void unregister_netdevice_many(struct list_head *head)
5788 {
5789 	struct net_device *dev;
5790 
5791 	if (!list_empty(head)) {
5792 		rollback_registered_many(head);
5793 		list_for_each_entry(dev, head, unreg_list)
5794 			net_set_todo(dev);
5795 	}
5796 }
5797 EXPORT_SYMBOL(unregister_netdevice_many);
5798 
5799 /**
5800  *	unregister_netdev - remove device from the kernel
5801  *	@dev: device
5802  *
5803  *	This function shuts down a device interface and removes it
5804  *	from the kernel tables.
5805  *
5806  *	This is just a wrapper for unregister_netdevice that takes
5807  *	the rtnl semaphore.  In general you want to use this and not
5808  *	unregister_netdevice.
5809  */
5810 void unregister_netdev(struct net_device *dev)
5811 {
5812 	rtnl_lock();
5813 	unregister_netdevice(dev);
5814 	rtnl_unlock();
5815 }
5816 EXPORT_SYMBOL(unregister_netdev);
5817 
5818 /**
5819  *	dev_change_net_namespace - move device to different nethost namespace
5820  *	@dev: device
5821  *	@net: network namespace
5822  *	@pat: If not NULL name pattern to try if the current device name
5823  *	      is already taken in the destination network namespace.
5824  *
5825  *	This function shuts down a device interface and moves it
5826  *	to a new network namespace. On success 0 is returned, on
5827  *	a failure a netagive errno code is returned.
5828  *
5829  *	Callers must hold the rtnl semaphore.
5830  */
5831 
5832 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5833 {
5834 	int err;
5835 
5836 	ASSERT_RTNL();
5837 
5838 	/* Don't allow namespace local devices to be moved. */
5839 	err = -EINVAL;
5840 	if (dev->features & NETIF_F_NETNS_LOCAL)
5841 		goto out;
5842 
5843 	/* Ensure the device has been registrered */
5844 	err = -EINVAL;
5845 	if (dev->reg_state != NETREG_REGISTERED)
5846 		goto out;
5847 
5848 	/* Get out if there is nothing todo */
5849 	err = 0;
5850 	if (net_eq(dev_net(dev), net))
5851 		goto out;
5852 
5853 	/* Pick the destination device name, and ensure
5854 	 * we can use it in the destination network namespace.
5855 	 */
5856 	err = -EEXIST;
5857 	if (__dev_get_by_name(net, dev->name)) {
5858 		/* We get here if we can't use the current device name */
5859 		if (!pat)
5860 			goto out;
5861 		if (dev_get_valid_name(dev, pat, 1))
5862 			goto out;
5863 	}
5864 
5865 	/*
5866 	 * And now a mini version of register_netdevice unregister_netdevice.
5867 	 */
5868 
5869 	/* If device is running close it first. */
5870 	dev_close(dev);
5871 
5872 	/* And unlink it from device chain */
5873 	err = -ENODEV;
5874 	unlist_netdevice(dev);
5875 
5876 	synchronize_net();
5877 
5878 	/* Shutdown queueing discipline. */
5879 	dev_shutdown(dev);
5880 
5881 	/* Notify protocols, that we are about to destroy
5882 	   this device. They should clean all the things.
5883 
5884 	   Note that dev->reg_state stays at NETREG_REGISTERED.
5885 	   This is wanted because this way 8021q and macvlan know
5886 	   the device is just moving and can keep their slaves up.
5887 	*/
5888 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5889 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5890 
5891 	/*
5892 	 *	Flush the unicast and multicast chains
5893 	 */
5894 	dev_uc_flush(dev);
5895 	dev_mc_flush(dev);
5896 
5897 	/* Actually switch the network namespace */
5898 	dev_net_set(dev, net);
5899 
5900 	/* If there is an ifindex conflict assign a new one */
5901 	if (__dev_get_by_index(net, dev->ifindex)) {
5902 		int iflink = (dev->iflink == dev->ifindex);
5903 		dev->ifindex = dev_new_index(net);
5904 		if (iflink)
5905 			dev->iflink = dev->ifindex;
5906 	}
5907 
5908 	/* Fixup kobjects */
5909 	err = device_rename(&dev->dev, dev->name);
5910 	WARN_ON(err);
5911 
5912 	/* Add the device back in the hashes */
5913 	list_netdevice(dev);
5914 
5915 	/* Notify protocols, that a new device appeared. */
5916 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
5917 
5918 	/*
5919 	 *	Prevent userspace races by waiting until the network
5920 	 *	device is fully setup before sending notifications.
5921 	 */
5922 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5923 
5924 	synchronize_net();
5925 	err = 0;
5926 out:
5927 	return err;
5928 }
5929 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5930 
5931 static int dev_cpu_callback(struct notifier_block *nfb,
5932 			    unsigned long action,
5933 			    void *ocpu)
5934 {
5935 	struct sk_buff **list_skb;
5936 	struct sk_buff *skb;
5937 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
5938 	struct softnet_data *sd, *oldsd;
5939 
5940 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5941 		return NOTIFY_OK;
5942 
5943 	local_irq_disable();
5944 	cpu = smp_processor_id();
5945 	sd = &per_cpu(softnet_data, cpu);
5946 	oldsd = &per_cpu(softnet_data, oldcpu);
5947 
5948 	/* Find end of our completion_queue. */
5949 	list_skb = &sd->completion_queue;
5950 	while (*list_skb)
5951 		list_skb = &(*list_skb)->next;
5952 	/* Append completion queue from offline CPU. */
5953 	*list_skb = oldsd->completion_queue;
5954 	oldsd->completion_queue = NULL;
5955 
5956 	/* Append output queue from offline CPU. */
5957 	if (oldsd->output_queue) {
5958 		*sd->output_queue_tailp = oldsd->output_queue;
5959 		sd->output_queue_tailp = oldsd->output_queue_tailp;
5960 		oldsd->output_queue = NULL;
5961 		oldsd->output_queue_tailp = &oldsd->output_queue;
5962 	}
5963 
5964 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
5965 	local_irq_enable();
5966 
5967 	/* Process offline CPU's input_pkt_queue */
5968 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
5969 		netif_rx(skb);
5970 		input_queue_head_incr(oldsd);
5971 	}
5972 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
5973 		netif_rx(skb);
5974 		input_queue_head_incr(oldsd);
5975 	}
5976 
5977 	return NOTIFY_OK;
5978 }
5979 
5980 
5981 /**
5982  *	netdev_increment_features - increment feature set by one
5983  *	@all: current feature set
5984  *	@one: new feature set
5985  *	@mask: mask feature set
5986  *
5987  *	Computes a new feature set after adding a device with feature set
5988  *	@one to the master device with current feature set @all.  Will not
5989  *	enable anything that is off in @mask. Returns the new feature set.
5990  */
5991 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5992 					unsigned long mask)
5993 {
5994 	/* If device needs checksumming, downgrade to it. */
5995 	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5996 		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5997 	else if (mask & NETIF_F_ALL_CSUM) {
5998 		/* If one device supports v4/v6 checksumming, set for all. */
5999 		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6000 		    !(all & NETIF_F_GEN_CSUM)) {
6001 			all &= ~NETIF_F_ALL_CSUM;
6002 			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6003 		}
6004 
6005 		/* If one device supports hw checksumming, set for all. */
6006 		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6007 			all &= ~NETIF_F_ALL_CSUM;
6008 			all |= NETIF_F_HW_CSUM;
6009 		}
6010 	}
6011 
6012 	one |= NETIF_F_ALL_CSUM;
6013 
6014 	one |= all & NETIF_F_ONE_FOR_ALL;
6015 	all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6016 	all |= one & mask & NETIF_F_ONE_FOR_ALL;
6017 
6018 	return all;
6019 }
6020 EXPORT_SYMBOL(netdev_increment_features);
6021 
6022 static struct hlist_head *netdev_create_hash(void)
6023 {
6024 	int i;
6025 	struct hlist_head *hash;
6026 
6027 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6028 	if (hash != NULL)
6029 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6030 			INIT_HLIST_HEAD(&hash[i]);
6031 
6032 	return hash;
6033 }
6034 
6035 /* Initialize per network namespace state */
6036 static int __net_init netdev_init(struct net *net)
6037 {
6038 	INIT_LIST_HEAD(&net->dev_base_head);
6039 
6040 	net->dev_name_head = netdev_create_hash();
6041 	if (net->dev_name_head == NULL)
6042 		goto err_name;
6043 
6044 	net->dev_index_head = netdev_create_hash();
6045 	if (net->dev_index_head == NULL)
6046 		goto err_idx;
6047 
6048 	return 0;
6049 
6050 err_idx:
6051 	kfree(net->dev_name_head);
6052 err_name:
6053 	return -ENOMEM;
6054 }
6055 
6056 /**
6057  *	netdev_drivername - network driver for the device
6058  *	@dev: network device
6059  *	@buffer: buffer for resulting name
6060  *	@len: size of buffer
6061  *
6062  *	Determine network driver for device.
6063  */
6064 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6065 {
6066 	const struct device_driver *driver;
6067 	const struct device *parent;
6068 
6069 	if (len <= 0 || !buffer)
6070 		return buffer;
6071 	buffer[0] = 0;
6072 
6073 	parent = dev->dev.parent;
6074 
6075 	if (!parent)
6076 		return buffer;
6077 
6078 	driver = parent->driver;
6079 	if (driver && driver->name)
6080 		strlcpy(buffer, driver->name, len);
6081 	return buffer;
6082 }
6083 
6084 static int __netdev_printk(const char *level, const struct net_device *dev,
6085 			   struct va_format *vaf)
6086 {
6087 	int r;
6088 
6089 	if (dev && dev->dev.parent)
6090 		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6091 			       netdev_name(dev), vaf);
6092 	else if (dev)
6093 		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6094 	else
6095 		r = printk("%s(NULL net_device): %pV", level, vaf);
6096 
6097 	return r;
6098 }
6099 
6100 int netdev_printk(const char *level, const struct net_device *dev,
6101 		  const char *format, ...)
6102 {
6103 	struct va_format vaf;
6104 	va_list args;
6105 	int r;
6106 
6107 	va_start(args, format);
6108 
6109 	vaf.fmt = format;
6110 	vaf.va = &args;
6111 
6112 	r = __netdev_printk(level, dev, &vaf);
6113 	va_end(args);
6114 
6115 	return r;
6116 }
6117 EXPORT_SYMBOL(netdev_printk);
6118 
6119 #define define_netdev_printk_level(func, level)			\
6120 int func(const struct net_device *dev, const char *fmt, ...)	\
6121 {								\
6122 	int r;							\
6123 	struct va_format vaf;					\
6124 	va_list args;						\
6125 								\
6126 	va_start(args, fmt);					\
6127 								\
6128 	vaf.fmt = fmt;						\
6129 	vaf.va = &args;						\
6130 								\
6131 	r = __netdev_printk(level, dev, &vaf);			\
6132 	va_end(args);						\
6133 								\
6134 	return r;						\
6135 }								\
6136 EXPORT_SYMBOL(func);
6137 
6138 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6139 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6140 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6141 define_netdev_printk_level(netdev_err, KERN_ERR);
6142 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6143 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6144 define_netdev_printk_level(netdev_info, KERN_INFO);
6145 
6146 static void __net_exit netdev_exit(struct net *net)
6147 {
6148 	kfree(net->dev_name_head);
6149 	kfree(net->dev_index_head);
6150 }
6151 
6152 static struct pernet_operations __net_initdata netdev_net_ops = {
6153 	.init = netdev_init,
6154 	.exit = netdev_exit,
6155 };
6156 
6157 static void __net_exit default_device_exit(struct net *net)
6158 {
6159 	struct net_device *dev, *aux;
6160 	/*
6161 	 * Push all migratable network devices back to the
6162 	 * initial network namespace
6163 	 */
6164 	rtnl_lock();
6165 	for_each_netdev_safe(net, dev, aux) {
6166 		int err;
6167 		char fb_name[IFNAMSIZ];
6168 
6169 		/* Ignore unmoveable devices (i.e. loopback) */
6170 		if (dev->features & NETIF_F_NETNS_LOCAL)
6171 			continue;
6172 
6173 		/* Leave virtual devices for the generic cleanup */
6174 		if (dev->rtnl_link_ops)
6175 			continue;
6176 
6177 		/* Push remaing network devices to init_net */
6178 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6179 		err = dev_change_net_namespace(dev, &init_net, fb_name);
6180 		if (err) {
6181 			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6182 				__func__, dev->name, err);
6183 			BUG();
6184 		}
6185 	}
6186 	rtnl_unlock();
6187 }
6188 
6189 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6190 {
6191 	/* At exit all network devices most be removed from a network
6192 	 * namespace.  Do this in the reverse order of registration.
6193 	 * Do this across as many network namespaces as possible to
6194 	 * improve batching efficiency.
6195 	 */
6196 	struct net_device *dev;
6197 	struct net *net;
6198 	LIST_HEAD(dev_kill_list);
6199 
6200 	rtnl_lock();
6201 	list_for_each_entry(net, net_list, exit_list) {
6202 		for_each_netdev_reverse(net, dev) {
6203 			if (dev->rtnl_link_ops)
6204 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6205 			else
6206 				unregister_netdevice_queue(dev, &dev_kill_list);
6207 		}
6208 	}
6209 	unregister_netdevice_many(&dev_kill_list);
6210 	rtnl_unlock();
6211 }
6212 
6213 static struct pernet_operations __net_initdata default_device_ops = {
6214 	.exit = default_device_exit,
6215 	.exit_batch = default_device_exit_batch,
6216 };
6217 
6218 /*
6219  *	Initialize the DEV module. At boot time this walks the device list and
6220  *	unhooks any devices that fail to initialise (normally hardware not
6221  *	present) and leaves us with a valid list of present and active devices.
6222  *
6223  */
6224 
6225 /*
6226  *       This is called single threaded during boot, so no need
6227  *       to take the rtnl semaphore.
6228  */
6229 static int __init net_dev_init(void)
6230 {
6231 	int i, rc = -ENOMEM;
6232 
6233 	BUG_ON(!dev_boot_phase);
6234 
6235 	if (dev_proc_init())
6236 		goto out;
6237 
6238 	if (netdev_kobject_init())
6239 		goto out;
6240 
6241 	INIT_LIST_HEAD(&ptype_all);
6242 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6243 		INIT_LIST_HEAD(&ptype_base[i]);
6244 
6245 	if (register_pernet_subsys(&netdev_net_ops))
6246 		goto out;
6247 
6248 	/*
6249 	 *	Initialise the packet receive queues.
6250 	 */
6251 
6252 	for_each_possible_cpu(i) {
6253 		struct softnet_data *sd = &per_cpu(softnet_data, i);
6254 
6255 		memset(sd, 0, sizeof(*sd));
6256 		skb_queue_head_init(&sd->input_pkt_queue);
6257 		skb_queue_head_init(&sd->process_queue);
6258 		sd->completion_queue = NULL;
6259 		INIT_LIST_HEAD(&sd->poll_list);
6260 		sd->output_queue = NULL;
6261 		sd->output_queue_tailp = &sd->output_queue;
6262 #ifdef CONFIG_RPS
6263 		sd->csd.func = rps_trigger_softirq;
6264 		sd->csd.info = sd;
6265 		sd->csd.flags = 0;
6266 		sd->cpu = i;
6267 #endif
6268 
6269 		sd->backlog.poll = process_backlog;
6270 		sd->backlog.weight = weight_p;
6271 		sd->backlog.gro_list = NULL;
6272 		sd->backlog.gro_count = 0;
6273 	}
6274 
6275 	dev_boot_phase = 0;
6276 
6277 	/* The loopback device is special if any other network devices
6278 	 * is present in a network namespace the loopback device must
6279 	 * be present. Since we now dynamically allocate and free the
6280 	 * loopback device ensure this invariant is maintained by
6281 	 * keeping the loopback device as the first device on the
6282 	 * list of network devices.  Ensuring the loopback devices
6283 	 * is the first device that appears and the last network device
6284 	 * that disappears.
6285 	 */
6286 	if (register_pernet_device(&loopback_net_ops))
6287 		goto out;
6288 
6289 	if (register_pernet_device(&default_device_ops))
6290 		goto out;
6291 
6292 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6293 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6294 
6295 	hotcpu_notifier(dev_cpu_callback, 0);
6296 	dst_init();
6297 	dev_mcast_init();
6298 	rc = 0;
6299 out:
6300 	return rc;
6301 }
6302 
6303 subsys_initcall(net_dev_init);
6304 
6305 static int __init initialize_hashrnd(void)
6306 {
6307 	get_random_bytes(&hashrnd, sizeof(hashrnd));
6308 	return 0;
6309 }
6310 
6311 late_initcall_sync(initialize_hashrnd);
6312 
6313