xref: /openbmc/linux/net/core/dev.c (revision 81d67439)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/hash.h>
83 #include <linux/slab.h>
84 #include <linux/sched.h>
85 #include <linux/mutex.h>
86 #include <linux/string.h>
87 #include <linux/mm.h>
88 #include <linux/socket.h>
89 #include <linux/sockios.h>
90 #include <linux/errno.h>
91 #include <linux/interrupt.h>
92 #include <linux/if_ether.h>
93 #include <linux/netdevice.h>
94 #include <linux/etherdevice.h>
95 #include <linux/ethtool.h>
96 #include <linux/notifier.h>
97 #include <linux/skbuff.h>
98 #include <net/net_namespace.h>
99 #include <net/sock.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/stat.h>
104 #include <net/dst.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/kmod.h>
111 #include <linux/module.h>
112 #include <linux/netpoll.h>
113 #include <linux/rcupdate.h>
114 #include <linux/delay.h>
115 #include <net/wext.h>
116 #include <net/iw_handler.h>
117 #include <asm/current.h>
118 #include <linux/audit.h>
119 #include <linux/dmaengine.h>
120 #include <linux/err.h>
121 #include <linux/ctype.h>
122 #include <linux/if_arp.h>
123 #include <linux/if_vlan.h>
124 #include <linux/ip.h>
125 #include <net/ip.h>
126 #include <linux/ipv6.h>
127 #include <linux/in.h>
128 #include <linux/jhash.h>
129 #include <linux/random.h>
130 #include <trace/events/napi.h>
131 #include <trace/events/net.h>
132 #include <trace/events/skb.h>
133 #include <linux/pci.h>
134 #include <linux/inetdevice.h>
135 #include <linux/cpu_rmap.h>
136 
137 #include "net-sysfs.h"
138 
139 /* Instead of increasing this, you should create a hash table. */
140 #define MAX_GRO_SKBS 8
141 
142 /* This should be increased if a protocol with a bigger head is added. */
143 #define GRO_MAX_HEAD (MAX_HEADER + 128)
144 
145 /*
146  *	The list of packet types we will receive (as opposed to discard)
147  *	and the routines to invoke.
148  *
149  *	Why 16. Because with 16 the only overlap we get on a hash of the
150  *	low nibble of the protocol value is RARP/SNAP/X.25.
151  *
152  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
153  *             sure which should go first, but I bet it won't make much
154  *             difference if we are running VLANs.  The good news is that
155  *             this protocol won't be in the list unless compiled in, so
156  *             the average user (w/out VLANs) will not be adversely affected.
157  *             --BLG
158  *
159  *		0800	IP
160  *		8100    802.1Q VLAN
161  *		0001	802.3
162  *		0002	AX.25
163  *		0004	802.2
164  *		8035	RARP
165  *		0005	SNAP
166  *		0805	X.25
167  *		0806	ARP
168  *		8137	IPX
169  *		0009	Localtalk
170  *		86DD	IPv6
171  */
172 
173 #define PTYPE_HASH_SIZE	(16)
174 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
175 
176 static DEFINE_SPINLOCK(ptype_lock);
177 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
178 static struct list_head ptype_all __read_mostly;	/* Taps */
179 
180 /*
181  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
182  * semaphore.
183  *
184  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
185  *
186  * Writers must hold the rtnl semaphore while they loop through the
187  * dev_base_head list, and hold dev_base_lock for writing when they do the
188  * actual updates.  This allows pure readers to access the list even
189  * while a writer is preparing to update it.
190  *
191  * To put it another way, dev_base_lock is held for writing only to
192  * protect against pure readers; the rtnl semaphore provides the
193  * protection against other writers.
194  *
195  * See, for example usages, register_netdevice() and
196  * unregister_netdevice(), which must be called with the rtnl
197  * semaphore held.
198  */
199 DEFINE_RWLOCK(dev_base_lock);
200 EXPORT_SYMBOL(dev_base_lock);
201 
202 static inline void dev_base_seq_inc(struct net *net)
203 {
204 	while (++net->dev_base_seq == 0);
205 }
206 
207 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
208 {
209 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
210 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
211 }
212 
213 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
214 {
215 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
216 }
217 
218 static inline void rps_lock(struct softnet_data *sd)
219 {
220 #ifdef CONFIG_RPS
221 	spin_lock(&sd->input_pkt_queue.lock);
222 #endif
223 }
224 
225 static inline void rps_unlock(struct softnet_data *sd)
226 {
227 #ifdef CONFIG_RPS
228 	spin_unlock(&sd->input_pkt_queue.lock);
229 #endif
230 }
231 
232 /* Device list insertion */
233 static int list_netdevice(struct net_device *dev)
234 {
235 	struct net *net = dev_net(dev);
236 
237 	ASSERT_RTNL();
238 
239 	write_lock_bh(&dev_base_lock);
240 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
241 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
242 	hlist_add_head_rcu(&dev->index_hlist,
243 			   dev_index_hash(net, dev->ifindex));
244 	write_unlock_bh(&dev_base_lock);
245 
246 	dev_base_seq_inc(net);
247 
248 	return 0;
249 }
250 
251 /* Device list removal
252  * caller must respect a RCU grace period before freeing/reusing dev
253  */
254 static void unlist_netdevice(struct net_device *dev)
255 {
256 	ASSERT_RTNL();
257 
258 	/* Unlink dev from the device chain */
259 	write_lock_bh(&dev_base_lock);
260 	list_del_rcu(&dev->dev_list);
261 	hlist_del_rcu(&dev->name_hlist);
262 	hlist_del_rcu(&dev->index_hlist);
263 	write_unlock_bh(&dev_base_lock);
264 
265 	dev_base_seq_inc(dev_net(dev));
266 }
267 
268 /*
269  *	Our notifier list
270  */
271 
272 static RAW_NOTIFIER_HEAD(netdev_chain);
273 
274 /*
275  *	Device drivers call our routines to queue packets here. We empty the
276  *	queue in the local softnet handler.
277  */
278 
279 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
280 EXPORT_PER_CPU_SYMBOL(softnet_data);
281 
282 #ifdef CONFIG_LOCKDEP
283 /*
284  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
285  * according to dev->type
286  */
287 static const unsigned short netdev_lock_type[] =
288 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
289 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
290 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
291 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
292 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
293 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
294 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
295 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
296 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
297 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
298 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
299 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
300 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
301 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
302 	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
303 	 ARPHRD_VOID, ARPHRD_NONE};
304 
305 static const char *const netdev_lock_name[] =
306 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
307 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
308 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
309 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
310 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
311 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
312 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
313 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
314 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
315 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
316 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
317 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
318 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
319 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
320 	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
321 	 "_xmit_VOID", "_xmit_NONE"};
322 
323 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
324 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
325 
326 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
327 {
328 	int i;
329 
330 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
331 		if (netdev_lock_type[i] == dev_type)
332 			return i;
333 	/* the last key is used by default */
334 	return ARRAY_SIZE(netdev_lock_type) - 1;
335 }
336 
337 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
338 						 unsigned short dev_type)
339 {
340 	int i;
341 
342 	i = netdev_lock_pos(dev_type);
343 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
344 				   netdev_lock_name[i]);
345 }
346 
347 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
348 {
349 	int i;
350 
351 	i = netdev_lock_pos(dev->type);
352 	lockdep_set_class_and_name(&dev->addr_list_lock,
353 				   &netdev_addr_lock_key[i],
354 				   netdev_lock_name[i]);
355 }
356 #else
357 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
358 						 unsigned short dev_type)
359 {
360 }
361 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
362 {
363 }
364 #endif
365 
366 /*******************************************************************************
367 
368 		Protocol management and registration routines
369 
370 *******************************************************************************/
371 
372 /*
373  *	Add a protocol ID to the list. Now that the input handler is
374  *	smarter we can dispense with all the messy stuff that used to be
375  *	here.
376  *
377  *	BEWARE!!! Protocol handlers, mangling input packets,
378  *	MUST BE last in hash buckets and checking protocol handlers
379  *	MUST start from promiscuous ptype_all chain in net_bh.
380  *	It is true now, do not change it.
381  *	Explanation follows: if protocol handler, mangling packet, will
382  *	be the first on list, it is not able to sense, that packet
383  *	is cloned and should be copied-on-write, so that it will
384  *	change it and subsequent readers will get broken packet.
385  *							--ANK (980803)
386  */
387 
388 static inline struct list_head *ptype_head(const struct packet_type *pt)
389 {
390 	if (pt->type == htons(ETH_P_ALL))
391 		return &ptype_all;
392 	else
393 		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
394 }
395 
396 /**
397  *	dev_add_pack - add packet handler
398  *	@pt: packet type declaration
399  *
400  *	Add a protocol handler to the networking stack. The passed &packet_type
401  *	is linked into kernel lists and may not be freed until it has been
402  *	removed from the kernel lists.
403  *
404  *	This call does not sleep therefore it can not
405  *	guarantee all CPU's that are in middle of receiving packets
406  *	will see the new packet type (until the next received packet).
407  */
408 
409 void dev_add_pack(struct packet_type *pt)
410 {
411 	struct list_head *head = ptype_head(pt);
412 
413 	spin_lock(&ptype_lock);
414 	list_add_rcu(&pt->list, head);
415 	spin_unlock(&ptype_lock);
416 }
417 EXPORT_SYMBOL(dev_add_pack);
418 
419 /**
420  *	__dev_remove_pack	 - remove packet handler
421  *	@pt: packet type declaration
422  *
423  *	Remove a protocol handler that was previously added to the kernel
424  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
425  *	from the kernel lists and can be freed or reused once this function
426  *	returns.
427  *
428  *      The packet type might still be in use by receivers
429  *	and must not be freed until after all the CPU's have gone
430  *	through a quiescent state.
431  */
432 void __dev_remove_pack(struct packet_type *pt)
433 {
434 	struct list_head *head = ptype_head(pt);
435 	struct packet_type *pt1;
436 
437 	spin_lock(&ptype_lock);
438 
439 	list_for_each_entry(pt1, head, list) {
440 		if (pt == pt1) {
441 			list_del_rcu(&pt->list);
442 			goto out;
443 		}
444 	}
445 
446 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
447 out:
448 	spin_unlock(&ptype_lock);
449 }
450 EXPORT_SYMBOL(__dev_remove_pack);
451 
452 /**
453  *	dev_remove_pack	 - remove packet handler
454  *	@pt: packet type declaration
455  *
456  *	Remove a protocol handler that was previously added to the kernel
457  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
458  *	from the kernel lists and can be freed or reused once this function
459  *	returns.
460  *
461  *	This call sleeps to guarantee that no CPU is looking at the packet
462  *	type after return.
463  */
464 void dev_remove_pack(struct packet_type *pt)
465 {
466 	__dev_remove_pack(pt);
467 
468 	synchronize_net();
469 }
470 EXPORT_SYMBOL(dev_remove_pack);
471 
472 /******************************************************************************
473 
474 		      Device Boot-time Settings Routines
475 
476 *******************************************************************************/
477 
478 /* Boot time configuration table */
479 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
480 
481 /**
482  *	netdev_boot_setup_add	- add new setup entry
483  *	@name: name of the device
484  *	@map: configured settings for the device
485  *
486  *	Adds new setup entry to the dev_boot_setup list.  The function
487  *	returns 0 on error and 1 on success.  This is a generic routine to
488  *	all netdevices.
489  */
490 static int netdev_boot_setup_add(char *name, struct ifmap *map)
491 {
492 	struct netdev_boot_setup *s;
493 	int i;
494 
495 	s = dev_boot_setup;
496 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
497 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
498 			memset(s[i].name, 0, sizeof(s[i].name));
499 			strlcpy(s[i].name, name, IFNAMSIZ);
500 			memcpy(&s[i].map, map, sizeof(s[i].map));
501 			break;
502 		}
503 	}
504 
505 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
506 }
507 
508 /**
509  *	netdev_boot_setup_check	- check boot time settings
510  *	@dev: the netdevice
511  *
512  * 	Check boot time settings for the device.
513  *	The found settings are set for the device to be used
514  *	later in the device probing.
515  *	Returns 0 if no settings found, 1 if they are.
516  */
517 int netdev_boot_setup_check(struct net_device *dev)
518 {
519 	struct netdev_boot_setup *s = dev_boot_setup;
520 	int i;
521 
522 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
523 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
524 		    !strcmp(dev->name, s[i].name)) {
525 			dev->irq 	= s[i].map.irq;
526 			dev->base_addr 	= s[i].map.base_addr;
527 			dev->mem_start 	= s[i].map.mem_start;
528 			dev->mem_end 	= s[i].map.mem_end;
529 			return 1;
530 		}
531 	}
532 	return 0;
533 }
534 EXPORT_SYMBOL(netdev_boot_setup_check);
535 
536 
537 /**
538  *	netdev_boot_base	- get address from boot time settings
539  *	@prefix: prefix for network device
540  *	@unit: id for network device
541  *
542  * 	Check boot time settings for the base address of device.
543  *	The found settings are set for the device to be used
544  *	later in the device probing.
545  *	Returns 0 if no settings found.
546  */
547 unsigned long netdev_boot_base(const char *prefix, int unit)
548 {
549 	const struct netdev_boot_setup *s = dev_boot_setup;
550 	char name[IFNAMSIZ];
551 	int i;
552 
553 	sprintf(name, "%s%d", prefix, unit);
554 
555 	/*
556 	 * If device already registered then return base of 1
557 	 * to indicate not to probe for this interface
558 	 */
559 	if (__dev_get_by_name(&init_net, name))
560 		return 1;
561 
562 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
563 		if (!strcmp(name, s[i].name))
564 			return s[i].map.base_addr;
565 	return 0;
566 }
567 
568 /*
569  * Saves at boot time configured settings for any netdevice.
570  */
571 int __init netdev_boot_setup(char *str)
572 {
573 	int ints[5];
574 	struct ifmap map;
575 
576 	str = get_options(str, ARRAY_SIZE(ints), ints);
577 	if (!str || !*str)
578 		return 0;
579 
580 	/* Save settings */
581 	memset(&map, 0, sizeof(map));
582 	if (ints[0] > 0)
583 		map.irq = ints[1];
584 	if (ints[0] > 1)
585 		map.base_addr = ints[2];
586 	if (ints[0] > 2)
587 		map.mem_start = ints[3];
588 	if (ints[0] > 3)
589 		map.mem_end = ints[4];
590 
591 	/* Add new entry to the list */
592 	return netdev_boot_setup_add(str, &map);
593 }
594 
595 __setup("netdev=", netdev_boot_setup);
596 
597 /*******************************************************************************
598 
599 			    Device Interface Subroutines
600 
601 *******************************************************************************/
602 
603 /**
604  *	__dev_get_by_name	- find a device by its name
605  *	@net: the applicable net namespace
606  *	@name: name to find
607  *
608  *	Find an interface by name. Must be called under RTNL semaphore
609  *	or @dev_base_lock. If the name is found a pointer to the device
610  *	is returned. If the name is not found then %NULL is returned. The
611  *	reference counters are not incremented so the caller must be
612  *	careful with locks.
613  */
614 
615 struct net_device *__dev_get_by_name(struct net *net, const char *name)
616 {
617 	struct hlist_node *p;
618 	struct net_device *dev;
619 	struct hlist_head *head = dev_name_hash(net, name);
620 
621 	hlist_for_each_entry(dev, p, head, name_hlist)
622 		if (!strncmp(dev->name, name, IFNAMSIZ))
623 			return dev;
624 
625 	return NULL;
626 }
627 EXPORT_SYMBOL(__dev_get_by_name);
628 
629 /**
630  *	dev_get_by_name_rcu	- find a device by its name
631  *	@net: the applicable net namespace
632  *	@name: name to find
633  *
634  *	Find an interface by name.
635  *	If the name is found a pointer to the device is returned.
636  * 	If the name is not found then %NULL is returned.
637  *	The reference counters are not incremented so the caller must be
638  *	careful with locks. The caller must hold RCU lock.
639  */
640 
641 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
642 {
643 	struct hlist_node *p;
644 	struct net_device *dev;
645 	struct hlist_head *head = dev_name_hash(net, name);
646 
647 	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
648 		if (!strncmp(dev->name, name, IFNAMSIZ))
649 			return dev;
650 
651 	return NULL;
652 }
653 EXPORT_SYMBOL(dev_get_by_name_rcu);
654 
655 /**
656  *	dev_get_by_name		- find a device by its name
657  *	@net: the applicable net namespace
658  *	@name: name to find
659  *
660  *	Find an interface by name. This can be called from any
661  *	context and does its own locking. The returned handle has
662  *	the usage count incremented and the caller must use dev_put() to
663  *	release it when it is no longer needed. %NULL is returned if no
664  *	matching device is found.
665  */
666 
667 struct net_device *dev_get_by_name(struct net *net, const char *name)
668 {
669 	struct net_device *dev;
670 
671 	rcu_read_lock();
672 	dev = dev_get_by_name_rcu(net, name);
673 	if (dev)
674 		dev_hold(dev);
675 	rcu_read_unlock();
676 	return dev;
677 }
678 EXPORT_SYMBOL(dev_get_by_name);
679 
680 /**
681  *	__dev_get_by_index - find a device by its ifindex
682  *	@net: the applicable net namespace
683  *	@ifindex: index of device
684  *
685  *	Search for an interface by index. Returns %NULL if the device
686  *	is not found or a pointer to the device. The device has not
687  *	had its reference counter increased so the caller must be careful
688  *	about locking. The caller must hold either the RTNL semaphore
689  *	or @dev_base_lock.
690  */
691 
692 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
693 {
694 	struct hlist_node *p;
695 	struct net_device *dev;
696 	struct hlist_head *head = dev_index_hash(net, ifindex);
697 
698 	hlist_for_each_entry(dev, p, head, index_hlist)
699 		if (dev->ifindex == ifindex)
700 			return dev;
701 
702 	return NULL;
703 }
704 EXPORT_SYMBOL(__dev_get_by_index);
705 
706 /**
707  *	dev_get_by_index_rcu - find a device by its ifindex
708  *	@net: the applicable net namespace
709  *	@ifindex: index of device
710  *
711  *	Search for an interface by index. Returns %NULL if the device
712  *	is not found or a pointer to the device. The device has not
713  *	had its reference counter increased so the caller must be careful
714  *	about locking. The caller must hold RCU lock.
715  */
716 
717 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
718 {
719 	struct hlist_node *p;
720 	struct net_device *dev;
721 	struct hlist_head *head = dev_index_hash(net, ifindex);
722 
723 	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
724 		if (dev->ifindex == ifindex)
725 			return dev;
726 
727 	return NULL;
728 }
729 EXPORT_SYMBOL(dev_get_by_index_rcu);
730 
731 
732 /**
733  *	dev_get_by_index - find a device by its ifindex
734  *	@net: the applicable net namespace
735  *	@ifindex: index of device
736  *
737  *	Search for an interface by index. Returns NULL if the device
738  *	is not found or a pointer to the device. The device returned has
739  *	had a reference added and the pointer is safe until the user calls
740  *	dev_put to indicate they have finished with it.
741  */
742 
743 struct net_device *dev_get_by_index(struct net *net, int ifindex)
744 {
745 	struct net_device *dev;
746 
747 	rcu_read_lock();
748 	dev = dev_get_by_index_rcu(net, ifindex);
749 	if (dev)
750 		dev_hold(dev);
751 	rcu_read_unlock();
752 	return dev;
753 }
754 EXPORT_SYMBOL(dev_get_by_index);
755 
756 /**
757  *	dev_getbyhwaddr_rcu - find a device by its hardware address
758  *	@net: the applicable net namespace
759  *	@type: media type of device
760  *	@ha: hardware address
761  *
762  *	Search for an interface by MAC address. Returns NULL if the device
763  *	is not found or a pointer to the device.
764  *	The caller must hold RCU or RTNL.
765  *	The returned device has not had its ref count increased
766  *	and the caller must therefore be careful about locking
767  *
768  */
769 
770 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
771 				       const char *ha)
772 {
773 	struct net_device *dev;
774 
775 	for_each_netdev_rcu(net, dev)
776 		if (dev->type == type &&
777 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
778 			return dev;
779 
780 	return NULL;
781 }
782 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
783 
784 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
785 {
786 	struct net_device *dev;
787 
788 	ASSERT_RTNL();
789 	for_each_netdev(net, dev)
790 		if (dev->type == type)
791 			return dev;
792 
793 	return NULL;
794 }
795 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
796 
797 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
798 {
799 	struct net_device *dev, *ret = NULL;
800 
801 	rcu_read_lock();
802 	for_each_netdev_rcu(net, dev)
803 		if (dev->type == type) {
804 			dev_hold(dev);
805 			ret = dev;
806 			break;
807 		}
808 	rcu_read_unlock();
809 	return ret;
810 }
811 EXPORT_SYMBOL(dev_getfirstbyhwtype);
812 
813 /**
814  *	dev_get_by_flags_rcu - find any device with given flags
815  *	@net: the applicable net namespace
816  *	@if_flags: IFF_* values
817  *	@mask: bitmask of bits in if_flags to check
818  *
819  *	Search for any interface with the given flags. Returns NULL if a device
820  *	is not found or a pointer to the device. Must be called inside
821  *	rcu_read_lock(), and result refcount is unchanged.
822  */
823 
824 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
825 				    unsigned short mask)
826 {
827 	struct net_device *dev, *ret;
828 
829 	ret = NULL;
830 	for_each_netdev_rcu(net, dev) {
831 		if (((dev->flags ^ if_flags) & mask) == 0) {
832 			ret = dev;
833 			break;
834 		}
835 	}
836 	return ret;
837 }
838 EXPORT_SYMBOL(dev_get_by_flags_rcu);
839 
840 /**
841  *	dev_valid_name - check if name is okay for network device
842  *	@name: name string
843  *
844  *	Network device names need to be valid file names to
845  *	to allow sysfs to work.  We also disallow any kind of
846  *	whitespace.
847  */
848 int dev_valid_name(const char *name)
849 {
850 	if (*name == '\0')
851 		return 0;
852 	if (strlen(name) >= IFNAMSIZ)
853 		return 0;
854 	if (!strcmp(name, ".") || !strcmp(name, ".."))
855 		return 0;
856 
857 	while (*name) {
858 		if (*name == '/' || isspace(*name))
859 			return 0;
860 		name++;
861 	}
862 	return 1;
863 }
864 EXPORT_SYMBOL(dev_valid_name);
865 
866 /**
867  *	__dev_alloc_name - allocate a name for a device
868  *	@net: network namespace to allocate the device name in
869  *	@name: name format string
870  *	@buf:  scratch buffer and result name string
871  *
872  *	Passed a format string - eg "lt%d" it will try and find a suitable
873  *	id. It scans list of devices to build up a free map, then chooses
874  *	the first empty slot. The caller must hold the dev_base or rtnl lock
875  *	while allocating the name and adding the device in order to avoid
876  *	duplicates.
877  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
878  *	Returns the number of the unit assigned or a negative errno code.
879  */
880 
881 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
882 {
883 	int i = 0;
884 	const char *p;
885 	const int max_netdevices = 8*PAGE_SIZE;
886 	unsigned long *inuse;
887 	struct net_device *d;
888 
889 	p = strnchr(name, IFNAMSIZ-1, '%');
890 	if (p) {
891 		/*
892 		 * Verify the string as this thing may have come from
893 		 * the user.  There must be either one "%d" and no other "%"
894 		 * characters.
895 		 */
896 		if (p[1] != 'd' || strchr(p + 2, '%'))
897 			return -EINVAL;
898 
899 		/* Use one page as a bit array of possible slots */
900 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
901 		if (!inuse)
902 			return -ENOMEM;
903 
904 		for_each_netdev(net, d) {
905 			if (!sscanf(d->name, name, &i))
906 				continue;
907 			if (i < 0 || i >= max_netdevices)
908 				continue;
909 
910 			/*  avoid cases where sscanf is not exact inverse of printf */
911 			snprintf(buf, IFNAMSIZ, name, i);
912 			if (!strncmp(buf, d->name, IFNAMSIZ))
913 				set_bit(i, inuse);
914 		}
915 
916 		i = find_first_zero_bit(inuse, max_netdevices);
917 		free_page((unsigned long) inuse);
918 	}
919 
920 	if (buf != name)
921 		snprintf(buf, IFNAMSIZ, name, i);
922 	if (!__dev_get_by_name(net, buf))
923 		return i;
924 
925 	/* It is possible to run out of possible slots
926 	 * when the name is long and there isn't enough space left
927 	 * for the digits, or if all bits are used.
928 	 */
929 	return -ENFILE;
930 }
931 
932 /**
933  *	dev_alloc_name - allocate a name for a device
934  *	@dev: device
935  *	@name: name format string
936  *
937  *	Passed a format string - eg "lt%d" it will try and find a suitable
938  *	id. It scans list of devices to build up a free map, then chooses
939  *	the first empty slot. The caller must hold the dev_base or rtnl lock
940  *	while allocating the name and adding the device in order to avoid
941  *	duplicates.
942  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
943  *	Returns the number of the unit assigned or a negative errno code.
944  */
945 
946 int dev_alloc_name(struct net_device *dev, const char *name)
947 {
948 	char buf[IFNAMSIZ];
949 	struct net *net;
950 	int ret;
951 
952 	BUG_ON(!dev_net(dev));
953 	net = dev_net(dev);
954 	ret = __dev_alloc_name(net, name, buf);
955 	if (ret >= 0)
956 		strlcpy(dev->name, buf, IFNAMSIZ);
957 	return ret;
958 }
959 EXPORT_SYMBOL(dev_alloc_name);
960 
961 static int dev_get_valid_name(struct net_device *dev, const char *name)
962 {
963 	struct net *net;
964 
965 	BUG_ON(!dev_net(dev));
966 	net = dev_net(dev);
967 
968 	if (!dev_valid_name(name))
969 		return -EINVAL;
970 
971 	if (strchr(name, '%'))
972 		return dev_alloc_name(dev, name);
973 	else if (__dev_get_by_name(net, name))
974 		return -EEXIST;
975 	else if (dev->name != name)
976 		strlcpy(dev->name, name, IFNAMSIZ);
977 
978 	return 0;
979 }
980 
981 /**
982  *	dev_change_name - change name of a device
983  *	@dev: device
984  *	@newname: name (or format string) must be at least IFNAMSIZ
985  *
986  *	Change name of a device, can pass format strings "eth%d".
987  *	for wildcarding.
988  */
989 int dev_change_name(struct net_device *dev, const char *newname)
990 {
991 	char oldname[IFNAMSIZ];
992 	int err = 0;
993 	int ret;
994 	struct net *net;
995 
996 	ASSERT_RTNL();
997 	BUG_ON(!dev_net(dev));
998 
999 	net = dev_net(dev);
1000 	if (dev->flags & IFF_UP)
1001 		return -EBUSY;
1002 
1003 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1004 		return 0;
1005 
1006 	memcpy(oldname, dev->name, IFNAMSIZ);
1007 
1008 	err = dev_get_valid_name(dev, newname);
1009 	if (err < 0)
1010 		return err;
1011 
1012 rollback:
1013 	ret = device_rename(&dev->dev, dev->name);
1014 	if (ret) {
1015 		memcpy(dev->name, oldname, IFNAMSIZ);
1016 		return ret;
1017 	}
1018 
1019 	write_lock_bh(&dev_base_lock);
1020 	hlist_del_rcu(&dev->name_hlist);
1021 	write_unlock_bh(&dev_base_lock);
1022 
1023 	synchronize_rcu();
1024 
1025 	write_lock_bh(&dev_base_lock);
1026 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1027 	write_unlock_bh(&dev_base_lock);
1028 
1029 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1030 	ret = notifier_to_errno(ret);
1031 
1032 	if (ret) {
1033 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1034 		if (err >= 0) {
1035 			err = ret;
1036 			memcpy(dev->name, oldname, IFNAMSIZ);
1037 			goto rollback;
1038 		} else {
1039 			printk(KERN_ERR
1040 			       "%s: name change rollback failed: %d.\n",
1041 			       dev->name, ret);
1042 		}
1043 	}
1044 
1045 	return err;
1046 }
1047 
1048 /**
1049  *	dev_set_alias - change ifalias of a device
1050  *	@dev: device
1051  *	@alias: name up to IFALIASZ
1052  *	@len: limit of bytes to copy from info
1053  *
1054  *	Set ifalias for a device,
1055  */
1056 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1057 {
1058 	ASSERT_RTNL();
1059 
1060 	if (len >= IFALIASZ)
1061 		return -EINVAL;
1062 
1063 	if (!len) {
1064 		if (dev->ifalias) {
1065 			kfree(dev->ifalias);
1066 			dev->ifalias = NULL;
1067 		}
1068 		return 0;
1069 	}
1070 
1071 	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1072 	if (!dev->ifalias)
1073 		return -ENOMEM;
1074 
1075 	strlcpy(dev->ifalias, alias, len+1);
1076 	return len;
1077 }
1078 
1079 
1080 /**
1081  *	netdev_features_change - device changes features
1082  *	@dev: device to cause notification
1083  *
1084  *	Called to indicate a device has changed features.
1085  */
1086 void netdev_features_change(struct net_device *dev)
1087 {
1088 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1089 }
1090 EXPORT_SYMBOL(netdev_features_change);
1091 
1092 /**
1093  *	netdev_state_change - device changes state
1094  *	@dev: device to cause notification
1095  *
1096  *	Called to indicate a device has changed state. This function calls
1097  *	the notifier chains for netdev_chain and sends a NEWLINK message
1098  *	to the routing socket.
1099  */
1100 void netdev_state_change(struct net_device *dev)
1101 {
1102 	if (dev->flags & IFF_UP) {
1103 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1104 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1105 	}
1106 }
1107 EXPORT_SYMBOL(netdev_state_change);
1108 
1109 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1110 {
1111 	return call_netdevice_notifiers(event, dev);
1112 }
1113 EXPORT_SYMBOL(netdev_bonding_change);
1114 
1115 /**
1116  *	dev_load 	- load a network module
1117  *	@net: the applicable net namespace
1118  *	@name: name of interface
1119  *
1120  *	If a network interface is not present and the process has suitable
1121  *	privileges this function loads the module. If module loading is not
1122  *	available in this kernel then it becomes a nop.
1123  */
1124 
1125 void dev_load(struct net *net, const char *name)
1126 {
1127 	struct net_device *dev;
1128 	int no_module;
1129 
1130 	rcu_read_lock();
1131 	dev = dev_get_by_name_rcu(net, name);
1132 	rcu_read_unlock();
1133 
1134 	no_module = !dev;
1135 	if (no_module && capable(CAP_NET_ADMIN))
1136 		no_module = request_module("netdev-%s", name);
1137 	if (no_module && capable(CAP_SYS_MODULE)) {
1138 		if (!request_module("%s", name))
1139 			pr_err("Loading kernel module for a network device "
1140 "with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s "
1141 "instead\n", name);
1142 	}
1143 }
1144 EXPORT_SYMBOL(dev_load);
1145 
1146 static int __dev_open(struct net_device *dev)
1147 {
1148 	const struct net_device_ops *ops = dev->netdev_ops;
1149 	int ret;
1150 
1151 	ASSERT_RTNL();
1152 
1153 	if (!netif_device_present(dev))
1154 		return -ENODEV;
1155 
1156 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1157 	ret = notifier_to_errno(ret);
1158 	if (ret)
1159 		return ret;
1160 
1161 	set_bit(__LINK_STATE_START, &dev->state);
1162 
1163 	if (ops->ndo_validate_addr)
1164 		ret = ops->ndo_validate_addr(dev);
1165 
1166 	if (!ret && ops->ndo_open)
1167 		ret = ops->ndo_open(dev);
1168 
1169 	if (ret)
1170 		clear_bit(__LINK_STATE_START, &dev->state);
1171 	else {
1172 		dev->flags |= IFF_UP;
1173 		net_dmaengine_get();
1174 		dev_set_rx_mode(dev);
1175 		dev_activate(dev);
1176 	}
1177 
1178 	return ret;
1179 }
1180 
1181 /**
1182  *	dev_open	- prepare an interface for use.
1183  *	@dev:	device to open
1184  *
1185  *	Takes a device from down to up state. The device's private open
1186  *	function is invoked and then the multicast lists are loaded. Finally
1187  *	the device is moved into the up state and a %NETDEV_UP message is
1188  *	sent to the netdev notifier chain.
1189  *
1190  *	Calling this function on an active interface is a nop. On a failure
1191  *	a negative errno code is returned.
1192  */
1193 int dev_open(struct net_device *dev)
1194 {
1195 	int ret;
1196 
1197 	if (dev->flags & IFF_UP)
1198 		return 0;
1199 
1200 	ret = __dev_open(dev);
1201 	if (ret < 0)
1202 		return ret;
1203 
1204 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1205 	call_netdevice_notifiers(NETDEV_UP, dev);
1206 
1207 	return ret;
1208 }
1209 EXPORT_SYMBOL(dev_open);
1210 
1211 static int __dev_close_many(struct list_head *head)
1212 {
1213 	struct net_device *dev;
1214 
1215 	ASSERT_RTNL();
1216 	might_sleep();
1217 
1218 	list_for_each_entry(dev, head, unreg_list) {
1219 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1220 
1221 		clear_bit(__LINK_STATE_START, &dev->state);
1222 
1223 		/* Synchronize to scheduled poll. We cannot touch poll list, it
1224 		 * can be even on different cpu. So just clear netif_running().
1225 		 *
1226 		 * dev->stop() will invoke napi_disable() on all of it's
1227 		 * napi_struct instances on this device.
1228 		 */
1229 		smp_mb__after_clear_bit(); /* Commit netif_running(). */
1230 	}
1231 
1232 	dev_deactivate_many(head);
1233 
1234 	list_for_each_entry(dev, head, unreg_list) {
1235 		const struct net_device_ops *ops = dev->netdev_ops;
1236 
1237 		/*
1238 		 *	Call the device specific close. This cannot fail.
1239 		 *	Only if device is UP
1240 		 *
1241 		 *	We allow it to be called even after a DETACH hot-plug
1242 		 *	event.
1243 		 */
1244 		if (ops->ndo_stop)
1245 			ops->ndo_stop(dev);
1246 
1247 		dev->flags &= ~IFF_UP;
1248 		net_dmaengine_put();
1249 	}
1250 
1251 	return 0;
1252 }
1253 
1254 static int __dev_close(struct net_device *dev)
1255 {
1256 	int retval;
1257 	LIST_HEAD(single);
1258 
1259 	list_add(&dev->unreg_list, &single);
1260 	retval = __dev_close_many(&single);
1261 	list_del(&single);
1262 	return retval;
1263 }
1264 
1265 static int dev_close_many(struct list_head *head)
1266 {
1267 	struct net_device *dev, *tmp;
1268 	LIST_HEAD(tmp_list);
1269 
1270 	list_for_each_entry_safe(dev, tmp, head, unreg_list)
1271 		if (!(dev->flags & IFF_UP))
1272 			list_move(&dev->unreg_list, &tmp_list);
1273 
1274 	__dev_close_many(head);
1275 
1276 	list_for_each_entry(dev, head, unreg_list) {
1277 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1278 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1279 	}
1280 
1281 	/* rollback_registered_many needs the complete original list */
1282 	list_splice(&tmp_list, head);
1283 	return 0;
1284 }
1285 
1286 /**
1287  *	dev_close - shutdown an interface.
1288  *	@dev: device to shutdown
1289  *
1290  *	This function moves an active device into down state. A
1291  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1292  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1293  *	chain.
1294  */
1295 int dev_close(struct net_device *dev)
1296 {
1297 	if (dev->flags & IFF_UP) {
1298 		LIST_HEAD(single);
1299 
1300 		list_add(&dev->unreg_list, &single);
1301 		dev_close_many(&single);
1302 		list_del(&single);
1303 	}
1304 	return 0;
1305 }
1306 EXPORT_SYMBOL(dev_close);
1307 
1308 
1309 /**
1310  *	dev_disable_lro - disable Large Receive Offload on a device
1311  *	@dev: device
1312  *
1313  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1314  *	called under RTNL.  This is needed if received packets may be
1315  *	forwarded to another interface.
1316  */
1317 void dev_disable_lro(struct net_device *dev)
1318 {
1319 	u32 flags;
1320 
1321 	/*
1322 	 * If we're trying to disable lro on a vlan device
1323 	 * use the underlying physical device instead
1324 	 */
1325 	if (is_vlan_dev(dev))
1326 		dev = vlan_dev_real_dev(dev);
1327 
1328 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
1329 		flags = dev->ethtool_ops->get_flags(dev);
1330 	else
1331 		flags = ethtool_op_get_flags(dev);
1332 
1333 	if (!(flags & ETH_FLAG_LRO))
1334 		return;
1335 
1336 	__ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
1337 	if (unlikely(dev->features & NETIF_F_LRO))
1338 		netdev_WARN(dev, "failed to disable LRO!\n");
1339 }
1340 EXPORT_SYMBOL(dev_disable_lro);
1341 
1342 
1343 static int dev_boot_phase = 1;
1344 
1345 /**
1346  *	register_netdevice_notifier - register a network notifier block
1347  *	@nb: notifier
1348  *
1349  *	Register a notifier to be called when network device events occur.
1350  *	The notifier passed is linked into the kernel structures and must
1351  *	not be reused until it has been unregistered. A negative errno code
1352  *	is returned on a failure.
1353  *
1354  * 	When registered all registration and up events are replayed
1355  *	to the new notifier to allow device to have a race free
1356  *	view of the network device list.
1357  */
1358 
1359 int register_netdevice_notifier(struct notifier_block *nb)
1360 {
1361 	struct net_device *dev;
1362 	struct net_device *last;
1363 	struct net *net;
1364 	int err;
1365 
1366 	rtnl_lock();
1367 	err = raw_notifier_chain_register(&netdev_chain, nb);
1368 	if (err)
1369 		goto unlock;
1370 	if (dev_boot_phase)
1371 		goto unlock;
1372 	for_each_net(net) {
1373 		for_each_netdev(net, dev) {
1374 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1375 			err = notifier_to_errno(err);
1376 			if (err)
1377 				goto rollback;
1378 
1379 			if (!(dev->flags & IFF_UP))
1380 				continue;
1381 
1382 			nb->notifier_call(nb, NETDEV_UP, dev);
1383 		}
1384 	}
1385 
1386 unlock:
1387 	rtnl_unlock();
1388 	return err;
1389 
1390 rollback:
1391 	last = dev;
1392 	for_each_net(net) {
1393 		for_each_netdev(net, dev) {
1394 			if (dev == last)
1395 				break;
1396 
1397 			if (dev->flags & IFF_UP) {
1398 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1399 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1400 			}
1401 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1402 			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1403 		}
1404 	}
1405 
1406 	raw_notifier_chain_unregister(&netdev_chain, nb);
1407 	goto unlock;
1408 }
1409 EXPORT_SYMBOL(register_netdevice_notifier);
1410 
1411 /**
1412  *	unregister_netdevice_notifier - unregister a network notifier block
1413  *	@nb: notifier
1414  *
1415  *	Unregister a notifier previously registered by
1416  *	register_netdevice_notifier(). The notifier is unlinked into the
1417  *	kernel structures and may then be reused. A negative errno code
1418  *	is returned on a failure.
1419  */
1420 
1421 int unregister_netdevice_notifier(struct notifier_block *nb)
1422 {
1423 	int err;
1424 
1425 	rtnl_lock();
1426 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1427 	rtnl_unlock();
1428 	return err;
1429 }
1430 EXPORT_SYMBOL(unregister_netdevice_notifier);
1431 
1432 /**
1433  *	call_netdevice_notifiers - call all network notifier blocks
1434  *      @val: value passed unmodified to notifier function
1435  *      @dev: net_device pointer passed unmodified to notifier function
1436  *
1437  *	Call all network notifier blocks.  Parameters and return value
1438  *	are as for raw_notifier_call_chain().
1439  */
1440 
1441 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1442 {
1443 	ASSERT_RTNL();
1444 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1445 }
1446 EXPORT_SYMBOL(call_netdevice_notifiers);
1447 
1448 /* When > 0 there are consumers of rx skb time stamps */
1449 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1450 
1451 void net_enable_timestamp(void)
1452 {
1453 	atomic_inc(&netstamp_needed);
1454 }
1455 EXPORT_SYMBOL(net_enable_timestamp);
1456 
1457 void net_disable_timestamp(void)
1458 {
1459 	atomic_dec(&netstamp_needed);
1460 }
1461 EXPORT_SYMBOL(net_disable_timestamp);
1462 
1463 static inline void net_timestamp_set(struct sk_buff *skb)
1464 {
1465 	if (atomic_read(&netstamp_needed))
1466 		__net_timestamp(skb);
1467 	else
1468 		skb->tstamp.tv64 = 0;
1469 }
1470 
1471 static inline void net_timestamp_check(struct sk_buff *skb)
1472 {
1473 	if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1474 		__net_timestamp(skb);
1475 }
1476 
1477 static inline bool is_skb_forwardable(struct net_device *dev,
1478 				      struct sk_buff *skb)
1479 {
1480 	unsigned int len;
1481 
1482 	if (!(dev->flags & IFF_UP))
1483 		return false;
1484 
1485 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1486 	if (skb->len <= len)
1487 		return true;
1488 
1489 	/* if TSO is enabled, we don't care about the length as the packet
1490 	 * could be forwarded without being segmented before
1491 	 */
1492 	if (skb_is_gso(skb))
1493 		return true;
1494 
1495 	return false;
1496 }
1497 
1498 /**
1499  * dev_forward_skb - loopback an skb to another netif
1500  *
1501  * @dev: destination network device
1502  * @skb: buffer to forward
1503  *
1504  * return values:
1505  *	NET_RX_SUCCESS	(no congestion)
1506  *	NET_RX_DROP     (packet was dropped, but freed)
1507  *
1508  * dev_forward_skb can be used for injecting an skb from the
1509  * start_xmit function of one device into the receive queue
1510  * of another device.
1511  *
1512  * The receiving device may be in another namespace, so
1513  * we have to clear all information in the skb that could
1514  * impact namespace isolation.
1515  */
1516 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1517 {
1518 	skb_orphan(skb);
1519 	nf_reset(skb);
1520 
1521 	if (unlikely(!is_skb_forwardable(dev, skb))) {
1522 		atomic_long_inc(&dev->rx_dropped);
1523 		kfree_skb(skb);
1524 		return NET_RX_DROP;
1525 	}
1526 	skb_set_dev(skb, dev);
1527 	skb->tstamp.tv64 = 0;
1528 	skb->pkt_type = PACKET_HOST;
1529 	skb->protocol = eth_type_trans(skb, dev);
1530 	return netif_rx(skb);
1531 }
1532 EXPORT_SYMBOL_GPL(dev_forward_skb);
1533 
1534 static inline int deliver_skb(struct sk_buff *skb,
1535 			      struct packet_type *pt_prev,
1536 			      struct net_device *orig_dev)
1537 {
1538 	atomic_inc(&skb->users);
1539 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1540 }
1541 
1542 /*
1543  *	Support routine. Sends outgoing frames to any network
1544  *	taps currently in use.
1545  */
1546 
1547 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1548 {
1549 	struct packet_type *ptype;
1550 	struct sk_buff *skb2 = NULL;
1551 	struct packet_type *pt_prev = NULL;
1552 
1553 	rcu_read_lock();
1554 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1555 		/* Never send packets back to the socket
1556 		 * they originated from - MvS (miquels@drinkel.ow.org)
1557 		 */
1558 		if ((ptype->dev == dev || !ptype->dev) &&
1559 		    (ptype->af_packet_priv == NULL ||
1560 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1561 			if (pt_prev) {
1562 				deliver_skb(skb2, pt_prev, skb->dev);
1563 				pt_prev = ptype;
1564 				continue;
1565 			}
1566 
1567 			skb2 = skb_clone(skb, GFP_ATOMIC);
1568 			if (!skb2)
1569 				break;
1570 
1571 			net_timestamp_set(skb2);
1572 
1573 			/* skb->nh should be correctly
1574 			   set by sender, so that the second statement is
1575 			   just protection against buggy protocols.
1576 			 */
1577 			skb_reset_mac_header(skb2);
1578 
1579 			if (skb_network_header(skb2) < skb2->data ||
1580 			    skb2->network_header > skb2->tail) {
1581 				if (net_ratelimit())
1582 					printk(KERN_CRIT "protocol %04x is "
1583 					       "buggy, dev %s\n",
1584 					       ntohs(skb2->protocol),
1585 					       dev->name);
1586 				skb_reset_network_header(skb2);
1587 			}
1588 
1589 			skb2->transport_header = skb2->network_header;
1590 			skb2->pkt_type = PACKET_OUTGOING;
1591 			pt_prev = ptype;
1592 		}
1593 	}
1594 	if (pt_prev)
1595 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1596 	rcu_read_unlock();
1597 }
1598 
1599 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1600  * @dev: Network device
1601  * @txq: number of queues available
1602  *
1603  * If real_num_tx_queues is changed the tc mappings may no longer be
1604  * valid. To resolve this verify the tc mapping remains valid and if
1605  * not NULL the mapping. With no priorities mapping to this
1606  * offset/count pair it will no longer be used. In the worst case TC0
1607  * is invalid nothing can be done so disable priority mappings. If is
1608  * expected that drivers will fix this mapping if they can before
1609  * calling netif_set_real_num_tx_queues.
1610  */
1611 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1612 {
1613 	int i;
1614 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1615 
1616 	/* If TC0 is invalidated disable TC mapping */
1617 	if (tc->offset + tc->count > txq) {
1618 		pr_warning("Number of in use tx queues changed "
1619 			   "invalidating tc mappings. Priority "
1620 			   "traffic classification disabled!\n");
1621 		dev->num_tc = 0;
1622 		return;
1623 	}
1624 
1625 	/* Invalidated prio to tc mappings set to TC0 */
1626 	for (i = 1; i < TC_BITMASK + 1; i++) {
1627 		int q = netdev_get_prio_tc_map(dev, i);
1628 
1629 		tc = &dev->tc_to_txq[q];
1630 		if (tc->offset + tc->count > txq) {
1631 			pr_warning("Number of in use tx queues "
1632 				   "changed. Priority %i to tc "
1633 				   "mapping %i is no longer valid "
1634 				   "setting map to 0\n",
1635 				   i, q);
1636 			netdev_set_prio_tc_map(dev, i, 0);
1637 		}
1638 	}
1639 }
1640 
1641 /*
1642  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1643  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1644  */
1645 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1646 {
1647 	int rc;
1648 
1649 	if (txq < 1 || txq > dev->num_tx_queues)
1650 		return -EINVAL;
1651 
1652 	if (dev->reg_state == NETREG_REGISTERED ||
1653 	    dev->reg_state == NETREG_UNREGISTERING) {
1654 		ASSERT_RTNL();
1655 
1656 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1657 						  txq);
1658 		if (rc)
1659 			return rc;
1660 
1661 		if (dev->num_tc)
1662 			netif_setup_tc(dev, txq);
1663 
1664 		if (txq < dev->real_num_tx_queues)
1665 			qdisc_reset_all_tx_gt(dev, txq);
1666 	}
1667 
1668 	dev->real_num_tx_queues = txq;
1669 	return 0;
1670 }
1671 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1672 
1673 #ifdef CONFIG_RPS
1674 /**
1675  *	netif_set_real_num_rx_queues - set actual number of RX queues used
1676  *	@dev: Network device
1677  *	@rxq: Actual number of RX queues
1678  *
1679  *	This must be called either with the rtnl_lock held or before
1680  *	registration of the net device.  Returns 0 on success, or a
1681  *	negative error code.  If called before registration, it always
1682  *	succeeds.
1683  */
1684 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1685 {
1686 	int rc;
1687 
1688 	if (rxq < 1 || rxq > dev->num_rx_queues)
1689 		return -EINVAL;
1690 
1691 	if (dev->reg_state == NETREG_REGISTERED) {
1692 		ASSERT_RTNL();
1693 
1694 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1695 						  rxq);
1696 		if (rc)
1697 			return rc;
1698 	}
1699 
1700 	dev->real_num_rx_queues = rxq;
1701 	return 0;
1702 }
1703 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1704 #endif
1705 
1706 static inline void __netif_reschedule(struct Qdisc *q)
1707 {
1708 	struct softnet_data *sd;
1709 	unsigned long flags;
1710 
1711 	local_irq_save(flags);
1712 	sd = &__get_cpu_var(softnet_data);
1713 	q->next_sched = NULL;
1714 	*sd->output_queue_tailp = q;
1715 	sd->output_queue_tailp = &q->next_sched;
1716 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1717 	local_irq_restore(flags);
1718 }
1719 
1720 void __netif_schedule(struct Qdisc *q)
1721 {
1722 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1723 		__netif_reschedule(q);
1724 }
1725 EXPORT_SYMBOL(__netif_schedule);
1726 
1727 void dev_kfree_skb_irq(struct sk_buff *skb)
1728 {
1729 	if (atomic_dec_and_test(&skb->users)) {
1730 		struct softnet_data *sd;
1731 		unsigned long flags;
1732 
1733 		local_irq_save(flags);
1734 		sd = &__get_cpu_var(softnet_data);
1735 		skb->next = sd->completion_queue;
1736 		sd->completion_queue = skb;
1737 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1738 		local_irq_restore(flags);
1739 	}
1740 }
1741 EXPORT_SYMBOL(dev_kfree_skb_irq);
1742 
1743 void dev_kfree_skb_any(struct sk_buff *skb)
1744 {
1745 	if (in_irq() || irqs_disabled())
1746 		dev_kfree_skb_irq(skb);
1747 	else
1748 		dev_kfree_skb(skb);
1749 }
1750 EXPORT_SYMBOL(dev_kfree_skb_any);
1751 
1752 
1753 /**
1754  * netif_device_detach - mark device as removed
1755  * @dev: network device
1756  *
1757  * Mark device as removed from system and therefore no longer available.
1758  */
1759 void netif_device_detach(struct net_device *dev)
1760 {
1761 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1762 	    netif_running(dev)) {
1763 		netif_tx_stop_all_queues(dev);
1764 	}
1765 }
1766 EXPORT_SYMBOL(netif_device_detach);
1767 
1768 /**
1769  * netif_device_attach - mark device as attached
1770  * @dev: network device
1771  *
1772  * Mark device as attached from system and restart if needed.
1773  */
1774 void netif_device_attach(struct net_device *dev)
1775 {
1776 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1777 	    netif_running(dev)) {
1778 		netif_tx_wake_all_queues(dev);
1779 		__netdev_watchdog_up(dev);
1780 	}
1781 }
1782 EXPORT_SYMBOL(netif_device_attach);
1783 
1784 /**
1785  * skb_dev_set -- assign a new device to a buffer
1786  * @skb: buffer for the new device
1787  * @dev: network device
1788  *
1789  * If an skb is owned by a device already, we have to reset
1790  * all data private to the namespace a device belongs to
1791  * before assigning it a new device.
1792  */
1793 #ifdef CONFIG_NET_NS
1794 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1795 {
1796 	skb_dst_drop(skb);
1797 	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1798 		secpath_reset(skb);
1799 		nf_reset(skb);
1800 		skb_init_secmark(skb);
1801 		skb->mark = 0;
1802 		skb->priority = 0;
1803 		skb->nf_trace = 0;
1804 		skb->ipvs_property = 0;
1805 #ifdef CONFIG_NET_SCHED
1806 		skb->tc_index = 0;
1807 #endif
1808 	}
1809 	skb->dev = dev;
1810 }
1811 EXPORT_SYMBOL(skb_set_dev);
1812 #endif /* CONFIG_NET_NS */
1813 
1814 /*
1815  * Invalidate hardware checksum when packet is to be mangled, and
1816  * complete checksum manually on outgoing path.
1817  */
1818 int skb_checksum_help(struct sk_buff *skb)
1819 {
1820 	__wsum csum;
1821 	int ret = 0, offset;
1822 
1823 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1824 		goto out_set_summed;
1825 
1826 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1827 		/* Let GSO fix up the checksum. */
1828 		goto out_set_summed;
1829 	}
1830 
1831 	offset = skb_checksum_start_offset(skb);
1832 	BUG_ON(offset >= skb_headlen(skb));
1833 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1834 
1835 	offset += skb->csum_offset;
1836 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1837 
1838 	if (skb_cloned(skb) &&
1839 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1840 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1841 		if (ret)
1842 			goto out;
1843 	}
1844 
1845 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1846 out_set_summed:
1847 	skb->ip_summed = CHECKSUM_NONE;
1848 out:
1849 	return ret;
1850 }
1851 EXPORT_SYMBOL(skb_checksum_help);
1852 
1853 /**
1854  *	skb_gso_segment - Perform segmentation on skb.
1855  *	@skb: buffer to segment
1856  *	@features: features for the output path (see dev->features)
1857  *
1858  *	This function segments the given skb and returns a list of segments.
1859  *
1860  *	It may return NULL if the skb requires no segmentation.  This is
1861  *	only possible when GSO is used for verifying header integrity.
1862  */
1863 struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
1864 {
1865 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1866 	struct packet_type *ptype;
1867 	__be16 type = skb->protocol;
1868 	int vlan_depth = ETH_HLEN;
1869 	int err;
1870 
1871 	while (type == htons(ETH_P_8021Q)) {
1872 		struct vlan_hdr *vh;
1873 
1874 		if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
1875 			return ERR_PTR(-EINVAL);
1876 
1877 		vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1878 		type = vh->h_vlan_encapsulated_proto;
1879 		vlan_depth += VLAN_HLEN;
1880 	}
1881 
1882 	skb_reset_mac_header(skb);
1883 	skb->mac_len = skb->network_header - skb->mac_header;
1884 	__skb_pull(skb, skb->mac_len);
1885 
1886 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1887 		struct net_device *dev = skb->dev;
1888 		struct ethtool_drvinfo info = {};
1889 
1890 		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1891 			dev->ethtool_ops->get_drvinfo(dev, &info);
1892 
1893 		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1894 		     info.driver, dev ? dev->features : 0L,
1895 		     skb->sk ? skb->sk->sk_route_caps : 0L,
1896 		     skb->len, skb->data_len, skb->ip_summed);
1897 
1898 		if (skb_header_cloned(skb) &&
1899 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1900 			return ERR_PTR(err);
1901 	}
1902 
1903 	rcu_read_lock();
1904 	list_for_each_entry_rcu(ptype,
1905 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1906 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1907 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1908 				err = ptype->gso_send_check(skb);
1909 				segs = ERR_PTR(err);
1910 				if (err || skb_gso_ok(skb, features))
1911 					break;
1912 				__skb_push(skb, (skb->data -
1913 						 skb_network_header(skb)));
1914 			}
1915 			segs = ptype->gso_segment(skb, features);
1916 			break;
1917 		}
1918 	}
1919 	rcu_read_unlock();
1920 
1921 	__skb_push(skb, skb->data - skb_mac_header(skb));
1922 
1923 	return segs;
1924 }
1925 EXPORT_SYMBOL(skb_gso_segment);
1926 
1927 /* Take action when hardware reception checksum errors are detected. */
1928 #ifdef CONFIG_BUG
1929 void netdev_rx_csum_fault(struct net_device *dev)
1930 {
1931 	if (net_ratelimit()) {
1932 		printk(KERN_ERR "%s: hw csum failure.\n",
1933 			dev ? dev->name : "<unknown>");
1934 		dump_stack();
1935 	}
1936 }
1937 EXPORT_SYMBOL(netdev_rx_csum_fault);
1938 #endif
1939 
1940 /* Actually, we should eliminate this check as soon as we know, that:
1941  * 1. IOMMU is present and allows to map all the memory.
1942  * 2. No high memory really exists on this machine.
1943  */
1944 
1945 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1946 {
1947 #ifdef CONFIG_HIGHMEM
1948 	int i;
1949 	if (!(dev->features & NETIF_F_HIGHDMA)) {
1950 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1951 			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1952 				return 1;
1953 	}
1954 
1955 	if (PCI_DMA_BUS_IS_PHYS) {
1956 		struct device *pdev = dev->dev.parent;
1957 
1958 		if (!pdev)
1959 			return 0;
1960 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1961 			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1962 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1963 				return 1;
1964 		}
1965 	}
1966 #endif
1967 	return 0;
1968 }
1969 
1970 struct dev_gso_cb {
1971 	void (*destructor)(struct sk_buff *skb);
1972 };
1973 
1974 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1975 
1976 static void dev_gso_skb_destructor(struct sk_buff *skb)
1977 {
1978 	struct dev_gso_cb *cb;
1979 
1980 	do {
1981 		struct sk_buff *nskb = skb->next;
1982 
1983 		skb->next = nskb->next;
1984 		nskb->next = NULL;
1985 		kfree_skb(nskb);
1986 	} while (skb->next);
1987 
1988 	cb = DEV_GSO_CB(skb);
1989 	if (cb->destructor)
1990 		cb->destructor(skb);
1991 }
1992 
1993 /**
1994  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1995  *	@skb: buffer to segment
1996  *	@features: device features as applicable to this skb
1997  *
1998  *	This function segments the given skb and stores the list of segments
1999  *	in skb->next.
2000  */
2001 static int dev_gso_segment(struct sk_buff *skb, int features)
2002 {
2003 	struct sk_buff *segs;
2004 
2005 	segs = skb_gso_segment(skb, features);
2006 
2007 	/* Verifying header integrity only. */
2008 	if (!segs)
2009 		return 0;
2010 
2011 	if (IS_ERR(segs))
2012 		return PTR_ERR(segs);
2013 
2014 	skb->next = segs;
2015 	DEV_GSO_CB(skb)->destructor = skb->destructor;
2016 	skb->destructor = dev_gso_skb_destructor;
2017 
2018 	return 0;
2019 }
2020 
2021 /*
2022  * Try to orphan skb early, right before transmission by the device.
2023  * We cannot orphan skb if tx timestamp is requested or the sk-reference
2024  * is needed on driver level for other reasons, e.g. see net/can/raw.c
2025  */
2026 static inline void skb_orphan_try(struct sk_buff *skb)
2027 {
2028 	struct sock *sk = skb->sk;
2029 
2030 	if (sk && !skb_shinfo(skb)->tx_flags) {
2031 		/* skb_tx_hash() wont be able to get sk.
2032 		 * We copy sk_hash into skb->rxhash
2033 		 */
2034 		if (!skb->rxhash)
2035 			skb->rxhash = sk->sk_hash;
2036 		skb_orphan(skb);
2037 	}
2038 }
2039 
2040 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
2041 {
2042 	return ((features & NETIF_F_GEN_CSUM) ||
2043 		((features & NETIF_F_V4_CSUM) &&
2044 		 protocol == htons(ETH_P_IP)) ||
2045 		((features & NETIF_F_V6_CSUM) &&
2046 		 protocol == htons(ETH_P_IPV6)) ||
2047 		((features & NETIF_F_FCOE_CRC) &&
2048 		 protocol == htons(ETH_P_FCOE)));
2049 }
2050 
2051 static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
2052 {
2053 	if (!can_checksum_protocol(features, protocol)) {
2054 		features &= ~NETIF_F_ALL_CSUM;
2055 		features &= ~NETIF_F_SG;
2056 	} else if (illegal_highdma(skb->dev, skb)) {
2057 		features &= ~NETIF_F_SG;
2058 	}
2059 
2060 	return features;
2061 }
2062 
2063 u32 netif_skb_features(struct sk_buff *skb)
2064 {
2065 	__be16 protocol = skb->protocol;
2066 	u32 features = skb->dev->features;
2067 
2068 	if (protocol == htons(ETH_P_8021Q)) {
2069 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2070 		protocol = veh->h_vlan_encapsulated_proto;
2071 	} else if (!vlan_tx_tag_present(skb)) {
2072 		return harmonize_features(skb, protocol, features);
2073 	}
2074 
2075 	features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2076 
2077 	if (protocol != htons(ETH_P_8021Q)) {
2078 		return harmonize_features(skb, protocol, features);
2079 	} else {
2080 		features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2081 				NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2082 		return harmonize_features(skb, protocol, features);
2083 	}
2084 }
2085 EXPORT_SYMBOL(netif_skb_features);
2086 
2087 /*
2088  * Returns true if either:
2089  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2090  *	2. skb is fragmented and the device does not support SG, or if
2091  *	   at least one of fragments is in highmem and device does not
2092  *	   support DMA from it.
2093  */
2094 static inline int skb_needs_linearize(struct sk_buff *skb,
2095 				      int features)
2096 {
2097 	return skb_is_nonlinear(skb) &&
2098 			((skb_has_frag_list(skb) &&
2099 				!(features & NETIF_F_FRAGLIST)) ||
2100 			(skb_shinfo(skb)->nr_frags &&
2101 				!(features & NETIF_F_SG)));
2102 }
2103 
2104 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2105 			struct netdev_queue *txq)
2106 {
2107 	const struct net_device_ops *ops = dev->netdev_ops;
2108 	int rc = NETDEV_TX_OK;
2109 	unsigned int skb_len;
2110 
2111 	if (likely(!skb->next)) {
2112 		u32 features;
2113 
2114 		/*
2115 		 * If device doesn't need skb->dst, release it right now while
2116 		 * its hot in this cpu cache
2117 		 */
2118 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2119 			skb_dst_drop(skb);
2120 
2121 		if (!list_empty(&ptype_all))
2122 			dev_queue_xmit_nit(skb, dev);
2123 
2124 		skb_orphan_try(skb);
2125 
2126 		features = netif_skb_features(skb);
2127 
2128 		if (vlan_tx_tag_present(skb) &&
2129 		    !(features & NETIF_F_HW_VLAN_TX)) {
2130 			skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2131 			if (unlikely(!skb))
2132 				goto out;
2133 
2134 			skb->vlan_tci = 0;
2135 		}
2136 
2137 		if (netif_needs_gso(skb, features)) {
2138 			if (unlikely(dev_gso_segment(skb, features)))
2139 				goto out_kfree_skb;
2140 			if (skb->next)
2141 				goto gso;
2142 		} else {
2143 			if (skb_needs_linearize(skb, features) &&
2144 			    __skb_linearize(skb))
2145 				goto out_kfree_skb;
2146 
2147 			/* If packet is not checksummed and device does not
2148 			 * support checksumming for this protocol, complete
2149 			 * checksumming here.
2150 			 */
2151 			if (skb->ip_summed == CHECKSUM_PARTIAL) {
2152 				skb_set_transport_header(skb,
2153 					skb_checksum_start_offset(skb));
2154 				if (!(features & NETIF_F_ALL_CSUM) &&
2155 				     skb_checksum_help(skb))
2156 					goto out_kfree_skb;
2157 			}
2158 		}
2159 
2160 		skb_len = skb->len;
2161 		rc = ops->ndo_start_xmit(skb, dev);
2162 		trace_net_dev_xmit(skb, rc, dev, skb_len);
2163 		if (rc == NETDEV_TX_OK)
2164 			txq_trans_update(txq);
2165 		return rc;
2166 	}
2167 
2168 gso:
2169 	do {
2170 		struct sk_buff *nskb = skb->next;
2171 
2172 		skb->next = nskb->next;
2173 		nskb->next = NULL;
2174 
2175 		/*
2176 		 * If device doesn't need nskb->dst, release it right now while
2177 		 * its hot in this cpu cache
2178 		 */
2179 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2180 			skb_dst_drop(nskb);
2181 
2182 		skb_len = nskb->len;
2183 		rc = ops->ndo_start_xmit(nskb, dev);
2184 		trace_net_dev_xmit(nskb, rc, dev, skb_len);
2185 		if (unlikely(rc != NETDEV_TX_OK)) {
2186 			if (rc & ~NETDEV_TX_MASK)
2187 				goto out_kfree_gso_skb;
2188 			nskb->next = skb->next;
2189 			skb->next = nskb;
2190 			return rc;
2191 		}
2192 		txq_trans_update(txq);
2193 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
2194 			return NETDEV_TX_BUSY;
2195 	} while (skb->next);
2196 
2197 out_kfree_gso_skb:
2198 	if (likely(skb->next == NULL))
2199 		skb->destructor = DEV_GSO_CB(skb)->destructor;
2200 out_kfree_skb:
2201 	kfree_skb(skb);
2202 out:
2203 	return rc;
2204 }
2205 
2206 static u32 hashrnd __read_mostly;
2207 
2208 /*
2209  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2210  * to be used as a distribution range.
2211  */
2212 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2213 		  unsigned int num_tx_queues)
2214 {
2215 	u32 hash;
2216 	u16 qoffset = 0;
2217 	u16 qcount = num_tx_queues;
2218 
2219 	if (skb_rx_queue_recorded(skb)) {
2220 		hash = skb_get_rx_queue(skb);
2221 		while (unlikely(hash >= num_tx_queues))
2222 			hash -= num_tx_queues;
2223 		return hash;
2224 	}
2225 
2226 	if (dev->num_tc) {
2227 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2228 		qoffset = dev->tc_to_txq[tc].offset;
2229 		qcount = dev->tc_to_txq[tc].count;
2230 	}
2231 
2232 	if (skb->sk && skb->sk->sk_hash)
2233 		hash = skb->sk->sk_hash;
2234 	else
2235 		hash = (__force u16) skb->protocol ^ skb->rxhash;
2236 	hash = jhash_1word(hash, hashrnd);
2237 
2238 	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2239 }
2240 EXPORT_SYMBOL(__skb_tx_hash);
2241 
2242 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2243 {
2244 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2245 		if (net_ratelimit()) {
2246 			pr_warning("%s selects TX queue %d, but "
2247 				"real number of TX queues is %d\n",
2248 				dev->name, queue_index, dev->real_num_tx_queues);
2249 		}
2250 		return 0;
2251 	}
2252 	return queue_index;
2253 }
2254 
2255 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2256 {
2257 #ifdef CONFIG_XPS
2258 	struct xps_dev_maps *dev_maps;
2259 	struct xps_map *map;
2260 	int queue_index = -1;
2261 
2262 	rcu_read_lock();
2263 	dev_maps = rcu_dereference(dev->xps_maps);
2264 	if (dev_maps) {
2265 		map = rcu_dereference(
2266 		    dev_maps->cpu_map[raw_smp_processor_id()]);
2267 		if (map) {
2268 			if (map->len == 1)
2269 				queue_index = map->queues[0];
2270 			else {
2271 				u32 hash;
2272 				if (skb->sk && skb->sk->sk_hash)
2273 					hash = skb->sk->sk_hash;
2274 				else
2275 					hash = (__force u16) skb->protocol ^
2276 					    skb->rxhash;
2277 				hash = jhash_1word(hash, hashrnd);
2278 				queue_index = map->queues[
2279 				    ((u64)hash * map->len) >> 32];
2280 			}
2281 			if (unlikely(queue_index >= dev->real_num_tx_queues))
2282 				queue_index = -1;
2283 		}
2284 	}
2285 	rcu_read_unlock();
2286 
2287 	return queue_index;
2288 #else
2289 	return -1;
2290 #endif
2291 }
2292 
2293 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2294 					struct sk_buff *skb)
2295 {
2296 	int queue_index;
2297 	const struct net_device_ops *ops = dev->netdev_ops;
2298 
2299 	if (dev->real_num_tx_queues == 1)
2300 		queue_index = 0;
2301 	else if (ops->ndo_select_queue) {
2302 		queue_index = ops->ndo_select_queue(dev, skb);
2303 		queue_index = dev_cap_txqueue(dev, queue_index);
2304 	} else {
2305 		struct sock *sk = skb->sk;
2306 		queue_index = sk_tx_queue_get(sk);
2307 
2308 		if (queue_index < 0 || skb->ooo_okay ||
2309 		    queue_index >= dev->real_num_tx_queues) {
2310 			int old_index = queue_index;
2311 
2312 			queue_index = get_xps_queue(dev, skb);
2313 			if (queue_index < 0)
2314 				queue_index = skb_tx_hash(dev, skb);
2315 
2316 			if (queue_index != old_index && sk) {
2317 				struct dst_entry *dst =
2318 				    rcu_dereference_check(sk->sk_dst_cache, 1);
2319 
2320 				if (dst && skb_dst(skb) == dst)
2321 					sk_tx_queue_set(sk, queue_index);
2322 			}
2323 		}
2324 	}
2325 
2326 	skb_set_queue_mapping(skb, queue_index);
2327 	return netdev_get_tx_queue(dev, queue_index);
2328 }
2329 
2330 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2331 				 struct net_device *dev,
2332 				 struct netdev_queue *txq)
2333 {
2334 	spinlock_t *root_lock = qdisc_lock(q);
2335 	bool contended;
2336 	int rc;
2337 
2338 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2339 	qdisc_calculate_pkt_len(skb, q);
2340 	/*
2341 	 * Heuristic to force contended enqueues to serialize on a
2342 	 * separate lock before trying to get qdisc main lock.
2343 	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2344 	 * and dequeue packets faster.
2345 	 */
2346 	contended = qdisc_is_running(q);
2347 	if (unlikely(contended))
2348 		spin_lock(&q->busylock);
2349 
2350 	spin_lock(root_lock);
2351 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2352 		kfree_skb(skb);
2353 		rc = NET_XMIT_DROP;
2354 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2355 		   qdisc_run_begin(q)) {
2356 		/*
2357 		 * This is a work-conserving queue; there are no old skbs
2358 		 * waiting to be sent out; and the qdisc is not running -
2359 		 * xmit the skb directly.
2360 		 */
2361 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2362 			skb_dst_force(skb);
2363 
2364 		qdisc_bstats_update(q, skb);
2365 
2366 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2367 			if (unlikely(contended)) {
2368 				spin_unlock(&q->busylock);
2369 				contended = false;
2370 			}
2371 			__qdisc_run(q);
2372 		} else
2373 			qdisc_run_end(q);
2374 
2375 		rc = NET_XMIT_SUCCESS;
2376 	} else {
2377 		skb_dst_force(skb);
2378 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2379 		if (qdisc_run_begin(q)) {
2380 			if (unlikely(contended)) {
2381 				spin_unlock(&q->busylock);
2382 				contended = false;
2383 			}
2384 			__qdisc_run(q);
2385 		}
2386 	}
2387 	spin_unlock(root_lock);
2388 	if (unlikely(contended))
2389 		spin_unlock(&q->busylock);
2390 	return rc;
2391 }
2392 
2393 static DEFINE_PER_CPU(int, xmit_recursion);
2394 #define RECURSION_LIMIT 10
2395 
2396 /**
2397  *	dev_queue_xmit - transmit a buffer
2398  *	@skb: buffer to transmit
2399  *
2400  *	Queue a buffer for transmission to a network device. The caller must
2401  *	have set the device and priority and built the buffer before calling
2402  *	this function. The function can be called from an interrupt.
2403  *
2404  *	A negative errno code is returned on a failure. A success does not
2405  *	guarantee the frame will be transmitted as it may be dropped due
2406  *	to congestion or traffic shaping.
2407  *
2408  * -----------------------------------------------------------------------------------
2409  *      I notice this method can also return errors from the queue disciplines,
2410  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2411  *      be positive.
2412  *
2413  *      Regardless of the return value, the skb is consumed, so it is currently
2414  *      difficult to retry a send to this method.  (You can bump the ref count
2415  *      before sending to hold a reference for retry if you are careful.)
2416  *
2417  *      When calling this method, interrupts MUST be enabled.  This is because
2418  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2419  *          --BLG
2420  */
2421 int dev_queue_xmit(struct sk_buff *skb)
2422 {
2423 	struct net_device *dev = skb->dev;
2424 	struct netdev_queue *txq;
2425 	struct Qdisc *q;
2426 	int rc = -ENOMEM;
2427 
2428 	/* Disable soft irqs for various locks below. Also
2429 	 * stops preemption for RCU.
2430 	 */
2431 	rcu_read_lock_bh();
2432 
2433 	txq = dev_pick_tx(dev, skb);
2434 	q = rcu_dereference_bh(txq->qdisc);
2435 
2436 #ifdef CONFIG_NET_CLS_ACT
2437 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2438 #endif
2439 	trace_net_dev_queue(skb);
2440 	if (q->enqueue) {
2441 		rc = __dev_xmit_skb(skb, q, dev, txq);
2442 		goto out;
2443 	}
2444 
2445 	/* The device has no queue. Common case for software devices:
2446 	   loopback, all the sorts of tunnels...
2447 
2448 	   Really, it is unlikely that netif_tx_lock protection is necessary
2449 	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2450 	   counters.)
2451 	   However, it is possible, that they rely on protection
2452 	   made by us here.
2453 
2454 	   Check this and shot the lock. It is not prone from deadlocks.
2455 	   Either shot noqueue qdisc, it is even simpler 8)
2456 	 */
2457 	if (dev->flags & IFF_UP) {
2458 		int cpu = smp_processor_id(); /* ok because BHs are off */
2459 
2460 		if (txq->xmit_lock_owner != cpu) {
2461 
2462 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2463 				goto recursion_alert;
2464 
2465 			HARD_TX_LOCK(dev, txq, cpu);
2466 
2467 			if (!netif_tx_queue_stopped(txq)) {
2468 				__this_cpu_inc(xmit_recursion);
2469 				rc = dev_hard_start_xmit(skb, dev, txq);
2470 				__this_cpu_dec(xmit_recursion);
2471 				if (dev_xmit_complete(rc)) {
2472 					HARD_TX_UNLOCK(dev, txq);
2473 					goto out;
2474 				}
2475 			}
2476 			HARD_TX_UNLOCK(dev, txq);
2477 			if (net_ratelimit())
2478 				printk(KERN_CRIT "Virtual device %s asks to "
2479 				       "queue packet!\n", dev->name);
2480 		} else {
2481 			/* Recursion is detected! It is possible,
2482 			 * unfortunately
2483 			 */
2484 recursion_alert:
2485 			if (net_ratelimit())
2486 				printk(KERN_CRIT "Dead loop on virtual device "
2487 				       "%s, fix it urgently!\n", dev->name);
2488 		}
2489 	}
2490 
2491 	rc = -ENETDOWN;
2492 	rcu_read_unlock_bh();
2493 
2494 	kfree_skb(skb);
2495 	return rc;
2496 out:
2497 	rcu_read_unlock_bh();
2498 	return rc;
2499 }
2500 EXPORT_SYMBOL(dev_queue_xmit);
2501 
2502 
2503 /*=======================================================================
2504 			Receiver routines
2505   =======================================================================*/
2506 
2507 int netdev_max_backlog __read_mostly = 1000;
2508 int netdev_tstamp_prequeue __read_mostly = 1;
2509 int netdev_budget __read_mostly = 300;
2510 int weight_p __read_mostly = 64;            /* old backlog weight */
2511 
2512 /* Called with irq disabled */
2513 static inline void ____napi_schedule(struct softnet_data *sd,
2514 				     struct napi_struct *napi)
2515 {
2516 	list_add_tail(&napi->poll_list, &sd->poll_list);
2517 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2518 }
2519 
2520 /*
2521  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2522  * and src/dst port numbers. Returns a non-zero hash number on success
2523  * and 0 on failure.
2524  */
2525 __u32 __skb_get_rxhash(struct sk_buff *skb)
2526 {
2527 	int nhoff, hash = 0, poff;
2528 	const struct ipv6hdr *ip6;
2529 	const struct iphdr *ip;
2530 	u8 ip_proto;
2531 	u32 addr1, addr2, ihl;
2532 	union {
2533 		u32 v32;
2534 		u16 v16[2];
2535 	} ports;
2536 
2537 	nhoff = skb_network_offset(skb);
2538 
2539 	switch (skb->protocol) {
2540 	case __constant_htons(ETH_P_IP):
2541 		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2542 			goto done;
2543 
2544 		ip = (const struct iphdr *) (skb->data + nhoff);
2545 		if (ip_is_fragment(ip))
2546 			ip_proto = 0;
2547 		else
2548 			ip_proto = ip->protocol;
2549 		addr1 = (__force u32) ip->saddr;
2550 		addr2 = (__force u32) ip->daddr;
2551 		ihl = ip->ihl;
2552 		break;
2553 	case __constant_htons(ETH_P_IPV6):
2554 		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2555 			goto done;
2556 
2557 		ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
2558 		ip_proto = ip6->nexthdr;
2559 		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2560 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2561 		ihl = (40 >> 2);
2562 		break;
2563 	default:
2564 		goto done;
2565 	}
2566 
2567 	ports.v32 = 0;
2568 	poff = proto_ports_offset(ip_proto);
2569 	if (poff >= 0) {
2570 		nhoff += ihl * 4 + poff;
2571 		if (pskb_may_pull(skb, nhoff + 4)) {
2572 			ports.v32 = * (__force u32 *) (skb->data + nhoff);
2573 			if (ports.v16[1] < ports.v16[0])
2574 				swap(ports.v16[0], ports.v16[1]);
2575 		}
2576 	}
2577 
2578 	/* get a consistent hash (same value on both flow directions) */
2579 	if (addr2 < addr1)
2580 		swap(addr1, addr2);
2581 
2582 	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2583 	if (!hash)
2584 		hash = 1;
2585 
2586 done:
2587 	return hash;
2588 }
2589 EXPORT_SYMBOL(__skb_get_rxhash);
2590 
2591 #ifdef CONFIG_RPS
2592 
2593 /* One global table that all flow-based protocols share. */
2594 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2595 EXPORT_SYMBOL(rps_sock_flow_table);
2596 
2597 static struct rps_dev_flow *
2598 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2599 	    struct rps_dev_flow *rflow, u16 next_cpu)
2600 {
2601 	u16 tcpu;
2602 
2603 	tcpu = rflow->cpu = next_cpu;
2604 	if (tcpu != RPS_NO_CPU) {
2605 #ifdef CONFIG_RFS_ACCEL
2606 		struct netdev_rx_queue *rxqueue;
2607 		struct rps_dev_flow_table *flow_table;
2608 		struct rps_dev_flow *old_rflow;
2609 		u32 flow_id;
2610 		u16 rxq_index;
2611 		int rc;
2612 
2613 		/* Should we steer this flow to a different hardware queue? */
2614 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2615 		    !(dev->features & NETIF_F_NTUPLE))
2616 			goto out;
2617 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2618 		if (rxq_index == skb_get_rx_queue(skb))
2619 			goto out;
2620 
2621 		rxqueue = dev->_rx + rxq_index;
2622 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
2623 		if (!flow_table)
2624 			goto out;
2625 		flow_id = skb->rxhash & flow_table->mask;
2626 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2627 							rxq_index, flow_id);
2628 		if (rc < 0)
2629 			goto out;
2630 		old_rflow = rflow;
2631 		rflow = &flow_table->flows[flow_id];
2632 		rflow->cpu = next_cpu;
2633 		rflow->filter = rc;
2634 		if (old_rflow->filter == rflow->filter)
2635 			old_rflow->filter = RPS_NO_FILTER;
2636 	out:
2637 #endif
2638 		rflow->last_qtail =
2639 			per_cpu(softnet_data, tcpu).input_queue_head;
2640 	}
2641 
2642 	return rflow;
2643 }
2644 
2645 /*
2646  * get_rps_cpu is called from netif_receive_skb and returns the target
2647  * CPU from the RPS map of the receiving queue for a given skb.
2648  * rcu_read_lock must be held on entry.
2649  */
2650 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2651 		       struct rps_dev_flow **rflowp)
2652 {
2653 	struct netdev_rx_queue *rxqueue;
2654 	struct rps_map *map;
2655 	struct rps_dev_flow_table *flow_table;
2656 	struct rps_sock_flow_table *sock_flow_table;
2657 	int cpu = -1;
2658 	u16 tcpu;
2659 
2660 	if (skb_rx_queue_recorded(skb)) {
2661 		u16 index = skb_get_rx_queue(skb);
2662 		if (unlikely(index >= dev->real_num_rx_queues)) {
2663 			WARN_ONCE(dev->real_num_rx_queues > 1,
2664 				  "%s received packet on queue %u, but number "
2665 				  "of RX queues is %u\n",
2666 				  dev->name, index, dev->real_num_rx_queues);
2667 			goto done;
2668 		}
2669 		rxqueue = dev->_rx + index;
2670 	} else
2671 		rxqueue = dev->_rx;
2672 
2673 	map = rcu_dereference(rxqueue->rps_map);
2674 	if (map) {
2675 		if (map->len == 1 &&
2676 		    !rcu_dereference_raw(rxqueue->rps_flow_table)) {
2677 			tcpu = map->cpus[0];
2678 			if (cpu_online(tcpu))
2679 				cpu = tcpu;
2680 			goto done;
2681 		}
2682 	} else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
2683 		goto done;
2684 	}
2685 
2686 	skb_reset_network_header(skb);
2687 	if (!skb_get_rxhash(skb))
2688 		goto done;
2689 
2690 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2691 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
2692 	if (flow_table && sock_flow_table) {
2693 		u16 next_cpu;
2694 		struct rps_dev_flow *rflow;
2695 
2696 		rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2697 		tcpu = rflow->cpu;
2698 
2699 		next_cpu = sock_flow_table->ents[skb->rxhash &
2700 		    sock_flow_table->mask];
2701 
2702 		/*
2703 		 * If the desired CPU (where last recvmsg was done) is
2704 		 * different from current CPU (one in the rx-queue flow
2705 		 * table entry), switch if one of the following holds:
2706 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
2707 		 *   - Current CPU is offline.
2708 		 *   - The current CPU's queue tail has advanced beyond the
2709 		 *     last packet that was enqueued using this table entry.
2710 		 *     This guarantees that all previous packets for the flow
2711 		 *     have been dequeued, thus preserving in order delivery.
2712 		 */
2713 		if (unlikely(tcpu != next_cpu) &&
2714 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2715 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2716 		      rflow->last_qtail)) >= 0))
2717 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2718 
2719 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2720 			*rflowp = rflow;
2721 			cpu = tcpu;
2722 			goto done;
2723 		}
2724 	}
2725 
2726 	if (map) {
2727 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2728 
2729 		if (cpu_online(tcpu)) {
2730 			cpu = tcpu;
2731 			goto done;
2732 		}
2733 	}
2734 
2735 done:
2736 	return cpu;
2737 }
2738 
2739 #ifdef CONFIG_RFS_ACCEL
2740 
2741 /**
2742  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2743  * @dev: Device on which the filter was set
2744  * @rxq_index: RX queue index
2745  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2746  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2747  *
2748  * Drivers that implement ndo_rx_flow_steer() should periodically call
2749  * this function for each installed filter and remove the filters for
2750  * which it returns %true.
2751  */
2752 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2753 			 u32 flow_id, u16 filter_id)
2754 {
2755 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2756 	struct rps_dev_flow_table *flow_table;
2757 	struct rps_dev_flow *rflow;
2758 	bool expire = true;
2759 	int cpu;
2760 
2761 	rcu_read_lock();
2762 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
2763 	if (flow_table && flow_id <= flow_table->mask) {
2764 		rflow = &flow_table->flows[flow_id];
2765 		cpu = ACCESS_ONCE(rflow->cpu);
2766 		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2767 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2768 			   rflow->last_qtail) <
2769 		     (int)(10 * flow_table->mask)))
2770 			expire = false;
2771 	}
2772 	rcu_read_unlock();
2773 	return expire;
2774 }
2775 EXPORT_SYMBOL(rps_may_expire_flow);
2776 
2777 #endif /* CONFIG_RFS_ACCEL */
2778 
2779 /* Called from hardirq (IPI) context */
2780 static void rps_trigger_softirq(void *data)
2781 {
2782 	struct softnet_data *sd = data;
2783 
2784 	____napi_schedule(sd, &sd->backlog);
2785 	sd->received_rps++;
2786 }
2787 
2788 #endif /* CONFIG_RPS */
2789 
2790 /*
2791  * Check if this softnet_data structure is another cpu one
2792  * If yes, queue it to our IPI list and return 1
2793  * If no, return 0
2794  */
2795 static int rps_ipi_queued(struct softnet_data *sd)
2796 {
2797 #ifdef CONFIG_RPS
2798 	struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2799 
2800 	if (sd != mysd) {
2801 		sd->rps_ipi_next = mysd->rps_ipi_list;
2802 		mysd->rps_ipi_list = sd;
2803 
2804 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2805 		return 1;
2806 	}
2807 #endif /* CONFIG_RPS */
2808 	return 0;
2809 }
2810 
2811 /*
2812  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2813  * queue (may be a remote CPU queue).
2814  */
2815 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2816 			      unsigned int *qtail)
2817 {
2818 	struct softnet_data *sd;
2819 	unsigned long flags;
2820 
2821 	sd = &per_cpu(softnet_data, cpu);
2822 
2823 	local_irq_save(flags);
2824 
2825 	rps_lock(sd);
2826 	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2827 		if (skb_queue_len(&sd->input_pkt_queue)) {
2828 enqueue:
2829 			__skb_queue_tail(&sd->input_pkt_queue, skb);
2830 			input_queue_tail_incr_save(sd, qtail);
2831 			rps_unlock(sd);
2832 			local_irq_restore(flags);
2833 			return NET_RX_SUCCESS;
2834 		}
2835 
2836 		/* Schedule NAPI for backlog device
2837 		 * We can use non atomic operation since we own the queue lock
2838 		 */
2839 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
2840 			if (!rps_ipi_queued(sd))
2841 				____napi_schedule(sd, &sd->backlog);
2842 		}
2843 		goto enqueue;
2844 	}
2845 
2846 	sd->dropped++;
2847 	rps_unlock(sd);
2848 
2849 	local_irq_restore(flags);
2850 
2851 	atomic_long_inc(&skb->dev->rx_dropped);
2852 	kfree_skb(skb);
2853 	return NET_RX_DROP;
2854 }
2855 
2856 /**
2857  *	netif_rx	-	post buffer to the network code
2858  *	@skb: buffer to post
2859  *
2860  *	This function receives a packet from a device driver and queues it for
2861  *	the upper (protocol) levels to process.  It always succeeds. The buffer
2862  *	may be dropped during processing for congestion control or by the
2863  *	protocol layers.
2864  *
2865  *	return values:
2866  *	NET_RX_SUCCESS	(no congestion)
2867  *	NET_RX_DROP     (packet was dropped)
2868  *
2869  */
2870 
2871 int netif_rx(struct sk_buff *skb)
2872 {
2873 	int ret;
2874 
2875 	/* if netpoll wants it, pretend we never saw it */
2876 	if (netpoll_rx(skb))
2877 		return NET_RX_DROP;
2878 
2879 	if (netdev_tstamp_prequeue)
2880 		net_timestamp_check(skb);
2881 
2882 	trace_netif_rx(skb);
2883 #ifdef CONFIG_RPS
2884 	{
2885 		struct rps_dev_flow voidflow, *rflow = &voidflow;
2886 		int cpu;
2887 
2888 		preempt_disable();
2889 		rcu_read_lock();
2890 
2891 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
2892 		if (cpu < 0)
2893 			cpu = smp_processor_id();
2894 
2895 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2896 
2897 		rcu_read_unlock();
2898 		preempt_enable();
2899 	}
2900 #else
2901 	{
2902 		unsigned int qtail;
2903 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2904 		put_cpu();
2905 	}
2906 #endif
2907 	return ret;
2908 }
2909 EXPORT_SYMBOL(netif_rx);
2910 
2911 int netif_rx_ni(struct sk_buff *skb)
2912 {
2913 	int err;
2914 
2915 	preempt_disable();
2916 	err = netif_rx(skb);
2917 	if (local_softirq_pending())
2918 		do_softirq();
2919 	preempt_enable();
2920 
2921 	return err;
2922 }
2923 EXPORT_SYMBOL(netif_rx_ni);
2924 
2925 static void net_tx_action(struct softirq_action *h)
2926 {
2927 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2928 
2929 	if (sd->completion_queue) {
2930 		struct sk_buff *clist;
2931 
2932 		local_irq_disable();
2933 		clist = sd->completion_queue;
2934 		sd->completion_queue = NULL;
2935 		local_irq_enable();
2936 
2937 		while (clist) {
2938 			struct sk_buff *skb = clist;
2939 			clist = clist->next;
2940 
2941 			WARN_ON(atomic_read(&skb->users));
2942 			trace_kfree_skb(skb, net_tx_action);
2943 			__kfree_skb(skb);
2944 		}
2945 	}
2946 
2947 	if (sd->output_queue) {
2948 		struct Qdisc *head;
2949 
2950 		local_irq_disable();
2951 		head = sd->output_queue;
2952 		sd->output_queue = NULL;
2953 		sd->output_queue_tailp = &sd->output_queue;
2954 		local_irq_enable();
2955 
2956 		while (head) {
2957 			struct Qdisc *q = head;
2958 			spinlock_t *root_lock;
2959 
2960 			head = head->next_sched;
2961 
2962 			root_lock = qdisc_lock(q);
2963 			if (spin_trylock(root_lock)) {
2964 				smp_mb__before_clear_bit();
2965 				clear_bit(__QDISC_STATE_SCHED,
2966 					  &q->state);
2967 				qdisc_run(q);
2968 				spin_unlock(root_lock);
2969 			} else {
2970 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2971 					      &q->state)) {
2972 					__netif_reschedule(q);
2973 				} else {
2974 					smp_mb__before_clear_bit();
2975 					clear_bit(__QDISC_STATE_SCHED,
2976 						  &q->state);
2977 				}
2978 			}
2979 		}
2980 	}
2981 }
2982 
2983 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2984     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2985 /* This hook is defined here for ATM LANE */
2986 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2987 			     unsigned char *addr) __read_mostly;
2988 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2989 #endif
2990 
2991 #ifdef CONFIG_NET_CLS_ACT
2992 /* TODO: Maybe we should just force sch_ingress to be compiled in
2993  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2994  * a compare and 2 stores extra right now if we dont have it on
2995  * but have CONFIG_NET_CLS_ACT
2996  * NOTE: This doesn't stop any functionality; if you dont have
2997  * the ingress scheduler, you just can't add policies on ingress.
2998  *
2999  */
3000 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3001 {
3002 	struct net_device *dev = skb->dev;
3003 	u32 ttl = G_TC_RTTL(skb->tc_verd);
3004 	int result = TC_ACT_OK;
3005 	struct Qdisc *q;
3006 
3007 	if (unlikely(MAX_RED_LOOP < ttl++)) {
3008 		if (net_ratelimit())
3009 			pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
3010 			       skb->skb_iif, dev->ifindex);
3011 		return TC_ACT_SHOT;
3012 	}
3013 
3014 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3015 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3016 
3017 	q = rxq->qdisc;
3018 	if (q != &noop_qdisc) {
3019 		spin_lock(qdisc_lock(q));
3020 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3021 			result = qdisc_enqueue_root(skb, q);
3022 		spin_unlock(qdisc_lock(q));
3023 	}
3024 
3025 	return result;
3026 }
3027 
3028 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3029 					 struct packet_type **pt_prev,
3030 					 int *ret, struct net_device *orig_dev)
3031 {
3032 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3033 
3034 	if (!rxq || rxq->qdisc == &noop_qdisc)
3035 		goto out;
3036 
3037 	if (*pt_prev) {
3038 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3039 		*pt_prev = NULL;
3040 	}
3041 
3042 	switch (ing_filter(skb, rxq)) {
3043 	case TC_ACT_SHOT:
3044 	case TC_ACT_STOLEN:
3045 		kfree_skb(skb);
3046 		return NULL;
3047 	}
3048 
3049 out:
3050 	skb->tc_verd = 0;
3051 	return skb;
3052 }
3053 #endif
3054 
3055 /**
3056  *	netdev_rx_handler_register - register receive handler
3057  *	@dev: device to register a handler for
3058  *	@rx_handler: receive handler to register
3059  *	@rx_handler_data: data pointer that is used by rx handler
3060  *
3061  *	Register a receive hander for a device. This handler will then be
3062  *	called from __netif_receive_skb. A negative errno code is returned
3063  *	on a failure.
3064  *
3065  *	The caller must hold the rtnl_mutex.
3066  *
3067  *	For a general description of rx_handler, see enum rx_handler_result.
3068  */
3069 int netdev_rx_handler_register(struct net_device *dev,
3070 			       rx_handler_func_t *rx_handler,
3071 			       void *rx_handler_data)
3072 {
3073 	ASSERT_RTNL();
3074 
3075 	if (dev->rx_handler)
3076 		return -EBUSY;
3077 
3078 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3079 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3080 
3081 	return 0;
3082 }
3083 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3084 
3085 /**
3086  *	netdev_rx_handler_unregister - unregister receive handler
3087  *	@dev: device to unregister a handler from
3088  *
3089  *	Unregister a receive hander from a device.
3090  *
3091  *	The caller must hold the rtnl_mutex.
3092  */
3093 void netdev_rx_handler_unregister(struct net_device *dev)
3094 {
3095 
3096 	ASSERT_RTNL();
3097 	rcu_assign_pointer(dev->rx_handler, NULL);
3098 	rcu_assign_pointer(dev->rx_handler_data, NULL);
3099 }
3100 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3101 
3102 static int __netif_receive_skb(struct sk_buff *skb)
3103 {
3104 	struct packet_type *ptype, *pt_prev;
3105 	rx_handler_func_t *rx_handler;
3106 	struct net_device *orig_dev;
3107 	struct net_device *null_or_dev;
3108 	bool deliver_exact = false;
3109 	int ret = NET_RX_DROP;
3110 	__be16 type;
3111 
3112 	if (!netdev_tstamp_prequeue)
3113 		net_timestamp_check(skb);
3114 
3115 	trace_netif_receive_skb(skb);
3116 
3117 	/* if we've gotten here through NAPI, check netpoll */
3118 	if (netpoll_receive_skb(skb))
3119 		return NET_RX_DROP;
3120 
3121 	if (!skb->skb_iif)
3122 		skb->skb_iif = skb->dev->ifindex;
3123 	orig_dev = skb->dev;
3124 
3125 	skb_reset_network_header(skb);
3126 	skb_reset_transport_header(skb);
3127 	skb_reset_mac_len(skb);
3128 
3129 	pt_prev = NULL;
3130 
3131 	rcu_read_lock();
3132 
3133 another_round:
3134 
3135 	__this_cpu_inc(softnet_data.processed);
3136 
3137 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3138 		skb = vlan_untag(skb);
3139 		if (unlikely(!skb))
3140 			goto out;
3141 	}
3142 
3143 #ifdef CONFIG_NET_CLS_ACT
3144 	if (skb->tc_verd & TC_NCLS) {
3145 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3146 		goto ncls;
3147 	}
3148 #endif
3149 
3150 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3151 		if (!ptype->dev || ptype->dev == skb->dev) {
3152 			if (pt_prev)
3153 				ret = deliver_skb(skb, pt_prev, orig_dev);
3154 			pt_prev = ptype;
3155 		}
3156 	}
3157 
3158 #ifdef CONFIG_NET_CLS_ACT
3159 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3160 	if (!skb)
3161 		goto out;
3162 ncls:
3163 #endif
3164 
3165 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3166 	if (rx_handler) {
3167 		if (pt_prev) {
3168 			ret = deliver_skb(skb, pt_prev, orig_dev);
3169 			pt_prev = NULL;
3170 		}
3171 		switch (rx_handler(&skb)) {
3172 		case RX_HANDLER_CONSUMED:
3173 			goto out;
3174 		case RX_HANDLER_ANOTHER:
3175 			goto another_round;
3176 		case RX_HANDLER_EXACT:
3177 			deliver_exact = true;
3178 		case RX_HANDLER_PASS:
3179 			break;
3180 		default:
3181 			BUG();
3182 		}
3183 	}
3184 
3185 	if (vlan_tx_tag_present(skb)) {
3186 		if (pt_prev) {
3187 			ret = deliver_skb(skb, pt_prev, orig_dev);
3188 			pt_prev = NULL;
3189 		}
3190 		if (vlan_do_receive(&skb)) {
3191 			ret = __netif_receive_skb(skb);
3192 			goto out;
3193 		} else if (unlikely(!skb))
3194 			goto out;
3195 	}
3196 
3197 	/* deliver only exact match when indicated */
3198 	null_or_dev = deliver_exact ? skb->dev : NULL;
3199 
3200 	type = skb->protocol;
3201 	list_for_each_entry_rcu(ptype,
3202 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3203 		if (ptype->type == type &&
3204 		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3205 		     ptype->dev == orig_dev)) {
3206 			if (pt_prev)
3207 				ret = deliver_skb(skb, pt_prev, orig_dev);
3208 			pt_prev = ptype;
3209 		}
3210 	}
3211 
3212 	if (pt_prev) {
3213 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3214 	} else {
3215 		atomic_long_inc(&skb->dev->rx_dropped);
3216 		kfree_skb(skb);
3217 		/* Jamal, now you will not able to escape explaining
3218 		 * me how you were going to use this. :-)
3219 		 */
3220 		ret = NET_RX_DROP;
3221 	}
3222 
3223 out:
3224 	rcu_read_unlock();
3225 	return ret;
3226 }
3227 
3228 /**
3229  *	netif_receive_skb - process receive buffer from network
3230  *	@skb: buffer to process
3231  *
3232  *	netif_receive_skb() is the main receive data processing function.
3233  *	It always succeeds. The buffer may be dropped during processing
3234  *	for congestion control or by the protocol layers.
3235  *
3236  *	This function may only be called from softirq context and interrupts
3237  *	should be enabled.
3238  *
3239  *	Return values (usually ignored):
3240  *	NET_RX_SUCCESS: no congestion
3241  *	NET_RX_DROP: packet was dropped
3242  */
3243 int netif_receive_skb(struct sk_buff *skb)
3244 {
3245 	if (netdev_tstamp_prequeue)
3246 		net_timestamp_check(skb);
3247 
3248 	if (skb_defer_rx_timestamp(skb))
3249 		return NET_RX_SUCCESS;
3250 
3251 #ifdef CONFIG_RPS
3252 	{
3253 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3254 		int cpu, ret;
3255 
3256 		rcu_read_lock();
3257 
3258 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3259 
3260 		if (cpu >= 0) {
3261 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3262 			rcu_read_unlock();
3263 		} else {
3264 			rcu_read_unlock();
3265 			ret = __netif_receive_skb(skb);
3266 		}
3267 
3268 		return ret;
3269 	}
3270 #else
3271 	return __netif_receive_skb(skb);
3272 #endif
3273 }
3274 EXPORT_SYMBOL(netif_receive_skb);
3275 
3276 /* Network device is going away, flush any packets still pending
3277  * Called with irqs disabled.
3278  */
3279 static void flush_backlog(void *arg)
3280 {
3281 	struct net_device *dev = arg;
3282 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3283 	struct sk_buff *skb, *tmp;
3284 
3285 	rps_lock(sd);
3286 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3287 		if (skb->dev == dev) {
3288 			__skb_unlink(skb, &sd->input_pkt_queue);
3289 			kfree_skb(skb);
3290 			input_queue_head_incr(sd);
3291 		}
3292 	}
3293 	rps_unlock(sd);
3294 
3295 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3296 		if (skb->dev == dev) {
3297 			__skb_unlink(skb, &sd->process_queue);
3298 			kfree_skb(skb);
3299 			input_queue_head_incr(sd);
3300 		}
3301 	}
3302 }
3303 
3304 static int napi_gro_complete(struct sk_buff *skb)
3305 {
3306 	struct packet_type *ptype;
3307 	__be16 type = skb->protocol;
3308 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3309 	int err = -ENOENT;
3310 
3311 	if (NAPI_GRO_CB(skb)->count == 1) {
3312 		skb_shinfo(skb)->gso_size = 0;
3313 		goto out;
3314 	}
3315 
3316 	rcu_read_lock();
3317 	list_for_each_entry_rcu(ptype, head, list) {
3318 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3319 			continue;
3320 
3321 		err = ptype->gro_complete(skb);
3322 		break;
3323 	}
3324 	rcu_read_unlock();
3325 
3326 	if (err) {
3327 		WARN_ON(&ptype->list == head);
3328 		kfree_skb(skb);
3329 		return NET_RX_SUCCESS;
3330 	}
3331 
3332 out:
3333 	return netif_receive_skb(skb);
3334 }
3335 
3336 inline void napi_gro_flush(struct napi_struct *napi)
3337 {
3338 	struct sk_buff *skb, *next;
3339 
3340 	for (skb = napi->gro_list; skb; skb = next) {
3341 		next = skb->next;
3342 		skb->next = NULL;
3343 		napi_gro_complete(skb);
3344 	}
3345 
3346 	napi->gro_count = 0;
3347 	napi->gro_list = NULL;
3348 }
3349 EXPORT_SYMBOL(napi_gro_flush);
3350 
3351 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3352 {
3353 	struct sk_buff **pp = NULL;
3354 	struct packet_type *ptype;
3355 	__be16 type = skb->protocol;
3356 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3357 	int same_flow;
3358 	int mac_len;
3359 	enum gro_result ret;
3360 
3361 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3362 		goto normal;
3363 
3364 	if (skb_is_gso(skb) || skb_has_frag_list(skb))
3365 		goto normal;
3366 
3367 	rcu_read_lock();
3368 	list_for_each_entry_rcu(ptype, head, list) {
3369 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3370 			continue;
3371 
3372 		skb_set_network_header(skb, skb_gro_offset(skb));
3373 		mac_len = skb->network_header - skb->mac_header;
3374 		skb->mac_len = mac_len;
3375 		NAPI_GRO_CB(skb)->same_flow = 0;
3376 		NAPI_GRO_CB(skb)->flush = 0;
3377 		NAPI_GRO_CB(skb)->free = 0;
3378 
3379 		pp = ptype->gro_receive(&napi->gro_list, skb);
3380 		break;
3381 	}
3382 	rcu_read_unlock();
3383 
3384 	if (&ptype->list == head)
3385 		goto normal;
3386 
3387 	same_flow = NAPI_GRO_CB(skb)->same_flow;
3388 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3389 
3390 	if (pp) {
3391 		struct sk_buff *nskb = *pp;
3392 
3393 		*pp = nskb->next;
3394 		nskb->next = NULL;
3395 		napi_gro_complete(nskb);
3396 		napi->gro_count--;
3397 	}
3398 
3399 	if (same_flow)
3400 		goto ok;
3401 
3402 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3403 		goto normal;
3404 
3405 	napi->gro_count++;
3406 	NAPI_GRO_CB(skb)->count = 1;
3407 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3408 	skb->next = napi->gro_list;
3409 	napi->gro_list = skb;
3410 	ret = GRO_HELD;
3411 
3412 pull:
3413 	if (skb_headlen(skb) < skb_gro_offset(skb)) {
3414 		int grow = skb_gro_offset(skb) - skb_headlen(skb);
3415 
3416 		BUG_ON(skb->end - skb->tail < grow);
3417 
3418 		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3419 
3420 		skb->tail += grow;
3421 		skb->data_len -= grow;
3422 
3423 		skb_shinfo(skb)->frags[0].page_offset += grow;
3424 		skb_shinfo(skb)->frags[0].size -= grow;
3425 
3426 		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3427 			put_page(skb_shinfo(skb)->frags[0].page);
3428 			memmove(skb_shinfo(skb)->frags,
3429 				skb_shinfo(skb)->frags + 1,
3430 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3431 		}
3432 	}
3433 
3434 ok:
3435 	return ret;
3436 
3437 normal:
3438 	ret = GRO_NORMAL;
3439 	goto pull;
3440 }
3441 EXPORT_SYMBOL(dev_gro_receive);
3442 
3443 static inline gro_result_t
3444 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3445 {
3446 	struct sk_buff *p;
3447 
3448 	for (p = napi->gro_list; p; p = p->next) {
3449 		unsigned long diffs;
3450 
3451 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3452 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3453 		diffs |= compare_ether_header(skb_mac_header(p),
3454 					      skb_gro_mac_header(skb));
3455 		NAPI_GRO_CB(p)->same_flow = !diffs;
3456 		NAPI_GRO_CB(p)->flush = 0;
3457 	}
3458 
3459 	return dev_gro_receive(napi, skb);
3460 }
3461 
3462 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3463 {
3464 	switch (ret) {
3465 	case GRO_NORMAL:
3466 		if (netif_receive_skb(skb))
3467 			ret = GRO_DROP;
3468 		break;
3469 
3470 	case GRO_DROP:
3471 	case GRO_MERGED_FREE:
3472 		kfree_skb(skb);
3473 		break;
3474 
3475 	case GRO_HELD:
3476 	case GRO_MERGED:
3477 		break;
3478 	}
3479 
3480 	return ret;
3481 }
3482 EXPORT_SYMBOL(napi_skb_finish);
3483 
3484 void skb_gro_reset_offset(struct sk_buff *skb)
3485 {
3486 	NAPI_GRO_CB(skb)->data_offset = 0;
3487 	NAPI_GRO_CB(skb)->frag0 = NULL;
3488 	NAPI_GRO_CB(skb)->frag0_len = 0;
3489 
3490 	if (skb->mac_header == skb->tail &&
3491 	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
3492 		NAPI_GRO_CB(skb)->frag0 =
3493 			page_address(skb_shinfo(skb)->frags[0].page) +
3494 			skb_shinfo(skb)->frags[0].page_offset;
3495 		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3496 	}
3497 }
3498 EXPORT_SYMBOL(skb_gro_reset_offset);
3499 
3500 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3501 {
3502 	skb_gro_reset_offset(skb);
3503 
3504 	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3505 }
3506 EXPORT_SYMBOL(napi_gro_receive);
3507 
3508 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3509 {
3510 	__skb_pull(skb, skb_headlen(skb));
3511 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3512 	skb->vlan_tci = 0;
3513 	skb->dev = napi->dev;
3514 	skb->skb_iif = 0;
3515 
3516 	napi->skb = skb;
3517 }
3518 
3519 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3520 {
3521 	struct sk_buff *skb = napi->skb;
3522 
3523 	if (!skb) {
3524 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3525 		if (skb)
3526 			napi->skb = skb;
3527 	}
3528 	return skb;
3529 }
3530 EXPORT_SYMBOL(napi_get_frags);
3531 
3532 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3533 			       gro_result_t ret)
3534 {
3535 	switch (ret) {
3536 	case GRO_NORMAL:
3537 	case GRO_HELD:
3538 		skb->protocol = eth_type_trans(skb, skb->dev);
3539 
3540 		if (ret == GRO_HELD)
3541 			skb_gro_pull(skb, -ETH_HLEN);
3542 		else if (netif_receive_skb(skb))
3543 			ret = GRO_DROP;
3544 		break;
3545 
3546 	case GRO_DROP:
3547 	case GRO_MERGED_FREE:
3548 		napi_reuse_skb(napi, skb);
3549 		break;
3550 
3551 	case GRO_MERGED:
3552 		break;
3553 	}
3554 
3555 	return ret;
3556 }
3557 EXPORT_SYMBOL(napi_frags_finish);
3558 
3559 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3560 {
3561 	struct sk_buff *skb = napi->skb;
3562 	struct ethhdr *eth;
3563 	unsigned int hlen;
3564 	unsigned int off;
3565 
3566 	napi->skb = NULL;
3567 
3568 	skb_reset_mac_header(skb);
3569 	skb_gro_reset_offset(skb);
3570 
3571 	off = skb_gro_offset(skb);
3572 	hlen = off + sizeof(*eth);
3573 	eth = skb_gro_header_fast(skb, off);
3574 	if (skb_gro_header_hard(skb, hlen)) {
3575 		eth = skb_gro_header_slow(skb, hlen, off);
3576 		if (unlikely(!eth)) {
3577 			napi_reuse_skb(napi, skb);
3578 			skb = NULL;
3579 			goto out;
3580 		}
3581 	}
3582 
3583 	skb_gro_pull(skb, sizeof(*eth));
3584 
3585 	/*
3586 	 * This works because the only protocols we care about don't require
3587 	 * special handling.  We'll fix it up properly at the end.
3588 	 */
3589 	skb->protocol = eth->h_proto;
3590 
3591 out:
3592 	return skb;
3593 }
3594 EXPORT_SYMBOL(napi_frags_skb);
3595 
3596 gro_result_t napi_gro_frags(struct napi_struct *napi)
3597 {
3598 	struct sk_buff *skb = napi_frags_skb(napi);
3599 
3600 	if (!skb)
3601 		return GRO_DROP;
3602 
3603 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3604 }
3605 EXPORT_SYMBOL(napi_gro_frags);
3606 
3607 /*
3608  * net_rps_action sends any pending IPI's for rps.
3609  * Note: called with local irq disabled, but exits with local irq enabled.
3610  */
3611 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3612 {
3613 #ifdef CONFIG_RPS
3614 	struct softnet_data *remsd = sd->rps_ipi_list;
3615 
3616 	if (remsd) {
3617 		sd->rps_ipi_list = NULL;
3618 
3619 		local_irq_enable();
3620 
3621 		/* Send pending IPI's to kick RPS processing on remote cpus. */
3622 		while (remsd) {
3623 			struct softnet_data *next = remsd->rps_ipi_next;
3624 
3625 			if (cpu_online(remsd->cpu))
3626 				__smp_call_function_single(remsd->cpu,
3627 							   &remsd->csd, 0);
3628 			remsd = next;
3629 		}
3630 	} else
3631 #endif
3632 		local_irq_enable();
3633 }
3634 
3635 static int process_backlog(struct napi_struct *napi, int quota)
3636 {
3637 	int work = 0;
3638 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3639 
3640 #ifdef CONFIG_RPS
3641 	/* Check if we have pending ipi, its better to send them now,
3642 	 * not waiting net_rx_action() end.
3643 	 */
3644 	if (sd->rps_ipi_list) {
3645 		local_irq_disable();
3646 		net_rps_action_and_irq_enable(sd);
3647 	}
3648 #endif
3649 	napi->weight = weight_p;
3650 	local_irq_disable();
3651 	while (work < quota) {
3652 		struct sk_buff *skb;
3653 		unsigned int qlen;
3654 
3655 		while ((skb = __skb_dequeue(&sd->process_queue))) {
3656 			local_irq_enable();
3657 			__netif_receive_skb(skb);
3658 			local_irq_disable();
3659 			input_queue_head_incr(sd);
3660 			if (++work >= quota) {
3661 				local_irq_enable();
3662 				return work;
3663 			}
3664 		}
3665 
3666 		rps_lock(sd);
3667 		qlen = skb_queue_len(&sd->input_pkt_queue);
3668 		if (qlen)
3669 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
3670 						   &sd->process_queue);
3671 
3672 		if (qlen < quota - work) {
3673 			/*
3674 			 * Inline a custom version of __napi_complete().
3675 			 * only current cpu owns and manipulates this napi,
3676 			 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3677 			 * we can use a plain write instead of clear_bit(),
3678 			 * and we dont need an smp_mb() memory barrier.
3679 			 */
3680 			list_del(&napi->poll_list);
3681 			napi->state = 0;
3682 
3683 			quota = work + qlen;
3684 		}
3685 		rps_unlock(sd);
3686 	}
3687 	local_irq_enable();
3688 
3689 	return work;
3690 }
3691 
3692 /**
3693  * __napi_schedule - schedule for receive
3694  * @n: entry to schedule
3695  *
3696  * The entry's receive function will be scheduled to run
3697  */
3698 void __napi_schedule(struct napi_struct *n)
3699 {
3700 	unsigned long flags;
3701 
3702 	local_irq_save(flags);
3703 	____napi_schedule(&__get_cpu_var(softnet_data), n);
3704 	local_irq_restore(flags);
3705 }
3706 EXPORT_SYMBOL(__napi_schedule);
3707 
3708 void __napi_complete(struct napi_struct *n)
3709 {
3710 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3711 	BUG_ON(n->gro_list);
3712 
3713 	list_del(&n->poll_list);
3714 	smp_mb__before_clear_bit();
3715 	clear_bit(NAPI_STATE_SCHED, &n->state);
3716 }
3717 EXPORT_SYMBOL(__napi_complete);
3718 
3719 void napi_complete(struct napi_struct *n)
3720 {
3721 	unsigned long flags;
3722 
3723 	/*
3724 	 * don't let napi dequeue from the cpu poll list
3725 	 * just in case its running on a different cpu
3726 	 */
3727 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3728 		return;
3729 
3730 	napi_gro_flush(n);
3731 	local_irq_save(flags);
3732 	__napi_complete(n);
3733 	local_irq_restore(flags);
3734 }
3735 EXPORT_SYMBOL(napi_complete);
3736 
3737 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3738 		    int (*poll)(struct napi_struct *, int), int weight)
3739 {
3740 	INIT_LIST_HEAD(&napi->poll_list);
3741 	napi->gro_count = 0;
3742 	napi->gro_list = NULL;
3743 	napi->skb = NULL;
3744 	napi->poll = poll;
3745 	napi->weight = weight;
3746 	list_add(&napi->dev_list, &dev->napi_list);
3747 	napi->dev = dev;
3748 #ifdef CONFIG_NETPOLL
3749 	spin_lock_init(&napi->poll_lock);
3750 	napi->poll_owner = -1;
3751 #endif
3752 	set_bit(NAPI_STATE_SCHED, &napi->state);
3753 }
3754 EXPORT_SYMBOL(netif_napi_add);
3755 
3756 void netif_napi_del(struct napi_struct *napi)
3757 {
3758 	struct sk_buff *skb, *next;
3759 
3760 	list_del_init(&napi->dev_list);
3761 	napi_free_frags(napi);
3762 
3763 	for (skb = napi->gro_list; skb; skb = next) {
3764 		next = skb->next;
3765 		skb->next = NULL;
3766 		kfree_skb(skb);
3767 	}
3768 
3769 	napi->gro_list = NULL;
3770 	napi->gro_count = 0;
3771 }
3772 EXPORT_SYMBOL(netif_napi_del);
3773 
3774 static void net_rx_action(struct softirq_action *h)
3775 {
3776 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
3777 	unsigned long time_limit = jiffies + 2;
3778 	int budget = netdev_budget;
3779 	void *have;
3780 
3781 	local_irq_disable();
3782 
3783 	while (!list_empty(&sd->poll_list)) {
3784 		struct napi_struct *n;
3785 		int work, weight;
3786 
3787 		/* If softirq window is exhuasted then punt.
3788 		 * Allow this to run for 2 jiffies since which will allow
3789 		 * an average latency of 1.5/HZ.
3790 		 */
3791 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3792 			goto softnet_break;
3793 
3794 		local_irq_enable();
3795 
3796 		/* Even though interrupts have been re-enabled, this
3797 		 * access is safe because interrupts can only add new
3798 		 * entries to the tail of this list, and only ->poll()
3799 		 * calls can remove this head entry from the list.
3800 		 */
3801 		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
3802 
3803 		have = netpoll_poll_lock(n);
3804 
3805 		weight = n->weight;
3806 
3807 		/* This NAPI_STATE_SCHED test is for avoiding a race
3808 		 * with netpoll's poll_napi().  Only the entity which
3809 		 * obtains the lock and sees NAPI_STATE_SCHED set will
3810 		 * actually make the ->poll() call.  Therefore we avoid
3811 		 * accidentally calling ->poll() when NAPI is not scheduled.
3812 		 */
3813 		work = 0;
3814 		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3815 			work = n->poll(n, weight);
3816 			trace_napi_poll(n);
3817 		}
3818 
3819 		WARN_ON_ONCE(work > weight);
3820 
3821 		budget -= work;
3822 
3823 		local_irq_disable();
3824 
3825 		/* Drivers must not modify the NAPI state if they
3826 		 * consume the entire weight.  In such cases this code
3827 		 * still "owns" the NAPI instance and therefore can
3828 		 * move the instance around on the list at-will.
3829 		 */
3830 		if (unlikely(work == weight)) {
3831 			if (unlikely(napi_disable_pending(n))) {
3832 				local_irq_enable();
3833 				napi_complete(n);
3834 				local_irq_disable();
3835 			} else
3836 				list_move_tail(&n->poll_list, &sd->poll_list);
3837 		}
3838 
3839 		netpoll_poll_unlock(have);
3840 	}
3841 out:
3842 	net_rps_action_and_irq_enable(sd);
3843 
3844 #ifdef CONFIG_NET_DMA
3845 	/*
3846 	 * There may not be any more sk_buffs coming right now, so push
3847 	 * any pending DMA copies to hardware
3848 	 */
3849 	dma_issue_pending_all();
3850 #endif
3851 
3852 	return;
3853 
3854 softnet_break:
3855 	sd->time_squeeze++;
3856 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3857 	goto out;
3858 }
3859 
3860 static gifconf_func_t *gifconf_list[NPROTO];
3861 
3862 /**
3863  *	register_gifconf	-	register a SIOCGIF handler
3864  *	@family: Address family
3865  *	@gifconf: Function handler
3866  *
3867  *	Register protocol dependent address dumping routines. The handler
3868  *	that is passed must not be freed or reused until it has been replaced
3869  *	by another handler.
3870  */
3871 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3872 {
3873 	if (family >= NPROTO)
3874 		return -EINVAL;
3875 	gifconf_list[family] = gifconf;
3876 	return 0;
3877 }
3878 EXPORT_SYMBOL(register_gifconf);
3879 
3880 
3881 /*
3882  *	Map an interface index to its name (SIOCGIFNAME)
3883  */
3884 
3885 /*
3886  *	We need this ioctl for efficient implementation of the
3887  *	if_indextoname() function required by the IPv6 API.  Without
3888  *	it, we would have to search all the interfaces to find a
3889  *	match.  --pb
3890  */
3891 
3892 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3893 {
3894 	struct net_device *dev;
3895 	struct ifreq ifr;
3896 
3897 	/*
3898 	 *	Fetch the caller's info block.
3899 	 */
3900 
3901 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3902 		return -EFAULT;
3903 
3904 	rcu_read_lock();
3905 	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3906 	if (!dev) {
3907 		rcu_read_unlock();
3908 		return -ENODEV;
3909 	}
3910 
3911 	strcpy(ifr.ifr_name, dev->name);
3912 	rcu_read_unlock();
3913 
3914 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3915 		return -EFAULT;
3916 	return 0;
3917 }
3918 
3919 /*
3920  *	Perform a SIOCGIFCONF call. This structure will change
3921  *	size eventually, and there is nothing I can do about it.
3922  *	Thus we will need a 'compatibility mode'.
3923  */
3924 
3925 static int dev_ifconf(struct net *net, char __user *arg)
3926 {
3927 	struct ifconf ifc;
3928 	struct net_device *dev;
3929 	char __user *pos;
3930 	int len;
3931 	int total;
3932 	int i;
3933 
3934 	/*
3935 	 *	Fetch the caller's info block.
3936 	 */
3937 
3938 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3939 		return -EFAULT;
3940 
3941 	pos = ifc.ifc_buf;
3942 	len = ifc.ifc_len;
3943 
3944 	/*
3945 	 *	Loop over the interfaces, and write an info block for each.
3946 	 */
3947 
3948 	total = 0;
3949 	for_each_netdev(net, dev) {
3950 		for (i = 0; i < NPROTO; i++) {
3951 			if (gifconf_list[i]) {
3952 				int done;
3953 				if (!pos)
3954 					done = gifconf_list[i](dev, NULL, 0);
3955 				else
3956 					done = gifconf_list[i](dev, pos + total,
3957 							       len - total);
3958 				if (done < 0)
3959 					return -EFAULT;
3960 				total += done;
3961 			}
3962 		}
3963 	}
3964 
3965 	/*
3966 	 *	All done.  Write the updated control block back to the caller.
3967 	 */
3968 	ifc.ifc_len = total;
3969 
3970 	/*
3971 	 * 	Both BSD and Solaris return 0 here, so we do too.
3972 	 */
3973 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3974 }
3975 
3976 #ifdef CONFIG_PROC_FS
3977 /*
3978  *	This is invoked by the /proc filesystem handler to display a device
3979  *	in detail.
3980  */
3981 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3982 	__acquires(RCU)
3983 {
3984 	struct net *net = seq_file_net(seq);
3985 	loff_t off;
3986 	struct net_device *dev;
3987 
3988 	rcu_read_lock();
3989 	if (!*pos)
3990 		return SEQ_START_TOKEN;
3991 
3992 	off = 1;
3993 	for_each_netdev_rcu(net, dev)
3994 		if (off++ == *pos)
3995 			return dev;
3996 
3997 	return NULL;
3998 }
3999 
4000 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4001 {
4002 	struct net_device *dev = v;
4003 
4004 	if (v == SEQ_START_TOKEN)
4005 		dev = first_net_device_rcu(seq_file_net(seq));
4006 	else
4007 		dev = next_net_device_rcu(dev);
4008 
4009 	++*pos;
4010 	return dev;
4011 }
4012 
4013 void dev_seq_stop(struct seq_file *seq, void *v)
4014 	__releases(RCU)
4015 {
4016 	rcu_read_unlock();
4017 }
4018 
4019 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4020 {
4021 	struct rtnl_link_stats64 temp;
4022 	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4023 
4024 	seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4025 		   "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4026 		   dev->name, stats->rx_bytes, stats->rx_packets,
4027 		   stats->rx_errors,
4028 		   stats->rx_dropped + stats->rx_missed_errors,
4029 		   stats->rx_fifo_errors,
4030 		   stats->rx_length_errors + stats->rx_over_errors +
4031 		    stats->rx_crc_errors + stats->rx_frame_errors,
4032 		   stats->rx_compressed, stats->multicast,
4033 		   stats->tx_bytes, stats->tx_packets,
4034 		   stats->tx_errors, stats->tx_dropped,
4035 		   stats->tx_fifo_errors, stats->collisions,
4036 		   stats->tx_carrier_errors +
4037 		    stats->tx_aborted_errors +
4038 		    stats->tx_window_errors +
4039 		    stats->tx_heartbeat_errors,
4040 		   stats->tx_compressed);
4041 }
4042 
4043 /*
4044  *	Called from the PROCfs module. This now uses the new arbitrary sized
4045  *	/proc/net interface to create /proc/net/dev
4046  */
4047 static int dev_seq_show(struct seq_file *seq, void *v)
4048 {
4049 	if (v == SEQ_START_TOKEN)
4050 		seq_puts(seq, "Inter-|   Receive                            "
4051 			      "                    |  Transmit\n"
4052 			      " face |bytes    packets errs drop fifo frame "
4053 			      "compressed multicast|bytes    packets errs "
4054 			      "drop fifo colls carrier compressed\n");
4055 	else
4056 		dev_seq_printf_stats(seq, v);
4057 	return 0;
4058 }
4059 
4060 static struct softnet_data *softnet_get_online(loff_t *pos)
4061 {
4062 	struct softnet_data *sd = NULL;
4063 
4064 	while (*pos < nr_cpu_ids)
4065 		if (cpu_online(*pos)) {
4066 			sd = &per_cpu(softnet_data, *pos);
4067 			break;
4068 		} else
4069 			++*pos;
4070 	return sd;
4071 }
4072 
4073 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4074 {
4075 	return softnet_get_online(pos);
4076 }
4077 
4078 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4079 {
4080 	++*pos;
4081 	return softnet_get_online(pos);
4082 }
4083 
4084 static void softnet_seq_stop(struct seq_file *seq, void *v)
4085 {
4086 }
4087 
4088 static int softnet_seq_show(struct seq_file *seq, void *v)
4089 {
4090 	struct softnet_data *sd = v;
4091 
4092 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4093 		   sd->processed, sd->dropped, sd->time_squeeze, 0,
4094 		   0, 0, 0, 0, /* was fastroute */
4095 		   sd->cpu_collision, sd->received_rps);
4096 	return 0;
4097 }
4098 
4099 static const struct seq_operations dev_seq_ops = {
4100 	.start = dev_seq_start,
4101 	.next  = dev_seq_next,
4102 	.stop  = dev_seq_stop,
4103 	.show  = dev_seq_show,
4104 };
4105 
4106 static int dev_seq_open(struct inode *inode, struct file *file)
4107 {
4108 	return seq_open_net(inode, file, &dev_seq_ops,
4109 			    sizeof(struct seq_net_private));
4110 }
4111 
4112 static const struct file_operations dev_seq_fops = {
4113 	.owner	 = THIS_MODULE,
4114 	.open    = dev_seq_open,
4115 	.read    = seq_read,
4116 	.llseek  = seq_lseek,
4117 	.release = seq_release_net,
4118 };
4119 
4120 static const struct seq_operations softnet_seq_ops = {
4121 	.start = softnet_seq_start,
4122 	.next  = softnet_seq_next,
4123 	.stop  = softnet_seq_stop,
4124 	.show  = softnet_seq_show,
4125 };
4126 
4127 static int softnet_seq_open(struct inode *inode, struct file *file)
4128 {
4129 	return seq_open(file, &softnet_seq_ops);
4130 }
4131 
4132 static const struct file_operations softnet_seq_fops = {
4133 	.owner	 = THIS_MODULE,
4134 	.open    = softnet_seq_open,
4135 	.read    = seq_read,
4136 	.llseek  = seq_lseek,
4137 	.release = seq_release,
4138 };
4139 
4140 static void *ptype_get_idx(loff_t pos)
4141 {
4142 	struct packet_type *pt = NULL;
4143 	loff_t i = 0;
4144 	int t;
4145 
4146 	list_for_each_entry_rcu(pt, &ptype_all, list) {
4147 		if (i == pos)
4148 			return pt;
4149 		++i;
4150 	}
4151 
4152 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4153 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4154 			if (i == pos)
4155 				return pt;
4156 			++i;
4157 		}
4158 	}
4159 	return NULL;
4160 }
4161 
4162 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4163 	__acquires(RCU)
4164 {
4165 	rcu_read_lock();
4166 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4167 }
4168 
4169 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4170 {
4171 	struct packet_type *pt;
4172 	struct list_head *nxt;
4173 	int hash;
4174 
4175 	++*pos;
4176 	if (v == SEQ_START_TOKEN)
4177 		return ptype_get_idx(0);
4178 
4179 	pt = v;
4180 	nxt = pt->list.next;
4181 	if (pt->type == htons(ETH_P_ALL)) {
4182 		if (nxt != &ptype_all)
4183 			goto found;
4184 		hash = 0;
4185 		nxt = ptype_base[0].next;
4186 	} else
4187 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4188 
4189 	while (nxt == &ptype_base[hash]) {
4190 		if (++hash >= PTYPE_HASH_SIZE)
4191 			return NULL;
4192 		nxt = ptype_base[hash].next;
4193 	}
4194 found:
4195 	return list_entry(nxt, struct packet_type, list);
4196 }
4197 
4198 static void ptype_seq_stop(struct seq_file *seq, void *v)
4199 	__releases(RCU)
4200 {
4201 	rcu_read_unlock();
4202 }
4203 
4204 static int ptype_seq_show(struct seq_file *seq, void *v)
4205 {
4206 	struct packet_type *pt = v;
4207 
4208 	if (v == SEQ_START_TOKEN)
4209 		seq_puts(seq, "Type Device      Function\n");
4210 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4211 		if (pt->type == htons(ETH_P_ALL))
4212 			seq_puts(seq, "ALL ");
4213 		else
4214 			seq_printf(seq, "%04x", ntohs(pt->type));
4215 
4216 		seq_printf(seq, " %-8s %pF\n",
4217 			   pt->dev ? pt->dev->name : "", pt->func);
4218 	}
4219 
4220 	return 0;
4221 }
4222 
4223 static const struct seq_operations ptype_seq_ops = {
4224 	.start = ptype_seq_start,
4225 	.next  = ptype_seq_next,
4226 	.stop  = ptype_seq_stop,
4227 	.show  = ptype_seq_show,
4228 };
4229 
4230 static int ptype_seq_open(struct inode *inode, struct file *file)
4231 {
4232 	return seq_open_net(inode, file, &ptype_seq_ops,
4233 			sizeof(struct seq_net_private));
4234 }
4235 
4236 static const struct file_operations ptype_seq_fops = {
4237 	.owner	 = THIS_MODULE,
4238 	.open    = ptype_seq_open,
4239 	.read    = seq_read,
4240 	.llseek  = seq_lseek,
4241 	.release = seq_release_net,
4242 };
4243 
4244 
4245 static int __net_init dev_proc_net_init(struct net *net)
4246 {
4247 	int rc = -ENOMEM;
4248 
4249 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4250 		goto out;
4251 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4252 		goto out_dev;
4253 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4254 		goto out_softnet;
4255 
4256 	if (wext_proc_init(net))
4257 		goto out_ptype;
4258 	rc = 0;
4259 out:
4260 	return rc;
4261 out_ptype:
4262 	proc_net_remove(net, "ptype");
4263 out_softnet:
4264 	proc_net_remove(net, "softnet_stat");
4265 out_dev:
4266 	proc_net_remove(net, "dev");
4267 	goto out;
4268 }
4269 
4270 static void __net_exit dev_proc_net_exit(struct net *net)
4271 {
4272 	wext_proc_exit(net);
4273 
4274 	proc_net_remove(net, "ptype");
4275 	proc_net_remove(net, "softnet_stat");
4276 	proc_net_remove(net, "dev");
4277 }
4278 
4279 static struct pernet_operations __net_initdata dev_proc_ops = {
4280 	.init = dev_proc_net_init,
4281 	.exit = dev_proc_net_exit,
4282 };
4283 
4284 static int __init dev_proc_init(void)
4285 {
4286 	return register_pernet_subsys(&dev_proc_ops);
4287 }
4288 #else
4289 #define dev_proc_init() 0
4290 #endif	/* CONFIG_PROC_FS */
4291 
4292 
4293 /**
4294  *	netdev_set_master	-	set up master pointer
4295  *	@slave: slave device
4296  *	@master: new master device
4297  *
4298  *	Changes the master device of the slave. Pass %NULL to break the
4299  *	bonding. The caller must hold the RTNL semaphore. On a failure
4300  *	a negative errno code is returned. On success the reference counts
4301  *	are adjusted and the function returns zero.
4302  */
4303 int netdev_set_master(struct net_device *slave, struct net_device *master)
4304 {
4305 	struct net_device *old = slave->master;
4306 
4307 	ASSERT_RTNL();
4308 
4309 	if (master) {
4310 		if (old)
4311 			return -EBUSY;
4312 		dev_hold(master);
4313 	}
4314 
4315 	slave->master = master;
4316 
4317 	if (old)
4318 		dev_put(old);
4319 	return 0;
4320 }
4321 EXPORT_SYMBOL(netdev_set_master);
4322 
4323 /**
4324  *	netdev_set_bond_master	-	set up bonding master/slave pair
4325  *	@slave: slave device
4326  *	@master: new master device
4327  *
4328  *	Changes the master device of the slave. Pass %NULL to break the
4329  *	bonding. The caller must hold the RTNL semaphore. On a failure
4330  *	a negative errno code is returned. On success %RTM_NEWLINK is sent
4331  *	to the routing socket and the function returns zero.
4332  */
4333 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4334 {
4335 	int err;
4336 
4337 	ASSERT_RTNL();
4338 
4339 	err = netdev_set_master(slave, master);
4340 	if (err)
4341 		return err;
4342 	if (master)
4343 		slave->flags |= IFF_SLAVE;
4344 	else
4345 		slave->flags &= ~IFF_SLAVE;
4346 
4347 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4348 	return 0;
4349 }
4350 EXPORT_SYMBOL(netdev_set_bond_master);
4351 
4352 static void dev_change_rx_flags(struct net_device *dev, int flags)
4353 {
4354 	const struct net_device_ops *ops = dev->netdev_ops;
4355 
4356 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4357 		ops->ndo_change_rx_flags(dev, flags);
4358 }
4359 
4360 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4361 {
4362 	unsigned short old_flags = dev->flags;
4363 	uid_t uid;
4364 	gid_t gid;
4365 
4366 	ASSERT_RTNL();
4367 
4368 	dev->flags |= IFF_PROMISC;
4369 	dev->promiscuity += inc;
4370 	if (dev->promiscuity == 0) {
4371 		/*
4372 		 * Avoid overflow.
4373 		 * If inc causes overflow, untouch promisc and return error.
4374 		 */
4375 		if (inc < 0)
4376 			dev->flags &= ~IFF_PROMISC;
4377 		else {
4378 			dev->promiscuity -= inc;
4379 			printk(KERN_WARNING "%s: promiscuity touches roof, "
4380 				"set promiscuity failed, promiscuity feature "
4381 				"of device might be broken.\n", dev->name);
4382 			return -EOVERFLOW;
4383 		}
4384 	}
4385 	if (dev->flags != old_flags) {
4386 		printk(KERN_INFO "device %s %s promiscuous mode\n",
4387 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4388 							       "left");
4389 		if (audit_enabled) {
4390 			current_uid_gid(&uid, &gid);
4391 			audit_log(current->audit_context, GFP_ATOMIC,
4392 				AUDIT_ANOM_PROMISCUOUS,
4393 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4394 				dev->name, (dev->flags & IFF_PROMISC),
4395 				(old_flags & IFF_PROMISC),
4396 				audit_get_loginuid(current),
4397 				uid, gid,
4398 				audit_get_sessionid(current));
4399 		}
4400 
4401 		dev_change_rx_flags(dev, IFF_PROMISC);
4402 	}
4403 	return 0;
4404 }
4405 
4406 /**
4407  *	dev_set_promiscuity	- update promiscuity count on a device
4408  *	@dev: device
4409  *	@inc: modifier
4410  *
4411  *	Add or remove promiscuity from a device. While the count in the device
4412  *	remains above zero the interface remains promiscuous. Once it hits zero
4413  *	the device reverts back to normal filtering operation. A negative inc
4414  *	value is used to drop promiscuity on the device.
4415  *	Return 0 if successful or a negative errno code on error.
4416  */
4417 int dev_set_promiscuity(struct net_device *dev, int inc)
4418 {
4419 	unsigned short old_flags = dev->flags;
4420 	int err;
4421 
4422 	err = __dev_set_promiscuity(dev, inc);
4423 	if (err < 0)
4424 		return err;
4425 	if (dev->flags != old_flags)
4426 		dev_set_rx_mode(dev);
4427 	return err;
4428 }
4429 EXPORT_SYMBOL(dev_set_promiscuity);
4430 
4431 /**
4432  *	dev_set_allmulti	- update allmulti count on a device
4433  *	@dev: device
4434  *	@inc: modifier
4435  *
4436  *	Add or remove reception of all multicast frames to a device. While the
4437  *	count in the device remains above zero the interface remains listening
4438  *	to all interfaces. Once it hits zero the device reverts back to normal
4439  *	filtering operation. A negative @inc value is used to drop the counter
4440  *	when releasing a resource needing all multicasts.
4441  *	Return 0 if successful or a negative errno code on error.
4442  */
4443 
4444 int dev_set_allmulti(struct net_device *dev, int inc)
4445 {
4446 	unsigned short old_flags = dev->flags;
4447 
4448 	ASSERT_RTNL();
4449 
4450 	dev->flags |= IFF_ALLMULTI;
4451 	dev->allmulti += inc;
4452 	if (dev->allmulti == 0) {
4453 		/*
4454 		 * Avoid overflow.
4455 		 * If inc causes overflow, untouch allmulti and return error.
4456 		 */
4457 		if (inc < 0)
4458 			dev->flags &= ~IFF_ALLMULTI;
4459 		else {
4460 			dev->allmulti -= inc;
4461 			printk(KERN_WARNING "%s: allmulti touches roof, "
4462 				"set allmulti failed, allmulti feature of "
4463 				"device might be broken.\n", dev->name);
4464 			return -EOVERFLOW;
4465 		}
4466 	}
4467 	if (dev->flags ^ old_flags) {
4468 		dev_change_rx_flags(dev, IFF_ALLMULTI);
4469 		dev_set_rx_mode(dev);
4470 	}
4471 	return 0;
4472 }
4473 EXPORT_SYMBOL(dev_set_allmulti);
4474 
4475 /*
4476  *	Upload unicast and multicast address lists to device and
4477  *	configure RX filtering. When the device doesn't support unicast
4478  *	filtering it is put in promiscuous mode while unicast addresses
4479  *	are present.
4480  */
4481 void __dev_set_rx_mode(struct net_device *dev)
4482 {
4483 	const struct net_device_ops *ops = dev->netdev_ops;
4484 
4485 	/* dev_open will call this function so the list will stay sane. */
4486 	if (!(dev->flags&IFF_UP))
4487 		return;
4488 
4489 	if (!netif_device_present(dev))
4490 		return;
4491 
4492 	if (ops->ndo_set_rx_mode)
4493 		ops->ndo_set_rx_mode(dev);
4494 	else {
4495 		/* Unicast addresses changes may only happen under the rtnl,
4496 		 * therefore calling __dev_set_promiscuity here is safe.
4497 		 */
4498 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4499 			__dev_set_promiscuity(dev, 1);
4500 			dev->uc_promisc = 1;
4501 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4502 			__dev_set_promiscuity(dev, -1);
4503 			dev->uc_promisc = 0;
4504 		}
4505 
4506 		if (ops->ndo_set_multicast_list)
4507 			ops->ndo_set_multicast_list(dev);
4508 	}
4509 }
4510 
4511 void dev_set_rx_mode(struct net_device *dev)
4512 {
4513 	netif_addr_lock_bh(dev);
4514 	__dev_set_rx_mode(dev);
4515 	netif_addr_unlock_bh(dev);
4516 }
4517 
4518 /**
4519  *	dev_ethtool_get_settings - call device's ethtool_ops::get_settings()
4520  *	@dev: device
4521  *	@cmd: memory area for ethtool_ops::get_settings() result
4522  *
4523  *      The cmd arg is initialized properly (cleared and
4524  *      ethtool_cmd::cmd field set to ETHTOOL_GSET).
4525  *
4526  *	Return device's ethtool_ops::get_settings() result value or
4527  *	-EOPNOTSUPP when device doesn't expose
4528  *	ethtool_ops::get_settings() operation.
4529  */
4530 int dev_ethtool_get_settings(struct net_device *dev,
4531 			     struct ethtool_cmd *cmd)
4532 {
4533 	if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings)
4534 		return -EOPNOTSUPP;
4535 
4536 	memset(cmd, 0, sizeof(struct ethtool_cmd));
4537 	cmd->cmd = ETHTOOL_GSET;
4538 	return dev->ethtool_ops->get_settings(dev, cmd);
4539 }
4540 EXPORT_SYMBOL(dev_ethtool_get_settings);
4541 
4542 /**
4543  *	dev_get_flags - get flags reported to userspace
4544  *	@dev: device
4545  *
4546  *	Get the combination of flag bits exported through APIs to userspace.
4547  */
4548 unsigned dev_get_flags(const struct net_device *dev)
4549 {
4550 	unsigned flags;
4551 
4552 	flags = (dev->flags & ~(IFF_PROMISC |
4553 				IFF_ALLMULTI |
4554 				IFF_RUNNING |
4555 				IFF_LOWER_UP |
4556 				IFF_DORMANT)) |
4557 		(dev->gflags & (IFF_PROMISC |
4558 				IFF_ALLMULTI));
4559 
4560 	if (netif_running(dev)) {
4561 		if (netif_oper_up(dev))
4562 			flags |= IFF_RUNNING;
4563 		if (netif_carrier_ok(dev))
4564 			flags |= IFF_LOWER_UP;
4565 		if (netif_dormant(dev))
4566 			flags |= IFF_DORMANT;
4567 	}
4568 
4569 	return flags;
4570 }
4571 EXPORT_SYMBOL(dev_get_flags);
4572 
4573 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4574 {
4575 	int old_flags = dev->flags;
4576 	int ret;
4577 
4578 	ASSERT_RTNL();
4579 
4580 	/*
4581 	 *	Set the flags on our device.
4582 	 */
4583 
4584 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4585 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4586 			       IFF_AUTOMEDIA)) |
4587 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4588 				    IFF_ALLMULTI));
4589 
4590 	/*
4591 	 *	Load in the correct multicast list now the flags have changed.
4592 	 */
4593 
4594 	if ((old_flags ^ flags) & IFF_MULTICAST)
4595 		dev_change_rx_flags(dev, IFF_MULTICAST);
4596 
4597 	dev_set_rx_mode(dev);
4598 
4599 	/*
4600 	 *	Have we downed the interface. We handle IFF_UP ourselves
4601 	 *	according to user attempts to set it, rather than blindly
4602 	 *	setting it.
4603 	 */
4604 
4605 	ret = 0;
4606 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4607 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4608 
4609 		if (!ret)
4610 			dev_set_rx_mode(dev);
4611 	}
4612 
4613 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4614 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4615 
4616 		dev->gflags ^= IFF_PROMISC;
4617 		dev_set_promiscuity(dev, inc);
4618 	}
4619 
4620 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4621 	   is important. Some (broken) drivers set IFF_PROMISC, when
4622 	   IFF_ALLMULTI is requested not asking us and not reporting.
4623 	 */
4624 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4625 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4626 
4627 		dev->gflags ^= IFF_ALLMULTI;
4628 		dev_set_allmulti(dev, inc);
4629 	}
4630 
4631 	return ret;
4632 }
4633 
4634 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4635 {
4636 	unsigned int changes = dev->flags ^ old_flags;
4637 
4638 	if (changes & IFF_UP) {
4639 		if (dev->flags & IFF_UP)
4640 			call_netdevice_notifiers(NETDEV_UP, dev);
4641 		else
4642 			call_netdevice_notifiers(NETDEV_DOWN, dev);
4643 	}
4644 
4645 	if (dev->flags & IFF_UP &&
4646 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4647 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4648 }
4649 
4650 /**
4651  *	dev_change_flags - change device settings
4652  *	@dev: device
4653  *	@flags: device state flags
4654  *
4655  *	Change settings on device based state flags. The flags are
4656  *	in the userspace exported format.
4657  */
4658 int dev_change_flags(struct net_device *dev, unsigned flags)
4659 {
4660 	int ret, changes;
4661 	int old_flags = dev->flags;
4662 
4663 	ret = __dev_change_flags(dev, flags);
4664 	if (ret < 0)
4665 		return ret;
4666 
4667 	changes = old_flags ^ dev->flags;
4668 	if (changes)
4669 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4670 
4671 	__dev_notify_flags(dev, old_flags);
4672 	return ret;
4673 }
4674 EXPORT_SYMBOL(dev_change_flags);
4675 
4676 /**
4677  *	dev_set_mtu - Change maximum transfer unit
4678  *	@dev: device
4679  *	@new_mtu: new transfer unit
4680  *
4681  *	Change the maximum transfer size of the network device.
4682  */
4683 int dev_set_mtu(struct net_device *dev, int new_mtu)
4684 {
4685 	const struct net_device_ops *ops = dev->netdev_ops;
4686 	int err;
4687 
4688 	if (new_mtu == dev->mtu)
4689 		return 0;
4690 
4691 	/*	MTU must be positive.	 */
4692 	if (new_mtu < 0)
4693 		return -EINVAL;
4694 
4695 	if (!netif_device_present(dev))
4696 		return -ENODEV;
4697 
4698 	err = 0;
4699 	if (ops->ndo_change_mtu)
4700 		err = ops->ndo_change_mtu(dev, new_mtu);
4701 	else
4702 		dev->mtu = new_mtu;
4703 
4704 	if (!err && dev->flags & IFF_UP)
4705 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4706 	return err;
4707 }
4708 EXPORT_SYMBOL(dev_set_mtu);
4709 
4710 /**
4711  *	dev_set_group - Change group this device belongs to
4712  *	@dev: device
4713  *	@new_group: group this device should belong to
4714  */
4715 void dev_set_group(struct net_device *dev, int new_group)
4716 {
4717 	dev->group = new_group;
4718 }
4719 EXPORT_SYMBOL(dev_set_group);
4720 
4721 /**
4722  *	dev_set_mac_address - Change Media Access Control Address
4723  *	@dev: device
4724  *	@sa: new address
4725  *
4726  *	Change the hardware (MAC) address of the device
4727  */
4728 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4729 {
4730 	const struct net_device_ops *ops = dev->netdev_ops;
4731 	int err;
4732 
4733 	if (!ops->ndo_set_mac_address)
4734 		return -EOPNOTSUPP;
4735 	if (sa->sa_family != dev->type)
4736 		return -EINVAL;
4737 	if (!netif_device_present(dev))
4738 		return -ENODEV;
4739 	err = ops->ndo_set_mac_address(dev, sa);
4740 	if (!err)
4741 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4742 	return err;
4743 }
4744 EXPORT_SYMBOL(dev_set_mac_address);
4745 
4746 /*
4747  *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4748  */
4749 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4750 {
4751 	int err;
4752 	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4753 
4754 	if (!dev)
4755 		return -ENODEV;
4756 
4757 	switch (cmd) {
4758 	case SIOCGIFFLAGS:	/* Get interface flags */
4759 		ifr->ifr_flags = (short) dev_get_flags(dev);
4760 		return 0;
4761 
4762 	case SIOCGIFMETRIC:	/* Get the metric on the interface
4763 				   (currently unused) */
4764 		ifr->ifr_metric = 0;
4765 		return 0;
4766 
4767 	case SIOCGIFMTU:	/* Get the MTU of a device */
4768 		ifr->ifr_mtu = dev->mtu;
4769 		return 0;
4770 
4771 	case SIOCGIFHWADDR:
4772 		if (!dev->addr_len)
4773 			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4774 		else
4775 			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4776 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4777 		ifr->ifr_hwaddr.sa_family = dev->type;
4778 		return 0;
4779 
4780 	case SIOCGIFSLAVE:
4781 		err = -EINVAL;
4782 		break;
4783 
4784 	case SIOCGIFMAP:
4785 		ifr->ifr_map.mem_start = dev->mem_start;
4786 		ifr->ifr_map.mem_end   = dev->mem_end;
4787 		ifr->ifr_map.base_addr = dev->base_addr;
4788 		ifr->ifr_map.irq       = dev->irq;
4789 		ifr->ifr_map.dma       = dev->dma;
4790 		ifr->ifr_map.port      = dev->if_port;
4791 		return 0;
4792 
4793 	case SIOCGIFINDEX:
4794 		ifr->ifr_ifindex = dev->ifindex;
4795 		return 0;
4796 
4797 	case SIOCGIFTXQLEN:
4798 		ifr->ifr_qlen = dev->tx_queue_len;
4799 		return 0;
4800 
4801 	default:
4802 		/* dev_ioctl() should ensure this case
4803 		 * is never reached
4804 		 */
4805 		WARN_ON(1);
4806 		err = -ENOTTY;
4807 		break;
4808 
4809 	}
4810 	return err;
4811 }
4812 
4813 /*
4814  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4815  */
4816 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4817 {
4818 	int err;
4819 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4820 	const struct net_device_ops *ops;
4821 
4822 	if (!dev)
4823 		return -ENODEV;
4824 
4825 	ops = dev->netdev_ops;
4826 
4827 	switch (cmd) {
4828 	case SIOCSIFFLAGS:	/* Set interface flags */
4829 		return dev_change_flags(dev, ifr->ifr_flags);
4830 
4831 	case SIOCSIFMETRIC:	/* Set the metric on the interface
4832 				   (currently unused) */
4833 		return -EOPNOTSUPP;
4834 
4835 	case SIOCSIFMTU:	/* Set the MTU of a device */
4836 		return dev_set_mtu(dev, ifr->ifr_mtu);
4837 
4838 	case SIOCSIFHWADDR:
4839 		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4840 
4841 	case SIOCSIFHWBROADCAST:
4842 		if (ifr->ifr_hwaddr.sa_family != dev->type)
4843 			return -EINVAL;
4844 		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4845 		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4846 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4847 		return 0;
4848 
4849 	case SIOCSIFMAP:
4850 		if (ops->ndo_set_config) {
4851 			if (!netif_device_present(dev))
4852 				return -ENODEV;
4853 			return ops->ndo_set_config(dev, &ifr->ifr_map);
4854 		}
4855 		return -EOPNOTSUPP;
4856 
4857 	case SIOCADDMULTI:
4858 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4859 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4860 			return -EINVAL;
4861 		if (!netif_device_present(dev))
4862 			return -ENODEV;
4863 		return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
4864 
4865 	case SIOCDELMULTI:
4866 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4867 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4868 			return -EINVAL;
4869 		if (!netif_device_present(dev))
4870 			return -ENODEV;
4871 		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
4872 
4873 	case SIOCSIFTXQLEN:
4874 		if (ifr->ifr_qlen < 0)
4875 			return -EINVAL;
4876 		dev->tx_queue_len = ifr->ifr_qlen;
4877 		return 0;
4878 
4879 	case SIOCSIFNAME:
4880 		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4881 		return dev_change_name(dev, ifr->ifr_newname);
4882 
4883 	/*
4884 	 *	Unknown or private ioctl
4885 	 */
4886 	default:
4887 		if ((cmd >= SIOCDEVPRIVATE &&
4888 		    cmd <= SIOCDEVPRIVATE + 15) ||
4889 		    cmd == SIOCBONDENSLAVE ||
4890 		    cmd == SIOCBONDRELEASE ||
4891 		    cmd == SIOCBONDSETHWADDR ||
4892 		    cmd == SIOCBONDSLAVEINFOQUERY ||
4893 		    cmd == SIOCBONDINFOQUERY ||
4894 		    cmd == SIOCBONDCHANGEACTIVE ||
4895 		    cmd == SIOCGMIIPHY ||
4896 		    cmd == SIOCGMIIREG ||
4897 		    cmd == SIOCSMIIREG ||
4898 		    cmd == SIOCBRADDIF ||
4899 		    cmd == SIOCBRDELIF ||
4900 		    cmd == SIOCSHWTSTAMP ||
4901 		    cmd == SIOCWANDEV) {
4902 			err = -EOPNOTSUPP;
4903 			if (ops->ndo_do_ioctl) {
4904 				if (netif_device_present(dev))
4905 					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4906 				else
4907 					err = -ENODEV;
4908 			}
4909 		} else
4910 			err = -EINVAL;
4911 
4912 	}
4913 	return err;
4914 }
4915 
4916 /*
4917  *	This function handles all "interface"-type I/O control requests. The actual
4918  *	'doing' part of this is dev_ifsioc above.
4919  */
4920 
4921 /**
4922  *	dev_ioctl	-	network device ioctl
4923  *	@net: the applicable net namespace
4924  *	@cmd: command to issue
4925  *	@arg: pointer to a struct ifreq in user space
4926  *
4927  *	Issue ioctl functions to devices. This is normally called by the
4928  *	user space syscall interfaces but can sometimes be useful for
4929  *	other purposes. The return value is the return from the syscall if
4930  *	positive or a negative errno code on error.
4931  */
4932 
4933 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4934 {
4935 	struct ifreq ifr;
4936 	int ret;
4937 	char *colon;
4938 
4939 	/* One special case: SIOCGIFCONF takes ifconf argument
4940 	   and requires shared lock, because it sleeps writing
4941 	   to user space.
4942 	 */
4943 
4944 	if (cmd == SIOCGIFCONF) {
4945 		rtnl_lock();
4946 		ret = dev_ifconf(net, (char __user *) arg);
4947 		rtnl_unlock();
4948 		return ret;
4949 	}
4950 	if (cmd == SIOCGIFNAME)
4951 		return dev_ifname(net, (struct ifreq __user *)arg);
4952 
4953 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4954 		return -EFAULT;
4955 
4956 	ifr.ifr_name[IFNAMSIZ-1] = 0;
4957 
4958 	colon = strchr(ifr.ifr_name, ':');
4959 	if (colon)
4960 		*colon = 0;
4961 
4962 	/*
4963 	 *	See which interface the caller is talking about.
4964 	 */
4965 
4966 	switch (cmd) {
4967 	/*
4968 	 *	These ioctl calls:
4969 	 *	- can be done by all.
4970 	 *	- atomic and do not require locking.
4971 	 *	- return a value
4972 	 */
4973 	case SIOCGIFFLAGS:
4974 	case SIOCGIFMETRIC:
4975 	case SIOCGIFMTU:
4976 	case SIOCGIFHWADDR:
4977 	case SIOCGIFSLAVE:
4978 	case SIOCGIFMAP:
4979 	case SIOCGIFINDEX:
4980 	case SIOCGIFTXQLEN:
4981 		dev_load(net, ifr.ifr_name);
4982 		rcu_read_lock();
4983 		ret = dev_ifsioc_locked(net, &ifr, cmd);
4984 		rcu_read_unlock();
4985 		if (!ret) {
4986 			if (colon)
4987 				*colon = ':';
4988 			if (copy_to_user(arg, &ifr,
4989 					 sizeof(struct ifreq)))
4990 				ret = -EFAULT;
4991 		}
4992 		return ret;
4993 
4994 	case SIOCETHTOOL:
4995 		dev_load(net, ifr.ifr_name);
4996 		rtnl_lock();
4997 		ret = dev_ethtool(net, &ifr);
4998 		rtnl_unlock();
4999 		if (!ret) {
5000 			if (colon)
5001 				*colon = ':';
5002 			if (copy_to_user(arg, &ifr,
5003 					 sizeof(struct ifreq)))
5004 				ret = -EFAULT;
5005 		}
5006 		return ret;
5007 
5008 	/*
5009 	 *	These ioctl calls:
5010 	 *	- require superuser power.
5011 	 *	- require strict serialization.
5012 	 *	- return a value
5013 	 */
5014 	case SIOCGMIIPHY:
5015 	case SIOCGMIIREG:
5016 	case SIOCSIFNAME:
5017 		if (!capable(CAP_NET_ADMIN))
5018 			return -EPERM;
5019 		dev_load(net, ifr.ifr_name);
5020 		rtnl_lock();
5021 		ret = dev_ifsioc(net, &ifr, cmd);
5022 		rtnl_unlock();
5023 		if (!ret) {
5024 			if (colon)
5025 				*colon = ':';
5026 			if (copy_to_user(arg, &ifr,
5027 					 sizeof(struct ifreq)))
5028 				ret = -EFAULT;
5029 		}
5030 		return ret;
5031 
5032 	/*
5033 	 *	These ioctl calls:
5034 	 *	- require superuser power.
5035 	 *	- require strict serialization.
5036 	 *	- do not return a value
5037 	 */
5038 	case SIOCSIFFLAGS:
5039 	case SIOCSIFMETRIC:
5040 	case SIOCSIFMTU:
5041 	case SIOCSIFMAP:
5042 	case SIOCSIFHWADDR:
5043 	case SIOCSIFSLAVE:
5044 	case SIOCADDMULTI:
5045 	case SIOCDELMULTI:
5046 	case SIOCSIFHWBROADCAST:
5047 	case SIOCSIFTXQLEN:
5048 	case SIOCSMIIREG:
5049 	case SIOCBONDENSLAVE:
5050 	case SIOCBONDRELEASE:
5051 	case SIOCBONDSETHWADDR:
5052 	case SIOCBONDCHANGEACTIVE:
5053 	case SIOCBRADDIF:
5054 	case SIOCBRDELIF:
5055 	case SIOCSHWTSTAMP:
5056 		if (!capable(CAP_NET_ADMIN))
5057 			return -EPERM;
5058 		/* fall through */
5059 	case SIOCBONDSLAVEINFOQUERY:
5060 	case SIOCBONDINFOQUERY:
5061 		dev_load(net, ifr.ifr_name);
5062 		rtnl_lock();
5063 		ret = dev_ifsioc(net, &ifr, cmd);
5064 		rtnl_unlock();
5065 		return ret;
5066 
5067 	case SIOCGIFMEM:
5068 		/* Get the per device memory space. We can add this but
5069 		 * currently do not support it */
5070 	case SIOCSIFMEM:
5071 		/* Set the per device memory buffer space.
5072 		 * Not applicable in our case */
5073 	case SIOCSIFLINK:
5074 		return -ENOTTY;
5075 
5076 	/*
5077 	 *	Unknown or private ioctl.
5078 	 */
5079 	default:
5080 		if (cmd == SIOCWANDEV ||
5081 		    (cmd >= SIOCDEVPRIVATE &&
5082 		     cmd <= SIOCDEVPRIVATE + 15)) {
5083 			dev_load(net, ifr.ifr_name);
5084 			rtnl_lock();
5085 			ret = dev_ifsioc(net, &ifr, cmd);
5086 			rtnl_unlock();
5087 			if (!ret && copy_to_user(arg, &ifr,
5088 						 sizeof(struct ifreq)))
5089 				ret = -EFAULT;
5090 			return ret;
5091 		}
5092 		/* Take care of Wireless Extensions */
5093 		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5094 			return wext_handle_ioctl(net, &ifr, cmd, arg);
5095 		return -ENOTTY;
5096 	}
5097 }
5098 
5099 
5100 /**
5101  *	dev_new_index	-	allocate an ifindex
5102  *	@net: the applicable net namespace
5103  *
5104  *	Returns a suitable unique value for a new device interface
5105  *	number.  The caller must hold the rtnl semaphore or the
5106  *	dev_base_lock to be sure it remains unique.
5107  */
5108 static int dev_new_index(struct net *net)
5109 {
5110 	static int ifindex;
5111 	for (;;) {
5112 		if (++ifindex <= 0)
5113 			ifindex = 1;
5114 		if (!__dev_get_by_index(net, ifindex))
5115 			return ifindex;
5116 	}
5117 }
5118 
5119 /* Delayed registration/unregisteration */
5120 static LIST_HEAD(net_todo_list);
5121 
5122 static void net_set_todo(struct net_device *dev)
5123 {
5124 	list_add_tail(&dev->todo_list, &net_todo_list);
5125 }
5126 
5127 static void rollback_registered_many(struct list_head *head)
5128 {
5129 	struct net_device *dev, *tmp;
5130 
5131 	BUG_ON(dev_boot_phase);
5132 	ASSERT_RTNL();
5133 
5134 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5135 		/* Some devices call without registering
5136 		 * for initialization unwind. Remove those
5137 		 * devices and proceed with the remaining.
5138 		 */
5139 		if (dev->reg_state == NETREG_UNINITIALIZED) {
5140 			pr_debug("unregister_netdevice: device %s/%p never "
5141 				 "was registered\n", dev->name, dev);
5142 
5143 			WARN_ON(1);
5144 			list_del(&dev->unreg_list);
5145 			continue;
5146 		}
5147 		dev->dismantle = true;
5148 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5149 	}
5150 
5151 	/* If device is running, close it first. */
5152 	dev_close_many(head);
5153 
5154 	list_for_each_entry(dev, head, unreg_list) {
5155 		/* And unlink it from device chain. */
5156 		unlist_netdevice(dev);
5157 
5158 		dev->reg_state = NETREG_UNREGISTERING;
5159 	}
5160 
5161 	synchronize_net();
5162 
5163 	list_for_each_entry(dev, head, unreg_list) {
5164 		/* Shutdown queueing discipline. */
5165 		dev_shutdown(dev);
5166 
5167 
5168 		/* Notify protocols, that we are about to destroy
5169 		   this device. They should clean all the things.
5170 		*/
5171 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5172 
5173 		if (!dev->rtnl_link_ops ||
5174 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5175 			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5176 
5177 		/*
5178 		 *	Flush the unicast and multicast chains
5179 		 */
5180 		dev_uc_flush(dev);
5181 		dev_mc_flush(dev);
5182 
5183 		if (dev->netdev_ops->ndo_uninit)
5184 			dev->netdev_ops->ndo_uninit(dev);
5185 
5186 		/* Notifier chain MUST detach us from master device. */
5187 		WARN_ON(dev->master);
5188 
5189 		/* Remove entries from kobject tree */
5190 		netdev_unregister_kobject(dev);
5191 	}
5192 
5193 	/* Process any work delayed until the end of the batch */
5194 	dev = list_first_entry(head, struct net_device, unreg_list);
5195 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5196 
5197 	rcu_barrier();
5198 
5199 	list_for_each_entry(dev, head, unreg_list)
5200 		dev_put(dev);
5201 }
5202 
5203 static void rollback_registered(struct net_device *dev)
5204 {
5205 	LIST_HEAD(single);
5206 
5207 	list_add(&dev->unreg_list, &single);
5208 	rollback_registered_many(&single);
5209 	list_del(&single);
5210 }
5211 
5212 static u32 netdev_fix_features(struct net_device *dev, u32 features)
5213 {
5214 	/* Fix illegal checksum combinations */
5215 	if ((features & NETIF_F_HW_CSUM) &&
5216 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5217 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5218 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5219 	}
5220 
5221 	if ((features & NETIF_F_NO_CSUM) &&
5222 	    (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5223 		netdev_warn(dev, "mixed no checksumming and other settings.\n");
5224 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5225 	}
5226 
5227 	/* Fix illegal SG+CSUM combinations. */
5228 	if ((features & NETIF_F_SG) &&
5229 	    !(features & NETIF_F_ALL_CSUM)) {
5230 		netdev_dbg(dev,
5231 			"Dropping NETIF_F_SG since no checksum feature.\n");
5232 		features &= ~NETIF_F_SG;
5233 	}
5234 
5235 	/* TSO requires that SG is present as well. */
5236 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5237 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5238 		features &= ~NETIF_F_ALL_TSO;
5239 	}
5240 
5241 	/* TSO ECN requires that TSO is present as well. */
5242 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5243 		features &= ~NETIF_F_TSO_ECN;
5244 
5245 	/* Software GSO depends on SG. */
5246 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5247 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5248 		features &= ~NETIF_F_GSO;
5249 	}
5250 
5251 	/* UFO needs SG and checksumming */
5252 	if (features & NETIF_F_UFO) {
5253 		/* maybe split UFO into V4 and V6? */
5254 		if (!((features & NETIF_F_GEN_CSUM) ||
5255 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5256 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5257 			netdev_dbg(dev,
5258 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
5259 			features &= ~NETIF_F_UFO;
5260 		}
5261 
5262 		if (!(features & NETIF_F_SG)) {
5263 			netdev_dbg(dev,
5264 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5265 			features &= ~NETIF_F_UFO;
5266 		}
5267 	}
5268 
5269 	return features;
5270 }
5271 
5272 int __netdev_update_features(struct net_device *dev)
5273 {
5274 	u32 features;
5275 	int err = 0;
5276 
5277 	ASSERT_RTNL();
5278 
5279 	features = netdev_get_wanted_features(dev);
5280 
5281 	if (dev->netdev_ops->ndo_fix_features)
5282 		features = dev->netdev_ops->ndo_fix_features(dev, features);
5283 
5284 	/* driver might be less strict about feature dependencies */
5285 	features = netdev_fix_features(dev, features);
5286 
5287 	if (dev->features == features)
5288 		return 0;
5289 
5290 	netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
5291 		dev->features, features);
5292 
5293 	if (dev->netdev_ops->ndo_set_features)
5294 		err = dev->netdev_ops->ndo_set_features(dev, features);
5295 
5296 	if (unlikely(err < 0)) {
5297 		netdev_err(dev,
5298 			"set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5299 			err, features, dev->features);
5300 		return -1;
5301 	}
5302 
5303 	if (!err)
5304 		dev->features = features;
5305 
5306 	return 1;
5307 }
5308 
5309 /**
5310  *	netdev_update_features - recalculate device features
5311  *	@dev: the device to check
5312  *
5313  *	Recalculate dev->features set and send notifications if it
5314  *	has changed. Should be called after driver or hardware dependent
5315  *	conditions might have changed that influence the features.
5316  */
5317 void netdev_update_features(struct net_device *dev)
5318 {
5319 	if (__netdev_update_features(dev))
5320 		netdev_features_change(dev);
5321 }
5322 EXPORT_SYMBOL(netdev_update_features);
5323 
5324 /**
5325  *	netdev_change_features - recalculate device features
5326  *	@dev: the device to check
5327  *
5328  *	Recalculate dev->features set and send notifications even
5329  *	if they have not changed. Should be called instead of
5330  *	netdev_update_features() if also dev->vlan_features might
5331  *	have changed to allow the changes to be propagated to stacked
5332  *	VLAN devices.
5333  */
5334 void netdev_change_features(struct net_device *dev)
5335 {
5336 	__netdev_update_features(dev);
5337 	netdev_features_change(dev);
5338 }
5339 EXPORT_SYMBOL(netdev_change_features);
5340 
5341 /**
5342  *	netif_stacked_transfer_operstate -	transfer operstate
5343  *	@rootdev: the root or lower level device to transfer state from
5344  *	@dev: the device to transfer operstate to
5345  *
5346  *	Transfer operational state from root to device. This is normally
5347  *	called when a stacking relationship exists between the root
5348  *	device and the device(a leaf device).
5349  */
5350 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5351 					struct net_device *dev)
5352 {
5353 	if (rootdev->operstate == IF_OPER_DORMANT)
5354 		netif_dormant_on(dev);
5355 	else
5356 		netif_dormant_off(dev);
5357 
5358 	if (netif_carrier_ok(rootdev)) {
5359 		if (!netif_carrier_ok(dev))
5360 			netif_carrier_on(dev);
5361 	} else {
5362 		if (netif_carrier_ok(dev))
5363 			netif_carrier_off(dev);
5364 	}
5365 }
5366 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5367 
5368 #ifdef CONFIG_RPS
5369 static int netif_alloc_rx_queues(struct net_device *dev)
5370 {
5371 	unsigned int i, count = dev->num_rx_queues;
5372 	struct netdev_rx_queue *rx;
5373 
5374 	BUG_ON(count < 1);
5375 
5376 	rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5377 	if (!rx) {
5378 		pr_err("netdev: Unable to allocate %u rx queues.\n", count);
5379 		return -ENOMEM;
5380 	}
5381 	dev->_rx = rx;
5382 
5383 	for (i = 0; i < count; i++)
5384 		rx[i].dev = dev;
5385 	return 0;
5386 }
5387 #endif
5388 
5389 static void netdev_init_one_queue(struct net_device *dev,
5390 				  struct netdev_queue *queue, void *_unused)
5391 {
5392 	/* Initialize queue lock */
5393 	spin_lock_init(&queue->_xmit_lock);
5394 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5395 	queue->xmit_lock_owner = -1;
5396 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5397 	queue->dev = dev;
5398 }
5399 
5400 static int netif_alloc_netdev_queues(struct net_device *dev)
5401 {
5402 	unsigned int count = dev->num_tx_queues;
5403 	struct netdev_queue *tx;
5404 
5405 	BUG_ON(count < 1);
5406 
5407 	tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5408 	if (!tx) {
5409 		pr_err("netdev: Unable to allocate %u tx queues.\n",
5410 		       count);
5411 		return -ENOMEM;
5412 	}
5413 	dev->_tx = tx;
5414 
5415 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5416 	spin_lock_init(&dev->tx_global_lock);
5417 
5418 	return 0;
5419 }
5420 
5421 /**
5422  *	register_netdevice	- register a network device
5423  *	@dev: device to register
5424  *
5425  *	Take a completed network device structure and add it to the kernel
5426  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5427  *	chain. 0 is returned on success. A negative errno code is returned
5428  *	on a failure to set up the device, or if the name is a duplicate.
5429  *
5430  *	Callers must hold the rtnl semaphore. You may want
5431  *	register_netdev() instead of this.
5432  *
5433  *	BUGS:
5434  *	The locking appears insufficient to guarantee two parallel registers
5435  *	will not get the same name.
5436  */
5437 
5438 int register_netdevice(struct net_device *dev)
5439 {
5440 	int ret;
5441 	struct net *net = dev_net(dev);
5442 
5443 	BUG_ON(dev_boot_phase);
5444 	ASSERT_RTNL();
5445 
5446 	might_sleep();
5447 
5448 	/* When net_device's are persistent, this will be fatal. */
5449 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5450 	BUG_ON(!net);
5451 
5452 	spin_lock_init(&dev->addr_list_lock);
5453 	netdev_set_addr_lockdep_class(dev);
5454 
5455 	dev->iflink = -1;
5456 
5457 	ret = dev_get_valid_name(dev, dev->name);
5458 	if (ret < 0)
5459 		goto out;
5460 
5461 	/* Init, if this function is available */
5462 	if (dev->netdev_ops->ndo_init) {
5463 		ret = dev->netdev_ops->ndo_init(dev);
5464 		if (ret) {
5465 			if (ret > 0)
5466 				ret = -EIO;
5467 			goto out;
5468 		}
5469 	}
5470 
5471 	dev->ifindex = dev_new_index(net);
5472 	if (dev->iflink == -1)
5473 		dev->iflink = dev->ifindex;
5474 
5475 	/* Transfer changeable features to wanted_features and enable
5476 	 * software offloads (GSO and GRO).
5477 	 */
5478 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
5479 	dev->features |= NETIF_F_SOFT_FEATURES;
5480 	dev->wanted_features = dev->features & dev->hw_features;
5481 
5482 	/* Turn on no cache copy if HW is doing checksum */
5483 	dev->hw_features |= NETIF_F_NOCACHE_COPY;
5484 	if ((dev->features & NETIF_F_ALL_CSUM) &&
5485 	    !(dev->features & NETIF_F_NO_CSUM)) {
5486 		dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5487 		dev->features |= NETIF_F_NOCACHE_COPY;
5488 	}
5489 
5490 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5491 	 */
5492 	dev->vlan_features |= NETIF_F_HIGHDMA;
5493 
5494 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5495 	ret = notifier_to_errno(ret);
5496 	if (ret)
5497 		goto err_uninit;
5498 
5499 	ret = netdev_register_kobject(dev);
5500 	if (ret)
5501 		goto err_uninit;
5502 	dev->reg_state = NETREG_REGISTERED;
5503 
5504 	__netdev_update_features(dev);
5505 
5506 	/*
5507 	 *	Default initial state at registry is that the
5508 	 *	device is present.
5509 	 */
5510 
5511 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5512 
5513 	dev_init_scheduler(dev);
5514 	dev_hold(dev);
5515 	list_netdevice(dev);
5516 
5517 	/* Notify protocols, that a new device appeared. */
5518 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5519 	ret = notifier_to_errno(ret);
5520 	if (ret) {
5521 		rollback_registered(dev);
5522 		dev->reg_state = NETREG_UNREGISTERED;
5523 	}
5524 	/*
5525 	 *	Prevent userspace races by waiting until the network
5526 	 *	device is fully setup before sending notifications.
5527 	 */
5528 	if (!dev->rtnl_link_ops ||
5529 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5530 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5531 
5532 out:
5533 	return ret;
5534 
5535 err_uninit:
5536 	if (dev->netdev_ops->ndo_uninit)
5537 		dev->netdev_ops->ndo_uninit(dev);
5538 	goto out;
5539 }
5540 EXPORT_SYMBOL(register_netdevice);
5541 
5542 /**
5543  *	init_dummy_netdev	- init a dummy network device for NAPI
5544  *	@dev: device to init
5545  *
5546  *	This takes a network device structure and initialize the minimum
5547  *	amount of fields so it can be used to schedule NAPI polls without
5548  *	registering a full blown interface. This is to be used by drivers
5549  *	that need to tie several hardware interfaces to a single NAPI
5550  *	poll scheduler due to HW limitations.
5551  */
5552 int init_dummy_netdev(struct net_device *dev)
5553 {
5554 	/* Clear everything. Note we don't initialize spinlocks
5555 	 * are they aren't supposed to be taken by any of the
5556 	 * NAPI code and this dummy netdev is supposed to be
5557 	 * only ever used for NAPI polls
5558 	 */
5559 	memset(dev, 0, sizeof(struct net_device));
5560 
5561 	/* make sure we BUG if trying to hit standard
5562 	 * register/unregister code path
5563 	 */
5564 	dev->reg_state = NETREG_DUMMY;
5565 
5566 	/* NAPI wants this */
5567 	INIT_LIST_HEAD(&dev->napi_list);
5568 
5569 	/* a dummy interface is started by default */
5570 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5571 	set_bit(__LINK_STATE_START, &dev->state);
5572 
5573 	/* Note : We dont allocate pcpu_refcnt for dummy devices,
5574 	 * because users of this 'device' dont need to change
5575 	 * its refcount.
5576 	 */
5577 
5578 	return 0;
5579 }
5580 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5581 
5582 
5583 /**
5584  *	register_netdev	- register a network device
5585  *	@dev: device to register
5586  *
5587  *	Take a completed network device structure and add it to the kernel
5588  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5589  *	chain. 0 is returned on success. A negative errno code is returned
5590  *	on a failure to set up the device, or if the name is a duplicate.
5591  *
5592  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5593  *	and expands the device name if you passed a format string to
5594  *	alloc_netdev.
5595  */
5596 int register_netdev(struct net_device *dev)
5597 {
5598 	int err;
5599 
5600 	rtnl_lock();
5601 	err = register_netdevice(dev);
5602 	rtnl_unlock();
5603 	return err;
5604 }
5605 EXPORT_SYMBOL(register_netdev);
5606 
5607 int netdev_refcnt_read(const struct net_device *dev)
5608 {
5609 	int i, refcnt = 0;
5610 
5611 	for_each_possible_cpu(i)
5612 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5613 	return refcnt;
5614 }
5615 EXPORT_SYMBOL(netdev_refcnt_read);
5616 
5617 /*
5618  * netdev_wait_allrefs - wait until all references are gone.
5619  *
5620  * This is called when unregistering network devices.
5621  *
5622  * Any protocol or device that holds a reference should register
5623  * for netdevice notification, and cleanup and put back the
5624  * reference if they receive an UNREGISTER event.
5625  * We can get stuck here if buggy protocols don't correctly
5626  * call dev_put.
5627  */
5628 static void netdev_wait_allrefs(struct net_device *dev)
5629 {
5630 	unsigned long rebroadcast_time, warning_time;
5631 	int refcnt;
5632 
5633 	linkwatch_forget_dev(dev);
5634 
5635 	rebroadcast_time = warning_time = jiffies;
5636 	refcnt = netdev_refcnt_read(dev);
5637 
5638 	while (refcnt != 0) {
5639 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5640 			rtnl_lock();
5641 
5642 			/* Rebroadcast unregister notification */
5643 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5644 			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5645 			 * should have already handle it the first time */
5646 
5647 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5648 				     &dev->state)) {
5649 				/* We must not have linkwatch events
5650 				 * pending on unregister. If this
5651 				 * happens, we simply run the queue
5652 				 * unscheduled, resulting in a noop
5653 				 * for this device.
5654 				 */
5655 				linkwatch_run_queue();
5656 			}
5657 
5658 			__rtnl_unlock();
5659 
5660 			rebroadcast_time = jiffies;
5661 		}
5662 
5663 		msleep(250);
5664 
5665 		refcnt = netdev_refcnt_read(dev);
5666 
5667 		if (time_after(jiffies, warning_time + 10 * HZ)) {
5668 			printk(KERN_EMERG "unregister_netdevice: "
5669 			       "waiting for %s to become free. Usage "
5670 			       "count = %d\n",
5671 			       dev->name, refcnt);
5672 			warning_time = jiffies;
5673 		}
5674 	}
5675 }
5676 
5677 /* The sequence is:
5678  *
5679  *	rtnl_lock();
5680  *	...
5681  *	register_netdevice(x1);
5682  *	register_netdevice(x2);
5683  *	...
5684  *	unregister_netdevice(y1);
5685  *	unregister_netdevice(y2);
5686  *      ...
5687  *	rtnl_unlock();
5688  *	free_netdev(y1);
5689  *	free_netdev(y2);
5690  *
5691  * We are invoked by rtnl_unlock().
5692  * This allows us to deal with problems:
5693  * 1) We can delete sysfs objects which invoke hotplug
5694  *    without deadlocking with linkwatch via keventd.
5695  * 2) Since we run with the RTNL semaphore not held, we can sleep
5696  *    safely in order to wait for the netdev refcnt to drop to zero.
5697  *
5698  * We must not return until all unregister events added during
5699  * the interval the lock was held have been completed.
5700  */
5701 void netdev_run_todo(void)
5702 {
5703 	struct list_head list;
5704 
5705 	/* Snapshot list, allow later requests */
5706 	list_replace_init(&net_todo_list, &list);
5707 
5708 	__rtnl_unlock();
5709 
5710 	while (!list_empty(&list)) {
5711 		struct net_device *dev
5712 			= list_first_entry(&list, struct net_device, todo_list);
5713 		list_del(&dev->todo_list);
5714 
5715 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5716 			printk(KERN_ERR "network todo '%s' but state %d\n",
5717 			       dev->name, dev->reg_state);
5718 			dump_stack();
5719 			continue;
5720 		}
5721 
5722 		dev->reg_state = NETREG_UNREGISTERED;
5723 
5724 		on_each_cpu(flush_backlog, dev, 1);
5725 
5726 		netdev_wait_allrefs(dev);
5727 
5728 		/* paranoia */
5729 		BUG_ON(netdev_refcnt_read(dev));
5730 		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
5731 		WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
5732 		WARN_ON(dev->dn_ptr);
5733 
5734 		if (dev->destructor)
5735 			dev->destructor(dev);
5736 
5737 		/* Free network device */
5738 		kobject_put(&dev->dev.kobj);
5739 	}
5740 }
5741 
5742 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5743  * fields in the same order, with only the type differing.
5744  */
5745 static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5746 				    const struct net_device_stats *netdev_stats)
5747 {
5748 #if BITS_PER_LONG == 64
5749         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5750         memcpy(stats64, netdev_stats, sizeof(*stats64));
5751 #else
5752 	size_t i, n = sizeof(*stats64) / sizeof(u64);
5753 	const unsigned long *src = (const unsigned long *)netdev_stats;
5754 	u64 *dst = (u64 *)stats64;
5755 
5756 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5757 		     sizeof(*stats64) / sizeof(u64));
5758 	for (i = 0; i < n; i++)
5759 		dst[i] = src[i];
5760 #endif
5761 }
5762 
5763 /**
5764  *	dev_get_stats	- get network device statistics
5765  *	@dev: device to get statistics from
5766  *	@storage: place to store stats
5767  *
5768  *	Get network statistics from device. Return @storage.
5769  *	The device driver may provide its own method by setting
5770  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5771  *	otherwise the internal statistics structure is used.
5772  */
5773 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5774 					struct rtnl_link_stats64 *storage)
5775 {
5776 	const struct net_device_ops *ops = dev->netdev_ops;
5777 
5778 	if (ops->ndo_get_stats64) {
5779 		memset(storage, 0, sizeof(*storage));
5780 		ops->ndo_get_stats64(dev, storage);
5781 	} else if (ops->ndo_get_stats) {
5782 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5783 	} else {
5784 		netdev_stats_to_stats64(storage, &dev->stats);
5785 	}
5786 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5787 	return storage;
5788 }
5789 EXPORT_SYMBOL(dev_get_stats);
5790 
5791 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5792 {
5793 	struct netdev_queue *queue = dev_ingress_queue(dev);
5794 
5795 #ifdef CONFIG_NET_CLS_ACT
5796 	if (queue)
5797 		return queue;
5798 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5799 	if (!queue)
5800 		return NULL;
5801 	netdev_init_one_queue(dev, queue, NULL);
5802 	queue->qdisc = &noop_qdisc;
5803 	queue->qdisc_sleeping = &noop_qdisc;
5804 	rcu_assign_pointer(dev->ingress_queue, queue);
5805 #endif
5806 	return queue;
5807 }
5808 
5809 /**
5810  *	alloc_netdev_mqs - allocate network device
5811  *	@sizeof_priv:	size of private data to allocate space for
5812  *	@name:		device name format string
5813  *	@setup:		callback to initialize device
5814  *	@txqs:		the number of TX subqueues to allocate
5815  *	@rxqs:		the number of RX subqueues to allocate
5816  *
5817  *	Allocates a struct net_device with private data area for driver use
5818  *	and performs basic initialization.  Also allocates subquue structs
5819  *	for each queue on the device.
5820  */
5821 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5822 		void (*setup)(struct net_device *),
5823 		unsigned int txqs, unsigned int rxqs)
5824 {
5825 	struct net_device *dev;
5826 	size_t alloc_size;
5827 	struct net_device *p;
5828 
5829 	BUG_ON(strlen(name) >= sizeof(dev->name));
5830 
5831 	if (txqs < 1) {
5832 		pr_err("alloc_netdev: Unable to allocate device "
5833 		       "with zero queues.\n");
5834 		return NULL;
5835 	}
5836 
5837 #ifdef CONFIG_RPS
5838 	if (rxqs < 1) {
5839 		pr_err("alloc_netdev: Unable to allocate device "
5840 		       "with zero RX queues.\n");
5841 		return NULL;
5842 	}
5843 #endif
5844 
5845 	alloc_size = sizeof(struct net_device);
5846 	if (sizeof_priv) {
5847 		/* ensure 32-byte alignment of private area */
5848 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5849 		alloc_size += sizeof_priv;
5850 	}
5851 	/* ensure 32-byte alignment of whole construct */
5852 	alloc_size += NETDEV_ALIGN - 1;
5853 
5854 	p = kzalloc(alloc_size, GFP_KERNEL);
5855 	if (!p) {
5856 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5857 		return NULL;
5858 	}
5859 
5860 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5861 	dev->padded = (char *)dev - (char *)p;
5862 
5863 	dev->pcpu_refcnt = alloc_percpu(int);
5864 	if (!dev->pcpu_refcnt)
5865 		goto free_p;
5866 
5867 	if (dev_addr_init(dev))
5868 		goto free_pcpu;
5869 
5870 	dev_mc_init(dev);
5871 	dev_uc_init(dev);
5872 
5873 	dev_net_set(dev, &init_net);
5874 
5875 	dev->gso_max_size = GSO_MAX_SIZE;
5876 
5877 	INIT_LIST_HEAD(&dev->napi_list);
5878 	INIT_LIST_HEAD(&dev->unreg_list);
5879 	INIT_LIST_HEAD(&dev->link_watch_list);
5880 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5881 	setup(dev);
5882 
5883 	dev->num_tx_queues = txqs;
5884 	dev->real_num_tx_queues = txqs;
5885 	if (netif_alloc_netdev_queues(dev))
5886 		goto free_all;
5887 
5888 #ifdef CONFIG_RPS
5889 	dev->num_rx_queues = rxqs;
5890 	dev->real_num_rx_queues = rxqs;
5891 	if (netif_alloc_rx_queues(dev))
5892 		goto free_all;
5893 #endif
5894 
5895 	strcpy(dev->name, name);
5896 	dev->group = INIT_NETDEV_GROUP;
5897 	return dev;
5898 
5899 free_all:
5900 	free_netdev(dev);
5901 	return NULL;
5902 
5903 free_pcpu:
5904 	free_percpu(dev->pcpu_refcnt);
5905 	kfree(dev->_tx);
5906 #ifdef CONFIG_RPS
5907 	kfree(dev->_rx);
5908 #endif
5909 
5910 free_p:
5911 	kfree(p);
5912 	return NULL;
5913 }
5914 EXPORT_SYMBOL(alloc_netdev_mqs);
5915 
5916 /**
5917  *	free_netdev - free network device
5918  *	@dev: device
5919  *
5920  *	This function does the last stage of destroying an allocated device
5921  * 	interface. The reference to the device object is released.
5922  *	If this is the last reference then it will be freed.
5923  */
5924 void free_netdev(struct net_device *dev)
5925 {
5926 	struct napi_struct *p, *n;
5927 
5928 	release_net(dev_net(dev));
5929 
5930 	kfree(dev->_tx);
5931 #ifdef CONFIG_RPS
5932 	kfree(dev->_rx);
5933 #endif
5934 
5935 	kfree(rcu_dereference_raw(dev->ingress_queue));
5936 
5937 	/* Flush device addresses */
5938 	dev_addr_flush(dev);
5939 
5940 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5941 		netif_napi_del(p);
5942 
5943 	free_percpu(dev->pcpu_refcnt);
5944 	dev->pcpu_refcnt = NULL;
5945 
5946 	/*  Compatibility with error handling in drivers */
5947 	if (dev->reg_state == NETREG_UNINITIALIZED) {
5948 		kfree((char *)dev - dev->padded);
5949 		return;
5950 	}
5951 
5952 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5953 	dev->reg_state = NETREG_RELEASED;
5954 
5955 	/* will free via device release */
5956 	put_device(&dev->dev);
5957 }
5958 EXPORT_SYMBOL(free_netdev);
5959 
5960 /**
5961  *	synchronize_net -  Synchronize with packet receive processing
5962  *
5963  *	Wait for packets currently being received to be done.
5964  *	Does not block later packets from starting.
5965  */
5966 void synchronize_net(void)
5967 {
5968 	might_sleep();
5969 	if (rtnl_is_locked())
5970 		synchronize_rcu_expedited();
5971 	else
5972 		synchronize_rcu();
5973 }
5974 EXPORT_SYMBOL(synchronize_net);
5975 
5976 /**
5977  *	unregister_netdevice_queue - remove device from the kernel
5978  *	@dev: device
5979  *	@head: list
5980  *
5981  *	This function shuts down a device interface and removes it
5982  *	from the kernel tables.
5983  *	If head not NULL, device is queued to be unregistered later.
5984  *
5985  *	Callers must hold the rtnl semaphore.  You may want
5986  *	unregister_netdev() instead of this.
5987  */
5988 
5989 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5990 {
5991 	ASSERT_RTNL();
5992 
5993 	if (head) {
5994 		list_move_tail(&dev->unreg_list, head);
5995 	} else {
5996 		rollback_registered(dev);
5997 		/* Finish processing unregister after unlock */
5998 		net_set_todo(dev);
5999 	}
6000 }
6001 EXPORT_SYMBOL(unregister_netdevice_queue);
6002 
6003 /**
6004  *	unregister_netdevice_many - unregister many devices
6005  *	@head: list of devices
6006  */
6007 void unregister_netdevice_many(struct list_head *head)
6008 {
6009 	struct net_device *dev;
6010 
6011 	if (!list_empty(head)) {
6012 		rollback_registered_many(head);
6013 		list_for_each_entry(dev, head, unreg_list)
6014 			net_set_todo(dev);
6015 	}
6016 }
6017 EXPORT_SYMBOL(unregister_netdevice_many);
6018 
6019 /**
6020  *	unregister_netdev - remove device from the kernel
6021  *	@dev: device
6022  *
6023  *	This function shuts down a device interface and removes it
6024  *	from the kernel tables.
6025  *
6026  *	This is just a wrapper for unregister_netdevice that takes
6027  *	the rtnl semaphore.  In general you want to use this and not
6028  *	unregister_netdevice.
6029  */
6030 void unregister_netdev(struct net_device *dev)
6031 {
6032 	rtnl_lock();
6033 	unregister_netdevice(dev);
6034 	rtnl_unlock();
6035 }
6036 EXPORT_SYMBOL(unregister_netdev);
6037 
6038 /**
6039  *	dev_change_net_namespace - move device to different nethost namespace
6040  *	@dev: device
6041  *	@net: network namespace
6042  *	@pat: If not NULL name pattern to try if the current device name
6043  *	      is already taken in the destination network namespace.
6044  *
6045  *	This function shuts down a device interface and moves it
6046  *	to a new network namespace. On success 0 is returned, on
6047  *	a failure a netagive errno code is returned.
6048  *
6049  *	Callers must hold the rtnl semaphore.
6050  */
6051 
6052 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6053 {
6054 	int err;
6055 
6056 	ASSERT_RTNL();
6057 
6058 	/* Don't allow namespace local devices to be moved. */
6059 	err = -EINVAL;
6060 	if (dev->features & NETIF_F_NETNS_LOCAL)
6061 		goto out;
6062 
6063 	/* Ensure the device has been registrered */
6064 	err = -EINVAL;
6065 	if (dev->reg_state != NETREG_REGISTERED)
6066 		goto out;
6067 
6068 	/* Get out if there is nothing todo */
6069 	err = 0;
6070 	if (net_eq(dev_net(dev), net))
6071 		goto out;
6072 
6073 	/* Pick the destination device name, and ensure
6074 	 * we can use it in the destination network namespace.
6075 	 */
6076 	err = -EEXIST;
6077 	if (__dev_get_by_name(net, dev->name)) {
6078 		/* We get here if we can't use the current device name */
6079 		if (!pat)
6080 			goto out;
6081 		if (dev_get_valid_name(dev, pat) < 0)
6082 			goto out;
6083 	}
6084 
6085 	/*
6086 	 * And now a mini version of register_netdevice unregister_netdevice.
6087 	 */
6088 
6089 	/* If device is running close it first. */
6090 	dev_close(dev);
6091 
6092 	/* And unlink it from device chain */
6093 	err = -ENODEV;
6094 	unlist_netdevice(dev);
6095 
6096 	synchronize_net();
6097 
6098 	/* Shutdown queueing discipline. */
6099 	dev_shutdown(dev);
6100 
6101 	/* Notify protocols, that we are about to destroy
6102 	   this device. They should clean all the things.
6103 
6104 	   Note that dev->reg_state stays at NETREG_REGISTERED.
6105 	   This is wanted because this way 8021q and macvlan know
6106 	   the device is just moving and can keep their slaves up.
6107 	*/
6108 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6109 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
6110 
6111 	/*
6112 	 *	Flush the unicast and multicast chains
6113 	 */
6114 	dev_uc_flush(dev);
6115 	dev_mc_flush(dev);
6116 
6117 	/* Actually switch the network namespace */
6118 	dev_net_set(dev, net);
6119 
6120 	/* If there is an ifindex conflict assign a new one */
6121 	if (__dev_get_by_index(net, dev->ifindex)) {
6122 		int iflink = (dev->iflink == dev->ifindex);
6123 		dev->ifindex = dev_new_index(net);
6124 		if (iflink)
6125 			dev->iflink = dev->ifindex;
6126 	}
6127 
6128 	/* Fixup kobjects */
6129 	err = device_rename(&dev->dev, dev->name);
6130 	WARN_ON(err);
6131 
6132 	/* Add the device back in the hashes */
6133 	list_netdevice(dev);
6134 
6135 	/* Notify protocols, that a new device appeared. */
6136 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
6137 
6138 	/*
6139 	 *	Prevent userspace races by waiting until the network
6140 	 *	device is fully setup before sending notifications.
6141 	 */
6142 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6143 
6144 	synchronize_net();
6145 	err = 0;
6146 out:
6147 	return err;
6148 }
6149 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6150 
6151 static int dev_cpu_callback(struct notifier_block *nfb,
6152 			    unsigned long action,
6153 			    void *ocpu)
6154 {
6155 	struct sk_buff **list_skb;
6156 	struct sk_buff *skb;
6157 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
6158 	struct softnet_data *sd, *oldsd;
6159 
6160 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6161 		return NOTIFY_OK;
6162 
6163 	local_irq_disable();
6164 	cpu = smp_processor_id();
6165 	sd = &per_cpu(softnet_data, cpu);
6166 	oldsd = &per_cpu(softnet_data, oldcpu);
6167 
6168 	/* Find end of our completion_queue. */
6169 	list_skb = &sd->completion_queue;
6170 	while (*list_skb)
6171 		list_skb = &(*list_skb)->next;
6172 	/* Append completion queue from offline CPU. */
6173 	*list_skb = oldsd->completion_queue;
6174 	oldsd->completion_queue = NULL;
6175 
6176 	/* Append output queue from offline CPU. */
6177 	if (oldsd->output_queue) {
6178 		*sd->output_queue_tailp = oldsd->output_queue;
6179 		sd->output_queue_tailp = oldsd->output_queue_tailp;
6180 		oldsd->output_queue = NULL;
6181 		oldsd->output_queue_tailp = &oldsd->output_queue;
6182 	}
6183 	/* Append NAPI poll list from offline CPU. */
6184 	if (!list_empty(&oldsd->poll_list)) {
6185 		list_splice_init(&oldsd->poll_list, &sd->poll_list);
6186 		raise_softirq_irqoff(NET_RX_SOFTIRQ);
6187 	}
6188 
6189 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
6190 	local_irq_enable();
6191 
6192 	/* Process offline CPU's input_pkt_queue */
6193 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6194 		netif_rx(skb);
6195 		input_queue_head_incr(oldsd);
6196 	}
6197 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6198 		netif_rx(skb);
6199 		input_queue_head_incr(oldsd);
6200 	}
6201 
6202 	return NOTIFY_OK;
6203 }
6204 
6205 
6206 /**
6207  *	netdev_increment_features - increment feature set by one
6208  *	@all: current feature set
6209  *	@one: new feature set
6210  *	@mask: mask feature set
6211  *
6212  *	Computes a new feature set after adding a device with feature set
6213  *	@one to the master device with current feature set @all.  Will not
6214  *	enable anything that is off in @mask. Returns the new feature set.
6215  */
6216 u32 netdev_increment_features(u32 all, u32 one, u32 mask)
6217 {
6218 	if (mask & NETIF_F_GEN_CSUM)
6219 		mask |= NETIF_F_ALL_CSUM;
6220 	mask |= NETIF_F_VLAN_CHALLENGED;
6221 
6222 	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6223 	all &= one | ~NETIF_F_ALL_FOR_ALL;
6224 
6225 	/* If device needs checksumming, downgrade to it. */
6226 	if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
6227 		all &= ~NETIF_F_NO_CSUM;
6228 
6229 	/* If one device supports hw checksumming, set for all. */
6230 	if (all & NETIF_F_GEN_CSUM)
6231 		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6232 
6233 	return all;
6234 }
6235 EXPORT_SYMBOL(netdev_increment_features);
6236 
6237 static struct hlist_head *netdev_create_hash(void)
6238 {
6239 	int i;
6240 	struct hlist_head *hash;
6241 
6242 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6243 	if (hash != NULL)
6244 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
6245 			INIT_HLIST_HEAD(&hash[i]);
6246 
6247 	return hash;
6248 }
6249 
6250 /* Initialize per network namespace state */
6251 static int __net_init netdev_init(struct net *net)
6252 {
6253 	INIT_LIST_HEAD(&net->dev_base_head);
6254 
6255 	net->dev_name_head = netdev_create_hash();
6256 	if (net->dev_name_head == NULL)
6257 		goto err_name;
6258 
6259 	net->dev_index_head = netdev_create_hash();
6260 	if (net->dev_index_head == NULL)
6261 		goto err_idx;
6262 
6263 	return 0;
6264 
6265 err_idx:
6266 	kfree(net->dev_name_head);
6267 err_name:
6268 	return -ENOMEM;
6269 }
6270 
6271 /**
6272  *	netdev_drivername - network driver for the device
6273  *	@dev: network device
6274  *
6275  *	Determine network driver for device.
6276  */
6277 const char *netdev_drivername(const struct net_device *dev)
6278 {
6279 	const struct device_driver *driver;
6280 	const struct device *parent;
6281 	const char *empty = "";
6282 
6283 	parent = dev->dev.parent;
6284 	if (!parent)
6285 		return empty;
6286 
6287 	driver = parent->driver;
6288 	if (driver && driver->name)
6289 		return driver->name;
6290 	return empty;
6291 }
6292 
6293 static int __netdev_printk(const char *level, const struct net_device *dev,
6294 			   struct va_format *vaf)
6295 {
6296 	int r;
6297 
6298 	if (dev && dev->dev.parent)
6299 		r = dev_printk(level, dev->dev.parent, "%s: %pV",
6300 			       netdev_name(dev), vaf);
6301 	else if (dev)
6302 		r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6303 	else
6304 		r = printk("%s(NULL net_device): %pV", level, vaf);
6305 
6306 	return r;
6307 }
6308 
6309 int netdev_printk(const char *level, const struct net_device *dev,
6310 		  const char *format, ...)
6311 {
6312 	struct va_format vaf;
6313 	va_list args;
6314 	int r;
6315 
6316 	va_start(args, format);
6317 
6318 	vaf.fmt = format;
6319 	vaf.va = &args;
6320 
6321 	r = __netdev_printk(level, dev, &vaf);
6322 	va_end(args);
6323 
6324 	return r;
6325 }
6326 EXPORT_SYMBOL(netdev_printk);
6327 
6328 #define define_netdev_printk_level(func, level)			\
6329 int func(const struct net_device *dev, const char *fmt, ...)	\
6330 {								\
6331 	int r;							\
6332 	struct va_format vaf;					\
6333 	va_list args;						\
6334 								\
6335 	va_start(args, fmt);					\
6336 								\
6337 	vaf.fmt = fmt;						\
6338 	vaf.va = &args;						\
6339 								\
6340 	r = __netdev_printk(level, dev, &vaf);			\
6341 	va_end(args);						\
6342 								\
6343 	return r;						\
6344 }								\
6345 EXPORT_SYMBOL(func);
6346 
6347 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6348 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6349 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6350 define_netdev_printk_level(netdev_err, KERN_ERR);
6351 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6352 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6353 define_netdev_printk_level(netdev_info, KERN_INFO);
6354 
6355 static void __net_exit netdev_exit(struct net *net)
6356 {
6357 	kfree(net->dev_name_head);
6358 	kfree(net->dev_index_head);
6359 }
6360 
6361 static struct pernet_operations __net_initdata netdev_net_ops = {
6362 	.init = netdev_init,
6363 	.exit = netdev_exit,
6364 };
6365 
6366 static void __net_exit default_device_exit(struct net *net)
6367 {
6368 	struct net_device *dev, *aux;
6369 	/*
6370 	 * Push all migratable network devices back to the
6371 	 * initial network namespace
6372 	 */
6373 	rtnl_lock();
6374 	for_each_netdev_safe(net, dev, aux) {
6375 		int err;
6376 		char fb_name[IFNAMSIZ];
6377 
6378 		/* Ignore unmoveable devices (i.e. loopback) */
6379 		if (dev->features & NETIF_F_NETNS_LOCAL)
6380 			continue;
6381 
6382 		/* Leave virtual devices for the generic cleanup */
6383 		if (dev->rtnl_link_ops)
6384 			continue;
6385 
6386 		/* Push remaining network devices to init_net */
6387 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6388 		err = dev_change_net_namespace(dev, &init_net, fb_name);
6389 		if (err) {
6390 			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6391 				__func__, dev->name, err);
6392 			BUG();
6393 		}
6394 	}
6395 	rtnl_unlock();
6396 }
6397 
6398 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6399 {
6400 	/* At exit all network devices most be removed from a network
6401 	 * namespace.  Do this in the reverse order of registration.
6402 	 * Do this across as many network namespaces as possible to
6403 	 * improve batching efficiency.
6404 	 */
6405 	struct net_device *dev;
6406 	struct net *net;
6407 	LIST_HEAD(dev_kill_list);
6408 
6409 	rtnl_lock();
6410 	list_for_each_entry(net, net_list, exit_list) {
6411 		for_each_netdev_reverse(net, dev) {
6412 			if (dev->rtnl_link_ops)
6413 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6414 			else
6415 				unregister_netdevice_queue(dev, &dev_kill_list);
6416 		}
6417 	}
6418 	unregister_netdevice_many(&dev_kill_list);
6419 	list_del(&dev_kill_list);
6420 	rtnl_unlock();
6421 }
6422 
6423 static struct pernet_operations __net_initdata default_device_ops = {
6424 	.exit = default_device_exit,
6425 	.exit_batch = default_device_exit_batch,
6426 };
6427 
6428 /*
6429  *	Initialize the DEV module. At boot time this walks the device list and
6430  *	unhooks any devices that fail to initialise (normally hardware not
6431  *	present) and leaves us with a valid list of present and active devices.
6432  *
6433  */
6434 
6435 /*
6436  *       This is called single threaded during boot, so no need
6437  *       to take the rtnl semaphore.
6438  */
6439 static int __init net_dev_init(void)
6440 {
6441 	int i, rc = -ENOMEM;
6442 
6443 	BUG_ON(!dev_boot_phase);
6444 
6445 	if (dev_proc_init())
6446 		goto out;
6447 
6448 	if (netdev_kobject_init())
6449 		goto out;
6450 
6451 	INIT_LIST_HEAD(&ptype_all);
6452 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
6453 		INIT_LIST_HEAD(&ptype_base[i]);
6454 
6455 	if (register_pernet_subsys(&netdev_net_ops))
6456 		goto out;
6457 
6458 	/*
6459 	 *	Initialise the packet receive queues.
6460 	 */
6461 
6462 	for_each_possible_cpu(i) {
6463 		struct softnet_data *sd = &per_cpu(softnet_data, i);
6464 
6465 		memset(sd, 0, sizeof(*sd));
6466 		skb_queue_head_init(&sd->input_pkt_queue);
6467 		skb_queue_head_init(&sd->process_queue);
6468 		sd->completion_queue = NULL;
6469 		INIT_LIST_HEAD(&sd->poll_list);
6470 		sd->output_queue = NULL;
6471 		sd->output_queue_tailp = &sd->output_queue;
6472 #ifdef CONFIG_RPS
6473 		sd->csd.func = rps_trigger_softirq;
6474 		sd->csd.info = sd;
6475 		sd->csd.flags = 0;
6476 		sd->cpu = i;
6477 #endif
6478 
6479 		sd->backlog.poll = process_backlog;
6480 		sd->backlog.weight = weight_p;
6481 		sd->backlog.gro_list = NULL;
6482 		sd->backlog.gro_count = 0;
6483 	}
6484 
6485 	dev_boot_phase = 0;
6486 
6487 	/* The loopback device is special if any other network devices
6488 	 * is present in a network namespace the loopback device must
6489 	 * be present. Since we now dynamically allocate and free the
6490 	 * loopback device ensure this invariant is maintained by
6491 	 * keeping the loopback device as the first device on the
6492 	 * list of network devices.  Ensuring the loopback devices
6493 	 * is the first device that appears and the last network device
6494 	 * that disappears.
6495 	 */
6496 	if (register_pernet_device(&loopback_net_ops))
6497 		goto out;
6498 
6499 	if (register_pernet_device(&default_device_ops))
6500 		goto out;
6501 
6502 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6503 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6504 
6505 	hotcpu_notifier(dev_cpu_callback, 0);
6506 	dst_init();
6507 	dev_mcast_init();
6508 	rc = 0;
6509 out:
6510 	return rc;
6511 }
6512 
6513 subsys_initcall(net_dev_init);
6514 
6515 static int __init initialize_hashrnd(void)
6516 {
6517 	get_random_bytes(&hashrnd, sizeof(hashrnd));
6518 	return 0;
6519 }
6520 
6521 late_initcall_sync(initialize_hashrnd);
6522 
6523