xref: /openbmc/linux/net/core/dev.c (revision a2835763e130c343ace5320c20d33c281e7097b7)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/hash.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/proc_fs.h>
101 #include <linux/seq_file.h>
102 #include <linux/stat.h>
103 #include <linux/if_bridge.h>
104 #include <linux/if_macvlan.h>
105 #include <net/dst.h>
106 #include <net/pkt_sched.h>
107 #include <net/checksum.h>
108 #include <net/xfrm.h>
109 #include <linux/highmem.h>
110 #include <linux/init.h>
111 #include <linux/kmod.h>
112 #include <linux/module.h>
113 #include <linux/netpoll.h>
114 #include <linux/rcupdate.h>
115 #include <linux/delay.h>
116 #include <net/wext.h>
117 #include <net/iw_handler.h>
118 #include <asm/current.h>
119 #include <linux/audit.h>
120 #include <linux/dmaengine.h>
121 #include <linux/err.h>
122 #include <linux/ctype.h>
123 #include <linux/if_arp.h>
124 #include <linux/if_vlan.h>
125 #include <linux/ip.h>
126 #include <net/ip.h>
127 #include <linux/ipv6.h>
128 #include <linux/in.h>
129 #include <linux/jhash.h>
130 #include <linux/random.h>
131 #include <trace/events/napi.h>
132 
133 #include "net-sysfs.h"
134 
135 /* Instead of increasing this, you should create a hash table. */
136 #define MAX_GRO_SKBS 8
137 
138 /* This should be increased if a protocol with a bigger head is added. */
139 #define GRO_MAX_HEAD (MAX_HEADER + 128)
140 
141 /*
142  *	The list of packet types we will receive (as opposed to discard)
143  *	and the routines to invoke.
144  *
145  *	Why 16? Because with 16 the only overlap we get on a hash of the
146  *	low nibble of the protocol value is RARP/SNAP/X.25.
147  *
148  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
149  *             sure which should go first, but I bet it won't make much
150  *             difference if we are running VLANs.  The good news is that
151  *             this protocol won't be in the list unless compiled in, so
152  *             the average user (w/out VLANs) will not be adversely affected.
153  *             --BLG
154  *
155  *		0800	IP
156  *		8100    802.1Q VLAN
157  *		0001	802.3
158  *		0002	AX.25
159  *		0004	802.2
160  *		8035	RARP
161  *		0005	SNAP
162  *		0805	X.25
163  *		0806	ARP
164  *		8137	IPX
165  *		0009	Localtalk
166  *		86DD	IPv6
167  */
168 
169 #define PTYPE_HASH_SIZE	(16)
170 #define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
171 
172 static DEFINE_SPINLOCK(ptype_lock);
173 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
174 static struct list_head ptype_all __read_mostly;	/* Taps */
175 
176 /*
177  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
178  * semaphore.
179  *
180  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
181  *
182  * Writers must hold the rtnl semaphore while they loop through the
183  * dev_base_head list, and hold dev_base_lock for writing when they do the
184  * actual updates.  This allows pure readers to access the list even
185  * while a writer is preparing to update it.
186  *
187  * To put it another way, dev_base_lock is held for writing only to
188  * protect against pure readers; the rtnl semaphore provides the
189  * protection against other writers.
190  *
191  * See, for example usages, register_netdevice() and
192  * unregister_netdevice(), which must be called with the rtnl
193  * semaphore held.
194  */
195 DEFINE_RWLOCK(dev_base_lock);
196 EXPORT_SYMBOL(dev_base_lock);
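
/*
 * A minimal sketch of the pure-reader pattern described above
 * (hypothetical caller, not part of this file): either take
 * dev_base_lock for reading, or walk the list under RCU.
 *
 *	struct net_device *dev;
 *
 *	rcu_read_lock();
 *	for_each_netdev_rcu(&init_net, dev)
 *		printk(KERN_DEBUG "saw %s\n", dev->name);
 *	rcu_read_unlock();
 */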
197 
198 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
199 {
200 	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
201 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
202 }
203 
204 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
205 {
206 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
207 }
208 
209 /* Device list insertion */
210 static int list_netdevice(struct net_device *dev)
211 {
212 	struct net *net = dev_net(dev);
213 
214 	ASSERT_RTNL();
215 
216 	write_lock_bh(&dev_base_lock);
217 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
218 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
219 	hlist_add_head_rcu(&dev->index_hlist,
220 			   dev_index_hash(net, dev->ifindex));
221 	write_unlock_bh(&dev_base_lock);
222 	return 0;
223 }
224 
225 /* Device list removal
226  * caller must respect a RCU grace period before freeing/reusing dev
227  */
228 static void unlist_netdevice(struct net_device *dev)
229 {
230 	ASSERT_RTNL();
231 
232 	/* Unlink dev from the device chain */
233 	write_lock_bh(&dev_base_lock);
234 	list_del_rcu(&dev->dev_list);
235 	hlist_del_rcu(&dev->name_hlist);
236 	hlist_del_rcu(&dev->index_hlist);
237 	write_unlock_bh(&dev_base_lock);
238 }
239 
240 /*
241  *	Our notifier list
242  */
243 
244 static RAW_NOTIFIER_HEAD(netdev_chain);
245 
246 /*
247  *	Device drivers call our routines to queue packets here. We empty the
248  *	queue in the local softnet handler.
249  */
250 
251 DEFINE_PER_CPU(struct softnet_data, softnet_data);
252 EXPORT_PER_CPU_SYMBOL(softnet_data);
253 
254 #ifdef CONFIG_LOCKDEP
255 /*
256  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
257  * according to dev->type
258  */
259 static const unsigned short netdev_lock_type[] =
260 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
261 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
262 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
263 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
264 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
265 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
266 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
267 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
268 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
269 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
270 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
271 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
272 	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
273 	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
274 	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
275 	 ARPHRD_VOID, ARPHRD_NONE};
276 
277 static const char *const netdev_lock_name[] =
278 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
279 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
280 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
281 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
282 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
283 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
284 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
285 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
286 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
287 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
288 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
289 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
290 	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
291 	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
292 	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
293 	 "_xmit_VOID", "_xmit_NONE"};
294 
295 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
296 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
297 
298 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
299 {
300 	int i;
301 
302 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
303 		if (netdev_lock_type[i] == dev_type)
304 			return i;
305 	/* the last key is used by default */
306 	return ARRAY_SIZE(netdev_lock_type) - 1;
307 }
308 
309 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
310 						 unsigned short dev_type)
311 {
312 	int i;
313 
314 	i = netdev_lock_pos(dev_type);
315 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
316 				   netdev_lock_name[i]);
317 }
318 
319 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
320 {
321 	int i;
322 
323 	i = netdev_lock_pos(dev->type);
324 	lockdep_set_class_and_name(&dev->addr_list_lock,
325 				   &netdev_addr_lock_key[i],
326 				   netdev_lock_name[i]);
327 }
328 #else
329 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
330 						 unsigned short dev_type)
331 {
332 }
333 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
334 {
335 }
336 #endif
337 
338 /*******************************************************************************
339 
340 		Protocol management and registration routines
341 
342 *******************************************************************************/
343 
344 /*
345  *	Add a protocol ID to the list. Now that the input handler is
346  *	smarter we can dispense with all the messy stuff that used to be
347  *	here.
348  *
349  *	BEWARE!!! Protocol handlers, mangling input packets,
350  *	MUST BE last in hash buckets and checking protocol handlers
351  *	MUST start from promiscuous ptype_all chain in net_bh.
352  *	It is true now, do not change it.
353  *	Explanation follows: if a protocol handler that mangles the packet
354  *	is first on the list, it is not able to sense that the packet
355  *	is cloned and should be copied-on-write, so it will
356  *	change it and subsequent readers will get a broken packet.
357  *							--ANK (980803)
358  */
359 
360 /**
361  *	dev_add_pack - add packet handler
362  *	@pt: packet type declaration
363  *
364  *	Add a protocol handler to the networking stack. The passed &packet_type
365  *	is linked into kernel lists and may not be freed until it has been
366  *	removed from the kernel lists.
367  *
368  *	This call does not sleep and therefore cannot
369  *	guarantee that all CPUs that are in the middle of receiving packets
370  *	will see the new packet type (until the next received packet).
371  */
372 
373 void dev_add_pack(struct packet_type *pt)
374 {
375 	int hash;
376 
377 	spin_lock_bh(&ptype_lock);
378 	if (pt->type == htons(ETH_P_ALL))
379 		list_add_rcu(&pt->list, &ptype_all);
380 	else {
381 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
382 		list_add_rcu(&pt->list, &ptype_base[hash]);
383 	}
384 	spin_unlock_bh(&ptype_lock);
385 }
386 EXPORT_SYMBOL(dev_add_pack);
387 
388 /**
389  *	__dev_remove_pack	 - remove packet handler
390  *	@pt: packet type declaration
391  *
392  *	Remove a protocol handler that was previously added to the kernel
393  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
394  *	from the kernel lists and can be freed or reused once this function
395  *	returns.
396  *
397  *      The packet type might still be in use by receivers
398  *	and must not be freed until after all the CPUs have gone
399  *	through a quiescent state.
400  */
401 void __dev_remove_pack(struct packet_type *pt)
402 {
403 	struct list_head *head;
404 	struct packet_type *pt1;
405 
406 	spin_lock_bh(&ptype_lock);
407 
408 	if (pt->type == htons(ETH_P_ALL))
409 		head = &ptype_all;
410 	else
411 		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
412 
413 	list_for_each_entry(pt1, head, list) {
414 		if (pt == pt1) {
415 			list_del_rcu(&pt->list);
416 			goto out;
417 		}
418 	}
419 
420 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
421 out:
422 	spin_unlock_bh(&ptype_lock);
423 }
424 EXPORT_SYMBOL(__dev_remove_pack);
425 
426 /**
427  *	dev_remove_pack	 - remove packet handler
428  *	@pt: packet type declaration
429  *
430  *	Remove a protocol handler that was previously added to the kernel
431  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
432  *	from the kernel lists and can be freed or reused once this function
433  *	returns.
434  *
435  *	This call sleeps to guarantee that no CPU is looking at the packet
436  *	type after return.
437  */
438 void dev_remove_pack(struct packet_type *pt)
439 {
440 	__dev_remove_pack(pt);
441 
442 	synchronize_net();
443 }
444 EXPORT_SYMBOL(dev_remove_pack);
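
/*
 * Illustrative sketch of registering a tap with dev_add_pack() and
 * removing it with dev_remove_pack(). The handler and its name are
 * hypothetical; a real handler must free or deliver the skb it is given.
 *
 *	static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *				   struct packet_type *pt,
 *				   struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type example_tap __read_mostly = {
 *		.type	= htons(ETH_P_ALL),	/- all protocols, lands on ptype_all
 *		.func	= example_tap_rcv,
 *	};
 *
 *	dev_add_pack(&example_tap);
 *	...
 *	dev_remove_pack(&example_tap);	(sleeps; see synchronize_net() above)
 */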
445 
446 /******************************************************************************
447 
448 		      Device Boot-time Settings Routines
449 
450 *******************************************************************************/
451 
452 /* Boot time configuration table */
453 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
454 
455 /**
456  *	netdev_boot_setup_add	- add new setup entry
457  *	@name: name of the device
458  *	@map: configured settings for the device
459  *
460  *	Adds a new setup entry to the dev_boot_setup list.  The function
461  *	returns 0 on error and 1 on success.  This is a generic routine for
462  *	all netdevices.
463  */
464 static int netdev_boot_setup_add(char *name, struct ifmap *map)
465 {
466 	struct netdev_boot_setup *s;
467 	int i;
468 
469 	s = dev_boot_setup;
470 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
471 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
472 			memset(s[i].name, 0, sizeof(s[i].name));
473 			strlcpy(s[i].name, name, IFNAMSIZ);
474 			memcpy(&s[i].map, map, sizeof(s[i].map));
475 			break;
476 		}
477 	}
478 
479 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
480 }
481 
482 /**
483  *	netdev_boot_setup_check	- check boot time settings
484  *	@dev: the netdevice
485  *
486  * 	Check boot time settings for the device.
487  *	The found settings are set for the device to be used
488  *	later in the device probing.
489  *	Returns 0 if no settings were found, 1 if they were.
490  */
491 int netdev_boot_setup_check(struct net_device *dev)
492 {
493 	struct netdev_boot_setup *s = dev_boot_setup;
494 	int i;
495 
496 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
497 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
498 		    !strcmp(dev->name, s[i].name)) {
499 			dev->irq 	= s[i].map.irq;
500 			dev->base_addr 	= s[i].map.base_addr;
501 			dev->mem_start 	= s[i].map.mem_start;
502 			dev->mem_end 	= s[i].map.mem_end;
503 			return 1;
504 		}
505 	}
506 	return 0;
507 }
508 EXPORT_SYMBOL(netdev_boot_setup_check);
509 
510 
511 /**
512  *	netdev_boot_base	- get address from boot time settings
513  *	@prefix: prefix for network device
514  *	@unit: id for network device
515  *
516  * 	Check boot time settings for the base address of device.
517  *	The found settings are set for the device to be used
518  *	later in the device probing.
519  *	Returns 0 if no settings found.
520  */
521 unsigned long netdev_boot_base(const char *prefix, int unit)
522 {
523 	const struct netdev_boot_setup *s = dev_boot_setup;
524 	char name[IFNAMSIZ];
525 	int i;
526 
527 	sprintf(name, "%s%d", prefix, unit);
528 
529 	/*
530 	 * If device already registered then return base of 1
531 	 * to indicate not to probe for this interface
532 	 */
533 	if (__dev_get_by_name(&init_net, name))
534 		return 1;
535 
536 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
537 		if (!strcmp(name, s[i].name))
538 			return s[i].map.base_addr;
539 	return 0;
540 }
541 
542 /*
543  * Saves the settings configured at boot time for any netdevice.
544  */
545 int __init netdev_boot_setup(char *str)
546 {
547 	int ints[5];
548 	struct ifmap map;
549 
550 	str = get_options(str, ARRAY_SIZE(ints), ints);
551 	if (!str || !*str)
552 		return 0;
553 
554 	/* Save settings */
555 	memset(&map, 0, sizeof(map));
556 	if (ints[0] > 0)
557 		map.irq = ints[1];
558 	if (ints[0] > 1)
559 		map.base_addr = ints[2];
560 	if (ints[0] > 2)
561 		map.mem_start = ints[3];
562 	if (ints[0] > 3)
563 		map.mem_end = ints[4];
564 
565 	/* Add new entry to the list */
566 	return netdev_boot_setup_add(str, &map);
567 }
568 
569 __setup("netdev=", netdev_boot_setup);
570 
571 /*******************************************************************************
572 
573 			    Device Interface Subroutines
574 
575 *******************************************************************************/
576 
577 /**
578  *	__dev_get_by_name	- find a device by its name
579  *	@net: the applicable net namespace
580  *	@name: name to find
581  *
582  *	Find an interface by name. Must be called under the RTNL semaphore
583  *	or @dev_base_lock. If the name is found a pointer to the device
584  *	is returned. If the name is not found then %NULL is returned. The
585  *	reference counters are not incremented so the caller must be
586  *	careful with locks.
587  */
588 
589 struct net_device *__dev_get_by_name(struct net *net, const char *name)
590 {
591 	struct hlist_node *p;
592 	struct net_device *dev;
593 	struct hlist_head *head = dev_name_hash(net, name);
594 
595 	hlist_for_each_entry(dev, p, head, name_hlist)
596 		if (!strncmp(dev->name, name, IFNAMSIZ))
597 			return dev;
598 
599 	return NULL;
600 }
601 EXPORT_SYMBOL(__dev_get_by_name);
602 
603 /**
604  *	dev_get_by_name_rcu	- find a device by its name
605  *	@net: the applicable net namespace
606  *	@name: name to find
607  *
608  *	Find an interface by name.
609  *	If the name is found a pointer to the device is returned.
610  * 	If the name is not found then %NULL is returned.
611  *	The reference counters are not incremented so the caller must be
612  *	careful with locks. The caller must hold RCU lock.
613  */
614 
615 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
616 {
617 	struct hlist_node *p;
618 	struct net_device *dev;
619 	struct hlist_head *head = dev_name_hash(net, name);
620 
621 	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
622 		if (!strncmp(dev->name, name, IFNAMSIZ))
623 			return dev;
624 
625 	return NULL;
626 }
627 EXPORT_SYMBOL(dev_get_by_name_rcu);
628 
629 /**
630  *	dev_get_by_name		- find a device by its name
631  *	@net: the applicable net namespace
632  *	@name: name to find
633  *
634  *	Find an interface by name. This can be called from any
635  *	context and does its own locking. The returned handle has
636  *	the usage count incremented and the caller must use dev_put() to
637  *	release it when it is no longer needed. %NULL is returned if no
638  *	matching device is found.
639  */
640 
641 struct net_device *dev_get_by_name(struct net *net, const char *name)
642 {
643 	struct net_device *dev;
644 
645 	rcu_read_lock();
646 	dev = dev_get_by_name_rcu(net, name);
647 	if (dev)
648 		dev_hold(dev);
649 	rcu_read_unlock();
650 	return dev;
651 }
652 EXPORT_SYMBOL(dev_get_by_name);
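
/*
 * Minimal usage sketch (hypothetical caller): dev_get_by_name() takes
 * a reference, so the device must be released with dev_put() when the
 * caller is done with it.
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		printk(KERN_INFO "%s has ifindex %d\n", dev->name, dev->ifindex);
 *		dev_put(dev);
 *	}
 */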
653 
654 /**
655  *	__dev_get_by_index - find a device by its ifindex
656  *	@net: the applicable net namespace
657  *	@ifindex: index of device
658  *
659  *	Search for an interface by index. Returns %NULL if the device
660  *	is not found or a pointer to the device. The device has not
661  *	had its reference counter increased so the caller must be careful
662  *	about locking. The caller must hold either the RTNL semaphore
663  *	or @dev_base_lock.
664  */
665 
666 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
667 {
668 	struct hlist_node *p;
669 	struct net_device *dev;
670 	struct hlist_head *head = dev_index_hash(net, ifindex);
671 
672 	hlist_for_each_entry(dev, p, head, index_hlist)
673 		if (dev->ifindex == ifindex)
674 			return dev;
675 
676 	return NULL;
677 }
678 EXPORT_SYMBOL(__dev_get_by_index);
679 
680 /**
681  *	dev_get_by_index_rcu - find a device by its ifindex
682  *	@net: the applicable net namespace
683  *	@ifindex: index of device
684  *
685  *	Search for an interface by index. Returns %NULL if the device
686  *	is not found or a pointer to the device. The device has not
687  *	had its reference counter increased so the caller must be careful
688  *	about locking. The caller must hold RCU lock.
689  */
690 
691 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
692 {
693 	struct hlist_node *p;
694 	struct net_device *dev;
695 	struct hlist_head *head = dev_index_hash(net, ifindex);
696 
697 	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
698 		if (dev->ifindex == ifindex)
699 			return dev;
700 
701 	return NULL;
702 }
703 EXPORT_SYMBOL(dev_get_by_index_rcu);
704 
705 
706 /**
707  *	dev_get_by_index - find a device by its ifindex
708  *	@net: the applicable net namespace
709  *	@ifindex: index of device
710  *
711  *	Search for an interface by index. Returns NULL if the device
712  *	is not found or a pointer to the device. The device returned has
713  *	had a reference added and the pointer is safe until the user calls
714  *	dev_put to indicate they have finished with it.
715  */
716 
717 struct net_device *dev_get_by_index(struct net *net, int ifindex)
718 {
719 	struct net_device *dev;
720 
721 	rcu_read_lock();
722 	dev = dev_get_by_index_rcu(net, ifindex);
723 	if (dev)
724 		dev_hold(dev);
725 	rcu_read_unlock();
726 	return dev;
727 }
728 EXPORT_SYMBOL(dev_get_by_index);
729 
730 /**
731  *	dev_getbyhwaddr - find a device by its hardware address
732  *	@net: the applicable net namespace
733  *	@type: media type of device
734  *	@ha: hardware address
735  *
736  *	Search for an interface by MAC address. Returns NULL if the device
737  *	is not found or a pointer to the device. The caller must hold the
738  *	rtnl semaphore. The returned device has not had its ref count increased
739  *	and the caller must therefore be careful about locking
740  *
741  *	BUGS:
742  *	If the API was consistent this would be __dev_get_by_hwaddr
743  */
744 
745 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
746 {
747 	struct net_device *dev;
748 
749 	ASSERT_RTNL();
750 
751 	for_each_netdev(net, dev)
752 		if (dev->type == type &&
753 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
754 			return dev;
755 
756 	return NULL;
757 }
758 EXPORT_SYMBOL(dev_getbyhwaddr);
759 
760 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
761 {
762 	struct net_device *dev;
763 
764 	ASSERT_RTNL();
765 	for_each_netdev(net, dev)
766 		if (dev->type == type)
767 			return dev;
768 
769 	return NULL;
770 }
771 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
772 
773 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
774 {
775 	struct net_device *dev;
776 
777 	rtnl_lock();
778 	dev = __dev_getfirstbyhwtype(net, type);
779 	if (dev)
780 		dev_hold(dev);
781 	rtnl_unlock();
782 	return dev;
783 }
784 EXPORT_SYMBOL(dev_getfirstbyhwtype);
785 
786 /**
787  *	dev_get_by_flags - find any device with given flags
788  *	@net: the applicable net namespace
789  *	@if_flags: IFF_* values
790  *	@mask: bitmask of bits in if_flags to check
791  *
792  *	Search for any interface with the given flags. Returns NULL if a device
793  *	is not found or a pointer to the device. The device returned has
794  *	had a reference added and the pointer is safe until the user calls
795  *	dev_put to indicate they have finished with it.
796  */
797 
798 struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
799 				    unsigned short mask)
800 {
801 	struct net_device *dev, *ret;
802 
803 	ret = NULL;
804 	rcu_read_lock();
805 	for_each_netdev_rcu(net, dev) {
806 		if (((dev->flags ^ if_flags) & mask) == 0) {
807 			dev_hold(dev);
808 			ret = dev;
809 			break;
810 		}
811 	}
812 	rcu_read_unlock();
813 	return ret;
814 }
815 EXPORT_SYMBOL(dev_get_by_flags);
816 
817 /**
818  *	dev_valid_name - check if name is okay for network device
819  *	@name: name string
820  *
821  *	Network device names need to be valid file names
822  *	to allow sysfs to work.  We also disallow any kind of
823  *	whitespace.
824  */
825 int dev_valid_name(const char *name)
826 {
827 	if (*name == '\0')
828 		return 0;
829 	if (strlen(name) >= IFNAMSIZ)
830 		return 0;
831 	if (!strcmp(name, ".") || !strcmp(name, ".."))
832 		return 0;
833 
834 	while (*name) {
835 		if (*name == '/' || isspace(*name))
836 			return 0;
837 		name++;
838 	}
839 	return 1;
840 }
841 EXPORT_SYMBOL(dev_valid_name);
842 
843 /**
844  *	__dev_alloc_name - allocate a name for a device
845  *	@net: network namespace to allocate the device name in
846  *	@name: name format string
847  *	@buf:  scratch buffer and result name string
848  *
849  *	Passed a format string - e.g. "lt%d" - it will try to find a suitable
850  *	id. It scans the list of devices to build up a free map, then chooses
851  *	the first empty slot. The caller must hold the dev_base or rtnl lock
852  *	while allocating the name and adding the device in order to avoid
853  *	duplicates.
854  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
855  *	Returns the number of the unit assigned or a negative errno code.
856  */
857 
858 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
859 {
860 	int i = 0;
861 	const char *p;
862 	const int max_netdevices = 8*PAGE_SIZE;
863 	unsigned long *inuse;
864 	struct net_device *d;
865 
866 	p = strnchr(name, IFNAMSIZ-1, '%');
867 	if (p) {
868 		/*
869 		 * Verify the string as this thing may have come from
870 		 * the user.  There must be either one "%d" and no other "%"
871 		 * characters.
872 		 */
873 		if (p[1] != 'd' || strchr(p + 2, '%'))
874 			return -EINVAL;
875 
876 		/* Use one page as a bit array of possible slots */
877 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
878 		if (!inuse)
879 			return -ENOMEM;
880 
881 		for_each_netdev(net, d) {
882 			if (!sscanf(d->name, name, &i))
883 				continue;
884 			if (i < 0 || i >= max_netdevices)
885 				continue;
886 
887 			/*  avoid cases where sscanf is not exact inverse of printf */
888 			snprintf(buf, IFNAMSIZ, name, i);
889 			if (!strncmp(buf, d->name, IFNAMSIZ))
890 				set_bit(i, inuse);
891 		}
892 
893 		i = find_first_zero_bit(inuse, max_netdevices);
894 		free_page((unsigned long) inuse);
895 	}
896 
897 	if (buf != name)
898 		snprintf(buf, IFNAMSIZ, name, i);
899 	if (!__dev_get_by_name(net, buf))
900 		return i;
901 
902 	/* It is possible to run out of possible slots
903 	 * when the name is long and there isn't enough space left
904 	 * for the digits, or if all bits are used.
905 	 */
906 	return -ENFILE;
907 }
908 
909 /**
910  *	dev_alloc_name - allocate a name for a device
911  *	@dev: device
912  *	@name: name format string
913  *
914  *	Passed a format string - e.g. "lt%d" - it will try to find a suitable
915  *	id. It scans the list of devices to build up a free map, then chooses
916  *	the first empty slot. The caller must hold the dev_base or rtnl lock
917  *	while allocating the name and adding the device in order to avoid
918  *	duplicates.
919  *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
920  *	Returns the number of the unit assigned or a negative errno code.
921  */
922 
923 int dev_alloc_name(struct net_device *dev, const char *name)
924 {
925 	char buf[IFNAMSIZ];
926 	struct net *net;
927 	int ret;
928 
929 	BUG_ON(!dev_net(dev));
930 	net = dev_net(dev);
931 	ret = __dev_alloc_name(net, name, buf);
932 	if (ret >= 0)
933 		strlcpy(dev->name, buf, IFNAMSIZ);
934 	return ret;
935 }
936 EXPORT_SYMBOL(dev_alloc_name);
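
/*
 * Sketch of typical driver-side usage (names are illustrative): pick the
 * next free "eth%d" unit for a freshly allocated net_device before
 * registering it, all under the rtnl lock.
 *
 *	rtnl_lock();
 *	err = dev_alloc_name(dev, "eth%d");
 *	if (err >= 0)
 *		err = register_netdevice(dev);
 *	rtnl_unlock();
 */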
937 
938 static int dev_get_valid_name(struct net *net, const char *name, char *buf,
939 			      bool fmt)
940 {
941 	if (!dev_valid_name(name))
942 		return -EINVAL;
943 
944 	if (fmt && strchr(name, '%'))
945 		return __dev_alloc_name(net, name, buf);
946 	else if (__dev_get_by_name(net, name))
947 		return -EEXIST;
948 	else if (buf != name)
949 		strlcpy(buf, name, IFNAMSIZ);
950 
951 	return 0;
952 }
953 
954 /**
955  *	dev_change_name - change name of a device
956  *	@dev: device
957  *	@newname: name (or format string) must be at least IFNAMSIZ
958  *
959  *	Change the name of a device. A format string such as "eth%d"
960  *	can be passed for wildcarding.
961  */
962 int dev_change_name(struct net_device *dev, const char *newname)
963 {
964 	char oldname[IFNAMSIZ];
965 	int err = 0;
966 	int ret;
967 	struct net *net;
968 
969 	ASSERT_RTNL();
970 	BUG_ON(!dev_net(dev));
971 
972 	net = dev_net(dev);
973 	if (dev->flags & IFF_UP)
974 		return -EBUSY;
975 
976 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
977 		return 0;
978 
979 	memcpy(oldname, dev->name, IFNAMSIZ);
980 
981 	err = dev_get_valid_name(net, newname, dev->name, 1);
982 	if (err < 0)
983 		return err;
984 
985 rollback:
986 	/* For now only devices in the initial network namespace
987 	 * are in sysfs.
988 	 */
989 	if (net_eq(net, &init_net)) {
990 		ret = device_rename(&dev->dev, dev->name);
991 		if (ret) {
992 			memcpy(dev->name, oldname, IFNAMSIZ);
993 			return ret;
994 		}
995 	}
996 
997 	write_lock_bh(&dev_base_lock);
998 	hlist_del(&dev->name_hlist);
999 	write_unlock_bh(&dev_base_lock);
1000 
1001 	synchronize_rcu();
1002 
1003 	write_lock_bh(&dev_base_lock);
1004 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1005 	write_unlock_bh(&dev_base_lock);
1006 
1007 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1008 	ret = notifier_to_errno(ret);
1009 
1010 	if (ret) {
1011 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1012 		if (err >= 0) {
1013 			err = ret;
1014 			memcpy(dev->name, oldname, IFNAMSIZ);
1015 			goto rollback;
1016 		} else {
1017 			printk(KERN_ERR
1018 			       "%s: name change rollback failed: %d.\n",
1019 			       dev->name, ret);
1020 		}
1021 	}
1022 
1023 	return err;
1024 }
1025 
1026 /**
1027  *	dev_set_alias - change ifalias of a device
1028  *	@dev: device
1029  *	@alias: name up to IFALIASZ
1030  *	@len: limit of bytes to copy from info
1031  *
1032  *	Set the ifalias for a device.
1033  */
1034 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1035 {
1036 	ASSERT_RTNL();
1037 
1038 	if (len >= IFALIASZ)
1039 		return -EINVAL;
1040 
1041 	if (!len) {
1042 		if (dev->ifalias) {
1043 			kfree(dev->ifalias);
1044 			dev->ifalias = NULL;
1045 		}
1046 		return 0;
1047 	}
1048 
1049 	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1050 	if (!dev->ifalias)
1051 		return -ENOMEM;
1052 
1053 	strlcpy(dev->ifalias, alias, len+1);
1054 	return len;
1055 }
1056 
1057 
1058 /**
1059  *	netdev_features_change - device changes features
1060  *	@dev: device to cause notification
1061  *
1062  *	Called to indicate a device has changed features.
1063  */
1064 void netdev_features_change(struct net_device *dev)
1065 {
1066 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1067 }
1068 EXPORT_SYMBOL(netdev_features_change);
1069 
1070 /**
1071  *	netdev_state_change - device changes state
1072  *	@dev: device to cause notification
1073  *
1074  *	Called to indicate a device has changed state. This function calls
1075  *	the notifier chains for netdev_chain and sends a NEWLINK message
1076  *	to the routing socket.
1077  */
1078 void netdev_state_change(struct net_device *dev)
1079 {
1080 	if (dev->flags & IFF_UP) {
1081 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
1082 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1083 	}
1084 }
1085 EXPORT_SYMBOL(netdev_state_change);
1086 
1087 void netdev_bonding_change(struct net_device *dev, unsigned long event)
1088 {
1089 	call_netdevice_notifiers(event, dev);
1090 }
1091 EXPORT_SYMBOL(netdev_bonding_change);
1092 
1093 /**
1094  *	dev_load 	- load a network module
1095  *	@net: the applicable net namespace
1096  *	@name: name of interface
1097  *
1098  *	If a network interface is not present and the process has suitable
1099  *	privileges, this function loads the module. If module loading is not
1100  *	available in this kernel then it becomes a nop.
1101  */
1102 
1103 void dev_load(struct net *net, const char *name)
1104 {
1105 	struct net_device *dev;
1106 
1107 	rcu_read_lock();
1108 	dev = dev_get_by_name_rcu(net, name);
1109 	rcu_read_unlock();
1110 
1111 	if (!dev && capable(CAP_NET_ADMIN))
1112 		request_module("%s", name);
1113 }
1114 EXPORT_SYMBOL(dev_load);
1115 
1116 /**
1117  *	dev_open	- prepare an interface for use.
1118  *	@dev:	device to open
1119  *
1120  *	Takes a device from down to up state. The device's private open
1121  *	function is invoked and then the multicast lists are loaded. Finally
1122  *	the device is moved into the up state and a %NETDEV_UP message is
1123  *	sent to the netdev notifier chain.
1124  *
1125  *	Calling this function on an active interface is a nop. On a failure
1126  *	a negative errno code is returned.
1127  */
1128 int dev_open(struct net_device *dev)
1129 {
1130 	const struct net_device_ops *ops = dev->netdev_ops;
1131 	int ret;
1132 
1133 	ASSERT_RTNL();
1134 
1135 	/*
1136 	 *	Is it already up?
1137 	 */
1138 
1139 	if (dev->flags & IFF_UP)
1140 		return 0;
1141 
1142 	/*
1143 	 *	Is it even present?
1144 	 */
1145 	if (!netif_device_present(dev))
1146 		return -ENODEV;
1147 
1148 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1149 	ret = notifier_to_errno(ret);
1150 	if (ret)
1151 		return ret;
1152 
1153 	/*
1154 	 *	Call device private open method
1155 	 */
1156 	set_bit(__LINK_STATE_START, &dev->state);
1157 
1158 	if (ops->ndo_validate_addr)
1159 		ret = ops->ndo_validate_addr(dev);
1160 
1161 	if (!ret && ops->ndo_open)
1162 		ret = ops->ndo_open(dev);
1163 
1164 	/*
1165 	 *	If it went open OK then:
1166 	 */
1167 
1168 	if (ret)
1169 		clear_bit(__LINK_STATE_START, &dev->state);
1170 	else {
1171 		/*
1172 		 *	Set the flags.
1173 		 */
1174 		dev->flags |= IFF_UP;
1175 
1176 		/*
1177 		 *	Enable NET_DMA
1178 		 */
1179 		net_dmaengine_get();
1180 
1181 		/*
1182 		 *	Initialize multicasting status
1183 		 */
1184 		dev_set_rx_mode(dev);
1185 
1186 		/*
1187 		 *	Wakeup transmit queue engine
1188 		 */
1189 		dev_activate(dev);
1190 
1191 		/*
1192 		 *	... and announce new interface.
1193 		 */
1194 		call_netdevice_notifiers(NETDEV_UP, dev);
1195 	}
1196 
1197 	return ret;
1198 }
1199 EXPORT_SYMBOL(dev_open);
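
/*
 * Administrative bring-up, sketched for illustration: dev_open() must be
 * called with the rtnl lock held, so out-of-line callers wrap it.
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 */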
1200 
1201 /**
1202  *	dev_close - shutdown an interface.
1203  *	@dev: device to shutdown
1204  *
1205  *	This function moves an active device into down state. A
1206  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1207  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1208  *	chain.
1209  */
1210 int dev_close(struct net_device *dev)
1211 {
1212 	const struct net_device_ops *ops = dev->netdev_ops;
1213 	ASSERT_RTNL();
1214 
1215 	might_sleep();
1216 
1217 	if (!(dev->flags & IFF_UP))
1218 		return 0;
1219 
1220 	/*
1221 	 *	Tell people we are going down, so that they can
1222 	 *	prepare for it while the device is still operating.
1223 	 */
1224 	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1225 
1226 	clear_bit(__LINK_STATE_START, &dev->state);
1227 
1228 	/* Synchronize to scheduled poll. We cannot touch the poll list,
1229 	 * it can even be on a different CPU. So just clear netif_running().
1230 	 *
1231 	 * dev->stop() will invoke napi_disable() on all of its
1232 	 * napi_struct instances on this device.
1233 	 */
1234 	smp_mb__after_clear_bit(); /* Commit netif_running(). */
1235 
1236 	dev_deactivate(dev);
1237 
1238 	/*
1239 	 *	Call the device specific close. This cannot fail.
1240 	 *	Only if device is UP
1241 	 *
1242 	 *	We allow it to be called even after a DETACH hot-plug
1243 	 *	event.
1244 	 */
1245 	if (ops->ndo_stop)
1246 		ops->ndo_stop(dev);
1247 
1248 	/*
1249 	 *	Device is now down.
1250 	 */
1251 
1252 	dev->flags &= ~IFF_UP;
1253 
1254 	/*
1255 	 * Tell people we are down
1256 	 */
1257 	call_netdevice_notifiers(NETDEV_DOWN, dev);
1258 
1259 	/*
1260 	 *	Shutdown NET_DMA
1261 	 */
1262 	net_dmaengine_put();
1263 
1264 	return 0;
1265 }
1266 EXPORT_SYMBOL(dev_close);
1267 
1268 
1269 /**
1270  *	dev_disable_lro - disable Large Receive Offload on a device
1271  *	@dev: device
1272  *
1273  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1274  *	called under RTNL.  This is needed if received packets may be
1275  *	forwarded to another interface.
1276  */
1277 void dev_disable_lro(struct net_device *dev)
1278 {
1279 	if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1280 	    dev->ethtool_ops->set_flags) {
1281 		u32 flags = dev->ethtool_ops->get_flags(dev);
1282 		if (flags & ETH_FLAG_LRO) {
1283 			flags &= ~ETH_FLAG_LRO;
1284 			dev->ethtool_ops->set_flags(dev, flags);
1285 		}
1286 	}
1287 	WARN_ON(dev->features & NETIF_F_LRO);
1288 }
1289 EXPORT_SYMBOL(dev_disable_lro);
1290 
1291 
1292 static int dev_boot_phase = 1;
1293 
1294 /*
1295  *	Device change register/unregister. These are not inline or static
1296  *	as we export them to the world.
1297  */
1298 
1299 /**
1300  *	register_netdevice_notifier - register a network notifier block
1301  *	@nb: notifier
1302  *
1303  *	Register a notifier to be called when network device events occur.
1304  *	The notifier passed is linked into the kernel structures and must
1305  *	not be reused until it has been unregistered. A negative errno code
1306  *	is returned on a failure.
1307  *
1308  * 	When registered, all registration and up events are replayed
1309  *	to the new notifier to allow it to gain a race-free
1310  *	view of the network device list.
1311  */
1312 
1313 int register_netdevice_notifier(struct notifier_block *nb)
1314 {
1315 	struct net_device *dev;
1316 	struct net_device *last;
1317 	struct net *net;
1318 	int err;
1319 
1320 	rtnl_lock();
1321 	err = raw_notifier_chain_register(&netdev_chain, nb);
1322 	if (err)
1323 		goto unlock;
1324 	if (dev_boot_phase)
1325 		goto unlock;
1326 	for_each_net(net) {
1327 		for_each_netdev(net, dev) {
1328 			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1329 			err = notifier_to_errno(err);
1330 			if (err)
1331 				goto rollback;
1332 
1333 			if (!(dev->flags & IFF_UP))
1334 				continue;
1335 
1336 			nb->notifier_call(nb, NETDEV_UP, dev);
1337 		}
1338 	}
1339 
1340 unlock:
1341 	rtnl_unlock();
1342 	return err;
1343 
1344 rollback:
1345 	last = dev;
1346 	for_each_net(net) {
1347 		for_each_netdev(net, dev) {
1348 			if (dev == last)
1349 				break;
1350 
1351 			if (dev->flags & IFF_UP) {
1352 				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1353 				nb->notifier_call(nb, NETDEV_DOWN, dev);
1354 			}
1355 			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1356 			nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1357 		}
1358 	}
1359 
1360 	raw_notifier_chain_unregister(&netdev_chain, nb);
1361 	goto unlock;
1362 }
1363 EXPORT_SYMBOL(register_netdevice_notifier);
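
/*
 * Sketch of a notifier user (all names hypothetical). In this kernel the
 * callback receives the struct net_device pointer directly as the opaque
 * argument.
 *
 *	static int example_netdev_event(struct notifier_block *nb,
 *					unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UP)
 *			printk(KERN_INFO "%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block example_netdev_nb = {
 *		.notifier_call = example_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&example_netdev_nb);
 */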
1364 
1365 /**
1366  *	unregister_netdevice_notifier - unregister a network notifier block
1367  *	@nb: notifier
1368  *
1369  *	Unregister a notifier previously registered by
1370  *	register_netdevice_notifier(). The notifier is unlinked from the
1371  *	kernel structures and may then be reused. A negative errno code
1372  *	is returned on a failure.
1373  */
1374 
1375 int unregister_netdevice_notifier(struct notifier_block *nb)
1376 {
1377 	int err;
1378 
1379 	rtnl_lock();
1380 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1381 	rtnl_unlock();
1382 	return err;
1383 }
1384 EXPORT_SYMBOL(unregister_netdevice_notifier);
1385 
1386 /**
1387  *	call_netdevice_notifiers - call all network notifier blocks
1388  *      @val: value passed unmodified to notifier function
1389  *      @dev: net_device pointer passed unmodified to notifier function
1390  *
1391  *	Call all network notifier blocks.  Parameters and return value
1392  *	are as for raw_notifier_call_chain().
1393  */
1394 
1395 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1396 {
1397 	return raw_notifier_call_chain(&netdev_chain, val, dev);
1398 }
1399 
1400 /* When > 0 there are consumers of rx skb time stamps */
1401 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1402 
1403 void net_enable_timestamp(void)
1404 {
1405 	atomic_inc(&netstamp_needed);
1406 }
1407 EXPORT_SYMBOL(net_enable_timestamp);
1408 
1409 void net_disable_timestamp(void)
1410 {
1411 	atomic_dec(&netstamp_needed);
1412 }
1413 EXPORT_SYMBOL(net_disable_timestamp);
1414 
1415 static inline void net_timestamp(struct sk_buff *skb)
1416 {
1417 	if (atomic_read(&netstamp_needed))
1418 		__net_timestamp(skb);
1419 	else
1420 		skb->tstamp.tv64 = 0;
1421 }
1422 
1423 /**
1424  * dev_forward_skb - loopback an skb to another netif
1425  *
1426  * @dev: destination network device
1427  * @skb: buffer to forward
1428  *
1429  * return values:
1430  *	NET_RX_SUCCESS	(no congestion)
1431  *	NET_RX_DROP     (packet was dropped)
1432  *
1433  * dev_forward_skb can be used for injecting an skb from the
1434  * start_xmit function of one device into the receive queue
1435  * of another device.
1436  *
1437  * The receiving device may be in another namespace, so
1438  * we have to clear all information in the skb that could
1439  * impact namespace isolation.
1440  */
1441 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1442 {
1443 	skb_orphan(skb);
1444 
1445 	if (!(dev->flags & IFF_UP))
1446 		return NET_RX_DROP;
1447 
1448 	if (skb->len > (dev->mtu + dev->hard_header_len))
1449 		return NET_RX_DROP;
1450 
1451 	skb_set_dev(skb, dev);
1452 	skb->tstamp.tv64 = 0;
1453 	skb->pkt_type = PACKET_HOST;
1454 	skb->protocol = eth_type_trans(skb, dev);
1455 	return netif_rx(skb);
1456 }
1457 EXPORT_SYMBOL_GPL(dev_forward_skb);
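
/*
 * Sketch of the intended use described above: a pair-style device (names
 * and private layout are hypothetical) hands frames from its start_xmit
 * directly to the peer's receive path.
 *
 *	struct example_pair_priv {
 *		struct net_device *peer;
 *	};
 *
 *	static netdev_tx_t example_pair_xmit(struct sk_buff *skb,
 *					     struct net_device *dev)
 *	{
 *		struct example_pair_priv *priv = netdev_priv(dev);
 *
 *		dev_forward_skb(priv->peer, skb);
 *		return NETDEV_TX_OK;
 *	}
 */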
1458 
1459 /*
1460  *	Support routine. Sends outgoing frames to any network
1461  *	taps currently in use.
1462  */
1463 
1464 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1465 {
1466 	struct packet_type *ptype;
1467 
1468 #ifdef CONFIG_NET_CLS_ACT
1469 	if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1470 		net_timestamp(skb);
1471 #else
1472 	net_timestamp(skb);
1473 #endif
1474 
1475 	rcu_read_lock();
1476 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
1477 		/* Never send packets back to the socket
1478 		 * they originated from - MvS (miquels@drinkel.ow.org)
1479 		 */
1480 		if ((ptype->dev == dev || !ptype->dev) &&
1481 		    (ptype->af_packet_priv == NULL ||
1482 		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
1483 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1484 			if (!skb2)
1485 				break;
1486 
1487 			/* skb->nh should be correctly
1488 			   set by the sender, so the check below is
1489 			   just protection against buggy protocols.
1490 			 */
1491 			skb_reset_mac_header(skb2);
1492 
1493 			if (skb_network_header(skb2) < skb2->data ||
1494 			    skb2->network_header > skb2->tail) {
1495 				if (net_ratelimit())
1496 					printk(KERN_CRIT "protocol %04x is "
1497 					       "buggy, dev %s\n",
1498 					       skb2->protocol, dev->name);
1499 				skb_reset_network_header(skb2);
1500 			}
1501 
1502 			skb2->transport_header = skb2->network_header;
1503 			skb2->pkt_type = PACKET_OUTGOING;
1504 			ptype->func(skb2, skb->dev, ptype, skb->dev);
1505 		}
1506 	}
1507 	rcu_read_unlock();
1508 }
1509 
1510 
1511 static inline void __netif_reschedule(struct Qdisc *q)
1512 {
1513 	struct softnet_data *sd;
1514 	unsigned long flags;
1515 
1516 	local_irq_save(flags);
1517 	sd = &__get_cpu_var(softnet_data);
1518 	q->next_sched = sd->output_queue;
1519 	sd->output_queue = q;
1520 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
1521 	local_irq_restore(flags);
1522 }
1523 
1524 void __netif_schedule(struct Qdisc *q)
1525 {
1526 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1527 		__netif_reschedule(q);
1528 }
1529 EXPORT_SYMBOL(__netif_schedule);
1530 
1531 void dev_kfree_skb_irq(struct sk_buff *skb)
1532 {
1533 	if (atomic_dec_and_test(&skb->users)) {
1534 		struct softnet_data *sd;
1535 		unsigned long flags;
1536 
1537 		local_irq_save(flags);
1538 		sd = &__get_cpu_var(softnet_data);
1539 		skb->next = sd->completion_queue;
1540 		sd->completion_queue = skb;
1541 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
1542 		local_irq_restore(flags);
1543 	}
1544 }
1545 EXPORT_SYMBOL(dev_kfree_skb_irq);
1546 
1547 void dev_kfree_skb_any(struct sk_buff *skb)
1548 {
1549 	if (in_irq() || irqs_disabled())
1550 		dev_kfree_skb_irq(skb);
1551 	else
1552 		dev_kfree_skb(skb);
1553 }
1554 EXPORT_SYMBOL(dev_kfree_skb_any);
1555 
1556 
1557 /**
1558  * netif_device_detach - mark device as removed
1559  * @dev: network device
1560  *
1561  * Mark the device as removed from the system and therefore no longer available.
1562  */
1563 void netif_device_detach(struct net_device *dev)
1564 {
1565 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1566 	    netif_running(dev)) {
1567 		netif_tx_stop_all_queues(dev);
1568 	}
1569 }
1570 EXPORT_SYMBOL(netif_device_detach);
1571 
1572 /**
1573  * netif_device_attach - mark device as attached
1574  * @dev: network device
1575  *
1576  * Mark the device as attached to the system and restart it if needed.
1577  */
1578 void netif_device_attach(struct net_device *dev)
1579 {
1580 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1581 	    netif_running(dev)) {
1582 		netif_tx_wake_all_queues(dev);
1583 		__netdev_watchdog_up(dev);
1584 	}
1585 }
1586 EXPORT_SYMBOL(netif_device_attach);
1587 
1588 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1589 {
1590 	return ((features & NETIF_F_GEN_CSUM) ||
1591 		((features & NETIF_F_IP_CSUM) &&
1592 		 protocol == htons(ETH_P_IP)) ||
1593 		((features & NETIF_F_IPV6_CSUM) &&
1594 		 protocol == htons(ETH_P_IPV6)) ||
1595 		((features & NETIF_F_FCOE_CRC) &&
1596 		 protocol == htons(ETH_P_FCOE)));
1597 }
1598 
1599 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1600 {
1601 	if (can_checksum_protocol(dev->features, skb->protocol))
1602 		return true;
1603 
1604 	if (skb->protocol == htons(ETH_P_8021Q)) {
1605 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1606 		if (can_checksum_protocol(dev->features & dev->vlan_features,
1607 					  veh->h_vlan_encapsulated_proto))
1608 			return true;
1609 	}
1610 
1611 	return false;
1612 }
1613 
1614 /**
1615  * skb_set_dev - assign a new device to a buffer
1616  * @skb: buffer for the new device
1617  * @dev: network device
1618  *
1619  * If an skb is owned by a device already, we have to reset
1620  * all data private to the namespace the device belongs to
1621  * before assigning it a new device.
1622  */
1623 #ifdef CONFIG_NET_NS
1624 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1625 {
1626 	skb_dst_drop(skb);
1627 	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1628 		secpath_reset(skb);
1629 		nf_reset(skb);
1630 		skb_init_secmark(skb);
1631 		skb->mark = 0;
1632 		skb->priority = 0;
1633 		skb->nf_trace = 0;
1634 		skb->ipvs_property = 0;
1635 #ifdef CONFIG_NET_SCHED
1636 		skb->tc_index = 0;
1637 #endif
1638 	}
1639 	skb->dev = dev;
1640 }
1641 EXPORT_SYMBOL(skb_set_dev);
1642 #endif /* CONFIG_NET_NS */
1643 
1644 /*
1645  * Invalidate hardware checksum when packet is to be mangled, and
1646  * complete checksum manually on outgoing path.
1647  */
1648 int skb_checksum_help(struct sk_buff *skb)
1649 {
1650 	__wsum csum;
1651 	int ret = 0, offset;
1652 
1653 	if (skb->ip_summed == CHECKSUM_COMPLETE)
1654 		goto out_set_summed;
1655 
1656 	if (unlikely(skb_shinfo(skb)->gso_size)) {
1657 		/* Let GSO fix up the checksum. */
1658 		goto out_set_summed;
1659 	}
1660 
1661 	offset = skb->csum_start - skb_headroom(skb);
1662 	BUG_ON(offset >= skb_headlen(skb));
1663 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
1664 
1665 	offset += skb->csum_offset;
1666 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1667 
1668 	if (skb_cloned(skb) &&
1669 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1670 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1671 		if (ret)
1672 			goto out;
1673 	}
1674 
1675 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
1676 out_set_summed:
1677 	skb->ip_summed = CHECKSUM_NONE;
1678 out:
1679 	return ret;
1680 }
1681 EXPORT_SYMBOL(skb_checksum_help);
1682 
1683 /**
1684  *	skb_gso_segment - Perform segmentation on skb.
1685  *	@skb: buffer to segment
1686  *	@features: features for the output path (see dev->features)
1687  *
1688  *	This function segments the given skb and returns a list of segments.
1689  *
1690  *	It may return NULL if the skb requires no segmentation.  This is
1691  *	only possible when GSO is used for verifying header integrity.
1692  */
1693 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1694 {
1695 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1696 	struct packet_type *ptype;
1697 	__be16 type = skb->protocol;
1698 	int err;
1699 
1700 	skb_reset_mac_header(skb);
1701 	skb->mac_len = skb->network_header - skb->mac_header;
1702 	__skb_pull(skb, skb->mac_len);
1703 
1704 	if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1705 		struct net_device *dev = skb->dev;
1706 		struct ethtool_drvinfo info = {};
1707 
1708 		if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1709 			dev->ethtool_ops->get_drvinfo(dev, &info);
1710 
1711 		WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1712 			"ip_summed=%d",
1713 		     info.driver, dev ? dev->features : 0L,
1714 		     skb->sk ? skb->sk->sk_route_caps : 0L,
1715 		     skb->len, skb->data_len, skb->ip_summed);
1716 
1717 		if (skb_header_cloned(skb) &&
1718 		    (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1719 			return ERR_PTR(err);
1720 	}
1721 
1722 	rcu_read_lock();
1723 	list_for_each_entry_rcu(ptype,
1724 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1725 		if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1726 			if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1727 				err = ptype->gso_send_check(skb);
1728 				segs = ERR_PTR(err);
1729 				if (err || skb_gso_ok(skb, features))
1730 					break;
1731 				__skb_push(skb, (skb->data -
1732 						 skb_network_header(skb)));
1733 			}
1734 			segs = ptype->gso_segment(skb, features);
1735 			break;
1736 		}
1737 	}
1738 	rcu_read_unlock();
1739 
1740 	__skb_push(skb, skb->data - skb_mac_header(skb));
1741 
1742 	return segs;
1743 }
1744 EXPORT_SYMBOL(skb_gso_segment);
1745 
1746 /* Take action when hardware reception checksum errors are detected. */
1747 #ifdef CONFIG_BUG
1748 void netdev_rx_csum_fault(struct net_device *dev)
1749 {
1750 	if (net_ratelimit()) {
1751 		printk(KERN_ERR "%s: hw csum failure.\n",
1752 			dev ? dev->name : "<unknown>");
1753 		dump_stack();
1754 	}
1755 }
1756 EXPORT_SYMBOL(netdev_rx_csum_fault);
1757 #endif
1758 
1759 /* Actually, we should eliminate this check as soon as we know that:
1760  * 1. An IOMMU is present and is able to map all the memory.
1761  * 2. No high memory really exists on this machine.
1762  */
1763 
1764 static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1765 {
1766 #ifdef CONFIG_HIGHMEM
1767 	int i;
1768 
1769 	if (dev->features & NETIF_F_HIGHDMA)
1770 		return 0;
1771 
1772 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1773 		if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1774 			return 1;
1775 
1776 #endif
1777 	return 0;
1778 }
1779 
1780 struct dev_gso_cb {
1781 	void (*destructor)(struct sk_buff *skb);
1782 };
1783 
1784 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1785 
1786 static void dev_gso_skb_destructor(struct sk_buff *skb)
1787 {
1788 	struct dev_gso_cb *cb;
1789 
1790 	do {
1791 		struct sk_buff *nskb = skb->next;
1792 
1793 		skb->next = nskb->next;
1794 		nskb->next = NULL;
1795 		kfree_skb(nskb);
1796 	} while (skb->next);
1797 
1798 	cb = DEV_GSO_CB(skb);
1799 	if (cb->destructor)
1800 		cb->destructor(skb);
1801 }
1802 
1803 /**
1804  *	dev_gso_segment - Perform emulated hardware segmentation on skb.
1805  *	@skb: buffer to segment
1806  *
1807  *	This function segments the given skb and stores the list of segments
1808  *	in skb->next.
1809  */
1810 static int dev_gso_segment(struct sk_buff *skb)
1811 {
1812 	struct net_device *dev = skb->dev;
1813 	struct sk_buff *segs;
1814 	int features = dev->features & ~(illegal_highdma(dev, skb) ?
1815 					 NETIF_F_SG : 0);
1816 
1817 	segs = skb_gso_segment(skb, features);
1818 
1819 	/* Verifying header integrity only. */
1820 	if (!segs)
1821 		return 0;
1822 
1823 	if (IS_ERR(segs))
1824 		return PTR_ERR(segs);
1825 
1826 	skb->next = segs;
1827 	DEV_GSO_CB(skb)->destructor = skb->destructor;
1828 	skb->destructor = dev_gso_skb_destructor;
1829 
1830 	return 0;
1831 }
1832 
1833 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1834 			struct netdev_queue *txq)
1835 {
1836 	const struct net_device_ops *ops = dev->netdev_ops;
1837 	int rc = NETDEV_TX_OK;
1838 
1839 	if (likely(!skb->next)) {
1840 		if (!list_empty(&ptype_all))
1841 			dev_queue_xmit_nit(skb, dev);
1842 
1843 		if (netif_needs_gso(dev, skb)) {
1844 			if (unlikely(dev_gso_segment(skb)))
1845 				goto out_kfree_skb;
1846 			if (skb->next)
1847 				goto gso;
1848 		}
1849 
1850 		/*
1851 		 * If the device doesn't need skb->dst, release it right now while
1852 		 * it's hot in this CPU's cache
1853 		 */
1854 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1855 			skb_dst_drop(skb);
1856 
1857 		rc = ops->ndo_start_xmit(skb, dev);
1858 		if (rc == NETDEV_TX_OK)
1859 			txq_trans_update(txq);
1860 		/*
1861 		 * TODO: if skb_orphan() was called by
1862 		 * dev->hard_start_xmit() (for example, the unmodified
1863 		 * igb driver does that; bnx2 doesn't), then
1864 		 * skb_tx_software_timestamp() will be unable to send
1865 		 * back the time stamp.
1866 		 *
1867 		 * How can this be prevented? Always create another
1868 		 * reference to the socket before calling
1869 		 * dev->hard_start_xmit()? Prevent that skb_orphan()
1870 		 * does anything in dev->hard_start_xmit() by clearing
1871 		 * the skb destructor before the call and restoring it
1872 		 * afterwards, then doing the skb_orphan() ourselves?
1873 		 */
1874 		return rc;
1875 	}
1876 
1877 gso:
1878 	do {
1879 		struct sk_buff *nskb = skb->next;
1880 
1881 		skb->next = nskb->next;
1882 		nskb->next = NULL;
1883 
1884 		/*
1885 		 * If the device doesn't need nskb->dst, release it right now while
1886 		 * it's hot in this CPU's cache
1887 		 */
1888 		if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1889 			skb_dst_drop(nskb);
1890 
1891 		rc = ops->ndo_start_xmit(nskb, dev);
1892 		if (unlikely(rc != NETDEV_TX_OK)) {
1893 			if (rc & ~NETDEV_TX_MASK)
1894 				goto out_kfree_gso_skb;
1895 			nskb->next = skb->next;
1896 			skb->next = nskb;
1897 			return rc;
1898 		}
1899 		txq_trans_update(txq);
1900 		if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
1901 			return NETDEV_TX_BUSY;
1902 	} while (skb->next);
1903 
1904 out_kfree_gso_skb:
1905 	if (likely(skb->next == NULL))
1906 		skb->destructor = DEV_GSO_CB(skb)->destructor;
1907 out_kfree_skb:
1908 	kfree_skb(skb);
1909 	return rc;
1910 }
1911 
1912 static u32 skb_tx_hashrnd;
1913 
1914 u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1915 {
1916 	u32 hash;
1917 
1918 	if (skb_rx_queue_recorded(skb)) {
1919 		hash = skb_get_rx_queue(skb);
1920 		while (unlikely(hash >= dev->real_num_tx_queues))
1921 			hash -= dev->real_num_tx_queues;
1922 		return hash;
1923 	}
1924 
1925 	if (skb->sk && skb->sk->sk_hash)
1926 		hash = skb->sk->sk_hash;
1927 	else
1928 		hash = skb->protocol;
1929 
1930 	hash = jhash_1word(hash, skb_tx_hashrnd);
1931 
1932 	return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1933 }
1934 EXPORT_SYMBOL(skb_tx_hash);
1935 
1936 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
1937 {
1938 	if (unlikely(queue_index >= dev->real_num_tx_queues)) {
1939 		if (net_ratelimit()) {
1940 			WARN(1, "%s selects TX queue %d, but "
1941 			     "real number of TX queues is %d\n",
1942 			     dev->name, queue_index,
1943 			     dev->real_num_tx_queues);
1944 		}
1945 		return 0;
1946 	}
1947 	return queue_index;
1948 }
1949 
1950 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1951 					struct sk_buff *skb)
1952 {
1953 	u16 queue_index;
1954 	struct sock *sk = skb->sk;
1955 
1956 	if (sk_tx_queue_recorded(sk)) {
1957 		queue_index = sk_tx_queue_get(sk);
1958 	} else {
1959 		const struct net_device_ops *ops = dev->netdev_ops;
1960 
1961 		if (ops->ndo_select_queue) {
1962 			queue_index = ops->ndo_select_queue(dev, skb);
1963 			queue_index = dev_cap_txqueue(dev, queue_index);
1964 		} else {
1965 			queue_index = 0;
1966 			if (dev->real_num_tx_queues > 1)
1967 				queue_index = skb_tx_hash(dev, skb);
1968 
1969 			if (sk && sk->sk_dst_cache)
1970 				sk_tx_queue_set(sk, queue_index);
1971 		}
1972 	}
1973 
1974 	skb_set_queue_mapping(skb, queue_index);
1975 	return netdev_get_tx_queue(dev, queue_index);
1976 }
1977 
1978 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
1979 				 struct net_device *dev,
1980 				 struct netdev_queue *txq)
1981 {
1982 	spinlock_t *root_lock = qdisc_lock(q);
1983 	int rc;
1984 
1985 	spin_lock(root_lock);
1986 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
1987 		kfree_skb(skb);
1988 		rc = NET_XMIT_DROP;
1989 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
1990 		   !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
1991 		/*
1992 		 * This is a work-conserving queue; there are no old skbs
1993 		 * waiting to be sent out; and the qdisc is not running -
1994 		 * xmit the skb directly.
1995 		 */
1996 		__qdisc_update_bstats(q, skb->len);
1997 		if (sch_direct_xmit(skb, q, dev, txq, root_lock))
1998 			__qdisc_run(q);
1999 		else
2000 			clear_bit(__QDISC_STATE_RUNNING, &q->state);
2001 
2002 		rc = NET_XMIT_SUCCESS;
2003 	} else {
2004 		rc = qdisc_enqueue_root(skb, q);
2005 		qdisc_run(q);
2006 	}
2007 	spin_unlock(root_lock);
2008 
2009 	return rc;
2010 }
2011 
2012 /*
2013  * Returns true if either:
2014  *	1. skb has frag_list and the device doesn't support FRAGLIST, or
2015  *	2. skb is fragmented and the device does not support SG, or if
2016  *	   at least one of the fragments is in highmem and the device
2017  *	   does not support DMA from it.
2018  */
2019 static inline int skb_needs_linearize(struct sk_buff *skb,
2020 				      struct net_device *dev)
2021 {
2022 	return (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
2023 	       (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
2024 					      illegal_highdma(dev, skb)));
2025 }
2026 
2027 /**
2028  *	dev_queue_xmit - transmit a buffer
2029  *	@skb: buffer to transmit
2030  *
2031  *	Queue a buffer for transmission to a network device. The caller must
2032  *	have set the device and priority and built the buffer before calling
2033  *	this function. The function can be called from an interrupt.
2034  *
2035  *	A negative errno code is returned on a failure. A success does not
2036  *	guarantee the frame will be transmitted as it may be dropped due
2037  *	to congestion or traffic shaping.
2038  *
2039  * -----------------------------------------------------------------------------------
2040  *      I notice this method can also return errors from the queue disciplines,
2041  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2042  *      be positive.
2043  *
2044  *      Regardless of the return value, the skb is consumed, so it is currently
2045  *      difficult to retry a send to this method.  (You can bump the ref count
2046  *      before sending to hold a reference for retry if you are careful.)
2047  *
2048  *      When calling this method, interrupts MUST be enabled.  This is because
2049  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2050  *          --BLG
2051  */
2052 int dev_queue_xmit(struct sk_buff *skb)
2053 {
2054 	struct net_device *dev = skb->dev;
2055 	struct netdev_queue *txq;
2056 	struct Qdisc *q;
2057 	int rc = -ENOMEM;
2058 
2059 	/* GSO will handle the following emulations directly. */
2060 	if (netif_needs_gso(dev, skb))
2061 		goto gso;
2062 
2063 	/* Convert a paged skb to linear, if required */
2064 	if (skb_needs_linearize(skb, dev) && __skb_linearize(skb))
2065 		goto out_kfree_skb;
2066 
2067 	/* If packet is not checksummed and device does not support
2068 	 * checksumming for this protocol, complete checksumming here.
2069 	 */
2070 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
2071 		skb_set_transport_header(skb, skb->csum_start -
2072 					      skb_headroom(skb));
2073 		if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
2074 			goto out_kfree_skb;
2075 	}
2076 
2077 gso:
2078 	/* Disable soft irqs for various locks below. Also
2079 	 * stops preemption for RCU.
2080 	 */
2081 	rcu_read_lock_bh();
2082 
2083 	txq = dev_pick_tx(dev, skb);
2084 	q = rcu_dereference(txq->qdisc);
2085 
2086 #ifdef CONFIG_NET_CLS_ACT
2087 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2088 #endif
2089 	if (q->enqueue) {
2090 		rc = __dev_xmit_skb(skb, q, dev, txq);
2091 		goto out;
2092 	}
2093 
2094 	/* The device has no queue. Common case for software devices:
2095 	   loopback, all sorts of tunnels...
2096 
2097 	   Really, it is unlikely that netif_tx_lock protection is necessary
2098 	   here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
2099 	   counters.)
2100 	   However, it is possible that they rely on the protection
2101 	   made by us here.
2102 
2103 	   Check this and shoot the lock. It is not prone to deadlocks.
2104 	   Or shoot the noqueue qdisc instead; it is even simpler 8)
2105 	 */
2106 	if (dev->flags & IFF_UP) {
2107 		int cpu = smp_processor_id(); /* ok because BHs are off */
2108 
2109 		if (txq->xmit_lock_owner != cpu) {
2110 
2111 			HARD_TX_LOCK(dev, txq, cpu);
2112 
2113 			if (!netif_tx_queue_stopped(txq)) {
2114 				rc = dev_hard_start_xmit(skb, dev, txq);
2115 				if (dev_xmit_complete(rc)) {
2116 					HARD_TX_UNLOCK(dev, txq);
2117 					goto out;
2118 				}
2119 			}
2120 			HARD_TX_UNLOCK(dev, txq);
2121 			if (net_ratelimit())
2122 				printk(KERN_CRIT "Virtual device %s asks to "
2123 				       "queue packet!\n", dev->name);
2124 		} else {
2125 			/* Recursion is detected! It is possible,
2126 			 * unfortunately */
2127 			if (net_ratelimit())
2128 				printk(KERN_CRIT "Dead loop on virtual device "
2129 				       "%s, fix it urgently!\n", dev->name);
2130 		}
2131 	}
2132 
2133 	rc = -ENETDOWN;
2134 	rcu_read_unlock_bh();
2135 
2136 out_kfree_skb:
2137 	kfree_skb(skb);
2138 	return rc;
2139 out:
2140 	rcu_read_unlock_bh();
2141 	return rc;
2142 }
2143 EXPORT_SYMBOL(dev_queue_xmit);
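
/*
 * Illustrative sketch (editorial example, not part of the original file):
 * a caller builds the skb completely, sets skb->dev and the priority, and
 * then hands it off.  The skb is consumed on both success and failure, so
 * it must not be freed or reused afterwards.  "dev", "skb" and "rc" are
 * assumed to exist in the caller, and the stats bump is just a stand-in:
 *
 *	skb->dev = dev;
 *	skb->priority = TC_PRIO_CONTROL;
 *	rc = dev_queue_xmit(skb);
 *	if (rc != NET_XMIT_SUCCESS)
 *		dev->stats.tx_dropped++;	(the skb was still consumed)
 */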
2144 
2145 
2146 /*=======================================================================
2147 			Receiver routines
2148   =======================================================================*/
2149 
2150 int netdev_max_backlog __read_mostly = 1000;
2151 int netdev_budget __read_mostly = 300;
2152 int weight_p __read_mostly = 64;            /* old backlog weight */
2153 
2154 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
2155 
2156 
2157 /**
2158  *	netif_rx	-	post buffer to the network code
2159  *	@skb: buffer to post
2160  *
2161  *	This function receives a packet from a device driver and queues it for
2162  *	the upper (protocol) levels to process.  It always succeeds. The buffer
2163  *	may be dropped during processing for congestion control or by the
2164  *	protocol layers.
2165  *
2166  *	return values:
2167  *	NET_RX_SUCCESS	(no congestion)
2168  *	NET_RX_DROP     (packet was dropped)
2169  *
2170  */
2171 
2172 int netif_rx(struct sk_buff *skb)
2173 {
2174 	struct softnet_data *queue;
2175 	unsigned long flags;
2176 
2177 	/* if netpoll wants it, pretend we never saw it */
2178 	if (netpoll_rx(skb))
2179 		return NET_RX_DROP;
2180 
2181 	if (!skb->tstamp.tv64)
2182 		net_timestamp(skb);
2183 
2184 	/*
2185 	 * The code is arranged so that the path is as short as
2186 	 * possible when the CPU is congested but still operating.
2187 	 */
2188 	local_irq_save(flags);
2189 	queue = &__get_cpu_var(softnet_data);
2190 
2191 	__get_cpu_var(netdev_rx_stat).total++;
2192 	if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
2193 		if (queue->input_pkt_queue.qlen) {
2194 enqueue:
2195 			__skb_queue_tail(&queue->input_pkt_queue, skb);
2196 			local_irq_restore(flags);
2197 			return NET_RX_SUCCESS;
2198 		}
2199 
2200 		napi_schedule(&queue->backlog);
2201 		goto enqueue;
2202 	}
2203 
2204 	__get_cpu_var(netdev_rx_stat).dropped++;
2205 	local_irq_restore(flags);
2206 
2207 	kfree_skb(skb);
2208 	return NET_RX_DROP;
2209 }
2210 EXPORT_SYMBOL(netif_rx);
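
/*
 * Illustrative sketch (editorial example): a non-NAPI driver's receive
 * interrupt typically hands each completed frame to netif_rx() once the
 * link-level protocol has been determined.  "skb", "dev" and "pkt_len"
 * come from the driver's own RX ring handling and are assumptions here:
 *
 *	skb_put(skb, pkt_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */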
2211 
2212 int netif_rx_ni(struct sk_buff *skb)
2213 {
2214 	int err;
2215 
2216 	preempt_disable();
2217 	err = netif_rx(skb);
2218 	if (local_softirq_pending())
2219 		do_softirq();
2220 	preempt_enable();
2221 
2222 	return err;
2223 }
2224 EXPORT_SYMBOL(netif_rx_ni);
2225 
2226 static void net_tx_action(struct softirq_action *h)
2227 {
2228 	struct softnet_data *sd = &__get_cpu_var(softnet_data);
2229 
2230 	if (sd->completion_queue) {
2231 		struct sk_buff *clist;
2232 
2233 		local_irq_disable();
2234 		clist = sd->completion_queue;
2235 		sd->completion_queue = NULL;
2236 		local_irq_enable();
2237 
2238 		while (clist) {
2239 			struct sk_buff *skb = clist;
2240 			clist = clist->next;
2241 
2242 			WARN_ON(atomic_read(&skb->users));
2243 			__kfree_skb(skb);
2244 		}
2245 	}
2246 
2247 	if (sd->output_queue) {
2248 		struct Qdisc *head;
2249 
2250 		local_irq_disable();
2251 		head = sd->output_queue;
2252 		sd->output_queue = NULL;
2253 		local_irq_enable();
2254 
2255 		while (head) {
2256 			struct Qdisc *q = head;
2257 			spinlock_t *root_lock;
2258 
2259 			head = head->next_sched;
2260 
2261 			root_lock = qdisc_lock(q);
2262 			if (spin_trylock(root_lock)) {
2263 				smp_mb__before_clear_bit();
2264 				clear_bit(__QDISC_STATE_SCHED,
2265 					  &q->state);
2266 				qdisc_run(q);
2267 				spin_unlock(root_lock);
2268 			} else {
2269 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
2270 					      &q->state)) {
2271 					__netif_reschedule(q);
2272 				} else {
2273 					smp_mb__before_clear_bit();
2274 					clear_bit(__QDISC_STATE_SCHED,
2275 						  &q->state);
2276 				}
2277 			}
2278 		}
2279 	}
2280 }
2281 
2282 static inline int deliver_skb(struct sk_buff *skb,
2283 			      struct packet_type *pt_prev,
2284 			      struct net_device *orig_dev)
2285 {
2286 	atomic_inc(&skb->users);
2287 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2288 }
2289 
2290 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
2291 
2292 #if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
2293 /* This hook is defined here for ATM LANE */
2294 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2295 			     unsigned char *addr) __read_mostly;
2296 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2297 #endif
2298 
2299 /*
2300  * If the bridge module is loaded, call the bridging hook.
2301  * Returns NULL if the packet was consumed.
2302  */
2303 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2304 					struct sk_buff *skb) __read_mostly;
2305 EXPORT_SYMBOL_GPL(br_handle_frame_hook);
2306 
2307 static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2308 					    struct packet_type **pt_prev, int *ret,
2309 					    struct net_device *orig_dev)
2310 {
2311 	struct net_bridge_port *port;
2312 
2313 	if (skb->pkt_type == PACKET_LOOPBACK ||
2314 	    (port = rcu_dereference(skb->dev->br_port)) == NULL)
2315 		return skb;
2316 
2317 	if (*pt_prev) {
2318 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2319 		*pt_prev = NULL;
2320 	}
2321 
2322 	return br_handle_frame_hook(port, skb);
2323 }
2324 #else
2325 #define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
2326 #endif
2327 
2328 #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2329 struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
2330 EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2331 
2332 static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2333 					     struct packet_type **pt_prev,
2334 					     int *ret,
2335 					     struct net_device *orig_dev)
2336 {
2337 	if (skb->dev->macvlan_port == NULL)
2338 		return skb;
2339 
2340 	if (*pt_prev) {
2341 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2342 		*pt_prev = NULL;
2343 	}
2344 	return macvlan_handle_frame_hook(skb);
2345 }
2346 #else
2347 #define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
2348 #endif
2349 
2350 #ifdef CONFIG_NET_CLS_ACT
2351 /* TODO: Maybe we should just force sch_ingress to be compiled in
2352  * whenever CONFIG_NET_CLS_ACT is. Otherwise we pay for a few useless
2353  * instructions (a compare and two extra stores) when ingress is not
2354  * configured in but CONFIG_NET_CLS_ACT is.
2355  * NOTE: This doesn't remove any functionality; if you don't have
2356  * the ingress scheduler, you just can't add policies on ingress.
2357  *
2358  */
2359 static int ing_filter(struct sk_buff *skb)
2360 {
2361 	struct net_device *dev = skb->dev;
2362 	u32 ttl = G_TC_RTTL(skb->tc_verd);
2363 	struct netdev_queue *rxq;
2364 	int result = TC_ACT_OK;
2365 	struct Qdisc *q;
2366 
2367 	if (MAX_RED_LOOP < ttl++) {
2368 		printk(KERN_WARNING
2369 		       "Redir loop detected, dropping packet (%d->%d)\n",
2370 		       skb->skb_iif, dev->ifindex);
2371 		return TC_ACT_SHOT;
2372 	}
2373 
2374 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2375 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2376 
2377 	rxq = &dev->rx_queue;
2378 
2379 	q = rxq->qdisc;
2380 	if (q != &noop_qdisc) {
2381 		spin_lock(qdisc_lock(q));
2382 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2383 			result = qdisc_enqueue_root(skb, q);
2384 		spin_unlock(qdisc_lock(q));
2385 	}
2386 
2387 	return result;
2388 }
2389 
2390 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2391 					 struct packet_type **pt_prev,
2392 					 int *ret, struct net_device *orig_dev)
2393 {
2394 	if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2395 		goto out;
2396 
2397 	if (*pt_prev) {
2398 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
2399 		*pt_prev = NULL;
2400 	} else {
2401 		/* Huh? Why does turning on AF_PACKET affect this? */
2402 		skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2403 	}
2404 
2405 	switch (ing_filter(skb)) {
2406 	case TC_ACT_SHOT:
2407 	case TC_ACT_STOLEN:
2408 		kfree_skb(skb);
2409 		return NULL;
2410 	}
2411 
2412 out:
2413 	skb->tc_verd = 0;
2414 	return skb;
2415 }
2416 #endif
2417 
2418 /*
2419  * 	netif_nit_deliver - deliver received packets to network taps
2420  * 	@skb: buffer
2421  *
2422  * 	This function is used to deliver incoming packets to network
2423  * 	taps. It should be used when the normal netif_receive_skb path
2424  * 	is bypassed, for example because of VLAN acceleration.
2425  */
2426 void netif_nit_deliver(struct sk_buff *skb)
2427 {
2428 	struct packet_type *ptype;
2429 
2430 	if (list_empty(&ptype_all))
2431 		return;
2432 
2433 	skb_reset_network_header(skb);
2434 	skb_reset_transport_header(skb);
2435 	skb->mac_len = skb->network_header - skb->mac_header;
2436 
2437 	rcu_read_lock();
2438 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2439 		if (!ptype->dev || ptype->dev == skb->dev)
2440 			deliver_skb(skb, ptype, skb->dev);
2441 	}
2442 	rcu_read_unlock();
2443 }
2444 
2445 /**
2446  *	netif_receive_skb - process receive buffer from network
2447  *	@skb: buffer to process
2448  *
2449  *	netif_receive_skb() is the main receive data processing function.
2450  *	It always succeeds. The buffer may be dropped during processing
2451  *	for congestion control or by the protocol layers.
2452  *
2453  *	This function may only be called from softirq context and interrupts
2454  *	should be enabled.
2455  *
2456  *	Return values (usually ignored):
2457  *	NET_RX_SUCCESS: no congestion
2458  *	NET_RX_DROP: packet was dropped
2459  */
2460 int netif_receive_skb(struct sk_buff *skb)
2461 {
2462 	struct packet_type *ptype, *pt_prev;
2463 	struct net_device *orig_dev;
2464 	struct net_device *null_or_orig;
2465 	struct net_device *null_or_bond;
2466 	int ret = NET_RX_DROP;
2467 	__be16 type;
2468 
2469 	if (!skb->tstamp.tv64)
2470 		net_timestamp(skb);
2471 
2472 	if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
2473 		return NET_RX_SUCCESS;
2474 
2475 	/* if we've gotten here through NAPI, check netpoll */
2476 	if (netpoll_receive_skb(skb))
2477 		return NET_RX_DROP;
2478 
2479 	if (!skb->skb_iif)
2480 		skb->skb_iif = skb->dev->ifindex;
2481 
2482 	null_or_orig = NULL;
2483 	orig_dev = skb->dev;
2484 	if (orig_dev->master) {
2485 		if (skb_bond_should_drop(skb))
2486 			null_or_orig = orig_dev; /* deliver only exact match */
2487 		else
2488 			skb->dev = orig_dev->master;
2489 	}
2490 
2491 	__get_cpu_var(netdev_rx_stat).total++;
2492 
2493 	skb_reset_network_header(skb);
2494 	skb_reset_transport_header(skb);
2495 	skb->mac_len = skb->network_header - skb->mac_header;
2496 
2497 	pt_prev = NULL;
2498 
2499 	rcu_read_lock();
2500 
2501 #ifdef CONFIG_NET_CLS_ACT
2502 	if (skb->tc_verd & TC_NCLS) {
2503 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2504 		goto ncls;
2505 	}
2506 #endif
2507 
2508 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
2509 		if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2510 		    ptype->dev == orig_dev) {
2511 			if (pt_prev)
2512 				ret = deliver_skb(skb, pt_prev, orig_dev);
2513 			pt_prev = ptype;
2514 		}
2515 	}
2516 
2517 #ifdef CONFIG_NET_CLS_ACT
2518 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2519 	if (!skb)
2520 		goto out;
2521 ncls:
2522 #endif
2523 
2524 	skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2525 	if (!skb)
2526 		goto out;
2527 	skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2528 	if (!skb)
2529 		goto out;
2530 
2531 	/*
2532 	 * Make sure frames received on VLAN interfaces stacked on
2533 	 * bonding interfaces still make their way to any base bonding
2534 	 * device that may have registered for a specific ptype.  The
2535 	 * handler may have to adjust skb->dev and orig_dev.
2536 	 */
2537 	null_or_bond = NULL;
2538 	if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
2539 	    (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
2540 		null_or_bond = vlan_dev_real_dev(skb->dev);
2541 	}
2542 
2543 	type = skb->protocol;
2544 	list_for_each_entry_rcu(ptype,
2545 			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2546 		if (ptype->type == type && (ptype->dev == null_or_orig ||
2547 		     ptype->dev == skb->dev || ptype->dev == orig_dev ||
2548 		     ptype->dev == null_or_bond)) {
2549 			if (pt_prev)
2550 				ret = deliver_skb(skb, pt_prev, orig_dev);
2551 			pt_prev = ptype;
2552 		}
2553 	}
2554 
2555 	if (pt_prev) {
2556 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2557 	} else {
2558 		kfree_skb(skb);
2559 		/* Jamal, now you will not be able to escape explaining
2560 		 * to me how you were going to use this. :-)
2561 		 */
2562 		ret = NET_RX_DROP;
2563 	}
2564 
2565 out:
2566 	rcu_read_unlock();
2567 	return ret;
2568 }
2569 EXPORT_SYMBOL(netif_receive_skb);
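
/*
 * Illustrative sketch (editorial example): a NAPI driver delivers frames
 * from its ->poll() handler, i.e. from softirq context with interrupts
 * enabled, which is exactly what netif_receive_skb() requires; "skb" and
 * "dev" are assumed driver state:
 *
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_receive_skb(skb);
 *
 * Drivers running in hard interrupt context use netif_rx() instead.
 */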
2570 
2571 /* Network device is going away, flush any packets still pending  */
2572 static void flush_backlog(void *arg)
2573 {
2574 	struct net_device *dev = arg;
2575 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2576 	struct sk_buff *skb, *tmp;
2577 
2578 	skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2579 		if (skb->dev == dev) {
2580 			__skb_unlink(skb, &queue->input_pkt_queue);
2581 			kfree_skb(skb);
2582 		}
2583 }
2584 
2585 static int napi_gro_complete(struct sk_buff *skb)
2586 {
2587 	struct packet_type *ptype;
2588 	__be16 type = skb->protocol;
2589 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2590 	int err = -ENOENT;
2591 
2592 	if (NAPI_GRO_CB(skb)->count == 1) {
2593 		skb_shinfo(skb)->gso_size = 0;
2594 		goto out;
2595 	}
2596 
2597 	rcu_read_lock();
2598 	list_for_each_entry_rcu(ptype, head, list) {
2599 		if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2600 			continue;
2601 
2602 		err = ptype->gro_complete(skb);
2603 		break;
2604 	}
2605 	rcu_read_unlock();
2606 
2607 	if (err) {
2608 		WARN_ON(&ptype->list == head);
2609 		kfree_skb(skb);
2610 		return NET_RX_SUCCESS;
2611 	}
2612 
2613 out:
2614 	return netif_receive_skb(skb);
2615 }
2616 
2617 static void napi_gro_flush(struct napi_struct *napi)
2618 {
2619 	struct sk_buff *skb, *next;
2620 
2621 	for (skb = napi->gro_list; skb; skb = next) {
2622 		next = skb->next;
2623 		skb->next = NULL;
2624 		napi_gro_complete(skb);
2625 	}
2626 
2627 	napi->gro_count = 0;
2628 	napi->gro_list = NULL;
2629 }
2630 
2631 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2632 {
2633 	struct sk_buff **pp = NULL;
2634 	struct packet_type *ptype;
2635 	__be16 type = skb->protocol;
2636 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2637 	int same_flow;
2638 	int mac_len;
2639 	enum gro_result ret;
2640 
2641 	if (!(skb->dev->features & NETIF_F_GRO))
2642 		goto normal;
2643 
2644 	if (skb_is_gso(skb) || skb_has_frags(skb))
2645 		goto normal;
2646 
2647 	rcu_read_lock();
2648 	list_for_each_entry_rcu(ptype, head, list) {
2649 		if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2650 			continue;
2651 
2652 		skb_set_network_header(skb, skb_gro_offset(skb));
2653 		mac_len = skb->network_header - skb->mac_header;
2654 		skb->mac_len = mac_len;
2655 		NAPI_GRO_CB(skb)->same_flow = 0;
2656 		NAPI_GRO_CB(skb)->flush = 0;
2657 		NAPI_GRO_CB(skb)->free = 0;
2658 
2659 		pp = ptype->gro_receive(&napi->gro_list, skb);
2660 		break;
2661 	}
2662 	rcu_read_unlock();
2663 
2664 	if (&ptype->list == head)
2665 		goto normal;
2666 
2667 	same_flow = NAPI_GRO_CB(skb)->same_flow;
2668 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
2669 
2670 	if (pp) {
2671 		struct sk_buff *nskb = *pp;
2672 
2673 		*pp = nskb->next;
2674 		nskb->next = NULL;
2675 		napi_gro_complete(nskb);
2676 		napi->gro_count--;
2677 	}
2678 
2679 	if (same_flow)
2680 		goto ok;
2681 
2682 	if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
2683 		goto normal;
2684 
2685 	napi->gro_count++;
2686 	NAPI_GRO_CB(skb)->count = 1;
2687 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
2688 	skb->next = napi->gro_list;
2689 	napi->gro_list = skb;
2690 	ret = GRO_HELD;
2691 
2692 pull:
2693 	if (skb_headlen(skb) < skb_gro_offset(skb)) {
2694 		int grow = skb_gro_offset(skb) - skb_headlen(skb);
2695 
2696 		BUG_ON(skb->end - skb->tail < grow);
2697 
2698 		memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
2699 
2700 		skb->tail += grow;
2701 		skb->data_len -= grow;
2702 
2703 		skb_shinfo(skb)->frags[0].page_offset += grow;
2704 		skb_shinfo(skb)->frags[0].size -= grow;
2705 
2706 		if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
2707 			put_page(skb_shinfo(skb)->frags[0].page);
2708 			memmove(skb_shinfo(skb)->frags,
2709 				skb_shinfo(skb)->frags + 1,
2710 				--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
2711 		}
2712 	}
2713 
2714 ok:
2715 	return ret;
2716 
2717 normal:
2718 	ret = GRO_NORMAL;
2719 	goto pull;
2720 }
2721 EXPORT_SYMBOL(dev_gro_receive);
2722 
2723 static gro_result_t
2724 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2725 {
2726 	struct sk_buff *p;
2727 
2728 	if (netpoll_rx_on(skb))
2729 		return GRO_NORMAL;
2730 
2731 	for (p = napi->gro_list; p; p = p->next) {
2732 		NAPI_GRO_CB(p)->same_flow =
2733 			(p->dev == skb->dev) &&
2734 			!compare_ether_header(skb_mac_header(p),
2735 					      skb_gro_mac_header(skb));
2736 		NAPI_GRO_CB(p)->flush = 0;
2737 	}
2738 
2739 	return dev_gro_receive(napi, skb);
2740 }
2741 
2742 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
2743 {
2744 	switch (ret) {
2745 	case GRO_NORMAL:
2746 		if (netif_receive_skb(skb))
2747 			ret = GRO_DROP;
2748 		break;
2749 
2750 	case GRO_DROP:
2751 	case GRO_MERGED_FREE:
2752 		kfree_skb(skb);
2753 		break;
2754 
2755 	case GRO_HELD:
2756 	case GRO_MERGED:
2757 		break;
2758 	}
2759 
2760 	return ret;
2761 }
2762 EXPORT_SYMBOL(napi_skb_finish);
2763 
2764 void skb_gro_reset_offset(struct sk_buff *skb)
2765 {
2766 	NAPI_GRO_CB(skb)->data_offset = 0;
2767 	NAPI_GRO_CB(skb)->frag0 = NULL;
2768 	NAPI_GRO_CB(skb)->frag0_len = 0;
2769 
2770 	if (skb->mac_header == skb->tail &&
2771 	    !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
2772 		NAPI_GRO_CB(skb)->frag0 =
2773 			page_address(skb_shinfo(skb)->frags[0].page) +
2774 			skb_shinfo(skb)->frags[0].page_offset;
2775 		NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
2776 	}
2777 }
2778 EXPORT_SYMBOL(skb_gro_reset_offset);
2779 
2780 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2781 {
2782 	skb_gro_reset_offset(skb);
2783 
2784 	return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
2785 }
2786 EXPORT_SYMBOL(napi_gro_receive);
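
/*
 * Illustrative sketch (editorial example): GRO-aware drivers simply call
 * napi_gro_receive() where they would otherwise call netif_receive_skb()
 * from their ->poll() routine, letting same-flow segments be merged;
 * "napi", "skb" and "dev" are assumed driver state:
 *
 *	skb->protocol = eth_type_trans(skb, dev);
 *	napi_gro_receive(napi, skb);
 */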
2787 
2788 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2789 {
2790 	__skb_pull(skb, skb_headlen(skb));
2791 	skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2792 
2793 	napi->skb = skb;
2794 }
2795 EXPORT_SYMBOL(napi_reuse_skb);
2796 
2797 struct sk_buff *napi_get_frags(struct napi_struct *napi)
2798 {
2799 	struct sk_buff *skb = napi->skb;
2800 
2801 	if (!skb) {
2802 		skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
2803 		if (skb)
2804 			napi->skb = skb;
2805 	}
2806 	return skb;
2807 }
2808 EXPORT_SYMBOL(napi_get_frags);
2809 
2810 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
2811 			       gro_result_t ret)
2812 {
2813 	switch (ret) {
2814 	case GRO_NORMAL:
2815 	case GRO_HELD:
2816 		skb->protocol = eth_type_trans(skb, skb->dev);
2817 
2818 		if (ret == GRO_HELD)
2819 			skb_gro_pull(skb, -ETH_HLEN);
2820 		else if (netif_receive_skb(skb))
2821 			ret = GRO_DROP;
2822 		break;
2823 
2824 	case GRO_DROP:
2825 	case GRO_MERGED_FREE:
2826 		napi_reuse_skb(napi, skb);
2827 		break;
2828 
2829 	case GRO_MERGED:
2830 		break;
2831 	}
2832 
2833 	return ret;
2834 }
2835 EXPORT_SYMBOL(napi_frags_finish);
2836 
2837 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
2838 {
2839 	struct sk_buff *skb = napi->skb;
2840 	struct ethhdr *eth;
2841 	unsigned int hlen;
2842 	unsigned int off;
2843 
2844 	napi->skb = NULL;
2845 
2846 	skb_reset_mac_header(skb);
2847 	skb_gro_reset_offset(skb);
2848 
2849 	off = skb_gro_offset(skb);
2850 	hlen = off + sizeof(*eth);
2851 	eth = skb_gro_header_fast(skb, off);
2852 	if (skb_gro_header_hard(skb, hlen)) {
2853 		eth = skb_gro_header_slow(skb, hlen, off);
2854 		if (unlikely(!eth)) {
2855 			napi_reuse_skb(napi, skb);
2856 			skb = NULL;
2857 			goto out;
2858 		}
2859 	}
2860 
2861 	skb_gro_pull(skb, sizeof(*eth));
2862 
2863 	/*
2864 	 * This works because the only protocols we care about don't require
2865 	 * special handling.  We'll fix it up properly at the end.
2866 	 */
2867 	skb->protocol = eth->h_proto;
2868 
2869 out:
2870 	return skb;
2871 }
2872 EXPORT_SYMBOL(napi_frags_skb);
2873 
2874 gro_result_t napi_gro_frags(struct napi_struct *napi)
2875 {
2876 	struct sk_buff *skb = napi_frags_skb(napi);
2877 
2878 	if (!skb)
2879 		return GRO_DROP;
2880 
2881 	return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
2882 }
2883 EXPORT_SYMBOL(napi_gro_frags);
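
/*
 * Illustrative sketch (editorial example): drivers that receive directly
 * into page fragments can reuse the per-NAPI skb via napi_get_frags() and
 * then feed it to napi_gro_frags(); the frag and length bookkeeping is
 * abbreviated here, and "napi", "page" and "len" are assumptions:
 *
 *	skb = napi_get_frags(napi);
 *	if (!skb)
 *		return;			(drop: out of memory)
 *	skb_fill_page_desc(skb, 0, page, 0, len);
 *	skb->len += len;
 *	skb->data_len += len;
 *	skb->truesize += len;
 *	napi_gro_frags(napi);
 */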
2884 
2885 static int process_backlog(struct napi_struct *napi, int quota)
2886 {
2887 	int work = 0;
2888 	struct softnet_data *queue = &__get_cpu_var(softnet_data);
2889 	unsigned long start_time = jiffies;
2890 
2891 	napi->weight = weight_p;
2892 	do {
2893 		struct sk_buff *skb;
2894 
2895 		local_irq_disable();
2896 		skb = __skb_dequeue(&queue->input_pkt_queue);
2897 		if (!skb) {
2898 			__napi_complete(napi);
2899 			local_irq_enable();
2900 			break;
2901 		}
2902 		local_irq_enable();
2903 
2904 		netif_receive_skb(skb);
2905 	} while (++work < quota && jiffies == start_time);
2906 
2907 	return work;
2908 }
2909 
2910 /**
2911  * __napi_schedule - schedule for receive
2912  * @n: entry to schedule
2913  *
2914  * The entry's receive function will be scheduled to run
2915  */
2916 void __napi_schedule(struct napi_struct *n)
2917 {
2918 	unsigned long flags;
2919 
2920 	local_irq_save(flags);
2921 	list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2922 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
2923 	local_irq_restore(flags);
2924 }
2925 EXPORT_SYMBOL(__napi_schedule);
2926 
2927 void __napi_complete(struct napi_struct *n)
2928 {
2929 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
2930 	BUG_ON(n->gro_list);
2931 
2932 	list_del(&n->poll_list);
2933 	smp_mb__before_clear_bit();
2934 	clear_bit(NAPI_STATE_SCHED, &n->state);
2935 }
2936 EXPORT_SYMBOL(__napi_complete);
2937 
2938 void napi_complete(struct napi_struct *n)
2939 {
2940 	unsigned long flags;
2941 
2942 	/*
2943 	 * don't let napi dequeue from the cpu poll list
2944 	 * just in case it's running on a different cpu
2945 	 */
2946 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2947 		return;
2948 
2949 	napi_gro_flush(n);
2950 	local_irq_save(flags);
2951 	__napi_complete(n);
2952 	local_irq_restore(flags);
2953 }
2954 EXPORT_SYMBOL(napi_complete);
2955 
2956 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2957 		    int (*poll)(struct napi_struct *, int), int weight)
2958 {
2959 	INIT_LIST_HEAD(&napi->poll_list);
2960 	napi->gro_count = 0;
2961 	napi->gro_list = NULL;
2962 	napi->skb = NULL;
2963 	napi->poll = poll;
2964 	napi->weight = weight;
2965 	list_add(&napi->dev_list, &dev->napi_list);
2966 	napi->dev = dev;
2967 #ifdef CONFIG_NETPOLL
2968 	spin_lock_init(&napi->poll_lock);
2969 	napi->poll_owner = -1;
2970 #endif
2971 	set_bit(NAPI_STATE_SCHED, &napi->state);
2972 }
2973 EXPORT_SYMBOL(netif_napi_add);
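
/*
 * Illustrative sketch (editorial example) of the NAPI life cycle these
 * helpers implement; "my_poll", "priv" and the weight of 64 are made-up
 * driver names, everything else is the real API:
 *
 *	probe:     netif_napi_add(dev, &priv->napi, my_poll, 64);
 *	hard IRQ:  if (napi_schedule_prep(&priv->napi))
 *	                   __napi_schedule(&priv->napi);
 *	my_poll(): work = clean up to "budget" packets from the RX ring;
 *	           if (work < budget)
 *	                   napi_complete(&priv->napi);
 *	           return work;
 */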
2974 
2975 void netif_napi_del(struct napi_struct *napi)
2976 {
2977 	struct sk_buff *skb, *next;
2978 
2979 	list_del_init(&napi->dev_list);
2980 	napi_free_frags(napi);
2981 
2982 	for (skb = napi->gro_list; skb; skb = next) {
2983 		next = skb->next;
2984 		skb->next = NULL;
2985 		kfree_skb(skb);
2986 	}
2987 
2988 	napi->gro_list = NULL;
2989 	napi->gro_count = 0;
2990 }
2991 EXPORT_SYMBOL(netif_napi_del);
2992 
2993 
2994 static void net_rx_action(struct softirq_action *h)
2995 {
2996 	struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2997 	unsigned long time_limit = jiffies + 2;
2998 	int budget = netdev_budget;
2999 	void *have;
3000 
3001 	local_irq_disable();
3002 
3003 	while (!list_empty(list)) {
3004 		struct napi_struct *n;
3005 		int work, weight;
3006 
3007 		/* If the softirq window is exhausted then punt.
3008 		 * Allow this to run for 2 jiffies, which allows
3009 		 * an average latency of 1.5/HZ.
3010 		 */
3011 		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3012 			goto softnet_break;
3013 
3014 		local_irq_enable();
3015 
3016 		/* Even though interrupts have been re-enabled, this
3017 		 * access is safe because interrupts can only add new
3018 		 * entries to the tail of this list, and only ->poll()
3019 		 * calls can remove this head entry from the list.
3020 		 */
3021 		n = list_first_entry(list, struct napi_struct, poll_list);
3022 
3023 		have = netpoll_poll_lock(n);
3024 
3025 		weight = n->weight;
3026 
3027 		/* This NAPI_STATE_SCHED test is for avoiding a race
3028 		 * with netpoll's poll_napi().  Only the entity which
3029 		 * obtains the lock and sees NAPI_STATE_SCHED set will
3030 		 * actually make the ->poll() call.  Therefore we avoid
3031 		 * accidentally calling ->poll() when NAPI is not scheduled.
3032 		 */
3033 		work = 0;
3034 		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3035 			work = n->poll(n, weight);
3036 			trace_napi_poll(n);
3037 		}
3038 
3039 		WARN_ON_ONCE(work > weight);
3040 
3041 		budget -= work;
3042 
3043 		local_irq_disable();
3044 
3045 		/* Drivers must not modify the NAPI state if they
3046 		 * consume the entire weight.  In such cases this code
3047 		 * still "owns" the NAPI instance and therefore can
3048 		 * move the instance around on the list at-will.
3049 		 */
3050 		if (unlikely(work == weight)) {
3051 			if (unlikely(napi_disable_pending(n))) {
3052 				local_irq_enable();
3053 				napi_complete(n);
3054 				local_irq_disable();
3055 			} else
3056 				list_move_tail(&n->poll_list, list);
3057 		}
3058 
3059 		netpoll_poll_unlock(have);
3060 	}
3061 out:
3062 	local_irq_enable();
3063 
3064 #ifdef CONFIG_NET_DMA
3065 	/*
3066 	 * There may not be any more sk_buffs coming right now, so push
3067 	 * any pending DMA copies to hardware
3068 	 */
3069 	dma_issue_pending_all();
3070 #endif
3071 
3072 	return;
3073 
3074 softnet_break:
3075 	__get_cpu_var(netdev_rx_stat).time_squeeze++;
3076 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3077 	goto out;
3078 }
3079 
3080 static gifconf_func_t *gifconf_list[NPROTO];
3081 
3082 /**
3083  *	register_gifconf	-	register a SIOCGIF handler
3084  *	@family: Address family
3085  *	@gifconf: Function handler
3086  *
3087  *	Register protocol dependent address dumping routines. The handler
3088  *	that is passed must not be freed or reused until it has been replaced
3089  *	by another handler.
3090  */
3091 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3092 {
3093 	if (family >= NPROTO)
3094 		return -EINVAL;
3095 	gifconf_list[family] = gifconf;
3096 	return 0;
3097 }
3098 EXPORT_SYMBOL(register_gifconf);
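
/*
 * Illustrative sketch (editorial example): an address family registers its
 * SIOCGIFCONF handler once at init time, as IPv4 does from devinet_init():
 *
 *	register_gifconf(PF_INET, inet_gifconf);
 */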
3099 
3100 
3101 /*
3102  *	Map an interface index to its name (SIOCGIFNAME)
3103  */
3104 
3105 /*
3106  *	We need this ioctl for efficient implementation of the
3107  *	if_indextoname() function required by the IPv6 API.  Without
3108  *	it, we would have to search all the interfaces to find a
3109  *	match.  --pb
3110  */
3111 
3112 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3113 {
3114 	struct net_device *dev;
3115 	struct ifreq ifr;
3116 
3117 	/*
3118 	 *	Fetch the caller's info block.
3119 	 */
3120 
3121 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3122 		return -EFAULT;
3123 
3124 	rcu_read_lock();
3125 	dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3126 	if (!dev) {
3127 		rcu_read_unlock();
3128 		return -ENODEV;
3129 	}
3130 
3131 	strcpy(ifr.ifr_name, dev->name);
3132 	rcu_read_unlock();
3133 
3134 	if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3135 		return -EFAULT;
3136 	return 0;
3137 }
3138 
3139 /*
3140  *	Perform a SIOCGIFCONF call. This structure will change
3141  *	size eventually, and there is nothing I can do about it.
3142  *	Thus we will need a 'compatibility mode'.
3143  */
3144 
3145 static int dev_ifconf(struct net *net, char __user *arg)
3146 {
3147 	struct ifconf ifc;
3148 	struct net_device *dev;
3149 	char __user *pos;
3150 	int len;
3151 	int total;
3152 	int i;
3153 
3154 	/*
3155 	 *	Fetch the caller's info block.
3156 	 */
3157 
3158 	if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3159 		return -EFAULT;
3160 
3161 	pos = ifc.ifc_buf;
3162 	len = ifc.ifc_len;
3163 
3164 	/*
3165 	 *	Loop over the interfaces, and write an info block for each.
3166 	 */
3167 
3168 	total = 0;
3169 	for_each_netdev(net, dev) {
3170 		for (i = 0; i < NPROTO; i++) {
3171 			if (gifconf_list[i]) {
3172 				int done;
3173 				if (!pos)
3174 					done = gifconf_list[i](dev, NULL, 0);
3175 				else
3176 					done = gifconf_list[i](dev, pos + total,
3177 							       len - total);
3178 				if (done < 0)
3179 					return -EFAULT;
3180 				total += done;
3181 			}
3182 		}
3183 	}
3184 
3185 	/*
3186 	 *	All done.  Write the updated control block back to the caller.
3187 	 */
3188 	ifc.ifc_len = total;
3189 
3190 	/*
3191 	 * 	Both BSD and Solaris return 0 here, so we do too.
3192 	 */
3193 	return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3194 }
3195 
3196 #ifdef CONFIG_PROC_FS
3197 /*
3198  *	This is invoked by the /proc filesystem handler to display a device
3199  *	in detail.
3200  */
3201 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3202 	__acquires(RCU)
3203 {
3204 	struct net *net = seq_file_net(seq);
3205 	loff_t off;
3206 	struct net_device *dev;
3207 
3208 	rcu_read_lock();
3209 	if (!*pos)
3210 		return SEQ_START_TOKEN;
3211 
3212 	off = 1;
3213 	for_each_netdev_rcu(net, dev)
3214 		if (off++ == *pos)
3215 			return dev;
3216 
3217 	return NULL;
3218 }
3219 
3220 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3221 {
3222 	struct net_device *dev = (v == SEQ_START_TOKEN) ?
3223 				  first_net_device(seq_file_net(seq)) :
3224 				  next_net_device((struct net_device *)v);
3225 
3226 	++*pos;
3227 	return rcu_dereference(dev);
3228 }
3229 
3230 void dev_seq_stop(struct seq_file *seq, void *v)
3231 	__releases(RCU)
3232 {
3233 	rcu_read_unlock();
3234 }
3235 
3236 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3237 {
3238 	const struct net_device_stats *stats = dev_get_stats(dev);
3239 
3240 	seq_printf(seq, "%6s: %7lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
3241 		   "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3242 		   dev->name, stats->rx_bytes, stats->rx_packets,
3243 		   stats->rx_errors,
3244 		   stats->rx_dropped + stats->rx_missed_errors,
3245 		   stats->rx_fifo_errors,
3246 		   stats->rx_length_errors + stats->rx_over_errors +
3247 		    stats->rx_crc_errors + stats->rx_frame_errors,
3248 		   stats->rx_compressed, stats->multicast,
3249 		   stats->tx_bytes, stats->tx_packets,
3250 		   stats->tx_errors, stats->tx_dropped,
3251 		   stats->tx_fifo_errors, stats->collisions,
3252 		   stats->tx_carrier_errors +
3253 		    stats->tx_aborted_errors +
3254 		    stats->tx_window_errors +
3255 		    stats->tx_heartbeat_errors,
3256 		   stats->tx_compressed);
3257 }
3258 
3259 /*
3260  *	Called from the PROCfs module. This now uses the new arbitrary sized
3261  *	/proc/net interface to create /proc/net/dev
3262  */
3263 static int dev_seq_show(struct seq_file *seq, void *v)
3264 {
3265 	if (v == SEQ_START_TOKEN)
3266 		seq_puts(seq, "Inter-|   Receive                            "
3267 			      "                    |  Transmit\n"
3268 			      " face |bytes    packets errs drop fifo frame "
3269 			      "compressed multicast|bytes    packets errs "
3270 			      "drop fifo colls carrier compressed\n");
3271 	else
3272 		dev_seq_printf_stats(seq, v);
3273 	return 0;
3274 }
3275 
3276 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
3277 {
3278 	struct netif_rx_stats *rc = NULL;
3279 
3280 	while (*pos < nr_cpu_ids)
3281 		if (cpu_online(*pos)) {
3282 			rc = &per_cpu(netdev_rx_stat, *pos);
3283 			break;
3284 		} else
3285 			++*pos;
3286 	return rc;
3287 }
3288 
3289 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3290 {
3291 	return softnet_get_online(pos);
3292 }
3293 
3294 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3295 {
3296 	++*pos;
3297 	return softnet_get_online(pos);
3298 }
3299 
3300 static void softnet_seq_stop(struct seq_file *seq, void *v)
3301 {
3302 }
3303 
3304 static int softnet_seq_show(struct seq_file *seq, void *v)
3305 {
3306 	struct netif_rx_stats *s = v;
3307 
3308 	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3309 		   s->total, s->dropped, s->time_squeeze, 0,
3310 		   0, 0, 0, 0, /* was fastroute */
3311 		   s->cpu_collision);
3312 	return 0;
3313 }
3314 
3315 static const struct seq_operations dev_seq_ops = {
3316 	.start = dev_seq_start,
3317 	.next  = dev_seq_next,
3318 	.stop  = dev_seq_stop,
3319 	.show  = dev_seq_show,
3320 };
3321 
3322 static int dev_seq_open(struct inode *inode, struct file *file)
3323 {
3324 	return seq_open_net(inode, file, &dev_seq_ops,
3325 			    sizeof(struct seq_net_private));
3326 }
3327 
3328 static const struct file_operations dev_seq_fops = {
3329 	.owner	 = THIS_MODULE,
3330 	.open    = dev_seq_open,
3331 	.read    = seq_read,
3332 	.llseek  = seq_lseek,
3333 	.release = seq_release_net,
3334 };
3335 
3336 static const struct seq_operations softnet_seq_ops = {
3337 	.start = softnet_seq_start,
3338 	.next  = softnet_seq_next,
3339 	.stop  = softnet_seq_stop,
3340 	.show  = softnet_seq_show,
3341 };
3342 
3343 static int softnet_seq_open(struct inode *inode, struct file *file)
3344 {
3345 	return seq_open(file, &softnet_seq_ops);
3346 }
3347 
3348 static const struct file_operations softnet_seq_fops = {
3349 	.owner	 = THIS_MODULE,
3350 	.open    = softnet_seq_open,
3351 	.read    = seq_read,
3352 	.llseek  = seq_lseek,
3353 	.release = seq_release,
3354 };
3355 
3356 static void *ptype_get_idx(loff_t pos)
3357 {
3358 	struct packet_type *pt = NULL;
3359 	loff_t i = 0;
3360 	int t;
3361 
3362 	list_for_each_entry_rcu(pt, &ptype_all, list) {
3363 		if (i == pos)
3364 			return pt;
3365 		++i;
3366 	}
3367 
3368 	for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3369 		list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3370 			if (i == pos)
3371 				return pt;
3372 			++i;
3373 		}
3374 	}
3375 	return NULL;
3376 }
3377 
3378 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3379 	__acquires(RCU)
3380 {
3381 	rcu_read_lock();
3382 	return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3383 }
3384 
3385 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3386 {
3387 	struct packet_type *pt;
3388 	struct list_head *nxt;
3389 	int hash;
3390 
3391 	++*pos;
3392 	if (v == SEQ_START_TOKEN)
3393 		return ptype_get_idx(0);
3394 
3395 	pt = v;
3396 	nxt = pt->list.next;
3397 	if (pt->type == htons(ETH_P_ALL)) {
3398 		if (nxt != &ptype_all)
3399 			goto found;
3400 		hash = 0;
3401 		nxt = ptype_base[0].next;
3402 	} else
3403 		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3404 
3405 	while (nxt == &ptype_base[hash]) {
3406 		if (++hash >= PTYPE_HASH_SIZE)
3407 			return NULL;
3408 		nxt = ptype_base[hash].next;
3409 	}
3410 found:
3411 	return list_entry(nxt, struct packet_type, list);
3412 }
3413 
3414 static void ptype_seq_stop(struct seq_file *seq, void *v)
3415 	__releases(RCU)
3416 {
3417 	rcu_read_unlock();
3418 }
3419 
3420 static int ptype_seq_show(struct seq_file *seq, void *v)
3421 {
3422 	struct packet_type *pt = v;
3423 
3424 	if (v == SEQ_START_TOKEN)
3425 		seq_puts(seq, "Type Device      Function\n");
3426 	else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3427 		if (pt->type == htons(ETH_P_ALL))
3428 			seq_puts(seq, "ALL ");
3429 		else
3430 			seq_printf(seq, "%04x", ntohs(pt->type));
3431 
3432 		seq_printf(seq, " %-8s %pF\n",
3433 			   pt->dev ? pt->dev->name : "", pt->func);
3434 	}
3435 
3436 	return 0;
3437 }
3438 
3439 static const struct seq_operations ptype_seq_ops = {
3440 	.start = ptype_seq_start,
3441 	.next  = ptype_seq_next,
3442 	.stop  = ptype_seq_stop,
3443 	.show  = ptype_seq_show,
3444 };
3445 
3446 static int ptype_seq_open(struct inode *inode, struct file *file)
3447 {
3448 	return seq_open_net(inode, file, &ptype_seq_ops,
3449 			sizeof(struct seq_net_private));
3450 }
3451 
3452 static const struct file_operations ptype_seq_fops = {
3453 	.owner	 = THIS_MODULE,
3454 	.open    = ptype_seq_open,
3455 	.read    = seq_read,
3456 	.llseek  = seq_lseek,
3457 	.release = seq_release_net,
3458 };
3459 
3460 
3461 static int __net_init dev_proc_net_init(struct net *net)
3462 {
3463 	int rc = -ENOMEM;
3464 
3465 	if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3466 		goto out;
3467 	if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3468 		goto out_dev;
3469 	if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3470 		goto out_softnet;
3471 
3472 	if (wext_proc_init(net))
3473 		goto out_ptype;
3474 	rc = 0;
3475 out:
3476 	return rc;
3477 out_ptype:
3478 	proc_net_remove(net, "ptype");
3479 out_softnet:
3480 	proc_net_remove(net, "softnet_stat");
3481 out_dev:
3482 	proc_net_remove(net, "dev");
3483 	goto out;
3484 }
3485 
3486 static void __net_exit dev_proc_net_exit(struct net *net)
3487 {
3488 	wext_proc_exit(net);
3489 
3490 	proc_net_remove(net, "ptype");
3491 	proc_net_remove(net, "softnet_stat");
3492 	proc_net_remove(net, "dev");
3493 }
3494 
3495 static struct pernet_operations __net_initdata dev_proc_ops = {
3496 	.init = dev_proc_net_init,
3497 	.exit = dev_proc_net_exit,
3498 };
3499 
3500 static int __init dev_proc_init(void)
3501 {
3502 	return register_pernet_subsys(&dev_proc_ops);
3503 }
3504 #else
3505 #define dev_proc_init() 0
3506 #endif	/* CONFIG_PROC_FS */
3507 
3508 
3509 /**
3510  *	netdev_set_master	-	set up master/slave pair
3511  *	@slave: slave device
3512  *	@master: new master device
3513  *
3514  *	Changes the master device of the slave. Pass %NULL to break the
3515  *	bonding. The caller must hold the RTNL semaphore. On a failure
3516  *	a negative errno code is returned. On success the reference counts
3517  *	are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3518  *	function returns zero.
3519  */
3520 int netdev_set_master(struct net_device *slave, struct net_device *master)
3521 {
3522 	struct net_device *old = slave->master;
3523 
3524 	ASSERT_RTNL();
3525 
3526 	if (master) {
3527 		if (old)
3528 			return -EBUSY;
3529 		dev_hold(master);
3530 	}
3531 
3532 	slave->master = master;
3533 
3534 	synchronize_net();
3535 
3536 	if (old)
3537 		dev_put(old);
3538 
3539 	if (master)
3540 		slave->flags |= IFF_SLAVE;
3541 	else
3542 		slave->flags &= ~IFF_SLAVE;
3543 
3544 	rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3545 	return 0;
3546 }
3547 EXPORT_SYMBOL(netdev_set_master);
3548 
3549 static void dev_change_rx_flags(struct net_device *dev, int flags)
3550 {
3551 	const struct net_device_ops *ops = dev->netdev_ops;
3552 
3553 	if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3554 		ops->ndo_change_rx_flags(dev, flags);
3555 }
3556 
3557 static int __dev_set_promiscuity(struct net_device *dev, int inc)
3558 {
3559 	unsigned short old_flags = dev->flags;
3560 	uid_t uid;
3561 	gid_t gid;
3562 
3563 	ASSERT_RTNL();
3564 
3565 	dev->flags |= IFF_PROMISC;
3566 	dev->promiscuity += inc;
3567 	if (dev->promiscuity == 0) {
3568 		/*
3569 		 * Avoid overflow.
3570 		 * If inc causes overflow, leave promisc untouched and return an error.
3571 		 */
3572 		if (inc < 0)
3573 			dev->flags &= ~IFF_PROMISC;
3574 		else {
3575 			dev->promiscuity -= inc;
3576 			printk(KERN_WARNING "%s: promiscuity counter would "
3577 				"overflow; set promiscuity failed, the promiscuity "
3578 				"feature of the device might be broken.\n", dev->name);
3579 			return -EOVERFLOW;
3580 		}
3581 	}
3582 	if (dev->flags != old_flags) {
3583 		printk(KERN_INFO "device %s %s promiscuous mode\n",
3584 		       dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3585 							       "left");
3586 		if (audit_enabled) {
3587 			current_uid_gid(&uid, &gid);
3588 			audit_log(current->audit_context, GFP_ATOMIC,
3589 				AUDIT_ANOM_PROMISCUOUS,
3590 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3591 				dev->name, (dev->flags & IFF_PROMISC),
3592 				(old_flags & IFF_PROMISC),
3593 				audit_get_loginuid(current),
3594 				uid, gid,
3595 				audit_get_sessionid(current));
3596 		}
3597 
3598 		dev_change_rx_flags(dev, IFF_PROMISC);
3599 	}
3600 	return 0;
3601 }
3602 
3603 /**
3604  *	dev_set_promiscuity	- update promiscuity count on a device
3605  *	@dev: device
3606  *	@inc: modifier
3607  *
3608  *	Add or remove promiscuity from a device. While the count in the device
3609  *	remains above zero the interface remains promiscuous. Once it hits zero
3610  *	the device reverts back to normal filtering operation. A negative inc
3611  *	value is used to drop promiscuity on the device.
3612  *	Return 0 if successful or a negative errno code on error.
3613  */
3614 int dev_set_promiscuity(struct net_device *dev, int inc)
3615 {
3616 	unsigned short old_flags = dev->flags;
3617 	int err;
3618 
3619 	err = __dev_set_promiscuity(dev, inc);
3620 	if (err < 0)
3621 		return err;
3622 	if (dev->flags != old_flags)
3623 		dev_set_rx_mode(dev);
3624 	return err;
3625 }
3626 EXPORT_SYMBOL(dev_set_promiscuity);
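
/*
 * Illustrative sketch (editorial example): a packet-capture style user
 * bumps the counter while sniffing and drops it again when done, always
 * under the RTNL; "dev" is an assumption:
 *
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, 1);
 *	rtnl_unlock();
 *	...capture traffic...
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);
 *	rtnl_unlock();
 */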
3627 
3628 /**
3629  *	dev_set_allmulti	- update allmulti count on a device
3630  *	@dev: device
3631  *	@inc: modifier
3632  *
3633  *	Add or remove reception of all multicast frames to a device. While the
3634  *	count in the device remains above zero the interface remains listening
3635  *	to all interfaces. Once it hits zero the device reverts back to normal
3636  *	filtering operation. A negative @inc value is used to drop the counter
3637  *	when releasing a resource needing all multicasts.
3638  *	Return 0 if successful or a negative errno code on error.
3639  */
3640 
3641 int dev_set_allmulti(struct net_device *dev, int inc)
3642 {
3643 	unsigned short old_flags = dev->flags;
3644 
3645 	ASSERT_RTNL();
3646 
3647 	dev->flags |= IFF_ALLMULTI;
3648 	dev->allmulti += inc;
3649 	if (dev->allmulti == 0) {
3650 		/*
3651 		 * Avoid overflow.
3652 		 * If inc causes overflow, leave allmulti untouched and return an error.
3653 		 */
3654 		if (inc < 0)
3655 			dev->flags &= ~IFF_ALLMULTI;
3656 		else {
3657 			dev->allmulti -= inc;
3658 			printk(KERN_WARNING "%s: allmulti counter would "
3659 				"overflow; set allmulti failed, the allmulti "
3660 				"feature of the device might be broken.\n", dev->name);
3661 			return -EOVERFLOW;
3662 		}
3663 	}
3664 	if (dev->flags ^ old_flags) {
3665 		dev_change_rx_flags(dev, IFF_ALLMULTI);
3666 		dev_set_rx_mode(dev);
3667 	}
3668 	return 0;
3669 }
3670 EXPORT_SYMBOL(dev_set_allmulti);
3671 
3672 /*
3673  *	Upload unicast and multicast address lists to device and
3674  *	configure RX filtering. When the device doesn't support unicast
3675  *	filtering it is put in promiscuous mode while unicast addresses
3676  *	are present.
3677  */
3678 void __dev_set_rx_mode(struct net_device *dev)
3679 {
3680 	const struct net_device_ops *ops = dev->netdev_ops;
3681 
3682 	/* dev_open will call this function so the list will stay sane. */
3683 	if (!(dev->flags&IFF_UP))
3684 		return;
3685 
3686 	if (!netif_device_present(dev))
3687 		return;
3688 
3689 	if (ops->ndo_set_rx_mode)
3690 		ops->ndo_set_rx_mode(dev);
3691 	else {
3692 		/* Unicast address changes may only happen under the rtnl,
3693 		 * therefore calling __dev_set_promiscuity here is safe.
3694 		 */
3695 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
3696 			__dev_set_promiscuity(dev, 1);
3697 			dev->uc_promisc = 1;
3698 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
3699 			__dev_set_promiscuity(dev, -1);
3700 			dev->uc_promisc = 0;
3701 		}
3702 
3703 		if (ops->ndo_set_multicast_list)
3704 			ops->ndo_set_multicast_list(dev);
3705 	}
3706 }
3707 
3708 void dev_set_rx_mode(struct net_device *dev)
3709 {
3710 	netif_addr_lock_bh(dev);
3711 	__dev_set_rx_mode(dev);
3712 	netif_addr_unlock_bh(dev);
3713 }
3714 
3715 /* hw addresses list handling functions */
3716 
3717 static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr,
3718 			 int addr_len, unsigned char addr_type)
3719 {
3720 	struct netdev_hw_addr *ha;
3721 	int alloc_size;
3722 
3723 	if (addr_len > MAX_ADDR_LEN)
3724 		return -EINVAL;
3725 
3726 	list_for_each_entry(ha, &list->list, list) {
3727 		if (!memcmp(ha->addr, addr, addr_len) &&
3728 		    ha->type == addr_type) {
3729 			ha->refcount++;
3730 			return 0;
3731 		}
3732 	}
3733 
3734 
3735 	alloc_size = sizeof(*ha);
3736 	if (alloc_size < L1_CACHE_BYTES)
3737 		alloc_size = L1_CACHE_BYTES;
3738 	ha = kmalloc(alloc_size, GFP_ATOMIC);
3739 	if (!ha)
3740 		return -ENOMEM;
3741 	memcpy(ha->addr, addr, addr_len);
3742 	ha->type = addr_type;
3743 	ha->refcount = 1;
3744 	ha->synced = false;
3745 	list_add_tail_rcu(&ha->list, &list->list);
3746 	list->count++;
3747 	return 0;
3748 }
3749 
3750 static void ha_rcu_free(struct rcu_head *head)
3751 {
3752 	struct netdev_hw_addr *ha;
3753 
3754 	ha = container_of(head, struct netdev_hw_addr, rcu_head);
3755 	kfree(ha);
3756 }
3757 
3758 static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr,
3759 			 int addr_len, unsigned char addr_type)
3760 {
3761 	struct netdev_hw_addr *ha;
3762 
3763 	list_for_each_entry(ha, &list->list, list) {
3764 		if (!memcmp(ha->addr, addr, addr_len) &&
3765 		    (ha->type == addr_type || !addr_type)) {
3766 			if (--ha->refcount)
3767 				return 0;
3768 			list_del_rcu(&ha->list);
3769 			call_rcu(&ha->rcu_head, ha_rcu_free);
3770 			list->count--;
3771 			return 0;
3772 		}
3773 	}
3774 	return -ENOENT;
3775 }
3776 
3777 static int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list,
3778 				  struct netdev_hw_addr_list *from_list,
3779 				  int addr_len,
3780 				  unsigned char addr_type)
3781 {
3782 	int err;
3783 	struct netdev_hw_addr *ha, *ha2;
3784 	unsigned char type;
3785 
3786 	list_for_each_entry(ha, &from_list->list, list) {
3787 		type = addr_type ? addr_type : ha->type;
3788 		err = __hw_addr_add(to_list, ha->addr, addr_len, type);
3789 		if (err)
3790 			goto unroll;
3791 	}
3792 	return 0;
3793 
3794 unroll:
3795 	list_for_each_entry(ha2, &from_list->list, list) {
3796 		if (ha2 == ha)
3797 			break;
3798 		type = addr_type ? addr_type : ha2->type;
3799 		__hw_addr_del(to_list, ha2->addr, addr_len, type);
3800 	}
3801 	return err;
3802 }
3803 
3804 static void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list,
3805 				   struct netdev_hw_addr_list *from_list,
3806 				   int addr_len,
3807 				   unsigned char addr_type)
3808 {
3809 	struct netdev_hw_addr *ha;
3810 	unsigned char type;
3811 
3812 	list_for_each_entry(ha, &from_list->list, list) {
3813 		type = addr_type ? addr_type : ha->type;
3814 		__hw_addr_del(to_list, ha->addr, addr_len, type);
3815 	}
3816 }
3817 
3818 static int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
3819 			  struct netdev_hw_addr_list *from_list,
3820 			  int addr_len)
3821 {
3822 	int err = 0;
3823 	struct netdev_hw_addr *ha, *tmp;
3824 
3825 	list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3826 		if (!ha->synced) {
3827 			err = __hw_addr_add(to_list, ha->addr,
3828 					    addr_len, ha->type);
3829 			if (err)
3830 				break;
3831 			ha->synced = true;
3832 			ha->refcount++;
3833 		} else if (ha->refcount == 1) {
3834 			__hw_addr_del(to_list, ha->addr, addr_len, ha->type);
3835 			__hw_addr_del(from_list, ha->addr, addr_len, ha->type);
3836 		}
3837 	}
3838 	return err;
3839 }
3840 
3841 static void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
3842 			     struct netdev_hw_addr_list *from_list,
3843 			     int addr_len)
3844 {
3845 	struct netdev_hw_addr *ha, *tmp;
3846 
3847 	list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
3848 		if (ha->synced) {
3849 			__hw_addr_del(to_list, ha->addr,
3850 				      addr_len, ha->type);
3851 			ha->synced = false;
3852 			__hw_addr_del(from_list, ha->addr,
3853 				      addr_len, ha->type);
3854 		}
3855 	}
3856 }
3857 
3858 static void __hw_addr_flush(struct netdev_hw_addr_list *list)
3859 {
3860 	struct netdev_hw_addr *ha, *tmp;
3861 
3862 	list_for_each_entry_safe(ha, tmp, &list->list, list) {
3863 		list_del_rcu(&ha->list);
3864 		call_rcu(&ha->rcu_head, ha_rcu_free);
3865 	}
3866 	list->count = 0;
3867 }
3868 
3869 static void __hw_addr_init(struct netdev_hw_addr_list *list)
3870 {
3871 	INIT_LIST_HEAD(&list->list);
3872 	list->count = 0;
3873 }
3874 
3875 /* Device addresses handling functions */
3876 
3877 static void dev_addr_flush(struct net_device *dev)
3878 {
3879 	/* rtnl_mutex must be held here */
3880 
3881 	__hw_addr_flush(&dev->dev_addrs);
3882 	dev->dev_addr = NULL;
3883 }
3884 
3885 static int dev_addr_init(struct net_device *dev)
3886 {
3887 	unsigned char addr[MAX_ADDR_LEN];
3888 	struct netdev_hw_addr *ha;
3889 	int err;
3890 
3891 	/* rtnl_mutex must be held here */
3892 
3893 	__hw_addr_init(&dev->dev_addrs);
3894 	memset(addr, 0, sizeof(addr));
3895 	err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr),
3896 			    NETDEV_HW_ADDR_T_LAN);
3897 	if (!err) {
3898 		/*
3899 		 * Get the first (previously created) address from the list
3900 		 * and set dev_addr pointer to this location.
3901 		 */
3902 		ha = list_first_entry(&dev->dev_addrs.list,
3903 				      struct netdev_hw_addr, list);
3904 		dev->dev_addr = ha->addr;
3905 	}
3906 	return err;
3907 }
3908 
3909 /**
3910  *	dev_addr_add	- Add a device address
3911  *	@dev: device
3912  *	@addr: address to add
3913  *	@addr_type: address type
3914  *
3915  *	Add a device address to the device or increase the reference count if
3916  *	it already exists.
3917  *
3918  *	The caller must hold the rtnl_mutex.
3919  */
3920 int dev_addr_add(struct net_device *dev, unsigned char *addr,
3921 		 unsigned char addr_type)
3922 {
3923 	int err;
3924 
3925 	ASSERT_RTNL();
3926 
3927 	err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
3928 	if (!err)
3929 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3930 	return err;
3931 }
3932 EXPORT_SYMBOL(dev_addr_add);
3933 
3934 /**
3935  *	dev_addr_del	- Release a device address.
3936  *	@dev: device
3937  *	@addr: address to delete
3938  *	@addr_type: address type
3939  *
3940  *	Release reference to a device address and remove it from the device
3941  *	if the reference count drops to zero.
3942  *
3943  *	The caller must hold the rtnl_mutex.
3944  */
3945 int dev_addr_del(struct net_device *dev, unsigned char *addr,
3946 		 unsigned char addr_type)
3947 {
3948 	int err;
3949 	struct netdev_hw_addr *ha;
3950 
3951 	ASSERT_RTNL();
3952 
3953 	/*
3954 	 * We cannot remove the first address from the list because
3955 	 * dev->dev_addr points to it.
3956 	 */
3957 	ha = list_first_entry(&dev->dev_addrs.list,
3958 			      struct netdev_hw_addr, list);
3959 	if (ha->addr == dev->dev_addr && ha->refcount == 1)
3960 		return -ENOENT;
3961 
3962 	err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
3963 			    addr_type);
3964 	if (!err)
3965 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3966 	return err;
3967 }
3968 EXPORT_SYMBOL(dev_addr_del);
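/*
 * Illustrative sketch (hypothetical, not part of this file): how a caller
 * might exercise dev_addr_add()/dev_addr_del() on an already-registered
 * device.  The helper name and its arguments are assumptions for the
 * example only; the address must be dev->addr_len bytes long.
 */
#if 0
static int example_dev_addr(struct net_device *dev, unsigned char *mac)
{
	int err;

	rtnl_lock();				/* both helpers require RTNL */
	err = dev_addr_add(dev, mac, NETDEV_HW_ADDR_T_LAN);
	if (!err)
		err = dev_addr_del(dev, mac, NETDEV_HW_ADDR_T_LAN);
	rtnl_unlock();
	return err;
}
#endif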
3969 
3970 /**
3971  *	dev_addr_add_multiple	- Add device addresses from another device
3972  *	@to_dev: device to which addresses will be added
3973  *	@from_dev: device from which addresses will be added
3974  *	@addr_type: address type - 0 means type will be used from from_dev
3975  *
3976  *	Add device addresses of the one device to another.
3977  *
3978  *	The caller must hold the rtnl_mutex.
3979  */
3980 int dev_addr_add_multiple(struct net_device *to_dev,
3981 			  struct net_device *from_dev,
3982 			  unsigned char addr_type)
3983 {
3984 	int err;
3985 
3986 	ASSERT_RTNL();
3987 
3988 	if (from_dev->addr_len != to_dev->addr_len)
3989 		return -EINVAL;
3990 	err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
3991 				     to_dev->addr_len, addr_type);
3992 	if (!err)
3993 		call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
3994 	return err;
3995 }
3996 EXPORT_SYMBOL(dev_addr_add_multiple);
3997 
3998 /**
3999  *	dev_addr_del_multiple	- Delete device addresses by another device
4000  *	@to_dev: device where the addresses will be deleted
4001  *	@from_dev: device supplying the addresses to be deleted
4002  *	@addr_type: address type - 0 means the type will be taken from from_dev
4003  *
4004  *	Deletes from @to_dev the addresses that are listed on @from_dev.
4005  *
4006  *	The caller must hold the rtnl_mutex.
4007  */
4008 int dev_addr_del_multiple(struct net_device *to_dev,
4009 			  struct net_device *from_dev,
4010 			  unsigned char addr_type)
4011 {
4012 	ASSERT_RTNL();
4013 
4014 	if (from_dev->addr_len != to_dev->addr_len)
4015 		return -EINVAL;
4016 	__hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
4017 			       to_dev->addr_len, addr_type);
4018 	call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
4019 	return 0;
4020 }
4021 EXPORT_SYMBOL(dev_addr_del_multiple);
4022 
4023 /* multicast addresses handling functions */
4024 
4025 int __dev_addr_delete(struct dev_addr_list **list, int *count,
4026 		      void *addr, int alen, int glbl)
4027 {
4028 	struct dev_addr_list *da;
4029 
4030 	for (; (da = *list) != NULL; list = &da->next) {
4031 		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
4032 		    alen == da->da_addrlen) {
4033 			if (glbl) {
4034 				int old_glbl = da->da_gusers;
4035 				da->da_gusers = 0;
4036 				if (old_glbl == 0)
4037 					break;
4038 			}
4039 			if (--da->da_users)
4040 				return 0;
4041 
4042 			*list = da->next;
4043 			kfree(da);
4044 			(*count)--;
4045 			return 0;
4046 		}
4047 	}
4048 	return -ENOENT;
4049 }
4050 
4051 int __dev_addr_add(struct dev_addr_list **list, int *count,
4052 		   void *addr, int alen, int glbl)
4053 {
4054 	struct dev_addr_list *da;
4055 
4056 	for (da = *list; da != NULL; da = da->next) {
4057 		if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
4058 		    da->da_addrlen == alen) {
4059 			if (glbl) {
4060 				int old_glbl = da->da_gusers;
4061 				da->da_gusers = 1;
4062 				if (old_glbl)
4063 					return 0;
4064 			}
4065 			da->da_users++;
4066 			return 0;
4067 		}
4068 	}
4069 
4070 	da = kzalloc(sizeof(*da), GFP_ATOMIC);
4071 	if (da == NULL)
4072 		return -ENOMEM;
4073 	memcpy(da->da_addr, addr, alen);
4074 	da->da_addrlen = alen;
4075 	da->da_users = 1;
4076 	da->da_gusers = glbl ? 1 : 0;
4077 	da->next = *list;
4078 	*list = da;
4079 	(*count)++;
4080 	return 0;
4081 }
4082 
4083 /**
4084  *	dev_unicast_delete	- Release secondary unicast address.
4085  *	@dev: device
4086  *	@addr: address to delete
4087  *
4088  *	Release reference to a secondary unicast address and remove it
4089  *	from the device if the reference count drops to zero.
4090  *
4091  * 	The caller must hold the rtnl_mutex.
4092  */
4093 int dev_unicast_delete(struct net_device *dev, void *addr)
4094 {
4095 	int err;
4096 
4097 	ASSERT_RTNL();
4098 
4099 	netif_addr_lock_bh(dev);
4100 	err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
4101 			    NETDEV_HW_ADDR_T_UNICAST);
4102 	if (!err)
4103 		__dev_set_rx_mode(dev);
4104 	netif_addr_unlock_bh(dev);
4105 	return err;
4106 }
4107 EXPORT_SYMBOL(dev_unicast_delete);
4108 
4109 /**
4110  *	dev_unicast_add		- add a secondary unicast address
4111  *	@dev: device
4112  *	@addr: address to add
4113  *
4114  *	Add a secondary unicast address to the device or increase
4115  *	the reference count if it already exists.
4116  *
4117  *	The caller must hold the rtnl_mutex.
4118  */
4119 int dev_unicast_add(struct net_device *dev, void *addr)
4120 {
4121 	int err;
4122 
4123 	ASSERT_RTNL();
4124 
4125 	netif_addr_lock_bh(dev);
4126 	err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
4127 			    NETDEV_HW_ADDR_T_UNICAST);
4128 	if (!err)
4129 		__dev_set_rx_mode(dev);
4130 	netif_addr_unlock_bh(dev);
4131 	return err;
4132 }
4133 EXPORT_SYMBOL(dev_unicast_add);
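/*
 * Illustrative sketch (hypothetical): taking and then releasing a reference
 * on a secondary unicast address.  The helper name and parameters are
 * assumptions for the example; addr must be dev->addr_len bytes long.
 */
#if 0
static int example_secondary_unicast(struct net_device *dev, unsigned char *addr)
{
	int err;

	rtnl_lock();
	err = dev_unicast_add(dev, addr);	/* grabs a reference */
	if (!err)
		err = dev_unicast_delete(dev, addr);	/* drops it again */
	rtnl_unlock();
	return err;
}
#endif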
4134 
4135 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
4136 		    struct dev_addr_list **from, int *from_count)
4137 {
4138 	struct dev_addr_list *da, *next;
4139 	int err = 0;
4140 
4141 	da = *from;
4142 	while (da != NULL) {
4143 		next = da->next;
4144 		if (!da->da_synced) {
4145 			err = __dev_addr_add(to, to_count,
4146 					     da->da_addr, da->da_addrlen, 0);
4147 			if (err < 0)
4148 				break;
4149 			da->da_synced = 1;
4150 			da->da_users++;
4151 		} else if (da->da_users == 1) {
4152 			__dev_addr_delete(to, to_count,
4153 					  da->da_addr, da->da_addrlen, 0);
4154 			__dev_addr_delete(from, from_count,
4155 					  da->da_addr, da->da_addrlen, 0);
4156 		}
4157 		da = next;
4158 	}
4159 	return err;
4160 }
4161 EXPORT_SYMBOL_GPL(__dev_addr_sync);
4162 
4163 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
4164 		       struct dev_addr_list **from, int *from_count)
4165 {
4166 	struct dev_addr_list *da, *next;
4167 
4168 	da = *from;
4169 	while (da != NULL) {
4170 		next = da->next;
4171 		if (da->da_synced) {
4172 			__dev_addr_delete(to, to_count,
4173 					  da->da_addr, da->da_addrlen, 0);
4174 			da->da_synced = 0;
4175 			__dev_addr_delete(from, from_count,
4176 					  da->da_addr, da->da_addrlen, 0);
4177 		}
4178 		da = next;
4179 	}
4180 }
4181 EXPORT_SYMBOL_GPL(__dev_addr_unsync);
4182 
4183 /**
4184  *	dev_unicast_sync - Synchronize device's unicast list to another device
4185  *	@to: destination device
4186  *	@from: source device
4187  *
4188  *	Add newly added addresses to the destination device and release
4189  *	addresses that have no users left. The source device must be
4190  *	locked by netif_addr_lock_bh.
4191  *
4192  *	This function is intended to be called from the dev->set_rx_mode
4193  *	function of layered software devices.
4194  */
4195 int dev_unicast_sync(struct net_device *to, struct net_device *from)
4196 {
4197 	int err = 0;
4198 
4199 	if (to->addr_len != from->addr_len)
4200 		return -EINVAL;
4201 
4202 	netif_addr_lock_bh(to);
4203 	err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
4204 	if (!err)
4205 		__dev_set_rx_mode(to);
4206 	netif_addr_unlock_bh(to);
4207 	return err;
4208 }
4209 EXPORT_SYMBOL(dev_unicast_sync);
4210 
4211 /**
4212  *	dev_unicast_unsync - Remove synchronized addresses from the destination device
4213  *	@to: destination device
4214  *	@from: source device
4215  *
4216  *	Remove all addresses that were added to the destination device by
4217  *	dev_unicast_sync(). This function is intended to be called from the
4218  *	dev->stop function of layered software devices.
4219  */
4220 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
4221 {
4222 	if (to->addr_len != from->addr_len)
4223 		return;
4224 
4225 	netif_addr_lock_bh(from);
4226 	netif_addr_lock(to);
4227 	__hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
4228 	__dev_set_rx_mode(to);
4229 	netif_addr_unlock(to);
4230 	netif_addr_unlock_bh(from);
4231 }
4232 EXPORT_SYMBOL(dev_unicast_unsync);
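/*
 * Illustrative sketch (hypothetical): a stacked "upper" device propagating
 * its unicast and multicast filters to the real lower device, in the style
 * of 8021q or macvlan.  struct upper_priv and both callbacks are assumptions
 * for the example only.
 */
#if 0
struct upper_priv {
	struct net_device *lowerdev;
};

static void upper_set_rx_mode(struct net_device *dev)	/* ndo_set_rx_mode */
{
	struct upper_priv *priv = netdev_priv(dev);

	dev_unicast_sync(priv->lowerdev, dev);
	dev_mc_sync(priv->lowerdev, dev);
}

static int upper_stop(struct net_device *dev)		/* ndo_stop */
{
	struct upper_priv *priv = netdev_priv(dev);

	dev_mc_unsync(priv->lowerdev, dev);
	dev_unicast_unsync(priv->lowerdev, dev);
	return 0;
}
#endif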
4233 
4234 static void dev_unicast_flush(struct net_device *dev)
4235 {
4236 	netif_addr_lock_bh(dev);
4237 	__hw_addr_flush(&dev->uc);
4238 	netif_addr_unlock_bh(dev);
4239 }
4240 
4241 static void dev_unicast_init(struct net_device *dev)
4242 {
4243 	__hw_addr_init(&dev->uc);
4244 }
4245 
4246 
4247 static void __dev_addr_discard(struct dev_addr_list **list)
4248 {
4249 	struct dev_addr_list *tmp;
4250 
4251 	while (*list != NULL) {
4252 		tmp = *list;
4253 		*list = tmp->next;
4254 		if (tmp->da_users > tmp->da_gusers)
4255 			printk("__dev_addr_discard: address leakage! "
4256 			       "da_users=%d\n", tmp->da_users);
4257 		kfree(tmp);
4258 	}
4259 }
4260 
4261 static void dev_addr_discard(struct net_device *dev)
4262 {
4263 	netif_addr_lock_bh(dev);
4264 
4265 	__dev_addr_discard(&dev->mc_list);
4266 	netdev_mc_count(dev) = 0;
4267 
4268 	netif_addr_unlock_bh(dev);
4269 }
4270 
4271 /**
4272  *	dev_get_flags - get flags reported to userspace
4273  *	@dev: device
4274  *
4275  *	Get the combination of flag bits exported through APIs to userspace.
4276  */
4277 unsigned dev_get_flags(const struct net_device *dev)
4278 {
4279 	unsigned flags;
4280 
4281 	flags = (dev->flags & ~(IFF_PROMISC |
4282 				IFF_ALLMULTI |
4283 				IFF_RUNNING |
4284 				IFF_LOWER_UP |
4285 				IFF_DORMANT)) |
4286 		(dev->gflags & (IFF_PROMISC |
4287 				IFF_ALLMULTI));
4288 
4289 	if (netif_running(dev)) {
4290 		if (netif_oper_up(dev))
4291 			flags |= IFF_RUNNING;
4292 		if (netif_carrier_ok(dev))
4293 			flags |= IFF_LOWER_UP;
4294 		if (netif_dormant(dev))
4295 			flags |= IFF_DORMANT;
4296 	}
4297 
4298 	return flags;
4299 }
4300 EXPORT_SYMBOL(dev_get_flags);
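/*
 * Illustrative sketch (hypothetical): reading the userspace-visible flags of
 * a device and reporting two of them.  The helper name is an assumption for
 * the example.
 */
#if 0
static void example_report_flags(const struct net_device *dev)
{
	unsigned flags = dev_get_flags(dev);

	printk(KERN_DEBUG "%s: %s, lower layer %s\n", dev->name,
	       (flags & IFF_UP) ? "up" : "down",
	       (flags & IFF_LOWER_UP) ? "up" : "down");
}
#endif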
4301 
4302 /**
4303  *	dev_change_flags - change device settings
4304  *	@dev: device
4305  *	@flags: device state flags
4306  *
4307  *	Change settings on device based state flags. The flags are
4308  *	in the userspace exported format.
4309  */
4310 int dev_change_flags(struct net_device *dev, unsigned flags)
4311 {
4312 	int ret, changes;
4313 	int old_flags = dev->flags;
4314 
4315 	ASSERT_RTNL();
4316 
4317 	/*
4318 	 *	Set the flags on our device.
4319 	 */
4320 
4321 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4322 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4323 			       IFF_AUTOMEDIA)) |
4324 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4325 				    IFF_ALLMULTI));
4326 
4327 	/*
4328 	 *	Load in the correct multicast list now the flags have changed.
4329 	 */
4330 
4331 	if ((old_flags ^ flags) & IFF_MULTICAST)
4332 		dev_change_rx_flags(dev, IFF_MULTICAST);
4333 
4334 	dev_set_rx_mode(dev);
4335 
4336 	/*
4337 	 *	Have we downed the interface? We handle IFF_UP ourselves
4338 	 *	according to user attempts to set it, rather than blindly
4339 	 *	setting it.
4340 	 */
4341 
4342 	ret = 0;
4343 	if ((old_flags ^ flags) & IFF_UP) {	/* Bit is different  ? */
4344 		ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
4345 
4346 		if (!ret)
4347 			dev_set_rx_mode(dev);
4348 	}
4349 
4350 	if (dev->flags & IFF_UP &&
4351 	    ((old_flags ^ dev->flags) & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
4352 					  IFF_VOLATILE)))
4353 		call_netdevice_notifiers(NETDEV_CHANGE, dev);
4354 
4355 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
4356 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
4357 
4358 		dev->gflags ^= IFF_PROMISC;
4359 		dev_set_promiscuity(dev, inc);
4360 	}
4361 
4362 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4363 	   is important. Some (broken) drivers set IFF_PROMISC when
4364 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
4365 	 */
4366 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4367 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4368 
4369 		dev->gflags ^= IFF_ALLMULTI;
4370 		dev_set_allmulti(dev, inc);
4371 	}
4372 
4373 	/* Exclude state transition flags, already notified */
4374 	changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
4375 	if (changes)
4376 		rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4377 
4378 	return ret;
4379 }
4380 EXPORT_SYMBOL(dev_change_flags);
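/*
 * Illustrative sketch (hypothetical): bringing an interface administratively
 * up by setting IFF_UP through dev_change_flags(), which expects flags in
 * the userspace-exported format returned by dev_get_flags().
 */
#if 0
static int example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();			/* dev_change_flags() asserts RTNL */
	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
	rtnl_unlock();
	return err;
}
#endif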
4381 
4382 /**
4383  *	dev_set_mtu - Change maximum transfer unit
4384  *	@dev: device
4385  *	@new_mtu: new transfer unit
4386  *
4387  *	Change the maximum transfer size of the network device.
4388  */
4389 int dev_set_mtu(struct net_device *dev, int new_mtu)
4390 {
4391 	const struct net_device_ops *ops = dev->netdev_ops;
4392 	int err;
4393 
4394 	if (new_mtu == dev->mtu)
4395 		return 0;
4396 
4397 	/*	MTU must be positive.	 */
4398 	if (new_mtu < 0)
4399 		return -EINVAL;
4400 
4401 	if (!netif_device_present(dev))
4402 		return -ENODEV;
4403 
4404 	err = 0;
4405 	if (ops->ndo_change_mtu)
4406 		err = ops->ndo_change_mtu(dev, new_mtu);
4407 	else
4408 		dev->mtu = new_mtu;
4409 
4410 	if (!err && dev->flags & IFF_UP)
4411 		call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4412 	return err;
4413 }
4414 EXPORT_SYMBOL(dev_set_mtu);
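/*
 * Illustrative sketch (hypothetical): changing a device's MTU from kernel
 * code.  9000 is just an example value; the RTNL lock is taken because the
 * usual callers (dev_ifsioc, rtnetlink) hold it.
 */
#if 0
static int example_set_jumbo_mtu(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 9000);
	rtnl_unlock();
	return err;
}
#endif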
4415 
4416 /**
4417  *	dev_set_mac_address - Change Media Access Control Address
4418  *	@dev: device
4419  *	@sa: new address
4420  *
4421  *	Change the hardware (MAC) address of the device
4422  */
4423 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4424 {
4425 	const struct net_device_ops *ops = dev->netdev_ops;
4426 	int err;
4427 
4428 	if (!ops->ndo_set_mac_address)
4429 		return -EOPNOTSUPP;
4430 	if (sa->sa_family != dev->type)
4431 		return -EINVAL;
4432 	if (!netif_device_present(dev))
4433 		return -ENODEV;
4434 	err = ops->ndo_set_mac_address(dev, sa);
4435 	if (!err)
4436 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4437 	return err;
4438 }
4439 EXPORT_SYMBOL(dev_set_mac_address);
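/*
 * Illustrative sketch (hypothetical): setting a new hardware address.  The
 * sockaddr family must match dev->type (e.g. ARPHRD_ETHER) and the address
 * must be dev->addr_len bytes long.  The helper name is an assumption for
 * the example.
 */
#if 0
static int example_set_mac(struct net_device *dev, const unsigned char *mac)
{
	struct sockaddr sa;
	int err;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, mac, dev->addr_len);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();
	return err;
}
#endif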
4440 
4441 /*
4442  *	Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4443  */
4444 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4445 {
4446 	int err;
4447 	struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4448 
4449 	if (!dev)
4450 		return -ENODEV;
4451 
4452 	switch (cmd) {
4453 	case SIOCGIFFLAGS:	/* Get interface flags */
4454 		ifr->ifr_flags = (short) dev_get_flags(dev);
4455 		return 0;
4456 
4457 	case SIOCGIFMETRIC:	/* Get the metric on the interface
4458 				   (currently unused) */
4459 		ifr->ifr_metric = 0;
4460 		return 0;
4461 
4462 	case SIOCGIFMTU:	/* Get the MTU of a device */
4463 		ifr->ifr_mtu = dev->mtu;
4464 		return 0;
4465 
4466 	case SIOCGIFHWADDR:
4467 		if (!dev->addr_len)
4468 			memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4469 		else
4470 			memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4471 			       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4472 		ifr->ifr_hwaddr.sa_family = dev->type;
4473 		return 0;
4474 
4475 	case SIOCGIFSLAVE:
4476 		err = -EINVAL;
4477 		break;
4478 
4479 	case SIOCGIFMAP:
4480 		ifr->ifr_map.mem_start = dev->mem_start;
4481 		ifr->ifr_map.mem_end   = dev->mem_end;
4482 		ifr->ifr_map.base_addr = dev->base_addr;
4483 		ifr->ifr_map.irq       = dev->irq;
4484 		ifr->ifr_map.dma       = dev->dma;
4485 		ifr->ifr_map.port      = dev->if_port;
4486 		return 0;
4487 
4488 	case SIOCGIFINDEX:
4489 		ifr->ifr_ifindex = dev->ifindex;
4490 		return 0;
4491 
4492 	case SIOCGIFTXQLEN:
4493 		ifr->ifr_qlen = dev->tx_queue_len;
4494 		return 0;
4495 
4496 	default:
4497 		/* dev_ioctl() should ensure this case
4498 		 * is never reached
4499 		 */
4500 		WARN_ON(1);
4501 		err = -EINVAL;
4502 		break;
4503 
4504 	}
4505 	return err;
4506 }
4507 
4508 /*
4509  *	Perform the SIOCxIFxxx calls, inside rtnl_lock()
4510  */
4511 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4512 {
4513 	int err;
4514 	struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4515 	const struct net_device_ops *ops;
4516 
4517 	if (!dev)
4518 		return -ENODEV;
4519 
4520 	ops = dev->netdev_ops;
4521 
4522 	switch (cmd) {
4523 	case SIOCSIFFLAGS:	/* Set interface flags */
4524 		return dev_change_flags(dev, ifr->ifr_flags);
4525 
4526 	case SIOCSIFMETRIC:	/* Set the metric on the interface
4527 				   (currently unused) */
4528 		return -EOPNOTSUPP;
4529 
4530 	case SIOCSIFMTU:	/* Set the MTU of a device */
4531 		return dev_set_mtu(dev, ifr->ifr_mtu);
4532 
4533 	case SIOCSIFHWADDR:
4534 		return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4535 
4536 	case SIOCSIFHWBROADCAST:
4537 		if (ifr->ifr_hwaddr.sa_family != dev->type)
4538 			return -EINVAL;
4539 		memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4540 		       min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4541 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4542 		return 0;
4543 
4544 	case SIOCSIFMAP:
4545 		if (ops->ndo_set_config) {
4546 			if (!netif_device_present(dev))
4547 				return -ENODEV;
4548 			return ops->ndo_set_config(dev, &ifr->ifr_map);
4549 		}
4550 		return -EOPNOTSUPP;
4551 
4552 	case SIOCADDMULTI:
4553 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4554 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4555 			return -EINVAL;
4556 		if (!netif_device_present(dev))
4557 			return -ENODEV;
4558 		return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
4559 				  dev->addr_len, 1);
4560 
4561 	case SIOCDELMULTI:
4562 		if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4563 		    ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4564 			return -EINVAL;
4565 		if (!netif_device_present(dev))
4566 			return -ENODEV;
4567 		return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
4568 				     dev->addr_len, 1);
4569 
4570 	case SIOCSIFTXQLEN:
4571 		if (ifr->ifr_qlen < 0)
4572 			return -EINVAL;
4573 		dev->tx_queue_len = ifr->ifr_qlen;
4574 		return 0;
4575 
4576 	case SIOCSIFNAME:
4577 		ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4578 		return dev_change_name(dev, ifr->ifr_newname);
4579 
4580 	/*
4581 	 *	Unknown or private ioctl
4582 	 */
4583 	default:
4584 		if ((cmd >= SIOCDEVPRIVATE &&
4585 		    cmd <= SIOCDEVPRIVATE + 15) ||
4586 		    cmd == SIOCBONDENSLAVE ||
4587 		    cmd == SIOCBONDRELEASE ||
4588 		    cmd == SIOCBONDSETHWADDR ||
4589 		    cmd == SIOCBONDSLAVEINFOQUERY ||
4590 		    cmd == SIOCBONDINFOQUERY ||
4591 		    cmd == SIOCBONDCHANGEACTIVE ||
4592 		    cmd == SIOCGMIIPHY ||
4593 		    cmd == SIOCGMIIREG ||
4594 		    cmd == SIOCSMIIREG ||
4595 		    cmd == SIOCBRADDIF ||
4596 		    cmd == SIOCBRDELIF ||
4597 		    cmd == SIOCSHWTSTAMP ||
4598 		    cmd == SIOCWANDEV) {
4599 			err = -EOPNOTSUPP;
4600 			if (ops->ndo_do_ioctl) {
4601 				if (netif_device_present(dev))
4602 					err = ops->ndo_do_ioctl(dev, ifr, cmd);
4603 				else
4604 					err = -ENODEV;
4605 			}
4606 		} else
4607 			err = -EINVAL;
4608 
4609 	}
4610 	return err;
4611 }
4612 
4613 /*
4614  *	This function handles all "interface"-type I/O control requests. The actual
4615  *	'doing' part of this is dev_ifsioc above.
4616  */
4617 
4618 /**
4619  *	dev_ioctl	-	network device ioctl
4620  *	@net: the applicable net namespace
4621  *	@cmd: command to issue
4622  *	@arg: pointer to a struct ifreq in user space
4623  *
4624  *	Issue ioctl functions to devices. This is normally called by the
4625  *	user space syscall interfaces but can sometimes be useful for
4626  *	other purposes. The return value is the return from the syscall if
4627  *	positive or a negative errno code on error.
4628  */
4629 
4630 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4631 {
4632 	struct ifreq ifr;
4633 	int ret;
4634 	char *colon;
4635 
4636 	/* One special case: SIOCGIFCONF takes ifconf argument
4637 	   and requires shared lock, because it sleeps writing
4638 	   to user space.
4639 	 */
4640 
4641 	if (cmd == SIOCGIFCONF) {
4642 		rtnl_lock();
4643 		ret = dev_ifconf(net, (char __user *) arg);
4644 		rtnl_unlock();
4645 		return ret;
4646 	}
4647 	if (cmd == SIOCGIFNAME)
4648 		return dev_ifname(net, (struct ifreq __user *)arg);
4649 
4650 	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4651 		return -EFAULT;
4652 
4653 	ifr.ifr_name[IFNAMSIZ-1] = 0;
4654 
4655 	colon = strchr(ifr.ifr_name, ':');
4656 	if (colon)
4657 		*colon = 0;
4658 
4659 	/*
4660 	 *	See which interface the caller is talking about.
4661 	 */
4662 
4663 	switch (cmd) {
4664 	/*
4665 	 *	These ioctl calls:
4666 	 *	- can be done by all.
4667 	 *	- atomic and do not require locking.
4668 	 *	- return a value
4669 	 */
4670 	case SIOCGIFFLAGS:
4671 	case SIOCGIFMETRIC:
4672 	case SIOCGIFMTU:
4673 	case SIOCGIFHWADDR:
4674 	case SIOCGIFSLAVE:
4675 	case SIOCGIFMAP:
4676 	case SIOCGIFINDEX:
4677 	case SIOCGIFTXQLEN:
4678 		dev_load(net, ifr.ifr_name);
4679 		rcu_read_lock();
4680 		ret = dev_ifsioc_locked(net, &ifr, cmd);
4681 		rcu_read_unlock();
4682 		if (!ret) {
4683 			if (colon)
4684 				*colon = ':';
4685 			if (copy_to_user(arg, &ifr,
4686 					 sizeof(struct ifreq)))
4687 				ret = -EFAULT;
4688 		}
4689 		return ret;
4690 
4691 	case SIOCETHTOOL:
4692 		dev_load(net, ifr.ifr_name);
4693 		rtnl_lock();
4694 		ret = dev_ethtool(net, &ifr);
4695 		rtnl_unlock();
4696 		if (!ret) {
4697 			if (colon)
4698 				*colon = ':';
4699 			if (copy_to_user(arg, &ifr,
4700 					 sizeof(struct ifreq)))
4701 				ret = -EFAULT;
4702 		}
4703 		return ret;
4704 
4705 	/*
4706 	 *	These ioctl calls:
4707 	 *	- require superuser power.
4708 	 *	- require strict serialization.
4709 	 *	- return a value
4710 	 */
4711 	case SIOCGMIIPHY:
4712 	case SIOCGMIIREG:
4713 	case SIOCSIFNAME:
4714 		if (!capable(CAP_NET_ADMIN))
4715 			return -EPERM;
4716 		dev_load(net, ifr.ifr_name);
4717 		rtnl_lock();
4718 		ret = dev_ifsioc(net, &ifr, cmd);
4719 		rtnl_unlock();
4720 		if (!ret) {
4721 			if (colon)
4722 				*colon = ':';
4723 			if (copy_to_user(arg, &ifr,
4724 					 sizeof(struct ifreq)))
4725 				ret = -EFAULT;
4726 		}
4727 		return ret;
4728 
4729 	/*
4730 	 *	These ioctl calls:
4731 	 *	- require superuser power.
4732 	 *	- require strict serialization.
4733 	 *	- do not return a value
4734 	 */
4735 	case SIOCSIFFLAGS:
4736 	case SIOCSIFMETRIC:
4737 	case SIOCSIFMTU:
4738 	case SIOCSIFMAP:
4739 	case SIOCSIFHWADDR:
4740 	case SIOCSIFSLAVE:
4741 	case SIOCADDMULTI:
4742 	case SIOCDELMULTI:
4743 	case SIOCSIFHWBROADCAST:
4744 	case SIOCSIFTXQLEN:
4745 	case SIOCSMIIREG:
4746 	case SIOCBONDENSLAVE:
4747 	case SIOCBONDRELEASE:
4748 	case SIOCBONDSETHWADDR:
4749 	case SIOCBONDCHANGEACTIVE:
4750 	case SIOCBRADDIF:
4751 	case SIOCBRDELIF:
4752 	case SIOCSHWTSTAMP:
4753 		if (!capable(CAP_NET_ADMIN))
4754 			return -EPERM;
4755 		/* fall through */
4756 	case SIOCBONDSLAVEINFOQUERY:
4757 	case SIOCBONDINFOQUERY:
4758 		dev_load(net, ifr.ifr_name);
4759 		rtnl_lock();
4760 		ret = dev_ifsioc(net, &ifr, cmd);
4761 		rtnl_unlock();
4762 		return ret;
4763 
4764 	case SIOCGIFMEM:
4765 		/* Get the per device memory space. We can add this but
4766 		 * currently do not support it */
4767 	case SIOCSIFMEM:
4768 		/* Set the per device memory buffer space.
4769 		 * Not applicable in our case */
4770 	case SIOCSIFLINK:
4771 		return -EINVAL;
4772 
4773 	/*
4774 	 *	Unknown or private ioctl.
4775 	 */
4776 	default:
4777 		if (cmd == SIOCWANDEV ||
4778 		    (cmd >= SIOCDEVPRIVATE &&
4779 		     cmd <= SIOCDEVPRIVATE + 15)) {
4780 			dev_load(net, ifr.ifr_name);
4781 			rtnl_lock();
4782 			ret = dev_ifsioc(net, &ifr, cmd);
4783 			rtnl_unlock();
4784 			if (!ret && copy_to_user(arg, &ifr,
4785 						 sizeof(struct ifreq)))
4786 				ret = -EFAULT;
4787 			return ret;
4788 		}
4789 		/* Take care of Wireless Extensions */
4790 		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4791 			return wext_handle_ioctl(net, &ifr, cmd, arg);
4792 		return -EINVAL;
4793 	}
4794 }
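/*
 * Illustrative sketch (hypothetical): the userspace side of one of the
 * SIOCxIFxxx requests handled above.  This is a standalone program shown
 * only to clarify the interface; "eth0" is an arbitrary example name.
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

int main(void)
{
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)	/* routed to dev_ifsioc_locked() */
		printf("%s: mtu %d\n", ifr.ifr_name, ifr.ifr_mtu);
	close(fd);
	return 0;
}
#endif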
4795 
4796 
4797 /**
4798  *	dev_new_index	-	allocate an ifindex
4799  *	@net: the applicable net namespace
4800  *
4801  *	Returns a suitable unique value for a new device interface
4802  *	number.  The caller must hold the rtnl semaphore or the
4803  *	dev_base_lock to be sure it remains unique.
4804  */
4805 static int dev_new_index(struct net *net)
4806 {
4807 	static int ifindex;
4808 	for (;;) {
4809 		if (++ifindex <= 0)
4810 			ifindex = 1;
4811 		if (!__dev_get_by_index(net, ifindex))
4812 			return ifindex;
4813 	}
4814 }
4815 
4816 /* Delayed registration/unregisteration */
4817 static LIST_HEAD(net_todo_list);
4818 
4819 static void net_set_todo(struct net_device *dev)
4820 {
4821 	list_add_tail(&dev->todo_list, &net_todo_list);
4822 }
4823 
4824 static void rollback_registered_many(struct list_head *head)
4825 {
4826 	struct net_device *dev, *tmp;
4827 
4828 	BUG_ON(dev_boot_phase);
4829 	ASSERT_RTNL();
4830 
4831 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4832 		/* Some devices get unregistered here without ever having
4833 		 * been registered, as part of error unwinding during
4834 		 * initialization. Remove those and proceed with the rest.
4835 		 */
4836 		if (dev->reg_state == NETREG_UNINITIALIZED) {
4837 			pr_debug("unregister_netdevice: device %s/%p never "
4838 				 "was registered\n", dev->name, dev);
4839 
4840 			WARN_ON(1);
4841 			list_del(&dev->unreg_list);
4842 			continue;
4843 		}
4844 
4845 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
4846 
4847 		/* If device is running, close it first. */
4848 		dev_close(dev);
4849 
4850 		/* And unlink it from device chain. */
4851 		unlist_netdevice(dev);
4852 
4853 		dev->reg_state = NETREG_UNREGISTERING;
4854 	}
4855 
4856 	synchronize_net();
4857 
4858 	list_for_each_entry(dev, head, unreg_list) {
4859 		/* Shutdown queueing discipline. */
4860 		dev_shutdown(dev);
4861 
4862 
4863 		/* Notify protocols, that we are about to destroy
4864 		   this device. They should clean all the things.
4865 		*/
4866 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4867 
4868 		if (!dev->rtnl_link_ops ||
4869 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4870 			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4871 
4872 		/*
4873 		 *	Flush the unicast and multicast chains
4874 		 */
4875 		dev_unicast_flush(dev);
4876 		dev_addr_discard(dev);
4877 
4878 		if (dev->netdev_ops->ndo_uninit)
4879 			dev->netdev_ops->ndo_uninit(dev);
4880 
4881 		/* Notifier chain MUST detach us from master device. */
4882 		WARN_ON(dev->master);
4883 
4884 		/* Remove entries from kobject tree */
4885 		netdev_unregister_kobject(dev);
4886 	}
4887 
4888 	/* Process any work delayed until the end of the batch */
4889 	dev = list_first_entry(head, struct net_device, unreg_list);
4890 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
4891 
4892 	synchronize_net();
4893 
4894 	list_for_each_entry(dev, head, unreg_list)
4895 		dev_put(dev);
4896 }
4897 
4898 static void rollback_registered(struct net_device *dev)
4899 {
4900 	LIST_HEAD(single);
4901 
4902 	list_add(&dev->unreg_list, &single);
4903 	rollback_registered_many(&single);
4904 }
4905 
4906 static void __netdev_init_queue_locks_one(struct net_device *dev,
4907 					  struct netdev_queue *dev_queue,
4908 					  void *_unused)
4909 {
4910 	spin_lock_init(&dev_queue->_xmit_lock);
4911 	netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4912 	dev_queue->xmit_lock_owner = -1;
4913 }
4914 
4915 static void netdev_init_queue_locks(struct net_device *dev)
4916 {
4917 	netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4918 	__netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4919 }
4920 
4921 unsigned long netdev_fix_features(unsigned long features, const char *name)
4922 {
4923 	/* Fix illegal SG+CSUM combinations. */
4924 	if ((features & NETIF_F_SG) &&
4925 	    !(features & NETIF_F_ALL_CSUM)) {
4926 		if (name)
4927 			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4928 			       "checksum feature.\n", name);
4929 		features &= ~NETIF_F_SG;
4930 	}
4931 
4932 	/* TSO requires that SG is present as well. */
4933 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4934 		if (name)
4935 			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4936 			       "SG feature.\n", name);
4937 		features &= ~NETIF_F_TSO;
4938 	}
4939 
4940 	if (features & NETIF_F_UFO) {
4941 		if (!(features & NETIF_F_GEN_CSUM)) {
4942 			if (name)
4943 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4944 				       "since no NETIF_F_HW_CSUM feature.\n",
4945 				       name);
4946 			features &= ~NETIF_F_UFO;
4947 		}
4948 
4949 		if (!(features & NETIF_F_SG)) {
4950 			if (name)
4951 				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4952 				       "since no NETIF_F_SG feature.\n", name);
4953 			features &= ~NETIF_F_UFO;
4954 		}
4955 	}
4956 
4957 	return features;
4958 }
4959 EXPORT_SYMBOL(netdev_fix_features);
4960 
4961 /**
4962  *	netif_stacked_transfer_operstate -	transfer operstate
4963  *	@rootdev: the root or lower level device to transfer state from
4964  *	@dev: the device to transfer operstate to
4965  *
4966  *	Transfer operational state from root to device. This is normally
4967  *	called when a stacking relationship exists between the root
4968  *	device and the device (a leaf device).
4969  */
4970 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
4971 					struct net_device *dev)
4972 {
4973 	if (rootdev->operstate == IF_OPER_DORMANT)
4974 		netif_dormant_on(dev);
4975 	else
4976 		netif_dormant_off(dev);
4977 
4978 	if (netif_carrier_ok(rootdev)) {
4979 		if (!netif_carrier_ok(dev))
4980 			netif_carrier_on(dev);
4981 	} else {
4982 		if (netif_carrier_ok(dev))
4983 			netif_carrier_off(dev);
4984 	}
4985 }
4986 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
4987 
4988 /**
4989  *	register_netdevice	- register a network device
4990  *	@dev: device to register
4991  *
4992  *	Take a completed network device structure and add it to the kernel
4993  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4994  *	chain. 0 is returned on success. A negative errno code is returned
4995  *	on a failure to set up the device, or if the name is a duplicate.
4996  *
4997  *	Callers must hold the rtnl semaphore. You may want
4998  *	register_netdev() instead of this.
4999  *
5000  *	BUGS:
5001  *	The locking appears insufficient to guarantee two parallel registers
5002  *	will not get the same name.
5003  */
5004 
5005 int register_netdevice(struct net_device *dev)
5006 {
5007 	int ret;
5008 	struct net *net = dev_net(dev);
5009 
5010 	BUG_ON(dev_boot_phase);
5011 	ASSERT_RTNL();
5012 
5013 	might_sleep();
5014 
5015 	/* When net_device's are persistent, this will be fatal. */
5016 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5017 	BUG_ON(!net);
5018 
5019 	spin_lock_init(&dev->addr_list_lock);
5020 	netdev_set_addr_lockdep_class(dev);
5021 	netdev_init_queue_locks(dev);
5022 
5023 	dev->iflink = -1;
5024 
5025 	/* Init, if this function is available */
5026 	if (dev->netdev_ops->ndo_init) {
5027 		ret = dev->netdev_ops->ndo_init(dev);
5028 		if (ret) {
5029 			if (ret > 0)
5030 				ret = -EIO;
5031 			goto out;
5032 		}
5033 	}
5034 
5035 	ret = dev_get_valid_name(net, dev->name, dev->name, 0);
5036 	if (ret)
5037 		goto err_uninit;
5038 
5039 	dev->ifindex = dev_new_index(net);
5040 	if (dev->iflink == -1)
5041 		dev->iflink = dev->ifindex;
5042 
5043 	/* Fix illegal checksum combinations */
5044 	if ((dev->features & NETIF_F_HW_CSUM) &&
5045 	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5046 		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5047 		       dev->name);
5048 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5049 	}
5050 
5051 	if ((dev->features & NETIF_F_NO_CSUM) &&
5052 	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5053 		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5054 		       dev->name);
5055 		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5056 	}
5057 
5058 	dev->features = netdev_fix_features(dev->features, dev->name);
5059 
5060 	/* Enable software GSO if SG is supported. */
5061 	if (dev->features & NETIF_F_SG)
5062 		dev->features |= NETIF_F_GSO;
5063 
5064 	netdev_initialize_kobject(dev);
5065 
5066 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5067 	ret = notifier_to_errno(ret);
5068 	if (ret)
5069 		goto err_uninit;
5070 
5071 	ret = netdev_register_kobject(dev);
5072 	if (ret)
5073 		goto err_uninit;
5074 	dev->reg_state = NETREG_REGISTERED;
5075 
5076 	/*
5077 	 *	Default initial state at registration is that the
5078 	 *	device is present.
5079 	 */
5080 
5081 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5082 
5083 	dev_init_scheduler(dev);
5084 	dev_hold(dev);
5085 	list_netdevice(dev);
5086 
5087 	/* Notify protocols, that a new device appeared. */
5088 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5089 	ret = notifier_to_errno(ret);
5090 	if (ret) {
5091 		rollback_registered(dev);
5092 		dev->reg_state = NETREG_UNREGISTERED;
5093 	}
5094 	/*
5095 	 *	Prevent userspace races by waiting until the network
5096 	 *	device is fully setup before sending notifications.
5097 	 */
5098 	if (!dev->rtnl_link_ops ||
5099 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5100 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5101 
5102 out:
5103 	return ret;
5104 
5105 err_uninit:
5106 	if (dev->netdev_ops->ndo_uninit)
5107 		dev->netdev_ops->ndo_uninit(dev);
5108 	goto out;
5109 }
5110 EXPORT_SYMBOL(register_netdevice);
5111 
5112 /**
5113  *	init_dummy_netdev	- init a dummy network device for NAPI
5114  *	@dev: device to init
5115  *
5116  *	This takes a network device structure and initializes the minimum
5117  *	number of fields so it can be used to schedule NAPI polls without
5118  *	registering a full blown interface. This is to be used by drivers
5119  *	that need to tie several hardware interfaces to a single NAPI
5120  *	poll scheduler due to HW limitations.
5121  */
5122 int init_dummy_netdev(struct net_device *dev)
5123 {
5124 	/* Clear everything. Note we don't initialize spinlocks
5125 	 * as they aren't supposed to be taken by any of the
5126 	 * NAPI code and this dummy netdev is supposed to be
5127 	 * only ever used for NAPI polls
5128 	 */
5129 	memset(dev, 0, sizeof(struct net_device));
5130 
5131 	/* make sure we BUG if trying to hit standard
5132 	 * register/unregister code path
5133 	 */
5134 	dev->reg_state = NETREG_DUMMY;
5135 
5136 	/* initialize the ref count */
5137 	atomic_set(&dev->refcnt, 1);
5138 
5139 	/* NAPI wants this */
5140 	INIT_LIST_HEAD(&dev->napi_list);
5141 
5142 	/* a dummy interface is started by default */
5143 	set_bit(__LINK_STATE_PRESENT, &dev->state);
5144 	set_bit(__LINK_STATE_START, &dev->state);
5145 
5146 	return 0;
5147 }
5148 EXPORT_SYMBOL_GPL(init_dummy_netdev);
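/*
 * Illustrative sketch (hypothetical): a driver that multiplexes several
 * hardware ports behind a single NAPI context can hang that context off a
 * dummy netdev.  struct foo_adapter and foo_poll() are assumptions for the
 * example only.
 */
#if 0
struct foo_adapter {
	struct net_device backing_dev;	/* never registered, NAPI anchor only */
	struct napi_struct napi;
};

static int foo_poll(struct napi_struct *napi, int budget)
{
	/* ...service the hardware here, honouring the budget... */
	napi_complete(napi);
	return 0;
}

static void foo_setup_napi(struct foo_adapter *adapter)
{
	init_dummy_netdev(&adapter->backing_dev);
	netif_napi_add(&adapter->backing_dev, &adapter->napi, foo_poll, 64);
	napi_enable(&adapter->napi);
}
#endif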
5149 
5150 
5151 /**
5152  *	register_netdev	- register a network device
5153  *	@dev: device to register
5154  *
5155  *	Take a completed network device structure and add it to the kernel
5156  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5157  *	chain. 0 is returned on success. A negative errno code is returned
5158  *	on a failure to set up the device, or if the name is a duplicate.
5159  *
5160  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
5161  *	and expands the device name if you passed a format string to
5162  *	alloc_netdev.
5163  */
5164 int register_netdev(struct net_device *dev)
5165 {
5166 	int err;
5167 
5168 	rtnl_lock();
5169 
5170 	/*
5171 	 * If the name is a format string the caller wants us to do a
5172 	 * name allocation.
5173 	 */
5174 	if (strchr(dev->name, '%')) {
5175 		err = dev_alloc_name(dev, dev->name);
5176 		if (err < 0)
5177 			goto out;
5178 	}
5179 
5180 	err = register_netdevice(dev);
5181 out:
5182 	rtnl_unlock();
5183 	return err;
5184 }
5185 EXPORT_SYMBOL(register_netdev);
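/*
 * Illustrative sketch (hypothetical): the usual allocate/register lifecycle
 * as seen from a driver.  struct foo_priv, the "foo%d" name format and
 * ether_setup() as the setup callback are assumptions for the example.
 */
#if 0
struct foo_priv {
	int dummy;			/* hypothetical private state */
};

static struct net_device *foo_dev;

static int __init foo_init(void)
{
	int err;

	foo_dev = alloc_netdev_mq(sizeof(struct foo_priv), "foo%d",
				  ether_setup, 1);
	if (!foo_dev)
		return -ENOMEM;
	/* ...fill in foo_dev->netdev_ops, MAC address, features here... */
	err = register_netdev(foo_dev);	/* takes rtnl_lock() internally */
	if (err)
		free_netdev(foo_dev);
	return err;
}

static void __exit foo_exit(void)
{
	unregister_netdev(foo_dev);
	free_netdev(foo_dev);
}
#endif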
5186 
5187 /*
5188  * netdev_wait_allrefs - wait until all references are gone.
5189  *
5190  * This is called when unregistering network devices.
5191  *
5192  * Any protocol or device that holds a reference should register
5193  * for netdevice notification, and cleanup and put back the
5194  * reference if they receive an UNREGISTER event.
5195  * We can get stuck here if buggy protocols don't correctly
5196  * call dev_put.
5197  */
5198 static void netdev_wait_allrefs(struct net_device *dev)
5199 {
5200 	unsigned long rebroadcast_time, warning_time;
5201 
5202 	linkwatch_forget_dev(dev);
5203 
5204 	rebroadcast_time = warning_time = jiffies;
5205 	while (atomic_read(&dev->refcnt) != 0) {
5206 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5207 			rtnl_lock();
5208 
5209 			/* Rebroadcast unregister notification */
5210 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5211 			/* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5212 			 * should have already handled it the first time */
5213 
5214 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5215 				     &dev->state)) {
5216 				/* We must not have linkwatch events
5217 				 * pending on unregister. If this
5218 				 * happens, we simply run the queue
5219 				 * unscheduled, resulting in a noop
5220 				 * for this device.
5221 				 */
5222 				linkwatch_run_queue();
5223 			}
5224 
5225 			__rtnl_unlock();
5226 
5227 			rebroadcast_time = jiffies;
5228 		}
5229 
5230 		msleep(250);
5231 
5232 		if (time_after(jiffies, warning_time + 10 * HZ)) {
5233 			printk(KERN_EMERG "unregister_netdevice: "
5234 			       "waiting for %s to become free. Usage "
5235 			       "count = %d\n",
5236 			       dev->name, atomic_read(&dev->refcnt));
5237 			warning_time = jiffies;
5238 		}
5239 	}
5240 }
5241 
5242 /* The sequence is:
5243  *
5244  *	rtnl_lock();
5245  *	...
5246  *	register_netdevice(x1);
5247  *	register_netdevice(x2);
5248  *	...
5249  *	unregister_netdevice(y1);
5250  *	unregister_netdevice(y2);
5251  *      ...
5252  *	rtnl_unlock();
5253  *	free_netdev(y1);
5254  *	free_netdev(y2);
5255  *
5256  * We are invoked by rtnl_unlock().
5257  * This allows us to deal with problems:
5258  * 1) We can delete sysfs objects which invoke hotplug
5259  *    without deadlocking with linkwatch via keventd.
5260  * 2) Since we run with the RTNL semaphore not held, we can sleep
5261  *    safely in order to wait for the netdev refcnt to drop to zero.
5262  *
5263  * We must not return until all unregister events added during
5264  * the interval the lock was held have been completed.
5265  */
5266 void netdev_run_todo(void)
5267 {
5268 	struct list_head list;
5269 
5270 	/* Snapshot list, allow later requests */
5271 	list_replace_init(&net_todo_list, &list);
5272 
5273 	__rtnl_unlock();
5274 
5275 	while (!list_empty(&list)) {
5276 		struct net_device *dev
5277 			= list_first_entry(&list, struct net_device, todo_list);
5278 		list_del(&dev->todo_list);
5279 
5280 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5281 			printk(KERN_ERR "network todo '%s' but state %d\n",
5282 			       dev->name, dev->reg_state);
5283 			dump_stack();
5284 			continue;
5285 		}
5286 
5287 		dev->reg_state = NETREG_UNREGISTERED;
5288 
5289 		on_each_cpu(flush_backlog, dev, 1);
5290 
5291 		netdev_wait_allrefs(dev);
5292 
5293 		/* paranoia */
5294 		BUG_ON(atomic_read(&dev->refcnt));
5295 		WARN_ON(dev->ip_ptr);
5296 		WARN_ON(dev->ip6_ptr);
5297 		WARN_ON(dev->dn_ptr);
5298 
5299 		if (dev->destructor)
5300 			dev->destructor(dev);
5301 
5302 		/* Free network device */
5303 		kobject_put(&dev->dev.kobj);
5304 	}
5305 }
5306 
5307 /**
5308  *	dev_txq_stats_fold - fold tx_queues stats
5309  *	@dev: device to get statistics from
5310  *	@stats: struct net_device_stats to hold results
5311  */
5312 void dev_txq_stats_fold(const struct net_device *dev,
5313 			struct net_device_stats *stats)
5314 {
5315 	unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5316 	unsigned int i;
5317 	struct netdev_queue *txq;
5318 
5319 	for (i = 0; i < dev->num_tx_queues; i++) {
5320 		txq = netdev_get_tx_queue(dev, i);
5321 		tx_bytes   += txq->tx_bytes;
5322 		tx_packets += txq->tx_packets;
5323 		tx_dropped += txq->tx_dropped;
5324 	}
5325 	if (tx_bytes || tx_packets || tx_dropped) {
5326 		stats->tx_bytes   = tx_bytes;
5327 		stats->tx_packets = tx_packets;
5328 		stats->tx_dropped = tx_dropped;
5329 	}
5330 }
5331 EXPORT_SYMBOL(dev_txq_stats_fold);
5332 
5333 /**
5334  *	dev_get_stats	- get network device statistics
5335  *	@dev: device to get statistics from
5336  *
5337  *	Get network statistics from device. The device driver may provide
5338  *	its own method by setting dev->netdev_ops->get_stats; otherwise
5339  *	the internal statistics structure is used.
5340  */
5341 const struct net_device_stats *dev_get_stats(struct net_device *dev)
5342 {
5343 	const struct net_device_ops *ops = dev->netdev_ops;
5344 
5345 	if (ops->ndo_get_stats)
5346 		return ops->ndo_get_stats(dev);
5347 
5348 	dev_txq_stats_fold(dev, &dev->stats);
5349 	return &dev->stats;
5350 }
5351 EXPORT_SYMBOL(dev_get_stats);
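/*
 * Illustrative sketch (hypothetical): a driver overriding the default
 * statistics path by supplying ndo_get_stats.  foo_get_stats() and the ops
 * table are assumptions for the example; drivers without hardware counters
 * can simply omit the hook and rely on dev->stats as above.
 */
#if 0
static struct net_device_stats *foo_get_stats(struct net_device *dev)
{
	/* fold any hardware counters into dev->stats here */
	return &dev->stats;
}

static const struct net_device_ops foo_netdev_ops = {
	.ndo_get_stats	= foo_get_stats,
	/* ...the remaining ops are elided... */
};
#endif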
5352 
5353 static void netdev_init_one_queue(struct net_device *dev,
5354 				  struct netdev_queue *queue,
5355 				  void *_unused)
5356 {
5357 	queue->dev = dev;
5358 }
5359 
5360 static void netdev_init_queues(struct net_device *dev)
5361 {
5362 	netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5363 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5364 	spin_lock_init(&dev->tx_global_lock);
5365 }
5366 
5367 /**
5368  *	alloc_netdev_mq - allocate network device
5369  *	@sizeof_priv:	size of private data to allocate space for
5370  *	@name:		device name format string
5371  *	@setup:		callback to initialize device
5372  *	@queue_count:	the number of subqueues to allocate
5373  *
5374  *	Allocates a struct net_device with private data area for driver use
5375  *	and performs basic initialization.  Also allocates subqueue structs
5376  *	for each queue on the device at the end of the netdevice.
5377  */
5378 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5379 		void (*setup)(struct net_device *), unsigned int queue_count)
5380 {
5381 	struct netdev_queue *tx;
5382 	struct net_device *dev;
5383 	size_t alloc_size;
5384 	struct net_device *p;
5385 
5386 	BUG_ON(strlen(name) >= sizeof(dev->name));
5387 
5388 	alloc_size = sizeof(struct net_device);
5389 	if (sizeof_priv) {
5390 		/* ensure 32-byte alignment of private area */
5391 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5392 		alloc_size += sizeof_priv;
5393 	}
5394 	/* ensure 32-byte alignment of whole construct */
5395 	alloc_size += NETDEV_ALIGN - 1;
5396 
5397 	p = kzalloc(alloc_size, GFP_KERNEL);
5398 	if (!p) {
5399 		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5400 		return NULL;
5401 	}
5402 
5403 	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5404 	if (!tx) {
5405 		printk(KERN_ERR "alloc_netdev: Unable to allocate "
5406 		       "tx qdiscs.\n");
5407 		goto free_p;
5408 	}
5409 
5410 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
5411 	dev->padded = (char *)dev - (char *)p;
5412 
5413 	if (dev_addr_init(dev))
5414 		goto free_tx;
5415 
5416 	dev_unicast_init(dev);
5417 
5418 	dev_net_set(dev, &init_net);
5419 
5420 	dev->_tx = tx;
5421 	dev->num_tx_queues = queue_count;
5422 	dev->real_num_tx_queues = queue_count;
5423 
5424 	dev->gso_max_size = GSO_MAX_SIZE;
5425 
5426 	netdev_init_queues(dev);
5427 
5428 	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5429 	dev->ethtool_ntuple_list.count = 0;
5430 	INIT_LIST_HEAD(&dev->napi_list);
5431 	INIT_LIST_HEAD(&dev->unreg_list);
5432 	INIT_LIST_HEAD(&dev->link_watch_list);
5433 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
5434 	setup(dev);
5435 	strcpy(dev->name, name);
5436 	return dev;
5437 
5438 free_tx:
5439 	kfree(tx);
5440 
5441 free_p:
5442 	kfree(p);
5443 	return NULL;
5444 }
5445 EXPORT_SYMBOL(alloc_netdev_mq);
5446 
5447 /**
5448  *	free_netdev - free network device
5449  *	@dev: device
5450  *
5451  *	This function does the last stage of destroying an allocated device
5452  * 	interface. The reference to the device object is released.
5453  *	If this is the last reference then it will be freed.
5454  */
5455 void free_netdev(struct net_device *dev)
5456 {
5457 	struct napi_struct *p, *n;
5458 
5459 	release_net(dev_net(dev));
5460 
5461 	kfree(dev->_tx);
5462 
5463 	/* Flush device addresses */
5464 	dev_addr_flush(dev);
5465 
5466 	/* Clear ethtool n-tuple list */
5467 	ethtool_ntuple_flush(dev);
5468 
5469 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5470 		netif_napi_del(p);
5471 
5472 	/*  Compatibility with error handling in drivers */
5473 	if (dev->reg_state == NETREG_UNINITIALIZED) {
5474 		kfree((char *)dev - dev->padded);
5475 		return;
5476 	}
5477 
5478 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5479 	dev->reg_state = NETREG_RELEASED;
5480 
5481 	/* will free via device release */
5482 	put_device(&dev->dev);
5483 }
5484 EXPORT_SYMBOL(free_netdev);
5485 
5486 /**
5487  *	synchronize_net -  Synchronize with packet receive processing
5488  *
5489  *	Wait for packets currently being received to be done.
5490  *	Does not block later packets from starting.
5491  */
5492 void synchronize_net(void)
5493 {
5494 	might_sleep();
5495 	synchronize_rcu();
5496 }
5497 EXPORT_SYMBOL(synchronize_net);
5498 
5499 /**
5500  *	unregister_netdevice_queue - remove device from the kernel
5501  *	@dev: device
5502  *	@head: list to queue the device on, or NULL to unregister immediately
5503  *
5504  *	This function shuts down a device interface and removes it
5505  *	from the kernel tables.
5506  *	If head is not NULL, the device is queued to be unregistered later.
5507  *
5508  *	Callers must hold the rtnl semaphore.  You may want
5509  *	unregister_netdev() instead of this.
5510  */
5511 
5512 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5513 {
5514 	ASSERT_RTNL();
5515 
5516 	if (head) {
5517 		list_move_tail(&dev->unreg_list, head);
5518 	} else {
5519 		rollback_registered(dev);
5520 		/* Finish processing unregister after unlock */
5521 		net_set_todo(dev);
5522 	}
5523 }
5524 EXPORT_SYMBOL(unregister_netdevice_queue);
5525 
5526 /**
5527  *	unregister_netdevice_many - unregister many devices
5528  *	@head: list of devices
5529  */
5530 void unregister_netdevice_many(struct list_head *head)
5531 {
5532 	struct net_device *dev;
5533 
5534 	if (!list_empty(head)) {
5535 		rollback_registered_many(head);
5536 		list_for_each_entry(dev, head, unreg_list)
5537 			net_set_todo(dev);
5538 	}
5539 }
5540 EXPORT_SYMBOL(unregister_netdevice_many);
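/*
 * Illustrative sketch (hypothetical): batching several unregistrations so
 * that the expensive synchronize_net()/notifier work in
 * rollback_registered_many() is paid once.  struct foo_port and the list it
 * lives on are assumptions for the example.
 */
#if 0
struct foo_port {
	struct list_head list;
	struct net_device *dev;
};

static void foo_remove_all_ports(struct list_head *ports)
{
	struct foo_port *port;
	LIST_HEAD(kill_list);

	rtnl_lock();
	list_for_each_entry(port, ports, list)
		unregister_netdevice_queue(port->dev, &kill_list);
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();
}
#endif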
5541 
5542 /**
5543  *	unregister_netdev - remove device from the kernel
5544  *	@dev: device
5545  *
5546  *	This function shuts down a device interface and removes it
5547  *	from the kernel tables.
5548  *
5549  *	This is just a wrapper for unregister_netdevice that takes
5550  *	the rtnl semaphore.  In general you want to use this and not
5551  *	unregister_netdevice.
5552  */
5553 void unregister_netdev(struct net_device *dev)
5554 {
5555 	rtnl_lock();
5556 	unregister_netdevice(dev);
5557 	rtnl_unlock();
5558 }
5559 EXPORT_SYMBOL(unregister_netdev);
5560 
5561 /**
5562  *	dev_change_net_namespace - move device to a different network namespace
5563  *	@dev: device
5564  *	@net: network namespace
5565  *	@pat: If not NULL name pattern to try if the current device name
5566  *	      is already taken in the destination network namespace.
5567  *
5568  *	This function shuts down a device interface and moves it
5569  *	to a new network namespace. On success 0 is returned, on
5570  *	a failure a negative errno code is returned.
5571  *
5572  *	Callers must hold the rtnl semaphore.
5573  */
5574 
5575 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5576 {
5577 	int err;
5578 
5579 	ASSERT_RTNL();
5580 
5581 	/* Don't allow namespace local devices to be moved. */
5582 	err = -EINVAL;
5583 	if (dev->features & NETIF_F_NETNS_LOCAL)
5584 		goto out;
5585 
5586 #ifdef CONFIG_SYSFS
5587 	/* Don't allow real devices to be moved when sysfs
5588 	 * is enabled.
5589 	 */
5590 	err = -EINVAL;
5591 	if (dev->dev.parent)
5592 		goto out;
5593 #endif
5594 
5595 	/* Ensure the device has been registered */
5596 	err = -EINVAL;
5597 	if (dev->reg_state != NETREG_REGISTERED)
5598 		goto out;
5599 
5600 	/* Get out if there is nothing to do */
5601 	err = 0;
5602 	if (net_eq(dev_net(dev), net))
5603 		goto out;
5604 
5605 	/* Pick the destination device name, and ensure
5606 	 * we can use it in the destination network namespace.
5607 	 */
5608 	err = -EEXIST;
5609 	if (__dev_get_by_name(net, dev->name)) {
5610 		/* We get here if we can't use the current device name */
5611 		if (!pat)
5612 			goto out;
5613 		if (dev_get_valid_name(net, pat, dev->name, 1))
5614 			goto out;
5615 	}
5616 
5617 	/*
5618 	 * And now a mini version of register_netdevice and unregister_netdevice.
5619 	 */
5620 
5621 	/* If device is running close it first. */
5622 	dev_close(dev);
5623 
5624 	/* And unlink it from device chain */
5625 	err = -ENODEV;
5626 	unlist_netdevice(dev);
5627 
5628 	synchronize_net();
5629 
5630 	/* Shutdown queueing discipline. */
5631 	dev_shutdown(dev);
5632 
5633 	/* Notify protocols, that we are about to destroy
5634 	   this device. They should clean all the things.
5635 	*/
5636 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5637 	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5638 
5639 	/*
5640 	 *	Flush the unicast and multicast chains
5641 	 */
5642 	dev_unicast_flush(dev);
5643 	dev_addr_discard(dev);
5644 
5645 	netdev_unregister_kobject(dev);
5646 
5647 	/* Actually switch the network namespace */
5648 	dev_net_set(dev, net);
5649 
5650 	/* If there is an ifindex conflict assign a new one */
5651 	if (__dev_get_by_index(net, dev->ifindex)) {
5652 		int iflink = (dev->iflink == dev->ifindex);
5653 		dev->ifindex = dev_new_index(net);
5654 		if (iflink)
5655 			dev->iflink = dev->ifindex;
5656 	}
5657 
5658 	/* Fixup kobjects */
5659 	err = netdev_register_kobject(dev);
5660 	WARN_ON(err);
5661 
5662 	/* Add the device back in the hashes */
5663 	list_netdevice(dev);
5664 
5665 	/* Notify protocols, that a new device appeared. */
5666 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
5667 
5668 	/*
5669 	 *	Prevent userspace races by waiting until the network
5670 	 *	device is fully setup before sending notifications.
5671 	 */
5672 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5673 
5674 	synchronize_net();
5675 	err = 0;
5676 out:
5677 	return err;
5678 }
5679 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
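/*
 * Illustrative sketch (hypothetical): moving a device into another network
 * namespace, falling back to a "moved%d" name pattern if the current name
 * is already taken there.  The helper name and pattern are assumptions for
 * the example.
 */
#if 0
static int example_move_to_ns(struct net_device *dev, struct net *net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "moved%d");
	rtnl_unlock();
	return err;
}
#endif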
5680 
5681 static int dev_cpu_callback(struct notifier_block *nfb,
5682 			    unsigned long action,
5683 			    void *ocpu)
5684 {
5685 	struct sk_buff **list_skb;
5686 	struct Qdisc **list_net;
5687 	struct sk_buff *skb;
5688 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
5689 	struct softnet_data *sd, *oldsd;
5690 
5691 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5692 		return NOTIFY_OK;
5693 
5694 	local_irq_disable();
5695 	cpu = smp_processor_id();
5696 	sd = &per_cpu(softnet_data, cpu);
5697 	oldsd = &per_cpu(softnet_data, oldcpu);
5698 
5699 	/* Find end of our completion_queue. */
5700 	list_skb = &sd->completion_queue;
5701 	while (*list_skb)
5702 		list_skb = &(*list_skb)->next;
5703 	/* Append completion queue from offline CPU. */
5704 	*list_skb = oldsd->completion_queue;
5705 	oldsd->completion_queue = NULL;
5706 
5707 	/* Find end of our output_queue. */
5708 	list_net = &sd->output_queue;
5709 	while (*list_net)
5710 		list_net = &(*list_net)->next_sched;
5711 	/* Append output queue from offline CPU. */
5712 	*list_net = oldsd->output_queue;
5713 	oldsd->output_queue = NULL;
5714 
5715 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
5716 	local_irq_enable();
5717 
5718 	/* Process offline CPU's input_pkt_queue */
5719 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
5720 		netif_rx(skb);
5721 
5722 	return NOTIFY_OK;
5723 }
5724 
5725 
5726 /**
5727  *	netdev_increment_features - increment feature set by one
5728  *	@all: current feature set
5729  *	@one: new feature set
5730  *	@mask: mask feature set
5731  *
5732  *	Computes a new feature set after adding a device with feature set
5733  *	@one to the master device with current feature set @all.  Will not
5734  *	enable anything that is off in @mask. Returns the new feature set.
5735  */
5736 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5737 					unsigned long mask)
5738 {
5739 	/* If device needs checksumming, downgrade to it. */
5740 	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5741 		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5742 	else if (mask & NETIF_F_ALL_CSUM) {
5743 		/* If one device supports v4/v6 checksumming, set for all. */
5744 		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5745 		    !(all & NETIF_F_GEN_CSUM)) {
5746 			all &= ~NETIF_F_ALL_CSUM;
5747 			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5748 		}
5749 
5750 		/* If one device supports hw checksumming, set for all. */
5751 		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5752 			all &= ~NETIF_F_ALL_CSUM;
5753 			all |= NETIF_F_HW_CSUM;
5754 		}
5755 	}
5756 
5757 	one |= NETIF_F_ALL_CSUM;
5758 
5759 	one |= all & NETIF_F_ONE_FOR_ALL;
5760 	all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
5761 	all |= one & mask & NETIF_F_ONE_FOR_ALL;
5762 
5763 	return all;
5764 }
5765 EXPORT_SYMBOL(netdev_increment_features);
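/*
 * Illustrative sketch (hypothetical): an aggregating master device (in the
 * style of bonding) recomputing its feature set from its slaves.  struct
 * foo_slave, the slave list and the NETIF_F_NO_CSUM starting point are
 * assumptions for the example.
 */
#if 0
struct foo_slave {
	struct list_head list;
	struct net_device *dev;
};

static void foo_compute_features(struct net_device *master,
				 struct list_head *slaves)
{
	struct foo_slave *slave;
	unsigned long features = NETIF_F_NO_CSUM;

	list_for_each_entry(slave, slaves, list)
		features = netdev_increment_features(features,
						     slave->dev->features,
						     NETIF_F_ONE_FOR_ALL);
	master->features = netdev_fix_features(features, master->name);
}
#endif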
5766 
5767 static struct hlist_head *netdev_create_hash(void)
5768 {
5769 	int i;
5770 	struct hlist_head *hash;
5771 
5772 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5773 	if (hash != NULL)
5774 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
5775 			INIT_HLIST_HEAD(&hash[i]);
5776 
5777 	return hash;
5778 }
5779 
5780 /* Initialize per network namespace state */
5781 static int __net_init netdev_init(struct net *net)
5782 {
5783 	INIT_LIST_HEAD(&net->dev_base_head);
5784 
5785 	net->dev_name_head = netdev_create_hash();
5786 	if (net->dev_name_head == NULL)
5787 		goto err_name;
5788 
5789 	net->dev_index_head = netdev_create_hash();
5790 	if (net->dev_index_head == NULL)
5791 		goto err_idx;
5792 
5793 	return 0;
5794 
5795 err_idx:
5796 	kfree(net->dev_name_head);
5797 err_name:
5798 	return -ENOMEM;
5799 }
5800 
5801 /**
5802  *	netdev_drivername - network driver for the device
5803  *	@dev: network device
5804  *	@buffer: buffer for resulting name
5805  *	@len: size of buffer
5806  *
5807  *	Determine network driver for device.
5808  */
5809 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5810 {
5811 	const struct device_driver *driver;
5812 	const struct device *parent;
5813 
5814 	if (len <= 0 || !buffer)
5815 		return buffer;
5816 	buffer[0] = 0;
5817 
5818 	parent = dev->dev.parent;
5819 
5820 	if (!parent)
5821 		return buffer;
5822 
5823 	driver = parent->driver;
5824 	if (driver && driver->name)
5825 		strlcpy(buffer, driver->name, len);
5826 	return buffer;
5827 }
5828 
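/*
 * Editorial sketch (not part of dev.c): the typical netdev_drivername()
 * call pattern, e.g. tagging a diagnostic message with the driver behind
 * a device.  The report_tx_stall() wrapper is hypothetical.
 */
static void report_tx_stall(struct net_device *dev)
{
	char drivername[64];

	printk(KERN_WARNING "%s (%s): transmit queue stalled\n",
	       dev->name,
	       netdev_drivername(dev, drivername, sizeof(drivername)));
}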
5829 static void __net_exit netdev_exit(struct net *net)
5830 {
5831 	kfree(net->dev_name_head);
5832 	kfree(net->dev_index_head);
5833 }
5834 
5835 static struct pernet_operations __net_initdata netdev_net_ops = {
5836 	.init = netdev_init,
5837 	.exit = netdev_exit,
5838 };
5839 
5840 static void __net_exit default_device_exit(struct net *net)
5841 {
5842 	struct net_device *dev, *aux;
5843 	/*
5844 	 * Push all migratable network devices back to the
5845 	 * initial network namespace
5846 	 */
5847 	rtnl_lock();
5848 	for_each_netdev_safe(net, dev, aux) {
5849 		int err;
5850 		char fb_name[IFNAMSIZ];
5851 
5852 		/* Ignore unmovable devices (e.g. loopback) */
5853 		if (dev->features & NETIF_F_NETNS_LOCAL)
5854 			continue;
5855 
5856 		/* Leave virtual devices for the generic cleanup */
5857 		if (dev->rtnl_link_ops)
5858 			continue;
5859 
5860 		/* Push remaining network devices to init_net */
5861 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5862 		err = dev_change_net_namespace(dev, &init_net, fb_name);
5863 		if (err) {
5864 			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5865 				__func__, dev->name, err);
5866 			BUG();
5867 		}
5868 	}
5869 	rtnl_unlock();
5870 }
5871 
5872 static void __net_exit default_device_exit_batch(struct list_head *net_list)
5873 {
5874 	/* At exit, all network devices must be removed from a network
5875 	 * namespace.  Do this in the reverse order of registration.
5876 	 * Do this across as many network namespaces as possible to
5877 	 * improve batching efficiency.
5878 	 */
5879 	struct net_device *dev;
5880 	struct net *net;
5881 	LIST_HEAD(dev_kill_list);
5882 
5883 	rtnl_lock();
5884 	list_for_each_entry(net, net_list, exit_list) {
5885 		for_each_netdev_reverse(net, dev) {
5886 			if (dev->rtnl_link_ops)
5887 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
5888 			else
5889 				unregister_netdevice_queue(dev, &dev_kill_list);
5890 		}
5891 	}
5892 	unregister_netdevice_many(&dev_kill_list);
5893 	rtnl_unlock();
5894 }
5895 
5896 static struct pernet_operations __net_initdata default_device_ops = {
5897 	.exit = default_device_exit,
5898 	.exit_batch = default_device_exit_batch,
5899 };
5900 
5901 /*
5902  *	Initialize the DEV module. At boot time this walks the device list and
5903  *	unhooks any devices that fail to initialise (normally hardware not
5904  *	present) and leaves us with a valid list of present and active devices.
5905  *
5906  */
5907 
5908 /*
5909  *       This is called single-threaded during boot, so no need
5910  *       to take the rtnl semaphore.
5911  */
5912 static int __init net_dev_init(void)
5913 {
5914 	int i, rc = -ENOMEM;
5915 
5916 	BUG_ON(!dev_boot_phase);
5917 
5918 	if (dev_proc_init())
5919 		goto out;
5920 
5921 	if (netdev_kobject_init())
5922 		goto out;
5923 
5924 	INIT_LIST_HEAD(&ptype_all);
5925 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
5926 		INIT_LIST_HEAD(&ptype_base[i]);
5927 
5928 	if (register_pernet_subsys(&netdev_net_ops))
5929 		goto out;
5930 
5931 	/*
5932 	 *	Initialise the packet receive queues.
5933 	 */
5934 
5935 	for_each_possible_cpu(i) {
5936 		struct softnet_data *queue;
5937 
5938 		queue = &per_cpu(softnet_data, i);
5939 		skb_queue_head_init(&queue->input_pkt_queue);
5940 		queue->completion_queue = NULL;
5941 		INIT_LIST_HEAD(&queue->poll_list);
5942 
5943 		queue->backlog.poll = process_backlog;
5944 		queue->backlog.weight = weight_p;
5945 		queue->backlog.gro_list = NULL;
5946 		queue->backlog.gro_count = 0;
5947 	}
5948 
5949 	dev_boot_phase = 0;
5950 
5951 	/* The loopback device is special: if any other network device
5952 	 * is present in a network namespace, the loopback device must
5953 	 * be present as well. Since we now dynamically allocate and
5954 	 * free the loopback device, ensure this invariant is maintained
5955 	 * by keeping the loopback device the first device on the
5956 	 * list of network devices, so that it is the first device
5957 	 * that appears and the last network device
5958 	 * that disappears.
5959 	 */
5960 	if (register_pernet_device(&loopback_net_ops))
5961 		goto out;
5962 
5963 	if (register_pernet_device(&default_device_ops))
5964 		goto out;
5965 
5966 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
5967 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
5968 
5969 	hotcpu_notifier(dev_cpu_callback, 0);
5970 	dst_init();
5971 	dev_mcast_init();
5972 	rc = 0;
5973 out:
5974 	return rc;
5975 }
5976 
5977 subsys_initcall(net_dev_init);
5978 
5979 static int __init initialize_hashrnd(void)
5980 {
5981 	get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
5982 	return 0;
5983 }
5984 
5985 late_initcall_sync(initialize_hashrnd);
5986 
5987