xref: /openbmc/linux/net/core/dev.c (revision fbcb21705930f2930f506149d0b8d36dfbe45107)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/pkt_sched.h>
103 #include <net/checksum.h>
104 #include <net/xfrm.h>
105 #include <linux/highmem.h>
106 #include <linux/init.h>
107 #include <linux/module.h>
108 #include <linux/netpoll.h>
109 #include <linux/rcupdate.h>
110 #include <linux/delay.h>
111 #include <net/iw_handler.h>
112 #include <asm/current.h>
113 #include <linux/audit.h>
114 #include <linux/dmaengine.h>
115 #include <linux/err.h>
116 #include <linux/ctype.h>
117 #include <linux/if_arp.h>
118 #include <linux/if_vlan.h>
119 #include <linux/ip.h>
120 #include <net/ip.h>
121 #include <net/mpls.h>
122 #include <linux/ipv6.h>
123 #include <linux/in.h>
124 #include <linux/jhash.h>
125 #include <linux/random.h>
126 #include <trace/events/napi.h>
127 #include <trace/events/net.h>
128 #include <trace/events/skb.h>
129 #include <linux/pci.h>
130 #include <linux/inetdevice.h>
131 #include <linux/cpu_rmap.h>
132 #include <linux/static_key.h>
133 #include <linux/hashtable.h>
134 #include <linux/vmalloc.h>
135 #include <linux/if_macvlan.h>
136 #include <linux/errqueue.h>
137 #include <linux/hrtimer.h>
138 
139 #include "net-sysfs.h"
140 
141 /* Instead of increasing this, you should create a hash table. */
142 #define MAX_GRO_SKBS 8
143 
144 /* This should be increased if a protocol with a bigger head is added. */
145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
146 
147 static DEFINE_SPINLOCK(ptype_lock);
148 static DEFINE_SPINLOCK(offload_lock);
149 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
150 struct list_head ptype_all __read_mostly;	/* Taps */
151 static struct list_head offload_base __read_mostly;
152 
153 static int netif_rx_internal(struct sk_buff *skb);
154 static int call_netdevice_notifiers_info(unsigned long val,
155 					 struct net_device *dev,
156 					 struct netdev_notifier_info *info);
157 
158 /*
159  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
160  * semaphore.
161  *
162  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
163  *
164  * Writers must hold the rtnl semaphore while they loop through the
165  * dev_base_head list, and hold dev_base_lock for writing when they do the
166  * actual updates.  This allows pure readers to access the list even
167  * while a writer is preparing to update it.
168  *
169  * To put it another way, dev_base_lock is held for writing only to
170  * protect against pure readers; the rtnl semaphore provides the
171  * protection against other writers.
172  *
173  * See, for example usages, register_netdevice() and
174  * unregister_netdevice(), which must be called with the rtnl
175  * semaphore held.
176  */
177 DEFINE_RWLOCK(dev_base_lock);
178 EXPORT_SYMBOL(dev_base_lock);
179 
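/*
 * Illustrative sketch (editor's addition, not part of the original source):
 * the two reader patterns the locking comment above allows.  A pure reader
 * walks the list under RCU; code that already holds the RTNL semaphore may
 * use the plain iterator.  The helpers named do_something_*() are
 * hypothetical placeholders.
 *
 *	rcu_read_lock();
 *	for_each_netdev_rcu(net, dev)
 *		do_something_lockless(dev);	// dev valid only inside RCU
 *	rcu_read_unlock();
 *
 *	ASSERT_RTNL();
 *	for_each_netdev(net, dev)
 *		do_something_under_rtnl(dev);	// RTNL keeps the list stable
 */
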
180 /* protects napi_hash addition/deletion and napi_gen_id */
181 static DEFINE_SPINLOCK(napi_hash_lock);
182 
183 static unsigned int napi_gen_id;
184 static DEFINE_HASHTABLE(napi_hash, 8);
185 
186 static seqcount_t devnet_rename_seq;
187 
188 static inline void dev_base_seq_inc(struct net *net)
189 {
190 	while (++net->dev_base_seq == 0);
191 }
192 
193 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
194 {
195 	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
196 
197 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
198 }
199 
200 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
201 {
202 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
203 }
204 
205 static inline void rps_lock(struct softnet_data *sd)
206 {
207 #ifdef CONFIG_RPS
208 	spin_lock(&sd->input_pkt_queue.lock);
209 #endif
210 }
211 
212 static inline void rps_unlock(struct softnet_data *sd)
213 {
214 #ifdef CONFIG_RPS
215 	spin_unlock(&sd->input_pkt_queue.lock);
216 #endif
217 }
218 
219 /* Device list insertion */
220 static void list_netdevice(struct net_device *dev)
221 {
222 	struct net *net = dev_net(dev);
223 
224 	ASSERT_RTNL();
225 
226 	write_lock_bh(&dev_base_lock);
227 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
228 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
229 	hlist_add_head_rcu(&dev->index_hlist,
230 			   dev_index_hash(net, dev->ifindex));
231 	write_unlock_bh(&dev_base_lock);
232 
233 	dev_base_seq_inc(net);
234 }
235 
236 /* Device list removal
237  * caller must respect an RCU grace period before freeing/reusing dev
238  */
239 static void unlist_netdevice(struct net_device *dev)
240 {
241 	ASSERT_RTNL();
242 
243 	/* Unlink dev from the device chain */
244 	write_lock_bh(&dev_base_lock);
245 	list_del_rcu(&dev->dev_list);
246 	hlist_del_rcu(&dev->name_hlist);
247 	hlist_del_rcu(&dev->index_hlist);
248 	write_unlock_bh(&dev_base_lock);
249 
250 	dev_base_seq_inc(dev_net(dev));
251 }
252 
253 /*
254  *	Our notifier list
255  */
256 
257 static RAW_NOTIFIER_HEAD(netdev_chain);
258 
259 /*
260  *	Device drivers call our routines to queue packets here. We empty the
261  *	queue in the local softnet handler.
262  */
263 
264 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
265 EXPORT_PER_CPU_SYMBOL(softnet_data);
266 
267 #ifdef CONFIG_LOCKDEP
268 /*
269  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
270  * according to dev->type
271  */
272 static const unsigned short netdev_lock_type[] =
273 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
274 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
275 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
276 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
277 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
278 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
279 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
280 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
281 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
282 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
283 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
284 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
285 	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
286 	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
287 	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
288 
289 static const char *const netdev_lock_name[] =
290 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
291 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
292 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
293 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
294 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
295 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
296 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
297 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
298 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
299 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
300 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
301 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
302 	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
303 	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
304 	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
305 
306 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
307 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
308 
309 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
310 {
311 	int i;
312 
313 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
314 		if (netdev_lock_type[i] == dev_type)
315 			return i;
316 	/* the last key is used by default */
317 	return ARRAY_SIZE(netdev_lock_type) - 1;
318 }
319 
320 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
321 						 unsigned short dev_type)
322 {
323 	int i;
324 
325 	i = netdev_lock_pos(dev_type);
326 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
327 				   netdev_lock_name[i]);
328 }
329 
330 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
331 {
332 	int i;
333 
334 	i = netdev_lock_pos(dev->type);
335 	lockdep_set_class_and_name(&dev->addr_list_lock,
336 				   &netdev_addr_lock_key[i],
337 				   netdev_lock_name[i]);
338 }
339 #else
340 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
341 						 unsigned short dev_type)
342 {
343 }
344 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
345 {
346 }
347 #endif
348 
349 /*******************************************************************************
350 
351 		Protocol management and registration routines
352 
353 *******************************************************************************/
354 
355 /*
356  *	Add a protocol ID to the list. Now that the input handler is
357  *	smarter we can dispense with all the messy stuff that used to be
358  *	here.
359  *
360  *	BEWARE!!! Protocol handlers that mangle input packets
361  *	MUST BE last in the hash buckets, and checking protocol handlers
362  *	MUST start from the promiscuous ptype_all chain in net_bh.
363  *	This is true now; do not change it.
364  *	Explanation follows: if a protocol handler that mangles the packet
365  *	were first on the list, it could not sense that the packet is
366  *	cloned and should be copied-on-write, so it would change it in
367  *	place and subsequent readers would get a broken packet.
368  *							--ANK (980803)
369  */
370 
371 static inline struct list_head *ptype_head(const struct packet_type *pt)
372 {
373 	if (pt->type == htons(ETH_P_ALL))
374 		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
375 	else
376 		return pt->dev ? &pt->dev->ptype_specific :
377 				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
378 }
379 
380 /**
381  *	dev_add_pack - add packet handler
382  *	@pt: packet type declaration
383  *
384  *	Add a protocol handler to the networking stack. The passed &packet_type
385  *	is linked into kernel lists and may not be freed until it has been
386  *	removed from the kernel lists.
387  *
388  *	This call does not sleep, therefore it cannot guarantee that
389  *	all CPUs that are in the middle of receiving packets will see
390  *	the new packet type (until the next received packet).
391  */
392 
393 void dev_add_pack(struct packet_type *pt)
394 {
395 	struct list_head *head = ptype_head(pt);
396 
397 	spin_lock(&ptype_lock);
398 	list_add_rcu(&pt->list, head);
399 	spin_unlock(&ptype_lock);
400 }
401 EXPORT_SYMBOL(dev_add_pack);
402 
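/*
 * Illustrative sketch (editor's addition): how a protocol module typically
 * uses dev_add_pack()/dev_remove_pack().  The names prefixed "example_" are
 * hypothetical; real users register e.g. htons(ETH_P_IP), or ETH_P_ALL for
 * a tap as in the sketch below.
 *
 *	static int example_rcv(struct sk_buff *skb, struct net_device *dev,
 *			       struct packet_type *pt,
 *			       struct net_device *orig_dev)
 *	{
 *		// a handler must consume or free the skb it is given
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type example_ptype __read_mostly = {
 *		.type = htons(ETH_P_ALL),	// see ptype_head() above
 *		.func = example_rcv,
 *	};
 *
 *	dev_add_pack(&example_ptype);		// e.g. in module init
 *	...
 *	dev_remove_pack(&example_ptype);	// in module exit; may sleep
 */
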
403 /**
404  *	__dev_remove_pack	 - remove packet handler
405  *	@pt: packet type declaration
406  *
407  *	Remove a protocol handler that was previously added to the kernel
408  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
409  *	from the kernel lists and can be freed or reused once this function
410  *	returns.
411  *
412  *      The packet type might still be in use by receivers
413  *	and must not be freed until after all the CPUs have gone
414  *	through a quiescent state.
415  */
416 void __dev_remove_pack(struct packet_type *pt)
417 {
418 	struct list_head *head = ptype_head(pt);
419 	struct packet_type *pt1;
420 
421 	spin_lock(&ptype_lock);
422 
423 	list_for_each_entry(pt1, head, list) {
424 		if (pt == pt1) {
425 			list_del_rcu(&pt->list);
426 			goto out;
427 		}
428 	}
429 
430 	pr_warn("dev_remove_pack: %p not found\n", pt);
431 out:
432 	spin_unlock(&ptype_lock);
433 }
434 EXPORT_SYMBOL(__dev_remove_pack);
435 
436 /**
437  *	dev_remove_pack	 - remove packet handler
438  *	@pt: packet type declaration
439  *
440  *	Remove a protocol handler that was previously added to the kernel
441  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
442  *	from the kernel lists and can be freed or reused once this function
443  *	returns.
444  *
445  *	This call sleeps to guarantee that no CPU is looking at the packet
446  *	type after return.
447  */
448 void dev_remove_pack(struct packet_type *pt)
449 {
450 	__dev_remove_pack(pt);
451 
452 	synchronize_net();
453 }
454 EXPORT_SYMBOL(dev_remove_pack);
455 
456 
457 /**
458  *	dev_add_offload - register offload handlers
459  *	@po: protocol offload declaration
460  *
461  *	Add protocol offload handlers to the networking stack. The passed
462  *	&proto_offload is linked into kernel lists and may not be freed until
463  *	it has been removed from the kernel lists.
464  *
465  *	This call does not sleep, therefore it cannot guarantee that
466  *	all CPUs that are in the middle of receiving packets will see
467  *	the new offload handlers (until the next received packet).
468  */
469 void dev_add_offload(struct packet_offload *po)
470 {
471 	struct list_head *head = &offload_base;
472 
473 	spin_lock(&offload_lock);
474 	list_add_rcu(&po->list, head);
475 	spin_unlock(&offload_lock);
476 }
477 EXPORT_SYMBOL(dev_add_offload);
478 
479 /**
480  *	__dev_remove_offload	 - remove offload handler
481  *	@po: packet offload declaration
482  *
483  *	Remove a protocol offload handler that was previously added to the
484  *	kernel offload handlers by dev_add_offload(). The passed &offload_type
485  *	is removed from the kernel lists and can be freed or reused once this
486  *	function returns.
487  *
488  *      The packet type might still be in use by receivers
489  *	and must not be freed until after all the CPUs have gone
490  *	through a quiescent state.
491  */
492 static void __dev_remove_offload(struct packet_offload *po)
493 {
494 	struct list_head *head = &offload_base;
495 	struct packet_offload *po1;
496 
497 	spin_lock(&offload_lock);
498 
499 	list_for_each_entry(po1, head, list) {
500 		if (po == po1) {
501 			list_del_rcu(&po->list);
502 			goto out;
503 		}
504 	}
505 
506 	pr_warn("dev_remove_offload: %p not found\n", po);
507 out:
508 	spin_unlock(&offload_lock);
509 }
510 
511 /**
512  *	dev_remove_offload	 - remove packet offload handler
513  *	@po: packet offload declaration
514  *
515  *	Remove a packet offload handler that was previously added to the kernel
516  *	offload handlers by dev_add_offload(). The passed &offload_type is
517  *	removed from the kernel lists and can be freed or reused once this
518  *	function returns.
519  *
520  *	This call sleeps to guarantee that no CPU is looking at the packet
521  *	type after return.
522  */
523 void dev_remove_offload(struct packet_offload *po)
524 {
525 	__dev_remove_offload(po);
526 
527 	synchronize_net();
528 }
529 EXPORT_SYMBOL(dev_remove_offload);
530 
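/*
 * Illustrative sketch (editor's addition): offload handlers are registered
 * per ethertype much like packet handlers, but they feed the GSO/GRO paths
 * rather than delivering packets.  The example_* callback names are
 * hypothetical; see the inet/ipv6 offload registrations for real users.
 *
 *	static struct packet_offload example_offload __read_mostly = {
 *		.type = htons(ETH_P_IP),
 *		.callbacks = {
 *			.gso_segment = example_gso_segment,
 *			.gro_receive = example_gro_receive,
 *			.gro_complete = example_gro_complete,
 *		},
 *	};
 *
 *	dev_add_offload(&example_offload);
 *	...
 *	dev_remove_offload(&example_offload);	// sleeps, like dev_remove_pack()
 */
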
531 /******************************************************************************
532 
533 		      Device Boot-time Settings Routines
534 
535 *******************************************************************************/
536 
537 /* Boot time configuration table */
538 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
539 
540 /**
541  *	netdev_boot_setup_add	- add new setup entry
542  *	@name: name of the device
543  *	@map: configured settings for the device
544  *
545  *	Adds a new setup entry to the dev_boot_setup list.  The function
546  *	returns 0 on error and 1 on success.  This is a generic routine for
547  *	all netdevices.
548  */
549 static int netdev_boot_setup_add(char *name, struct ifmap *map)
550 {
551 	struct netdev_boot_setup *s;
552 	int i;
553 
554 	s = dev_boot_setup;
555 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
556 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
557 			memset(s[i].name, 0, sizeof(s[i].name));
558 			strlcpy(s[i].name, name, IFNAMSIZ);
559 			memcpy(&s[i].map, map, sizeof(s[i].map));
560 			break;
561 		}
562 	}
563 
564 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
565 }
566 
567 /**
568  *	netdev_boot_setup_check	- check boot time settings
569  *	@dev: the netdevice
570  *
571  * 	Check the boot time settings for the device.
572  *	Any settings found are applied to the device so they can
573  *	be used later during device probing.
574  *	Returns 0 if no settings are found, 1 if they are.
575  */
576 int netdev_boot_setup_check(struct net_device *dev)
577 {
578 	struct netdev_boot_setup *s = dev_boot_setup;
579 	int i;
580 
581 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
582 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
583 		    !strcmp(dev->name, s[i].name)) {
584 			dev->irq 	= s[i].map.irq;
585 			dev->base_addr 	= s[i].map.base_addr;
586 			dev->mem_start 	= s[i].map.mem_start;
587 			dev->mem_end 	= s[i].map.mem_end;
588 			return 1;
589 		}
590 	}
591 	return 0;
592 }
593 EXPORT_SYMBOL(netdev_boot_setup_check);
594 
595 
596 /**
597  *	netdev_boot_base	- get address from boot time settings
598  *	@prefix: prefix for network device
599  *	@unit: id for network device
600  *
601  * 	Check the boot time settings for the base address of the device.
602  *	Any settings found are applied to the device so they can
603  *	be used later during device probing.
604  *	Returns 0 if no settings are found.
605  */
606 unsigned long netdev_boot_base(const char *prefix, int unit)
607 {
608 	const struct netdev_boot_setup *s = dev_boot_setup;
609 	char name[IFNAMSIZ];
610 	int i;
611 
612 	sprintf(name, "%s%d", prefix, unit);
613 
614 	/*
615 	 * If the device is already registered then return a base of 1
616 	 * to indicate not to probe for this interface.
617 	 */
618 	if (__dev_get_by_name(&init_net, name))
619 		return 1;
620 
621 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
622 		if (!strcmp(name, s[i].name))
623 			return s[i].map.base_addr;
624 	return 0;
625 }
626 
627 /*
628  * Saves the settings configured at boot time for any netdevice.
629  */
630 int __init netdev_boot_setup(char *str)
631 {
632 	int ints[5];
633 	struct ifmap map;
634 
635 	str = get_options(str, ARRAY_SIZE(ints), ints);
636 	if (!str || !*str)
637 		return 0;
638 
639 	/* Save settings */
640 	memset(&map, 0, sizeof(map));
641 	if (ints[0] > 0)
642 		map.irq = ints[1];
643 	if (ints[0] > 1)
644 		map.base_addr = ints[2];
645 	if (ints[0] > 2)
646 		map.mem_start = ints[3];
647 	if (ints[0] > 3)
648 		map.mem_end = ints[4];
649 
650 	/* Add new entry to the list */
651 	return netdev_boot_setup_add(str, &map);
652 }
653 
654 __setup("netdev=", netdev_boot_setup);
655 
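/*
 * Editor's note (illustrative): with the parser above, a kernel command
 * line entry such as
 *
 *	netdev=9,0x300,0xd0000,0xd4000,eth0
 *
 * records irq=9, base_addr=0x300, mem_start=0xd0000 and mem_end=0xd4000 for
 * the device named "eth0"; netdev_boot_setup_check() later copies these
 * values into the matching net_device during probing.
 */
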
656 /*******************************************************************************
657 
658 			    Device Interface Subroutines
659 
660 *******************************************************************************/
661 
662 /**
663  *	__dev_get_by_name	- find a device by its name
664  *	@net: the applicable net namespace
665  *	@name: name to find
666  *
667  *	Find an interface by name. Must be called under RTNL semaphore
668  *	or @dev_base_lock. If the name is found a pointer to the device
669  *	is returned. If the name is not found then %NULL is returned. The
670  *	reference counters are not incremented so the caller must be
671  *	careful with locks.
672  */
673 
674 struct net_device *__dev_get_by_name(struct net *net, const char *name)
675 {
676 	struct net_device *dev;
677 	struct hlist_head *head = dev_name_hash(net, name);
678 
679 	hlist_for_each_entry(dev, head, name_hlist)
680 		if (!strncmp(dev->name, name, IFNAMSIZ))
681 			return dev;
682 
683 	return NULL;
684 }
685 EXPORT_SYMBOL(__dev_get_by_name);
686 
687 /**
688  *	dev_get_by_name_rcu	- find a device by its name
689  *	@net: the applicable net namespace
690  *	@name: name to find
691  *
692  *	Find an interface by name.
693  *	If the name is found a pointer to the device is returned.
694  * 	If the name is not found then %NULL is returned.
695  *	The reference counters are not incremented so the caller must be
696  *	careful with locks. The caller must hold the RCU read lock.
697  */
698 
699 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
700 {
701 	struct net_device *dev;
702 	struct hlist_head *head = dev_name_hash(net, name);
703 
704 	hlist_for_each_entry_rcu(dev, head, name_hlist)
705 		if (!strncmp(dev->name, name, IFNAMSIZ))
706 			return dev;
707 
708 	return NULL;
709 }
710 EXPORT_SYMBOL(dev_get_by_name_rcu);
711 
712 /**
713  *	dev_get_by_name		- find a device by its name
714  *	@net: the applicable net namespace
715  *	@name: name to find
716  *
717  *	Find an interface by name. This can be called from any
718  *	context and does its own locking. The returned handle has
719  *	the usage count incremented and the caller must use dev_put() to
720  *	release it when it is no longer needed. %NULL is returned if no
721  *	matching device is found.
722  */
723 
724 struct net_device *dev_get_by_name(struct net *net, const char *name)
725 {
726 	struct net_device *dev;
727 
728 	rcu_read_lock();
729 	dev = dev_get_by_name_rcu(net, name);
730 	if (dev)
731 		dev_hold(dev);
732 	rcu_read_unlock();
733 	return dev;
734 }
735 EXPORT_SYMBOL(dev_get_by_name);
736 
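/*
 * Illustrative sketch (editor's addition): choosing between the name-lookup
 * variants above.  dev_get_by_name() does its own locking and takes a
 * reference, so it is the safe default outside RCU/RTNL sections.
 *
 *	struct net_device *dev = dev_get_by_name(net, "eth0");
 *
 *	if (!dev)
 *		return -ENODEV;
 *	// ... use dev ...
 *	dev_put(dev);	// drop the reference taken by dev_get_by_name()
 */
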
737 /**
738  *	__dev_get_by_index - find a device by its ifindex
739  *	@net: the applicable net namespace
740  *	@ifindex: index of device
741  *
742  *	Search for an interface by index. Returns a pointer to the device,
743  *	or %NULL if it is not found. The device has not had its reference
744  *	counter increased, so the caller must be careful about locking.
745  *	The caller must hold either the RTNL semaphore
746  *	or @dev_base_lock.
747  */
748 
749 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
750 {
751 	struct net_device *dev;
752 	struct hlist_head *head = dev_index_hash(net, ifindex);
753 
754 	hlist_for_each_entry(dev, head, index_hlist)
755 		if (dev->ifindex == ifindex)
756 			return dev;
757 
758 	return NULL;
759 }
760 EXPORT_SYMBOL(__dev_get_by_index);
761 
762 /**
763  *	dev_get_by_index_rcu - find a device by its ifindex
764  *	@net: the applicable net namespace
765  *	@ifindex: index of device
766  *
767  *	Search for an interface by index. Returns a pointer to the device,
768  *	or %NULL if it is not found. The device has not had its reference
769  *	counter increased, so the caller must be careful about locking.
770  *	The caller must hold the RCU read lock.
771  */
772 
773 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
774 {
775 	struct net_device *dev;
776 	struct hlist_head *head = dev_index_hash(net, ifindex);
777 
778 	hlist_for_each_entry_rcu(dev, head, index_hlist)
779 		if (dev->ifindex == ifindex)
780 			return dev;
781 
782 	return NULL;
783 }
784 EXPORT_SYMBOL(dev_get_by_index_rcu);
785 
786 
787 /**
788  *	dev_get_by_index - find a device by its ifindex
789  *	@net: the applicable net namespace
790  *	@ifindex: index of device
791  *
792  *	Search for an interface by index. Returns a pointer to the device,
793  *	or NULL if it is not found. The device returned has had a reference
794  *	added and the pointer is safe until the user calls
795  *	dev_put() to indicate they have finished with it.
796  */
797 
798 struct net_device *dev_get_by_index(struct net *net, int ifindex)
799 {
800 	struct net_device *dev;
801 
802 	rcu_read_lock();
803 	dev = dev_get_by_index_rcu(net, ifindex);
804 	if (dev)
805 		dev_hold(dev);
806 	rcu_read_unlock();
807 	return dev;
808 }
809 EXPORT_SYMBOL(dev_get_by_index);
810 
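/*
 * Illustrative sketch (editor's addition): ifindex lookups on hot paths
 * usually use the RCU variant, which avoids reference counting as long as
 * the device is only used inside the read-side section.
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_index_rcu(net, ifindex);
 *	if (dev)
 *		pr_info("ifindex %d is %s\n", ifindex, dev->name);
 *	rcu_read_unlock();
 */
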
811 /**
812  *	netdev_get_name - get a netdevice name, knowing its ifindex.
813  *	@net: network namespace
814  *	@name: a pointer to the buffer where the name will be stored.
815  *	@ifindex: the ifindex of the interface to get the name from.
816  *
817  *	The use of raw_seqcount_begin() and cond_resched() before
818  *	retrying is required as we want to give the writers a chance
819  *	to complete when CONFIG_PREEMPT is not set.
820  */
821 int netdev_get_name(struct net *net, char *name, int ifindex)
822 {
823 	struct net_device *dev;
824 	unsigned int seq;
825 
826 retry:
827 	seq = raw_seqcount_begin(&devnet_rename_seq);
828 	rcu_read_lock();
829 	dev = dev_get_by_index_rcu(net, ifindex);
830 	if (!dev) {
831 		rcu_read_unlock();
832 		return -ENODEV;
833 	}
834 
835 	strcpy(name, dev->name);
836 	rcu_read_unlock();
837 	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
838 		cond_resched();
839 		goto retry;
840 	}
841 
842 	return 0;
843 }
844 
845 /**
846  *	dev_getbyhwaddr_rcu - find a device by its hardware address
847  *	@net: the applicable net namespace
848  *	@type: media type of device
849  *	@ha: hardware address
850  *
851  *	Search for an interface by MAC address. Returns a pointer to the
852  *	device, or NULL if it is not found.
853  *	The caller must hold RCU or RTNL.
854  *	The returned device has not had its ref count increased,
855  *	and the caller must therefore be careful about locking.
856  *
857  */
858 
859 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
860 				       const char *ha)
861 {
862 	struct net_device *dev;
863 
864 	for_each_netdev_rcu(net, dev)
865 		if (dev->type == type &&
866 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
867 			return dev;
868 
869 	return NULL;
870 }
871 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
872 
873 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
874 {
875 	struct net_device *dev;
876 
877 	ASSERT_RTNL();
878 	for_each_netdev(net, dev)
879 		if (dev->type == type)
880 			return dev;
881 
882 	return NULL;
883 }
884 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
885 
886 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
887 {
888 	struct net_device *dev, *ret = NULL;
889 
890 	rcu_read_lock();
891 	for_each_netdev_rcu(net, dev)
892 		if (dev->type == type) {
893 			dev_hold(dev);
894 			ret = dev;
895 			break;
896 		}
897 	rcu_read_unlock();
898 	return ret;
899 }
900 EXPORT_SYMBOL(dev_getfirstbyhwtype);
901 
902 /**
903  *	__dev_get_by_flags - find any device with given flags
904  *	@net: the applicable net namespace
905  *	@if_flags: IFF_* values
906  *	@mask: bitmask of bits in if_flags to check
907  *
908  *	Search for any interface with the given flags. Returns a pointer to
909  *	the first matching device, or NULL if none is found. Must be called
910  *	under rtnl_lock(), and the result's refcount is unchanged.
911  */
912 
913 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
914 				      unsigned short mask)
915 {
916 	struct net_device *dev, *ret;
917 
918 	ASSERT_RTNL();
919 
920 	ret = NULL;
921 	for_each_netdev(net, dev) {
922 		if (((dev->flags ^ if_flags) & mask) == 0) {
923 			ret = dev;
924 			break;
925 		}
926 	}
927 	return ret;
928 }
929 EXPORT_SYMBOL(__dev_get_by_flags);
930 
931 /**
932  *	dev_valid_name - check if name is okay for network device
933  *	@name: name string
934  *
935  *	Network device names need to be valid file names
936  *	to allow sysfs to work.  We also disallow any kind of
937  *	whitespace.
938  */
939 bool dev_valid_name(const char *name)
940 {
941 	if (*name == '\0')
942 		return false;
943 	if (strlen(name) >= IFNAMSIZ)
944 		return false;
945 	if (!strcmp(name, ".") || !strcmp(name, ".."))
946 		return false;
947 
948 	while (*name) {
949 		if (*name == '/' || *name == ':' || isspace(*name))
950 			return false;
951 		name++;
952 	}
953 	return true;
954 }
955 EXPORT_SYMBOL(dev_valid_name);
956 
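/*
 * Editor's note (illustrative): "eth0", "wlan-guest" and "veth_a" pass the
 * checks above; "", ".", "..", "eth/0", "eth:0", names containing
 * whitespace, and names of IFNAMSIZ characters or more are rejected.
 */
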
957 /**
958  *	__dev_alloc_name - allocate a name for a device
959  *	@net: network namespace to allocate the device name in
960  *	@name: name format string
961  *	@buf:  scratch buffer and result name string
962  *
963  *	Passed a format string - eg "lt%d" - it will try to find a suitable
964  *	id. It scans the list of devices to build up a free map, then chooses
965  *	the first empty slot. The caller must hold the dev_base or rtnl lock
966  *	while allocating the name and adding the device in order to avoid
967  *	duplicates.
968  *	Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
969  *	Returns the number of the unit assigned or a negative errno code.
970  */
971 
972 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
973 {
974 	int i = 0;
975 	const char *p;
976 	const int max_netdevices = 8*PAGE_SIZE;
977 	unsigned long *inuse;
978 	struct net_device *d;
979 
980 	p = strnchr(name, IFNAMSIZ-1, '%');
981 	if (p) {
982 		/*
983 		 * Verify the string as this thing may have come from
984 		 * the user.  There must be exactly one "%d" and no other "%"
985 		 * characters.
986 		 */
987 		if (p[1] != 'd' || strchr(p + 2, '%'))
988 			return -EINVAL;
989 
990 		/* Use one page as a bit array of possible slots */
991 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
992 		if (!inuse)
993 			return -ENOMEM;
994 
995 		for_each_netdev(net, d) {
996 			if (!sscanf(d->name, name, &i))
997 				continue;
998 			if (i < 0 || i >= max_netdevices)
999 				continue;
1000 
1001 			/*  avoid cases where sscanf is not exact inverse of printf */
1002 			snprintf(buf, IFNAMSIZ, name, i);
1003 			if (!strncmp(buf, d->name, IFNAMSIZ))
1004 				set_bit(i, inuse);
1005 		}
1006 
1007 		i = find_first_zero_bit(inuse, max_netdevices);
1008 		free_page((unsigned long) inuse);
1009 	}
1010 
1011 	if (buf != name)
1012 		snprintf(buf, IFNAMSIZ, name, i);
1013 	if (!__dev_get_by_name(net, buf))
1014 		return i;
1015 
1016 	/* It is possible to run out of possible slots
1017 	 * when the name is long and there isn't enough space left
1018 	 * for the digits, or if all bits are used.
1019 	 */
1020 	return -ENFILE;
1021 }
1022 
1023 /**
1024  *	dev_alloc_name - allocate a name for a device
1025  *	@dev: device
1026  *	@name: name format string
1027  *
1028  *	Passed a format string - eg "lt%d" - it will try to find a suitable
1029  *	id. It scans the list of devices to build up a free map, then chooses
1030  *	the first empty slot. The caller must hold the dev_base or rtnl lock
1031  *	while allocating the name and adding the device in order to avoid
1032  *	duplicates.
1033  *	Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
1034  *	Returns the number of the unit assigned or a negative errno code.
1035  */
1036 
1037 int dev_alloc_name(struct net_device *dev, const char *name)
1038 {
1039 	char buf[IFNAMSIZ];
1040 	struct net *net;
1041 	int ret;
1042 
1043 	BUG_ON(!dev_net(dev));
1044 	net = dev_net(dev);
1045 	ret = __dev_alloc_name(net, name, buf);
1046 	if (ret >= 0)
1047 		strlcpy(dev->name, buf, IFNAMSIZ);
1048 	return ret;
1049 }
1050 EXPORT_SYMBOL(dev_alloc_name);
1051 
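/*
 * Illustrative sketch (editor's addition): a driver that wants the next
 * free "eth%d" slot before registering its device.
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *	if (err < 0)
 *		return err;	// e.g. -EINVAL or -ENFILE
 *	// dev->name now holds something like "eth3"; err is the unit number
 */
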
1052 static int dev_alloc_name_ns(struct net *net,
1053 			     struct net_device *dev,
1054 			     const char *name)
1055 {
1056 	char buf[IFNAMSIZ];
1057 	int ret;
1058 
1059 	ret = __dev_alloc_name(net, name, buf);
1060 	if (ret >= 0)
1061 		strlcpy(dev->name, buf, IFNAMSIZ);
1062 	return ret;
1063 }
1064 
1065 static int dev_get_valid_name(struct net *net,
1066 			      struct net_device *dev,
1067 			      const char *name)
1068 {
1069 	BUG_ON(!net);
1070 
1071 	if (!dev_valid_name(name))
1072 		return -EINVAL;
1073 
1074 	if (strchr(name, '%'))
1075 		return dev_alloc_name_ns(net, dev, name);
1076 	else if (__dev_get_by_name(net, name))
1077 		return -EEXIST;
1078 	else if (dev->name != name)
1079 		strlcpy(dev->name, name, IFNAMSIZ);
1080 
1081 	return 0;
1082 }
1083 
1084 /**
1085  *	dev_change_name - change name of a device
1086  *	@dev: device
1087  *	@newname: name (or format string) must be at least IFNAMSIZ
1088  *
1089  *	Change the name of a device; a format string such as "eth%d"
1090  *	can be passed for wildcarding.
1091  */
1092 int dev_change_name(struct net_device *dev, const char *newname)
1093 {
1094 	unsigned char old_assign_type;
1095 	char oldname[IFNAMSIZ];
1096 	int err = 0;
1097 	int ret;
1098 	struct net *net;
1099 
1100 	ASSERT_RTNL();
1101 	BUG_ON(!dev_net(dev));
1102 
1103 	net = dev_net(dev);
1104 	if (dev->flags & IFF_UP)
1105 		return -EBUSY;
1106 
1107 	write_seqcount_begin(&devnet_rename_seq);
1108 
1109 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1110 		write_seqcount_end(&devnet_rename_seq);
1111 		return 0;
1112 	}
1113 
1114 	memcpy(oldname, dev->name, IFNAMSIZ);
1115 
1116 	err = dev_get_valid_name(net, dev, newname);
1117 	if (err < 0) {
1118 		write_seqcount_end(&devnet_rename_seq);
1119 		return err;
1120 	}
1121 
1122 	if (oldname[0] && !strchr(oldname, '%'))
1123 		netdev_info(dev, "renamed from %s\n", oldname);
1124 
1125 	old_assign_type = dev->name_assign_type;
1126 	dev->name_assign_type = NET_NAME_RENAMED;
1127 
1128 rollback:
1129 	ret = device_rename(&dev->dev, dev->name);
1130 	if (ret) {
1131 		memcpy(dev->name, oldname, IFNAMSIZ);
1132 		dev->name_assign_type = old_assign_type;
1133 		write_seqcount_end(&devnet_rename_seq);
1134 		return ret;
1135 	}
1136 
1137 	write_seqcount_end(&devnet_rename_seq);
1138 
1139 	netdev_adjacent_rename_links(dev, oldname);
1140 
1141 	write_lock_bh(&dev_base_lock);
1142 	hlist_del_rcu(&dev->name_hlist);
1143 	write_unlock_bh(&dev_base_lock);
1144 
1145 	synchronize_rcu();
1146 
1147 	write_lock_bh(&dev_base_lock);
1148 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1149 	write_unlock_bh(&dev_base_lock);
1150 
1151 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1152 	ret = notifier_to_errno(ret);
1153 
1154 	if (ret) {
1155 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1156 		if (err >= 0) {
1157 			err = ret;
1158 			write_seqcount_begin(&devnet_rename_seq);
1159 			memcpy(dev->name, oldname, IFNAMSIZ);
1160 			memcpy(oldname, newname, IFNAMSIZ);
1161 			dev->name_assign_type = old_assign_type;
1162 			old_assign_type = NET_NAME_RENAMED;
1163 			goto rollback;
1164 		} else {
1165 			pr_err("%s: name change rollback failed: %d\n",
1166 			       dev->name, ret);
1167 		}
1168 	}
1169 
1170 	return err;
1171 }
1172 
1173 /**
1174  *	dev_set_alias - change ifalias of a device
1175  *	@dev: device
1176  *	@alias: name up to IFALIASZ
1177  *	@len: limit of bytes to copy from @alias
1178  *
1179  *	Set the ifalias for a device.
1180  */
1181 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1182 {
1183 	char *new_ifalias;
1184 
1185 	ASSERT_RTNL();
1186 
1187 	if (len >= IFALIASZ)
1188 		return -EINVAL;
1189 
1190 	if (!len) {
1191 		kfree(dev->ifalias);
1192 		dev->ifalias = NULL;
1193 		return 0;
1194 	}
1195 
1196 	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1197 	if (!new_ifalias)
1198 		return -ENOMEM;
1199 	dev->ifalias = new_ifalias;
1200 
1201 	strlcpy(dev->ifalias, alias, len+1);
1202 	return len;
1203 }
1204 
1205 
1206 /**
1207  *	netdev_features_change - device changes features
1208  *	@dev: device to cause notification
1209  *
1210  *	Called to indicate a device has changed features.
1211  */
1212 void netdev_features_change(struct net_device *dev)
1213 {
1214 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1215 }
1216 EXPORT_SYMBOL(netdev_features_change);
1217 
1218 /**
1219  *	netdev_state_change - device changes state
1220  *	@dev: device to cause notification
1221  *
1222  *	Called to indicate a device has changed state. This function calls
1223  *	the notifier chains for netdev_chain and sends a NEWLINK message
1224  *	to the routing socket.
1225  */
1226 void netdev_state_change(struct net_device *dev)
1227 {
1228 	if (dev->flags & IFF_UP) {
1229 		struct netdev_notifier_change_info change_info;
1230 
1231 		change_info.flags_changed = 0;
1232 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1233 					      &change_info.info);
1234 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1235 	}
1236 }
1237 EXPORT_SYMBOL(netdev_state_change);
1238 
1239 /**
1240  * 	netdev_notify_peers - notify network peers about existence of @dev
1241  * 	@dev: network device
1242  *
1243  * Generate traffic such that interested network peers are aware of
1244  * @dev, such as by generating a gratuitous ARP. This may be used when
1245  * a device wants to inform the rest of the network about some sort of
1246  * reconfiguration such as a failover event or virtual machine
1247  * migration.
1248  */
1249 void netdev_notify_peers(struct net_device *dev)
1250 {
1251 	rtnl_lock();
1252 	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1253 	rtnl_unlock();
1254 }
1255 EXPORT_SYMBOL(netdev_notify_peers);
1256 
1257 static int __dev_open(struct net_device *dev)
1258 {
1259 	const struct net_device_ops *ops = dev->netdev_ops;
1260 	int ret;
1261 
1262 	ASSERT_RTNL();
1263 
1264 	if (!netif_device_present(dev))
1265 		return -ENODEV;
1266 
1267 	/* Block netpoll from trying to do any rx path servicing.
1268 	 * If we don't do this, there is a chance ndo_poll_controller
1269 	 * or ndo_poll may be running while we open the device.
1270 	 */
1271 	netpoll_poll_disable(dev);
1272 
1273 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1274 	ret = notifier_to_errno(ret);
1275 	if (ret)
1276 		return ret;
1277 
1278 	set_bit(__LINK_STATE_START, &dev->state);
1279 
1280 	if (ops->ndo_validate_addr)
1281 		ret = ops->ndo_validate_addr(dev);
1282 
1283 	if (!ret && ops->ndo_open)
1284 		ret = ops->ndo_open(dev);
1285 
1286 	netpoll_poll_enable(dev);
1287 
1288 	if (ret)
1289 		clear_bit(__LINK_STATE_START, &dev->state);
1290 	else {
1291 		dev->flags |= IFF_UP;
1292 		dev_set_rx_mode(dev);
1293 		dev_activate(dev);
1294 		add_device_randomness(dev->dev_addr, dev->addr_len);
1295 	}
1296 
1297 	return ret;
1298 }
1299 
1300 /**
1301  *	dev_open	- prepare an interface for use.
1302  *	@dev:	device to open
1303  *
1304  *	Takes a device from down to up state. The device's private open
1305  *	function is invoked and then the multicast lists are loaded. Finally
1306  *	the device is moved into the up state and a %NETDEV_UP message is
1307  *	sent to the netdev notifier chain.
1308  *
1309  *	Calling this function on an active interface is a nop. On a failure
1310  *	a negative errno code is returned.
1311  */
1312 int dev_open(struct net_device *dev)
1313 {
1314 	int ret;
1315 
1316 	if (dev->flags & IFF_UP)
1317 		return 0;
1318 
1319 	ret = __dev_open(dev);
1320 	if (ret < 0)
1321 		return ret;
1322 
1323 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1324 	call_netdevice_notifiers(NETDEV_UP, dev);
1325 
1326 	return ret;
1327 }
1328 EXPORT_SYMBOL(dev_open);
1329 
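/*
 * Illustrative sketch (editor's addition): dev_open() must be called with
 * the RTNL semaphore held, as __dev_open() asserts.
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 *	if (err)
 *		pr_err("failed to bring up %s: %d\n", dev->name, err);
 */
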
1330 static int __dev_close_many(struct list_head *head)
1331 {
1332 	struct net_device *dev;
1333 
1334 	ASSERT_RTNL();
1335 	might_sleep();
1336 
1337 	list_for_each_entry(dev, head, close_list) {
1338 		/* Temporarily disable netpoll until the interface is down */
1339 		netpoll_poll_disable(dev);
1340 
1341 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1342 
1343 		clear_bit(__LINK_STATE_START, &dev->state);
1344 
1345 		/* Synchronize to scheduled poll. We cannot touch the poll list; it
1346 		 * may even be on a different cpu. So just clear netif_running().
1347 		 *
1348 		 * dev->stop() will invoke napi_disable() on all of its
1349 		 * napi_struct instances on this device.
1350 		 */
1351 		smp_mb__after_atomic(); /* Commit netif_running(). */
1352 	}
1353 
1354 	dev_deactivate_many(head);
1355 
1356 	list_for_each_entry(dev, head, close_list) {
1357 		const struct net_device_ops *ops = dev->netdev_ops;
1358 
1359 		/*
1360 		 *	Call the device specific close. This cannot fail.
1361 		 *	It is only done if the device is UP.
1362 		 *
1363 		 *	We allow it to be called even after a DETACH hot-plug
1364 		 *	event.
1365 		 */
1366 		if (ops->ndo_stop)
1367 			ops->ndo_stop(dev);
1368 
1369 		dev->flags &= ~IFF_UP;
1370 		netpoll_poll_enable(dev);
1371 	}
1372 
1373 	return 0;
1374 }
1375 
1376 static int __dev_close(struct net_device *dev)
1377 {
1378 	int retval;
1379 	LIST_HEAD(single);
1380 
1381 	list_add(&dev->close_list, &single);
1382 	retval = __dev_close_many(&single);
1383 	list_del(&single);
1384 
1385 	return retval;
1386 }
1387 
1388 int dev_close_many(struct list_head *head, bool unlink)
1389 {
1390 	struct net_device *dev, *tmp;
1391 
1392 	/* Remove the devices that don't need to be closed */
1393 	list_for_each_entry_safe(dev, tmp, head, close_list)
1394 		if (!(dev->flags & IFF_UP))
1395 			list_del_init(&dev->close_list);
1396 
1397 	__dev_close_many(head);
1398 
1399 	list_for_each_entry_safe(dev, tmp, head, close_list) {
1400 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1401 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1402 		if (unlink)
1403 			list_del_init(&dev->close_list);
1404 	}
1405 
1406 	return 0;
1407 }
1408 EXPORT_SYMBOL(dev_close_many);
1409 
1410 /**
1411  *	dev_close - shutdown an interface.
1412  *	@dev: device to shutdown
1413  *
1414  *	This function moves an active device into down state. A
1415  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1416  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1417  *	chain.
1418  */
1419 int dev_close(struct net_device *dev)
1420 {
1421 	if (dev->flags & IFF_UP) {
1422 		LIST_HEAD(single);
1423 
1424 		list_add(&dev->close_list, &single);
1425 		dev_close_many(&single, true);
1426 		list_del(&single);
1427 	}
1428 	return 0;
1429 }
1430 EXPORT_SYMBOL(dev_close);
1431 
1432 
1433 /**
1434  *	dev_disable_lro - disable Large Receive Offload on a device
1435  *	@dev: device
1436  *
1437  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1438  *	called under RTNL.  This is needed if received packets may be
1439  *	forwarded to another interface.
1440  */
1441 void dev_disable_lro(struct net_device *dev)
1442 {
1443 	struct net_device *lower_dev;
1444 	struct list_head *iter;
1445 
1446 	dev->wanted_features &= ~NETIF_F_LRO;
1447 	netdev_update_features(dev);
1448 
1449 	if (unlikely(dev->features & NETIF_F_LRO))
1450 		netdev_WARN(dev, "failed to disable LRO!\n");
1451 
1452 	netdev_for_each_lower_dev(dev, lower_dev, iter)
1453 		dev_disable_lro(lower_dev);
1454 }
1455 EXPORT_SYMBOL(dev_disable_lro);
1456 
1457 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1458 				   struct net_device *dev)
1459 {
1460 	struct netdev_notifier_info info;
1461 
1462 	netdev_notifier_info_init(&info, dev);
1463 	return nb->notifier_call(nb, val, &info);
1464 }
1465 
1466 static int dev_boot_phase = 1;
1467 
1468 /**
1469  *	register_netdevice_notifier - register a network notifier block
1470  *	@nb: notifier
1471  *
1472  *	Register a notifier to be called when network device events occur.
1473  *	The notifier passed is linked into the kernel structures and must
1474  *	not be reused until it has been unregistered. A negative errno code
1475  *	is returned on a failure.
1476  *
1477  * 	When registered, all registration and up events are replayed
1478  *	to the new notifier to allow it to have a race-free
1479  *	view of the network device list.
1480  */
1481 
1482 int register_netdevice_notifier(struct notifier_block *nb)
1483 {
1484 	struct net_device *dev;
1485 	struct net_device *last;
1486 	struct net *net;
1487 	int err;
1488 
1489 	rtnl_lock();
1490 	err = raw_notifier_chain_register(&netdev_chain, nb);
1491 	if (err)
1492 		goto unlock;
1493 	if (dev_boot_phase)
1494 		goto unlock;
1495 	for_each_net(net) {
1496 		for_each_netdev(net, dev) {
1497 			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1498 			err = notifier_to_errno(err);
1499 			if (err)
1500 				goto rollback;
1501 
1502 			if (!(dev->flags & IFF_UP))
1503 				continue;
1504 
1505 			call_netdevice_notifier(nb, NETDEV_UP, dev);
1506 		}
1507 	}
1508 
1509 unlock:
1510 	rtnl_unlock();
1511 	return err;
1512 
1513 rollback:
1514 	last = dev;
1515 	for_each_net(net) {
1516 		for_each_netdev(net, dev) {
1517 			if (dev == last)
1518 				goto outroll;
1519 
1520 			if (dev->flags & IFF_UP) {
1521 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1522 							dev);
1523 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1524 			}
1525 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1526 		}
1527 	}
1528 
1529 outroll:
1530 	raw_notifier_chain_unregister(&netdev_chain, nb);
1531 	goto unlock;
1532 }
1533 EXPORT_SYMBOL(register_netdevice_notifier);
1534 
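/*
 * Illustrative sketch (editor's addition): a minimal notifier that logs
 * NETDEV_UP events.  Because registration replays REGISTER/UP events for
 * existing devices, the callback also fires for interfaces that were
 * already up.  Names prefixed "example_" are hypothetical.
 *
 *	static int example_netdev_event(struct notifier_block *nb,
 *					unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			netdev_info(dev, "device is up\n");
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block example_netdev_nb = {
 *		.notifier_call = example_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&example_netdev_nb);
 */
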
1535 /**
1536  *	unregister_netdevice_notifier - unregister a network notifier block
1537  *	@nb: notifier
1538  *
1539  *	Unregister a notifier previously registered by
1540  *	register_netdevice_notifier(). The notifier is unlinked from the
1541  *	kernel structures and may then be reused. A negative errno code
1542  *	is returned on a failure.
1543  *
1544  * 	After unregistering, unregister and down device events are synthesized
1545  *	for all devices on the device list and sent to the removed notifier to
1546  *	remove the need for special case cleanup code.
1547  */
1548 
1549 int unregister_netdevice_notifier(struct notifier_block *nb)
1550 {
1551 	struct net_device *dev;
1552 	struct net *net;
1553 	int err;
1554 
1555 	rtnl_lock();
1556 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1557 	if (err)
1558 		goto unlock;
1559 
1560 	for_each_net(net) {
1561 		for_each_netdev(net, dev) {
1562 			if (dev->flags & IFF_UP) {
1563 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1564 							dev);
1565 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1566 			}
1567 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1568 		}
1569 	}
1570 unlock:
1571 	rtnl_unlock();
1572 	return err;
1573 }
1574 EXPORT_SYMBOL(unregister_netdevice_notifier);
1575 
1576 /**
1577  *	call_netdevice_notifiers_info - call all network notifier blocks
1578  *	@val: value passed unmodified to notifier function
1579  *	@dev: net_device pointer passed unmodified to notifier function
1580  *	@info: notifier information data
1581  *
1582  *	Call all network notifier blocks.  Parameters and return value
1583  *	are as for raw_notifier_call_chain().
1584  */
1585 
1586 static int call_netdevice_notifiers_info(unsigned long val,
1587 					 struct net_device *dev,
1588 					 struct netdev_notifier_info *info)
1589 {
1590 	ASSERT_RTNL();
1591 	netdev_notifier_info_init(info, dev);
1592 	return raw_notifier_call_chain(&netdev_chain, val, info);
1593 }
1594 
1595 /**
1596  *	call_netdevice_notifiers - call all network notifier blocks
1597  *      @val: value passed unmodified to notifier function
1598  *      @dev: net_device pointer passed unmodified to notifier function
1599  *
1600  *	Call all network notifier blocks.  Parameters and return value
1601  *	are as for raw_notifier_call_chain().
1602  */
1603 
1604 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1605 {
1606 	struct netdev_notifier_info info;
1607 
1608 	return call_netdevice_notifiers_info(val, dev, &info);
1609 }
1610 EXPORT_SYMBOL(call_netdevice_notifiers);
1611 
1612 static struct static_key netstamp_needed __read_mostly;
1613 #ifdef HAVE_JUMP_LABEL
1614 /* We are not allowed to call static_key_slow_dec() from irq context.
1615  * If net_disable_timestamp() is called from irq context, defer the
1616  * static_key_slow_dec() calls.
1617  */
1618 static atomic_t netstamp_needed_deferred;
1619 #endif
1620 
1621 void net_enable_timestamp(void)
1622 {
1623 #ifdef HAVE_JUMP_LABEL
1624 	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1625 
1626 	if (deferred) {
1627 		while (--deferred)
1628 			static_key_slow_dec(&netstamp_needed);
1629 		return;
1630 	}
1631 #endif
1632 	static_key_slow_inc(&netstamp_needed);
1633 }
1634 EXPORT_SYMBOL(net_enable_timestamp);
1635 
1636 void net_disable_timestamp(void)
1637 {
1638 #ifdef HAVE_JUMP_LABEL
1639 	if (in_interrupt()) {
1640 		atomic_inc(&netstamp_needed_deferred);
1641 		return;
1642 	}
1643 #endif
1644 	static_key_slow_dec(&netstamp_needed);
1645 }
1646 EXPORT_SYMBOL(net_disable_timestamp);
1647 
1648 static inline void net_timestamp_set(struct sk_buff *skb)
1649 {
1650 	skb->tstamp.tv64 = 0;
1651 	if (static_key_false(&netstamp_needed))
1652 		__net_timestamp(skb);
1653 }
1654 
1655 #define net_timestamp_check(COND, SKB)			\
1656 	if (static_key_false(&netstamp_needed)) {		\
1657 		if ((COND) && !(SKB)->tstamp.tv64)	\
1658 			__net_timestamp(SKB);		\
1659 	}						\
1660 
1661 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1662 {
1663 	unsigned int len;
1664 
1665 	if (!(dev->flags & IFF_UP))
1666 		return false;
1667 
1668 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1669 	if (skb->len <= len)
1670 		return true;
1671 
1672 	/* if TSO is enabled, we don't care about the length as the packet
1673 	 * could be forwarded without being segmented beforehand
1674 	 */
1675 	if (skb_is_gso(skb))
1676 		return true;
1677 
1678 	return false;
1679 }
1680 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1681 
1682 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1683 {
1684 	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1685 		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1686 			atomic_long_inc(&dev->rx_dropped);
1687 			kfree_skb(skb);
1688 			return NET_RX_DROP;
1689 		}
1690 	}
1691 
1692 	if (unlikely(!is_skb_forwardable(dev, skb))) {
1693 		atomic_long_inc(&dev->rx_dropped);
1694 		kfree_skb(skb);
1695 		return NET_RX_DROP;
1696 	}
1697 
1698 	skb_scrub_packet(skb, true);
1699 	skb->priority = 0;
1700 	skb->protocol = eth_type_trans(skb, dev);
1701 	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1702 
1703 	return 0;
1704 }
1705 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1706 
1707 /**
1708  * dev_forward_skb - loopback an skb to another netif
1709  *
1710  * @dev: destination network device
1711  * @skb: buffer to forward
1712  *
1713  * return values:
1714  *	NET_RX_SUCCESS	(no congestion)
1715  *	NET_RX_DROP     (packet was dropped, but freed)
1716  *
1717  * dev_forward_skb can be used for injecting an skb from the
1718  * start_xmit function of one device into the receive queue
1719  * of another device.
1720  *
1721  * The receiving device may be in another namespace, so
1722  * we have to clear all information in the skb that could
1723  * impact namespace isolation.
1724  */
1725 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1726 {
1727 	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1728 }
1729 EXPORT_SYMBOL_GPL(dev_forward_skb);
1730 
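/*
 * Illustrative sketch (editor's addition): the classic caller of
 * dev_forward_skb() is a pair device (veth-like) whose ndo_start_xmit
 * hands each skb to its peer's receive path.  example_priv() and the
 * "peer" field are hypothetical.
 *
 *	static netdev_tx_t example_xmit(struct sk_buff *skb,
 *					struct net_device *dev)
 *	{
 *		struct net_device *peer = example_priv(dev)->peer;
 *
 *		if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS)
 *			dev->stats.tx_packets++;
 *		return NETDEV_TX_OK;
 *	}
 */
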
1731 static inline int deliver_skb(struct sk_buff *skb,
1732 			      struct packet_type *pt_prev,
1733 			      struct net_device *orig_dev)
1734 {
1735 	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1736 		return -ENOMEM;
1737 	atomic_inc(&skb->users);
1738 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1739 }
1740 
1741 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1742 					  struct packet_type **pt,
1743 					  struct net_device *orig_dev,
1744 					  __be16 type,
1745 					  struct list_head *ptype_list)
1746 {
1747 	struct packet_type *ptype, *pt_prev = *pt;
1748 
1749 	list_for_each_entry_rcu(ptype, ptype_list, list) {
1750 		if (ptype->type != type)
1751 			continue;
1752 		if (pt_prev)
1753 			deliver_skb(skb, pt_prev, orig_dev);
1754 		pt_prev = ptype;
1755 	}
1756 	*pt = pt_prev;
1757 }
1758 
1759 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1760 {
1761 	if (!ptype->af_packet_priv || !skb->sk)
1762 		return false;
1763 
1764 	if (ptype->id_match)
1765 		return ptype->id_match(ptype, skb->sk);
1766 	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1767 		return true;
1768 
1769 	return false;
1770 }
1771 
1772 /*
1773  *	Support routine. Sends outgoing frames to any network
1774  *	taps currently in use.
1775  */
1776 
1777 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1778 {
1779 	struct packet_type *ptype;
1780 	struct sk_buff *skb2 = NULL;
1781 	struct packet_type *pt_prev = NULL;
1782 	struct list_head *ptype_list = &ptype_all;
1783 
1784 	rcu_read_lock();
1785 again:
1786 	list_for_each_entry_rcu(ptype, ptype_list, list) {
1787 		/* Never send packets back to the socket
1788 		 * they originated from - MvS (miquels@drinkel.ow.org)
1789 		 */
1790 		if (skb_loop_sk(ptype, skb))
1791 			continue;
1792 
1793 		if (pt_prev) {
1794 			deliver_skb(skb2, pt_prev, skb->dev);
1795 			pt_prev = ptype;
1796 			continue;
1797 		}
1798 
1799 		/* need to clone skb, done only once */
1800 		skb2 = skb_clone(skb, GFP_ATOMIC);
1801 		if (!skb2)
1802 			goto out_unlock;
1803 
1804 		net_timestamp_set(skb2);
1805 
1806 		/* skb->nh should be correctly
1807 		 * set by sender, so that the second statement is
1808 		 * just protection against buggy protocols.
1809 		 */
1810 		skb_reset_mac_header(skb2);
1811 
1812 		if (skb_network_header(skb2) < skb2->data ||
1813 		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1814 			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1815 					     ntohs(skb2->protocol),
1816 					     dev->name);
1817 			skb_reset_network_header(skb2);
1818 		}
1819 
1820 		skb2->transport_header = skb2->network_header;
1821 		skb2->pkt_type = PACKET_OUTGOING;
1822 		pt_prev = ptype;
1823 	}
1824 
1825 	if (ptype_list == &ptype_all) {
1826 		ptype_list = &dev->ptype_all;
1827 		goto again;
1828 	}
1829 out_unlock:
1830 	if (pt_prev)
1831 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1832 	rcu_read_unlock();
1833 }
1834 
1835 /**
1836  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1837  * @dev: Network device
1838  * @txq: number of queues available
1839  *
1840  * If real_num_tx_queues is changed the tc mappings may no longer be
1841  * valid. To resolve this verify the tc mapping remains valid and if
1842  * not, zero the mapping. With no priorities mapping to this
1843  * offset/count pair it will no longer be used. In the worst case, if
1844  * TC0 is invalid, nothing can be done, so disable priority mappings.
1845  * It is expected that drivers will fix this mapping if they can
1846  * before calling netif_set_real_num_tx_queues.
1847  */
1848 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1849 {
1850 	int i;
1851 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1852 
1853 	/* If TC0 is invalidated disable TC mapping */
1854 	if (tc->offset + tc->count > txq) {
1855 		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1856 		dev->num_tc = 0;
1857 		return;
1858 	}
1859 
1860 	/* Invalidated prio to tc mappings set to TC0 */
1861 	/* Reset invalidated prio-to-tc mappings to TC0 */
1862 		int q = netdev_get_prio_tc_map(dev, i);
1863 
1864 		tc = &dev->tc_to_txq[q];
1865 		if (tc->offset + tc->count > txq) {
1866 			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1867 				i, q);
1868 			netdev_set_prio_tc_map(dev, i, 0);
1869 		}
1870 	}
1871 }
1872 
1873 #ifdef CONFIG_XPS
1874 static DEFINE_MUTEX(xps_map_mutex);
1875 #define xmap_dereference(P)		\
1876 	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1877 
1878 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1879 					int cpu, u16 index)
1880 {
1881 	struct xps_map *map = NULL;
1882 	int pos;
1883 
1884 	if (dev_maps)
1885 		map = xmap_dereference(dev_maps->cpu_map[cpu]);
1886 
1887 	for (pos = 0; map && pos < map->len; pos++) {
1888 		if (map->queues[pos] == index) {
1889 			if (map->len > 1) {
1890 				map->queues[pos] = map->queues[--map->len];
1891 			} else {
1892 				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1893 				kfree_rcu(map, rcu);
1894 				map = NULL;
1895 			}
1896 			break;
1897 		}
1898 	}
1899 
1900 	return map;
1901 }
1902 
1903 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1904 {
1905 	struct xps_dev_maps *dev_maps;
1906 	int cpu, i;
1907 	bool active = false;
1908 
1909 	mutex_lock(&xps_map_mutex);
1910 	dev_maps = xmap_dereference(dev->xps_maps);
1911 
1912 	if (!dev_maps)
1913 		goto out_no_maps;
1914 
1915 	for_each_possible_cpu(cpu) {
1916 		for (i = index; i < dev->num_tx_queues; i++) {
1917 			if (!remove_xps_queue(dev_maps, cpu, i))
1918 				break;
1919 		}
1920 		if (i == dev->num_tx_queues)
1921 			active = true;
1922 	}
1923 
1924 	if (!active) {
1925 		RCU_INIT_POINTER(dev->xps_maps, NULL);
1926 		kfree_rcu(dev_maps, rcu);
1927 	}
1928 
1929 	for (i = index; i < dev->num_tx_queues; i++)
1930 		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1931 					     NUMA_NO_NODE);
1932 
1933 out_no_maps:
1934 	mutex_unlock(&xps_map_mutex);
1935 }
1936 
1937 static struct xps_map *expand_xps_map(struct xps_map *map,
1938 				      int cpu, u16 index)
1939 {
1940 	struct xps_map *new_map;
1941 	int alloc_len = XPS_MIN_MAP_ALLOC;
1942 	int i, pos;
1943 
1944 	for (pos = 0; map && pos < map->len; pos++) {
1945 		if (map->queues[pos] != index)
1946 			continue;
1947 		return map;
1948 	}
1949 
1950 	/* Need to add queue to this CPU's existing map */
1951 	if (map) {
1952 		if (pos < map->alloc_len)
1953 			return map;
1954 
1955 		alloc_len = map->alloc_len * 2;
1956 	}
1957 
1958 	/* Need to allocate new map to store queue on this CPU's map */
1959 	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1960 			       cpu_to_node(cpu));
1961 	if (!new_map)
1962 		return NULL;
1963 
1964 	for (i = 0; i < pos; i++)
1965 		new_map->queues[i] = map->queues[i];
1966 	new_map->alloc_len = alloc_len;
1967 	new_map->len = pos;
1968 
1969 	return new_map;
1970 }
1971 
1972 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1973 			u16 index)
1974 {
1975 	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1976 	struct xps_map *map, *new_map;
1977 	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1978 	int cpu, numa_node_id = -2;
1979 	bool active = false;
1980 
1981 	mutex_lock(&xps_map_mutex);
1982 
1983 	dev_maps = xmap_dereference(dev->xps_maps);
1984 
1985 	/* allocate memory for queue storage */
1986 	for_each_online_cpu(cpu) {
1987 		if (!cpumask_test_cpu(cpu, mask))
1988 			continue;
1989 
1990 		if (!new_dev_maps)
1991 			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1992 		if (!new_dev_maps) {
1993 			mutex_unlock(&xps_map_mutex);
1994 			return -ENOMEM;
1995 		}
1996 
1997 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1998 				 NULL;
1999 
2000 		map = expand_xps_map(map, cpu, index);
2001 		if (!map)
2002 			goto error;
2003 
2004 		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2005 	}
2006 
2007 	if (!new_dev_maps)
2008 		goto out_no_new_maps;
2009 
2010 	for_each_possible_cpu(cpu) {
2011 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2012 			/* add queue to CPU maps */
2013 			int pos = 0;
2014 
2015 			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2016 			while ((pos < map->len) && (map->queues[pos] != index))
2017 				pos++;
2018 
2019 			if (pos == map->len)
2020 				map->queues[map->len++] = index;
2021 #ifdef CONFIG_NUMA
2022 			if (numa_node_id == -2)
2023 				numa_node_id = cpu_to_node(cpu);
2024 			else if (numa_node_id != cpu_to_node(cpu))
2025 				numa_node_id = -1;
2026 #endif
2027 		} else if (dev_maps) {
2028 			/* fill in the new device map from the old device map */
2029 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
2030 			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2031 		}
2032 
2033 	}
2034 
2035 	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2036 
2037 	/* Cleanup old maps */
2038 	if (dev_maps) {
2039 		for_each_possible_cpu(cpu) {
2040 			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2041 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
2042 			if (map && map != new_map)
2043 				kfree_rcu(map, rcu);
2044 		}
2045 
2046 		kfree_rcu(dev_maps, rcu);
2047 	}
2048 
2049 	dev_maps = new_dev_maps;
2050 	active = true;
2051 
2052 out_no_new_maps:
2053 	/* update Tx queue numa node */
2054 	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2055 				     (numa_node_id >= 0) ? numa_node_id :
2056 				     NUMA_NO_NODE);
2057 
2058 	if (!dev_maps)
2059 		goto out_no_maps;
2060 
2061 	/* remove queue from any CPU no longer in the mask */
2062 	for_each_possible_cpu(cpu) {
2063 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2064 			continue;
2065 
2066 		if (remove_xps_queue(dev_maps, cpu, index))
2067 			active = true;
2068 	}
2069 
2070 	/* free map if not active */
2071 	if (!active) {
2072 		RCU_INIT_POINTER(dev->xps_maps, NULL);
2073 		kfree_rcu(dev_maps, rcu);
2074 	}
2075 
2076 out_no_maps:
2077 	mutex_unlock(&xps_map_mutex);
2078 
2079 	return 0;
2080 error:
2081 	/* remove any maps that we added */
2082 	for_each_possible_cpu(cpu) {
2083 		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2084 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2085 				 NULL;
2086 		if (new_map && new_map != map)
2087 			kfree(new_map);
2088 	}
2089 
2090 	mutex_unlock(&xps_map_mutex);
2091 
2092 	kfree(new_dev_maps);
2093 	return -ENOMEM;
2094 }
2095 EXPORT_SYMBOL(netif_set_xps_queue);
2096 
2097 #endif
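
/*
 * Illustrative sketch (editor's example, not part of this file): with XPS
 * enabled, a multiqueue driver could establish a simple one-CPU-per-queue
 * transmit steering policy with netif_set_xps_queue().  The helper name
 * "example_setup_xps" is invented; error handling is deliberately minimal.
 */
static void example_setup_xps(struct net_device *dev)
{
	int cpu, err;
	u16 qid = 0;

	for_each_online_cpu(cpu) {
		if (qid >= dev->real_num_tx_queues)
			break;
		/* steer transmits issued on this CPU to tx queue "qid" */
		err = netif_set_xps_queue(dev, cpumask_of(cpu), qid);
		if (err)
			netdev_warn(dev, "XPS setup failed for queue %u: %d\n",
				    qid, err);
		qid++;
	}
}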
2098 /*
2099  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2100  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2101  */
2102 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2103 {
2104 	int rc;
2105 
2106 	if (txq < 1 || txq > dev->num_tx_queues)
2107 		return -EINVAL;
2108 
2109 	if (dev->reg_state == NETREG_REGISTERED ||
2110 	    dev->reg_state == NETREG_UNREGISTERING) {
2111 		ASSERT_RTNL();
2112 
2113 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2114 						  txq);
2115 		if (rc)
2116 			return rc;
2117 
2118 		if (dev->num_tc)
2119 			netif_setup_tc(dev, txq);
2120 
2121 		if (txq < dev->real_num_tx_queues) {
2122 			qdisc_reset_all_tx_gt(dev, txq);
2123 #ifdef CONFIG_XPS
2124 			netif_reset_xps_queues_gt(dev, txq);
2125 #endif
2126 		}
2127 	}
2128 
2129 	dev->real_num_tx_queues = txq;
2130 	return 0;
2131 }
2132 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
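
/*
 * Illustrative sketch (editor's example): a driver reconfiguring its
 * channel count (for instance from an ethtool set_channels handler, which
 * runs under rtnl_lock) would shrink or grow the visible tx queue count
 * like this.  "example_set_tx_channels" and "new_count" are invented names.
 */
static int example_set_tx_channels(struct net_device *dev, unsigned int new_count)
{
	int err;

	ASSERT_RTNL();

	if (new_count < 1 || new_count > dev->num_tx_queues)
		return -EINVAL;

	err = netif_set_real_num_tx_queues(dev, new_count);
	if (err)
		return err;

	/* reprogram the hardware queues for the new count here */
	return 0;
}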
2133 
2134 #ifdef CONFIG_SYSFS
2135 /**
2136  *	netif_set_real_num_rx_queues - set actual number of RX queues used
2137  *	@dev: Network device
2138  *	@rxq: Actual number of RX queues
2139  *
2140  *	This must be called either with the rtnl_lock held or before
2141  *	registration of the net device.  Returns 0 on success, or a
2142  *	negative error code.  If called before registration, it always
2143  *	succeeds.
2144  */
2145 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2146 {
2147 	int rc;
2148 
2149 	if (rxq < 1 || rxq > dev->num_rx_queues)
2150 		return -EINVAL;
2151 
2152 	if (dev->reg_state == NETREG_REGISTERED) {
2153 		ASSERT_RTNL();
2154 
2155 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2156 						  rxq);
2157 		if (rc)
2158 			return rc;
2159 	}
2160 
2161 	dev->real_num_rx_queues = rxq;
2162 	return 0;
2163 }
2164 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2165 #endif
2166 
2167 /**
2168  * netif_get_num_default_rss_queues - default number of RSS queues
2169  *
2170  * This routine should set an upper limit on the number of RSS queues
2171  * used by default by multiqueue devices.
2172  */
2173 int netif_get_num_default_rss_queues(void)
2174 {
2175 	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2176 }
2177 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
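
/*
 * Illustrative sketch (editor's example): a driver picking its queue count
 * at probe time would typically clamp what the hardware supports by this
 * default, so that large machines do not allocate an excessive number of
 * RSS queues.  EXAMPLE_HW_MAX_QUEUES is a made-up constant.
 */
#define EXAMPLE_HW_MAX_QUEUES 64

static unsigned int example_pick_num_queues(void)
{
	return min_t(unsigned int, EXAMPLE_HW_MAX_QUEUES,
		     netif_get_num_default_rss_queues());
}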
2178 
2179 static inline void __netif_reschedule(struct Qdisc *q)
2180 {
2181 	struct softnet_data *sd;
2182 	unsigned long flags;
2183 
2184 	local_irq_save(flags);
2185 	sd = this_cpu_ptr(&softnet_data);
2186 	q->next_sched = NULL;
2187 	*sd->output_queue_tailp = q;
2188 	sd->output_queue_tailp = &q->next_sched;
2189 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2190 	local_irq_restore(flags);
2191 }
2192 
2193 void __netif_schedule(struct Qdisc *q)
2194 {
2195 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2196 		__netif_reschedule(q);
2197 }
2198 EXPORT_SYMBOL(__netif_schedule);
2199 
2200 struct dev_kfree_skb_cb {
2201 	enum skb_free_reason reason;
2202 };
2203 
2204 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2205 {
2206 	return (struct dev_kfree_skb_cb *)skb->cb;
2207 }
2208 
2209 void netif_schedule_queue(struct netdev_queue *txq)
2210 {
2211 	rcu_read_lock();
2212 	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2213 		struct Qdisc *q = rcu_dereference(txq->qdisc);
2214 
2215 		__netif_schedule(q);
2216 	}
2217 	rcu_read_unlock();
2218 }
2219 EXPORT_SYMBOL(netif_schedule_queue);
2220 
2221 /**
2222  *	netif_wake_subqueue - allow sending packets on subqueue
2223  *	@dev: network device
2224  *	@queue_index: sub queue index
2225  *
2226  * Resume individual transmit queue of a device with multiple transmit queues.
2227  */
2228 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2229 {
2230 	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2231 
2232 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2233 		struct Qdisc *q;
2234 
2235 		rcu_read_lock();
2236 		q = rcu_dereference(txq->qdisc);
2237 		__netif_schedule(q);
2238 		rcu_read_unlock();
2239 	}
2240 }
2241 EXPORT_SYMBOL(netif_wake_subqueue);
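
/*
 * Illustrative sketch (editor's example): the usual pairing of
 * netif_stop_subqueue()/netif_wake_subqueue() in a multiqueue driver.
 * The ring-occupancy helpers below are hypothetical and stubbed so the
 * sketch is self-contained.
 */
static bool example_ring_full(struct net_device *dev, u16 qid) { return false; }
static bool example_ring_room(struct net_device *dev, u16 qid) { return true; }

static netdev_tx_t example_mq_xmit(struct sk_buff *skb, struct net_device *dev)
{
	u16 qid = skb_get_queue_mapping(skb);

	/* ... post the skb to the hardware ring for queue "qid" ... */

	if (example_ring_full(dev, qid))
		netif_stop_subqueue(dev, qid);
	return NETDEV_TX_OK;
}

static void example_mq_tx_clean(struct net_device *dev, u16 qid)
{
	/* ... reclaim completed descriptors ... */

	if (__netif_subqueue_stopped(dev, qid) && example_ring_room(dev, qid))
		netif_wake_subqueue(dev, qid);
}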
2242 
2243 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2244 {
2245 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2246 		struct Qdisc *q;
2247 
2248 		rcu_read_lock();
2249 		q = rcu_dereference(dev_queue->qdisc);
2250 		__netif_schedule(q);
2251 		rcu_read_unlock();
2252 	}
2253 }
2254 EXPORT_SYMBOL(netif_tx_wake_queue);
2255 
2256 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2257 {
2258 	unsigned long flags;
2259 
2260 	if (likely(atomic_read(&skb->users) == 1)) {
2261 		smp_rmb();
2262 		atomic_set(&skb->users, 0);
2263 	} else if (likely(!atomic_dec_and_test(&skb->users))) {
2264 		return;
2265 	}
2266 	get_kfree_skb_cb(skb)->reason = reason;
2267 	local_irq_save(flags);
2268 	skb->next = __this_cpu_read(softnet_data.completion_queue);
2269 	__this_cpu_write(softnet_data.completion_queue, skb);
2270 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2271 	local_irq_restore(flags);
2272 }
2273 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2274 
2275 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2276 {
2277 	if (in_irq() || irqs_disabled())
2278 		__dev_kfree_skb_irq(skb, reason);
2279 	else
2280 		dev_kfree_skb(skb);
2281 }
2282 EXPORT_SYMBOL(__dev_kfree_skb_any);
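
/*
 * Illustrative sketch (editor's example): dev_kfree_skb_any() and
 * dev_consume_skb_any() wrap __dev_kfree_skb_any() and are safe to call
 * from hard-irq tx-completion handlers as well as process context.
 * "example_tx_irq_clean" is a hypothetical completion routine.
 */
static void example_tx_irq_clean(struct net_device *dev, struct sk_buff *skb,
				 bool transmitted_ok)
{
	if (transmitted_ok)
		dev_consume_skb_any(skb);	/* normal completion, not a drop */
	else
		dev_kfree_skb_any(skb);		/* error path, counts as a drop */
}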
2283 
2284 
2285 /**
2286  * netif_device_detach - mark device as removed
2287  * @dev: network device
2288  *
2289  * Mark device as removed from the system and therefore no longer available.
2290  */
2291 void netif_device_detach(struct net_device *dev)
2292 {
2293 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2294 	    netif_running(dev)) {
2295 		netif_tx_stop_all_queues(dev);
2296 	}
2297 }
2298 EXPORT_SYMBOL(netif_device_detach);
2299 
2300 /**
2301  * netif_device_attach - mark device as attached
2302  * @dev: network device
2303  *
2304  * Mark device as attached to the system and restart it if needed.
2305  */
2306 void netif_device_attach(struct net_device *dev)
2307 {
2308 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2309 	    netif_running(dev)) {
2310 		netif_tx_wake_all_queues(dev);
2311 		__netdev_watchdog_up(dev);
2312 	}
2313 }
2314 EXPORT_SYMBOL(netif_device_attach);
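
/*
 * Illustrative sketch (editor's example): the classic pairing of
 * netif_device_detach()/netif_device_attach() in a driver's power
 * management callbacks.  "example_suspend"/"example_resume" and the
 * hardware steps are invented for this example.
 */
static int example_suspend(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	netif_device_detach(dev);	/* stops all tx queues if running */
	/* ... quiesce and power down the hardware ... */
	return 0;
}

static int example_resume(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	/* ... power up and reprogram the hardware ... */
	netif_device_attach(dev);	/* wakes queues and rearms the watchdog */
	return 0;
}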
2315 
2316 static void skb_warn_bad_offload(const struct sk_buff *skb)
2317 {
2318 	static const netdev_features_t null_features = 0;
2319 	struct net_device *dev = skb->dev;
2320 	const char *driver = "";
2321 
2322 	if (!net_ratelimit())
2323 		return;
2324 
2325 	if (dev && dev->dev.parent)
2326 		driver = dev_driver_string(dev->dev.parent);
2327 
2328 	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2329 	     "gso_type=%d ip_summed=%d\n",
2330 	     driver, dev ? &dev->features : &null_features,
2331 	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2332 	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2333 	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2334 }
2335 
2336 /*
2337  * Invalidate hardware checksum when packet is to be mangled, and
2338  * complete checksum manually on outgoing path.
2339  */
2340 int skb_checksum_help(struct sk_buff *skb)
2341 {
2342 	__wsum csum;
2343 	int ret = 0, offset;
2344 
2345 	if (skb->ip_summed == CHECKSUM_COMPLETE)
2346 		goto out_set_summed;
2347 
2348 	if (unlikely(skb_shinfo(skb)->gso_size)) {
2349 		skb_warn_bad_offload(skb);
2350 		return -EINVAL;
2351 	}
2352 
2353 	/* Before computing a checksum, we should make sure no frag could
2354 	 * be modified by an external entity: the checksum could be wrong.
2355 	 */
2356 	if (skb_has_shared_frag(skb)) {
2357 		ret = __skb_linearize(skb);
2358 		if (ret)
2359 			goto out;
2360 	}
2361 
2362 	offset = skb_checksum_start_offset(skb);
2363 	BUG_ON(offset >= skb_headlen(skb));
2364 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2365 
2366 	offset += skb->csum_offset;
2367 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2368 
2369 	if (skb_cloned(skb) &&
2370 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2371 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2372 		if (ret)
2373 			goto out;
2374 	}
2375 
2376 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
2377 out_set_summed:
2378 	skb->ip_summed = CHECKSUM_NONE;
2379 out:
2380 	return ret;
2381 }
2382 EXPORT_SYMBOL(skb_checksum_help);
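
/*
 * Illustrative sketch (editor's example): a driver whose hardware cannot
 * checksum a given packet can fall back to skb_checksum_help() in its
 * xmit path before handing the frame to the NIC.  The capability check
 * "example_hw_can_csum" is hypothetical and stubbed here.
 */
static bool example_hw_can_csum(const struct sk_buff *skb) { return false; }

static int example_tx_csum(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    !example_hw_can_csum(skb)) {
		/* resolve the checksum in software; on failure the
		 * caller should drop the packet
		 */
		if (skb_checksum_help(skb))
			return -EINVAL;
	}
	return 0;
}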
2383 
2384 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2385 {
2386 	__be16 type = skb->protocol;
2387 
2388 	/* Tunnel gso handlers can set protocol to ethernet. */
2389 	if (type == htons(ETH_P_TEB)) {
2390 		struct ethhdr *eth;
2391 
2392 		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2393 			return 0;
2394 
2395 		eth = (struct ethhdr *)skb_mac_header(skb);
2396 		type = eth->h_proto;
2397 	}
2398 
2399 	return __vlan_get_protocol(skb, type, depth);
2400 }
2401 
2402 /**
2403  *	skb_mac_gso_segment - mac layer segmentation handler.
2404  *	@skb: buffer to segment
2405  *	@features: features for the output path (see dev->features)
2406  */
2407 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2408 				    netdev_features_t features)
2409 {
2410 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2411 	struct packet_offload *ptype;
2412 	int vlan_depth = skb->mac_len;
2413 	__be16 type = skb_network_protocol(skb, &vlan_depth);
2414 
2415 	if (unlikely(!type))
2416 		return ERR_PTR(-EINVAL);
2417 
2418 	__skb_pull(skb, vlan_depth);
2419 
2420 	rcu_read_lock();
2421 	list_for_each_entry_rcu(ptype, &offload_base, list) {
2422 		if (ptype->type == type && ptype->callbacks.gso_segment) {
2423 			segs = ptype->callbacks.gso_segment(skb, features);
2424 			break;
2425 		}
2426 	}
2427 	rcu_read_unlock();
2428 
2429 	__skb_push(skb, skb->data - skb_mac_header(skb));
2430 
2431 	return segs;
2432 }
2433 EXPORT_SYMBOL(skb_mac_gso_segment);
2434 
2435 
2436 /* openvswitch calls this on rx path, so we need a different check.
2437  */
2438 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2439 {
2440 	if (tx_path)
2441 		return skb->ip_summed != CHECKSUM_PARTIAL;
2442 	else
2443 		return skb->ip_summed == CHECKSUM_NONE;
2444 }
2445 
2446 /**
2447  *	__skb_gso_segment - Perform segmentation on skb.
2448  *	@skb: buffer to segment
2449  *	@features: features for the output path (see dev->features)
2450  *	@tx_path: whether it is called in TX path
2451  *
2452  *	This function segments the given skb and returns a list of segments.
2453  *
2454  *	It may return NULL if the skb requires no segmentation.  This is
2455  *	only possible when GSO is used for verifying header integrity.
2456  */
2457 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2458 				  netdev_features_t features, bool tx_path)
2459 {
2460 	if (unlikely(skb_needs_check(skb, tx_path))) {
2461 		int err;
2462 
2463 		skb_warn_bad_offload(skb);
2464 
2465 		err = skb_cow_head(skb, 0);
2466 		if (err < 0)
2467 			return ERR_PTR(err);
2468 	}
2469 
2470 	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2471 	SKB_GSO_CB(skb)->encap_level = 0;
2472 
2473 	skb_reset_mac_header(skb);
2474 	skb_reset_mac_len(skb);
2475 
2476 	return skb_mac_gso_segment(skb, features);
2477 }
2478 EXPORT_SYMBOL(__skb_gso_segment);
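
/*
 * Illustrative sketch (editor's example): software GSO as done by callers
 * that cannot hand a GSO skb to hardware.  skb_gso_segment() is the
 * tx-path wrapper around __skb_gso_segment(); the caller walks the
 * returned segment list and transmits each piece.  "example_xmit_one" is
 * a hypothetical per-segment transmit helper, stubbed for self-containment.
 */
static int example_xmit_one(struct sk_buff *skb)
{
	/* stub: a real driver would hand the skb to its tx ring */
	kfree_skb(skb);
	return 0;
}

static int example_software_gso(struct sk_buff *skb, netdev_features_t features)
{
	struct sk_buff *segs, *nskb;

	segs = skb_gso_segment(skb, features);
	if (IS_ERR(segs)) {
		kfree_skb(skb);
		return PTR_ERR(segs);
	}
	if (!segs)
		return example_xmit_one(skb);	/* no segmentation was needed */

	consume_skb(skb);			/* original skb is no longer needed */
	while (segs) {
		nskb = segs;
		segs = segs->next;
		nskb->next = NULL;
		example_xmit_one(nskb);
	}
	return 0;
}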
2479 
2480 /* Take action when hardware reception checksum errors are detected. */
2481 #ifdef CONFIG_BUG
2482 void netdev_rx_csum_fault(struct net_device *dev)
2483 {
2484 	if (net_ratelimit()) {
2485 		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2486 		dump_stack();
2487 	}
2488 }
2489 EXPORT_SYMBOL(netdev_rx_csum_fault);
2490 #endif
2491 
2492 /* Actually, we should eliminate this check as soon as we know that:
2493  * 1. An IOMMU is present and can map all of memory.
2494  * 2. No high memory really exists on this machine.
2495  */
2496 
2497 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2498 {
2499 #ifdef CONFIG_HIGHMEM
2500 	int i;
2501 	if (!(dev->features & NETIF_F_HIGHDMA)) {
2502 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2503 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2504 			if (PageHighMem(skb_frag_page(frag)))
2505 				return 1;
2506 		}
2507 	}
2508 
2509 	if (PCI_DMA_BUS_IS_PHYS) {
2510 		struct device *pdev = dev->dev.parent;
2511 
2512 		if (!pdev)
2513 			return 0;
2514 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2515 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2516 			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2517 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2518 				return 1;
2519 		}
2520 	}
2521 #endif
2522 	return 0;
2523 }
2524 
2525 /* If this is an MPLS offload request, verify we are testing hardware MPLS
2526  * features instead of the standard features for the netdev.
2527  */
2528 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2529 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2530 					   netdev_features_t features,
2531 					   __be16 type)
2532 {
2533 	if (eth_p_mpls(type))
2534 		features &= skb->dev->mpls_features;
2535 
2536 	return features;
2537 }
2538 #else
2539 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2540 					   netdev_features_t features,
2541 					   __be16 type)
2542 {
2543 	return features;
2544 }
2545 #endif
2546 
2547 static netdev_features_t harmonize_features(struct sk_buff *skb,
2548 	netdev_features_t features)
2549 {
2550 	int tmp;
2551 	__be16 type;
2552 
2553 	type = skb_network_protocol(skb, &tmp);
2554 	features = net_mpls_features(skb, features, type);
2555 
2556 	if (skb->ip_summed != CHECKSUM_NONE &&
2557 	    !can_checksum_protocol(features, type)) {
2558 		features &= ~NETIF_F_ALL_CSUM;
2559 	} else if (illegal_highdma(skb->dev, skb)) {
2560 		features &= ~NETIF_F_SG;
2561 	}
2562 
2563 	return features;
2564 }
2565 
2566 netdev_features_t passthru_features_check(struct sk_buff *skb,
2567 					  struct net_device *dev,
2568 					  netdev_features_t features)
2569 {
2570 	return features;
2571 }
2572 EXPORT_SYMBOL(passthru_features_check);
2573 
2574 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2575 					     struct net_device *dev,
2576 					     netdev_features_t features)
2577 {
2578 	return vlan_features_check(skb, features);
2579 }
2580 
2581 netdev_features_t netif_skb_features(struct sk_buff *skb)
2582 {
2583 	struct net_device *dev = skb->dev;
2584 	netdev_features_t features = dev->features;
2585 	u16 gso_segs = skb_shinfo(skb)->gso_segs;
2586 
2587 	if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2588 		features &= ~NETIF_F_GSO_MASK;
2589 
2590 	/* If this is an encapsulation offload request, verify we are
2591 	 * testing hardware encapsulation features instead of the
2592 	 * standard features for the netdev.
2593 	 */
2594 	if (skb->encapsulation)
2595 		features &= dev->hw_enc_features;
2596 
2597 	if (skb_vlan_tagged(skb))
2598 		features = netdev_intersect_features(features,
2599 						     dev->vlan_features |
2600 						     NETIF_F_HW_VLAN_CTAG_TX |
2601 						     NETIF_F_HW_VLAN_STAG_TX);
2602 
2603 	if (dev->netdev_ops->ndo_features_check)
2604 		features &= dev->netdev_ops->ndo_features_check(skb, dev,
2605 								features);
2606 	else
2607 		features &= dflt_features_check(skb, dev, features);
2608 
2609 	return harmonize_features(skb, features);
2610 }
2611 EXPORT_SYMBOL(netif_skb_features);
2612 
2613 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2614 		    struct netdev_queue *txq, bool more)
2615 {
2616 	unsigned int len;
2617 	int rc;
2618 
2619 	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2620 		dev_queue_xmit_nit(skb, dev);
2621 
2622 	len = skb->len;
2623 	trace_net_dev_start_xmit(skb, dev);
2624 	rc = netdev_start_xmit(skb, dev, txq, more);
2625 	trace_net_dev_xmit(skb, rc, dev, len);
2626 
2627 	return rc;
2628 }
2629 
2630 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2631 				    struct netdev_queue *txq, int *ret)
2632 {
2633 	struct sk_buff *skb = first;
2634 	int rc = NETDEV_TX_OK;
2635 
2636 	while (skb) {
2637 		struct sk_buff *next = skb->next;
2638 
2639 		skb->next = NULL;
2640 		rc = xmit_one(skb, dev, txq, next != NULL);
2641 		if (unlikely(!dev_xmit_complete(rc))) {
2642 			skb->next = next;
2643 			goto out;
2644 		}
2645 
2646 		skb = next;
2647 		if (netif_xmit_stopped(txq) && skb) {
2648 			rc = NETDEV_TX_BUSY;
2649 			break;
2650 		}
2651 	}
2652 
2653 out:
2654 	*ret = rc;
2655 	return skb;
2656 }
2657 
2658 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2659 					  netdev_features_t features)
2660 {
2661 	if (skb_vlan_tag_present(skb) &&
2662 	    !vlan_hw_offload_capable(features, skb->vlan_proto))
2663 		skb = __vlan_hwaccel_push_inside(skb);
2664 	return skb;
2665 }
2666 
2667 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2668 {
2669 	netdev_features_t features;
2670 
2671 	if (skb->next)
2672 		return skb;
2673 
2674 	features = netif_skb_features(skb);
2675 	skb = validate_xmit_vlan(skb, features);
2676 	if (unlikely(!skb))
2677 		goto out_null;
2678 
2679 	if (netif_needs_gso(dev, skb, features)) {
2680 		struct sk_buff *segs;
2681 
2682 		segs = skb_gso_segment(skb, features);
2683 		if (IS_ERR(segs)) {
2684 			goto out_kfree_skb;
2685 		} else if (segs) {
2686 			consume_skb(skb);
2687 			skb = segs;
2688 		}
2689 	} else {
2690 		if (skb_needs_linearize(skb, features) &&
2691 		    __skb_linearize(skb))
2692 			goto out_kfree_skb;
2693 
2694 		/* If packet is not checksummed and device does not
2695 		 * support checksumming for this protocol, complete
2696 		 * checksumming here.
2697 		 */
2698 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
2699 			if (skb->encapsulation)
2700 				skb_set_inner_transport_header(skb,
2701 							       skb_checksum_start_offset(skb));
2702 			else
2703 				skb_set_transport_header(skb,
2704 							 skb_checksum_start_offset(skb));
2705 			if (!(features & NETIF_F_ALL_CSUM) &&
2706 			    skb_checksum_help(skb))
2707 				goto out_kfree_skb;
2708 		}
2709 	}
2710 
2711 	return skb;
2712 
2713 out_kfree_skb:
2714 	kfree_skb(skb);
2715 out_null:
2716 	return NULL;
2717 }
2718 
2719 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2720 {
2721 	struct sk_buff *next, *head = NULL, *tail;
2722 
2723 	for (; skb != NULL; skb = next) {
2724 		next = skb->next;
2725 		skb->next = NULL;
2726 
2727 		/* in case skb won't be segmented, point to itself */
2728 		skb->prev = skb;
2729 
2730 		skb = validate_xmit_skb(skb, dev);
2731 		if (!skb)
2732 			continue;
2733 
2734 		if (!head)
2735 			head = skb;
2736 		else
2737 			tail->next = skb;
2738 		/* If skb was segmented, skb->prev points to
2739 		 * the last segment. If not, it still contains skb.
2740 		 */
2741 		tail = skb->prev;
2742 	}
2743 	return head;
2744 }
2745 
2746 static void qdisc_pkt_len_init(struct sk_buff *skb)
2747 {
2748 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
2749 
2750 	qdisc_skb_cb(skb)->pkt_len = skb->len;
2751 
2752 	/* To get a more precise estimate of bytes sent on the wire,
2753 	 * we add the header size of all segments to pkt_len.
2754 	 */
2755 	if (shinfo->gso_size)  {
2756 		unsigned int hdr_len;
2757 		u16 gso_segs = shinfo->gso_segs;
2758 
2759 		/* mac layer + network layer */
2760 		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2761 
2762 		/* + transport layer */
2763 		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2764 			hdr_len += tcp_hdrlen(skb);
2765 		else
2766 			hdr_len += sizeof(struct udphdr);
2767 
2768 		if (shinfo->gso_type & SKB_GSO_DODGY)
2769 			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2770 						shinfo->gso_size);
2771 
2772 		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2773 	}
2774 }
2775 
2776 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2777 				 struct net_device *dev,
2778 				 struct netdev_queue *txq)
2779 {
2780 	spinlock_t *root_lock = qdisc_lock(q);
2781 	bool contended;
2782 	int rc;
2783 
2784 	qdisc_pkt_len_init(skb);
2785 	qdisc_calculate_pkt_len(skb, q);
2786 	/*
2787 	 * Heuristic to force contended enqueues to serialize on a
2788 	 * separate lock before trying to get qdisc main lock.
2789 	 * This permits __QDISC___STATE_RUNNING owner to get the lock more
2790 	 * often and dequeue packets faster.
2791 	 */
2792 	contended = qdisc_is_running(q);
2793 	if (unlikely(contended))
2794 		spin_lock(&q->busylock);
2795 
2796 	spin_lock(root_lock);
2797 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2798 		kfree_skb(skb);
2799 		rc = NET_XMIT_DROP;
2800 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2801 		   qdisc_run_begin(q)) {
2802 		/*
2803 		 * This is a work-conserving queue; there are no old skbs
2804 		 * waiting to be sent out; and the qdisc is not running -
2805 		 * xmit the skb directly.
2806 		 */
2807 
2808 		qdisc_bstats_update(q, skb);
2809 
2810 		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2811 			if (unlikely(contended)) {
2812 				spin_unlock(&q->busylock);
2813 				contended = false;
2814 			}
2815 			__qdisc_run(q);
2816 		} else
2817 			qdisc_run_end(q);
2818 
2819 		rc = NET_XMIT_SUCCESS;
2820 	} else {
2821 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2822 		if (qdisc_run_begin(q)) {
2823 			if (unlikely(contended)) {
2824 				spin_unlock(&q->busylock);
2825 				contended = false;
2826 			}
2827 			__qdisc_run(q);
2828 		}
2829 	}
2830 	spin_unlock(root_lock);
2831 	if (unlikely(contended))
2832 		spin_unlock(&q->busylock);
2833 	return rc;
2834 }
2835 
2836 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2837 static void skb_update_prio(struct sk_buff *skb)
2838 {
2839 	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2840 
2841 	if (!skb->priority && skb->sk && map) {
2842 		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2843 
2844 		if (prioidx < map->priomap_len)
2845 			skb->priority = map->priomap[prioidx];
2846 	}
2847 }
2848 #else
2849 #define skb_update_prio(skb)
2850 #endif
2851 
2852 static DEFINE_PER_CPU(int, xmit_recursion);
2853 #define RECURSION_LIMIT 10
2854 
2855 /**
2856  *	dev_loopback_xmit - loop back @skb
2857  *	@skb: buffer to transmit
2858  */
2859 int dev_loopback_xmit(struct sk_buff *skb)
2860 {
2861 	skb_reset_mac_header(skb);
2862 	__skb_pull(skb, skb_network_offset(skb));
2863 	skb->pkt_type = PACKET_LOOPBACK;
2864 	skb->ip_summed = CHECKSUM_UNNECESSARY;
2865 	WARN_ON(!skb_dst(skb));
2866 	skb_dst_force(skb);
2867 	netif_rx_ni(skb);
2868 	return 0;
2869 }
2870 EXPORT_SYMBOL(dev_loopback_xmit);
2871 
2872 /**
2873  *	__dev_queue_xmit - transmit a buffer
2874  *	@skb: buffer to transmit
2875  *	@accel_priv: private data used for L2 forwarding offload
2876  *
2877  *	Queue a buffer for transmission to a network device. The caller must
2878  *	have set the device and priority and built the buffer before calling
2879  *	this function. The function can be called from an interrupt.
2880  *
2881  *	A negative errno code is returned on a failure. A success does not
2882  *	guarantee the frame will be transmitted as it may be dropped due
2883  *	to congestion or traffic shaping.
2884  *
2885  * -----------------------------------------------------------------------------------
2886  *      I notice this method can also return errors from the queue disciplines,
2887  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2888  *      be positive.
2889  *
2890  *      Regardless of the return value, the skb is consumed, so it is currently
2891  *      difficult to retry a send to this method.  (You can bump the ref count
2892  *      before sending to hold a reference for retry if you are careful.)
2893  *
2894  *      When calling this method, interrupts MUST be enabled.  This is because
2895  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2896  *          --BLG
2897  */
2898 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
2899 {
2900 	struct net_device *dev = skb->dev;
2901 	struct netdev_queue *txq;
2902 	struct Qdisc *q;
2903 	int rc = -ENOMEM;
2904 
2905 	skb_reset_mac_header(skb);
2906 
2907 	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
2908 		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
2909 
2910 	/* Disable soft irqs for various locks below. Also
2911 	 * stops preemption for RCU.
2912 	 */
2913 	rcu_read_lock_bh();
2914 
2915 	skb_update_prio(skb);
2916 
2917 	/* If the device/qdisc doesn't need skb->dst, release it right now
2918 	 * while it's hot in this CPU's cache.
2919 	 */
2920 	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2921 		skb_dst_drop(skb);
2922 	else
2923 		skb_dst_force(skb);
2924 
2925 	txq = netdev_pick_tx(dev, skb, accel_priv);
2926 	q = rcu_dereference_bh(txq->qdisc);
2927 
2928 #ifdef CONFIG_NET_CLS_ACT
2929 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2930 #endif
2931 	trace_net_dev_queue(skb);
2932 	if (q->enqueue) {
2933 		rc = __dev_xmit_skb(skb, q, dev, txq);
2934 		goto out;
2935 	}
2936 
2937 	/* The device has no queue. Common case for software devices:
2938 	   loopback, all the sorts of tunnels...
2939 
2940 	   Really, it is unlikely that netif_tx_lock protection is necessary
2941 	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2942 	   counters.)
2943 	   However, it is possible that they rely on the protection
2944 	   we provide here.
2945 
2946 	   Check this and take the lock. It is not prone to deadlocks.
2947 	   Alternatively, shoot the noqueue qdisc; that is even simpler 8)
2948 	 */
2949 	if (dev->flags & IFF_UP) {
2950 		int cpu = smp_processor_id(); /* ok because BHs are off */
2951 
2952 		if (txq->xmit_lock_owner != cpu) {
2953 
2954 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2955 				goto recursion_alert;
2956 
2957 			skb = validate_xmit_skb(skb, dev);
2958 			if (!skb)
2959 				goto drop;
2960 
2961 			HARD_TX_LOCK(dev, txq, cpu);
2962 
2963 			if (!netif_xmit_stopped(txq)) {
2964 				__this_cpu_inc(xmit_recursion);
2965 				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
2966 				__this_cpu_dec(xmit_recursion);
2967 				if (dev_xmit_complete(rc)) {
2968 					HARD_TX_UNLOCK(dev, txq);
2969 					goto out;
2970 				}
2971 			}
2972 			HARD_TX_UNLOCK(dev, txq);
2973 			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2974 					     dev->name);
2975 		} else {
2976 			/* Recursion is detected! It is possible,
2977 			 * unfortunately
2978 			 */
2979 recursion_alert:
2980 			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2981 					     dev->name);
2982 		}
2983 	}
2984 
2985 	rc = -ENETDOWN;
2986 drop:
2987 	rcu_read_unlock_bh();
2988 
2989 	atomic_long_inc(&dev->tx_dropped);
2990 	kfree_skb_list(skb);
2991 	return rc;
2992 out:
2993 	rcu_read_unlock_bh();
2994 	return rc;
2995 }
2996 
2997 int dev_queue_xmit(struct sk_buff *skb)
2998 {
2999 	return __dev_queue_xmit(skb, NULL);
3000 }
3001 EXPORT_SYMBOL(dev_queue_xmit);
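
/*
 * Illustrative sketch (editor's example): transmitting a raw Ethernet
 * frame built inside the kernel.  The payload, the EtherType 0x88b5
 * (a local experimental value) and the helper name are invented for this
 * example; a real caller must hold a reference on "dev".
 */
static int example_send_raw_frame(struct net_device *dev, const u8 *dst_mac,
				  const void *data, size_t len)
{
	struct sk_buff *skb;

	skb = alloc_skb(ETH_HLEN + len + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_reset_network_header(skb);
	memcpy(skb_put(skb, len), data, len);

	skb->dev = dev;
	skb->protocol = htons(0x88b5);

	/* prepend the link-layer header */
	if (dev_hard_header(skb, dev, 0x88b5, dst_mac, dev->dev_addr,
			    skb->len) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/* may return positive NET_XMIT_* codes, as noted in the comment above */
	return dev_queue_xmit(skb);
}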
3002 
3003 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3004 {
3005 	return __dev_queue_xmit(skb, accel_priv);
3006 }
3007 EXPORT_SYMBOL(dev_queue_xmit_accel);
3008 
3009 
3010 /*=======================================================================
3011 			Receiver routines
3012   =======================================================================*/
3013 
3014 int netdev_max_backlog __read_mostly = 1000;
3015 EXPORT_SYMBOL(netdev_max_backlog);
3016 
3017 int netdev_tstamp_prequeue __read_mostly = 1;
3018 int netdev_budget __read_mostly = 300;
3019 int weight_p __read_mostly = 64;            /* old backlog weight */
3020 
3021 /* Called with irq disabled */
3022 static inline void ____napi_schedule(struct softnet_data *sd,
3023 				     struct napi_struct *napi)
3024 {
3025 	list_add_tail(&napi->poll_list, &sd->poll_list);
3026 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3027 }
3028 
3029 #ifdef CONFIG_RPS
3030 
3031 /* One global table that all flow-based protocols share. */
3032 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3033 EXPORT_SYMBOL(rps_sock_flow_table);
3034 u32 rps_cpu_mask __read_mostly;
3035 EXPORT_SYMBOL(rps_cpu_mask);
3036 
3037 struct static_key rps_needed __read_mostly;
3038 
3039 static struct rps_dev_flow *
3040 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3041 	    struct rps_dev_flow *rflow, u16 next_cpu)
3042 {
3043 	if (next_cpu != RPS_NO_CPU) {
3044 #ifdef CONFIG_RFS_ACCEL
3045 		struct netdev_rx_queue *rxqueue;
3046 		struct rps_dev_flow_table *flow_table;
3047 		struct rps_dev_flow *old_rflow;
3048 		u32 flow_id;
3049 		u16 rxq_index;
3050 		int rc;
3051 
3052 		/* Should we steer this flow to a different hardware queue? */
3053 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3054 		    !(dev->features & NETIF_F_NTUPLE))
3055 			goto out;
3056 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3057 		if (rxq_index == skb_get_rx_queue(skb))
3058 			goto out;
3059 
3060 		rxqueue = dev->_rx + rxq_index;
3061 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
3062 		if (!flow_table)
3063 			goto out;
3064 		flow_id = skb_get_hash(skb) & flow_table->mask;
3065 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3066 							rxq_index, flow_id);
3067 		if (rc < 0)
3068 			goto out;
3069 		old_rflow = rflow;
3070 		rflow = &flow_table->flows[flow_id];
3071 		rflow->filter = rc;
3072 		if (old_rflow->filter == rflow->filter)
3073 			old_rflow->filter = RPS_NO_FILTER;
3074 	out:
3075 #endif
3076 		rflow->last_qtail =
3077 			per_cpu(softnet_data, next_cpu).input_queue_head;
3078 	}
3079 
3080 	rflow->cpu = next_cpu;
3081 	return rflow;
3082 }
3083 
3084 /*
3085  * get_rps_cpu is called from netif_receive_skb and returns the target
3086  * CPU from the RPS map of the receiving queue for a given skb.
3087  * rcu_read_lock must be held on entry.
3088  */
3089 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3090 		       struct rps_dev_flow **rflowp)
3091 {
3092 	const struct rps_sock_flow_table *sock_flow_table;
3093 	struct netdev_rx_queue *rxqueue = dev->_rx;
3094 	struct rps_dev_flow_table *flow_table;
3095 	struct rps_map *map;
3096 	int cpu = -1;
3097 	u32 tcpu;
3098 	u32 hash;
3099 
3100 	if (skb_rx_queue_recorded(skb)) {
3101 		u16 index = skb_get_rx_queue(skb);
3102 
3103 		if (unlikely(index >= dev->real_num_rx_queues)) {
3104 			WARN_ONCE(dev->real_num_rx_queues > 1,
3105 				  "%s received packet on queue %u, but number "
3106 				  "of RX queues is %u\n",
3107 				  dev->name, index, dev->real_num_rx_queues);
3108 			goto done;
3109 		}
3110 		rxqueue += index;
3111 	}
3112 
3113 	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3114 
3115 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3116 	map = rcu_dereference(rxqueue->rps_map);
3117 	if (!flow_table && !map)
3118 		goto done;
3119 
3120 	skb_reset_network_header(skb);
3121 	hash = skb_get_hash(skb);
3122 	if (!hash)
3123 		goto done;
3124 
3125 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3126 	if (flow_table && sock_flow_table) {
3127 		struct rps_dev_flow *rflow;
3128 		u32 next_cpu;
3129 		u32 ident;
3130 
3131 		/* First check into global flow table if there is a match */
3132 		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3133 		if ((ident ^ hash) & ~rps_cpu_mask)
3134 			goto try_rps;
3135 
3136 		next_cpu = ident & rps_cpu_mask;
3137 
3138 		/* OK, now we know there is a match,
3139 		 * we can look at the local (per receive queue) flow table
3140 		 */
3141 		rflow = &flow_table->flows[hash & flow_table->mask];
3142 		tcpu = rflow->cpu;
3143 
3144 		/*
3145 		 * If the desired CPU (where last recvmsg was done) is
3146 		 * different from current CPU (one in the rx-queue flow
3147 		 * table entry), switch if one of the following holds:
3148 		 *   - Current CPU is unset (equal to RPS_NO_CPU).
3149 		 *   - Current CPU is offline.
3150 		 *   - The current CPU's queue tail has advanced beyond the
3151 		 *     last packet that was enqueued using this table entry.
3152 		 *     This guarantees that all previous packets for the flow
3153 	 *     have been dequeued, thus preserving in-order delivery.
3154 		 */
3155 		if (unlikely(tcpu != next_cpu) &&
3156 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3157 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3158 		      rflow->last_qtail)) >= 0)) {
3159 			tcpu = next_cpu;
3160 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3161 		}
3162 
3163 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3164 			*rflowp = rflow;
3165 			cpu = tcpu;
3166 			goto done;
3167 		}
3168 	}
3169 
3170 try_rps:
3171 
3172 	if (map) {
3173 		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3174 		if (cpu_online(tcpu)) {
3175 			cpu = tcpu;
3176 			goto done;
3177 		}
3178 	}
3179 
3180 done:
3181 	return cpu;
3182 }
3183 
3184 #ifdef CONFIG_RFS_ACCEL
3185 
3186 /**
3187  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3188  * @dev: Device on which the filter was set
3189  * @rxq_index: RX queue index
3190  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3191  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3192  *
3193  * Drivers that implement ndo_rx_flow_steer() should periodically call
3194  * this function for each installed filter and remove the filters for
3195  * which it returns %true.
3196  */
3197 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3198 			 u32 flow_id, u16 filter_id)
3199 {
3200 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3201 	struct rps_dev_flow_table *flow_table;
3202 	struct rps_dev_flow *rflow;
3203 	bool expire = true;
3204 	int cpu;
3205 
3206 	rcu_read_lock();
3207 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3208 	if (flow_table && flow_id <= flow_table->mask) {
3209 		rflow = &flow_table->flows[flow_id];
3210 		cpu = ACCESS_ONCE(rflow->cpu);
3211 		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3212 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3213 			   rflow->last_qtail) <
3214 		     (int)(10 * flow_table->mask)))
3215 			expire = false;
3216 	}
3217 	rcu_read_unlock();
3218 	return expire;
3219 }
3220 EXPORT_SYMBOL(rps_may_expire_flow);
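
/*
 * Illustrative sketch (editor's example): the periodic expiry scan that
 * the kerneldoc above asks aRFS-capable drivers to run.  The per-driver
 * filter table (struct example_filter) and the hardware removal step are
 * invented; only rps_may_expire_flow() is real.
 */
struct example_filter {
	u32 flow_id;	/* value passed to ndo_rx_flow_steer() */
	u16 filter_id;	/* value returned by ndo_rx_flow_steer() */
	u16 rxq_index;
	bool in_use;
};

static void example_arfs_expire(struct net_device *dev,
				struct example_filter *tbl, unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (!tbl[i].in_use)
			continue;
		if (rps_may_expire_flow(dev, tbl[i].rxq_index,
					tbl[i].flow_id, tbl[i].filter_id)) {
			/* ... ask the hardware to remove the steering rule ... */
			tbl[i].in_use = false;
		}
	}
}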
3221 
3222 #endif /* CONFIG_RFS_ACCEL */
3223 
3224 /* Called from hardirq (IPI) context */
3225 static void rps_trigger_softirq(void *data)
3226 {
3227 	struct softnet_data *sd = data;
3228 
3229 	____napi_schedule(sd, &sd->backlog);
3230 	sd->received_rps++;
3231 }
3232 
3233 #endif /* CONFIG_RPS */
3234 
3235 /*
3236  * Check if this softnet_data structure belongs to another CPU.
3237  * If yes, queue it to our IPI list and return 1.
3238  * If no, return 0.
3239  */
3240 static int rps_ipi_queued(struct softnet_data *sd)
3241 {
3242 #ifdef CONFIG_RPS
3243 	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3244 
3245 	if (sd != mysd) {
3246 		sd->rps_ipi_next = mysd->rps_ipi_list;
3247 		mysd->rps_ipi_list = sd;
3248 
3249 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3250 		return 1;
3251 	}
3252 #endif /* CONFIG_RPS */
3253 	return 0;
3254 }
3255 
3256 #ifdef CONFIG_NET_FLOW_LIMIT
3257 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3258 #endif
3259 
3260 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3261 {
3262 #ifdef CONFIG_NET_FLOW_LIMIT
3263 	struct sd_flow_limit *fl;
3264 	struct softnet_data *sd;
3265 	unsigned int old_flow, new_flow;
3266 
3267 	if (qlen < (netdev_max_backlog >> 1))
3268 		return false;
3269 
3270 	sd = this_cpu_ptr(&softnet_data);
3271 
3272 	rcu_read_lock();
3273 	fl = rcu_dereference(sd->flow_limit);
3274 	if (fl) {
3275 		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3276 		old_flow = fl->history[fl->history_head];
3277 		fl->history[fl->history_head] = new_flow;
3278 
3279 		fl->history_head++;
3280 		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3281 
3282 		if (likely(fl->buckets[old_flow]))
3283 			fl->buckets[old_flow]--;
3284 
3285 		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3286 			fl->count++;
3287 			rcu_read_unlock();
3288 			return true;
3289 		}
3290 	}
3291 	rcu_read_unlock();
3292 #endif
3293 	return false;
3294 }
3295 
3296 /*
3297  * enqueue_to_backlog is called to queue an skb to a per-CPU backlog
3298  * queue (may be a remote CPU queue).
3299  */
3300 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3301 			      unsigned int *qtail)
3302 {
3303 	struct softnet_data *sd;
3304 	unsigned long flags;
3305 	unsigned int qlen;
3306 
3307 	sd = &per_cpu(softnet_data, cpu);
3308 
3309 	local_irq_save(flags);
3310 
3311 	rps_lock(sd);
3312 	qlen = skb_queue_len(&sd->input_pkt_queue);
3313 	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3314 		if (qlen) {
3315 enqueue:
3316 			__skb_queue_tail(&sd->input_pkt_queue, skb);
3317 			input_queue_tail_incr_save(sd, qtail);
3318 			rps_unlock(sd);
3319 			local_irq_restore(flags);
3320 			return NET_RX_SUCCESS;
3321 		}
3322 
3323 		/* Schedule NAPI for the backlog device.
3324 		 * We can use a non-atomic operation since we own the queue lock.
3325 		 */
3326 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3327 			if (!rps_ipi_queued(sd))
3328 				____napi_schedule(sd, &sd->backlog);
3329 		}
3330 		goto enqueue;
3331 	}
3332 
3333 	sd->dropped++;
3334 	rps_unlock(sd);
3335 
3336 	local_irq_restore(flags);
3337 
3338 	atomic_long_inc(&skb->dev->rx_dropped);
3339 	kfree_skb(skb);
3340 	return NET_RX_DROP;
3341 }
3342 
3343 static int netif_rx_internal(struct sk_buff *skb)
3344 {
3345 	int ret;
3346 
3347 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3348 
3349 	trace_netif_rx(skb);
3350 #ifdef CONFIG_RPS
3351 	if (static_key_false(&rps_needed)) {
3352 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3353 		int cpu;
3354 
3355 		preempt_disable();
3356 		rcu_read_lock();
3357 
3358 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3359 		if (cpu < 0)
3360 			cpu = smp_processor_id();
3361 
3362 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3363 
3364 		rcu_read_unlock();
3365 		preempt_enable();
3366 	} else
3367 #endif
3368 	{
3369 		unsigned int qtail;
3370 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3371 		put_cpu();
3372 	}
3373 	return ret;
3374 }
3375 
3376 /**
3377  *	netif_rx	-	post buffer to the network code
3378  *	@skb: buffer to post
3379  *
3380  *	This function receives a packet from a device driver and queues it for
3381  *	the upper (protocol) levels to process.  It always succeeds. The buffer
3382  *	may be dropped during processing for congestion control or by the
3383  *	protocol layers.
3384  *
3385  *	return values:
3386  *	NET_RX_SUCCESS	(no congestion)
3387  *	NET_RX_DROP     (packet was dropped)
3388  *
3389  */
3390 
3391 int netif_rx(struct sk_buff *skb)
3392 {
3393 	trace_netif_rx_entry(skb);
3394 
3395 	return netif_rx_internal(skb);
3396 }
3397 EXPORT_SYMBOL(netif_rx);
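
/*
 * Illustrative sketch (editor's example): the canonical receive path of a
 * non-NAPI driver feeding netif_rx() from its interrupt handler.  The
 * function name and the copy-from-hardware step are invented.
 */
static void example_rx_irq(struct net_device *dev, const void *hw_buf,
			   unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	memcpy(skb_put(skb, len), hw_buf, len);	/* copy the frame out of the NIC */
	skb->protocol = eth_type_trans(skb, dev);	/* also sets skb->dev and pkt_type */

	dev->stats.rx_packets++;
	dev->stats.rx_bytes += len;

	netif_rx(skb);	/* queue for protocol processing; may drop under load */
}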
3398 
3399 int netif_rx_ni(struct sk_buff *skb)
3400 {
3401 	int err;
3402 
3403 	trace_netif_rx_ni_entry(skb);
3404 
3405 	preempt_disable();
3406 	err = netif_rx_internal(skb);
3407 	if (local_softirq_pending())
3408 		do_softirq();
3409 	preempt_enable();
3410 
3411 	return err;
3412 }
3413 EXPORT_SYMBOL(netif_rx_ni);
3414 
3415 static void net_tx_action(struct softirq_action *h)
3416 {
3417 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3418 
3419 	if (sd->completion_queue) {
3420 		struct sk_buff *clist;
3421 
3422 		local_irq_disable();
3423 		clist = sd->completion_queue;
3424 		sd->completion_queue = NULL;
3425 		local_irq_enable();
3426 
3427 		while (clist) {
3428 			struct sk_buff *skb = clist;
3429 			clist = clist->next;
3430 
3431 			WARN_ON(atomic_read(&skb->users));
3432 			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3433 				trace_consume_skb(skb);
3434 			else
3435 				trace_kfree_skb(skb, net_tx_action);
3436 			__kfree_skb(skb);
3437 		}
3438 	}
3439 
3440 	if (sd->output_queue) {
3441 		struct Qdisc *head;
3442 
3443 		local_irq_disable();
3444 		head = sd->output_queue;
3445 		sd->output_queue = NULL;
3446 		sd->output_queue_tailp = &sd->output_queue;
3447 		local_irq_enable();
3448 
3449 		while (head) {
3450 			struct Qdisc *q = head;
3451 			spinlock_t *root_lock;
3452 
3453 			head = head->next_sched;
3454 
3455 			root_lock = qdisc_lock(q);
3456 			if (spin_trylock(root_lock)) {
3457 				smp_mb__before_atomic();
3458 				clear_bit(__QDISC_STATE_SCHED,
3459 					  &q->state);
3460 				qdisc_run(q);
3461 				spin_unlock(root_lock);
3462 			} else {
3463 				if (!test_bit(__QDISC_STATE_DEACTIVATED,
3464 					      &q->state)) {
3465 					__netif_reschedule(q);
3466 				} else {
3467 					smp_mb__before_atomic();
3468 					clear_bit(__QDISC_STATE_SCHED,
3469 						  &q->state);
3470 				}
3471 			}
3472 		}
3473 	}
3474 }
3475 
3476 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3477     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3478 /* This hook is defined here for ATM LANE */
3479 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3480 			     unsigned char *addr) __read_mostly;
3481 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3482 #endif
3483 
3484 #ifdef CONFIG_NET_CLS_ACT
3485 /* TODO: Maybe we should just force sch_ingress to be compiled in
3486  * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
3487  * instructions (a compare and two extra stores) when sch_ingress is
3488  * not enabled but CONFIG_NET_CLS_ACT is.
3489  * NOTE: This doesn't stop any functionality; if you don't have
3490  * the ingress scheduler, you just can't add policies on ingress.
3491  *
3492  */
3493 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3494 {
3495 	struct net_device *dev = skb->dev;
3496 	u32 ttl = G_TC_RTTL(skb->tc_verd);
3497 	int result = TC_ACT_OK;
3498 	struct Qdisc *q;
3499 
3500 	if (unlikely(MAX_RED_LOOP < ttl++)) {
3501 		net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3502 				     skb->skb_iif, dev->ifindex);
3503 		return TC_ACT_SHOT;
3504 	}
3505 
3506 	skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3507 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3508 
3509 	q = rcu_dereference(rxq->qdisc);
3510 	if (q != &noop_qdisc) {
3511 		spin_lock(qdisc_lock(q));
3512 		if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3513 			result = qdisc_enqueue_root(skb, q);
3514 		spin_unlock(qdisc_lock(q));
3515 	}
3516 
3517 	return result;
3518 }
3519 
3520 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3521 					 struct packet_type **pt_prev,
3522 					 int *ret, struct net_device *orig_dev)
3523 {
3524 	struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3525 
3526 	if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc)
3527 		goto out;
3528 
3529 	if (*pt_prev) {
3530 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3531 		*pt_prev = NULL;
3532 	}
3533 
3534 	switch (ing_filter(skb, rxq)) {
3535 	case TC_ACT_SHOT:
3536 	case TC_ACT_STOLEN:
3537 		kfree_skb(skb);
3538 		return NULL;
3539 	}
3540 
3541 out:
3542 	skb->tc_verd = 0;
3543 	return skb;
3544 }
3545 #endif
3546 
3547 /**
3548  *	netdev_rx_handler_register - register receive handler
3549  *	@dev: device to register a handler for
3550  *	@rx_handler: receive handler to register
3551  *	@rx_handler_data: data pointer that is used by rx handler
3552  *
3553  *	Register a receive handler for a device. This handler will then be
3554  *	called from __netif_receive_skb. A negative errno code is returned
3555  *	on a failure.
3556  *
3557  *	The caller must hold the rtnl_mutex.
3558  *
3559  *	For a general description of rx_handler, see enum rx_handler_result.
3560  */
3561 int netdev_rx_handler_register(struct net_device *dev,
3562 			       rx_handler_func_t *rx_handler,
3563 			       void *rx_handler_data)
3564 {
3565 	ASSERT_RTNL();
3566 
3567 	if (dev->rx_handler)
3568 		return -EBUSY;
3569 
3570 	/* Note: rx_handler_data must be set before rx_handler */
3571 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3572 	rcu_assign_pointer(dev->rx_handler, rx_handler);
3573 
3574 	return 0;
3575 }
3576 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3577 
3578 /**
3579  *	netdev_rx_handler_unregister - unregister receive handler
3580  *	@dev: device to unregister a handler from
3581  *
3582  *	Unregister a receive handler from a device.
3583  *
3584  *	The caller must hold the rtnl_mutex.
3585  */
3586 void netdev_rx_handler_unregister(struct net_device *dev)
3587 {
3588 
3589 	ASSERT_RTNL();
3590 	RCU_INIT_POINTER(dev->rx_handler, NULL);
3591 	/* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
3592 	 * section is guaranteed to see a non-NULL rx_handler_data
3593 	 * as well.
3594 	 */
3595 	synchronize_net();
3596 	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3597 }
3598 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
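
/*
 * Illustrative sketch (editor's example): a minimal rx_handler of the kind
 * that bridging or bonding registers via netdev_rx_handler_register().
 * The handler and attach function names are invented.
 */
static rx_handler_result_t example_rx_handler(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;

	/* the rx_handler_data set at registration time is reachable via
	 * rcu_dereference(skb->dev->rx_handler_data)
	 */
	if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
		return RX_HANDLER_PASS;	/* let the normal stack see it */

	/* a real handler might steer the skb to an upper device and return
	 * RX_HANDLER_ANOTHER, or queue it and return RX_HANDLER_CONSUMED
	 */
	return RX_HANDLER_PASS;
}

static int example_attach(struct net_device *dev, void *priv)
{
	int err;

	rtnl_lock();
	err = netdev_rx_handler_register(dev, example_rx_handler, priv);
	rtnl_unlock();
	return err;
}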
3599 
3600 /*
3601  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3602  * the special handling of PFMEMALLOC skbs.
3603  */
3604 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3605 {
3606 	switch (skb->protocol) {
3607 	case htons(ETH_P_ARP):
3608 	case htons(ETH_P_IP):
3609 	case htons(ETH_P_IPV6):
3610 	case htons(ETH_P_8021Q):
3611 	case htons(ETH_P_8021AD):
3612 		return true;
3613 	default:
3614 		return false;
3615 	}
3616 }
3617 
3618 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3619 {
3620 	struct packet_type *ptype, *pt_prev;
3621 	rx_handler_func_t *rx_handler;
3622 	struct net_device *orig_dev;
3623 	bool deliver_exact = false;
3624 	int ret = NET_RX_DROP;
3625 	__be16 type;
3626 
3627 	net_timestamp_check(!netdev_tstamp_prequeue, skb);
3628 
3629 	trace_netif_receive_skb(skb);
3630 
3631 	orig_dev = skb->dev;
3632 
3633 	skb_reset_network_header(skb);
3634 	if (!skb_transport_header_was_set(skb))
3635 		skb_reset_transport_header(skb);
3636 	skb_reset_mac_len(skb);
3637 
3638 	pt_prev = NULL;
3639 
3640 	rcu_read_lock();
3641 
3642 another_round:
3643 	skb->skb_iif = skb->dev->ifindex;
3644 
3645 	__this_cpu_inc(softnet_data.processed);
3646 
3647 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3648 	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3649 		skb = skb_vlan_untag(skb);
3650 		if (unlikely(!skb))
3651 			goto unlock;
3652 	}
3653 
3654 #ifdef CONFIG_NET_CLS_ACT
3655 	if (skb->tc_verd & TC_NCLS) {
3656 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3657 		goto ncls;
3658 	}
3659 #endif
3660 
3661 	if (pfmemalloc)
3662 		goto skip_taps;
3663 
3664 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
3665 		if (pt_prev)
3666 			ret = deliver_skb(skb, pt_prev, orig_dev);
3667 		pt_prev = ptype;
3668 	}
3669 
3670 	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3671 		if (pt_prev)
3672 			ret = deliver_skb(skb, pt_prev, orig_dev);
3673 		pt_prev = ptype;
3674 	}
3675 
3676 skip_taps:
3677 #ifdef CONFIG_NET_CLS_ACT
3678 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3679 	if (!skb)
3680 		goto unlock;
3681 ncls:
3682 #endif
3683 
3684 	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3685 		goto drop;
3686 
3687 	if (skb_vlan_tag_present(skb)) {
3688 		if (pt_prev) {
3689 			ret = deliver_skb(skb, pt_prev, orig_dev);
3690 			pt_prev = NULL;
3691 		}
3692 		if (vlan_do_receive(&skb))
3693 			goto another_round;
3694 		else if (unlikely(!skb))
3695 			goto unlock;
3696 	}
3697 
3698 	rx_handler = rcu_dereference(skb->dev->rx_handler);
3699 	if (rx_handler) {
3700 		if (pt_prev) {
3701 			ret = deliver_skb(skb, pt_prev, orig_dev);
3702 			pt_prev = NULL;
3703 		}
3704 		switch (rx_handler(&skb)) {
3705 		case RX_HANDLER_CONSUMED:
3706 			ret = NET_RX_SUCCESS;
3707 			goto unlock;
3708 		case RX_HANDLER_ANOTHER:
3709 			goto another_round;
3710 		case RX_HANDLER_EXACT:
3711 			deliver_exact = true;
3712 		case RX_HANDLER_PASS:
3713 			break;
3714 		default:
3715 			BUG();
3716 		}
3717 	}
3718 
3719 	if (unlikely(skb_vlan_tag_present(skb))) {
3720 		if (skb_vlan_tag_get_id(skb))
3721 			skb->pkt_type = PACKET_OTHERHOST;
3722 		/* Note: we might in the future use prio bits
3723 		 * and set skb->priority like in vlan_do_receive().
3724 		 * For the time being, just ignore the Priority Code Point.
3725 		 */
3726 		skb->vlan_tci = 0;
3727 	}
3728 
3729 	type = skb->protocol;
3730 
3731 	/* deliver only exact match when indicated */
3732 	if (likely(!deliver_exact)) {
3733 		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3734 				       &ptype_base[ntohs(type) &
3735 						   PTYPE_HASH_MASK]);
3736 	}
3737 
3738 	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3739 			       &orig_dev->ptype_specific);
3740 
3741 	if (unlikely(skb->dev != orig_dev)) {
3742 		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
3743 				       &skb->dev->ptype_specific);
3744 	}
3745 
3746 	if (pt_prev) {
3747 		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3748 			goto drop;
3749 		else
3750 			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3751 	} else {
3752 drop:
3753 		atomic_long_inc(&skb->dev->rx_dropped);
3754 		kfree_skb(skb);
3755 		/* Jamal, now you will not be able to escape explaining
3756 		 * to me how you were going to use this. :-)
3757 		 */
3758 		ret = NET_RX_DROP;
3759 	}
3760 
3761 unlock:
3762 	rcu_read_unlock();
3763 	return ret;
3764 }
3765 
3766 static int __netif_receive_skb(struct sk_buff *skb)
3767 {
3768 	int ret;
3769 
3770 	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3771 		unsigned long pflags = current->flags;
3772 
3773 		/*
3774 		 * PFMEMALLOC skbs are special, they should
3775 		 * - be delivered to SOCK_MEMALLOC sockets only
3776 		 * - stay away from userspace
3777 		 * - have bounded memory usage
3778 		 *
3779 		 * Use PF_MEMALLOC as this saves us from propagating the allocation
3780 		 * context down to all allocation sites.
3781 		 */
3782 		current->flags |= PF_MEMALLOC;
3783 		ret = __netif_receive_skb_core(skb, true);
3784 		tsk_restore_flags(current, pflags, PF_MEMALLOC);
3785 	} else
3786 		ret = __netif_receive_skb_core(skb, false);
3787 
3788 	return ret;
3789 }
3790 
3791 static int netif_receive_skb_internal(struct sk_buff *skb)
3792 {
3793 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3794 
3795 	if (skb_defer_rx_timestamp(skb))
3796 		return NET_RX_SUCCESS;
3797 
3798 #ifdef CONFIG_RPS
3799 	if (static_key_false(&rps_needed)) {
3800 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3801 		int cpu, ret;
3802 
3803 		rcu_read_lock();
3804 
3805 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3806 
3807 		if (cpu >= 0) {
3808 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3809 			rcu_read_unlock();
3810 			return ret;
3811 		}
3812 		rcu_read_unlock();
3813 	}
3814 #endif
3815 	return __netif_receive_skb(skb);
3816 }
3817 
3818 /**
3819  *	netif_receive_skb - process receive buffer from network
3820  *	@skb: buffer to process
3821  *
3822  *	netif_receive_skb() is the main receive data processing function.
3823  *	It always succeeds. The buffer may be dropped during processing
3824  *	for congestion control or by the protocol layers.
3825  *
3826  *	This function may only be called from softirq context and interrupts
3827  *	should be enabled.
3828  *
3829  *	Return values (usually ignored):
3830  *	NET_RX_SUCCESS: no congestion
3831  *	NET_RX_DROP: packet was dropped
3832  */
3833 int netif_receive_skb(struct sk_buff *skb)
3834 {
3835 	trace_netif_receive_skb_entry(skb);
3836 
3837 	return netif_receive_skb_internal(skb);
3838 }
3839 EXPORT_SYMBOL(netif_receive_skb);
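
/* Usage sketch (illustrative only, not part of this file): a hypothetical
 * non-NAPI receive path handing a completed frame to the stack.  The
 * foo_rx_one() helper and its buffer handling are assumptions made up for
 * the example; only the netif_receive_skb() call and its softirq-context
 * requirement come from the function above.
 *
 *	static void foo_rx_one(struct net_device *dev, const void *buf,
 *			       unsigned int len)
 *	{
 *		struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);
 *
 *		if (!skb)
 *			return;				// out of memory: drop
 *		memcpy(skb_put(skb, len), buf, len);	// copy the whole frame
 *		skb->protocol = eth_type_trans(skb, dev);
 *		netif_receive_skb(skb);			// softirq context only
 *	}
 */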
3840 
3841 /* Network device is going away; flush any packets still pending.
3842  * Called with irqs disabled.
3843  */
3844 static void flush_backlog(void *arg)
3845 {
3846 	struct net_device *dev = arg;
3847 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3848 	struct sk_buff *skb, *tmp;
3849 
3850 	rps_lock(sd);
3851 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3852 		if (skb->dev == dev) {
3853 			__skb_unlink(skb, &sd->input_pkt_queue);
3854 			kfree_skb(skb);
3855 			input_queue_head_incr(sd);
3856 		}
3857 	}
3858 	rps_unlock(sd);
3859 
3860 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3861 		if (skb->dev == dev) {
3862 			__skb_unlink(skb, &sd->process_queue);
3863 			kfree_skb(skb);
3864 			input_queue_head_incr(sd);
3865 		}
3866 	}
3867 }
3868 
3869 static int napi_gro_complete(struct sk_buff *skb)
3870 {
3871 	struct packet_offload *ptype;
3872 	__be16 type = skb->protocol;
3873 	struct list_head *head = &offload_base;
3874 	int err = -ENOENT;
3875 
3876 	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3877 
3878 	if (NAPI_GRO_CB(skb)->count == 1) {
3879 		skb_shinfo(skb)->gso_size = 0;
3880 		goto out;
3881 	}
3882 
3883 	rcu_read_lock();
3884 	list_for_each_entry_rcu(ptype, head, list) {
3885 		if (ptype->type != type || !ptype->callbacks.gro_complete)
3886 			continue;
3887 
3888 		err = ptype->callbacks.gro_complete(skb, 0);
3889 		break;
3890 	}
3891 	rcu_read_unlock();
3892 
3893 	if (err) {
3894 		WARN_ON(&ptype->list == head);
3895 		kfree_skb(skb);
3896 		return NET_RX_SUCCESS;
3897 	}
3898 
3899 out:
3900 	return netif_receive_skb_internal(skb);
3901 }
3902 
3903 /* napi->gro_list contains packets ordered by age.
3904  * The youngest packets are at its head.
3905  * Complete skbs in reverse order to reduce latencies.
3906  */
3907 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3908 {
3909 	struct sk_buff *skb, *prev = NULL;
3910 
3911 	/* scan list and build reverse chain */
3912 	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3913 		skb->prev = prev;
3914 		prev = skb;
3915 	}
3916 
3917 	for (skb = prev; skb; skb = prev) {
3918 		skb->next = NULL;
3919 
3920 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3921 			return;
3922 
3923 		prev = skb->prev;
3924 		napi_gro_complete(skb);
3925 		napi->gro_count--;
3926 	}
3927 
3928 	napi->gro_list = NULL;
3929 }
3930 EXPORT_SYMBOL(napi_gro_flush);
3931 
3932 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3933 {
3934 	struct sk_buff *p;
3935 	unsigned int maclen = skb->dev->hard_header_len;
3936 	u32 hash = skb_get_hash_raw(skb);
3937 
3938 	for (p = napi->gro_list; p; p = p->next) {
3939 		unsigned long diffs;
3940 
3941 		NAPI_GRO_CB(p)->flush = 0;
3942 
3943 		if (hash != skb_get_hash_raw(p)) {
3944 			NAPI_GRO_CB(p)->same_flow = 0;
3945 			continue;
3946 		}
3947 
3948 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3949 		diffs |= p->vlan_tci ^ skb->vlan_tci;
3950 		if (maclen == ETH_HLEN)
3951 			diffs |= compare_ether_header(skb_mac_header(p),
3952 						      skb_mac_header(skb));
3953 		else if (!diffs)
3954 			diffs = memcmp(skb_mac_header(p),
3955 				       skb_mac_header(skb),
3956 				       maclen);
3957 		NAPI_GRO_CB(p)->same_flow = !diffs;
3958 	}
3959 }
3960 
3961 static void skb_gro_reset_offset(struct sk_buff *skb)
3962 {
3963 	const struct skb_shared_info *pinfo = skb_shinfo(skb);
3964 	const skb_frag_t *frag0 = &pinfo->frags[0];
3965 
3966 	NAPI_GRO_CB(skb)->data_offset = 0;
3967 	NAPI_GRO_CB(skb)->frag0 = NULL;
3968 	NAPI_GRO_CB(skb)->frag0_len = 0;
3969 
3970 	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3971 	    pinfo->nr_frags &&
3972 	    !PageHighMem(skb_frag_page(frag0))) {
3973 		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3974 		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3975 	}
3976 }
3977 
3978 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
3979 {
3980 	struct skb_shared_info *pinfo = skb_shinfo(skb);
3981 
3982 	BUG_ON(skb->end - skb->tail < grow);
3983 
3984 	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3985 
3986 	skb->data_len -= grow;
3987 	skb->tail += grow;
3988 
3989 	pinfo->frags[0].page_offset += grow;
3990 	skb_frag_size_sub(&pinfo->frags[0], grow);
3991 
3992 	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
3993 		skb_frag_unref(skb, 0);
3994 		memmove(pinfo->frags, pinfo->frags + 1,
3995 			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
3996 	}
3997 }
3998 
3999 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4000 {
4001 	struct sk_buff **pp = NULL;
4002 	struct packet_offload *ptype;
4003 	__be16 type = skb->protocol;
4004 	struct list_head *head = &offload_base;
4005 	int same_flow;
4006 	enum gro_result ret;
4007 	int grow;
4008 
4009 	if (!(skb->dev->features & NETIF_F_GRO))
4010 		goto normal;
4011 
4012 	if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4013 		goto normal;
4014 
4015 	gro_list_prepare(napi, skb);
4016 
4017 	rcu_read_lock();
4018 	list_for_each_entry_rcu(ptype, head, list) {
4019 		if (ptype->type != type || !ptype->callbacks.gro_receive)
4020 			continue;
4021 
4022 		skb_set_network_header(skb, skb_gro_offset(skb));
4023 		skb_reset_mac_len(skb);
4024 		NAPI_GRO_CB(skb)->same_flow = 0;
4025 		NAPI_GRO_CB(skb)->flush = 0;
4026 		NAPI_GRO_CB(skb)->free = 0;
4027 		NAPI_GRO_CB(skb)->udp_mark = 0;
4028 		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4029 
4030 		/* Setup for GRO checksum validation */
4031 		switch (skb->ip_summed) {
4032 		case CHECKSUM_COMPLETE:
4033 			NAPI_GRO_CB(skb)->csum = skb->csum;
4034 			NAPI_GRO_CB(skb)->csum_valid = 1;
4035 			NAPI_GRO_CB(skb)->csum_cnt = 0;
4036 			break;
4037 		case CHECKSUM_UNNECESSARY:
4038 			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4039 			NAPI_GRO_CB(skb)->csum_valid = 0;
4040 			break;
4041 		default:
4042 			NAPI_GRO_CB(skb)->csum_cnt = 0;
4043 			NAPI_GRO_CB(skb)->csum_valid = 0;
4044 		}
4045 
4046 		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4047 		break;
4048 	}
4049 	rcu_read_unlock();
4050 
4051 	if (&ptype->list == head)
4052 		goto normal;
4053 
4054 	same_flow = NAPI_GRO_CB(skb)->same_flow;
4055 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4056 
4057 	if (pp) {
4058 		struct sk_buff *nskb = *pp;
4059 
4060 		*pp = nskb->next;
4061 		nskb->next = NULL;
4062 		napi_gro_complete(nskb);
4063 		napi->gro_count--;
4064 	}
4065 
4066 	if (same_flow)
4067 		goto ok;
4068 
4069 	if (NAPI_GRO_CB(skb)->flush)
4070 		goto normal;
4071 
4072 	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4073 		struct sk_buff *nskb = napi->gro_list;
4074 
4075 		/* locate the end of the list to select the 'oldest' flow */
4076 		while (nskb->next) {
4077 			pp = &nskb->next;
4078 			nskb = *pp;
4079 		}
4080 		*pp = NULL;
4081 		nskb->next = NULL;
4082 		napi_gro_complete(nskb);
4083 	} else {
4084 		napi->gro_count++;
4085 	}
4086 	NAPI_GRO_CB(skb)->count = 1;
4087 	NAPI_GRO_CB(skb)->age = jiffies;
4088 	NAPI_GRO_CB(skb)->last = skb;
4089 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4090 	skb->next = napi->gro_list;
4091 	napi->gro_list = skb;
4092 	ret = GRO_HELD;
4093 
4094 pull:
4095 	grow = skb_gro_offset(skb) - skb_headlen(skb);
4096 	if (grow > 0)
4097 		gro_pull_from_frag0(skb, grow);
4098 ok:
4099 	return ret;
4100 
4101 normal:
4102 	ret = GRO_NORMAL;
4103 	goto pull;
4104 }
4105 
4106 struct packet_offload *gro_find_receive_by_type(__be16 type)
4107 {
4108 	struct list_head *offload_head = &offload_base;
4109 	struct packet_offload *ptype;
4110 
4111 	list_for_each_entry_rcu(ptype, offload_head, list) {
4112 		if (ptype->type != type || !ptype->callbacks.gro_receive)
4113 			continue;
4114 		return ptype;
4115 	}
4116 	return NULL;
4117 }
4118 EXPORT_SYMBOL(gro_find_receive_by_type);
4119 
4120 struct packet_offload *gro_find_complete_by_type(__be16 type)
4121 {
4122 	struct list_head *offload_head = &offload_base;
4123 	struct packet_offload *ptype;
4124 
4125 	list_for_each_entry_rcu(ptype, offload_head, list) {
4126 		if (ptype->type != type || !ptype->callbacks.gro_complete)
4127 			continue;
4128 		return ptype;
4129 	}
4130 	return NULL;
4131 }
4132 EXPORT_SYMBOL(gro_find_complete_by_type);
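
/* Usage sketch (illustrative only, not part of this file): the pattern an
 * encapsulation offload might use to chain into the inner protocol's GRO
 * handler.  The surrounding handler, "head"/"skb" and the inner protocol
 * value are assumptions; only the lookup and callback shape come from the
 * helpers above.
 *
 *	struct packet_offload *ptype;
 *	struct sk_buff **pp = NULL;
 *
 *	rcu_read_lock();
 *	ptype = gro_find_receive_by_type(inner_proto);	// e.g. inner ethertype
 *	if (ptype)
 *		pp = ptype->callbacks.gro_receive(head, skb);
 *	rcu_read_unlock();
 */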
4133 
4134 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4135 {
4136 	switch (ret) {
4137 	case GRO_NORMAL:
4138 		if (netif_receive_skb_internal(skb))
4139 			ret = GRO_DROP;
4140 		break;
4141 
4142 	case GRO_DROP:
4143 		kfree_skb(skb);
4144 		break;
4145 
4146 	case GRO_MERGED_FREE:
4147 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4148 			kmem_cache_free(skbuff_head_cache, skb);
4149 		else
4150 			__kfree_skb(skb);
4151 		break;
4152 
4153 	case GRO_HELD:
4154 	case GRO_MERGED:
4155 		break;
4156 	}
4157 
4158 	return ret;
4159 }
4160 
4161 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4162 {
4163 	trace_napi_gro_receive_entry(skb);
4164 
4165 	skb_gro_reset_offset(skb);
4166 
4167 	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4168 }
4169 EXPORT_SYMBOL(napi_gro_receive);
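
/* Usage sketch (illustrative only, not part of this file): the inner loop of
 * a hypothetical NAPI poll routine handing each completed frame to GRO.
 * foo_fetch_rx(), priv and its members are assumptions; only the
 * eth_type_trans() + napi_gro_receive() sequence comes from the code above.
 *
 *	struct sk_buff *skb;
 *
 *	while (work_done < budget && (skb = foo_fetch_rx(priv))) {
 *		skb->protocol = eth_type_trans(skb, priv->netdev);
 *		napi_gro_receive(&priv->napi, skb);
 *		work_done++;
 *	}
 */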
4170 
4171 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4172 {
4173 	if (unlikely(skb->pfmemalloc)) {
4174 		consume_skb(skb);
4175 		return;
4176 	}
4177 	__skb_pull(skb, skb_headlen(skb));
4178 	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
4179 	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4180 	skb->vlan_tci = 0;
4181 	skb->dev = napi->dev;
4182 	skb->skb_iif = 0;
4183 	skb->encapsulation = 0;
4184 	skb_shinfo(skb)->gso_type = 0;
4185 	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4186 
4187 	napi->skb = skb;
4188 }
4189 
4190 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4191 {
4192 	struct sk_buff *skb = napi->skb;
4193 
4194 	if (!skb) {
4195 		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4196 		napi->skb = skb;
4197 	}
4198 	return skb;
4199 }
4200 EXPORT_SYMBOL(napi_get_frags);
4201 
4202 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4203 				      struct sk_buff *skb,
4204 				      gro_result_t ret)
4205 {
4206 	switch (ret) {
4207 	case GRO_NORMAL:
4208 	case GRO_HELD:
4209 		__skb_push(skb, ETH_HLEN);
4210 		skb->protocol = eth_type_trans(skb, skb->dev);
4211 		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4212 			ret = GRO_DROP;
4213 		break;
4214 
4215 	case GRO_DROP:
4216 	case GRO_MERGED_FREE:
4217 		napi_reuse_skb(napi, skb);
4218 		break;
4219 
4220 	case GRO_MERGED:
4221 		break;
4222 	}
4223 
4224 	return ret;
4225 }
4226 
4227 /* Upper GRO stack assumes the network header starts at gro_offset=0.
4228  * Drivers could call both napi_gro_frags() and napi_gro_receive().
4229  * We copy the Ethernet header into skb->data to have a common layout.
4230  */
4231 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4232 {
4233 	struct sk_buff *skb = napi->skb;
4234 	const struct ethhdr *eth;
4235 	unsigned int hlen = sizeof(*eth);
4236 
4237 	napi->skb = NULL;
4238 
4239 	skb_reset_mac_header(skb);
4240 	skb_gro_reset_offset(skb);
4241 
4242 	eth = skb_gro_header_fast(skb, 0);
4243 	if (unlikely(skb_gro_header_hard(skb, hlen))) {
4244 		eth = skb_gro_header_slow(skb, hlen, 0);
4245 		if (unlikely(!eth)) {
4246 			napi_reuse_skb(napi, skb);
4247 			return NULL;
4248 		}
4249 	} else {
4250 		gro_pull_from_frag0(skb, hlen);
4251 		NAPI_GRO_CB(skb)->frag0 += hlen;
4252 		NAPI_GRO_CB(skb)->frag0_len -= hlen;
4253 	}
4254 	__skb_pull(skb, hlen);
4255 
4256 	/*
4257 	 * This works because the only protocols we care about don't require
4258 	 * special handling.
4259 	 * We'll fix it up properly in napi_frags_finish()
4260 	 */
4261 	skb->protocol = eth->h_proto;
4262 
4263 	return skb;
4264 }
4265 
4266 gro_result_t napi_gro_frags(struct napi_struct *napi)
4267 {
4268 	struct sk_buff *skb = napi_frags_skb(napi);
4269 
4270 	if (!skb)
4271 		return GRO_DROP;
4272 
4273 	trace_napi_gro_frags_entry(skb);
4274 
4275 	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4276 }
4277 EXPORT_SYMBOL(napi_gro_frags);
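
/* Usage sketch (illustrative only, not part of this file): a hypothetical
 * page-based receive path using the napi_get_frags()/napi_gro_frags() pair.
 * The page, offset and length come from made-up driver state; note the frame
 * placed in the frag must start with the Ethernet header, since
 * napi_frags_skb() above pulls it out of frag0.
 *
 *	struct sk_buff *skb = napi_get_frags(napi);
 *
 *	if (!skb)
 *		return;				// allocation failure: drop
 *	skb_add_rx_frag(skb, 0, page, offset, len, PAGE_SIZE);
 *	napi_gro_frags(napi);			// hands napi->skb to GRO
 */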
4278 
4279 /* Compute the checksum from gro_offset and return the folded value
4280  * after adding in any pseudo checksum.
4281  */
4282 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4283 {
4284 	__wsum wsum;
4285 	__sum16 sum;
4286 
4287 	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4288 
4289 	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4290 	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4291 	if (likely(!sum)) {
4292 		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4293 		    !skb->csum_complete_sw)
4294 			netdev_rx_csum_fault(skb->dev);
4295 	}
4296 
4297 	NAPI_GRO_CB(skb)->csum = wsum;
4298 	NAPI_GRO_CB(skb)->csum_valid = 1;
4299 
4300 	return sum;
4301 }
4302 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4303 
4304 /*
4305  * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
4306  * Note: called with local irq disabled, but exits with local irq enabled.
4307  */
4308 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4309 {
4310 #ifdef CONFIG_RPS
4311 	struct softnet_data *remsd = sd->rps_ipi_list;
4312 
4313 	if (remsd) {
4314 		sd->rps_ipi_list = NULL;
4315 
4316 		local_irq_enable();
4317 
4318 		/* Send pending IPIs to kick RPS processing on remote CPUs. */
4319 		while (remsd) {
4320 			struct softnet_data *next = remsd->rps_ipi_next;
4321 
4322 			if (cpu_online(remsd->cpu))
4323 				smp_call_function_single_async(remsd->cpu,
4324 							   &remsd->csd);
4325 			remsd = next;
4326 		}
4327 	} else
4328 #endif
4329 		local_irq_enable();
4330 }
4331 
4332 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4333 {
4334 #ifdef CONFIG_RPS
4335 	return sd->rps_ipi_list != NULL;
4336 #else
4337 	return false;
4338 #endif
4339 }
4340 
4341 static int process_backlog(struct napi_struct *napi, int quota)
4342 {
4343 	int work = 0;
4344 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4345 
4346 	/* Check if we have pending IPIs; it's better to send them now
4347 	 * rather than waiting for net_rx_action() to end.
4348 	 */
4349 	if (sd_has_rps_ipi_waiting(sd)) {
4350 		local_irq_disable();
4351 		net_rps_action_and_irq_enable(sd);
4352 	}
4353 
4354 	napi->weight = weight_p;
4355 	local_irq_disable();
4356 	while (1) {
4357 		struct sk_buff *skb;
4358 
4359 		while ((skb = __skb_dequeue(&sd->process_queue))) {
4360 			local_irq_enable();
4361 			__netif_receive_skb(skb);
4362 			local_irq_disable();
4363 			input_queue_head_incr(sd);
4364 			if (++work >= quota) {
4365 				local_irq_enable();
4366 				return work;
4367 			}
4368 		}
4369 
4370 		rps_lock(sd);
4371 		if (skb_queue_empty(&sd->input_pkt_queue)) {
4372 			/*
4373 			 * Inline a custom version of __napi_complete().
4374 			 * Only the current cpu owns and manipulates this napi,
4375 			 * and NAPI_STATE_SCHED is the only possible flag set
4376 			 * on backlog.
4377 			 * We can use a plain write instead of clear_bit(),
4378 			 * and we don't need an smp_mb() memory barrier.
4379 			 */
4380 			napi->state = 0;
4381 			rps_unlock(sd);
4382 
4383 			break;
4384 		}
4385 
4386 		skb_queue_splice_tail_init(&sd->input_pkt_queue,
4387 					   &sd->process_queue);
4388 		rps_unlock(sd);
4389 	}
4390 	local_irq_enable();
4391 
4392 	return work;
4393 }
4394 
4395 /**
4396  * __napi_schedule - schedule for receive
4397  * @n: entry to schedule
4398  *
4399  * The entry's receive function will be scheduled to run.
4400  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4401  */
4402 void __napi_schedule(struct napi_struct *n)
4403 {
4404 	unsigned long flags;
4405 
4406 	local_irq_save(flags);
4407 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4408 	local_irq_restore(flags);
4409 }
4410 EXPORT_SYMBOL(__napi_schedule);
4411 
4412 /**
4413  * __napi_schedule_irqoff - schedule for receive
4414  * @n: entry to schedule
4415  *
4416  * Variant of __napi_schedule() assuming hard irqs are masked
4417  */
4418 void __napi_schedule_irqoff(struct napi_struct *n)
4419 {
4420 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4421 }
4422 EXPORT_SYMBOL(__napi_schedule_irqoff);
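
/* Usage sketch (illustrative only, not part of this file): a hypothetical
 * hard interrupt handler scheduling its NAPI instance with the irqoff
 * variant, since hard irqs are masked in that context.  foo_priv and
 * foo_mask_rx_irq() are assumptions.
 *
 *	static irqreturn_t foo_isr(int irq, void *dev_id)
 *	{
 *		struct foo_priv *priv = dev_id;
 *
 *		foo_mask_rx_irq(priv);			// hypothetical helper
 *		if (napi_schedule_prep(&priv->napi))
 *			__napi_schedule_irqoff(&priv->napi);
 *		return IRQ_HANDLED;
 *	}
 */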
4423 
4424 void __napi_complete(struct napi_struct *n)
4425 {
4426 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4427 
4428 	list_del_init(&n->poll_list);
4429 	smp_mb__before_atomic();
4430 	clear_bit(NAPI_STATE_SCHED, &n->state);
4431 }
4432 EXPORT_SYMBOL(__napi_complete);
4433 
4434 void napi_complete_done(struct napi_struct *n, int work_done)
4435 {
4436 	unsigned long flags;
4437 
4438 	/*
4439 	 * Don't let napi dequeue from the cpu poll list
4440 	 * just in case it's running on a different cpu.
4441 	 */
4442 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4443 		return;
4444 
4445 	if (n->gro_list) {
4446 		unsigned long timeout = 0;
4447 
4448 		if (work_done)
4449 			timeout = n->dev->gro_flush_timeout;
4450 
4451 		if (timeout)
4452 			hrtimer_start(&n->timer, ns_to_ktime(timeout),
4453 				      HRTIMER_MODE_REL_PINNED);
4454 		else
4455 			napi_gro_flush(n, false);
4456 	}
4457 	if (likely(list_empty(&n->poll_list))) {
4458 		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4459 	} else {
4460 		/* If n->poll_list is not empty, we need to mask irqs */
4461 		local_irq_save(flags);
4462 		__napi_complete(n);
4463 		local_irq_restore(flags);
4464 	}
4465 }
4466 EXPORT_SYMBOL(napi_complete_done);
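
/* Usage sketch (illustrative only, not part of this file): the tail of a
 * hypothetical poll routine completing NAPI only when it stayed under
 * budget, then re-arming its interrupt.  foo_unmask_rx_irq() is an
 * assumption; the work_done < budget rule comes from the NAPI contract.
 *
 *	if (work_done < budget) {
 *		napi_complete_done(napi, work_done);
 *		foo_unmask_rx_irq(priv);	// hypothetical: re-arm the IRQ
 *	}
 *	return work_done;
 */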
4467 
4468 /* must be called under rcu_read_lock(), as we don't take a reference */
4469 struct napi_struct *napi_by_id(unsigned int napi_id)
4470 {
4471 	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4472 	struct napi_struct *napi;
4473 
4474 	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4475 		if (napi->napi_id == napi_id)
4476 			return napi;
4477 
4478 	return NULL;
4479 }
4480 EXPORT_SYMBOL_GPL(napi_by_id);
4481 
4482 void napi_hash_add(struct napi_struct *napi)
4483 {
4484 	if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4485 
4486 		spin_lock(&napi_hash_lock);
4487 
4488 		/* 0 is not a valid id; we also skip an id that is already taken.
4489 		 * We expect both events to be extremely rare.
4490 		 */
4491 		napi->napi_id = 0;
4492 		while (!napi->napi_id) {
4493 			napi->napi_id = ++napi_gen_id;
4494 			if (napi_by_id(napi->napi_id))
4495 				napi->napi_id = 0;
4496 		}
4497 
4498 		hlist_add_head_rcu(&napi->napi_hash_node,
4499 			&napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4500 
4501 		spin_unlock(&napi_hash_lock);
4502 	}
4503 }
4504 EXPORT_SYMBOL_GPL(napi_hash_add);
4505 
4506 /* Warning: the caller is responsible for making sure an RCU grace period
4507  * is respected before freeing the memory containing @napi.
4508  */
4509 void napi_hash_del(struct napi_struct *napi)
4510 {
4511 	spin_lock(&napi_hash_lock);
4512 
4513 	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4514 		hlist_del_rcu(&napi->napi_hash_node);
4515 
4516 	spin_unlock(&napi_hash_lock);
4517 }
4518 EXPORT_SYMBOL_GPL(napi_hash_del);
4519 
4520 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4521 {
4522 	struct napi_struct *napi;
4523 
4524 	napi = container_of(timer, struct napi_struct, timer);
4525 	if (napi->gro_list)
4526 		napi_schedule(napi);
4527 
4528 	return HRTIMER_NORESTART;
4529 }
4530 
4531 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4532 		    int (*poll)(struct napi_struct *, int), int weight)
4533 {
4534 	INIT_LIST_HEAD(&napi->poll_list);
4535 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4536 	napi->timer.function = napi_watchdog;
4537 	napi->gro_count = 0;
4538 	napi->gro_list = NULL;
4539 	napi->skb = NULL;
4540 	napi->poll = poll;
4541 	if (weight > NAPI_POLL_WEIGHT)
4542 		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4543 			    weight, dev->name);
4544 	napi->weight = weight;
4545 	list_add(&napi->dev_list, &dev->napi_list);
4546 	napi->dev = dev;
4547 #ifdef CONFIG_NETPOLL
4548 	spin_lock_init(&napi->poll_lock);
4549 	napi->poll_owner = -1;
4550 #endif
4551 	set_bit(NAPI_STATE_SCHED, &napi->state);
4552 }
4553 EXPORT_SYMBOL(netif_napi_add);
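
/* Usage sketch (illustrative only, not part of this file): the typical
 * lifecycle of a NAPI instance in a hypothetical driver.  netdev, priv and
 * foo_poll are assumptions; the helpers themselves are real.
 *
 *	netif_napi_add(netdev, &priv->napi, foo_poll, NAPI_POLL_WEIGHT);
 *	...
 *	napi_enable(&priv->napi);	// usually from ndo_open
 *	...
 *	napi_disable(&priv->napi);	// from ndo_stop
 *	netif_napi_del(&priv->napi);	// before free_netdev()
 */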
4554 
4555 void napi_disable(struct napi_struct *n)
4556 {
4557 	might_sleep();
4558 	set_bit(NAPI_STATE_DISABLE, &n->state);
4559 
4560 	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4561 		msleep(1);
4562 
4563 	hrtimer_cancel(&n->timer);
4564 
4565 	clear_bit(NAPI_STATE_DISABLE, &n->state);
4566 }
4567 EXPORT_SYMBOL(napi_disable);
4568 
4569 void netif_napi_del(struct napi_struct *napi)
4570 {
4571 	list_del_init(&napi->dev_list);
4572 	napi_free_frags(napi);
4573 
4574 	kfree_skb_list(napi->gro_list);
4575 	napi->gro_list = NULL;
4576 	napi->gro_count = 0;
4577 }
4578 EXPORT_SYMBOL(netif_napi_del);
4579 
4580 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4581 {
4582 	void *have;
4583 	int work, weight;
4584 
4585 	list_del_init(&n->poll_list);
4586 
4587 	have = netpoll_poll_lock(n);
4588 
4589 	weight = n->weight;
4590 
4591 	/* This NAPI_STATE_SCHED test is for avoiding a race
4592 	 * with netpoll's poll_napi().  Only the entity which
4593 	 * obtains the lock and sees NAPI_STATE_SCHED set will
4594 	 * actually make the ->poll() call.  Therefore we avoid
4595 	 * accidentally calling ->poll() when NAPI is not scheduled.
4596 	 */
4597 	work = 0;
4598 	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4599 		work = n->poll(n, weight);
4600 		trace_napi_poll(n);
4601 	}
4602 
4603 	WARN_ON_ONCE(work > weight);
4604 
4605 	if (likely(work < weight))
4606 		goto out_unlock;
4607 
4608 	/* Drivers must not modify the NAPI state if they
4609 	 * consume the entire weight.  In such cases this code
4610 	 * still "owns" the NAPI instance and therefore can
4611 	 * move the instance around on the list at-will.
4612 	 */
4613 	if (unlikely(napi_disable_pending(n))) {
4614 		napi_complete(n);
4615 		goto out_unlock;
4616 	}
4617 
4618 	if (n->gro_list) {
4619 		/* flush too old packets
4620 		 * If HZ < 1000, flush all packets.
4621 		 */
4622 		napi_gro_flush(n, HZ >= 1000);
4623 	}
4624 
4625 	/* Some drivers may have called napi_schedule
4626 	 * prior to exhausting their budget.
4627 	 */
4628 	if (unlikely(!list_empty(&n->poll_list))) {
4629 		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4630 			     n->dev ? n->dev->name : "backlog");
4631 		goto out_unlock;
4632 	}
4633 
4634 	list_add_tail(&n->poll_list, repoll);
4635 
4636 out_unlock:
4637 	netpoll_poll_unlock(have);
4638 
4639 	return work;
4640 }
4641 
4642 static void net_rx_action(struct softirq_action *h)
4643 {
4644 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4645 	unsigned long time_limit = jiffies + 2;
4646 	int budget = netdev_budget;
4647 	LIST_HEAD(list);
4648 	LIST_HEAD(repoll);
4649 
4650 	local_irq_disable();
4651 	list_splice_init(&sd->poll_list, &list);
4652 	local_irq_enable();
4653 
4654 	for (;;) {
4655 		struct napi_struct *n;
4656 
4657 		if (list_empty(&list)) {
4658 			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4659 				return;
4660 			break;
4661 		}
4662 
4663 		n = list_first_entry(&list, struct napi_struct, poll_list);
4664 		budget -= napi_poll(n, &repoll);
4665 
4666 		/* If softirq window is exhausted then punt.
4667 		 * Allow this to run for 2 jiffies, which will allow
4668 		 * an average latency of 1.5/HZ.
4669 		 */
4670 		if (unlikely(budget <= 0 ||
4671 			     time_after_eq(jiffies, time_limit))) {
4672 			sd->time_squeeze++;
4673 			break;
4674 		}
4675 	}
4676 
4677 	local_irq_disable();
4678 
4679 	list_splice_tail_init(&sd->poll_list, &list);
4680 	list_splice_tail(&repoll, &list);
4681 	list_splice(&list, &sd->poll_list);
4682 	if (!list_empty(&sd->poll_list))
4683 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
4684 
4685 	net_rps_action_and_irq_enable(sd);
4686 }
4687 
4688 struct netdev_adjacent {
4689 	struct net_device *dev;
4690 
4691 	/* upper master flag, there can only be one master device per list */
4692 	bool master;
4693 
4694 	/* counter for the number of times this device was added to us */
4695 	u16 ref_nr;
4696 
4697 	/* private field for the users */
4698 	void *private;
4699 
4700 	struct list_head list;
4701 	struct rcu_head rcu;
4702 };
4703 
4704 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4705 						 struct net_device *adj_dev,
4706 						 struct list_head *adj_list)
4707 {
4708 	struct netdev_adjacent *adj;
4709 
4710 	list_for_each_entry(adj, adj_list, list) {
4711 		if (adj->dev == adj_dev)
4712 			return adj;
4713 	}
4714 	return NULL;
4715 }
4716 
4717 /**
4718  * netdev_has_upper_dev - Check if device is linked to an upper device
4719  * @dev: device
4720  * @upper_dev: upper device to check
4721  *
4722  * Find out if a device is linked to the specified upper device and return true
4723  * in case it is. Note that this checks only immediate upper device,
4724  * not through a complete stack of devices. The caller must hold the RTNL lock.
4725  */
4726 bool netdev_has_upper_dev(struct net_device *dev,
4727 			  struct net_device *upper_dev)
4728 {
4729 	ASSERT_RTNL();
4730 
4731 	return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4732 }
4733 EXPORT_SYMBOL(netdev_has_upper_dev);
4734 
4735 /**
4736  * netdev_has_any_upper_dev - Check if device is linked to some device
4737  * @dev: device
4738  *
4739  * Find out if a device is linked to an upper device and return true in case
4740  * it is. The caller must hold the RTNL lock.
4741  */
4742 static bool netdev_has_any_upper_dev(struct net_device *dev)
4743 {
4744 	ASSERT_RTNL();
4745 
4746 	return !list_empty(&dev->all_adj_list.upper);
4747 }
4748 
4749 /**
4750  * netdev_master_upper_dev_get - Get master upper device
4751  * @dev: device
4752  *
4753  * Find a master upper device and return pointer to it or NULL in case
4754  * it's not there. The caller must hold the RTNL lock.
4755  */
4756 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4757 {
4758 	struct netdev_adjacent *upper;
4759 
4760 	ASSERT_RTNL();
4761 
4762 	if (list_empty(&dev->adj_list.upper))
4763 		return NULL;
4764 
4765 	upper = list_first_entry(&dev->adj_list.upper,
4766 				 struct netdev_adjacent, list);
4767 	if (likely(upper->master))
4768 		return upper->dev;
4769 	return NULL;
4770 }
4771 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4772 
4773 void *netdev_adjacent_get_private(struct list_head *adj_list)
4774 {
4775 	struct netdev_adjacent *adj;
4776 
4777 	adj = list_entry(adj_list, struct netdev_adjacent, list);
4778 
4779 	return adj->private;
4780 }
4781 EXPORT_SYMBOL(netdev_adjacent_get_private);
4782 
4783 /**
4784  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
4785  * @dev: device
4786  * @iter: list_head ** of the current position
4787  *
4788  * Gets the next device from the dev's upper list, starting from iter
4789  * position. The caller must hold RCU read lock.
4790  */
4791 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
4792 						 struct list_head **iter)
4793 {
4794 	struct netdev_adjacent *upper;
4795 
4796 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4797 
4798 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4799 
4800 	if (&upper->list == &dev->adj_list.upper)
4801 		return NULL;
4802 
4803 	*iter = &upper->list;
4804 
4805 	return upper->dev;
4806 }
4807 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
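
/* Usage sketch (illustrative only, not part of this file): walking the
 * immediate upper devices of dev under RCU with the helper above.
 * foo_inspect() is an assumption standing in for per-device work.
 *
 *	struct net_device *updev;
 *	struct list_head *iter;
 *
 *	rcu_read_lock();
 *	for (iter = &dev->adj_list.upper,
 *	     updev = netdev_upper_get_next_dev_rcu(dev, &iter);
 *	     updev;
 *	     updev = netdev_upper_get_next_dev_rcu(dev, &iter))
 *		foo_inspect(updev);		// hypothetical per-device work
 *	rcu_read_unlock();
 */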
4808 
4809 /**
4810  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4811  * @dev: device
4812  * @iter: list_head ** of the current position
4813  *
4814  * Gets the next device from the dev's upper list, starting from iter
4815  * position. The caller must hold RCU read lock.
4816  */
4817 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4818 						     struct list_head **iter)
4819 {
4820 	struct netdev_adjacent *upper;
4821 
4822 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
4823 
4824 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4825 
4826 	if (&upper->list == &dev->all_adj_list.upper)
4827 		return NULL;
4828 
4829 	*iter = &upper->list;
4830 
4831 	return upper->dev;
4832 }
4833 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4834 
4835 /**
4836  * netdev_lower_get_next_private - Get the next ->private from the
4837  *				   lower neighbour list
4838  * @dev: device
4839  * @iter: list_head ** of the current position
4840  *
4841  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4842  * list, starting from iter position. The caller must either hold the
4843  * RTNL lock or its own locking that guarantees that the neighbour lower
4844  * list will remain unchanged.
4845  */
4846 void *netdev_lower_get_next_private(struct net_device *dev,
4847 				    struct list_head **iter)
4848 {
4849 	struct netdev_adjacent *lower;
4850 
4851 	lower = list_entry(*iter, struct netdev_adjacent, list);
4852 
4853 	if (&lower->list == &dev->adj_list.lower)
4854 		return NULL;
4855 
4856 	*iter = lower->list.next;
4857 
4858 	return lower->private;
4859 }
4860 EXPORT_SYMBOL(netdev_lower_get_next_private);
4861 
4862 /**
4863  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4864  *				       lower neighbour list, RCU
4865  *				       variant
4866  * @dev: device
4867  * @iter: list_head ** of the current position
4868  *
4869  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4870  * list, starting from iter position. The caller must hold RCU read lock.
4871  */
4872 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4873 					struct list_head **iter)
4874 {
4875 	struct netdev_adjacent *lower;
4876 
4877 	WARN_ON_ONCE(!rcu_read_lock_held());
4878 
4879 	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4880 
4881 	if (&lower->list == &dev->adj_list.lower)
4882 		return NULL;
4883 
4884 	*iter = &lower->list;
4885 
4886 	return lower->private;
4887 }
4888 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4889 
4890 /**
4891  * netdev_lower_get_next - Get the next device from the lower neighbour
4892  *                         list
4893  * @dev: device
4894  * @iter: list_head ** of the current position
4895  *
4896  * Gets the next netdev_adjacent from the dev's lower neighbour
4897  * list, starting from iter position. The caller must hold the RTNL lock or
4898  * its own locking that guarantees that the neighbour lower
4899  * list will remain unchanged.
4900  */
4901 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
4902 {
4903 	struct netdev_adjacent *lower;
4904 
4905 	lower = list_entry((*iter)->next, struct netdev_adjacent, list);
4906 
4907 	if (&lower->list == &dev->adj_list.lower)
4908 		return NULL;
4909 
4910 	*iter = &lower->list;
4911 
4912 	return lower->dev;
4913 }
4914 EXPORT_SYMBOL(netdev_lower_get_next);
4915 
4916 /**
4917  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4918  *				       lower neighbour list, RCU
4919  *				       variant
4920  * @dev: device
4921  *
4922  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4923  * list. The caller must hold RCU read lock.
4924  */
4925 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4926 {
4927 	struct netdev_adjacent *lower;
4928 
4929 	lower = list_first_or_null_rcu(&dev->adj_list.lower,
4930 			struct netdev_adjacent, list);
4931 	if (lower)
4932 		return lower->private;
4933 	return NULL;
4934 }
4935 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4936 
4937 /**
4938  * netdev_master_upper_dev_get_rcu - Get master upper device
4939  * @dev: device
4940  *
4941  * Find a master upper device and return pointer to it or NULL in case
4942  * it's not there. The caller must hold the RCU read lock.
4943  */
4944 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4945 {
4946 	struct netdev_adjacent *upper;
4947 
4948 	upper = list_first_or_null_rcu(&dev->adj_list.upper,
4949 				       struct netdev_adjacent, list);
4950 	if (upper && likely(upper->master))
4951 		return upper->dev;
4952 	return NULL;
4953 }
4954 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4955 
4956 static int netdev_adjacent_sysfs_add(struct net_device *dev,
4957 			      struct net_device *adj_dev,
4958 			      struct list_head *dev_list)
4959 {
4960 	char linkname[IFNAMSIZ+7];
4961 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
4962 		"upper_%s" : "lower_%s", adj_dev->name);
4963 	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
4964 				 linkname);
4965 }
4966 static void netdev_adjacent_sysfs_del(struct net_device *dev,
4967 			       char *name,
4968 			       struct list_head *dev_list)
4969 {
4970 	char linkname[IFNAMSIZ+7];
4971 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
4972 		"upper_%s" : "lower_%s", name);
4973 	sysfs_remove_link(&(dev->dev.kobj), linkname);
4974 }
4975 
4976 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
4977 						 struct net_device *adj_dev,
4978 						 struct list_head *dev_list)
4979 {
4980 	return (dev_list == &dev->adj_list.upper ||
4981 		dev_list == &dev->adj_list.lower) &&
4982 		net_eq(dev_net(dev), dev_net(adj_dev));
4983 }
4984 
4985 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4986 					struct net_device *adj_dev,
4987 					struct list_head *dev_list,
4988 					void *private, bool master)
4989 {
4990 	struct netdev_adjacent *adj;
4991 	int ret;
4992 
4993 	adj = __netdev_find_adj(dev, adj_dev, dev_list);
4994 
4995 	if (adj) {
4996 		adj->ref_nr++;
4997 		return 0;
4998 	}
4999 
5000 	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5001 	if (!adj)
5002 		return -ENOMEM;
5003 
5004 	adj->dev = adj_dev;
5005 	adj->master = master;
5006 	adj->ref_nr = 1;
5007 	adj->private = private;
5008 	dev_hold(adj_dev);
5009 
5010 	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5011 		 adj_dev->name, dev->name, adj_dev->name);
5012 
5013 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5014 		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5015 		if (ret)
5016 			goto free_adj;
5017 	}
5018 
5019 	/* Ensure that the master link is always the first item in the list. */
5020 	if (master) {
5021 		ret = sysfs_create_link(&(dev->dev.kobj),
5022 					&(adj_dev->dev.kobj), "master");
5023 		if (ret)
5024 			goto remove_symlinks;
5025 
5026 		list_add_rcu(&adj->list, dev_list);
5027 	} else {
5028 		list_add_tail_rcu(&adj->list, dev_list);
5029 	}
5030 
5031 	return 0;
5032 
5033 remove_symlinks:
5034 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5035 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5036 free_adj:
5037 	kfree(adj);
5038 	dev_put(adj_dev);
5039 
5040 	return ret;
5041 }
5042 
5043 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5044 					 struct net_device *adj_dev,
5045 					 struct list_head *dev_list)
5046 {
5047 	struct netdev_adjacent *adj;
5048 
5049 	adj = __netdev_find_adj(dev, adj_dev, dev_list);
5050 
5051 	if (!adj) {
5052 		pr_err("tried to remove device %s from %s\n",
5053 		       dev->name, adj_dev->name);
5054 		BUG();
5055 	}
5056 
5057 	if (adj->ref_nr > 1) {
5058 		pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5059 			 adj->ref_nr-1);
5060 		adj->ref_nr--;
5061 		return;
5062 	}
5063 
5064 	if (adj->master)
5065 		sysfs_remove_link(&(dev->dev.kobj), "master");
5066 
5067 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5068 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5069 
5070 	list_del_rcu(&adj->list);
5071 	pr_debug("dev_put for %s, because link removed from %s to %s\n",
5072 		 adj_dev->name, dev->name, adj_dev->name);
5073 	dev_put(adj_dev);
5074 	kfree_rcu(adj, rcu);
5075 }
5076 
5077 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5078 					    struct net_device *upper_dev,
5079 					    struct list_head *up_list,
5080 					    struct list_head *down_list,
5081 					    void *private, bool master)
5082 {
5083 	int ret;
5084 
5085 	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5086 					   master);
5087 	if (ret)
5088 		return ret;
5089 
5090 	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5091 					   false);
5092 	if (ret) {
5093 		__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5094 		return ret;
5095 	}
5096 
5097 	return 0;
5098 }
5099 
5100 static int __netdev_adjacent_dev_link(struct net_device *dev,
5101 				      struct net_device *upper_dev)
5102 {
5103 	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5104 						&dev->all_adj_list.upper,
5105 						&upper_dev->all_adj_list.lower,
5106 						NULL, false);
5107 }
5108 
5109 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5110 					       struct net_device *upper_dev,
5111 					       struct list_head *up_list,
5112 					       struct list_head *down_list)
5113 {
5114 	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5115 	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5116 }
5117 
5118 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5119 					 struct net_device *upper_dev)
5120 {
5121 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5122 					   &dev->all_adj_list.upper,
5123 					   &upper_dev->all_adj_list.lower);
5124 }
5125 
5126 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5127 						struct net_device *upper_dev,
5128 						void *private, bool master)
5129 {
5130 	int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5131 
5132 	if (ret)
5133 		return ret;
5134 
5135 	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5136 					       &dev->adj_list.upper,
5137 					       &upper_dev->adj_list.lower,
5138 					       private, master);
5139 	if (ret) {
5140 		__netdev_adjacent_dev_unlink(dev, upper_dev);
5141 		return ret;
5142 	}
5143 
5144 	return 0;
5145 }
5146 
5147 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5148 						   struct net_device *upper_dev)
5149 {
5150 	__netdev_adjacent_dev_unlink(dev, upper_dev);
5151 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5152 					   &dev->adj_list.upper,
5153 					   &upper_dev->adj_list.lower);
5154 }
5155 
5156 static int __netdev_upper_dev_link(struct net_device *dev,
5157 				   struct net_device *upper_dev, bool master,
5158 				   void *private)
5159 {
5160 	struct netdev_adjacent *i, *j, *to_i, *to_j;
5161 	int ret = 0;
5162 
5163 	ASSERT_RTNL();
5164 
5165 	if (dev == upper_dev)
5166 		return -EBUSY;
5167 
5168 	/* To prevent loops, check that dev is not an upper device of upper_dev. */
5169 	if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
5170 		return -EBUSY;
5171 
5172 	if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
5173 		return -EEXIST;
5174 
5175 	if (master && netdev_master_upper_dev_get(dev))
5176 		return -EBUSY;
5177 
5178 	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5179 						   master);
5180 	if (ret)
5181 		return ret;
5182 
5183 	/* Now that we linked these devs, make all the upper_dev's
5184 	 * all_adj_list.upper visible to every dev's all_adj_list.lower and
5185 	 * vice versa, and don't forget the devices themselves. All of these
5186 	 * links are non-neighbours.
5187 	 */
5188 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5189 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5190 			pr_debug("Interlinking %s with %s, non-neighbour\n",
5191 				 i->dev->name, j->dev->name);
5192 			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5193 			if (ret)
5194 				goto rollback_mesh;
5195 		}
5196 	}
5197 
5198 	/* add dev to every upper_dev's upper device */
5199 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5200 		pr_debug("linking %s's upper device %s with %s\n",
5201 			 upper_dev->name, i->dev->name, dev->name);
5202 		ret = __netdev_adjacent_dev_link(dev, i->dev);
5203 		if (ret)
5204 			goto rollback_upper_mesh;
5205 	}
5206 
5207 	/* add upper_dev to every dev's lower device */
5208 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5209 		pr_debug("linking %s's lower device %s with %s\n", dev->name,
5210 			 i->dev->name, upper_dev->name);
5211 		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5212 		if (ret)
5213 			goto rollback_lower_mesh;
5214 	}
5215 
5216 	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5217 	return 0;
5218 
5219 rollback_lower_mesh:
5220 	to_i = i;
5221 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5222 		if (i == to_i)
5223 			break;
5224 		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5225 	}
5226 
5227 	i = NULL;
5228 
5229 rollback_upper_mesh:
5230 	to_i = i;
5231 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5232 		if (i == to_i)
5233 			break;
5234 		__netdev_adjacent_dev_unlink(dev, i->dev);
5235 	}
5236 
5237 	i = j = NULL;
5238 
5239 rollback_mesh:
5240 	to_i = i;
5241 	to_j = j;
5242 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5243 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5244 			if (i == to_i && j == to_j)
5245 				break;
5246 			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5247 		}
5248 		if (i == to_i)
5249 			break;
5250 	}
5251 
5252 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5253 
5254 	return ret;
5255 }
5256 
5257 /**
5258  * netdev_upper_dev_link - Add a link to the upper device
5259  * @dev: device
5260  * @upper_dev: new upper device
5261  *
5262  * Adds a link to a device which is upper to this one. The caller must hold
5263  * the RTNL lock. On a failure a negative errno code is returned.
5264  * On success the reference counts are adjusted and the function
5265  * returns zero.
5266  */
5267 int netdev_upper_dev_link(struct net_device *dev,
5268 			  struct net_device *upper_dev)
5269 {
5270 	return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5271 }
5272 EXPORT_SYMBOL(netdev_upper_dev_link);
5273 
5274 /**
5275  * netdev_master_upper_dev_link - Add a master link to the upper device
5276  * @dev: device
5277  * @upper_dev: new upper device
5278  *
5279  * Adds a link to a device which is upper to this one. In this case, only
5280  * one master upper device can be linked, although other non-master devices
5281  * might be linked as well. The caller must hold the RTNL lock.
5282  * On a failure a negative errno code is returned. On success the reference
5283  * counts are adjusted and the function returns zero.
5284  */
5285 int netdev_master_upper_dev_link(struct net_device *dev,
5286 				 struct net_device *upper_dev)
5287 {
5288 	return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5289 }
5290 EXPORT_SYMBOL(netdev_master_upper_dev_link);
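
/* Usage sketch (illustrative only, not part of this file): a hypothetical
 * aggregation driver enslaving a lower device under its master and undoing
 * the link on release.  slave_dev/bond_dev are assumptions; the RTNL
 * requirement and the link/unlink calls come from the code above.
 *
 *	ASSERT_RTNL();
 *	err = netdev_master_upper_dev_link(slave_dev, bond_dev);
 *	if (err)
 *		goto err_upper_link;
 *	...
 *	netdev_upper_dev_unlink(slave_dev, bond_dev);	// on teardown
 */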
5291 
5292 int netdev_master_upper_dev_link_private(struct net_device *dev,
5293 					 struct net_device *upper_dev,
5294 					 void *private)
5295 {
5296 	return __netdev_upper_dev_link(dev, upper_dev, true, private);
5297 }
5298 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5299 
5300 /**
5301  * netdev_upper_dev_unlink - Removes a link to upper device
5302  * @dev: device
5303  * @upper_dev: upper device to unlink
5304  *
5305  * Removes a link to a device which is upper to this one. The caller must hold
5306  * the RTNL lock.
5307  */
5308 void netdev_upper_dev_unlink(struct net_device *dev,
5309 			     struct net_device *upper_dev)
5310 {
5311 	struct netdev_adjacent *i, *j;
5312 	ASSERT_RTNL();
5313 
5314 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5315 
5316 	/* Here is the tricky part. We must remove all dev's lower
5317 	 * devices from all upper_dev's upper devices and vice
5318 	 * versa, to maintain the graph relationship.
5319 	 */
5320 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5321 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5322 			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5323 
5324 	/* also remove the devices themselves from the lower/upper device
5325 	 * lists
5326 	 */
5327 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5328 		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5329 
5330 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5331 		__netdev_adjacent_dev_unlink(dev, i->dev);
5332 
5333 	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
5334 }
5335 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5336 
5337 /**
5338  * netdev_bonding_info_change - Dispatch event about slave change
5339  * @dev: device
5340  * @bonding_info: info to dispatch
5341  *
5342  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5343  * The caller must hold the RTNL lock.
5344  */
5345 void netdev_bonding_info_change(struct net_device *dev,
5346 				struct netdev_bonding_info *bonding_info)
5347 {
5348 	struct netdev_notifier_bonding_info	info;
5349 
5350 	memcpy(&info.bonding_info, bonding_info,
5351 	       sizeof(struct netdev_bonding_info));
5352 	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5353 				      &info.info);
5354 }
5355 EXPORT_SYMBOL(netdev_bonding_info_change);
5356 
5357 static void netdev_adjacent_add_links(struct net_device *dev)
5358 {
5359 	struct netdev_adjacent *iter;
5360 
5361 	struct net *net = dev_net(dev);
5362 
5363 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5364 		if (!net_eq(net,dev_net(iter->dev)))
5365 			continue;
5366 		netdev_adjacent_sysfs_add(iter->dev, dev,
5367 					  &iter->dev->adj_list.lower);
5368 		netdev_adjacent_sysfs_add(dev, iter->dev,
5369 					  &dev->adj_list.upper);
5370 	}
5371 
5372 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5373 		if (!net_eq(net,dev_net(iter->dev)))
5374 			continue;
5375 		netdev_adjacent_sysfs_add(iter->dev, dev,
5376 					  &iter->dev->adj_list.upper);
5377 		netdev_adjacent_sysfs_add(dev, iter->dev,
5378 					  &dev->adj_list.lower);
5379 	}
5380 }
5381 
5382 static void netdev_adjacent_del_links(struct net_device *dev)
5383 {
5384 	struct netdev_adjacent *iter;
5385 
5386 	struct net *net = dev_net(dev);
5387 
5388 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5389 		if (!net_eq(net,dev_net(iter->dev)))
5390 			continue;
5391 		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5392 					  &iter->dev->adj_list.lower);
5393 		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5394 					  &dev->adj_list.upper);
5395 	}
5396 
5397 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5398 		if (!net_eq(net,dev_net(iter->dev)))
5399 			continue;
5400 		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5401 					  &iter->dev->adj_list.upper);
5402 		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5403 					  &dev->adj_list.lower);
5404 	}
5405 }
5406 
5407 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5408 {
5409 	struct netdev_adjacent *iter;
5410 
5411 	struct net *net = dev_net(dev);
5412 
5413 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5414 		if (!net_eq(net,dev_net(iter->dev)))
5415 			continue;
5416 		netdev_adjacent_sysfs_del(iter->dev, oldname,
5417 					  &iter->dev->adj_list.lower);
5418 		netdev_adjacent_sysfs_add(iter->dev, dev,
5419 					  &iter->dev->adj_list.lower);
5420 	}
5421 
5422 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5423 		if (!net_eq(net,dev_net(iter->dev)))
5424 			continue;
5425 		netdev_adjacent_sysfs_del(iter->dev, oldname,
5426 					  &iter->dev->adj_list.upper);
5427 		netdev_adjacent_sysfs_add(iter->dev, dev,
5428 					  &iter->dev->adj_list.upper);
5429 	}
5430 }
5431 
5432 void *netdev_lower_dev_get_private(struct net_device *dev,
5433 				   struct net_device *lower_dev)
5434 {
5435 	struct netdev_adjacent *lower;
5436 
5437 	if (!lower_dev)
5438 		return NULL;
5439 	lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
5440 	if (!lower)
5441 		return NULL;
5442 
5443 	return lower->private;
5444 }
5445 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5446 
5447 
5448 int dev_get_nest_level(struct net_device *dev,
5449 		       bool (*type_check)(struct net_device *dev))
5450 {
5451 	struct net_device *lower = NULL;
5452 	struct list_head *iter;
5453 	int max_nest = -1;
5454 	int nest;
5455 
5456 	ASSERT_RTNL();
5457 
5458 	netdev_for_each_lower_dev(dev, lower, iter) {
5459 		nest = dev_get_nest_level(lower, type_check);
5460 		if (max_nest < nest)
5461 			max_nest = nest;
5462 	}
5463 
5464 	if (type_check(dev))
5465 		max_nest++;
5466 
5467 	return max_nest;
5468 }
5469 EXPORT_SYMBOL(dev_get_nest_level);
5470 
5471 static void dev_change_rx_flags(struct net_device *dev, int flags)
5472 {
5473 	const struct net_device_ops *ops = dev->netdev_ops;
5474 
5475 	if (ops->ndo_change_rx_flags)
5476 		ops->ndo_change_rx_flags(dev, flags);
5477 }
5478 
5479 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5480 {
5481 	unsigned int old_flags = dev->flags;
5482 	kuid_t uid;
5483 	kgid_t gid;
5484 
5485 	ASSERT_RTNL();
5486 
5487 	dev->flags |= IFF_PROMISC;
5488 	dev->promiscuity += inc;
5489 	if (dev->promiscuity == 0) {
5490 		/*
5491 		 * Avoid overflow.
5492 		 * If inc causes an overflow, leave promisc untouched and return an error.
5493 		 */
5494 		if (inc < 0)
5495 			dev->flags &= ~IFF_PROMISC;
5496 		else {
5497 			dev->promiscuity -= inc;
5498 			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5499 				dev->name);
5500 			return -EOVERFLOW;
5501 		}
5502 	}
5503 	if (dev->flags != old_flags) {
5504 		pr_info("device %s %s promiscuous mode\n",
5505 			dev->name,
5506 			dev->flags & IFF_PROMISC ? "entered" : "left");
5507 		if (audit_enabled) {
5508 			current_uid_gid(&uid, &gid);
5509 			audit_log(current->audit_context, GFP_ATOMIC,
5510 				AUDIT_ANOM_PROMISCUOUS,
5511 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5512 				dev->name, (dev->flags & IFF_PROMISC),
5513 				(old_flags & IFF_PROMISC),
5514 				from_kuid(&init_user_ns, audit_get_loginuid(current)),
5515 				from_kuid(&init_user_ns, uid),
5516 				from_kgid(&init_user_ns, gid),
5517 				audit_get_sessionid(current));
5518 		}
5519 
5520 		dev_change_rx_flags(dev, IFF_PROMISC);
5521 	}
5522 	if (notify)
5523 		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
5524 	return 0;
5525 }
5526 
5527 /**
5528  *	dev_set_promiscuity	- update promiscuity count on a device
5529  *	@dev: device
5530  *	@inc: modifier
5531  *
5532  *	Add or remove promiscuity from a device. While the count in the device
5533  *	remains above zero the interface remains promiscuous. Once it hits zero
5534  *	the device reverts back to normal filtering operation. A negative inc
5535  *	value is used to drop promiscuity on the device.
5536  *	Return 0 if successful or a negative errno code on error.
5537  */
5538 int dev_set_promiscuity(struct net_device *dev, int inc)
5539 {
5540 	unsigned int old_flags = dev->flags;
5541 	int err;
5542 
5543 	err = __dev_set_promiscuity(dev, inc, true);
5544 	if (err < 0)
5545 		return err;
5546 	if (dev->flags != old_flags)
5547 		dev_set_rx_mode(dev);
5548 	return err;
5549 }
5550 EXPORT_SYMBOL(dev_set_promiscuity);
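
/* Usage sketch (illustrative only, not part of this file): a hypothetical
 * in-kernel packet-tap taking and later dropping a promiscuity reference.
 * The surrounding error handling is made up; the RTNL requirement comes
 * from __dev_set_promiscuity() above.
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);	// take a promiscuity reference
 *	rtnl_unlock();
 *	if (err)
 *		return err;
 *	...
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);		// drop it again
 *	rtnl_unlock();
 */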
5551 
5552 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5553 {
5554 	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5555 
5556 	ASSERT_RTNL();
5557 
5558 	dev->flags |= IFF_ALLMULTI;
5559 	dev->allmulti += inc;
5560 	if (dev->allmulti == 0) {
5561 		/*
5562 		 * Avoid overflow.
5563 		 * If inc causes an overflow, leave allmulti untouched and return an error.
5564 		 */
5565 		if (inc < 0)
5566 			dev->flags &= ~IFF_ALLMULTI;
5567 		else {
5568 			dev->allmulti -= inc;
5569 			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5570 				dev->name);
5571 			return -EOVERFLOW;
5572 		}
5573 	}
5574 	if (dev->flags ^ old_flags) {
5575 		dev_change_rx_flags(dev, IFF_ALLMULTI);
5576 		dev_set_rx_mode(dev);
5577 		if (notify)
5578 			__dev_notify_flags(dev, old_flags,
5579 					   dev->gflags ^ old_gflags);
5580 	}
5581 	return 0;
5582 }
5583 
5584 /**
5585  *	dev_set_allmulti	- update allmulti count on a device
5586  *	@dev: device
5587  *	@inc: modifier
5588  *
5589  *	Add or remove reception of all multicast frames to a device. While the
5590  *	count in the device remains above zero the interface remains listening
5591  *	to all interfaces. Once it hits zero the device reverts back to normal
5592  *	filtering operation. A negative @inc value is used to drop the counter
5593  *	when releasing a resource needing all multicasts.
5594  *	Return 0 if successful or a negative errno code on error.
5595  */
5596 
5597 int dev_set_allmulti(struct net_device *dev, int inc)
5598 {
5599 	return __dev_set_allmulti(dev, inc, true);
5600 }
5601 EXPORT_SYMBOL(dev_set_allmulti);
5602 
5603 /*
5604  *	Upload unicast and multicast address lists to device and
5605  *	configure RX filtering. When the device doesn't support unicast
5606  *	filtering it is put in promiscuous mode while unicast addresses
5607  *	are present.
5608  */
5609 void __dev_set_rx_mode(struct net_device *dev)
5610 {
5611 	const struct net_device_ops *ops = dev->netdev_ops;
5612 
5613 	/* dev_open will call this function so the list will stay sane. */
5614 	if (!(dev->flags&IFF_UP))
5615 		return;
5616 
5617 	if (!netif_device_present(dev))
5618 		return;
5619 
5620 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5621 		/* Unicast address changes may only happen under the rtnl,
5622 		 * therefore calling __dev_set_promiscuity here is safe.
5623 		 */
5624 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5625 			__dev_set_promiscuity(dev, 1, false);
5626 			dev->uc_promisc = true;
5627 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5628 			__dev_set_promiscuity(dev, -1, false);
5629 			dev->uc_promisc = false;
5630 		}
5631 	}
5632 
5633 	if (ops->ndo_set_rx_mode)
5634 		ops->ndo_set_rx_mode(dev);
5635 }
5636 
5637 void dev_set_rx_mode(struct net_device *dev)
5638 {
5639 	netif_addr_lock_bh(dev);
5640 	__dev_set_rx_mode(dev);
5641 	netif_addr_unlock_bh(dev);
5642 }
5643 
5644 /**
5645  *	dev_get_flags - get flags reported to userspace
5646  *	@dev: device
5647  *
5648  *	Get the combination of flag bits exported through APIs to userspace.
5649  */
5650 unsigned int dev_get_flags(const struct net_device *dev)
5651 {
5652 	unsigned int flags;
5653 
5654 	flags = (dev->flags & ~(IFF_PROMISC |
5655 				IFF_ALLMULTI |
5656 				IFF_RUNNING |
5657 				IFF_LOWER_UP |
5658 				IFF_DORMANT)) |
5659 		(dev->gflags & (IFF_PROMISC |
5660 				IFF_ALLMULTI));
5661 
5662 	if (netif_running(dev)) {
5663 		if (netif_oper_up(dev))
5664 			flags |= IFF_RUNNING;
5665 		if (netif_carrier_ok(dev))
5666 			flags |= IFF_LOWER_UP;
5667 		if (netif_dormant(dev))
5668 			flags |= IFF_DORMANT;
5669 	}
5670 
5671 	return flags;
5672 }
5673 EXPORT_SYMBOL(dev_get_flags);
5674 
5675 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5676 {
5677 	unsigned int old_flags = dev->flags;
5678 	int ret;
5679 
5680 	ASSERT_RTNL();
5681 
5682 	/*
5683 	 *	Set the flags on our device.
5684 	 */
5685 
5686 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5687 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5688 			       IFF_AUTOMEDIA)) |
5689 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5690 				    IFF_ALLMULTI));
5691 
5692 	/*
5693 	 *	Load in the correct multicast list now the flags have changed.
5694 	 */
5695 
5696 	if ((old_flags ^ flags) & IFF_MULTICAST)
5697 		dev_change_rx_flags(dev, IFF_MULTICAST);
5698 
5699 	dev_set_rx_mode(dev);
5700 
5701 	/*
5702 	 *	Have we downed the interface? We handle IFF_UP ourselves
5703 	 *	according to user attempts to set it, rather than blindly
5704 	 *	setting it.
5705 	 */
5706 
5707 	ret = 0;
5708 	if ((old_flags ^ flags) & IFF_UP)
5709 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5710 
5711 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
5712 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
5713 		unsigned int old_flags = dev->flags;
5714 
5715 		dev->gflags ^= IFF_PROMISC;
5716 
5717 		if (__dev_set_promiscuity(dev, inc, false) >= 0)
5718 			if (dev->flags != old_flags)
5719 				dev_set_rx_mode(dev);
5720 	}
5721 
5722 	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5723 	   is important. Some (broken) drivers set IFF_PROMISC when
5724 	   IFF_ALLMULTI is requested, without asking us and without reporting it.
5725 	 */
5726 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5727 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5728 
5729 		dev->gflags ^= IFF_ALLMULTI;
5730 		__dev_set_allmulti(dev, inc, false);
5731 	}
5732 
5733 	return ret;
5734 }
5735 
5736 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5737 			unsigned int gchanges)
5738 {
5739 	unsigned int changes = dev->flags ^ old_flags;
5740 
5741 	if (gchanges)
5742 		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5743 
5744 	if (changes & IFF_UP) {
5745 		if (dev->flags & IFF_UP)
5746 			call_netdevice_notifiers(NETDEV_UP, dev);
5747 		else
5748 			call_netdevice_notifiers(NETDEV_DOWN, dev);
5749 	}
5750 
5751 	if (dev->flags & IFF_UP &&
5752 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5753 		struct netdev_notifier_change_info change_info;
5754 
5755 		change_info.flags_changed = changes;
5756 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5757 					      &change_info.info);
5758 	}
5759 }
5760 
5761 /**
5762  *	dev_change_flags - change device settings
5763  *	@dev: device
5764  *	@flags: device state flags
5765  *
5766  *	Change settings on a device based on the supplied state flags.
5767  *	The flags are in the userspace-exported format.
5768  */
5769 int dev_change_flags(struct net_device *dev, unsigned int flags)
5770 {
5771 	int ret;
5772 	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5773 
5774 	ret = __dev_change_flags(dev, flags);
5775 	if (ret < 0)
5776 		return ret;
5777 
5778 	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5779 	__dev_notify_flags(dev, old_flags, changes);
5780 	return ret;
5781 }
5782 EXPORT_SYMBOL(dev_change_flags);
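
/*
 * Editor's note: an illustrative sketch only.  dev_change_flags() takes the
 * complete userspace-format flag word, so in-kernel callers typically read
 * the current flags with dev_get_flags() and flip individual bits while
 * holding RTNL.  The helper name my_bring_up() is invented:
 *
 *	static int my_bring_up(struct net_device *dev)
 *	{
 *		unsigned int flags = dev_get_flags(dev);
 *
 *		ASSERT_RTNL();
 *		if (flags & IFF_UP)
 *			return 0;
 *		return dev_change_flags(dev, flags | IFF_UP);
 *	}
 */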
5783 
5784 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
5785 {
5786 	const struct net_device_ops *ops = dev->netdev_ops;
5787 
5788 	if (ops->ndo_change_mtu)
5789 		return ops->ndo_change_mtu(dev, new_mtu);
5790 
5791 	dev->mtu = new_mtu;
5792 	return 0;
5793 }
5794 
5795 /**
5796  *	dev_set_mtu - Change maximum transfer unit
5797  *	@dev: device
5798  *	@new_mtu: new transfer unit
5799  *
5800  *	Change the maximum transfer size of the network device.
5801  */
5802 int dev_set_mtu(struct net_device *dev, int new_mtu)
5803 {
5804 	int err, orig_mtu;
5805 
5806 	if (new_mtu == dev->mtu)
5807 		return 0;
5808 
5809 	/*	MTU must not be negative.	 */
5810 	if (new_mtu < 0)
5811 		return -EINVAL;
5812 
5813 	if (!netif_device_present(dev))
5814 		return -ENODEV;
5815 
5816 	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
5817 	err = notifier_to_errno(err);
5818 	if (err)
5819 		return err;
5820 
5821 	orig_mtu = dev->mtu;
5822 	err = __dev_set_mtu(dev, new_mtu);
5823 
5824 	if (!err) {
5825 		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5826 		err = notifier_to_errno(err);
5827 		if (err) {
5828 			/* setting mtu back and notifying everyone again,
5829 			 * so that they have a chance to revert changes.
5830 			 */
5831 			__dev_set_mtu(dev, orig_mtu);
5832 			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5833 		}
5834 	}
5835 	return err;
5836 }
5837 EXPORT_SYMBOL(dev_set_mtu);
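
/*
 * Editor's note: a hedged usage sketch.  dev_set_mtu() must run under RTNL
 * (the notifier calls assert it) and returns an errno-style value that
 * callers should check.  The my_shrink_mtu() helper and its overhead
 * parameter are invented for illustration:
 *
 *	static int my_shrink_mtu(struct net_device *lower, int overhead)
 *	{
 *		int err;
 *
 *		rtnl_lock();
 *		err = dev_set_mtu(lower, lower->mtu - overhead);
 *		rtnl_unlock();
 *		if (err)
 *			netdev_warn(lower, "MTU change failed: %d\n", err);
 *		return err;
 *	}
 */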
5838 
5839 /**
5840  *	dev_set_group - Change group this device belongs to
5841  *	@dev: device
5842  *	@new_group: group this device should belong to
5843  */
5844 void dev_set_group(struct net_device *dev, int new_group)
5845 {
5846 	dev->group = new_group;
5847 }
5848 EXPORT_SYMBOL(dev_set_group);
5849 
5850 /**
5851  *	dev_set_mac_address - Change Media Access Control Address
5852  *	@dev: device
5853  *	@sa: new address
5854  *
5855  *	Change the hardware (MAC) address of the device
5856  */
5857 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5858 {
5859 	const struct net_device_ops *ops = dev->netdev_ops;
5860 	int err;
5861 
5862 	if (!ops->ndo_set_mac_address)
5863 		return -EOPNOTSUPP;
5864 	if (sa->sa_family != dev->type)
5865 		return -EINVAL;
5866 	if (!netif_device_present(dev))
5867 		return -ENODEV;
5868 	err = ops->ndo_set_mac_address(dev, sa);
5869 	if (err)
5870 		return err;
5871 	dev->addr_assign_type = NET_ADDR_SET;
5872 	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5873 	add_device_randomness(dev->dev_addr, dev->addr_len);
5874 	return 0;
5875 }
5876 EXPORT_SYMBOL(dev_set_mac_address);
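
/*
 * Editor's note: illustrative only.  dev_set_mac_address() expects a
 * struct sockaddr whose sa_family matches dev->type, with the new address
 * in sa_data.  A hypothetical caller already holding RTNL (my_set_mac() is
 * an invented name) might look like:
 *
 *	static int my_set_mac(struct net_device *dev, const u8 *mac)
 *	{
 *		struct sockaddr sa;
 *
 *		ASSERT_RTNL();
 *		sa.sa_family = dev->type;
 *		memcpy(sa.sa_data, mac, dev->addr_len);
 *		return dev_set_mac_address(dev, &sa);
 *	}
 */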
5877 
5878 /**
5879  *	dev_change_carrier - Change device carrier
5880  *	@dev: device
5881  *	@new_carrier: new value
5882  *
5883  *	Change device carrier
5884  */
5885 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5886 {
5887 	const struct net_device_ops *ops = dev->netdev_ops;
5888 
5889 	if (!ops->ndo_change_carrier)
5890 		return -EOPNOTSUPP;
5891 	if (!netif_device_present(dev))
5892 		return -ENODEV;
5893 	return ops->ndo_change_carrier(dev, new_carrier);
5894 }
5895 EXPORT_SYMBOL(dev_change_carrier);
5896 
5897 /**
5898  *	dev_get_phys_port_id - Get device physical port ID
5899  *	@dev: device
5900  *	@ppid: port ID
5901  *
5902  *	Get device physical port ID
5903  */
5904 int dev_get_phys_port_id(struct net_device *dev,
5905 			 struct netdev_phys_item_id *ppid)
5906 {
5907 	const struct net_device_ops *ops = dev->netdev_ops;
5908 
5909 	if (!ops->ndo_get_phys_port_id)
5910 		return -EOPNOTSUPP;
5911 	return ops->ndo_get_phys_port_id(dev, ppid);
5912 }
5913 EXPORT_SYMBOL(dev_get_phys_port_id);
5914 
5915 /**
5916  *	dev_get_phys_port_name - Get device physical port name
5917  *	@dev: device
5918  *	@name: port name
5919  *
5920  *	Get device physical port name
5921  */
5922 int dev_get_phys_port_name(struct net_device *dev,
5923 			   char *name, size_t len)
5924 {
5925 	const struct net_device_ops *ops = dev->netdev_ops;
5926 
5927 	if (!ops->ndo_get_phys_port_name)
5928 		return -EOPNOTSUPP;
5929 	return ops->ndo_get_phys_port_name(dev, name, len);
5930 }
5931 EXPORT_SYMBOL(dev_get_phys_port_name);
5932 
5933 /**
5934  *	dev_new_index	-	allocate an ifindex
5935  *	@net: the applicable net namespace
5936  *
5937  *	Returns a suitable unique value for a new device interface
5938  *	number.  The caller must hold the rtnl semaphore or the
5939  *	dev_base_lock to be sure it remains unique.
5940  */
5941 static int dev_new_index(struct net *net)
5942 {
5943 	int ifindex = net->ifindex;
5944 	for (;;) {
5945 		if (++ifindex <= 0)
5946 			ifindex = 1;
5947 		if (!__dev_get_by_index(net, ifindex))
5948 			return net->ifindex = ifindex;
5949 	}
5950 }
5951 
5952 /* Delayed registration/unregistration */
5953 static LIST_HEAD(net_todo_list);
5954 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5955 
5956 static void net_set_todo(struct net_device *dev)
5957 {
5958 	list_add_tail(&dev->todo_list, &net_todo_list);
5959 	dev_net(dev)->dev_unreg_count++;
5960 }
5961 
5962 static void rollback_registered_many(struct list_head *head)
5963 {
5964 	struct net_device *dev, *tmp;
5965 	LIST_HEAD(close_head);
5966 
5967 	BUG_ON(dev_boot_phase);
5968 	ASSERT_RTNL();
5969 
5970 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5971 		/* Some devices call this without ever having been
5972 		 * registered, as part of initialization unwind. Remove
5973 		 * those devices and proceed with the remaining ones.
5974 		 */
5975 		if (dev->reg_state == NETREG_UNINITIALIZED) {
5976 			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5977 				 dev->name, dev);
5978 
5979 			WARN_ON(1);
5980 			list_del(&dev->unreg_list);
5981 			continue;
5982 		}
5983 		dev->dismantle = true;
5984 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
5985 	}
5986 
5987 	/* If device is running, close it first. */
5988 	list_for_each_entry(dev, head, unreg_list)
5989 		list_add_tail(&dev->close_list, &close_head);
5990 	dev_close_many(&close_head, true);
5991 
5992 	list_for_each_entry(dev, head, unreg_list) {
5993 		/* And unlink it from device chain. */
5994 		unlist_netdevice(dev);
5995 
5996 		dev->reg_state = NETREG_UNREGISTERING;
5997 	}
5998 
5999 	synchronize_net();
6000 
6001 	list_for_each_entry(dev, head, unreg_list) {
6002 		struct sk_buff *skb = NULL;
6003 
6004 		/* Shutdown queueing discipline. */
6005 		dev_shutdown(dev);
6006 
6007 
6008 		/* Notify protocols that we are about to destroy
6009 		   this device. They should clean up all of their state.
6010 		*/
6011 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6012 
6013 		if (!dev->rtnl_link_ops ||
6014 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6015 			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6016 						     GFP_KERNEL);
6017 
6018 		/*
6019 		 *	Flush the unicast and multicast chains
6020 		 */
6021 		dev_uc_flush(dev);
6022 		dev_mc_flush(dev);
6023 
6024 		if (dev->netdev_ops->ndo_uninit)
6025 			dev->netdev_ops->ndo_uninit(dev);
6026 
6027 		if (skb)
6028 			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6029 
6030 		/* Notifier chain MUST detach us all upper devices. */
6031 		WARN_ON(netdev_has_any_upper_dev(dev));
6032 
6033 		/* Remove entries from kobject tree */
6034 		netdev_unregister_kobject(dev);
6035 #ifdef CONFIG_XPS
6036 		/* Remove XPS queueing entries */
6037 		netif_reset_xps_queues_gt(dev, 0);
6038 #endif
6039 	}
6040 
6041 	synchronize_net();
6042 
6043 	list_for_each_entry(dev, head, unreg_list)
6044 		dev_put(dev);
6045 }
6046 
6047 static void rollback_registered(struct net_device *dev)
6048 {
6049 	LIST_HEAD(single);
6050 
6051 	list_add(&dev->unreg_list, &single);
6052 	rollback_registered_many(&single);
6053 	list_del(&single);
6054 }
6055 
6056 static netdev_features_t netdev_fix_features(struct net_device *dev,
6057 	netdev_features_t features)
6058 {
6059 	/* Fix illegal checksum combinations */
6060 	if ((features & NETIF_F_HW_CSUM) &&
6061 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6062 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6063 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6064 	}
6065 
6066 	/* TSO requires that SG is present as well. */
6067 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6068 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6069 		features &= ~NETIF_F_ALL_TSO;
6070 	}
6071 
6072 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6073 					!(features & NETIF_F_IP_CSUM)) {
6074 		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6075 		features &= ~NETIF_F_TSO;
6076 		features &= ~NETIF_F_TSO_ECN;
6077 	}
6078 
6079 	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6080 					 !(features & NETIF_F_IPV6_CSUM)) {
6081 		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6082 		features &= ~NETIF_F_TSO6;
6083 	}
6084 
6085 	/* TSO ECN requires that TSO is present as well. */
6086 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6087 		features &= ~NETIF_F_TSO_ECN;
6088 
6089 	/* Software GSO depends on SG. */
6090 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6091 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6092 		features &= ~NETIF_F_GSO;
6093 	}
6094 
6095 	/* UFO needs SG and checksumming */
6096 	if (features & NETIF_F_UFO) {
6097 		/* maybe split UFO into V4 and V6? */
6098 		if (!((features & NETIF_F_GEN_CSUM) ||
6099 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6100 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6101 			netdev_dbg(dev,
6102 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
6103 			features &= ~NETIF_F_UFO;
6104 		}
6105 
6106 		if (!(features & NETIF_F_SG)) {
6107 			netdev_dbg(dev,
6108 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6109 			features &= ~NETIF_F_UFO;
6110 		}
6111 	}
6112 
6113 #ifdef CONFIG_NET_RX_BUSY_POLL
6114 	if (dev->netdev_ops->ndo_busy_poll)
6115 		features |= NETIF_F_BUSY_POLL;
6116 	else
6117 #endif
6118 		features &= ~NETIF_F_BUSY_POLL;
6119 
6120 	return features;
6121 }
6122 
6123 int __netdev_update_features(struct net_device *dev)
6124 {
6125 	netdev_features_t features;
6126 	int err = 0;
6127 
6128 	ASSERT_RTNL();
6129 
6130 	features = netdev_get_wanted_features(dev);
6131 
6132 	if (dev->netdev_ops->ndo_fix_features)
6133 		features = dev->netdev_ops->ndo_fix_features(dev, features);
6134 
6135 	/* driver might be less strict about feature dependencies */
6136 	features = netdev_fix_features(dev, features);
6137 
6138 	if (dev->features == features)
6139 		return 0;
6140 
6141 	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6142 		&dev->features, &features);
6143 
6144 	if (dev->netdev_ops->ndo_set_features)
6145 		err = dev->netdev_ops->ndo_set_features(dev, features);
6146 
6147 	if (unlikely(err < 0)) {
6148 		netdev_err(dev,
6149 			"set_features() failed (%d); wanted %pNF, left %pNF\n",
6150 			err, &features, &dev->features);
6151 		return -1;
6152 	}
6153 
6154 	if (!err)
6155 		dev->features = features;
6156 
6157 	return 1;
6158 }
6159 
6160 /**
6161  *	netdev_update_features - recalculate device features
6162  *	@dev: the device to check
6163  *
6164  *	Recalculate dev->features set and send notifications if it
6165  *	has changed. Should be called after driver or hardware dependent
6166  *	conditions might have changed that influence the features.
6167  */
6168 void netdev_update_features(struct net_device *dev)
6169 {
6170 	if (__netdev_update_features(dev))
6171 		netdev_features_change(dev);
6172 }
6173 EXPORT_SYMBOL(netdev_update_features);
6174 
6175 /**
6176  *	netdev_change_features - recalculate device features
6177  *	@dev: the device to check
6178  *
6179  *	Recalculate dev->features set and send notifications even
6180  *	if they have not changed. Should be called instead of
6181  *	netdev_update_features() if also dev->vlan_features might
6182  *	have changed to allow the changes to be propagated to stacked
6183  *	VLAN devices.
6184  */
6185 void netdev_change_features(struct net_device *dev)
6186 {
6187 	__netdev_update_features(dev);
6188 	netdev_features_change(dev);
6189 }
6190 EXPORT_SYMBOL(netdev_change_features);
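
/*
 * Editor's note: a sketch of how a driver is expected to use the helpers
 * above.  Drivers do not call __netdev_update_features() directly; they
 * adjust dev->hw_features (or rely on ndo_fix_features()) and then ask the
 * core to recompute.  my_fw_caps_changed() is an invented callback name:
 *
 *	static void my_fw_caps_changed(struct net_device *dev, bool tso_ok)
 *	{
 *		ASSERT_RTNL();
 *		if (tso_ok)
 *			dev->hw_features |= NETIF_F_TSO;
 *		else
 *			dev->hw_features &= ~NETIF_F_TSO;
 *		netdev_update_features(dev);
 *	}
 */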
6191 
6192 /**
6193  *	netif_stacked_transfer_operstate -	transfer operstate
6194  *	@rootdev: the root or lower level device to transfer state from
6195  *	@dev: the device to transfer operstate to
6196  *
6197  *	Transfer operational state from root to device. This is normally
6198  *	called when a stacking relationship exists between the root
6199  *	device and the device (a leaf device).
6200  */
6201 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6202 					struct net_device *dev)
6203 {
6204 	if (rootdev->operstate == IF_OPER_DORMANT)
6205 		netif_dormant_on(dev);
6206 	else
6207 		netif_dormant_off(dev);
6208 
6209 	if (netif_carrier_ok(rootdev)) {
6210 		if (!netif_carrier_ok(dev))
6211 			netif_carrier_on(dev);
6212 	} else {
6213 		if (netif_carrier_ok(dev))
6214 			netif_carrier_off(dev);
6215 	}
6216 }
6217 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6218 
6219 #ifdef CONFIG_SYSFS
6220 static int netif_alloc_rx_queues(struct net_device *dev)
6221 {
6222 	unsigned int i, count = dev->num_rx_queues;
6223 	struct netdev_rx_queue *rx;
6224 	size_t sz = count * sizeof(*rx);
6225 
6226 	BUG_ON(count < 1);
6227 
6228 	rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6229 	if (!rx) {
6230 		rx = vzalloc(sz);
6231 		if (!rx)
6232 			return -ENOMEM;
6233 	}
6234 	dev->_rx = rx;
6235 
6236 	for (i = 0; i < count; i++)
6237 		rx[i].dev = dev;
6238 	return 0;
6239 }
6240 #endif
6241 
6242 static void netdev_init_one_queue(struct net_device *dev,
6243 				  struct netdev_queue *queue, void *_unused)
6244 {
6245 	/* Initialize queue lock */
6246 	spin_lock_init(&queue->_xmit_lock);
6247 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6248 	queue->xmit_lock_owner = -1;
6249 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6250 	queue->dev = dev;
6251 #ifdef CONFIG_BQL
6252 	dql_init(&queue->dql, HZ);
6253 #endif
6254 }
6255 
6256 static void netif_free_tx_queues(struct net_device *dev)
6257 {
6258 	kvfree(dev->_tx);
6259 }
6260 
6261 static int netif_alloc_netdev_queues(struct net_device *dev)
6262 {
6263 	unsigned int count = dev->num_tx_queues;
6264 	struct netdev_queue *tx;
6265 	size_t sz = count * sizeof(*tx);
6266 
6267 	BUG_ON(count < 1 || count > 0xffff);
6268 
6269 	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6270 	if (!tx) {
6271 		tx = vzalloc(sz);
6272 		if (!tx)
6273 			return -ENOMEM;
6274 	}
6275 	dev->_tx = tx;
6276 
6277 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6278 	spin_lock_init(&dev->tx_global_lock);
6279 
6280 	return 0;
6281 }
6282 
6283 /**
6284  *	register_netdevice	- register a network device
6285  *	@dev: device to register
6286  *
6287  *	Take a completed network device structure and add it to the kernel
6288  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6289  *	chain. 0 is returned on success. A negative errno code is returned
6290  *	on a failure to set up the device, or if the name is a duplicate.
6291  *
6292  *	Callers must hold the rtnl semaphore. You may want
6293  *	register_netdev() instead of this.
6294  *
6295  *	BUGS:
6296  *	The locking appears insufficient to guarantee two parallel registers
6297  *	will not get the same name.
6298  */
6299 
6300 int register_netdevice(struct net_device *dev)
6301 {
6302 	int ret;
6303 	struct net *net = dev_net(dev);
6304 
6305 	BUG_ON(dev_boot_phase);
6306 	ASSERT_RTNL();
6307 
6308 	might_sleep();
6309 
6310 	/* When net_device's are persistent, this will be fatal. */
6311 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6312 	BUG_ON(!net);
6313 
6314 	spin_lock_init(&dev->addr_list_lock);
6315 	netdev_set_addr_lockdep_class(dev);
6316 
6317 	dev->iflink = -1;
6318 
6319 	ret = dev_get_valid_name(net, dev, dev->name);
6320 	if (ret < 0)
6321 		goto out;
6322 
6323 	/* Init, if this function is available */
6324 	if (dev->netdev_ops->ndo_init) {
6325 		ret = dev->netdev_ops->ndo_init(dev);
6326 		if (ret) {
6327 			if (ret > 0)
6328 				ret = -EIO;
6329 			goto out;
6330 		}
6331 	}
6332 
6333 	if (((dev->hw_features | dev->features) &
6334 	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
6335 	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6336 	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6337 		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6338 		ret = -EINVAL;
6339 		goto err_uninit;
6340 	}
6341 
6342 	ret = -EBUSY;
6343 	if (!dev->ifindex)
6344 		dev->ifindex = dev_new_index(net);
6345 	else if (__dev_get_by_index(net, dev->ifindex))
6346 		goto err_uninit;
6347 
6348 	if (dev->iflink == -1)
6349 		dev->iflink = dev->ifindex;
6350 
6351 	/* Transfer changeable features to wanted_features and enable
6352 	 * software offloads (GSO and GRO).
6353 	 */
6354 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
6355 	dev->features |= NETIF_F_SOFT_FEATURES;
6356 	dev->wanted_features = dev->features & dev->hw_features;
6357 
6358 	if (!(dev->flags & IFF_LOOPBACK)) {
6359 		dev->hw_features |= NETIF_F_NOCACHE_COPY;
6360 	}
6361 
6362 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6363 	 */
6364 	dev->vlan_features |= NETIF_F_HIGHDMA;
6365 
6366 	/* Make NETIF_F_SG inheritable to tunnel devices.
6367 	 */
6368 	dev->hw_enc_features |= NETIF_F_SG;
6369 
6370 	/* Make NETIF_F_SG inheritable to MPLS.
6371 	 */
6372 	dev->mpls_features |= NETIF_F_SG;
6373 
6374 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6375 	ret = notifier_to_errno(ret);
6376 	if (ret)
6377 		goto err_uninit;
6378 
6379 	ret = netdev_register_kobject(dev);
6380 	if (ret)
6381 		goto err_uninit;
6382 	dev->reg_state = NETREG_REGISTERED;
6383 
6384 	__netdev_update_features(dev);
6385 
6386 	/*
6387 	 *	Default initial state at registration is that the
6388 	 *	device is present.
6389 	 */
6390 
6391 	set_bit(__LINK_STATE_PRESENT, &dev->state);
6392 
6393 	linkwatch_init_dev(dev);
6394 
6395 	dev_init_scheduler(dev);
6396 	dev_hold(dev);
6397 	list_netdevice(dev);
6398 	add_device_randomness(dev->dev_addr, dev->addr_len);
6399 
6400 	/* If the device has a permanent device address, the driver should
6401 	 * set dev_addr, and addr_assign_type should remain
6402 	 * NET_ADDR_PERM (the default value).
6403 	 */
6404 	if (dev->addr_assign_type == NET_ADDR_PERM)
6405 		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6406 
6407 	/* Notify protocols, that a new device appeared. */
6408 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6409 	ret = notifier_to_errno(ret);
6410 	if (ret) {
6411 		rollback_registered(dev);
6412 		dev->reg_state = NETREG_UNREGISTERED;
6413 	}
6414 	/*
6415 	 *	Prevent userspace races by waiting until the network
6416 	 *	device is fully setup before sending notifications.
6417 	 */
6418 	if (!dev->rtnl_link_ops ||
6419 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6420 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6421 
6422 out:
6423 	return ret;
6424 
6425 err_uninit:
6426 	if (dev->netdev_ops->ndo_uninit)
6427 		dev->netdev_ops->ndo_uninit(dev);
6428 	goto out;
6429 }
6430 EXPORT_SYMBOL(register_netdevice);
6431 
6432 /**
6433  *	init_dummy_netdev	- init a dummy network device for NAPI
6434  *	@dev: device to init
6435  *
6436  *	This takes a network device structure and initializes the minimum
6437  *	number of fields so it can be used to schedule NAPI polls without
6438  *	registering a full-blown interface. This is to be used by drivers
6439  *	that need to tie several hardware interfaces to a single NAPI
6440  *	poll scheduler due to HW limitations.
6441  */
6442 int init_dummy_netdev(struct net_device *dev)
6443 {
6444 	/* Clear everything. Note we don't initialize spinlocks
6445 	 * as they aren't supposed to be taken by any of the
6446 	 * NAPI code and this dummy netdev is supposed to be
6447 	 * used only for NAPI polls.
6448 	 */
6449 	memset(dev, 0, sizeof(struct net_device));
6450 
6451 	/* make sure we BUG if trying to hit standard
6452 	 * register/unregister code path
6453 	 */
6454 	dev->reg_state = NETREG_DUMMY;
6455 
6456 	/* NAPI wants this */
6457 	INIT_LIST_HEAD(&dev->napi_list);
6458 
6459 	/* a dummy interface is started by default */
6460 	set_bit(__LINK_STATE_PRESENT, &dev->state);
6461 	set_bit(__LINK_STATE_START, &dev->state);
6462 
6463 	/* Note: We don't allocate pcpu_refcnt for dummy devices,
6464 	 * because users of this 'device' don't need to change
6465 	 * its refcount.
6466 	 */
6467 
6468 	return 0;
6469 }
6470 EXPORT_SYMBOL_GPL(init_dummy_netdev);
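
/*
 * Editor's note: an illustrative sketch of the intended use.  A driver that
 * multiplexes several hardware channels over one NAPI context embeds a
 * dummy netdev purely to anchor the napi_struct (priv, my_poll and the
 * field names are invented):
 *
 *	init_dummy_netdev(&priv->dummy_dev);
 *	netif_napi_add(&priv->dummy_dev, &priv->napi, my_poll,
 *		       NAPI_POLL_WEIGHT);
 *	napi_enable(&priv->napi);
 */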
6471 
6472 
6473 /**
6474  *	register_netdev	- register a network device
6475  *	@dev: device to register
6476  *
6477  *	Take a completed network device structure and add it to the kernel
6478  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6479  *	chain. 0 is returned on success. A negative errno code is returned
6480  *	on a failure to set up the device, or if the name is a duplicate.
6481  *
6482  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
6483  *	and expands the device name if you passed a format string to
6484  *	alloc_netdev.
6485  */
6486 int register_netdev(struct net_device *dev)
6487 {
6488 	int err;
6489 
6490 	rtnl_lock();
6491 	err = register_netdevice(dev);
6492 	rtnl_unlock();
6493 	return err;
6494 }
6495 EXPORT_SYMBOL(register_netdev);
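
/*
 * Editor's note: a minimal registration sketch, assuming an Ethernet-style
 * driver.  Everything prefixed with my_ (my_probe, my_priv, my_netdev_ops,
 * my_hw) is invented for illustration; alloc_etherdev() is the usual
 * wrapper around alloc_netdev_mqs() defined below:
 *
 *	static int my_probe(struct my_hw *hw)
 *	{
 *		struct net_device *dev;
 *		int err;
 *
 *		dev = alloc_etherdev(sizeof(struct my_priv));
 *		if (!dev)
 *			return -ENOMEM;
 *		dev->netdev_ops = &my_netdev_ops;
 *		eth_hw_addr_random(dev);
 *		err = register_netdev(dev);
 *		if (err)
 *			free_netdev(dev);
 *		return err;
 *	}
 */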
6496 
6497 int netdev_refcnt_read(const struct net_device *dev)
6498 {
6499 	int i, refcnt = 0;
6500 
6501 	for_each_possible_cpu(i)
6502 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6503 	return refcnt;
6504 }
6505 EXPORT_SYMBOL(netdev_refcnt_read);
6506 
6507 /**
6508  * netdev_wait_allrefs - wait until all references are gone.
6509  * @dev: target net_device
6510  *
6511  * This is called when unregistering network devices.
6512  *
6513  * Any protocol or device that holds a reference should register
6514  * for netdevice notification, and clean up and release the
6515  * reference if it receives an UNREGISTER event.
6516  * We can get stuck here if buggy protocols don't correctly
6517  * call dev_put.
6518  */
6519 static void netdev_wait_allrefs(struct net_device *dev)
6520 {
6521 	unsigned long rebroadcast_time, warning_time;
6522 	int refcnt;
6523 
6524 	linkwatch_forget_dev(dev);
6525 
6526 	rebroadcast_time = warning_time = jiffies;
6527 	refcnt = netdev_refcnt_read(dev);
6528 
6529 	while (refcnt != 0) {
6530 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6531 			rtnl_lock();
6532 
6533 			/* Rebroadcast unregister notification */
6534 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6535 
6536 			__rtnl_unlock();
6537 			rcu_barrier();
6538 			rtnl_lock();
6539 
6540 			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6541 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6542 				     &dev->state)) {
6543 				/* We must not have linkwatch events
6544 				 * pending on unregister. If this
6545 				 * happens, we simply run the queue
6546 				 * unscheduled, resulting in a noop
6547 				 * for this device.
6548 				 */
6549 				linkwatch_run_queue();
6550 			}
6551 
6552 			__rtnl_unlock();
6553 
6554 			rebroadcast_time = jiffies;
6555 		}
6556 
6557 		msleep(250);
6558 
6559 		refcnt = netdev_refcnt_read(dev);
6560 
6561 		if (time_after(jiffies, warning_time + 10 * HZ)) {
6562 			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6563 				 dev->name, refcnt);
6564 			warning_time = jiffies;
6565 		}
6566 	}
6567 }
6568 
6569 /* The sequence is:
6570  *
6571  *	rtnl_lock();
6572  *	...
6573  *	register_netdevice(x1);
6574  *	register_netdevice(x2);
6575  *	...
6576  *	unregister_netdevice(y1);
6577  *	unregister_netdevice(y2);
6578  *      ...
6579  *	rtnl_unlock();
6580  *	free_netdev(y1);
6581  *	free_netdev(y2);
6582  *
6583  * We are invoked by rtnl_unlock().
6584  * This allows us to deal with problems:
6585  * 1) We can delete sysfs objects which invoke hotplug
6586  *    without deadlocking with linkwatch via keventd.
6587  * 2) Since we run with the RTNL semaphore not held, we can sleep
6588  *    safely in order to wait for the netdev refcnt to drop to zero.
6589  *
6590  * We must not return until all unregister events added during
6591  * the interval the lock was held have been completed.
6592  */
6593 void netdev_run_todo(void)
6594 {
6595 	struct list_head list;
6596 
6597 	/* Snapshot list, allow later requests */
6598 	list_replace_init(&net_todo_list, &list);
6599 
6600 	__rtnl_unlock();
6601 
6602 
6603 	/* Wait for rcu callbacks to finish before next phase */
6604 	if (!list_empty(&list))
6605 		rcu_barrier();
6606 
6607 	while (!list_empty(&list)) {
6608 		struct net_device *dev
6609 			= list_first_entry(&list, struct net_device, todo_list);
6610 		list_del(&dev->todo_list);
6611 
6612 		rtnl_lock();
6613 		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6614 		__rtnl_unlock();
6615 
6616 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6617 			pr_err("network todo '%s' but state %d\n",
6618 			       dev->name, dev->reg_state);
6619 			dump_stack();
6620 			continue;
6621 		}
6622 
6623 		dev->reg_state = NETREG_UNREGISTERED;
6624 
6625 		on_each_cpu(flush_backlog, dev, 1);
6626 
6627 		netdev_wait_allrefs(dev);
6628 
6629 		/* paranoia */
6630 		BUG_ON(netdev_refcnt_read(dev));
6631 		BUG_ON(!list_empty(&dev->ptype_all));
6632 		BUG_ON(!list_empty(&dev->ptype_specific));
6633 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
6634 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6635 		WARN_ON(dev->dn_ptr);
6636 
6637 		if (dev->destructor)
6638 			dev->destructor(dev);
6639 
6640 		/* Report a network device has been unregistered */
6641 		rtnl_lock();
6642 		dev_net(dev)->dev_unreg_count--;
6643 		__rtnl_unlock();
6644 		wake_up(&netdev_unregistering_wq);
6645 
6646 		/* Free network device */
6647 		kobject_put(&dev->dev.kobj);
6648 	}
6649 }
6650 
6651 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6652  * fields in the same order, with only the type differing.
6653  */
6654 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6655 			     const struct net_device_stats *netdev_stats)
6656 {
6657 #if BITS_PER_LONG == 64
6658 	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6659 	memcpy(stats64, netdev_stats, sizeof(*stats64));
6660 #else
6661 	size_t i, n = sizeof(*stats64) / sizeof(u64);
6662 	const unsigned long *src = (const unsigned long *)netdev_stats;
6663 	u64 *dst = (u64 *)stats64;
6664 
6665 	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6666 		     sizeof(*stats64) / sizeof(u64));
6667 	for (i = 0; i < n; i++)
6668 		dst[i] = src[i];
6669 #endif
6670 }
6671 EXPORT_SYMBOL(netdev_stats_to_stats64);
6672 
6673 /**
6674  *	dev_get_stats	- get network device statistics
6675  *	@dev: device to get statistics from
6676  *	@storage: place to store stats
6677  *
6678  *	Get network statistics from device. Return @storage.
6679  *	The device driver may provide its own method by setting
6680  *	dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
6681  *	otherwise the internal statistics structure is used.
6682  */
6683 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6684 					struct rtnl_link_stats64 *storage)
6685 {
6686 	const struct net_device_ops *ops = dev->netdev_ops;
6687 
6688 	if (ops->ndo_get_stats64) {
6689 		memset(storage, 0, sizeof(*storage));
6690 		ops->ndo_get_stats64(dev, storage);
6691 	} else if (ops->ndo_get_stats) {
6692 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6693 	} else {
6694 		netdev_stats_to_stats64(storage, &dev->stats);
6695 	}
6696 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6697 	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
6698 	return storage;
6699 }
6700 EXPORT_SYMBOL(dev_get_stats);
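
/*
 * Editor's note: usage sketch.  Callers supply their own
 * struct rtnl_link_stats64 as @storage; the function always returns that
 * same pointer, so the result can be read directly from the local copy
 * (my_rx_packets() is an invented helper):
 *
 *	static u64 my_rx_packets(struct net_device *dev)
 *	{
 *		struct rtnl_link_stats64 stats;
 *
 *		dev_get_stats(dev, &stats);
 *		return stats.rx_packets;
 *	}
 */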
6701 
6702 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6703 {
6704 	struct netdev_queue *queue = dev_ingress_queue(dev);
6705 
6706 #ifdef CONFIG_NET_CLS_ACT
6707 	if (queue)
6708 		return queue;
6709 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6710 	if (!queue)
6711 		return NULL;
6712 	netdev_init_one_queue(dev, queue, NULL);
6713 	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
6714 	queue->qdisc_sleeping = &noop_qdisc;
6715 	rcu_assign_pointer(dev->ingress_queue, queue);
6716 #endif
6717 	return queue;
6718 }
6719 
6720 static const struct ethtool_ops default_ethtool_ops;
6721 
6722 void netdev_set_default_ethtool_ops(struct net_device *dev,
6723 				    const struct ethtool_ops *ops)
6724 {
6725 	if (dev->ethtool_ops == &default_ethtool_ops)
6726 		dev->ethtool_ops = ops;
6727 }
6728 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6729 
6730 void netdev_freemem(struct net_device *dev)
6731 {
6732 	char *addr = (char *)dev - dev->padded;
6733 
6734 	kvfree(addr);
6735 }
6736 
6737 /**
6738  *	alloc_netdev_mqs - allocate network device
6739  *	@sizeof_priv:		size of private data to allocate space for
6740  *	@name:			device name format string
6741  *	@name_assign_type: 	origin of device name
6742  *	@setup:			callback to initialize device
6743  *	@txqs:			the number of TX subqueues to allocate
6744  *	@rxqs:			the number of RX subqueues to allocate
6745  *
6746  *	Allocates a struct net_device with private data area for driver use
6747  *	and performs basic initialization.  Also allocates subqueue structs
6748  *	for each queue on the device.
6749  */
6750 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6751 		unsigned char name_assign_type,
6752 		void (*setup)(struct net_device *),
6753 		unsigned int txqs, unsigned int rxqs)
6754 {
6755 	struct net_device *dev;
6756 	size_t alloc_size;
6757 	struct net_device *p;
6758 
6759 	BUG_ON(strlen(name) >= sizeof(dev->name));
6760 
6761 	if (txqs < 1) {
6762 		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6763 		return NULL;
6764 	}
6765 
6766 #ifdef CONFIG_SYSFS
6767 	if (rxqs < 1) {
6768 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6769 		return NULL;
6770 	}
6771 #endif
6772 
6773 	alloc_size = sizeof(struct net_device);
6774 	if (sizeof_priv) {
6775 		/* ensure 32-byte alignment of private area */
6776 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6777 		alloc_size += sizeof_priv;
6778 	}
6779 	/* ensure 32-byte alignment of whole construct */
6780 	alloc_size += NETDEV_ALIGN - 1;
6781 
6782 	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6783 	if (!p)
6784 		p = vzalloc(alloc_size);
6785 	if (!p)
6786 		return NULL;
6787 
6788 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
6789 	dev->padded = (char *)dev - (char *)p;
6790 
6791 	dev->pcpu_refcnt = alloc_percpu(int);
6792 	if (!dev->pcpu_refcnt)
6793 		goto free_dev;
6794 
6795 	if (dev_addr_init(dev))
6796 		goto free_pcpu;
6797 
6798 	dev_mc_init(dev);
6799 	dev_uc_init(dev);
6800 
6801 	dev_net_set(dev, &init_net);
6802 
6803 	dev->gso_max_size = GSO_MAX_SIZE;
6804 	dev->gso_max_segs = GSO_MAX_SEGS;
6805 	dev->gso_min_segs = 0;
6806 
6807 	INIT_LIST_HEAD(&dev->napi_list);
6808 	INIT_LIST_HEAD(&dev->unreg_list);
6809 	INIT_LIST_HEAD(&dev->close_list);
6810 	INIT_LIST_HEAD(&dev->link_watch_list);
6811 	INIT_LIST_HEAD(&dev->adj_list.upper);
6812 	INIT_LIST_HEAD(&dev->adj_list.lower);
6813 	INIT_LIST_HEAD(&dev->all_adj_list.upper);
6814 	INIT_LIST_HEAD(&dev->all_adj_list.lower);
6815 	INIT_LIST_HEAD(&dev->ptype_all);
6816 	INIT_LIST_HEAD(&dev->ptype_specific);
6817 	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
6818 	setup(dev);
6819 
6820 	dev->num_tx_queues = txqs;
6821 	dev->real_num_tx_queues = txqs;
6822 	if (netif_alloc_netdev_queues(dev))
6823 		goto free_all;
6824 
6825 #ifdef CONFIG_SYSFS
6826 	dev->num_rx_queues = rxqs;
6827 	dev->real_num_rx_queues = rxqs;
6828 	if (netif_alloc_rx_queues(dev))
6829 		goto free_all;
6830 #endif
6831 
6832 	strcpy(dev->name, name);
6833 	dev->name_assign_type = name_assign_type;
6834 	dev->group = INIT_NETDEV_GROUP;
6835 	if (!dev->ethtool_ops)
6836 		dev->ethtool_ops = &default_ethtool_ops;
6837 	return dev;
6838 
6839 free_all:
6840 	free_netdev(dev);
6841 	return NULL;
6842 
6843 free_pcpu:
6844 	free_percpu(dev->pcpu_refcnt);
6845 free_dev:
6846 	netdev_freemem(dev);
6847 	return NULL;
6848 }
6849 EXPORT_SYMBOL(alloc_netdev_mqs);
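
/*
 * Editor's note: an invented example of a direct alloc_netdev_mqs() call
 * for a multiqueue device (struct my_priv is hypothetical; most drivers use
 * the alloc_etherdev_mqs()/alloc_netdev() wrappers instead):
 *
 *	dev = alloc_netdev_mqs(sizeof(struct my_priv), "eth%d",
 *			       NET_NAME_UNKNOWN, ether_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 */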
6850 
6851 /**
6852  *	free_netdev - free network device
6853  *	@dev: device
6854  *
6855  *	This function does the last stage of destroying an allocated device
6856  * 	interface. The reference to the device object is released.
6857  *	If this is the last reference then it will be freed.
6858  */
6859 void free_netdev(struct net_device *dev)
6860 {
6861 	struct napi_struct *p, *n;
6862 
6863 	netif_free_tx_queues(dev);
6864 #ifdef CONFIG_SYSFS
6865 	kvfree(dev->_rx);
6866 #endif
6867 
6868 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6869 
6870 	/* Flush device addresses */
6871 	dev_addr_flush(dev);
6872 
6873 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6874 		netif_napi_del(p);
6875 
6876 	free_percpu(dev->pcpu_refcnt);
6877 	dev->pcpu_refcnt = NULL;
6878 
6879 	/*  Compatibility with error handling in drivers */
6880 	if (dev->reg_state == NETREG_UNINITIALIZED) {
6881 		netdev_freemem(dev);
6882 		return;
6883 	}
6884 
6885 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6886 	dev->reg_state = NETREG_RELEASED;
6887 
6888 	/* will free via device release */
6889 	put_device(&dev->dev);
6890 }
6891 EXPORT_SYMBOL(free_netdev);
6892 
6893 /**
6894  *	synchronize_net -  Synchronize with packet receive processing
6895  *
6896  *	Wait for packets currently being received to be done.
6897  *	Does not block later packets from starting.
6898  */
6899 void synchronize_net(void)
6900 {
6901 	might_sleep();
6902 	if (rtnl_is_locked())
6903 		synchronize_rcu_expedited();
6904 	else
6905 		synchronize_rcu();
6906 }
6907 EXPORT_SYMBOL(synchronize_net);
6908 
6909 /**
6910  *	unregister_netdevice_queue - remove device from the kernel
6911  *	@dev: device
6912  *	@head: list
6913  *
6914  *	This function shuts down a device interface and removes it
6915  *	from the kernel tables.
6916  *	If @head is not NULL, the device is queued to be unregistered later.
6917  *
6918  *	Callers must hold the rtnl semaphore.  You may want
6919  *	unregister_netdev() instead of this.
6920  */
6921 
6922 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6923 {
6924 	ASSERT_RTNL();
6925 
6926 	if (head) {
6927 		list_move_tail(&dev->unreg_list, head);
6928 	} else {
6929 		rollback_registered(dev);
6930 		/* Finish processing unregister after unlock */
6931 		net_set_todo(dev);
6932 	}
6933 }
6934 EXPORT_SYMBOL(unregister_netdevice_queue);
6935 
6936 /**
6937  *	unregister_netdevice_many - unregister many devices
6938  *	@head: list of devices
6939  *
6940  *  Note: As most callers use a stack-allocated list_head,
6941  *  we force a list_del() to make sure the stack won't be corrupted later.
6942  */
6943 void unregister_netdevice_many(struct list_head *head)
6944 {
6945 	struct net_device *dev;
6946 
6947 	if (!list_empty(head)) {
6948 		rollback_registered_many(head);
6949 		list_for_each_entry(dev, head, unreg_list)
6950 			net_set_todo(dev);
6951 		list_del(head);
6952 	}
6953 }
6954 EXPORT_SYMBOL(unregister_netdevice_many);
6955 
6956 /**
6957  *	unregister_netdev - remove device from the kernel
6958  *	@dev: device
6959  *
6960  *	This function shuts down a device interface and removes it
6961  *	from the kernel tables.
6962  *
6963  *	This is just a wrapper for unregister_netdevice that takes
6964  *	the rtnl semaphore.  In general you want to use this and not
6965  *	unregister_netdevice.
6966  */
6967 void unregister_netdev(struct net_device *dev)
6968 {
6969 	rtnl_lock();
6970 	unregister_netdevice(dev);
6971 	rtnl_unlock();
6972 }
6973 EXPORT_SYMBOL(unregister_netdev);
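
/*
 * Editor's note: teardown sketch mirroring the registration example above.
 * unregister_netdev() takes RTNL and, via netdev_run_todo(), waits for all
 * outstanding references before free_netdev() releases the memory
 * (my_remove() and the hw back-pointer are invented):
 *
 *	static void my_remove(struct my_hw *hw)
 *	{
 *		unregister_netdev(hw->dev);
 *		free_netdev(hw->dev);
 *	}
 */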
6974 
6975 /**
6976  *	dev_change_net_namespace - move device to a different network namespace
6977  *	@dev: device
6978  *	@net: network namespace
6979  *	@pat: If not NULL name pattern to try if the current device name
6980  *	      is already taken in the destination network namespace.
6981  *
6982  *	This function shuts down a device interface and moves it
6983  *	to a new network namespace. On success 0 is returned, on
6984  *	a failure a negative errno code is returned.
6985  *
6986  *	Callers must hold the rtnl semaphore.
6987  */
6988 
6989 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6990 {
6991 	int err;
6992 
6993 	ASSERT_RTNL();
6994 
6995 	/* Don't allow namespace local devices to be moved. */
6996 	err = -EINVAL;
6997 	if (dev->features & NETIF_F_NETNS_LOCAL)
6998 		goto out;
6999 
7000 	/* Ensure the device has been registered */
7001 	if (dev->reg_state != NETREG_REGISTERED)
7002 		goto out;
7003 
7004 	/* Get out if there is nothing to do */
7005 	err = 0;
7006 	if (net_eq(dev_net(dev), net))
7007 		goto out;
7008 
7009 	/* Pick the destination device name, and ensure
7010 	 * we can use it in the destination network namespace.
7011 	 */
7012 	err = -EEXIST;
7013 	if (__dev_get_by_name(net, dev->name)) {
7014 		/* We get here if we can't use the current device name */
7015 		if (!pat)
7016 			goto out;
7017 		if (dev_get_valid_name(net, dev, pat) < 0)
7018 			goto out;
7019 	}
7020 
7021 	/*
7022 	 * And now a mini version of register_netdevice() and unregister_netdevice().
7023 	 */
7024 
7025 	/* If device is running close it first. */
7026 	dev_close(dev);
7027 
7028 	/* And unlink it from device chain */
7029 	err = -ENODEV;
7030 	unlist_netdevice(dev);
7031 
7032 	synchronize_net();
7033 
7034 	/* Shutdown queueing discipline. */
7035 	dev_shutdown(dev);
7036 
7037 	/* Notify protocols that we are about to destroy
7038 	   this device. They should clean up all of their state.
7039 
7040 	   Note that dev->reg_state stays at NETREG_REGISTERED.
7041 	   This is intentional, so that 8021q and macvlan know
7042 	   the device is just moving and can keep their slaves up.
7043 	*/
7044 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7045 	rcu_barrier();
7046 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7047 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7048 
7049 	/*
7050 	 *	Flush the unicast and multicast chains
7051 	 */
7052 	dev_uc_flush(dev);
7053 	dev_mc_flush(dev);
7054 
7055 	/* Send a netdev-removed uevent to the old namespace */
7056 	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7057 	netdev_adjacent_del_links(dev);
7058 
7059 	/* Actually switch the network namespace */
7060 	dev_net_set(dev, net);
7061 
7062 	/* If there is an ifindex conflict assign a new one */
7063 	if (__dev_get_by_index(net, dev->ifindex)) {
7064 		int iflink = (dev->iflink == dev->ifindex);
7065 		dev->ifindex = dev_new_index(net);
7066 		if (iflink)
7067 			dev->iflink = dev->ifindex;
7068 	}
7069 
7070 	/* Send a netdev-add uevent to the new namespace */
7071 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7072 	netdev_adjacent_add_links(dev);
7073 
7074 	/* Fixup kobjects */
7075 	err = device_rename(&dev->dev, dev->name);
7076 	WARN_ON(err);
7077 
7078 	/* Add the device back in the hashes */
7079 	list_netdevice(dev);
7080 
7081 	/* Notify protocols, that a new device appeared. */
7082 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
7083 
7084 	/*
7085 	 *	Prevent userspace races by waiting until the network
7086 	 *	device is fully setup before sending notifications.
7087 	 */
7088 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7089 
7090 	synchronize_net();
7091 	err = 0;
7092 out:
7093 	return err;
7094 }
7095 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7096 
7097 static int dev_cpu_callback(struct notifier_block *nfb,
7098 			    unsigned long action,
7099 			    void *ocpu)
7100 {
7101 	struct sk_buff **list_skb;
7102 	struct sk_buff *skb;
7103 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
7104 	struct softnet_data *sd, *oldsd;
7105 
7106 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7107 		return NOTIFY_OK;
7108 
7109 	local_irq_disable();
7110 	cpu = smp_processor_id();
7111 	sd = &per_cpu(softnet_data, cpu);
7112 	oldsd = &per_cpu(softnet_data, oldcpu);
7113 
7114 	/* Find end of our completion_queue. */
7115 	list_skb = &sd->completion_queue;
7116 	while (*list_skb)
7117 		list_skb = &(*list_skb)->next;
7118 	/* Append completion queue from offline CPU. */
7119 	*list_skb = oldsd->completion_queue;
7120 	oldsd->completion_queue = NULL;
7121 
7122 	/* Append output queue from offline CPU. */
7123 	if (oldsd->output_queue) {
7124 		*sd->output_queue_tailp = oldsd->output_queue;
7125 		sd->output_queue_tailp = oldsd->output_queue_tailp;
7126 		oldsd->output_queue = NULL;
7127 		oldsd->output_queue_tailp = &oldsd->output_queue;
7128 	}
7129 	/* Append NAPI poll list from offline CPU, with one exception:
7130 	 * process_backlog() must be called by the CPU owning the per-cpu backlog.
7131 	 * We properly handle process_queue & input_pkt_queue later.
7132 	 */
7133 	while (!list_empty(&oldsd->poll_list)) {
7134 		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7135 							    struct napi_struct,
7136 							    poll_list);
7137 
7138 		list_del_init(&napi->poll_list);
7139 		if (napi->poll == process_backlog)
7140 			napi->state = 0;
7141 		else
7142 			____napi_schedule(sd, napi);
7143 	}
7144 
7145 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
7146 	local_irq_enable();
7147 
7148 	/* Process offline CPU's input_pkt_queue */
7149 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7150 		netif_rx_ni(skb);
7151 		input_queue_head_incr(oldsd);
7152 	}
7153 	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7154 		netif_rx_ni(skb);
7155 		input_queue_head_incr(oldsd);
7156 	}
7157 
7158 	return NOTIFY_OK;
7159 }
7160 
7161 
7162 /**
7163  *	netdev_increment_features - increment feature set by one
7164  *	@all: current feature set
7165  *	@one: new feature set
7166  *	@mask: mask feature set
7167  *
7168  *	Computes a new feature set after adding a device with feature set
7169  *	@one to the master device with current feature set @all.  Will not
7170  *	enable anything that is off in @mask. Returns the new feature set.
7171  */
7172 netdev_features_t netdev_increment_features(netdev_features_t all,
7173 	netdev_features_t one, netdev_features_t mask)
7174 {
7175 	if (mask & NETIF_F_GEN_CSUM)
7176 		mask |= NETIF_F_ALL_CSUM;
7177 	mask |= NETIF_F_VLAN_CHALLENGED;
7178 
7179 	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7180 	all &= one | ~NETIF_F_ALL_FOR_ALL;
7181 
7182 	/* If one device supports hw checksumming, set for all. */
7183 	if (all & NETIF_F_GEN_CSUM)
7184 		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7185 
7186 	return all;
7187 }
7188 EXPORT_SYMBOL(netdev_increment_features);
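
/*
 * Editor's note: a simplified sketch of how stacked drivers (bridge,
 * bonding, team) fold lower-device features into a master with this helper;
 * struct my_lower and the list layout are invented:
 *
 *	static netdev_features_t my_fold_lowers(netdev_features_t features,
 *						struct list_head *lowers)
 *	{
 *		netdev_features_t mask = features;
 *		struct my_lower *l;
 *
 *		features &= ~NETIF_F_ONE_FOR_ALL;
 *		list_for_each_entry(l, lowers, list)
 *			features = netdev_increment_features(features,
 *							     l->dev->features,
 *							     mask);
 *		return features;
 *	}
 */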
7189 
7190 static struct hlist_head * __net_init netdev_create_hash(void)
7191 {
7192 	int i;
7193 	struct hlist_head *hash;
7194 
7195 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7196 	if (hash != NULL)
7197 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
7198 			INIT_HLIST_HEAD(&hash[i]);
7199 
7200 	return hash;
7201 }
7202 
7203 /* Initialize per network namespace state */
7204 static int __net_init netdev_init(struct net *net)
7205 {
7206 	if (net != &init_net)
7207 		INIT_LIST_HEAD(&net->dev_base_head);
7208 
7209 	net->dev_name_head = netdev_create_hash();
7210 	if (net->dev_name_head == NULL)
7211 		goto err_name;
7212 
7213 	net->dev_index_head = netdev_create_hash();
7214 	if (net->dev_index_head == NULL)
7215 		goto err_idx;
7216 
7217 	return 0;
7218 
7219 err_idx:
7220 	kfree(net->dev_name_head);
7221 err_name:
7222 	return -ENOMEM;
7223 }
7224 
7225 /**
7226  *	netdev_drivername - network driver for the device
7227  *	@dev: network device
7228  *
7229  *	Determine network driver for device.
7230  */
7231 const char *netdev_drivername(const struct net_device *dev)
7232 {
7233 	const struct device_driver *driver;
7234 	const struct device *parent;
7235 	const char *empty = "";
7236 
7237 	parent = dev->dev.parent;
7238 	if (!parent)
7239 		return empty;
7240 
7241 	driver = parent->driver;
7242 	if (driver && driver->name)
7243 		return driver->name;
7244 	return empty;
7245 }
7246 
7247 static void __netdev_printk(const char *level, const struct net_device *dev,
7248 			    struct va_format *vaf)
7249 {
7250 	if (dev && dev->dev.parent) {
7251 		dev_printk_emit(level[1] - '0',
7252 				dev->dev.parent,
7253 				"%s %s %s%s: %pV",
7254 				dev_driver_string(dev->dev.parent),
7255 				dev_name(dev->dev.parent),
7256 				netdev_name(dev), netdev_reg_state(dev),
7257 				vaf);
7258 	} else if (dev) {
7259 		printk("%s%s%s: %pV",
7260 		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
7261 	} else {
7262 		printk("%s(NULL net_device): %pV", level, vaf);
7263 	}
7264 }
7265 
7266 void netdev_printk(const char *level, const struct net_device *dev,
7267 		   const char *format, ...)
7268 {
7269 	struct va_format vaf;
7270 	va_list args;
7271 
7272 	va_start(args, format);
7273 
7274 	vaf.fmt = format;
7275 	vaf.va = &args;
7276 
7277 	__netdev_printk(level, dev, &vaf);
7278 
7279 	va_end(args);
7280 }
7281 EXPORT_SYMBOL(netdev_printk);
7282 
7283 #define define_netdev_printk_level(func, level)			\
7284 void func(const struct net_device *dev, const char *fmt, ...)	\
7285 {								\
7286 	struct va_format vaf;					\
7287 	va_list args;						\
7288 								\
7289 	va_start(args, fmt);					\
7290 								\
7291 	vaf.fmt = fmt;						\
7292 	vaf.va = &args;						\
7293 								\
7294 	__netdev_printk(level, dev, &vaf);			\
7295 								\
7296 	va_end(args);						\
7297 }								\
7298 EXPORT_SYMBOL(func);
7299 
7300 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7301 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7302 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7303 define_netdev_printk_level(netdev_err, KERN_ERR);
7304 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7305 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7306 define_netdev_printk_level(netdev_info, KERN_INFO);
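
/*
 * Editor's note: the level-specific helpers generated above are used like
 * dev_err()/dev_info(), prefixing the message with the driver name, the
 * parent device name and the interface name, e.g. (txq and speed are
 * placeholder variables):
 *
 *	netdev_err(dev, "TX timeout on queue %d\n", txq);
 *	netdev_info(dev, "link up, %u Mbps\n", speed);
 */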
7307 
7308 static void __net_exit netdev_exit(struct net *net)
7309 {
7310 	kfree(net->dev_name_head);
7311 	kfree(net->dev_index_head);
7312 }
7313 
7314 static struct pernet_operations __net_initdata netdev_net_ops = {
7315 	.init = netdev_init,
7316 	.exit = netdev_exit,
7317 };
7318 
7319 static void __net_exit default_device_exit(struct net *net)
7320 {
7321 	struct net_device *dev, *aux;
7322 	/*
7323 	 * Push all migratable network devices back to the
7324 	 * initial network namespace
7325 	 */
7326 	rtnl_lock();
7327 	for_each_netdev_safe(net, dev, aux) {
7328 		int err;
7329 		char fb_name[IFNAMSIZ];
7330 
7331 		/* Ignore unmoveable devices (i.e. loopback) */
7332 		if (dev->features & NETIF_F_NETNS_LOCAL)
7333 			continue;
7334 
7335 		/* Leave virtual devices for the generic cleanup */
7336 		if (dev->rtnl_link_ops)
7337 			continue;
7338 
7339 		/* Push remaining network devices to init_net */
7340 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7341 		err = dev_change_net_namespace(dev, &init_net, fb_name);
7342 		if (err) {
7343 			pr_emerg("%s: failed to move %s to init_net: %d\n",
7344 				 __func__, dev->name, err);
7345 			BUG();
7346 		}
7347 	}
7348 	rtnl_unlock();
7349 }
7350 
7351 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7352 {
7353 	/* Return with the rtnl_lock held when there are no network
7354 	 * devices unregistering in any network namespace in net_list.
7355 	 */
7356 	struct net *net;
7357 	bool unregistering;
7358 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
7359 
7360 	add_wait_queue(&netdev_unregistering_wq, &wait);
7361 	for (;;) {
7362 		unregistering = false;
7363 		rtnl_lock();
7364 		list_for_each_entry(net, net_list, exit_list) {
7365 			if (net->dev_unreg_count > 0) {
7366 				unregistering = true;
7367 				break;
7368 			}
7369 		}
7370 		if (!unregistering)
7371 			break;
7372 		__rtnl_unlock();
7373 
7374 		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7375 	}
7376 	remove_wait_queue(&netdev_unregistering_wq, &wait);
7377 }
7378 
7379 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7380 {
7381 	/* At exit all network devices must be removed from a network
7382 	 * namespace.  Do this in the reverse order of registration.
7383 	 * Do this across as many network namespaces as possible to
7384 	 * improve batching efficiency.
7385 	 */
7386 	struct net_device *dev;
7387 	struct net *net;
7388 	LIST_HEAD(dev_kill_list);
7389 
7390 	/* To prevent network device cleanup code from dereferencing
7391 	 * loopback devices or network devices that have been freed,
7392 	 * wait here for all pending unregistrations to complete
7393 	 * before unregistering the loopback device and allowing the
7394 	 * network namespace to be freed.
7395 	 *
7396 	 * The netdev todo list containing all network devices
7397 	 * unregistrations that happen in default_device_exit_batch
7398 	 * will run in the rtnl_unlock() at the end of
7399 	 * default_device_exit_batch.
7400 	 */
7401 	rtnl_lock_unregistering(net_list);
7402 	list_for_each_entry(net, net_list, exit_list) {
7403 		for_each_netdev_reverse(net, dev) {
7404 			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7405 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7406 			else
7407 				unregister_netdevice_queue(dev, &dev_kill_list);
7408 		}
7409 	}
7410 	unregister_netdevice_many(&dev_kill_list);
7411 	rtnl_unlock();
7412 }
7413 
7414 static struct pernet_operations __net_initdata default_device_ops = {
7415 	.exit = default_device_exit,
7416 	.exit_batch = default_device_exit_batch,
7417 };
7418 
7419 /*
7420  *	Initialize the DEV module. At boot time this walks the device list and
7421  *	unhooks any devices that fail to initialise (normally hardware not
7422  *	present) and leaves us with a valid list of present and active devices.
7423  *
7424  */
7425 
7426 /*
7427  *       This is called single-threaded during boot, so there is no need
7428  *       to take the rtnl semaphore.
7429  */
7430 static int __init net_dev_init(void)
7431 {
7432 	int i, rc = -ENOMEM;
7433 
7434 	BUG_ON(!dev_boot_phase);
7435 
7436 	if (dev_proc_init())
7437 		goto out;
7438 
7439 	if (netdev_kobject_init())
7440 		goto out;
7441 
7442 	INIT_LIST_HEAD(&ptype_all);
7443 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
7444 		INIT_LIST_HEAD(&ptype_base[i]);
7445 
7446 	INIT_LIST_HEAD(&offload_base);
7447 
7448 	if (register_pernet_subsys(&netdev_net_ops))
7449 		goto out;
7450 
7451 	/*
7452 	 *	Initialise the packet receive queues.
7453 	 */
7454 
7455 	for_each_possible_cpu(i) {
7456 		struct softnet_data *sd = &per_cpu(softnet_data, i);
7457 
7458 		skb_queue_head_init(&sd->input_pkt_queue);
7459 		skb_queue_head_init(&sd->process_queue);
7460 		INIT_LIST_HEAD(&sd->poll_list);
7461 		sd->output_queue_tailp = &sd->output_queue;
7462 #ifdef CONFIG_RPS
7463 		sd->csd.func = rps_trigger_softirq;
7464 		sd->csd.info = sd;
7465 		sd->cpu = i;
7466 #endif
7467 
7468 		sd->backlog.poll = process_backlog;
7469 		sd->backlog.weight = weight_p;
7470 	}
7471 
7472 	dev_boot_phase = 0;
7473 
7474 	/* The loopback device is special: if any other network device
7475 	 * is present in a network namespace the loopback device must
7476 	 * be present as well. Since we now dynamically allocate and free
7477 	 * the loopback device, ensure this invariant is maintained by
7478 	 * keeping the loopback device the first device on the
7479 	 * list of network devices, ensuring the loopback device
7480 	 * is the first device that appears and the last network device
7481 	 * that disappears.
7482 	 */
7483 	if (register_pernet_device(&loopback_net_ops))
7484 		goto out;
7485 
7486 	if (register_pernet_device(&default_device_ops))
7487 		goto out;
7488 
7489 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7490 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7491 
7492 	hotcpu_notifier(dev_cpu_callback, 0);
7493 	dst_init();
7494 	rc = 0;
7495 out:
7496 	return rc;
7497 }
7498 
7499 subsys_initcall(net_dev_init);
7500