xref: /openbmc/linux/net/core/dev.c (revision 96c63fa7393d0a346acfe5a91e0c7d4c7782641b)
1 /*
2  * 	NET3	Protocol independent device support routines.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  *	Derived from the non IP parts of dev.c 1.0.19
10  * 		Authors:	Ross Biro
11  *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *				Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *
14  *	Additional Authors:
15  *		Florian la Roche <rzsfl@rz.uni-sb.de>
16  *		Alan Cox <gw4pts@gw4pts.ampr.org>
17  *		David Hinds <dahinds@users.sourceforge.net>
18  *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19  *		Adam Sulmicki <adam@cfar.umd.edu>
20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
21  *
22  *	Changes:
23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
24  *              			to 2 if register_netdev gets called
25  *              			before net_dev_init & also removed a
26  *              			few lines of code in the process.
27  *		Alan Cox	:	device private ioctl copies fields back.
28  *		Alan Cox	:	Transmit queue code does relevant
29  *					stunts to keep the queue safe.
30  *		Alan Cox	:	Fixed double lock.
31  *		Alan Cox	:	Fixed promisc NULL pointer trap
32  *		????????	:	Support the full private ioctl range
33  *		Alan Cox	:	Moved ioctl permission check into
34  *					drivers
35  *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
36  *		Alan Cox	:	100 backlog just doesn't cut it when
37  *					you start doing multicast video 8)
38  *		Alan Cox	:	Rewrote net_bh and list manager.
39  *		Alan Cox	: 	Fix ETH_P_ALL echoback lengths.
40  *		Alan Cox	:	Took out transmit every packet pass
41  *					Saved a few bytes in the ioctl handler
42  *		Alan Cox	:	Network driver sets packet type before
43  *					calling netif_rx. Saves a function
44  *					call a packet.
45  *		Alan Cox	:	Hashed net_bh()
46  *		Richard Kooijman:	Timestamp fixes.
47  *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
48  *		Alan Cox	:	Device lock protection.
49  *		Alan Cox	: 	Fixed nasty side effect of device close
50  *					changes.
51  *		Rudi Cilibrasi	:	Pass the right thing to
52  *					set_mac_address()
53  *		Dave Miller	:	32bit quantity for the device lock to
54  *					make it work out on a Sparc.
55  *		Bjorn Ekwall	:	Added KERNELD hack.
56  *		Alan Cox	:	Cleaned up the backlog initialise.
57  *		Craig Metz	:	SIOCGIFCONF fix if space for under
58  *					1 device.
59  *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
60  *					is no device open function.
61  *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
62  *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
63  *		Cyrus Durgin	:	Cleaned for KMOD
64  *		Adam Sulmicki   :	Bug Fix : Network Device Unload
65  *					A network device unload needs to purge
66  *					the backlog queue.
67  *	Paul Rusty Russell	:	SIOCSIFNAME
68  *              Pekka Riikonen  :	Netdev boot-time settings code
69  *              Andrew Morton   :       Make unregister_netdevice wait
70  *              			indefinitely on dev->refcnt
71  * 		J Hadi Salim	:	- Backlog queue sampling
72  *				        - netif_rx() feedback
73  */
74 
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <net/busy_poll.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/stat.h>
102 #include <net/dst.h>
103 #include <net/dst_metadata.h>
104 #include <net/pkt_sched.h>
105 #include <net/checksum.h>
106 #include <net/xfrm.h>
107 #include <linux/highmem.h>
108 #include <linux/init.h>
109 #include <linux/module.h>
110 #include <linux/netpoll.h>
111 #include <linux/rcupdate.h>
112 #include <linux/delay.h>
113 #include <net/iw_handler.h>
114 #include <asm/current.h>
115 #include <linux/audit.h>
116 #include <linux/dmaengine.h>
117 #include <linux/err.h>
118 #include <linux/ctype.h>
119 #include <linux/if_arp.h>
120 #include <linux/if_vlan.h>
121 #include <linux/ip.h>
122 #include <net/ip.h>
123 #include <net/mpls.h>
124 #include <linux/ipv6.h>
125 #include <linux/in.h>
126 #include <linux/jhash.h>
127 #include <linux/random.h>
128 #include <trace/events/napi.h>
129 #include <trace/events/net.h>
130 #include <trace/events/skb.h>
131 #include <linux/pci.h>
132 #include <linux/inetdevice.h>
133 #include <linux/cpu_rmap.h>
134 #include <linux/static_key.h>
135 #include <linux/hashtable.h>
136 #include <linux/vmalloc.h>
137 #include <linux/if_macvlan.h>
138 #include <linux/errqueue.h>
139 #include <linux/hrtimer.h>
140 #include <linux/netfilter_ingress.h>
141 #include <linux/sctp.h>
142 #include <linux/crash_dump.h>
143 
144 #include "net-sysfs.h"
145 
146 /* Instead of increasing this, you should create a hash table. */
147 #define MAX_GRO_SKBS 8
148 
149 /* This should be increased if a protocol with a bigger head is added. */
150 #define GRO_MAX_HEAD (MAX_HEADER + 128)
151 
152 static DEFINE_SPINLOCK(ptype_lock);
153 static DEFINE_SPINLOCK(offload_lock);
154 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
155 struct list_head ptype_all __read_mostly;	/* Taps */
156 static struct list_head offload_base __read_mostly;
157 
158 static int netif_rx_internal(struct sk_buff *skb);
159 static int call_netdevice_notifiers_info(unsigned long val,
160 					 struct net_device *dev,
161 					 struct netdev_notifier_info *info);
162 
163 /*
164  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
165  * semaphore.
166  *
167  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
168  *
169  * Writers must hold the rtnl semaphore while they loop through the
170  * dev_base_head list, and hold dev_base_lock for writing when they do the
171  * actual updates.  This allows pure readers to access the list even
172  * while a writer is preparing to update it.
173  *
174  * To put it another way, dev_base_lock is held for writing only to
175  * protect against pure readers; the rtnl semaphore provides the
176  * protection against other writers.
177  *
178  * For example usages, see register_netdevice() and
179  * unregister_netdevice(), which must be called with the rtnl
180  * semaphore held.
181  */
182 DEFINE_RWLOCK(dev_base_lock);
183 EXPORT_SYMBOL(dev_base_lock);
184 
185 /* protects napi_hash addition/deletion and napi_gen_id */
186 static DEFINE_SPINLOCK(napi_hash_lock);
187 
188 static unsigned int napi_gen_id = NR_CPUS;
189 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
190 
191 static seqcount_t devnet_rename_seq;
192 
193 static inline void dev_base_seq_inc(struct net *net)
194 {
195 	while (++net->dev_base_seq == 0);
196 }
197 
198 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
199 {
200 	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
201 
202 	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
203 }
204 
205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
206 {
207 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
208 }
209 
210 static inline void rps_lock(struct softnet_data *sd)
211 {
212 #ifdef CONFIG_RPS
213 	spin_lock(&sd->input_pkt_queue.lock);
214 #endif
215 }
216 
217 static inline void rps_unlock(struct softnet_data *sd)
218 {
219 #ifdef CONFIG_RPS
220 	spin_unlock(&sd->input_pkt_queue.lock);
221 #endif
222 }
223 
224 /* Device list insertion */
225 static void list_netdevice(struct net_device *dev)
226 {
227 	struct net *net = dev_net(dev);
228 
229 	ASSERT_RTNL();
230 
231 	write_lock_bh(&dev_base_lock);
232 	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
233 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
234 	hlist_add_head_rcu(&dev->index_hlist,
235 			   dev_index_hash(net, dev->ifindex));
236 	write_unlock_bh(&dev_base_lock);
237 
238 	dev_base_seq_inc(net);
239 }
240 
241 /* Device list removal
242  * caller must respect a RCU grace period before freeing/reusing dev
243  */
244 static void unlist_netdevice(struct net_device *dev)
245 {
246 	ASSERT_RTNL();
247 
248 	/* Unlink dev from the device chain */
249 	write_lock_bh(&dev_base_lock);
250 	list_del_rcu(&dev->dev_list);
251 	hlist_del_rcu(&dev->name_hlist);
252 	hlist_del_rcu(&dev->index_hlist);
253 	write_unlock_bh(&dev_base_lock);
254 
255 	dev_base_seq_inc(dev_net(dev));
256 }
257 
258 /*
259  *	Our notifier list
260  */
261 
262 static RAW_NOTIFIER_HEAD(netdev_chain);
263 
264 /*
265  *	Device drivers call our routines to queue packets here. We empty the
266  *	queue in the local softnet handler.
267  */
268 
269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
270 EXPORT_PER_CPU_SYMBOL(softnet_data);
271 
272 #ifdef CONFIG_LOCKDEP
273 /*
274  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275  * according to dev->type
276  */
277 static const unsigned short netdev_lock_type[] =
278 	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
279 	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
280 	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
281 	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
282 	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
283 	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
284 	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
285 	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
286 	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
287 	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
288 	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
289 	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
290 	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
291 	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
292 	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
293 
294 static const char *const netdev_lock_name[] =
295 	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
296 	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
297 	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
298 	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
299 	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
300 	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
301 	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
302 	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
303 	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
304 	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
305 	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
306 	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
307 	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
308 	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
309 	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
310 
311 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
312 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
313 
314 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
315 {
316 	int i;
317 
318 	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
319 		if (netdev_lock_type[i] == dev_type)
320 			return i;
321 	/* the last key is used by default */
322 	return ARRAY_SIZE(netdev_lock_type) - 1;
323 }
324 
325 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
326 						 unsigned short dev_type)
327 {
328 	int i;
329 
330 	i = netdev_lock_pos(dev_type);
331 	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
332 				   netdev_lock_name[i]);
333 }
334 
335 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
336 {
337 	int i;
338 
339 	i = netdev_lock_pos(dev->type);
340 	lockdep_set_class_and_name(&dev->addr_list_lock,
341 				   &netdev_addr_lock_key[i],
342 				   netdev_lock_name[i]);
343 }
344 #else
345 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
346 						 unsigned short dev_type)
347 {
348 }
349 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
350 {
351 }
352 #endif
353 
354 /*******************************************************************************
355 
356 		Protocol management and registration routines
357 
358 *******************************************************************************/
359 
360 /*
361  *	Add a protocol ID to the list. Now that the input handler is
362  *	smarter we can dispense with all the messy stuff that used to be
363  *	here.
364  *
365  *	BEWARE!!! Protocol handlers that mangle input packets
366  *	MUST BE last in the hash buckets, and checking protocol handlers
367  *	MUST start from the promiscuous ptype_all chain in net_bh.
368  *	This holds today; do not change it.
369  *	Explanation: if a packet-mangling protocol handler were first
370  *	on the list, it could not tell that the packet is cloned and
371  *	should be copied-on-write, so it would modify the clone in place
372  *	and subsequent readers would see a broken packet.
373  *							--ANK (980803)
374  */
375 
376 static inline struct list_head *ptype_head(const struct packet_type *pt)
377 {
378 	if (pt->type == htons(ETH_P_ALL))
379 		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
380 	else
381 		return pt->dev ? &pt->dev->ptype_specific :
382 				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
383 }
384 
385 /**
386  *	dev_add_pack - add packet handler
387  *	@pt: packet type declaration
388  *
389  *	Add a protocol handler to the networking stack. The passed &packet_type
390  *	is linked into kernel lists and may not be freed until it has been
391  *	removed from the kernel lists.
392  *
393  *	This call does not sleep, therefore it cannot guarantee that
394  *	all CPUs that are in the middle of receiving packets will see
395  *	the new packet type (until the next received packet).
396  */
397 
398 void dev_add_pack(struct packet_type *pt)
399 {
400 	struct list_head *head = ptype_head(pt);
401 
402 	spin_lock(&ptype_lock);
403 	list_add_rcu(&pt->list, head);
404 	spin_unlock(&ptype_lock);
405 }
406 EXPORT_SYMBOL(dev_add_pack);
407 
408 /**
409  *	__dev_remove_pack	 - remove packet handler
410  *	@pt: packet type declaration
411  *
412  *	Remove a protocol handler that was previously added to the kernel
413  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
414  *	from the kernel lists and can be freed or reused once this function
415  *	returns.
416  *
417  *	The packet type might still be in use by receivers
418  *	and must not be freed until after all the CPUs have gone
419  *	through a quiescent state.
420  */
421 void __dev_remove_pack(struct packet_type *pt)
422 {
423 	struct list_head *head = ptype_head(pt);
424 	struct packet_type *pt1;
425 
426 	spin_lock(&ptype_lock);
427 
428 	list_for_each_entry(pt1, head, list) {
429 		if (pt == pt1) {
430 			list_del_rcu(&pt->list);
431 			goto out;
432 		}
433 	}
434 
435 	pr_warn("dev_remove_pack: %p not found\n", pt);
436 out:
437 	spin_unlock(&ptype_lock);
438 }
439 EXPORT_SYMBOL(__dev_remove_pack);
440 
441 /**
442  *	dev_remove_pack	 - remove packet handler
443  *	@pt: packet type declaration
444  *
445  *	Remove a protocol handler that was previously added to the kernel
446  *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
447  *	from the kernel lists and can be freed or reused once this function
448  *	returns.
449  *
450  *	This call sleeps to guarantee that no CPU is looking at the packet
451  *	type after return.
452  */
453 void dev_remove_pack(struct packet_type *pt)
454 {
455 	__dev_remove_pack(pt);
456 
457 	synchronize_net();
458 }
459 EXPORT_SYMBOL(dev_remove_pack);
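
/*
 * Illustrative sketch (not part of the original file): a minimal tap built on
 * dev_add_pack()/dev_remove_pack(). The "example_tap*" names are hypothetical;
 * only the packet_type API above is taken from this file.
 */
#if 0
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	/* A real handler would inspect the frame; here we just drop it. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_tap __read_mostly = {
	.type = cpu_to_be16(ETH_P_ALL),	/* tap every protocol */
	.func = example_tap_rcv,
};

static void example_tap_attach(void)
{
	dev_add_pack(&example_tap);	/* does not sleep */
}

static void example_tap_detach(void)
{
	dev_remove_pack(&example_tap);	/* sleeps; waits out RCU readers */
}
#endif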
460 
461 
462 /**
463  *	dev_add_offload - register offload handlers
464  *	@po: protocol offload declaration
465  *
466  *	Add protocol offload handlers to the networking stack. The passed
467  *	&proto_offload is linked into kernel lists and may not be freed until
468  *	it has been removed from the kernel lists.
469  *
470  *	This call does not sleep, therefore it cannot guarantee that
471  *	all CPUs that are in the middle of receiving packets will see
472  *	the new offload handlers (until the next received packet).
473  */
474 void dev_add_offload(struct packet_offload *po)
475 {
476 	struct packet_offload *elem;
477 
478 	spin_lock(&offload_lock);
479 	list_for_each_entry(elem, &offload_base, list) {
480 		if (po->priority < elem->priority)
481 			break;
482 	}
483 	list_add_rcu(&po->list, elem->list.prev);
484 	spin_unlock(&offload_lock);
485 }
486 EXPORT_SYMBOL(dev_add_offload);
487 
488 /**
489  *	__dev_remove_offload	 - remove offload handler
490  *	@po: packet offload declaration
491  *
492  *	Remove a protocol offload handler that was previously added to the
493  *	kernel offload handlers by dev_add_offload(). The passed &offload_type
494  *	is removed from the kernel lists and can be freed or reused once this
495  *	function returns.
496  *
497  *	The packet type might still be in use by receivers
498  *	and must not be freed until after all the CPUs have gone
499  *	through a quiescent state.
500  */
501 static void __dev_remove_offload(struct packet_offload *po)
502 {
503 	struct list_head *head = &offload_base;
504 	struct packet_offload *po1;
505 
506 	spin_lock(&offload_lock);
507 
508 	list_for_each_entry(po1, head, list) {
509 		if (po == po1) {
510 			list_del_rcu(&po->list);
511 			goto out;
512 		}
513 	}
514 
515 	pr_warn("dev_remove_offload: %p not found\n", po);
516 out:
517 	spin_unlock(&offload_lock);
518 }
519 
520 /**
521  *	dev_remove_offload	 - remove packet offload handler
522  *	@po: packet offload declaration
523  *
524  *	Remove a packet offload handler that was previously added to the kernel
525  *	offload handlers by dev_add_offload(). The passed &offload_type is
526  *	removed from the kernel lists and can be freed or reused once this
527  *	function returns.
528  *
529  *	This call sleeps to guarantee that no CPU is looking at the packet
530  *	type after return.
531  */
532 void dev_remove_offload(struct packet_offload *po)
533 {
534 	__dev_remove_offload(po);
535 
536 	synchronize_net();
537 }
538 EXPORT_SYMBOL(dev_remove_offload);
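
/*
 * Illustrative sketch (not part of the original file): registering a protocol
 * offload with dev_add_offload()/dev_remove_offload(). The "example_offload*"
 * names and the no-op gro_complete callback are hypothetical; a real offload
 * would also provide gro_receive/gso_segment callbacks.
 */
#if 0
static int example_gro_complete(struct sk_buff *skb, int nhoff)
{
	return 0;	/* nothing to fix up in this sketch */
}

static struct packet_offload example_offload __read_mostly = {
	.type	   = cpu_to_be16(ETH_P_IP),
	.priority  = 10,	/* inserted in ascending priority order above */
	.callbacks = {
		.gro_complete = example_gro_complete,
	},
};

static void example_offload_attach(void)
{
	dev_add_offload(&example_offload);
}

static void example_offload_detach(void)
{
	dev_remove_offload(&example_offload);	/* sleeps (synchronize_net) */
}
#endif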
539 
540 /******************************************************************************
541 
542 		      Device Boot-time Settings Routines
543 
544 *******************************************************************************/
545 
546 /* Boot time configuration table */
547 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
548 
549 /**
550  *	netdev_boot_setup_add	- add new setup entry
551  *	@name: name of the device
552  *	@map: configured settings for the device
553  *
554  *	Adds a new setup entry to the dev_boot_setup list.  The function
555  *	returns 0 on error and 1 on success.  This is a generic routine for
556  *	all netdevices.
557  */
558 static int netdev_boot_setup_add(char *name, struct ifmap *map)
559 {
560 	struct netdev_boot_setup *s;
561 	int i;
562 
563 	s = dev_boot_setup;
564 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
565 		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
566 			memset(s[i].name, 0, sizeof(s[i].name));
567 			strlcpy(s[i].name, name, IFNAMSIZ);
568 			memcpy(&s[i].map, map, sizeof(s[i].map));
569 			break;
570 		}
571 	}
572 
573 	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
574 }
575 
576 /**
577  *	netdev_boot_setup_check	- check boot time settings
578  *	@dev: the netdevice
579  *
580  * 	Check the boot time settings for the device.
581  *	Any settings found are applied to the device so they can be
582  *	used later during device probing.
583  *	Returns 1 if settings were found, 0 otherwise.
584  */
585 int netdev_boot_setup_check(struct net_device *dev)
586 {
587 	struct netdev_boot_setup *s = dev_boot_setup;
588 	int i;
589 
590 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
591 		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
592 		    !strcmp(dev->name, s[i].name)) {
593 			dev->irq 	= s[i].map.irq;
594 			dev->base_addr 	= s[i].map.base_addr;
595 			dev->mem_start 	= s[i].map.mem_start;
596 			dev->mem_end 	= s[i].map.mem_end;
597 			return 1;
598 		}
599 	}
600 	return 0;
601 }
602 EXPORT_SYMBOL(netdev_boot_setup_check);
603 
604 
605 /**
606  *	netdev_boot_base	- get address from boot time settings
607  *	@prefix: prefix for network device
608  *	@unit: id for network device
609  *
610  * 	Check the boot time settings for the base address of the device.
611  *	Returns the configured base address, 1 if the device is already
612  *	registered (to indicate it should not be probed), or 0 if no
613  *	settings are found.
614  */
615 unsigned long netdev_boot_base(const char *prefix, int unit)
616 {
617 	const struct netdev_boot_setup *s = dev_boot_setup;
618 	char name[IFNAMSIZ];
619 	int i;
620 
621 	sprintf(name, "%s%d", prefix, unit);
622 
623 	/*
624 	 * If device already registered then return base of 1
625 	 * to indicate not to probe for this interface
626 	 */
627 	if (__dev_get_by_name(&init_net, name))
628 		return 1;
629 
630 	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
631 		if (!strcmp(name, s[i].name))
632 			return s[i].map.base_addr;
633 	return 0;
634 }
635 
636 /*
637  * Saves the settings configured at boot time for any netdevice.
638  */
639 int __init netdev_boot_setup(char *str)
640 {
641 	int ints[5];
642 	struct ifmap map;
643 
644 	str = get_options(str, ARRAY_SIZE(ints), ints);
645 	if (!str || !*str)
646 		return 0;
647 
648 	/* Save settings */
649 	memset(&map, 0, sizeof(map));
650 	if (ints[0] > 0)
651 		map.irq = ints[1];
652 	if (ints[0] > 1)
653 		map.base_addr = ints[2];
654 	if (ints[0] > 2)
655 		map.mem_start = ints[3];
656 	if (ints[0] > 3)
657 		map.mem_end = ints[4];
658 
659 	/* Add new entry to the list */
660 	return netdev_boot_setup_add(str, &map);
661 }
662 
663 __setup("netdev=", netdev_boot_setup);
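
/*
 * Illustrative sketch (not part of the original file): how a legacy ISA-style
 * driver might consume "netdev=irq,base_addr,mem_start,mem_end,name" boot
 * parameters parsed above. The probe function name is hypothetical.
 */
#if 0
static int example_legacy_probe(struct net_device *dev)
{
	/* Copy any matching boot-time irq/io/memory settings into @dev. */
	if (netdev_boot_setup_check(dev))
		pr_info("%s: using boot-time settings irq=%d io=%#lx\n",
			dev->name, dev->irq, dev->base_addr);

	/* ... continue probing with dev->irq / dev->base_addr ... */
	return 0;
}
#endif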
664 
665 /*******************************************************************************
666 
667 			    Device Interface Subroutines
668 
669 *******************************************************************************/
670 
671 /**
672  *	dev_get_iflink	- get 'iflink' value of an interface
673  *	@dev: targeted interface
674  *
675  *	Indicates the ifindex the interface is linked to.
676  *	Physical interfaces have the same 'ifindex' and 'iflink' values.
677  */
678 
679 int dev_get_iflink(const struct net_device *dev)
680 {
681 	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
682 		return dev->netdev_ops->ndo_get_iflink(dev);
683 
684 	return dev->ifindex;
685 }
686 EXPORT_SYMBOL(dev_get_iflink);
687 
688 /**
689  *	dev_fill_metadata_dst - Retrieve tunnel egress information.
690  *	@dev: targeted interface
691  *	@skb: The packet.
692  *
693  *	For better visibility of tunnel traffic, OVS needs to retrieve
694  *	the egress tunnel information for a packet. The following API
695  *	allows the caller to get this info.
696  */
697 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
698 {
699 	struct ip_tunnel_info *info;
700 
701 	if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
702 		return -EINVAL;
703 
704 	info = skb_tunnel_info_unclone(skb);
705 	if (!info)
706 		return -ENOMEM;
707 	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
708 		return -EINVAL;
709 
710 	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
711 }
712 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
713 
714 /**
715  *	__dev_get_by_name	- find a device by its name
716  *	@net: the applicable net namespace
717  *	@name: name to find
718  *
719  *	Find an interface by name. Must be called under RTNL semaphore
720  *	or @dev_base_lock. If the name is found a pointer to the device
721  *	is returned. If the name is not found then %NULL is returned. The
722  *	reference counters are not incremented so the caller must be
723  *	careful with locks.
724  */
725 
726 struct net_device *__dev_get_by_name(struct net *net, const char *name)
727 {
728 	struct net_device *dev;
729 	struct hlist_head *head = dev_name_hash(net, name);
730 
731 	hlist_for_each_entry(dev, head, name_hlist)
732 		if (!strncmp(dev->name, name, IFNAMSIZ))
733 			return dev;
734 
735 	return NULL;
736 }
737 EXPORT_SYMBOL(__dev_get_by_name);
738 
739 /**
740  *	dev_get_by_name_rcu	- find a device by its name
741  *	@net: the applicable net namespace
742  *	@name: name to find
743  *
744  *	Find an interface by name.
745  *	If the name is found a pointer to the device is returned.
746  * 	If the name is not found then %NULL is returned.
747  *	The reference counters are not incremented so the caller must be
748  *	careful with locks. The caller must hold RCU lock.
749  */
750 
751 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
752 {
753 	struct net_device *dev;
754 	struct hlist_head *head = dev_name_hash(net, name);
755 
756 	hlist_for_each_entry_rcu(dev, head, name_hlist)
757 		if (!strncmp(dev->name, name, IFNAMSIZ))
758 			return dev;
759 
760 	return NULL;
761 }
762 EXPORT_SYMBOL(dev_get_by_name_rcu);
763 
764 /**
765  *	dev_get_by_name		- find a device by its name
766  *	@net: the applicable net namespace
767  *	@name: name to find
768  *
769  *	Find an interface by name. This can be called from any
770  *	context and does its own locking. The returned handle has
771  *	the usage count incremented and the caller must use dev_put() to
772  *	release it when it is no longer needed. %NULL is returned if no
773  *	matching device is found.
774  */
775 
776 struct net_device *dev_get_by_name(struct net *net, const char *name)
777 {
778 	struct net_device *dev;
779 
780 	rcu_read_lock();
781 	dev = dev_get_by_name_rcu(net, name);
782 	if (dev)
783 		dev_hold(dev);
784 	rcu_read_unlock();
785 	return dev;
786 }
787 EXPORT_SYMBOL(dev_get_by_name);
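
/*
 * Illustrative sketch (not part of the original file) contrasting the two
 * lookup flavours above. The helper names are hypothetical; the locking rules
 * are the ones documented for dev_get_by_name_rcu()/dev_get_by_name().
 */
#if 0
static bool example_ifname_is_up(struct net *net, const char *ifname)
{
	struct net_device *dev;
	bool up = false;

	/* RCU flavour: no reference taken, the pointer is only valid
	 * inside the read-side critical section.
	 */
	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, ifname);
	if (dev)
		up = !!(dev->flags & IFF_UP);
	rcu_read_unlock();

	return up;
}

static void example_ifname_use(struct net *net, const char *ifname)
{
	/* Refcounted flavour: usable from any context, but the reference
	 * must be released with dev_put().
	 */
	struct net_device *dev = dev_get_by_name(net, ifname);

	if (!dev)
		return;
	/* ... use dev ... */
	dev_put(dev);
}
#endif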
788 
789 /**
790  *	__dev_get_by_index - find a device by its ifindex
791  *	@net: the applicable net namespace
792  *	@ifindex: index of device
793  *
794  *	Search for an interface by index. Returns a pointer to the device,
795  *	or %NULL if the device is not found. The device has not
796  *	had its reference counter increased so the caller must be careful
797  *	about locking. The caller must hold either the RTNL semaphore
798  *	or @dev_base_lock.
799  */
800 
801 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
802 {
803 	struct net_device *dev;
804 	struct hlist_head *head = dev_index_hash(net, ifindex);
805 
806 	hlist_for_each_entry(dev, head, index_hlist)
807 		if (dev->ifindex == ifindex)
808 			return dev;
809 
810 	return NULL;
811 }
812 EXPORT_SYMBOL(__dev_get_by_index);
813 
814 /**
815  *	dev_get_by_index_rcu - find a device by its ifindex
816  *	@net: the applicable net namespace
817  *	@ifindex: index of device
818  *
819  *	Search for an interface by index. Returns a pointer to the device,
820  *	or %NULL if the device is not found. The device has not
821  *	had its reference counter increased so the caller must be careful
822  *	about locking. The caller must hold RCU lock.
823  */
824 
825 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
826 {
827 	struct net_device *dev;
828 	struct hlist_head *head = dev_index_hash(net, ifindex);
829 
830 	hlist_for_each_entry_rcu(dev, head, index_hlist)
831 		if (dev->ifindex == ifindex)
832 			return dev;
833 
834 	return NULL;
835 }
836 EXPORT_SYMBOL(dev_get_by_index_rcu);
837 
838 
839 /**
840  *	dev_get_by_index - find a device by its ifindex
841  *	@net: the applicable net namespace
842  *	@ifindex: index of device
843  *
844  *	Search for an interface by index. Returns a pointer to the device,
845  *	or %NULL if the device is not found. The device returned has
846  *	had a reference added and the pointer is safe until the user calls
847  *	dev_put to indicate they have finished with it.
848  */
849 
850 struct net_device *dev_get_by_index(struct net *net, int ifindex)
851 {
852 	struct net_device *dev;
853 
854 	rcu_read_lock();
855 	dev = dev_get_by_index_rcu(net, ifindex);
856 	if (dev)
857 		dev_hold(dev);
858 	rcu_read_unlock();
859 	return dev;
860 }
861 EXPORT_SYMBOL(dev_get_by_index);
862 
863 /**
864  *	netdev_get_name - get a netdevice name, knowing its ifindex.
865  *	@net: network namespace
866  *	@name: a pointer to the buffer where the name will be stored.
867  *	@ifindex: the ifindex of the interface to get the name from.
868  *
869  *	The use of raw_seqcount_begin() and cond_resched() before
870  *	retrying is required as we want to give the writers a chance
871  *	to complete when CONFIG_PREEMPT is not set.
872  */
873 int netdev_get_name(struct net *net, char *name, int ifindex)
874 {
875 	struct net_device *dev;
876 	unsigned int seq;
877 
878 retry:
879 	seq = raw_seqcount_begin(&devnet_rename_seq);
880 	rcu_read_lock();
881 	dev = dev_get_by_index_rcu(net, ifindex);
882 	if (!dev) {
883 		rcu_read_unlock();
884 		return -ENODEV;
885 	}
886 
887 	strcpy(name, dev->name);
888 	rcu_read_unlock();
889 	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
890 		cond_resched();
891 		goto retry;
892 	}
893 
894 	return 0;
895 }
896 
897 /**
898  *	dev_getbyhwaddr_rcu - find a device by its hardware address
899  *	@net: the applicable net namespace
900  *	@type: media type of device
901  *	@ha: hardware address
902  *
903  *	Search for an interface by MAC address. Returns a pointer to the
904  *	device, or %NULL if the device is not found.
905  *	The caller must hold RCU or RTNL.
906  *	The returned device has not had its ref count increased
907  *	and the caller must therefore be careful about locking.
908  *
909  */
910 
911 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
912 				       const char *ha)
913 {
914 	struct net_device *dev;
915 
916 	for_each_netdev_rcu(net, dev)
917 		if (dev->type == type &&
918 		    !memcmp(dev->dev_addr, ha, dev->addr_len))
919 			return dev;
920 
921 	return NULL;
922 }
923 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
924 
925 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
926 {
927 	struct net_device *dev;
928 
929 	ASSERT_RTNL();
930 	for_each_netdev(net, dev)
931 		if (dev->type == type)
932 			return dev;
933 
934 	return NULL;
935 }
936 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
937 
938 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
939 {
940 	struct net_device *dev, *ret = NULL;
941 
942 	rcu_read_lock();
943 	for_each_netdev_rcu(net, dev)
944 		if (dev->type == type) {
945 			dev_hold(dev);
946 			ret = dev;
947 			break;
948 		}
949 	rcu_read_unlock();
950 	return ret;
951 }
952 EXPORT_SYMBOL(dev_getfirstbyhwtype);
953 
954 /**
955  *	__dev_get_by_flags - find any device with given flags
956  *	@net: the applicable net namespace
957  *	@if_flags: IFF_* values
958  *	@mask: bitmask of bits in if_flags to check
959  *
960  *	Search for any interface with the given flags. Returns a pointer to
961  *	the first matching device, or %NULL if none is found. Must be called
962  *	inside rtnl_lock(), and the result's refcount is unchanged.
963  */
964 
965 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
966 				      unsigned short mask)
967 {
968 	struct net_device *dev, *ret;
969 
970 	ASSERT_RTNL();
971 
972 	ret = NULL;
973 	for_each_netdev(net, dev) {
974 		if (((dev->flags ^ if_flags) & mask) == 0) {
975 			ret = dev;
976 			break;
977 		}
978 	}
979 	return ret;
980 }
981 EXPORT_SYMBOL(__dev_get_by_flags);
982 
983 /**
984  *	dev_valid_name - check if name is okay for network device
985  *	@name: name string
986  *
987  *	Network device names need to be valid file names
988  *	to allow sysfs to work.  We also disallow any kind of
989  *	whitespace.
990  */
991 bool dev_valid_name(const char *name)
992 {
993 	if (*name == '\0')
994 		return false;
995 	if (strlen(name) >= IFNAMSIZ)
996 		return false;
997 	if (!strcmp(name, ".") || !strcmp(name, ".."))
998 		return false;
999 
1000 	while (*name) {
1001 		if (*name == '/' || *name == ':' || isspace(*name))
1002 			return false;
1003 		name++;
1004 	}
1005 	return true;
1006 }
1007 EXPORT_SYMBOL(dev_valid_name);
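
/*
 * Illustrative sketch (not part of the original file): names accepted and
 * rejected by dev_valid_name(). The function name is hypothetical.
 */
#if 0
static void example_check_names(void)
{
	WARN_ON(!dev_valid_name("eth0"));	/* plain names are fine */
	WARN_ON(dev_valid_name(""));		/* empty name rejected */
	WARN_ON(dev_valid_name("a/b"));		/* '/' not allowed */
	WARN_ON(dev_valid_name("a b"));		/* whitespace not allowed */
	WARN_ON(dev_valid_name(".."));		/* "." and ".." reserved */
}
#endif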
1008 
1009 /**
1010  *	__dev_alloc_name - allocate a name for a device
1011  *	@net: network namespace to allocate the device name in
1012  *	@name: name format string
1013  *	@buf:  scratch buffer and result name string
1014  *
1015  *	Passed a format string - e.g. "lt%d" - it will try to find a suitable
1016  *	id. It scans the list of devices to build up a free map, then chooses
1017  *	the first empty slot. The caller must hold the dev_base or rtnl lock
1018  *	while allocating the name and adding the device in order to avoid
1019  *	duplicates.
1020  *	Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
1021  *	Returns the number of the unit assigned or a negative errno code.
1022  */
1023 
1024 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1025 {
1026 	int i = 0;
1027 	const char *p;
1028 	const int max_netdevices = 8*PAGE_SIZE;
1029 	unsigned long *inuse;
1030 	struct net_device *d;
1031 
1032 	p = strnchr(name, IFNAMSIZ-1, '%');
1033 	if (p) {
1034 		/*
1035 		 * Verify the string as this thing may have come from
1036 		 * the user.  There must be exactly one "%d" and no other "%"
1037 		 * characters.
1038 		 */
1039 		if (p[1] != 'd' || strchr(p + 2, '%'))
1040 			return -EINVAL;
1041 
1042 		/* Use one page as a bit array of possible slots */
1043 		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1044 		if (!inuse)
1045 			return -ENOMEM;
1046 
1047 		for_each_netdev(net, d) {
1048 			if (!sscanf(d->name, name, &i))
1049 				continue;
1050 			if (i < 0 || i >= max_netdevices)
1051 				continue;
1052 
1053 			/*  avoid cases where sscanf is not exact inverse of printf */
1054 			snprintf(buf, IFNAMSIZ, name, i);
1055 			if (!strncmp(buf, d->name, IFNAMSIZ))
1056 				set_bit(i, inuse);
1057 		}
1058 
1059 		i = find_first_zero_bit(inuse, max_netdevices);
1060 		free_page((unsigned long) inuse);
1061 	}
1062 
1063 	if (buf != name)
1064 		snprintf(buf, IFNAMSIZ, name, i);
1065 	if (!__dev_get_by_name(net, buf))
1066 		return i;
1067 
1068 	/* It is possible to run out of possible slots
1069 	 * when the name is long and there isn't enough space left
1070 	 * for the digits, or if all bits are used.
1071 	 */
1072 	return -ENFILE;
1073 }
1074 
1075 /**
1076  *	dev_alloc_name - allocate a name for a device
1077  *	@dev: device
1078  *	@name: name format string
1079  *
1080  *	Passed a format string - e.g. "lt%d" - it will try to find a suitable
1081  *	id. It scans the list of devices to build up a free map, then chooses
1082  *	the first empty slot. The caller must hold the dev_base or rtnl lock
1083  *	while allocating the name and adding the device in order to avoid
1084  *	duplicates.
1085  *	Limited to bits_per_byte * page size devices (i.e. 32K on most platforms).
1086  *	Returns the number of the unit assigned or a negative errno code.
1087  */
1088 
1089 int dev_alloc_name(struct net_device *dev, const char *name)
1090 {
1091 	char buf[IFNAMSIZ];
1092 	struct net *net;
1093 	int ret;
1094 
1095 	BUG_ON(!dev_net(dev));
1096 	net = dev_net(dev);
1097 	ret = __dev_alloc_name(net, name, buf);
1098 	if (ret >= 0)
1099 		strlcpy(dev->name, buf, IFNAMSIZ);
1100 	return ret;
1101 }
1102 EXPORT_SYMBOL(dev_alloc_name);
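
/*
 * Illustrative sketch (not part of the original file): letting the core pick
 * a unit number from a format string. "mytap%d" and the function name are
 * hypothetical.
 */
#if 0
static int example_pick_name(struct net_device *dev)
{
	int unit;

	ASSERT_RTNL();	/* hold rtnl so the chosen name cannot race */

	unit = dev_alloc_name(dev, "mytap%d");
	if (unit < 0)
		return unit;	/* -EINVAL or -ENFILE */

	pr_info("assigned %s (unit %d)\n", dev->name, unit);
	return 0;
}
#endif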
1103 
1104 static int dev_alloc_name_ns(struct net *net,
1105 			     struct net_device *dev,
1106 			     const char *name)
1107 {
1108 	char buf[IFNAMSIZ];
1109 	int ret;
1110 
1111 	ret = __dev_alloc_name(net, name, buf);
1112 	if (ret >= 0)
1113 		strlcpy(dev->name, buf, IFNAMSIZ);
1114 	return ret;
1115 }
1116 
1117 static int dev_get_valid_name(struct net *net,
1118 			      struct net_device *dev,
1119 			      const char *name)
1120 {
1121 	BUG_ON(!net);
1122 
1123 	if (!dev_valid_name(name))
1124 		return -EINVAL;
1125 
1126 	if (strchr(name, '%'))
1127 		return dev_alloc_name_ns(net, dev, name);
1128 	else if (__dev_get_by_name(net, name))
1129 		return -EEXIST;
1130 	else if (dev->name != name)
1131 		strlcpy(dev->name, name, IFNAMSIZ);
1132 
1133 	return 0;
1134 }
1135 
1136 /**
1137  *	dev_change_name - change name of a device
1138  *	@dev: device
1139  *	@newname: name (or format string) must be at least IFNAMSIZ
1140  *
1141  *	Change the name of a device. A format string such as "eth%d"
1142  *	can be passed for wildcarding.
1143  */
1144 int dev_change_name(struct net_device *dev, const char *newname)
1145 {
1146 	unsigned char old_assign_type;
1147 	char oldname[IFNAMSIZ];
1148 	int err = 0;
1149 	int ret;
1150 	struct net *net;
1151 
1152 	ASSERT_RTNL();
1153 	BUG_ON(!dev_net(dev));
1154 
1155 	net = dev_net(dev);
1156 	if (dev->flags & IFF_UP)
1157 		return -EBUSY;
1158 
1159 	write_seqcount_begin(&devnet_rename_seq);
1160 
1161 	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1162 		write_seqcount_end(&devnet_rename_seq);
1163 		return 0;
1164 	}
1165 
1166 	memcpy(oldname, dev->name, IFNAMSIZ);
1167 
1168 	err = dev_get_valid_name(net, dev, newname);
1169 	if (err < 0) {
1170 		write_seqcount_end(&devnet_rename_seq);
1171 		return err;
1172 	}
1173 
1174 	if (oldname[0] && !strchr(oldname, '%'))
1175 		netdev_info(dev, "renamed from %s\n", oldname);
1176 
1177 	old_assign_type = dev->name_assign_type;
1178 	dev->name_assign_type = NET_NAME_RENAMED;
1179 
1180 rollback:
1181 	ret = device_rename(&dev->dev, dev->name);
1182 	if (ret) {
1183 		memcpy(dev->name, oldname, IFNAMSIZ);
1184 		dev->name_assign_type = old_assign_type;
1185 		write_seqcount_end(&devnet_rename_seq);
1186 		return ret;
1187 	}
1188 
1189 	write_seqcount_end(&devnet_rename_seq);
1190 
1191 	netdev_adjacent_rename_links(dev, oldname);
1192 
1193 	write_lock_bh(&dev_base_lock);
1194 	hlist_del_rcu(&dev->name_hlist);
1195 	write_unlock_bh(&dev_base_lock);
1196 
1197 	synchronize_rcu();
1198 
1199 	write_lock_bh(&dev_base_lock);
1200 	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1201 	write_unlock_bh(&dev_base_lock);
1202 
1203 	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1204 	ret = notifier_to_errno(ret);
1205 
1206 	if (ret) {
1207 		/* err >= 0 after dev_alloc_name() or stores the first errno */
1208 		if (err >= 0) {
1209 			err = ret;
1210 			write_seqcount_begin(&devnet_rename_seq);
1211 			memcpy(dev->name, oldname, IFNAMSIZ);
1212 			memcpy(oldname, newname, IFNAMSIZ);
1213 			dev->name_assign_type = old_assign_type;
1214 			old_assign_type = NET_NAME_RENAMED;
1215 			goto rollback;
1216 		} else {
1217 			pr_err("%s: name change rollback failed: %d\n",
1218 			       dev->name, ret);
1219 		}
1220 	}
1221 
1222 	return err;
1223 }
1224 
1225 /**
1226  *	dev_set_alias - change ifalias of a device
1227  *	@dev: device
1228  *	@alias: name up to IFALIASZ
1229  *	@len: limit of bytes to copy from @alias
1230  *
1231  *	Set ifalias for a device.
1232  */
1233 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1234 {
1235 	char *new_ifalias;
1236 
1237 	ASSERT_RTNL();
1238 
1239 	if (len >= IFALIASZ)
1240 		return -EINVAL;
1241 
1242 	if (!len) {
1243 		kfree(dev->ifalias);
1244 		dev->ifalias = NULL;
1245 		return 0;
1246 	}
1247 
1248 	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1249 	if (!new_ifalias)
1250 		return -ENOMEM;
1251 	dev->ifalias = new_ifalias;
1252 
1253 	strlcpy(dev->ifalias, alias, len+1);
1254 	return len;
1255 }
1256 
1257 
1258 /**
1259  *	netdev_features_change - device changes features
1260  *	@dev: device to cause notification
1261  *
1262  *	Called to indicate a device has changed features.
1263  */
1264 void netdev_features_change(struct net_device *dev)
1265 {
1266 	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1267 }
1268 EXPORT_SYMBOL(netdev_features_change);
1269 
1270 /**
1271  *	netdev_state_change - device changes state
1272  *	@dev: device to cause notification
1273  *
1274  *	Called to indicate a device has changed state. This function calls
1275  *	the notifier chains for netdev_chain and sends a NEWLINK message
1276  *	to the routing socket.
1277  */
1278 void netdev_state_change(struct net_device *dev)
1279 {
1280 	if (dev->flags & IFF_UP) {
1281 		struct netdev_notifier_change_info change_info;
1282 
1283 		change_info.flags_changed = 0;
1284 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1285 					      &change_info.info);
1286 		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1287 	}
1288 }
1289 EXPORT_SYMBOL(netdev_state_change);
1290 
1291 /**
1292  * 	netdev_notify_peers - notify network peers about existence of @dev
1293  * 	@dev: network device
1294  *
1295  * Generate traffic such that interested network peers are aware of
1296  * @dev, such as by generating a gratuitous ARP. This may be used when
1297  * a device wants to inform the rest of the network about some sort of
1298  * reconfiguration such as a failover event or virtual machine
1299  * migration.
1300  */
1301 void netdev_notify_peers(struct net_device *dev)
1302 {
1303 	rtnl_lock();
1304 	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1305 	rtnl_unlock();
1306 }
1307 EXPORT_SYMBOL(netdev_notify_peers);
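
/*
 * Illustrative sketch (not part of the original file): a hypothetical virtual
 * driver reacting to a failover. netdev_state_change() expects the caller to
 * hold RTNL, while netdev_notify_peers() takes RTNL itself.
 */
#if 0
static void example_after_failover(struct net_device *dev)
{
	rtnl_lock();
	netdev_state_change(dev);	/* notifiers + RTM_NEWLINK if IFF_UP */
	rtnl_unlock();

	netdev_notify_peers(dev);	/* e.g. triggers gratuitous ARP */
}
#endif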
1308 
1309 static int __dev_open(struct net_device *dev)
1310 {
1311 	const struct net_device_ops *ops = dev->netdev_ops;
1312 	int ret;
1313 
1314 	ASSERT_RTNL();
1315 
1316 	if (!netif_device_present(dev))
1317 		return -ENODEV;
1318 
1319 	/* Block netpoll from trying to do any rx path servicing.
1320 	 * If we don't do this there is a chance ndo_poll_controller
1321 	 * or ndo_poll may be running while we open the device.
1322 	 */
1323 	netpoll_poll_disable(dev);
1324 
1325 	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1326 	ret = notifier_to_errno(ret);
1327 	if (ret)
1328 		return ret;
1329 
1330 	set_bit(__LINK_STATE_START, &dev->state);
1331 
1332 	if (ops->ndo_validate_addr)
1333 		ret = ops->ndo_validate_addr(dev);
1334 
1335 	if (!ret && ops->ndo_open)
1336 		ret = ops->ndo_open(dev);
1337 
1338 	netpoll_poll_enable(dev);
1339 
1340 	if (ret)
1341 		clear_bit(__LINK_STATE_START, &dev->state);
1342 	else {
1343 		dev->flags |= IFF_UP;
1344 		dev_set_rx_mode(dev);
1345 		dev_activate(dev);
1346 		add_device_randomness(dev->dev_addr, dev->addr_len);
1347 	}
1348 
1349 	return ret;
1350 }
1351 
1352 /**
1353  *	dev_open	- prepare an interface for use.
1354  *	@dev:	device to open
1355  *
1356  *	Takes a device from down to up state. The device's private open
1357  *	function is invoked and then the multicast lists are loaded. Finally
1358  *	the device is moved into the up state and a %NETDEV_UP message is
1359  *	sent to the netdev notifier chain.
1360  *
1361  *	Calling this function on an active interface is a nop. On a failure
1362  *	a negative errno code is returned.
1363  */
1364 int dev_open(struct net_device *dev)
1365 {
1366 	int ret;
1367 
1368 	if (dev->flags & IFF_UP)
1369 		return 0;
1370 
1371 	ret = __dev_open(dev);
1372 	if (ret < 0)
1373 		return ret;
1374 
1375 	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376 	call_netdevice_notifiers(NETDEV_UP, dev);
1377 
1378 	return ret;
1379 }
1380 EXPORT_SYMBOL(dev_open);
1381 
1382 static int __dev_close_many(struct list_head *head)
1383 {
1384 	struct net_device *dev;
1385 
1386 	ASSERT_RTNL();
1387 	might_sleep();
1388 
1389 	list_for_each_entry(dev, head, close_list) {
1390 		/* Temporarily disable netpoll until the interface is down */
1391 		netpoll_poll_disable(dev);
1392 
1393 		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1394 
1395 		clear_bit(__LINK_STATE_START, &dev->state);
1396 
1397 		/* Synchronize to the scheduled poll. We cannot touch the poll list,
1398 		 * it can even be on a different cpu. So just clear netif_running().
1399 		 *
1400 		 * dev->stop() will invoke napi_disable() on all of its
1401 		 * napi_struct instances on this device.
1402 		 */
1403 		smp_mb__after_atomic(); /* Commit netif_running(). */
1404 	}
1405 
1406 	dev_deactivate_many(head);
1407 
1408 	list_for_each_entry(dev, head, close_list) {
1409 		const struct net_device_ops *ops = dev->netdev_ops;
1410 
1411 		/*
1412 		 *	Call the device-specific close. This cannot fail and is
1413 		 *	only done if the device is UP.
1414 		 *
1415 		 *	We allow it to be called even after a DETACH hot-plug
1416 		 *	event.
1417 		 */
1418 		if (ops->ndo_stop)
1419 			ops->ndo_stop(dev);
1420 
1421 		dev->flags &= ~IFF_UP;
1422 		netpoll_poll_enable(dev);
1423 	}
1424 
1425 	return 0;
1426 }
1427 
1428 static int __dev_close(struct net_device *dev)
1429 {
1430 	int retval;
1431 	LIST_HEAD(single);
1432 
1433 	list_add(&dev->close_list, &single);
1434 	retval = __dev_close_many(&single);
1435 	list_del(&single);
1436 
1437 	return retval;
1438 }
1439 
1440 int dev_close_many(struct list_head *head, bool unlink)
1441 {
1442 	struct net_device *dev, *tmp;
1443 
1444 	/* Remove the devices that don't need to be closed */
1445 	list_for_each_entry_safe(dev, tmp, head, close_list)
1446 		if (!(dev->flags & IFF_UP))
1447 			list_del_init(&dev->close_list);
1448 
1449 	__dev_close_many(head);
1450 
1451 	list_for_each_entry_safe(dev, tmp, head, close_list) {
1452 		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1453 		call_netdevice_notifiers(NETDEV_DOWN, dev);
1454 		if (unlink)
1455 			list_del_init(&dev->close_list);
1456 	}
1457 
1458 	return 0;
1459 }
1460 EXPORT_SYMBOL(dev_close_many);
1461 
1462 /**
1463  *	dev_close - shutdown an interface.
1464  *	@dev: device to shutdown
1465  *
1466  *	This function moves an active device into down state. A
1467  *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1468  *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1469  *	chain.
1470  */
1471 int dev_close(struct net_device *dev)
1472 {
1473 	if (dev->flags & IFF_UP) {
1474 		LIST_HEAD(single);
1475 
1476 		list_add(&dev->close_list, &single);
1477 		dev_close_many(&single, true);
1478 		list_del(&single);
1479 	}
1480 	return 0;
1481 }
1482 EXPORT_SYMBOL(dev_close);
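
/*
 * Illustrative sketch (not part of the original file): bringing an interface
 * up and down from kernel code. Both calls above assume the caller holds the
 * RTNL semaphore; the function name is hypothetical.
 */
#if 0
static int example_cycle_interface(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_open(dev);		/* nop if already IFF_UP */
	if (!err) {
		/* ... device is up here ... */
		dev_close(dev);		/* NETDEV_GOING_DOWN then NETDEV_DOWN */
	}
	rtnl_unlock();

	return err;
}
#endif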
1483 
1484 
1485 /**
1486  *	dev_disable_lro - disable Large Receive Offload on a device
1487  *	@dev: device
1488  *
1489  *	Disable Large Receive Offload (LRO) on a net device.  Must be
1490  *	called under RTNL.  This is needed if received packets may be
1491  *	forwarded to another interface.
1492  */
1493 void dev_disable_lro(struct net_device *dev)
1494 {
1495 	struct net_device *lower_dev;
1496 	struct list_head *iter;
1497 
1498 	dev->wanted_features &= ~NETIF_F_LRO;
1499 	netdev_update_features(dev);
1500 
1501 	if (unlikely(dev->features & NETIF_F_LRO))
1502 		netdev_WARN(dev, "failed to disable LRO!\n");
1503 
1504 	netdev_for_each_lower_dev(dev, lower_dev, iter)
1505 		dev_disable_lro(lower_dev);
1506 }
1507 EXPORT_SYMBOL(dev_disable_lro);
1508 
1509 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1510 				   struct net_device *dev)
1511 {
1512 	struct netdev_notifier_info info;
1513 
1514 	netdev_notifier_info_init(&info, dev);
1515 	return nb->notifier_call(nb, val, &info);
1516 }
1517 
1518 static int dev_boot_phase = 1;
1519 
1520 /**
1521  *	register_netdevice_notifier - register a network notifier block
1522  *	@nb: notifier
1523  *
1524  *	Register a notifier to be called when network device events occur.
1525  *	The notifier passed is linked into the kernel structures and must
1526  *	not be reused until it has been unregistered. A negative errno code
1527  *	is returned on a failure.
1528  *
1529  * 	When registered, all registration and up events are replayed
1530  *	to the new notifier so that it gets a race-free view of the
1531  *	network device list.
1532  */
1533 
1534 int register_netdevice_notifier(struct notifier_block *nb)
1535 {
1536 	struct net_device *dev;
1537 	struct net_device *last;
1538 	struct net *net;
1539 	int err;
1540 
1541 	rtnl_lock();
1542 	err = raw_notifier_chain_register(&netdev_chain, nb);
1543 	if (err)
1544 		goto unlock;
1545 	if (dev_boot_phase)
1546 		goto unlock;
1547 	for_each_net(net) {
1548 		for_each_netdev(net, dev) {
1549 			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1550 			err = notifier_to_errno(err);
1551 			if (err)
1552 				goto rollback;
1553 
1554 			if (!(dev->flags & IFF_UP))
1555 				continue;
1556 
1557 			call_netdevice_notifier(nb, NETDEV_UP, dev);
1558 		}
1559 	}
1560 
1561 unlock:
1562 	rtnl_unlock();
1563 	return err;
1564 
1565 rollback:
1566 	last = dev;
1567 	for_each_net(net) {
1568 		for_each_netdev(net, dev) {
1569 			if (dev == last)
1570 				goto outroll;
1571 
1572 			if (dev->flags & IFF_UP) {
1573 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1574 							dev);
1575 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1576 			}
1577 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1578 		}
1579 	}
1580 
1581 outroll:
1582 	raw_notifier_chain_unregister(&netdev_chain, nb);
1583 	goto unlock;
1584 }
1585 EXPORT_SYMBOL(register_netdevice_notifier);
1586 
1587 /**
1588  *	unregister_netdevice_notifier - unregister a network notifier block
1589  *	@nb: notifier
1590  *
1591  *	Unregister a notifier previously registered by
1592  *	register_netdevice_notifier(). The notifier is unlinked from the
1593  *	kernel structures and may then be reused. A negative errno code
1594  *	is returned on a failure.
1595  *
1596  * 	After unregistering, unregister and down device events are synthesized
1597  *	for all devices on the device list to the removed notifier to remove
1598  *	the need for special case cleanup code.
1599  */
1600 
1601 int unregister_netdevice_notifier(struct notifier_block *nb)
1602 {
1603 	struct net_device *dev;
1604 	struct net *net;
1605 	int err;
1606 
1607 	rtnl_lock();
1608 	err = raw_notifier_chain_unregister(&netdev_chain, nb);
1609 	if (err)
1610 		goto unlock;
1611 
1612 	for_each_net(net) {
1613 		for_each_netdev(net, dev) {
1614 			if (dev->flags & IFF_UP) {
1615 				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1616 							dev);
1617 				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1618 			}
1619 			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1620 		}
1621 	}
1622 unlock:
1623 	rtnl_unlock();
1624 	return err;
1625 }
1626 EXPORT_SYMBOL(unregister_netdevice_notifier);
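
/*
 * Illustrative sketch (not part of the original file): a minimal netdevice
 * notifier using the registration calls above. The "example_netdev_*" names
 * are hypothetical.
 */
#if 0
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_UP:
		pr_info("%s: is up\n", dev->name);
		break;
	case NETDEV_GOING_DOWN:
		pr_info("%s: going down\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
	.notifier_call = example_netdev_event,
};

/* register_netdevice_notifier(&example_netdev_nb) at module init,
 * unregister_netdevice_notifier(&example_netdev_nb) at module exit.
 */
#endif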
1627 
1628 /**
1629  *	call_netdevice_notifiers_info - call all network notifier blocks
1630  *	@val: value passed unmodified to notifier function
1631  *	@dev: net_device pointer passed unmodified to notifier function
1632  *	@info: notifier information data
1633  *
1634  *	Call all network notifier blocks.  Parameters and return value
1635  *	are as for raw_notifier_call_chain().
1636  */
1637 
1638 static int call_netdevice_notifiers_info(unsigned long val,
1639 					 struct net_device *dev,
1640 					 struct netdev_notifier_info *info)
1641 {
1642 	ASSERT_RTNL();
1643 	netdev_notifier_info_init(info, dev);
1644 	return raw_notifier_call_chain(&netdev_chain, val, info);
1645 }
1646 
1647 /**
1648  *	call_netdevice_notifiers - call all network notifier blocks
1649  *      @val: value passed unmodified to notifier function
1650  *      @dev: net_device pointer passed unmodified to notifier function
1651  *
1652  *	Call all network notifier blocks.  Parameters and return value
1653  *	are as for raw_notifier_call_chain().
1654  */
1655 
1656 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1657 {
1658 	struct netdev_notifier_info info;
1659 
1660 	return call_netdevice_notifiers_info(val, dev, &info);
1661 }
1662 EXPORT_SYMBOL(call_netdevice_notifiers);
1663 
1664 #ifdef CONFIG_NET_INGRESS
1665 static struct static_key ingress_needed __read_mostly;
1666 
1667 void net_inc_ingress_queue(void)
1668 {
1669 	static_key_slow_inc(&ingress_needed);
1670 }
1671 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1672 
1673 void net_dec_ingress_queue(void)
1674 {
1675 	static_key_slow_dec(&ingress_needed);
1676 }
1677 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1678 #endif
1679 
1680 #ifdef CONFIG_NET_EGRESS
1681 static struct static_key egress_needed __read_mostly;
1682 
1683 void net_inc_egress_queue(void)
1684 {
1685 	static_key_slow_inc(&egress_needed);
1686 }
1687 EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1688 
1689 void net_dec_egress_queue(void)
1690 {
1691 	static_key_slow_dec(&egress_needed);
1692 }
1693 EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1694 #endif
1695 
1696 static struct static_key netstamp_needed __read_mostly;
1697 #ifdef HAVE_JUMP_LABEL
1698 /* We are not allowed to call static_key_slow_dec() from irq context.
1699  * If net_disable_timestamp() is called from irq context, defer the
1700  * static_key_slow_dec() calls.
1701  */
1702 static atomic_t netstamp_needed_deferred;
1703 #endif
1704 
1705 void net_enable_timestamp(void)
1706 {
1707 #ifdef HAVE_JUMP_LABEL
1708 	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1709 
1710 	if (deferred) {
1711 		while (--deferred)
1712 			static_key_slow_dec(&netstamp_needed);
1713 		return;
1714 	}
1715 #endif
1716 	static_key_slow_inc(&netstamp_needed);
1717 }
1718 EXPORT_SYMBOL(net_enable_timestamp);
1719 
1720 void net_disable_timestamp(void)
1721 {
1722 #ifdef HAVE_JUMP_LABEL
1723 	if (in_interrupt()) {
1724 		atomic_inc(&netstamp_needed_deferred);
1725 		return;
1726 	}
1727 #endif
1728 	static_key_slow_dec(&netstamp_needed);
1729 }
1730 EXPORT_SYMBOL(net_disable_timestamp);
1731 
1732 static inline void net_timestamp_set(struct sk_buff *skb)
1733 {
1734 	skb->tstamp.tv64 = 0;
1735 	if (static_key_false(&netstamp_needed))
1736 		__net_timestamp(skb);
1737 }
1738 
1739 #define net_timestamp_check(COND, SKB)			\
1740 	if (static_key_false(&netstamp_needed)) {		\
1741 		if ((COND) && !(SKB)->tstamp.tv64)	\
1742 			__net_timestamp(SKB);		\
1743 	}						\
1744 
1745 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1746 {
1747 	unsigned int len;
1748 
1749 	if (!(dev->flags & IFF_UP))
1750 		return false;
1751 
1752 	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1753 	if (skb->len <= len)
1754 		return true;
1755 
1756 	/* if the packet is GSO, we don't care about the length as it
1757 	 * could be forwarded without being segmented first
1758 	 */
1759 	if (skb_is_gso(skb))
1760 		return true;
1761 
1762 	return false;
1763 }
1764 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1765 
1766 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1767 {
1768 	if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1769 	    unlikely(!is_skb_forwardable(dev, skb))) {
1770 		atomic_long_inc(&dev->rx_dropped);
1771 		kfree_skb(skb);
1772 		return NET_RX_DROP;
1773 	}
1774 
1775 	skb_scrub_packet(skb, true);
1776 	skb->priority = 0;
1777 	skb->protocol = eth_type_trans(skb, dev);
1778 	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1779 
1780 	return 0;
1781 }
1782 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1783 
1784 /**
1785  * dev_forward_skb - loopback an skb to another netif
1786  *
1787  * @dev: destination network device
1788  * @skb: buffer to forward
1789  *
1790  * return values:
1791  *	NET_RX_SUCCESS	(no congestion)
1792  *	NET_RX_DROP     (packet was dropped, but freed)
1793  *
1794  * dev_forward_skb can be used for injecting an skb from the
1795  * start_xmit function of one device into the receive queue
1796  * of another device.
1797  *
1798  * The receiving device may be in another namespace, so
1799  * we have to clear all information in the skb that could
1800  * impact namespace isolation.
1801  */
1802 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1803 {
1804 	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1805 }
1806 EXPORT_SYMBOL_GPL(dev_forward_skb);
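
/*
 * Illustrative sketch (not part of the original file): forwarding a frame
 * into a peer device's receive path, in the spirit of veth. The function name
 * is hypothetical and @peer is assumed to be looked up by the caller.
 */
#if 0
static void example_loop_to_peer(struct net_device *dev,
				 struct net_device *peer,
				 struct sk_buff *skb)
{
	/* dev_forward_skb() always consumes the skb: it is either queued
	 * to @peer's receive path or dropped and freed.
	 */
	if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS)
		dev->stats.tx_packets++;
	else
		dev->stats.tx_dropped++;
}
#endif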
1807 
1808 static inline int deliver_skb(struct sk_buff *skb,
1809 			      struct packet_type *pt_prev,
1810 			      struct net_device *orig_dev)
1811 {
1812 	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1813 		return -ENOMEM;
1814 	atomic_inc(&skb->users);
1815 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1816 }
1817 
1818 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1819 					  struct packet_type **pt,
1820 					  struct net_device *orig_dev,
1821 					  __be16 type,
1822 					  struct list_head *ptype_list)
1823 {
1824 	struct packet_type *ptype, *pt_prev = *pt;
1825 
1826 	list_for_each_entry_rcu(ptype, ptype_list, list) {
1827 		if (ptype->type != type)
1828 			continue;
1829 		if (pt_prev)
1830 			deliver_skb(skb, pt_prev, orig_dev);
1831 		pt_prev = ptype;
1832 	}
1833 	*pt = pt_prev;
1834 }
1835 
1836 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1837 {
1838 	if (!ptype->af_packet_priv || !skb->sk)
1839 		return false;
1840 
1841 	if (ptype->id_match)
1842 		return ptype->id_match(ptype, skb->sk);
1843 	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1844 		return true;
1845 
1846 	return false;
1847 }
1848 
1849 /*
1850  *	Support routine. Sends outgoing frames to any network
1851  *	taps currently in use.
1852  */
1853 
1854 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1855 {
1856 	struct packet_type *ptype;
1857 	struct sk_buff *skb2 = NULL;
1858 	struct packet_type *pt_prev = NULL;
1859 	struct list_head *ptype_list = &ptype_all;
1860 
1861 	rcu_read_lock();
1862 again:
1863 	list_for_each_entry_rcu(ptype, ptype_list, list) {
1864 		/* Never send packets back to the socket
1865 		 * they originated from - MvS (miquels@drinkel.ow.org)
1866 		 */
1867 		if (skb_loop_sk(ptype, skb))
1868 			continue;
1869 
1870 		if (pt_prev) {
1871 			deliver_skb(skb2, pt_prev, skb->dev);
1872 			pt_prev = ptype;
1873 			continue;
1874 		}
1875 
1876 		/* need to clone skb, done only once */
1877 		skb2 = skb_clone(skb, GFP_ATOMIC);
1878 		if (!skb2)
1879 			goto out_unlock;
1880 
1881 		net_timestamp_set(skb2);
1882 
1883 		/* The network header should already be correctly
1884 		 * set by the sender, so the check below is
1885 		 * just protection against buggy protocols.
1886 		 */
1887 		skb_reset_mac_header(skb2);
1888 
1889 		if (skb_network_header(skb2) < skb2->data ||
1890 		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1891 			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1892 					     ntohs(skb2->protocol),
1893 					     dev->name);
1894 			skb_reset_network_header(skb2);
1895 		}
1896 
1897 		skb2->transport_header = skb2->network_header;
1898 		skb2->pkt_type = PACKET_OUTGOING;
1899 		pt_prev = ptype;
1900 	}
1901 
1902 	if (ptype_list == &ptype_all) {
1903 		ptype_list = &dev->ptype_all;
1904 		goto again;
1905 	}
1906 out_unlock:
1907 	if (pt_prev)
1908 		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1909 	rcu_read_unlock();
1910 }
1911 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1912 
1913 /**
1914  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1915  * @dev: Network device
1916  * @txq: number of queues available
1917  *
1918  * If real_num_tx_queues is changed, the tc mappings may no longer be
1919  * valid. To resolve this, verify that each tc mapping remains valid and,
1920  * if not, zero the mapping. With no priorities mapping to an
1921  * offset/count pair, it will no longer be used. In the worst case, if TC0
1922  * is invalid, nothing can be done, so priority mappings are disabled. It is
1923  * expected that drivers will fix this mapping if they can before
1924  * calling netif_set_real_num_tx_queues.
1925  */
1926 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1927 {
1928 	int i;
1929 	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1930 
1931 	/* If TC0 is invalidated disable TC mapping */
1932 	if (tc->offset + tc->count > txq) {
1933 		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1934 		dev->num_tc = 0;
1935 		return;
1936 	}
1937 
1938 	/* Invalidated prio-to-tc mappings are set to TC0 */
1939 	for (i = 1; i < TC_BITMASK + 1; i++) {
1940 		int q = netdev_get_prio_tc_map(dev, i);
1941 
1942 		tc = &dev->tc_to_txq[q];
1943 		if (tc->offset + tc->count > txq) {
1944 			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1945 				i, q);
1946 			netdev_set_prio_tc_map(dev, i, 0);
1947 		}
1948 	}
1949 }
1950 
1951 #ifdef CONFIG_XPS
1952 static DEFINE_MUTEX(xps_map_mutex);
1953 #define xmap_dereference(P)		\
1954 	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1955 
1956 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1957 					int cpu, u16 index)
1958 {
1959 	struct xps_map *map = NULL;
1960 	int pos;
1961 
1962 	if (dev_maps)
1963 		map = xmap_dereference(dev_maps->cpu_map[cpu]);
1964 
1965 	for (pos = 0; map && pos < map->len; pos++) {
1966 		if (map->queues[pos] == index) {
1967 			if (map->len > 1) {
1968 				map->queues[pos] = map->queues[--map->len];
1969 			} else {
1970 				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1971 				kfree_rcu(map, rcu);
1972 				map = NULL;
1973 			}
1974 			break;
1975 		}
1976 	}
1977 
1978 	return map;
1979 }
1980 
1981 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1982 {
1983 	struct xps_dev_maps *dev_maps;
1984 	int cpu, i;
1985 	bool active = false;
1986 
1987 	mutex_lock(&xps_map_mutex);
1988 	dev_maps = xmap_dereference(dev->xps_maps);
1989 
1990 	if (!dev_maps)
1991 		goto out_no_maps;
1992 
1993 	for_each_possible_cpu(cpu) {
1994 		for (i = index; i < dev->num_tx_queues; i++) {
1995 			if (!remove_xps_queue(dev_maps, cpu, i))
1996 				break;
1997 		}
1998 		if (i == dev->num_tx_queues)
1999 			active = true;
2000 	}
2001 
2002 	if (!active) {
2003 		RCU_INIT_POINTER(dev->xps_maps, NULL);
2004 		kfree_rcu(dev_maps, rcu);
2005 	}
2006 
2007 	for (i = index; i < dev->num_tx_queues; i++)
2008 		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2009 					     NUMA_NO_NODE);
2010 
2011 out_no_maps:
2012 	mutex_unlock(&xps_map_mutex);
2013 }
2014 
2015 static struct xps_map *expand_xps_map(struct xps_map *map,
2016 				      int cpu, u16 index)
2017 {
2018 	struct xps_map *new_map;
2019 	int alloc_len = XPS_MIN_MAP_ALLOC;
2020 	int i, pos;
2021 
2022 	for (pos = 0; map && pos < map->len; pos++) {
2023 		if (map->queues[pos] != index)
2024 			continue;
2025 		return map;
2026 	}
2027 
2028 	/* Need to add queue to this CPU's existing map */
2029 	if (map) {
2030 		if (pos < map->alloc_len)
2031 			return map;
2032 
2033 		alloc_len = map->alloc_len * 2;
2034 	}
2035 
2036 	/* Need to allocate a new map to store the queue on this CPU */
2037 	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2038 			       cpu_to_node(cpu));
2039 	if (!new_map)
2040 		return NULL;
2041 
2042 	for (i = 0; i < pos; i++)
2043 		new_map->queues[i] = map->queues[i];
2044 	new_map->alloc_len = alloc_len;
2045 	new_map->len = pos;
2046 
2047 	return new_map;
2048 }
2049 
2050 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2051 			u16 index)
2052 {
2053 	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2054 	struct xps_map *map, *new_map;
2055 	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2056 	int cpu, numa_node_id = -2;
2057 	bool active = false;
2058 
2059 	mutex_lock(&xps_map_mutex);
2060 
2061 	dev_maps = xmap_dereference(dev->xps_maps);
2062 
2063 	/* allocate memory for queue storage */
2064 	for_each_online_cpu(cpu) {
2065 		if (!cpumask_test_cpu(cpu, mask))
2066 			continue;
2067 
2068 		if (!new_dev_maps)
2069 			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2070 		if (!new_dev_maps) {
2071 			mutex_unlock(&xps_map_mutex);
2072 			return -ENOMEM;
2073 		}
2074 
2075 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2076 				 NULL;
2077 
2078 		map = expand_xps_map(map, cpu, index);
2079 		if (!map)
2080 			goto error;
2081 
2082 		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2083 	}
2084 
2085 	if (!new_dev_maps)
2086 		goto out_no_new_maps;
2087 
2088 	for_each_possible_cpu(cpu) {
2089 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2090 			/* add queue to CPU maps */
2091 			int pos = 0;
2092 
2093 			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2094 			while ((pos < map->len) && (map->queues[pos] != index))
2095 				pos++;
2096 
2097 			if (pos == map->len)
2098 				map->queues[map->len++] = index;
2099 #ifdef CONFIG_NUMA
2100 			if (numa_node_id == -2)
2101 				numa_node_id = cpu_to_node(cpu);
2102 			else if (numa_node_id != cpu_to_node(cpu))
2103 				numa_node_id = -1;
2104 #endif
2105 		} else if (dev_maps) {
2106 			/* fill in the new device map from the old device map */
2107 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
2108 			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2109 		}
2110 
2111 	}
2112 
2113 	rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2114 
2115 	/* Cleanup old maps */
2116 	if (dev_maps) {
2117 		for_each_possible_cpu(cpu) {
2118 			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2119 			map = xmap_dereference(dev_maps->cpu_map[cpu]);
2120 			if (map && map != new_map)
2121 				kfree_rcu(map, rcu);
2122 		}
2123 
2124 		kfree_rcu(dev_maps, rcu);
2125 	}
2126 
2127 	dev_maps = new_dev_maps;
2128 	active = true;
2129 
2130 out_no_new_maps:
2131 	/* update Tx queue numa node */
2132 	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2133 				     (numa_node_id >= 0) ? numa_node_id :
2134 				     NUMA_NO_NODE);
2135 
2136 	if (!dev_maps)
2137 		goto out_no_maps;
2138 
2139 	/* removes queue from unused CPUs */
2140 	for_each_possible_cpu(cpu) {
2141 		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2142 			continue;
2143 
2144 		if (remove_xps_queue(dev_maps, cpu, index))
2145 			active = true;
2146 	}
2147 
2148 	/* free map if not active */
2149 	if (!active) {
2150 		RCU_INIT_POINTER(dev->xps_maps, NULL);
2151 		kfree_rcu(dev_maps, rcu);
2152 	}
2153 
2154 out_no_maps:
2155 	mutex_unlock(&xps_map_mutex);
2156 
2157 	return 0;
2158 error:
2159 	/* remove any maps that we added */
2160 	for_each_possible_cpu(cpu) {
2161 		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2162 		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2163 				 NULL;
2164 		if (new_map && new_map != map)
2165 			kfree(new_map);
2166 	}
2167 
2168 	mutex_unlock(&xps_map_mutex);
2169 
2170 	kfree(new_dev_maps);
2171 	return -ENOMEM;
2172 }
2173 EXPORT_SYMBOL(netif_set_xps_queue);
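
/*
 * Illustrative sketch (not part of dev.c): a multiqueue driver steering each
 * TX queue to one CPU with netif_set_xps_queue(). The 1:1 queue-to-CPU policy
 * below is an assumption for the example, not a recommendation.
 */
static void my_driver_setup_xps(struct net_device *dev)
{
	cpumask_var_t mask;
	unsigned int qid;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return;

	for (qid = 0; qid < dev->real_num_tx_queues; qid++) {
		cpumask_clear(mask);
		cpumask_set_cpu(qid % num_online_cpus(), mask);
		/* CPUs in "mask" will prefer TX queue "qid" */
		netif_set_xps_queue(dev, mask, qid);
	}

	free_cpumask_var(mask);
}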
2174 
2175 #endif
2176 /*
2177  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2178  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2179  */
2180 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2181 {
2182 	int rc;
2183 
2184 	if (txq < 1 || txq > dev->num_tx_queues)
2185 		return -EINVAL;
2186 
2187 	if (dev->reg_state == NETREG_REGISTERED ||
2188 	    dev->reg_state == NETREG_UNREGISTERING) {
2189 		ASSERT_RTNL();
2190 
2191 		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2192 						  txq);
2193 		if (rc)
2194 			return rc;
2195 
2196 		if (dev->num_tc)
2197 			netif_setup_tc(dev, txq);
2198 
2199 		if (txq < dev->real_num_tx_queues) {
2200 			qdisc_reset_all_tx_gt(dev, txq);
2201 #ifdef CONFIG_XPS
2202 			netif_reset_xps_queues_gt(dev, txq);
2203 #endif
2204 		}
2205 	}
2206 
2207 	dev->real_num_tx_queues = txq;
2208 	return 0;
2209 }
2210 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2211 
2212 #ifdef CONFIG_SYSFS
2213 /**
2214  *	netif_set_real_num_rx_queues - set actual number of RX queues used
2215  *	@dev: Network device
2216  *	@rxq: Actual number of RX queues
2217  *
2218  *	This must be called either with the rtnl_lock held or before
2219  *	registration of the net device.  Returns 0 on success, or a
2220  *	negative error code.  If called before registration, it always
2221  *	succeeds.
2222  */
2223 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2224 {
2225 	int rc;
2226 
2227 	if (rxq < 1 || rxq > dev->num_rx_queues)
2228 		return -EINVAL;
2229 
2230 	if (dev->reg_state == NETREG_REGISTERED) {
2231 		ASSERT_RTNL();
2232 
2233 		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2234 						  rxq);
2235 		if (rc)
2236 			return rc;
2237 	}
2238 
2239 	dev->real_num_rx_queues = rxq;
2240 	return 0;
2241 }
2242 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2243 #endif
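
/*
 * Illustrative sketch (not part of dev.c): resizing the active queue set at
 * runtime, e.g. from an ethtool channel-count handler that already runs under
 * RTNL. my_set_channel_count() and its simplified signature are hypothetical.
 */
static int my_set_channel_count(struct net_device *dev, unsigned int count)
{
	int err;

	ASSERT_RTNL();

	err = netif_set_real_num_tx_queues(dev, count);
	if (err)
		return err;

	/* A real driver would also resize its private rings to match. */
	return netif_set_real_num_rx_queues(dev, count);
}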
2244 
2245 /**
2246  * netif_get_num_default_rss_queues - default number of RSS queues
2247  *
2248  * This routine should set an upper limit on the number of RSS queues
2249  * used by default by multiqueue devices.
2250  */
2251 int netif_get_num_default_rss_queues(void)
2252 {
2253 	return is_kdump_kernel() ?
2254 		1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2255 }
2256 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
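
/*
 * Illustrative sketch (not part of dev.c): picking a default queue count at
 * probe time, capped by what the hardware supports. MY_HW_MAX_QUEUES is a
 * hypothetical hardware limit.
 */
#define MY_HW_MAX_QUEUES	16

static unsigned int my_default_queue_count(void)
{
	return min_t(unsigned int, MY_HW_MAX_QUEUES,
		     netif_get_num_default_rss_queues());
}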
2257 
2258 static void __netif_reschedule(struct Qdisc *q)
2259 {
2260 	struct softnet_data *sd;
2261 	unsigned long flags;
2262 
2263 	local_irq_save(flags);
2264 	sd = this_cpu_ptr(&softnet_data);
2265 	q->next_sched = NULL;
2266 	*sd->output_queue_tailp = q;
2267 	sd->output_queue_tailp = &q->next_sched;
2268 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2269 	local_irq_restore(flags);
2270 }
2271 
2272 void __netif_schedule(struct Qdisc *q)
2273 {
2274 	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2275 		__netif_reschedule(q);
2276 }
2277 EXPORT_SYMBOL(__netif_schedule);
2278 
2279 struct dev_kfree_skb_cb {
2280 	enum skb_free_reason reason;
2281 };
2282 
2283 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2284 {
2285 	return (struct dev_kfree_skb_cb *)skb->cb;
2286 }
2287 
2288 void netif_schedule_queue(struct netdev_queue *txq)
2289 {
2290 	rcu_read_lock();
2291 	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2292 		struct Qdisc *q = rcu_dereference(txq->qdisc);
2293 
2294 		__netif_schedule(q);
2295 	}
2296 	rcu_read_unlock();
2297 }
2298 EXPORT_SYMBOL(netif_schedule_queue);
2299 
2300 /**
2301  *	netif_wake_subqueue - allow sending packets on subqueue
2302  *	@dev: network device
2303  *	@queue_index: sub queue index
2304  *
2305  * Resume individual transmit queue of a device with multiple transmit queues.
2306  */
2307 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2308 {
2309 	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2310 
2311 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2312 		struct Qdisc *q;
2313 
2314 		rcu_read_lock();
2315 		q = rcu_dereference(txq->qdisc);
2316 		__netif_schedule(q);
2317 		rcu_read_unlock();
2318 	}
2319 }
2320 EXPORT_SYMBOL(netif_wake_subqueue);
2321 
2322 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2323 {
2324 	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2325 		struct Qdisc *q;
2326 
2327 		rcu_read_lock();
2328 		q = rcu_dereference(dev_queue->qdisc);
2329 		__netif_schedule(q);
2330 		rcu_read_unlock();
2331 	}
2332 }
2333 EXPORT_SYMBOL(netif_tx_wake_queue);
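
/*
 * Illustrative sketch (not part of dev.c): a driver TX-completion handler
 * re-waking a stopped queue once enough descriptors are free again.
 * MY_TX_WAKE_THRESHOLD is a hypothetical watermark.
 */
#define MY_TX_WAKE_THRESHOLD	32

static void my_tx_complete(struct net_device *dev, u16 qid,
			   unsigned int descs_free)
{
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qid);

	if (netif_tx_queue_stopped(txq) && descs_free >= MY_TX_WAKE_THRESHOLD)
		netif_tx_wake_queue(txq);	/* reschedules the qdisc */
}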
2334 
2335 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2336 {
2337 	unsigned long flags;
2338 
2339 	if (likely(atomic_read(&skb->users) == 1)) {
2340 		smp_rmb();
2341 		atomic_set(&skb->users, 0);
2342 	} else if (likely(!atomic_dec_and_test(&skb->users))) {
2343 		return;
2344 	}
2345 	get_kfree_skb_cb(skb)->reason = reason;
2346 	local_irq_save(flags);
2347 	skb->next = __this_cpu_read(softnet_data.completion_queue);
2348 	__this_cpu_write(softnet_data.completion_queue, skb);
2349 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
2350 	local_irq_restore(flags);
2351 }
2352 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2353 
2354 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2355 {
2356 	if (in_irq() || irqs_disabled())
2357 		__dev_kfree_skb_irq(skb, reason);
2358 	else
2359 		dev_kfree_skb(skb);
2360 }
2361 EXPORT_SYMBOL(__dev_kfree_skb_any);
2362 
2363 
2364 /**
2365  * netif_device_detach - mark device as removed
2366  * @dev: network device
2367  *
2368  * Mark device as removed from system and therefore no longer available.
2369  */
2370 void netif_device_detach(struct net_device *dev)
2371 {
2372 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2373 	    netif_running(dev)) {
2374 		netif_tx_stop_all_queues(dev);
2375 	}
2376 }
2377 EXPORT_SYMBOL(netif_device_detach);
2378 
2379 /**
2380  * netif_device_attach - mark device as attached
2381  * @dev: network device
2382  *
2383  * Mark device as attached to the system and restart it if needed.
2384  */
2385 void netif_device_attach(struct net_device *dev)
2386 {
2387 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2388 	    netif_running(dev)) {
2389 		netif_tx_wake_all_queues(dev);
2390 		__netdev_watchdog_up(dev);
2391 	}
2392 }
2393 EXPORT_SYMBOL(netif_device_attach);
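
/*
 * Illustrative sketch (not part of dev.c): the usual suspend/resume pairing
 * of netif_device_detach()/netif_device_attach() in a driver's PM callbacks.
 * The hardware quiesce/restore steps are elided.
 */
static int my_suspend(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	netif_device_detach(dev);	/* stops all TX queues if running */
	/* ... quiesce hardware, save state ... */
	return 0;
}

static int my_resume(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	/* ... restore state, re-enable hardware ... */
	netif_device_attach(dev);	/* restarts queues and watchdog */
	return 0;
}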
2394 
2395 /*
2396  * Returns a Tx hash based on the given packet descriptor and the number of
2397  * Tx queues to be used as a distribution range.
2398  */
2399 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2400 		  unsigned int num_tx_queues)
2401 {
2402 	u32 hash;
2403 	u16 qoffset = 0;
2404 	u16 qcount = num_tx_queues;
2405 
2406 	if (skb_rx_queue_recorded(skb)) {
2407 		hash = skb_get_rx_queue(skb);
2408 		while (unlikely(hash >= num_tx_queues))
2409 			hash -= num_tx_queues;
2410 		return hash;
2411 	}
2412 
2413 	if (dev->num_tc) {
2414 		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2415 		qoffset = dev->tc_to_txq[tc].offset;
2416 		qcount = dev->tc_to_txq[tc].count;
2417 	}
2418 
2419 	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2420 }
2421 EXPORT_SYMBOL(__skb_tx_hash);
2422 
2423 static void skb_warn_bad_offload(const struct sk_buff *skb)
2424 {
2425 	static const netdev_features_t null_features = 0;
2426 	struct net_device *dev = skb->dev;
2427 	const char *name = "";
2428 
2429 	if (!net_ratelimit())
2430 		return;
2431 
2432 	if (dev) {
2433 		if (dev->dev.parent)
2434 			name = dev_driver_string(dev->dev.parent);
2435 		else
2436 			name = netdev_name(dev);
2437 	}
2438 	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2439 	     "gso_type=%d ip_summed=%d\n",
2440 	     name, dev ? &dev->features : &null_features,
2441 	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
2442 	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2443 	     skb_shinfo(skb)->gso_type, skb->ip_summed);
2444 }
2445 
2446 /*
2447  * Invalidate hardware checksum when packet is to be mangled, and
2448  * complete checksum manually on outgoing path.
2449  */
2450 int skb_checksum_help(struct sk_buff *skb)
2451 {
2452 	__wsum csum;
2453 	int ret = 0, offset;
2454 
2455 	if (skb->ip_summed == CHECKSUM_COMPLETE)
2456 		goto out_set_summed;
2457 
2458 	if (unlikely(skb_shinfo(skb)->gso_size)) {
2459 		skb_warn_bad_offload(skb);
2460 		return -EINVAL;
2461 	}
2462 
2463 	/* Before computing a checksum, we should make sure no frag could
2464 	 * be modified by an external entity: the checksum could be wrong.
2465 	 */
2466 	if (skb_has_shared_frag(skb)) {
2467 		ret = __skb_linearize(skb);
2468 		if (ret)
2469 			goto out;
2470 	}
2471 
2472 	offset = skb_checksum_start_offset(skb);
2473 	BUG_ON(offset >= skb_headlen(skb));
2474 	csum = skb_checksum(skb, offset, skb->len - offset, 0);
2475 
2476 	offset += skb->csum_offset;
2477 	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2478 
2479 	if (skb_cloned(skb) &&
2480 	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2481 		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2482 		if (ret)
2483 			goto out;
2484 	}
2485 
2486 	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
2487 out_set_summed:
2488 	skb->ip_summed = CHECKSUM_NONE;
2489 out:
2490 	return ret;
2491 }
2492 EXPORT_SYMBOL(skb_checksum_help);
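
/*
 * Illustrative sketch (not part of dev.c): a driver whose hardware can only
 * checksum plain IPv4 packets falling back to skb_checksum_help() for
 * everything else. my_hw_offload_ok() encodes a hypothetical policy.
 */
static bool my_hw_offload_ok(const struct sk_buff *skb)
{
	return skb->protocol == htons(ETH_P_IP);	/* hypothetical limit */
}

static int my_resolve_tx_csum(struct sk_buff *skb)
{
	if (skb->ip_summed != CHECKSUM_PARTIAL)
		return 0;			/* nothing to resolve */

	if (my_hw_offload_ok(skb))
		return 0;			/* let the NIC fill it in */

	/* complete the checksum in software before handing to the NIC */
	return skb_checksum_help(skb);
}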
2493 
2494 /* skb_csum_offload_check - Driver helper function to determine if a device
2495  * with limited checksum offload capabilities is able to offload the checksum
2496  * for a given packet.
2497  *
2498  * Arguments:
2499  *   skb - sk_buff for the packet in question
2500  *   spec - contains the description of what device can offload
2501  *   csum_encapped - returns true if the checksum being offloaded is
2502  *	      encapsulated. That is, it is the checksum for the transport
2503  *	      header in the inner headers.
2504  *   checksum_help - when set, indicates that the helper function should
2505  *	      call skb_checksum_help if the offload checks fail
2506  *
2507  * Returns:
2508  *   true: Packet has passed the checksum checks and should be offloadable to
2509  *	   the device (a driver may still need to check for additional
2510  *	   restrictions of its device)
2511  *   false: Checksum is not offloadable. If checksum_help was set, then
2512  *	   skb_checksum_help was called to resolve the checksum for non-GSO
2513  *	   packets and when the IP protocol is not SCTP
2514  */
2515 bool __skb_csum_offload_chk(struct sk_buff *skb,
2516 			    const struct skb_csum_offl_spec *spec,
2517 			    bool *csum_encapped,
2518 			    bool csum_help)
2519 {
2520 	struct iphdr *iph;
2521 	struct ipv6hdr *ipv6;
2522 	void *nhdr;
2523 	int protocol;
2524 	u8 ip_proto;
2525 
2526 	if (skb->protocol == htons(ETH_P_8021Q) ||
2527 	    skb->protocol == htons(ETH_P_8021AD)) {
2528 		if (!spec->vlan_okay)
2529 			goto need_help;
2530 	}
2531 
2532 	/* We check whether the checksum refers to a transport layer checksum in
2533 	 * the outermost header or an encapsulated transport layer checksum that
2534 	 * corresponds to the inner headers of the skb. If the checksum is for
2535 	 * something else in the packet we need help.
2536 	 */
2537 	if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
2538 		/* Non-encapsulated checksum */
2539 		protocol = eproto_to_ipproto(vlan_get_protocol(skb));
2540 		nhdr = skb_network_header(skb);
2541 		*csum_encapped = false;
2542 		if (spec->no_not_encapped)
2543 			goto need_help;
2544 	} else if (skb->encapsulation && spec->encap_okay &&
2545 		   skb_checksum_start_offset(skb) ==
2546 		   skb_inner_transport_offset(skb)) {
2547 		/* Encapsulated checksum */
2548 		*csum_encapped = true;
2549 		switch (skb->inner_protocol_type) {
2550 		case ENCAP_TYPE_ETHER:
2551 			protocol = eproto_to_ipproto(skb->inner_protocol);
2552 			break;
2553 		case ENCAP_TYPE_IPPROTO:
2554 			protocol = skb->inner_protocol;
2555 			break;
2556 		}
2557 		nhdr = skb_inner_network_header(skb);
2558 	} else {
2559 		goto need_help;
2560 	}
2561 
2562 	switch (protocol) {
2563 	case IPPROTO_IP:
2564 		if (!spec->ipv4_okay)
2565 			goto need_help;
2566 		iph = nhdr;
2567 		ip_proto = iph->protocol;
2568 		if (iph->ihl != 5 && !spec->ip_options_okay)
2569 			goto need_help;
2570 		break;
2571 	case IPPROTO_IPV6:
2572 		if (!spec->ipv6_okay)
2573 			goto need_help;
2574 		if (spec->no_encapped_ipv6 && *csum_encapped)
2575 			goto need_help;
2576 		ipv6 = nhdr;
2577 		nhdr += sizeof(*ipv6);
2578 		ip_proto = ipv6->nexthdr;
2579 		break;
2580 	default:
2581 		goto need_help;
2582 	}
2583 
2584 ip_proto_again:
2585 	switch (ip_proto) {
2586 	case IPPROTO_TCP:
2587 		if (!spec->tcp_okay ||
2588 		    skb->csum_offset != offsetof(struct tcphdr, check))
2589 			goto need_help;
2590 		break;
2591 	case IPPROTO_UDP:
2592 		if (!spec->udp_okay ||
2593 		    skb->csum_offset != offsetof(struct udphdr, check))
2594 			goto need_help;
2595 		break;
2596 	case IPPROTO_SCTP:
2597 		if (!spec->sctp_okay ||
2598 		    skb->csum_offset != offsetof(struct sctphdr, checksum))
2599 			goto cant_help;
2600 		break;
2601 	case NEXTHDR_HOP:
2602 	case NEXTHDR_ROUTING:
2603 	case NEXTHDR_DEST: {
2604 		u8 *opthdr = nhdr;
2605 
2606 		if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
2607 			goto need_help;
2608 
2609 		ip_proto = opthdr[0];
2610 		nhdr += (opthdr[1] + 1) << 3;
2611 
2612 		goto ip_proto_again;
2613 	}
2614 	default:
2615 		goto need_help;
2616 	}
2617 
2618 	/* Passed the tests for offloading checksum */
2619 	return true;
2620 
2621 need_help:
2622 	if (csum_help && !skb_shinfo(skb)->gso_size)
2623 		skb_checksum_help(skb);
2624 cant_help:
2625 	return false;
2626 }
2627 EXPORT_SYMBOL(__skb_csum_offload_chk);
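
/*
 * Illustrative sketch (not part of dev.c): a driver with TCP/UDP checksum
 * offload over IPv4/IPv6 (VLAN tolerated) asking the helper above whether a
 * given skb is offloadable, resolving it in software otherwise. The spec
 * values describe an assumed hardware capability set.
 */
static bool my_can_offload_csum(struct sk_buff *skb)
{
	static const struct skb_csum_offl_spec spec = {
		.ipv4_okay = 1,
		.ipv6_okay = 1,
		.vlan_okay = 1,
		.tcp_okay = 1,
		.udp_okay = 1,
	};
	bool csum_encapped;

	/* csum_help=true: fall back to skb_checksum_help() on failure */
	return __skb_csum_offload_chk(skb, &spec, &csum_encapped, true);
}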
2628 
2629 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2630 {
2631 	__be16 type = skb->protocol;
2632 
2633 	/* Tunnel gso handlers can set protocol to ethernet. */
2634 	if (type == htons(ETH_P_TEB)) {
2635 		struct ethhdr *eth;
2636 
2637 		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2638 			return 0;
2639 
2640 		eth = (struct ethhdr *)skb_mac_header(skb);
2641 		type = eth->h_proto;
2642 	}
2643 
2644 	return __vlan_get_protocol(skb, type, depth);
2645 }
2646 
2647 /**
2648  *	skb_mac_gso_segment - mac layer segmentation handler.
2649  *	@skb: buffer to segment
2650  *	@features: features for the output path (see dev->features)
2651  */
2652 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2653 				    netdev_features_t features)
2654 {
2655 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2656 	struct packet_offload *ptype;
2657 	int vlan_depth = skb->mac_len;
2658 	__be16 type = skb_network_protocol(skb, &vlan_depth);
2659 
2660 	if (unlikely(!type))
2661 		return ERR_PTR(-EINVAL);
2662 
2663 	__skb_pull(skb, vlan_depth);
2664 
2665 	rcu_read_lock();
2666 	list_for_each_entry_rcu(ptype, &offload_base, list) {
2667 		if (ptype->type == type && ptype->callbacks.gso_segment) {
2668 			segs = ptype->callbacks.gso_segment(skb, features);
2669 			break;
2670 		}
2671 	}
2672 	rcu_read_unlock();
2673 
2674 	__skb_push(skb, skb->data - skb_mac_header(skb));
2675 
2676 	return segs;
2677 }
2678 EXPORT_SYMBOL(skb_mac_gso_segment);
2679 
2680 
2681 /* openvswitch calls this on rx path, so we need a different check.
2682  */
2683 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2684 {
2685 	if (tx_path)
2686 		return skb->ip_summed != CHECKSUM_PARTIAL;
2687 	else
2688 		return skb->ip_summed == CHECKSUM_NONE;
2689 }
2690 
2691 /**
2692  *	__skb_gso_segment - Perform segmentation on skb.
2693  *	@skb: buffer to segment
2694  *	@features: features for the output path (see dev->features)
2695  *	@tx_path: whether it is called in TX path
2696  *
2697  *	This function segments the given skb and returns a list of segments.
2698  *
2699  *	It may return NULL if the skb requires no segmentation.  This is
2700  *	only possible when GSO is used for verifying header integrity.
2701  *
2702  *	Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2703  */
2704 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2705 				  netdev_features_t features, bool tx_path)
2706 {
2707 	if (unlikely(skb_needs_check(skb, tx_path))) {
2708 		int err;
2709 
2710 		skb_warn_bad_offload(skb);
2711 
2712 		err = skb_cow_head(skb, 0);
2713 		if (err < 0)
2714 			return ERR_PTR(err);
2715 	}
2716 
2717 	/* Only report GSO partial support if it will enable us to
2718 	 * support segmentation on this frame without needing additional
2719 	 * work.
2720 	 */
2721 	if (features & NETIF_F_GSO_PARTIAL) {
2722 		netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2723 		struct net_device *dev = skb->dev;
2724 
2725 		partial_features |= dev->features & dev->gso_partial_features;
2726 		if (!skb_gso_ok(skb, features | partial_features))
2727 			features &= ~NETIF_F_GSO_PARTIAL;
2728 	}
2729 
2730 	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2731 		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2732 
2733 	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2734 	SKB_GSO_CB(skb)->encap_level = 0;
2735 
2736 	skb_reset_mac_header(skb);
2737 	skb_reset_mac_len(skb);
2738 
2739 	return skb_mac_gso_segment(skb, features);
2740 }
2741 EXPORT_SYMBOL(__skb_gso_segment);
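
/*
 * Illustrative sketch (not part of dev.c): the usual consumer pattern for
 * skb_gso_segment() - segment, then walk the returned list one frame at a
 * time. my_xmit_one() is a hypothetical per-frame transmit helper (stubbed
 * here so the sketch is self-contained).
 */
static int my_xmit_one(struct sk_buff *skb)
{
	/* hypothetical: hand one linear frame to the hardware */
	kfree_skb(skb);
	return 0;
}

static int my_xmit_gso(struct sk_buff *skb, netdev_features_t features)
{
	struct sk_buff *segs, *next;

	segs = skb_gso_segment(skb, features);
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)
		return my_xmit_one(skb);	/* no segmentation was needed */

	consume_skb(skb);			/* original skb is now unused */
	while (segs) {
		next = segs->next;
		segs->next = NULL;
		my_xmit_one(segs);
		segs = next;
	}
	return 0;
}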
2742 
2743 /* Take action when hardware reception checksum errors are detected. */
2744 #ifdef CONFIG_BUG
2745 void netdev_rx_csum_fault(struct net_device *dev)
2746 {
2747 	if (net_ratelimit()) {
2748 		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2749 		dump_stack();
2750 	}
2751 }
2752 EXPORT_SYMBOL(netdev_rx_csum_fault);
2753 #endif
2754 
2755 /* Actually, we should eliminate this check as soon as we know that:
2756  * 1. An IOMMU is present and allows mapping all the memory.
2757  * 2. No high memory really exists on this machine.
2758  */
2759 
2760 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2761 {
2762 #ifdef CONFIG_HIGHMEM
2763 	int i;
2764 	if (!(dev->features & NETIF_F_HIGHDMA)) {
2765 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2766 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2767 			if (PageHighMem(skb_frag_page(frag)))
2768 				return 1;
2769 		}
2770 	}
2771 
2772 	if (PCI_DMA_BUS_IS_PHYS) {
2773 		struct device *pdev = dev->dev.parent;
2774 
2775 		if (!pdev)
2776 			return 0;
2777 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2778 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2779 			dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2780 			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2781 				return 1;
2782 		}
2783 	}
2784 #endif
2785 	return 0;
2786 }
2787 
2788 /* If MPLS offload request, verify we are testing hardware MPLS features
2789  * instead of standard features for the netdev.
2790  */
2791 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2792 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2793 					   netdev_features_t features,
2794 					   __be16 type)
2795 {
2796 	if (eth_p_mpls(type))
2797 		features &= skb->dev->mpls_features;
2798 
2799 	return features;
2800 }
2801 #else
2802 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2803 					   netdev_features_t features,
2804 					   __be16 type)
2805 {
2806 	return features;
2807 }
2808 #endif
2809 
2810 static netdev_features_t harmonize_features(struct sk_buff *skb,
2811 	netdev_features_t features)
2812 {
2813 	int tmp;
2814 	__be16 type;
2815 
2816 	type = skb_network_protocol(skb, &tmp);
2817 	features = net_mpls_features(skb, features, type);
2818 
2819 	if (skb->ip_summed != CHECKSUM_NONE &&
2820 	    !can_checksum_protocol(features, type)) {
2821 		features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2822 	} else if (illegal_highdma(skb->dev, skb)) {
2823 		features &= ~NETIF_F_SG;
2824 	}
2825 
2826 	return features;
2827 }
2828 
2829 netdev_features_t passthru_features_check(struct sk_buff *skb,
2830 					  struct net_device *dev,
2831 					  netdev_features_t features)
2832 {
2833 	return features;
2834 }
2835 EXPORT_SYMBOL(passthru_features_check);
2836 
2837 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2838 					     struct net_device *dev,
2839 					     netdev_features_t features)
2840 {
2841 	return vlan_features_check(skb, features);
2842 }
2843 
2844 static netdev_features_t gso_features_check(const struct sk_buff *skb,
2845 					    struct net_device *dev,
2846 					    netdev_features_t features)
2847 {
2848 	u16 gso_segs = skb_shinfo(skb)->gso_segs;
2849 
2850 	if (gso_segs > dev->gso_max_segs)
2851 		return features & ~NETIF_F_GSO_MASK;
2852 
2853 	/* Support for GSO partial features requires software
2854 	 * intervention before we can actually process the packets,
2855 	 * so we need to strip support for any partial features now;
2856 	 * we can pull them back in after we have partially
2857 	 * segmented the frame.
2858 	 */
2859 	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2860 		features &= ~dev->gso_partial_features;
2861 
2862 	/* Make sure to clear the IPv4 ID mangling feature if the
2863 	 * IPv4 header has the potential to be fragmented.
2864 	 */
2865 	if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2866 		struct iphdr *iph = skb->encapsulation ?
2867 				    inner_ip_hdr(skb) : ip_hdr(skb);
2868 
2869 		if (!(iph->frag_off & htons(IP_DF)))
2870 			features &= ~NETIF_F_TSO_MANGLEID;
2871 	}
2872 
2873 	return features;
2874 }
2875 
2876 netdev_features_t netif_skb_features(struct sk_buff *skb)
2877 {
2878 	struct net_device *dev = skb->dev;
2879 	netdev_features_t features = dev->features;
2880 
2881 	if (skb_is_gso(skb))
2882 		features = gso_features_check(skb, dev, features);
2883 
2884 	/* If encapsulation offload request, verify we are testing
2885 	 * hardware encapsulation features instead of standard
2886 	 * features for the netdev
2887 	 */
2888 	if (skb->encapsulation)
2889 		features &= dev->hw_enc_features;
2890 
2891 	if (skb_vlan_tagged(skb))
2892 		features = netdev_intersect_features(features,
2893 						     dev->vlan_features |
2894 						     NETIF_F_HW_VLAN_CTAG_TX |
2895 						     NETIF_F_HW_VLAN_STAG_TX);
2896 
2897 	if (dev->netdev_ops->ndo_features_check)
2898 		features &= dev->netdev_ops->ndo_features_check(skb, dev,
2899 								features);
2900 	else
2901 		features &= dflt_features_check(skb, dev, features);
2902 
2903 	return harmonize_features(skb, features);
2904 }
2905 EXPORT_SYMBOL(netif_skb_features);
2906 
2907 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2908 		    struct netdev_queue *txq, bool more)
2909 {
2910 	unsigned int len;
2911 	int rc;
2912 
2913 	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2914 		dev_queue_xmit_nit(skb, dev);
2915 
2916 	len = skb->len;
2917 	trace_net_dev_start_xmit(skb, dev);
2918 	rc = netdev_start_xmit(skb, dev, txq, more);
2919 	trace_net_dev_xmit(skb, rc, dev, len);
2920 
2921 	return rc;
2922 }
2923 
2924 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2925 				    struct netdev_queue *txq, int *ret)
2926 {
2927 	struct sk_buff *skb = first;
2928 	int rc = NETDEV_TX_OK;
2929 
2930 	while (skb) {
2931 		struct sk_buff *next = skb->next;
2932 
2933 		skb->next = NULL;
2934 		rc = xmit_one(skb, dev, txq, next != NULL);
2935 		if (unlikely(!dev_xmit_complete(rc))) {
2936 			skb->next = next;
2937 			goto out;
2938 		}
2939 
2940 		skb = next;
2941 		if (netif_xmit_stopped(txq) && skb) {
2942 			rc = NETDEV_TX_BUSY;
2943 			break;
2944 		}
2945 	}
2946 
2947 out:
2948 	*ret = rc;
2949 	return skb;
2950 }
2951 
2952 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2953 					  netdev_features_t features)
2954 {
2955 	if (skb_vlan_tag_present(skb) &&
2956 	    !vlan_hw_offload_capable(features, skb->vlan_proto))
2957 		skb = __vlan_hwaccel_push_inside(skb);
2958 	return skb;
2959 }
2960 
2961 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2962 {
2963 	netdev_features_t features;
2964 
2965 	features = netif_skb_features(skb);
2966 	skb = validate_xmit_vlan(skb, features);
2967 	if (unlikely(!skb))
2968 		goto out_null;
2969 
2970 	if (netif_needs_gso(skb, features)) {
2971 		struct sk_buff *segs;
2972 
2973 		segs = skb_gso_segment(skb, features);
2974 		if (IS_ERR(segs)) {
2975 			goto out_kfree_skb;
2976 		} else if (segs) {
2977 			consume_skb(skb);
2978 			skb = segs;
2979 		}
2980 	} else {
2981 		if (skb_needs_linearize(skb, features) &&
2982 		    __skb_linearize(skb))
2983 			goto out_kfree_skb;
2984 
2985 		/* If packet is not checksummed and device does not
2986 		 * support checksumming for this protocol, complete
2987 		 * checksumming here.
2988 		 */
2989 		if (skb->ip_summed == CHECKSUM_PARTIAL) {
2990 			if (skb->encapsulation)
2991 				skb_set_inner_transport_header(skb,
2992 							       skb_checksum_start_offset(skb));
2993 			else
2994 				skb_set_transport_header(skb,
2995 							 skb_checksum_start_offset(skb));
2996 			if (!(features & NETIF_F_CSUM_MASK) &&
2997 			    skb_checksum_help(skb))
2998 				goto out_kfree_skb;
2999 		}
3000 	}
3001 
3002 	return skb;
3003 
3004 out_kfree_skb:
3005 	kfree_skb(skb);
3006 out_null:
3007 	atomic_long_inc(&dev->tx_dropped);
3008 	return NULL;
3009 }
3010 
3011 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
3012 {
3013 	struct sk_buff *next, *head = NULL, *tail;
3014 
3015 	for (; skb != NULL; skb = next) {
3016 		next = skb->next;
3017 		skb->next = NULL;
3018 
3019 		/* in case skb won't be segmented, point to itself */
3020 		skb->prev = skb;
3021 
3022 		skb = validate_xmit_skb(skb, dev);
3023 		if (!skb)
3024 			continue;
3025 
3026 		if (!head)
3027 			head = skb;
3028 		else
3029 			tail->next = skb;
3030 		/* If skb was segmented, skb->prev points to
3031 		 * the last segment. If not, it still contains skb.
3032 		 */
3033 		tail = skb->prev;
3034 	}
3035 	return head;
3036 }
3037 
3038 static void qdisc_pkt_len_init(struct sk_buff *skb)
3039 {
3040 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
3041 
3042 	qdisc_skb_cb(skb)->pkt_len = skb->len;
3043 
3044 	/* To get a more precise estimate of bytes sent on the wire,
3045 	 * we add the header size of all segments to pkt_len
3046 	 */
3047 	if (shinfo->gso_size)  {
3048 		unsigned int hdr_len;
3049 		u16 gso_segs = shinfo->gso_segs;
3050 
3051 		/* mac layer + network layer */
3052 		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3053 
3054 		/* + transport layer */
3055 		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
3056 			hdr_len += tcp_hdrlen(skb);
3057 		else
3058 			hdr_len += sizeof(struct udphdr);
3059 
3060 		if (shinfo->gso_type & SKB_GSO_DODGY)
3061 			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3062 						shinfo->gso_size);
3063 
3064 		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3065 	}
3066 }
3067 
3068 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3069 				 struct net_device *dev,
3070 				 struct netdev_queue *txq)
3071 {
3072 	spinlock_t *root_lock = qdisc_lock(q);
3073 	bool contended;
3074 	int rc;
3075 
3076 	qdisc_calculate_pkt_len(skb, q);
3077 	/*
3078 	 * Heuristic to force contended enqueues to serialize on a
3079 	 * separate lock before trying to get qdisc main lock.
3080 	 * This permits qdisc->running owner to get the lock more
3081 	 * often and dequeue packets faster.
3082 	 */
3083 	contended = qdisc_is_running(q);
3084 	if (unlikely(contended))
3085 		spin_lock(&q->busylock);
3086 
3087 	spin_lock(root_lock);
3088 	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3089 		kfree_skb(skb);
3090 		rc = NET_XMIT_DROP;
3091 	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3092 		   qdisc_run_begin(q)) {
3093 		/*
3094 		 * This is a work-conserving queue; there are no old skbs
3095 		 * waiting to be sent out; and the qdisc is not running -
3096 		 * xmit the skb directly.
3097 		 */
3098 
3099 		qdisc_bstats_update(q, skb);
3100 
3101 		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3102 			if (unlikely(contended)) {
3103 				spin_unlock(&q->busylock);
3104 				contended = false;
3105 			}
3106 			__qdisc_run(q);
3107 		} else
3108 			qdisc_run_end(q);
3109 
3110 		rc = NET_XMIT_SUCCESS;
3111 	} else {
3112 		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
3113 		if (qdisc_run_begin(q)) {
3114 			if (unlikely(contended)) {
3115 				spin_unlock(&q->busylock);
3116 				contended = false;
3117 			}
3118 			__qdisc_run(q);
3119 		}
3120 	}
3121 	spin_unlock(root_lock);
3122 	if (unlikely(contended))
3123 		spin_unlock(&q->busylock);
3124 	return rc;
3125 }
3126 
3127 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3128 static void skb_update_prio(struct sk_buff *skb)
3129 {
3130 	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3131 
3132 	if (!skb->priority && skb->sk && map) {
3133 		unsigned int prioidx =
3134 			sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3135 
3136 		if (prioidx < map->priomap_len)
3137 			skb->priority = map->priomap[prioidx];
3138 	}
3139 }
3140 #else
3141 #define skb_update_prio(skb)
3142 #endif
3143 
3144 DEFINE_PER_CPU(int, xmit_recursion);
3145 EXPORT_SYMBOL(xmit_recursion);
3146 
3147 #define RECURSION_LIMIT 10
3148 
3149 /**
3150  *	dev_loopback_xmit - loop back @skb
3151  *	@net: network namespace this loopback is happening in
3152  *	@sk:  sk needed to be a netfilter okfn
3153  *	@skb: buffer to transmit
3154  */
3155 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3156 {
3157 	skb_reset_mac_header(skb);
3158 	__skb_pull(skb, skb_network_offset(skb));
3159 	skb->pkt_type = PACKET_LOOPBACK;
3160 	skb->ip_summed = CHECKSUM_UNNECESSARY;
3161 	WARN_ON(!skb_dst(skb));
3162 	skb_dst_force(skb);
3163 	netif_rx_ni(skb);
3164 	return 0;
3165 }
3166 EXPORT_SYMBOL(dev_loopback_xmit);
3167 
3168 #ifdef CONFIG_NET_EGRESS
3169 static struct sk_buff *
3170 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3171 {
3172 	struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3173 	struct tcf_result cl_res;
3174 
3175 	if (!cl)
3176 		return skb;
3177 
3178 	/* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
3179 	 * earlier by the caller.
3180 	 */
3181 	qdisc_bstats_cpu_update(cl->q, skb);
3182 
3183 	switch (tc_classify(skb, cl, &cl_res, false)) {
3184 	case TC_ACT_OK:
3185 	case TC_ACT_RECLASSIFY:
3186 		skb->tc_index = TC_H_MIN(cl_res.classid);
3187 		break;
3188 	case TC_ACT_SHOT:
3189 		qdisc_qstats_cpu_drop(cl->q);
3190 		*ret = NET_XMIT_DROP;
3191 		kfree_skb(skb);
3192 		return NULL;
3193 	case TC_ACT_STOLEN:
3194 	case TC_ACT_QUEUED:
3195 		*ret = NET_XMIT_SUCCESS;
3196 		consume_skb(skb);
3197 		return NULL;
3198 	case TC_ACT_REDIRECT:
3199 		/* No need to push/pop skb's mac_header here on egress! */
3200 		skb_do_redirect(skb);
3201 		*ret = NET_XMIT_SUCCESS;
3202 		return NULL;
3203 	default:
3204 		break;
3205 	}
3206 
3207 	return skb;
3208 }
3209 #endif /* CONFIG_NET_EGRESS */
3210 
3211 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3212 {
3213 #ifdef CONFIG_XPS
3214 	struct xps_dev_maps *dev_maps;
3215 	struct xps_map *map;
3216 	int queue_index = -1;
3217 
3218 	rcu_read_lock();
3219 	dev_maps = rcu_dereference(dev->xps_maps);
3220 	if (dev_maps) {
3221 		map = rcu_dereference(
3222 		    dev_maps->cpu_map[skb->sender_cpu - 1]);
3223 		if (map) {
3224 			if (map->len == 1)
3225 				queue_index = map->queues[0];
3226 			else
3227 				queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3228 									   map->len)];
3229 			if (unlikely(queue_index >= dev->real_num_tx_queues))
3230 				queue_index = -1;
3231 		}
3232 	}
3233 	rcu_read_unlock();
3234 
3235 	return queue_index;
3236 #else
3237 	return -1;
3238 #endif
3239 }
3240 
3241 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3242 {
3243 	struct sock *sk = skb->sk;
3244 	int queue_index = sk_tx_queue_get(sk);
3245 
3246 	if (queue_index < 0 || skb->ooo_okay ||
3247 	    queue_index >= dev->real_num_tx_queues) {
3248 		int new_index = get_xps_queue(dev, skb);
3249 		if (new_index < 0)
3250 			new_index = skb_tx_hash(dev, skb);
3251 
3252 		if (queue_index != new_index && sk &&
3253 		    sk_fullsock(sk) &&
3254 		    rcu_access_pointer(sk->sk_dst_cache))
3255 			sk_tx_queue_set(sk, new_index);
3256 
3257 		queue_index = new_index;
3258 	}
3259 
3260 	return queue_index;
3261 }
3262 
3263 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3264 				    struct sk_buff *skb,
3265 				    void *accel_priv)
3266 {
3267 	int queue_index = 0;
3268 
3269 #ifdef CONFIG_XPS
3270 	u32 sender_cpu = skb->sender_cpu - 1;
3271 
3272 	if (sender_cpu >= (u32)NR_CPUS)
3273 		skb->sender_cpu = raw_smp_processor_id() + 1;
3274 #endif
3275 
3276 	if (dev->real_num_tx_queues != 1) {
3277 		const struct net_device_ops *ops = dev->netdev_ops;
3278 		if (ops->ndo_select_queue)
3279 			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3280 							    __netdev_pick_tx);
3281 		else
3282 			queue_index = __netdev_pick_tx(dev, skb);
3283 
3284 		if (!accel_priv)
3285 			queue_index = netdev_cap_txqueue(dev, queue_index);
3286 	}
3287 
3288 	skb_set_queue_mapping(skb, queue_index);
3289 	return netdev_get_tx_queue(dev, queue_index);
3290 }
3291 
3292 /**
3293  *	__dev_queue_xmit - transmit a buffer
3294  *	@skb: buffer to transmit
3295  *	@accel_priv: private data used for L2 forwarding offload
3296  *
3297  *	Queue a buffer for transmission to a network device. The caller must
3298  *	have set the device and priority and built the buffer before calling
3299  *	this function. The function can be called from an interrupt.
3300  *
3301  *	A negative errno code is returned on a failure. A success does not
3302  *	guarantee the frame will be transmitted as it may be dropped due
3303  *	to congestion or traffic shaping.
3304  *
3305  * -----------------------------------------------------------------------------------
3306  *      I notice this method can also return errors from the queue disciplines,
3307  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3308  *      be positive.
3309  *
3310  *      Regardless of the return value, the skb is consumed, so it is currently
3311  *      difficult to retry a send to this method.  (You can bump the ref count
3312  *      before sending to hold a reference for retry if you are careful.)
3313  *
3314  *      When calling this method, interrupts MUST be enabled.  This is because
3315  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3316  *          --BLG
3317  */
3318 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3319 {
3320 	struct net_device *dev = skb->dev;
3321 	struct netdev_queue *txq;
3322 	struct Qdisc *q;
3323 	int rc = -ENOMEM;
3324 
3325 	skb_reset_mac_header(skb);
3326 
3327 	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3328 		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3329 
3330 	/* Disable soft irqs for various locks below. Also
3331 	 * stops preemption for RCU.
3332 	 */
3333 	rcu_read_lock_bh();
3334 
3335 	skb_update_prio(skb);
3336 
3337 	qdisc_pkt_len_init(skb);
3338 #ifdef CONFIG_NET_CLS_ACT
3339 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3340 # ifdef CONFIG_NET_EGRESS
3341 	if (static_key_false(&egress_needed)) {
3342 		skb = sch_handle_egress(skb, &rc, dev);
3343 		if (!skb)
3344 			goto out;
3345 	}
3346 # endif
3347 #endif
3348 	/* If the device/qdisc doesn't need skb->dst, release it right now while
3349 	 * it's hot in this CPU's cache.
3350 	 */
3351 	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3352 		skb_dst_drop(skb);
3353 	else
3354 		skb_dst_force(skb);
3355 
3356 #ifdef CONFIG_NET_SWITCHDEV
3357 	/* Don't forward if offload device already forwarded */
3358 	if (skb->offload_fwd_mark &&
3359 	    skb->offload_fwd_mark == dev->offload_fwd_mark) {
3360 		consume_skb(skb);
3361 		rc = NET_XMIT_SUCCESS;
3362 		goto out;
3363 	}
3364 #endif
3365 
3366 	txq = netdev_pick_tx(dev, skb, accel_priv);
3367 	q = rcu_dereference_bh(txq->qdisc);
3368 
3369 	trace_net_dev_queue(skb);
3370 	if (q->enqueue) {
3371 		rc = __dev_xmit_skb(skb, q, dev, txq);
3372 		goto out;
3373 	}
3374 
3375 	/* The device has no queue. Common case for software devices:
3376 	   loopback, all sorts of tunnels...

3378 	   Really, it is unlikely that netif_tx_lock protection is necessary
3379 	   here.  (f.e. loopback and IP tunnels are clean, ignoring statistics
3380 	   counters.)
3381 	   However, it is possible that they rely on the protection
3382 	   we provide here.

3384 	   Check this and shoot the lock: it is not prone to deadlocks.
3385 	   Or shoot the noqueue qdisc; that is even simpler 8)
3386 	 */
3387 	if (dev->flags & IFF_UP) {
3388 		int cpu = smp_processor_id(); /* ok because BHs are off */
3389 
3390 		if (txq->xmit_lock_owner != cpu) {
3391 
3392 			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
3393 				goto recursion_alert;
3394 
3395 			skb = validate_xmit_skb(skb, dev);
3396 			if (!skb)
3397 				goto out;
3398 
3399 			HARD_TX_LOCK(dev, txq, cpu);
3400 
3401 			if (!netif_xmit_stopped(txq)) {
3402 				__this_cpu_inc(xmit_recursion);
3403 				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3404 				__this_cpu_dec(xmit_recursion);
3405 				if (dev_xmit_complete(rc)) {
3406 					HARD_TX_UNLOCK(dev, txq);
3407 					goto out;
3408 				}
3409 			}
3410 			HARD_TX_UNLOCK(dev, txq);
3411 			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3412 					     dev->name);
3413 		} else {
3414 			/* Recursion is detected! It is possible,
3415 			 * unfortunately
3416 			 */
3417 recursion_alert:
3418 			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3419 					     dev->name);
3420 		}
3421 	}
3422 
3423 	rc = -ENETDOWN;
3424 	rcu_read_unlock_bh();
3425 
3426 	atomic_long_inc(&dev->tx_dropped);
3427 	kfree_skb_list(skb);
3428 	return rc;
3429 out:
3430 	rcu_read_unlock_bh();
3431 	return rc;
3432 }
3433 
3434 int dev_queue_xmit(struct sk_buff *skb)
3435 {
3436 	return __dev_queue_xmit(skb, NULL);
3437 }
3438 EXPORT_SYMBOL(dev_queue_xmit);
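
/*
 * Illustrative sketch (not part of dev.c): building a minimal Ethernet frame
 * in a kernel module and handing it to the qdisc layer with dev_queue_xmit().
 * The broadcast destination, 64-byte zero payload and experimental ethertype
 * 0x88b5 are assumptions for the example.
 */
static int my_send_test_frame(struct net_device *dev)
{
	struct sk_buff *skb;
	struct ethhdr *eth;

	skb = alloc_skb(LL_RESERVED_SPACE(dev) + 64, GFP_ATOMIC);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));	/* room for headers */
	skb_reset_network_header(skb);
	memset(skb_put(skb, 64), 0, 64);		/* dummy payload */

	eth = (struct ethhdr *)skb_push(skb, ETH_HLEN);
	skb_reset_mac_header(skb);
	eth_broadcast_addr(eth->h_dest);
	ether_addr_copy(eth->h_source, dev->dev_addr);
	eth->h_proto = htons(0x88b5);			/* local experimental */

	skb->dev = dev;
	skb->protocol = eth->h_proto;

	return dev_queue_xmit(skb);			/* consumes the skb */
}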
3439 
3440 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3441 {
3442 	return __dev_queue_xmit(skb, accel_priv);
3443 }
3444 EXPORT_SYMBOL(dev_queue_xmit_accel);
3445 
3446 
3447 /*=======================================================================
3448 			Receiver routines
3449   =======================================================================*/
3450 
3451 int netdev_max_backlog __read_mostly = 1000;
3452 EXPORT_SYMBOL(netdev_max_backlog);
3453 
3454 int netdev_tstamp_prequeue __read_mostly = 1;
3455 int netdev_budget __read_mostly = 300;
3456 int weight_p __read_mostly = 64;            /* old backlog weight */
3457 
3458 /* Called with irq disabled */
3459 static inline void ____napi_schedule(struct softnet_data *sd,
3460 				     struct napi_struct *napi)
3461 {
3462 	list_add_tail(&napi->poll_list, &sd->poll_list);
3463 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3464 }
3465 
3466 #ifdef CONFIG_RPS
3467 
3468 /* One global table that all flow-based protocols share. */
3469 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3470 EXPORT_SYMBOL(rps_sock_flow_table);
3471 u32 rps_cpu_mask __read_mostly;
3472 EXPORT_SYMBOL(rps_cpu_mask);
3473 
3474 struct static_key rps_needed __read_mostly;
3475 EXPORT_SYMBOL(rps_needed);
3476 
3477 static struct rps_dev_flow *
3478 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3479 	    struct rps_dev_flow *rflow, u16 next_cpu)
3480 {
3481 	if (next_cpu < nr_cpu_ids) {
3482 #ifdef CONFIG_RFS_ACCEL
3483 		struct netdev_rx_queue *rxqueue;
3484 		struct rps_dev_flow_table *flow_table;
3485 		struct rps_dev_flow *old_rflow;
3486 		u32 flow_id;
3487 		u16 rxq_index;
3488 		int rc;
3489 
3490 		/* Should we steer this flow to a different hardware queue? */
3491 		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3492 		    !(dev->features & NETIF_F_NTUPLE))
3493 			goto out;
3494 		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3495 		if (rxq_index == skb_get_rx_queue(skb))
3496 			goto out;
3497 
3498 		rxqueue = dev->_rx + rxq_index;
3499 		flow_table = rcu_dereference(rxqueue->rps_flow_table);
3500 		if (!flow_table)
3501 			goto out;
3502 		flow_id = skb_get_hash(skb) & flow_table->mask;
3503 		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3504 							rxq_index, flow_id);
3505 		if (rc < 0)
3506 			goto out;
3507 		old_rflow = rflow;
3508 		rflow = &flow_table->flows[flow_id];
3509 		rflow->filter = rc;
3510 		if (old_rflow->filter == rflow->filter)
3511 			old_rflow->filter = RPS_NO_FILTER;
3512 	out:
3513 #endif
3514 		rflow->last_qtail =
3515 			per_cpu(softnet_data, next_cpu).input_queue_head;
3516 	}
3517 
3518 	rflow->cpu = next_cpu;
3519 	return rflow;
3520 }
3521 
3522 /*
3523  * get_rps_cpu is called from netif_receive_skb and returns the target
3524  * CPU from the RPS map of the receiving queue for a given skb.
3525  * rcu_read_lock must be held on entry.
3526  */
3527 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3528 		       struct rps_dev_flow **rflowp)
3529 {
3530 	const struct rps_sock_flow_table *sock_flow_table;
3531 	struct netdev_rx_queue *rxqueue = dev->_rx;
3532 	struct rps_dev_flow_table *flow_table;
3533 	struct rps_map *map;
3534 	int cpu = -1;
3535 	u32 tcpu;
3536 	u32 hash;
3537 
3538 	if (skb_rx_queue_recorded(skb)) {
3539 		u16 index = skb_get_rx_queue(skb);
3540 
3541 		if (unlikely(index >= dev->real_num_rx_queues)) {
3542 			WARN_ONCE(dev->real_num_rx_queues > 1,
3543 				  "%s received packet on queue %u, but number "
3544 				  "of RX queues is %u\n",
3545 				  dev->name, index, dev->real_num_rx_queues);
3546 			goto done;
3547 		}
3548 		rxqueue += index;
3549 	}
3550 
3551 	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3552 
3553 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3554 	map = rcu_dereference(rxqueue->rps_map);
3555 	if (!flow_table && !map)
3556 		goto done;
3557 
3558 	skb_reset_network_header(skb);
3559 	hash = skb_get_hash(skb);
3560 	if (!hash)
3561 		goto done;
3562 
3563 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
3564 	if (flow_table && sock_flow_table) {
3565 		struct rps_dev_flow *rflow;
3566 		u32 next_cpu;
3567 		u32 ident;
3568 
3569 		/* First check into global flow table if there is a match */
3570 		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3571 		if ((ident ^ hash) & ~rps_cpu_mask)
3572 			goto try_rps;
3573 
3574 		next_cpu = ident & rps_cpu_mask;
3575 
3576 		/* OK, now we know there is a match,
3577 		 * we can look at the local (per receive queue) flow table
3578 		 */
3579 		rflow = &flow_table->flows[hash & flow_table->mask];
3580 		tcpu = rflow->cpu;
3581 
3582 		/*
3583 		 * If the desired CPU (where last recvmsg was done) is
3584 		 * different from current CPU (one in the rx-queue flow
3585 		 * table entry), switch if one of the following holds:
3586 		 *   - Current CPU is unset (>= nr_cpu_ids).
3587 		 *   - Current CPU is offline.
3588 		 *   - The current CPU's queue tail has advanced beyond the
3589 		 *     last packet that was enqueued using this table entry.
3590 		 *     This guarantees that all previous packets for the flow
3591 		 *     have been dequeued, thus preserving in order delivery.
3592 		 */
3593 		if (unlikely(tcpu != next_cpu) &&
3594 		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3595 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3596 		      rflow->last_qtail)) >= 0)) {
3597 			tcpu = next_cpu;
3598 			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3599 		}
3600 
3601 		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3602 			*rflowp = rflow;
3603 			cpu = tcpu;
3604 			goto done;
3605 		}
3606 	}
3607 
3608 try_rps:
3609 
3610 	if (map) {
3611 		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3612 		if (cpu_online(tcpu)) {
3613 			cpu = tcpu;
3614 			goto done;
3615 		}
3616 	}
3617 
3618 done:
3619 	return cpu;
3620 }
3621 
3622 #ifdef CONFIG_RFS_ACCEL
3623 
3624 /**
3625  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3626  * @dev: Device on which the filter was set
3627  * @rxq_index: RX queue index
3628  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3629  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3630  *
3631  * Drivers that implement ndo_rx_flow_steer() should periodically call
3632  * this function for each installed filter and remove the filters for
3633  * which it returns %true.
3634  */
3635 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3636 			 u32 flow_id, u16 filter_id)
3637 {
3638 	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3639 	struct rps_dev_flow_table *flow_table;
3640 	struct rps_dev_flow *rflow;
3641 	bool expire = true;
3642 	unsigned int cpu;
3643 
3644 	rcu_read_lock();
3645 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
3646 	if (flow_table && flow_id <= flow_table->mask) {
3647 		rflow = &flow_table->flows[flow_id];
3648 		cpu = ACCESS_ONCE(rflow->cpu);
3649 		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3650 		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3651 			   rflow->last_qtail) <
3652 		     (int)(10 * flow_table->mask)))
3653 			expire = false;
3654 	}
3655 	rcu_read_unlock();
3656 	return expire;
3657 }
3658 EXPORT_SYMBOL(rps_may_expire_flow);
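
/*
 * Illustrative sketch (not part of dev.c): a periodic scan that an RFS-accel
 * capable driver might run to retire stale hardware steering filters. The
 * struct my_filter table and its fields are hypothetical driver bookkeeping.
 */
struct my_filter {
	bool in_use;
	u16 rxq_index;
	u32 flow_id;
	u16 filter_id;
};

static void my_expire_filters(struct net_device *dev, struct my_filter *tbl,
			      unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (!tbl[i].in_use)
			continue;
		if (rps_may_expire_flow(dev, tbl[i].rxq_index,
					tbl[i].flow_id, tbl[i].filter_id)) {
			/* ... program the NIC to drop this filter here ... */
			tbl[i].in_use = false;
		}
	}
}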
3659 
3660 #endif /* CONFIG_RFS_ACCEL */
3661 
3662 /* Called from hardirq (IPI) context */
3663 static void rps_trigger_softirq(void *data)
3664 {
3665 	struct softnet_data *sd = data;
3666 
3667 	____napi_schedule(sd, &sd->backlog);
3668 	sd->received_rps++;
3669 }
3670 
3671 #endif /* CONFIG_RPS */
3672 
3673 /*
3674  * Check if this softnet_data structure belongs to another CPU.
3675  * If yes, queue it to our IPI list and return 1.
3676  * If no, return 0.
3677  */
3678 static int rps_ipi_queued(struct softnet_data *sd)
3679 {
3680 #ifdef CONFIG_RPS
3681 	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3682 
3683 	if (sd != mysd) {
3684 		sd->rps_ipi_next = mysd->rps_ipi_list;
3685 		mysd->rps_ipi_list = sd;
3686 
3687 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
3688 		return 1;
3689 	}
3690 #endif /* CONFIG_RPS */
3691 	return 0;
3692 }
3693 
3694 #ifdef CONFIG_NET_FLOW_LIMIT
3695 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3696 #endif
3697 
3698 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3699 {
3700 #ifdef CONFIG_NET_FLOW_LIMIT
3701 	struct sd_flow_limit *fl;
3702 	struct softnet_data *sd;
3703 	unsigned int old_flow, new_flow;
3704 
3705 	if (qlen < (netdev_max_backlog >> 1))
3706 		return false;
3707 
3708 	sd = this_cpu_ptr(&softnet_data);
3709 
3710 	rcu_read_lock();
3711 	fl = rcu_dereference(sd->flow_limit);
3712 	if (fl) {
3713 		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3714 		old_flow = fl->history[fl->history_head];
3715 		fl->history[fl->history_head] = new_flow;
3716 
3717 		fl->history_head++;
3718 		fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3719 
3720 		if (likely(fl->buckets[old_flow]))
3721 			fl->buckets[old_flow]--;
3722 
3723 		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3724 			fl->count++;
3725 			rcu_read_unlock();
3726 			return true;
3727 		}
3728 	}
3729 	rcu_read_unlock();
3730 #endif
3731 	return false;
3732 }
3733 
3734 /*
3735  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3736  * queue (may be a remote CPU queue).
3737  */
3738 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3739 			      unsigned int *qtail)
3740 {
3741 	struct softnet_data *sd;
3742 	unsigned long flags;
3743 	unsigned int qlen;
3744 
3745 	sd = &per_cpu(softnet_data, cpu);
3746 
3747 	local_irq_save(flags);
3748 
3749 	rps_lock(sd);
3750 	if (!netif_running(skb->dev))
3751 		goto drop;
3752 	qlen = skb_queue_len(&sd->input_pkt_queue);
3753 	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3754 		if (qlen) {
3755 enqueue:
3756 			__skb_queue_tail(&sd->input_pkt_queue, skb);
3757 			input_queue_tail_incr_save(sd, qtail);
3758 			rps_unlock(sd);
3759 			local_irq_restore(flags);
3760 			return NET_RX_SUCCESS;
3761 		}
3762 
3763 		/* Schedule NAPI for the backlog device.
3764 		 * We can use a non-atomic operation since we own the queue lock.
3765 		 */
3766 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3767 			if (!rps_ipi_queued(sd))
3768 				____napi_schedule(sd, &sd->backlog);
3769 		}
3770 		goto enqueue;
3771 	}
3772 
3773 drop:
3774 	sd->dropped++;
3775 	rps_unlock(sd);
3776 
3777 	local_irq_restore(flags);
3778 
3779 	atomic_long_inc(&skb->dev->rx_dropped);
3780 	kfree_skb(skb);
3781 	return NET_RX_DROP;
3782 }
3783 
3784 static int netif_rx_internal(struct sk_buff *skb)
3785 {
3786 	int ret;
3787 
3788 	net_timestamp_check(netdev_tstamp_prequeue, skb);
3789 
3790 	trace_netif_rx(skb);
3791 #ifdef CONFIG_RPS
3792 	if (static_key_false(&rps_needed)) {
3793 		struct rps_dev_flow voidflow, *rflow = &voidflow;
3794 		int cpu;
3795 
3796 		preempt_disable();
3797 		rcu_read_lock();
3798 
3799 		cpu = get_rps_cpu(skb->dev, skb, &rflow);
3800 		if (cpu < 0)
3801 			cpu = smp_processor_id();
3802 
3803 		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3804 
3805 		rcu_read_unlock();
3806 		preempt_enable();
3807 	} else
3808 #endif
3809 	{
3810 		unsigned int qtail;
3811 		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3812 		put_cpu();
3813 	}
3814 	return ret;
3815 }
3816 
3817 /**
3818  *	netif_rx	-	post buffer to the network code
3819  *	@skb: buffer to post
3820  *
3821  *	This function receives a packet from a device driver and queues it for
3822  *	the upper (protocol) levels to process.  It always succeeds. The buffer
3823  *	may be dropped during processing for congestion control or by the
3824  *	protocol layers.
3825  *
3826  *	return values:
3827  *	NET_RX_SUCCESS	(no congestion)
3828  *	NET_RX_DROP     (packet was dropped)
3829  *
3830  */
3831 
3832 int netif_rx(struct sk_buff *skb)
3833 {
3834 	trace_netif_rx_entry(skb);
3835 
3836 	return netif_rx_internal(skb);
3837 }
3838 EXPORT_SYMBOL(netif_rx);
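/* Editor's note: a minimal sketch of the usual caller, assuming a
 * hypothetical non-NAPI driver; foo_rx_irq() and its buffer handling
 * are invented, only the netif_rx() call reflects this file.
 *
 *	static void foo_rx_irq(struct net_device *dev, const void *buf, int len)
 *	{
 *		struct sk_buff *skb;
 *
 *		skb = netdev_alloc_skb_ip_align(dev, len);
 *		if (!skb) {
 *			dev->stats.rx_dropped++;
 *			return;
 *		}
 *		memcpy(skb_put(skb, len), buf, len);
 *		skb->protocol = eth_type_trans(skb, dev);
 *		netif_rx(skb);
 *	}
 */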
3839 
3840 int netif_rx_ni(struct sk_buff *skb)
3841 {
3842 	int err;
3843 
3844 	trace_netif_rx_ni_entry(skb);
3845 
3846 	preempt_disable();
3847 	err = netif_rx_internal(skb);
3848 	if (local_softirq_pending())
3849 		do_softirq();
3850 	preempt_enable();
3851 
3852 	return err;
3853 }
3854 EXPORT_SYMBOL(netif_rx_ni);
3855 
3856 static void net_tx_action(struct softirq_action *h)
3857 {
3858 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3859 
3860 	if (sd->completion_queue) {
3861 		struct sk_buff *clist;
3862 
3863 		local_irq_disable();
3864 		clist = sd->completion_queue;
3865 		sd->completion_queue = NULL;
3866 		local_irq_enable();
3867 
3868 		while (clist) {
3869 			struct sk_buff *skb = clist;
3870 			clist = clist->next;
3871 
3872 			WARN_ON(atomic_read(&skb->users));
3873 			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3874 				trace_consume_skb(skb);
3875 			else
3876 				trace_kfree_skb(skb, net_tx_action);
3877 
3878 			if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3879 				__kfree_skb(skb);
3880 			else
3881 				__kfree_skb_defer(skb);
3882 		}
3883 
3884 		__kfree_skb_flush();
3885 	}
3886 
3887 	if (sd->output_queue) {
3888 		struct Qdisc *head;
3889 
3890 		local_irq_disable();
3891 		head = sd->output_queue;
3892 		sd->output_queue = NULL;
3893 		sd->output_queue_tailp = &sd->output_queue;
3894 		local_irq_enable();
3895 
3896 		while (head) {
3897 			struct Qdisc *q = head;
3898 			spinlock_t *root_lock;
3899 
3900 			head = head->next_sched;
3901 
3902 			root_lock = qdisc_lock(q);
3903 			spin_lock(root_lock);
3904 			/* We need to make sure head->next_sched is read
3905 			 * before clearing __QDISC_STATE_SCHED
3906 			 */
3907 			smp_mb__before_atomic();
3908 			clear_bit(__QDISC_STATE_SCHED, &q->state);
3909 			qdisc_run(q);
3910 			spin_unlock(root_lock);
3911 		}
3912 	}
3913 }
3914 
3915 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3916     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3917 /* This hook is defined here for ATM LANE */
3918 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3919 			     unsigned char *addr) __read_mostly;
3920 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3921 #endif
3922 
3923 static inline struct sk_buff *
3924 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3925 		   struct net_device *orig_dev)
3926 {
3927 #ifdef CONFIG_NET_CLS_ACT
3928 	struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3929 	struct tcf_result cl_res;
3930 
3931 	/* If there's at least one ingress present somewhere (so
3932 	 * we get here via enabled static key), remaining devices
3933 	 * that are not configured with an ingress qdisc will bail
3934 	 * out here.
3935 	 */
3936 	if (!cl)
3937 		return skb;
3938 	if (*pt_prev) {
3939 		*ret = deliver_skb(skb, *pt_prev, orig_dev);
3940 		*pt_prev = NULL;
3941 	}
3942 
3943 	qdisc_skb_cb(skb)->pkt_len = skb->len;
3944 	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3945 	qdisc_bstats_cpu_update(cl->q, skb);
3946 
3947 	switch (tc_classify(skb, cl, &cl_res, false)) {
3948 	case TC_ACT_OK:
3949 	case TC_ACT_RECLASSIFY:
3950 		skb->tc_index = TC_H_MIN(cl_res.classid);
3951 		break;
3952 	case TC_ACT_SHOT:
3953 		qdisc_qstats_cpu_drop(cl->q);
3954 		kfree_skb(skb);
3955 		return NULL;
3956 	case TC_ACT_STOLEN:
3957 	case TC_ACT_QUEUED:
3958 		consume_skb(skb);
3959 		return NULL;
3960 	case TC_ACT_REDIRECT:
3961 		/* skb_mac_header check was done by cls/act_bpf, so
3962 		 * we can safely push the L2 header back before
3963 		 * redirecting to another netdev
3964 		 */
3965 		__skb_push(skb, skb->mac_len);
3966 		skb_do_redirect(skb);
3967 		return NULL;
3968 	default:
3969 		break;
3970 	}
3971 #endif /* CONFIG_NET_CLS_ACT */
3972 	return skb;
3973 }
3974 
3975 /**
3976  *	netdev_rx_handler_register - register receive handler
3977  *	@dev: device to register a handler for
3978  *	@rx_handler: receive handler to register
3979  *	@rx_handler_data: data pointer that is used by rx handler
3980  *
3981  *	Register a receive handler for a device. This handler will then be
3982  *	called from __netif_receive_skb. A negative errno code is returned
3983  *	on a failure.
3984  *
3985  *	The caller must hold the rtnl_mutex.
3986  *
3987  *	For a general description of rx_handler, see enum rx_handler_result.
3988  */
3989 int netdev_rx_handler_register(struct net_device *dev,
3990 			       rx_handler_func_t *rx_handler,
3991 			       void *rx_handler_data)
3992 {
3993 	ASSERT_RTNL();
3994 
3995 	if (dev->rx_handler)
3996 		return -EBUSY;
3997 
3998 	/* Note: rx_handler_data must be set before rx_handler */
3999 	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
4000 	rcu_assign_pointer(dev->rx_handler, rx_handler);
4001 
4002 	return 0;
4003 }
4004 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
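/* Editor's note: a hedged sketch of the register/unregister pairing,
 * loosely along the lines of what bridge/team/macvlan do; the foo_*
 * names are hypothetical.
 *
 *	static rx_handler_result_t foo_handle_frame(struct sk_buff **pskb)
 *	{
 *		struct sk_buff *skb = *pskb;
 *		struct foo_port *port;
 *
 *		port = rcu_dereference(skb->dev->rx_handler_data);
 *		if (!foo_port_wants(port, skb))
 *			return RX_HANDLER_PASS;
 *
 *		skb->dev = port->upper_dev;
 *		return RX_HANDLER_ANOTHER;
 *	}
 *
 * Registration happens under rtnl_lock():
 *
 *	err = netdev_rx_handler_register(port_dev, foo_handle_frame, port);
 *
 * with a matching netdev_rx_handler_unregister(port_dev) on teardown.
 */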
4005 
4006 /**
4007  *	netdev_rx_handler_unregister - unregister receive handler
4008  *	@dev: device to unregister a handler from
4009  *
4010  *	Unregister a receive handler from a device.
4011  *
4012  *	The caller must hold the rtnl_mutex.
4013  */
4014 void netdev_rx_handler_unregister(struct net_device *dev)
4015 {
4016 
4017 	ASSERT_RTNL();
4018 	RCU_INIT_POINTER(dev->rx_handler, NULL);
4019 	/* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
4020 	 * section is guaranteed to see a non-NULL rx_handler_data
4021 	 * as well.
4022 	 */
4023 	synchronize_net();
4024 	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
4025 }
4026 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
4027 
4028 /*
4029  * Limit the use of PFMEMALLOC reserves to those protocols that implement
4030  * the special handling of PFMEMALLOC skbs.
4031  */
4032 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
4033 {
4034 	switch (skb->protocol) {
4035 	case htons(ETH_P_ARP):
4036 	case htons(ETH_P_IP):
4037 	case htons(ETH_P_IPV6):
4038 	case htons(ETH_P_8021Q):
4039 	case htons(ETH_P_8021AD):
4040 		return true;
4041 	default:
4042 		return false;
4043 	}
4044 }
4045 
4046 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
4047 			     int *ret, struct net_device *orig_dev)
4048 {
4049 #ifdef CONFIG_NETFILTER_INGRESS
4050 	if (nf_hook_ingress_active(skb)) {
4051 		if (*pt_prev) {
4052 			*ret = deliver_skb(skb, *pt_prev, orig_dev);
4053 			*pt_prev = NULL;
4054 		}
4055 
4056 		return nf_hook_ingress(skb);
4057 	}
4058 #endif /* CONFIG_NETFILTER_INGRESS */
4059 	return 0;
4060 }
4061 
4062 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
4063 {
4064 	struct packet_type *ptype, *pt_prev;
4065 	rx_handler_func_t *rx_handler;
4066 	struct net_device *orig_dev;
4067 	bool deliver_exact = false;
4068 	int ret = NET_RX_DROP;
4069 	__be16 type;
4070 
4071 	net_timestamp_check(!netdev_tstamp_prequeue, skb);
4072 
4073 	trace_netif_receive_skb(skb);
4074 
4075 	orig_dev = skb->dev;
4076 
4077 	skb_reset_network_header(skb);
4078 	if (!skb_transport_header_was_set(skb))
4079 		skb_reset_transport_header(skb);
4080 	skb_reset_mac_len(skb);
4081 
4082 	pt_prev = NULL;
4083 
4084 another_round:
4085 	skb->skb_iif = skb->dev->ifindex;
4086 
4087 	__this_cpu_inc(softnet_data.processed);
4088 
4089 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
4090 	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
4091 		skb = skb_vlan_untag(skb);
4092 		if (unlikely(!skb))
4093 			goto out;
4094 	}
4095 
4096 #ifdef CONFIG_NET_CLS_ACT
4097 	if (skb->tc_verd & TC_NCLS) {
4098 		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
4099 		goto ncls;
4100 	}
4101 #endif
4102 
4103 	if (pfmemalloc)
4104 		goto skip_taps;
4105 
4106 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
4107 		if (pt_prev)
4108 			ret = deliver_skb(skb, pt_prev, orig_dev);
4109 		pt_prev = ptype;
4110 	}
4111 
4112 	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
4113 		if (pt_prev)
4114 			ret = deliver_skb(skb, pt_prev, orig_dev);
4115 		pt_prev = ptype;
4116 	}
4117 
4118 skip_taps:
4119 #ifdef CONFIG_NET_INGRESS
4120 	if (static_key_false(&ingress_needed)) {
4121 		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
4122 		if (!skb)
4123 			goto out;
4124 
4125 		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4126 			goto out;
4127 	}
4128 #endif
4129 #ifdef CONFIG_NET_CLS_ACT
4130 	skb->tc_verd = 0;
4131 ncls:
4132 #endif
4133 	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4134 		goto drop;
4135 
4136 	if (skb_vlan_tag_present(skb)) {
4137 		if (pt_prev) {
4138 			ret = deliver_skb(skb, pt_prev, orig_dev);
4139 			pt_prev = NULL;
4140 		}
4141 		if (vlan_do_receive(&skb))
4142 			goto another_round;
4143 		else if (unlikely(!skb))
4144 			goto out;
4145 	}
4146 
4147 	rx_handler = rcu_dereference(skb->dev->rx_handler);
4148 	if (rx_handler) {
4149 		if (pt_prev) {
4150 			ret = deliver_skb(skb, pt_prev, orig_dev);
4151 			pt_prev = NULL;
4152 		}
4153 		switch (rx_handler(&skb)) {
4154 		case RX_HANDLER_CONSUMED:
4155 			ret = NET_RX_SUCCESS;
4156 			goto out;
4157 		case RX_HANDLER_ANOTHER:
4158 			goto another_round;
4159 		case RX_HANDLER_EXACT:
4160 			deliver_exact = true;
4161 		case RX_HANDLER_PASS:
4162 			break;
4163 		default:
4164 			BUG();
4165 		}
4166 	}
4167 
4168 	if (unlikely(skb_vlan_tag_present(skb))) {
4169 		if (skb_vlan_tag_get_id(skb))
4170 			skb->pkt_type = PACKET_OTHERHOST;
4171 		/* Note: we might in the future use prio bits
4172 		 * and set skb->priority like in vlan_do_receive()
4173 		 * For the time being, just ignore Priority Code Point
4174 		 */
4175 		skb->vlan_tci = 0;
4176 	}
4177 
4178 	type = skb->protocol;
4179 
4180 	/* deliver only exact match when indicated */
4181 	if (likely(!deliver_exact)) {
4182 		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4183 				       &ptype_base[ntohs(type) &
4184 						   PTYPE_HASH_MASK]);
4185 	}
4186 
4187 	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4188 			       &orig_dev->ptype_specific);
4189 
4190 	if (unlikely(skb->dev != orig_dev)) {
4191 		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4192 				       &skb->dev->ptype_specific);
4193 	}
4194 
4195 	if (pt_prev) {
4196 		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4197 			goto drop;
4198 		else
4199 			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4200 	} else {
4201 drop:
4202 		if (!deliver_exact)
4203 			atomic_long_inc(&skb->dev->rx_dropped);
4204 		else
4205 			atomic_long_inc(&skb->dev->rx_nohandler);
4206 		kfree_skb(skb);
4207 		/* Jamal, now you will not be able to escape explaining
4208 		 * to me how you were going to use this. :-)
4209 		 */
4210 		ret = NET_RX_DROP;
4211 	}
4212 
4213 out:
4214 	return ret;
4215 }
4216 
4217 static int __netif_receive_skb(struct sk_buff *skb)
4218 {
4219 	int ret;
4220 
4221 	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4222 		unsigned long pflags = current->flags;
4223 
4224 		/*
4225 		 * PFMEMALLOC skbs are special, they should
4226 		 * - be delivered to SOCK_MEMALLOC sockets only
4227 		 * - stay away from userspace
4228 		 * - have bounded memory usage
4229 		 *
4230 		 * Use PF_MEMALLOC as this saves us from propagating the allocation
4231 		 * context down to all allocation sites.
4232 		 */
4233 		current->flags |= PF_MEMALLOC;
4234 		ret = __netif_receive_skb_core(skb, true);
4235 		tsk_restore_flags(current, pflags, PF_MEMALLOC);
4236 	} else
4237 		ret = __netif_receive_skb_core(skb, false);
4238 
4239 	return ret;
4240 }
4241 
4242 static int netif_receive_skb_internal(struct sk_buff *skb)
4243 {
4244 	int ret;
4245 
4246 	net_timestamp_check(netdev_tstamp_prequeue, skb);
4247 
4248 	if (skb_defer_rx_timestamp(skb))
4249 		return NET_RX_SUCCESS;
4250 
4251 	rcu_read_lock();
4252 
4253 #ifdef CONFIG_RPS
4254 	if (static_key_false(&rps_needed)) {
4255 		struct rps_dev_flow voidflow, *rflow = &voidflow;
4256 		int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4257 
4258 		if (cpu >= 0) {
4259 			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4260 			rcu_read_unlock();
4261 			return ret;
4262 		}
4263 	}
4264 #endif
4265 	ret = __netif_receive_skb(skb);
4266 	rcu_read_unlock();
4267 	return ret;
4268 }
4269 
4270 /**
4271  *	netif_receive_skb - process receive buffer from network
4272  *	@skb: buffer to process
4273  *
4274  *	netif_receive_skb() is the main receive data processing function.
4275  *	It always succeeds. The buffer may be dropped during processing
4276  *	for congestion control or by the protocol layers.
4277  *
4278  *	This function may only be called from softirq context and interrupts
4279  *	should be enabled.
4280  *
4281  *	Return values (usually ignored):
4282  *	NET_RX_SUCCESS: no congestion
4283  *	NET_RX_DROP: packet was dropped
4284  */
4285 int netif_receive_skb(struct sk_buff *skb)
4286 {
4287 	trace_netif_receive_skb_entry(skb);
4288 
4289 	return netif_receive_skb_internal(skb);
4290 }
4291 EXPORT_SYMBOL(netif_receive_skb);
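/* Editor's note: unlike netif_rx(), which may be called from hard irq
 * context and queues to the backlog, netif_receive_skb() processes the
 * packet synchronously and is meant for softirq context, typically a
 * NAPI poll routine.  A minimal, hypothetical non-GRO delivery:
 *
 *	skb->protocol = eth_type_trans(skb, netdev);
 *	netif_receive_skb(skb);
 *
 * Drivers that can use GRO normally call napi_gro_receive() instead
 * (see below).
 */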
4292 
4293 /* Network device is going away; flush any packets still pending.
4294  * Called with irqs disabled.
4295  */
4296 static void flush_backlog(void *arg)
4297 {
4298 	struct net_device *dev = arg;
4299 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4300 	struct sk_buff *skb, *tmp;
4301 
4302 	rps_lock(sd);
4303 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4304 		if (skb->dev == dev) {
4305 			__skb_unlink(skb, &sd->input_pkt_queue);
4306 			kfree_skb(skb);
4307 			input_queue_head_incr(sd);
4308 		}
4309 	}
4310 	rps_unlock(sd);
4311 
4312 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4313 		if (skb->dev == dev) {
4314 			__skb_unlink(skb, &sd->process_queue);
4315 			kfree_skb(skb);
4316 			input_queue_head_incr(sd);
4317 		}
4318 	}
4319 }
4320 
4321 static int napi_gro_complete(struct sk_buff *skb)
4322 {
4323 	struct packet_offload *ptype;
4324 	__be16 type = skb->protocol;
4325 	struct list_head *head = &offload_base;
4326 	int err = -ENOENT;
4327 
4328 	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4329 
4330 	if (NAPI_GRO_CB(skb)->count == 1) {
4331 		skb_shinfo(skb)->gso_size = 0;
4332 		goto out;
4333 	}
4334 
4335 	rcu_read_lock();
4336 	list_for_each_entry_rcu(ptype, head, list) {
4337 		if (ptype->type != type || !ptype->callbacks.gro_complete)
4338 			continue;
4339 
4340 		err = ptype->callbacks.gro_complete(skb, 0);
4341 		break;
4342 	}
4343 	rcu_read_unlock();
4344 
4345 	if (err) {
4346 		WARN_ON(&ptype->list == head);
4347 		kfree_skb(skb);
4348 		return NET_RX_SUCCESS;
4349 	}
4350 
4351 out:
4352 	return netif_receive_skb_internal(skb);
4353 }
4354 
4355 /* napi->gro_list contains packets ordered by age, with the
4356  * youngest packets at the head.
4357  * Complete skbs in reverse order to reduce latencies.
4358  */
4359 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4360 {
4361 	struct sk_buff *skb, *prev = NULL;
4362 
4363 	/* scan list and build reverse chain */
4364 	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4365 		skb->prev = prev;
4366 		prev = skb;
4367 	}
4368 
4369 	for (skb = prev; skb; skb = prev) {
4370 		skb->next = NULL;
4371 
4372 		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4373 			return;
4374 
4375 		prev = skb->prev;
4376 		napi_gro_complete(skb);
4377 		napi->gro_count--;
4378 	}
4379 
4380 	napi->gro_list = NULL;
4381 }
4382 EXPORT_SYMBOL(napi_gro_flush);
4383 
4384 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4385 {
4386 	struct sk_buff *p;
4387 	unsigned int maclen = skb->dev->hard_header_len;
4388 	u32 hash = skb_get_hash_raw(skb);
4389 
4390 	for (p = napi->gro_list; p; p = p->next) {
4391 		unsigned long diffs;
4392 
4393 		NAPI_GRO_CB(p)->flush = 0;
4394 
4395 		if (hash != skb_get_hash_raw(p)) {
4396 			NAPI_GRO_CB(p)->same_flow = 0;
4397 			continue;
4398 		}
4399 
4400 		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4401 		diffs |= p->vlan_tci ^ skb->vlan_tci;
4402 		diffs |= skb_metadata_dst_cmp(p, skb);
4403 		if (maclen == ETH_HLEN)
4404 			diffs |= compare_ether_header(skb_mac_header(p),
4405 						      skb_mac_header(skb));
4406 		else if (!diffs)
4407 			diffs = memcmp(skb_mac_header(p),
4408 				       skb_mac_header(skb),
4409 				       maclen);
4410 		NAPI_GRO_CB(p)->same_flow = !diffs;
4411 	}
4412 }
4413 
4414 static void skb_gro_reset_offset(struct sk_buff *skb)
4415 {
4416 	const struct skb_shared_info *pinfo = skb_shinfo(skb);
4417 	const skb_frag_t *frag0 = &pinfo->frags[0];
4418 
4419 	NAPI_GRO_CB(skb)->data_offset = 0;
4420 	NAPI_GRO_CB(skb)->frag0 = NULL;
4421 	NAPI_GRO_CB(skb)->frag0_len = 0;
4422 
4423 	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4424 	    pinfo->nr_frags &&
4425 	    !PageHighMem(skb_frag_page(frag0))) {
4426 		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4427 		NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4428 	}
4429 }
4430 
4431 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4432 {
4433 	struct skb_shared_info *pinfo = skb_shinfo(skb);
4434 
4435 	BUG_ON(skb->end - skb->tail < grow);
4436 
4437 	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4438 
4439 	skb->data_len -= grow;
4440 	skb->tail += grow;
4441 
4442 	pinfo->frags[0].page_offset += grow;
4443 	skb_frag_size_sub(&pinfo->frags[0], grow);
4444 
4445 	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4446 		skb_frag_unref(skb, 0);
4447 		memmove(pinfo->frags, pinfo->frags + 1,
4448 			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
4449 	}
4450 }
4451 
4452 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4453 {
4454 	struct sk_buff **pp = NULL;
4455 	struct packet_offload *ptype;
4456 	__be16 type = skb->protocol;
4457 	struct list_head *head = &offload_base;
4458 	int same_flow;
4459 	enum gro_result ret;
4460 	int grow;
4461 
4462 	if (!(skb->dev->features & NETIF_F_GRO))
4463 		goto normal;
4464 
4465 	if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4466 		goto normal;
4467 
4468 	gro_list_prepare(napi, skb);
4469 
4470 	rcu_read_lock();
4471 	list_for_each_entry_rcu(ptype, head, list) {
4472 		if (ptype->type != type || !ptype->callbacks.gro_receive)
4473 			continue;
4474 
4475 		skb_set_network_header(skb, skb_gro_offset(skb));
4476 		skb_reset_mac_len(skb);
4477 		NAPI_GRO_CB(skb)->same_flow = 0;
4478 		NAPI_GRO_CB(skb)->flush = 0;
4479 		NAPI_GRO_CB(skb)->free = 0;
4480 		NAPI_GRO_CB(skb)->encap_mark = 0;
4481 		NAPI_GRO_CB(skb)->is_fou = 0;
4482 		NAPI_GRO_CB(skb)->is_atomic = 1;
4483 		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4484 
4485 		/* Setup for GRO checksum validation */
4486 		switch (skb->ip_summed) {
4487 		case CHECKSUM_COMPLETE:
4488 			NAPI_GRO_CB(skb)->csum = skb->csum;
4489 			NAPI_GRO_CB(skb)->csum_valid = 1;
4490 			NAPI_GRO_CB(skb)->csum_cnt = 0;
4491 			break;
4492 		case CHECKSUM_UNNECESSARY:
4493 			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4494 			NAPI_GRO_CB(skb)->csum_valid = 0;
4495 			break;
4496 		default:
4497 			NAPI_GRO_CB(skb)->csum_cnt = 0;
4498 			NAPI_GRO_CB(skb)->csum_valid = 0;
4499 		}
4500 
4501 		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4502 		break;
4503 	}
4504 	rcu_read_unlock();
4505 
4506 	if (&ptype->list == head)
4507 		goto normal;
4508 
4509 	same_flow = NAPI_GRO_CB(skb)->same_flow;
4510 	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4511 
4512 	if (pp) {
4513 		struct sk_buff *nskb = *pp;
4514 
4515 		*pp = nskb->next;
4516 		nskb->next = NULL;
4517 		napi_gro_complete(nskb);
4518 		napi->gro_count--;
4519 	}
4520 
4521 	if (same_flow)
4522 		goto ok;
4523 
4524 	if (NAPI_GRO_CB(skb)->flush)
4525 		goto normal;
4526 
4527 	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4528 		struct sk_buff *nskb = napi->gro_list;
4529 
4530 		/* locate the end of the list to select the 'oldest' flow */
4531 		while (nskb->next) {
4532 			pp = &nskb->next;
4533 			nskb = *pp;
4534 		}
4535 		*pp = NULL;
4536 		nskb->next = NULL;
4537 		napi_gro_complete(nskb);
4538 	} else {
4539 		napi->gro_count++;
4540 	}
4541 	NAPI_GRO_CB(skb)->count = 1;
4542 	NAPI_GRO_CB(skb)->age = jiffies;
4543 	NAPI_GRO_CB(skb)->last = skb;
4544 	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4545 	skb->next = napi->gro_list;
4546 	napi->gro_list = skb;
4547 	ret = GRO_HELD;
4548 
4549 pull:
4550 	grow = skb_gro_offset(skb) - skb_headlen(skb);
4551 	if (grow > 0)
4552 		gro_pull_from_frag0(skb, grow);
4553 ok:
4554 	return ret;
4555 
4556 normal:
4557 	ret = GRO_NORMAL;
4558 	goto pull;
4559 }
4560 
4561 struct packet_offload *gro_find_receive_by_type(__be16 type)
4562 {
4563 	struct list_head *offload_head = &offload_base;
4564 	struct packet_offload *ptype;
4565 
4566 	list_for_each_entry_rcu(ptype, offload_head, list) {
4567 		if (ptype->type != type || !ptype->callbacks.gro_receive)
4568 			continue;
4569 		return ptype;
4570 	}
4571 	return NULL;
4572 }
4573 EXPORT_SYMBOL(gro_find_receive_by_type);
4574 
4575 struct packet_offload *gro_find_complete_by_type(__be16 type)
4576 {
4577 	struct list_head *offload_head = &offload_base;
4578 	struct packet_offload *ptype;
4579 
4580 	list_for_each_entry_rcu(ptype, offload_head, list) {
4581 		if (ptype->type != type || !ptype->callbacks.gro_complete)
4582 			continue;
4583 		return ptype;
4584 	}
4585 	return NULL;
4586 }
4587 EXPORT_SYMBOL(gro_find_complete_by_type);
4588 
4589 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4590 {
4591 	switch (ret) {
4592 	case GRO_NORMAL:
4593 		if (netif_receive_skb_internal(skb))
4594 			ret = GRO_DROP;
4595 		break;
4596 
4597 	case GRO_DROP:
4598 		kfree_skb(skb);
4599 		break;
4600 
4601 	case GRO_MERGED_FREE:
4602 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4603 			skb_dst_drop(skb);
4604 			kmem_cache_free(skbuff_head_cache, skb);
4605 		} else {
4606 			__kfree_skb(skb);
4607 		}
4608 		break;
4609 
4610 	case GRO_HELD:
4611 	case GRO_MERGED:
4612 		break;
4613 	}
4614 
4615 	return ret;
4616 }
4617 
4618 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4619 {
4620 	skb_mark_napi_id(skb, napi);
4621 	trace_napi_gro_receive_entry(skb);
4622 
4623 	skb_gro_reset_offset(skb);
4624 
4625 	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4626 }
4627 EXPORT_SYMBOL(napi_gro_receive);
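/* Editor's note: a minimal sketch of the usual caller, assuming a
 * hypothetical driver ring structure that embeds a napi_struct; only
 * the napi_gro_receive() call itself reflects this file.
 *
 *	static void foo_receive_one(struct foo_ring *ring, struct sk_buff *skb)
 *	{
 *		skb_record_rx_queue(skb, ring->index);
 *		skb->protocol = eth_type_trans(skb, ring->netdev);
 *		napi_gro_receive(&ring->napi, skb);
 *	}
 */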
4628 
4629 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4630 {
4631 	if (unlikely(skb->pfmemalloc)) {
4632 		consume_skb(skb);
4633 		return;
4634 	}
4635 	__skb_pull(skb, skb_headlen(skb));
4636 	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
4637 	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4638 	skb->vlan_tci = 0;
4639 	skb->dev = napi->dev;
4640 	skb->skb_iif = 0;
4641 	skb->encapsulation = 0;
4642 	skb_shinfo(skb)->gso_type = 0;
4643 	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4644 
4645 	napi->skb = skb;
4646 }
4647 
4648 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4649 {
4650 	struct sk_buff *skb = napi->skb;
4651 
4652 	if (!skb) {
4653 		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4654 		if (skb) {
4655 			napi->skb = skb;
4656 			skb_mark_napi_id(skb, napi);
4657 		}
4658 	}
4659 	return skb;
4660 }
4661 EXPORT_SYMBOL(napi_get_frags);
4662 
4663 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4664 				      struct sk_buff *skb,
4665 				      gro_result_t ret)
4666 {
4667 	switch (ret) {
4668 	case GRO_NORMAL:
4669 	case GRO_HELD:
4670 		__skb_push(skb, ETH_HLEN);
4671 		skb->protocol = eth_type_trans(skb, skb->dev);
4672 		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4673 			ret = GRO_DROP;
4674 		break;
4675 
4676 	case GRO_DROP:
4677 	case GRO_MERGED_FREE:
4678 		napi_reuse_skb(napi, skb);
4679 		break;
4680 
4681 	case GRO_MERGED:
4682 		break;
4683 	}
4684 
4685 	return ret;
4686 }
4687 
4688 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4689  * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
4690  * we copy the Ethernet header into skb->data to have a common layout.
4691  */
4692 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4693 {
4694 	struct sk_buff *skb = napi->skb;
4695 	const struct ethhdr *eth;
4696 	unsigned int hlen = sizeof(*eth);
4697 
4698 	napi->skb = NULL;
4699 
4700 	skb_reset_mac_header(skb);
4701 	skb_gro_reset_offset(skb);
4702 
4703 	eth = skb_gro_header_fast(skb, 0);
4704 	if (unlikely(skb_gro_header_hard(skb, hlen))) {
4705 		eth = skb_gro_header_slow(skb, hlen, 0);
4706 		if (unlikely(!eth)) {
4707 			net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4708 					     __func__, napi->dev->name);
4709 			napi_reuse_skb(napi, skb);
4710 			return NULL;
4711 		}
4712 	} else {
4713 		gro_pull_from_frag0(skb, hlen);
4714 		NAPI_GRO_CB(skb)->frag0 += hlen;
4715 		NAPI_GRO_CB(skb)->frag0_len -= hlen;
4716 	}
4717 	__skb_pull(skb, hlen);
4718 
4719 	/*
4720 	 * This works because the only protocols we care about don't require
4721 	 * special handling.
4722 	 * We'll fix it up properly in napi_frags_finish()
4723 	 */
4724 	skb->protocol = eth->h_proto;
4725 
4726 	return skb;
4727 }
4728 
4729 gro_result_t napi_gro_frags(struct napi_struct *napi)
4730 {
4731 	struct sk_buff *skb = napi_frags_skb(napi);
4732 
4733 	if (!skb)
4734 		return GRO_DROP;
4735 
4736 	trace_napi_gro_frags_entry(skb);
4737 
4738 	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4739 }
4740 EXPORT_SYMBOL(napi_gro_frags);
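/* Editor's note: a hedged sketch of the frag-based GRO path some
 * page-flipping drivers use; the page/offset/len handling here is
 * illustrative, only the napi_get_frags()/napi_gro_frags() pairing
 * comes from this file.
 *
 *	struct sk_buff *skb = napi_get_frags(napi);
 *
 *	if (unlikely(!skb))
 *		return;
 *	skb_fill_page_desc(skb, 0, page, offset, len);
 *	skb->len += len;
 *	skb->data_len += len;
 *	skb->truesize += PAGE_SIZE;
 *	napi_gro_frags(napi);
 */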
4741 
4742 /* Compute the checksum from gro_offset and return the folded value
4743  * after adding in any pseudo checksum.
4744  */
4745 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4746 {
4747 	__wsum wsum;
4748 	__sum16 sum;
4749 
4750 	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4751 
4752 	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4753 	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4754 	if (likely(!sum)) {
4755 		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4756 		    !skb->csum_complete_sw)
4757 			netdev_rx_csum_fault(skb->dev);
4758 	}
4759 
4760 	NAPI_GRO_CB(skb)->csum = wsum;
4761 	NAPI_GRO_CB(skb)->csum_valid = 1;
4762 
4763 	return sum;
4764 }
4765 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4766 
4767 /*
4768  * net_rps_action_and_irq_enable sends any pending IPIs for RPS.
4769  * Note: called with local irq disabled, but exits with local irq enabled.
4770  */
4771 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4772 {
4773 #ifdef CONFIG_RPS
4774 	struct softnet_data *remsd = sd->rps_ipi_list;
4775 
4776 	if (remsd) {
4777 		sd->rps_ipi_list = NULL;
4778 
4779 		local_irq_enable();
4780 
4781 		/* Send pending IPIs to kick RPS processing on remote CPUs. */
4782 		while (remsd) {
4783 			struct softnet_data *next = remsd->rps_ipi_next;
4784 
4785 			if (cpu_online(remsd->cpu))
4786 				smp_call_function_single_async(remsd->cpu,
4787 							   &remsd->csd);
4788 			remsd = next;
4789 		}
4790 	} else
4791 #endif
4792 		local_irq_enable();
4793 }
4794 
4795 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4796 {
4797 #ifdef CONFIG_RPS
4798 	return sd->rps_ipi_list != NULL;
4799 #else
4800 	return false;
4801 #endif
4802 }
4803 
4804 static int process_backlog(struct napi_struct *napi, int quota)
4805 {
4806 	int work = 0;
4807 	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4808 
4809 	/* Check if we have pending IPIs; it's better to send them now
4810 	 * rather than waiting for net_rx_action() to end.
4811 	 */
4812 	if (sd_has_rps_ipi_waiting(sd)) {
4813 		local_irq_disable();
4814 		net_rps_action_and_irq_enable(sd);
4815 	}
4816 
4817 	napi->weight = weight_p;
4818 	local_irq_disable();
4819 	while (1) {
4820 		struct sk_buff *skb;
4821 
4822 		while ((skb = __skb_dequeue(&sd->process_queue))) {
4823 			rcu_read_lock();
4824 			local_irq_enable();
4825 			__netif_receive_skb(skb);
4826 			rcu_read_unlock();
4827 			local_irq_disable();
4828 			input_queue_head_incr(sd);
4829 			if (++work >= quota) {
4830 				local_irq_enable();
4831 				return work;
4832 			}
4833 		}
4834 
4835 		rps_lock(sd);
4836 		if (skb_queue_empty(&sd->input_pkt_queue)) {
4837 			/*
4838 			 * Inline a custom version of __napi_complete().
4839 			 * Only the current CPU owns and manipulates this napi,
4840 			 * and NAPI_STATE_SCHED is the only possible flag set
4841 			 * on backlog.
4842 			 * We can use a plain write instead of clear_bit(),
4843 			 * and we don't need an smp_mb() memory barrier.
4844 			 */
4845 			napi->state = 0;
4846 			rps_unlock(sd);
4847 
4848 			break;
4849 		}
4850 
4851 		skb_queue_splice_tail_init(&sd->input_pkt_queue,
4852 					   &sd->process_queue);
4853 		rps_unlock(sd);
4854 	}
4855 	local_irq_enable();
4856 
4857 	return work;
4858 }
4859 
4860 /**
4861  * __napi_schedule - schedule for receive
4862  * @n: entry to schedule
4863  *
4864  * The entry's receive function will be scheduled to run.
4865  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4866  */
4867 void __napi_schedule(struct napi_struct *n)
4868 {
4869 	unsigned long flags;
4870 
4871 	local_irq_save(flags);
4872 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4873 	local_irq_restore(flags);
4874 }
4875 EXPORT_SYMBOL(__napi_schedule);
4876 
4877 /**
4878  * __napi_schedule_irqoff - schedule for receive
4879  * @n: entry to schedule
4880  *
4881  * Variant of __napi_schedule() assuming hard irqs are masked
4882  */
4883 void __napi_schedule_irqoff(struct napi_struct *n)
4884 {
4885 	____napi_schedule(this_cpu_ptr(&softnet_data), n);
4886 }
4887 EXPORT_SYMBOL(__napi_schedule_irqoff);
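/* Editor's note: a minimal sketch of the hard-irq pattern these two
 * helpers serve, with hypothetical foo_* names; the device masks its
 * own RX interrupt and lets the poll routine re-enable it later.
 *
 *	static irqreturn_t foo_rx_interrupt(int irq, void *data)
 *	{
 *		struct foo_ring *ring = data;
 *
 *		foo_disable_rx_irq(ring);
 *		if (napi_schedule_prep(&ring->napi))
 *			__napi_schedule_irqoff(&ring->napi);
 *		return IRQ_HANDLED;
 *	}
 */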
4888 
4889 void __napi_complete(struct napi_struct *n)
4890 {
4891 	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4892 
4893 	list_del_init(&n->poll_list);
4894 	smp_mb__before_atomic();
4895 	clear_bit(NAPI_STATE_SCHED, &n->state);
4896 }
4897 EXPORT_SYMBOL(__napi_complete);
4898 
4899 void napi_complete_done(struct napi_struct *n, int work_done)
4900 {
4901 	unsigned long flags;
4902 
4903 	/*
4904 	 * Don't let napi dequeue from the CPU poll list,
4905 	 * just in case it's running on a different CPU.
4906 	 */
4907 	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4908 		return;
4909 
4910 	if (n->gro_list) {
4911 		unsigned long timeout = 0;
4912 
4913 		if (work_done)
4914 			timeout = n->dev->gro_flush_timeout;
4915 
4916 		if (timeout)
4917 			hrtimer_start(&n->timer, ns_to_ktime(timeout),
4918 				      HRTIMER_MODE_REL_PINNED);
4919 		else
4920 			napi_gro_flush(n, false);
4921 	}
4922 	if (likely(list_empty(&n->poll_list))) {
4923 		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4924 	} else {
4925 		/* If n->poll_list is not empty, we need to mask irqs */
4926 		local_irq_save(flags);
4927 		__napi_complete(n);
4928 		local_irq_restore(flags);
4929 	}
4930 }
4931 EXPORT_SYMBOL(napi_complete_done);
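/* Editor's note: a hedged sketch of how a poll routine finishes; the
 * foo_* helpers are hypothetical, but the work < budget test before
 * napi_complete_done() is the standard contract.  Passing the real
 * work_done value lets the gro_flush_timeout logic above defer the flush.
 *
 *	static int foo_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct foo_ring *ring = container_of(napi, struct foo_ring, napi);
 *		int work = foo_clean_rx_ring(ring, budget);
 *
 *		if (work < budget) {
 *			napi_complete_done(napi, work);
 *			foo_enable_rx_irq(ring);
 *		}
 *		return work;
 *	}
 */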
4932 
4933 /* must be called under rcu_read_lock(), as we don't take a reference */
4934 static struct napi_struct *napi_by_id(unsigned int napi_id)
4935 {
4936 	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4937 	struct napi_struct *napi;
4938 
4939 	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4940 		if (napi->napi_id == napi_id)
4941 			return napi;
4942 
4943 	return NULL;
4944 }
4945 
4946 #if defined(CONFIG_NET_RX_BUSY_POLL)
4947 #define BUSY_POLL_BUDGET 8
4948 bool sk_busy_loop(struct sock *sk, int nonblock)
4949 {
4950 	unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
4951 	int (*busy_poll)(struct napi_struct *dev);
4952 	struct napi_struct *napi;
4953 	int rc = false;
4954 
4955 	rcu_read_lock();
4956 
4957 	napi = napi_by_id(sk->sk_napi_id);
4958 	if (!napi)
4959 		goto out;
4960 
4961 	/* Note: ndo_busy_poll method is optional in linux-4.5 */
4962 	busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
4963 
4964 	do {
4965 		rc = 0;
4966 		local_bh_disable();
4967 		if (busy_poll) {
4968 			rc = busy_poll(napi);
4969 		} else if (napi_schedule_prep(napi)) {
4970 			void *have = netpoll_poll_lock(napi);
4971 
4972 			if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
4973 				rc = napi->poll(napi, BUSY_POLL_BUDGET);
4974 				trace_napi_poll(napi);
4975 				if (rc == BUSY_POLL_BUDGET) {
4976 					napi_complete_done(napi, rc);
4977 					napi_schedule(napi);
4978 				}
4979 			}
4980 			netpoll_poll_unlock(have);
4981 		}
4982 		if (rc > 0)
4983 			__NET_ADD_STATS(sock_net(sk),
4984 					LINUX_MIB_BUSYPOLLRXPACKETS, rc);
4985 		local_bh_enable();
4986 
4987 		if (rc == LL_FLUSH_FAILED)
4988 			break; /* permanent failure */
4989 
4990 		cpu_relax();
4991 	} while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
4992 		 !need_resched() && !busy_loop_timeout(end_time));
4993 
4994 	rc = !skb_queue_empty(&sk->sk_receive_queue);
4995 out:
4996 	rcu_read_unlock();
4997 	return rc;
4998 }
4999 EXPORT_SYMBOL(sk_busy_loop);
5000 
5001 #endif /* CONFIG_NET_RX_BUSY_POLL */
5002 
5003 void napi_hash_add(struct napi_struct *napi)
5004 {
5005 	if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
5006 	    test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
5007 		return;
5008 
5009 	spin_lock(&napi_hash_lock);
5010 
5011 	/* 0..NR_CPUS+1 range is reserved for sender_cpu use */
5012 	do {
5013 		if (unlikely(++napi_gen_id < NR_CPUS + 1))
5014 			napi_gen_id = NR_CPUS + 1;
5015 	} while (napi_by_id(napi_gen_id));
5016 	napi->napi_id = napi_gen_id;
5017 
5018 	hlist_add_head_rcu(&napi->napi_hash_node,
5019 			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
5020 
5021 	spin_unlock(&napi_hash_lock);
5022 }
5023 EXPORT_SYMBOL_GPL(napi_hash_add);
5024 
5025 /* Warning: the caller is responsible for making sure an RCU grace period
5026  * is respected before freeing the memory containing @napi.
5027  */
5028 bool napi_hash_del(struct napi_struct *napi)
5029 {
5030 	bool rcu_sync_needed = false;
5031 
5032 	spin_lock(&napi_hash_lock);
5033 
5034 	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
5035 		rcu_sync_needed = true;
5036 		hlist_del_rcu(&napi->napi_hash_node);
5037 	}
5038 	spin_unlock(&napi_hash_lock);
5039 	return rcu_sync_needed;
5040 }
5041 EXPORT_SYMBOL_GPL(napi_hash_del);
5042 
5043 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
5044 {
5045 	struct napi_struct *napi;
5046 
5047 	napi = container_of(timer, struct napi_struct, timer);
5048 	if (napi->gro_list)
5049 		napi_schedule(napi);
5050 
5051 	return HRTIMER_NORESTART;
5052 }
5053 
5054 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
5055 		    int (*poll)(struct napi_struct *, int), int weight)
5056 {
5057 	INIT_LIST_HEAD(&napi->poll_list);
5058 	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
5059 	napi->timer.function = napi_watchdog;
5060 	napi->gro_count = 0;
5061 	napi->gro_list = NULL;
5062 	napi->skb = NULL;
5063 	napi->poll = poll;
5064 	if (weight > NAPI_POLL_WEIGHT)
5065 		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
5066 			    weight, dev->name);
5067 	napi->weight = weight;
5068 	list_add(&napi->dev_list, &dev->napi_list);
5069 	napi->dev = dev;
5070 #ifdef CONFIG_NETPOLL
5071 	spin_lock_init(&napi->poll_lock);
5072 	napi->poll_owner = -1;
5073 #endif
5074 	set_bit(NAPI_STATE_SCHED, &napi->state);
5075 	napi_hash_add(napi);
5076 }
5077 EXPORT_SYMBOL(netif_napi_add);
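/* Editor's note: a minimal probe-time sketch with hypothetical foo_*
 * names; NAPI_POLL_WEIGHT (64) is the conventional weight.
 *
 *	netif_napi_add(netdev, &ring->napi, foo_poll, NAPI_POLL_WEIGHT);
 *
 * paired with napi_enable()/napi_disable() around ndo_open/ndo_stop and
 * netif_napi_del() on removal.
 */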
5078 
5079 void napi_disable(struct napi_struct *n)
5080 {
5081 	might_sleep();
5082 	set_bit(NAPI_STATE_DISABLE, &n->state);
5083 
5084 	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
5085 		msleep(1);
5086 	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
5087 		msleep(1);
5088 
5089 	hrtimer_cancel(&n->timer);
5090 
5091 	clear_bit(NAPI_STATE_DISABLE, &n->state);
5092 }
5093 EXPORT_SYMBOL(napi_disable);
5094 
5095 /* Must be called in process context */
5096 void netif_napi_del(struct napi_struct *napi)
5097 {
5098 	might_sleep();
5099 	if (napi_hash_del(napi))
5100 		synchronize_net();
5101 	list_del_init(&napi->dev_list);
5102 	napi_free_frags(napi);
5103 
5104 	kfree_skb_list(napi->gro_list);
5105 	napi->gro_list = NULL;
5106 	napi->gro_count = 0;
5107 }
5108 EXPORT_SYMBOL(netif_napi_del);
5109 
5110 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5111 {
5112 	void *have;
5113 	int work, weight;
5114 
5115 	list_del_init(&n->poll_list);
5116 
5117 	have = netpoll_poll_lock(n);
5118 
5119 	weight = n->weight;
5120 
5121 	/* This NAPI_STATE_SCHED test is for avoiding a race
5122 	 * with netpoll's poll_napi().  Only the entity which
5123 	 * obtains the lock and sees NAPI_STATE_SCHED set will
5124 	 * actually make the ->poll() call.  Therefore we avoid
5125 	 * accidentally calling ->poll() when NAPI is not scheduled.
5126 	 */
5127 	work = 0;
5128 	if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5129 		work = n->poll(n, weight);
5130 		trace_napi_poll(n);
5131 	}
5132 
5133 	WARN_ON_ONCE(work > weight);
5134 
5135 	if (likely(work < weight))
5136 		goto out_unlock;
5137 
5138 	/* Drivers must not modify the NAPI state if they
5139 	 * consume the entire weight.  In such cases this code
5140 	 * still "owns" the NAPI instance and therefore can
5141 	 * move the instance around on the list at-will.
5142 	 */
5143 	if (unlikely(napi_disable_pending(n))) {
5144 		napi_complete(n);
5145 		goto out_unlock;
5146 	}
5147 
5148 	if (n->gro_list) {
5149 		/* Flush packets that are too old.
5150 		 * If HZ < 1000, flush all packets.
5151 		 */
5152 		napi_gro_flush(n, HZ >= 1000);
5153 	}
5154 
5155 	/* Some drivers may have called napi_schedule
5156 	 * prior to exhausting their budget.
5157 	 */
5158 	if (unlikely(!list_empty(&n->poll_list))) {
5159 		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5160 			     n->dev ? n->dev->name : "backlog");
5161 		goto out_unlock;
5162 	}
5163 
5164 	list_add_tail(&n->poll_list, repoll);
5165 
5166 out_unlock:
5167 	netpoll_poll_unlock(have);
5168 
5169 	return work;
5170 }
5171 
5172 static void net_rx_action(struct softirq_action *h)
5173 {
5174 	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5175 	unsigned long time_limit = jiffies + 2;
5176 	int budget = netdev_budget;
5177 	LIST_HEAD(list);
5178 	LIST_HEAD(repoll);
5179 
5180 	local_irq_disable();
5181 	list_splice_init(&sd->poll_list, &list);
5182 	local_irq_enable();
5183 
5184 	for (;;) {
5185 		struct napi_struct *n;
5186 
5187 		if (list_empty(&list)) {
5188 			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5189 				return;
5190 			break;
5191 		}
5192 
5193 		n = list_first_entry(&list, struct napi_struct, poll_list);
5194 		budget -= napi_poll(n, &repoll);
5195 
5196 		/* If softirq window is exhausted then punt.
5197 		 * Allow this to run for 2 jiffies, which will allow
5198 		 * an average latency of 1.5/HZ.
5199 		 */
5200 		if (unlikely(budget <= 0 ||
5201 			     time_after_eq(jiffies, time_limit))) {
5202 			sd->time_squeeze++;
5203 			break;
5204 		}
5205 	}
5206 
5207 	__kfree_skb_flush();
5208 	local_irq_disable();
5209 
5210 	list_splice_tail_init(&sd->poll_list, &list);
5211 	list_splice_tail(&repoll, &list);
5212 	list_splice(&list, &sd->poll_list);
5213 	if (!list_empty(&sd->poll_list))
5214 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
5215 
5216 	net_rps_action_and_irq_enable(sd);
5217 }
5218 
5219 struct netdev_adjacent {
5220 	struct net_device *dev;
5221 
5222 	/* upper master flag, there can only be one master device per list */
5223 	bool master;
5224 
5225 	/* counter for the number of times this device was added to us */
5226 	u16 ref_nr;
5227 
5228 	/* private field for the users */
5229 	void *private;
5230 
5231 	struct list_head list;
5232 	struct rcu_head rcu;
5233 };
5234 
5235 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5236 						 struct list_head *adj_list)
5237 {
5238 	struct netdev_adjacent *adj;
5239 
5240 	list_for_each_entry(adj, adj_list, list) {
5241 		if (adj->dev == adj_dev)
5242 			return adj;
5243 	}
5244 	return NULL;
5245 }
5246 
5247 /**
5248  * netdev_has_upper_dev - Check if device is linked to an upper device
5249  * @dev: device
5250  * @upper_dev: upper device to check
5251  *
5252  * Find out if a device is linked to the specified upper device and return true
5253  * in case it is. Note that this checks only the immediate upper device,
5254  * not through a complete stack of devices. The caller must hold the RTNL lock.
5255  */
5256 bool netdev_has_upper_dev(struct net_device *dev,
5257 			  struct net_device *upper_dev)
5258 {
5259 	ASSERT_RTNL();
5260 
5261 	return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5262 }
5263 EXPORT_SYMBOL(netdev_has_upper_dev);
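/* Editor's note: a small sketch of a typical check, e.g. a driver's
 * NETDEV_PRECHANGEUPPER handler refusing a second enslavement; foo_dev
 * and bond_dev are placeholders.
 *
 *	ASSERT_RTNL();
 *	if (netdev_has_upper_dev(foo_dev, bond_dev))
 *		return -EBUSY;
 */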
5264 
5265 /**
5266  * netdev_has_any_upper_dev - Check if device is linked to some device
5267  * @dev: device
5268  *
5269  * Find out if a device is linked to an upper device and return true in case
5270  * it is. The caller must hold the RTNL lock.
5271  */
5272 static bool netdev_has_any_upper_dev(struct net_device *dev)
5273 {
5274 	ASSERT_RTNL();
5275 
5276 	return !list_empty(&dev->all_adj_list.upper);
5277 }
5278 
5279 /**
5280  * netdev_master_upper_dev_get - Get master upper device
5281  * @dev: device
5282  *
5283  * Find a master upper device and return pointer to it or NULL in case
5284  * it's not there. The caller must hold the RTNL lock.
5285  */
5286 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5287 {
5288 	struct netdev_adjacent *upper;
5289 
5290 	ASSERT_RTNL();
5291 
5292 	if (list_empty(&dev->adj_list.upper))
5293 		return NULL;
5294 
5295 	upper = list_first_entry(&dev->adj_list.upper,
5296 				 struct netdev_adjacent, list);
5297 	if (likely(upper->master))
5298 		return upper->dev;
5299 	return NULL;
5300 }
5301 EXPORT_SYMBOL(netdev_master_upper_dev_get);
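/* Editor's note: a small sketch, assuming a hypothetical offload path
 * that wants a port's master while holding RTNL; foo_* is invented.
 *
 *	struct net_device *master;
 *
 *	ASSERT_RTNL();
 *	master = netdev_master_upper_dev_get(port_dev);
 *	if (master && netif_is_bond_master(master))
 *		foo_configure_lag(port_dev, master);
 */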
5302 
5303 void *netdev_adjacent_get_private(struct list_head *adj_list)
5304 {
5305 	struct netdev_adjacent *adj;
5306 
5307 	adj = list_entry(adj_list, struct netdev_adjacent, list);
5308 
5309 	return adj->private;
5310 }
5311 EXPORT_SYMBOL(netdev_adjacent_get_private);
5312 
5313 /**
5314  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5315  * @dev: device
5316  * @iter: list_head ** of the current position
5317  *
5318  * Gets the next device from the dev's upper list, starting from iter
5319  * position. The caller must hold RCU read lock.
5320  */
5321 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5322 						 struct list_head **iter)
5323 {
5324 	struct netdev_adjacent *upper;
5325 
5326 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5327 
5328 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5329 
5330 	if (&upper->list == &dev->adj_list.upper)
5331 		return NULL;
5332 
5333 	*iter = &upper->list;
5334 
5335 	return upper->dev;
5336 }
5337 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5338 
5339 /**
5340  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5341  * @dev: device
5342  * @iter: list_head ** of the current position
5343  *
5344  * Gets the next device from the dev's upper list, starting from iter
5345  * position. The caller must hold RCU read lock.
5346  */
5347 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5348 						     struct list_head **iter)
5349 {
5350 	struct netdev_adjacent *upper;
5351 
5352 	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5353 
5354 	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5355 
5356 	if (&upper->list == &dev->all_adj_list.upper)
5357 		return NULL;
5358 
5359 	*iter = &upper->list;
5360 
5361 	return upper->dev;
5362 }
5363 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5364 
5365 /**
5366  * netdev_lower_get_next_private - Get the next ->private from the
5367  *				   lower neighbour list
5368  * @dev: device
5369  * @iter: list_head ** of the current position
5370  *
5371  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5372  * list, starting from iter position. The caller must either hold the
5373  * RTNL lock or its own locking that guarantees that the neighbour lower
5374  * list will remain unchanged.
5375  */
5376 void *netdev_lower_get_next_private(struct net_device *dev,
5377 				    struct list_head **iter)
5378 {
5379 	struct netdev_adjacent *lower;
5380 
5381 	lower = list_entry(*iter, struct netdev_adjacent, list);
5382 
5383 	if (&lower->list == &dev->adj_list.lower)
5384 		return NULL;
5385 
5386 	*iter = lower->list.next;
5387 
5388 	return lower->private;
5389 }
5390 EXPORT_SYMBOL(netdev_lower_get_next_private);
5391 
5392 /**
5393  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5394  *				       lower neighbour list, RCU
5395  *				       variant
5396  * @dev: device
5397  * @iter: list_head ** of the current position
5398  *
5399  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5400  * list, starting from iter position. The caller must hold RCU read lock.
5401  */
5402 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5403 					struct list_head **iter)
5404 {
5405 	struct netdev_adjacent *lower;
5406 
5407 	WARN_ON_ONCE(!rcu_read_lock_held());
5408 
5409 	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5410 
5411 	if (&lower->list == &dev->adj_list.lower)
5412 		return NULL;
5413 
5414 	*iter = &lower->list;
5415 
5416 	return lower->private;
5417 }
5418 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5419 
5420 /**
5421  * netdev_lower_get_next - Get the next device from the lower neighbour
5422  *                         list
5423  * @dev: device
5424  * @iter: list_head ** of the current position
5425  *
5426  * Gets the next netdev_adjacent from the dev's lower neighbour
5427  * list, starting from iter position. The caller must hold RTNL lock or
5428  * its own locking that guarantees that the neighbour lower
5429  * list will remain unchanged.
5430  */
5431 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5432 {
5433 	struct netdev_adjacent *lower;
5434 
5435 	lower = list_entry(*iter, struct netdev_adjacent, list);
5436 
5437 	if (&lower->list == &dev->adj_list.lower)
5438 		return NULL;
5439 
5440 	*iter = lower->list.next;
5441 
5442 	return lower->dev;
5443 }
5444 EXPORT_SYMBOL(netdev_lower_get_next);
5445 
5446 /**
5447  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5448  *				       lower neighbour list, RCU
5449  *				       variant
5450  * @dev: device
5451  *
5452  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5453  * list. The caller must hold RCU read lock.
5454  */
5455 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5456 {
5457 	struct netdev_adjacent *lower;
5458 
5459 	lower = list_first_or_null_rcu(&dev->adj_list.lower,
5460 			struct netdev_adjacent, list);
5461 	if (lower)
5462 		return lower->private;
5463 	return NULL;
5464 }
5465 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5466 
5467 /**
5468  * netdev_master_upper_dev_get_rcu - Get master upper device
5469  * @dev: device
5470  *
5471  * Find a master upper device and return pointer to it or NULL in case
5472  * it's not there. The caller must hold the RCU read lock.
5473  */
5474 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5475 {
5476 	struct netdev_adjacent *upper;
5477 
5478 	upper = list_first_or_null_rcu(&dev->adj_list.upper,
5479 				       struct netdev_adjacent, list);
5480 	if (upper && likely(upper->master))
5481 		return upper->dev;
5482 	return NULL;
5483 }
5484 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5485 
5486 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5487 			      struct net_device *adj_dev,
5488 			      struct list_head *dev_list)
5489 {
5490 	char linkname[IFNAMSIZ+7];
5491 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5492 		"upper_%s" : "lower_%s", adj_dev->name);
5493 	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5494 				 linkname);
5495 }
5496 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5497 			       char *name,
5498 			       struct list_head *dev_list)
5499 {
5500 	char linkname[IFNAMSIZ+7];
5501 	sprintf(linkname, dev_list == &dev->adj_list.upper ?
5502 		"upper_%s" : "lower_%s", name);
5503 	sysfs_remove_link(&(dev->dev.kobj), linkname);
5504 }
5505 
5506 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5507 						 struct net_device *adj_dev,
5508 						 struct list_head *dev_list)
5509 {
5510 	return (dev_list == &dev->adj_list.upper ||
5511 		dev_list == &dev->adj_list.lower) &&
5512 		net_eq(dev_net(dev), dev_net(adj_dev));
5513 }
5514 
5515 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5516 					struct net_device *adj_dev,
5517 					struct list_head *dev_list,
5518 					void *private, bool master)
5519 {
5520 	struct netdev_adjacent *adj;
5521 	int ret;
5522 
5523 	adj = __netdev_find_adj(adj_dev, dev_list);
5524 
5525 	if (adj) {
5526 		adj->ref_nr++;
5527 		return 0;
5528 	}
5529 
5530 	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5531 	if (!adj)
5532 		return -ENOMEM;
5533 
5534 	adj->dev = adj_dev;
5535 	adj->master = master;
5536 	adj->ref_nr = 1;
5537 	adj->private = private;
5538 	dev_hold(adj_dev);
5539 
5540 	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5541 		 adj_dev->name, dev->name, adj_dev->name);
5542 
5543 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5544 		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5545 		if (ret)
5546 			goto free_adj;
5547 	}
5548 
5549 	/* Ensure that master link is always the first item in list. */
5550 	if (master) {
5551 		ret = sysfs_create_link(&(dev->dev.kobj),
5552 					&(adj_dev->dev.kobj), "master");
5553 		if (ret)
5554 			goto remove_symlinks;
5555 
5556 		list_add_rcu(&adj->list, dev_list);
5557 	} else {
5558 		list_add_tail_rcu(&adj->list, dev_list);
5559 	}
5560 
5561 	return 0;
5562 
5563 remove_symlinks:
5564 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5565 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5566 free_adj:
5567 	kfree(adj);
5568 	dev_put(adj_dev);
5569 
5570 	return ret;
5571 }
5572 
5573 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5574 					 struct net_device *adj_dev,
5575 					 struct list_head *dev_list)
5576 {
5577 	struct netdev_adjacent *adj;
5578 
5579 	adj = __netdev_find_adj(adj_dev, dev_list);
5580 
5581 	if (!adj) {
5582 		pr_err("tried to remove device %s from %s\n",
5583 		       dev->name, adj_dev->name);
5584 		BUG();
5585 	}
5586 
5587 	if (adj->ref_nr > 1) {
5588 		pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
5589 			 adj->ref_nr-1);
5590 		adj->ref_nr--;
5591 		return;
5592 	}
5593 
5594 	if (adj->master)
5595 		sysfs_remove_link(&(dev->dev.kobj), "master");
5596 
5597 	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5598 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5599 
5600 	list_del_rcu(&adj->list);
5601 	pr_debug("dev_put for %s, because link removed from %s to %s\n",
5602 		 adj_dev->name, dev->name, adj_dev->name);
5603 	dev_put(adj_dev);
5604 	kfree_rcu(adj, rcu);
5605 }
5606 
5607 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5608 					    struct net_device *upper_dev,
5609 					    struct list_head *up_list,
5610 					    struct list_head *down_list,
5611 					    void *private, bool master)
5612 {
5613 	int ret;
5614 
5615 	ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
5616 					   master);
5617 	if (ret)
5618 		return ret;
5619 
5620 	ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
5621 					   false);
5622 	if (ret) {
5623 		__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5624 		return ret;
5625 	}
5626 
5627 	return 0;
5628 }
5629 
5630 static int __netdev_adjacent_dev_link(struct net_device *dev,
5631 				      struct net_device *upper_dev)
5632 {
5633 	return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5634 						&dev->all_adj_list.upper,
5635 						&upper_dev->all_adj_list.lower,
5636 						NULL, false);
5637 }
5638 
5639 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5640 					       struct net_device *upper_dev,
5641 					       struct list_head *up_list,
5642 					       struct list_head *down_list)
5643 {
5644 	__netdev_adjacent_dev_remove(dev, upper_dev, up_list);
5645 	__netdev_adjacent_dev_remove(upper_dev, dev, down_list);
5646 }
5647 
5648 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5649 					 struct net_device *upper_dev)
5650 {
5651 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5652 					   &dev->all_adj_list.upper,
5653 					   &upper_dev->all_adj_list.lower);
5654 }
5655 
5656 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5657 						struct net_device *upper_dev,
5658 						void *private, bool master)
5659 {
5660 	int ret = __netdev_adjacent_dev_link(dev, upper_dev);
5661 
5662 	if (ret)
5663 		return ret;
5664 
5665 	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
5666 					       &dev->adj_list.upper,
5667 					       &upper_dev->adj_list.lower,
5668 					       private, master);
5669 	if (ret) {
5670 		__netdev_adjacent_dev_unlink(dev, upper_dev);
5671 		return ret;
5672 	}
5673 
5674 	return 0;
5675 }
5676 
5677 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5678 						   struct net_device *upper_dev)
5679 {
5680 	__netdev_adjacent_dev_unlink(dev, upper_dev);
5681 	__netdev_adjacent_dev_unlink_lists(dev, upper_dev,
5682 					   &dev->adj_list.upper,
5683 					   &upper_dev->adj_list.lower);
5684 }
5685 
5686 static int __netdev_upper_dev_link(struct net_device *dev,
5687 				   struct net_device *upper_dev, bool master,
5688 				   void *upper_priv, void *upper_info)
5689 {
5690 	struct netdev_notifier_changeupper_info changeupper_info;
5691 	struct netdev_adjacent *i, *j, *to_i, *to_j;
5692 	int ret = 0;
5693 
5694 	ASSERT_RTNL();
5695 
5696 	if (dev == upper_dev)
5697 		return -EBUSY;
5698 
5699 	/* To prevent loops, check if dev is not upper device to upper_dev. */
5700 	if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
5701 		return -EBUSY;
5702 
5703 	if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
5704 		return -EEXIST;
5705 
5706 	if (master && netdev_master_upper_dev_get(dev))
5707 		return -EBUSY;
5708 
5709 	changeupper_info.upper_dev = upper_dev;
5710 	changeupper_info.master = master;
5711 	changeupper_info.linking = true;
5712 	changeupper_info.upper_info = upper_info;
5713 
5714 	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5715 					    &changeupper_info.info);
5716 	ret = notifier_to_errno(ret);
5717 	if (ret)
5718 		return ret;
5719 
5720 	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
5721 						   master);
5722 	if (ret)
5723 		return ret;
5724 
5725 	/* Now that we have linked these devs, make all the upper_dev's
5726 	 * all_adj_list.upper visible to every dev's all_adj_list.lower and
5727 	 * vice versa, and don't forget the devices themselves. All of these
5728 	 * links are non-neighbours.
5729 	 */
5730 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5731 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5732 			pr_debug("Interlinking %s with %s, non-neighbour\n",
5733 				 i->dev->name, j->dev->name);
5734 			ret = __netdev_adjacent_dev_link(i->dev, j->dev);
5735 			if (ret)
5736 				goto rollback_mesh;
5737 		}
5738 	}
5739 
5740 	/* add dev to every upper_dev's upper device */
5741 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5742 		pr_debug("linking %s's upper device %s with %s\n",
5743 			 upper_dev->name, i->dev->name, dev->name);
5744 		ret = __netdev_adjacent_dev_link(dev, i->dev);
5745 		if (ret)
5746 			goto rollback_upper_mesh;
5747 	}
5748 
5749 	/* add upper_dev to every dev's lower device */
5750 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5751 		pr_debug("linking %s's lower device %s with %s\n", dev->name,
5752 			 i->dev->name, upper_dev->name);
5753 		ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
5754 		if (ret)
5755 			goto rollback_lower_mesh;
5756 	}
5757 
5758 	ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5759 					    &changeupper_info.info);
5760 	ret = notifier_to_errno(ret);
5761 	if (ret)
5762 		goto rollback_lower_mesh;
5763 
5764 	return 0;
5765 
5766 rollback_lower_mesh:
5767 	to_i = i;
5768 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5769 		if (i == to_i)
5770 			break;
5771 		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5772 	}
5773 
5774 	i = NULL;
5775 
5776 rollback_upper_mesh:
5777 	to_i = i;
5778 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5779 		if (i == to_i)
5780 			break;
5781 		__netdev_adjacent_dev_unlink(dev, i->dev);
5782 	}
5783 
5784 	i = j = NULL;
5785 
5786 rollback_mesh:
5787 	to_i = i;
5788 	to_j = j;
5789 	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5790 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5791 			if (i == to_i && j == to_j)
5792 				break;
5793 			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5794 		}
5795 		if (i == to_i)
5796 			break;
5797 	}
5798 
5799 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5800 
5801 	return ret;
5802 }
5803 
5804 /**
5805  * netdev_upper_dev_link - Add a link to the upper device
5806  * @dev: device
5807  * @upper_dev: new upper device
5808  *
5809  * Adds a link to device which is upper to this one. The caller must hold
5810  * the RTNL lock. On a failure a negative errno code is returned.
5811  * On success the reference counts are adjusted and the function
5812  * returns zero.
5813  */
5814 int netdev_upper_dev_link(struct net_device *dev,
5815 			  struct net_device *upper_dev)
5816 {
5817 	return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5818 }
5819 EXPORT_SYMBOL(netdev_upper_dev_link);
5820 
5821 /**
5822  * netdev_master_upper_dev_link - Add a master link to the upper device
5823  * @dev: device
5824  * @upper_dev: new upper device
5825  * @upper_priv: upper device private
5826  * @upper_info: upper info to be passed down via notifier
5827  *
5828  * Adds a link to device which is upper to this one. In this case, only
5829  * one master upper device can be linked, although other non-master devices
5830  * might be linked as well. The caller must hold the RTNL lock.
5831  * On a failure a negative errno code is returned. On success the reference
5832  * counts are adjusted and the function returns zero.
5833  */
5834 int netdev_master_upper_dev_link(struct net_device *dev,
5835 				 struct net_device *upper_dev,
5836 				 void *upper_priv, void *upper_info)
5837 {
5838 	return __netdev_upper_dev_link(dev, upper_dev, true,
5839 				       upper_priv, upper_info);
5840 }
5841 EXPORT_SYMBOL(netdev_master_upper_dev_link);
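
/* Illustrative sketch (not part of the original source): a bonding- or
 * bridge-style driver would typically pair these helpers as below, under
 * RTNL.  The function names and the assumption that @master and @slave
 * come from the caller are hypothetical.
 */
static int example_enslave(struct net_device *master, struct net_device *slave)
{
	int err;

	ASSERT_RTNL();

	/* Only one master upper device may be linked; a second attempt
	 * fails with -EBUSY, a duplicate link with -EEXIST.
	 */
	err = netdev_master_upper_dev_link(slave, master, NULL, NULL);
	if (err)
		return err;

	/* ... driver specific slave initialization would go here ... */
	return 0;
}

static void example_release(struct net_device *master, struct net_device *slave)
{
	ASSERT_RTNL();
	netdev_upper_dev_unlink(slave, master);
}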
5842 
5843 /**
5844  * netdev_upper_dev_unlink - Removes a link to upper device
5845  * @dev: device
5846  * @upper_dev: upper device to unlink
5847  *
5848  * Removes a link to device which is upper to this one. The caller must hold
5849  * the RTNL lock.
5850  */
5851 void netdev_upper_dev_unlink(struct net_device *dev,
5852 			     struct net_device *upper_dev)
5853 {
5854 	struct netdev_notifier_changeupper_info changeupper_info;
5855 	struct netdev_adjacent *i, *j;
5856 	ASSERT_RTNL();
5857 
5858 	changeupper_info.upper_dev = upper_dev;
5859 	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5860 	changeupper_info.linking = false;
5861 
5862 	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5863 				      &changeupper_info.info);
5864 
5865 	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5866 
5867 	/* Here is the tricky part. We must remove all dev's lower
5868 	 * devices from all upper_dev's upper devices and vice
5869 	 * versa, to maintain the graph relationship.
5870 	 */
5871 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5872 		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5873 			__netdev_adjacent_dev_unlink(i->dev, j->dev);
5874 
5875 	/* also remove the devices themselves from the lower/upper device
5876 	 * lists
5877 	 */
5878 	list_for_each_entry(i, &dev->all_adj_list.lower, list)
5879 		__netdev_adjacent_dev_unlink(i->dev, upper_dev);
5880 
5881 	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5882 		__netdev_adjacent_dev_unlink(dev, i->dev);
5883 
5884 	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5885 				      &changeupper_info.info);
5886 }
5887 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5888 
5889 /**
5890  * netdev_bonding_info_change - Dispatch event about slave change
5891  * @dev: device
5892  * @bonding_info: info to dispatch
5893  *
5894  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5895  * The caller must hold the RTNL lock.
5896  */
5897 void netdev_bonding_info_change(struct net_device *dev,
5898 				struct netdev_bonding_info *bonding_info)
5899 {
5900 	struct netdev_notifier_bonding_info	info;
5901 
5902 	memcpy(&info.bonding_info, bonding_info,
5903 	       sizeof(struct netdev_bonding_info));
5904 	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5905 				      &info.info);
5906 }
5907 EXPORT_SYMBOL(netdev_bonding_info_change);
5908 
5909 static void netdev_adjacent_add_links(struct net_device *dev)
5910 {
5911 	struct netdev_adjacent *iter;
5912 
5913 	struct net *net = dev_net(dev);
5914 
5915 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5916 		if (!net_eq(net, dev_net(iter->dev)))
5917 			continue;
5918 		netdev_adjacent_sysfs_add(iter->dev, dev,
5919 					  &iter->dev->adj_list.lower);
5920 		netdev_adjacent_sysfs_add(dev, iter->dev,
5921 					  &dev->adj_list.upper);
5922 	}
5923 
5924 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5925 		if (!net_eq(net, dev_net(iter->dev)))
5926 			continue;
5927 		netdev_adjacent_sysfs_add(iter->dev, dev,
5928 					  &iter->dev->adj_list.upper);
5929 		netdev_adjacent_sysfs_add(dev, iter->dev,
5930 					  &dev->adj_list.lower);
5931 	}
5932 }
5933 
5934 static void netdev_adjacent_del_links(struct net_device *dev)
5935 {
5936 	struct netdev_adjacent *iter;
5937 
5938 	struct net *net = dev_net(dev);
5939 
5940 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5941 		if (!net_eq(net, dev_net(iter->dev)))
5942 			continue;
5943 		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5944 					  &iter->dev->adj_list.lower);
5945 		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5946 					  &dev->adj_list.upper);
5947 	}
5948 
5949 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5950 		if (!net_eq(net, dev_net(iter->dev)))
5951 			continue;
5952 		netdev_adjacent_sysfs_del(iter->dev, dev->name,
5953 					  &iter->dev->adj_list.upper);
5954 		netdev_adjacent_sysfs_del(dev, iter->dev->name,
5955 					  &dev->adj_list.lower);
5956 	}
5957 }
5958 
5959 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5960 {
5961 	struct netdev_adjacent *iter;
5962 
5963 	struct net *net = dev_net(dev);
5964 
5965 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
5966 		if (!net_eq(net, dev_net(iter->dev)))
5967 			continue;
5968 		netdev_adjacent_sysfs_del(iter->dev, oldname,
5969 					  &iter->dev->adj_list.lower);
5970 		netdev_adjacent_sysfs_add(iter->dev, dev,
5971 					  &iter->dev->adj_list.lower);
5972 	}
5973 
5974 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
5975 		if (!net_eq(net, dev_net(iter->dev)))
5976 			continue;
5977 		netdev_adjacent_sysfs_del(iter->dev, oldname,
5978 					  &iter->dev->adj_list.upper);
5979 		netdev_adjacent_sysfs_add(iter->dev, dev,
5980 					  &iter->dev->adj_list.upper);
5981 	}
5982 }
5983 
5984 void *netdev_lower_dev_get_private(struct net_device *dev,
5985 				   struct net_device *lower_dev)
5986 {
5987 	struct netdev_adjacent *lower;
5988 
5989 	if (!lower_dev)
5990 		return NULL;
5991 	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
5992 	if (!lower)
5993 		return NULL;
5994 
5995 	return lower->private;
5996 }
5997 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5998 
5999 
6000 int dev_get_nest_level(struct net_device *dev,
6001 		       bool (*type_check)(const struct net_device *dev))
6002 {
6003 	struct net_device *lower = NULL;
6004 	struct list_head *iter;
6005 	int max_nest = -1;
6006 	int nest;
6007 
6008 	ASSERT_RTNL();
6009 
6010 	netdev_for_each_lower_dev(dev, lower, iter) {
6011 		nest = dev_get_nest_level(lower, type_check);
6012 		if (max_nest < nest)
6013 			max_nest = nest;
6014 	}
6015 
6016 	if (type_check(dev))
6017 		max_nest++;
6018 
6019 	return max_nest;
6020 }
6021 EXPORT_SYMBOL(dev_get_nest_level);
6022 
6023 /**
6024  * netdev_lower_state_changed - Dispatch event about lower device state change
6025  * @lower_dev: device
6026  * @lower_state_info: state to dispatch
6027  *
6028  * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
6029  * The caller must hold the RTNL lock.
6030  */
6031 void netdev_lower_state_changed(struct net_device *lower_dev,
6032 				void *lower_state_info)
6033 {
6034 	struct netdev_notifier_changelowerstate_info changelowerstate_info;
6035 
6036 	ASSERT_RTNL();
6037 	changelowerstate_info.lower_state_info = lower_state_info;
6038 	call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
6039 				      &changelowerstate_info.info);
6040 }
6041 EXPORT_SYMBOL(netdev_lower_state_changed);
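
/* Illustrative sketch (not part of the original source): a lower device
 * driver that tracks link state for an aggregating upper (e.g. a LAG
 * master) could report a change like this.  The state structure and
 * helper name are hypothetical, driver-private examples; real users
 * often pass a struct netdev_lag_lower_state_info instead.
 */
struct example_lower_state {
	bool link_up;
	bool tx_enabled;
};

static void example_report_lower_state(struct net_device *lower_dev,
				       bool link_up)
{
	struct example_lower_state state = {
		.link_up    = link_up,
		.tx_enabled = link_up,
	};

	ASSERT_RTNL();
	netdev_lower_state_changed(lower_dev, &state);
}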
6042 
6043 static void dev_change_rx_flags(struct net_device *dev, int flags)
6044 {
6045 	const struct net_device_ops *ops = dev->netdev_ops;
6046 
6047 	if (ops->ndo_change_rx_flags)
6048 		ops->ndo_change_rx_flags(dev, flags);
6049 }
6050 
6051 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6052 {
6053 	unsigned int old_flags = dev->flags;
6054 	kuid_t uid;
6055 	kgid_t gid;
6056 
6057 	ASSERT_RTNL();
6058 
6059 	dev->flags |= IFF_PROMISC;
6060 	dev->promiscuity += inc;
6061 	if (dev->promiscuity == 0) {
6062 		/*
6063 		 * Avoid overflow.
6064 		 * If inc causes overflow, untouch promisc and return error.
6065 		 */
6066 		if (inc < 0)
6067 			dev->flags &= ~IFF_PROMISC;
6068 		else {
6069 			dev->promiscuity -= inc;
6070 			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6071 				dev->name);
6072 			return -EOVERFLOW;
6073 		}
6074 	}
6075 	if (dev->flags != old_flags) {
6076 		pr_info("device %s %s promiscuous mode\n",
6077 			dev->name,
6078 			dev->flags & IFF_PROMISC ? "entered" : "left");
6079 		if (audit_enabled) {
6080 			current_uid_gid(&uid, &gid);
6081 			audit_log(current->audit_context, GFP_ATOMIC,
6082 				AUDIT_ANOM_PROMISCUOUS,
6083 				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6084 				dev->name, (dev->flags & IFF_PROMISC),
6085 				(old_flags & IFF_PROMISC),
6086 				from_kuid(&init_user_ns, audit_get_loginuid(current)),
6087 				from_kuid(&init_user_ns, uid),
6088 				from_kgid(&init_user_ns, gid),
6089 				audit_get_sessionid(current));
6090 		}
6091 
6092 		dev_change_rx_flags(dev, IFF_PROMISC);
6093 	}
6094 	if (notify)
6095 		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
6096 	return 0;
6097 }
6098 
6099 /**
6100  *	dev_set_promiscuity	- update promiscuity count on a device
6101  *	@dev: device
6102  *	@inc: modifier
6103  *
6104  *	Add or remove promiscuity from a device. While the count in the device
6105  *	remains above zero the interface remains promiscuous. Once it hits zero
6106  *	the device reverts back to normal filtering operation. A negative inc
6107  *	value is used to drop promiscuity on the device.
6108  *	Return 0 if successful or a negative errno code on error.
6109  */
6110 int dev_set_promiscuity(struct net_device *dev, int inc)
6111 {
6112 	unsigned int old_flags = dev->flags;
6113 	int err;
6114 
6115 	err = __dev_set_promiscuity(dev, inc, true);
6116 	if (err < 0)
6117 		return err;
6118 	if (dev->flags != old_flags)
6119 		dev_set_rx_mode(dev);
6120 	return err;
6121 }
6122 EXPORT_SYMBOL(dev_set_promiscuity);
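
/* Illustrative sketch (not part of the original source): a packet-capture
 * style user bumps the promiscuity count while capturing and drops it
 * again when done; the device stays promiscuous as long as the count is
 * non-zero.  Function names are hypothetical; RTNL must be held.
 */
static int example_capture_start(struct net_device *dev)
{
	ASSERT_RTNL();
	return dev_set_promiscuity(dev, 1);
}

static void example_capture_stop(struct net_device *dev)
{
	ASSERT_RTNL();
	dev_set_promiscuity(dev, -1);
}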
6123 
6124 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6125 {
6126 	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6127 
6128 	ASSERT_RTNL();
6129 
6130 	dev->flags |= IFF_ALLMULTI;
6131 	dev->allmulti += inc;
6132 	if (dev->allmulti == 0) {
6133 		/*
6134 		 * Avoid overflow.
6135 		 * If inc causes overflow, untouch allmulti and return error.
6136 		 */
6137 		if (inc < 0)
6138 			dev->flags &= ~IFF_ALLMULTI;
6139 		else {
6140 			dev->allmulti -= inc;
6141 			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6142 				dev->name);
6143 			return -EOVERFLOW;
6144 		}
6145 	}
6146 	if (dev->flags ^ old_flags) {
6147 		dev_change_rx_flags(dev, IFF_ALLMULTI);
6148 		dev_set_rx_mode(dev);
6149 		if (notify)
6150 			__dev_notify_flags(dev, old_flags,
6151 					   dev->gflags ^ old_gflags);
6152 	}
6153 	return 0;
6154 }
6155 
6156 /**
6157  *	dev_set_allmulti	- update allmulti count on a device
6158  *	@dev: device
6159  *	@inc: modifier
6160  *
6161  *	Add or remove reception of all multicast frames to a device. While the
6162  *	count in the device remains above zero the interface keeps listening
6163  *	to all multicast frames. Once it hits zero the device reverts back to normal
6164  *	filtering operation. A negative @inc value is used to drop the counter
6165  *	when releasing a resource needing all multicasts.
6166  *	Return 0 if successful or a negative errno code on error.
6167  */
6168 
6169 int dev_set_allmulti(struct net_device *dev, int inc)
6170 {
6171 	return __dev_set_allmulti(dev, inc, true);
6172 }
6173 EXPORT_SYMBOL(dev_set_allmulti);
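
/* Illustrative sketch (not part of the original source): a protocol that
 * needs every multicast frame takes and releases the allmulti count
 * symmetrically, under RTNL.  The helper name is hypothetical.
 */
static int example_want_all_multicast(struct net_device *dev, bool enable)
{
	ASSERT_RTNL();
	return dev_set_allmulti(dev, enable ? 1 : -1);
}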
6174 
6175 /*
6176  *	Upload unicast and multicast address lists to device and
6177  *	configure RX filtering. When the device doesn't support unicast
6178  *	filtering it is put in promiscuous mode while unicast addresses
6179  *	are present.
6180  */
6181 void __dev_set_rx_mode(struct net_device *dev)
6182 {
6183 	const struct net_device_ops *ops = dev->netdev_ops;
6184 
6185 	/* dev_open will call this function so the list will stay sane. */
6186 	if (!(dev->flags&IFF_UP))
6187 		return;
6188 
6189 	if (!netif_device_present(dev))
6190 		return;
6191 
6192 	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6193 		/* Unicast address changes may only happen under the rtnl,
6194 		 * therefore calling __dev_set_promiscuity here is safe.
6195 		 */
6196 		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6197 			__dev_set_promiscuity(dev, 1, false);
6198 			dev->uc_promisc = true;
6199 		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6200 			__dev_set_promiscuity(dev, -1, false);
6201 			dev->uc_promisc = false;
6202 		}
6203 	}
6204 
6205 	if (ops->ndo_set_rx_mode)
6206 		ops->ndo_set_rx_mode(dev);
6207 }
6208 
6209 void dev_set_rx_mode(struct net_device *dev)
6210 {
6211 	netif_addr_lock_bh(dev);
6212 	__dev_set_rx_mode(dev);
6213 	netif_addr_unlock_bh(dev);
6214 }
6215 
6216 /**
6217  *	dev_get_flags - get flags reported to userspace
6218  *	@dev: device
6219  *
6220  *	Get the combination of flag bits exported through APIs to userspace.
6221  */
6222 unsigned int dev_get_flags(const struct net_device *dev)
6223 {
6224 	unsigned int flags;
6225 
6226 	flags = (dev->flags & ~(IFF_PROMISC |
6227 				IFF_ALLMULTI |
6228 				IFF_RUNNING |
6229 				IFF_LOWER_UP |
6230 				IFF_DORMANT)) |
6231 		(dev->gflags & (IFF_PROMISC |
6232 				IFF_ALLMULTI));
6233 
6234 	if (netif_running(dev)) {
6235 		if (netif_oper_up(dev))
6236 			flags |= IFF_RUNNING;
6237 		if (netif_carrier_ok(dev))
6238 			flags |= IFF_LOWER_UP;
6239 		if (netif_dormant(dev))
6240 			flags |= IFF_DORMANT;
6241 	}
6242 
6243 	return flags;
6244 }
6245 EXPORT_SYMBOL(dev_get_flags);
6246 
6247 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6248 {
6249 	unsigned int old_flags = dev->flags;
6250 	int ret;
6251 
6252 	ASSERT_RTNL();
6253 
6254 	/*
6255 	 *	Set the flags on our device.
6256 	 */
6257 
6258 	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6259 			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6260 			       IFF_AUTOMEDIA)) |
6261 		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6262 				    IFF_ALLMULTI));
6263 
6264 	/*
6265 	 *	Load in the correct multicast list now the flags have changed.
6266 	 */
6267 
6268 	if ((old_flags ^ flags) & IFF_MULTICAST)
6269 		dev_change_rx_flags(dev, IFF_MULTICAST);
6270 
6271 	dev_set_rx_mode(dev);
6272 
6273 	/*
6274 	 *	Have we downed the interface. We handle IFF_UP ourselves
6275 	 *	Have we downed the interface? We handle IFF_UP ourselves
6276 	 *	setting it.
6277 	 */
6278 
6279 	ret = 0;
6280 	if ((old_flags ^ flags) & IFF_UP)
6281 		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6282 
6283 	if ((flags ^ dev->gflags) & IFF_PROMISC) {
6284 		int inc = (flags & IFF_PROMISC) ? 1 : -1;
6285 		unsigned int old_flags = dev->flags;
6286 
6287 		dev->gflags ^= IFF_PROMISC;
6288 
6289 		if (__dev_set_promiscuity(dev, inc, false) >= 0)
6290 			if (dev->flags != old_flags)
6291 				dev_set_rx_mode(dev);
6292 	}
6293 
6294 	/* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6295 	 * is important. Some (broken) drivers set IFF_PROMISC when
6296 	 * IFF_ALLMULTI is requested, without asking us and without reporting.
6297 	 */
6298 	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6299 		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6300 
6301 		dev->gflags ^= IFF_ALLMULTI;
6302 		__dev_set_allmulti(dev, inc, false);
6303 	}
6304 
6305 	return ret;
6306 }
6307 
6308 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6309 			unsigned int gchanges)
6310 {
6311 	unsigned int changes = dev->flags ^ old_flags;
6312 
6313 	if (gchanges)
6314 		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6315 
6316 	if (changes & IFF_UP) {
6317 		if (dev->flags & IFF_UP)
6318 			call_netdevice_notifiers(NETDEV_UP, dev);
6319 		else
6320 			call_netdevice_notifiers(NETDEV_DOWN, dev);
6321 	}
6322 
6323 	if (dev->flags & IFF_UP &&
6324 	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6325 		struct netdev_notifier_change_info change_info;
6326 
6327 		change_info.flags_changed = changes;
6328 		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6329 					      &change_info.info);
6330 	}
6331 }
6332 
6333 /**
6334  *	dev_change_flags - change device settings
6335  *	@dev: device
6336  *	@flags: device state flags
6337  *
6338  *	Change settings on device based state flags. The flags are
6339  *	in the userspace exported format.
6340  */
6341 int dev_change_flags(struct net_device *dev, unsigned int flags)
6342 {
6343 	int ret;
6344 	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6345 
6346 	ret = __dev_change_flags(dev, flags);
6347 	if (ret < 0)
6348 		return ret;
6349 
6350 	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6351 	__dev_notify_flags(dev, old_flags, changes);
6352 	return ret;
6353 }
6354 EXPORT_SYMBOL(dev_change_flags);
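
/* Illustrative sketch (not part of the original source): bringing an
 * interface administratively up from kernel code by toggling IFF_UP in
 * the userspace-visible flag format, then using dev_get_flags() to read
 * the current state.  Hypothetical helper; RTNL held by the caller.
 */
static int example_bring_up(struct net_device *dev)
{
	unsigned int flags;
	int err;

	ASSERT_RTNL();

	flags = dev_get_flags(dev);
	if (flags & IFF_UP)
		return 0;

	err = dev_change_flags(dev, flags | IFF_UP);
	if (err < 0)
		return err;

	return 0;
}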
6355 
6356 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6357 {
6358 	const struct net_device_ops *ops = dev->netdev_ops;
6359 
6360 	if (ops->ndo_change_mtu)
6361 		return ops->ndo_change_mtu(dev, new_mtu);
6362 
6363 	dev->mtu = new_mtu;
6364 	return 0;
6365 }
6366 
6367 /**
6368  *	dev_set_mtu - Change maximum transfer unit
6369  *	@dev: device
6370  *	@new_mtu: new transfer unit
6371  *
6372  *	Change the maximum transfer size of the network device.
6373  */
6374 int dev_set_mtu(struct net_device *dev, int new_mtu)
6375 {
6376 	int err, orig_mtu;
6377 
6378 	if (new_mtu == dev->mtu)
6379 		return 0;
6380 
6381 	/*	MTU must not be negative.	 */
6382 	if (new_mtu < 0)
6383 		return -EINVAL;
6384 
6385 	if (!netif_device_present(dev))
6386 		return -ENODEV;
6387 
6388 	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6389 	err = notifier_to_errno(err);
6390 	if (err)
6391 		return err;
6392 
6393 	orig_mtu = dev->mtu;
6394 	err = __dev_set_mtu(dev, new_mtu);
6395 
6396 	if (!err) {
6397 		err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6398 		err = notifier_to_errno(err);
6399 		if (err) {
6400 			/* setting mtu back and notifying everyone again,
6401 			 * so that they have a chance to revert changes.
6402 			 */
6403 			__dev_set_mtu(dev, orig_mtu);
6404 			call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6405 		}
6406 	}
6407 	return err;
6408 }
6409 EXPORT_SYMBOL(dev_set_mtu);
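
/* Illustrative sketch (not part of the original source): an encapsulating
 * driver that needs room for an outer header might shrink the MTU of the
 * device it runs on top of.  The 50-byte overhead and the helper name are
 * hypothetical; RTNL is assumed to be held by the caller.
 */
static int example_shrink_mtu_for_encap(struct net_device *dev)
{
	const unsigned int overhead = 50;	/* hypothetical encap overhead */

	ASSERT_RTNL();

	/* keep at least the IPv4 minimum MTU of 68 bytes */
	if (dev->mtu < overhead + 68)
		return -EINVAL;

	return dev_set_mtu(dev, dev->mtu - overhead);
}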
6410 
6411 /**
6412  *	dev_set_group - Change group this device belongs to
6413  *	@dev: device
6414  *	@new_group: group this device should belong to
6415  */
6416 void dev_set_group(struct net_device *dev, int new_group)
6417 {
6418 	dev->group = new_group;
6419 }
6420 EXPORT_SYMBOL(dev_set_group);
6421 
6422 /**
6423  *	dev_set_mac_address - Change Media Access Control Address
6424  *	@dev: device
6425  *	@sa: new address
6426  *
6427  *	Change the hardware (MAC) address of the device
6428  */
6429 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6430 {
6431 	const struct net_device_ops *ops = dev->netdev_ops;
6432 	int err;
6433 
6434 	if (!ops->ndo_set_mac_address)
6435 		return -EOPNOTSUPP;
6436 	if (sa->sa_family != dev->type)
6437 		return -EINVAL;
6438 	if (!netif_device_present(dev))
6439 		return -ENODEV;
6440 	err = ops->ndo_set_mac_address(dev, sa);
6441 	if (err)
6442 		return err;
6443 	dev->addr_assign_type = NET_ADDR_SET;
6444 	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6445 	add_device_randomness(dev->dev_addr, dev->addr_len);
6446 	return 0;
6447 }
6448 EXPORT_SYMBOL(dev_set_mac_address);
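
/* Illustrative sketch (not part of the original source): setting a new
 * hardware address from kernel code.  The address family in the sockaddr
 * must match dev->type (ARPHRD_ETHER for Ethernet); @addr is assumed to
 * point at dev->addr_len valid bytes.  RTNL held by the caller.
 */
static int example_set_hw_addr(struct net_device *dev, const u8 *addr)
{
	struct sockaddr sa;

	if (dev->addr_len > sizeof(sa.sa_data))
		return -EINVAL;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, addr, dev->addr_len);

	return dev_set_mac_address(dev, &sa);
}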
6449 
6450 /**
6451  *	dev_change_carrier - Change device carrier
6452  *	@dev: device
6453  *	@new_carrier: new value
6454  *
6455  *	Change device carrier
6456  */
6457 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6458 {
6459 	const struct net_device_ops *ops = dev->netdev_ops;
6460 
6461 	if (!ops->ndo_change_carrier)
6462 		return -EOPNOTSUPP;
6463 	if (!netif_device_present(dev))
6464 		return -ENODEV;
6465 	return ops->ndo_change_carrier(dev, new_carrier);
6466 }
6467 EXPORT_SYMBOL(dev_change_carrier);
6468 
6469 /**
6470  *	dev_get_phys_port_id - Get device physical port ID
6471  *	@dev: device
6472  *	@ppid: port ID
6473  *
6474  *	Get device physical port ID
6475  */
6476 int dev_get_phys_port_id(struct net_device *dev,
6477 			 struct netdev_phys_item_id *ppid)
6478 {
6479 	const struct net_device_ops *ops = dev->netdev_ops;
6480 
6481 	if (!ops->ndo_get_phys_port_id)
6482 		return -EOPNOTSUPP;
6483 	return ops->ndo_get_phys_port_id(dev, ppid);
6484 }
6485 EXPORT_SYMBOL(dev_get_phys_port_id);
6486 
6487 /**
6488  *	dev_get_phys_port_name - Get device physical port name
6489  *	@dev: device
6490  *	@name: port name
6491  *	@len: limit of bytes to copy to name
6492  *
6493  *	Get device physical port name
6494  */
6495 int dev_get_phys_port_name(struct net_device *dev,
6496 			   char *name, size_t len)
6497 {
6498 	const struct net_device_ops *ops = dev->netdev_ops;
6499 
6500 	if (!ops->ndo_get_phys_port_name)
6501 		return -EOPNOTSUPP;
6502 	return ops->ndo_get_phys_port_name(dev, name, len);
6503 }
6504 EXPORT_SYMBOL(dev_get_phys_port_name);
6505 
6506 /**
6507  *	dev_change_proto_down - update protocol port state information
6508  *	@dev: device
6509  *	@proto_down: new value
6510  *
6511  *	This info can be used by switch drivers to set the phys state of the
6512  *	port.
6513  */
6514 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6515 {
6516 	const struct net_device_ops *ops = dev->netdev_ops;
6517 
6518 	if (!ops->ndo_change_proto_down)
6519 		return -EOPNOTSUPP;
6520 	if (!netif_device_present(dev))
6521 		return -ENODEV;
6522 	return ops->ndo_change_proto_down(dev, proto_down);
6523 }
6524 EXPORT_SYMBOL(dev_change_proto_down);
6525 
6526 /**
6527  *	dev_new_index	-	allocate an ifindex
6528  *	@net: the applicable net namespace
6529  *
6530  *	Returns a suitable unique value for a new device interface
6531  *	number.  The caller must hold the rtnl semaphore or the
6532  *	dev_base_lock to be sure it remains unique.
6533  */
6534 static int dev_new_index(struct net *net)
6535 {
6536 	int ifindex = net->ifindex;
6537 	for (;;) {
6538 		if (++ifindex <= 0)
6539 			ifindex = 1;
6540 		if (!__dev_get_by_index(net, ifindex))
6541 			return net->ifindex = ifindex;
6542 	}
6543 }
6544 
6545 /* Delayed registration/unregisteration */
6546 static LIST_HEAD(net_todo_list);
6547 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6548 
6549 static void net_set_todo(struct net_device *dev)
6550 {
6551 	list_add_tail(&dev->todo_list, &net_todo_list);
6552 	dev_net(dev)->dev_unreg_count++;
6553 }
6554 
6555 static void rollback_registered_many(struct list_head *head)
6556 {
6557 	struct net_device *dev, *tmp;
6558 	LIST_HEAD(close_head);
6559 
6560 	BUG_ON(dev_boot_phase);
6561 	ASSERT_RTNL();
6562 
6563 	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6564 		/* Some devices call without registering
6565 		 * for initialization unwind. Remove those
6566 		 * devices and proceed with the remaining.
6567 		 */
6568 		if (dev->reg_state == NETREG_UNINITIALIZED) {
6569 			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6570 				 dev->name, dev);
6571 
6572 			WARN_ON(1);
6573 			list_del(&dev->unreg_list);
6574 			continue;
6575 		}
6576 		dev->dismantle = true;
6577 		BUG_ON(dev->reg_state != NETREG_REGISTERED);
6578 	}
6579 
6580 	/* If device is running, close it first. */
6581 	list_for_each_entry(dev, head, unreg_list)
6582 		list_add_tail(&dev->close_list, &close_head);
6583 	dev_close_many(&close_head, true);
6584 
6585 	list_for_each_entry(dev, head, unreg_list) {
6586 		/* And unlink it from device chain. */
6587 		unlist_netdevice(dev);
6588 
6589 		dev->reg_state = NETREG_UNREGISTERING;
6590 		on_each_cpu(flush_backlog, dev, 1);
6591 	}
6592 
6593 	synchronize_net();
6594 
6595 	list_for_each_entry(dev, head, unreg_list) {
6596 		struct sk_buff *skb = NULL;
6597 
6598 		/* Shutdown queueing discipline. */
6599 		dev_shutdown(dev);
6600 
6602 		/* Notify protocols that we are about to destroy
6603 		 * this device. They should clean all the things.
6604 		 */
6605 		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6606 
6607 		if (!dev->rtnl_link_ops ||
6608 		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6609 			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6610 						     GFP_KERNEL);
6611 
6612 		/*
6613 		 *	Flush the unicast and multicast chains
6614 		 */
6615 		dev_uc_flush(dev);
6616 		dev_mc_flush(dev);
6617 
6618 		if (dev->netdev_ops->ndo_uninit)
6619 			dev->netdev_ops->ndo_uninit(dev);
6620 
6621 		if (skb)
6622 			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6623 
6624 		/* Notifier chain MUST detach us all upper devices. */
6625 		WARN_ON(netdev_has_any_upper_dev(dev));
6626 
6627 		/* Remove entries from kobject tree */
6628 		netdev_unregister_kobject(dev);
6629 #ifdef CONFIG_XPS
6630 		/* Remove XPS queueing entries */
6631 		netif_reset_xps_queues_gt(dev, 0);
6632 #endif
6633 	}
6634 
6635 	synchronize_net();
6636 
6637 	list_for_each_entry(dev, head, unreg_list)
6638 		dev_put(dev);
6639 }
6640 
6641 static void rollback_registered(struct net_device *dev)
6642 {
6643 	LIST_HEAD(single);
6644 
6645 	list_add(&dev->unreg_list, &single);
6646 	rollback_registered_many(&single);
6647 	list_del(&single);
6648 }
6649 
6650 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6651 	struct net_device *upper, netdev_features_t features)
6652 {
6653 	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6654 	netdev_features_t feature;
6655 	int feature_bit;
6656 
6657 	for_each_netdev_feature(&upper_disables, feature_bit) {
6658 		feature = __NETIF_F_BIT(feature_bit);
6659 		if (!(upper->wanted_features & feature)
6660 		    && (features & feature)) {
6661 			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6662 				   &feature, upper->name);
6663 			features &= ~feature;
6664 		}
6665 	}
6666 
6667 	return features;
6668 }
6669 
6670 static void netdev_sync_lower_features(struct net_device *upper,
6671 	struct net_device *lower, netdev_features_t features)
6672 {
6673 	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6674 	netdev_features_t feature;
6675 	int feature_bit;
6676 
6677 	for_each_netdev_feature(&upper_disables, feature_bit) {
6678 		feature = __NETIF_F_BIT(feature_bit);
6679 		if (!(features & feature) && (lower->features & feature)) {
6680 			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6681 				   &feature, lower->name);
6682 			lower->wanted_features &= ~feature;
6683 			netdev_update_features(lower);
6684 
6685 			if (unlikely(lower->features & feature))
6686 				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6687 					    &feature, lower->name);
6688 		}
6689 	}
6690 }
6691 
6692 static netdev_features_t netdev_fix_features(struct net_device *dev,
6693 	netdev_features_t features)
6694 {
6695 	/* Fix illegal checksum combinations */
6696 	if ((features & NETIF_F_HW_CSUM) &&
6697 	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6698 		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6699 		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6700 	}
6701 
6702 	/* TSO requires that SG is present as well. */
6703 	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6704 		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6705 		features &= ~NETIF_F_ALL_TSO;
6706 	}
6707 
6708 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6709 					!(features & NETIF_F_IP_CSUM)) {
6710 		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6711 		features &= ~NETIF_F_TSO;
6712 		features &= ~NETIF_F_TSO_ECN;
6713 	}
6714 
6715 	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6716 					 !(features & NETIF_F_IPV6_CSUM)) {
6717 		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6718 		features &= ~NETIF_F_TSO6;
6719 	}
6720 
6721 	/* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
6722 	if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
6723 		features &= ~NETIF_F_TSO_MANGLEID;
6724 
6725 	/* TSO ECN requires that TSO is present as well. */
6726 	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6727 		features &= ~NETIF_F_TSO_ECN;
6728 
6729 	/* Software GSO depends on SG. */
6730 	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6731 		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6732 		features &= ~NETIF_F_GSO;
6733 	}
6734 
6735 	/* UFO needs SG and checksumming */
6736 	if (features & NETIF_F_UFO) {
6737 		/* maybe split UFO into V4 and V6? */
6738 		if (!(features & NETIF_F_HW_CSUM) &&
6739 		    ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6740 		     (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6741 			netdev_dbg(dev,
6742 				"Dropping NETIF_F_UFO since no checksum offload features.\n");
6743 			features &= ~NETIF_F_UFO;
6744 		}
6745 
6746 		if (!(features & NETIF_F_SG)) {
6747 			netdev_dbg(dev,
6748 				"Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6749 			features &= ~NETIF_F_UFO;
6750 		}
6751 	}
6752 
6753 	/* GSO partial features require GSO partial be set */
6754 	if ((features & dev->gso_partial_features) &&
6755 	    !(features & NETIF_F_GSO_PARTIAL)) {
6756 		netdev_dbg(dev,
6757 			   "Dropping partially supported GSO features since no GSO partial.\n");
6758 		features &= ~dev->gso_partial_features;
6759 	}
6760 
6761 #ifdef CONFIG_NET_RX_BUSY_POLL
6762 	if (dev->netdev_ops->ndo_busy_poll)
6763 		features |= NETIF_F_BUSY_POLL;
6764 	else
6765 #endif
6766 		features &= ~NETIF_F_BUSY_POLL;
6767 
6768 	return features;
6769 }
6770 
6771 int __netdev_update_features(struct net_device *dev)
6772 {
6773 	struct net_device *upper, *lower;
6774 	netdev_features_t features;
6775 	struct list_head *iter;
6776 	int err = -1;
6777 
6778 	ASSERT_RTNL();
6779 
6780 	features = netdev_get_wanted_features(dev);
6781 
6782 	if (dev->netdev_ops->ndo_fix_features)
6783 		features = dev->netdev_ops->ndo_fix_features(dev, features);
6784 
6785 	/* driver might be less strict about feature dependencies */
6786 	features = netdev_fix_features(dev, features);
6787 
6788 	/* some features can't be enabled if they're off an an upper device */
6789 	/* some features can't be enabled if they're off on an upper device */
6790 		features = netdev_sync_upper_features(dev, upper, features);
6791 
6792 	if (dev->features == features)
6793 		goto sync_lower;
6794 
6795 	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6796 		&dev->features, &features);
6797 
6798 	if (dev->netdev_ops->ndo_set_features)
6799 		err = dev->netdev_ops->ndo_set_features(dev, features);
6800 	else
6801 		err = 0;
6802 
6803 	if (unlikely(err < 0)) {
6804 		netdev_err(dev,
6805 			"set_features() failed (%d); wanted %pNF, left %pNF\n",
6806 			err, &features, &dev->features);
6807 		/* return non-0 since some features might have changed and
6808 		 * it's better to fire a spurious notification than miss it
6809 		 */
6810 		return -1;
6811 	}
6812 
6813 sync_lower:
6814 	/* some features must be disabled on lower devices when disabled
6815 	 * on an upper device (think: bonding master or bridge)
6816 	 */
6817 	netdev_for_each_lower_dev(dev, lower, iter)
6818 		netdev_sync_lower_features(dev, lower, features);
6819 
6820 	if (!err)
6821 		dev->features = features;
6822 
6823 	return err < 0 ? 0 : 1;
6824 }
6825 
6826 /**
6827  *	netdev_update_features - recalculate device features
6828  *	@dev: the device to check
6829  *
6830  *	Recalculate dev->features set and send notifications if it
6831  *	has changed. Should be called after driver or hardware dependent
6832  *	conditions might have changed that influence the features.
6833  */
6834 void netdev_update_features(struct net_device *dev)
6835 {
6836 	if (__netdev_update_features(dev))
6837 		netdev_features_change(dev);
6838 }
6839 EXPORT_SYMBOL(netdev_update_features);
6840 
6841 /**
6842  *	netdev_change_features - recalculate device features
6843  *	@dev: the device to check
6844  *
6845  *	Recalculate dev->features set and send notifications even
6846  *	if they have not changed. Should be called instead of
6847  *	netdev_update_features() if also dev->vlan_features might
6848  *	have changed to allow the changes to be propagated to stacked
6849  *	VLAN devices.
6850  */
6851 void netdev_change_features(struct net_device *dev)
6852 {
6853 	__netdev_update_features(dev);
6854 	netdev_features_change(dev);
6855 }
6856 EXPORT_SYMBOL(netdev_change_features);
6857 
6858 /**
6859  *	netif_stacked_transfer_operstate -	transfer operstate
6860  *	@rootdev: the root or lower level device to transfer state from
6861  *	@dev: the device to transfer operstate to
6862  *
6863  *	Transfer operational state from root to device. This is normally
6864  *	called when a stacking relationship exists between the root
6865  *	device and the device(a leaf device).
6866  *	device and the device (a leaf device).
6867 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6868 					struct net_device *dev)
6869 {
6870 	if (rootdev->operstate == IF_OPER_DORMANT)
6871 		netif_dormant_on(dev);
6872 	else
6873 		netif_dormant_off(dev);
6874 
6875 	if (netif_carrier_ok(rootdev)) {
6876 		if (!netif_carrier_ok(dev))
6877 			netif_carrier_on(dev);
6878 	} else {
6879 		if (netif_carrier_ok(dev))
6880 			netif_carrier_off(dev);
6881 	}
6882 }
6883 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6884 
6885 #ifdef CONFIG_SYSFS
6886 static int netif_alloc_rx_queues(struct net_device *dev)
6887 {
6888 	unsigned int i, count = dev->num_rx_queues;
6889 	struct netdev_rx_queue *rx;
6890 	size_t sz = count * sizeof(*rx);
6891 
6892 	BUG_ON(count < 1);
6893 
6894 	rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6895 	if (!rx) {
6896 		rx = vzalloc(sz);
6897 		if (!rx)
6898 			return -ENOMEM;
6899 	}
6900 	dev->_rx = rx;
6901 
6902 	for (i = 0; i < count; i++)
6903 		rx[i].dev = dev;
6904 	return 0;
6905 }
6906 #endif
6907 
6908 static void netdev_init_one_queue(struct net_device *dev,
6909 				  struct netdev_queue *queue, void *_unused)
6910 {
6911 	/* Initialize queue lock */
6912 	spin_lock_init(&queue->_xmit_lock);
6913 	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6914 	queue->xmit_lock_owner = -1;
6915 	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6916 	queue->dev = dev;
6917 #ifdef CONFIG_BQL
6918 	dql_init(&queue->dql, HZ);
6919 #endif
6920 }
6921 
6922 static void netif_free_tx_queues(struct net_device *dev)
6923 {
6924 	kvfree(dev->_tx);
6925 }
6926 
6927 static int netif_alloc_netdev_queues(struct net_device *dev)
6928 {
6929 	unsigned int count = dev->num_tx_queues;
6930 	struct netdev_queue *tx;
6931 	size_t sz = count * sizeof(*tx);
6932 
6933 	if (count < 1 || count > 0xffff)
6934 		return -EINVAL;
6935 
6936 	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6937 	if (!tx) {
6938 		tx = vzalloc(sz);
6939 		if (!tx)
6940 			return -ENOMEM;
6941 	}
6942 	dev->_tx = tx;
6943 
6944 	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6945 	spin_lock_init(&dev->tx_global_lock);
6946 
6947 	return 0;
6948 }
6949 
6950 void netif_tx_stop_all_queues(struct net_device *dev)
6951 {
6952 	unsigned int i;
6953 
6954 	for (i = 0; i < dev->num_tx_queues; i++) {
6955 		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6956 		netif_tx_stop_queue(txq);
6957 	}
6958 }
6959 EXPORT_SYMBOL(netif_tx_stop_all_queues);
6960 
6961 /**
6962  *	register_netdevice	- register a network device
6963  *	@dev: device to register
6964  *
6965  *	Take a completed network device structure and add it to the kernel
6966  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6967  *	chain. 0 is returned on success. A negative errno code is returned
6968  *	on a failure to set up the device, or if the name is a duplicate.
6969  *
6970  *	Callers must hold the rtnl semaphore. You may want
6971  *	register_netdev() instead of this.
6972  *
6973  *	BUGS:
6974  *	The locking appears insufficient to guarantee two parallel registers
6975  *	will not get the same name.
6976  */
6977 
6978 int register_netdevice(struct net_device *dev)
6979 {
6980 	int ret;
6981 	struct net *net = dev_net(dev);
6982 
6983 	BUG_ON(dev_boot_phase);
6984 	ASSERT_RTNL();
6985 
6986 	might_sleep();
6987 
6988 	/* When net_device's are persistent, this will be fatal. */
6989 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6990 	BUG_ON(!net);
6991 
6992 	spin_lock_init(&dev->addr_list_lock);
6993 	netdev_set_addr_lockdep_class(dev);
6994 
6995 	ret = dev_get_valid_name(net, dev, dev->name);
6996 	if (ret < 0)
6997 		goto out;
6998 
6999 	/* Init, if this function is available */
7000 	if (dev->netdev_ops->ndo_init) {
7001 		ret = dev->netdev_ops->ndo_init(dev);
7002 		if (ret) {
7003 			if (ret > 0)
7004 				ret = -EIO;
7005 			goto out;
7006 		}
7007 	}
7008 
7009 	if (((dev->hw_features | dev->features) &
7010 	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
7011 	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7012 	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7013 		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7014 		ret = -EINVAL;
7015 		goto err_uninit;
7016 	}
7017 
7018 	ret = -EBUSY;
7019 	if (!dev->ifindex)
7020 		dev->ifindex = dev_new_index(net);
7021 	else if (__dev_get_by_index(net, dev->ifindex))
7022 		goto err_uninit;
7023 
7024 	/* Transfer changeable features to wanted_features and enable
7025 	 * software offloads (GSO and GRO).
7026 	 */
7027 	dev->hw_features |= NETIF_F_SOFT_FEATURES;
7028 	dev->features |= NETIF_F_SOFT_FEATURES;
7029 	dev->wanted_features = dev->features & dev->hw_features;
7030 
7031 	if (!(dev->flags & IFF_LOOPBACK))
7032 		dev->hw_features |= NETIF_F_NOCACHE_COPY;
7033 
7034 	/* If IPv4 TCP segmentation offload is supported we should also
7035 	 * allow the device to enable segmenting the frame with the option
7036 	 * of ignoring a static IP ID value.  This doesn't enable the
7037 	 * feature itself but allows the user to enable it later.
7038 	 */
7039 	if (dev->hw_features & NETIF_F_TSO)
7040 		dev->hw_features |= NETIF_F_TSO_MANGLEID;
7041 	if (dev->vlan_features & NETIF_F_TSO)
7042 		dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7043 	if (dev->mpls_features & NETIF_F_TSO)
7044 		dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7045 	if (dev->hw_enc_features & NETIF_F_TSO)
7046 		dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7047 
7048 	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7049 	 */
7050 	dev->vlan_features |= NETIF_F_HIGHDMA;
7051 
7052 	/* Make NETIF_F_SG inheritable to tunnel devices.
7053 	 */
7054 	dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7055 
7056 	/* Make NETIF_F_SG inheritable to MPLS.
7057 	 */
7058 	dev->mpls_features |= NETIF_F_SG;
7059 
7060 	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7061 	ret = notifier_to_errno(ret);
7062 	if (ret)
7063 		goto err_uninit;
7064 
7065 	ret = netdev_register_kobject(dev);
7066 	if (ret)
7067 		goto err_uninit;
7068 	dev->reg_state = NETREG_REGISTERED;
7069 
7070 	__netdev_update_features(dev);
7071 
7072 	/*
7073 	 *	Default initial state at registry is that the
7074 	 *	device is present.
7075 	 */
7076 
7077 	set_bit(__LINK_STATE_PRESENT, &dev->state);
7078 
7079 	linkwatch_init_dev(dev);
7080 
7081 	dev_init_scheduler(dev);
7082 	dev_hold(dev);
7083 	list_netdevice(dev);
7084 	add_device_randomness(dev->dev_addr, dev->addr_len);
7085 
7086 	/* If the device has permanent device address, driver should
7087 	 * set dev_addr and also addr_assign_type should be set to
7088 	 * NET_ADDR_PERM (default value).
7089 	 */
7090 	if (dev->addr_assign_type == NET_ADDR_PERM)
7091 		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7092 
7093 	/* Notify protocols that a new device appeared. */
7094 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7095 	ret = notifier_to_errno(ret);
7096 	if (ret) {
7097 		rollback_registered(dev);
7098 		dev->reg_state = NETREG_UNREGISTERED;
7099 	}
7100 	/*
7101 	 *	Prevent userspace races by waiting until the network
7102 	 *	device is fully set up before sending notifications.
7103 	 */
7104 	if (!dev->rtnl_link_ops ||
7105 	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7106 		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7107 
7108 out:
7109 	return ret;
7110 
7111 err_uninit:
7112 	if (dev->netdev_ops->ndo_uninit)
7113 		dev->netdev_ops->ndo_uninit(dev);
7114 	goto out;
7115 }
7116 EXPORT_SYMBOL(register_netdevice);
7117 
7118 /**
7119  *	init_dummy_netdev	- init a dummy network device for NAPI
7120  *	@dev: device to init
7121  *
7122  *	This takes a network device structure and initializes the minimum
7123  *	number of fields so it can be used to schedule NAPI polls without
7124  *	registering a full blown interface. This is to be used by drivers
7125  *	that need to tie several hardware interfaces to a single NAPI
7126  *	poll scheduler due to HW limitations.
7127  */
7128 int init_dummy_netdev(struct net_device *dev)
7129 {
7130 	/* Clear everything. Note we don't initialize spinlocks
7131 	 * as they aren't supposed to be taken by any of the
7132 	 * NAPI code and this dummy netdev is supposed to be
7133 	 * only ever used for NAPI polls.
7134 	 */
7135 	memset(dev, 0, sizeof(struct net_device));
7136 
7137 	/* make sure we BUG if trying to hit standard
7138 	 * register/unregister code path
7139 	 */
7140 	dev->reg_state = NETREG_DUMMY;
7141 
7142 	/* NAPI wants this */
7143 	INIT_LIST_HEAD(&dev->napi_list);
7144 
7145 	/* a dummy interface is started by default */
7146 	set_bit(__LINK_STATE_PRESENT, &dev->state);
7147 	set_bit(__LINK_STATE_START, &dev->state);
7148 
7149 	/* Note: We don't allocate pcpu_refcnt for dummy devices,
7150 	 * because users of this 'device' don't need to change
7151 	 * its refcount.
7152 	 */
7153 
7154 	return 0;
7155 }
7156 EXPORT_SYMBOL_GPL(init_dummy_netdev);
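
/* Illustrative sketch (not part of the original source): a driver with
 * several hardware channels but a single interrupt/poll source can hang
 * its NAPI context off a dummy netdev.  The private structure and names
 * are hypothetical; the poll callback is assumed to exist elsewhere in
 * the driver.
 */
struct example_hw {
	struct net_device napi_dev;	/* dummy device, never registered */
	struct napi_struct napi;
};

static void example_hw_init_napi(struct example_hw *hw,
				 int (*poll)(struct napi_struct *, int))
{
	init_dummy_netdev(&hw->napi_dev);
	netif_napi_add(&hw->napi_dev, &hw->napi, poll, NAPI_POLL_WEIGHT);
}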
7157 
7158 
7159 /**
7160  *	register_netdev	- register a network device
7161  *	@dev: device to register
7162  *
7163  *	Take a completed network device structure and add it to the kernel
7164  *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7165  *	chain. 0 is returned on success. A negative errno code is returned
7166  *	on a failure to set up the device, or if the name is a duplicate.
7167  *
7168  *	This is a wrapper around register_netdevice that takes the rtnl semaphore
7169  *	and expands the device name if you passed a format string to
7170  *	alloc_netdev.
7171  */
7172 int register_netdev(struct net_device *dev)
7173 {
7174 	int err;
7175 
7176 	rtnl_lock();
7177 	err = register_netdevice(dev);
7178 	rtnl_unlock();
7179 	return err;
7180 }
7181 EXPORT_SYMBOL(register_netdev);
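
/* Illustrative sketch (not part of the original source): the usual probe
 * pattern pairs allocation, registration and error unwind.  ether_setup()
 * comes from <linux/etherdevice.h>; the "example%d" name template and the
 * private struct are hypothetical.
 */
struct example_priv {
	int dummy;
};

static struct net_device *example_probe_one(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_netdev(sizeof(struct example_priv), "example%d",
			   NET_NAME_UNKNOWN, ether_setup);
	if (!dev)
		return NULL;

	/* ... fill in dev->netdev_ops, MAC address, features ... */

	err = register_netdev(dev);	/* takes and releases RTNL itself */
	if (err) {
		free_netdev(dev);
		return NULL;
	}

	return dev;
}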
7182 
7183 int netdev_refcnt_read(const struct net_device *dev)
7184 {
7185 	int i, refcnt = 0;
7186 
7187 	for_each_possible_cpu(i)
7188 		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7189 	return refcnt;
7190 }
7191 EXPORT_SYMBOL(netdev_refcnt_read);
7192 
7193 /**
7194  * netdev_wait_allrefs - wait until all references are gone.
7195  * @dev: target net_device
7196  *
7197  * This is called when unregistering network devices.
7198  *
7199  * Any protocol or device that holds a reference should register
7200  * for netdevice notification, and cleanup and put back the
7201  * for netdevice notification, and clean up and put back the
7202  * We can get stuck here if buggy protocols don't correctly
7203  * call dev_put.
7204  */
7205 static void netdev_wait_allrefs(struct net_device *dev)
7206 {
7207 	unsigned long rebroadcast_time, warning_time;
7208 	int refcnt;
7209 
7210 	linkwatch_forget_dev(dev);
7211 
7212 	rebroadcast_time = warning_time = jiffies;
7213 	refcnt = netdev_refcnt_read(dev);
7214 
7215 	while (refcnt != 0) {
7216 		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7217 			rtnl_lock();
7218 
7219 			/* Rebroadcast unregister notification */
7220 			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7221 
7222 			__rtnl_unlock();
7223 			rcu_barrier();
7224 			rtnl_lock();
7225 
7226 			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7227 			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7228 				     &dev->state)) {
7229 				/* We must not have linkwatch events
7230 				 * pending on unregister. If this
7231 				 * happens, we simply run the queue
7232 				 * unscheduled, resulting in a noop
7233 				 * for this device.
7234 				 */
7235 				linkwatch_run_queue();
7236 			}
7237 
7238 			__rtnl_unlock();
7239 
7240 			rebroadcast_time = jiffies;
7241 		}
7242 
7243 		msleep(250);
7244 
7245 		refcnt = netdev_refcnt_read(dev);
7246 
7247 		if (time_after(jiffies, warning_time + 10 * HZ)) {
7248 			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7249 				 dev->name, refcnt);
7250 			warning_time = jiffies;
7251 		}
7252 	}
7253 }
7254 
7255 /* The sequence is:
7256  *
7257  *	rtnl_lock();
7258  *	...
7259  *	register_netdevice(x1);
7260  *	register_netdevice(x2);
7261  *	...
7262  *	unregister_netdevice(y1);
7263  *	unregister_netdevice(y2);
7264  *      ...
7265  *	rtnl_unlock();
7266  *	free_netdev(y1);
7267  *	free_netdev(y2);
7268  *
7269  * We are invoked by rtnl_unlock().
7270  * This allows us to deal with problems:
7271  * 1) We can delete sysfs objects which invoke hotplug
7272  *    without deadlocking with linkwatch via keventd.
7273  * 2) Since we run with the RTNL semaphore not held, we can sleep
7274  *    safely in order to wait for the netdev refcnt to drop to zero.
7275  *
7276  * We must not return until all unregister events added during
7277  * the interval the lock was held have been completed.
7278  */
7279 void netdev_run_todo(void)
7280 {
7281 	struct list_head list;
7282 
7283 	/* Snapshot list, allow later requests */
7284 	list_replace_init(&net_todo_list, &list);
7285 
7286 	__rtnl_unlock();
7287 
7288 
7290 	if (!list_empty(&list))
7291 		rcu_barrier();
7292 
7293 	while (!list_empty(&list)) {
7294 		struct net_device *dev
7295 			= list_first_entry(&list, struct net_device, todo_list);
7296 		list_del(&dev->todo_list);
7297 
7298 		rtnl_lock();
7299 		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7300 		__rtnl_unlock();
7301 
7302 		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7303 			pr_err("network todo '%s' but state %d\n",
7304 			       dev->name, dev->reg_state);
7305 			dump_stack();
7306 			continue;
7307 		}
7308 
7309 		dev->reg_state = NETREG_UNREGISTERED;
7310 
7311 		netdev_wait_allrefs(dev);
7312 
7313 		/* paranoia */
7314 		BUG_ON(netdev_refcnt_read(dev));
7315 		BUG_ON(!list_empty(&dev->ptype_all));
7316 		BUG_ON(!list_empty(&dev->ptype_specific));
7317 		WARN_ON(rcu_access_pointer(dev->ip_ptr));
7318 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7319 		WARN_ON(dev->dn_ptr);
7320 
7321 		if (dev->destructor)
7322 			dev->destructor(dev);
7323 
7324 		/* Report a network device has been unregistered */
7325 		rtnl_lock();
7326 		dev_net(dev)->dev_unreg_count--;
7327 		__rtnl_unlock();
7328 		wake_up(&netdev_unregistering_wq);
7329 
7330 		/* Free network device */
7331 		kobject_put(&dev->dev.kobj);
7332 	}
7333 }
7334 
7335 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7336  * all the same fields in the same order as net_device_stats, with only
7337  * the type differing, but rtnl_link_stats64 may have additional fields
7338  * at the end for newer counters.
7339  */
7340 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7341 			     const struct net_device_stats *netdev_stats)
7342 {
7343 #if BITS_PER_LONG == 64
7344 	BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7345 	memcpy(stats64, netdev_stats, sizeof(*stats64));
7346 	/* zero out counters that only exist in rtnl_link_stats64 */
7347 	memset((char *)stats64 + sizeof(*netdev_stats), 0,
7348 	       sizeof(*stats64) - sizeof(*netdev_stats));
7349 #else
7350 	size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7351 	const unsigned long *src = (const unsigned long *)netdev_stats;
7352 	u64 *dst = (u64 *)stats64;
7353 
7354 	BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7355 	for (i = 0; i < n; i++)
7356 		dst[i] = src[i];
7357 	/* zero out counters that only exist in rtnl_link_stats64 */
7358 	memset((char *)stats64 + n * sizeof(u64), 0,
7359 	       sizeof(*stats64) - n * sizeof(u64));
7360 #endif
7361 }
7362 EXPORT_SYMBOL(netdev_stats_to_stats64);
7363 
7364 /**
7365  *	dev_get_stats	- get network device statistics
7366  *	@dev: device to get statistics from
7367  *	@storage: place to store stats
7368  *
7369  *	Get network statistics from device. Return @storage.
7370  *	The device driver may provide its own method by setting
7371  *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7372  *	otherwise the internal statistics structure is used.
7373  */
7374 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7375 					struct rtnl_link_stats64 *storage)
7376 {
7377 	const struct net_device_ops *ops = dev->netdev_ops;
7378 
7379 	if (ops->ndo_get_stats64) {
7380 		memset(storage, 0, sizeof(*storage));
7381 		ops->ndo_get_stats64(dev, storage);
7382 	} else if (ops->ndo_get_stats) {
7383 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7384 	} else {
7385 		netdev_stats_to_stats64(storage, &dev->stats);
7386 	}
7387 	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7388 	storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7389 	storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7390 	return storage;
7391 }
7392 EXPORT_SYMBOL(dev_get_stats);
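
/* Illustrative sketch (not part of the original source): a reader snapshots
 * the 64-bit counters into a local structure; dev_get_stats() fills and
 * returns @storage, so the result can be used directly.  Hypothetical
 * helper name.
 */
static u64 example_total_packets(struct net_device *dev)
{
	struct rtnl_link_stats64 stats;

	dev_get_stats(dev, &stats);
	return stats.rx_packets + stats.tx_packets;
}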
7393 
7394 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7395 {
7396 	struct netdev_queue *queue = dev_ingress_queue(dev);
7397 
7398 #ifdef CONFIG_NET_CLS_ACT
7399 	if (queue)
7400 		return queue;
7401 	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7402 	if (!queue)
7403 		return NULL;
7404 	netdev_init_one_queue(dev, queue, NULL);
7405 	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7406 	queue->qdisc_sleeping = &noop_qdisc;
7407 	rcu_assign_pointer(dev->ingress_queue, queue);
7408 #endif
7409 	return queue;
7410 }
7411 
7412 static const struct ethtool_ops default_ethtool_ops;
7413 
7414 void netdev_set_default_ethtool_ops(struct net_device *dev,
7415 				    const struct ethtool_ops *ops)
7416 {
7417 	if (dev->ethtool_ops == &default_ethtool_ops)
7418 		dev->ethtool_ops = ops;
7419 }
7420 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7421 
7422 void netdev_freemem(struct net_device *dev)
7423 {
7424 	char *addr = (char *)dev - dev->padded;
7425 
7426 	kvfree(addr);
7427 }
7428 
7429 /**
7430  *	alloc_netdev_mqs - allocate network device
7431  *	@sizeof_priv:		size of private data to allocate space for
7432  *	@name:			device name format string
7433  *	@name_assign_type: 	origin of device name
7434  *	@setup:			callback to initialize device
7435  *	@txqs:			the number of TX subqueues to allocate
7436  *	@rxqs:			the number of RX subqueues to allocate
7437  *
7438  *	Allocates a struct net_device with private data area for driver use
7439  *	and performs basic initialization.  Also allocates subqueue structs
7440  *	for each queue on the device.
7441  */
7442 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7443 		unsigned char name_assign_type,
7444 		void (*setup)(struct net_device *),
7445 		unsigned int txqs, unsigned int rxqs)
7446 {
7447 	struct net_device *dev;
7448 	size_t alloc_size;
7449 	struct net_device *p;
7450 
7451 	BUG_ON(strlen(name) >= sizeof(dev->name));
7452 
7453 	if (txqs < 1) {
7454 		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7455 		return NULL;
7456 	}
7457 
7458 #ifdef CONFIG_SYSFS
7459 	if (rxqs < 1) {
7460 		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7461 		return NULL;
7462 	}
7463 #endif
7464 
7465 	alloc_size = sizeof(struct net_device);
7466 	if (sizeof_priv) {
7467 		/* ensure 32-byte alignment of private area */
7468 		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7469 		alloc_size += sizeof_priv;
7470 	}
7471 	/* ensure 32-byte alignment of whole construct */
7472 	alloc_size += NETDEV_ALIGN - 1;
7473 
7474 	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7475 	if (!p)
7476 		p = vzalloc(alloc_size);
7477 	if (!p)
7478 		return NULL;
7479 
7480 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
7481 	dev->padded = (char *)dev - (char *)p;
7482 
7483 	dev->pcpu_refcnt = alloc_percpu(int);
7484 	if (!dev->pcpu_refcnt)
7485 		goto free_dev;
7486 
7487 	if (dev_addr_init(dev))
7488 		goto free_pcpu;
7489 
7490 	dev_mc_init(dev);
7491 	dev_uc_init(dev);
7492 
7493 	dev_net_set(dev, &init_net);
7494 
7495 	dev->gso_max_size = GSO_MAX_SIZE;
7496 	dev->gso_max_segs = GSO_MAX_SEGS;
7497 
7498 	INIT_LIST_HEAD(&dev->napi_list);
7499 	INIT_LIST_HEAD(&dev->unreg_list);
7500 	INIT_LIST_HEAD(&dev->close_list);
7501 	INIT_LIST_HEAD(&dev->link_watch_list);
7502 	INIT_LIST_HEAD(&dev->adj_list.upper);
7503 	INIT_LIST_HEAD(&dev->adj_list.lower);
7504 	INIT_LIST_HEAD(&dev->all_adj_list.upper);
7505 	INIT_LIST_HEAD(&dev->all_adj_list.lower);
7506 	INIT_LIST_HEAD(&dev->ptype_all);
7507 	INIT_LIST_HEAD(&dev->ptype_specific);
7508 	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7509 	setup(dev);
7510 
7511 	if (!dev->tx_queue_len) {
7512 		dev->priv_flags |= IFF_NO_QUEUE;
7513 		dev->tx_queue_len = 1;
7514 	}
7515 
7516 	dev->num_tx_queues = txqs;
7517 	dev->real_num_tx_queues = txqs;
7518 	if (netif_alloc_netdev_queues(dev))
7519 		goto free_all;
7520 
7521 #ifdef CONFIG_SYSFS
7522 	dev->num_rx_queues = rxqs;
7523 	dev->real_num_rx_queues = rxqs;
7524 	if (netif_alloc_rx_queues(dev))
7525 		goto free_all;
7526 #endif
7527 
7528 	strcpy(dev->name, name);
7529 	dev->name_assign_type = name_assign_type;
7530 	dev->group = INIT_NETDEV_GROUP;
7531 	if (!dev->ethtool_ops)
7532 		dev->ethtool_ops = &default_ethtool_ops;
7533 
7534 	nf_hook_ingress_init(dev);
7535 
7536 	return dev;
7537 
7538 free_all:
7539 	free_netdev(dev);
7540 	return NULL;
7541 
7542 free_pcpu:
7543 	free_percpu(dev->pcpu_refcnt);
7544 free_dev:
7545 	netdev_freemem(dev);
7546 	return NULL;
7547 }
7548 EXPORT_SYMBOL(alloc_netdev_mqs);
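
/* Illustrative sketch, not part of the original file: drivers normally go
 * through the alloc_netdev()/alloc_etherdev_mq() wrappers, which expand to
 * this call.  struct example_priv and example_setup() are hypothetical.
 */
#if 0
struct example_priv {
	spinlock_t lock;
};

static void example_setup(struct net_device *dev)
{
	ether_setup(dev);
}

static struct net_device *example_alloc(unsigned int queues)
{
	return alloc_netdev_mqs(sizeof(struct example_priv), "exm%d",
				NET_NAME_UNKNOWN, example_setup,
				queues, queues);
}
#endif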
7549 
7550 /**
7551  *	free_netdev - free network device
7552  *	@dev: device
7553  *
7554  *	This function does the last stage of destroying an allocated device
7555  * 	interface. The reference to the device object is released.
7556  *	If this is the last reference then it will be freed.
7557  *	Must be called in process context.
7558  */
7559 void free_netdev(struct net_device *dev)
7560 {
7561 	struct napi_struct *p, *n;
7562 
7563 	might_sleep();
7564 	netif_free_tx_queues(dev);
7565 #ifdef CONFIG_SYSFS
7566 	kvfree(dev->_rx);
7567 #endif
7568 
7569 	kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7570 
7571 	/* Flush device addresses */
7572 	dev_addr_flush(dev);
7573 
7574 	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7575 		netif_napi_del(p);
7576 
7577 	free_percpu(dev->pcpu_refcnt);
7578 	dev->pcpu_refcnt = NULL;
7579 
7580 	/*  Compatibility with error handling in drivers */
7581 	if (dev->reg_state == NETREG_UNINITIALIZED) {
7582 		netdev_freemem(dev);
7583 		return;
7584 	}
7585 
7586 	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7587 	dev->reg_state = NETREG_RELEASED;
7588 
7589 	/* will free via device release */
7590 	put_device(&dev->dev);
7591 }
7592 EXPORT_SYMBOL(free_netdev);
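
/* Illustrative sketch, not part of the original file: the usual pairing in
 * a driver probe path - free_netdev() releases what alloc_etherdev() (or
 * any alloc_netdev* variant) allocated if register_netdev() fails.
 * example_probe() is a hypothetical function.
 */
#if 0
static int example_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(0);
	if (!dev)
		return -ENOMEM;

	err = register_netdev(dev);
	if (err)
		free_netdev(dev);
	return err;
}
#endif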
7593 
7594 /**
7595  *	synchronize_net -  Synchronize with packet receive processing
7596  *
7597  *	Wait for packets currently being received to be done.
7598  *	Does not block later packets from starting.
7599  */
7600 void synchronize_net(void)
7601 {
7602 	might_sleep();
7603 	if (rtnl_is_locked())
7604 		synchronize_rcu_expedited();
7605 	else
7606 		synchronize_rcu();
7607 }
7608 EXPORT_SYMBOL(synchronize_net);
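
/* Illustrative sketch, not part of the original file: after unpublishing an
 * RCU-protected pointer consumed on the receive path, a caller waits for
 * in-flight readers before freeing the object.  example_hook and
 * example_remove_hook() are hypothetical.
 */
#if 0
static struct list_head __rcu *example_hook;

static void example_remove_hook(struct list_head *hook)
{
	RCU_INIT_POINTER(example_hook, NULL);
	synchronize_net();	/* wait for receive paths still using @hook */
	kfree(hook);
}
#endif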
7609 
7610 /**
7611  *	unregister_netdevice_queue - remove device from the kernel
7612  *	@dev: device
7613  *	@head: list
7614  *
7615  *	This function shuts down a device interface and removes it
7616  *	from the kernel tables.
7617  *	If @head is not NULL, the device is queued to be unregistered later.
7618  *
7619  *	Callers must hold the rtnl semaphore.  You may want
7620  *	unregister_netdev() instead of this.
7621  */
7622 
7623 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7624 {
7625 	ASSERT_RTNL();
7626 
7627 	if (head) {
7628 		list_move_tail(&dev->unreg_list, head);
7629 	} else {
7630 		rollback_registered(dev);
7631 		/* Finish processing unregister after unlock */
7632 		net_set_todo(dev);
7633 	}
7634 }
7635 EXPORT_SYMBOL(unregister_netdevice_queue);
7636 
7637 /**
7638  *	unregister_netdevice_many - unregister many devices
7639  *	@head: list of devices
7640  *
7641  *  Note: As most callers use a stack-allocated list_head,
7642  *  we force a list_del() to make sure the stack won't be corrupted later.
7643  */
7644 void unregister_netdevice_many(struct list_head *head)
7645 {
7646 	struct net_device *dev;
7647 
7648 	if (!list_empty(head)) {
7649 		rollback_registered_many(head);
7650 		list_for_each_entry(dev, head, unreg_list)
7651 			net_set_todo(dev);
7652 		list_del(head);
7653 	}
7654 }
7655 EXPORT_SYMBOL(unregister_netdevice_many);
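
/* Illustrative sketch, not part of the original file: batching two
 * teardowns under a single rtnl_lock()/rtnl_unlock() pair; the on-stack
 * list head is safe because of the list_del() noted above.
 * example_destroy_pair() is hypothetical.
 */
#if 0
static void example_destroy_pair(struct net_device *a, struct net_device *b)
{
	LIST_HEAD(kill_list);

	rtnl_lock();
	unregister_netdevice_queue(a, &kill_list);
	unregister_netdevice_queue(b, &kill_list);
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();
}
#endif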
7656 
7657 /**
7658  *	unregister_netdev - remove device from the kernel
7659  *	@dev: device
7660  *
7661  *	This function shuts down a device interface and removes it
7662  *	from the kernel tables.
7663  *
7664  *	This is just a wrapper for unregister_netdevice that takes
7665  *	the rtnl semaphore.  In general you want to use this and not
7666  *	unregister_netdevice.
7667  */
7668 void unregister_netdev(struct net_device *dev)
7669 {
7670 	rtnl_lock();
7671 	unregister_netdevice(dev);
7672 	rtnl_unlock();
7673 }
7674 EXPORT_SYMBOL(unregister_netdev);
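
/* Illustrative sketch, not part of the original file: a typical driver
 * remove path uses this rtnl-taking wrapper and then drops the final
 * reference with free_netdev().  example_remove() is hypothetical.
 */
#if 0
static void example_remove(struct net_device *dev)
{
	unregister_netdev(dev);
	free_netdev(dev);
}
#endif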
7675 
7676 /**
7677  *	dev_change_net_namespace - move device to a different network namespace
7678  *	@dev: device
7679  *	@net: network namespace
7680  *	@pat: If not NULL name pattern to try if the current device name
7681  *	      is already taken in the destination network namespace.
7682  *
7683  *	This function shuts down a device interface and moves it
7684  *	to a new network namespace. On success 0 is returned; on
7685  *	failure a negative errno code is returned.
7686  *
7687  *	Callers must hold the rtnl semaphore.
7688  */
7689 
7690 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7691 {
7692 	int err;
7693 
7694 	ASSERT_RTNL();
7695 
7696 	/* Don't allow namespace local devices to be moved. */
7697 	err = -EINVAL;
7698 	if (dev->features & NETIF_F_NETNS_LOCAL)
7699 		goto out;
7700 
7701 	/* Ensure the device has been registered */
7702 	if (dev->reg_state != NETREG_REGISTERED)
7703 		goto out;
7704 
7705 	/* Get out if there is nothing to do */
7706 	err = 0;
7707 	if (net_eq(dev_net(dev), net))
7708 		goto out;
7709 
7710 	/* Pick the destination device name, and ensure
7711 	 * we can use it in the destination network namespace.
7712 	 */
7713 	err = -EEXIST;
7714 	if (__dev_get_by_name(net, dev->name)) {
7715 		/* We get here if we can't use the current device name */
7716 		if (!pat)
7717 			goto out;
7718 		if (dev_get_valid_name(net, dev, pat) < 0)
7719 			goto out;
7720 	}
7721 
7722 	/*
7723 	 * And now a mini version of register_netdevice and unregister_netdevice.
7724 	 */
7725 
7726 	/* If device is running close it first. */
7727 	dev_close(dev);
7728 
7729 	/* And unlink it from device chain */
7730 	err = -ENODEV;
7731 	unlist_netdevice(dev);
7732 
7733 	synchronize_net();
7734 
7735 	/* Shutdown queueing discipline. */
7736 	dev_shutdown(dev);
7737 
7738 	/* Notify protocols that we are about to destroy
7739 	   this device. They should clean up all their state.
7740 
7741 	   Note that dev->reg_state stays at NETREG_REGISTERED.
7742 	   This is intentional, so that 8021q and macvlan know
7743 	   the device is just moving and can keep their slaves up.
7744 	*/
7745 	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7746 	rcu_barrier();
7747 	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7748 	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7749 
7750 	/*
7751 	 *	Flush the unicast and multicast chains
7752 	 */
7753 	dev_uc_flush(dev);
7754 	dev_mc_flush(dev);
7755 
7756 	/* Send a netdev-removed uevent to the old namespace */
7757 	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7758 	netdev_adjacent_del_links(dev);
7759 
7760 	/* Actually switch the network namespace */
7761 	dev_net_set(dev, net);
7762 
7763 	/* If there is an ifindex conflict assign a new one */
7764 	if (__dev_get_by_index(net, dev->ifindex))
7765 		dev->ifindex = dev_new_index(net);
7766 
7767 	/* Send a netdev-add uevent to the new namespace */
7768 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7769 	netdev_adjacent_add_links(dev);
7770 
7771 	/* Fixup kobjects */
7772 	err = device_rename(&dev->dev, dev->name);
7773 	WARN_ON(err);
7774 
7775 	/* Add the device back in the hashes */
7776 	list_netdevice(dev);
7777 
7778 	/* Notify protocols that a new device appeared. */
7779 	call_netdevice_notifiers(NETDEV_REGISTER, dev);
7780 
7781 	/*
7782 	 *	Prevent userspace races by waiting until the network
7783 	 *	device is fully setup before sending notifications.
7784 	 */
7785 	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7786 
7787 	synchronize_net();
7788 	err = 0;
7789 out:
7790 	return err;
7791 }
7792 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
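
/* Illustrative sketch, not part of the original file: moving an interface
 * into another namespace under rtnl, with a "dev%d" fallback pattern on a
 * name clash, much as default_device_exit() does below.  example_move()
 * is hypothetical.
 */
#if 0
static int example_move(struct net_device *dev, struct net *dst)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, dst, "dev%d");
	rtnl_unlock();
	return err;
}
#endif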
7793 
7794 static int dev_cpu_callback(struct notifier_block *nfb,
7795 			    unsigned long action,
7796 			    void *ocpu)
7797 {
7798 	struct sk_buff **list_skb;
7799 	struct sk_buff *skb;
7800 	unsigned int cpu, oldcpu = (unsigned long)ocpu;
7801 	struct softnet_data *sd, *oldsd;
7802 
7803 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7804 		return NOTIFY_OK;
7805 
7806 	local_irq_disable();
7807 	cpu = smp_processor_id();
7808 	sd = &per_cpu(softnet_data, cpu);
7809 	oldsd = &per_cpu(softnet_data, oldcpu);
7810 
7811 	/* Find end of our completion_queue. */
7812 	list_skb = &sd->completion_queue;
7813 	while (*list_skb)
7814 		list_skb = &(*list_skb)->next;
7815 	/* Append completion queue from offline CPU. */
7816 	*list_skb = oldsd->completion_queue;
7817 	oldsd->completion_queue = NULL;
7818 
7819 	/* Append output queue from offline CPU. */
7820 	if (oldsd->output_queue) {
7821 		*sd->output_queue_tailp = oldsd->output_queue;
7822 		sd->output_queue_tailp = oldsd->output_queue_tailp;
7823 		oldsd->output_queue = NULL;
7824 		oldsd->output_queue_tailp = &oldsd->output_queue;
7825 	}
7826 	/* Append NAPI poll list from offline CPU, with one exception:
7827 	 * process_backlog() must be called by cpu owning percpu backlog.
7828 	 * We properly handle process_queue & input_pkt_queue later.
7829 	 */
7830 	while (!list_empty(&oldsd->poll_list)) {
7831 		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7832 							    struct napi_struct,
7833 							    poll_list);
7834 
7835 		list_del_init(&napi->poll_list);
7836 		if (napi->poll == process_backlog)
7837 			napi->state = 0;
7838 		else
7839 			____napi_schedule(sd, napi);
7840 	}
7841 
7842 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
7843 	local_irq_enable();
7844 
7845 	/* Process offline CPU's input_pkt_queue */
7846 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7847 		netif_rx_ni(skb);
7848 		input_queue_head_incr(oldsd);
7849 	}
7850 	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7851 		netif_rx_ni(skb);
7852 		input_queue_head_incr(oldsd);
7853 	}
7854 
7855 	return NOTIFY_OK;
7856 }
7857 
7858 
7859 /**
7860  *	netdev_increment_features - increment feature set by one
7861  *	@all: current feature set
7862  *	@one: new feature set
7863  *	@mask: mask feature set
7864  *
7865  *	Computes a new feature set after adding a device with feature set
7866  *	@one to the master device with current feature set @all.  Will not
7867  *	enable anything that is off in @mask. Returns the new feature set.
7868  */
7869 netdev_features_t netdev_increment_features(netdev_features_t all,
7870 	netdev_features_t one, netdev_features_t mask)
7871 {
7872 	if (mask & NETIF_F_HW_CSUM)
7873 		mask |= NETIF_F_CSUM_MASK;
7874 	mask |= NETIF_F_VLAN_CHALLENGED;
7875 
7876 	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
7877 	all &= one | ~NETIF_F_ALL_FOR_ALL;
7878 
7879 	/* If one device supports hw checksumming, set for all. */
7880 	if (all & NETIF_F_HW_CSUM)
7881 		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
7882 
7883 	return all;
7884 }
7885 EXPORT_SYMBOL(netdev_increment_features);
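
/* Illustrative sketch, not part of the original file: a master-style driver
 * (in the spirit of bonding/team) could fold its lower devices' feature
 * sets together like this.  example_compute_features() and the choice of
 * NETIF_F_ALL_FOR_ALL as the starting set are assumptions for the example.
 */
#if 0
static netdev_features_t example_compute_features(struct net_device *master)
{
	struct net_device *lower;
	struct list_head *iter;
	netdev_features_t features = NETIF_F_ALL_FOR_ALL;

	netdev_for_each_lower_dev(master, lower, iter)
		features = netdev_increment_features(features,
						     lower->features,
						     master->features);
	return features;
}
#endif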
7886 
7887 static struct hlist_head * __net_init netdev_create_hash(void)
7888 {
7889 	int i;
7890 	struct hlist_head *hash;
7891 
7892 	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7893 	if (hash != NULL)
7894 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
7895 			INIT_HLIST_HEAD(&hash[i]);
7896 
7897 	return hash;
7898 }
7899 
7900 /* Initialize per network namespace state */
7901 static int __net_init netdev_init(struct net *net)
7902 {
7903 	if (net != &init_net)
7904 		INIT_LIST_HEAD(&net->dev_base_head);
7905 
7906 	net->dev_name_head = netdev_create_hash();
7907 	if (net->dev_name_head == NULL)
7908 		goto err_name;
7909 
7910 	net->dev_index_head = netdev_create_hash();
7911 	if (net->dev_index_head == NULL)
7912 		goto err_idx;
7913 
7914 	return 0;
7915 
7916 err_idx:
7917 	kfree(net->dev_name_head);
7918 err_name:
7919 	return -ENOMEM;
7920 }
7921 
7922 /**
7923  *	netdev_drivername - network driver for the device
7924  *	@dev: network device
7925  *
7926  *	Determine network driver for device.
7927  */
7928 const char *netdev_drivername(const struct net_device *dev)
7929 {
7930 	const struct device_driver *driver;
7931 	const struct device *parent;
7932 	const char *empty = "";
7933 
7934 	parent = dev->dev.parent;
7935 	if (!parent)
7936 		return empty;
7937 
7938 	driver = parent->driver;
7939 	if (driver && driver->name)
7940 		return driver->name;
7941 	return empty;
7942 }
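
/* Illustrative sketch, not part of the original file: the tx timeout
 * watchdog uses this helper to name the offending driver; a message of a
 * similar shape is shown here.  example_warn_stall() is hypothetical.
 */
#if 0
static void example_warn_stall(struct net_device *dev)
{
	netdev_err(dev, "transmit queue timed out (driver %s)\n",
		   netdev_drivername(dev));
}
#endif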
7943 
7944 static void __netdev_printk(const char *level, const struct net_device *dev,
7945 			    struct va_format *vaf)
7946 {
7947 	if (dev && dev->dev.parent) {
7948 		dev_printk_emit(level[1] - '0',
7949 				dev->dev.parent,
7950 				"%s %s %s%s: %pV",
7951 				dev_driver_string(dev->dev.parent),
7952 				dev_name(dev->dev.parent),
7953 				netdev_name(dev), netdev_reg_state(dev),
7954 				vaf);
7955 	} else if (dev) {
7956 		printk("%s%s%s: %pV",
7957 		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
7958 	} else {
7959 		printk("%s(NULL net_device): %pV", level, vaf);
7960 	}
7961 }
7962 
7963 void netdev_printk(const char *level, const struct net_device *dev,
7964 		   const char *format, ...)
7965 {
7966 	struct va_format vaf;
7967 	va_list args;
7968 
7969 	va_start(args, format);
7970 
7971 	vaf.fmt = format;
7972 	vaf.va = &args;
7973 
7974 	__netdev_printk(level, dev, &vaf);
7975 
7976 	va_end(args);
7977 }
7978 EXPORT_SYMBOL(netdev_printk);
7979 
7980 #define define_netdev_printk_level(func, level)			\
7981 void func(const struct net_device *dev, const char *fmt, ...)	\
7982 {								\
7983 	struct va_format vaf;					\
7984 	va_list args;						\
7985 								\
7986 	va_start(args, fmt);					\
7987 								\
7988 	vaf.fmt = fmt;						\
7989 	vaf.va = &args;						\
7990 								\
7991 	__netdev_printk(level, dev, &vaf);			\
7992 								\
7993 	va_end(args);						\
7994 }								\
7995 EXPORT_SYMBOL(func);
7996 
7997 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7998 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7999 define_netdev_printk_level(netdev_crit, KERN_CRIT);
8000 define_netdev_printk_level(netdev_err, KERN_ERR);
8001 define_netdev_printk_level(netdev_warn, KERN_WARNING);
8002 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8003 define_netdev_printk_level(netdev_info, KERN_INFO);
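
/* Illustrative sketch, not part of the original file: the per-level helpers
 * generated above prefix messages with bus, driver and interface names.
 * example_link_change() is hypothetical.
 */
#if 0
static void example_link_change(struct net_device *dev, bool up)
{
	if (up)
		netdev_info(dev, "link is up\n");
	else
		netdev_warn(dev, "link is down\n");
}
#endif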
8004 
8005 static void __net_exit netdev_exit(struct net *net)
8006 {
8007 	kfree(net->dev_name_head);
8008 	kfree(net->dev_index_head);
8009 }
8010 
8011 static struct pernet_operations __net_initdata netdev_net_ops = {
8012 	.init = netdev_init,
8013 	.exit = netdev_exit,
8014 };
8015 
8016 static void __net_exit default_device_exit(struct net *net)
8017 {
8018 	struct net_device *dev, *aux;
8019 	/*
8020 	 * Push all migratable network devices back to the
8021 	 * initial network namespace
8022 	 */
8023 	rtnl_lock();
8024 	for_each_netdev_safe(net, dev, aux) {
8025 		int err;
8026 		char fb_name[IFNAMSIZ];
8027 
8028 		/* Ignore unmovable devices (e.g. loopback) */
8029 		if (dev->features & NETIF_F_NETNS_LOCAL)
8030 			continue;
8031 
8032 		/* Leave virtual devices for the generic cleanup */
8033 		if (dev->rtnl_link_ops)
8034 			continue;
8035 
8036 		/* Push remaining network devices to init_net */
8037 		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8038 		err = dev_change_net_namespace(dev, &init_net, fb_name);
8039 		if (err) {
8040 			pr_emerg("%s: failed to move %s to init_net: %d\n",
8041 				 __func__, dev->name, err);
8042 			BUG();
8043 		}
8044 	}
8045 	rtnl_unlock();
8046 }
8047 
8048 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8049 {
8050 	/* Return with the rtnl_lock held when there are no network
8051 	 * devices unregistering in any network namespace in net_list.
8052 	 */
8053 	struct net *net;
8054 	bool unregistering;
8055 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
8056 
8057 	add_wait_queue(&netdev_unregistering_wq, &wait);
8058 	for (;;) {
8059 		unregistering = false;
8060 		rtnl_lock();
8061 		list_for_each_entry(net, net_list, exit_list) {
8062 			if (net->dev_unreg_count > 0) {
8063 				unregistering = true;
8064 				break;
8065 			}
8066 		}
8067 		if (!unregistering)
8068 			break;
8069 		__rtnl_unlock();
8070 
8071 		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8072 	}
8073 	remove_wait_queue(&netdev_unregistering_wq, &wait);
8074 }
8075 
8076 static void __net_exit default_device_exit_batch(struct list_head *net_list)
8077 {
8078 	/* At exit, all network devices must be removed from a network
8079 	 * namespace.  Do this in the reverse order of registration.
8080 	 * Do this across as many network namespaces as possible to
8081 	 * improve batching efficiency.
8082 	 */
8083 	struct net_device *dev;
8084 	struct net *net;
8085 	LIST_HEAD(dev_kill_list);
8086 
8087 	/* To prevent network device cleanup code from dereferencing
8088 	 * loopback devices or network devices that have been freed,
8089 	 * wait here for all pending unregistrations to complete
8090 	 * before unregistering the loopback device and allowing the
8091 	 * network namespace to be freed.
8092 	 *
8093 	 * The netdev todo list containing all network device
8094 	 * unregistrations that happen in default_device_exit_batch
8095 	 * will run in the rtnl_unlock() at the end of
8096 	 * default_device_exit_batch.
8097 	 */
8098 	rtnl_lock_unregistering(net_list);
8099 	list_for_each_entry(net, net_list, exit_list) {
8100 		for_each_netdev_reverse(net, dev) {
8101 			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8102 				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8103 			else
8104 				unregister_netdevice_queue(dev, &dev_kill_list);
8105 		}
8106 	}
8107 	unregister_netdevice_many(&dev_kill_list);
8108 	rtnl_unlock();
8109 }
8110 
8111 static struct pernet_operations __net_initdata default_device_ops = {
8112 	.exit = default_device_exit,
8113 	.exit_batch = default_device_exit_batch,
8114 };
8115 
8116 /*
8117  *	Initialize the DEV module. At boot time this walks the device list and
8118  *	unhooks any devices that fail to initialize (normally hardware not
8119  *	present) and leaves us with a valid list of present and active devices.
8120  */
8122 
8123 /*
8124  *       This is called single-threaded during boot, so no need
8125  *       to take the rtnl semaphore.
8126  */
8127 static int __init net_dev_init(void)
8128 {
8129 	int i, rc = -ENOMEM;
8130 
8131 	BUG_ON(!dev_boot_phase);
8132 
8133 	if (dev_proc_init())
8134 		goto out;
8135 
8136 	if (netdev_kobject_init())
8137 		goto out;
8138 
8139 	INIT_LIST_HEAD(&ptype_all);
8140 	for (i = 0; i < PTYPE_HASH_SIZE; i++)
8141 		INIT_LIST_HEAD(&ptype_base[i]);
8142 
8143 	INIT_LIST_HEAD(&offload_base);
8144 
8145 	if (register_pernet_subsys(&netdev_net_ops))
8146 		goto out;
8147 
8148 	/*
8149 	 *	Initialise the packet receive queues.
8150 	 */
8151 
8152 	for_each_possible_cpu(i) {
8153 		struct softnet_data *sd = &per_cpu(softnet_data, i);
8154 
8155 		skb_queue_head_init(&sd->input_pkt_queue);
8156 		skb_queue_head_init(&sd->process_queue);
8157 		INIT_LIST_HEAD(&sd->poll_list);
8158 		sd->output_queue_tailp = &sd->output_queue;
8159 #ifdef CONFIG_RPS
8160 		sd->csd.func = rps_trigger_softirq;
8161 		sd->csd.info = sd;
8162 		sd->cpu = i;
8163 #endif
8164 
8165 		sd->backlog.poll = process_backlog;
8166 		sd->backlog.weight = weight_p;
8167 	}
8168 
8169 	dev_boot_phase = 0;
8170 
8171 	/* The loopback device is special: if any other network device
8172 	 * is present in a network namespace, the loopback device must
8173 	 * be present. Since we now dynamically allocate and free the
8174 	 * loopback device, maintain this invariant by keeping the
8175 	 * loopback device as the first device on the list of network
8176 	 * devices, ensuring that the loopback device is the first
8177 	 * device that appears and the last network device that
8178 	 * disappears.
8179 	 */
8180 	if (register_pernet_device(&loopback_net_ops))
8181 		goto out;
8182 
8183 	if (register_pernet_device(&default_device_ops))
8184 		goto out;
8185 
8186 	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8187 	open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8188 
8189 	hotcpu_notifier(dev_cpu_callback, 0);
8190 	dst_subsys_init();
8191 	rc = 0;
8192 out:
8193 	return rc;
8194 }
8195 
8196 subsys_initcall(net_dev_init);
8197