1 /* 2 * NET3 Protocol independent device support routines. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public License 6 * as published by the Free Software Foundation; either version 7 * 2 of the License, or (at your option) any later version. 8 * 9 * Derived from the non IP parts of dev.c 1.0.19 10 * Authors: Ross Biro 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * 14 * Additional Authors: 15 * Florian la Roche <rzsfl@rz.uni-sb.de> 16 * Alan Cox <gw4pts@gw4pts.ampr.org> 17 * David Hinds <dahinds@users.sourceforge.net> 18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> 19 * Adam Sulmicki <adam@cfar.umd.edu> 20 * Pekka Riikonen <priikone@poesidon.pspt.fi> 21 * 22 * Changes: 23 * D.J. Barrow : Fixed bug where dev->refcnt gets set 24 * to 2 if register_netdev gets called 25 * before net_dev_init & also removed a 26 * few lines of code in the process. 27 * Alan Cox : device private ioctl copies fields back. 28 * Alan Cox : Transmit queue code does relevant 29 * stunts to keep the queue safe. 30 * Alan Cox : Fixed double lock. 31 * Alan Cox : Fixed promisc NULL pointer trap 32 * ???????? : Support the full private ioctl range 33 * Alan Cox : Moved ioctl permission check into 34 * drivers 35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI 36 * Alan Cox : 100 backlog just doesn't cut it when 37 * you start doing multicast video 8) 38 * Alan Cox : Rewrote net_bh and list manager. 39 * Alan Cox : Fix ETH_P_ALL echoback lengths. 40 * Alan Cox : Took out transmit every packet pass 41 * Saved a few bytes in the ioctl handler 42 * Alan Cox : Network driver sets packet type before 43 * calling netif_rx. Saves a function 44 * call a packet. 45 * Alan Cox : Hashed net_bh() 46 * Richard Kooijman: Timestamp fixes. 47 * Alan Cox : Wrong field in SIOCGIFDSTADDR 48 * Alan Cox : Device lock protection. 49 * Alan Cox : Fixed nasty side effect of device close 50 * changes. 51 * Rudi Cilibrasi : Pass the right thing to 52 * set_mac_address() 53 * Dave Miller : 32bit quantity for the device lock to 54 * make it work out on a Sparc. 55 * Bjorn Ekwall : Added KERNELD hack. 56 * Alan Cox : Cleaned up the backlog initialise. 57 * Craig Metz : SIOCGIFCONF fix if space for under 58 * 1 device. 59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there 60 * is no device open function. 61 * Andi Kleen : Fix error reporting for SIOCGIFCONF 62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF 63 * Cyrus Durgin : Cleaned for KMOD 64 * Adam Sulmicki : Bug Fix : Network Device Unload 65 * A network device unload needs to purge 66 * the backlog queue. 
67 * Paul Rusty Russell : SIOCSIFNAME 68 * Pekka Riikonen : Netdev boot-time settings code 69 * Andrew Morton : Make unregister_netdevice wait 70 * indefinitely on dev->refcnt 71 * J Hadi Salim : - Backlog queue sampling 72 * - netif_rx() feedback 73 */ 74 75 #include <asm/uaccess.h> 76 #include <linux/bitops.h> 77 #include <linux/capability.h> 78 #include <linux/cpu.h> 79 #include <linux/types.h> 80 #include <linux/kernel.h> 81 #include <linux/hash.h> 82 #include <linux/slab.h> 83 #include <linux/sched.h> 84 #include <linux/mutex.h> 85 #include <linux/string.h> 86 #include <linux/mm.h> 87 #include <linux/socket.h> 88 #include <linux/sockios.h> 89 #include <linux/errno.h> 90 #include <linux/interrupt.h> 91 #include <linux/if_ether.h> 92 #include <linux/netdevice.h> 93 #include <linux/etherdevice.h> 94 #include <linux/ethtool.h> 95 #include <linux/notifier.h> 96 #include <linux/skbuff.h> 97 #include <linux/bpf.h> 98 #include <net/net_namespace.h> 99 #include <net/sock.h> 100 #include <net/busy_poll.h> 101 #include <linux/rtnetlink.h> 102 #include <linux/stat.h> 103 #include <net/dst.h> 104 #include <net/dst_metadata.h> 105 #include <net/pkt_sched.h> 106 #include <net/checksum.h> 107 #include <net/xfrm.h> 108 #include <linux/highmem.h> 109 #include <linux/init.h> 110 #include <linux/module.h> 111 #include <linux/netpoll.h> 112 #include <linux/rcupdate.h> 113 #include <linux/delay.h> 114 #include <net/iw_handler.h> 115 #include <asm/current.h> 116 #include <linux/audit.h> 117 #include <linux/dmaengine.h> 118 #include <linux/err.h> 119 #include <linux/ctype.h> 120 #include <linux/if_arp.h> 121 #include <linux/if_vlan.h> 122 #include <linux/ip.h> 123 #include <net/ip.h> 124 #include <net/mpls.h> 125 #include <linux/ipv6.h> 126 #include <linux/in.h> 127 #include <linux/jhash.h> 128 #include <linux/random.h> 129 #include <trace/events/napi.h> 130 #include <trace/events/net.h> 131 #include <trace/events/skb.h> 132 #include <linux/pci.h> 133 #include <linux/inetdevice.h> 134 #include <linux/cpu_rmap.h> 135 #include <linux/static_key.h> 136 #include <linux/hashtable.h> 137 #include <linux/vmalloc.h> 138 #include <linux/if_macvlan.h> 139 #include <linux/errqueue.h> 140 #include <linux/hrtimer.h> 141 #include <linux/netfilter_ingress.h> 142 #include <linux/sctp.h> 143 #include <linux/crash_dump.h> 144 145 #include "net-sysfs.h" 146 147 /* Instead of increasing this, you should create a hash table. */ 148 #define MAX_GRO_SKBS 8 149 150 /* This should be increased if a protocol with a bigger head is added. */ 151 #define GRO_MAX_HEAD (MAX_HEADER + 128) 152 153 static DEFINE_SPINLOCK(ptype_lock); 154 static DEFINE_SPINLOCK(offload_lock); 155 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; 156 struct list_head ptype_all __read_mostly; /* Taps */ 157 static struct list_head offload_base __read_mostly; 158 159 static int netif_rx_internal(struct sk_buff *skb); 160 static int call_netdevice_notifiers_info(unsigned long val, 161 struct net_device *dev, 162 struct netdev_notifier_info *info); 163 164 /* 165 * The @dev_base_head list is protected by @dev_base_lock and the rtnl 166 * semaphore. 167 * 168 * Pure readers hold dev_base_lock for reading, or rcu_read_lock() 169 * 170 * Writers must hold the rtnl semaphore while they loop through the 171 * dev_base_head list, and hold dev_base_lock for writing when they do the 172 * actual updates. This allows pure readers to access the list even 173 * while a writer is preparing to update it. 
174 * 175 * To put it another way, dev_base_lock is held for writing only to 176 * protect against pure readers; the rtnl semaphore provides the 177 * protection against other writers. 178 * 179 * See, for example usages, register_netdevice() and 180 * unregister_netdevice(), which must be called with the rtnl 181 * semaphore held. 182 */ 183 DEFINE_RWLOCK(dev_base_lock); 184 EXPORT_SYMBOL(dev_base_lock); 185 186 /* protects napi_hash addition/deletion and napi_gen_id */ 187 static DEFINE_SPINLOCK(napi_hash_lock); 188 189 static unsigned int napi_gen_id = NR_CPUS; 190 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); 191 192 static seqcount_t devnet_rename_seq; 193 194 static inline void dev_base_seq_inc(struct net *net) 195 { 196 while (++net->dev_base_seq == 0); 197 } 198 199 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) 200 { 201 unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ)); 202 203 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; 204 } 205 206 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) 207 { 208 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; 209 } 210 211 static inline void rps_lock(struct softnet_data *sd) 212 { 213 #ifdef CONFIG_RPS 214 spin_lock(&sd->input_pkt_queue.lock); 215 #endif 216 } 217 218 static inline void rps_unlock(struct softnet_data *sd) 219 { 220 #ifdef CONFIG_RPS 221 spin_unlock(&sd->input_pkt_queue.lock); 222 #endif 223 } 224 225 /* Device list insertion */ 226 static void list_netdevice(struct net_device *dev) 227 { 228 struct net *net = dev_net(dev); 229 230 ASSERT_RTNL(); 231 232 write_lock_bh(&dev_base_lock); 233 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); 234 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); 235 hlist_add_head_rcu(&dev->index_hlist, 236 dev_index_hash(net, dev->ifindex)); 237 write_unlock_bh(&dev_base_lock); 238 239 dev_base_seq_inc(net); 240 } 241 242 /* Device list removal 243 * caller must respect a RCU grace period before freeing/reusing dev 244 */ 245 static void unlist_netdevice(struct net_device *dev) 246 { 247 ASSERT_RTNL(); 248 249 /* Unlink dev from the device chain */ 250 write_lock_bh(&dev_base_lock); 251 list_del_rcu(&dev->dev_list); 252 hlist_del_rcu(&dev->name_hlist); 253 hlist_del_rcu(&dev->index_hlist); 254 write_unlock_bh(&dev_base_lock); 255 256 dev_base_seq_inc(dev_net(dev)); 257 } 258 259 /* 260 * Our notifier list 261 */ 262 263 static RAW_NOTIFIER_HEAD(netdev_chain); 264 265 /* 266 * Device drivers call our routines to queue packets here. We empty the 267 * queue in the local softnet handler. 
268 */ 269 270 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); 271 EXPORT_PER_CPU_SYMBOL(softnet_data); 272 273 #ifdef CONFIG_LOCKDEP 274 /* 275 * register_netdevice() inits txq->_xmit_lock and sets lockdep class 276 * according to dev->type 277 */ 278 static const unsigned short netdev_lock_type[] = 279 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, 280 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, 281 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, 282 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, 283 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, 284 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, 285 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, 286 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, 287 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, 288 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, 289 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, 290 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, 291 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, 292 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, 293 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; 294 295 static const char *const netdev_lock_name[] = 296 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", 297 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", 298 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", 299 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", 300 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", 301 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", 302 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", 303 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", 304 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", 305 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", 306 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", 307 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", 308 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", 309 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", 310 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; 311 312 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; 313 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; 314 315 static inline unsigned short netdev_lock_pos(unsigned short dev_type) 316 { 317 int i; 318 319 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) 320 if (netdev_lock_type[i] == dev_type) 321 return i; 322 /* the last key is used by default */ 323 return ARRAY_SIZE(netdev_lock_type) - 1; 324 } 325 326 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, 327 unsigned short dev_type) 328 { 329 int i; 330 331 i = netdev_lock_pos(dev_type); 332 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], 333 netdev_lock_name[i]); 334 } 335 336 static inline void netdev_set_addr_lockdep_class(struct net_device *dev) 337 { 338 int i; 339 340 i = netdev_lock_pos(dev->type); 341 lockdep_set_class_and_name(&dev->addr_list_lock, 342 &netdev_addr_lock_key[i], 343 netdev_lock_name[i]); 344 } 345 #else 346 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, 347 unsigned short dev_type) 348 { 349 } 350 static inline void netdev_set_addr_lockdep_class(struct net_device *dev) 351 { 352 } 353 #endif 354 355 /******************************************************************************* 356 357 
Protocol management and registration routines 358 359 *******************************************************************************/ 360 361 /* 362 * Add a protocol ID to the list. Now that the input handler is 363 * smarter we can dispense with all the messy stuff that used to be 364 * here. 365 * 366 * BEWARE!!! Protocol handlers, mangling input packets, 367 * MUST BE last in hash buckets and checking protocol handlers 368 * MUST start from promiscuous ptype_all chain in net_bh. 369 * It is true now, do not change it. 370 * Explanation follows: if protocol handler, mangling packet, will 371 * be the first on list, it is not able to sense, that packet 372 * is cloned and should be copied-on-write, so that it will 373 * change it and subsequent readers will get broken packet. 374 * --ANK (980803) 375 */ 376 377 static inline struct list_head *ptype_head(const struct packet_type *pt) 378 { 379 if (pt->type == htons(ETH_P_ALL)) 380 return pt->dev ? &pt->dev->ptype_all : &ptype_all; 381 else 382 return pt->dev ? &pt->dev->ptype_specific : 383 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; 384 } 385 386 /** 387 * dev_add_pack - add packet handler 388 * @pt: packet type declaration 389 * 390 * Add a protocol handler to the networking stack. The passed &packet_type 391 * is linked into kernel lists and may not be freed until it has been 392 * removed from the kernel lists. 393 * 394 * This call does not sleep therefore it can not 395 * guarantee all CPU's that are in middle of receiving packets 396 * will see the new packet type (until the next received packet). 397 */ 398 399 void dev_add_pack(struct packet_type *pt) 400 { 401 struct list_head *head = ptype_head(pt); 402 403 spin_lock(&ptype_lock); 404 list_add_rcu(&pt->list, head); 405 spin_unlock(&ptype_lock); 406 } 407 EXPORT_SYMBOL(dev_add_pack); 408 409 /** 410 * __dev_remove_pack - remove packet handler 411 * @pt: packet type declaration 412 * 413 * Remove a protocol handler that was previously added to the kernel 414 * protocol handlers by dev_add_pack(). The passed &packet_type is removed 415 * from the kernel lists and can be freed or reused once this function 416 * returns. 417 * 418 * The packet type might still be in use by receivers 419 * and must not be freed until after all the CPU's have gone 420 * through a quiescent state. 421 */ 422 void __dev_remove_pack(struct packet_type *pt) 423 { 424 struct list_head *head = ptype_head(pt); 425 struct packet_type *pt1; 426 427 spin_lock(&ptype_lock); 428 429 list_for_each_entry(pt1, head, list) { 430 if (pt == pt1) { 431 list_del_rcu(&pt->list); 432 goto out; 433 } 434 } 435 436 pr_warn("dev_remove_pack: %p not found\n", pt); 437 out: 438 spin_unlock(&ptype_lock); 439 } 440 EXPORT_SYMBOL(__dev_remove_pack); 441 442 /** 443 * dev_remove_pack - remove packet handler 444 * @pt: packet type declaration 445 * 446 * Remove a protocol handler that was previously added to the kernel 447 * protocol handlers by dev_add_pack(). The passed &packet_type is removed 448 * from the kernel lists and can be freed or reused once this function 449 * returns. 450 * 451 * This call sleeps to guarantee that no CPU is looking at the packet 452 * type after return. 453 */ 454 void dev_remove_pack(struct packet_type *pt) 455 { 456 __dev_remove_pack(pt); 457 458 synchronize_net(); 459 } 460 EXPORT_SYMBOL(dev_remove_pack); 461 462 463 /** 464 * dev_add_offload - register offload handlers 465 * @po: protocol offload declaration 466 * 467 * Add protocol offload handlers to the networking stack. 
The passed 468 * &proto_offload is linked into kernel lists and may not be freed until 469 * it has been removed from the kernel lists. 470 * 471 * This call does not sleep therefore it can not 472 * guarantee all CPU's that are in middle of receiving packets 473 * will see the new offload handlers (until the next received packet). 474 */ 475 void dev_add_offload(struct packet_offload *po) 476 { 477 struct packet_offload *elem; 478 479 spin_lock(&offload_lock); 480 list_for_each_entry(elem, &offload_base, list) { 481 if (po->priority < elem->priority) 482 break; 483 } 484 list_add_rcu(&po->list, elem->list.prev); 485 spin_unlock(&offload_lock); 486 } 487 EXPORT_SYMBOL(dev_add_offload); 488 489 /** 490 * __dev_remove_offload - remove offload handler 491 * @po: packet offload declaration 492 * 493 * Remove a protocol offload handler that was previously added to the 494 * kernel offload handlers by dev_add_offload(). The passed &offload_type 495 * is removed from the kernel lists and can be freed or reused once this 496 * function returns. 497 * 498 * The packet type might still be in use by receivers 499 * and must not be freed until after all the CPU's have gone 500 * through a quiescent state. 501 */ 502 static void __dev_remove_offload(struct packet_offload *po) 503 { 504 struct list_head *head = &offload_base; 505 struct packet_offload *po1; 506 507 spin_lock(&offload_lock); 508 509 list_for_each_entry(po1, head, list) { 510 if (po == po1) { 511 list_del_rcu(&po->list); 512 goto out; 513 } 514 } 515 516 pr_warn("dev_remove_offload: %p not found\n", po); 517 out: 518 spin_unlock(&offload_lock); 519 } 520 521 /** 522 * dev_remove_offload - remove packet offload handler 523 * @po: packet offload declaration 524 * 525 * Remove a packet offload handler that was previously added to the kernel 526 * offload handlers by dev_add_offload(). The passed &offload_type is 527 * removed from the kernel lists and can be freed or reused once this 528 * function returns. 529 * 530 * This call sleeps to guarantee that no CPU is looking at the packet 531 * type after return. 532 */ 533 void dev_remove_offload(struct packet_offload *po) 534 { 535 __dev_remove_offload(po); 536 537 synchronize_net(); 538 } 539 EXPORT_SYMBOL(dev_remove_offload); 540 541 /****************************************************************************** 542 543 Device Boot-time Settings Routines 544 545 *******************************************************************************/ 546 547 /* Boot time configuration table */ 548 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; 549 550 /** 551 * netdev_boot_setup_add - add new setup entry 552 * @name: name of the device 553 * @map: configured settings for the device 554 * 555 * Adds new setup entry to the dev_boot_setup list. The function 556 * returns 0 on error and 1 on success. This is a generic routine to 557 * all netdevices. 558 */ 559 static int netdev_boot_setup_add(char *name, struct ifmap *map) 560 { 561 struct netdev_boot_setup *s; 562 int i; 563 564 s = dev_boot_setup; 565 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { 566 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { 567 memset(s[i].name, 0, sizeof(s[i].name)); 568 strlcpy(s[i].name, name, IFNAMSIZ); 569 memcpy(&s[i].map, map, sizeof(s[i].map)); 570 break; 571 } 572 } 573 574 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1; 575 } 576 577 /** 578 * netdev_boot_setup_check - check boot time settings 579 * @dev: the netdevice 580 * 581 * Check boot time settings for the device. 
582 * The found settings are set for the device to be used 583 * later in the device probing. 584 * Returns 0 if no settings found, 1 if they are. 585 */ 586 int netdev_boot_setup_check(struct net_device *dev) 587 { 588 struct netdev_boot_setup *s = dev_boot_setup; 589 int i; 590 591 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { 592 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && 593 !strcmp(dev->name, s[i].name)) { 594 dev->irq = s[i].map.irq; 595 dev->base_addr = s[i].map.base_addr; 596 dev->mem_start = s[i].map.mem_start; 597 dev->mem_end = s[i].map.mem_end; 598 return 1; 599 } 600 } 601 return 0; 602 } 603 EXPORT_SYMBOL(netdev_boot_setup_check); 604 605 606 /** 607 * netdev_boot_base - get address from boot time settings 608 * @prefix: prefix for network device 609 * @unit: id for network device 610 * 611 * Check boot time settings for the base address of device. 612 * The found settings are set for the device to be used 613 * later in the device probing. 614 * Returns 0 if no settings found. 615 */ 616 unsigned long netdev_boot_base(const char *prefix, int unit) 617 { 618 const struct netdev_boot_setup *s = dev_boot_setup; 619 char name[IFNAMSIZ]; 620 int i; 621 622 sprintf(name, "%s%d", prefix, unit); 623 624 /* 625 * If device already registered then return base of 1 626 * to indicate not to probe for this interface 627 */ 628 if (__dev_get_by_name(&init_net, name)) 629 return 1; 630 631 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) 632 if (!strcmp(name, s[i].name)) 633 return s[i].map.base_addr; 634 return 0; 635 } 636 637 /* 638 * Saves at boot time configured settings for any netdevice. 639 */ 640 int __init netdev_boot_setup(char *str) 641 { 642 int ints[5]; 643 struct ifmap map; 644 645 str = get_options(str, ARRAY_SIZE(ints), ints); 646 if (!str || !*str) 647 return 0; 648 649 /* Save settings */ 650 memset(&map, 0, sizeof(map)); 651 if (ints[0] > 0) 652 map.irq = ints[1]; 653 if (ints[0] > 1) 654 map.base_addr = ints[2]; 655 if (ints[0] > 2) 656 map.mem_start = ints[3]; 657 if (ints[0] > 3) 658 map.mem_end = ints[4]; 659 660 /* Add new entry to the list */ 661 return netdev_boot_setup_add(str, &map); 662 } 663 664 __setup("netdev=", netdev_boot_setup); 665 666 /******************************************************************************* 667 668 Device Interface Subroutines 669 670 *******************************************************************************/ 671 672 /** 673 * dev_get_iflink - get 'iflink' value of a interface 674 * @dev: targeted interface 675 * 676 * Indicates the ifindex the interface is linked to. 677 * Physical interfaces have the same 'ifindex' and 'iflink' values. 678 */ 679 680 int dev_get_iflink(const struct net_device *dev) 681 { 682 if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) 683 return dev->netdev_ops->ndo_get_iflink(dev); 684 685 return dev->ifindex; 686 } 687 EXPORT_SYMBOL(dev_get_iflink); 688 689 /** 690 * dev_fill_metadata_dst - Retrieve tunnel egress information. 691 * @dev: targeted interface 692 * @skb: The packet. 693 * 694 * For better visibility of tunnel traffic OVS needs to retrieve 695 * egress tunnel information for a packet. Following API allows 696 * user to get this info. 
697 */ 698 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) 699 { 700 struct ip_tunnel_info *info; 701 702 if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst) 703 return -EINVAL; 704 705 info = skb_tunnel_info_unclone(skb); 706 if (!info) 707 return -ENOMEM; 708 if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) 709 return -EINVAL; 710 711 return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb); 712 } 713 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); 714 715 /** 716 * __dev_get_by_name - find a device by its name 717 * @net: the applicable net namespace 718 * @name: name to find 719 * 720 * Find an interface by name. Must be called under RTNL semaphore 721 * or @dev_base_lock. If the name is found a pointer to the device 722 * is returned. If the name is not found then %NULL is returned. The 723 * reference counters are not incremented so the caller must be 724 * careful with locks. 725 */ 726 727 struct net_device *__dev_get_by_name(struct net *net, const char *name) 728 { 729 struct net_device *dev; 730 struct hlist_head *head = dev_name_hash(net, name); 731 732 hlist_for_each_entry(dev, head, name_hlist) 733 if (!strncmp(dev->name, name, IFNAMSIZ)) 734 return dev; 735 736 return NULL; 737 } 738 EXPORT_SYMBOL(__dev_get_by_name); 739 740 /** 741 * dev_get_by_name_rcu - find a device by its name 742 * @net: the applicable net namespace 743 * @name: name to find 744 * 745 * Find an interface by name. 746 * If the name is found a pointer to the device is returned. 747 * If the name is not found then %NULL is returned. 748 * The reference counters are not incremented so the caller must be 749 * careful with locks. The caller must hold RCU lock. 750 */ 751 752 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) 753 { 754 struct net_device *dev; 755 struct hlist_head *head = dev_name_hash(net, name); 756 757 hlist_for_each_entry_rcu(dev, head, name_hlist) 758 if (!strncmp(dev->name, name, IFNAMSIZ)) 759 return dev; 760 761 return NULL; 762 } 763 EXPORT_SYMBOL(dev_get_by_name_rcu); 764 765 /** 766 * dev_get_by_name - find a device by its name 767 * @net: the applicable net namespace 768 * @name: name to find 769 * 770 * Find an interface by name. This can be called from any 771 * context and does its own locking. The returned handle has 772 * the usage count incremented and the caller must use dev_put() to 773 * release it when it is no longer needed. %NULL is returned if no 774 * matching device is found. 775 */ 776 777 struct net_device *dev_get_by_name(struct net *net, const char *name) 778 { 779 struct net_device *dev; 780 781 rcu_read_lock(); 782 dev = dev_get_by_name_rcu(net, name); 783 if (dev) 784 dev_hold(dev); 785 rcu_read_unlock(); 786 return dev; 787 } 788 EXPORT_SYMBOL(dev_get_by_name); 789 790 /** 791 * __dev_get_by_index - find a device by its ifindex 792 * @net: the applicable net namespace 793 * @ifindex: index of device 794 * 795 * Search for an interface by index. Returns %NULL if the device 796 * is not found or a pointer to the device. The device has not 797 * had its reference counter increased so the caller must be careful 798 * about locking. The caller must hold either the RTNL semaphore 799 * or @dev_base_lock. 
800 */ 801 802 struct net_device *__dev_get_by_index(struct net *net, int ifindex) 803 { 804 struct net_device *dev; 805 struct hlist_head *head = dev_index_hash(net, ifindex); 806 807 hlist_for_each_entry(dev, head, index_hlist) 808 if (dev->ifindex == ifindex) 809 return dev; 810 811 return NULL; 812 } 813 EXPORT_SYMBOL(__dev_get_by_index); 814 815 /** 816 * dev_get_by_index_rcu - find a device by its ifindex 817 * @net: the applicable net namespace 818 * @ifindex: index of device 819 * 820 * Search for an interface by index. Returns %NULL if the device 821 * is not found or a pointer to the device. The device has not 822 * had its reference counter increased so the caller must be careful 823 * about locking. The caller must hold RCU lock. 824 */ 825 826 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) 827 { 828 struct net_device *dev; 829 struct hlist_head *head = dev_index_hash(net, ifindex); 830 831 hlist_for_each_entry_rcu(dev, head, index_hlist) 832 if (dev->ifindex == ifindex) 833 return dev; 834 835 return NULL; 836 } 837 EXPORT_SYMBOL(dev_get_by_index_rcu); 838 839 840 /** 841 * dev_get_by_index - find a device by its ifindex 842 * @net: the applicable net namespace 843 * @ifindex: index of device 844 * 845 * Search for an interface by index. Returns NULL if the device 846 * is not found or a pointer to the device. The device returned has 847 * had a reference added and the pointer is safe until the user calls 848 * dev_put to indicate they have finished with it. 849 */ 850 851 struct net_device *dev_get_by_index(struct net *net, int ifindex) 852 { 853 struct net_device *dev; 854 855 rcu_read_lock(); 856 dev = dev_get_by_index_rcu(net, ifindex); 857 if (dev) 858 dev_hold(dev); 859 rcu_read_unlock(); 860 return dev; 861 } 862 EXPORT_SYMBOL(dev_get_by_index); 863 864 /** 865 * netdev_get_name - get a netdevice name, knowing its ifindex. 866 * @net: network namespace 867 * @name: a pointer to the buffer where the name will be stored. 868 * @ifindex: the ifindex of the interface to get the name from. 869 * 870 * The use of raw_seqcount_begin() and cond_resched() before 871 * retrying is required as we want to give the writers a chance 872 * to complete when CONFIG_PREEMPT is not set. 873 */ 874 int netdev_get_name(struct net *net, char *name, int ifindex) 875 { 876 struct net_device *dev; 877 unsigned int seq; 878 879 retry: 880 seq = raw_seqcount_begin(&devnet_rename_seq); 881 rcu_read_lock(); 882 dev = dev_get_by_index_rcu(net, ifindex); 883 if (!dev) { 884 rcu_read_unlock(); 885 return -ENODEV; 886 } 887 888 strcpy(name, dev->name); 889 rcu_read_unlock(); 890 if (read_seqcount_retry(&devnet_rename_seq, seq)) { 891 cond_resched(); 892 goto retry; 893 } 894 895 return 0; 896 } 897 898 /** 899 * dev_getbyhwaddr_rcu - find a device by its hardware address 900 * @net: the applicable net namespace 901 * @type: media type of device 902 * @ha: hardware address 903 * 904 * Search for an interface by MAC address. Returns NULL if the device 905 * is not found or a pointer to the device. 906 * The caller must hold RCU or RTNL. 
907 * The returned device has not had its ref count increased 908 * and the caller must therefore be careful about locking 909 * 910 */ 911 912 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, 913 const char *ha) 914 { 915 struct net_device *dev; 916 917 for_each_netdev_rcu(net, dev) 918 if (dev->type == type && 919 !memcmp(dev->dev_addr, ha, dev->addr_len)) 920 return dev; 921 922 return NULL; 923 } 924 EXPORT_SYMBOL(dev_getbyhwaddr_rcu); 925 926 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) 927 { 928 struct net_device *dev; 929 930 ASSERT_RTNL(); 931 for_each_netdev(net, dev) 932 if (dev->type == type) 933 return dev; 934 935 return NULL; 936 } 937 EXPORT_SYMBOL(__dev_getfirstbyhwtype); 938 939 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) 940 { 941 struct net_device *dev, *ret = NULL; 942 943 rcu_read_lock(); 944 for_each_netdev_rcu(net, dev) 945 if (dev->type == type) { 946 dev_hold(dev); 947 ret = dev; 948 break; 949 } 950 rcu_read_unlock(); 951 return ret; 952 } 953 EXPORT_SYMBOL(dev_getfirstbyhwtype); 954 955 /** 956 * __dev_get_by_flags - find any device with given flags 957 * @net: the applicable net namespace 958 * @if_flags: IFF_* values 959 * @mask: bitmask of bits in if_flags to check 960 * 961 * Search for any interface with the given flags. Returns NULL if a device 962 * is not found or a pointer to the device. Must be called inside 963 * rtnl_lock(), and result refcount is unchanged. 964 */ 965 966 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, 967 unsigned short mask) 968 { 969 struct net_device *dev, *ret; 970 971 ASSERT_RTNL(); 972 973 ret = NULL; 974 for_each_netdev(net, dev) { 975 if (((dev->flags ^ if_flags) & mask) == 0) { 976 ret = dev; 977 break; 978 } 979 } 980 return ret; 981 } 982 EXPORT_SYMBOL(__dev_get_by_flags); 983 984 /** 985 * dev_valid_name - check if name is okay for network device 986 * @name: name string 987 * 988 * Network device names need to be valid file names to 989 * to allow sysfs to work. We also disallow any kind of 990 * whitespace. 991 */ 992 bool dev_valid_name(const char *name) 993 { 994 if (*name == '\0') 995 return false; 996 if (strlen(name) >= IFNAMSIZ) 997 return false; 998 if (!strcmp(name, ".") || !strcmp(name, "..")) 999 return false; 1000 1001 while (*name) { 1002 if (*name == '/' || *name == ':' || isspace(*name)) 1003 return false; 1004 name++; 1005 } 1006 return true; 1007 } 1008 EXPORT_SYMBOL(dev_valid_name); 1009 1010 /** 1011 * __dev_alloc_name - allocate a name for a device 1012 * @net: network namespace to allocate the device name in 1013 * @name: name format string 1014 * @buf: scratch buffer and result name string 1015 * 1016 * Passed a format string - eg "lt%d" it will try and find a suitable 1017 * id. It scans list of devices to build up a free map, then chooses 1018 * the first empty slot. The caller must hold the dev_base or rtnl lock 1019 * while allocating the name and adding the device in order to avoid 1020 * duplicates. 1021 * Limited to bits_per_byte * page size devices (ie 32K on most platforms). 1022 * Returns the number of the unit assigned or a negative errno code. 
1023 */ 1024 1025 static int __dev_alloc_name(struct net *net, const char *name, char *buf) 1026 { 1027 int i = 0; 1028 const char *p; 1029 const int max_netdevices = 8*PAGE_SIZE; 1030 unsigned long *inuse; 1031 struct net_device *d; 1032 1033 p = strnchr(name, IFNAMSIZ-1, '%'); 1034 if (p) { 1035 /* 1036 * Verify the string as this thing may have come from 1037 * the user. There must be either one "%d" and no other "%" 1038 * characters. 1039 */ 1040 if (p[1] != 'd' || strchr(p + 2, '%')) 1041 return -EINVAL; 1042 1043 /* Use one page as a bit array of possible slots */ 1044 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); 1045 if (!inuse) 1046 return -ENOMEM; 1047 1048 for_each_netdev(net, d) { 1049 if (!sscanf(d->name, name, &i)) 1050 continue; 1051 if (i < 0 || i >= max_netdevices) 1052 continue; 1053 1054 /* avoid cases where sscanf is not exact inverse of printf */ 1055 snprintf(buf, IFNAMSIZ, name, i); 1056 if (!strncmp(buf, d->name, IFNAMSIZ)) 1057 set_bit(i, inuse); 1058 } 1059 1060 i = find_first_zero_bit(inuse, max_netdevices); 1061 free_page((unsigned long) inuse); 1062 } 1063 1064 if (buf != name) 1065 snprintf(buf, IFNAMSIZ, name, i); 1066 if (!__dev_get_by_name(net, buf)) 1067 return i; 1068 1069 /* It is possible to run out of possible slots 1070 * when the name is long and there isn't enough space left 1071 * for the digits, or if all bits are used. 1072 */ 1073 return -ENFILE; 1074 } 1075 1076 /** 1077 * dev_alloc_name - allocate a name for a device 1078 * @dev: device 1079 * @name: name format string 1080 * 1081 * Passed a format string - eg "lt%d" it will try and find a suitable 1082 * id. It scans list of devices to build up a free map, then chooses 1083 * the first empty slot. The caller must hold the dev_base or rtnl lock 1084 * while allocating the name and adding the device in order to avoid 1085 * duplicates. 1086 * Limited to bits_per_byte * page size devices (ie 32K on most platforms). 1087 * Returns the number of the unit assigned or a negative errno code. 1088 */ 1089 1090 int dev_alloc_name(struct net_device *dev, const char *name) 1091 { 1092 char buf[IFNAMSIZ]; 1093 struct net *net; 1094 int ret; 1095 1096 BUG_ON(!dev_net(dev)); 1097 net = dev_net(dev); 1098 ret = __dev_alloc_name(net, name, buf); 1099 if (ret >= 0) 1100 strlcpy(dev->name, buf, IFNAMSIZ); 1101 return ret; 1102 } 1103 EXPORT_SYMBOL(dev_alloc_name); 1104 1105 static int dev_alloc_name_ns(struct net *net, 1106 struct net_device *dev, 1107 const char *name) 1108 { 1109 char buf[IFNAMSIZ]; 1110 int ret; 1111 1112 ret = __dev_alloc_name(net, name, buf); 1113 if (ret >= 0) 1114 strlcpy(dev->name, buf, IFNAMSIZ); 1115 return ret; 1116 } 1117 1118 static int dev_get_valid_name(struct net *net, 1119 struct net_device *dev, 1120 const char *name) 1121 { 1122 BUG_ON(!net); 1123 1124 if (!dev_valid_name(name)) 1125 return -EINVAL; 1126 1127 if (strchr(name, '%')) 1128 return dev_alloc_name_ns(net, dev, name); 1129 else if (__dev_get_by_name(net, name)) 1130 return -EEXIST; 1131 else if (dev->name != name) 1132 strlcpy(dev->name, name, IFNAMSIZ); 1133 1134 return 0; 1135 } 1136 1137 /** 1138 * dev_change_name - change name of a device 1139 * @dev: device 1140 * @newname: name (or format string) must be at least IFNAMSIZ 1141 * 1142 * Change name of a device, can pass format strings "eth%d". 1143 * for wildcarding. 
1144 */ 1145 int dev_change_name(struct net_device *dev, const char *newname) 1146 { 1147 unsigned char old_assign_type; 1148 char oldname[IFNAMSIZ]; 1149 int err = 0; 1150 int ret; 1151 struct net *net; 1152 1153 ASSERT_RTNL(); 1154 BUG_ON(!dev_net(dev)); 1155 1156 net = dev_net(dev); 1157 if (dev->flags & IFF_UP) 1158 return -EBUSY; 1159 1160 write_seqcount_begin(&devnet_rename_seq); 1161 1162 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { 1163 write_seqcount_end(&devnet_rename_seq); 1164 return 0; 1165 } 1166 1167 memcpy(oldname, dev->name, IFNAMSIZ); 1168 1169 err = dev_get_valid_name(net, dev, newname); 1170 if (err < 0) { 1171 write_seqcount_end(&devnet_rename_seq); 1172 return err; 1173 } 1174 1175 if (oldname[0] && !strchr(oldname, '%')) 1176 netdev_info(dev, "renamed from %s\n", oldname); 1177 1178 old_assign_type = dev->name_assign_type; 1179 dev->name_assign_type = NET_NAME_RENAMED; 1180 1181 rollback: 1182 ret = device_rename(&dev->dev, dev->name); 1183 if (ret) { 1184 memcpy(dev->name, oldname, IFNAMSIZ); 1185 dev->name_assign_type = old_assign_type; 1186 write_seqcount_end(&devnet_rename_seq); 1187 return ret; 1188 } 1189 1190 write_seqcount_end(&devnet_rename_seq); 1191 1192 netdev_adjacent_rename_links(dev, oldname); 1193 1194 write_lock_bh(&dev_base_lock); 1195 hlist_del_rcu(&dev->name_hlist); 1196 write_unlock_bh(&dev_base_lock); 1197 1198 synchronize_rcu(); 1199 1200 write_lock_bh(&dev_base_lock); 1201 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); 1202 write_unlock_bh(&dev_base_lock); 1203 1204 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); 1205 ret = notifier_to_errno(ret); 1206 1207 if (ret) { 1208 /* err >= 0 after dev_alloc_name() or stores the first errno */ 1209 if (err >= 0) { 1210 err = ret; 1211 write_seqcount_begin(&devnet_rename_seq); 1212 memcpy(dev->name, oldname, IFNAMSIZ); 1213 memcpy(oldname, newname, IFNAMSIZ); 1214 dev->name_assign_type = old_assign_type; 1215 old_assign_type = NET_NAME_RENAMED; 1216 goto rollback; 1217 } else { 1218 pr_err("%s: name change rollback failed: %d\n", 1219 dev->name, ret); 1220 } 1221 } 1222 1223 return err; 1224 } 1225 1226 /** 1227 * dev_set_alias - change ifalias of a device 1228 * @dev: device 1229 * @alias: name up to IFALIASZ 1230 * @len: limit of bytes to copy from info 1231 * 1232 * Set ifalias for a device, 1233 */ 1234 int dev_set_alias(struct net_device *dev, const char *alias, size_t len) 1235 { 1236 char *new_ifalias; 1237 1238 ASSERT_RTNL(); 1239 1240 if (len >= IFALIASZ) 1241 return -EINVAL; 1242 1243 if (!len) { 1244 kfree(dev->ifalias); 1245 dev->ifalias = NULL; 1246 return 0; 1247 } 1248 1249 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); 1250 if (!new_ifalias) 1251 return -ENOMEM; 1252 dev->ifalias = new_ifalias; 1253 1254 strlcpy(dev->ifalias, alias, len+1); 1255 return len; 1256 } 1257 1258 1259 /** 1260 * netdev_features_change - device changes features 1261 * @dev: device to cause notification 1262 * 1263 * Called to indicate a device has changed features. 1264 */ 1265 void netdev_features_change(struct net_device *dev) 1266 { 1267 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); 1268 } 1269 EXPORT_SYMBOL(netdev_features_change); 1270 1271 /** 1272 * netdev_state_change - device changes state 1273 * @dev: device to cause notification 1274 * 1275 * Called to indicate a device has changed state. This function calls 1276 * the notifier chains for netdev_chain and sends a NEWLINK message 1277 * to the routing socket. 
1278 */ 1279 void netdev_state_change(struct net_device *dev) 1280 { 1281 if (dev->flags & IFF_UP) { 1282 struct netdev_notifier_change_info change_info; 1283 1284 change_info.flags_changed = 0; 1285 call_netdevice_notifiers_info(NETDEV_CHANGE, dev, 1286 &change_info.info); 1287 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); 1288 } 1289 } 1290 EXPORT_SYMBOL(netdev_state_change); 1291 1292 /** 1293 * netdev_notify_peers - notify network peers about existence of @dev 1294 * @dev: network device 1295 * 1296 * Generate traffic such that interested network peers are aware of 1297 * @dev, such as by generating a gratuitous ARP. This may be used when 1298 * a device wants to inform the rest of the network about some sort of 1299 * reconfiguration such as a failover event or virtual machine 1300 * migration. 1301 */ 1302 void netdev_notify_peers(struct net_device *dev) 1303 { 1304 rtnl_lock(); 1305 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); 1306 rtnl_unlock(); 1307 } 1308 EXPORT_SYMBOL(netdev_notify_peers); 1309 1310 static int __dev_open(struct net_device *dev) 1311 { 1312 const struct net_device_ops *ops = dev->netdev_ops; 1313 int ret; 1314 1315 ASSERT_RTNL(); 1316 1317 if (!netif_device_present(dev)) 1318 return -ENODEV; 1319 1320 /* Block netpoll from trying to do any rx path servicing. 1321 * If we don't do this there is a chance ndo_poll_controller 1322 * or ndo_poll may be running while we open the device 1323 */ 1324 netpoll_poll_disable(dev); 1325 1326 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); 1327 ret = notifier_to_errno(ret); 1328 if (ret) 1329 return ret; 1330 1331 set_bit(__LINK_STATE_START, &dev->state); 1332 1333 if (ops->ndo_validate_addr) 1334 ret = ops->ndo_validate_addr(dev); 1335 1336 if (!ret && ops->ndo_open) 1337 ret = ops->ndo_open(dev); 1338 1339 netpoll_poll_enable(dev); 1340 1341 if (ret) 1342 clear_bit(__LINK_STATE_START, &dev->state); 1343 else { 1344 dev->flags |= IFF_UP; 1345 dev_set_rx_mode(dev); 1346 dev_activate(dev); 1347 add_device_randomness(dev->dev_addr, dev->addr_len); 1348 } 1349 1350 return ret; 1351 } 1352 1353 /** 1354 * dev_open - prepare an interface for use. 1355 * @dev: device to open 1356 * 1357 * Takes a device from down to up state. The device's private open 1358 * function is invoked and then the multicast lists are loaded. Finally 1359 * the device is moved into the up state and a %NETDEV_UP message is 1360 * sent to the netdev notifier chain. 1361 * 1362 * Calling this function on an active interface is a nop. On a failure 1363 * a negative errno code is returned. 1364 */ 1365 int dev_open(struct net_device *dev) 1366 { 1367 int ret; 1368 1369 if (dev->flags & IFF_UP) 1370 return 0; 1371 1372 ret = __dev_open(dev); 1373 if (ret < 0) 1374 return ret; 1375 1376 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); 1377 call_netdevice_notifiers(NETDEV_UP, dev); 1378 1379 return ret; 1380 } 1381 EXPORT_SYMBOL(dev_open); 1382 1383 static int __dev_close_many(struct list_head *head) 1384 { 1385 struct net_device *dev; 1386 1387 ASSERT_RTNL(); 1388 might_sleep(); 1389 1390 list_for_each_entry(dev, head, close_list) { 1391 /* Temporarily disable netpoll until the interface is down */ 1392 netpoll_poll_disable(dev); 1393 1394 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); 1395 1396 clear_bit(__LINK_STATE_START, &dev->state); 1397 1398 /* Synchronize to scheduled poll. We cannot touch poll list, it 1399 * can be even on different cpu. So just clear netif_running(). 
1400 * 1401 * dev->stop() will invoke napi_disable() on all of it's 1402 * napi_struct instances on this device. 1403 */ 1404 smp_mb__after_atomic(); /* Commit netif_running(). */ 1405 } 1406 1407 dev_deactivate_many(head); 1408 1409 list_for_each_entry(dev, head, close_list) { 1410 const struct net_device_ops *ops = dev->netdev_ops; 1411 1412 /* 1413 * Call the device specific close. This cannot fail. 1414 * Only if device is UP 1415 * 1416 * We allow it to be called even after a DETACH hot-plug 1417 * event. 1418 */ 1419 if (ops->ndo_stop) 1420 ops->ndo_stop(dev); 1421 1422 dev->flags &= ~IFF_UP; 1423 netpoll_poll_enable(dev); 1424 } 1425 1426 return 0; 1427 } 1428 1429 static int __dev_close(struct net_device *dev) 1430 { 1431 int retval; 1432 LIST_HEAD(single); 1433 1434 list_add(&dev->close_list, &single); 1435 retval = __dev_close_many(&single); 1436 list_del(&single); 1437 1438 return retval; 1439 } 1440 1441 int dev_close_many(struct list_head *head, bool unlink) 1442 { 1443 struct net_device *dev, *tmp; 1444 1445 /* Remove the devices that don't need to be closed */ 1446 list_for_each_entry_safe(dev, tmp, head, close_list) 1447 if (!(dev->flags & IFF_UP)) 1448 list_del_init(&dev->close_list); 1449 1450 __dev_close_many(head); 1451 1452 list_for_each_entry_safe(dev, tmp, head, close_list) { 1453 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); 1454 call_netdevice_notifiers(NETDEV_DOWN, dev); 1455 if (unlink) 1456 list_del_init(&dev->close_list); 1457 } 1458 1459 return 0; 1460 } 1461 EXPORT_SYMBOL(dev_close_many); 1462 1463 /** 1464 * dev_close - shutdown an interface. 1465 * @dev: device to shutdown 1466 * 1467 * This function moves an active device into down state. A 1468 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device 1469 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier 1470 * chain. 1471 */ 1472 int dev_close(struct net_device *dev) 1473 { 1474 if (dev->flags & IFF_UP) { 1475 LIST_HEAD(single); 1476 1477 list_add(&dev->close_list, &single); 1478 dev_close_many(&single, true); 1479 list_del(&single); 1480 } 1481 return 0; 1482 } 1483 EXPORT_SYMBOL(dev_close); 1484 1485 1486 /** 1487 * dev_disable_lro - disable Large Receive Offload on a device 1488 * @dev: device 1489 * 1490 * Disable Large Receive Offload (LRO) on a net device. Must be 1491 * called under RTNL. This is needed if received packets may be 1492 * forwarded to another interface. 1493 */ 1494 void dev_disable_lro(struct net_device *dev) 1495 { 1496 struct net_device *lower_dev; 1497 struct list_head *iter; 1498 1499 dev->wanted_features &= ~NETIF_F_LRO; 1500 netdev_update_features(dev); 1501 1502 if (unlikely(dev->features & NETIF_F_LRO)) 1503 netdev_WARN(dev, "failed to disable LRO!\n"); 1504 1505 netdev_for_each_lower_dev(dev, lower_dev, iter) 1506 dev_disable_lro(lower_dev); 1507 } 1508 EXPORT_SYMBOL(dev_disable_lro); 1509 1510 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, 1511 struct net_device *dev) 1512 { 1513 struct netdev_notifier_info info; 1514 1515 netdev_notifier_info_init(&info, dev); 1516 return nb->notifier_call(nb, val, &info); 1517 } 1518 1519 static int dev_boot_phase = 1; 1520 1521 /** 1522 * register_netdevice_notifier - register a network notifier block 1523 * @nb: notifier 1524 * 1525 * Register a notifier to be called when network device events occur. 1526 * The notifier passed is linked into the kernel structures and must 1527 * not be reused until it has been unregistered. 
A negative errno code 1528 * is returned on a failure. 1529 * 1530 * When registered all registration and up events are replayed 1531 * to the new notifier to allow device to have a race free 1532 * view of the network device list. 1533 */ 1534 1535 int register_netdevice_notifier(struct notifier_block *nb) 1536 { 1537 struct net_device *dev; 1538 struct net_device *last; 1539 struct net *net; 1540 int err; 1541 1542 rtnl_lock(); 1543 err = raw_notifier_chain_register(&netdev_chain, nb); 1544 if (err) 1545 goto unlock; 1546 if (dev_boot_phase) 1547 goto unlock; 1548 for_each_net(net) { 1549 for_each_netdev(net, dev) { 1550 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); 1551 err = notifier_to_errno(err); 1552 if (err) 1553 goto rollback; 1554 1555 if (!(dev->flags & IFF_UP)) 1556 continue; 1557 1558 call_netdevice_notifier(nb, NETDEV_UP, dev); 1559 } 1560 } 1561 1562 unlock: 1563 rtnl_unlock(); 1564 return err; 1565 1566 rollback: 1567 last = dev; 1568 for_each_net(net) { 1569 for_each_netdev(net, dev) { 1570 if (dev == last) 1571 goto outroll; 1572 1573 if (dev->flags & IFF_UP) { 1574 call_netdevice_notifier(nb, NETDEV_GOING_DOWN, 1575 dev); 1576 call_netdevice_notifier(nb, NETDEV_DOWN, dev); 1577 } 1578 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); 1579 } 1580 } 1581 1582 outroll: 1583 raw_notifier_chain_unregister(&netdev_chain, nb); 1584 goto unlock; 1585 } 1586 EXPORT_SYMBOL(register_netdevice_notifier); 1587 1588 /** 1589 * unregister_netdevice_notifier - unregister a network notifier block 1590 * @nb: notifier 1591 * 1592 * Unregister a notifier previously registered by 1593 * register_netdevice_notifier(). The notifier is unlinked into the 1594 * kernel structures and may then be reused. A negative errno code 1595 * is returned on a failure. 1596 * 1597 * After unregistering unregister and down device events are synthesized 1598 * for all devices on the device list to the removed notifier to remove 1599 * the need for special case cleanup code. 1600 */ 1601 1602 int unregister_netdevice_notifier(struct notifier_block *nb) 1603 { 1604 struct net_device *dev; 1605 struct net *net; 1606 int err; 1607 1608 rtnl_lock(); 1609 err = raw_notifier_chain_unregister(&netdev_chain, nb); 1610 if (err) 1611 goto unlock; 1612 1613 for_each_net(net) { 1614 for_each_netdev(net, dev) { 1615 if (dev->flags & IFF_UP) { 1616 call_netdevice_notifier(nb, NETDEV_GOING_DOWN, 1617 dev); 1618 call_netdevice_notifier(nb, NETDEV_DOWN, dev); 1619 } 1620 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); 1621 } 1622 } 1623 unlock: 1624 rtnl_unlock(); 1625 return err; 1626 } 1627 EXPORT_SYMBOL(unregister_netdevice_notifier); 1628 1629 /** 1630 * call_netdevice_notifiers_info - call all network notifier blocks 1631 * @val: value passed unmodified to notifier function 1632 * @dev: net_device pointer passed unmodified to notifier function 1633 * @info: notifier information data 1634 * 1635 * Call all network notifier blocks. Parameters and return value 1636 * are as for raw_notifier_call_chain(). 
1637 */ 1638 1639 static int call_netdevice_notifiers_info(unsigned long val, 1640 struct net_device *dev, 1641 struct netdev_notifier_info *info) 1642 { 1643 ASSERT_RTNL(); 1644 netdev_notifier_info_init(info, dev); 1645 return raw_notifier_call_chain(&netdev_chain, val, info); 1646 } 1647 1648 /** 1649 * call_netdevice_notifiers - call all network notifier blocks 1650 * @val: value passed unmodified to notifier function 1651 * @dev: net_device pointer passed unmodified to notifier function 1652 * 1653 * Call all network notifier blocks. Parameters and return value 1654 * are as for raw_notifier_call_chain(). 1655 */ 1656 1657 int call_netdevice_notifiers(unsigned long val, struct net_device *dev) 1658 { 1659 struct netdev_notifier_info info; 1660 1661 return call_netdevice_notifiers_info(val, dev, &info); 1662 } 1663 EXPORT_SYMBOL(call_netdevice_notifiers); 1664 1665 #ifdef CONFIG_NET_INGRESS 1666 static struct static_key ingress_needed __read_mostly; 1667 1668 void net_inc_ingress_queue(void) 1669 { 1670 static_key_slow_inc(&ingress_needed); 1671 } 1672 EXPORT_SYMBOL_GPL(net_inc_ingress_queue); 1673 1674 void net_dec_ingress_queue(void) 1675 { 1676 static_key_slow_dec(&ingress_needed); 1677 } 1678 EXPORT_SYMBOL_GPL(net_dec_ingress_queue); 1679 #endif 1680 1681 #ifdef CONFIG_NET_EGRESS 1682 static struct static_key egress_needed __read_mostly; 1683 1684 void net_inc_egress_queue(void) 1685 { 1686 static_key_slow_inc(&egress_needed); 1687 } 1688 EXPORT_SYMBOL_GPL(net_inc_egress_queue); 1689 1690 void net_dec_egress_queue(void) 1691 { 1692 static_key_slow_dec(&egress_needed); 1693 } 1694 EXPORT_SYMBOL_GPL(net_dec_egress_queue); 1695 #endif 1696 1697 static struct static_key netstamp_needed __read_mostly; 1698 #ifdef HAVE_JUMP_LABEL 1699 /* We are not allowed to call static_key_slow_dec() from irq context 1700 * If net_disable_timestamp() is called from irq context, defer the 1701 * static_key_slow_dec() calls. 
1702 */ 1703 static atomic_t netstamp_needed_deferred; 1704 #endif 1705 1706 void net_enable_timestamp(void) 1707 { 1708 #ifdef HAVE_JUMP_LABEL 1709 int deferred = atomic_xchg(&netstamp_needed_deferred, 0); 1710 1711 if (deferred) { 1712 while (--deferred) 1713 static_key_slow_dec(&netstamp_needed); 1714 return; 1715 } 1716 #endif 1717 static_key_slow_inc(&netstamp_needed); 1718 } 1719 EXPORT_SYMBOL(net_enable_timestamp); 1720 1721 void net_disable_timestamp(void) 1722 { 1723 #ifdef HAVE_JUMP_LABEL 1724 if (in_interrupt()) { 1725 atomic_inc(&netstamp_needed_deferred); 1726 return; 1727 } 1728 #endif 1729 static_key_slow_dec(&netstamp_needed); 1730 } 1731 EXPORT_SYMBOL(net_disable_timestamp); 1732 1733 static inline void net_timestamp_set(struct sk_buff *skb) 1734 { 1735 skb->tstamp.tv64 = 0; 1736 if (static_key_false(&netstamp_needed)) 1737 __net_timestamp(skb); 1738 } 1739 1740 #define net_timestamp_check(COND, SKB) \ 1741 if (static_key_false(&netstamp_needed)) { \ 1742 if ((COND) && !(SKB)->tstamp.tv64) \ 1743 __net_timestamp(SKB); \ 1744 } \ 1745 1746 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb) 1747 { 1748 unsigned int len; 1749 1750 if (!(dev->flags & IFF_UP)) 1751 return false; 1752 1753 len = dev->mtu + dev->hard_header_len + VLAN_HLEN; 1754 if (skb->len <= len) 1755 return true; 1756 1757 /* if TSO is enabled, we don't care about the length as the packet 1758 * could be forwarded without being segmented before 1759 */ 1760 if (skb_is_gso(skb)) 1761 return true; 1762 1763 return false; 1764 } 1765 EXPORT_SYMBOL_GPL(is_skb_forwardable); 1766 1767 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) 1768 { 1769 int ret = ____dev_forward_skb(dev, skb); 1770 1771 if (likely(!ret)) { 1772 skb->protocol = eth_type_trans(skb, dev); 1773 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 1774 } 1775 1776 return ret; 1777 } 1778 EXPORT_SYMBOL_GPL(__dev_forward_skb); 1779 1780 /** 1781 * dev_forward_skb - loopback an skb to another netif 1782 * 1783 * @dev: destination network device 1784 * @skb: buffer to forward 1785 * 1786 * return values: 1787 * NET_RX_SUCCESS (no congestion) 1788 * NET_RX_DROP (packet was dropped, but freed) 1789 * 1790 * dev_forward_skb can be used for injecting an skb from the 1791 * start_xmit function of one device into the receive queue 1792 * of another device. 1793 * 1794 * The receiving device may be in another namespace, so 1795 * we have to clear all information in the skb that could 1796 * impact namespace isolation. 
1797 */ 1798 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) 1799 { 1800 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); 1801 } 1802 EXPORT_SYMBOL_GPL(dev_forward_skb); 1803 1804 static inline int deliver_skb(struct sk_buff *skb, 1805 struct packet_type *pt_prev, 1806 struct net_device *orig_dev) 1807 { 1808 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) 1809 return -ENOMEM; 1810 atomic_inc(&skb->users); 1811 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 1812 } 1813 1814 static inline void deliver_ptype_list_skb(struct sk_buff *skb, 1815 struct packet_type **pt, 1816 struct net_device *orig_dev, 1817 __be16 type, 1818 struct list_head *ptype_list) 1819 { 1820 struct packet_type *ptype, *pt_prev = *pt; 1821 1822 list_for_each_entry_rcu(ptype, ptype_list, list) { 1823 if (ptype->type != type) 1824 continue; 1825 if (pt_prev) 1826 deliver_skb(skb, pt_prev, orig_dev); 1827 pt_prev = ptype; 1828 } 1829 *pt = pt_prev; 1830 } 1831 1832 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) 1833 { 1834 if (!ptype->af_packet_priv || !skb->sk) 1835 return false; 1836 1837 if (ptype->id_match) 1838 return ptype->id_match(ptype, skb->sk); 1839 else if ((struct sock *)ptype->af_packet_priv == skb->sk) 1840 return true; 1841 1842 return false; 1843 } 1844 1845 /* 1846 * Support routine. Sends outgoing frames to any network 1847 * taps currently in use. 1848 */ 1849 1850 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) 1851 { 1852 struct packet_type *ptype; 1853 struct sk_buff *skb2 = NULL; 1854 struct packet_type *pt_prev = NULL; 1855 struct list_head *ptype_list = &ptype_all; 1856 1857 rcu_read_lock(); 1858 again: 1859 list_for_each_entry_rcu(ptype, ptype_list, list) { 1860 /* Never send packets back to the socket 1861 * they originated from - MvS (miquels@drinkel.ow.org) 1862 */ 1863 if (skb_loop_sk(ptype, skb)) 1864 continue; 1865 1866 if (pt_prev) { 1867 deliver_skb(skb2, pt_prev, skb->dev); 1868 pt_prev = ptype; 1869 continue; 1870 } 1871 1872 /* need to clone skb, done only once */ 1873 skb2 = skb_clone(skb, GFP_ATOMIC); 1874 if (!skb2) 1875 goto out_unlock; 1876 1877 net_timestamp_set(skb2); 1878 1879 /* skb->nh should be correctly 1880 * set by sender, so that the second statement is 1881 * just protection against buggy protocols. 1882 */ 1883 skb_reset_mac_header(skb2); 1884 1885 if (skb_network_header(skb2) < skb2->data || 1886 skb_network_header(skb2) > skb_tail_pointer(skb2)) { 1887 net_crit_ratelimited("protocol %04x is buggy, dev %s\n", 1888 ntohs(skb2->protocol), 1889 dev->name); 1890 skb_reset_network_header(skb2); 1891 } 1892 1893 skb2->transport_header = skb2->network_header; 1894 skb2->pkt_type = PACKET_OUTGOING; 1895 pt_prev = ptype; 1896 } 1897 1898 if (ptype_list == &ptype_all) { 1899 ptype_list = &dev->ptype_all; 1900 goto again; 1901 } 1902 out_unlock: 1903 if (pt_prev) 1904 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); 1905 rcu_read_unlock(); 1906 } 1907 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit); 1908 1909 /** 1910 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change 1911 * @dev: Network device 1912 * @txq: number of queues available 1913 * 1914 * If real_num_tx_queues is changed the tc mappings may no longer be 1915 * valid. To resolve this verify the tc mapping remains valid and if 1916 * not NULL the mapping. With no priorities mapping to this 1917 * offset/count pair it will no longer be used. 
In the worst case TC0 1918 * is invalid nothing can be done so disable priority mappings. If is 1919 * expected that drivers will fix this mapping if they can before 1920 * calling netif_set_real_num_tx_queues. 1921 */ 1922 static void netif_setup_tc(struct net_device *dev, unsigned int txq) 1923 { 1924 int i; 1925 struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; 1926 1927 /* If TC0 is invalidated disable TC mapping */ 1928 if (tc->offset + tc->count > txq) { 1929 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); 1930 dev->num_tc = 0; 1931 return; 1932 } 1933 1934 /* Invalidated prio to tc mappings set to TC0 */ 1935 for (i = 1; i < TC_BITMASK + 1; i++) { 1936 int q = netdev_get_prio_tc_map(dev, i); 1937 1938 tc = &dev->tc_to_txq[q]; 1939 if (tc->offset + tc->count > txq) { 1940 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n", 1941 i, q); 1942 netdev_set_prio_tc_map(dev, i, 0); 1943 } 1944 } 1945 } 1946 1947 #ifdef CONFIG_XPS 1948 static DEFINE_MUTEX(xps_map_mutex); 1949 #define xmap_dereference(P) \ 1950 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) 1951 1952 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps, 1953 int cpu, u16 index) 1954 { 1955 struct xps_map *map = NULL; 1956 int pos; 1957 1958 if (dev_maps) 1959 map = xmap_dereference(dev_maps->cpu_map[cpu]); 1960 1961 for (pos = 0; map && pos < map->len; pos++) { 1962 if (map->queues[pos] == index) { 1963 if (map->len > 1) { 1964 map->queues[pos] = map->queues[--map->len]; 1965 } else { 1966 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL); 1967 kfree_rcu(map, rcu); 1968 map = NULL; 1969 } 1970 break; 1971 } 1972 } 1973 1974 return map; 1975 } 1976 1977 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) 1978 { 1979 struct xps_dev_maps *dev_maps; 1980 int cpu, i; 1981 bool active = false; 1982 1983 mutex_lock(&xps_map_mutex); 1984 dev_maps = xmap_dereference(dev->xps_maps); 1985 1986 if (!dev_maps) 1987 goto out_no_maps; 1988 1989 for_each_possible_cpu(cpu) { 1990 for (i = index; i < dev->num_tx_queues; i++) { 1991 if (!remove_xps_queue(dev_maps, cpu, i)) 1992 break; 1993 } 1994 if (i == dev->num_tx_queues) 1995 active = true; 1996 } 1997 1998 if (!active) { 1999 RCU_INIT_POINTER(dev->xps_maps, NULL); 2000 kfree_rcu(dev_maps, rcu); 2001 } 2002 2003 for (i = index; i < dev->num_tx_queues; i++) 2004 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), 2005 NUMA_NO_NODE); 2006 2007 out_no_maps: 2008 mutex_unlock(&xps_map_mutex); 2009 } 2010 2011 static struct xps_map *expand_xps_map(struct xps_map *map, 2012 int cpu, u16 index) 2013 { 2014 struct xps_map *new_map; 2015 int alloc_len = XPS_MIN_MAP_ALLOC; 2016 int i, pos; 2017 2018 for (pos = 0; map && pos < map->len; pos++) { 2019 if (map->queues[pos] != index) 2020 continue; 2021 return map; 2022 } 2023 2024 /* Need to add queue to this CPU's existing map */ 2025 if (map) { 2026 if (pos < map->alloc_len) 2027 return map; 2028 2029 alloc_len = map->alloc_len * 2; 2030 } 2031 2032 /* Need to allocate new map to store queue on this CPU's map */ 2033 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, 2034 cpu_to_node(cpu)); 2035 if (!new_map) 2036 return NULL; 2037 2038 for (i = 0; i < pos; i++) 2039 new_map->queues[i] = map->queues[i]; 2040 new_map->alloc_len = alloc_len; 2041 new_map->len = pos; 2042 2043 return new_map; 2044 } 2045 2046 int netif_set_xps_queue(struct net_device *dev, const 
struct cpumask *mask, 2047 u16 index) 2048 { 2049 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; 2050 struct xps_map *map, *new_map; 2051 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES); 2052 int cpu, numa_node_id = -2; 2053 bool active = false; 2054 2055 mutex_lock(&xps_map_mutex); 2056 2057 dev_maps = xmap_dereference(dev->xps_maps); 2058 2059 /* allocate memory for queue storage */ 2060 for_each_online_cpu(cpu) { 2061 if (!cpumask_test_cpu(cpu, mask)) 2062 continue; 2063 2064 if (!new_dev_maps) 2065 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); 2066 if (!new_dev_maps) { 2067 mutex_unlock(&xps_map_mutex); 2068 return -ENOMEM; 2069 } 2070 2071 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : 2072 NULL; 2073 2074 map = expand_xps_map(map, cpu, index); 2075 if (!map) 2076 goto error; 2077 2078 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); 2079 } 2080 2081 if (!new_dev_maps) 2082 goto out_no_new_maps; 2083 2084 for_each_possible_cpu(cpu) { 2085 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { 2086 /* add queue to CPU maps */ 2087 int pos = 0; 2088 2089 map = xmap_dereference(new_dev_maps->cpu_map[cpu]); 2090 while ((pos < map->len) && (map->queues[pos] != index)) 2091 pos++; 2092 2093 if (pos == map->len) 2094 map->queues[map->len++] = index; 2095 #ifdef CONFIG_NUMA 2096 if (numa_node_id == -2) 2097 numa_node_id = cpu_to_node(cpu); 2098 else if (numa_node_id != cpu_to_node(cpu)) 2099 numa_node_id = -1; 2100 #endif 2101 } else if (dev_maps) { 2102 /* fill in the new device map from the old device map */ 2103 map = xmap_dereference(dev_maps->cpu_map[cpu]); 2104 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); 2105 } 2106 2107 } 2108 2109 rcu_assign_pointer(dev->xps_maps, new_dev_maps); 2110 2111 /* Cleanup old maps */ 2112 if (dev_maps) { 2113 for_each_possible_cpu(cpu) { 2114 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); 2115 map = xmap_dereference(dev_maps->cpu_map[cpu]); 2116 if (map && map != new_map) 2117 kfree_rcu(map, rcu); 2118 } 2119 2120 kfree_rcu(dev_maps, rcu); 2121 } 2122 2123 dev_maps = new_dev_maps; 2124 active = true; 2125 2126 out_no_new_maps: 2127 /* update Tx queue numa node */ 2128 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), 2129 (numa_node_id >= 0) ? numa_node_id : 2130 NUMA_NO_NODE); 2131 2132 if (!dev_maps) 2133 goto out_no_maps; 2134 2135 /* removes queue from unused CPUs */ 2136 for_each_possible_cpu(cpu) { 2137 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) 2138 continue; 2139 2140 if (remove_xps_queue(dev_maps, cpu, index)) 2141 active = true; 2142 } 2143 2144 /* free map if not active */ 2145 if (!active) { 2146 RCU_INIT_POINTER(dev->xps_maps, NULL); 2147 kfree_rcu(dev_maps, rcu); 2148 } 2149 2150 out_no_maps: 2151 mutex_unlock(&xps_map_mutex); 2152 2153 return 0; 2154 error: 2155 /* remove any maps that we added */ 2156 for_each_possible_cpu(cpu) { 2157 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); 2158 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : 2159 NULL; 2160 if (new_map && new_map != map) 2161 kfree(new_map); 2162 } 2163 2164 mutex_unlock(&xps_map_mutex); 2165 2166 kfree(new_dev_maps); 2167 return -ENOMEM; 2168 } 2169 EXPORT_SYMBOL(netif_set_xps_queue); 2170 2171 #endif 2172 /* 2173 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues 2174 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. 
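 *
 * A hedged usage sketch (the queue count and error handling are purely
 * illustrative): a multiqueue driver shrinking its active queue set at
 * runtime would, under rtnl_lock, do
 *
 *	err = netif_set_real_num_tx_queues(dev, new_txq);
 *	if (err)
 *		return err;
 *
 * For a registered device this also updates the per-queue sysfs objects
 * and, when shrinking, resets stale qdisc state and XPS maps for the
 * queues that drop out of use.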
2175 */ 2176 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) 2177 { 2178 int rc; 2179 2180 if (txq < 1 || txq > dev->num_tx_queues) 2181 return -EINVAL; 2182 2183 if (dev->reg_state == NETREG_REGISTERED || 2184 dev->reg_state == NETREG_UNREGISTERING) { 2185 ASSERT_RTNL(); 2186 2187 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, 2188 txq); 2189 if (rc) 2190 return rc; 2191 2192 if (dev->num_tc) 2193 netif_setup_tc(dev, txq); 2194 2195 if (txq < dev->real_num_tx_queues) { 2196 qdisc_reset_all_tx_gt(dev, txq); 2197 #ifdef CONFIG_XPS 2198 netif_reset_xps_queues_gt(dev, txq); 2199 #endif 2200 } 2201 } 2202 2203 dev->real_num_tx_queues = txq; 2204 return 0; 2205 } 2206 EXPORT_SYMBOL(netif_set_real_num_tx_queues); 2207 2208 #ifdef CONFIG_SYSFS 2209 /** 2210 * netif_set_real_num_rx_queues - set actual number of RX queues used 2211 * @dev: Network device 2212 * @rxq: Actual number of RX queues 2213 * 2214 * This must be called either with the rtnl_lock held or before 2215 * registration of the net device. Returns 0 on success, or a 2216 * negative error code. If called before registration, it always 2217 * succeeds. 2218 */ 2219 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) 2220 { 2221 int rc; 2222 2223 if (rxq < 1 || rxq > dev->num_rx_queues) 2224 return -EINVAL; 2225 2226 if (dev->reg_state == NETREG_REGISTERED) { 2227 ASSERT_RTNL(); 2228 2229 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, 2230 rxq); 2231 if (rc) 2232 return rc; 2233 } 2234 2235 dev->real_num_rx_queues = rxq; 2236 return 0; 2237 } 2238 EXPORT_SYMBOL(netif_set_real_num_rx_queues); 2239 #endif 2240 2241 /** 2242 * netif_get_num_default_rss_queues - default number of RSS queues 2243 * 2244 * This routine should set an upper limit on the number of RSS queues 2245 * used by default by multiqueue devices. 2246 */ 2247 int netif_get_num_default_rss_queues(void) 2248 { 2249 return is_kdump_kernel() ? 2250 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); 2251 } 2252 EXPORT_SYMBOL(netif_get_num_default_rss_queues); 2253 2254 static void __netif_reschedule(struct Qdisc *q) 2255 { 2256 struct softnet_data *sd; 2257 unsigned long flags; 2258 2259 local_irq_save(flags); 2260 sd = this_cpu_ptr(&softnet_data); 2261 q->next_sched = NULL; 2262 *sd->output_queue_tailp = q; 2263 sd->output_queue_tailp = &q->next_sched; 2264 raise_softirq_irqoff(NET_TX_SOFTIRQ); 2265 local_irq_restore(flags); 2266 } 2267 2268 void __netif_schedule(struct Qdisc *q) 2269 { 2270 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) 2271 __netif_reschedule(q); 2272 } 2273 EXPORT_SYMBOL(__netif_schedule); 2274 2275 struct dev_kfree_skb_cb { 2276 enum skb_free_reason reason; 2277 }; 2278 2279 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) 2280 { 2281 return (struct dev_kfree_skb_cb *)skb->cb; 2282 } 2283 2284 void netif_schedule_queue(struct netdev_queue *txq) 2285 { 2286 rcu_read_lock(); 2287 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { 2288 struct Qdisc *q = rcu_dereference(txq->qdisc); 2289 2290 __netif_schedule(q); 2291 } 2292 rcu_read_unlock(); 2293 } 2294 EXPORT_SYMBOL(netif_schedule_queue); 2295 2296 /** 2297 * netif_wake_subqueue - allow sending packets on subqueue 2298 * @dev: network device 2299 * @queue_index: sub queue index 2300 * 2301 * Resume individual transmit queue of a device with multiple transmit queues. 
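 *
 * Typically paired with netif_stop_subqueue(): the driver stops a subqueue
 * from its xmit routine when the matching hardware ring fills up and wakes
 * it again from the TX-completion path once descriptors are reclaimed.
 * A rough sketch (ring and threshold names are invented for illustration):
 *
 *	if (example_ring_free(ring) < MAX_SKB_FRAGS + 1)
 *		netif_stop_subqueue(dev, ring->queue_index);
 *	...
 *	if (example_ring_free(ring) > ring->wake_threshold)
 *		netif_wake_subqueue(dev, ring->queue_index);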
2302 */ 2303 void netif_wake_subqueue(struct net_device *dev, u16 queue_index) 2304 { 2305 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); 2306 2307 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) { 2308 struct Qdisc *q; 2309 2310 rcu_read_lock(); 2311 q = rcu_dereference(txq->qdisc); 2312 __netif_schedule(q); 2313 rcu_read_unlock(); 2314 } 2315 } 2316 EXPORT_SYMBOL(netif_wake_subqueue); 2317 2318 void netif_tx_wake_queue(struct netdev_queue *dev_queue) 2319 { 2320 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { 2321 struct Qdisc *q; 2322 2323 rcu_read_lock(); 2324 q = rcu_dereference(dev_queue->qdisc); 2325 __netif_schedule(q); 2326 rcu_read_unlock(); 2327 } 2328 } 2329 EXPORT_SYMBOL(netif_tx_wake_queue); 2330 2331 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) 2332 { 2333 unsigned long flags; 2334 2335 if (likely(atomic_read(&skb->users) == 1)) { 2336 smp_rmb(); 2337 atomic_set(&skb->users, 0); 2338 } else if (likely(!atomic_dec_and_test(&skb->users))) { 2339 return; 2340 } 2341 get_kfree_skb_cb(skb)->reason = reason; 2342 local_irq_save(flags); 2343 skb->next = __this_cpu_read(softnet_data.completion_queue); 2344 __this_cpu_write(softnet_data.completion_queue, skb); 2345 raise_softirq_irqoff(NET_TX_SOFTIRQ); 2346 local_irq_restore(flags); 2347 } 2348 EXPORT_SYMBOL(__dev_kfree_skb_irq); 2349 2350 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) 2351 { 2352 if (in_irq() || irqs_disabled()) 2353 __dev_kfree_skb_irq(skb, reason); 2354 else 2355 dev_kfree_skb(skb); 2356 } 2357 EXPORT_SYMBOL(__dev_kfree_skb_any); 2358 2359 2360 /** 2361 * netif_device_detach - mark device as removed 2362 * @dev: network device 2363 * 2364 * Mark device as removed from system and therefore no longer available. 2365 */ 2366 void netif_device_detach(struct net_device *dev) 2367 { 2368 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && 2369 netif_running(dev)) { 2370 netif_tx_stop_all_queues(dev); 2371 } 2372 } 2373 EXPORT_SYMBOL(netif_device_detach); 2374 2375 /** 2376 * netif_device_attach - mark device as attached 2377 * @dev: network device 2378 * 2379 * Mark device as attached from system and restart if needed. 2380 */ 2381 void netif_device_attach(struct net_device *dev) 2382 { 2383 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && 2384 netif_running(dev)) { 2385 netif_tx_wake_all_queues(dev); 2386 __netdev_watchdog_up(dev); 2387 } 2388 } 2389 EXPORT_SYMBOL(netif_device_attach); 2390 2391 /* 2392 * Returns a Tx hash based on the given packet descriptor a Tx queues' number 2393 * to be used as a distribution range. 
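 *
 * The scaling step below maps a 32-bit flow hash onto [0, qcount) with a
 * multiply-and-shift (reciprocal_scale()) rather than a modulo; for
 * instance a hash of 0x80000000 spread over 8 queues selects queue 4,
 * plus any traffic-class offset. (Worked example only; real hashes come
 * from skb_get_hash().)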
2394 */ 2395 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb, 2396 unsigned int num_tx_queues) 2397 { 2398 u32 hash; 2399 u16 qoffset = 0; 2400 u16 qcount = num_tx_queues; 2401 2402 if (skb_rx_queue_recorded(skb)) { 2403 hash = skb_get_rx_queue(skb); 2404 while (unlikely(hash >= num_tx_queues)) 2405 hash -= num_tx_queues; 2406 return hash; 2407 } 2408 2409 if (dev->num_tc) { 2410 u8 tc = netdev_get_prio_tc_map(dev, skb->priority); 2411 qoffset = dev->tc_to_txq[tc].offset; 2412 qcount = dev->tc_to_txq[tc].count; 2413 } 2414 2415 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset; 2416 } 2417 EXPORT_SYMBOL(__skb_tx_hash); 2418 2419 static void skb_warn_bad_offload(const struct sk_buff *skb) 2420 { 2421 static const netdev_features_t null_features; 2422 struct net_device *dev = skb->dev; 2423 const char *name = ""; 2424 2425 if (!net_ratelimit()) 2426 return; 2427 2428 if (dev) { 2429 if (dev->dev.parent) 2430 name = dev_driver_string(dev->dev.parent); 2431 else 2432 name = netdev_name(dev); 2433 } 2434 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " 2435 "gso_type=%d ip_summed=%d\n", 2436 name, dev ? &dev->features : &null_features, 2437 skb->sk ? &skb->sk->sk_route_caps : &null_features, 2438 skb->len, skb->data_len, skb_shinfo(skb)->gso_size, 2439 skb_shinfo(skb)->gso_type, skb->ip_summed); 2440 } 2441 2442 /* 2443 * Invalidate hardware checksum when packet is to be mangled, and 2444 * complete checksum manually on outgoing path. 2445 */ 2446 int skb_checksum_help(struct sk_buff *skb) 2447 { 2448 __wsum csum; 2449 int ret = 0, offset; 2450 2451 if (skb->ip_summed == CHECKSUM_COMPLETE) 2452 goto out_set_summed; 2453 2454 if (unlikely(skb_shinfo(skb)->gso_size)) { 2455 skb_warn_bad_offload(skb); 2456 return -EINVAL; 2457 } 2458 2459 /* Before computing a checksum, we should make sure no frag could 2460 * be modified by an external entity : checksum could be wrong. 2461 */ 2462 if (skb_has_shared_frag(skb)) { 2463 ret = __skb_linearize(skb); 2464 if (ret) 2465 goto out; 2466 } 2467 2468 offset = skb_checksum_start_offset(skb); 2469 BUG_ON(offset >= skb_headlen(skb)); 2470 csum = skb_checksum(skb, offset, skb->len - offset, 0); 2471 2472 offset += skb->csum_offset; 2473 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); 2474 2475 if (skb_cloned(skb) && 2476 !skb_clone_writable(skb, offset + sizeof(__sum16))) { 2477 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 2478 if (ret) 2479 goto out; 2480 } 2481 2482 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; 2483 out_set_summed: 2484 skb->ip_summed = CHECKSUM_NONE; 2485 out: 2486 return ret; 2487 } 2488 EXPORT_SYMBOL(skb_checksum_help); 2489 2490 /* skb_csum_offload_check - Driver helper function to determine if a device 2491 * with limited checksum offload capabilities is able to offload the checksum 2492 * for a given packet. 2493 * 2494 * Arguments: 2495 * skb - sk_buff for the packet in question 2496 * spec - contains the description of what device can offload 2497 * csum_encapped - returns true if the checksum being offloaded is 2498 * encpasulated. That is it is checksum for the transport header 2499 * in the inner headers. 
2500 * checksum_help - when set indicates that helper function should 2501 * call skb_checksum_help if offload checks fail 2502 * 2503 * Returns: 2504 * true: Packet has passed the checksum checks and should be offloadable to 2505 * the device (a driver may still need to check for additional 2506 * restrictions of its device) 2507 * false: Checksum is not offloadable. If checksum_help was set then 2508 * skb_checksum_help was called to resolve checksum for non-GSO 2509 * packets and when IP protocol is not SCTP 2510 */ 2511 bool __skb_csum_offload_chk(struct sk_buff *skb, 2512 const struct skb_csum_offl_spec *spec, 2513 bool *csum_encapped, 2514 bool csum_help) 2515 { 2516 struct iphdr *iph; 2517 struct ipv6hdr *ipv6; 2518 void *nhdr; 2519 int protocol; 2520 u8 ip_proto; 2521 2522 if (skb->protocol == htons(ETH_P_8021Q) || 2523 skb->protocol == htons(ETH_P_8021AD)) { 2524 if (!spec->vlan_okay) 2525 goto need_help; 2526 } 2527 2528 /* We check whether the checksum refers to a transport layer checksum in 2529 * the outermost header or an encapsulated transport layer checksum that 2530 * corresponds to the inner headers of the skb. If the checksum is for 2531 * something else in the packet we need help. 2532 */ 2533 if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) { 2534 /* Non-encapsulated checksum */ 2535 protocol = eproto_to_ipproto(vlan_get_protocol(skb)); 2536 nhdr = skb_network_header(skb); 2537 *csum_encapped = false; 2538 if (spec->no_not_encapped) 2539 goto need_help; 2540 } else if (skb->encapsulation && spec->encap_okay && 2541 skb_checksum_start_offset(skb) == 2542 skb_inner_transport_offset(skb)) { 2543 /* Encapsulated checksum */ 2544 *csum_encapped = true; 2545 switch (skb->inner_protocol_type) { 2546 case ENCAP_TYPE_ETHER: 2547 protocol = eproto_to_ipproto(skb->inner_protocol); 2548 break; 2549 case ENCAP_TYPE_IPPROTO: 2550 protocol = skb->inner_protocol; 2551 break; 2552 } 2553 nhdr = skb_inner_network_header(skb); 2554 } else { 2555 goto need_help; 2556 } 2557 2558 switch (protocol) { 2559 case IPPROTO_IP: 2560 if (!spec->ipv4_okay) 2561 goto need_help; 2562 iph = nhdr; 2563 ip_proto = iph->protocol; 2564 if (iph->ihl != 5 && !spec->ip_options_okay) 2565 goto need_help; 2566 break; 2567 case IPPROTO_IPV6: 2568 if (!spec->ipv6_okay) 2569 goto need_help; 2570 if (spec->no_encapped_ipv6 && *csum_encapped) 2571 goto need_help; 2572 ipv6 = nhdr; 2573 nhdr += sizeof(*ipv6); 2574 ip_proto = ipv6->nexthdr; 2575 break; 2576 default: 2577 goto need_help; 2578 } 2579 2580 ip_proto_again: 2581 switch (ip_proto) { 2582 case IPPROTO_TCP: 2583 if (!spec->tcp_okay || 2584 skb->csum_offset != offsetof(struct tcphdr, check)) 2585 goto need_help; 2586 break; 2587 case IPPROTO_UDP: 2588 if (!spec->udp_okay || 2589 skb->csum_offset != offsetof(struct udphdr, check)) 2590 goto need_help; 2591 break; 2592 case IPPROTO_SCTP: 2593 if (!spec->sctp_okay || 2594 skb->csum_offset != offsetof(struct sctphdr, checksum)) 2595 goto cant_help; 2596 break; 2597 case NEXTHDR_HOP: 2598 case NEXTHDR_ROUTING: 2599 case NEXTHDR_DEST: { 2600 u8 *opthdr = nhdr; 2601 2602 if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay) 2603 goto need_help; 2604 2605 ip_proto = opthdr[0]; 2606 nhdr += (opthdr[1] + 1) << 3; 2607 2608 goto ip_proto_again; 2609 } 2610 default: 2611 goto need_help; 2612 } 2613 2614 /* Passed the tests for offloading checksum */ 2615 return true; 2616 2617 need_help: 2618 if (csum_help && !skb_shinfo(skb)->gso_size) 2619 skb_checksum_help(skb); 2620 cant_help: 2621 return false; 2622 
} 2623 EXPORT_SYMBOL(__skb_csum_offload_chk); 2624 2625 __be16 skb_network_protocol(struct sk_buff *skb, int *depth) 2626 { 2627 __be16 type = skb->protocol; 2628 2629 /* Tunnel gso handlers can set protocol to ethernet. */ 2630 if (type == htons(ETH_P_TEB)) { 2631 struct ethhdr *eth; 2632 2633 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) 2634 return 0; 2635 2636 eth = (struct ethhdr *)skb_mac_header(skb); 2637 type = eth->h_proto; 2638 } 2639 2640 return __vlan_get_protocol(skb, type, depth); 2641 } 2642 2643 /** 2644 * skb_mac_gso_segment - mac layer segmentation handler. 2645 * @skb: buffer to segment 2646 * @features: features for the output path (see dev->features) 2647 */ 2648 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, 2649 netdev_features_t features) 2650 { 2651 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); 2652 struct packet_offload *ptype; 2653 int vlan_depth = skb->mac_len; 2654 __be16 type = skb_network_protocol(skb, &vlan_depth); 2655 2656 if (unlikely(!type)) 2657 return ERR_PTR(-EINVAL); 2658 2659 __skb_pull(skb, vlan_depth); 2660 2661 rcu_read_lock(); 2662 list_for_each_entry_rcu(ptype, &offload_base, list) { 2663 if (ptype->type == type && ptype->callbacks.gso_segment) { 2664 segs = ptype->callbacks.gso_segment(skb, features); 2665 break; 2666 } 2667 } 2668 rcu_read_unlock(); 2669 2670 __skb_push(skb, skb->data - skb_mac_header(skb)); 2671 2672 return segs; 2673 } 2674 EXPORT_SYMBOL(skb_mac_gso_segment); 2675 2676 2677 /* openvswitch calls this on rx path, so we need a different check. 2678 */ 2679 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) 2680 { 2681 if (tx_path) 2682 return skb->ip_summed != CHECKSUM_PARTIAL; 2683 else 2684 return skb->ip_summed == CHECKSUM_NONE; 2685 } 2686 2687 /** 2688 * __skb_gso_segment - Perform segmentation on skb. 2689 * @skb: buffer to segment 2690 * @features: features for the output path (see dev->features) 2691 * @tx_path: whether it is called in TX path 2692 * 2693 * This function segments the given skb and returns a list of segments. 2694 * 2695 * It may return NULL if the skb requires no segmentation. This is 2696 * only possible when GSO is used for verifying header integrity. 2697 * 2698 * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb. 2699 */ 2700 struct sk_buff *__skb_gso_segment(struct sk_buff *skb, 2701 netdev_features_t features, bool tx_path) 2702 { 2703 if (unlikely(skb_needs_check(skb, tx_path))) { 2704 int err; 2705 2706 skb_warn_bad_offload(skb); 2707 2708 err = skb_cow_head(skb, 0); 2709 if (err < 0) 2710 return ERR_PTR(err); 2711 } 2712 2713 /* Only report GSO partial support if it will enable us to 2714 * support segmentation on this frame without needing additional 2715 * work. 2716 */ 2717 if (features & NETIF_F_GSO_PARTIAL) { 2718 netdev_features_t partial_features = NETIF_F_GSO_ROBUST; 2719 struct net_device *dev = skb->dev; 2720 2721 partial_features |= dev->features & dev->gso_partial_features; 2722 if (!skb_gso_ok(skb, features | partial_features)) 2723 features &= ~NETIF_F_GSO_PARTIAL; 2724 } 2725 2726 BUILD_BUG_ON(SKB_SGO_CB_OFFSET + 2727 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); 2728 2729 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); 2730 SKB_GSO_CB(skb)->encap_level = 0; 2731 2732 skb_reset_mac_header(skb); 2733 skb_reset_mac_len(skb); 2734 2735 return skb_mac_gso_segment(skb, features); 2736 } 2737 EXPORT_SYMBOL(__skb_gso_segment); 2738 2739 /* Take action when hardware reception checksum errors are detected. 
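 * Typically reached, rate limited, from checksum-completion helpers such
 * as __skb_checksum_complete() when a hardware-supplied receive checksum
 * turns out to be wrong; the dump_stack() below is there to identify the
 * offending driver/offload path. (Illustrative pointer only; callers are
 * not limited to that helper.)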
*/ 2740 #ifdef CONFIG_BUG 2741 void netdev_rx_csum_fault(struct net_device *dev) 2742 { 2743 if (net_ratelimit()) { 2744 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); 2745 dump_stack(); 2746 } 2747 } 2748 EXPORT_SYMBOL(netdev_rx_csum_fault); 2749 #endif 2750 2751 /* Actually, we should eliminate this check as soon as we know, that: 2752 * 1. IOMMU is present and allows to map all the memory. 2753 * 2. No high memory really exists on this machine. 2754 */ 2755 2756 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) 2757 { 2758 #ifdef CONFIG_HIGHMEM 2759 int i; 2760 if (!(dev->features & NETIF_F_HIGHDMA)) { 2761 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2762 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2763 if (PageHighMem(skb_frag_page(frag))) 2764 return 1; 2765 } 2766 } 2767 2768 if (PCI_DMA_BUS_IS_PHYS) { 2769 struct device *pdev = dev->dev.parent; 2770 2771 if (!pdev) 2772 return 0; 2773 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2774 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2775 dma_addr_t addr = page_to_phys(skb_frag_page(frag)); 2776 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) 2777 return 1; 2778 } 2779 } 2780 #endif 2781 return 0; 2782 } 2783 2784 /* If MPLS offload request, verify we are testing hardware MPLS features 2785 * instead of standard features for the netdev. 2786 */ 2787 #if IS_ENABLED(CONFIG_NET_MPLS_GSO) 2788 static netdev_features_t net_mpls_features(struct sk_buff *skb, 2789 netdev_features_t features, 2790 __be16 type) 2791 { 2792 if (eth_p_mpls(type)) 2793 features &= skb->dev->mpls_features; 2794 2795 return features; 2796 } 2797 #else 2798 static netdev_features_t net_mpls_features(struct sk_buff *skb, 2799 netdev_features_t features, 2800 __be16 type) 2801 { 2802 return features; 2803 } 2804 #endif 2805 2806 static netdev_features_t harmonize_features(struct sk_buff *skb, 2807 netdev_features_t features) 2808 { 2809 int tmp; 2810 __be16 type; 2811 2812 type = skb_network_protocol(skb, &tmp); 2813 features = net_mpls_features(skb, features, type); 2814 2815 if (skb->ip_summed != CHECKSUM_NONE && 2816 !can_checksum_protocol(features, type)) { 2817 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); 2818 } else if (illegal_highdma(skb->dev, skb)) { 2819 features &= ~NETIF_F_SG; 2820 } 2821 2822 return features; 2823 } 2824 2825 netdev_features_t passthru_features_check(struct sk_buff *skb, 2826 struct net_device *dev, 2827 netdev_features_t features) 2828 { 2829 return features; 2830 } 2831 EXPORT_SYMBOL(passthru_features_check); 2832 2833 static netdev_features_t dflt_features_check(const struct sk_buff *skb, 2834 struct net_device *dev, 2835 netdev_features_t features) 2836 { 2837 return vlan_features_check(skb, features); 2838 } 2839 2840 static netdev_features_t gso_features_check(const struct sk_buff *skb, 2841 struct net_device *dev, 2842 netdev_features_t features) 2843 { 2844 u16 gso_segs = skb_shinfo(skb)->gso_segs; 2845 2846 if (gso_segs > dev->gso_max_segs) 2847 return features & ~NETIF_F_GSO_MASK; 2848 2849 /* Support for GSO partial features requires software 2850 * intervention before we can actually process the packets 2851 * so we need to strip support for any partial features now 2852 * and we can pull them back in after we have partially 2853 * segmented the frame. 
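 * (For example -- an assumption about typical hardware, not something
 * defined here -- a NIC that can only handle the outer UDP tunnel
 * checksum as a GSO_PARTIAL operation would advertise
 * NETIF_F_GSO_UDP_TUNNEL_CSUM in gso_partial_features; that bit is
 * stripped below for skbs not marked SKB_GSO_PARTIAL.)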
2854 */ 2855 if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) 2856 features &= ~dev->gso_partial_features; 2857 2858 /* Make sure to clear the IPv4 ID mangling feature if the 2859 * IPv4 header has the potential to be fragmented. 2860 */ 2861 if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { 2862 struct iphdr *iph = skb->encapsulation ? 2863 inner_ip_hdr(skb) : ip_hdr(skb); 2864 2865 if (!(iph->frag_off & htons(IP_DF))) 2866 features &= ~NETIF_F_TSO_MANGLEID; 2867 } 2868 2869 return features; 2870 } 2871 2872 netdev_features_t netif_skb_features(struct sk_buff *skb) 2873 { 2874 struct net_device *dev = skb->dev; 2875 netdev_features_t features = dev->features; 2876 2877 if (skb_is_gso(skb)) 2878 features = gso_features_check(skb, dev, features); 2879 2880 /* If encapsulation offload request, verify we are testing 2881 * hardware encapsulation features instead of standard 2882 * features for the netdev 2883 */ 2884 if (skb->encapsulation) 2885 features &= dev->hw_enc_features; 2886 2887 if (skb_vlan_tagged(skb)) 2888 features = netdev_intersect_features(features, 2889 dev->vlan_features | 2890 NETIF_F_HW_VLAN_CTAG_TX | 2891 NETIF_F_HW_VLAN_STAG_TX); 2892 2893 if (dev->netdev_ops->ndo_features_check) 2894 features &= dev->netdev_ops->ndo_features_check(skb, dev, 2895 features); 2896 else 2897 features &= dflt_features_check(skb, dev, features); 2898 2899 return harmonize_features(skb, features); 2900 } 2901 EXPORT_SYMBOL(netif_skb_features); 2902 2903 static int xmit_one(struct sk_buff *skb, struct net_device *dev, 2904 struct netdev_queue *txq, bool more) 2905 { 2906 unsigned int len; 2907 int rc; 2908 2909 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all)) 2910 dev_queue_xmit_nit(skb, dev); 2911 2912 len = skb->len; 2913 trace_net_dev_start_xmit(skb, dev); 2914 rc = netdev_start_xmit(skb, dev, txq, more); 2915 trace_net_dev_xmit(skb, rc, dev, len); 2916 2917 return rc; 2918 } 2919 2920 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, 2921 struct netdev_queue *txq, int *ret) 2922 { 2923 struct sk_buff *skb = first; 2924 int rc = NETDEV_TX_OK; 2925 2926 while (skb) { 2927 struct sk_buff *next = skb->next; 2928 2929 skb->next = NULL; 2930 rc = xmit_one(skb, dev, txq, next != NULL); 2931 if (unlikely(!dev_xmit_complete(rc))) { 2932 skb->next = next; 2933 goto out; 2934 } 2935 2936 skb = next; 2937 if (netif_xmit_stopped(txq) && skb) { 2938 rc = NETDEV_TX_BUSY; 2939 break; 2940 } 2941 } 2942 2943 out: 2944 *ret = rc; 2945 return skb; 2946 } 2947 2948 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, 2949 netdev_features_t features) 2950 { 2951 if (skb_vlan_tag_present(skb) && 2952 !vlan_hw_offload_capable(features, skb->vlan_proto)) 2953 skb = __vlan_hwaccel_push_inside(skb); 2954 return skb; 2955 } 2956 2957 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) 2958 { 2959 netdev_features_t features; 2960 2961 features = netif_skb_features(skb); 2962 skb = validate_xmit_vlan(skb, features); 2963 if (unlikely(!skb)) 2964 goto out_null; 2965 2966 if (netif_needs_gso(skb, features)) { 2967 struct sk_buff *segs; 2968 2969 segs = skb_gso_segment(skb, features); 2970 if (IS_ERR(segs)) { 2971 goto out_kfree_skb; 2972 } else if (segs) { 2973 consume_skb(skb); 2974 skb = segs; 2975 } 2976 } else { 2977 if (skb_needs_linearize(skb, features) && 2978 __skb_linearize(skb)) 2979 goto out_kfree_skb; 2980 2981 /* If packet is not checksummed and device does not 2982 * support checksumming for this protocol, complete 
2983 * checksumming here. 2984 */ 2985 if (skb->ip_summed == CHECKSUM_PARTIAL) { 2986 if (skb->encapsulation) 2987 skb_set_inner_transport_header(skb, 2988 skb_checksum_start_offset(skb)); 2989 else 2990 skb_set_transport_header(skb, 2991 skb_checksum_start_offset(skb)); 2992 if (!(features & NETIF_F_CSUM_MASK) && 2993 skb_checksum_help(skb)) 2994 goto out_kfree_skb; 2995 } 2996 } 2997 2998 return skb; 2999 3000 out_kfree_skb: 3001 kfree_skb(skb); 3002 out_null: 3003 atomic_long_inc(&dev->tx_dropped); 3004 return NULL; 3005 } 3006 3007 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev) 3008 { 3009 struct sk_buff *next, *head = NULL, *tail; 3010 3011 for (; skb != NULL; skb = next) { 3012 next = skb->next; 3013 skb->next = NULL; 3014 3015 /* in case skb wont be segmented, point to itself */ 3016 skb->prev = skb; 3017 3018 skb = validate_xmit_skb(skb, dev); 3019 if (!skb) 3020 continue; 3021 3022 if (!head) 3023 head = skb; 3024 else 3025 tail->next = skb; 3026 /* If skb was segmented, skb->prev points to 3027 * the last segment. If not, it still contains skb. 3028 */ 3029 tail = skb->prev; 3030 } 3031 return head; 3032 } 3033 EXPORT_SYMBOL_GPL(validate_xmit_skb_list); 3034 3035 static void qdisc_pkt_len_init(struct sk_buff *skb) 3036 { 3037 const struct skb_shared_info *shinfo = skb_shinfo(skb); 3038 3039 qdisc_skb_cb(skb)->pkt_len = skb->len; 3040 3041 /* To get more precise estimation of bytes sent on wire, 3042 * we add to pkt_len the headers size of all segments 3043 */ 3044 if (shinfo->gso_size) { 3045 unsigned int hdr_len; 3046 u16 gso_segs = shinfo->gso_segs; 3047 3048 /* mac layer + network layer */ 3049 hdr_len = skb_transport_header(skb) - skb_mac_header(skb); 3050 3051 /* + transport layer */ 3052 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) 3053 hdr_len += tcp_hdrlen(skb); 3054 else 3055 hdr_len += sizeof(struct udphdr); 3056 3057 if (shinfo->gso_type & SKB_GSO_DODGY) 3058 gso_segs = DIV_ROUND_UP(skb->len - hdr_len, 3059 shinfo->gso_size); 3060 3061 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; 3062 } 3063 } 3064 3065 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, 3066 struct net_device *dev, 3067 struct netdev_queue *txq) 3068 { 3069 spinlock_t *root_lock = qdisc_lock(q); 3070 struct sk_buff *to_free = NULL; 3071 bool contended; 3072 int rc; 3073 3074 qdisc_calculate_pkt_len(skb, q); 3075 /* 3076 * Heuristic to force contended enqueues to serialize on a 3077 * separate lock before trying to get qdisc main lock. 3078 * This permits qdisc->running owner to get the lock more 3079 * often and dequeue packets faster. 3080 */ 3081 contended = qdisc_is_running(q); 3082 if (unlikely(contended)) 3083 spin_lock(&q->busylock); 3084 3085 spin_lock(root_lock); 3086 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { 3087 __qdisc_drop(skb, &to_free); 3088 rc = NET_XMIT_DROP; 3089 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && 3090 qdisc_run_begin(q)) { 3091 /* 3092 * This is a work-conserving queue; there are no old skbs 3093 * waiting to be sent out; and the qdisc is not running - 3094 * xmit the skb directly. 
3095 */ 3096 3097 qdisc_bstats_update(q, skb); 3098 3099 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { 3100 if (unlikely(contended)) { 3101 spin_unlock(&q->busylock); 3102 contended = false; 3103 } 3104 __qdisc_run(q); 3105 } else 3106 qdisc_run_end(q); 3107 3108 rc = NET_XMIT_SUCCESS; 3109 } else { 3110 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; 3111 if (qdisc_run_begin(q)) { 3112 if (unlikely(contended)) { 3113 spin_unlock(&q->busylock); 3114 contended = false; 3115 } 3116 __qdisc_run(q); 3117 } 3118 } 3119 spin_unlock(root_lock); 3120 if (unlikely(to_free)) 3121 kfree_skb_list(to_free); 3122 if (unlikely(contended)) 3123 spin_unlock(&q->busylock); 3124 return rc; 3125 } 3126 3127 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) 3128 static void skb_update_prio(struct sk_buff *skb) 3129 { 3130 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); 3131 3132 if (!skb->priority && skb->sk && map) { 3133 unsigned int prioidx = 3134 sock_cgroup_prioidx(&skb->sk->sk_cgrp_data); 3135 3136 if (prioidx < map->priomap_len) 3137 skb->priority = map->priomap[prioidx]; 3138 } 3139 } 3140 #else 3141 #define skb_update_prio(skb) 3142 #endif 3143 3144 DEFINE_PER_CPU(int, xmit_recursion); 3145 EXPORT_SYMBOL(xmit_recursion); 3146 3147 /** 3148 * dev_loopback_xmit - loop back @skb 3149 * @net: network namespace this loopback is happening in 3150 * @sk: sk needed to be a netfilter okfn 3151 * @skb: buffer to transmit 3152 */ 3153 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) 3154 { 3155 skb_reset_mac_header(skb); 3156 __skb_pull(skb, skb_network_offset(skb)); 3157 skb->pkt_type = PACKET_LOOPBACK; 3158 skb->ip_summed = CHECKSUM_UNNECESSARY; 3159 WARN_ON(!skb_dst(skb)); 3160 skb_dst_force(skb); 3161 netif_rx_ni(skb); 3162 return 0; 3163 } 3164 EXPORT_SYMBOL(dev_loopback_xmit); 3165 3166 #ifdef CONFIG_NET_EGRESS 3167 static struct sk_buff * 3168 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) 3169 { 3170 struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list); 3171 struct tcf_result cl_res; 3172 3173 if (!cl) 3174 return skb; 3175 3176 /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set 3177 * earlier by the caller. 3178 */ 3179 qdisc_bstats_cpu_update(cl->q, skb); 3180 3181 switch (tc_classify(skb, cl, &cl_res, false)) { 3182 case TC_ACT_OK: 3183 case TC_ACT_RECLASSIFY: 3184 skb->tc_index = TC_H_MIN(cl_res.classid); 3185 break; 3186 case TC_ACT_SHOT: 3187 qdisc_qstats_cpu_drop(cl->q); 3188 *ret = NET_XMIT_DROP; 3189 kfree_skb(skb); 3190 return NULL; 3191 case TC_ACT_STOLEN: 3192 case TC_ACT_QUEUED: 3193 *ret = NET_XMIT_SUCCESS; 3194 consume_skb(skb); 3195 return NULL; 3196 case TC_ACT_REDIRECT: 3197 /* No need to push/pop skb's mac_header here on egress! 
*/ 3198 skb_do_redirect(skb); 3199 *ret = NET_XMIT_SUCCESS; 3200 return NULL; 3201 default: 3202 break; 3203 } 3204 3205 return skb; 3206 } 3207 #endif /* CONFIG_NET_EGRESS */ 3208 3209 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb) 3210 { 3211 #ifdef CONFIG_XPS 3212 struct xps_dev_maps *dev_maps; 3213 struct xps_map *map; 3214 int queue_index = -1; 3215 3216 rcu_read_lock(); 3217 dev_maps = rcu_dereference(dev->xps_maps); 3218 if (dev_maps) { 3219 map = rcu_dereference( 3220 dev_maps->cpu_map[skb->sender_cpu - 1]); 3221 if (map) { 3222 if (map->len == 1) 3223 queue_index = map->queues[0]; 3224 else 3225 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb), 3226 map->len)]; 3227 if (unlikely(queue_index >= dev->real_num_tx_queues)) 3228 queue_index = -1; 3229 } 3230 } 3231 rcu_read_unlock(); 3232 3233 return queue_index; 3234 #else 3235 return -1; 3236 #endif 3237 } 3238 3239 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb) 3240 { 3241 struct sock *sk = skb->sk; 3242 int queue_index = sk_tx_queue_get(sk); 3243 3244 if (queue_index < 0 || skb->ooo_okay || 3245 queue_index >= dev->real_num_tx_queues) { 3246 int new_index = get_xps_queue(dev, skb); 3247 if (new_index < 0) 3248 new_index = skb_tx_hash(dev, skb); 3249 3250 if (queue_index != new_index && sk && 3251 sk_fullsock(sk) && 3252 rcu_access_pointer(sk->sk_dst_cache)) 3253 sk_tx_queue_set(sk, new_index); 3254 3255 queue_index = new_index; 3256 } 3257 3258 return queue_index; 3259 } 3260 3261 struct netdev_queue *netdev_pick_tx(struct net_device *dev, 3262 struct sk_buff *skb, 3263 void *accel_priv) 3264 { 3265 int queue_index = 0; 3266 3267 #ifdef CONFIG_XPS 3268 u32 sender_cpu = skb->sender_cpu - 1; 3269 3270 if (sender_cpu >= (u32)NR_CPUS) 3271 skb->sender_cpu = raw_smp_processor_id() + 1; 3272 #endif 3273 3274 if (dev->real_num_tx_queues != 1) { 3275 const struct net_device_ops *ops = dev->netdev_ops; 3276 if (ops->ndo_select_queue) 3277 queue_index = ops->ndo_select_queue(dev, skb, accel_priv, 3278 __netdev_pick_tx); 3279 else 3280 queue_index = __netdev_pick_tx(dev, skb); 3281 3282 if (!accel_priv) 3283 queue_index = netdev_cap_txqueue(dev, queue_index); 3284 } 3285 3286 skb_set_queue_mapping(skb, queue_index); 3287 return netdev_get_tx_queue(dev, queue_index); 3288 } 3289 3290 /** 3291 * __dev_queue_xmit - transmit a buffer 3292 * @skb: buffer to transmit 3293 * @accel_priv: private data used for L2 forwarding offload 3294 * 3295 * Queue a buffer for transmission to a network device. The caller must 3296 * have set the device and priority and built the buffer before calling 3297 * this function. The function can be called from an interrupt. 3298 * 3299 * A negative errno code is returned on a failure. A success does not 3300 * guarantee the frame will be transmitted as it may be dropped due 3301 * to congestion or traffic shaping. 3302 * 3303 * ----------------------------------------------------------------------------------- 3304 * I notice this method can also return errors from the queue disciplines, 3305 * including NET_XMIT_DROP, which is a positive value. So, errors can also 3306 * be positive. 3307 * 3308 * Regardless of the return value, the skb is consumed, so it is currently 3309 * difficult to retry a send to this method. (You can bump the ref count 3310 * before sending to hold a reference for retry if you are careful.) 3311 * 3312 * When calling this method, interrupts MUST be enabled. 
This is because 3313 * the BH enable code must have IRQs enabled so that it will not deadlock. 3314 * --BLG 3315 */ 3316 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) 3317 { 3318 struct net_device *dev = skb->dev; 3319 struct netdev_queue *txq; 3320 struct Qdisc *q; 3321 int rc = -ENOMEM; 3322 3323 skb_reset_mac_header(skb); 3324 3325 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) 3326 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); 3327 3328 /* Disable soft irqs for various locks below. Also 3329 * stops preemption for RCU. 3330 */ 3331 rcu_read_lock_bh(); 3332 3333 skb_update_prio(skb); 3334 3335 qdisc_pkt_len_init(skb); 3336 #ifdef CONFIG_NET_CLS_ACT 3337 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); 3338 # ifdef CONFIG_NET_EGRESS 3339 if (static_key_false(&egress_needed)) { 3340 skb = sch_handle_egress(skb, &rc, dev); 3341 if (!skb) 3342 goto out; 3343 } 3344 # endif 3345 #endif 3346 /* If device/qdisc don't need skb->dst, release it right now while 3347 * its hot in this cpu cache. 3348 */ 3349 if (dev->priv_flags & IFF_XMIT_DST_RELEASE) 3350 skb_dst_drop(skb); 3351 else 3352 skb_dst_force(skb); 3353 3354 txq = netdev_pick_tx(dev, skb, accel_priv); 3355 q = rcu_dereference_bh(txq->qdisc); 3356 3357 trace_net_dev_queue(skb); 3358 if (q->enqueue) { 3359 rc = __dev_xmit_skb(skb, q, dev, txq); 3360 goto out; 3361 } 3362 3363 /* The device has no queue. Common case for software devices: 3364 loopback, all the sorts of tunnels... 3365 3366 Really, it is unlikely that netif_tx_lock protection is necessary 3367 here. (f.e. loopback and IP tunnels are clean ignoring statistics 3368 counters.) 3369 However, it is possible, that they rely on protection 3370 made by us here. 3371 3372 Check this and shot the lock. It is not prone from deadlocks. 3373 Either shot noqueue qdisc, it is even simpler 8) 3374 */ 3375 if (dev->flags & IFF_UP) { 3376 int cpu = smp_processor_id(); /* ok because BHs are off */ 3377 3378 if (txq->xmit_lock_owner != cpu) { 3379 if (unlikely(__this_cpu_read(xmit_recursion) > 3380 XMIT_RECURSION_LIMIT)) 3381 goto recursion_alert; 3382 3383 skb = validate_xmit_skb(skb, dev); 3384 if (!skb) 3385 goto out; 3386 3387 HARD_TX_LOCK(dev, txq, cpu); 3388 3389 if (!netif_xmit_stopped(txq)) { 3390 __this_cpu_inc(xmit_recursion); 3391 skb = dev_hard_start_xmit(skb, dev, txq, &rc); 3392 __this_cpu_dec(xmit_recursion); 3393 if (dev_xmit_complete(rc)) { 3394 HARD_TX_UNLOCK(dev, txq); 3395 goto out; 3396 } 3397 } 3398 HARD_TX_UNLOCK(dev, txq); 3399 net_crit_ratelimited("Virtual device %s asks to queue packet!\n", 3400 dev->name); 3401 } else { 3402 /* Recursion is detected! 
It is possible, 3403 * unfortunately 3404 */ 3405 recursion_alert: 3406 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", 3407 dev->name); 3408 } 3409 } 3410 3411 rc = -ENETDOWN; 3412 rcu_read_unlock_bh(); 3413 3414 atomic_long_inc(&dev->tx_dropped); 3415 kfree_skb_list(skb); 3416 return rc; 3417 out: 3418 rcu_read_unlock_bh(); 3419 return rc; 3420 } 3421 3422 int dev_queue_xmit(struct sk_buff *skb) 3423 { 3424 return __dev_queue_xmit(skb, NULL); 3425 } 3426 EXPORT_SYMBOL(dev_queue_xmit); 3427 3428 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) 3429 { 3430 return __dev_queue_xmit(skb, accel_priv); 3431 } 3432 EXPORT_SYMBOL(dev_queue_xmit_accel); 3433 3434 3435 /*======================================================================= 3436 Receiver routines 3437 =======================================================================*/ 3438 3439 int netdev_max_backlog __read_mostly = 1000; 3440 EXPORT_SYMBOL(netdev_max_backlog); 3441 3442 int netdev_tstamp_prequeue __read_mostly = 1; 3443 int netdev_budget __read_mostly = 300; 3444 int weight_p __read_mostly = 64; /* old backlog weight */ 3445 3446 /* Called with irq disabled */ 3447 static inline void ____napi_schedule(struct softnet_data *sd, 3448 struct napi_struct *napi) 3449 { 3450 list_add_tail(&napi->poll_list, &sd->poll_list); 3451 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 3452 } 3453 3454 #ifdef CONFIG_RPS 3455 3456 /* One global table that all flow-based protocols share. */ 3457 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; 3458 EXPORT_SYMBOL(rps_sock_flow_table); 3459 u32 rps_cpu_mask __read_mostly; 3460 EXPORT_SYMBOL(rps_cpu_mask); 3461 3462 struct static_key rps_needed __read_mostly; 3463 EXPORT_SYMBOL(rps_needed); 3464 3465 static struct rps_dev_flow * 3466 set_rps_cpu(struct net_device *dev, struct sk_buff *skb, 3467 struct rps_dev_flow *rflow, u16 next_cpu) 3468 { 3469 if (next_cpu < nr_cpu_ids) { 3470 #ifdef CONFIG_RFS_ACCEL 3471 struct netdev_rx_queue *rxqueue; 3472 struct rps_dev_flow_table *flow_table; 3473 struct rps_dev_flow *old_rflow; 3474 u32 flow_id; 3475 u16 rxq_index; 3476 int rc; 3477 3478 /* Should we steer this flow to a different hardware queue? */ 3479 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || 3480 !(dev->features & NETIF_F_NTUPLE)) 3481 goto out; 3482 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); 3483 if (rxq_index == skb_get_rx_queue(skb)) 3484 goto out; 3485 3486 rxqueue = dev->_rx + rxq_index; 3487 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3488 if (!flow_table) 3489 goto out; 3490 flow_id = skb_get_hash(skb) & flow_table->mask; 3491 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, 3492 rxq_index, flow_id); 3493 if (rc < 0) 3494 goto out; 3495 old_rflow = rflow; 3496 rflow = &flow_table->flows[flow_id]; 3497 rflow->filter = rc; 3498 if (old_rflow->filter == rflow->filter) 3499 old_rflow->filter = RPS_NO_FILTER; 3500 out: 3501 #endif 3502 rflow->last_qtail = 3503 per_cpu(softnet_data, next_cpu).input_queue_head; 3504 } 3505 3506 rflow->cpu = next_cpu; 3507 return rflow; 3508 } 3509 3510 /* 3511 * get_rps_cpu is called from netif_receive_skb and returns the target 3512 * CPU from the RPS map of the receiving queue for a given skb. 3513 * rcu_read_lock must be held on entry. 
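 *
 * Selection is two-tiered: if Receive Flow Steering has recorded a
 * desired CPU for this flow in the global rps_sock_flow_table, that CPU
 * is preferred (subject to the ordering checks below); otherwise the flow
 * hash is scaled onto the receive queue's static rps_map, conceptually
 *
 *	cpu = map->cpus[reciprocal_scale(skb_get_hash(skb), map->len)];
 *
 * A return value of -1 means "no steering"; the caller then processes the
 * packet on the local CPU.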
3514 */ 3515 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, 3516 struct rps_dev_flow **rflowp) 3517 { 3518 const struct rps_sock_flow_table *sock_flow_table; 3519 struct netdev_rx_queue *rxqueue = dev->_rx; 3520 struct rps_dev_flow_table *flow_table; 3521 struct rps_map *map; 3522 int cpu = -1; 3523 u32 tcpu; 3524 u32 hash; 3525 3526 if (skb_rx_queue_recorded(skb)) { 3527 u16 index = skb_get_rx_queue(skb); 3528 3529 if (unlikely(index >= dev->real_num_rx_queues)) { 3530 WARN_ONCE(dev->real_num_rx_queues > 1, 3531 "%s received packet on queue %u, but number " 3532 "of RX queues is %u\n", 3533 dev->name, index, dev->real_num_rx_queues); 3534 goto done; 3535 } 3536 rxqueue += index; 3537 } 3538 3539 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ 3540 3541 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3542 map = rcu_dereference(rxqueue->rps_map); 3543 if (!flow_table && !map) 3544 goto done; 3545 3546 skb_reset_network_header(skb); 3547 hash = skb_get_hash(skb); 3548 if (!hash) 3549 goto done; 3550 3551 sock_flow_table = rcu_dereference(rps_sock_flow_table); 3552 if (flow_table && sock_flow_table) { 3553 struct rps_dev_flow *rflow; 3554 u32 next_cpu; 3555 u32 ident; 3556 3557 /* First check into global flow table if there is a match */ 3558 ident = sock_flow_table->ents[hash & sock_flow_table->mask]; 3559 if ((ident ^ hash) & ~rps_cpu_mask) 3560 goto try_rps; 3561 3562 next_cpu = ident & rps_cpu_mask; 3563 3564 /* OK, now we know there is a match, 3565 * we can look at the local (per receive queue) flow table 3566 */ 3567 rflow = &flow_table->flows[hash & flow_table->mask]; 3568 tcpu = rflow->cpu; 3569 3570 /* 3571 * If the desired CPU (where last recvmsg was done) is 3572 * different from current CPU (one in the rx-queue flow 3573 * table entry), switch if one of the following holds: 3574 * - Current CPU is unset (>= nr_cpu_ids). 3575 * - Current CPU is offline. 3576 * - The current CPU's queue tail has advanced beyond the 3577 * last packet that was enqueued using this table entry. 3578 * This guarantees that all previous packets for the flow 3579 * have been dequeued, thus preserving in order delivery. 3580 */ 3581 if (unlikely(tcpu != next_cpu) && 3582 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) || 3583 ((int)(per_cpu(softnet_data, tcpu).input_queue_head - 3584 rflow->last_qtail)) >= 0)) { 3585 tcpu = next_cpu; 3586 rflow = set_rps_cpu(dev, skb, rflow, next_cpu); 3587 } 3588 3589 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { 3590 *rflowp = rflow; 3591 cpu = tcpu; 3592 goto done; 3593 } 3594 } 3595 3596 try_rps: 3597 3598 if (map) { 3599 tcpu = map->cpus[reciprocal_scale(hash, map->len)]; 3600 if (cpu_online(tcpu)) { 3601 cpu = tcpu; 3602 goto done; 3603 } 3604 } 3605 3606 done: 3607 return cpu; 3608 } 3609 3610 #ifdef CONFIG_RFS_ACCEL 3611 3612 /** 3613 * rps_may_expire_flow - check whether an RFS hardware filter may be removed 3614 * @dev: Device on which the filter was set 3615 * @rxq_index: RX queue index 3616 * @flow_id: Flow ID passed to ndo_rx_flow_steer() 3617 * @filter_id: Filter ID returned by ndo_rx_flow_steer() 3618 * 3619 * Drivers that implement ndo_rx_flow_steer() should periodically call 3620 * this function for each installed filter and remove the filters for 3621 * which it returns %true. 
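 *
 * A hedged driver-side sketch (the filter bookkeeping below is invented
 * for illustration; real drivers keep this state in their own
 * structures):
 *
 *	for (i = 0; i < priv->n_filters; i++)
 *		if (rps_may_expire_flow(dev, priv->filter[i].rxq_index,
 *					priv->filter[i].flow_id, i))
 *			example_remove_hw_filter(priv, i);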
3622 */ 3623 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, 3624 u32 flow_id, u16 filter_id) 3625 { 3626 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; 3627 struct rps_dev_flow_table *flow_table; 3628 struct rps_dev_flow *rflow; 3629 bool expire = true; 3630 unsigned int cpu; 3631 3632 rcu_read_lock(); 3633 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3634 if (flow_table && flow_id <= flow_table->mask) { 3635 rflow = &flow_table->flows[flow_id]; 3636 cpu = ACCESS_ONCE(rflow->cpu); 3637 if (rflow->filter == filter_id && cpu < nr_cpu_ids && 3638 ((int)(per_cpu(softnet_data, cpu).input_queue_head - 3639 rflow->last_qtail) < 3640 (int)(10 * flow_table->mask))) 3641 expire = false; 3642 } 3643 rcu_read_unlock(); 3644 return expire; 3645 } 3646 EXPORT_SYMBOL(rps_may_expire_flow); 3647 3648 #endif /* CONFIG_RFS_ACCEL */ 3649 3650 /* Called from hardirq (IPI) context */ 3651 static void rps_trigger_softirq(void *data) 3652 { 3653 struct softnet_data *sd = data; 3654 3655 ____napi_schedule(sd, &sd->backlog); 3656 sd->received_rps++; 3657 } 3658 3659 #endif /* CONFIG_RPS */ 3660 3661 /* 3662 * Check if this softnet_data structure is another cpu one 3663 * If yes, queue it to our IPI list and return 1 3664 * If no, return 0 3665 */ 3666 static int rps_ipi_queued(struct softnet_data *sd) 3667 { 3668 #ifdef CONFIG_RPS 3669 struct softnet_data *mysd = this_cpu_ptr(&softnet_data); 3670 3671 if (sd != mysd) { 3672 sd->rps_ipi_next = mysd->rps_ipi_list; 3673 mysd->rps_ipi_list = sd; 3674 3675 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 3676 return 1; 3677 } 3678 #endif /* CONFIG_RPS */ 3679 return 0; 3680 } 3681 3682 #ifdef CONFIG_NET_FLOW_LIMIT 3683 int netdev_flow_limit_table_len __read_mostly = (1 << 12); 3684 #endif 3685 3686 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) 3687 { 3688 #ifdef CONFIG_NET_FLOW_LIMIT 3689 struct sd_flow_limit *fl; 3690 struct softnet_data *sd; 3691 unsigned int old_flow, new_flow; 3692 3693 if (qlen < (netdev_max_backlog >> 1)) 3694 return false; 3695 3696 sd = this_cpu_ptr(&softnet_data); 3697 3698 rcu_read_lock(); 3699 fl = rcu_dereference(sd->flow_limit); 3700 if (fl) { 3701 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); 3702 old_flow = fl->history[fl->history_head]; 3703 fl->history[fl->history_head] = new_flow; 3704 3705 fl->history_head++; 3706 fl->history_head &= FLOW_LIMIT_HISTORY - 1; 3707 3708 if (likely(fl->buckets[old_flow])) 3709 fl->buckets[old_flow]--; 3710 3711 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { 3712 fl->count++; 3713 rcu_read_unlock(); 3714 return true; 3715 } 3716 } 3717 rcu_read_unlock(); 3718 #endif 3719 return false; 3720 } 3721 3722 /* 3723 * enqueue_to_backlog is called to queue an skb to a per CPU backlog 3724 * queue (may be a remote CPU queue). 
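 *
 * The skb is accepted only while its device is still running
 * (netif_running()) and the backlog length does not exceed
 * netdev_max_backlog, subject to the optional per-flow limit; otherwise
 * it is freed, dev->rx_dropped is bumped and NET_RX_DROP is returned.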
3725 */ 3726 static int enqueue_to_backlog(struct sk_buff *skb, int cpu, 3727 unsigned int *qtail) 3728 { 3729 struct softnet_data *sd; 3730 unsigned long flags; 3731 unsigned int qlen; 3732 3733 sd = &per_cpu(softnet_data, cpu); 3734 3735 local_irq_save(flags); 3736 3737 rps_lock(sd); 3738 if (!netif_running(skb->dev)) 3739 goto drop; 3740 qlen = skb_queue_len(&sd->input_pkt_queue); 3741 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { 3742 if (qlen) { 3743 enqueue: 3744 __skb_queue_tail(&sd->input_pkt_queue, skb); 3745 input_queue_tail_incr_save(sd, qtail); 3746 rps_unlock(sd); 3747 local_irq_restore(flags); 3748 return NET_RX_SUCCESS; 3749 } 3750 3751 /* Schedule NAPI for backlog device 3752 * We can use non atomic operation since we own the queue lock 3753 */ 3754 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { 3755 if (!rps_ipi_queued(sd)) 3756 ____napi_schedule(sd, &sd->backlog); 3757 } 3758 goto enqueue; 3759 } 3760 3761 drop: 3762 sd->dropped++; 3763 rps_unlock(sd); 3764 3765 local_irq_restore(flags); 3766 3767 atomic_long_inc(&skb->dev->rx_dropped); 3768 kfree_skb(skb); 3769 return NET_RX_DROP; 3770 } 3771 3772 static int netif_rx_internal(struct sk_buff *skb) 3773 { 3774 int ret; 3775 3776 net_timestamp_check(netdev_tstamp_prequeue, skb); 3777 3778 trace_netif_rx(skb); 3779 #ifdef CONFIG_RPS 3780 if (static_key_false(&rps_needed)) { 3781 struct rps_dev_flow voidflow, *rflow = &voidflow; 3782 int cpu; 3783 3784 preempt_disable(); 3785 rcu_read_lock(); 3786 3787 cpu = get_rps_cpu(skb->dev, skb, &rflow); 3788 if (cpu < 0) 3789 cpu = smp_processor_id(); 3790 3791 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 3792 3793 rcu_read_unlock(); 3794 preempt_enable(); 3795 } else 3796 #endif 3797 { 3798 unsigned int qtail; 3799 ret = enqueue_to_backlog(skb, get_cpu(), &qtail); 3800 put_cpu(); 3801 } 3802 return ret; 3803 } 3804 3805 /** 3806 * netif_rx - post buffer to the network code 3807 * @skb: buffer to post 3808 * 3809 * This function receives a packet from a device driver and queues it for 3810 * the upper (protocol) levels to process. It always succeeds. The buffer 3811 * may be dropped during processing for congestion control or by the 3812 * protocol layers. 
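 *
 * Drivers call this from interrupt (or other atomic) context; from
 * process context netif_rx_ni() below should be used instead so that
 * pending softirqs get a chance to run. A typical non-NAPI receive path
 * looks roughly like (sketch only):
 *
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);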
3813 * 3814 * return values: 3815 * NET_RX_SUCCESS (no congestion) 3816 * NET_RX_DROP (packet was dropped) 3817 * 3818 */ 3819 3820 int netif_rx(struct sk_buff *skb) 3821 { 3822 trace_netif_rx_entry(skb); 3823 3824 return netif_rx_internal(skb); 3825 } 3826 EXPORT_SYMBOL(netif_rx); 3827 3828 int netif_rx_ni(struct sk_buff *skb) 3829 { 3830 int err; 3831 3832 trace_netif_rx_ni_entry(skb); 3833 3834 preempt_disable(); 3835 err = netif_rx_internal(skb); 3836 if (local_softirq_pending()) 3837 do_softirq(); 3838 preempt_enable(); 3839 3840 return err; 3841 } 3842 EXPORT_SYMBOL(netif_rx_ni); 3843 3844 static __latent_entropy void net_tx_action(struct softirq_action *h) 3845 { 3846 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 3847 3848 if (sd->completion_queue) { 3849 struct sk_buff *clist; 3850 3851 local_irq_disable(); 3852 clist = sd->completion_queue; 3853 sd->completion_queue = NULL; 3854 local_irq_enable(); 3855 3856 while (clist) { 3857 struct sk_buff *skb = clist; 3858 clist = clist->next; 3859 3860 WARN_ON(atomic_read(&skb->users)); 3861 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) 3862 trace_consume_skb(skb); 3863 else 3864 trace_kfree_skb(skb, net_tx_action); 3865 3866 if (skb->fclone != SKB_FCLONE_UNAVAILABLE) 3867 __kfree_skb(skb); 3868 else 3869 __kfree_skb_defer(skb); 3870 } 3871 3872 __kfree_skb_flush(); 3873 } 3874 3875 if (sd->output_queue) { 3876 struct Qdisc *head; 3877 3878 local_irq_disable(); 3879 head = sd->output_queue; 3880 sd->output_queue = NULL; 3881 sd->output_queue_tailp = &sd->output_queue; 3882 local_irq_enable(); 3883 3884 while (head) { 3885 struct Qdisc *q = head; 3886 spinlock_t *root_lock; 3887 3888 head = head->next_sched; 3889 3890 root_lock = qdisc_lock(q); 3891 spin_lock(root_lock); 3892 /* We need to make sure head->next_sched is read 3893 * before clearing __QDISC_STATE_SCHED 3894 */ 3895 smp_mb__before_atomic(); 3896 clear_bit(__QDISC_STATE_SCHED, &q->state); 3897 qdisc_run(q); 3898 spin_unlock(root_lock); 3899 } 3900 } 3901 } 3902 3903 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) 3904 /* This hook is defined here for ATM LANE */ 3905 int (*br_fdb_test_addr_hook)(struct net_device *dev, 3906 unsigned char *addr) __read_mostly; 3907 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); 3908 #endif 3909 3910 static inline struct sk_buff * 3911 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, 3912 struct net_device *orig_dev) 3913 { 3914 #ifdef CONFIG_NET_CLS_ACT 3915 struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list); 3916 struct tcf_result cl_res; 3917 3918 /* If there's at least one ingress present somewhere (so 3919 * we get here via enabled static key), remaining devices 3920 * that are not configured with an ingress qdisc will bail 3921 * out here. 
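 * (The ingress_cl_list only becomes non-NULL once an ingress or clsact
 * qdisc plus at least one classifier have been attached, e.g. starting
 * with something like "tc qdisc add dev eth0 handle ffff: ingress";
 * without that, cl is NULL here and the skb passes through untouched.)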
3922 */ 3923 if (!cl) 3924 return skb; 3925 if (*pt_prev) { 3926 *ret = deliver_skb(skb, *pt_prev, orig_dev); 3927 *pt_prev = NULL; 3928 } 3929 3930 qdisc_skb_cb(skb)->pkt_len = skb->len; 3931 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); 3932 qdisc_bstats_cpu_update(cl->q, skb); 3933 3934 switch (tc_classify(skb, cl, &cl_res, false)) { 3935 case TC_ACT_OK: 3936 case TC_ACT_RECLASSIFY: 3937 skb->tc_index = TC_H_MIN(cl_res.classid); 3938 break; 3939 case TC_ACT_SHOT: 3940 qdisc_qstats_cpu_drop(cl->q); 3941 kfree_skb(skb); 3942 return NULL; 3943 case TC_ACT_STOLEN: 3944 case TC_ACT_QUEUED: 3945 consume_skb(skb); 3946 return NULL; 3947 case TC_ACT_REDIRECT: 3948 /* skb_mac_header check was done by cls/act_bpf, so 3949 * we can safely push the L2 header back before 3950 * redirecting to another netdev 3951 */ 3952 __skb_push(skb, skb->mac_len); 3953 skb_do_redirect(skb); 3954 return NULL; 3955 default: 3956 break; 3957 } 3958 #endif /* CONFIG_NET_CLS_ACT */ 3959 return skb; 3960 } 3961 3962 /** 3963 * netdev_is_rx_handler_busy - check if receive handler is registered 3964 * @dev: device to check 3965 * 3966 * Check if a receive handler is already registered for a given device. 3967 * Return true if there is one. 3968 * 3969 * The caller must hold the rtnl_mutex. 3970 */ 3971 bool netdev_is_rx_handler_busy(struct net_device *dev) 3972 { 3973 ASSERT_RTNL(); 3974 return dev && rtnl_dereference(dev->rx_handler); 3975 } 3976 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy); 3977 3978 /** 3979 * netdev_rx_handler_register - register receive handler 3980 * @dev: device to register a handler for 3981 * @rx_handler: receive handler to register 3982 * @rx_handler_data: data pointer that is used by rx handler 3983 * 3984 * Register a receive handler for a device. This handler will then be 3985 * called from __netif_receive_skb. A negative errno code is returned 3986 * on a failure. 3987 * 3988 * The caller must hold the rtnl_mutex. 3989 * 3990 * For a general description of rx_handler, see enum rx_handler_result. 3991 */ 3992 int netdev_rx_handler_register(struct net_device *dev, 3993 rx_handler_func_t *rx_handler, 3994 void *rx_handler_data) 3995 { 3996 ASSERT_RTNL(); 3997 3998 if (dev->rx_handler) 3999 return -EBUSY; 4000 4001 /* Note: rx_handler_data must be set before rx_handler */ 4002 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); 4003 rcu_assign_pointer(dev->rx_handler, rx_handler); 4004 4005 return 0; 4006 } 4007 EXPORT_SYMBOL_GPL(netdev_rx_handler_register); 4008 4009 /** 4010 * netdev_rx_handler_unregister - unregister receive handler 4011 * @dev: device to unregister a handler from 4012 * 4013 * Unregister a receive handler from a device. 4014 * 4015 * The caller must hold the rtnl_mutex. 4016 */ 4017 void netdev_rx_handler_unregister(struct net_device *dev) 4018 { 4019 4020 ASSERT_RTNL(); 4021 RCU_INIT_POINTER(dev->rx_handler, NULL); 4022 /* a reader seeing a non-NULL rx_handler in an rcu_read_lock() 4023 * section is guaranteed to see a non-NULL rx_handler_data 4024 * as well. 4025 */ 4026 synchronize_net(); 4027 RCU_INIT_POINTER(dev->rx_handler_data, NULL); 4028 } 4029 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); 4030 4031 /* 4032 * Limit the use of PFMEMALLOC reserves to those protocols that implement 4033 * the special handling of PFMEMALLOC skbs.
4034 */ 4035 static bool skb_pfmemalloc_protocol(struct sk_buff *skb) 4036 { 4037 switch (skb->protocol) { 4038 case htons(ETH_P_ARP): 4039 case htons(ETH_P_IP): 4040 case htons(ETH_P_IPV6): 4041 case htons(ETH_P_8021Q): 4042 case htons(ETH_P_8021AD): 4043 return true; 4044 default: 4045 return false; 4046 } 4047 } 4048 4049 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, 4050 int *ret, struct net_device *orig_dev) 4051 { 4052 #ifdef CONFIG_NETFILTER_INGRESS 4053 if (nf_hook_ingress_active(skb)) { 4054 int ingress_retval; 4055 4056 if (*pt_prev) { 4057 *ret = deliver_skb(skb, *pt_prev, orig_dev); 4058 *pt_prev = NULL; 4059 } 4060 4061 rcu_read_lock(); 4062 ingress_retval = nf_hook_ingress(skb); 4063 rcu_read_unlock(); 4064 return ingress_retval; 4065 } 4066 #endif /* CONFIG_NETFILTER_INGRESS */ 4067 return 0; 4068 } 4069 4070 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) 4071 { 4072 struct packet_type *ptype, *pt_prev; 4073 rx_handler_func_t *rx_handler; 4074 struct net_device *orig_dev; 4075 bool deliver_exact = false; 4076 int ret = NET_RX_DROP; 4077 __be16 type; 4078 4079 net_timestamp_check(!netdev_tstamp_prequeue, skb); 4080 4081 trace_netif_receive_skb(skb); 4082 4083 orig_dev = skb->dev; 4084 4085 skb_reset_network_header(skb); 4086 if (!skb_transport_header_was_set(skb)) 4087 skb_reset_transport_header(skb); 4088 skb_reset_mac_len(skb); 4089 4090 pt_prev = NULL; 4091 4092 another_round: 4093 skb->skb_iif = skb->dev->ifindex; 4094 4095 __this_cpu_inc(softnet_data.processed); 4096 4097 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || 4098 skb->protocol == cpu_to_be16(ETH_P_8021AD)) { 4099 skb = skb_vlan_untag(skb); 4100 if (unlikely(!skb)) 4101 goto out; 4102 } 4103 4104 #ifdef CONFIG_NET_CLS_ACT 4105 if (skb->tc_verd & TC_NCLS) { 4106 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); 4107 goto ncls; 4108 } 4109 #endif 4110 4111 if (pfmemalloc) 4112 goto skip_taps; 4113 4114 list_for_each_entry_rcu(ptype, &ptype_all, list) { 4115 if (pt_prev) 4116 ret = deliver_skb(skb, pt_prev, orig_dev); 4117 pt_prev = ptype; 4118 } 4119 4120 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { 4121 if (pt_prev) 4122 ret = deliver_skb(skb, pt_prev, orig_dev); 4123 pt_prev = ptype; 4124 } 4125 4126 skip_taps: 4127 #ifdef CONFIG_NET_INGRESS 4128 if (static_key_false(&ingress_needed)) { 4129 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev); 4130 if (!skb) 4131 goto out; 4132 4133 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0) 4134 goto out; 4135 } 4136 #endif 4137 #ifdef CONFIG_NET_CLS_ACT 4138 skb->tc_verd = 0; 4139 ncls: 4140 #endif 4141 if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) 4142 goto drop; 4143 4144 if (skb_vlan_tag_present(skb)) { 4145 if (pt_prev) { 4146 ret = deliver_skb(skb, pt_prev, orig_dev); 4147 pt_prev = NULL; 4148 } 4149 if (vlan_do_receive(&skb)) 4150 goto another_round; 4151 else if (unlikely(!skb)) 4152 goto out; 4153 } 4154 4155 rx_handler = rcu_dereference(skb->dev->rx_handler); 4156 if (rx_handler) { 4157 if (pt_prev) { 4158 ret = deliver_skb(skb, pt_prev, orig_dev); 4159 pt_prev = NULL; 4160 } 4161 switch (rx_handler(&skb)) { 4162 case RX_HANDLER_CONSUMED: 4163 ret = NET_RX_SUCCESS; 4164 goto out; 4165 case RX_HANDLER_ANOTHER: 4166 goto another_round; 4167 case RX_HANDLER_EXACT: 4168 deliver_exact = true; 4169 case RX_HANDLER_PASS: 4170 break; 4171 default: 4172 BUG(); 4173 } 4174 } 4175 4176 if (unlikely(skb_vlan_tag_present(skb))) { 4177 if (skb_vlan_tag_get_id(skb)) 4178 skb->pkt_type = 
PACKET_OTHERHOST; 4179 /* Note: we might in the future use prio bits 4180 * and set skb->priority like in vlan_do_receive() 4181 * For the time being, just ignore Priority Code Point 4182 */ 4183 skb->vlan_tci = 0; 4184 } 4185 4186 type = skb->protocol; 4187 4188 /* deliver only exact match when indicated */ 4189 if (likely(!deliver_exact)) { 4190 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 4191 &ptype_base[ntohs(type) & 4192 PTYPE_HASH_MASK]); 4193 } 4194 4195 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 4196 &orig_dev->ptype_specific); 4197 4198 if (unlikely(skb->dev != orig_dev)) { 4199 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 4200 &skb->dev->ptype_specific); 4201 } 4202 4203 if (pt_prev) { 4204 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) 4205 goto drop; 4206 else 4207 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 4208 } else { 4209 drop: 4210 if (!deliver_exact) 4211 atomic_long_inc(&skb->dev->rx_dropped); 4212 else 4213 atomic_long_inc(&skb->dev->rx_nohandler); 4214 kfree_skb(skb); 4215 /* Jamal, now you will not able to escape explaining 4216 * me how you were going to use this. :-) 4217 */ 4218 ret = NET_RX_DROP; 4219 } 4220 4221 out: 4222 return ret; 4223 } 4224 4225 static int __netif_receive_skb(struct sk_buff *skb) 4226 { 4227 int ret; 4228 4229 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { 4230 unsigned long pflags = current->flags; 4231 4232 /* 4233 * PFMEMALLOC skbs are special, they should 4234 * - be delivered to SOCK_MEMALLOC sockets only 4235 * - stay away from userspace 4236 * - have bounded memory usage 4237 * 4238 * Use PF_MEMALLOC as this saves us from propagating the allocation 4239 * context down to all allocation sites. 4240 */ 4241 current->flags |= PF_MEMALLOC; 4242 ret = __netif_receive_skb_core(skb, true); 4243 tsk_restore_flags(current, pflags, PF_MEMALLOC); 4244 } else 4245 ret = __netif_receive_skb_core(skb, false); 4246 4247 return ret; 4248 } 4249 4250 static int netif_receive_skb_internal(struct sk_buff *skb) 4251 { 4252 int ret; 4253 4254 net_timestamp_check(netdev_tstamp_prequeue, skb); 4255 4256 if (skb_defer_rx_timestamp(skb)) 4257 return NET_RX_SUCCESS; 4258 4259 rcu_read_lock(); 4260 4261 #ifdef CONFIG_RPS 4262 if (static_key_false(&rps_needed)) { 4263 struct rps_dev_flow voidflow, *rflow = &voidflow; 4264 int cpu = get_rps_cpu(skb->dev, skb, &rflow); 4265 4266 if (cpu >= 0) { 4267 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 4268 rcu_read_unlock(); 4269 return ret; 4270 } 4271 } 4272 #endif 4273 ret = __netif_receive_skb(skb); 4274 rcu_read_unlock(); 4275 return ret; 4276 } 4277 4278 /** 4279 * netif_receive_skb - process receive buffer from network 4280 * @skb: buffer to process 4281 * 4282 * netif_receive_skb() is the main receive data processing function. 4283 * It always succeeds. The buffer may be dropped during processing 4284 * for congestion control or by the protocol layers. 4285 * 4286 * This function may only be called from softirq context and interrupts 4287 * should be enabled. 
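 *
 *	Illustrative sketch only (my_poll and my_rx_ring_next_skb are
 *	hypothetical names, not part of this file): a NAPI driver would
 *	usually call this from its poll routine, e.g.
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct sk_buff *skb;
 *		int work = 0;
 *
 *		while (work < budget &&
 *		       (skb = my_rx_ring_next_skb(napi->dev)) != NULL) {
 *			skb->protocol = eth_type_trans(skb, napi->dev);
 *			netif_receive_skb(skb);
 *			work++;
 *		}
 *		if (work < budget)
 *			napi_complete_done(napi, work);
 *		return work;
 *	}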
4288 * 4289 * Return values (usually ignored): 4290 * NET_RX_SUCCESS: no congestion 4291 * NET_RX_DROP: packet was dropped 4292 */ 4293 int netif_receive_skb(struct sk_buff *skb) 4294 { 4295 trace_netif_receive_skb_entry(skb); 4296 4297 return netif_receive_skb_internal(skb); 4298 } 4299 EXPORT_SYMBOL(netif_receive_skb); 4300 4301 DEFINE_PER_CPU(struct work_struct, flush_works); 4302 4303 /* Network device is going away, flush any packets still pending */ 4304 static void flush_backlog(struct work_struct *work) 4305 { 4306 struct sk_buff *skb, *tmp; 4307 struct softnet_data *sd; 4308 4309 local_bh_disable(); 4310 sd = this_cpu_ptr(&softnet_data); 4311 4312 local_irq_disable(); 4313 rps_lock(sd); 4314 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { 4315 if (skb->dev->reg_state == NETREG_UNREGISTERING) { 4316 __skb_unlink(skb, &sd->input_pkt_queue); 4317 kfree_skb(skb); 4318 input_queue_head_incr(sd); 4319 } 4320 } 4321 rps_unlock(sd); 4322 local_irq_enable(); 4323 4324 skb_queue_walk_safe(&sd->process_queue, skb, tmp) { 4325 if (skb->dev->reg_state == NETREG_UNREGISTERING) { 4326 __skb_unlink(skb, &sd->process_queue); 4327 kfree_skb(skb); 4328 input_queue_head_incr(sd); 4329 } 4330 } 4331 local_bh_enable(); 4332 } 4333 4334 static void flush_all_backlogs(void) 4335 { 4336 unsigned int cpu; 4337 4338 get_online_cpus(); 4339 4340 for_each_online_cpu(cpu) 4341 queue_work_on(cpu, system_highpri_wq, 4342 per_cpu_ptr(&flush_works, cpu)); 4343 4344 for_each_online_cpu(cpu) 4345 flush_work(per_cpu_ptr(&flush_works, cpu)); 4346 4347 put_online_cpus(); 4348 } 4349 4350 static int napi_gro_complete(struct sk_buff *skb) 4351 { 4352 struct packet_offload *ptype; 4353 __be16 type = skb->protocol; 4354 struct list_head *head = &offload_base; 4355 int err = -ENOENT; 4356 4357 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); 4358 4359 if (NAPI_GRO_CB(skb)->count == 1) { 4360 skb_shinfo(skb)->gso_size = 0; 4361 goto out; 4362 } 4363 4364 rcu_read_lock(); 4365 list_for_each_entry_rcu(ptype, head, list) { 4366 if (ptype->type != type || !ptype->callbacks.gro_complete) 4367 continue; 4368 4369 err = ptype->callbacks.gro_complete(skb, 0); 4370 break; 4371 } 4372 rcu_read_unlock(); 4373 4374 if (err) { 4375 WARN_ON(&ptype->list == head); 4376 kfree_skb(skb); 4377 return NET_RX_SUCCESS; 4378 } 4379 4380 out: 4381 return netif_receive_skb_internal(skb); 4382 } 4383 4384 /* napi->gro_list contains packets ordered by age. 4385 * youngest packets at the head of it. 4386 * Complete skbs in reverse order to reduce latencies. 
4387 */ 4388 void napi_gro_flush(struct napi_struct *napi, bool flush_old) 4389 { 4390 struct sk_buff *skb, *prev = NULL; 4391 4392 /* scan list and build reverse chain */ 4393 for (skb = napi->gro_list; skb != NULL; skb = skb->next) { 4394 skb->prev = prev; 4395 prev = skb; 4396 } 4397 4398 for (skb = prev; skb; skb = prev) { 4399 skb->next = NULL; 4400 4401 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) 4402 return; 4403 4404 prev = skb->prev; 4405 napi_gro_complete(skb); 4406 napi->gro_count--; 4407 } 4408 4409 napi->gro_list = NULL; 4410 } 4411 EXPORT_SYMBOL(napi_gro_flush); 4412 4413 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) 4414 { 4415 struct sk_buff *p; 4416 unsigned int maclen = skb->dev->hard_header_len; 4417 u32 hash = skb_get_hash_raw(skb); 4418 4419 for (p = napi->gro_list; p; p = p->next) { 4420 unsigned long diffs; 4421 4422 NAPI_GRO_CB(p)->flush = 0; 4423 4424 if (hash != skb_get_hash_raw(p)) { 4425 NAPI_GRO_CB(p)->same_flow = 0; 4426 continue; 4427 } 4428 4429 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; 4430 diffs |= p->vlan_tci ^ skb->vlan_tci; 4431 diffs |= skb_metadata_dst_cmp(p, skb); 4432 if (maclen == ETH_HLEN) 4433 diffs |= compare_ether_header(skb_mac_header(p), 4434 skb_mac_header(skb)); 4435 else if (!diffs) 4436 diffs = memcmp(skb_mac_header(p), 4437 skb_mac_header(skb), 4438 maclen); 4439 NAPI_GRO_CB(p)->same_flow = !diffs; 4440 } 4441 } 4442 4443 static void skb_gro_reset_offset(struct sk_buff *skb) 4444 { 4445 const struct skb_shared_info *pinfo = skb_shinfo(skb); 4446 const skb_frag_t *frag0 = &pinfo->frags[0]; 4447 4448 NAPI_GRO_CB(skb)->data_offset = 0; 4449 NAPI_GRO_CB(skb)->frag0 = NULL; 4450 NAPI_GRO_CB(skb)->frag0_len = 0; 4451 4452 if (skb_mac_header(skb) == skb_tail_pointer(skb) && 4453 pinfo->nr_frags && 4454 !PageHighMem(skb_frag_page(frag0))) { 4455 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); 4456 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); 4457 } 4458 } 4459 4460 static void gro_pull_from_frag0(struct sk_buff *skb, int grow) 4461 { 4462 struct skb_shared_info *pinfo = skb_shinfo(skb); 4463 4464 BUG_ON(skb->end - skb->tail < grow); 4465 4466 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); 4467 4468 skb->data_len -= grow; 4469 skb->tail += grow; 4470 4471 pinfo->frags[0].page_offset += grow; 4472 skb_frag_size_sub(&pinfo->frags[0], grow); 4473 4474 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { 4475 skb_frag_unref(skb, 0); 4476 memmove(pinfo->frags, pinfo->frags + 1, 4477 --pinfo->nr_frags * sizeof(pinfo->frags[0])); 4478 } 4479 } 4480 4481 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 4482 { 4483 struct sk_buff **pp = NULL; 4484 struct packet_offload *ptype; 4485 __be16 type = skb->protocol; 4486 struct list_head *head = &offload_base; 4487 int same_flow; 4488 enum gro_result ret; 4489 int grow; 4490 4491 if (!(skb->dev->features & NETIF_F_GRO)) 4492 goto normal; 4493 4494 if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad) 4495 goto normal; 4496 4497 gro_list_prepare(napi, skb); 4498 4499 rcu_read_lock(); 4500 list_for_each_entry_rcu(ptype, head, list) { 4501 if (ptype->type != type || !ptype->callbacks.gro_receive) 4502 continue; 4503 4504 skb_set_network_header(skb, skb_gro_offset(skb)); 4505 skb_reset_mac_len(skb); 4506 NAPI_GRO_CB(skb)->same_flow = 0; 4507 NAPI_GRO_CB(skb)->flush = 0; 4508 NAPI_GRO_CB(skb)->free = 0; 4509 NAPI_GRO_CB(skb)->encap_mark = 0; 4510 NAPI_GRO_CB(skb)->recursion_counter = 0; 4511 
NAPI_GRO_CB(skb)->is_fou = 0; 4512 NAPI_GRO_CB(skb)->is_atomic = 1; 4513 NAPI_GRO_CB(skb)->gro_remcsum_start = 0; 4514 4515 /* Setup for GRO checksum validation */ 4516 switch (skb->ip_summed) { 4517 case CHECKSUM_COMPLETE: 4518 NAPI_GRO_CB(skb)->csum = skb->csum; 4519 NAPI_GRO_CB(skb)->csum_valid = 1; 4520 NAPI_GRO_CB(skb)->csum_cnt = 0; 4521 break; 4522 case CHECKSUM_UNNECESSARY: 4523 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; 4524 NAPI_GRO_CB(skb)->csum_valid = 0; 4525 break; 4526 default: 4527 NAPI_GRO_CB(skb)->csum_cnt = 0; 4528 NAPI_GRO_CB(skb)->csum_valid = 0; 4529 } 4530 4531 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); 4532 break; 4533 } 4534 rcu_read_unlock(); 4535 4536 if (&ptype->list == head) 4537 goto normal; 4538 4539 same_flow = NAPI_GRO_CB(skb)->same_flow; 4540 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED; 4541 4542 if (pp) { 4543 struct sk_buff *nskb = *pp; 4544 4545 *pp = nskb->next; 4546 nskb->next = NULL; 4547 napi_gro_complete(nskb); 4548 napi->gro_count--; 4549 } 4550 4551 if (same_flow) 4552 goto ok; 4553 4554 if (NAPI_GRO_CB(skb)->flush) 4555 goto normal; 4556 4557 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { 4558 struct sk_buff *nskb = napi->gro_list; 4559 4560 /* locate the end of the list to select the 'oldest' flow */ 4561 while (nskb->next) { 4562 pp = &nskb->next; 4563 nskb = *pp; 4564 } 4565 *pp = NULL; 4566 nskb->next = NULL; 4567 napi_gro_complete(nskb); 4568 } else { 4569 napi->gro_count++; 4570 } 4571 NAPI_GRO_CB(skb)->count = 1; 4572 NAPI_GRO_CB(skb)->age = jiffies; 4573 NAPI_GRO_CB(skb)->last = skb; 4574 skb_shinfo(skb)->gso_size = skb_gro_len(skb); 4575 skb->next = napi->gro_list; 4576 napi->gro_list = skb; 4577 ret = GRO_HELD; 4578 4579 pull: 4580 grow = skb_gro_offset(skb) - skb_headlen(skb); 4581 if (grow > 0) 4582 gro_pull_from_frag0(skb, grow); 4583 ok: 4584 return ret; 4585 4586 normal: 4587 ret = GRO_NORMAL; 4588 goto pull; 4589 } 4590 4591 struct packet_offload *gro_find_receive_by_type(__be16 type) 4592 { 4593 struct list_head *offload_head = &offload_base; 4594 struct packet_offload *ptype; 4595 4596 list_for_each_entry_rcu(ptype, offload_head, list) { 4597 if (ptype->type != type || !ptype->callbacks.gro_receive) 4598 continue; 4599 return ptype; 4600 } 4601 return NULL; 4602 } 4603 EXPORT_SYMBOL(gro_find_receive_by_type); 4604 4605 struct packet_offload *gro_find_complete_by_type(__be16 type) 4606 { 4607 struct list_head *offload_head = &offload_base; 4608 struct packet_offload *ptype; 4609 4610 list_for_each_entry_rcu(ptype, offload_head, list) { 4611 if (ptype->type != type || !ptype->callbacks.gro_complete) 4612 continue; 4613 return ptype; 4614 } 4615 return NULL; 4616 } 4617 EXPORT_SYMBOL(gro_find_complete_by_type); 4618 4619 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) 4620 { 4621 switch (ret) { 4622 case GRO_NORMAL: 4623 if (netif_receive_skb_internal(skb)) 4624 ret = GRO_DROP; 4625 break; 4626 4627 case GRO_DROP: 4628 kfree_skb(skb); 4629 break; 4630 4631 case GRO_MERGED_FREE: 4632 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) { 4633 skb_dst_drop(skb); 4634 kmem_cache_free(skbuff_head_cache, skb); 4635 } else { 4636 __kfree_skb(skb); 4637 } 4638 break; 4639 4640 case GRO_HELD: 4641 case GRO_MERGED: 4642 break; 4643 } 4644 4645 return ret; 4646 } 4647 4648 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 4649 { 4650 skb_mark_napi_id(skb, napi); 4651 trace_napi_gro_receive_entry(skb); 4652 4653 skb_gro_reset_offset(skb); 4654 
4655 return napi_skb_finish(dev_gro_receive(napi, skb), skb); 4656 } 4657 EXPORT_SYMBOL(napi_gro_receive); 4658 4659 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) 4660 { 4661 if (unlikely(skb->pfmemalloc)) { 4662 consume_skb(skb); 4663 return; 4664 } 4665 __skb_pull(skb, skb_headlen(skb)); 4666 /* restore the reserve we had after netdev_alloc_skb_ip_align() */ 4667 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); 4668 skb->vlan_tci = 0; 4669 skb->dev = napi->dev; 4670 skb->skb_iif = 0; 4671 skb->encapsulation = 0; 4672 skb_shinfo(skb)->gso_type = 0; 4673 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); 4674 4675 napi->skb = skb; 4676 } 4677 4678 struct sk_buff *napi_get_frags(struct napi_struct *napi) 4679 { 4680 struct sk_buff *skb = napi->skb; 4681 4682 if (!skb) { 4683 skb = napi_alloc_skb(napi, GRO_MAX_HEAD); 4684 if (skb) { 4685 napi->skb = skb; 4686 skb_mark_napi_id(skb, napi); 4687 } 4688 } 4689 return skb; 4690 } 4691 EXPORT_SYMBOL(napi_get_frags); 4692 4693 static gro_result_t napi_frags_finish(struct napi_struct *napi, 4694 struct sk_buff *skb, 4695 gro_result_t ret) 4696 { 4697 switch (ret) { 4698 case GRO_NORMAL: 4699 case GRO_HELD: 4700 __skb_push(skb, ETH_HLEN); 4701 skb->protocol = eth_type_trans(skb, skb->dev); 4702 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb)) 4703 ret = GRO_DROP; 4704 break; 4705 4706 case GRO_DROP: 4707 case GRO_MERGED_FREE: 4708 napi_reuse_skb(napi, skb); 4709 break; 4710 4711 case GRO_MERGED: 4712 break; 4713 } 4714 4715 return ret; 4716 } 4717 4718 /* Upper GRO stack assumes network header starts at gro_offset=0 4719 * Drivers could call both napi_gro_frags() and napi_gro_receive() 4720 * We copy ethernet header into skb->data to have a common layout. 4721 */ 4722 static struct sk_buff *napi_frags_skb(struct napi_struct *napi) 4723 { 4724 struct sk_buff *skb = napi->skb; 4725 const struct ethhdr *eth; 4726 unsigned int hlen = sizeof(*eth); 4727 4728 napi->skb = NULL; 4729 4730 skb_reset_mac_header(skb); 4731 skb_gro_reset_offset(skb); 4732 4733 eth = skb_gro_header_fast(skb, 0); 4734 if (unlikely(skb_gro_header_hard(skb, hlen))) { 4735 eth = skb_gro_header_slow(skb, hlen, 0); 4736 if (unlikely(!eth)) { 4737 net_warn_ratelimited("%s: dropping impossible skb from %s\n", 4738 __func__, napi->dev->name); 4739 napi_reuse_skb(napi, skb); 4740 return NULL; 4741 } 4742 } else { 4743 gro_pull_from_frag0(skb, hlen); 4744 NAPI_GRO_CB(skb)->frag0 += hlen; 4745 NAPI_GRO_CB(skb)->frag0_len -= hlen; 4746 } 4747 __skb_pull(skb, hlen); 4748 4749 /* 4750 * This works because the only protocols we care about don't require 4751 * special handling. 4752 * We'll fix it up properly in napi_frags_finish() 4753 */ 4754 skb->protocol = eth->h_proto; 4755 4756 return skb; 4757 } 4758 4759 gro_result_t napi_gro_frags(struct napi_struct *napi) 4760 { 4761 struct sk_buff *skb = napi_frags_skb(napi); 4762 4763 if (!skb) 4764 return GRO_DROP; 4765 4766 trace_napi_gro_frags_entry(skb); 4767 4768 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); 4769 } 4770 EXPORT_SYMBOL(napi_gro_frags); 4771 4772 /* Compute the checksum from gro_offset and return the folded value 4773 * after adding in any pseudo checksum. 
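 * (Hedged note for orientation only: protocol gro_receive handlers
 * normally reach this indirectly through the skb_gro_checksum_validate*()
 * helpers rather than calling it directly.)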
4774 */ 4775 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) 4776 { 4777 __wsum wsum; 4778 __sum16 sum; 4779 4780 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); 4781 4782 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ 4783 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); 4784 if (likely(!sum)) { 4785 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 4786 !skb->csum_complete_sw) 4787 netdev_rx_csum_fault(skb->dev); 4788 } 4789 4790 NAPI_GRO_CB(skb)->csum = wsum; 4791 NAPI_GRO_CB(skb)->csum_valid = 1; 4792 4793 return sum; 4794 } 4795 EXPORT_SYMBOL(__skb_gro_checksum_complete); 4796 4797 /* 4798 * net_rps_action_and_irq_enable sends any pending IPI's for rps. 4799 * Note: called with local irq disabled, but exits with local irq enabled. 4800 */ 4801 static void net_rps_action_and_irq_enable(struct softnet_data *sd) 4802 { 4803 #ifdef CONFIG_RPS 4804 struct softnet_data *remsd = sd->rps_ipi_list; 4805 4806 if (remsd) { 4807 sd->rps_ipi_list = NULL; 4808 4809 local_irq_enable(); 4810 4811 /* Send pending IPI's to kick RPS processing on remote cpus. */ 4812 while (remsd) { 4813 struct softnet_data *next = remsd->rps_ipi_next; 4814 4815 if (cpu_online(remsd->cpu)) 4816 smp_call_function_single_async(remsd->cpu, 4817 &remsd->csd); 4818 remsd = next; 4819 } 4820 } else 4821 #endif 4822 local_irq_enable(); 4823 } 4824 4825 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) 4826 { 4827 #ifdef CONFIG_RPS 4828 return sd->rps_ipi_list != NULL; 4829 #else 4830 return false; 4831 #endif 4832 } 4833 4834 static int process_backlog(struct napi_struct *napi, int quota) 4835 { 4836 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); 4837 bool again = true; 4838 int work = 0; 4839 4840 /* Check if we have pending ipi, its better to send them now, 4841 * not waiting net_rx_action() end. 4842 */ 4843 if (sd_has_rps_ipi_waiting(sd)) { 4844 local_irq_disable(); 4845 net_rps_action_and_irq_enable(sd); 4846 } 4847 4848 napi->weight = weight_p; 4849 while (again) { 4850 struct sk_buff *skb; 4851 4852 while ((skb = __skb_dequeue(&sd->process_queue))) { 4853 rcu_read_lock(); 4854 __netif_receive_skb(skb); 4855 rcu_read_unlock(); 4856 input_queue_head_incr(sd); 4857 if (++work >= quota) 4858 return work; 4859 4860 } 4861 4862 local_irq_disable(); 4863 rps_lock(sd); 4864 if (skb_queue_empty(&sd->input_pkt_queue)) { 4865 /* 4866 * Inline a custom version of __napi_complete(). 4867 * only current cpu owns and manipulates this napi, 4868 * and NAPI_STATE_SCHED is the only possible flag set 4869 * on backlog. 4870 * We can use a plain write instead of clear_bit(), 4871 * and we dont need an smp_mb() memory barrier. 4872 */ 4873 napi->state = 0; 4874 again = false; 4875 } else { 4876 skb_queue_splice_tail_init(&sd->input_pkt_queue, 4877 &sd->process_queue); 4878 } 4879 rps_unlock(sd); 4880 local_irq_enable(); 4881 } 4882 4883 return work; 4884 } 4885 4886 /** 4887 * __napi_schedule - schedule for receive 4888 * @n: entry to schedule 4889 * 4890 * The entry's receive function will be scheduled to run. 4891 * Consider using __napi_schedule_irqoff() if hard irqs are masked. 
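 *
 *	Illustrative driver-side sketch (my_irq_handler, my_priv and
 *	my_disable_device_irqs are hypothetical, not part of this file):
 *	a device interrupt handler commonly masks its own interrupts and
 *	defers the work to NAPI like this:
 *
 *	static irqreturn_t my_irq_handler(int irq, void *data)
 *	{
 *		struct my_priv *priv = data;
 *
 *		if (napi_schedule_prep(&priv->napi)) {
 *			my_disable_device_irqs(priv);
 *			__napi_schedule(&priv->napi);
 *		}
 *		return IRQ_HANDLED;
 *	}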
4892 */ 4893 void __napi_schedule(struct napi_struct *n) 4894 { 4895 unsigned long flags; 4896 4897 local_irq_save(flags); 4898 ____napi_schedule(this_cpu_ptr(&softnet_data), n); 4899 local_irq_restore(flags); 4900 } 4901 EXPORT_SYMBOL(__napi_schedule); 4902 4903 /** 4904 * __napi_schedule_irqoff - schedule for receive 4905 * @n: entry to schedule 4906 * 4907 * Variant of __napi_schedule() assuming hard irqs are masked 4908 */ 4909 void __napi_schedule_irqoff(struct napi_struct *n) 4910 { 4911 ____napi_schedule(this_cpu_ptr(&softnet_data), n); 4912 } 4913 EXPORT_SYMBOL(__napi_schedule_irqoff); 4914 4915 void __napi_complete(struct napi_struct *n) 4916 { 4917 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); 4918 4919 list_del_init(&n->poll_list); 4920 smp_mb__before_atomic(); 4921 clear_bit(NAPI_STATE_SCHED, &n->state); 4922 } 4923 EXPORT_SYMBOL(__napi_complete); 4924 4925 void napi_complete_done(struct napi_struct *n, int work_done) 4926 { 4927 unsigned long flags; 4928 4929 /* 4930 * don't let napi dequeue from the cpu poll list 4931 * just in case its running on a different cpu 4932 */ 4933 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) 4934 return; 4935 4936 if (n->gro_list) { 4937 unsigned long timeout = 0; 4938 4939 if (work_done) 4940 timeout = n->dev->gro_flush_timeout; 4941 4942 if (timeout) 4943 hrtimer_start(&n->timer, ns_to_ktime(timeout), 4944 HRTIMER_MODE_REL_PINNED); 4945 else 4946 napi_gro_flush(n, false); 4947 } 4948 if (likely(list_empty(&n->poll_list))) { 4949 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); 4950 } else { 4951 /* If n->poll_list is not empty, we need to mask irqs */ 4952 local_irq_save(flags); 4953 __napi_complete(n); 4954 local_irq_restore(flags); 4955 } 4956 } 4957 EXPORT_SYMBOL(napi_complete_done); 4958 4959 /* must be called under rcu_read_lock(), as we dont take a reference */ 4960 static struct napi_struct *napi_by_id(unsigned int napi_id) 4961 { 4962 unsigned int hash = napi_id % HASH_SIZE(napi_hash); 4963 struct napi_struct *napi; 4964 4965 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) 4966 if (napi->napi_id == napi_id) 4967 return napi; 4968 4969 return NULL; 4970 } 4971 4972 #if defined(CONFIG_NET_RX_BUSY_POLL) 4973 #define BUSY_POLL_BUDGET 8 4974 bool sk_busy_loop(struct sock *sk, int nonblock) 4975 { 4976 unsigned long end_time = !nonblock ? 
sk_busy_loop_end_time(sk) : 0; 4977 int (*busy_poll)(struct napi_struct *dev); 4978 struct napi_struct *napi; 4979 int rc = false; 4980 4981 rcu_read_lock(); 4982 4983 napi = napi_by_id(sk->sk_napi_id); 4984 if (!napi) 4985 goto out; 4986 4987 /* Note: ndo_busy_poll method is optional in linux-4.5 */ 4988 busy_poll = napi->dev->netdev_ops->ndo_busy_poll; 4989 4990 do { 4991 rc = 0; 4992 local_bh_disable(); 4993 if (busy_poll) { 4994 rc = busy_poll(napi); 4995 } else if (napi_schedule_prep(napi)) { 4996 void *have = netpoll_poll_lock(napi); 4997 4998 if (test_bit(NAPI_STATE_SCHED, &napi->state)) { 4999 rc = napi->poll(napi, BUSY_POLL_BUDGET); 5000 trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); 5001 if (rc == BUSY_POLL_BUDGET) { 5002 napi_complete_done(napi, rc); 5003 napi_schedule(napi); 5004 } 5005 } 5006 netpoll_poll_unlock(have); 5007 } 5008 if (rc > 0) 5009 __NET_ADD_STATS(sock_net(sk), 5010 LINUX_MIB_BUSYPOLLRXPACKETS, rc); 5011 local_bh_enable(); 5012 5013 if (rc == LL_FLUSH_FAILED) 5014 break; /* permanent failure */ 5015 5016 cpu_relax(); 5017 } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) && 5018 !need_resched() && !busy_loop_timeout(end_time)); 5019 5020 rc = !skb_queue_empty(&sk->sk_receive_queue); 5021 out: 5022 rcu_read_unlock(); 5023 return rc; 5024 } 5025 EXPORT_SYMBOL(sk_busy_loop); 5026 5027 #endif /* CONFIG_NET_RX_BUSY_POLL */ 5028 5029 void napi_hash_add(struct napi_struct *napi) 5030 { 5031 if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) || 5032 test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) 5033 return; 5034 5035 spin_lock(&napi_hash_lock); 5036 5037 /* 0..NR_CPUS+1 range is reserved for sender_cpu use */ 5038 do { 5039 if (unlikely(++napi_gen_id < NR_CPUS + 1)) 5040 napi_gen_id = NR_CPUS + 1; 5041 } while (napi_by_id(napi_gen_id)); 5042 napi->napi_id = napi_gen_id; 5043 5044 hlist_add_head_rcu(&napi->napi_hash_node, 5045 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); 5046 5047 spin_unlock(&napi_hash_lock); 5048 } 5049 EXPORT_SYMBOL_GPL(napi_hash_add); 5050 5051 /* Warning : caller is responsible to make sure rcu grace period 5052 * is respected before freeing memory containing @napi 5053 */ 5054 bool napi_hash_del(struct napi_struct *napi) 5055 { 5056 bool rcu_sync_needed = false; 5057 5058 spin_lock(&napi_hash_lock); 5059 5060 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) { 5061 rcu_sync_needed = true; 5062 hlist_del_rcu(&napi->napi_hash_node); 5063 } 5064 spin_unlock(&napi_hash_lock); 5065 return rcu_sync_needed; 5066 } 5067 EXPORT_SYMBOL_GPL(napi_hash_del); 5068 5069 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) 5070 { 5071 struct napi_struct *napi; 5072 5073 napi = container_of(timer, struct napi_struct, timer); 5074 if (napi->gro_list) 5075 napi_schedule(napi); 5076 5077 return HRTIMER_NORESTART; 5078 } 5079 5080 void netif_napi_add(struct net_device *dev, struct napi_struct *napi, 5081 int (*poll)(struct napi_struct *, int), int weight) 5082 { 5083 INIT_LIST_HEAD(&napi->poll_list); 5084 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); 5085 napi->timer.function = napi_watchdog; 5086 napi->gro_count = 0; 5087 napi->gro_list = NULL; 5088 napi->skb = NULL; 5089 napi->poll = poll; 5090 if (weight > NAPI_POLL_WEIGHT) 5091 pr_err_once("netif_napi_add() called with weight %d on device %s\n", 5092 weight, dev->name); 5093 napi->weight = weight; 5094 list_add(&napi->dev_list, &dev->napi_list); 5095 napi->dev = dev; 5096 #ifdef CONFIG_NETPOLL 5097 spin_lock_init(&napi->poll_lock); 5098 
napi->poll_owner = -1; 5099 #endif 5100 set_bit(NAPI_STATE_SCHED, &napi->state); 5101 napi_hash_add(napi); 5102 } 5103 EXPORT_SYMBOL(netif_napi_add); 5104 5105 void napi_disable(struct napi_struct *n) 5106 { 5107 might_sleep(); 5108 set_bit(NAPI_STATE_DISABLE, &n->state); 5109 5110 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) 5111 msleep(1); 5112 while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state)) 5113 msleep(1); 5114 5115 hrtimer_cancel(&n->timer); 5116 5117 clear_bit(NAPI_STATE_DISABLE, &n->state); 5118 } 5119 EXPORT_SYMBOL(napi_disable); 5120 5121 /* Must be called in process context */ 5122 void netif_napi_del(struct napi_struct *napi) 5123 { 5124 might_sleep(); 5125 if (napi_hash_del(napi)) 5126 synchronize_net(); 5127 list_del_init(&napi->dev_list); 5128 napi_free_frags(napi); 5129 5130 kfree_skb_list(napi->gro_list); 5131 napi->gro_list = NULL; 5132 napi->gro_count = 0; 5133 } 5134 EXPORT_SYMBOL(netif_napi_del); 5135 5136 static int napi_poll(struct napi_struct *n, struct list_head *repoll) 5137 { 5138 void *have; 5139 int work, weight; 5140 5141 list_del_init(&n->poll_list); 5142 5143 have = netpoll_poll_lock(n); 5144 5145 weight = n->weight; 5146 5147 /* This NAPI_STATE_SCHED test is for avoiding a race 5148 * with netpoll's poll_napi(). Only the entity which 5149 * obtains the lock and sees NAPI_STATE_SCHED set will 5150 * actually make the ->poll() call. Therefore we avoid 5151 * accidentally calling ->poll() when NAPI is not scheduled. 5152 */ 5153 work = 0; 5154 if (test_bit(NAPI_STATE_SCHED, &n->state)) { 5155 work = n->poll(n, weight); 5156 trace_napi_poll(n, work, weight); 5157 } 5158 5159 WARN_ON_ONCE(work > weight); 5160 5161 if (likely(work < weight)) 5162 goto out_unlock; 5163 5164 /* Drivers must not modify the NAPI state if they 5165 * consume the entire weight. In such cases this code 5166 * still "owns" the NAPI instance and therefore can 5167 * move the instance around on the list at-will. 5168 */ 5169 if (unlikely(napi_disable_pending(n))) { 5170 napi_complete(n); 5171 goto out_unlock; 5172 } 5173 5174 if (n->gro_list) { 5175 /* flush too old packets 5176 * If HZ < 1000, flush all packets. 5177 */ 5178 napi_gro_flush(n, HZ >= 1000); 5179 } 5180 5181 /* Some drivers may have called napi_schedule 5182 * prior to exhausting their budget. 5183 */ 5184 if (unlikely(!list_empty(&n->poll_list))) { 5185 pr_warn_once("%s: Budget exhausted after napi rescheduled\n", 5186 n->dev ? n->dev->name : "backlog"); 5187 goto out_unlock; 5188 } 5189 5190 list_add_tail(&n->poll_list, repoll); 5191 5192 out_unlock: 5193 netpoll_poll_unlock(have); 5194 5195 return work; 5196 } 5197 5198 static __latent_entropy void net_rx_action(struct softirq_action *h) 5199 { 5200 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 5201 unsigned long time_limit = jiffies + 2; 5202 int budget = netdev_budget; 5203 LIST_HEAD(list); 5204 LIST_HEAD(repoll); 5205 5206 local_irq_disable(); 5207 list_splice_init(&sd->poll_list, &list); 5208 local_irq_enable(); 5209 5210 for (;;) { 5211 struct napi_struct *n; 5212 5213 if (list_empty(&list)) { 5214 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) 5215 return; 5216 break; 5217 } 5218 5219 n = list_first_entry(&list, struct napi_struct, poll_list); 5220 budget -= napi_poll(n, &repoll); 5221 5222 /* If softirq window is exhausted then punt. 5223 * Allow this to run for 2 jiffies since which will allow 5224 * an average latency of 1.5/HZ. 
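 * (For reference: netdev_budget is a tunable, 300 by default, exposed
 * as /proc/sys/net/core/netdev_budget.)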
5225 */ 5226 if (unlikely(budget <= 0 || 5227 time_after_eq(jiffies, time_limit))) { 5228 sd->time_squeeze++; 5229 break; 5230 } 5231 } 5232 5233 __kfree_skb_flush(); 5234 local_irq_disable(); 5235 5236 list_splice_tail_init(&sd->poll_list, &list); 5237 list_splice_tail(&repoll, &list); 5238 list_splice(&list, &sd->poll_list); 5239 if (!list_empty(&sd->poll_list)) 5240 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 5241 5242 net_rps_action_and_irq_enable(sd); 5243 } 5244 5245 struct netdev_adjacent { 5246 struct net_device *dev; 5247 5248 /* upper master flag, there can only be one master device per list */ 5249 bool master; 5250 5251 /* counter for the number of times this device was added to us */ 5252 u16 ref_nr; 5253 5254 /* private field for the users */ 5255 void *private; 5256 5257 struct list_head list; 5258 struct rcu_head rcu; 5259 }; 5260 5261 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev, 5262 struct list_head *adj_list) 5263 { 5264 struct netdev_adjacent *adj; 5265 5266 list_for_each_entry(adj, adj_list, list) { 5267 if (adj->dev == adj_dev) 5268 return adj; 5269 } 5270 return NULL; 5271 } 5272 5273 /** 5274 * netdev_has_upper_dev - Check if device is linked to an upper device 5275 * @dev: device 5276 * @upper_dev: upper device to check 5277 * 5278 * Find out if a device is linked to specified upper device and return true 5279 * in case it is. Note that this checks only immediate upper device, 5280 * not through a complete stack of devices. The caller must hold the RTNL lock. 5281 */ 5282 bool netdev_has_upper_dev(struct net_device *dev, 5283 struct net_device *upper_dev) 5284 { 5285 ASSERT_RTNL(); 5286 5287 return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper); 5288 } 5289 EXPORT_SYMBOL(netdev_has_upper_dev); 5290 5291 /** 5292 * netdev_has_any_upper_dev - Check if device is linked to some device 5293 * @dev: device 5294 * 5295 * Find out if a device is linked to an upper device and return true in case 5296 * it is. The caller must hold the RTNL lock. 5297 */ 5298 static bool netdev_has_any_upper_dev(struct net_device *dev) 5299 { 5300 ASSERT_RTNL(); 5301 5302 return !list_empty(&dev->all_adj_list.upper); 5303 } 5304 5305 /** 5306 * netdev_master_upper_dev_get - Get master upper device 5307 * @dev: device 5308 * 5309 * Find a master upper device and return pointer to it or NULL in case 5310 * it's not there. The caller must hold the RTNL lock. 5311 */ 5312 struct net_device *netdev_master_upper_dev_get(struct net_device *dev) 5313 { 5314 struct netdev_adjacent *upper; 5315 5316 ASSERT_RTNL(); 5317 5318 if (list_empty(&dev->adj_list.upper)) 5319 return NULL; 5320 5321 upper = list_first_entry(&dev->adj_list.upper, 5322 struct netdev_adjacent, list); 5323 if (likely(upper->master)) 5324 return upper->dev; 5325 return NULL; 5326 } 5327 EXPORT_SYMBOL(netdev_master_upper_dev_get); 5328 5329 void *netdev_adjacent_get_private(struct list_head *adj_list) 5330 { 5331 struct netdev_adjacent *adj; 5332 5333 adj = list_entry(adj_list, struct netdev_adjacent, list); 5334 5335 return adj->private; 5336 } 5337 EXPORT_SYMBOL(netdev_adjacent_get_private); 5338 5339 /** 5340 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list 5341 * @dev: device 5342 * @iter: list_head ** of the current position 5343 * 5344 * Gets the next device from the dev's upper list, starting from iter 5345 * position. The caller must hold RCU read lock. 
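 *
 *	A hedged usage sketch (not taken from a caller in this file):
 *
 *	struct net_device *upper;
 *	struct list_head *iter = &dev->adj_list.upper;
 *
 *	rcu_read_lock();
 *	while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)) != NULL)
 *		pr_debug("%s is upper of %s\n", upper->name, dev->name);
 *	rcu_read_unlock();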
5346 */ 5347 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, 5348 struct list_head **iter) 5349 { 5350 struct netdev_adjacent *upper; 5351 5352 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); 5353 5354 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5355 5356 if (&upper->list == &dev->adj_list.upper) 5357 return NULL; 5358 5359 *iter = &upper->list; 5360 5361 return upper->dev; 5362 } 5363 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); 5364 5365 /** 5366 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list 5367 * @dev: device 5368 * @iter: list_head ** of the current position 5369 * 5370 * Gets the next device from the dev's upper list, starting from iter 5371 * position. The caller must hold RCU read lock. 5372 */ 5373 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, 5374 struct list_head **iter) 5375 { 5376 struct netdev_adjacent *upper; 5377 5378 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); 5379 5380 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5381 5382 if (&upper->list == &dev->all_adj_list.upper) 5383 return NULL; 5384 5385 *iter = &upper->list; 5386 5387 return upper->dev; 5388 } 5389 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); 5390 5391 /** 5392 * netdev_lower_get_next_private - Get the next ->private from the 5393 * lower neighbour list 5394 * @dev: device 5395 * @iter: list_head ** of the current position 5396 * 5397 * Gets the next netdev_adjacent->private from the dev's lower neighbour 5398 * list, starting from iter position. The caller must either hold the 5399 * RTNL lock or its own locking that guarantees that the neighbour lower 5400 * list will remain unchanged. 5401 */ 5402 void *netdev_lower_get_next_private(struct net_device *dev, 5403 struct list_head **iter) 5404 { 5405 struct netdev_adjacent *lower; 5406 5407 lower = list_entry(*iter, struct netdev_adjacent, list); 5408 5409 if (&lower->list == &dev->adj_list.lower) 5410 return NULL; 5411 5412 *iter = lower->list.next; 5413 5414 return lower->private; 5415 } 5416 EXPORT_SYMBOL(netdev_lower_get_next_private); 5417 5418 /** 5419 * netdev_lower_get_next_private_rcu - Get the next ->private from the 5420 * lower neighbour list, RCU 5421 * variant 5422 * @dev: device 5423 * @iter: list_head ** of the current position 5424 * 5425 * Gets the next netdev_adjacent->private from the dev's lower neighbour 5426 * list, starting from iter position. The caller must hold RCU read lock. 5427 */ 5428 void *netdev_lower_get_next_private_rcu(struct net_device *dev, 5429 struct list_head **iter) 5430 { 5431 struct netdev_adjacent *lower; 5432 5433 WARN_ON_ONCE(!rcu_read_lock_held()); 5434 5435 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5436 5437 if (&lower->list == &dev->adj_list.lower) 5438 return NULL; 5439 5440 *iter = &lower->list; 5441 5442 return lower->private; 5443 } 5444 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); 5445 5446 /** 5447 * netdev_lower_get_next - Get the next device from the lower neighbour 5448 * list 5449 * @dev: device 5450 * @iter: list_head ** of the current position 5451 * 5452 * Gets the next netdev_adjacent from the dev's lower neighbour 5453 * list, starting from iter position. The caller must hold the RTNL lock or 5454 * its own locking that guarantees that the neighbour lower 5455 * list will remain unchanged.
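 *
 *	Hedged usage sketch: callers usually go through the
 *	netdev_for_each_lower_dev() iterator (as dev_get_nest_level()
 *	below does), with RTNL held, e.g.
 *
 *	struct net_device *lower;
 *	struct list_head *iter;
 *
 *	netdev_for_each_lower_dev(dev, lower, iter)
 *		pr_debug("%s is lower of %s\n", lower->name, dev->name);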
5456 */ 5457 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) 5458 { 5459 struct netdev_adjacent *lower; 5460 5461 lower = list_entry(*iter, struct netdev_adjacent, list); 5462 5463 if (&lower->list == &dev->adj_list.lower) 5464 return NULL; 5465 5466 *iter = lower->list.next; 5467 5468 return lower->dev; 5469 } 5470 EXPORT_SYMBOL(netdev_lower_get_next); 5471 5472 /** 5473 * netdev_all_lower_get_next - Get the next device from all lower neighbour list 5474 * @dev: device 5475 * @iter: list_head ** of the current position 5476 * 5477 * Gets the next netdev_adjacent from the dev's all lower neighbour 5478 * list, starting from iter position. The caller must hold RTNL lock or 5479 * its own locking that guarantees that the neighbour all lower 5480 * list will remain unchanged. 5481 */ 5482 struct net_device *netdev_all_lower_get_next(struct net_device *dev, struct list_head **iter) 5483 { 5484 struct netdev_adjacent *lower; 5485 5486 lower = list_entry(*iter, struct netdev_adjacent, list); 5487 5488 if (&lower->list == &dev->all_adj_list.lower) 5489 return NULL; 5490 5491 *iter = lower->list.next; 5492 5493 return lower->dev; 5494 } 5495 EXPORT_SYMBOL(netdev_all_lower_get_next); 5496 5497 /** 5498 * netdev_all_lower_get_next_rcu - Get the next device from all 5499 * lower neighbour list, RCU variant 5500 * @dev: device 5501 * @iter: list_head ** of the current position 5502 * 5503 * Gets the next netdev_adjacent from the dev's all lower neighbour 5504 * list, starting from iter position. The caller must hold RCU read lock. 5505 */ 5506 struct net_device *netdev_all_lower_get_next_rcu(struct net_device *dev, 5507 struct list_head **iter) 5508 { 5509 struct netdev_adjacent *lower; 5510 5511 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 5512 5513 if (&lower->list == &dev->all_adj_list.lower) 5514 return NULL; 5515 5516 *iter = &lower->list; 5517 5518 return lower->dev; 5519 } 5520 EXPORT_SYMBOL(netdev_all_lower_get_next_rcu); 5521 5522 /** 5523 * netdev_lower_get_first_private_rcu - Get the first ->private from the 5524 * lower neighbour list, RCU 5525 * variant 5526 * @dev: device 5527 * 5528 * Gets the first netdev_adjacent->private from the dev's lower neighbour 5529 * list. The caller must hold RCU read lock. 5530 */ 5531 void *netdev_lower_get_first_private_rcu(struct net_device *dev) 5532 { 5533 struct netdev_adjacent *lower; 5534 5535 lower = list_first_or_null_rcu(&dev->adj_list.lower, 5536 struct netdev_adjacent, list); 5537 if (lower) 5538 return lower->private; 5539 return NULL; 5540 } 5541 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); 5542 5543 /** 5544 * netdev_master_upper_dev_get_rcu - Get master upper device 5545 * @dev: device 5546 * 5547 * Find a master upper device and return pointer to it or NULL in case 5548 * it's not there. The caller must hold the RCU read lock. 5549 */ 5550 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) 5551 { 5552 struct netdev_adjacent *upper; 5553 5554 upper = list_first_or_null_rcu(&dev->adj_list.upper, 5555 struct netdev_adjacent, list); 5556 if (upper && likely(upper->master)) 5557 return upper->dev; 5558 return NULL; 5559 } 5560 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); 5561 5562 static int netdev_adjacent_sysfs_add(struct net_device *dev, 5563 struct net_device *adj_dev, 5564 struct list_head *dev_list) 5565 { 5566 char linkname[IFNAMSIZ+7]; 5567 sprintf(linkname, dev_list == &dev->adj_list.upper ? 
5568 "upper_%s" : "lower_%s", adj_dev->name); 5569 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), 5570 linkname); 5571 } 5572 static void netdev_adjacent_sysfs_del(struct net_device *dev, 5573 char *name, 5574 struct list_head *dev_list) 5575 { 5576 char linkname[IFNAMSIZ+7]; 5577 sprintf(linkname, dev_list == &dev->adj_list.upper ? 5578 "upper_%s" : "lower_%s", name); 5579 sysfs_remove_link(&(dev->dev.kobj), linkname); 5580 } 5581 5582 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, 5583 struct net_device *adj_dev, 5584 struct list_head *dev_list) 5585 { 5586 return (dev_list == &dev->adj_list.upper || 5587 dev_list == &dev->adj_list.lower) && 5588 net_eq(dev_net(dev), dev_net(adj_dev)); 5589 } 5590 5591 static int __netdev_adjacent_dev_insert(struct net_device *dev, 5592 struct net_device *adj_dev, 5593 u16 ref_nr, 5594 struct list_head *dev_list, 5595 void *private, bool master) 5596 { 5597 struct netdev_adjacent *adj; 5598 int ret; 5599 5600 adj = __netdev_find_adj(adj_dev, dev_list); 5601 5602 if (adj) { 5603 adj->ref_nr += ref_nr; 5604 return 0; 5605 } 5606 5607 adj = kmalloc(sizeof(*adj), GFP_KERNEL); 5608 if (!adj) 5609 return -ENOMEM; 5610 5611 adj->dev = adj_dev; 5612 adj->master = master; 5613 adj->ref_nr = ref_nr; 5614 adj->private = private; 5615 dev_hold(adj_dev); 5616 5617 pr_debug("dev_hold for %s, because of link added from %s to %s\n", 5618 adj_dev->name, dev->name, adj_dev->name); 5619 5620 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { 5621 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); 5622 if (ret) 5623 goto free_adj; 5624 } 5625 5626 /* Ensure that master link is always the first item in list. */ 5627 if (master) { 5628 ret = sysfs_create_link(&(dev->dev.kobj), 5629 &(adj_dev->dev.kobj), "master"); 5630 if (ret) 5631 goto remove_symlinks; 5632 5633 list_add_rcu(&adj->list, dev_list); 5634 } else { 5635 list_add_tail_rcu(&adj->list, dev_list); 5636 } 5637 5638 return 0; 5639 5640 remove_symlinks: 5641 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) 5642 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); 5643 free_adj: 5644 kfree(adj); 5645 dev_put(adj_dev); 5646 5647 return ret; 5648 } 5649 5650 static void __netdev_adjacent_dev_remove(struct net_device *dev, 5651 struct net_device *adj_dev, 5652 u16 ref_nr, 5653 struct list_head *dev_list) 5654 { 5655 struct netdev_adjacent *adj; 5656 5657 adj = __netdev_find_adj(adj_dev, dev_list); 5658 5659 if (!adj) { 5660 pr_err("tried to remove device %s from %s\n", 5661 dev->name, adj_dev->name); 5662 BUG(); 5663 } 5664 5665 if (adj->ref_nr > ref_nr) { 5666 pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name, 5667 ref_nr, adj->ref_nr-ref_nr); 5668 adj->ref_nr -= ref_nr; 5669 return; 5670 } 5671 5672 if (adj->master) 5673 sysfs_remove_link(&(dev->dev.kobj), "master"); 5674 5675 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) 5676 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); 5677 5678 list_del_rcu(&adj->list); 5679 pr_debug("dev_put for %s, because link removed from %s to %s\n", 5680 adj_dev->name, dev->name, adj_dev->name); 5681 dev_put(adj_dev); 5682 kfree_rcu(adj, rcu); 5683 } 5684 5685 static int __netdev_adjacent_dev_link_lists(struct net_device *dev, 5686 struct net_device *upper_dev, 5687 u16 ref_nr, 5688 struct list_head *up_list, 5689 struct list_head *down_list, 5690 void *private, bool master) 5691 { 5692 int ret; 5693 5694 ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list, 5695 
private, master); 5696 if (ret) 5697 return ret; 5698 5699 ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list, 5700 private, false); 5701 if (ret) { 5702 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); 5703 return ret; 5704 } 5705 5706 return 0; 5707 } 5708 5709 static int __netdev_adjacent_dev_link(struct net_device *dev, 5710 struct net_device *upper_dev, 5711 u16 ref_nr) 5712 { 5713 return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr, 5714 &dev->all_adj_list.upper, 5715 &upper_dev->all_adj_list.lower, 5716 NULL, false); 5717 } 5718 5719 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, 5720 struct net_device *upper_dev, 5721 u16 ref_nr, 5722 struct list_head *up_list, 5723 struct list_head *down_list) 5724 { 5725 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list); 5726 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list); 5727 } 5728 5729 static void __netdev_adjacent_dev_unlink(struct net_device *dev, 5730 struct net_device *upper_dev, 5731 u16 ref_nr) 5732 { 5733 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr, 5734 &dev->all_adj_list.upper, 5735 &upper_dev->all_adj_list.lower); 5736 } 5737 5738 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, 5739 struct net_device *upper_dev, 5740 void *private, bool master) 5741 { 5742 int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1); 5743 5744 if (ret) 5745 return ret; 5746 5747 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1, 5748 &dev->adj_list.upper, 5749 &upper_dev->adj_list.lower, 5750 private, master); 5751 if (ret) { 5752 __netdev_adjacent_dev_unlink(dev, upper_dev, 1); 5753 return ret; 5754 } 5755 5756 return 0; 5757 } 5758 5759 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, 5760 struct net_device *upper_dev) 5761 { 5762 __netdev_adjacent_dev_unlink(dev, upper_dev, 1); 5763 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1, 5764 &dev->adj_list.upper, 5765 &upper_dev->adj_list.lower); 5766 } 5767 5768 static int __netdev_upper_dev_link(struct net_device *dev, 5769 struct net_device *upper_dev, bool master, 5770 void *upper_priv, void *upper_info) 5771 { 5772 struct netdev_notifier_changeupper_info changeupper_info; 5773 struct netdev_adjacent *i, *j, *to_i, *to_j; 5774 int ret = 0; 5775 5776 ASSERT_RTNL(); 5777 5778 if (dev == upper_dev) 5779 return -EBUSY; 5780 5781 /* To prevent loops, check that dev is not an upper device to upper_dev. */ 5782 if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper)) 5783 return -EBUSY; 5784 5785 if (__netdev_find_adj(upper_dev, &dev->adj_list.upper)) 5786 return -EEXIST; 5787 5788 if (master && netdev_master_upper_dev_get(dev)) 5789 return -EBUSY; 5790 5791 changeupper_info.upper_dev = upper_dev; 5792 changeupper_info.master = master; 5793 changeupper_info.linking = true; 5794 changeupper_info.upper_info = upper_info; 5795 5796 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, 5797 &changeupper_info.info); 5798 ret = notifier_to_errno(ret); 5799 if (ret) 5800 return ret; 5801 5802 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv, 5803 master); 5804 if (ret) 5805 return ret; 5806 5807 /* Now that we linked these devs, make all the upper_dev's 5808 * all_adj_list.upper visible to every dev's all_adj_list.lower and 5809 * vice versa, and don't forget the devices themselves. All of these 5810 * links are non-neighbours.
5811 */ 5812 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5813 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { 5814 pr_debug("Interlinking %s with %s, non-neighbour\n", 5815 i->dev->name, j->dev->name); 5816 ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr); 5817 if (ret) 5818 goto rollback_mesh; 5819 } 5820 } 5821 5822 /* add dev to every upper_dev's upper device */ 5823 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { 5824 pr_debug("linking %s's upper device %s with %s\n", 5825 upper_dev->name, i->dev->name, dev->name); 5826 ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr); 5827 if (ret) 5828 goto rollback_upper_mesh; 5829 } 5830 5831 /* add upper_dev to every dev's lower device */ 5832 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5833 pr_debug("linking %s's lower device %s with %s\n", dev->name, 5834 i->dev->name, upper_dev->name); 5835 ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr); 5836 if (ret) 5837 goto rollback_lower_mesh; 5838 } 5839 5840 ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, 5841 &changeupper_info.info); 5842 ret = notifier_to_errno(ret); 5843 if (ret) 5844 goto rollback_lower_mesh; 5845 5846 return 0; 5847 5848 rollback_lower_mesh: 5849 to_i = i; 5850 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5851 if (i == to_i) 5852 break; 5853 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr); 5854 } 5855 5856 i = NULL; 5857 5858 rollback_upper_mesh: 5859 to_i = i; 5860 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { 5861 if (i == to_i) 5862 break; 5863 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr); 5864 } 5865 5866 i = j = NULL; 5867 5868 rollback_mesh: 5869 to_i = i; 5870 to_j = j; 5871 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5872 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { 5873 if (i == to_i && j == to_j) 5874 break; 5875 __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr); 5876 } 5877 if (i == to_i) 5878 break; 5879 } 5880 5881 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); 5882 5883 return ret; 5884 } 5885 5886 /** 5887 * netdev_upper_dev_link - Add a link to the upper device 5888 * @dev: device 5889 * @upper_dev: new upper device 5890 * 5891 * Adds a link to device which is upper to this one. The caller must hold 5892 * the RTNL lock. On a failure a negative errno code is returned. 5893 * On success the reference counts are adjusted and the function 5894 * returns zero. 5895 */ 5896 int netdev_upper_dev_link(struct net_device *dev, 5897 struct net_device *upper_dev) 5898 { 5899 return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL); 5900 } 5901 EXPORT_SYMBOL(netdev_upper_dev_link); 5902 5903 /** 5904 * netdev_master_upper_dev_link - Add a master link to the upper device 5905 * @dev: device 5906 * @upper_dev: new upper device 5907 * @upper_priv: upper device private 5908 * @upper_info: upper info to be passed down via notifier 5909 * 5910 * Adds a link to device which is upper to this one. In this case, only 5911 * one master upper device can be linked, although other non-master devices 5912 * might be linked as well. The caller must hold the RTNL lock. 5913 * On a failure a negative errno code is returned. On success the reference 5914 * counts are adjusted and the function returns zero. 
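 *
 *	Hedged illustration (bond_dev and slave_dev are hypothetical
 *	pointers, not defined in this file): a bonding-style master would
 *	typically enslave a port with something like
 *
 *	err = netdev_master_upper_dev_link(slave_dev, bond_dev, NULL, NULL);
 *	if (err)
 *		return err;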
5915 */ 5916 int netdev_master_upper_dev_link(struct net_device *dev, 5917 struct net_device *upper_dev, 5918 void *upper_priv, void *upper_info) 5919 { 5920 return __netdev_upper_dev_link(dev, upper_dev, true, 5921 upper_priv, upper_info); 5922 } 5923 EXPORT_SYMBOL(netdev_master_upper_dev_link); 5924 5925 /** 5926 * netdev_upper_dev_unlink - Removes a link to an upper device 5927 * @dev: device 5928 * @upper_dev: upper device to unlink 5929 * 5930 * Removes a link to a device which is upper to this one. The caller must hold 5931 * the RTNL lock. 5932 */ 5933 void netdev_upper_dev_unlink(struct net_device *dev, 5934 struct net_device *upper_dev) 5935 { 5936 struct netdev_notifier_changeupper_info changeupper_info; 5937 struct netdev_adjacent *i, *j; 5938 ASSERT_RTNL(); 5939 5940 changeupper_info.upper_dev = upper_dev; 5941 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; 5942 changeupper_info.linking = false; 5943 5944 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev, 5945 &changeupper_info.info); 5946 5947 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); 5948 5949 /* Here is the tricky part. We must remove all dev's lower 5950 * devices from all upper_dev's upper devices and vice 5951 * versa, to maintain the graph relationship. 5952 */ 5953 list_for_each_entry(i, &dev->all_adj_list.lower, list) 5954 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) 5955 __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr); 5956 5957 /* also remove the devices themselves from the lower/upper device 5958 * lists 5959 */ 5960 list_for_each_entry(i, &dev->all_adj_list.lower, list) 5961 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr); 5962 5963 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) 5964 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr); 5965 5966 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, 5967 &changeupper_info.info); 5968 } 5969 EXPORT_SYMBOL(netdev_upper_dev_unlink); 5970 5971 /** 5972 * netdev_bonding_info_change - Dispatch event about slave change 5973 * @dev: device 5974 * @bonding_info: info to dispatch 5975 * 5976 * Send NETDEV_BONDING_INFO to netdev notifiers with info. 5977 * The caller must hold the RTNL lock.
5978 */ 5979 void netdev_bonding_info_change(struct net_device *dev, 5980 struct netdev_bonding_info *bonding_info) 5981 { 5982 struct netdev_notifier_bonding_info info; 5983 5984 memcpy(&info.bonding_info, bonding_info, 5985 sizeof(struct netdev_bonding_info)); 5986 call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev, 5987 &info.info); 5988 } 5989 EXPORT_SYMBOL(netdev_bonding_info_change); 5990 5991 static void netdev_adjacent_add_links(struct net_device *dev) 5992 { 5993 struct netdev_adjacent *iter; 5994 5995 struct net *net = dev_net(dev); 5996 5997 list_for_each_entry(iter, &dev->adj_list.upper, list) { 5998 if (!net_eq(net, dev_net(iter->dev))) 5999 continue; 6000 netdev_adjacent_sysfs_add(iter->dev, dev, 6001 &iter->dev->adj_list.lower); 6002 netdev_adjacent_sysfs_add(dev, iter->dev, 6003 &dev->adj_list.upper); 6004 } 6005 6006 list_for_each_entry(iter, &dev->adj_list.lower, list) { 6007 if (!net_eq(net, dev_net(iter->dev))) 6008 continue; 6009 netdev_adjacent_sysfs_add(iter->dev, dev, 6010 &iter->dev->adj_list.upper); 6011 netdev_adjacent_sysfs_add(dev, iter->dev, 6012 &dev->adj_list.lower); 6013 } 6014 } 6015 6016 static void netdev_adjacent_del_links(struct net_device *dev) 6017 { 6018 struct netdev_adjacent *iter; 6019 6020 struct net *net = dev_net(dev); 6021 6022 list_for_each_entry(iter, &dev->adj_list.upper, list) { 6023 if (!net_eq(net, dev_net(iter->dev))) 6024 continue; 6025 netdev_adjacent_sysfs_del(iter->dev, dev->name, 6026 &iter->dev->adj_list.lower); 6027 netdev_adjacent_sysfs_del(dev, iter->dev->name, 6028 &dev->adj_list.upper); 6029 } 6030 6031 list_for_each_entry(iter, &dev->adj_list.lower, list) { 6032 if (!net_eq(net, dev_net(iter->dev))) 6033 continue; 6034 netdev_adjacent_sysfs_del(iter->dev, dev->name, 6035 &iter->dev->adj_list.upper); 6036 netdev_adjacent_sysfs_del(dev, iter->dev->name, 6037 &dev->adj_list.lower); 6038 } 6039 } 6040 6041 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) 6042 { 6043 struct netdev_adjacent *iter; 6044 6045 struct net *net = dev_net(dev); 6046 6047 list_for_each_entry(iter, &dev->adj_list.upper, list) { 6048 if (!net_eq(net, dev_net(iter->dev))) 6049 continue; 6050 netdev_adjacent_sysfs_del(iter->dev, oldname, 6051 &iter->dev->adj_list.lower); 6052 netdev_adjacent_sysfs_add(iter->dev, dev, 6053 &iter->dev->adj_list.lower); 6054 } 6055 6056 list_for_each_entry(iter, &dev->adj_list.lower, list) { 6057 if (!net_eq(net, dev_net(iter->dev))) 6058 continue; 6059 netdev_adjacent_sysfs_del(iter->dev, oldname, 6060 &iter->dev->adj_list.upper); 6061 netdev_adjacent_sysfs_add(iter->dev, dev, 6062 &iter->dev->adj_list.upper); 6063 } 6064 } 6065 6066 void *netdev_lower_dev_get_private(struct net_device *dev, 6067 struct net_device *lower_dev) 6068 { 6069 struct netdev_adjacent *lower; 6070 6071 if (!lower_dev) 6072 return NULL; 6073 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower); 6074 if (!lower) 6075 return NULL; 6076 6077 return lower->private; 6078 } 6079 EXPORT_SYMBOL(netdev_lower_dev_get_private); 6080 6081 6082 int dev_get_nest_level(struct net_device *dev) 6083 { 6084 struct net_device *lower = NULL; 6085 struct list_head *iter; 6086 int max_nest = -1; 6087 int nest; 6088 6089 ASSERT_RTNL(); 6090 6091 netdev_for_each_lower_dev(dev, lower, iter) { 6092 nest = dev_get_nest_level(lower); 6093 if (max_nest < nest) 6094 max_nest = nest; 6095 } 6096 6097 return max_nest + 1; 6098 } 6099 EXPORT_SYMBOL(dev_get_nest_level); 6100 6101 /** 6102 * netdev_lower_change - Dispatch event about lower device state 
change 6103 * @lower_dev: device 6104 * @lower_state_info: state to dispatch 6105 * 6106 * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info. 6107 * The caller must hold the RTNL lock. 6108 */ 6109 void netdev_lower_state_changed(struct net_device *lower_dev, 6110 void *lower_state_info) 6111 { 6112 struct netdev_notifier_changelowerstate_info changelowerstate_info; 6113 6114 ASSERT_RTNL(); 6115 changelowerstate_info.lower_state_info = lower_state_info; 6116 call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev, 6117 &changelowerstate_info.info); 6118 } 6119 EXPORT_SYMBOL(netdev_lower_state_changed); 6120 6121 int netdev_default_l2upper_neigh_construct(struct net_device *dev, 6122 struct neighbour *n) 6123 { 6124 struct net_device *lower_dev, *stop_dev; 6125 struct list_head *iter; 6126 int err; 6127 6128 netdev_for_each_lower_dev(dev, lower_dev, iter) { 6129 if (!lower_dev->netdev_ops->ndo_neigh_construct) 6130 continue; 6131 err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n); 6132 if (err) { 6133 stop_dev = lower_dev; 6134 goto rollback; 6135 } 6136 } 6137 return 0; 6138 6139 rollback: 6140 netdev_for_each_lower_dev(dev, lower_dev, iter) { 6141 if (lower_dev == stop_dev) 6142 break; 6143 if (!lower_dev->netdev_ops->ndo_neigh_destroy) 6144 continue; 6145 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n); 6146 } 6147 return err; 6148 } 6149 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct); 6150 6151 void netdev_default_l2upper_neigh_destroy(struct net_device *dev, 6152 struct neighbour *n) 6153 { 6154 struct net_device *lower_dev; 6155 struct list_head *iter; 6156 6157 netdev_for_each_lower_dev(dev, lower_dev, iter) { 6158 if (!lower_dev->netdev_ops->ndo_neigh_destroy) 6159 continue; 6160 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n); 6161 } 6162 } 6163 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy); 6164 6165 static void dev_change_rx_flags(struct net_device *dev, int flags) 6166 { 6167 const struct net_device_ops *ops = dev->netdev_ops; 6168 6169 if (ops->ndo_change_rx_flags) 6170 ops->ndo_change_rx_flags(dev, flags); 6171 } 6172 6173 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) 6174 { 6175 unsigned int old_flags = dev->flags; 6176 kuid_t uid; 6177 kgid_t gid; 6178 6179 ASSERT_RTNL(); 6180 6181 dev->flags |= IFF_PROMISC; 6182 dev->promiscuity += inc; 6183 if (dev->promiscuity == 0) { 6184 /* 6185 * Avoid overflow. 6186 * If inc causes overflow, untouch promisc and return error. 6187 */ 6188 if (inc < 0) 6189 dev->flags &= ~IFF_PROMISC; 6190 else { 6191 dev->promiscuity -= inc; 6192 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n", 6193 dev->name); 6194 return -EOVERFLOW; 6195 } 6196 } 6197 if (dev->flags != old_flags) { 6198 pr_info("device %s %s promiscuous mode\n", 6199 dev->name, 6200 dev->flags & IFF_PROMISC ? 
"entered" : "left"); 6201 if (audit_enabled) { 6202 current_uid_gid(&uid, &gid); 6203 audit_log(current->audit_context, GFP_ATOMIC, 6204 AUDIT_ANOM_PROMISCUOUS, 6205 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", 6206 dev->name, (dev->flags & IFF_PROMISC), 6207 (old_flags & IFF_PROMISC), 6208 from_kuid(&init_user_ns, audit_get_loginuid(current)), 6209 from_kuid(&init_user_ns, uid), 6210 from_kgid(&init_user_ns, gid), 6211 audit_get_sessionid(current)); 6212 } 6213 6214 dev_change_rx_flags(dev, IFF_PROMISC); 6215 } 6216 if (notify) 6217 __dev_notify_flags(dev, old_flags, IFF_PROMISC); 6218 return 0; 6219 } 6220 6221 /** 6222 * dev_set_promiscuity - update promiscuity count on a device 6223 * @dev: device 6224 * @inc: modifier 6225 * 6226 * Add or remove promiscuity from a device. While the count in the device 6227 * remains above zero the interface remains promiscuous. Once it hits zero 6228 * the device reverts back to normal filtering operation. A negative inc 6229 * value is used to drop promiscuity on the device. 6230 * Return 0 if successful or a negative errno code on error. 6231 */ 6232 int dev_set_promiscuity(struct net_device *dev, int inc) 6233 { 6234 unsigned int old_flags = dev->flags; 6235 int err; 6236 6237 err = __dev_set_promiscuity(dev, inc, true); 6238 if (err < 0) 6239 return err; 6240 if (dev->flags != old_flags) 6241 dev_set_rx_mode(dev); 6242 return err; 6243 } 6244 EXPORT_SYMBOL(dev_set_promiscuity); 6245 6246 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) 6247 { 6248 unsigned int old_flags = dev->flags, old_gflags = dev->gflags; 6249 6250 ASSERT_RTNL(); 6251 6252 dev->flags |= IFF_ALLMULTI; 6253 dev->allmulti += inc; 6254 if (dev->allmulti == 0) { 6255 /* 6256 * Avoid overflow. 6257 * If inc causes overflow, untouch allmulti and return error. 6258 */ 6259 if (inc < 0) 6260 dev->flags &= ~IFF_ALLMULTI; 6261 else { 6262 dev->allmulti -= inc; 6263 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n", 6264 dev->name); 6265 return -EOVERFLOW; 6266 } 6267 } 6268 if (dev->flags ^ old_flags) { 6269 dev_change_rx_flags(dev, IFF_ALLMULTI); 6270 dev_set_rx_mode(dev); 6271 if (notify) 6272 __dev_notify_flags(dev, old_flags, 6273 dev->gflags ^ old_gflags); 6274 } 6275 return 0; 6276 } 6277 6278 /** 6279 * dev_set_allmulti - update allmulti count on a device 6280 * @dev: device 6281 * @inc: modifier 6282 * 6283 * Add or remove reception of all multicast frames to a device. While the 6284 * count in the device remains above zero the interface remains listening 6285 * to all interfaces. Once it hits zero the device reverts back to normal 6286 * filtering operation. A negative @inc value is used to drop the counter 6287 * when releasing a resource needing all multicasts. 6288 * Return 0 if successful or a negative errno code on error. 6289 */ 6290 6291 int dev_set_allmulti(struct net_device *dev, int inc) 6292 { 6293 return __dev_set_allmulti(dev, inc, true); 6294 } 6295 EXPORT_SYMBOL(dev_set_allmulti); 6296 6297 /* 6298 * Upload unicast and multicast address lists to device and 6299 * configure RX filtering. When the device doesn't support unicast 6300 * filtering it is put in promiscuous mode while unicast addresses 6301 * are present. 6302 */ 6303 void __dev_set_rx_mode(struct net_device *dev) 6304 { 6305 const struct net_device_ops *ops = dev->netdev_ops; 6306 6307 /* dev_open will call this function so the list will stay sane. 
*/ 6308 if (!(dev->flags&IFF_UP)) 6309 return; 6310 6311 if (!netif_device_present(dev)) 6312 return; 6313 6314 if (!(dev->priv_flags & IFF_UNICAST_FLT)) { 6315 /* Unicast addresses changes may only happen under the rtnl, 6316 * therefore calling __dev_set_promiscuity here is safe. 6317 */ 6318 if (!netdev_uc_empty(dev) && !dev->uc_promisc) { 6319 __dev_set_promiscuity(dev, 1, false); 6320 dev->uc_promisc = true; 6321 } else if (netdev_uc_empty(dev) && dev->uc_promisc) { 6322 __dev_set_promiscuity(dev, -1, false); 6323 dev->uc_promisc = false; 6324 } 6325 } 6326 6327 if (ops->ndo_set_rx_mode) 6328 ops->ndo_set_rx_mode(dev); 6329 } 6330 6331 void dev_set_rx_mode(struct net_device *dev) 6332 { 6333 netif_addr_lock_bh(dev); 6334 __dev_set_rx_mode(dev); 6335 netif_addr_unlock_bh(dev); 6336 } 6337 6338 /** 6339 * dev_get_flags - get flags reported to userspace 6340 * @dev: device 6341 * 6342 * Get the combination of flag bits exported through APIs to userspace. 6343 */ 6344 unsigned int dev_get_flags(const struct net_device *dev) 6345 { 6346 unsigned int flags; 6347 6348 flags = (dev->flags & ~(IFF_PROMISC | 6349 IFF_ALLMULTI | 6350 IFF_RUNNING | 6351 IFF_LOWER_UP | 6352 IFF_DORMANT)) | 6353 (dev->gflags & (IFF_PROMISC | 6354 IFF_ALLMULTI)); 6355 6356 if (netif_running(dev)) { 6357 if (netif_oper_up(dev)) 6358 flags |= IFF_RUNNING; 6359 if (netif_carrier_ok(dev)) 6360 flags |= IFF_LOWER_UP; 6361 if (netif_dormant(dev)) 6362 flags |= IFF_DORMANT; 6363 } 6364 6365 return flags; 6366 } 6367 EXPORT_SYMBOL(dev_get_flags); 6368 6369 int __dev_change_flags(struct net_device *dev, unsigned int flags) 6370 { 6371 unsigned int old_flags = dev->flags; 6372 int ret; 6373 6374 ASSERT_RTNL(); 6375 6376 /* 6377 * Set the flags on our device. 6378 */ 6379 6380 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | 6381 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | 6382 IFF_AUTOMEDIA)) | 6383 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | 6384 IFF_ALLMULTI)); 6385 6386 /* 6387 * Load in the correct multicast list now the flags have changed. 6388 */ 6389 6390 if ((old_flags ^ flags) & IFF_MULTICAST) 6391 dev_change_rx_flags(dev, IFF_MULTICAST); 6392 6393 dev_set_rx_mode(dev); 6394 6395 /* 6396 * Have we downed the interface. We handle IFF_UP ourselves 6397 * according to user attempts to set it, rather than blindly 6398 * setting it. 6399 */ 6400 6401 ret = 0; 6402 if ((old_flags ^ flags) & IFF_UP) 6403 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); 6404 6405 if ((flags ^ dev->gflags) & IFF_PROMISC) { 6406 int inc = (flags & IFF_PROMISC) ? 1 : -1; 6407 unsigned int old_flags = dev->flags; 6408 6409 dev->gflags ^= IFF_PROMISC; 6410 6411 if (__dev_set_promiscuity(dev, inc, false) >= 0) 6412 if (dev->flags != old_flags) 6413 dev_set_rx_mode(dev); 6414 } 6415 6416 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI 6417 is important. Some (broken) drivers set IFF_PROMISC, when 6418 IFF_ALLMULTI is requested not asking us and not reporting. 6419 */ 6420 if ((flags ^ dev->gflags) & IFF_ALLMULTI) { 6421 int inc = (flags & IFF_ALLMULTI) ? 
1 : -1; 6422 6423 dev->gflags ^= IFF_ALLMULTI; 6424 __dev_set_allmulti(dev, inc, false); 6425 } 6426 6427 return ret; 6428 } 6429 6430 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, 6431 unsigned int gchanges) 6432 { 6433 unsigned int changes = dev->flags ^ old_flags; 6434 6435 if (gchanges) 6436 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); 6437 6438 if (changes & IFF_UP) { 6439 if (dev->flags & IFF_UP) 6440 call_netdevice_notifiers(NETDEV_UP, dev); 6441 else 6442 call_netdevice_notifiers(NETDEV_DOWN, dev); 6443 } 6444 6445 if (dev->flags & IFF_UP && 6446 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { 6447 struct netdev_notifier_change_info change_info; 6448 6449 change_info.flags_changed = changes; 6450 call_netdevice_notifiers_info(NETDEV_CHANGE, dev, 6451 &change_info.info); 6452 } 6453 } 6454 6455 /** 6456 * dev_change_flags - change device settings 6457 * @dev: device 6458 * @flags: device state flags 6459 * 6460 * Change settings on device based state flags. The flags are 6461 * in the userspace exported format. 6462 */ 6463 int dev_change_flags(struct net_device *dev, unsigned int flags) 6464 { 6465 int ret; 6466 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; 6467 6468 ret = __dev_change_flags(dev, flags); 6469 if (ret < 0) 6470 return ret; 6471 6472 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); 6473 __dev_notify_flags(dev, old_flags, changes); 6474 return ret; 6475 } 6476 EXPORT_SYMBOL(dev_change_flags); 6477 6478 static int __dev_set_mtu(struct net_device *dev, int new_mtu) 6479 { 6480 const struct net_device_ops *ops = dev->netdev_ops; 6481 6482 if (ops->ndo_change_mtu) 6483 return ops->ndo_change_mtu(dev, new_mtu); 6484 6485 dev->mtu = new_mtu; 6486 return 0; 6487 } 6488 6489 /** 6490 * dev_set_mtu - Change maximum transfer unit 6491 * @dev: device 6492 * @new_mtu: new transfer unit 6493 * 6494 * Change the maximum transfer size of the network device. 6495 */ 6496 int dev_set_mtu(struct net_device *dev, int new_mtu) 6497 { 6498 int err, orig_mtu; 6499 6500 if (new_mtu == dev->mtu) 6501 return 0; 6502 6503 /* MTU must be positive. */ 6504 if (new_mtu < 0) 6505 return -EINVAL; 6506 6507 if (!netif_device_present(dev)) 6508 return -ENODEV; 6509 6510 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); 6511 err = notifier_to_errno(err); 6512 if (err) 6513 return err; 6514 6515 orig_mtu = dev->mtu; 6516 err = __dev_set_mtu(dev, new_mtu); 6517 6518 if (!err) { 6519 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); 6520 err = notifier_to_errno(err); 6521 if (err) { 6522 /* setting mtu back and notifying everyone again, 6523 * so that they have a chance to revert changes. 
6524 */ 6525 __dev_set_mtu(dev, orig_mtu); 6526 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); 6527 } 6528 } 6529 return err; 6530 } 6531 EXPORT_SYMBOL(dev_set_mtu); 6532 6533 /** 6534 * dev_set_group - Change group this device belongs to 6535 * @dev: device 6536 * @new_group: group this device should belong to 6537 */ 6538 void dev_set_group(struct net_device *dev, int new_group) 6539 { 6540 dev->group = new_group; 6541 } 6542 EXPORT_SYMBOL(dev_set_group); 6543 6544 /** 6545 * dev_set_mac_address - Change Media Access Control Address 6546 * @dev: device 6547 * @sa: new address 6548 * 6549 * Change the hardware (MAC) address of the device 6550 */ 6551 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) 6552 { 6553 const struct net_device_ops *ops = dev->netdev_ops; 6554 int err; 6555 6556 if (!ops->ndo_set_mac_address) 6557 return -EOPNOTSUPP; 6558 if (sa->sa_family != dev->type) 6559 return -EINVAL; 6560 if (!netif_device_present(dev)) 6561 return -ENODEV; 6562 err = ops->ndo_set_mac_address(dev, sa); 6563 if (err) 6564 return err; 6565 dev->addr_assign_type = NET_ADDR_SET; 6566 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 6567 add_device_randomness(dev->dev_addr, dev->addr_len); 6568 return 0; 6569 } 6570 EXPORT_SYMBOL(dev_set_mac_address); 6571 6572 /** 6573 * dev_change_carrier - Change device carrier 6574 * @dev: device 6575 * @new_carrier: new value 6576 * 6577 * Change device carrier 6578 */ 6579 int dev_change_carrier(struct net_device *dev, bool new_carrier) 6580 { 6581 const struct net_device_ops *ops = dev->netdev_ops; 6582 6583 if (!ops->ndo_change_carrier) 6584 return -EOPNOTSUPP; 6585 if (!netif_device_present(dev)) 6586 return -ENODEV; 6587 return ops->ndo_change_carrier(dev, new_carrier); 6588 } 6589 EXPORT_SYMBOL(dev_change_carrier); 6590 6591 /** 6592 * dev_get_phys_port_id - Get device physical port ID 6593 * @dev: device 6594 * @ppid: port ID 6595 * 6596 * Get device physical port ID 6597 */ 6598 int dev_get_phys_port_id(struct net_device *dev, 6599 struct netdev_phys_item_id *ppid) 6600 { 6601 const struct net_device_ops *ops = dev->netdev_ops; 6602 6603 if (!ops->ndo_get_phys_port_id) 6604 return -EOPNOTSUPP; 6605 return ops->ndo_get_phys_port_id(dev, ppid); 6606 } 6607 EXPORT_SYMBOL(dev_get_phys_port_id); 6608 6609 /** 6610 * dev_get_phys_port_name - Get device physical port name 6611 * @dev: device 6612 * @name: port name 6613 * @len: limit of bytes to copy to name 6614 * 6615 * Get device physical port name 6616 */ 6617 int dev_get_phys_port_name(struct net_device *dev, 6618 char *name, size_t len) 6619 { 6620 const struct net_device_ops *ops = dev->netdev_ops; 6621 6622 if (!ops->ndo_get_phys_port_name) 6623 return -EOPNOTSUPP; 6624 return ops->ndo_get_phys_port_name(dev, name, len); 6625 } 6626 EXPORT_SYMBOL(dev_get_phys_port_name); 6627 6628 /** 6629 * dev_change_proto_down - update protocol port state information 6630 * @dev: device 6631 * @proto_down: new value 6632 * 6633 * This info can be used by switch drivers to set the phys state of the 6634 * port. 
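 *
 * Illustrative sketch (hypothetical caller, not code from this file), marking
 * the port administratively down on behalf of a protocol:
 *
 *	if (dev_change_proto_down(dev, true))
 *		netdev_warn(dev, "failed to set proto_down\n");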
6635 */ 6636 int dev_change_proto_down(struct net_device *dev, bool proto_down) 6637 { 6638 const struct net_device_ops *ops = dev->netdev_ops; 6639 6640 if (!ops->ndo_change_proto_down) 6641 return -EOPNOTSUPP; 6642 if (!netif_device_present(dev)) 6643 return -ENODEV; 6644 return ops->ndo_change_proto_down(dev, proto_down); 6645 } 6646 EXPORT_SYMBOL(dev_change_proto_down); 6647 6648 /** 6649 * dev_change_xdp_fd - set or clear a bpf program for a device rx path 6650 * @dev: device 6651 * @fd: new program fd or negative value to clear 6652 * 6653 * Set or clear a bpf program for a device 6654 */ 6655 int dev_change_xdp_fd(struct net_device *dev, int fd) 6656 { 6657 const struct net_device_ops *ops = dev->netdev_ops; 6658 struct bpf_prog *prog = NULL; 6659 struct netdev_xdp xdp = {}; 6660 int err; 6661 6662 if (!ops->ndo_xdp) 6663 return -EOPNOTSUPP; 6664 if (fd >= 0) { 6665 prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP); 6666 if (IS_ERR(prog)) 6667 return PTR_ERR(prog); 6668 } 6669 6670 xdp.command = XDP_SETUP_PROG; 6671 xdp.prog = prog; 6672 err = ops->ndo_xdp(dev, &xdp); 6673 if (err < 0 && prog) 6674 bpf_prog_put(prog); 6675 6676 return err; 6677 } 6678 EXPORT_SYMBOL(dev_change_xdp_fd); 6679 6680 /** 6681 * dev_new_index - allocate an ifindex 6682 * @net: the applicable net namespace 6683 * 6684 * Returns a suitable unique value for a new device interface 6685 * number. The caller must hold the rtnl semaphore or the 6686 * dev_base_lock to be sure it remains unique. 6687 */ 6688 static int dev_new_index(struct net *net) 6689 { 6690 int ifindex = net->ifindex; 6691 for (;;) { 6692 if (++ifindex <= 0) 6693 ifindex = 1; 6694 if (!__dev_get_by_index(net, ifindex)) 6695 return net->ifindex = ifindex; 6696 } 6697 } 6698 6699 /* Delayed registration/unregisteration */ 6700 static LIST_HEAD(net_todo_list); 6701 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); 6702 6703 static void net_set_todo(struct net_device *dev) 6704 { 6705 list_add_tail(&dev->todo_list, &net_todo_list); 6706 dev_net(dev)->dev_unreg_count++; 6707 } 6708 6709 static void rollback_registered_many(struct list_head *head) 6710 { 6711 struct net_device *dev, *tmp; 6712 LIST_HEAD(close_head); 6713 6714 BUG_ON(dev_boot_phase); 6715 ASSERT_RTNL(); 6716 6717 list_for_each_entry_safe(dev, tmp, head, unreg_list) { 6718 /* Some devices call without registering 6719 * for initialization unwind. Remove those 6720 * devices and proceed with the remaining. 6721 */ 6722 if (dev->reg_state == NETREG_UNINITIALIZED) { 6723 pr_debug("unregister_netdevice: device %s/%p never was registered\n", 6724 dev->name, dev); 6725 6726 WARN_ON(1); 6727 list_del(&dev->unreg_list); 6728 continue; 6729 } 6730 dev->dismantle = true; 6731 BUG_ON(dev->reg_state != NETREG_REGISTERED); 6732 } 6733 6734 /* If device is running, close it first. */ 6735 list_for_each_entry(dev, head, unreg_list) 6736 list_add_tail(&dev->close_list, &close_head); 6737 dev_close_many(&close_head, true); 6738 6739 list_for_each_entry(dev, head, unreg_list) { 6740 /* And unlink it from device chain. */ 6741 unlist_netdevice(dev); 6742 6743 dev->reg_state = NETREG_UNREGISTERING; 6744 } 6745 flush_all_backlogs(); 6746 6747 synchronize_net(); 6748 6749 list_for_each_entry(dev, head, unreg_list) { 6750 struct sk_buff *skb = NULL; 6751 6752 /* Shutdown queueing discipline. */ 6753 dev_shutdown(dev); 6754 6755 6756 /* Notify protocols, that we are about to destroy 6757 this device. They should clean all the things. 
6758 */ 6759 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 6760 6761 if (!dev->rtnl_link_ops || 6762 dev->rtnl_link_state == RTNL_LINK_INITIALIZED) 6763 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 6764 GFP_KERNEL); 6765 6766 /* 6767 * Flush the unicast and multicast chains 6768 */ 6769 dev_uc_flush(dev); 6770 dev_mc_flush(dev); 6771 6772 if (dev->netdev_ops->ndo_uninit) 6773 dev->netdev_ops->ndo_uninit(dev); 6774 6775 if (skb) 6776 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); 6777 6778 /* Notifier chain MUST detach us all upper devices. */ 6779 WARN_ON(netdev_has_any_upper_dev(dev)); 6780 6781 /* Remove entries from kobject tree */ 6782 netdev_unregister_kobject(dev); 6783 #ifdef CONFIG_XPS 6784 /* Remove XPS queueing entries */ 6785 netif_reset_xps_queues_gt(dev, 0); 6786 #endif 6787 } 6788 6789 synchronize_net(); 6790 6791 list_for_each_entry(dev, head, unreg_list) 6792 dev_put(dev); 6793 } 6794 6795 static void rollback_registered(struct net_device *dev) 6796 { 6797 LIST_HEAD(single); 6798 6799 list_add(&dev->unreg_list, &single); 6800 rollback_registered_many(&single); 6801 list_del(&single); 6802 } 6803 6804 static netdev_features_t netdev_sync_upper_features(struct net_device *lower, 6805 struct net_device *upper, netdev_features_t features) 6806 { 6807 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; 6808 netdev_features_t feature; 6809 int feature_bit; 6810 6811 for_each_netdev_feature(&upper_disables, feature_bit) { 6812 feature = __NETIF_F_BIT(feature_bit); 6813 if (!(upper->wanted_features & feature) 6814 && (features & feature)) { 6815 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n", 6816 &feature, upper->name); 6817 features &= ~feature; 6818 } 6819 } 6820 6821 return features; 6822 } 6823 6824 static void netdev_sync_lower_features(struct net_device *upper, 6825 struct net_device *lower, netdev_features_t features) 6826 { 6827 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; 6828 netdev_features_t feature; 6829 int feature_bit; 6830 6831 for_each_netdev_feature(&upper_disables, feature_bit) { 6832 feature = __NETIF_F_BIT(feature_bit); 6833 if (!(features & feature) && (lower->features & feature)) { 6834 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", 6835 &feature, lower->name); 6836 lower->wanted_features &= ~feature; 6837 netdev_update_features(lower); 6838 6839 if (unlikely(lower->features & feature)) 6840 netdev_WARN(upper, "failed to disable %pNF on %s!\n", 6841 &feature, lower->name); 6842 } 6843 } 6844 } 6845 6846 static netdev_features_t netdev_fix_features(struct net_device *dev, 6847 netdev_features_t features) 6848 { 6849 /* Fix illegal checksum combinations */ 6850 if ((features & NETIF_F_HW_CSUM) && 6851 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 6852 netdev_warn(dev, "mixed HW and IP checksum settings.\n"); 6853 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); 6854 } 6855 6856 /* TSO requires that SG is present as well. 
*/ 6857 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { 6858 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); 6859 features &= ~NETIF_F_ALL_TSO; 6860 } 6861 6862 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && 6863 !(features & NETIF_F_IP_CSUM)) { 6864 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); 6865 features &= ~NETIF_F_TSO; 6866 features &= ~NETIF_F_TSO_ECN; 6867 } 6868 6869 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && 6870 !(features & NETIF_F_IPV6_CSUM)) { 6871 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); 6872 features &= ~NETIF_F_TSO6; 6873 } 6874 6875 /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */ 6876 if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO)) 6877 features &= ~NETIF_F_TSO_MANGLEID; 6878 6879 /* TSO ECN requires that TSO is present as well. */ 6880 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) 6881 features &= ~NETIF_F_TSO_ECN; 6882 6883 /* Software GSO depends on SG. */ 6884 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { 6885 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); 6886 features &= ~NETIF_F_GSO; 6887 } 6888 6889 /* UFO needs SG and checksumming */ 6890 if (features & NETIF_F_UFO) { 6891 /* maybe split UFO into V4 and V6? */ 6892 if (!(features & NETIF_F_HW_CSUM) && 6893 ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) != 6894 (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) { 6895 netdev_dbg(dev, 6896 "Dropping NETIF_F_UFO since no checksum offload features.\n"); 6897 features &= ~NETIF_F_UFO; 6898 } 6899 6900 if (!(features & NETIF_F_SG)) { 6901 netdev_dbg(dev, 6902 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); 6903 features &= ~NETIF_F_UFO; 6904 } 6905 } 6906 6907 /* GSO partial features require GSO partial be set */ 6908 if ((features & dev->gso_partial_features) && 6909 !(features & NETIF_F_GSO_PARTIAL)) { 6910 netdev_dbg(dev, 6911 "Dropping partially supported GSO features since no GSO partial.\n"); 6912 features &= ~dev->gso_partial_features; 6913 } 6914 6915 #ifdef CONFIG_NET_RX_BUSY_POLL 6916 if (dev->netdev_ops->ndo_busy_poll) 6917 features |= NETIF_F_BUSY_POLL; 6918 else 6919 #endif 6920 features &= ~NETIF_F_BUSY_POLL; 6921 6922 return features; 6923 } 6924 6925 int __netdev_update_features(struct net_device *dev) 6926 { 6927 struct net_device *upper, *lower; 6928 netdev_features_t features; 6929 struct list_head *iter; 6930 int err = -1; 6931 6932 ASSERT_RTNL(); 6933 6934 features = netdev_get_wanted_features(dev); 6935 6936 if (dev->netdev_ops->ndo_fix_features) 6937 features = dev->netdev_ops->ndo_fix_features(dev, features); 6938 6939 /* driver might be less strict about feature dependencies */ 6940 features = netdev_fix_features(dev, features); 6941 6942 /* some features can't be enabled if they're off an an upper device */ 6943 netdev_for_each_upper_dev_rcu(dev, upper, iter) 6944 features = netdev_sync_upper_features(dev, upper, features); 6945 6946 if (dev->features == features) 6947 goto sync_lower; 6948 6949 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", 6950 &dev->features, &features); 6951 6952 if (dev->netdev_ops->ndo_set_features) 6953 err = dev->netdev_ops->ndo_set_features(dev, features); 6954 else 6955 err = 0; 6956 6957 if (unlikely(err < 0)) { 6958 netdev_err(dev, 6959 "set_features() failed (%d); wanted %pNF, left %pNF\n", 6960 err, &features, &dev->features); 6961 /* return non-0 since some features might have changed and 6962 * it's better to fire 
a spurious notification than miss it 6963 */ 6964 return -1; 6965 } 6966 6967 sync_lower: 6968 /* some features must be disabled on lower devices when disabled 6969 * on an upper device (think: bonding master or bridge) 6970 */ 6971 netdev_for_each_lower_dev(dev, lower, iter) 6972 netdev_sync_lower_features(dev, lower, features); 6973 6974 if (!err) 6975 dev->features = features; 6976 6977 return err < 0 ? 0 : 1; 6978 } 6979 6980 /** 6981 * netdev_update_features - recalculate device features 6982 * @dev: the device to check 6983 * 6984 * Recalculate dev->features set and send notifications if it 6985 * has changed. Should be called after driver or hardware dependent 6986 * conditions might have changed that influence the features. 6987 */ 6988 void netdev_update_features(struct net_device *dev) 6989 { 6990 if (__netdev_update_features(dev)) 6991 netdev_features_change(dev); 6992 } 6993 EXPORT_SYMBOL(netdev_update_features); 6994 6995 /** 6996 * netdev_change_features - recalculate device features 6997 * @dev: the device to check 6998 * 6999 * Recalculate dev->features set and send notifications even 7000 * if they have not changed. Should be called instead of 7001 * netdev_update_features() if also dev->vlan_features might 7002 * have changed to allow the changes to be propagated to stacked 7003 * VLAN devices. 7004 */ 7005 void netdev_change_features(struct net_device *dev) 7006 { 7007 __netdev_update_features(dev); 7008 netdev_features_change(dev); 7009 } 7010 EXPORT_SYMBOL(netdev_change_features); 7011 7012 /** 7013 * netif_stacked_transfer_operstate - transfer operstate 7014 * @rootdev: the root or lower level device to transfer state from 7015 * @dev: the device to transfer operstate to 7016 * 7017 * Transfer operational state from root to device. This is normally 7018 * called when a stacking relationship exists between the root 7019 * device and the device(a leaf device). 
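 *
 * Illustrative sketch (hypothetical caller, not code from this file): a
 * VLAN-like driver reacting to a state change on its lower device might
 * propagate the state with:
 *
 *	netif_stacked_transfer_operstate(lower_dev, vlan_dev);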
7020 */ 7021 void netif_stacked_transfer_operstate(const struct net_device *rootdev, 7022 struct net_device *dev) 7023 { 7024 if (rootdev->operstate == IF_OPER_DORMANT) 7025 netif_dormant_on(dev); 7026 else 7027 netif_dormant_off(dev); 7028 7029 if (netif_carrier_ok(rootdev)) { 7030 if (!netif_carrier_ok(dev)) 7031 netif_carrier_on(dev); 7032 } else { 7033 if (netif_carrier_ok(dev)) 7034 netif_carrier_off(dev); 7035 } 7036 } 7037 EXPORT_SYMBOL(netif_stacked_transfer_operstate); 7038 7039 #ifdef CONFIG_SYSFS 7040 static int netif_alloc_rx_queues(struct net_device *dev) 7041 { 7042 unsigned int i, count = dev->num_rx_queues; 7043 struct netdev_rx_queue *rx; 7044 size_t sz = count * sizeof(*rx); 7045 7046 BUG_ON(count < 1); 7047 7048 rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 7049 if (!rx) { 7050 rx = vzalloc(sz); 7051 if (!rx) 7052 return -ENOMEM; 7053 } 7054 dev->_rx = rx; 7055 7056 for (i = 0; i < count; i++) 7057 rx[i].dev = dev; 7058 return 0; 7059 } 7060 #endif 7061 7062 static void netdev_init_one_queue(struct net_device *dev, 7063 struct netdev_queue *queue, void *_unused) 7064 { 7065 /* Initialize queue lock */ 7066 spin_lock_init(&queue->_xmit_lock); 7067 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); 7068 queue->xmit_lock_owner = -1; 7069 netdev_queue_numa_node_write(queue, NUMA_NO_NODE); 7070 queue->dev = dev; 7071 #ifdef CONFIG_BQL 7072 dql_init(&queue->dql, HZ); 7073 #endif 7074 } 7075 7076 static void netif_free_tx_queues(struct net_device *dev) 7077 { 7078 kvfree(dev->_tx); 7079 } 7080 7081 static int netif_alloc_netdev_queues(struct net_device *dev) 7082 { 7083 unsigned int count = dev->num_tx_queues; 7084 struct netdev_queue *tx; 7085 size_t sz = count * sizeof(*tx); 7086 7087 if (count < 1 || count > 0xffff) 7088 return -EINVAL; 7089 7090 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 7091 if (!tx) { 7092 tx = vzalloc(sz); 7093 if (!tx) 7094 return -ENOMEM; 7095 } 7096 dev->_tx = tx; 7097 7098 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); 7099 spin_lock_init(&dev->tx_global_lock); 7100 7101 return 0; 7102 } 7103 7104 void netif_tx_stop_all_queues(struct net_device *dev) 7105 { 7106 unsigned int i; 7107 7108 for (i = 0; i < dev->num_tx_queues; i++) { 7109 struct netdev_queue *txq = netdev_get_tx_queue(dev, i); 7110 netif_tx_stop_queue(txq); 7111 } 7112 } 7113 EXPORT_SYMBOL(netif_tx_stop_all_queues); 7114 7115 /** 7116 * register_netdevice - register a network device 7117 * @dev: device to register 7118 * 7119 * Take a completed network device structure and add it to the kernel 7120 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 7121 * chain. 0 is returned on success. A negative errno code is returned 7122 * on a failure to set up the device, or if the name is a duplicate. 7123 * 7124 * Callers must hold the rtnl semaphore. You may want 7125 * register_netdev() instead of this. 7126 * 7127 * BUGS: 7128 * The locking appears insufficient to guarantee two parallel registers 7129 * will not get the same name. 7130 */ 7131 7132 int register_netdevice(struct net_device *dev) 7133 { 7134 int ret; 7135 struct net *net = dev_net(dev); 7136 7137 BUG_ON(dev_boot_phase); 7138 ASSERT_RTNL(); 7139 7140 might_sleep(); 7141 7142 /* When net_device's are persistent, this will be fatal. 
*/ 7143 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); 7144 BUG_ON(!net); 7145 7146 spin_lock_init(&dev->addr_list_lock); 7147 netdev_set_addr_lockdep_class(dev); 7148 7149 ret = dev_get_valid_name(net, dev, dev->name); 7150 if (ret < 0) 7151 goto out; 7152 7153 /* Init, if this function is available */ 7154 if (dev->netdev_ops->ndo_init) { 7155 ret = dev->netdev_ops->ndo_init(dev); 7156 if (ret) { 7157 if (ret > 0) 7158 ret = -EIO; 7159 goto out; 7160 } 7161 } 7162 7163 if (((dev->hw_features | dev->features) & 7164 NETIF_F_HW_VLAN_CTAG_FILTER) && 7165 (!dev->netdev_ops->ndo_vlan_rx_add_vid || 7166 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { 7167 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); 7168 ret = -EINVAL; 7169 goto err_uninit; 7170 } 7171 7172 ret = -EBUSY; 7173 if (!dev->ifindex) 7174 dev->ifindex = dev_new_index(net); 7175 else if (__dev_get_by_index(net, dev->ifindex)) 7176 goto err_uninit; 7177 7178 /* Transfer changeable features to wanted_features and enable 7179 * software offloads (GSO and GRO). 7180 */ 7181 dev->hw_features |= NETIF_F_SOFT_FEATURES; 7182 dev->features |= NETIF_F_SOFT_FEATURES; 7183 dev->wanted_features = dev->features & dev->hw_features; 7184 7185 if (!(dev->flags & IFF_LOOPBACK)) 7186 dev->hw_features |= NETIF_F_NOCACHE_COPY; 7187 7188 /* If IPv4 TCP segmentation offload is supported we should also 7189 * allow the device to enable segmenting the frame with the option 7190 * of ignoring a static IP ID value. This doesn't enable the 7191 * feature itself but allows the user to enable it later. 7192 */ 7193 if (dev->hw_features & NETIF_F_TSO) 7194 dev->hw_features |= NETIF_F_TSO_MANGLEID; 7195 if (dev->vlan_features & NETIF_F_TSO) 7196 dev->vlan_features |= NETIF_F_TSO_MANGLEID; 7197 if (dev->mpls_features & NETIF_F_TSO) 7198 dev->mpls_features |= NETIF_F_TSO_MANGLEID; 7199 if (dev->hw_enc_features & NETIF_F_TSO) 7200 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; 7201 7202 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. 7203 */ 7204 dev->vlan_features |= NETIF_F_HIGHDMA; 7205 7206 /* Make NETIF_F_SG inheritable to tunnel devices. 7207 */ 7208 dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL; 7209 7210 /* Make NETIF_F_SG inheritable to MPLS. 7211 */ 7212 dev->mpls_features |= NETIF_F_SG; 7213 7214 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); 7215 ret = notifier_to_errno(ret); 7216 if (ret) 7217 goto err_uninit; 7218 7219 ret = netdev_register_kobject(dev); 7220 if (ret) 7221 goto err_uninit; 7222 dev->reg_state = NETREG_REGISTERED; 7223 7224 __netdev_update_features(dev); 7225 7226 /* 7227 * Default initial state at registry is that the 7228 * device is present. 7229 */ 7230 7231 set_bit(__LINK_STATE_PRESENT, &dev->state); 7232 7233 linkwatch_init_dev(dev); 7234 7235 dev_init_scheduler(dev); 7236 dev_hold(dev); 7237 list_netdevice(dev); 7238 add_device_randomness(dev->dev_addr, dev->addr_len); 7239 7240 /* If the device has permanent device address, driver should 7241 * set dev_addr and also addr_assign_type should be set to 7242 * NET_ADDR_PERM (default value). 7243 */ 7244 if (dev->addr_assign_type == NET_ADDR_PERM) 7245 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); 7246 7247 /* Notify protocols, that a new device appeared. 
*/ 7248 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); 7249 ret = notifier_to_errno(ret); 7250 if (ret) { 7251 rollback_registered(dev); 7252 dev->reg_state = NETREG_UNREGISTERED; 7253 } 7254 /* 7255 * Prevent userspace races by waiting until the network 7256 * device is fully setup before sending notifications. 7257 */ 7258 if (!dev->rtnl_link_ops || 7259 dev->rtnl_link_state == RTNL_LINK_INITIALIZED) 7260 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); 7261 7262 out: 7263 return ret; 7264 7265 err_uninit: 7266 if (dev->netdev_ops->ndo_uninit) 7267 dev->netdev_ops->ndo_uninit(dev); 7268 goto out; 7269 } 7270 EXPORT_SYMBOL(register_netdevice); 7271 7272 /** 7273 * init_dummy_netdev - init a dummy network device for NAPI 7274 * @dev: device to init 7275 * 7276 * This takes a network device structure and initialize the minimum 7277 * amount of fields so it can be used to schedule NAPI polls without 7278 * registering a full blown interface. This is to be used by drivers 7279 * that need to tie several hardware interfaces to a single NAPI 7280 * poll scheduler due to HW limitations. 7281 */ 7282 int init_dummy_netdev(struct net_device *dev) 7283 { 7284 /* Clear everything. Note we don't initialize spinlocks 7285 * are they aren't supposed to be taken by any of the 7286 * NAPI code and this dummy netdev is supposed to be 7287 * only ever used for NAPI polls 7288 */ 7289 memset(dev, 0, sizeof(struct net_device)); 7290 7291 /* make sure we BUG if trying to hit standard 7292 * register/unregister code path 7293 */ 7294 dev->reg_state = NETREG_DUMMY; 7295 7296 /* NAPI wants this */ 7297 INIT_LIST_HEAD(&dev->napi_list); 7298 7299 /* a dummy interface is started by default */ 7300 set_bit(__LINK_STATE_PRESENT, &dev->state); 7301 set_bit(__LINK_STATE_START, &dev->state); 7302 7303 /* Note : We dont allocate pcpu_refcnt for dummy devices, 7304 * because users of this 'device' dont need to change 7305 * its refcount. 7306 */ 7307 7308 return 0; 7309 } 7310 EXPORT_SYMBOL_GPL(init_dummy_netdev); 7311 7312 7313 /** 7314 * register_netdev - register a network device 7315 * @dev: device to register 7316 * 7317 * Take a completed network device structure and add it to the kernel 7318 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 7319 * chain. 0 is returned on success. A negative errno code is returned 7320 * on a failure to set up the device, or if the name is a duplicate. 7321 * 7322 * This is a wrapper around register_netdevice that takes the rtnl semaphore 7323 * and expands the device name if you passed a format string to 7324 * alloc_netdev. 7325 */ 7326 int register_netdev(struct net_device *dev) 7327 { 7328 int err; 7329 7330 rtnl_lock(); 7331 err = register_netdevice(dev); 7332 rtnl_unlock(); 7333 return err; 7334 } 7335 EXPORT_SYMBOL(register_netdev); 7336 7337 int netdev_refcnt_read(const struct net_device *dev) 7338 { 7339 int i, refcnt = 0; 7340 7341 for_each_possible_cpu(i) 7342 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); 7343 return refcnt; 7344 } 7345 EXPORT_SYMBOL(netdev_refcnt_read); 7346 7347 /** 7348 * netdev_wait_allrefs - wait until all references are gone. 7349 * @dev: target net_device 7350 * 7351 * This is called when unregistering network devices. 7352 * 7353 * Any protocol or device that holds a reference should register 7354 * for netdevice notification, and cleanup and put back the 7355 * reference if they receive an UNREGISTER event. 7356 * We can get stuck here if buggy protocols don't correctly 7357 * call dev_put. 
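 *
 * Illustrative sketch (hypothetical notifier callback fragment, not code from
 * this file) of the expected reference drop in a protocol's netdevice
 * notifier, where "my_proto_dev" stands in for the protocol's cached device:
 *
 *	case NETDEV_UNREGISTER:
 *		if (my_proto_dev == dev) {
 *			dev_put(dev);
 *			my_proto_dev = NULL;
 *		}
 *		break;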
7358 */ 7359 static void netdev_wait_allrefs(struct net_device *dev) 7360 { 7361 unsigned long rebroadcast_time, warning_time; 7362 int refcnt; 7363 7364 linkwatch_forget_dev(dev); 7365 7366 rebroadcast_time = warning_time = jiffies; 7367 refcnt = netdev_refcnt_read(dev); 7368 7369 while (refcnt != 0) { 7370 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { 7371 rtnl_lock(); 7372 7373 /* Rebroadcast unregister notification */ 7374 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 7375 7376 __rtnl_unlock(); 7377 rcu_barrier(); 7378 rtnl_lock(); 7379 7380 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 7381 if (test_bit(__LINK_STATE_LINKWATCH_PENDING, 7382 &dev->state)) { 7383 /* We must not have linkwatch events 7384 * pending on unregister. If this 7385 * happens, we simply run the queue 7386 * unscheduled, resulting in a noop 7387 * for this device. 7388 */ 7389 linkwatch_run_queue(); 7390 } 7391 7392 __rtnl_unlock(); 7393 7394 rebroadcast_time = jiffies; 7395 } 7396 7397 msleep(250); 7398 7399 refcnt = netdev_refcnt_read(dev); 7400 7401 if (time_after(jiffies, warning_time + 10 * HZ)) { 7402 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", 7403 dev->name, refcnt); 7404 warning_time = jiffies; 7405 } 7406 } 7407 } 7408 7409 /* The sequence is: 7410 * 7411 * rtnl_lock(); 7412 * ... 7413 * register_netdevice(x1); 7414 * register_netdevice(x2); 7415 * ... 7416 * unregister_netdevice(y1); 7417 * unregister_netdevice(y2); 7418 * ... 7419 * rtnl_unlock(); 7420 * free_netdev(y1); 7421 * free_netdev(y2); 7422 * 7423 * We are invoked by rtnl_unlock(). 7424 * This allows us to deal with problems: 7425 * 1) We can delete sysfs objects which invoke hotplug 7426 * without deadlocking with linkwatch via keventd. 7427 * 2) Since we run with the RTNL semaphore not held, we can sleep 7428 * safely in order to wait for the netdev refcnt to drop to zero. 7429 * 7430 * We must not return until all unregister events added during 7431 * the interval the lock was held have been completed. 
7432 */ 7433 void netdev_run_todo(void) 7434 { 7435 struct list_head list; 7436 7437 /* Snapshot list, allow later requests */ 7438 list_replace_init(&net_todo_list, &list); 7439 7440 __rtnl_unlock(); 7441 7442 7443 /* Wait for rcu callbacks to finish before next phase */ 7444 if (!list_empty(&list)) 7445 rcu_barrier(); 7446 7447 while (!list_empty(&list)) { 7448 struct net_device *dev 7449 = list_first_entry(&list, struct net_device, todo_list); 7450 list_del(&dev->todo_list); 7451 7452 rtnl_lock(); 7453 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 7454 __rtnl_unlock(); 7455 7456 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { 7457 pr_err("network todo '%s' but state %d\n", 7458 dev->name, dev->reg_state); 7459 dump_stack(); 7460 continue; 7461 } 7462 7463 dev->reg_state = NETREG_UNREGISTERED; 7464 7465 netdev_wait_allrefs(dev); 7466 7467 /* paranoia */ 7468 BUG_ON(netdev_refcnt_read(dev)); 7469 BUG_ON(!list_empty(&dev->ptype_all)); 7470 BUG_ON(!list_empty(&dev->ptype_specific)); 7471 WARN_ON(rcu_access_pointer(dev->ip_ptr)); 7472 WARN_ON(rcu_access_pointer(dev->ip6_ptr)); 7473 WARN_ON(dev->dn_ptr); 7474 7475 if (dev->destructor) 7476 dev->destructor(dev); 7477 7478 /* Report a network device has been unregistered */ 7479 rtnl_lock(); 7480 dev_net(dev)->dev_unreg_count--; 7481 __rtnl_unlock(); 7482 wake_up(&netdev_unregistering_wq); 7483 7484 /* Free network device */ 7485 kobject_put(&dev->dev.kobj); 7486 } 7487 } 7488 7489 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has 7490 * all the same fields in the same order as net_device_stats, with only 7491 * the type differing, but rtnl_link_stats64 may have additional fields 7492 * at the end for newer counters. 7493 */ 7494 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, 7495 const struct net_device_stats *netdev_stats) 7496 { 7497 #if BITS_PER_LONG == 64 7498 BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats)); 7499 memcpy(stats64, netdev_stats, sizeof(*stats64)); 7500 /* zero out counters that only exist in rtnl_link_stats64 */ 7501 memset((char *)stats64 + sizeof(*netdev_stats), 0, 7502 sizeof(*stats64) - sizeof(*netdev_stats)); 7503 #else 7504 size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long); 7505 const unsigned long *src = (const unsigned long *)netdev_stats; 7506 u64 *dst = (u64 *)stats64; 7507 7508 BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); 7509 for (i = 0; i < n; i++) 7510 dst[i] = src[i]; 7511 /* zero out counters that only exist in rtnl_link_stats64 */ 7512 memset((char *)stats64 + n * sizeof(u64), 0, 7513 sizeof(*stats64) - n * sizeof(u64)); 7514 #endif 7515 } 7516 EXPORT_SYMBOL(netdev_stats_to_stats64); 7517 7518 /** 7519 * dev_get_stats - get network device statistics 7520 * @dev: device to get statistics from 7521 * @storage: place to store stats 7522 * 7523 * Get network statistics from device. Return @storage. 7524 * The device driver may provide its own method by setting 7525 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; 7526 * otherwise the internal statistics structure is used. 
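 *
 * Illustrative sketch (hypothetical caller, not code from this file):
 *
 *	struct rtnl_link_stats64 stats;
 *
 *	dev_get_stats(dev, &stats);
 *	pr_info("%s: rx_packets=%llu\n", dev->name, stats.rx_packets);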
7527 */ 7528 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, 7529 struct rtnl_link_stats64 *storage) 7530 { 7531 const struct net_device_ops *ops = dev->netdev_ops; 7532 7533 if (ops->ndo_get_stats64) { 7534 memset(storage, 0, sizeof(*storage)); 7535 ops->ndo_get_stats64(dev, storage); 7536 } else if (ops->ndo_get_stats) { 7537 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); 7538 } else { 7539 netdev_stats_to_stats64(storage, &dev->stats); 7540 } 7541 storage->rx_dropped += atomic_long_read(&dev->rx_dropped); 7542 storage->tx_dropped += atomic_long_read(&dev->tx_dropped); 7543 storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler); 7544 return storage; 7545 } 7546 EXPORT_SYMBOL(dev_get_stats); 7547 7548 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) 7549 { 7550 struct netdev_queue *queue = dev_ingress_queue(dev); 7551 7552 #ifdef CONFIG_NET_CLS_ACT 7553 if (queue) 7554 return queue; 7555 queue = kzalloc(sizeof(*queue), GFP_KERNEL); 7556 if (!queue) 7557 return NULL; 7558 netdev_init_one_queue(dev, queue, NULL); 7559 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); 7560 queue->qdisc_sleeping = &noop_qdisc; 7561 rcu_assign_pointer(dev->ingress_queue, queue); 7562 #endif 7563 return queue; 7564 } 7565 7566 static const struct ethtool_ops default_ethtool_ops; 7567 7568 void netdev_set_default_ethtool_ops(struct net_device *dev, 7569 const struct ethtool_ops *ops) 7570 { 7571 if (dev->ethtool_ops == &default_ethtool_ops) 7572 dev->ethtool_ops = ops; 7573 } 7574 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); 7575 7576 void netdev_freemem(struct net_device *dev) 7577 { 7578 char *addr = (char *)dev - dev->padded; 7579 7580 kvfree(addr); 7581 } 7582 7583 /** 7584 * alloc_netdev_mqs - allocate network device 7585 * @sizeof_priv: size of private data to allocate space for 7586 * @name: device name format string 7587 * @name_assign_type: origin of device name 7588 * @setup: callback to initialize device 7589 * @txqs: the number of TX subqueues to allocate 7590 * @rxqs: the number of RX subqueues to allocate 7591 * 7592 * Allocates a struct net_device with private data area for driver use 7593 * and performs basic initialization. Also allocates subqueue structs 7594 * for each queue on the device. 
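 *
 * Illustrative sketch (hypothetical driver, not code from this file):
 * allocating an Ethernet-style device with 4 TX and 4 RX queues, where
 * "struct my_priv" stands in for the driver's private data:
 *
 *	dev = alloc_netdev_mqs(sizeof(struct my_priv), "eth%d",
 *			       NET_NAME_UNKNOWN, ether_setup, 4, 4);
 *	if (!dev)
 *		return -ENOMEM;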
7595 */ 7596 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, 7597 unsigned char name_assign_type, 7598 void (*setup)(struct net_device *), 7599 unsigned int txqs, unsigned int rxqs) 7600 { 7601 struct net_device *dev; 7602 size_t alloc_size; 7603 struct net_device *p; 7604 7605 BUG_ON(strlen(name) >= sizeof(dev->name)); 7606 7607 if (txqs < 1) { 7608 pr_err("alloc_netdev: Unable to allocate device with zero queues\n"); 7609 return NULL; 7610 } 7611 7612 #ifdef CONFIG_SYSFS 7613 if (rxqs < 1) { 7614 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); 7615 return NULL; 7616 } 7617 #endif 7618 7619 alloc_size = sizeof(struct net_device); 7620 if (sizeof_priv) { 7621 /* ensure 32-byte alignment of private area */ 7622 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); 7623 alloc_size += sizeof_priv; 7624 } 7625 /* ensure 32-byte alignment of whole construct */ 7626 alloc_size += NETDEV_ALIGN - 1; 7627 7628 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 7629 if (!p) 7630 p = vzalloc(alloc_size); 7631 if (!p) 7632 return NULL; 7633 7634 dev = PTR_ALIGN(p, NETDEV_ALIGN); 7635 dev->padded = (char *)dev - (char *)p; 7636 7637 dev->pcpu_refcnt = alloc_percpu(int); 7638 if (!dev->pcpu_refcnt) 7639 goto free_dev; 7640 7641 if (dev_addr_init(dev)) 7642 goto free_pcpu; 7643 7644 dev_mc_init(dev); 7645 dev_uc_init(dev); 7646 7647 dev_net_set(dev, &init_net); 7648 7649 dev->gso_max_size = GSO_MAX_SIZE; 7650 dev->gso_max_segs = GSO_MAX_SEGS; 7651 7652 INIT_LIST_HEAD(&dev->napi_list); 7653 INIT_LIST_HEAD(&dev->unreg_list); 7654 INIT_LIST_HEAD(&dev->close_list); 7655 INIT_LIST_HEAD(&dev->link_watch_list); 7656 INIT_LIST_HEAD(&dev->adj_list.upper); 7657 INIT_LIST_HEAD(&dev->adj_list.lower); 7658 INIT_LIST_HEAD(&dev->all_adj_list.upper); 7659 INIT_LIST_HEAD(&dev->all_adj_list.lower); 7660 INIT_LIST_HEAD(&dev->ptype_all); 7661 INIT_LIST_HEAD(&dev->ptype_specific); 7662 #ifdef CONFIG_NET_SCHED 7663 hash_init(dev->qdisc_hash); 7664 #endif 7665 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; 7666 setup(dev); 7667 7668 if (!dev->tx_queue_len) { 7669 dev->priv_flags |= IFF_NO_QUEUE; 7670 dev->tx_queue_len = 1; 7671 } 7672 7673 dev->num_tx_queues = txqs; 7674 dev->real_num_tx_queues = txqs; 7675 if (netif_alloc_netdev_queues(dev)) 7676 goto free_all; 7677 7678 #ifdef CONFIG_SYSFS 7679 dev->num_rx_queues = rxqs; 7680 dev->real_num_rx_queues = rxqs; 7681 if (netif_alloc_rx_queues(dev)) 7682 goto free_all; 7683 #endif 7684 7685 strcpy(dev->name, name); 7686 dev->name_assign_type = name_assign_type; 7687 dev->group = INIT_NETDEV_GROUP; 7688 if (!dev->ethtool_ops) 7689 dev->ethtool_ops = &default_ethtool_ops; 7690 7691 nf_hook_ingress_init(dev); 7692 7693 return dev; 7694 7695 free_all: 7696 free_netdev(dev); 7697 return NULL; 7698 7699 free_pcpu: 7700 free_percpu(dev->pcpu_refcnt); 7701 free_dev: 7702 netdev_freemem(dev); 7703 return NULL; 7704 } 7705 EXPORT_SYMBOL(alloc_netdev_mqs); 7706 7707 /** 7708 * free_netdev - free network device 7709 * @dev: device 7710 * 7711 * This function does the last stage of destroying an allocated device 7712 * interface. The reference to the device object is released. 7713 * If this is the last reference then it will be freed. 7714 * Must be called in process context. 
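 *
 * Illustrative sketch (simplified, hypothetical driver removal path, not code
 * from this file):
 *
 *	unregister_netdev(dev);
 *	free_netdev(dev);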
7715 */ 7716 void free_netdev(struct net_device *dev) 7717 { 7718 struct napi_struct *p, *n; 7719 7720 might_sleep(); 7721 netif_free_tx_queues(dev); 7722 #ifdef CONFIG_SYSFS 7723 kvfree(dev->_rx); 7724 #endif 7725 7726 kfree(rcu_dereference_protected(dev->ingress_queue, 1)); 7727 7728 /* Flush device addresses */ 7729 dev_addr_flush(dev); 7730 7731 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) 7732 netif_napi_del(p); 7733 7734 free_percpu(dev->pcpu_refcnt); 7735 dev->pcpu_refcnt = NULL; 7736 7737 /* Compatibility with error handling in drivers */ 7738 if (dev->reg_state == NETREG_UNINITIALIZED) { 7739 netdev_freemem(dev); 7740 return; 7741 } 7742 7743 BUG_ON(dev->reg_state != NETREG_UNREGISTERED); 7744 dev->reg_state = NETREG_RELEASED; 7745 7746 /* will free via device release */ 7747 put_device(&dev->dev); 7748 } 7749 EXPORT_SYMBOL(free_netdev); 7750 7751 /** 7752 * synchronize_net - Synchronize with packet receive processing 7753 * 7754 * Wait for packets currently being received to be done. 7755 * Does not block later packets from starting. 7756 */ 7757 void synchronize_net(void) 7758 { 7759 might_sleep(); 7760 if (rtnl_is_locked()) 7761 synchronize_rcu_expedited(); 7762 else 7763 synchronize_rcu(); 7764 } 7765 EXPORT_SYMBOL(synchronize_net); 7766 7767 /** 7768 * unregister_netdevice_queue - remove device from the kernel 7769 * @dev: device 7770 * @head: list 7771 * 7772 * This function shuts down a device interface and removes it 7773 * from the kernel tables. 7774 * If head not NULL, device is queued to be unregistered later. 7775 * 7776 * Callers must hold the rtnl semaphore. You may want 7777 * unregister_netdev() instead of this. 7778 */ 7779 7780 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) 7781 { 7782 ASSERT_RTNL(); 7783 7784 if (head) { 7785 list_move_tail(&dev->unreg_list, head); 7786 } else { 7787 rollback_registered(dev); 7788 /* Finish processing unregister after unlock */ 7789 net_set_todo(dev); 7790 } 7791 } 7792 EXPORT_SYMBOL(unregister_netdevice_queue); 7793 7794 /** 7795 * unregister_netdevice_many - unregister many devices 7796 * @head: list of devices 7797 * 7798 * Note: As most callers use a stack allocated list_head, 7799 * we force a list_del() to make sure stack wont be corrupted later. 7800 */ 7801 void unregister_netdevice_many(struct list_head *head) 7802 { 7803 struct net_device *dev; 7804 7805 if (!list_empty(head)) { 7806 rollback_registered_many(head); 7807 list_for_each_entry(dev, head, unreg_list) 7808 net_set_todo(dev); 7809 list_del(head); 7810 } 7811 } 7812 EXPORT_SYMBOL(unregister_netdevice_many); 7813 7814 /** 7815 * unregister_netdev - remove device from the kernel 7816 * @dev: device 7817 * 7818 * This function shuts down a device interface and removes it 7819 * from the kernel tables. 7820 * 7821 * This is just a wrapper for unregister_netdevice that takes 7822 * the rtnl semaphore. In general you want to use this and not 7823 * unregister_netdevice. 7824 */ 7825 void unregister_netdev(struct net_device *dev) 7826 { 7827 rtnl_lock(); 7828 unregister_netdevice(dev); 7829 rtnl_unlock(); 7830 } 7831 EXPORT_SYMBOL(unregister_netdev); 7832 7833 /** 7834 * dev_change_net_namespace - move device to different nethost namespace 7835 * @dev: device 7836 * @net: network namespace 7837 * @pat: If not NULL name pattern to try if the current device name 7838 * is already taken in the destination network namespace. 
7839 * 7840 * This function shuts down a device interface and moves it 7841 * to a new network namespace. On success 0 is returned, on 7842 * a failure a netagive errno code is returned. 7843 * 7844 * Callers must hold the rtnl semaphore. 7845 */ 7846 7847 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) 7848 { 7849 int err; 7850 7851 ASSERT_RTNL(); 7852 7853 /* Don't allow namespace local devices to be moved. */ 7854 err = -EINVAL; 7855 if (dev->features & NETIF_F_NETNS_LOCAL) 7856 goto out; 7857 7858 /* Ensure the device has been registrered */ 7859 if (dev->reg_state != NETREG_REGISTERED) 7860 goto out; 7861 7862 /* Get out if there is nothing todo */ 7863 err = 0; 7864 if (net_eq(dev_net(dev), net)) 7865 goto out; 7866 7867 /* Pick the destination device name, and ensure 7868 * we can use it in the destination network namespace. 7869 */ 7870 err = -EEXIST; 7871 if (__dev_get_by_name(net, dev->name)) { 7872 /* We get here if we can't use the current device name */ 7873 if (!pat) 7874 goto out; 7875 if (dev_get_valid_name(net, dev, pat) < 0) 7876 goto out; 7877 } 7878 7879 /* 7880 * And now a mini version of register_netdevice unregister_netdevice. 7881 */ 7882 7883 /* If device is running close it first. */ 7884 dev_close(dev); 7885 7886 /* And unlink it from device chain */ 7887 err = -ENODEV; 7888 unlist_netdevice(dev); 7889 7890 synchronize_net(); 7891 7892 /* Shutdown queueing discipline. */ 7893 dev_shutdown(dev); 7894 7895 /* Notify protocols, that we are about to destroy 7896 this device. They should clean all the things. 7897 7898 Note that dev->reg_state stays at NETREG_REGISTERED. 7899 This is wanted because this way 8021q and macvlan know 7900 the device is just moving and can keep their slaves up. 7901 */ 7902 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 7903 rcu_barrier(); 7904 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 7905 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); 7906 7907 /* 7908 * Flush the unicast and multicast chains 7909 */ 7910 dev_uc_flush(dev); 7911 dev_mc_flush(dev); 7912 7913 /* Send a netdev-removed uevent to the old namespace */ 7914 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); 7915 netdev_adjacent_del_links(dev); 7916 7917 /* Actually switch the network namespace */ 7918 dev_net_set(dev, net); 7919 7920 /* If there is an ifindex conflict assign a new one */ 7921 if (__dev_get_by_index(net, dev->ifindex)) 7922 dev->ifindex = dev_new_index(net); 7923 7924 /* Send a netdev-add uevent to the new namespace */ 7925 kobject_uevent(&dev->dev.kobj, KOBJ_ADD); 7926 netdev_adjacent_add_links(dev); 7927 7928 /* Fixup kobjects */ 7929 err = device_rename(&dev->dev, dev->name); 7930 WARN_ON(err); 7931 7932 /* Add the device back in the hashes */ 7933 list_netdevice(dev); 7934 7935 /* Notify protocols, that a new device appeared. */ 7936 call_netdevice_notifiers(NETDEV_REGISTER, dev); 7937 7938 /* 7939 * Prevent userspace races by waiting until the network 7940 * device is fully setup before sending notifications. 
static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU, with one exception:
	 * process_backlog() must be called by cpu owning percpu backlog.
	 * We properly handle process_queue & input_pkt_queue later.
	 */
	while (!list_empty(&oldsd->poll_list)) {
		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
							    struct napi_struct,
							    poll_list);

		list_del_init(&napi->poll_list);
		if (napi->poll == process_backlog)
			napi->state = 0;
		else
			____napi_schedule(sd, napi);
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}

	return NOTIFY_OK;
}


/**
 *	netdev_increment_features - increment feature set by one
 *	@all: current feature set
 *	@one: new feature set
 *	@mask: mask feature set
 *
 *	Computes a new feature set after adding a device with feature set
 *	@one to the master device with current feature set @all.  Will not
 *	enable anything that is off in @mask. Returns the new feature set.
 */
netdev_features_t netdev_increment_features(netdev_features_t all,
	netdev_features_t one, netdev_features_t mask)
{
	if (mask & NETIF_F_HW_CSUM)
		mask |= NETIF_F_CSUM_MASK;
	mask |= NETIF_F_VLAN_CHALLENGED;

	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
	all &= one | ~NETIF_F_ALL_FOR_ALL;

	/* If one device supports hw checksumming, set for all. */
	if (all & NETIF_F_HW_CSUM)
		all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);

	return all;
}
EXPORT_SYMBOL(netdev_increment_features);

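/*
 * Illustrative sketch (not part of this file): aggregating drivers such
 * as bonding or team recompute their feature set by folding each slave
 * in with netdev_increment_features().  Roughly, with hypothetical
 * "mask", "priv" and "slave" names:
 *
 *	netdev_features_t features = mask;	// features the master may offer
 *
 *	list_for_each_entry(slave, &priv->slaves, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     mask);
 *	master->features = features;
 *
 * NETIF_F_ONE_FOR_ALL bits end up set if any slave has them, while
 * NETIF_F_ALL_FOR_ALL bits survive only if every slave has them.
 */
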
static struct hlist_head * __net_init netdev_create_hash(void)
{
	int i;
	struct hlist_head *hash;

	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
	if (hash != NULL)
		for (i = 0; i < NETDEV_HASHENTRIES; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}

/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	if (net != &init_net)
		INIT_LIST_HEAD(&net->dev_base_head);

	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	return 0;

err_idx:
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}

/**
 *	netdev_drivername - network driver for the device
 *	@dev: network device
 *
 *	Determine network driver for device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
	const struct device_driver *driver;
	const struct device *parent;
	const char *empty = "";

	parent = dev->dev.parent;
	if (!parent)
		return empty;

	driver = parent->driver;
	if (driver && driver->name)
		return driver->name;
	return empty;
}

static void __netdev_printk(const char *level, const struct net_device *dev,
			    struct va_format *vaf)
{
	if (dev && dev->dev.parent) {
		dev_printk_emit(level[1] - '0',
				dev->dev.parent,
				"%s %s %s%s: %pV",
				dev_driver_string(dev->dev.parent),
				dev_name(dev->dev.parent),
				netdev_name(dev), netdev_reg_state(dev),
				vaf);
	} else if (dev) {
		printk("%s%s%s: %pV",
		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
	} else {
		printk("%s(NULL net_device): %pV", level, vaf);
	}
}

void netdev_printk(const char *level, const struct net_device *dev,
		   const char *format, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, format);

	vaf.fmt = format;
	vaf.va = &args;

	__netdev_printk(level, dev, &vaf);

	va_end(args);
}
EXPORT_SYMBOL(netdev_printk);

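/*
 * Illustrative sketch (not part of this file): drivers rarely call
 * netdev_printk() directly; they use the level-specific wrappers
 * generated just below, which prefix messages with the driver name,
 * the parent bus device and the netdev name:
 *
 *	netdev_err(dev, "TX watchdog timeout on queue %d\n", txq);
 *	netdev_info(dev, "link up, %u Mbps, %s duplex\n", speed, duplex);
 *
 * Each wrapper expands to __netdev_printk() with the matching KERN_*
 * level.
 */
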
#define define_netdev_printk_level(func, level)			\
void func(const struct net_device *dev, const char *fmt, ...)		\
{									\
	struct va_format vaf;						\
	va_list args;							\
									\
	va_start(args, fmt);						\
									\
	vaf.fmt = fmt;							\
	vaf.va = &args;							\
									\
	__netdev_printk(level, dev, &vaf);				\
									\
	va_end(args);							\
}									\
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);

static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};

static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmoveable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			pr_emerg("%s: failed to move %s to init_net: %d\n",
				 __func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}

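/*
 * Illustrative note (not part of this file): when a namespace holding a
 * physical NIC is destroyed, the device is not deleted but handed back
 * to init_net under a fallback name derived from its ifindex.  For
 * example, a device named "eth0" with ifindex 5 in the dying namespace
 * reappears in init_net as "dev5", since its original name may already
 * be taken there.  Virtual devices that provide rtnl_link_ops are
 * instead torn down by default_device_exit_batch() below.
 */
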
static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
	/* Return with the rtnl_lock held when there are no network
	 * devices unregistering in any network namespace in net_list.
	 */
	struct net *net;
	bool unregistering;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(&netdev_unregistering_wq, &wait);
	for (;;) {
		unregistering = false;
		rtnl_lock();
		list_for_each_entry(net, net_list, exit_list) {
			if (net->dev_unreg_count > 0) {
				unregistering = true;
				break;
			}
		}
		if (!unregistering)
			break;
		__rtnl_unlock();

		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	}
	remove_wait_queue(&netdev_unregistering_wq, &wait);
}

static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace.  Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	/* To prevent network device cleanup code from dereferencing
	 * loopback devices or network devices that have been freed,
	 * wait here for all pending unregistrations to complete
	 * before unregistering the loopback device and allowing the
	 * network namespace to be freed.
	 *
	 * The netdev todo list containing all network device
	 * unregistrations that happen in default_device_exit_batch
	 * will run in the rtnl_unlock() at the end of
	 * default_device_exit_batch.
	 */
	rtnl_lock_unregistering(net_list);
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};

/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 *
 */

/*
 *	This is called single threaded during boot, so no need
 *	to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	INIT_LIST_HEAD(&offload_base);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct work_struct *flush = per_cpu_ptr(&flush_works, i);
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		INIT_WORK(flush, flush_backlog);

		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->cpu = i;
#endif

		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present.  Since we now dynamically allocate and free the
	 * loopback device, ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices.  This ensures that the loopback
	 * device is the first device that appears and the last network
	 * device that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_subsys_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);
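
/*
 * Illustrative note (not part of this file): net_dev_init() runs at
 * subsys_initcall time, i.e. before device_initcall/module_init, so by
 * the time ordinary drivers and protocols load, the per-cpu softnet
 * state, the NET_TX/NET_RX softirqs and the ptype lists above are
 * already set up.  A protocol tap registered from such a module might
 * look roughly like this (names hypothetical):
 *
 *	static int tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *			   struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		// inspect the packet here, then consume it
 *		kfree_skb(skb);
 *		return 0;
 *	}
 *
 *	static struct packet_type tap_pt __read_mostly = {
 *		.type = htons(ETH_P_ALL),
 *		.func = tap_rcv,
 *	};
 *
 *	dev_add_pack(&tap_pt);	// lands on the ptype_all (taps) list
 */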