1 /* 2 * NET3 Protocol independent device support routines. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public License 6 * as published by the Free Software Foundation; either version 7 * 2 of the License, or (at your option) any later version. 8 * 9 * Derived from the non IP parts of dev.c 1.0.19 10 * Authors: Ross Biro 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * 14 * Additional Authors: 15 * Florian la Roche <rzsfl@rz.uni-sb.de> 16 * Alan Cox <gw4pts@gw4pts.ampr.org> 17 * David Hinds <dahinds@users.sourceforge.net> 18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> 19 * Adam Sulmicki <adam@cfar.umd.edu> 20 * Pekka Riikonen <priikone@poesidon.pspt.fi> 21 * 22 * Changes: 23 * D.J. Barrow : Fixed bug where dev->refcnt gets set 24 * to 2 if register_netdev gets called 25 * before net_dev_init & also removed a 26 * few lines of code in the process. 27 * Alan Cox : device private ioctl copies fields back. 28 * Alan Cox : Transmit queue code does relevant 29 * stunts to keep the queue safe. 30 * Alan Cox : Fixed double lock. 31 * Alan Cox : Fixed promisc NULL pointer trap 32 * ???????? : Support the full private ioctl range 33 * Alan Cox : Moved ioctl permission check into 34 * drivers 35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI 36 * Alan Cox : 100 backlog just doesn't cut it when 37 * you start doing multicast video 8) 38 * Alan Cox : Rewrote net_bh and list manager. 39 * Alan Cox : Fix ETH_P_ALL echoback lengths. 40 * Alan Cox : Took out transmit every packet pass 41 * Saved a few bytes in the ioctl handler 42 * Alan Cox : Network driver sets packet type before 43 * calling netif_rx. Saves a function 44 * call a packet. 45 * Alan Cox : Hashed net_bh() 46 * Richard Kooijman: Timestamp fixes. 47 * Alan Cox : Wrong field in SIOCGIFDSTADDR 48 * Alan Cox : Device lock protection. 49 * Alan Cox : Fixed nasty side effect of device close 50 * changes. 51 * Rudi Cilibrasi : Pass the right thing to 52 * set_mac_address() 53 * Dave Miller : 32bit quantity for the device lock to 54 * make it work out on a Sparc. 55 * Bjorn Ekwall : Added KERNELD hack. 56 * Alan Cox : Cleaned up the backlog initialise. 57 * Craig Metz : SIOCGIFCONF fix if space for under 58 * 1 device. 59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there 60 * is no device open function. 61 * Andi Kleen : Fix error reporting for SIOCGIFCONF 62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF 63 * Cyrus Durgin : Cleaned for KMOD 64 * Adam Sulmicki : Bug Fix : Network Device Unload 65 * A network device unload needs to purge 66 * the backlog queue. 
67 * Paul Rusty Russell : SIOCSIFNAME 68 * Pekka Riikonen : Netdev boot-time settings code 69 * Andrew Morton : Make unregister_netdevice wait 70 * indefinitely on dev->refcnt 71 * J Hadi Salim : - Backlog queue sampling 72 * - netif_rx() feedback 73 */ 74 75 #include <asm/uaccess.h> 76 #include <linux/bitops.h> 77 #include <linux/capability.h> 78 #include <linux/cpu.h> 79 #include <linux/types.h> 80 #include <linux/kernel.h> 81 #include <linux/hash.h> 82 #include <linux/slab.h> 83 #include <linux/sched.h> 84 #include <linux/mutex.h> 85 #include <linux/string.h> 86 #include <linux/mm.h> 87 #include <linux/socket.h> 88 #include <linux/sockios.h> 89 #include <linux/errno.h> 90 #include <linux/interrupt.h> 91 #include <linux/if_ether.h> 92 #include <linux/netdevice.h> 93 #include <linux/etherdevice.h> 94 #include <linux/ethtool.h> 95 #include <linux/notifier.h> 96 #include <linux/skbuff.h> 97 #include <net/net_namespace.h> 98 #include <net/sock.h> 99 #include <linux/rtnetlink.h> 100 #include <linux/stat.h> 101 #include <net/dst.h> 102 #include <net/pkt_sched.h> 103 #include <net/checksum.h> 104 #include <net/xfrm.h> 105 #include <linux/highmem.h> 106 #include <linux/init.h> 107 #include <linux/module.h> 108 #include <linux/netpoll.h> 109 #include <linux/rcupdate.h> 110 #include <linux/delay.h> 111 #include <net/iw_handler.h> 112 #include <asm/current.h> 113 #include <linux/audit.h> 114 #include <linux/dmaengine.h> 115 #include <linux/err.h> 116 #include <linux/ctype.h> 117 #include <linux/if_arp.h> 118 #include <linux/if_vlan.h> 119 #include <linux/ip.h> 120 #include <net/ip.h> 121 #include <net/mpls.h> 122 #include <linux/ipv6.h> 123 #include <linux/in.h> 124 #include <linux/jhash.h> 125 #include <linux/random.h> 126 #include <trace/events/napi.h> 127 #include <trace/events/net.h> 128 #include <trace/events/skb.h> 129 #include <linux/pci.h> 130 #include <linux/inetdevice.h> 131 #include <linux/cpu_rmap.h> 132 #include <linux/static_key.h> 133 #include <linux/hashtable.h> 134 #include <linux/vmalloc.h> 135 #include <linux/if_macvlan.h> 136 #include <linux/errqueue.h> 137 #include <linux/hrtimer.h> 138 139 #include "net-sysfs.h" 140 141 /* Instead of increasing this, you should create a hash table. */ 142 #define MAX_GRO_SKBS 8 143 144 /* This should be increased if a protocol with a bigger head is added. */ 145 #define GRO_MAX_HEAD (MAX_HEADER + 128) 146 147 static DEFINE_SPINLOCK(ptype_lock); 148 static DEFINE_SPINLOCK(offload_lock); 149 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; 150 struct list_head ptype_all __read_mostly; /* Taps */ 151 static struct list_head offload_base __read_mostly; 152 153 static int netif_rx_internal(struct sk_buff *skb); 154 static int call_netdevice_notifiers_info(unsigned long val, 155 struct net_device *dev, 156 struct netdev_notifier_info *info); 157 158 /* 159 * The @dev_base_head list is protected by @dev_base_lock and the rtnl 160 * semaphore. 161 * 162 * Pure readers hold dev_base_lock for reading, or rcu_read_lock() 163 * 164 * Writers must hold the rtnl semaphore while they loop through the 165 * dev_base_head list, and hold dev_base_lock for writing when they do the 166 * actual updates. This allows pure readers to access the list even 167 * while a writer is preparing to update it. 168 * 169 * To put it another way, dev_base_lock is held for writing only to 170 * protect against pure readers; the rtnl semaphore provides the 171 * protection against other writers. 
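 *
 * A minimal sketch of the two access patterns described above (purely
 * illustrative, not part of this file; do_something() is a made-up
 * placeholder):
 *
 *	// pure reader: RCU alone is sufficient
 *	rcu_read_lock();
 *	for_each_netdev_rcu(net, dev)
 *		do_something(dev);
 *	rcu_read_unlock();
 *
 *	// writer: RTNL serializes writers, dev_base_lock fences readers
 *	ASSERT_RTNL();
 *	write_lock_bh(&dev_base_lock);
 *	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 *	write_unlock_bh(&dev_base_lock);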
172 * 173 * See, for example usages, register_netdevice() and 174 * unregister_netdevice(), which must be called with the rtnl 175 * semaphore held. 176 */ 177 DEFINE_RWLOCK(dev_base_lock); 178 EXPORT_SYMBOL(dev_base_lock); 179 180 /* protects napi_hash addition/deletion and napi_gen_id */ 181 static DEFINE_SPINLOCK(napi_hash_lock); 182 183 static unsigned int napi_gen_id; 184 static DEFINE_HASHTABLE(napi_hash, 8); 185 186 static seqcount_t devnet_rename_seq; 187 188 static inline void dev_base_seq_inc(struct net *net) 189 { 190 while (++net->dev_base_seq == 0); 191 } 192 193 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) 194 { 195 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); 196 197 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; 198 } 199 200 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) 201 { 202 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; 203 } 204 205 static inline void rps_lock(struct softnet_data *sd) 206 { 207 #ifdef CONFIG_RPS 208 spin_lock(&sd->input_pkt_queue.lock); 209 #endif 210 } 211 212 static inline void rps_unlock(struct softnet_data *sd) 213 { 214 #ifdef CONFIG_RPS 215 spin_unlock(&sd->input_pkt_queue.lock); 216 #endif 217 } 218 219 /* Device list insertion */ 220 static void list_netdevice(struct net_device *dev) 221 { 222 struct net *net = dev_net(dev); 223 224 ASSERT_RTNL(); 225 226 write_lock_bh(&dev_base_lock); 227 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); 228 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); 229 hlist_add_head_rcu(&dev->index_hlist, 230 dev_index_hash(net, dev->ifindex)); 231 write_unlock_bh(&dev_base_lock); 232 233 dev_base_seq_inc(net); 234 } 235 236 /* Device list removal 237 * caller must respect a RCU grace period before freeing/reusing dev 238 */ 239 static void unlist_netdevice(struct net_device *dev) 240 { 241 ASSERT_RTNL(); 242 243 /* Unlink dev from the device chain */ 244 write_lock_bh(&dev_base_lock); 245 list_del_rcu(&dev->dev_list); 246 hlist_del_rcu(&dev->name_hlist); 247 hlist_del_rcu(&dev->index_hlist); 248 write_unlock_bh(&dev_base_lock); 249 250 dev_base_seq_inc(dev_net(dev)); 251 } 252 253 /* 254 * Our notifier list 255 */ 256 257 static RAW_NOTIFIER_HEAD(netdev_chain); 258 259 /* 260 * Device drivers call our routines to queue packets here. We empty the 261 * queue in the local softnet handler. 
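 *
 * A driver receive path typically ends up here roughly as follows
 * (illustrative sketch; pkt_len and the copy step are placeholders):
 *
 *	skb = netdev_alloc_skb(dev, pkt_len);
 *	if (unlikely(!skb))
 *		return;		// frame dropped
 *	// ...copy the received frame into skb->data...
 *	skb_put(skb, pkt_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);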
262 */ 263 264 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); 265 EXPORT_PER_CPU_SYMBOL(softnet_data); 266 267 #ifdef CONFIG_LOCKDEP 268 /* 269 * register_netdevice() inits txq->_xmit_lock and sets lockdep class 270 * according to dev->type 271 */ 272 static const unsigned short netdev_lock_type[] = 273 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, 274 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, 275 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, 276 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, 277 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, 278 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, 279 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, 280 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, 281 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, 282 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, 283 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, 284 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, 285 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, 286 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, 287 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; 288 289 static const char *const netdev_lock_name[] = 290 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", 291 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", 292 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", 293 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", 294 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", 295 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", 296 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", 297 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", 298 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", 299 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", 300 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", 301 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", 302 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", 303 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", 304 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; 305 306 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; 307 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; 308 309 static inline unsigned short netdev_lock_pos(unsigned short dev_type) 310 { 311 int i; 312 313 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) 314 if (netdev_lock_type[i] == dev_type) 315 return i; 316 /* the last key is used by default */ 317 return ARRAY_SIZE(netdev_lock_type) - 1; 318 } 319 320 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, 321 unsigned short dev_type) 322 { 323 int i; 324 325 i = netdev_lock_pos(dev_type); 326 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], 327 netdev_lock_name[i]); 328 } 329 330 static inline void netdev_set_addr_lockdep_class(struct net_device *dev) 331 { 332 int i; 333 334 i = netdev_lock_pos(dev->type); 335 lockdep_set_class_and_name(&dev->addr_list_lock, 336 &netdev_addr_lock_key[i], 337 netdev_lock_name[i]); 338 } 339 #else 340 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, 341 unsigned short dev_type) 342 { 343 } 344 static inline void netdev_set_addr_lockdep_class(struct net_device *dev) 345 { 346 } 347 #endif 348 349 /******************************************************************************* 350 351 
Protocol management and registration routines 352 353 *******************************************************************************/ 354 355 /* 356 * Add a protocol ID to the list. Now that the input handler is 357 * smarter we can dispense with all the messy stuff that used to be 358 * here. 359 * 360 * BEWARE!!! Protocol handlers, mangling input packets, 361 * MUST BE last in hash buckets and checking protocol handlers 362 * MUST start from promiscuous ptype_all chain in net_bh. 363 * It is true now, do not change it. 364 * Explanation follows: if protocol handler, mangling packet, will 365 * be the first on list, it is not able to sense, that packet 366 * is cloned and should be copied-on-write, so that it will 367 * change it and subsequent readers will get broken packet. 368 * --ANK (980803) 369 */ 370 371 static inline struct list_head *ptype_head(const struct packet_type *pt) 372 { 373 if (pt->type == htons(ETH_P_ALL)) 374 return pt->dev ? &pt->dev->ptype_all : &ptype_all; 375 else 376 return pt->dev ? &pt->dev->ptype_specific : 377 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; 378 } 379 380 /** 381 * dev_add_pack - add packet handler 382 * @pt: packet type declaration 383 * 384 * Add a protocol handler to the networking stack. The passed &packet_type 385 * is linked into kernel lists and may not be freed until it has been 386 * removed from the kernel lists. 387 * 388 * This call does not sleep therefore it can not 389 * guarantee all CPU's that are in middle of receiving packets 390 * will see the new packet type (until the next received packet). 391 */ 392 393 void dev_add_pack(struct packet_type *pt) 394 { 395 struct list_head *head = ptype_head(pt); 396 397 spin_lock(&ptype_lock); 398 list_add_rcu(&pt->list, head); 399 spin_unlock(&ptype_lock); 400 } 401 EXPORT_SYMBOL(dev_add_pack); 402 403 /** 404 * __dev_remove_pack - remove packet handler 405 * @pt: packet type declaration 406 * 407 * Remove a protocol handler that was previously added to the kernel 408 * protocol handlers by dev_add_pack(). The passed &packet_type is removed 409 * from the kernel lists and can be freed or reused once this function 410 * returns. 411 * 412 * The packet type might still be in use by receivers 413 * and must not be freed until after all the CPU's have gone 414 * through a quiescent state. 415 */ 416 void __dev_remove_pack(struct packet_type *pt) 417 { 418 struct list_head *head = ptype_head(pt); 419 struct packet_type *pt1; 420 421 spin_lock(&ptype_lock); 422 423 list_for_each_entry(pt1, head, list) { 424 if (pt == pt1) { 425 list_del_rcu(&pt->list); 426 goto out; 427 } 428 } 429 430 pr_warn("dev_remove_pack: %p not found\n", pt); 431 out: 432 spin_unlock(&ptype_lock); 433 } 434 EXPORT_SYMBOL(__dev_remove_pack); 435 436 /** 437 * dev_remove_pack - remove packet handler 438 * @pt: packet type declaration 439 * 440 * Remove a protocol handler that was previously added to the kernel 441 * protocol handlers by dev_add_pack(). The passed &packet_type is removed 442 * from the kernel lists and can be freed or reused once this function 443 * returns. 444 * 445 * This call sleeps to guarantee that no CPU is looking at the packet 446 * type after return. 447 */ 448 void dev_remove_pack(struct packet_type *pt) 449 { 450 __dev_remove_pack(pt); 451 452 synchronize_net(); 453 } 454 EXPORT_SYMBOL(dev_remove_pack); 455 456 457 /** 458 * dev_add_offload - register offload handlers 459 * @po: protocol offload declaration 460 * 461 * Add protocol offload handlers to the networking stack. 
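 *
 * (As with dev_add_pack() above, registration is a matter of filling in
 * a struct and linking it in.  An illustrative protocol tap, where
 * my_rcv()/my_ptype are made-up names, would look roughly like:
 *
 *	static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *			  struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		// ...inspect skb...
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_ptype __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_ALL),
 *		.func = my_rcv,
 *	};
 *
 *	dev_add_pack(&my_ptype);
 *	...
 *	dev_remove_pack(&my_ptype);
 *
 * The same fill-a-struct-and-link-it pattern applies to the offload
 * registration below.)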
The passed 462 * &proto_offload is linked into kernel lists and may not be freed until 463 * it has been removed from the kernel lists. 464 * 465 * This call does not sleep therefore it can not 466 * guarantee all CPU's that are in middle of receiving packets 467 * will see the new offload handlers (until the next received packet). 468 */ 469 void dev_add_offload(struct packet_offload *po) 470 { 471 struct list_head *head = &offload_base; 472 473 spin_lock(&offload_lock); 474 list_add_rcu(&po->list, head); 475 spin_unlock(&offload_lock); 476 } 477 EXPORT_SYMBOL(dev_add_offload); 478 479 /** 480 * __dev_remove_offload - remove offload handler 481 * @po: packet offload declaration 482 * 483 * Remove a protocol offload handler that was previously added to the 484 * kernel offload handlers by dev_add_offload(). The passed &offload_type 485 * is removed from the kernel lists and can be freed or reused once this 486 * function returns. 487 * 488 * The packet type might still be in use by receivers 489 * and must not be freed until after all the CPU's have gone 490 * through a quiescent state. 491 */ 492 static void __dev_remove_offload(struct packet_offload *po) 493 { 494 struct list_head *head = &offload_base; 495 struct packet_offload *po1; 496 497 spin_lock(&offload_lock); 498 499 list_for_each_entry(po1, head, list) { 500 if (po == po1) { 501 list_del_rcu(&po->list); 502 goto out; 503 } 504 } 505 506 pr_warn("dev_remove_offload: %p not found\n", po); 507 out: 508 spin_unlock(&offload_lock); 509 } 510 511 /** 512 * dev_remove_offload - remove packet offload handler 513 * @po: packet offload declaration 514 * 515 * Remove a packet offload handler that was previously added to the kernel 516 * offload handlers by dev_add_offload(). The passed &offload_type is 517 * removed from the kernel lists and can be freed or reused once this 518 * function returns. 519 * 520 * This call sleeps to guarantee that no CPU is looking at the packet 521 * type after return. 522 */ 523 void dev_remove_offload(struct packet_offload *po) 524 { 525 __dev_remove_offload(po); 526 527 synchronize_net(); 528 } 529 EXPORT_SYMBOL(dev_remove_offload); 530 531 /****************************************************************************** 532 533 Device Boot-time Settings Routines 534 535 *******************************************************************************/ 536 537 /* Boot time configuration table */ 538 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; 539 540 /** 541 * netdev_boot_setup_add - add new setup entry 542 * @name: name of the device 543 * @map: configured settings for the device 544 * 545 * Adds new setup entry to the dev_boot_setup list. The function 546 * returns 0 on error and 1 on success. This is a generic routine to 547 * all netdevices. 548 */ 549 static int netdev_boot_setup_add(char *name, struct ifmap *map) 550 { 551 struct netdev_boot_setup *s; 552 int i; 553 554 s = dev_boot_setup; 555 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { 556 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { 557 memset(s[i].name, 0, sizeof(s[i].name)); 558 strlcpy(s[i].name, name, IFNAMSIZ); 559 memcpy(&s[i].map, map, sizeof(s[i].map)); 560 break; 561 } 562 } 563 564 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1; 565 } 566 567 /** 568 * netdev_boot_setup_check - check boot time settings 569 * @dev: the netdevice 570 * 571 * Check boot time settings for the device. 572 * The found settings are set for the device to be used 573 * later in the device probing. 
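 *
 * The settings originate from a "netdev=" boot line parsed by
 * netdev_boot_setup() further down, for example (illustrative):
 *
 *	netdev=5,0x300,0,0,eth1
 *
 * which records irq 5 and base address 0x300 under the name "eth1" so
 * that the driver probing that name can pick them up here.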
574 * Returns 0 if no settings found, 1 if they are. 575 */ 576 int netdev_boot_setup_check(struct net_device *dev) 577 { 578 struct netdev_boot_setup *s = dev_boot_setup; 579 int i; 580 581 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { 582 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && 583 !strcmp(dev->name, s[i].name)) { 584 dev->irq = s[i].map.irq; 585 dev->base_addr = s[i].map.base_addr; 586 dev->mem_start = s[i].map.mem_start; 587 dev->mem_end = s[i].map.mem_end; 588 return 1; 589 } 590 } 591 return 0; 592 } 593 EXPORT_SYMBOL(netdev_boot_setup_check); 594 595 596 /** 597 * netdev_boot_base - get address from boot time settings 598 * @prefix: prefix for network device 599 * @unit: id for network device 600 * 601 * Check boot time settings for the base address of device. 602 * The found settings are set for the device to be used 603 * later in the device probing. 604 * Returns 0 if no settings found. 605 */ 606 unsigned long netdev_boot_base(const char *prefix, int unit) 607 { 608 const struct netdev_boot_setup *s = dev_boot_setup; 609 char name[IFNAMSIZ]; 610 int i; 611 612 sprintf(name, "%s%d", prefix, unit); 613 614 /* 615 * If device already registered then return base of 1 616 * to indicate not to probe for this interface 617 */ 618 if (__dev_get_by_name(&init_net, name)) 619 return 1; 620 621 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) 622 if (!strcmp(name, s[i].name)) 623 return s[i].map.base_addr; 624 return 0; 625 } 626 627 /* 628 * Saves at boot time configured settings for any netdevice. 629 */ 630 int __init netdev_boot_setup(char *str) 631 { 632 int ints[5]; 633 struct ifmap map; 634 635 str = get_options(str, ARRAY_SIZE(ints), ints); 636 if (!str || !*str) 637 return 0; 638 639 /* Save settings */ 640 memset(&map, 0, sizeof(map)); 641 if (ints[0] > 0) 642 map.irq = ints[1]; 643 if (ints[0] > 1) 644 map.base_addr = ints[2]; 645 if (ints[0] > 2) 646 map.mem_start = ints[3]; 647 if (ints[0] > 3) 648 map.mem_end = ints[4]; 649 650 /* Add new entry to the list */ 651 return netdev_boot_setup_add(str, &map); 652 } 653 654 __setup("netdev=", netdev_boot_setup); 655 656 /******************************************************************************* 657 658 Device Interface Subroutines 659 660 *******************************************************************************/ 661 662 /** 663 * __dev_get_by_name - find a device by its name 664 * @net: the applicable net namespace 665 * @name: name to find 666 * 667 * Find an interface by name. Must be called under RTNL semaphore 668 * or @dev_base_lock. If the name is found a pointer to the device 669 * is returned. If the name is not found then %NULL is returned. The 670 * reference counters are not incremented so the caller must be 671 * careful with locks. 672 */ 673 674 struct net_device *__dev_get_by_name(struct net *net, const char *name) 675 { 676 struct net_device *dev; 677 struct hlist_head *head = dev_name_hash(net, name); 678 679 hlist_for_each_entry(dev, head, name_hlist) 680 if (!strncmp(dev->name, name, IFNAMSIZ)) 681 return dev; 682 683 return NULL; 684 } 685 EXPORT_SYMBOL(__dev_get_by_name); 686 687 /** 688 * dev_get_by_name_rcu - find a device by its name 689 * @net: the applicable net namespace 690 * @name: name to find 691 * 692 * Find an interface by name. 693 * If the name is found a pointer to the device is returned. 694 * If the name is not found then %NULL is returned. 695 * The reference counters are not incremented so the caller must be 696 * careful with locks. The caller must hold RCU lock. 
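 *
 * Typical use (illustrative):
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(net, "eth0");
 *	if (dev)
 *		; // use dev here; no reference is held
 *	rcu_read_unlock();
 *
 * If the device is needed after rcu_read_unlock(), take a reference
 * with dev_hold() inside the RCU section, or use dev_get_by_name().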
697 */ 698 699 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) 700 { 701 struct net_device *dev; 702 struct hlist_head *head = dev_name_hash(net, name); 703 704 hlist_for_each_entry_rcu(dev, head, name_hlist) 705 if (!strncmp(dev->name, name, IFNAMSIZ)) 706 return dev; 707 708 return NULL; 709 } 710 EXPORT_SYMBOL(dev_get_by_name_rcu); 711 712 /** 713 * dev_get_by_name - find a device by its name 714 * @net: the applicable net namespace 715 * @name: name to find 716 * 717 * Find an interface by name. This can be called from any 718 * context and does its own locking. The returned handle has 719 * the usage count incremented and the caller must use dev_put() to 720 * release it when it is no longer needed. %NULL is returned if no 721 * matching device is found. 722 */ 723 724 struct net_device *dev_get_by_name(struct net *net, const char *name) 725 { 726 struct net_device *dev; 727 728 rcu_read_lock(); 729 dev = dev_get_by_name_rcu(net, name); 730 if (dev) 731 dev_hold(dev); 732 rcu_read_unlock(); 733 return dev; 734 } 735 EXPORT_SYMBOL(dev_get_by_name); 736 737 /** 738 * __dev_get_by_index - find a device by its ifindex 739 * @net: the applicable net namespace 740 * @ifindex: index of device 741 * 742 * Search for an interface by index. Returns %NULL if the device 743 * is not found or a pointer to the device. The device has not 744 * had its reference counter increased so the caller must be careful 745 * about locking. The caller must hold either the RTNL semaphore 746 * or @dev_base_lock. 747 */ 748 749 struct net_device *__dev_get_by_index(struct net *net, int ifindex) 750 { 751 struct net_device *dev; 752 struct hlist_head *head = dev_index_hash(net, ifindex); 753 754 hlist_for_each_entry(dev, head, index_hlist) 755 if (dev->ifindex == ifindex) 756 return dev; 757 758 return NULL; 759 } 760 EXPORT_SYMBOL(__dev_get_by_index); 761 762 /** 763 * dev_get_by_index_rcu - find a device by its ifindex 764 * @net: the applicable net namespace 765 * @ifindex: index of device 766 * 767 * Search for an interface by index. Returns %NULL if the device 768 * is not found or a pointer to the device. The device has not 769 * had its reference counter increased so the caller must be careful 770 * about locking. The caller must hold RCU lock. 771 */ 772 773 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) 774 { 775 struct net_device *dev; 776 struct hlist_head *head = dev_index_hash(net, ifindex); 777 778 hlist_for_each_entry_rcu(dev, head, index_hlist) 779 if (dev->ifindex == ifindex) 780 return dev; 781 782 return NULL; 783 } 784 EXPORT_SYMBOL(dev_get_by_index_rcu); 785 786 787 /** 788 * dev_get_by_index - find a device by its ifindex 789 * @net: the applicable net namespace 790 * @ifindex: index of device 791 * 792 * Search for an interface by index. Returns NULL if the device 793 * is not found or a pointer to the device. The device returned has 794 * had a reference added and the pointer is safe until the user calls 795 * dev_put to indicate they have finished with it. 796 */ 797 798 struct net_device *dev_get_by_index(struct net *net, int ifindex) 799 { 800 struct net_device *dev; 801 802 rcu_read_lock(); 803 dev = dev_get_by_index_rcu(net, ifindex); 804 if (dev) 805 dev_hold(dev); 806 rcu_read_unlock(); 807 return dev; 808 } 809 EXPORT_SYMBOL(dev_get_by_index); 810 811 /** 812 * netdev_get_name - get a netdevice name, knowing its ifindex. 813 * @net: network namespace 814 * @name: a pointer to the buffer where the name will be stored. 
815 * @ifindex: the ifindex of the interface to get the name from. 816 * 817 * The use of raw_seqcount_begin() and cond_resched() before 818 * retrying is required as we want to give the writers a chance 819 * to complete when CONFIG_PREEMPT is not set. 820 */ 821 int netdev_get_name(struct net *net, char *name, int ifindex) 822 { 823 struct net_device *dev; 824 unsigned int seq; 825 826 retry: 827 seq = raw_seqcount_begin(&devnet_rename_seq); 828 rcu_read_lock(); 829 dev = dev_get_by_index_rcu(net, ifindex); 830 if (!dev) { 831 rcu_read_unlock(); 832 return -ENODEV; 833 } 834 835 strcpy(name, dev->name); 836 rcu_read_unlock(); 837 if (read_seqcount_retry(&devnet_rename_seq, seq)) { 838 cond_resched(); 839 goto retry; 840 } 841 842 return 0; 843 } 844 845 /** 846 * dev_getbyhwaddr_rcu - find a device by its hardware address 847 * @net: the applicable net namespace 848 * @type: media type of device 849 * @ha: hardware address 850 * 851 * Search for an interface by MAC address. Returns NULL if the device 852 * is not found or a pointer to the device. 853 * The caller must hold RCU or RTNL. 854 * The returned device has not had its ref count increased 855 * and the caller must therefore be careful about locking 856 * 857 */ 858 859 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, 860 const char *ha) 861 { 862 struct net_device *dev; 863 864 for_each_netdev_rcu(net, dev) 865 if (dev->type == type && 866 !memcmp(dev->dev_addr, ha, dev->addr_len)) 867 return dev; 868 869 return NULL; 870 } 871 EXPORT_SYMBOL(dev_getbyhwaddr_rcu); 872 873 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) 874 { 875 struct net_device *dev; 876 877 ASSERT_RTNL(); 878 for_each_netdev(net, dev) 879 if (dev->type == type) 880 return dev; 881 882 return NULL; 883 } 884 EXPORT_SYMBOL(__dev_getfirstbyhwtype); 885 886 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) 887 { 888 struct net_device *dev, *ret = NULL; 889 890 rcu_read_lock(); 891 for_each_netdev_rcu(net, dev) 892 if (dev->type == type) { 893 dev_hold(dev); 894 ret = dev; 895 break; 896 } 897 rcu_read_unlock(); 898 return ret; 899 } 900 EXPORT_SYMBOL(dev_getfirstbyhwtype); 901 902 /** 903 * __dev_get_by_flags - find any device with given flags 904 * @net: the applicable net namespace 905 * @if_flags: IFF_* values 906 * @mask: bitmask of bits in if_flags to check 907 * 908 * Search for any interface with the given flags. Returns NULL if a device 909 * is not found or a pointer to the device. Must be called inside 910 * rtnl_lock(), and result refcount is unchanged. 911 */ 912 913 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, 914 unsigned short mask) 915 { 916 struct net_device *dev, *ret; 917 918 ASSERT_RTNL(); 919 920 ret = NULL; 921 for_each_netdev(net, dev) { 922 if (((dev->flags ^ if_flags) & mask) == 0) { 923 ret = dev; 924 break; 925 } 926 } 927 return ret; 928 } 929 EXPORT_SYMBOL(__dev_get_by_flags); 930 931 /** 932 * dev_valid_name - check if name is okay for network device 933 * @name: name string 934 * 935 * Network device names need to be valid file names to 936 * to allow sysfs to work. We also disallow any kind of 937 * whitespace. 
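 *
 * For example (illustrative):
 *
 *	dev_valid_name("eth0")   -> true
 *	dev_valid_name("eth/0")  -> false   ('/' is rejected)
 *	dev_valid_name("..")     -> false
 *	dev_valid_name("a b")    -> false   (whitespace)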
938 */ 939 bool dev_valid_name(const char *name) 940 { 941 if (*name == '\0') 942 return false; 943 if (strlen(name) >= IFNAMSIZ) 944 return false; 945 if (!strcmp(name, ".") || !strcmp(name, "..")) 946 return false; 947 948 while (*name) { 949 if (*name == '/' || *name == ':' || isspace(*name)) 950 return false; 951 name++; 952 } 953 return true; 954 } 955 EXPORT_SYMBOL(dev_valid_name); 956 957 /** 958 * __dev_alloc_name - allocate a name for a device 959 * @net: network namespace to allocate the device name in 960 * @name: name format string 961 * @buf: scratch buffer and result name string 962 * 963 * Passed a format string - eg "lt%d" it will try and find a suitable 964 * id. It scans list of devices to build up a free map, then chooses 965 * the first empty slot. The caller must hold the dev_base or rtnl lock 966 * while allocating the name and adding the device in order to avoid 967 * duplicates. 968 * Limited to bits_per_byte * page size devices (ie 32K on most platforms). 969 * Returns the number of the unit assigned or a negative errno code. 970 */ 971 972 static int __dev_alloc_name(struct net *net, const char *name, char *buf) 973 { 974 int i = 0; 975 const char *p; 976 const int max_netdevices = 8*PAGE_SIZE; 977 unsigned long *inuse; 978 struct net_device *d; 979 980 p = strnchr(name, IFNAMSIZ-1, '%'); 981 if (p) { 982 /* 983 * Verify the string as this thing may have come from 984 * the user. There must be either one "%d" and no other "%" 985 * characters. 986 */ 987 if (p[1] != 'd' || strchr(p + 2, '%')) 988 return -EINVAL; 989 990 /* Use one page as a bit array of possible slots */ 991 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); 992 if (!inuse) 993 return -ENOMEM; 994 995 for_each_netdev(net, d) { 996 if (!sscanf(d->name, name, &i)) 997 continue; 998 if (i < 0 || i >= max_netdevices) 999 continue; 1000 1001 /* avoid cases where sscanf is not exact inverse of printf */ 1002 snprintf(buf, IFNAMSIZ, name, i); 1003 if (!strncmp(buf, d->name, IFNAMSIZ)) 1004 set_bit(i, inuse); 1005 } 1006 1007 i = find_first_zero_bit(inuse, max_netdevices); 1008 free_page((unsigned long) inuse); 1009 } 1010 1011 if (buf != name) 1012 snprintf(buf, IFNAMSIZ, name, i); 1013 if (!__dev_get_by_name(net, buf)) 1014 return i; 1015 1016 /* It is possible to run out of possible slots 1017 * when the name is long and there isn't enough space left 1018 * for the digits, or if all bits are used. 1019 */ 1020 return -ENFILE; 1021 } 1022 1023 /** 1024 * dev_alloc_name - allocate a name for a device 1025 * @dev: device 1026 * @name: name format string 1027 * 1028 * Passed a format string - eg "lt%d" it will try and find a suitable 1029 * id. It scans list of devices to build up a free map, then chooses 1030 * the first empty slot. The caller must hold the dev_base or rtnl lock 1031 * while allocating the name and adding the device in order to avoid 1032 * duplicates. 1033 * Limited to bits_per_byte * page size devices (ie 32K on most platforms). 1034 * Returns the number of the unit assigned or a negative errno code. 
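 *
 * For instance, with "eth0" and "eth2" already registered (illustrative):
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *
 * stores "eth1" in dev->name and returns 1.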
1035 */ 1036 1037 int dev_alloc_name(struct net_device *dev, const char *name) 1038 { 1039 char buf[IFNAMSIZ]; 1040 struct net *net; 1041 int ret; 1042 1043 BUG_ON(!dev_net(dev)); 1044 net = dev_net(dev); 1045 ret = __dev_alloc_name(net, name, buf); 1046 if (ret >= 0) 1047 strlcpy(dev->name, buf, IFNAMSIZ); 1048 return ret; 1049 } 1050 EXPORT_SYMBOL(dev_alloc_name); 1051 1052 static int dev_alloc_name_ns(struct net *net, 1053 struct net_device *dev, 1054 const char *name) 1055 { 1056 char buf[IFNAMSIZ]; 1057 int ret; 1058 1059 ret = __dev_alloc_name(net, name, buf); 1060 if (ret >= 0) 1061 strlcpy(dev->name, buf, IFNAMSIZ); 1062 return ret; 1063 } 1064 1065 static int dev_get_valid_name(struct net *net, 1066 struct net_device *dev, 1067 const char *name) 1068 { 1069 BUG_ON(!net); 1070 1071 if (!dev_valid_name(name)) 1072 return -EINVAL; 1073 1074 if (strchr(name, '%')) 1075 return dev_alloc_name_ns(net, dev, name); 1076 else if (__dev_get_by_name(net, name)) 1077 return -EEXIST; 1078 else if (dev->name != name) 1079 strlcpy(dev->name, name, IFNAMSIZ); 1080 1081 return 0; 1082 } 1083 1084 /** 1085 * dev_change_name - change name of a device 1086 * @dev: device 1087 * @newname: name (or format string) must be at least IFNAMSIZ 1088 * 1089 * Change name of a device, can pass format strings "eth%d". 1090 * for wildcarding. 1091 */ 1092 int dev_change_name(struct net_device *dev, const char *newname) 1093 { 1094 unsigned char old_assign_type; 1095 char oldname[IFNAMSIZ]; 1096 int err = 0; 1097 int ret; 1098 struct net *net; 1099 1100 ASSERT_RTNL(); 1101 BUG_ON(!dev_net(dev)); 1102 1103 net = dev_net(dev); 1104 if (dev->flags & IFF_UP) 1105 return -EBUSY; 1106 1107 write_seqcount_begin(&devnet_rename_seq); 1108 1109 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { 1110 write_seqcount_end(&devnet_rename_seq); 1111 return 0; 1112 } 1113 1114 memcpy(oldname, dev->name, IFNAMSIZ); 1115 1116 err = dev_get_valid_name(net, dev, newname); 1117 if (err < 0) { 1118 write_seqcount_end(&devnet_rename_seq); 1119 return err; 1120 } 1121 1122 if (oldname[0] && !strchr(oldname, '%')) 1123 netdev_info(dev, "renamed from %s\n", oldname); 1124 1125 old_assign_type = dev->name_assign_type; 1126 dev->name_assign_type = NET_NAME_RENAMED; 1127 1128 rollback: 1129 ret = device_rename(&dev->dev, dev->name); 1130 if (ret) { 1131 memcpy(dev->name, oldname, IFNAMSIZ); 1132 dev->name_assign_type = old_assign_type; 1133 write_seqcount_end(&devnet_rename_seq); 1134 return ret; 1135 } 1136 1137 write_seqcount_end(&devnet_rename_seq); 1138 1139 netdev_adjacent_rename_links(dev, oldname); 1140 1141 write_lock_bh(&dev_base_lock); 1142 hlist_del_rcu(&dev->name_hlist); 1143 write_unlock_bh(&dev_base_lock); 1144 1145 synchronize_rcu(); 1146 1147 write_lock_bh(&dev_base_lock); 1148 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); 1149 write_unlock_bh(&dev_base_lock); 1150 1151 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); 1152 ret = notifier_to_errno(ret); 1153 1154 if (ret) { 1155 /* err >= 0 after dev_alloc_name() or stores the first errno */ 1156 if (err >= 0) { 1157 err = ret; 1158 write_seqcount_begin(&devnet_rename_seq); 1159 memcpy(dev->name, oldname, IFNAMSIZ); 1160 memcpy(oldname, newname, IFNAMSIZ); 1161 dev->name_assign_type = old_assign_type; 1162 old_assign_type = NET_NAME_RENAMED; 1163 goto rollback; 1164 } else { 1165 pr_err("%s: name change rollback failed: %d\n", 1166 dev->name, ret); 1167 } 1168 } 1169 1170 return err; 1171 } 1172 1173 /** 1174 * dev_set_alias - change ifalias of a 
device 1175 * @dev: device 1176 * @alias: name up to IFALIASZ 1177 * @len: limit of bytes to copy from info 1178 * 1179 * Set ifalias for a device, 1180 */ 1181 int dev_set_alias(struct net_device *dev, const char *alias, size_t len) 1182 { 1183 char *new_ifalias; 1184 1185 ASSERT_RTNL(); 1186 1187 if (len >= IFALIASZ) 1188 return -EINVAL; 1189 1190 if (!len) { 1191 kfree(dev->ifalias); 1192 dev->ifalias = NULL; 1193 return 0; 1194 } 1195 1196 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); 1197 if (!new_ifalias) 1198 return -ENOMEM; 1199 dev->ifalias = new_ifalias; 1200 1201 strlcpy(dev->ifalias, alias, len+1); 1202 return len; 1203 } 1204 1205 1206 /** 1207 * netdev_features_change - device changes features 1208 * @dev: device to cause notification 1209 * 1210 * Called to indicate a device has changed features. 1211 */ 1212 void netdev_features_change(struct net_device *dev) 1213 { 1214 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); 1215 } 1216 EXPORT_SYMBOL(netdev_features_change); 1217 1218 /** 1219 * netdev_state_change - device changes state 1220 * @dev: device to cause notification 1221 * 1222 * Called to indicate a device has changed state. This function calls 1223 * the notifier chains for netdev_chain and sends a NEWLINK message 1224 * to the routing socket. 1225 */ 1226 void netdev_state_change(struct net_device *dev) 1227 { 1228 if (dev->flags & IFF_UP) { 1229 struct netdev_notifier_change_info change_info; 1230 1231 change_info.flags_changed = 0; 1232 call_netdevice_notifiers_info(NETDEV_CHANGE, dev, 1233 &change_info.info); 1234 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); 1235 } 1236 } 1237 EXPORT_SYMBOL(netdev_state_change); 1238 1239 /** 1240 * netdev_notify_peers - notify network peers about existence of @dev 1241 * @dev: network device 1242 * 1243 * Generate traffic such that interested network peers are aware of 1244 * @dev, such as by generating a gratuitous ARP. This may be used when 1245 * a device wants to inform the rest of the network about some sort of 1246 * reconfiguration such as a failover event or virtual machine 1247 * migration. 1248 */ 1249 void netdev_notify_peers(struct net_device *dev) 1250 { 1251 rtnl_lock(); 1252 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); 1253 rtnl_unlock(); 1254 } 1255 EXPORT_SYMBOL(netdev_notify_peers); 1256 1257 static int __dev_open(struct net_device *dev) 1258 { 1259 const struct net_device_ops *ops = dev->netdev_ops; 1260 int ret; 1261 1262 ASSERT_RTNL(); 1263 1264 if (!netif_device_present(dev)) 1265 return -ENODEV; 1266 1267 /* Block netpoll from trying to do any rx path servicing. 1268 * If we don't do this there is a chance ndo_poll_controller 1269 * or ndo_poll may be running while we open the device 1270 */ 1271 netpoll_poll_disable(dev); 1272 1273 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); 1274 ret = notifier_to_errno(ret); 1275 if (ret) 1276 return ret; 1277 1278 set_bit(__LINK_STATE_START, &dev->state); 1279 1280 if (ops->ndo_validate_addr) 1281 ret = ops->ndo_validate_addr(dev); 1282 1283 if (!ret && ops->ndo_open) 1284 ret = ops->ndo_open(dev); 1285 1286 netpoll_poll_enable(dev); 1287 1288 if (ret) 1289 clear_bit(__LINK_STATE_START, &dev->state); 1290 else { 1291 dev->flags |= IFF_UP; 1292 dev_set_rx_mode(dev); 1293 dev_activate(dev); 1294 add_device_randomness(dev->dev_addr, dev->addr_len); 1295 } 1296 1297 return ret; 1298 } 1299 1300 /** 1301 * dev_open - prepare an interface for use. 1302 * @dev: device to open 1303 * 1304 * Takes a device from down to up state. 
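 * Callers outside the core do this under RTNL, e.g. (illustrative):
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 *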
The device's private open 1305 * function is invoked and then the multicast lists are loaded. Finally 1306 * the device is moved into the up state and a %NETDEV_UP message is 1307 * sent to the netdev notifier chain. 1308 * 1309 * Calling this function on an active interface is a nop. On a failure 1310 * a negative errno code is returned. 1311 */ 1312 int dev_open(struct net_device *dev) 1313 { 1314 int ret; 1315 1316 if (dev->flags & IFF_UP) 1317 return 0; 1318 1319 ret = __dev_open(dev); 1320 if (ret < 0) 1321 return ret; 1322 1323 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); 1324 call_netdevice_notifiers(NETDEV_UP, dev); 1325 1326 return ret; 1327 } 1328 EXPORT_SYMBOL(dev_open); 1329 1330 static int __dev_close_many(struct list_head *head) 1331 { 1332 struct net_device *dev; 1333 1334 ASSERT_RTNL(); 1335 might_sleep(); 1336 1337 list_for_each_entry(dev, head, close_list) { 1338 /* Temporarily disable netpoll until the interface is down */ 1339 netpoll_poll_disable(dev); 1340 1341 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); 1342 1343 clear_bit(__LINK_STATE_START, &dev->state); 1344 1345 /* Synchronize to scheduled poll. We cannot touch poll list, it 1346 * can be even on different cpu. So just clear netif_running(). 1347 * 1348 * dev->stop() will invoke napi_disable() on all of it's 1349 * napi_struct instances on this device. 1350 */ 1351 smp_mb__after_atomic(); /* Commit netif_running(). */ 1352 } 1353 1354 dev_deactivate_many(head); 1355 1356 list_for_each_entry(dev, head, close_list) { 1357 const struct net_device_ops *ops = dev->netdev_ops; 1358 1359 /* 1360 * Call the device specific close. This cannot fail. 1361 * Only if device is UP 1362 * 1363 * We allow it to be called even after a DETACH hot-plug 1364 * event. 1365 */ 1366 if (ops->ndo_stop) 1367 ops->ndo_stop(dev); 1368 1369 dev->flags &= ~IFF_UP; 1370 netpoll_poll_enable(dev); 1371 } 1372 1373 return 0; 1374 } 1375 1376 static int __dev_close(struct net_device *dev) 1377 { 1378 int retval; 1379 LIST_HEAD(single); 1380 1381 list_add(&dev->close_list, &single); 1382 retval = __dev_close_many(&single); 1383 list_del(&single); 1384 1385 return retval; 1386 } 1387 1388 static int dev_close_many(struct list_head *head) 1389 { 1390 struct net_device *dev, *tmp; 1391 1392 /* Remove the devices that don't need to be closed */ 1393 list_for_each_entry_safe(dev, tmp, head, close_list) 1394 if (!(dev->flags & IFF_UP)) 1395 list_del_init(&dev->close_list); 1396 1397 __dev_close_many(head); 1398 1399 list_for_each_entry_safe(dev, tmp, head, close_list) { 1400 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); 1401 call_netdevice_notifiers(NETDEV_DOWN, dev); 1402 list_del_init(&dev->close_list); 1403 } 1404 1405 return 0; 1406 } 1407 1408 /** 1409 * dev_close - shutdown an interface. 1410 * @dev: device to shutdown 1411 * 1412 * This function moves an active device into down state. A 1413 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device 1414 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier 1415 * chain. 1416 */ 1417 int dev_close(struct net_device *dev) 1418 { 1419 if (dev->flags & IFF_UP) { 1420 LIST_HEAD(single); 1421 1422 list_add(&dev->close_list, &single); 1423 dev_close_many(&single); 1424 list_del(&single); 1425 } 1426 return 0; 1427 } 1428 EXPORT_SYMBOL(dev_close); 1429 1430 1431 /** 1432 * dev_disable_lro - disable Large Receive Offload on a device 1433 * @dev: device 1434 * 1435 * Disable Large Receive Offload (LRO) on a net device. 
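 * It is typically invoked when forwarding is enabled on @dev, roughly
 * (illustrative):
 *
 *	rtnl_lock();
 *	dev_disable_lro(dev);
 *	rtnl_unlock();
 *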
Must be
 * called under RTNL. This is needed if received packets may be
 * forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	struct net_device *lower_dev;
	struct list_head *iter;

	dev->wanted_features &= ~NETIF_F_LRO;
	netdev_update_features(dev);

	if (unlikely(dev->features & NETIF_F_LRO))
		netdev_WARN(dev, "failed to disable LRO!\n");

	netdev_for_each_lower_dev(dev, lower_dev, iter)
		dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);

static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
				   struct net_device *dev)
{
	struct netdev_notifier_info info;

	netdev_notifier_info_init(&info, dev);
	return nb->notifier_call(nb, val, &info);
}

static int dev_boot_phase = 1;

/**
 * register_netdevice_notifier - register a network notifier block
 * @nb: notifier
 *
 * Register a notifier to be called when network device events occur.
 * The notifier passed is linked into the kernel structures and must
 * not be reused until it has been unregistered. A negative errno code
 * is returned on a failure.
 *
 * When registered, all registration and up events are replayed to the
 * new notifier so that it gets a race-free view of the network device
 * list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);

/**
 * unregister_netdevice_notifier - unregister a network notifier block
 * @nb: notifier
 *
 * Unregister a notifier previously registered by
 * register_netdevice_notifier(). The notifier is unlinked from the
 * kernel structures and may then be reused. A negative errno code
 * is returned on a failure.
 *
 * After unregistering, unregister and down device events are synthesized
 * for all devices on the device list to the removed notifier to remove
 * the need for special case cleanup code.
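 *
 * A notifier pair then looks roughly like this (illustrative sketch;
 * my_netdev_event()/my_nb are made-up names):
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *			// ...react to dev coming up...
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_nb);
 *	...
 *	unregister_netdevice_notifier(&my_nb);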
1545 */ 1546 1547 int unregister_netdevice_notifier(struct notifier_block *nb) 1548 { 1549 struct net_device *dev; 1550 struct net *net; 1551 int err; 1552 1553 rtnl_lock(); 1554 err = raw_notifier_chain_unregister(&netdev_chain, nb); 1555 if (err) 1556 goto unlock; 1557 1558 for_each_net(net) { 1559 for_each_netdev(net, dev) { 1560 if (dev->flags & IFF_UP) { 1561 call_netdevice_notifier(nb, NETDEV_GOING_DOWN, 1562 dev); 1563 call_netdevice_notifier(nb, NETDEV_DOWN, dev); 1564 } 1565 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); 1566 } 1567 } 1568 unlock: 1569 rtnl_unlock(); 1570 return err; 1571 } 1572 EXPORT_SYMBOL(unregister_netdevice_notifier); 1573 1574 /** 1575 * call_netdevice_notifiers_info - call all network notifier blocks 1576 * @val: value passed unmodified to notifier function 1577 * @dev: net_device pointer passed unmodified to notifier function 1578 * @info: notifier information data 1579 * 1580 * Call all network notifier blocks. Parameters and return value 1581 * are as for raw_notifier_call_chain(). 1582 */ 1583 1584 static int call_netdevice_notifiers_info(unsigned long val, 1585 struct net_device *dev, 1586 struct netdev_notifier_info *info) 1587 { 1588 ASSERT_RTNL(); 1589 netdev_notifier_info_init(info, dev); 1590 return raw_notifier_call_chain(&netdev_chain, val, info); 1591 } 1592 1593 /** 1594 * call_netdevice_notifiers - call all network notifier blocks 1595 * @val: value passed unmodified to notifier function 1596 * @dev: net_device pointer passed unmodified to notifier function 1597 * 1598 * Call all network notifier blocks. Parameters and return value 1599 * are as for raw_notifier_call_chain(). 1600 */ 1601 1602 int call_netdevice_notifiers(unsigned long val, struct net_device *dev) 1603 { 1604 struct netdev_notifier_info info; 1605 1606 return call_netdevice_notifiers_info(val, dev, &info); 1607 } 1608 EXPORT_SYMBOL(call_netdevice_notifiers); 1609 1610 static struct static_key netstamp_needed __read_mostly; 1611 #ifdef HAVE_JUMP_LABEL 1612 /* We are not allowed to call static_key_slow_dec() from irq context 1613 * If net_disable_timestamp() is called from irq context, defer the 1614 * static_key_slow_dec() calls. 
1615 */ 1616 static atomic_t netstamp_needed_deferred; 1617 #endif 1618 1619 void net_enable_timestamp(void) 1620 { 1621 #ifdef HAVE_JUMP_LABEL 1622 int deferred = atomic_xchg(&netstamp_needed_deferred, 0); 1623 1624 if (deferred) { 1625 while (--deferred) 1626 static_key_slow_dec(&netstamp_needed); 1627 return; 1628 } 1629 #endif 1630 static_key_slow_inc(&netstamp_needed); 1631 } 1632 EXPORT_SYMBOL(net_enable_timestamp); 1633 1634 void net_disable_timestamp(void) 1635 { 1636 #ifdef HAVE_JUMP_LABEL 1637 if (in_interrupt()) { 1638 atomic_inc(&netstamp_needed_deferred); 1639 return; 1640 } 1641 #endif 1642 static_key_slow_dec(&netstamp_needed); 1643 } 1644 EXPORT_SYMBOL(net_disable_timestamp); 1645 1646 static inline void net_timestamp_set(struct sk_buff *skb) 1647 { 1648 skb->tstamp.tv64 = 0; 1649 if (static_key_false(&netstamp_needed)) 1650 __net_timestamp(skb); 1651 } 1652 1653 #define net_timestamp_check(COND, SKB) \ 1654 if (static_key_false(&netstamp_needed)) { \ 1655 if ((COND) && !(SKB)->tstamp.tv64) \ 1656 __net_timestamp(SKB); \ 1657 } \ 1658 1659 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb) 1660 { 1661 unsigned int len; 1662 1663 if (!(dev->flags & IFF_UP)) 1664 return false; 1665 1666 len = dev->mtu + dev->hard_header_len + VLAN_HLEN; 1667 if (skb->len <= len) 1668 return true; 1669 1670 /* if TSO is enabled, we don't care about the length as the packet 1671 * could be forwarded without being segmented before 1672 */ 1673 if (skb_is_gso(skb)) 1674 return true; 1675 1676 return false; 1677 } 1678 EXPORT_SYMBOL_GPL(is_skb_forwardable); 1679 1680 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) 1681 { 1682 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { 1683 if (skb_copy_ubufs(skb, GFP_ATOMIC)) { 1684 atomic_long_inc(&dev->rx_dropped); 1685 kfree_skb(skb); 1686 return NET_RX_DROP; 1687 } 1688 } 1689 1690 if (unlikely(!is_skb_forwardable(dev, skb))) { 1691 atomic_long_inc(&dev->rx_dropped); 1692 kfree_skb(skb); 1693 return NET_RX_DROP; 1694 } 1695 1696 skb_scrub_packet(skb, true); 1697 skb->protocol = eth_type_trans(skb, dev); 1698 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 1699 1700 return 0; 1701 } 1702 EXPORT_SYMBOL_GPL(__dev_forward_skb); 1703 1704 /** 1705 * dev_forward_skb - loopback an skb to another netif 1706 * 1707 * @dev: destination network device 1708 * @skb: buffer to forward 1709 * 1710 * return values: 1711 * NET_RX_SUCCESS (no congestion) 1712 * NET_RX_DROP (packet was dropped, but freed) 1713 * 1714 * dev_forward_skb can be used for injecting an skb from the 1715 * start_xmit function of one device into the receive queue 1716 * of another device. 1717 * 1718 * The receiving device may be in another namespace, so 1719 * we have to clear all information in the skb that could 1720 * impact namespace isolation. 
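 *
 * A veth-like driver would call it from its ndo_start_xmit(), roughly
 * (illustrative sketch; my_xmit() and my_get_peer() are made up):
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = my_get_peer(dev);
 *
 *		if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS) {
 *			// ...account the packet as transmitted...
 *		}
 *		return NETDEV_TX_OK;
 *	}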
1721 */ 1722 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) 1723 { 1724 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); 1725 } 1726 EXPORT_SYMBOL_GPL(dev_forward_skb); 1727 1728 static inline int deliver_skb(struct sk_buff *skb, 1729 struct packet_type *pt_prev, 1730 struct net_device *orig_dev) 1731 { 1732 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) 1733 return -ENOMEM; 1734 atomic_inc(&skb->users); 1735 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 1736 } 1737 1738 static inline void deliver_ptype_list_skb(struct sk_buff *skb, 1739 struct packet_type **pt, 1740 struct net_device *dev, __be16 type, 1741 struct list_head *ptype_list) 1742 { 1743 struct packet_type *ptype, *pt_prev = *pt; 1744 1745 list_for_each_entry_rcu(ptype, ptype_list, list) { 1746 if (ptype->type != type) 1747 continue; 1748 if (pt_prev) 1749 deliver_skb(skb, pt_prev, dev); 1750 pt_prev = ptype; 1751 } 1752 *pt = pt_prev; 1753 } 1754 1755 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) 1756 { 1757 if (!ptype->af_packet_priv || !skb->sk) 1758 return false; 1759 1760 if (ptype->id_match) 1761 return ptype->id_match(ptype, skb->sk); 1762 else if ((struct sock *)ptype->af_packet_priv == skb->sk) 1763 return true; 1764 1765 return false; 1766 } 1767 1768 /* 1769 * Support routine. Sends outgoing frames to any network 1770 * taps currently in use. 1771 */ 1772 1773 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) 1774 { 1775 struct packet_type *ptype; 1776 struct sk_buff *skb2 = NULL; 1777 struct packet_type *pt_prev = NULL; 1778 struct list_head *ptype_list = &ptype_all; 1779 1780 rcu_read_lock(); 1781 again: 1782 list_for_each_entry_rcu(ptype, ptype_list, list) { 1783 /* Never send packets back to the socket 1784 * they originated from - MvS (miquels@drinkel.ow.org) 1785 */ 1786 if (skb_loop_sk(ptype, skb)) 1787 continue; 1788 1789 if (pt_prev) { 1790 deliver_skb(skb2, pt_prev, skb->dev); 1791 pt_prev = ptype; 1792 continue; 1793 } 1794 1795 /* need to clone skb, done only once */ 1796 skb2 = skb_clone(skb, GFP_ATOMIC); 1797 if (!skb2) 1798 goto out_unlock; 1799 1800 net_timestamp_set(skb2); 1801 1802 /* skb->nh should be correctly 1803 * set by sender, so that the second statement is 1804 * just protection against buggy protocols. 1805 */ 1806 skb_reset_mac_header(skb2); 1807 1808 if (skb_network_header(skb2) < skb2->data || 1809 skb_network_header(skb2) > skb_tail_pointer(skb2)) { 1810 net_crit_ratelimited("protocol %04x is buggy, dev %s\n", 1811 ntohs(skb2->protocol), 1812 dev->name); 1813 skb_reset_network_header(skb2); 1814 } 1815 1816 skb2->transport_header = skb2->network_header; 1817 skb2->pkt_type = PACKET_OUTGOING; 1818 pt_prev = ptype; 1819 } 1820 1821 if (ptype_list == &ptype_all) { 1822 ptype_list = &dev->ptype_all; 1823 goto again; 1824 } 1825 out_unlock: 1826 if (pt_prev) 1827 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); 1828 rcu_read_unlock(); 1829 } 1830 1831 /** 1832 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change 1833 * @dev: Network device 1834 * @txq: number of queues available 1835 * 1836 * If real_num_tx_queues is changed the tc mappings may no longer be 1837 * valid. To resolve this verify the tc mapping remains valid and if 1838 * not NULL the mapping. With no priorities mapping to this 1839 * offset/count pair it will no longer be used. In the worst case TC0 1840 * is invalid nothing can be done so disable priority mappings. 
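 *
 * (For reference, a driver establishes these mappings with something
 * like the following, illustratively:
 *
 *	netdev_set_num_tc(dev, 2);
 *	netdev_set_tc_queue(dev, 0, 4, 0);	// TC0: queues 0-3
 *	netdev_set_tc_queue(dev, 1, 4, 4);	// TC1: queues 4-7
 *	netdev_set_prio_tc_map(dev, prio, tc);	// for each priority
 *
 * and has to keep them consistent with real_num_tx_queues.)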
If is 1841 * expected that drivers will fix this mapping if they can before 1842 * calling netif_set_real_num_tx_queues. 1843 */ 1844 static void netif_setup_tc(struct net_device *dev, unsigned int txq) 1845 { 1846 int i; 1847 struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; 1848 1849 /* If TC0 is invalidated disable TC mapping */ 1850 if (tc->offset + tc->count > txq) { 1851 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); 1852 dev->num_tc = 0; 1853 return; 1854 } 1855 1856 /* Invalidated prio to tc mappings set to TC0 */ 1857 for (i = 1; i < TC_BITMASK + 1; i++) { 1858 int q = netdev_get_prio_tc_map(dev, i); 1859 1860 tc = &dev->tc_to_txq[q]; 1861 if (tc->offset + tc->count > txq) { 1862 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n", 1863 i, q); 1864 netdev_set_prio_tc_map(dev, i, 0); 1865 } 1866 } 1867 } 1868 1869 #ifdef CONFIG_XPS 1870 static DEFINE_MUTEX(xps_map_mutex); 1871 #define xmap_dereference(P) \ 1872 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) 1873 1874 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps, 1875 int cpu, u16 index) 1876 { 1877 struct xps_map *map = NULL; 1878 int pos; 1879 1880 if (dev_maps) 1881 map = xmap_dereference(dev_maps->cpu_map[cpu]); 1882 1883 for (pos = 0; map && pos < map->len; pos++) { 1884 if (map->queues[pos] == index) { 1885 if (map->len > 1) { 1886 map->queues[pos] = map->queues[--map->len]; 1887 } else { 1888 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL); 1889 kfree_rcu(map, rcu); 1890 map = NULL; 1891 } 1892 break; 1893 } 1894 } 1895 1896 return map; 1897 } 1898 1899 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) 1900 { 1901 struct xps_dev_maps *dev_maps; 1902 int cpu, i; 1903 bool active = false; 1904 1905 mutex_lock(&xps_map_mutex); 1906 dev_maps = xmap_dereference(dev->xps_maps); 1907 1908 if (!dev_maps) 1909 goto out_no_maps; 1910 1911 for_each_possible_cpu(cpu) { 1912 for (i = index; i < dev->num_tx_queues; i++) { 1913 if (!remove_xps_queue(dev_maps, cpu, i)) 1914 break; 1915 } 1916 if (i == dev->num_tx_queues) 1917 active = true; 1918 } 1919 1920 if (!active) { 1921 RCU_INIT_POINTER(dev->xps_maps, NULL); 1922 kfree_rcu(dev_maps, rcu); 1923 } 1924 1925 for (i = index; i < dev->num_tx_queues; i++) 1926 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), 1927 NUMA_NO_NODE); 1928 1929 out_no_maps: 1930 mutex_unlock(&xps_map_mutex); 1931 } 1932 1933 static struct xps_map *expand_xps_map(struct xps_map *map, 1934 int cpu, u16 index) 1935 { 1936 struct xps_map *new_map; 1937 int alloc_len = XPS_MIN_MAP_ALLOC; 1938 int i, pos; 1939 1940 for (pos = 0; map && pos < map->len; pos++) { 1941 if (map->queues[pos] != index) 1942 continue; 1943 return map; 1944 } 1945 1946 /* Need to add queue to this CPU's existing map */ 1947 if (map) { 1948 if (pos < map->alloc_len) 1949 return map; 1950 1951 alloc_len = map->alloc_len * 2; 1952 } 1953 1954 /* Need to allocate new map to store queue on this CPU's map */ 1955 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, 1956 cpu_to_node(cpu)); 1957 if (!new_map) 1958 return NULL; 1959 1960 for (i = 0; i < pos; i++) 1961 new_map->queues[i] = map->queues[i]; 1962 new_map->alloc_len = alloc_len; 1963 new_map->len = pos; 1964 1965 return new_map; 1966 } 1967 1968 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, 1969 u16 index) 1970 { 1971 struct xps_dev_maps *dev_maps, 
*new_dev_maps = NULL; 1972 struct xps_map *map, *new_map; 1973 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES); 1974 int cpu, numa_node_id = -2; 1975 bool active = false; 1976 1977 mutex_lock(&xps_map_mutex); 1978 1979 dev_maps = xmap_dereference(dev->xps_maps); 1980 1981 /* allocate memory for queue storage */ 1982 for_each_online_cpu(cpu) { 1983 if (!cpumask_test_cpu(cpu, mask)) 1984 continue; 1985 1986 if (!new_dev_maps) 1987 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); 1988 if (!new_dev_maps) { 1989 mutex_unlock(&xps_map_mutex); 1990 return -ENOMEM; 1991 } 1992 1993 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : 1994 NULL; 1995 1996 map = expand_xps_map(map, cpu, index); 1997 if (!map) 1998 goto error; 1999 2000 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); 2001 } 2002 2003 if (!new_dev_maps) 2004 goto out_no_new_maps; 2005 2006 for_each_possible_cpu(cpu) { 2007 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { 2008 /* add queue to CPU maps */ 2009 int pos = 0; 2010 2011 map = xmap_dereference(new_dev_maps->cpu_map[cpu]); 2012 while ((pos < map->len) && (map->queues[pos] != index)) 2013 pos++; 2014 2015 if (pos == map->len) 2016 map->queues[map->len++] = index; 2017 #ifdef CONFIG_NUMA 2018 if (numa_node_id == -2) 2019 numa_node_id = cpu_to_node(cpu); 2020 else if (numa_node_id != cpu_to_node(cpu)) 2021 numa_node_id = -1; 2022 #endif 2023 } else if (dev_maps) { 2024 /* fill in the new device map from the old device map */ 2025 map = xmap_dereference(dev_maps->cpu_map[cpu]); 2026 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); 2027 } 2028 2029 } 2030 2031 rcu_assign_pointer(dev->xps_maps, new_dev_maps); 2032 2033 /* Cleanup old maps */ 2034 if (dev_maps) { 2035 for_each_possible_cpu(cpu) { 2036 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); 2037 map = xmap_dereference(dev_maps->cpu_map[cpu]); 2038 if (map && map != new_map) 2039 kfree_rcu(map, rcu); 2040 } 2041 2042 kfree_rcu(dev_maps, rcu); 2043 } 2044 2045 dev_maps = new_dev_maps; 2046 active = true; 2047 2048 out_no_new_maps: 2049 /* update Tx queue numa node */ 2050 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), 2051 (numa_node_id >= 0) ? numa_node_id : 2052 NUMA_NO_NODE); 2053 2054 if (!dev_maps) 2055 goto out_no_maps; 2056 2057 /* removes queue from unused CPUs */ 2058 for_each_possible_cpu(cpu) { 2059 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) 2060 continue; 2061 2062 if (remove_xps_queue(dev_maps, cpu, index)) 2063 active = true; 2064 } 2065 2066 /* free map if not active */ 2067 if (!active) { 2068 RCU_INIT_POINTER(dev->xps_maps, NULL); 2069 kfree_rcu(dev_maps, rcu); 2070 } 2071 2072 out_no_maps: 2073 mutex_unlock(&xps_map_mutex); 2074 2075 return 0; 2076 error: 2077 /* remove any maps that we added */ 2078 for_each_possible_cpu(cpu) { 2079 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); 2080 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : 2081 NULL; 2082 if (new_map && new_map != map) 2083 kfree(new_map); 2084 } 2085 2086 mutex_unlock(&xps_map_mutex); 2087 2088 kfree(new_dev_maps); 2089 return -ENOMEM; 2090 } 2091 EXPORT_SYMBOL(netif_set_xps_queue); 2092 2093 #endif 2094 /* 2095 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues 2096 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. 
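 *
 * A minimal usage sketch (hypothetical driver code, not part of this file),
 * typically run while reconfiguring channel counts:
 *
 *	rtnl_lock();
 *	err = netif_set_real_num_tx_queues(netdev, new_tx_count);
 *	if (!err)
 *		err = netif_set_real_num_rx_queues(netdev, new_rx_count);
 *	rtnl_unlock();
 *
 * Returns 0 on success or a negative errno. Shrinking the count also
 * resets any tc mappings and XPS maps that reference the removed queues.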
2097 */ 2098 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) 2099 { 2100 int rc; 2101 2102 if (txq < 1 || txq > dev->num_tx_queues) 2103 return -EINVAL; 2104 2105 if (dev->reg_state == NETREG_REGISTERED || 2106 dev->reg_state == NETREG_UNREGISTERING) { 2107 ASSERT_RTNL(); 2108 2109 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, 2110 txq); 2111 if (rc) 2112 return rc; 2113 2114 if (dev->num_tc) 2115 netif_setup_tc(dev, txq); 2116 2117 if (txq < dev->real_num_tx_queues) { 2118 qdisc_reset_all_tx_gt(dev, txq); 2119 #ifdef CONFIG_XPS 2120 netif_reset_xps_queues_gt(dev, txq); 2121 #endif 2122 } 2123 } 2124 2125 dev->real_num_tx_queues = txq; 2126 return 0; 2127 } 2128 EXPORT_SYMBOL(netif_set_real_num_tx_queues); 2129 2130 #ifdef CONFIG_SYSFS 2131 /** 2132 * netif_set_real_num_rx_queues - set actual number of RX queues used 2133 * @dev: Network device 2134 * @rxq: Actual number of RX queues 2135 * 2136 * This must be called either with the rtnl_lock held or before 2137 * registration of the net device. Returns 0 on success, or a 2138 * negative error code. If called before registration, it always 2139 * succeeds. 2140 */ 2141 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) 2142 { 2143 int rc; 2144 2145 if (rxq < 1 || rxq > dev->num_rx_queues) 2146 return -EINVAL; 2147 2148 if (dev->reg_state == NETREG_REGISTERED) { 2149 ASSERT_RTNL(); 2150 2151 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, 2152 rxq); 2153 if (rc) 2154 return rc; 2155 } 2156 2157 dev->real_num_rx_queues = rxq; 2158 return 0; 2159 } 2160 EXPORT_SYMBOL(netif_set_real_num_rx_queues); 2161 #endif 2162 2163 /** 2164 * netif_get_num_default_rss_queues - default number of RSS queues 2165 * 2166 * This routine should set an upper limit on the number of RSS queues 2167 * used by default by multiqueue devices. 2168 */ 2169 int netif_get_num_default_rss_queues(void) 2170 { 2171 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); 2172 } 2173 EXPORT_SYMBOL(netif_get_num_default_rss_queues); 2174 2175 static inline void __netif_reschedule(struct Qdisc *q) 2176 { 2177 struct softnet_data *sd; 2178 unsigned long flags; 2179 2180 local_irq_save(flags); 2181 sd = this_cpu_ptr(&softnet_data); 2182 q->next_sched = NULL; 2183 *sd->output_queue_tailp = q; 2184 sd->output_queue_tailp = &q->next_sched; 2185 raise_softirq_irqoff(NET_TX_SOFTIRQ); 2186 local_irq_restore(flags); 2187 } 2188 2189 void __netif_schedule(struct Qdisc *q) 2190 { 2191 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) 2192 __netif_reschedule(q); 2193 } 2194 EXPORT_SYMBOL(__netif_schedule); 2195 2196 struct dev_kfree_skb_cb { 2197 enum skb_free_reason reason; 2198 }; 2199 2200 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) 2201 { 2202 return (struct dev_kfree_skb_cb *)skb->cb; 2203 } 2204 2205 void netif_schedule_queue(struct netdev_queue *txq) 2206 { 2207 rcu_read_lock(); 2208 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { 2209 struct Qdisc *q = rcu_dereference(txq->qdisc); 2210 2211 __netif_schedule(q); 2212 } 2213 rcu_read_unlock(); 2214 } 2215 EXPORT_SYMBOL(netif_schedule_queue); 2216 2217 /** 2218 * netif_wake_subqueue - allow sending packets on subqueue 2219 * @dev: network device 2220 * @queue_index: sub queue index 2221 * 2222 * Resume individual transmit queue of a device with multiple transmit queues. 
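 *
 * A typical (hypothetical) pairing with netif_stop_subqueue() in a driver's
 * TX completion handler; ring_free_descs() and wake_thresh are placeholders
 * for driver state:
 *
 *	if (__netif_subqueue_stopped(dev, ring->index) &&
 *	    ring_free_descs(ring) > wake_thresh)
 *		netif_wake_subqueue(dev, ring->index);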
2223 */ 2224 void netif_wake_subqueue(struct net_device *dev, u16 queue_index) 2225 { 2226 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); 2227 2228 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) { 2229 struct Qdisc *q; 2230 2231 rcu_read_lock(); 2232 q = rcu_dereference(txq->qdisc); 2233 __netif_schedule(q); 2234 rcu_read_unlock(); 2235 } 2236 } 2237 EXPORT_SYMBOL(netif_wake_subqueue); 2238 2239 void netif_tx_wake_queue(struct netdev_queue *dev_queue) 2240 { 2241 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { 2242 struct Qdisc *q; 2243 2244 rcu_read_lock(); 2245 q = rcu_dereference(dev_queue->qdisc); 2246 __netif_schedule(q); 2247 rcu_read_unlock(); 2248 } 2249 } 2250 EXPORT_SYMBOL(netif_tx_wake_queue); 2251 2252 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) 2253 { 2254 unsigned long flags; 2255 2256 if (likely(atomic_read(&skb->users) == 1)) { 2257 smp_rmb(); 2258 atomic_set(&skb->users, 0); 2259 } else if (likely(!atomic_dec_and_test(&skb->users))) { 2260 return; 2261 } 2262 get_kfree_skb_cb(skb)->reason = reason; 2263 local_irq_save(flags); 2264 skb->next = __this_cpu_read(softnet_data.completion_queue); 2265 __this_cpu_write(softnet_data.completion_queue, skb); 2266 raise_softirq_irqoff(NET_TX_SOFTIRQ); 2267 local_irq_restore(flags); 2268 } 2269 EXPORT_SYMBOL(__dev_kfree_skb_irq); 2270 2271 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) 2272 { 2273 if (in_irq() || irqs_disabled()) 2274 __dev_kfree_skb_irq(skb, reason); 2275 else 2276 dev_kfree_skb(skb); 2277 } 2278 EXPORT_SYMBOL(__dev_kfree_skb_any); 2279 2280 2281 /** 2282 * netif_device_detach - mark device as removed 2283 * @dev: network device 2284 * 2285 * Mark device as removed from system and therefore no longer available. 2286 */ 2287 void netif_device_detach(struct net_device *dev) 2288 { 2289 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && 2290 netif_running(dev)) { 2291 netif_tx_stop_all_queues(dev); 2292 } 2293 } 2294 EXPORT_SYMBOL(netif_device_detach); 2295 2296 /** 2297 * netif_device_attach - mark device as attached 2298 * @dev: network device 2299 * 2300 * Mark device as attached from system and restart if needed. 2301 */ 2302 void netif_device_attach(struct net_device *dev) 2303 { 2304 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && 2305 netif_running(dev)) { 2306 netif_tx_wake_all_queues(dev); 2307 __netdev_watchdog_up(dev); 2308 } 2309 } 2310 EXPORT_SYMBOL(netif_device_attach); 2311 2312 static void skb_warn_bad_offload(const struct sk_buff *skb) 2313 { 2314 static const netdev_features_t null_features = 0; 2315 struct net_device *dev = skb->dev; 2316 const char *driver = ""; 2317 2318 if (!net_ratelimit()) 2319 return; 2320 2321 if (dev && dev->dev.parent) 2322 driver = dev_driver_string(dev->dev.parent); 2323 2324 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " 2325 "gso_type=%d ip_summed=%d\n", 2326 driver, dev ? &dev->features : &null_features, 2327 skb->sk ? &skb->sk->sk_route_caps : &null_features, 2328 skb->len, skb->data_len, skb_shinfo(skb)->gso_size, 2329 skb_shinfo(skb)->gso_type, skb->ip_summed); 2330 } 2331 2332 /* 2333 * Invalidate hardware checksum when packet is to be mangled, and 2334 * complete checksum manually on outgoing path. 
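 *
 * The caller is expected to have csum_start/csum_offset set up (i.e. the
 * skb carries CHECKSUM_PARTIAL); the helper folds the checksum into the
 * packet and downgrades ip_summed to CHECKSUM_NONE. validate_xmit_skb()
 * below uses it as the software fallback when the device cannot checksum
 * this protocol:
 *
 *	if (!(features & NETIF_F_ALL_CSUM) && skb_checksum_help(skb))
 *		goto out_kfree_skb;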
2335 */ 2336 int skb_checksum_help(struct sk_buff *skb) 2337 { 2338 __wsum csum; 2339 int ret = 0, offset; 2340 2341 if (skb->ip_summed == CHECKSUM_COMPLETE) 2342 goto out_set_summed; 2343 2344 if (unlikely(skb_shinfo(skb)->gso_size)) { 2345 skb_warn_bad_offload(skb); 2346 return -EINVAL; 2347 } 2348 2349 /* Before computing a checksum, we should make sure no frag could 2350 * be modified by an external entity : checksum could be wrong. 2351 */ 2352 if (skb_has_shared_frag(skb)) { 2353 ret = __skb_linearize(skb); 2354 if (ret) 2355 goto out; 2356 } 2357 2358 offset = skb_checksum_start_offset(skb); 2359 BUG_ON(offset >= skb_headlen(skb)); 2360 csum = skb_checksum(skb, offset, skb->len - offset, 0); 2361 2362 offset += skb->csum_offset; 2363 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); 2364 2365 if (skb_cloned(skb) && 2366 !skb_clone_writable(skb, offset + sizeof(__sum16))) { 2367 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 2368 if (ret) 2369 goto out; 2370 } 2371 2372 *(__sum16 *)(skb->data + offset) = csum_fold(csum); 2373 out_set_summed: 2374 skb->ip_summed = CHECKSUM_NONE; 2375 out: 2376 return ret; 2377 } 2378 EXPORT_SYMBOL(skb_checksum_help); 2379 2380 __be16 skb_network_protocol(struct sk_buff *skb, int *depth) 2381 { 2382 __be16 type = skb->protocol; 2383 2384 /* Tunnel gso handlers can set protocol to ethernet. */ 2385 if (type == htons(ETH_P_TEB)) { 2386 struct ethhdr *eth; 2387 2388 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) 2389 return 0; 2390 2391 eth = (struct ethhdr *)skb_mac_header(skb); 2392 type = eth->h_proto; 2393 } 2394 2395 return __vlan_get_protocol(skb, type, depth); 2396 } 2397 2398 /** 2399 * skb_mac_gso_segment - mac layer segmentation handler. 2400 * @skb: buffer to segment 2401 * @features: features for the output path (see dev->features) 2402 */ 2403 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, 2404 netdev_features_t features) 2405 { 2406 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); 2407 struct packet_offload *ptype; 2408 int vlan_depth = skb->mac_len; 2409 __be16 type = skb_network_protocol(skb, &vlan_depth); 2410 2411 if (unlikely(!type)) 2412 return ERR_PTR(-EINVAL); 2413 2414 __skb_pull(skb, vlan_depth); 2415 2416 rcu_read_lock(); 2417 list_for_each_entry_rcu(ptype, &offload_base, list) { 2418 if (ptype->type == type && ptype->callbacks.gso_segment) { 2419 segs = ptype->callbacks.gso_segment(skb, features); 2420 break; 2421 } 2422 } 2423 rcu_read_unlock(); 2424 2425 __skb_push(skb, skb->data - skb_mac_header(skb)); 2426 2427 return segs; 2428 } 2429 EXPORT_SYMBOL(skb_mac_gso_segment); 2430 2431 2432 /* openvswitch calls this on rx path, so we need a different check. 2433 */ 2434 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) 2435 { 2436 if (tx_path) 2437 return skb->ip_summed != CHECKSUM_PARTIAL; 2438 else 2439 return skb->ip_summed == CHECKSUM_NONE; 2440 } 2441 2442 /** 2443 * __skb_gso_segment - Perform segmentation on skb. 2444 * @skb: buffer to segment 2445 * @features: features for the output path (see dev->features) 2446 * @tx_path: whether it is called in TX path 2447 * 2448 * This function segments the given skb and returns a list of segments. 2449 * 2450 * It may return NULL if the skb requires no segmentation. This is 2451 * only possible when GSO is used for verifying header integrity. 
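 *
 * Callers normally use the skb_gso_segment() wrapper (tx_path == true),
 * drop the original skb on success and transmit the returned list, as
 * validate_xmit_skb() does below:
 *
 *	segs = skb_gso_segment(skb, features);
 *	if (IS_ERR(segs))
 *		goto out_kfree_skb;
 *	else if (segs) {
 *		consume_skb(skb);
 *		skb = segs;
 *	}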
2452 */ 2453 struct sk_buff *__skb_gso_segment(struct sk_buff *skb, 2454 netdev_features_t features, bool tx_path) 2455 { 2456 if (unlikely(skb_needs_check(skb, tx_path))) { 2457 int err; 2458 2459 skb_warn_bad_offload(skb); 2460 2461 err = skb_cow_head(skb, 0); 2462 if (err < 0) 2463 return ERR_PTR(err); 2464 } 2465 2466 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); 2467 SKB_GSO_CB(skb)->encap_level = 0; 2468 2469 skb_reset_mac_header(skb); 2470 skb_reset_mac_len(skb); 2471 2472 return skb_mac_gso_segment(skb, features); 2473 } 2474 EXPORT_SYMBOL(__skb_gso_segment); 2475 2476 /* Take action when hardware reception checksum errors are detected. */ 2477 #ifdef CONFIG_BUG 2478 void netdev_rx_csum_fault(struct net_device *dev) 2479 { 2480 if (net_ratelimit()) { 2481 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); 2482 dump_stack(); 2483 } 2484 } 2485 EXPORT_SYMBOL(netdev_rx_csum_fault); 2486 #endif 2487 2488 /* Actually, we should eliminate this check as soon as we know, that: 2489 * 1. IOMMU is present and allows to map all the memory. 2490 * 2. No high memory really exists on this machine. 2491 */ 2492 2493 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) 2494 { 2495 #ifdef CONFIG_HIGHMEM 2496 int i; 2497 if (!(dev->features & NETIF_F_HIGHDMA)) { 2498 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2499 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2500 if (PageHighMem(skb_frag_page(frag))) 2501 return 1; 2502 } 2503 } 2504 2505 if (PCI_DMA_BUS_IS_PHYS) { 2506 struct device *pdev = dev->dev.parent; 2507 2508 if (!pdev) 2509 return 0; 2510 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2511 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2512 dma_addr_t addr = page_to_phys(skb_frag_page(frag)); 2513 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) 2514 return 1; 2515 } 2516 } 2517 #endif 2518 return 0; 2519 } 2520 2521 /* If MPLS offload request, verify we are testing hardware MPLS features 2522 * instead of standard features for the netdev. 
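 * (eth_p_mpls() matches both the unicast and the multicast MPLS ethertype,
 * so either kind of frame is limited to dev->mpls_features here.)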
2523 */ 2524 #if IS_ENABLED(CONFIG_NET_MPLS_GSO) 2525 static netdev_features_t net_mpls_features(struct sk_buff *skb, 2526 netdev_features_t features, 2527 __be16 type) 2528 { 2529 if (eth_p_mpls(type)) 2530 features &= skb->dev->mpls_features; 2531 2532 return features; 2533 } 2534 #else 2535 static netdev_features_t net_mpls_features(struct sk_buff *skb, 2536 netdev_features_t features, 2537 __be16 type) 2538 { 2539 return features; 2540 } 2541 #endif 2542 2543 static netdev_features_t harmonize_features(struct sk_buff *skb, 2544 netdev_features_t features) 2545 { 2546 int tmp; 2547 __be16 type; 2548 2549 type = skb_network_protocol(skb, &tmp); 2550 features = net_mpls_features(skb, features, type); 2551 2552 if (skb->ip_summed != CHECKSUM_NONE && 2553 !can_checksum_protocol(features, type)) { 2554 features &= ~NETIF_F_ALL_CSUM; 2555 } else if (illegal_highdma(skb->dev, skb)) { 2556 features &= ~NETIF_F_SG; 2557 } 2558 2559 return features; 2560 } 2561 2562 netdev_features_t netif_skb_features(struct sk_buff *skb) 2563 { 2564 struct net_device *dev = skb->dev; 2565 netdev_features_t features = dev->features; 2566 u16 gso_segs = skb_shinfo(skb)->gso_segs; 2567 __be16 protocol = skb->protocol; 2568 2569 if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs) 2570 features &= ~NETIF_F_GSO_MASK; 2571 2572 /* If encapsulation offload request, verify we are testing 2573 * hardware encapsulation features instead of standard 2574 * features for the netdev 2575 */ 2576 if (skb->encapsulation) 2577 features &= dev->hw_enc_features; 2578 2579 if (!skb_vlan_tag_present(skb)) { 2580 if (unlikely(protocol == htons(ETH_P_8021Q) || 2581 protocol == htons(ETH_P_8021AD))) { 2582 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; 2583 protocol = veh->h_vlan_encapsulated_proto; 2584 } else { 2585 goto finalize; 2586 } 2587 } 2588 2589 features = netdev_intersect_features(features, 2590 dev->vlan_features | 2591 NETIF_F_HW_VLAN_CTAG_TX | 2592 NETIF_F_HW_VLAN_STAG_TX); 2593 2594 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) 2595 features = netdev_intersect_features(features, 2596 NETIF_F_SG | 2597 NETIF_F_HIGHDMA | 2598 NETIF_F_FRAGLIST | 2599 NETIF_F_GEN_CSUM | 2600 NETIF_F_HW_VLAN_CTAG_TX | 2601 NETIF_F_HW_VLAN_STAG_TX); 2602 2603 finalize: 2604 if (dev->netdev_ops->ndo_features_check) 2605 features &= dev->netdev_ops->ndo_features_check(skb, dev, 2606 features); 2607 2608 return harmonize_features(skb, features); 2609 } 2610 EXPORT_SYMBOL(netif_skb_features); 2611 2612 static int xmit_one(struct sk_buff *skb, struct net_device *dev, 2613 struct netdev_queue *txq, bool more) 2614 { 2615 unsigned int len; 2616 int rc; 2617 2618 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all)) 2619 dev_queue_xmit_nit(skb, dev); 2620 2621 len = skb->len; 2622 trace_net_dev_start_xmit(skb, dev); 2623 rc = netdev_start_xmit(skb, dev, txq, more); 2624 trace_net_dev_xmit(skb, rc, dev, len); 2625 2626 return rc; 2627 } 2628 2629 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, 2630 struct netdev_queue *txq, int *ret) 2631 { 2632 struct sk_buff *skb = first; 2633 int rc = NETDEV_TX_OK; 2634 2635 while (skb) { 2636 struct sk_buff *next = skb->next; 2637 2638 skb->next = NULL; 2639 rc = xmit_one(skb, dev, txq, next != NULL); 2640 if (unlikely(!dev_xmit_complete(rc))) { 2641 skb->next = next; 2642 goto out; 2643 } 2644 2645 skb = next; 2646 if (netif_xmit_stopped(txq) && skb) { 2647 rc = NETDEV_TX_BUSY; 2648 break; 2649 } 2650 } 2651 2652 out: 2653 
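	/* Report the status of the last attempted transmit through @ret and
	 * hand back any skbs that were not sent, still chained together, so
	 * the caller can requeue them.
	 */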
*ret = rc; 2654 return skb; 2655 } 2656 2657 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, 2658 netdev_features_t features) 2659 { 2660 if (skb_vlan_tag_present(skb) && 2661 !vlan_hw_offload_capable(features, skb->vlan_proto)) 2662 skb = __vlan_hwaccel_push_inside(skb); 2663 return skb; 2664 } 2665 2666 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) 2667 { 2668 netdev_features_t features; 2669 2670 if (skb->next) 2671 return skb; 2672 2673 features = netif_skb_features(skb); 2674 skb = validate_xmit_vlan(skb, features); 2675 if (unlikely(!skb)) 2676 goto out_null; 2677 2678 if (netif_needs_gso(dev, skb, features)) { 2679 struct sk_buff *segs; 2680 2681 segs = skb_gso_segment(skb, features); 2682 if (IS_ERR(segs)) { 2683 goto out_kfree_skb; 2684 } else if (segs) { 2685 consume_skb(skb); 2686 skb = segs; 2687 } 2688 } else { 2689 if (skb_needs_linearize(skb, features) && 2690 __skb_linearize(skb)) 2691 goto out_kfree_skb; 2692 2693 /* If packet is not checksummed and device does not 2694 * support checksumming for this protocol, complete 2695 * checksumming here. 2696 */ 2697 if (skb->ip_summed == CHECKSUM_PARTIAL) { 2698 if (skb->encapsulation) 2699 skb_set_inner_transport_header(skb, 2700 skb_checksum_start_offset(skb)); 2701 else 2702 skb_set_transport_header(skb, 2703 skb_checksum_start_offset(skb)); 2704 if (!(features & NETIF_F_ALL_CSUM) && 2705 skb_checksum_help(skb)) 2706 goto out_kfree_skb; 2707 } 2708 } 2709 2710 return skb; 2711 2712 out_kfree_skb: 2713 kfree_skb(skb); 2714 out_null: 2715 return NULL; 2716 } 2717 2718 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev) 2719 { 2720 struct sk_buff *next, *head = NULL, *tail; 2721 2722 for (; skb != NULL; skb = next) { 2723 next = skb->next; 2724 skb->next = NULL; 2725 2726 /* in case skb wont be segmented, point to itself */ 2727 skb->prev = skb; 2728 2729 skb = validate_xmit_skb(skb, dev); 2730 if (!skb) 2731 continue; 2732 2733 if (!head) 2734 head = skb; 2735 else 2736 tail->next = skb; 2737 /* If skb was segmented, skb->prev points to 2738 * the last segment. If not, it still contains skb. 
2739 */ 2740 tail = skb->prev; 2741 } 2742 return head; 2743 } 2744 2745 static void qdisc_pkt_len_init(struct sk_buff *skb) 2746 { 2747 const struct skb_shared_info *shinfo = skb_shinfo(skb); 2748 2749 qdisc_skb_cb(skb)->pkt_len = skb->len; 2750 2751 /* To get more precise estimation of bytes sent on wire, 2752 * we add to pkt_len the headers size of all segments 2753 */ 2754 if (shinfo->gso_size) { 2755 unsigned int hdr_len; 2756 u16 gso_segs = shinfo->gso_segs; 2757 2758 /* mac layer + network layer */ 2759 hdr_len = skb_transport_header(skb) - skb_mac_header(skb); 2760 2761 /* + transport layer */ 2762 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) 2763 hdr_len += tcp_hdrlen(skb); 2764 else 2765 hdr_len += sizeof(struct udphdr); 2766 2767 if (shinfo->gso_type & SKB_GSO_DODGY) 2768 gso_segs = DIV_ROUND_UP(skb->len - hdr_len, 2769 shinfo->gso_size); 2770 2771 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; 2772 } 2773 } 2774 2775 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, 2776 struct net_device *dev, 2777 struct netdev_queue *txq) 2778 { 2779 spinlock_t *root_lock = qdisc_lock(q); 2780 bool contended; 2781 int rc; 2782 2783 qdisc_pkt_len_init(skb); 2784 qdisc_calculate_pkt_len(skb, q); 2785 /* 2786 * Heuristic to force contended enqueues to serialize on a 2787 * separate lock before trying to get qdisc main lock. 2788 * This permits __QDISC___STATE_RUNNING owner to get the lock more 2789 * often and dequeue packets faster. 2790 */ 2791 contended = qdisc_is_running(q); 2792 if (unlikely(contended)) 2793 spin_lock(&q->busylock); 2794 2795 spin_lock(root_lock); 2796 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { 2797 kfree_skb(skb); 2798 rc = NET_XMIT_DROP; 2799 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && 2800 qdisc_run_begin(q)) { 2801 /* 2802 * This is a work-conserving queue; there are no old skbs 2803 * waiting to be sent out; and the qdisc is not running - 2804 * xmit the skb directly. 
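 *
 * Because the skb bypasses q->enqueue(), byte/packet statistics are
 * accounted by hand with qdisc_bstats_update() before the direct transmit.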
2805 */ 2806 2807 qdisc_bstats_update(q, skb); 2808 2809 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { 2810 if (unlikely(contended)) { 2811 spin_unlock(&q->busylock); 2812 contended = false; 2813 } 2814 __qdisc_run(q); 2815 } else 2816 qdisc_run_end(q); 2817 2818 rc = NET_XMIT_SUCCESS; 2819 } else { 2820 rc = q->enqueue(skb, q) & NET_XMIT_MASK; 2821 if (qdisc_run_begin(q)) { 2822 if (unlikely(contended)) { 2823 spin_unlock(&q->busylock); 2824 contended = false; 2825 } 2826 __qdisc_run(q); 2827 } 2828 } 2829 spin_unlock(root_lock); 2830 if (unlikely(contended)) 2831 spin_unlock(&q->busylock); 2832 return rc; 2833 } 2834 2835 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) 2836 static void skb_update_prio(struct sk_buff *skb) 2837 { 2838 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); 2839 2840 if (!skb->priority && skb->sk && map) { 2841 unsigned int prioidx = skb->sk->sk_cgrp_prioidx; 2842 2843 if (prioidx < map->priomap_len) 2844 skb->priority = map->priomap[prioidx]; 2845 } 2846 } 2847 #else 2848 #define skb_update_prio(skb) 2849 #endif 2850 2851 DEFINE_PER_CPU(int, xmit_recursion); 2852 EXPORT_SYMBOL(xmit_recursion); 2853 2854 #define RECURSION_LIMIT 10 2855 2856 /** 2857 * dev_loopback_xmit - loop back @skb 2858 * @skb: buffer to transmit 2859 */ 2860 int dev_loopback_xmit(struct sk_buff *skb) 2861 { 2862 skb_reset_mac_header(skb); 2863 __skb_pull(skb, skb_network_offset(skb)); 2864 skb->pkt_type = PACKET_LOOPBACK; 2865 skb->ip_summed = CHECKSUM_UNNECESSARY; 2866 WARN_ON(!skb_dst(skb)); 2867 skb_dst_force(skb); 2868 netif_rx_ni(skb); 2869 return 0; 2870 } 2871 EXPORT_SYMBOL(dev_loopback_xmit); 2872 2873 /** 2874 * __dev_queue_xmit - transmit a buffer 2875 * @skb: buffer to transmit 2876 * @accel_priv: private data used for L2 forwarding offload 2877 * 2878 * Queue a buffer for transmission to a network device. The caller must 2879 * have set the device and priority and built the buffer before calling 2880 * this function. The function can be called from an interrupt. 2881 * 2882 * A negative errno code is returned on a failure. A success does not 2883 * guarantee the frame will be transmitted as it may be dropped due 2884 * to congestion or traffic shaping. 2885 * 2886 * ----------------------------------------------------------------------------------- 2887 * I notice this method can also return errors from the queue disciplines, 2888 * including NET_XMIT_DROP, which is a positive value. So, errors can also 2889 * be positive. 2890 * 2891 * Regardless of the return value, the skb is consumed, so it is currently 2892 * difficult to retry a send to this method. (You can bump the ref count 2893 * before sending to hold a reference for retry if you are careful.) 2894 * 2895 * When calling this method, interrupts MUST be enabled. This is because 2896 * the BH enable code must have IRQs enabled so that it will not deadlock. 2897 * --BLG 2898 */ 2899 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) 2900 { 2901 struct net_device *dev = skb->dev; 2902 struct netdev_queue *txq; 2903 struct Qdisc *q; 2904 int rc = -ENOMEM; 2905 2906 skb_reset_mac_header(skb); 2907 2908 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) 2909 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); 2910 2911 /* Disable soft irqs for various locks below. Also 2912 * stops preemption for RCU. 2913 */ 2914 rcu_read_lock_bh(); 2915 2916 skb_update_prio(skb); 2917 2918 /* If device/qdisc don't need skb->dst, release it right now while 2919 * its hot in this cpu cache. 
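 * Otherwise skb_dst_force() pins the route so it survives while the skb
 * sits in the qdisc.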
2920 */ 2921 if (dev->priv_flags & IFF_XMIT_DST_RELEASE) 2922 skb_dst_drop(skb); 2923 else 2924 skb_dst_force(skb); 2925 2926 txq = netdev_pick_tx(dev, skb, accel_priv); 2927 q = rcu_dereference_bh(txq->qdisc); 2928 2929 #ifdef CONFIG_NET_CLS_ACT 2930 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); 2931 #endif 2932 trace_net_dev_queue(skb); 2933 if (q->enqueue) { 2934 rc = __dev_xmit_skb(skb, q, dev, txq); 2935 goto out; 2936 } 2937 2938 /* The device has no queue. Common case for software devices: 2939 loopback, all the sorts of tunnels... 2940 2941 Really, it is unlikely that netif_tx_lock protection is necessary 2942 here. (f.e. loopback and IP tunnels are clean ignoring statistics 2943 counters.) 2944 However, it is possible, that they rely on protection 2945 made by us here. 2946 2947 Check this and shot the lock. It is not prone from deadlocks. 2948 Either shot noqueue qdisc, it is even simpler 8) 2949 */ 2950 if (dev->flags & IFF_UP) { 2951 int cpu = smp_processor_id(); /* ok because BHs are off */ 2952 2953 if (txq->xmit_lock_owner != cpu) { 2954 2955 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) 2956 goto recursion_alert; 2957 2958 skb = validate_xmit_skb(skb, dev); 2959 if (!skb) 2960 goto drop; 2961 2962 HARD_TX_LOCK(dev, txq, cpu); 2963 2964 if (!netif_xmit_stopped(txq)) { 2965 __this_cpu_inc(xmit_recursion); 2966 skb = dev_hard_start_xmit(skb, dev, txq, &rc); 2967 __this_cpu_dec(xmit_recursion); 2968 if (dev_xmit_complete(rc)) { 2969 HARD_TX_UNLOCK(dev, txq); 2970 goto out; 2971 } 2972 } 2973 HARD_TX_UNLOCK(dev, txq); 2974 net_crit_ratelimited("Virtual device %s asks to queue packet!\n", 2975 dev->name); 2976 } else { 2977 /* Recursion is detected! It is possible, 2978 * unfortunately 2979 */ 2980 recursion_alert: 2981 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", 2982 dev->name); 2983 } 2984 } 2985 2986 rc = -ENETDOWN; 2987 drop: 2988 rcu_read_unlock_bh(); 2989 2990 atomic_long_inc(&dev->tx_dropped); 2991 kfree_skb_list(skb); 2992 return rc; 2993 out: 2994 rcu_read_unlock_bh(); 2995 return rc; 2996 } 2997 2998 int dev_queue_xmit(struct sk_buff *skb) 2999 { 3000 return __dev_queue_xmit(skb, NULL); 3001 } 3002 EXPORT_SYMBOL(dev_queue_xmit); 3003 3004 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) 3005 { 3006 return __dev_queue_xmit(skb, accel_priv); 3007 } 3008 EXPORT_SYMBOL(dev_queue_xmit_accel); 3009 3010 3011 /*======================================================================= 3012 Receiver routines 3013 =======================================================================*/ 3014 3015 int netdev_max_backlog __read_mostly = 1000; 3016 EXPORT_SYMBOL(netdev_max_backlog); 3017 3018 int netdev_tstamp_prequeue __read_mostly = 1; 3019 int netdev_budget __read_mostly = 300; 3020 int weight_p __read_mostly = 64; /* old backlog weight */ 3021 3022 /* Called with irq disabled */ 3023 static inline void ____napi_schedule(struct softnet_data *sd, 3024 struct napi_struct *napi) 3025 { 3026 list_add_tail(&napi->poll_list, &sd->poll_list); 3027 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 3028 } 3029 3030 #ifdef CONFIG_RPS 3031 3032 /* One global table that all flow-based protocols share. 
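 * Each entry packs the flow hash and the desired CPU into one u32: the low
 * bits (selected by rps_cpu_mask) hold the CPU on which the flow was last
 * processed by the application, while the remaining high bits hold part of
 * the hash so stale or colliding entries can be detected; see
 * get_rps_cpu() below for the consumer side.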
*/ 3033 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; 3034 EXPORT_SYMBOL(rps_sock_flow_table); 3035 u32 rps_cpu_mask __read_mostly; 3036 EXPORT_SYMBOL(rps_cpu_mask); 3037 3038 struct static_key rps_needed __read_mostly; 3039 3040 static struct rps_dev_flow * 3041 set_rps_cpu(struct net_device *dev, struct sk_buff *skb, 3042 struct rps_dev_flow *rflow, u16 next_cpu) 3043 { 3044 if (next_cpu != RPS_NO_CPU) { 3045 #ifdef CONFIG_RFS_ACCEL 3046 struct netdev_rx_queue *rxqueue; 3047 struct rps_dev_flow_table *flow_table; 3048 struct rps_dev_flow *old_rflow; 3049 u32 flow_id; 3050 u16 rxq_index; 3051 int rc; 3052 3053 /* Should we steer this flow to a different hardware queue? */ 3054 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || 3055 !(dev->features & NETIF_F_NTUPLE)) 3056 goto out; 3057 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); 3058 if (rxq_index == skb_get_rx_queue(skb)) 3059 goto out; 3060 3061 rxqueue = dev->_rx + rxq_index; 3062 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3063 if (!flow_table) 3064 goto out; 3065 flow_id = skb_get_hash(skb) & flow_table->mask; 3066 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, 3067 rxq_index, flow_id); 3068 if (rc < 0) 3069 goto out; 3070 old_rflow = rflow; 3071 rflow = &flow_table->flows[flow_id]; 3072 rflow->filter = rc; 3073 if (old_rflow->filter == rflow->filter) 3074 old_rflow->filter = RPS_NO_FILTER; 3075 out: 3076 #endif 3077 rflow->last_qtail = 3078 per_cpu(softnet_data, next_cpu).input_queue_head; 3079 } 3080 3081 rflow->cpu = next_cpu; 3082 return rflow; 3083 } 3084 3085 /* 3086 * get_rps_cpu is called from netif_receive_skb and returns the target 3087 * CPU from the RPS map of the receiving queue for a given skb. 3088 * rcu_read_lock must be held on entry. 
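 * Returns a negative value when no suitable CPU is found (RPS/RFS not
 * configured for this queue, no usable hash, or the mapped CPU offline);
 * callers then process the skb on the local CPU.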
3089 */ 3090 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, 3091 struct rps_dev_flow **rflowp) 3092 { 3093 const struct rps_sock_flow_table *sock_flow_table; 3094 struct netdev_rx_queue *rxqueue = dev->_rx; 3095 struct rps_dev_flow_table *flow_table; 3096 struct rps_map *map; 3097 int cpu = -1; 3098 u32 tcpu; 3099 u32 hash; 3100 3101 if (skb_rx_queue_recorded(skb)) { 3102 u16 index = skb_get_rx_queue(skb); 3103 3104 if (unlikely(index >= dev->real_num_rx_queues)) { 3105 WARN_ONCE(dev->real_num_rx_queues > 1, 3106 "%s received packet on queue %u, but number " 3107 "of RX queues is %u\n", 3108 dev->name, index, dev->real_num_rx_queues); 3109 goto done; 3110 } 3111 rxqueue += index; 3112 } 3113 3114 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ 3115 3116 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3117 map = rcu_dereference(rxqueue->rps_map); 3118 if (!flow_table && !map) 3119 goto done; 3120 3121 skb_reset_network_header(skb); 3122 hash = skb_get_hash(skb); 3123 if (!hash) 3124 goto done; 3125 3126 sock_flow_table = rcu_dereference(rps_sock_flow_table); 3127 if (flow_table && sock_flow_table) { 3128 struct rps_dev_flow *rflow; 3129 u32 next_cpu; 3130 u32 ident; 3131 3132 /* First check into global flow table if there is a match */ 3133 ident = sock_flow_table->ents[hash & sock_flow_table->mask]; 3134 if ((ident ^ hash) & ~rps_cpu_mask) 3135 goto try_rps; 3136 3137 next_cpu = ident & rps_cpu_mask; 3138 3139 /* OK, now we know there is a match, 3140 * we can look at the local (per receive queue) flow table 3141 */ 3142 rflow = &flow_table->flows[hash & flow_table->mask]; 3143 tcpu = rflow->cpu; 3144 3145 /* 3146 * If the desired CPU (where last recvmsg was done) is 3147 * different from current CPU (one in the rx-queue flow 3148 * table entry), switch if one of the following holds: 3149 * - Current CPU is unset (equal to RPS_NO_CPU). 3150 * - Current CPU is offline. 3151 * - The current CPU's queue tail has advanced beyond the 3152 * last packet that was enqueued using this table entry. 3153 * This guarantees that all previous packets for the flow 3154 * have been dequeued, thus preserving in order delivery. 3155 */ 3156 if (unlikely(tcpu != next_cpu) && 3157 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || 3158 ((int)(per_cpu(softnet_data, tcpu).input_queue_head - 3159 rflow->last_qtail)) >= 0)) { 3160 tcpu = next_cpu; 3161 rflow = set_rps_cpu(dev, skb, rflow, next_cpu); 3162 } 3163 3164 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { 3165 *rflowp = rflow; 3166 cpu = tcpu; 3167 goto done; 3168 } 3169 } 3170 3171 try_rps: 3172 3173 if (map) { 3174 tcpu = map->cpus[reciprocal_scale(hash, map->len)]; 3175 if (cpu_online(tcpu)) { 3176 cpu = tcpu; 3177 goto done; 3178 } 3179 } 3180 3181 done: 3182 return cpu; 3183 } 3184 3185 #ifdef CONFIG_RFS_ACCEL 3186 3187 /** 3188 * rps_may_expire_flow - check whether an RFS hardware filter may be removed 3189 * @dev: Device on which the filter was set 3190 * @rxq_index: RX queue index 3191 * @flow_id: Flow ID passed to ndo_rx_flow_steer() 3192 * @filter_id: Filter ID returned by ndo_rx_flow_steer() 3193 * 3194 * Drivers that implement ndo_rx_flow_steer() should periodically call 3195 * this function for each installed filter and remove the filters for 3196 * which it returns %true. 
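 *
 * A hedged sketch of such a scan (adapter, filters[] and the remove helper
 * are hypothetical driver constructs):
 *
 *	for (i = 0; i < adapter->nr_filters; i++)
 *		if (rps_may_expire_flow(adapter->netdev,
 *					filters[i].rxq_index,
 *					filters[i].flow_id, i))
 *			driver_remove_ntuple_filter(adapter, i);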
3197 */ 3198 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, 3199 u32 flow_id, u16 filter_id) 3200 { 3201 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; 3202 struct rps_dev_flow_table *flow_table; 3203 struct rps_dev_flow *rflow; 3204 bool expire = true; 3205 int cpu; 3206 3207 rcu_read_lock(); 3208 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3209 if (flow_table && flow_id <= flow_table->mask) { 3210 rflow = &flow_table->flows[flow_id]; 3211 cpu = ACCESS_ONCE(rflow->cpu); 3212 if (rflow->filter == filter_id && cpu != RPS_NO_CPU && 3213 ((int)(per_cpu(softnet_data, cpu).input_queue_head - 3214 rflow->last_qtail) < 3215 (int)(10 * flow_table->mask))) 3216 expire = false; 3217 } 3218 rcu_read_unlock(); 3219 return expire; 3220 } 3221 EXPORT_SYMBOL(rps_may_expire_flow); 3222 3223 #endif /* CONFIG_RFS_ACCEL */ 3224 3225 /* Called from hardirq (IPI) context */ 3226 static void rps_trigger_softirq(void *data) 3227 { 3228 struct softnet_data *sd = data; 3229 3230 ____napi_schedule(sd, &sd->backlog); 3231 sd->received_rps++; 3232 } 3233 3234 #endif /* CONFIG_RPS */ 3235 3236 /* 3237 * Check if this softnet_data structure is another cpu one 3238 * If yes, queue it to our IPI list and return 1 3239 * If no, return 0 3240 */ 3241 static int rps_ipi_queued(struct softnet_data *sd) 3242 { 3243 #ifdef CONFIG_RPS 3244 struct softnet_data *mysd = this_cpu_ptr(&softnet_data); 3245 3246 if (sd != mysd) { 3247 sd->rps_ipi_next = mysd->rps_ipi_list; 3248 mysd->rps_ipi_list = sd; 3249 3250 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 3251 return 1; 3252 } 3253 #endif /* CONFIG_RPS */ 3254 return 0; 3255 } 3256 3257 #ifdef CONFIG_NET_FLOW_LIMIT 3258 int netdev_flow_limit_table_len __read_mostly = (1 << 12); 3259 #endif 3260 3261 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) 3262 { 3263 #ifdef CONFIG_NET_FLOW_LIMIT 3264 struct sd_flow_limit *fl; 3265 struct softnet_data *sd; 3266 unsigned int old_flow, new_flow; 3267 3268 if (qlen < (netdev_max_backlog >> 1)) 3269 return false; 3270 3271 sd = this_cpu_ptr(&softnet_data); 3272 3273 rcu_read_lock(); 3274 fl = rcu_dereference(sd->flow_limit); 3275 if (fl) { 3276 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); 3277 old_flow = fl->history[fl->history_head]; 3278 fl->history[fl->history_head] = new_flow; 3279 3280 fl->history_head++; 3281 fl->history_head &= FLOW_LIMIT_HISTORY - 1; 3282 3283 if (likely(fl->buckets[old_flow])) 3284 fl->buckets[old_flow]--; 3285 3286 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { 3287 fl->count++; 3288 rcu_read_unlock(); 3289 return true; 3290 } 3291 } 3292 rcu_read_unlock(); 3293 #endif 3294 return false; 3295 } 3296 3297 /* 3298 * enqueue_to_backlog is called to queue an skb to a per CPU backlog 3299 * queue (may be a remote CPU queue). 
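 * Returns NET_RX_SUCCESS when the skb was queued and NET_RX_DROP when the
 * backlog has already grown past netdev_max_backlog or skb_flow_limit()
 * judges this flow to be hogging the queue.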
3300 */ 3301 static int enqueue_to_backlog(struct sk_buff *skb, int cpu, 3302 unsigned int *qtail) 3303 { 3304 struct softnet_data *sd; 3305 unsigned long flags; 3306 unsigned int qlen; 3307 3308 sd = &per_cpu(softnet_data, cpu); 3309 3310 local_irq_save(flags); 3311 3312 rps_lock(sd); 3313 qlen = skb_queue_len(&sd->input_pkt_queue); 3314 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { 3315 if (qlen) { 3316 enqueue: 3317 __skb_queue_tail(&sd->input_pkt_queue, skb); 3318 input_queue_tail_incr_save(sd, qtail); 3319 rps_unlock(sd); 3320 local_irq_restore(flags); 3321 return NET_RX_SUCCESS; 3322 } 3323 3324 /* Schedule NAPI for backlog device 3325 * We can use non atomic operation since we own the queue lock 3326 */ 3327 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { 3328 if (!rps_ipi_queued(sd)) 3329 ____napi_schedule(sd, &sd->backlog); 3330 } 3331 goto enqueue; 3332 } 3333 3334 sd->dropped++; 3335 rps_unlock(sd); 3336 3337 local_irq_restore(flags); 3338 3339 atomic_long_inc(&skb->dev->rx_dropped); 3340 kfree_skb(skb); 3341 return NET_RX_DROP; 3342 } 3343 3344 static int netif_rx_internal(struct sk_buff *skb) 3345 { 3346 int ret; 3347 3348 net_timestamp_check(netdev_tstamp_prequeue, skb); 3349 3350 trace_netif_rx(skb); 3351 #ifdef CONFIG_RPS 3352 if (static_key_false(&rps_needed)) { 3353 struct rps_dev_flow voidflow, *rflow = &voidflow; 3354 int cpu; 3355 3356 preempt_disable(); 3357 rcu_read_lock(); 3358 3359 cpu = get_rps_cpu(skb->dev, skb, &rflow); 3360 if (cpu < 0) 3361 cpu = smp_processor_id(); 3362 3363 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 3364 3365 rcu_read_unlock(); 3366 preempt_enable(); 3367 } else 3368 #endif 3369 { 3370 unsigned int qtail; 3371 ret = enqueue_to_backlog(skb, get_cpu(), &qtail); 3372 put_cpu(); 3373 } 3374 return ret; 3375 } 3376 3377 /** 3378 * netif_rx - post buffer to the network code 3379 * @skb: buffer to post 3380 * 3381 * This function receives a packet from a device driver and queues it for 3382 * the upper (protocol) levels to process. It always succeeds. The buffer 3383 * may be dropped during processing for congestion control or by the 3384 * protocol layers. 
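 *
 * This is the legacy, non-NAPI receive entry point and may be called from
 * hard interrupt context; callers running in process context should prefer
 * netif_rx_ni() so that pending softirqs get a chance to run.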
3385 * 3386 * return values: 3387 * NET_RX_SUCCESS (no congestion) 3388 * NET_RX_DROP (packet was dropped) 3389 * 3390 */ 3391 3392 int netif_rx(struct sk_buff *skb) 3393 { 3394 trace_netif_rx_entry(skb); 3395 3396 return netif_rx_internal(skb); 3397 } 3398 EXPORT_SYMBOL(netif_rx); 3399 3400 int netif_rx_ni(struct sk_buff *skb) 3401 { 3402 int err; 3403 3404 trace_netif_rx_ni_entry(skb); 3405 3406 preempt_disable(); 3407 err = netif_rx_internal(skb); 3408 if (local_softirq_pending()) 3409 do_softirq(); 3410 preempt_enable(); 3411 3412 return err; 3413 } 3414 EXPORT_SYMBOL(netif_rx_ni); 3415 3416 static void net_tx_action(struct softirq_action *h) 3417 { 3418 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 3419 3420 if (sd->completion_queue) { 3421 struct sk_buff *clist; 3422 3423 local_irq_disable(); 3424 clist = sd->completion_queue; 3425 sd->completion_queue = NULL; 3426 local_irq_enable(); 3427 3428 while (clist) { 3429 struct sk_buff *skb = clist; 3430 clist = clist->next; 3431 3432 WARN_ON(atomic_read(&skb->users)); 3433 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) 3434 trace_consume_skb(skb); 3435 else 3436 trace_kfree_skb(skb, net_tx_action); 3437 __kfree_skb(skb); 3438 } 3439 } 3440 3441 if (sd->output_queue) { 3442 struct Qdisc *head; 3443 3444 local_irq_disable(); 3445 head = sd->output_queue; 3446 sd->output_queue = NULL; 3447 sd->output_queue_tailp = &sd->output_queue; 3448 local_irq_enable(); 3449 3450 while (head) { 3451 struct Qdisc *q = head; 3452 spinlock_t *root_lock; 3453 3454 head = head->next_sched; 3455 3456 root_lock = qdisc_lock(q); 3457 if (spin_trylock(root_lock)) { 3458 smp_mb__before_atomic(); 3459 clear_bit(__QDISC_STATE_SCHED, 3460 &q->state); 3461 qdisc_run(q); 3462 spin_unlock(root_lock); 3463 } else { 3464 if (!test_bit(__QDISC_STATE_DEACTIVATED, 3465 &q->state)) { 3466 __netif_reschedule(q); 3467 } else { 3468 smp_mb__before_atomic(); 3469 clear_bit(__QDISC_STATE_SCHED, 3470 &q->state); 3471 } 3472 } 3473 } 3474 } 3475 } 3476 3477 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ 3478 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) 3479 /* This hook is defined here for ATM LANE */ 3480 int (*br_fdb_test_addr_hook)(struct net_device *dev, 3481 unsigned char *addr) __read_mostly; 3482 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); 3483 #endif 3484 3485 #ifdef CONFIG_NET_CLS_ACT 3486 /* TODO: Maybe we should just force sch_ingress to be compiled in 3487 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions 3488 * a compare and 2 stores extra right now if we dont have it on 3489 * but have CONFIG_NET_CLS_ACT 3490 * NOTE: This doesn't stop any functionality; if you dont have 3491 * the ingress scheduler, you just can't add policies on ingress. 
3492 * 3493 */ 3494 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) 3495 { 3496 struct net_device *dev = skb->dev; 3497 u32 ttl = G_TC_RTTL(skb->tc_verd); 3498 int result = TC_ACT_OK; 3499 struct Qdisc *q; 3500 3501 if (unlikely(MAX_RED_LOOP < ttl++)) { 3502 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n", 3503 skb->skb_iif, dev->ifindex); 3504 return TC_ACT_SHOT; 3505 } 3506 3507 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); 3508 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); 3509 3510 q = rcu_dereference(rxq->qdisc); 3511 if (q != &noop_qdisc) { 3512 spin_lock(qdisc_lock(q)); 3513 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) 3514 result = qdisc_enqueue_root(skb, q); 3515 spin_unlock(qdisc_lock(q)); 3516 } 3517 3518 return result; 3519 } 3520 3521 static inline struct sk_buff *handle_ing(struct sk_buff *skb, 3522 struct packet_type **pt_prev, 3523 int *ret, struct net_device *orig_dev) 3524 { 3525 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); 3526 3527 if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc) 3528 goto out; 3529 3530 if (*pt_prev) { 3531 *ret = deliver_skb(skb, *pt_prev, orig_dev); 3532 *pt_prev = NULL; 3533 } 3534 3535 switch (ing_filter(skb, rxq)) { 3536 case TC_ACT_SHOT: 3537 case TC_ACT_STOLEN: 3538 kfree_skb(skb); 3539 return NULL; 3540 } 3541 3542 out: 3543 skb->tc_verd = 0; 3544 return skb; 3545 } 3546 #endif 3547 3548 /** 3549 * netdev_rx_handler_register - register receive handler 3550 * @dev: device to register a handler for 3551 * @rx_handler: receive handler to register 3552 * @rx_handler_data: data pointer that is used by rx handler 3553 * 3554 * Register a receive handler for a device. This handler will then be 3555 * called from __netif_receive_skb. A negative errno code is returned 3556 * on a failure. 3557 * 3558 * The caller must hold the rtnl_mutex. 3559 * 3560 * For a general description of rx_handler, see enum rx_handler_result. 3561 */ 3562 int netdev_rx_handler_register(struct net_device *dev, 3563 rx_handler_func_t *rx_handler, 3564 void *rx_handler_data) 3565 { 3566 ASSERT_RTNL(); 3567 3568 if (dev->rx_handler) 3569 return -EBUSY; 3570 3571 /* Note: rx_handler_data must be set before rx_handler */ 3572 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); 3573 rcu_assign_pointer(dev->rx_handler, rx_handler); 3574 3575 return 0; 3576 } 3577 EXPORT_SYMBOL_GPL(netdev_rx_handler_register); 3578 3579 /** 3580 * netdev_rx_handler_unregister - unregister receive handler 3581 * @dev: device to unregister a handler from 3582 * 3583 * Unregister a receive handler from a device. 3584 * 3585 * The caller must hold the rtnl_mutex. 3586 */ 3587 void netdev_rx_handler_unregister(struct net_device *dev) 3588 { 3589 3590 ASSERT_RTNL(); 3591 RCU_INIT_POINTER(dev->rx_handler, NULL); 3592 /* a reader seeing a non NULL rx_handler in a rcu_read_lock() 3593 * section has a guarantee to see a non NULL rx_handler_data 3594 * as well. 3595 */ 3596 synchronize_net(); 3597 RCU_INIT_POINTER(dev->rx_handler_data, NULL); 3598 } 3599 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); 3600 3601 /* 3602 * Limit the use of PFMEMALLOC reserves to those protocols that implement 3603 * the special handling of PFMEMALLOC skbs. 
3604 */ 3605 static bool skb_pfmemalloc_protocol(struct sk_buff *skb) 3606 { 3607 switch (skb->protocol) { 3608 case htons(ETH_P_ARP): 3609 case htons(ETH_P_IP): 3610 case htons(ETH_P_IPV6): 3611 case htons(ETH_P_8021Q): 3612 case htons(ETH_P_8021AD): 3613 return true; 3614 default: 3615 return false; 3616 } 3617 } 3618 3619 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) 3620 { 3621 struct packet_type *ptype, *pt_prev; 3622 rx_handler_func_t *rx_handler; 3623 struct net_device *orig_dev; 3624 bool deliver_exact = false; 3625 int ret = NET_RX_DROP; 3626 __be16 type; 3627 3628 net_timestamp_check(!netdev_tstamp_prequeue, skb); 3629 3630 trace_netif_receive_skb(skb); 3631 3632 orig_dev = skb->dev; 3633 3634 skb_reset_network_header(skb); 3635 if (!skb_transport_header_was_set(skb)) 3636 skb_reset_transport_header(skb); 3637 skb_reset_mac_len(skb); 3638 3639 pt_prev = NULL; 3640 3641 rcu_read_lock(); 3642 3643 another_round: 3644 skb->skb_iif = skb->dev->ifindex; 3645 3646 __this_cpu_inc(softnet_data.processed); 3647 3648 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || 3649 skb->protocol == cpu_to_be16(ETH_P_8021AD)) { 3650 skb = skb_vlan_untag(skb); 3651 if (unlikely(!skb)) 3652 goto unlock; 3653 } 3654 3655 #ifdef CONFIG_NET_CLS_ACT 3656 if (skb->tc_verd & TC_NCLS) { 3657 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); 3658 goto ncls; 3659 } 3660 #endif 3661 3662 if (pfmemalloc) 3663 goto skip_taps; 3664 3665 list_for_each_entry_rcu(ptype, &ptype_all, list) { 3666 if (pt_prev) 3667 ret = deliver_skb(skb, pt_prev, orig_dev); 3668 pt_prev = ptype; 3669 } 3670 3671 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { 3672 if (pt_prev) 3673 ret = deliver_skb(skb, pt_prev, orig_dev); 3674 pt_prev = ptype; 3675 } 3676 3677 skip_taps: 3678 #ifdef CONFIG_NET_CLS_ACT 3679 skb = handle_ing(skb, &pt_prev, &ret, orig_dev); 3680 if (!skb) 3681 goto unlock; 3682 ncls: 3683 #endif 3684 3685 if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) 3686 goto drop; 3687 3688 if (skb_vlan_tag_present(skb)) { 3689 if (pt_prev) { 3690 ret = deliver_skb(skb, pt_prev, orig_dev); 3691 pt_prev = NULL; 3692 } 3693 if (vlan_do_receive(&skb)) 3694 goto another_round; 3695 else if (unlikely(!skb)) 3696 goto unlock; 3697 } 3698 3699 rx_handler = rcu_dereference(skb->dev->rx_handler); 3700 if (rx_handler) { 3701 if (pt_prev) { 3702 ret = deliver_skb(skb, pt_prev, orig_dev); 3703 pt_prev = NULL; 3704 } 3705 switch (rx_handler(&skb)) { 3706 case RX_HANDLER_CONSUMED: 3707 ret = NET_RX_SUCCESS; 3708 goto unlock; 3709 case RX_HANDLER_ANOTHER: 3710 goto another_round; 3711 case RX_HANDLER_EXACT: 3712 deliver_exact = true; 3713 case RX_HANDLER_PASS: 3714 break; 3715 default: 3716 BUG(); 3717 } 3718 } 3719 3720 if (unlikely(skb_vlan_tag_present(skb))) { 3721 if (skb_vlan_tag_get_id(skb)) 3722 skb->pkt_type = PACKET_OTHERHOST; 3723 /* Note: we might in the future use prio bits 3724 * and set skb->priority like in vlan_do_receive() 3725 * For the time being, just ignore Priority Code Point 3726 */ 3727 skb->vlan_tci = 0; 3728 } 3729 3730 type = skb->protocol; 3731 3732 /* deliver only exact match when indicated */ 3733 if (likely(!deliver_exact)) { 3734 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 3735 &ptype_base[ntohs(type) & 3736 PTYPE_HASH_MASK]); 3737 } 3738 3739 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 3740 &orig_dev->ptype_specific); 3741 3742 if (unlikely(skb->dev != orig_dev)) { 3743 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 3744 &skb->dev->ptype_specific); 3745 } 
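	/* Hand the skb to the last matching handler directly, without the
	 * extra reference deliver_skb() takes for every earlier match; the
	 * final ->func() call consumes our reference to the skb.
	 */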
3746 3747 if (pt_prev) { 3748 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) 3749 goto drop; 3750 else 3751 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 3752 } else { 3753 drop: 3754 atomic_long_inc(&skb->dev->rx_dropped); 3755 kfree_skb(skb); 3756 /* Jamal, now you will not able to escape explaining 3757 * me how you were going to use this. :-) 3758 */ 3759 ret = NET_RX_DROP; 3760 } 3761 3762 unlock: 3763 rcu_read_unlock(); 3764 return ret; 3765 } 3766 3767 static int __netif_receive_skb(struct sk_buff *skb) 3768 { 3769 int ret; 3770 3771 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { 3772 unsigned long pflags = current->flags; 3773 3774 /* 3775 * PFMEMALLOC skbs are special, they should 3776 * - be delivered to SOCK_MEMALLOC sockets only 3777 * - stay away from userspace 3778 * - have bounded memory usage 3779 * 3780 * Use PF_MEMALLOC as this saves us from propagating the allocation 3781 * context down to all allocation sites. 3782 */ 3783 current->flags |= PF_MEMALLOC; 3784 ret = __netif_receive_skb_core(skb, true); 3785 tsk_restore_flags(current, pflags, PF_MEMALLOC); 3786 } else 3787 ret = __netif_receive_skb_core(skb, false); 3788 3789 return ret; 3790 } 3791 3792 static int netif_receive_skb_internal(struct sk_buff *skb) 3793 { 3794 net_timestamp_check(netdev_tstamp_prequeue, skb); 3795 3796 if (skb_defer_rx_timestamp(skb)) 3797 return NET_RX_SUCCESS; 3798 3799 #ifdef CONFIG_RPS 3800 if (static_key_false(&rps_needed)) { 3801 struct rps_dev_flow voidflow, *rflow = &voidflow; 3802 int cpu, ret; 3803 3804 rcu_read_lock(); 3805 3806 cpu = get_rps_cpu(skb->dev, skb, &rflow); 3807 3808 if (cpu >= 0) { 3809 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 3810 rcu_read_unlock(); 3811 return ret; 3812 } 3813 rcu_read_unlock(); 3814 } 3815 #endif 3816 return __netif_receive_skb(skb); 3817 } 3818 3819 /** 3820 * netif_receive_skb - process receive buffer from network 3821 * @skb: buffer to process 3822 * 3823 * netif_receive_skb() is the main receive data processing function. 3824 * It always succeeds. The buffer may be dropped during processing 3825 * for congestion control or by the protocol layers. 3826 * 3827 * This function may only be called from softirq context and interrupts 3828 * should be enabled. 3829 * 3830 * Return values (usually ignored): 3831 * NET_RX_SUCCESS: no congestion 3832 * NET_RX_DROP: packet was dropped 3833 */ 3834 int netif_receive_skb(struct sk_buff *skb) 3835 { 3836 trace_netif_receive_skb_entry(skb); 3837 3838 return netif_receive_skb_internal(skb); 3839 } 3840 EXPORT_SYMBOL(netif_receive_skb); 3841 3842 /* Network device is going away, flush any packets still pending 3843 * Called with irqs disabled. 
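 * It runs on each CPU in turn (typically via on_each_cpu() from the
 * unregister path) and drops matching skbs from both the input_pkt_queue
 * and the process_queue of that CPU's softnet_data.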
3844 */ 3845 static void flush_backlog(void *arg) 3846 { 3847 struct net_device *dev = arg; 3848 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 3849 struct sk_buff *skb, *tmp; 3850 3851 rps_lock(sd); 3852 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { 3853 if (skb->dev == dev) { 3854 __skb_unlink(skb, &sd->input_pkt_queue); 3855 kfree_skb(skb); 3856 input_queue_head_incr(sd); 3857 } 3858 } 3859 rps_unlock(sd); 3860 3861 skb_queue_walk_safe(&sd->process_queue, skb, tmp) { 3862 if (skb->dev == dev) { 3863 __skb_unlink(skb, &sd->process_queue); 3864 kfree_skb(skb); 3865 input_queue_head_incr(sd); 3866 } 3867 } 3868 } 3869 3870 static int napi_gro_complete(struct sk_buff *skb) 3871 { 3872 struct packet_offload *ptype; 3873 __be16 type = skb->protocol; 3874 struct list_head *head = &offload_base; 3875 int err = -ENOENT; 3876 3877 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); 3878 3879 if (NAPI_GRO_CB(skb)->count == 1) { 3880 skb_shinfo(skb)->gso_size = 0; 3881 goto out; 3882 } 3883 3884 rcu_read_lock(); 3885 list_for_each_entry_rcu(ptype, head, list) { 3886 if (ptype->type != type || !ptype->callbacks.gro_complete) 3887 continue; 3888 3889 err = ptype->callbacks.gro_complete(skb, 0); 3890 break; 3891 } 3892 rcu_read_unlock(); 3893 3894 if (err) { 3895 WARN_ON(&ptype->list == head); 3896 kfree_skb(skb); 3897 return NET_RX_SUCCESS; 3898 } 3899 3900 out: 3901 return netif_receive_skb_internal(skb); 3902 } 3903 3904 /* napi->gro_list contains packets ordered by age. 3905 * youngest packets at the head of it. 3906 * Complete skbs in reverse order to reduce latencies. 3907 */ 3908 void napi_gro_flush(struct napi_struct *napi, bool flush_old) 3909 { 3910 struct sk_buff *skb, *prev = NULL; 3911 3912 /* scan list and build reverse chain */ 3913 for (skb = napi->gro_list; skb != NULL; skb = skb->next) { 3914 skb->prev = prev; 3915 prev = skb; 3916 } 3917 3918 for (skb = prev; skb; skb = prev) { 3919 skb->next = NULL; 3920 3921 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) 3922 return; 3923 3924 prev = skb->prev; 3925 napi_gro_complete(skb); 3926 napi->gro_count--; 3927 } 3928 3929 napi->gro_list = NULL; 3930 } 3931 EXPORT_SYMBOL(napi_gro_flush); 3932 3933 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) 3934 { 3935 struct sk_buff *p; 3936 unsigned int maclen = skb->dev->hard_header_len; 3937 u32 hash = skb_get_hash_raw(skb); 3938 3939 for (p = napi->gro_list; p; p = p->next) { 3940 unsigned long diffs; 3941 3942 NAPI_GRO_CB(p)->flush = 0; 3943 3944 if (hash != skb_get_hash_raw(p)) { 3945 NAPI_GRO_CB(p)->same_flow = 0; 3946 continue; 3947 } 3948 3949 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; 3950 diffs |= p->vlan_tci ^ skb->vlan_tci; 3951 if (maclen == ETH_HLEN) 3952 diffs |= compare_ether_header(skb_mac_header(p), 3953 skb_mac_header(skb)); 3954 else if (!diffs) 3955 diffs = memcmp(skb_mac_header(p), 3956 skb_mac_header(skb), 3957 maclen); 3958 NAPI_GRO_CB(p)->same_flow = !diffs; 3959 } 3960 } 3961 3962 static void skb_gro_reset_offset(struct sk_buff *skb) 3963 { 3964 const struct skb_shared_info *pinfo = skb_shinfo(skb); 3965 const skb_frag_t *frag0 = &pinfo->frags[0]; 3966 3967 NAPI_GRO_CB(skb)->data_offset = 0; 3968 NAPI_GRO_CB(skb)->frag0 = NULL; 3969 NAPI_GRO_CB(skb)->frag0_len = 0; 3970 3971 if (skb_mac_header(skb) == skb_tail_pointer(skb) && 3972 pinfo->nr_frags && 3973 !PageHighMem(skb_frag_page(frag0))) { 3974 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); 3975 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); 3976 
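		/* The linear area holds no packet data here (the mac header
		 * sits at the tail pointer), so remember the first page
		 * fragment and let GRO parse the headers directly from it.
		 */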
} 3977 } 3978 3979 static void gro_pull_from_frag0(struct sk_buff *skb, int grow) 3980 { 3981 struct skb_shared_info *pinfo = skb_shinfo(skb); 3982 3983 BUG_ON(skb->end - skb->tail < grow); 3984 3985 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); 3986 3987 skb->data_len -= grow; 3988 skb->tail += grow; 3989 3990 pinfo->frags[0].page_offset += grow; 3991 skb_frag_size_sub(&pinfo->frags[0], grow); 3992 3993 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { 3994 skb_frag_unref(skb, 0); 3995 memmove(pinfo->frags, pinfo->frags + 1, 3996 --pinfo->nr_frags * sizeof(pinfo->frags[0])); 3997 } 3998 } 3999 4000 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 4001 { 4002 struct sk_buff **pp = NULL; 4003 struct packet_offload *ptype; 4004 __be16 type = skb->protocol; 4005 struct list_head *head = &offload_base; 4006 int same_flow; 4007 enum gro_result ret; 4008 int grow; 4009 4010 if (!(skb->dev->features & NETIF_F_GRO)) 4011 goto normal; 4012 4013 if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad) 4014 goto normal; 4015 4016 gro_list_prepare(napi, skb); 4017 4018 rcu_read_lock(); 4019 list_for_each_entry_rcu(ptype, head, list) { 4020 if (ptype->type != type || !ptype->callbacks.gro_receive) 4021 continue; 4022 4023 skb_set_network_header(skb, skb_gro_offset(skb)); 4024 skb_reset_mac_len(skb); 4025 NAPI_GRO_CB(skb)->same_flow = 0; 4026 NAPI_GRO_CB(skb)->flush = 0; 4027 NAPI_GRO_CB(skb)->free = 0; 4028 NAPI_GRO_CB(skb)->udp_mark = 0; 4029 NAPI_GRO_CB(skb)->gro_remcsum_start = 0; 4030 4031 /* Setup for GRO checksum validation */ 4032 switch (skb->ip_summed) { 4033 case CHECKSUM_COMPLETE: 4034 NAPI_GRO_CB(skb)->csum = skb->csum; 4035 NAPI_GRO_CB(skb)->csum_valid = 1; 4036 NAPI_GRO_CB(skb)->csum_cnt = 0; 4037 break; 4038 case CHECKSUM_UNNECESSARY: 4039 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; 4040 NAPI_GRO_CB(skb)->csum_valid = 0; 4041 break; 4042 default: 4043 NAPI_GRO_CB(skb)->csum_cnt = 0; 4044 NAPI_GRO_CB(skb)->csum_valid = 0; 4045 } 4046 4047 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); 4048 break; 4049 } 4050 rcu_read_unlock(); 4051 4052 if (&ptype->list == head) 4053 goto normal; 4054 4055 same_flow = NAPI_GRO_CB(skb)->same_flow; 4056 ret = NAPI_GRO_CB(skb)->free ? 
GRO_MERGED_FREE : GRO_MERGED; 4057 4058 if (pp) { 4059 struct sk_buff *nskb = *pp; 4060 4061 *pp = nskb->next; 4062 nskb->next = NULL; 4063 napi_gro_complete(nskb); 4064 napi->gro_count--; 4065 } 4066 4067 if (same_flow) 4068 goto ok; 4069 4070 if (NAPI_GRO_CB(skb)->flush) 4071 goto normal; 4072 4073 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { 4074 struct sk_buff *nskb = napi->gro_list; 4075 4076 /* locate the end of the list to select the 'oldest' flow */ 4077 while (nskb->next) { 4078 pp = &nskb->next; 4079 nskb = *pp; 4080 } 4081 *pp = NULL; 4082 nskb->next = NULL; 4083 napi_gro_complete(nskb); 4084 } else { 4085 napi->gro_count++; 4086 } 4087 NAPI_GRO_CB(skb)->count = 1; 4088 NAPI_GRO_CB(skb)->age = jiffies; 4089 NAPI_GRO_CB(skb)->last = skb; 4090 skb_shinfo(skb)->gso_size = skb_gro_len(skb); 4091 skb->next = napi->gro_list; 4092 napi->gro_list = skb; 4093 ret = GRO_HELD; 4094 4095 pull: 4096 grow = skb_gro_offset(skb) - skb_headlen(skb); 4097 if (grow > 0) 4098 gro_pull_from_frag0(skb, grow); 4099 ok: 4100 return ret; 4101 4102 normal: 4103 ret = GRO_NORMAL; 4104 goto pull; 4105 } 4106 4107 struct packet_offload *gro_find_receive_by_type(__be16 type) 4108 { 4109 struct list_head *offload_head = &offload_base; 4110 struct packet_offload *ptype; 4111 4112 list_for_each_entry_rcu(ptype, offload_head, list) { 4113 if (ptype->type != type || !ptype->callbacks.gro_receive) 4114 continue; 4115 return ptype; 4116 } 4117 return NULL; 4118 } 4119 EXPORT_SYMBOL(gro_find_receive_by_type); 4120 4121 struct packet_offload *gro_find_complete_by_type(__be16 type) 4122 { 4123 struct list_head *offload_head = &offload_base; 4124 struct packet_offload *ptype; 4125 4126 list_for_each_entry_rcu(ptype, offload_head, list) { 4127 if (ptype->type != type || !ptype->callbacks.gro_complete) 4128 continue; 4129 return ptype; 4130 } 4131 return NULL; 4132 } 4133 EXPORT_SYMBOL(gro_find_complete_by_type); 4134 4135 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) 4136 { 4137 switch (ret) { 4138 case GRO_NORMAL: 4139 if (netif_receive_skb_internal(skb)) 4140 ret = GRO_DROP; 4141 break; 4142 4143 case GRO_DROP: 4144 kfree_skb(skb); 4145 break; 4146 4147 case GRO_MERGED_FREE: 4148 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) 4149 kmem_cache_free(skbuff_head_cache, skb); 4150 else 4151 __kfree_skb(skb); 4152 break; 4153 4154 case GRO_HELD: 4155 case GRO_MERGED: 4156 break; 4157 } 4158 4159 return ret; 4160 } 4161 4162 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 4163 { 4164 trace_napi_gro_receive_entry(skb); 4165 4166 skb_gro_reset_offset(skb); 4167 4168 return napi_skb_finish(dev_gro_receive(napi, skb), skb); 4169 } 4170 EXPORT_SYMBOL(napi_gro_receive); 4171 4172 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) 4173 { 4174 if (unlikely(skb->pfmemalloc)) { 4175 consume_skb(skb); 4176 return; 4177 } 4178 __skb_pull(skb, skb_headlen(skb)); 4179 /* restore the reserve we had after netdev_alloc_skb_ip_align() */ 4180 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); 4181 skb->vlan_tci = 0; 4182 skb->dev = napi->dev; 4183 skb->skb_iif = 0; 4184 skb->encapsulation = 0; 4185 skb_shinfo(skb)->gso_type = 0; 4186 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); 4187 4188 napi->skb = skb; 4189 } 4190 4191 struct sk_buff *napi_get_frags(struct napi_struct *napi) 4192 { 4193 struct sk_buff *skb = napi->skb; 4194 4195 if (!skb) { 4196 skb = napi_alloc_skb(napi, GRO_MAX_HEAD); 4197 napi->skb = skb; 4198 } 4199 return 
skb; 4200 } 4201 EXPORT_SYMBOL(napi_get_frags); 4202 4203 static gro_result_t napi_frags_finish(struct napi_struct *napi, 4204 struct sk_buff *skb, 4205 gro_result_t ret) 4206 { 4207 switch (ret) { 4208 case GRO_NORMAL: 4209 case GRO_HELD: 4210 __skb_push(skb, ETH_HLEN); 4211 skb->protocol = eth_type_trans(skb, skb->dev); 4212 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb)) 4213 ret = GRO_DROP; 4214 break; 4215 4216 case GRO_DROP: 4217 case GRO_MERGED_FREE: 4218 napi_reuse_skb(napi, skb); 4219 break; 4220 4221 case GRO_MERGED: 4222 break; 4223 } 4224 4225 return ret; 4226 } 4227 4228 /* Upper GRO stack assumes network header starts at gro_offset=0 4229 * Drivers could call both napi_gro_frags() and napi_gro_receive() 4230 * We copy ethernet header into skb->data to have a common layout. 4231 */ 4232 static struct sk_buff *napi_frags_skb(struct napi_struct *napi) 4233 { 4234 struct sk_buff *skb = napi->skb; 4235 const struct ethhdr *eth; 4236 unsigned int hlen = sizeof(*eth); 4237 4238 napi->skb = NULL; 4239 4240 skb_reset_mac_header(skb); 4241 skb_gro_reset_offset(skb); 4242 4243 eth = skb_gro_header_fast(skb, 0); 4244 if (unlikely(skb_gro_header_hard(skb, hlen))) { 4245 eth = skb_gro_header_slow(skb, hlen, 0); 4246 if (unlikely(!eth)) { 4247 napi_reuse_skb(napi, skb); 4248 return NULL; 4249 } 4250 } else { 4251 gro_pull_from_frag0(skb, hlen); 4252 NAPI_GRO_CB(skb)->frag0 += hlen; 4253 NAPI_GRO_CB(skb)->frag0_len -= hlen; 4254 } 4255 __skb_pull(skb, hlen); 4256 4257 /* 4258 * This works because the only protocols we care about don't require 4259 * special handling. 4260 * We'll fix it up properly in napi_frags_finish() 4261 */ 4262 skb->protocol = eth->h_proto; 4263 4264 return skb; 4265 } 4266 4267 gro_result_t napi_gro_frags(struct napi_struct *napi) 4268 { 4269 struct sk_buff *skb = napi_frags_skb(napi); 4270 4271 if (!skb) 4272 return GRO_DROP; 4273 4274 trace_napi_gro_frags_entry(skb); 4275 4276 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); 4277 } 4278 EXPORT_SYMBOL(napi_gro_frags); 4279 4280 /* Compute the checksum from gro_offset and return the folded value 4281 * after adding in any pseudo checksum. 4282 */ 4283 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) 4284 { 4285 __wsum wsum; 4286 __sum16 sum; 4287 4288 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); 4289 4290 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ 4291 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); 4292 if (likely(!sum)) { 4293 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 4294 !skb->csum_complete_sw) 4295 netdev_rx_csum_fault(skb->dev); 4296 } 4297 4298 NAPI_GRO_CB(skb)->csum = wsum; 4299 NAPI_GRO_CB(skb)->csum_valid = 1; 4300 4301 return sum; 4302 } 4303 EXPORT_SYMBOL(__skb_gro_checksum_complete); 4304 4305 /* 4306 * net_rps_action_and_irq_enable sends any pending IPI's for rps. 4307 * Note: called with local irq disabled, but exits with local irq enabled. 4308 */ 4309 static void net_rps_action_and_irq_enable(struct softnet_data *sd) 4310 { 4311 #ifdef CONFIG_RPS 4312 struct softnet_data *remsd = sd->rps_ipi_list; 4313 4314 if (remsd) { 4315 sd->rps_ipi_list = NULL; 4316 4317 local_irq_enable(); 4318 4319 /* Send pending IPI's to kick RPS processing on remote cpus. 
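 * Each IPI runs rps_trigger_softirq() on the target cpu, which schedules
 * that cpu's backlog NAPI and raises NET_RX_SOFTIRQ there.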
*/ 4320 while (remsd) { 4321 struct softnet_data *next = remsd->rps_ipi_next; 4322 4323 if (cpu_online(remsd->cpu)) 4324 smp_call_function_single_async(remsd->cpu, 4325 &remsd->csd); 4326 remsd = next; 4327 } 4328 } else 4329 #endif 4330 local_irq_enable(); 4331 } 4332 4333 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) 4334 { 4335 #ifdef CONFIG_RPS 4336 return sd->rps_ipi_list != NULL; 4337 #else 4338 return false; 4339 #endif 4340 } 4341 4342 static int process_backlog(struct napi_struct *napi, int quota) 4343 { 4344 int work = 0; 4345 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); 4346 4347 /* Check if we have pending ipi, its better to send them now, 4348 * not waiting net_rx_action() end. 4349 */ 4350 if (sd_has_rps_ipi_waiting(sd)) { 4351 local_irq_disable(); 4352 net_rps_action_and_irq_enable(sd); 4353 } 4354 4355 napi->weight = weight_p; 4356 local_irq_disable(); 4357 while (1) { 4358 struct sk_buff *skb; 4359 4360 while ((skb = __skb_dequeue(&sd->process_queue))) { 4361 local_irq_enable(); 4362 __netif_receive_skb(skb); 4363 local_irq_disable(); 4364 input_queue_head_incr(sd); 4365 if (++work >= quota) { 4366 local_irq_enable(); 4367 return work; 4368 } 4369 } 4370 4371 rps_lock(sd); 4372 if (skb_queue_empty(&sd->input_pkt_queue)) { 4373 /* 4374 * Inline a custom version of __napi_complete(). 4375 * only current cpu owns and manipulates this napi, 4376 * and NAPI_STATE_SCHED is the only possible flag set 4377 * on backlog. 4378 * We can use a plain write instead of clear_bit(), 4379 * and we dont need an smp_mb() memory barrier. 4380 */ 4381 napi->state = 0; 4382 rps_unlock(sd); 4383 4384 break; 4385 } 4386 4387 skb_queue_splice_tail_init(&sd->input_pkt_queue, 4388 &sd->process_queue); 4389 rps_unlock(sd); 4390 } 4391 local_irq_enable(); 4392 4393 return work; 4394 } 4395 4396 /** 4397 * __napi_schedule - schedule for receive 4398 * @n: entry to schedule 4399 * 4400 * The entry's receive function will be scheduled to run. 4401 * Consider using __napi_schedule_irqoff() if hard irqs are masked. 
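 *
 * A common calling pattern (a sketch only, assuming a hypothetical driver
 * with a priv->napi instance) is to schedule from the interrupt handler
 * once device interrupts have been masked:
 *
 *	if (napi_schedule_prep(&priv->napi))
 *		__napi_schedule(&priv->napi);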
4402 */ 4403 void __napi_schedule(struct napi_struct *n) 4404 { 4405 unsigned long flags; 4406 4407 local_irq_save(flags); 4408 ____napi_schedule(this_cpu_ptr(&softnet_data), n); 4409 local_irq_restore(flags); 4410 } 4411 EXPORT_SYMBOL(__napi_schedule); 4412 4413 /** 4414 * __napi_schedule_irqoff - schedule for receive 4415 * @n: entry to schedule 4416 * 4417 * Variant of __napi_schedule() assuming hard irqs are masked 4418 */ 4419 void __napi_schedule_irqoff(struct napi_struct *n) 4420 { 4421 ____napi_schedule(this_cpu_ptr(&softnet_data), n); 4422 } 4423 EXPORT_SYMBOL(__napi_schedule_irqoff); 4424 4425 void __napi_complete(struct napi_struct *n) 4426 { 4427 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); 4428 4429 list_del_init(&n->poll_list); 4430 smp_mb__before_atomic(); 4431 clear_bit(NAPI_STATE_SCHED, &n->state); 4432 } 4433 EXPORT_SYMBOL(__napi_complete); 4434 4435 void napi_complete_done(struct napi_struct *n, int work_done) 4436 { 4437 unsigned long flags; 4438 4439 /* 4440 * don't let napi dequeue from the cpu poll list 4441 * just in case its running on a different cpu 4442 */ 4443 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) 4444 return; 4445 4446 if (n->gro_list) { 4447 unsigned long timeout = 0; 4448 4449 if (work_done) 4450 timeout = n->dev->gro_flush_timeout; 4451 4452 if (timeout) 4453 hrtimer_start(&n->timer, ns_to_ktime(timeout), 4454 HRTIMER_MODE_REL_PINNED); 4455 else 4456 napi_gro_flush(n, false); 4457 } 4458 if (likely(list_empty(&n->poll_list))) { 4459 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); 4460 } else { 4461 /* If n->poll_list is not empty, we need to mask irqs */ 4462 local_irq_save(flags); 4463 __napi_complete(n); 4464 local_irq_restore(flags); 4465 } 4466 } 4467 EXPORT_SYMBOL(napi_complete_done); 4468 4469 /* must be called under rcu_read_lock(), as we dont take a reference */ 4470 struct napi_struct *napi_by_id(unsigned int napi_id) 4471 { 4472 unsigned int hash = napi_id % HASH_SIZE(napi_hash); 4473 struct napi_struct *napi; 4474 4475 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) 4476 if (napi->napi_id == napi_id) 4477 return napi; 4478 4479 return NULL; 4480 } 4481 EXPORT_SYMBOL_GPL(napi_by_id); 4482 4483 void napi_hash_add(struct napi_struct *napi) 4484 { 4485 if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) { 4486 4487 spin_lock(&napi_hash_lock); 4488 4489 /* 0 is not a valid id, we also skip an id that is taken 4490 * we expect both events to be extremely rare 4491 */ 4492 napi->napi_id = 0; 4493 while (!napi->napi_id) { 4494 napi->napi_id = ++napi_gen_id; 4495 if (napi_by_id(napi->napi_id)) 4496 napi->napi_id = 0; 4497 } 4498 4499 hlist_add_head_rcu(&napi->napi_hash_node, 4500 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); 4501 4502 spin_unlock(&napi_hash_lock); 4503 } 4504 } 4505 EXPORT_SYMBOL_GPL(napi_hash_add); 4506 4507 /* Warning : caller is responsible to make sure rcu grace period 4508 * is respected before freeing memory containing @napi 4509 */ 4510 void napi_hash_del(struct napi_struct *napi) 4511 { 4512 spin_lock(&napi_hash_lock); 4513 4514 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) 4515 hlist_del_rcu(&napi->napi_hash_node); 4516 4517 spin_unlock(&napi_hash_lock); 4518 } 4519 EXPORT_SYMBOL_GPL(napi_hash_del); 4520 4521 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) 4522 { 4523 struct napi_struct *napi; 4524 4525 napi = container_of(timer, struct napi_struct, timer); 4526 if (napi->gro_list) 4527 napi_schedule(napi); 4528 4529 return HRTIMER_NORESTART; 
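	/* One-shot by design: napi_complete_done() re-arms this timer whenever
	 * gro_flush_timeout is set and packets are still held on gro_list.
	 */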
4530 } 4531 4532 void netif_napi_add(struct net_device *dev, struct napi_struct *napi, 4533 int (*poll)(struct napi_struct *, int), int weight) 4534 { 4535 INIT_LIST_HEAD(&napi->poll_list); 4536 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); 4537 napi->timer.function = napi_watchdog; 4538 napi->gro_count = 0; 4539 napi->gro_list = NULL; 4540 napi->skb = NULL; 4541 napi->poll = poll; 4542 if (weight > NAPI_POLL_WEIGHT) 4543 pr_err_once("netif_napi_add() called with weight %d on device %s\n", 4544 weight, dev->name); 4545 napi->weight = weight; 4546 list_add(&napi->dev_list, &dev->napi_list); 4547 napi->dev = dev; 4548 #ifdef CONFIG_NETPOLL 4549 spin_lock_init(&napi->poll_lock); 4550 napi->poll_owner = -1; 4551 #endif 4552 set_bit(NAPI_STATE_SCHED, &napi->state); 4553 } 4554 EXPORT_SYMBOL(netif_napi_add); 4555 4556 void napi_disable(struct napi_struct *n) 4557 { 4558 might_sleep(); 4559 set_bit(NAPI_STATE_DISABLE, &n->state); 4560 4561 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) 4562 msleep(1); 4563 4564 hrtimer_cancel(&n->timer); 4565 4566 clear_bit(NAPI_STATE_DISABLE, &n->state); 4567 } 4568 EXPORT_SYMBOL(napi_disable); 4569 4570 void netif_napi_del(struct napi_struct *napi) 4571 { 4572 list_del_init(&napi->dev_list); 4573 napi_free_frags(napi); 4574 4575 kfree_skb_list(napi->gro_list); 4576 napi->gro_list = NULL; 4577 napi->gro_count = 0; 4578 } 4579 EXPORT_SYMBOL(netif_napi_del); 4580 4581 static int napi_poll(struct napi_struct *n, struct list_head *repoll) 4582 { 4583 void *have; 4584 int work, weight; 4585 4586 list_del_init(&n->poll_list); 4587 4588 have = netpoll_poll_lock(n); 4589 4590 weight = n->weight; 4591 4592 /* This NAPI_STATE_SCHED test is for avoiding a race 4593 * with netpoll's poll_napi(). Only the entity which 4594 * obtains the lock and sees NAPI_STATE_SCHED set will 4595 * actually make the ->poll() call. Therefore we avoid 4596 * accidentally calling ->poll() when NAPI is not scheduled. 4597 */ 4598 work = 0; 4599 if (test_bit(NAPI_STATE_SCHED, &n->state)) { 4600 work = n->poll(n, weight); 4601 trace_napi_poll(n); 4602 } 4603 4604 WARN_ON_ONCE(work > weight); 4605 4606 if (likely(work < weight)) 4607 goto out_unlock; 4608 4609 /* Drivers must not modify the NAPI state if they 4610 * consume the entire weight. In such cases this code 4611 * still "owns" the NAPI instance and therefore can 4612 * move the instance around on the list at-will. 4613 */ 4614 if (unlikely(napi_disable_pending(n))) { 4615 napi_complete(n); 4616 goto out_unlock; 4617 } 4618 4619 if (n->gro_list) { 4620 /* flush too old packets 4621 * If HZ < 1000, flush all packets. 4622 */ 4623 napi_gro_flush(n, HZ >= 1000); 4624 } 4625 4626 /* Some drivers may have called napi_schedule 4627 * prior to exhausting their budget. 4628 */ 4629 if (unlikely(!list_empty(&n->poll_list))) { 4630 pr_warn_once("%s: Budget exhausted after napi rescheduled\n", 4631 n->dev ? 
n->dev->name : "backlog"); 4632 goto out_unlock; 4633 } 4634 4635 list_add_tail(&n->poll_list, repoll); 4636 4637 out_unlock: 4638 netpoll_poll_unlock(have); 4639 4640 return work; 4641 } 4642 4643 static void net_rx_action(struct softirq_action *h) 4644 { 4645 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 4646 unsigned long time_limit = jiffies + 2; 4647 int budget = netdev_budget; 4648 LIST_HEAD(list); 4649 LIST_HEAD(repoll); 4650 4651 local_irq_disable(); 4652 list_splice_init(&sd->poll_list, &list); 4653 local_irq_enable(); 4654 4655 for (;;) { 4656 struct napi_struct *n; 4657 4658 if (list_empty(&list)) { 4659 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) 4660 return; 4661 break; 4662 } 4663 4664 n = list_first_entry(&list, struct napi_struct, poll_list); 4665 budget -= napi_poll(n, &repoll); 4666 4667 /* If softirq window is exhausted then punt. 4668 * Allow this to run for 2 jiffies since which will allow 4669 * an average latency of 1.5/HZ. 4670 */ 4671 if (unlikely(budget <= 0 || 4672 time_after_eq(jiffies, time_limit))) { 4673 sd->time_squeeze++; 4674 break; 4675 } 4676 } 4677 4678 local_irq_disable(); 4679 4680 list_splice_tail_init(&sd->poll_list, &list); 4681 list_splice_tail(&repoll, &list); 4682 list_splice(&list, &sd->poll_list); 4683 if (!list_empty(&sd->poll_list)) 4684 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 4685 4686 net_rps_action_and_irq_enable(sd); 4687 } 4688 4689 struct netdev_adjacent { 4690 struct net_device *dev; 4691 4692 /* upper master flag, there can only be one master device per list */ 4693 bool master; 4694 4695 /* counter for the number of times this device was added to us */ 4696 u16 ref_nr; 4697 4698 /* private field for the users */ 4699 void *private; 4700 4701 struct list_head list; 4702 struct rcu_head rcu; 4703 }; 4704 4705 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, 4706 struct net_device *adj_dev, 4707 struct list_head *adj_list) 4708 { 4709 struct netdev_adjacent *adj; 4710 4711 list_for_each_entry(adj, adj_list, list) { 4712 if (adj->dev == adj_dev) 4713 return adj; 4714 } 4715 return NULL; 4716 } 4717 4718 /** 4719 * netdev_has_upper_dev - Check if device is linked to an upper device 4720 * @dev: device 4721 * @upper_dev: upper device to check 4722 * 4723 * Find out if a device is linked to specified upper device and return true 4724 * in case it is. Note that this checks only immediate upper device, 4725 * not through a complete stack of devices. The caller must hold the RTNL lock. 4726 */ 4727 bool netdev_has_upper_dev(struct net_device *dev, 4728 struct net_device *upper_dev) 4729 { 4730 ASSERT_RTNL(); 4731 4732 return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper); 4733 } 4734 EXPORT_SYMBOL(netdev_has_upper_dev); 4735 4736 /** 4737 * netdev_has_any_upper_dev - Check if device is linked to some device 4738 * @dev: device 4739 * 4740 * Find out if a device is linked to an upper device and return true in case 4741 * it is. The caller must hold the RTNL lock. 4742 */ 4743 static bool netdev_has_any_upper_dev(struct net_device *dev) 4744 { 4745 ASSERT_RTNL(); 4746 4747 return !list_empty(&dev->all_adj_list.upper); 4748 } 4749 4750 /** 4751 * netdev_master_upper_dev_get - Get master upper device 4752 * @dev: device 4753 * 4754 * Find a master upper device and return pointer to it or NULL in case 4755 * it's not there. The caller must hold the RTNL lock. 
4756 */ 4757 struct net_device *netdev_master_upper_dev_get(struct net_device *dev) 4758 { 4759 struct netdev_adjacent *upper; 4760 4761 ASSERT_RTNL(); 4762 4763 if (list_empty(&dev->adj_list.upper)) 4764 return NULL; 4765 4766 upper = list_first_entry(&dev->adj_list.upper, 4767 struct netdev_adjacent, list); 4768 if (likely(upper->master)) 4769 return upper->dev; 4770 return NULL; 4771 } 4772 EXPORT_SYMBOL(netdev_master_upper_dev_get); 4773 4774 void *netdev_adjacent_get_private(struct list_head *adj_list) 4775 { 4776 struct netdev_adjacent *adj; 4777 4778 adj = list_entry(adj_list, struct netdev_adjacent, list); 4779 4780 return adj->private; 4781 } 4782 EXPORT_SYMBOL(netdev_adjacent_get_private); 4783 4784 /** 4785 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list 4786 * @dev: device 4787 * @iter: list_head ** of the current position 4788 * 4789 * Gets the next device from the dev's upper list, starting from iter 4790 * position. The caller must hold RCU read lock. 4791 */ 4792 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, 4793 struct list_head **iter) 4794 { 4795 struct netdev_adjacent *upper; 4796 4797 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); 4798 4799 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 4800 4801 if (&upper->list == &dev->adj_list.upper) 4802 return NULL; 4803 4804 *iter = &upper->list; 4805 4806 return upper->dev; 4807 } 4808 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); 4809 4810 /** 4811 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list 4812 * @dev: device 4813 * @iter: list_head ** of the current position 4814 * 4815 * Gets the next device from the dev's upper list, starting from iter 4816 * position. The caller must hold RCU read lock. 4817 */ 4818 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, 4819 struct list_head **iter) 4820 { 4821 struct netdev_adjacent *upper; 4822 4823 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); 4824 4825 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 4826 4827 if (&upper->list == &dev->all_adj_list.upper) 4828 return NULL; 4829 4830 *iter = &upper->list; 4831 4832 return upper->dev; 4833 } 4834 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); 4835 4836 /** 4837 * netdev_lower_get_next_private - Get the next ->private from the 4838 * lower neighbour list 4839 * @dev: device 4840 * @iter: list_head ** of the current position 4841 * 4842 * Gets the next netdev_adjacent->private from the dev's lower neighbour 4843 * list, starting from iter position. The caller must hold either the 4844 * RTNL lock or its own locking that guarantees that the neighbour lower 4845 * list will remain unchanged. 4846 */ 4847 void *netdev_lower_get_next_private(struct net_device *dev, 4848 struct list_head **iter) 4849 { 4850 struct netdev_adjacent *lower; 4851 4852 lower = list_entry(*iter, struct netdev_adjacent, list); 4853 4854 if (&lower->list == &dev->adj_list.lower) 4855 return NULL; 4856 4857 *iter = lower->list.next; 4858 4859 return lower->private; 4860 } 4861 EXPORT_SYMBOL(netdev_lower_get_next_private); 4862 4863 /** 4864 * netdev_lower_get_next_private_rcu - Get the next ->private from the 4865 * lower neighbour list, RCU 4866 * variant 4867 * @dev: device 4868 * @iter: list_head ** of the current position 4869 * 4870 * Gets the next netdev_adjacent->private from the dev's lower neighbour 4871 * list, starting from iter position. The caller must hold RCU read lock.
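 *
 * Callers normally iterate with the netdev_for_each_lower_private_rcu()
 * helper rather than calling this function directly.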
4872 */ 4873 void *netdev_lower_get_next_private_rcu(struct net_device *dev, 4874 struct list_head **iter) 4875 { 4876 struct netdev_adjacent *lower; 4877 4878 WARN_ON_ONCE(!rcu_read_lock_held()); 4879 4880 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 4881 4882 if (&lower->list == &dev->adj_list.lower) 4883 return NULL; 4884 4885 *iter = &lower->list; 4886 4887 return lower->private; 4888 } 4889 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); 4890 4891 /** 4892 * netdev_lower_get_next - Get the next device from the lower neighbour 4893 * list 4894 * @dev: device 4895 * @iter: list_head ** of the current position 4896 * 4897 * Gets the next netdev_adjacent from the dev's lower neighbour 4898 * list, starting from iter position. The caller must hold the RTNL lock or 4899 * its own locking that guarantees that the neighbour lower 4900 * list will remain unchanged. 4901 */ 4902 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) 4903 { 4904 struct netdev_adjacent *lower; 4905 4906 lower = list_entry((*iter)->next, struct netdev_adjacent, list); 4907 4908 if (&lower->list == &dev->adj_list.lower) 4909 return NULL; 4910 4911 *iter = &lower->list; 4912 4913 return lower->dev; 4914 } 4915 EXPORT_SYMBOL(netdev_lower_get_next); 4916 4917 /** 4918 * netdev_lower_get_first_private_rcu - Get the first ->private from the 4919 * lower neighbour list, RCU 4920 * variant 4921 * @dev: device 4922 * 4923 * Gets the first netdev_adjacent->private from the dev's lower neighbour 4924 * list. The caller must hold RCU read lock. 4925 */ 4926 void *netdev_lower_get_first_private_rcu(struct net_device *dev) 4927 { 4928 struct netdev_adjacent *lower; 4929 4930 lower = list_first_or_null_rcu(&dev->adj_list.lower, 4931 struct netdev_adjacent, list); 4932 if (lower) 4933 return lower->private; 4934 return NULL; 4935 } 4936 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); 4937 4938 /** 4939 * netdev_master_upper_dev_get_rcu - Get master upper device 4940 * @dev: device 4941 * 4942 * Find a master upper device and return pointer to it or NULL in case 4943 * it's not there. The caller must hold the RCU read lock. 4944 */ 4945 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) 4946 { 4947 struct netdev_adjacent *upper; 4948 4949 upper = list_first_or_null_rcu(&dev->adj_list.upper, 4950 struct netdev_adjacent, list); 4951 if (upper && likely(upper->master)) 4952 return upper->dev; 4953 return NULL; 4954 } 4955 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); 4956 4957 static int netdev_adjacent_sysfs_add(struct net_device *dev, 4958 struct net_device *adj_dev, 4959 struct list_head *dev_list) 4960 { 4961 char linkname[IFNAMSIZ+7]; 4962 sprintf(linkname, dev_list == &dev->adj_list.upper ? 4963 "upper_%s" : "lower_%s", adj_dev->name); 4964 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), 4965 linkname); 4966 } 4967 static void netdev_adjacent_sysfs_del(struct net_device *dev, 4968 char *name, 4969 struct list_head *dev_list) 4970 { 4971 char linkname[IFNAMSIZ+7]; 4972 sprintf(linkname, dev_list == &dev->adj_list.upper ?
4973 "upper_%s" : "lower_%s", name); 4974 sysfs_remove_link(&(dev->dev.kobj), linkname); 4975 } 4976 4977 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, 4978 struct net_device *adj_dev, 4979 struct list_head *dev_list) 4980 { 4981 return (dev_list == &dev->adj_list.upper || 4982 dev_list == &dev->adj_list.lower) && 4983 net_eq(dev_net(dev), dev_net(adj_dev)); 4984 } 4985 4986 static int __netdev_adjacent_dev_insert(struct net_device *dev, 4987 struct net_device *adj_dev, 4988 struct list_head *dev_list, 4989 void *private, bool master) 4990 { 4991 struct netdev_adjacent *adj; 4992 int ret; 4993 4994 adj = __netdev_find_adj(dev, adj_dev, dev_list); 4995 4996 if (adj) { 4997 adj->ref_nr++; 4998 return 0; 4999 } 5000 5001 adj = kmalloc(sizeof(*adj), GFP_KERNEL); 5002 if (!adj) 5003 return -ENOMEM; 5004 5005 adj->dev = adj_dev; 5006 adj->master = master; 5007 adj->ref_nr = 1; 5008 adj->private = private; 5009 dev_hold(adj_dev); 5010 5011 pr_debug("dev_hold for %s, because of link added from %s to %s\n", 5012 adj_dev->name, dev->name, adj_dev->name); 5013 5014 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { 5015 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); 5016 if (ret) 5017 goto free_adj; 5018 } 5019 5020 /* Ensure that master link is always the first item in list. */ 5021 if (master) { 5022 ret = sysfs_create_link(&(dev->dev.kobj), 5023 &(adj_dev->dev.kobj), "master"); 5024 if (ret) 5025 goto remove_symlinks; 5026 5027 list_add_rcu(&adj->list, dev_list); 5028 } else { 5029 list_add_tail_rcu(&adj->list, dev_list); 5030 } 5031 5032 return 0; 5033 5034 remove_symlinks: 5035 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) 5036 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); 5037 free_adj: 5038 kfree(adj); 5039 dev_put(adj_dev); 5040 5041 return ret; 5042 } 5043 5044 static void __netdev_adjacent_dev_remove(struct net_device *dev, 5045 struct net_device *adj_dev, 5046 struct list_head *dev_list) 5047 { 5048 struct netdev_adjacent *adj; 5049 5050 adj = __netdev_find_adj(dev, adj_dev, dev_list); 5051 5052 if (!adj) { 5053 pr_err("tried to remove device %s from %s\n", 5054 dev->name, adj_dev->name); 5055 BUG(); 5056 } 5057 5058 if (adj->ref_nr > 1) { 5059 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name, 5060 adj->ref_nr-1); 5061 adj->ref_nr--; 5062 return; 5063 } 5064 5065 if (adj->master) 5066 sysfs_remove_link(&(dev->dev.kobj), "master"); 5067 5068 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) 5069 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); 5070 5071 list_del_rcu(&adj->list); 5072 pr_debug("dev_put for %s, because link removed from %s to %s\n", 5073 adj_dev->name, dev->name, adj_dev->name); 5074 dev_put(adj_dev); 5075 kfree_rcu(adj, rcu); 5076 } 5077 5078 static int __netdev_adjacent_dev_link_lists(struct net_device *dev, 5079 struct net_device *upper_dev, 5080 struct list_head *up_list, 5081 struct list_head *down_list, 5082 void *private, bool master) 5083 { 5084 int ret; 5085 5086 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, 5087 master); 5088 if (ret) 5089 return ret; 5090 5091 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, 5092 false); 5093 if (ret) { 5094 __netdev_adjacent_dev_remove(dev, upper_dev, up_list); 5095 return ret; 5096 } 5097 5098 return 0; 5099 } 5100 5101 static int __netdev_adjacent_dev_link(struct net_device *dev, 5102 struct net_device *upper_dev) 5103 { 5104 return __netdev_adjacent_dev_link_lists(dev, upper_dev, 5105 
&dev->all_adj_list.upper, 5106 &upper_dev->all_adj_list.lower, 5107 NULL, false); 5108 } 5109 5110 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, 5111 struct net_device *upper_dev, 5112 struct list_head *up_list, 5113 struct list_head *down_list) 5114 { 5115 __netdev_adjacent_dev_remove(dev, upper_dev, up_list); 5116 __netdev_adjacent_dev_remove(upper_dev, dev, down_list); 5117 } 5118 5119 static void __netdev_adjacent_dev_unlink(struct net_device *dev, 5120 struct net_device *upper_dev) 5121 { 5122 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 5123 &dev->all_adj_list.upper, 5124 &upper_dev->all_adj_list.lower); 5125 } 5126 5127 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, 5128 struct net_device *upper_dev, 5129 void *private, bool master) 5130 { 5131 int ret = __netdev_adjacent_dev_link(dev, upper_dev); 5132 5133 if (ret) 5134 return ret; 5135 5136 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 5137 &dev->adj_list.upper, 5138 &upper_dev->adj_list.lower, 5139 private, master); 5140 if (ret) { 5141 __netdev_adjacent_dev_unlink(dev, upper_dev); 5142 return ret; 5143 } 5144 5145 return 0; 5146 } 5147 5148 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, 5149 struct net_device *upper_dev) 5150 { 5151 __netdev_adjacent_dev_unlink(dev, upper_dev); 5152 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 5153 &dev->adj_list.upper, 5154 &upper_dev->adj_list.lower); 5155 } 5156 5157 static int __netdev_upper_dev_link(struct net_device *dev, 5158 struct net_device *upper_dev, bool master, 5159 void *private) 5160 { 5161 struct netdev_adjacent *i, *j, *to_i, *to_j; 5162 int ret = 0; 5163 5164 ASSERT_RTNL(); 5165 5166 if (dev == upper_dev) 5167 return -EBUSY; 5168 5169 /* To prevent loops, check if dev is not upper device to upper_dev. */ 5170 if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper)) 5171 return -EBUSY; 5172 5173 if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper)) 5174 return -EEXIST; 5175 5176 if (master && netdev_master_upper_dev_get(dev)) 5177 return -EBUSY; 5178 5179 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, 5180 master); 5181 if (ret) 5182 return ret; 5183 5184 /* Now that we linked these devs, make all the upper_dev's 5185 * all_adj_list.upper visible to every dev's all_adj_list.lower an 5186 * versa, and don't forget the devices itself. All of these 5187 * links are non-neighbours. 
5188 */ 5189 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5190 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { 5191 pr_debug("Interlinking %s with %s, non-neighbour\n", 5192 i->dev->name, j->dev->name); 5193 ret = __netdev_adjacent_dev_link(i->dev, j->dev); 5194 if (ret) 5195 goto rollback_mesh; 5196 } 5197 } 5198 5199 /* add dev to every upper_dev's upper device */ 5200 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { 5201 pr_debug("linking %s's upper device %s with %s\n", 5202 upper_dev->name, i->dev->name, dev->name); 5203 ret = __netdev_adjacent_dev_link(dev, i->dev); 5204 if (ret) 5205 goto rollback_upper_mesh; 5206 } 5207 5208 /* add upper_dev to every dev's lower device */ 5209 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5210 pr_debug("linking %s's lower device %s with %s\n", dev->name, 5211 i->dev->name, upper_dev->name); 5212 ret = __netdev_adjacent_dev_link(i->dev, upper_dev); 5213 if (ret) 5214 goto rollback_lower_mesh; 5215 } 5216 5217 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); 5218 return 0; 5219 5220 rollback_lower_mesh: 5221 to_i = i; 5222 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5223 if (i == to_i) 5224 break; 5225 __netdev_adjacent_dev_unlink(i->dev, upper_dev); 5226 } 5227 5228 i = NULL; 5229 5230 rollback_upper_mesh: 5231 to_i = i; 5232 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { 5233 if (i == to_i) 5234 break; 5235 __netdev_adjacent_dev_unlink(dev, i->dev); 5236 } 5237 5238 i = j = NULL; 5239 5240 rollback_mesh: 5241 to_i = i; 5242 to_j = j; 5243 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5244 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { 5245 if (i == to_i && j == to_j) 5246 break; 5247 __netdev_adjacent_dev_unlink(i->dev, j->dev); 5248 } 5249 if (i == to_i) 5250 break; 5251 } 5252 5253 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); 5254 5255 return ret; 5256 } 5257 5258 /** 5259 * netdev_upper_dev_link - Add a link to the upper device 5260 * @dev: device 5261 * @upper_dev: new upper device 5262 * 5263 * Adds a link to device which is upper to this one. The caller must hold 5264 * the RTNL lock. On a failure a negative errno code is returned. 5265 * On success the reference counts are adjusted and the function 5266 * returns zero. 5267 */ 5268 int netdev_upper_dev_link(struct net_device *dev, 5269 struct net_device *upper_dev) 5270 { 5271 return __netdev_upper_dev_link(dev, upper_dev, false, NULL); 5272 } 5273 EXPORT_SYMBOL(netdev_upper_dev_link); 5274 5275 /** 5276 * netdev_master_upper_dev_link - Add a master link to the upper device 5277 * @dev: device 5278 * @upper_dev: new upper device 5279 * 5280 * Adds a link to device which is upper to this one. In this case, only 5281 * one master upper device can be linked, although other non-master devices 5282 * might be linked as well. The caller must hold the RTNL lock. 5283 * On a failure a negative errno code is returned. On success the reference 5284 * counts are adjusted and the function returns zero. 
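 *
 * For example, a bonding-style driver enslaving a device would typically
 * do (sketch): err = netdev_master_upper_dev_link(slave_dev, bond_dev);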
5285 */ 5286 int netdev_master_upper_dev_link(struct net_device *dev, 5287 struct net_device *upper_dev) 5288 { 5289 return __netdev_upper_dev_link(dev, upper_dev, true, NULL); 5290 } 5291 EXPORT_SYMBOL(netdev_master_upper_dev_link); 5292 5293 int netdev_master_upper_dev_link_private(struct net_device *dev, 5294 struct net_device *upper_dev, 5295 void *private) 5296 { 5297 return __netdev_upper_dev_link(dev, upper_dev, true, private); 5298 } 5299 EXPORT_SYMBOL(netdev_master_upper_dev_link_private); 5300 5301 /** 5302 * netdev_upper_dev_unlink - Removes a link to upper device 5303 * @dev: device 5304 * @upper_dev: new upper device 5305 * 5306 * Removes a link to device which is upper to this one. The caller must hold 5307 * the RTNL lock. 5308 */ 5309 void netdev_upper_dev_unlink(struct net_device *dev, 5310 struct net_device *upper_dev) 5311 { 5312 struct netdev_adjacent *i, *j; 5313 ASSERT_RTNL(); 5314 5315 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); 5316 5317 /* Here is the tricky part. We must remove all dev's lower 5318 * devices from all upper_dev's upper devices and vice 5319 * versa, to maintain the graph relationship. 5320 */ 5321 list_for_each_entry(i, &dev->all_adj_list.lower, list) 5322 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) 5323 __netdev_adjacent_dev_unlink(i->dev, j->dev); 5324 5325 /* remove also the devices itself from lower/upper device 5326 * list 5327 */ 5328 list_for_each_entry(i, &dev->all_adj_list.lower, list) 5329 __netdev_adjacent_dev_unlink(i->dev, upper_dev); 5330 5331 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) 5332 __netdev_adjacent_dev_unlink(dev, i->dev); 5333 5334 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); 5335 } 5336 EXPORT_SYMBOL(netdev_upper_dev_unlink); 5337 5338 /** 5339 * netdev_bonding_info_change - Dispatch event about slave change 5340 * @dev: device 5341 * @bonding_info: info to dispatch 5342 * 5343 * Send NETDEV_BONDING_INFO to netdev notifiers with info. 5344 * The caller must hold the RTNL lock. 
5345 */ 5346 void netdev_bonding_info_change(struct net_device *dev, 5347 struct netdev_bonding_info *bonding_info) 5348 { 5349 struct netdev_notifier_bonding_info info; 5350 5351 memcpy(&info.bonding_info, bonding_info, 5352 sizeof(struct netdev_bonding_info)); 5353 call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev, 5354 &info.info); 5355 } 5356 EXPORT_SYMBOL(netdev_bonding_info_change); 5357 5358 static void netdev_adjacent_add_links(struct net_device *dev) 5359 { 5360 struct netdev_adjacent *iter; 5361 5362 struct net *net = dev_net(dev); 5363 5364 list_for_each_entry(iter, &dev->adj_list.upper, list) { 5365 if (!net_eq(net,dev_net(iter->dev))) 5366 continue; 5367 netdev_adjacent_sysfs_add(iter->dev, dev, 5368 &iter->dev->adj_list.lower); 5369 netdev_adjacent_sysfs_add(dev, iter->dev, 5370 &dev->adj_list.upper); 5371 } 5372 5373 list_for_each_entry(iter, &dev->adj_list.lower, list) { 5374 if (!net_eq(net,dev_net(iter->dev))) 5375 continue; 5376 netdev_adjacent_sysfs_add(iter->dev, dev, 5377 &iter->dev->adj_list.upper); 5378 netdev_adjacent_sysfs_add(dev, iter->dev, 5379 &dev->adj_list.lower); 5380 } 5381 } 5382 5383 static void netdev_adjacent_del_links(struct net_device *dev) 5384 { 5385 struct netdev_adjacent *iter; 5386 5387 struct net *net = dev_net(dev); 5388 5389 list_for_each_entry(iter, &dev->adj_list.upper, list) { 5390 if (!net_eq(net,dev_net(iter->dev))) 5391 continue; 5392 netdev_adjacent_sysfs_del(iter->dev, dev->name, 5393 &iter->dev->adj_list.lower); 5394 netdev_adjacent_sysfs_del(dev, iter->dev->name, 5395 &dev->adj_list.upper); 5396 } 5397 5398 list_for_each_entry(iter, &dev->adj_list.lower, list) { 5399 if (!net_eq(net,dev_net(iter->dev))) 5400 continue; 5401 netdev_adjacent_sysfs_del(iter->dev, dev->name, 5402 &iter->dev->adj_list.upper); 5403 netdev_adjacent_sysfs_del(dev, iter->dev->name, 5404 &dev->adj_list.lower); 5405 } 5406 } 5407 5408 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) 5409 { 5410 struct netdev_adjacent *iter; 5411 5412 struct net *net = dev_net(dev); 5413 5414 list_for_each_entry(iter, &dev->adj_list.upper, list) { 5415 if (!net_eq(net,dev_net(iter->dev))) 5416 continue; 5417 netdev_adjacent_sysfs_del(iter->dev, oldname, 5418 &iter->dev->adj_list.lower); 5419 netdev_adjacent_sysfs_add(iter->dev, dev, 5420 &iter->dev->adj_list.lower); 5421 } 5422 5423 list_for_each_entry(iter, &dev->adj_list.lower, list) { 5424 if (!net_eq(net,dev_net(iter->dev))) 5425 continue; 5426 netdev_adjacent_sysfs_del(iter->dev, oldname, 5427 &iter->dev->adj_list.upper); 5428 netdev_adjacent_sysfs_add(iter->dev, dev, 5429 &iter->dev->adj_list.upper); 5430 } 5431 } 5432 5433 void *netdev_lower_dev_get_private(struct net_device *dev, 5434 struct net_device *lower_dev) 5435 { 5436 struct netdev_adjacent *lower; 5437 5438 if (!lower_dev) 5439 return NULL; 5440 lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower); 5441 if (!lower) 5442 return NULL; 5443 5444 return lower->private; 5445 } 5446 EXPORT_SYMBOL(netdev_lower_dev_get_private); 5447 5448 5449 int dev_get_nest_level(struct net_device *dev, 5450 bool (*type_check)(struct net_device *dev)) 5451 { 5452 struct net_device *lower = NULL; 5453 struct list_head *iter; 5454 int max_nest = -1; 5455 int nest; 5456 5457 ASSERT_RTNL(); 5458 5459 netdev_for_each_lower_dev(dev, lower, iter) { 5460 nest = dev_get_nest_level(lower, type_check); 5461 if (max_nest < nest) 5462 max_nest = nest; 5463 } 5464 5465 if (type_check(dev)) 5466 max_nest++; 5467 5468 return max_nest; 5469 } 5470 
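/* Example (sketch): a stacking driver such as macvlan can size its lockdep
 * nesting depth from dev_get_nest_level(lowerdev, netif_is_macvlan).
 */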
EXPORT_SYMBOL(dev_get_nest_level); 5471 5472 static void dev_change_rx_flags(struct net_device *dev, int flags) 5473 { 5474 const struct net_device_ops *ops = dev->netdev_ops; 5475 5476 if (ops->ndo_change_rx_flags) 5477 ops->ndo_change_rx_flags(dev, flags); 5478 } 5479 5480 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) 5481 { 5482 unsigned int old_flags = dev->flags; 5483 kuid_t uid; 5484 kgid_t gid; 5485 5486 ASSERT_RTNL(); 5487 5488 dev->flags |= IFF_PROMISC; 5489 dev->promiscuity += inc; 5490 if (dev->promiscuity == 0) { 5491 /* 5492 * Avoid overflow. 5493 * If inc causes overflow, untouch promisc and return error. 5494 */ 5495 if (inc < 0) 5496 dev->flags &= ~IFF_PROMISC; 5497 else { 5498 dev->promiscuity -= inc; 5499 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n", 5500 dev->name); 5501 return -EOVERFLOW; 5502 } 5503 } 5504 if (dev->flags != old_flags) { 5505 pr_info("device %s %s promiscuous mode\n", 5506 dev->name, 5507 dev->flags & IFF_PROMISC ? "entered" : "left"); 5508 if (audit_enabled) { 5509 current_uid_gid(&uid, &gid); 5510 audit_log(current->audit_context, GFP_ATOMIC, 5511 AUDIT_ANOM_PROMISCUOUS, 5512 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", 5513 dev->name, (dev->flags & IFF_PROMISC), 5514 (old_flags & IFF_PROMISC), 5515 from_kuid(&init_user_ns, audit_get_loginuid(current)), 5516 from_kuid(&init_user_ns, uid), 5517 from_kgid(&init_user_ns, gid), 5518 audit_get_sessionid(current)); 5519 } 5520 5521 dev_change_rx_flags(dev, IFF_PROMISC); 5522 } 5523 if (notify) 5524 __dev_notify_flags(dev, old_flags, IFF_PROMISC); 5525 return 0; 5526 } 5527 5528 /** 5529 * dev_set_promiscuity - update promiscuity count on a device 5530 * @dev: device 5531 * @inc: modifier 5532 * 5533 * Add or remove promiscuity from a device. While the count in the device 5534 * remains above zero the interface remains promiscuous. Once it hits zero 5535 * the device reverts back to normal filtering operation. A negative inc 5536 * value is used to drop promiscuity on the device. 5537 * Return 0 if successful or a negative errno code on error. 5538 */ 5539 int dev_set_promiscuity(struct net_device *dev, int inc) 5540 { 5541 unsigned int old_flags = dev->flags; 5542 int err; 5543 5544 err = __dev_set_promiscuity(dev, inc, true); 5545 if (err < 0) 5546 return err; 5547 if (dev->flags != old_flags) 5548 dev_set_rx_mode(dev); 5549 return err; 5550 } 5551 EXPORT_SYMBOL(dev_set_promiscuity); 5552 5553 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) 5554 { 5555 unsigned int old_flags = dev->flags, old_gflags = dev->gflags; 5556 5557 ASSERT_RTNL(); 5558 5559 dev->flags |= IFF_ALLMULTI; 5560 dev->allmulti += inc; 5561 if (dev->allmulti == 0) { 5562 /* 5563 * Avoid overflow. 5564 * If inc causes overflow, untouch allmulti and return error. 5565 */ 5566 if (inc < 0) 5567 dev->flags &= ~IFF_ALLMULTI; 5568 else { 5569 dev->allmulti -= inc; 5570 pr_warn("%s: allmulti touches roof, set allmulti failed. 
allmulti feature of device might be broken.\n", 5571 dev->name); 5572 return -EOVERFLOW; 5573 } 5574 } 5575 if (dev->flags ^ old_flags) { 5576 dev_change_rx_flags(dev, IFF_ALLMULTI); 5577 dev_set_rx_mode(dev); 5578 if (notify) 5579 __dev_notify_flags(dev, old_flags, 5580 dev->gflags ^ old_gflags); 5581 } 5582 return 0; 5583 } 5584 5585 /** 5586 * dev_set_allmulti - update allmulti count on a device 5587 * @dev: device 5588 * @inc: modifier 5589 * 5590 * Add or remove reception of all multicast frames to a device. While the 5591 * count in the device remains above zero the interface remains listening 5592 * to all interfaces. Once it hits zero the device reverts back to normal 5593 * filtering operation. A negative @inc value is used to drop the counter 5594 * when releasing a resource needing all multicasts. 5595 * Return 0 if successful or a negative errno code on error. 5596 */ 5597 5598 int dev_set_allmulti(struct net_device *dev, int inc) 5599 { 5600 return __dev_set_allmulti(dev, inc, true); 5601 } 5602 EXPORT_SYMBOL(dev_set_allmulti); 5603 5604 /* 5605 * Upload unicast and multicast address lists to device and 5606 * configure RX filtering. When the device doesn't support unicast 5607 * filtering it is put in promiscuous mode while unicast addresses 5608 * are present. 5609 */ 5610 void __dev_set_rx_mode(struct net_device *dev) 5611 { 5612 const struct net_device_ops *ops = dev->netdev_ops; 5613 5614 /* dev_open will call this function so the list will stay sane. */ 5615 if (!(dev->flags&IFF_UP)) 5616 return; 5617 5618 if (!netif_device_present(dev)) 5619 return; 5620 5621 if (!(dev->priv_flags & IFF_UNICAST_FLT)) { 5622 /* Unicast addresses changes may only happen under the rtnl, 5623 * therefore calling __dev_set_promiscuity here is safe. 5624 */ 5625 if (!netdev_uc_empty(dev) && !dev->uc_promisc) { 5626 __dev_set_promiscuity(dev, 1, false); 5627 dev->uc_promisc = true; 5628 } else if (netdev_uc_empty(dev) && dev->uc_promisc) { 5629 __dev_set_promiscuity(dev, -1, false); 5630 dev->uc_promisc = false; 5631 } 5632 } 5633 5634 if (ops->ndo_set_rx_mode) 5635 ops->ndo_set_rx_mode(dev); 5636 } 5637 5638 void dev_set_rx_mode(struct net_device *dev) 5639 { 5640 netif_addr_lock_bh(dev); 5641 __dev_set_rx_mode(dev); 5642 netif_addr_unlock_bh(dev); 5643 } 5644 5645 /** 5646 * dev_get_flags - get flags reported to userspace 5647 * @dev: device 5648 * 5649 * Get the combination of flag bits exported through APIs to userspace. 5650 */ 5651 unsigned int dev_get_flags(const struct net_device *dev) 5652 { 5653 unsigned int flags; 5654 5655 flags = (dev->flags & ~(IFF_PROMISC | 5656 IFF_ALLMULTI | 5657 IFF_RUNNING | 5658 IFF_LOWER_UP | 5659 IFF_DORMANT)) | 5660 (dev->gflags & (IFF_PROMISC | 5661 IFF_ALLMULTI)); 5662 5663 if (netif_running(dev)) { 5664 if (netif_oper_up(dev)) 5665 flags |= IFF_RUNNING; 5666 if (netif_carrier_ok(dev)) 5667 flags |= IFF_LOWER_UP; 5668 if (netif_dormant(dev)) 5669 flags |= IFF_DORMANT; 5670 } 5671 5672 return flags; 5673 } 5674 EXPORT_SYMBOL(dev_get_flags); 5675 5676 int __dev_change_flags(struct net_device *dev, unsigned int flags) 5677 { 5678 unsigned int old_flags = dev->flags; 5679 int ret; 5680 5681 ASSERT_RTNL(); 5682 5683 /* 5684 * Set the flags on our device. 
5685 */ 5686 5687 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | 5688 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | 5689 IFF_AUTOMEDIA)) | 5690 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | 5691 IFF_ALLMULTI)); 5692 5693 /* 5694 * Load in the correct multicast list now the flags have changed. 5695 */ 5696 5697 if ((old_flags ^ flags) & IFF_MULTICAST) 5698 dev_change_rx_flags(dev, IFF_MULTICAST); 5699 5700 dev_set_rx_mode(dev); 5701 5702 /* 5703 * Have we downed the interface. We handle IFF_UP ourselves 5704 * according to user attempts to set it, rather than blindly 5705 * setting it. 5706 */ 5707 5708 ret = 0; 5709 if ((old_flags ^ flags) & IFF_UP) 5710 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); 5711 5712 if ((flags ^ dev->gflags) & IFF_PROMISC) { 5713 int inc = (flags & IFF_PROMISC) ? 1 : -1; 5714 unsigned int old_flags = dev->flags; 5715 5716 dev->gflags ^= IFF_PROMISC; 5717 5718 if (__dev_set_promiscuity(dev, inc, false) >= 0) 5719 if (dev->flags != old_flags) 5720 dev_set_rx_mode(dev); 5721 } 5722 5723 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI 5724 is important. Some (broken) drivers set IFF_PROMISC, when 5725 IFF_ALLMULTI is requested not asking us and not reporting. 5726 */ 5727 if ((flags ^ dev->gflags) & IFF_ALLMULTI) { 5728 int inc = (flags & IFF_ALLMULTI) ? 1 : -1; 5729 5730 dev->gflags ^= IFF_ALLMULTI; 5731 __dev_set_allmulti(dev, inc, false); 5732 } 5733 5734 return ret; 5735 } 5736 5737 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, 5738 unsigned int gchanges) 5739 { 5740 unsigned int changes = dev->flags ^ old_flags; 5741 5742 if (gchanges) 5743 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); 5744 5745 if (changes & IFF_UP) { 5746 if (dev->flags & IFF_UP) 5747 call_netdevice_notifiers(NETDEV_UP, dev); 5748 else 5749 call_netdevice_notifiers(NETDEV_DOWN, dev); 5750 } 5751 5752 if (dev->flags & IFF_UP && 5753 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { 5754 struct netdev_notifier_change_info change_info; 5755 5756 change_info.flags_changed = changes; 5757 call_netdevice_notifiers_info(NETDEV_CHANGE, dev, 5758 &change_info.info); 5759 } 5760 } 5761 5762 /** 5763 * dev_change_flags - change device settings 5764 * @dev: device 5765 * @flags: device state flags 5766 * 5767 * Change settings on device based state flags. The flags are 5768 * in the userspace exported format. 5769 */ 5770 int dev_change_flags(struct net_device *dev, unsigned int flags) 5771 { 5772 int ret; 5773 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; 5774 5775 ret = __dev_change_flags(dev, flags); 5776 if (ret < 0) 5777 return ret; 5778 5779 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); 5780 __dev_notify_flags(dev, old_flags, changes); 5781 return ret; 5782 } 5783 EXPORT_SYMBOL(dev_change_flags); 5784 5785 static int __dev_set_mtu(struct net_device *dev, int new_mtu) 5786 { 5787 const struct net_device_ops *ops = dev->netdev_ops; 5788 5789 if (ops->ndo_change_mtu) 5790 return ops->ndo_change_mtu(dev, new_mtu); 5791 5792 dev->mtu = new_mtu; 5793 return 0; 5794 } 5795 5796 /** 5797 * dev_set_mtu - Change maximum transfer unit 5798 * @dev: device 5799 * @new_mtu: new transfer unit 5800 * 5801 * Change the maximum transfer size of the network device. 5802 */ 5803 int dev_set_mtu(struct net_device *dev, int new_mtu) 5804 { 5805 int err, orig_mtu; 5806 5807 if (new_mtu == dev->mtu) 5808 return 0; 5809 5810 /* MTU must be positive. 
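 * A driver's ndo_change_mtu() may still reject values its hardware cannot
 * handle, and the NETDEV_PRECHANGEMTU notifiers below can veto the change.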
*/ 5811 if (new_mtu < 0) 5812 return -EINVAL; 5813 5814 if (!netif_device_present(dev)) 5815 return -ENODEV; 5816 5817 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); 5818 err = notifier_to_errno(err); 5819 if (err) 5820 return err; 5821 5822 orig_mtu = dev->mtu; 5823 err = __dev_set_mtu(dev, new_mtu); 5824 5825 if (!err) { 5826 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); 5827 err = notifier_to_errno(err); 5828 if (err) { 5829 /* setting mtu back and notifying everyone again, 5830 * so that they have a chance to revert changes. 5831 */ 5832 __dev_set_mtu(dev, orig_mtu); 5833 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); 5834 } 5835 } 5836 return err; 5837 } 5838 EXPORT_SYMBOL(dev_set_mtu); 5839 5840 /** 5841 * dev_set_group - Change group this device belongs to 5842 * @dev: device 5843 * @new_group: group this device should belong to 5844 */ 5845 void dev_set_group(struct net_device *dev, int new_group) 5846 { 5847 dev->group = new_group; 5848 } 5849 EXPORT_SYMBOL(dev_set_group); 5850 5851 /** 5852 * dev_set_mac_address - Change Media Access Control Address 5853 * @dev: device 5854 * @sa: new address 5855 * 5856 * Change the hardware (MAC) address of the device 5857 */ 5858 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) 5859 { 5860 const struct net_device_ops *ops = dev->netdev_ops; 5861 int err; 5862 5863 if (!ops->ndo_set_mac_address) 5864 return -EOPNOTSUPP; 5865 if (sa->sa_family != dev->type) 5866 return -EINVAL; 5867 if (!netif_device_present(dev)) 5868 return -ENODEV; 5869 err = ops->ndo_set_mac_address(dev, sa); 5870 if (err) 5871 return err; 5872 dev->addr_assign_type = NET_ADDR_SET; 5873 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 5874 add_device_randomness(dev->dev_addr, dev->addr_len); 5875 return 0; 5876 } 5877 EXPORT_SYMBOL(dev_set_mac_address); 5878 5879 /** 5880 * dev_change_carrier - Change device carrier 5881 * @dev: device 5882 * @new_carrier: new value 5883 * 5884 * Change device carrier 5885 */ 5886 int dev_change_carrier(struct net_device *dev, bool new_carrier) 5887 { 5888 const struct net_device_ops *ops = dev->netdev_ops; 5889 5890 if (!ops->ndo_change_carrier) 5891 return -EOPNOTSUPP; 5892 if (!netif_device_present(dev)) 5893 return -ENODEV; 5894 return ops->ndo_change_carrier(dev, new_carrier); 5895 } 5896 EXPORT_SYMBOL(dev_change_carrier); 5897 5898 /** 5899 * dev_get_phys_port_id - Get device physical port ID 5900 * @dev: device 5901 * @ppid: port ID 5902 * 5903 * Get device physical port ID 5904 */ 5905 int dev_get_phys_port_id(struct net_device *dev, 5906 struct netdev_phys_item_id *ppid) 5907 { 5908 const struct net_device_ops *ops = dev->netdev_ops; 5909 5910 if (!ops->ndo_get_phys_port_id) 5911 return -EOPNOTSUPP; 5912 return ops->ndo_get_phys_port_id(dev, ppid); 5913 } 5914 EXPORT_SYMBOL(dev_get_phys_port_id); 5915 5916 /** 5917 * dev_new_index - allocate an ifindex 5918 * @net: the applicable net namespace 5919 * 5920 * Returns a suitable unique value for a new device interface 5921 * number. The caller must hold the rtnl semaphore or the 5922 * dev_base_lock to be sure it remains unique. 
5923 */ 5924 static int dev_new_index(struct net *net) 5925 { 5926 int ifindex = net->ifindex; 5927 for (;;) { 5928 if (++ifindex <= 0) 5929 ifindex = 1; 5930 if (!__dev_get_by_index(net, ifindex)) 5931 return net->ifindex = ifindex; 5932 } 5933 } 5934 5935 /* Delayed registration/unregisteration */ 5936 static LIST_HEAD(net_todo_list); 5937 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); 5938 5939 static void net_set_todo(struct net_device *dev) 5940 { 5941 list_add_tail(&dev->todo_list, &net_todo_list); 5942 dev_net(dev)->dev_unreg_count++; 5943 } 5944 5945 static void rollback_registered_many(struct list_head *head) 5946 { 5947 struct net_device *dev, *tmp; 5948 LIST_HEAD(close_head); 5949 5950 BUG_ON(dev_boot_phase); 5951 ASSERT_RTNL(); 5952 5953 list_for_each_entry_safe(dev, tmp, head, unreg_list) { 5954 /* Some devices call without registering 5955 * for initialization unwind. Remove those 5956 * devices and proceed with the remaining. 5957 */ 5958 if (dev->reg_state == NETREG_UNINITIALIZED) { 5959 pr_debug("unregister_netdevice: device %s/%p never was registered\n", 5960 dev->name, dev); 5961 5962 WARN_ON(1); 5963 list_del(&dev->unreg_list); 5964 continue; 5965 } 5966 dev->dismantle = true; 5967 BUG_ON(dev->reg_state != NETREG_REGISTERED); 5968 } 5969 5970 /* If device is running, close it first. */ 5971 list_for_each_entry(dev, head, unreg_list) 5972 list_add_tail(&dev->close_list, &close_head); 5973 dev_close_many(&close_head); 5974 5975 list_for_each_entry(dev, head, unreg_list) { 5976 /* And unlink it from device chain. */ 5977 unlist_netdevice(dev); 5978 5979 dev->reg_state = NETREG_UNREGISTERING; 5980 } 5981 5982 synchronize_net(); 5983 5984 list_for_each_entry(dev, head, unreg_list) { 5985 struct sk_buff *skb = NULL; 5986 5987 /* Shutdown queueing discipline. */ 5988 dev_shutdown(dev); 5989 5990 5991 /* Notify protocols, that we are about to destroy 5992 this device. They should clean all the things. 5993 */ 5994 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 5995 5996 if (!dev->rtnl_link_ops || 5997 dev->rtnl_link_state == RTNL_LINK_INITIALIZED) 5998 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 5999 GFP_KERNEL); 6000 6001 /* 6002 * Flush the unicast and multicast chains 6003 */ 6004 dev_uc_flush(dev); 6005 dev_mc_flush(dev); 6006 6007 if (dev->netdev_ops->ndo_uninit) 6008 dev->netdev_ops->ndo_uninit(dev); 6009 6010 if (skb) 6011 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); 6012 6013 /* Notifier chain MUST detach us all upper devices. */ 6014 WARN_ON(netdev_has_any_upper_dev(dev)); 6015 6016 /* Remove entries from kobject tree */ 6017 netdev_unregister_kobject(dev); 6018 #ifdef CONFIG_XPS 6019 /* Remove XPS queueing entries */ 6020 netif_reset_xps_queues_gt(dev, 0); 6021 #endif 6022 } 6023 6024 synchronize_net(); 6025 6026 list_for_each_entry(dev, head, unreg_list) 6027 dev_put(dev); 6028 } 6029 6030 static void rollback_registered(struct net_device *dev) 6031 { 6032 LIST_HEAD(single); 6033 6034 list_add(&dev->unreg_list, &single); 6035 rollback_registered_many(&single); 6036 list_del(&single); 6037 } 6038 6039 static netdev_features_t netdev_fix_features(struct net_device *dev, 6040 netdev_features_t features) 6041 { 6042 /* Fix illegal checksum combinations */ 6043 if ((features & NETIF_F_HW_CSUM) && 6044 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 6045 netdev_warn(dev, "mixed HW and IP checksum settings.\n"); 6046 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); 6047 } 6048 6049 /* TSO requires that SG is present as well. 
*/ 6050 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { 6051 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); 6052 features &= ~NETIF_F_ALL_TSO; 6053 } 6054 6055 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && 6056 !(features & NETIF_F_IP_CSUM)) { 6057 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); 6058 features &= ~NETIF_F_TSO; 6059 features &= ~NETIF_F_TSO_ECN; 6060 } 6061 6062 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && 6063 !(features & NETIF_F_IPV6_CSUM)) { 6064 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); 6065 features &= ~NETIF_F_TSO6; 6066 } 6067 6068 /* TSO ECN requires that TSO is present as well. */ 6069 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) 6070 features &= ~NETIF_F_TSO_ECN; 6071 6072 /* Software GSO depends on SG. */ 6073 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { 6074 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); 6075 features &= ~NETIF_F_GSO; 6076 } 6077 6078 /* UFO needs SG and checksumming */ 6079 if (features & NETIF_F_UFO) { 6080 /* maybe split UFO into V4 and V6? */ 6081 if (!((features & NETIF_F_GEN_CSUM) || 6082 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) 6083 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 6084 netdev_dbg(dev, 6085 "Dropping NETIF_F_UFO since no checksum offload features.\n"); 6086 features &= ~NETIF_F_UFO; 6087 } 6088 6089 if (!(features & NETIF_F_SG)) { 6090 netdev_dbg(dev, 6091 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); 6092 features &= ~NETIF_F_UFO; 6093 } 6094 } 6095 6096 #ifdef CONFIG_NET_RX_BUSY_POLL 6097 if (dev->netdev_ops->ndo_busy_poll) 6098 features |= NETIF_F_BUSY_POLL; 6099 else 6100 #endif 6101 features &= ~NETIF_F_BUSY_POLL; 6102 6103 return features; 6104 } 6105 6106 int __netdev_update_features(struct net_device *dev) 6107 { 6108 netdev_features_t features; 6109 int err = 0; 6110 6111 ASSERT_RTNL(); 6112 6113 features = netdev_get_wanted_features(dev); 6114 6115 if (dev->netdev_ops->ndo_fix_features) 6116 features = dev->netdev_ops->ndo_fix_features(dev, features); 6117 6118 /* driver might be less strict about feature dependencies */ 6119 features = netdev_fix_features(dev, features); 6120 6121 if (dev->features == features) 6122 return 0; 6123 6124 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", 6125 &dev->features, &features); 6126 6127 if (dev->netdev_ops->ndo_set_features) 6128 err = dev->netdev_ops->ndo_set_features(dev, features); 6129 6130 if (unlikely(err < 0)) { 6131 netdev_err(dev, 6132 "set_features() failed (%d); wanted %pNF, left %pNF\n", 6133 err, &features, &dev->features); 6134 return -1; 6135 } 6136 6137 if (!err) 6138 dev->features = features; 6139 6140 return 1; 6141 } 6142 6143 /** 6144 * netdev_update_features - recalculate device features 6145 * @dev: the device to check 6146 * 6147 * Recalculate dev->features set and send notifications if it 6148 * has changed. Should be called after driver or hardware dependent 6149 * conditions might have changed that influence the features. 6150 */ 6151 void netdev_update_features(struct net_device *dev) 6152 { 6153 if (__netdev_update_features(dev)) 6154 netdev_features_change(dev); 6155 } 6156 EXPORT_SYMBOL(netdev_update_features); 6157 6158 /** 6159 * netdev_change_features - recalculate device features 6160 * @dev: the device to check 6161 * 6162 * Recalculate dev->features set and send notifications even 6163 * if they have not changed. 
Should be called instead of 6164 * netdev_update_features() if also dev->vlan_features might 6165 * have changed to allow the changes to be propagated to stacked 6166 * VLAN devices. 6167 */ 6168 void netdev_change_features(struct net_device *dev) 6169 { 6170 __netdev_update_features(dev); 6171 netdev_features_change(dev); 6172 } 6173 EXPORT_SYMBOL(netdev_change_features); 6174 6175 /** 6176 * netif_stacked_transfer_operstate - transfer operstate 6177 * @rootdev: the root or lower level device to transfer state from 6178 * @dev: the device to transfer operstate to 6179 * 6180 * Transfer operational state from root to device. This is normally 6181 * called when a stacking relationship exists between the root 6182 * device and the device(a leaf device). 6183 */ 6184 void netif_stacked_transfer_operstate(const struct net_device *rootdev, 6185 struct net_device *dev) 6186 { 6187 if (rootdev->operstate == IF_OPER_DORMANT) 6188 netif_dormant_on(dev); 6189 else 6190 netif_dormant_off(dev); 6191 6192 if (netif_carrier_ok(rootdev)) { 6193 if (!netif_carrier_ok(dev)) 6194 netif_carrier_on(dev); 6195 } else { 6196 if (netif_carrier_ok(dev)) 6197 netif_carrier_off(dev); 6198 } 6199 } 6200 EXPORT_SYMBOL(netif_stacked_transfer_operstate); 6201 6202 #ifdef CONFIG_SYSFS 6203 static int netif_alloc_rx_queues(struct net_device *dev) 6204 { 6205 unsigned int i, count = dev->num_rx_queues; 6206 struct netdev_rx_queue *rx; 6207 size_t sz = count * sizeof(*rx); 6208 6209 BUG_ON(count < 1); 6210 6211 rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 6212 if (!rx) { 6213 rx = vzalloc(sz); 6214 if (!rx) 6215 return -ENOMEM; 6216 } 6217 dev->_rx = rx; 6218 6219 for (i = 0; i < count; i++) 6220 rx[i].dev = dev; 6221 return 0; 6222 } 6223 #endif 6224 6225 static void netdev_init_one_queue(struct net_device *dev, 6226 struct netdev_queue *queue, void *_unused) 6227 { 6228 /* Initialize queue lock */ 6229 spin_lock_init(&queue->_xmit_lock); 6230 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); 6231 queue->xmit_lock_owner = -1; 6232 netdev_queue_numa_node_write(queue, NUMA_NO_NODE); 6233 queue->dev = dev; 6234 #ifdef CONFIG_BQL 6235 dql_init(&queue->dql, HZ); 6236 #endif 6237 } 6238 6239 static void netif_free_tx_queues(struct net_device *dev) 6240 { 6241 kvfree(dev->_tx); 6242 } 6243 6244 static int netif_alloc_netdev_queues(struct net_device *dev) 6245 { 6246 unsigned int count = dev->num_tx_queues; 6247 struct netdev_queue *tx; 6248 size_t sz = count * sizeof(*tx); 6249 6250 BUG_ON(count < 1 || count > 0xffff); 6251 6252 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 6253 if (!tx) { 6254 tx = vzalloc(sz); 6255 if (!tx) 6256 return -ENOMEM; 6257 } 6258 dev->_tx = tx; 6259 6260 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); 6261 spin_lock_init(&dev->tx_global_lock); 6262 6263 return 0; 6264 } 6265 6266 /** 6267 * register_netdevice - register a network device 6268 * @dev: device to register 6269 * 6270 * Take a completed network device structure and add it to the kernel 6271 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 6272 * chain. 0 is returned on success. A negative errno code is returned 6273 * on a failure to set up the device, or if the name is a duplicate. 6274 * 6275 * Callers must hold the rtnl semaphore. You may want 6276 * register_netdev() instead of this. 6277 * 6278 * BUGS: 6279 * The locking appears insufficient to guarantee two parallel registers 6280 * will not get the same name. 
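 *
 * A minimal sketch of the expected calling pattern (register_netdev()
 * wraps the lock/unlock part for you; the my_dev name is illustrative
 * only):
 *
 *	rtnl_lock();
 *	err = register_netdevice(my_dev);
 *	rtnl_unlock();
 *	if (err)
 *		free_netdev(my_dev);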
6281 */ 6282 6283 int register_netdevice(struct net_device *dev) 6284 { 6285 int ret; 6286 struct net *net = dev_net(dev); 6287 6288 BUG_ON(dev_boot_phase); 6289 ASSERT_RTNL(); 6290 6291 might_sleep(); 6292 6293 /* When net_device's are persistent, this will be fatal. */ 6294 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); 6295 BUG_ON(!net); 6296 6297 spin_lock_init(&dev->addr_list_lock); 6298 netdev_set_addr_lockdep_class(dev); 6299 6300 dev->iflink = -1; 6301 6302 ret = dev_get_valid_name(net, dev, dev->name); 6303 if (ret < 0) 6304 goto out; 6305 6306 /* Init, if this function is available */ 6307 if (dev->netdev_ops->ndo_init) { 6308 ret = dev->netdev_ops->ndo_init(dev); 6309 if (ret) { 6310 if (ret > 0) 6311 ret = -EIO; 6312 goto out; 6313 } 6314 } 6315 6316 if (((dev->hw_features | dev->features) & 6317 NETIF_F_HW_VLAN_CTAG_FILTER) && 6318 (!dev->netdev_ops->ndo_vlan_rx_add_vid || 6319 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { 6320 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); 6321 ret = -EINVAL; 6322 goto err_uninit; 6323 } 6324 6325 ret = -EBUSY; 6326 if (!dev->ifindex) 6327 dev->ifindex = dev_new_index(net); 6328 else if (__dev_get_by_index(net, dev->ifindex)) 6329 goto err_uninit; 6330 6331 if (dev->iflink == -1) 6332 dev->iflink = dev->ifindex; 6333 6334 /* Transfer changeable features to wanted_features and enable 6335 * software offloads (GSO and GRO). 6336 */ 6337 dev->hw_features |= NETIF_F_SOFT_FEATURES; 6338 dev->features |= NETIF_F_SOFT_FEATURES; 6339 dev->wanted_features = dev->features & dev->hw_features; 6340 6341 if (!(dev->flags & IFF_LOOPBACK)) { 6342 dev->hw_features |= NETIF_F_NOCACHE_COPY; 6343 } 6344 6345 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. 6346 */ 6347 dev->vlan_features |= NETIF_F_HIGHDMA; 6348 6349 /* Make NETIF_F_SG inheritable to tunnel devices. 6350 */ 6351 dev->hw_enc_features |= NETIF_F_SG; 6352 6353 /* Make NETIF_F_SG inheritable to MPLS. 6354 */ 6355 dev->mpls_features |= NETIF_F_SG; 6356 6357 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); 6358 ret = notifier_to_errno(ret); 6359 if (ret) 6360 goto err_uninit; 6361 6362 ret = netdev_register_kobject(dev); 6363 if (ret) 6364 goto err_uninit; 6365 dev->reg_state = NETREG_REGISTERED; 6366 6367 __netdev_update_features(dev); 6368 6369 /* 6370 * Default initial state at registry is that the 6371 * device is present. 6372 */ 6373 6374 set_bit(__LINK_STATE_PRESENT, &dev->state); 6375 6376 linkwatch_init_dev(dev); 6377 6378 dev_init_scheduler(dev); 6379 dev_hold(dev); 6380 list_netdevice(dev); 6381 add_device_randomness(dev->dev_addr, dev->addr_len); 6382 6383 /* If the device has permanent device address, driver should 6384 * set dev_addr and also addr_assign_type should be set to 6385 * NET_ADDR_PERM (default value). 6386 */ 6387 if (dev->addr_assign_type == NET_ADDR_PERM) 6388 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); 6389 6390 /* Notify protocols, that a new device appeared. */ 6391 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); 6392 ret = notifier_to_errno(ret); 6393 if (ret) { 6394 rollback_registered(dev); 6395 dev->reg_state = NETREG_UNREGISTERED; 6396 } 6397 /* 6398 * Prevent userspace races by waiting until the network 6399 * device is fully setup before sending notifications. 
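 * (Devices created through rtnl_link_ops only reach RTNL_LINK_INITIALIZED
 * once their newlink path has finished configuring them, so deferring the
 * RTM_NEWLINK message here keeps half-initialised devices from being
 * announced to userspace.)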
6400 */ 6401 if (!dev->rtnl_link_ops || 6402 dev->rtnl_link_state == RTNL_LINK_INITIALIZED) 6403 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); 6404 6405 out: 6406 return ret; 6407 6408 err_uninit: 6409 if (dev->netdev_ops->ndo_uninit) 6410 dev->netdev_ops->ndo_uninit(dev); 6411 goto out; 6412 } 6413 EXPORT_SYMBOL(register_netdevice); 6414 6415 /** 6416 * init_dummy_netdev - init a dummy network device for NAPI 6417 * @dev: device to init 6418 * 6419 * This takes a network device structure and initializes the minimum 6420 * number of fields so it can be used to schedule NAPI polls without 6421 * registering a full blown interface. This is to be used by drivers 6422 * that need to tie several hardware interfaces to a single NAPI 6423 * poll scheduler due to HW limitations. 6424 */ 6425 int init_dummy_netdev(struct net_device *dev) 6426 { 6427 /* Clear everything. Note we don't initialize spinlocks 6428 * as they aren't supposed to be taken by any of the 6429 * NAPI code and this dummy netdev is supposed to be 6430 * only ever used for NAPI polls 6431 */ 6432 memset(dev, 0, sizeof(struct net_device)); 6433 6434 /* make sure we BUG if trying to hit standard 6435 * register/unregister code path 6436 */ 6437 dev->reg_state = NETREG_DUMMY; 6438 6439 /* NAPI wants this */ 6440 INIT_LIST_HEAD(&dev->napi_list); 6441 6442 /* a dummy interface is started by default */ 6443 set_bit(__LINK_STATE_PRESENT, &dev->state); 6444 set_bit(__LINK_STATE_START, &dev->state); 6445 6446 /* Note: We don't allocate pcpu_refcnt for dummy devices, 6447 * because users of this 'device' don't need to change 6448 * its refcount. 6449 */ 6450 6451 return 0; 6452 } 6453 EXPORT_SYMBOL_GPL(init_dummy_netdev); 6454 6455 6456 /** 6457 * register_netdev - register a network device 6458 * @dev: device to register 6459 * 6460 * Take a completed network device structure and add it to the kernel 6461 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 6462 * chain. 0 is returned on success. A negative errno code is returned 6463 * on a failure to set up the device, or if the name is a duplicate. 6464 * 6465 * This is a wrapper around register_netdevice that takes the rtnl semaphore 6466 * and expands the device name if you passed a format string to 6467 * alloc_netdev. 6468 */ 6469 int register_netdev(struct net_device *dev) 6470 { 6471 int err; 6472 6473 rtnl_lock(); 6474 err = register_netdevice(dev); 6475 rtnl_unlock(); 6476 return err; 6477 } 6478 EXPORT_SYMBOL(register_netdev); 6479 6480 int netdev_refcnt_read(const struct net_device *dev) 6481 { 6482 int i, refcnt = 0; 6483 6484 for_each_possible_cpu(i) 6485 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); 6486 return refcnt; 6487 } 6488 EXPORT_SYMBOL(netdev_refcnt_read); 6489 6490 /** 6491 * netdev_wait_allrefs - wait until all references are gone. 6492 * @dev: target net_device 6493 * 6494 * This is called when unregistering network devices. 6495 * 6496 * Any protocol or device that holds a reference should register 6497 * for netdevice notification, and clean up and put back the 6498 * reference if they receive an UNREGISTER event. 6499 * We can get stuck here if buggy protocols don't correctly 6500 * call dev_put.
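 *
 * A rough sketch of the holder's side of that contract (the handler and
 * my_held_dev names are illustrative only):
 *
 *	static int my_notifier(struct notifier_block *nb,
 *			       unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UNREGISTER && dev == my_held_dev) {
 *			my_held_dev = NULL;
 *			dev_put(dev);
 *		}
 *		return NOTIFY_DONE;
 *	}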
6501 */ 6502 static void netdev_wait_allrefs(struct net_device *dev) 6503 { 6504 unsigned long rebroadcast_time, warning_time; 6505 int refcnt; 6506 6507 linkwatch_forget_dev(dev); 6508 6509 rebroadcast_time = warning_time = jiffies; 6510 refcnt = netdev_refcnt_read(dev); 6511 6512 while (refcnt != 0) { 6513 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { 6514 rtnl_lock(); 6515 6516 /* Rebroadcast unregister notification */ 6517 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 6518 6519 __rtnl_unlock(); 6520 rcu_barrier(); 6521 rtnl_lock(); 6522 6523 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 6524 if (test_bit(__LINK_STATE_LINKWATCH_PENDING, 6525 &dev->state)) { 6526 /* We must not have linkwatch events 6527 * pending on unregister. If this 6528 * happens, we simply run the queue 6529 * unscheduled, resulting in a noop 6530 * for this device. 6531 */ 6532 linkwatch_run_queue(); 6533 } 6534 6535 __rtnl_unlock(); 6536 6537 rebroadcast_time = jiffies; 6538 } 6539 6540 msleep(250); 6541 6542 refcnt = netdev_refcnt_read(dev); 6543 6544 if (time_after(jiffies, warning_time + 10 * HZ)) { 6545 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", 6546 dev->name, refcnt); 6547 warning_time = jiffies; 6548 } 6549 } 6550 } 6551 6552 /* The sequence is: 6553 * 6554 * rtnl_lock(); 6555 * ... 6556 * register_netdevice(x1); 6557 * register_netdevice(x2); 6558 * ... 6559 * unregister_netdevice(y1); 6560 * unregister_netdevice(y2); 6561 * ... 6562 * rtnl_unlock(); 6563 * free_netdev(y1); 6564 * free_netdev(y2); 6565 * 6566 * We are invoked by rtnl_unlock(). 6567 * This allows us to deal with problems: 6568 * 1) We can delete sysfs objects which invoke hotplug 6569 * without deadlocking with linkwatch via keventd. 6570 * 2) Since we run with the RTNL semaphore not held, we can sleep 6571 * safely in order to wait for the netdev refcnt to drop to zero. 6572 * 6573 * We must not return until all unregister events added during 6574 * the interval the lock was held have been completed. 
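 * Note that net_todo_list is only ever manipulated with the RTNL mutex
 * held (net_set_todo() runs under it, and netdev_run_todo() snapshots the
 * list before calling __rtnl_unlock()), so no additional locking is
 * needed for the todo list itself.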
6575 */ 6576 void netdev_run_todo(void) 6577 { 6578 struct list_head list; 6579 6580 /* Snapshot list, allow later requests */ 6581 list_replace_init(&net_todo_list, &list); 6582 6583 __rtnl_unlock(); 6584 6585 6586 /* Wait for rcu callbacks to finish before next phase */ 6587 if (!list_empty(&list)) 6588 rcu_barrier(); 6589 6590 while (!list_empty(&list)) { 6591 struct net_device *dev 6592 = list_first_entry(&list, struct net_device, todo_list); 6593 list_del(&dev->todo_list); 6594 6595 rtnl_lock(); 6596 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 6597 __rtnl_unlock(); 6598 6599 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { 6600 pr_err("network todo '%s' but state %d\n", 6601 dev->name, dev->reg_state); 6602 dump_stack(); 6603 continue; 6604 } 6605 6606 dev->reg_state = NETREG_UNREGISTERED; 6607 6608 on_each_cpu(flush_backlog, dev, 1); 6609 6610 netdev_wait_allrefs(dev); 6611 6612 /* paranoia */ 6613 BUG_ON(netdev_refcnt_read(dev)); 6614 BUG_ON(!list_empty(&dev->ptype_all)); 6615 BUG_ON(!list_empty(&dev->ptype_specific)); 6616 WARN_ON(rcu_access_pointer(dev->ip_ptr)); 6617 WARN_ON(rcu_access_pointer(dev->ip6_ptr)); 6618 WARN_ON(dev->dn_ptr); 6619 6620 if (dev->destructor) 6621 dev->destructor(dev); 6622 6623 /* Report a network device has been unregistered */ 6624 rtnl_lock(); 6625 dev_net(dev)->dev_unreg_count--; 6626 __rtnl_unlock(); 6627 wake_up(&netdev_unregistering_wq); 6628 6629 /* Free network device */ 6630 kobject_put(&dev->dev.kobj); 6631 } 6632 } 6633 6634 /* Convert net_device_stats to rtnl_link_stats64. They have the same 6635 * fields in the same order, with only the type differing. 6636 */ 6637 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, 6638 const struct net_device_stats *netdev_stats) 6639 { 6640 #if BITS_PER_LONG == 64 6641 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); 6642 memcpy(stats64, netdev_stats, sizeof(*stats64)); 6643 #else 6644 size_t i, n = sizeof(*stats64) / sizeof(u64); 6645 const unsigned long *src = (const unsigned long *)netdev_stats; 6646 u64 *dst = (u64 *)stats64; 6647 6648 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) != 6649 sizeof(*stats64) / sizeof(u64)); 6650 for (i = 0; i < n; i++) 6651 dst[i] = src[i]; 6652 #endif 6653 } 6654 EXPORT_SYMBOL(netdev_stats_to_stats64); 6655 6656 /** 6657 * dev_get_stats - get network device statistics 6658 * @dev: device to get statistics from 6659 * @storage: place to store stats 6660 * 6661 * Get network statistics from device. Return @storage. 6662 * The device driver may provide its own method by setting 6663 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; 6664 * otherwise the internal statistics structure is used. 
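 *
 * Typical use is with a caller-provided buffer, e.g. (sketch only):
 *
 *	struct rtnl_link_stats64 stats;
 *
 *	dev_get_stats(dev, &stats);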
6665 */ 6666 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, 6667 struct rtnl_link_stats64 *storage) 6668 { 6669 const struct net_device_ops *ops = dev->netdev_ops; 6670 6671 if (ops->ndo_get_stats64) { 6672 memset(storage, 0, sizeof(*storage)); 6673 ops->ndo_get_stats64(dev, storage); 6674 } else if (ops->ndo_get_stats) { 6675 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); 6676 } else { 6677 netdev_stats_to_stats64(storage, &dev->stats); 6678 } 6679 storage->rx_dropped += atomic_long_read(&dev->rx_dropped); 6680 storage->tx_dropped += atomic_long_read(&dev->tx_dropped); 6681 return storage; 6682 } 6683 EXPORT_SYMBOL(dev_get_stats); 6684 6685 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) 6686 { 6687 struct netdev_queue *queue = dev_ingress_queue(dev); 6688 6689 #ifdef CONFIG_NET_CLS_ACT 6690 if (queue) 6691 return queue; 6692 queue = kzalloc(sizeof(*queue), GFP_KERNEL); 6693 if (!queue) 6694 return NULL; 6695 netdev_init_one_queue(dev, queue, NULL); 6696 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); 6697 queue->qdisc_sleeping = &noop_qdisc; 6698 rcu_assign_pointer(dev->ingress_queue, queue); 6699 #endif 6700 return queue; 6701 } 6702 6703 static const struct ethtool_ops default_ethtool_ops; 6704 6705 void netdev_set_default_ethtool_ops(struct net_device *dev, 6706 const struct ethtool_ops *ops) 6707 { 6708 if (dev->ethtool_ops == &default_ethtool_ops) 6709 dev->ethtool_ops = ops; 6710 } 6711 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); 6712 6713 void netdev_freemem(struct net_device *dev) 6714 { 6715 char *addr = (char *)dev - dev->padded; 6716 6717 kvfree(addr); 6718 } 6719 6720 /** 6721 * alloc_netdev_mqs - allocate network device 6722 * @sizeof_priv: size of private data to allocate space for 6723 * @name: device name format string 6724 * @name_assign_type: origin of device name 6725 * @setup: callback to initialize device 6726 * @txqs: the number of TX subqueues to allocate 6727 * @rxqs: the number of RX subqueues to allocate 6728 * 6729 * Allocates a struct net_device with private data area for driver use 6730 * and performs basic initialization. Also allocates subqueue structs 6731 * for each queue on the device. 
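 *
 * A minimal sketch of typical driver usage (struct my_priv, the name
 * template and the queue counts are illustrative only):
 *
 *	struct net_device *dev;
 *	struct my_priv *priv;
 *
 *	dev = alloc_netdev_mqs(sizeof(*priv), "myeth%d", NET_NAME_UNKNOWN,
 *			       ether_setup, 4, 4);
 *	if (!dev)
 *		return -ENOMEM;
 *	priv = netdev_priv(dev);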
6732 */ 6733 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, 6734 unsigned char name_assign_type, 6735 void (*setup)(struct net_device *), 6736 unsigned int txqs, unsigned int rxqs) 6737 { 6738 struct net_device *dev; 6739 size_t alloc_size; 6740 struct net_device *p; 6741 6742 BUG_ON(strlen(name) >= sizeof(dev->name)); 6743 6744 if (txqs < 1) { 6745 pr_err("alloc_netdev: Unable to allocate device with zero queues\n"); 6746 return NULL; 6747 } 6748 6749 #ifdef CONFIG_SYSFS 6750 if (rxqs < 1) { 6751 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); 6752 return NULL; 6753 } 6754 #endif 6755 6756 alloc_size = sizeof(struct net_device); 6757 if (sizeof_priv) { 6758 /* ensure 32-byte alignment of private area */ 6759 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); 6760 alloc_size += sizeof_priv; 6761 } 6762 /* ensure 32-byte alignment of whole construct */ 6763 alloc_size += NETDEV_ALIGN - 1; 6764 6765 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 6766 if (!p) 6767 p = vzalloc(alloc_size); 6768 if (!p) 6769 return NULL; 6770 6771 dev = PTR_ALIGN(p, NETDEV_ALIGN); 6772 dev->padded = (char *)dev - (char *)p; 6773 6774 dev->pcpu_refcnt = alloc_percpu(int); 6775 if (!dev->pcpu_refcnt) 6776 goto free_dev; 6777 6778 if (dev_addr_init(dev)) 6779 goto free_pcpu; 6780 6781 dev_mc_init(dev); 6782 dev_uc_init(dev); 6783 6784 dev_net_set(dev, &init_net); 6785 6786 dev->gso_max_size = GSO_MAX_SIZE; 6787 dev->gso_max_segs = GSO_MAX_SEGS; 6788 dev->gso_min_segs = 0; 6789 6790 INIT_LIST_HEAD(&dev->napi_list); 6791 INIT_LIST_HEAD(&dev->unreg_list); 6792 INIT_LIST_HEAD(&dev->close_list); 6793 INIT_LIST_HEAD(&dev->link_watch_list); 6794 INIT_LIST_HEAD(&dev->adj_list.upper); 6795 INIT_LIST_HEAD(&dev->adj_list.lower); 6796 INIT_LIST_HEAD(&dev->all_adj_list.upper); 6797 INIT_LIST_HEAD(&dev->all_adj_list.lower); 6798 INIT_LIST_HEAD(&dev->ptype_all); 6799 INIT_LIST_HEAD(&dev->ptype_specific); 6800 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; 6801 setup(dev); 6802 6803 dev->num_tx_queues = txqs; 6804 dev->real_num_tx_queues = txqs; 6805 if (netif_alloc_netdev_queues(dev)) 6806 goto free_all; 6807 6808 #ifdef CONFIG_SYSFS 6809 dev->num_rx_queues = rxqs; 6810 dev->real_num_rx_queues = rxqs; 6811 if (netif_alloc_rx_queues(dev)) 6812 goto free_all; 6813 #endif 6814 6815 strcpy(dev->name, name); 6816 dev->name_assign_type = name_assign_type; 6817 dev->group = INIT_NETDEV_GROUP; 6818 if (!dev->ethtool_ops) 6819 dev->ethtool_ops = &default_ethtool_ops; 6820 return dev; 6821 6822 free_all: 6823 free_netdev(dev); 6824 return NULL; 6825 6826 free_pcpu: 6827 free_percpu(dev->pcpu_refcnt); 6828 free_dev: 6829 netdev_freemem(dev); 6830 return NULL; 6831 } 6832 EXPORT_SYMBOL(alloc_netdev_mqs); 6833 6834 /** 6835 * free_netdev - free network device 6836 * @dev: device 6837 * 6838 * This function does the last stage of destroying an allocated device 6839 * interface. The reference to the device object is released. 6840 * If this is the last reference then it will be freed. 
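 *
 * Normal teardown is therefore (sketch):
 *
 *	unregister_netdev(dev);
 *	free_netdev(dev);
 *
 * If register_netdevice() was never called or failed early, free_netdev()
 * may be called directly; the NETREG_UNINITIALIZED check below releases
 * the memory immediately in that case.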
6841 */ 6842 void free_netdev(struct net_device *dev) 6843 { 6844 struct napi_struct *p, *n; 6845 6846 release_net(dev_net(dev)); 6847 6848 netif_free_tx_queues(dev); 6849 #ifdef CONFIG_SYSFS 6850 kvfree(dev->_rx); 6851 #endif 6852 6853 kfree(rcu_dereference_protected(dev->ingress_queue, 1)); 6854 6855 /* Flush device addresses */ 6856 dev_addr_flush(dev); 6857 6858 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) 6859 netif_napi_del(p); 6860 6861 free_percpu(dev->pcpu_refcnt); 6862 dev->pcpu_refcnt = NULL; 6863 6864 /* Compatibility with error handling in drivers */ 6865 if (dev->reg_state == NETREG_UNINITIALIZED) { 6866 netdev_freemem(dev); 6867 return; 6868 } 6869 6870 BUG_ON(dev->reg_state != NETREG_UNREGISTERED); 6871 dev->reg_state = NETREG_RELEASED; 6872 6873 /* will free via device release */ 6874 put_device(&dev->dev); 6875 } 6876 EXPORT_SYMBOL(free_netdev); 6877 6878 /** 6879 * synchronize_net - Synchronize with packet receive processing 6880 * 6881 * Wait for packets currently being received to be done. 6882 * Does not block later packets from starting. 6883 */ 6884 void synchronize_net(void) 6885 { 6886 might_sleep(); 6887 if (rtnl_is_locked()) 6888 synchronize_rcu_expedited(); 6889 else 6890 synchronize_rcu(); 6891 } 6892 EXPORT_SYMBOL(synchronize_net); 6893 6894 /** 6895 * unregister_netdevice_queue - remove device from the kernel 6896 * @dev: device 6897 * @head: list 6898 * 6899 * This function shuts down a device interface and removes it 6900 * from the kernel tables. 6901 * If head not NULL, device is queued to be unregistered later. 6902 * 6903 * Callers must hold the rtnl semaphore. You may want 6904 * unregister_netdev() instead of this. 6905 */ 6906 6907 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) 6908 { 6909 ASSERT_RTNL(); 6910 6911 if (head) { 6912 list_move_tail(&dev->unreg_list, head); 6913 } else { 6914 rollback_registered(dev); 6915 /* Finish processing unregister after unlock */ 6916 net_set_todo(dev); 6917 } 6918 } 6919 EXPORT_SYMBOL(unregister_netdevice_queue); 6920 6921 /** 6922 * unregister_netdevice_many - unregister many devices 6923 * @head: list of devices 6924 * 6925 * Note: As most callers use a stack allocated list_head, 6926 * we force a list_del() to make sure stack wont be corrupted later. 6927 */ 6928 void unregister_netdevice_many(struct list_head *head) 6929 { 6930 struct net_device *dev; 6931 6932 if (!list_empty(head)) { 6933 rollback_registered_many(head); 6934 list_for_each_entry(dev, head, unreg_list) 6935 net_set_todo(dev); 6936 list_del(head); 6937 } 6938 } 6939 EXPORT_SYMBOL(unregister_netdevice_many); 6940 6941 /** 6942 * unregister_netdev - remove device from the kernel 6943 * @dev: device 6944 * 6945 * This function shuts down a device interface and removes it 6946 * from the kernel tables. 6947 * 6948 * This is just a wrapper for unregister_netdevice that takes 6949 * the rtnl semaphore. In general you want to use this and not 6950 * unregister_netdevice. 6951 */ 6952 void unregister_netdev(struct net_device *dev) 6953 { 6954 rtnl_lock(); 6955 unregister_netdevice(dev); 6956 rtnl_unlock(); 6957 } 6958 EXPORT_SYMBOL(unregister_netdev); 6959 6960 /** 6961 * dev_change_net_namespace - move device to different nethost namespace 6962 * @dev: device 6963 * @net: network namespace 6964 * @pat: If not NULL name pattern to try if the current device name 6965 * is already taken in the destination network namespace. 
6966 * 6967 * This function shuts down a device interface and moves it 6968 * to a new network namespace. On success 0 is returned, on 6969 * a failure a negative errno code is returned. 6970 * 6971 * Callers must hold the rtnl semaphore. 6972 */ 6973 6974 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) 6975 { 6976 int err; 6977 6978 ASSERT_RTNL(); 6979 6980 /* Don't allow namespace local devices to be moved. */ 6981 err = -EINVAL; 6982 if (dev->features & NETIF_F_NETNS_LOCAL) 6983 goto out; 6984 6985 /* Ensure the device has been registered */ 6986 if (dev->reg_state != NETREG_REGISTERED) 6987 goto out; 6988 6989 /* Get out if there is nothing to do */ 6990 err = 0; 6991 if (net_eq(dev_net(dev), net)) 6992 goto out; 6993 6994 /* Pick the destination device name, and ensure 6995 * we can use it in the destination network namespace. 6996 */ 6997 err = -EEXIST; 6998 if (__dev_get_by_name(net, dev->name)) { 6999 /* We get here if we can't use the current device name */ 7000 if (!pat) 7001 goto out; 7002 if (dev_get_valid_name(net, dev, pat) < 0) 7003 goto out; 7004 } 7005 7006 /* 7007 * And now a mini version of register_netdevice/unregister_netdevice. 7008 */ 7009 7010 /* If the device is running, close it first. */ 7011 dev_close(dev); 7012 7013 /* And unlink it from the device chain */ 7014 err = -ENODEV; 7015 unlist_netdevice(dev); 7016 7017 synchronize_net(); 7018 7019 /* Shutdown queueing discipline. */ 7020 dev_shutdown(dev); 7021 7022 /* Notify protocols that we are about to destroy 7023 this device. They should clean up all of their state. 7024 7025 Note that dev->reg_state stays at NETREG_REGISTERED. 7026 This is wanted because this way 8021q and macvlan know 7027 the device is just moving and can keep their slaves up. 7028 */ 7029 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 7030 rcu_barrier(); 7031 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 7032 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); 7033 7034 /* 7035 * Flush the unicast and multicast chains 7036 */ 7037 dev_uc_flush(dev); 7038 dev_mc_flush(dev); 7039 7040 /* Send a netdev-removed uevent to the old namespace */ 7041 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); 7042 netdev_adjacent_del_links(dev); 7043 7044 /* Actually switch the network namespace */ 7045 dev_net_set(dev, net); 7046 7047 /* If there is an ifindex conflict, assign a new one */ 7048 if (__dev_get_by_index(net, dev->ifindex)) { 7049 int iflink = (dev->iflink == dev->ifindex); 7050 dev->ifindex = dev_new_index(net); 7051 if (iflink) 7052 dev->iflink = dev->ifindex; 7053 } 7054 7055 /* Send a netdev-add uevent to the new namespace */ 7056 kobject_uevent(&dev->dev.kobj, KOBJ_ADD); 7057 netdev_adjacent_add_links(dev); 7058 7059 /* Fixup kobjects */ 7060 err = device_rename(&dev->dev, dev->name); 7061 WARN_ON(err); 7062 7063 /* Add the device back in the hashes */ 7064 list_netdevice(dev); 7065 7066 /* Notify protocols that a new device appeared. */ 7067 call_netdevice_notifiers(NETDEV_REGISTER, dev); 7068 7069 /* 7070 * Prevent userspace races by waiting until the network 7071 * device is fully setup before sending notifications.
7072 */ 7073 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); 7074 7075 synchronize_net(); 7076 err = 0; 7077 out: 7078 return err; 7079 } 7080 EXPORT_SYMBOL_GPL(dev_change_net_namespace); 7081 7082 static int dev_cpu_callback(struct notifier_block *nfb, 7083 unsigned long action, 7084 void *ocpu) 7085 { 7086 struct sk_buff **list_skb; 7087 struct sk_buff *skb; 7088 unsigned int cpu, oldcpu = (unsigned long)ocpu; 7089 struct softnet_data *sd, *oldsd; 7090 7091 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 7092 return NOTIFY_OK; 7093 7094 local_irq_disable(); 7095 cpu = smp_processor_id(); 7096 sd = &per_cpu(softnet_data, cpu); 7097 oldsd = &per_cpu(softnet_data, oldcpu); 7098 7099 /* Find end of our completion_queue. */ 7100 list_skb = &sd->completion_queue; 7101 while (*list_skb) 7102 list_skb = &(*list_skb)->next; 7103 /* Append completion queue from offline CPU. */ 7104 *list_skb = oldsd->completion_queue; 7105 oldsd->completion_queue = NULL; 7106 7107 /* Append output queue from offline CPU. */ 7108 if (oldsd->output_queue) { 7109 *sd->output_queue_tailp = oldsd->output_queue; 7110 sd->output_queue_tailp = oldsd->output_queue_tailp; 7111 oldsd->output_queue = NULL; 7112 oldsd->output_queue_tailp = &oldsd->output_queue; 7113 } 7114 /* Append NAPI poll list from offline CPU, with one exception : 7115 * process_backlog() must be called by cpu owning percpu backlog. 7116 * We properly handle process_queue & input_pkt_queue later. 7117 */ 7118 while (!list_empty(&oldsd->poll_list)) { 7119 struct napi_struct *napi = list_first_entry(&oldsd->poll_list, 7120 struct napi_struct, 7121 poll_list); 7122 7123 list_del_init(&napi->poll_list); 7124 if (napi->poll == process_backlog) 7125 napi->state = 0; 7126 else 7127 ____napi_schedule(sd, napi); 7128 } 7129 7130 raise_softirq_irqoff(NET_TX_SOFTIRQ); 7131 local_irq_enable(); 7132 7133 /* Process offline CPU's input_pkt_queue */ 7134 while ((skb = __skb_dequeue(&oldsd->process_queue))) { 7135 netif_rx_ni(skb); 7136 input_queue_head_incr(oldsd); 7137 } 7138 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { 7139 netif_rx_ni(skb); 7140 input_queue_head_incr(oldsd); 7141 } 7142 7143 return NOTIFY_OK; 7144 } 7145 7146 7147 /** 7148 * netdev_increment_features - increment feature set by one 7149 * @all: current feature set 7150 * @one: new feature set 7151 * @mask: mask feature set 7152 * 7153 * Computes a new feature set after adding a device with feature set 7154 * @one to the master device with current feature set @all. Will not 7155 * enable anything that is off in @mask. Returns the new feature set. 7156 */ 7157 netdev_features_t netdev_increment_features(netdev_features_t all, 7158 netdev_features_t one, netdev_features_t mask) 7159 { 7160 if (mask & NETIF_F_GEN_CSUM) 7161 mask |= NETIF_F_ALL_CSUM; 7162 mask |= NETIF_F_VLAN_CHALLENGED; 7163 7164 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask; 7165 all &= one | ~NETIF_F_ALL_FOR_ALL; 7166 7167 /* If one device supports hw checksumming, set for all. 
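 * (Generic hardware checksumming already covers the protocol-specific
 * offloads, so the masking below keeps NETIF_F_GEN_CSUM and clears the
 * now-redundant per-protocol checksum bits.)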
*/ 7168 if (all & NETIF_F_GEN_CSUM) 7169 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); 7170 7171 return all; 7172 } 7173 EXPORT_SYMBOL(netdev_increment_features); 7174 7175 static struct hlist_head * __net_init netdev_create_hash(void) 7176 { 7177 int i; 7178 struct hlist_head *hash; 7179 7180 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); 7181 if (hash != NULL) 7182 for (i = 0; i < NETDEV_HASHENTRIES; i++) 7183 INIT_HLIST_HEAD(&hash[i]); 7184 7185 return hash; 7186 } 7187 7188 /* Initialize per network namespace state */ 7189 static int __net_init netdev_init(struct net *net) 7190 { 7191 if (net != &init_net) 7192 INIT_LIST_HEAD(&net->dev_base_head); 7193 7194 net->dev_name_head = netdev_create_hash(); 7195 if (net->dev_name_head == NULL) 7196 goto err_name; 7197 7198 net->dev_index_head = netdev_create_hash(); 7199 if (net->dev_index_head == NULL) 7200 goto err_idx; 7201 7202 return 0; 7203 7204 err_idx: 7205 kfree(net->dev_name_head); 7206 err_name: 7207 return -ENOMEM; 7208 } 7209 7210 /** 7211 * netdev_drivername - network driver for the device 7212 * @dev: network device 7213 * 7214 * Determine network driver for device. 7215 */ 7216 const char *netdev_drivername(const struct net_device *dev) 7217 { 7218 const struct device_driver *driver; 7219 const struct device *parent; 7220 const char *empty = ""; 7221 7222 parent = dev->dev.parent; 7223 if (!parent) 7224 return empty; 7225 7226 driver = parent->driver; 7227 if (driver && driver->name) 7228 return driver->name; 7229 return empty; 7230 } 7231 7232 static void __netdev_printk(const char *level, const struct net_device *dev, 7233 struct va_format *vaf) 7234 { 7235 if (dev && dev->dev.parent) { 7236 dev_printk_emit(level[1] - '0', 7237 dev->dev.parent, 7238 "%s %s %s%s: %pV", 7239 dev_driver_string(dev->dev.parent), 7240 dev_name(dev->dev.parent), 7241 netdev_name(dev), netdev_reg_state(dev), 7242 vaf); 7243 } else if (dev) { 7244 printk("%s%s%s: %pV", 7245 level, netdev_name(dev), netdev_reg_state(dev), vaf); 7246 } else { 7247 printk("%s(NULL net_device): %pV", level, vaf); 7248 } 7249 } 7250 7251 void netdev_printk(const char *level, const struct net_device *dev, 7252 const char *format, ...) 7253 { 7254 struct va_format vaf; 7255 va_list args; 7256 7257 va_start(args, format); 7258 7259 vaf.fmt = format; 7260 vaf.va = &args; 7261 7262 __netdev_printk(level, dev, &vaf); 7263 7264 va_end(args); 7265 } 7266 EXPORT_SYMBOL(netdev_printk); 7267 7268 #define define_netdev_printk_level(func, level) \ 7269 void func(const struct net_device *dev, const char *fmt, ...) 
\ 7270 { \ 7271 struct va_format vaf; \ 7272 va_list args; \ 7273 \ 7274 va_start(args, fmt); \ 7275 \ 7276 vaf.fmt = fmt; \ 7277 vaf.va = &args; \ 7278 \ 7279 __netdev_printk(level, dev, &vaf); \ 7280 \ 7281 va_end(args); \ 7282 } \ 7283 EXPORT_SYMBOL(func); 7284 7285 define_netdev_printk_level(netdev_emerg, KERN_EMERG); 7286 define_netdev_printk_level(netdev_alert, KERN_ALERT); 7287 define_netdev_printk_level(netdev_crit, KERN_CRIT); 7288 define_netdev_printk_level(netdev_err, KERN_ERR); 7289 define_netdev_printk_level(netdev_warn, KERN_WARNING); 7290 define_netdev_printk_level(netdev_notice, KERN_NOTICE); 7291 define_netdev_printk_level(netdev_info, KERN_INFO); 7292 7293 static void __net_exit netdev_exit(struct net *net) 7294 { 7295 kfree(net->dev_name_head); 7296 kfree(net->dev_index_head); 7297 } 7298 7299 static struct pernet_operations __net_initdata netdev_net_ops = { 7300 .init = netdev_init, 7301 .exit = netdev_exit, 7302 }; 7303 7304 static void __net_exit default_device_exit(struct net *net) 7305 { 7306 struct net_device *dev, *aux; 7307 /* 7308 * Push all migratable network devices back to the 7309 * initial network namespace 7310 */ 7311 rtnl_lock(); 7312 for_each_netdev_safe(net, dev, aux) { 7313 int err; 7314 char fb_name[IFNAMSIZ]; 7315 7316 /* Ignore unmovable devices (i.e. loopback) */ 7317 if (dev->features & NETIF_F_NETNS_LOCAL) 7318 continue; 7319 7320 /* Leave virtual devices for the generic cleanup */ 7321 if (dev->rtnl_link_ops) 7322 continue; 7323 7324 /* Push remaining network devices to init_net */ 7325 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); 7326 err = dev_change_net_namespace(dev, &init_net, fb_name); 7327 if (err) { 7328 pr_emerg("%s: failed to move %s to init_net: %d\n", 7329 __func__, dev->name, err); 7330 BUG(); 7331 } 7332 } 7333 rtnl_unlock(); 7334 } 7335 7336 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) 7337 { 7338 /* Return with the rtnl_lock held when there are no network 7339 * devices unregistering in any network namespace in net_list. 7340 */ 7341 struct net *net; 7342 bool unregistering; 7343 DEFINE_WAIT_FUNC(wait, woken_wake_function); 7344 7345 add_wait_queue(&netdev_unregistering_wq, &wait); 7346 for (;;) { 7347 unregistering = false; 7348 rtnl_lock(); 7349 list_for_each_entry(net, net_list, exit_list) { 7350 if (net->dev_unreg_count > 0) { 7351 unregistering = true; 7352 break; 7353 } 7354 } 7355 if (!unregistering) 7356 break; 7357 __rtnl_unlock(); 7358 7359 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 7360 } 7361 remove_wait_queue(&netdev_unregistering_wq, &wait); 7362 } 7363 7364 static void __net_exit default_device_exit_batch(struct list_head *net_list) 7365 { 7366 /* At exit all network devices must be removed from a network 7367 * namespace. Do this in the reverse order of registration. 7368 * Do this across as many network namespaces as possible to 7369 * improve batching efficiency. 7370 */ 7371 struct net_device *dev; 7372 struct net *net; 7373 LIST_HEAD(dev_kill_list); 7374 7375 /* To prevent network device cleanup code from dereferencing 7376 * loopback devices or network devices that have been freed, 7377 * wait here for all pending unregistrations to complete, 7378 * before unregistering the loopback device and allowing the 7379 * network namespace to be freed. 7380 * 7381 * The netdev todo list containing all network device 7382 * unregistrations that happen in default_device_exit_batch 7383 * will run in the rtnl_unlock() at the end of 7384 * default_device_exit_batch.
7385 */ 7386 rtnl_lock_unregistering(net_list); 7387 list_for_each_entry(net, net_list, exit_list) { 7388 for_each_netdev_reverse(net, dev) { 7389 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) 7390 dev->rtnl_link_ops->dellink(dev, &dev_kill_list); 7391 else 7392 unregister_netdevice_queue(dev, &dev_kill_list); 7393 } 7394 } 7395 unregister_netdevice_many(&dev_kill_list); 7396 rtnl_unlock(); 7397 } 7398 7399 static struct pernet_operations __net_initdata default_device_ops = { 7400 .exit = default_device_exit, 7401 .exit_batch = default_device_exit_batch, 7402 }; 7403 7404 /* 7405 * Initialize the DEV module. At boot time this walks the device list and 7406 * unhooks any devices that fail to initialize (normally hardware not 7407 * present) and leaves us with a valid list of present and active devices. 7408 * 7409 */ 7410 7411 /* 7412 * This is called single threaded during boot, so no need 7413 * to take the rtnl semaphore. 7414 */ 7415 static int __init net_dev_init(void) 7416 { 7417 int i, rc = -ENOMEM; 7418 7419 BUG_ON(!dev_boot_phase); 7420 7421 if (dev_proc_init()) 7422 goto out; 7423 7424 if (netdev_kobject_init()) 7425 goto out; 7426 7427 INIT_LIST_HEAD(&ptype_all); 7428 for (i = 0; i < PTYPE_HASH_SIZE; i++) 7429 INIT_LIST_HEAD(&ptype_base[i]); 7430 7431 INIT_LIST_HEAD(&offload_base); 7432 7433 if (register_pernet_subsys(&netdev_net_ops)) 7434 goto out; 7435 7436 /* 7437 * Initialize the packet receive queues. 7438 */ 7439 7440 for_each_possible_cpu(i) { 7441 struct softnet_data *sd = &per_cpu(softnet_data, i); 7442 7443 skb_queue_head_init(&sd->input_pkt_queue); 7444 skb_queue_head_init(&sd->process_queue); 7445 INIT_LIST_HEAD(&sd->poll_list); 7446 sd->output_queue_tailp = &sd->output_queue; 7447 #ifdef CONFIG_RPS 7448 sd->csd.func = rps_trigger_softirq; 7449 sd->csd.info = sd; 7450 sd->cpu = i; 7451 #endif 7452 7453 sd->backlog.poll = process_backlog; 7454 sd->backlog.weight = weight_p; 7455 } 7456 7457 dev_boot_phase = 0; 7458 7459 /* The loopback device is special: if any other network device 7460 * is present in a network namespace, the loopback device must 7461 * be present too. Since we now dynamically allocate and free the 7462 * loopback device, ensure this invariant is maintained by 7463 * keeping the loopback device as the first device on the 7464 * list of network devices, so that it is the first device 7465 * that appears and the last network device 7466 * that disappears. 7467 */ 7468 if (register_pernet_device(&loopback_net_ops)) 7469 goto out; 7470 7471 if (register_pernet_device(&default_device_ops)) 7472 goto out; 7473 7474 open_softirq(NET_TX_SOFTIRQ, net_tx_action); 7475 open_softirq(NET_RX_SOFTIRQ, net_rx_action); 7476 7477 hotcpu_notifier(dev_cpu_callback, 0); 7478 dst_init(); 7479 rc = 0; 7480 out: 7481 return rc; 7482 } 7483 7484 subsys_initcall(net_dev_init); 7485
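/*
 * subsys_initcall() runs net_dev_init() ahead of the device/driver
 * initcall levels, so dev_boot_phase is cleared before any built-in
 * driver can reach the BUG_ON(dev_boot_phase) check in
 * register_netdevice().
 */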