/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *		Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *		Paul Rusty Russell :	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info);

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id;
static DEFINE_HASHTABLE(napi_hash, 8);

static seqcount_t devnet_rename_seq;

static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);
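/*
 * Illustrative sketch (not part of the original file): how a pure reader
 * is expected to walk the device list under the locking rules described
 * above.  Holding rcu_read_lock() (or dev_base_lock for reading) is
 * sufficient; the rtnl semaphore is only needed by writers.
 *
 *	struct net_device *dev;
 *
 *	rcu_read_lock();
 *	for_each_netdev_rcu(&init_net, dev)
 *		pr_debug("saw device %s\n", dev->name);
 *	rcu_read_unlock();
 */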
/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 *							--ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
	else
		return pt->dev ? &pt->dev->ptype_specific :
				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot guarantee that all CPUs
 *	currently in the middle of receiving packets will see the new packet
 *	type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);

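/*
 * Illustrative sketch (assumption, not part of the original file): a module
 * that wants to see every IPv4 frame would typically register a handler
 * like the one below, and tear it down with dev_remove_pack() on exit.
 *
 *	static int example_rcv(struct sk_buff *skb, struct net_device *dev,
 *			       struct packet_type *pt,
 *			       struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);			// consume our copy
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type example_pt __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.func = example_rcv,
 *	};
 *
 *	dev_add_pack(&example_pt);		// in module init
 *	dev_remove_pack(&example_pt);		// in module exit
 */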
/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&proto_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot guarantee that all CPUs
 *	currently in the middle of receiving packets will see the new offload
 *	handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;

	spin_lock(&offload_lock);
	list_add_rcu(&po->list, head);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);

/**
 *	__dev_remove_offload - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}

/**
 *	dev_remove_offload - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine for
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

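/*
 * Illustrative sketch (assumption, not part of the original file): the
 * entries above are normally filled in from the "netdev=" kernel command
 * line option parsed further below, e.g.
 *
 *	netdev=5,0x340,0,0,eth0
 *
 * which get_options() splits into irq=5, base_addr=0x340, mem_start=0 and
 * mem_end=0, leaving "eth0" as the name passed to netdev_boot_setup_add().
 */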
/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq = s[i].map.irq;
			dev->base_addr = s[i].map.base_addr;
			dev->mem_start = s[i].map.mem_start;
			dev->mem_end = s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);

/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);

/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

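/*
 * Illustrative sketch (assumption, not part of the original file): the
 * refcounted lookups above pair with dev_put(), while the _rcu variants
 * are only valid inside the RCU read-side critical section.
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		pr_debug("%s has ifindex %d\n", dev->name, dev->ifindex);
 *		dev_put(dev);		// drop the reference we were given
 *	}
 */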
/**
 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 *	@net: network namespace
 *	@name: a pointer to the buffer where the name will be stored.
 *	@ifindex: the ifindex of the interface to get the name from.
 *
 *	The use of raw_seqcount_begin() and cond_resched() before
 *	retrying is required as we want to give the writers a chance
 *	to complete when CONFIG_PREEMPT is not set.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
	struct net_device *dev;
	unsigned int seq;

retry:
	seq = raw_seqcount_begin(&devnet_rename_seq);
	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (!dev) {
		rcu_read_unlock();
		return -ENODEV;
	}

	strcpy(name, dev->name);
	rcu_read_unlock();
	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
		cond_resched();
		goto retry;
	}

	return 0;
}

/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	__dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rtnl_lock(), and result refcount is unchanged.
 */

struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
				      unsigned short mask)
{
	struct net_device *dev, *ret;

	ASSERT_RTNL();

	ret = NULL;
	for_each_netdev(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strlen(name) >= IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || *name == ':' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/*  avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

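/*
 * Illustrative sketch (assumption, not part of the original file): with
 * devices "eth0" and "eth2" already present, a request for the format
 * string "eth%d" builds the in-use bitmap {0, 2} and hands back unit 1,
 * so the caller ends up with the name "eth1".  A literal name without a
 * '%' skips the bitmap entirely and only gets the duplicate check.
 */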
/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);

static int dev_alloc_name_ns(struct net *net,
			     struct net_device *dev,
			     const char *name)
{
	char buf[IFNAMSIZ];
	int ret;

	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}

static int dev_get_valid_name(struct net *net,
			      struct net_device *dev,
			      const char *name)
{
	BUG_ON(!net);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (strchr(name, '%'))
		return dev_alloc_name_ns(net, dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d"
 *	for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	write_seqcount_begin(&devnet_rename_seq);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	netdev_adjacent_rename_links(dev, oldname);

	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	char *new_ifalias;

	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		kfree(dev->ifalias);
		dev->ifalias = NULL;
		return 0;
	}

	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!new_ifalias)
		return -ENOMEM;
	dev->ifalias = new_ifalias;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}


/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		struct netdev_notifier_change_info change_info;

		change_info.flags_changed = 0;
		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
					      &change_info.info);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
	}
}
EXPORT_SYMBOL(netdev_state_change);

/**
 *	netdev_notify_peers - notify network peers about existence of @dev
 *	@dev: network device
 *
 *	Generate traffic such that interested network peers are aware of
 *	@dev, such as by generating a gratuitous ARP. This may be used when
 *	a device wants to inform the rest of the network about some sort of
 *	reconfiguration such as a failover event or virtual machine
 *	migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);

static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	if (!netif_device_present(dev))
		return -ENODEV;

	/* Block netpoll from trying to do any rx path servicing.
	 * If we don't do this there is a chance ndo_poll_controller
	 * or ndo_poll may be running while we open the device
	 */
	netpoll_poll_disable(dev);

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	netpoll_poll_enable(dev);

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		dev->flags |= IFF_UP;
		dev_set_rx_mode(dev);
		dev_activate(dev);
		add_device_randomness(dev->dev_addr, dev->addr_len);
	}

	return ret;
}

/**
 *	dev_open	- prepare an interface for use.
 *	@dev:	device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	if (dev->flags & IFF_UP)
		return 0;

	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);

static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		netpoll_poll_enable(dev);
	}

	return 0;
}

static int __dev_close(struct net_device *dev)
{
	int retval;
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	retval = __dev_close_many(&single);
	list_del(&single);

	return retval;
}

int dev_close_many(struct list_head *head, bool unlink)
{
	struct net_device *dev, *tmp;

	/* Remove the devices that don't need to be closed */
	list_for_each_entry_safe(dev, tmp, head, close_list)
		if (!(dev->flags & IFF_UP))
			list_del_init(&dev->close_list);

	__dev_close_many(head);

	list_for_each_entry_safe(dev, tmp, head, close_list) {
		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
		call_netdevice_notifiers(NETDEV_DOWN, dev);
		if (unlink)
			list_del_init(&dev->close_list);
	}

	return 0;
}
EXPORT_SYMBOL(dev_close_many);

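/*
 * Illustrative sketch (assumption, not part of the original file): callers
 * bring an interface up or down under the rtnl semaphore, mirroring what
 * the ioctl and netlink paths do.
 *
 *	rtnl_lock();
 *	err = dev_open(dev);		// no-op if IFF_UP is already set
 *	...
 *	dev_close(dev);			// sends NETDEV_GOING_DOWN/NETDEV_DOWN
 *	rtnl_unlock();
 */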
/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		LIST_HEAD(single);

		list_add(&dev->close_list, &single);
		dev_close_many(&single, true);
		list_del(&single);
	}
	return 0;
}
EXPORT_SYMBOL(dev_close);


/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	struct net_device *lower_dev;
	struct list_head *iter;

	dev->wanted_features &= ~NETIF_F_LRO;
	netdev_update_features(dev);

	if (unlikely(dev->features & NETIF_F_LRO))
		netdev_WARN(dev, "failed to disable LRO!\n");

	netdev_for_each_lower_dev(dev, lower_dev, iter)
		dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);

static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
				   struct net_device *dev)
{
	struct netdev_notifier_info info;

	netdev_notifier_info_init(&info, dev);
	return nb->notifier_call(nb, val, &info);
}

static int dev_boot_phase = 1;

/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 *	When registered all registration and up events are replayed
 *	to the new notifier to allow device to have a race free
 *	view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);

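/*
 * Illustrative sketch (assumption, not part of the original file): a minimal
 * notifier that logs devices coming up, registered with the call above.
 *
 *	static int example_netdev_event(struct notifier_block *nb,
 *					unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			pr_debug("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block example_nb = {
 *		.notifier_call = example_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&example_nb);
 *	...
 *	unregister_netdevice_notifier(&example_nb);
 */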
/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 *
 *	After unregistering unregister and down device events are synthesized
 *	for all devices on the device list to the removed notifier to remove
 *	the need for special case cleanup code.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	if (err)
		goto unlock;

	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}
unlock:
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);

/**
 *	call_netdevice_notifiers_info - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@info: notifier information data
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info)
{
	ASSERT_RTNL();
	netdev_notifier_info_init(info, dev);
	return raw_notifier_call_chain(&netdev_chain, val, info);
}

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	struct netdev_notifier_info info;

	return call_netdevice_notifiers_info(val, dev, &info);
}
EXPORT_SYMBOL(call_netdevice_notifiers);

static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 */
static atomic_t netstamp_needed_deferred;
#endif

void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);

	if (deferred) {
		while (--deferred)
			static_key_slow_dec(&netstamp_needed);
		return;
	}
#endif
	static_key_slow_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	if (in_interrupt()) {
		atomic_inc(&netstamp_needed_deferred);
		return;
	}
#endif
	static_key_slow_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);

static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp.tv64 = 0;
	if (static_key_false(&netstamp_needed))
		__net_timestamp(skb);
}

#define net_timestamp_check(COND, SKB)			\
	if (static_key_false(&netstamp_needed)) {	\
		if ((COND) && !(SKB)->tstamp.tv64)	\
			__net_timestamp(SKB);		\
	}						\

bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
{
	unsigned int len;

	if (!(dev->flags & IFF_UP))
		return false;

	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
	if (skb->len <= len)
		return true;

	/* if TSO is enabled, we don't care about the length as the packet
	 * could be forwarded without being segmented before
	 */
	if (skb_is_gso(skb))
		return true;

	return false;
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);

int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
			atomic_long_inc(&dev->rx_dropped);
			kfree_skb(skb);
			return NET_RX_DROP;
		}
	}

	if (unlikely(!is_skb_forwardable(dev, skb))) {
		atomic_long_inc(&dev->rx_dropped);
		kfree_skb(skb);
		return NET_RX_DROP;
	}

	skb_scrub_packet(skb, true);
	skb->priority = 0;
	skb->protocol = eth_type_trans(skb, dev);
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

	return 0;
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);

/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP	(packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);

static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
		return -ENOMEM;
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

static inline void deliver_ptype_list_skb(struct sk_buff *skb,
					  struct packet_type **pt,
					  struct net_device *orig_dev,
					  __be16 type,
					  struct list_head *ptype_list)
{
	struct packet_type *ptype, *pt_prev = *pt;

	list_for_each_entry_rcu(ptype, ptype_list, list) {
		if (ptype->type != type)
			continue;
		if (pt_prev)
			deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}
	*pt = pt_prev;
}

static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
{
	if (!ptype->af_packet_priv || !skb->sk)
		return false;

	if (ptype->id_match)
		return ptype->id_match(ptype, skb->sk);
	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
		return true;

	return false;
}

/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;
	struct list_head *ptype_list = &ptype_all;

	rcu_read_lock();
again:
	list_for_each_entry_rcu(ptype, ptype_list, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if (skb_loop_sk(ptype, skb))
			continue;

		if (pt_prev) {
			deliver_skb(skb2, pt_prev, skb->dev);
			pt_prev = ptype;
			continue;
		}

		/* need to clone skb, done only once */
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2)
			goto out_unlock;

		net_timestamp_set(skb2);

		/* skb->nh should be correctly
		 * set by sender, so that the second statement is
		 * just protection against buggy protocols.
		 */
		skb_reset_mac_header(skb2);

		if (skb_network_header(skb2) < skb2->data ||
		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
					     ntohs(skb2->protocol),
					     dev->name);
			skb_reset_network_header(skb2);
		}

		skb2->transport_header = skb2->network_header;
		skb2->pkt_type = PACKET_OUTGOING;
		pt_prev = ptype;
	}

	if (ptype_list == &ptype_all) {
		ptype_list = &dev->ptype_all;
		goto again;
	}
out_unlock:
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}

/**
 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 * @dev: Network device
 * @txq: number of queues available
 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this, verify the tc mapping remains valid and, if
 * not, NULL the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used. In the worst case, TC0
 * is invalid and nothing can be done, so disable priority mappings.
 * It is expected that drivers will fix this mapping if they can before
 * calling netif_set_real_num_tx_queues.
 */
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
{
	int i;
	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];

	/* If TC0 is invalidated disable TC mapping */
	if (tc->offset + tc->count > txq) {
		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
		dev->num_tc = 0;
		return;
	}

	/* Invalidated prio to tc mappings set to TC0 */
	for (i = 1; i < TC_BITMASK + 1; i++) {
		int q = netdev_get_prio_tc_map(dev, i);

		tc = &dev->tc_to_txq[q];
		if (tc->offset + tc->count > txq) {
			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
				i, q);
			netdev_set_prio_tc_map(dev, i, 0);
		}
	}
}

#ifdef CONFIG_XPS
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))

static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
					int cpu, u16 index)
{
	struct xps_map *map = NULL;
	int pos;

	if (dev_maps)
		map = xmap_dereference(dev_maps->cpu_map[cpu]);

	for (pos = 0; map && pos < map->len; pos++) {
		if (map->queues[pos] == index) {
			if (map->len > 1) {
				map->queues[pos] = map->queues[--map->len];
			} else {
				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
				kfree_rcu(map, rcu);
				map = NULL;
			}
			break;
		}
	}

	return map;
}

static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
{
	struct xps_dev_maps *dev_maps;
	int cpu, i;
	bool active = false;

	mutex_lock(&xps_map_mutex);
	dev_maps = xmap_dereference(dev->xps_maps);

	if (!dev_maps)
		goto out_no_maps;

	for_each_possible_cpu(cpu) {
		for (i = index; i < dev->num_tx_queues; i++) {
			if (!remove_xps_queue(dev_maps, cpu, i))
				break;
		}
		if (i == dev->num_tx_queues)
			active = true;
	}

	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

	for (i = index; i < dev->num_tx_queues; i++)
		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
					     NUMA_NO_NODE);

out_no_maps:
	mutex_unlock(&xps_map_mutex);
}

static struct xps_map *expand_xps_map(struct xps_map *map,
				      int cpu, u16 index)
{
	struct xps_map *new_map;
	int alloc_len = XPS_MIN_MAP_ALLOC;
	int i, pos;

	for (pos = 0; map && pos < map->len; pos++) {
		if (map->queues[pos] != index)
			continue;
		return map;
	}

	/* Need to add queue to this CPU's existing map */
	if (map) {
		if (pos < map->alloc_len)
			return map;

		alloc_len = map->alloc_len * 2;
	}

	/* Need to allocate new map to store queue on this CPU's map */
	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
			       cpu_to_node(cpu));
	if (!new_map)
		return NULL;

	for (i = 0; i < pos; i++)
		new_map->queues[i] = map->queues[i];
	new_map->alloc_len = alloc_len;
	new_map->len = pos;

	return new_map;
}

int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
			u16 index)
{
	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
*new_dev_maps = NULL; 1976 struct xps_map *map, *new_map; 1977 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES); 1978 int cpu, numa_node_id = -2; 1979 bool active = false; 1980 1981 mutex_lock(&xps_map_mutex); 1982 1983 dev_maps = xmap_dereference(dev->xps_maps); 1984 1985 /* allocate memory for queue storage */ 1986 for_each_online_cpu(cpu) { 1987 if (!cpumask_test_cpu(cpu, mask)) 1988 continue; 1989 1990 if (!new_dev_maps) 1991 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); 1992 if (!new_dev_maps) { 1993 mutex_unlock(&xps_map_mutex); 1994 return -ENOMEM; 1995 } 1996 1997 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : 1998 NULL; 1999 2000 map = expand_xps_map(map, cpu, index); 2001 if (!map) 2002 goto error; 2003 2004 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); 2005 } 2006 2007 if (!new_dev_maps) 2008 goto out_no_new_maps; 2009 2010 for_each_possible_cpu(cpu) { 2011 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { 2012 /* add queue to CPU maps */ 2013 int pos = 0; 2014 2015 map = xmap_dereference(new_dev_maps->cpu_map[cpu]); 2016 while ((pos < map->len) && (map->queues[pos] != index)) 2017 pos++; 2018 2019 if (pos == map->len) 2020 map->queues[map->len++] = index; 2021 #ifdef CONFIG_NUMA 2022 if (numa_node_id == -2) 2023 numa_node_id = cpu_to_node(cpu); 2024 else if (numa_node_id != cpu_to_node(cpu)) 2025 numa_node_id = -1; 2026 #endif 2027 } else if (dev_maps) { 2028 /* fill in the new device map from the old device map */ 2029 map = xmap_dereference(dev_maps->cpu_map[cpu]); 2030 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); 2031 } 2032 2033 } 2034 2035 rcu_assign_pointer(dev->xps_maps, new_dev_maps); 2036 2037 /* Cleanup old maps */ 2038 if (dev_maps) { 2039 for_each_possible_cpu(cpu) { 2040 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); 2041 map = xmap_dereference(dev_maps->cpu_map[cpu]); 2042 if (map && map != new_map) 2043 kfree_rcu(map, rcu); 2044 } 2045 2046 kfree_rcu(dev_maps, rcu); 2047 } 2048 2049 dev_maps = new_dev_maps; 2050 active = true; 2051 2052 out_no_new_maps: 2053 /* update Tx queue numa node */ 2054 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), 2055 (numa_node_id >= 0) ? numa_node_id : 2056 NUMA_NO_NODE); 2057 2058 if (!dev_maps) 2059 goto out_no_maps; 2060 2061 /* removes queue from unused CPUs */ 2062 for_each_possible_cpu(cpu) { 2063 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) 2064 continue; 2065 2066 if (remove_xps_queue(dev_maps, cpu, index)) 2067 active = true; 2068 } 2069 2070 /* free map if not active */ 2071 if (!active) { 2072 RCU_INIT_POINTER(dev->xps_maps, NULL); 2073 kfree_rcu(dev_maps, rcu); 2074 } 2075 2076 out_no_maps: 2077 mutex_unlock(&xps_map_mutex); 2078 2079 return 0; 2080 error: 2081 /* remove any maps that we added */ 2082 for_each_possible_cpu(cpu) { 2083 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); 2084 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : 2085 NULL; 2086 if (new_map && new_map != map) 2087 kfree(new_map); 2088 } 2089 2090 mutex_unlock(&xps_map_mutex); 2091 2092 kfree(new_dev_maps); 2093 return -ENOMEM; 2094 } 2095 EXPORT_SYMBOL(netif_set_xps_queue); 2096 2097 #endif 2098 /* 2099 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues 2100 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. 
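 *
 * Illustrative sketch (editor's addition): a driver shrinking or growing its
 * active TX queue count, e.g. from an ethtool ->set_channels() handler where
 * the rtnl lock is already held.  The function name is hypothetical; only
 * the call below is the real API.
 *
 *	static int example_set_tx_channels(struct net_device *dev,
 *					   unsigned int count)
 *	{
 *		if (!count || count > dev->num_tx_queues)
 *			return -EINVAL;
 *		return netif_set_real_num_tx_queues(dev, count);
 *	}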
2101 */ 2102 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) 2103 { 2104 int rc; 2105 2106 if (txq < 1 || txq > dev->num_tx_queues) 2107 return -EINVAL; 2108 2109 if (dev->reg_state == NETREG_REGISTERED || 2110 dev->reg_state == NETREG_UNREGISTERING) { 2111 ASSERT_RTNL(); 2112 2113 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, 2114 txq); 2115 if (rc) 2116 return rc; 2117 2118 if (dev->num_tc) 2119 netif_setup_tc(dev, txq); 2120 2121 if (txq < dev->real_num_tx_queues) { 2122 qdisc_reset_all_tx_gt(dev, txq); 2123 #ifdef CONFIG_XPS 2124 netif_reset_xps_queues_gt(dev, txq); 2125 #endif 2126 } 2127 } 2128 2129 dev->real_num_tx_queues = txq; 2130 return 0; 2131 } 2132 EXPORT_SYMBOL(netif_set_real_num_tx_queues); 2133 2134 #ifdef CONFIG_SYSFS 2135 /** 2136 * netif_set_real_num_rx_queues - set actual number of RX queues used 2137 * @dev: Network device 2138 * @rxq: Actual number of RX queues 2139 * 2140 * This must be called either with the rtnl_lock held or before 2141 * registration of the net device. Returns 0 on success, or a 2142 * negative error code. If called before registration, it always 2143 * succeeds. 2144 */ 2145 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) 2146 { 2147 int rc; 2148 2149 if (rxq < 1 || rxq > dev->num_rx_queues) 2150 return -EINVAL; 2151 2152 if (dev->reg_state == NETREG_REGISTERED) { 2153 ASSERT_RTNL(); 2154 2155 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, 2156 rxq); 2157 if (rc) 2158 return rc; 2159 } 2160 2161 dev->real_num_rx_queues = rxq; 2162 return 0; 2163 } 2164 EXPORT_SYMBOL(netif_set_real_num_rx_queues); 2165 #endif 2166 2167 /** 2168 * netif_get_num_default_rss_queues - default number of RSS queues 2169 * 2170 * This routine should set an upper limit on the number of RSS queues 2171 * used by default by multiqueue devices. 2172 */ 2173 int netif_get_num_default_rss_queues(void) 2174 { 2175 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); 2176 } 2177 EXPORT_SYMBOL(netif_get_num_default_rss_queues); 2178 2179 static inline void __netif_reschedule(struct Qdisc *q) 2180 { 2181 struct softnet_data *sd; 2182 unsigned long flags; 2183 2184 local_irq_save(flags); 2185 sd = this_cpu_ptr(&softnet_data); 2186 q->next_sched = NULL; 2187 *sd->output_queue_tailp = q; 2188 sd->output_queue_tailp = &q->next_sched; 2189 raise_softirq_irqoff(NET_TX_SOFTIRQ); 2190 local_irq_restore(flags); 2191 } 2192 2193 void __netif_schedule(struct Qdisc *q) 2194 { 2195 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) 2196 __netif_reschedule(q); 2197 } 2198 EXPORT_SYMBOL(__netif_schedule); 2199 2200 struct dev_kfree_skb_cb { 2201 enum skb_free_reason reason; 2202 }; 2203 2204 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) 2205 { 2206 return (struct dev_kfree_skb_cb *)skb->cb; 2207 } 2208 2209 void netif_schedule_queue(struct netdev_queue *txq) 2210 { 2211 rcu_read_lock(); 2212 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { 2213 struct Qdisc *q = rcu_dereference(txq->qdisc); 2214 2215 __netif_schedule(q); 2216 } 2217 rcu_read_unlock(); 2218 } 2219 EXPORT_SYMBOL(netif_schedule_queue); 2220 2221 /** 2222 * netif_wake_subqueue - allow sending packets on subqueue 2223 * @dev: network device 2224 * @queue_index: sub queue index 2225 * 2226 * Resume individual transmit queue of a device with multiple transmit queues. 
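 *
 * Illustrative sketch (editor's addition): the usual pairing with
 * netif_stop_subqueue() in a driver's TX completion path once enough
 * descriptors have been reclaimed.  The ring structure, helper and
 * threshold are hypothetical.
 *
 *	if (__netif_subqueue_stopped(ring->ndev, ring->qid) &&
 *	    example_free_descs(ring) > EXAMPLE_WAKE_THRESH)
 *		netif_wake_subqueue(ring->ndev, ring->qid);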
2227 */ 2228 void netif_wake_subqueue(struct net_device *dev, u16 queue_index) 2229 { 2230 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); 2231 2232 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) { 2233 struct Qdisc *q; 2234 2235 rcu_read_lock(); 2236 q = rcu_dereference(txq->qdisc); 2237 __netif_schedule(q); 2238 rcu_read_unlock(); 2239 } 2240 } 2241 EXPORT_SYMBOL(netif_wake_subqueue); 2242 2243 void netif_tx_wake_queue(struct netdev_queue *dev_queue) 2244 { 2245 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { 2246 struct Qdisc *q; 2247 2248 rcu_read_lock(); 2249 q = rcu_dereference(dev_queue->qdisc); 2250 __netif_schedule(q); 2251 rcu_read_unlock(); 2252 } 2253 } 2254 EXPORT_SYMBOL(netif_tx_wake_queue); 2255 2256 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) 2257 { 2258 unsigned long flags; 2259 2260 if (likely(atomic_read(&skb->users) == 1)) { 2261 smp_rmb(); 2262 atomic_set(&skb->users, 0); 2263 } else if (likely(!atomic_dec_and_test(&skb->users))) { 2264 return; 2265 } 2266 get_kfree_skb_cb(skb)->reason = reason; 2267 local_irq_save(flags); 2268 skb->next = __this_cpu_read(softnet_data.completion_queue); 2269 __this_cpu_write(softnet_data.completion_queue, skb); 2270 raise_softirq_irqoff(NET_TX_SOFTIRQ); 2271 local_irq_restore(flags); 2272 } 2273 EXPORT_SYMBOL(__dev_kfree_skb_irq); 2274 2275 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) 2276 { 2277 if (in_irq() || irqs_disabled()) 2278 __dev_kfree_skb_irq(skb, reason); 2279 else 2280 dev_kfree_skb(skb); 2281 } 2282 EXPORT_SYMBOL(__dev_kfree_skb_any); 2283 2284 2285 /** 2286 * netif_device_detach - mark device as removed 2287 * @dev: network device 2288 * 2289 * Mark device as removed from system and therefore no longer available. 2290 */ 2291 void netif_device_detach(struct net_device *dev) 2292 { 2293 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && 2294 netif_running(dev)) { 2295 netif_tx_stop_all_queues(dev); 2296 } 2297 } 2298 EXPORT_SYMBOL(netif_device_detach); 2299 2300 /** 2301 * netif_device_attach - mark device as attached 2302 * @dev: network device 2303 * 2304 * Mark device as attached from system and restart if needed. 2305 */ 2306 void netif_device_attach(struct net_device *dev) 2307 { 2308 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && 2309 netif_running(dev)) { 2310 netif_tx_wake_all_queues(dev); 2311 __netdev_watchdog_up(dev); 2312 } 2313 } 2314 EXPORT_SYMBOL(netif_device_attach); 2315 2316 static void skb_warn_bad_offload(const struct sk_buff *skb) 2317 { 2318 static const netdev_features_t null_features = 0; 2319 struct net_device *dev = skb->dev; 2320 const char *driver = ""; 2321 2322 if (!net_ratelimit()) 2323 return; 2324 2325 if (dev && dev->dev.parent) 2326 driver = dev_driver_string(dev->dev.parent); 2327 2328 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " 2329 "gso_type=%d ip_summed=%d\n", 2330 driver, dev ? &dev->features : &null_features, 2331 skb->sk ? &skb->sk->sk_route_caps : &null_features, 2332 skb->len, skb->data_len, skb_shinfo(skb)->gso_size, 2333 skb_shinfo(skb)->gso_type, skb->ip_summed); 2334 } 2335 2336 /* 2337 * Invalidate hardware checksum when packet is to be mangled, and 2338 * complete checksum manually on outgoing path. 
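 *
 * Illustrative sketch (editor's addition): the typical transmit-path
 * fallback when the device cannot checksum this protocol; it mirrors what
 * validate_xmit_skb() does further below.  The label is hypothetical.
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    !(features & NETIF_F_ALL_CSUM) &&
 *	    skb_checksum_help(skb))
 *		goto drop_skb;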
2339 */ 2340 int skb_checksum_help(struct sk_buff *skb) 2341 { 2342 __wsum csum; 2343 int ret = 0, offset; 2344 2345 if (skb->ip_summed == CHECKSUM_COMPLETE) 2346 goto out_set_summed; 2347 2348 if (unlikely(skb_shinfo(skb)->gso_size)) { 2349 skb_warn_bad_offload(skb); 2350 return -EINVAL; 2351 } 2352 2353 /* Before computing a checksum, we should make sure no frag could 2354 * be modified by an external entity : checksum could be wrong. 2355 */ 2356 if (skb_has_shared_frag(skb)) { 2357 ret = __skb_linearize(skb); 2358 if (ret) 2359 goto out; 2360 } 2361 2362 offset = skb_checksum_start_offset(skb); 2363 BUG_ON(offset >= skb_headlen(skb)); 2364 csum = skb_checksum(skb, offset, skb->len - offset, 0); 2365 2366 offset += skb->csum_offset; 2367 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); 2368 2369 if (skb_cloned(skb) && 2370 !skb_clone_writable(skb, offset + sizeof(__sum16))) { 2371 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 2372 if (ret) 2373 goto out; 2374 } 2375 2376 *(__sum16 *)(skb->data + offset) = csum_fold(csum); 2377 out_set_summed: 2378 skb->ip_summed = CHECKSUM_NONE; 2379 out: 2380 return ret; 2381 } 2382 EXPORT_SYMBOL(skb_checksum_help); 2383 2384 __be16 skb_network_protocol(struct sk_buff *skb, int *depth) 2385 { 2386 __be16 type = skb->protocol; 2387 2388 /* Tunnel gso handlers can set protocol to ethernet. */ 2389 if (type == htons(ETH_P_TEB)) { 2390 struct ethhdr *eth; 2391 2392 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) 2393 return 0; 2394 2395 eth = (struct ethhdr *)skb_mac_header(skb); 2396 type = eth->h_proto; 2397 } 2398 2399 return __vlan_get_protocol(skb, type, depth); 2400 } 2401 2402 /** 2403 * skb_mac_gso_segment - mac layer segmentation handler. 2404 * @skb: buffer to segment 2405 * @features: features for the output path (see dev->features) 2406 */ 2407 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, 2408 netdev_features_t features) 2409 { 2410 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); 2411 struct packet_offload *ptype; 2412 int vlan_depth = skb->mac_len; 2413 __be16 type = skb_network_protocol(skb, &vlan_depth); 2414 2415 if (unlikely(!type)) 2416 return ERR_PTR(-EINVAL); 2417 2418 __skb_pull(skb, vlan_depth); 2419 2420 rcu_read_lock(); 2421 list_for_each_entry_rcu(ptype, &offload_base, list) { 2422 if (ptype->type == type && ptype->callbacks.gso_segment) { 2423 segs = ptype->callbacks.gso_segment(skb, features); 2424 break; 2425 } 2426 } 2427 rcu_read_unlock(); 2428 2429 __skb_push(skb, skb->data - skb_mac_header(skb)); 2430 2431 return segs; 2432 } 2433 EXPORT_SYMBOL(skb_mac_gso_segment); 2434 2435 2436 /* openvswitch calls this on rx path, so we need a different check. 2437 */ 2438 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) 2439 { 2440 if (tx_path) 2441 return skb->ip_summed != CHECKSUM_PARTIAL; 2442 else 2443 return skb->ip_summed == CHECKSUM_NONE; 2444 } 2445 2446 /** 2447 * __skb_gso_segment - Perform segmentation on skb. 2448 * @skb: buffer to segment 2449 * @features: features for the output path (see dev->features) 2450 * @tx_path: whether it is called in TX path 2451 * 2452 * This function segments the given skb and returns a list of segments. 2453 * 2454 * It may return NULL if the skb requires no segmentation. This is 2455 * only possible when GSO is used for verifying header integrity. 
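 *
 * Illustrative sketch (editor's addition): consuming the result via the
 * skb_gso_segment() tx_path wrapper, as validate_xmit_skb() below does.
 * The returned segments form a NULL-terminated list linked through ->next.
 *
 *	segs = skb_gso_segment(skb, features);
 *	if (IS_ERR(segs))
 *		goto drop;
 *	if (segs) {
 *		consume_skb(skb);
 *		skb = segs;
 *	}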
2456 */ 2457 struct sk_buff *__skb_gso_segment(struct sk_buff *skb, 2458 netdev_features_t features, bool tx_path) 2459 { 2460 if (unlikely(skb_needs_check(skb, tx_path))) { 2461 int err; 2462 2463 skb_warn_bad_offload(skb); 2464 2465 err = skb_cow_head(skb, 0); 2466 if (err < 0) 2467 return ERR_PTR(err); 2468 } 2469 2470 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); 2471 SKB_GSO_CB(skb)->encap_level = 0; 2472 2473 skb_reset_mac_header(skb); 2474 skb_reset_mac_len(skb); 2475 2476 return skb_mac_gso_segment(skb, features); 2477 } 2478 EXPORT_SYMBOL(__skb_gso_segment); 2479 2480 /* Take action when hardware reception checksum errors are detected. */ 2481 #ifdef CONFIG_BUG 2482 void netdev_rx_csum_fault(struct net_device *dev) 2483 { 2484 if (net_ratelimit()) { 2485 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); 2486 dump_stack(); 2487 } 2488 } 2489 EXPORT_SYMBOL(netdev_rx_csum_fault); 2490 #endif 2491 2492 /* Actually, we should eliminate this check as soon as we know, that: 2493 * 1. IOMMU is present and allows to map all the memory. 2494 * 2. No high memory really exists on this machine. 2495 */ 2496 2497 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) 2498 { 2499 #ifdef CONFIG_HIGHMEM 2500 int i; 2501 if (!(dev->features & NETIF_F_HIGHDMA)) { 2502 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2503 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2504 if (PageHighMem(skb_frag_page(frag))) 2505 return 1; 2506 } 2507 } 2508 2509 if (PCI_DMA_BUS_IS_PHYS) { 2510 struct device *pdev = dev->dev.parent; 2511 2512 if (!pdev) 2513 return 0; 2514 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2515 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2516 dma_addr_t addr = page_to_phys(skb_frag_page(frag)); 2517 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) 2518 return 1; 2519 } 2520 } 2521 #endif 2522 return 0; 2523 } 2524 2525 /* If MPLS offload request, verify we are testing hardware MPLS features 2526 * instead of standard features for the netdev. 
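 *
 * Illustrative note (editor's addition): a driver opts in by advertising a
 * (typically smaller) feature set in dev->mpls_features, e.g.
 *
 *	dev->mpls_features = NETIF_F_SG | NETIF_F_HW_CSUM;
 *
 * and anything missing from that mask is stripped below for MPLS packets.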
2527 */ 2528 #if IS_ENABLED(CONFIG_NET_MPLS_GSO) 2529 static netdev_features_t net_mpls_features(struct sk_buff *skb, 2530 netdev_features_t features, 2531 __be16 type) 2532 { 2533 if (eth_p_mpls(type)) 2534 features &= skb->dev->mpls_features; 2535 2536 return features; 2537 } 2538 #else 2539 static netdev_features_t net_mpls_features(struct sk_buff *skb, 2540 netdev_features_t features, 2541 __be16 type) 2542 { 2543 return features; 2544 } 2545 #endif 2546 2547 static netdev_features_t harmonize_features(struct sk_buff *skb, 2548 netdev_features_t features) 2549 { 2550 int tmp; 2551 __be16 type; 2552 2553 type = skb_network_protocol(skb, &tmp); 2554 features = net_mpls_features(skb, features, type); 2555 2556 if (skb->ip_summed != CHECKSUM_NONE && 2557 !can_checksum_protocol(features, type)) { 2558 features &= ~NETIF_F_ALL_CSUM; 2559 } else if (illegal_highdma(skb->dev, skb)) { 2560 features &= ~NETIF_F_SG; 2561 } 2562 2563 return features; 2564 } 2565 2566 netdev_features_t passthru_features_check(struct sk_buff *skb, 2567 struct net_device *dev, 2568 netdev_features_t features) 2569 { 2570 return features; 2571 } 2572 EXPORT_SYMBOL(passthru_features_check); 2573 2574 static netdev_features_t dflt_features_check(const struct sk_buff *skb, 2575 struct net_device *dev, 2576 netdev_features_t features) 2577 { 2578 return vlan_features_check(skb, features); 2579 } 2580 2581 netdev_features_t netif_skb_features(struct sk_buff *skb) 2582 { 2583 struct net_device *dev = skb->dev; 2584 netdev_features_t features = dev->features; 2585 u16 gso_segs = skb_shinfo(skb)->gso_segs; 2586 2587 if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs) 2588 features &= ~NETIF_F_GSO_MASK; 2589 2590 /* If encapsulation offload request, verify we are testing 2591 * hardware encapsulation features instead of standard 2592 * features for the netdev 2593 */ 2594 if (skb->encapsulation) 2595 features &= dev->hw_enc_features; 2596 2597 if (skb_vlan_tagged(skb)) 2598 features = netdev_intersect_features(features, 2599 dev->vlan_features | 2600 NETIF_F_HW_VLAN_CTAG_TX | 2601 NETIF_F_HW_VLAN_STAG_TX); 2602 2603 if (dev->netdev_ops->ndo_features_check) 2604 features &= dev->netdev_ops->ndo_features_check(skb, dev, 2605 features); 2606 else 2607 features &= dflt_features_check(skb, dev, features); 2608 2609 return harmonize_features(skb, features); 2610 } 2611 EXPORT_SYMBOL(netif_skb_features); 2612 2613 static int xmit_one(struct sk_buff *skb, struct net_device *dev, 2614 struct netdev_queue *txq, bool more) 2615 { 2616 unsigned int len; 2617 int rc; 2618 2619 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all)) 2620 dev_queue_xmit_nit(skb, dev); 2621 2622 len = skb->len; 2623 trace_net_dev_start_xmit(skb, dev); 2624 rc = netdev_start_xmit(skb, dev, txq, more); 2625 trace_net_dev_xmit(skb, rc, dev, len); 2626 2627 return rc; 2628 } 2629 2630 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, 2631 struct netdev_queue *txq, int *ret) 2632 { 2633 struct sk_buff *skb = first; 2634 int rc = NETDEV_TX_OK; 2635 2636 while (skb) { 2637 struct sk_buff *next = skb->next; 2638 2639 skb->next = NULL; 2640 rc = xmit_one(skb, dev, txq, next != NULL); 2641 if (unlikely(!dev_xmit_complete(rc))) { 2642 skb->next = next; 2643 goto out; 2644 } 2645 2646 skb = next; 2647 if (netif_xmit_stopped(txq) && skb) { 2648 rc = NETDEV_TX_BUSY; 2649 break; 2650 } 2651 } 2652 2653 out: 2654 *ret = rc; 2655 return skb; 2656 } 2657 2658 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, 2659 
netdev_features_t features) 2660 { 2661 if (skb_vlan_tag_present(skb) && 2662 !vlan_hw_offload_capable(features, skb->vlan_proto)) 2663 skb = __vlan_hwaccel_push_inside(skb); 2664 return skb; 2665 } 2666 2667 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) 2668 { 2669 netdev_features_t features; 2670 2671 if (skb->next) 2672 return skb; 2673 2674 features = netif_skb_features(skb); 2675 skb = validate_xmit_vlan(skb, features); 2676 if (unlikely(!skb)) 2677 goto out_null; 2678 2679 if (netif_needs_gso(dev, skb, features)) { 2680 struct sk_buff *segs; 2681 2682 segs = skb_gso_segment(skb, features); 2683 if (IS_ERR(segs)) { 2684 goto out_kfree_skb; 2685 } else if (segs) { 2686 consume_skb(skb); 2687 skb = segs; 2688 } 2689 } else { 2690 if (skb_needs_linearize(skb, features) && 2691 __skb_linearize(skb)) 2692 goto out_kfree_skb; 2693 2694 /* If packet is not checksummed and device does not 2695 * support checksumming for this protocol, complete 2696 * checksumming here. 2697 */ 2698 if (skb->ip_summed == CHECKSUM_PARTIAL) { 2699 if (skb->encapsulation) 2700 skb_set_inner_transport_header(skb, 2701 skb_checksum_start_offset(skb)); 2702 else 2703 skb_set_transport_header(skb, 2704 skb_checksum_start_offset(skb)); 2705 if (!(features & NETIF_F_ALL_CSUM) && 2706 skb_checksum_help(skb)) 2707 goto out_kfree_skb; 2708 } 2709 } 2710 2711 return skb; 2712 2713 out_kfree_skb: 2714 kfree_skb(skb); 2715 out_null: 2716 return NULL; 2717 } 2718 2719 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev) 2720 { 2721 struct sk_buff *next, *head = NULL, *tail; 2722 2723 for (; skb != NULL; skb = next) { 2724 next = skb->next; 2725 skb->next = NULL; 2726 2727 /* in case skb wont be segmented, point to itself */ 2728 skb->prev = skb; 2729 2730 skb = validate_xmit_skb(skb, dev); 2731 if (!skb) 2732 continue; 2733 2734 if (!head) 2735 head = skb; 2736 else 2737 tail->next = skb; 2738 /* If skb was segmented, skb->prev points to 2739 * the last segment. If not, it still contains skb. 2740 */ 2741 tail = skb->prev; 2742 } 2743 return head; 2744 } 2745 2746 static void qdisc_pkt_len_init(struct sk_buff *skb) 2747 { 2748 const struct skb_shared_info *shinfo = skb_shinfo(skb); 2749 2750 qdisc_skb_cb(skb)->pkt_len = skb->len; 2751 2752 /* To get more precise estimation of bytes sent on wire, 2753 * we add to pkt_len the headers size of all segments 2754 */ 2755 if (shinfo->gso_size) { 2756 unsigned int hdr_len; 2757 u16 gso_segs = shinfo->gso_segs; 2758 2759 /* mac layer + network layer */ 2760 hdr_len = skb_transport_header(skb) - skb_mac_header(skb); 2761 2762 /* + transport layer */ 2763 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) 2764 hdr_len += tcp_hdrlen(skb); 2765 else 2766 hdr_len += sizeof(struct udphdr); 2767 2768 if (shinfo->gso_type & SKB_GSO_DODGY) 2769 gso_segs = DIV_ROUND_UP(skb->len - hdr_len, 2770 shinfo->gso_size); 2771 2772 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; 2773 } 2774 } 2775 2776 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, 2777 struct net_device *dev, 2778 struct netdev_queue *txq) 2779 { 2780 spinlock_t *root_lock = qdisc_lock(q); 2781 bool contended; 2782 int rc; 2783 2784 qdisc_pkt_len_init(skb); 2785 qdisc_calculate_pkt_len(skb, q); 2786 /* 2787 * Heuristic to force contended enqueues to serialize on a 2788 * separate lock before trying to get qdisc main lock. 
2789 * This permits __QDISC___STATE_RUNNING owner to get the lock more 2790 * often and dequeue packets faster. 2791 */ 2792 contended = qdisc_is_running(q); 2793 if (unlikely(contended)) 2794 spin_lock(&q->busylock); 2795 2796 spin_lock(root_lock); 2797 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { 2798 kfree_skb(skb); 2799 rc = NET_XMIT_DROP; 2800 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && 2801 qdisc_run_begin(q)) { 2802 /* 2803 * This is a work-conserving queue; there are no old skbs 2804 * waiting to be sent out; and the qdisc is not running - 2805 * xmit the skb directly. 2806 */ 2807 2808 qdisc_bstats_update(q, skb); 2809 2810 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { 2811 if (unlikely(contended)) { 2812 spin_unlock(&q->busylock); 2813 contended = false; 2814 } 2815 __qdisc_run(q); 2816 } else 2817 qdisc_run_end(q); 2818 2819 rc = NET_XMIT_SUCCESS; 2820 } else { 2821 rc = q->enqueue(skb, q) & NET_XMIT_MASK; 2822 if (qdisc_run_begin(q)) { 2823 if (unlikely(contended)) { 2824 spin_unlock(&q->busylock); 2825 contended = false; 2826 } 2827 __qdisc_run(q); 2828 } 2829 } 2830 spin_unlock(root_lock); 2831 if (unlikely(contended)) 2832 spin_unlock(&q->busylock); 2833 return rc; 2834 } 2835 2836 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) 2837 static void skb_update_prio(struct sk_buff *skb) 2838 { 2839 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); 2840 2841 if (!skb->priority && skb->sk && map) { 2842 unsigned int prioidx = skb->sk->sk_cgrp_prioidx; 2843 2844 if (prioidx < map->priomap_len) 2845 skb->priority = map->priomap[prioidx]; 2846 } 2847 } 2848 #else 2849 #define skb_update_prio(skb) 2850 #endif 2851 2852 static DEFINE_PER_CPU(int, xmit_recursion); 2853 #define RECURSION_LIMIT 10 2854 2855 /** 2856 * dev_loopback_xmit - loop back @skb 2857 * @skb: buffer to transmit 2858 */ 2859 int dev_loopback_xmit(struct sk_buff *skb) 2860 { 2861 skb_reset_mac_header(skb); 2862 __skb_pull(skb, skb_network_offset(skb)); 2863 skb->pkt_type = PACKET_LOOPBACK; 2864 skb->ip_summed = CHECKSUM_UNNECESSARY; 2865 WARN_ON(!skb_dst(skb)); 2866 skb_dst_force(skb); 2867 netif_rx_ni(skb); 2868 return 0; 2869 } 2870 EXPORT_SYMBOL(dev_loopback_xmit); 2871 2872 /** 2873 * __dev_queue_xmit - transmit a buffer 2874 * @skb: buffer to transmit 2875 * @accel_priv: private data used for L2 forwarding offload 2876 * 2877 * Queue a buffer for transmission to a network device. The caller must 2878 * have set the device and priority and built the buffer before calling 2879 * this function. The function can be called from an interrupt. 2880 * 2881 * A negative errno code is returned on a failure. A success does not 2882 * guarantee the frame will be transmitted as it may be dropped due 2883 * to congestion or traffic shaping. 2884 * 2885 * ----------------------------------------------------------------------------------- 2886 * I notice this method can also return errors from the queue disciplines, 2887 * including NET_XMIT_DROP, which is a positive value. So, errors can also 2888 * be positive. 2889 * 2890 * Regardless of the return value, the skb is consumed, so it is currently 2891 * difficult to retry a send to this method. (You can bump the ref count 2892 * before sending to hold a reference for retry if you are careful.) 2893 * 2894 * When calling this method, interrupts MUST be enabled. This is because 2895 * the BH enable code must have IRQs enabled so that it will not deadlock. 
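 *
 * Illustrative sketch (editor's addition): the common calling pattern in a
 * tunnel or protocol output path.  dev_queue_xmit() consumes the skb either
 * way, and net_xmit_eval() folds NET_XMIT_CN into success.
 *
 *	skb->dev = dev;
 *	err = dev_queue_xmit(skb);
 *	if (net_xmit_eval(err))
 *		dev->stats.tx_dropped++;
 *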
2896 * --BLG 2897 */ 2898 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) 2899 { 2900 struct net_device *dev = skb->dev; 2901 struct netdev_queue *txq; 2902 struct Qdisc *q; 2903 int rc = -ENOMEM; 2904 2905 skb_reset_mac_header(skb); 2906 2907 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) 2908 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); 2909 2910 /* Disable soft irqs for various locks below. Also 2911 * stops preemption for RCU. 2912 */ 2913 rcu_read_lock_bh(); 2914 2915 skb_update_prio(skb); 2916 2917 /* If device/qdisc don't need skb->dst, release it right now while 2918 * its hot in this cpu cache. 2919 */ 2920 if (dev->priv_flags & IFF_XMIT_DST_RELEASE) 2921 skb_dst_drop(skb); 2922 else 2923 skb_dst_force(skb); 2924 2925 txq = netdev_pick_tx(dev, skb, accel_priv); 2926 q = rcu_dereference_bh(txq->qdisc); 2927 2928 #ifdef CONFIG_NET_CLS_ACT 2929 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); 2930 #endif 2931 trace_net_dev_queue(skb); 2932 if (q->enqueue) { 2933 rc = __dev_xmit_skb(skb, q, dev, txq); 2934 goto out; 2935 } 2936 2937 /* The device has no queue. Common case for software devices: 2938 loopback, all the sorts of tunnels... 2939 2940 Really, it is unlikely that netif_tx_lock protection is necessary 2941 here. (f.e. loopback and IP tunnels are clean ignoring statistics 2942 counters.) 2943 However, it is possible, that they rely on protection 2944 made by us here. 2945 2946 Check this and shot the lock. It is not prone from deadlocks. 2947 Either shot noqueue qdisc, it is even simpler 8) 2948 */ 2949 if (dev->flags & IFF_UP) { 2950 int cpu = smp_processor_id(); /* ok because BHs are off */ 2951 2952 if (txq->xmit_lock_owner != cpu) { 2953 2954 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) 2955 goto recursion_alert; 2956 2957 skb = validate_xmit_skb(skb, dev); 2958 if (!skb) 2959 goto drop; 2960 2961 HARD_TX_LOCK(dev, txq, cpu); 2962 2963 if (!netif_xmit_stopped(txq)) { 2964 __this_cpu_inc(xmit_recursion); 2965 skb = dev_hard_start_xmit(skb, dev, txq, &rc); 2966 __this_cpu_dec(xmit_recursion); 2967 if (dev_xmit_complete(rc)) { 2968 HARD_TX_UNLOCK(dev, txq); 2969 goto out; 2970 } 2971 } 2972 HARD_TX_UNLOCK(dev, txq); 2973 net_crit_ratelimited("Virtual device %s asks to queue packet!\n", 2974 dev->name); 2975 } else { 2976 /* Recursion is detected! 
It is possible, 2977 * unfortunately 2978 */ 2979 recursion_alert: 2980 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", 2981 dev->name); 2982 } 2983 } 2984 2985 rc = -ENETDOWN; 2986 drop: 2987 rcu_read_unlock_bh(); 2988 2989 atomic_long_inc(&dev->tx_dropped); 2990 kfree_skb_list(skb); 2991 return rc; 2992 out: 2993 rcu_read_unlock_bh(); 2994 return rc; 2995 } 2996 2997 int dev_queue_xmit(struct sk_buff *skb) 2998 { 2999 return __dev_queue_xmit(skb, NULL); 3000 } 3001 EXPORT_SYMBOL(dev_queue_xmit); 3002 3003 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) 3004 { 3005 return __dev_queue_xmit(skb, accel_priv); 3006 } 3007 EXPORT_SYMBOL(dev_queue_xmit_accel); 3008 3009 3010 /*======================================================================= 3011 Receiver routines 3012 =======================================================================*/ 3013 3014 int netdev_max_backlog __read_mostly = 1000; 3015 EXPORT_SYMBOL(netdev_max_backlog); 3016 3017 int netdev_tstamp_prequeue __read_mostly = 1; 3018 int netdev_budget __read_mostly = 300; 3019 int weight_p __read_mostly = 64; /* old backlog weight */ 3020 3021 /* Called with irq disabled */ 3022 static inline void ____napi_schedule(struct softnet_data *sd, 3023 struct napi_struct *napi) 3024 { 3025 list_add_tail(&napi->poll_list, &sd->poll_list); 3026 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 3027 } 3028 3029 #ifdef CONFIG_RPS 3030 3031 /* One global table that all flow-based protocols share. */ 3032 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; 3033 EXPORT_SYMBOL(rps_sock_flow_table); 3034 u32 rps_cpu_mask __read_mostly; 3035 EXPORT_SYMBOL(rps_cpu_mask); 3036 3037 struct static_key rps_needed __read_mostly; 3038 3039 static struct rps_dev_flow * 3040 set_rps_cpu(struct net_device *dev, struct sk_buff *skb, 3041 struct rps_dev_flow *rflow, u16 next_cpu) 3042 { 3043 if (next_cpu != RPS_NO_CPU) { 3044 #ifdef CONFIG_RFS_ACCEL 3045 struct netdev_rx_queue *rxqueue; 3046 struct rps_dev_flow_table *flow_table; 3047 struct rps_dev_flow *old_rflow; 3048 u32 flow_id; 3049 u16 rxq_index; 3050 int rc; 3051 3052 /* Should we steer this flow to a different hardware queue? */ 3053 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || 3054 !(dev->features & NETIF_F_NTUPLE)) 3055 goto out; 3056 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); 3057 if (rxq_index == skb_get_rx_queue(skb)) 3058 goto out; 3059 3060 rxqueue = dev->_rx + rxq_index; 3061 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3062 if (!flow_table) 3063 goto out; 3064 flow_id = skb_get_hash(skb) & flow_table->mask; 3065 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, 3066 rxq_index, flow_id); 3067 if (rc < 0) 3068 goto out; 3069 old_rflow = rflow; 3070 rflow = &flow_table->flows[flow_id]; 3071 rflow->filter = rc; 3072 if (old_rflow->filter == rflow->filter) 3073 old_rflow->filter = RPS_NO_FILTER; 3074 out: 3075 #endif 3076 rflow->last_qtail = 3077 per_cpu(softnet_data, next_cpu).input_queue_head; 3078 } 3079 3080 rflow->cpu = next_cpu; 3081 return rflow; 3082 } 3083 3084 /* 3085 * get_rps_cpu is called from netif_receive_skb and returns the target 3086 * CPU from the RPS map of the receiving queue for a given skb. 3087 * rcu_read_lock must be held on entry. 
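 *
 * Illustrative note (editor's addition): the per-queue map consulted here is
 * selected by the driver recording the hardware RX queue on the skb, e.g.
 *
 *	skb_record_rx_queue(skb, ring->queue_index);
 *
 * and is populated by the administrator through
 * /sys/class/net/<dev>/queues/rx-<n>/rps_cpus.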
3088 */ 3089 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, 3090 struct rps_dev_flow **rflowp) 3091 { 3092 const struct rps_sock_flow_table *sock_flow_table; 3093 struct netdev_rx_queue *rxqueue = dev->_rx; 3094 struct rps_dev_flow_table *flow_table; 3095 struct rps_map *map; 3096 int cpu = -1; 3097 u32 tcpu; 3098 u32 hash; 3099 3100 if (skb_rx_queue_recorded(skb)) { 3101 u16 index = skb_get_rx_queue(skb); 3102 3103 if (unlikely(index >= dev->real_num_rx_queues)) { 3104 WARN_ONCE(dev->real_num_rx_queues > 1, 3105 "%s received packet on queue %u, but number " 3106 "of RX queues is %u\n", 3107 dev->name, index, dev->real_num_rx_queues); 3108 goto done; 3109 } 3110 rxqueue += index; 3111 } 3112 3113 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ 3114 3115 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3116 map = rcu_dereference(rxqueue->rps_map); 3117 if (!flow_table && !map) 3118 goto done; 3119 3120 skb_reset_network_header(skb); 3121 hash = skb_get_hash(skb); 3122 if (!hash) 3123 goto done; 3124 3125 sock_flow_table = rcu_dereference(rps_sock_flow_table); 3126 if (flow_table && sock_flow_table) { 3127 struct rps_dev_flow *rflow; 3128 u32 next_cpu; 3129 u32 ident; 3130 3131 /* First check into global flow table if there is a match */ 3132 ident = sock_flow_table->ents[hash & sock_flow_table->mask]; 3133 if ((ident ^ hash) & ~rps_cpu_mask) 3134 goto try_rps; 3135 3136 next_cpu = ident & rps_cpu_mask; 3137 3138 /* OK, now we know there is a match, 3139 * we can look at the local (per receive queue) flow table 3140 */ 3141 rflow = &flow_table->flows[hash & flow_table->mask]; 3142 tcpu = rflow->cpu; 3143 3144 /* 3145 * If the desired CPU (where last recvmsg was done) is 3146 * different from current CPU (one in the rx-queue flow 3147 * table entry), switch if one of the following holds: 3148 * - Current CPU is unset (equal to RPS_NO_CPU). 3149 * - Current CPU is offline. 3150 * - The current CPU's queue tail has advanced beyond the 3151 * last packet that was enqueued using this table entry. 3152 * This guarantees that all previous packets for the flow 3153 * have been dequeued, thus preserving in order delivery. 3154 */ 3155 if (unlikely(tcpu != next_cpu) && 3156 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || 3157 ((int)(per_cpu(softnet_data, tcpu).input_queue_head - 3158 rflow->last_qtail)) >= 0)) { 3159 tcpu = next_cpu; 3160 rflow = set_rps_cpu(dev, skb, rflow, next_cpu); 3161 } 3162 3163 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { 3164 *rflowp = rflow; 3165 cpu = tcpu; 3166 goto done; 3167 } 3168 } 3169 3170 try_rps: 3171 3172 if (map) { 3173 tcpu = map->cpus[reciprocal_scale(hash, map->len)]; 3174 if (cpu_online(tcpu)) { 3175 cpu = tcpu; 3176 goto done; 3177 } 3178 } 3179 3180 done: 3181 return cpu; 3182 } 3183 3184 #ifdef CONFIG_RFS_ACCEL 3185 3186 /** 3187 * rps_may_expire_flow - check whether an RFS hardware filter may be removed 3188 * @dev: Device on which the filter was set 3189 * @rxq_index: RX queue index 3190 * @flow_id: Flow ID passed to ndo_rx_flow_steer() 3191 * @filter_id: Filter ID returned by ndo_rx_flow_steer() 3192 * 3193 * Drivers that implement ndo_rx_flow_steer() should periodically call 3194 * this function for each installed filter and remove the filters for 3195 * which it returns %true. 
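 *
 * Illustrative sketch (editor's addition): the shape of such a periodic scan.
 * The filter table and helpers are hypothetical; only the call is real.
 *
 *	for (i = 0; i < priv->rx_filter_count; i++) {
 *		struct example_filter *f = &priv->rx_filters[i];
 *
 *		if (f->installed &&
 *		    rps_may_expire_flow(priv->netdev, f->rxq_index,
 *					f->flow_id, i))
 *			example_remove_filter(priv, i);
 *	}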
3196 */ 3197 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, 3198 u32 flow_id, u16 filter_id) 3199 { 3200 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; 3201 struct rps_dev_flow_table *flow_table; 3202 struct rps_dev_flow *rflow; 3203 bool expire = true; 3204 int cpu; 3205 3206 rcu_read_lock(); 3207 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3208 if (flow_table && flow_id <= flow_table->mask) { 3209 rflow = &flow_table->flows[flow_id]; 3210 cpu = ACCESS_ONCE(rflow->cpu); 3211 if (rflow->filter == filter_id && cpu != RPS_NO_CPU && 3212 ((int)(per_cpu(softnet_data, cpu).input_queue_head - 3213 rflow->last_qtail) < 3214 (int)(10 * flow_table->mask))) 3215 expire = false; 3216 } 3217 rcu_read_unlock(); 3218 return expire; 3219 } 3220 EXPORT_SYMBOL(rps_may_expire_flow); 3221 3222 #endif /* CONFIG_RFS_ACCEL */ 3223 3224 /* Called from hardirq (IPI) context */ 3225 static void rps_trigger_softirq(void *data) 3226 { 3227 struct softnet_data *sd = data; 3228 3229 ____napi_schedule(sd, &sd->backlog); 3230 sd->received_rps++; 3231 } 3232 3233 #endif /* CONFIG_RPS */ 3234 3235 /* 3236 * Check if this softnet_data structure is another cpu one 3237 * If yes, queue it to our IPI list and return 1 3238 * If no, return 0 3239 */ 3240 static int rps_ipi_queued(struct softnet_data *sd) 3241 { 3242 #ifdef CONFIG_RPS 3243 struct softnet_data *mysd = this_cpu_ptr(&softnet_data); 3244 3245 if (sd != mysd) { 3246 sd->rps_ipi_next = mysd->rps_ipi_list; 3247 mysd->rps_ipi_list = sd; 3248 3249 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 3250 return 1; 3251 } 3252 #endif /* CONFIG_RPS */ 3253 return 0; 3254 } 3255 3256 #ifdef CONFIG_NET_FLOW_LIMIT 3257 int netdev_flow_limit_table_len __read_mostly = (1 << 12); 3258 #endif 3259 3260 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) 3261 { 3262 #ifdef CONFIG_NET_FLOW_LIMIT 3263 struct sd_flow_limit *fl; 3264 struct softnet_data *sd; 3265 unsigned int old_flow, new_flow; 3266 3267 if (qlen < (netdev_max_backlog >> 1)) 3268 return false; 3269 3270 sd = this_cpu_ptr(&softnet_data); 3271 3272 rcu_read_lock(); 3273 fl = rcu_dereference(sd->flow_limit); 3274 if (fl) { 3275 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); 3276 old_flow = fl->history[fl->history_head]; 3277 fl->history[fl->history_head] = new_flow; 3278 3279 fl->history_head++; 3280 fl->history_head &= FLOW_LIMIT_HISTORY - 1; 3281 3282 if (likely(fl->buckets[old_flow])) 3283 fl->buckets[old_flow]--; 3284 3285 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { 3286 fl->count++; 3287 rcu_read_unlock(); 3288 return true; 3289 } 3290 } 3291 rcu_read_unlock(); 3292 #endif 3293 return false; 3294 } 3295 3296 /* 3297 * enqueue_to_backlog is called to queue an skb to a per CPU backlog 3298 * queue (may be a remote CPU queue). 
3299 */ 3300 static int enqueue_to_backlog(struct sk_buff *skb, int cpu, 3301 unsigned int *qtail) 3302 { 3303 struct softnet_data *sd; 3304 unsigned long flags; 3305 unsigned int qlen; 3306 3307 sd = &per_cpu(softnet_data, cpu); 3308 3309 local_irq_save(flags); 3310 3311 rps_lock(sd); 3312 qlen = skb_queue_len(&sd->input_pkt_queue); 3313 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { 3314 if (qlen) { 3315 enqueue: 3316 __skb_queue_tail(&sd->input_pkt_queue, skb); 3317 input_queue_tail_incr_save(sd, qtail); 3318 rps_unlock(sd); 3319 local_irq_restore(flags); 3320 return NET_RX_SUCCESS; 3321 } 3322 3323 /* Schedule NAPI for backlog device 3324 * We can use non atomic operation since we own the queue lock 3325 */ 3326 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { 3327 if (!rps_ipi_queued(sd)) 3328 ____napi_schedule(sd, &sd->backlog); 3329 } 3330 goto enqueue; 3331 } 3332 3333 sd->dropped++; 3334 rps_unlock(sd); 3335 3336 local_irq_restore(flags); 3337 3338 atomic_long_inc(&skb->dev->rx_dropped); 3339 kfree_skb(skb); 3340 return NET_RX_DROP; 3341 } 3342 3343 static int netif_rx_internal(struct sk_buff *skb) 3344 { 3345 int ret; 3346 3347 net_timestamp_check(netdev_tstamp_prequeue, skb); 3348 3349 trace_netif_rx(skb); 3350 #ifdef CONFIG_RPS 3351 if (static_key_false(&rps_needed)) { 3352 struct rps_dev_flow voidflow, *rflow = &voidflow; 3353 int cpu; 3354 3355 preempt_disable(); 3356 rcu_read_lock(); 3357 3358 cpu = get_rps_cpu(skb->dev, skb, &rflow); 3359 if (cpu < 0) 3360 cpu = smp_processor_id(); 3361 3362 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 3363 3364 rcu_read_unlock(); 3365 preempt_enable(); 3366 } else 3367 #endif 3368 { 3369 unsigned int qtail; 3370 ret = enqueue_to_backlog(skb, get_cpu(), &qtail); 3371 put_cpu(); 3372 } 3373 return ret; 3374 } 3375 3376 /** 3377 * netif_rx - post buffer to the network code 3378 * @skb: buffer to post 3379 * 3380 * This function receives a packet from a device driver and queues it for 3381 * the upper (protocol) levels to process. It always succeeds. The buffer 3382 * may be dropped during processing for congestion control or by the 3383 * protocol layers. 
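 *
 * Illustrative sketch (editor's addition): a non-NAPI driver's receive path
 * handing one frame to the stack.  Descriptor handling is elided and the
 * buffer/length names are hypothetical; use netif_rx_ni() instead when
 * calling from process context.
 *
 *	skb = netdev_alloc_skb_ip_align(dev, pkt_len);
 *	if (!skb) {
 *		dev->stats.rx_dropped++;
 *		return;
 *	}
 *	memcpy(skb_put(skb, pkt_len), rx_buf, pkt_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);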
3384 * 3385 * return values: 3386 * NET_RX_SUCCESS (no congestion) 3387 * NET_RX_DROP (packet was dropped) 3388 * 3389 */ 3390 3391 int netif_rx(struct sk_buff *skb) 3392 { 3393 trace_netif_rx_entry(skb); 3394 3395 return netif_rx_internal(skb); 3396 } 3397 EXPORT_SYMBOL(netif_rx); 3398 3399 int netif_rx_ni(struct sk_buff *skb) 3400 { 3401 int err; 3402 3403 trace_netif_rx_ni_entry(skb); 3404 3405 preempt_disable(); 3406 err = netif_rx_internal(skb); 3407 if (local_softirq_pending()) 3408 do_softirq(); 3409 preempt_enable(); 3410 3411 return err; 3412 } 3413 EXPORT_SYMBOL(netif_rx_ni); 3414 3415 static void net_tx_action(struct softirq_action *h) 3416 { 3417 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 3418 3419 if (sd->completion_queue) { 3420 struct sk_buff *clist; 3421 3422 local_irq_disable(); 3423 clist = sd->completion_queue; 3424 sd->completion_queue = NULL; 3425 local_irq_enable(); 3426 3427 while (clist) { 3428 struct sk_buff *skb = clist; 3429 clist = clist->next; 3430 3431 WARN_ON(atomic_read(&skb->users)); 3432 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) 3433 trace_consume_skb(skb); 3434 else 3435 trace_kfree_skb(skb, net_tx_action); 3436 __kfree_skb(skb); 3437 } 3438 } 3439 3440 if (sd->output_queue) { 3441 struct Qdisc *head; 3442 3443 local_irq_disable(); 3444 head = sd->output_queue; 3445 sd->output_queue = NULL; 3446 sd->output_queue_tailp = &sd->output_queue; 3447 local_irq_enable(); 3448 3449 while (head) { 3450 struct Qdisc *q = head; 3451 spinlock_t *root_lock; 3452 3453 head = head->next_sched; 3454 3455 root_lock = qdisc_lock(q); 3456 if (spin_trylock(root_lock)) { 3457 smp_mb__before_atomic(); 3458 clear_bit(__QDISC_STATE_SCHED, 3459 &q->state); 3460 qdisc_run(q); 3461 spin_unlock(root_lock); 3462 } else { 3463 if (!test_bit(__QDISC_STATE_DEACTIVATED, 3464 &q->state)) { 3465 __netif_reschedule(q); 3466 } else { 3467 smp_mb__before_atomic(); 3468 clear_bit(__QDISC_STATE_SCHED, 3469 &q->state); 3470 } 3471 } 3472 } 3473 } 3474 } 3475 3476 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ 3477 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) 3478 /* This hook is defined here for ATM LANE */ 3479 int (*br_fdb_test_addr_hook)(struct net_device *dev, 3480 unsigned char *addr) __read_mostly; 3481 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); 3482 #endif 3483 3484 #ifdef CONFIG_NET_CLS_ACT 3485 /* TODO: Maybe we should just force sch_ingress to be compiled in 3486 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions 3487 * a compare and 2 stores extra right now if we dont have it on 3488 * but have CONFIG_NET_CLS_ACT 3489 * NOTE: This doesn't stop any functionality; if you dont have 3490 * the ingress scheduler, you just can't add policies on ingress. 
3491 * 3492 */ 3493 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) 3494 { 3495 struct net_device *dev = skb->dev; 3496 u32 ttl = G_TC_RTTL(skb->tc_verd); 3497 int result = TC_ACT_OK; 3498 struct Qdisc *q; 3499 3500 if (unlikely(MAX_RED_LOOP < ttl++)) { 3501 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n", 3502 skb->skb_iif, dev->ifindex); 3503 return TC_ACT_SHOT; 3504 } 3505 3506 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); 3507 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); 3508 3509 q = rcu_dereference(rxq->qdisc); 3510 if (q != &noop_qdisc) { 3511 spin_lock(qdisc_lock(q)); 3512 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) 3513 result = qdisc_enqueue_root(skb, q); 3514 spin_unlock(qdisc_lock(q)); 3515 } 3516 3517 return result; 3518 } 3519 3520 static inline struct sk_buff *handle_ing(struct sk_buff *skb, 3521 struct packet_type **pt_prev, 3522 int *ret, struct net_device *orig_dev) 3523 { 3524 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); 3525 3526 if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc) 3527 goto out; 3528 3529 if (*pt_prev) { 3530 *ret = deliver_skb(skb, *pt_prev, orig_dev); 3531 *pt_prev = NULL; 3532 } 3533 3534 switch (ing_filter(skb, rxq)) { 3535 case TC_ACT_SHOT: 3536 case TC_ACT_STOLEN: 3537 kfree_skb(skb); 3538 return NULL; 3539 } 3540 3541 out: 3542 skb->tc_verd = 0; 3543 return skb; 3544 } 3545 #endif 3546 3547 /** 3548 * netdev_rx_handler_register - register receive handler 3549 * @dev: device to register a handler for 3550 * @rx_handler: receive handler to register 3551 * @rx_handler_data: data pointer that is used by rx handler 3552 * 3553 * Register a receive handler for a device. This handler will then be 3554 * called from __netif_receive_skb. A negative errno code is returned 3555 * on a failure. 3556 * 3557 * The caller must hold the rtnl_mutex. 3558 * 3559 * For a general description of rx_handler, see enum rx_handler_result. 3560 */ 3561 int netdev_rx_handler_register(struct net_device *dev, 3562 rx_handler_func_t *rx_handler, 3563 void *rx_handler_data) 3564 { 3565 ASSERT_RTNL(); 3566 3567 if (dev->rx_handler) 3568 return -EBUSY; 3569 3570 /* Note: rx_handler_data must be set before rx_handler */ 3571 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); 3572 rcu_assign_pointer(dev->rx_handler, rx_handler); 3573 3574 return 0; 3575 } 3576 EXPORT_SYMBOL_GPL(netdev_rx_handler_register); 3577 3578 /** 3579 * netdev_rx_handler_unregister - unregister receive handler 3580 * @dev: device to unregister a handler from 3581 * 3582 * Unregister a receive handler from a device. 3583 * 3584 * The caller must hold the rtnl_mutex. 3585 */ 3586 void netdev_rx_handler_unregister(struct net_device *dev) 3587 { 3588 3589 ASSERT_RTNL(); 3590 RCU_INIT_POINTER(dev->rx_handler, NULL); 3591 /* a reader seeing a non NULL rx_handler in a rcu_read_lock() 3592 * section has a guarantee to see a non NULL rx_handler_data 3593 * as well. 3594 */ 3595 synchronize_net(); 3596 RCU_INIT_POINTER(dev->rx_handler_data, NULL); 3597 } 3598 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); 3599 3600 /* 3601 * Limit the use of PFMEMALLOC reserves to those protocols that implement 3602 * the special handling of PFMEMALLOC skbs. 
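 *
 * Illustrative note (editor's addition): receivers opt in per socket, e.g.
 * swap-over-NFS/NBD style users call
 *
 *	sk_set_memalloc(sk);
 *
 * to set SOCK_MEMALLOC; PFMEMALLOC skbs carrying protocols outside the list
 * below are dropped in __netif_receive_skb_core().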
3603 */ 3604 static bool skb_pfmemalloc_protocol(struct sk_buff *skb) 3605 { 3606 switch (skb->protocol) { 3607 case htons(ETH_P_ARP): 3608 case htons(ETH_P_IP): 3609 case htons(ETH_P_IPV6): 3610 case htons(ETH_P_8021Q): 3611 case htons(ETH_P_8021AD): 3612 return true; 3613 default: 3614 return false; 3615 } 3616 } 3617 3618 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) 3619 { 3620 struct packet_type *ptype, *pt_prev; 3621 rx_handler_func_t *rx_handler; 3622 struct net_device *orig_dev; 3623 bool deliver_exact = false; 3624 int ret = NET_RX_DROP; 3625 __be16 type; 3626 3627 net_timestamp_check(!netdev_tstamp_prequeue, skb); 3628 3629 trace_netif_receive_skb(skb); 3630 3631 orig_dev = skb->dev; 3632 3633 skb_reset_network_header(skb); 3634 if (!skb_transport_header_was_set(skb)) 3635 skb_reset_transport_header(skb); 3636 skb_reset_mac_len(skb); 3637 3638 pt_prev = NULL; 3639 3640 rcu_read_lock(); 3641 3642 another_round: 3643 skb->skb_iif = skb->dev->ifindex; 3644 3645 __this_cpu_inc(softnet_data.processed); 3646 3647 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || 3648 skb->protocol == cpu_to_be16(ETH_P_8021AD)) { 3649 skb = skb_vlan_untag(skb); 3650 if (unlikely(!skb)) 3651 goto unlock; 3652 } 3653 3654 #ifdef CONFIG_NET_CLS_ACT 3655 if (skb->tc_verd & TC_NCLS) { 3656 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); 3657 goto ncls; 3658 } 3659 #endif 3660 3661 if (pfmemalloc) 3662 goto skip_taps; 3663 3664 list_for_each_entry_rcu(ptype, &ptype_all, list) { 3665 if (pt_prev) 3666 ret = deliver_skb(skb, pt_prev, orig_dev); 3667 pt_prev = ptype; 3668 } 3669 3670 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { 3671 if (pt_prev) 3672 ret = deliver_skb(skb, pt_prev, orig_dev); 3673 pt_prev = ptype; 3674 } 3675 3676 skip_taps: 3677 #ifdef CONFIG_NET_CLS_ACT 3678 skb = handle_ing(skb, &pt_prev, &ret, orig_dev); 3679 if (!skb) 3680 goto unlock; 3681 ncls: 3682 #endif 3683 3684 if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) 3685 goto drop; 3686 3687 if (skb_vlan_tag_present(skb)) { 3688 if (pt_prev) { 3689 ret = deliver_skb(skb, pt_prev, orig_dev); 3690 pt_prev = NULL; 3691 } 3692 if (vlan_do_receive(&skb)) 3693 goto another_round; 3694 else if (unlikely(!skb)) 3695 goto unlock; 3696 } 3697 3698 rx_handler = rcu_dereference(skb->dev->rx_handler); 3699 if (rx_handler) { 3700 if (pt_prev) { 3701 ret = deliver_skb(skb, pt_prev, orig_dev); 3702 pt_prev = NULL; 3703 } 3704 switch (rx_handler(&skb)) { 3705 case RX_HANDLER_CONSUMED: 3706 ret = NET_RX_SUCCESS; 3707 goto unlock; 3708 case RX_HANDLER_ANOTHER: 3709 goto another_round; 3710 case RX_HANDLER_EXACT: 3711 deliver_exact = true; 3712 case RX_HANDLER_PASS: 3713 break; 3714 default: 3715 BUG(); 3716 } 3717 } 3718 3719 if (unlikely(skb_vlan_tag_present(skb))) { 3720 if (skb_vlan_tag_get_id(skb)) 3721 skb->pkt_type = PACKET_OTHERHOST; 3722 /* Note: we might in the future use prio bits 3723 * and set skb->priority like in vlan_do_receive() 3724 * For the time being, just ignore Priority Code Point 3725 */ 3726 skb->vlan_tci = 0; 3727 } 3728 3729 type = skb->protocol; 3730 3731 /* deliver only exact match when indicated */ 3732 if (likely(!deliver_exact)) { 3733 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 3734 &ptype_base[ntohs(type) & 3735 PTYPE_HASH_MASK]); 3736 } 3737 3738 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 3739 &orig_dev->ptype_specific); 3740 3741 if (unlikely(skb->dev != orig_dev)) { 3742 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 3743 &skb->dev->ptype_specific); 3744 } 
3745 3746 if (pt_prev) { 3747 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) 3748 goto drop; 3749 else 3750 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 3751 } else { 3752 drop: 3753 atomic_long_inc(&skb->dev->rx_dropped); 3754 kfree_skb(skb); 3755 /* Jamal, now you will not able to escape explaining 3756 * me how you were going to use this. :-) 3757 */ 3758 ret = NET_RX_DROP; 3759 } 3760 3761 unlock: 3762 rcu_read_unlock(); 3763 return ret; 3764 } 3765 3766 static int __netif_receive_skb(struct sk_buff *skb) 3767 { 3768 int ret; 3769 3770 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { 3771 unsigned long pflags = current->flags; 3772 3773 /* 3774 * PFMEMALLOC skbs are special, they should 3775 * - be delivered to SOCK_MEMALLOC sockets only 3776 * - stay away from userspace 3777 * - have bounded memory usage 3778 * 3779 * Use PF_MEMALLOC as this saves us from propagating the allocation 3780 * context down to all allocation sites. 3781 */ 3782 current->flags |= PF_MEMALLOC; 3783 ret = __netif_receive_skb_core(skb, true); 3784 tsk_restore_flags(current, pflags, PF_MEMALLOC); 3785 } else 3786 ret = __netif_receive_skb_core(skb, false); 3787 3788 return ret; 3789 } 3790 3791 static int netif_receive_skb_internal(struct sk_buff *skb) 3792 { 3793 net_timestamp_check(netdev_tstamp_prequeue, skb); 3794 3795 if (skb_defer_rx_timestamp(skb)) 3796 return NET_RX_SUCCESS; 3797 3798 #ifdef CONFIG_RPS 3799 if (static_key_false(&rps_needed)) { 3800 struct rps_dev_flow voidflow, *rflow = &voidflow; 3801 int cpu, ret; 3802 3803 rcu_read_lock(); 3804 3805 cpu = get_rps_cpu(skb->dev, skb, &rflow); 3806 3807 if (cpu >= 0) { 3808 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 3809 rcu_read_unlock(); 3810 return ret; 3811 } 3812 rcu_read_unlock(); 3813 } 3814 #endif 3815 return __netif_receive_skb(skb); 3816 } 3817 3818 /** 3819 * netif_receive_skb - process receive buffer from network 3820 * @skb: buffer to process 3821 * 3822 * netif_receive_skb() is the main receive data processing function. 3823 * It always succeeds. The buffer may be dropped during processing 3824 * for congestion control or by the protocol layers. 3825 * 3826 * This function may only be called from softirq context and interrupts 3827 * should be enabled. 3828 * 3829 * Return values (usually ignored): 3830 * NET_RX_SUCCESS: no congestion 3831 * NET_RX_DROP: packet was dropped 3832 */ 3833 int netif_receive_skb(struct sk_buff *skb) 3834 { 3835 trace_netif_receive_skb_entry(skb); 3836 3837 return netif_receive_skb_internal(skb); 3838 } 3839 EXPORT_SYMBOL(netif_receive_skb); 3840 3841 /* Network device is going away, flush any packets still pending 3842 * Called with irqs disabled. 
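 *
 * Illustrative note (editor's addition): in this version the unregister path
 * is believed to drive this per-CPU, roughly as
 *
 *	on_each_cpu(flush_backlog, dev, true);
 *
 * which is why the handler may assume interrupts are off.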
3843 */ 3844 static void flush_backlog(void *arg) 3845 { 3846 struct net_device *dev = arg; 3847 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 3848 struct sk_buff *skb, *tmp; 3849 3850 rps_lock(sd); 3851 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { 3852 if (skb->dev == dev) { 3853 __skb_unlink(skb, &sd->input_pkt_queue); 3854 kfree_skb(skb); 3855 input_queue_head_incr(sd); 3856 } 3857 } 3858 rps_unlock(sd); 3859 3860 skb_queue_walk_safe(&sd->process_queue, skb, tmp) { 3861 if (skb->dev == dev) { 3862 __skb_unlink(skb, &sd->process_queue); 3863 kfree_skb(skb); 3864 input_queue_head_incr(sd); 3865 } 3866 } 3867 } 3868 3869 static int napi_gro_complete(struct sk_buff *skb) 3870 { 3871 struct packet_offload *ptype; 3872 __be16 type = skb->protocol; 3873 struct list_head *head = &offload_base; 3874 int err = -ENOENT; 3875 3876 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); 3877 3878 if (NAPI_GRO_CB(skb)->count == 1) { 3879 skb_shinfo(skb)->gso_size = 0; 3880 goto out; 3881 } 3882 3883 rcu_read_lock(); 3884 list_for_each_entry_rcu(ptype, head, list) { 3885 if (ptype->type != type || !ptype->callbacks.gro_complete) 3886 continue; 3887 3888 err = ptype->callbacks.gro_complete(skb, 0); 3889 break; 3890 } 3891 rcu_read_unlock(); 3892 3893 if (err) { 3894 WARN_ON(&ptype->list == head); 3895 kfree_skb(skb); 3896 return NET_RX_SUCCESS; 3897 } 3898 3899 out: 3900 return netif_receive_skb_internal(skb); 3901 } 3902 3903 /* napi->gro_list contains packets ordered by age. 3904 * youngest packets at the head of it. 3905 * Complete skbs in reverse order to reduce latencies. 3906 */ 3907 void napi_gro_flush(struct napi_struct *napi, bool flush_old) 3908 { 3909 struct sk_buff *skb, *prev = NULL; 3910 3911 /* scan list and build reverse chain */ 3912 for (skb = napi->gro_list; skb != NULL; skb = skb->next) { 3913 skb->prev = prev; 3914 prev = skb; 3915 } 3916 3917 for (skb = prev; skb; skb = prev) { 3918 skb->next = NULL; 3919 3920 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) 3921 return; 3922 3923 prev = skb->prev; 3924 napi_gro_complete(skb); 3925 napi->gro_count--; 3926 } 3927 3928 napi->gro_list = NULL; 3929 } 3930 EXPORT_SYMBOL(napi_gro_flush); 3931 3932 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) 3933 { 3934 struct sk_buff *p; 3935 unsigned int maclen = skb->dev->hard_header_len; 3936 u32 hash = skb_get_hash_raw(skb); 3937 3938 for (p = napi->gro_list; p; p = p->next) { 3939 unsigned long diffs; 3940 3941 NAPI_GRO_CB(p)->flush = 0; 3942 3943 if (hash != skb_get_hash_raw(p)) { 3944 NAPI_GRO_CB(p)->same_flow = 0; 3945 continue; 3946 } 3947 3948 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; 3949 diffs |= p->vlan_tci ^ skb->vlan_tci; 3950 if (maclen == ETH_HLEN) 3951 diffs |= compare_ether_header(skb_mac_header(p), 3952 skb_mac_header(skb)); 3953 else if (!diffs) 3954 diffs = memcmp(skb_mac_header(p), 3955 skb_mac_header(skb), 3956 maclen); 3957 NAPI_GRO_CB(p)->same_flow = !diffs; 3958 } 3959 } 3960 3961 static void skb_gro_reset_offset(struct sk_buff *skb) 3962 { 3963 const struct skb_shared_info *pinfo = skb_shinfo(skb); 3964 const skb_frag_t *frag0 = &pinfo->frags[0]; 3965 3966 NAPI_GRO_CB(skb)->data_offset = 0; 3967 NAPI_GRO_CB(skb)->frag0 = NULL; 3968 NAPI_GRO_CB(skb)->frag0_len = 0; 3969 3970 if (skb_mac_header(skb) == skb_tail_pointer(skb) && 3971 pinfo->nr_frags && 3972 !PageHighMem(skb_frag_page(frag0))) { 3973 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); 3974 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); 3975 
} 3976 } 3977 3978 static void gro_pull_from_frag0(struct sk_buff *skb, int grow) 3979 { 3980 struct skb_shared_info *pinfo = skb_shinfo(skb); 3981 3982 BUG_ON(skb->end - skb->tail < grow); 3983 3984 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); 3985 3986 skb->data_len -= grow; 3987 skb->tail += grow; 3988 3989 pinfo->frags[0].page_offset += grow; 3990 skb_frag_size_sub(&pinfo->frags[0], grow); 3991 3992 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { 3993 skb_frag_unref(skb, 0); 3994 memmove(pinfo->frags, pinfo->frags + 1, 3995 --pinfo->nr_frags * sizeof(pinfo->frags[0])); 3996 } 3997 } 3998 3999 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 4000 { 4001 struct sk_buff **pp = NULL; 4002 struct packet_offload *ptype; 4003 __be16 type = skb->protocol; 4004 struct list_head *head = &offload_base; 4005 int same_flow; 4006 enum gro_result ret; 4007 int grow; 4008 4009 if (!(skb->dev->features & NETIF_F_GRO)) 4010 goto normal; 4011 4012 if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad) 4013 goto normal; 4014 4015 gro_list_prepare(napi, skb); 4016 4017 rcu_read_lock(); 4018 list_for_each_entry_rcu(ptype, head, list) { 4019 if (ptype->type != type || !ptype->callbacks.gro_receive) 4020 continue; 4021 4022 skb_set_network_header(skb, skb_gro_offset(skb)); 4023 skb_reset_mac_len(skb); 4024 NAPI_GRO_CB(skb)->same_flow = 0; 4025 NAPI_GRO_CB(skb)->flush = 0; 4026 NAPI_GRO_CB(skb)->free = 0; 4027 NAPI_GRO_CB(skb)->udp_mark = 0; 4028 NAPI_GRO_CB(skb)->gro_remcsum_start = 0; 4029 4030 /* Setup for GRO checksum validation */ 4031 switch (skb->ip_summed) { 4032 case CHECKSUM_COMPLETE: 4033 NAPI_GRO_CB(skb)->csum = skb->csum; 4034 NAPI_GRO_CB(skb)->csum_valid = 1; 4035 NAPI_GRO_CB(skb)->csum_cnt = 0; 4036 break; 4037 case CHECKSUM_UNNECESSARY: 4038 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; 4039 NAPI_GRO_CB(skb)->csum_valid = 0; 4040 break; 4041 default: 4042 NAPI_GRO_CB(skb)->csum_cnt = 0; 4043 NAPI_GRO_CB(skb)->csum_valid = 0; 4044 } 4045 4046 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); 4047 break; 4048 } 4049 rcu_read_unlock(); 4050 4051 if (&ptype->list == head) 4052 goto normal; 4053 4054 same_flow = NAPI_GRO_CB(skb)->same_flow; 4055 ret = NAPI_GRO_CB(skb)->free ? 
GRO_MERGED_FREE : GRO_MERGED; 4056 4057 if (pp) { 4058 struct sk_buff *nskb = *pp; 4059 4060 *pp = nskb->next; 4061 nskb->next = NULL; 4062 napi_gro_complete(nskb); 4063 napi->gro_count--; 4064 } 4065 4066 if (same_flow) 4067 goto ok; 4068 4069 if (NAPI_GRO_CB(skb)->flush) 4070 goto normal; 4071 4072 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { 4073 struct sk_buff *nskb = napi->gro_list; 4074 4075 /* locate the end of the list to select the 'oldest' flow */ 4076 while (nskb->next) { 4077 pp = &nskb->next; 4078 nskb = *pp; 4079 } 4080 *pp = NULL; 4081 nskb->next = NULL; 4082 napi_gro_complete(nskb); 4083 } else { 4084 napi->gro_count++; 4085 } 4086 NAPI_GRO_CB(skb)->count = 1; 4087 NAPI_GRO_CB(skb)->age = jiffies; 4088 NAPI_GRO_CB(skb)->last = skb; 4089 skb_shinfo(skb)->gso_size = skb_gro_len(skb); 4090 skb->next = napi->gro_list; 4091 napi->gro_list = skb; 4092 ret = GRO_HELD; 4093 4094 pull: 4095 grow = skb_gro_offset(skb) - skb_headlen(skb); 4096 if (grow > 0) 4097 gro_pull_from_frag0(skb, grow); 4098 ok: 4099 return ret; 4100 4101 normal: 4102 ret = GRO_NORMAL; 4103 goto pull; 4104 } 4105 4106 struct packet_offload *gro_find_receive_by_type(__be16 type) 4107 { 4108 struct list_head *offload_head = &offload_base; 4109 struct packet_offload *ptype; 4110 4111 list_for_each_entry_rcu(ptype, offload_head, list) { 4112 if (ptype->type != type || !ptype->callbacks.gro_receive) 4113 continue; 4114 return ptype; 4115 } 4116 return NULL; 4117 } 4118 EXPORT_SYMBOL(gro_find_receive_by_type); 4119 4120 struct packet_offload *gro_find_complete_by_type(__be16 type) 4121 { 4122 struct list_head *offload_head = &offload_base; 4123 struct packet_offload *ptype; 4124 4125 list_for_each_entry_rcu(ptype, offload_head, list) { 4126 if (ptype->type != type || !ptype->callbacks.gro_complete) 4127 continue; 4128 return ptype; 4129 } 4130 return NULL; 4131 } 4132 EXPORT_SYMBOL(gro_find_complete_by_type); 4133 4134 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) 4135 { 4136 switch (ret) { 4137 case GRO_NORMAL: 4138 if (netif_receive_skb_internal(skb)) 4139 ret = GRO_DROP; 4140 break; 4141 4142 case GRO_DROP: 4143 kfree_skb(skb); 4144 break; 4145 4146 case GRO_MERGED_FREE: 4147 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) 4148 kmem_cache_free(skbuff_head_cache, skb); 4149 else 4150 __kfree_skb(skb); 4151 break; 4152 4153 case GRO_HELD: 4154 case GRO_MERGED: 4155 break; 4156 } 4157 4158 return ret; 4159 } 4160 4161 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 4162 { 4163 trace_napi_gro_receive_entry(skb); 4164 4165 skb_gro_reset_offset(skb); 4166 4167 return napi_skb_finish(dev_gro_receive(napi, skb), skb); 4168 } 4169 EXPORT_SYMBOL(napi_gro_receive); 4170 4171 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) 4172 { 4173 if (unlikely(skb->pfmemalloc)) { 4174 consume_skb(skb); 4175 return; 4176 } 4177 __skb_pull(skb, skb_headlen(skb)); 4178 /* restore the reserve we had after netdev_alloc_skb_ip_align() */ 4179 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); 4180 skb->vlan_tci = 0; 4181 skb->dev = napi->dev; 4182 skb->skb_iif = 0; 4183 skb->encapsulation = 0; 4184 skb_shinfo(skb)->gso_type = 0; 4185 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); 4186 4187 napi->skb = skb; 4188 } 4189 4190 struct sk_buff *napi_get_frags(struct napi_struct *napi) 4191 { 4192 struct sk_buff *skb = napi->skb; 4193 4194 if (!skb) { 4195 skb = napi_alloc_skb(napi, GRO_MAX_HEAD); 4196 napi->skb = skb; 4197 } 4198 return 
skb; 4199 } 4200 EXPORT_SYMBOL(napi_get_frags); 4201 4202 static gro_result_t napi_frags_finish(struct napi_struct *napi, 4203 struct sk_buff *skb, 4204 gro_result_t ret) 4205 { 4206 switch (ret) { 4207 case GRO_NORMAL: 4208 case GRO_HELD: 4209 __skb_push(skb, ETH_HLEN); 4210 skb->protocol = eth_type_trans(skb, skb->dev); 4211 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb)) 4212 ret = GRO_DROP; 4213 break; 4214 4215 case GRO_DROP: 4216 case GRO_MERGED_FREE: 4217 napi_reuse_skb(napi, skb); 4218 break; 4219 4220 case GRO_MERGED: 4221 break; 4222 } 4223 4224 return ret; 4225 } 4226 4227 /* Upper GRO stack assumes network header starts at gro_offset=0 4228 * Drivers could call both napi_gro_frags() and napi_gro_receive() 4229 * We copy ethernet header into skb->data to have a common layout. 4230 */ 4231 static struct sk_buff *napi_frags_skb(struct napi_struct *napi) 4232 { 4233 struct sk_buff *skb = napi->skb; 4234 const struct ethhdr *eth; 4235 unsigned int hlen = sizeof(*eth); 4236 4237 napi->skb = NULL; 4238 4239 skb_reset_mac_header(skb); 4240 skb_gro_reset_offset(skb); 4241 4242 eth = skb_gro_header_fast(skb, 0); 4243 if (unlikely(skb_gro_header_hard(skb, hlen))) { 4244 eth = skb_gro_header_slow(skb, hlen, 0); 4245 if (unlikely(!eth)) { 4246 napi_reuse_skb(napi, skb); 4247 return NULL; 4248 } 4249 } else { 4250 gro_pull_from_frag0(skb, hlen); 4251 NAPI_GRO_CB(skb)->frag0 += hlen; 4252 NAPI_GRO_CB(skb)->frag0_len -= hlen; 4253 } 4254 __skb_pull(skb, hlen); 4255 4256 /* 4257 * This works because the only protocols we care about don't require 4258 * special handling. 4259 * We'll fix it up properly in napi_frags_finish() 4260 */ 4261 skb->protocol = eth->h_proto; 4262 4263 return skb; 4264 } 4265 4266 gro_result_t napi_gro_frags(struct napi_struct *napi) 4267 { 4268 struct sk_buff *skb = napi_frags_skb(napi); 4269 4270 if (!skb) 4271 return GRO_DROP; 4272 4273 trace_napi_gro_frags_entry(skb); 4274 4275 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); 4276 } 4277 EXPORT_SYMBOL(napi_gro_frags); 4278 4279 /* Compute the checksum from gro_offset and return the folded value 4280 * after adding in any pseudo checksum. 4281 */ 4282 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) 4283 { 4284 __wsum wsum; 4285 __sum16 sum; 4286 4287 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); 4288 4289 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ 4290 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); 4291 if (likely(!sum)) { 4292 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 4293 !skb->csum_complete_sw) 4294 netdev_rx_csum_fault(skb->dev); 4295 } 4296 4297 NAPI_GRO_CB(skb)->csum = wsum; 4298 NAPI_GRO_CB(skb)->csum_valid = 1; 4299 4300 return sum; 4301 } 4302 EXPORT_SYMBOL(__skb_gro_checksum_complete); 4303 4304 /* 4305 * net_rps_action_and_irq_enable sends any pending IPI's for rps. 4306 * Note: called with local irq disabled, but exits with local irq enabled. 4307 */ 4308 static void net_rps_action_and_irq_enable(struct softnet_data *sd) 4309 { 4310 #ifdef CONFIG_RPS 4311 struct softnet_data *remsd = sd->rps_ipi_list; 4312 4313 if (remsd) { 4314 sd->rps_ipi_list = NULL; 4315 4316 local_irq_enable(); 4317 4318 /* Send pending IPI's to kick RPS processing on remote cpus. 
*/ 4319 while (remsd) { 4320 struct softnet_data *next = remsd->rps_ipi_next; 4321 4322 if (cpu_online(remsd->cpu)) 4323 smp_call_function_single_async(remsd->cpu, 4324 &remsd->csd); 4325 remsd = next; 4326 } 4327 } else 4328 #endif 4329 local_irq_enable(); 4330 } 4331 4332 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) 4333 { 4334 #ifdef CONFIG_RPS 4335 return sd->rps_ipi_list != NULL; 4336 #else 4337 return false; 4338 #endif 4339 } 4340 4341 static int process_backlog(struct napi_struct *napi, int quota) 4342 { 4343 int work = 0; 4344 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); 4345 4346 /* Check if we have pending ipi, its better to send them now, 4347 * not waiting net_rx_action() end. 4348 */ 4349 if (sd_has_rps_ipi_waiting(sd)) { 4350 local_irq_disable(); 4351 net_rps_action_and_irq_enable(sd); 4352 } 4353 4354 napi->weight = weight_p; 4355 local_irq_disable(); 4356 while (1) { 4357 struct sk_buff *skb; 4358 4359 while ((skb = __skb_dequeue(&sd->process_queue))) { 4360 local_irq_enable(); 4361 __netif_receive_skb(skb); 4362 local_irq_disable(); 4363 input_queue_head_incr(sd); 4364 if (++work >= quota) { 4365 local_irq_enable(); 4366 return work; 4367 } 4368 } 4369 4370 rps_lock(sd); 4371 if (skb_queue_empty(&sd->input_pkt_queue)) { 4372 /* 4373 * Inline a custom version of __napi_complete(). 4374 * only current cpu owns and manipulates this napi, 4375 * and NAPI_STATE_SCHED is the only possible flag set 4376 * on backlog. 4377 * We can use a plain write instead of clear_bit(), 4378 * and we dont need an smp_mb() memory barrier. 4379 */ 4380 napi->state = 0; 4381 rps_unlock(sd); 4382 4383 break; 4384 } 4385 4386 skb_queue_splice_tail_init(&sd->input_pkt_queue, 4387 &sd->process_queue); 4388 rps_unlock(sd); 4389 } 4390 local_irq_enable(); 4391 4392 return work; 4393 } 4394 4395 /** 4396 * __napi_schedule - schedule for receive 4397 * @n: entry to schedule 4398 * 4399 * The entry's receive function will be scheduled to run. 4400 * Consider using __napi_schedule_irqoff() if hard irqs are masked. 
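 *
 * For example, a (hypothetical) driver's RX interrupt handler usually does
 * no more than this sketch before deferring the real work to its poll
 * callback ("foo_mask_rx_irq" is an assumed hardware helper):
 *
 *	static irqreturn_t foo_rx_irq(int irq, void *data)
 *	{
 *		struct foo_priv *priv = data;
 *
 *		foo_mask_rx_irq(priv);
 *		napi_schedule(&priv->napi);
 *		return IRQ_HANDLED;
 *	}
 *
 * napi_schedule() tests NAPI_STATE_SCHED and then ends up here; when the
 * handler already runs with hard irqs masked, __napi_schedule_irqoff()
 * below avoids the redundant local_irq_save()/local_irq_restore() pair.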
4401 */ 4402 void __napi_schedule(struct napi_struct *n) 4403 { 4404 unsigned long flags; 4405 4406 local_irq_save(flags); 4407 ____napi_schedule(this_cpu_ptr(&softnet_data), n); 4408 local_irq_restore(flags); 4409 } 4410 EXPORT_SYMBOL(__napi_schedule); 4411 4412 /** 4413 * __napi_schedule_irqoff - schedule for receive 4414 * @n: entry to schedule 4415 * 4416 * Variant of __napi_schedule() assuming hard irqs are masked 4417 */ 4418 void __napi_schedule_irqoff(struct napi_struct *n) 4419 { 4420 ____napi_schedule(this_cpu_ptr(&softnet_data), n); 4421 } 4422 EXPORT_SYMBOL(__napi_schedule_irqoff); 4423 4424 void __napi_complete(struct napi_struct *n) 4425 { 4426 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); 4427 4428 list_del_init(&n->poll_list); 4429 smp_mb__before_atomic(); 4430 clear_bit(NAPI_STATE_SCHED, &n->state); 4431 } 4432 EXPORT_SYMBOL(__napi_complete); 4433 4434 void napi_complete_done(struct napi_struct *n, int work_done) 4435 { 4436 unsigned long flags; 4437 4438 /* 4439 * don't let napi dequeue from the cpu poll list 4440 * just in case its running on a different cpu 4441 */ 4442 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) 4443 return; 4444 4445 if (n->gro_list) { 4446 unsigned long timeout = 0; 4447 4448 if (work_done) 4449 timeout = n->dev->gro_flush_timeout; 4450 4451 if (timeout) 4452 hrtimer_start(&n->timer, ns_to_ktime(timeout), 4453 HRTIMER_MODE_REL_PINNED); 4454 else 4455 napi_gro_flush(n, false); 4456 } 4457 if (likely(list_empty(&n->poll_list))) { 4458 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); 4459 } else { 4460 /* If n->poll_list is not empty, we need to mask irqs */ 4461 local_irq_save(flags); 4462 __napi_complete(n); 4463 local_irq_restore(flags); 4464 } 4465 } 4466 EXPORT_SYMBOL(napi_complete_done); 4467 4468 /* must be called under rcu_read_lock(), as we dont take a reference */ 4469 struct napi_struct *napi_by_id(unsigned int napi_id) 4470 { 4471 unsigned int hash = napi_id % HASH_SIZE(napi_hash); 4472 struct napi_struct *napi; 4473 4474 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) 4475 if (napi->napi_id == napi_id) 4476 return napi; 4477 4478 return NULL; 4479 } 4480 EXPORT_SYMBOL_GPL(napi_by_id); 4481 4482 void napi_hash_add(struct napi_struct *napi) 4483 { 4484 if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) { 4485 4486 spin_lock(&napi_hash_lock); 4487 4488 /* 0 is not a valid id, we also skip an id that is taken 4489 * we expect both events to be extremely rare 4490 */ 4491 napi->napi_id = 0; 4492 while (!napi->napi_id) { 4493 napi->napi_id = ++napi_gen_id; 4494 if (napi_by_id(napi->napi_id)) 4495 napi->napi_id = 0; 4496 } 4497 4498 hlist_add_head_rcu(&napi->napi_hash_node, 4499 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); 4500 4501 spin_unlock(&napi_hash_lock); 4502 } 4503 } 4504 EXPORT_SYMBOL_GPL(napi_hash_add); 4505 4506 /* Warning : caller is responsible to make sure rcu grace period 4507 * is respected before freeing memory containing @napi 4508 */ 4509 void napi_hash_del(struct napi_struct *napi) 4510 { 4511 spin_lock(&napi_hash_lock); 4512 4513 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) 4514 hlist_del_rcu(&napi->napi_hash_node); 4515 4516 spin_unlock(&napi_hash_lock); 4517 } 4518 EXPORT_SYMBOL_GPL(napi_hash_del); 4519 4520 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) 4521 { 4522 struct napi_struct *napi; 4523 4524 napi = container_of(timer, struct napi_struct, timer); 4525 if (napi->gro_list) 4526 napi_schedule(napi); 4527 4528 return HRTIMER_NORESTART; 
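/*
 * A minimal sketch (hypothetical "foo" driver, not part of this file) of the
 * usual NAPI pattern built on the helpers in this file: register the context
 * with netif_napi_add(), feed received packets to GRO from the poll callback,
 * and complete only when the budget was not exhausted. foo_rx_next() and
 * foo_enable_rx_irq() are assumed driver helpers.
 *
 *	static int foo_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct foo_priv *priv = container_of(napi, struct foo_priv, napi);
 *		int work = 0;
 *
 *		while (work < budget) {
 *			struct sk_buff *skb = foo_rx_next(priv);
 *
 *			if (!skb)
 *				break;
 *			napi_gro_receive(napi, skb);
 *			work++;
 *		}
 *		if (work < budget) {
 *			napi_complete_done(napi, work);
 *			foo_enable_rx_irq(priv);
 *		}
 *		return work;
 *	}
 *
 *	netif_napi_add(netdev, &priv->napi, foo_poll, NAPI_POLL_WEIGHT);
 */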
4529 } 4530 4531 void netif_napi_add(struct net_device *dev, struct napi_struct *napi, 4532 int (*poll)(struct napi_struct *, int), int weight) 4533 { 4534 INIT_LIST_HEAD(&napi->poll_list); 4535 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); 4536 napi->timer.function = napi_watchdog; 4537 napi->gro_count = 0; 4538 napi->gro_list = NULL; 4539 napi->skb = NULL; 4540 napi->poll = poll; 4541 if (weight > NAPI_POLL_WEIGHT) 4542 pr_err_once("netif_napi_add() called with weight %d on device %s\n", 4543 weight, dev->name); 4544 napi->weight = weight; 4545 list_add(&napi->dev_list, &dev->napi_list); 4546 napi->dev = dev; 4547 #ifdef CONFIG_NETPOLL 4548 spin_lock_init(&napi->poll_lock); 4549 napi->poll_owner = -1; 4550 #endif 4551 set_bit(NAPI_STATE_SCHED, &napi->state); 4552 } 4553 EXPORT_SYMBOL(netif_napi_add); 4554 4555 void napi_disable(struct napi_struct *n) 4556 { 4557 might_sleep(); 4558 set_bit(NAPI_STATE_DISABLE, &n->state); 4559 4560 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) 4561 msleep(1); 4562 4563 hrtimer_cancel(&n->timer); 4564 4565 clear_bit(NAPI_STATE_DISABLE, &n->state); 4566 } 4567 EXPORT_SYMBOL(napi_disable); 4568 4569 void netif_napi_del(struct napi_struct *napi) 4570 { 4571 list_del_init(&napi->dev_list); 4572 napi_free_frags(napi); 4573 4574 kfree_skb_list(napi->gro_list); 4575 napi->gro_list = NULL; 4576 napi->gro_count = 0; 4577 } 4578 EXPORT_SYMBOL(netif_napi_del); 4579 4580 static int napi_poll(struct napi_struct *n, struct list_head *repoll) 4581 { 4582 void *have; 4583 int work, weight; 4584 4585 list_del_init(&n->poll_list); 4586 4587 have = netpoll_poll_lock(n); 4588 4589 weight = n->weight; 4590 4591 /* This NAPI_STATE_SCHED test is for avoiding a race 4592 * with netpoll's poll_napi(). Only the entity which 4593 * obtains the lock and sees NAPI_STATE_SCHED set will 4594 * actually make the ->poll() call. Therefore we avoid 4595 * accidentally calling ->poll() when NAPI is not scheduled. 4596 */ 4597 work = 0; 4598 if (test_bit(NAPI_STATE_SCHED, &n->state)) { 4599 work = n->poll(n, weight); 4600 trace_napi_poll(n); 4601 } 4602 4603 WARN_ON_ONCE(work > weight); 4604 4605 if (likely(work < weight)) 4606 goto out_unlock; 4607 4608 /* Drivers must not modify the NAPI state if they 4609 * consume the entire weight. In such cases this code 4610 * still "owns" the NAPI instance and therefore can 4611 * move the instance around on the list at-will. 4612 */ 4613 if (unlikely(napi_disable_pending(n))) { 4614 napi_complete(n); 4615 goto out_unlock; 4616 } 4617 4618 if (n->gro_list) { 4619 /* flush too old packets 4620 * If HZ < 1000, flush all packets. 4621 */ 4622 napi_gro_flush(n, HZ >= 1000); 4623 } 4624 4625 /* Some drivers may have called napi_schedule 4626 * prior to exhausting their budget. 4627 */ 4628 if (unlikely(!list_empty(&n->poll_list))) { 4629 pr_warn_once("%s: Budget exhausted after napi rescheduled\n", 4630 n->dev ? 
n->dev->name : "backlog"); 4631 goto out_unlock; 4632 } 4633 4634 list_add_tail(&n->poll_list, repoll); 4635 4636 out_unlock: 4637 netpoll_poll_unlock(have); 4638 4639 return work; 4640 } 4641 4642 static void net_rx_action(struct softirq_action *h) 4643 { 4644 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 4645 unsigned long time_limit = jiffies + 2; 4646 int budget = netdev_budget; 4647 LIST_HEAD(list); 4648 LIST_HEAD(repoll); 4649 4650 local_irq_disable(); 4651 list_splice_init(&sd->poll_list, &list); 4652 local_irq_enable(); 4653 4654 for (;;) { 4655 struct napi_struct *n; 4656 4657 if (list_empty(&list)) { 4658 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) 4659 return; 4660 break; 4661 } 4662 4663 n = list_first_entry(&list, struct napi_struct, poll_list); 4664 budget -= napi_poll(n, &repoll); 4665 4666 /* If softirq window is exhausted then punt. 4667 * Allow this to run for 2 jiffies since which will allow 4668 * an average latency of 1.5/HZ. 4669 */ 4670 if (unlikely(budget <= 0 || 4671 time_after_eq(jiffies, time_limit))) { 4672 sd->time_squeeze++; 4673 break; 4674 } 4675 } 4676 4677 local_irq_disable(); 4678 4679 list_splice_tail_init(&sd->poll_list, &list); 4680 list_splice_tail(&repoll, &list); 4681 list_splice(&list, &sd->poll_list); 4682 if (!list_empty(&sd->poll_list)) 4683 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 4684 4685 net_rps_action_and_irq_enable(sd); 4686 } 4687 4688 struct netdev_adjacent { 4689 struct net_device *dev; 4690 4691 /* upper master flag, there can only be one master device per list */ 4692 bool master; 4693 4694 /* counter for the number of times this device was added to us */ 4695 u16 ref_nr; 4696 4697 /* private field for the users */ 4698 void *private; 4699 4700 struct list_head list; 4701 struct rcu_head rcu; 4702 }; 4703 4704 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, 4705 struct net_device *adj_dev, 4706 struct list_head *adj_list) 4707 { 4708 struct netdev_adjacent *adj; 4709 4710 list_for_each_entry(adj, adj_list, list) { 4711 if (adj->dev == adj_dev) 4712 return adj; 4713 } 4714 return NULL; 4715 } 4716 4717 /** 4718 * netdev_has_upper_dev - Check if device is linked to an upper device 4719 * @dev: device 4720 * @upper_dev: upper device to check 4721 * 4722 * Find out if a device is linked to specified upper device and return true 4723 * in case it is. Note that this checks only immediate upper device, 4724 * not through a complete stack of devices. The caller must hold the RTNL lock. 4725 */ 4726 bool netdev_has_upper_dev(struct net_device *dev, 4727 struct net_device *upper_dev) 4728 { 4729 ASSERT_RTNL(); 4730 4731 return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper); 4732 } 4733 EXPORT_SYMBOL(netdev_has_upper_dev); 4734 4735 /** 4736 * netdev_has_any_upper_dev - Check if device is linked to some device 4737 * @dev: device 4738 * 4739 * Find out if a device is linked to an upper device and return true in case 4740 * it is. The caller must hold the RTNL lock. 4741 */ 4742 static bool netdev_has_any_upper_dev(struct net_device *dev) 4743 { 4744 ASSERT_RTNL(); 4745 4746 return !list_empty(&dev->all_adj_list.upper); 4747 } 4748 4749 /** 4750 * netdev_master_upper_dev_get - Get master upper device 4751 * @dev: device 4752 * 4753 * Find a master upper device and return pointer to it or NULL in case 4754 * it's not there. The caller must hold the RTNL lock. 
4755 */ 4756 struct net_device *netdev_master_upper_dev_get(struct net_device *dev) 4757 { 4758 struct netdev_adjacent *upper; 4759 4760 ASSERT_RTNL(); 4761 4762 if (list_empty(&dev->adj_list.upper)) 4763 return NULL; 4764 4765 upper = list_first_entry(&dev->adj_list.upper, 4766 struct netdev_adjacent, list); 4767 if (likely(upper->master)) 4768 return upper->dev; 4769 return NULL; 4770 } 4771 EXPORT_SYMBOL(netdev_master_upper_dev_get); 4772 4773 void *netdev_adjacent_get_private(struct list_head *adj_list) 4774 { 4775 struct netdev_adjacent *adj; 4776 4777 adj = list_entry(adj_list, struct netdev_adjacent, list); 4778 4779 return adj->private; 4780 } 4781 EXPORT_SYMBOL(netdev_adjacent_get_private); 4782 4783 /** 4784 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list 4785 * @dev: device 4786 * @iter: list_head ** of the current position 4787 * 4788 * Gets the next device from the dev's upper list, starting from iter 4789 * position. The caller must hold RCU read lock. 4790 */ 4791 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, 4792 struct list_head **iter) 4793 { 4794 struct netdev_adjacent *upper; 4795 4796 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); 4797 4798 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 4799 4800 if (&upper->list == &dev->adj_list.upper) 4801 return NULL; 4802 4803 *iter = &upper->list; 4804 4805 return upper->dev; 4806 } 4807 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); 4808 4809 /** 4810 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list 4811 * @dev: device 4812 * @iter: list_head ** of the current position 4813 * 4814 * Gets the next device from the dev's upper list, starting from iter 4815 * position. The caller must hold RCU read lock. 4816 */ 4817 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, 4818 struct list_head **iter) 4819 { 4820 struct netdev_adjacent *upper; 4821 4822 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); 4823 4824 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 4825 4826 if (&upper->list == &dev->all_adj_list.upper) 4827 return NULL; 4828 4829 *iter = &upper->list; 4830 4831 return upper->dev; 4832 } 4833 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); 4834 4835 /** 4836 * netdev_lower_get_next_private - Get the next ->private from the 4837 * lower neighbour list 4838 * @dev: device 4839 * @iter: list_head ** of the current position 4840 * 4841 * Gets the next netdev_adjacent->private from the dev's lower neighbour 4842 * list, starting from iter position. The caller must hold either hold the 4843 * RTNL lock or its own locking that guarantees that the neighbour lower 4844 * list will remain unchainged. 4845 */ 4846 void *netdev_lower_get_next_private(struct net_device *dev, 4847 struct list_head **iter) 4848 { 4849 struct netdev_adjacent *lower; 4850 4851 lower = list_entry(*iter, struct netdev_adjacent, list); 4852 4853 if (&lower->list == &dev->adj_list.lower) 4854 return NULL; 4855 4856 *iter = lower->list.next; 4857 4858 return lower->private; 4859 } 4860 EXPORT_SYMBOL(netdev_lower_get_next_private); 4861 4862 /** 4863 * netdev_lower_get_next_private_rcu - Get the next ->private from the 4864 * lower neighbour list, RCU 4865 * variant 4866 * @dev: device 4867 * @iter: list_head ** of the current position 4868 * 4869 * Gets the next netdev_adjacent->private from the dev's lower neighbour 4870 * list, starting from iter position. The caller must hold RCU read lock. 
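 *
 * A minimal iteration sketch (the private pointers are whatever the upper
 * driver stored when it created the links; foo_handle_private() is an
 * assumed consumer):
 *
 *	struct list_head *iter = &dev->adj_list.lower;
 *	void *priv;
 *
 *	rcu_read_lock();
 *	while ((priv = netdev_lower_get_next_private_rcu(dev, &iter)))
 *		foo_handle_private(priv);
 *	rcu_read_unlock();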
4871 */ 4872 void *netdev_lower_get_next_private_rcu(struct net_device *dev, 4873 struct list_head **iter) 4874 { 4875 struct netdev_adjacent *lower; 4876 4877 WARN_ON_ONCE(!rcu_read_lock_held()); 4878 4879 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 4880 4881 if (&lower->list == &dev->adj_list.lower) 4882 return NULL; 4883 4884 *iter = &lower->list; 4885 4886 return lower->private; 4887 } 4888 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); 4889 4890 /** 4891 * netdev_lower_get_next - Get the next device from the lower neighbour 4892 * list 4893 * @dev: device 4894 * @iter: list_head ** of the current position 4895 * 4896 * Gets the next netdev_adjacent from the dev's lower neighbour 4897 * list, starting from iter position. The caller must hold RTNL lock or 4898 * its own locking that guarantees that the neighbour lower 4899 * list will remain unchainged. 4900 */ 4901 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) 4902 { 4903 struct netdev_adjacent *lower; 4904 4905 lower = list_entry((*iter)->next, struct netdev_adjacent, list); 4906 4907 if (&lower->list == &dev->adj_list.lower) 4908 return NULL; 4909 4910 *iter = &lower->list; 4911 4912 return lower->dev; 4913 } 4914 EXPORT_SYMBOL(netdev_lower_get_next); 4915 4916 /** 4917 * netdev_lower_get_first_private_rcu - Get the first ->private from the 4918 * lower neighbour list, RCU 4919 * variant 4920 * @dev: device 4921 * 4922 * Gets the first netdev_adjacent->private from the dev's lower neighbour 4923 * list. The caller must hold RCU read lock. 4924 */ 4925 void *netdev_lower_get_first_private_rcu(struct net_device *dev) 4926 { 4927 struct netdev_adjacent *lower; 4928 4929 lower = list_first_or_null_rcu(&dev->adj_list.lower, 4930 struct netdev_adjacent, list); 4931 if (lower) 4932 return lower->private; 4933 return NULL; 4934 } 4935 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); 4936 4937 /** 4938 * netdev_master_upper_dev_get_rcu - Get master upper device 4939 * @dev: device 4940 * 4941 * Find a master upper device and return pointer to it or NULL in case 4942 * it's not there. The caller must hold the RCU read lock. 4943 */ 4944 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) 4945 { 4946 struct netdev_adjacent *upper; 4947 4948 upper = list_first_or_null_rcu(&dev->adj_list.upper, 4949 struct netdev_adjacent, list); 4950 if (upper && likely(upper->master)) 4951 return upper->dev; 4952 return NULL; 4953 } 4954 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); 4955 4956 static int netdev_adjacent_sysfs_add(struct net_device *dev, 4957 struct net_device *adj_dev, 4958 struct list_head *dev_list) 4959 { 4960 char linkname[IFNAMSIZ+7]; 4961 sprintf(linkname, dev_list == &dev->adj_list.upper ? 4962 "upper_%s" : "lower_%s", adj_dev->name); 4963 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), 4964 linkname); 4965 } 4966 static void netdev_adjacent_sysfs_del(struct net_device *dev, 4967 char *name, 4968 struct list_head *dev_list) 4969 { 4970 char linkname[IFNAMSIZ+7]; 4971 sprintf(linkname, dev_list == &dev->adj_list.upper ? 
4972 "upper_%s" : "lower_%s", name); 4973 sysfs_remove_link(&(dev->dev.kobj), linkname); 4974 } 4975 4976 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, 4977 struct net_device *adj_dev, 4978 struct list_head *dev_list) 4979 { 4980 return (dev_list == &dev->adj_list.upper || 4981 dev_list == &dev->adj_list.lower) && 4982 net_eq(dev_net(dev), dev_net(adj_dev)); 4983 } 4984 4985 static int __netdev_adjacent_dev_insert(struct net_device *dev, 4986 struct net_device *adj_dev, 4987 struct list_head *dev_list, 4988 void *private, bool master) 4989 { 4990 struct netdev_adjacent *adj; 4991 int ret; 4992 4993 adj = __netdev_find_adj(dev, adj_dev, dev_list); 4994 4995 if (adj) { 4996 adj->ref_nr++; 4997 return 0; 4998 } 4999 5000 adj = kmalloc(sizeof(*adj), GFP_KERNEL); 5001 if (!adj) 5002 return -ENOMEM; 5003 5004 adj->dev = adj_dev; 5005 adj->master = master; 5006 adj->ref_nr = 1; 5007 adj->private = private; 5008 dev_hold(adj_dev); 5009 5010 pr_debug("dev_hold for %s, because of link added from %s to %s\n", 5011 adj_dev->name, dev->name, adj_dev->name); 5012 5013 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { 5014 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); 5015 if (ret) 5016 goto free_adj; 5017 } 5018 5019 /* Ensure that master link is always the first item in list. */ 5020 if (master) { 5021 ret = sysfs_create_link(&(dev->dev.kobj), 5022 &(adj_dev->dev.kobj), "master"); 5023 if (ret) 5024 goto remove_symlinks; 5025 5026 list_add_rcu(&adj->list, dev_list); 5027 } else { 5028 list_add_tail_rcu(&adj->list, dev_list); 5029 } 5030 5031 return 0; 5032 5033 remove_symlinks: 5034 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) 5035 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); 5036 free_adj: 5037 kfree(adj); 5038 dev_put(adj_dev); 5039 5040 return ret; 5041 } 5042 5043 static void __netdev_adjacent_dev_remove(struct net_device *dev, 5044 struct net_device *adj_dev, 5045 struct list_head *dev_list) 5046 { 5047 struct netdev_adjacent *adj; 5048 5049 adj = __netdev_find_adj(dev, adj_dev, dev_list); 5050 5051 if (!adj) { 5052 pr_err("tried to remove device %s from %s\n", 5053 dev->name, adj_dev->name); 5054 BUG(); 5055 } 5056 5057 if (adj->ref_nr > 1) { 5058 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name, 5059 adj->ref_nr-1); 5060 adj->ref_nr--; 5061 return; 5062 } 5063 5064 if (adj->master) 5065 sysfs_remove_link(&(dev->dev.kobj), "master"); 5066 5067 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) 5068 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); 5069 5070 list_del_rcu(&adj->list); 5071 pr_debug("dev_put for %s, because link removed from %s to %s\n", 5072 adj_dev->name, dev->name, adj_dev->name); 5073 dev_put(adj_dev); 5074 kfree_rcu(adj, rcu); 5075 } 5076 5077 static int __netdev_adjacent_dev_link_lists(struct net_device *dev, 5078 struct net_device *upper_dev, 5079 struct list_head *up_list, 5080 struct list_head *down_list, 5081 void *private, bool master) 5082 { 5083 int ret; 5084 5085 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, 5086 master); 5087 if (ret) 5088 return ret; 5089 5090 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, 5091 false); 5092 if (ret) { 5093 __netdev_adjacent_dev_remove(dev, upper_dev, up_list); 5094 return ret; 5095 } 5096 5097 return 0; 5098 } 5099 5100 static int __netdev_adjacent_dev_link(struct net_device *dev, 5101 struct net_device *upper_dev) 5102 { 5103 return __netdev_adjacent_dev_link_lists(dev, upper_dev, 5104 
&dev->all_adj_list.upper, 5105 &upper_dev->all_adj_list.lower, 5106 NULL, false); 5107 } 5108 5109 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, 5110 struct net_device *upper_dev, 5111 struct list_head *up_list, 5112 struct list_head *down_list) 5113 { 5114 __netdev_adjacent_dev_remove(dev, upper_dev, up_list); 5115 __netdev_adjacent_dev_remove(upper_dev, dev, down_list); 5116 } 5117 5118 static void __netdev_adjacent_dev_unlink(struct net_device *dev, 5119 struct net_device *upper_dev) 5120 { 5121 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 5122 &dev->all_adj_list.upper, 5123 &upper_dev->all_adj_list.lower); 5124 } 5125 5126 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, 5127 struct net_device *upper_dev, 5128 void *private, bool master) 5129 { 5130 int ret = __netdev_adjacent_dev_link(dev, upper_dev); 5131 5132 if (ret) 5133 return ret; 5134 5135 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 5136 &dev->adj_list.upper, 5137 &upper_dev->adj_list.lower, 5138 private, master); 5139 if (ret) { 5140 __netdev_adjacent_dev_unlink(dev, upper_dev); 5141 return ret; 5142 } 5143 5144 return 0; 5145 } 5146 5147 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, 5148 struct net_device *upper_dev) 5149 { 5150 __netdev_adjacent_dev_unlink(dev, upper_dev); 5151 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 5152 &dev->adj_list.upper, 5153 &upper_dev->adj_list.lower); 5154 } 5155 5156 static int __netdev_upper_dev_link(struct net_device *dev, 5157 struct net_device *upper_dev, bool master, 5158 void *private) 5159 { 5160 struct netdev_adjacent *i, *j, *to_i, *to_j; 5161 int ret = 0; 5162 5163 ASSERT_RTNL(); 5164 5165 if (dev == upper_dev) 5166 return -EBUSY; 5167 5168 /* To prevent loops, check if dev is not upper device to upper_dev. */ 5169 if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper)) 5170 return -EBUSY; 5171 5172 if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper)) 5173 return -EEXIST; 5174 5175 if (master && netdev_master_upper_dev_get(dev)) 5176 return -EBUSY; 5177 5178 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, 5179 master); 5180 if (ret) 5181 return ret; 5182 5183 /* Now that we linked these devs, make all the upper_dev's 5184 * all_adj_list.upper visible to every dev's all_adj_list.lower an 5185 * versa, and don't forget the devices itself. All of these 5186 * links are non-neighbours. 
5187 */ 5188 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5189 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { 5190 pr_debug("Interlinking %s with %s, non-neighbour\n", 5191 i->dev->name, j->dev->name); 5192 ret = __netdev_adjacent_dev_link(i->dev, j->dev); 5193 if (ret) 5194 goto rollback_mesh; 5195 } 5196 } 5197 5198 /* add dev to every upper_dev's upper device */ 5199 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { 5200 pr_debug("linking %s's upper device %s with %s\n", 5201 upper_dev->name, i->dev->name, dev->name); 5202 ret = __netdev_adjacent_dev_link(dev, i->dev); 5203 if (ret) 5204 goto rollback_upper_mesh; 5205 } 5206 5207 /* add upper_dev to every dev's lower device */ 5208 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5209 pr_debug("linking %s's lower device %s with %s\n", dev->name, 5210 i->dev->name, upper_dev->name); 5211 ret = __netdev_adjacent_dev_link(i->dev, upper_dev); 5212 if (ret) 5213 goto rollback_lower_mesh; 5214 } 5215 5216 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); 5217 return 0; 5218 5219 rollback_lower_mesh: 5220 to_i = i; 5221 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5222 if (i == to_i) 5223 break; 5224 __netdev_adjacent_dev_unlink(i->dev, upper_dev); 5225 } 5226 5227 i = NULL; 5228 5229 rollback_upper_mesh: 5230 to_i = i; 5231 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { 5232 if (i == to_i) 5233 break; 5234 __netdev_adjacent_dev_unlink(dev, i->dev); 5235 } 5236 5237 i = j = NULL; 5238 5239 rollback_mesh: 5240 to_i = i; 5241 to_j = j; 5242 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5243 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { 5244 if (i == to_i && j == to_j) 5245 break; 5246 __netdev_adjacent_dev_unlink(i->dev, j->dev); 5247 } 5248 if (i == to_i) 5249 break; 5250 } 5251 5252 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); 5253 5254 return ret; 5255 } 5256 5257 /** 5258 * netdev_upper_dev_link - Add a link to the upper device 5259 * @dev: device 5260 * @upper_dev: new upper device 5261 * 5262 * Adds a link to device which is upper to this one. The caller must hold 5263 * the RTNL lock. On a failure a negative errno code is returned. 5264 * On success the reference counts are adjusted and the function 5265 * returns zero. 5266 */ 5267 int netdev_upper_dev_link(struct net_device *dev, 5268 struct net_device *upper_dev) 5269 { 5270 return __netdev_upper_dev_link(dev, upper_dev, false, NULL); 5271 } 5272 EXPORT_SYMBOL(netdev_upper_dev_link); 5273 5274 /** 5275 * netdev_master_upper_dev_link - Add a master link to the upper device 5276 * @dev: device 5277 * @upper_dev: new upper device 5278 * 5279 * Adds a link to device which is upper to this one. In this case, only 5280 * one master upper device can be linked, although other non-master devices 5281 * might be linked as well. The caller must hold the RTNL lock. 5282 * On a failure a negative errno code is returned. On success the reference 5283 * counts are adjusted and the function returns zero. 
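 *
 * For instance, a bonding-style master enslaving a port does roughly the
 * following ("bond_dev" and "port_dev" are stand-in names), with the RTNL
 * lock held:
 *
 *	err = netdev_master_upper_dev_link(port_dev, bond_dev);
 *	if (err)
 *		return err;
 *	...
 *	netdev_upper_dev_unlink(port_dev, bond_dev);	(on release)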
 */
int netdev_master_upper_dev_link(struct net_device *dev,
				 struct net_device *upper_dev)
{
	return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);

int netdev_master_upper_dev_link_private(struct net_device *dev,
					 struct net_device *upper_dev,
					 void *private)
{
	return __netdev_upper_dev_link(dev, upper_dev, true, private);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link_private);

/**
 * netdev_upper_dev_unlink - Remove a link to an upper device
 * @dev: device
 * @upper_dev: upper device to remove
 *
 * Removes a link to a device which is upper to this one. The caller must
 * hold the RTNL lock.
 */
void netdev_upper_dev_unlink(struct net_device *dev,
			     struct net_device *upper_dev)
{
	struct netdev_adjacent *i, *j;
	ASSERT_RTNL();

	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

	/* Here is the tricky part. We must remove all dev's lower
	 * devices from all upper_dev's upper devices and vice
	 * versa, to maintain the graph relationship.
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list)
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
			__netdev_adjacent_dev_unlink(i->dev, j->dev);

	/* Also remove the devices themselves from each other's
	 * lower/upper device lists.
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list)
		__netdev_adjacent_dev_unlink(i->dev, upper_dev);

	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
		__netdev_adjacent_dev_unlink(dev, i->dev);

	call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);

/**
 * netdev_bonding_info_change - Dispatch event about slave change
 * @dev: device
 * @bonding_info: info to dispatch
 *
 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 * The caller must hold the RTNL lock.
5344 */ 5345 void netdev_bonding_info_change(struct net_device *dev, 5346 struct netdev_bonding_info *bonding_info) 5347 { 5348 struct netdev_notifier_bonding_info info; 5349 5350 memcpy(&info.bonding_info, bonding_info, 5351 sizeof(struct netdev_bonding_info)); 5352 call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev, 5353 &info.info); 5354 } 5355 EXPORT_SYMBOL(netdev_bonding_info_change); 5356 5357 static void netdev_adjacent_add_links(struct net_device *dev) 5358 { 5359 struct netdev_adjacent *iter; 5360 5361 struct net *net = dev_net(dev); 5362 5363 list_for_each_entry(iter, &dev->adj_list.upper, list) { 5364 if (!net_eq(net,dev_net(iter->dev))) 5365 continue; 5366 netdev_adjacent_sysfs_add(iter->dev, dev, 5367 &iter->dev->adj_list.lower); 5368 netdev_adjacent_sysfs_add(dev, iter->dev, 5369 &dev->adj_list.upper); 5370 } 5371 5372 list_for_each_entry(iter, &dev->adj_list.lower, list) { 5373 if (!net_eq(net,dev_net(iter->dev))) 5374 continue; 5375 netdev_adjacent_sysfs_add(iter->dev, dev, 5376 &iter->dev->adj_list.upper); 5377 netdev_adjacent_sysfs_add(dev, iter->dev, 5378 &dev->adj_list.lower); 5379 } 5380 } 5381 5382 static void netdev_adjacent_del_links(struct net_device *dev) 5383 { 5384 struct netdev_adjacent *iter; 5385 5386 struct net *net = dev_net(dev); 5387 5388 list_for_each_entry(iter, &dev->adj_list.upper, list) { 5389 if (!net_eq(net,dev_net(iter->dev))) 5390 continue; 5391 netdev_adjacent_sysfs_del(iter->dev, dev->name, 5392 &iter->dev->adj_list.lower); 5393 netdev_adjacent_sysfs_del(dev, iter->dev->name, 5394 &dev->adj_list.upper); 5395 } 5396 5397 list_for_each_entry(iter, &dev->adj_list.lower, list) { 5398 if (!net_eq(net,dev_net(iter->dev))) 5399 continue; 5400 netdev_adjacent_sysfs_del(iter->dev, dev->name, 5401 &iter->dev->adj_list.upper); 5402 netdev_adjacent_sysfs_del(dev, iter->dev->name, 5403 &dev->adj_list.lower); 5404 } 5405 } 5406 5407 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) 5408 { 5409 struct netdev_adjacent *iter; 5410 5411 struct net *net = dev_net(dev); 5412 5413 list_for_each_entry(iter, &dev->adj_list.upper, list) { 5414 if (!net_eq(net,dev_net(iter->dev))) 5415 continue; 5416 netdev_adjacent_sysfs_del(iter->dev, oldname, 5417 &iter->dev->adj_list.lower); 5418 netdev_adjacent_sysfs_add(iter->dev, dev, 5419 &iter->dev->adj_list.lower); 5420 } 5421 5422 list_for_each_entry(iter, &dev->adj_list.lower, list) { 5423 if (!net_eq(net,dev_net(iter->dev))) 5424 continue; 5425 netdev_adjacent_sysfs_del(iter->dev, oldname, 5426 &iter->dev->adj_list.upper); 5427 netdev_adjacent_sysfs_add(iter->dev, dev, 5428 &iter->dev->adj_list.upper); 5429 } 5430 } 5431 5432 void *netdev_lower_dev_get_private(struct net_device *dev, 5433 struct net_device *lower_dev) 5434 { 5435 struct netdev_adjacent *lower; 5436 5437 if (!lower_dev) 5438 return NULL; 5439 lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower); 5440 if (!lower) 5441 return NULL; 5442 5443 return lower->private; 5444 } 5445 EXPORT_SYMBOL(netdev_lower_dev_get_private); 5446 5447 5448 int dev_get_nest_level(struct net_device *dev, 5449 bool (*type_check)(struct net_device *dev)) 5450 { 5451 struct net_device *lower = NULL; 5452 struct list_head *iter; 5453 int max_nest = -1; 5454 int nest; 5455 5456 ASSERT_RTNL(); 5457 5458 netdev_for_each_lower_dev(dev, lower, iter) { 5459 nest = dev_get_nest_level(lower, type_check); 5460 if (max_nest < nest) 5461 max_nest = nest; 5462 } 5463 5464 if (type_check(dev)) 5465 max_nest++; 5466 5467 return max_nest; 5468 } 5469 
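/*
 * A sketch of how a stackable driver typically uses the recursion above:
 * the nesting depth becomes a lockdep subclass for its per-device locking.
 * netif_is_macvlan() is the macvlan type-check predicate; "priv->lock" is a
 * hypothetical driver lock.
 *
 *	int nest = dev_get_nest_level(dev, netif_is_macvlan);
 *
 *	spin_lock_nested(&priv->lock, nest);
 */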
EXPORT_SYMBOL(dev_get_nest_level); 5470 5471 static void dev_change_rx_flags(struct net_device *dev, int flags) 5472 { 5473 const struct net_device_ops *ops = dev->netdev_ops; 5474 5475 if (ops->ndo_change_rx_flags) 5476 ops->ndo_change_rx_flags(dev, flags); 5477 } 5478 5479 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) 5480 { 5481 unsigned int old_flags = dev->flags; 5482 kuid_t uid; 5483 kgid_t gid; 5484 5485 ASSERT_RTNL(); 5486 5487 dev->flags |= IFF_PROMISC; 5488 dev->promiscuity += inc; 5489 if (dev->promiscuity == 0) { 5490 /* 5491 * Avoid overflow. 5492 * If inc causes overflow, untouch promisc and return error. 5493 */ 5494 if (inc < 0) 5495 dev->flags &= ~IFF_PROMISC; 5496 else { 5497 dev->promiscuity -= inc; 5498 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n", 5499 dev->name); 5500 return -EOVERFLOW; 5501 } 5502 } 5503 if (dev->flags != old_flags) { 5504 pr_info("device %s %s promiscuous mode\n", 5505 dev->name, 5506 dev->flags & IFF_PROMISC ? "entered" : "left"); 5507 if (audit_enabled) { 5508 current_uid_gid(&uid, &gid); 5509 audit_log(current->audit_context, GFP_ATOMIC, 5510 AUDIT_ANOM_PROMISCUOUS, 5511 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", 5512 dev->name, (dev->flags & IFF_PROMISC), 5513 (old_flags & IFF_PROMISC), 5514 from_kuid(&init_user_ns, audit_get_loginuid(current)), 5515 from_kuid(&init_user_ns, uid), 5516 from_kgid(&init_user_ns, gid), 5517 audit_get_sessionid(current)); 5518 } 5519 5520 dev_change_rx_flags(dev, IFF_PROMISC); 5521 } 5522 if (notify) 5523 __dev_notify_flags(dev, old_flags, IFF_PROMISC); 5524 return 0; 5525 } 5526 5527 /** 5528 * dev_set_promiscuity - update promiscuity count on a device 5529 * @dev: device 5530 * @inc: modifier 5531 * 5532 * Add or remove promiscuity from a device. While the count in the device 5533 * remains above zero the interface remains promiscuous. Once it hits zero 5534 * the device reverts back to normal filtering operation. A negative inc 5535 * value is used to drop promiscuity on the device. 5536 * Return 0 if successful or a negative errno code on error. 5537 */ 5538 int dev_set_promiscuity(struct net_device *dev, int inc) 5539 { 5540 unsigned int old_flags = dev->flags; 5541 int err; 5542 5543 err = __dev_set_promiscuity(dev, inc, true); 5544 if (err < 0) 5545 return err; 5546 if (dev->flags != old_flags) 5547 dev_set_rx_mode(dev); 5548 return err; 5549 } 5550 EXPORT_SYMBOL(dev_set_promiscuity); 5551 5552 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) 5553 { 5554 unsigned int old_flags = dev->flags, old_gflags = dev->gflags; 5555 5556 ASSERT_RTNL(); 5557 5558 dev->flags |= IFF_ALLMULTI; 5559 dev->allmulti += inc; 5560 if (dev->allmulti == 0) { 5561 /* 5562 * Avoid overflow. 5563 * If inc causes overflow, untouch allmulti and return error. 5564 */ 5565 if (inc < 0) 5566 dev->flags &= ~IFF_ALLMULTI; 5567 else { 5568 dev->allmulti -= inc; 5569 pr_warn("%s: allmulti touches roof, set allmulti failed. 
allmulti feature of device might be broken.\n", 5570 dev->name); 5571 return -EOVERFLOW; 5572 } 5573 } 5574 if (dev->flags ^ old_flags) { 5575 dev_change_rx_flags(dev, IFF_ALLMULTI); 5576 dev_set_rx_mode(dev); 5577 if (notify) 5578 __dev_notify_flags(dev, old_flags, 5579 dev->gflags ^ old_gflags); 5580 } 5581 return 0; 5582 } 5583 5584 /** 5585 * dev_set_allmulti - update allmulti count on a device 5586 * @dev: device 5587 * @inc: modifier 5588 * 5589 * Add or remove reception of all multicast frames to a device. While the 5590 * count in the device remains above zero the interface remains listening 5591 * to all interfaces. Once it hits zero the device reverts back to normal 5592 * filtering operation. A negative @inc value is used to drop the counter 5593 * when releasing a resource needing all multicasts. 5594 * Return 0 if successful or a negative errno code on error. 5595 */ 5596 5597 int dev_set_allmulti(struct net_device *dev, int inc) 5598 { 5599 return __dev_set_allmulti(dev, inc, true); 5600 } 5601 EXPORT_SYMBOL(dev_set_allmulti); 5602 5603 /* 5604 * Upload unicast and multicast address lists to device and 5605 * configure RX filtering. When the device doesn't support unicast 5606 * filtering it is put in promiscuous mode while unicast addresses 5607 * are present. 5608 */ 5609 void __dev_set_rx_mode(struct net_device *dev) 5610 { 5611 const struct net_device_ops *ops = dev->netdev_ops; 5612 5613 /* dev_open will call this function so the list will stay sane. */ 5614 if (!(dev->flags&IFF_UP)) 5615 return; 5616 5617 if (!netif_device_present(dev)) 5618 return; 5619 5620 if (!(dev->priv_flags & IFF_UNICAST_FLT)) { 5621 /* Unicast addresses changes may only happen under the rtnl, 5622 * therefore calling __dev_set_promiscuity here is safe. 5623 */ 5624 if (!netdev_uc_empty(dev) && !dev->uc_promisc) { 5625 __dev_set_promiscuity(dev, 1, false); 5626 dev->uc_promisc = true; 5627 } else if (netdev_uc_empty(dev) && dev->uc_promisc) { 5628 __dev_set_promiscuity(dev, -1, false); 5629 dev->uc_promisc = false; 5630 } 5631 } 5632 5633 if (ops->ndo_set_rx_mode) 5634 ops->ndo_set_rx_mode(dev); 5635 } 5636 5637 void dev_set_rx_mode(struct net_device *dev) 5638 { 5639 netif_addr_lock_bh(dev); 5640 __dev_set_rx_mode(dev); 5641 netif_addr_unlock_bh(dev); 5642 } 5643 5644 /** 5645 * dev_get_flags - get flags reported to userspace 5646 * @dev: device 5647 * 5648 * Get the combination of flag bits exported through APIs to userspace. 5649 */ 5650 unsigned int dev_get_flags(const struct net_device *dev) 5651 { 5652 unsigned int flags; 5653 5654 flags = (dev->flags & ~(IFF_PROMISC | 5655 IFF_ALLMULTI | 5656 IFF_RUNNING | 5657 IFF_LOWER_UP | 5658 IFF_DORMANT)) | 5659 (dev->gflags & (IFF_PROMISC | 5660 IFF_ALLMULTI)); 5661 5662 if (netif_running(dev)) { 5663 if (netif_oper_up(dev)) 5664 flags |= IFF_RUNNING; 5665 if (netif_carrier_ok(dev)) 5666 flags |= IFF_LOWER_UP; 5667 if (netif_dormant(dev)) 5668 flags |= IFF_DORMANT; 5669 } 5670 5671 return flags; 5672 } 5673 EXPORT_SYMBOL(dev_get_flags); 5674 5675 int __dev_change_flags(struct net_device *dev, unsigned int flags) 5676 { 5677 unsigned int old_flags = dev->flags; 5678 int ret; 5679 5680 ASSERT_RTNL(); 5681 5682 /* 5683 * Set the flags on our device. 
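 *
 * Note that IFF_PROMISC and IFF_ALLMULTI are not copied in directly: the
 * user-requested values are tracked in dev->gflags and folded through the
 * refcounted helpers further down, so in-kernel users that need promiscuous
 * mode go through the counter API instead of editing dev->flags, e.g.
 * (sketch, with the RTNL lock held):
 *
 *	err = dev_set_promiscuity(dev, 1);	(take a reference)
 *	...
 *	dev_set_promiscuity(dev, -1);		(drop it again)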
5684 */ 5685 5686 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | 5687 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | 5688 IFF_AUTOMEDIA)) | 5689 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | 5690 IFF_ALLMULTI)); 5691 5692 /* 5693 * Load in the correct multicast list now the flags have changed. 5694 */ 5695 5696 if ((old_flags ^ flags) & IFF_MULTICAST) 5697 dev_change_rx_flags(dev, IFF_MULTICAST); 5698 5699 dev_set_rx_mode(dev); 5700 5701 /* 5702 * Have we downed the interface. We handle IFF_UP ourselves 5703 * according to user attempts to set it, rather than blindly 5704 * setting it. 5705 */ 5706 5707 ret = 0; 5708 if ((old_flags ^ flags) & IFF_UP) 5709 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); 5710 5711 if ((flags ^ dev->gflags) & IFF_PROMISC) { 5712 int inc = (flags & IFF_PROMISC) ? 1 : -1; 5713 unsigned int old_flags = dev->flags; 5714 5715 dev->gflags ^= IFF_PROMISC; 5716 5717 if (__dev_set_promiscuity(dev, inc, false) >= 0) 5718 if (dev->flags != old_flags) 5719 dev_set_rx_mode(dev); 5720 } 5721 5722 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI 5723 is important. Some (broken) drivers set IFF_PROMISC, when 5724 IFF_ALLMULTI is requested not asking us and not reporting. 5725 */ 5726 if ((flags ^ dev->gflags) & IFF_ALLMULTI) { 5727 int inc = (flags & IFF_ALLMULTI) ? 1 : -1; 5728 5729 dev->gflags ^= IFF_ALLMULTI; 5730 __dev_set_allmulti(dev, inc, false); 5731 } 5732 5733 return ret; 5734 } 5735 5736 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, 5737 unsigned int gchanges) 5738 { 5739 unsigned int changes = dev->flags ^ old_flags; 5740 5741 if (gchanges) 5742 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); 5743 5744 if (changes & IFF_UP) { 5745 if (dev->flags & IFF_UP) 5746 call_netdevice_notifiers(NETDEV_UP, dev); 5747 else 5748 call_netdevice_notifiers(NETDEV_DOWN, dev); 5749 } 5750 5751 if (dev->flags & IFF_UP && 5752 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { 5753 struct netdev_notifier_change_info change_info; 5754 5755 change_info.flags_changed = changes; 5756 call_netdevice_notifiers_info(NETDEV_CHANGE, dev, 5757 &change_info.info); 5758 } 5759 } 5760 5761 /** 5762 * dev_change_flags - change device settings 5763 * @dev: device 5764 * @flags: device state flags 5765 * 5766 * Change settings on device based state flags. The flags are 5767 * in the userspace exported format. 5768 */ 5769 int dev_change_flags(struct net_device *dev, unsigned int flags) 5770 { 5771 int ret; 5772 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; 5773 5774 ret = __dev_change_flags(dev, flags); 5775 if (ret < 0) 5776 return ret; 5777 5778 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); 5779 __dev_notify_flags(dev, old_flags, changes); 5780 return ret; 5781 } 5782 EXPORT_SYMBOL(dev_change_flags); 5783 5784 static int __dev_set_mtu(struct net_device *dev, int new_mtu) 5785 { 5786 const struct net_device_ops *ops = dev->netdev_ops; 5787 5788 if (ops->ndo_change_mtu) 5789 return ops->ndo_change_mtu(dev, new_mtu); 5790 5791 dev->mtu = new_mtu; 5792 return 0; 5793 } 5794 5795 /** 5796 * dev_set_mtu - Change maximum transfer unit 5797 * @dev: device 5798 * @new_mtu: new transfer unit 5799 * 5800 * Change the maximum transfer size of the network device. 5801 */ 5802 int dev_set_mtu(struct net_device *dev, int new_mtu) 5803 { 5804 int err, orig_mtu; 5805 5806 if (new_mtu == dev->mtu) 5807 return 0; 5808 5809 /* MTU must be positive. 
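 *
 * The rest of this function is notifier plumbing: NETDEV_PRECHANGEMTU
 * listeners may veto the new value, and the old MTU is restored if a
 * NETDEV_CHANGEMTU listener objects afterwards. A caller therefore only
 * needs something like this sketch (typically under RTNL):
 *
 *	err = dev_set_mtu(dev, new_mtu);
 *	if (err)
 *		netdev_err(dev, "cannot set MTU to %d: %d\n", new_mtu, err);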
*/ 5810 if (new_mtu < 0) 5811 return -EINVAL; 5812 5813 if (!netif_device_present(dev)) 5814 return -ENODEV; 5815 5816 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); 5817 err = notifier_to_errno(err); 5818 if (err) 5819 return err; 5820 5821 orig_mtu = dev->mtu; 5822 err = __dev_set_mtu(dev, new_mtu); 5823 5824 if (!err) { 5825 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); 5826 err = notifier_to_errno(err); 5827 if (err) { 5828 /* setting mtu back and notifying everyone again, 5829 * so that they have a chance to revert changes. 5830 */ 5831 __dev_set_mtu(dev, orig_mtu); 5832 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); 5833 } 5834 } 5835 return err; 5836 } 5837 EXPORT_SYMBOL(dev_set_mtu); 5838 5839 /** 5840 * dev_set_group - Change group this device belongs to 5841 * @dev: device 5842 * @new_group: group this device should belong to 5843 */ 5844 void dev_set_group(struct net_device *dev, int new_group) 5845 { 5846 dev->group = new_group; 5847 } 5848 EXPORT_SYMBOL(dev_set_group); 5849 5850 /** 5851 * dev_set_mac_address - Change Media Access Control Address 5852 * @dev: device 5853 * @sa: new address 5854 * 5855 * Change the hardware (MAC) address of the device 5856 */ 5857 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) 5858 { 5859 const struct net_device_ops *ops = dev->netdev_ops; 5860 int err; 5861 5862 if (!ops->ndo_set_mac_address) 5863 return -EOPNOTSUPP; 5864 if (sa->sa_family != dev->type) 5865 return -EINVAL; 5866 if (!netif_device_present(dev)) 5867 return -ENODEV; 5868 err = ops->ndo_set_mac_address(dev, sa); 5869 if (err) 5870 return err; 5871 dev->addr_assign_type = NET_ADDR_SET; 5872 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 5873 add_device_randomness(dev->dev_addr, dev->addr_len); 5874 return 0; 5875 } 5876 EXPORT_SYMBOL(dev_set_mac_address); 5877 5878 /** 5879 * dev_change_carrier - Change device carrier 5880 * @dev: device 5881 * @new_carrier: new value 5882 * 5883 * Change device carrier 5884 */ 5885 int dev_change_carrier(struct net_device *dev, bool new_carrier) 5886 { 5887 const struct net_device_ops *ops = dev->netdev_ops; 5888 5889 if (!ops->ndo_change_carrier) 5890 return -EOPNOTSUPP; 5891 if (!netif_device_present(dev)) 5892 return -ENODEV; 5893 return ops->ndo_change_carrier(dev, new_carrier); 5894 } 5895 EXPORT_SYMBOL(dev_change_carrier); 5896 5897 /** 5898 * dev_get_phys_port_id - Get device physical port ID 5899 * @dev: device 5900 * @ppid: port ID 5901 * 5902 * Get device physical port ID 5903 */ 5904 int dev_get_phys_port_id(struct net_device *dev, 5905 struct netdev_phys_item_id *ppid) 5906 { 5907 const struct net_device_ops *ops = dev->netdev_ops; 5908 5909 if (!ops->ndo_get_phys_port_id) 5910 return -EOPNOTSUPP; 5911 return ops->ndo_get_phys_port_id(dev, ppid); 5912 } 5913 EXPORT_SYMBOL(dev_get_phys_port_id); 5914 5915 /** 5916 * dev_get_phys_port_name - Get device physical port name 5917 * @dev: device 5918 * @name: port name 5919 * 5920 * Get device physical port name 5921 */ 5922 int dev_get_phys_port_name(struct net_device *dev, 5923 char *name, size_t len) 5924 { 5925 const struct net_device_ops *ops = dev->netdev_ops; 5926 5927 if (!ops->ndo_get_phys_port_name) 5928 return -EOPNOTSUPP; 5929 return ops->ndo_get_phys_port_name(dev, name, len); 5930 } 5931 EXPORT_SYMBOL(dev_get_phys_port_name); 5932 5933 /** 5934 * dev_new_index - allocate an ifindex 5935 * @net: the applicable net namespace 5936 * 5937 * Returns a suitable unique value for a new device interface 5938 * number. 
The caller must hold the rtnl semaphore or the 5939 * dev_base_lock to be sure it remains unique. 5940 */ 5941 static int dev_new_index(struct net *net) 5942 { 5943 int ifindex = net->ifindex; 5944 for (;;) { 5945 if (++ifindex <= 0) 5946 ifindex = 1; 5947 if (!__dev_get_by_index(net, ifindex)) 5948 return net->ifindex = ifindex; 5949 } 5950 } 5951 5952 /* Delayed registration/unregisteration */ 5953 static LIST_HEAD(net_todo_list); 5954 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); 5955 5956 static void net_set_todo(struct net_device *dev) 5957 { 5958 list_add_tail(&dev->todo_list, &net_todo_list); 5959 dev_net(dev)->dev_unreg_count++; 5960 } 5961 5962 static void rollback_registered_many(struct list_head *head) 5963 { 5964 struct net_device *dev, *tmp; 5965 LIST_HEAD(close_head); 5966 5967 BUG_ON(dev_boot_phase); 5968 ASSERT_RTNL(); 5969 5970 list_for_each_entry_safe(dev, tmp, head, unreg_list) { 5971 /* Some devices call without registering 5972 * for initialization unwind. Remove those 5973 * devices and proceed with the remaining. 5974 */ 5975 if (dev->reg_state == NETREG_UNINITIALIZED) { 5976 pr_debug("unregister_netdevice: device %s/%p never was registered\n", 5977 dev->name, dev); 5978 5979 WARN_ON(1); 5980 list_del(&dev->unreg_list); 5981 continue; 5982 } 5983 dev->dismantle = true; 5984 BUG_ON(dev->reg_state != NETREG_REGISTERED); 5985 } 5986 5987 /* If device is running, close it first. */ 5988 list_for_each_entry(dev, head, unreg_list) 5989 list_add_tail(&dev->close_list, &close_head); 5990 dev_close_many(&close_head, true); 5991 5992 list_for_each_entry(dev, head, unreg_list) { 5993 /* And unlink it from device chain. */ 5994 unlist_netdevice(dev); 5995 5996 dev->reg_state = NETREG_UNREGISTERING; 5997 } 5998 5999 synchronize_net(); 6000 6001 list_for_each_entry(dev, head, unreg_list) { 6002 struct sk_buff *skb = NULL; 6003 6004 /* Shutdown queueing discipline. */ 6005 dev_shutdown(dev); 6006 6007 6008 /* Notify protocols, that we are about to destroy 6009 this device. They should clean all the things. 6010 */ 6011 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 6012 6013 if (!dev->rtnl_link_ops || 6014 dev->rtnl_link_state == RTNL_LINK_INITIALIZED) 6015 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 6016 GFP_KERNEL); 6017 6018 /* 6019 * Flush the unicast and multicast chains 6020 */ 6021 dev_uc_flush(dev); 6022 dev_mc_flush(dev); 6023 6024 if (dev->netdev_ops->ndo_uninit) 6025 dev->netdev_ops->ndo_uninit(dev); 6026 6027 if (skb) 6028 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); 6029 6030 /* Notifier chain MUST detach us all upper devices. 
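 *
 * NETDEV_UNREGISTER was already sent above, and subscribers are expected
 * to react roughly like this sketch (hypothetical subsystem; the callback
 * drops any upper links and private state it holds for the device):
 *
 *	static int foo_netdev_event(struct notifier_block *nb,
 *				    unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UNREGISTER)
 *			foo_release_device(dev);
 *		return NOTIFY_DONE;
 *	}
 *
 * That is what the WARN_ON below relies on.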
*/ 6031 WARN_ON(netdev_has_any_upper_dev(dev)); 6032 6033 /* Remove entries from kobject tree */ 6034 netdev_unregister_kobject(dev); 6035 #ifdef CONFIG_XPS 6036 /* Remove XPS queueing entries */ 6037 netif_reset_xps_queues_gt(dev, 0); 6038 #endif 6039 } 6040 6041 synchronize_net(); 6042 6043 list_for_each_entry(dev, head, unreg_list) 6044 dev_put(dev); 6045 } 6046 6047 static void rollback_registered(struct net_device *dev) 6048 { 6049 LIST_HEAD(single); 6050 6051 list_add(&dev->unreg_list, &single); 6052 rollback_registered_many(&single); 6053 list_del(&single); 6054 } 6055 6056 static netdev_features_t netdev_fix_features(struct net_device *dev, 6057 netdev_features_t features) 6058 { 6059 /* Fix illegal checksum combinations */ 6060 if ((features & NETIF_F_HW_CSUM) && 6061 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 6062 netdev_warn(dev, "mixed HW and IP checksum settings.\n"); 6063 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); 6064 } 6065 6066 /* TSO requires that SG is present as well. */ 6067 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { 6068 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); 6069 features &= ~NETIF_F_ALL_TSO; 6070 } 6071 6072 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && 6073 !(features & NETIF_F_IP_CSUM)) { 6074 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); 6075 features &= ~NETIF_F_TSO; 6076 features &= ~NETIF_F_TSO_ECN; 6077 } 6078 6079 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && 6080 !(features & NETIF_F_IPV6_CSUM)) { 6081 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); 6082 features &= ~NETIF_F_TSO6; 6083 } 6084 6085 /* TSO ECN requires that TSO is present as well. */ 6086 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) 6087 features &= ~NETIF_F_TSO_ECN; 6088 6089 /* Software GSO depends on SG. */ 6090 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { 6091 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); 6092 features &= ~NETIF_F_GSO; 6093 } 6094 6095 /* UFO needs SG and checksumming */ 6096 if (features & NETIF_F_UFO) { 6097 /* maybe split UFO into V4 and V6? 
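 *
 * Drivers can hook the same kind of dependency fixup via ndo_fix_features(),
 * which __netdev_update_features() runs before this generic pass. A sketch
 * of such a hook (hypothetical constraint: LRO only works when RX
 * checksumming stays enabled):
 *
 *	static netdev_features_t foo_fix_features(struct net_device *dev,
 *						  netdev_features_t features)
 *	{
 *		if (!(features & NETIF_F_RXCSUM))
 *			features &= ~NETIF_F_LRO;
 *		return features;
 *	}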
*/ 6098 if (!((features & NETIF_F_GEN_CSUM) || 6099 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) 6100 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 6101 netdev_dbg(dev, 6102 "Dropping NETIF_F_UFO since no checksum offload features.\n"); 6103 features &= ~NETIF_F_UFO; 6104 } 6105 6106 if (!(features & NETIF_F_SG)) { 6107 netdev_dbg(dev, 6108 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); 6109 features &= ~NETIF_F_UFO; 6110 } 6111 } 6112 6113 #ifdef CONFIG_NET_RX_BUSY_POLL 6114 if (dev->netdev_ops->ndo_busy_poll) 6115 features |= NETIF_F_BUSY_POLL; 6116 else 6117 #endif 6118 features &= ~NETIF_F_BUSY_POLL; 6119 6120 return features; 6121 } 6122 6123 int __netdev_update_features(struct net_device *dev) 6124 { 6125 netdev_features_t features; 6126 int err = 0; 6127 6128 ASSERT_RTNL(); 6129 6130 features = netdev_get_wanted_features(dev); 6131 6132 if (dev->netdev_ops->ndo_fix_features) 6133 features = dev->netdev_ops->ndo_fix_features(dev, features); 6134 6135 /* driver might be less strict about feature dependencies */ 6136 features = netdev_fix_features(dev, features); 6137 6138 if (dev->features == features) 6139 return 0; 6140 6141 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", 6142 &dev->features, &features); 6143 6144 if (dev->netdev_ops->ndo_set_features) 6145 err = dev->netdev_ops->ndo_set_features(dev, features); 6146 6147 if (unlikely(err < 0)) { 6148 netdev_err(dev, 6149 "set_features() failed (%d); wanted %pNF, left %pNF\n", 6150 err, &features, &dev->features); 6151 return -1; 6152 } 6153 6154 if (!err) 6155 dev->features = features; 6156 6157 return 1; 6158 } 6159 6160 /** 6161 * netdev_update_features - recalculate device features 6162 * @dev: the device to check 6163 * 6164 * Recalculate dev->features set and send notifications if it 6165 * has changed. Should be called after driver or hardware dependent 6166 * conditions might have changed that influence the features. 6167 */ 6168 void netdev_update_features(struct net_device *dev) 6169 { 6170 if (__netdev_update_features(dev)) 6171 netdev_features_change(dev); 6172 } 6173 EXPORT_SYMBOL(netdev_update_features); 6174 6175 /** 6176 * netdev_change_features - recalculate device features 6177 * @dev: the device to check 6178 * 6179 * Recalculate dev->features set and send notifications even 6180 * if they have not changed. Should be called instead of 6181 * netdev_update_features() if also dev->vlan_features might 6182 * have changed to allow the changes to be propagated to stacked 6183 * VLAN devices. 6184 */ 6185 void netdev_change_features(struct net_device *dev) 6186 { 6187 __netdev_update_features(dev); 6188 netdev_features_change(dev); 6189 } 6190 EXPORT_SYMBOL(netdev_change_features); 6191 6192 /** 6193 * netif_stacked_transfer_operstate - transfer operstate 6194 * @rootdev: the root or lower level device to transfer state from 6195 * @dev: the device to transfer operstate to 6196 * 6197 * Transfer operational state from root to device. This is normally 6198 * called when a stacking relationship exists between the root 6199 * device and the device(a leaf device). 
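 *
 * A purely illustrative sketch of a caller (the foo_*() names are
 * hypothetical, not from this file): a stacking driver would typically
 * invoke this from its netdevice notifier when the lower device it is
 * bound to changes state, e.g.
 *
 *	static int foo_netdev_event(struct notifier_block *nb,
 *				    unsigned long event, void *ptr)
 *	{
 *		struct net_device *lower = netdev_notifier_info_to_dev(ptr);
 *		struct net_device *upper = foo_get_upper(lower);
 *
 *		if (upper && event == NETDEV_CHANGE)
 *			netif_stacked_transfer_operstate(lower, upper);
 *		return NOTIFY_DONE;
 *	}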
6200 */ 6201 void netif_stacked_transfer_operstate(const struct net_device *rootdev, 6202 struct net_device *dev) 6203 { 6204 if (rootdev->operstate == IF_OPER_DORMANT) 6205 netif_dormant_on(dev); 6206 else 6207 netif_dormant_off(dev); 6208 6209 if (netif_carrier_ok(rootdev)) { 6210 if (!netif_carrier_ok(dev)) 6211 netif_carrier_on(dev); 6212 } else { 6213 if (netif_carrier_ok(dev)) 6214 netif_carrier_off(dev); 6215 } 6216 } 6217 EXPORT_SYMBOL(netif_stacked_transfer_operstate); 6218 6219 #ifdef CONFIG_SYSFS 6220 static int netif_alloc_rx_queues(struct net_device *dev) 6221 { 6222 unsigned int i, count = dev->num_rx_queues; 6223 struct netdev_rx_queue *rx; 6224 size_t sz = count * sizeof(*rx); 6225 6226 BUG_ON(count < 1); 6227 6228 rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 6229 if (!rx) { 6230 rx = vzalloc(sz); 6231 if (!rx) 6232 return -ENOMEM; 6233 } 6234 dev->_rx = rx; 6235 6236 for (i = 0; i < count; i++) 6237 rx[i].dev = dev; 6238 return 0; 6239 } 6240 #endif 6241 6242 static void netdev_init_one_queue(struct net_device *dev, 6243 struct netdev_queue *queue, void *_unused) 6244 { 6245 /* Initialize queue lock */ 6246 spin_lock_init(&queue->_xmit_lock); 6247 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); 6248 queue->xmit_lock_owner = -1; 6249 netdev_queue_numa_node_write(queue, NUMA_NO_NODE); 6250 queue->dev = dev; 6251 #ifdef CONFIG_BQL 6252 dql_init(&queue->dql, HZ); 6253 #endif 6254 } 6255 6256 static void netif_free_tx_queues(struct net_device *dev) 6257 { 6258 kvfree(dev->_tx); 6259 } 6260 6261 static int netif_alloc_netdev_queues(struct net_device *dev) 6262 { 6263 unsigned int count = dev->num_tx_queues; 6264 struct netdev_queue *tx; 6265 size_t sz = count * sizeof(*tx); 6266 6267 BUG_ON(count < 1 || count > 0xffff); 6268 6269 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 6270 if (!tx) { 6271 tx = vzalloc(sz); 6272 if (!tx) 6273 return -ENOMEM; 6274 } 6275 dev->_tx = tx; 6276 6277 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); 6278 spin_lock_init(&dev->tx_global_lock); 6279 6280 return 0; 6281 } 6282 6283 /** 6284 * register_netdevice - register a network device 6285 * @dev: device to register 6286 * 6287 * Take a completed network device structure and add it to the kernel 6288 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 6289 * chain. 0 is returned on success. A negative errno code is returned 6290 * on a failure to set up the device, or if the name is a duplicate. 6291 * 6292 * Callers must hold the rtnl semaphore. You may want 6293 * register_netdev() instead of this. 6294 * 6295 * BUGS: 6296 * The locking appears insufficient to guarantee two parallel registers 6297 * will not get the same name. 6298 */ 6299 6300 int register_netdevice(struct net_device *dev) 6301 { 6302 int ret; 6303 struct net *net = dev_net(dev); 6304 6305 BUG_ON(dev_boot_phase); 6306 ASSERT_RTNL(); 6307 6308 might_sleep(); 6309 6310 /* When net_device's are persistent, this will be fatal. 
*/ 6311 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); 6312 BUG_ON(!net); 6313 6314 spin_lock_init(&dev->addr_list_lock); 6315 netdev_set_addr_lockdep_class(dev); 6316 6317 dev->iflink = -1; 6318 6319 ret = dev_get_valid_name(net, dev, dev->name); 6320 if (ret < 0) 6321 goto out; 6322 6323 /* Init, if this function is available */ 6324 if (dev->netdev_ops->ndo_init) { 6325 ret = dev->netdev_ops->ndo_init(dev); 6326 if (ret) { 6327 if (ret > 0) 6328 ret = -EIO; 6329 goto out; 6330 } 6331 } 6332 6333 if (((dev->hw_features | dev->features) & 6334 NETIF_F_HW_VLAN_CTAG_FILTER) && 6335 (!dev->netdev_ops->ndo_vlan_rx_add_vid || 6336 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { 6337 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); 6338 ret = -EINVAL; 6339 goto err_uninit; 6340 } 6341 6342 ret = -EBUSY; 6343 if (!dev->ifindex) 6344 dev->ifindex = dev_new_index(net); 6345 else if (__dev_get_by_index(net, dev->ifindex)) 6346 goto err_uninit; 6347 6348 if (dev->iflink == -1) 6349 dev->iflink = dev->ifindex; 6350 6351 /* Transfer changeable features to wanted_features and enable 6352 * software offloads (GSO and GRO). 6353 */ 6354 dev->hw_features |= NETIF_F_SOFT_FEATURES; 6355 dev->features |= NETIF_F_SOFT_FEATURES; 6356 dev->wanted_features = dev->features & dev->hw_features; 6357 6358 if (!(dev->flags & IFF_LOOPBACK)) { 6359 dev->hw_features |= NETIF_F_NOCACHE_COPY; 6360 } 6361 6362 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. 6363 */ 6364 dev->vlan_features |= NETIF_F_HIGHDMA; 6365 6366 /* Make NETIF_F_SG inheritable to tunnel devices. 6367 */ 6368 dev->hw_enc_features |= NETIF_F_SG; 6369 6370 /* Make NETIF_F_SG inheritable to MPLS. 6371 */ 6372 dev->mpls_features |= NETIF_F_SG; 6373 6374 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); 6375 ret = notifier_to_errno(ret); 6376 if (ret) 6377 goto err_uninit; 6378 6379 ret = netdev_register_kobject(dev); 6380 if (ret) 6381 goto err_uninit; 6382 dev->reg_state = NETREG_REGISTERED; 6383 6384 __netdev_update_features(dev); 6385 6386 /* 6387 * Default initial state at registry is that the 6388 * device is present. 6389 */ 6390 6391 set_bit(__LINK_STATE_PRESENT, &dev->state); 6392 6393 linkwatch_init_dev(dev); 6394 6395 dev_init_scheduler(dev); 6396 dev_hold(dev); 6397 list_netdevice(dev); 6398 add_device_randomness(dev->dev_addr, dev->addr_len); 6399 6400 /* If the device has permanent device address, driver should 6401 * set dev_addr and also addr_assign_type should be set to 6402 * NET_ADDR_PERM (default value). 6403 */ 6404 if (dev->addr_assign_type == NET_ADDR_PERM) 6405 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); 6406 6407 /* Notify protocols, that a new device appeared. */ 6408 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); 6409 ret = notifier_to_errno(ret); 6410 if (ret) { 6411 rollback_registered(dev); 6412 dev->reg_state = NETREG_UNREGISTERED; 6413 } 6414 /* 6415 * Prevent userspace races by waiting until the network 6416 * device is fully setup before sending notifications. 
6417 */ 6418 if (!dev->rtnl_link_ops || 6419 dev->rtnl_link_state == RTNL_LINK_INITIALIZED) 6420 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); 6421 6422 out: 6423 return ret; 6424 6425 err_uninit: 6426 if (dev->netdev_ops->ndo_uninit) 6427 dev->netdev_ops->ndo_uninit(dev); 6428 goto out; 6429 } 6430 EXPORT_SYMBOL(register_netdevice); 6431 6432 /** 6433 * init_dummy_netdev - init a dummy network device for NAPI 6434 * @dev: device to init 6435 * 6436 * This takes a network device structure and initializes the minimum 6437 * number of fields so it can be used to schedule NAPI polls without 6438 * registering a full blown interface. This is to be used by drivers 6439 * that need to tie several hardware interfaces to a single NAPI 6440 * poll scheduler due to HW limitations. 6441 */ 6442 int init_dummy_netdev(struct net_device *dev) 6443 { 6444 /* Clear everything. Note we don't initialize spinlocks 6445 * as they aren't supposed to be taken by any of the 6446 * NAPI code and this dummy netdev is supposed to be 6447 * only ever used for NAPI polls. 6448 */ 6449 memset(dev, 0, sizeof(struct net_device)); 6450 6451 /* make sure we BUG if trying to hit standard 6452 * register/unregister code path 6453 */ 6454 dev->reg_state = NETREG_DUMMY; 6455 6456 /* NAPI wants this */ 6457 INIT_LIST_HEAD(&dev->napi_list); 6458 6459 /* a dummy interface is started by default */ 6460 set_bit(__LINK_STATE_PRESENT, &dev->state); 6461 set_bit(__LINK_STATE_START, &dev->state); 6462 6463 /* Note: We don't allocate pcpu_refcnt for dummy devices, 6464 * because users of this 'device' don't need to change 6465 * its refcount. 6466 */ 6467 6468 return 0; 6469 } 6470 EXPORT_SYMBOL_GPL(init_dummy_netdev); 6471 6472 6473 /** 6474 * register_netdev - register a network device 6475 * @dev: device to register 6476 * 6477 * Take a completed network device structure and add it to the kernel 6478 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 6479 * chain. 0 is returned on success. A negative errno code is returned 6480 * on a failure to set up the device, or if the name is a duplicate. 6481 * 6482 * This is a wrapper around register_netdevice that takes the rtnl semaphore 6483 * and expands the device name if you passed a format string to 6484 * alloc_netdev. 6485 */ 6486 int register_netdev(struct net_device *dev) 6487 { 6488 int err; 6489 6490 rtnl_lock(); 6491 err = register_netdevice(dev); 6492 rtnl_unlock(); 6493 return err; 6494 } 6495 EXPORT_SYMBOL(register_netdev); 6496 6497 int netdev_refcnt_read(const struct net_device *dev) 6498 { 6499 int i, refcnt = 0; 6500 6501 for_each_possible_cpu(i) 6502 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); 6503 return refcnt; 6504 } 6505 EXPORT_SYMBOL(netdev_refcnt_read); 6506 6507 /** 6508 * netdev_wait_allrefs - wait until all references are gone. 6509 * @dev: target net_device 6510 * 6511 * This is called when unregistering network devices. 6512 * 6513 * Any protocol or device that holds a reference should register 6514 * for netdevice notification, and clean up and put back the 6515 * reference if they receive an UNREGISTER event. 6516 * We can get stuck here if buggy protocols don't correctly 6517 * call dev_put.
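 *
 * As an illustrative sketch only (the foo_*() helpers are hypothetical),
 * such a protocol would normally pair its dev_hold() with a notifier
 * along these lines:
 *
 *	static int foo_netdev_event(struct notifier_block *nb,
 *				    unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UNREGISTER && foo_is_tracking(dev)) {
 *			foo_forget(dev);
 *			dev_put(dev);
 *		}
 *		return NOTIFY_DONE;
 *	}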
6518 */ 6519 static void netdev_wait_allrefs(struct net_device *dev) 6520 { 6521 unsigned long rebroadcast_time, warning_time; 6522 int refcnt; 6523 6524 linkwatch_forget_dev(dev); 6525 6526 rebroadcast_time = warning_time = jiffies; 6527 refcnt = netdev_refcnt_read(dev); 6528 6529 while (refcnt != 0) { 6530 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { 6531 rtnl_lock(); 6532 6533 /* Rebroadcast unregister notification */ 6534 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 6535 6536 __rtnl_unlock(); 6537 rcu_barrier(); 6538 rtnl_lock(); 6539 6540 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 6541 if (test_bit(__LINK_STATE_LINKWATCH_PENDING, 6542 &dev->state)) { 6543 /* We must not have linkwatch events 6544 * pending on unregister. If this 6545 * happens, we simply run the queue 6546 * unscheduled, resulting in a noop 6547 * for this device. 6548 */ 6549 linkwatch_run_queue(); 6550 } 6551 6552 __rtnl_unlock(); 6553 6554 rebroadcast_time = jiffies; 6555 } 6556 6557 msleep(250); 6558 6559 refcnt = netdev_refcnt_read(dev); 6560 6561 if (time_after(jiffies, warning_time + 10 * HZ)) { 6562 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", 6563 dev->name, refcnt); 6564 warning_time = jiffies; 6565 } 6566 } 6567 } 6568 6569 /* The sequence is: 6570 * 6571 * rtnl_lock(); 6572 * ... 6573 * register_netdevice(x1); 6574 * register_netdevice(x2); 6575 * ... 6576 * unregister_netdevice(y1); 6577 * unregister_netdevice(y2); 6578 * ... 6579 * rtnl_unlock(); 6580 * free_netdev(y1); 6581 * free_netdev(y2); 6582 * 6583 * We are invoked by rtnl_unlock(). 6584 * This allows us to deal with problems: 6585 * 1) We can delete sysfs objects which invoke hotplug 6586 * without deadlocking with linkwatch via keventd. 6587 * 2) Since we run with the RTNL semaphore not held, we can sleep 6588 * safely in order to wait for the netdev refcnt to drop to zero. 6589 * 6590 * We must not return until all unregister events added during 6591 * the interval the lock was held have been completed. 
6592 */ 6593 void netdev_run_todo(void) 6594 { 6595 struct list_head list; 6596 6597 /* Snapshot list, allow later requests */ 6598 list_replace_init(&net_todo_list, &list); 6599 6600 __rtnl_unlock(); 6601 6602 6603 /* Wait for rcu callbacks to finish before next phase */ 6604 if (!list_empty(&list)) 6605 rcu_barrier(); 6606 6607 while (!list_empty(&list)) { 6608 struct net_device *dev 6609 = list_first_entry(&list, struct net_device, todo_list); 6610 list_del(&dev->todo_list); 6611 6612 rtnl_lock(); 6613 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 6614 __rtnl_unlock(); 6615 6616 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { 6617 pr_err("network todo '%s' but state %d\n", 6618 dev->name, dev->reg_state); 6619 dump_stack(); 6620 continue; 6621 } 6622 6623 dev->reg_state = NETREG_UNREGISTERED; 6624 6625 on_each_cpu(flush_backlog, dev, 1); 6626 6627 netdev_wait_allrefs(dev); 6628 6629 /* paranoia */ 6630 BUG_ON(netdev_refcnt_read(dev)); 6631 BUG_ON(!list_empty(&dev->ptype_all)); 6632 BUG_ON(!list_empty(&dev->ptype_specific)); 6633 WARN_ON(rcu_access_pointer(dev->ip_ptr)); 6634 WARN_ON(rcu_access_pointer(dev->ip6_ptr)); 6635 WARN_ON(dev->dn_ptr); 6636 6637 if (dev->destructor) 6638 dev->destructor(dev); 6639 6640 /* Report a network device has been unregistered */ 6641 rtnl_lock(); 6642 dev_net(dev)->dev_unreg_count--; 6643 __rtnl_unlock(); 6644 wake_up(&netdev_unregistering_wq); 6645 6646 /* Free network device */ 6647 kobject_put(&dev->dev.kobj); 6648 } 6649 } 6650 6651 /* Convert net_device_stats to rtnl_link_stats64. They have the same 6652 * fields in the same order, with only the type differing. 6653 */ 6654 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, 6655 const struct net_device_stats *netdev_stats) 6656 { 6657 #if BITS_PER_LONG == 64 6658 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); 6659 memcpy(stats64, netdev_stats, sizeof(*stats64)); 6660 #else 6661 size_t i, n = sizeof(*stats64) / sizeof(u64); 6662 const unsigned long *src = (const unsigned long *)netdev_stats; 6663 u64 *dst = (u64 *)stats64; 6664 6665 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) != 6666 sizeof(*stats64) / sizeof(u64)); 6667 for (i = 0; i < n; i++) 6668 dst[i] = src[i]; 6669 #endif 6670 } 6671 EXPORT_SYMBOL(netdev_stats_to_stats64); 6672 6673 /** 6674 * dev_get_stats - get network device statistics 6675 * @dev: device to get statistics from 6676 * @storage: place to store stats 6677 * 6678 * Get network statistics from device. Return @storage. 6679 * The device driver may provide its own method by setting 6680 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; 6681 * otherwise the internal statistics structure is used. 
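 *
 * A minimal caller sketch, for illustration (assumes the caller already
 * holds a valid reference to @dev):
 *
 *	struct rtnl_link_stats64 stats;
 *
 *	dev_get_stats(dev, &stats);
 *	pr_info("%s: rx %llu tx %llu packets\n",
 *		dev->name, stats.rx_packets, stats.tx_packets);
 *
 * Since @storage is both filled in and returned, the return value can
 * also be used directly in expressions.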
6682 */ 6683 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, 6684 struct rtnl_link_stats64 *storage) 6685 { 6686 const struct net_device_ops *ops = dev->netdev_ops; 6687 6688 if (ops->ndo_get_stats64) { 6689 memset(storage, 0, sizeof(*storage)); 6690 ops->ndo_get_stats64(dev, storage); 6691 } else if (ops->ndo_get_stats) { 6692 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); 6693 } else { 6694 netdev_stats_to_stats64(storage, &dev->stats); 6695 } 6696 storage->rx_dropped += atomic_long_read(&dev->rx_dropped); 6697 storage->tx_dropped += atomic_long_read(&dev->tx_dropped); 6698 return storage; 6699 } 6700 EXPORT_SYMBOL(dev_get_stats); 6701 6702 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) 6703 { 6704 struct netdev_queue *queue = dev_ingress_queue(dev); 6705 6706 #ifdef CONFIG_NET_CLS_ACT 6707 if (queue) 6708 return queue; 6709 queue = kzalloc(sizeof(*queue), GFP_KERNEL); 6710 if (!queue) 6711 return NULL; 6712 netdev_init_one_queue(dev, queue, NULL); 6713 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); 6714 queue->qdisc_sleeping = &noop_qdisc; 6715 rcu_assign_pointer(dev->ingress_queue, queue); 6716 #endif 6717 return queue; 6718 } 6719 6720 static const struct ethtool_ops default_ethtool_ops; 6721 6722 void netdev_set_default_ethtool_ops(struct net_device *dev, 6723 const struct ethtool_ops *ops) 6724 { 6725 if (dev->ethtool_ops == &default_ethtool_ops) 6726 dev->ethtool_ops = ops; 6727 } 6728 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); 6729 6730 void netdev_freemem(struct net_device *dev) 6731 { 6732 char *addr = (char *)dev - dev->padded; 6733 6734 kvfree(addr); 6735 } 6736 6737 /** 6738 * alloc_netdev_mqs - allocate network device 6739 * @sizeof_priv: size of private data to allocate space for 6740 * @name: device name format string 6741 * @name_assign_type: origin of device name 6742 * @setup: callback to initialize device 6743 * @txqs: the number of TX subqueues to allocate 6744 * @rxqs: the number of RX subqueues to allocate 6745 * 6746 * Allocates a struct net_device with private data area for driver use 6747 * and performs basic initialization. Also allocates subqueue structs 6748 * for each queue on the device. 
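 *
 * Illustrative usage sketch (struct foo_priv and the queue counts are
 * hypothetical):
 *
 *	struct net_device *dev;
 *
 *	dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
 *			       NET_NAME_UNKNOWN, ether_setup, 4, 4);
 *	if (!dev)
 *		return -ENOMEM;
 *
 * Ethernet drivers usually go through wrappers such as
 * alloc_etherdev_mq() rather than calling this directly.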
6749 */ 6750 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, 6751 unsigned char name_assign_type, 6752 void (*setup)(struct net_device *), 6753 unsigned int txqs, unsigned int rxqs) 6754 { 6755 struct net_device *dev; 6756 size_t alloc_size; 6757 struct net_device *p; 6758 6759 BUG_ON(strlen(name) >= sizeof(dev->name)); 6760 6761 if (txqs < 1) { 6762 pr_err("alloc_netdev: Unable to allocate device with zero queues\n"); 6763 return NULL; 6764 } 6765 6766 #ifdef CONFIG_SYSFS 6767 if (rxqs < 1) { 6768 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); 6769 return NULL; 6770 } 6771 #endif 6772 6773 alloc_size = sizeof(struct net_device); 6774 if (sizeof_priv) { 6775 /* ensure 32-byte alignment of private area */ 6776 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); 6777 alloc_size += sizeof_priv; 6778 } 6779 /* ensure 32-byte alignment of whole construct */ 6780 alloc_size += NETDEV_ALIGN - 1; 6781 6782 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 6783 if (!p) 6784 p = vzalloc(alloc_size); 6785 if (!p) 6786 return NULL; 6787 6788 dev = PTR_ALIGN(p, NETDEV_ALIGN); 6789 dev->padded = (char *)dev - (char *)p; 6790 6791 dev->pcpu_refcnt = alloc_percpu(int); 6792 if (!dev->pcpu_refcnt) 6793 goto free_dev; 6794 6795 if (dev_addr_init(dev)) 6796 goto free_pcpu; 6797 6798 dev_mc_init(dev); 6799 dev_uc_init(dev); 6800 6801 dev_net_set(dev, &init_net); 6802 6803 dev->gso_max_size = GSO_MAX_SIZE; 6804 dev->gso_max_segs = GSO_MAX_SEGS; 6805 dev->gso_min_segs = 0; 6806 6807 INIT_LIST_HEAD(&dev->napi_list); 6808 INIT_LIST_HEAD(&dev->unreg_list); 6809 INIT_LIST_HEAD(&dev->close_list); 6810 INIT_LIST_HEAD(&dev->link_watch_list); 6811 INIT_LIST_HEAD(&dev->adj_list.upper); 6812 INIT_LIST_HEAD(&dev->adj_list.lower); 6813 INIT_LIST_HEAD(&dev->all_adj_list.upper); 6814 INIT_LIST_HEAD(&dev->all_adj_list.lower); 6815 INIT_LIST_HEAD(&dev->ptype_all); 6816 INIT_LIST_HEAD(&dev->ptype_specific); 6817 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; 6818 setup(dev); 6819 6820 dev->num_tx_queues = txqs; 6821 dev->real_num_tx_queues = txqs; 6822 if (netif_alloc_netdev_queues(dev)) 6823 goto free_all; 6824 6825 #ifdef CONFIG_SYSFS 6826 dev->num_rx_queues = rxqs; 6827 dev->real_num_rx_queues = rxqs; 6828 if (netif_alloc_rx_queues(dev)) 6829 goto free_all; 6830 #endif 6831 6832 strcpy(dev->name, name); 6833 dev->name_assign_type = name_assign_type; 6834 dev->group = INIT_NETDEV_GROUP; 6835 if (!dev->ethtool_ops) 6836 dev->ethtool_ops = &default_ethtool_ops; 6837 return dev; 6838 6839 free_all: 6840 free_netdev(dev); 6841 return NULL; 6842 6843 free_pcpu: 6844 free_percpu(dev->pcpu_refcnt); 6845 free_dev: 6846 netdev_freemem(dev); 6847 return NULL; 6848 } 6849 EXPORT_SYMBOL(alloc_netdev_mqs); 6850 6851 /** 6852 * free_netdev - free network device 6853 * @dev: device 6854 * 6855 * This function does the last stage of destroying an allocated device 6856 * interface. The reference to the device object is released. 6857 * If this is the last reference then it will be freed. 
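 *
 * The usual teardown order, sketched for illustration with error
 * handling omitted:
 *
 *	unregister_netdev(dev);
 *	free_netdev(dev);
 *
 * On a probe error path where registration never succeeded,
 * free_netdev() alone is the normal cleanup.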
*/ 6859 void free_netdev(struct net_device *dev) 6860 { 6861 struct napi_struct *p, *n; 6862 6863 netif_free_tx_queues(dev); 6864 #ifdef CONFIG_SYSFS 6865 kvfree(dev->_rx); 6866 #endif 6867 6868 kfree(rcu_dereference_protected(dev->ingress_queue, 1)); 6869 6870 /* Flush device addresses */ 6871 dev_addr_flush(dev); 6872 6873 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) 6874 netif_napi_del(p); 6875 6876 free_percpu(dev->pcpu_refcnt); 6877 dev->pcpu_refcnt = NULL; 6878 6879 /* Compatibility with error handling in drivers */ 6880 if (dev->reg_state == NETREG_UNINITIALIZED) { 6881 netdev_freemem(dev); 6882 return; 6883 } 6884 6885 BUG_ON(dev->reg_state != NETREG_UNREGISTERED); 6886 dev->reg_state = NETREG_RELEASED; 6887 6888 /* will free via device release */ 6889 put_device(&dev->dev); 6890 } 6891 EXPORT_SYMBOL(free_netdev); 6892 6893 /** 6894 * synchronize_net - Synchronize with packet receive processing 6895 * 6896 * Wait for packets currently being received to be done. 6897 * Does not block later packets from starting. 6898 */ 6899 void synchronize_net(void) 6900 { 6901 might_sleep(); 6902 if (rtnl_is_locked()) 6903 synchronize_rcu_expedited(); 6904 else 6905 synchronize_rcu(); 6906 } 6907 EXPORT_SYMBOL(synchronize_net); 6908 6909 /** 6910 * unregister_netdevice_queue - remove device from the kernel 6911 * @dev: device 6912 * @head: list 6913 * 6914 * This function shuts down a device interface and removes it 6915 * from the kernel tables. 6916 * If @head is not NULL, the device is queued to be unregistered later. 6917 * 6918 * Callers must hold the rtnl semaphore. You may want 6919 * unregister_netdev() instead of this. 6920 */ 6921 6922 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) 6923 { 6924 ASSERT_RTNL(); 6925 6926 if (head) { 6927 list_move_tail(&dev->unreg_list, head); 6928 } else { 6929 rollback_registered(dev); 6930 /* Finish processing unregister after unlock */ 6931 net_set_todo(dev); 6932 } 6933 } 6934 EXPORT_SYMBOL(unregister_netdevice_queue); 6935 6936 /** 6937 * unregister_netdevice_many - unregister many devices 6938 * @head: list of devices 6939 * 6940 * Note: As most callers use a stack-allocated list_head, 6941 * we force a list_del() to make sure the stack won't be corrupted later. 6942 */ 6943 void unregister_netdevice_many(struct list_head *head) 6944 { 6945 struct net_device *dev; 6946 6947 if (!list_empty(head)) { 6948 rollback_registered_many(head); 6949 list_for_each_entry(dev, head, unreg_list) 6950 net_set_todo(dev); 6951 list_del(head); 6952 } 6953 } 6954 EXPORT_SYMBOL(unregister_netdevice_many); 6955 6956 /** 6957 * unregister_netdev - remove device from the kernel 6958 * @dev: device 6959 * 6960 * This function shuts down a device interface and removes it 6961 * from the kernel tables. 6962 * 6963 * This is just a wrapper for unregister_netdevice that takes 6964 * the rtnl semaphore. In general you want to use this and not 6965 * unregister_netdevice. 6966 */ 6967 void unregister_netdev(struct net_device *dev) 6968 { 6969 rtnl_lock(); 6970 unregister_netdevice(dev); 6971 rtnl_unlock(); 6972 } 6973 EXPORT_SYMBOL(unregister_netdev); 6974 6975 /** 6976 * dev_change_net_namespace - move device to a different network namespace 6977 * @dev: device 6978 * @net: network namespace 6979 * @pat: If not NULL, name pattern to try if the current device name 6980 * is already taken in the destination network namespace. 6981 * 6982 * This function shuts down a device interface and moves it 6983 * to a new network namespace.
On success 0 is returned, on 6984 * a failure a negative errno code is returned. 6985 * 6986 * Callers must hold the rtnl semaphore. 6987 */ 6988 6989 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) 6990 { 6991 int err; 6992 6993 ASSERT_RTNL(); 6994 6995 /* Don't allow namespace-local devices to be moved. */ 6996 err = -EINVAL; 6997 if (dev->features & NETIF_F_NETNS_LOCAL) 6998 goto out; 6999 7000 /* Ensure the device has been registered */ 7001 if (dev->reg_state != NETREG_REGISTERED) 7002 goto out; 7003 7004 /* Get out if there is nothing to do */ 7005 err = 0; 7006 if (net_eq(dev_net(dev), net)) 7007 goto out; 7008 7009 /* Pick the destination device name, and ensure 7010 * we can use it in the destination network namespace. 7011 */ 7012 err = -EEXIST; 7013 if (__dev_get_by_name(net, dev->name)) { 7014 /* We get here if we can't use the current device name */ 7015 if (!pat) 7016 goto out; 7017 if (dev_get_valid_name(net, dev, pat) < 0) 7018 goto out; 7019 } 7020 7021 /* 7022 * And now a mini version of register_netdevice and unregister_netdevice. 7023 */ 7024 7025 /* If device is running, close it first. */ 7026 dev_close(dev); 7027 7028 /* And unlink it from device chain */ 7029 err = -ENODEV; 7030 unlist_netdevice(dev); 7031 7032 synchronize_net(); 7033 7034 /* Shutdown queueing discipline. */ 7035 dev_shutdown(dev); 7036 7037 /* Notify protocols that we are about to destroy 7038 this device. They should clean up all of their state. 7039 7040 Note that dev->reg_state stays at NETREG_REGISTERED. 7041 This is wanted because this way 8021q and macvlan know 7042 the device is just moving and can keep their slaves up. 7043 */ 7044 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 7045 rcu_barrier(); 7046 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 7047 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); 7048 7049 /* 7050 * Flush the unicast and multicast chains 7051 */ 7052 dev_uc_flush(dev); 7053 dev_mc_flush(dev); 7054 7055 /* Send a netdev-removed uevent to the old namespace */ 7056 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); 7057 netdev_adjacent_del_links(dev); 7058 7059 /* Actually switch the network namespace */ 7060 dev_net_set(dev, net); 7061 7062 /* If there is an ifindex conflict assign a new one */ 7063 if (__dev_get_by_index(net, dev->ifindex)) { 7064 int iflink = (dev->iflink == dev->ifindex); 7065 dev->ifindex = dev_new_index(net); 7066 if (iflink) 7067 dev->iflink = dev->ifindex; 7068 } 7069 7070 /* Send a netdev-add uevent to the new namespace */ 7071 kobject_uevent(&dev->dev.kobj, KOBJ_ADD); 7072 netdev_adjacent_add_links(dev); 7073 7074 /* Fixup kobjects */ 7075 err = device_rename(&dev->dev, dev->name); 7076 WARN_ON(err); 7077 7078 /* Add the device back in the hashes */ 7079 list_netdevice(dev); 7080 7081 /* Notify protocols that a new device appeared. */ 7082 call_netdevice_notifiers(NETDEV_REGISTER, dev); 7083 7084 /* 7085 * Prevent userspace races by waiting until the network 7086 * device is fully setup before sending notifications.
7087 */ 7088 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); 7089 7090 synchronize_net(); 7091 err = 0; 7092 out: 7093 return err; 7094 } 7095 EXPORT_SYMBOL_GPL(dev_change_net_namespace); 7096 7097 static int dev_cpu_callback(struct notifier_block *nfb, 7098 unsigned long action, 7099 void *ocpu) 7100 { 7101 struct sk_buff **list_skb; 7102 struct sk_buff *skb; 7103 unsigned int cpu, oldcpu = (unsigned long)ocpu; 7104 struct softnet_data *sd, *oldsd; 7105 7106 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 7107 return NOTIFY_OK; 7108 7109 local_irq_disable(); 7110 cpu = smp_processor_id(); 7111 sd = &per_cpu(softnet_data, cpu); 7112 oldsd = &per_cpu(softnet_data, oldcpu); 7113 7114 /* Find end of our completion_queue. */ 7115 list_skb = &sd->completion_queue; 7116 while (*list_skb) 7117 list_skb = &(*list_skb)->next; 7118 /* Append completion queue from offline CPU. */ 7119 *list_skb = oldsd->completion_queue; 7120 oldsd->completion_queue = NULL; 7121 7122 /* Append output queue from offline CPU. */ 7123 if (oldsd->output_queue) { 7124 *sd->output_queue_tailp = oldsd->output_queue; 7125 sd->output_queue_tailp = oldsd->output_queue_tailp; 7126 oldsd->output_queue = NULL; 7127 oldsd->output_queue_tailp = &oldsd->output_queue; 7128 } 7129 /* Append NAPI poll list from offline CPU, with one exception : 7130 * process_backlog() must be called by cpu owning percpu backlog. 7131 * We properly handle process_queue & input_pkt_queue later. 7132 */ 7133 while (!list_empty(&oldsd->poll_list)) { 7134 struct napi_struct *napi = list_first_entry(&oldsd->poll_list, 7135 struct napi_struct, 7136 poll_list); 7137 7138 list_del_init(&napi->poll_list); 7139 if (napi->poll == process_backlog) 7140 napi->state = 0; 7141 else 7142 ____napi_schedule(sd, napi); 7143 } 7144 7145 raise_softirq_irqoff(NET_TX_SOFTIRQ); 7146 local_irq_enable(); 7147 7148 /* Process offline CPU's input_pkt_queue */ 7149 while ((skb = __skb_dequeue(&oldsd->process_queue))) { 7150 netif_rx_ni(skb); 7151 input_queue_head_incr(oldsd); 7152 } 7153 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { 7154 netif_rx_ni(skb); 7155 input_queue_head_incr(oldsd); 7156 } 7157 7158 return NOTIFY_OK; 7159 } 7160 7161 7162 /** 7163 * netdev_increment_features - increment feature set by one 7164 * @all: current feature set 7165 * @one: new feature set 7166 * @mask: mask feature set 7167 * 7168 * Computes a new feature set after adding a device with feature set 7169 * @one to the master device with current feature set @all. Will not 7170 * enable anything that is off in @mask. Returns the new feature set. 7171 */ 7172 netdev_features_t netdev_increment_features(netdev_features_t all, 7173 netdev_features_t one, netdev_features_t mask) 7174 { 7175 if (mask & NETIF_F_GEN_CSUM) 7176 mask |= NETIF_F_ALL_CSUM; 7177 mask |= NETIF_F_VLAN_CHALLENGED; 7178 7179 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask; 7180 all &= one | ~NETIF_F_ALL_FOR_ALL; 7181 7182 /* If one device supports hw checksumming, set for all. 
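 * For example, if the merged set ends up with both NETIF_F_HW_CSUM and
 * NETIF_F_IP_CSUM, the statement below drops the protocol-specific
 * checksum bits and keeps only the generic hardware checksum, which
 * already covers them.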
*/ 7183 if (all & NETIF_F_GEN_CSUM) 7184 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); 7185 7186 return all; 7187 } 7188 EXPORT_SYMBOL(netdev_increment_features); 7189 7190 static struct hlist_head * __net_init netdev_create_hash(void) 7191 { 7192 int i; 7193 struct hlist_head *hash; 7194 7195 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); 7196 if (hash != NULL) 7197 for (i = 0; i < NETDEV_HASHENTRIES; i++) 7198 INIT_HLIST_HEAD(&hash[i]); 7199 7200 return hash; 7201 } 7202 7203 /* Initialize per network namespace state */ 7204 static int __net_init netdev_init(struct net *net) 7205 { 7206 if (net != &init_net) 7207 INIT_LIST_HEAD(&net->dev_base_head); 7208 7209 net->dev_name_head = netdev_create_hash(); 7210 if (net->dev_name_head == NULL) 7211 goto err_name; 7212 7213 net->dev_index_head = netdev_create_hash(); 7214 if (net->dev_index_head == NULL) 7215 goto err_idx; 7216 7217 return 0; 7218 7219 err_idx: 7220 kfree(net->dev_name_head); 7221 err_name: 7222 return -ENOMEM; 7223 } 7224 7225 /** 7226 * netdev_drivername - network driver for the device 7227 * @dev: network device 7228 * 7229 * Determine network driver for device. 7230 */ 7231 const char *netdev_drivername(const struct net_device *dev) 7232 { 7233 const struct device_driver *driver; 7234 const struct device *parent; 7235 const char *empty = ""; 7236 7237 parent = dev->dev.parent; 7238 if (!parent) 7239 return empty; 7240 7241 driver = parent->driver; 7242 if (driver && driver->name) 7243 return driver->name; 7244 return empty; 7245 } 7246 7247 static void __netdev_printk(const char *level, const struct net_device *dev, 7248 struct va_format *vaf) 7249 { 7250 if (dev && dev->dev.parent) { 7251 dev_printk_emit(level[1] - '0', 7252 dev->dev.parent, 7253 "%s %s %s%s: %pV", 7254 dev_driver_string(dev->dev.parent), 7255 dev_name(dev->dev.parent), 7256 netdev_name(dev), netdev_reg_state(dev), 7257 vaf); 7258 } else if (dev) { 7259 printk("%s%s%s: %pV", 7260 level, netdev_name(dev), netdev_reg_state(dev), vaf); 7261 } else { 7262 printk("%s(NULL net_device): %pV", level, vaf); 7263 } 7264 } 7265 7266 void netdev_printk(const char *level, const struct net_device *dev, 7267 const char *format, ...) 7268 { 7269 struct va_format vaf; 7270 va_list args; 7271 7272 va_start(args, format); 7273 7274 vaf.fmt = format; 7275 vaf.va = &args; 7276 7277 __netdev_printk(level, dev, &vaf); 7278 7279 va_end(args); 7280 } 7281 EXPORT_SYMBOL(netdev_printk); 7282 7283 #define define_netdev_printk_level(func, level) \ 7284 void func(const struct net_device *dev, const char *fmt, ...) 
\ 7285 { \ 7286 struct va_format vaf; \ 7287 va_list args; \ 7288 \ 7289 va_start(args, fmt); \ 7290 \ 7291 vaf.fmt = fmt; \ 7292 vaf.va = &args; \ 7293 \ 7294 __netdev_printk(level, dev, &vaf); \ 7295 \ 7296 va_end(args); \ 7297 } \ 7298 EXPORT_SYMBOL(func); 7299 7300 define_netdev_printk_level(netdev_emerg, KERN_EMERG); 7301 define_netdev_printk_level(netdev_alert, KERN_ALERT); 7302 define_netdev_printk_level(netdev_crit, KERN_CRIT); 7303 define_netdev_printk_level(netdev_err, KERN_ERR); 7304 define_netdev_printk_level(netdev_warn, KERN_WARNING); 7305 define_netdev_printk_level(netdev_notice, KERN_NOTICE); 7306 define_netdev_printk_level(netdev_info, KERN_INFO); 7307 7308 static void __net_exit netdev_exit(struct net *net) 7309 { 7310 kfree(net->dev_name_head); 7311 kfree(net->dev_index_head); 7312 } 7313 7314 static struct pernet_operations __net_initdata netdev_net_ops = { 7315 .init = netdev_init, 7316 .exit = netdev_exit, 7317 }; 7318 7319 static void __net_exit default_device_exit(struct net *net) 7320 { 7321 struct net_device *dev, *aux; 7322 /* 7323 * Push all migratable network devices back to the 7324 * initial network namespace 7325 */ 7326 rtnl_lock(); 7327 for_each_netdev_safe(net, dev, aux) { 7328 int err; 7329 char fb_name[IFNAMSIZ]; 7330 7331 /* Ignore unmoveable devices (e.g. loopback) */ 7332 if (dev->features & NETIF_F_NETNS_LOCAL) 7333 continue; 7334 7335 /* Leave virtual devices for the generic cleanup */ 7336 if (dev->rtnl_link_ops) 7337 continue; 7338 7339 /* Push remaining network devices to init_net */ 7340 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); 7341 err = dev_change_net_namespace(dev, &init_net, fb_name); 7342 if (err) { 7343 pr_emerg("%s: failed to move %s to init_net: %d\n", 7344 __func__, dev->name, err); 7345 BUG(); 7346 } 7347 } 7348 rtnl_unlock(); 7349 } 7350 7351 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) 7352 { 7353 /* Return with the rtnl_lock held when there are no network 7354 * devices unregistering in any network namespace in net_list. 7355 */ 7356 struct net *net; 7357 bool unregistering; 7358 DEFINE_WAIT_FUNC(wait, woken_wake_function); 7359 7360 add_wait_queue(&netdev_unregistering_wq, &wait); 7361 for (;;) { 7362 unregistering = false; 7363 rtnl_lock(); 7364 list_for_each_entry(net, net_list, exit_list) { 7365 if (net->dev_unreg_count > 0) { 7366 unregistering = true; 7367 break; 7368 } 7369 } 7370 if (!unregistering) 7371 break; 7372 __rtnl_unlock(); 7373 7374 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 7375 } 7376 remove_wait_queue(&netdev_unregistering_wq, &wait); 7377 } 7378 7379 static void __net_exit default_device_exit_batch(struct list_head *net_list) 7380 { 7381 /* At exit, all network devices must be removed from a network 7382 * namespace. Do this in the reverse order of registration. 7383 * Do this across as many network namespaces as possible to 7384 * improve batching efficiency. 7385 */ 7386 struct net_device *dev; 7387 struct net *net; 7388 LIST_HEAD(dev_kill_list); 7389 7390 /* To prevent network device cleanup code from dereferencing 7391 * loopback devices or network devices that have been freed, 7392 * wait here for all pending unregistrations to complete 7393 * before unregistering the loopback device and allowing the 7394 * network namespace to be freed. 7395 * 7396 * The netdev todo list, containing all network device 7397 * unregistrations that happen in default_device_exit_batch, 7398 * will run in the rtnl_unlock() at the end of 7399 * default_device_exit_batch.
*/ 7400 rtnl_lock_unregistering(net_list); 7401 7402 list_for_each_entry(net, net_list, exit_list) { 7403 for_each_netdev_reverse(net, dev) { 7404 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) 7405 dev->rtnl_link_ops->dellink(dev, &dev_kill_list); 7406 else 7407 unregister_netdevice_queue(dev, &dev_kill_list); 7408 } 7409 } 7410 unregister_netdevice_many(&dev_kill_list); 7411 rtnl_unlock(); 7412 } 7413 7414 static struct pernet_operations __net_initdata default_device_ops = { 7415 .exit = default_device_exit, 7416 .exit_batch = default_device_exit_batch, 7417 }; 7418 7419 /* 7420 * Initialize the DEV module. At boot time this walks the device list and 7421 * unhooks any devices that fail to initialise (normally hardware not 7422 * present) and leaves us with a valid list of present and active devices. 7423 * 7424 */ 7425 7426 /* 7427 * This is called single-threaded during boot, so no need 7428 * to take the rtnl semaphore. 7429 */ 7430 static int __init net_dev_init(void) 7431 { 7432 int i, rc = -ENOMEM; 7433 7434 BUG_ON(!dev_boot_phase); 7435 7436 if (dev_proc_init()) 7437 goto out; 7438 7439 if (netdev_kobject_init()) 7440 goto out; 7441 7442 INIT_LIST_HEAD(&ptype_all); 7443 for (i = 0; i < PTYPE_HASH_SIZE; i++) 7444 INIT_LIST_HEAD(&ptype_base[i]); 7445 7446 INIT_LIST_HEAD(&offload_base); 7447 7448 if (register_pernet_subsys(&netdev_net_ops)) 7449 goto out; 7450 7451 /* 7452 * Initialise the packet receive queues. 7453 */ 7454 7455 for_each_possible_cpu(i) { 7456 struct softnet_data *sd = &per_cpu(softnet_data, i); 7457 7458 skb_queue_head_init(&sd->input_pkt_queue); 7459 skb_queue_head_init(&sd->process_queue); 7460 INIT_LIST_HEAD(&sd->poll_list); 7461 sd->output_queue_tailp = &sd->output_queue; 7462 #ifdef CONFIG_RPS 7463 sd->csd.func = rps_trigger_softirq; 7464 sd->csd.info = sd; 7465 sd->cpu = i; 7466 #endif 7467 7468 sd->backlog.poll = process_backlog; 7469 sd->backlog.weight = weight_p; 7470 } 7471 7472 dev_boot_phase = 0; 7473 7474 /* The loopback device is special: if any other network device 7475 * is present in a network namespace, the loopback device must 7476 * be present too. Since we now dynamically allocate and free the 7477 * loopback device, ensure this invariant is maintained by 7478 * keeping the loopback device as the first device on the 7479 * list of network devices, ensuring the loopback device 7480 * is the first device that appears and the last network device 7481 * that disappears. 7482 */ 7483 if (register_pernet_device(&loopback_net_ops)) 7484 goto out; 7485 7486 if (register_pernet_device(&default_device_ops)) 7487 goto out; 7488 7489 open_softirq(NET_TX_SOFTIRQ, net_tx_action); 7490 open_softirq(NET_RX_SOFTIRQ, net_rx_action); 7491 7492 hotcpu_notifier(dev_cpu_callback, 0); 7493 dst_init(); 7494 rc = 0; 7495 out: 7496 return rc; 7497 } 7498 7499 subsys_initcall(net_dev_init); 7500