1 /* 2 * NET3 Protocol independent device support routines. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public License 6 * as published by the Free Software Foundation; either version 7 * 2 of the License, or (at your option) any later version. 8 * 9 * Derived from the non IP parts of dev.c 1.0.19 10 * Authors: Ross Biro 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * 14 * Additional Authors: 15 * Florian la Roche <rzsfl@rz.uni-sb.de> 16 * Alan Cox <gw4pts@gw4pts.ampr.org> 17 * David Hinds <dahinds@users.sourceforge.net> 18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> 19 * Adam Sulmicki <adam@cfar.umd.edu> 20 * Pekka Riikonen <priikone@poesidon.pspt.fi> 21 * 22 * Changes: 23 * D.J. Barrow : Fixed bug where dev->refcnt gets set 24 * to 2 if register_netdev gets called 25 * before net_dev_init & also removed a 26 * few lines of code in the process. 27 * Alan Cox : device private ioctl copies fields back. 28 * Alan Cox : Transmit queue code does relevant 29 * stunts to keep the queue safe. 30 * Alan Cox : Fixed double lock. 31 * Alan Cox : Fixed promisc NULL pointer trap 32 * ???????? : Support the full private ioctl range 33 * Alan Cox : Moved ioctl permission check into 34 * drivers 35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI 36 * Alan Cox : 100 backlog just doesn't cut it when 37 * you start doing multicast video 8) 38 * Alan Cox : Rewrote net_bh and list manager. 39 * Alan Cox : Fix ETH_P_ALL echoback lengths. 40 * Alan Cox : Took out transmit every packet pass 41 * Saved a few bytes in the ioctl handler 42 * Alan Cox : Network driver sets packet type before 43 * calling netif_rx. Saves a function 44 * call a packet. 45 * Alan Cox : Hashed net_bh() 46 * Richard Kooijman: Timestamp fixes. 47 * Alan Cox : Wrong field in SIOCGIFDSTADDR 48 * Alan Cox : Device lock protection. 49 * Alan Cox : Fixed nasty side effect of device close 50 * changes. 51 * Rudi Cilibrasi : Pass the right thing to 52 * set_mac_address() 53 * Dave Miller : 32bit quantity for the device lock to 54 * make it work out on a Sparc. 55 * Bjorn Ekwall : Added KERNELD hack. 56 * Alan Cox : Cleaned up the backlog initialise. 57 * Craig Metz : SIOCGIFCONF fix if space for under 58 * 1 device. 59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there 60 * is no device open function. 61 * Andi Kleen : Fix error reporting for SIOCGIFCONF 62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF 63 * Cyrus Durgin : Cleaned for KMOD 64 * Adam Sulmicki : Bug Fix : Network Device Unload 65 * A network device unload needs to purge 66 * the backlog queue. 
67 * Paul Rusty Russell : SIOCSIFNAME 68 * Pekka Riikonen : Netdev boot-time settings code 69 * Andrew Morton : Make unregister_netdevice wait 70 * indefinitely on dev->refcnt 71 * J Hadi Salim : - Backlog queue sampling 72 * - netif_rx() feedback 73 */ 74 75 #include <asm/uaccess.h> 76 #include <linux/bitops.h> 77 #include <linux/capability.h> 78 #include <linux/cpu.h> 79 #include <linux/types.h> 80 #include <linux/kernel.h> 81 #include <linux/hash.h> 82 #include <linux/slab.h> 83 #include <linux/sched.h> 84 #include <linux/mutex.h> 85 #include <linux/string.h> 86 #include <linux/mm.h> 87 #include <linux/socket.h> 88 #include <linux/sockios.h> 89 #include <linux/errno.h> 90 #include <linux/interrupt.h> 91 #include <linux/if_ether.h> 92 #include <linux/netdevice.h> 93 #include <linux/etherdevice.h> 94 #include <linux/ethtool.h> 95 #include <linux/notifier.h> 96 #include <linux/skbuff.h> 97 #include <net/net_namespace.h> 98 #include <net/sock.h> 99 #include <linux/rtnetlink.h> 100 #include <linux/stat.h> 101 #include <net/dst.h> 102 #include <net/pkt_sched.h> 103 #include <net/checksum.h> 104 #include <net/xfrm.h> 105 #include <linux/highmem.h> 106 #include <linux/init.h> 107 #include <linux/module.h> 108 #include <linux/netpoll.h> 109 #include <linux/rcupdate.h> 110 #include <linux/delay.h> 111 #include <net/iw_handler.h> 112 #include <asm/current.h> 113 #include <linux/audit.h> 114 #include <linux/dmaengine.h> 115 #include <linux/err.h> 116 #include <linux/ctype.h> 117 #include <linux/if_arp.h> 118 #include <linux/if_vlan.h> 119 #include <linux/ip.h> 120 #include <net/ip.h> 121 #include <net/mpls.h> 122 #include <linux/ipv6.h> 123 #include <linux/in.h> 124 #include <linux/jhash.h> 125 #include <linux/random.h> 126 #include <trace/events/napi.h> 127 #include <trace/events/net.h> 128 #include <trace/events/skb.h> 129 #include <linux/pci.h> 130 #include <linux/inetdevice.h> 131 #include <linux/cpu_rmap.h> 132 #include <linux/static_key.h> 133 #include <linux/hashtable.h> 134 #include <linux/vmalloc.h> 135 #include <linux/if_macvlan.h> 136 #include <linux/errqueue.h> 137 #include <linux/hrtimer.h> 138 139 #include "net-sysfs.h" 140 141 /* Instead of increasing this, you should create a hash table. */ 142 #define MAX_GRO_SKBS 8 143 144 /* This should be increased if a protocol with a bigger head is added. */ 145 #define GRO_MAX_HEAD (MAX_HEADER + 128) 146 147 static DEFINE_SPINLOCK(ptype_lock); 148 static DEFINE_SPINLOCK(offload_lock); 149 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; 150 struct list_head ptype_all __read_mostly; /* Taps */ 151 static struct list_head offload_base __read_mostly; 152 153 static int netif_rx_internal(struct sk_buff *skb); 154 static int call_netdevice_notifiers_info(unsigned long val, 155 struct net_device *dev, 156 struct netdev_notifier_info *info); 157 158 /* 159 * The @dev_base_head list is protected by @dev_base_lock and the rtnl 160 * semaphore. 161 * 162 * Pure readers hold dev_base_lock for reading, or rcu_read_lock() 163 * 164 * Writers must hold the rtnl semaphore while they loop through the 165 * dev_base_head list, and hold dev_base_lock for writing when they do the 166 * actual updates. This allows pure readers to access the list even 167 * while a writer is preparing to update it. 168 * 169 * To put it another way, dev_base_lock is held for writing only to 170 * protect against pure readers; the rtnl semaphore provides the 171 * protection against other writers. 
172 * 173 * See, for example usages, register_netdevice() and 174 * unregister_netdevice(), which must be called with the rtnl 175 * semaphore held. 176 */ 177 DEFINE_RWLOCK(dev_base_lock); 178 EXPORT_SYMBOL(dev_base_lock); 179 180 /* protects napi_hash addition/deletion and napi_gen_id */ 181 static DEFINE_SPINLOCK(napi_hash_lock); 182 183 static unsigned int napi_gen_id; 184 static DEFINE_HASHTABLE(napi_hash, 8); 185 186 static seqcount_t devnet_rename_seq; 187 188 static inline void dev_base_seq_inc(struct net *net) 189 { 190 while (++net->dev_base_seq == 0); 191 } 192 193 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) 194 { 195 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); 196 197 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)]; 198 } 199 200 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) 201 { 202 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; 203 } 204 205 static inline void rps_lock(struct softnet_data *sd) 206 { 207 #ifdef CONFIG_RPS 208 spin_lock(&sd->input_pkt_queue.lock); 209 #endif 210 } 211 212 static inline void rps_unlock(struct softnet_data *sd) 213 { 214 #ifdef CONFIG_RPS 215 spin_unlock(&sd->input_pkt_queue.lock); 216 #endif 217 } 218 219 /* Device list insertion */ 220 static void list_netdevice(struct net_device *dev) 221 { 222 struct net *net = dev_net(dev); 223 224 ASSERT_RTNL(); 225 226 write_lock_bh(&dev_base_lock); 227 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); 228 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); 229 hlist_add_head_rcu(&dev->index_hlist, 230 dev_index_hash(net, dev->ifindex)); 231 write_unlock_bh(&dev_base_lock); 232 233 dev_base_seq_inc(net); 234 } 235 236 /* Device list removal 237 * caller must respect a RCU grace period before freeing/reusing dev 238 */ 239 static void unlist_netdevice(struct net_device *dev) 240 { 241 ASSERT_RTNL(); 242 243 /* Unlink dev from the device chain */ 244 write_lock_bh(&dev_base_lock); 245 list_del_rcu(&dev->dev_list); 246 hlist_del_rcu(&dev->name_hlist); 247 hlist_del_rcu(&dev->index_hlist); 248 write_unlock_bh(&dev_base_lock); 249 250 dev_base_seq_inc(dev_net(dev)); 251 } 252 253 /* 254 * Our notifier list 255 */ 256 257 static RAW_NOTIFIER_HEAD(netdev_chain); 258 259 /* 260 * Device drivers call our routines to queue packets here. We empty the 261 * queue in the local softnet handler. 
262 */ 263 264 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); 265 EXPORT_PER_CPU_SYMBOL(softnet_data); 266 267 #ifdef CONFIG_LOCKDEP 268 /* 269 * register_netdevice() inits txq->_xmit_lock and sets lockdep class 270 * according to dev->type 271 */ 272 static const unsigned short netdev_lock_type[] = 273 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, 274 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, 275 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, 276 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, 277 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, 278 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, 279 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, 280 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, 281 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, 282 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, 283 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, 284 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, 285 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, 286 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, 287 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; 288 289 static const char *const netdev_lock_name[] = 290 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", 291 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", 292 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", 293 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", 294 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", 295 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", 296 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", 297 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", 298 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", 299 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", 300 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", 301 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", 302 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", 303 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", 304 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; 305 306 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; 307 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; 308 309 static inline unsigned short netdev_lock_pos(unsigned short dev_type) 310 { 311 int i; 312 313 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) 314 if (netdev_lock_type[i] == dev_type) 315 return i; 316 /* the last key is used by default */ 317 return ARRAY_SIZE(netdev_lock_type) - 1; 318 } 319 320 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, 321 unsigned short dev_type) 322 { 323 int i; 324 325 i = netdev_lock_pos(dev_type); 326 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], 327 netdev_lock_name[i]); 328 } 329 330 static inline void netdev_set_addr_lockdep_class(struct net_device *dev) 331 { 332 int i; 333 334 i = netdev_lock_pos(dev->type); 335 lockdep_set_class_and_name(&dev->addr_list_lock, 336 &netdev_addr_lock_key[i], 337 netdev_lock_name[i]); 338 } 339 #else 340 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, 341 unsigned short dev_type) 342 { 343 } 344 static inline void netdev_set_addr_lockdep_class(struct net_device *dev) 345 { 346 } 347 #endif 348 349 /******************************************************************************* 350 351 
Protocol management and registration routines 352 353 *******************************************************************************/ 354 355 /* 356 * Add a protocol ID to the list. Now that the input handler is 357 * smarter we can dispense with all the messy stuff that used to be 358 * here. 359 * 360 * BEWARE!!! Protocol handlers, mangling input packets, 361 * MUST BE last in hash buckets and checking protocol handlers 362 * MUST start from promiscuous ptype_all chain in net_bh. 363 * It is true now, do not change it. 364 * Explanation follows: if protocol handler, mangling packet, will 365 * be the first on list, it is not able to sense, that packet 366 * is cloned and should be copied-on-write, so that it will 367 * change it and subsequent readers will get broken packet. 368 * --ANK (980803) 369 */ 370 371 static inline struct list_head *ptype_head(const struct packet_type *pt) 372 { 373 if (pt->type == htons(ETH_P_ALL)) 374 return pt->dev ? &pt->dev->ptype_all : &ptype_all; 375 else 376 return pt->dev ? &pt->dev->ptype_specific : 377 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; 378 } 379 380 /** 381 * dev_add_pack - add packet handler 382 * @pt: packet type declaration 383 * 384 * Add a protocol handler to the networking stack. The passed &packet_type 385 * is linked into kernel lists and may not be freed until it has been 386 * removed from the kernel lists. 387 * 388 * This call does not sleep therefore it can not 389 * guarantee all CPU's that are in middle of receiving packets 390 * will see the new packet type (until the next received packet). 391 */ 392 393 void dev_add_pack(struct packet_type *pt) 394 { 395 struct list_head *head = ptype_head(pt); 396 397 spin_lock(&ptype_lock); 398 list_add_rcu(&pt->list, head); 399 spin_unlock(&ptype_lock); 400 } 401 EXPORT_SYMBOL(dev_add_pack); 402 403 /** 404 * __dev_remove_pack - remove packet handler 405 * @pt: packet type declaration 406 * 407 * Remove a protocol handler that was previously added to the kernel 408 * protocol handlers by dev_add_pack(). The passed &packet_type is removed 409 * from the kernel lists and can be freed or reused once this function 410 * returns. 411 * 412 * The packet type might still be in use by receivers 413 * and must not be freed until after all the CPU's have gone 414 * through a quiescent state. 415 */ 416 void __dev_remove_pack(struct packet_type *pt) 417 { 418 struct list_head *head = ptype_head(pt); 419 struct packet_type *pt1; 420 421 spin_lock(&ptype_lock); 422 423 list_for_each_entry(pt1, head, list) { 424 if (pt == pt1) { 425 list_del_rcu(&pt->list); 426 goto out; 427 } 428 } 429 430 pr_warn("dev_remove_pack: %p not found\n", pt); 431 out: 432 spin_unlock(&ptype_lock); 433 } 434 EXPORT_SYMBOL(__dev_remove_pack); 435 436 /** 437 * dev_remove_pack - remove packet handler 438 * @pt: packet type declaration 439 * 440 * Remove a protocol handler that was previously added to the kernel 441 * protocol handlers by dev_add_pack(). The passed &packet_type is removed 442 * from the kernel lists and can be freed or reused once this function 443 * returns. 444 * 445 * This call sleeps to guarantee that no CPU is looking at the packet 446 * type after return. 447 */ 448 void dev_remove_pack(struct packet_type *pt) 449 { 450 __dev_remove_pack(pt); 451 452 synchronize_net(); 453 } 454 EXPORT_SYMBOL(dev_remove_pack); 455 456 457 /** 458 * dev_add_offload - register offload handlers 459 * @po: protocol offload declaration 460 * 461 * Add protocol offload handlers to the networking stack. 
The passed 462 * &proto_offload is linked into kernel lists and may not be freed until 463 * it has been removed from the kernel lists. 464 * 465 * This call does not sleep therefore it can not 466 * guarantee all CPU's that are in middle of receiving packets 467 * will see the new offload handlers (until the next received packet). 468 */ 469 void dev_add_offload(struct packet_offload *po) 470 { 471 struct list_head *head = &offload_base; 472 473 spin_lock(&offload_lock); 474 list_add_rcu(&po->list, head); 475 spin_unlock(&offload_lock); 476 } 477 EXPORT_SYMBOL(dev_add_offload); 478 479 /** 480 * __dev_remove_offload - remove offload handler 481 * @po: packet offload declaration 482 * 483 * Remove a protocol offload handler that was previously added to the 484 * kernel offload handlers by dev_add_offload(). The passed &offload_type 485 * is removed from the kernel lists and can be freed or reused once this 486 * function returns. 487 * 488 * The packet type might still be in use by receivers 489 * and must not be freed until after all the CPU's have gone 490 * through a quiescent state. 491 */ 492 static void __dev_remove_offload(struct packet_offload *po) 493 { 494 struct list_head *head = &offload_base; 495 struct packet_offload *po1; 496 497 spin_lock(&offload_lock); 498 499 list_for_each_entry(po1, head, list) { 500 if (po == po1) { 501 list_del_rcu(&po->list); 502 goto out; 503 } 504 } 505 506 pr_warn("dev_remove_offload: %p not found\n", po); 507 out: 508 spin_unlock(&offload_lock); 509 } 510 511 /** 512 * dev_remove_offload - remove packet offload handler 513 * @po: packet offload declaration 514 * 515 * Remove a packet offload handler that was previously added to the kernel 516 * offload handlers by dev_add_offload(). The passed &offload_type is 517 * removed from the kernel lists and can be freed or reused once this 518 * function returns. 519 * 520 * This call sleeps to guarantee that no CPU is looking at the packet 521 * type after return. 522 */ 523 void dev_remove_offload(struct packet_offload *po) 524 { 525 __dev_remove_offload(po); 526 527 synchronize_net(); 528 } 529 EXPORT_SYMBOL(dev_remove_offload); 530 531 /****************************************************************************** 532 533 Device Boot-time Settings Routines 534 535 *******************************************************************************/ 536 537 /* Boot time configuration table */ 538 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; 539 540 /** 541 * netdev_boot_setup_add - add new setup entry 542 * @name: name of the device 543 * @map: configured settings for the device 544 * 545 * Adds new setup entry to the dev_boot_setup list. The function 546 * returns 0 on error and 1 on success. This is a generic routine to 547 * all netdevices. 548 */ 549 static int netdev_boot_setup_add(char *name, struct ifmap *map) 550 { 551 struct netdev_boot_setup *s; 552 int i; 553 554 s = dev_boot_setup; 555 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { 556 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { 557 memset(s[i].name, 0, sizeof(s[i].name)); 558 strlcpy(s[i].name, name, IFNAMSIZ); 559 memcpy(&s[i].map, map, sizeof(s[i].map)); 560 break; 561 } 562 } 563 564 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1; 565 } 566 567 /** 568 * netdev_boot_setup_check - check boot time settings 569 * @dev: the netdevice 570 * 571 * Check boot time settings for the device. 572 * The found settings are set for the device to be used 573 * later in the device probing. 
574 * Returns 0 if no settings found, 1 if they are. 575 */ 576 int netdev_boot_setup_check(struct net_device *dev) 577 { 578 struct netdev_boot_setup *s = dev_boot_setup; 579 int i; 580 581 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { 582 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && 583 !strcmp(dev->name, s[i].name)) { 584 dev->irq = s[i].map.irq; 585 dev->base_addr = s[i].map.base_addr; 586 dev->mem_start = s[i].map.mem_start; 587 dev->mem_end = s[i].map.mem_end; 588 return 1; 589 } 590 } 591 return 0; 592 } 593 EXPORT_SYMBOL(netdev_boot_setup_check); 594 595 596 /** 597 * netdev_boot_base - get address from boot time settings 598 * @prefix: prefix for network device 599 * @unit: id for network device 600 * 601 * Check boot time settings for the base address of device. 602 * The found settings are set for the device to be used 603 * later in the device probing. 604 * Returns 0 if no settings found. 605 */ 606 unsigned long netdev_boot_base(const char *prefix, int unit) 607 { 608 const struct netdev_boot_setup *s = dev_boot_setup; 609 char name[IFNAMSIZ]; 610 int i; 611 612 sprintf(name, "%s%d", prefix, unit); 613 614 /* 615 * If device already registered then return base of 1 616 * to indicate not to probe for this interface 617 */ 618 if (__dev_get_by_name(&init_net, name)) 619 return 1; 620 621 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) 622 if (!strcmp(name, s[i].name)) 623 return s[i].map.base_addr; 624 return 0; 625 } 626 627 /* 628 * Saves at boot time configured settings for any netdevice. 629 */ 630 int __init netdev_boot_setup(char *str) 631 { 632 int ints[5]; 633 struct ifmap map; 634 635 str = get_options(str, ARRAY_SIZE(ints), ints); 636 if (!str || !*str) 637 return 0; 638 639 /* Save settings */ 640 memset(&map, 0, sizeof(map)); 641 if (ints[0] > 0) 642 map.irq = ints[1]; 643 if (ints[0] > 1) 644 map.base_addr = ints[2]; 645 if (ints[0] > 2) 646 map.mem_start = ints[3]; 647 if (ints[0] > 3) 648 map.mem_end = ints[4]; 649 650 /* Add new entry to the list */ 651 return netdev_boot_setup_add(str, &map); 652 } 653 654 __setup("netdev=", netdev_boot_setup); 655 656 /******************************************************************************* 657 658 Device Interface Subroutines 659 660 *******************************************************************************/ 661 662 /** 663 * dev_get_iflink - get 'iflink' value of a interface 664 * @dev: targeted interface 665 * 666 * Indicates the ifindex the interface is linked to. 667 * Physical interfaces have the same 'ifindex' and 'iflink' values. 668 */ 669 670 int dev_get_iflink(const struct net_device *dev) 671 { 672 if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink) 673 return dev->netdev_ops->ndo_get_iflink(dev); 674 675 return dev->iflink; 676 } 677 EXPORT_SYMBOL(dev_get_iflink); 678 679 /** 680 * __dev_get_by_name - find a device by its name 681 * @net: the applicable net namespace 682 * @name: name to find 683 * 684 * Find an interface by name. Must be called under RTNL semaphore 685 * or @dev_base_lock. If the name is found a pointer to the device 686 * is returned. If the name is not found then %NULL is returned. The 687 * reference counters are not incremented so the caller must be 688 * careful with locks. 
689 */ 690 691 struct net_device *__dev_get_by_name(struct net *net, const char *name) 692 { 693 struct net_device *dev; 694 struct hlist_head *head = dev_name_hash(net, name); 695 696 hlist_for_each_entry(dev, head, name_hlist) 697 if (!strncmp(dev->name, name, IFNAMSIZ)) 698 return dev; 699 700 return NULL; 701 } 702 EXPORT_SYMBOL(__dev_get_by_name); 703 704 /** 705 * dev_get_by_name_rcu - find a device by its name 706 * @net: the applicable net namespace 707 * @name: name to find 708 * 709 * Find an interface by name. 710 * If the name is found a pointer to the device is returned. 711 * If the name is not found then %NULL is returned. 712 * The reference counters are not incremented so the caller must be 713 * careful with locks. The caller must hold RCU lock. 714 */ 715 716 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name) 717 { 718 struct net_device *dev; 719 struct hlist_head *head = dev_name_hash(net, name); 720 721 hlist_for_each_entry_rcu(dev, head, name_hlist) 722 if (!strncmp(dev->name, name, IFNAMSIZ)) 723 return dev; 724 725 return NULL; 726 } 727 EXPORT_SYMBOL(dev_get_by_name_rcu); 728 729 /** 730 * dev_get_by_name - find a device by its name 731 * @net: the applicable net namespace 732 * @name: name to find 733 * 734 * Find an interface by name. This can be called from any 735 * context and does its own locking. The returned handle has 736 * the usage count incremented and the caller must use dev_put() to 737 * release it when it is no longer needed. %NULL is returned if no 738 * matching device is found. 739 */ 740 741 struct net_device *dev_get_by_name(struct net *net, const char *name) 742 { 743 struct net_device *dev; 744 745 rcu_read_lock(); 746 dev = dev_get_by_name_rcu(net, name); 747 if (dev) 748 dev_hold(dev); 749 rcu_read_unlock(); 750 return dev; 751 } 752 EXPORT_SYMBOL(dev_get_by_name); 753 754 /** 755 * __dev_get_by_index - find a device by its ifindex 756 * @net: the applicable net namespace 757 * @ifindex: index of device 758 * 759 * Search for an interface by index. Returns %NULL if the device 760 * is not found or a pointer to the device. The device has not 761 * had its reference counter increased so the caller must be careful 762 * about locking. The caller must hold either the RTNL semaphore 763 * or @dev_base_lock. 764 */ 765 766 struct net_device *__dev_get_by_index(struct net *net, int ifindex) 767 { 768 struct net_device *dev; 769 struct hlist_head *head = dev_index_hash(net, ifindex); 770 771 hlist_for_each_entry(dev, head, index_hlist) 772 if (dev->ifindex == ifindex) 773 return dev; 774 775 return NULL; 776 } 777 EXPORT_SYMBOL(__dev_get_by_index); 778 779 /** 780 * dev_get_by_index_rcu - find a device by its ifindex 781 * @net: the applicable net namespace 782 * @ifindex: index of device 783 * 784 * Search for an interface by index. Returns %NULL if the device 785 * is not found or a pointer to the device. The device has not 786 * had its reference counter increased so the caller must be careful 787 * about locking. The caller must hold RCU lock. 
788 */ 789 790 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) 791 { 792 struct net_device *dev; 793 struct hlist_head *head = dev_index_hash(net, ifindex); 794 795 hlist_for_each_entry_rcu(dev, head, index_hlist) 796 if (dev->ifindex == ifindex) 797 return dev; 798 799 return NULL; 800 } 801 EXPORT_SYMBOL(dev_get_by_index_rcu); 802 803 804 /** 805 * dev_get_by_index - find a device by its ifindex 806 * @net: the applicable net namespace 807 * @ifindex: index of device 808 * 809 * Search for an interface by index. Returns NULL if the device 810 * is not found or a pointer to the device. The device returned has 811 * had a reference added and the pointer is safe until the user calls 812 * dev_put to indicate they have finished with it. 813 */ 814 815 struct net_device *dev_get_by_index(struct net *net, int ifindex) 816 { 817 struct net_device *dev; 818 819 rcu_read_lock(); 820 dev = dev_get_by_index_rcu(net, ifindex); 821 if (dev) 822 dev_hold(dev); 823 rcu_read_unlock(); 824 return dev; 825 } 826 EXPORT_SYMBOL(dev_get_by_index); 827 828 /** 829 * netdev_get_name - get a netdevice name, knowing its ifindex. 830 * @net: network namespace 831 * @name: a pointer to the buffer where the name will be stored. 832 * @ifindex: the ifindex of the interface to get the name from. 833 * 834 * The use of raw_seqcount_begin() and cond_resched() before 835 * retrying is required as we want to give the writers a chance 836 * to complete when CONFIG_PREEMPT is not set. 837 */ 838 int netdev_get_name(struct net *net, char *name, int ifindex) 839 { 840 struct net_device *dev; 841 unsigned int seq; 842 843 retry: 844 seq = raw_seqcount_begin(&devnet_rename_seq); 845 rcu_read_lock(); 846 dev = dev_get_by_index_rcu(net, ifindex); 847 if (!dev) { 848 rcu_read_unlock(); 849 return -ENODEV; 850 } 851 852 strcpy(name, dev->name); 853 rcu_read_unlock(); 854 if (read_seqcount_retry(&devnet_rename_seq, seq)) { 855 cond_resched(); 856 goto retry; 857 } 858 859 return 0; 860 } 861 862 /** 863 * dev_getbyhwaddr_rcu - find a device by its hardware address 864 * @net: the applicable net namespace 865 * @type: media type of device 866 * @ha: hardware address 867 * 868 * Search for an interface by MAC address. Returns NULL if the device 869 * is not found or a pointer to the device. 870 * The caller must hold RCU or RTNL. 
871 * The returned device has not had its ref count increased 872 * and the caller must therefore be careful about locking 873 * 874 */ 875 876 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, 877 const char *ha) 878 { 879 struct net_device *dev; 880 881 for_each_netdev_rcu(net, dev) 882 if (dev->type == type && 883 !memcmp(dev->dev_addr, ha, dev->addr_len)) 884 return dev; 885 886 return NULL; 887 } 888 EXPORT_SYMBOL(dev_getbyhwaddr_rcu); 889 890 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) 891 { 892 struct net_device *dev; 893 894 ASSERT_RTNL(); 895 for_each_netdev(net, dev) 896 if (dev->type == type) 897 return dev; 898 899 return NULL; 900 } 901 EXPORT_SYMBOL(__dev_getfirstbyhwtype); 902 903 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) 904 { 905 struct net_device *dev, *ret = NULL; 906 907 rcu_read_lock(); 908 for_each_netdev_rcu(net, dev) 909 if (dev->type == type) { 910 dev_hold(dev); 911 ret = dev; 912 break; 913 } 914 rcu_read_unlock(); 915 return ret; 916 } 917 EXPORT_SYMBOL(dev_getfirstbyhwtype); 918 919 /** 920 * __dev_get_by_flags - find any device with given flags 921 * @net: the applicable net namespace 922 * @if_flags: IFF_* values 923 * @mask: bitmask of bits in if_flags to check 924 * 925 * Search for any interface with the given flags. Returns NULL if a device 926 * is not found or a pointer to the device. Must be called inside 927 * rtnl_lock(), and result refcount is unchanged. 928 */ 929 930 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags, 931 unsigned short mask) 932 { 933 struct net_device *dev, *ret; 934 935 ASSERT_RTNL(); 936 937 ret = NULL; 938 for_each_netdev(net, dev) { 939 if (((dev->flags ^ if_flags) & mask) == 0) { 940 ret = dev; 941 break; 942 } 943 } 944 return ret; 945 } 946 EXPORT_SYMBOL(__dev_get_by_flags); 947 948 /** 949 * dev_valid_name - check if name is okay for network device 950 * @name: name string 951 * 952 * Network device names need to be valid file names to 953 * to allow sysfs to work. We also disallow any kind of 954 * whitespace. 955 */ 956 bool dev_valid_name(const char *name) 957 { 958 if (*name == '\0') 959 return false; 960 if (strlen(name) >= IFNAMSIZ) 961 return false; 962 if (!strcmp(name, ".") || !strcmp(name, "..")) 963 return false; 964 965 while (*name) { 966 if (*name == '/' || *name == ':' || isspace(*name)) 967 return false; 968 name++; 969 } 970 return true; 971 } 972 EXPORT_SYMBOL(dev_valid_name); 973 974 /** 975 * __dev_alloc_name - allocate a name for a device 976 * @net: network namespace to allocate the device name in 977 * @name: name format string 978 * @buf: scratch buffer and result name string 979 * 980 * Passed a format string - eg "lt%d" it will try and find a suitable 981 * id. It scans list of devices to build up a free map, then chooses 982 * the first empty slot. The caller must hold the dev_base or rtnl lock 983 * while allocating the name and adding the device in order to avoid 984 * duplicates. 985 * Limited to bits_per_byte * page size devices (ie 32K on most platforms). 986 * Returns the number of the unit assigned or a negative errno code. 
987 */ 988 989 static int __dev_alloc_name(struct net *net, const char *name, char *buf) 990 { 991 int i = 0; 992 const char *p; 993 const int max_netdevices = 8*PAGE_SIZE; 994 unsigned long *inuse; 995 struct net_device *d; 996 997 p = strnchr(name, IFNAMSIZ-1, '%'); 998 if (p) { 999 /* 1000 * Verify the string as this thing may have come from 1001 * the user. There must be either one "%d" and no other "%" 1002 * characters. 1003 */ 1004 if (p[1] != 'd' || strchr(p + 2, '%')) 1005 return -EINVAL; 1006 1007 /* Use one page as a bit array of possible slots */ 1008 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); 1009 if (!inuse) 1010 return -ENOMEM; 1011 1012 for_each_netdev(net, d) { 1013 if (!sscanf(d->name, name, &i)) 1014 continue; 1015 if (i < 0 || i >= max_netdevices) 1016 continue; 1017 1018 /* avoid cases where sscanf is not exact inverse of printf */ 1019 snprintf(buf, IFNAMSIZ, name, i); 1020 if (!strncmp(buf, d->name, IFNAMSIZ)) 1021 set_bit(i, inuse); 1022 } 1023 1024 i = find_first_zero_bit(inuse, max_netdevices); 1025 free_page((unsigned long) inuse); 1026 } 1027 1028 if (buf != name) 1029 snprintf(buf, IFNAMSIZ, name, i); 1030 if (!__dev_get_by_name(net, buf)) 1031 return i; 1032 1033 /* It is possible to run out of possible slots 1034 * when the name is long and there isn't enough space left 1035 * for the digits, or if all bits are used. 1036 */ 1037 return -ENFILE; 1038 } 1039 1040 /** 1041 * dev_alloc_name - allocate a name for a device 1042 * @dev: device 1043 * @name: name format string 1044 * 1045 * Passed a format string - eg "lt%d" it will try and find a suitable 1046 * id. It scans list of devices to build up a free map, then chooses 1047 * the first empty slot. The caller must hold the dev_base or rtnl lock 1048 * while allocating the name and adding the device in order to avoid 1049 * duplicates. 1050 * Limited to bits_per_byte * page size devices (ie 32K on most platforms). 1051 * Returns the number of the unit assigned or a negative errno code. 1052 */ 1053 1054 int dev_alloc_name(struct net_device *dev, const char *name) 1055 { 1056 char buf[IFNAMSIZ]; 1057 struct net *net; 1058 int ret; 1059 1060 BUG_ON(!dev_net(dev)); 1061 net = dev_net(dev); 1062 ret = __dev_alloc_name(net, name, buf); 1063 if (ret >= 0) 1064 strlcpy(dev->name, buf, IFNAMSIZ); 1065 return ret; 1066 } 1067 EXPORT_SYMBOL(dev_alloc_name); 1068 1069 static int dev_alloc_name_ns(struct net *net, 1070 struct net_device *dev, 1071 const char *name) 1072 { 1073 char buf[IFNAMSIZ]; 1074 int ret; 1075 1076 ret = __dev_alloc_name(net, name, buf); 1077 if (ret >= 0) 1078 strlcpy(dev->name, buf, IFNAMSIZ); 1079 return ret; 1080 } 1081 1082 static int dev_get_valid_name(struct net *net, 1083 struct net_device *dev, 1084 const char *name) 1085 { 1086 BUG_ON(!net); 1087 1088 if (!dev_valid_name(name)) 1089 return -EINVAL; 1090 1091 if (strchr(name, '%')) 1092 return dev_alloc_name_ns(net, dev, name); 1093 else if (__dev_get_by_name(net, name)) 1094 return -EEXIST; 1095 else if (dev->name != name) 1096 strlcpy(dev->name, name, IFNAMSIZ); 1097 1098 return 0; 1099 } 1100 1101 /** 1102 * dev_change_name - change name of a device 1103 * @dev: device 1104 * @newname: name (or format string) must be at least IFNAMSIZ 1105 * 1106 * Change name of a device, can pass format strings "eth%d". 1107 * for wildcarding. 
1108 */ 1109 int dev_change_name(struct net_device *dev, const char *newname) 1110 { 1111 unsigned char old_assign_type; 1112 char oldname[IFNAMSIZ]; 1113 int err = 0; 1114 int ret; 1115 struct net *net; 1116 1117 ASSERT_RTNL(); 1118 BUG_ON(!dev_net(dev)); 1119 1120 net = dev_net(dev); 1121 if (dev->flags & IFF_UP) 1122 return -EBUSY; 1123 1124 write_seqcount_begin(&devnet_rename_seq); 1125 1126 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) { 1127 write_seqcount_end(&devnet_rename_seq); 1128 return 0; 1129 } 1130 1131 memcpy(oldname, dev->name, IFNAMSIZ); 1132 1133 err = dev_get_valid_name(net, dev, newname); 1134 if (err < 0) { 1135 write_seqcount_end(&devnet_rename_seq); 1136 return err; 1137 } 1138 1139 if (oldname[0] && !strchr(oldname, '%')) 1140 netdev_info(dev, "renamed from %s\n", oldname); 1141 1142 old_assign_type = dev->name_assign_type; 1143 dev->name_assign_type = NET_NAME_RENAMED; 1144 1145 rollback: 1146 ret = device_rename(&dev->dev, dev->name); 1147 if (ret) { 1148 memcpy(dev->name, oldname, IFNAMSIZ); 1149 dev->name_assign_type = old_assign_type; 1150 write_seqcount_end(&devnet_rename_seq); 1151 return ret; 1152 } 1153 1154 write_seqcount_end(&devnet_rename_seq); 1155 1156 netdev_adjacent_rename_links(dev, oldname); 1157 1158 write_lock_bh(&dev_base_lock); 1159 hlist_del_rcu(&dev->name_hlist); 1160 write_unlock_bh(&dev_base_lock); 1161 1162 synchronize_rcu(); 1163 1164 write_lock_bh(&dev_base_lock); 1165 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name)); 1166 write_unlock_bh(&dev_base_lock); 1167 1168 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); 1169 ret = notifier_to_errno(ret); 1170 1171 if (ret) { 1172 /* err >= 0 after dev_alloc_name() or stores the first errno */ 1173 if (err >= 0) { 1174 err = ret; 1175 write_seqcount_begin(&devnet_rename_seq); 1176 memcpy(dev->name, oldname, IFNAMSIZ); 1177 memcpy(oldname, newname, IFNAMSIZ); 1178 dev->name_assign_type = old_assign_type; 1179 old_assign_type = NET_NAME_RENAMED; 1180 goto rollback; 1181 } else { 1182 pr_err("%s: name change rollback failed: %d\n", 1183 dev->name, ret); 1184 } 1185 } 1186 1187 return err; 1188 } 1189 1190 /** 1191 * dev_set_alias - change ifalias of a device 1192 * @dev: device 1193 * @alias: name up to IFALIASZ 1194 * @len: limit of bytes to copy from info 1195 * 1196 * Set ifalias for a device, 1197 */ 1198 int dev_set_alias(struct net_device *dev, const char *alias, size_t len) 1199 { 1200 char *new_ifalias; 1201 1202 ASSERT_RTNL(); 1203 1204 if (len >= IFALIASZ) 1205 return -EINVAL; 1206 1207 if (!len) { 1208 kfree(dev->ifalias); 1209 dev->ifalias = NULL; 1210 return 0; 1211 } 1212 1213 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); 1214 if (!new_ifalias) 1215 return -ENOMEM; 1216 dev->ifalias = new_ifalias; 1217 1218 strlcpy(dev->ifalias, alias, len+1); 1219 return len; 1220 } 1221 1222 1223 /** 1224 * netdev_features_change - device changes features 1225 * @dev: device to cause notification 1226 * 1227 * Called to indicate a device has changed features. 1228 */ 1229 void netdev_features_change(struct net_device *dev) 1230 { 1231 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); 1232 } 1233 EXPORT_SYMBOL(netdev_features_change); 1234 1235 /** 1236 * netdev_state_change - device changes state 1237 * @dev: device to cause notification 1238 * 1239 * Called to indicate a device has changed state. This function calls 1240 * the notifier chains for netdev_chain and sends a NEWLINK message 1241 * to the routing socket. 
1242 */ 1243 void netdev_state_change(struct net_device *dev) 1244 { 1245 if (dev->flags & IFF_UP) { 1246 struct netdev_notifier_change_info change_info; 1247 1248 change_info.flags_changed = 0; 1249 call_netdevice_notifiers_info(NETDEV_CHANGE, dev, 1250 &change_info.info); 1251 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); 1252 } 1253 } 1254 EXPORT_SYMBOL(netdev_state_change); 1255 1256 /** 1257 * netdev_notify_peers - notify network peers about existence of @dev 1258 * @dev: network device 1259 * 1260 * Generate traffic such that interested network peers are aware of 1261 * @dev, such as by generating a gratuitous ARP. This may be used when 1262 * a device wants to inform the rest of the network about some sort of 1263 * reconfiguration such as a failover event or virtual machine 1264 * migration. 1265 */ 1266 void netdev_notify_peers(struct net_device *dev) 1267 { 1268 rtnl_lock(); 1269 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev); 1270 rtnl_unlock(); 1271 } 1272 EXPORT_SYMBOL(netdev_notify_peers); 1273 1274 static int __dev_open(struct net_device *dev) 1275 { 1276 const struct net_device_ops *ops = dev->netdev_ops; 1277 int ret; 1278 1279 ASSERT_RTNL(); 1280 1281 if (!netif_device_present(dev)) 1282 return -ENODEV; 1283 1284 /* Block netpoll from trying to do any rx path servicing. 1285 * If we don't do this there is a chance ndo_poll_controller 1286 * or ndo_poll may be running while we open the device 1287 */ 1288 netpoll_poll_disable(dev); 1289 1290 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); 1291 ret = notifier_to_errno(ret); 1292 if (ret) 1293 return ret; 1294 1295 set_bit(__LINK_STATE_START, &dev->state); 1296 1297 if (ops->ndo_validate_addr) 1298 ret = ops->ndo_validate_addr(dev); 1299 1300 if (!ret && ops->ndo_open) 1301 ret = ops->ndo_open(dev); 1302 1303 netpoll_poll_enable(dev); 1304 1305 if (ret) 1306 clear_bit(__LINK_STATE_START, &dev->state); 1307 else { 1308 dev->flags |= IFF_UP; 1309 dev_set_rx_mode(dev); 1310 dev_activate(dev); 1311 add_device_randomness(dev->dev_addr, dev->addr_len); 1312 } 1313 1314 return ret; 1315 } 1316 1317 /** 1318 * dev_open - prepare an interface for use. 1319 * @dev: device to open 1320 * 1321 * Takes a device from down to up state. The device's private open 1322 * function is invoked and then the multicast lists are loaded. Finally 1323 * the device is moved into the up state and a %NETDEV_UP message is 1324 * sent to the netdev notifier chain. 1325 * 1326 * Calling this function on an active interface is a nop. On a failure 1327 * a negative errno code is returned. 1328 */ 1329 int dev_open(struct net_device *dev) 1330 { 1331 int ret; 1332 1333 if (dev->flags & IFF_UP) 1334 return 0; 1335 1336 ret = __dev_open(dev); 1337 if (ret < 0) 1338 return ret; 1339 1340 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); 1341 call_netdevice_notifiers(NETDEV_UP, dev); 1342 1343 return ret; 1344 } 1345 EXPORT_SYMBOL(dev_open); 1346 1347 static int __dev_close_many(struct list_head *head) 1348 { 1349 struct net_device *dev; 1350 1351 ASSERT_RTNL(); 1352 might_sleep(); 1353 1354 list_for_each_entry(dev, head, close_list) { 1355 /* Temporarily disable netpoll until the interface is down */ 1356 netpoll_poll_disable(dev); 1357 1358 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); 1359 1360 clear_bit(__LINK_STATE_START, &dev->state); 1361 1362 /* Synchronize to scheduled poll. We cannot touch poll list, it 1363 * can be even on different cpu. So just clear netif_running(). 
1364 * 1365 * dev->stop() will invoke napi_disable() on all of it's 1366 * napi_struct instances on this device. 1367 */ 1368 smp_mb__after_atomic(); /* Commit netif_running(). */ 1369 } 1370 1371 dev_deactivate_many(head); 1372 1373 list_for_each_entry(dev, head, close_list) { 1374 const struct net_device_ops *ops = dev->netdev_ops; 1375 1376 /* 1377 * Call the device specific close. This cannot fail. 1378 * Only if device is UP 1379 * 1380 * We allow it to be called even after a DETACH hot-plug 1381 * event. 1382 */ 1383 if (ops->ndo_stop) 1384 ops->ndo_stop(dev); 1385 1386 dev->flags &= ~IFF_UP; 1387 netpoll_poll_enable(dev); 1388 } 1389 1390 return 0; 1391 } 1392 1393 static int __dev_close(struct net_device *dev) 1394 { 1395 int retval; 1396 LIST_HEAD(single); 1397 1398 list_add(&dev->close_list, &single); 1399 retval = __dev_close_many(&single); 1400 list_del(&single); 1401 1402 return retval; 1403 } 1404 1405 int dev_close_many(struct list_head *head, bool unlink) 1406 { 1407 struct net_device *dev, *tmp; 1408 1409 /* Remove the devices that don't need to be closed */ 1410 list_for_each_entry_safe(dev, tmp, head, close_list) 1411 if (!(dev->flags & IFF_UP)) 1412 list_del_init(&dev->close_list); 1413 1414 __dev_close_many(head); 1415 1416 list_for_each_entry_safe(dev, tmp, head, close_list) { 1417 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); 1418 call_netdevice_notifiers(NETDEV_DOWN, dev); 1419 if (unlink) 1420 list_del_init(&dev->close_list); 1421 } 1422 1423 return 0; 1424 } 1425 EXPORT_SYMBOL(dev_close_many); 1426 1427 /** 1428 * dev_close - shutdown an interface. 1429 * @dev: device to shutdown 1430 * 1431 * This function moves an active device into down state. A 1432 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device 1433 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier 1434 * chain. 1435 */ 1436 int dev_close(struct net_device *dev) 1437 { 1438 if (dev->flags & IFF_UP) { 1439 LIST_HEAD(single); 1440 1441 list_add(&dev->close_list, &single); 1442 dev_close_many(&single, true); 1443 list_del(&single); 1444 } 1445 return 0; 1446 } 1447 EXPORT_SYMBOL(dev_close); 1448 1449 1450 /** 1451 * dev_disable_lro - disable Large Receive Offload on a device 1452 * @dev: device 1453 * 1454 * Disable Large Receive Offload (LRO) on a net device. Must be 1455 * called under RTNL. This is needed if received packets may be 1456 * forwarded to another interface. 1457 */ 1458 void dev_disable_lro(struct net_device *dev) 1459 { 1460 struct net_device *lower_dev; 1461 struct list_head *iter; 1462 1463 dev->wanted_features &= ~NETIF_F_LRO; 1464 netdev_update_features(dev); 1465 1466 if (unlikely(dev->features & NETIF_F_LRO)) 1467 netdev_WARN(dev, "failed to disable LRO!\n"); 1468 1469 netdev_for_each_lower_dev(dev, lower_dev, iter) 1470 dev_disable_lro(lower_dev); 1471 } 1472 EXPORT_SYMBOL(dev_disable_lro); 1473 1474 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, 1475 struct net_device *dev) 1476 { 1477 struct netdev_notifier_info info; 1478 1479 netdev_notifier_info_init(&info, dev); 1480 return nb->notifier_call(nb, val, &info); 1481 } 1482 1483 static int dev_boot_phase = 1; 1484 1485 /** 1486 * register_netdevice_notifier - register a network notifier block 1487 * @nb: notifier 1488 * 1489 * Register a notifier to be called when network device events occur. 1490 * The notifier passed is linked into the kernel structures and must 1491 * not be reused until it has been unregistered. 
A negative errno code 1492 * is returned on a failure. 1493 * 1494 * When registered all registration and up events are replayed 1495 * to the new notifier to allow device to have a race free 1496 * view of the network device list. 1497 */ 1498 1499 int register_netdevice_notifier(struct notifier_block *nb) 1500 { 1501 struct net_device *dev; 1502 struct net_device *last; 1503 struct net *net; 1504 int err; 1505 1506 rtnl_lock(); 1507 err = raw_notifier_chain_register(&netdev_chain, nb); 1508 if (err) 1509 goto unlock; 1510 if (dev_boot_phase) 1511 goto unlock; 1512 for_each_net(net) { 1513 for_each_netdev(net, dev) { 1514 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev); 1515 err = notifier_to_errno(err); 1516 if (err) 1517 goto rollback; 1518 1519 if (!(dev->flags & IFF_UP)) 1520 continue; 1521 1522 call_netdevice_notifier(nb, NETDEV_UP, dev); 1523 } 1524 } 1525 1526 unlock: 1527 rtnl_unlock(); 1528 return err; 1529 1530 rollback: 1531 last = dev; 1532 for_each_net(net) { 1533 for_each_netdev(net, dev) { 1534 if (dev == last) 1535 goto outroll; 1536 1537 if (dev->flags & IFF_UP) { 1538 call_netdevice_notifier(nb, NETDEV_GOING_DOWN, 1539 dev); 1540 call_netdevice_notifier(nb, NETDEV_DOWN, dev); 1541 } 1542 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); 1543 } 1544 } 1545 1546 outroll: 1547 raw_notifier_chain_unregister(&netdev_chain, nb); 1548 goto unlock; 1549 } 1550 EXPORT_SYMBOL(register_netdevice_notifier); 1551 1552 /** 1553 * unregister_netdevice_notifier - unregister a network notifier block 1554 * @nb: notifier 1555 * 1556 * Unregister a notifier previously registered by 1557 * register_netdevice_notifier(). The notifier is unlinked into the 1558 * kernel structures and may then be reused. A negative errno code 1559 * is returned on a failure. 1560 * 1561 * After unregistering unregister and down device events are synthesized 1562 * for all devices on the device list to the removed notifier to remove 1563 * the need for special case cleanup code. 1564 */ 1565 1566 int unregister_netdevice_notifier(struct notifier_block *nb) 1567 { 1568 struct net_device *dev; 1569 struct net *net; 1570 int err; 1571 1572 rtnl_lock(); 1573 err = raw_notifier_chain_unregister(&netdev_chain, nb); 1574 if (err) 1575 goto unlock; 1576 1577 for_each_net(net) { 1578 for_each_netdev(net, dev) { 1579 if (dev->flags & IFF_UP) { 1580 call_netdevice_notifier(nb, NETDEV_GOING_DOWN, 1581 dev); 1582 call_netdevice_notifier(nb, NETDEV_DOWN, dev); 1583 } 1584 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev); 1585 } 1586 } 1587 unlock: 1588 rtnl_unlock(); 1589 return err; 1590 } 1591 EXPORT_SYMBOL(unregister_netdevice_notifier); 1592 1593 /** 1594 * call_netdevice_notifiers_info - call all network notifier blocks 1595 * @val: value passed unmodified to notifier function 1596 * @dev: net_device pointer passed unmodified to notifier function 1597 * @info: notifier information data 1598 * 1599 * Call all network notifier blocks. Parameters and return value 1600 * are as for raw_notifier_call_chain(). 
1601 */ 1602 1603 static int call_netdevice_notifiers_info(unsigned long val, 1604 struct net_device *dev, 1605 struct netdev_notifier_info *info) 1606 { 1607 ASSERT_RTNL(); 1608 netdev_notifier_info_init(info, dev); 1609 return raw_notifier_call_chain(&netdev_chain, val, info); 1610 } 1611 1612 /** 1613 * call_netdevice_notifiers - call all network notifier blocks 1614 * @val: value passed unmodified to notifier function 1615 * @dev: net_device pointer passed unmodified to notifier function 1616 * 1617 * Call all network notifier blocks. Parameters and return value 1618 * are as for raw_notifier_call_chain(). 1619 */ 1620 1621 int call_netdevice_notifiers(unsigned long val, struct net_device *dev) 1622 { 1623 struct netdev_notifier_info info; 1624 1625 return call_netdevice_notifiers_info(val, dev, &info); 1626 } 1627 EXPORT_SYMBOL(call_netdevice_notifiers); 1628 1629 static struct static_key netstamp_needed __read_mostly; 1630 #ifdef HAVE_JUMP_LABEL 1631 /* We are not allowed to call static_key_slow_dec() from irq context 1632 * If net_disable_timestamp() is called from irq context, defer the 1633 * static_key_slow_dec() calls. 1634 */ 1635 static atomic_t netstamp_needed_deferred; 1636 #endif 1637 1638 void net_enable_timestamp(void) 1639 { 1640 #ifdef HAVE_JUMP_LABEL 1641 int deferred = atomic_xchg(&netstamp_needed_deferred, 0); 1642 1643 if (deferred) { 1644 while (--deferred) 1645 static_key_slow_dec(&netstamp_needed); 1646 return; 1647 } 1648 #endif 1649 static_key_slow_inc(&netstamp_needed); 1650 } 1651 EXPORT_SYMBOL(net_enable_timestamp); 1652 1653 void net_disable_timestamp(void) 1654 { 1655 #ifdef HAVE_JUMP_LABEL 1656 if (in_interrupt()) { 1657 atomic_inc(&netstamp_needed_deferred); 1658 return; 1659 } 1660 #endif 1661 static_key_slow_dec(&netstamp_needed); 1662 } 1663 EXPORT_SYMBOL(net_disable_timestamp); 1664 1665 static inline void net_timestamp_set(struct sk_buff *skb) 1666 { 1667 skb->tstamp.tv64 = 0; 1668 if (static_key_false(&netstamp_needed)) 1669 __net_timestamp(skb); 1670 } 1671 1672 #define net_timestamp_check(COND, SKB) \ 1673 if (static_key_false(&netstamp_needed)) { \ 1674 if ((COND) && !(SKB)->tstamp.tv64) \ 1675 __net_timestamp(SKB); \ 1676 } \ 1677 1678 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb) 1679 { 1680 unsigned int len; 1681 1682 if (!(dev->flags & IFF_UP)) 1683 return false; 1684 1685 len = dev->mtu + dev->hard_header_len + VLAN_HLEN; 1686 if (skb->len <= len) 1687 return true; 1688 1689 /* if TSO is enabled, we don't care about the length as the packet 1690 * could be forwarded without being segmented before 1691 */ 1692 if (skb_is_gso(skb)) 1693 return true; 1694 1695 return false; 1696 } 1697 EXPORT_SYMBOL_GPL(is_skb_forwardable); 1698 1699 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) 1700 { 1701 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) { 1702 if (skb_copy_ubufs(skb, GFP_ATOMIC)) { 1703 atomic_long_inc(&dev->rx_dropped); 1704 kfree_skb(skb); 1705 return NET_RX_DROP; 1706 } 1707 } 1708 1709 if (unlikely(!is_skb_forwardable(dev, skb))) { 1710 atomic_long_inc(&dev->rx_dropped); 1711 kfree_skb(skb); 1712 return NET_RX_DROP; 1713 } 1714 1715 skb_scrub_packet(skb, true); 1716 skb->priority = 0; 1717 skb->protocol = eth_type_trans(skb, dev); 1718 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 1719 1720 return 0; 1721 } 1722 EXPORT_SYMBOL_GPL(__dev_forward_skb); 1723 1724 /** 1725 * dev_forward_skb - loopback an skb to another netif 1726 * 1727 * @dev: destination network device 1728 * @skb: buffer to 
forward 1729 * 1730 * return values: 1731 * NET_RX_SUCCESS (no congestion) 1732 * NET_RX_DROP (packet was dropped, but freed) 1733 * 1734 * dev_forward_skb can be used for injecting an skb from the 1735 * start_xmit function of one device into the receive queue 1736 * of another device. 1737 * 1738 * The receiving device may be in another namespace, so 1739 * we have to clear all information in the skb that could 1740 * impact namespace isolation. 1741 */ 1742 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) 1743 { 1744 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb); 1745 } 1746 EXPORT_SYMBOL_GPL(dev_forward_skb); 1747 1748 static inline int deliver_skb(struct sk_buff *skb, 1749 struct packet_type *pt_prev, 1750 struct net_device *orig_dev) 1751 { 1752 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) 1753 return -ENOMEM; 1754 atomic_inc(&skb->users); 1755 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 1756 } 1757 1758 static inline void deliver_ptype_list_skb(struct sk_buff *skb, 1759 struct packet_type **pt, 1760 struct net_device *orig_dev, 1761 __be16 type, 1762 struct list_head *ptype_list) 1763 { 1764 struct packet_type *ptype, *pt_prev = *pt; 1765 1766 list_for_each_entry_rcu(ptype, ptype_list, list) { 1767 if (ptype->type != type) 1768 continue; 1769 if (pt_prev) 1770 deliver_skb(skb, pt_prev, orig_dev); 1771 pt_prev = ptype; 1772 } 1773 *pt = pt_prev; 1774 } 1775 1776 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb) 1777 { 1778 if (!ptype->af_packet_priv || !skb->sk) 1779 return false; 1780 1781 if (ptype->id_match) 1782 return ptype->id_match(ptype, skb->sk); 1783 else if ((struct sock *)ptype->af_packet_priv == skb->sk) 1784 return true; 1785 1786 return false; 1787 } 1788 1789 /* 1790 * Support routine. Sends outgoing frames to any network 1791 * taps currently in use. 1792 */ 1793 1794 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) 1795 { 1796 struct packet_type *ptype; 1797 struct sk_buff *skb2 = NULL; 1798 struct packet_type *pt_prev = NULL; 1799 struct list_head *ptype_list = &ptype_all; 1800 1801 rcu_read_lock(); 1802 again: 1803 list_for_each_entry_rcu(ptype, ptype_list, list) { 1804 /* Never send packets back to the socket 1805 * they originated from - MvS (miquels@drinkel.ow.org) 1806 */ 1807 if (skb_loop_sk(ptype, skb)) 1808 continue; 1809 1810 if (pt_prev) { 1811 deliver_skb(skb2, pt_prev, skb->dev); 1812 pt_prev = ptype; 1813 continue; 1814 } 1815 1816 /* need to clone skb, done only once */ 1817 skb2 = skb_clone(skb, GFP_ATOMIC); 1818 if (!skb2) 1819 goto out_unlock; 1820 1821 net_timestamp_set(skb2); 1822 1823 /* skb->nh should be correctly 1824 * set by sender, so that the second statement is 1825 * just protection against buggy protocols. 
1826 */ 1827 skb_reset_mac_header(skb2); 1828 1829 if (skb_network_header(skb2) < skb2->data || 1830 skb_network_header(skb2) > skb_tail_pointer(skb2)) { 1831 net_crit_ratelimited("protocol %04x is buggy, dev %s\n", 1832 ntohs(skb2->protocol), 1833 dev->name); 1834 skb_reset_network_header(skb2); 1835 } 1836 1837 skb2->transport_header = skb2->network_header; 1838 skb2->pkt_type = PACKET_OUTGOING; 1839 pt_prev = ptype; 1840 } 1841 1842 if (ptype_list == &ptype_all) { 1843 ptype_list = &dev->ptype_all; 1844 goto again; 1845 } 1846 out_unlock: 1847 if (pt_prev) 1848 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev); 1849 rcu_read_unlock(); 1850 } 1851 1852 /** 1853 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change 1854 * @dev: Network device 1855 * @txq: number of queues available 1856 * 1857 * If real_num_tx_queues is changed the tc mappings may no longer be 1858 * valid. To resolve this verify the tc mapping remains valid and if 1859 * not NULL the mapping. With no priorities mapping to this 1860 * offset/count pair it will no longer be used. In the worst case TC0 1861 * is invalid nothing can be done so disable priority mappings. If is 1862 * expected that drivers will fix this mapping if they can before 1863 * calling netif_set_real_num_tx_queues. 1864 */ 1865 static void netif_setup_tc(struct net_device *dev, unsigned int txq) 1866 { 1867 int i; 1868 struct netdev_tc_txq *tc = &dev->tc_to_txq[0]; 1869 1870 /* If TC0 is invalidated disable TC mapping */ 1871 if (tc->offset + tc->count > txq) { 1872 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n"); 1873 dev->num_tc = 0; 1874 return; 1875 } 1876 1877 /* Invalidated prio to tc mappings set to TC0 */ 1878 for (i = 1; i < TC_BITMASK + 1; i++) { 1879 int q = netdev_get_prio_tc_map(dev, i); 1880 1881 tc = &dev->tc_to_txq[q]; 1882 if (tc->offset + tc->count > txq) { 1883 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. 
Setting map to 0\n", 1884 i, q); 1885 netdev_set_prio_tc_map(dev, i, 0); 1886 } 1887 } 1888 } 1889 1890 #ifdef CONFIG_XPS 1891 static DEFINE_MUTEX(xps_map_mutex); 1892 #define xmap_dereference(P) \ 1893 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) 1894 1895 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps, 1896 int cpu, u16 index) 1897 { 1898 struct xps_map *map = NULL; 1899 int pos; 1900 1901 if (dev_maps) 1902 map = xmap_dereference(dev_maps->cpu_map[cpu]); 1903 1904 for (pos = 0; map && pos < map->len; pos++) { 1905 if (map->queues[pos] == index) { 1906 if (map->len > 1) { 1907 map->queues[pos] = map->queues[--map->len]; 1908 } else { 1909 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL); 1910 kfree_rcu(map, rcu); 1911 map = NULL; 1912 } 1913 break; 1914 } 1915 } 1916 1917 return map; 1918 } 1919 1920 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index) 1921 { 1922 struct xps_dev_maps *dev_maps; 1923 int cpu, i; 1924 bool active = false; 1925 1926 mutex_lock(&xps_map_mutex); 1927 dev_maps = xmap_dereference(dev->xps_maps); 1928 1929 if (!dev_maps) 1930 goto out_no_maps; 1931 1932 for_each_possible_cpu(cpu) { 1933 for (i = index; i < dev->num_tx_queues; i++) { 1934 if (!remove_xps_queue(dev_maps, cpu, i)) 1935 break; 1936 } 1937 if (i == dev->num_tx_queues) 1938 active = true; 1939 } 1940 1941 if (!active) { 1942 RCU_INIT_POINTER(dev->xps_maps, NULL); 1943 kfree_rcu(dev_maps, rcu); 1944 } 1945 1946 for (i = index; i < dev->num_tx_queues; i++) 1947 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i), 1948 NUMA_NO_NODE); 1949 1950 out_no_maps: 1951 mutex_unlock(&xps_map_mutex); 1952 } 1953 1954 static struct xps_map *expand_xps_map(struct xps_map *map, 1955 int cpu, u16 index) 1956 { 1957 struct xps_map *new_map; 1958 int alloc_len = XPS_MIN_MAP_ALLOC; 1959 int i, pos; 1960 1961 for (pos = 0; map && pos < map->len; pos++) { 1962 if (map->queues[pos] != index) 1963 continue; 1964 return map; 1965 } 1966 1967 /* Need to add queue to this CPU's existing map */ 1968 if (map) { 1969 if (pos < map->alloc_len) 1970 return map; 1971 1972 alloc_len = map->alloc_len * 2; 1973 } 1974 1975 /* Need to allocate new map to store queue on this CPU's map */ 1976 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL, 1977 cpu_to_node(cpu)); 1978 if (!new_map) 1979 return NULL; 1980 1981 for (i = 0; i < pos; i++) 1982 new_map->queues[i] = map->queues[i]; 1983 new_map->alloc_len = alloc_len; 1984 new_map->len = pos; 1985 1986 return new_map; 1987 } 1988 1989 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, 1990 u16 index) 1991 { 1992 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; 1993 struct xps_map *map, *new_map; 1994 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES); 1995 int cpu, numa_node_id = -2; 1996 bool active = false; 1997 1998 mutex_lock(&xps_map_mutex); 1999 2000 dev_maps = xmap_dereference(dev->xps_maps); 2001 2002 /* allocate memory for queue storage */ 2003 for_each_online_cpu(cpu) { 2004 if (!cpumask_test_cpu(cpu, mask)) 2005 continue; 2006 2007 if (!new_dev_maps) 2008 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); 2009 if (!new_dev_maps) { 2010 mutex_unlock(&xps_map_mutex); 2011 return -ENOMEM; 2012 } 2013 2014 map = dev_maps ? 
xmap_dereference(dev_maps->cpu_map[cpu]) : 2015 NULL; 2016 2017 map = expand_xps_map(map, cpu, index); 2018 if (!map) 2019 goto error; 2020 2021 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); 2022 } 2023 2024 if (!new_dev_maps) 2025 goto out_no_new_maps; 2026 2027 for_each_possible_cpu(cpu) { 2028 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) { 2029 /* add queue to CPU maps */ 2030 int pos = 0; 2031 2032 map = xmap_dereference(new_dev_maps->cpu_map[cpu]); 2033 while ((pos < map->len) && (map->queues[pos] != index)) 2034 pos++; 2035 2036 if (pos == map->len) 2037 map->queues[map->len++] = index; 2038 #ifdef CONFIG_NUMA 2039 if (numa_node_id == -2) 2040 numa_node_id = cpu_to_node(cpu); 2041 else if (numa_node_id != cpu_to_node(cpu)) 2042 numa_node_id = -1; 2043 #endif 2044 } else if (dev_maps) { 2045 /* fill in the new device map from the old device map */ 2046 map = xmap_dereference(dev_maps->cpu_map[cpu]); 2047 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map); 2048 } 2049 2050 } 2051 2052 rcu_assign_pointer(dev->xps_maps, new_dev_maps); 2053 2054 /* Cleanup old maps */ 2055 if (dev_maps) { 2056 for_each_possible_cpu(cpu) { 2057 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); 2058 map = xmap_dereference(dev_maps->cpu_map[cpu]); 2059 if (map && map != new_map) 2060 kfree_rcu(map, rcu); 2061 } 2062 2063 kfree_rcu(dev_maps, rcu); 2064 } 2065 2066 dev_maps = new_dev_maps; 2067 active = true; 2068 2069 out_no_new_maps: 2070 /* update Tx queue numa node */ 2071 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), 2072 (numa_node_id >= 0) ? numa_node_id : 2073 NUMA_NO_NODE); 2074 2075 if (!dev_maps) 2076 goto out_no_maps; 2077 2078 /* removes queue from unused CPUs */ 2079 for_each_possible_cpu(cpu) { 2080 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) 2081 continue; 2082 2083 if (remove_xps_queue(dev_maps, cpu, index)) 2084 active = true; 2085 } 2086 2087 /* free map if not active */ 2088 if (!active) { 2089 RCU_INIT_POINTER(dev->xps_maps, NULL); 2090 kfree_rcu(dev_maps, rcu); 2091 } 2092 2093 out_no_maps: 2094 mutex_unlock(&xps_map_mutex); 2095 2096 return 0; 2097 error: 2098 /* remove any maps that we added */ 2099 for_each_possible_cpu(cpu) { 2100 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]); 2101 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) : 2102 NULL; 2103 if (new_map && new_map != map) 2104 kfree(new_map); 2105 } 2106 2107 mutex_unlock(&xps_map_mutex); 2108 2109 kfree(new_dev_maps); 2110 return -ENOMEM; 2111 } 2112 EXPORT_SYMBOL(netif_set_xps_queue); 2113 2114 #endif 2115 /* 2116 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues 2117 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. 
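 *
 * A minimal usage sketch (hypothetical driver code, shown for illustration
 * only; "my_dev" and "new_txq" are assumed names, not defined in this file):
 *
 *	rtnl_lock();
 *	err = netif_set_real_num_tx_queues(my_dev, new_txq);
 *	rtnl_unlock();
 *
 * with 1 <= new_txq <= my_dev->num_tx_queues. Shrinking the active count is
 * what triggers the qdisc and XPS cleanup performed below.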
2118 */ 2119 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) 2120 { 2121 int rc; 2122 2123 if (txq < 1 || txq > dev->num_tx_queues) 2124 return -EINVAL; 2125 2126 if (dev->reg_state == NETREG_REGISTERED || 2127 dev->reg_state == NETREG_UNREGISTERING) { 2128 ASSERT_RTNL(); 2129 2130 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues, 2131 txq); 2132 if (rc) 2133 return rc; 2134 2135 if (dev->num_tc) 2136 netif_setup_tc(dev, txq); 2137 2138 if (txq < dev->real_num_tx_queues) { 2139 qdisc_reset_all_tx_gt(dev, txq); 2140 #ifdef CONFIG_XPS 2141 netif_reset_xps_queues_gt(dev, txq); 2142 #endif 2143 } 2144 } 2145 2146 dev->real_num_tx_queues = txq; 2147 return 0; 2148 } 2149 EXPORT_SYMBOL(netif_set_real_num_tx_queues); 2150 2151 #ifdef CONFIG_SYSFS 2152 /** 2153 * netif_set_real_num_rx_queues - set actual number of RX queues used 2154 * @dev: Network device 2155 * @rxq: Actual number of RX queues 2156 * 2157 * This must be called either with the rtnl_lock held or before 2158 * registration of the net device. Returns 0 on success, or a 2159 * negative error code. If called before registration, it always 2160 * succeeds. 2161 */ 2162 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) 2163 { 2164 int rc; 2165 2166 if (rxq < 1 || rxq > dev->num_rx_queues) 2167 return -EINVAL; 2168 2169 if (dev->reg_state == NETREG_REGISTERED) { 2170 ASSERT_RTNL(); 2171 2172 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, 2173 rxq); 2174 if (rc) 2175 return rc; 2176 } 2177 2178 dev->real_num_rx_queues = rxq; 2179 return 0; 2180 } 2181 EXPORT_SYMBOL(netif_set_real_num_rx_queues); 2182 #endif 2183 2184 /** 2185 * netif_get_num_default_rss_queues - default number of RSS queues 2186 * 2187 * This routine should set an upper limit on the number of RSS queues 2188 * used by default by multiqueue devices. 2189 */ 2190 int netif_get_num_default_rss_queues(void) 2191 { 2192 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus()); 2193 } 2194 EXPORT_SYMBOL(netif_get_num_default_rss_queues); 2195 2196 static inline void __netif_reschedule(struct Qdisc *q) 2197 { 2198 struct softnet_data *sd; 2199 unsigned long flags; 2200 2201 local_irq_save(flags); 2202 sd = this_cpu_ptr(&softnet_data); 2203 q->next_sched = NULL; 2204 *sd->output_queue_tailp = q; 2205 sd->output_queue_tailp = &q->next_sched; 2206 raise_softirq_irqoff(NET_TX_SOFTIRQ); 2207 local_irq_restore(flags); 2208 } 2209 2210 void __netif_schedule(struct Qdisc *q) 2211 { 2212 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) 2213 __netif_reschedule(q); 2214 } 2215 EXPORT_SYMBOL(__netif_schedule); 2216 2217 struct dev_kfree_skb_cb { 2218 enum skb_free_reason reason; 2219 }; 2220 2221 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb) 2222 { 2223 return (struct dev_kfree_skb_cb *)skb->cb; 2224 } 2225 2226 void netif_schedule_queue(struct netdev_queue *txq) 2227 { 2228 rcu_read_lock(); 2229 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) { 2230 struct Qdisc *q = rcu_dereference(txq->qdisc); 2231 2232 __netif_schedule(q); 2233 } 2234 rcu_read_unlock(); 2235 } 2236 EXPORT_SYMBOL(netif_schedule_queue); 2237 2238 /** 2239 * netif_wake_subqueue - allow sending packets on subqueue 2240 * @dev: network device 2241 * @queue_index: sub queue index 2242 * 2243 * Resume individual transmit queue of a device with multiple transmit queues. 
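 *
 * Hedged example (hypothetical driver helpers, not part of this file): a
 * multiqueue driver typically pairs this with netif_stop_subqueue() from its
 * TX completion path, along the lines of
 *
 *	if (my_ring_free_descs(ring) > MY_WAKE_THRESHOLD &&
 *	    __netif_subqueue_stopped(dev, ring->queue_index))
 *		netif_wake_subqueue(dev, ring->queue_index);
 *
 * where my_ring_free_descs() and MY_WAKE_THRESHOLD are assumed driver-side
 * names.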
2244 */ 2245 void netif_wake_subqueue(struct net_device *dev, u16 queue_index) 2246 { 2247 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index); 2248 2249 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) { 2250 struct Qdisc *q; 2251 2252 rcu_read_lock(); 2253 q = rcu_dereference(txq->qdisc); 2254 __netif_schedule(q); 2255 rcu_read_unlock(); 2256 } 2257 } 2258 EXPORT_SYMBOL(netif_wake_subqueue); 2259 2260 void netif_tx_wake_queue(struct netdev_queue *dev_queue) 2261 { 2262 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) { 2263 struct Qdisc *q; 2264 2265 rcu_read_lock(); 2266 q = rcu_dereference(dev_queue->qdisc); 2267 __netif_schedule(q); 2268 rcu_read_unlock(); 2269 } 2270 } 2271 EXPORT_SYMBOL(netif_tx_wake_queue); 2272 2273 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) 2274 { 2275 unsigned long flags; 2276 2277 if (likely(atomic_read(&skb->users) == 1)) { 2278 smp_rmb(); 2279 atomic_set(&skb->users, 0); 2280 } else if (likely(!atomic_dec_and_test(&skb->users))) { 2281 return; 2282 } 2283 get_kfree_skb_cb(skb)->reason = reason; 2284 local_irq_save(flags); 2285 skb->next = __this_cpu_read(softnet_data.completion_queue); 2286 __this_cpu_write(softnet_data.completion_queue, skb); 2287 raise_softirq_irqoff(NET_TX_SOFTIRQ); 2288 local_irq_restore(flags); 2289 } 2290 EXPORT_SYMBOL(__dev_kfree_skb_irq); 2291 2292 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason) 2293 { 2294 if (in_irq() || irqs_disabled()) 2295 __dev_kfree_skb_irq(skb, reason); 2296 else 2297 dev_kfree_skb(skb); 2298 } 2299 EXPORT_SYMBOL(__dev_kfree_skb_any); 2300 2301 2302 /** 2303 * netif_device_detach - mark device as removed 2304 * @dev: network device 2305 * 2306 * Mark device as removed from system and therefore no longer available. 2307 */ 2308 void netif_device_detach(struct net_device *dev) 2309 { 2310 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && 2311 netif_running(dev)) { 2312 netif_tx_stop_all_queues(dev); 2313 } 2314 } 2315 EXPORT_SYMBOL(netif_device_detach); 2316 2317 /** 2318 * netif_device_attach - mark device as attached 2319 * @dev: network device 2320 * 2321 * Mark device as attached from system and restart if needed. 2322 */ 2323 void netif_device_attach(struct net_device *dev) 2324 { 2325 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && 2326 netif_running(dev)) { 2327 netif_tx_wake_all_queues(dev); 2328 __netdev_watchdog_up(dev); 2329 } 2330 } 2331 EXPORT_SYMBOL(netif_device_attach); 2332 2333 static void skb_warn_bad_offload(const struct sk_buff *skb) 2334 { 2335 static const netdev_features_t null_features = 0; 2336 struct net_device *dev = skb->dev; 2337 const char *driver = ""; 2338 2339 if (!net_ratelimit()) 2340 return; 2341 2342 if (dev && dev->dev.parent) 2343 driver = dev_driver_string(dev->dev.parent); 2344 2345 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d " 2346 "gso_type=%d ip_summed=%d\n", 2347 driver, dev ? &dev->features : &null_features, 2348 skb->sk ? &skb->sk->sk_route_caps : &null_features, 2349 skb->len, skb->data_len, skb_shinfo(skb)->gso_size, 2350 skb_shinfo(skb)->gso_type, skb->ip_summed); 2351 } 2352 2353 /* 2354 * Invalidate hardware checksum when packet is to be mangled, and 2355 * complete checksum manually on outgoing path. 
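 *
 * Illustrative (hypothetical) caller, assuming the packet is about to be
 * modified in a way the checksum offload cannot follow:
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    skb_checksum_help(skb))
 *		goto drop;
 *
 * On success skb->ip_summed is CHECKSUM_NONE and the checksum field in the
 * packet itself has been filled in.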
2356 */ 2357 int skb_checksum_help(struct sk_buff *skb) 2358 { 2359 __wsum csum; 2360 int ret = 0, offset; 2361 2362 if (skb->ip_summed == CHECKSUM_COMPLETE) 2363 goto out_set_summed; 2364 2365 if (unlikely(skb_shinfo(skb)->gso_size)) { 2366 skb_warn_bad_offload(skb); 2367 return -EINVAL; 2368 } 2369 2370 /* Before computing a checksum, we should make sure no frag could 2371 * be modified by an external entity : checksum could be wrong. 2372 */ 2373 if (skb_has_shared_frag(skb)) { 2374 ret = __skb_linearize(skb); 2375 if (ret) 2376 goto out; 2377 } 2378 2379 offset = skb_checksum_start_offset(skb); 2380 BUG_ON(offset >= skb_headlen(skb)); 2381 csum = skb_checksum(skb, offset, skb->len - offset, 0); 2382 2383 offset += skb->csum_offset; 2384 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); 2385 2386 if (skb_cloned(skb) && 2387 !skb_clone_writable(skb, offset + sizeof(__sum16))) { 2388 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 2389 if (ret) 2390 goto out; 2391 } 2392 2393 *(__sum16 *)(skb->data + offset) = csum_fold(csum); 2394 out_set_summed: 2395 skb->ip_summed = CHECKSUM_NONE; 2396 out: 2397 return ret; 2398 } 2399 EXPORT_SYMBOL(skb_checksum_help); 2400 2401 __be16 skb_network_protocol(struct sk_buff *skb, int *depth) 2402 { 2403 __be16 type = skb->protocol; 2404 2405 /* Tunnel gso handlers can set protocol to ethernet. */ 2406 if (type == htons(ETH_P_TEB)) { 2407 struct ethhdr *eth; 2408 2409 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) 2410 return 0; 2411 2412 eth = (struct ethhdr *)skb_mac_header(skb); 2413 type = eth->h_proto; 2414 } 2415 2416 return __vlan_get_protocol(skb, type, depth); 2417 } 2418 2419 /** 2420 * skb_mac_gso_segment - mac layer segmentation handler. 2421 * @skb: buffer to segment 2422 * @features: features for the output path (see dev->features) 2423 */ 2424 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, 2425 netdev_features_t features) 2426 { 2427 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); 2428 struct packet_offload *ptype; 2429 int vlan_depth = skb->mac_len; 2430 __be16 type = skb_network_protocol(skb, &vlan_depth); 2431 2432 if (unlikely(!type)) 2433 return ERR_PTR(-EINVAL); 2434 2435 __skb_pull(skb, vlan_depth); 2436 2437 rcu_read_lock(); 2438 list_for_each_entry_rcu(ptype, &offload_base, list) { 2439 if (ptype->type == type && ptype->callbacks.gso_segment) { 2440 segs = ptype->callbacks.gso_segment(skb, features); 2441 break; 2442 } 2443 } 2444 rcu_read_unlock(); 2445 2446 __skb_push(skb, skb->data - skb_mac_header(skb)); 2447 2448 return segs; 2449 } 2450 EXPORT_SYMBOL(skb_mac_gso_segment); 2451 2452 2453 /* openvswitch calls this on rx path, so we need a different check. 2454 */ 2455 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) 2456 { 2457 if (tx_path) 2458 return skb->ip_summed != CHECKSUM_PARTIAL; 2459 else 2460 return skb->ip_summed == CHECKSUM_NONE; 2461 } 2462 2463 /** 2464 * __skb_gso_segment - Perform segmentation on skb. 2465 * @skb: buffer to segment 2466 * @features: features for the output path (see dev->features) 2467 * @tx_path: whether it is called in TX path 2468 * 2469 * This function segments the given skb and returns a list of segments. 2470 * 2471 * It may return NULL if the skb requires no segmentation. This is 2472 * only possible when GSO is used for verifying header integrity. 
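 *
 * Sketch of a typical caller (context assumed, for illustration only):
 *
 *	segs = __skb_gso_segment(skb, features, true);
 *	if (IS_ERR(segs))
 *		goto drop;
 *	if (segs) {
 *		consume_skb(skb);
 *		skb = segs;
 *	}
 *
 * which is essentially what validate_xmit_skb() does further down via the
 * skb_gso_segment() wrapper.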
2473 */ 2474 struct sk_buff *__skb_gso_segment(struct sk_buff *skb, 2475 netdev_features_t features, bool tx_path) 2476 { 2477 if (unlikely(skb_needs_check(skb, tx_path))) { 2478 int err; 2479 2480 skb_warn_bad_offload(skb); 2481 2482 err = skb_cow_head(skb, 0); 2483 if (err < 0) 2484 return ERR_PTR(err); 2485 } 2486 2487 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); 2488 SKB_GSO_CB(skb)->encap_level = 0; 2489 2490 skb_reset_mac_header(skb); 2491 skb_reset_mac_len(skb); 2492 2493 return skb_mac_gso_segment(skb, features); 2494 } 2495 EXPORT_SYMBOL(__skb_gso_segment); 2496 2497 /* Take action when hardware reception checksum errors are detected. */ 2498 #ifdef CONFIG_BUG 2499 void netdev_rx_csum_fault(struct net_device *dev) 2500 { 2501 if (net_ratelimit()) { 2502 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>"); 2503 dump_stack(); 2504 } 2505 } 2506 EXPORT_SYMBOL(netdev_rx_csum_fault); 2507 #endif 2508 2509 /* Actually, we should eliminate this check as soon as we know, that: 2510 * 1. IOMMU is present and allows to map all the memory. 2511 * 2. No high memory really exists on this machine. 2512 */ 2513 2514 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb) 2515 { 2516 #ifdef CONFIG_HIGHMEM 2517 int i; 2518 if (!(dev->features & NETIF_F_HIGHDMA)) { 2519 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2520 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2521 if (PageHighMem(skb_frag_page(frag))) 2522 return 1; 2523 } 2524 } 2525 2526 if (PCI_DMA_BUS_IS_PHYS) { 2527 struct device *pdev = dev->dev.parent; 2528 2529 if (!pdev) 2530 return 0; 2531 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { 2532 skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 2533 dma_addr_t addr = page_to_phys(skb_frag_page(frag)); 2534 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask) 2535 return 1; 2536 } 2537 } 2538 #endif 2539 return 0; 2540 } 2541 2542 /* If MPLS offload request, verify we are testing hardware MPLS features 2543 * instead of standard features for the netdev. 
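 *
 * In effect (illustrative only), for a frame carrying an MPLS ethertype the
 * helper below reduces to
 *
 *	features &= skb->dev->mpls_features;
 *
 * so a device that cannot offload MPLS-encapsulated traffic has those bits
 * cleared before harmonize_features() makes the final decision.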
2544 */ 2545 #if IS_ENABLED(CONFIG_NET_MPLS_GSO) 2546 static netdev_features_t net_mpls_features(struct sk_buff *skb, 2547 netdev_features_t features, 2548 __be16 type) 2549 { 2550 if (eth_p_mpls(type)) 2551 features &= skb->dev->mpls_features; 2552 2553 return features; 2554 } 2555 #else 2556 static netdev_features_t net_mpls_features(struct sk_buff *skb, 2557 netdev_features_t features, 2558 __be16 type) 2559 { 2560 return features; 2561 } 2562 #endif 2563 2564 static netdev_features_t harmonize_features(struct sk_buff *skb, 2565 netdev_features_t features) 2566 { 2567 int tmp; 2568 __be16 type; 2569 2570 type = skb_network_protocol(skb, &tmp); 2571 features = net_mpls_features(skb, features, type); 2572 2573 if (skb->ip_summed != CHECKSUM_NONE && 2574 !can_checksum_protocol(features, type)) { 2575 features &= ~NETIF_F_ALL_CSUM; 2576 } else if (illegal_highdma(skb->dev, skb)) { 2577 features &= ~NETIF_F_SG; 2578 } 2579 2580 return features; 2581 } 2582 2583 netdev_features_t passthru_features_check(struct sk_buff *skb, 2584 struct net_device *dev, 2585 netdev_features_t features) 2586 { 2587 return features; 2588 } 2589 EXPORT_SYMBOL(passthru_features_check); 2590 2591 static netdev_features_t dflt_features_check(const struct sk_buff *skb, 2592 struct net_device *dev, 2593 netdev_features_t features) 2594 { 2595 return vlan_features_check(skb, features); 2596 } 2597 2598 netdev_features_t netif_skb_features(struct sk_buff *skb) 2599 { 2600 struct net_device *dev = skb->dev; 2601 netdev_features_t features = dev->features; 2602 u16 gso_segs = skb_shinfo(skb)->gso_segs; 2603 2604 if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs) 2605 features &= ~NETIF_F_GSO_MASK; 2606 2607 /* If encapsulation offload request, verify we are testing 2608 * hardware encapsulation features instead of standard 2609 * features for the netdev 2610 */ 2611 if (skb->encapsulation) 2612 features &= dev->hw_enc_features; 2613 2614 if (skb_vlan_tagged(skb)) 2615 features = netdev_intersect_features(features, 2616 dev->vlan_features | 2617 NETIF_F_HW_VLAN_CTAG_TX | 2618 NETIF_F_HW_VLAN_STAG_TX); 2619 2620 if (dev->netdev_ops->ndo_features_check) 2621 features &= dev->netdev_ops->ndo_features_check(skb, dev, 2622 features); 2623 else 2624 features &= dflt_features_check(skb, dev, features); 2625 2626 return harmonize_features(skb, features); 2627 } 2628 EXPORT_SYMBOL(netif_skb_features); 2629 2630 static int xmit_one(struct sk_buff *skb, struct net_device *dev, 2631 struct netdev_queue *txq, bool more) 2632 { 2633 unsigned int len; 2634 int rc; 2635 2636 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all)) 2637 dev_queue_xmit_nit(skb, dev); 2638 2639 len = skb->len; 2640 trace_net_dev_start_xmit(skb, dev); 2641 rc = netdev_start_xmit(skb, dev, txq, more); 2642 trace_net_dev_xmit(skb, rc, dev, len); 2643 2644 return rc; 2645 } 2646 2647 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev, 2648 struct netdev_queue *txq, int *ret) 2649 { 2650 struct sk_buff *skb = first; 2651 int rc = NETDEV_TX_OK; 2652 2653 while (skb) { 2654 struct sk_buff *next = skb->next; 2655 2656 skb->next = NULL; 2657 rc = xmit_one(skb, dev, txq, next != NULL); 2658 if (unlikely(!dev_xmit_complete(rc))) { 2659 skb->next = next; 2660 goto out; 2661 } 2662 2663 skb = next; 2664 if (netif_xmit_stopped(txq) && skb) { 2665 rc = NETDEV_TX_BUSY; 2666 break; 2667 } 2668 } 2669 2670 out: 2671 *ret = rc; 2672 return skb; 2673 } 2674 2675 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb, 2676 
netdev_features_t features) 2677 { 2678 if (skb_vlan_tag_present(skb) && 2679 !vlan_hw_offload_capable(features, skb->vlan_proto)) 2680 skb = __vlan_hwaccel_push_inside(skb); 2681 return skb; 2682 } 2683 2684 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) 2685 { 2686 netdev_features_t features; 2687 2688 if (skb->next) 2689 return skb; 2690 2691 features = netif_skb_features(skb); 2692 skb = validate_xmit_vlan(skb, features); 2693 if (unlikely(!skb)) 2694 goto out_null; 2695 2696 if (netif_needs_gso(dev, skb, features)) { 2697 struct sk_buff *segs; 2698 2699 segs = skb_gso_segment(skb, features); 2700 if (IS_ERR(segs)) { 2701 goto out_kfree_skb; 2702 } else if (segs) { 2703 consume_skb(skb); 2704 skb = segs; 2705 } 2706 } else { 2707 if (skb_needs_linearize(skb, features) && 2708 __skb_linearize(skb)) 2709 goto out_kfree_skb; 2710 2711 /* If packet is not checksummed and device does not 2712 * support checksumming for this protocol, complete 2713 * checksumming here. 2714 */ 2715 if (skb->ip_summed == CHECKSUM_PARTIAL) { 2716 if (skb->encapsulation) 2717 skb_set_inner_transport_header(skb, 2718 skb_checksum_start_offset(skb)); 2719 else 2720 skb_set_transport_header(skb, 2721 skb_checksum_start_offset(skb)); 2722 if (!(features & NETIF_F_ALL_CSUM) && 2723 skb_checksum_help(skb)) 2724 goto out_kfree_skb; 2725 } 2726 } 2727 2728 return skb; 2729 2730 out_kfree_skb: 2731 kfree_skb(skb); 2732 out_null: 2733 return NULL; 2734 } 2735 2736 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev) 2737 { 2738 struct sk_buff *next, *head = NULL, *tail; 2739 2740 for (; skb != NULL; skb = next) { 2741 next = skb->next; 2742 skb->next = NULL; 2743 2744 /* in case skb wont be segmented, point to itself */ 2745 skb->prev = skb; 2746 2747 skb = validate_xmit_skb(skb, dev); 2748 if (!skb) 2749 continue; 2750 2751 if (!head) 2752 head = skb; 2753 else 2754 tail->next = skb; 2755 /* If skb was segmented, skb->prev points to 2756 * the last segment. If not, it still contains skb. 2757 */ 2758 tail = skb->prev; 2759 } 2760 return head; 2761 } 2762 2763 static void qdisc_pkt_len_init(struct sk_buff *skb) 2764 { 2765 const struct skb_shared_info *shinfo = skb_shinfo(skb); 2766 2767 qdisc_skb_cb(skb)->pkt_len = skb->len; 2768 2769 /* To get more precise estimation of bytes sent on wire, 2770 * we add to pkt_len the headers size of all segments 2771 */ 2772 if (shinfo->gso_size) { 2773 unsigned int hdr_len; 2774 u16 gso_segs = shinfo->gso_segs; 2775 2776 /* mac layer + network layer */ 2777 hdr_len = skb_transport_header(skb) - skb_mac_header(skb); 2778 2779 /* + transport layer */ 2780 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) 2781 hdr_len += tcp_hdrlen(skb); 2782 else 2783 hdr_len += sizeof(struct udphdr); 2784 2785 if (shinfo->gso_type & SKB_GSO_DODGY) 2786 gso_segs = DIV_ROUND_UP(skb->len - hdr_len, 2787 shinfo->gso_size); 2788 2789 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; 2790 } 2791 } 2792 2793 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, 2794 struct net_device *dev, 2795 struct netdev_queue *txq) 2796 { 2797 spinlock_t *root_lock = qdisc_lock(q); 2798 bool contended; 2799 int rc; 2800 2801 qdisc_pkt_len_init(skb); 2802 qdisc_calculate_pkt_len(skb, q); 2803 /* 2804 * Heuristic to force contended enqueues to serialize on a 2805 * separate lock before trying to get qdisc main lock. 
2806 * This permits __QDISC___STATE_RUNNING owner to get the lock more 2807 * often and dequeue packets faster. 2808 */ 2809 contended = qdisc_is_running(q); 2810 if (unlikely(contended)) 2811 spin_lock(&q->busylock); 2812 2813 spin_lock(root_lock); 2814 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { 2815 kfree_skb(skb); 2816 rc = NET_XMIT_DROP; 2817 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && 2818 qdisc_run_begin(q)) { 2819 /* 2820 * This is a work-conserving queue; there are no old skbs 2821 * waiting to be sent out; and the qdisc is not running - 2822 * xmit the skb directly. 2823 */ 2824 2825 qdisc_bstats_update(q, skb); 2826 2827 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { 2828 if (unlikely(contended)) { 2829 spin_unlock(&q->busylock); 2830 contended = false; 2831 } 2832 __qdisc_run(q); 2833 } else 2834 qdisc_run_end(q); 2835 2836 rc = NET_XMIT_SUCCESS; 2837 } else { 2838 rc = q->enqueue(skb, q) & NET_XMIT_MASK; 2839 if (qdisc_run_begin(q)) { 2840 if (unlikely(contended)) { 2841 spin_unlock(&q->busylock); 2842 contended = false; 2843 } 2844 __qdisc_run(q); 2845 } 2846 } 2847 spin_unlock(root_lock); 2848 if (unlikely(contended)) 2849 spin_unlock(&q->busylock); 2850 return rc; 2851 } 2852 2853 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO) 2854 static void skb_update_prio(struct sk_buff *skb) 2855 { 2856 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap); 2857 2858 if (!skb->priority && skb->sk && map) { 2859 unsigned int prioidx = skb->sk->sk_cgrp_prioidx; 2860 2861 if (prioidx < map->priomap_len) 2862 skb->priority = map->priomap[prioidx]; 2863 } 2864 } 2865 #else 2866 #define skb_update_prio(skb) 2867 #endif 2868 2869 static DEFINE_PER_CPU(int, xmit_recursion); 2870 #define RECURSION_LIMIT 10 2871 2872 /** 2873 * dev_loopback_xmit - loop back @skb 2874 * @skb: buffer to transmit 2875 */ 2876 int dev_loopback_xmit(struct sk_buff *skb) 2877 { 2878 skb_reset_mac_header(skb); 2879 __skb_pull(skb, skb_network_offset(skb)); 2880 skb->pkt_type = PACKET_LOOPBACK; 2881 skb->ip_summed = CHECKSUM_UNNECESSARY; 2882 WARN_ON(!skb_dst(skb)); 2883 skb_dst_force(skb); 2884 netif_rx_ni(skb); 2885 return 0; 2886 } 2887 EXPORT_SYMBOL(dev_loopback_xmit); 2888 2889 /** 2890 * __dev_queue_xmit - transmit a buffer 2891 * @skb: buffer to transmit 2892 * @accel_priv: private data used for L2 forwarding offload 2893 * 2894 * Queue a buffer for transmission to a network device. The caller must 2895 * have set the device and priority and built the buffer before calling 2896 * this function. The function can be called from an interrupt. 2897 * 2898 * A negative errno code is returned on a failure. A success does not 2899 * guarantee the frame will be transmitted as it may be dropped due 2900 * to congestion or traffic shaping. 2901 * 2902 * ----------------------------------------------------------------------------------- 2903 * I notice this method can also return errors from the queue disciplines, 2904 * including NET_XMIT_DROP, which is a positive value. So, errors can also 2905 * be positive. 2906 * 2907 * Regardless of the return value, the skb is consumed, so it is currently 2908 * difficult to retry a send to this method. (You can bump the ref count 2909 * before sending to hold a reference for retry if you are careful.) 2910 * 2911 * When calling this method, interrupts MUST be enabled. This is because 2912 * the BH enable code must have IRQs enabled so that it will not deadlock. 
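 *
 * A minimal, hypothetical example of handing a fully built skb to this path
 * ("my_dev" is an assumed device pointer; header construction is omitted):
 *
 *	skb->dev = my_dev;
 *	skb->priority = TC_PRIO_CONTROL;
 *	rc = dev_queue_xmit(skb);
 *	if (net_xmit_eval(rc))
 *		... treat the packet as not sent; do not touch skb again ...
 *
 * net_xmit_eval() folds NET_XMIT_CN into success, leaving real errors and
 * drops as non-zero.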
2913 * --BLG 2914 */ 2915 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) 2916 { 2917 struct net_device *dev = skb->dev; 2918 struct netdev_queue *txq; 2919 struct Qdisc *q; 2920 int rc = -ENOMEM; 2921 2922 skb_reset_mac_header(skb); 2923 2924 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP)) 2925 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED); 2926 2927 /* Disable soft irqs for various locks below. Also 2928 * stops preemption for RCU. 2929 */ 2930 rcu_read_lock_bh(); 2931 2932 skb_update_prio(skb); 2933 2934 /* If device/qdisc don't need skb->dst, release it right now while 2935 * its hot in this cpu cache. 2936 */ 2937 if (dev->priv_flags & IFF_XMIT_DST_RELEASE) 2938 skb_dst_drop(skb); 2939 else 2940 skb_dst_force(skb); 2941 2942 txq = netdev_pick_tx(dev, skb, accel_priv); 2943 q = rcu_dereference_bh(txq->qdisc); 2944 2945 #ifdef CONFIG_NET_CLS_ACT 2946 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); 2947 #endif 2948 trace_net_dev_queue(skb); 2949 if (q->enqueue) { 2950 rc = __dev_xmit_skb(skb, q, dev, txq); 2951 goto out; 2952 } 2953 2954 /* The device has no queue. Common case for software devices: 2955 loopback, all the sorts of tunnels... 2956 2957 Really, it is unlikely that netif_tx_lock protection is necessary 2958 here. (f.e. loopback and IP tunnels are clean ignoring statistics 2959 counters.) 2960 However, it is possible, that they rely on protection 2961 made by us here. 2962 2963 Check this and shot the lock. It is not prone from deadlocks. 2964 Either shot noqueue qdisc, it is even simpler 8) 2965 */ 2966 if (dev->flags & IFF_UP) { 2967 int cpu = smp_processor_id(); /* ok because BHs are off */ 2968 2969 if (txq->xmit_lock_owner != cpu) { 2970 2971 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) 2972 goto recursion_alert; 2973 2974 skb = validate_xmit_skb(skb, dev); 2975 if (!skb) 2976 goto drop; 2977 2978 HARD_TX_LOCK(dev, txq, cpu); 2979 2980 if (!netif_xmit_stopped(txq)) { 2981 __this_cpu_inc(xmit_recursion); 2982 skb = dev_hard_start_xmit(skb, dev, txq, &rc); 2983 __this_cpu_dec(xmit_recursion); 2984 if (dev_xmit_complete(rc)) { 2985 HARD_TX_UNLOCK(dev, txq); 2986 goto out; 2987 } 2988 } 2989 HARD_TX_UNLOCK(dev, txq); 2990 net_crit_ratelimited("Virtual device %s asks to queue packet!\n", 2991 dev->name); 2992 } else { 2993 /* Recursion is detected! 
It is possible, 2994 * unfortunately 2995 */ 2996 recursion_alert: 2997 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n", 2998 dev->name); 2999 } 3000 } 3001 3002 rc = -ENETDOWN; 3003 drop: 3004 rcu_read_unlock_bh(); 3005 3006 atomic_long_inc(&dev->tx_dropped); 3007 kfree_skb_list(skb); 3008 return rc; 3009 out: 3010 rcu_read_unlock_bh(); 3011 return rc; 3012 } 3013 3014 int dev_queue_xmit(struct sk_buff *skb) 3015 { 3016 return __dev_queue_xmit(skb, NULL); 3017 } 3018 EXPORT_SYMBOL(dev_queue_xmit); 3019 3020 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv) 3021 { 3022 return __dev_queue_xmit(skb, accel_priv); 3023 } 3024 EXPORT_SYMBOL(dev_queue_xmit_accel); 3025 3026 3027 /*======================================================================= 3028 Receiver routines 3029 =======================================================================*/ 3030 3031 int netdev_max_backlog __read_mostly = 1000; 3032 EXPORT_SYMBOL(netdev_max_backlog); 3033 3034 int netdev_tstamp_prequeue __read_mostly = 1; 3035 int netdev_budget __read_mostly = 300; 3036 int weight_p __read_mostly = 64; /* old backlog weight */ 3037 3038 /* Called with irq disabled */ 3039 static inline void ____napi_schedule(struct softnet_data *sd, 3040 struct napi_struct *napi) 3041 { 3042 list_add_tail(&napi->poll_list, &sd->poll_list); 3043 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 3044 } 3045 3046 #ifdef CONFIG_RPS 3047 3048 /* One global table that all flow-based protocols share. */ 3049 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; 3050 EXPORT_SYMBOL(rps_sock_flow_table); 3051 u32 rps_cpu_mask __read_mostly; 3052 EXPORT_SYMBOL(rps_cpu_mask); 3053 3054 struct static_key rps_needed __read_mostly; 3055 3056 static struct rps_dev_flow * 3057 set_rps_cpu(struct net_device *dev, struct sk_buff *skb, 3058 struct rps_dev_flow *rflow, u16 next_cpu) 3059 { 3060 if (next_cpu != RPS_NO_CPU) { 3061 #ifdef CONFIG_RFS_ACCEL 3062 struct netdev_rx_queue *rxqueue; 3063 struct rps_dev_flow_table *flow_table; 3064 struct rps_dev_flow *old_rflow; 3065 u32 flow_id; 3066 u16 rxq_index; 3067 int rc; 3068 3069 /* Should we steer this flow to a different hardware queue? */ 3070 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap || 3071 !(dev->features & NETIF_F_NTUPLE)) 3072 goto out; 3073 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu); 3074 if (rxq_index == skb_get_rx_queue(skb)) 3075 goto out; 3076 3077 rxqueue = dev->_rx + rxq_index; 3078 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3079 if (!flow_table) 3080 goto out; 3081 flow_id = skb_get_hash(skb) & flow_table->mask; 3082 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, 3083 rxq_index, flow_id); 3084 if (rc < 0) 3085 goto out; 3086 old_rflow = rflow; 3087 rflow = &flow_table->flows[flow_id]; 3088 rflow->filter = rc; 3089 if (old_rflow->filter == rflow->filter) 3090 old_rflow->filter = RPS_NO_FILTER; 3091 out: 3092 #endif 3093 rflow->last_qtail = 3094 per_cpu(softnet_data, next_cpu).input_queue_head; 3095 } 3096 3097 rflow->cpu = next_cpu; 3098 return rflow; 3099 } 3100 3101 /* 3102 * get_rps_cpu is called from netif_receive_skb and returns the target 3103 * CPU from the RPS map of the receiving queue for a given skb. 3104 * rcu_read_lock must be held on entry. 
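 *
 * The global sock flow table packs two things into each entry: the upper
 * bits hold part of the flow hash (an identity check) and the low bits,
 * selected by rps_cpu_mask, hold the CPU that last ran recvmsg() for the
 * flow. A small worked example, assuming rps_cpu_mask == 0x3f (up to 64
 * CPUs):
 *
 *	ident = sock_flow_table->ents[hash & sock_flow_table->mask];
 *	if ((ident ^ hash) & ~0x3f)
 *		... upper bits differ: entry belongs to another flow ...
 *	next_cpu = ident & 0x3f;
 *
 * Only when the identity check passes is the per-queue flow table consulted.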
3105 */ 3106 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, 3107 struct rps_dev_flow **rflowp) 3108 { 3109 const struct rps_sock_flow_table *sock_flow_table; 3110 struct netdev_rx_queue *rxqueue = dev->_rx; 3111 struct rps_dev_flow_table *flow_table; 3112 struct rps_map *map; 3113 int cpu = -1; 3114 u32 tcpu; 3115 u32 hash; 3116 3117 if (skb_rx_queue_recorded(skb)) { 3118 u16 index = skb_get_rx_queue(skb); 3119 3120 if (unlikely(index >= dev->real_num_rx_queues)) { 3121 WARN_ONCE(dev->real_num_rx_queues > 1, 3122 "%s received packet on queue %u, but number " 3123 "of RX queues is %u\n", 3124 dev->name, index, dev->real_num_rx_queues); 3125 goto done; 3126 } 3127 rxqueue += index; 3128 } 3129 3130 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */ 3131 3132 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3133 map = rcu_dereference(rxqueue->rps_map); 3134 if (!flow_table && !map) 3135 goto done; 3136 3137 skb_reset_network_header(skb); 3138 hash = skb_get_hash(skb); 3139 if (!hash) 3140 goto done; 3141 3142 sock_flow_table = rcu_dereference(rps_sock_flow_table); 3143 if (flow_table && sock_flow_table) { 3144 struct rps_dev_flow *rflow; 3145 u32 next_cpu; 3146 u32 ident; 3147 3148 /* First check into global flow table if there is a match */ 3149 ident = sock_flow_table->ents[hash & sock_flow_table->mask]; 3150 if ((ident ^ hash) & ~rps_cpu_mask) 3151 goto try_rps; 3152 3153 next_cpu = ident & rps_cpu_mask; 3154 3155 /* OK, now we know there is a match, 3156 * we can look at the local (per receive queue) flow table 3157 */ 3158 rflow = &flow_table->flows[hash & flow_table->mask]; 3159 tcpu = rflow->cpu; 3160 3161 /* 3162 * If the desired CPU (where last recvmsg was done) is 3163 * different from current CPU (one in the rx-queue flow 3164 * table entry), switch if one of the following holds: 3165 * - Current CPU is unset (equal to RPS_NO_CPU). 3166 * - Current CPU is offline. 3167 * - The current CPU's queue tail has advanced beyond the 3168 * last packet that was enqueued using this table entry. 3169 * This guarantees that all previous packets for the flow 3170 * have been dequeued, thus preserving in order delivery. 3171 */ 3172 if (unlikely(tcpu != next_cpu) && 3173 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) || 3174 ((int)(per_cpu(softnet_data, tcpu).input_queue_head - 3175 rflow->last_qtail)) >= 0)) { 3176 tcpu = next_cpu; 3177 rflow = set_rps_cpu(dev, skb, rflow, next_cpu); 3178 } 3179 3180 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) { 3181 *rflowp = rflow; 3182 cpu = tcpu; 3183 goto done; 3184 } 3185 } 3186 3187 try_rps: 3188 3189 if (map) { 3190 tcpu = map->cpus[reciprocal_scale(hash, map->len)]; 3191 if (cpu_online(tcpu)) { 3192 cpu = tcpu; 3193 goto done; 3194 } 3195 } 3196 3197 done: 3198 return cpu; 3199 } 3200 3201 #ifdef CONFIG_RFS_ACCEL 3202 3203 /** 3204 * rps_may_expire_flow - check whether an RFS hardware filter may be removed 3205 * @dev: Device on which the filter was set 3206 * @rxq_index: RX queue index 3207 * @flow_id: Flow ID passed to ndo_rx_flow_steer() 3208 * @filter_id: Filter ID returned by ndo_rx_flow_steer() 3209 * 3210 * Drivers that implement ndo_rx_flow_steer() should periodically call 3211 * this function for each installed filter and remove the filters for 3212 * which it returns %true. 
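 *
 * Hypothetical driver-side sketch (all "my_*" names are assumptions): the
 * scan usually runs from a periodic work item, e.g.
 *
 *	for (i = 0; i < my_nic->rfs_filter_count; i++) {
 *		struct my_rfs_filter *f = &my_nic->rfs_filters[i];
 *
 *		if (f->in_use &&
 *		    rps_may_expire_flow(my_nic->netdev, f->rxq_index,
 *					f->flow_id, i))
 *			my_nic_remove_rfs_filter(my_nic, i);
 *	}
 *
 * where the last argument is whatever filter ID the driver's
 * ndo_rx_flow_steer() returned when the filter was installed.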
3213 */ 3214 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, 3215 u32 flow_id, u16 filter_id) 3216 { 3217 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index; 3218 struct rps_dev_flow_table *flow_table; 3219 struct rps_dev_flow *rflow; 3220 bool expire = true; 3221 int cpu; 3222 3223 rcu_read_lock(); 3224 flow_table = rcu_dereference(rxqueue->rps_flow_table); 3225 if (flow_table && flow_id <= flow_table->mask) { 3226 rflow = &flow_table->flows[flow_id]; 3227 cpu = ACCESS_ONCE(rflow->cpu); 3228 if (rflow->filter == filter_id && cpu != RPS_NO_CPU && 3229 ((int)(per_cpu(softnet_data, cpu).input_queue_head - 3230 rflow->last_qtail) < 3231 (int)(10 * flow_table->mask))) 3232 expire = false; 3233 } 3234 rcu_read_unlock(); 3235 return expire; 3236 } 3237 EXPORT_SYMBOL(rps_may_expire_flow); 3238 3239 #endif /* CONFIG_RFS_ACCEL */ 3240 3241 /* Called from hardirq (IPI) context */ 3242 static void rps_trigger_softirq(void *data) 3243 { 3244 struct softnet_data *sd = data; 3245 3246 ____napi_schedule(sd, &sd->backlog); 3247 sd->received_rps++; 3248 } 3249 3250 #endif /* CONFIG_RPS */ 3251 3252 /* 3253 * Check if this softnet_data structure is another cpu one 3254 * If yes, queue it to our IPI list and return 1 3255 * If no, return 0 3256 */ 3257 static int rps_ipi_queued(struct softnet_data *sd) 3258 { 3259 #ifdef CONFIG_RPS 3260 struct softnet_data *mysd = this_cpu_ptr(&softnet_data); 3261 3262 if (sd != mysd) { 3263 sd->rps_ipi_next = mysd->rps_ipi_list; 3264 mysd->rps_ipi_list = sd; 3265 3266 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 3267 return 1; 3268 } 3269 #endif /* CONFIG_RPS */ 3270 return 0; 3271 } 3272 3273 #ifdef CONFIG_NET_FLOW_LIMIT 3274 int netdev_flow_limit_table_len __read_mostly = (1 << 12); 3275 #endif 3276 3277 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) 3278 { 3279 #ifdef CONFIG_NET_FLOW_LIMIT 3280 struct sd_flow_limit *fl; 3281 struct softnet_data *sd; 3282 unsigned int old_flow, new_flow; 3283 3284 if (qlen < (netdev_max_backlog >> 1)) 3285 return false; 3286 3287 sd = this_cpu_ptr(&softnet_data); 3288 3289 rcu_read_lock(); 3290 fl = rcu_dereference(sd->flow_limit); 3291 if (fl) { 3292 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1); 3293 old_flow = fl->history[fl->history_head]; 3294 fl->history[fl->history_head] = new_flow; 3295 3296 fl->history_head++; 3297 fl->history_head &= FLOW_LIMIT_HISTORY - 1; 3298 3299 if (likely(fl->buckets[old_flow])) 3300 fl->buckets[old_flow]--; 3301 3302 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) { 3303 fl->count++; 3304 rcu_read_unlock(); 3305 return true; 3306 } 3307 } 3308 rcu_read_unlock(); 3309 #endif 3310 return false; 3311 } 3312 3313 /* 3314 * enqueue_to_backlog is called to queue an skb to a per CPU backlog 3315 * queue (may be a remote CPU queue). 
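 *
 * Both the netif_rx() path and the RPS path funnel through here; for
 * instance netif_rx_internal() below ends up doing
 *
 *	ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
 *	put_cpu();
 *
 * when RPS picks no remote CPU. If the target CPU is remote, its backlog
 * NAPI is scheduled indirectly through the rps_ipi list (see
 * rps_ipi_queued() above) rather than immediately.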
3316 */ 3317 static int enqueue_to_backlog(struct sk_buff *skb, int cpu, 3318 unsigned int *qtail) 3319 { 3320 struct softnet_data *sd; 3321 unsigned long flags; 3322 unsigned int qlen; 3323 3324 sd = &per_cpu(softnet_data, cpu); 3325 3326 local_irq_save(flags); 3327 3328 rps_lock(sd); 3329 qlen = skb_queue_len(&sd->input_pkt_queue); 3330 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) { 3331 if (qlen) { 3332 enqueue: 3333 __skb_queue_tail(&sd->input_pkt_queue, skb); 3334 input_queue_tail_incr_save(sd, qtail); 3335 rps_unlock(sd); 3336 local_irq_restore(flags); 3337 return NET_RX_SUCCESS; 3338 } 3339 3340 /* Schedule NAPI for backlog device 3341 * We can use non atomic operation since we own the queue lock 3342 */ 3343 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) { 3344 if (!rps_ipi_queued(sd)) 3345 ____napi_schedule(sd, &sd->backlog); 3346 } 3347 goto enqueue; 3348 } 3349 3350 sd->dropped++; 3351 rps_unlock(sd); 3352 3353 local_irq_restore(flags); 3354 3355 atomic_long_inc(&skb->dev->rx_dropped); 3356 kfree_skb(skb); 3357 return NET_RX_DROP; 3358 } 3359 3360 static int netif_rx_internal(struct sk_buff *skb) 3361 { 3362 int ret; 3363 3364 net_timestamp_check(netdev_tstamp_prequeue, skb); 3365 3366 trace_netif_rx(skb); 3367 #ifdef CONFIG_RPS 3368 if (static_key_false(&rps_needed)) { 3369 struct rps_dev_flow voidflow, *rflow = &voidflow; 3370 int cpu; 3371 3372 preempt_disable(); 3373 rcu_read_lock(); 3374 3375 cpu = get_rps_cpu(skb->dev, skb, &rflow); 3376 if (cpu < 0) 3377 cpu = smp_processor_id(); 3378 3379 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 3380 3381 rcu_read_unlock(); 3382 preempt_enable(); 3383 } else 3384 #endif 3385 { 3386 unsigned int qtail; 3387 ret = enqueue_to_backlog(skb, get_cpu(), &qtail); 3388 put_cpu(); 3389 } 3390 return ret; 3391 } 3392 3393 /** 3394 * netif_rx - post buffer to the network code 3395 * @skb: buffer to post 3396 * 3397 * This function receives a packet from a device driver and queues it for 3398 * the upper (protocol) levels to process. It always succeeds. The buffer 3399 * may be dropped during processing for congestion control or by the 3400 * protocol layers. 
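 *
 * Classic usage from a hypothetical non-NAPI driver interrupt handler
 * (sketch only; error handling and the actual copy are elided):
 *
 *	skb = netdev_alloc_skb_ip_align(dev, pkt_len);
 *	if (!skb)
 *		return;
 *	... copy the received frame, then skb_put(skb, pkt_len) ...
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 *
 * NAPI drivers hand packets to the stack from their poll routine via
 * netif_receive_skb() or napi_gro_receive() instead.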
3401 * 3402 * return values: 3403 * NET_RX_SUCCESS (no congestion) 3404 * NET_RX_DROP (packet was dropped) 3405 * 3406 */ 3407 3408 int netif_rx(struct sk_buff *skb) 3409 { 3410 trace_netif_rx_entry(skb); 3411 3412 return netif_rx_internal(skb); 3413 } 3414 EXPORT_SYMBOL(netif_rx); 3415 3416 int netif_rx_ni(struct sk_buff *skb) 3417 { 3418 int err; 3419 3420 trace_netif_rx_ni_entry(skb); 3421 3422 preempt_disable(); 3423 err = netif_rx_internal(skb); 3424 if (local_softirq_pending()) 3425 do_softirq(); 3426 preempt_enable(); 3427 3428 return err; 3429 } 3430 EXPORT_SYMBOL(netif_rx_ni); 3431 3432 static void net_tx_action(struct softirq_action *h) 3433 { 3434 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 3435 3436 if (sd->completion_queue) { 3437 struct sk_buff *clist; 3438 3439 local_irq_disable(); 3440 clist = sd->completion_queue; 3441 sd->completion_queue = NULL; 3442 local_irq_enable(); 3443 3444 while (clist) { 3445 struct sk_buff *skb = clist; 3446 clist = clist->next; 3447 3448 WARN_ON(atomic_read(&skb->users)); 3449 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED)) 3450 trace_consume_skb(skb); 3451 else 3452 trace_kfree_skb(skb, net_tx_action); 3453 __kfree_skb(skb); 3454 } 3455 } 3456 3457 if (sd->output_queue) { 3458 struct Qdisc *head; 3459 3460 local_irq_disable(); 3461 head = sd->output_queue; 3462 sd->output_queue = NULL; 3463 sd->output_queue_tailp = &sd->output_queue; 3464 local_irq_enable(); 3465 3466 while (head) { 3467 struct Qdisc *q = head; 3468 spinlock_t *root_lock; 3469 3470 head = head->next_sched; 3471 3472 root_lock = qdisc_lock(q); 3473 if (spin_trylock(root_lock)) { 3474 smp_mb__before_atomic(); 3475 clear_bit(__QDISC_STATE_SCHED, 3476 &q->state); 3477 qdisc_run(q); 3478 spin_unlock(root_lock); 3479 } else { 3480 if (!test_bit(__QDISC_STATE_DEACTIVATED, 3481 &q->state)) { 3482 __netif_reschedule(q); 3483 } else { 3484 smp_mb__before_atomic(); 3485 clear_bit(__QDISC_STATE_SCHED, 3486 &q->state); 3487 } 3488 } 3489 } 3490 } 3491 } 3492 3493 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \ 3494 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)) 3495 /* This hook is defined here for ATM LANE */ 3496 int (*br_fdb_test_addr_hook)(struct net_device *dev, 3497 unsigned char *addr) __read_mostly; 3498 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); 3499 #endif 3500 3501 #ifdef CONFIG_NET_CLS_ACT 3502 /* TODO: Maybe we should just force sch_ingress to be compiled in 3503 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions 3504 * a compare and 2 stores extra right now if we dont have it on 3505 * but have CONFIG_NET_CLS_ACT 3506 * NOTE: This doesn't stop any functionality; if you dont have 3507 * the ingress scheduler, you just can't add policies on ingress. 
3508 * 3509 */ 3510 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) 3511 { 3512 struct net_device *dev = skb->dev; 3513 u32 ttl = G_TC_RTTL(skb->tc_verd); 3514 int result = TC_ACT_OK; 3515 struct Qdisc *q; 3516 3517 if (unlikely(MAX_RED_LOOP < ttl++)) { 3518 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n", 3519 skb->skb_iif, dev->ifindex); 3520 return TC_ACT_SHOT; 3521 } 3522 3523 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); 3524 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); 3525 3526 q = rcu_dereference(rxq->qdisc); 3527 if (q != &noop_qdisc) { 3528 spin_lock(qdisc_lock(q)); 3529 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) 3530 result = qdisc_enqueue_root(skb, q); 3531 spin_unlock(qdisc_lock(q)); 3532 } 3533 3534 return result; 3535 } 3536 3537 static inline struct sk_buff *handle_ing(struct sk_buff *skb, 3538 struct packet_type **pt_prev, 3539 int *ret, struct net_device *orig_dev) 3540 { 3541 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); 3542 3543 if (!rxq || rcu_access_pointer(rxq->qdisc) == &noop_qdisc) 3544 goto out; 3545 3546 if (*pt_prev) { 3547 *ret = deliver_skb(skb, *pt_prev, orig_dev); 3548 *pt_prev = NULL; 3549 } 3550 3551 switch (ing_filter(skb, rxq)) { 3552 case TC_ACT_SHOT: 3553 case TC_ACT_STOLEN: 3554 kfree_skb(skb); 3555 return NULL; 3556 } 3557 3558 out: 3559 skb->tc_verd = 0; 3560 return skb; 3561 } 3562 #endif 3563 3564 /** 3565 * netdev_rx_handler_register - register receive handler 3566 * @dev: device to register a handler for 3567 * @rx_handler: receive handler to register 3568 * @rx_handler_data: data pointer that is used by rx handler 3569 * 3570 * Register a receive handler for a device. This handler will then be 3571 * called from __netif_receive_skb. A negative errno code is returned 3572 * on a failure. 3573 * 3574 * The caller must hold the rtnl_mutex. 3575 * 3576 * For a general description of rx_handler, see enum rx_handler_result. 3577 */ 3578 int netdev_rx_handler_register(struct net_device *dev, 3579 rx_handler_func_t *rx_handler, 3580 void *rx_handler_data) 3581 { 3582 ASSERT_RTNL(); 3583 3584 if (dev->rx_handler) 3585 return -EBUSY; 3586 3587 /* Note: rx_handler_data must be set before rx_handler */ 3588 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data); 3589 rcu_assign_pointer(dev->rx_handler, rx_handler); 3590 3591 return 0; 3592 } 3593 EXPORT_SYMBOL_GPL(netdev_rx_handler_register); 3594 3595 /** 3596 * netdev_rx_handler_unregister - unregister receive handler 3597 * @dev: device to unregister a handler from 3598 * 3599 * Unregister a receive handler from a device. 3600 * 3601 * The caller must hold the rtnl_mutex. 3602 */ 3603 void netdev_rx_handler_unregister(struct net_device *dev) 3604 { 3605 3606 ASSERT_RTNL(); 3607 RCU_INIT_POINTER(dev->rx_handler, NULL); 3608 /* a reader seeing a non NULL rx_handler in a rcu_read_lock() 3609 * section has a guarantee to see a non NULL rx_handler_data 3610 * as well. 3611 */ 3612 synchronize_net(); 3613 RCU_INIT_POINTER(dev->rx_handler_data, NULL); 3614 } 3615 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); 3616 3617 /* 3618 * Limit the use of PFMEMALLOC reserves to those protocols that implement 3619 * the special handling of PFMEMALLOC skbs. 
3620 */ 3621 static bool skb_pfmemalloc_protocol(struct sk_buff *skb) 3622 { 3623 switch (skb->protocol) { 3624 case htons(ETH_P_ARP): 3625 case htons(ETH_P_IP): 3626 case htons(ETH_P_IPV6): 3627 case htons(ETH_P_8021Q): 3628 case htons(ETH_P_8021AD): 3629 return true; 3630 default: 3631 return false; 3632 } 3633 } 3634 3635 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) 3636 { 3637 struct packet_type *ptype, *pt_prev; 3638 rx_handler_func_t *rx_handler; 3639 struct net_device *orig_dev; 3640 bool deliver_exact = false; 3641 int ret = NET_RX_DROP; 3642 __be16 type; 3643 3644 net_timestamp_check(!netdev_tstamp_prequeue, skb); 3645 3646 trace_netif_receive_skb(skb); 3647 3648 orig_dev = skb->dev; 3649 3650 skb_reset_network_header(skb); 3651 if (!skb_transport_header_was_set(skb)) 3652 skb_reset_transport_header(skb); 3653 skb_reset_mac_len(skb); 3654 3655 pt_prev = NULL; 3656 3657 rcu_read_lock(); 3658 3659 another_round: 3660 skb->skb_iif = skb->dev->ifindex; 3661 3662 __this_cpu_inc(softnet_data.processed); 3663 3664 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) || 3665 skb->protocol == cpu_to_be16(ETH_P_8021AD)) { 3666 skb = skb_vlan_untag(skb); 3667 if (unlikely(!skb)) 3668 goto unlock; 3669 } 3670 3671 #ifdef CONFIG_NET_CLS_ACT 3672 if (skb->tc_verd & TC_NCLS) { 3673 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); 3674 goto ncls; 3675 } 3676 #endif 3677 3678 if (pfmemalloc) 3679 goto skip_taps; 3680 3681 list_for_each_entry_rcu(ptype, &ptype_all, list) { 3682 if (pt_prev) 3683 ret = deliver_skb(skb, pt_prev, orig_dev); 3684 pt_prev = ptype; 3685 } 3686 3687 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { 3688 if (pt_prev) 3689 ret = deliver_skb(skb, pt_prev, orig_dev); 3690 pt_prev = ptype; 3691 } 3692 3693 skip_taps: 3694 #ifdef CONFIG_NET_CLS_ACT 3695 skb = handle_ing(skb, &pt_prev, &ret, orig_dev); 3696 if (!skb) 3697 goto unlock; 3698 ncls: 3699 #endif 3700 3701 if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) 3702 goto drop; 3703 3704 if (skb_vlan_tag_present(skb)) { 3705 if (pt_prev) { 3706 ret = deliver_skb(skb, pt_prev, orig_dev); 3707 pt_prev = NULL; 3708 } 3709 if (vlan_do_receive(&skb)) 3710 goto another_round; 3711 else if (unlikely(!skb)) 3712 goto unlock; 3713 } 3714 3715 rx_handler = rcu_dereference(skb->dev->rx_handler); 3716 if (rx_handler) { 3717 if (pt_prev) { 3718 ret = deliver_skb(skb, pt_prev, orig_dev); 3719 pt_prev = NULL; 3720 } 3721 switch (rx_handler(&skb)) { 3722 case RX_HANDLER_CONSUMED: 3723 ret = NET_RX_SUCCESS; 3724 goto unlock; 3725 case RX_HANDLER_ANOTHER: 3726 goto another_round; 3727 case RX_HANDLER_EXACT: 3728 deliver_exact = true; 3729 case RX_HANDLER_PASS: 3730 break; 3731 default: 3732 BUG(); 3733 } 3734 } 3735 3736 if (unlikely(skb_vlan_tag_present(skb))) { 3737 if (skb_vlan_tag_get_id(skb)) 3738 skb->pkt_type = PACKET_OTHERHOST; 3739 /* Note: we might in the future use prio bits 3740 * and set skb->priority like in vlan_do_receive() 3741 * For the time being, just ignore Priority Code Point 3742 */ 3743 skb->vlan_tci = 0; 3744 } 3745 3746 type = skb->protocol; 3747 3748 /* deliver only exact match when indicated */ 3749 if (likely(!deliver_exact)) { 3750 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 3751 &ptype_base[ntohs(type) & 3752 PTYPE_HASH_MASK]); 3753 } 3754 3755 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 3756 &orig_dev->ptype_specific); 3757 3758 if (unlikely(skb->dev != orig_dev)) { 3759 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 3760 &skb->dev->ptype_specific); 3761 } 
3762 3763 if (pt_prev) { 3764 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC))) 3765 goto drop; 3766 else 3767 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 3768 } else { 3769 drop: 3770 atomic_long_inc(&skb->dev->rx_dropped); 3771 kfree_skb(skb); 3772 /* Jamal, now you will not able to escape explaining 3773 * me how you were going to use this. :-) 3774 */ 3775 ret = NET_RX_DROP; 3776 } 3777 3778 unlock: 3779 rcu_read_unlock(); 3780 return ret; 3781 } 3782 3783 static int __netif_receive_skb(struct sk_buff *skb) 3784 { 3785 int ret; 3786 3787 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { 3788 unsigned long pflags = current->flags; 3789 3790 /* 3791 * PFMEMALLOC skbs are special, they should 3792 * - be delivered to SOCK_MEMALLOC sockets only 3793 * - stay away from userspace 3794 * - have bounded memory usage 3795 * 3796 * Use PF_MEMALLOC as this saves us from propagating the allocation 3797 * context down to all allocation sites. 3798 */ 3799 current->flags |= PF_MEMALLOC; 3800 ret = __netif_receive_skb_core(skb, true); 3801 tsk_restore_flags(current, pflags, PF_MEMALLOC); 3802 } else 3803 ret = __netif_receive_skb_core(skb, false); 3804 3805 return ret; 3806 } 3807 3808 static int netif_receive_skb_internal(struct sk_buff *skb) 3809 { 3810 net_timestamp_check(netdev_tstamp_prequeue, skb); 3811 3812 if (skb_defer_rx_timestamp(skb)) 3813 return NET_RX_SUCCESS; 3814 3815 #ifdef CONFIG_RPS 3816 if (static_key_false(&rps_needed)) { 3817 struct rps_dev_flow voidflow, *rflow = &voidflow; 3818 int cpu, ret; 3819 3820 rcu_read_lock(); 3821 3822 cpu = get_rps_cpu(skb->dev, skb, &rflow); 3823 3824 if (cpu >= 0) { 3825 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); 3826 rcu_read_unlock(); 3827 return ret; 3828 } 3829 rcu_read_unlock(); 3830 } 3831 #endif 3832 return __netif_receive_skb(skb); 3833 } 3834 3835 /** 3836 * netif_receive_skb - process receive buffer from network 3837 * @skb: buffer to process 3838 * 3839 * netif_receive_skb() is the main receive data processing function. 3840 * It always succeeds. The buffer may be dropped during processing 3841 * for congestion control or by the protocol layers. 3842 * 3843 * This function may only be called from softirq context and interrupts 3844 * should be enabled. 3845 * 3846 * Return values (usually ignored): 3847 * NET_RX_SUCCESS: no congestion 3848 * NET_RX_DROP: packet was dropped 3849 */ 3850 int netif_receive_skb(struct sk_buff *skb) 3851 { 3852 trace_netif_receive_skb_entry(skb); 3853 3854 return netif_receive_skb_internal(skb); 3855 } 3856 EXPORT_SYMBOL(netif_receive_skb); 3857 3858 /* Network device is going away, flush any packets still pending 3859 * Called with irqs disabled. 
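 *
 * The unregister path runs this on every CPU (along the lines of
 * on_each_cpu(flush_backlog, dev, true)), so that no queued skb keeps a
 * reference to the device that is about to disappear.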
3860 */ 3861 static void flush_backlog(void *arg) 3862 { 3863 struct net_device *dev = arg; 3864 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 3865 struct sk_buff *skb, *tmp; 3866 3867 rps_lock(sd); 3868 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) { 3869 if (skb->dev == dev) { 3870 __skb_unlink(skb, &sd->input_pkt_queue); 3871 kfree_skb(skb); 3872 input_queue_head_incr(sd); 3873 } 3874 } 3875 rps_unlock(sd); 3876 3877 skb_queue_walk_safe(&sd->process_queue, skb, tmp) { 3878 if (skb->dev == dev) { 3879 __skb_unlink(skb, &sd->process_queue); 3880 kfree_skb(skb); 3881 input_queue_head_incr(sd); 3882 } 3883 } 3884 } 3885 3886 static int napi_gro_complete(struct sk_buff *skb) 3887 { 3888 struct packet_offload *ptype; 3889 __be16 type = skb->protocol; 3890 struct list_head *head = &offload_base; 3891 int err = -ENOENT; 3892 3893 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb)); 3894 3895 if (NAPI_GRO_CB(skb)->count == 1) { 3896 skb_shinfo(skb)->gso_size = 0; 3897 goto out; 3898 } 3899 3900 rcu_read_lock(); 3901 list_for_each_entry_rcu(ptype, head, list) { 3902 if (ptype->type != type || !ptype->callbacks.gro_complete) 3903 continue; 3904 3905 err = ptype->callbacks.gro_complete(skb, 0); 3906 break; 3907 } 3908 rcu_read_unlock(); 3909 3910 if (err) { 3911 WARN_ON(&ptype->list == head); 3912 kfree_skb(skb); 3913 return NET_RX_SUCCESS; 3914 } 3915 3916 out: 3917 return netif_receive_skb_internal(skb); 3918 } 3919 3920 /* napi->gro_list contains packets ordered by age. 3921 * youngest packets at the head of it. 3922 * Complete skbs in reverse order to reduce latencies. 3923 */ 3924 void napi_gro_flush(struct napi_struct *napi, bool flush_old) 3925 { 3926 struct sk_buff *skb, *prev = NULL; 3927 3928 /* scan list and build reverse chain */ 3929 for (skb = napi->gro_list; skb != NULL; skb = skb->next) { 3930 skb->prev = prev; 3931 prev = skb; 3932 } 3933 3934 for (skb = prev; skb; skb = prev) { 3935 skb->next = NULL; 3936 3937 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) 3938 return; 3939 3940 prev = skb->prev; 3941 napi_gro_complete(skb); 3942 napi->gro_count--; 3943 } 3944 3945 napi->gro_list = NULL; 3946 } 3947 EXPORT_SYMBOL(napi_gro_flush); 3948 3949 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb) 3950 { 3951 struct sk_buff *p; 3952 unsigned int maclen = skb->dev->hard_header_len; 3953 u32 hash = skb_get_hash_raw(skb); 3954 3955 for (p = napi->gro_list; p; p = p->next) { 3956 unsigned long diffs; 3957 3958 NAPI_GRO_CB(p)->flush = 0; 3959 3960 if (hash != skb_get_hash_raw(p)) { 3961 NAPI_GRO_CB(p)->same_flow = 0; 3962 continue; 3963 } 3964 3965 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; 3966 diffs |= p->vlan_tci ^ skb->vlan_tci; 3967 if (maclen == ETH_HLEN) 3968 diffs |= compare_ether_header(skb_mac_header(p), 3969 skb_mac_header(skb)); 3970 else if (!diffs) 3971 diffs = memcmp(skb_mac_header(p), 3972 skb_mac_header(skb), 3973 maclen); 3974 NAPI_GRO_CB(p)->same_flow = !diffs; 3975 } 3976 } 3977 3978 static void skb_gro_reset_offset(struct sk_buff *skb) 3979 { 3980 const struct skb_shared_info *pinfo = skb_shinfo(skb); 3981 const skb_frag_t *frag0 = &pinfo->frags[0]; 3982 3983 NAPI_GRO_CB(skb)->data_offset = 0; 3984 NAPI_GRO_CB(skb)->frag0 = NULL; 3985 NAPI_GRO_CB(skb)->frag0_len = 0; 3986 3987 if (skb_mac_header(skb) == skb_tail_pointer(skb) && 3988 pinfo->nr_frags && 3989 !PageHighMem(skb_frag_page(frag0))) { 3990 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); 3991 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); 3992 
} 3993 } 3994 3995 static void gro_pull_from_frag0(struct sk_buff *skb, int grow) 3996 { 3997 struct skb_shared_info *pinfo = skb_shinfo(skb); 3998 3999 BUG_ON(skb->end - skb->tail < grow); 4000 4001 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); 4002 4003 skb->data_len -= grow; 4004 skb->tail += grow; 4005 4006 pinfo->frags[0].page_offset += grow; 4007 skb_frag_size_sub(&pinfo->frags[0], grow); 4008 4009 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) { 4010 skb_frag_unref(skb, 0); 4011 memmove(pinfo->frags, pinfo->frags + 1, 4012 --pinfo->nr_frags * sizeof(pinfo->frags[0])); 4013 } 4014 } 4015 4016 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 4017 { 4018 struct sk_buff **pp = NULL; 4019 struct packet_offload *ptype; 4020 __be16 type = skb->protocol; 4021 struct list_head *head = &offload_base; 4022 int same_flow; 4023 enum gro_result ret; 4024 int grow; 4025 4026 if (!(skb->dev->features & NETIF_F_GRO)) 4027 goto normal; 4028 4029 if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad) 4030 goto normal; 4031 4032 gro_list_prepare(napi, skb); 4033 4034 rcu_read_lock(); 4035 list_for_each_entry_rcu(ptype, head, list) { 4036 if (ptype->type != type || !ptype->callbacks.gro_receive) 4037 continue; 4038 4039 skb_set_network_header(skb, skb_gro_offset(skb)); 4040 skb_reset_mac_len(skb); 4041 NAPI_GRO_CB(skb)->same_flow = 0; 4042 NAPI_GRO_CB(skb)->flush = 0; 4043 NAPI_GRO_CB(skb)->free = 0; 4044 NAPI_GRO_CB(skb)->udp_mark = 0; 4045 NAPI_GRO_CB(skb)->gro_remcsum_start = 0; 4046 4047 /* Setup for GRO checksum validation */ 4048 switch (skb->ip_summed) { 4049 case CHECKSUM_COMPLETE: 4050 NAPI_GRO_CB(skb)->csum = skb->csum; 4051 NAPI_GRO_CB(skb)->csum_valid = 1; 4052 NAPI_GRO_CB(skb)->csum_cnt = 0; 4053 break; 4054 case CHECKSUM_UNNECESSARY: 4055 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1; 4056 NAPI_GRO_CB(skb)->csum_valid = 0; 4057 break; 4058 default: 4059 NAPI_GRO_CB(skb)->csum_cnt = 0; 4060 NAPI_GRO_CB(skb)->csum_valid = 0; 4061 } 4062 4063 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb); 4064 break; 4065 } 4066 rcu_read_unlock(); 4067 4068 if (&ptype->list == head) 4069 goto normal; 4070 4071 same_flow = NAPI_GRO_CB(skb)->same_flow; 4072 ret = NAPI_GRO_CB(skb)->free ? 
GRO_MERGED_FREE : GRO_MERGED; 4073 4074 if (pp) { 4075 struct sk_buff *nskb = *pp; 4076 4077 *pp = nskb->next; 4078 nskb->next = NULL; 4079 napi_gro_complete(nskb); 4080 napi->gro_count--; 4081 } 4082 4083 if (same_flow) 4084 goto ok; 4085 4086 if (NAPI_GRO_CB(skb)->flush) 4087 goto normal; 4088 4089 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) { 4090 struct sk_buff *nskb = napi->gro_list; 4091 4092 /* locate the end of the list to select the 'oldest' flow */ 4093 while (nskb->next) { 4094 pp = &nskb->next; 4095 nskb = *pp; 4096 } 4097 *pp = NULL; 4098 nskb->next = NULL; 4099 napi_gro_complete(nskb); 4100 } else { 4101 napi->gro_count++; 4102 } 4103 NAPI_GRO_CB(skb)->count = 1; 4104 NAPI_GRO_CB(skb)->age = jiffies; 4105 NAPI_GRO_CB(skb)->last = skb; 4106 skb_shinfo(skb)->gso_size = skb_gro_len(skb); 4107 skb->next = napi->gro_list; 4108 napi->gro_list = skb; 4109 ret = GRO_HELD; 4110 4111 pull: 4112 grow = skb_gro_offset(skb) - skb_headlen(skb); 4113 if (grow > 0) 4114 gro_pull_from_frag0(skb, grow); 4115 ok: 4116 return ret; 4117 4118 normal: 4119 ret = GRO_NORMAL; 4120 goto pull; 4121 } 4122 4123 struct packet_offload *gro_find_receive_by_type(__be16 type) 4124 { 4125 struct list_head *offload_head = &offload_base; 4126 struct packet_offload *ptype; 4127 4128 list_for_each_entry_rcu(ptype, offload_head, list) { 4129 if (ptype->type != type || !ptype->callbacks.gro_receive) 4130 continue; 4131 return ptype; 4132 } 4133 return NULL; 4134 } 4135 EXPORT_SYMBOL(gro_find_receive_by_type); 4136 4137 struct packet_offload *gro_find_complete_by_type(__be16 type) 4138 { 4139 struct list_head *offload_head = &offload_base; 4140 struct packet_offload *ptype; 4141 4142 list_for_each_entry_rcu(ptype, offload_head, list) { 4143 if (ptype->type != type || !ptype->callbacks.gro_complete) 4144 continue; 4145 return ptype; 4146 } 4147 return NULL; 4148 } 4149 EXPORT_SYMBOL(gro_find_complete_by_type); 4150 4151 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) 4152 { 4153 switch (ret) { 4154 case GRO_NORMAL: 4155 if (netif_receive_skb_internal(skb)) 4156 ret = GRO_DROP; 4157 break; 4158 4159 case GRO_DROP: 4160 kfree_skb(skb); 4161 break; 4162 4163 case GRO_MERGED_FREE: 4164 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) 4165 kmem_cache_free(skbuff_head_cache, skb); 4166 else 4167 __kfree_skb(skb); 4168 break; 4169 4170 case GRO_HELD: 4171 case GRO_MERGED: 4172 break; 4173 } 4174 4175 return ret; 4176 } 4177 4178 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 4179 { 4180 trace_napi_gro_receive_entry(skb); 4181 4182 skb_gro_reset_offset(skb); 4183 4184 return napi_skb_finish(dev_gro_receive(napi, skb), skb); 4185 } 4186 EXPORT_SYMBOL(napi_gro_receive); 4187 4188 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) 4189 { 4190 if (unlikely(skb->pfmemalloc)) { 4191 consume_skb(skb); 4192 return; 4193 } 4194 __skb_pull(skb, skb_headlen(skb)); 4195 /* restore the reserve we had after netdev_alloc_skb_ip_align() */ 4196 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb)); 4197 skb->vlan_tci = 0; 4198 skb->dev = napi->dev; 4199 skb->skb_iif = 0; 4200 skb->encapsulation = 0; 4201 skb_shinfo(skb)->gso_type = 0; 4202 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); 4203 4204 napi->skb = skb; 4205 } 4206 4207 struct sk_buff *napi_get_frags(struct napi_struct *napi) 4208 { 4209 struct sk_buff *skb = napi->skb; 4210 4211 if (!skb) { 4212 skb = napi_alloc_skb(napi, GRO_MAX_HEAD); 4213 napi->skb = skb; 4214 } 4215 return 
skb; 4216 } 4217 EXPORT_SYMBOL(napi_get_frags); 4218 4219 static gro_result_t napi_frags_finish(struct napi_struct *napi, 4220 struct sk_buff *skb, 4221 gro_result_t ret) 4222 { 4223 switch (ret) { 4224 case GRO_NORMAL: 4225 case GRO_HELD: 4226 __skb_push(skb, ETH_HLEN); 4227 skb->protocol = eth_type_trans(skb, skb->dev); 4228 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb)) 4229 ret = GRO_DROP; 4230 break; 4231 4232 case GRO_DROP: 4233 case GRO_MERGED_FREE: 4234 napi_reuse_skb(napi, skb); 4235 break; 4236 4237 case GRO_MERGED: 4238 break; 4239 } 4240 4241 return ret; 4242 } 4243 4244 /* Upper GRO stack assumes network header starts at gro_offset=0 4245 * Drivers could call both napi_gro_frags() and napi_gro_receive() 4246 * We copy ethernet header into skb->data to have a common layout. 4247 */ 4248 static struct sk_buff *napi_frags_skb(struct napi_struct *napi) 4249 { 4250 struct sk_buff *skb = napi->skb; 4251 const struct ethhdr *eth; 4252 unsigned int hlen = sizeof(*eth); 4253 4254 napi->skb = NULL; 4255 4256 skb_reset_mac_header(skb); 4257 skb_gro_reset_offset(skb); 4258 4259 eth = skb_gro_header_fast(skb, 0); 4260 if (unlikely(skb_gro_header_hard(skb, hlen))) { 4261 eth = skb_gro_header_slow(skb, hlen, 0); 4262 if (unlikely(!eth)) { 4263 napi_reuse_skb(napi, skb); 4264 return NULL; 4265 } 4266 } else { 4267 gro_pull_from_frag0(skb, hlen); 4268 NAPI_GRO_CB(skb)->frag0 += hlen; 4269 NAPI_GRO_CB(skb)->frag0_len -= hlen; 4270 } 4271 __skb_pull(skb, hlen); 4272 4273 /* 4274 * This works because the only protocols we care about don't require 4275 * special handling. 4276 * We'll fix it up properly in napi_frags_finish() 4277 */ 4278 skb->protocol = eth->h_proto; 4279 4280 return skb; 4281 } 4282 4283 gro_result_t napi_gro_frags(struct napi_struct *napi) 4284 { 4285 struct sk_buff *skb = napi_frags_skb(napi); 4286 4287 if (!skb) 4288 return GRO_DROP; 4289 4290 trace_napi_gro_frags_entry(skb); 4291 4292 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb)); 4293 } 4294 EXPORT_SYMBOL(napi_gro_frags); 4295 4296 /* Compute the checksum from gro_offset and return the folded value 4297 * after adding in any pseudo checksum. 4298 */ 4299 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb) 4300 { 4301 __wsum wsum; 4302 __sum16 sum; 4303 4304 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0); 4305 4306 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */ 4307 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum)); 4308 if (likely(!sum)) { 4309 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && 4310 !skb->csum_complete_sw) 4311 netdev_rx_csum_fault(skb->dev); 4312 } 4313 4314 NAPI_GRO_CB(skb)->csum = wsum; 4315 NAPI_GRO_CB(skb)->csum_valid = 1; 4316 4317 return sum; 4318 } 4319 EXPORT_SYMBOL(__skb_gro_checksum_complete); 4320 4321 /* 4322 * net_rps_action_and_irq_enable sends any pending IPI's for rps. 4323 * Note: called with local irq disabled, but exits with local irq enabled. 4324 */ 4325 static void net_rps_action_and_irq_enable(struct softnet_data *sd) 4326 { 4327 #ifdef CONFIG_RPS 4328 struct softnet_data *remsd = sd->rps_ipi_list; 4329 4330 if (remsd) { 4331 sd->rps_ipi_list = NULL; 4332 4333 local_irq_enable(); 4334 4335 /* Send pending IPI's to kick RPS processing on remote cpus. 
*/ 4336 while (remsd) { 4337 struct softnet_data *next = remsd->rps_ipi_next; 4338 4339 if (cpu_online(remsd->cpu)) 4340 smp_call_function_single_async(remsd->cpu, 4341 &remsd->csd); 4342 remsd = next; 4343 } 4344 } else 4345 #endif 4346 local_irq_enable(); 4347 } 4348 4349 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) 4350 { 4351 #ifdef CONFIG_RPS 4352 return sd->rps_ipi_list != NULL; 4353 #else 4354 return false; 4355 #endif 4356 } 4357 4358 static int process_backlog(struct napi_struct *napi, int quota) 4359 { 4360 int work = 0; 4361 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog); 4362 4363 /* Check if we have pending IPIs; it's better to send them now 4364 * rather than waiting for net_rx_action() to end. 4365 */ 4366 if (sd_has_rps_ipi_waiting(sd)) { 4367 local_irq_disable(); 4368 net_rps_action_and_irq_enable(sd); 4369 } 4370 4371 napi->weight = weight_p; 4372 local_irq_disable(); 4373 while (1) { 4374 struct sk_buff *skb; 4375 4376 while ((skb = __skb_dequeue(&sd->process_queue))) { 4377 local_irq_enable(); 4378 __netif_receive_skb(skb); 4379 local_irq_disable(); 4380 input_queue_head_incr(sd); 4381 if (++work >= quota) { 4382 local_irq_enable(); 4383 return work; 4384 } 4385 } 4386 4387 rps_lock(sd); 4388 if (skb_queue_empty(&sd->input_pkt_queue)) { 4389 /* 4390 * Inline a custom version of __napi_complete(). 4391 * Only the current CPU owns and manipulates this NAPI, 4392 * and NAPI_STATE_SCHED is the only possible flag set 4393 * on the backlog. 4394 * We can use a plain write instead of clear_bit(), 4395 * and we don't need an smp_mb() memory barrier. 4396 */ 4397 napi->state = 0; 4398 rps_unlock(sd); 4399 4400 break; 4401 } 4402 4403 skb_queue_splice_tail_init(&sd->input_pkt_queue, 4404 &sd->process_queue); 4405 rps_unlock(sd); 4406 } 4407 local_irq_enable(); 4408 4409 return work; 4410 } 4411 4412 /** 4413 * __napi_schedule - schedule for receive 4414 * @n: entry to schedule 4415 * 4416 * The entry's receive function will be scheduled to run. 4417 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
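 *
 * Most drivers do not call this directly: they call napi_schedule(),
 * which uses napi_schedule_prep() to test NAPI_STATE_SCHED and then
 * ends up here.  A minimal sketch of the usual pattern in a driver's
 * hard interrupt handler (mydrv_irq(), mydrv_disable_rx_irq() and
 * struct mydrv_priv are hypothetical names, not from this file):
 *
 *	static irqreturn_t mydrv_irq(int irq, void *dev_id)
 *	{
 *		struct mydrv_priv *priv = dev_id;
 *
 *		mydrv_disable_rx_irq(priv);
 *		napi_schedule(&priv->napi);
 *		return IRQ_HANDLED;
 *	}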
4418 */ 4419 void __napi_schedule(struct napi_struct *n) 4420 { 4421 unsigned long flags; 4422 4423 local_irq_save(flags); 4424 ____napi_schedule(this_cpu_ptr(&softnet_data), n); 4425 local_irq_restore(flags); 4426 } 4427 EXPORT_SYMBOL(__napi_schedule); 4428 4429 /** 4430 * __napi_schedule_irqoff - schedule for receive 4431 * @n: entry to schedule 4432 * 4433 * Variant of __napi_schedule() assuming hard irqs are masked 4434 */ 4435 void __napi_schedule_irqoff(struct napi_struct *n) 4436 { 4437 ____napi_schedule(this_cpu_ptr(&softnet_data), n); 4438 } 4439 EXPORT_SYMBOL(__napi_schedule_irqoff); 4440 4441 void __napi_complete(struct napi_struct *n) 4442 { 4443 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); 4444 4445 list_del_init(&n->poll_list); 4446 smp_mb__before_atomic(); 4447 clear_bit(NAPI_STATE_SCHED, &n->state); 4448 } 4449 EXPORT_SYMBOL(__napi_complete); 4450 4451 void napi_complete_done(struct napi_struct *n, int work_done) 4452 { 4453 unsigned long flags; 4454 4455 /* 4456 * don't let napi dequeue from the cpu poll list 4457 * just in case its running on a different cpu 4458 */ 4459 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) 4460 return; 4461 4462 if (n->gro_list) { 4463 unsigned long timeout = 0; 4464 4465 if (work_done) 4466 timeout = n->dev->gro_flush_timeout; 4467 4468 if (timeout) 4469 hrtimer_start(&n->timer, ns_to_ktime(timeout), 4470 HRTIMER_MODE_REL_PINNED); 4471 else 4472 napi_gro_flush(n, false); 4473 } 4474 if (likely(list_empty(&n->poll_list))) { 4475 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state)); 4476 } else { 4477 /* If n->poll_list is not empty, we need to mask irqs */ 4478 local_irq_save(flags); 4479 __napi_complete(n); 4480 local_irq_restore(flags); 4481 } 4482 } 4483 EXPORT_SYMBOL(napi_complete_done); 4484 4485 /* must be called under rcu_read_lock(), as we dont take a reference */ 4486 struct napi_struct *napi_by_id(unsigned int napi_id) 4487 { 4488 unsigned int hash = napi_id % HASH_SIZE(napi_hash); 4489 struct napi_struct *napi; 4490 4491 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) 4492 if (napi->napi_id == napi_id) 4493 return napi; 4494 4495 return NULL; 4496 } 4497 EXPORT_SYMBOL_GPL(napi_by_id); 4498 4499 void napi_hash_add(struct napi_struct *napi) 4500 { 4501 if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) { 4502 4503 spin_lock(&napi_hash_lock); 4504 4505 /* 0 is not a valid id, we also skip an id that is taken 4506 * we expect both events to be extremely rare 4507 */ 4508 napi->napi_id = 0; 4509 while (!napi->napi_id) { 4510 napi->napi_id = ++napi_gen_id; 4511 if (napi_by_id(napi->napi_id)) 4512 napi->napi_id = 0; 4513 } 4514 4515 hlist_add_head_rcu(&napi->napi_hash_node, 4516 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]); 4517 4518 spin_unlock(&napi_hash_lock); 4519 } 4520 } 4521 EXPORT_SYMBOL_GPL(napi_hash_add); 4522 4523 /* Warning : caller is responsible to make sure rcu grace period 4524 * is respected before freeing memory containing @napi 4525 */ 4526 void napi_hash_del(struct napi_struct *napi) 4527 { 4528 spin_lock(&napi_hash_lock); 4529 4530 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) 4531 hlist_del_rcu(&napi->napi_hash_node); 4532 4533 spin_unlock(&napi_hash_lock); 4534 } 4535 EXPORT_SYMBOL_GPL(napi_hash_del); 4536 4537 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) 4538 { 4539 struct napi_struct *napi; 4540 4541 napi = container_of(timer, struct napi_struct, timer); 4542 if (napi->gro_list) 4543 napi_schedule(napi); 4544 4545 return HRTIMER_NORESTART; 
4546 } 4547 4548 void netif_napi_add(struct net_device *dev, struct napi_struct *napi, 4549 int (*poll)(struct napi_struct *, int), int weight) 4550 { 4551 INIT_LIST_HEAD(&napi->poll_list); 4552 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); 4553 napi->timer.function = napi_watchdog; 4554 napi->gro_count = 0; 4555 napi->gro_list = NULL; 4556 napi->skb = NULL; 4557 napi->poll = poll; 4558 if (weight > NAPI_POLL_WEIGHT) 4559 pr_err_once("netif_napi_add() called with weight %d on device %s\n", 4560 weight, dev->name); 4561 napi->weight = weight; 4562 list_add(&napi->dev_list, &dev->napi_list); 4563 napi->dev = dev; 4564 #ifdef CONFIG_NETPOLL 4565 spin_lock_init(&napi->poll_lock); 4566 napi->poll_owner = -1; 4567 #endif 4568 set_bit(NAPI_STATE_SCHED, &napi->state); 4569 } 4570 EXPORT_SYMBOL(netif_napi_add); 4571 4572 void napi_disable(struct napi_struct *n) 4573 { 4574 might_sleep(); 4575 set_bit(NAPI_STATE_DISABLE, &n->state); 4576 4577 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state)) 4578 msleep(1); 4579 4580 hrtimer_cancel(&n->timer); 4581 4582 clear_bit(NAPI_STATE_DISABLE, &n->state); 4583 } 4584 EXPORT_SYMBOL(napi_disable); 4585 4586 void netif_napi_del(struct napi_struct *napi) 4587 { 4588 list_del_init(&napi->dev_list); 4589 napi_free_frags(napi); 4590 4591 kfree_skb_list(napi->gro_list); 4592 napi->gro_list = NULL; 4593 napi->gro_count = 0; 4594 } 4595 EXPORT_SYMBOL(netif_napi_del); 4596 4597 static int napi_poll(struct napi_struct *n, struct list_head *repoll) 4598 { 4599 void *have; 4600 int work, weight; 4601 4602 list_del_init(&n->poll_list); 4603 4604 have = netpoll_poll_lock(n); 4605 4606 weight = n->weight; 4607 4608 /* This NAPI_STATE_SCHED test is for avoiding a race 4609 * with netpoll's poll_napi(). Only the entity which 4610 * obtains the lock and sees NAPI_STATE_SCHED set will 4611 * actually make the ->poll() call. Therefore we avoid 4612 * accidentally calling ->poll() when NAPI is not scheduled. 4613 */ 4614 work = 0; 4615 if (test_bit(NAPI_STATE_SCHED, &n->state)) { 4616 work = n->poll(n, weight); 4617 trace_napi_poll(n); 4618 } 4619 4620 WARN_ON_ONCE(work > weight); 4621 4622 if (likely(work < weight)) 4623 goto out_unlock; 4624 4625 /* Drivers must not modify the NAPI state if they 4626 * consume the entire weight. In such cases this code 4627 * still "owns" the NAPI instance and therefore can 4628 * move the instance around on the list at-will. 4629 */ 4630 if (unlikely(napi_disable_pending(n))) { 4631 napi_complete(n); 4632 goto out_unlock; 4633 } 4634 4635 if (n->gro_list) { 4636 /* flush too old packets 4637 * If HZ < 1000, flush all packets. 4638 */ 4639 napi_gro_flush(n, HZ >= 1000); 4640 } 4641 4642 /* Some drivers may have called napi_schedule 4643 * prior to exhausting their budget. 4644 */ 4645 if (unlikely(!list_empty(&n->poll_list))) { 4646 pr_warn_once("%s: Budget exhausted after napi rescheduled\n", 4647 n->dev ? 
n->dev->name : "backlog"); 4648 goto out_unlock; 4649 } 4650 4651 list_add_tail(&n->poll_list, repoll); 4652 4653 out_unlock: 4654 netpoll_poll_unlock(have); 4655 4656 return work; 4657 } 4658 4659 static void net_rx_action(struct softirq_action *h) 4660 { 4661 struct softnet_data *sd = this_cpu_ptr(&softnet_data); 4662 unsigned long time_limit = jiffies + 2; 4663 int budget = netdev_budget; 4664 LIST_HEAD(list); 4665 LIST_HEAD(repoll); 4666 4667 local_irq_disable(); 4668 list_splice_init(&sd->poll_list, &list); 4669 local_irq_enable(); 4670 4671 for (;;) { 4672 struct napi_struct *n; 4673 4674 if (list_empty(&list)) { 4675 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll)) 4676 return; 4677 break; 4678 } 4679 4680 n = list_first_entry(&list, struct napi_struct, poll_list); 4681 budget -= napi_poll(n, &repoll); 4682 4683 /* If softirq window is exhausted then punt. 4684 * Allow this to run for 2 jiffies since which will allow 4685 * an average latency of 1.5/HZ. 4686 */ 4687 if (unlikely(budget <= 0 || 4688 time_after_eq(jiffies, time_limit))) { 4689 sd->time_squeeze++; 4690 break; 4691 } 4692 } 4693 4694 local_irq_disable(); 4695 4696 list_splice_tail_init(&sd->poll_list, &list); 4697 list_splice_tail(&repoll, &list); 4698 list_splice(&list, &sd->poll_list); 4699 if (!list_empty(&sd->poll_list)) 4700 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 4701 4702 net_rps_action_and_irq_enable(sd); 4703 } 4704 4705 struct netdev_adjacent { 4706 struct net_device *dev; 4707 4708 /* upper master flag, there can only be one master device per list */ 4709 bool master; 4710 4711 /* counter for the number of times this device was added to us */ 4712 u16 ref_nr; 4713 4714 /* private field for the users */ 4715 void *private; 4716 4717 struct list_head list; 4718 struct rcu_head rcu; 4719 }; 4720 4721 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev, 4722 struct net_device *adj_dev, 4723 struct list_head *adj_list) 4724 { 4725 struct netdev_adjacent *adj; 4726 4727 list_for_each_entry(adj, adj_list, list) { 4728 if (adj->dev == adj_dev) 4729 return adj; 4730 } 4731 return NULL; 4732 } 4733 4734 /** 4735 * netdev_has_upper_dev - Check if device is linked to an upper device 4736 * @dev: device 4737 * @upper_dev: upper device to check 4738 * 4739 * Find out if a device is linked to specified upper device and return true 4740 * in case it is. Note that this checks only immediate upper device, 4741 * not through a complete stack of devices. The caller must hold the RTNL lock. 4742 */ 4743 bool netdev_has_upper_dev(struct net_device *dev, 4744 struct net_device *upper_dev) 4745 { 4746 ASSERT_RTNL(); 4747 4748 return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper); 4749 } 4750 EXPORT_SYMBOL(netdev_has_upper_dev); 4751 4752 /** 4753 * netdev_has_any_upper_dev - Check if device is linked to some device 4754 * @dev: device 4755 * 4756 * Find out if a device is linked to an upper device and return true in case 4757 * it is. The caller must hold the RTNL lock. 4758 */ 4759 static bool netdev_has_any_upper_dev(struct net_device *dev) 4760 { 4761 ASSERT_RTNL(); 4762 4763 return !list_empty(&dev->all_adj_list.upper); 4764 } 4765 4766 /** 4767 * netdev_master_upper_dev_get - Get master upper device 4768 * @dev: device 4769 * 4770 * Find a master upper device and return pointer to it or NULL in case 4771 * it's not there. The caller must hold the RTNL lock. 
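 *
 * Example (illustrative; "slave_dev" and "expected_master" are
 * hypothetical names):
 *
 *	struct net_device *master;
 *
 *	ASSERT_RTNL();
 *	master = netdev_master_upper_dev_get(slave_dev);
 *	if (master != expected_master)
 *		return -EINVAL;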
4772 */ 4773 struct net_device *netdev_master_upper_dev_get(struct net_device *dev) 4774 { 4775 struct netdev_adjacent *upper; 4776 4777 ASSERT_RTNL(); 4778 4779 if (list_empty(&dev->adj_list.upper)) 4780 return NULL; 4781 4782 upper = list_first_entry(&dev->adj_list.upper, 4783 struct netdev_adjacent, list); 4784 if (likely(upper->master)) 4785 return upper->dev; 4786 return NULL; 4787 } 4788 EXPORT_SYMBOL(netdev_master_upper_dev_get); 4789 4790 void *netdev_adjacent_get_private(struct list_head *adj_list) 4791 { 4792 struct netdev_adjacent *adj; 4793 4794 adj = list_entry(adj_list, struct netdev_adjacent, list); 4795 4796 return adj->private; 4797 } 4798 EXPORT_SYMBOL(netdev_adjacent_get_private); 4799 4800 /** 4801 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list 4802 * @dev: device 4803 * @iter: list_head ** of the current position 4804 * 4805 * Gets the next device from the dev's upper list, starting from iter 4806 * position. The caller must hold RCU read lock. 4807 */ 4808 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev, 4809 struct list_head **iter) 4810 { 4811 struct netdev_adjacent *upper; 4812 4813 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); 4814 4815 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 4816 4817 if (&upper->list == &dev->adj_list.upper) 4818 return NULL; 4819 4820 *iter = &upper->list; 4821 4822 return upper->dev; 4823 } 4824 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu); 4825 4826 /** 4827 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list 4828 * @dev: device 4829 * @iter: list_head ** of the current position 4830 * 4831 * Gets the next device from the dev's upper list, starting from iter 4832 * position. The caller must hold RCU read lock. 4833 */ 4834 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev, 4835 struct list_head **iter) 4836 { 4837 struct netdev_adjacent *upper; 4838 4839 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held()); 4840 4841 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 4842 4843 if (&upper->list == &dev->all_adj_list.upper) 4844 return NULL; 4845 4846 *iter = &upper->list; 4847 4848 return upper->dev; 4849 } 4850 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); 4851 4852 /** 4853 * netdev_lower_get_next_private - Get the next ->private from the 4854 * lower neighbour list 4855 * @dev: device 4856 * @iter: list_head ** of the current position 4857 * 4858 * Gets the next netdev_adjacent->private from the dev's lower neighbour 4859 * list, starting from iter position. The caller must hold either hold the 4860 * RTNL lock or its own locking that guarantees that the neighbour lower 4861 * list will remain unchainged. 4862 */ 4863 void *netdev_lower_get_next_private(struct net_device *dev, 4864 struct list_head **iter) 4865 { 4866 struct netdev_adjacent *lower; 4867 4868 lower = list_entry(*iter, struct netdev_adjacent, list); 4869 4870 if (&lower->list == &dev->adj_list.lower) 4871 return NULL; 4872 4873 *iter = lower->list.next; 4874 4875 return lower->private; 4876 } 4877 EXPORT_SYMBOL(netdev_lower_get_next_private); 4878 4879 /** 4880 * netdev_lower_get_next_private_rcu - Get the next ->private from the 4881 * lower neighbour list, RCU 4882 * variant 4883 * @dev: device 4884 * @iter: list_head ** of the current position 4885 * 4886 * Gets the next netdev_adjacent->private from the dev's lower neighbour 4887 * list, starting from iter position. The caller must hold RCU read lock. 
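 *
 * Example (illustrative) of walking all lower devices' private data;
 * use_lower_private() is a hypothetical consumer:
 *
 *	struct list_head *iter = &dev->adj_list.lower;
 *	void *priv;
 *
 *	rcu_read_lock();
 *	while ((priv = netdev_lower_get_next_private_rcu(dev, &iter)))
 *		use_lower_private(priv);
 *	rcu_read_unlock();
 *
 * The netdev_for_each_lower_private_rcu() helper wraps this pattern.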
4888 */ 4889 void *netdev_lower_get_next_private_rcu(struct net_device *dev, 4890 struct list_head **iter) 4891 { 4892 struct netdev_adjacent *lower; 4893 4894 WARN_ON_ONCE(!rcu_read_lock_held()); 4895 4896 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list); 4897 4898 if (&lower->list == &dev->adj_list.lower) 4899 return NULL; 4900 4901 *iter = &lower->list; 4902 4903 return lower->private; 4904 } 4905 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); 4906 4907 /** 4908 * netdev_lower_get_next - Get the next device from the lower neighbour 4909 * list 4910 * @dev: device 4911 * @iter: list_head ** of the current position 4912 * 4913 * Gets the next netdev_adjacent from the dev's lower neighbour 4914 * list, starting from iter position. The caller must hold RTNL lock or 4915 * its own locking that guarantees that the neighbour lower 4916 * list will remain unchainged. 4917 */ 4918 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) 4919 { 4920 struct netdev_adjacent *lower; 4921 4922 lower = list_entry((*iter)->next, struct netdev_adjacent, list); 4923 4924 if (&lower->list == &dev->adj_list.lower) 4925 return NULL; 4926 4927 *iter = &lower->list; 4928 4929 return lower->dev; 4930 } 4931 EXPORT_SYMBOL(netdev_lower_get_next); 4932 4933 /** 4934 * netdev_lower_get_first_private_rcu - Get the first ->private from the 4935 * lower neighbour list, RCU 4936 * variant 4937 * @dev: device 4938 * 4939 * Gets the first netdev_adjacent->private from the dev's lower neighbour 4940 * list. The caller must hold RCU read lock. 4941 */ 4942 void *netdev_lower_get_first_private_rcu(struct net_device *dev) 4943 { 4944 struct netdev_adjacent *lower; 4945 4946 lower = list_first_or_null_rcu(&dev->adj_list.lower, 4947 struct netdev_adjacent, list); 4948 if (lower) 4949 return lower->private; 4950 return NULL; 4951 } 4952 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu); 4953 4954 /** 4955 * netdev_master_upper_dev_get_rcu - Get master upper device 4956 * @dev: device 4957 * 4958 * Find a master upper device and return pointer to it or NULL in case 4959 * it's not there. The caller must hold the RCU read lock. 4960 */ 4961 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev) 4962 { 4963 struct netdev_adjacent *upper; 4964 4965 upper = list_first_or_null_rcu(&dev->adj_list.upper, 4966 struct netdev_adjacent, list); 4967 if (upper && likely(upper->master)) 4968 return upper->dev; 4969 return NULL; 4970 } 4971 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu); 4972 4973 static int netdev_adjacent_sysfs_add(struct net_device *dev, 4974 struct net_device *adj_dev, 4975 struct list_head *dev_list) 4976 { 4977 char linkname[IFNAMSIZ+7]; 4978 sprintf(linkname, dev_list == &dev->adj_list.upper ? 4979 "upper_%s" : "lower_%s", adj_dev->name); 4980 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj), 4981 linkname); 4982 } 4983 static void netdev_adjacent_sysfs_del(struct net_device *dev, 4984 char *name, 4985 struct list_head *dev_list) 4986 { 4987 char linkname[IFNAMSIZ+7]; 4988 sprintf(linkname, dev_list == &dev->adj_list.upper ? 
4989 "upper_%s" : "lower_%s", name); 4990 sysfs_remove_link(&(dev->dev.kobj), linkname); 4991 } 4992 4993 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev, 4994 struct net_device *adj_dev, 4995 struct list_head *dev_list) 4996 { 4997 return (dev_list == &dev->adj_list.upper || 4998 dev_list == &dev->adj_list.lower) && 4999 net_eq(dev_net(dev), dev_net(adj_dev)); 5000 } 5001 5002 static int __netdev_adjacent_dev_insert(struct net_device *dev, 5003 struct net_device *adj_dev, 5004 struct list_head *dev_list, 5005 void *private, bool master) 5006 { 5007 struct netdev_adjacent *adj; 5008 int ret; 5009 5010 adj = __netdev_find_adj(dev, adj_dev, dev_list); 5011 5012 if (adj) { 5013 adj->ref_nr++; 5014 return 0; 5015 } 5016 5017 adj = kmalloc(sizeof(*adj), GFP_KERNEL); 5018 if (!adj) 5019 return -ENOMEM; 5020 5021 adj->dev = adj_dev; 5022 adj->master = master; 5023 adj->ref_nr = 1; 5024 adj->private = private; 5025 dev_hold(adj_dev); 5026 5027 pr_debug("dev_hold for %s, because of link added from %s to %s\n", 5028 adj_dev->name, dev->name, adj_dev->name); 5029 5030 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) { 5031 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list); 5032 if (ret) 5033 goto free_adj; 5034 } 5035 5036 /* Ensure that master link is always the first item in list. */ 5037 if (master) { 5038 ret = sysfs_create_link(&(dev->dev.kobj), 5039 &(adj_dev->dev.kobj), "master"); 5040 if (ret) 5041 goto remove_symlinks; 5042 5043 list_add_rcu(&adj->list, dev_list); 5044 } else { 5045 list_add_tail_rcu(&adj->list, dev_list); 5046 } 5047 5048 return 0; 5049 5050 remove_symlinks: 5051 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) 5052 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); 5053 free_adj: 5054 kfree(adj); 5055 dev_put(adj_dev); 5056 5057 return ret; 5058 } 5059 5060 static void __netdev_adjacent_dev_remove(struct net_device *dev, 5061 struct net_device *adj_dev, 5062 struct list_head *dev_list) 5063 { 5064 struct netdev_adjacent *adj; 5065 5066 adj = __netdev_find_adj(dev, adj_dev, dev_list); 5067 5068 if (!adj) { 5069 pr_err("tried to remove device %s from %s\n", 5070 dev->name, adj_dev->name); 5071 BUG(); 5072 } 5073 5074 if (adj->ref_nr > 1) { 5075 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name, 5076 adj->ref_nr-1); 5077 adj->ref_nr--; 5078 return; 5079 } 5080 5081 if (adj->master) 5082 sysfs_remove_link(&(dev->dev.kobj), "master"); 5083 5084 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) 5085 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list); 5086 5087 list_del_rcu(&adj->list); 5088 pr_debug("dev_put for %s, because link removed from %s to %s\n", 5089 adj_dev->name, dev->name, adj_dev->name); 5090 dev_put(adj_dev); 5091 kfree_rcu(adj, rcu); 5092 } 5093 5094 static int __netdev_adjacent_dev_link_lists(struct net_device *dev, 5095 struct net_device *upper_dev, 5096 struct list_head *up_list, 5097 struct list_head *down_list, 5098 void *private, bool master) 5099 { 5100 int ret; 5101 5102 ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private, 5103 master); 5104 if (ret) 5105 return ret; 5106 5107 ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private, 5108 false); 5109 if (ret) { 5110 __netdev_adjacent_dev_remove(dev, upper_dev, up_list); 5111 return ret; 5112 } 5113 5114 return 0; 5115 } 5116 5117 static int __netdev_adjacent_dev_link(struct net_device *dev, 5118 struct net_device *upper_dev) 5119 { 5120 return __netdev_adjacent_dev_link_lists(dev, upper_dev, 5121 
&dev->all_adj_list.upper, 5122 &upper_dev->all_adj_list.lower, 5123 NULL, false); 5124 } 5125 5126 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev, 5127 struct net_device *upper_dev, 5128 struct list_head *up_list, 5129 struct list_head *down_list) 5130 { 5131 __netdev_adjacent_dev_remove(dev, upper_dev, up_list); 5132 __netdev_adjacent_dev_remove(upper_dev, dev, down_list); 5133 } 5134 5135 static void __netdev_adjacent_dev_unlink(struct net_device *dev, 5136 struct net_device *upper_dev) 5137 { 5138 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 5139 &dev->all_adj_list.upper, 5140 &upper_dev->all_adj_list.lower); 5141 } 5142 5143 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev, 5144 struct net_device *upper_dev, 5145 void *private, bool master) 5146 { 5147 int ret = __netdev_adjacent_dev_link(dev, upper_dev); 5148 5149 if (ret) 5150 return ret; 5151 5152 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 5153 &dev->adj_list.upper, 5154 &upper_dev->adj_list.lower, 5155 private, master); 5156 if (ret) { 5157 __netdev_adjacent_dev_unlink(dev, upper_dev); 5158 return ret; 5159 } 5160 5161 return 0; 5162 } 5163 5164 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev, 5165 struct net_device *upper_dev) 5166 { 5167 __netdev_adjacent_dev_unlink(dev, upper_dev); 5168 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 5169 &dev->adj_list.upper, 5170 &upper_dev->adj_list.lower); 5171 } 5172 5173 static int __netdev_upper_dev_link(struct net_device *dev, 5174 struct net_device *upper_dev, bool master, 5175 void *private) 5176 { 5177 struct netdev_adjacent *i, *j, *to_i, *to_j; 5178 int ret = 0; 5179 5180 ASSERT_RTNL(); 5181 5182 if (dev == upper_dev) 5183 return -EBUSY; 5184 5185 /* To prevent loops, check if dev is not upper device to upper_dev. */ 5186 if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper)) 5187 return -EBUSY; 5188 5189 if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper)) 5190 return -EEXIST; 5191 5192 if (master && netdev_master_upper_dev_get(dev)) 5193 return -EBUSY; 5194 5195 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, 5196 master); 5197 if (ret) 5198 return ret; 5199 5200 /* Now that we linked these devs, make all the upper_dev's 5201 * all_adj_list.upper visible to every dev's all_adj_list.lower and 5202 * vice versa, and don't forget the devices themselves. All of these 5203 * links are non-neighbours.
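 *
 * For example (illustrative): if eth0 is already a lower device of
 * bond0 and bond0 (dev) is now linked under vlan0 (upper_dev), the
 * loops below also add the non-neighbour pair eth0 <-> vlan0 to the
 * all_adj_list lists, so that loop detection and the "all" iterators
 * keep seeing the complete stack.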
5204 */ 5205 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5206 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { 5207 pr_debug("Interlinking %s with %s, non-neighbour\n", 5208 i->dev->name, j->dev->name); 5209 ret = __netdev_adjacent_dev_link(i->dev, j->dev); 5210 if (ret) 5211 goto rollback_mesh; 5212 } 5213 } 5214 5215 /* add dev to every upper_dev's upper device */ 5216 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { 5217 pr_debug("linking %s's upper device %s with %s\n", 5218 upper_dev->name, i->dev->name, dev->name); 5219 ret = __netdev_adjacent_dev_link(dev, i->dev); 5220 if (ret) 5221 goto rollback_upper_mesh; 5222 } 5223 5224 /* add upper_dev to every dev's lower device */ 5225 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5226 pr_debug("linking %s's lower device %s with %s\n", dev->name, 5227 i->dev->name, upper_dev->name); 5228 ret = __netdev_adjacent_dev_link(i->dev, upper_dev); 5229 if (ret) 5230 goto rollback_lower_mesh; 5231 } 5232 5233 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); 5234 return 0; 5235 5236 rollback_lower_mesh: 5237 to_i = i; 5238 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5239 if (i == to_i) 5240 break; 5241 __netdev_adjacent_dev_unlink(i->dev, upper_dev); 5242 } 5243 5244 i = NULL; 5245 5246 rollback_upper_mesh: 5247 to_i = i; 5248 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) { 5249 if (i == to_i) 5250 break; 5251 __netdev_adjacent_dev_unlink(dev, i->dev); 5252 } 5253 5254 i = j = NULL; 5255 5256 rollback_mesh: 5257 to_i = i; 5258 to_j = j; 5259 list_for_each_entry(i, &dev->all_adj_list.lower, list) { 5260 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) { 5261 if (i == to_i && j == to_j) 5262 break; 5263 __netdev_adjacent_dev_unlink(i->dev, j->dev); 5264 } 5265 if (i == to_i) 5266 break; 5267 } 5268 5269 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); 5270 5271 return ret; 5272 } 5273 5274 /** 5275 * netdev_upper_dev_link - Add a link to the upper device 5276 * @dev: device 5277 * @upper_dev: new upper device 5278 * 5279 * Adds a link to device which is upper to this one. The caller must hold 5280 * the RTNL lock. On a failure a negative errno code is returned. 5281 * On success the reference counts are adjusted and the function 5282 * returns zero. 5283 */ 5284 int netdev_upper_dev_link(struct net_device *dev, 5285 struct net_device *upper_dev) 5286 { 5287 return __netdev_upper_dev_link(dev, upper_dev, false, NULL); 5288 } 5289 EXPORT_SYMBOL(netdev_upper_dev_link); 5290 5291 /** 5292 * netdev_master_upper_dev_link - Add a master link to the upper device 5293 * @dev: device 5294 * @upper_dev: new upper device 5295 * 5296 * Adds a link to device which is upper to this one. In this case, only 5297 * one master upper device can be linked, although other non-master devices 5298 * might be linked as well. The caller must hold the RTNL lock. 5299 * On a failure a negative errno code is returned. On success the reference 5300 * counts are adjusted and the function returns zero. 
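 *
 * Example (illustrative): roughly how a bonding-style driver attaches
 * a slave; slave_dev, bond_dev and the error label are hypothetical:
 *
 *	ASSERT_RTNL();
 *	err = netdev_master_upper_dev_link(slave_dev, bond_dev);
 *	if (err)
 *		goto err_undo_enslave;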
5301 */ 5302 int netdev_master_upper_dev_link(struct net_device *dev, 5303 struct net_device *upper_dev) 5304 { 5305 return __netdev_upper_dev_link(dev, upper_dev, true, NULL); 5306 } 5307 EXPORT_SYMBOL(netdev_master_upper_dev_link); 5308 5309 int netdev_master_upper_dev_link_private(struct net_device *dev, 5310 struct net_device *upper_dev, 5311 void *private) 5312 { 5313 return __netdev_upper_dev_link(dev, upper_dev, true, private); 5314 } 5315 EXPORT_SYMBOL(netdev_master_upper_dev_link_private); 5316 5317 /** 5318 * netdev_upper_dev_unlink - Removes a link to upper device 5319 * @dev: device 5320 * @upper_dev: upper device to unlink 5321 * 5322 * Removes a link to a device which is upper to this one. The caller must hold 5323 * the RTNL lock. 5324 */ 5325 void netdev_upper_dev_unlink(struct net_device *dev, 5326 struct net_device *upper_dev) 5327 { 5328 struct netdev_adjacent *i, *j; 5329 ASSERT_RTNL(); 5330 5331 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); 5332 5333 /* Here is the tricky part. We must remove all dev's lower 5334 * devices from all upper_dev's upper devices and vice 5335 * versa, to maintain the graph relationship. 5336 */ 5337 list_for_each_entry(i, &dev->all_adj_list.lower, list) 5338 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) 5339 __netdev_adjacent_dev_unlink(i->dev, j->dev); 5340 5341 /* also remove the devices themselves from the lower/upper device 5342 * lists 5343 */ 5344 list_for_each_entry(i, &dev->all_adj_list.lower, list) 5345 __netdev_adjacent_dev_unlink(i->dev, upper_dev); 5346 5347 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) 5348 __netdev_adjacent_dev_unlink(dev, i->dev); 5349 5350 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); 5351 } 5352 EXPORT_SYMBOL(netdev_upper_dev_unlink); 5353 5354 /** 5355 * netdev_bonding_info_change - Dispatch event about slave change 5356 * @dev: device 5357 * @bonding_info: info to dispatch 5358 * 5359 * Send NETDEV_BONDING_INFO to netdev notifiers with info. 5360 * The caller must hold the RTNL lock.
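 *
 * Example (illustrative): a bonding-style driver publishing a slave
 * state change; the values assigned below are placeholders:
 *
 *	struct netdev_bonding_info binfo;
 *
 *	memset(&binfo, 0, sizeof(binfo));
 *	binfo.master.bond_mode = bond_mode;
 *	binfo.slave.state = slave_state;
 *	netdev_bonding_info_change(slave_dev, &binfo);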
5361 */ 5362 void netdev_bonding_info_change(struct net_device *dev, 5363 struct netdev_bonding_info *bonding_info) 5364 { 5365 struct netdev_notifier_bonding_info info; 5366 5367 memcpy(&info.bonding_info, bonding_info, 5368 sizeof(struct netdev_bonding_info)); 5369 call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev, 5370 &info.info); 5371 } 5372 EXPORT_SYMBOL(netdev_bonding_info_change); 5373 5374 static void netdev_adjacent_add_links(struct net_device *dev) 5375 { 5376 struct netdev_adjacent *iter; 5377 5378 struct net *net = dev_net(dev); 5379 5380 list_for_each_entry(iter, &dev->adj_list.upper, list) { 5381 if (!net_eq(net,dev_net(iter->dev))) 5382 continue; 5383 netdev_adjacent_sysfs_add(iter->dev, dev, 5384 &iter->dev->adj_list.lower); 5385 netdev_adjacent_sysfs_add(dev, iter->dev, 5386 &dev->adj_list.upper); 5387 } 5388 5389 list_for_each_entry(iter, &dev->adj_list.lower, list) { 5390 if (!net_eq(net,dev_net(iter->dev))) 5391 continue; 5392 netdev_adjacent_sysfs_add(iter->dev, dev, 5393 &iter->dev->adj_list.upper); 5394 netdev_adjacent_sysfs_add(dev, iter->dev, 5395 &dev->adj_list.lower); 5396 } 5397 } 5398 5399 static void netdev_adjacent_del_links(struct net_device *dev) 5400 { 5401 struct netdev_adjacent *iter; 5402 5403 struct net *net = dev_net(dev); 5404 5405 list_for_each_entry(iter, &dev->adj_list.upper, list) { 5406 if (!net_eq(net,dev_net(iter->dev))) 5407 continue; 5408 netdev_adjacent_sysfs_del(iter->dev, dev->name, 5409 &iter->dev->adj_list.lower); 5410 netdev_adjacent_sysfs_del(dev, iter->dev->name, 5411 &dev->adj_list.upper); 5412 } 5413 5414 list_for_each_entry(iter, &dev->adj_list.lower, list) { 5415 if (!net_eq(net,dev_net(iter->dev))) 5416 continue; 5417 netdev_adjacent_sysfs_del(iter->dev, dev->name, 5418 &iter->dev->adj_list.upper); 5419 netdev_adjacent_sysfs_del(dev, iter->dev->name, 5420 &dev->adj_list.lower); 5421 } 5422 } 5423 5424 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname) 5425 { 5426 struct netdev_adjacent *iter; 5427 5428 struct net *net = dev_net(dev); 5429 5430 list_for_each_entry(iter, &dev->adj_list.upper, list) { 5431 if (!net_eq(net,dev_net(iter->dev))) 5432 continue; 5433 netdev_adjacent_sysfs_del(iter->dev, oldname, 5434 &iter->dev->adj_list.lower); 5435 netdev_adjacent_sysfs_add(iter->dev, dev, 5436 &iter->dev->adj_list.lower); 5437 } 5438 5439 list_for_each_entry(iter, &dev->adj_list.lower, list) { 5440 if (!net_eq(net,dev_net(iter->dev))) 5441 continue; 5442 netdev_adjacent_sysfs_del(iter->dev, oldname, 5443 &iter->dev->adj_list.upper); 5444 netdev_adjacent_sysfs_add(iter->dev, dev, 5445 &iter->dev->adj_list.upper); 5446 } 5447 } 5448 5449 void *netdev_lower_dev_get_private(struct net_device *dev, 5450 struct net_device *lower_dev) 5451 { 5452 struct netdev_adjacent *lower; 5453 5454 if (!lower_dev) 5455 return NULL; 5456 lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower); 5457 if (!lower) 5458 return NULL; 5459 5460 return lower->private; 5461 } 5462 EXPORT_SYMBOL(netdev_lower_dev_get_private); 5463 5464 5465 int dev_get_nest_level(struct net_device *dev, 5466 bool (*type_check)(struct net_device *dev)) 5467 { 5468 struct net_device *lower = NULL; 5469 struct list_head *iter; 5470 int max_nest = -1; 5471 int nest; 5472 5473 ASSERT_RTNL(); 5474 5475 netdev_for_each_lower_dev(dev, lower, iter) { 5476 nest = dev_get_nest_level(lower, type_check); 5477 if (max_nest < nest) 5478 max_nest = nest; 5479 } 5480 5481 if (type_check(dev)) 5482 max_nest++; 5483 5484 return max_nest; 5485 } 5486 
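
/*
 * Illustrative sketch only (kept out of the build): a stacked virtual
 * device driver can use dev_get_nest_level() to compute a lockdep
 * nesting level when devices of its own kind are stacked on top of
 * each other.  is_myvirt_dev(), myvirt_netdev_ops, struct myvirt_priv
 * and lower_dev are hypothetical names, not part of this file.
 */
#if 0
static bool is_myvirt_dev(struct net_device *dev)
{
	return dev->netdev_ops == &myvirt_netdev_ops;
}

static void myvirt_set_nest_level(struct myvirt_priv *priv,
				  struct net_device *lower_dev)
{
	/* Under RTNL: nest one level deeper than the myvirt devices
	 * already stacked at or below lower_dev.
	 */
	priv->nest_level = dev_get_nest_level(lower_dev, is_myvirt_dev) + 1;
}
#endif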
EXPORT_SYMBOL(dev_get_nest_level); 5487 5488 static void dev_change_rx_flags(struct net_device *dev, int flags) 5489 { 5490 const struct net_device_ops *ops = dev->netdev_ops; 5491 5492 if (ops->ndo_change_rx_flags) 5493 ops->ndo_change_rx_flags(dev, flags); 5494 } 5495 5496 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) 5497 { 5498 unsigned int old_flags = dev->flags; 5499 kuid_t uid; 5500 kgid_t gid; 5501 5502 ASSERT_RTNL(); 5503 5504 dev->flags |= IFF_PROMISC; 5505 dev->promiscuity += inc; 5506 if (dev->promiscuity == 0) { 5507 /* 5508 * Avoid overflow. 5509 * If inc causes overflow, untouch promisc and return error. 5510 */ 5511 if (inc < 0) 5512 dev->flags &= ~IFF_PROMISC; 5513 else { 5514 dev->promiscuity -= inc; 5515 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n", 5516 dev->name); 5517 return -EOVERFLOW; 5518 } 5519 } 5520 if (dev->flags != old_flags) { 5521 pr_info("device %s %s promiscuous mode\n", 5522 dev->name, 5523 dev->flags & IFF_PROMISC ? "entered" : "left"); 5524 if (audit_enabled) { 5525 current_uid_gid(&uid, &gid); 5526 audit_log(current->audit_context, GFP_ATOMIC, 5527 AUDIT_ANOM_PROMISCUOUS, 5528 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", 5529 dev->name, (dev->flags & IFF_PROMISC), 5530 (old_flags & IFF_PROMISC), 5531 from_kuid(&init_user_ns, audit_get_loginuid(current)), 5532 from_kuid(&init_user_ns, uid), 5533 from_kgid(&init_user_ns, gid), 5534 audit_get_sessionid(current)); 5535 } 5536 5537 dev_change_rx_flags(dev, IFF_PROMISC); 5538 } 5539 if (notify) 5540 __dev_notify_flags(dev, old_flags, IFF_PROMISC); 5541 return 0; 5542 } 5543 5544 /** 5545 * dev_set_promiscuity - update promiscuity count on a device 5546 * @dev: device 5547 * @inc: modifier 5548 * 5549 * Add or remove promiscuity from a device. While the count in the device 5550 * remains above zero the interface remains promiscuous. Once it hits zero 5551 * the device reverts back to normal filtering operation. A negative inc 5552 * value is used to drop promiscuity on the device. 5553 * Return 0 if successful or a negative errno code on error. 5554 */ 5555 int dev_set_promiscuity(struct net_device *dev, int inc) 5556 { 5557 unsigned int old_flags = dev->flags; 5558 int err; 5559 5560 err = __dev_set_promiscuity(dev, inc, true); 5561 if (err < 0) 5562 return err; 5563 if (dev->flags != old_flags) 5564 dev_set_rx_mode(dev); 5565 return err; 5566 } 5567 EXPORT_SYMBOL(dev_set_promiscuity); 5568 5569 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) 5570 { 5571 unsigned int old_flags = dev->flags, old_gflags = dev->gflags; 5572 5573 ASSERT_RTNL(); 5574 5575 dev->flags |= IFF_ALLMULTI; 5576 dev->allmulti += inc; 5577 if (dev->allmulti == 0) { 5578 /* 5579 * Avoid overflow. 5580 * If inc causes overflow, untouch allmulti and return error. 5581 */ 5582 if (inc < 0) 5583 dev->flags &= ~IFF_ALLMULTI; 5584 else { 5585 dev->allmulti -= inc; 5586 pr_warn("%s: allmulti touches roof, set allmulti failed. 
allmulti feature of device might be broken.\n", 5587 dev->name); 5588 return -EOVERFLOW; 5589 } 5590 } 5591 if (dev->flags ^ old_flags) { 5592 dev_change_rx_flags(dev, IFF_ALLMULTI); 5593 dev_set_rx_mode(dev); 5594 if (notify) 5595 __dev_notify_flags(dev, old_flags, 5596 dev->gflags ^ old_gflags); 5597 } 5598 return 0; 5599 } 5600 5601 /** 5602 * dev_set_allmulti - update allmulti count on a device 5603 * @dev: device 5604 * @inc: modifier 5605 * 5606 * Add or remove reception of all multicast frames to a device. While the 5607 * count in the device remains above zero the interface remains listening 5608 * to all interfaces. Once it hits zero the device reverts back to normal 5609 * filtering operation. A negative @inc value is used to drop the counter 5610 * when releasing a resource needing all multicasts. 5611 * Return 0 if successful or a negative errno code on error. 5612 */ 5613 5614 int dev_set_allmulti(struct net_device *dev, int inc) 5615 { 5616 return __dev_set_allmulti(dev, inc, true); 5617 } 5618 EXPORT_SYMBOL(dev_set_allmulti); 5619 5620 /* 5621 * Upload unicast and multicast address lists to device and 5622 * configure RX filtering. When the device doesn't support unicast 5623 * filtering it is put in promiscuous mode while unicast addresses 5624 * are present. 5625 */ 5626 void __dev_set_rx_mode(struct net_device *dev) 5627 { 5628 const struct net_device_ops *ops = dev->netdev_ops; 5629 5630 /* dev_open will call this function so the list will stay sane. */ 5631 if (!(dev->flags&IFF_UP)) 5632 return; 5633 5634 if (!netif_device_present(dev)) 5635 return; 5636 5637 if (!(dev->priv_flags & IFF_UNICAST_FLT)) { 5638 /* Unicast addresses changes may only happen under the rtnl, 5639 * therefore calling __dev_set_promiscuity here is safe. 5640 */ 5641 if (!netdev_uc_empty(dev) && !dev->uc_promisc) { 5642 __dev_set_promiscuity(dev, 1, false); 5643 dev->uc_promisc = true; 5644 } else if (netdev_uc_empty(dev) && dev->uc_promisc) { 5645 __dev_set_promiscuity(dev, -1, false); 5646 dev->uc_promisc = false; 5647 } 5648 } 5649 5650 if (ops->ndo_set_rx_mode) 5651 ops->ndo_set_rx_mode(dev); 5652 } 5653 5654 void dev_set_rx_mode(struct net_device *dev) 5655 { 5656 netif_addr_lock_bh(dev); 5657 __dev_set_rx_mode(dev); 5658 netif_addr_unlock_bh(dev); 5659 } 5660 5661 /** 5662 * dev_get_flags - get flags reported to userspace 5663 * @dev: device 5664 * 5665 * Get the combination of flag bits exported through APIs to userspace. 5666 */ 5667 unsigned int dev_get_flags(const struct net_device *dev) 5668 { 5669 unsigned int flags; 5670 5671 flags = (dev->flags & ~(IFF_PROMISC | 5672 IFF_ALLMULTI | 5673 IFF_RUNNING | 5674 IFF_LOWER_UP | 5675 IFF_DORMANT)) | 5676 (dev->gflags & (IFF_PROMISC | 5677 IFF_ALLMULTI)); 5678 5679 if (netif_running(dev)) { 5680 if (netif_oper_up(dev)) 5681 flags |= IFF_RUNNING; 5682 if (netif_carrier_ok(dev)) 5683 flags |= IFF_LOWER_UP; 5684 if (netif_dormant(dev)) 5685 flags |= IFF_DORMANT; 5686 } 5687 5688 return flags; 5689 } 5690 EXPORT_SYMBOL(dev_get_flags); 5691 5692 int __dev_change_flags(struct net_device *dev, unsigned int flags) 5693 { 5694 unsigned int old_flags = dev->flags; 5695 int ret; 5696 5697 ASSERT_RTNL(); 5698 5699 /* 5700 * Set the flags on our device. 
5701 */ 5702 5703 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | 5704 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | 5705 IFF_AUTOMEDIA)) | 5706 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | 5707 IFF_ALLMULTI)); 5708 5709 /* 5710 * Load in the correct multicast list now the flags have changed. 5711 */ 5712 5713 if ((old_flags ^ flags) & IFF_MULTICAST) 5714 dev_change_rx_flags(dev, IFF_MULTICAST); 5715 5716 dev_set_rx_mode(dev); 5717 5718 /* 5719 * Have we downed the interface. We handle IFF_UP ourselves 5720 * according to user attempts to set it, rather than blindly 5721 * setting it. 5722 */ 5723 5724 ret = 0; 5725 if ((old_flags ^ flags) & IFF_UP) 5726 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev); 5727 5728 if ((flags ^ dev->gflags) & IFF_PROMISC) { 5729 int inc = (flags & IFF_PROMISC) ? 1 : -1; 5730 unsigned int old_flags = dev->flags; 5731 5732 dev->gflags ^= IFF_PROMISC; 5733 5734 if (__dev_set_promiscuity(dev, inc, false) >= 0) 5735 if (dev->flags != old_flags) 5736 dev_set_rx_mode(dev); 5737 } 5738 5739 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI 5740 is important. Some (broken) drivers set IFF_PROMISC, when 5741 IFF_ALLMULTI is requested not asking us and not reporting. 5742 */ 5743 if ((flags ^ dev->gflags) & IFF_ALLMULTI) { 5744 int inc = (flags & IFF_ALLMULTI) ? 1 : -1; 5745 5746 dev->gflags ^= IFF_ALLMULTI; 5747 __dev_set_allmulti(dev, inc, false); 5748 } 5749 5750 return ret; 5751 } 5752 5753 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, 5754 unsigned int gchanges) 5755 { 5756 unsigned int changes = dev->flags ^ old_flags; 5757 5758 if (gchanges) 5759 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); 5760 5761 if (changes & IFF_UP) { 5762 if (dev->flags & IFF_UP) 5763 call_netdevice_notifiers(NETDEV_UP, dev); 5764 else 5765 call_netdevice_notifiers(NETDEV_DOWN, dev); 5766 } 5767 5768 if (dev->flags & IFF_UP && 5769 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) { 5770 struct netdev_notifier_change_info change_info; 5771 5772 change_info.flags_changed = changes; 5773 call_netdevice_notifiers_info(NETDEV_CHANGE, dev, 5774 &change_info.info); 5775 } 5776 } 5777 5778 /** 5779 * dev_change_flags - change device settings 5780 * @dev: device 5781 * @flags: device state flags 5782 * 5783 * Change settings on device based state flags. The flags are 5784 * in the userspace exported format. 5785 */ 5786 int dev_change_flags(struct net_device *dev, unsigned int flags) 5787 { 5788 int ret; 5789 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags; 5790 5791 ret = __dev_change_flags(dev, flags); 5792 if (ret < 0) 5793 return ret; 5794 5795 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); 5796 __dev_notify_flags(dev, old_flags, changes); 5797 return ret; 5798 } 5799 EXPORT_SYMBOL(dev_change_flags); 5800 5801 static int __dev_set_mtu(struct net_device *dev, int new_mtu) 5802 { 5803 const struct net_device_ops *ops = dev->netdev_ops; 5804 5805 if (ops->ndo_change_mtu) 5806 return ops->ndo_change_mtu(dev, new_mtu); 5807 5808 dev->mtu = new_mtu; 5809 return 0; 5810 } 5811 5812 /** 5813 * dev_set_mtu - Change maximum transfer unit 5814 * @dev: device 5815 * @new_mtu: new transfer unit 5816 * 5817 * Change the maximum transfer size of the network device. 5818 */ 5819 int dev_set_mtu(struct net_device *dev, int new_mtu) 5820 { 5821 int err, orig_mtu; 5822 5823 if (new_mtu == dev->mtu) 5824 return 0; 5825 5826 /* MTU must be positive. 
*/ 5827 if (new_mtu < 0) 5828 return -EINVAL; 5829 5830 if (!netif_device_present(dev)) 5831 return -ENODEV; 5832 5833 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev); 5834 err = notifier_to_errno(err); 5835 if (err) 5836 return err; 5837 5838 orig_mtu = dev->mtu; 5839 err = __dev_set_mtu(dev, new_mtu); 5840 5841 if (!err) { 5842 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); 5843 err = notifier_to_errno(err); 5844 if (err) { 5845 /* setting mtu back and notifying everyone again, 5846 * so that they have a chance to revert changes. 5847 */ 5848 __dev_set_mtu(dev, orig_mtu); 5849 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); 5850 } 5851 } 5852 return err; 5853 } 5854 EXPORT_SYMBOL(dev_set_mtu); 5855 5856 /** 5857 * dev_set_group - Change group this device belongs to 5858 * @dev: device 5859 * @new_group: group this device should belong to 5860 */ 5861 void dev_set_group(struct net_device *dev, int new_group) 5862 { 5863 dev->group = new_group; 5864 } 5865 EXPORT_SYMBOL(dev_set_group); 5866 5867 /** 5868 * dev_set_mac_address - Change Media Access Control Address 5869 * @dev: device 5870 * @sa: new address 5871 * 5872 * Change the hardware (MAC) address of the device 5873 */ 5874 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) 5875 { 5876 const struct net_device_ops *ops = dev->netdev_ops; 5877 int err; 5878 5879 if (!ops->ndo_set_mac_address) 5880 return -EOPNOTSUPP; 5881 if (sa->sa_family != dev->type) 5882 return -EINVAL; 5883 if (!netif_device_present(dev)) 5884 return -ENODEV; 5885 err = ops->ndo_set_mac_address(dev, sa); 5886 if (err) 5887 return err; 5888 dev->addr_assign_type = NET_ADDR_SET; 5889 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 5890 add_device_randomness(dev->dev_addr, dev->addr_len); 5891 return 0; 5892 } 5893 EXPORT_SYMBOL(dev_set_mac_address); 5894 5895 /** 5896 * dev_change_carrier - Change device carrier 5897 * @dev: device 5898 * @new_carrier: new value 5899 * 5900 * Change device carrier 5901 */ 5902 int dev_change_carrier(struct net_device *dev, bool new_carrier) 5903 { 5904 const struct net_device_ops *ops = dev->netdev_ops; 5905 5906 if (!ops->ndo_change_carrier) 5907 return -EOPNOTSUPP; 5908 if (!netif_device_present(dev)) 5909 return -ENODEV; 5910 return ops->ndo_change_carrier(dev, new_carrier); 5911 } 5912 EXPORT_SYMBOL(dev_change_carrier); 5913 5914 /** 5915 * dev_get_phys_port_id - Get device physical port ID 5916 * @dev: device 5917 * @ppid: port ID 5918 * 5919 * Get device physical port ID 5920 */ 5921 int dev_get_phys_port_id(struct net_device *dev, 5922 struct netdev_phys_item_id *ppid) 5923 { 5924 const struct net_device_ops *ops = dev->netdev_ops; 5925 5926 if (!ops->ndo_get_phys_port_id) 5927 return -EOPNOTSUPP; 5928 return ops->ndo_get_phys_port_id(dev, ppid); 5929 } 5930 EXPORT_SYMBOL(dev_get_phys_port_id); 5931 5932 /** 5933 * dev_get_phys_port_name - Get device physical port name 5934 * @dev: device 5935 * @name: port name 5936 * 5937 * Get device physical port name 5938 */ 5939 int dev_get_phys_port_name(struct net_device *dev, 5940 char *name, size_t len) 5941 { 5942 const struct net_device_ops *ops = dev->netdev_ops; 5943 5944 if (!ops->ndo_get_phys_port_name) 5945 return -EOPNOTSUPP; 5946 return ops->ndo_get_phys_port_name(dev, name, len); 5947 } 5948 EXPORT_SYMBOL(dev_get_phys_port_name); 5949 5950 /** 5951 * dev_new_index - allocate an ifindex 5952 * @net: the applicable net namespace 5953 * 5954 * Returns a suitable unique value for a new device interface 5955 * number. 
The caller must hold the rtnl semaphore or the 5956 * dev_base_lock to be sure it remains unique. 5957 */ 5958 static int dev_new_index(struct net *net) 5959 { 5960 int ifindex = net->ifindex; 5961 for (;;) { 5962 if (++ifindex <= 0) 5963 ifindex = 1; 5964 if (!__dev_get_by_index(net, ifindex)) 5965 return net->ifindex = ifindex; 5966 } 5967 } 5968 5969 /* Delayed registration/unregisteration */ 5970 static LIST_HEAD(net_todo_list); 5971 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); 5972 5973 static void net_set_todo(struct net_device *dev) 5974 { 5975 list_add_tail(&dev->todo_list, &net_todo_list); 5976 dev_net(dev)->dev_unreg_count++; 5977 } 5978 5979 static void rollback_registered_many(struct list_head *head) 5980 { 5981 struct net_device *dev, *tmp; 5982 LIST_HEAD(close_head); 5983 5984 BUG_ON(dev_boot_phase); 5985 ASSERT_RTNL(); 5986 5987 list_for_each_entry_safe(dev, tmp, head, unreg_list) { 5988 /* Some devices call without registering 5989 * for initialization unwind. Remove those 5990 * devices and proceed with the remaining. 5991 */ 5992 if (dev->reg_state == NETREG_UNINITIALIZED) { 5993 pr_debug("unregister_netdevice: device %s/%p never was registered\n", 5994 dev->name, dev); 5995 5996 WARN_ON(1); 5997 list_del(&dev->unreg_list); 5998 continue; 5999 } 6000 dev->dismantle = true; 6001 BUG_ON(dev->reg_state != NETREG_REGISTERED); 6002 } 6003 6004 /* If device is running, close it first. */ 6005 list_for_each_entry(dev, head, unreg_list) 6006 list_add_tail(&dev->close_list, &close_head); 6007 dev_close_many(&close_head, true); 6008 6009 list_for_each_entry(dev, head, unreg_list) { 6010 /* And unlink it from device chain. */ 6011 unlist_netdevice(dev); 6012 6013 dev->reg_state = NETREG_UNREGISTERING; 6014 } 6015 6016 synchronize_net(); 6017 6018 list_for_each_entry(dev, head, unreg_list) { 6019 struct sk_buff *skb = NULL; 6020 6021 /* Shutdown queueing discipline. */ 6022 dev_shutdown(dev); 6023 6024 6025 /* Notify protocols, that we are about to destroy 6026 this device. They should clean all the things. 6027 */ 6028 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 6029 6030 if (!dev->rtnl_link_ops || 6031 dev->rtnl_link_state == RTNL_LINK_INITIALIZED) 6032 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 6033 GFP_KERNEL); 6034 6035 /* 6036 * Flush the unicast and multicast chains 6037 */ 6038 dev_uc_flush(dev); 6039 dev_mc_flush(dev); 6040 6041 if (dev->netdev_ops->ndo_uninit) 6042 dev->netdev_ops->ndo_uninit(dev); 6043 6044 if (skb) 6045 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); 6046 6047 /* Notifier chain MUST detach us all upper devices. 
*/ 6048 WARN_ON(netdev_has_any_upper_dev(dev)); 6049 6050 /* Remove entries from kobject tree */ 6051 netdev_unregister_kobject(dev); 6052 #ifdef CONFIG_XPS 6053 /* Remove XPS queueing entries */ 6054 netif_reset_xps_queues_gt(dev, 0); 6055 #endif 6056 } 6057 6058 synchronize_net(); 6059 6060 list_for_each_entry(dev, head, unreg_list) 6061 dev_put(dev); 6062 } 6063 6064 static void rollback_registered(struct net_device *dev) 6065 { 6066 LIST_HEAD(single); 6067 6068 list_add(&dev->unreg_list, &single); 6069 rollback_registered_many(&single); 6070 list_del(&single); 6071 } 6072 6073 static netdev_features_t netdev_fix_features(struct net_device *dev, 6074 netdev_features_t features) 6075 { 6076 /* Fix illegal checksum combinations */ 6077 if ((features & NETIF_F_HW_CSUM) && 6078 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 6079 netdev_warn(dev, "mixed HW and IP checksum settings.\n"); 6080 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); 6081 } 6082 6083 /* TSO requires that SG is present as well. */ 6084 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { 6085 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); 6086 features &= ~NETIF_F_ALL_TSO; 6087 } 6088 6089 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && 6090 !(features & NETIF_F_IP_CSUM)) { 6091 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); 6092 features &= ~NETIF_F_TSO; 6093 features &= ~NETIF_F_TSO_ECN; 6094 } 6095 6096 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && 6097 !(features & NETIF_F_IPV6_CSUM)) { 6098 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); 6099 features &= ~NETIF_F_TSO6; 6100 } 6101 6102 /* TSO ECN requires that TSO is present as well. */ 6103 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) 6104 features &= ~NETIF_F_TSO_ECN; 6105 6106 /* Software GSO depends on SG. */ 6107 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) { 6108 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n"); 6109 features &= ~NETIF_F_GSO; 6110 } 6111 6112 /* UFO needs SG and checksumming */ 6113 if (features & NETIF_F_UFO) { 6114 /* maybe split UFO into V4 and V6? 
*/ 6115 if (!((features & NETIF_F_GEN_CSUM) || 6116 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) 6117 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 6118 netdev_dbg(dev, 6119 "Dropping NETIF_F_UFO since no checksum offload features.\n"); 6120 features &= ~NETIF_F_UFO; 6121 } 6122 6123 if (!(features & NETIF_F_SG)) { 6124 netdev_dbg(dev, 6125 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n"); 6126 features &= ~NETIF_F_UFO; 6127 } 6128 } 6129 6130 #ifdef CONFIG_NET_RX_BUSY_POLL 6131 if (dev->netdev_ops->ndo_busy_poll) 6132 features |= NETIF_F_BUSY_POLL; 6133 else 6134 #endif 6135 features &= ~NETIF_F_BUSY_POLL; 6136 6137 return features; 6138 } 6139 6140 int __netdev_update_features(struct net_device *dev) 6141 { 6142 netdev_features_t features; 6143 int err = 0; 6144 6145 ASSERT_RTNL(); 6146 6147 features = netdev_get_wanted_features(dev); 6148 6149 if (dev->netdev_ops->ndo_fix_features) 6150 features = dev->netdev_ops->ndo_fix_features(dev, features); 6151 6152 /* driver might be less strict about feature dependencies */ 6153 features = netdev_fix_features(dev, features); 6154 6155 if (dev->features == features) 6156 return 0; 6157 6158 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", 6159 &dev->features, &features); 6160 6161 if (dev->netdev_ops->ndo_set_features) 6162 err = dev->netdev_ops->ndo_set_features(dev, features); 6163 6164 if (unlikely(err < 0)) { 6165 netdev_err(dev, 6166 "set_features() failed (%d); wanted %pNF, left %pNF\n", 6167 err, &features, &dev->features); 6168 return -1; 6169 } 6170 6171 if (!err) 6172 dev->features = features; 6173 6174 return 1; 6175 } 6176 6177 /** 6178 * netdev_update_features - recalculate device features 6179 * @dev: the device to check 6180 * 6181 * Recalculate dev->features set and send notifications if it 6182 * has changed. Should be called after driver or hardware dependent 6183 * conditions might have changed that influence the features. 6184 */ 6185 void netdev_update_features(struct net_device *dev) 6186 { 6187 if (__netdev_update_features(dev)) 6188 netdev_features_change(dev); 6189 } 6190 EXPORT_SYMBOL(netdev_update_features); 6191 6192 /** 6193 * netdev_change_features - recalculate device features 6194 * @dev: the device to check 6195 * 6196 * Recalculate dev->features set and send notifications even 6197 * if they have not changed. Should be called instead of 6198 * netdev_update_features() if also dev->vlan_features might 6199 * have changed to allow the changes to be propagated to stacked 6200 * VLAN devices. 6201 */ 6202 void netdev_change_features(struct net_device *dev) 6203 { 6204 __netdev_update_features(dev); 6205 netdev_features_change(dev); 6206 } 6207 EXPORT_SYMBOL(netdev_change_features); 6208 6209 /** 6210 * netif_stacked_transfer_operstate - transfer operstate 6211 * @rootdev: the root or lower level device to transfer state from 6212 * @dev: the device to transfer operstate to 6213 * 6214 * Transfer operational state from root to device. This is normally 6215 * called when a stacking relationship exists between the root 6216 * device and the device(a leaf device). 
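 *
 * Illustrative caller sketch (hypothetical stacked driver reacting to a
 * netdevice notifier event; "lowerdev" and "upperdev" are placeholders,
 * not defined in this file):
 *
 *	case NETDEV_CHANGE:
 *		netif_stacked_transfer_operstate(lowerdev, upperdev);
 *		break;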
6217 */ 6218 void netif_stacked_transfer_operstate(const struct net_device *rootdev, 6219 struct net_device *dev) 6220 { 6221 if (rootdev->operstate == IF_OPER_DORMANT) 6222 netif_dormant_on(dev); 6223 else 6224 netif_dormant_off(dev); 6225 6226 if (netif_carrier_ok(rootdev)) { 6227 if (!netif_carrier_ok(dev)) 6228 netif_carrier_on(dev); 6229 } else { 6230 if (netif_carrier_ok(dev)) 6231 netif_carrier_off(dev); 6232 } 6233 } 6234 EXPORT_SYMBOL(netif_stacked_transfer_operstate); 6235 6236 #ifdef CONFIG_SYSFS 6237 static int netif_alloc_rx_queues(struct net_device *dev) 6238 { 6239 unsigned int i, count = dev->num_rx_queues; 6240 struct netdev_rx_queue *rx; 6241 size_t sz = count * sizeof(*rx); 6242 6243 BUG_ON(count < 1); 6244 6245 rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 6246 if (!rx) { 6247 rx = vzalloc(sz); 6248 if (!rx) 6249 return -ENOMEM; 6250 } 6251 dev->_rx = rx; 6252 6253 for (i = 0; i < count; i++) 6254 rx[i].dev = dev; 6255 return 0; 6256 } 6257 #endif 6258 6259 static void netdev_init_one_queue(struct net_device *dev, 6260 struct netdev_queue *queue, void *_unused) 6261 { 6262 /* Initialize queue lock */ 6263 spin_lock_init(&queue->_xmit_lock); 6264 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); 6265 queue->xmit_lock_owner = -1; 6266 netdev_queue_numa_node_write(queue, NUMA_NO_NODE); 6267 queue->dev = dev; 6268 #ifdef CONFIG_BQL 6269 dql_init(&queue->dql, HZ); 6270 #endif 6271 } 6272 6273 static void netif_free_tx_queues(struct net_device *dev) 6274 { 6275 kvfree(dev->_tx); 6276 } 6277 6278 static int netif_alloc_netdev_queues(struct net_device *dev) 6279 { 6280 unsigned int count = dev->num_tx_queues; 6281 struct netdev_queue *tx; 6282 size_t sz = count * sizeof(*tx); 6283 6284 BUG_ON(count < 1 || count > 0xffff); 6285 6286 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 6287 if (!tx) { 6288 tx = vzalloc(sz); 6289 if (!tx) 6290 return -ENOMEM; 6291 } 6292 dev->_tx = tx; 6293 6294 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); 6295 spin_lock_init(&dev->tx_global_lock); 6296 6297 return 0; 6298 } 6299 6300 /** 6301 * register_netdevice - register a network device 6302 * @dev: device to register 6303 * 6304 * Take a completed network device structure and add it to the kernel 6305 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 6306 * chain. 0 is returned on success. A negative errno code is returned 6307 * on a failure to set up the device, or if the name is a duplicate. 6308 * 6309 * Callers must hold the rtnl semaphore. You may want 6310 * register_netdev() instead of this. 6311 * 6312 * BUGS: 6313 * The locking appears insufficient to guarantee two parallel registers 6314 * will not get the same name. 6315 */ 6316 6317 int register_netdevice(struct net_device *dev) 6318 { 6319 int ret; 6320 struct net *net = dev_net(dev); 6321 6322 BUG_ON(dev_boot_phase); 6323 ASSERT_RTNL(); 6324 6325 might_sleep(); 6326 6327 /* When net_device's are persistent, this will be fatal. 
*/ 6328 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); 6329 BUG_ON(!net); 6330 6331 spin_lock_init(&dev->addr_list_lock); 6332 netdev_set_addr_lockdep_class(dev); 6333 6334 dev->iflink = -1; 6335 6336 ret = dev_get_valid_name(net, dev, dev->name); 6337 if (ret < 0) 6338 goto out; 6339 6340 /* Init, if this function is available */ 6341 if (dev->netdev_ops->ndo_init) { 6342 ret = dev->netdev_ops->ndo_init(dev); 6343 if (ret) { 6344 if (ret > 0) 6345 ret = -EIO; 6346 goto out; 6347 } 6348 } 6349 6350 if (((dev->hw_features | dev->features) & 6351 NETIF_F_HW_VLAN_CTAG_FILTER) && 6352 (!dev->netdev_ops->ndo_vlan_rx_add_vid || 6353 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) { 6354 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n"); 6355 ret = -EINVAL; 6356 goto err_uninit; 6357 } 6358 6359 ret = -EBUSY; 6360 if (!dev->ifindex) 6361 dev->ifindex = dev_new_index(net); 6362 else if (__dev_get_by_index(net, dev->ifindex)) 6363 goto err_uninit; 6364 6365 if (dev_get_iflink(dev) == -1) 6366 dev->iflink = dev->ifindex; 6367 6368 /* Transfer changeable features to wanted_features and enable 6369 * software offloads (GSO and GRO). 6370 */ 6371 dev->hw_features |= NETIF_F_SOFT_FEATURES; 6372 dev->features |= NETIF_F_SOFT_FEATURES; 6373 dev->wanted_features = dev->features & dev->hw_features; 6374 6375 if (!(dev->flags & IFF_LOOPBACK)) { 6376 dev->hw_features |= NETIF_F_NOCACHE_COPY; 6377 } 6378 6379 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. 6380 */ 6381 dev->vlan_features |= NETIF_F_HIGHDMA; 6382 6383 /* Make NETIF_F_SG inheritable to tunnel devices. 6384 */ 6385 dev->hw_enc_features |= NETIF_F_SG; 6386 6387 /* Make NETIF_F_SG inheritable to MPLS. 6388 */ 6389 dev->mpls_features |= NETIF_F_SG; 6390 6391 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); 6392 ret = notifier_to_errno(ret); 6393 if (ret) 6394 goto err_uninit; 6395 6396 ret = netdev_register_kobject(dev); 6397 if (ret) 6398 goto err_uninit; 6399 dev->reg_state = NETREG_REGISTERED; 6400 6401 __netdev_update_features(dev); 6402 6403 /* 6404 * Default initial state at registry is that the 6405 * device is present. 6406 */ 6407 6408 set_bit(__LINK_STATE_PRESENT, &dev->state); 6409 6410 linkwatch_init_dev(dev); 6411 6412 dev_init_scheduler(dev); 6413 dev_hold(dev); 6414 list_netdevice(dev); 6415 add_device_randomness(dev->dev_addr, dev->addr_len); 6416 6417 /* If the device has permanent device address, driver should 6418 * set dev_addr and also addr_assign_type should be set to 6419 * NET_ADDR_PERM (default value). 6420 */ 6421 if (dev->addr_assign_type == NET_ADDR_PERM) 6422 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len); 6423 6424 /* Notify protocols, that a new device appeared. */ 6425 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); 6426 ret = notifier_to_errno(ret); 6427 if (ret) { 6428 rollback_registered(dev); 6429 dev->reg_state = NETREG_UNREGISTERED; 6430 } 6431 /* 6432 * Prevent userspace races by waiting until the network 6433 * device is fully setup before sending notifications. 
6434 */ 6435 if (!dev->rtnl_link_ops || 6436 dev->rtnl_link_state == RTNL_LINK_INITIALIZED) 6437 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); 6438 6439 out: 6440 return ret; 6441 6442 err_uninit: 6443 if (dev->netdev_ops->ndo_uninit) 6444 dev->netdev_ops->ndo_uninit(dev); 6445 goto out; 6446 } 6447 EXPORT_SYMBOL(register_netdevice); 6448 6449 /** 6450 * init_dummy_netdev - init a dummy network device for NAPI 6451 * @dev: device to init 6452 * 6453 * This takes a network device structure and initialize the minimum 6454 * amount of fields so it can be used to schedule NAPI polls without 6455 * registering a full blown interface. This is to be used by drivers 6456 * that need to tie several hardware interfaces to a single NAPI 6457 * poll scheduler due to HW limitations. 6458 */ 6459 int init_dummy_netdev(struct net_device *dev) 6460 { 6461 /* Clear everything. Note we don't initialize spinlocks 6462 * are they aren't supposed to be taken by any of the 6463 * NAPI code and this dummy netdev is supposed to be 6464 * only ever used for NAPI polls 6465 */ 6466 memset(dev, 0, sizeof(struct net_device)); 6467 6468 /* make sure we BUG if trying to hit standard 6469 * register/unregister code path 6470 */ 6471 dev->reg_state = NETREG_DUMMY; 6472 6473 /* NAPI wants this */ 6474 INIT_LIST_HEAD(&dev->napi_list); 6475 6476 /* a dummy interface is started by default */ 6477 set_bit(__LINK_STATE_PRESENT, &dev->state); 6478 set_bit(__LINK_STATE_START, &dev->state); 6479 6480 /* Note : We dont allocate pcpu_refcnt for dummy devices, 6481 * because users of this 'device' dont need to change 6482 * its refcount. 6483 */ 6484 6485 return 0; 6486 } 6487 EXPORT_SYMBOL_GPL(init_dummy_netdev); 6488 6489 6490 /** 6491 * register_netdev - register a network device 6492 * @dev: device to register 6493 * 6494 * Take a completed network device structure and add it to the kernel 6495 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 6496 * chain. 0 is returned on success. A negative errno code is returned 6497 * on a failure to set up the device, or if the name is a duplicate. 6498 * 6499 * This is a wrapper around register_netdevice that takes the rtnl semaphore 6500 * and expands the device name if you passed a format string to 6501 * alloc_netdev. 6502 */ 6503 int register_netdev(struct net_device *dev) 6504 { 6505 int err; 6506 6507 rtnl_lock(); 6508 err = register_netdevice(dev); 6509 rtnl_unlock(); 6510 return err; 6511 } 6512 EXPORT_SYMBOL(register_netdev); 6513 6514 int netdev_refcnt_read(const struct net_device *dev) 6515 { 6516 int i, refcnt = 0; 6517 6518 for_each_possible_cpu(i) 6519 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); 6520 return refcnt; 6521 } 6522 EXPORT_SYMBOL(netdev_refcnt_read); 6523 6524 /** 6525 * netdev_wait_allrefs - wait until all references are gone. 6526 * @dev: target net_device 6527 * 6528 * This is called when unregistering network devices. 6529 * 6530 * Any protocol or device that holds a reference should register 6531 * for netdevice notification, and cleanup and put back the 6532 * reference if they receive an UNREGISTER event. 6533 * We can get stuck here if buggy protocols don't correctly 6534 * call dev_put. 
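 *
 * A well-behaved holder therefore pairs the reference with an
 * UNREGISTER notifier, roughly as follows (illustrative sketch, not
 * code from this file):
 *
 *	dev_hold(dev);
 *	...
 *	case NETDEV_UNREGISTER:
 *		dev_put(dev);
 *		break;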
6535 */ 6536 static void netdev_wait_allrefs(struct net_device *dev) 6537 { 6538 unsigned long rebroadcast_time, warning_time; 6539 int refcnt; 6540 6541 linkwatch_forget_dev(dev); 6542 6543 rebroadcast_time = warning_time = jiffies; 6544 refcnt = netdev_refcnt_read(dev); 6545 6546 while (refcnt != 0) { 6547 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { 6548 rtnl_lock(); 6549 6550 /* Rebroadcast unregister notification */ 6551 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 6552 6553 __rtnl_unlock(); 6554 rcu_barrier(); 6555 rtnl_lock(); 6556 6557 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 6558 if (test_bit(__LINK_STATE_LINKWATCH_PENDING, 6559 &dev->state)) { 6560 /* We must not have linkwatch events 6561 * pending on unregister. If this 6562 * happens, we simply run the queue 6563 * unscheduled, resulting in a noop 6564 * for this device. 6565 */ 6566 linkwatch_run_queue(); 6567 } 6568 6569 __rtnl_unlock(); 6570 6571 rebroadcast_time = jiffies; 6572 } 6573 6574 msleep(250); 6575 6576 refcnt = netdev_refcnt_read(dev); 6577 6578 if (time_after(jiffies, warning_time + 10 * HZ)) { 6579 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", 6580 dev->name, refcnt); 6581 warning_time = jiffies; 6582 } 6583 } 6584 } 6585 6586 /* The sequence is: 6587 * 6588 * rtnl_lock(); 6589 * ... 6590 * register_netdevice(x1); 6591 * register_netdevice(x2); 6592 * ... 6593 * unregister_netdevice(y1); 6594 * unregister_netdevice(y2); 6595 * ... 6596 * rtnl_unlock(); 6597 * free_netdev(y1); 6598 * free_netdev(y2); 6599 * 6600 * We are invoked by rtnl_unlock(). 6601 * This allows us to deal with problems: 6602 * 1) We can delete sysfs objects which invoke hotplug 6603 * without deadlocking with linkwatch via keventd. 6604 * 2) Since we run with the RTNL semaphore not held, we can sleep 6605 * safely in order to wait for the netdev refcnt to drop to zero. 6606 * 6607 * We must not return until all unregister events added during 6608 * the interval the lock was held have been completed. 
6609 */ 6610 void netdev_run_todo(void) 6611 { 6612 struct list_head list; 6613 6614 /* Snapshot list, allow later requests */ 6615 list_replace_init(&net_todo_list, &list); 6616 6617 __rtnl_unlock(); 6618 6619 6620 /* Wait for rcu callbacks to finish before next phase */ 6621 if (!list_empty(&list)) 6622 rcu_barrier(); 6623 6624 while (!list_empty(&list)) { 6625 struct net_device *dev 6626 = list_first_entry(&list, struct net_device, todo_list); 6627 list_del(&dev->todo_list); 6628 6629 rtnl_lock(); 6630 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 6631 __rtnl_unlock(); 6632 6633 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { 6634 pr_err("network todo '%s' but state %d\n", 6635 dev->name, dev->reg_state); 6636 dump_stack(); 6637 continue; 6638 } 6639 6640 dev->reg_state = NETREG_UNREGISTERED; 6641 6642 on_each_cpu(flush_backlog, dev, 1); 6643 6644 netdev_wait_allrefs(dev); 6645 6646 /* paranoia */ 6647 BUG_ON(netdev_refcnt_read(dev)); 6648 BUG_ON(!list_empty(&dev->ptype_all)); 6649 BUG_ON(!list_empty(&dev->ptype_specific)); 6650 WARN_ON(rcu_access_pointer(dev->ip_ptr)); 6651 WARN_ON(rcu_access_pointer(dev->ip6_ptr)); 6652 WARN_ON(dev->dn_ptr); 6653 6654 if (dev->destructor) 6655 dev->destructor(dev); 6656 6657 /* Report a network device has been unregistered */ 6658 rtnl_lock(); 6659 dev_net(dev)->dev_unreg_count--; 6660 __rtnl_unlock(); 6661 wake_up(&netdev_unregistering_wq); 6662 6663 /* Free network device */ 6664 kobject_put(&dev->dev.kobj); 6665 } 6666 } 6667 6668 /* Convert net_device_stats to rtnl_link_stats64. They have the same 6669 * fields in the same order, with only the type differing. 6670 */ 6671 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, 6672 const struct net_device_stats *netdev_stats) 6673 { 6674 #if BITS_PER_LONG == 64 6675 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); 6676 memcpy(stats64, netdev_stats, sizeof(*stats64)); 6677 #else 6678 size_t i, n = sizeof(*stats64) / sizeof(u64); 6679 const unsigned long *src = (const unsigned long *)netdev_stats; 6680 u64 *dst = (u64 *)stats64; 6681 6682 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) != 6683 sizeof(*stats64) / sizeof(u64)); 6684 for (i = 0; i < n; i++) 6685 dst[i] = src[i]; 6686 #endif 6687 } 6688 EXPORT_SYMBOL(netdev_stats_to_stats64); 6689 6690 /** 6691 * dev_get_stats - get network device statistics 6692 * @dev: device to get statistics from 6693 * @storage: place to store stats 6694 * 6695 * Get network statistics from device. Return @storage. 6696 * The device driver may provide its own method by setting 6697 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats; 6698 * otherwise the internal statistics structure is used. 
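 *
 * Illustrative caller sketch (not code from this file):
 *
 *	struct rtnl_link_stats64 stats;
 *
 *	dev_get_stats(dev, &stats);
 *	pr_info("%s: rx %llu tx %llu\n", dev->name,
 *		(unsigned long long)stats.rx_packets,
 *		(unsigned long long)stats.tx_packets);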
6699 */ 6700 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, 6701 struct rtnl_link_stats64 *storage) 6702 { 6703 const struct net_device_ops *ops = dev->netdev_ops; 6704 6705 if (ops->ndo_get_stats64) { 6706 memset(storage, 0, sizeof(*storage)); 6707 ops->ndo_get_stats64(dev, storage); 6708 } else if (ops->ndo_get_stats) { 6709 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); 6710 } else { 6711 netdev_stats_to_stats64(storage, &dev->stats); 6712 } 6713 storage->rx_dropped += atomic_long_read(&dev->rx_dropped); 6714 storage->tx_dropped += atomic_long_read(&dev->tx_dropped); 6715 return storage; 6716 } 6717 EXPORT_SYMBOL(dev_get_stats); 6718 6719 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) 6720 { 6721 struct netdev_queue *queue = dev_ingress_queue(dev); 6722 6723 #ifdef CONFIG_NET_CLS_ACT 6724 if (queue) 6725 return queue; 6726 queue = kzalloc(sizeof(*queue), GFP_KERNEL); 6727 if (!queue) 6728 return NULL; 6729 netdev_init_one_queue(dev, queue, NULL); 6730 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc); 6731 queue->qdisc_sleeping = &noop_qdisc; 6732 rcu_assign_pointer(dev->ingress_queue, queue); 6733 #endif 6734 return queue; 6735 } 6736 6737 static const struct ethtool_ops default_ethtool_ops; 6738 6739 void netdev_set_default_ethtool_ops(struct net_device *dev, 6740 const struct ethtool_ops *ops) 6741 { 6742 if (dev->ethtool_ops == &default_ethtool_ops) 6743 dev->ethtool_ops = ops; 6744 } 6745 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops); 6746 6747 void netdev_freemem(struct net_device *dev) 6748 { 6749 char *addr = (char *)dev - dev->padded; 6750 6751 kvfree(addr); 6752 } 6753 6754 /** 6755 * alloc_netdev_mqs - allocate network device 6756 * @sizeof_priv: size of private data to allocate space for 6757 * @name: device name format string 6758 * @name_assign_type: origin of device name 6759 * @setup: callback to initialize device 6760 * @txqs: the number of TX subqueues to allocate 6761 * @rxqs: the number of RX subqueues to allocate 6762 * 6763 * Allocates a struct net_device with private data area for driver use 6764 * and performs basic initialization. Also allocates subqueue structs 6765 * for each queue on the device. 
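 *
 * Illustrative sketch of a caller (hypothetical driver; "my_setup" and
 * "struct my_priv" are placeholders, not defined in this file):
 *
 *	dev = alloc_netdev_mqs(sizeof(struct my_priv), "myeth%d",
 *			       NET_NAME_UNKNOWN, my_setup, 4, 4);
 *	if (!dev)
 *		return -ENOMEM;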
6766 */ 6767 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, 6768 unsigned char name_assign_type, 6769 void (*setup)(struct net_device *), 6770 unsigned int txqs, unsigned int rxqs) 6771 { 6772 struct net_device *dev; 6773 size_t alloc_size; 6774 struct net_device *p; 6775 6776 BUG_ON(strlen(name) >= sizeof(dev->name)); 6777 6778 if (txqs < 1) { 6779 pr_err("alloc_netdev: Unable to allocate device with zero queues\n"); 6780 return NULL; 6781 } 6782 6783 #ifdef CONFIG_SYSFS 6784 if (rxqs < 1) { 6785 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); 6786 return NULL; 6787 } 6788 #endif 6789 6790 alloc_size = sizeof(struct net_device); 6791 if (sizeof_priv) { 6792 /* ensure 32-byte alignment of private area */ 6793 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); 6794 alloc_size += sizeof_priv; 6795 } 6796 /* ensure 32-byte alignment of whole construct */ 6797 alloc_size += NETDEV_ALIGN - 1; 6798 6799 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); 6800 if (!p) 6801 p = vzalloc(alloc_size); 6802 if (!p) 6803 return NULL; 6804 6805 dev = PTR_ALIGN(p, NETDEV_ALIGN); 6806 dev->padded = (char *)dev - (char *)p; 6807 6808 dev->pcpu_refcnt = alloc_percpu(int); 6809 if (!dev->pcpu_refcnt) 6810 goto free_dev; 6811 6812 if (dev_addr_init(dev)) 6813 goto free_pcpu; 6814 6815 dev_mc_init(dev); 6816 dev_uc_init(dev); 6817 6818 dev_net_set(dev, &init_net); 6819 6820 dev->gso_max_size = GSO_MAX_SIZE; 6821 dev->gso_max_segs = GSO_MAX_SEGS; 6822 dev->gso_min_segs = 0; 6823 6824 INIT_LIST_HEAD(&dev->napi_list); 6825 INIT_LIST_HEAD(&dev->unreg_list); 6826 INIT_LIST_HEAD(&dev->close_list); 6827 INIT_LIST_HEAD(&dev->link_watch_list); 6828 INIT_LIST_HEAD(&dev->adj_list.upper); 6829 INIT_LIST_HEAD(&dev->adj_list.lower); 6830 INIT_LIST_HEAD(&dev->all_adj_list.upper); 6831 INIT_LIST_HEAD(&dev->all_adj_list.lower); 6832 INIT_LIST_HEAD(&dev->ptype_all); 6833 INIT_LIST_HEAD(&dev->ptype_specific); 6834 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; 6835 setup(dev); 6836 6837 dev->num_tx_queues = txqs; 6838 dev->real_num_tx_queues = txqs; 6839 if (netif_alloc_netdev_queues(dev)) 6840 goto free_all; 6841 6842 #ifdef CONFIG_SYSFS 6843 dev->num_rx_queues = rxqs; 6844 dev->real_num_rx_queues = rxqs; 6845 if (netif_alloc_rx_queues(dev)) 6846 goto free_all; 6847 #endif 6848 6849 strcpy(dev->name, name); 6850 dev->name_assign_type = name_assign_type; 6851 dev->group = INIT_NETDEV_GROUP; 6852 if (!dev->ethtool_ops) 6853 dev->ethtool_ops = &default_ethtool_ops; 6854 return dev; 6855 6856 free_all: 6857 free_netdev(dev); 6858 return NULL; 6859 6860 free_pcpu: 6861 free_percpu(dev->pcpu_refcnt); 6862 free_dev: 6863 netdev_freemem(dev); 6864 return NULL; 6865 } 6866 EXPORT_SYMBOL(alloc_netdev_mqs); 6867 6868 /** 6869 * free_netdev - free network device 6870 * @dev: device 6871 * 6872 * This function does the last stage of destroying an allocated device 6873 * interface. The reference to the device object is released. 6874 * If this is the last reference then it will be freed. 
6875 */ 6876 void free_netdev(struct net_device *dev) 6877 { 6878 struct napi_struct *p, *n; 6879 6880 netif_free_tx_queues(dev); 6881 #ifdef CONFIG_SYSFS 6882 kvfree(dev->_rx); 6883 #endif 6884 6885 kfree(rcu_dereference_protected(dev->ingress_queue, 1)); 6886 6887 /* Flush device addresses */ 6888 dev_addr_flush(dev); 6889 6890 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) 6891 netif_napi_del(p); 6892 6893 free_percpu(dev->pcpu_refcnt); 6894 dev->pcpu_refcnt = NULL; 6895 6896 /* Compatibility with error handling in drivers */ 6897 if (dev->reg_state == NETREG_UNINITIALIZED) { 6898 netdev_freemem(dev); 6899 return; 6900 } 6901 6902 BUG_ON(dev->reg_state != NETREG_UNREGISTERED); 6903 dev->reg_state = NETREG_RELEASED; 6904 6905 /* will free via device release */ 6906 put_device(&dev->dev); 6907 } 6908 EXPORT_SYMBOL(free_netdev); 6909 6910 /** 6911 * synchronize_net - Synchronize with packet receive processing 6912 * 6913 * Wait for packets currently being received to be done. 6914 * Does not block later packets from starting. 6915 */ 6916 void synchronize_net(void) 6917 { 6918 might_sleep(); 6919 if (rtnl_is_locked()) 6920 synchronize_rcu_expedited(); 6921 else 6922 synchronize_rcu(); 6923 } 6924 EXPORT_SYMBOL(synchronize_net); 6925 6926 /** 6927 * unregister_netdevice_queue - remove device from the kernel 6928 * @dev: device 6929 * @head: list 6930 * 6931 * This function shuts down a device interface and removes it 6932 * from the kernel tables. 6933 * If head not NULL, device is queued to be unregistered later. 6934 * 6935 * Callers must hold the rtnl semaphore. You may want 6936 * unregister_netdev() instead of this. 6937 */ 6938 6939 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) 6940 { 6941 ASSERT_RTNL(); 6942 6943 if (head) { 6944 list_move_tail(&dev->unreg_list, head); 6945 } else { 6946 rollback_registered(dev); 6947 /* Finish processing unregister after unlock */ 6948 net_set_todo(dev); 6949 } 6950 } 6951 EXPORT_SYMBOL(unregister_netdevice_queue); 6952 6953 /** 6954 * unregister_netdevice_many - unregister many devices 6955 * @head: list of devices 6956 * 6957 * Note: As most callers use a stack allocated list_head, 6958 * we force a list_del() to make sure stack wont be corrupted later. 6959 */ 6960 void unregister_netdevice_many(struct list_head *head) 6961 { 6962 struct net_device *dev; 6963 6964 if (!list_empty(head)) { 6965 rollback_registered_many(head); 6966 list_for_each_entry(dev, head, unreg_list) 6967 net_set_todo(dev); 6968 list_del(head); 6969 } 6970 } 6971 EXPORT_SYMBOL(unregister_netdevice_many); 6972 6973 /** 6974 * unregister_netdev - remove device from the kernel 6975 * @dev: device 6976 * 6977 * This function shuts down a device interface and removes it 6978 * from the kernel tables. 6979 * 6980 * This is just a wrapper for unregister_netdevice that takes 6981 * the rtnl semaphore. In general you want to use this and not 6982 * unregister_netdevice. 6983 */ 6984 void unregister_netdev(struct net_device *dev) 6985 { 6986 rtnl_lock(); 6987 unregister_netdevice(dev); 6988 rtnl_unlock(); 6989 } 6990 EXPORT_SYMBOL(unregister_netdev); 6991 6992 /** 6993 * dev_change_net_namespace - move device to different nethost namespace 6994 * @dev: device 6995 * @net: network namespace 6996 * @pat: If not NULL name pattern to try if the current device name 6997 * is already taken in the destination network namespace. 6998 * 6999 * This function shuts down a device interface and moves it 7000 * to a new network namespace. 
On success 0 is returned, on 7001 * a failure a negative errno code is returned. 7002 * 7003 * Callers must hold the rtnl semaphore. 7004 */ 7005 7006 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) 7007 { 7008 int err; 7009 7010 ASSERT_RTNL(); 7011 7012 /* Don't allow namespace local devices to be moved. */ 7013 err = -EINVAL; 7014 if (dev->features & NETIF_F_NETNS_LOCAL) 7015 goto out; 7016 7017 /* Ensure the device has been registered */ 7018 if (dev->reg_state != NETREG_REGISTERED) 7019 goto out; 7020 7021 /* Get out if there is nothing to do */ 7022 err = 0; 7023 if (net_eq(dev_net(dev), net)) 7024 goto out; 7025 7026 /* Pick the destination device name, and ensure 7027 * we can use it in the destination network namespace. 7028 */ 7029 err = -EEXIST; 7030 if (__dev_get_by_name(net, dev->name)) { 7031 /* We get here if we can't use the current device name */ 7032 if (!pat) 7033 goto out; 7034 if (dev_get_valid_name(net, dev, pat) < 0) 7035 goto out; 7036 } 7037 7038 /* 7039 * And now a mini version of register_netdevice and unregister_netdevice. 7040 */ 7041 7042 /* If device is running, close it first. */ 7043 dev_close(dev); 7044 7045 /* And unlink it from device chain */ 7046 err = -ENODEV; 7047 unlist_netdevice(dev); 7048 7049 synchronize_net(); 7050 7051 /* Shutdown queueing discipline. */ 7052 dev_shutdown(dev); 7053 7054 /* Notify protocols, that we are about to destroy 7055 this device. They should clean all the things. 7056 7057 Note that dev->reg_state stays at NETREG_REGISTERED. 7058 This is wanted because this way 8021q and macvlan know 7059 the device is just moving and can keep their slaves up. 7060 */ 7061 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 7062 rcu_barrier(); 7063 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); 7064 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL); 7065 7066 /* 7067 * Flush the unicast and multicast chains 7068 */ 7069 dev_uc_flush(dev); 7070 dev_mc_flush(dev); 7071 7072 /* Send a netdev-removed uevent to the old namespace */ 7073 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE); 7074 netdev_adjacent_del_links(dev); 7075 7076 /* Actually switch the network namespace */ 7077 dev_net_set(dev, net); 7078 7079 /* If there is an ifindex conflict assign a new one */ 7080 if (__dev_get_by_index(net, dev->ifindex)) { 7081 int iflink = (dev_get_iflink(dev) == dev->ifindex); 7082 dev->ifindex = dev_new_index(net); 7083 if (iflink) 7084 dev->iflink = dev->ifindex; 7085 } 7086 7087 /* Send a netdev-add uevent to the new namespace */ 7088 kobject_uevent(&dev->dev.kobj, KOBJ_ADD); 7089 netdev_adjacent_add_links(dev); 7090 7091 /* Fixup kobjects */ 7092 err = device_rename(&dev->dev, dev->name); 7093 WARN_ON(err); 7094 7095 /* Add the device back in the hashes */ 7096 list_netdevice(dev); 7097 7098 /* Notify protocols, that a new device appeared. */ 7099 call_netdevice_notifiers(NETDEV_REGISTER, dev); 7100 7101 /* 7102 * Prevent userspace races by waiting until the network 7103 * device is fully setup before sending notifications.
7104 */ 7105 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); 7106 7107 synchronize_net(); 7108 err = 0; 7109 out: 7110 return err; 7111 } 7112 EXPORT_SYMBOL_GPL(dev_change_net_namespace); 7113 7114 static int dev_cpu_callback(struct notifier_block *nfb, 7115 unsigned long action, 7116 void *ocpu) 7117 { 7118 struct sk_buff **list_skb; 7119 struct sk_buff *skb; 7120 unsigned int cpu, oldcpu = (unsigned long)ocpu; 7121 struct softnet_data *sd, *oldsd; 7122 7123 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 7124 return NOTIFY_OK; 7125 7126 local_irq_disable(); 7127 cpu = smp_processor_id(); 7128 sd = &per_cpu(softnet_data, cpu); 7129 oldsd = &per_cpu(softnet_data, oldcpu); 7130 7131 /* Find end of our completion_queue. */ 7132 list_skb = &sd->completion_queue; 7133 while (*list_skb) 7134 list_skb = &(*list_skb)->next; 7135 /* Append completion queue from offline CPU. */ 7136 *list_skb = oldsd->completion_queue; 7137 oldsd->completion_queue = NULL; 7138 7139 /* Append output queue from offline CPU. */ 7140 if (oldsd->output_queue) { 7141 *sd->output_queue_tailp = oldsd->output_queue; 7142 sd->output_queue_tailp = oldsd->output_queue_tailp; 7143 oldsd->output_queue = NULL; 7144 oldsd->output_queue_tailp = &oldsd->output_queue; 7145 } 7146 /* Append NAPI poll list from offline CPU, with one exception : 7147 * process_backlog() must be called by cpu owning percpu backlog. 7148 * We properly handle process_queue & input_pkt_queue later. 7149 */ 7150 while (!list_empty(&oldsd->poll_list)) { 7151 struct napi_struct *napi = list_first_entry(&oldsd->poll_list, 7152 struct napi_struct, 7153 poll_list); 7154 7155 list_del_init(&napi->poll_list); 7156 if (napi->poll == process_backlog) 7157 napi->state = 0; 7158 else 7159 ____napi_schedule(sd, napi); 7160 } 7161 7162 raise_softirq_irqoff(NET_TX_SOFTIRQ); 7163 local_irq_enable(); 7164 7165 /* Process offline CPU's input_pkt_queue */ 7166 while ((skb = __skb_dequeue(&oldsd->process_queue))) { 7167 netif_rx_ni(skb); 7168 input_queue_head_incr(oldsd); 7169 } 7170 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { 7171 netif_rx_ni(skb); 7172 input_queue_head_incr(oldsd); 7173 } 7174 7175 return NOTIFY_OK; 7176 } 7177 7178 7179 /** 7180 * netdev_increment_features - increment feature set by one 7181 * @all: current feature set 7182 * @one: new feature set 7183 * @mask: mask feature set 7184 * 7185 * Computes a new feature set after adding a device with feature set 7186 * @one to the master device with current feature set @all. Will not 7187 * enable anything that is off in @mask. Returns the new feature set. 7188 */ 7189 netdev_features_t netdev_increment_features(netdev_features_t all, 7190 netdev_features_t one, netdev_features_t mask) 7191 { 7192 if (mask & NETIF_F_GEN_CSUM) 7193 mask |= NETIF_F_ALL_CSUM; 7194 mask |= NETIF_F_VLAN_CHALLENGED; 7195 7196 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask; 7197 all &= one | ~NETIF_F_ALL_FOR_ALL; 7198 7199 /* If one device supports hw checksumming, set for all. 
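	 *
	 * A bonding-style master typically reaches this point by folding
	 * in each slave with a call of the following shape (illustrative
	 * sketch only; "features", "slave" and "mask" are the caller's
	 * own variables, not defined in this file):
	 *
	 *	features = netdev_increment_features(features,
	 *					     slave->dev->features,
	 *					     mask);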
*/ 7200 if (all & NETIF_F_GEN_CSUM) 7201 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM); 7202 7203 return all; 7204 } 7205 EXPORT_SYMBOL(netdev_increment_features); 7206 7207 static struct hlist_head * __net_init netdev_create_hash(void) 7208 { 7209 int i; 7210 struct hlist_head *hash; 7211 7212 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); 7213 if (hash != NULL) 7214 for (i = 0; i < NETDEV_HASHENTRIES; i++) 7215 INIT_HLIST_HEAD(&hash[i]); 7216 7217 return hash; 7218 } 7219 7220 /* Initialize per network namespace state */ 7221 static int __net_init netdev_init(struct net *net) 7222 { 7223 if (net != &init_net) 7224 INIT_LIST_HEAD(&net->dev_base_head); 7225 7226 net->dev_name_head = netdev_create_hash(); 7227 if (net->dev_name_head == NULL) 7228 goto err_name; 7229 7230 net->dev_index_head = netdev_create_hash(); 7231 if (net->dev_index_head == NULL) 7232 goto err_idx; 7233 7234 return 0; 7235 7236 err_idx: 7237 kfree(net->dev_name_head); 7238 err_name: 7239 return -ENOMEM; 7240 } 7241 7242 /** 7243 * netdev_drivername - network driver for the device 7244 * @dev: network device 7245 * 7246 * Determine network driver for device. 7247 */ 7248 const char *netdev_drivername(const struct net_device *dev) 7249 { 7250 const struct device_driver *driver; 7251 const struct device *parent; 7252 const char *empty = ""; 7253 7254 parent = dev->dev.parent; 7255 if (!parent) 7256 return empty; 7257 7258 driver = parent->driver; 7259 if (driver && driver->name) 7260 return driver->name; 7261 return empty; 7262 } 7263 7264 static void __netdev_printk(const char *level, const struct net_device *dev, 7265 struct va_format *vaf) 7266 { 7267 if (dev && dev->dev.parent) { 7268 dev_printk_emit(level[1] - '0', 7269 dev->dev.parent, 7270 "%s %s %s%s: %pV", 7271 dev_driver_string(dev->dev.parent), 7272 dev_name(dev->dev.parent), 7273 netdev_name(dev), netdev_reg_state(dev), 7274 vaf); 7275 } else if (dev) { 7276 printk("%s%s%s: %pV", 7277 level, netdev_name(dev), netdev_reg_state(dev), vaf); 7278 } else { 7279 printk("%s(NULL net_device): %pV", level, vaf); 7280 } 7281 } 7282 7283 void netdev_printk(const char *level, const struct net_device *dev, 7284 const char *format, ...) 7285 { 7286 struct va_format vaf; 7287 va_list args; 7288 7289 va_start(args, format); 7290 7291 vaf.fmt = format; 7292 vaf.va = &args; 7293 7294 __netdev_printk(level, dev, &vaf); 7295 7296 va_end(args); 7297 } 7298 EXPORT_SYMBOL(netdev_printk); 7299 7300 #define define_netdev_printk_level(func, level) \ 7301 void func(const struct net_device *dev, const char *fmt, ...) 
\ 7302 { \ 7303 struct va_format vaf; \ 7304 va_list args; \ 7305 \ 7306 va_start(args, fmt); \ 7307 \ 7308 vaf.fmt = fmt; \ 7309 vaf.va = &args; \ 7310 \ 7311 __netdev_printk(level, dev, &vaf); \ 7312 \ 7313 va_end(args); \ 7314 } \ 7315 EXPORT_SYMBOL(func); 7316 7317 define_netdev_printk_level(netdev_emerg, KERN_EMERG); 7318 define_netdev_printk_level(netdev_alert, KERN_ALERT); 7319 define_netdev_printk_level(netdev_crit, KERN_CRIT); 7320 define_netdev_printk_level(netdev_err, KERN_ERR); 7321 define_netdev_printk_level(netdev_warn, KERN_WARNING); 7322 define_netdev_printk_level(netdev_notice, KERN_NOTICE); 7323 define_netdev_printk_level(netdev_info, KERN_INFO); 7324 7325 static void __net_exit netdev_exit(struct net *net) 7326 { 7327 kfree(net->dev_name_head); 7328 kfree(net->dev_index_head); 7329 } 7330 7331 static struct pernet_operations __net_initdata netdev_net_ops = { 7332 .init = netdev_init, 7333 .exit = netdev_exit, 7334 }; 7335 7336 static void __net_exit default_device_exit(struct net *net) 7337 { 7338 struct net_device *dev, *aux; 7339 /* 7340 * Push all migratable network devices back to the 7341 * initial network namespace 7342 */ 7343 rtnl_lock(); 7344 for_each_netdev_safe(net, dev, aux) { 7345 int err; 7346 char fb_name[IFNAMSIZ]; 7347 7348 /* Ignore unmoveable devices (i.e. loopback) */ 7349 if (dev->features & NETIF_F_NETNS_LOCAL) 7350 continue; 7351 7352 /* Leave virtual devices for the generic cleanup */ 7353 if (dev->rtnl_link_ops) 7354 continue; 7355 7356 /* Push remaining network devices to init_net */ 7357 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); 7358 err = dev_change_net_namespace(dev, &init_net, fb_name); 7359 if (err) { 7360 pr_emerg("%s: failed to move %s to init_net: %d\n", 7361 __func__, dev->name, err); 7362 BUG(); 7363 } 7364 } 7365 rtnl_unlock(); 7366 } 7367 7368 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) 7369 { 7370 /* Return with the rtnl_lock held when there are no network 7371 * devices unregistering in any network namespace in net_list. 7372 */ 7373 struct net *net; 7374 bool unregistering; 7375 DEFINE_WAIT_FUNC(wait, woken_wake_function); 7376 7377 add_wait_queue(&netdev_unregistering_wq, &wait); 7378 for (;;) { 7379 unregistering = false; 7380 rtnl_lock(); 7381 list_for_each_entry(net, net_list, exit_list) { 7382 if (net->dev_unreg_count > 0) { 7383 unregistering = true; 7384 break; 7385 } 7386 } 7387 if (!unregistering) 7388 break; 7389 __rtnl_unlock(); 7390 7391 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); 7392 } 7393 remove_wait_queue(&netdev_unregistering_wq, &wait); 7394 } 7395 7396 static void __net_exit default_device_exit_batch(struct list_head *net_list) 7397 { 7398 /* At exit all network devices must be removed from a network 7399 * namespace. Do this in the reverse order of registration. 7400 * Do this across as many network namespaces as possible to 7401 * improve batching efficiency. 7402 */ 7403 struct net_device *dev; 7404 struct net *net; 7405 LIST_HEAD(dev_kill_list); 7406 7407 /* To prevent network device cleanup code from dereferencing 7408 * loopback devices or network devices that have been freed, 7409 * wait here for all pending unregistrations to complete, 7410 * before unregistering the loopback device and allowing the 7411 * network namespace to be freed. 7412 * 7413 * The netdev todo list containing all network devices 7414 * unregistrations that happen in default_device_exit_batch 7415 * will run in the rtnl_unlock() at the end of 7416 * default_device_exit_batch.
7417 */ 7418 rtnl_lock_unregistering(net_list); 7419 list_for_each_entry(net, net_list, exit_list) { 7420 for_each_netdev_reverse(net, dev) { 7421 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) 7422 dev->rtnl_link_ops->dellink(dev, &dev_kill_list); 7423 else 7424 unregister_netdevice_queue(dev, &dev_kill_list); 7425 } 7426 } 7427 unregister_netdevice_many(&dev_kill_list); 7428 rtnl_unlock(); 7429 } 7430 7431 static struct pernet_operations __net_initdata default_device_ops = { 7432 .exit = default_device_exit, 7433 .exit_batch = default_device_exit_batch, 7434 }; 7435 7436 /* 7437 * Initialize the DEV module. At boot time this walks the device list and 7438 * unhooks any devices that fail to initialise (normally hardware not 7439 * present) and leaves us with a valid list of present and active devices. 7440 * 7441 */ 7442 7443 /* 7444 * This is called single threaded during boot, so no need 7445 * to take the rtnl semaphore. 7446 */ 7447 static int __init net_dev_init(void) 7448 { 7449 int i, rc = -ENOMEM; 7450 7451 BUG_ON(!dev_boot_phase); 7452 7453 if (dev_proc_init()) 7454 goto out; 7455 7456 if (netdev_kobject_init()) 7457 goto out; 7458 7459 INIT_LIST_HEAD(&ptype_all); 7460 for (i = 0; i < PTYPE_HASH_SIZE; i++) 7461 INIT_LIST_HEAD(&ptype_base[i]); 7462 7463 INIT_LIST_HEAD(&offload_base); 7464 7465 if (register_pernet_subsys(&netdev_net_ops)) 7466 goto out; 7467 7468 /* 7469 * Initialise the packet receive queues. 7470 */ 7471 7472 for_each_possible_cpu(i) { 7473 struct softnet_data *sd = &per_cpu(softnet_data, i); 7474 7475 skb_queue_head_init(&sd->input_pkt_queue); 7476 skb_queue_head_init(&sd->process_queue); 7477 INIT_LIST_HEAD(&sd->poll_list); 7478 sd->output_queue_tailp = &sd->output_queue; 7479 #ifdef CONFIG_RPS 7480 sd->csd.func = rps_trigger_softirq; 7481 sd->csd.info = sd; 7482 sd->cpu = i; 7483 #endif 7484 7485 sd->backlog.poll = process_backlog; 7486 sd->backlog.weight = weight_p; 7487 } 7488 7489 dev_boot_phase = 0; 7490 7491 /* The loopback device is special if any other network devices 7492 * is present in a network namespace the loopback device must 7493 * be present. Since we now dynamically allocate and free the 7494 * loopback device ensure this invariant is maintained by 7495 * keeping the loopback device as the first device on the 7496 * list of network devices. Ensuring the loopback devices 7497 * is the first device that appears and the last network device 7498 * that disappears. 7499 */ 7500 if (register_pernet_device(&loopback_net_ops)) 7501 goto out; 7502 7503 if (register_pernet_device(&default_device_ops)) 7504 goto out; 7505 7506 open_softirq(NET_TX_SOFTIRQ, net_tx_action); 7507 open_softirq(NET_RX_SOFTIRQ, net_rx_action); 7508 7509 hotcpu_notifier(dev_cpu_callback, 0); 7510 dst_init(); 7511 rc = 0; 7512 out: 7513 return rc; 7514 } 7515 7516 subsys_initcall(net_dev_init); 7517