1 /* 2 * NET3 Protocol independent device support routines. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public License 6 * as published by the Free Software Foundation; either version 7 * 2 of the License, or (at your option) any later version. 8 * 9 * Derived from the non IP parts of dev.c 1.0.19 10 * Authors: Ross Biro 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Mark Evans, <evansmp@uhura.aston.ac.uk> 13 * 14 * Additional Authors: 15 * Florian la Roche <rzsfl@rz.uni-sb.de> 16 * Alan Cox <gw4pts@gw4pts.ampr.org> 17 * David Hinds <dahinds@users.sourceforge.net> 18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> 19 * Adam Sulmicki <adam@cfar.umd.edu> 20 * Pekka Riikonen <priikone@poesidon.pspt.fi> 21 * 22 * Changes: 23 * D.J. Barrow : Fixed bug where dev->refcnt gets set 24 * to 2 if register_netdev gets called 25 * before net_dev_init & also removed a 26 * few lines of code in the process. 27 * Alan Cox : device private ioctl copies fields back. 28 * Alan Cox : Transmit queue code does relevant 29 * stunts to keep the queue safe. 30 * Alan Cox : Fixed double lock. 31 * Alan Cox : Fixed promisc NULL pointer trap 32 * ???????? : Support the full private ioctl range 33 * Alan Cox : Moved ioctl permission check into 34 * drivers 35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI 36 * Alan Cox : 100 backlog just doesn't cut it when 37 * you start doing multicast video 8) 38 * Alan Cox : Rewrote net_bh and list manager. 39 * Alan Cox : Fix ETH_P_ALL echoback lengths. 40 * Alan Cox : Took out transmit every packet pass 41 * Saved a few bytes in the ioctl handler 42 * Alan Cox : Network driver sets packet type before 43 * calling netif_rx. Saves a function 44 * call a packet. 45 * Alan Cox : Hashed net_bh() 46 * Richard Kooijman: Timestamp fixes. 47 * Alan Cox : Wrong field in SIOCGIFDSTADDR 48 * Alan Cox : Device lock protection. 49 * Alan Cox : Fixed nasty side effect of device close 50 * changes. 51 * Rudi Cilibrasi : Pass the right thing to 52 * set_mac_address() 53 * Dave Miller : 32bit quantity for the device lock to 54 * make it work out on a Sparc. 55 * Bjorn Ekwall : Added KERNELD hack. 56 * Alan Cox : Cleaned up the backlog initialise. 57 * Craig Metz : SIOCGIFCONF fix if space for under 58 * 1 device. 59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there 60 * is no device open function. 61 * Andi Kleen : Fix error reporting for SIOCGIFCONF 62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF 63 * Cyrus Durgin : Cleaned for KMOD 64 * Adam Sulmicki : Bug Fix : Network Device Unload 65 * A network device unload needs to purge 66 * the backlog queue. 
67 * Paul Rusty Russell : SIOCSIFNAME 68 * Pekka Riikonen : Netdev boot-time settings code 69 * Andrew Morton : Make unregister_netdevice wait 70 * indefinitely on dev->refcnt 71 * J Hadi Salim : - Backlog queue sampling 72 * - netif_rx() feedback 73 */ 74 75 #include <asm/uaccess.h> 76 #include <asm/system.h> 77 #include <linux/bitops.h> 78 #include <linux/capability.h> 79 #include <linux/cpu.h> 80 #include <linux/types.h> 81 #include <linux/kernel.h> 82 #include <linux/sched.h> 83 #include <linux/mutex.h> 84 #include <linux/string.h> 85 #include <linux/mm.h> 86 #include <linux/socket.h> 87 #include <linux/sockios.h> 88 #include <linux/errno.h> 89 #include <linux/interrupt.h> 90 #include <linux/if_ether.h> 91 #include <linux/netdevice.h> 92 #include <linux/etherdevice.h> 93 #include <linux/ethtool.h> 94 #include <linux/notifier.h> 95 #include <linux/skbuff.h> 96 #include <net/net_namespace.h> 97 #include <net/sock.h> 98 #include <linux/rtnetlink.h> 99 #include <linux/proc_fs.h> 100 #include <linux/seq_file.h> 101 #include <linux/stat.h> 102 #include <linux/if_bridge.h> 103 #include <linux/if_macvlan.h> 104 #include <net/dst.h> 105 #include <net/pkt_sched.h> 106 #include <net/checksum.h> 107 #include <linux/highmem.h> 108 #include <linux/init.h> 109 #include <linux/kmod.h> 110 #include <linux/module.h> 111 #include <linux/netpoll.h> 112 #include <linux/rcupdate.h> 113 #include <linux/delay.h> 114 #include <net/wext.h> 115 #include <net/iw_handler.h> 116 #include <asm/current.h> 117 #include <linux/audit.h> 118 #include <linux/dmaengine.h> 119 #include <linux/err.h> 120 #include <linux/ctype.h> 121 #include <linux/if_arp.h> 122 #include <linux/if_vlan.h> 123 #include <linux/ip.h> 124 #include <net/ip.h> 125 #include <linux/ipv6.h> 126 #include <linux/in.h> 127 #include <linux/jhash.h> 128 #include <linux/random.h> 129 #include <trace/events/napi.h> 130 131 #include "net-sysfs.h" 132 133 /* Instead of increasing this, you should create a hash table. */ 134 #define MAX_GRO_SKBS 8 135 136 /* This should be increased if a protocol with a bigger head is added. */ 137 #define GRO_MAX_HEAD (MAX_HEADER + 128) 138 139 /* 140 * The list of packet types we will receive (as opposed to discard) 141 * and the routines to invoke. 142 * 143 * Why 16. Because with 16 the only overlap we get on a hash of the 144 * low nibble of the protocol value is RARP/SNAP/X.25. 145 * 146 * NOTE: That is no longer true with the addition of VLAN tags. Not 147 * sure which should go first, but I bet it won't make much 148 * difference if we are running VLANs. The good news is that 149 * this protocol won't be in the list unless compiled in, so 150 * the average user (w/out VLANs) will not be adversely affected. 151 * --BLG 152 * 153 * 0800 IP 154 * 8100 802.1Q VLAN 155 * 0001 802.3 156 * 0002 AX.25 157 * 0004 802.2 158 * 8035 RARP 159 * 0005 SNAP 160 * 0805 X.25 161 * 0806 ARP 162 * 8137 IPX 163 * 0009 Localtalk 164 * 86DD IPv6 165 */ 166 167 #define PTYPE_HASH_SIZE (16) 168 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1) 169 170 static DEFINE_SPINLOCK(ptype_lock); 171 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; 172 static struct list_head ptype_all __read_mostly; /* Taps */ 173 174 /* 175 * The @dev_base_head list is protected by @dev_base_lock and the rtnl 176 * semaphore. 177 * 178 * Pure readers hold dev_base_lock for reading. 
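 *
 * A minimal reader sketch (the same pattern dev_get_by_name() below
 * follows; the names used here are purely illustrative):
 *
 *	read_lock(&dev_base_lock);
 *	dev = __dev_get_by_name(net, "eth0");
 *	if (dev)
 *		dev_hold(dev);
 *	read_unlock(&dev_base_lock);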
179 * 180 * Writers must hold the rtnl semaphore while they loop through the 181 * dev_base_head list, and hold dev_base_lock for writing when they do the 182 * actual updates. This allows pure readers to access the list even 183 * while a writer is preparing to update it. 184 * 185 * To put it another way, dev_base_lock is held for writing only to 186 * protect against pure readers; the rtnl semaphore provides the 187 * protection against other writers. 188 * 189 * See, for example usages, register_netdevice() and 190 * unregister_netdevice(), which must be called with the rtnl 191 * semaphore held. 192 */ 193 DEFINE_RWLOCK(dev_base_lock); 194 EXPORT_SYMBOL(dev_base_lock); 195 196 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name) 197 { 198 unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ)); 199 return &net->dev_name_head[hash & (NETDEV_HASHENTRIES - 1)]; 200 } 201 202 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) 203 { 204 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)]; 205 } 206 207 /* Device list insertion */ 208 static int list_netdevice(struct net_device *dev) 209 { 210 struct net *net = dev_net(dev); 211 212 ASSERT_RTNL(); 213 214 write_lock_bh(&dev_base_lock); 215 list_add_tail(&dev->dev_list, &net->dev_base_head); 216 hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); 217 hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); 218 write_unlock_bh(&dev_base_lock); 219 return 0; 220 } 221 222 /* Device list removal */ 223 static void unlist_netdevice(struct net_device *dev) 224 { 225 ASSERT_RTNL(); 226 227 /* Unlink dev from the device chain */ 228 write_lock_bh(&dev_base_lock); 229 list_del(&dev->dev_list); 230 hlist_del(&dev->name_hlist); 231 hlist_del(&dev->index_hlist); 232 write_unlock_bh(&dev_base_lock); 233 } 234 235 /* 236 * Our notifier list 237 */ 238 239 static RAW_NOTIFIER_HEAD(netdev_chain); 240 241 /* 242 * Device drivers call our routines to queue packets here. We empty the 243 * queue in the local softnet handler. 
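 *
 * In practice a driver's receive path hands each packet to netif_rx()
 * (defined later in this file); a hedged sketch of that hand-off:
 *
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);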
244 */ 245 246 DEFINE_PER_CPU(struct softnet_data, softnet_data); 247 EXPORT_PER_CPU_SYMBOL(softnet_data); 248 249 #ifdef CONFIG_LOCKDEP 250 /* 251 * register_netdevice() inits txq->_xmit_lock and sets lockdep class 252 * according to dev->type 253 */ 254 static const unsigned short netdev_lock_type[] = 255 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, 256 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, 257 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, 258 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, 259 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, 260 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, 261 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, 262 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, 263 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, 264 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, 265 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, 266 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, 267 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211, 268 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, 269 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154, 270 ARPHRD_VOID, ARPHRD_NONE}; 271 272 static const char *const netdev_lock_name[] = 273 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", 274 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", 275 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", 276 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", 277 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", 278 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", 279 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", 280 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", 281 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", 282 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", 283 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", 284 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", 285 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211", 286 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", 287 "_xmit_PHONET_PIPE", "_xmit_IEEE802154", 288 "_xmit_VOID", "_xmit_NONE"}; 289 290 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; 291 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)]; 292 293 static inline unsigned short netdev_lock_pos(unsigned short dev_type) 294 { 295 int i; 296 297 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) 298 if (netdev_lock_type[i] == dev_type) 299 return i; 300 /* the last key is used by default */ 301 return ARRAY_SIZE(netdev_lock_type) - 1; 302 } 303 304 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, 305 unsigned short dev_type) 306 { 307 int i; 308 309 i = netdev_lock_pos(dev_type); 310 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], 311 netdev_lock_name[i]); 312 } 313 314 static inline void netdev_set_addr_lockdep_class(struct net_device *dev) 315 { 316 int i; 317 318 i = netdev_lock_pos(dev->type); 319 lockdep_set_class_and_name(&dev->addr_list_lock, 320 &netdev_addr_lock_key[i], 321 netdev_lock_name[i]); 322 } 323 #else 324 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, 325 unsigned short dev_type) 326 { 327 } 328 static inline void netdev_set_addr_lockdep_class(struct net_device *dev) 329 { 330 } 331 #endif 332 333 
/******************************************************************************* 334 335 Protocol management and registration routines 336 337 *******************************************************************************/ 338 339 /* 340 * Add a protocol ID to the list. Now that the input handler is 341 * smarter we can dispense with all the messy stuff that used to be 342 * here. 343 * 344 * BEWARE!!! Protocol handlers, mangling input packets, 345 * MUST BE last in hash buckets and checking protocol handlers 346 * MUST start from promiscuous ptype_all chain in net_bh. 347 * It is true now, do not change it. 348 * Explanation follows: if protocol handler, mangling packet, will 349 * be the first on list, it is not able to sense, that packet 350 * is cloned and should be copied-on-write, so that it will 351 * change it and subsequent readers will get broken packet. 352 * --ANK (980803) 353 */ 354 355 /** 356 * dev_add_pack - add packet handler 357 * @pt: packet type declaration 358 * 359 * Add a protocol handler to the networking stack. The passed &packet_type 360 * is linked into kernel lists and may not be freed until it has been 361 * removed from the kernel lists. 362 * 363 * This call does not sleep therefore it can not 364 * guarantee all CPU's that are in middle of receiving packets 365 * will see the new packet type (until the next received packet). 366 */ 367 368 void dev_add_pack(struct packet_type *pt) 369 { 370 int hash; 371 372 spin_lock_bh(&ptype_lock); 373 if (pt->type == htons(ETH_P_ALL)) 374 list_add_rcu(&pt->list, &ptype_all); 375 else { 376 hash = ntohs(pt->type) & PTYPE_HASH_MASK; 377 list_add_rcu(&pt->list, &ptype_base[hash]); 378 } 379 spin_unlock_bh(&ptype_lock); 380 } 381 EXPORT_SYMBOL(dev_add_pack); 382 383 /** 384 * __dev_remove_pack - remove packet handler 385 * @pt: packet type declaration 386 * 387 * Remove a protocol handler that was previously added to the kernel 388 * protocol handlers by dev_add_pack(). The passed &packet_type is removed 389 * from the kernel lists and can be freed or reused once this function 390 * returns. 391 * 392 * The packet type might still be in use by receivers 393 * and must not be freed until after all the CPU's have gone 394 * through a quiescent state. 395 */ 396 void __dev_remove_pack(struct packet_type *pt) 397 { 398 struct list_head *head; 399 struct packet_type *pt1; 400 401 spin_lock_bh(&ptype_lock); 402 403 if (pt->type == htons(ETH_P_ALL)) 404 head = &ptype_all; 405 else 406 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; 407 408 list_for_each_entry(pt1, head, list) { 409 if (pt == pt1) { 410 list_del_rcu(&pt->list); 411 goto out; 412 } 413 } 414 415 printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); 416 out: 417 spin_unlock_bh(&ptype_lock); 418 } 419 EXPORT_SYMBOL(__dev_remove_pack); 420 421 /** 422 * dev_remove_pack - remove packet handler 423 * @pt: packet type declaration 424 * 425 * Remove a protocol handler that was previously added to the kernel 426 * protocol handlers by dev_add_pack(). The passed &packet_type is removed 427 * from the kernel lists and can be freed or reused once this function 428 * returns. 429 * 430 * This call sleeps to guarantee that no CPU is looking at the packet 431 * type after return. 
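 *
 * Typical usage, paired with dev_add_pack() above (the handler and
 * variable names are illustrative only, not part of this file):
 *
 *	static struct packet_type my_ptype __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.func = my_rcv,
 *	};
 *
 *	dev_add_pack(&my_ptype);
 *	...
 *	dev_remove_pack(&my_ptype);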
432 */ 433 void dev_remove_pack(struct packet_type *pt) 434 { 435 __dev_remove_pack(pt); 436 437 synchronize_net(); 438 } 439 EXPORT_SYMBOL(dev_remove_pack); 440 441 /****************************************************************************** 442 443 Device Boot-time Settings Routines 444 445 *******************************************************************************/ 446 447 /* Boot time configuration table */ 448 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; 449 450 /** 451 * netdev_boot_setup_add - add new setup entry 452 * @name: name of the device 453 * @map: configured settings for the device 454 * 455 * Adds new setup entry to the dev_boot_setup list. The function 456 * returns 0 on error and 1 on success. This is a generic routine to 457 * all netdevices. 458 */ 459 static int netdev_boot_setup_add(char *name, struct ifmap *map) 460 { 461 struct netdev_boot_setup *s; 462 int i; 463 464 s = dev_boot_setup; 465 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { 466 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { 467 memset(s[i].name, 0, sizeof(s[i].name)); 468 strlcpy(s[i].name, name, IFNAMSIZ); 469 memcpy(&s[i].map, map, sizeof(s[i].map)); 470 break; 471 } 472 } 473 474 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1; 475 } 476 477 /** 478 * netdev_boot_setup_check - check boot time settings 479 * @dev: the netdevice 480 * 481 * Check boot time settings for the device. 482 * The found settings are set for the device to be used 483 * later in the device probing. 484 * Returns 0 if no settings found, 1 if they are. 485 */ 486 int netdev_boot_setup_check(struct net_device *dev) 487 { 488 struct netdev_boot_setup *s = dev_boot_setup; 489 int i; 490 491 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { 492 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && 493 !strcmp(dev->name, s[i].name)) { 494 dev->irq = s[i].map.irq; 495 dev->base_addr = s[i].map.base_addr; 496 dev->mem_start = s[i].map.mem_start; 497 dev->mem_end = s[i].map.mem_end; 498 return 1; 499 } 500 } 501 return 0; 502 } 503 EXPORT_SYMBOL(netdev_boot_setup_check); 504 505 506 /** 507 * netdev_boot_base - get address from boot time settings 508 * @prefix: prefix for network device 509 * @unit: id for network device 510 * 511 * Check boot time settings for the base address of device. 512 * The found settings are set for the device to be used 513 * later in the device probing. 514 * Returns 0 if no settings found. 515 */ 516 unsigned long netdev_boot_base(const char *prefix, int unit) 517 { 518 const struct netdev_boot_setup *s = dev_boot_setup; 519 char name[IFNAMSIZ]; 520 int i; 521 522 sprintf(name, "%s%d", prefix, unit); 523 524 /* 525 * If device already registered then return base of 1 526 * to indicate not to probe for this interface 527 */ 528 if (__dev_get_by_name(&init_net, name)) 529 return 1; 530 531 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) 532 if (!strcmp(name, s[i].name)) 533 return s[i].map.base_addr; 534 return 0; 535 } 536 537 /* 538 * Saves at boot time configured settings for any netdevice. 
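 *
 * The accepted format, as parsed below, is up to four numbers followed
 * by the device name, e.g. (values purely illustrative):
 *
 *	netdev=5,0x340,0,0,eth1
 *
 * mapping to irq, base_addr, mem_start, mem_end and the name.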
539 */ 540 int __init netdev_boot_setup(char *str) 541 { 542 int ints[5]; 543 struct ifmap map; 544 545 str = get_options(str, ARRAY_SIZE(ints), ints); 546 if (!str || !*str) 547 return 0; 548 549 /* Save settings */ 550 memset(&map, 0, sizeof(map)); 551 if (ints[0] > 0) 552 map.irq = ints[1]; 553 if (ints[0] > 1) 554 map.base_addr = ints[2]; 555 if (ints[0] > 2) 556 map.mem_start = ints[3]; 557 if (ints[0] > 3) 558 map.mem_end = ints[4]; 559 560 /* Add new entry to the list */ 561 return netdev_boot_setup_add(str, &map); 562 } 563 564 __setup("netdev=", netdev_boot_setup); 565 566 /******************************************************************************* 567 568 Device Interface Subroutines 569 570 *******************************************************************************/ 571 572 /** 573 * __dev_get_by_name - find a device by its name 574 * @net: the applicable net namespace 575 * @name: name to find 576 * 577 * Find an interface by name. Must be called under RTNL semaphore 578 * or @dev_base_lock. If the name is found a pointer to the device 579 * is returned. If the name is not found then %NULL is returned. The 580 * reference counters are not incremented so the caller must be 581 * careful with locks. 582 */ 583 584 struct net_device *__dev_get_by_name(struct net *net, const char *name) 585 { 586 struct hlist_node *p; 587 588 hlist_for_each(p, dev_name_hash(net, name)) { 589 struct net_device *dev 590 = hlist_entry(p, struct net_device, name_hlist); 591 if (!strncmp(dev->name, name, IFNAMSIZ)) 592 return dev; 593 } 594 return NULL; 595 } 596 EXPORT_SYMBOL(__dev_get_by_name); 597 598 /** 599 * dev_get_by_name - find a device by its name 600 * @net: the applicable net namespace 601 * @name: name to find 602 * 603 * Find an interface by name. This can be called from any 604 * context and does its own locking. The returned handle has 605 * the usage count incremented and the caller must use dev_put() to 606 * release it when it is no longer needed. %NULL is returned if no 607 * matching device is found. 608 */ 609 610 struct net_device *dev_get_by_name(struct net *net, const char *name) 611 { 612 struct net_device *dev; 613 614 read_lock(&dev_base_lock); 615 dev = __dev_get_by_name(net, name); 616 if (dev) 617 dev_hold(dev); 618 read_unlock(&dev_base_lock); 619 return dev; 620 } 621 EXPORT_SYMBOL(dev_get_by_name); 622 623 /** 624 * __dev_get_by_index - find a device by its ifindex 625 * @net: the applicable net namespace 626 * @ifindex: index of device 627 * 628 * Search for an interface by index. Returns %NULL if the device 629 * is not found or a pointer to the device. The device has not 630 * had its reference counter increased so the caller must be careful 631 * about locking. The caller must hold either the RTNL semaphore 632 * or @dev_base_lock. 633 */ 634 635 struct net_device *__dev_get_by_index(struct net *net, int ifindex) 636 { 637 struct hlist_node *p; 638 639 hlist_for_each(p, dev_index_hash(net, ifindex)) { 640 struct net_device *dev 641 = hlist_entry(p, struct net_device, index_hlist); 642 if (dev->ifindex == ifindex) 643 return dev; 644 } 645 return NULL; 646 } 647 EXPORT_SYMBOL(__dev_get_by_index); 648 649 650 /** 651 * dev_get_by_index - find a device by its ifindex 652 * @net: the applicable net namespace 653 * @ifindex: index of device 654 * 655 * Search for an interface by index. Returns NULL if the device 656 * is not found or a pointer to the device. 
The device returned has 657 * had a reference added and the pointer is safe until the user calls 658 * dev_put to indicate they have finished with it. 659 */ 660 661 struct net_device *dev_get_by_index(struct net *net, int ifindex) 662 { 663 struct net_device *dev; 664 665 read_lock(&dev_base_lock); 666 dev = __dev_get_by_index(net, ifindex); 667 if (dev) 668 dev_hold(dev); 669 read_unlock(&dev_base_lock); 670 return dev; 671 } 672 EXPORT_SYMBOL(dev_get_by_index); 673 674 /** 675 * dev_getbyhwaddr - find a device by its hardware address 676 * @net: the applicable net namespace 677 * @type: media type of device 678 * @ha: hardware address 679 * 680 * Search for an interface by MAC address. Returns NULL if the device 681 * is not found or a pointer to the device. The caller must hold the 682 * rtnl semaphore. The returned device has not had its ref count increased 683 * and the caller must therefore be careful about locking 684 * 685 * BUGS: 686 * If the API was consistent this would be __dev_get_by_hwaddr 687 */ 688 689 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha) 690 { 691 struct net_device *dev; 692 693 ASSERT_RTNL(); 694 695 for_each_netdev(net, dev) 696 if (dev->type == type && 697 !memcmp(dev->dev_addr, ha, dev->addr_len)) 698 return dev; 699 700 return NULL; 701 } 702 EXPORT_SYMBOL(dev_getbyhwaddr); 703 704 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type) 705 { 706 struct net_device *dev; 707 708 ASSERT_RTNL(); 709 for_each_netdev(net, dev) 710 if (dev->type == type) 711 return dev; 712 713 return NULL; 714 } 715 EXPORT_SYMBOL(__dev_getfirstbyhwtype); 716 717 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) 718 { 719 struct net_device *dev; 720 721 rtnl_lock(); 722 dev = __dev_getfirstbyhwtype(net, type); 723 if (dev) 724 dev_hold(dev); 725 rtnl_unlock(); 726 return dev; 727 } 728 EXPORT_SYMBOL(dev_getfirstbyhwtype); 729 730 /** 731 * dev_get_by_flags - find any device with given flags 732 * @net: the applicable net namespace 733 * @if_flags: IFF_* values 734 * @mask: bitmask of bits in if_flags to check 735 * 736 * Search for any interface with the given flags. Returns NULL if a device 737 * is not found or a pointer to the device. The device returned has 738 * had a reference added and the pointer is safe until the user calls 739 * dev_put to indicate they have finished with it. 740 */ 741 742 struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags, 743 unsigned short mask) 744 { 745 struct net_device *dev, *ret; 746 747 ret = NULL; 748 read_lock(&dev_base_lock); 749 for_each_netdev(net, dev) { 750 if (((dev->flags ^ if_flags) & mask) == 0) { 751 dev_hold(dev); 752 ret = dev; 753 break; 754 } 755 } 756 read_unlock(&dev_base_lock); 757 return ret; 758 } 759 EXPORT_SYMBOL(dev_get_by_flags); 760 761 /** 762 * dev_valid_name - check if name is okay for network device 763 * @name: name string 764 * 765 * Network device names need to be valid file names to 766 * to allow sysfs to work. We also disallow any kind of 767 * whitespace. 
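 *
 * For example, "eth0" or "wlan%d" pass, while "", ".", "..", names
 * containing '/' or whitespace, and names of IFNAMSIZ characters or
 * more are rejected.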
768 */ 769 int dev_valid_name(const char *name) 770 { 771 if (*name == '\0') 772 return 0; 773 if (strlen(name) >= IFNAMSIZ) 774 return 0; 775 if (!strcmp(name, ".") || !strcmp(name, "..")) 776 return 0; 777 778 while (*name) { 779 if (*name == '/' || isspace(*name)) 780 return 0; 781 name++; 782 } 783 return 1; 784 } 785 EXPORT_SYMBOL(dev_valid_name); 786 787 /** 788 * __dev_alloc_name - allocate a name for a device 789 * @net: network namespace to allocate the device name in 790 * @name: name format string 791 * @buf: scratch buffer and result name string 792 * 793 * Passed a format string - eg "lt%d" it will try and find a suitable 794 * id. It scans list of devices to build up a free map, then chooses 795 * the first empty slot. The caller must hold the dev_base or rtnl lock 796 * while allocating the name and adding the device in order to avoid 797 * duplicates. 798 * Limited to bits_per_byte * page size devices (ie 32K on most platforms). 799 * Returns the number of the unit assigned or a negative errno code. 800 */ 801 802 static int __dev_alloc_name(struct net *net, const char *name, char *buf) 803 { 804 int i = 0; 805 const char *p; 806 const int max_netdevices = 8*PAGE_SIZE; 807 unsigned long *inuse; 808 struct net_device *d; 809 810 p = strnchr(name, IFNAMSIZ-1, '%'); 811 if (p) { 812 /* 813 * Verify the string as this thing may have come from 814 * the user. There must be either one "%d" and no other "%" 815 * characters. 816 */ 817 if (p[1] != 'd' || strchr(p + 2, '%')) 818 return -EINVAL; 819 820 /* Use one page as a bit array of possible slots */ 821 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC); 822 if (!inuse) 823 return -ENOMEM; 824 825 for_each_netdev(net, d) { 826 if (!sscanf(d->name, name, &i)) 827 continue; 828 if (i < 0 || i >= max_netdevices) 829 continue; 830 831 /* avoid cases where sscanf is not exact inverse of printf */ 832 snprintf(buf, IFNAMSIZ, name, i); 833 if (!strncmp(buf, d->name, IFNAMSIZ)) 834 set_bit(i, inuse); 835 } 836 837 i = find_first_zero_bit(inuse, max_netdevices); 838 free_page((unsigned long) inuse); 839 } 840 841 snprintf(buf, IFNAMSIZ, name, i); 842 if (!__dev_get_by_name(net, buf)) 843 return i; 844 845 /* It is possible to run out of possible slots 846 * when the name is long and there isn't enough space left 847 * for the digits, or if all bits are used. 848 */ 849 return -ENFILE; 850 } 851 852 /** 853 * dev_alloc_name - allocate a name for a device 854 * @dev: device 855 * @name: name format string 856 * 857 * Passed a format string - eg "lt%d" it will try and find a suitable 858 * id. It scans list of devices to build up a free map, then chooses 859 * the first empty slot. The caller must hold the dev_base or rtnl lock 860 * while allocating the name and adding the device in order to avoid 861 * duplicates. 862 * Limited to bits_per_byte * page size devices (ie 32K on most platforms). 863 * Returns the number of the unit assigned or a negative errno code. 864 */ 865 866 int dev_alloc_name(struct net_device *dev, const char *name) 867 { 868 char buf[IFNAMSIZ]; 869 struct net *net; 870 int ret; 871 872 BUG_ON(!dev_net(dev)); 873 net = dev_net(dev); 874 ret = __dev_alloc_name(net, name, buf); 875 if (ret >= 0) 876 strlcpy(dev->name, buf, IFNAMSIZ); 877 return ret; 878 } 879 EXPORT_SYMBOL(dev_alloc_name); 880 881 882 /** 883 * dev_change_name - change name of a device 884 * @dev: device 885 * @newname: name (or format string) must be at least IFNAMSIZ 886 * 887 * Change name of a device, can pass format strings "eth%d". 
888 * for wildcarding. 889 */ 890 int dev_change_name(struct net_device *dev, const char *newname) 891 { 892 char oldname[IFNAMSIZ]; 893 int err = 0; 894 int ret; 895 struct net *net; 896 897 ASSERT_RTNL(); 898 BUG_ON(!dev_net(dev)); 899 900 net = dev_net(dev); 901 if (dev->flags & IFF_UP) 902 return -EBUSY; 903 904 if (!dev_valid_name(newname)) 905 return -EINVAL; 906 907 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) 908 return 0; 909 910 memcpy(oldname, dev->name, IFNAMSIZ); 911 912 if (strchr(newname, '%')) { 913 err = dev_alloc_name(dev, newname); 914 if (err < 0) 915 return err; 916 } else if (__dev_get_by_name(net, newname)) 917 return -EEXIST; 918 else 919 strlcpy(dev->name, newname, IFNAMSIZ); 920 921 rollback: 922 /* For now only devices in the initial network namespace 923 * are in sysfs. 924 */ 925 if (net == &init_net) { 926 ret = device_rename(&dev->dev, dev->name); 927 if (ret) { 928 memcpy(dev->name, oldname, IFNAMSIZ); 929 return ret; 930 } 931 } 932 933 write_lock_bh(&dev_base_lock); 934 hlist_del(&dev->name_hlist); 935 hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name)); 936 write_unlock_bh(&dev_base_lock); 937 938 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); 939 ret = notifier_to_errno(ret); 940 941 if (ret) { 942 if (err) { 943 printk(KERN_ERR 944 "%s: name change rollback failed: %d.\n", 945 dev->name, ret); 946 } else { 947 err = ret; 948 memcpy(dev->name, oldname, IFNAMSIZ); 949 goto rollback; 950 } 951 } 952 953 return err; 954 } 955 956 /** 957 * dev_set_alias - change ifalias of a device 958 * @dev: device 959 * @alias: name up to IFALIASZ 960 * @len: limit of bytes to copy from info 961 * 962 * Set ifalias for a device, 963 */ 964 int dev_set_alias(struct net_device *dev, const char *alias, size_t len) 965 { 966 ASSERT_RTNL(); 967 968 if (len >= IFALIASZ) 969 return -EINVAL; 970 971 if (!len) { 972 if (dev->ifalias) { 973 kfree(dev->ifalias); 974 dev->ifalias = NULL; 975 } 976 return 0; 977 } 978 979 dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL); 980 if (!dev->ifalias) 981 return -ENOMEM; 982 983 strlcpy(dev->ifalias, alias, len+1); 984 return len; 985 } 986 987 988 /** 989 * netdev_features_change - device changes features 990 * @dev: device to cause notification 991 * 992 * Called to indicate a device has changed features. 993 */ 994 void netdev_features_change(struct net_device *dev) 995 { 996 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev); 997 } 998 EXPORT_SYMBOL(netdev_features_change); 999 1000 /** 1001 * netdev_state_change - device changes state 1002 * @dev: device to cause notification 1003 * 1004 * Called to indicate a device has changed state. This function calls 1005 * the notifier chains for netdev_chain and sends a NEWLINK message 1006 * to the routing socket. 1007 */ 1008 void netdev_state_change(struct net_device *dev) 1009 { 1010 if (dev->flags & IFF_UP) { 1011 call_netdevice_notifiers(NETDEV_CHANGE, dev); 1012 rtmsg_ifinfo(RTM_NEWLINK, dev, 0); 1013 } 1014 } 1015 EXPORT_SYMBOL(netdev_state_change); 1016 1017 void netdev_bonding_change(struct net_device *dev, unsigned long event) 1018 { 1019 call_netdevice_notifiers(event, dev); 1020 } 1021 EXPORT_SYMBOL(netdev_bonding_change); 1022 1023 /** 1024 * dev_load - load a network module 1025 * @net: the applicable net namespace 1026 * @name: name of interface 1027 * 1028 * If a network interface is not present and the process has suitable 1029 * privileges this function loads the module. 
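 * Loading is keyed on the interface name itself (the call below is
 * request_module("%s", name)), so it relies on a module alias such as
 * "alias eth1 dummy" in the modprobe configuration (illustrative).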
If module loading is not 1030 * available in this kernel then it becomes a nop. 1031 */ 1032 1033 void dev_load(struct net *net, const char *name) 1034 { 1035 struct net_device *dev; 1036 1037 read_lock(&dev_base_lock); 1038 dev = __dev_get_by_name(net, name); 1039 read_unlock(&dev_base_lock); 1040 1041 if (!dev && capable(CAP_NET_ADMIN)) 1042 request_module("%s", name); 1043 } 1044 EXPORT_SYMBOL(dev_load); 1045 1046 /** 1047 * dev_open - prepare an interface for use. 1048 * @dev: device to open 1049 * 1050 * Takes a device from down to up state. The device's private open 1051 * function is invoked and then the multicast lists are loaded. Finally 1052 * the device is moved into the up state and a %NETDEV_UP message is 1053 * sent to the netdev notifier chain. 1054 * 1055 * Calling this function on an active interface is a nop. On a failure 1056 * a negative errno code is returned. 1057 */ 1058 int dev_open(struct net_device *dev) 1059 { 1060 const struct net_device_ops *ops = dev->netdev_ops; 1061 int ret; 1062 1063 ASSERT_RTNL(); 1064 1065 /* 1066 * Is it already up? 1067 */ 1068 1069 if (dev->flags & IFF_UP) 1070 return 0; 1071 1072 /* 1073 * Is it even present? 1074 */ 1075 if (!netif_device_present(dev)) 1076 return -ENODEV; 1077 1078 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev); 1079 ret = notifier_to_errno(ret); 1080 if (ret) 1081 return ret; 1082 1083 /* 1084 * Call device private open method 1085 */ 1086 set_bit(__LINK_STATE_START, &dev->state); 1087 1088 if (ops->ndo_validate_addr) 1089 ret = ops->ndo_validate_addr(dev); 1090 1091 if (!ret && ops->ndo_open) 1092 ret = ops->ndo_open(dev); 1093 1094 /* 1095 * If it went open OK then: 1096 */ 1097 1098 if (ret) 1099 clear_bit(__LINK_STATE_START, &dev->state); 1100 else { 1101 /* 1102 * Set the flags. 1103 */ 1104 dev->flags |= IFF_UP; 1105 1106 /* 1107 * Enable NET_DMA 1108 */ 1109 net_dmaengine_get(); 1110 1111 /* 1112 * Initialize multicasting status 1113 */ 1114 dev_set_rx_mode(dev); 1115 1116 /* 1117 * Wakeup transmit queue engine 1118 */ 1119 dev_activate(dev); 1120 1121 /* 1122 * ... and announce new interface. 1123 */ 1124 call_netdevice_notifiers(NETDEV_UP, dev); 1125 } 1126 1127 return ret; 1128 } 1129 EXPORT_SYMBOL(dev_open); 1130 1131 /** 1132 * dev_close - shutdown an interface. 1133 * @dev: device to shutdown 1134 * 1135 * This function moves an active device into down state. A 1136 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device 1137 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier 1138 * chain. 1139 */ 1140 int dev_close(struct net_device *dev) 1141 { 1142 const struct net_device_ops *ops = dev->netdev_ops; 1143 ASSERT_RTNL(); 1144 1145 might_sleep(); 1146 1147 if (!(dev->flags & IFF_UP)) 1148 return 0; 1149 1150 /* 1151 * Tell people we are going down, so that they can 1152 * prepare to death, when device is still operating. 1153 */ 1154 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev); 1155 1156 clear_bit(__LINK_STATE_START, &dev->state); 1157 1158 /* Synchronize to scheduled poll. We cannot touch poll list, 1159 * it can be even on different cpu. So just clear netif_running(). 1160 * 1161 * dev->stop() will invoke napi_disable() on all of it's 1162 * napi_struct instances on this device. 1163 */ 1164 smp_mb__after_clear_bit(); /* Commit netif_running(). */ 1165 1166 dev_deactivate(dev); 1167 1168 /* 1169 * Call the device specific close. This cannot fail. 1170 * Only if device is UP 1171 * 1172 * We allow it to be called even after a DETACH hot-plug 1173 * event. 
1174 */ 1175 if (ops->ndo_stop) 1176 ops->ndo_stop(dev); 1177 1178 /* 1179 * Device is now down. 1180 */ 1181 1182 dev->flags &= ~IFF_UP; 1183 1184 /* 1185 * Tell people we are down 1186 */ 1187 call_netdevice_notifiers(NETDEV_DOWN, dev); 1188 1189 /* 1190 * Shutdown NET_DMA 1191 */ 1192 net_dmaengine_put(); 1193 1194 return 0; 1195 } 1196 EXPORT_SYMBOL(dev_close); 1197 1198 1199 /** 1200 * dev_disable_lro - disable Large Receive Offload on a device 1201 * @dev: device 1202 * 1203 * Disable Large Receive Offload (LRO) on a net device. Must be 1204 * called under RTNL. This is needed if received packets may be 1205 * forwarded to another interface. 1206 */ 1207 void dev_disable_lro(struct net_device *dev) 1208 { 1209 if (dev->ethtool_ops && dev->ethtool_ops->get_flags && 1210 dev->ethtool_ops->set_flags) { 1211 u32 flags = dev->ethtool_ops->get_flags(dev); 1212 if (flags & ETH_FLAG_LRO) { 1213 flags &= ~ETH_FLAG_LRO; 1214 dev->ethtool_ops->set_flags(dev, flags); 1215 } 1216 } 1217 WARN_ON(dev->features & NETIF_F_LRO); 1218 } 1219 EXPORT_SYMBOL(dev_disable_lro); 1220 1221 1222 static int dev_boot_phase = 1; 1223 1224 /* 1225 * Device change register/unregister. These are not inline or static 1226 * as we export them to the world. 1227 */ 1228 1229 /** 1230 * register_netdevice_notifier - register a network notifier block 1231 * @nb: notifier 1232 * 1233 * Register a notifier to be called when network device events occur. 1234 * The notifier passed is linked into the kernel structures and must 1235 * not be reused until it has been unregistered. A negative errno code 1236 * is returned on a failure. 1237 * 1238 * When registered all registration and up events are replayed 1239 * to the new notifier to allow device to have a race free 1240 * view of the network device list. 1241 */ 1242 1243 int register_netdevice_notifier(struct notifier_block *nb) 1244 { 1245 struct net_device *dev; 1246 struct net_device *last; 1247 struct net *net; 1248 int err; 1249 1250 rtnl_lock(); 1251 err = raw_notifier_chain_register(&netdev_chain, nb); 1252 if (err) 1253 goto unlock; 1254 if (dev_boot_phase) 1255 goto unlock; 1256 for_each_net(net) { 1257 for_each_netdev(net, dev) { 1258 err = nb->notifier_call(nb, NETDEV_REGISTER, dev); 1259 err = notifier_to_errno(err); 1260 if (err) 1261 goto rollback; 1262 1263 if (!(dev->flags & IFF_UP)) 1264 continue; 1265 1266 nb->notifier_call(nb, NETDEV_UP, dev); 1267 } 1268 } 1269 1270 unlock: 1271 rtnl_unlock(); 1272 return err; 1273 1274 rollback: 1275 last = dev; 1276 for_each_net(net) { 1277 for_each_netdev(net, dev) { 1278 if (dev == last) 1279 break; 1280 1281 if (dev->flags & IFF_UP) { 1282 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev); 1283 nb->notifier_call(nb, NETDEV_DOWN, dev); 1284 } 1285 nb->notifier_call(nb, NETDEV_UNREGISTER, dev); 1286 } 1287 } 1288 1289 raw_notifier_chain_unregister(&netdev_chain, nb); 1290 goto unlock; 1291 } 1292 EXPORT_SYMBOL(register_netdevice_notifier); 1293 1294 /** 1295 * unregister_netdevice_notifier - unregister a network notifier block 1296 * @nb: notifier 1297 * 1298 * Unregister a notifier previously registered by 1299 * register_netdevice_notifier(). The notifier is unlinked into the 1300 * kernel structures and may then be reused. A negative errno code 1301 * is returned on a failure. 
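 *
 * A minimal sketch of the register/unregister pairing (the callback
 * and variable names are hypothetical):
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *		...
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_nb);
 *	...
 *	unregister_netdevice_notifier(&my_nb);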
1302 */ 1303 1304 int unregister_netdevice_notifier(struct notifier_block *nb) 1305 { 1306 int err; 1307 1308 rtnl_lock(); 1309 err = raw_notifier_chain_unregister(&netdev_chain, nb); 1310 rtnl_unlock(); 1311 return err; 1312 } 1313 EXPORT_SYMBOL(unregister_netdevice_notifier); 1314 1315 /** 1316 * call_netdevice_notifiers - call all network notifier blocks 1317 * @val: value passed unmodified to notifier function 1318 * @dev: net_device pointer passed unmodified to notifier function 1319 * 1320 * Call all network notifier blocks. Parameters and return value 1321 * are as for raw_notifier_call_chain(). 1322 */ 1323 1324 int call_netdevice_notifiers(unsigned long val, struct net_device *dev) 1325 { 1326 return raw_notifier_call_chain(&netdev_chain, val, dev); 1327 } 1328 1329 /* When > 0 there are consumers of rx skb time stamps */ 1330 static atomic_t netstamp_needed = ATOMIC_INIT(0); 1331 1332 void net_enable_timestamp(void) 1333 { 1334 atomic_inc(&netstamp_needed); 1335 } 1336 EXPORT_SYMBOL(net_enable_timestamp); 1337 1338 void net_disable_timestamp(void) 1339 { 1340 atomic_dec(&netstamp_needed); 1341 } 1342 EXPORT_SYMBOL(net_disable_timestamp); 1343 1344 static inline void net_timestamp(struct sk_buff *skb) 1345 { 1346 if (atomic_read(&netstamp_needed)) 1347 __net_timestamp(skb); 1348 else 1349 skb->tstamp.tv64 = 0; 1350 } 1351 1352 /* 1353 * Support routine. Sends outgoing frames to any network 1354 * taps currently in use. 1355 */ 1356 1357 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) 1358 { 1359 struct packet_type *ptype; 1360 1361 #ifdef CONFIG_NET_CLS_ACT 1362 if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS))) 1363 net_timestamp(skb); 1364 #else 1365 net_timestamp(skb); 1366 #endif 1367 1368 rcu_read_lock(); 1369 list_for_each_entry_rcu(ptype, &ptype_all, list) { 1370 /* Never send packets back to the socket 1371 * they originated from - MvS (miquels@drinkel.ow.org) 1372 */ 1373 if ((ptype->dev == dev || !ptype->dev) && 1374 (ptype->af_packet_priv == NULL || 1375 (struct sock *)ptype->af_packet_priv != skb->sk)) { 1376 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 1377 if (!skb2) 1378 break; 1379 1380 /* skb->nh should be correctly 1381 set by sender, so that the second statement is 1382 just protection against buggy protocols. 
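   ("skb->nh" is the old name for what is now skb->network_header,
   which the check below compares against skb->data and skb->tail.)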
1383 */ 1384 skb_reset_mac_header(skb2); 1385 1386 if (skb_network_header(skb2) < skb2->data || 1387 skb2->network_header > skb2->tail) { 1388 if (net_ratelimit()) 1389 printk(KERN_CRIT "protocol %04x is " 1390 "buggy, dev %s\n", 1391 skb2->protocol, dev->name); 1392 skb_reset_network_header(skb2); 1393 } 1394 1395 skb2->transport_header = skb2->network_header; 1396 skb2->pkt_type = PACKET_OUTGOING; 1397 ptype->func(skb2, skb->dev, ptype, skb->dev); 1398 } 1399 } 1400 rcu_read_unlock(); 1401 } 1402 1403 1404 static inline void __netif_reschedule(struct Qdisc *q) 1405 { 1406 struct softnet_data *sd; 1407 unsigned long flags; 1408 1409 local_irq_save(flags); 1410 sd = &__get_cpu_var(softnet_data); 1411 q->next_sched = sd->output_queue; 1412 sd->output_queue = q; 1413 raise_softirq_irqoff(NET_TX_SOFTIRQ); 1414 local_irq_restore(flags); 1415 } 1416 1417 void __netif_schedule(struct Qdisc *q) 1418 { 1419 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) 1420 __netif_reschedule(q); 1421 } 1422 EXPORT_SYMBOL(__netif_schedule); 1423 1424 void dev_kfree_skb_irq(struct sk_buff *skb) 1425 { 1426 if (atomic_dec_and_test(&skb->users)) { 1427 struct softnet_data *sd; 1428 unsigned long flags; 1429 1430 local_irq_save(flags); 1431 sd = &__get_cpu_var(softnet_data); 1432 skb->next = sd->completion_queue; 1433 sd->completion_queue = skb; 1434 raise_softirq_irqoff(NET_TX_SOFTIRQ); 1435 local_irq_restore(flags); 1436 } 1437 } 1438 EXPORT_SYMBOL(dev_kfree_skb_irq); 1439 1440 void dev_kfree_skb_any(struct sk_buff *skb) 1441 { 1442 if (in_irq() || irqs_disabled()) 1443 dev_kfree_skb_irq(skb); 1444 else 1445 dev_kfree_skb(skb); 1446 } 1447 EXPORT_SYMBOL(dev_kfree_skb_any); 1448 1449 1450 /** 1451 * netif_device_detach - mark device as removed 1452 * @dev: network device 1453 * 1454 * Mark device as removed from system and therefore no longer available. 1455 */ 1456 void netif_device_detach(struct net_device *dev) 1457 { 1458 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) && 1459 netif_running(dev)) { 1460 netif_tx_stop_all_queues(dev); 1461 } 1462 } 1463 EXPORT_SYMBOL(netif_device_detach); 1464 1465 /** 1466 * netif_device_attach - mark device as attached 1467 * @dev: network device 1468 * 1469 * Mark device as attached from system and restart if needed. 1470 */ 1471 void netif_device_attach(struct net_device *dev) 1472 { 1473 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) && 1474 netif_running(dev)) { 1475 netif_tx_wake_all_queues(dev); 1476 __netdev_watchdog_up(dev); 1477 } 1478 } 1479 EXPORT_SYMBOL(netif_device_attach); 1480 1481 static bool can_checksum_protocol(unsigned long features, __be16 protocol) 1482 { 1483 return ((features & NETIF_F_GEN_CSUM) || 1484 ((features & NETIF_F_IP_CSUM) && 1485 protocol == htons(ETH_P_IP)) || 1486 ((features & NETIF_F_IPV6_CSUM) && 1487 protocol == htons(ETH_P_IPV6)) || 1488 ((features & NETIF_F_FCOE_CRC) && 1489 protocol == htons(ETH_P_FCOE))); 1490 } 1491 1492 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb) 1493 { 1494 if (can_checksum_protocol(dev->features, skb->protocol)) 1495 return true; 1496 1497 if (skb->protocol == htons(ETH_P_8021Q)) { 1498 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data; 1499 if (can_checksum_protocol(dev->features & dev->vlan_features, 1500 veh->h_vlan_encapsulated_proto)) 1501 return true; 1502 } 1503 1504 return false; 1505 } 1506 1507 /* 1508 * Invalidate hardware checksum when packet is to be mangled, and 1509 * complete checksum manually on outgoing path. 
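 *
 * For a CHECKSUM_PARTIAL TCP/IPv4 skb, for instance, csum_start points
 * at the TCP header and csum_offset is offsetof(struct tcphdr, check),
 * so the csum_fold() below lands in the TCP checksum field.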
1510 */ 1511 int skb_checksum_help(struct sk_buff *skb) 1512 { 1513 __wsum csum; 1514 int ret = 0, offset; 1515 1516 if (skb->ip_summed == CHECKSUM_COMPLETE) 1517 goto out_set_summed; 1518 1519 if (unlikely(skb_shinfo(skb)->gso_size)) { 1520 /* Let GSO fix up the checksum. */ 1521 goto out_set_summed; 1522 } 1523 1524 offset = skb->csum_start - skb_headroom(skb); 1525 BUG_ON(offset >= skb_headlen(skb)); 1526 csum = skb_checksum(skb, offset, skb->len - offset, 0); 1527 1528 offset += skb->csum_offset; 1529 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); 1530 1531 if (skb_cloned(skb) && 1532 !skb_clone_writable(skb, offset + sizeof(__sum16))) { 1533 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); 1534 if (ret) 1535 goto out; 1536 } 1537 1538 *(__sum16 *)(skb->data + offset) = csum_fold(csum); 1539 out_set_summed: 1540 skb->ip_summed = CHECKSUM_NONE; 1541 out: 1542 return ret; 1543 } 1544 EXPORT_SYMBOL(skb_checksum_help); 1545 1546 /** 1547 * skb_gso_segment - Perform segmentation on skb. 1548 * @skb: buffer to segment 1549 * @features: features for the output path (see dev->features) 1550 * 1551 * This function segments the given skb and returns a list of segments. 1552 * 1553 * It may return NULL if the skb requires no segmentation. This is 1554 * only possible when GSO is used for verifying header integrity. 1555 */ 1556 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) 1557 { 1558 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); 1559 struct packet_type *ptype; 1560 __be16 type = skb->protocol; 1561 int err; 1562 1563 skb_reset_mac_header(skb); 1564 skb->mac_len = skb->network_header - skb->mac_header; 1565 __skb_pull(skb, skb->mac_len); 1566 1567 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { 1568 struct net_device *dev = skb->dev; 1569 struct ethtool_drvinfo info = {}; 1570 1571 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo) 1572 dev->ethtool_ops->get_drvinfo(dev, &info); 1573 1574 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d " 1575 "ip_summed=%d", 1576 info.driver, dev ? dev->features : 0L, 1577 skb->sk ? skb->sk->sk_route_caps : 0L, 1578 skb->len, skb->data_len, skb->ip_summed); 1579 1580 if (skb_header_cloned(skb) && 1581 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) 1582 return ERR_PTR(err); 1583 } 1584 1585 rcu_read_lock(); 1586 list_for_each_entry_rcu(ptype, 1587 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { 1588 if (ptype->type == type && !ptype->dev && ptype->gso_segment) { 1589 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { 1590 err = ptype->gso_send_check(skb); 1591 segs = ERR_PTR(err); 1592 if (err || skb_gso_ok(skb, features)) 1593 break; 1594 __skb_push(skb, (skb->data - 1595 skb_network_header(skb))); 1596 } 1597 segs = ptype->gso_segment(skb, features); 1598 break; 1599 } 1600 } 1601 rcu_read_unlock(); 1602 1603 __skb_push(skb, skb->data - skb_mac_header(skb)); 1604 1605 return segs; 1606 } 1607 EXPORT_SYMBOL(skb_gso_segment); 1608 1609 /* Take action when hardware reception checksum errors are detected. */ 1610 #ifdef CONFIG_BUG 1611 void netdev_rx_csum_fault(struct net_device *dev) 1612 { 1613 if (net_ratelimit()) { 1614 printk(KERN_ERR "%s: hw csum failure.\n", 1615 dev ? dev->name : "<unknown>"); 1616 dump_stack(); 1617 } 1618 } 1619 EXPORT_SYMBOL(netdev_rx_csum_fault); 1620 #endif 1621 1622 /* Actually, we should eliminate this check as soon as we know, that: 1623 * 1. IOMMU is present and allows to map all the memory. 1624 * 2. No high memory really exists on this machine. 
1625 */ 1626 1627 static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) 1628 { 1629 #ifdef CONFIG_HIGHMEM 1630 int i; 1631 1632 if (dev->features & NETIF_F_HIGHDMA) 1633 return 0; 1634 1635 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) 1636 if (PageHighMem(skb_shinfo(skb)->frags[i].page)) 1637 return 1; 1638 1639 #endif 1640 return 0; 1641 } 1642 1643 struct dev_gso_cb { 1644 void (*destructor)(struct sk_buff *skb); 1645 }; 1646 1647 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb) 1648 1649 static void dev_gso_skb_destructor(struct sk_buff *skb) 1650 { 1651 struct dev_gso_cb *cb; 1652 1653 do { 1654 struct sk_buff *nskb = skb->next; 1655 1656 skb->next = nskb->next; 1657 nskb->next = NULL; 1658 kfree_skb(nskb); 1659 } while (skb->next); 1660 1661 cb = DEV_GSO_CB(skb); 1662 if (cb->destructor) 1663 cb->destructor(skb); 1664 } 1665 1666 /** 1667 * dev_gso_segment - Perform emulated hardware segmentation on skb. 1668 * @skb: buffer to segment 1669 * 1670 * This function segments the given skb and stores the list of segments 1671 * in skb->next. 1672 */ 1673 static int dev_gso_segment(struct sk_buff *skb) 1674 { 1675 struct net_device *dev = skb->dev; 1676 struct sk_buff *segs; 1677 int features = dev->features & ~(illegal_highdma(dev, skb) ? 1678 NETIF_F_SG : 0); 1679 1680 segs = skb_gso_segment(skb, features); 1681 1682 /* Verifying header integrity only. */ 1683 if (!segs) 1684 return 0; 1685 1686 if (IS_ERR(segs)) 1687 return PTR_ERR(segs); 1688 1689 skb->next = segs; 1690 DEV_GSO_CB(skb)->destructor = skb->destructor; 1691 skb->destructor = dev_gso_skb_destructor; 1692 1693 return 0; 1694 } 1695 1696 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, 1697 struct netdev_queue *txq) 1698 { 1699 const struct net_device_ops *ops = dev->netdev_ops; 1700 int rc; 1701 1702 if (likely(!skb->next)) { 1703 if (!list_empty(&ptype_all)) 1704 dev_queue_xmit_nit(skb, dev); 1705 1706 if (netif_needs_gso(dev, skb)) { 1707 if (unlikely(dev_gso_segment(skb))) 1708 goto out_kfree_skb; 1709 if (skb->next) 1710 goto gso; 1711 } 1712 1713 /* 1714 * If device doesnt need skb->dst, release it right now while 1715 * its hot in this cpu cache 1716 */ 1717 if (dev->priv_flags & IFF_XMIT_DST_RELEASE) 1718 skb_dst_drop(skb); 1719 1720 rc = ops->ndo_start_xmit(skb, dev); 1721 if (rc == NETDEV_TX_OK) 1722 txq_trans_update(txq); 1723 /* 1724 * TODO: if skb_orphan() was called by 1725 * dev->hard_start_xmit() (for example, the unmodified 1726 * igb driver does that; bnx2 doesn't), then 1727 * skb_tx_software_timestamp() will be unable to send 1728 * back the time stamp. 1729 * 1730 * How can this be prevented? Always create another 1731 * reference to the socket before calling 1732 * dev->hard_start_xmit()? Prevent that skb_orphan() 1733 * does anything in dev->hard_start_xmit() by clearing 1734 * the skb destructor before the call and restoring it 1735 * afterwards, then doing the skb_orphan() ourselves? 
1736 */ 1737 return rc; 1738 } 1739 1740 gso: 1741 do { 1742 struct sk_buff *nskb = skb->next; 1743 1744 skb->next = nskb->next; 1745 nskb->next = NULL; 1746 rc = ops->ndo_start_xmit(nskb, dev); 1747 if (unlikely(rc != NETDEV_TX_OK)) { 1748 nskb->next = skb->next; 1749 skb->next = nskb; 1750 return rc; 1751 } 1752 txq_trans_update(txq); 1753 if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) 1754 return NETDEV_TX_BUSY; 1755 } while (skb->next); 1756 1757 skb->destructor = DEV_GSO_CB(skb)->destructor; 1758 1759 out_kfree_skb: 1760 kfree_skb(skb); 1761 return NETDEV_TX_OK; 1762 } 1763 1764 static u32 skb_tx_hashrnd; 1765 1766 u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb) 1767 { 1768 u32 hash; 1769 1770 if (skb_rx_queue_recorded(skb)) { 1771 hash = skb_get_rx_queue(skb); 1772 while (unlikely(hash >= dev->real_num_tx_queues)) 1773 hash -= dev->real_num_tx_queues; 1774 return hash; 1775 } 1776 1777 if (skb->sk && skb->sk->sk_hash) 1778 hash = skb->sk->sk_hash; 1779 else 1780 hash = skb->protocol; 1781 1782 hash = jhash_1word(hash, skb_tx_hashrnd); 1783 1784 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); 1785 } 1786 EXPORT_SYMBOL(skb_tx_hash); 1787 1788 static struct netdev_queue *dev_pick_tx(struct net_device *dev, 1789 struct sk_buff *skb) 1790 { 1791 u16 queue_index; 1792 struct sock *sk = skb->sk; 1793 1794 if (sk_tx_queue_recorded(sk)) { 1795 queue_index = sk_tx_queue_get(sk); 1796 } else { 1797 const struct net_device_ops *ops = dev->netdev_ops; 1798 1799 if (ops->ndo_select_queue) { 1800 queue_index = ops->ndo_select_queue(dev, skb); 1801 } else { 1802 queue_index = 0; 1803 if (dev->real_num_tx_queues > 1) 1804 queue_index = skb_tx_hash(dev, skb); 1805 1806 if (sk && sk->sk_dst_cache) 1807 sk_tx_queue_set(sk, queue_index); 1808 } 1809 } 1810 1811 skb_set_queue_mapping(skb, queue_index); 1812 return netdev_get_tx_queue(dev, queue_index); 1813 } 1814 1815 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, 1816 struct net_device *dev, 1817 struct netdev_queue *txq) 1818 { 1819 spinlock_t *root_lock = qdisc_lock(q); 1820 int rc; 1821 1822 spin_lock(root_lock); 1823 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { 1824 kfree_skb(skb); 1825 rc = NET_XMIT_DROP; 1826 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && 1827 !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) { 1828 /* 1829 * This is a work-conserving queue; there are no old skbs 1830 * waiting to be sent out; and the qdisc is not running - 1831 * xmit the skb directly. 1832 */ 1833 __qdisc_update_bstats(q, skb->len); 1834 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) 1835 __qdisc_run(q); 1836 else 1837 clear_bit(__QDISC_STATE_RUNNING, &q->state); 1838 1839 rc = NET_XMIT_SUCCESS; 1840 } else { 1841 rc = qdisc_enqueue_root(skb, q); 1842 qdisc_run(q); 1843 } 1844 spin_unlock(root_lock); 1845 1846 return rc; 1847 } 1848 1849 /** 1850 * dev_queue_xmit - transmit a buffer 1851 * @skb: buffer to transmit 1852 * 1853 * Queue a buffer for transmission to a network device. The caller must 1854 * have set the device and priority and built the buffer before calling 1855 * this function. The function can be called from an interrupt. 1856 * 1857 * A negative errno code is returned on a failure. A success does not 1858 * guarantee the frame will be transmitted as it may be dropped due 1859 * to congestion or traffic shaping. 
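 *
 * A minimal caller sketch (purely illustrative; note the skb is
 * consumed whatever the return value):
 *
 *	skb->dev = dev;
 *	skb->priority = TC_PRIO_CONTROL;
 *	rc = dev_queue_xmit(skb);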
1860 * 1861 * ----------------------------------------------------------------------------------- 1862 * I notice this method can also return errors from the queue disciplines, 1863 * including NET_XMIT_DROP, which is a positive value. So, errors can also 1864 * be positive. 1865 * 1866 * Regardless of the return value, the skb is consumed, so it is currently 1867 * difficult to retry a send to this method. (You can bump the ref count 1868 * before sending to hold a reference for retry if you are careful.) 1869 * 1870 * When calling this method, interrupts MUST be enabled. This is because 1871 * the BH enable code must have IRQs enabled so that it will not deadlock. 1872 * --BLG 1873 */ 1874 int dev_queue_xmit(struct sk_buff *skb) 1875 { 1876 struct net_device *dev = skb->dev; 1877 struct netdev_queue *txq; 1878 struct Qdisc *q; 1879 int rc = -ENOMEM; 1880 1881 /* GSO will handle the following emulations directly. */ 1882 if (netif_needs_gso(dev, skb)) 1883 goto gso; 1884 1885 if (skb_has_frags(skb) && 1886 !(dev->features & NETIF_F_FRAGLIST) && 1887 __skb_linearize(skb)) 1888 goto out_kfree_skb; 1889 1890 /* Fragmented skb is linearized if device does not support SG, 1891 * or if at least one of fragments is in highmem and device 1892 * does not support DMA from it. 1893 */ 1894 if (skb_shinfo(skb)->nr_frags && 1895 (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && 1896 __skb_linearize(skb)) 1897 goto out_kfree_skb; 1898 1899 /* If packet is not checksummed and device does not support 1900 * checksumming for this protocol, complete checksumming here. 1901 */ 1902 if (skb->ip_summed == CHECKSUM_PARTIAL) { 1903 skb_set_transport_header(skb, skb->csum_start - 1904 skb_headroom(skb)); 1905 if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb)) 1906 goto out_kfree_skb; 1907 } 1908 1909 gso: 1910 /* Disable soft irqs for various locks below. Also 1911 * stops preemption for RCU. 1912 */ 1913 rcu_read_lock_bh(); 1914 1915 txq = dev_pick_tx(dev, skb); 1916 q = rcu_dereference(txq->qdisc); 1917 1918 #ifdef CONFIG_NET_CLS_ACT 1919 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); 1920 #endif 1921 if (q->enqueue) { 1922 rc = __dev_xmit_skb(skb, q, dev, txq); 1923 goto out; 1924 } 1925 1926 /* The device has no queue. Common case for software devices: 1927 loopback, all the sorts of tunnels... 1928 1929 Really, it is unlikely that netif_tx_lock protection is necessary 1930 here. (f.e. loopback and IP tunnels are clean ignoring statistics 1931 counters.) 1932 However, it is possible, that they rely on protection 1933 made by us here. 1934 1935 Check this and shot the lock. It is not prone from deadlocks. 1936 Either shot noqueue qdisc, it is even simpler 8) 1937 */ 1938 if (dev->flags & IFF_UP) { 1939 int cpu = smp_processor_id(); /* ok because BHs are off */ 1940 1941 if (txq->xmit_lock_owner != cpu) { 1942 1943 HARD_TX_LOCK(dev, txq, cpu); 1944 1945 if (!netif_tx_queue_stopped(txq)) { 1946 rc = NET_XMIT_SUCCESS; 1947 if (!dev_hard_start_xmit(skb, dev, txq)) { 1948 HARD_TX_UNLOCK(dev, txq); 1949 goto out; 1950 } 1951 } 1952 HARD_TX_UNLOCK(dev, txq); 1953 if (net_ratelimit()) 1954 printk(KERN_CRIT "Virtual device %s asks to " 1955 "queue packet!\n", dev->name); 1956 } else { 1957 /* Recursion is detected! 
It is possible, 1958 * unfortunately */ 1959 if (net_ratelimit()) 1960 printk(KERN_CRIT "Dead loop on virtual device " 1961 "%s, fix it urgently!\n", dev->name); 1962 } 1963 } 1964 1965 rc = -ENETDOWN; 1966 rcu_read_unlock_bh(); 1967 1968 out_kfree_skb: 1969 kfree_skb(skb); 1970 return rc; 1971 out: 1972 rcu_read_unlock_bh(); 1973 return rc; 1974 } 1975 EXPORT_SYMBOL(dev_queue_xmit); 1976 1977 1978 /*======================================================================= 1979 Receiver routines 1980 =======================================================================*/ 1981 1982 int netdev_max_backlog __read_mostly = 1000; 1983 int netdev_budget __read_mostly = 300; 1984 int weight_p __read_mostly = 64; /* old backlog weight */ 1985 1986 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; 1987 1988 1989 /** 1990 * netif_rx - post buffer to the network code 1991 * @skb: buffer to post 1992 * 1993 * This function receives a packet from a device driver and queues it for 1994 * the upper (protocol) levels to process. It always succeeds. The buffer 1995 * may be dropped during processing for congestion control or by the 1996 * protocol layers. 1997 * 1998 * return values: 1999 * NET_RX_SUCCESS (no congestion) 2000 * NET_RX_DROP (packet was dropped) 2001 * 2002 */ 2003 2004 int netif_rx(struct sk_buff *skb) 2005 { 2006 struct softnet_data *queue; 2007 unsigned long flags; 2008 2009 /* if netpoll wants it, pretend we never saw it */ 2010 if (netpoll_rx(skb)) 2011 return NET_RX_DROP; 2012 2013 if (!skb->tstamp.tv64) 2014 net_timestamp(skb); 2015 2016 /* 2017 * The code is rearranged so that the path is the most 2018 * short when CPU is congested, but is still operating. 2019 */ 2020 local_irq_save(flags); 2021 queue = &__get_cpu_var(softnet_data); 2022 2023 __get_cpu_var(netdev_rx_stat).total++; 2024 if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { 2025 if (queue->input_pkt_queue.qlen) { 2026 enqueue: 2027 __skb_queue_tail(&queue->input_pkt_queue, skb); 2028 local_irq_restore(flags); 2029 return NET_RX_SUCCESS; 2030 } 2031 2032 napi_schedule(&queue->backlog); 2033 goto enqueue; 2034 } 2035 2036 __get_cpu_var(netdev_rx_stat).dropped++; 2037 local_irq_restore(flags); 2038 2039 kfree_skb(skb); 2040 return NET_RX_DROP; 2041 } 2042 EXPORT_SYMBOL(netif_rx); 2043 2044 int netif_rx_ni(struct sk_buff *skb) 2045 { 2046 int err; 2047 2048 preempt_disable(); 2049 err = netif_rx(skb); 2050 if (local_softirq_pending()) 2051 do_softirq(); 2052 preempt_enable(); 2053 2054 return err; 2055 } 2056 EXPORT_SYMBOL(netif_rx_ni); 2057 2058 static void net_tx_action(struct softirq_action *h) 2059 { 2060 struct softnet_data *sd = &__get_cpu_var(softnet_data); 2061 2062 if (sd->completion_queue) { 2063 struct sk_buff *clist; 2064 2065 local_irq_disable(); 2066 clist = sd->completion_queue; 2067 sd->completion_queue = NULL; 2068 local_irq_enable(); 2069 2070 while (clist) { 2071 struct sk_buff *skb = clist; 2072 clist = clist->next; 2073 2074 WARN_ON(atomic_read(&skb->users)); 2075 __kfree_skb(skb); 2076 } 2077 } 2078 2079 if (sd->output_queue) { 2080 struct Qdisc *head; 2081 2082 local_irq_disable(); 2083 head = sd->output_queue; 2084 sd->output_queue = NULL; 2085 local_irq_enable(); 2086 2087 while (head) { 2088 struct Qdisc *q = head; 2089 spinlock_t *root_lock; 2090 2091 head = head->next_sched; 2092 2093 root_lock = qdisc_lock(q); 2094 if (spin_trylock(root_lock)) { 2095 smp_mb__before_clear_bit(); 2096 clear_bit(__QDISC_STATE_SCHED, 2097 &q->state); 2098 qdisc_run(q); 2099 spin_unlock(root_lock); 
2100 } else { 2101 if (!test_bit(__QDISC_STATE_DEACTIVATED, 2102 &q->state)) { 2103 __netif_reschedule(q); 2104 } else { 2105 smp_mb__before_clear_bit(); 2106 clear_bit(__QDISC_STATE_SCHED, 2107 &q->state); 2108 } 2109 } 2110 } 2111 } 2112 } 2113 2114 static inline int deliver_skb(struct sk_buff *skb, 2115 struct packet_type *pt_prev, 2116 struct net_device *orig_dev) 2117 { 2118 atomic_inc(&skb->users); 2119 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 2120 } 2121 2122 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) 2123 2124 #if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE) 2125 /* This hook is defined here for ATM LANE */ 2126 int (*br_fdb_test_addr_hook)(struct net_device *dev, 2127 unsigned char *addr) __read_mostly; 2128 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); 2129 #endif 2130 2131 /* 2132 * If bridge module is loaded call bridging hook. 2133 * returns NULL if packet was consumed. 2134 */ 2135 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p, 2136 struct sk_buff *skb) __read_mostly; 2137 EXPORT_SYMBOL_GPL(br_handle_frame_hook); 2138 2139 static inline struct sk_buff *handle_bridge(struct sk_buff *skb, 2140 struct packet_type **pt_prev, int *ret, 2141 struct net_device *orig_dev) 2142 { 2143 struct net_bridge_port *port; 2144 2145 if (skb->pkt_type == PACKET_LOOPBACK || 2146 (port = rcu_dereference(skb->dev->br_port)) == NULL) 2147 return skb; 2148 2149 if (*pt_prev) { 2150 *ret = deliver_skb(skb, *pt_prev, orig_dev); 2151 *pt_prev = NULL; 2152 } 2153 2154 return br_handle_frame_hook(port, skb); 2155 } 2156 #else 2157 #define handle_bridge(skb, pt_prev, ret, orig_dev) (skb) 2158 #endif 2159 2160 #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE) 2161 struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly; 2162 EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook); 2163 2164 static inline struct sk_buff *handle_macvlan(struct sk_buff *skb, 2165 struct packet_type **pt_prev, 2166 int *ret, 2167 struct net_device *orig_dev) 2168 { 2169 if (skb->dev->macvlan_port == NULL) 2170 return skb; 2171 2172 if (*pt_prev) { 2173 *ret = deliver_skb(skb, *pt_prev, orig_dev); 2174 *pt_prev = NULL; 2175 } 2176 return macvlan_handle_frame_hook(skb); 2177 } 2178 #else 2179 #define handle_macvlan(skb, pt_prev, ret, orig_dev) (skb) 2180 #endif 2181 2182 #ifdef CONFIG_NET_CLS_ACT 2183 /* TODO: Maybe we should just force sch_ingress to be compiled in 2184 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions 2185 * a compare and 2 stores extra right now if we dont have it on 2186 * but have CONFIG_NET_CLS_ACT 2187 * NOTE: This doesnt stop any functionality; if you dont have 2188 * the ingress scheduler, you just cant add policies on ingress. 
2189 * 2190 */ 2191 static int ing_filter(struct sk_buff *skb) 2192 { 2193 struct net_device *dev = skb->dev; 2194 u32 ttl = G_TC_RTTL(skb->tc_verd); 2195 struct netdev_queue *rxq; 2196 int result = TC_ACT_OK; 2197 struct Qdisc *q; 2198 2199 if (MAX_RED_LOOP < ttl++) { 2200 printk(KERN_WARNING 2201 "Redir loop detected Dropping packet (%d->%d)\n", 2202 skb->iif, dev->ifindex); 2203 return TC_ACT_SHOT; 2204 } 2205 2206 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); 2207 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); 2208 2209 rxq = &dev->rx_queue; 2210 2211 q = rxq->qdisc; 2212 if (q != &noop_qdisc) { 2213 spin_lock(qdisc_lock(q)); 2214 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) 2215 result = qdisc_enqueue_root(skb, q); 2216 spin_unlock(qdisc_lock(q)); 2217 } 2218 2219 return result; 2220 } 2221 2222 static inline struct sk_buff *handle_ing(struct sk_buff *skb, 2223 struct packet_type **pt_prev, 2224 int *ret, struct net_device *orig_dev) 2225 { 2226 if (skb->dev->rx_queue.qdisc == &noop_qdisc) 2227 goto out; 2228 2229 if (*pt_prev) { 2230 *ret = deliver_skb(skb, *pt_prev, orig_dev); 2231 *pt_prev = NULL; 2232 } else { 2233 /* Huh? Why does turning on AF_PACKET affect this? */ 2234 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); 2235 } 2236 2237 switch (ing_filter(skb)) { 2238 case TC_ACT_SHOT: 2239 case TC_ACT_STOLEN: 2240 kfree_skb(skb); 2241 return NULL; 2242 } 2243 2244 out: 2245 skb->tc_verd = 0; 2246 return skb; 2247 } 2248 #endif 2249 2250 /* 2251 * netif_nit_deliver - deliver received packets to network taps 2252 * @skb: buffer 2253 * 2254 * This function is used to deliver incoming packets to network 2255 * taps. It should be used when the normal netif_receive_skb path 2256 * is bypassed, for example because of VLAN acceleration. 2257 */ 2258 void netif_nit_deliver(struct sk_buff *skb) 2259 { 2260 struct packet_type *ptype; 2261 2262 if (list_empty(&ptype_all)) 2263 return; 2264 2265 skb_reset_network_header(skb); 2266 skb_reset_transport_header(skb); 2267 skb->mac_len = skb->network_header - skb->mac_header; 2268 2269 rcu_read_lock(); 2270 list_for_each_entry_rcu(ptype, &ptype_all, list) { 2271 if (!ptype->dev || ptype->dev == skb->dev) 2272 deliver_skb(skb, ptype, skb->dev); 2273 } 2274 rcu_read_unlock(); 2275 } 2276 2277 /** 2278 * netif_receive_skb - process receive buffer from network 2279 * @skb: buffer to process 2280 * 2281 * netif_receive_skb() is the main receive data processing function. 2282 * It always succeeds. The buffer may be dropped during processing 2283 * for congestion control or by the protocol layers. 2284 * 2285 * This function may only be called from softirq context and interrupts 2286 * should be enabled. 
2287 * 2288 * Return values (usually ignored): 2289 * NET_RX_SUCCESS: no congestion 2290 * NET_RX_DROP: packet was dropped 2291 */ 2292 int netif_receive_skb(struct sk_buff *skb) 2293 { 2294 struct packet_type *ptype, *pt_prev; 2295 struct net_device *orig_dev; 2296 struct net_device *null_or_orig; 2297 int ret = NET_RX_DROP; 2298 __be16 type; 2299 2300 if (!skb->tstamp.tv64) 2301 net_timestamp(skb); 2302 2303 if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb)) 2304 return NET_RX_SUCCESS; 2305 2306 /* if we've gotten here through NAPI, check netpoll */ 2307 if (netpoll_receive_skb(skb)) 2308 return NET_RX_DROP; 2309 2310 if (!skb->iif) 2311 skb->iif = skb->dev->ifindex; 2312 2313 null_or_orig = NULL; 2314 orig_dev = skb->dev; 2315 if (orig_dev->master) { 2316 if (skb_bond_should_drop(skb)) 2317 null_or_orig = orig_dev; /* deliver only exact match */ 2318 else 2319 skb->dev = orig_dev->master; 2320 } 2321 2322 __get_cpu_var(netdev_rx_stat).total++; 2323 2324 skb_reset_network_header(skb); 2325 skb_reset_transport_header(skb); 2326 skb->mac_len = skb->network_header - skb->mac_header; 2327 2328 pt_prev = NULL; 2329 2330 rcu_read_lock(); 2331 2332 #ifdef CONFIG_NET_CLS_ACT 2333 if (skb->tc_verd & TC_NCLS) { 2334 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); 2335 goto ncls; 2336 } 2337 #endif 2338 2339 list_for_each_entry_rcu(ptype, &ptype_all, list) { 2340 if (ptype->dev == null_or_orig || ptype->dev == skb->dev || 2341 ptype->dev == orig_dev) { 2342 if (pt_prev) 2343 ret = deliver_skb(skb, pt_prev, orig_dev); 2344 pt_prev = ptype; 2345 } 2346 } 2347 2348 #ifdef CONFIG_NET_CLS_ACT 2349 skb = handle_ing(skb, &pt_prev, &ret, orig_dev); 2350 if (!skb) 2351 goto out; 2352 ncls: 2353 #endif 2354 2355 skb = handle_bridge(skb, &pt_prev, &ret, orig_dev); 2356 if (!skb) 2357 goto out; 2358 skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev); 2359 if (!skb) 2360 goto out; 2361 2362 type = skb->protocol; 2363 list_for_each_entry_rcu(ptype, 2364 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) { 2365 if (ptype->type == type && 2366 (ptype->dev == null_or_orig || ptype->dev == skb->dev || 2367 ptype->dev == orig_dev)) { 2368 if (pt_prev) 2369 ret = deliver_skb(skb, pt_prev, orig_dev); 2370 pt_prev = ptype; 2371 } 2372 } 2373 2374 if (pt_prev) { 2375 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); 2376 } else { 2377 kfree_skb(skb); 2378 /* Jamal, now you will not able to escape explaining 2379 * me how you were going to use this. 
:-) 2380 */ 2381 ret = NET_RX_DROP; 2382 } 2383 2384 out: 2385 rcu_read_unlock(); 2386 return ret; 2387 } 2388 EXPORT_SYMBOL(netif_receive_skb); 2389 2390 /* Network device is going away, flush any packets still pending */ 2391 static void flush_backlog(void *arg) 2392 { 2393 struct net_device *dev = arg; 2394 struct softnet_data *queue = &__get_cpu_var(softnet_data); 2395 struct sk_buff *skb, *tmp; 2396 2397 skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp) 2398 if (skb->dev == dev) { 2399 __skb_unlink(skb, &queue->input_pkt_queue); 2400 kfree_skb(skb); 2401 } 2402 } 2403 2404 static int napi_gro_complete(struct sk_buff *skb) 2405 { 2406 struct packet_type *ptype; 2407 __be16 type = skb->protocol; 2408 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; 2409 int err = -ENOENT; 2410 2411 if (NAPI_GRO_CB(skb)->count == 1) { 2412 skb_shinfo(skb)->gso_size = 0; 2413 goto out; 2414 } 2415 2416 rcu_read_lock(); 2417 list_for_each_entry_rcu(ptype, head, list) { 2418 if (ptype->type != type || ptype->dev || !ptype->gro_complete) 2419 continue; 2420 2421 err = ptype->gro_complete(skb); 2422 break; 2423 } 2424 rcu_read_unlock(); 2425 2426 if (err) { 2427 WARN_ON(&ptype->list == head); 2428 kfree_skb(skb); 2429 return NET_RX_SUCCESS; 2430 } 2431 2432 out: 2433 return netif_receive_skb(skb); 2434 } 2435 2436 void napi_gro_flush(struct napi_struct *napi) 2437 { 2438 struct sk_buff *skb, *next; 2439 2440 for (skb = napi->gro_list; skb; skb = next) { 2441 next = skb->next; 2442 skb->next = NULL; 2443 napi_gro_complete(skb); 2444 } 2445 2446 napi->gro_count = 0; 2447 napi->gro_list = NULL; 2448 } 2449 EXPORT_SYMBOL(napi_gro_flush); 2450 2451 int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 2452 { 2453 struct sk_buff **pp = NULL; 2454 struct packet_type *ptype; 2455 __be16 type = skb->protocol; 2456 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK]; 2457 int same_flow; 2458 int mac_len; 2459 int ret; 2460 2461 if (!(skb->dev->features & NETIF_F_GRO)) 2462 goto normal; 2463 2464 if (skb_is_gso(skb) || skb_has_frags(skb)) 2465 goto normal; 2466 2467 rcu_read_lock(); 2468 list_for_each_entry_rcu(ptype, head, list) { 2469 if (ptype->type != type || ptype->dev || !ptype->gro_receive) 2470 continue; 2471 2472 skb_set_network_header(skb, skb_gro_offset(skb)); 2473 mac_len = skb->network_header - skb->mac_header; 2474 skb->mac_len = mac_len; 2475 NAPI_GRO_CB(skb)->same_flow = 0; 2476 NAPI_GRO_CB(skb)->flush = 0; 2477 NAPI_GRO_CB(skb)->free = 0; 2478 2479 pp = ptype->gro_receive(&napi->gro_list, skb); 2480 break; 2481 } 2482 rcu_read_unlock(); 2483 2484 if (&ptype->list == head) 2485 goto normal; 2486 2487 same_flow = NAPI_GRO_CB(skb)->same_flow; 2488 ret = NAPI_GRO_CB(skb)->free ? 
GRO_MERGED_FREE : GRO_MERGED; 2489 2490 if (pp) { 2491 struct sk_buff *nskb = *pp; 2492 2493 *pp = nskb->next; 2494 nskb->next = NULL; 2495 napi_gro_complete(nskb); 2496 napi->gro_count--; 2497 } 2498 2499 if (same_flow) 2500 goto ok; 2501 2502 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS) 2503 goto normal; 2504 2505 napi->gro_count++; 2506 NAPI_GRO_CB(skb)->count = 1; 2507 skb_shinfo(skb)->gso_size = skb_gro_len(skb); 2508 skb->next = napi->gro_list; 2509 napi->gro_list = skb; 2510 ret = GRO_HELD; 2511 2512 pull: 2513 if (skb_headlen(skb) < skb_gro_offset(skb)) { 2514 int grow = skb_gro_offset(skb) - skb_headlen(skb); 2515 2516 BUG_ON(skb->end - skb->tail < grow); 2517 2518 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); 2519 2520 skb->tail += grow; 2521 skb->data_len -= grow; 2522 2523 skb_shinfo(skb)->frags[0].page_offset += grow; 2524 skb_shinfo(skb)->frags[0].size -= grow; 2525 2526 if (unlikely(!skb_shinfo(skb)->frags[0].size)) { 2527 put_page(skb_shinfo(skb)->frags[0].page); 2528 memmove(skb_shinfo(skb)->frags, 2529 skb_shinfo(skb)->frags + 1, 2530 --skb_shinfo(skb)->nr_frags); 2531 } 2532 } 2533 2534 ok: 2535 return ret; 2536 2537 normal: 2538 ret = GRO_NORMAL; 2539 goto pull; 2540 } 2541 EXPORT_SYMBOL(dev_gro_receive); 2542 2543 static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 2544 { 2545 struct sk_buff *p; 2546 2547 if (netpoll_rx_on(skb)) 2548 return GRO_NORMAL; 2549 2550 for (p = napi->gro_list; p; p = p->next) { 2551 NAPI_GRO_CB(p)->same_flow = (p->dev == skb->dev) 2552 && !compare_ether_header(skb_mac_header(p), 2553 skb_gro_mac_header(skb)); 2554 NAPI_GRO_CB(p)->flush = 0; 2555 } 2556 2557 return dev_gro_receive(napi, skb); 2558 } 2559 2560 int napi_skb_finish(int ret, struct sk_buff *skb) 2561 { 2562 int err = NET_RX_SUCCESS; 2563 2564 switch (ret) { 2565 case GRO_NORMAL: 2566 return netif_receive_skb(skb); 2567 2568 case GRO_DROP: 2569 err = NET_RX_DROP; 2570 /* fall through */ 2571 2572 case GRO_MERGED_FREE: 2573 kfree_skb(skb); 2574 break; 2575 } 2576 2577 return err; 2578 } 2579 EXPORT_SYMBOL(napi_skb_finish); 2580 2581 void skb_gro_reset_offset(struct sk_buff *skb) 2582 { 2583 NAPI_GRO_CB(skb)->data_offset = 0; 2584 NAPI_GRO_CB(skb)->frag0 = NULL; 2585 NAPI_GRO_CB(skb)->frag0_len = 0; 2586 2587 if (skb->mac_header == skb->tail && 2588 !PageHighMem(skb_shinfo(skb)->frags[0].page)) { 2589 NAPI_GRO_CB(skb)->frag0 = 2590 page_address(skb_shinfo(skb)->frags[0].page) + 2591 skb_shinfo(skb)->frags[0].page_offset; 2592 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size; 2593 } 2594 } 2595 EXPORT_SYMBOL(skb_gro_reset_offset); 2596 2597 int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) 2598 { 2599 skb_gro_reset_offset(skb); 2600 2601 return napi_skb_finish(__napi_gro_receive(napi, skb), skb); 2602 } 2603 EXPORT_SYMBOL(napi_gro_receive); 2604 2605 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) 2606 { 2607 __skb_pull(skb, skb_headlen(skb)); 2608 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); 2609 2610 napi->skb = skb; 2611 } 2612 EXPORT_SYMBOL(napi_reuse_skb); 2613 2614 struct sk_buff *napi_get_frags(struct napi_struct *napi) 2615 { 2616 struct sk_buff *skb = napi->skb; 2617 2618 if (!skb) { 2619 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD); 2620 if (skb) 2621 napi->skb = skb; 2622 } 2623 return skb; 2624 } 2625 EXPORT_SYMBOL(napi_get_frags); 2626 2627 int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret) 2628 { 2629 int err = 
NET_RX_SUCCESS; 2630 2631 switch (ret) { 2632 case GRO_NORMAL: 2633 case GRO_HELD: 2634 skb->protocol = eth_type_trans(skb, napi->dev); 2635 2636 if (ret == GRO_NORMAL) 2637 return netif_receive_skb(skb); 2638 2639 skb_gro_pull(skb, -ETH_HLEN); 2640 break; 2641 2642 case GRO_DROP: 2643 err = NET_RX_DROP; 2644 /* fall through */ 2645 2646 case GRO_MERGED_FREE: 2647 napi_reuse_skb(napi, skb); 2648 break; 2649 } 2650 2651 return err; 2652 } 2653 EXPORT_SYMBOL(napi_frags_finish); 2654 2655 struct sk_buff *napi_frags_skb(struct napi_struct *napi) 2656 { 2657 struct sk_buff *skb = napi->skb; 2658 struct ethhdr *eth; 2659 unsigned int hlen; 2660 unsigned int off; 2661 2662 napi->skb = NULL; 2663 2664 skb_reset_mac_header(skb); 2665 skb_gro_reset_offset(skb); 2666 2667 off = skb_gro_offset(skb); 2668 hlen = off + sizeof(*eth); 2669 eth = skb_gro_header_fast(skb, off); 2670 if (skb_gro_header_hard(skb, hlen)) { 2671 eth = skb_gro_header_slow(skb, hlen, off); 2672 if (unlikely(!eth)) { 2673 napi_reuse_skb(napi, skb); 2674 skb = NULL; 2675 goto out; 2676 } 2677 } 2678 2679 skb_gro_pull(skb, sizeof(*eth)); 2680 2681 /* 2682 * This works because the only protocols we care about don't require 2683 * special handling. We'll fix it up properly at the end. 2684 */ 2685 skb->protocol = eth->h_proto; 2686 2687 out: 2688 return skb; 2689 } 2690 EXPORT_SYMBOL(napi_frags_skb); 2691 2692 int napi_gro_frags(struct napi_struct *napi) 2693 { 2694 struct sk_buff *skb = napi_frags_skb(napi); 2695 2696 if (!skb) 2697 return NET_RX_DROP; 2698 2699 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb)); 2700 } 2701 EXPORT_SYMBOL(napi_gro_frags); 2702 2703 static int process_backlog(struct napi_struct *napi, int quota) 2704 { 2705 int work = 0; 2706 struct softnet_data *queue = &__get_cpu_var(softnet_data); 2707 unsigned long start_time = jiffies; 2708 2709 napi->weight = weight_p; 2710 do { 2711 struct sk_buff *skb; 2712 2713 local_irq_disable(); 2714 skb = __skb_dequeue(&queue->input_pkt_queue); 2715 if (!skb) { 2716 __napi_complete(napi); 2717 local_irq_enable(); 2718 break; 2719 } 2720 local_irq_enable(); 2721 2722 netif_receive_skb(skb); 2723 } while (++work < quota && jiffies == start_time); 2724 2725 return work; 2726 } 2727 2728 /** 2729 * __napi_schedule - schedule for receive 2730 * @n: entry to schedule 2731 * 2732 * The entry's receive function will be scheduled to run 2733 */ 2734 void __napi_schedule(struct napi_struct *n) 2735 { 2736 unsigned long flags; 2737 2738 local_irq_save(flags); 2739 list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list); 2740 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 2741 local_irq_restore(flags); 2742 } 2743 EXPORT_SYMBOL(__napi_schedule); 2744 2745 void __napi_complete(struct napi_struct *n) 2746 { 2747 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); 2748 BUG_ON(n->gro_list); 2749 2750 list_del(&n->poll_list); 2751 smp_mb__before_clear_bit(); 2752 clear_bit(NAPI_STATE_SCHED, &n->state); 2753 } 2754 EXPORT_SYMBOL(__napi_complete); 2755 2756 void napi_complete(struct napi_struct *n) 2757 { 2758 unsigned long flags; 2759 2760 /* 2761 * don't let napi dequeue from the cpu poll list 2762 * just in case its running on a different cpu 2763 */ 2764 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) 2765 return; 2766 2767 napi_gro_flush(n); 2768 local_irq_save(flags); 2769 __napi_complete(n); 2770 local_irq_restore(flags); 2771 } 2772 EXPORT_SYMBOL(napi_complete); 2773 2774 void netif_napi_add(struct net_device *dev, struct napi_struct *napi, 2775 int 
(*poll)(struct napi_struct *, int), int weight) 2776 { 2777 INIT_LIST_HEAD(&napi->poll_list); 2778 napi->gro_count = 0; 2779 napi->gro_list = NULL; 2780 napi->skb = NULL; 2781 napi->poll = poll; 2782 napi->weight = weight; 2783 list_add(&napi->dev_list, &dev->napi_list); 2784 napi->dev = dev; 2785 #ifdef CONFIG_NETPOLL 2786 spin_lock_init(&napi->poll_lock); 2787 napi->poll_owner = -1; 2788 #endif 2789 set_bit(NAPI_STATE_SCHED, &napi->state); 2790 } 2791 EXPORT_SYMBOL(netif_napi_add); 2792 2793 void netif_napi_del(struct napi_struct *napi) 2794 { 2795 struct sk_buff *skb, *next; 2796 2797 list_del_init(&napi->dev_list); 2798 napi_free_frags(napi); 2799 2800 for (skb = napi->gro_list; skb; skb = next) { 2801 next = skb->next; 2802 skb->next = NULL; 2803 kfree_skb(skb); 2804 } 2805 2806 napi->gro_list = NULL; 2807 napi->gro_count = 0; 2808 } 2809 EXPORT_SYMBOL(netif_napi_del); 2810 2811 2812 static void net_rx_action(struct softirq_action *h) 2813 { 2814 struct list_head *list = &__get_cpu_var(softnet_data).poll_list; 2815 unsigned long time_limit = jiffies + 2; 2816 int budget = netdev_budget; 2817 void *have; 2818 2819 local_irq_disable(); 2820 2821 while (!list_empty(list)) { 2822 struct napi_struct *n; 2823 int work, weight; 2824 2825 /* If softirq window is exhuasted then punt. 2826 * Allow this to run for 2 jiffies since which will allow 2827 * an average latency of 1.5/HZ. 2828 */ 2829 if (unlikely(budget <= 0 || time_after(jiffies, time_limit))) 2830 goto softnet_break; 2831 2832 local_irq_enable(); 2833 2834 /* Even though interrupts have been re-enabled, this 2835 * access is safe because interrupts can only add new 2836 * entries to the tail of this list, and only ->poll() 2837 * calls can remove this head entry from the list. 2838 */ 2839 n = list_entry(list->next, struct napi_struct, poll_list); 2840 2841 have = netpoll_poll_lock(n); 2842 2843 weight = n->weight; 2844 2845 /* This NAPI_STATE_SCHED test is for avoiding a race 2846 * with netpoll's poll_napi(). Only the entity which 2847 * obtains the lock and sees NAPI_STATE_SCHED set will 2848 * actually make the ->poll() call. Therefore we avoid 2849 * accidently calling ->poll() when NAPI is not scheduled. 2850 */ 2851 work = 0; 2852 if (test_bit(NAPI_STATE_SCHED, &n->state)) { 2853 work = n->poll(n, weight); 2854 trace_napi_poll(n); 2855 } 2856 2857 WARN_ON_ONCE(work > weight); 2858 2859 budget -= work; 2860 2861 local_irq_disable(); 2862 2863 /* Drivers must not modify the NAPI state if they 2864 * consume the entire weight. In such cases this code 2865 * still "owns" the NAPI instance and therefore can 2866 * move the instance around on the list at-will. 
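 *
 * A conforming poll routine therefore only completes NAPI when it has
 * done less work than its budget, roughly (sketch; mydrv_clean_rx is an
 * assumed helper, not code from this file):
 *
 *	work = mydrv_clean_rx(adapter, budget);
 *	if (work < budget)
 *		napi_complete(napi);
 *	return work;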
2867 */ 2868 if (unlikely(work == weight)) { 2869 if (unlikely(napi_disable_pending(n))) { 2870 local_irq_enable(); 2871 napi_complete(n); 2872 local_irq_disable(); 2873 } else 2874 list_move_tail(&n->poll_list, list); 2875 } 2876 2877 netpoll_poll_unlock(have); 2878 } 2879 out: 2880 local_irq_enable(); 2881 2882 #ifdef CONFIG_NET_DMA 2883 /* 2884 * There may not be any more sk_buffs coming right now, so push 2885 * any pending DMA copies to hardware 2886 */ 2887 dma_issue_pending_all(); 2888 #endif 2889 2890 return; 2891 2892 softnet_break: 2893 __get_cpu_var(netdev_rx_stat).time_squeeze++; 2894 __raise_softirq_irqoff(NET_RX_SOFTIRQ); 2895 goto out; 2896 } 2897 2898 static gifconf_func_t *gifconf_list[NPROTO]; 2899 2900 /** 2901 * register_gifconf - register a SIOCGIF handler 2902 * @family: Address family 2903 * @gifconf: Function handler 2904 * 2905 * Register protocol dependent address dumping routines. The handler 2906 * that is passed must not be freed or reused until it has been replaced 2907 * by another handler. 2908 */ 2909 int register_gifconf(unsigned int family, gifconf_func_t *gifconf) 2910 { 2911 if (family >= NPROTO) 2912 return -EINVAL; 2913 gifconf_list[family] = gifconf; 2914 return 0; 2915 } 2916 EXPORT_SYMBOL(register_gifconf); 2917 2918 2919 /* 2920 * Map an interface index to its name (SIOCGIFNAME) 2921 */ 2922 2923 /* 2924 * We need this ioctl for efficient implementation of the 2925 * if_indextoname() function required by the IPv6 API. Without 2926 * it, we would have to search all the interfaces to find a 2927 * match. --pb 2928 */ 2929 2930 static int dev_ifname(struct net *net, struct ifreq __user *arg) 2931 { 2932 struct net_device *dev; 2933 struct ifreq ifr; 2934 2935 /* 2936 * Fetch the caller's info block. 2937 */ 2938 2939 if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) 2940 return -EFAULT; 2941 2942 read_lock(&dev_base_lock); 2943 dev = __dev_get_by_index(net, ifr.ifr_ifindex); 2944 if (!dev) { 2945 read_unlock(&dev_base_lock); 2946 return -ENODEV; 2947 } 2948 2949 strcpy(ifr.ifr_name, dev->name); 2950 read_unlock(&dev_base_lock); 2951 2952 if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) 2953 return -EFAULT; 2954 return 0; 2955 } 2956 2957 /* 2958 * Perform a SIOCGIFCONF call. This structure will change 2959 * size eventually, and there is nothing I can do about it. 2960 * Thus we will need a 'compatibility mode'. 2961 */ 2962 2963 static int dev_ifconf(struct net *net, char __user *arg) 2964 { 2965 struct ifconf ifc; 2966 struct net_device *dev; 2967 char __user *pos; 2968 int len; 2969 int total; 2970 int i; 2971 2972 /* 2973 * Fetch the caller's info block. 2974 */ 2975 2976 if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) 2977 return -EFAULT; 2978 2979 pos = ifc.ifc_buf; 2980 len = ifc.ifc_len; 2981 2982 /* 2983 * Loop over the interfaces, and write an info block for each. 2984 */ 2985 2986 total = 0; 2987 for_each_netdev(net, dev) { 2988 for (i = 0; i < NPROTO; i++) { 2989 if (gifconf_list[i]) { 2990 int done; 2991 if (!pos) 2992 done = gifconf_list[i](dev, NULL, 0); 2993 else 2994 done = gifconf_list[i](dev, pos + total, 2995 len - total); 2996 if (done < 0) 2997 return -EFAULT; 2998 total += done; 2999 } 3000 } 3001 } 3002 3003 /* 3004 * All done. Write the updated control block back to the caller. 3005 */ 3006 ifc.ifc_len = total; 3007 3008 /* 3009 * Both BSD and Solaris return 0 here, so we do too. 3010 */ 3011 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? 
-EFAULT : 0; 3012 } 3013 3014 #ifdef CONFIG_PROC_FS 3015 /* 3016 * This is invoked by the /proc filesystem handler to display a device 3017 * in detail. 3018 */ 3019 void *dev_seq_start(struct seq_file *seq, loff_t *pos) 3020 __acquires(dev_base_lock) 3021 { 3022 struct net *net = seq_file_net(seq); 3023 loff_t off; 3024 struct net_device *dev; 3025 3026 read_lock(&dev_base_lock); 3027 if (!*pos) 3028 return SEQ_START_TOKEN; 3029 3030 off = 1; 3031 for_each_netdev(net, dev) 3032 if (off++ == *pos) 3033 return dev; 3034 3035 return NULL; 3036 } 3037 3038 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3039 { 3040 struct net *net = seq_file_net(seq); 3041 ++*pos; 3042 return v == SEQ_START_TOKEN ? 3043 first_net_device(net) : next_net_device((struct net_device *)v); 3044 } 3045 3046 void dev_seq_stop(struct seq_file *seq, void *v) 3047 __releases(dev_base_lock) 3048 { 3049 read_unlock(&dev_base_lock); 3050 } 3051 3052 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) 3053 { 3054 const struct net_device_stats *stats = dev_get_stats(dev); 3055 3056 seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " 3057 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", 3058 dev->name, stats->rx_bytes, stats->rx_packets, 3059 stats->rx_errors, 3060 stats->rx_dropped + stats->rx_missed_errors, 3061 stats->rx_fifo_errors, 3062 stats->rx_length_errors + stats->rx_over_errors + 3063 stats->rx_crc_errors + stats->rx_frame_errors, 3064 stats->rx_compressed, stats->multicast, 3065 stats->tx_bytes, stats->tx_packets, 3066 stats->tx_errors, stats->tx_dropped, 3067 stats->tx_fifo_errors, stats->collisions, 3068 stats->tx_carrier_errors + 3069 stats->tx_aborted_errors + 3070 stats->tx_window_errors + 3071 stats->tx_heartbeat_errors, 3072 stats->tx_compressed); 3073 } 3074 3075 /* 3076 * Called from the PROCfs module. 
This now uses the new arbitrary sized 3077 * /proc/net interface to create /proc/net/dev 3078 */ 3079 static int dev_seq_show(struct seq_file *seq, void *v) 3080 { 3081 if (v == SEQ_START_TOKEN) 3082 seq_puts(seq, "Inter-| Receive " 3083 " | Transmit\n" 3084 " face |bytes packets errs drop fifo frame " 3085 "compressed multicast|bytes packets errs " 3086 "drop fifo colls carrier compressed\n"); 3087 else 3088 dev_seq_printf_stats(seq, v); 3089 return 0; 3090 } 3091 3092 static struct netif_rx_stats *softnet_get_online(loff_t *pos) 3093 { 3094 struct netif_rx_stats *rc = NULL; 3095 3096 while (*pos < nr_cpu_ids) 3097 if (cpu_online(*pos)) { 3098 rc = &per_cpu(netdev_rx_stat, *pos); 3099 break; 3100 } else 3101 ++*pos; 3102 return rc; 3103 } 3104 3105 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) 3106 { 3107 return softnet_get_online(pos); 3108 } 3109 3110 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3111 { 3112 ++*pos; 3113 return softnet_get_online(pos); 3114 } 3115 3116 static void softnet_seq_stop(struct seq_file *seq, void *v) 3117 { 3118 } 3119 3120 static int softnet_seq_show(struct seq_file *seq, void *v) 3121 { 3122 struct netif_rx_stats *s = v; 3123 3124 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", 3125 s->total, s->dropped, s->time_squeeze, 0, 3126 0, 0, 0, 0, /* was fastroute */ 3127 s->cpu_collision); 3128 return 0; 3129 } 3130 3131 static const struct seq_operations dev_seq_ops = { 3132 .start = dev_seq_start, 3133 .next = dev_seq_next, 3134 .stop = dev_seq_stop, 3135 .show = dev_seq_show, 3136 }; 3137 3138 static int dev_seq_open(struct inode *inode, struct file *file) 3139 { 3140 return seq_open_net(inode, file, &dev_seq_ops, 3141 sizeof(struct seq_net_private)); 3142 } 3143 3144 static const struct file_operations dev_seq_fops = { 3145 .owner = THIS_MODULE, 3146 .open = dev_seq_open, 3147 .read = seq_read, 3148 .llseek = seq_lseek, 3149 .release = seq_release_net, 3150 }; 3151 3152 static const struct seq_operations softnet_seq_ops = { 3153 .start = softnet_seq_start, 3154 .next = softnet_seq_next, 3155 .stop = softnet_seq_stop, 3156 .show = softnet_seq_show, 3157 }; 3158 3159 static int softnet_seq_open(struct inode *inode, struct file *file) 3160 { 3161 return seq_open(file, &softnet_seq_ops); 3162 } 3163 3164 static const struct file_operations softnet_seq_fops = { 3165 .owner = THIS_MODULE, 3166 .open = softnet_seq_open, 3167 .read = seq_read, 3168 .llseek = seq_lseek, 3169 .release = seq_release, 3170 }; 3171 3172 static void *ptype_get_idx(loff_t pos) 3173 { 3174 struct packet_type *pt = NULL; 3175 loff_t i = 0; 3176 int t; 3177 3178 list_for_each_entry_rcu(pt, &ptype_all, list) { 3179 if (i == pos) 3180 return pt; 3181 ++i; 3182 } 3183 3184 for (t = 0; t < PTYPE_HASH_SIZE; t++) { 3185 list_for_each_entry_rcu(pt, &ptype_base[t], list) { 3186 if (i == pos) 3187 return pt; 3188 ++i; 3189 } 3190 } 3191 return NULL; 3192 } 3193 3194 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos) 3195 __acquires(RCU) 3196 { 3197 rcu_read_lock(); 3198 return *pos ? 
ptype_get_idx(*pos - 1) : SEQ_START_TOKEN; 3199 } 3200 3201 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3202 { 3203 struct packet_type *pt; 3204 struct list_head *nxt; 3205 int hash; 3206 3207 ++*pos; 3208 if (v == SEQ_START_TOKEN) 3209 return ptype_get_idx(0); 3210 3211 pt = v; 3212 nxt = pt->list.next; 3213 if (pt->type == htons(ETH_P_ALL)) { 3214 if (nxt != &ptype_all) 3215 goto found; 3216 hash = 0; 3217 nxt = ptype_base[0].next; 3218 } else 3219 hash = ntohs(pt->type) & PTYPE_HASH_MASK; 3220 3221 while (nxt == &ptype_base[hash]) { 3222 if (++hash >= PTYPE_HASH_SIZE) 3223 return NULL; 3224 nxt = ptype_base[hash].next; 3225 } 3226 found: 3227 return list_entry(nxt, struct packet_type, list); 3228 } 3229 3230 static void ptype_seq_stop(struct seq_file *seq, void *v) 3231 __releases(RCU) 3232 { 3233 rcu_read_unlock(); 3234 } 3235 3236 static int ptype_seq_show(struct seq_file *seq, void *v) 3237 { 3238 struct packet_type *pt = v; 3239 3240 if (v == SEQ_START_TOKEN) 3241 seq_puts(seq, "Type Device Function\n"); 3242 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { 3243 if (pt->type == htons(ETH_P_ALL)) 3244 seq_puts(seq, "ALL "); 3245 else 3246 seq_printf(seq, "%04x", ntohs(pt->type)); 3247 3248 seq_printf(seq, " %-8s %pF\n", 3249 pt->dev ? pt->dev->name : "", pt->func); 3250 } 3251 3252 return 0; 3253 } 3254 3255 static const struct seq_operations ptype_seq_ops = { 3256 .start = ptype_seq_start, 3257 .next = ptype_seq_next, 3258 .stop = ptype_seq_stop, 3259 .show = ptype_seq_show, 3260 }; 3261 3262 static int ptype_seq_open(struct inode *inode, struct file *file) 3263 { 3264 return seq_open_net(inode, file, &ptype_seq_ops, 3265 sizeof(struct seq_net_private)); 3266 } 3267 3268 static const struct file_operations ptype_seq_fops = { 3269 .owner = THIS_MODULE, 3270 .open = ptype_seq_open, 3271 .read = seq_read, 3272 .llseek = seq_lseek, 3273 .release = seq_release_net, 3274 }; 3275 3276 3277 static int __net_init dev_proc_net_init(struct net *net) 3278 { 3279 int rc = -ENOMEM; 3280 3281 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops)) 3282 goto out; 3283 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops)) 3284 goto out_dev; 3285 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops)) 3286 goto out_softnet; 3287 3288 if (wext_proc_init(net)) 3289 goto out_ptype; 3290 rc = 0; 3291 out: 3292 return rc; 3293 out_ptype: 3294 proc_net_remove(net, "ptype"); 3295 out_softnet: 3296 proc_net_remove(net, "softnet_stat"); 3297 out_dev: 3298 proc_net_remove(net, "dev"); 3299 goto out; 3300 } 3301 3302 static void __net_exit dev_proc_net_exit(struct net *net) 3303 { 3304 wext_proc_exit(net); 3305 3306 proc_net_remove(net, "ptype"); 3307 proc_net_remove(net, "softnet_stat"); 3308 proc_net_remove(net, "dev"); 3309 } 3310 3311 static struct pernet_operations __net_initdata dev_proc_ops = { 3312 .init = dev_proc_net_init, 3313 .exit = dev_proc_net_exit, 3314 }; 3315 3316 static int __init dev_proc_init(void) 3317 { 3318 return register_pernet_subsys(&dev_proc_ops); 3319 } 3320 #else 3321 #define dev_proc_init() 0 3322 #endif /* CONFIG_PROC_FS */ 3323 3324 3325 /** 3326 * netdev_set_master - set up master/slave pair 3327 * @slave: slave device 3328 * @master: new master device 3329 * 3330 * Changes the master device of the slave. Pass %NULL to break the 3331 * bonding. The caller must hold the RTNL semaphore. On a failure 3332 * a negative errno code is returned. 
On success the reference counts 3333 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the 3334 * function returns zero. 3335 */ 3336 int netdev_set_master(struct net_device *slave, struct net_device *master) 3337 { 3338 struct net_device *old = slave->master; 3339 3340 ASSERT_RTNL(); 3341 3342 if (master) { 3343 if (old) 3344 return -EBUSY; 3345 dev_hold(master); 3346 } 3347 3348 slave->master = master; 3349 3350 synchronize_net(); 3351 3352 if (old) 3353 dev_put(old); 3354 3355 if (master) 3356 slave->flags |= IFF_SLAVE; 3357 else 3358 slave->flags &= ~IFF_SLAVE; 3359 3360 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); 3361 return 0; 3362 } 3363 EXPORT_SYMBOL(netdev_set_master); 3364 3365 static void dev_change_rx_flags(struct net_device *dev, int flags) 3366 { 3367 const struct net_device_ops *ops = dev->netdev_ops; 3368 3369 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags) 3370 ops->ndo_change_rx_flags(dev, flags); 3371 } 3372 3373 static int __dev_set_promiscuity(struct net_device *dev, int inc) 3374 { 3375 unsigned short old_flags = dev->flags; 3376 uid_t uid; 3377 gid_t gid; 3378 3379 ASSERT_RTNL(); 3380 3381 dev->flags |= IFF_PROMISC; 3382 dev->promiscuity += inc; 3383 if (dev->promiscuity == 0) { 3384 /* 3385 * Avoid overflow. 3386 * If inc causes overflow, untouch promisc and return error. 3387 */ 3388 if (inc < 0) 3389 dev->flags &= ~IFF_PROMISC; 3390 else { 3391 dev->promiscuity -= inc; 3392 printk(KERN_WARNING "%s: promiscuity touches roof, " 3393 "set promiscuity failed, promiscuity feature " 3394 "of device might be broken.\n", dev->name); 3395 return -EOVERFLOW; 3396 } 3397 } 3398 if (dev->flags != old_flags) { 3399 printk(KERN_INFO "device %s %s promiscuous mode\n", 3400 dev->name, (dev->flags & IFF_PROMISC) ? "entered" : 3401 "left"); 3402 if (audit_enabled) { 3403 current_uid_gid(&uid, &gid); 3404 audit_log(current->audit_context, GFP_ATOMIC, 3405 AUDIT_ANOM_PROMISCUOUS, 3406 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u", 3407 dev->name, (dev->flags & IFF_PROMISC), 3408 (old_flags & IFF_PROMISC), 3409 audit_get_loginuid(current), 3410 uid, gid, 3411 audit_get_sessionid(current)); 3412 } 3413 3414 dev_change_rx_flags(dev, IFF_PROMISC); 3415 } 3416 return 0; 3417 } 3418 3419 /** 3420 * dev_set_promiscuity - update promiscuity count on a device 3421 * @dev: device 3422 * @inc: modifier 3423 * 3424 * Add or remove promiscuity from a device. While the count in the device 3425 * remains above zero the interface remains promiscuous. Once it hits zero 3426 * the device reverts back to normal filtering operation. A negative inc 3427 * value is used to drop promiscuity on the device. 3428 * Return 0 if successful or a negative errno code on error. 3429 */ 3430 int dev_set_promiscuity(struct net_device *dev, int inc) 3431 { 3432 unsigned short old_flags = dev->flags; 3433 int err; 3434 3435 err = __dev_set_promiscuity(dev, inc); 3436 if (err < 0) 3437 return err; 3438 if (dev->flags != old_flags) 3439 dev_set_rx_mode(dev); 3440 return err; 3441 } 3442 EXPORT_SYMBOL(dev_set_promiscuity); 3443 3444 /** 3445 * dev_set_allmulti - update allmulti count on a device 3446 * @dev: device 3447 * @inc: modifier 3448 * 3449 * Add or remove reception of all multicast frames to a device. While the 3450 * count in the device remains above zero the interface remains listening 3451 * to all interfaces. Once it hits zero the device reverts back to normal 3452 * filtering operation. 
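 * (As a sketch, a stacked device such as a bonding master would call
 * dev_set_allmulti(slave_dev, 1) while it needs every multicast frame
 * from the slave, and dev_set_allmulti(slave_dev, -1) once it no longer
 * does.)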
A negative @inc value is used to drop the counter 3453 * when releasing a resource needing all multicasts. 3454 * Return 0 if successful or a negative errno code on error. 3455 */ 3456 3457 int dev_set_allmulti(struct net_device *dev, int inc) 3458 { 3459 unsigned short old_flags = dev->flags; 3460 3461 ASSERT_RTNL(); 3462 3463 dev->flags |= IFF_ALLMULTI; 3464 dev->allmulti += inc; 3465 if (dev->allmulti == 0) { 3466 /* 3467 * Avoid overflow. 3468 * If inc causes overflow, untouch allmulti and return error. 3469 */ 3470 if (inc < 0) 3471 dev->flags &= ~IFF_ALLMULTI; 3472 else { 3473 dev->allmulti -= inc; 3474 printk(KERN_WARNING "%s: allmulti touches roof, " 3475 "set allmulti failed, allmulti feature of " 3476 "device might be broken.\n", dev->name); 3477 return -EOVERFLOW; 3478 } 3479 } 3480 if (dev->flags ^ old_flags) { 3481 dev_change_rx_flags(dev, IFF_ALLMULTI); 3482 dev_set_rx_mode(dev); 3483 } 3484 return 0; 3485 } 3486 EXPORT_SYMBOL(dev_set_allmulti); 3487 3488 /* 3489 * Upload unicast and multicast address lists to device and 3490 * configure RX filtering. When the device doesn't support unicast 3491 * filtering it is put in promiscuous mode while unicast addresses 3492 * are present. 3493 */ 3494 void __dev_set_rx_mode(struct net_device *dev) 3495 { 3496 const struct net_device_ops *ops = dev->netdev_ops; 3497 3498 /* dev_open will call this function so the list will stay sane. */ 3499 if (!(dev->flags&IFF_UP)) 3500 return; 3501 3502 if (!netif_device_present(dev)) 3503 return; 3504 3505 if (ops->ndo_set_rx_mode) 3506 ops->ndo_set_rx_mode(dev); 3507 else { 3508 /* Unicast addresses changes may only happen under the rtnl, 3509 * therefore calling __dev_set_promiscuity here is safe. 3510 */ 3511 if (dev->uc.count > 0 && !dev->uc_promisc) { 3512 __dev_set_promiscuity(dev, 1); 3513 dev->uc_promisc = 1; 3514 } else if (dev->uc.count == 0 && dev->uc_promisc) { 3515 __dev_set_promiscuity(dev, -1); 3516 dev->uc_promisc = 0; 3517 } 3518 3519 if (ops->ndo_set_multicast_list) 3520 ops->ndo_set_multicast_list(dev); 3521 } 3522 } 3523 3524 void dev_set_rx_mode(struct net_device *dev) 3525 { 3526 netif_addr_lock_bh(dev); 3527 __dev_set_rx_mode(dev); 3528 netif_addr_unlock_bh(dev); 3529 } 3530 3531 /* hw addresses list handling functions */ 3532 3533 static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr, 3534 int addr_len, unsigned char addr_type) 3535 { 3536 struct netdev_hw_addr *ha; 3537 int alloc_size; 3538 3539 if (addr_len > MAX_ADDR_LEN) 3540 return -EINVAL; 3541 3542 list_for_each_entry(ha, &list->list, list) { 3543 if (!memcmp(ha->addr, addr, addr_len) && 3544 ha->type == addr_type) { 3545 ha->refcount++; 3546 return 0; 3547 } 3548 } 3549 3550 3551 alloc_size = sizeof(*ha); 3552 if (alloc_size < L1_CACHE_BYTES) 3553 alloc_size = L1_CACHE_BYTES; 3554 ha = kmalloc(alloc_size, GFP_ATOMIC); 3555 if (!ha) 3556 return -ENOMEM; 3557 memcpy(ha->addr, addr, addr_len); 3558 ha->type = addr_type; 3559 ha->refcount = 1; 3560 ha->synced = false; 3561 list_add_tail_rcu(&ha->list, &list->list); 3562 list->count++; 3563 return 0; 3564 } 3565 3566 static void ha_rcu_free(struct rcu_head *head) 3567 { 3568 struct netdev_hw_addr *ha; 3569 3570 ha = container_of(head, struct netdev_hw_addr, rcu_head); 3571 kfree(ha); 3572 } 3573 3574 static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr, 3575 int addr_len, unsigned char addr_type) 3576 { 3577 struct netdev_hw_addr *ha; 3578 3579 list_for_each_entry(ha, &list->list, list) { 3580 if (!memcmp(ha->addr, addr, 
addr_len) && 3581 (ha->type == addr_type || !addr_type)) { 3582 if (--ha->refcount) 3583 return 0; 3584 list_del_rcu(&ha->list); 3585 call_rcu(&ha->rcu_head, ha_rcu_free); 3586 list->count--; 3587 return 0; 3588 } 3589 } 3590 return -ENOENT; 3591 } 3592 3593 static int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list, 3594 struct netdev_hw_addr_list *from_list, 3595 int addr_len, 3596 unsigned char addr_type) 3597 { 3598 int err; 3599 struct netdev_hw_addr *ha, *ha2; 3600 unsigned char type; 3601 3602 list_for_each_entry(ha, &from_list->list, list) { 3603 type = addr_type ? addr_type : ha->type; 3604 err = __hw_addr_add(to_list, ha->addr, addr_len, type); 3605 if (err) 3606 goto unroll; 3607 } 3608 return 0; 3609 3610 unroll: 3611 list_for_each_entry(ha2, &from_list->list, list) { 3612 if (ha2 == ha) 3613 break; 3614 type = addr_type ? addr_type : ha2->type; 3615 __hw_addr_del(to_list, ha2->addr, addr_len, type); 3616 } 3617 return err; 3618 } 3619 3620 static void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list, 3621 struct netdev_hw_addr_list *from_list, 3622 int addr_len, 3623 unsigned char addr_type) 3624 { 3625 struct netdev_hw_addr *ha; 3626 unsigned char type; 3627 3628 list_for_each_entry(ha, &from_list->list, list) { 3629 type = addr_type ? addr_type : ha->type; 3630 __hw_addr_del(to_list, ha->addr, addr_len, addr_type); 3631 } 3632 } 3633 3634 static int __hw_addr_sync(struct netdev_hw_addr_list *to_list, 3635 struct netdev_hw_addr_list *from_list, 3636 int addr_len) 3637 { 3638 int err = 0; 3639 struct netdev_hw_addr *ha, *tmp; 3640 3641 list_for_each_entry_safe(ha, tmp, &from_list->list, list) { 3642 if (!ha->synced) { 3643 err = __hw_addr_add(to_list, ha->addr, 3644 addr_len, ha->type); 3645 if (err) 3646 break; 3647 ha->synced = true; 3648 ha->refcount++; 3649 } else if (ha->refcount == 1) { 3650 __hw_addr_del(to_list, ha->addr, addr_len, ha->type); 3651 __hw_addr_del(from_list, ha->addr, addr_len, ha->type); 3652 } 3653 } 3654 return err; 3655 } 3656 3657 static void __hw_addr_unsync(struct netdev_hw_addr_list *to_list, 3658 struct netdev_hw_addr_list *from_list, 3659 int addr_len) 3660 { 3661 struct netdev_hw_addr *ha, *tmp; 3662 3663 list_for_each_entry_safe(ha, tmp, &from_list->list, list) { 3664 if (ha->synced) { 3665 __hw_addr_del(to_list, ha->addr, 3666 addr_len, ha->type); 3667 ha->synced = false; 3668 __hw_addr_del(from_list, ha->addr, 3669 addr_len, ha->type); 3670 } 3671 } 3672 } 3673 3674 static void __hw_addr_flush(struct netdev_hw_addr_list *list) 3675 { 3676 struct netdev_hw_addr *ha, *tmp; 3677 3678 list_for_each_entry_safe(ha, tmp, &list->list, list) { 3679 list_del_rcu(&ha->list); 3680 call_rcu(&ha->rcu_head, ha_rcu_free); 3681 } 3682 list->count = 0; 3683 } 3684 3685 static void __hw_addr_init(struct netdev_hw_addr_list *list) 3686 { 3687 INIT_LIST_HEAD(&list->list); 3688 list->count = 0; 3689 } 3690 3691 /* Device addresses handling functions */ 3692 3693 static void dev_addr_flush(struct net_device *dev) 3694 { 3695 /* rtnl_mutex must be held here */ 3696 3697 __hw_addr_flush(&dev->dev_addrs); 3698 dev->dev_addr = NULL; 3699 } 3700 3701 static int dev_addr_init(struct net_device *dev) 3702 { 3703 unsigned char addr[MAX_ADDR_LEN]; 3704 struct netdev_hw_addr *ha; 3705 int err; 3706 3707 /* rtnl_mutex must be held here */ 3708 3709 __hw_addr_init(&dev->dev_addrs); 3710 memset(addr, 0, sizeof(addr)); 3711 err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr), 3712 NETDEV_HW_ADDR_T_LAN); 3713 if (!err) { 3714 /* 3715 * Get the first 
(previously created) address from the list 3716 * and set dev_addr pointer to this location. 3717 */ 3718 ha = list_first_entry(&dev->dev_addrs.list, 3719 struct netdev_hw_addr, list); 3720 dev->dev_addr = ha->addr; 3721 } 3722 return err; 3723 } 3724 3725 /** 3726 * dev_addr_add - Add a device address 3727 * @dev: device 3728 * @addr: address to add 3729 * @addr_type: address type 3730 * 3731 * Add a device address to the device or increase the reference count if 3732 * it already exists. 3733 * 3734 * The caller must hold the rtnl_mutex. 3735 */ 3736 int dev_addr_add(struct net_device *dev, unsigned char *addr, 3737 unsigned char addr_type) 3738 { 3739 int err; 3740 3741 ASSERT_RTNL(); 3742 3743 err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type); 3744 if (!err) 3745 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 3746 return err; 3747 } 3748 EXPORT_SYMBOL(dev_addr_add); 3749 3750 /** 3751 * dev_addr_del - Release a device address. 3752 * @dev: device 3753 * @addr: address to delete 3754 * @addr_type: address type 3755 * 3756 * Release reference to a device address and remove it from the device 3757 * if the reference count drops to zero. 3758 * 3759 * The caller must hold the rtnl_mutex. 3760 */ 3761 int dev_addr_del(struct net_device *dev, unsigned char *addr, 3762 unsigned char addr_type) 3763 { 3764 int err; 3765 struct netdev_hw_addr *ha; 3766 3767 ASSERT_RTNL(); 3768 3769 /* 3770 * We can not remove the first address from the list because 3771 * dev->dev_addr points to that. 3772 */ 3773 ha = list_first_entry(&dev->dev_addrs.list, 3774 struct netdev_hw_addr, list); 3775 if (ha->addr == dev->dev_addr && ha->refcount == 1) 3776 return -ENOENT; 3777 3778 err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len, 3779 addr_type); 3780 if (!err) 3781 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 3782 return err; 3783 } 3784 EXPORT_SYMBOL(dev_addr_del); 3785 3786 /** 3787 * dev_addr_add_multiple - Add device addresses from another device 3788 * @to_dev: device to which addresses will be added 3789 * @from_dev: device from which addresses will be added 3790 * @addr_type: address type - 0 means type will be used from from_dev 3791 * 3792 * Add device addresses of the one device to another. 3793 ** 3794 * The caller must hold the rtnl_mutex. 3795 */ 3796 int dev_addr_add_multiple(struct net_device *to_dev, 3797 struct net_device *from_dev, 3798 unsigned char addr_type) 3799 { 3800 int err; 3801 3802 ASSERT_RTNL(); 3803 3804 if (from_dev->addr_len != to_dev->addr_len) 3805 return -EINVAL; 3806 err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, 3807 to_dev->addr_len, addr_type); 3808 if (!err) 3809 call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); 3810 return err; 3811 } 3812 EXPORT_SYMBOL(dev_addr_add_multiple); 3813 3814 /** 3815 * dev_addr_del_multiple - Delete device addresses by another device 3816 * @to_dev: device where the addresses will be deleted 3817 * @from_dev: device by which addresses the addresses will be deleted 3818 * @addr_type: address type - 0 means type will used from from_dev 3819 * 3820 * Deletes addresses in to device by the list of addresses in from device. 3821 * 3822 * The caller must hold the rtnl_mutex. 
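 *
 * Sketch of use together with dev_addr_add_multiple() ("upper" and
 * "lower" are assumed device pointers; an @addr_type of 0 reuses the
 * type stored with each address, as noted above):
 *
 *	rtnl_lock();
 *	err = dev_addr_add_multiple(upper, lower, 0);
 *	...
 *	err = dev_addr_del_multiple(upper, lower, 0);
 *	rtnl_unlock();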
3823 */ 3824 int dev_addr_del_multiple(struct net_device *to_dev, 3825 struct net_device *from_dev, 3826 unsigned char addr_type) 3827 { 3828 ASSERT_RTNL(); 3829 3830 if (from_dev->addr_len != to_dev->addr_len) 3831 return -EINVAL; 3832 __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs, 3833 to_dev->addr_len, addr_type); 3834 call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev); 3835 return 0; 3836 } 3837 EXPORT_SYMBOL(dev_addr_del_multiple); 3838 3839 /* multicast addresses handling functions */ 3840 3841 int __dev_addr_delete(struct dev_addr_list **list, int *count, 3842 void *addr, int alen, int glbl) 3843 { 3844 struct dev_addr_list *da; 3845 3846 for (; (da = *list) != NULL; list = &da->next) { 3847 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 && 3848 alen == da->da_addrlen) { 3849 if (glbl) { 3850 int old_glbl = da->da_gusers; 3851 da->da_gusers = 0; 3852 if (old_glbl == 0) 3853 break; 3854 } 3855 if (--da->da_users) 3856 return 0; 3857 3858 *list = da->next; 3859 kfree(da); 3860 (*count)--; 3861 return 0; 3862 } 3863 } 3864 return -ENOENT; 3865 } 3866 3867 int __dev_addr_add(struct dev_addr_list **list, int *count, 3868 void *addr, int alen, int glbl) 3869 { 3870 struct dev_addr_list *da; 3871 3872 for (da = *list; da != NULL; da = da->next) { 3873 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 && 3874 da->da_addrlen == alen) { 3875 if (glbl) { 3876 int old_glbl = da->da_gusers; 3877 da->da_gusers = 1; 3878 if (old_glbl) 3879 return 0; 3880 } 3881 da->da_users++; 3882 return 0; 3883 } 3884 } 3885 3886 da = kzalloc(sizeof(*da), GFP_ATOMIC); 3887 if (da == NULL) 3888 return -ENOMEM; 3889 memcpy(da->da_addr, addr, alen); 3890 da->da_addrlen = alen; 3891 da->da_users = 1; 3892 da->da_gusers = glbl ? 1 : 0; 3893 da->next = *list; 3894 *list = da; 3895 (*count)++; 3896 return 0; 3897 } 3898 3899 /** 3900 * dev_unicast_delete - Release secondary unicast address. 3901 * @dev: device 3902 * @addr: address to delete 3903 * 3904 * Release reference to a secondary unicast address and remove it 3905 * from the device if the reference count drops to zero. 3906 * 3907 * The caller must hold the rtnl_mutex. 3908 */ 3909 int dev_unicast_delete(struct net_device *dev, void *addr) 3910 { 3911 int err; 3912 3913 ASSERT_RTNL(); 3914 3915 netif_addr_lock_bh(dev); 3916 err = __hw_addr_del(&dev->uc, addr, dev->addr_len, 3917 NETDEV_HW_ADDR_T_UNICAST); 3918 if (!err) 3919 __dev_set_rx_mode(dev); 3920 netif_addr_unlock_bh(dev); 3921 return err; 3922 } 3923 EXPORT_SYMBOL(dev_unicast_delete); 3924 3925 /** 3926 * dev_unicast_add - add a secondary unicast address 3927 * @dev: device 3928 * @addr: address to add 3929 * 3930 * Add a secondary unicast address to the device or increase 3931 * the reference count if it already exists. 3932 * 3933 * The caller must hold the rtnl_mutex. 
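 *
 * Usage sketch ("dev" and "addr" stand for the caller's device and a
 * MAC address buffer of dev->addr_len bytes; both are assumptions):
 *
 *	rtnl_lock();
 *	err = dev_unicast_add(dev, addr);
 *	rtnl_unlock();
 *
 * A matching dev_unicast_delete() drops the reference again.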
3934 */ 3935 int dev_unicast_add(struct net_device *dev, void *addr) 3936 { 3937 int err; 3938 3939 ASSERT_RTNL(); 3940 3941 netif_addr_lock_bh(dev); 3942 err = __hw_addr_add(&dev->uc, addr, dev->addr_len, 3943 NETDEV_HW_ADDR_T_UNICAST); 3944 if (!err) 3945 __dev_set_rx_mode(dev); 3946 netif_addr_unlock_bh(dev); 3947 return err; 3948 } 3949 EXPORT_SYMBOL(dev_unicast_add); 3950 3951 int __dev_addr_sync(struct dev_addr_list **to, int *to_count, 3952 struct dev_addr_list **from, int *from_count) 3953 { 3954 struct dev_addr_list *da, *next; 3955 int err = 0; 3956 3957 da = *from; 3958 while (da != NULL) { 3959 next = da->next; 3960 if (!da->da_synced) { 3961 err = __dev_addr_add(to, to_count, 3962 da->da_addr, da->da_addrlen, 0); 3963 if (err < 0) 3964 break; 3965 da->da_synced = 1; 3966 da->da_users++; 3967 } else if (da->da_users == 1) { 3968 __dev_addr_delete(to, to_count, 3969 da->da_addr, da->da_addrlen, 0); 3970 __dev_addr_delete(from, from_count, 3971 da->da_addr, da->da_addrlen, 0); 3972 } 3973 da = next; 3974 } 3975 return err; 3976 } 3977 EXPORT_SYMBOL_GPL(__dev_addr_sync); 3978 3979 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count, 3980 struct dev_addr_list **from, int *from_count) 3981 { 3982 struct dev_addr_list *da, *next; 3983 3984 da = *from; 3985 while (da != NULL) { 3986 next = da->next; 3987 if (da->da_synced) { 3988 __dev_addr_delete(to, to_count, 3989 da->da_addr, da->da_addrlen, 0); 3990 da->da_synced = 0; 3991 __dev_addr_delete(from, from_count, 3992 da->da_addr, da->da_addrlen, 0); 3993 } 3994 da = next; 3995 } 3996 } 3997 EXPORT_SYMBOL_GPL(__dev_addr_unsync); 3998 3999 /** 4000 * dev_unicast_sync - Synchronize device's unicast list to another device 4001 * @to: destination device 4002 * @from: source device 4003 * 4004 * Add newly added addresses to the destination device and release 4005 * addresses that have no users left. The source device must be 4006 * locked by netif_tx_lock_bh. 4007 * 4008 * This function is intended to be called from the dev->set_rx_mode 4009 * function of layered software devices. 4010 */ 4011 int dev_unicast_sync(struct net_device *to, struct net_device *from) 4012 { 4013 int err = 0; 4014 4015 if (to->addr_len != from->addr_len) 4016 return -EINVAL; 4017 4018 netif_addr_lock_bh(to); 4019 err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len); 4020 if (!err) 4021 __dev_set_rx_mode(to); 4022 netif_addr_unlock_bh(to); 4023 return err; 4024 } 4025 EXPORT_SYMBOL(dev_unicast_sync); 4026 4027 /** 4028 * dev_unicast_unsync - Remove synchronized addresses from the destination device 4029 * @to: destination device 4030 * @from: source device 4031 * 4032 * Remove all addresses that were added to the destination device by 4033 * dev_unicast_sync(). This function is intended to be called from the 4034 * dev->stop function of layered software devices. 
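 *
 * Roughly, for a layered device whose underlying device is "lowerdev"
 * (both names are assumptions, not code from this file):
 *
 *	static void upper_set_rx_mode(struct net_device *dev)
 *	{
 *		dev_unicast_sync(lowerdev, dev);
 *	}
 *
 *	static int upper_stop(struct net_device *dev)
 *	{
 *		dev_unicast_unsync(lowerdev, dev);
 *		return 0;
 *	}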
4035 */ 4036 void dev_unicast_unsync(struct net_device *to, struct net_device *from) 4037 { 4038 if (to->addr_len != from->addr_len) 4039 return; 4040 4041 netif_addr_lock_bh(from); 4042 netif_addr_lock(to); 4043 __hw_addr_unsync(&to->uc, &from->uc, to->addr_len); 4044 __dev_set_rx_mode(to); 4045 netif_addr_unlock(to); 4046 netif_addr_unlock_bh(from); 4047 } 4048 EXPORT_SYMBOL(dev_unicast_unsync); 4049 4050 static void dev_unicast_flush(struct net_device *dev) 4051 { 4052 netif_addr_lock_bh(dev); 4053 __hw_addr_flush(&dev->uc); 4054 netif_addr_unlock_bh(dev); 4055 } 4056 4057 static void dev_unicast_init(struct net_device *dev) 4058 { 4059 __hw_addr_init(&dev->uc); 4060 } 4061 4062 4063 static void __dev_addr_discard(struct dev_addr_list **list) 4064 { 4065 struct dev_addr_list *tmp; 4066 4067 while (*list != NULL) { 4068 tmp = *list; 4069 *list = tmp->next; 4070 if (tmp->da_users > tmp->da_gusers) 4071 printk("__dev_addr_discard: address leakage! " 4072 "da_users=%d\n", tmp->da_users); 4073 kfree(tmp); 4074 } 4075 } 4076 4077 static void dev_addr_discard(struct net_device *dev) 4078 { 4079 netif_addr_lock_bh(dev); 4080 4081 __dev_addr_discard(&dev->mc_list); 4082 dev->mc_count = 0; 4083 4084 netif_addr_unlock_bh(dev); 4085 } 4086 4087 /** 4088 * dev_get_flags - get flags reported to userspace 4089 * @dev: device 4090 * 4091 * Get the combination of flag bits exported through APIs to userspace. 4092 */ 4093 unsigned dev_get_flags(const struct net_device *dev) 4094 { 4095 unsigned flags; 4096 4097 flags = (dev->flags & ~(IFF_PROMISC | 4098 IFF_ALLMULTI | 4099 IFF_RUNNING | 4100 IFF_LOWER_UP | 4101 IFF_DORMANT)) | 4102 (dev->gflags & (IFF_PROMISC | 4103 IFF_ALLMULTI)); 4104 4105 if (netif_running(dev)) { 4106 if (netif_oper_up(dev)) 4107 flags |= IFF_RUNNING; 4108 if (netif_carrier_ok(dev)) 4109 flags |= IFF_LOWER_UP; 4110 if (netif_dormant(dev)) 4111 flags |= IFF_DORMANT; 4112 } 4113 4114 return flags; 4115 } 4116 EXPORT_SYMBOL(dev_get_flags); 4117 4118 /** 4119 * dev_change_flags - change device settings 4120 * @dev: device 4121 * @flags: device state flags 4122 * 4123 * Change settings on device based state flags. The flags are 4124 * in the userspace exported format. 4125 */ 4126 int dev_change_flags(struct net_device *dev, unsigned flags) 4127 { 4128 int ret, changes; 4129 int old_flags = dev->flags; 4130 4131 ASSERT_RTNL(); 4132 4133 /* 4134 * Set the flags on our device. 4135 */ 4136 4137 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | 4138 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | 4139 IFF_AUTOMEDIA)) | 4140 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | 4141 IFF_ALLMULTI)); 4142 4143 /* 4144 * Load in the correct multicast list now the flags have changed. 4145 */ 4146 4147 if ((old_flags ^ flags) & IFF_MULTICAST) 4148 dev_change_rx_flags(dev, IFF_MULTICAST); 4149 4150 dev_set_rx_mode(dev); 4151 4152 /* 4153 * Have we downed the interface. We handle IFF_UP ourselves 4154 * according to user attempts to set it, rather than blindly 4155 * setting it. 4156 */ 4157 4158 ret = 0; 4159 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ 4160 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev); 4161 4162 if (!ret) 4163 dev_set_rx_mode(dev); 4164 } 4165 4166 if (dev->flags & IFF_UP && 4167 ((old_flags ^ dev->flags) & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | 4168 IFF_VOLATILE))) 4169 call_netdevice_notifiers(NETDEV_CHANGE, dev); 4170 4171 if ((flags ^ dev->gflags) & IFF_PROMISC) { 4172 int inc = (flags & IFF_PROMISC) ? 
1 : -1; 4173 4174 dev->gflags ^= IFF_PROMISC; 4175 dev_set_promiscuity(dev, inc); 4176 } 4177 4178 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI 4179 is important. Some (broken) drivers set IFF_PROMISC, when 4180 IFF_ALLMULTI is requested not asking us and not reporting. 4181 */ 4182 if ((flags ^ dev->gflags) & IFF_ALLMULTI) { 4183 int inc = (flags & IFF_ALLMULTI) ? 1 : -1; 4184 4185 dev->gflags ^= IFF_ALLMULTI; 4186 dev_set_allmulti(dev, inc); 4187 } 4188 4189 /* Exclude state transition flags, already notified */ 4190 changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING); 4191 if (changes) 4192 rtmsg_ifinfo(RTM_NEWLINK, dev, changes); 4193 4194 return ret; 4195 } 4196 EXPORT_SYMBOL(dev_change_flags); 4197 4198 /** 4199 * dev_set_mtu - Change maximum transfer unit 4200 * @dev: device 4201 * @new_mtu: new transfer unit 4202 * 4203 * Change the maximum transfer size of the network device. 4204 */ 4205 int dev_set_mtu(struct net_device *dev, int new_mtu) 4206 { 4207 const struct net_device_ops *ops = dev->netdev_ops; 4208 int err; 4209 4210 if (new_mtu == dev->mtu) 4211 return 0; 4212 4213 /* MTU must be positive. */ 4214 if (new_mtu < 0) 4215 return -EINVAL; 4216 4217 if (!netif_device_present(dev)) 4218 return -ENODEV; 4219 4220 err = 0; 4221 if (ops->ndo_change_mtu) 4222 err = ops->ndo_change_mtu(dev, new_mtu); 4223 else 4224 dev->mtu = new_mtu; 4225 4226 if (!err && dev->flags & IFF_UP) 4227 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); 4228 return err; 4229 } 4230 EXPORT_SYMBOL(dev_set_mtu); 4231 4232 /** 4233 * dev_set_mac_address - Change Media Access Control Address 4234 * @dev: device 4235 * @sa: new address 4236 * 4237 * Change the hardware (MAC) address of the device 4238 */ 4239 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) 4240 { 4241 const struct net_device_ops *ops = dev->netdev_ops; 4242 int err; 4243 4244 if (!ops->ndo_set_mac_address) 4245 return -EOPNOTSUPP; 4246 if (sa->sa_family != dev->type) 4247 return -EINVAL; 4248 if (!netif_device_present(dev)) 4249 return -ENODEV; 4250 err = ops->ndo_set_mac_address(dev, sa); 4251 if (!err) 4252 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 4253 return err; 4254 } 4255 EXPORT_SYMBOL(dev_set_mac_address); 4256 4257 /* 4258 * Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock) 4259 */ 4260 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd) 4261 { 4262 int err; 4263 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); 4264 4265 if (!dev) 4266 return -ENODEV; 4267 4268 switch (cmd) { 4269 case SIOCGIFFLAGS: /* Get interface flags */ 4270 ifr->ifr_flags = (short) dev_get_flags(dev); 4271 return 0; 4272 4273 case SIOCGIFMETRIC: /* Get the metric on the interface 4274 (currently unused) */ 4275 ifr->ifr_metric = 0; 4276 return 0; 4277 4278 case SIOCGIFMTU: /* Get the MTU of a device */ 4279 ifr->ifr_mtu = dev->mtu; 4280 return 0; 4281 4282 case SIOCGIFHWADDR: 4283 if (!dev->addr_len) 4284 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); 4285 else 4286 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, 4287 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); 4288 ifr->ifr_hwaddr.sa_family = dev->type; 4289 return 0; 4290 4291 case SIOCGIFSLAVE: 4292 err = -EINVAL; 4293 break; 4294 4295 case SIOCGIFMAP: 4296 ifr->ifr_map.mem_start = dev->mem_start; 4297 ifr->ifr_map.mem_end = dev->mem_end; 4298 ifr->ifr_map.base_addr = dev->base_addr; 4299 ifr->ifr_map.irq = dev->irq; 4300 ifr->ifr_map.dma = dev->dma; 4301 
ifr->ifr_map.port = dev->if_port; 4302 return 0; 4303 4304 case SIOCGIFINDEX: 4305 ifr->ifr_ifindex = dev->ifindex; 4306 return 0; 4307 4308 case SIOCGIFTXQLEN: 4309 ifr->ifr_qlen = dev->tx_queue_len; 4310 return 0; 4311 4312 default: 4313 /* dev_ioctl() should ensure this case 4314 * is never reached 4315 */ 4316 WARN_ON(1); 4317 err = -EINVAL; 4318 break; 4319 4320 } 4321 return err; 4322 } 4323 4324 /* 4325 * Perform the SIOCxIFxxx calls, inside rtnl_lock() 4326 */ 4327 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) 4328 { 4329 int err; 4330 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name); 4331 const struct net_device_ops *ops; 4332 4333 if (!dev) 4334 return -ENODEV; 4335 4336 ops = dev->netdev_ops; 4337 4338 switch (cmd) { 4339 case SIOCSIFFLAGS: /* Set interface flags */ 4340 return dev_change_flags(dev, ifr->ifr_flags); 4341 4342 case SIOCSIFMETRIC: /* Set the metric on the interface 4343 (currently unused) */ 4344 return -EOPNOTSUPP; 4345 4346 case SIOCSIFMTU: /* Set the MTU of a device */ 4347 return dev_set_mtu(dev, ifr->ifr_mtu); 4348 4349 case SIOCSIFHWADDR: 4350 return dev_set_mac_address(dev, &ifr->ifr_hwaddr); 4351 4352 case SIOCSIFHWBROADCAST: 4353 if (ifr->ifr_hwaddr.sa_family != dev->type) 4354 return -EINVAL; 4355 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, 4356 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); 4357 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); 4358 return 0; 4359 4360 case SIOCSIFMAP: 4361 if (ops->ndo_set_config) { 4362 if (!netif_device_present(dev)) 4363 return -ENODEV; 4364 return ops->ndo_set_config(dev, &ifr->ifr_map); 4365 } 4366 return -EOPNOTSUPP; 4367 4368 case SIOCADDMULTI: 4369 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || 4370 ifr->ifr_hwaddr.sa_family != AF_UNSPEC) 4371 return -EINVAL; 4372 if (!netif_device_present(dev)) 4373 return -ENODEV; 4374 return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data, 4375 dev->addr_len, 1); 4376 4377 case SIOCDELMULTI: 4378 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) || 4379 ifr->ifr_hwaddr.sa_family != AF_UNSPEC) 4380 return -EINVAL; 4381 if (!netif_device_present(dev)) 4382 return -ENODEV; 4383 return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data, 4384 dev->addr_len, 1); 4385 4386 case SIOCSIFTXQLEN: 4387 if (ifr->ifr_qlen < 0) 4388 return -EINVAL; 4389 dev->tx_queue_len = ifr->ifr_qlen; 4390 return 0; 4391 4392 case SIOCSIFNAME: 4393 ifr->ifr_newname[IFNAMSIZ-1] = '\0'; 4394 return dev_change_name(dev, ifr->ifr_newname); 4395 4396 /* 4397 * Unknown or private ioctl 4398 */ 4399 default: 4400 if ((cmd >= SIOCDEVPRIVATE && 4401 cmd <= SIOCDEVPRIVATE + 15) || 4402 cmd == SIOCBONDENSLAVE || 4403 cmd == SIOCBONDRELEASE || 4404 cmd == SIOCBONDSETHWADDR || 4405 cmd == SIOCBONDSLAVEINFOQUERY || 4406 cmd == SIOCBONDINFOQUERY || 4407 cmd == SIOCBONDCHANGEACTIVE || 4408 cmd == SIOCGMIIPHY || 4409 cmd == SIOCGMIIREG || 4410 cmd == SIOCSMIIREG || 4411 cmd == SIOCBRADDIF || 4412 cmd == SIOCBRDELIF || 4413 cmd == SIOCSHWTSTAMP || 4414 cmd == SIOCWANDEV) { 4415 err = -EOPNOTSUPP; 4416 if (ops->ndo_do_ioctl) { 4417 if (netif_device_present(dev)) 4418 err = ops->ndo_do_ioctl(dev, ifr, cmd); 4419 else 4420 err = -ENODEV; 4421 } 4422 } else 4423 err = -EINVAL; 4424 4425 } 4426 return err; 4427 } 4428 4429 /* 4430 * This function handles all "interface"-type I/O control requests. The actual 4431 * 'doing' part of this is dev_ifsioc above. 
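 *
 * For reference, a user-space request that ends up in dev_ifsioc_locked()
 * or dev_ifsioc() looks roughly like the sketch below (error handling
 * omitted; "eth0" and the new MTU value are arbitrary examples):
 *
 *	struct ifreq ifr;
 *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
 *	ioctl(fd, SIOCGIFMTU, &ifr);	- read path, handled by dev_ifsioc_locked()
 *	ifr.ifr_mtu = 1400;
 *	ioctl(fd, SIOCSIFMTU, &ifr);	- write path, dev_ifsioc() -> dev_set_mtu()
 *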
4432 */ 4433 4434 /** 4435 * dev_ioctl - network device ioctl 4436 * @net: the applicable net namespace 4437 * @cmd: command to issue 4438 * @arg: pointer to a struct ifreq in user space 4439 * 4440 * Issue ioctl functions to devices. This is normally called by the 4441 * user space syscall interfaces but can sometimes be useful for 4442 * other purposes. The return value is the return from the syscall if 4443 * positive or a negative errno code on error. 4444 */ 4445 4446 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) 4447 { 4448 struct ifreq ifr; 4449 int ret; 4450 char *colon; 4451 4452 /* One special case: SIOCGIFCONF takes ifconf argument 4453 and requires shared lock, because it sleeps writing 4454 to user space. 4455 */ 4456 4457 if (cmd == SIOCGIFCONF) { 4458 rtnl_lock(); 4459 ret = dev_ifconf(net, (char __user *) arg); 4460 rtnl_unlock(); 4461 return ret; 4462 } 4463 if (cmd == SIOCGIFNAME) 4464 return dev_ifname(net, (struct ifreq __user *)arg); 4465 4466 if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) 4467 return -EFAULT; 4468 4469 ifr.ifr_name[IFNAMSIZ-1] = 0; 4470 4471 colon = strchr(ifr.ifr_name, ':'); 4472 if (colon) 4473 *colon = 0; 4474 4475 /* 4476 * See which interface the caller is talking about. 4477 */ 4478 4479 switch (cmd) { 4480 /* 4481 * These ioctl calls: 4482 * - can be done by all. 4483 * - atomic and do not require locking. 4484 * - return a value 4485 */ 4486 case SIOCGIFFLAGS: 4487 case SIOCGIFMETRIC: 4488 case SIOCGIFMTU: 4489 case SIOCGIFHWADDR: 4490 case SIOCGIFSLAVE: 4491 case SIOCGIFMAP: 4492 case SIOCGIFINDEX: 4493 case SIOCGIFTXQLEN: 4494 dev_load(net, ifr.ifr_name); 4495 read_lock(&dev_base_lock); 4496 ret = dev_ifsioc_locked(net, &ifr, cmd); 4497 read_unlock(&dev_base_lock); 4498 if (!ret) { 4499 if (colon) 4500 *colon = ':'; 4501 if (copy_to_user(arg, &ifr, 4502 sizeof(struct ifreq))) 4503 ret = -EFAULT; 4504 } 4505 return ret; 4506 4507 case SIOCETHTOOL: 4508 dev_load(net, ifr.ifr_name); 4509 rtnl_lock(); 4510 ret = dev_ethtool(net, &ifr); 4511 rtnl_unlock(); 4512 if (!ret) { 4513 if (colon) 4514 *colon = ':'; 4515 if (copy_to_user(arg, &ifr, 4516 sizeof(struct ifreq))) 4517 ret = -EFAULT; 4518 } 4519 return ret; 4520 4521 /* 4522 * These ioctl calls: 4523 * - require superuser power. 4524 * - require strict serialization. 4525 * - return a value 4526 */ 4527 case SIOCGMIIPHY: 4528 case SIOCGMIIREG: 4529 case SIOCSIFNAME: 4530 if (!capable(CAP_NET_ADMIN)) 4531 return -EPERM; 4532 dev_load(net, ifr.ifr_name); 4533 rtnl_lock(); 4534 ret = dev_ifsioc(net, &ifr, cmd); 4535 rtnl_unlock(); 4536 if (!ret) { 4537 if (colon) 4538 *colon = ':'; 4539 if (copy_to_user(arg, &ifr, 4540 sizeof(struct ifreq))) 4541 ret = -EFAULT; 4542 } 4543 return ret; 4544 4545 /* 4546 * These ioctl calls: 4547 * - require superuser power. 4548 * - require strict serialization. 
4549 * - do not return a value 4550 */ 4551 case SIOCSIFFLAGS: 4552 case SIOCSIFMETRIC: 4553 case SIOCSIFMTU: 4554 case SIOCSIFMAP: 4555 case SIOCSIFHWADDR: 4556 case SIOCSIFSLAVE: 4557 case SIOCADDMULTI: 4558 case SIOCDELMULTI: 4559 case SIOCSIFHWBROADCAST: 4560 case SIOCSIFTXQLEN: 4561 case SIOCSMIIREG: 4562 case SIOCBONDENSLAVE: 4563 case SIOCBONDRELEASE: 4564 case SIOCBONDSETHWADDR: 4565 case SIOCBONDCHANGEACTIVE: 4566 case SIOCBRADDIF: 4567 case SIOCBRDELIF: 4568 case SIOCSHWTSTAMP: 4569 if (!capable(CAP_NET_ADMIN)) 4570 return -EPERM; 4571 /* fall through */ 4572 case SIOCBONDSLAVEINFOQUERY: 4573 case SIOCBONDINFOQUERY: 4574 dev_load(net, ifr.ifr_name); 4575 rtnl_lock(); 4576 ret = dev_ifsioc(net, &ifr, cmd); 4577 rtnl_unlock(); 4578 return ret; 4579 4580 case SIOCGIFMEM: 4581 /* Get the per device memory space. We can add this but 4582 * currently do not support it */ 4583 case SIOCSIFMEM: 4584 /* Set the per device memory buffer space. 4585 * Not applicable in our case */ 4586 case SIOCSIFLINK: 4587 return -EINVAL; 4588 4589 /* 4590 * Unknown or private ioctl. 4591 */ 4592 default: 4593 if (cmd == SIOCWANDEV || 4594 (cmd >= SIOCDEVPRIVATE && 4595 cmd <= SIOCDEVPRIVATE + 15)) { 4596 dev_load(net, ifr.ifr_name); 4597 rtnl_lock(); 4598 ret = dev_ifsioc(net, &ifr, cmd); 4599 rtnl_unlock(); 4600 if (!ret && copy_to_user(arg, &ifr, 4601 sizeof(struct ifreq))) 4602 ret = -EFAULT; 4603 return ret; 4604 } 4605 /* Take care of Wireless Extensions */ 4606 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) 4607 return wext_handle_ioctl(net, &ifr, cmd, arg); 4608 return -EINVAL; 4609 } 4610 } 4611 4612 4613 /** 4614 * dev_new_index - allocate an ifindex 4615 * @net: the applicable net namespace 4616 * 4617 * Returns a suitable unique value for a new device interface 4618 * number. The caller must hold the rtnl semaphore or the 4619 * dev_base_lock to be sure it remains unique. 4620 */ 4621 static int dev_new_index(struct net *net) 4622 { 4623 static int ifindex; 4624 for (;;) { 4625 if (++ifindex <= 0) 4626 ifindex = 1; 4627 if (!__dev_get_by_index(net, ifindex)) 4628 return ifindex; 4629 } 4630 } 4631 4632 /* Delayed registration/unregisteration */ 4633 static LIST_HEAD(net_todo_list); 4634 4635 static void net_set_todo(struct net_device *dev) 4636 { 4637 list_add_tail(&dev->todo_list, &net_todo_list); 4638 } 4639 4640 static void rollback_registered_many(struct list_head *head) 4641 { 4642 struct net_device *dev; 4643 4644 BUG_ON(dev_boot_phase); 4645 ASSERT_RTNL(); 4646 4647 list_for_each_entry(dev, head, unreg_list) { 4648 /* Some devices call without registering 4649 * for initialization unwind. 4650 */ 4651 if (dev->reg_state == NETREG_UNINITIALIZED) { 4652 pr_debug("unregister_netdevice: device %s/%p never " 4653 "was registered\n", dev->name, dev); 4654 4655 WARN_ON(1); 4656 return; 4657 } 4658 4659 BUG_ON(dev->reg_state != NETREG_REGISTERED); 4660 4661 /* If device is running, close it first. */ 4662 dev_close(dev); 4663 4664 /* And unlink it from device chain. */ 4665 unlist_netdevice(dev); 4666 4667 dev->reg_state = NETREG_UNREGISTERING; 4668 } 4669 4670 synchronize_net(); 4671 4672 list_for_each_entry(dev, head, unreg_list) { 4673 /* Shutdown queueing discipline. */ 4674 dev_shutdown(dev); 4675 4676 4677 /* Notify protocols, that we are about to destroy 4678 this device. They should clean all the things. 
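		   A typical listener is a netdevice notifier registered with
		   register_netdevice_notifier(): on NETDEV_UNREGISTER it drops its
		   per-device state and releases any reference it holds. A minimal
		   sketch (the my_* names are hypothetical):

			static int my_netdev_event(struct notifier_block *nb,
						   unsigned long event, void *ptr)
			{
				struct net_device *dev = ptr;

				if (event == NETDEV_UNREGISTER)
					my_release_state(dev);
				return NOTIFY_DONE;
			}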
4679 */ 4680 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 4681 4682 /* 4683 * Flush the unicast and multicast chains 4684 */ 4685 dev_unicast_flush(dev); 4686 dev_addr_discard(dev); 4687 4688 if (dev->netdev_ops->ndo_uninit) 4689 dev->netdev_ops->ndo_uninit(dev); 4690 4691 /* Notifier chain MUST detach us from master device. */ 4692 WARN_ON(dev->master); 4693 4694 /* Remove entries from kobject tree */ 4695 netdev_unregister_kobject(dev); 4696 } 4697 4698 synchronize_net(); 4699 4700 list_for_each_entry(dev, head, unreg_list) 4701 dev_put(dev); 4702 } 4703 4704 static void rollback_registered(struct net_device *dev) 4705 { 4706 LIST_HEAD(single); 4707 4708 list_add(&dev->unreg_list, &single); 4709 rollback_registered_many(&single); 4710 } 4711 4712 static void __netdev_init_queue_locks_one(struct net_device *dev, 4713 struct netdev_queue *dev_queue, 4714 void *_unused) 4715 { 4716 spin_lock_init(&dev_queue->_xmit_lock); 4717 netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type); 4718 dev_queue->xmit_lock_owner = -1; 4719 } 4720 4721 static void netdev_init_queue_locks(struct net_device *dev) 4722 { 4723 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL); 4724 __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL); 4725 } 4726 4727 unsigned long netdev_fix_features(unsigned long features, const char *name) 4728 { 4729 /* Fix illegal SG+CSUM combinations. */ 4730 if ((features & NETIF_F_SG) && 4731 !(features & NETIF_F_ALL_CSUM)) { 4732 if (name) 4733 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no " 4734 "checksum feature.\n", name); 4735 features &= ~NETIF_F_SG; 4736 } 4737 4738 /* TSO requires that SG is present as well. */ 4739 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) { 4740 if (name) 4741 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no " 4742 "SG feature.\n", name); 4743 features &= ~NETIF_F_TSO; 4744 } 4745 4746 if (features & NETIF_F_UFO) { 4747 if (!(features & NETIF_F_GEN_CSUM)) { 4748 if (name) 4749 printk(KERN_ERR "%s: Dropping NETIF_F_UFO " 4750 "since no NETIF_F_HW_CSUM feature.\n", 4751 name); 4752 features &= ~NETIF_F_UFO; 4753 } 4754 4755 if (!(features & NETIF_F_SG)) { 4756 if (name) 4757 printk(KERN_ERR "%s: Dropping NETIF_F_UFO " 4758 "since no NETIF_F_SG feature.\n", name); 4759 features &= ~NETIF_F_UFO; 4760 } 4761 } 4762 4763 return features; 4764 } 4765 EXPORT_SYMBOL(netdev_fix_features); 4766 4767 /** 4768 * register_netdevice - register a network device 4769 * @dev: device to register 4770 * 4771 * Take a completed network device structure and add it to the kernel 4772 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 4773 * chain. 0 is returned on success. A negative errno code is returned 4774 * on a failure to set up the device, or if the name is a duplicate. 4775 * 4776 * Callers must hold the rtnl semaphore. You may want 4777 * register_netdev() instead of this. 4778 * 4779 * BUGS: 4780 * The locking appears insufficient to guarantee two parallel registers 4781 * will not get the same name. 4782 */ 4783 4784 int register_netdevice(struct net_device *dev) 4785 { 4786 struct hlist_head *head; 4787 struct hlist_node *p; 4788 int ret; 4789 struct net *net = dev_net(dev); 4790 4791 BUG_ON(dev_boot_phase); 4792 ASSERT_RTNL(); 4793 4794 might_sleep(); 4795 4796 /* When net_device's are persistent, this will be fatal. 
*/ 4797 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); 4798 BUG_ON(!net); 4799 4800 spin_lock_init(&dev->addr_list_lock); 4801 netdev_set_addr_lockdep_class(dev); 4802 netdev_init_queue_locks(dev); 4803 4804 dev->iflink = -1; 4805 4806 /* Init, if this function is available */ 4807 if (dev->netdev_ops->ndo_init) { 4808 ret = dev->netdev_ops->ndo_init(dev); 4809 if (ret) { 4810 if (ret > 0) 4811 ret = -EIO; 4812 goto out; 4813 } 4814 } 4815 4816 if (!dev_valid_name(dev->name)) { 4817 ret = -EINVAL; 4818 goto err_uninit; 4819 } 4820 4821 dev->ifindex = dev_new_index(net); 4822 if (dev->iflink == -1) 4823 dev->iflink = dev->ifindex; 4824 4825 /* Check for existence of name */ 4826 head = dev_name_hash(net, dev->name); 4827 hlist_for_each(p, head) { 4828 struct net_device *d 4829 = hlist_entry(p, struct net_device, name_hlist); 4830 if (!strncmp(d->name, dev->name, IFNAMSIZ)) { 4831 ret = -EEXIST; 4832 goto err_uninit; 4833 } 4834 } 4835 4836 /* Fix illegal checksum combinations */ 4837 if ((dev->features & NETIF_F_HW_CSUM) && 4838 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 4839 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n", 4840 dev->name); 4841 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); 4842 } 4843 4844 if ((dev->features & NETIF_F_NO_CSUM) && 4845 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) { 4846 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n", 4847 dev->name); 4848 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); 4849 } 4850 4851 dev->features = netdev_fix_features(dev->features, dev->name); 4852 4853 /* Enable software GSO if SG is supported. */ 4854 if (dev->features & NETIF_F_SG) 4855 dev->features |= NETIF_F_GSO; 4856 4857 netdev_initialize_kobject(dev); 4858 4859 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); 4860 ret = notifier_to_errno(ret); 4861 if (ret) 4862 goto err_uninit; 4863 4864 ret = netdev_register_kobject(dev); 4865 if (ret) 4866 goto err_uninit; 4867 dev->reg_state = NETREG_REGISTERED; 4868 4869 /* 4870 * Default initial state at registry is that the 4871 * device is present. 4872 */ 4873 4874 set_bit(__LINK_STATE_PRESENT, &dev->state); 4875 4876 dev_init_scheduler(dev); 4877 dev_hold(dev); 4878 list_netdevice(dev); 4879 4880 /* Notify protocols, that a new device appeared. */ 4881 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev); 4882 ret = notifier_to_errno(ret); 4883 if (ret) { 4884 rollback_registered(dev); 4885 dev->reg_state = NETREG_UNREGISTERED; 4886 } 4887 4888 out: 4889 return ret; 4890 4891 err_uninit: 4892 if (dev->netdev_ops->ndo_uninit) 4893 dev->netdev_ops->ndo_uninit(dev); 4894 goto out; 4895 } 4896 EXPORT_SYMBOL(register_netdevice); 4897 4898 /** 4899 * init_dummy_netdev - init a dummy network device for NAPI 4900 * @dev: device to init 4901 * 4902 * This takes a network device structure and initialize the minimum 4903 * amount of fields so it can be used to schedule NAPI polls without 4904 * registering a full blown interface. This is to be used by drivers 4905 * that need to tie several hardware interfaces to a single NAPI 4906 * poll scheduler due to HW limitations. 4907 */ 4908 int init_dummy_netdev(struct net_device *dev) 4909 { 4910 /* Clear everything. 
Note we don't initialize spinlocks 4911 * are they aren't supposed to be taken by any of the 4912 * NAPI code and this dummy netdev is supposed to be 4913 * only ever used for NAPI polls 4914 */ 4915 memset(dev, 0, sizeof(struct net_device)); 4916 4917 /* make sure we BUG if trying to hit standard 4918 * register/unregister code path 4919 */ 4920 dev->reg_state = NETREG_DUMMY; 4921 4922 /* initialize the ref count */ 4923 atomic_set(&dev->refcnt, 1); 4924 4925 /* NAPI wants this */ 4926 INIT_LIST_HEAD(&dev->napi_list); 4927 4928 /* a dummy interface is started by default */ 4929 set_bit(__LINK_STATE_PRESENT, &dev->state); 4930 set_bit(__LINK_STATE_START, &dev->state); 4931 4932 return 0; 4933 } 4934 EXPORT_SYMBOL_GPL(init_dummy_netdev); 4935 4936 4937 /** 4938 * register_netdev - register a network device 4939 * @dev: device to register 4940 * 4941 * Take a completed network device structure and add it to the kernel 4942 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier 4943 * chain. 0 is returned on success. A negative errno code is returned 4944 * on a failure to set up the device, or if the name is a duplicate. 4945 * 4946 * This is a wrapper around register_netdevice that takes the rtnl semaphore 4947 * and expands the device name if you passed a format string to 4948 * alloc_netdev. 4949 */ 4950 int register_netdev(struct net_device *dev) 4951 { 4952 int err; 4953 4954 rtnl_lock(); 4955 4956 /* 4957 * If the name is a format string the caller wants us to do a 4958 * name allocation. 4959 */ 4960 if (strchr(dev->name, '%')) { 4961 err = dev_alloc_name(dev, dev->name); 4962 if (err < 0) 4963 goto out; 4964 } 4965 4966 err = register_netdevice(dev); 4967 out: 4968 rtnl_unlock(); 4969 return err; 4970 } 4971 EXPORT_SYMBOL(register_netdev); 4972 4973 /* 4974 * netdev_wait_allrefs - wait until all references are gone. 4975 * 4976 * This is called when unregistering network devices. 4977 * 4978 * Any protocol or device that holds a reference should register 4979 * for netdevice notification, and cleanup and put back the 4980 * reference if they receive an UNREGISTER event. 4981 * We can get stuck here if buggy protocols don't correctly 4982 * call dev_put. 4983 */ 4984 static void netdev_wait_allrefs(struct net_device *dev) 4985 { 4986 unsigned long rebroadcast_time, warning_time; 4987 4988 rebroadcast_time = warning_time = jiffies; 4989 while (atomic_read(&dev->refcnt) != 0) { 4990 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { 4991 rtnl_lock(); 4992 4993 /* Rebroadcast unregister notification */ 4994 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 4995 4996 if (test_bit(__LINK_STATE_LINKWATCH_PENDING, 4997 &dev->state)) { 4998 /* We must not have linkwatch events 4999 * pending on unregister. If this 5000 * happens, we simply run the queue 5001 * unscheduled, resulting in a noop 5002 * for this device. 5003 */ 5004 linkwatch_run_queue(); 5005 } 5006 5007 __rtnl_unlock(); 5008 5009 rebroadcast_time = jiffies; 5010 } 5011 5012 msleep(250); 5013 5014 if (time_after(jiffies, warning_time + 10 * HZ)) { 5015 printk(KERN_EMERG "unregister_netdevice: " 5016 "waiting for %s to become free. Usage " 5017 "count = %d\n", 5018 dev->name, atomic_read(&dev->refcnt)); 5019 warning_time = jiffies; 5020 } 5021 } 5022 } 5023 5024 /* The sequence is: 5025 * 5026 * rtnl_lock(); 5027 * ... 5028 * register_netdevice(x1); 5029 * register_netdevice(x2); 5030 * ... 5031 * unregister_netdevice(y1); 5032 * unregister_netdevice(y2); 5033 * ... 
5034 * rtnl_unlock(); 5035 * free_netdev(y1); 5036 * free_netdev(y2); 5037 * 5038 * We are invoked by rtnl_unlock(). 5039 * This allows us to deal with problems: 5040 * 1) We can delete sysfs objects which invoke hotplug 5041 * without deadlocking with linkwatch via keventd. 5042 * 2) Since we run with the RTNL semaphore not held, we can sleep 5043 * safely in order to wait for the netdev refcnt to drop to zero. 5044 * 5045 * We must not return until all unregister events added during 5046 * the interval the lock was held have been completed. 5047 */ 5048 void netdev_run_todo(void) 5049 { 5050 struct list_head list; 5051 5052 /* Snapshot list, allow later requests */ 5053 list_replace_init(&net_todo_list, &list); 5054 5055 __rtnl_unlock(); 5056 5057 while (!list_empty(&list)) { 5058 struct net_device *dev 5059 = list_entry(list.next, struct net_device, todo_list); 5060 list_del(&dev->todo_list); 5061 5062 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) { 5063 printk(KERN_ERR "network todo '%s' but state %d\n", 5064 dev->name, dev->reg_state); 5065 dump_stack(); 5066 continue; 5067 } 5068 5069 dev->reg_state = NETREG_UNREGISTERED; 5070 5071 on_each_cpu(flush_backlog, dev, 1); 5072 5073 netdev_wait_allrefs(dev); 5074 5075 /* paranoia */ 5076 BUG_ON(atomic_read(&dev->refcnt)); 5077 WARN_ON(dev->ip_ptr); 5078 WARN_ON(dev->ip6_ptr); 5079 WARN_ON(dev->dn_ptr); 5080 5081 if (dev->destructor) 5082 dev->destructor(dev); 5083 5084 /* Free network device */ 5085 kobject_put(&dev->dev.kobj); 5086 } 5087 } 5088 5089 /** 5090 * dev_get_stats - get network device statistics 5091 * @dev: device to get statistics from 5092 * 5093 * Get network statistics from device. The device driver may provide 5094 * its own method by setting dev->netdev_ops->get_stats; otherwise 5095 * the internal statistics structure is used. 5096 */ 5097 const struct net_device_stats *dev_get_stats(struct net_device *dev) 5098 { 5099 const struct net_device_ops *ops = dev->netdev_ops; 5100 5101 if (ops->ndo_get_stats) 5102 return ops->ndo_get_stats(dev); 5103 else { 5104 unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0; 5105 struct net_device_stats *stats = &dev->stats; 5106 unsigned int i; 5107 struct netdev_queue *txq; 5108 5109 for (i = 0; i < dev->num_tx_queues; i++) { 5110 txq = netdev_get_tx_queue(dev, i); 5111 tx_bytes += txq->tx_bytes; 5112 tx_packets += txq->tx_packets; 5113 tx_dropped += txq->tx_dropped; 5114 } 5115 if (tx_bytes || tx_packets || tx_dropped) { 5116 stats->tx_bytes = tx_bytes; 5117 stats->tx_packets = tx_packets; 5118 stats->tx_dropped = tx_dropped; 5119 } 5120 return stats; 5121 } 5122 } 5123 EXPORT_SYMBOL(dev_get_stats); 5124 5125 static void netdev_init_one_queue(struct net_device *dev, 5126 struct netdev_queue *queue, 5127 void *_unused) 5128 { 5129 queue->dev = dev; 5130 } 5131 5132 static void netdev_init_queues(struct net_device *dev) 5133 { 5134 netdev_init_one_queue(dev, &dev->rx_queue, NULL); 5135 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); 5136 spin_lock_init(&dev->tx_global_lock); 5137 } 5138 5139 /** 5140 * alloc_netdev_mq - allocate network device 5141 * @sizeof_priv: size of private data to allocate space for 5142 * @name: device name format string 5143 * @setup: callback to initialize device 5144 * @queue_count: the number of subqueues to allocate 5145 * 5146 * Allocates a struct net_device with private data area for driver use 5147 * and performs basic initialization. 
Also allocates subquue structs 5148 * for each queue on the device at the end of the netdevice. 5149 */ 5150 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, 5151 void (*setup)(struct net_device *), unsigned int queue_count) 5152 { 5153 struct netdev_queue *tx; 5154 struct net_device *dev; 5155 size_t alloc_size; 5156 struct net_device *p; 5157 5158 BUG_ON(strlen(name) >= sizeof(dev->name)); 5159 5160 alloc_size = sizeof(struct net_device); 5161 if (sizeof_priv) { 5162 /* ensure 32-byte alignment of private area */ 5163 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); 5164 alloc_size += sizeof_priv; 5165 } 5166 /* ensure 32-byte alignment of whole construct */ 5167 alloc_size += NETDEV_ALIGN - 1; 5168 5169 p = kzalloc(alloc_size, GFP_KERNEL); 5170 if (!p) { 5171 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n"); 5172 return NULL; 5173 } 5174 5175 tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL); 5176 if (!tx) { 5177 printk(KERN_ERR "alloc_netdev: Unable to allocate " 5178 "tx qdiscs.\n"); 5179 goto free_p; 5180 } 5181 5182 dev = PTR_ALIGN(p, NETDEV_ALIGN); 5183 dev->padded = (char *)dev - (char *)p; 5184 5185 if (dev_addr_init(dev)) 5186 goto free_tx; 5187 5188 dev_unicast_init(dev); 5189 5190 dev_net_set(dev, &init_net); 5191 5192 dev->_tx = tx; 5193 dev->num_tx_queues = queue_count; 5194 dev->real_num_tx_queues = queue_count; 5195 5196 dev->gso_max_size = GSO_MAX_SIZE; 5197 5198 netdev_init_queues(dev); 5199 5200 INIT_LIST_HEAD(&dev->napi_list); 5201 dev->priv_flags = IFF_XMIT_DST_RELEASE; 5202 setup(dev); 5203 strcpy(dev->name, name); 5204 return dev; 5205 5206 free_tx: 5207 kfree(tx); 5208 5209 free_p: 5210 kfree(p); 5211 return NULL; 5212 } 5213 EXPORT_SYMBOL(alloc_netdev_mq); 5214 5215 /** 5216 * free_netdev - free network device 5217 * @dev: device 5218 * 5219 * This function does the last stage of destroying an allocated device 5220 * interface. The reference to the device object is released. 5221 * If this is the last reference then it will be freed. 5222 */ 5223 void free_netdev(struct net_device *dev) 5224 { 5225 struct napi_struct *p, *n; 5226 5227 release_net(dev_net(dev)); 5228 5229 kfree(dev->_tx); 5230 5231 /* Flush device addresses */ 5232 dev_addr_flush(dev); 5233 5234 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) 5235 netif_napi_del(p); 5236 5237 /* Compatibility with error handling in drivers */ 5238 if (dev->reg_state == NETREG_UNINITIALIZED) { 5239 kfree((char *)dev - dev->padded); 5240 return; 5241 } 5242 5243 BUG_ON(dev->reg_state != NETREG_UNREGISTERED); 5244 dev->reg_state = NETREG_RELEASED; 5245 5246 /* will free via device release */ 5247 put_device(&dev->dev); 5248 } 5249 EXPORT_SYMBOL(free_netdev); 5250 5251 /** 5252 * synchronize_net - Synchronize with packet receive processing 5253 * 5254 * Wait for packets currently being received to be done. 5255 * Does not block later packets from starting. 5256 */ 5257 void synchronize_net(void) 5258 { 5259 might_sleep(); 5260 synchronize_rcu(); 5261 } 5262 EXPORT_SYMBOL(synchronize_net); 5263 5264 /** 5265 * unregister_netdevice_queue - remove device from the kernel 5266 * @dev: device 5267 * @head: list 5268 5269 * This function shuts down a device interface and removes it 5270 * from the kernel tables. 5271 * If head not NULL, device is queued to be unregistered later. 5272 * 5273 * Callers must hold the rtnl semaphore. You may want 5274 * unregister_netdev() instead of this. 
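 *
 * When removing several devices at once, a caller can batch them on a
 * list and tear them down together, e.g. (illustrative sketch; dev1 and
 * dev2 stand for any registered devices):
 *
 *	LIST_HEAD(kill_list);
 *
 *	rtnl_lock();
 *	unregister_netdevice_queue(dev1, &kill_list);
 *	unregister_netdevice_queue(dev2, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();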
5275 */ 5276 5277 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head) 5278 { 5279 ASSERT_RTNL(); 5280 5281 if (head) { 5282 list_add_tail(&dev->unreg_list, head); 5283 } else { 5284 rollback_registered(dev); 5285 /* Finish processing unregister after unlock */ 5286 net_set_todo(dev); 5287 } 5288 } 5289 EXPORT_SYMBOL(unregister_netdevice_queue); 5290 5291 /** 5292 * unregister_netdevice_many - unregister many devices 5293 * @head: list of devices 5294 * 5295 */ 5296 void unregister_netdevice_many(struct list_head *head) 5297 { 5298 struct net_device *dev; 5299 5300 if (!list_empty(head)) { 5301 rollback_registered_many(head); 5302 list_for_each_entry(dev, head, unreg_list) 5303 net_set_todo(dev); 5304 } 5305 } 5306 5307 /** 5308 * unregister_netdev - remove device from the kernel 5309 * @dev: device 5310 * 5311 * This function shuts down a device interface and removes it 5312 * from the kernel tables. 5313 * 5314 * This is just a wrapper for unregister_netdevice that takes 5315 * the rtnl semaphore. In general you want to use this and not 5316 * unregister_netdevice. 5317 */ 5318 void unregister_netdev(struct net_device *dev) 5319 { 5320 rtnl_lock(); 5321 unregister_netdevice(dev); 5322 rtnl_unlock(); 5323 } 5324 EXPORT_SYMBOL(unregister_netdev); 5325 5326 /** 5327 * dev_change_net_namespace - move device to a different network namespace 5328 * @dev: device 5329 * @net: network namespace 5330 * @pat: If not NULL, name pattern to try if the current device name 5331 * is already taken in the destination network namespace. 5332 * 5333 * This function shuts down a device interface and moves it 5334 * to a new network namespace. On success 0 is returned, on 5335 * a failure a negative errno code is returned. 5336 * 5337 * Callers must hold the rtnl semaphore. 5338 */ 5339 5340 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) 5341 { 5342 char buf[IFNAMSIZ]; 5343 const char *destname; 5344 int err; 5345 5346 ASSERT_RTNL(); 5347 5348 /* Don't allow namespace local devices to be moved. */ 5349 err = -EINVAL; 5350 if (dev->features & NETIF_F_NETNS_LOCAL) 5351 goto out; 5352 5353 #ifdef CONFIG_SYSFS 5354 /* Don't allow real devices to be moved when sysfs 5355 * is enabled. 5356 */ 5357 err = -EINVAL; 5358 if (dev->dev.parent) 5359 goto out; 5360 #endif 5361 5362 /* Ensure the device has been registered */ 5363 err = -EINVAL; 5364 if (dev->reg_state != NETREG_REGISTERED) 5365 goto out; 5366 5367 /* Get out if there is nothing to do */ 5368 err = 0; 5369 if (net_eq(dev_net(dev), net)) 5370 goto out; 5371 5372 /* Pick the destination device name, and ensure 5373 * we can use it in the destination network namespace. 5374 */ 5375 err = -EEXIST; 5376 destname = dev->name; 5377 if (__dev_get_by_name(net, destname)) { 5378 /* We get here if we can't use the current device name */ 5379 if (!pat) 5380 goto out; 5381 if (!dev_valid_name(pat)) 5382 goto out; 5383 if (strchr(pat, '%')) { 5384 if (__dev_alloc_name(net, pat, buf) < 0) 5385 goto out; 5386 destname = buf; 5387 } else 5388 destname = pat; 5389 if (__dev_get_by_name(net, destname)) 5390 goto out; 5391 } 5392 5393 /* 5394 * And now a mini version of register_netdevice/unregister_netdevice. 5395 */ 5396 5397 /* If device is running, close it first. */ 5398 dev_close(dev); 5399 5400 /* And unlink it from device chain */ 5401 err = -ENODEV; 5402 unlist_netdevice(dev); 5403 5404 synchronize_net(); 5405 5406 /* Shutdown queueing discipline.
*/ 5407 dev_shutdown(dev); 5408 5409 /* Notify protocols, that we are about to destroy 5410 this device. They should clean all the things. 5411 */ 5412 call_netdevice_notifiers(NETDEV_UNREGISTER, dev); 5413 5414 /* 5415 * Flush the unicast and multicast chains 5416 */ 5417 dev_unicast_flush(dev); 5418 dev_addr_discard(dev); 5419 5420 netdev_unregister_kobject(dev); 5421 5422 /* Actually switch the network namespace */ 5423 dev_net_set(dev, net); 5424 5425 /* Assign the new device name */ 5426 if (destname != dev->name) 5427 strcpy(dev->name, destname); 5428 5429 /* If there is an ifindex conflict assign a new one */ 5430 if (__dev_get_by_index(net, dev->ifindex)) { 5431 int iflink = (dev->iflink == dev->ifindex); 5432 dev->ifindex = dev_new_index(net); 5433 if (iflink) 5434 dev->iflink = dev->ifindex; 5435 } 5436 5437 /* Fixup kobjects */ 5438 err = netdev_register_kobject(dev); 5439 WARN_ON(err); 5440 5441 /* Add the device back in the hashes */ 5442 list_netdevice(dev); 5443 5444 /* Notify protocols, that a new device appeared. */ 5445 call_netdevice_notifiers(NETDEV_REGISTER, dev); 5446 5447 synchronize_net(); 5448 err = 0; 5449 out: 5450 return err; 5451 } 5452 EXPORT_SYMBOL_GPL(dev_change_net_namespace); 5453 5454 static int dev_cpu_callback(struct notifier_block *nfb, 5455 unsigned long action, 5456 void *ocpu) 5457 { 5458 struct sk_buff **list_skb; 5459 struct Qdisc **list_net; 5460 struct sk_buff *skb; 5461 unsigned int cpu, oldcpu = (unsigned long)ocpu; 5462 struct softnet_data *sd, *oldsd; 5463 5464 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 5465 return NOTIFY_OK; 5466 5467 local_irq_disable(); 5468 cpu = smp_processor_id(); 5469 sd = &per_cpu(softnet_data, cpu); 5470 oldsd = &per_cpu(softnet_data, oldcpu); 5471 5472 /* Find end of our completion_queue. */ 5473 list_skb = &sd->completion_queue; 5474 while (*list_skb) 5475 list_skb = &(*list_skb)->next; 5476 /* Append completion queue from offline CPU. */ 5477 *list_skb = oldsd->completion_queue; 5478 oldsd->completion_queue = NULL; 5479 5480 /* Find end of our output_queue. */ 5481 list_net = &sd->output_queue; 5482 while (*list_net) 5483 list_net = &(*list_net)->next_sched; 5484 /* Append output queue from offline CPU. */ 5485 *list_net = oldsd->output_queue; 5486 oldsd->output_queue = NULL; 5487 5488 raise_softirq_irqoff(NET_TX_SOFTIRQ); 5489 local_irq_enable(); 5490 5491 /* Process offline CPU's input_pkt_queue */ 5492 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) 5493 netif_rx(skb); 5494 5495 return NOTIFY_OK; 5496 } 5497 5498 5499 /** 5500 * netdev_increment_features - increment feature set by one 5501 * @all: current feature set 5502 * @one: new feature set 5503 * @mask: mask feature set 5504 * 5505 * Computes a new feature set after adding a device with feature set 5506 * @one to the master device with current feature set @all. Will not 5507 * enable anything that is off in @mask. Returns the new feature set. 5508 */ 5509 unsigned long netdev_increment_features(unsigned long all, unsigned long one, 5510 unsigned long mask) 5511 { 5512 /* If device needs checksumming, downgrade to it. */ 5513 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM)) 5514 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM); 5515 else if (mask & NETIF_F_ALL_CSUM) { 5516 /* If one device supports v4/v6 checksumming, set for all. 
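 * E.g. if @all currently advertises NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM
 * and the newly added @one only advertises NETIF_F_IP_CSUM, @all is
 * reduced to NETIF_F_IP_CSUM, so the master never offloads a checksum
 * the new slave cannot compute.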
*/ 5517 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) && 5518 !(all & NETIF_F_GEN_CSUM)) { 5519 all &= ~NETIF_F_ALL_CSUM; 5520 all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); 5521 } 5522 5523 /* If one device supports hw checksumming, set for all. */ 5524 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) { 5525 all &= ~NETIF_F_ALL_CSUM; 5526 all |= NETIF_F_HW_CSUM; 5527 } 5528 } 5529 5530 one |= NETIF_F_ALL_CSUM; 5531 5532 one |= all & NETIF_F_ONE_FOR_ALL; 5533 all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO; 5534 all |= one & mask & NETIF_F_ONE_FOR_ALL; 5535 5536 return all; 5537 } 5538 EXPORT_SYMBOL(netdev_increment_features); 5539 5540 static struct hlist_head *netdev_create_hash(void) 5541 { 5542 int i; 5543 struct hlist_head *hash; 5544 5545 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL); 5546 if (hash != NULL) 5547 for (i = 0; i < NETDEV_HASHENTRIES; i++) 5548 INIT_HLIST_HEAD(&hash[i]); 5549 5550 return hash; 5551 } 5552 5553 /* Initialize per network namespace state */ 5554 static int __net_init netdev_init(struct net *net) 5555 { 5556 INIT_LIST_HEAD(&net->dev_base_head); 5557 5558 net->dev_name_head = netdev_create_hash(); 5559 if (net->dev_name_head == NULL) 5560 goto err_name; 5561 5562 net->dev_index_head = netdev_create_hash(); 5563 if (net->dev_index_head == NULL) 5564 goto err_idx; 5565 5566 return 0; 5567 5568 err_idx: 5569 kfree(net->dev_name_head); 5570 err_name: 5571 return -ENOMEM; 5572 } 5573 5574 /** 5575 * netdev_drivername - network driver for the device 5576 * @dev: network device 5577 * @buffer: buffer for resulting name 5578 * @len: size of buffer 5579 * 5580 * Determine network driver for device. 5581 */ 5582 char *netdev_drivername(const struct net_device *dev, char *buffer, int len) 5583 { 5584 const struct device_driver *driver; 5585 const struct device *parent; 5586 5587 if (len <= 0 || !buffer) 5588 return buffer; 5589 buffer[0] = 0; 5590 5591 parent = dev->dev.parent; 5592 5593 if (!parent) 5594 return buffer; 5595 5596 driver = parent->driver; 5597 if (driver && driver->name) 5598 strlcpy(buffer, driver->name, len); 5599 return buffer; 5600 } 5601 5602 static void __net_exit netdev_exit(struct net *net) 5603 { 5604 kfree(net->dev_name_head); 5605 kfree(net->dev_index_head); 5606 } 5607 5608 static struct pernet_operations __net_initdata netdev_net_ops = { 5609 .init = netdev_init, 5610 .exit = netdev_exit, 5611 }; 5612 5613 static void __net_exit default_device_exit(struct net *net) 5614 { 5615 struct net_device *dev; 5616 /* 5617 * Push all migratable of the network devices back to the 5618 * initial network namespace 5619 */ 5620 rtnl_lock(); 5621 restart: 5622 for_each_netdev(net, dev) { 5623 int err; 5624 char fb_name[IFNAMSIZ]; 5625 5626 /* Ignore unmoveable devices (i.e. 
loopback) */ 5627 if (dev->features & NETIF_F_NETNS_LOCAL) 5628 continue; 5629 5630 /* Delete virtual devices */ 5631 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) { 5632 dev->rtnl_link_ops->dellink(dev); 5633 goto restart; 5634 } 5635 5636 /* Push remaining network devices to init_net */ 5637 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); 5638 err = dev_change_net_namespace(dev, &init_net, fb_name); 5639 if (err) { 5640 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n", 5641 __func__, dev->name, err); 5642 BUG(); 5643 } 5644 goto restart; 5645 } 5646 rtnl_unlock(); 5647 } 5648 5649 static struct pernet_operations __net_initdata default_device_ops = { 5650 .exit = default_device_exit, 5651 }; 5652 5653 /* 5654 * Initialize the DEV module. At boot time this walks the device list and 5655 * unhooks any devices that fail to initialise (normally hardware not 5656 * present) and leaves us with a valid list of present and active devices. 5657 * 5658 */ 5659 5660 /* 5661 * This is called single threaded during boot, so no need 5662 * to take the rtnl semaphore. 5663 */ 5664 static int __init net_dev_init(void) 5665 { 5666 int i, rc = -ENOMEM; 5667 5668 BUG_ON(!dev_boot_phase); 5669 5670 if (dev_proc_init()) 5671 goto out; 5672 5673 if (netdev_kobject_init()) 5674 goto out; 5675 5676 INIT_LIST_HEAD(&ptype_all); 5677 for (i = 0; i < PTYPE_HASH_SIZE; i++) 5678 INIT_LIST_HEAD(&ptype_base[i]); 5679 5680 if (register_pernet_subsys(&netdev_net_ops)) 5681 goto out; 5682 5683 /* 5684 * Initialise the packet receive queues. 5685 */ 5686 5687 for_each_possible_cpu(i) { 5688 struct softnet_data *queue; 5689 5690 queue = &per_cpu(softnet_data, i); 5691 skb_queue_head_init(&queue->input_pkt_queue); 5692 queue->completion_queue = NULL; 5693 INIT_LIST_HEAD(&queue->poll_list); 5694 5695 queue->backlog.poll = process_backlog; 5696 queue->backlog.weight = weight_p; 5697 queue->backlog.gro_list = NULL; 5698 queue->backlog.gro_count = 0; 5699 } 5700 5701 dev_boot_phase = 0; 5702 5703 /* The loopback device is special: if any other network device 5704 * is present in a network namespace, the loopback device must 5705 * be present too. Since we now dynamically allocate and free the 5706 * loopback device, ensure this invariant is maintained by 5707 * keeping the loopback device as the first device on the 5708 * list of network devices, so that the loopback device 5709 * is the first device that appears and the last network device 5710 * that disappears. 5711 */ 5712 if (register_pernet_device(&loopback_net_ops)) 5713 goto out; 5714 5715 if (register_pernet_device(&default_device_ops)) 5716 goto out; 5717 5718 open_softirq(NET_TX_SOFTIRQ, net_tx_action); 5719 open_softirq(NET_RX_SOFTIRQ, net_rx_action); 5720 5721 hotcpu_notifier(dev_cpu_callback, 0); 5722 dst_init(); 5723 dev_mcast_init(); 5724 rc = 0; 5725 out: 5726 return rc; 5727 } 5728 5729 subsys_initcall(net_dev_init); 5730 5731 static int __init initialize_hashrnd(void) 5732 { 5733 get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd)); 5734 return 0; 5735 } 5736 5737 late_initcall_sync(initialize_hashrnd); 5738 5739
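
/*
 * Example (illustrative only, not part of the original file): the usual
 * driver-side life cycle of the registration interfaces defined above.
 * The my_* names, struct my_priv and MY_NUM_QUEUES are hypothetical.
 *
 *	static void my_setup(struct net_device *dev)
 *	{
 *		ether_setup(dev);
 *		dev->netdev_ops = &my_netdev_ops;
 *	}
 *
 *	dev = alloc_netdev_mq(sizeof(struct my_priv), "my%d",
 *			      my_setup, MY_NUM_QUEUES);
 *	if (!dev)
 *		return -ENOMEM;
 *
 *	err = register_netdev(dev);	takes rtnl_lock, resolves the "my%d" name
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 *
 *	...device is live until teardown...
 *
 *	unregister_netdev(dev);		takes rtnl_lock; the refcount wait
 *					happens later in netdev_run_todo()
 *	free_netdev(dev);
 */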