/*
 * Copyright (c) 2004 Topspin Communications. All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/module.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/hashtable.h>
#include <rdma/rdma_netlink.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
#include <rdma/rdma_counter.h>

#include "core_priv.h"
#include "restrack.h"

MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("core kernel InfiniBand API");
MODULE_LICENSE("Dual BSD/GPL");

struct workqueue_struct *ib_comp_wq;
struct workqueue_struct *ib_comp_unbound_wq;
struct workqueue_struct *ib_wq;
EXPORT_SYMBOL_GPL(ib_wq);

/*
 * Each of the three rwsem locks (devices, clients, client_data) protects the
 * xarray of the same name. Specifically it allows the caller to assert that
 * the MARK will/will not be changing under the lock, and for devices and
 * clients, that the value in the xarray is still a valid pointer. Change of
 * the MARK is linked to the object state, so holding the lock and testing the
 * MARK also asserts that the contained object is in a certain state.
 *
 * This is used to build a two stage register/unregister flow where objects
 * can continue to be in the xarray even though they are still in progress to
 * register/unregister.
 *
 * The xarray itself provides additional locking, and restartable iteration,
 * which is also relied on.
 *
 * Locks should not be nested, with the exception of client_data, which is
 * allowed to nest under the read side of the other two locks.
 *
 * The devices_rwsem also protects the device name list, any change or
 * assignment of device name must also hold the write side to guarantee unique
 * names.
 */

/*
 * devices contains devices that have had their names assigned. The
 * devices may not be registered.
 * Users that care about the registration
 * status need to call ib_device_try_get() on the device to ensure it is
 * registered, and keep it registered, for the required duration.
 *
 */
static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(devices_rwsem);
#define DEVICE_REGISTERED XA_MARK_1

static u32 highest_client_id;
#define CLIENT_REGISTERED XA_MARK_1
static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(clients_rwsem);

static void ib_client_put(struct ib_client *client)
{
	if (refcount_dec_and_test(&client->uses))
		complete(&client->uses_zero);
}

/*
 * If client_data is registered then the corresponding client must also still
 * be registered.
 */
#define CLIENT_DATA_REGISTERED XA_MARK_1

unsigned int rdma_dev_net_id;

/*
 * A list of net namespaces is maintained in an xarray. This is necessary
 * because we can't get the locking right using the existing net ns list. We
 * would require an init_net callback after the list is updated.
 */
static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
/*
 * rwsem to protect accessing the rdma_nets xarray entries.
 */
static DECLARE_RWSEM(rdma_nets_rwsem);

bool ib_devices_shared_netns = true;
module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
MODULE_PARM_DESC(netns_mode,
		 "Share device among net namespaces; default=1 (shared)");
/**
 * rdma_dev_access_netns() - Return whether an rdma device can be accessed
 *			     from a specified net namespace or not.
 * @dev: Pointer to rdma device which needs to be checked
 * @net: Pointer to net namespace for which access to be checked
 *
 * When the rdma device is in shared mode, it ignores the net namespace.
 * When the rdma device is exclusive to a net namespace, rdma device net
 * namespace is checked against the specified one.
 */
bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
{
	return (ib_devices_shared_netns ||
		net_eq(read_pnet(&dev->coredev.rdma_net), net));
}
EXPORT_SYMBOL(rdma_dev_access_netns);

/*
 * xarray has this behavior where it won't iterate over NULL values stored in
 * allocated arrays. So we need our own iterator to see all values stored in
 * the array. This does the same thing as xa_for_each except that it also
 * returns NULL valued entries if the array is allocating. Simplified to only
 * work on simple xarrays.
 */
static void *xan_find_marked(struct xarray *xa, unsigned long *indexp,
			     xa_mark_t filter)
{
	XA_STATE(xas, xa, *indexp);
	void *entry;

	rcu_read_lock();
	do {
		entry = xas_find_marked(&xas, ULONG_MAX, filter);
		if (xa_is_zero(entry))
			break;
	} while (xas_retry(&xas, entry));
	rcu_read_unlock();

	if (entry) {
		*indexp = xas.xa_index;
		if (xa_is_zero(entry))
			return NULL;
		return entry;
	}
	return XA_ERROR(-ENOENT);
}
#define xan_for_each_marked(xa, index, entry, filter)			\
	for (index = 0, entry = xan_find_marked(xa, &(index), filter);	\
	     !xa_is_err(entry);						\
	     (index)++, entry = xan_find_marked(xa, &(index), filter))
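
/*
 * Usage sketch (illustrative only, not a real caller in this file): the
 * iterator is used like xa_for_each_marked(), except that NULL entries in an
 * allocating xarray are also visited, e.g.
 *
 *	unsigned long index;
 *	void *entry;
 *
 *	xan_for_each_marked(&device->client_data, index, entry,
 *			    CLIENT_DATA_REGISTERED) {
 *		// entry may legitimately be NULL here
 *	}
 */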

/* RCU hash table mapping netdevice pointers to struct ib_port_data */
static DEFINE_SPINLOCK(ndev_hash_lock);
static DECLARE_HASHTABLE(ndev_hash, 5);

static void free_netdevs(struct ib_device *ib_dev);
static void ib_unregister_work(struct work_struct *work);
static void __ib_unregister_device(struct ib_device *device);
static int ib_security_change(struct notifier_block *nb, unsigned long event,
			      void *lsm_data);
static void ib_policy_change_task(struct work_struct *work);
static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task);

static void __ibdev_printk(const char *level, const struct ib_device *ibdev,
			   struct va_format *vaf)
{
	if (ibdev && ibdev->dev.parent)
		dev_printk_emit(level[1] - '0',
				ibdev->dev.parent,
				"%s %s %s: %pV",
				dev_driver_string(ibdev->dev.parent),
				dev_name(ibdev->dev.parent),
				dev_name(&ibdev->dev),
				vaf);
	else if (ibdev)
		printk("%s%s: %pV",
		       level, dev_name(&ibdev->dev), vaf);
	else
		printk("%s(NULL ib_device): %pV", level, vaf);
}

void ibdev_printk(const char *level, const struct ib_device *ibdev,
		  const char *format, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, format);

	vaf.fmt = format;
	vaf.va = &args;

	__ibdev_printk(level, ibdev, &vaf);

	va_end(args);
}
EXPORT_SYMBOL(ibdev_printk);

#define define_ibdev_printk_level(func, level)			\
void func(const struct ib_device *ibdev, const char *fmt, ...)	\
{								\
	struct va_format vaf;					\
	va_list args;						\
								\
	va_start(args, fmt);					\
								\
	vaf.fmt = fmt;						\
	vaf.va = &args;						\
								\
	__ibdev_printk(level, ibdev, &vaf);			\
								\
	va_end(args);						\
}								\
EXPORT_SYMBOL(func);

define_ibdev_printk_level(ibdev_emerg, KERN_EMERG);
define_ibdev_printk_level(ibdev_alert, KERN_ALERT);
define_ibdev_printk_level(ibdev_crit, KERN_CRIT);
define_ibdev_printk_level(ibdev_err, KERN_ERR);
define_ibdev_printk_level(ibdev_warn, KERN_WARNING);
define_ibdev_printk_level(ibdev_notice, KERN_NOTICE);
define_ibdev_printk_level(ibdev_info, KERN_INFO);

static struct notifier_block ibdev_lsm_nb = {
	.notifier_call = ib_security_change,
};

static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
				 struct net *net);

/* Pointer to the RCU head at the start of the ib_port_data array */
struct ib_port_data_rcu {
	struct rcu_head rcu_head;
	struct ib_port_data pdata[];
};

static void ib_device_check_mandatory(struct ib_device *device)
{
#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x }
	static const struct {
		size_t offset;
		char  *name;
	} mandatory_table[] = {
		IB_MANDATORY_FUNC(query_device),
		IB_MANDATORY_FUNC(query_port),
		IB_MANDATORY_FUNC(query_pkey),
		IB_MANDATORY_FUNC(alloc_pd),
		IB_MANDATORY_FUNC(dealloc_pd),
		IB_MANDATORY_FUNC(create_qp),
		IB_MANDATORY_FUNC(modify_qp),
		IB_MANDATORY_FUNC(destroy_qp),
		IB_MANDATORY_FUNC(post_send),
		IB_MANDATORY_FUNC(post_recv),
		IB_MANDATORY_FUNC(create_cq),
		IB_MANDATORY_FUNC(destroy_cq),
		IB_MANDATORY_FUNC(poll_cq),
		IB_MANDATORY_FUNC(req_notify_cq),
		IB_MANDATORY_FUNC(get_dma_mr),
		IB_MANDATORY_FUNC(dereg_mr),
		IB_MANDATORY_FUNC(get_port_immutable)
	};
	int i;

	device->kverbs_provider = true;
	for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
		if (!*(void **) ((void *) &device->ops +
				 mandatory_table[i].offset)) {
			device->kverbs_provider = false;
			break;
		}
	}
}

/*
 * Caller must perform ib_device_put() to return the device reference count
 * when ib_device_get_by_index() returns a valid device pointer.
 */
struct ib_device *ib_device_get_by_index(const struct net *net, u32 index)
{
	struct ib_device *device;

	down_read(&devices_rwsem);
	device = xa_load(&devices, index);
	if (device) {
		if (!rdma_dev_access_netns(device, net)) {
			device = NULL;
			goto out;
		}

		if (!ib_device_try_get(device))
			device = NULL;
	}
out:
	up_read(&devices_rwsem);
	return device;
}

/**
 * ib_device_put - Release IB device reference
 * @device: device whose reference to be released
 *
 * ib_device_put() releases reference to the IB device to allow it to be
 * unregistered and eventually freed.
 */
void ib_device_put(struct ib_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->unreg_completion);
}
EXPORT_SYMBOL(ib_device_put);
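
/*
 * Typical usage sketch: a caller that looked a device up keeps it alive only
 * for the duration of the get/put pair, e.g.
 *
 *	struct ib_device *dev = ib_device_get_by_index(net, index);
 *
 *	if (dev) {
 *		// ... use dev while the reference is held ...
 *		ib_device_put(dev);
 *	}
 */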

static struct ib_device *__ib_device_get_by_name(const char *name)
{
	struct ib_device *device;
	unsigned long index;

	xa_for_each (&devices, index, device)
		if (!strcmp(name, dev_name(&device->dev)))
			return device;

	return NULL;
}

/**
 * ib_device_get_by_name - Find an IB device by name
 * @name: The name to look for
 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
 *
 * Find and hold an ib_device by its name. The caller must call
 * ib_device_put() on the returned pointer.
 */
struct ib_device *ib_device_get_by_name(const char *name,
					enum rdma_driver_id driver_id)
{
	struct ib_device *device;

	down_read(&devices_rwsem);
	device = __ib_device_get_by_name(name);
	if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
	    device->ops.driver_id != driver_id)
		device = NULL;

	if (device) {
		if (!ib_device_try_get(device))
			device = NULL;
	}
	up_read(&devices_rwsem);
	return device;
}
EXPORT_SYMBOL(ib_device_get_by_name);

static int rename_compat_devs(struct ib_device *device)
{
	struct ib_core_device *cdev;
	unsigned long index;
	int ret = 0;

	mutex_lock(&device->compat_devs_mutex);
	xa_for_each (&device->compat_devs, index, cdev) {
		ret = device_rename(&cdev->dev, dev_name(&device->dev));
		if (ret) {
			dev_warn(&cdev->dev,
				 "Fail to rename compatdev to new name %s\n",
				 dev_name(&device->dev));
			break;
		}
	}
	mutex_unlock(&device->compat_devs_mutex);
	return ret;
}

int ib_device_rename(struct ib_device *ibdev, const char *name)
{
	unsigned long index;
	void *client_data;
	int ret;

	down_write(&devices_rwsem);
	if (!strcmp(name, dev_name(&ibdev->dev))) {
		up_write(&devices_rwsem);
		return 0;
	}

	if (__ib_device_get_by_name(name)) {
		up_write(&devices_rwsem);
		return -EEXIST;
	}

	ret = device_rename(&ibdev->dev, name);
	if (ret) {
		up_write(&devices_rwsem);
		return ret;
	}

	strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
	ret = rename_compat_devs(ibdev);

	downgrade_write(&devices_rwsem);
	down_read(&ibdev->client_data_rwsem);
	xan_for_each_marked(&ibdev->client_data, index, client_data,
			    CLIENT_DATA_REGISTERED) {
		struct ib_client *client = xa_load(&clients, index);

		if (!client || !client->rename)
			continue;

		client->rename(ibdev, client_data);
	}
	up_read(&ibdev->client_data_rwsem);
	up_read(&devices_rwsem);
	return 0;
}

int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim)
{
	if (use_dim > 1)
		return -EINVAL;
	ibdev->use_cq_dim = use_dim;

	return 0;
}

static int alloc_name(struct ib_device *ibdev, const char *name)
{
	struct ib_device *device;
	unsigned long index;
	struct ida inuse;
	int rc;
	int i;

	lockdep_assert_held_write(&devices_rwsem);
	ida_init(&inuse);
	xa_for_each (&devices, index, device) {
		char buf[IB_DEVICE_NAME_MAX];

		if (sscanf(dev_name(&device->dev), name, &i) != 1)
			continue;
		if (i < 0 || i >= INT_MAX)
			continue;
		snprintf(buf, sizeof buf, name, i);
		if (strcmp(buf, dev_name(&device->dev)) != 0)
			continue;

		rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL);
		if (rc < 0)
			goto out;
	}

	rc = ida_alloc(&inuse, GFP_KERNEL);
	if (rc < 0)
		goto out;

	rc = dev_set_name(&ibdev->dev, name, rc);
out:
	ida_destroy(&inuse);
	return rc;
}

static void ib_device_release(struct device *device)
{
	struct ib_device *dev = container_of(device, struct ib_device, dev);

	free_netdevs(dev);
	WARN_ON(refcount_read(&dev->refcount));
	if (dev->port_data) {
		ib_cache_release_one(dev);
		ib_security_release_port_pkey_list(dev);
		rdma_counter_release(dev);
		kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
				       pdata[0]),
			  rcu_head);
	}

	mutex_destroy(&dev->unregistration_lock);
	mutex_destroy(&dev->compat_devs_mutex);

	xa_destroy(&dev->compat_devs);
	xa_destroy(&dev->client_data);
	kfree_rcu(dev, rcu_head);
}

static int ib_device_uevent(struct device *device,
			    struct kobj_uevent_env *env)
{
	if (add_uevent_var(env, "NAME=%s", dev_name(device)))
		return -ENOMEM;

	/*
	 * It would be nice to pass the node GUID with the event...
	 */

	return 0;
}

static const void *net_namespace(struct device *d)
{
	struct ib_core_device *coredev =
			container_of(d, struct ib_core_device, dev);

	return read_pnet(&coredev->rdma_net);
}

static struct class ib_class = {
	.name = "infiniband",
	.dev_release = ib_device_release,
	.dev_uevent = ib_device_uevent,
	.ns_type = &net_ns_type_operations,
	.namespace = net_namespace,
};

static void rdma_init_coredev(struct ib_core_device *coredev,
			      struct ib_device *dev, struct net *net)
{
	/* This BUILD_BUG_ON is intended to catch layout change
	 * of union of ib_core_device and device.
	 * dev must be the first element as ib_core and providers
	 * driver uses it. Adding anything in ib_core_device before
	 * device will break this assumption.
	 */
	BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) !=
		     offsetof(struct ib_device, dev));

	coredev->dev.class = &ib_class;
	coredev->dev.groups = dev->groups;
	device_initialize(&coredev->dev);
	coredev->owner = dev;
	INIT_LIST_HEAD(&coredev->port_list);
	write_pnet(&coredev->rdma_net, net);
}

/**
 * _ib_alloc_device - allocate an IB device struct
 * @size: size of structure to allocate
 *
 * Low-level drivers should use ib_alloc_device() to allocate &struct
 * ib_device. @size is the size of the structure to be allocated,
 * including any private data used by the low-level driver.
 * ib_dealloc_device() must be used to free structures allocated with
 * ib_alloc_device().
 */
struct ib_device *_ib_alloc_device(size_t size)
{
	struct ib_device *device;

	if (WARN_ON(size < sizeof(struct ib_device)))
		return NULL;

	device = kzalloc(size, GFP_KERNEL);
	if (!device)
		return NULL;

	if (rdma_restrack_init(device)) {
		kfree(device);
		return NULL;
	}

	device->groups[0] = &ib_dev_attr_group;
	rdma_init_coredev(&device->coredev, device, &init_net);

	INIT_LIST_HEAD(&device->event_handler_list);
	spin_lock_init(&device->event_handler_lock);
	mutex_init(&device->unregistration_lock);
	/*
	 * client_data needs to be an allocating xarray because we don't want
	 * our mark to be destroyed if the user stores NULL in the client data.
	 */
	xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
	init_rwsem(&device->client_data_rwsem);
	xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC);
	mutex_init(&device->compat_devs_mutex);
	init_completion(&device->unreg_completion);
	INIT_WORK(&device->unregistration_work, ib_unregister_work);

	return device;
}
EXPORT_SYMBOL(_ib_alloc_device);

/**
 * ib_dealloc_device - free an IB device struct
 * @device: structure to free
 *
 * Free a structure allocated with ib_alloc_device().
 */
void ib_dealloc_device(struct ib_device *device)
{
	if (device->ops.dealloc_driver)
		device->ops.dealloc_driver(device);

	/*
	 * ib_unregister_driver() requires all devices to remain in the xarray
	 * while their ops are callable. The last op we call is dealloc_driver
	 * above. This is needed to create a fence on op callbacks prior to
	 * allowing the driver module to unload.
	 */
	down_write(&devices_rwsem);
	if (xa_load(&devices, device->index) == device)
		xa_erase(&devices, device->index);
	up_write(&devices_rwsem);

	/* Expedite releasing netdev references */
	free_netdevs(device);

	WARN_ON(!xa_empty(&device->compat_devs));
	WARN_ON(!xa_empty(&device->client_data));
	WARN_ON(refcount_read(&device->refcount));
	rdma_restrack_clean(device);
	/* Balances with device_initialize */
	put_device(&device->dev);
}
EXPORT_SYMBOL(ib_dealloc_device);

/*
 * add_client_context() and remove_client_context() must be safe against
 * parallel calls on the same device - registration/unregistration of both the
 * device and client can be occurring in parallel.
 *
 * The routines need to be a fence, any caller must not return until the add
 * or remove is fully completed.
 */
static int add_client_context(struct ib_device *device,
			      struct ib_client *client)
{
	int ret = 0;

	if (!device->kverbs_provider && !client->no_kverbs_req)
		return 0;

	down_write(&device->client_data_rwsem);
	/*
	 * So long as the client is registered hold both the client and device
	 * unregistration locks.
	 */
	if (!refcount_inc_not_zero(&client->uses))
		goto out_unlock;
	refcount_inc(&device->refcount);

	/*
	 * Another caller to add_client_context got here first and has already
	 * completely initialized context.
	 */
	if (xa_get_mark(&device->client_data, client->client_id,
			CLIENT_DATA_REGISTERED))
		goto out;

	ret = xa_err(xa_store(&device->client_data, client->client_id, NULL,
			      GFP_KERNEL));
	if (ret)
		goto out;
	downgrade_write(&device->client_data_rwsem);
	if (client->add)
		client->add(device);

	/* Readers shall not see a client until add has been completed */
	xa_set_mark(&device->client_data, client->client_id,
		    CLIENT_DATA_REGISTERED);
	up_read(&device->client_data_rwsem);
	return 0;

out:
	ib_device_put(device);
	ib_client_put(client);
out_unlock:
	up_write(&device->client_data_rwsem);
	return ret;
}

static void remove_client_context(struct ib_device *device,
				  unsigned int client_id)
{
	struct ib_client *client;
	void *client_data;

	down_write(&device->client_data_rwsem);
	if (!xa_get_mark(&device->client_data, client_id,
			 CLIENT_DATA_REGISTERED)) {
		up_write(&device->client_data_rwsem);
		return;
	}
	client_data = xa_load(&device->client_data, client_id);
	xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
	client = xa_load(&clients, client_id);
	up_write(&device->client_data_rwsem);

	/*
	 * Notice we cannot be holding any exclusive locks when calling the
	 * remove callback as the remove callback can recurse back into any
	 * public functions in this module and thus try for any locks those
	 * functions take.
	 *
	 * For this reason clients and drivers should not call the
	 * unregistration functions while holding any locks.
	 */
	if (client->remove)
		client->remove(device, client_data);

	xa_erase(&device->client_data, client_id);
	ib_device_put(device);
	ib_client_put(client);
}

static int alloc_port_data(struct ib_device *device)
{
	struct ib_port_data_rcu *pdata_rcu;
	unsigned int port;

	if (device->port_data)
		return 0;

	/* This can only be called once the physical port range is defined */
	if (WARN_ON(!device->phys_port_cnt))
		return -EINVAL;

	/*
	 * device->port_data is indexed directly by the port number to make
	 * access to this data as efficient as possible.
	 *
	 * Therefore port_data is declared as a 1-based array with potential
	 * empty slots at the beginning.
	 */
	pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
					rdma_end_port(device) + 1),
			    GFP_KERNEL);
	if (!pdata_rcu)
		return -ENOMEM;
	/*
	 * The rcu_head is put in front of the port data array and the stored
	 * pointer is adjusted since we never need to see that member until
	 * kfree_rcu.
	 */
	device->port_data = pdata_rcu->pdata;

	rdma_for_each_port (device, port) {
		struct ib_port_data *pdata = &device->port_data[port];

		pdata->ib_dev = device;
		spin_lock_init(&pdata->pkey_list_lock);
		INIT_LIST_HEAD(&pdata->pkey_list);
		spin_lock_init(&pdata->netdev_lock);
		INIT_HLIST_NODE(&pdata->ndev_hash_link);
	}
	return 0;
}

static int verify_immutable(const struct ib_device *dev, u8 port)
{
	return WARN_ON(!rdma_cap_ib_mad(dev, port) &&
		       rdma_max_mad_size(dev, port) != 0);
}

static int setup_port_data(struct ib_device *device)
{
	unsigned int port;
	int ret;

	ret = alloc_port_data(device);
	if (ret)
		return ret;

	rdma_for_each_port (device, port) {
		struct ib_port_data *pdata = &device->port_data[port];

		ret = device->ops.get_port_immutable(device, port,
						     &pdata->immutable);
		if (ret)
			return ret;

		if (verify_immutable(device, port))
			return -EINVAL;
	}
	return 0;
}

void ib_get_device_fw_str(struct ib_device *dev, char *str)
{
	if (dev->ops.get_dev_fw_str)
		dev->ops.get_dev_fw_str(dev, str);
	else
		str[0] = '\0';
}
EXPORT_SYMBOL(ib_get_device_fw_str);

static void ib_policy_change_task(struct work_struct *work)
{
	struct ib_device *dev;
	unsigned long index;

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
		unsigned int i;

		rdma_for_each_port (dev, i) {
			u64 sp;
			int ret = ib_get_cached_subnet_prefix(dev,
							      i,
							      &sp);

			WARN_ONCE(ret,
				  "ib_get_cached_subnet_prefix err: %d, this should never happen here\n",
				  ret);
			if (!ret)
				ib_security_cache_change(dev, i, sp);
		}
	}
	up_read(&devices_rwsem);
}

static int ib_security_change(struct notifier_block *nb, unsigned long event,
			      void *lsm_data)
{
	if (event != LSM_POLICY_CHANGE)
		return NOTIFY_DONE;

	schedule_work(&ib_policy_change_work);
	ib_mad_agent_security_change();

	return NOTIFY_OK;
}

static void compatdev_release(struct device *dev)
{
	struct ib_core_device *cdev =
		container_of(dev, struct ib_core_device, dev);

	kfree(cdev);
}

static int add_one_compat_dev(struct ib_device *device,
			      struct rdma_dev_net *rnet)
{
	struct ib_core_device *cdev;
	int ret;

	lockdep_assert_held(&rdma_nets_rwsem);
	if (!ib_devices_shared_netns)
		return 0;

	/*
	 * Create and add compat device in all namespaces other than where it
	 * is currently bound to.
	 */
	if (net_eq(read_pnet(&rnet->net),
		   read_pnet(&device->coredev.rdma_net)))
		return 0;

	/*
	 * The first of init_net() or ib_register_device() to take the
	 * compat_devs_mutex wins and gets to add the device. Others will wait
	 * for completion here.
	 */
	mutex_lock(&device->compat_devs_mutex);
	cdev = xa_load(&device->compat_devs, rnet->id);
	if (cdev) {
		ret = 0;
		goto done;
	}
	ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL);
	if (ret)
		goto done;

	cdev = kzalloc(sizeof(*cdev), GFP_KERNEL);
	if (!cdev) {
		ret = -ENOMEM;
		goto cdev_err;
	}

	cdev->dev.parent = device->dev.parent;
	rdma_init_coredev(cdev, device, read_pnet(&rnet->net));
	cdev->dev.release = compatdev_release;
	dev_set_name(&cdev->dev, "%s", dev_name(&device->dev));

	ret = device_add(&cdev->dev);
	if (ret)
		goto add_err;
	ret = ib_setup_port_attrs(cdev);
	if (ret)
		goto port_err;

	ret = xa_err(xa_store(&device->compat_devs, rnet->id,
			      cdev, GFP_KERNEL));
	if (ret)
		goto insert_err;

	mutex_unlock(&device->compat_devs_mutex);
	return 0;

insert_err:
	ib_free_port_attrs(cdev);
port_err:
	device_del(&cdev->dev);
add_err:
	put_device(&cdev->dev);
cdev_err:
	xa_release(&device->compat_devs, rnet->id);
done:
	mutex_unlock(&device->compat_devs_mutex);
	return ret;
}

static void remove_one_compat_dev(struct ib_device *device, u32 id)
{
	struct ib_core_device *cdev;

	mutex_lock(&device->compat_devs_mutex);
	cdev = xa_erase(&device->compat_devs, id);
	mutex_unlock(&device->compat_devs_mutex);
	if (cdev) {
		ib_free_port_attrs(cdev);
		device_del(&cdev->dev);
		put_device(&cdev->dev);
	}
}

static void remove_compat_devs(struct ib_device *device)
{
	struct ib_core_device *cdev;
	unsigned long index;

	xa_for_each (&device->compat_devs, index, cdev)
		remove_one_compat_dev(device, index);
}

static int add_compat_devs(struct ib_device *device)
{
	struct rdma_dev_net *rnet;
	unsigned long index;
	int ret = 0;

	lockdep_assert_held(&devices_rwsem);

	down_read(&rdma_nets_rwsem);
	xa_for_each (&rdma_nets, index, rnet) {
		ret = add_one_compat_dev(device, rnet);
		if (ret)
			break;
	}
	up_read(&rdma_nets_rwsem);
	return ret;
}

static void remove_all_compat_devs(void)
{
	struct ib_compat_device *cdev;
	struct ib_device *dev;
	unsigned long index;

	down_read(&devices_rwsem);
	xa_for_each (&devices, index, dev) {
		unsigned long c_index = 0;

		/* Hold nets_rwsem so that any other thread modifying this
		 * system param can sync with this thread.
		 */
		down_read(&rdma_nets_rwsem);
		xa_for_each (&dev->compat_devs, c_index, cdev)
			remove_one_compat_dev(dev, c_index);
		up_read(&rdma_nets_rwsem);
	}
	up_read(&devices_rwsem);
}

static int add_all_compat_devs(void)
{
	struct rdma_dev_net *rnet;
	struct ib_device *dev;
	unsigned long index;
	int ret = 0;

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
		unsigned long net_index = 0;

		/* Hold nets_rwsem so that any other thread modifying this
		 * system param can sync with this thread.
		 */
		down_read(&rdma_nets_rwsem);
		xa_for_each (&rdma_nets, net_index, rnet) {
			ret = add_one_compat_dev(dev, rnet);
			if (ret)
				break;
		}
		up_read(&rdma_nets_rwsem);
	}
	up_read(&devices_rwsem);
	if (ret)
		remove_all_compat_devs();
	return ret;
}

int rdma_compatdev_set(u8 enable)
{
	struct rdma_dev_net *rnet;
	unsigned long index;
	int ret = 0;

	down_write(&rdma_nets_rwsem);
	if (ib_devices_shared_netns == enable) {
		up_write(&rdma_nets_rwsem);
		return 0;
	}

	/* enable/disable of compat devices is not supported
	 * when more than default init_net exists.
	 */
	xa_for_each (&rdma_nets, index, rnet) {
		ret++;
		break;
	}
	if (!ret)
		ib_devices_shared_netns = enable;
	up_write(&rdma_nets_rwsem);
	if (ret)
		return -EBUSY;

	if (enable)
		ret = add_all_compat_devs();
	else
		remove_all_compat_devs();
	return ret;
}

static void rdma_dev_exit_net(struct net *net)
{
	struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
	struct ib_device *dev;
	unsigned long index;
	int ret;

	down_write(&rdma_nets_rwsem);
	/*
	 * Prevent the ID from being re-used and hide the id from xa_for_each.
	 */
	ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL));
	WARN_ON(ret);
	up_write(&rdma_nets_rwsem);

	down_read(&devices_rwsem);
	xa_for_each (&devices, index, dev) {
		get_device(&dev->dev);
		/*
		 * Release the devices_rwsem so that the potentially blocking
		 * device_del doesn't hold the devices_rwsem for too long.
		 */
		up_read(&devices_rwsem);

		remove_one_compat_dev(dev, rnet->id);

		/*
		 * If the real device is in the NS then move it back to init.
		 */
		rdma_dev_change_netns(dev, net, &init_net);

		put_device(&dev->dev);
		down_read(&devices_rwsem);
	}
	up_read(&devices_rwsem);

	rdma_nl_net_exit(rnet);
	xa_erase(&rdma_nets, rnet->id);
}

static __net_init int rdma_dev_init_net(struct net *net)
{
	struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
	unsigned long index;
	struct ib_device *dev;
	int ret;

	write_pnet(&rnet->net, net);

	ret = rdma_nl_net_init(rnet);
	if (ret)
		return ret;

	/* No need to create any compat devices in default init_net. */
	if (net_eq(net, &init_net))
		return 0;

	ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL);
	if (ret) {
		rdma_nl_net_exit(rnet);
		return ret;
	}

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
		/* Hold nets_rwsem so that netlink command cannot change
		 * system configuration for device sharing mode.
		 */
		down_read(&rdma_nets_rwsem);
		ret = add_one_compat_dev(dev, rnet);
		up_read(&rdma_nets_rwsem);
		if (ret)
			break;
	}
	up_read(&devices_rwsem);

	if (ret)
		rdma_dev_exit_net(net);

	return ret;
}

/*
 * Assign the unique string device name and the unique device index. This is
 * undone by ib_dealloc_device.
 */
static int assign_name(struct ib_device *device, const char *name)
{
	static u32 last_id;
	int ret;

	down_write(&devices_rwsem);
	/* Assign a unique name to the device */
	if (strchr(name, '%'))
		ret = alloc_name(device, name);
	else
		ret = dev_set_name(&device->dev, name);
	if (ret)
		goto out;

	if (__ib_device_get_by_name(dev_name(&device->dev))) {
		ret = -ENFILE;
		goto out;
	}
	strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);

	ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b,
			      &last_id, GFP_KERNEL);
	if (ret > 0)
		ret = 0;

out:
	up_write(&devices_rwsem);
	return ret;
}

static void setup_dma_device(struct ib_device *device)
{
	struct device *parent = device->dev.parent;

	WARN_ON_ONCE(device->dma_device);
	if (device->dev.dma_ops) {
		/*
		 * The caller provided custom DMA operations. Copy the
		 * DMA-related fields that are used by e.g. dma_alloc_coherent()
		 * into device->dev.
		 */
		device->dma_device = &device->dev;
		if (!device->dev.dma_mask) {
			if (parent)
				device->dev.dma_mask = parent->dma_mask;
			else
				WARN_ON_ONCE(true);
		}
		if (!device->dev.coherent_dma_mask) {
			if (parent)
				device->dev.coherent_dma_mask =
					parent->coherent_dma_mask;
			else
				WARN_ON_ONCE(true);
		}
	} else {
		/*
		 * The caller did not provide custom DMA operations. Use the
		 * DMA mapping operations of the parent device.
		 */
		WARN_ON_ONCE(!parent);
		device->dma_device = parent;
	}

	if (!device->dev.dma_parms) {
		if (parent) {
			/*
			 * The caller did not provide DMA parameters, so
			 * 'parent' probably represents a PCI device. The PCI
			 * core sets the maximum segment size to 64 KB.
			 * Increase this parameter to 2 GB.
			 */
			device->dev.dma_parms = parent->dma_parms;
			dma_set_max_seg_size(device->dma_device, SZ_2G);
		} else {
			WARN_ON_ONCE(true);
		}
	}
}

/*
 * setup_device() allocates memory and sets up data that requires calling the
 * device ops, this is the only reason these actions are not done during
 * ib_alloc_device. It is undone by ib_dealloc_device().
 */
static int setup_device(struct ib_device *device)
{
	struct ib_udata uhw = {.outlen = 0, .inlen = 0};
	int ret;

	setup_dma_device(device);
	ib_device_check_mandatory(device);

	ret = setup_port_data(device);
	if (ret) {
		dev_warn(&device->dev, "Couldn't create per-port data\n");
		return ret;
	}

	memset(&device->attrs, 0, sizeof(device->attrs));
	ret = device->ops.query_device(device, &device->attrs, &uhw);
	if (ret) {
		dev_warn(&device->dev,
			 "Couldn't query the device attributes\n");
		return ret;
	}

	return 0;
}

static void disable_device(struct ib_device *device)
{
	u32 cid;

	WARN_ON(!refcount_read(&device->refcount));

	down_write(&devices_rwsem);
	xa_clear_mark(&devices, device->index, DEVICE_REGISTERED);
	up_write(&devices_rwsem);

	/*
	 * Remove clients in LIFO order, see assign_client_id. This could be
	 * more efficient if xarray learns to reverse iterate. Since no new
	 * clients can be added to this ib_device past this point we only need
	 * the maximum possible client_id value here.
	 */
	down_read(&clients_rwsem);
	cid = highest_client_id;
	up_read(&clients_rwsem);
	while (cid) {
		cid--;
		remove_client_context(device, cid);
	}

	/* Pairs with refcount_set in enable_device */
	ib_device_put(device);
	wait_for_completion(&device->unreg_completion);

	/*
	 * compat devices must be removed after device refcount drops to zero.
	 * Otherwise init_net() may add more compatdevs after removing compat
	 * devices and before device is disabled.
	 */
	remove_compat_devs(device);
}

/*
 * An enabled device is visible to all clients and to all the public facing
 * APIs that return a device pointer. This always returns with a new get, even
 * if it fails.
 */
static int enable_device_and_get(struct ib_device *device)
{
	struct ib_client *client;
	unsigned long index;
	int ret = 0;

	/*
	 * One ref belongs to the xa and the other belongs to this
	 * thread. This is needed to guard against parallel unregistration.
	 */
	refcount_set(&device->refcount, 2);
	down_write(&devices_rwsem);
	xa_set_mark(&devices, device->index, DEVICE_REGISTERED);

	/*
	 * By using downgrade_write() we ensure that no other thread can clear
	 * DEVICE_REGISTERED while we are completing the client setup.
	 */
	downgrade_write(&devices_rwsem);

	if (device->ops.enable_driver) {
		ret = device->ops.enable_driver(device);
		if (ret)
			goto out;
	}

	down_read(&clients_rwsem);
	xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
		ret = add_client_context(device, client);
		if (ret)
			break;
	}
	up_read(&clients_rwsem);
	if (!ret)
		ret = add_compat_devs(device);
out:
	up_read(&devices_rwsem);
	return ret;
}

/**
 * ib_register_device - Register an IB device with IB core
 * @device: Device to register
 * @name: unique string device name. This may include a '%' which will
 * cause a unique index to be added to the passed device name.
 *
 * Low-level drivers use ib_register_device() to register their
 * devices with the IB core. All registered clients will receive a
 * callback for each device that is added. @device must be allocated
 * with ib_alloc_device().
 *
 * If the driver uses ops.dealloc_driver and calls any ib_unregister_device()
 * asynchronously then the device pointer may become freed as soon as this
 * function returns.
 */
int ib_register_device(struct ib_device *device, const char *name)
{
	int ret;

	ret = assign_name(device, name);
	if (ret)
		return ret;

	ret = setup_device(device);
	if (ret)
		return ret;

	ret = ib_cache_setup_one(device);
	if (ret) {
		dev_warn(&device->dev,
			 "Couldn't set up InfiniBand P_Key/GID cache\n");
		return ret;
	}

	ib_device_register_rdmacg(device);

	rdma_counter_init(device);

	/*
	 * Ensure that ADD uevent is not fired because it
	 * is too early and the device is not initialized yet.
	 */
	dev_set_uevent_suppress(&device->dev, true);
	ret = device_add(&device->dev);
	if (ret)
		goto cg_cleanup;

	ret = ib_device_register_sysfs(device);
	if (ret) {
		dev_warn(&device->dev,
			 "Couldn't register device with driver model\n");
		goto dev_cleanup;
	}

	ret = enable_device_and_get(device);
	dev_set_uevent_suppress(&device->dev, false);
	/* Mark for userspace that device is ready */
	kobject_uevent(&device->dev.kobj, KOBJ_ADD);
	if (ret) {
		void (*dealloc_fn)(struct ib_device *);

		/*
		 * If we hit this error flow then we don't want to
		 * automatically dealloc the device since the caller is
		 * expected to call ib_dealloc_device() after
		 * ib_register_device() fails. This is tricky due to the
		 * possibility for a parallel unregistration along with this
		 * error flow. Since we have a refcount here we know any
		 * parallel flow is stopped in disable_device and will see the
		 * NULL pointers, causing the responsibility to
		 * ib_dealloc_device() to revert back to this thread.
		 */
		dealloc_fn = device->ops.dealloc_driver;
		device->ops.dealloc_driver = NULL;
		ib_device_put(device);
		__ib_unregister_device(device);
		device->ops.dealloc_driver = dealloc_fn;
		return ret;
	}
	ib_device_put(device);

	return 0;

dev_cleanup:
	device_del(&device->dev);
cg_cleanup:
	dev_set_uevent_suppress(&device->dev, false);
	ib_device_unregister_rdmacg(device);
	ib_cache_cleanup_one(device);
	return ret;
}
EXPORT_SYMBOL(ib_register_device);
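
/*
 * Registration flow sketch for a hypothetical driver (names such as my_dev,
 * my_dev_ops and "my%d" are illustrative only):
 *
 *	struct my_dev *mdev = ib_alloc_device(my_dev, ibdev);
 *
 *	ib_set_device_ops(&mdev->ibdev, &my_dev_ops);
 *	ret = ib_register_device(&mdev->ibdev, "my%d");
 *	if (ret)
 *		ib_dealloc_device(&mdev->ibdev);
 *
 * As the kernel-doc above notes, the caller keeps responsibility for calling
 * ib_dealloc_device() when ib_register_device() fails.
 */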

/* Callers must hold a get on the device. */
static void __ib_unregister_device(struct ib_device *ib_dev)
{
	/*
	 * We have a registration lock so that all the calls to unregister are
	 * fully fenced, once any unregister returns the device is truly
	 * unregistered even if multiple callers are unregistering it at the
	 * same time. This also interacts with the registration flow and
	 * provides sane semantics if register and unregister are racing.
	 */
	mutex_lock(&ib_dev->unregistration_lock);
	if (!refcount_read(&ib_dev->refcount))
		goto out;

	disable_device(ib_dev);

	/* Expedite removing unregistered pointers from the hash table */
	free_netdevs(ib_dev);

	ib_device_unregister_sysfs(ib_dev);
	device_del(&ib_dev->dev);
	ib_device_unregister_rdmacg(ib_dev);
	ib_cache_cleanup_one(ib_dev);

	/*
	 * Drivers using the new flow may not call ib_dealloc_device except
	 * in error unwind prior to registration success.
	 */
	if (ib_dev->ops.dealloc_driver) {
		WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
		ib_dealloc_device(ib_dev);
	}
out:
	mutex_unlock(&ib_dev->unregistration_lock);
}

/**
 * ib_unregister_device - Unregister an IB device
 * @ib_dev: The device to unregister
 *
 * Unregister an IB device. All clients will receive a remove callback.
 *
 * Callers should call this routine only once, and protect against races with
 * registration. Typically it should only be called as part of a remove
 * callback in an implementation of driver core's struct device_driver and
 * related.
 *
 * If ops.dealloc_driver is used then ib_dev will be freed upon return from
 * this function.
 */
void ib_unregister_device(struct ib_device *ib_dev)
{
	get_device(&ib_dev->dev);
	__ib_unregister_device(ib_dev);
	put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device);

/**
 * ib_unregister_device_and_put - Unregister a device while holding a 'get'
 * @ib_dev: The device to unregister
 *
 * This is the same as ib_unregister_device(), except it includes an internal
 * ib_device_put() that should match a 'get' obtained by the caller.
 *
 * It is safe to call this routine concurrently from multiple threads while
 * holding the 'get'. When the function returns the device is fully
 * unregistered.
 *
 * Drivers using this flow MUST use the driver_unregister callback to clean up
 * their resources associated with the device and dealloc it.
 */
void ib_unregister_device_and_put(struct ib_device *ib_dev)
{
	WARN_ON(!ib_dev->ops.dealloc_driver);
	get_device(&ib_dev->dev);
	ib_device_put(ib_dev);
	__ib_unregister_device(ib_dev);
	put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device_and_put);

/**
 * ib_unregister_driver - Unregister all IB devices for a driver
 * @driver_id: The driver to unregister
 *
 * This implements a fence for device unregistration. It only returns once all
 * devices associated with the driver_id have fully completed their
 * unregistration and returned from ib_unregister_device*().
 *
 * If devices are not yet unregistered it goes ahead and starts unregistering
 * them.
 *
 * This does not block creation of new devices with the given driver_id, that
 * is the responsibility of the caller.
 */
void ib_unregister_driver(enum rdma_driver_id driver_id)
{
	struct ib_device *ib_dev;
	unsigned long index;

	down_read(&devices_rwsem);
	xa_for_each (&devices, index, ib_dev) {
		if (ib_dev->ops.driver_id != driver_id)
			continue;

		get_device(&ib_dev->dev);
		up_read(&devices_rwsem);

		WARN_ON(!ib_dev->ops.dealloc_driver);
		__ib_unregister_device(ib_dev);

		put_device(&ib_dev->dev);
		down_read(&devices_rwsem);
	}
	up_read(&devices_rwsem);
}
EXPORT_SYMBOL(ib_unregister_driver);

static void ib_unregister_work(struct work_struct *work)
{
	struct ib_device *ib_dev =
		container_of(work, struct ib_device, unregistration_work);

	__ib_unregister_device(ib_dev);
	put_device(&ib_dev->dev);
}

/**
 * ib_unregister_device_queued - Unregister a device using a work queue
 * @ib_dev: The device to unregister
 *
 * This schedules an asynchronous unregistration using a WQ for the device. A
 * driver should use this to avoid holding locks while doing unregistration,
 * such as holding the RTNL lock.
 *
 * Drivers using this API must use ib_unregister_driver before module unload
 * to ensure that all scheduled unregistrations have completed.
 */
void ib_unregister_device_queued(struct ib_device *ib_dev)
{
	WARN_ON(!refcount_read(&ib_dev->refcount));
	WARN_ON(!ib_dev->ops.dealloc_driver);
	get_device(&ib_dev->dev);
	if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work))
		put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device_queued);

/*
 * The caller must pass in a device that has the kref held and the refcount
 * released.
 * If the device is in cur_net and still registered then it is moved
 * into net.
 */
static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
				 struct net *net)
{
	int ret2 = -EINVAL;
	int ret;

	mutex_lock(&device->unregistration_lock);

	/*
	 * If a device is not under ib_device_get() or the unregistration_lock
	 * is not held, the namespace can be changed, or it can be
	 * unregistered. Check again under the lock.
	 */
	if (refcount_read(&device->refcount) == 0 ||
	    !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) {
		ret = -ENODEV;
		goto out;
	}

	kobject_uevent(&device->dev.kobj, KOBJ_REMOVE);
	disable_device(device);

	/*
	 * At this point no one can be using the device, so it is safe to
	 * change the namespace.
	 */
	write_pnet(&device->coredev.rdma_net, net);

	down_read(&devices_rwsem);
	/*
	 * Currently rdma devices are system wide unique. So the device name
	 * is guaranteed free in the new namespace. Publish the new namespace
	 * at the sysfs level.
	 */
	ret = device_rename(&device->dev, dev_name(&device->dev));
	up_read(&devices_rwsem);
	if (ret) {
		dev_warn(&device->dev,
			 "%s: Couldn't rename device after namespace change\n",
			 __func__);
		/* Try and put things back and re-enable the device */
		write_pnet(&device->coredev.rdma_net, cur_net);
	}

	ret2 = enable_device_and_get(device);
	if (ret2) {
		/*
		 * This shouldn't really happen, but if it does, let the user
		 * retry at a later point. So don't disable the device.
		 */
		dev_warn(&device->dev,
			 "%s: Couldn't re-enable device after namespace change\n",
			 __func__);
	}
	kobject_uevent(&device->dev.kobj, KOBJ_ADD);

	ib_device_put(device);
out:
	mutex_unlock(&device->unregistration_lock);
	if (ret)
		return ret;
	return ret2;
}

int ib_device_set_netns_put(struct sk_buff *skb,
			    struct ib_device *dev, u32 ns_fd)
{
	struct net *net;
	int ret;

	net = get_net_ns_by_fd(ns_fd);
	if (IS_ERR(net)) {
		ret = PTR_ERR(net);
		goto net_err;
	}

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
		ret = -EPERM;
		goto ns_err;
	}

	/*
	 * Currently supported only for those providers which support
	 * disassociation and don't do port specific sysfs init. Once a
	 * port_cleanup infrastructure is implemented, this limitation will be
	 * removed.
	 */
	if (!dev->ops.disassociate_ucontext || dev->ops.init_port ||
	    ib_devices_shared_netns) {
		ret = -EOPNOTSUPP;
		goto ns_err;
	}

	get_device(&dev->dev);
	ib_device_put(dev);
	ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net);
	put_device(&dev->dev);

	put_net(net);
	return ret;

ns_err:
	put_net(net);
net_err:
	ib_device_put(dev);
	return ret;
}

static struct pernet_operations rdma_dev_net_ops = {
	.init = rdma_dev_init_net,
	.exit = rdma_dev_exit_net,
	.id = &rdma_dev_net_id,
	.size = sizeof(struct rdma_dev_net),
};

static int assign_client_id(struct ib_client *client)
{
	int ret;

	down_write(&clients_rwsem);
	/*
	 * The add/remove callbacks must be called in FIFO/LIFO order. To
	 * achieve this we assign client_ids so they are sorted in
	 * registration order.
	 */
	client->client_id = highest_client_id;
	ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL);
	if (ret)
		goto out;

	highest_client_id++;
	xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);

out:
	up_write(&clients_rwsem);
	return ret;
}

static void remove_client_id(struct ib_client *client)
{
	down_write(&clients_rwsem);
	xa_erase(&clients, client->client_id);
	for (; highest_client_id; highest_client_id--)
		if (xa_load(&clients, highest_client_id - 1))
			break;
	up_write(&clients_rwsem);
}

/**
 * ib_register_client - Register an IB client
 * @client:Client to register
 *
 * Upper level users of the IB drivers can use ib_register_client() to
 * register callbacks for IB device addition and removal. When an IB
 * device is added, each registered client's add method will be called
 * (in the order the clients were registered), and when a device is
 * removed, each client's remove method will be called (in the reverse
 * order that clients were registered). In addition, when
 * ib_register_client() is called, the client will receive an add
 * callback for all devices already registered.
 */
int ib_register_client(struct ib_client *client)
{
	struct ib_device *device;
	unsigned long index;
	int ret;

	refcount_set(&client->uses, 1);
	init_completion(&client->uses_zero);
	ret = assign_client_id(client);
	if (ret)
		return ret;

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
		ret = add_client_context(device, client);
		if (ret) {
			up_read(&devices_rwsem);
			ib_unregister_client(client);
			return ret;
		}
	}
	up_read(&devices_rwsem);
	return 0;
}
EXPORT_SYMBOL(ib_register_client);
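
/*
 * Minimal client sketch (hypothetical, for illustration only):
 *
 *	static void my_add_one(struct ib_device *device)
 *	{
 *		// allocate per-device state, then stash it:
 *		ib_set_client_data(device, &my_client, my_data);
 *	}
 *
 *	static void my_remove_one(struct ib_device *device, void *client_data)
 *	{
 *		// tear down whatever my_add_one() created
 *	}
 *
 *	static struct ib_client my_client = {
 *		.name   = "my_client",
 *		.add    = my_add_one,
 *		.remove = my_remove_one,
 *	};
 *
 *	ib_register_client(&my_client);
 */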

/**
 * ib_unregister_client - Unregister an IB client
 * @client:Client to unregister
 *
 * Upper level users use ib_unregister_client() to remove their client
 * registration. When ib_unregister_client() is called, the client
 * will receive a remove callback for each IB device still registered.
 *
 * This is a full fence, once it returns no client callbacks will be called,
 * or are running in another thread.
 */
void ib_unregister_client(struct ib_client *client)
{
	struct ib_device *device;
	unsigned long index;

	down_write(&clients_rwsem);
	ib_client_put(client);
	xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED);
	up_write(&clients_rwsem);

	/* We do not want to have locks while calling client->remove() */
	rcu_read_lock();
	xa_for_each (&devices, index, device) {
		if (!ib_device_try_get(device))
			continue;
		rcu_read_unlock();

		remove_client_context(device, client->client_id);

		ib_device_put(device);
		rcu_read_lock();
	}
	rcu_read_unlock();

	/*
	 * remove_client_context() is not a fence, it can return even though a
	 * removal is ongoing. Wait until all removals are completed.
	 */
	wait_for_completion(&client->uses_zero);
	remove_client_id(client);
}
EXPORT_SYMBOL(ib_unregister_client);

static int __ib_get_global_client_nl_info(const char *client_name,
					  struct ib_client_nl_info *res)
{
	struct ib_client *client;
	unsigned long index;
	int ret = -ENOENT;

	down_read(&clients_rwsem);
	xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
		if (strcmp(client->name, client_name) != 0)
			continue;
		if (!client->get_global_nl_info) {
			ret = -EOPNOTSUPP;
			break;
		}
		ret = client->get_global_nl_info(res);
		if (WARN_ON(ret == -ENOENT))
			ret = -EINVAL;
		if (!ret && res->cdev)
			get_device(res->cdev);
		break;
	}
	up_read(&clients_rwsem);
	return ret;
}

static int __ib_get_client_nl_info(struct ib_device *ibdev,
				   const char *client_name,
				   struct ib_client_nl_info *res)
{
	unsigned long index;
	void *client_data;
	int ret = -ENOENT;

	down_read(&ibdev->client_data_rwsem);
	xan_for_each_marked (&ibdev->client_data, index, client_data,
			     CLIENT_DATA_REGISTERED) {
		struct ib_client *client = xa_load(&clients, index);

		if (!client || strcmp(client->name, client_name) != 0)
			continue;
		if (!client->get_nl_info) {
			ret = -EOPNOTSUPP;
			break;
		}
		ret = client->get_nl_info(ibdev, client_data, res);
		if (WARN_ON(ret == -ENOENT))
			ret = -EINVAL;

		/*
		 * The cdev is guaranteed valid as long as we are inside the
		 * client_data_rwsem as remove_one can't be called. Keep it
		 * valid for the caller.
		 */
		if (!ret && res->cdev)
			get_device(res->cdev);
		break;
	}
	up_read(&ibdev->client_data_rwsem);

	return ret;
}

/**
 * ib_get_client_nl_info - Fetch the nl_info from a client
 * @ibdev: IB device
 * @client_name: Name of the client
 * @res: Result of the query
 */
int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name,
			  struct ib_client_nl_info *res)
{
	int ret;

	if (ibdev)
		ret = __ib_get_client_nl_info(ibdev, client_name, res);
	else
		ret = __ib_get_global_client_nl_info(client_name, res);
#ifdef CONFIG_MODULES
	if (ret == -ENOENT) {
		request_module("rdma-client-%s", client_name);
		if (ibdev)
			ret = __ib_get_client_nl_info(ibdev, client_name, res);
		else
			ret = __ib_get_global_client_nl_info(client_name, res);
	}
#endif
	if (ret) {
		if (ret == -ENOENT)
			return -EOPNOTSUPP;
		return ret;
	}

	if (WARN_ON(!res->cdev))
		return -EINVAL;
	return 0;
}

/**
 * ib_set_client_data - Set IB client context
 * @device:Device to set context for
 * @client:Client to set context for
 * @data:Context to set
 *
 * ib_set_client_data() sets client context data that can be retrieved with
 * ib_get_client_data(). This can only be called while the client is
 * registered to the device, once the ib_client remove() callback returns this
 * cannot be called.
 */
void ib_set_client_data(struct ib_device *device, struct ib_client *client,
			void *data)
{
	void *rc;

	if (WARN_ON(IS_ERR(data)))
		data = NULL;

	rc = xa_store(&device->client_data, client->client_id, data,
		      GFP_KERNEL);
	WARN_ON(xa_is_err(rc));
}
EXPORT_SYMBOL(ib_set_client_data);

/**
 * ib_register_event_handler - Register an IB event handler
 * @event_handler:Handler to register
 *
 * ib_register_event_handler() registers an event handler that will be
 * called back when asynchronous IB events occur (as defined in
 * chapter 11 of the InfiniBand Architecture Specification). This
 * callback may occur in interrupt context.
 */
void ib_register_event_handler(struct ib_event_handler *event_handler)
{
	unsigned long flags;

	spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
	list_add_tail(&event_handler->list,
		      &event_handler->device->event_handler_list);
	spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
}
EXPORT_SYMBOL(ib_register_event_handler);

/**
 * ib_unregister_event_handler - Unregister an event handler
 * @event_handler:Handler to unregister
 *
 * Unregister an event handler registered with
 * ib_register_event_handler().
 */
void ib_unregister_event_handler(struct ib_event_handler *event_handler)
{
	unsigned long flags;

	spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
	list_del(&event_handler->list);
	spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
}
EXPORT_SYMBOL(ib_unregister_event_handler);
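
/*
 * Usage sketch (assuming the INIT_IB_EVENT_HANDLER() helper from ib_verbs.h;
 * my_event_callback is hypothetical):
 *
 *	struct ib_event_handler handler;
 *
 *	INIT_IB_EVENT_HANDLER(&handler, device, my_event_callback);
 *	ib_register_event_handler(&handler);
 *	...
 *	ib_unregister_event_handler(&handler);
 *
 * The callback may run in interrupt context, so it must not sleep.
 */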
1972 */ 1973 void ib_dispatch_event(struct ib_event *event) 1974 { 1975 unsigned long flags; 1976 struct ib_event_handler *handler; 1977 1978 spin_lock_irqsave(&event->device->event_handler_lock, flags); 1979 1980 list_for_each_entry(handler, &event->device->event_handler_list, list) 1981 handler->handler(handler, event); 1982 1983 spin_unlock_irqrestore(&event->device->event_handler_lock, flags); 1984 } 1985 EXPORT_SYMBOL(ib_dispatch_event); 1986 1987 static int iw_query_port(struct ib_device *device, 1988 u8 port_num, 1989 struct ib_port_attr *port_attr) 1990 { 1991 struct in_device *inetdev; 1992 struct net_device *netdev; 1993 int err; 1994 1995 memset(port_attr, 0, sizeof(*port_attr)); 1996 1997 netdev = ib_device_get_netdev(device, port_num); 1998 if (!netdev) 1999 return -ENODEV; 2000 2001 port_attr->max_mtu = IB_MTU_4096; 2002 port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu); 2003 2004 if (!netif_carrier_ok(netdev)) { 2005 port_attr->state = IB_PORT_DOWN; 2006 port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; 2007 } else { 2008 rcu_read_lock(); 2009 inetdev = __in_dev_get_rcu(netdev); 2010 2011 if (inetdev && inetdev->ifa_list) { 2012 port_attr->state = IB_PORT_ACTIVE; 2013 port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; 2014 } else { 2015 port_attr->state = IB_PORT_INIT; 2016 port_attr->phys_state = 2017 IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING; 2018 } 2019 2020 rcu_read_unlock(); 2021 } 2022 2023 dev_put(netdev); 2024 err = device->ops.query_port(device, port_num, port_attr); 2025 if (err) 2026 return err; 2027 2028 return 0; 2029 } 2030 2031 static int __ib_query_port(struct ib_device *device, 2032 u8 port_num, 2033 struct ib_port_attr *port_attr) 2034 { 2035 union ib_gid gid = {}; 2036 int err; 2037 2038 memset(port_attr, 0, sizeof(*port_attr)); 2039 2040 err = device->ops.query_port(device, port_num, port_attr); 2041 if (err || port_attr->subnet_prefix) 2042 return err; 2043 2044 if (rdma_port_get_link_layer(device, port_num) != 2045 IB_LINK_LAYER_INFINIBAND) 2046 return 0; 2047 2048 err = device->ops.query_gid(device, port_num, 0, &gid); 2049 if (err) 2050 return err; 2051 2052 port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix); 2053 return 0; 2054 } 2055 2056 /** 2057 * ib_query_port - Query IB port attributes 2058 * @device:Device to query 2059 * @port_num:Port number to query 2060 * @port_attr:Port attributes 2061 * 2062 * ib_query_port() returns the attributes of a port through the 2063 * @port_attr pointer. 
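 *
 * For example (an illustrative sketch; the surrounding caller and its error
 * handling are hypothetical):
 *
 *	struct ib_port_attr attr;
 *	int ret;
 *
 *	ret = ib_query_port(ibdev, port_num, &attr);
 *	if (ret)
 *		return ret;
 *	if (attr.state != IB_PORT_ACTIVE)
 *		return -ENETDOWN;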
2064 */ 2065 int ib_query_port(struct ib_device *device, 2066 u8 port_num, 2067 struct ib_port_attr *port_attr) 2068 { 2069 if (!rdma_is_port_valid(device, port_num)) 2070 return -EINVAL; 2071 2072 if (rdma_protocol_iwarp(device, port_num)) 2073 return iw_query_port(device, port_num, port_attr); 2074 else 2075 return __ib_query_port(device, port_num, port_attr); 2076 } 2077 EXPORT_SYMBOL(ib_query_port); 2078 2079 static void add_ndev_hash(struct ib_port_data *pdata) 2080 { 2081 unsigned long flags; 2082 2083 might_sleep(); 2084 2085 spin_lock_irqsave(&ndev_hash_lock, flags); 2086 if (hash_hashed(&pdata->ndev_hash_link)) { 2087 hash_del_rcu(&pdata->ndev_hash_link); 2088 spin_unlock_irqrestore(&ndev_hash_lock, flags); 2089 /* 2090 * We cannot do hash_add_rcu after a hash_del_rcu until the 2091 * grace period 2092 */ 2093 synchronize_rcu(); 2094 spin_lock_irqsave(&ndev_hash_lock, flags); 2095 } 2096 if (pdata->netdev) 2097 hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, 2098 (uintptr_t)pdata->netdev); 2099 spin_unlock_irqrestore(&ndev_hash_lock, flags); 2100 } 2101 2102 /** 2103 * ib_device_set_netdev - Associate the ib_dev with an underlying net_device 2104 * @ib_dev: Device to modify 2105 * @ndev: net_device to affiliate, may be NULL 2106 * @port: IB port the net_device is connected to 2107 * 2108 * Drivers should use this to link the ib_device to a netdev so the netdev 2109 * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be 2110 * affiliated with any port. 2111 * 2112 * The caller must ensure that the given ndev is not unregistered or 2113 * unregistering, and that either the ib_device is unregistered or 2114 * ib_device_set_netdev() is called with NULL when the ndev sends a 2115 * NETDEV_UNREGISTER event. 2116 */ 2117 int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, 2118 unsigned int port) 2119 { 2120 struct net_device *old_ndev; 2121 struct ib_port_data *pdata; 2122 unsigned long flags; 2123 int ret; 2124 2125 /* 2126 * Drivers wish to call this before ib_register_driver, so we have to 2127 * setup the port data early. 
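 *
 * As a usage sketch of the pairing described in the kernel-doc above
 * (hypothetical driver code, not part of this file; port 1 is arbitrary):
 *
 *	ib_device_set_netdev(ib_dev, ndev, 1);	while setting up the device
 *	ib_device_set_netdev(ib_dev, NULL, 1);	on NETDEV_UNREGISTER, unless the
 *						ib_device is unregistered first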
2128 */ 2129 ret = alloc_port_data(ib_dev); 2130 if (ret) 2131 return ret; 2132 2133 if (!rdma_is_port_valid(ib_dev, port)) 2134 return -EINVAL; 2135 2136 pdata = &ib_dev->port_data[port]; 2137 spin_lock_irqsave(&pdata->netdev_lock, flags); 2138 old_ndev = rcu_dereference_protected( 2139 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2140 if (old_ndev == ndev) { 2141 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2142 return 0; 2143 } 2144 2145 if (ndev) 2146 dev_hold(ndev); 2147 rcu_assign_pointer(pdata->netdev, ndev); 2148 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2149 2150 add_ndev_hash(pdata); 2151 if (old_ndev) 2152 dev_put(old_ndev); 2153 2154 return 0; 2155 } 2156 EXPORT_SYMBOL(ib_device_set_netdev); 2157 2158 static void free_netdevs(struct ib_device *ib_dev) 2159 { 2160 unsigned long flags; 2161 unsigned int port; 2162 2163 if (!ib_dev->port_data) 2164 return; 2165 2166 rdma_for_each_port (ib_dev, port) { 2167 struct ib_port_data *pdata = &ib_dev->port_data[port]; 2168 struct net_device *ndev; 2169 2170 spin_lock_irqsave(&pdata->netdev_lock, flags); 2171 ndev = rcu_dereference_protected( 2172 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2173 if (ndev) { 2174 spin_lock(&ndev_hash_lock); 2175 hash_del_rcu(&pdata->ndev_hash_link); 2176 spin_unlock(&ndev_hash_lock); 2177 2178 /* 2179 * If this is the last dev_put there is still a 2180 * synchronize_rcu before the netdev is kfreed, so we 2181 * can continue to rely on unlocked pointer 2182 * comparisons after the put 2183 */ 2184 rcu_assign_pointer(pdata->netdev, NULL); 2185 dev_put(ndev); 2186 } 2187 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2188 } 2189 } 2190 2191 struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, 2192 unsigned int port) 2193 { 2194 struct ib_port_data *pdata; 2195 struct net_device *res; 2196 2197 if (!rdma_is_port_valid(ib_dev, port)) 2198 return NULL; 2199 2200 pdata = &ib_dev->port_data[port]; 2201 2202 /* 2203 * New drivers should use ib_device_set_netdev() not the legacy 2204 * get_netdev(). 2205 */ 2206 if (ib_dev->ops.get_netdev) 2207 res = ib_dev->ops.get_netdev(ib_dev, port); 2208 else { 2209 spin_lock(&pdata->netdev_lock); 2210 res = rcu_dereference_protected( 2211 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2212 if (res) 2213 dev_hold(res); 2214 spin_unlock(&pdata->netdev_lock); 2215 } 2216 2217 /* 2218 * If we are starting to unregister expedite things by preventing 2219 * propagation of an unregistering netdev. 2220 */ 2221 if (res && res->reg_state != NETREG_REGISTERED) { 2222 dev_put(res); 2223 return NULL; 2224 } 2225 2226 return res; 2227 } 2228 2229 /** 2230 * ib_device_get_by_netdev - Find an IB device associated with a netdev 2231 * @ndev: netdev to locate 2232 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) 2233 * 2234 * Find and hold an ib_device that is associated with a netdev via 2235 * ib_device_set_netdev(). The caller must call ib_device_put() on the 2236 * returned pointer. 
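 *
 * Usage sketch (illustrative; the caller context and error handling are
 * hypothetical):
 *
 *	struct ib_device *ibdev;
 *
 *	ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
 *	if (!ibdev)
 *		return -ENODEV;
 *
 * and once done with the device:
 *
 *	ib_device_put(ibdev);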
2237 */ 2238 struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, 2239 enum rdma_driver_id driver_id) 2240 { 2241 struct ib_device *res = NULL; 2242 struct ib_port_data *cur; 2243 2244 rcu_read_lock(); 2245 hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, 2246 (uintptr_t)ndev) { 2247 if (rcu_access_pointer(cur->netdev) == ndev && 2248 (driver_id == RDMA_DRIVER_UNKNOWN || 2249 cur->ib_dev->ops.driver_id == driver_id) && 2250 ib_device_try_get(cur->ib_dev)) { 2251 res = cur->ib_dev; 2252 break; 2253 } 2254 } 2255 rcu_read_unlock(); 2256 2257 return res; 2258 } 2259 EXPORT_SYMBOL(ib_device_get_by_netdev); 2260 2261 /** 2262 * ib_enum_roce_netdev - enumerate all RoCE ports 2263 * @ib_dev: IB device we want to query 2264 * @filter: Should we call the callback? 2265 * @filter_cookie: Cookie passed to filter 2266 * @cb: Callback to call for each found RoCE port 2267 * @cookie: Cookie passed back to the callback 2268 * 2269 * Enumerates all of the physical RoCE ports of ib_dev 2270 * which are associated with a netdevice and calls cb() on each 2271 * port for which filter() returns a non-zero value. 2272 */ 2273 void ib_enum_roce_netdev(struct ib_device *ib_dev, 2274 roce_netdev_filter filter, 2275 void *filter_cookie, 2276 roce_netdev_callback cb, 2277 void *cookie) 2278 { 2279 unsigned int port; 2280 2281 rdma_for_each_port (ib_dev, port) 2282 if (rdma_protocol_roce(ib_dev, port)) { 2283 struct net_device *idev = 2284 ib_device_get_netdev(ib_dev, port); 2285 2286 if (filter(ib_dev, port, idev, filter_cookie)) 2287 cb(ib_dev, port, idev, cookie); 2288 2289 if (idev) 2290 dev_put(idev); 2291 } 2292 } 2293 2294 /** 2295 * ib_enum_all_roce_netdevs - enumerate all RoCE devices 2296 * @filter: Should we call the callback? 2297 * @filter_cookie: Cookie passed to filter 2298 * @cb: Callback to call for each found RoCE port 2299 * @cookie: Cookie passed back to the callback 2300 * 2301 * Enumerates the physical ports of all RoCE devices which are associated 2302 * with a netdevice and calls cb() on each port for which 2303 * filter() returns a non-zero value. 2304 */ 2305 void ib_enum_all_roce_netdevs(roce_netdev_filter filter, 2306 void *filter_cookie, 2307 roce_netdev_callback cb, 2308 void *cookie) 2309 { 2310 struct ib_device *dev; 2311 unsigned long index; 2312 2313 down_read(&devices_rwsem); 2314 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) 2315 ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); 2316 up_read(&devices_rwsem); 2317 } 2318 2319 /** 2320 * ib_enum_all_devs - enumerate all ib_devices 2321 * @nldev_cb: Callback to call for each found ib_device 2322 * @skb: Netlink reply buffer filled in by the callback; @cb: Netlink dump callback state 2323 * Enumerates all registered ib_devices visible in the caller's net namespace and calls nldev_cb() on each device. 2324 */ 2325 int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, 2326 struct netlink_callback *cb) 2327 { 2328 unsigned long index; 2329 struct ib_device *dev; 2330 unsigned int idx = 0; 2331 int ret = 0; 2332 2333 down_read(&devices_rwsem); 2334 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 2335 if (!rdma_dev_access_netns(dev, sock_net(skb->sk))) 2336 continue; 2337 2338 ret = nldev_cb(dev, skb, cb, idx); 2339 if (ret) 2340 break; 2341 idx++; 2342 } 2343 up_read(&devices_rwsem); 2344 return ret; 2345 } 2346 2347 /** 2348 * ib_query_pkey - Get P_Key table entry 2349 * @device:Device to query 2350 * @port_num:Port number to query 2351 * @index:P_Key table index to query 2352 * @pkey:Returned P_Key 2353 * 2354 * ib_query_pkey() fetches the specified P_Key table entry.
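 *
 * For example (illustrative sketch; the caller shown is hypothetical):
 *
 *	u16 pkey;
 *	int ret;
 *
 *	ret = ib_query_pkey(ibdev, port_num, 0, &pkey);
 *	if (!ret)
 *		pr_debug("P_Key at index 0 is 0x%04x\n", pkey);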
2355 */ 2356 int ib_query_pkey(struct ib_device *device, 2357 u8 port_num, u16 index, u16 *pkey) 2358 { 2359 if (!rdma_is_port_valid(device, port_num)) 2360 return -EINVAL; 2361 2362 return device->ops.query_pkey(device, port_num, index, pkey); 2363 } 2364 EXPORT_SYMBOL(ib_query_pkey); 2365 2366 /** 2367 * ib_modify_device - Change IB device attributes 2368 * @device:Device to modify 2369 * @device_modify_mask:Mask of attributes to change 2370 * @device_modify:New attribute values 2371 * 2372 * ib_modify_device() changes a device's attributes as specified by 2373 * the @device_modify_mask and @device_modify structure. 2374 */ 2375 int ib_modify_device(struct ib_device *device, 2376 int device_modify_mask, 2377 struct ib_device_modify *device_modify) 2378 { 2379 if (!device->ops.modify_device) 2380 return -EOPNOTSUPP; 2381 2382 return device->ops.modify_device(device, device_modify_mask, 2383 device_modify); 2384 } 2385 EXPORT_SYMBOL(ib_modify_device); 2386 2387 /** 2388 * ib_modify_port - Modifies the attributes for the specified port. 2389 * @device: The device to modify. 2390 * @port_num: The number of the port to modify. 2391 * @port_modify_mask: Mask used to specify which attributes of the port 2392 * to change. 2393 * @port_modify: New attribute values for the port. 2394 * 2395 * ib_modify_port() changes a port's attributes as specified by the 2396 * @port_modify_mask and @port_modify structure. 2397 */ 2398 int ib_modify_port(struct ib_device *device, 2399 u8 port_num, int port_modify_mask, 2400 struct ib_port_modify *port_modify) 2401 { 2402 int rc; 2403 2404 if (!rdma_is_port_valid(device, port_num)) 2405 return -EINVAL; 2406 2407 if (device->ops.modify_port) 2408 rc = device->ops.modify_port(device, port_num, 2409 port_modify_mask, 2410 port_modify); 2411 else if (rdma_protocol_roce(device, port_num) && 2412 ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 || 2413 (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0)) 2414 rc = 0; 2415 else 2416 rc = -EOPNOTSUPP; 2417 return rc; 2418 } 2419 EXPORT_SYMBOL(ib_modify_port); 2420 2421 /** 2422 * ib_find_gid - Returns the port number and GID table index where 2423 * a specified GID value occurs. It searches only ports using the IB link layer. 2424 * @device: The device to query. 2425 * @gid: The GID value to search for. 2426 * @port_num: The port number of the device where the GID value was found. 2427 * @index: The index into the GID table where the GID was found. This 2428 * parameter may be NULL. 2429 */ 2430 int ib_find_gid(struct ib_device *device, union ib_gid *gid, 2431 u8 *port_num, u16 *index) 2432 { 2433 union ib_gid tmp_gid; 2434 unsigned int port; 2435 int ret, i; 2436 2437 rdma_for_each_port (device, port) { 2438 if (!rdma_protocol_ib(device, port)) 2439 continue; 2440 2441 for (i = 0; i < device->port_data[port].immutable.gid_tbl_len; 2442 ++i) { 2443 ret = rdma_query_gid(device, port, i, &tmp_gid); 2444 if (ret) 2445 return ret; 2446 if (!memcmp(&tmp_gid, gid, sizeof *gid)) { 2447 *port_num = port; 2448 if (index) 2449 *index = i; 2450 return 0; 2451 } 2452 } 2453 } 2454 2455 return -ENOENT; 2456 } 2457 EXPORT_SYMBOL(ib_find_gid); 2458 2459 /** 2460 * ib_find_pkey - Returns the PKey table index where a specified 2461 * PKey value occurs. 2462 * @device: The device to query. 2463 * @port_num: The port number of the device to search for the PKey. 2464 * @pkey: The PKey value to search for. 2465 * @index: The index into the PKey table where the PKey was found.
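 *
 * Illustrative sketch (hypothetical caller) looking up the table index of the
 * default P_Key, where 0xffff is the full-member default P_Key value:
 *
 *	u16 index;
 *	int ret;
 *
 *	ret = ib_find_pkey(ibdev, port_num, 0xffff, &index);
 *	if (ret)
 *		return ret;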
2466 */ 2467 int ib_find_pkey(struct ib_device *device, 2468 u8 port_num, u16 pkey, u16 *index) 2469 { 2470 int ret, i; 2471 u16 tmp_pkey; 2472 int partial_ix = -1; 2473 2474 for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len; 2475 ++i) { 2476 ret = ib_query_pkey(device, port_num, i, &tmp_pkey); 2477 if (ret) 2478 return ret; 2479 if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { 2480 /* if there is full-member pkey take it.*/ 2481 if (tmp_pkey & 0x8000) { 2482 *index = i; 2483 return 0; 2484 } 2485 if (partial_ix < 0) 2486 partial_ix = i; 2487 } 2488 } 2489 2490 /*no full-member, if exists take the limited*/ 2491 if (partial_ix >= 0) { 2492 *index = partial_ix; 2493 return 0; 2494 } 2495 return -ENOENT; 2496 } 2497 EXPORT_SYMBOL(ib_find_pkey); 2498 2499 /** 2500 * ib_get_net_dev_by_params() - Return the appropriate net_dev 2501 * for a received CM request 2502 * @dev: An RDMA device on which the request has been received. 2503 * @port: Port number on the RDMA device. 2504 * @pkey: The Pkey the request came on. 2505 * @gid: A GID that the net_dev uses to communicate. 2506 * @addr: Contains the IP address that the request specified as its 2507 * destination. 2508 * 2509 */ 2510 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, 2511 u8 port, 2512 u16 pkey, 2513 const union ib_gid *gid, 2514 const struct sockaddr *addr) 2515 { 2516 struct net_device *net_dev = NULL; 2517 unsigned long index; 2518 void *client_data; 2519 2520 if (!rdma_protocol_ib(dev, port)) 2521 return NULL; 2522 2523 /* 2524 * Holding the read side guarantees that the client will not become 2525 * unregistered while we are calling get_net_dev_by_params() 2526 */ 2527 down_read(&dev->client_data_rwsem); 2528 xan_for_each_marked (&dev->client_data, index, client_data, 2529 CLIENT_DATA_REGISTERED) { 2530 struct ib_client *client = xa_load(&clients, index); 2531 2532 if (!client || !client->get_net_dev_by_params) 2533 continue; 2534 2535 net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, 2536 addr, client_data); 2537 if (net_dev) 2538 break; 2539 } 2540 up_read(&dev->client_data_rwsem); 2541 2542 return net_dev; 2543 } 2544 EXPORT_SYMBOL(ib_get_net_dev_by_params); 2545 2546 void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) 2547 { 2548 struct ib_device_ops *dev_ops = &dev->ops; 2549 #define SET_DEVICE_OP(ptr, name) \ 2550 do { \ 2551 if (ops->name) \ 2552 if (!((ptr)->name)) \ 2553 (ptr)->name = ops->name; \ 2554 } while (0) 2555 2556 #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) 2557 2558 if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { 2559 WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && 2560 dev_ops->driver_id != ops->driver_id); 2561 dev_ops->driver_id = ops->driver_id; 2562 } 2563 if (ops->owner) { 2564 WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); 2565 dev_ops->owner = ops->owner; 2566 } 2567 if (ops->uverbs_abi_ver) 2568 dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; 2569 2570 dev_ops->uverbs_no_driver_id_binding |= 2571 ops->uverbs_no_driver_id_binding; 2572 2573 SET_DEVICE_OP(dev_ops, add_gid); 2574 SET_DEVICE_OP(dev_ops, advise_mr); 2575 SET_DEVICE_OP(dev_ops, alloc_dm); 2576 SET_DEVICE_OP(dev_ops, alloc_fmr); 2577 SET_DEVICE_OP(dev_ops, alloc_hw_stats); 2578 SET_DEVICE_OP(dev_ops, alloc_mr); 2579 SET_DEVICE_OP(dev_ops, alloc_mr_integrity); 2580 SET_DEVICE_OP(dev_ops, alloc_mw); 2581 SET_DEVICE_OP(dev_ops, alloc_pd); 2582 SET_DEVICE_OP(dev_ops, alloc_rdma_netdev); 2583 SET_DEVICE_OP(dev_ops, alloc_ucontext); 2584 
SET_DEVICE_OP(dev_ops, alloc_xrcd); 2585 SET_DEVICE_OP(dev_ops, attach_mcast); 2586 SET_DEVICE_OP(dev_ops, check_mr_status); 2587 SET_DEVICE_OP(dev_ops, counter_alloc_stats); 2588 SET_DEVICE_OP(dev_ops, counter_bind_qp); 2589 SET_DEVICE_OP(dev_ops, counter_dealloc); 2590 SET_DEVICE_OP(dev_ops, counter_unbind_qp); 2591 SET_DEVICE_OP(dev_ops, counter_update_stats); 2592 SET_DEVICE_OP(dev_ops, create_ah); 2593 SET_DEVICE_OP(dev_ops, create_counters); 2594 SET_DEVICE_OP(dev_ops, create_cq); 2595 SET_DEVICE_OP(dev_ops, create_flow); 2596 SET_DEVICE_OP(dev_ops, create_flow_action_esp); 2597 SET_DEVICE_OP(dev_ops, create_qp); 2598 SET_DEVICE_OP(dev_ops, create_rwq_ind_table); 2599 SET_DEVICE_OP(dev_ops, create_srq); 2600 SET_DEVICE_OP(dev_ops, create_wq); 2601 SET_DEVICE_OP(dev_ops, dealloc_dm); 2602 SET_DEVICE_OP(dev_ops, dealloc_driver); 2603 SET_DEVICE_OP(dev_ops, dealloc_fmr); 2604 SET_DEVICE_OP(dev_ops, dealloc_mw); 2605 SET_DEVICE_OP(dev_ops, dealloc_pd); 2606 SET_DEVICE_OP(dev_ops, dealloc_ucontext); 2607 SET_DEVICE_OP(dev_ops, dealloc_xrcd); 2608 SET_DEVICE_OP(dev_ops, del_gid); 2609 SET_DEVICE_OP(dev_ops, dereg_mr); 2610 SET_DEVICE_OP(dev_ops, destroy_ah); 2611 SET_DEVICE_OP(dev_ops, destroy_counters); 2612 SET_DEVICE_OP(dev_ops, destroy_cq); 2613 SET_DEVICE_OP(dev_ops, destroy_flow); 2614 SET_DEVICE_OP(dev_ops, destroy_flow_action); 2615 SET_DEVICE_OP(dev_ops, destroy_qp); 2616 SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); 2617 SET_DEVICE_OP(dev_ops, destroy_srq); 2618 SET_DEVICE_OP(dev_ops, destroy_wq); 2619 SET_DEVICE_OP(dev_ops, detach_mcast); 2620 SET_DEVICE_OP(dev_ops, disassociate_ucontext); 2621 SET_DEVICE_OP(dev_ops, drain_rq); 2622 SET_DEVICE_OP(dev_ops, drain_sq); 2623 SET_DEVICE_OP(dev_ops, enable_driver); 2624 SET_DEVICE_OP(dev_ops, fill_res_entry); 2625 SET_DEVICE_OP(dev_ops, fill_stat_entry); 2626 SET_DEVICE_OP(dev_ops, get_dev_fw_str); 2627 SET_DEVICE_OP(dev_ops, get_dma_mr); 2628 SET_DEVICE_OP(dev_ops, get_hw_stats); 2629 SET_DEVICE_OP(dev_ops, get_link_layer); 2630 SET_DEVICE_OP(dev_ops, get_netdev); 2631 SET_DEVICE_OP(dev_ops, get_port_immutable); 2632 SET_DEVICE_OP(dev_ops, get_vector_affinity); 2633 SET_DEVICE_OP(dev_ops, get_vf_config); 2634 SET_DEVICE_OP(dev_ops, get_vf_guid); 2635 SET_DEVICE_OP(dev_ops, get_vf_stats); 2636 SET_DEVICE_OP(dev_ops, init_port); 2637 SET_DEVICE_OP(dev_ops, iw_accept); 2638 SET_DEVICE_OP(dev_ops, iw_add_ref); 2639 SET_DEVICE_OP(dev_ops, iw_connect); 2640 SET_DEVICE_OP(dev_ops, iw_create_listen); 2641 SET_DEVICE_OP(dev_ops, iw_destroy_listen); 2642 SET_DEVICE_OP(dev_ops, iw_get_qp); 2643 SET_DEVICE_OP(dev_ops, iw_reject); 2644 SET_DEVICE_OP(dev_ops, iw_rem_ref); 2645 SET_DEVICE_OP(dev_ops, map_mr_sg); 2646 SET_DEVICE_OP(dev_ops, map_mr_sg_pi); 2647 SET_DEVICE_OP(dev_ops, map_phys_fmr); 2648 SET_DEVICE_OP(dev_ops, mmap); 2649 SET_DEVICE_OP(dev_ops, mmap_free); 2650 SET_DEVICE_OP(dev_ops, modify_ah); 2651 SET_DEVICE_OP(dev_ops, modify_cq); 2652 SET_DEVICE_OP(dev_ops, modify_device); 2653 SET_DEVICE_OP(dev_ops, modify_flow_action_esp); 2654 SET_DEVICE_OP(dev_ops, modify_port); 2655 SET_DEVICE_OP(dev_ops, modify_qp); 2656 SET_DEVICE_OP(dev_ops, modify_srq); 2657 SET_DEVICE_OP(dev_ops, modify_wq); 2658 SET_DEVICE_OP(dev_ops, peek_cq); 2659 SET_DEVICE_OP(dev_ops, poll_cq); 2660 SET_DEVICE_OP(dev_ops, post_recv); 2661 SET_DEVICE_OP(dev_ops, post_send); 2662 SET_DEVICE_OP(dev_ops, post_srq_recv); 2663 SET_DEVICE_OP(dev_ops, process_mad); 2664 SET_DEVICE_OP(dev_ops, query_ah); 2665 SET_DEVICE_OP(dev_ops, query_device); 2666 
SET_DEVICE_OP(dev_ops, query_gid); 2667 SET_DEVICE_OP(dev_ops, query_pkey); 2668 SET_DEVICE_OP(dev_ops, query_port); 2669 SET_DEVICE_OP(dev_ops, query_qp); 2670 SET_DEVICE_OP(dev_ops, query_srq); 2671 SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); 2672 SET_DEVICE_OP(dev_ops, read_counters); 2673 SET_DEVICE_OP(dev_ops, reg_dm_mr); 2674 SET_DEVICE_OP(dev_ops, reg_user_mr); 2675 SET_DEVICE_OP(dev_ops, req_ncomp_notif); 2676 SET_DEVICE_OP(dev_ops, req_notify_cq); 2677 SET_DEVICE_OP(dev_ops, rereg_user_mr); 2678 SET_DEVICE_OP(dev_ops, resize_cq); 2679 SET_DEVICE_OP(dev_ops, set_vf_guid); 2680 SET_DEVICE_OP(dev_ops, set_vf_link_state); 2681 SET_DEVICE_OP(dev_ops, unmap_fmr); 2682 2683 SET_OBJ_SIZE(dev_ops, ib_ah); 2684 SET_OBJ_SIZE(dev_ops, ib_cq); 2685 SET_OBJ_SIZE(dev_ops, ib_pd); 2686 SET_OBJ_SIZE(dev_ops, ib_srq); 2687 SET_OBJ_SIZE(dev_ops, ib_ucontext); 2688 } 2689 EXPORT_SYMBOL(ib_set_device_ops); 2690 2691 static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { 2692 [RDMA_NL_LS_OP_RESOLVE] = { 2693 .doit = ib_nl_handle_resolve_resp, 2694 .flags = RDMA_NL_ADMIN_PERM, 2695 }, 2696 [RDMA_NL_LS_OP_SET_TIMEOUT] = { 2697 .doit = ib_nl_handle_set_timeout, 2698 .flags = RDMA_NL_ADMIN_PERM, 2699 }, 2700 [RDMA_NL_LS_OP_IP_RESOLVE] = { 2701 .doit = ib_nl_handle_ip_res_resp, 2702 .flags = RDMA_NL_ADMIN_PERM, 2703 }, 2704 }; 2705 2706 static int __init ib_core_init(void) 2707 { 2708 int ret; 2709 2710 ib_wq = alloc_workqueue("infiniband", 0, 0); 2711 if (!ib_wq) 2712 return -ENOMEM; 2713 2714 ib_comp_wq = alloc_workqueue("ib-comp-wq", 2715 WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); 2716 if (!ib_comp_wq) { 2717 ret = -ENOMEM; 2718 goto err; 2719 } 2720 2721 ib_comp_unbound_wq = 2722 alloc_workqueue("ib-comp-unb-wq", 2723 WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | 2724 WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); 2725 if (!ib_comp_unbound_wq) { 2726 ret = -ENOMEM; 2727 goto err_comp; 2728 } 2729 2730 ret = class_register(&ib_class); 2731 if (ret) { 2732 pr_warn("Couldn't create InfiniBand device class\n"); 2733 goto err_comp_unbound; 2734 } 2735 2736 rdma_nl_init(); 2737 2738 ret = addr_init(); 2739 if (ret) { 2740 pr_warn("Couldn't init IB address resolution\n"); 2741 goto err_ibnl; 2742 } 2743 2744 ret = ib_mad_init(); 2745 if (ret) { 2746 pr_warn("Couldn't init IB MAD\n"); 2747 goto err_addr; 2748 } 2749 2750 ret = ib_sa_init(); 2751 if (ret) { 2752 pr_warn("Couldn't init SA\n"); 2753 goto err_mad; 2754 } 2755 2756 ret = register_blocking_lsm_notifier(&ibdev_lsm_nb); 2757 if (ret) { 2758 pr_warn("Couldn't register LSM notifier. ret %d\n", ret); 2759 goto err_sa; 2760 } 2761 2762 ret = register_pernet_device(&rdma_dev_net_ops); 2763 if (ret) { 2764 pr_warn("Couldn't init compat dev. 
ret %d\n", ret); 2765 goto err_compat; 2766 } 2767 2768 nldev_init(); 2769 rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); 2770 roce_gid_mgmt_init(); 2771 2772 return 0; 2773 2774 err_compat: 2775 unregister_blocking_lsm_notifier(&ibdev_lsm_nb); 2776 err_sa: 2777 ib_sa_cleanup(); 2778 err_mad: 2779 ib_mad_cleanup(); 2780 err_addr: 2781 addr_cleanup(); 2782 err_ibnl: 2783 class_unregister(&ib_class); 2784 err_comp_unbound: 2785 destroy_workqueue(ib_comp_unbound_wq); 2786 err_comp: 2787 destroy_workqueue(ib_comp_wq); 2788 err: 2789 destroy_workqueue(ib_wq); 2790 return ret; 2791 } 2792 2793 static void __exit ib_core_cleanup(void) 2794 { 2795 roce_gid_mgmt_cleanup(); 2796 nldev_exit(); 2797 rdma_nl_unregister(RDMA_NL_LS); 2798 unregister_pernet_device(&rdma_dev_net_ops); 2799 unregister_blocking_lsm_notifier(&ibdev_lsm_nb); 2800 ib_sa_cleanup(); 2801 ib_mad_cleanup(); 2802 addr_cleanup(); 2803 rdma_nl_exit(); 2804 class_unregister(&ib_class); 2805 destroy_workqueue(ib_comp_unbound_wq); 2806 destroy_workqueue(ib_comp_wq); 2807 /* Make sure that any pending umem accounting work is done. */ 2808 destroy_workqueue(ib_wq); 2809 flush_workqueue(system_unbound_wq); 2810 WARN_ON(!xa_empty(&clients)); 2811 WARN_ON(!xa_empty(&devices)); 2812 } 2813 2814 MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); 2815 2816 /* ib core relies on netdev stack to first register net_ns_type_operations 2817 * ns kobject type before ib_core initialization. 2818 */ 2819 fs_initcall(ib_core_init); 2820 module_exit(ib_core_cleanup); 2821