1 /* 2 * Copyright (c) 2004 Topspin Communications. All rights reserved. 3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34 #include <linux/module.h> 35 #include <linux/string.h> 36 #include <linux/errno.h> 37 #include <linux/kernel.h> 38 #include <linux/slab.h> 39 #include <linux/init.h> 40 #include <linux/netdevice.h> 41 #include <net/net_namespace.h> 42 #include <linux/security.h> 43 #include <linux/notifier.h> 44 #include <linux/hashtable.h> 45 #include <rdma/rdma_netlink.h> 46 #include <rdma/ib_addr.h> 47 #include <rdma/ib_cache.h> 48 #include <rdma/rdma_counter.h> 49 50 #include "core_priv.h" 51 #include "restrack.h" 52 53 MODULE_AUTHOR("Roland Dreier"); 54 MODULE_DESCRIPTION("core kernel InfiniBand API"); 55 MODULE_LICENSE("Dual BSD/GPL"); 56 57 struct workqueue_struct *ib_comp_wq; 58 struct workqueue_struct *ib_comp_unbound_wq; 59 struct workqueue_struct *ib_wq; 60 EXPORT_SYMBOL_GPL(ib_wq); 61 62 /* 63 * Each of the three rwsem locks (devices, clients, client_data) protects the 64 * xarray of the same name. Specifically it allows the caller to assert that 65 * the MARK will/will not be changing under the lock, and for devices and 66 * clients, that the value in the xarray is still a valid pointer. Change of 67 * the MARK is linked to the object state, so holding the lock and testing the 68 * MARK also asserts that the contained object is in a certain state. 69 * 70 * This is used to build a two stage register/unregister flow where objects 71 * can continue to be in the xarray even though they are still in progress to 72 * register/unregister. 73 * 74 * The xarray itself provides additional locking, and restartable iteration, 75 * which is also relied on. 76 * 77 * Locks should not be nested, with the exception of client_data, which is 78 * allowed to nest under the read side of the other two locks. 79 * 80 * The devices_rwsem also protects the device name list, any change or 81 * assignment of device name must also hold the write side to guarantee unique 82 * names. 83 */ 84 85 /* 86 * devices contains devices that have had their names assigned. The 87 * devices may not be registered. 
Users that care about the registration
 * status need to call ib_device_try_get() on the device to ensure it is
 * registered, and keep it registered, for the required duration.
 *
 */
static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(devices_rwsem);
#define DEVICE_REGISTERED XA_MARK_1

static u32 highest_client_id;
#define CLIENT_REGISTERED XA_MARK_1
static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(clients_rwsem);

static void ib_client_put(struct ib_client *client)
{
	if (refcount_dec_and_test(&client->uses))
		complete(&client->uses_zero);
}

/*
 * If client_data is registered then the corresponding client must also still
 * be registered.
 */
#define CLIENT_DATA_REGISTERED XA_MARK_1

unsigned int rdma_dev_net_id;

/*
 * A list of net namespaces is maintained in an xarray. This is necessary
 * because we can't get the locking right using the existing net ns list. We
 * would require an init_net callback after the list is updated.
 */
static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
/*
 * rwsem to protect accessing the rdma_nets xarray entries.
 */
static DECLARE_RWSEM(rdma_nets_rwsem);

bool ib_devices_shared_netns = true;
module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
MODULE_PARM_DESC(netns_mode,
		 "Share device among net namespaces; default=1 (shared)");
/**
 * rdma_dev_access_netns() - Return whether an rdma device can be accessed
 *			     from a specified net namespace or not.
 * @dev: Pointer to the rdma device which needs to be checked
 * @net: Pointer to the net namespace for which access is to be checked
 *
 * When the rdma device is in shared mode, it ignores the net namespace.
 * When the rdma device is exclusive to a net namespace, the device's net
 * namespace is checked against the specified one.
 */
bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
{
	return (ib_devices_shared_netns ||
		net_eq(read_pnet(&dev->coredev.rdma_net), net));
}
EXPORT_SYMBOL(rdma_dev_access_netns);

/*
 * xarray has this behavior where it won't iterate over NULL values stored in
 * allocated arrays. So we need our own iterator to see all values stored in
 * the array. This does the same thing as xa_for_each except that it also
 * returns NULL valued entries if the array is allocating. Simplified to only
 * work on simple xarrays.
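 *
 * The companion xan_for_each_marked() macro below is used like xa_for_each(),
 * e.g. (illustrative sketch, do_something() is a placeholder):
 *
 *	unsigned long index;
 *	void *entry;
 *
 *	xan_for_each_marked(&device->client_data, index, entry,
 *			    CLIENT_DATA_REGISTERED)
 *		do_something(index, entry);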
156 */ 157 static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, 158 xa_mark_t filter) 159 { 160 XA_STATE(xas, xa, *indexp); 161 void *entry; 162 163 rcu_read_lock(); 164 do { 165 entry = xas_find_marked(&xas, ULONG_MAX, filter); 166 if (xa_is_zero(entry)) 167 break; 168 } while (xas_retry(&xas, entry)); 169 rcu_read_unlock(); 170 171 if (entry) { 172 *indexp = xas.xa_index; 173 if (xa_is_zero(entry)) 174 return NULL; 175 return entry; 176 } 177 return XA_ERROR(-ENOENT); 178 } 179 #define xan_for_each_marked(xa, index, entry, filter) \ 180 for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ 181 !xa_is_err(entry); \ 182 (index)++, entry = xan_find_marked(xa, &(index), filter)) 183 184 /* RCU hash table mapping netdevice pointers to struct ib_port_data */ 185 static DEFINE_SPINLOCK(ndev_hash_lock); 186 static DECLARE_HASHTABLE(ndev_hash, 5); 187 188 static void free_netdevs(struct ib_device *ib_dev); 189 static void ib_unregister_work(struct work_struct *work); 190 static void __ib_unregister_device(struct ib_device *device); 191 static int ib_security_change(struct notifier_block *nb, unsigned long event, 192 void *lsm_data); 193 static void ib_policy_change_task(struct work_struct *work); 194 static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); 195 196 static void __ibdev_printk(const char *level, const struct ib_device *ibdev, 197 struct va_format *vaf) 198 { 199 if (ibdev && ibdev->dev.parent) 200 dev_printk_emit(level[1] - '0', 201 ibdev->dev.parent, 202 "%s %s %s: %pV", 203 dev_driver_string(ibdev->dev.parent), 204 dev_name(ibdev->dev.parent), 205 dev_name(&ibdev->dev), 206 vaf); 207 else if (ibdev) 208 printk("%s%s: %pV", 209 level, dev_name(&ibdev->dev), vaf); 210 else 211 printk("%s(NULL ib_device): %pV", level, vaf); 212 } 213 214 void ibdev_printk(const char *level, const struct ib_device *ibdev, 215 const char *format, ...) 216 { 217 struct va_format vaf; 218 va_list args; 219 220 va_start(args, format); 221 222 vaf.fmt = format; 223 vaf.va = &args; 224 225 __ibdev_printk(level, ibdev, &vaf); 226 227 va_end(args); 228 } 229 EXPORT_SYMBOL(ibdev_printk); 230 231 #define define_ibdev_printk_level(func, level) \ 232 void func(const struct ib_device *ibdev, const char *fmt, ...) 
\ 233 { \ 234 struct va_format vaf; \ 235 va_list args; \ 236 \ 237 va_start(args, fmt); \ 238 \ 239 vaf.fmt = fmt; \ 240 vaf.va = &args; \ 241 \ 242 __ibdev_printk(level, ibdev, &vaf); \ 243 \ 244 va_end(args); \ 245 } \ 246 EXPORT_SYMBOL(func); 247 248 define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); 249 define_ibdev_printk_level(ibdev_alert, KERN_ALERT); 250 define_ibdev_printk_level(ibdev_crit, KERN_CRIT); 251 define_ibdev_printk_level(ibdev_err, KERN_ERR); 252 define_ibdev_printk_level(ibdev_warn, KERN_WARNING); 253 define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); 254 define_ibdev_printk_level(ibdev_info, KERN_INFO); 255 256 static struct notifier_block ibdev_lsm_nb = { 257 .notifier_call = ib_security_change, 258 }; 259 260 static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, 261 struct net *net); 262 263 /* Pointer to the RCU head at the start of the ib_port_data array */ 264 struct ib_port_data_rcu { 265 struct rcu_head rcu_head; 266 struct ib_port_data pdata[]; 267 }; 268 269 static void ib_device_check_mandatory(struct ib_device *device) 270 { 271 #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } 272 static const struct { 273 size_t offset; 274 char *name; 275 } mandatory_table[] = { 276 IB_MANDATORY_FUNC(query_device), 277 IB_MANDATORY_FUNC(query_port), 278 IB_MANDATORY_FUNC(query_pkey), 279 IB_MANDATORY_FUNC(alloc_pd), 280 IB_MANDATORY_FUNC(dealloc_pd), 281 IB_MANDATORY_FUNC(create_qp), 282 IB_MANDATORY_FUNC(modify_qp), 283 IB_MANDATORY_FUNC(destroy_qp), 284 IB_MANDATORY_FUNC(post_send), 285 IB_MANDATORY_FUNC(post_recv), 286 IB_MANDATORY_FUNC(create_cq), 287 IB_MANDATORY_FUNC(destroy_cq), 288 IB_MANDATORY_FUNC(poll_cq), 289 IB_MANDATORY_FUNC(req_notify_cq), 290 IB_MANDATORY_FUNC(get_dma_mr), 291 IB_MANDATORY_FUNC(dereg_mr), 292 IB_MANDATORY_FUNC(get_port_immutable) 293 }; 294 int i; 295 296 device->kverbs_provider = true; 297 for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { 298 if (!*(void **) ((void *) &device->ops + 299 mandatory_table[i].offset)) { 300 device->kverbs_provider = false; 301 break; 302 } 303 } 304 } 305 306 /* 307 * Caller must perform ib_device_put() to return the device reference count 308 * when ib_device_get_by_index() returns valid device pointer. 309 */ 310 struct ib_device *ib_device_get_by_index(const struct net *net, u32 index) 311 { 312 struct ib_device *device; 313 314 down_read(&devices_rwsem); 315 device = xa_load(&devices, index); 316 if (device) { 317 if (!rdma_dev_access_netns(device, net)) { 318 device = NULL; 319 goto out; 320 } 321 322 if (!ib_device_try_get(device)) 323 device = NULL; 324 } 325 out: 326 up_read(&devices_rwsem); 327 return device; 328 } 329 330 /** 331 * ib_device_put - Release IB device reference 332 * @device: device whose reference to be released 333 * 334 * ib_device_put() releases reference to the IB device to allow it to be 335 * unregistered and eventually free. 
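 *
 * A minimal sketch of the intended pairing, assuming an index obtained
 * elsewhere (e.g. from a netlink request); do_something() is a placeholder:
 *
 *	device = ib_device_get_by_index(net, index);
 *	if (device) {
 *		do_something(device);
 *		ib_device_put(device);
 *	}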
336 */ 337 void ib_device_put(struct ib_device *device) 338 { 339 if (refcount_dec_and_test(&device->refcount)) 340 complete(&device->unreg_completion); 341 } 342 EXPORT_SYMBOL(ib_device_put); 343 344 static struct ib_device *__ib_device_get_by_name(const char *name) 345 { 346 struct ib_device *device; 347 unsigned long index; 348 349 xa_for_each (&devices, index, device) 350 if (!strcmp(name, dev_name(&device->dev))) 351 return device; 352 353 return NULL; 354 } 355 356 /** 357 * ib_device_get_by_name - Find an IB device by name 358 * @name: The name to look for 359 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) 360 * 361 * Find and hold an ib_device by its name. The caller must call 362 * ib_device_put() on the returned pointer. 363 */ 364 struct ib_device *ib_device_get_by_name(const char *name, 365 enum rdma_driver_id driver_id) 366 { 367 struct ib_device *device; 368 369 down_read(&devices_rwsem); 370 device = __ib_device_get_by_name(name); 371 if (device && driver_id != RDMA_DRIVER_UNKNOWN && 372 device->ops.driver_id != driver_id) 373 device = NULL; 374 375 if (device) { 376 if (!ib_device_try_get(device)) 377 device = NULL; 378 } 379 up_read(&devices_rwsem); 380 return device; 381 } 382 EXPORT_SYMBOL(ib_device_get_by_name); 383 384 static int rename_compat_devs(struct ib_device *device) 385 { 386 struct ib_core_device *cdev; 387 unsigned long index; 388 int ret = 0; 389 390 mutex_lock(&device->compat_devs_mutex); 391 xa_for_each (&device->compat_devs, index, cdev) { 392 ret = device_rename(&cdev->dev, dev_name(&device->dev)); 393 if (ret) { 394 dev_warn(&cdev->dev, 395 "Fail to rename compatdev to new name %s\n", 396 dev_name(&device->dev)); 397 break; 398 } 399 } 400 mutex_unlock(&device->compat_devs_mutex); 401 return ret; 402 } 403 404 int ib_device_rename(struct ib_device *ibdev, const char *name) 405 { 406 unsigned long index; 407 void *client_data; 408 int ret; 409 410 down_write(&devices_rwsem); 411 if (!strcmp(name, dev_name(&ibdev->dev))) { 412 up_write(&devices_rwsem); 413 return 0; 414 } 415 416 if (__ib_device_get_by_name(name)) { 417 up_write(&devices_rwsem); 418 return -EEXIST; 419 } 420 421 ret = device_rename(&ibdev->dev, name); 422 if (ret) { 423 up_write(&devices_rwsem); 424 return ret; 425 } 426 427 strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX); 428 ret = rename_compat_devs(ibdev); 429 430 downgrade_write(&devices_rwsem); 431 down_read(&ibdev->client_data_rwsem); 432 xan_for_each_marked(&ibdev->client_data, index, client_data, 433 CLIENT_DATA_REGISTERED) { 434 struct ib_client *client = xa_load(&clients, index); 435 436 if (!client || !client->rename) 437 continue; 438 439 client->rename(ibdev, client_data); 440 } 441 up_read(&ibdev->client_data_rwsem); 442 up_read(&devices_rwsem); 443 return 0; 444 } 445 446 int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim) 447 { 448 if (use_dim > 1) 449 return -EINVAL; 450 ibdev->use_cq_dim = use_dim; 451 452 return 0; 453 } 454 455 static int alloc_name(struct ib_device *ibdev, const char *name) 456 { 457 struct ib_device *device; 458 unsigned long index; 459 struct ida inuse; 460 int rc; 461 int i; 462 463 lockdep_assert_held_write(&devices_rwsem); 464 ida_init(&inuse); 465 xa_for_each (&devices, index, device) { 466 char buf[IB_DEVICE_NAME_MAX]; 467 468 if (sscanf(dev_name(&device->dev), name, &i) != 1) 469 continue; 470 if (i < 0 || i >= INT_MAX) 471 continue; 472 snprintf(buf, sizeof buf, name, i); 473 if (strcmp(buf, dev_name(&device->dev)) != 0) 474 continue; 475 476 rc = 
ida_alloc_range(&inuse, i, i, GFP_KERNEL); 477 if (rc < 0) 478 goto out; 479 } 480 481 rc = ida_alloc(&inuse, GFP_KERNEL); 482 if (rc < 0) 483 goto out; 484 485 rc = dev_set_name(&ibdev->dev, name, rc); 486 out: 487 ida_destroy(&inuse); 488 return rc; 489 } 490 491 static void ib_device_release(struct device *device) 492 { 493 struct ib_device *dev = container_of(device, struct ib_device, dev); 494 495 free_netdevs(dev); 496 WARN_ON(refcount_read(&dev->refcount)); 497 if (dev->port_data) { 498 ib_cache_release_one(dev); 499 ib_security_release_port_pkey_list(dev); 500 rdma_counter_release(dev); 501 kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, 502 pdata[0]), 503 rcu_head); 504 } 505 506 mutex_destroy(&dev->unregistration_lock); 507 mutex_destroy(&dev->compat_devs_mutex); 508 509 xa_destroy(&dev->compat_devs); 510 xa_destroy(&dev->client_data); 511 kfree_rcu(dev, rcu_head); 512 } 513 514 static int ib_device_uevent(struct device *device, 515 struct kobj_uevent_env *env) 516 { 517 if (add_uevent_var(env, "NAME=%s", dev_name(device))) 518 return -ENOMEM; 519 520 /* 521 * It would be nice to pass the node GUID with the event... 522 */ 523 524 return 0; 525 } 526 527 static const void *net_namespace(struct device *d) 528 { 529 struct ib_core_device *coredev = 530 container_of(d, struct ib_core_device, dev); 531 532 return read_pnet(&coredev->rdma_net); 533 } 534 535 static struct class ib_class = { 536 .name = "infiniband", 537 .dev_release = ib_device_release, 538 .dev_uevent = ib_device_uevent, 539 .ns_type = &net_ns_type_operations, 540 .namespace = net_namespace, 541 }; 542 543 static void rdma_init_coredev(struct ib_core_device *coredev, 544 struct ib_device *dev, struct net *net) 545 { 546 /* This BUILD_BUG_ON is intended to catch layout change 547 * of union of ib_core_device and device. 548 * dev must be the first element as ib_core and providers 549 * driver uses it. Adding anything in ib_core_device before 550 * device will break this assumption. 551 */ 552 BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != 553 offsetof(struct ib_device, dev)); 554 555 coredev->dev.class = &ib_class; 556 coredev->dev.groups = dev->groups; 557 device_initialize(&coredev->dev); 558 coredev->owner = dev; 559 INIT_LIST_HEAD(&coredev->port_list); 560 write_pnet(&coredev->rdma_net, net); 561 } 562 563 /** 564 * _ib_alloc_device - allocate an IB device struct 565 * @size:size of structure to allocate 566 * 567 * Low-level drivers should use ib_alloc_device() to allocate &struct 568 * ib_device. @size is the size of the structure to be allocated, 569 * including any private data used by the low-level driver. 570 * ib_dealloc_device() must be used to free structures allocated with 571 * ib_alloc_device(). 572 */ 573 struct ib_device *_ib_alloc_device(size_t size) 574 { 575 struct ib_device *device; 576 577 if (WARN_ON(size < sizeof(struct ib_device))) 578 return NULL; 579 580 device = kzalloc(size, GFP_KERNEL); 581 if (!device) 582 return NULL; 583 584 if (rdma_restrack_init(device)) { 585 kfree(device); 586 return NULL; 587 } 588 589 device->groups[0] = &ib_dev_attr_group; 590 rdma_init_coredev(&device->coredev, device, &init_net); 591 592 INIT_LIST_HEAD(&device->event_handler_list); 593 spin_lock_init(&device->event_handler_lock); 594 mutex_init(&device->unregistration_lock); 595 /* 596 * client_data needs to be alloc because we don't want our mark to be 597 * destroyed if the user stores NULL in the client data. 
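 *
 * (Storing NULL in an allocating xarray keeps the index occupied as an
 * internal "zero" entry, so the CLIENT_DATA_REGISTERED mark survives;
 * xan_find_marked() above converts such entries back to NULL for readers.)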
598 */ 599 xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); 600 init_rwsem(&device->client_data_rwsem); 601 xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); 602 mutex_init(&device->compat_devs_mutex); 603 init_completion(&device->unreg_completion); 604 INIT_WORK(&device->unregistration_work, ib_unregister_work); 605 606 return device; 607 } 608 EXPORT_SYMBOL(_ib_alloc_device); 609 610 /** 611 * ib_dealloc_device - free an IB device struct 612 * @device:structure to free 613 * 614 * Free a structure allocated with ib_alloc_device(). 615 */ 616 void ib_dealloc_device(struct ib_device *device) 617 { 618 if (device->ops.dealloc_driver) 619 device->ops.dealloc_driver(device); 620 621 /* 622 * ib_unregister_driver() requires all devices to remain in the xarray 623 * while their ops are callable. The last op we call is dealloc_driver 624 * above. This is needed to create a fence on op callbacks prior to 625 * allowing the driver module to unload. 626 */ 627 down_write(&devices_rwsem); 628 if (xa_load(&devices, device->index) == device) 629 xa_erase(&devices, device->index); 630 up_write(&devices_rwsem); 631 632 /* Expedite releasing netdev references */ 633 free_netdevs(device); 634 635 WARN_ON(!xa_empty(&device->compat_devs)); 636 WARN_ON(!xa_empty(&device->client_data)); 637 WARN_ON(refcount_read(&device->refcount)); 638 rdma_restrack_clean(device); 639 /* Balances with device_initialize */ 640 put_device(&device->dev); 641 } 642 EXPORT_SYMBOL(ib_dealloc_device); 643 644 /* 645 * add_client_context() and remove_client_context() must be safe against 646 * parallel calls on the same device - registration/unregistration of both the 647 * device and client can be occurring in parallel. 648 * 649 * The routines need to be a fence, any caller must not return until the add 650 * or remove is fully completed. 651 */ 652 static int add_client_context(struct ib_device *device, 653 struct ib_client *client) 654 { 655 int ret = 0; 656 657 if (!device->kverbs_provider && !client->no_kverbs_req) 658 return 0; 659 660 down_write(&device->client_data_rwsem); 661 /* 662 * So long as the client is registered hold both the client and device 663 * unregistration locks. 664 */ 665 if (!refcount_inc_not_zero(&client->uses)) 666 goto out_unlock; 667 refcount_inc(&device->refcount); 668 669 /* 670 * Another caller to add_client_context got here first and has already 671 * completely initialized context. 
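 *
 * (The mark is only set after the client's add() callback has returned, so
 * observing it here means the other caller finished the whole setup.)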
672 */ 673 if (xa_get_mark(&device->client_data, client->client_id, 674 CLIENT_DATA_REGISTERED)) 675 goto out; 676 677 ret = xa_err(xa_store(&device->client_data, client->client_id, NULL, 678 GFP_KERNEL)); 679 if (ret) 680 goto out; 681 downgrade_write(&device->client_data_rwsem); 682 if (client->add) 683 client->add(device); 684 685 /* Readers shall not see a client until add has been completed */ 686 xa_set_mark(&device->client_data, client->client_id, 687 CLIENT_DATA_REGISTERED); 688 up_read(&device->client_data_rwsem); 689 return 0; 690 691 out: 692 ib_device_put(device); 693 ib_client_put(client); 694 out_unlock: 695 up_write(&device->client_data_rwsem); 696 return ret; 697 } 698 699 static void remove_client_context(struct ib_device *device, 700 unsigned int client_id) 701 { 702 struct ib_client *client; 703 void *client_data; 704 705 down_write(&device->client_data_rwsem); 706 if (!xa_get_mark(&device->client_data, client_id, 707 CLIENT_DATA_REGISTERED)) { 708 up_write(&device->client_data_rwsem); 709 return; 710 } 711 client_data = xa_load(&device->client_data, client_id); 712 xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED); 713 client = xa_load(&clients, client_id); 714 up_write(&device->client_data_rwsem); 715 716 /* 717 * Notice we cannot be holding any exclusive locks when calling the 718 * remove callback as the remove callback can recurse back into any 719 * public functions in this module and thus try for any locks those 720 * functions take. 721 * 722 * For this reason clients and drivers should not call the 723 * unregistration functions will holdling any locks. 724 */ 725 if (client->remove) 726 client->remove(device, client_data); 727 728 xa_erase(&device->client_data, client_id); 729 ib_device_put(device); 730 ib_client_put(client); 731 } 732 733 static int alloc_port_data(struct ib_device *device) 734 { 735 struct ib_port_data_rcu *pdata_rcu; 736 unsigned int port; 737 738 if (device->port_data) 739 return 0; 740 741 /* This can only be called once the physical port range is defined */ 742 if (WARN_ON(!device->phys_port_cnt)) 743 return -EINVAL; 744 745 /* 746 * device->port_data is indexed directly by the port number to make 747 * access to this data as efficient as possible. 748 * 749 * Therefore port_data is declared as a 1 based array with potential 750 * empty slots at the beginning. 751 */ 752 pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata, 753 rdma_end_port(device) + 1), 754 GFP_KERNEL); 755 if (!pdata_rcu) 756 return -ENOMEM; 757 /* 758 * The rcu_head is put in front of the port data array and the stored 759 * pointer is adjusted since we never need to see that member until 760 * kfree_rcu. 
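 *
 * ib_device_release() later recovers the enclosing struct ib_port_data_rcu
 * via container_of() on ->port_data and hands its rcu_head to kfree_rcu().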
761 */ 762 device->port_data = pdata_rcu->pdata; 763 764 rdma_for_each_port (device, port) { 765 struct ib_port_data *pdata = &device->port_data[port]; 766 767 pdata->ib_dev = device; 768 spin_lock_init(&pdata->pkey_list_lock); 769 INIT_LIST_HEAD(&pdata->pkey_list); 770 spin_lock_init(&pdata->netdev_lock); 771 INIT_HLIST_NODE(&pdata->ndev_hash_link); 772 } 773 return 0; 774 } 775 776 static int verify_immutable(const struct ib_device *dev, u8 port) 777 { 778 return WARN_ON(!rdma_cap_ib_mad(dev, port) && 779 rdma_max_mad_size(dev, port) != 0); 780 } 781 782 static int setup_port_data(struct ib_device *device) 783 { 784 unsigned int port; 785 int ret; 786 787 ret = alloc_port_data(device); 788 if (ret) 789 return ret; 790 791 rdma_for_each_port (device, port) { 792 struct ib_port_data *pdata = &device->port_data[port]; 793 794 ret = device->ops.get_port_immutable(device, port, 795 &pdata->immutable); 796 if (ret) 797 return ret; 798 799 if (verify_immutable(device, port)) 800 return -EINVAL; 801 } 802 return 0; 803 } 804 805 void ib_get_device_fw_str(struct ib_device *dev, char *str) 806 { 807 if (dev->ops.get_dev_fw_str) 808 dev->ops.get_dev_fw_str(dev, str); 809 else 810 str[0] = '\0'; 811 } 812 EXPORT_SYMBOL(ib_get_device_fw_str); 813 814 static void ib_policy_change_task(struct work_struct *work) 815 { 816 struct ib_device *dev; 817 unsigned long index; 818 819 down_read(&devices_rwsem); 820 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 821 unsigned int i; 822 823 rdma_for_each_port (dev, i) { 824 u64 sp; 825 int ret = ib_get_cached_subnet_prefix(dev, 826 i, 827 &sp); 828 829 WARN_ONCE(ret, 830 "ib_get_cached_subnet_prefix err: %d, this should never happen here\n", 831 ret); 832 if (!ret) 833 ib_security_cache_change(dev, i, sp); 834 } 835 } 836 up_read(&devices_rwsem); 837 } 838 839 static int ib_security_change(struct notifier_block *nb, unsigned long event, 840 void *lsm_data) 841 { 842 if (event != LSM_POLICY_CHANGE) 843 return NOTIFY_DONE; 844 845 schedule_work(&ib_policy_change_work); 846 ib_mad_agent_security_change(); 847 848 return NOTIFY_OK; 849 } 850 851 static void compatdev_release(struct device *dev) 852 { 853 struct ib_core_device *cdev = 854 container_of(dev, struct ib_core_device, dev); 855 856 kfree(cdev); 857 } 858 859 static int add_one_compat_dev(struct ib_device *device, 860 struct rdma_dev_net *rnet) 861 { 862 struct ib_core_device *cdev; 863 int ret; 864 865 lockdep_assert_held(&rdma_nets_rwsem); 866 if (!ib_devices_shared_netns) 867 return 0; 868 869 /* 870 * Create and add compat device in all namespaces other than where it 871 * is currently bound to. 872 */ 873 if (net_eq(read_pnet(&rnet->net), 874 read_pnet(&device->coredev.rdma_net))) 875 return 0; 876 877 /* 878 * The first of init_net() or ib_register_device() to take the 879 * compat_devs_mutex wins and gets to add the device. Others will wait 880 * for completion here. 
881 */ 882 mutex_lock(&device->compat_devs_mutex); 883 cdev = xa_load(&device->compat_devs, rnet->id); 884 if (cdev) { 885 ret = 0; 886 goto done; 887 } 888 ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); 889 if (ret) 890 goto done; 891 892 cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); 893 if (!cdev) { 894 ret = -ENOMEM; 895 goto cdev_err; 896 } 897 898 cdev->dev.parent = device->dev.parent; 899 rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); 900 cdev->dev.release = compatdev_release; 901 dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); 902 903 ret = device_add(&cdev->dev); 904 if (ret) 905 goto add_err; 906 ret = ib_setup_port_attrs(cdev); 907 if (ret) 908 goto port_err; 909 910 ret = xa_err(xa_store(&device->compat_devs, rnet->id, 911 cdev, GFP_KERNEL)); 912 if (ret) 913 goto insert_err; 914 915 mutex_unlock(&device->compat_devs_mutex); 916 return 0; 917 918 insert_err: 919 ib_free_port_attrs(cdev); 920 port_err: 921 device_del(&cdev->dev); 922 add_err: 923 put_device(&cdev->dev); 924 cdev_err: 925 xa_release(&device->compat_devs, rnet->id); 926 done: 927 mutex_unlock(&device->compat_devs_mutex); 928 return ret; 929 } 930 931 static void remove_one_compat_dev(struct ib_device *device, u32 id) 932 { 933 struct ib_core_device *cdev; 934 935 mutex_lock(&device->compat_devs_mutex); 936 cdev = xa_erase(&device->compat_devs, id); 937 mutex_unlock(&device->compat_devs_mutex); 938 if (cdev) { 939 ib_free_port_attrs(cdev); 940 device_del(&cdev->dev); 941 put_device(&cdev->dev); 942 } 943 } 944 945 static void remove_compat_devs(struct ib_device *device) 946 { 947 struct ib_core_device *cdev; 948 unsigned long index; 949 950 xa_for_each (&device->compat_devs, index, cdev) 951 remove_one_compat_dev(device, index); 952 } 953 954 static int add_compat_devs(struct ib_device *device) 955 { 956 struct rdma_dev_net *rnet; 957 unsigned long index; 958 int ret = 0; 959 960 lockdep_assert_held(&devices_rwsem); 961 962 down_read(&rdma_nets_rwsem); 963 xa_for_each (&rdma_nets, index, rnet) { 964 ret = add_one_compat_dev(device, rnet); 965 if (ret) 966 break; 967 } 968 up_read(&rdma_nets_rwsem); 969 return ret; 970 } 971 972 static void remove_all_compat_devs(void) 973 { 974 struct ib_compat_device *cdev; 975 struct ib_device *dev; 976 unsigned long index; 977 978 down_read(&devices_rwsem); 979 xa_for_each (&devices, index, dev) { 980 unsigned long c_index = 0; 981 982 /* Hold nets_rwsem so that any other thread modifying this 983 * system param can sync with this thread. 984 */ 985 down_read(&rdma_nets_rwsem); 986 xa_for_each (&dev->compat_devs, c_index, cdev) 987 remove_one_compat_dev(dev, c_index); 988 up_read(&rdma_nets_rwsem); 989 } 990 up_read(&devices_rwsem); 991 } 992 993 static int add_all_compat_devs(void) 994 { 995 struct rdma_dev_net *rnet; 996 struct ib_device *dev; 997 unsigned long index; 998 int ret = 0; 999 1000 down_read(&devices_rwsem); 1001 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 1002 unsigned long net_index = 0; 1003 1004 /* Hold nets_rwsem so that any other thread modifying this 1005 * system param can sync with this thread. 
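 *
 * ("this system param" is ib_devices_shared_netns, which is only changed by
 * rdma_compatdev_set() while holding rdma_nets_rwsem for write.)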
1006 */ 1007 down_read(&rdma_nets_rwsem); 1008 xa_for_each (&rdma_nets, net_index, rnet) { 1009 ret = add_one_compat_dev(dev, rnet); 1010 if (ret) 1011 break; 1012 } 1013 up_read(&rdma_nets_rwsem); 1014 } 1015 up_read(&devices_rwsem); 1016 if (ret) 1017 remove_all_compat_devs(); 1018 return ret; 1019 } 1020 1021 int rdma_compatdev_set(u8 enable) 1022 { 1023 struct rdma_dev_net *rnet; 1024 unsigned long index; 1025 int ret = 0; 1026 1027 down_write(&rdma_nets_rwsem); 1028 if (ib_devices_shared_netns == enable) { 1029 up_write(&rdma_nets_rwsem); 1030 return 0; 1031 } 1032 1033 /* enable/disable of compat devices is not supported 1034 * when more than default init_net exists. 1035 */ 1036 xa_for_each (&rdma_nets, index, rnet) { 1037 ret++; 1038 break; 1039 } 1040 if (!ret) 1041 ib_devices_shared_netns = enable; 1042 up_write(&rdma_nets_rwsem); 1043 if (ret) 1044 return -EBUSY; 1045 1046 if (enable) 1047 ret = add_all_compat_devs(); 1048 else 1049 remove_all_compat_devs(); 1050 return ret; 1051 } 1052 1053 static void rdma_dev_exit_net(struct net *net) 1054 { 1055 struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); 1056 struct ib_device *dev; 1057 unsigned long index; 1058 int ret; 1059 1060 down_write(&rdma_nets_rwsem); 1061 /* 1062 * Prevent the ID from being re-used and hide the id from xa_for_each. 1063 */ 1064 ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL)); 1065 WARN_ON(ret); 1066 up_write(&rdma_nets_rwsem); 1067 1068 down_read(&devices_rwsem); 1069 xa_for_each (&devices, index, dev) { 1070 get_device(&dev->dev); 1071 /* 1072 * Release the devices_rwsem so that pontentially blocking 1073 * device_del, doesn't hold the devices_rwsem for too long. 1074 */ 1075 up_read(&devices_rwsem); 1076 1077 remove_one_compat_dev(dev, rnet->id); 1078 1079 /* 1080 * If the real device is in the NS then move it back to init. 1081 */ 1082 rdma_dev_change_netns(dev, net, &init_net); 1083 1084 put_device(&dev->dev); 1085 down_read(&devices_rwsem); 1086 } 1087 up_read(&devices_rwsem); 1088 1089 rdma_nl_net_exit(rnet); 1090 xa_erase(&rdma_nets, rnet->id); 1091 } 1092 1093 static __net_init int rdma_dev_init_net(struct net *net) 1094 { 1095 struct rdma_dev_net *rnet = rdma_net_to_dev_net(net); 1096 unsigned long index; 1097 struct ib_device *dev; 1098 int ret; 1099 1100 write_pnet(&rnet->net, net); 1101 1102 ret = rdma_nl_net_init(rnet); 1103 if (ret) 1104 return ret; 1105 1106 /* No need to create any compat devices in default init_net. */ 1107 if (net_eq(net, &init_net)) 1108 return 0; 1109 1110 ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); 1111 if (ret) { 1112 rdma_nl_net_exit(rnet); 1113 return ret; 1114 } 1115 1116 down_read(&devices_rwsem); 1117 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 1118 /* Hold nets_rwsem so that netlink command cannot change 1119 * system configuration for device sharing mode. 1120 */ 1121 down_read(&rdma_nets_rwsem); 1122 ret = add_one_compat_dev(dev, rnet); 1123 up_read(&rdma_nets_rwsem); 1124 if (ret) 1125 break; 1126 } 1127 up_read(&devices_rwsem); 1128 1129 if (ret) 1130 rdma_dev_exit_net(net); 1131 1132 return ret; 1133 } 1134 1135 /* 1136 * Assign the unique string device name and the unique device index. This is 1137 * undone by ib_dealloc_device. 
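 *
 * A name containing a printf-style "%d" is expanded by alloc_name() to the
 * lowest unused index, e.g. (hypothetical driver name):
 *
 *	ib_register_device(device, "foo%d");
 *
 * while a literal name is used verbatim and must be unique.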
1138 */ 1139 static int assign_name(struct ib_device *device, const char *name) 1140 { 1141 static u32 last_id; 1142 int ret; 1143 1144 down_write(&devices_rwsem); 1145 /* Assign a unique name to the device */ 1146 if (strchr(name, '%')) 1147 ret = alloc_name(device, name); 1148 else 1149 ret = dev_set_name(&device->dev, name); 1150 if (ret) 1151 goto out; 1152 1153 if (__ib_device_get_by_name(dev_name(&device->dev))) { 1154 ret = -ENFILE; 1155 goto out; 1156 } 1157 strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); 1158 1159 ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b, 1160 &last_id, GFP_KERNEL); 1161 if (ret > 0) 1162 ret = 0; 1163 1164 out: 1165 up_write(&devices_rwsem); 1166 return ret; 1167 } 1168 1169 static void setup_dma_device(struct ib_device *device) 1170 { 1171 struct device *parent = device->dev.parent; 1172 1173 WARN_ON_ONCE(device->dma_device); 1174 if (device->dev.dma_ops) { 1175 /* 1176 * The caller provided custom DMA operations. Copy the 1177 * DMA-related fields that are used by e.g. dma_alloc_coherent() 1178 * into device->dev. 1179 */ 1180 device->dma_device = &device->dev; 1181 if (!device->dev.dma_mask) { 1182 if (parent) 1183 device->dev.dma_mask = parent->dma_mask; 1184 else 1185 WARN_ON_ONCE(true); 1186 } 1187 if (!device->dev.coherent_dma_mask) { 1188 if (parent) 1189 device->dev.coherent_dma_mask = 1190 parent->coherent_dma_mask; 1191 else 1192 WARN_ON_ONCE(true); 1193 } 1194 } else { 1195 /* 1196 * The caller did not provide custom DMA operations. Use the 1197 * DMA mapping operations of the parent device. 1198 */ 1199 WARN_ON_ONCE(!parent); 1200 device->dma_device = parent; 1201 } 1202 /* Setup default max segment size for all IB devices */ 1203 dma_set_max_seg_size(device->dma_device, SZ_2G); 1204 1205 } 1206 1207 /* 1208 * setup_device() allocates memory and sets up data that requires calling the 1209 * device ops, this is the only reason these actions are not done during 1210 * ib_alloc_device. It is undone by ib_dealloc_device(). 1211 */ 1212 static int setup_device(struct ib_device *device) 1213 { 1214 struct ib_udata uhw = {.outlen = 0, .inlen = 0}; 1215 int ret; 1216 1217 setup_dma_device(device); 1218 ib_device_check_mandatory(device); 1219 1220 ret = setup_port_data(device); 1221 if (ret) { 1222 dev_warn(&device->dev, "Couldn't create per-port data\n"); 1223 return ret; 1224 } 1225 1226 memset(&device->attrs, 0, sizeof(device->attrs)); 1227 ret = device->ops.query_device(device, &device->attrs, &uhw); 1228 if (ret) { 1229 dev_warn(&device->dev, 1230 "Couldn't query the device attributes\n"); 1231 return ret; 1232 } 1233 1234 return 0; 1235 } 1236 1237 static void disable_device(struct ib_device *device) 1238 { 1239 u32 cid; 1240 1241 WARN_ON(!refcount_read(&device->refcount)); 1242 1243 down_write(&devices_rwsem); 1244 xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); 1245 up_write(&devices_rwsem); 1246 1247 /* 1248 * Remove clients in LIFO order, see assign_client_id. This could be 1249 * more efficient if xarray learns to reverse iterate. Since no new 1250 * clients can be added to this ib_device past this point we only need 1251 * the maximum possible client_id value here. 
1252 */ 1253 down_read(&clients_rwsem); 1254 cid = highest_client_id; 1255 up_read(&clients_rwsem); 1256 while (cid) { 1257 cid--; 1258 remove_client_context(device, cid); 1259 } 1260 1261 /* Pairs with refcount_set in enable_device */ 1262 ib_device_put(device); 1263 wait_for_completion(&device->unreg_completion); 1264 1265 /* 1266 * compat devices must be removed after device refcount drops to zero. 1267 * Otherwise init_net() may add more compatdevs after removing compat 1268 * devices and before device is disabled. 1269 */ 1270 remove_compat_devs(device); 1271 } 1272 1273 /* 1274 * An enabled device is visible to all clients and to all the public facing 1275 * APIs that return a device pointer. This always returns with a new get, even 1276 * if it fails. 1277 */ 1278 static int enable_device_and_get(struct ib_device *device) 1279 { 1280 struct ib_client *client; 1281 unsigned long index; 1282 int ret = 0; 1283 1284 /* 1285 * One ref belongs to the xa and the other belongs to this 1286 * thread. This is needed to guard against parallel unregistration. 1287 */ 1288 refcount_set(&device->refcount, 2); 1289 down_write(&devices_rwsem); 1290 xa_set_mark(&devices, device->index, DEVICE_REGISTERED); 1291 1292 /* 1293 * By using downgrade_write() we ensure that no other thread can clear 1294 * DEVICE_REGISTERED while we are completing the client setup. 1295 */ 1296 downgrade_write(&devices_rwsem); 1297 1298 if (device->ops.enable_driver) { 1299 ret = device->ops.enable_driver(device); 1300 if (ret) 1301 goto out; 1302 } 1303 1304 down_read(&clients_rwsem); 1305 xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { 1306 ret = add_client_context(device, client); 1307 if (ret) 1308 break; 1309 } 1310 up_read(&clients_rwsem); 1311 if (!ret) 1312 ret = add_compat_devs(device); 1313 out: 1314 up_read(&devices_rwsem); 1315 return ret; 1316 } 1317 1318 /** 1319 * ib_register_device - Register an IB device with IB core 1320 * @device:Device to register 1321 * 1322 * Low-level drivers use ib_register_device() to register their 1323 * devices with the IB core. All registered clients will receive a 1324 * callback for each device that is added. @device must be allocated 1325 * with ib_alloc_device(). 1326 * 1327 * If the driver uses ops.dealloc_driver and calls any ib_unregister_device() 1328 * asynchronously then the device pointer may become freed as soon as this 1329 * function returns. 1330 */ 1331 int ib_register_device(struct ib_device *device, const char *name) 1332 { 1333 int ret; 1334 1335 ret = assign_name(device, name); 1336 if (ret) 1337 return ret; 1338 1339 ret = setup_device(device); 1340 if (ret) 1341 return ret; 1342 1343 ret = ib_cache_setup_one(device); 1344 if (ret) { 1345 dev_warn(&device->dev, 1346 "Couldn't set up InfiniBand P_Key/GID cache\n"); 1347 return ret; 1348 } 1349 1350 ib_device_register_rdmacg(device); 1351 1352 rdma_counter_init(device); 1353 1354 /* 1355 * Ensure that ADD uevent is not fired because it 1356 * is too early amd device is not initialized yet. 
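 *
 * The KOBJ_ADD uevent is emitted explicitly further down, once
 * enable_device_and_get() has run and suppression is lifted again.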
1357 */ 1358 dev_set_uevent_suppress(&device->dev, true); 1359 ret = device_add(&device->dev); 1360 if (ret) 1361 goto cg_cleanup; 1362 1363 ret = ib_device_register_sysfs(device); 1364 if (ret) { 1365 dev_warn(&device->dev, 1366 "Couldn't register device with driver model\n"); 1367 goto dev_cleanup; 1368 } 1369 1370 ret = enable_device_and_get(device); 1371 dev_set_uevent_suppress(&device->dev, false); 1372 /* Mark for userspace that device is ready */ 1373 kobject_uevent(&device->dev.kobj, KOBJ_ADD); 1374 if (ret) { 1375 void (*dealloc_fn)(struct ib_device *); 1376 1377 /* 1378 * If we hit this error flow then we don't want to 1379 * automatically dealloc the device since the caller is 1380 * expected to call ib_dealloc_device() after 1381 * ib_register_device() fails. This is tricky due to the 1382 * possibility for a parallel unregistration along with this 1383 * error flow. Since we have a refcount here we know any 1384 * parallel flow is stopped in disable_device and will see the 1385 * NULL pointers, causing the responsibility to 1386 * ib_dealloc_device() to revert back to this thread. 1387 */ 1388 dealloc_fn = device->ops.dealloc_driver; 1389 device->ops.dealloc_driver = NULL; 1390 ib_device_put(device); 1391 __ib_unregister_device(device); 1392 device->ops.dealloc_driver = dealloc_fn; 1393 return ret; 1394 } 1395 ib_device_put(device); 1396 1397 return 0; 1398 1399 dev_cleanup: 1400 device_del(&device->dev); 1401 cg_cleanup: 1402 dev_set_uevent_suppress(&device->dev, false); 1403 ib_device_unregister_rdmacg(device); 1404 ib_cache_cleanup_one(device); 1405 return ret; 1406 } 1407 EXPORT_SYMBOL(ib_register_device); 1408 1409 /* Callers must hold a get on the device. */ 1410 static void __ib_unregister_device(struct ib_device *ib_dev) 1411 { 1412 /* 1413 * We have a registration lock so that all the calls to unregister are 1414 * fully fenced, once any unregister returns the device is truely 1415 * unregistered even if multiple callers are unregistering it at the 1416 * same time. This also interacts with the registration flow and 1417 * provides sane semantics if register and unregister are racing. 1418 */ 1419 mutex_lock(&ib_dev->unregistration_lock); 1420 if (!refcount_read(&ib_dev->refcount)) 1421 goto out; 1422 1423 disable_device(ib_dev); 1424 1425 /* Expedite removing unregistered pointers from the hash table */ 1426 free_netdevs(ib_dev); 1427 1428 ib_device_unregister_sysfs(ib_dev); 1429 device_del(&ib_dev->dev); 1430 ib_device_unregister_rdmacg(ib_dev); 1431 ib_cache_cleanup_one(ib_dev); 1432 1433 /* 1434 * Drivers using the new flow may not call ib_dealloc_device except 1435 * in error unwind prior to registration success. 1436 */ 1437 if (ib_dev->ops.dealloc_driver) { 1438 WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1); 1439 ib_dealloc_device(ib_dev); 1440 } 1441 out: 1442 mutex_unlock(&ib_dev->unregistration_lock); 1443 } 1444 1445 /** 1446 * ib_unregister_device - Unregister an IB device 1447 * @device: The device to unregister 1448 * 1449 * Unregister an IB device. All clients will receive a remove callback. 1450 * 1451 * Callers should call this routine only once, and protect against races with 1452 * registration. Typically it should only be called as part of a remove 1453 * callback in an implementation of driver core's struct device_driver and 1454 * related. 1455 * 1456 * If ops.dealloc_driver is used then ib_dev will be freed upon return from 1457 * this function. 
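 *
 * A typical caller is a driver's remove path (illustrative sketch with
 * hypothetical foo_* names):
 *
 *	static void foo_remove_one(struct pci_dev *pdev)
 *	{
 *		struct foo_ibdev *fdev = pci_get_drvdata(pdev);
 *
 *		ib_unregister_device(&fdev->ib_dev);
 *		foo_cleanup_hw(fdev);
 *	}
 *
 * Drivers that do not provide ops.dealloc_driver additionally call
 * ib_dealloc_device() once their hardware teardown is done.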
1458 */ 1459 void ib_unregister_device(struct ib_device *ib_dev) 1460 { 1461 get_device(&ib_dev->dev); 1462 __ib_unregister_device(ib_dev); 1463 put_device(&ib_dev->dev); 1464 } 1465 EXPORT_SYMBOL(ib_unregister_device); 1466 1467 /** 1468 * ib_unregister_device_and_put - Unregister a device while holding a 'get' 1469 * device: The device to unregister 1470 * 1471 * This is the same as ib_unregister_device(), except it includes an internal 1472 * ib_device_put() that should match a 'get' obtained by the caller. 1473 * 1474 * It is safe to call this routine concurrently from multiple threads while 1475 * holding the 'get'. When the function returns the device is fully 1476 * unregistered. 1477 * 1478 * Drivers using this flow MUST use the driver_unregister callback to clean up 1479 * their resources associated with the device and dealloc it. 1480 */ 1481 void ib_unregister_device_and_put(struct ib_device *ib_dev) 1482 { 1483 WARN_ON(!ib_dev->ops.dealloc_driver); 1484 get_device(&ib_dev->dev); 1485 ib_device_put(ib_dev); 1486 __ib_unregister_device(ib_dev); 1487 put_device(&ib_dev->dev); 1488 } 1489 EXPORT_SYMBOL(ib_unregister_device_and_put); 1490 1491 /** 1492 * ib_unregister_driver - Unregister all IB devices for a driver 1493 * @driver_id: The driver to unregister 1494 * 1495 * This implements a fence for device unregistration. It only returns once all 1496 * devices associated with the driver_id have fully completed their 1497 * unregistration and returned from ib_unregister_device*(). 1498 * 1499 * If device's are not yet unregistered it goes ahead and starts unregistering 1500 * them. 1501 * 1502 * This does not block creation of new devices with the given driver_id, that 1503 * is the responsibility of the caller. 1504 */ 1505 void ib_unregister_driver(enum rdma_driver_id driver_id) 1506 { 1507 struct ib_device *ib_dev; 1508 unsigned long index; 1509 1510 down_read(&devices_rwsem); 1511 xa_for_each (&devices, index, ib_dev) { 1512 if (ib_dev->ops.driver_id != driver_id) 1513 continue; 1514 1515 get_device(&ib_dev->dev); 1516 up_read(&devices_rwsem); 1517 1518 WARN_ON(!ib_dev->ops.dealloc_driver); 1519 __ib_unregister_device(ib_dev); 1520 1521 put_device(&ib_dev->dev); 1522 down_read(&devices_rwsem); 1523 } 1524 up_read(&devices_rwsem); 1525 } 1526 EXPORT_SYMBOL(ib_unregister_driver); 1527 1528 static void ib_unregister_work(struct work_struct *work) 1529 { 1530 struct ib_device *ib_dev = 1531 container_of(work, struct ib_device, unregistration_work); 1532 1533 __ib_unregister_device(ib_dev); 1534 put_device(&ib_dev->dev); 1535 } 1536 1537 /** 1538 * ib_unregister_device_queued - Unregister a device using a work queue 1539 * device: The device to unregister 1540 * 1541 * This schedules an asynchronous unregistration using a WQ for the device. A 1542 * driver should use this to avoid holding locks while doing unregistration, 1543 * such as holding the RTNL lock. 1544 * 1545 * Drivers using this API must use ib_unregister_driver before module unload 1546 * to ensure that all scheduled unregistrations have completed. 1547 */ 1548 void ib_unregister_device_queued(struct ib_device *ib_dev) 1549 { 1550 WARN_ON(!refcount_read(&ib_dev->refcount)); 1551 WARN_ON(!ib_dev->ops.dealloc_driver); 1552 get_device(&ib_dev->dev); 1553 if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work)) 1554 put_device(&ib_dev->dev); 1555 } 1556 EXPORT_SYMBOL(ib_unregister_device_queued); 1557 1558 /* 1559 * The caller must pass in a device that has the kref held and the refcount 1560 * released. 
If the device is in cur_net and still registered then it is moved 1561 * into net. 1562 */ 1563 static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, 1564 struct net *net) 1565 { 1566 int ret2 = -EINVAL; 1567 int ret; 1568 1569 mutex_lock(&device->unregistration_lock); 1570 1571 /* 1572 * If a device not under ib_device_get() or if the unregistration_lock 1573 * is not held, the namespace can be changed, or it can be unregistered. 1574 * Check again under the lock. 1575 */ 1576 if (refcount_read(&device->refcount) == 0 || 1577 !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) { 1578 ret = -ENODEV; 1579 goto out; 1580 } 1581 1582 kobject_uevent(&device->dev.kobj, KOBJ_REMOVE); 1583 disable_device(device); 1584 1585 /* 1586 * At this point no one can be using the device, so it is safe to 1587 * change the namespace. 1588 */ 1589 write_pnet(&device->coredev.rdma_net, net); 1590 1591 down_read(&devices_rwsem); 1592 /* 1593 * Currently rdma devices are system wide unique. So the device name 1594 * is guaranteed free in the new namespace. Publish the new namespace 1595 * at the sysfs level. 1596 */ 1597 ret = device_rename(&device->dev, dev_name(&device->dev)); 1598 up_read(&devices_rwsem); 1599 if (ret) { 1600 dev_warn(&device->dev, 1601 "%s: Couldn't rename device after namespace change\n", 1602 __func__); 1603 /* Try and put things back and re-enable the device */ 1604 write_pnet(&device->coredev.rdma_net, cur_net); 1605 } 1606 1607 ret2 = enable_device_and_get(device); 1608 if (ret2) { 1609 /* 1610 * This shouldn't really happen, but if it does, let the user 1611 * retry at later point. So don't disable the device. 1612 */ 1613 dev_warn(&device->dev, 1614 "%s: Couldn't re-enable device after namespace change\n", 1615 __func__); 1616 } 1617 kobject_uevent(&device->dev.kobj, KOBJ_ADD); 1618 1619 ib_device_put(device); 1620 out: 1621 mutex_unlock(&device->unregistration_lock); 1622 if (ret) 1623 return ret; 1624 return ret2; 1625 } 1626 1627 int ib_device_set_netns_put(struct sk_buff *skb, 1628 struct ib_device *dev, u32 ns_fd) 1629 { 1630 struct net *net; 1631 int ret; 1632 1633 net = get_net_ns_by_fd(ns_fd); 1634 if (IS_ERR(net)) { 1635 ret = PTR_ERR(net); 1636 goto net_err; 1637 } 1638 1639 if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { 1640 ret = -EPERM; 1641 goto ns_err; 1642 } 1643 1644 /* 1645 * Currently supported only for those providers which support 1646 * disassociation and don't do port specific sysfs init. Once a 1647 * port_cleanup infrastructure is implemented, this limitation will be 1648 * removed. 1649 */ 1650 if (!dev->ops.disassociate_ucontext || dev->ops.init_port || 1651 ib_devices_shared_netns) { 1652 ret = -EOPNOTSUPP; 1653 goto ns_err; 1654 } 1655 1656 get_device(&dev->dev); 1657 ib_device_put(dev); 1658 ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net); 1659 put_device(&dev->dev); 1660 1661 put_net(net); 1662 return ret; 1663 1664 ns_err: 1665 put_net(net); 1666 net_err: 1667 ib_device_put(dev); 1668 return ret; 1669 } 1670 1671 static struct pernet_operations rdma_dev_net_ops = { 1672 .init = rdma_dev_init_net, 1673 .exit = rdma_dev_exit_net, 1674 .id = &rdma_dev_net_id, 1675 .size = sizeof(struct rdma_dev_net), 1676 }; 1677 1678 static int assign_client_id(struct ib_client *client) 1679 { 1680 int ret; 1681 1682 down_write(&clients_rwsem); 1683 /* 1684 * The add/remove callbacks must be called in FIFO/LIFO order. To 1685 * achieve this we assign client_ids so they are sorted in 1686 * registration order. 
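 *
 * disable_device() relies on this by walking client ids downwards from
 * highest_client_id to get the reverse (LIFO) remove order.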
1687 */ 1688 client->client_id = highest_client_id; 1689 ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); 1690 if (ret) 1691 goto out; 1692 1693 highest_client_id++; 1694 xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); 1695 1696 out: 1697 up_write(&clients_rwsem); 1698 return ret; 1699 } 1700 1701 static void remove_client_id(struct ib_client *client) 1702 { 1703 down_write(&clients_rwsem); 1704 xa_erase(&clients, client->client_id); 1705 for (; highest_client_id; highest_client_id--) 1706 if (xa_load(&clients, highest_client_id - 1)) 1707 break; 1708 up_write(&clients_rwsem); 1709 } 1710 1711 /** 1712 * ib_register_client - Register an IB client 1713 * @client:Client to register 1714 * 1715 * Upper level users of the IB drivers can use ib_register_client() to 1716 * register callbacks for IB device addition and removal. When an IB 1717 * device is added, each registered client's add method will be called 1718 * (in the order the clients were registered), and when a device is 1719 * removed, each client's remove method will be called (in the reverse 1720 * order that clients were registered). In addition, when 1721 * ib_register_client() is called, the client will receive an add 1722 * callback for all devices already registered. 1723 */ 1724 int ib_register_client(struct ib_client *client) 1725 { 1726 struct ib_device *device; 1727 unsigned long index; 1728 int ret; 1729 1730 refcount_set(&client->uses, 1); 1731 init_completion(&client->uses_zero); 1732 ret = assign_client_id(client); 1733 if (ret) 1734 return ret; 1735 1736 down_read(&devices_rwsem); 1737 xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { 1738 ret = add_client_context(device, client); 1739 if (ret) { 1740 up_read(&devices_rwsem); 1741 ib_unregister_client(client); 1742 return ret; 1743 } 1744 } 1745 up_read(&devices_rwsem); 1746 return 0; 1747 } 1748 EXPORT_SYMBOL(ib_register_client); 1749 1750 /** 1751 * ib_unregister_client - Unregister an IB client 1752 * @client:Client to unregister 1753 * 1754 * Upper level users use ib_unregister_client() to remove their client 1755 * registration. When ib_unregister_client() is called, the client 1756 * will receive a remove callback for each IB device still registered. 1757 * 1758 * This is a full fence, once it returns no client callbacks will be called, 1759 * or are running in another thread. 1760 */ 1761 void ib_unregister_client(struct ib_client *client) 1762 { 1763 struct ib_device *device; 1764 unsigned long index; 1765 1766 down_write(&clients_rwsem); 1767 ib_client_put(client); 1768 xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); 1769 up_write(&clients_rwsem); 1770 1771 /* We do not want to have locks while calling client->remove() */ 1772 rcu_read_lock(); 1773 xa_for_each (&devices, index, device) { 1774 if (!ib_device_try_get(device)) 1775 continue; 1776 rcu_read_unlock(); 1777 1778 remove_client_context(device, client->client_id); 1779 1780 ib_device_put(device); 1781 rcu_read_lock(); 1782 } 1783 rcu_read_unlock(); 1784 1785 /* 1786 * remove_client_context() is not a fence, it can return even though a 1787 * removal is ongoing. Wait until all removals are completed. 
1788 */ 1789 wait_for_completion(&client->uses_zero); 1790 remove_client_id(client); 1791 } 1792 EXPORT_SYMBOL(ib_unregister_client); 1793 1794 static int __ib_get_global_client_nl_info(const char *client_name, 1795 struct ib_client_nl_info *res) 1796 { 1797 struct ib_client *client; 1798 unsigned long index; 1799 int ret = -ENOENT; 1800 1801 down_read(&clients_rwsem); 1802 xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { 1803 if (strcmp(client->name, client_name) != 0) 1804 continue; 1805 if (!client->get_global_nl_info) { 1806 ret = -EOPNOTSUPP; 1807 break; 1808 } 1809 ret = client->get_global_nl_info(res); 1810 if (WARN_ON(ret == -ENOENT)) 1811 ret = -EINVAL; 1812 if (!ret && res->cdev) 1813 get_device(res->cdev); 1814 break; 1815 } 1816 up_read(&clients_rwsem); 1817 return ret; 1818 } 1819 1820 static int __ib_get_client_nl_info(struct ib_device *ibdev, 1821 const char *client_name, 1822 struct ib_client_nl_info *res) 1823 { 1824 unsigned long index; 1825 void *client_data; 1826 int ret = -ENOENT; 1827 1828 down_read(&ibdev->client_data_rwsem); 1829 xan_for_each_marked (&ibdev->client_data, index, client_data, 1830 CLIENT_DATA_REGISTERED) { 1831 struct ib_client *client = xa_load(&clients, index); 1832 1833 if (!client || strcmp(client->name, client_name) != 0) 1834 continue; 1835 if (!client->get_nl_info) { 1836 ret = -EOPNOTSUPP; 1837 break; 1838 } 1839 ret = client->get_nl_info(ibdev, client_data, res); 1840 if (WARN_ON(ret == -ENOENT)) 1841 ret = -EINVAL; 1842 1843 /* 1844 * The cdev is guaranteed valid as long as we are inside the 1845 * client_data_rwsem as remove_one can't be called. Keep it 1846 * valid for the caller. 1847 */ 1848 if (!ret && res->cdev) 1849 get_device(res->cdev); 1850 break; 1851 } 1852 up_read(&ibdev->client_data_rwsem); 1853 1854 return ret; 1855 } 1856 1857 /** 1858 * ib_get_client_nl_info - Fetch the nl_info from a client 1859 * @device - IB device 1860 * @client_name - Name of the client 1861 * @res - Result of the query 1862 */ 1863 int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, 1864 struct ib_client_nl_info *res) 1865 { 1866 int ret; 1867 1868 if (ibdev) 1869 ret = __ib_get_client_nl_info(ibdev, client_name, res); 1870 else 1871 ret = __ib_get_global_client_nl_info(client_name, res); 1872 #ifdef CONFIG_MODULES 1873 if (ret == -ENOENT) { 1874 request_module("rdma-client-%s", client_name); 1875 if (ibdev) 1876 ret = __ib_get_client_nl_info(ibdev, client_name, res); 1877 else 1878 ret = __ib_get_global_client_nl_info(client_name, res); 1879 } 1880 #endif 1881 if (ret) { 1882 if (ret == -ENOENT) 1883 return -EOPNOTSUPP; 1884 return ret; 1885 } 1886 1887 if (WARN_ON(!res->cdev)) 1888 return -EINVAL; 1889 return 0; 1890 } 1891 1892 /** 1893 * ib_set_client_data - Set IB client context 1894 * @device:Device to set context for 1895 * @client:Client to set context for 1896 * @data:Context to set 1897 * 1898 * ib_set_client_data() sets client context data that can be retrieved with 1899 * ib_get_client_data(). This can only be called while the client is 1900 * registered to the device, once the ib_client remove() callback returns this 1901 * cannot be called. 
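 *
 * A minimal sketch of the usual pattern from a client's add() callback
 * (foo_client and the private struct are hypothetical):
 *
 *	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
 *	if (priv)
 *		ib_set_client_data(device, &foo_client, priv);
 *
 * The data is later retrieved with ib_get_client_data() and typically freed
 * from the client's remove() callback.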
1902 */ 1903 void ib_set_client_data(struct ib_device *device, struct ib_client *client, 1904 void *data) 1905 { 1906 void *rc; 1907 1908 if (WARN_ON(IS_ERR(data))) 1909 data = NULL; 1910 1911 rc = xa_store(&device->client_data, client->client_id, data, 1912 GFP_KERNEL); 1913 WARN_ON(xa_is_err(rc)); 1914 } 1915 EXPORT_SYMBOL(ib_set_client_data); 1916 1917 /** 1918 * ib_register_event_handler - Register an IB event handler 1919 * @event_handler:Handler to register 1920 * 1921 * ib_register_event_handler() registers an event handler that will be 1922 * called back when asynchronous IB events occur (as defined in 1923 * chapter 11 of the InfiniBand Architecture Specification). This 1924 * callback may occur in interrupt context. 1925 */ 1926 void ib_register_event_handler(struct ib_event_handler *event_handler) 1927 { 1928 unsigned long flags; 1929 1930 spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); 1931 list_add_tail(&event_handler->list, 1932 &event_handler->device->event_handler_list); 1933 spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); 1934 } 1935 EXPORT_SYMBOL(ib_register_event_handler); 1936 1937 /** 1938 * ib_unregister_event_handler - Unregister an event handler 1939 * @event_handler:Handler to unregister 1940 * 1941 * Unregister an event handler registered with 1942 * ib_register_event_handler(). 1943 */ 1944 void ib_unregister_event_handler(struct ib_event_handler *event_handler) 1945 { 1946 unsigned long flags; 1947 1948 spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); 1949 list_del(&event_handler->list); 1950 spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); 1951 } 1952 EXPORT_SYMBOL(ib_unregister_event_handler); 1953 1954 /** 1955 * ib_dispatch_event - Dispatch an asynchronous event 1956 * @event:Event to dispatch 1957 * 1958 * Low-level drivers must call ib_dispatch_event() to dispatch the 1959 * event to all registered event handlers when an asynchronous event 1960 * occurs. 
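 *
 * For example, a driver reporting a port becoming active might do the
 * following (illustrative sketch; ibdev and port_num stand for the driver's
 * device and port):
 *
 *	struct ib_event event = {
 *		.device = ibdev,
 *		.event = IB_EVENT_PORT_ACTIVE,
 *		.element.port_num = port_num,
 *	};
 *
 *	ib_dispatch_event(&event);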
/**
 * ib_dispatch_event - Dispatch an asynchronous event
 * @event:Event to dispatch
 *
 * Low-level drivers must call ib_dispatch_event() to dispatch the
 * event to all registered event handlers when an asynchronous event
 * occurs.
 */
void ib_dispatch_event(struct ib_event *event)
{
	unsigned long flags;
	struct ib_event_handler *handler;

	spin_lock_irqsave(&event->device->event_handler_lock, flags);

	list_for_each_entry(handler, &event->device->event_handler_list, list)
		handler->handler(handler, event);

	spin_unlock_irqrestore(&event->device->event_handler_lock, flags);
}
EXPORT_SYMBOL(ib_dispatch_event);

static int iw_query_port(struct ib_device *device,
			 u8 port_num,
			 struct ib_port_attr *port_attr)
{
	struct in_device *inetdev;
	struct net_device *netdev;
	int err;

	memset(port_attr, 0, sizeof(*port_attr));

	netdev = ib_device_get_netdev(device, port_num);
	if (!netdev)
		return -ENODEV;

	port_attr->max_mtu = IB_MTU_4096;
	port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu);

	if (!netif_carrier_ok(netdev)) {
		port_attr->state = IB_PORT_DOWN;
		port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED;
	} else {
		inetdev = in_dev_get(netdev);

		if (inetdev && inetdev->ifa_list) {
			port_attr->state = IB_PORT_ACTIVE;
			port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
		} else {
			port_attr->state = IB_PORT_INIT;
			port_attr->phys_state =
				IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING;
		}

		if (inetdev)
			in_dev_put(inetdev);
	}

	/* Drop the reference only after the last use of netdev above. */
	dev_put(netdev);

	err = device->ops.query_port(device, port_num, port_attr);
	if (err)
		return err;

	return 0;
}

static int __ib_query_port(struct ib_device *device,
			   u8 port_num,
			   struct ib_port_attr *port_attr)
{
	union ib_gid gid = {};
	int err;

	memset(port_attr, 0, sizeof(*port_attr));

	err = device->ops.query_port(device, port_num, port_attr);
	if (err || port_attr->subnet_prefix)
		return err;

	if (rdma_port_get_link_layer(device, port_num) !=
	    IB_LINK_LAYER_INFINIBAND)
		return 0;

	err = device->ops.query_gid(device, port_num, 0, &gid);
	if (err)
		return err;

	port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix);
	return 0;
}

/**
 * ib_query_port - Query IB port attributes
 * @device:Device to query
 * @port_num:Port number to query
 * @port_attr:Port attributes
 *
 * ib_query_port() returns the attributes of a port through the
 * @port_attr pointer.
 */
int ib_query_port(struct ib_device *device,
		  u8 port_num,
		  struct ib_port_attr *port_attr)
{
	if (!rdma_is_port_valid(device, port_num))
		return -EINVAL;

	if (rdma_protocol_iwarp(device, port_num))
		return iw_query_port(device, port_num, port_attr);
	else
		return __ib_query_port(device, port_num, port_attr);
}
EXPORT_SYMBOL(ib_query_port);
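/*
 * Example (illustrative sketch only; the example_* name is hypothetical): a
 * consumer typically validates the port state and MTU via ib_query_port()
 * before posting work on that port.
 *
 *	static int example_check_port(struct ib_device *device, u8 port_num)
 *	{
 *		struct ib_port_attr attr;
 *		int ret;
 *
 *		ret = ib_query_port(device, port_num, &attr);
 *		if (ret)
 *			return ret;
 *
 *		if (attr.state != IB_PORT_ACTIVE)
 *			return -ENETDOWN;
 *
 *		pr_debug("port %u active_mtu=%d\n", port_num, attr.active_mtu);
 *		return 0;
 *	}
 */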
static void add_ndev_hash(struct ib_port_data *pdata)
{
	unsigned long flags;

	might_sleep();

	spin_lock_irqsave(&ndev_hash_lock, flags);
	if (hash_hashed(&pdata->ndev_hash_link)) {
		hash_del_rcu(&pdata->ndev_hash_link);
		spin_unlock_irqrestore(&ndev_hash_lock, flags);
		/*
		 * We cannot do hash_add_rcu after a hash_del_rcu until the
		 * grace period has elapsed.
		 */
		synchronize_rcu();
		spin_lock_irqsave(&ndev_hash_lock, flags);
	}
	if (pdata->netdev)
		hash_add_rcu(ndev_hash, &pdata->ndev_hash_link,
			     (uintptr_t)pdata->netdev);
	spin_unlock_irqrestore(&ndev_hash_lock, flags);
}

/**
 * ib_device_set_netdev - Associate the ib_dev with an underlying net_device
 * @ib_dev: Device to modify
 * @ndev: net_device to affiliate, may be NULL
 * @port: IB port the net_device is connected to
 *
 * Drivers should use this to link the ib_device to a netdev so the netdev
 * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be
 * affiliated with any port.
 *
 * The caller must ensure that the given ndev is not unregistered or
 * unregistering, and that either the ib_device is unregistered or
 * ib_device_set_netdev() is called with NULL when the ndev sends a
 * NETDEV_UNREGISTER event.
 */
int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
			 unsigned int port)
{
	struct net_device *old_ndev;
	struct ib_port_data *pdata;
	unsigned long flags;
	int ret;

	/*
	 * Drivers wish to call this before ib_register_device(), so we have
	 * to set up the port data early.
	 */
	ret = alloc_port_data(ib_dev);
	if (ret)
		return ret;

	if (!rdma_is_port_valid(ib_dev, port))
		return -EINVAL;

	pdata = &ib_dev->port_data[port];
	spin_lock_irqsave(&pdata->netdev_lock, flags);
	old_ndev = rcu_dereference_protected(
		pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
	if (old_ndev == ndev) {
		spin_unlock_irqrestore(&pdata->netdev_lock, flags);
		return 0;
	}

	if (ndev)
		dev_hold(ndev);
	rcu_assign_pointer(pdata->netdev, ndev);
	spin_unlock_irqrestore(&pdata->netdev_lock, flags);

	add_ndev_hash(pdata);
	if (old_ndev)
		dev_put(old_ndev);

	return 0;
}
EXPORT_SYMBOL(ib_device_set_netdev);

static void free_netdevs(struct ib_device *ib_dev)
{
	unsigned long flags;
	unsigned int port;

	if (!ib_dev->port_data)
		return;

	rdma_for_each_port (ib_dev, port) {
		struct ib_port_data *pdata = &ib_dev->port_data[port];
		struct net_device *ndev;

		spin_lock_irqsave(&pdata->netdev_lock, flags);
		ndev = rcu_dereference_protected(
			pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
		if (ndev) {
			spin_lock(&ndev_hash_lock);
			hash_del_rcu(&pdata->ndev_hash_link);
			spin_unlock(&ndev_hash_lock);

			/*
			 * If this is the last dev_put there is still a
			 * synchronize_rcu before the netdev is kfreed, so we
			 * can continue to rely on unlocked pointer
			 * comparisons after the put.
			 */
			rcu_assign_pointer(pdata->netdev, NULL);
			dev_put(ndev);
		}
		spin_unlock_irqrestore(&pdata->netdev_lock, flags);
	}
}

struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
					unsigned int port)
{
	struct ib_port_data *pdata;
	struct net_device *res;

	if (!rdma_is_port_valid(ib_dev, port))
		return NULL;

	pdata = &ib_dev->port_data[port];

	/*
	 * New drivers should use ib_device_set_netdev() not the legacy
	 * get_netdev().
	 */
	if (ib_dev->ops.get_netdev) {
		res = ib_dev->ops.get_netdev(ib_dev, port);
	} else {
		spin_lock(&pdata->netdev_lock);
		res = rcu_dereference_protected(
			pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
		if (res)
			dev_hold(res);
		spin_unlock(&pdata->netdev_lock);
	}

	/*
	 * If we are starting to unregister, expedite things by preventing
	 * propagation of an unregistering netdev.
	 */
	if (res && res->reg_state != NETREG_REGISTERED) {
		dev_put(res);
		return NULL;
	}

	return res;
}
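/*
 * Example (illustrative sketch only; the example_* names are hypothetical):
 * a RoCE driver normally binds its netdev right after allocating the
 * ib_device and drops the association when the netdev goes away.
 *
 *	static int example_bind_netdev(struct ib_device *ib_dev,
 *				       struct net_device *ndev)
 *	{
 *		// Port numbers are 1-based.
 *		return ib_device_set_netdev(ib_dev, ndev, 1);
 *	}
 *
 *	static void example_netdev_unregister(struct ib_device *ib_dev)
 *	{
 *		// Clear the association on NETDEV_UNREGISTER.
 *		ib_device_set_netdev(ib_dev, NULL, 1);
 *	}
 */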
/**
 * ib_device_get_by_netdev - Find an IB device associated with a netdev
 * @ndev: netdev to locate
 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
 *
 * Find and hold an ib_device that is associated with a netdev via
 * ib_device_set_netdev(). The caller must call ib_device_put() on the
 * returned pointer.
 */
struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
					   enum rdma_driver_id driver_id)
{
	struct ib_device *res = NULL;
	struct ib_port_data *cur;

	rcu_read_lock();
	hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link,
				    (uintptr_t)ndev) {
		if (rcu_access_pointer(cur->netdev) == ndev &&
		    (driver_id == RDMA_DRIVER_UNKNOWN ||
		     cur->ib_dev->ops.driver_id == driver_id) &&
		    ib_device_try_get(cur->ib_dev)) {
			res = cur->ib_dev;
			break;
		}
	}
	rcu_read_unlock();

	return res;
}
EXPORT_SYMBOL(ib_device_get_by_netdev);

/**
 * ib_enum_roce_netdev - enumerate all RoCE ports
 * @ib_dev: IB device we want to query
 * @filter: Should we call the callback?
 * @filter_cookie: Cookie passed to the filter
 * @cb: Callback to call for each found RoCE port
 * @cookie: Cookie passed back to the callback
 *
 * Enumerates all of the physical RoCE ports of ib_dev which are related to a
 * netdevice and calls the callback on each port for which the filter
 * function returns non-zero.
 */
void ib_enum_roce_netdev(struct ib_device *ib_dev,
			 roce_netdev_filter filter,
			 void *filter_cookie,
			 roce_netdev_callback cb,
			 void *cookie)
{
	unsigned int port;

	rdma_for_each_port (ib_dev, port)
		if (rdma_protocol_roce(ib_dev, port)) {
			struct net_device *idev =
				ib_device_get_netdev(ib_dev, port);

			if (filter(ib_dev, port, idev, filter_cookie))
				cb(ib_dev, port, idev, cookie);

			if (idev)
				dev_put(idev);
		}
}

/**
 * ib_enum_all_roce_netdevs - enumerate all RoCE devices
 * @filter: Should we call the callback?
 * @filter_cookie: Cookie passed to the filter
 * @cb: Callback to call for each found RoCE port
 * @cookie: Cookie passed back to the callback
 *
 * Enumerates the physical ports of all RoCE devices which are related to
 * netdevices and calls the callback on each port for which the filter
 * function returns non-zero.
 */
void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
			      void *filter_cookie,
			      roce_netdev_callback cb,
			      void *cookie)
{
	struct ib_device *dev;
	unsigned long index;

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED)
		ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
	up_read(&devices_rwsem);
}

/**
 * ib_enum_all_devs - enumerate all ib_devices
 * @nldev_cb: Callback to call for each found ib_device
 * @skb: The netlink skb being filled by the dump
 * @cb: The netlink callback driving the dump
 *
 * Enumerates all registered ib_devices visible in the caller's net namespace
 * and calls @nldev_cb on each one.
 */
int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
		     struct netlink_callback *cb)
{
	unsigned long index;
	struct ib_device *dev;
	unsigned int idx = 0;
	int ret = 0;

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
		if (!rdma_dev_access_netns(dev, sock_net(skb->sk)))
			continue;

		ret = nldev_cb(dev, skb, cb, idx);
		if (ret)
			break;
		idx++;
	}
	up_read(&devices_rwsem);
	return ret;
}
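/*
 * Example (illustrative sketch only; the example_* names are hypothetical):
 * a filter/callback pair as used with ib_enum_roce_netdev() and
 * ib_enum_all_roce_netdevs() above. The filter decides whether a port is
 * interesting, the callback does the per-port work.
 *
 *	static int example_filter(struct ib_device *ib_dev, u8 port,
 *				  struct net_device *idev, void *cookie)
 *	{
 *		return idev && idev == cookie;	// only the netdev we care about
 *	}
 *
 *	static void example_cb(struct ib_device *ib_dev, u8 port,
 *			       struct net_device *idev, void *cookie)
 *	{
 *		pr_debug("%s port %u is backed by %s\n",
 *			 dev_name(&ib_dev->dev), port, idev->name);
 *	}
 *
 *	// ib_enum_all_roce_netdevs(example_filter, ndev, example_cb, NULL);
 */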
/**
 * ib_query_pkey - Get P_Key table entry
 * @device:Device to query
 * @port_num:Port number to query
 * @index:P_Key table index to query
 * @pkey:Returned P_Key
 *
 * ib_query_pkey() fetches the specified P_Key table entry.
 */
int ib_query_pkey(struct ib_device *device,
		  u8 port_num, u16 index, u16 *pkey)
{
	if (!rdma_is_port_valid(device, port_num))
		return -EINVAL;

	return device->ops.query_pkey(device, port_num, index, pkey);
}
EXPORT_SYMBOL(ib_query_pkey);

/**
 * ib_modify_device - Change IB device attributes
 * @device:Device to modify
 * @device_modify_mask:Mask of attributes to change
 * @device_modify:New attribute values
 *
 * ib_modify_device() changes a device's attributes as specified by
 * the @device_modify_mask and @device_modify structure.
 */
int ib_modify_device(struct ib_device *device,
		     int device_modify_mask,
		     struct ib_device_modify *device_modify)
{
	if (!device->ops.modify_device)
		return -ENOSYS;

	return device->ops.modify_device(device, device_modify_mask,
					 device_modify);
}
EXPORT_SYMBOL(ib_modify_device);

/**
 * ib_modify_port - Modifies the attributes for the specified port.
 * @device: The device to modify.
 * @port_num: The number of the port to modify.
 * @port_modify_mask: Mask used to specify which attributes of the port
 *   to change.
 * @port_modify: New attribute values for the port.
 *
 * ib_modify_port() changes a port's attributes as specified by the
 * @port_modify_mask and @port_modify structure.
 */
int ib_modify_port(struct ib_device *device,
		   u8 port_num, int port_modify_mask,
		   struct ib_port_modify *port_modify)
{
	int rc;

	if (!rdma_is_port_valid(device, port_num))
		return -EINVAL;

	if (device->ops.modify_port)
		rc = device->ops.modify_port(device, port_num,
					     port_modify_mask,
					     port_modify);
	else
		rc = rdma_protocol_roce(device, port_num) ? 0 : -ENOSYS;
	return rc;
}
EXPORT_SYMBOL(ib_modify_port);

/**
 * ib_find_gid - Returns the port number and GID table index where
 *   a specified GID value occurs. It searches only the IB link layer.
 * @device: The device to query.
 * @gid: The GID value to search for.
 * @port_num: The port number of the device where the GID value was found.
 * @index: The index into the GID table where the GID was found. This
 *   parameter may be NULL.
 */
int ib_find_gid(struct ib_device *device, union ib_gid *gid,
		u8 *port_num, u16 *index)
{
	union ib_gid tmp_gid;
	unsigned int port;
	int ret, i;

	rdma_for_each_port (device, port) {
		if (!rdma_protocol_ib(device, port))
			continue;

		for (i = 0; i < device->port_data[port].immutable.gid_tbl_len;
		     ++i) {
			ret = rdma_query_gid(device, port, i, &tmp_gid);
			if (ret)
				return ret;
			if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
				*port_num = port;
				if (index)
					*index = i;
				return 0;
			}
		}
	}

	return -ENOENT;
}
EXPORT_SYMBOL(ib_find_gid);
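/*
 * Example (illustrative sketch only; the example_* name is hypothetical):
 * looking up which local port owns a given GID with ib_find_gid() above.
 *
 *	static int example_port_of_gid(struct ib_device *device,
 *				       union ib_gid *gid, u8 *port_num)
 *	{
 *		u16 index;
 *		int ret;
 *
 *		ret = ib_find_gid(device, gid, port_num, &index);
 *		if (ret)
 *			return ret;	// -ENOENT if the GID is not local
 *
 *		pr_debug("gid found on port %u index %u\n", *port_num, index);
 *		return 0;
 *	}
 */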
/**
 * ib_find_pkey - Returns the PKey table index where a specified
 *   PKey value occurs.
 * @device: The device to query.
 * @port_num: The port number of the device to search for the PKey.
 * @pkey: The PKey value to search for.
 * @index: The index into the PKey table where the PKey was found.
 */
int ib_find_pkey(struct ib_device *device,
		 u8 port_num, u16 pkey, u16 *index)
{
	int ret, i;
	u16 tmp_pkey;
	int partial_ix = -1;

	for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len;
	     ++i) {
		ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
		if (ret)
			return ret;
		if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
			/* if there is a full-member pkey take it */
			if (tmp_pkey & 0x8000) {
				*index = i;
				return 0;
			}
			if (partial_ix < 0)
				partial_ix = i;
		}
	}

	/* no full-member pkey; if a limited-member one exists, take it */
	if (partial_ix >= 0) {
		*index = partial_ix;
		return 0;
	}
	return -ENOENT;
}
EXPORT_SYMBOL(ib_find_pkey);

/**
 * ib_get_net_dev_by_params() - Return the appropriate net_dev
 * for a received CM request
 * @dev:	An RDMA device on which the request has been received.
 * @port:	Port number on the RDMA device.
 * @pkey:	The Pkey the request came on.
 * @gid:	A GID that the net_dev uses to communicate.
 * @addr:	Contains the IP address that the request specified as its
 *		destination.
 */
struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
					    u8 port,
					    u16 pkey,
					    const union ib_gid *gid,
					    const struct sockaddr *addr)
{
	struct net_device *net_dev = NULL;
	unsigned long index;
	void *client_data;

	if (!rdma_protocol_ib(dev, port))
		return NULL;

	/*
	 * Holding the read side guarantees that the client will not become
	 * unregistered while we are calling get_net_dev_by_params().
	 */
	down_read(&dev->client_data_rwsem);
	xan_for_each_marked (&dev->client_data, index, client_data,
			     CLIENT_DATA_REGISTERED) {
		struct ib_client *client = xa_load(&clients, index);

		if (!client || !client->get_net_dev_by_params)
			continue;

		net_dev = client->get_net_dev_by_params(dev, port, pkey, gid,
							addr, client_data);
		if (net_dev)
			break;
	}
	up_read(&dev->client_data_rwsem);

	return net_dev;
}
EXPORT_SYMBOL(ib_get_net_dev_by_params);

void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
{
	struct ib_device_ops *dev_ops = &dev->ops;
#define SET_DEVICE_OP(ptr, name) \
	do {                                                                   \
		if (ops->name)                                                 \
			if (!((ptr)->name))                                    \
				(ptr)->name = ops->name;                       \
	} while (0)

#define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name)

	if (ops->driver_id != RDMA_DRIVER_UNKNOWN) {
		WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN &&
			dev_ops->driver_id != ops->driver_id);
		dev_ops->driver_id = ops->driver_id;
	}
	if (ops->owner) {
		WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner);
		dev_ops->owner = ops->owner;
	}
	if (ops->uverbs_abi_ver)
		dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver;

	dev_ops->uverbs_no_driver_id_binding |=
		ops->uverbs_no_driver_id_binding;

	SET_DEVICE_OP(dev_ops, add_gid);
	SET_DEVICE_OP(dev_ops, advise_mr);
	SET_DEVICE_OP(dev_ops, alloc_dm);
	SET_DEVICE_OP(dev_ops, alloc_fmr);
	SET_DEVICE_OP(dev_ops, alloc_hw_stats);
	SET_DEVICE_OP(dev_ops, alloc_mr);
	SET_DEVICE_OP(dev_ops, alloc_mr_integrity);
	SET_DEVICE_OP(dev_ops, alloc_mw);
	SET_DEVICE_OP(dev_ops, alloc_pd);
	SET_DEVICE_OP(dev_ops, alloc_rdma_netdev);
	SET_DEVICE_OP(dev_ops, alloc_ucontext);
	SET_DEVICE_OP(dev_ops, alloc_xrcd);
	SET_DEVICE_OP(dev_ops, attach_mcast);
	SET_DEVICE_OP(dev_ops, check_mr_status);
	SET_DEVICE_OP(dev_ops, counter_alloc_stats);
	SET_DEVICE_OP(dev_ops, counter_bind_qp);
	SET_DEVICE_OP(dev_ops, counter_dealloc);
	SET_DEVICE_OP(dev_ops, counter_unbind_qp);
	SET_DEVICE_OP(dev_ops, counter_update_stats);
	SET_DEVICE_OP(dev_ops, create_ah);
	SET_DEVICE_OP(dev_ops, create_counters);
	SET_DEVICE_OP(dev_ops, create_cq);
	SET_DEVICE_OP(dev_ops, create_flow);
	SET_DEVICE_OP(dev_ops, create_flow_action_esp);
	SET_DEVICE_OP(dev_ops, create_qp);
	SET_DEVICE_OP(dev_ops, create_rwq_ind_table);
	SET_DEVICE_OP(dev_ops, create_srq);
	SET_DEVICE_OP(dev_ops, create_wq);
	SET_DEVICE_OP(dev_ops, dealloc_dm);
	SET_DEVICE_OP(dev_ops, dealloc_driver);
	SET_DEVICE_OP(dev_ops, dealloc_fmr);
	SET_DEVICE_OP(dev_ops, dealloc_mw);
	SET_DEVICE_OP(dev_ops, dealloc_pd);
	SET_DEVICE_OP(dev_ops, dealloc_ucontext);
	SET_DEVICE_OP(dev_ops, dealloc_xrcd);
	SET_DEVICE_OP(dev_ops, del_gid);
	SET_DEVICE_OP(dev_ops, dereg_mr);
	SET_DEVICE_OP(dev_ops, destroy_ah);
	SET_DEVICE_OP(dev_ops, destroy_counters);
	SET_DEVICE_OP(dev_ops, destroy_cq);
	SET_DEVICE_OP(dev_ops, destroy_flow);
	SET_DEVICE_OP(dev_ops, destroy_flow_action);
	SET_DEVICE_OP(dev_ops, destroy_qp);
	SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table);
	SET_DEVICE_OP(dev_ops, destroy_srq);
	SET_DEVICE_OP(dev_ops, destroy_wq);
	SET_DEVICE_OP(dev_ops, detach_mcast);
	SET_DEVICE_OP(dev_ops, disassociate_ucontext);
	SET_DEVICE_OP(dev_ops, drain_rq);
	SET_DEVICE_OP(dev_ops, drain_sq);
	SET_DEVICE_OP(dev_ops, enable_driver);
	SET_DEVICE_OP(dev_ops, fill_res_entry);
	SET_DEVICE_OP(dev_ops, get_dev_fw_str);
	SET_DEVICE_OP(dev_ops, get_dma_mr);
	SET_DEVICE_OP(dev_ops, get_hw_stats);
	SET_DEVICE_OP(dev_ops, get_link_layer);
	SET_DEVICE_OP(dev_ops, get_netdev);
	SET_DEVICE_OP(dev_ops, get_port_immutable);
	SET_DEVICE_OP(dev_ops, get_vector_affinity);
	SET_DEVICE_OP(dev_ops, get_vf_config);
	SET_DEVICE_OP(dev_ops, get_vf_stats);
	SET_DEVICE_OP(dev_ops, init_port);
	SET_DEVICE_OP(dev_ops, invalidate_range);
	SET_DEVICE_OP(dev_ops, iw_accept);
	SET_DEVICE_OP(dev_ops, iw_add_ref);
	SET_DEVICE_OP(dev_ops, iw_connect);
	SET_DEVICE_OP(dev_ops, iw_create_listen);
	SET_DEVICE_OP(dev_ops, iw_destroy_listen);
	SET_DEVICE_OP(dev_ops, iw_get_qp);
	SET_DEVICE_OP(dev_ops, iw_reject);
	SET_DEVICE_OP(dev_ops, iw_rem_ref);
	SET_DEVICE_OP(dev_ops, map_mr_sg);
	SET_DEVICE_OP(dev_ops, map_mr_sg_pi);
	SET_DEVICE_OP(dev_ops, map_phys_fmr);
	SET_DEVICE_OP(dev_ops, mmap);
	SET_DEVICE_OP(dev_ops, modify_ah);
	SET_DEVICE_OP(dev_ops, modify_cq);
	SET_DEVICE_OP(dev_ops, modify_device);
	SET_DEVICE_OP(dev_ops, modify_flow_action_esp);
	SET_DEVICE_OP(dev_ops, modify_port);
	SET_DEVICE_OP(dev_ops, modify_qp);
	SET_DEVICE_OP(dev_ops, modify_srq);
	SET_DEVICE_OP(dev_ops, modify_wq);
	SET_DEVICE_OP(dev_ops, peek_cq);
	SET_DEVICE_OP(dev_ops, poll_cq);
	SET_DEVICE_OP(dev_ops, post_recv);
	SET_DEVICE_OP(dev_ops, post_send);
	SET_DEVICE_OP(dev_ops, post_srq_recv);
	SET_DEVICE_OP(dev_ops, process_mad);
	SET_DEVICE_OP(dev_ops, query_ah);
	SET_DEVICE_OP(dev_ops, query_device);
	SET_DEVICE_OP(dev_ops, query_gid);
	SET_DEVICE_OP(dev_ops, query_pkey);
	SET_DEVICE_OP(dev_ops, query_port);
	SET_DEVICE_OP(dev_ops, query_qp);
	SET_DEVICE_OP(dev_ops, query_srq);
	SET_DEVICE_OP(dev_ops, rdma_netdev_get_params);
	SET_DEVICE_OP(dev_ops, read_counters);
	SET_DEVICE_OP(dev_ops, reg_dm_mr);
	SET_DEVICE_OP(dev_ops, reg_user_mr);
	SET_DEVICE_OP(dev_ops, req_ncomp_notif);
	SET_DEVICE_OP(dev_ops, req_notify_cq);
	SET_DEVICE_OP(dev_ops, rereg_user_mr);
	SET_DEVICE_OP(dev_ops, resize_cq);
	SET_DEVICE_OP(dev_ops, set_vf_guid);
	SET_DEVICE_OP(dev_ops, set_vf_link_state);
	SET_DEVICE_OP(dev_ops, unmap_fmr);

	SET_OBJ_SIZE(dev_ops, ib_ah);
	SET_OBJ_SIZE(dev_ops, ib_cq);
	SET_OBJ_SIZE(dev_ops, ib_pd);
	SET_OBJ_SIZE(dev_ops, ib_srq);
	SET_OBJ_SIZE(dev_ops, ib_ucontext);
}
EXPORT_SYMBOL(ib_set_device_ops);
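/*
 * Example (illustrative sketch only; the example_* names are hypothetical):
 * a driver fills a static const ops table and hands it to
 * ib_set_device_ops() before registering the device; ops that are already
 * set are not overwritten. Real drivers also set .driver_id.
 *
 *	static const struct ib_device_ops example_dev_ops = {
 *		.owner = THIS_MODULE,
 *		.query_device = example_query_device,
 *		.query_port = example_query_port,
 *	};
 *
 *	// In the probe path, before ib_register_device():
 *	// ib_set_device_ops(ibdev, &example_dev_ops);
 */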
static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = {
	[RDMA_NL_LS_OP_RESOLVE] = {
		.doit = ib_nl_handle_resolve_resp,
		.flags = RDMA_NL_ADMIN_PERM,
	},
	[RDMA_NL_LS_OP_SET_TIMEOUT] = {
		.doit = ib_nl_handle_set_timeout,
		.flags = RDMA_NL_ADMIN_PERM,
	},
	[RDMA_NL_LS_OP_IP_RESOLVE] = {
		.doit = ib_nl_handle_ip_res_resp,
		.flags = RDMA_NL_ADMIN_PERM,
	},
};

static int __init ib_core_init(void)
{
	int ret;

	ib_wq = alloc_workqueue("infiniband", 0, 0);
	if (!ib_wq)
		return -ENOMEM;

	ib_comp_wq = alloc_workqueue("ib-comp-wq",
			WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
	if (!ib_comp_wq) {
		ret = -ENOMEM;
		goto err;
	}

	ib_comp_unbound_wq =
		alloc_workqueue("ib-comp-unb-wq",
				WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM |
				WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE);
	if (!ib_comp_unbound_wq) {
		ret = -ENOMEM;
		goto err_comp;
	}

	ret = class_register(&ib_class);
	if (ret) {
		pr_warn("Couldn't create InfiniBand device class\n");
		goto err_comp_unbound;
	}

	ret = addr_init();
	if (ret) {
		pr_warn("Couldn't init IB address resolution\n");
		goto err_ibnl;
	}

	ret = ib_mad_init();
	if (ret) {
		pr_warn("Couldn't init IB MAD\n");
		goto err_addr;
	}

	ret = ib_sa_init();
	if (ret) {
		pr_warn("Couldn't init SA\n");
		goto err_mad;
	}

	ret = register_blocking_lsm_notifier(&ibdev_lsm_nb);
	if (ret) {
		pr_warn("Couldn't register LSM notifier. ret %d\n", ret);
		goto err_sa;
	}

	ret = register_pernet_device(&rdma_dev_net_ops);
	if (ret) {
		pr_warn("Couldn't init compat dev. ret %d\n", ret);
		goto err_compat;
	}

	nldev_init();
	rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
	roce_gid_mgmt_init();

	return 0;

err_compat:
	unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
err_sa:
	ib_sa_cleanup();
err_mad:
	ib_mad_cleanup();
err_addr:
	addr_cleanup();
err_ibnl:
	class_unregister(&ib_class);
err_comp_unbound:
	destroy_workqueue(ib_comp_unbound_wq);
err_comp:
	destroy_workqueue(ib_comp_wq);
err:
	destroy_workqueue(ib_wq);
	return ret;
}

static void __exit ib_core_cleanup(void)
{
	roce_gid_mgmt_cleanup();
	nldev_exit();
	rdma_nl_unregister(RDMA_NL_LS);
	unregister_pernet_device(&rdma_dev_net_ops);
	unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
	ib_sa_cleanup();
	ib_mad_cleanup();
	addr_cleanup();
	rdma_nl_exit();
	class_unregister(&ib_class);
	destroy_workqueue(ib_comp_unbound_wq);
	destroy_workqueue(ib_comp_wq);
	/* Make sure that any pending umem accounting work is done. */
	destroy_workqueue(ib_wq);
	flush_workqueue(system_unbound_wq);
	WARN_ON(!xa_empty(&clients));
	WARN_ON(!xa_empty(&devices));
}

MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4);

/* ib_core relies on the netdev stack to first register the
 * net_ns_type_operations ns kobject type before ib_core initialization.
 */
fs_initcall(ib_core_init);
module_exit(ib_core_cleanup);
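/*
 * Example (illustrative sketch only; the "example" client name is
 * hypothetical): a client module can make itself demand-loadable by the
 * request_module("rdma-client-%s", ...) call in ib_get_client_nl_info()
 * by declaring a matching module alias:
 *
 *	MODULE_ALIAS("rdma-client-example");
 *
 * together with registering the "example" client via ib_register_client()
 * from its module_init() function.
 */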