/*
 * Copyright (c) 2004 Topspin Communications. All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/module.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/netdevice.h>
#include <net/net_namespace.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/hashtable.h>
#include <rdma/rdma_netlink.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
#include <rdma/rdma_counter.h>

#include "core_priv.h"
#include "restrack.h"

MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("core kernel InfiniBand API");
MODULE_LICENSE("Dual BSD/GPL");

struct workqueue_struct *ib_comp_wq;
struct workqueue_struct *ib_comp_unbound_wq;
struct workqueue_struct *ib_wq;
EXPORT_SYMBOL_GPL(ib_wq);

/*
 * Each of the three rwsem locks (devices, clients, client_data) protects the
 * xarray of the same name. Specifically it allows the caller to assert that
 * the MARK will/will not be changing under the lock, and for devices and
 * clients, that the value in the xarray is still a valid pointer. Change of
 * the MARK is linked to the object state, so holding the lock and testing the
 * MARK also asserts that the contained object is in a certain state.
 *
 * This is used to build a two stage register/unregister flow where objects
 * can continue to be in the xarray even though they are still in progress to
 * register/unregister.
 *
 * The xarray itself provides additional locking, and restartable iteration,
 * which is also relied on.
 *
 * Locks should not be nested, with the exception of client_data, which is
 * allowed to nest under the read side of the other two locks.
 *
 * The devices_rwsem also protects the device name list, any change or
 * assignment of device name must also hold the write side to guarantee unique
 * names.
 */

/*
 * devices contains devices that have had their names assigned. The
 * devices may not be registered. Users that care about the registration
 * status need to call ib_device_try_get() on the device to ensure it is
 * registered, and keep it registered, for the required duration.
 *
 */
static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(devices_rwsem);
#define DEVICE_REGISTERED XA_MARK_1

static u32 highest_client_id;
#define CLIENT_REGISTERED XA_MARK_1
static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(clients_rwsem);

static void ib_client_put(struct ib_client *client)
{
	if (refcount_dec_and_test(&client->uses))
		complete(&client->uses_zero);
}

/*
 * If client_data is registered then the corresponding client must also still
 * be registered.
 */
#define CLIENT_DATA_REGISTERED XA_MARK_1

unsigned int rdma_dev_net_id;

/*
 * A list of net namespaces is maintained in an xarray. This is necessary
 * because we can't get the locking right using the existing net ns list. We
 * would require an init_net callback after the list is updated.
 */
static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
/*
 * rwsem to protect accessing the rdma_nets xarray entries.
 */
static DECLARE_RWSEM(rdma_nets_rwsem);

bool ib_devices_shared_netns = true;
module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
MODULE_PARM_DESC(netns_mode,
		 "Share device among net namespaces; default=1 (shared)");
/**
 * rdma_dev_access_netns() - Return whether an rdma device can be accessed
 *			     from a specified net namespace or not.
 * @dev: Pointer to rdma device which needs to be checked
 * @net: Pointer to net namespace for which access to be checked
 *
 * When the rdma device is in shared mode, it ignores the net namespace.
 * When the rdma device is exclusive to a net namespace, rdma device net
 * namespace is checked against the specified one.
 */
bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
{
	return (ib_devices_shared_netns ||
		net_eq(read_pnet(&dev->coredev.rdma_net), net));
}
EXPORT_SYMBOL(rdma_dev_access_netns);

/*
 * xarray has this behavior where it won't iterate over NULL values stored in
 * allocated arrays. So we need our own iterator to see all values stored in
 * the array. This does the same thing as xa_for_each except that it also
 * returns NULL valued entries if the array is allocating. Simplified to only
 * work on simple xarrays.
 */
static void *xan_find_marked(struct xarray *xa, unsigned long *indexp,
			     xa_mark_t filter)
{
	XA_STATE(xas, xa, *indexp);
	void *entry;

	rcu_read_lock();
	do {
		entry = xas_find_marked(&xas, ULONG_MAX, filter);
		if (xa_is_zero(entry))
			break;
	} while (xas_retry(&xas, entry));
	rcu_read_unlock();

	if (entry) {
		*indexp = xas.xa_index;
		if (xa_is_zero(entry))
			return NULL;
		return entry;
	}
	return XA_ERROR(-ENOENT);
}
#define xan_for_each_marked(xa, index, entry, filter)			\
	for (index = 0, entry = xan_find_marked(xa, &(index), filter);	\
	     !xa_is_err(entry);						\
	     (index)++, entry = xan_find_marked(xa, &(index), filter))

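/*
 * Illustrative sketch (not part of this file's logic): a caller that holds
 * client_data_rwsem can walk the registered client data, including the NULL
 * values that xa_for_each() would skip, roughly as below. The variable names
 * are hypothetical; ib_device_rename() later in this file uses the same
 * pattern.
 *
 *	unsigned long index;
 *	void *client_data;
 *
 *	down_read(&ibdev->client_data_rwsem);
 *	xan_for_each_marked(&ibdev->client_data, index, client_data,
 *			    CLIENT_DATA_REGISTERED) {
 *		struct ib_client *client = xa_load(&clients, index);
 *
 *		if (client)
 *			pr_debug("client %s has data %p\n", client->name,
 *				 client_data);
 *	}
 *	up_read(&ibdev->client_data_rwsem);
 */
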
/* RCU hash table mapping netdevice pointers to struct ib_port_data */
static DEFINE_SPINLOCK(ndev_hash_lock);
static DECLARE_HASHTABLE(ndev_hash, 5);

static void free_netdevs(struct ib_device *ib_dev);
static void ib_unregister_work(struct work_struct *work);
static void __ib_unregister_device(struct ib_device *device);
static int ib_security_change(struct notifier_block *nb, unsigned long event,
			      void *lsm_data);
static void ib_policy_change_task(struct work_struct *work);
static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task);

static void __ibdev_printk(const char *level, const struct ib_device *ibdev,
			   struct va_format *vaf)
{
	if (ibdev && ibdev->dev.parent)
		dev_printk_emit(level[1] - '0',
				ibdev->dev.parent,
				"%s %s %s: %pV",
				dev_driver_string(ibdev->dev.parent),
				dev_name(ibdev->dev.parent),
				dev_name(&ibdev->dev),
				vaf);
	else if (ibdev)
		printk("%s%s: %pV",
		       level, dev_name(&ibdev->dev), vaf);
	else
		printk("%s(NULL ib_device): %pV", level, vaf);
}

void ibdev_printk(const char *level, const struct ib_device *ibdev,
		  const char *format, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, format);

	vaf.fmt = format;
	vaf.va = &args;

	__ibdev_printk(level, ibdev, &vaf);

	va_end(args);
}
EXPORT_SYMBOL(ibdev_printk);

#define define_ibdev_printk_level(func, level)			\
void func(const struct ib_device *ibdev, const char *fmt, ...)	\
{								\
	struct va_format vaf;					\
	va_list args;						\
								\
	va_start(args, fmt);					\
								\
	vaf.fmt = fmt;						\
	vaf.va = &args;						\
								\
	__ibdev_printk(level, ibdev, &vaf);			\
								\
	va_end(args);						\
}								\
EXPORT_SYMBOL(func);

define_ibdev_printk_level(ibdev_emerg, KERN_EMERG);
define_ibdev_printk_level(ibdev_alert, KERN_ALERT);
define_ibdev_printk_level(ibdev_crit, KERN_CRIT);
define_ibdev_printk_level(ibdev_err, KERN_ERR);
define_ibdev_printk_level(ibdev_warn, KERN_WARNING);
define_ibdev_printk_level(ibdev_notice, KERN_NOTICE);
define_ibdev_printk_level(ibdev_info, KERN_INFO);

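/*
 * Illustrative sketch: drivers and core code log through the wrappers defined
 * above so that messages are prefixed with the parent device and the IB
 * device name. The call sites below are hypothetical:
 *
 *	ibdev_info(ibdev, "port %u: link up\n", port);
 *	ibdev_err(ibdev, "failed to create CQ: %d\n", ret);
 */
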
static struct notifier_block ibdev_lsm_nb = {
	.notifier_call = ib_security_change,
};

static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
				 struct net *net);

/* Pointer to the RCU head at the start of the ib_port_data array */
struct ib_port_data_rcu {
	struct rcu_head rcu_head;
	struct ib_port_data pdata[];
};

static void ib_device_check_mandatory(struct ib_device *device)
{
#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x }
	static const struct {
		size_t offset;
		char *name;
	} mandatory_table[] = {
		IB_MANDATORY_FUNC(query_device),
		IB_MANDATORY_FUNC(query_port),
		IB_MANDATORY_FUNC(alloc_pd),
		IB_MANDATORY_FUNC(dealloc_pd),
		IB_MANDATORY_FUNC(create_qp),
		IB_MANDATORY_FUNC(modify_qp),
		IB_MANDATORY_FUNC(destroy_qp),
		IB_MANDATORY_FUNC(post_send),
		IB_MANDATORY_FUNC(post_recv),
		IB_MANDATORY_FUNC(create_cq),
		IB_MANDATORY_FUNC(destroy_cq),
		IB_MANDATORY_FUNC(poll_cq),
		IB_MANDATORY_FUNC(req_notify_cq),
		IB_MANDATORY_FUNC(get_dma_mr),
		IB_MANDATORY_FUNC(dereg_mr),
		IB_MANDATORY_FUNC(get_port_immutable)
	};
	int i;

	device->kverbs_provider = true;
	for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
		if (!*(void **) ((void *) &device->ops +
				 mandatory_table[i].offset)) {
			device->kverbs_provider = false;
			break;
		}
	}
}

/*
 * Caller must perform ib_device_put() to return the device reference count
 * when ib_device_get_by_index() returns a valid device pointer.
 */
struct ib_device *ib_device_get_by_index(const struct net *net, u32 index)
{
	struct ib_device *device;

	down_read(&devices_rwsem);
	device = xa_load(&devices, index);
	if (device) {
		if (!rdma_dev_access_netns(device, net)) {
			device = NULL;
			goto out;
		}

		if (!ib_device_try_get(device))
			device = NULL;
	}
out:
	up_read(&devices_rwsem);
	return device;
}

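/*
 * Illustrative sketch (hypothetical caller): core code such as the netlink
 * layer pairs ib_device_get_by_index() with ib_device_put() to keep the
 * device registered for the duration of an operation.
 *
 *	struct ib_device *device;
 *
 *	device = ib_device_get_by_index(net, index);
 *	if (!device)
 *		return -EINVAL;
 *	// ... use the device ...
 *	ib_device_put(device);
 */
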
/**
 * ib_device_put - Release IB device reference
 * @device: device whose reference is to be released
 *
 * ib_device_put() releases reference to the IB device to allow it to be
 * unregistered and eventually freed.
 */
void ib_device_put(struct ib_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->unreg_completion);
}
EXPORT_SYMBOL(ib_device_put);

static struct ib_device *__ib_device_get_by_name(const char *name)
{
	struct ib_device *device;
	unsigned long index;

	xa_for_each (&devices, index, device)
		if (!strcmp(name, dev_name(&device->dev)))
			return device;

	return NULL;
}

/**
 * ib_device_get_by_name - Find an IB device by name
 * @name: The name to look for
 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
 *
 * Find and hold an ib_device by its name. The caller must call
 * ib_device_put() on the returned pointer.
 */
struct ib_device *ib_device_get_by_name(const char *name,
					enum rdma_driver_id driver_id)
{
	struct ib_device *device;

	down_read(&devices_rwsem);
	device = __ib_device_get_by_name(name);
	if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
	    device->ops.driver_id != driver_id)
		device = NULL;

	if (device) {
		if (!ib_device_try_get(device))
			device = NULL;
	}
	up_read(&devices_rwsem);
	return device;
}
EXPORT_SYMBOL(ib_device_get_by_name);

static int rename_compat_devs(struct ib_device *device)
{
	struct ib_core_device *cdev;
	unsigned long index;
	int ret = 0;

	mutex_lock(&device->compat_devs_mutex);
	xa_for_each (&device->compat_devs, index, cdev) {
		ret = device_rename(&cdev->dev, dev_name(&device->dev));
		if (ret) {
			dev_warn(&cdev->dev,
				 "Fail to rename compatdev to new name %s\n",
				 dev_name(&device->dev));
			break;
		}
	}
	mutex_unlock(&device->compat_devs_mutex);
	return ret;
}

int ib_device_rename(struct ib_device *ibdev, const char *name)
{
	unsigned long index;
	void *client_data;
	int ret;

	down_write(&devices_rwsem);
	if (!strcmp(name, dev_name(&ibdev->dev))) {
		up_write(&devices_rwsem);
		return 0;
	}

	if (__ib_device_get_by_name(name)) {
		up_write(&devices_rwsem);
		return -EEXIST;
	}

	ret = device_rename(&ibdev->dev, name);
	if (ret) {
		up_write(&devices_rwsem);
		return ret;
	}

	strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
	ret = rename_compat_devs(ibdev);

	downgrade_write(&devices_rwsem);
	down_read(&ibdev->client_data_rwsem);
	xan_for_each_marked(&ibdev->client_data, index, client_data,
			    CLIENT_DATA_REGISTERED) {
		struct ib_client *client = xa_load(&clients, index);

		if (!client || !client->rename)
			continue;

		client->rename(ibdev, client_data);
	}
	up_read(&ibdev->client_data_rwsem);
	up_read(&devices_rwsem);
	return 0;
}

int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim)
{
	if (use_dim > 1)
		return -EINVAL;
	ibdev->use_cq_dim = use_dim;

	return 0;
}

static int alloc_name(struct ib_device *ibdev, const char *name)
{
	struct ib_device *device;
	unsigned long index;
	struct ida inuse;
	int rc;
	int i;

	lockdep_assert_held_write(&devices_rwsem);
	ida_init(&inuse);
	xa_for_each (&devices, index, device) {
		char buf[IB_DEVICE_NAME_MAX];

		if (sscanf(dev_name(&device->dev), name, &i) != 1)
			continue;
		if (i < 0 || i >= INT_MAX)
			continue;
		snprintf(buf, sizeof buf, name, i);
		if (strcmp(buf, dev_name(&device->dev)) != 0)
			continue;

		rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL);
		if (rc < 0)
			goto out;
	}

	rc = ida_alloc(&inuse, GFP_KERNEL);
	if (rc < 0)
		goto out;

	rc = dev_set_name(&ibdev->dev, name, rc);
out:
	ida_destroy(&inuse);
	return rc;
}

static void ib_device_release(struct device *device)
{
	struct ib_device *dev = container_of(device, struct ib_device, dev);

	free_netdevs(dev);
	WARN_ON(refcount_read(&dev->refcount));
	if (dev->port_data) {
		ib_cache_release_one(dev);
		ib_security_release_port_pkey_list(dev);
		rdma_counter_release(dev);
		kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
				       pdata[0]),
			  rcu_head);
	}

	mutex_destroy(&dev->unregistration_lock);
	mutex_destroy(&dev->compat_devs_mutex);

	xa_destroy(&dev->compat_devs);
	xa_destroy(&dev->client_data);
	kfree_rcu(dev, rcu_head);
}

static int ib_device_uevent(struct device *device,
			    struct kobj_uevent_env *env)
{
	if (add_uevent_var(env, "NAME=%s", dev_name(device)))
		return -ENOMEM;

	/*
	 * It would be nice to pass the node GUID with the event...
	 */

	return 0;
}

static const void *net_namespace(struct device *d)
{
	struct ib_core_device *coredev =
		container_of(d, struct ib_core_device, dev);

	return read_pnet(&coredev->rdma_net);
}

static struct class ib_class = {
	.name = "infiniband",
	.dev_release = ib_device_release,
	.dev_uevent = ib_device_uevent,
	.ns_type = &net_ns_type_operations,
	.namespace = net_namespace,
};

static void rdma_init_coredev(struct ib_core_device *coredev,
			      struct ib_device *dev, struct net *net)
{
	/* This BUILD_BUG_ON is intended to catch layout change
	 * of union of ib_core_device and device.
	 * dev must be the first element as ib_core and provider
	 * drivers use it. Adding anything in ib_core_device before
	 * device will break this assumption.
	 */
	BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) !=
		     offsetof(struct ib_device, dev));

	coredev->dev.class = &ib_class;
	coredev->dev.groups = dev->groups;
	device_initialize(&coredev->dev);
	coredev->owner = dev;
	INIT_LIST_HEAD(&coredev->port_list);
	write_pnet(&coredev->rdma_net, net);
}

/**
 * _ib_alloc_device - allocate an IB device struct
 * @size: size of structure to allocate
 *
 * Low-level drivers should use ib_alloc_device() to allocate &struct
 * ib_device. @size is the size of the structure to be allocated,
 * including any private data used by the low-level driver.
 * ib_dealloc_device() must be used to free structures allocated with
 * ib_alloc_device().
 */
struct ib_device *_ib_alloc_device(size_t size)
{
	struct ib_device *device;

	if (WARN_ON(size < sizeof(struct ib_device)))
		return NULL;

	device = kzalloc(size, GFP_KERNEL);
	if (!device)
		return NULL;

	if (rdma_restrack_init(device)) {
		kfree(device);
		return NULL;
	}

	device->groups[0] = &ib_dev_attr_group;
	rdma_init_coredev(&device->coredev, device, &init_net);

	INIT_LIST_HEAD(&device->event_handler_list);
	spin_lock_init(&device->qp_open_list_lock);
	init_rwsem(&device->event_handler_rwsem);
	mutex_init(&device->unregistration_lock);
	/*
	 * client_data needs to be an allocating xarray because we don't want
	 * our mark to be destroyed if the user stores NULL in the client data.
	 */
	xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
	init_rwsem(&device->client_data_rwsem);
	xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC);
	mutex_init(&device->compat_devs_mutex);
	init_completion(&device->unreg_completion);
	INIT_WORK(&device->unregistration_work, ib_unregister_work);

	return device;
}
EXPORT_SYMBOL(_ib_alloc_device);

/**
 * ib_dealloc_device - free an IB device struct
 * @device: structure to free
 *
 * Free a structure allocated with ib_alloc_device().
 */
void ib_dealloc_device(struct ib_device *device)
{
	if (device->ops.dealloc_driver)
		device->ops.dealloc_driver(device);

	/*
	 * ib_unregister_driver() requires all devices to remain in the xarray
	 * while their ops are callable. The last op we call is dealloc_driver
	 * above. This is needed to create a fence on op callbacks prior to
	 * allowing the driver module to unload.
	 */
	down_write(&devices_rwsem);
	if (xa_load(&devices, device->index) == device)
		xa_erase(&devices, device->index);
	up_write(&devices_rwsem);

	/* Expedite releasing netdev references */
	free_netdevs(device);

	WARN_ON(!xa_empty(&device->compat_devs));
	WARN_ON(!xa_empty(&device->client_data));
	WARN_ON(refcount_read(&device->refcount));
	rdma_restrack_clean(device);
	/* Balances with device_initialize */
	put_device(&device->dev);
}
EXPORT_SYMBOL(ib_dealloc_device);

/*
 * add_client_context() and remove_client_context() must be safe against
 * parallel calls on the same device - registration/unregistration of both the
 * device and client can be occurring in parallel.
 *
 * The routines need to be a fence, any caller must not return until the add
 * or remove is fully completed.
 */
static int add_client_context(struct ib_device *device,
			      struct ib_client *client)
{
	int ret = 0;

	if (!device->kverbs_provider && !client->no_kverbs_req)
		return 0;

	down_write(&device->client_data_rwsem);
	/*
	 * So long as the client is registered hold both the client and device
	 * unregistration locks.
	 */
	if (!refcount_inc_not_zero(&client->uses))
		goto out_unlock;
	refcount_inc(&device->refcount);

	/*
	 * Another caller to add_client_context got here first and has already
	 * completely initialized the context.
	 */
	if (xa_get_mark(&device->client_data, client->client_id,
			CLIENT_DATA_REGISTERED))
		goto out;

	ret = xa_err(xa_store(&device->client_data, client->client_id, NULL,
			      GFP_KERNEL));
	if (ret)
		goto out;
	downgrade_write(&device->client_data_rwsem);
	if (client->add) {
		if (client->add(device)) {
			/*
			 * If a client fails to add then the error code is
			 * ignored, but we won't call any more ops on this
			 * client.
			 */
			xa_erase(&device->client_data, client->client_id);
			up_read(&device->client_data_rwsem);
			ib_device_put(device);
			ib_client_put(client);
			return 0;
		}
	}

	/* Readers shall not see a client until add has been completed */
	xa_set_mark(&device->client_data, client->client_id,
		    CLIENT_DATA_REGISTERED);
	up_read(&device->client_data_rwsem);
	return 0;

out:
	ib_device_put(device);
	ib_client_put(client);
out_unlock:
	up_write(&device->client_data_rwsem);
	return ret;
}

static void remove_client_context(struct ib_device *device,
				  unsigned int client_id)
{
	struct ib_client *client;
	void *client_data;

	down_write(&device->client_data_rwsem);
	if (!xa_get_mark(&device->client_data, client_id,
			 CLIENT_DATA_REGISTERED)) {
		up_write(&device->client_data_rwsem);
		return;
	}
	client_data = xa_load(&device->client_data, client_id);
	xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
	client = xa_load(&clients, client_id);
	up_write(&device->client_data_rwsem);

	/*
	 * Notice we cannot be holding any exclusive locks when calling the
	 * remove callback as the remove callback can recurse back into any
	 * public functions in this module and thus try for any locks those
	 * functions take.
	 *
	 * For this reason clients and drivers should not call the
	 * unregistration functions while holding any locks.
	 */
	if (client->remove)
		client->remove(device, client_data);

	xa_erase(&device->client_data, client_id);
	ib_device_put(device);
	ib_client_put(client);
}

static int alloc_port_data(struct ib_device *device)
{
	struct ib_port_data_rcu *pdata_rcu;
	unsigned int port;

	if (device->port_data)
		return 0;

	/* This can only be called once the physical port range is defined */
	if (WARN_ON(!device->phys_port_cnt))
		return -EINVAL;

	/*
	 * device->port_data is indexed directly by the port number to make
	 * access to this data as efficient as possible.
	 *
	 * Therefore port_data is declared as a 1-based array with potential
	 * empty slots at the beginning.
	 */
	pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
					rdma_end_port(device) + 1),
			    GFP_KERNEL);
	if (!pdata_rcu)
		return -ENOMEM;
	/*
	 * The rcu_head is put in front of the port data array and the stored
	 * pointer is adjusted since we never need to see that member until
	 * kfree_rcu.
	 */
	device->port_data = pdata_rcu->pdata;

	rdma_for_each_port (device, port) {
		struct ib_port_data *pdata = &device->port_data[port];

		pdata->ib_dev = device;
		spin_lock_init(&pdata->pkey_list_lock);
		INIT_LIST_HEAD(&pdata->pkey_list);
		spin_lock_init(&pdata->netdev_lock);
		INIT_HLIST_NODE(&pdata->ndev_hash_link);
	}
	return 0;
}

static int verify_immutable(const struct ib_device *dev, u8 port)
{
	return WARN_ON(!rdma_cap_ib_mad(dev, port) &&
		       rdma_max_mad_size(dev, port) != 0);
}

static int setup_port_data(struct ib_device *device)
{
	unsigned int port;
	int ret;

	ret = alloc_port_data(device);
	if (ret)
		return ret;

	rdma_for_each_port (device, port) {
		struct ib_port_data *pdata = &device->port_data[port];

		ret = device->ops.get_port_immutable(device, port,
						     &pdata->immutable);
		if (ret)
			return ret;

		if (verify_immutable(device, port))
			return -EINVAL;
	}
	return 0;
}

void ib_get_device_fw_str(struct ib_device *dev, char *str)
{
	if (dev->ops.get_dev_fw_str)
		dev->ops.get_dev_fw_str(dev, str);
	else
		str[0] = '\0';
}
EXPORT_SYMBOL(ib_get_device_fw_str);

static void ib_policy_change_task(struct work_struct *work)
{
	struct ib_device *dev;
	unsigned long index;

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
		unsigned int i;

		rdma_for_each_port (dev, i) {
			u64 sp;
			int ret = ib_get_cached_subnet_prefix(dev, i, &sp);

			WARN_ONCE(ret,
				  "ib_get_cached_subnet_prefix err: %d, this should never happen here\n",
				  ret);
			if (!ret)
				ib_security_cache_change(dev, i, sp);
		}
	}
	up_read(&devices_rwsem);
}

static int ib_security_change(struct notifier_block *nb, unsigned long event,
			      void *lsm_data)
{
	if (event != LSM_POLICY_CHANGE)
		return NOTIFY_DONE;

	schedule_work(&ib_policy_change_work);
	ib_mad_agent_security_change();

	return NOTIFY_OK;
}

static void compatdev_release(struct device *dev)
{
	struct ib_core_device *cdev =
		container_of(dev, struct ib_core_device, dev);

	kfree(cdev);
}

static int add_one_compat_dev(struct ib_device *device,
			      struct rdma_dev_net *rnet)
{
	struct ib_core_device *cdev;
	int ret;

	lockdep_assert_held(&rdma_nets_rwsem);
	if (!ib_devices_shared_netns)
		return 0;

	/*
	 * Create and add compat device in all namespaces other than where it
	 * is currently bound to.
	 */
	if (net_eq(read_pnet(&rnet->net),
		   read_pnet(&device->coredev.rdma_net)))
		return 0;

	/*
	 * The first of init_net() or ib_register_device() to take the
	 * compat_devs_mutex wins and gets to add the device. Others will wait
	 * for completion here.
	 */
	mutex_lock(&device->compat_devs_mutex);
	cdev = xa_load(&device->compat_devs, rnet->id);
	if (cdev) {
		ret = 0;
		goto done;
	}
	ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL);
	if (ret)
		goto done;

	cdev = kzalloc(sizeof(*cdev), GFP_KERNEL);
	if (!cdev) {
		ret = -ENOMEM;
		goto cdev_err;
	}

	cdev->dev.parent = device->dev.parent;
	rdma_init_coredev(cdev, device, read_pnet(&rnet->net));
	cdev->dev.release = compatdev_release;
	ret = dev_set_name(&cdev->dev, "%s", dev_name(&device->dev));
	if (ret)
		goto add_err;

	ret = device_add(&cdev->dev);
	if (ret)
		goto add_err;
	ret = ib_setup_port_attrs(cdev);
	if (ret)
		goto port_err;

	ret = xa_err(xa_store(&device->compat_devs, rnet->id,
			      cdev, GFP_KERNEL));
	if (ret)
		goto insert_err;

	mutex_unlock(&device->compat_devs_mutex);
	return 0;

insert_err:
	ib_free_port_attrs(cdev);
port_err:
	device_del(&cdev->dev);
add_err:
	put_device(&cdev->dev);
cdev_err:
	xa_release(&device->compat_devs, rnet->id);
done:
	mutex_unlock(&device->compat_devs_mutex);
	return ret;
}

static void remove_one_compat_dev(struct ib_device *device, u32 id)
{
	struct ib_core_device *cdev;

	mutex_lock(&device->compat_devs_mutex);
	cdev = xa_erase(&device->compat_devs, id);
	mutex_unlock(&device->compat_devs_mutex);
	if (cdev) {
		ib_free_port_attrs(cdev);
		device_del(&cdev->dev);
		put_device(&cdev->dev);
	}
}

static void remove_compat_devs(struct ib_device *device)
{
	struct ib_core_device *cdev;
	unsigned long index;

	xa_for_each (&device->compat_devs, index, cdev)
		remove_one_compat_dev(device, index);
}

static int add_compat_devs(struct ib_device *device)
{
	struct rdma_dev_net *rnet;
	unsigned long index;
	int ret = 0;

	lockdep_assert_held(&devices_rwsem);

	down_read(&rdma_nets_rwsem);
	xa_for_each (&rdma_nets, index, rnet) {
		ret = add_one_compat_dev(device, rnet);
		if (ret)
			break;
	}
	up_read(&rdma_nets_rwsem);
	return ret;
}

static void remove_all_compat_devs(void)
{
	struct ib_compat_device *cdev;
	struct ib_device *dev;
	unsigned long index;

	down_read(&devices_rwsem);
	xa_for_each (&devices, index, dev) {
		unsigned long c_index = 0;

		/* Hold nets_rwsem so that any other thread modifying this
		 * system param can sync with this thread.
		 */
		down_read(&rdma_nets_rwsem);
		xa_for_each (&dev->compat_devs, c_index, cdev)
			remove_one_compat_dev(dev, c_index);
		up_read(&rdma_nets_rwsem);
	}
	up_read(&devices_rwsem);
}

static int add_all_compat_devs(void)
{
	struct rdma_dev_net *rnet;
	struct ib_device *dev;
	unsigned long index;
	int ret = 0;

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
		unsigned long net_index = 0;

		/* Hold nets_rwsem so that any other thread modifying this
		 * system param can sync with this thread.
		 */
		down_read(&rdma_nets_rwsem);
		xa_for_each (&rdma_nets, net_index, rnet) {
			ret = add_one_compat_dev(dev, rnet);
			if (ret)
				break;
		}
		up_read(&rdma_nets_rwsem);
	}
	up_read(&devices_rwsem);
	if (ret)
		remove_all_compat_devs();
	return ret;
}

int rdma_compatdev_set(u8 enable)
{
	struct rdma_dev_net *rnet;
	unsigned long index;
	int ret = 0;

	down_write(&rdma_nets_rwsem);
	if (ib_devices_shared_netns == enable) {
		up_write(&rdma_nets_rwsem);
		return 0;
	}

	/* enable/disable of compat devices is not supported
	 * when more than the default init_net exists.
	 */
	xa_for_each (&rdma_nets, index, rnet) {
		ret++;
		break;
	}
	if (!ret)
		ib_devices_shared_netns = enable;
	up_write(&rdma_nets_rwsem);
	if (ret)
		return -EBUSY;

	if (enable)
		ret = add_all_compat_devs();
	else
		remove_all_compat_devs();
	return ret;
}

static void rdma_dev_exit_net(struct net *net)
{
	struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
	struct ib_device *dev;
	unsigned long index;
	int ret;

	down_write(&rdma_nets_rwsem);
	/*
	 * Prevent the ID from being re-used and hide the id from xa_for_each.
	 */
	ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL));
	WARN_ON(ret);
	up_write(&rdma_nets_rwsem);

	down_read(&devices_rwsem);
	xa_for_each (&devices, index, dev) {
		get_device(&dev->dev);
		/*
		 * Release the devices_rwsem so that the potentially blocking
		 * device_del doesn't hold the devices_rwsem for too long.
		 */
		up_read(&devices_rwsem);

		remove_one_compat_dev(dev, rnet->id);

		/*
		 * If the real device is in the NS then move it back to init.
		 */
		rdma_dev_change_netns(dev, net, &init_net);

		put_device(&dev->dev);
		down_read(&devices_rwsem);
	}
	up_read(&devices_rwsem);

	rdma_nl_net_exit(rnet);
	xa_erase(&rdma_nets, rnet->id);
}

static __net_init int rdma_dev_init_net(struct net *net)
{
	struct rdma_dev_net *rnet = rdma_net_to_dev_net(net);
	unsigned long index;
	struct ib_device *dev;
	int ret;

	write_pnet(&rnet->net, net);

	ret = rdma_nl_net_init(rnet);
	if (ret)
		return ret;

	/* No need to create any compat devices in default init_net. */
	if (net_eq(net, &init_net))
		return 0;

	ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL);
	if (ret) {
		rdma_nl_net_exit(rnet);
		return ret;
	}

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
		/* Hold nets_rwsem so that netlink command cannot change
		 * system configuration for device sharing mode.
		 */
		down_read(&rdma_nets_rwsem);
		ret = add_one_compat_dev(dev, rnet);
		up_read(&rdma_nets_rwsem);
		if (ret)
			break;
	}
	up_read(&devices_rwsem);

	if (ret)
		rdma_dev_exit_net(net);

	return ret;
}

/*
 * Assign the unique string device name and the unique device index. This is
 * undone by ib_dealloc_device.
 */
static int assign_name(struct ib_device *device, const char *name)
{
	static u32 last_id;
	int ret;

	down_write(&devices_rwsem);
	/* Assign a unique name to the device */
	if (strchr(name, '%'))
		ret = alloc_name(device, name);
	else
		ret = dev_set_name(&device->dev, name);
	if (ret)
		goto out;

	if (__ib_device_get_by_name(dev_name(&device->dev))) {
		ret = -ENFILE;
		goto out;
	}
	strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);

	ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b,
			      &last_id, GFP_KERNEL);
	if (ret > 0)
		ret = 0;

out:
	up_write(&devices_rwsem);
	return ret;
}

static void setup_dma_device(struct ib_device *device)
{
	struct device *parent = device->dev.parent;

	WARN_ON_ONCE(device->dma_device);

#ifdef CONFIG_DMA_OPS
	if (device->dev.dma_ops) {
		/*
		 * The caller provided custom DMA operations. Copy the
		 * DMA-related fields that are used by e.g. dma_alloc_coherent()
		 * into device->dev.
		 */
		device->dma_device = &device->dev;
		if (!device->dev.dma_mask) {
			if (parent)
				device->dev.dma_mask = parent->dma_mask;
			else
				WARN_ON_ONCE(true);
		}
		if (!device->dev.coherent_dma_mask) {
			if (parent)
				device->dev.coherent_dma_mask =
					parent->coherent_dma_mask;
			else
				WARN_ON_ONCE(true);
		}
	} else
#endif /* CONFIG_DMA_OPS */
	{
		/*
		 * The caller did not provide custom DMA operations. Use the
		 * DMA mapping operations of the parent device.
		 */
		WARN_ON_ONCE(!parent);
		device->dma_device = parent;
	}

	if (!device->dev.dma_parms) {
		if (parent) {
			/*
			 * The caller did not provide DMA parameters, so
			 * 'parent' probably represents a PCI device. The PCI
			 * core sets the maximum segment size to 64 KB.
			 * Increase this parameter to 2 GB.
			 */
			device->dev.dma_parms = parent->dma_parms;
			dma_set_max_seg_size(device->dma_device, SZ_2G);
		} else {
			WARN_ON_ONCE(true);
		}
	}
}

/*
 * setup_device() allocates memory and sets up data that requires calling the
 * device ops; this is the only reason these actions are not done during
 * ib_alloc_device. It is undone by ib_dealloc_device().
 */
static int setup_device(struct ib_device *device)
{
	struct ib_udata uhw = {.outlen = 0, .inlen = 0};
	int ret;

	setup_dma_device(device);
	ib_device_check_mandatory(device);

	ret = setup_port_data(device);
	if (ret) {
		dev_warn(&device->dev, "Couldn't create per-port data\n");
		return ret;
	}

	memset(&device->attrs, 0, sizeof(device->attrs));
	ret = device->ops.query_device(device, &device->attrs, &uhw);
	if (ret) {
		dev_warn(&device->dev,
			 "Couldn't query the device attributes\n");
		return ret;
	}

	return 0;
}

static void disable_device(struct ib_device *device)
{
	u32 cid;

	WARN_ON(!refcount_read(&device->refcount));

	down_write(&devices_rwsem);
	xa_clear_mark(&devices, device->index, DEVICE_REGISTERED);
	up_write(&devices_rwsem);

	/*
	 * Remove clients in LIFO order, see assign_client_id. This could be
	 * more efficient if xarray learns to reverse iterate. Since no new
	 * clients can be added to this ib_device past this point we only need
	 * the maximum possible client_id value here.
	 */
	down_read(&clients_rwsem);
	cid = highest_client_id;
	up_read(&clients_rwsem);
	while (cid) {
		cid--;
		remove_client_context(device, cid);
	}

	ib_cq_pool_destroy(device);

	/* Pairs with refcount_set in enable_device */
	ib_device_put(device);
	wait_for_completion(&device->unreg_completion);

	/*
	 * compat devices must be removed after device refcount drops to zero.
	 * Otherwise init_net() may add more compatdevs after removing compat
	 * devices and before device is disabled.
	 */
	remove_compat_devs(device);
}

/*
 * An enabled device is visible to all clients and to all the public facing
 * APIs that return a device pointer. This always returns with a new get, even
 * if it fails.
 */
static int enable_device_and_get(struct ib_device *device)
{
	struct ib_client *client;
	unsigned long index;
	int ret = 0;

	/*
	 * One ref belongs to the xa and the other belongs to this
	 * thread. This is needed to guard against parallel unregistration.
	 */
	refcount_set(&device->refcount, 2);
	down_write(&devices_rwsem);
	xa_set_mark(&devices, device->index, DEVICE_REGISTERED);

	/*
	 * By using downgrade_write() we ensure that no other thread can clear
	 * DEVICE_REGISTERED while we are completing the client setup.
	 */
	downgrade_write(&devices_rwsem);

	if (device->ops.enable_driver) {
		ret = device->ops.enable_driver(device);
		if (ret)
			goto out;
	}

	ib_cq_pool_init(device);

	down_read(&clients_rwsem);
	xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
		ret = add_client_context(device, client);
		if (ret)
			break;
	}
	up_read(&clients_rwsem);
	if (!ret)
		ret = add_compat_devs(device);
out:
	up_read(&devices_rwsem);
	return ret;
}

static void prevent_dealloc_device(struct ib_device *ib_dev)
{
}

/**
 * ib_register_device - Register an IB device with IB core
 * @device: Device to register
 * @name: unique string device name. This may include a '%' which will
 *	  cause a unique index to be added to the passed device name.
 *
 * Low-level drivers use ib_register_device() to register their
 * devices with the IB core. All registered clients will receive a
 * callback for each device that is added. @device must be allocated
 * with ib_alloc_device().
 *
 * If the driver uses ops.dealloc_driver and calls any ib_unregister_device()
 * asynchronously then the device pointer may become freed as soon as this
 * function returns.
 */
int ib_register_device(struct ib_device *device, const char *name)
{
	int ret;

	ret = assign_name(device, name);
	if (ret)
		return ret;

	ret = setup_device(device);
	if (ret)
		return ret;

	ret = ib_cache_setup_one(device);
	if (ret) {
		dev_warn(&device->dev,
			 "Couldn't set up InfiniBand P_Key/GID cache\n");
		return ret;
	}

	ib_device_register_rdmacg(device);

	rdma_counter_init(device);

	/*
	 * Ensure that ADD uevent is not fired because it
	 * is too early and the device is not initialized yet.
	 */
	dev_set_uevent_suppress(&device->dev, true);
	ret = device_add(&device->dev);
	if (ret)
		goto cg_cleanup;

	ret = ib_device_register_sysfs(device);
	if (ret) {
		dev_warn(&device->dev,
			 "Couldn't register device with driver model\n");
		goto dev_cleanup;
	}

	ret = enable_device_and_get(device);
	dev_set_uevent_suppress(&device->dev, false);
	/* Mark for userspace that device is ready */
	kobject_uevent(&device->dev.kobj, KOBJ_ADD);
	if (ret) {
		void (*dealloc_fn)(struct ib_device *);

		/*
		 * If we hit this error flow then we don't want to
		 * automatically dealloc the device since the caller is
		 * expected to call ib_dealloc_device() after
		 * ib_register_device() fails. This is tricky due to the
		 * possibility for a parallel unregistration along with this
		 * error flow. Since we have a refcount here we know any
		 * parallel flow is stopped in disable_device and will see the
		 * special dealloc_driver pointer, causing the responsibility to
		 * ib_dealloc_device() to revert back to this thread.
		 */
		dealloc_fn = device->ops.dealloc_driver;
		device->ops.dealloc_driver = prevent_dealloc_device;
		ib_device_put(device);
		__ib_unregister_device(device);
		device->ops.dealloc_driver = dealloc_fn;
		return ret;
	}
	ib_device_put(device);

	return 0;

dev_cleanup:
	device_del(&device->dev);
cg_cleanup:
	dev_set_uevent_suppress(&device->dev, false);
	ib_device_unregister_rdmacg(device);
	ib_cache_cleanup_one(device);
	return ret;
}
EXPORT_SYMBOL(ib_register_device);

/* Callers must hold a get on the device. */
static void __ib_unregister_device(struct ib_device *ib_dev)
{
	/*
	 * We have a registration lock so that all the calls to unregister are
	 * fully fenced, once any unregister returns the device is truly
	 * unregistered even if multiple callers are unregistering it at the
	 * same time. This also interacts with the registration flow and
	 * provides sane semantics if register and unregister are racing.
	 */
	mutex_lock(&ib_dev->unregistration_lock);
	if (!refcount_read(&ib_dev->refcount))
		goto out;

	disable_device(ib_dev);

	/* Expedite removing unregistered pointers from the hash table */
	free_netdevs(ib_dev);

	ib_device_unregister_sysfs(ib_dev);
	device_del(&ib_dev->dev);
	ib_device_unregister_rdmacg(ib_dev);
	ib_cache_cleanup_one(ib_dev);

	/*
	 * Drivers using the new flow may not call ib_dealloc_device except
	 * in error unwind prior to registration success.
	 */
	if (ib_dev->ops.dealloc_driver &&
	    ib_dev->ops.dealloc_driver != prevent_dealloc_device) {
		WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
		ib_dealloc_device(ib_dev);
	}
out:
	mutex_unlock(&ib_dev->unregistration_lock);
}

/**
 * ib_unregister_device - Unregister an IB device
 * @ib_dev: The device to unregister
 *
 * Unregister an IB device. All clients will receive a remove callback.
 *
 * Callers should call this routine only once, and protect against races with
 * registration. Typically it should only be called as part of a remove
 * callback in an implementation of driver core's struct device_driver and
 * related.
 *
 * If ops.dealloc_driver is used then ib_dev will be freed upon return from
 * this function.
 */
void ib_unregister_device(struct ib_device *ib_dev)
{
	get_device(&ib_dev->dev);
	__ib_unregister_device(ib_dev);
	put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device);

/**
 * ib_unregister_device_and_put - Unregister a device while holding a 'get'
 * @ib_dev: The device to unregister
 *
 * This is the same as ib_unregister_device(), except it includes an internal
 * ib_device_put() that should match a 'get' obtained by the caller.
 *
 * It is safe to call this routine concurrently from multiple threads while
 * holding the 'get'. When the function returns the device is fully
 * unregistered.
 *
 * Drivers using this flow MUST use the driver_unregister callback to clean up
 * their resources associated with the device and dealloc it.
 */
void ib_unregister_device_and_put(struct ib_device *ib_dev)
{
	WARN_ON(!ib_dev->ops.dealloc_driver);
	get_device(&ib_dev->dev);
	ib_device_put(ib_dev);
	__ib_unregister_device(ib_dev);
	put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device_and_put);

/**
 * ib_unregister_driver - Unregister all IB devices for a driver
 * @driver_id: The driver to unregister
 *
 * This implements a fence for device unregistration. It only returns once all
 * devices associated with the driver_id have fully completed their
 * unregistration and returned from ib_unregister_device*().
 *
 * If devices are not yet unregistered it goes ahead and starts unregistering
 * them.
 *
 * This does not block creation of new devices with the given driver_id; that
 * is the responsibility of the caller.
 */
void ib_unregister_driver(enum rdma_driver_id driver_id)
{
	struct ib_device *ib_dev;
	unsigned long index;

	down_read(&devices_rwsem);
	xa_for_each (&devices, index, ib_dev) {
		if (ib_dev->ops.driver_id != driver_id)
			continue;

		get_device(&ib_dev->dev);
		up_read(&devices_rwsem);

		WARN_ON(!ib_dev->ops.dealloc_driver);
		__ib_unregister_device(ib_dev);

		put_device(&ib_dev->dev);
		down_read(&devices_rwsem);
	}
	up_read(&devices_rwsem);
}
EXPORT_SYMBOL(ib_unregister_driver);

static void ib_unregister_work(struct work_struct *work)
{
	struct ib_device *ib_dev =
		container_of(work, struct ib_device, unregistration_work);

	__ib_unregister_device(ib_dev);
	put_device(&ib_dev->dev);
}

/**
 * ib_unregister_device_queued - Unregister a device using a work queue
 * @ib_dev: The device to unregister
 *
 * This schedules an asynchronous unregistration using a WQ for the device. A
 * driver should use this to avoid holding locks while doing unregistration,
 * such as holding the RTNL lock.
 *
 * Drivers using this API must use ib_unregister_driver before module unload
 * to ensure that all scheduled unregistrations have completed.
 */
void ib_unregister_device_queued(struct ib_device *ib_dev)
{
	WARN_ON(!refcount_read(&ib_dev->refcount));
	WARN_ON(!ib_dev->ops.dealloc_driver);
	get_device(&ib_dev->dev);
	if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work))
		put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device_queued);

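/*
 * Illustrative driver lifecycle sketch (all driver-side names below are
 * hypothetical). A provider using the dealloc_driver flow typically does
 * something like:
 *
 *	// probe
 *	struct my_dev *dev = ib_alloc_device(my_dev, ibdev);
 *
 *	if (!dev)
 *		return -ENOMEM;
 *	ib_set_device_ops(&dev->ibdev, &my_dev_ops);	// includes .dealloc_driver
 *	ret = ib_register_device(&dev->ibdev, "my%d");
 *	if (ret)
 *		ib_dealloc_device(&dev->ibdev);	// register failed, caller frees
 *
 *	// remove, possibly while holding locks such as RTNL: defer to the WQ
 *	ib_unregister_device_queued(&dev->ibdev);
 *	// or synchronously; the device is freed via .dealloc_driver
 *	ib_unregister_device(&dev->ibdev);
 */
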
/*
 * The caller must pass in a device that has the kref held and the refcount
 * released. If the device is in cur_net and still registered then it is moved
 * into net.
 */
static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
				 struct net *net)
{
	int ret2 = -EINVAL;
	int ret;

	mutex_lock(&device->unregistration_lock);

	/*
	 * If the device is not under ib_device_get() or the
	 * unregistration_lock is not held, its namespace can be changed or it
	 * can be unregistered, so check again under the lock.
	 */
	if (refcount_read(&device->refcount) == 0 ||
	    !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) {
		ret = -ENODEV;
		goto out;
	}

	kobject_uevent(&device->dev.kobj, KOBJ_REMOVE);
	disable_device(device);

	/*
	 * At this point no one can be using the device, so it is safe to
	 * change the namespace.
	 */
	write_pnet(&device->coredev.rdma_net, net);

	down_read(&devices_rwsem);
	/*
	 * Currently rdma devices are system wide unique. So the device name
	 * is guaranteed free in the new namespace. Publish the new namespace
	 * at the sysfs level.
	 */
	ret = device_rename(&device->dev, dev_name(&device->dev));
	up_read(&devices_rwsem);
	if (ret) {
		dev_warn(&device->dev,
			 "%s: Couldn't rename device after namespace change\n",
			 __func__);
		/* Try and put things back and re-enable the device */
		write_pnet(&device->coredev.rdma_net, cur_net);
	}

	ret2 = enable_device_and_get(device);
	if (ret2) {
		/*
		 * This shouldn't really happen, but if it does, let the user
		 * retry at a later point. So don't disable the device.
		 */
		dev_warn(&device->dev,
			 "%s: Couldn't re-enable device after namespace change\n",
			 __func__);
	}
	kobject_uevent(&device->dev.kobj, KOBJ_ADD);

	ib_device_put(device);
out:
	mutex_unlock(&device->unregistration_lock);
	if (ret)
		return ret;
	return ret2;
}

int ib_device_set_netns_put(struct sk_buff *skb,
			    struct ib_device *dev, u32 ns_fd)
{
	struct net *net;
	int ret;

	net = get_net_ns_by_fd(ns_fd);
	if (IS_ERR(net)) {
		ret = PTR_ERR(net);
		goto net_err;
	}

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
		ret = -EPERM;
		goto ns_err;
	}

	/*
	 * Currently supported only for those providers which support
	 * disassociation and don't do port specific sysfs init. Once a
	 * port_cleanup infrastructure is implemented, this limitation will be
	 * removed.
	 */
	if (!dev->ops.disassociate_ucontext || dev->ops.init_port ||
	    ib_devices_shared_netns) {
		ret = -EOPNOTSUPP;
		goto ns_err;
	}

	get_device(&dev->dev);
	ib_device_put(dev);
	ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net);
	put_device(&dev->dev);

	put_net(net);
	return ret;

ns_err:
	put_net(net);
net_err:
	ib_device_put(dev);
	return ret;
}

static struct pernet_operations rdma_dev_net_ops = {
	.init = rdma_dev_init_net,
	.exit = rdma_dev_exit_net,
	.id = &rdma_dev_net_id,
	.size = sizeof(struct rdma_dev_net),
};

static int assign_client_id(struct ib_client *client)
{
	int ret;

	down_write(&clients_rwsem);
	/*
	 * The add/remove callbacks must be called in FIFO/LIFO order. To
	 * achieve this we assign client_ids so they are sorted in
	 * registration order.
	 */
	client->client_id = highest_client_id;
	ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL);
	if (ret)
		goto out;

	highest_client_id++;
	xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);

out:
	up_write(&clients_rwsem);
	return ret;
}

static void remove_client_id(struct ib_client *client)
{
	down_write(&clients_rwsem);
	xa_erase(&clients, client->client_id);
	for (; highest_client_id; highest_client_id--)
		if (xa_load(&clients, highest_client_id - 1))
			break;
	up_write(&clients_rwsem);
}

/**
 * ib_register_client - Register an IB client
 * @client: Client to register
 *
 * Upper level users of the IB drivers can use ib_register_client() to
 * register callbacks for IB device addition and removal. When an IB
 * device is added, each registered client's add method will be called
 * (in the order the clients were registered), and when a device is
 * removed, each client's remove method will be called (in the reverse
 * order that clients were registered). In addition, when
 * ib_register_client() is called, the client will receive an add
 * callback for all devices already registered.
 */
int ib_register_client(struct ib_client *client)
{
	struct ib_device *device;
	unsigned long index;
	int ret;

	refcount_set(&client->uses, 1);
	init_completion(&client->uses_zero);
	ret = assign_client_id(client);
	if (ret)
		return ret;

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
		ret = add_client_context(device, client);
		if (ret) {
			up_read(&devices_rwsem);
			ib_unregister_client(client);
			return ret;
		}
	}
	up_read(&devices_rwsem);
	return 0;
}
EXPORT_SYMBOL(ib_register_client);

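/*
 * Illustrative sketch of a minimal client (names are hypothetical). The add
 * callback returns an errno; on failure the core simply never calls further
 * ops for this client on that device (see add_client_context()).
 *
 *	static struct ib_client my_client;
 *
 *	static int my_client_add(struct ib_device *device)
 *	{
 *		struct my_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 *
 *		if (!ctx)
 *			return -ENOMEM;
 *		ib_set_client_data(device, &my_client, ctx);
 *		return 0;
 *	}
 *
 *	static void my_client_remove(struct ib_device *device, void *client_data)
 *	{
 *		kfree(client_data);
 *	}
 *
 *	static struct ib_client my_client = {
 *		.name   = "my_client",
 *		.add    = my_client_add,
 *		.remove = my_client_remove,
 *	};
 *
 *	// from module init/exit
 *	ib_register_client(&my_client);
 *	ib_unregister_client(&my_client);
 */
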
/**
 * ib_unregister_client - Unregister an IB client
 * @client: Client to unregister
 *
 * Upper level users use ib_unregister_client() to remove their client
 * registration. When ib_unregister_client() is called, the client
 * will receive a remove callback for each IB device still registered.
 *
 * This is a full fence, once it returns no client callbacks will be called,
 * or are running in another thread.
 */
void ib_unregister_client(struct ib_client *client)
{
	struct ib_device *device;
	unsigned long index;

	down_write(&clients_rwsem);
	ib_client_put(client);
	xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED);
	up_write(&clients_rwsem);

	/* We do not want to have locks while calling client->remove() */
	rcu_read_lock();
	xa_for_each (&devices, index, device) {
		if (!ib_device_try_get(device))
			continue;
		rcu_read_unlock();

		remove_client_context(device, client->client_id);

		ib_device_put(device);
		rcu_read_lock();
	}
	rcu_read_unlock();

	/*
	 * remove_client_context() is not a fence, it can return even though a
	 * removal is ongoing. Wait until all removals are completed.
	 */
	wait_for_completion(&client->uses_zero);
	remove_client_id(client);
}
EXPORT_SYMBOL(ib_unregister_client);

static int __ib_get_global_client_nl_info(const char *client_name,
					  struct ib_client_nl_info *res)
{
	struct ib_client *client;
	unsigned long index;
	int ret = -ENOENT;

	down_read(&clients_rwsem);
	xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
		if (strcmp(client->name, client_name) != 0)
			continue;
		if (!client->get_global_nl_info) {
			ret = -EOPNOTSUPP;
			break;
		}
		ret = client->get_global_nl_info(res);
		if (WARN_ON(ret == -ENOENT))
			ret = -EINVAL;
		if (!ret && res->cdev)
			get_device(res->cdev);
		break;
	}
	up_read(&clients_rwsem);
	return ret;
}

static int __ib_get_client_nl_info(struct ib_device *ibdev,
				   const char *client_name,
				   struct ib_client_nl_info *res)
{
	unsigned long index;
	void *client_data;
	int ret = -ENOENT;

	down_read(&ibdev->client_data_rwsem);
	xan_for_each_marked (&ibdev->client_data, index, client_data,
			     CLIENT_DATA_REGISTERED) {
		struct ib_client *client = xa_load(&clients, index);

		if (!client || strcmp(client->name, client_name) != 0)
			continue;
		if (!client->get_nl_info) {
			ret = -EOPNOTSUPP;
			break;
		}
		ret = client->get_nl_info(ibdev, client_data, res);
		if (WARN_ON(ret == -ENOENT))
			ret = -EINVAL;

		/*
		 * The cdev is guaranteed valid as long as we are inside the
		 * client_data_rwsem as remove_one can't be called. Keep it
		 * valid for the caller.
		 */
		if (!ret && res->cdev)
			get_device(res->cdev);
		break;
	}
	up_read(&ibdev->client_data_rwsem);

	return ret;
}

/**
 * ib_get_client_nl_info - Fetch the nl_info from a client
 * @ibdev: IB device
 * @client_name: Name of the client
 * @res: Result of the query
 */
int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name,
			  struct ib_client_nl_info *res)
{
	int ret;

	if (ibdev)
		ret = __ib_get_client_nl_info(ibdev, client_name, res);
	else
		ret = __ib_get_global_client_nl_info(client_name, res);
#ifdef CONFIG_MODULES
	if (ret == -ENOENT) {
		request_module("rdma-client-%s", client_name);
		if (ibdev)
			ret = __ib_get_client_nl_info(ibdev, client_name, res);
		else
			ret = __ib_get_global_client_nl_info(client_name, res);
	}
#endif
	if (ret) {
		if (ret == -ENOENT)
			return -EOPNOTSUPP;
		return ret;
	}

	if (WARN_ON(!res->cdev))
		return -EINVAL;
	return 0;
}

1940 */ 1941 void ib_set_client_data(struct ib_device *device, struct ib_client *client, 1942 void *data) 1943 { 1944 void *rc; 1945 1946 if (WARN_ON(IS_ERR(data))) 1947 data = NULL; 1948 1949 rc = xa_store(&device->client_data, client->client_id, data, 1950 GFP_KERNEL); 1951 WARN_ON(xa_is_err(rc)); 1952 } 1953 EXPORT_SYMBOL(ib_set_client_data); 1954 1955 /** 1956 * ib_register_event_handler - Register an IB event handler 1957 * @event_handler:Handler to register 1958 * 1959 * ib_register_event_handler() registers an event handler that will be 1960 * called back when asynchronous IB events occur (as defined in 1961 * chapter 11 of the InfiniBand Architecture Specification). This 1962 * callback occurs in workqueue context. 1963 */ 1964 void ib_register_event_handler(struct ib_event_handler *event_handler) 1965 { 1966 down_write(&event_handler->device->event_handler_rwsem); 1967 list_add_tail(&event_handler->list, 1968 &event_handler->device->event_handler_list); 1969 up_write(&event_handler->device->event_handler_rwsem); 1970 } 1971 EXPORT_SYMBOL(ib_register_event_handler); 1972 1973 /** 1974 * ib_unregister_event_handler - Unregister an event handler 1975 * @event_handler:Handler to unregister 1976 * 1977 * Unregister an event handler registered with 1978 * ib_register_event_handler(). 1979 */ 1980 void ib_unregister_event_handler(struct ib_event_handler *event_handler) 1981 { 1982 down_write(&event_handler->device->event_handler_rwsem); 1983 list_del(&event_handler->list); 1984 up_write(&event_handler->device->event_handler_rwsem); 1985 } 1986 EXPORT_SYMBOL(ib_unregister_event_handler); 1987 1988 void ib_dispatch_event_clients(struct ib_event *event) 1989 { 1990 struct ib_event_handler *handler; 1991 1992 down_read(&event->device->event_handler_rwsem); 1993 1994 list_for_each_entry(handler, &event->device->event_handler_list, list) 1995 handler->handler(handler, event); 1996 1997 up_read(&event->device->event_handler_rwsem); 1998 } 1999 2000 static int iw_query_port(struct ib_device *device, 2001 u8 port_num, 2002 struct ib_port_attr *port_attr) 2003 { 2004 struct in_device *inetdev; 2005 struct net_device *netdev; 2006 2007 memset(port_attr, 0, sizeof(*port_attr)); 2008 2009 netdev = ib_device_get_netdev(device, port_num); 2010 if (!netdev) 2011 return -ENODEV; 2012 2013 port_attr->max_mtu = IB_MTU_4096; 2014 port_attr->active_mtu = ib_mtu_int_to_enum(netdev->mtu); 2015 2016 if (!netif_carrier_ok(netdev)) { 2017 port_attr->state = IB_PORT_DOWN; 2018 port_attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; 2019 } else { 2020 rcu_read_lock(); 2021 inetdev = __in_dev_get_rcu(netdev); 2022 2023 if (inetdev && inetdev->ifa_list) { 2024 port_attr->state = IB_PORT_ACTIVE; 2025 port_attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; 2026 } else { 2027 port_attr->state = IB_PORT_INIT; 2028 port_attr->phys_state = 2029 IB_PORT_PHYS_STATE_PORT_CONFIGURATION_TRAINING; 2030 } 2031 2032 rcu_read_unlock(); 2033 } 2034 2035 dev_put(netdev); 2036 return device->ops.query_port(device, port_num, port_attr); 2037 } 2038 2039 static int __ib_query_port(struct ib_device *device, 2040 u8 port_num, 2041 struct ib_port_attr *port_attr) 2042 { 2043 union ib_gid gid = {}; 2044 int err; 2045 2046 memset(port_attr, 0, sizeof(*port_attr)); 2047 2048 err = device->ops.query_port(device, port_num, port_attr); 2049 if (err || port_attr->subnet_prefix) 2050 return err; 2051 2052 if (rdma_port_get_link_layer(device, port_num) != 2053 IB_LINK_LAYER_INFINIBAND) 2054 return 0; 2055 2056 err = device->ops.query_gid(device, port_num, 
0, &gid); 2057 if (err) 2058 return err; 2059 2060 port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix); 2061 return 0; 2062 } 2063 2064 /** 2065 * ib_query_port - Query IB port attributes 2066 * @device:Device to query 2067 * @port_num:Port number to query 2068 * @port_attr:Port attributes 2069 * 2070 * ib_query_port() returns the attributes of a port through the 2071 * @port_attr pointer. 2072 */ 2073 int ib_query_port(struct ib_device *device, 2074 u8 port_num, 2075 struct ib_port_attr *port_attr) 2076 { 2077 if (!rdma_is_port_valid(device, port_num)) 2078 return -EINVAL; 2079 2080 if (rdma_protocol_iwarp(device, port_num)) 2081 return iw_query_port(device, port_num, port_attr); 2082 else 2083 return __ib_query_port(device, port_num, port_attr); 2084 } 2085 EXPORT_SYMBOL(ib_query_port); 2086 2087 static void add_ndev_hash(struct ib_port_data *pdata) 2088 { 2089 unsigned long flags; 2090 2091 might_sleep(); 2092 2093 spin_lock_irqsave(&ndev_hash_lock, flags); 2094 if (hash_hashed(&pdata->ndev_hash_link)) { 2095 hash_del_rcu(&pdata->ndev_hash_link); 2096 spin_unlock_irqrestore(&ndev_hash_lock, flags); 2097 /* 2098 * We cannot do hash_add_rcu after a hash_del_rcu until the 2099 * grace period 2100 */ 2101 synchronize_rcu(); 2102 spin_lock_irqsave(&ndev_hash_lock, flags); 2103 } 2104 if (pdata->netdev) 2105 hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, 2106 (uintptr_t)pdata->netdev); 2107 spin_unlock_irqrestore(&ndev_hash_lock, flags); 2108 } 2109 2110 /** 2111 * ib_device_set_netdev - Associate the ib_dev with an underlying net_device 2112 * @ib_dev: Device to modify 2113 * @ndev: net_device to affiliate, may be NULL 2114 * @port: IB port the net_device is connected to 2115 * 2116 * Drivers should use this to link the ib_device to a netdev so the netdev 2117 * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be 2118 * affiliated with any port. 2119 * 2120 * The caller must ensure that the given ndev is not unregistered or 2121 * unregistering, and that either the ib_device is unregistered or 2122 * ib_device_set_netdev() is called with NULL when the ndev sends a 2123 * NETDEV_UNREGISTER event. 2124 */ 2125 int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, 2126 unsigned int port) 2127 { 2128 struct net_device *old_ndev; 2129 struct ib_port_data *pdata; 2130 unsigned long flags; 2131 int ret; 2132 2133 /* 2134 * Drivers wish to call this before ib_register_driver, so we have to 2135 * setup the port data early. 
2136 */ 2137 ret = alloc_port_data(ib_dev); 2138 if (ret) 2139 return ret; 2140 2141 if (!rdma_is_port_valid(ib_dev, port)) 2142 return -EINVAL; 2143 2144 pdata = &ib_dev->port_data[port]; 2145 spin_lock_irqsave(&pdata->netdev_lock, flags); 2146 old_ndev = rcu_dereference_protected( 2147 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2148 if (old_ndev == ndev) { 2149 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2150 return 0; 2151 } 2152 2153 if (ndev) 2154 dev_hold(ndev); 2155 rcu_assign_pointer(pdata->netdev, ndev); 2156 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2157 2158 add_ndev_hash(pdata); 2159 if (old_ndev) 2160 dev_put(old_ndev); 2161 2162 return 0; 2163 } 2164 EXPORT_SYMBOL(ib_device_set_netdev); 2165 2166 static void free_netdevs(struct ib_device *ib_dev) 2167 { 2168 unsigned long flags; 2169 unsigned int port; 2170 2171 if (!ib_dev->port_data) 2172 return; 2173 2174 rdma_for_each_port (ib_dev, port) { 2175 struct ib_port_data *pdata = &ib_dev->port_data[port]; 2176 struct net_device *ndev; 2177 2178 spin_lock_irqsave(&pdata->netdev_lock, flags); 2179 ndev = rcu_dereference_protected( 2180 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2181 if (ndev) { 2182 spin_lock(&ndev_hash_lock); 2183 hash_del_rcu(&pdata->ndev_hash_link); 2184 spin_unlock(&ndev_hash_lock); 2185 2186 /* 2187 * If this is the last dev_put there is still a 2188 * synchronize_rcu before the netdev is kfreed, so we 2189 * can continue to rely on unlocked pointer 2190 * comparisons after the put 2191 */ 2192 rcu_assign_pointer(pdata->netdev, NULL); 2193 dev_put(ndev); 2194 } 2195 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 2196 } 2197 } 2198 2199 struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, 2200 unsigned int port) 2201 { 2202 struct ib_port_data *pdata; 2203 struct net_device *res; 2204 2205 if (!rdma_is_port_valid(ib_dev, port)) 2206 return NULL; 2207 2208 pdata = &ib_dev->port_data[port]; 2209 2210 /* 2211 * New drivers should use ib_device_set_netdev() not the legacy 2212 * get_netdev(). 2213 */ 2214 if (ib_dev->ops.get_netdev) 2215 res = ib_dev->ops.get_netdev(ib_dev, port); 2216 else { 2217 spin_lock(&pdata->netdev_lock); 2218 res = rcu_dereference_protected( 2219 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2220 if (res) 2221 dev_hold(res); 2222 spin_unlock(&pdata->netdev_lock); 2223 } 2224 2225 /* 2226 * If we are starting to unregister expedite things by preventing 2227 * propagation of an unregistering netdev. 2228 */ 2229 if (res && res->reg_state != NETREG_REGISTERED) { 2230 dev_put(res); 2231 return NULL; 2232 } 2233 2234 return res; 2235 } 2236 2237 /** 2238 * ib_device_get_by_netdev - Find an IB device associated with a netdev 2239 * @ndev: netdev to locate 2240 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) 2241 * 2242 * Find and hold an ib_device that is associated with a netdev via 2243 * ib_device_set_netdev(). The caller must call ib_device_put() on the 2244 * returned pointer. 
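 *
 * A minimal usage sketch, given a struct net_device *ndev (illustrative
 * only; RDMA_DRIVER_UNKNOWN matches any driver):
 *
 *      struct ib_device *ibdev;
 *
 *      ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
 *      if (ibdev) {
 *              ... use ibdev while holding the reference ...
 *              ib_device_put(ibdev);
 *      }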
2245 */ 2246 struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, 2247 enum rdma_driver_id driver_id) 2248 { 2249 struct ib_device *res = NULL; 2250 struct ib_port_data *cur; 2251 2252 rcu_read_lock(); 2253 hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link, 2254 (uintptr_t)ndev) { 2255 if (rcu_access_pointer(cur->netdev) == ndev && 2256 (driver_id == RDMA_DRIVER_UNKNOWN || 2257 cur->ib_dev->ops.driver_id == driver_id) && 2258 ib_device_try_get(cur->ib_dev)) { 2259 res = cur->ib_dev; 2260 break; 2261 } 2262 } 2263 rcu_read_unlock(); 2264 2265 return res; 2266 } 2267 EXPORT_SYMBOL(ib_device_get_by_netdev); 2268 2269 /** 2270 * ib_enum_roce_netdev - enumerate all RoCE ports 2271 * @ib_dev: IB device we want to query 2272 * @filter: Should we call the callback? 2273 * @filter_cookie: Cookie passed to filter 2274 * @cb: Callback to call for each found RoCE port 2275 * @cookie: Cookie passed back to the callback 2276 * 2277 * Enumerates all of the physical RoCE ports of ib_dev 2278 * that are related to a netdevice and calls cb() on each 2279 * port for which filter() returns non-zero. 2280 */ 2281 void ib_enum_roce_netdev(struct ib_device *ib_dev, 2282 roce_netdev_filter filter, 2283 void *filter_cookie, 2284 roce_netdev_callback cb, 2285 void *cookie) 2286 { 2287 unsigned int port; 2288 2289 rdma_for_each_port (ib_dev, port) 2290 if (rdma_protocol_roce(ib_dev, port)) { 2291 struct net_device *idev = 2292 ib_device_get_netdev(ib_dev, port); 2293 2294 if (filter(ib_dev, port, idev, filter_cookie)) 2295 cb(ib_dev, port, idev, cookie); 2296 2297 if (idev) 2298 dev_put(idev); 2299 } 2300 } 2301 2302 /** 2303 * ib_enum_all_roce_netdevs - enumerate all RoCE devices 2304 * @filter: Should we call the callback? 2305 * @filter_cookie: Cookie passed to filter 2306 * @cb: Callback to call for each found RoCE port 2307 * @cookie: Cookie passed back to the callback 2308 * 2309 * Enumerates all RoCE devices' physical ports that are related 2310 * to netdevices and calls cb() on each port for which 2311 * filter() returns non-zero. 2312 */ 2313 void ib_enum_all_roce_netdevs(roce_netdev_filter filter, 2314 void *filter_cookie, 2315 roce_netdev_callback cb, 2316 void *cookie) 2317 { 2318 struct ib_device *dev; 2319 unsigned long index; 2320 2321 down_read(&devices_rwsem); 2322 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) 2323 ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie); 2324 up_read(&devices_rwsem); 2325 } 2326 2327 /** 2328 * ib_enum_all_devs - enumerate all ib_devices 2329 * @nldev_cb: Callback to call for each found ib_device; @skb and @cb are passed through to it 2330 * 2331 * Enumerates all ib_devices accessible from the caller's net namespace and calls nldev_cb() on each device. 2332 */ 2333 int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, 2334 struct netlink_callback *cb) 2335 { 2336 unsigned long index; 2337 struct ib_device *dev; 2338 unsigned int idx = 0; 2339 int ret = 0; 2340 2341 down_read(&devices_rwsem); 2342 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 2343 if (!rdma_dev_access_netns(dev, sock_net(skb->sk))) 2344 continue; 2345 2346 ret = nldev_cb(dev, skb, cb, idx); 2347 if (ret) 2348 break; 2349 idx++; 2350 } 2351 up_read(&devices_rwsem); 2352 return ret; 2353 } 2354 2355 /** 2356 * ib_query_pkey - Get P_Key table entry 2357 * @device:Device to query 2358 * @port_num:Port number to query 2359 * @index:P_Key table index to query 2360 * @pkey:Returned P_Key 2361 * 2362 * ib_query_pkey() fetches the specified P_Key table entry.
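 *
 * For example (sketch only, assuming port 1 is a valid port on @device),
 * reading the entry at index 0 of port 1's P_Key table:
 *
 *      u16 pkey;
 *      int ret = ib_query_pkey(device, 1, 0, &pkey);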
2363 */ 2364 int ib_query_pkey(struct ib_device *device, 2365 u8 port_num, u16 index, u16 *pkey) 2366 { 2367 if (!rdma_is_port_valid(device, port_num)) 2368 return -EINVAL; 2369 2370 if (!device->ops.query_pkey) 2371 return -EOPNOTSUPP; 2372 2373 return device->ops.query_pkey(device, port_num, index, pkey); 2374 } 2375 EXPORT_SYMBOL(ib_query_pkey); 2376 2377 /** 2378 * ib_modify_device - Change IB device attributes 2379 * @device:Device to modify 2380 * @device_modify_mask:Mask of attributes to change 2381 * @device_modify:New attribute values 2382 * 2383 * ib_modify_device() changes a device's attributes as specified by 2384 * the @device_modify_mask and @device_modify structure. 2385 */ 2386 int ib_modify_device(struct ib_device *device, 2387 int device_modify_mask, 2388 struct ib_device_modify *device_modify) 2389 { 2390 if (!device->ops.modify_device) 2391 return -EOPNOTSUPP; 2392 2393 return device->ops.modify_device(device, device_modify_mask, 2394 device_modify); 2395 } 2396 EXPORT_SYMBOL(ib_modify_device); 2397 2398 /** 2399 * ib_modify_port - Modifies the attributes for the specified port. 2400 * @device: The device to modify. 2401 * @port_num: The number of the port to modify. 2402 * @port_modify_mask: Mask used to specify which attributes of the port 2403 * to change. 2404 * @port_modify: New attribute values for the port. 2405 * 2406 * ib_modify_port() changes a port's attributes as specified by the 2407 * @port_modify_mask and @port_modify structure. 2408 */ 2409 int ib_modify_port(struct ib_device *device, 2410 u8 port_num, int port_modify_mask, 2411 struct ib_port_modify *port_modify) 2412 { 2413 int rc; 2414 2415 if (!rdma_is_port_valid(device, port_num)) 2416 return -EINVAL; 2417 2418 if (device->ops.modify_port) 2419 rc = device->ops.modify_port(device, port_num, 2420 port_modify_mask, 2421 port_modify); 2422 else if (rdma_protocol_roce(device, port_num) && 2423 ((port_modify->set_port_cap_mask & ~IB_PORT_CM_SUP) == 0 || 2424 (port_modify->clr_port_cap_mask & ~IB_PORT_CM_SUP) == 0)) 2425 rc = 0; 2426 else 2427 rc = -EOPNOTSUPP; 2428 return rc; 2429 } 2430 EXPORT_SYMBOL(ib_modify_port); 2431 2432 /** 2433 * ib_find_gid - Returns the port number and GID table index where 2434 * a specified GID value occurs. It searches only ports using the IB link layer. 2435 * @device: The device to query. 2436 * @gid: The GID value to search for. 2437 * @port_num: The port number of the device where the GID value was found. 2438 * @index: The index into the GID table where the GID was found. This 2439 * parameter may be NULL. 2440 */ 2441 int ib_find_gid(struct ib_device *device, union ib_gid *gid, 2442 u8 *port_num, u16 *index) 2443 { 2444 union ib_gid tmp_gid; 2445 unsigned int port; 2446 int ret, i; 2447 2448 rdma_for_each_port (device, port) { 2449 if (!rdma_protocol_ib(device, port)) 2450 continue; 2451 2452 for (i = 0; i < device->port_data[port].immutable.gid_tbl_len; 2453 ++i) { 2454 ret = rdma_query_gid(device, port, i, &tmp_gid); 2455 if (ret) 2456 return ret; 2457 if (!memcmp(&tmp_gid, gid, sizeof *gid)) { 2458 *port_num = port; 2459 if (index) 2460 *index = i; 2461 return 0; 2462 } 2463 } 2464 } 2465 2466 return -ENOENT; 2467 } 2468 EXPORT_SYMBOL(ib_find_gid); 2469 2470 /** 2471 * ib_find_pkey - Returns the PKey table index where a specified 2472 * PKey value occurs. 2473 * @device: The device to query. 2474 * @port_num: The port number of the device to search for the PKey. 2475 * @pkey: The PKey value to search for. 2476 * @index: The index into the PKey table where the PKey was found.
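 *
 * Full-member entries (bit 15 set) are preferred; a limited-member entry is
 * returned only if no full-member entry matches the low 15 bits. An
 * illustrative lookup of the default P_Key (sketch only, assuming port 1
 * is valid on @device):
 *
 *      u16 index;
 *      int ret = ib_find_pkey(device, 1, 0xffff, &index);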
2477 */ 2478 int ib_find_pkey(struct ib_device *device, 2479 u8 port_num, u16 pkey, u16 *index) 2480 { 2481 int ret, i; 2482 u16 tmp_pkey; 2483 int partial_ix = -1; 2484 2485 for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len; 2486 ++i) { 2487 ret = ib_query_pkey(device, port_num, i, &tmp_pkey); 2488 if (ret) 2489 return ret; 2490 if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) { 2491 /* If there is a full-member pkey, take it. */ 2492 if (tmp_pkey & 0x8000) { 2493 *index = i; 2494 return 0; 2495 } 2496 if (partial_ix < 0) 2497 partial_ix = i; 2498 } 2499 } 2500 2501 /* No full member; if a limited member exists, take it. */ 2502 if (partial_ix >= 0) { 2503 *index = partial_ix; 2504 return 0; 2505 } 2506 return -ENOENT; 2507 } 2508 EXPORT_SYMBOL(ib_find_pkey); 2509 2510 /** 2511 * ib_get_net_dev_by_params() - Return the appropriate net_dev 2512 * for a received CM request 2513 * @dev: An RDMA device on which the request has been received. 2514 * @port: Port number on the RDMA device. 2515 * @pkey: The P_Key the request came on. 2516 * @gid: A GID that the net_dev uses to communicate. 2517 * @addr: Contains the IP address that the request specified as its 2518 * destination. 2519 * 2520 */ 2521 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, 2522 u8 port, 2523 u16 pkey, 2524 const union ib_gid *gid, 2525 const struct sockaddr *addr) 2526 { 2527 struct net_device *net_dev = NULL; 2528 unsigned long index; 2529 void *client_data; 2530 2531 if (!rdma_protocol_ib(dev, port)) 2532 return NULL; 2533 2534 /* 2535 * Holding the read side guarantees that the client will not become 2536 * unregistered while we are calling get_net_dev_by_params() 2537 */ 2538 down_read(&dev->client_data_rwsem); 2539 xan_for_each_marked (&dev->client_data, index, client_data, 2540 CLIENT_DATA_REGISTERED) { 2541 struct ib_client *client = xa_load(&clients, index); 2542 2543 if (!client || !client->get_net_dev_by_params) 2544 continue; 2545 2546 net_dev = client->get_net_dev_by_params(dev, port, pkey, gid, 2547 addr, client_data); 2548 if (net_dev) 2549 break; 2550 } 2551 up_read(&dev->client_data_rwsem); 2552 2553 return net_dev; 2554 } 2555 EXPORT_SYMBOL(ib_get_net_dev_by_params); 2556 2557 void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops) 2558 { 2559 struct ib_device_ops *dev_ops = &dev->ops; 2560 #define SET_DEVICE_OP(ptr, name) \ 2561 do { \ 2562 if (ops->name) \ 2563 if (!((ptr)->name)) \ 2564 (ptr)->name = ops->name; \ 2565 } while (0) 2566 2567 #define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name) 2568 2569 if (ops->driver_id != RDMA_DRIVER_UNKNOWN) { 2570 WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN && 2571 dev_ops->driver_id != ops->driver_id); 2572 dev_ops->driver_id = ops->driver_id; 2573 } 2574 if (ops->owner) { 2575 WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner); 2576 dev_ops->owner = ops->owner; 2577 } 2578 if (ops->uverbs_abi_ver) 2579 dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver; 2580 2581 dev_ops->uverbs_no_driver_id_binding |= 2582 ops->uverbs_no_driver_id_binding; 2583 2584 SET_DEVICE_OP(dev_ops, add_gid); 2585 SET_DEVICE_OP(dev_ops, advise_mr); 2586 SET_DEVICE_OP(dev_ops, alloc_dm); 2587 SET_DEVICE_OP(dev_ops, alloc_hw_stats); 2588 SET_DEVICE_OP(dev_ops, alloc_mr); 2589 SET_DEVICE_OP(dev_ops, alloc_mr_integrity); 2590 SET_DEVICE_OP(dev_ops, alloc_mw); 2591 SET_DEVICE_OP(dev_ops, alloc_pd); 2592 SET_DEVICE_OP(dev_ops, alloc_rdma_netdev); 2593 SET_DEVICE_OP(dev_ops, alloc_ucontext); 2594 SET_DEVICE_OP(dev_ops, alloc_xrcd); 2595
SET_DEVICE_OP(dev_ops, attach_mcast); 2596 SET_DEVICE_OP(dev_ops, check_mr_status); 2597 SET_DEVICE_OP(dev_ops, counter_alloc_stats); 2598 SET_DEVICE_OP(dev_ops, counter_bind_qp); 2599 SET_DEVICE_OP(dev_ops, counter_dealloc); 2600 SET_DEVICE_OP(dev_ops, counter_unbind_qp); 2601 SET_DEVICE_OP(dev_ops, counter_update_stats); 2602 SET_DEVICE_OP(dev_ops, create_ah); 2603 SET_DEVICE_OP(dev_ops, create_counters); 2604 SET_DEVICE_OP(dev_ops, create_cq); 2605 SET_DEVICE_OP(dev_ops, create_flow); 2606 SET_DEVICE_OP(dev_ops, create_flow_action_esp); 2607 SET_DEVICE_OP(dev_ops, create_qp); 2608 SET_DEVICE_OP(dev_ops, create_rwq_ind_table); 2609 SET_DEVICE_OP(dev_ops, create_srq); 2610 SET_DEVICE_OP(dev_ops, create_wq); 2611 SET_DEVICE_OP(dev_ops, dealloc_dm); 2612 SET_DEVICE_OP(dev_ops, dealloc_driver); 2613 SET_DEVICE_OP(dev_ops, dealloc_mw); 2614 SET_DEVICE_OP(dev_ops, dealloc_pd); 2615 SET_DEVICE_OP(dev_ops, dealloc_ucontext); 2616 SET_DEVICE_OP(dev_ops, dealloc_xrcd); 2617 SET_DEVICE_OP(dev_ops, del_gid); 2618 SET_DEVICE_OP(dev_ops, dereg_mr); 2619 SET_DEVICE_OP(dev_ops, destroy_ah); 2620 SET_DEVICE_OP(dev_ops, destroy_counters); 2621 SET_DEVICE_OP(dev_ops, destroy_cq); 2622 SET_DEVICE_OP(dev_ops, destroy_flow); 2623 SET_DEVICE_OP(dev_ops, destroy_flow_action); 2624 SET_DEVICE_OP(dev_ops, destroy_qp); 2625 SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); 2626 SET_DEVICE_OP(dev_ops, destroy_srq); 2627 SET_DEVICE_OP(dev_ops, destroy_wq); 2628 SET_DEVICE_OP(dev_ops, detach_mcast); 2629 SET_DEVICE_OP(dev_ops, disassociate_ucontext); 2630 SET_DEVICE_OP(dev_ops, drain_rq); 2631 SET_DEVICE_OP(dev_ops, drain_sq); 2632 SET_DEVICE_OP(dev_ops, enable_driver); 2633 SET_DEVICE_OP(dev_ops, fill_res_cm_id_entry); 2634 SET_DEVICE_OP(dev_ops, fill_res_cq_entry); 2635 SET_DEVICE_OP(dev_ops, fill_res_cq_entry_raw); 2636 SET_DEVICE_OP(dev_ops, fill_res_mr_entry); 2637 SET_DEVICE_OP(dev_ops, fill_res_mr_entry_raw); 2638 SET_DEVICE_OP(dev_ops, fill_res_qp_entry); 2639 SET_DEVICE_OP(dev_ops, fill_res_qp_entry_raw); 2640 SET_DEVICE_OP(dev_ops, fill_stat_mr_entry); 2641 SET_DEVICE_OP(dev_ops, get_dev_fw_str); 2642 SET_DEVICE_OP(dev_ops, get_dma_mr); 2643 SET_DEVICE_OP(dev_ops, get_hw_stats); 2644 SET_DEVICE_OP(dev_ops, get_link_layer); 2645 SET_DEVICE_OP(dev_ops, get_netdev); 2646 SET_DEVICE_OP(dev_ops, get_port_immutable); 2647 SET_DEVICE_OP(dev_ops, get_vector_affinity); 2648 SET_DEVICE_OP(dev_ops, get_vf_config); 2649 SET_DEVICE_OP(dev_ops, get_vf_guid); 2650 SET_DEVICE_OP(dev_ops, get_vf_stats); 2651 SET_DEVICE_OP(dev_ops, init_port); 2652 SET_DEVICE_OP(dev_ops, iw_accept); 2653 SET_DEVICE_OP(dev_ops, iw_add_ref); 2654 SET_DEVICE_OP(dev_ops, iw_connect); 2655 SET_DEVICE_OP(dev_ops, iw_create_listen); 2656 SET_DEVICE_OP(dev_ops, iw_destroy_listen); 2657 SET_DEVICE_OP(dev_ops, iw_get_qp); 2658 SET_DEVICE_OP(dev_ops, iw_reject); 2659 SET_DEVICE_OP(dev_ops, iw_rem_ref); 2660 SET_DEVICE_OP(dev_ops, map_mr_sg); 2661 SET_DEVICE_OP(dev_ops, map_mr_sg_pi); 2662 SET_DEVICE_OP(dev_ops, mmap); 2663 SET_DEVICE_OP(dev_ops, mmap_free); 2664 SET_DEVICE_OP(dev_ops, modify_ah); 2665 SET_DEVICE_OP(dev_ops, modify_cq); 2666 SET_DEVICE_OP(dev_ops, modify_device); 2667 SET_DEVICE_OP(dev_ops, modify_flow_action_esp); 2668 SET_DEVICE_OP(dev_ops, modify_port); 2669 SET_DEVICE_OP(dev_ops, modify_qp); 2670 SET_DEVICE_OP(dev_ops, modify_srq); 2671 SET_DEVICE_OP(dev_ops, modify_wq); 2672 SET_DEVICE_OP(dev_ops, peek_cq); 2673 SET_DEVICE_OP(dev_ops, poll_cq); 2674 SET_DEVICE_OP(dev_ops, post_recv); 2675 SET_DEVICE_OP(dev_ops, post_send); 2676 
SET_DEVICE_OP(dev_ops, post_srq_recv); 2677 SET_DEVICE_OP(dev_ops, process_mad); 2678 SET_DEVICE_OP(dev_ops, query_ah); 2679 SET_DEVICE_OP(dev_ops, query_device); 2680 SET_DEVICE_OP(dev_ops, query_gid); 2681 SET_DEVICE_OP(dev_ops, query_pkey); 2682 SET_DEVICE_OP(dev_ops, query_port); 2683 SET_DEVICE_OP(dev_ops, query_qp); 2684 SET_DEVICE_OP(dev_ops, query_srq); 2685 SET_DEVICE_OP(dev_ops, query_ucontext); 2686 SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); 2687 SET_DEVICE_OP(dev_ops, read_counters); 2688 SET_DEVICE_OP(dev_ops, reg_dm_mr); 2689 SET_DEVICE_OP(dev_ops, reg_user_mr); 2690 SET_DEVICE_OP(dev_ops, req_ncomp_notif); 2691 SET_DEVICE_OP(dev_ops, req_notify_cq); 2692 SET_DEVICE_OP(dev_ops, rereg_user_mr); 2693 SET_DEVICE_OP(dev_ops, resize_cq); 2694 SET_DEVICE_OP(dev_ops, set_vf_guid); 2695 SET_DEVICE_OP(dev_ops, set_vf_link_state); 2696 2697 SET_OBJ_SIZE(dev_ops, ib_ah); 2698 SET_OBJ_SIZE(dev_ops, ib_counters); 2699 SET_OBJ_SIZE(dev_ops, ib_cq); 2700 SET_OBJ_SIZE(dev_ops, ib_pd); 2701 SET_OBJ_SIZE(dev_ops, ib_srq); 2702 SET_OBJ_SIZE(dev_ops, ib_ucontext); 2703 SET_OBJ_SIZE(dev_ops, ib_xrcd); 2704 } 2705 EXPORT_SYMBOL(ib_set_device_ops); 2706 2707 static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { 2708 [RDMA_NL_LS_OP_RESOLVE] = { 2709 .doit = ib_nl_handle_resolve_resp, 2710 .flags = RDMA_NL_ADMIN_PERM, 2711 }, 2712 [RDMA_NL_LS_OP_SET_TIMEOUT] = { 2713 .doit = ib_nl_handle_set_timeout, 2714 .flags = RDMA_NL_ADMIN_PERM, 2715 }, 2716 [RDMA_NL_LS_OP_IP_RESOLVE] = { 2717 .doit = ib_nl_handle_ip_res_resp, 2718 .flags = RDMA_NL_ADMIN_PERM, 2719 }, 2720 }; 2721 2722 static int __init ib_core_init(void) 2723 { 2724 int ret; 2725 2726 ib_wq = alloc_workqueue("infiniband", 0, 0); 2727 if (!ib_wq) 2728 return -ENOMEM; 2729 2730 ib_comp_wq = alloc_workqueue("ib-comp-wq", 2731 WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0); 2732 if (!ib_comp_wq) { 2733 ret = -ENOMEM; 2734 goto err; 2735 } 2736 2737 ib_comp_unbound_wq = 2738 alloc_workqueue("ib-comp-unb-wq", 2739 WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | 2740 WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); 2741 if (!ib_comp_unbound_wq) { 2742 ret = -ENOMEM; 2743 goto err_comp; 2744 } 2745 2746 ret = class_register(&ib_class); 2747 if (ret) { 2748 pr_warn("Couldn't create InfiniBand device class\n"); 2749 goto err_comp_unbound; 2750 } 2751 2752 rdma_nl_init(); 2753 2754 ret = addr_init(); 2755 if (ret) { 2756 pr_warn("Couldn't init IB address resolution\n"); 2757 goto err_ibnl; 2758 } 2759 2760 ret = ib_mad_init(); 2761 if (ret) { 2762 pr_warn("Couldn't init IB MAD\n"); 2763 goto err_addr; 2764 } 2765 2766 ret = ib_sa_init(); 2767 if (ret) { 2768 pr_warn("Couldn't init SA\n"); 2769 goto err_mad; 2770 } 2771 2772 ret = register_blocking_lsm_notifier(&ibdev_lsm_nb); 2773 if (ret) { 2774 pr_warn("Couldn't register LSM notifier. ret %d\n", ret); 2775 goto err_sa; 2776 } 2777 2778 ret = register_pernet_device(&rdma_dev_net_ops); 2779 if (ret) { 2780 pr_warn("Couldn't init compat dev. 
ret %d\n", ret); 2781 goto err_compat; 2782 } 2783 2784 nldev_init(); 2785 rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table); 2786 roce_gid_mgmt_init(); 2787 2788 return 0; 2789 2790 err_compat: 2791 unregister_blocking_lsm_notifier(&ibdev_lsm_nb); 2792 err_sa: 2793 ib_sa_cleanup(); 2794 err_mad: 2795 ib_mad_cleanup(); 2796 err_addr: 2797 addr_cleanup(); 2798 err_ibnl: 2799 class_unregister(&ib_class); 2800 err_comp_unbound: 2801 destroy_workqueue(ib_comp_unbound_wq); 2802 err_comp: 2803 destroy_workqueue(ib_comp_wq); 2804 err: 2805 destroy_workqueue(ib_wq); 2806 return ret; 2807 } 2808 2809 static void __exit ib_core_cleanup(void) 2810 { 2811 roce_gid_mgmt_cleanup(); 2812 nldev_exit(); 2813 rdma_nl_unregister(RDMA_NL_LS); 2814 unregister_pernet_device(&rdma_dev_net_ops); 2815 unregister_blocking_lsm_notifier(&ibdev_lsm_nb); 2816 ib_sa_cleanup(); 2817 ib_mad_cleanup(); 2818 addr_cleanup(); 2819 rdma_nl_exit(); 2820 class_unregister(&ib_class); 2821 destroy_workqueue(ib_comp_unbound_wq); 2822 destroy_workqueue(ib_comp_wq); 2823 /* Make sure that any pending umem accounting work is done. */ 2824 destroy_workqueue(ib_wq); 2825 flush_workqueue(system_unbound_wq); 2826 WARN_ON(!xa_empty(&clients)); 2827 WARN_ON(!xa_empty(&devices)); 2828 } 2829 2830 MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4); 2831 2832 /* ib core relies on netdev stack to first register net_ns_type_operations 2833 * ns kobject type before ib_core initialization. 2834 */ 2835 fs_initcall(ib_core_init); 2836 module_exit(ib_core_cleanup); 2837
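/*
 * Illustrative sketch of the client API exported above. The names my_client,
 * my_add_one, my_remove_one and struct my_state are hypothetical and not part
 * of ib_core; callback prototypes are abbreviated, see struct ib_client in
 * <rdma/ib_verbs.h> for the exact signatures. A kernel module acting as an IB
 * client would typically look like:
 *
 *      static struct ib_client my_client;
 *
 *      static int my_add_one(struct ib_device *ibdev)
 *      {
 *              struct my_state *st = kzalloc(sizeof(*st), GFP_KERNEL);
 *
 *              if (!st)
 *                      return -ENOMEM;
 *              ib_set_client_data(ibdev, &my_client, st);
 *              return 0;
 *      }
 *
 *      static void my_remove_one(struct ib_device *ibdev, void *client_data)
 *      {
 *              kfree(client_data);
 *      }
 *
 *      static struct ib_client my_client = {
 *              .name   = "my_client",
 *              .add    = my_add_one,
 *              .remove = my_remove_one,
 *      };
 *
 * Registering the client delivers an add() callback for every device that is
 * already registered, and ib_unregister_client() is a full fence: once it
 * returns, no callback is still running or will run again.
 *
 *      err = ib_register_client(&my_client);
 *      ...
 *      ib_unregister_client(&my_client);
 */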