1 /* 2 * Copyright (c) 2004 Topspin Communications. All rights reserved. 3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34 #include <linux/module.h> 35 #include <linux/string.h> 36 #include <linux/errno.h> 37 #include <linux/kernel.h> 38 #include <linux/slab.h> 39 #include <linux/init.h> 40 #include <linux/netdevice.h> 41 #include <net/net_namespace.h> 42 #include <net/netns/generic.h> 43 #include <linux/security.h> 44 #include <linux/notifier.h> 45 #include <linux/hashtable.h> 46 #include <rdma/rdma_netlink.h> 47 #include <rdma/ib_addr.h> 48 #include <rdma/ib_cache.h> 49 #include <rdma/rdma_counter.h> 50 51 #include "core_priv.h" 52 #include "restrack.h" 53 54 MODULE_AUTHOR("Roland Dreier"); 55 MODULE_DESCRIPTION("core kernel InfiniBand API"); 56 MODULE_LICENSE("Dual BSD/GPL"); 57 58 struct workqueue_struct *ib_comp_wq; 59 struct workqueue_struct *ib_comp_unbound_wq; 60 struct workqueue_struct *ib_wq; 61 EXPORT_SYMBOL_GPL(ib_wq); 62 63 /* 64 * Each of the three rwsem locks (devices, clients, client_data) protects the 65 * xarray of the same name. Specifically it allows the caller to assert that 66 * the MARK will/will not be changing under the lock, and for devices and 67 * clients, that the value in the xarray is still a valid pointer. Change of 68 * the MARK is linked to the object state, so holding the lock and testing the 69 * MARK also asserts that the contained object is in a certain state. 70 * 71 * This is used to build a two stage register/unregister flow where objects 72 * can continue to be in the xarray even though they are still in progress to 73 * register/unregister. 74 * 75 * The xarray itself provides additional locking, and restartable iteration, 76 * which is also relied on. 77 * 78 * Locks should not be nested, with the exception of client_data, which is 79 * allowed to nest under the read side of the other two locks. 80 * 81 * The devices_rwsem also protects the device name list, any change or 82 * assignment of device name must also hold the write side to guarantee unique 83 * names. 84 */ 85 86 /* 87 * devices contains devices that have had their names assigned. The 88 * devices may not be registered. 
 * Users that care about the registration
 * status need to call ib_device_try_get() on the device to ensure it is
 * registered, and keep it registered, for the required duration.
 *
 */
static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(devices_rwsem);
#define DEVICE_REGISTERED XA_MARK_1

static LIST_HEAD(client_list);
#define CLIENT_REGISTERED XA_MARK_1
static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(clients_rwsem);

/*
 * If client_data is registered then the corresponding client must also still
 * be registered.
 */
#define CLIENT_DATA_REGISTERED XA_MARK_1

/**
 * struct rdma_dev_net - rdma net namespace metadata for a net
 * @net: Pointer to owner net namespace
 * @id: xarray id to identify the net namespace.
 */
struct rdma_dev_net {
	possible_net_t net;
	u32 id;
};

static unsigned int rdma_dev_net_id;

/*
 * A list of net namespaces is maintained in an xarray. This is necessary
 * because we can't get the locking right using the existing net ns list. We
 * would require an init_net callback after the list is updated.
 */
static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
/*
 * rwsem to protect accessing the rdma_nets xarray entries.
 */
static DECLARE_RWSEM(rdma_nets_rwsem);

bool ib_devices_shared_netns = true;
module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
MODULE_PARM_DESC(netns_mode,
		 "Share device among net namespaces; default=1 (shared)");
/**
 * rdma_dev_access_netns() - Return whether an rdma device can be accessed
 *			     from a specified net namespace or not.
 * @dev: Pointer to rdma device which needs to be checked
 * @net: Pointer to net namespace for which access is to be checked
 *
 * When the rdma device is in shared mode, it ignores the net namespace.
 * When the rdma device is exclusive to a net namespace, the device's net
 * namespace is checked against the specified one.
 */
bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
{
	return (ib_devices_shared_netns ||
		net_eq(read_pnet(&dev->coredev.rdma_net), net));
}
EXPORT_SYMBOL(rdma_dev_access_netns);

/*
 * xarray has this behavior where it won't iterate over NULL values stored in
 * allocated arrays. So we need our own iterator to see all values stored in
 * the array. This does the same thing as xa_for_each except that it also
 * returns NULL valued entries if the array is allocating. Simplified to only
 * work on simple xarrays.
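 *
 * Minimal usage sketch (illustrative only, mirroring how the iterator is
 * used later in this file, e.g. in ib_device_rename()). Note that entry
 * may legitimately be NULL for an allocated-but-unset slot:
 *
 *	unsigned long index;
 *	void *entry;
 *
 *	xan_for_each_marked(&device->client_data, index, entry,
 *			    CLIENT_DATA_REGISTERED) {
 *		...
 *	}
 *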
161 */ 162 static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, 163 xa_mark_t filter) 164 { 165 XA_STATE(xas, xa, *indexp); 166 void *entry; 167 168 rcu_read_lock(); 169 do { 170 entry = xas_find_marked(&xas, ULONG_MAX, filter); 171 if (xa_is_zero(entry)) 172 break; 173 } while (xas_retry(&xas, entry)); 174 rcu_read_unlock(); 175 176 if (entry) { 177 *indexp = xas.xa_index; 178 if (xa_is_zero(entry)) 179 return NULL; 180 return entry; 181 } 182 return XA_ERROR(-ENOENT); 183 } 184 #define xan_for_each_marked(xa, index, entry, filter) \ 185 for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ 186 !xa_is_err(entry); \ 187 (index)++, entry = xan_find_marked(xa, &(index), filter)) 188 189 /* RCU hash table mapping netdevice pointers to struct ib_port_data */ 190 static DEFINE_SPINLOCK(ndev_hash_lock); 191 static DECLARE_HASHTABLE(ndev_hash, 5); 192 193 static void free_netdevs(struct ib_device *ib_dev); 194 static void ib_unregister_work(struct work_struct *work); 195 static void __ib_unregister_device(struct ib_device *device); 196 static int ib_security_change(struct notifier_block *nb, unsigned long event, 197 void *lsm_data); 198 static void ib_policy_change_task(struct work_struct *work); 199 static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); 200 201 static void __ibdev_printk(const char *level, const struct ib_device *ibdev, 202 struct va_format *vaf) 203 { 204 if (ibdev && ibdev->dev.parent) 205 dev_printk_emit(level[1] - '0', 206 ibdev->dev.parent, 207 "%s %s %s: %pV", 208 dev_driver_string(ibdev->dev.parent), 209 dev_name(ibdev->dev.parent), 210 dev_name(&ibdev->dev), 211 vaf); 212 else if (ibdev) 213 printk("%s%s: %pV", 214 level, dev_name(&ibdev->dev), vaf); 215 else 216 printk("%s(NULL ib_device): %pV", level, vaf); 217 } 218 219 void ibdev_printk(const char *level, const struct ib_device *ibdev, 220 const char *format, ...) 221 { 222 struct va_format vaf; 223 va_list args; 224 225 va_start(args, format); 226 227 vaf.fmt = format; 228 vaf.va = &args; 229 230 __ibdev_printk(level, ibdev, &vaf); 231 232 va_end(args); 233 } 234 EXPORT_SYMBOL(ibdev_printk); 235 236 #define define_ibdev_printk_level(func, level) \ 237 void func(const struct ib_device *ibdev, const char *fmt, ...) 
\ 238 { \ 239 struct va_format vaf; \ 240 va_list args; \ 241 \ 242 va_start(args, fmt); \ 243 \ 244 vaf.fmt = fmt; \ 245 vaf.va = &args; \ 246 \ 247 __ibdev_printk(level, ibdev, &vaf); \ 248 \ 249 va_end(args); \ 250 } \ 251 EXPORT_SYMBOL(func); 252 253 define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); 254 define_ibdev_printk_level(ibdev_alert, KERN_ALERT); 255 define_ibdev_printk_level(ibdev_crit, KERN_CRIT); 256 define_ibdev_printk_level(ibdev_err, KERN_ERR); 257 define_ibdev_printk_level(ibdev_warn, KERN_WARNING); 258 define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); 259 define_ibdev_printk_level(ibdev_info, KERN_INFO); 260 261 static struct notifier_block ibdev_lsm_nb = { 262 .notifier_call = ib_security_change, 263 }; 264 265 static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, 266 struct net *net); 267 268 /* Pointer to the RCU head at the start of the ib_port_data array */ 269 struct ib_port_data_rcu { 270 struct rcu_head rcu_head; 271 struct ib_port_data pdata[]; 272 }; 273 274 static void ib_device_check_mandatory(struct ib_device *device) 275 { 276 #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } 277 static const struct { 278 size_t offset; 279 char *name; 280 } mandatory_table[] = { 281 IB_MANDATORY_FUNC(query_device), 282 IB_MANDATORY_FUNC(query_port), 283 IB_MANDATORY_FUNC(query_pkey), 284 IB_MANDATORY_FUNC(alloc_pd), 285 IB_MANDATORY_FUNC(dealloc_pd), 286 IB_MANDATORY_FUNC(create_qp), 287 IB_MANDATORY_FUNC(modify_qp), 288 IB_MANDATORY_FUNC(destroy_qp), 289 IB_MANDATORY_FUNC(post_send), 290 IB_MANDATORY_FUNC(post_recv), 291 IB_MANDATORY_FUNC(create_cq), 292 IB_MANDATORY_FUNC(destroy_cq), 293 IB_MANDATORY_FUNC(poll_cq), 294 IB_MANDATORY_FUNC(req_notify_cq), 295 IB_MANDATORY_FUNC(get_dma_mr), 296 IB_MANDATORY_FUNC(dereg_mr), 297 IB_MANDATORY_FUNC(get_port_immutable) 298 }; 299 int i; 300 301 device->kverbs_provider = true; 302 for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { 303 if (!*(void **) ((void *) &device->ops + 304 mandatory_table[i].offset)) { 305 device->kverbs_provider = false; 306 break; 307 } 308 } 309 } 310 311 /* 312 * Caller must perform ib_device_put() to return the device reference count 313 * when ib_device_get_by_index() returns valid device pointer. 314 */ 315 struct ib_device *ib_device_get_by_index(const struct net *net, u32 index) 316 { 317 struct ib_device *device; 318 319 down_read(&devices_rwsem); 320 device = xa_load(&devices, index); 321 if (device) { 322 if (!rdma_dev_access_netns(device, net)) { 323 device = NULL; 324 goto out; 325 } 326 327 if (!ib_device_try_get(device)) 328 device = NULL; 329 } 330 out: 331 up_read(&devices_rwsem); 332 return device; 333 } 334 335 /** 336 * ib_device_put - Release IB device reference 337 * @device: device whose reference to be released 338 * 339 * ib_device_put() releases reference to the IB device to allow it to be 340 * unregistered and eventually free. 
341 */ 342 void ib_device_put(struct ib_device *device) 343 { 344 if (refcount_dec_and_test(&device->refcount)) 345 complete(&device->unreg_completion); 346 } 347 EXPORT_SYMBOL(ib_device_put); 348 349 static struct ib_device *__ib_device_get_by_name(const char *name) 350 { 351 struct ib_device *device; 352 unsigned long index; 353 354 xa_for_each (&devices, index, device) 355 if (!strcmp(name, dev_name(&device->dev))) 356 return device; 357 358 return NULL; 359 } 360 361 /** 362 * ib_device_get_by_name - Find an IB device by name 363 * @name: The name to look for 364 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) 365 * 366 * Find and hold an ib_device by its name. The caller must call 367 * ib_device_put() on the returned pointer. 368 */ 369 struct ib_device *ib_device_get_by_name(const char *name, 370 enum rdma_driver_id driver_id) 371 { 372 struct ib_device *device; 373 374 down_read(&devices_rwsem); 375 device = __ib_device_get_by_name(name); 376 if (device && driver_id != RDMA_DRIVER_UNKNOWN && 377 device->ops.driver_id != driver_id) 378 device = NULL; 379 380 if (device) { 381 if (!ib_device_try_get(device)) 382 device = NULL; 383 } 384 up_read(&devices_rwsem); 385 return device; 386 } 387 EXPORT_SYMBOL(ib_device_get_by_name); 388 389 static int rename_compat_devs(struct ib_device *device) 390 { 391 struct ib_core_device *cdev; 392 unsigned long index; 393 int ret = 0; 394 395 mutex_lock(&device->compat_devs_mutex); 396 xa_for_each (&device->compat_devs, index, cdev) { 397 ret = device_rename(&cdev->dev, dev_name(&device->dev)); 398 if (ret) { 399 dev_warn(&cdev->dev, 400 "Fail to rename compatdev to new name %s\n", 401 dev_name(&device->dev)); 402 break; 403 } 404 } 405 mutex_unlock(&device->compat_devs_mutex); 406 return ret; 407 } 408 409 int ib_device_rename(struct ib_device *ibdev, const char *name) 410 { 411 unsigned long index; 412 void *client_data; 413 int ret; 414 415 down_write(&devices_rwsem); 416 if (!strcmp(name, dev_name(&ibdev->dev))) { 417 up_write(&devices_rwsem); 418 return 0; 419 } 420 421 if (__ib_device_get_by_name(name)) { 422 up_write(&devices_rwsem); 423 return -EEXIST; 424 } 425 426 ret = device_rename(&ibdev->dev, name); 427 if (ret) { 428 up_write(&devices_rwsem); 429 return ret; 430 } 431 432 strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX); 433 ret = rename_compat_devs(ibdev); 434 435 downgrade_write(&devices_rwsem); 436 down_read(&ibdev->client_data_rwsem); 437 xan_for_each_marked(&ibdev->client_data, index, client_data, 438 CLIENT_DATA_REGISTERED) { 439 struct ib_client *client = xa_load(&clients, index); 440 441 if (!client || !client->rename) 442 continue; 443 444 client->rename(ibdev, client_data); 445 } 446 up_read(&ibdev->client_data_rwsem); 447 up_read(&devices_rwsem); 448 return 0; 449 } 450 451 int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim) 452 { 453 if (use_dim > 1) 454 return -EINVAL; 455 ibdev->use_cq_dim = use_dim; 456 457 return 0; 458 } 459 460 static int alloc_name(struct ib_device *ibdev, const char *name) 461 { 462 struct ib_device *device; 463 unsigned long index; 464 struct ida inuse; 465 int rc; 466 int i; 467 468 lockdep_assert_held_write(&devices_rwsem); 469 ida_init(&inuse); 470 xa_for_each (&devices, index, device) { 471 char buf[IB_DEVICE_NAME_MAX]; 472 473 if (sscanf(dev_name(&device->dev), name, &i) != 1) 474 continue; 475 if (i < 0 || i >= INT_MAX) 476 continue; 477 snprintf(buf, sizeof buf, name, i); 478 if (strcmp(buf, dev_name(&device->dev)) != 0) 479 continue; 480 481 rc = 
ida_alloc_range(&inuse, i, i, GFP_KERNEL); 482 if (rc < 0) 483 goto out; 484 } 485 486 rc = ida_alloc(&inuse, GFP_KERNEL); 487 if (rc < 0) 488 goto out; 489 490 rc = dev_set_name(&ibdev->dev, name, rc); 491 out: 492 ida_destroy(&inuse); 493 return rc; 494 } 495 496 static void ib_device_release(struct device *device) 497 { 498 struct ib_device *dev = container_of(device, struct ib_device, dev); 499 500 free_netdevs(dev); 501 WARN_ON(refcount_read(&dev->refcount)); 502 if (dev->port_data) { 503 ib_cache_release_one(dev); 504 ib_security_release_port_pkey_list(dev); 505 rdma_counter_release(dev); 506 kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, 507 pdata[0]), 508 rcu_head); 509 } 510 511 xa_destroy(&dev->compat_devs); 512 xa_destroy(&dev->client_data); 513 kfree_rcu(dev, rcu_head); 514 } 515 516 static int ib_device_uevent(struct device *device, 517 struct kobj_uevent_env *env) 518 { 519 if (add_uevent_var(env, "NAME=%s", dev_name(device))) 520 return -ENOMEM; 521 522 /* 523 * It would be nice to pass the node GUID with the event... 524 */ 525 526 return 0; 527 } 528 529 static const void *net_namespace(struct device *d) 530 { 531 struct ib_core_device *coredev = 532 container_of(d, struct ib_core_device, dev); 533 534 return read_pnet(&coredev->rdma_net); 535 } 536 537 static struct class ib_class = { 538 .name = "infiniband", 539 .dev_release = ib_device_release, 540 .dev_uevent = ib_device_uevent, 541 .ns_type = &net_ns_type_operations, 542 .namespace = net_namespace, 543 }; 544 545 static void rdma_init_coredev(struct ib_core_device *coredev, 546 struct ib_device *dev, struct net *net) 547 { 548 /* This BUILD_BUG_ON is intended to catch layout change 549 * of union of ib_core_device and device. 550 * dev must be the first element as ib_core and providers 551 * driver uses it. Adding anything in ib_core_device before 552 * device will break this assumption. 553 */ 554 BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != 555 offsetof(struct ib_device, dev)); 556 557 coredev->dev.class = &ib_class; 558 coredev->dev.groups = dev->groups; 559 device_initialize(&coredev->dev); 560 coredev->owner = dev; 561 INIT_LIST_HEAD(&coredev->port_list); 562 write_pnet(&coredev->rdma_net, net); 563 } 564 565 /** 566 * _ib_alloc_device - allocate an IB device struct 567 * @size:size of structure to allocate 568 * 569 * Low-level drivers should use ib_alloc_device() to allocate &struct 570 * ib_device. @size is the size of the structure to be allocated, 571 * including any private data used by the low-level driver. 572 * ib_dealloc_device() must be used to free structures allocated with 573 * ib_alloc_device(). 574 */ 575 struct ib_device *_ib_alloc_device(size_t size) 576 { 577 struct ib_device *device; 578 579 if (WARN_ON(size < sizeof(struct ib_device))) 580 return NULL; 581 582 device = kzalloc(size, GFP_KERNEL); 583 if (!device) 584 return NULL; 585 586 if (rdma_restrack_init(device)) { 587 kfree(device); 588 return NULL; 589 } 590 591 device->groups[0] = &ib_dev_attr_group; 592 rdma_init_coredev(&device->coredev, device, &init_net); 593 594 INIT_LIST_HEAD(&device->event_handler_list); 595 spin_lock_init(&device->event_handler_lock); 596 mutex_init(&device->unregistration_lock); 597 /* 598 * client_data needs to be alloc because we don't want our mark to be 599 * destroyed if the user stores NULL in the client data. 
600 */ 601 xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); 602 init_rwsem(&device->client_data_rwsem); 603 xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); 604 mutex_init(&device->compat_devs_mutex); 605 init_completion(&device->unreg_completion); 606 INIT_WORK(&device->unregistration_work, ib_unregister_work); 607 608 return device; 609 } 610 EXPORT_SYMBOL(_ib_alloc_device); 611 612 /** 613 * ib_dealloc_device - free an IB device struct 614 * @device:structure to free 615 * 616 * Free a structure allocated with ib_alloc_device(). 617 */ 618 void ib_dealloc_device(struct ib_device *device) 619 { 620 if (device->ops.dealloc_driver) 621 device->ops.dealloc_driver(device); 622 623 /* 624 * ib_unregister_driver() requires all devices to remain in the xarray 625 * while their ops are callable. The last op we call is dealloc_driver 626 * above. This is needed to create a fence on op callbacks prior to 627 * allowing the driver module to unload. 628 */ 629 down_write(&devices_rwsem); 630 if (xa_load(&devices, device->index) == device) 631 xa_erase(&devices, device->index); 632 up_write(&devices_rwsem); 633 634 /* Expedite releasing netdev references */ 635 free_netdevs(device); 636 637 WARN_ON(!xa_empty(&device->compat_devs)); 638 WARN_ON(!xa_empty(&device->client_data)); 639 WARN_ON(refcount_read(&device->refcount)); 640 rdma_restrack_clean(device); 641 /* Balances with device_initialize */ 642 put_device(&device->dev); 643 } 644 EXPORT_SYMBOL(ib_dealloc_device); 645 646 /* 647 * add_client_context() and remove_client_context() must be safe against 648 * parallel calls on the same device - registration/unregistration of both the 649 * device and client can be occurring in parallel. 650 * 651 * The routines need to be a fence, any caller must not return until the add 652 * or remove is fully completed. 653 */ 654 static int add_client_context(struct ib_device *device, 655 struct ib_client *client) 656 { 657 int ret = 0; 658 659 if (!device->kverbs_provider && !client->no_kverbs_req) 660 return 0; 661 662 down_write(&device->client_data_rwsem); 663 /* 664 * Another caller to add_client_context got here first and has already 665 * completely initialized context. 
 */
	if (xa_get_mark(&device->client_data, client->client_id,
			CLIENT_DATA_REGISTERED))
		goto out;

	ret = xa_err(xa_store(&device->client_data, client->client_id, NULL,
			      GFP_KERNEL));
	if (ret)
		goto out;
	downgrade_write(&device->client_data_rwsem);
	if (client->add)
		client->add(device);

	/* Readers shall not see a client until add has been completed */
	xa_set_mark(&device->client_data, client->client_id,
		    CLIENT_DATA_REGISTERED);
	up_read(&device->client_data_rwsem);
	return 0;

out:
	up_write(&device->client_data_rwsem);
	return ret;
}

static void remove_client_context(struct ib_device *device,
				  unsigned int client_id)
{
	struct ib_client *client;
	void *client_data;

	down_write(&device->client_data_rwsem);
	if (!xa_get_mark(&device->client_data, client_id,
			 CLIENT_DATA_REGISTERED)) {
		up_write(&device->client_data_rwsem);
		return;
	}
	client_data = xa_load(&device->client_data, client_id);
	xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
	client = xa_load(&clients, client_id);
	downgrade_write(&device->client_data_rwsem);

	/*
	 * Notice we cannot be holding any exclusive locks when calling the
	 * remove callback as the remove callback can recurse back into any
	 * public functions in this module and thus try for any locks those
	 * functions take.
	 *
	 * For this reason clients and drivers should not call the
	 * unregistration functions while holding any locks.
	 *
	 * It is tempting to drop the client_data_rwsem too, but this is
	 * required to ensure that unregister_client does not return until
	 * all clients are completely unregistered, which is required to
	 * avoid module unloading races.
	 */
	if (client->remove)
		client->remove(device, client_data);

	xa_erase(&device->client_data, client_id);
	up_read(&device->client_data_rwsem);
}

static int alloc_port_data(struct ib_device *device)
{
	struct ib_port_data_rcu *pdata_rcu;
	unsigned int port;

	if (device->port_data)
		return 0;

	/* This can only be called once the physical port range is defined */
	if (WARN_ON(!device->phys_port_cnt))
		return -EINVAL;

	/*
	 * device->port_data is indexed directly by the port number to make
	 * access to this data as efficient as possible.
	 *
	 * Therefore port_data is declared as a 1 based array with potential
	 * empty slots at the beginning.
	 */
	pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
					rdma_end_port(device) + 1),
			    GFP_KERNEL);
	if (!pdata_rcu)
		return -ENOMEM;
	/*
	 * The rcu_head is put in front of the port data array and the stored
	 * pointer is adjusted since we never need to see that member until
	 * kfree_rcu.
756 */ 757 device->port_data = pdata_rcu->pdata; 758 759 rdma_for_each_port (device, port) { 760 struct ib_port_data *pdata = &device->port_data[port]; 761 762 pdata->ib_dev = device; 763 spin_lock_init(&pdata->pkey_list_lock); 764 INIT_LIST_HEAD(&pdata->pkey_list); 765 spin_lock_init(&pdata->netdev_lock); 766 INIT_HLIST_NODE(&pdata->ndev_hash_link); 767 } 768 return 0; 769 } 770 771 static int verify_immutable(const struct ib_device *dev, u8 port) 772 { 773 return WARN_ON(!rdma_cap_ib_mad(dev, port) && 774 rdma_max_mad_size(dev, port) != 0); 775 } 776 777 static int setup_port_data(struct ib_device *device) 778 { 779 unsigned int port; 780 int ret; 781 782 ret = alloc_port_data(device); 783 if (ret) 784 return ret; 785 786 rdma_for_each_port (device, port) { 787 struct ib_port_data *pdata = &device->port_data[port]; 788 789 ret = device->ops.get_port_immutable(device, port, 790 &pdata->immutable); 791 if (ret) 792 return ret; 793 794 if (verify_immutable(device, port)) 795 return -EINVAL; 796 } 797 return 0; 798 } 799 800 void ib_get_device_fw_str(struct ib_device *dev, char *str) 801 { 802 if (dev->ops.get_dev_fw_str) 803 dev->ops.get_dev_fw_str(dev, str); 804 else 805 str[0] = '\0'; 806 } 807 EXPORT_SYMBOL(ib_get_device_fw_str); 808 809 static void ib_policy_change_task(struct work_struct *work) 810 { 811 struct ib_device *dev; 812 unsigned long index; 813 814 down_read(&devices_rwsem); 815 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 816 unsigned int i; 817 818 rdma_for_each_port (dev, i) { 819 u64 sp; 820 int ret = ib_get_cached_subnet_prefix(dev, 821 i, 822 &sp); 823 824 WARN_ONCE(ret, 825 "ib_get_cached_subnet_prefix err: %d, this should never happen here\n", 826 ret); 827 if (!ret) 828 ib_security_cache_change(dev, i, sp); 829 } 830 } 831 up_read(&devices_rwsem); 832 } 833 834 static int ib_security_change(struct notifier_block *nb, unsigned long event, 835 void *lsm_data) 836 { 837 if (event != LSM_POLICY_CHANGE) 838 return NOTIFY_DONE; 839 840 schedule_work(&ib_policy_change_work); 841 ib_mad_agent_security_change(); 842 843 return NOTIFY_OK; 844 } 845 846 static void compatdev_release(struct device *dev) 847 { 848 struct ib_core_device *cdev = 849 container_of(dev, struct ib_core_device, dev); 850 851 kfree(cdev); 852 } 853 854 static int add_one_compat_dev(struct ib_device *device, 855 struct rdma_dev_net *rnet) 856 { 857 struct ib_core_device *cdev; 858 int ret; 859 860 lockdep_assert_held(&rdma_nets_rwsem); 861 if (!ib_devices_shared_netns) 862 return 0; 863 864 /* 865 * Create and add compat device in all namespaces other than where it 866 * is currently bound to. 867 */ 868 if (net_eq(read_pnet(&rnet->net), 869 read_pnet(&device->coredev.rdma_net))) 870 return 0; 871 872 /* 873 * The first of init_net() or ib_register_device() to take the 874 * compat_devs_mutex wins and gets to add the device. Others will wait 875 * for completion here. 
876 */ 877 mutex_lock(&device->compat_devs_mutex); 878 cdev = xa_load(&device->compat_devs, rnet->id); 879 if (cdev) { 880 ret = 0; 881 goto done; 882 } 883 ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); 884 if (ret) 885 goto done; 886 887 cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); 888 if (!cdev) { 889 ret = -ENOMEM; 890 goto cdev_err; 891 } 892 893 cdev->dev.parent = device->dev.parent; 894 rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); 895 cdev->dev.release = compatdev_release; 896 dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); 897 898 ret = device_add(&cdev->dev); 899 if (ret) 900 goto add_err; 901 ret = ib_setup_port_attrs(cdev); 902 if (ret) 903 goto port_err; 904 905 ret = xa_err(xa_store(&device->compat_devs, rnet->id, 906 cdev, GFP_KERNEL)); 907 if (ret) 908 goto insert_err; 909 910 mutex_unlock(&device->compat_devs_mutex); 911 return 0; 912 913 insert_err: 914 ib_free_port_attrs(cdev); 915 port_err: 916 device_del(&cdev->dev); 917 add_err: 918 put_device(&cdev->dev); 919 cdev_err: 920 xa_release(&device->compat_devs, rnet->id); 921 done: 922 mutex_unlock(&device->compat_devs_mutex); 923 return ret; 924 } 925 926 static void remove_one_compat_dev(struct ib_device *device, u32 id) 927 { 928 struct ib_core_device *cdev; 929 930 mutex_lock(&device->compat_devs_mutex); 931 cdev = xa_erase(&device->compat_devs, id); 932 mutex_unlock(&device->compat_devs_mutex); 933 if (cdev) { 934 ib_free_port_attrs(cdev); 935 device_del(&cdev->dev); 936 put_device(&cdev->dev); 937 } 938 } 939 940 static void remove_compat_devs(struct ib_device *device) 941 { 942 struct ib_core_device *cdev; 943 unsigned long index; 944 945 xa_for_each (&device->compat_devs, index, cdev) 946 remove_one_compat_dev(device, index); 947 } 948 949 static int add_compat_devs(struct ib_device *device) 950 { 951 struct rdma_dev_net *rnet; 952 unsigned long index; 953 int ret = 0; 954 955 lockdep_assert_held(&devices_rwsem); 956 957 down_read(&rdma_nets_rwsem); 958 xa_for_each (&rdma_nets, index, rnet) { 959 ret = add_one_compat_dev(device, rnet); 960 if (ret) 961 break; 962 } 963 up_read(&rdma_nets_rwsem); 964 return ret; 965 } 966 967 static void remove_all_compat_devs(void) 968 { 969 struct ib_compat_device *cdev; 970 struct ib_device *dev; 971 unsigned long index; 972 973 down_read(&devices_rwsem); 974 xa_for_each (&devices, index, dev) { 975 unsigned long c_index = 0; 976 977 /* Hold nets_rwsem so that any other thread modifying this 978 * system param can sync with this thread. 979 */ 980 down_read(&rdma_nets_rwsem); 981 xa_for_each (&dev->compat_devs, c_index, cdev) 982 remove_one_compat_dev(dev, c_index); 983 up_read(&rdma_nets_rwsem); 984 } 985 up_read(&devices_rwsem); 986 } 987 988 static int add_all_compat_devs(void) 989 { 990 struct rdma_dev_net *rnet; 991 struct ib_device *dev; 992 unsigned long index; 993 int ret = 0; 994 995 down_read(&devices_rwsem); 996 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 997 unsigned long net_index = 0; 998 999 /* Hold nets_rwsem so that any other thread modifying this 1000 * system param can sync with this thread. 
1001 */ 1002 down_read(&rdma_nets_rwsem); 1003 xa_for_each (&rdma_nets, net_index, rnet) { 1004 ret = add_one_compat_dev(dev, rnet); 1005 if (ret) 1006 break; 1007 } 1008 up_read(&rdma_nets_rwsem); 1009 } 1010 up_read(&devices_rwsem); 1011 if (ret) 1012 remove_all_compat_devs(); 1013 return ret; 1014 } 1015 1016 int rdma_compatdev_set(u8 enable) 1017 { 1018 struct rdma_dev_net *rnet; 1019 unsigned long index; 1020 int ret = 0; 1021 1022 down_write(&rdma_nets_rwsem); 1023 if (ib_devices_shared_netns == enable) { 1024 up_write(&rdma_nets_rwsem); 1025 return 0; 1026 } 1027 1028 /* enable/disable of compat devices is not supported 1029 * when more than default init_net exists. 1030 */ 1031 xa_for_each (&rdma_nets, index, rnet) { 1032 ret++; 1033 break; 1034 } 1035 if (!ret) 1036 ib_devices_shared_netns = enable; 1037 up_write(&rdma_nets_rwsem); 1038 if (ret) 1039 return -EBUSY; 1040 1041 if (enable) 1042 ret = add_all_compat_devs(); 1043 else 1044 remove_all_compat_devs(); 1045 return ret; 1046 } 1047 1048 static void rdma_dev_exit_net(struct net *net) 1049 { 1050 struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id); 1051 struct ib_device *dev; 1052 unsigned long index; 1053 int ret; 1054 1055 down_write(&rdma_nets_rwsem); 1056 /* 1057 * Prevent the ID from being re-used and hide the id from xa_for_each. 1058 */ 1059 ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL)); 1060 WARN_ON(ret); 1061 up_write(&rdma_nets_rwsem); 1062 1063 down_read(&devices_rwsem); 1064 xa_for_each (&devices, index, dev) { 1065 get_device(&dev->dev); 1066 /* 1067 * Release the devices_rwsem so that pontentially blocking 1068 * device_del, doesn't hold the devices_rwsem for too long. 1069 */ 1070 up_read(&devices_rwsem); 1071 1072 remove_one_compat_dev(dev, rnet->id); 1073 1074 /* 1075 * If the real device is in the NS then move it back to init. 1076 */ 1077 rdma_dev_change_netns(dev, net, &init_net); 1078 1079 put_device(&dev->dev); 1080 down_read(&devices_rwsem); 1081 } 1082 up_read(&devices_rwsem); 1083 1084 xa_erase(&rdma_nets, rnet->id); 1085 } 1086 1087 static __net_init int rdma_dev_init_net(struct net *net) 1088 { 1089 struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id); 1090 unsigned long index; 1091 struct ib_device *dev; 1092 int ret; 1093 1094 /* No need to create any compat devices in default init_net. */ 1095 if (net_eq(net, &init_net)) 1096 return 0; 1097 1098 write_pnet(&rnet->net, net); 1099 1100 ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); 1101 if (ret) 1102 return ret; 1103 1104 down_read(&devices_rwsem); 1105 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 1106 /* Hold nets_rwsem so that netlink command cannot change 1107 * system configuration for device sharing mode. 1108 */ 1109 down_read(&rdma_nets_rwsem); 1110 ret = add_one_compat_dev(dev, rnet); 1111 up_read(&rdma_nets_rwsem); 1112 if (ret) 1113 break; 1114 } 1115 up_read(&devices_rwsem); 1116 1117 if (ret) 1118 rdma_dev_exit_net(net); 1119 1120 return ret; 1121 } 1122 1123 /* 1124 * Assign the unique string device name and the unique device index. This is 1125 * undone by ib_dealloc_device. 
1126 */ 1127 static int assign_name(struct ib_device *device, const char *name) 1128 { 1129 static u32 last_id; 1130 int ret; 1131 1132 down_write(&devices_rwsem); 1133 /* Assign a unique name to the device */ 1134 if (strchr(name, '%')) 1135 ret = alloc_name(device, name); 1136 else 1137 ret = dev_set_name(&device->dev, name); 1138 if (ret) 1139 goto out; 1140 1141 if (__ib_device_get_by_name(dev_name(&device->dev))) { 1142 ret = -ENFILE; 1143 goto out; 1144 } 1145 strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); 1146 1147 ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b, 1148 &last_id, GFP_KERNEL); 1149 if (ret > 0) 1150 ret = 0; 1151 1152 out: 1153 up_write(&devices_rwsem); 1154 return ret; 1155 } 1156 1157 static void setup_dma_device(struct ib_device *device) 1158 { 1159 struct device *parent = device->dev.parent; 1160 1161 WARN_ON_ONCE(device->dma_device); 1162 if (device->dev.dma_ops) { 1163 /* 1164 * The caller provided custom DMA operations. Copy the 1165 * DMA-related fields that are used by e.g. dma_alloc_coherent() 1166 * into device->dev. 1167 */ 1168 device->dma_device = &device->dev; 1169 if (!device->dev.dma_mask) { 1170 if (parent) 1171 device->dev.dma_mask = parent->dma_mask; 1172 else 1173 WARN_ON_ONCE(true); 1174 } 1175 if (!device->dev.coherent_dma_mask) { 1176 if (parent) 1177 device->dev.coherent_dma_mask = 1178 parent->coherent_dma_mask; 1179 else 1180 WARN_ON_ONCE(true); 1181 } 1182 } else { 1183 /* 1184 * The caller did not provide custom DMA operations. Use the 1185 * DMA mapping operations of the parent device. 1186 */ 1187 WARN_ON_ONCE(!parent); 1188 device->dma_device = parent; 1189 } 1190 /* Setup default max segment size for all IB devices */ 1191 dma_set_max_seg_size(device->dma_device, SZ_2G); 1192 1193 } 1194 1195 /* 1196 * setup_device() allocates memory and sets up data that requires calling the 1197 * device ops, this is the only reason these actions are not done during 1198 * ib_alloc_device. It is undone by ib_dealloc_device(). 1199 */ 1200 static int setup_device(struct ib_device *device) 1201 { 1202 struct ib_udata uhw = {.outlen = 0, .inlen = 0}; 1203 int ret; 1204 1205 setup_dma_device(device); 1206 ib_device_check_mandatory(device); 1207 1208 ret = setup_port_data(device); 1209 if (ret) { 1210 dev_warn(&device->dev, "Couldn't create per-port data\n"); 1211 return ret; 1212 } 1213 1214 memset(&device->attrs, 0, sizeof(device->attrs)); 1215 ret = device->ops.query_device(device, &device->attrs, &uhw); 1216 if (ret) { 1217 dev_warn(&device->dev, 1218 "Couldn't query the device attributes\n"); 1219 return ret; 1220 } 1221 1222 return 0; 1223 } 1224 1225 static void disable_device(struct ib_device *device) 1226 { 1227 struct ib_client *client; 1228 1229 WARN_ON(!refcount_read(&device->refcount)); 1230 1231 down_write(&devices_rwsem); 1232 xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); 1233 up_write(&devices_rwsem); 1234 1235 down_read(&clients_rwsem); 1236 list_for_each_entry_reverse(client, &client_list, list) 1237 remove_client_context(device, client->client_id); 1238 up_read(&clients_rwsem); 1239 1240 /* Pairs with refcount_set in enable_device */ 1241 ib_device_put(device); 1242 wait_for_completion(&device->unreg_completion); 1243 1244 /* 1245 * compat devices must be removed after device refcount drops to zero. 1246 * Otherwise init_net() may add more compatdevs after removing compat 1247 * devices and before device is disabled. 
1248 */ 1249 remove_compat_devs(device); 1250 } 1251 1252 /* 1253 * An enabled device is visible to all clients and to all the public facing 1254 * APIs that return a device pointer. This always returns with a new get, even 1255 * if it fails. 1256 */ 1257 static int enable_device_and_get(struct ib_device *device) 1258 { 1259 struct ib_client *client; 1260 unsigned long index; 1261 int ret = 0; 1262 1263 /* 1264 * One ref belongs to the xa and the other belongs to this 1265 * thread. This is needed to guard against parallel unregistration. 1266 */ 1267 refcount_set(&device->refcount, 2); 1268 down_write(&devices_rwsem); 1269 xa_set_mark(&devices, device->index, DEVICE_REGISTERED); 1270 1271 /* 1272 * By using downgrade_write() we ensure that no other thread can clear 1273 * DEVICE_REGISTERED while we are completing the client setup. 1274 */ 1275 downgrade_write(&devices_rwsem); 1276 1277 if (device->ops.enable_driver) { 1278 ret = device->ops.enable_driver(device); 1279 if (ret) 1280 goto out; 1281 } 1282 1283 down_read(&clients_rwsem); 1284 xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { 1285 ret = add_client_context(device, client); 1286 if (ret) 1287 break; 1288 } 1289 up_read(&clients_rwsem); 1290 if (!ret) 1291 ret = add_compat_devs(device); 1292 out: 1293 up_read(&devices_rwsem); 1294 return ret; 1295 } 1296 1297 /** 1298 * ib_register_device - Register an IB device with IB core 1299 * @device:Device to register 1300 * 1301 * Low-level drivers use ib_register_device() to register their 1302 * devices with the IB core. All registered clients will receive a 1303 * callback for each device that is added. @device must be allocated 1304 * with ib_alloc_device(). 1305 * 1306 * If the driver uses ops.dealloc_driver and calls any ib_unregister_device() 1307 * asynchronously then the device pointer may become freed as soon as this 1308 * function returns. 1309 */ 1310 int ib_register_device(struct ib_device *device, const char *name) 1311 { 1312 int ret; 1313 1314 ret = assign_name(device, name); 1315 if (ret) 1316 return ret; 1317 1318 ret = setup_device(device); 1319 if (ret) 1320 return ret; 1321 1322 ret = ib_cache_setup_one(device); 1323 if (ret) { 1324 dev_warn(&device->dev, 1325 "Couldn't set up InfiniBand P_Key/GID cache\n"); 1326 return ret; 1327 } 1328 1329 ib_device_register_rdmacg(device); 1330 1331 rdma_counter_init(device); 1332 1333 /* 1334 * Ensure that ADD uevent is not fired because it 1335 * is too early amd device is not initialized yet. 1336 */ 1337 dev_set_uevent_suppress(&device->dev, true); 1338 ret = device_add(&device->dev); 1339 if (ret) 1340 goto cg_cleanup; 1341 1342 ret = ib_device_register_sysfs(device); 1343 if (ret) { 1344 dev_warn(&device->dev, 1345 "Couldn't register device with driver model\n"); 1346 goto dev_cleanup; 1347 } 1348 1349 ret = enable_device_and_get(device); 1350 dev_set_uevent_suppress(&device->dev, false); 1351 /* Mark for userspace that device is ready */ 1352 kobject_uevent(&device->dev.kobj, KOBJ_ADD); 1353 if (ret) { 1354 void (*dealloc_fn)(struct ib_device *); 1355 1356 /* 1357 * If we hit this error flow then we don't want to 1358 * automatically dealloc the device since the caller is 1359 * expected to call ib_dealloc_device() after 1360 * ib_register_device() fails. This is tricky due to the 1361 * possibility for a parallel unregistration along with this 1362 * error flow. 
		 * Since we have a refcount here we know any parallel flow is
		 * stopped in disable_device and will see the NULL pointers,
		 * causing the responsibility to ib_dealloc_device() to revert
		 * back to this thread.
		 */
		dealloc_fn = device->ops.dealloc_driver;
		device->ops.dealloc_driver = NULL;
		ib_device_put(device);
		__ib_unregister_device(device);
		device->ops.dealloc_driver = dealloc_fn;
		return ret;
	}
	ib_device_put(device);

	return 0;

dev_cleanup:
	device_del(&device->dev);
cg_cleanup:
	dev_set_uevent_suppress(&device->dev, false);
	ib_device_unregister_rdmacg(device);
	ib_cache_cleanup_one(device);
	return ret;
}
EXPORT_SYMBOL(ib_register_device);

/* Callers must hold a get on the device. */
static void __ib_unregister_device(struct ib_device *ib_dev)
{
	/*
	 * We have a registration lock so that all the calls to unregister are
	 * fully fenced, once any unregister returns the device is truly
	 * unregistered even if multiple callers are unregistering it at the
	 * same time. This also interacts with the registration flow and
	 * provides sane semantics if register and unregister are racing.
	 */
	mutex_lock(&ib_dev->unregistration_lock);
	if (!refcount_read(&ib_dev->refcount))
		goto out;

	disable_device(ib_dev);

	/* Expedite removing unregistered pointers from the hash table */
	free_netdevs(ib_dev);

	ib_device_unregister_sysfs(ib_dev);
	device_del(&ib_dev->dev);
	ib_device_unregister_rdmacg(ib_dev);
	ib_cache_cleanup_one(ib_dev);

	/*
	 * Drivers using the new flow may not call ib_dealloc_device except
	 * in error unwind prior to registration success.
	 */
	if (ib_dev->ops.dealloc_driver) {
		WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
		ib_dealloc_device(ib_dev);
	}
out:
	mutex_unlock(&ib_dev->unregistration_lock);
}

/**
 * ib_unregister_device - Unregister an IB device
 * @ib_dev: The device to unregister
 *
 * Unregister an IB device. All clients will receive a remove callback.
 *
 * Callers should call this routine only once, and protect against races with
 * registration. Typically it should only be called as part of a remove
 * callback in an implementation of driver core's struct device_driver and
 * related.
 *
 * If ops.dealloc_driver is used then ib_dev will be freed upon return from
 * this function.
 */
void ib_unregister_device(struct ib_device *ib_dev)
{
	get_device(&ib_dev->dev);
	__ib_unregister_device(ib_dev);
	put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device);

/**
 * ib_unregister_device_and_put - Unregister a device while holding a 'get'
 * @ib_dev: The device to unregister
 *
 * This is the same as ib_unregister_device(), except it includes an internal
 * ib_device_put() that should match a 'get' obtained by the caller.
 *
 * It is safe to call this routine concurrently from multiple threads while
 * holding the 'get'. When the function returns the device is fully
 * unregistered.
 *
 * Drivers using this flow MUST use the driver_unregister callback to clean up
 * their resources associated with the device and dealloc it.
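 *
 * One possible call pattern (hypothetical driver code, assuming the driver
 * set ops.dealloc_driver and obtained its own 'get', e.g. via
 * ib_device_try_get()):
 *
 *	if (ib_device_try_get(ibdev)) {
 *		...
 *		ib_unregister_device_and_put(ibdev);
 *	}
 *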
 */
void ib_unregister_device_and_put(struct ib_device *ib_dev)
{
	WARN_ON(!ib_dev->ops.dealloc_driver);
	get_device(&ib_dev->dev);
	ib_device_put(ib_dev);
	__ib_unregister_device(ib_dev);
	put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device_and_put);

/**
 * ib_unregister_driver - Unregister all IB devices for a driver
 * @driver_id: The driver to unregister
 *
 * This implements a fence for device unregistration. It only returns once all
 * devices associated with the driver_id have fully completed their
 * unregistration and returned from ib_unregister_device*().
 *
 * If devices are not yet unregistered it goes ahead and starts unregistering
 * them.
 *
 * This does not block creation of new devices with the given driver_id, that
 * is the responsibility of the caller.
 */
void ib_unregister_driver(enum rdma_driver_id driver_id)
{
	struct ib_device *ib_dev;
	unsigned long index;

	down_read(&devices_rwsem);
	xa_for_each (&devices, index, ib_dev) {
		if (ib_dev->ops.driver_id != driver_id)
			continue;

		get_device(&ib_dev->dev);
		up_read(&devices_rwsem);

		WARN_ON(!ib_dev->ops.dealloc_driver);
		__ib_unregister_device(ib_dev);

		put_device(&ib_dev->dev);
		down_read(&devices_rwsem);
	}
	up_read(&devices_rwsem);
}
EXPORT_SYMBOL(ib_unregister_driver);

static void ib_unregister_work(struct work_struct *work)
{
	struct ib_device *ib_dev =
		container_of(work, struct ib_device, unregistration_work);

	__ib_unregister_device(ib_dev);
	put_device(&ib_dev->dev);
}

/**
 * ib_unregister_device_queued - Unregister a device using a work queue
 * @ib_dev: The device to unregister
 *
 * This schedules an asynchronous unregistration using a WQ for the device. A
 * driver should use this to avoid holding locks while doing unregistration,
 * such as holding the RTNL lock.
 *
 * Drivers using this API must use ib_unregister_driver before module unload
 * to ensure that all scheduled unregistrations have completed.
 */
void ib_unregister_device_queued(struct ib_device *ib_dev)
{
	WARN_ON(!refcount_read(&ib_dev->refcount));
	WARN_ON(!ib_dev->ops.dealloc_driver);
	get_device(&ib_dev->dev);
	if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work))
		put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device_queued);

/*
 * The caller must pass in a device that has the kref held and the refcount
 * released. If the device is in cur_net and still registered then it is moved
 * into net.
 */
static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
				 struct net *net)
{
	int ret2 = -EINVAL;
	int ret;

	mutex_lock(&device->unregistration_lock);

	/*
	 * If a device is not under ib_device_get() or if the
	 * unregistration_lock is not held, the namespace can be changed, or
	 * it can be unregistered. Check again under the lock.
	 */
	if (refcount_read(&device->refcount) == 0 ||
	    !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) {
		ret = -ENODEV;
		goto out;
	}

	kobject_uevent(&device->dev.kobj, KOBJ_REMOVE);
	disable_device(device);

	/*
	 * At this point no one can be using the device, so it is safe to
	 * change the namespace.
1567 */ 1568 write_pnet(&device->coredev.rdma_net, net); 1569 1570 down_read(&devices_rwsem); 1571 /* 1572 * Currently rdma devices are system wide unique. So the device name 1573 * is guaranteed free in the new namespace. Publish the new namespace 1574 * at the sysfs level. 1575 */ 1576 ret = device_rename(&device->dev, dev_name(&device->dev)); 1577 up_read(&devices_rwsem); 1578 if (ret) { 1579 dev_warn(&device->dev, 1580 "%s: Couldn't rename device after namespace change\n", 1581 __func__); 1582 /* Try and put things back and re-enable the device */ 1583 write_pnet(&device->coredev.rdma_net, cur_net); 1584 } 1585 1586 ret2 = enable_device_and_get(device); 1587 if (ret2) { 1588 /* 1589 * This shouldn't really happen, but if it does, let the user 1590 * retry at later point. So don't disable the device. 1591 */ 1592 dev_warn(&device->dev, 1593 "%s: Couldn't re-enable device after namespace change\n", 1594 __func__); 1595 } 1596 kobject_uevent(&device->dev.kobj, KOBJ_ADD); 1597 1598 ib_device_put(device); 1599 out: 1600 mutex_unlock(&device->unregistration_lock); 1601 if (ret) 1602 return ret; 1603 return ret2; 1604 } 1605 1606 int ib_device_set_netns_put(struct sk_buff *skb, 1607 struct ib_device *dev, u32 ns_fd) 1608 { 1609 struct net *net; 1610 int ret; 1611 1612 net = get_net_ns_by_fd(ns_fd); 1613 if (IS_ERR(net)) { 1614 ret = PTR_ERR(net); 1615 goto net_err; 1616 } 1617 1618 if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { 1619 ret = -EPERM; 1620 goto ns_err; 1621 } 1622 1623 /* 1624 * Currently supported only for those providers which support 1625 * disassociation and don't do port specific sysfs init. Once a 1626 * port_cleanup infrastructure is implemented, this limitation will be 1627 * removed. 1628 */ 1629 if (!dev->ops.disassociate_ucontext || dev->ops.init_port || 1630 ib_devices_shared_netns) { 1631 ret = -EOPNOTSUPP; 1632 goto ns_err; 1633 } 1634 1635 get_device(&dev->dev); 1636 ib_device_put(dev); 1637 ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net); 1638 put_device(&dev->dev); 1639 1640 put_net(net); 1641 return ret; 1642 1643 ns_err: 1644 put_net(net); 1645 net_err: 1646 ib_device_put(dev); 1647 return ret; 1648 } 1649 1650 static struct pernet_operations rdma_dev_net_ops = { 1651 .init = rdma_dev_init_net, 1652 .exit = rdma_dev_exit_net, 1653 .id = &rdma_dev_net_id, 1654 .size = sizeof(struct rdma_dev_net), 1655 }; 1656 1657 static int assign_client_id(struct ib_client *client) 1658 { 1659 int ret; 1660 1661 down_write(&clients_rwsem); 1662 /* 1663 * The add/remove callbacks must be called in FIFO/LIFO order. To 1664 * achieve this we assign client_ids so they are sorted in 1665 * registration order, and retain a linked list we can reverse iterate 1666 * to get the LIFO order. The extra linked list can go away if xarray 1667 * learns to reverse iterate. 
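 *
 * For example, if client A registers before client B, then A->add() is
 * called before B->add() for each device, and B->remove() is called before
 * A->remove() when a device goes away.
 *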
1668 */ 1669 if (list_empty(&client_list)) { 1670 client->client_id = 0; 1671 } else { 1672 struct ib_client *last; 1673 1674 last = list_last_entry(&client_list, struct ib_client, list); 1675 client->client_id = last->client_id + 1; 1676 } 1677 ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); 1678 if (ret) 1679 goto out; 1680 1681 xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); 1682 list_add_tail(&client->list, &client_list); 1683 1684 out: 1685 up_write(&clients_rwsem); 1686 return ret; 1687 } 1688 1689 /** 1690 * ib_register_client - Register an IB client 1691 * @client:Client to register 1692 * 1693 * Upper level users of the IB drivers can use ib_register_client() to 1694 * register callbacks for IB device addition and removal. When an IB 1695 * device is added, each registered client's add method will be called 1696 * (in the order the clients were registered), and when a device is 1697 * removed, each client's remove method will be called (in the reverse 1698 * order that clients were registered). In addition, when 1699 * ib_register_client() is called, the client will receive an add 1700 * callback for all devices already registered. 1701 */ 1702 int ib_register_client(struct ib_client *client) 1703 { 1704 struct ib_device *device; 1705 unsigned long index; 1706 int ret; 1707 1708 ret = assign_client_id(client); 1709 if (ret) 1710 return ret; 1711 1712 down_read(&devices_rwsem); 1713 xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { 1714 ret = add_client_context(device, client); 1715 if (ret) { 1716 up_read(&devices_rwsem); 1717 ib_unregister_client(client); 1718 return ret; 1719 } 1720 } 1721 up_read(&devices_rwsem); 1722 return 0; 1723 } 1724 EXPORT_SYMBOL(ib_register_client); 1725 1726 /** 1727 * ib_unregister_client - Unregister an IB client 1728 * @client:Client to unregister 1729 * 1730 * Upper level users use ib_unregister_client() to remove their client 1731 * registration. When ib_unregister_client() is called, the client 1732 * will receive a remove callback for each IB device still registered. 1733 * 1734 * This is a full fence, once it returns no client callbacks will be called, 1735 * or are running in another thread. 1736 */ 1737 void ib_unregister_client(struct ib_client *client) 1738 { 1739 struct ib_device *device; 1740 unsigned long index; 1741 1742 down_write(&clients_rwsem); 1743 xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); 1744 up_write(&clients_rwsem); 1745 /* 1746 * Every device still known must be serialized to make sure we are 1747 * done with the client callbacks before we return. 
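 *
 * Illustrative consequence for a (hypothetical) client: once
 * ib_unregister_client(&my_client) returns, no my_client add()/remove()
 * callback is running or will run again, so state shared with those
 * callbacks can be freed safely, e.g.:
 *
 *	ib_unregister_client(&my_client);
 *	kfree(my_client_state);
 *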
1748 */ 1749 down_read(&devices_rwsem); 1750 xa_for_each (&devices, index, device) 1751 remove_client_context(device, client->client_id); 1752 up_read(&devices_rwsem); 1753 1754 down_write(&clients_rwsem); 1755 list_del(&client->list); 1756 xa_erase(&clients, client->client_id); 1757 up_write(&clients_rwsem); 1758 } 1759 EXPORT_SYMBOL(ib_unregister_client); 1760 1761 static int __ib_get_global_client_nl_info(const char *client_name, 1762 struct ib_client_nl_info *res) 1763 { 1764 struct ib_client *client; 1765 unsigned long index; 1766 int ret = -ENOENT; 1767 1768 down_read(&clients_rwsem); 1769 xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { 1770 if (strcmp(client->name, client_name) != 0) 1771 continue; 1772 if (!client->get_global_nl_info) { 1773 ret = -EOPNOTSUPP; 1774 break; 1775 } 1776 ret = client->get_global_nl_info(res); 1777 if (WARN_ON(ret == -ENOENT)) 1778 ret = -EINVAL; 1779 if (!ret && res->cdev) 1780 get_device(res->cdev); 1781 break; 1782 } 1783 up_read(&clients_rwsem); 1784 return ret; 1785 } 1786 1787 static int __ib_get_client_nl_info(struct ib_device *ibdev, 1788 const char *client_name, 1789 struct ib_client_nl_info *res) 1790 { 1791 unsigned long index; 1792 void *client_data; 1793 int ret = -ENOENT; 1794 1795 down_read(&ibdev->client_data_rwsem); 1796 xan_for_each_marked (&ibdev->client_data, index, client_data, 1797 CLIENT_DATA_REGISTERED) { 1798 struct ib_client *client = xa_load(&clients, index); 1799 1800 if (!client || strcmp(client->name, client_name) != 0) 1801 continue; 1802 if (!client->get_nl_info) { 1803 ret = -EOPNOTSUPP; 1804 break; 1805 } 1806 ret = client->get_nl_info(ibdev, client_data, res); 1807 if (WARN_ON(ret == -ENOENT)) 1808 ret = -EINVAL; 1809 1810 /* 1811 * The cdev is guaranteed valid as long as we are inside the 1812 * client_data_rwsem as remove_one can't be called. Keep it 1813 * valid for the caller. 1814 */ 1815 if (!ret && res->cdev) 1816 get_device(res->cdev); 1817 break; 1818 } 1819 up_read(&ibdev->client_data_rwsem); 1820 1821 return ret; 1822 } 1823 1824 /** 1825 * ib_get_client_nl_info - Fetch the nl_info from a client 1826 * @device - IB device 1827 * @client_name - Name of the client 1828 * @res - Result of the query 1829 */ 1830 int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, 1831 struct ib_client_nl_info *res) 1832 { 1833 int ret; 1834 1835 if (ibdev) 1836 ret = __ib_get_client_nl_info(ibdev, client_name, res); 1837 else 1838 ret = __ib_get_global_client_nl_info(client_name, res); 1839 #ifdef CONFIG_MODULES 1840 if (ret == -ENOENT) { 1841 request_module("rdma-client-%s", client_name); 1842 if (ibdev) 1843 ret = __ib_get_client_nl_info(ibdev, client_name, res); 1844 else 1845 ret = __ib_get_global_client_nl_info(client_name, res); 1846 } 1847 #endif 1848 if (ret) { 1849 if (ret == -ENOENT) 1850 return -EOPNOTSUPP; 1851 return ret; 1852 } 1853 1854 if (WARN_ON(!res->cdev)) 1855 return -EINVAL; 1856 return 0; 1857 } 1858 1859 /** 1860 * ib_set_client_data - Set IB client context 1861 * @device:Device to set context for 1862 * @client:Client to set context for 1863 * @data:Context to set 1864 * 1865 * ib_set_client_data() sets client context data that can be retrieved with 1866 * ib_get_client_data(). This can only be called while the client is 1867 * registered to the device, once the ib_client remove() callback returns this 1868 * cannot be called. 
1869 */ 1870 void ib_set_client_data(struct ib_device *device, struct ib_client *client, 1871 void *data) 1872 { 1873 void *rc; 1874 1875 if (WARN_ON(IS_ERR(data))) 1876 data = NULL; 1877 1878 rc = xa_store(&device->client_data, client->client_id, data, 1879 GFP_KERNEL); 1880 WARN_ON(xa_is_err(rc)); 1881 } 1882 EXPORT_SYMBOL(ib_set_client_data); 1883 1884 /** 1885 * ib_register_event_handler - Register an IB event handler 1886 * @event_handler:Handler to register 1887 * 1888 * ib_register_event_handler() registers an event handler that will be 1889 * called back when asynchronous IB events occur (as defined in 1890 * chapter 11 of the InfiniBand Architecture Specification). This 1891 * callback may occur in interrupt context. 1892 */ 1893 void ib_register_event_handler(struct ib_event_handler *event_handler) 1894 { 1895 unsigned long flags; 1896 1897 spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); 1898 list_add_tail(&event_handler->list, 1899 &event_handler->device->event_handler_list); 1900 spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); 1901 } 1902 EXPORT_SYMBOL(ib_register_event_handler); 1903 1904 /** 1905 * ib_unregister_event_handler - Unregister an event handler 1906 * @event_handler:Handler to unregister 1907 * 1908 * Unregister an event handler registered with 1909 * ib_register_event_handler(). 1910 */ 1911 void ib_unregister_event_handler(struct ib_event_handler *event_handler) 1912 { 1913 unsigned long flags; 1914 1915 spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); 1916 list_del(&event_handler->list); 1917 spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); 1918 } 1919 EXPORT_SYMBOL(ib_unregister_event_handler); 1920 1921 /** 1922 * ib_dispatch_event - Dispatch an asynchronous event 1923 * @event:Event to dispatch 1924 * 1925 * Low-level drivers must call ib_dispatch_event() to dispatch the 1926 * event to all registered event handlers when an asynchronous event 1927 * occurs. 1928 */ 1929 void ib_dispatch_event(struct ib_event *event) 1930 { 1931 unsigned long flags; 1932 struct ib_event_handler *handler; 1933 1934 spin_lock_irqsave(&event->device->event_handler_lock, flags); 1935 1936 list_for_each_entry(handler, &event->device->event_handler_list, list) 1937 handler->handler(handler, event); 1938 1939 spin_unlock_irqrestore(&event->device->event_handler_lock, flags); 1940 } 1941 EXPORT_SYMBOL(ib_dispatch_event); 1942 1943 /** 1944 * ib_query_port - Query IB port attributes 1945 * @device:Device to query 1946 * @port_num:Port number to query 1947 * @port_attr:Port attributes 1948 * 1949 * ib_query_port() returns the attributes of a port through the 1950 * @port_attr pointer. 
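 *
 * Minimal illustrative sketch (hypothetical caller, error handling
 * abbreviated):
 *
 *	struct ib_port_attr attr;
 *	unsigned int p;
 *
 *	rdma_for_each_port (device, p) {
 *		if (!ib_query_port(device, p, &attr))
 *			pr_debug("port %u state %d\n", p, attr.state);
 *	}
 *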

/**
 * ib_query_port - Query IB port attributes
 * @device: Device to query
 * @port_num: Port number to query
 * @port_attr: Port attributes
 *
 * ib_query_port() returns the attributes of a port through the
 * @port_attr pointer.
 */
int ib_query_port(struct ib_device *device,
		  u8 port_num,
		  struct ib_port_attr *port_attr)
{
	union ib_gid gid;
	int err;

	if (!rdma_is_port_valid(device, port_num))
		return -EINVAL;

	memset(port_attr, 0, sizeof(*port_attr));
	err = device->ops.query_port(device, port_num, port_attr);
	if (err || port_attr->subnet_prefix)
		return err;

	if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND)
		return 0;

	err = device->ops.query_gid(device, port_num, 0, &gid);
	if (err)
		return err;

	port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix);
	return 0;
}
EXPORT_SYMBOL(ib_query_port);
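
/*
 * Illustrative sketch, compiled out: a typical ib_query_port() caller that
 * checks the port state before creating resources on it.  The "example_"
 * name is hypothetical.
 */
#if 0
static int example_check_port(struct ib_device *device, u8 port_num)
{
	struct ib_port_attr attr;
	int ret;

	ret = ib_query_port(device, port_num, &attr);
	if (ret)
		return ret;

	if (attr.state != IB_PORT_ACTIVE)
		return -ENETDOWN;

	dev_dbg(&device->dev, "port %u active, max_mtu %d, lid 0x%x\n",
		port_num, attr.max_mtu, attr.lid);
	return 0;
}
#endif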

static void add_ndev_hash(struct ib_port_data *pdata)
{
	unsigned long flags;

	might_sleep();

	spin_lock_irqsave(&ndev_hash_lock, flags);
	if (hash_hashed(&pdata->ndev_hash_link)) {
		hash_del_rcu(&pdata->ndev_hash_link);
		spin_unlock_irqrestore(&ndev_hash_lock, flags);
		/*
		 * We cannot do hash_add_rcu after a hash_del_rcu until the
		 * grace period
		 */
		synchronize_rcu();
		spin_lock_irqsave(&ndev_hash_lock, flags);
	}
	if (pdata->netdev)
		hash_add_rcu(ndev_hash, &pdata->ndev_hash_link,
			     (uintptr_t)pdata->netdev);
	spin_unlock_irqrestore(&ndev_hash_lock, flags);
}

/**
 * ib_device_set_netdev - Associate the ib_dev with an underlying net_device
 * @ib_dev: Device to modify
 * @ndev: net_device to affiliate, may be NULL
 * @port: IB port the net_device is connected to
 *
 * Drivers should use this to link the ib_device to a netdev so the netdev
 * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be
 * affiliated with any port.
 *
 * The caller must ensure that the given ndev is not unregistered or
 * unregistering, and that either the ib_device is unregistered or
 * ib_device_set_netdev() is called with NULL when the ndev sends a
 * NETDEV_UNREGISTER event.
 */
int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
			 unsigned int port)
{
	struct net_device *old_ndev;
	struct ib_port_data *pdata;
	unsigned long flags;
	int ret;

	/*
	 * Drivers wish to call this before ib_register_device(), so we have
	 * to set up the port data early.
	 */
	ret = alloc_port_data(ib_dev);
	if (ret)
		return ret;

	if (!rdma_is_port_valid(ib_dev, port))
		return -EINVAL;

	pdata = &ib_dev->port_data[port];
	spin_lock_irqsave(&pdata->netdev_lock, flags);
	old_ndev = rcu_dereference_protected(
		pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
	if (old_ndev == ndev) {
		spin_unlock_irqrestore(&pdata->netdev_lock, flags);
		return 0;
	}

	if (ndev)
		dev_hold(ndev);
	rcu_assign_pointer(pdata->netdev, ndev);
	spin_unlock_irqrestore(&pdata->netdev_lock, flags);

	add_ndev_hash(pdata);
	if (old_ndev)
		dev_put(old_ndev);

	return 0;
}
EXPORT_SYMBOL(ib_device_set_netdev);

static void free_netdevs(struct ib_device *ib_dev)
{
	unsigned long flags;
	unsigned int port;

	if (!ib_dev->port_data)
		return;

	rdma_for_each_port (ib_dev, port) {
		struct ib_port_data *pdata = &ib_dev->port_data[port];
		struct net_device *ndev;

		spin_lock_irqsave(&pdata->netdev_lock, flags);
		ndev = rcu_dereference_protected(
			pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
		if (ndev) {
			spin_lock(&ndev_hash_lock);
			hash_del_rcu(&pdata->ndev_hash_link);
			spin_unlock(&ndev_hash_lock);

			/*
			 * If this is the last dev_put there is still a
			 * synchronize_rcu before the netdev is kfreed, so we
			 * can continue to rely on unlocked pointer
			 * comparisons after the put
			 */
			rcu_assign_pointer(pdata->netdev, NULL);
			dev_put(ndev);
		}
		spin_unlock_irqrestore(&pdata->netdev_lock, flags);
	}
}

struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
					unsigned int port)
{
	struct ib_port_data *pdata;
	struct net_device *res;

	if (!rdma_is_port_valid(ib_dev, port))
		return NULL;

	pdata = &ib_dev->port_data[port];

	/*
	 * New drivers should use ib_device_set_netdev() not the legacy
	 * get_netdev().
	 */
	if (ib_dev->ops.get_netdev)
		res = ib_dev->ops.get_netdev(ib_dev, port);
	else {
		spin_lock(&pdata->netdev_lock);
		res = rcu_dereference_protected(
			pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
		if (res)
			dev_hold(res);
		spin_unlock(&pdata->netdev_lock);
	}

	/*
	 * If we are starting to unregister expedite things by preventing
	 * propagation of an unregistering netdev.
	 */
	if (res && res->reg_state != NETREG_REGISTERED) {
		dev_put(res);
		return NULL;
	}

	return res;
}
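
/*
 * Illustrative sketch, compiled out: how a RoCE driver would associate its
 * netdevs with the ib_device, one per port, before registration, and drop
 * the association when a netdev unregisters, as the kernel-doc above
 * requires.  The "example_*" names are hypothetical.
 */
#if 0
static int example_link_netdevs(struct ib_device *ib_dev,
				struct net_device **ndevs,
				unsigned int nports)
{
	unsigned int port;
	int ret;

	for (port = 1; port <= nports; port++) {
		ret = ib_device_set_netdev(ib_dev, ndevs[port - 1], port);
		if (ret)
			return ret;
	}
	return 0;
}

static void example_netdev_going_away(struct ib_device *ib_dev,
				      unsigned int port)
{
	/* Clear the affiliation on NETDEV_UNREGISTER */
	ib_device_set_netdev(ib_dev, NULL, port);
}
#endif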

/**
 * ib_device_get_by_netdev - Find an IB device associated with a netdev
 * @ndev: netdev to locate
 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
 *
 * Find and hold an ib_device that is associated with a netdev via
 * ib_device_set_netdev(). The caller must call ib_device_put() on the
 * returned pointer.
 */
struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
					  enum rdma_driver_id driver_id)
{
	struct ib_device *res = NULL;
	struct ib_port_data *cur;

	rcu_read_lock();
	hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link,
				    (uintptr_t)ndev) {
		if (rcu_access_pointer(cur->netdev) == ndev &&
		    (driver_id == RDMA_DRIVER_UNKNOWN ||
		     cur->ib_dev->ops.driver_id == driver_id) &&
		    ib_device_try_get(cur->ib_dev)) {
			res = cur->ib_dev;
			break;
		}
	}
	rcu_read_unlock();

	return res;
}
EXPORT_SYMBOL(ib_device_get_by_netdev);

/**
 * ib_enum_roce_netdev - enumerate all RoCE ports
 * @ib_dev: IB device we want to query
 * @filter: Should we call the callback?
 * @filter_cookie: Cookie passed to filter
 * @cb: Callback to call for each found RoCE port
 * @cookie: Cookie passed back to the callback
 *
 * Enumerates all of the physical RoCE ports of ib_dev which are related to a
 * netdevice and calls cb() on each port for which the filter() function
 * returns non-zero.
 */
void ib_enum_roce_netdev(struct ib_device *ib_dev,
			 roce_netdev_filter filter,
			 void *filter_cookie,
			 roce_netdev_callback cb,
			 void *cookie)
{
	unsigned int port;

	rdma_for_each_port (ib_dev, port)
		if (rdma_protocol_roce(ib_dev, port)) {
			struct net_device *idev =
				ib_device_get_netdev(ib_dev, port);

			if (filter(ib_dev, port, idev, filter_cookie))
				cb(ib_dev, port, idev, cookie);

			if (idev)
				dev_put(idev);
		}
}

/**
 * ib_enum_all_roce_netdevs - enumerate all RoCE devices
 * @filter: Should we call the callback?
 * @filter_cookie: Cookie passed to filter
 * @cb: Callback to call for each found RoCE port
 * @cookie: Cookie passed back to the callback
 *
 * Enumerates the physical ports of all registered RoCE devices which are
 * related to netdevices and calls cb() on each port for which the filter()
 * function returns non-zero.
 */
void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
			      void *filter_cookie,
			      roce_netdev_callback cb,
			      void *cookie)
{
	struct ib_device *dev;
	unsigned long index;

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED)
		ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
	up_read(&devices_rwsem);
}

/**
 * ib_enum_all_devs - enumerate all ib_devices
 * @nldev_cb: Callback to call for each found ib_device
 * @skb: The skb the netlink reply is built on
 * @cb: The netlink callback context
 *
 * Enumerates all registered ib_devices visible in the caller's net namespace
 * and calls nldev_cb() on each of them.
 */
int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
		     struct netlink_callback *cb)
{
	unsigned long index;
	struct ib_device *dev;
	unsigned int idx = 0;
	int ret = 0;

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
		if (!rdma_dev_access_netns(dev, sock_net(skb->sk)))
			continue;

		ret = nldev_cb(dev, skb, cb, idx);
		if (ret)
			break;
		idx++;
	}
	up_read(&devices_rwsem);
	return ret;
}
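
/*
 * Illustrative sketch, compiled out: resolving a netdev back to its RDMA
 * device with ib_device_get_by_netdev().  The caller receives a registration
 * reference and must drop it with ib_device_put().  The "example_" name is
 * hypothetical.
 */
#if 0
static void example_report_rdma_dev(struct net_device *ndev)
{
	struct ib_device *ibdev;

	ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
	if (!ibdev)
		return;

	netdev_info(ndev, "backed by RDMA device %s\n",
		    dev_name(&ibdev->dev));
	ib_device_put(ibdev);
}
#endif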

/**
 * ib_query_pkey - Get P_Key table entry
 * @device: Device to query
 * @port_num: Port number to query
 * @index: P_Key table index to query
 * @pkey: Returned P_Key
 *
 * ib_query_pkey() fetches the specified P_Key table entry.
 */
int ib_query_pkey(struct ib_device *device,
		  u8 port_num, u16 index, u16 *pkey)
{
	if (!rdma_is_port_valid(device, port_num))
		return -EINVAL;

	return device->ops.query_pkey(device, port_num, index, pkey);
}
EXPORT_SYMBOL(ib_query_pkey);

/**
 * ib_modify_device - Change IB device attributes
 * @device: Device to modify
 * @device_modify_mask: Mask of attributes to change
 * @device_modify: New attribute values
 *
 * ib_modify_device() changes a device's attributes as specified by
 * the @device_modify_mask and @device_modify structure.
 */
int ib_modify_device(struct ib_device *device,
		     int device_modify_mask,
		     struct ib_device_modify *device_modify)
{
	if (!device->ops.modify_device)
		return -ENOSYS;

	return device->ops.modify_device(device, device_modify_mask,
					 device_modify);
}
EXPORT_SYMBOL(ib_modify_device);

/**
 * ib_modify_port - Modifies the attributes for the specified port.
 * @device: The device to modify.
 * @port_num: The number of the port to modify.
 * @port_modify_mask: Mask used to specify which attributes of the port
 *   to change.
 * @port_modify: New attribute values for the port.
 *
 * ib_modify_port() changes a port's attributes as specified by the
 * @port_modify_mask and @port_modify structure.
 */
int ib_modify_port(struct ib_device *device,
		   u8 port_num, int port_modify_mask,
		   struct ib_port_modify *port_modify)
{
	int rc;

	if (!rdma_is_port_valid(device, port_num))
		return -EINVAL;

	if (device->ops.modify_port)
		rc = device->ops.modify_port(device, port_num,
					     port_modify_mask,
					     port_modify);
	else
		rc = rdma_protocol_roce(device, port_num) ? 0 : -ENOSYS;
	return rc;
}
EXPORT_SYMBOL(ib_modify_port);

/**
 * ib_find_gid - Returns the port number and GID table index where
 *   a specified GID value occurs. It searches only the IB link layer.
 * @device: The device to query.
 * @gid: The GID value to search for.
 * @port_num: The port number of the device where the GID value was found.
 * @index: The index into the GID table where the GID was found. This
 *   parameter may be NULL.
 */
int ib_find_gid(struct ib_device *device, union ib_gid *gid,
		u8 *port_num, u16 *index)
{
	union ib_gid tmp_gid;
	unsigned int port;
	int ret, i;

	rdma_for_each_port (device, port) {
		if (!rdma_protocol_ib(device, port))
			continue;

		for (i = 0; i < device->port_data[port].immutable.gid_tbl_len;
		     ++i) {
			ret = rdma_query_gid(device, port, i, &tmp_gid);
			if (ret)
				return ret;
			if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
				*port_num = port;
				if (index)
					*index = i;
				return 0;
			}
		}
	}

	return -ENOENT;
}
EXPORT_SYMBOL(ib_find_gid);
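
/*
 * Illustrative sketch, compiled out: advertising a port capability via
 * ib_modify_port(), the pattern a management ULP (e.g. an SRP target) uses
 * when it starts listening.  The "example_" name is hypothetical; the
 * capability bit shown (IB_PORT_DEVICE_MGMT_SUP) depends on the ULP.
 */
#if 0
static int example_set_dev_mgmt_cap(struct ib_device *device, u8 port_num,
				    bool enable)
{
	struct ib_port_modify port_modify = {};

	if (enable)
		port_modify.set_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP;
	else
		port_modify.clr_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP;

	return ib_modify_port(device, port_num, 0, &port_modify);
}
#endif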

/**
 * ib_find_pkey - Returns the PKey table index where a specified
 *   PKey value occurs.
 * @device: The device to query.
 * @port_num: The port number of the device to search for the PKey.
 * @pkey: The PKey value to search for.
 * @index: The index into the PKey table where the PKey was found.
 */
int ib_find_pkey(struct ib_device *device,
		 u8 port_num, u16 pkey, u16 *index)
{
	int ret, i;
	u16 tmp_pkey;
	int partial_ix = -1;

	for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len;
	     ++i) {
		ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
		if (ret)
			return ret;
		if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
			/* If there is a full-member pkey take it */
			if (tmp_pkey & 0x8000) {
				*index = i;
				return 0;
			}
			if (partial_ix < 0)
				partial_ix = i;
		}
	}

	/* No full-member pkey; if a limited-member one exists take it */
	if (partial_ix >= 0) {
		*index = partial_ix;
		return 0;
	}
	return -ENOENT;
}
EXPORT_SYMBOL(ib_find_pkey);

/**
 * ib_get_net_dev_by_params() - Return the appropriate net_dev
 *   for a received CM request
 * @dev: An RDMA device on which the request has been received.
 * @port: Port number on the RDMA device.
 * @pkey: The Pkey the request came on.
 * @gid: A GID that the net_dev uses to communicate.
 * @addr: Contains the IP address that the request specified as its
 *   destination.
 */
struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
					    u8 port,
					    u16 pkey,
					    const union ib_gid *gid,
					    const struct sockaddr *addr)
{
	struct net_device *net_dev = NULL;
	unsigned long index;
	void *client_data;

	if (!rdma_protocol_ib(dev, port))
		return NULL;

	/*
	 * Holding the read side guarantees that the client will not become
	 * unregistered while we are calling get_net_dev_by_params()
	 */
	down_read(&dev->client_data_rwsem);
	xan_for_each_marked (&dev->client_data, index, client_data,
			     CLIENT_DATA_REGISTERED) {
		struct ib_client *client = xa_load(&clients, index);

		if (!client || !client->get_net_dev_by_params)
			continue;

		net_dev = client->get_net_dev_by_params(dev, port, pkey, gid,
							addr, client_data);
		if (net_dev)
			break;
	}
	up_read(&dev->client_data_rwsem);

	return net_dev;
}
EXPORT_SYMBOL(ib_get_net_dev_by_params);
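
/*
 * Illustrative sketch, compiled out: locating the default P_Key with
 * ib_find_pkey() above.  Because the search matches the low 15 bits and
 * prefers a full-membership entry, passing 0xffff (IB_DEFAULT_PKEY_FULL in
 * <rdma/ib_mad.h>) finds the full-member default partition if present and
 * otherwise falls back to the limited-member 0x7fff entry.  The "example_"
 * name is hypothetical.
 */
#if 0
static int example_default_pkey_index(struct ib_device *device, u8 port_num,
				      u16 *index)
{
	return ib_find_pkey(device, port_num, 0xffff, index);
}
#endif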

void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
{
	struct ib_device_ops *dev_ops = &dev->ops;
#define SET_DEVICE_OP(ptr, name)                                               \
	do {                                                                   \
		if (ops->name)                                                 \
			if (!((ptr)->name))                                    \
				(ptr)->name = ops->name;                       \
	} while (0)

#define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name)

	if (ops->driver_id != RDMA_DRIVER_UNKNOWN) {
		WARN_ON(dev_ops->driver_id != RDMA_DRIVER_UNKNOWN &&
			dev_ops->driver_id != ops->driver_id);
		dev_ops->driver_id = ops->driver_id;
	}
	if (ops->owner) {
		WARN_ON(dev_ops->owner && dev_ops->owner != ops->owner);
		dev_ops->owner = ops->owner;
	}
	if (ops->uverbs_abi_ver)
		dev_ops->uverbs_abi_ver = ops->uverbs_abi_ver;

	dev_ops->uverbs_no_driver_id_binding |=
		ops->uverbs_no_driver_id_binding;

	SET_DEVICE_OP(dev_ops, add_gid);
	SET_DEVICE_OP(dev_ops, advise_mr);
	SET_DEVICE_OP(dev_ops, alloc_dm);
	SET_DEVICE_OP(dev_ops, alloc_fmr);
	SET_DEVICE_OP(dev_ops, alloc_hw_stats);
	SET_DEVICE_OP(dev_ops, alloc_mr);
	SET_DEVICE_OP(dev_ops, alloc_mr_integrity);
	SET_DEVICE_OP(dev_ops, alloc_mw);
	SET_DEVICE_OP(dev_ops, alloc_pd);
	SET_DEVICE_OP(dev_ops, alloc_rdma_netdev);
	SET_DEVICE_OP(dev_ops, alloc_ucontext);
	SET_DEVICE_OP(dev_ops, alloc_xrcd);
	SET_DEVICE_OP(dev_ops, attach_mcast);
	SET_DEVICE_OP(dev_ops, check_mr_status);
	SET_DEVICE_OP(dev_ops, counter_alloc_stats);
	SET_DEVICE_OP(dev_ops, counter_bind_qp);
	SET_DEVICE_OP(dev_ops, counter_dealloc);
	SET_DEVICE_OP(dev_ops, counter_unbind_qp);
	SET_DEVICE_OP(dev_ops, counter_update_stats);
	SET_DEVICE_OP(dev_ops, create_ah);
	SET_DEVICE_OP(dev_ops, create_counters);
	SET_DEVICE_OP(dev_ops, create_cq);
	SET_DEVICE_OP(dev_ops, create_flow);
	SET_DEVICE_OP(dev_ops, create_flow_action_esp);
	SET_DEVICE_OP(dev_ops, create_qp);
	SET_DEVICE_OP(dev_ops, create_rwq_ind_table);
	SET_DEVICE_OP(dev_ops, create_srq);
	SET_DEVICE_OP(dev_ops, create_wq);
	SET_DEVICE_OP(dev_ops, dealloc_dm);
	SET_DEVICE_OP(dev_ops, dealloc_driver);
	SET_DEVICE_OP(dev_ops, dealloc_fmr);
	SET_DEVICE_OP(dev_ops, dealloc_mw);
	SET_DEVICE_OP(dev_ops, dealloc_pd);
	SET_DEVICE_OP(dev_ops, dealloc_ucontext);
	SET_DEVICE_OP(dev_ops, dealloc_xrcd);
	SET_DEVICE_OP(dev_ops, del_gid);
	SET_DEVICE_OP(dev_ops, dereg_mr);
	SET_DEVICE_OP(dev_ops, destroy_ah);
	SET_DEVICE_OP(dev_ops, destroy_counters);
	SET_DEVICE_OP(dev_ops, destroy_cq);
	SET_DEVICE_OP(dev_ops, destroy_flow);
	SET_DEVICE_OP(dev_ops, destroy_flow_action);
	SET_DEVICE_OP(dev_ops, destroy_qp);
	SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table);
	SET_DEVICE_OP(dev_ops, destroy_srq);
	SET_DEVICE_OP(dev_ops, destroy_wq);
	SET_DEVICE_OP(dev_ops, detach_mcast);
	SET_DEVICE_OP(dev_ops, disassociate_ucontext);
	SET_DEVICE_OP(dev_ops, drain_rq);
	SET_DEVICE_OP(dev_ops, drain_sq);
	SET_DEVICE_OP(dev_ops, enable_driver);
	SET_DEVICE_OP(dev_ops, fill_res_entry);
	SET_DEVICE_OP(dev_ops, get_dev_fw_str);
	SET_DEVICE_OP(dev_ops, get_dma_mr);
	SET_DEVICE_OP(dev_ops, get_hw_stats);
	SET_DEVICE_OP(dev_ops, get_link_layer);
	SET_DEVICE_OP(dev_ops, get_netdev);
	SET_DEVICE_OP(dev_ops, get_port_immutable);
	SET_DEVICE_OP(dev_ops, get_vector_affinity);
	SET_DEVICE_OP(dev_ops, get_vf_config);
	SET_DEVICE_OP(dev_ops, get_vf_stats);
	SET_DEVICE_OP(dev_ops, init_port);
	SET_DEVICE_OP(dev_ops, iw_accept);
	SET_DEVICE_OP(dev_ops, iw_add_ref);
	SET_DEVICE_OP(dev_ops, iw_connect);
	SET_DEVICE_OP(dev_ops, iw_create_listen);
	SET_DEVICE_OP(dev_ops, iw_destroy_listen);
	SET_DEVICE_OP(dev_ops, iw_get_qp);
	SET_DEVICE_OP(dev_ops, iw_reject);
	SET_DEVICE_OP(dev_ops, iw_rem_ref);
	SET_DEVICE_OP(dev_ops, map_mr_sg);
	SET_DEVICE_OP(dev_ops, map_mr_sg_pi);
	SET_DEVICE_OP(dev_ops, map_phys_fmr);
	SET_DEVICE_OP(dev_ops, mmap);
	SET_DEVICE_OP(dev_ops, modify_ah);
	SET_DEVICE_OP(dev_ops, modify_cq);
	SET_DEVICE_OP(dev_ops, modify_device);
	SET_DEVICE_OP(dev_ops, modify_flow_action_esp);
	SET_DEVICE_OP(dev_ops, modify_port);
	SET_DEVICE_OP(dev_ops, modify_qp);
	SET_DEVICE_OP(dev_ops, modify_srq);
	SET_DEVICE_OP(dev_ops, modify_wq);
	SET_DEVICE_OP(dev_ops, peek_cq);
	SET_DEVICE_OP(dev_ops, poll_cq);
	SET_DEVICE_OP(dev_ops, post_recv);
	SET_DEVICE_OP(dev_ops, post_send);
	SET_DEVICE_OP(dev_ops, post_srq_recv);
	SET_DEVICE_OP(dev_ops, process_mad);
	SET_DEVICE_OP(dev_ops, query_ah);
	SET_DEVICE_OP(dev_ops, query_device);
	SET_DEVICE_OP(dev_ops, query_gid);
	SET_DEVICE_OP(dev_ops, query_pkey);
	SET_DEVICE_OP(dev_ops, query_port);
	SET_DEVICE_OP(dev_ops, query_qp);
	SET_DEVICE_OP(dev_ops, query_srq);
	SET_DEVICE_OP(dev_ops, rdma_netdev_get_params);
	SET_DEVICE_OP(dev_ops, read_counters);
	SET_DEVICE_OP(dev_ops, reg_dm_mr);
	SET_DEVICE_OP(dev_ops, reg_user_mr);
	SET_DEVICE_OP(dev_ops, req_ncomp_notif);
	SET_DEVICE_OP(dev_ops, req_notify_cq);
	SET_DEVICE_OP(dev_ops, rereg_user_mr);
	SET_DEVICE_OP(dev_ops, resize_cq);
	SET_DEVICE_OP(dev_ops, set_vf_guid);
	SET_DEVICE_OP(dev_ops, set_vf_link_state);
	SET_DEVICE_OP(dev_ops, unmap_fmr);

	SET_OBJ_SIZE(dev_ops, ib_ah);
	SET_OBJ_SIZE(dev_ops, ib_cq);
	SET_OBJ_SIZE(dev_ops, ib_pd);
	SET_OBJ_SIZE(dev_ops, ib_srq);
	SET_OBJ_SIZE(dev_ops, ib_ucontext);
}
EXPORT_SYMBOL(ib_set_device_ops);
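
/*
 * Illustrative sketch, compiled out: how a provider fills a struct
 * ib_device_ops and hands it to ib_set_device_ops() from its probe path.
 * The driver structures ("example_pd") are hypothetical, only a few of the
 * many ops are hinted at, and the INIT_RDMA_OBJ_SIZE() helper is assumed
 * from <rdma/ib_verbs.h>.
 */
#if 0
struct example_pd {
	struct ib_pd ibpd;	/* must be the first member */
	u32 pdn;
};

static const struct ib_device_ops example_dev_ops = {
	.owner = THIS_MODULE,
	/* A real provider uses its own RDMA_DRIVER_* value here */
	.driver_id = RDMA_DRIVER_UNKNOWN,
	.uverbs_abi_ver = 1,

	/* .alloc_pd = ..., .create_cq = ..., and so on for each verb */

	/* Tells the core how much to allocate for driver-private objects */
	INIT_RDMA_OBJ_SIZE(ib_pd, example_pd, ibpd),
};

static void example_set_ops(struct ib_device *ibdev)
{
	ib_set_device_ops(ibdev, &example_dev_ops);
}
#endif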

static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = {
	[RDMA_NL_LS_OP_RESOLVE] = {
		.doit = ib_nl_handle_resolve_resp,
		.flags = RDMA_NL_ADMIN_PERM,
	},
	[RDMA_NL_LS_OP_SET_TIMEOUT] = {
		.doit = ib_nl_handle_set_timeout,
		.flags = RDMA_NL_ADMIN_PERM,
	},
	[RDMA_NL_LS_OP_IP_RESOLVE] = {
		.doit = ib_nl_handle_ip_res_resp,
		.flags = RDMA_NL_ADMIN_PERM,
	},
};

static int __init ib_core_init(void)
{
	int ret;

	ib_wq = alloc_workqueue("infiniband", 0, 0);
	if (!ib_wq)
		return -ENOMEM;

	ib_comp_wq = alloc_workqueue("ib-comp-wq",
			WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
	if (!ib_comp_wq) {
		ret = -ENOMEM;
		goto err;
	}

	ib_comp_unbound_wq =
		alloc_workqueue("ib-comp-unb-wq",
				WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM |
				WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE);
	if (!ib_comp_unbound_wq) {
		ret = -ENOMEM;
		goto err_comp;
	}

	ret = class_register(&ib_class);
	if (ret) {
		pr_warn("Couldn't create InfiniBand device class\n");
		goto err_comp_unbound;
	}

	ret = rdma_nl_init();
	if (ret) {
		pr_warn("Couldn't init IB netlink interface: err %d\n", ret);
		goto err_sysfs;
	}

	ret = addr_init();
	if (ret) {
		pr_warn("Couldn't init IB address resolution\n");
		goto err_ibnl;
	}

	ret = ib_mad_init();
	if (ret) {
		pr_warn("Couldn't init IB MAD\n");
		goto err_addr;
	}

	ret = ib_sa_init();
	if (ret) {
		pr_warn("Couldn't init SA\n");
		goto err_mad;
	}

	ret = register_blocking_lsm_notifier(&ibdev_lsm_nb);
	if (ret) {
		pr_warn("Couldn't register LSM notifier. ret %d\n", ret);
		goto err_sa;
	}

	ret = register_pernet_device(&rdma_dev_net_ops);
	if (ret) {
		pr_warn("Couldn't init compat dev. ret %d\n", ret);
		goto err_compat;
	}

	nldev_init();
	rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
	roce_gid_mgmt_init();

	return 0;

err_compat:
	unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
err_sa:
	ib_sa_cleanup();
err_mad:
	ib_mad_cleanup();
err_addr:
	addr_cleanup();
err_ibnl:
	rdma_nl_exit();
err_sysfs:
	class_unregister(&ib_class);
err_comp_unbound:
	destroy_workqueue(ib_comp_unbound_wq);
err_comp:
	destroy_workqueue(ib_comp_wq);
err:
	destroy_workqueue(ib_wq);
	return ret;
}

static void __exit ib_core_cleanup(void)
{
	roce_gid_mgmt_cleanup();
	nldev_exit();
	rdma_nl_unregister(RDMA_NL_LS);
	unregister_pernet_device(&rdma_dev_net_ops);
	unregister_blocking_lsm_notifier(&ibdev_lsm_nb);
	ib_sa_cleanup();
	ib_mad_cleanup();
	addr_cleanup();
	rdma_nl_exit();
	class_unregister(&ib_class);
	destroy_workqueue(ib_comp_unbound_wq);
	destroy_workqueue(ib_comp_wq);
	/* Make sure that any pending umem accounting work is done. */
	destroy_workqueue(ib_wq);
	flush_workqueue(system_unbound_wq);
	WARN_ON(!xa_empty(&clients));
	WARN_ON(!xa_empty(&devices));
}

MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4);

/* ib_core relies on the netdev stack to first register the
 * net_ns_type_operations ns kobject type before ib_core initialization.
 */
fs_initcall(ib_core_init);
module_exit(ib_core_cleanup);