1 /* 2 * Copyright (c) 2004 Topspin Communications. All rights reserved. 3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34 #include <linux/module.h> 35 #include <linux/string.h> 36 #include <linux/errno.h> 37 #include <linux/kernel.h> 38 #include <linux/slab.h> 39 #include <linux/init.h> 40 #include <linux/netdevice.h> 41 #include <net/net_namespace.h> 42 #include <net/netns/generic.h> 43 #include <linux/security.h> 44 #include <linux/notifier.h> 45 #include <linux/hashtable.h> 46 #include <rdma/rdma_netlink.h> 47 #include <rdma/ib_addr.h> 48 #include <rdma/ib_cache.h> 49 50 #include "core_priv.h" 51 #include "restrack.h" 52 53 MODULE_AUTHOR("Roland Dreier"); 54 MODULE_DESCRIPTION("core kernel InfiniBand API"); 55 MODULE_LICENSE("Dual BSD/GPL"); 56 57 struct workqueue_struct *ib_comp_wq; 58 struct workqueue_struct *ib_comp_unbound_wq; 59 struct workqueue_struct *ib_wq; 60 EXPORT_SYMBOL_GPL(ib_wq); 61 62 /* 63 * Each of the three rwsem locks (devices, clients, client_data) protects the 64 * xarray of the same name. Specifically it allows the caller to assert that 65 * the MARK will/will not be changing under the lock, and for devices and 66 * clients, that the value in the xarray is still a valid pointer. Change of 67 * the MARK is linked to the object state, so holding the lock and testing the 68 * MARK also asserts that the contained object is in a certain state. 69 * 70 * This is used to build a two stage register/unregister flow where objects 71 * can continue to be in the xarray even though they are still in progress to 72 * register/unregister. 73 * 74 * The xarray itself provides additional locking, and restartable iteration, 75 * which is also relied on. 76 * 77 * Locks should not be nested, with the exception of client_data, which is 78 * allowed to nest under the read side of the other two locks. 79 * 80 * The devices_rwsem also protects the device name list, any change or 81 * assignment of device name must also hold the write side to guarantee unique 82 * names. 83 */ 84 85 /* 86 * devices contains devices that have had their names assigned. The 87 * devices may not be registered. 
 * Users that care about the registration status need to call
 * ib_device_try_get() on the device to ensure it is registered, and keep it
 * registered, for the required duration.
 */
static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(devices_rwsem);
#define DEVICE_REGISTERED XA_MARK_1

static LIST_HEAD(client_list);
#define CLIENT_REGISTERED XA_MARK_1
static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(clients_rwsem);

/*
 * If client_data is registered then the corresponding client must also still
 * be registered.
 */
#define CLIENT_DATA_REGISTERED XA_MARK_1

/**
 * struct rdma_dev_net - rdma net namespace metadata for a net
 * @net: Pointer to owner net namespace
 * @id: xarray id to identify the net namespace.
 */
struct rdma_dev_net {
	possible_net_t net;
	u32 id;
};

static unsigned int rdma_dev_net_id;

/*
 * A list of net namespaces is maintained in an xarray. This is necessary
 * because we can't get the locking right using the existing net ns list. We
 * would require an init_net callback after the list is updated.
 */
static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
/*
 * rwsem to protect accessing the rdma_nets xarray entries.
 */
static DECLARE_RWSEM(rdma_nets_rwsem);

bool ib_devices_shared_netns = true;
module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
MODULE_PARM_DESC(netns_mode,
		 "Share device among net namespaces; default=1 (shared)");
/**
 * rdma_dev_access_netns() - Return whether an rdma device can be accessed
 *			     from a specified net namespace or not.
 * @dev: Pointer to the rdma device which needs to be checked
 * @net: Pointer to the net namespace for which access is to be checked
 *
 * When the rdma device is in shared mode, it ignores the net namespace.
 * When the rdma device is exclusive to a net namespace, the device's net
 * namespace is checked against the specified one.
 */
bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
{
	return (ib_devices_shared_netns ||
		net_eq(read_pnet(&dev->coredev.rdma_net), net));
}
EXPORT_SYMBOL(rdma_dev_access_netns);

/*
 * xarray has this behavior where it won't iterate over NULL values stored in
 * allocated arrays. So we need our own iterator to see all values stored in
 * the array. This does the same thing as xa_for_each except that it also
 * returns NULL valued entries if the array is allocating. Simplified to only
 * work on simple xarrays.
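 *
 * A minimal usage sketch of the resulting iterator (mirroring how
 * ib_device_rename() walks client_data later in this file); do_something()
 * is a hypothetical helper, and entry may legitimately be NULL:
 *
 *	unsigned long index;
 *	void *entry;
 *
 *	xan_for_each_marked(&device->client_data, index, entry,
 *			    CLIENT_DATA_REGISTERED)
 *		do_something(index, entry);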
160 */ 161 static void *xan_find_marked(struct xarray *xa, unsigned long *indexp, 162 xa_mark_t filter) 163 { 164 XA_STATE(xas, xa, *indexp); 165 void *entry; 166 167 rcu_read_lock(); 168 do { 169 entry = xas_find_marked(&xas, ULONG_MAX, filter); 170 if (xa_is_zero(entry)) 171 break; 172 } while (xas_retry(&xas, entry)); 173 rcu_read_unlock(); 174 175 if (entry) { 176 *indexp = xas.xa_index; 177 if (xa_is_zero(entry)) 178 return NULL; 179 return entry; 180 } 181 return XA_ERROR(-ENOENT); 182 } 183 #define xan_for_each_marked(xa, index, entry, filter) \ 184 for (index = 0, entry = xan_find_marked(xa, &(index), filter); \ 185 !xa_is_err(entry); \ 186 (index)++, entry = xan_find_marked(xa, &(index), filter)) 187 188 /* RCU hash table mapping netdevice pointers to struct ib_port_data */ 189 static DEFINE_SPINLOCK(ndev_hash_lock); 190 static DECLARE_HASHTABLE(ndev_hash, 5); 191 192 static void free_netdevs(struct ib_device *ib_dev); 193 static void ib_unregister_work(struct work_struct *work); 194 static void __ib_unregister_device(struct ib_device *device); 195 static int ib_security_change(struct notifier_block *nb, unsigned long event, 196 void *lsm_data); 197 static void ib_policy_change_task(struct work_struct *work); 198 static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task); 199 200 static void __ibdev_printk(const char *level, const struct ib_device *ibdev, 201 struct va_format *vaf) 202 { 203 if (ibdev && ibdev->dev.parent) 204 dev_printk_emit(level[1] - '0', 205 ibdev->dev.parent, 206 "%s %s %s: %pV", 207 dev_driver_string(ibdev->dev.parent), 208 dev_name(ibdev->dev.parent), 209 dev_name(&ibdev->dev), 210 vaf); 211 else if (ibdev) 212 printk("%s%s: %pV", 213 level, dev_name(&ibdev->dev), vaf); 214 else 215 printk("%s(NULL ib_device): %pV", level, vaf); 216 } 217 218 void ibdev_printk(const char *level, const struct ib_device *ibdev, 219 const char *format, ...) 220 { 221 struct va_format vaf; 222 va_list args; 223 224 va_start(args, format); 225 226 vaf.fmt = format; 227 vaf.va = &args; 228 229 __ibdev_printk(level, ibdev, &vaf); 230 231 va_end(args); 232 } 233 EXPORT_SYMBOL(ibdev_printk); 234 235 #define define_ibdev_printk_level(func, level) \ 236 void func(const struct ib_device *ibdev, const char *fmt, ...) 
\ 237 { \ 238 struct va_format vaf; \ 239 va_list args; \ 240 \ 241 va_start(args, fmt); \ 242 \ 243 vaf.fmt = fmt; \ 244 vaf.va = &args; \ 245 \ 246 __ibdev_printk(level, ibdev, &vaf); \ 247 \ 248 va_end(args); \ 249 } \ 250 EXPORT_SYMBOL(func); 251 252 define_ibdev_printk_level(ibdev_emerg, KERN_EMERG); 253 define_ibdev_printk_level(ibdev_alert, KERN_ALERT); 254 define_ibdev_printk_level(ibdev_crit, KERN_CRIT); 255 define_ibdev_printk_level(ibdev_err, KERN_ERR); 256 define_ibdev_printk_level(ibdev_warn, KERN_WARNING); 257 define_ibdev_printk_level(ibdev_notice, KERN_NOTICE); 258 define_ibdev_printk_level(ibdev_info, KERN_INFO); 259 260 static struct notifier_block ibdev_lsm_nb = { 261 .notifier_call = ib_security_change, 262 }; 263 264 static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, 265 struct net *net); 266 267 /* Pointer to the RCU head at the start of the ib_port_data array */ 268 struct ib_port_data_rcu { 269 struct rcu_head rcu_head; 270 struct ib_port_data pdata[]; 271 }; 272 273 static int ib_device_check_mandatory(struct ib_device *device) 274 { 275 #define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x } 276 static const struct { 277 size_t offset; 278 char *name; 279 } mandatory_table[] = { 280 IB_MANDATORY_FUNC(query_device), 281 IB_MANDATORY_FUNC(query_port), 282 IB_MANDATORY_FUNC(query_pkey), 283 IB_MANDATORY_FUNC(alloc_pd), 284 IB_MANDATORY_FUNC(dealloc_pd), 285 IB_MANDATORY_FUNC(create_qp), 286 IB_MANDATORY_FUNC(modify_qp), 287 IB_MANDATORY_FUNC(destroy_qp), 288 IB_MANDATORY_FUNC(post_send), 289 IB_MANDATORY_FUNC(post_recv), 290 IB_MANDATORY_FUNC(create_cq), 291 IB_MANDATORY_FUNC(destroy_cq), 292 IB_MANDATORY_FUNC(poll_cq), 293 IB_MANDATORY_FUNC(req_notify_cq), 294 IB_MANDATORY_FUNC(get_dma_mr), 295 IB_MANDATORY_FUNC(dereg_mr), 296 IB_MANDATORY_FUNC(get_port_immutable) 297 }; 298 int i; 299 300 device->kverbs_provider = true; 301 for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { 302 if (!*(void **) ((void *) &device->ops + 303 mandatory_table[i].offset)) { 304 device->kverbs_provider = false; 305 break; 306 } 307 } 308 309 return 0; 310 } 311 312 /* 313 * Caller must perform ib_device_put() to return the device reference count 314 * when ib_device_get_by_index() returns valid device pointer. 315 */ 316 struct ib_device *ib_device_get_by_index(const struct net *net, u32 index) 317 { 318 struct ib_device *device; 319 320 down_read(&devices_rwsem); 321 device = xa_load(&devices, index); 322 if (device) { 323 if (!rdma_dev_access_netns(device, net)) { 324 device = NULL; 325 goto out; 326 } 327 328 if (!ib_device_try_get(device)) 329 device = NULL; 330 } 331 out: 332 up_read(&devices_rwsem); 333 return device; 334 } 335 336 /** 337 * ib_device_put - Release IB device reference 338 * @device: device whose reference to be released 339 * 340 * ib_device_put() releases reference to the IB device to allow it to be 341 * unregistered and eventually free. 
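 *
 * A hedged usage sketch: every successful ib_device_get_by_index() or
 * ib_device_try_get() must be balanced by exactly one ib_device_put()
 * (rdma_do_work() is a hypothetical caller):
 *
 *	struct ib_device *dev;
 *
 *	dev = ib_device_get_by_index(net, index);
 *	if (!dev)
 *		return -ENODEV;
 *	rdma_do_work(dev);
 *	ib_device_put(dev);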
342 */ 343 void ib_device_put(struct ib_device *device) 344 { 345 if (refcount_dec_and_test(&device->refcount)) 346 complete(&device->unreg_completion); 347 } 348 EXPORT_SYMBOL(ib_device_put); 349 350 static struct ib_device *__ib_device_get_by_name(const char *name) 351 { 352 struct ib_device *device; 353 unsigned long index; 354 355 xa_for_each (&devices, index, device) 356 if (!strcmp(name, dev_name(&device->dev))) 357 return device; 358 359 return NULL; 360 } 361 362 /** 363 * ib_device_get_by_name - Find an IB device by name 364 * @name: The name to look for 365 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) 366 * 367 * Find and hold an ib_device by its name. The caller must call 368 * ib_device_put() on the returned pointer. 369 */ 370 struct ib_device *ib_device_get_by_name(const char *name, 371 enum rdma_driver_id driver_id) 372 { 373 struct ib_device *device; 374 375 down_read(&devices_rwsem); 376 device = __ib_device_get_by_name(name); 377 if (device && driver_id != RDMA_DRIVER_UNKNOWN && 378 device->driver_id != driver_id) 379 device = NULL; 380 381 if (device) { 382 if (!ib_device_try_get(device)) 383 device = NULL; 384 } 385 up_read(&devices_rwsem); 386 return device; 387 } 388 EXPORT_SYMBOL(ib_device_get_by_name); 389 390 static int rename_compat_devs(struct ib_device *device) 391 { 392 struct ib_core_device *cdev; 393 unsigned long index; 394 int ret = 0; 395 396 mutex_lock(&device->compat_devs_mutex); 397 xa_for_each (&device->compat_devs, index, cdev) { 398 ret = device_rename(&cdev->dev, dev_name(&device->dev)); 399 if (ret) { 400 dev_warn(&cdev->dev, 401 "Fail to rename compatdev to new name %s\n", 402 dev_name(&device->dev)); 403 break; 404 } 405 } 406 mutex_unlock(&device->compat_devs_mutex); 407 return ret; 408 } 409 410 int ib_device_rename(struct ib_device *ibdev, const char *name) 411 { 412 unsigned long index; 413 void *client_data; 414 int ret; 415 416 down_write(&devices_rwsem); 417 if (!strcmp(name, dev_name(&ibdev->dev))) { 418 up_write(&devices_rwsem); 419 return 0; 420 } 421 422 if (__ib_device_get_by_name(name)) { 423 up_write(&devices_rwsem); 424 return -EEXIST; 425 } 426 427 ret = device_rename(&ibdev->dev, name); 428 if (ret) { 429 up_write(&devices_rwsem); 430 return ret; 431 } 432 433 strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX); 434 ret = rename_compat_devs(ibdev); 435 436 downgrade_write(&devices_rwsem); 437 down_read(&ibdev->client_data_rwsem); 438 xan_for_each_marked(&ibdev->client_data, index, client_data, 439 CLIENT_DATA_REGISTERED) { 440 struct ib_client *client = xa_load(&clients, index); 441 442 if (!client || !client->rename) 443 continue; 444 445 client->rename(ibdev, client_data); 446 } 447 up_read(&ibdev->client_data_rwsem); 448 up_read(&devices_rwsem); 449 return 0; 450 } 451 452 static int alloc_name(struct ib_device *ibdev, const char *name) 453 { 454 struct ib_device *device; 455 unsigned long index; 456 struct ida inuse; 457 int rc; 458 int i; 459 460 lockdep_assert_held_exclusive(&devices_rwsem); 461 ida_init(&inuse); 462 xa_for_each (&devices, index, device) { 463 char buf[IB_DEVICE_NAME_MAX]; 464 465 if (sscanf(dev_name(&device->dev), name, &i) != 1) 466 continue; 467 if (i < 0 || i >= INT_MAX) 468 continue; 469 snprintf(buf, sizeof buf, name, i); 470 if (strcmp(buf, dev_name(&device->dev)) != 0) 471 continue; 472 473 rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL); 474 if (rc < 0) 475 goto out; 476 } 477 478 rc = ida_alloc(&inuse, GFP_KERNEL); 479 if (rc < 0) 480 goto out; 481 482 rc = 
dev_set_name(&ibdev->dev, name, rc); 483 out: 484 ida_destroy(&inuse); 485 return rc; 486 } 487 488 static void ib_device_release(struct device *device) 489 { 490 struct ib_device *dev = container_of(device, struct ib_device, dev); 491 492 free_netdevs(dev); 493 WARN_ON(refcount_read(&dev->refcount)); 494 if (dev->port_data) { 495 ib_cache_release_one(dev); 496 ib_security_release_port_pkey_list(dev); 497 kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu, 498 pdata[0]), 499 rcu_head); 500 } 501 xa_destroy(&dev->compat_devs); 502 xa_destroy(&dev->client_data); 503 kfree_rcu(dev, rcu_head); 504 } 505 506 static int ib_device_uevent(struct device *device, 507 struct kobj_uevent_env *env) 508 { 509 if (add_uevent_var(env, "NAME=%s", dev_name(device))) 510 return -ENOMEM; 511 512 /* 513 * It would be nice to pass the node GUID with the event... 514 */ 515 516 return 0; 517 } 518 519 static const void *net_namespace(struct device *d) 520 { 521 struct ib_core_device *coredev = 522 container_of(d, struct ib_core_device, dev); 523 524 return read_pnet(&coredev->rdma_net); 525 } 526 527 static struct class ib_class = { 528 .name = "infiniband", 529 .dev_release = ib_device_release, 530 .dev_uevent = ib_device_uevent, 531 .ns_type = &net_ns_type_operations, 532 .namespace = net_namespace, 533 }; 534 535 static void rdma_init_coredev(struct ib_core_device *coredev, 536 struct ib_device *dev, struct net *net) 537 { 538 /* This BUILD_BUG_ON is intended to catch layout change 539 * of union of ib_core_device and device. 540 * dev must be the first element as ib_core and providers 541 * driver uses it. Adding anything in ib_core_device before 542 * device will break this assumption. 543 */ 544 BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) != 545 offsetof(struct ib_device, dev)); 546 547 coredev->dev.class = &ib_class; 548 coredev->dev.groups = dev->groups; 549 device_initialize(&coredev->dev); 550 coredev->owner = dev; 551 INIT_LIST_HEAD(&coredev->port_list); 552 write_pnet(&coredev->rdma_net, net); 553 } 554 555 /** 556 * _ib_alloc_device - allocate an IB device struct 557 * @size:size of structure to allocate 558 * 559 * Low-level drivers should use ib_alloc_device() to allocate &struct 560 * ib_device. @size is the size of the structure to be allocated, 561 * including any private data used by the low-level driver. 562 * ib_dealloc_device() must be used to free structures allocated with 563 * ib_alloc_device(). 564 */ 565 struct ib_device *_ib_alloc_device(size_t size) 566 { 567 struct ib_device *device; 568 569 if (WARN_ON(size < sizeof(struct ib_device))) 570 return NULL; 571 572 device = kzalloc(size, GFP_KERNEL); 573 if (!device) 574 return NULL; 575 576 if (rdma_restrack_init(device)) { 577 kfree(device); 578 return NULL; 579 } 580 581 device->groups[0] = &ib_dev_attr_group; 582 rdma_init_coredev(&device->coredev, device, &init_net); 583 584 INIT_LIST_HEAD(&device->event_handler_list); 585 spin_lock_init(&device->event_handler_lock); 586 mutex_init(&device->unregistration_lock); 587 /* 588 * client_data needs to be alloc because we don't want our mark to be 589 * destroyed if the user stores NULL in the client data. 
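 *
 * A hedged sketch of the xarray behavior this relies on (and which
 * xan_find_marked() above compensates for when iterating): in an
 * allocating array a NULL store leaves the index allocated as a "zero
 * entry", so a mark set on that index survives:
 *
 *	xa_store(&device->client_data, id, NULL, GFP_KERNEL);
 *	still_set = xa_get_mark(&device->client_data, id,
 *				CLIENT_DATA_REGISTERED);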
590 */ 591 xa_init_flags(&device->client_data, XA_FLAGS_ALLOC); 592 init_rwsem(&device->client_data_rwsem); 593 xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC); 594 mutex_init(&device->compat_devs_mutex); 595 init_completion(&device->unreg_completion); 596 INIT_WORK(&device->unregistration_work, ib_unregister_work); 597 598 return device; 599 } 600 EXPORT_SYMBOL(_ib_alloc_device); 601 602 /** 603 * ib_dealloc_device - free an IB device struct 604 * @device:structure to free 605 * 606 * Free a structure allocated with ib_alloc_device(). 607 */ 608 void ib_dealloc_device(struct ib_device *device) 609 { 610 if (device->ops.dealloc_driver) 611 device->ops.dealloc_driver(device); 612 613 /* 614 * ib_unregister_driver() requires all devices to remain in the xarray 615 * while their ops are callable. The last op we call is dealloc_driver 616 * above. This is needed to create a fence on op callbacks prior to 617 * allowing the driver module to unload. 618 */ 619 down_write(&devices_rwsem); 620 if (xa_load(&devices, device->index) == device) 621 xa_erase(&devices, device->index); 622 up_write(&devices_rwsem); 623 624 /* Expedite releasing netdev references */ 625 free_netdevs(device); 626 627 WARN_ON(!xa_empty(&device->compat_devs)); 628 WARN_ON(!xa_empty(&device->client_data)); 629 WARN_ON(refcount_read(&device->refcount)); 630 rdma_restrack_clean(device); 631 /* Balances with device_initialize */ 632 put_device(&device->dev); 633 } 634 EXPORT_SYMBOL(ib_dealloc_device); 635 636 /* 637 * add_client_context() and remove_client_context() must be safe against 638 * parallel calls on the same device - registration/unregistration of both the 639 * device and client can be occurring in parallel. 640 * 641 * The routines need to be a fence, any caller must not return until the add 642 * or remove is fully completed. 643 */ 644 static int add_client_context(struct ib_device *device, 645 struct ib_client *client) 646 { 647 int ret = 0; 648 649 if (!device->kverbs_provider && !client->no_kverbs_req) 650 return 0; 651 652 down_write(&device->client_data_rwsem); 653 /* 654 * Another caller to add_client_context got here first and has already 655 * completely initialized context. 
656 */ 657 if (xa_get_mark(&device->client_data, client->client_id, 658 CLIENT_DATA_REGISTERED)) 659 goto out; 660 661 ret = xa_err(xa_store(&device->client_data, client->client_id, NULL, 662 GFP_KERNEL)); 663 if (ret) 664 goto out; 665 downgrade_write(&device->client_data_rwsem); 666 if (client->add) 667 client->add(device); 668 669 /* Readers shall not see a client until add has been completed */ 670 xa_set_mark(&device->client_data, client->client_id, 671 CLIENT_DATA_REGISTERED); 672 up_read(&device->client_data_rwsem); 673 return 0; 674 675 out: 676 up_write(&device->client_data_rwsem); 677 return ret; 678 } 679 680 static void remove_client_context(struct ib_device *device, 681 unsigned int client_id) 682 { 683 struct ib_client *client; 684 void *client_data; 685 686 down_write(&device->client_data_rwsem); 687 if (!xa_get_mark(&device->client_data, client_id, 688 CLIENT_DATA_REGISTERED)) { 689 up_write(&device->client_data_rwsem); 690 return; 691 } 692 client_data = xa_load(&device->client_data, client_id); 693 xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED); 694 client = xa_load(&clients, client_id); 695 downgrade_write(&device->client_data_rwsem); 696 697 /* 698 * Notice we cannot be holding any exclusive locks when calling the 699 * remove callback as the remove callback can recurse back into any 700 * public functions in this module and thus try for any locks those 701 * functions take. 702 * 703 * For this reason clients and drivers should not call the 704 * unregistration functions will holdling any locks. 705 * 706 * It tempting to drop the client_data_rwsem too, but this is required 707 * to ensure that unregister_client does not return until all clients 708 * are completely unregistered, which is required to avoid module 709 * unloading races. 710 */ 711 if (client->remove) 712 client->remove(device, client_data); 713 714 xa_erase(&device->client_data, client_id); 715 up_read(&device->client_data_rwsem); 716 } 717 718 static int alloc_port_data(struct ib_device *device) 719 { 720 struct ib_port_data_rcu *pdata_rcu; 721 unsigned int port; 722 723 if (device->port_data) 724 return 0; 725 726 /* This can only be called once the physical port range is defined */ 727 if (WARN_ON(!device->phys_port_cnt)) 728 return -EINVAL; 729 730 /* 731 * device->port_data is indexed directly by the port number to make 732 * access to this data as efficient as possible. 733 * 734 * Therefore port_data is declared as a 1 based array with potential 735 * empty slots at the beginning. 736 */ 737 pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata, 738 rdma_end_port(device) + 1), 739 GFP_KERNEL); 740 if (!pdata_rcu) 741 return -ENOMEM; 742 /* 743 * The rcu_head is put in front of the port data array and the stored 744 * pointer is adjusted since we never need to see that member until 745 * kfree_rcu. 
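 *
 * The containing allocation is recovered at release time by undoing that
 * adjustment with container_of() on the first array element, as
 * ib_device_release() does:
 *
 *	kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
 *			       pdata[0]),
 *		  rcu_head);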
746 */ 747 device->port_data = pdata_rcu->pdata; 748 749 rdma_for_each_port (device, port) { 750 struct ib_port_data *pdata = &device->port_data[port]; 751 752 pdata->ib_dev = device; 753 spin_lock_init(&pdata->pkey_list_lock); 754 INIT_LIST_HEAD(&pdata->pkey_list); 755 spin_lock_init(&pdata->netdev_lock); 756 INIT_HLIST_NODE(&pdata->ndev_hash_link); 757 } 758 return 0; 759 } 760 761 static int verify_immutable(const struct ib_device *dev, u8 port) 762 { 763 return WARN_ON(!rdma_cap_ib_mad(dev, port) && 764 rdma_max_mad_size(dev, port) != 0); 765 } 766 767 static int setup_port_data(struct ib_device *device) 768 { 769 unsigned int port; 770 int ret; 771 772 ret = alloc_port_data(device); 773 if (ret) 774 return ret; 775 776 rdma_for_each_port (device, port) { 777 struct ib_port_data *pdata = &device->port_data[port]; 778 779 ret = device->ops.get_port_immutable(device, port, 780 &pdata->immutable); 781 if (ret) 782 return ret; 783 784 if (verify_immutable(device, port)) 785 return -EINVAL; 786 } 787 return 0; 788 } 789 790 void ib_get_device_fw_str(struct ib_device *dev, char *str) 791 { 792 if (dev->ops.get_dev_fw_str) 793 dev->ops.get_dev_fw_str(dev, str); 794 else 795 str[0] = '\0'; 796 } 797 EXPORT_SYMBOL(ib_get_device_fw_str); 798 799 static void ib_policy_change_task(struct work_struct *work) 800 { 801 struct ib_device *dev; 802 unsigned long index; 803 804 down_read(&devices_rwsem); 805 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 806 unsigned int i; 807 808 rdma_for_each_port (dev, i) { 809 u64 sp; 810 int ret = ib_get_cached_subnet_prefix(dev, 811 i, 812 &sp); 813 814 WARN_ONCE(ret, 815 "ib_get_cached_subnet_prefix err: %d, this should never happen here\n", 816 ret); 817 if (!ret) 818 ib_security_cache_change(dev, i, sp); 819 } 820 } 821 up_read(&devices_rwsem); 822 } 823 824 static int ib_security_change(struct notifier_block *nb, unsigned long event, 825 void *lsm_data) 826 { 827 if (event != LSM_POLICY_CHANGE) 828 return NOTIFY_DONE; 829 830 schedule_work(&ib_policy_change_work); 831 ib_mad_agent_security_change(); 832 833 return NOTIFY_OK; 834 } 835 836 static void compatdev_release(struct device *dev) 837 { 838 struct ib_core_device *cdev = 839 container_of(dev, struct ib_core_device, dev); 840 841 kfree(cdev); 842 } 843 844 static int add_one_compat_dev(struct ib_device *device, 845 struct rdma_dev_net *rnet) 846 { 847 struct ib_core_device *cdev; 848 int ret; 849 850 lockdep_assert_held(&rdma_nets_rwsem); 851 if (!ib_devices_shared_netns) 852 return 0; 853 854 /* 855 * Create and add compat device in all namespaces other than where it 856 * is currently bound to. 857 */ 858 if (net_eq(read_pnet(&rnet->net), 859 read_pnet(&device->coredev.rdma_net))) 860 return 0; 861 862 /* 863 * The first of init_net() or ib_register_device() to take the 864 * compat_devs_mutex wins and gets to add the device. Others will wait 865 * for completion here. 
866 */ 867 mutex_lock(&device->compat_devs_mutex); 868 cdev = xa_load(&device->compat_devs, rnet->id); 869 if (cdev) { 870 ret = 0; 871 goto done; 872 } 873 ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL); 874 if (ret) 875 goto done; 876 877 cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); 878 if (!cdev) { 879 ret = -ENOMEM; 880 goto cdev_err; 881 } 882 883 cdev->dev.parent = device->dev.parent; 884 rdma_init_coredev(cdev, device, read_pnet(&rnet->net)); 885 cdev->dev.release = compatdev_release; 886 dev_set_name(&cdev->dev, "%s", dev_name(&device->dev)); 887 888 ret = device_add(&cdev->dev); 889 if (ret) 890 goto add_err; 891 ret = ib_setup_port_attrs(cdev); 892 if (ret) 893 goto port_err; 894 895 ret = xa_err(xa_store(&device->compat_devs, rnet->id, 896 cdev, GFP_KERNEL)); 897 if (ret) 898 goto insert_err; 899 900 mutex_unlock(&device->compat_devs_mutex); 901 return 0; 902 903 insert_err: 904 ib_free_port_attrs(cdev); 905 port_err: 906 device_del(&cdev->dev); 907 add_err: 908 put_device(&cdev->dev); 909 cdev_err: 910 xa_release(&device->compat_devs, rnet->id); 911 done: 912 mutex_unlock(&device->compat_devs_mutex); 913 return ret; 914 } 915 916 static void remove_one_compat_dev(struct ib_device *device, u32 id) 917 { 918 struct ib_core_device *cdev; 919 920 mutex_lock(&device->compat_devs_mutex); 921 cdev = xa_erase(&device->compat_devs, id); 922 mutex_unlock(&device->compat_devs_mutex); 923 if (cdev) { 924 ib_free_port_attrs(cdev); 925 device_del(&cdev->dev); 926 put_device(&cdev->dev); 927 } 928 } 929 930 static void remove_compat_devs(struct ib_device *device) 931 { 932 struct ib_core_device *cdev; 933 unsigned long index; 934 935 xa_for_each (&device->compat_devs, index, cdev) 936 remove_one_compat_dev(device, index); 937 } 938 939 static int add_compat_devs(struct ib_device *device) 940 { 941 struct rdma_dev_net *rnet; 942 unsigned long index; 943 int ret = 0; 944 945 lockdep_assert_held(&devices_rwsem); 946 947 down_read(&rdma_nets_rwsem); 948 xa_for_each (&rdma_nets, index, rnet) { 949 ret = add_one_compat_dev(device, rnet); 950 if (ret) 951 break; 952 } 953 up_read(&rdma_nets_rwsem); 954 return ret; 955 } 956 957 static void remove_all_compat_devs(void) 958 { 959 struct ib_compat_device *cdev; 960 struct ib_device *dev; 961 unsigned long index; 962 963 down_read(&devices_rwsem); 964 xa_for_each (&devices, index, dev) { 965 unsigned long c_index = 0; 966 967 /* Hold nets_rwsem so that any other thread modifying this 968 * system param can sync with this thread. 969 */ 970 down_read(&rdma_nets_rwsem); 971 xa_for_each (&dev->compat_devs, c_index, cdev) 972 remove_one_compat_dev(dev, c_index); 973 up_read(&rdma_nets_rwsem); 974 } 975 up_read(&devices_rwsem); 976 } 977 978 static int add_all_compat_devs(void) 979 { 980 struct rdma_dev_net *rnet; 981 struct ib_device *dev; 982 unsigned long index; 983 int ret = 0; 984 985 down_read(&devices_rwsem); 986 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 987 unsigned long net_index = 0; 988 989 /* Hold nets_rwsem so that any other thread modifying this 990 * system param can sync with this thread. 
991 */ 992 down_read(&rdma_nets_rwsem); 993 xa_for_each (&rdma_nets, net_index, rnet) { 994 ret = add_one_compat_dev(dev, rnet); 995 if (ret) 996 break; 997 } 998 up_read(&rdma_nets_rwsem); 999 } 1000 up_read(&devices_rwsem); 1001 if (ret) 1002 remove_all_compat_devs(); 1003 return ret; 1004 } 1005 1006 int rdma_compatdev_set(u8 enable) 1007 { 1008 struct rdma_dev_net *rnet; 1009 unsigned long index; 1010 int ret = 0; 1011 1012 down_write(&rdma_nets_rwsem); 1013 if (ib_devices_shared_netns == enable) { 1014 up_write(&rdma_nets_rwsem); 1015 return 0; 1016 } 1017 1018 /* enable/disable of compat devices is not supported 1019 * when more than default init_net exists. 1020 */ 1021 xa_for_each (&rdma_nets, index, rnet) { 1022 ret++; 1023 break; 1024 } 1025 if (!ret) 1026 ib_devices_shared_netns = enable; 1027 up_write(&rdma_nets_rwsem); 1028 if (ret) 1029 return -EBUSY; 1030 1031 if (enable) 1032 ret = add_all_compat_devs(); 1033 else 1034 remove_all_compat_devs(); 1035 return ret; 1036 } 1037 1038 static void rdma_dev_exit_net(struct net *net) 1039 { 1040 struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id); 1041 struct ib_device *dev; 1042 unsigned long index; 1043 int ret; 1044 1045 down_write(&rdma_nets_rwsem); 1046 /* 1047 * Prevent the ID from being re-used and hide the id from xa_for_each. 1048 */ 1049 ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL)); 1050 WARN_ON(ret); 1051 up_write(&rdma_nets_rwsem); 1052 1053 down_read(&devices_rwsem); 1054 xa_for_each (&devices, index, dev) { 1055 get_device(&dev->dev); 1056 /* 1057 * Release the devices_rwsem so that pontentially blocking 1058 * device_del, doesn't hold the devices_rwsem for too long. 1059 */ 1060 up_read(&devices_rwsem); 1061 1062 remove_one_compat_dev(dev, rnet->id); 1063 1064 /* 1065 * If the real device is in the NS then move it back to init. 1066 */ 1067 rdma_dev_change_netns(dev, net, &init_net); 1068 1069 put_device(&dev->dev); 1070 down_read(&devices_rwsem); 1071 } 1072 up_read(&devices_rwsem); 1073 1074 xa_erase(&rdma_nets, rnet->id); 1075 } 1076 1077 static __net_init int rdma_dev_init_net(struct net *net) 1078 { 1079 struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id); 1080 unsigned long index; 1081 struct ib_device *dev; 1082 int ret; 1083 1084 /* No need to create any compat devices in default init_net. */ 1085 if (net_eq(net, &init_net)) 1086 return 0; 1087 1088 write_pnet(&rnet->net, net); 1089 1090 ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL); 1091 if (ret) 1092 return ret; 1093 1094 down_read(&devices_rwsem); 1095 xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) { 1096 /* Hold nets_rwsem so that netlink command cannot change 1097 * system configuration for device sharing mode. 1098 */ 1099 down_read(&rdma_nets_rwsem); 1100 ret = add_one_compat_dev(dev, rnet); 1101 up_read(&rdma_nets_rwsem); 1102 if (ret) 1103 break; 1104 } 1105 up_read(&devices_rwsem); 1106 1107 if (ret) 1108 rdma_dev_exit_net(net); 1109 1110 return ret; 1111 } 1112 1113 /* 1114 * Assign the unique string device name and the unique device index. This is 1115 * undone by ib_dealloc_device. 
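 *
 * A hedged illustration of the name argument handled here: a printf-style
 * "%d" pattern is expanded by alloc_name() to the lowest unused index,
 * while a literal name is taken as-is ("hypo" is a made-up prefix):
 *
 *	ib_register_device(dev, "hypo%d");	allocates "hypo0", "hypo1", ...
 *	ib_register_device(dev, "hypo_roce");	literal name, must be unique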
1116 */ 1117 static int assign_name(struct ib_device *device, const char *name) 1118 { 1119 static u32 last_id; 1120 int ret; 1121 1122 down_write(&devices_rwsem); 1123 /* Assign a unique name to the device */ 1124 if (strchr(name, '%')) 1125 ret = alloc_name(device, name); 1126 else 1127 ret = dev_set_name(&device->dev, name); 1128 if (ret) 1129 goto out; 1130 1131 if (__ib_device_get_by_name(dev_name(&device->dev))) { 1132 ret = -ENFILE; 1133 goto out; 1134 } 1135 strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); 1136 1137 ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b, 1138 &last_id, GFP_KERNEL); 1139 if (ret > 0) 1140 ret = 0; 1141 1142 out: 1143 up_write(&devices_rwsem); 1144 return ret; 1145 } 1146 1147 static void setup_dma_device(struct ib_device *device) 1148 { 1149 struct device *parent = device->dev.parent; 1150 1151 WARN_ON_ONCE(device->dma_device); 1152 if (device->dev.dma_ops) { 1153 /* 1154 * The caller provided custom DMA operations. Copy the 1155 * DMA-related fields that are used by e.g. dma_alloc_coherent() 1156 * into device->dev. 1157 */ 1158 device->dma_device = &device->dev; 1159 if (!device->dev.dma_mask) { 1160 if (parent) 1161 device->dev.dma_mask = parent->dma_mask; 1162 else 1163 WARN_ON_ONCE(true); 1164 } 1165 if (!device->dev.coherent_dma_mask) { 1166 if (parent) 1167 device->dev.coherent_dma_mask = 1168 parent->coherent_dma_mask; 1169 else 1170 WARN_ON_ONCE(true); 1171 } 1172 } else { 1173 /* 1174 * The caller did not provide custom DMA operations. Use the 1175 * DMA mapping operations of the parent device. 1176 */ 1177 WARN_ON_ONCE(!parent); 1178 device->dma_device = parent; 1179 } 1180 /* Setup default max segment size for all IB devices */ 1181 dma_set_max_seg_size(device->dma_device, SZ_2G); 1182 1183 } 1184 1185 /* 1186 * setup_device() allocates memory and sets up data that requires calling the 1187 * device ops, this is the only reason these actions are not done during 1188 * ib_alloc_device. It is undone by ib_dealloc_device(). 1189 */ 1190 static int setup_device(struct ib_device *device) 1191 { 1192 struct ib_udata uhw = {.outlen = 0, .inlen = 0}; 1193 int ret; 1194 1195 setup_dma_device(device); 1196 1197 ret = ib_device_check_mandatory(device); 1198 if (ret) 1199 return ret; 1200 1201 ret = setup_port_data(device); 1202 if (ret) { 1203 dev_warn(&device->dev, "Couldn't create per-port data\n"); 1204 return ret; 1205 } 1206 1207 memset(&device->attrs, 0, sizeof(device->attrs)); 1208 ret = device->ops.query_device(device, &device->attrs, &uhw); 1209 if (ret) { 1210 dev_warn(&device->dev, 1211 "Couldn't query the device attributes\n"); 1212 return ret; 1213 } 1214 1215 return 0; 1216 } 1217 1218 static void disable_device(struct ib_device *device) 1219 { 1220 struct ib_client *client; 1221 1222 WARN_ON(!refcount_read(&device->refcount)); 1223 1224 down_write(&devices_rwsem); 1225 xa_clear_mark(&devices, device->index, DEVICE_REGISTERED); 1226 up_write(&devices_rwsem); 1227 1228 down_read(&clients_rwsem); 1229 list_for_each_entry_reverse(client, &client_list, list) 1230 remove_client_context(device, client->client_id); 1231 up_read(&clients_rwsem); 1232 1233 /* Pairs with refcount_set in enable_device */ 1234 ib_device_put(device); 1235 wait_for_completion(&device->unreg_completion); 1236 1237 /* 1238 * compat devices must be removed after device refcount drops to zero. 1239 * Otherwise init_net() may add more compatdevs after removing compat 1240 * devices and before device is disabled. 
1241 */ 1242 remove_compat_devs(device); 1243 } 1244 1245 /* 1246 * An enabled device is visible to all clients and to all the public facing 1247 * APIs that return a device pointer. This always returns with a new get, even 1248 * if it fails. 1249 */ 1250 static int enable_device_and_get(struct ib_device *device) 1251 { 1252 struct ib_client *client; 1253 unsigned long index; 1254 int ret = 0; 1255 1256 /* 1257 * One ref belongs to the xa and the other belongs to this 1258 * thread. This is needed to guard against parallel unregistration. 1259 */ 1260 refcount_set(&device->refcount, 2); 1261 down_write(&devices_rwsem); 1262 xa_set_mark(&devices, device->index, DEVICE_REGISTERED); 1263 1264 /* 1265 * By using downgrade_write() we ensure that no other thread can clear 1266 * DEVICE_REGISTERED while we are completing the client setup. 1267 */ 1268 downgrade_write(&devices_rwsem); 1269 1270 if (device->ops.enable_driver) { 1271 ret = device->ops.enable_driver(device); 1272 if (ret) 1273 goto out; 1274 } 1275 1276 down_read(&clients_rwsem); 1277 xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) { 1278 ret = add_client_context(device, client); 1279 if (ret) 1280 break; 1281 } 1282 up_read(&clients_rwsem); 1283 if (!ret) 1284 ret = add_compat_devs(device); 1285 out: 1286 up_read(&devices_rwsem); 1287 return ret; 1288 } 1289 1290 /** 1291 * ib_register_device - Register an IB device with IB core 1292 * @device:Device to register 1293 * 1294 * Low-level drivers use ib_register_device() to register their 1295 * devices with the IB core. All registered clients will receive a 1296 * callback for each device that is added. @device must be allocated 1297 * with ib_alloc_device(). 1298 * 1299 * If the driver uses ops.dealloc_driver and calls any ib_unregister_device() 1300 * asynchronously then the device pointer may become freed as soon as this 1301 * function returns. 1302 */ 1303 int ib_register_device(struct ib_device *device, const char *name) 1304 { 1305 int ret; 1306 1307 ret = assign_name(device, name); 1308 if (ret) 1309 return ret; 1310 1311 ret = setup_device(device); 1312 if (ret) 1313 return ret; 1314 1315 ret = ib_cache_setup_one(device); 1316 if (ret) { 1317 dev_warn(&device->dev, 1318 "Couldn't set up InfiniBand P_Key/GID cache\n"); 1319 return ret; 1320 } 1321 1322 ib_device_register_rdmacg(device); 1323 1324 /* 1325 * Ensure that ADD uevent is not fired because it 1326 * is too early amd device is not initialized yet. 1327 */ 1328 dev_set_uevent_suppress(&device->dev, true); 1329 ret = device_add(&device->dev); 1330 if (ret) 1331 goto cg_cleanup; 1332 1333 ret = ib_device_register_sysfs(device); 1334 if (ret) { 1335 dev_warn(&device->dev, 1336 "Couldn't register device with driver model\n"); 1337 goto dev_cleanup; 1338 } 1339 1340 ret = enable_device_and_get(device); 1341 dev_set_uevent_suppress(&device->dev, false); 1342 /* Mark for userspace that device is ready */ 1343 kobject_uevent(&device->dev.kobj, KOBJ_ADD); 1344 if (ret) { 1345 void (*dealloc_fn)(struct ib_device *); 1346 1347 /* 1348 * If we hit this error flow then we don't want to 1349 * automatically dealloc the device since the caller is 1350 * expected to call ib_dealloc_device() after 1351 * ib_register_device() fails. This is tricky due to the 1352 * possibility for a parallel unregistration along with this 1353 * error flow. 
Since we have a refcount here we know any 1354 * parallel flow is stopped in disable_device and will see the 1355 * NULL pointers, causing the responsibility to 1356 * ib_dealloc_device() to revert back to this thread. 1357 */ 1358 dealloc_fn = device->ops.dealloc_driver; 1359 device->ops.dealloc_driver = NULL; 1360 ib_device_put(device); 1361 __ib_unregister_device(device); 1362 device->ops.dealloc_driver = dealloc_fn; 1363 return ret; 1364 } 1365 ib_device_put(device); 1366 1367 return 0; 1368 1369 dev_cleanup: 1370 device_del(&device->dev); 1371 cg_cleanup: 1372 dev_set_uevent_suppress(&device->dev, false); 1373 ib_device_unregister_rdmacg(device); 1374 ib_cache_cleanup_one(device); 1375 return ret; 1376 } 1377 EXPORT_SYMBOL(ib_register_device); 1378 1379 /* Callers must hold a get on the device. */ 1380 static void __ib_unregister_device(struct ib_device *ib_dev) 1381 { 1382 /* 1383 * We have a registration lock so that all the calls to unregister are 1384 * fully fenced, once any unregister returns the device is truely 1385 * unregistered even if multiple callers are unregistering it at the 1386 * same time. This also interacts with the registration flow and 1387 * provides sane semantics if register and unregister are racing. 1388 */ 1389 mutex_lock(&ib_dev->unregistration_lock); 1390 if (!refcount_read(&ib_dev->refcount)) 1391 goto out; 1392 1393 disable_device(ib_dev); 1394 1395 /* Expedite removing unregistered pointers from the hash table */ 1396 free_netdevs(ib_dev); 1397 1398 ib_device_unregister_sysfs(ib_dev); 1399 device_del(&ib_dev->dev); 1400 ib_device_unregister_rdmacg(ib_dev); 1401 ib_cache_cleanup_one(ib_dev); 1402 1403 /* 1404 * Drivers using the new flow may not call ib_dealloc_device except 1405 * in error unwind prior to registration success. 1406 */ 1407 if (ib_dev->ops.dealloc_driver) { 1408 WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1); 1409 ib_dealloc_device(ib_dev); 1410 } 1411 out: 1412 mutex_unlock(&ib_dev->unregistration_lock); 1413 } 1414 1415 /** 1416 * ib_unregister_device - Unregister an IB device 1417 * @device: The device to unregister 1418 * 1419 * Unregister an IB device. All clients will receive a remove callback. 1420 * 1421 * Callers should call this routine only once, and protect against races with 1422 * registration. Typically it should only be called as part of a remove 1423 * callback in an implementation of driver core's struct device_driver and 1424 * related. 1425 * 1426 * If ops.dealloc_driver is used then ib_dev will be freed upon return from 1427 * this function. 1428 */ 1429 void ib_unregister_device(struct ib_device *ib_dev) 1430 { 1431 get_device(&ib_dev->dev); 1432 __ib_unregister_device(ib_dev); 1433 put_device(&ib_dev->dev); 1434 } 1435 EXPORT_SYMBOL(ib_unregister_device); 1436 1437 /** 1438 * ib_unregister_device_and_put - Unregister a device while holding a 'get' 1439 * device: The device to unregister 1440 * 1441 * This is the same as ib_unregister_device(), except it includes an internal 1442 * ib_device_put() that should match a 'get' obtained by the caller. 1443 * 1444 * It is safe to call this routine concurrently from multiple threads while 1445 * holding the 'get'. When the function returns the device is fully 1446 * unregistered. 1447 * 1448 * Drivers using this flow MUST use the driver_unregister callback to clean up 1449 * their resources associated with the device and dealloc it. 
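 *
 * A hedged sketch of the intended pattern: the caller holds a 'get'
 * obtained from one of the lookup helpers and hands that reference in
 * rather than dropping it first ("hypo0" is an illustrative name):
 *
 *	dev = ib_device_get_by_name("hypo0", RDMA_DRIVER_UNKNOWN);
 *	if (dev)
 *		ib_unregister_device_and_put(dev);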
1450 */ 1451 void ib_unregister_device_and_put(struct ib_device *ib_dev) 1452 { 1453 WARN_ON(!ib_dev->ops.dealloc_driver); 1454 get_device(&ib_dev->dev); 1455 ib_device_put(ib_dev); 1456 __ib_unregister_device(ib_dev); 1457 put_device(&ib_dev->dev); 1458 } 1459 EXPORT_SYMBOL(ib_unregister_device_and_put); 1460 1461 /** 1462 * ib_unregister_driver - Unregister all IB devices for a driver 1463 * @driver_id: The driver to unregister 1464 * 1465 * This implements a fence for device unregistration. It only returns once all 1466 * devices associated with the driver_id have fully completed their 1467 * unregistration and returned from ib_unregister_device*(). 1468 * 1469 * If device's are not yet unregistered it goes ahead and starts unregistering 1470 * them. 1471 * 1472 * This does not block creation of new devices with the given driver_id, that 1473 * is the responsibility of the caller. 1474 */ 1475 void ib_unregister_driver(enum rdma_driver_id driver_id) 1476 { 1477 struct ib_device *ib_dev; 1478 unsigned long index; 1479 1480 down_read(&devices_rwsem); 1481 xa_for_each (&devices, index, ib_dev) { 1482 if (ib_dev->driver_id != driver_id) 1483 continue; 1484 1485 get_device(&ib_dev->dev); 1486 up_read(&devices_rwsem); 1487 1488 WARN_ON(!ib_dev->ops.dealloc_driver); 1489 __ib_unregister_device(ib_dev); 1490 1491 put_device(&ib_dev->dev); 1492 down_read(&devices_rwsem); 1493 } 1494 up_read(&devices_rwsem); 1495 } 1496 EXPORT_SYMBOL(ib_unregister_driver); 1497 1498 static void ib_unregister_work(struct work_struct *work) 1499 { 1500 struct ib_device *ib_dev = 1501 container_of(work, struct ib_device, unregistration_work); 1502 1503 __ib_unregister_device(ib_dev); 1504 put_device(&ib_dev->dev); 1505 } 1506 1507 /** 1508 * ib_unregister_device_queued - Unregister a device using a work queue 1509 * device: The device to unregister 1510 * 1511 * This schedules an asynchronous unregistration using a WQ for the device. A 1512 * driver should use this to avoid holding locks while doing unregistration, 1513 * such as holding the RTNL lock. 1514 * 1515 * Drivers using this API must use ib_unregister_driver before module unload 1516 * to ensure that all scheduled unregistrations have completed. 1517 */ 1518 void ib_unregister_device_queued(struct ib_device *ib_dev) 1519 { 1520 WARN_ON(!refcount_read(&ib_dev->refcount)); 1521 WARN_ON(!ib_dev->ops.dealloc_driver); 1522 get_device(&ib_dev->dev); 1523 if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work)) 1524 put_device(&ib_dev->dev); 1525 } 1526 EXPORT_SYMBOL(ib_unregister_device_queued); 1527 1528 /* 1529 * The caller must pass in a device that has the kref held and the refcount 1530 * released. If the device is in cur_net and still registered then it is moved 1531 * into net. 1532 */ 1533 static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net, 1534 struct net *net) 1535 { 1536 int ret2 = -EINVAL; 1537 int ret; 1538 1539 mutex_lock(&device->unregistration_lock); 1540 1541 /* 1542 * If a device not under ib_device_get() or if the unregistration_lock 1543 * is not held, the namespace can be changed, or it can be unregistered. 1544 * Check again under the lock. 1545 */ 1546 if (refcount_read(&device->refcount) == 0 || 1547 !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) { 1548 ret = -ENODEV; 1549 goto out; 1550 } 1551 1552 kobject_uevent(&device->dev.kobj, KOBJ_REMOVE); 1553 disable_device(device); 1554 1555 /* 1556 * At this point no one can be using the device, so it is safe to 1557 * change the namespace. 
1558 */ 1559 write_pnet(&device->coredev.rdma_net, net); 1560 1561 down_read(&devices_rwsem); 1562 /* 1563 * Currently rdma devices are system wide unique. So the device name 1564 * is guaranteed free in the new namespace. Publish the new namespace 1565 * at the sysfs level. 1566 */ 1567 ret = device_rename(&device->dev, dev_name(&device->dev)); 1568 up_read(&devices_rwsem); 1569 if (ret) { 1570 dev_warn(&device->dev, 1571 "%s: Couldn't rename device after namespace change\n", 1572 __func__); 1573 /* Try and put things back and re-enable the device */ 1574 write_pnet(&device->coredev.rdma_net, cur_net); 1575 } 1576 1577 ret2 = enable_device_and_get(device); 1578 if (ret2) { 1579 /* 1580 * This shouldn't really happen, but if it does, let the user 1581 * retry at later point. So don't disable the device. 1582 */ 1583 dev_warn(&device->dev, 1584 "%s: Couldn't re-enable device after namespace change\n", 1585 __func__); 1586 } 1587 kobject_uevent(&device->dev.kobj, KOBJ_ADD); 1588 1589 ib_device_put(device); 1590 out: 1591 mutex_unlock(&device->unregistration_lock); 1592 if (ret) 1593 return ret; 1594 return ret2; 1595 } 1596 1597 int ib_device_set_netns_put(struct sk_buff *skb, 1598 struct ib_device *dev, u32 ns_fd) 1599 { 1600 struct net *net; 1601 int ret; 1602 1603 net = get_net_ns_by_fd(ns_fd); 1604 if (IS_ERR(net)) { 1605 ret = PTR_ERR(net); 1606 goto net_err; 1607 } 1608 1609 if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { 1610 ret = -EPERM; 1611 goto ns_err; 1612 } 1613 1614 /* 1615 * Currently supported only for those providers which support 1616 * disassociation and don't do port specific sysfs init. Once a 1617 * port_cleanup infrastructure is implemented, this limitation will be 1618 * removed. 1619 */ 1620 if (!dev->ops.disassociate_ucontext || dev->ops.init_port || 1621 ib_devices_shared_netns) { 1622 ret = -EOPNOTSUPP; 1623 goto ns_err; 1624 } 1625 1626 get_device(&dev->dev); 1627 ib_device_put(dev); 1628 ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net); 1629 put_device(&dev->dev); 1630 1631 put_net(net); 1632 return ret; 1633 1634 ns_err: 1635 put_net(net); 1636 net_err: 1637 ib_device_put(dev); 1638 return ret; 1639 } 1640 1641 static struct pernet_operations rdma_dev_net_ops = { 1642 .init = rdma_dev_init_net, 1643 .exit = rdma_dev_exit_net, 1644 .id = &rdma_dev_net_id, 1645 .size = sizeof(struct rdma_dev_net), 1646 }; 1647 1648 static int assign_client_id(struct ib_client *client) 1649 { 1650 int ret; 1651 1652 down_write(&clients_rwsem); 1653 /* 1654 * The add/remove callbacks must be called in FIFO/LIFO order. To 1655 * achieve this we assign client_ids so they are sorted in 1656 * registration order, and retain a linked list we can reverse iterate 1657 * to get the LIFO order. The extra linked list can go away if xarray 1658 * learns to reverse iterate. 
1659 */ 1660 if (list_empty(&client_list)) { 1661 client->client_id = 0; 1662 } else { 1663 struct ib_client *last; 1664 1665 last = list_last_entry(&client_list, struct ib_client, list); 1666 client->client_id = last->client_id + 1; 1667 } 1668 ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL); 1669 if (ret) 1670 goto out; 1671 1672 xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED); 1673 list_add_tail(&client->list, &client_list); 1674 1675 out: 1676 up_write(&clients_rwsem); 1677 return ret; 1678 } 1679 1680 /** 1681 * ib_register_client - Register an IB client 1682 * @client:Client to register 1683 * 1684 * Upper level users of the IB drivers can use ib_register_client() to 1685 * register callbacks for IB device addition and removal. When an IB 1686 * device is added, each registered client's add method will be called 1687 * (in the order the clients were registered), and when a device is 1688 * removed, each client's remove method will be called (in the reverse 1689 * order that clients were registered). In addition, when 1690 * ib_register_client() is called, the client will receive an add 1691 * callback for all devices already registered. 1692 */ 1693 int ib_register_client(struct ib_client *client) 1694 { 1695 struct ib_device *device; 1696 unsigned long index; 1697 int ret; 1698 1699 ret = assign_client_id(client); 1700 if (ret) 1701 return ret; 1702 1703 down_read(&devices_rwsem); 1704 xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) { 1705 ret = add_client_context(device, client); 1706 if (ret) { 1707 up_read(&devices_rwsem); 1708 ib_unregister_client(client); 1709 return ret; 1710 } 1711 } 1712 up_read(&devices_rwsem); 1713 return 0; 1714 } 1715 EXPORT_SYMBOL(ib_register_client); 1716 1717 /** 1718 * ib_unregister_client - Unregister an IB client 1719 * @client:Client to unregister 1720 * 1721 * Upper level users use ib_unregister_client() to remove their client 1722 * registration. When ib_unregister_client() is called, the client 1723 * will receive a remove callback for each IB device still registered. 1724 * 1725 * This is a full fence, once it returns no client callbacks will be called, 1726 * or are running in another thread. 1727 */ 1728 void ib_unregister_client(struct ib_client *client) 1729 { 1730 struct ib_device *device; 1731 unsigned long index; 1732 1733 down_write(&clients_rwsem); 1734 xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED); 1735 up_write(&clients_rwsem); 1736 /* 1737 * Every device still known must be serialized to make sure we are 1738 * done with the client callbacks before we return. 1739 */ 1740 down_read(&devices_rwsem); 1741 xa_for_each (&devices, index, device) 1742 remove_client_context(device, client->client_id); 1743 up_read(&devices_rwsem); 1744 1745 down_write(&clients_rwsem); 1746 list_del(&client->list); 1747 xa_erase(&clients, client->client_id); 1748 up_write(&clients_rwsem); 1749 } 1750 EXPORT_SYMBOL(ib_unregister_client); 1751 1752 /** 1753 * ib_set_client_data - Set IB client context 1754 * @device:Device to set context for 1755 * @client:Client to set context for 1756 * @data:Context to set 1757 * 1758 * ib_set_client_data() sets client context data that can be retrieved with 1759 * ib_get_client_data(). This can only be called while the client is 1760 * registered to the device, once the ib_client remove() callback returns this 1761 * cannot be called. 
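 *
 * A minimal, hedged client sketch tying this together with
 * ib_register_client(); struct hypo_ctx and the hypo_* names are
 * illustrative only:
 *
 *	static struct ib_client hypo_client;
 *
 *	static void hypo_add_one(struct ib_device *device)
 *	{
 *		struct hypo_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 *
 *		if (ctx)
 *			ib_set_client_data(device, &hypo_client, ctx);
 *	}
 *
 *	static void hypo_remove_one(struct ib_device *device, void *client_data)
 *	{
 *		kfree(client_data);
 *	}
 *
 *	static struct ib_client hypo_client = {
 *		.name	= "hypo",
 *		.add	= hypo_add_one,
 *		.remove	= hypo_remove_one,
 *	};
 *	...
 *	ib_register_client(&hypo_client);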
1762 */ 1763 void ib_set_client_data(struct ib_device *device, struct ib_client *client, 1764 void *data) 1765 { 1766 void *rc; 1767 1768 if (WARN_ON(IS_ERR(data))) 1769 data = NULL; 1770 1771 rc = xa_store(&device->client_data, client->client_id, data, 1772 GFP_KERNEL); 1773 WARN_ON(xa_is_err(rc)); 1774 } 1775 EXPORT_SYMBOL(ib_set_client_data); 1776 1777 /** 1778 * ib_register_event_handler - Register an IB event handler 1779 * @event_handler:Handler to register 1780 * 1781 * ib_register_event_handler() registers an event handler that will be 1782 * called back when asynchronous IB events occur (as defined in 1783 * chapter 11 of the InfiniBand Architecture Specification). This 1784 * callback may occur in interrupt context. 1785 */ 1786 void ib_register_event_handler(struct ib_event_handler *event_handler) 1787 { 1788 unsigned long flags; 1789 1790 spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); 1791 list_add_tail(&event_handler->list, 1792 &event_handler->device->event_handler_list); 1793 spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); 1794 } 1795 EXPORT_SYMBOL(ib_register_event_handler); 1796 1797 /** 1798 * ib_unregister_event_handler - Unregister an event handler 1799 * @event_handler:Handler to unregister 1800 * 1801 * Unregister an event handler registered with 1802 * ib_register_event_handler(). 1803 */ 1804 void ib_unregister_event_handler(struct ib_event_handler *event_handler) 1805 { 1806 unsigned long flags; 1807 1808 spin_lock_irqsave(&event_handler->device->event_handler_lock, flags); 1809 list_del(&event_handler->list); 1810 spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags); 1811 } 1812 EXPORT_SYMBOL(ib_unregister_event_handler); 1813 1814 /** 1815 * ib_dispatch_event - Dispatch an asynchronous event 1816 * @event:Event to dispatch 1817 * 1818 * Low-level drivers must call ib_dispatch_event() to dispatch the 1819 * event to all registered event handlers when an asynchronous event 1820 * occurs. 1821 */ 1822 void ib_dispatch_event(struct ib_event *event) 1823 { 1824 unsigned long flags; 1825 struct ib_event_handler *handler; 1826 1827 spin_lock_irqsave(&event->device->event_handler_lock, flags); 1828 1829 list_for_each_entry(handler, &event->device->event_handler_list, list) 1830 handler->handler(handler, event); 1831 1832 spin_unlock_irqrestore(&event->device->event_handler_lock, flags); 1833 } 1834 EXPORT_SYMBOL(ib_dispatch_event); 1835 1836 /** 1837 * ib_query_port - Query IB port attributes 1838 * @device:Device to query 1839 * @port_num:Port number to query 1840 * @port_attr:Port attributes 1841 * 1842 * ib_query_port() returns the attributes of a port through the 1843 * @port_attr pointer. 
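 *
 * A hedged usage sketch (hypo_port_up() is a hypothetical helper):
 *
 *	struct ib_port_attr attr;
 *	int err = ib_query_port(device, port_num, &attr);
 *
 *	if (!err && attr.state == IB_PORT_ACTIVE)
 *		hypo_port_up(device, port_num);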
1844 */ 1845 int ib_query_port(struct ib_device *device, 1846 u8 port_num, 1847 struct ib_port_attr *port_attr) 1848 { 1849 union ib_gid gid; 1850 int err; 1851 1852 if (!rdma_is_port_valid(device, port_num)) 1853 return -EINVAL; 1854 1855 memset(port_attr, 0, sizeof(*port_attr)); 1856 err = device->ops.query_port(device, port_num, port_attr); 1857 if (err || port_attr->subnet_prefix) 1858 return err; 1859 1860 if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND) 1861 return 0; 1862 1863 err = device->ops.query_gid(device, port_num, 0, &gid); 1864 if (err) 1865 return err; 1866 1867 port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix); 1868 return 0; 1869 } 1870 EXPORT_SYMBOL(ib_query_port); 1871 1872 static void add_ndev_hash(struct ib_port_data *pdata) 1873 { 1874 unsigned long flags; 1875 1876 might_sleep(); 1877 1878 spin_lock_irqsave(&ndev_hash_lock, flags); 1879 if (hash_hashed(&pdata->ndev_hash_link)) { 1880 hash_del_rcu(&pdata->ndev_hash_link); 1881 spin_unlock_irqrestore(&ndev_hash_lock, flags); 1882 /* 1883 * We cannot do hash_add_rcu after a hash_del_rcu until the 1884 * grace period 1885 */ 1886 synchronize_rcu(); 1887 spin_lock_irqsave(&ndev_hash_lock, flags); 1888 } 1889 if (pdata->netdev) 1890 hash_add_rcu(ndev_hash, &pdata->ndev_hash_link, 1891 (uintptr_t)pdata->netdev); 1892 spin_unlock_irqrestore(&ndev_hash_lock, flags); 1893 } 1894 1895 /** 1896 * ib_device_set_netdev - Associate the ib_dev with an underlying net_device 1897 * @ib_dev: Device to modify 1898 * @ndev: net_device to affiliate, may be NULL 1899 * @port: IB port the net_device is connected to 1900 * 1901 * Drivers should use this to link the ib_device to a netdev so the netdev 1902 * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be 1903 * affiliated with any port. 1904 * 1905 * The caller must ensure that the given ndev is not unregistered or 1906 * unregistering, and that either the ib_device is unregistered or 1907 * ib_device_set_netdev() is called with NULL when the ndev sends a 1908 * NETDEV_UNREGISTER event. 1909 */ 1910 int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev, 1911 unsigned int port) 1912 { 1913 struct net_device *old_ndev; 1914 struct ib_port_data *pdata; 1915 unsigned long flags; 1916 int ret; 1917 1918 /* 1919 * Drivers wish to call this before ib_register_driver, so we have to 1920 * setup the port data early. 
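 *
 * A hedged driver-side sketch of that early wiring (a single-port provider
 * with phys_port_cnt already set, calling this before ib_register_device();
 * the error label is illustrative):
 *
 *	ret = ib_device_set_netdev(ibdev, netdev, 1);
 *	if (ret)
 *		goto err_free;
 *	ret = ib_register_device(ibdev, "hypo%d");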
1921 */ 1922 ret = alloc_port_data(ib_dev); 1923 if (ret) 1924 return ret; 1925 1926 if (!rdma_is_port_valid(ib_dev, port)) 1927 return -EINVAL; 1928 1929 pdata = &ib_dev->port_data[port]; 1930 spin_lock_irqsave(&pdata->netdev_lock, flags); 1931 old_ndev = rcu_dereference_protected( 1932 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 1933 if (old_ndev == ndev) { 1934 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 1935 return 0; 1936 } 1937 1938 if (ndev) 1939 dev_hold(ndev); 1940 rcu_assign_pointer(pdata->netdev, ndev); 1941 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 1942 1943 add_ndev_hash(pdata); 1944 if (old_ndev) 1945 dev_put(old_ndev); 1946 1947 return 0; 1948 } 1949 EXPORT_SYMBOL(ib_device_set_netdev); 1950 1951 static void free_netdevs(struct ib_device *ib_dev) 1952 { 1953 unsigned long flags; 1954 unsigned int port; 1955 1956 if (!ib_dev->port_data) 1957 return; 1958 1959 rdma_for_each_port (ib_dev, port) { 1960 struct ib_port_data *pdata = &ib_dev->port_data[port]; 1961 struct net_device *ndev; 1962 1963 spin_lock_irqsave(&pdata->netdev_lock, flags); 1964 ndev = rcu_dereference_protected( 1965 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 1966 if (ndev) { 1967 spin_lock(&ndev_hash_lock); 1968 hash_del_rcu(&pdata->ndev_hash_link); 1969 spin_unlock(&ndev_hash_lock); 1970 1971 /* 1972 * If this is the last dev_put there is still a 1973 * synchronize_rcu before the netdev is kfreed, so we 1974 * can continue to rely on unlocked pointer 1975 * comparisons after the put 1976 */ 1977 rcu_assign_pointer(pdata->netdev, NULL); 1978 dev_put(ndev); 1979 } 1980 spin_unlock_irqrestore(&pdata->netdev_lock, flags); 1981 } 1982 } 1983 1984 struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, 1985 unsigned int port) 1986 { 1987 struct ib_port_data *pdata; 1988 struct net_device *res; 1989 1990 if (!rdma_is_port_valid(ib_dev, port)) 1991 return NULL; 1992 1993 pdata = &ib_dev->port_data[port]; 1994 1995 /* 1996 * New drivers should use ib_device_set_netdev() not the legacy 1997 * get_netdev(). 1998 */ 1999 if (ib_dev->ops.get_netdev) 2000 res = ib_dev->ops.get_netdev(ib_dev, port); 2001 else { 2002 spin_lock(&pdata->netdev_lock); 2003 res = rcu_dereference_protected( 2004 pdata->netdev, lockdep_is_held(&pdata->netdev_lock)); 2005 if (res) 2006 dev_hold(res); 2007 spin_unlock(&pdata->netdev_lock); 2008 } 2009 2010 /* 2011 * If we are starting to unregister expedite things by preventing 2012 * propagation of an unregistering netdev. 2013 */ 2014 if (res && res->reg_state != NETREG_REGISTERED) { 2015 dev_put(res); 2016 return NULL; 2017 } 2018 2019 return res; 2020 } 2021 2022 /** 2023 * ib_device_get_by_netdev - Find an IB device associated with a netdev 2024 * @ndev: netdev to locate 2025 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) 2026 * 2027 * Find and hold an ib_device that is associated with a netdev via 2028 * ib_device_set_netdev(). The caller must call ib_device_put() on the 2029 * returned pointer. 
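 *
 * A minimal usage sketch (illustrative only; the surrounding code is
 * assumed):
 *
 *	struct ib_device *ibdev;
 *
 *	ibdev = ib_device_get_by_netdev(ndev, RDMA_DRIVER_UNKNOWN);
 *	if (!ibdev)
 *		return -ENODEV;
 *	... use ibdev ...
 *	ib_device_put(ibdev);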
 */
struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
					   enum rdma_driver_id driver_id)
{
	struct ib_device *res = NULL;
	struct ib_port_data *cur;

	rcu_read_lock();
	hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link,
				    (uintptr_t)ndev) {
		if (rcu_access_pointer(cur->netdev) == ndev &&
		    (driver_id == RDMA_DRIVER_UNKNOWN ||
		     cur->ib_dev->driver_id == driver_id) &&
		    ib_device_try_get(cur->ib_dev)) {
			res = cur->ib_dev;
			break;
		}
	}
	rcu_read_unlock();

	return res;
}
EXPORT_SYMBOL(ib_device_get_by_netdev);

/**
 * ib_enum_roce_netdev - enumerate all RoCE ports
 * @ib_dev: IB device we want to query
 * @filter: Should we call the callback?
 * @filter_cookie: Cookie passed to filter
 * @cb: Callback to call for each found RoCE port
 * @cookie: Cookie passed back to the callback
 *
 * Enumerates the physical RoCE ports of ib_dev that are associated with a
 * netdevice and calls cb() on each port for which filter() returns
 * non-zero.
 */
void ib_enum_roce_netdev(struct ib_device *ib_dev,
			 roce_netdev_filter filter,
			 void *filter_cookie,
			 roce_netdev_callback cb,
			 void *cookie)
{
	unsigned int port;

	rdma_for_each_port (ib_dev, port)
		if (rdma_protocol_roce(ib_dev, port)) {
			struct net_device *idev =
				ib_device_get_netdev(ib_dev, port);

			if (filter(ib_dev, port, idev, filter_cookie))
				cb(ib_dev, port, idev, cookie);

			if (idev)
				dev_put(idev);
		}
}

/**
 * ib_enum_all_roce_netdevs - enumerate all RoCE devices
 * @filter: Should we call the callback?
 * @filter_cookie: Cookie passed to filter
 * @cb: Callback to call for each found RoCE port
 * @cookie: Cookie passed back to the callback
 *
 * Enumerates the physical RoCE ports of every registered device that are
 * associated with a netdevice and calls cb() on each port for which
 * filter() returns non-zero.
 */
void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
			      void *filter_cookie,
			      roce_netdev_callback cb,
			      void *cookie)
{
	struct ib_device *dev;
	unsigned long index;

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED)
		ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
	up_read(&devices_rwsem);
}

/**
 * ib_enum_all_devs - enumerate all ib_devices
 * @nldev_cb: Callback to call for each found ib_device
 * @skb: netlink message buffer the dump is written into
 * @cb: netlink callback context of the dump
 *
 * Enumerates all registered ib_devices that are accessible from the
 * caller's net namespace and calls nldev_cb() on each one.
 */
int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
		     struct netlink_callback *cb)
{
	unsigned long index;
	struct ib_device *dev;
	unsigned int idx = 0;
	int ret = 0;

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
		if (!rdma_dev_access_netns(dev, sock_net(skb->sk)))
			continue;

		ret = nldev_cb(dev, skb, cb, idx);
		if (ret)
			break;
		idx++;
	}
	up_read(&devices_rwsem);
	return ret;
}

/**
 * ib_query_pkey - Get P_Key table entry
 * @device:Device to query
 * @port_num:Port number to query
 * @index:P_Key table index to query
 * @pkey:Returned P_Key
 *
 * ib_query_pkey() fetches the specified P_Key table entry.
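 *
 * Illustrative use only (port 1 and index 0 are example values):
 *
 *	u16 pkey;
 *	int err;
 *
 *	err = ib_query_pkey(device, 1, 0, &pkey);
 *	if (!err)
 *		pr_debug("pkey[0] = 0x%04x\n", pkey);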
 */
int ib_query_pkey(struct ib_device *device,
		  u8 port_num, u16 index, u16 *pkey)
{
	if (!rdma_is_port_valid(device, port_num))
		return -EINVAL;

	return device->ops.query_pkey(device, port_num, index, pkey);
}
EXPORT_SYMBOL(ib_query_pkey);

/**
 * ib_modify_device - Change IB device attributes
 * @device:Device to modify
 * @device_modify_mask:Mask of attributes to change
 * @device_modify:New attribute values
 *
 * ib_modify_device() changes a device's attributes as specified by
 * the @device_modify_mask and @device_modify structure.
 */
int ib_modify_device(struct ib_device *device,
		     int device_modify_mask,
		     struct ib_device_modify *device_modify)
{
	if (!device->ops.modify_device)
		return -ENOSYS;

	return device->ops.modify_device(device, device_modify_mask,
					 device_modify);
}
EXPORT_SYMBOL(ib_modify_device);

/**
 * ib_modify_port - Modifies the attributes for the specified port.
 * @device: The device to modify.
 * @port_num: The number of the port to modify.
 * @port_modify_mask: Mask used to specify which attributes of the port
 *   to change.
 * @port_modify: New attribute values for the port.
 *
 * ib_modify_port() changes a port's attributes as specified by the
 * @port_modify_mask and @port_modify structure.
 */
int ib_modify_port(struct ib_device *device,
		   u8 port_num, int port_modify_mask,
		   struct ib_port_modify *port_modify)
{
	int rc;

	if (!rdma_is_port_valid(device, port_num))
		return -EINVAL;

	if (device->ops.modify_port)
		rc = device->ops.modify_port(device, port_num,
					     port_modify_mask,
					     port_modify);
	else
		rc = rdma_protocol_roce(device, port_num) ? 0 : -ENOSYS;
	return rc;
}
EXPORT_SYMBOL(ib_modify_port);

/**
 * ib_find_gid - Returns the port number and GID table index where
 *   a specified GID value occurs. It searches only ports with the IB
 *   link layer.
 * @device: The device to query.
 * @gid: The GID value to search for.
 * @port_num: The port number of the device where the GID value was found.
 * @index: The index into the GID table where the GID was found.  This
 *   parameter may be NULL.
 */
int ib_find_gid(struct ib_device *device, union ib_gid *gid,
		u8 *port_num, u16 *index)
{
	union ib_gid tmp_gid;
	unsigned int port;
	int ret, i;

	rdma_for_each_port (device, port) {
		if (!rdma_protocol_ib(device, port))
			continue;

		for (i = 0; i < device->port_data[port].immutable.gid_tbl_len;
		     ++i) {
			ret = rdma_query_gid(device, port, i, &tmp_gid);
			if (ret)
				return ret;
			if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
				*port_num = port;
				if (index)
					*index = i;
				return 0;
			}
		}
	}

	return -ENOENT;
}
EXPORT_SYMBOL(ib_find_gid);

/**
 * ib_find_pkey - Returns the PKey table index where a specified
 *   PKey value occurs.
 * @device: The device to query.
 * @port_num: The port number of the device to search for the PKey.
 * @pkey: The PKey value to search for.
 * @index: The index into the PKey table where the PKey was found.
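 *
 * A small illustrative sketch, looking up the default P_Key 0xffff on
 * port 1 (both values are assumptions for the example):
 *
 *	u16 index;
 *	int err;
 *
 *	err = ib_find_pkey(device, 1, 0xffff, &index);
 *	if (!err)
 *		pr_debug("default pkey found at index %u\n", index);
 *
 * The comparison ignores the membership bit (0x8000), and a full-member
 * entry is preferred over a limited-member one.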
 */
int ib_find_pkey(struct ib_device *device,
		 u8 port_num, u16 pkey, u16 *index)
{
	int ret, i;
	u16 tmp_pkey;
	int partial_ix = -1;

	for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len;
	     ++i) {
		ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
		if (ret)
			return ret;
		if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
			/* If there is a full-member pkey, take it. */
			if (tmp_pkey & 0x8000) {
				*index = i;
				return 0;
			}
			if (partial_ix < 0)
				partial_ix = i;
		}
	}

	/*
	 * No full-member pkey was found; take the limited-member one if it
	 * exists.
	 */
	if (partial_ix >= 0) {
		*index = partial_ix;
		return 0;
	}
	return -ENOENT;
}
EXPORT_SYMBOL(ib_find_pkey);

/**
 * ib_get_net_dev_by_params() - Return the appropriate net_dev
 * for a received CM request
 * @dev:	An RDMA device on which the request has been received.
 * @port:	Port number on the RDMA device.
 * @pkey:	The Pkey the request came on.
 * @gid:	A GID that the net_dev uses to communicate.
 * @addr:	Contains the IP address that the request specified as its
 *		destination.
 */
struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
					    u8 port,
					    u16 pkey,
					    const union ib_gid *gid,
					    const struct sockaddr *addr)
{
	struct net_device *net_dev = NULL;
	unsigned long index;
	void *client_data;

	if (!rdma_protocol_ib(dev, port))
		return NULL;

	/*
	 * Holding the read side guarantees that the client will not become
	 * unregistered while we are calling get_net_dev_by_params().
	 */
	down_read(&dev->client_data_rwsem);
	xan_for_each_marked (&dev->client_data, index, client_data,
			     CLIENT_DATA_REGISTERED) {
		struct ib_client *client = xa_load(&clients, index);

		if (!client || !client->get_net_dev_by_params)
			continue;

		net_dev = client->get_net_dev_by_params(dev, port, pkey, gid,
							addr, client_data);
		if (net_dev)
			break;
	}
	up_read(&dev->client_data_rwsem);

	return net_dev;
}
EXPORT_SYMBOL(ib_get_net_dev_by_params);

void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
{
	struct ib_device_ops *dev_ops = &dev->ops;

/*
 * Copy an op from @ops only if the device does not already have one, so
 * that an op a driver has set explicitly is never overwritten by a later,
 * more generic assignment.
 */
#define SET_DEVICE_OP(ptr, name) \
	do { \
		if (ops->name) \
			if (!((ptr)->name)) \
				(ptr)->name = ops->name; \
	} while (0)

#define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name)

	SET_DEVICE_OP(dev_ops, add_gid);
	SET_DEVICE_OP(dev_ops, advise_mr);
	SET_DEVICE_OP(dev_ops, alloc_dm);
	SET_DEVICE_OP(dev_ops, alloc_fmr);
	SET_DEVICE_OP(dev_ops, alloc_hw_stats);
	SET_DEVICE_OP(dev_ops, alloc_mr);
	SET_DEVICE_OP(dev_ops, alloc_mw);
	SET_DEVICE_OP(dev_ops, alloc_pd);
	SET_DEVICE_OP(dev_ops, alloc_rdma_netdev);
	SET_DEVICE_OP(dev_ops, alloc_ucontext);
	SET_DEVICE_OP(dev_ops, alloc_xrcd);
	SET_DEVICE_OP(dev_ops, attach_mcast);
	SET_DEVICE_OP(dev_ops, check_mr_status);
	SET_DEVICE_OP(dev_ops, create_ah);
	SET_DEVICE_OP(dev_ops, create_counters);
	SET_DEVICE_OP(dev_ops, create_cq);
	SET_DEVICE_OP(dev_ops, create_flow);
	SET_DEVICE_OP(dev_ops, create_flow_action_esp);
	SET_DEVICE_OP(dev_ops, create_qp);
	SET_DEVICE_OP(dev_ops, create_rwq_ind_table);
	SET_DEVICE_OP(dev_ops, create_srq);
	SET_DEVICE_OP(dev_ops, create_wq);
	SET_DEVICE_OP(dev_ops, dealloc_dm);
2370 SET_DEVICE_OP(dev_ops, dealloc_driver); 2371 SET_DEVICE_OP(dev_ops, dealloc_fmr); 2372 SET_DEVICE_OP(dev_ops, dealloc_mw); 2373 SET_DEVICE_OP(dev_ops, dealloc_pd); 2374 SET_DEVICE_OP(dev_ops, dealloc_ucontext); 2375 SET_DEVICE_OP(dev_ops, dealloc_xrcd); 2376 SET_DEVICE_OP(dev_ops, del_gid); 2377 SET_DEVICE_OP(dev_ops, dereg_mr); 2378 SET_DEVICE_OP(dev_ops, destroy_ah); 2379 SET_DEVICE_OP(dev_ops, destroy_counters); 2380 SET_DEVICE_OP(dev_ops, destroy_cq); 2381 SET_DEVICE_OP(dev_ops, destroy_flow); 2382 SET_DEVICE_OP(dev_ops, destroy_flow_action); 2383 SET_DEVICE_OP(dev_ops, destroy_qp); 2384 SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table); 2385 SET_DEVICE_OP(dev_ops, destroy_srq); 2386 SET_DEVICE_OP(dev_ops, destroy_wq); 2387 SET_DEVICE_OP(dev_ops, detach_mcast); 2388 SET_DEVICE_OP(dev_ops, disassociate_ucontext); 2389 SET_DEVICE_OP(dev_ops, drain_rq); 2390 SET_DEVICE_OP(dev_ops, drain_sq); 2391 SET_DEVICE_OP(dev_ops, enable_driver); 2392 SET_DEVICE_OP(dev_ops, fill_res_entry); 2393 SET_DEVICE_OP(dev_ops, get_dev_fw_str); 2394 SET_DEVICE_OP(dev_ops, get_dma_mr); 2395 SET_DEVICE_OP(dev_ops, get_hw_stats); 2396 SET_DEVICE_OP(dev_ops, get_link_layer); 2397 SET_DEVICE_OP(dev_ops, get_netdev); 2398 SET_DEVICE_OP(dev_ops, get_port_immutable); 2399 SET_DEVICE_OP(dev_ops, get_vector_affinity); 2400 SET_DEVICE_OP(dev_ops, get_vf_config); 2401 SET_DEVICE_OP(dev_ops, get_vf_stats); 2402 SET_DEVICE_OP(dev_ops, init_port); 2403 SET_DEVICE_OP(dev_ops, iw_accept); 2404 SET_DEVICE_OP(dev_ops, iw_add_ref); 2405 SET_DEVICE_OP(dev_ops, iw_connect); 2406 SET_DEVICE_OP(dev_ops, iw_create_listen); 2407 SET_DEVICE_OP(dev_ops, iw_destroy_listen); 2408 SET_DEVICE_OP(dev_ops, iw_get_qp); 2409 SET_DEVICE_OP(dev_ops, iw_reject); 2410 SET_DEVICE_OP(dev_ops, iw_rem_ref); 2411 SET_DEVICE_OP(dev_ops, map_mr_sg); 2412 SET_DEVICE_OP(dev_ops, map_phys_fmr); 2413 SET_DEVICE_OP(dev_ops, mmap); 2414 SET_DEVICE_OP(dev_ops, modify_ah); 2415 SET_DEVICE_OP(dev_ops, modify_cq); 2416 SET_DEVICE_OP(dev_ops, modify_device); 2417 SET_DEVICE_OP(dev_ops, modify_flow_action_esp); 2418 SET_DEVICE_OP(dev_ops, modify_port); 2419 SET_DEVICE_OP(dev_ops, modify_qp); 2420 SET_DEVICE_OP(dev_ops, modify_srq); 2421 SET_DEVICE_OP(dev_ops, modify_wq); 2422 SET_DEVICE_OP(dev_ops, peek_cq); 2423 SET_DEVICE_OP(dev_ops, poll_cq); 2424 SET_DEVICE_OP(dev_ops, post_recv); 2425 SET_DEVICE_OP(dev_ops, post_send); 2426 SET_DEVICE_OP(dev_ops, post_srq_recv); 2427 SET_DEVICE_OP(dev_ops, process_mad); 2428 SET_DEVICE_OP(dev_ops, query_ah); 2429 SET_DEVICE_OP(dev_ops, query_device); 2430 SET_DEVICE_OP(dev_ops, query_gid); 2431 SET_DEVICE_OP(dev_ops, query_pkey); 2432 SET_DEVICE_OP(dev_ops, query_port); 2433 SET_DEVICE_OP(dev_ops, query_qp); 2434 SET_DEVICE_OP(dev_ops, query_srq); 2435 SET_DEVICE_OP(dev_ops, rdma_netdev_get_params); 2436 SET_DEVICE_OP(dev_ops, read_counters); 2437 SET_DEVICE_OP(dev_ops, reg_dm_mr); 2438 SET_DEVICE_OP(dev_ops, reg_user_mr); 2439 SET_DEVICE_OP(dev_ops, req_ncomp_notif); 2440 SET_DEVICE_OP(dev_ops, req_notify_cq); 2441 SET_DEVICE_OP(dev_ops, rereg_user_mr); 2442 SET_DEVICE_OP(dev_ops, resize_cq); 2443 SET_DEVICE_OP(dev_ops, set_vf_guid); 2444 SET_DEVICE_OP(dev_ops, set_vf_link_state); 2445 SET_DEVICE_OP(dev_ops, unmap_fmr); 2446 2447 SET_OBJ_SIZE(dev_ops, ib_ah); 2448 SET_OBJ_SIZE(dev_ops, ib_pd); 2449 SET_OBJ_SIZE(dev_ops, ib_srq); 2450 SET_OBJ_SIZE(dev_ops, ib_ucontext); 2451 } 2452 EXPORT_SYMBOL(ib_set_device_ops); 2453 2454 static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = { 2455 [RDMA_NL_LS_OP_RESOLVE] = { 
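		/*
		 * These .doit handlers consume RDMA_NL_LS (local service)
		 * netlink messages sent by a userspace resolver (e.g. an SA
		 * path/address resolution daemon); RDMA_NL_ADMIN_PERM
		 * restricts them to callers with CAP_NET_ADMIN.
		 */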
		.doit = ib_nl_handle_resolve_resp,
		.flags = RDMA_NL_ADMIN_PERM,
	},
	[RDMA_NL_LS_OP_SET_TIMEOUT] = {
		.doit = ib_nl_handle_set_timeout,
		.flags = RDMA_NL_ADMIN_PERM,
	},
	[RDMA_NL_LS_OP_IP_RESOLVE] = {
		.doit = ib_nl_handle_ip_res_resp,
		.flags = RDMA_NL_ADMIN_PERM,
	},
};

static int __init ib_core_init(void)
{
	int ret;

	ib_wq = alloc_workqueue("infiniband", 0, 0);
	if (!ib_wq)
		return -ENOMEM;

	ib_comp_wq = alloc_workqueue("ib-comp-wq",
			WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
	if (!ib_comp_wq) {
		ret = -ENOMEM;
		goto err;
	}

	ib_comp_unbound_wq =
		alloc_workqueue("ib-comp-unb-wq",
				WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM |
				WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE);
	if (!ib_comp_unbound_wq) {
		ret = -ENOMEM;
		goto err_comp;
	}

	ret = class_register(&ib_class);
	if (ret) {
		pr_warn("Couldn't create InfiniBand device class\n");
		goto err_comp_unbound;
	}

	ret = rdma_nl_init();
	if (ret) {
		pr_warn("Couldn't init IB netlink interface: err %d\n", ret);
		goto err_sysfs;
	}

	ret = addr_init();
	if (ret) {
		pr_warn("Couldn't init IB address resolution\n");
		goto err_ibnl;
	}

	ret = ib_mad_init();
	if (ret) {
		pr_warn("Couldn't init IB MAD\n");
		goto err_addr;
	}

	ret = ib_sa_init();
	if (ret) {
		pr_warn("Couldn't init SA\n");
		goto err_mad;
	}

	ret = register_lsm_notifier(&ibdev_lsm_nb);
	if (ret) {
		pr_warn("Couldn't register LSM notifier. ret %d\n", ret);
		goto err_sa;
	}

	ret = register_pernet_device(&rdma_dev_net_ops);
	if (ret) {
		pr_warn("Couldn't init compat dev. ret %d\n", ret);
		goto err_compat;
	}

	nldev_init();
	rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
	roce_gid_mgmt_init();

	return 0;

err_compat:
	unregister_lsm_notifier(&ibdev_lsm_nb);
err_sa:
	ib_sa_cleanup();
err_mad:
	ib_mad_cleanup();
err_addr:
	addr_cleanup();
err_ibnl:
	rdma_nl_exit();
err_sysfs:
	class_unregister(&ib_class);
err_comp_unbound:
	destroy_workqueue(ib_comp_unbound_wq);
err_comp:
	destroy_workqueue(ib_comp_wq);
err:
	destroy_workqueue(ib_wq);
	return ret;
}

static void __exit ib_core_cleanup(void)
{
	roce_gid_mgmt_cleanup();
	nldev_exit();
	rdma_nl_unregister(RDMA_NL_LS);
	unregister_pernet_device(&rdma_dev_net_ops);
	unregister_lsm_notifier(&ibdev_lsm_nb);
	ib_sa_cleanup();
	ib_mad_cleanup();
	addr_cleanup();
	rdma_nl_exit();
	class_unregister(&ib_class);
	destroy_workqueue(ib_comp_unbound_wq);
	destroy_workqueue(ib_comp_wq);
	/* Make sure that any pending umem accounting work is done. */
	destroy_workqueue(ib_wq);
	flush_workqueue(system_unbound_wq);
	WARN_ON(!xa_empty(&clients));
	WARN_ON(!xa_empty(&devices));
}

MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4);

/*
 * ib_core relies on the netdev stack registering the net_ns_type_operations
 * ns kobject type before ib_core initialization.
 */
fs_initcall(ib_core_init);
module_exit(ib_core_cleanup);
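
/*
 * Illustrative only (not part of this file's API surface): a driver
 * normally fills a static const struct ib_device_ops and hands it to
 * ib_set_device_ops() before registering the device. The "foo" names
 * below are hypothetical.
 *
 *	static const struct ib_device_ops foo_dev_ops = {
 *		.query_port = foo_query_port,
 *		.query_pkey = foo_query_pkey,
 *		.get_link_layer = foo_get_link_layer,
 *	};
 *
 *	ib_set_device_ops(&foo->ib_dev, &foo_dev_ops);
 */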