/*
 * Copyright (c) 2004 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/module.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/netdevice.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/hashtable.h>
#include <rdma/rdma_netlink.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>

#include "core_priv.h"
#include "restrack.h"

MODULE_AUTHOR("Roland Dreier");
MODULE_DESCRIPTION("core kernel InfiniBand API");
MODULE_LICENSE("Dual BSD/GPL");

struct workqueue_struct *ib_comp_wq;
struct workqueue_struct *ib_comp_unbound_wq;
struct workqueue_struct *ib_wq;
EXPORT_SYMBOL_GPL(ib_wq);

/*
 * Each of the three rwsem locks (devices, clients, client_data) protects the
 * xarray of the same name. Specifically it allows the caller to assert that
 * the MARK will/will not be changing under the lock, and for devices and
 * clients, that the value in the xarray is still a valid pointer. Change of
 * the MARK is linked to the object state, so holding the lock and testing the
 * MARK also asserts that the contained object is in a certain state.
 *
 * This is used to build a two stage register/unregister flow where objects
 * can continue to be in the xarray even though they are still in progress to
 * register/unregister.
 *
 * The xarray itself provides additional locking, and restartable iteration,
 * which is also relied on.
 *
 * Locks should not be nested, with the exception of client_data, which is
 * allowed to nest under the read side of the other two locks.
 *
 * The devices_rwsem also protects the device name list; any change or
 * assignment of a device name must also hold the write side to guarantee
 * unique names.
 */

/*
 * devices contains devices that have had their names assigned. The
 * devices may not be registered.
 * Users that care about the registration status need to call
 * ib_device_try_get() on the device to ensure it is registered, and keep it
 * registered, for the required duration.
 *
 */
static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(devices_rwsem);
#define DEVICE_REGISTERED XA_MARK_1

static LIST_HEAD(client_list);
#define CLIENT_REGISTERED XA_MARK_1
static DEFINE_XARRAY_FLAGS(clients, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(clients_rwsem);

/*
 * If client_data is registered then the corresponding client must also still
 * be registered.
 */
#define CLIENT_DATA_REGISTERED XA_MARK_1
/*
 * xarray has this behavior where it won't iterate over NULL values stored in
 * allocated arrays. So we need our own iterator to see all values stored in
 * the array. This does the same thing as xa_for_each except that it also
 * returns NULL valued entries if the array is allocating. Simplified to only
 * work on simple xarrays.
 */
static void *xan_find_marked(struct xarray *xa, unsigned long *indexp,
			     xa_mark_t filter)
{
	XA_STATE(xas, xa, *indexp);
	void *entry;

	rcu_read_lock();
	do {
		entry = xas_find_marked(&xas, ULONG_MAX, filter);
		if (xa_is_zero(entry))
			break;
	} while (xas_retry(&xas, entry));
	rcu_read_unlock();

	if (entry) {
		*indexp = xas.xa_index;
		if (xa_is_zero(entry))
			return NULL;
		return entry;
	}
	return XA_ERROR(-ENOENT);
}
#define xan_for_each_marked(xa, index, entry, filter)                  \
	for (index = 0, entry = xan_find_marked(xa, &(index), filter); \
	     !xa_is_err(entry);                                        \
	     (index)++, entry = xan_find_marked(xa, &(index), filter))

/* RCU hash table mapping netdevice pointers to struct ib_port_data */
static DEFINE_SPINLOCK(ndev_hash_lock);
static DECLARE_HASHTABLE(ndev_hash, 5);

static void free_netdevs(struct ib_device *ib_dev);
static void ib_unregister_work(struct work_struct *work);
static void __ib_unregister_device(struct ib_device *device);
static int ib_security_change(struct notifier_block *nb, unsigned long event,
			      void *lsm_data);
static void ib_policy_change_task(struct work_struct *work);
static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task);

static struct notifier_block ibdev_lsm_nb = {
	.notifier_call = ib_security_change,
};

/* Pointer to the RCU head at the start of the ib_port_data array */
struct ib_port_data_rcu {
	struct rcu_head rcu_head;
	struct ib_port_data pdata[];
};

static int ib_device_check_mandatory(struct ib_device *device)
{
#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device_ops, x), #x }
	static const struct {
		size_t offset;
		char  *name;
	} mandatory_table[] = {
		IB_MANDATORY_FUNC(query_device),
		IB_MANDATORY_FUNC(query_port),
		IB_MANDATORY_FUNC(query_pkey),
		IB_MANDATORY_FUNC(alloc_pd),
		IB_MANDATORY_FUNC(dealloc_pd),
		IB_MANDATORY_FUNC(create_qp),
		IB_MANDATORY_FUNC(modify_qp),
		IB_MANDATORY_FUNC(destroy_qp),
		IB_MANDATORY_FUNC(post_send),
		IB_MANDATORY_FUNC(post_recv),
		IB_MANDATORY_FUNC(create_cq),
		IB_MANDATORY_FUNC(destroy_cq),
		IB_MANDATORY_FUNC(poll_cq),
		IB_MANDATORY_FUNC(req_notify_cq),
		IB_MANDATORY_FUNC(get_dma_mr),
		IB_MANDATORY_FUNC(dereg_mr),
		IB_MANDATORY_FUNC(get_port_immutable)
	};
	int i;

	device->kverbs_provider = true;
	for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
		if (!*(void **) ((void *) &device->ops +
				 mandatory_table[i].offset)) {
			device->kverbs_provider = false;
			break;
		}
	}

	return 0;
}

/*
 * Caller must perform ib_device_put() to return the device reference count
 * when ib_device_get_by_index() returns valid device pointer.
 */
struct ib_device *ib_device_get_by_index(u32 index)
{
	struct ib_device *device;

	down_read(&devices_rwsem);
	device = xa_load(&devices, index);
	if (device) {
		if (!ib_device_try_get(device))
			device = NULL;
	}
	up_read(&devices_rwsem);
	return device;
}

/**
 * ib_device_put - Release IB device reference
 * @device: device whose reference to be released
 *
 * ib_device_put() releases reference to the IB device to allow it to be
 * unregistered and eventually free.
 */
void ib_device_put(struct ib_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->unreg_completion);
}
EXPORT_SYMBOL(ib_device_put);

static struct ib_device *__ib_device_get_by_name(const char *name)
{
	struct ib_device *device;
	unsigned long index;

	xa_for_each (&devices, index, device)
		if (!strcmp(name, dev_name(&device->dev)))
			return device;

	return NULL;
}

/**
 * ib_device_get_by_name - Find an IB device by name
 * @name: The name to look for
 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
 *
 * Find and hold an ib_device by its name. The caller must call
 * ib_device_put() on the returned pointer.
 */
struct ib_device *ib_device_get_by_name(const char *name,
					enum rdma_driver_id driver_id)
{
	struct ib_device *device;

	down_read(&devices_rwsem);
	device = __ib_device_get_by_name(name);
	if (device && driver_id != RDMA_DRIVER_UNKNOWN &&
	    device->driver_id != driver_id)
		device = NULL;

	if (device) {
		if (!ib_device_try_get(device))
			device = NULL;
	}
	up_read(&devices_rwsem);
	return device;
}
EXPORT_SYMBOL(ib_device_get_by_name);

int ib_device_rename(struct ib_device *ibdev, const char *name)
{
	int ret;

	down_write(&devices_rwsem);
	if (!strcmp(name, dev_name(&ibdev->dev))) {
		ret = 0;
		goto out;
	}

	if (__ib_device_get_by_name(name)) {
		ret = -EEXIST;
		goto out;
	}

	ret = device_rename(&ibdev->dev, name);
	if (ret)
		goto out;
	strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
out:
	up_write(&devices_rwsem);
	return ret;
}

static int alloc_name(struct ib_device *ibdev, const char *name)
{
	struct ib_device *device;
	unsigned long index;
	struct ida inuse;
	int rc;
	int i;

	lockdep_assert_held_exclusive(&devices_rwsem);
	ida_init(&inuse);
	xa_for_each (&devices, index, device) {
		char buf[IB_DEVICE_NAME_MAX];

		if (sscanf(dev_name(&device->dev), name, &i) != 1)
			continue;
		if (i < 0 || i >= INT_MAX)
			continue;
		snprintf(buf, sizeof buf, name, i);
		if (strcmp(buf, dev_name(&device->dev)) != 0)
			continue;

		rc = ida_alloc_range(&inuse, i, i, GFP_KERNEL);
		if (rc < 0)
			goto out;
	}

	rc = ida_alloc(&inuse, GFP_KERNEL);
	if (rc < 0)
		goto out;

	rc = dev_set_name(&ibdev->dev, name, rc);
out:
	ida_destroy(&inuse);
	return rc;
}
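/*
 * Example: a minimal sketch of the get/put pattern that the kernel-doc for
 * ib_device_get_by_name() and ib_device_put() above describes. The device
 * name "mlx5_0" and this helper itself are purely illustrative.
 */
static void __maybe_unused example_hold_device_by_name(void)
{
	struct ib_device *device;

	device = ib_device_get_by_name("mlx5_0", RDMA_DRIVER_UNKNOWN);
	if (!device)
		return;

	/* The reference keeps the device registered while it is in use. */
	pr_info("found %s with %u port(s)\n", dev_name(&device->dev),
		device->phys_port_cnt);

	ib_device_put(device);
}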
static void ib_device_release(struct device *device)
{
	struct ib_device *dev = container_of(device, struct ib_device, dev);

	free_netdevs(dev);
	WARN_ON(refcount_read(&dev->refcount));
	ib_cache_release_one(dev);
	ib_security_release_port_pkey_list(dev);
	xa_destroy(&dev->client_data);
	if (dev->port_data)
		kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
				       pdata[0]),
			  rcu_head);
	kfree_rcu(dev, rcu_head);
}

static int ib_device_uevent(struct device *device,
			    struct kobj_uevent_env *env)
{
	if (add_uevent_var(env, "NAME=%s", dev_name(device)))
		return -ENOMEM;

	/*
	 * It would be nice to pass the node GUID with the event...
	 */

	return 0;
}

static struct class ib_class = {
	.name        = "infiniband",
	.dev_release = ib_device_release,
	.dev_uevent  = ib_device_uevent,
};

/**
 * _ib_alloc_device - allocate an IB device struct
 * @size:size of structure to allocate
 *
 * Low-level drivers should use ib_alloc_device() to allocate &struct
 * ib_device. @size is the size of the structure to be allocated,
 * including any private data used by the low-level driver.
 * ib_dealloc_device() must be used to free structures allocated with
 * ib_alloc_device().
 */
struct ib_device *_ib_alloc_device(size_t size)
{
	struct ib_device *device;

	if (WARN_ON(size < sizeof(struct ib_device)))
		return NULL;

	device = kzalloc(size, GFP_KERNEL);
	if (!device)
		return NULL;

	if (rdma_restrack_init(device)) {
		kfree(device);
		return NULL;
	}

	device->dev.class = &ib_class;
	device->groups[0] = &ib_dev_attr_group;
	device->dev.groups = device->groups;
	device_initialize(&device->dev);

	INIT_LIST_HEAD(&device->event_handler_list);
	spin_lock_init(&device->event_handler_lock);
	mutex_init(&device->unregistration_lock);
	/*
	 * client_data needs to be an allocating xarray because we don't want
	 * our mark to be destroyed if the user stores NULL in the client data.
	 */
	xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
	init_rwsem(&device->client_data_rwsem);
	INIT_LIST_HEAD(&device->port_list);
	init_completion(&device->unreg_completion);
	INIT_WORK(&device->unregistration_work, ib_unregister_work);

	return device;
}
EXPORT_SYMBOL(_ib_alloc_device);

/**
 * ib_dealloc_device - free an IB device struct
 * @device:structure to free
 *
 * Free a structure allocated with ib_alloc_device().
 */
void ib_dealloc_device(struct ib_device *device)
{
	if (device->ops.dealloc_driver)
		device->ops.dealloc_driver(device);

	/*
	 * ib_unregister_driver() requires all devices to remain in the xarray
	 * while their ops are callable. The last op we call is dealloc_driver
	 * above. This is needed to create a fence on op callbacks prior to
	 * allowing the driver module to unload.
	 */
	down_write(&devices_rwsem);
	if (xa_load(&devices, device->index) == device)
		xa_erase(&devices, device->index);
	up_write(&devices_rwsem);

	/* Expedite releasing netdev references */
	free_netdevs(device);

	WARN_ON(!xa_empty(&device->client_data));
	WARN_ON(refcount_read(&device->refcount));
	rdma_restrack_clean(device);
	/* Balances with device_initialize */
	put_device(&device->dev);
}
EXPORT_SYMBOL(ib_dealloc_device);

/*
 * add_client_context() and remove_client_context() must be safe against
 * parallel calls on the same device - registration/unregistration of both the
 * device and client can be occurring in parallel.
 *
 * The routines need to be a fence; any caller must not return until the add
 * or remove is fully completed.
 */
static int add_client_context(struct ib_device *device,
			      struct ib_client *client)
{
	int ret = 0;

	if (!device->kverbs_provider && !client->no_kverbs_req)
		return 0;

	down_write(&device->client_data_rwsem);
	/*
	 * Another caller to add_client_context got here first and has already
	 * completely initialized context.
	 */
	if (xa_get_mark(&device->client_data, client->client_id,
			CLIENT_DATA_REGISTERED))
		goto out;

	ret = xa_err(xa_store(&device->client_data, client->client_id, NULL,
			      GFP_KERNEL));
	if (ret)
		goto out;
	downgrade_write(&device->client_data_rwsem);
	if (client->add)
		client->add(device);

	/* Readers shall not see a client until add has been completed */
	xa_set_mark(&device->client_data, client->client_id,
		    CLIENT_DATA_REGISTERED);
	up_read(&device->client_data_rwsem);
	return 0;

out:
	up_write(&device->client_data_rwsem);
	return ret;
}

static void remove_client_context(struct ib_device *device,
				  unsigned int client_id)
{
	struct ib_client *client;
	void *client_data;

	down_write(&device->client_data_rwsem);
	if (!xa_get_mark(&device->client_data, client_id,
			 CLIENT_DATA_REGISTERED)) {
		up_write(&device->client_data_rwsem);
		return;
	}
	client_data = xa_load(&device->client_data, client_id);
	xa_clear_mark(&device->client_data, client_id, CLIENT_DATA_REGISTERED);
	client = xa_load(&clients, client_id);
	downgrade_write(&device->client_data_rwsem);

	/*
	 * Notice we cannot be holding any exclusive locks when calling the
	 * remove callback as the remove callback can recurse back into any
	 * public functions in this module and thus try for any locks those
	 * functions take.
	 *
	 * For this reason clients and drivers should not call the
	 * unregistration functions while holding any locks.
	 *
	 * It is tempting to drop the client_data_rwsem too, but this is
	 * required to ensure that unregister_client does not return until all
	 * clients are completely unregistered, which is required to avoid
	 * module unloading races.
	 */
	if (client->remove)
		client->remove(device, client_data);

	xa_erase(&device->client_data, client_id);
	up_read(&device->client_data_rwsem);
}

static int alloc_port_data(struct ib_device *device)
{
	struct ib_port_data_rcu *pdata_rcu;
	unsigned int port;

	if (device->port_data)
		return 0;

	/* This can only be called once the physical port range is defined */
	if (WARN_ON(!device->phys_port_cnt))
		return -EINVAL;

	/*
	 * device->port_data is indexed directly by the port number to make
	 * access to this data as efficient as possible.
	 *
	 * Therefore port_data is declared as a 1 based array with potential
	 * empty slots at the beginning.
	 */
	pdata_rcu = kzalloc(struct_size(pdata_rcu, pdata,
					rdma_end_port(device) + 1),
			    GFP_KERNEL);
	if (!pdata_rcu)
		return -ENOMEM;
	/*
	 * The rcu_head is put in front of the port data array and the stored
	 * pointer is adjusted since we never need to see that member until
	 * kfree_rcu.
	 */
	device->port_data = pdata_rcu->pdata;

	rdma_for_each_port (device, port) {
		struct ib_port_data *pdata = &device->port_data[port];

		pdata->ib_dev = device;
		spin_lock_init(&pdata->pkey_list_lock);
		INIT_LIST_HEAD(&pdata->pkey_list);
		spin_lock_init(&pdata->netdev_lock);
		INIT_HLIST_NODE(&pdata->ndev_hash_link);
	}
	return 0;
}

static int verify_immutable(const struct ib_device *dev, u8 port)
{
	return WARN_ON(!rdma_cap_ib_mad(dev, port) &&
		       rdma_max_mad_size(dev, port) != 0);
}

static int setup_port_data(struct ib_device *device)
{
	unsigned int port;
	int ret;

	ret = alloc_port_data(device);
	if (ret)
		return ret;

	rdma_for_each_port (device, port) {
		struct ib_port_data *pdata = &device->port_data[port];

		ret = device->ops.get_port_immutable(device, port,
						     &pdata->immutable);
		if (ret)
			return ret;

		if (verify_immutable(device, port))
			return -EINVAL;
	}
	return 0;
}

void ib_get_device_fw_str(struct ib_device *dev, char *str)
{
	if (dev->ops.get_dev_fw_str)
		dev->ops.get_dev_fw_str(dev, str);
	else
		str[0] = '\0';
}
EXPORT_SYMBOL(ib_get_device_fw_str);

static void ib_policy_change_task(struct work_struct *work)
{
	struct ib_device *dev;
	unsigned long index;

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
		unsigned int i;

		rdma_for_each_port (dev, i) {
			u64 sp;
			int ret = ib_get_cached_subnet_prefix(dev,
							      i,
							      &sp);

			WARN_ONCE(ret,
				  "ib_get_cached_subnet_prefix err: %d, this should never happen here\n",
				  ret);
			if (!ret)
				ib_security_cache_change(dev, i, sp);
		}
	}
	up_read(&devices_rwsem);
}

static int ib_security_change(struct notifier_block *nb, unsigned long event,
			      void *lsm_data)
{
	if (event != LSM_POLICY_CHANGE)
		return NOTIFY_DONE;

	schedule_work(&ib_policy_change_work);
	ib_mad_agent_security_change();

	return NOTIFY_OK;
}

/*
 * Assign the unique string device name and the unique device index. This is
 * undone by ib_dealloc_device.
 */
static int assign_name(struct ib_device *device, const char *name)
{
	static u32 last_id;
	int ret;

	down_write(&devices_rwsem);
	/* Assign a unique name to the device */
	if (strchr(name, '%'))
		ret = alloc_name(device, name);
	else
		ret = dev_set_name(&device->dev, name);
	if (ret)
		goto out;

	if (__ib_device_get_by_name(dev_name(&device->dev))) {
		ret = -ENFILE;
		goto out;
	}
	strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);

	ret = xa_alloc_cyclic(&devices, &device->index, device, xa_limit_31b,
			      &last_id, GFP_KERNEL);
	if (ret > 0)
		ret = 0;

out:
	up_write(&devices_rwsem);
	return ret;
}

static void setup_dma_device(struct ib_device *device)
{
	struct device *parent = device->dev.parent;

	WARN_ON_ONCE(device->dma_device);
	if (device->dev.dma_ops) {
		/*
		 * The caller provided custom DMA operations. Copy the
		 * DMA-related fields that are used by e.g. dma_alloc_coherent()
		 * into device->dev.
		 */
		device->dma_device = &device->dev;
		if (!device->dev.dma_mask) {
			if (parent)
				device->dev.dma_mask = parent->dma_mask;
			else
				WARN_ON_ONCE(true);
		}
		if (!device->dev.coherent_dma_mask) {
			if (parent)
				device->dev.coherent_dma_mask =
					parent->coherent_dma_mask;
			else
				WARN_ON_ONCE(true);
		}
	} else {
		/*
		 * The caller did not provide custom DMA operations. Use the
		 * DMA mapping operations of the parent device.
		 */
		WARN_ON_ONCE(!parent);
		device->dma_device = parent;
	}
}

/*
 * setup_device() allocates memory and sets up data that requires calling the
 * device ops; this is the only reason these actions are not done during
 * ib_alloc_device. It is undone by ib_dealloc_device().
 */
static int setup_device(struct ib_device *device)
{
	struct ib_udata uhw = {.outlen = 0, .inlen = 0};
	int ret;

	setup_dma_device(device);

	ret = ib_device_check_mandatory(device);
	if (ret)
		return ret;

	ret = setup_port_data(device);
	if (ret) {
		dev_warn(&device->dev, "Couldn't create per-port data\n");
		return ret;
	}

	memset(&device->attrs, 0, sizeof(device->attrs));
	ret = device->ops.query_device(device, &device->attrs, &uhw);
	if (ret) {
		dev_warn(&device->dev,
			 "Couldn't query the device attributes\n");
		return ret;
	}

	return 0;
}

static void disable_device(struct ib_device *device)
{
	struct ib_client *client;

	WARN_ON(!refcount_read(&device->refcount));

	down_write(&devices_rwsem);
	xa_clear_mark(&devices, device->index, DEVICE_REGISTERED);
	up_write(&devices_rwsem);

	down_read(&clients_rwsem);
	list_for_each_entry_reverse(client, &client_list, list)
		remove_client_context(device, client->client_id);
	up_read(&clients_rwsem);

	/* Pairs with refcount_set in enable_device */
	ib_device_put(device);
	wait_for_completion(&device->unreg_completion);

	/* Expedite removing unregistered pointers from the hash table */
	free_netdevs(device);
}

/*
 * An enabled device is visible to all clients and to all the public facing
 * APIs that return a device pointer. This always returns with a new get, even
 * if it fails.
 */
static int enable_device_and_get(struct ib_device *device)
{
	struct ib_client *client;
	unsigned long index;
	int ret = 0;

	/*
	 * One ref belongs to the xa and the other belongs to this
	 * thread. This is needed to guard against parallel unregistration.
	 */
	refcount_set(&device->refcount, 2);
	down_write(&devices_rwsem);
	xa_set_mark(&devices, device->index, DEVICE_REGISTERED);

	/*
	 * By using downgrade_write() we ensure that no other thread can clear
	 * DEVICE_REGISTERED while we are completing the client setup.
	 */
	downgrade_write(&devices_rwsem);

	if (device->ops.enable_driver) {
		ret = device->ops.enable_driver(device);
		if (ret)
			goto out;
	}

	down_read(&clients_rwsem);
	xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
		ret = add_client_context(device, client);
		if (ret)
			break;
	}
	up_read(&clients_rwsem);

out:
	up_read(&devices_rwsem);
	return ret;
}

/**
 * ib_register_device - Register an IB device with IB core
 * @device:Device to register
 *
 * Low-level drivers use ib_register_device() to register their
 * devices with the IB core. All registered clients will receive a
 * callback for each device that is added. @device must be allocated
 * with ib_alloc_device().
 *
 * If the driver uses ops.dealloc_driver and calls any ib_unregister_device()
 * asynchronously then the device pointer may become freed as soon as this
 * function returns.
 */
int ib_register_device(struct ib_device *device, const char *name)
{
	int ret;

	ret = assign_name(device, name);
	if (ret)
		return ret;

	ret = setup_device(device);
	if (ret)
		return ret;

	ret = ib_cache_setup_one(device);
	if (ret) {
		dev_warn(&device->dev,
			 "Couldn't set up InfiniBand P_Key/GID cache\n");
		return ret;
	}

	ib_device_register_rdmacg(device);

	ret = device_add(&device->dev);
	if (ret)
		goto cg_cleanup;

	ret = ib_device_register_sysfs(device);
	if (ret) {
		dev_warn(&device->dev,
			 "Couldn't register device with driver model\n");
		goto dev_cleanup;
	}

	ret = enable_device_and_get(device);
	if (ret) {
		void (*dealloc_fn)(struct ib_device *);

		/*
		 * If we hit this error flow then we don't want to
		 * automatically dealloc the device since the caller is
		 * expected to call ib_dealloc_device() after
		 * ib_register_device() fails. This is tricky due to the
		 * possibility for a parallel unregistration along with this
		 * error flow. Since we have a refcount here we know any
		 * parallel flow is stopped in disable_device and will see the
		 * NULL pointers, causing the responsibility to
		 * ib_dealloc_device() to revert back to this thread.
		 */
		dealloc_fn = device->ops.dealloc_driver;
		device->ops.dealloc_driver = NULL;
		ib_device_put(device);
		__ib_unregister_device(device);
		device->ops.dealloc_driver = dealloc_fn;
		return ret;
	}
	ib_device_put(device);

	return 0;

dev_cleanup:
	device_del(&device->dev);
cg_cleanup:
	ib_device_unregister_rdmacg(device);
	ib_cache_cleanup_one(device);
	return ret;
}
EXPORT_SYMBOL(ib_register_device);
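/*
 * Example: a minimal sketch of the register/unregister lifecycle from a
 * low-level driver's point of view. "struct example_dev", example_dev_ops
 * and the "example%d" name are hypothetical, and the ib_alloc_device()
 * convenience macro from <rdma/ib_verbs.h> is assumed to wrap
 * _ib_alloc_device() as usual.
 */
struct example_dev {
	struct ib_device ibdev;
	/* driver private state would follow */
};

/* Hypothetical ops table; assumed to supply at least the mandatory verbs. */
static const struct ib_device_ops example_dev_ops;

static int __maybe_unused example_probe(void)
{
	struct example_dev *edev;
	int ret;

	edev = ib_alloc_device(example_dev, ibdev);
	if (!edev)
		return -ENOMEM;

	ib_set_device_ops(&edev->ibdev, &example_dev_ops);
	/* setup_port_data() requires the port range to be known */
	edev->ibdev.phys_port_cnt = 1;

	/* A '%d' in the name makes assign_name() pick a free index */
	ret = ib_register_device(&edev->ibdev, "example%d");
	if (ret) {
		/* On failure the caller still owns the allocation */
		ib_dealloc_device(&edev->ibdev);
		return ret;
	}
	return 0;
}

static void __maybe_unused example_remove(struct example_dev *edev)
{
	/*
	 * Without ops.dealloc_driver the driver frees the struct itself;
	 * with it, ib_unregister_device() would also dealloc the device.
	 */
	ib_unregister_device(&edev->ibdev);
	ib_dealloc_device(&edev->ibdev);
}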
/* Callers must hold a get on the device. */
static void __ib_unregister_device(struct ib_device *ib_dev)
{
	/*
	 * We have a registration lock so that all the calls to unregister are
	 * fully fenced; once any unregister returns the device is truly
	 * unregistered even if multiple callers are unregistering it at the
	 * same time. This also interacts with the registration flow and
	 * provides sane semantics if register and unregister are racing.
	 */
	mutex_lock(&ib_dev->unregistration_lock);
	if (!refcount_read(&ib_dev->refcount))
		goto out;

	disable_device(ib_dev);
	ib_device_unregister_sysfs(ib_dev);
	device_del(&ib_dev->dev);
	ib_device_unregister_rdmacg(ib_dev);
	ib_cache_cleanup_one(ib_dev);

	/*
	 * Drivers using the new flow may not call ib_dealloc_device except
	 * in error unwind prior to registration success.
	 */
	if (ib_dev->ops.dealloc_driver) {
		WARN_ON(kref_read(&ib_dev->dev.kobj.kref) <= 1);
		ib_dealloc_device(ib_dev);
	}
out:
	mutex_unlock(&ib_dev->unregistration_lock);
}

/**
 * ib_unregister_device - Unregister an IB device
 * @ib_dev: The device to unregister
 *
 * Unregister an IB device. All clients will receive a remove callback.
 *
 * Callers should call this routine only once, and protect against races with
 * registration. Typically it should only be called as part of a remove
 * callback in an implementation of driver core's struct device_driver and
 * related.
 *
 * If ops.dealloc_driver is used then ib_dev will be freed upon return from
 * this function.
 */
void ib_unregister_device(struct ib_device *ib_dev)
{
	get_device(&ib_dev->dev);
	__ib_unregister_device(ib_dev);
	put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device);

/**
 * ib_unregister_device_and_put - Unregister a device while holding a 'get'
 * @ib_dev: The device to unregister
 *
 * This is the same as ib_unregister_device(), except it includes an internal
 * ib_device_put() that should match a 'get' obtained by the caller.
 *
 * It is safe to call this routine concurrently from multiple threads while
 * holding the 'get'. When the function returns the device is fully
 * unregistered.
 *
 * Drivers using this flow MUST use the driver_unregister callback to clean up
 * their resources associated with the device and dealloc it.
 */
void ib_unregister_device_and_put(struct ib_device *ib_dev)
{
	WARN_ON(!ib_dev->ops.dealloc_driver);
	get_device(&ib_dev->dev);
	ib_device_put(ib_dev);
	__ib_unregister_device(ib_dev);
	put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device_and_put);

/**
 * ib_unregister_driver - Unregister all IB devices for a driver
 * @driver_id: The driver to unregister
 *
 * This implements a fence for device unregistration. It only returns once all
 * devices associated with the driver_id have fully completed their
 * unregistration and returned from ib_unregister_device*().
 *
 * If devices are not yet unregistered it goes ahead and starts unregistering
 * them.
 *
 * This does not block creation of new devices with the given driver_id, that
 * is the responsibility of the caller.
 */
void ib_unregister_driver(enum rdma_driver_id driver_id)
{
	struct ib_device *ib_dev;
	unsigned long index;

	down_read(&devices_rwsem);
	xa_for_each (&devices, index, ib_dev) {
		if (ib_dev->driver_id != driver_id)
			continue;

		get_device(&ib_dev->dev);
		up_read(&devices_rwsem);

		WARN_ON(!ib_dev->ops.dealloc_driver);
		__ib_unregister_device(ib_dev);

		put_device(&ib_dev->dev);
		down_read(&devices_rwsem);
	}
	up_read(&devices_rwsem);
}
EXPORT_SYMBOL(ib_unregister_driver);

static void ib_unregister_work(struct work_struct *work)
{
	struct ib_device *ib_dev =
		container_of(work, struct ib_device, unregistration_work);

	__ib_unregister_device(ib_dev);
	put_device(&ib_dev->dev);
}

/**
 * ib_unregister_device_queued - Unregister a device using a work queue
 * @ib_dev: The device to unregister
 *
 * This schedules an asynchronous unregistration using a WQ for the device. A
 * driver should use this to avoid holding locks while doing unregistration,
 * such as holding the RTNL lock.
 *
 * Drivers using this API must use ib_unregister_driver before module unload
 * to ensure that all scheduled unregistrations have completed.
 */
void ib_unregister_device_queued(struct ib_device *ib_dev)
{
	WARN_ON(!refcount_read(&ib_dev->refcount));
	WARN_ON(!ib_dev->ops.dealloc_driver);
	get_device(&ib_dev->dev);
	if (!queue_work(system_unbound_wq, &ib_dev->unregistration_work))
		put_device(&ib_dev->dev);
}
EXPORT_SYMBOL(ib_unregister_device_queued);

static int assign_client_id(struct ib_client *client)
{
	int ret;

	down_write(&clients_rwsem);
	/*
	 * The add/remove callbacks must be called in FIFO/LIFO order. To
	 * achieve this we assign client_ids so they are sorted in
	 * registration order, and retain a linked list we can reverse iterate
	 * to get the LIFO order. The extra linked list can go away if xarray
	 * learns to reverse iterate.
	 */
	if (list_empty(&client_list)) {
		client->client_id = 0;
	} else {
		struct ib_client *last;

		last = list_last_entry(&client_list, struct ib_client, list);
		client->client_id = last->client_id + 1;
	}
	ret = xa_insert(&clients, client->client_id, client, GFP_KERNEL);
	if (ret)
		goto out;

	xa_set_mark(&clients, client->client_id, CLIENT_REGISTERED);
	list_add_tail(&client->list, &client_list);

out:
	up_write(&clients_rwsem);
	return ret;
}

/**
 * ib_register_client - Register an IB client
 * @client:Client to register
 *
 * Upper level users of the IB drivers can use ib_register_client() to
 * register callbacks for IB device addition and removal. When an IB
 * device is added, each registered client's add method will be called
 * (in the order the clients were registered), and when a device is
 * removed, each client's remove method will be called (in the reverse
 * order that clients were registered). In addition, when
 * ib_register_client() is called, the client will receive an add
 * callback for all devices already registered.
 */
int ib_register_client(struct ib_client *client)
{
	struct ib_device *device;
	unsigned long index;
	int ret;

	ret = assign_client_id(client);
	if (ret)
		return ret;

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, index, device, DEVICE_REGISTERED) {
		ret = add_client_context(device, client);
		if (ret) {
			up_read(&devices_rwsem);
			ib_unregister_client(client);
			return ret;
		}
	}
	up_read(&devices_rwsem);
	return 0;
}
EXPORT_SYMBOL(ib_register_client);

/**
 * ib_unregister_client - Unregister an IB client
 * @client:Client to unregister
 *
 * Upper level users use ib_unregister_client() to remove their client
 * registration. When ib_unregister_client() is called, the client
 * will receive a remove callback for each IB device still registered.
 *
 * This is a full fence; once it returns no client callbacks will be called,
 * or be running in another thread.
 */
void ib_unregister_client(struct ib_client *client)
{
	struct ib_device *device;
	unsigned long index;

	down_write(&clients_rwsem);
	xa_clear_mark(&clients, client->client_id, CLIENT_REGISTERED);
	up_write(&clients_rwsem);
	/*
	 * Every device still known must be serialized to make sure we are
	 * done with the client callbacks before we return.
	 */
	down_read(&devices_rwsem);
	xa_for_each (&devices, index, device)
		remove_client_context(device, client->client_id);
	up_read(&devices_rwsem);

	down_write(&clients_rwsem);
	list_del(&client->list);
	xa_erase(&clients, client->client_id);
	up_write(&clients_rwsem);
}
EXPORT_SYMBOL(ib_unregister_client);

/**
 * ib_set_client_data - Set IB client context
 * @device:Device to set context for
 * @client:Client to set context for
 * @data:Context to set
 *
 * ib_set_client_data() sets client context data that can be retrieved with
 * ib_get_client_data(). This can only be called while the client is
 * registered to the device, once the ib_client remove() callback returns this
 * cannot be called.
 */
void ib_set_client_data(struct ib_device *device, struct ib_client *client,
			void *data)
{
	void *rc;

	if (WARN_ON(IS_ERR(data)))
		data = NULL;

	rc = xa_store(&device->client_data, client->client_id, data,
		      GFP_KERNEL);
	WARN_ON(xa_is_err(rc));
}
EXPORT_SYMBOL(ib_set_client_data);
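/*
 * Example: a minimal sketch of an ib_client that pairs the add/remove
 * callbacks with ib_set_client_data()/ib_get_client_data(). The "example"
 * client and its context structure are hypothetical; a module would call
 * ib_register_client(&example_client) from its init function and
 * ib_unregister_client(&example_client) from its exit function.
 */
struct example_client_ctx {
	struct ib_device *dev;
};

static void example_client_add(struct ib_device *device);
static void example_client_remove(struct ib_device *device, void *client_data);

static struct ib_client example_client = {
	.name	= "example",
	.add	= example_client_add,
	.remove	= example_client_remove,
};

static void example_client_add(struct ib_device *device)
{
	struct example_client_ctx *ctx;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return;

	ctx->dev = device;
	/* Stored per (device, client); readable via ib_get_client_data() */
	ib_set_client_data(device, &example_client, ctx);
}

static void example_client_remove(struct ib_device *device, void *client_data)
{
	/* client_data is whatever add() stored; NULL if allocation failed */
	kfree(client_data);
}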
/**
 * ib_register_event_handler - Register an IB event handler
 * @event_handler:Handler to register
 *
 * ib_register_event_handler() registers an event handler that will be
 * called back when asynchronous IB events occur (as defined in
 * chapter 11 of the InfiniBand Architecture Specification). This
 * callback may occur in interrupt context.
 */
void ib_register_event_handler(struct ib_event_handler *event_handler)
{
	unsigned long flags;

	spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
	list_add_tail(&event_handler->list,
		      &event_handler->device->event_handler_list);
	spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
}
EXPORT_SYMBOL(ib_register_event_handler);

/**
 * ib_unregister_event_handler - Unregister an event handler
 * @event_handler:Handler to unregister
 *
 * Unregister an event handler registered with
 * ib_register_event_handler().
 */
void ib_unregister_event_handler(struct ib_event_handler *event_handler)
{
	unsigned long flags;

	spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
	list_del(&event_handler->list);
	spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
}
EXPORT_SYMBOL(ib_unregister_event_handler);

/**
 * ib_dispatch_event - Dispatch an asynchronous event
 * @event:Event to dispatch
 *
 * Low-level drivers must call ib_dispatch_event() to dispatch the
 * event to all registered event handlers when an asynchronous event
 * occurs.
 */
void ib_dispatch_event(struct ib_event *event)
{
	unsigned long flags;
	struct ib_event_handler *handler;

	spin_lock_irqsave(&event->device->event_handler_lock, flags);

	list_for_each_entry(handler, &event->device->event_handler_list, list)
		handler->handler(handler, event);

	spin_unlock_irqrestore(&event->device->event_handler_lock, flags);
}
EXPORT_SYMBOL(ib_dispatch_event);
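/*
 * Example: a minimal sketch of consuming asynchronous events with the
 * handler API above. The handler below is hypothetical;
 * INIT_IB_EVENT_HANDLER() is assumed to be the usual initializer from
 * <rdma/ib_verbs.h>. Because dispatch can happen in interrupt context the
 * handler must not sleep.
 */
static void example_event_handler(struct ib_event_handler *handler,
				  struct ib_event *event)
{
	if (event->event == IB_EVENT_PORT_ERR)
		pr_debug("%s: port %u went down\n",
			 dev_name(&event->device->dev),
			 event->element.port_num);
}

/* The handler must stay allocated for as long as it is registered. */
static struct ib_event_handler example_handler;

static void __maybe_unused example_watch_events(struct ib_device *device)
{
	INIT_IB_EVENT_HANDLER(&example_handler, device, example_event_handler);
	ib_register_event_handler(&example_handler);
	/* ... and later: ib_unregister_event_handler(&example_handler); */
}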
/**
 * ib_query_port - Query IB port attributes
 * @device:Device to query
 * @port_num:Port number to query
 * @port_attr:Port attributes
 *
 * ib_query_port() returns the attributes of a port through the
 * @port_attr pointer.
 */
int ib_query_port(struct ib_device *device,
		  u8 port_num,
		  struct ib_port_attr *port_attr)
{
	union ib_gid gid;
	int err;

	if (!rdma_is_port_valid(device, port_num))
		return -EINVAL;

	memset(port_attr, 0, sizeof(*port_attr));
	err = device->ops.query_port(device, port_num, port_attr);
	if (err || port_attr->subnet_prefix)
		return err;

	if (rdma_port_get_link_layer(device, port_num) != IB_LINK_LAYER_INFINIBAND)
		return 0;

	err = device->ops.query_gid(device, port_num, 0, &gid);
	if (err)
		return err;

	port_attr->subnet_prefix = be64_to_cpu(gid.global.subnet_prefix);
	return 0;
}
EXPORT_SYMBOL(ib_query_port);

static void add_ndev_hash(struct ib_port_data *pdata)
{
	unsigned long flags;

	might_sleep();

	spin_lock_irqsave(&ndev_hash_lock, flags);
	if (hash_hashed(&pdata->ndev_hash_link)) {
		hash_del_rcu(&pdata->ndev_hash_link);
		spin_unlock_irqrestore(&ndev_hash_lock, flags);
		/*
		 * We cannot do hash_add_rcu after a hash_del_rcu until the
		 * grace period
		 */
		synchronize_rcu();
		spin_lock_irqsave(&ndev_hash_lock, flags);
	}
	if (pdata->netdev)
		hash_add_rcu(ndev_hash, &pdata->ndev_hash_link,
			     (uintptr_t)pdata->netdev);
	spin_unlock_irqrestore(&ndev_hash_lock, flags);
}

/**
 * ib_device_set_netdev - Associate the ib_dev with an underlying net_device
 * @ib_dev: Device to modify
 * @ndev: net_device to affiliate, may be NULL
 * @port: IB port the net_device is connected to
 *
 * Drivers should use this to link the ib_device to a netdev so the netdev
 * shows up in interfaces like ib_enum_roce_netdev. Only one netdev may be
 * affiliated with any port.
 *
 * The caller must ensure that the given ndev is not unregistered or
 * unregistering, and that either the ib_device is unregistered or
 * ib_device_set_netdev() is called with NULL when the ndev sends a
 * NETDEV_UNREGISTER event.
 */
int ib_device_set_netdev(struct ib_device *ib_dev, struct net_device *ndev,
			 unsigned int port)
{
	struct net_device *old_ndev;
	struct ib_port_data *pdata;
	unsigned long flags;
	int ret;

	/*
	 * Drivers wish to call this before ib_register_device(), so we have
	 * to set up the port data early.
	 */
	ret = alloc_port_data(ib_dev);
	if (ret)
		return ret;

	if (!rdma_is_port_valid(ib_dev, port))
		return -EINVAL;

	pdata = &ib_dev->port_data[port];
	spin_lock_irqsave(&pdata->netdev_lock, flags);
	old_ndev = rcu_dereference_protected(
		pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
	if (old_ndev == ndev) {
		spin_unlock_irqrestore(&pdata->netdev_lock, flags);
		return 0;
	}

	if (ndev)
		dev_hold(ndev);
	rcu_assign_pointer(pdata->netdev, ndev);
	spin_unlock_irqrestore(&pdata->netdev_lock, flags);

	add_ndev_hash(pdata);
	if (old_ndev)
		dev_put(old_ndev);

	return 0;
}
EXPORT_SYMBOL(ib_device_set_netdev);

static void free_netdevs(struct ib_device *ib_dev)
{
	unsigned long flags;
	unsigned int port;

	rdma_for_each_port (ib_dev, port) {
		struct ib_port_data *pdata = &ib_dev->port_data[port];
		struct net_device *ndev;

		spin_lock_irqsave(&pdata->netdev_lock, flags);
		ndev = rcu_dereference_protected(
			pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
		if (ndev) {
			spin_lock(&ndev_hash_lock);
			hash_del_rcu(&pdata->ndev_hash_link);
			spin_unlock(&ndev_hash_lock);

			/*
			 * If this is the last dev_put there is still a
			 * synchronize_rcu before the netdev is kfreed, so we
			 * can continue to rely on unlocked pointer
			 * comparisons after the put
			 */
			rcu_assign_pointer(pdata->netdev, NULL);
			dev_put(ndev);
		}
		spin_unlock_irqrestore(&pdata->netdev_lock, flags);
	}
}

struct net_device *ib_device_get_netdev(struct ib_device *ib_dev,
					unsigned int port)
{
	struct ib_port_data *pdata;
	struct net_device *res;

	if (!rdma_is_port_valid(ib_dev, port))
		return NULL;

	pdata = &ib_dev->port_data[port];

	/*
	 * New drivers should use ib_device_set_netdev() not the legacy
	 * get_netdev().
	 */
	if (ib_dev->ops.get_netdev)
		res = ib_dev->ops.get_netdev(ib_dev, port);
	else {
		spin_lock(&pdata->netdev_lock);
		res = rcu_dereference_protected(
			pdata->netdev, lockdep_is_held(&pdata->netdev_lock));
		if (res)
			dev_hold(res);
		spin_unlock(&pdata->netdev_lock);
	}

	/*
	 * If we are starting to unregister expedite things by preventing
	 * propagation of an unregistering netdev.
	 */
	if (res && res->reg_state != NETREG_REGISTERED) {
		dev_put(res);
		return NULL;
	}

	return res;
}
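/*
 * Example: a minimal sketch of keeping the association from the
 * ib_device_set_netdev() kernel-doc above in sync with netdev events. The
 * helper and the fixed port number 1 are hypothetical; a real driver would
 * typically call this from its netdevice notifier.
 */
static int __maybe_unused example_handle_netdev_event(struct ib_device *ib_dev,
						      struct net_device *ndev,
						      unsigned long event)
{
	if (event == NETDEV_UNREGISTER)
		/* Drop the affiliation before the netdev goes away */
		return ib_device_set_netdev(ib_dev, NULL, 1);

	return ib_device_set_netdev(ib_dev, ndev, 1);
}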
/**
 * ib_device_get_by_netdev - Find an IB device associated with a netdev
 * @ndev: netdev to locate
 * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all)
 *
 * Find and hold an ib_device that is associated with a netdev via
 * ib_device_set_netdev(). The caller must call ib_device_put() on the
 * returned pointer.
 */
struct ib_device *ib_device_get_by_netdev(struct net_device *ndev,
					  enum rdma_driver_id driver_id)
{
	struct ib_device *res = NULL;
	struct ib_port_data *cur;

	rcu_read_lock();
	hash_for_each_possible_rcu (ndev_hash, cur, ndev_hash_link,
				    (uintptr_t)ndev) {
		if (rcu_access_pointer(cur->netdev) == ndev &&
		    (driver_id == RDMA_DRIVER_UNKNOWN ||
		     cur->ib_dev->driver_id == driver_id) &&
		    ib_device_try_get(cur->ib_dev)) {
			res = cur->ib_dev;
			break;
		}
	}
	rcu_read_unlock();

	return res;
}
EXPORT_SYMBOL(ib_device_get_by_netdev);

/**
 * ib_enum_roce_netdev - enumerate all RoCE ports
 * @ib_dev : IB device we want to query
 * @filter: Should we call the callback?
 * @filter_cookie: Cookie passed to filter
 * @cb: Callback to call for each found RoCE port
 * @cookie: Cookie passed back to the callback
 *
 * Enumerates all of the physical RoCE ports of ib_dev
 * which are related to a netdevice and calls callback() on each
 * device for which filter() function returns non-zero.
 */
void ib_enum_roce_netdev(struct ib_device *ib_dev,
			 roce_netdev_filter filter,
			 void *filter_cookie,
			 roce_netdev_callback cb,
			 void *cookie)
{
	unsigned int port;

	rdma_for_each_port (ib_dev, port)
		if (rdma_protocol_roce(ib_dev, port)) {
			struct net_device *idev =
				ib_device_get_netdev(ib_dev, port);

			if (filter(ib_dev, port, idev, filter_cookie))
				cb(ib_dev, port, idev, cookie);

			if (idev)
				dev_put(idev);
		}
}

/**
 * ib_enum_all_roce_netdevs - enumerate all RoCE devices
 * @filter: Should we call the callback?
 * @filter_cookie: Cookie passed to filter
 * @cb: Callback to call for each found RoCE port
 * @cookie: Cookie passed back to the callback
 *
 * Enumerates all RoCE devices' physical ports which are related
 * to netdevices and calls callback() on each device for which
 * filter() function returns non-zero.
 */
void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
			      void *filter_cookie,
			      roce_netdev_callback cb,
			      void *cookie)
{
	struct ib_device *dev;
	unsigned long index;

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED)
		ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
	up_read(&devices_rwsem);
}

/**
 * ib_enum_all_devs - enumerate all ib_devices
 * @cb: Callback to call for each found ib_device
 *
 * Enumerates all ib_devices and calls callback() on each device.
 */
int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
		     struct netlink_callback *cb)
{
	unsigned long index;
	struct ib_device *dev;
	unsigned int idx = 0;
	int ret = 0;

	down_read(&devices_rwsem);
	xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
		ret = nldev_cb(dev, skb, cb, idx);
		if (ret)
			break;
		idx++;
	}
	up_read(&devices_rwsem);
	return ret;
}
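/*
 * Example: a minimal filter/callback pair for ib_enum_roce_netdev() above.
 * The pair is hypothetical and is assumed to match the roce_netdev_filter
 * and roce_netdev_callback typedefs from <rdma/ib_verbs.h>.
 */
static int example_match_netdev_filter(struct ib_device *ib_dev, u8 port,
					struct net_device *ndev, void *cookie)
{
	/* Only visit ports whose affiliated netdev is the one in the cookie */
	return ndev && ndev == cookie;
}

static void example_roce_port_cb(struct ib_device *ib_dev, u8 port,
				 struct net_device *ndev, void *cookie)
{
	pr_debug("%s: RoCE port %u is backed by %s\n",
		 dev_name(&ib_dev->dev), port, ndev->name);
}

/*
 * Usage: ib_enum_roce_netdev(ib_dev, example_match_netdev_filter, target_ndev,
 *			      example_roce_port_cb, NULL);
 */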
/**
 * ib_query_pkey - Get P_Key table entry
 * @device:Device to query
 * @port_num:Port number to query
 * @index:P_Key table index to query
 * @pkey:Returned P_Key
 *
 * ib_query_pkey() fetches the specified P_Key table entry.
 */
int ib_query_pkey(struct ib_device *device,
		  u8 port_num, u16 index, u16 *pkey)
{
	if (!rdma_is_port_valid(device, port_num))
		return -EINVAL;

	return device->ops.query_pkey(device, port_num, index, pkey);
}
EXPORT_SYMBOL(ib_query_pkey);

/**
 * ib_modify_device - Change IB device attributes
 * @device:Device to modify
 * @device_modify_mask:Mask of attributes to change
 * @device_modify:New attribute values
 *
 * ib_modify_device() changes a device's attributes as specified by
 * the @device_modify_mask and @device_modify structure.
 */
int ib_modify_device(struct ib_device *device,
		     int device_modify_mask,
		     struct ib_device_modify *device_modify)
{
	if (!device->ops.modify_device)
		return -ENOSYS;

	return device->ops.modify_device(device, device_modify_mask,
					 device_modify);
}
EXPORT_SYMBOL(ib_modify_device);

/**
 * ib_modify_port - Modifies the attributes for the specified port.
 * @device: The device to modify.
 * @port_num: The number of the port to modify.
 * @port_modify_mask: Mask used to specify which attributes of the port
 *   to change.
 * @port_modify: New attribute values for the port.
 *
 * ib_modify_port() changes a port's attributes as specified by the
 * @port_modify_mask and @port_modify structure.
 */
int ib_modify_port(struct ib_device *device,
		   u8 port_num, int port_modify_mask,
		   struct ib_port_modify *port_modify)
{
	int rc;

	if (!rdma_is_port_valid(device, port_num))
		return -EINVAL;

	if (device->ops.modify_port)
		rc = device->ops.modify_port(device, port_num,
					     port_modify_mask,
					     port_modify);
	else
		rc = rdma_protocol_roce(device, port_num) ? 0 : -ENOSYS;
	return rc;
}
EXPORT_SYMBOL(ib_modify_port);

/**
 * ib_find_gid - Returns the port number and GID table index where
 *   a specified GID value occurs. It searches only the IB link layer.
 * @device: The device to query.
 * @gid: The GID value to search for.
 * @port_num: The port number of the device where the GID value was found.
 * @index: The index into the GID table where the GID was found. This
 *   parameter may be NULL.
 */
int ib_find_gid(struct ib_device *device, union ib_gid *gid,
		u8 *port_num, u16 *index)
{
	union ib_gid tmp_gid;
	unsigned int port;
	int ret, i;

	rdma_for_each_port (device, port) {
		if (!rdma_protocol_ib(device, port))
			continue;

		for (i = 0; i < device->port_data[port].immutable.gid_tbl_len;
		     ++i) {
			ret = rdma_query_gid(device, port, i, &tmp_gid);
			if (ret)
				return ret;
			if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
				*port_num = port;
				if (index)
					*index = i;
				return 0;
			}
		}
	}

	return -ENOENT;
}
EXPORT_SYMBOL(ib_find_gid);
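/*
 * Example: a minimal sketch of resolving a GID to its port and table index
 * with ib_find_gid() above. The helper is hypothetical; the GID itself is
 * supplied by the caller.
 */
static void __maybe_unused example_locate_gid(struct ib_device *device,
					      union ib_gid *gid)
{
	u8 port_num;
	u16 index;

	if (ib_find_gid(device, gid, &port_num, &index))
		return;

	pr_debug("%s: GID found on port %u at table index %u\n",
		 dev_name(&device->dev), port_num, index);
}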
/**
 * ib_find_pkey - Returns the PKey table index where a specified
 *   PKey value occurs.
 * @device: The device to query.
 * @port_num: The port number of the device to search for the PKey.
 * @pkey: The PKey value to search for.
 * @index: The index into the PKey table where the PKey was found.
 */
int ib_find_pkey(struct ib_device *device,
		 u8 port_num, u16 pkey, u16 *index)
{
	int ret, i;
	u16 tmp_pkey;
	int partial_ix = -1;

	for (i = 0; i < device->port_data[port_num].immutable.pkey_tbl_len;
	     ++i) {
		ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
		if (ret)
			return ret;
		if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
			/* if there is a full-member pkey take it */
			if (tmp_pkey & 0x8000) {
				*index = i;
				return 0;
			}
			if (partial_ix < 0)
				partial_ix = i;
		}
	}

	/* no full-member; if one exists, take the limited-member pkey */
	if (partial_ix >= 0) {
		*index = partial_ix;
		return 0;
	}
	return -ENOENT;
}
EXPORT_SYMBOL(ib_find_pkey);

/**
 * ib_get_net_dev_by_params() - Return the appropriate net_dev
 * for a received CM request
 * @dev: An RDMA device on which the request has been received.
 * @port: Port number on the RDMA device.
 * @pkey: The Pkey the request came on.
 * @gid: A GID that the net_dev uses to communicate.
 * @addr: Contains the IP address that the request specified as its
 *   destination.
 *
 */
struct net_device *ib_get_net_dev_by_params(struct ib_device *dev,
					    u8 port,
					    u16 pkey,
					    const union ib_gid *gid,
					    const struct sockaddr *addr)
{
	struct net_device *net_dev = NULL;
	unsigned long index;
	void *client_data;

	if (!rdma_protocol_ib(dev, port))
		return NULL;

	/*
	 * Holding the read side guarantees that the client will not become
	 * unregistered while we are calling get_net_dev_by_params()
	 */
	down_read(&dev->client_data_rwsem);
	xan_for_each_marked (&dev->client_data, index, client_data,
			     CLIENT_DATA_REGISTERED) {
		struct ib_client *client = xa_load(&clients, index);

		if (!client || !client->get_net_dev_by_params)
			continue;

		net_dev = client->get_net_dev_by_params(dev, port, pkey, gid,
							addr, client_data);
		if (net_dev)
			break;
	}
	up_read(&dev->client_data_rwsem);

	return net_dev;
}
EXPORT_SYMBOL(ib_get_net_dev_by_params);

void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
{
	struct ib_device_ops *dev_ops = &dev->ops;
#define SET_DEVICE_OP(ptr, name)                                               \
	do {                                                                   \
		if (ops->name)                                                 \
			if (!((ptr)->name))                                    \
				(ptr)->name = ops->name;                       \
	} while (0)

#define SET_OBJ_SIZE(ptr, name) SET_DEVICE_OP(ptr, size_##name)

	SET_DEVICE_OP(dev_ops, add_gid);
	SET_DEVICE_OP(dev_ops, advise_mr);
	SET_DEVICE_OP(dev_ops, alloc_dm);
	SET_DEVICE_OP(dev_ops, alloc_fmr);
	SET_DEVICE_OP(dev_ops, alloc_hw_stats);
	SET_DEVICE_OP(dev_ops, alloc_mr);
	SET_DEVICE_OP(dev_ops, alloc_mw);
	SET_DEVICE_OP(dev_ops, alloc_pd);
	SET_DEVICE_OP(dev_ops, alloc_rdma_netdev);
	SET_DEVICE_OP(dev_ops, alloc_ucontext);
	SET_DEVICE_OP(dev_ops, alloc_xrcd);
	SET_DEVICE_OP(dev_ops, attach_mcast);
	SET_DEVICE_OP(dev_ops, check_mr_status);
	SET_DEVICE_OP(dev_ops, create_ah);
	SET_DEVICE_OP(dev_ops, create_counters);
	SET_DEVICE_OP(dev_ops, create_cq);
	SET_DEVICE_OP(dev_ops, create_flow);
	SET_DEVICE_OP(dev_ops, create_flow_action_esp);
	SET_DEVICE_OP(dev_ops, create_qp);
	SET_DEVICE_OP(dev_ops, create_rwq_ind_table);
	SET_DEVICE_OP(dev_ops, create_srq);
	SET_DEVICE_OP(dev_ops, create_wq);
	SET_DEVICE_OP(dev_ops, dealloc_dm);
	SET_DEVICE_OP(dev_ops, dealloc_driver);
	SET_DEVICE_OP(dev_ops, dealloc_fmr);
	SET_DEVICE_OP(dev_ops, dealloc_mw);
	SET_DEVICE_OP(dev_ops, dealloc_pd);
	SET_DEVICE_OP(dev_ops, dealloc_ucontext);
	SET_DEVICE_OP(dev_ops, dealloc_xrcd);
	SET_DEVICE_OP(dev_ops, del_gid);
	SET_DEVICE_OP(dev_ops, dereg_mr);
	SET_DEVICE_OP(dev_ops, destroy_ah);
	SET_DEVICE_OP(dev_ops, destroy_counters);
	SET_DEVICE_OP(dev_ops, destroy_cq);
	SET_DEVICE_OP(dev_ops, destroy_flow);
	SET_DEVICE_OP(dev_ops, destroy_flow_action);
	SET_DEVICE_OP(dev_ops, destroy_qp);
	SET_DEVICE_OP(dev_ops, destroy_rwq_ind_table);
	SET_DEVICE_OP(dev_ops, destroy_srq);
	SET_DEVICE_OP(dev_ops, destroy_wq);
	SET_DEVICE_OP(dev_ops, detach_mcast);
	SET_DEVICE_OP(dev_ops, disassociate_ucontext);
	SET_DEVICE_OP(dev_ops, drain_rq);
	SET_DEVICE_OP(dev_ops, drain_sq);
	SET_DEVICE_OP(dev_ops, enable_driver);
	SET_DEVICE_OP(dev_ops, fill_res_entry);
	SET_DEVICE_OP(dev_ops, get_dev_fw_str);
	SET_DEVICE_OP(dev_ops, get_dma_mr);
	SET_DEVICE_OP(dev_ops, get_hw_stats);
	SET_DEVICE_OP(dev_ops, get_link_layer);
	SET_DEVICE_OP(dev_ops, get_netdev);
	SET_DEVICE_OP(dev_ops, get_port_immutable);
	SET_DEVICE_OP(dev_ops, get_vector_affinity);
	SET_DEVICE_OP(dev_ops, get_vf_config);
	SET_DEVICE_OP(dev_ops, get_vf_stats);
	SET_DEVICE_OP(dev_ops, init_port);
	SET_DEVICE_OP(dev_ops, map_mr_sg);
	SET_DEVICE_OP(dev_ops, map_phys_fmr);
	SET_DEVICE_OP(dev_ops, mmap);
	SET_DEVICE_OP(dev_ops, modify_ah);
	SET_DEVICE_OP(dev_ops, modify_cq);
	SET_DEVICE_OP(dev_ops, modify_device);
	SET_DEVICE_OP(dev_ops, modify_flow_action_esp);
	SET_DEVICE_OP(dev_ops, modify_port);
	SET_DEVICE_OP(dev_ops, modify_qp);
	SET_DEVICE_OP(dev_ops, modify_srq);
	SET_DEVICE_OP(dev_ops, modify_wq);
	SET_DEVICE_OP(dev_ops, peek_cq);
	SET_DEVICE_OP(dev_ops, poll_cq);
	SET_DEVICE_OP(dev_ops, post_recv);
	SET_DEVICE_OP(dev_ops, post_send);
	SET_DEVICE_OP(dev_ops, post_srq_recv);
	SET_DEVICE_OP(dev_ops, process_mad);
	SET_DEVICE_OP(dev_ops, query_ah);
	SET_DEVICE_OP(dev_ops, query_device);
	SET_DEVICE_OP(dev_ops, query_gid);
	SET_DEVICE_OP(dev_ops, query_pkey);
	SET_DEVICE_OP(dev_ops, query_port);
	SET_DEVICE_OP(dev_ops, query_qp);
	SET_DEVICE_OP(dev_ops, query_srq);
	SET_DEVICE_OP(dev_ops, rdma_netdev_get_params);
	SET_DEVICE_OP(dev_ops, read_counters);
	SET_DEVICE_OP(dev_ops, reg_dm_mr);
	SET_DEVICE_OP(dev_ops, reg_user_mr);
	SET_DEVICE_OP(dev_ops, req_ncomp_notif);
	SET_DEVICE_OP(dev_ops, req_notify_cq);
	SET_DEVICE_OP(dev_ops, rereg_user_mr);
	SET_DEVICE_OP(dev_ops, resize_cq);
	SET_DEVICE_OP(dev_ops, set_vf_guid);
	SET_DEVICE_OP(dev_ops, set_vf_link_state);
	SET_DEVICE_OP(dev_ops, unmap_fmr);

	SET_OBJ_SIZE(dev_ops, ib_pd);
	SET_OBJ_SIZE(dev_ops, ib_ucontext);
}
EXPORT_SYMBOL(ib_set_device_ops);

static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = {
	[RDMA_NL_LS_OP_RESOLVE] = {
		.doit = ib_nl_handle_resolve_resp,
		.flags = RDMA_NL_ADMIN_PERM,
	},
	[RDMA_NL_LS_OP_SET_TIMEOUT] = {
		.doit = ib_nl_handle_set_timeout,
		.flags = RDMA_NL_ADMIN_PERM,
	},
	[RDMA_NL_LS_OP_IP_RESOLVE] = {
		.doit = ib_nl_handle_ip_res_resp,
		.flags = RDMA_NL_ADMIN_PERM,
	},
};
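/*
 * Example: a minimal sketch of the driver side of ib_set_device_ops() above.
 * The two callbacks are hypothetical; because SET_DEVICE_OP() only copies
 * entries the driver has not already set, ops tables can be layered.
 */
static void example_get_dev_fw_str(struct ib_device *device, char *str)
{
	snprintf(str, IB_FW_VERSION_NAME_MAX, "example-1.0");
}

static void example_dealloc_driver(struct ib_device *device)
{
	/* Driver private teardown would go here */
}

static const struct ib_device_ops example_ops __maybe_unused = {
	.dealloc_driver = example_dealloc_driver,
	.get_dev_fw_str = example_get_dev_fw_str,
};

/*
 * A driver would call ib_set_device_ops(&edev->ibdev, &example_ops) before
 * ib_register_device().
 */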
static int __init ib_core_init(void)
{
	int ret;

	ib_wq = alloc_workqueue("infiniband", 0, 0);
	if (!ib_wq)
		return -ENOMEM;

	ib_comp_wq = alloc_workqueue("ib-comp-wq",
			WQ_HIGHPRI | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
	if (!ib_comp_wq) {
		ret = -ENOMEM;
		goto err;
	}

	ib_comp_unbound_wq =
		alloc_workqueue("ib-comp-unb-wq",
				WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM |
				WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE);
	if (!ib_comp_unbound_wq) {
		ret = -ENOMEM;
		goto err_comp;
	}

	ret = class_register(&ib_class);
	if (ret) {
		pr_warn("Couldn't create InfiniBand device class\n");
		goto err_comp_unbound;
	}

	ret = rdma_nl_init();
	if (ret) {
		pr_warn("Couldn't init IB netlink interface: err %d\n", ret);
		goto err_sysfs;
	}

	ret = addr_init();
	if (ret) {
		pr_warn("Couldn't init IB address resolution\n");
		goto err_ibnl;
	}

	ret = ib_mad_init();
	if (ret) {
		pr_warn("Couldn't init IB MAD\n");
		goto err_addr;
	}

	ret = ib_sa_init();
	if (ret) {
		pr_warn("Couldn't init SA\n");
		goto err_mad;
	}

	ret = register_lsm_notifier(&ibdev_lsm_nb);
	if (ret) {
		pr_warn("Couldn't register LSM notifier. ret %d\n", ret);
		goto err_sa;
	}

	nldev_init();
	rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
	roce_gid_mgmt_init();

	return 0;

err_sa:
	ib_sa_cleanup();
err_mad:
	ib_mad_cleanup();
err_addr:
	addr_cleanup();
err_ibnl:
	rdma_nl_exit();
err_sysfs:
	class_unregister(&ib_class);
err_comp_unbound:
	destroy_workqueue(ib_comp_unbound_wq);
err_comp:
	destroy_workqueue(ib_comp_wq);
err:
	destroy_workqueue(ib_wq);
	return ret;
}

static void __exit ib_core_cleanup(void)
{
	roce_gid_mgmt_cleanup();
	nldev_exit();
	rdma_nl_unregister(RDMA_NL_LS);
	unregister_lsm_notifier(&ibdev_lsm_nb);
	ib_sa_cleanup();
	ib_mad_cleanup();
	addr_cleanup();
	rdma_nl_exit();
	class_unregister(&ib_class);
	destroy_workqueue(ib_comp_unbound_wq);
	destroy_workqueue(ib_comp_wq);
	/* Make sure that any pending umem accounting work is done. */
	destroy_workqueue(ib_wq);
	flush_workqueue(system_unbound_wq);
	WARN_ON(!xa_empty(&clients));
	WARN_ON(!xa_empty(&devices));
}

MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4);

subsys_initcall(ib_core_init);
module_exit(ib_core_cleanup);