1 /* 2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. 3 * Copyright (c) 2020, Intel Corporation. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34 35 #include <linux/kref.h> 36 #include <linux/random.h> 37 #include <linux/debugfs.h> 38 #include <linux/export.h> 39 #include <linux/delay.h> 40 #include <linux/dma-buf.h> 41 #include <linux/dma-resv.h> 42 #include <rdma/ib_umem.h> 43 #include <rdma/ib_umem_odp.h> 44 #include <rdma/ib_verbs.h> 45 #include "dm.h" 46 #include "mlx5_ib.h" 47 #include "umr.h" 48 49 enum { 50 MAX_PENDING_REG_MR = 8, 51 }; 52 53 #define MLX5_UMR_ALIGN 2048 54 55 static void 56 create_mkey_callback(int status, struct mlx5_async_work *context); 57 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, 58 u64 iova, int access_flags, 59 unsigned int page_size, bool populate); 60 61 static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr, 62 struct ib_pd *pd) 63 { 64 struct mlx5_ib_dev *dev = to_mdev(pd->device); 65 66 MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC)); 67 MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE)); 68 MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ)); 69 MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE)); 70 MLX5_SET(mkc, mkc, lr, 1); 71 72 if ((acc & IB_ACCESS_RELAXED_ORDERING) && 73 pcie_relaxed_ordering_enabled(dev->mdev->pdev)) { 74 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write)) 75 MLX5_SET(mkc, mkc, relaxed_ordering_write, 1); 76 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read)) 77 MLX5_SET(mkc, mkc, relaxed_ordering_read, 1); 78 } 79 80 MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); 81 MLX5_SET(mkc, mkc, qpn, 0xffffff); 82 MLX5_SET64(mkc, mkc, start_addr, start_addr); 83 } 84 85 static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in) 86 { 87 u8 key = atomic_inc_return(&dev->mkey_var); 88 void *mkc; 89 90 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 91 MLX5_SET(mkc, mkc, mkey_7_0, key); 92 *mkey = key; 93 } 94 95 static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, 96 struct mlx5_ib_mkey *mkey, u32 *in, int inlen) 97 { 98 int ret; 99 100 assign_mkey_variant(dev, &mkey->key, in); 101 ret = 
mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen); 102 if (!ret) 103 init_waitqueue_head(&mkey->wait); 104 105 return ret; 106 } 107 108 static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create) 109 { 110 struct mlx5_ib_dev *dev = async_create->ent->dev; 111 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 112 size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out); 113 114 MLX5_SET(create_mkey_in, async_create->in, opcode, 115 MLX5_CMD_OP_CREATE_MKEY); 116 assign_mkey_variant(dev, &async_create->mkey, async_create->in); 117 return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen, 118 async_create->out, outlen, create_mkey_callback, 119 &async_create->cb_work); 120 } 121 122 static int mkey_cache_max_order(struct mlx5_ib_dev *dev); 123 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent); 124 125 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) 126 { 127 WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key))); 128 129 return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key); 130 } 131 132 static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out) 133 { 134 if (status == -ENXIO) /* core driver is not available */ 135 return; 136 137 mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status); 138 if (status != -EREMOTEIO) /* driver specific failure */ 139 return; 140 141 /* Failed in FW, print cmd out failure details */ 142 mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out); 143 } 144 145 146 static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings, 147 void *to_store) 148 { 149 XA_STATE(xas, &ent->mkeys, 0); 150 void *curr; 151 152 xa_lock_irq(&ent->mkeys); 153 if (limit_pendings && 154 (ent->reserved - ent->stored) > MAX_PENDING_REG_MR) { 155 xa_unlock_irq(&ent->mkeys); 156 return -EAGAIN; 157 } 158 while (1) { 159 /* 160 * This is cmpxchg (NULL, XA_ZERO_ENTRY) however this version 161 * doesn't transparently unlock. Instead we set the xas index to 162 * the current value of reserved every iteration. 163 */ 164 xas_set(&xas, ent->reserved); 165 curr = xas_load(&xas); 166 if (!curr) { 167 if (to_store && ent->stored == ent->reserved) 168 xas_store(&xas, to_store); 169 else 170 xas_store(&xas, XA_ZERO_ENTRY); 171 if (xas_valid(&xas)) { 172 ent->reserved++; 173 if (to_store) { 174 if (ent->stored != ent->reserved) 175 __xa_store(&ent->mkeys, 176 ent->stored, 177 to_store, 178 GFP_KERNEL); 179 ent->stored++; 180 queue_adjust_cache_locked(ent); 181 WRITE_ONCE(ent->dev->cache.last_add, 182 jiffies); 183 } 184 } 185 } 186 xa_unlock_irq(&ent->mkeys); 187 188 /* 189 * Notice xas_nomem() must always be called as it cleans 190 * up any cached allocation. 
191 */ 192 if (!xas_nomem(&xas, GFP_KERNEL)) 193 break; 194 xa_lock_irq(&ent->mkeys); 195 } 196 if (xas_error(&xas)) 197 return xas_error(&xas); 198 if (WARN_ON(curr)) 199 return -EINVAL; 200 return 0; 201 } 202 203 static void undo_push_reserve_mkey(struct mlx5_cache_ent *ent) 204 { 205 void *old; 206 207 ent->reserved--; 208 old = __xa_erase(&ent->mkeys, ent->reserved); 209 WARN_ON(old); 210 } 211 212 static void push_to_reserved(struct mlx5_cache_ent *ent, u32 mkey) 213 { 214 void *old; 215 216 old = __xa_store(&ent->mkeys, ent->stored, xa_mk_value(mkey), 0); 217 WARN_ON(old); 218 ent->stored++; 219 } 220 221 static u32 pop_stored_mkey(struct mlx5_cache_ent *ent) 222 { 223 void *old, *xa_mkey; 224 225 ent->stored--; 226 ent->reserved--; 227 228 if (ent->stored == ent->reserved) { 229 xa_mkey = __xa_erase(&ent->mkeys, ent->stored); 230 WARN_ON(!xa_mkey); 231 return (u32)xa_to_value(xa_mkey); 232 } 233 234 xa_mkey = __xa_store(&ent->mkeys, ent->stored, XA_ZERO_ENTRY, 235 GFP_KERNEL); 236 WARN_ON(!xa_mkey || xa_is_err(xa_mkey)); 237 old = __xa_erase(&ent->mkeys, ent->reserved); 238 WARN_ON(old); 239 return (u32)xa_to_value(xa_mkey); 240 } 241 242 static void create_mkey_callback(int status, struct mlx5_async_work *context) 243 { 244 struct mlx5r_async_create_mkey *mkey_out = 245 container_of(context, struct mlx5r_async_create_mkey, cb_work); 246 struct mlx5_cache_ent *ent = mkey_out->ent; 247 struct mlx5_ib_dev *dev = ent->dev; 248 unsigned long flags; 249 250 if (status) { 251 create_mkey_warn(dev, status, mkey_out->out); 252 kfree(mkey_out); 253 xa_lock_irqsave(&ent->mkeys, flags); 254 undo_push_reserve_mkey(ent); 255 WRITE_ONCE(dev->fill_delay, 1); 256 xa_unlock_irqrestore(&ent->mkeys, flags); 257 mod_timer(&dev->delay_timer, jiffies + HZ); 258 return; 259 } 260 261 mkey_out->mkey |= mlx5_idx_to_mkey( 262 MLX5_GET(create_mkey_out, mkey_out->out, mkey_index)); 263 WRITE_ONCE(dev->cache.last_add, jiffies); 264 265 xa_lock_irqsave(&ent->mkeys, flags); 266 push_to_reserved(ent, mkey_out->mkey); 267 /* If we are doing fill_to_high_water then keep going. */ 268 queue_adjust_cache_locked(ent); 269 xa_unlock_irqrestore(&ent->mkeys, flags); 270 kfree(mkey_out); 271 } 272 273 static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs) 274 { 275 int ret = 0; 276 277 switch (access_mode) { 278 case MLX5_MKC_ACCESS_MODE_MTT: 279 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD / 280 sizeof(struct mlx5_mtt)); 281 break; 282 case MLX5_MKC_ACCESS_MODE_KSM: 283 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD / 284 sizeof(struct mlx5_klm)); 285 break; 286 default: 287 WARN_ON(1); 288 } 289 return ret; 290 } 291 292 static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc) 293 { 294 set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd); 295 MLX5_SET(mkc, mkc, free, 1); 296 MLX5_SET(mkc, mkc, umr_en, 1); 297 MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3); 298 MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7); 299 300 MLX5_SET(mkc, mkc, translations_octword_size, 301 get_mkc_octo_size(ent->access_mode, ent->ndescs)); 302 MLX5_SET(mkc, mkc, log_page_size, ent->page); 303 } 304 305 /* Asynchronously schedule new MRs to be populated in the cache. 
*/ 306 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num) 307 { 308 struct mlx5r_async_create_mkey *async_create; 309 void *mkc; 310 int err = 0; 311 int i; 312 313 for (i = 0; i < num; i++) { 314 async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey), 315 GFP_KERNEL); 316 if (!async_create) 317 return -ENOMEM; 318 mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in, 319 memory_key_mkey_entry); 320 set_cache_mkc(ent, mkc); 321 async_create->ent = ent; 322 323 err = push_mkey(ent, true, NULL); 324 if (err) 325 goto free_async_create; 326 327 err = mlx5_ib_create_mkey_cb(async_create); 328 if (err) { 329 mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err); 330 goto err_undo_reserve; 331 } 332 } 333 334 return 0; 335 336 err_undo_reserve: 337 xa_lock_irq(&ent->mkeys); 338 undo_push_reserve_mkey(ent); 339 xa_unlock_irq(&ent->mkeys); 340 free_async_create: 341 kfree(async_create); 342 return err; 343 } 344 345 /* Synchronously create a MR in the cache */ 346 static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey) 347 { 348 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 349 void *mkc; 350 u32 *in; 351 int err; 352 353 in = kzalloc(inlen, GFP_KERNEL); 354 if (!in) 355 return -ENOMEM; 356 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 357 set_cache_mkc(ent, mkc); 358 359 err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen); 360 if (err) 361 goto free_in; 362 363 WRITE_ONCE(ent->dev->cache.last_add, jiffies); 364 free_in: 365 kfree(in); 366 return err; 367 } 368 369 static void remove_cache_mr_locked(struct mlx5_cache_ent *ent) 370 { 371 u32 mkey; 372 373 lockdep_assert_held(&ent->mkeys.xa_lock); 374 if (!ent->stored) 375 return; 376 mkey = pop_stored_mkey(ent); 377 xa_unlock_irq(&ent->mkeys); 378 mlx5_core_destroy_mkey(ent->dev->mdev, mkey); 379 xa_lock_irq(&ent->mkeys); 380 } 381 382 static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target, 383 bool limit_fill) 384 __acquires(&ent->mkeys) __releases(&ent->mkeys) 385 { 386 int err; 387 388 lockdep_assert_held(&ent->mkeys.xa_lock); 389 390 while (true) { 391 if (limit_fill) 392 target = ent->limit * 2; 393 if (target == ent->reserved) 394 return 0; 395 if (target > ent->reserved) { 396 u32 todo = target - ent->reserved; 397 398 xa_unlock_irq(&ent->mkeys); 399 err = add_keys(ent, todo); 400 if (err == -EAGAIN) 401 usleep_range(3000, 5000); 402 xa_lock_irq(&ent->mkeys); 403 if (err) { 404 if (err != -EAGAIN) 405 return err; 406 } else 407 return 0; 408 } else { 409 remove_cache_mr_locked(ent); 410 } 411 } 412 } 413 414 static ssize_t size_write(struct file *filp, const char __user *buf, 415 size_t count, loff_t *pos) 416 { 417 struct mlx5_cache_ent *ent = filp->private_data; 418 u32 target; 419 int err; 420 421 err = kstrtou32_from_user(buf, count, 0, &target); 422 if (err) 423 return err; 424 425 /* 426 * Target is the new value of total_mrs the user requests, however we 427 * cannot free MRs that are in use. Compute the target value for stored 428 * mkeys. 
429 */ 430 xa_lock_irq(&ent->mkeys); 431 if (target < ent->in_use) { 432 err = -EINVAL; 433 goto err_unlock; 434 } 435 target = target - ent->in_use; 436 if (target < ent->limit || target > ent->limit*2) { 437 err = -EINVAL; 438 goto err_unlock; 439 } 440 err = resize_available_mrs(ent, target, false); 441 if (err) 442 goto err_unlock; 443 xa_unlock_irq(&ent->mkeys); 444 445 return count; 446 447 err_unlock: 448 xa_unlock_irq(&ent->mkeys); 449 return err; 450 } 451 452 static ssize_t size_read(struct file *filp, char __user *buf, size_t count, 453 loff_t *pos) 454 { 455 struct mlx5_cache_ent *ent = filp->private_data; 456 char lbuf[20]; 457 int err; 458 459 err = snprintf(lbuf, sizeof(lbuf), "%ld\n", ent->stored + ent->in_use); 460 if (err < 0) 461 return err; 462 463 return simple_read_from_buffer(buf, count, pos, lbuf, err); 464 } 465 466 static const struct file_operations size_fops = { 467 .owner = THIS_MODULE, 468 .open = simple_open, 469 .write = size_write, 470 .read = size_read, 471 }; 472 473 static ssize_t limit_write(struct file *filp, const char __user *buf, 474 size_t count, loff_t *pos) 475 { 476 struct mlx5_cache_ent *ent = filp->private_data; 477 u32 var; 478 int err; 479 480 err = kstrtou32_from_user(buf, count, 0, &var); 481 if (err) 482 return err; 483 484 /* 485 * Upon set we immediately fill the cache to high water mark implied by 486 * the limit. 487 */ 488 xa_lock_irq(&ent->mkeys); 489 ent->limit = var; 490 err = resize_available_mrs(ent, 0, true); 491 xa_unlock_irq(&ent->mkeys); 492 if (err) 493 return err; 494 return count; 495 } 496 497 static ssize_t limit_read(struct file *filp, char __user *buf, size_t count, 498 loff_t *pos) 499 { 500 struct mlx5_cache_ent *ent = filp->private_data; 501 char lbuf[20]; 502 int err; 503 504 err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit); 505 if (err < 0) 506 return err; 507 508 return simple_read_from_buffer(buf, count, pos, lbuf, err); 509 } 510 511 static const struct file_operations limit_fops = { 512 .owner = THIS_MODULE, 513 .open = simple_open, 514 .write = limit_write, 515 .read = limit_read, 516 }; 517 518 static bool someone_adding(struct mlx5_mkey_cache *cache) 519 { 520 unsigned int i; 521 522 for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { 523 struct mlx5_cache_ent *ent = &cache->ent[i]; 524 bool ret; 525 526 xa_lock_irq(&ent->mkeys); 527 ret = ent->stored < ent->limit; 528 xa_unlock_irq(&ent->mkeys); 529 if (ret) 530 return true; 531 } 532 return false; 533 } 534 535 /* 536 * Check if the bucket is outside the high/low water mark and schedule an async 537 * update. The cache refill has hysteresis, once the low water mark is hit it is 538 * refilled up to the high mark. 539 */ 540 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) 541 { 542 lockdep_assert_held(&ent->mkeys.xa_lock); 543 544 if (ent->disabled || READ_ONCE(ent->dev->fill_delay)) 545 return; 546 if (ent->stored < ent->limit) { 547 ent->fill_to_high_water = true; 548 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); 549 } else if (ent->fill_to_high_water && 550 ent->reserved < 2 * ent->limit) { 551 /* 552 * Once we start populating due to hitting a low water mark 553 * continue until we pass the high water mark. 
554 */ 555 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); 556 } else if (ent->stored == 2 * ent->limit) { 557 ent->fill_to_high_water = false; 558 } else if (ent->stored > 2 * ent->limit) { 559 /* Queue deletion of excess entries */ 560 ent->fill_to_high_water = false; 561 if (ent->stored != ent->reserved) 562 queue_delayed_work(ent->dev->cache.wq, &ent->dwork, 563 msecs_to_jiffies(1000)); 564 else 565 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); 566 } 567 } 568 569 static void __cache_work_func(struct mlx5_cache_ent *ent) 570 { 571 struct mlx5_ib_dev *dev = ent->dev; 572 struct mlx5_mkey_cache *cache = &dev->cache; 573 int err; 574 575 xa_lock_irq(&ent->mkeys); 576 if (ent->disabled) 577 goto out; 578 579 if (ent->fill_to_high_water && ent->reserved < 2 * ent->limit && 580 !READ_ONCE(dev->fill_delay)) { 581 xa_unlock_irq(&ent->mkeys); 582 err = add_keys(ent, 1); 583 xa_lock_irq(&ent->mkeys); 584 if (ent->disabled) 585 goto out; 586 if (err) { 587 /* 588 * EAGAIN only happens if there are pending MRs, so we 589 * will be rescheduled when storing them. The only 590 * failure path here is ENOMEM. 591 */ 592 if (err != -EAGAIN) { 593 mlx5_ib_warn( 594 dev, 595 "command failed order %d, err %d\n", 596 ent->order, err); 597 queue_delayed_work(cache->wq, &ent->dwork, 598 msecs_to_jiffies(1000)); 599 } 600 } 601 } else if (ent->stored > 2 * ent->limit) { 602 bool need_delay; 603 604 /* 605 * The remove_cache_mr() logic is performed as garbage 606 * collection task. Such task is intended to be run when no 607 * other active processes are running. 608 * 609 * The need_resched() will return TRUE if there are user tasks 610 * to be activated in near future. 611 * 612 * In such case, we don't execute remove_cache_mr() and postpone 613 * the garbage collection work to try to run in next cycle, in 614 * order to free CPU resources to other tasks. 
615 */ 616 xa_unlock_irq(&ent->mkeys); 617 need_delay = need_resched() || someone_adding(cache) || 618 !time_after(jiffies, 619 READ_ONCE(cache->last_add) + 300 * HZ); 620 xa_lock_irq(&ent->mkeys); 621 if (ent->disabled) 622 goto out; 623 if (need_delay) { 624 queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); 625 goto out; 626 } 627 remove_cache_mr_locked(ent); 628 queue_adjust_cache_locked(ent); 629 } 630 out: 631 xa_unlock_irq(&ent->mkeys); 632 } 633 634 static void delayed_cache_work_func(struct work_struct *work) 635 { 636 struct mlx5_cache_ent *ent; 637 638 ent = container_of(work, struct mlx5_cache_ent, dwork.work); 639 __cache_work_func(ent); 640 } 641 642 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, 643 struct mlx5_cache_ent *ent, 644 int access_flags) 645 { 646 struct mlx5_ib_mr *mr; 647 int err; 648 649 if (!mlx5r_umr_can_reconfig(dev, 0, access_flags)) 650 return ERR_PTR(-EOPNOTSUPP); 651 652 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 653 if (!mr) 654 return ERR_PTR(-ENOMEM); 655 656 xa_lock_irq(&ent->mkeys); 657 ent->in_use++; 658 659 if (!ent->stored) { 660 queue_adjust_cache_locked(ent); 661 ent->miss++; 662 xa_unlock_irq(&ent->mkeys); 663 err = create_cache_mkey(ent, &mr->mmkey.key); 664 if (err) { 665 xa_lock_irq(&ent->mkeys); 666 ent->in_use--; 667 xa_unlock_irq(&ent->mkeys); 668 kfree(mr); 669 return ERR_PTR(err); 670 } 671 } else { 672 mr->mmkey.key = pop_stored_mkey(ent); 673 queue_adjust_cache_locked(ent); 674 xa_unlock_irq(&ent->mkeys); 675 } 676 mr->mmkey.cache_ent = ent; 677 mr->mmkey.type = MLX5_MKEY_MR; 678 init_waitqueue_head(&mr->mmkey.wait); 679 return mr; 680 } 681 682 static void clean_keys(struct mlx5_ib_dev *dev, int c) 683 { 684 struct mlx5_mkey_cache *cache = &dev->cache; 685 struct mlx5_cache_ent *ent = &cache->ent[c]; 686 u32 mkey; 687 688 cancel_delayed_work(&ent->dwork); 689 xa_lock_irq(&ent->mkeys); 690 while (ent->stored) { 691 mkey = pop_stored_mkey(ent); 692 xa_unlock_irq(&ent->mkeys); 693 mlx5_core_destroy_mkey(dev->mdev, mkey); 694 xa_lock_irq(&ent->mkeys); 695 } 696 xa_unlock_irq(&ent->mkeys); 697 } 698 699 static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) 700 { 701 if (!mlx5_debugfs_root || dev->is_rep) 702 return; 703 704 debugfs_remove_recursive(dev->cache.root); 705 dev->cache.root = NULL; 706 } 707 708 static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev) 709 { 710 struct mlx5_mkey_cache *cache = &dev->cache; 711 struct mlx5_cache_ent *ent; 712 struct dentry *dir; 713 int i; 714 715 if (!mlx5_debugfs_root || dev->is_rep) 716 return; 717 718 cache->root = debugfs_create_dir("mr_cache", mlx5_debugfs_get_dev_root(dev->mdev)); 719 720 for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { 721 ent = &cache->ent[i]; 722 sprintf(ent->name, "%d", ent->order); 723 dir = debugfs_create_dir(ent->name, cache->root); 724 debugfs_create_file("size", 0600, dir, ent, &size_fops); 725 debugfs_create_file("limit", 0600, dir, ent, &limit_fops); 726 debugfs_create_ulong("cur", 0400, dir, &ent->stored); 727 debugfs_create_u32("miss", 0600, dir, &ent->miss); 728 } 729 } 730 731 static void delay_time_func(struct timer_list *t) 732 { 733 struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer); 734 735 WRITE_ONCE(dev->fill_delay, 0); 736 } 737 738 int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) 739 { 740 struct mlx5_mkey_cache *cache = &dev->cache; 741 struct mlx5_cache_ent *ent; 742 int i; 743 744 mutex_init(&dev->slow_path_mutex); 745 cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM); 746 if 
(!cache->wq) { 747 mlx5_ib_warn(dev, "failed to create work queue\n"); 748 return -ENOMEM; 749 } 750 751 mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx); 752 timer_setup(&dev->delay_timer, delay_time_func, 0); 753 for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { 754 ent = &cache->ent[i]; 755 xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ); 756 ent->order = i + 2; 757 ent->dev = dev; 758 ent->limit = 0; 759 760 INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); 761 762 if (i > MKEY_CACHE_LAST_STD_ENTRY) { 763 mlx5_odp_init_mkey_cache_entry(ent); 764 continue; 765 } 766 767 if (ent->order > mkey_cache_max_order(dev)) 768 continue; 769 770 ent->page = PAGE_SHIFT; 771 ent->ndescs = 1 << ent->order; 772 ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; 773 if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) && 774 !dev->is_rep && mlx5_core_is_pf(dev->mdev) && 775 mlx5r_umr_can_load_pas(dev, 0)) 776 ent->limit = dev->mdev->profile.mr_cache[i].limit; 777 else 778 ent->limit = 0; 779 xa_lock_irq(&ent->mkeys); 780 queue_adjust_cache_locked(ent); 781 xa_unlock_irq(&ent->mkeys); 782 } 783 784 mlx5_mkey_cache_debugfs_init(dev); 785 786 return 0; 787 } 788 789 int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) 790 { 791 unsigned int i; 792 793 if (!dev->cache.wq) 794 return 0; 795 796 for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) { 797 struct mlx5_cache_ent *ent = &dev->cache.ent[i]; 798 799 xa_lock_irq(&ent->mkeys); 800 ent->disabled = true; 801 xa_unlock_irq(&ent->mkeys); 802 cancel_delayed_work_sync(&ent->dwork); 803 } 804 805 mlx5_mkey_cache_debugfs_cleanup(dev); 806 mlx5_cmd_cleanup_async_ctx(&dev->async_ctx); 807 808 for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) 809 clean_keys(dev, i); 810 811 destroy_workqueue(dev->cache.wq); 812 del_timer_sync(&dev->delay_timer); 813 814 return 0; 815 } 816 817 struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) 818 { 819 struct mlx5_ib_dev *dev = to_mdev(pd->device); 820 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 821 struct mlx5_ib_mr *mr; 822 void *mkc; 823 u32 *in; 824 int err; 825 826 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 827 if (!mr) 828 return ERR_PTR(-ENOMEM); 829 830 in = kzalloc(inlen, GFP_KERNEL); 831 if (!in) { 832 err = -ENOMEM; 833 goto err_free; 834 } 835 836 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 837 838 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); 839 MLX5_SET(mkc, mkc, length64, 1); 840 set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0, 841 pd); 842 843 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 844 if (err) 845 goto err_in; 846 847 kfree(in); 848 mr->mmkey.type = MLX5_MKEY_MR; 849 mr->ibmr.lkey = mr->mmkey.key; 850 mr->ibmr.rkey = mr->mmkey.key; 851 mr->umem = NULL; 852 853 return &mr->ibmr; 854 855 err_in: 856 kfree(in); 857 858 err_free: 859 kfree(mr); 860 861 return ERR_PTR(err); 862 } 863 864 static int get_octo_len(u64 addr, u64 len, int page_shift) 865 { 866 u64 page_size = 1ULL << page_shift; 867 u64 offset; 868 int npages; 869 870 offset = addr & (page_size - 1); 871 npages = ALIGN(len + offset, page_size) >> page_shift; 872 return (npages + 1) / 2; 873 } 874 875 static int mkey_cache_max_order(struct mlx5_ib_dev *dev) 876 { 877 if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) 878 return MKEY_CACHE_LAST_STD_ENTRY + 2; 879 return MLX5_MAX_UMR_SHIFT; 880 } 881 882 static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev, 883 unsigned int order) 884 { 885 struct mlx5_mkey_cache *cache = &dev->cache; 886 887 if 
(order < cache->ent[0].order) 888 return &cache->ent[0]; 889 order = order - cache->ent[0].order; 890 if (order > MKEY_CACHE_LAST_STD_ENTRY) 891 return NULL; 892 return &cache->ent[order]; 893 } 894 895 static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, 896 u64 length, int access_flags, u64 iova) 897 { 898 mr->ibmr.lkey = mr->mmkey.key; 899 mr->ibmr.rkey = mr->mmkey.key; 900 mr->ibmr.length = length; 901 mr->ibmr.device = &dev->ib_dev; 902 mr->ibmr.iova = iova; 903 mr->access_flags = access_flags; 904 } 905 906 static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem, 907 u64 iova) 908 { 909 /* 910 * The alignment of iova has already been checked upon entering 911 * UVERBS_METHOD_REG_DMABUF_MR 912 */ 913 umem->iova = iova; 914 return PAGE_SIZE; 915 } 916 917 static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, 918 struct ib_umem *umem, u64 iova, 919 int access_flags) 920 { 921 struct mlx5_ib_dev *dev = to_mdev(pd->device); 922 struct mlx5_cache_ent *ent; 923 struct mlx5_ib_mr *mr; 924 unsigned int page_size; 925 926 if (umem->is_dmabuf) 927 page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova); 928 else 929 page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, 930 0, iova); 931 if (WARN_ON(!page_size)) 932 return ERR_PTR(-EINVAL); 933 ent = mkey_cache_ent_from_order( 934 dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size))); 935 /* 936 * Matches access in alloc_cache_mr(). If the MR can't come from the 937 * cache then synchronously create an uncached one. 938 */ 939 if (!ent || ent->limit == 0 || 940 !mlx5r_umr_can_reconfig(dev, 0, access_flags)) { 941 mutex_lock(&dev->slow_path_mutex); 942 mr = reg_create(pd, umem, iova, access_flags, page_size, false); 943 mutex_unlock(&dev->slow_path_mutex); 944 return mr; 945 } 946 947 mr = mlx5_mr_cache_alloc(dev, ent, access_flags); 948 if (IS_ERR(mr)) 949 return mr; 950 951 mr->ibmr.pd = pd; 952 mr->umem = umem; 953 mr->page_shift = order_base_2(page_size); 954 set_mr_fields(dev, mr, umem->length, access_flags, iova); 955 956 return mr; 957 } 958 959 /* 960 * If ibmr is NULL it will be allocated by reg_create. 961 * Else, the given ibmr will be used. 962 */ 963 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, 964 u64 iova, int access_flags, 965 unsigned int page_size, bool populate) 966 { 967 struct mlx5_ib_dev *dev = to_mdev(pd->device); 968 struct mlx5_ib_mr *mr; 969 __be64 *pas; 970 void *mkc; 971 int inlen; 972 u32 *in; 973 int err; 974 bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); 975 976 if (!page_size) 977 return ERR_PTR(-EINVAL); 978 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 979 if (!mr) 980 return ERR_PTR(-ENOMEM); 981 982 mr->ibmr.pd = pd; 983 mr->access_flags = access_flags; 984 mr->page_shift = order_base_2(page_size); 985 986 inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 987 if (populate) 988 inlen += sizeof(*pas) * 989 roundup(ib_umem_num_dma_blocks(umem, page_size), 2); 990 in = kvzalloc(inlen, GFP_KERNEL); 991 if (!in) { 992 err = -ENOMEM; 993 goto err_1; 994 } 995 pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); 996 if (populate) { 997 if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) { 998 err = -EINVAL; 999 goto err_2; 1000 } 1001 mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas, 1002 pg_cap ? MLX5_IB_MTT_PRESENT : 0); 1003 } 1004 1005 /* The pg_access bit allows setting the access flags 1006 * in the page list submitted with the command. 
*/ 1007 MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap)); 1008 1009 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1010 set_mkc_access_pd_addr_fields(mkc, access_flags, iova, 1011 populate ? pd : dev->umrc.pd); 1012 MLX5_SET(mkc, mkc, free, !populate); 1013 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); 1014 MLX5_SET(mkc, mkc, umr_en, 1); 1015 1016 MLX5_SET64(mkc, mkc, len, umem->length); 1017 MLX5_SET(mkc, mkc, bsf_octword_size, 0); 1018 MLX5_SET(mkc, mkc, translations_octword_size, 1019 get_octo_len(iova, umem->length, mr->page_shift)); 1020 MLX5_SET(mkc, mkc, log_page_size, mr->page_shift); 1021 if (populate) { 1022 MLX5_SET(create_mkey_in, in, translations_octword_actual_size, 1023 get_octo_len(iova, umem->length, mr->page_shift)); 1024 } 1025 1026 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 1027 if (err) { 1028 mlx5_ib_warn(dev, "create mkey failed\n"); 1029 goto err_2; 1030 } 1031 mr->mmkey.type = MLX5_MKEY_MR; 1032 mr->umem = umem; 1033 set_mr_fields(dev, mr, umem->length, access_flags, iova); 1034 kvfree(in); 1035 1036 mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key); 1037 1038 return mr; 1039 1040 err_2: 1041 kvfree(in); 1042 err_1: 1043 kfree(mr); 1044 return ERR_PTR(err); 1045 } 1046 1047 static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr, 1048 u64 length, int acc, int mode) 1049 { 1050 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1051 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 1052 struct mlx5_ib_mr *mr; 1053 void *mkc; 1054 u32 *in; 1055 int err; 1056 1057 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1058 if (!mr) 1059 return ERR_PTR(-ENOMEM); 1060 1061 in = kzalloc(inlen, GFP_KERNEL); 1062 if (!in) { 1063 err = -ENOMEM; 1064 goto err_free; 1065 } 1066 1067 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1068 1069 MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3); 1070 MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7); 1071 MLX5_SET64(mkc, mkc, len, length); 1072 set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd); 1073 1074 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 1075 if (err) 1076 goto err_in; 1077 1078 kfree(in); 1079 1080 set_mr_fields(dev, mr, length, acc, start_addr); 1081 1082 return &mr->ibmr; 1083 1084 err_in: 1085 kfree(in); 1086 1087 err_free: 1088 kfree(mr); 1089 1090 return ERR_PTR(err); 1091 } 1092 1093 int mlx5_ib_advise_mr(struct ib_pd *pd, 1094 enum ib_uverbs_advise_mr_advice advice, 1095 u32 flags, 1096 struct ib_sge *sg_list, 1097 u32 num_sge, 1098 struct uverbs_attr_bundle *attrs) 1099 { 1100 if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH && 1101 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE && 1102 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT) 1103 return -EOPNOTSUPP; 1104 1105 return mlx5_ib_advise_mr_prefetch(pd, advice, flags, 1106 sg_list, num_sge); 1107 } 1108 1109 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, 1110 struct ib_dm_mr_attr *attr, 1111 struct uverbs_attr_bundle *attrs) 1112 { 1113 struct mlx5_ib_dm *mdm = to_mdm(dm); 1114 struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev; 1115 u64 start_addr = mdm->dev_addr + attr->offset; 1116 int mode; 1117 1118 switch (mdm->type) { 1119 case MLX5_IB_UAPI_DM_TYPE_MEMIC: 1120 if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS) 1121 return ERR_PTR(-EINVAL); 1122 1123 mode = MLX5_MKC_ACCESS_MODE_MEMIC; 1124 start_addr -= pci_resource_start(dev->pdev, 0); 1125 break; 1126 case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM: 1127 case 
MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM: 1128 case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM: 1129 if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS) 1130 return ERR_PTR(-EINVAL); 1131 1132 mode = MLX5_MKC_ACCESS_MODE_SW_ICM; 1133 break; 1134 default: 1135 return ERR_PTR(-EINVAL); 1136 } 1137 1138 return mlx5_ib_get_dm_mr(pd, start_addr, attr->length, 1139 attr->access_flags, mode); 1140 } 1141 1142 static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, 1143 u64 iova, int access_flags) 1144 { 1145 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1146 struct mlx5_ib_mr *mr = NULL; 1147 bool xlt_with_umr; 1148 int err; 1149 1150 xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length); 1151 if (xlt_with_umr) { 1152 mr = alloc_cacheable_mr(pd, umem, iova, access_flags); 1153 } else { 1154 unsigned int page_size = mlx5_umem_find_best_pgsz( 1155 umem, mkc, log_page_size, 0, iova); 1156 1157 mutex_lock(&dev->slow_path_mutex); 1158 mr = reg_create(pd, umem, iova, access_flags, page_size, true); 1159 mutex_unlock(&dev->slow_path_mutex); 1160 } 1161 if (IS_ERR(mr)) { 1162 ib_umem_release(umem); 1163 return ERR_CAST(mr); 1164 } 1165 1166 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); 1167 1168 atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages); 1169 1170 if (xlt_with_umr) { 1171 /* 1172 * If the MR was created with reg_create then it will be 1173 * configured properly but left disabled. It is safe to go ahead 1174 * and configure it again via UMR while enabling it. 1175 */ 1176 err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE); 1177 if (err) { 1178 mlx5_ib_dereg_mr(&mr->ibmr, NULL); 1179 return ERR_PTR(err); 1180 } 1181 } 1182 return &mr->ibmr; 1183 } 1184 1185 static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, 1186 u64 iova, int access_flags, 1187 struct ib_udata *udata) 1188 { 1189 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1190 struct ib_umem_odp *odp; 1191 struct mlx5_ib_mr *mr; 1192 int err; 1193 1194 if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) 1195 return ERR_PTR(-EOPNOTSUPP); 1196 1197 err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq); 1198 if (err) 1199 return ERR_PTR(err); 1200 if (!start && length == U64_MAX) { 1201 if (iova != 0) 1202 return ERR_PTR(-EINVAL); 1203 if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) 1204 return ERR_PTR(-EINVAL); 1205 1206 mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags); 1207 if (IS_ERR(mr)) 1208 return ERR_CAST(mr); 1209 return &mr->ibmr; 1210 } 1211 1212 /* ODP requires xlt update via umr to work. 
*/ 1213 if (!mlx5r_umr_can_load_pas(dev, length)) 1214 return ERR_PTR(-EINVAL); 1215 1216 odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags, 1217 &mlx5_mn_ops); 1218 if (IS_ERR(odp)) 1219 return ERR_CAST(odp); 1220 1221 mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags); 1222 if (IS_ERR(mr)) { 1223 ib_umem_release(&odp->umem); 1224 return ERR_CAST(mr); 1225 } 1226 xa_init(&mr->implicit_children); 1227 1228 odp->private = mr; 1229 err = mlx5r_store_odp_mkey(dev, &mr->mmkey); 1230 if (err) 1231 goto err_dereg_mr; 1232 1233 err = mlx5_ib_init_odp_mr(mr); 1234 if (err) 1235 goto err_dereg_mr; 1236 return &mr->ibmr; 1237 1238 err_dereg_mr: 1239 mlx5_ib_dereg_mr(&mr->ibmr, NULL); 1240 return ERR_PTR(err); 1241 } 1242 1243 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, 1244 u64 iova, int access_flags, 1245 struct ib_udata *udata) 1246 { 1247 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1248 struct ib_umem *umem; 1249 1250 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) 1251 return ERR_PTR(-EOPNOTSUPP); 1252 1253 mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", 1254 start, iova, length, access_flags); 1255 1256 if (access_flags & IB_ACCESS_ON_DEMAND) 1257 return create_user_odp_mr(pd, start, length, iova, access_flags, 1258 udata); 1259 umem = ib_umem_get(&dev->ib_dev, start, length, access_flags); 1260 if (IS_ERR(umem)) 1261 return ERR_CAST(umem); 1262 return create_real_mr(pd, umem, iova, access_flags); 1263 } 1264 1265 static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach) 1266 { 1267 struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv; 1268 struct mlx5_ib_mr *mr = umem_dmabuf->private; 1269 1270 dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); 1271 1272 if (!umem_dmabuf->sgt) 1273 return; 1274 1275 mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP); 1276 ib_umem_dmabuf_unmap_pages(umem_dmabuf); 1277 } 1278 1279 static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { 1280 .allow_peer2peer = 1, 1281 .move_notify = mlx5_ib_dmabuf_invalidate_cb, 1282 }; 1283 1284 struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, 1285 u64 length, u64 virt_addr, 1286 int fd, int access_flags, 1287 struct ib_udata *udata) 1288 { 1289 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1290 struct mlx5_ib_mr *mr = NULL; 1291 struct ib_umem_dmabuf *umem_dmabuf; 1292 int err; 1293 1294 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || 1295 !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) 1296 return ERR_PTR(-EOPNOTSUPP); 1297 1298 mlx5_ib_dbg(dev, 1299 "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n", 1300 offset, virt_addr, length, fd, access_flags); 1301 1302 /* dmabuf requires xlt update via umr to work. 
*/ 1303 if (!mlx5r_umr_can_load_pas(dev, length)) 1304 return ERR_PTR(-EINVAL); 1305 1306 umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, 1307 access_flags, 1308 &mlx5_ib_dmabuf_attach_ops); 1309 if (IS_ERR(umem_dmabuf)) { 1310 mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n", 1311 PTR_ERR(umem_dmabuf)); 1312 return ERR_CAST(umem_dmabuf); 1313 } 1314 1315 mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr, 1316 access_flags); 1317 if (IS_ERR(mr)) { 1318 ib_umem_release(&umem_dmabuf->umem); 1319 return ERR_CAST(mr); 1320 } 1321 1322 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); 1323 1324 atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages); 1325 umem_dmabuf->private = mr; 1326 err = mlx5r_store_odp_mkey(dev, &mr->mmkey); 1327 if (err) 1328 goto err_dereg_mr; 1329 1330 err = mlx5_ib_init_dmabuf_mr(mr); 1331 if (err) 1332 goto err_dereg_mr; 1333 return &mr->ibmr; 1334 1335 err_dereg_mr: 1336 mlx5_ib_dereg_mr(&mr->ibmr, NULL); 1337 return ERR_PTR(err); 1338 } 1339 1340 /* 1341 * True if the change in access flags can be done via UMR, only some access 1342 * flags can be updated. 1343 */ 1344 static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev, 1345 unsigned int current_access_flags, 1346 unsigned int target_access_flags) 1347 { 1348 unsigned int diffs = current_access_flags ^ target_access_flags; 1349 1350 if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | 1351 IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING)) 1352 return false; 1353 return mlx5r_umr_can_reconfig(dev, current_access_flags, 1354 target_access_flags); 1355 } 1356 1357 static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, 1358 struct ib_umem *new_umem, 1359 int new_access_flags, u64 iova, 1360 unsigned long *page_size) 1361 { 1362 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1363 1364 /* We only track the allocated sizes of MRs from the cache */ 1365 if (!mr->mmkey.cache_ent) 1366 return false; 1367 if (!mlx5r_umr_can_load_pas(dev, new_umem->length)) 1368 return false; 1369 1370 *page_size = 1371 mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova); 1372 if (WARN_ON(!*page_size)) 1373 return false; 1374 return (1ULL << mr->mmkey.cache_ent->order) >= 1375 ib_umem_num_dma_blocks(new_umem, *page_size); 1376 } 1377 1378 static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd, 1379 int access_flags, int flags, struct ib_umem *new_umem, 1380 u64 iova, unsigned long page_size) 1381 { 1382 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1383 int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE; 1384 struct ib_umem *old_umem = mr->umem; 1385 int err; 1386 1387 /* 1388 * To keep everything simple the MR is revoked before we start to mess 1389 * with it. This ensure the change is atomic relative to any use of the 1390 * MR. 1391 */ 1392 err = mlx5r_umr_revoke_mr(mr); 1393 if (err) 1394 return err; 1395 1396 if (flags & IB_MR_REREG_PD) { 1397 mr->ibmr.pd = pd; 1398 upd_flags |= MLX5_IB_UPD_XLT_PD; 1399 } 1400 if (flags & IB_MR_REREG_ACCESS) { 1401 mr->access_flags = access_flags; 1402 upd_flags |= MLX5_IB_UPD_XLT_ACCESS; 1403 } 1404 1405 mr->ibmr.length = new_umem->length; 1406 mr->ibmr.iova = iova; 1407 mr->ibmr.length = new_umem->length; 1408 mr->page_shift = order_base_2(page_size); 1409 mr->umem = new_umem; 1410 err = mlx5r_umr_update_mr_pas(mr, upd_flags); 1411 if (err) { 1412 /* 1413 * The MR is revoked at this point so there is no issue to free 1414 * new_umem. 
1415 */ 1416 mr->umem = old_umem; 1417 return err; 1418 } 1419 1420 atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages); 1421 ib_umem_release(old_umem); 1422 atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages); 1423 return 0; 1424 } 1425 1426 struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, 1427 u64 length, u64 iova, int new_access_flags, 1428 struct ib_pd *new_pd, 1429 struct ib_udata *udata) 1430 { 1431 struct mlx5_ib_dev *dev = to_mdev(ib_mr->device); 1432 struct mlx5_ib_mr *mr = to_mmr(ib_mr); 1433 int err; 1434 1435 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) 1436 return ERR_PTR(-EOPNOTSUPP); 1437 1438 mlx5_ib_dbg( 1439 dev, 1440 "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", 1441 start, iova, length, new_access_flags); 1442 1443 if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) 1444 return ERR_PTR(-EOPNOTSUPP); 1445 1446 if (!(flags & IB_MR_REREG_ACCESS)) 1447 new_access_flags = mr->access_flags; 1448 if (!(flags & IB_MR_REREG_PD)) 1449 new_pd = ib_mr->pd; 1450 1451 if (!(flags & IB_MR_REREG_TRANS)) { 1452 struct ib_umem *umem; 1453 1454 /* Fast path for PD/access change */ 1455 if (can_use_umr_rereg_access(dev, mr->access_flags, 1456 new_access_flags)) { 1457 err = mlx5r_umr_rereg_pd_access(mr, new_pd, 1458 new_access_flags); 1459 if (err) 1460 return ERR_PTR(err); 1461 return NULL; 1462 } 1463 /* DM or ODP MR's don't have a normal umem so we can't re-use it */ 1464 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr)) 1465 goto recreate; 1466 1467 /* 1468 * Only one active MR can refer to a umem at one time, revoke 1469 * the old MR before assigning the umem to the new one. 1470 */ 1471 err = mlx5r_umr_revoke_mr(mr); 1472 if (err) 1473 return ERR_PTR(err); 1474 umem = mr->umem; 1475 mr->umem = NULL; 1476 atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages); 1477 1478 return create_real_mr(new_pd, umem, mr->ibmr.iova, 1479 new_access_flags); 1480 } 1481 1482 /* 1483 * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does 1484 * but the logic around releasing the umem is different 1485 */ 1486 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr)) 1487 goto recreate; 1488 1489 if (!(new_access_flags & IB_ACCESS_ON_DEMAND) && 1490 can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) { 1491 struct ib_umem *new_umem; 1492 unsigned long page_size; 1493 1494 new_umem = ib_umem_get(&dev->ib_dev, start, length, 1495 new_access_flags); 1496 if (IS_ERR(new_umem)) 1497 return ERR_CAST(new_umem); 1498 1499 /* Fast path for PAS change */ 1500 if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova, 1501 &page_size)) { 1502 err = umr_rereg_pas(mr, new_pd, new_access_flags, flags, 1503 new_umem, iova, page_size); 1504 if (err) { 1505 ib_umem_release(new_umem); 1506 return ERR_PTR(err); 1507 } 1508 return NULL; 1509 } 1510 return create_real_mr(new_pd, new_umem, iova, new_access_flags); 1511 } 1512 1513 /* 1514 * Everything else has no state we can preserve, just create a new MR 1515 * from scratch 1516 */ 1517 recreate: 1518 return mlx5_ib_reg_user_mr(new_pd, start, length, iova, 1519 new_access_flags, udata); 1520 } 1521 1522 static int 1523 mlx5_alloc_priv_descs(struct ib_device *device, 1524 struct mlx5_ib_mr *mr, 1525 int ndescs, 1526 int desc_size) 1527 { 1528 struct mlx5_ib_dev *dev = to_mdev(device); 1529 struct device *ddev = &dev->mdev->pdev->dev; 1530 int size = ndescs * desc_size; 1531 int add_size; 1532 int ret; 1533 1534 add_size = max_t(int, 
MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0); 1535 1536 mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL); 1537 if (!mr->descs_alloc) 1538 return -ENOMEM; 1539 1540 mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN); 1541 1542 mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE); 1543 if (dma_mapping_error(ddev, mr->desc_map)) { 1544 ret = -ENOMEM; 1545 goto err; 1546 } 1547 1548 return 0; 1549 err: 1550 kfree(mr->descs_alloc); 1551 1552 return ret; 1553 } 1554 1555 static void 1556 mlx5_free_priv_descs(struct mlx5_ib_mr *mr) 1557 { 1558 if (!mr->umem && mr->descs) { 1559 struct ib_device *device = mr->ibmr.device; 1560 int size = mr->max_descs * mr->desc_size; 1561 struct mlx5_ib_dev *dev = to_mdev(device); 1562 1563 dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size, 1564 DMA_TO_DEVICE); 1565 kfree(mr->descs_alloc); 1566 mr->descs = NULL; 1567 } 1568 } 1569 1570 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) 1571 { 1572 struct mlx5_ib_mr *mr = to_mmr(ibmr); 1573 struct mlx5_ib_dev *dev = to_mdev(ibmr->device); 1574 int rc; 1575 1576 /* 1577 * Any async use of the mr must hold the refcount, once the refcount 1578 * goes to zero no other thread, such as ODP page faults, prefetch, any 1579 * UMR activity, etc can touch the mkey. Thus it is safe to destroy it. 1580 */ 1581 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && 1582 refcount_read(&mr->mmkey.usecount) != 0 && 1583 xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key))) 1584 mlx5r_deref_wait_odp_mkey(&mr->mmkey); 1585 1586 if (ibmr->type == IB_MR_TYPE_INTEGRITY) { 1587 xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key), 1588 mr->sig, NULL, GFP_KERNEL); 1589 1590 if (mr->mtt_mr) { 1591 rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL); 1592 if (rc) 1593 return rc; 1594 mr->mtt_mr = NULL; 1595 } 1596 if (mr->klm_mr) { 1597 rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL); 1598 if (rc) 1599 return rc; 1600 mr->klm_mr = NULL; 1601 } 1602 1603 if (mlx5_core_destroy_psv(dev->mdev, 1604 mr->sig->psv_memory.psv_idx)) 1605 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", 1606 mr->sig->psv_memory.psv_idx); 1607 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx)) 1608 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", 1609 mr->sig->psv_wire.psv_idx); 1610 kfree(mr->sig); 1611 mr->sig = NULL; 1612 } 1613 1614 /* Stop DMA */ 1615 if (mr->mmkey.cache_ent) { 1616 xa_lock_irq(&mr->mmkey.cache_ent->mkeys); 1617 mr->mmkey.cache_ent->in_use--; 1618 xa_unlock_irq(&mr->mmkey.cache_ent->mkeys); 1619 1620 if (mlx5r_umr_revoke_mr(mr) || 1621 push_mkey(mr->mmkey.cache_ent, false, 1622 xa_mk_value(mr->mmkey.key))) 1623 mr->mmkey.cache_ent = NULL; 1624 } 1625 if (!mr->mmkey.cache_ent) { 1626 rc = destroy_mkey(to_mdev(mr->ibmr.device), mr); 1627 if (rc) 1628 return rc; 1629 } 1630 1631 if (mr->umem) { 1632 bool is_odp = is_odp_mr(mr); 1633 1634 if (!is_odp) 1635 atomic_sub(ib_umem_num_pages(mr->umem), 1636 &dev->mdev->priv.reg_pages); 1637 ib_umem_release(mr->umem); 1638 if (is_odp) 1639 mlx5_ib_free_odp_mr(mr); 1640 } 1641 1642 if (!mr->mmkey.cache_ent) 1643 mlx5_free_priv_descs(mr); 1644 1645 kfree(mr); 1646 return 0; 1647 } 1648 1649 static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs, 1650 int access_mode, int page_shift) 1651 { 1652 void *mkc; 1653 1654 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1655 1656 /* This is only used from the kernel, so setting the PD is OK. 
*/ 1657 set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd); 1658 MLX5_SET(mkc, mkc, free, 1); 1659 MLX5_SET(mkc, mkc, translations_octword_size, ndescs); 1660 MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3); 1661 MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7); 1662 MLX5_SET(mkc, mkc, umr_en, 1); 1663 MLX5_SET(mkc, mkc, log_page_size, page_shift); 1664 } 1665 1666 static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 1667 int ndescs, int desc_size, int page_shift, 1668 int access_mode, u32 *in, int inlen) 1669 { 1670 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1671 int err; 1672 1673 mr->access_mode = access_mode; 1674 mr->desc_size = desc_size; 1675 mr->max_descs = ndescs; 1676 1677 err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size); 1678 if (err) 1679 return err; 1680 1681 mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift); 1682 1683 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 1684 if (err) 1685 goto err_free_descs; 1686 1687 mr->mmkey.type = MLX5_MKEY_MR; 1688 mr->ibmr.lkey = mr->mmkey.key; 1689 mr->ibmr.rkey = mr->mmkey.key; 1690 1691 return 0; 1692 1693 err_free_descs: 1694 mlx5_free_priv_descs(mr); 1695 return err; 1696 } 1697 1698 static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd, 1699 u32 max_num_sg, u32 max_num_meta_sg, 1700 int desc_size, int access_mode) 1701 { 1702 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 1703 int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4); 1704 int page_shift = 0; 1705 struct mlx5_ib_mr *mr; 1706 u32 *in; 1707 int err; 1708 1709 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1710 if (!mr) 1711 return ERR_PTR(-ENOMEM); 1712 1713 mr->ibmr.pd = pd; 1714 mr->ibmr.device = pd->device; 1715 1716 in = kzalloc(inlen, GFP_KERNEL); 1717 if (!in) { 1718 err = -ENOMEM; 1719 goto err_free; 1720 } 1721 1722 if (access_mode == MLX5_MKC_ACCESS_MODE_MTT) 1723 page_shift = PAGE_SHIFT; 1724 1725 err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift, 1726 access_mode, in, inlen); 1727 if (err) 1728 goto err_free_in; 1729 1730 mr->umem = NULL; 1731 kfree(in); 1732 1733 return mr; 1734 1735 err_free_in: 1736 kfree(in); 1737 err_free: 1738 kfree(mr); 1739 return ERR_PTR(err); 1740 } 1741 1742 static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 1743 int ndescs, u32 *in, int inlen) 1744 { 1745 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt), 1746 PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in, 1747 inlen); 1748 } 1749 1750 static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 1751 int ndescs, u32 *in, int inlen) 1752 { 1753 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm), 1754 0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen); 1755 } 1756 1757 static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 1758 int max_num_sg, int max_num_meta_sg, 1759 u32 *in, int inlen) 1760 { 1761 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1762 u32 psv_index[2]; 1763 void *mkc; 1764 int err; 1765 1766 mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); 1767 if (!mr->sig) 1768 return -ENOMEM; 1769 1770 /* create mem & wire PSVs */ 1771 err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index); 1772 if (err) 1773 goto err_free_sig; 1774 1775 mr->sig->psv_memory.psv_idx = psv_index[0]; 1776 mr->sig->psv_wire.psv_idx = psv_index[1]; 1777 1778 mr->sig->sig_status_checked = true; 1779 mr->sig->sig_err_exists = false; 1780 /* Next UMR, Arm SIGERR */ 1781 
++mr->sig->sigerr_count; 1782 mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg, 1783 sizeof(struct mlx5_klm), 1784 MLX5_MKC_ACCESS_MODE_KLMS); 1785 if (IS_ERR(mr->klm_mr)) { 1786 err = PTR_ERR(mr->klm_mr); 1787 goto err_destroy_psv; 1788 } 1789 mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg, 1790 sizeof(struct mlx5_mtt), 1791 MLX5_MKC_ACCESS_MODE_MTT); 1792 if (IS_ERR(mr->mtt_mr)) { 1793 err = PTR_ERR(mr->mtt_mr); 1794 goto err_free_klm_mr; 1795 } 1796 1797 /* Set bsf descriptors for mkey */ 1798 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1799 MLX5_SET(mkc, mkc, bsf_en, 1); 1800 MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE); 1801 1802 err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0, 1803 MLX5_MKC_ACCESS_MODE_KLMS, in, inlen); 1804 if (err) 1805 goto err_free_mtt_mr; 1806 1807 err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key), 1808 mr->sig, GFP_KERNEL)); 1809 if (err) 1810 goto err_free_descs; 1811 return 0; 1812 1813 err_free_descs: 1814 destroy_mkey(dev, mr); 1815 mlx5_free_priv_descs(mr); 1816 err_free_mtt_mr: 1817 mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL); 1818 mr->mtt_mr = NULL; 1819 err_free_klm_mr: 1820 mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL); 1821 mr->klm_mr = NULL; 1822 err_destroy_psv: 1823 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx)) 1824 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", 1825 mr->sig->psv_memory.psv_idx); 1826 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx)) 1827 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", 1828 mr->sig->psv_wire.psv_idx); 1829 err_free_sig: 1830 kfree(mr->sig); 1831 1832 return err; 1833 } 1834 1835 static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd, 1836 enum ib_mr_type mr_type, u32 max_num_sg, 1837 u32 max_num_meta_sg) 1838 { 1839 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1840 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 1841 int ndescs = ALIGN(max_num_sg, 4); 1842 struct mlx5_ib_mr *mr; 1843 u32 *in; 1844 int err; 1845 1846 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1847 if (!mr) 1848 return ERR_PTR(-ENOMEM); 1849 1850 in = kzalloc(inlen, GFP_KERNEL); 1851 if (!in) { 1852 err = -ENOMEM; 1853 goto err_free; 1854 } 1855 1856 mr->ibmr.device = pd->device; 1857 mr->umem = NULL; 1858 1859 switch (mr_type) { 1860 case IB_MR_TYPE_MEM_REG: 1861 err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen); 1862 break; 1863 case IB_MR_TYPE_SG_GAPS: 1864 err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen); 1865 break; 1866 case IB_MR_TYPE_INTEGRITY: 1867 err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg, 1868 max_num_meta_sg, in, inlen); 1869 break; 1870 default: 1871 mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type); 1872 err = -EINVAL; 1873 } 1874 1875 if (err) 1876 goto err_free_in; 1877 1878 kfree(in); 1879 1880 return &mr->ibmr; 1881 1882 err_free_in: 1883 kfree(in); 1884 err_free: 1885 kfree(mr); 1886 return ERR_PTR(err); 1887 } 1888 1889 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, 1890 u32 max_num_sg) 1891 { 1892 return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0); 1893 } 1894 1895 struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd, 1896 u32 max_num_sg, u32 max_num_meta_sg) 1897 { 1898 return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg, 1899 max_num_meta_sg); 1900 } 1901 1902 int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata) 1903 { 1904 struct mlx5_ib_dev *dev = to_mdev(ibmw->device); 1905 int inlen = 
MLX5_ST_SZ_BYTES(create_mkey_in); 1906 struct mlx5_ib_mw *mw = to_mmw(ibmw); 1907 unsigned int ndescs; 1908 u32 *in = NULL; 1909 void *mkc; 1910 int err; 1911 struct mlx5_ib_alloc_mw req = {}; 1912 struct { 1913 __u32 comp_mask; 1914 __u32 response_length; 1915 } resp = {}; 1916 1917 err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req))); 1918 if (err) 1919 return err; 1920 1921 if (req.comp_mask || req.reserved1 || req.reserved2) 1922 return -EOPNOTSUPP; 1923 1924 if (udata->inlen > sizeof(req) && 1925 !ib_is_udata_cleared(udata, sizeof(req), 1926 udata->inlen - sizeof(req))) 1927 return -EOPNOTSUPP; 1928 1929 ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4); 1930 1931 in = kzalloc(inlen, GFP_KERNEL); 1932 if (!in) { 1933 err = -ENOMEM; 1934 goto free; 1935 } 1936 1937 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1938 1939 MLX5_SET(mkc, mkc, free, 1); 1940 MLX5_SET(mkc, mkc, translations_octword_size, ndescs); 1941 MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn); 1942 MLX5_SET(mkc, mkc, umr_en, 1); 1943 MLX5_SET(mkc, mkc, lr, 1); 1944 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS); 1945 MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2))); 1946 MLX5_SET(mkc, mkc, qpn, 0xffffff); 1947 1948 err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen); 1949 if (err) 1950 goto free; 1951 1952 mw->mmkey.type = MLX5_MKEY_MW; 1953 ibmw->rkey = mw->mmkey.key; 1954 mw->mmkey.ndescs = ndescs; 1955 1956 resp.response_length = 1957 min(offsetofend(typeof(resp), response_length), udata->outlen); 1958 if (resp.response_length) { 1959 err = ib_copy_to_udata(udata, &resp, resp.response_length); 1960 if (err) 1961 goto free_mkey; 1962 } 1963 1964 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { 1965 err = mlx5r_store_odp_mkey(dev, &mw->mmkey); 1966 if (err) 1967 goto free_mkey; 1968 } 1969 1970 kfree(in); 1971 return 0; 1972 1973 free_mkey: 1974 mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key); 1975 free: 1976 kfree(in); 1977 return err; 1978 } 1979 1980 int mlx5_ib_dealloc_mw(struct ib_mw *mw) 1981 { 1982 struct mlx5_ib_dev *dev = to_mdev(mw->device); 1983 struct mlx5_ib_mw *mmw = to_mmw(mw); 1984 1985 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && 1986 xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key))) 1987 /* 1988 * pagefault_single_data_segment() may be accessing mmw 1989 * if the user bound an ODP MR to this MW. 
int mlx5_ib_dealloc_mw(struct ib_mw *mw)
{
	struct mlx5_ib_dev *dev = to_mdev(mw->device);
	struct mlx5_ib_mw *mmw = to_mmw(mw);

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
		/*
		 * pagefault_single_data_segment() may be accessing mmw
		 * if the user bound an ODP MR to this MW.
		 */
		mlx5r_deref_wait_odp_mkey(&mmw->mmkey);

	return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key);
}

int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
			    struct ib_mr_status *mr_status)
{
	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
	int ret = 0;

	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
		pr_err("Invalid status check mask\n");
		ret = -EINVAL;
		goto done;
	}

	mr_status->fail_status = 0;
	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
		if (!mmr->sig) {
			ret = -EINVAL;
			pr_err("signature status check requested on a non-signature enabled MR\n");
			goto done;
		}

		mmr->sig->sig_status_checked = true;
		if (!mmr->sig->sig_err_exists)
			goto done;

		if (ibmr->lkey == mmr->sig->err_item.key)
			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
			       sizeof(mr_status->sig_err));
		else {
			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
			mr_status->sig_err.sig_err_offset = 0;
			mr_status->sig_err.key = mmr->sig->err_item.key;
		}

		mmr->sig->sig_err_exists = false;
		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
	}

done:
	return ret;
}
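
/*
 * Usage sketch (illustrative, not part of this file): after an I/O that used
 * a signature-enabled MR completes, a ULP would typically query the MR via
 * the core verb that lands in mlx5_ib_check_mr_status() above. A minimal,
 * hedged example, assuming "mr" is the ULP's integrity MR:
 *
 *	struct ib_mr_status mr_status;
 *	int ret;
 *
 *	ret = ib_check_mr_status(mr, IB_MR_CHECK_SIG_STATUS, &mr_status);
 *	if (ret)
 *		return ret;
 *	if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS)
 *		pr_warn("sig error %d at offset %llu\n",
 *			mr_status.sig_err.err_type,
 *			mr_status.sig_err.sig_err_offset);
 */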
static int
mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			int data_sg_nents, unsigned int *data_sg_offset,
			struct scatterlist *meta_sg, int meta_sg_nents,
			unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	unsigned int sg_offset = 0;
	int n = 0;

	mr->meta_length = 0;
	if (data_sg_nents == 1) {
		n++;
		mr->mmkey.ndescs = 1;
		if (data_sg_offset)
			sg_offset = *data_sg_offset;
		mr->data_length = sg_dma_len(data_sg) - sg_offset;
		mr->data_iova = sg_dma_address(data_sg) + sg_offset;
		if (meta_sg_nents == 1) {
			n++;
			mr->meta_ndescs = 1;
			if (meta_sg_offset)
				sg_offset = *meta_sg_offset;
			else
				sg_offset = 0;
			mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
			mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
		}
		ibmr->length = mr->data_length + mr->meta_length;
	}

	return n;
}

static int
mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
		   struct scatterlist *sgl,
		   unsigned short sg_nents,
		   unsigned int *sg_offset_p,
		   struct scatterlist *meta_sgl,
		   unsigned short meta_sg_nents,
		   unsigned int *meta_sg_offset_p)
{
	struct scatterlist *sg = sgl;
	struct mlx5_klm *klms = mr->descs;
	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
	u32 lkey = mr->ibmr.pd->local_dma_lkey;
	int i, j = 0;

	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
	mr->ibmr.length = 0;

	for_each_sg(sgl, sg, sg_nents, i) {
		if (unlikely(i >= mr->max_descs))
			break;
		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
		klms[i].key = cpu_to_be32(lkey);
		mr->ibmr.length += sg_dma_len(sg) - sg_offset;

		sg_offset = 0;
	}

	if (sg_offset_p)
		*sg_offset_p = sg_offset;

	mr->mmkey.ndescs = i;
	mr->data_length = mr->ibmr.length;

	if (meta_sg_nents) {
		sg = meta_sgl;
		sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
			if (unlikely(i + j >= mr->max_descs))
				break;
			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
						     sg_offset);
			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
							 sg_offset);
			klms[i + j].key = cpu_to_be32(lkey);
			mr->ibmr.length += sg_dma_len(sg) - sg_offset;

			sg_offset = 0;
		}
		if (meta_sg_offset_p)
			*meta_sg_offset_p = sg_offset;

		mr->meta_ndescs = j;
		mr->meta_length = mr->ibmr.length - mr->data_length;
	}

	return i + j;
}

static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
		cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int
mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	pi_mr->ibmr.page_size = ibmr->page_size;
	n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
			   mlx5_set_page);
	if (n != data_sg_nents)
		return n;

	pi_mr->data_iova = pi_mr->ibmr.iova;
	pi_mr->data_length = pi_mr->ibmr.length;
	pi_mr->ibmr.length = pi_mr->data_length;
	ibmr->length = pi_mr->data_length;

	if (meta_sg_nents) {
		u64 page_mask = ~((u64)ibmr->page_size - 1);
		u64 iova = pi_mr->data_iova;

		n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
				    meta_sg_offset, mlx5_set_page_pi);

		pi_mr->meta_length = pi_mr->ibmr.length;
		/*
		 * The PI address for the HW is the offset of the metadata
		 * address relative to the first data page address.
		 * It equals the first data page address + the size of the
		 * data pages + the metadata offset within the first
		 * metadata page.
		 */
		pi_mr->pi_iova = (iova & page_mask) +
				 pi_mr->mmkey.ndescs * ibmr->page_size +
				 (pi_mr->ibmr.iova & ~page_mask);
		/*
		 * In order to use one MTT MR for both data and metadata, we
		 * also register the gaps between the end of the data and the
		 * start of the metadata (the sig MR verifies that the HW
		 * accesses only the right addresses). This mapping is safe
		 * because we use an internal mkey for the registration.
		 */
		pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
		pi_mr->ibmr.iova = iova;
		ibmr->length += pi_mr->meta_length;
	}

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}
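
/*
 * Worked example for the pi_iova computation above (assumed numbers, for
 * illustration only): with ibmr->page_size = 0x1000, a data buffer at
 * iova = 0x10200 spanning three MTT pages (mmkey.ndescs = 3), and metadata
 * whose own iova is 0x20080:
 *
 *	page_mask                = ~0xfffULL
 *	iova & page_mask         = 0x10000	first data page address
 *	ndescs * page_size       = 0x03000	size of the data pages
 *	meta iova & ~page_mask   = 0x00080	metadata offset in its page
 *	pi_iova                  = 0x10000 + 0x3000 + 0x80 = 0x13080
 *
 * pi_mr->ibmr.length is then stretched to pi_iova + meta_length - iova, so
 * the single MTT MR covers the data, the gap and the metadata.
 */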
static int
mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->klm_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
			       meta_sg, meta_sg_nents, meta_sg_offset);

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	/* This is a zero-based memory region */
	pi_mr->data_iova = 0;
	pi_mr->ibmr.iova = 0;
	pi_mr->pi_iova = pi_mr->data_length;
	ibmr->length = pi_mr->ibmr.length;

	return n;
}

int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = NULL;
	int n;

	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);

	mr->mmkey.ndescs = 0;
	mr->data_length = 0;
	mr->data_iova = 0;
	mr->meta_ndescs = 0;
	mr->pi_iova = 0;
	/*
	 * As a performance optimization, avoid a UMR operation to register
	 * the data/metadata buffers when possible. First try to map the SG
	 * lists to PA descriptors with the local_dma_lkey, and fall back to
	 * UMR only if that fails.
	 */
	n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				    data_sg_offset, meta_sg, meta_sg_nents,
				    meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;
	/*
	 * As a further optimization, avoid mapping the SG lists to KLM
	 * descriptors when possible. First try to map them to MTT
	 * descriptors and fall back to KLM only if that fails.
	 * It's more efficient for the HW to work with MTT descriptors
	 * (especially under high load).
	 * Use KLM (indirect access) only when it is mandatory.
	 */
	pi_mr = mr->mtt_mr;
	n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;

	pi_mr = mr->klm_mr;
	n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (unlikely(n != data_sg_nents + meta_sg_nents))
		return -ENOMEM;

out:
	/* This is a zero-based memory region */
	ibmr->iova = 0;
	mr->pi_mr = pi_mr;
	if (pi_mr)
		ibmr->sig_attrs->meta_length = pi_mr->meta_length;
	else
		ibmr->sig_attrs->meta_length = mr->meta_length;

	return 0;
}
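
/*
 * Fallback illustration (assumed counts, for clarity only): with
 * data_sg_nents = 4 and meta_sg_nents = 1, mlx5_ib_map_pa_mr_sg_pi() handles
 * only the single-entry case, so it returns n < 5 and the MTT path is tried
 * next; if ib_sg_to_pages() cannot express the buffers as page-aligned
 * entries either, that attempt also falls short and the KLM path finally
 * maps all five entries at byte granularity.
 */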
int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
		      unsigned int *sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	int n;

	mr->mmkey.ndescs = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
				   mr->desc_size * mr->max_descs,
				   DMA_TO_DEVICE);

	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
				       NULL);
	else
		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
				   mlx5_set_page);

	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
				      mr->desc_size * mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}
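
/*
 * Usage sketch (illustrative only): a kernel ULP reaches mlx5_ib_map_mr_sg()
 * through the core fast-registration flow. A minimal, hedged example,
 * assuming "mr" came from ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, nents) and
 * "sgl" is an already DMA-mapped scatterlist owned by the ULP:
 *
 *	struct ib_reg_wr reg_wr = {};
 *	int n;
 *
 *	n = ib_map_mr_sg(mr, sgl, nents, NULL, PAGE_SIZE);
 *	if (n != nents)
 *		return n < 0 ? n : -EINVAL;
 *
 *	ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey));
 *
 *	reg_wr.wr.opcode = IB_WR_REG_MR;
 *	reg_wr.mr = mr;
 *	reg_wr.key = mr->rkey;
 *	reg_wr.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ;
 *
 * The registration work request is then posted with ib_post_send() before
 * the data transfer that uses mr->rkey.
 */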