1 /* 2 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved. 3 * Copyright (c) 2020, Intel Corporation. All rights reserved. 4 * 5 * This software is available to you under a choice of one of two 6 * licenses. You may choose to be licensed under the terms of the GNU 7 * General Public License (GPL) Version 2, available from the file 8 * COPYING in the main directory of this source tree, or the 9 * OpenIB.org BSD license below: 10 * 11 * Redistribution and use in source and binary forms, with or 12 * without modification, are permitted provided that the following 13 * conditions are met: 14 * 15 * - Redistributions of source code must retain the above 16 * copyright notice, this list of conditions and the following 17 * disclaimer. 18 * 19 * - Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials 22 * provided with the distribution. 23 * 24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 31 * SOFTWARE. 32 */ 33 34 35 #include <linux/kref.h> 36 #include <linux/random.h> 37 #include <linux/debugfs.h> 38 #include <linux/export.h> 39 #include <linux/delay.h> 40 #include <linux/dma-buf.h> 41 #include <linux/dma-resv.h> 42 #include <rdma/ib_umem_odp.h> 43 #include "dm.h" 44 #include "mlx5_ib.h" 45 #include "umr.h" 46 47 enum { 48 MAX_PENDING_REG_MR = 8, 49 }; 50 51 #define MLX5_UMR_ALIGN 2048 52 53 static void 54 create_mkey_callback(int status, struct mlx5_async_work *context); 55 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, 56 u64 iova, int access_flags, 57 unsigned int page_size, bool populate); 58 59 static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr, 60 struct ib_pd *pd) 61 { 62 struct mlx5_ib_dev *dev = to_mdev(pd->device); 63 64 MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC)); 65 MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE)); 66 MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ)); 67 MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE)); 68 MLX5_SET(mkc, mkc, lr, 1); 69 70 if ((acc & IB_ACCESS_RELAXED_ORDERING) && 71 pcie_relaxed_ordering_enabled(dev->mdev->pdev)) { 72 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write)) 73 MLX5_SET(mkc, mkc, relaxed_ordering_write, 1); 74 if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read)) 75 MLX5_SET(mkc, mkc, relaxed_ordering_read, 1); 76 } 77 78 MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn); 79 MLX5_SET(mkc, mkc, qpn, 0xffffff); 80 MLX5_SET64(mkc, mkc, start_addr, start_addr); 81 } 82 83 static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in) 84 { 85 u8 key = atomic_inc_return(&dev->mkey_var); 86 void *mkc; 87 88 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 89 MLX5_SET(mkc, mkc, mkey_7_0, key); 90 *mkey = key; 91 } 92 93 static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev, 94 struct mlx5_ib_mkey *mkey, u32 *in, int inlen) 95 { 96 int ret; 97 98 assign_mkey_variant(dev, &mkey->key, in); 99 ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen); 100 if (!ret) 
101 init_waitqueue_head(&mkey->wait); 102 103 return ret; 104 } 105 106 static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create) 107 { 108 struct mlx5_ib_dev *dev = async_create->ent->dev; 109 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 110 size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out); 111 112 MLX5_SET(create_mkey_in, async_create->in, opcode, 113 MLX5_CMD_OP_CREATE_MKEY); 114 assign_mkey_variant(dev, &async_create->mkey, async_create->in); 115 return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen, 116 async_create->out, outlen, create_mkey_callback, 117 &async_create->cb_work); 118 } 119 120 static int mkey_cache_max_order(struct mlx5_ib_dev *dev); 121 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent); 122 123 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) 124 { 125 WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key))); 126 127 return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key); 128 } 129 130 static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out) 131 { 132 if (status == -ENXIO) /* core driver is not available */ 133 return; 134 135 mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status); 136 if (status != -EREMOTEIO) /* driver specific failure */ 137 return; 138 139 /* Failed in FW, print cmd out failure details */ 140 mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out); 141 } 142 143 static int push_mkey_locked(struct mlx5_cache_ent *ent, bool limit_pendings, 144 void *to_store) 145 { 146 XA_STATE(xas, &ent->mkeys, 0); 147 void *curr; 148 149 if (limit_pendings && 150 (ent->reserved - ent->stored) > MAX_PENDING_REG_MR) 151 return -EAGAIN; 152 153 while (1) { 154 /* 155 * This is cmpxchg (NULL, XA_ZERO_ENTRY) however this version 156 * doesn't transparently unlock. Instead we set the xas index to 157 * the current value of reserved every iteration. 158 */ 159 xas_set(&xas, ent->reserved); 160 curr = xas_load(&xas); 161 if (!curr) { 162 if (to_store && ent->stored == ent->reserved) 163 xas_store(&xas, to_store); 164 else 165 xas_store(&xas, XA_ZERO_ENTRY); 166 if (xas_valid(&xas)) { 167 ent->reserved++; 168 if (to_store) { 169 if (ent->stored != ent->reserved) 170 __xa_store(&ent->mkeys, 171 ent->stored, 172 to_store, 173 GFP_KERNEL); 174 ent->stored++; 175 queue_adjust_cache_locked(ent); 176 WRITE_ONCE(ent->dev->cache.last_add, 177 jiffies); 178 } 179 } 180 } 181 xa_unlock_irq(&ent->mkeys); 182 183 /* 184 * Notice xas_nomem() must always be called as it cleans 185 * up any cached allocation. 
186 */ 187 if (!xas_nomem(&xas, GFP_KERNEL)) 188 break; 189 xa_lock_irq(&ent->mkeys); 190 } 191 xa_lock_irq(&ent->mkeys); 192 if (xas_error(&xas)) 193 return xas_error(&xas); 194 if (WARN_ON(curr)) 195 return -EINVAL; 196 return 0; 197 } 198 199 static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings, 200 void *to_store) 201 { 202 int ret; 203 204 xa_lock_irq(&ent->mkeys); 205 ret = push_mkey_locked(ent, limit_pendings, to_store); 206 xa_unlock_irq(&ent->mkeys); 207 return ret; 208 } 209 210 static void undo_push_reserve_mkey(struct mlx5_cache_ent *ent) 211 { 212 void *old; 213 214 ent->reserved--; 215 old = __xa_erase(&ent->mkeys, ent->reserved); 216 WARN_ON(old); 217 } 218 219 static void push_to_reserved(struct mlx5_cache_ent *ent, u32 mkey) 220 { 221 void *old; 222 223 old = __xa_store(&ent->mkeys, ent->stored, xa_mk_value(mkey), 0); 224 WARN_ON(old); 225 ent->stored++; 226 } 227 228 static u32 pop_stored_mkey(struct mlx5_cache_ent *ent) 229 { 230 void *old, *xa_mkey; 231 232 ent->stored--; 233 ent->reserved--; 234 235 if (ent->stored == ent->reserved) { 236 xa_mkey = __xa_erase(&ent->mkeys, ent->stored); 237 WARN_ON(!xa_mkey); 238 return (u32)xa_to_value(xa_mkey); 239 } 240 241 xa_mkey = __xa_store(&ent->mkeys, ent->stored, XA_ZERO_ENTRY, 242 GFP_KERNEL); 243 WARN_ON(!xa_mkey || xa_is_err(xa_mkey)); 244 old = __xa_erase(&ent->mkeys, ent->reserved); 245 WARN_ON(old); 246 return (u32)xa_to_value(xa_mkey); 247 } 248 249 static void create_mkey_callback(int status, struct mlx5_async_work *context) 250 { 251 struct mlx5r_async_create_mkey *mkey_out = 252 container_of(context, struct mlx5r_async_create_mkey, cb_work); 253 struct mlx5_cache_ent *ent = mkey_out->ent; 254 struct mlx5_ib_dev *dev = ent->dev; 255 unsigned long flags; 256 257 if (status) { 258 create_mkey_warn(dev, status, mkey_out->out); 259 kfree(mkey_out); 260 xa_lock_irqsave(&ent->mkeys, flags); 261 undo_push_reserve_mkey(ent); 262 WRITE_ONCE(dev->fill_delay, 1); 263 xa_unlock_irqrestore(&ent->mkeys, flags); 264 mod_timer(&dev->delay_timer, jiffies + HZ); 265 return; 266 } 267 268 mkey_out->mkey |= mlx5_idx_to_mkey( 269 MLX5_GET(create_mkey_out, mkey_out->out, mkey_index)); 270 WRITE_ONCE(dev->cache.last_add, jiffies); 271 272 xa_lock_irqsave(&ent->mkeys, flags); 273 push_to_reserved(ent, mkey_out->mkey); 274 /* If we are doing fill_to_high_water then keep going. 
*/ 275 queue_adjust_cache_locked(ent); 276 xa_unlock_irqrestore(&ent->mkeys, flags); 277 kfree(mkey_out); 278 } 279 280 static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs) 281 { 282 int ret = 0; 283 284 switch (access_mode) { 285 case MLX5_MKC_ACCESS_MODE_MTT: 286 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD / 287 sizeof(struct mlx5_mtt)); 288 break; 289 case MLX5_MKC_ACCESS_MODE_KSM: 290 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD / 291 sizeof(struct mlx5_klm)); 292 break; 293 default: 294 WARN_ON(1); 295 } 296 return ret; 297 } 298 299 static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc) 300 { 301 set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd); 302 MLX5_SET(mkc, mkc, free, 1); 303 MLX5_SET(mkc, mkc, umr_en, 1); 304 MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3); 305 MLX5_SET(mkc, mkc, access_mode_4_2, 306 (ent->rb_key.access_mode >> 2) & 0x7); 307 308 MLX5_SET(mkc, mkc, translations_octword_size, 309 get_mkc_octo_size(ent->rb_key.access_mode, 310 ent->rb_key.ndescs)); 311 MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); 312 } 313 314 /* Asynchronously schedule new MRs to be populated in the cache. */ 315 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num) 316 { 317 struct mlx5r_async_create_mkey *async_create; 318 void *mkc; 319 int err = 0; 320 int i; 321 322 for (i = 0; i < num; i++) { 323 async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey), 324 GFP_KERNEL); 325 if (!async_create) 326 return -ENOMEM; 327 mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in, 328 memory_key_mkey_entry); 329 set_cache_mkc(ent, mkc); 330 async_create->ent = ent; 331 332 err = push_mkey(ent, true, NULL); 333 if (err) 334 goto free_async_create; 335 336 err = mlx5_ib_create_mkey_cb(async_create); 337 if (err) { 338 mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err); 339 goto err_undo_reserve; 340 } 341 } 342 343 return 0; 344 345 err_undo_reserve: 346 xa_lock_irq(&ent->mkeys); 347 undo_push_reserve_mkey(ent); 348 xa_unlock_irq(&ent->mkeys); 349 free_async_create: 350 kfree(async_create); 351 return err; 352 } 353 354 /* Synchronously create a MR in the cache */ 355 static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey) 356 { 357 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 358 void *mkc; 359 u32 *in; 360 int err; 361 362 in = kzalloc(inlen, GFP_KERNEL); 363 if (!in) 364 return -ENOMEM; 365 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 366 set_cache_mkc(ent, mkc); 367 368 err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen); 369 if (err) 370 goto free_in; 371 372 WRITE_ONCE(ent->dev->cache.last_add, jiffies); 373 free_in: 374 kfree(in); 375 return err; 376 } 377 378 static void remove_cache_mr_locked(struct mlx5_cache_ent *ent) 379 { 380 u32 mkey; 381 382 lockdep_assert_held(&ent->mkeys.xa_lock); 383 if (!ent->stored) 384 return; 385 mkey = pop_stored_mkey(ent); 386 xa_unlock_irq(&ent->mkeys); 387 mlx5_core_destroy_mkey(ent->dev->mdev, mkey); 388 xa_lock_irq(&ent->mkeys); 389 } 390 391 static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target, 392 bool limit_fill) 393 __acquires(&ent->mkeys) __releases(&ent->mkeys) 394 { 395 int err; 396 397 lockdep_assert_held(&ent->mkeys.xa_lock); 398 399 while (true) { 400 if (limit_fill) 401 target = ent->limit * 2; 402 if (target == ent->reserved) 403 return 0; 404 if (target > ent->reserved) { 405 u32 todo = target - ent->reserved; 406 407 xa_unlock_irq(&ent->mkeys); 408 err = add_keys(ent, todo); 409 if 
(err == -EAGAIN) 410 usleep_range(3000, 5000); 411 xa_lock_irq(&ent->mkeys); 412 if (err) { 413 if (err != -EAGAIN) 414 return err; 415 } else 416 return 0; 417 } else { 418 remove_cache_mr_locked(ent); 419 } 420 } 421 } 422 423 static ssize_t size_write(struct file *filp, const char __user *buf, 424 size_t count, loff_t *pos) 425 { 426 struct mlx5_cache_ent *ent = filp->private_data; 427 u32 target; 428 int err; 429 430 err = kstrtou32_from_user(buf, count, 0, &target); 431 if (err) 432 return err; 433 434 /* 435 * Target is the new value of total_mrs the user requests, however we 436 * cannot free MRs that are in use. Compute the target value for stored 437 * mkeys. 438 */ 439 xa_lock_irq(&ent->mkeys); 440 if (target < ent->in_use) { 441 err = -EINVAL; 442 goto err_unlock; 443 } 444 target = target - ent->in_use; 445 if (target < ent->limit || target > ent->limit*2) { 446 err = -EINVAL; 447 goto err_unlock; 448 } 449 err = resize_available_mrs(ent, target, false); 450 if (err) 451 goto err_unlock; 452 xa_unlock_irq(&ent->mkeys); 453 454 return count; 455 456 err_unlock: 457 xa_unlock_irq(&ent->mkeys); 458 return err; 459 } 460 461 static ssize_t size_read(struct file *filp, char __user *buf, size_t count, 462 loff_t *pos) 463 { 464 struct mlx5_cache_ent *ent = filp->private_data; 465 char lbuf[20]; 466 int err; 467 468 err = snprintf(lbuf, sizeof(lbuf), "%ld\n", ent->stored + ent->in_use); 469 if (err < 0) 470 return err; 471 472 return simple_read_from_buffer(buf, count, pos, lbuf, err); 473 } 474 475 static const struct file_operations size_fops = { 476 .owner = THIS_MODULE, 477 .open = simple_open, 478 .write = size_write, 479 .read = size_read, 480 }; 481 482 static ssize_t limit_write(struct file *filp, const char __user *buf, 483 size_t count, loff_t *pos) 484 { 485 struct mlx5_cache_ent *ent = filp->private_data; 486 u32 var; 487 int err; 488 489 err = kstrtou32_from_user(buf, count, 0, &var); 490 if (err) 491 return err; 492 493 /* 494 * Upon set we immediately fill the cache to high water mark implied by 495 * the limit. 496 */ 497 xa_lock_irq(&ent->mkeys); 498 ent->limit = var; 499 err = resize_available_mrs(ent, 0, true); 500 xa_unlock_irq(&ent->mkeys); 501 if (err) 502 return err; 503 return count; 504 } 505 506 static ssize_t limit_read(struct file *filp, char __user *buf, size_t count, 507 loff_t *pos) 508 { 509 struct mlx5_cache_ent *ent = filp->private_data; 510 char lbuf[20]; 511 int err; 512 513 err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit); 514 if (err < 0) 515 return err; 516 517 return simple_read_from_buffer(buf, count, pos, lbuf, err); 518 } 519 520 static const struct file_operations limit_fops = { 521 .owner = THIS_MODULE, 522 .open = simple_open, 523 .write = limit_write, 524 .read = limit_read, 525 }; 526 527 static bool someone_adding(struct mlx5_mkey_cache *cache) 528 { 529 struct mlx5_cache_ent *ent; 530 struct rb_node *node; 531 bool ret; 532 533 mutex_lock(&cache->rb_lock); 534 for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) { 535 ent = rb_entry(node, struct mlx5_cache_ent, node); 536 xa_lock_irq(&ent->mkeys); 537 ret = ent->stored < ent->limit; 538 xa_unlock_irq(&ent->mkeys); 539 if (ret) { 540 mutex_unlock(&cache->rb_lock); 541 return true; 542 } 543 } 544 mutex_unlock(&cache->rb_lock); 545 return false; 546 } 547 548 /* 549 * Check if the bucket is outside the high/low water mark and schedule an async 550 * update. The cache refill has hysteresis, once the low water mark is hit it is 551 * refilled up to the high mark. 
552 */ 553 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) 554 { 555 lockdep_assert_held(&ent->mkeys.xa_lock); 556 557 if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp) 558 return; 559 if (ent->stored < ent->limit) { 560 ent->fill_to_high_water = true; 561 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); 562 } else if (ent->fill_to_high_water && 563 ent->reserved < 2 * ent->limit) { 564 /* 565 * Once we start populating due to hitting a low water mark 566 * continue until we pass the high water mark. 567 */ 568 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); 569 } else if (ent->stored == 2 * ent->limit) { 570 ent->fill_to_high_water = false; 571 } else if (ent->stored > 2 * ent->limit) { 572 /* Queue deletion of excess entries */ 573 ent->fill_to_high_water = false; 574 if (ent->stored != ent->reserved) 575 queue_delayed_work(ent->dev->cache.wq, &ent->dwork, 576 msecs_to_jiffies(1000)); 577 else 578 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); 579 } 580 } 581 582 static void __cache_work_func(struct mlx5_cache_ent *ent) 583 { 584 struct mlx5_ib_dev *dev = ent->dev; 585 struct mlx5_mkey_cache *cache = &dev->cache; 586 int err; 587 588 xa_lock_irq(&ent->mkeys); 589 if (ent->disabled) 590 goto out; 591 592 if (ent->fill_to_high_water && ent->reserved < 2 * ent->limit && 593 !READ_ONCE(dev->fill_delay)) { 594 xa_unlock_irq(&ent->mkeys); 595 err = add_keys(ent, 1); 596 xa_lock_irq(&ent->mkeys); 597 if (ent->disabled) 598 goto out; 599 if (err) { 600 /* 601 * EAGAIN only happens if there are pending MRs, so we 602 * will be rescheduled when storing them. The only 603 * failure path here is ENOMEM. 604 */ 605 if (err != -EAGAIN) { 606 mlx5_ib_warn( 607 dev, 608 "add keys command failed, err %d\n", 609 err); 610 queue_delayed_work(cache->wq, &ent->dwork, 611 msecs_to_jiffies(1000)); 612 } 613 } 614 } else if (ent->stored > 2 * ent->limit) { 615 bool need_delay; 616 617 /* 618 * The remove_cache_mr() logic is performed as garbage 619 * collection task. Such task is intended to be run when no 620 * other active processes are running. 621 * 622 * The need_resched() will return TRUE if there are user tasks 623 * to be activated in near future. 624 * 625 * In such case, we don't execute remove_cache_mr() and postpone 626 * the garbage collection work to try to run in next cycle, in 627 * order to free CPU resources to other tasks. 
628 */ 629 xa_unlock_irq(&ent->mkeys); 630 need_delay = need_resched() || someone_adding(cache) || 631 !time_after(jiffies, 632 READ_ONCE(cache->last_add) + 300 * HZ); 633 xa_lock_irq(&ent->mkeys); 634 if (ent->disabled) 635 goto out; 636 if (need_delay) { 637 queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); 638 goto out; 639 } 640 remove_cache_mr_locked(ent); 641 queue_adjust_cache_locked(ent); 642 } 643 out: 644 xa_unlock_irq(&ent->mkeys); 645 } 646 647 static void delayed_cache_work_func(struct work_struct *work) 648 { 649 struct mlx5_cache_ent *ent; 650 651 ent = container_of(work, struct mlx5_cache_ent, dwork.work); 652 __cache_work_func(ent); 653 } 654 655 static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1, 656 struct mlx5r_cache_rb_key key2) 657 { 658 int res; 659 660 res = key1.ats - key2.ats; 661 if (res) 662 return res; 663 664 res = key1.access_mode - key2.access_mode; 665 if (res) 666 return res; 667 668 res = key1.access_flags - key2.access_flags; 669 if (res) 670 return res; 671 672 /* 673 * keep ndescs the last in the compare table since the find function 674 * searches for an exact match on all properties and only closest 675 * match in size. 676 */ 677 return key1.ndescs - key2.ndescs; 678 } 679 680 static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache, 681 struct mlx5_cache_ent *ent) 682 { 683 struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL; 684 struct mlx5_cache_ent *cur; 685 int cmp; 686 687 /* Figure out where to put new node */ 688 while (*new) { 689 cur = rb_entry(*new, struct mlx5_cache_ent, node); 690 parent = *new; 691 cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key); 692 if (cmp > 0) 693 new = &((*new)->rb_left); 694 if (cmp < 0) 695 new = &((*new)->rb_right); 696 if (cmp == 0) { 697 mutex_unlock(&cache->rb_lock); 698 return -EEXIST; 699 } 700 } 701 702 /* Add new node and rebalance tree. */ 703 rb_link_node(&ent->node, parent, new); 704 rb_insert_color(&ent->node, &cache->rb_root); 705 706 return 0; 707 } 708 709 static struct mlx5_cache_ent * 710 mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev, 711 struct mlx5r_cache_rb_key rb_key) 712 { 713 struct rb_node *node = dev->cache.rb_root.rb_node; 714 struct mlx5_cache_ent *cur, *smallest = NULL; 715 int cmp; 716 717 /* 718 * Find the smallest ent with order >= requested_order. 719 */ 720 while (node) { 721 cur = rb_entry(node, struct mlx5_cache_ent, node); 722 cmp = cache_ent_key_cmp(cur->rb_key, rb_key); 723 if (cmp > 0) { 724 smallest = cur; 725 node = node->rb_left; 726 } 727 if (cmp < 0) 728 node = node->rb_right; 729 if (cmp == 0) 730 return cur; 731 } 732 733 return (smallest && 734 smallest->rb_key.access_mode == rb_key.access_mode && 735 smallest->rb_key.access_flags == rb_key.access_flags && 736 smallest->rb_key.ats == rb_key.ats) ? 
737 smallest : 738 NULL; 739 } 740 741 static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, 742 struct mlx5_cache_ent *ent, 743 int access_flags) 744 { 745 struct mlx5_ib_mr *mr; 746 int err; 747 748 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 749 if (!mr) 750 return ERR_PTR(-ENOMEM); 751 752 xa_lock_irq(&ent->mkeys); 753 ent->in_use++; 754 755 if (!ent->stored) { 756 queue_adjust_cache_locked(ent); 757 ent->miss++; 758 xa_unlock_irq(&ent->mkeys); 759 err = create_cache_mkey(ent, &mr->mmkey.key); 760 if (err) { 761 xa_lock_irq(&ent->mkeys); 762 ent->in_use--; 763 xa_unlock_irq(&ent->mkeys); 764 kfree(mr); 765 return ERR_PTR(err); 766 } 767 } else { 768 mr->mmkey.key = pop_stored_mkey(ent); 769 queue_adjust_cache_locked(ent); 770 xa_unlock_irq(&ent->mkeys); 771 } 772 mr->mmkey.cache_ent = ent; 773 mr->mmkey.type = MLX5_MKEY_MR; 774 init_waitqueue_head(&mr->mmkey.wait); 775 return mr; 776 } 777 778 static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev, 779 int access_flags) 780 { 781 int ret = 0; 782 783 if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) && 784 MLX5_CAP_GEN(dev->mdev, atomic) && 785 MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled)) 786 ret |= IB_ACCESS_REMOTE_ATOMIC; 787 788 if ((access_flags & IB_ACCESS_RELAXED_ORDERING) && 789 MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) && 790 !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr)) 791 ret |= IB_ACCESS_RELAXED_ORDERING; 792 793 if ((access_flags & IB_ACCESS_RELAXED_ORDERING) && 794 MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) && 795 !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr)) 796 ret |= IB_ACCESS_RELAXED_ORDERING; 797 798 return ret; 799 } 800 801 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, 802 int access_flags, int access_mode, 803 int ndescs) 804 { 805 struct mlx5r_cache_rb_key rb_key = { 806 .ndescs = ndescs, 807 .access_mode = access_mode, 808 .access_flags = get_unchangeable_access_flags(dev, access_flags) 809 }; 810 struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key); 811 812 if (!ent) 813 return ERR_PTR(-EOPNOTSUPP); 814 815 return _mlx5_mr_cache_alloc(dev, ent, access_flags); 816 } 817 818 static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent) 819 { 820 u32 mkey; 821 822 cancel_delayed_work(&ent->dwork); 823 xa_lock_irq(&ent->mkeys); 824 while (ent->stored) { 825 mkey = pop_stored_mkey(ent); 826 xa_unlock_irq(&ent->mkeys); 827 mlx5_core_destroy_mkey(dev->mdev, mkey); 828 xa_lock_irq(&ent->mkeys); 829 } 830 xa_unlock_irq(&ent->mkeys); 831 } 832 833 static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) 834 { 835 if (!mlx5_debugfs_root || dev->is_rep) 836 return; 837 838 debugfs_remove_recursive(dev->cache.fs_root); 839 dev->cache.fs_root = NULL; 840 } 841 842 static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev, 843 struct mlx5_cache_ent *ent) 844 { 845 int order = order_base_2(ent->rb_key.ndescs); 846 struct dentry *dir; 847 848 if (!mlx5_debugfs_root || dev->is_rep) 849 return; 850 851 if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM) 852 order = MLX5_IMR_KSM_CACHE_ENTRY + 2; 853 854 sprintf(ent->name, "%d", order); 855 dir = debugfs_create_dir(ent->name, dev->cache.fs_root); 856 debugfs_create_file("size", 0600, dir, ent, &size_fops); 857 debugfs_create_file("limit", 0600, dir, ent, &limit_fops); 858 debugfs_create_ulong("cur", 0400, dir, &ent->stored); 859 debugfs_create_u32("miss", 0600, dir, &ent->miss); 860 } 861 862 static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev) 
863 { 864 struct dentry *dbg_root = mlx5_debugfs_get_dev_root(dev->mdev); 865 struct mlx5_mkey_cache *cache = &dev->cache; 866 867 if (!mlx5_debugfs_root || dev->is_rep) 868 return; 869 870 cache->fs_root = debugfs_create_dir("mr_cache", dbg_root); 871 } 872 873 static void delay_time_func(struct timer_list *t) 874 { 875 struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer); 876 877 WRITE_ONCE(dev->fill_delay, 0); 878 } 879 880 struct mlx5_cache_ent * 881 mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev, 882 struct mlx5r_cache_rb_key rb_key, 883 bool persistent_entry) 884 { 885 struct mlx5_cache_ent *ent; 886 int order; 887 int ret; 888 889 ent = kzalloc(sizeof(*ent), GFP_KERNEL); 890 if (!ent) 891 return ERR_PTR(-ENOMEM); 892 893 xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ); 894 ent->rb_key = rb_key; 895 ent->dev = dev; 896 ent->is_tmp = !persistent_entry; 897 898 INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); 899 900 ret = mlx5_cache_ent_insert(&dev->cache, ent); 901 if (ret) { 902 kfree(ent); 903 return ERR_PTR(ret); 904 } 905 906 if (persistent_entry) { 907 if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM) 908 order = MLX5_IMR_KSM_CACHE_ENTRY; 909 else 910 order = order_base_2(rb_key.ndescs) - 2; 911 912 if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) && 913 !dev->is_rep && mlx5_core_is_pf(dev->mdev) && 914 mlx5r_umr_can_load_pas(dev, 0)) 915 ent->limit = dev->mdev->profile.mr_cache[order].limit; 916 else 917 ent->limit = 0; 918 919 mlx5_mkey_cache_debugfs_add_ent(dev, ent); 920 } else { 921 mod_delayed_work(ent->dev->cache.wq, 922 &ent->dev->cache.remove_ent_dwork, 923 msecs_to_jiffies(30 * 1000)); 924 } 925 926 return ent; 927 } 928 929 static void remove_ent_work_func(struct work_struct *work) 930 { 931 struct mlx5_mkey_cache *cache; 932 struct mlx5_cache_ent *ent; 933 struct rb_node *cur; 934 935 cache = container_of(work, struct mlx5_mkey_cache, 936 remove_ent_dwork.work); 937 mutex_lock(&cache->rb_lock); 938 cur = rb_last(&cache->rb_root); 939 while (cur) { 940 ent = rb_entry(cur, struct mlx5_cache_ent, node); 941 cur = rb_prev(cur); 942 mutex_unlock(&cache->rb_lock); 943 944 xa_lock_irq(&ent->mkeys); 945 if (!ent->is_tmp) { 946 xa_unlock_irq(&ent->mkeys); 947 mutex_lock(&cache->rb_lock); 948 continue; 949 } 950 xa_unlock_irq(&ent->mkeys); 951 952 clean_keys(ent->dev, ent); 953 mutex_lock(&cache->rb_lock); 954 } 955 mutex_unlock(&cache->rb_lock); 956 } 957 958 int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev) 959 { 960 struct mlx5_mkey_cache *cache = &dev->cache; 961 struct rb_root *root = &dev->cache.rb_root; 962 struct mlx5r_cache_rb_key rb_key = { 963 .access_mode = MLX5_MKC_ACCESS_MODE_MTT, 964 }; 965 struct mlx5_cache_ent *ent; 966 struct rb_node *node; 967 int ret; 968 int i; 969 970 mutex_init(&dev->slow_path_mutex); 971 mutex_init(&dev->cache.rb_lock); 972 dev->cache.rb_root = RB_ROOT; 973 INIT_DELAYED_WORK(&dev->cache.remove_ent_dwork, remove_ent_work_func); 974 cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM); 975 if (!cache->wq) { 976 mlx5_ib_warn(dev, "failed to create work queue\n"); 977 return -ENOMEM; 978 } 979 980 mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx); 981 timer_setup(&dev->delay_timer, delay_time_func, 0); 982 mlx5_mkey_cache_debugfs_init(dev); 983 mutex_lock(&cache->rb_lock); 984 for (i = 0; i <= mkey_cache_max_order(dev); i++) { 985 rb_key.ndescs = 1 << (i + 2); 986 ent = mlx5r_cache_create_ent_locked(dev, rb_key, true); 987 if (IS_ERR(ent)) { 988 ret = PTR_ERR(ent); 989 goto err; 990 } 991 } 992 993 
ret = mlx5_odp_init_mkey_cache(dev); 994 if (ret) 995 goto err; 996 997 mutex_unlock(&cache->rb_lock); 998 for (node = rb_first(root); node; node = rb_next(node)) { 999 ent = rb_entry(node, struct mlx5_cache_ent, node); 1000 xa_lock_irq(&ent->mkeys); 1001 queue_adjust_cache_locked(ent); 1002 xa_unlock_irq(&ent->mkeys); 1003 } 1004 1005 return 0; 1006 1007 err: 1008 mutex_unlock(&cache->rb_lock); 1009 mlx5_mkey_cache_debugfs_cleanup(dev); 1010 mlx5_ib_warn(dev, "failed to create mkey cache entry\n"); 1011 return ret; 1012 } 1013 1014 void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev) 1015 { 1016 struct rb_root *root = &dev->cache.rb_root; 1017 struct mlx5_cache_ent *ent; 1018 struct rb_node *node; 1019 1020 if (!dev->cache.wq) 1021 return; 1022 1023 cancel_delayed_work_sync(&dev->cache.remove_ent_dwork); 1024 mutex_lock(&dev->cache.rb_lock); 1025 for (node = rb_first(root); node; node = rb_next(node)) { 1026 ent = rb_entry(node, struct mlx5_cache_ent, node); 1027 xa_lock_irq(&ent->mkeys); 1028 ent->disabled = true; 1029 xa_unlock_irq(&ent->mkeys); 1030 cancel_delayed_work_sync(&ent->dwork); 1031 } 1032 1033 mlx5_mkey_cache_debugfs_cleanup(dev); 1034 mlx5_cmd_cleanup_async_ctx(&dev->async_ctx); 1035 1036 node = rb_first(root); 1037 while (node) { 1038 ent = rb_entry(node, struct mlx5_cache_ent, node); 1039 node = rb_next(node); 1040 clean_keys(dev, ent); 1041 rb_erase(&ent->node, root); 1042 kfree(ent); 1043 } 1044 mutex_unlock(&dev->cache.rb_lock); 1045 1046 destroy_workqueue(dev->cache.wq); 1047 del_timer_sync(&dev->delay_timer); 1048 } 1049 1050 struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) 1051 { 1052 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1053 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 1054 struct mlx5_ib_mr *mr; 1055 void *mkc; 1056 u32 *in; 1057 int err; 1058 1059 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1060 if (!mr) 1061 return ERR_PTR(-ENOMEM); 1062 1063 in = kzalloc(inlen, GFP_KERNEL); 1064 if (!in) { 1065 err = -ENOMEM; 1066 goto err_free; 1067 } 1068 1069 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1070 1071 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); 1072 MLX5_SET(mkc, mkc, length64, 1); 1073 set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0, 1074 pd); 1075 1076 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 1077 if (err) 1078 goto err_in; 1079 1080 kfree(in); 1081 mr->mmkey.type = MLX5_MKEY_MR; 1082 mr->ibmr.lkey = mr->mmkey.key; 1083 mr->ibmr.rkey = mr->mmkey.key; 1084 mr->umem = NULL; 1085 1086 return &mr->ibmr; 1087 1088 err_in: 1089 kfree(in); 1090 1091 err_free: 1092 kfree(mr); 1093 1094 return ERR_PTR(err); 1095 } 1096 1097 static int get_octo_len(u64 addr, u64 len, int page_shift) 1098 { 1099 u64 page_size = 1ULL << page_shift; 1100 u64 offset; 1101 int npages; 1102 1103 offset = addr & (page_size - 1); 1104 npages = ALIGN(len + offset, page_size) >> page_shift; 1105 return (npages + 1) / 2; 1106 } 1107 1108 static int mkey_cache_max_order(struct mlx5_ib_dev *dev) 1109 { 1110 if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset)) 1111 return MKEY_CACHE_LAST_STD_ENTRY; 1112 return MLX5_MAX_UMR_SHIFT; 1113 } 1114 1115 static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, 1116 u64 length, int access_flags, u64 iova) 1117 { 1118 mr->ibmr.lkey = mr->mmkey.key; 1119 mr->ibmr.rkey = mr->mmkey.key; 1120 mr->ibmr.length = length; 1121 mr->ibmr.device = &dev->ib_dev; 1122 mr->ibmr.iova = iova; 1123 mr->access_flags = access_flags; 1124 } 1125 1126 static unsigned 
int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem, 1127 u64 iova) 1128 { 1129 /* 1130 * The alignment of iova has already been checked upon entering 1131 * UVERBS_METHOD_REG_DMABUF_MR 1132 */ 1133 umem->iova = iova; 1134 return PAGE_SIZE; 1135 } 1136 1137 static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, 1138 struct ib_umem *umem, u64 iova, 1139 int access_flags) 1140 { 1141 struct mlx5r_cache_rb_key rb_key = { 1142 .access_mode = MLX5_MKC_ACCESS_MODE_MTT, 1143 }; 1144 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1145 struct mlx5_cache_ent *ent; 1146 struct mlx5_ib_mr *mr; 1147 unsigned int page_size; 1148 1149 if (umem->is_dmabuf) 1150 page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova); 1151 else 1152 page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size, 1153 0, iova); 1154 if (WARN_ON(!page_size)) 1155 return ERR_PTR(-EINVAL); 1156 1157 rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size); 1158 rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags); 1159 rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags); 1160 ent = mkey_cache_ent_from_rb_key(dev, rb_key); 1161 /* 1162 * If the MR can't come from the cache then synchronously create an uncached 1163 * one. 1164 */ 1165 if (!ent) { 1166 mutex_lock(&dev->slow_path_mutex); 1167 mr = reg_create(pd, umem, iova, access_flags, page_size, false); 1168 mutex_unlock(&dev->slow_path_mutex); 1169 mr->mmkey.rb_key = rb_key; 1170 return mr; 1171 } 1172 1173 mr = _mlx5_mr_cache_alloc(dev, ent, access_flags); 1174 if (IS_ERR(mr)) 1175 return mr; 1176 1177 mr->ibmr.pd = pd; 1178 mr->umem = umem; 1179 mr->page_shift = order_base_2(page_size); 1180 set_mr_fields(dev, mr, umem->length, access_flags, iova); 1181 1182 return mr; 1183 } 1184 1185 /* 1186 * If ibmr is NULL it will be allocated by reg_create. 1187 * Else, the given ibmr will be used. 1188 */ 1189 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, 1190 u64 iova, int access_flags, 1191 unsigned int page_size, bool populate) 1192 { 1193 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1194 struct mlx5_ib_mr *mr; 1195 __be64 *pas; 1196 void *mkc; 1197 int inlen; 1198 u32 *in; 1199 int err; 1200 bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); 1201 1202 if (!page_size) 1203 return ERR_PTR(-EINVAL); 1204 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1205 if (!mr) 1206 return ERR_PTR(-ENOMEM); 1207 1208 mr->ibmr.pd = pd; 1209 mr->access_flags = access_flags; 1210 mr->page_shift = order_base_2(page_size); 1211 1212 inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 1213 if (populate) 1214 inlen += sizeof(*pas) * 1215 roundup(ib_umem_num_dma_blocks(umem, page_size), 2); 1216 in = kvzalloc(inlen, GFP_KERNEL); 1217 if (!in) { 1218 err = -ENOMEM; 1219 goto err_1; 1220 } 1221 pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); 1222 if (populate) { 1223 if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) { 1224 err = -EINVAL; 1225 goto err_2; 1226 } 1227 mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas, 1228 pg_cap ? MLX5_IB_MTT_PRESENT : 0); 1229 } 1230 1231 /* The pg_access bit allows setting the access flags 1232 * in the page list submitted with the command. */ 1233 MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap)); 1234 1235 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1236 set_mkc_access_pd_addr_fields(mkc, access_flags, iova, 1237 populate ? 
pd : dev->umrc.pd); 1238 MLX5_SET(mkc, mkc, free, !populate); 1239 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); 1240 MLX5_SET(mkc, mkc, umr_en, 1); 1241 1242 MLX5_SET64(mkc, mkc, len, umem->length); 1243 MLX5_SET(mkc, mkc, bsf_octword_size, 0); 1244 MLX5_SET(mkc, mkc, translations_octword_size, 1245 get_octo_len(iova, umem->length, mr->page_shift)); 1246 MLX5_SET(mkc, mkc, log_page_size, mr->page_shift); 1247 if (mlx5_umem_needs_ats(dev, umem, access_flags)) 1248 MLX5_SET(mkc, mkc, ma_translation_mode, 1); 1249 if (populate) { 1250 MLX5_SET(create_mkey_in, in, translations_octword_actual_size, 1251 get_octo_len(iova, umem->length, mr->page_shift)); 1252 } 1253 1254 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 1255 if (err) { 1256 mlx5_ib_warn(dev, "create mkey failed\n"); 1257 goto err_2; 1258 } 1259 mr->mmkey.type = MLX5_MKEY_MR; 1260 mr->mmkey.ndescs = get_octo_len(iova, umem->length, mr->page_shift); 1261 mr->umem = umem; 1262 set_mr_fields(dev, mr, umem->length, access_flags, iova); 1263 kvfree(in); 1264 1265 mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key); 1266 1267 return mr; 1268 1269 err_2: 1270 kvfree(in); 1271 err_1: 1272 kfree(mr); 1273 return ERR_PTR(err); 1274 } 1275 1276 static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr, 1277 u64 length, int acc, int mode) 1278 { 1279 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1280 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 1281 struct mlx5_ib_mr *mr; 1282 void *mkc; 1283 u32 *in; 1284 int err; 1285 1286 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1287 if (!mr) 1288 return ERR_PTR(-ENOMEM); 1289 1290 in = kzalloc(inlen, GFP_KERNEL); 1291 if (!in) { 1292 err = -ENOMEM; 1293 goto err_free; 1294 } 1295 1296 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1297 1298 MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3); 1299 MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7); 1300 MLX5_SET64(mkc, mkc, len, length); 1301 set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd); 1302 1303 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 1304 if (err) 1305 goto err_in; 1306 1307 kfree(in); 1308 1309 set_mr_fields(dev, mr, length, acc, start_addr); 1310 1311 return &mr->ibmr; 1312 1313 err_in: 1314 kfree(in); 1315 1316 err_free: 1317 kfree(mr); 1318 1319 return ERR_PTR(err); 1320 } 1321 1322 int mlx5_ib_advise_mr(struct ib_pd *pd, 1323 enum ib_uverbs_advise_mr_advice advice, 1324 u32 flags, 1325 struct ib_sge *sg_list, 1326 u32 num_sge, 1327 struct uverbs_attr_bundle *attrs) 1328 { 1329 if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH && 1330 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE && 1331 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT) 1332 return -EOPNOTSUPP; 1333 1334 return mlx5_ib_advise_mr_prefetch(pd, advice, flags, 1335 sg_list, num_sge); 1336 } 1337 1338 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, 1339 struct ib_dm_mr_attr *attr, 1340 struct uverbs_attr_bundle *attrs) 1341 { 1342 struct mlx5_ib_dm *mdm = to_mdm(dm); 1343 struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev; 1344 u64 start_addr = mdm->dev_addr + attr->offset; 1345 int mode; 1346 1347 switch (mdm->type) { 1348 case MLX5_IB_UAPI_DM_TYPE_MEMIC: 1349 if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS) 1350 return ERR_PTR(-EINVAL); 1351 1352 mode = MLX5_MKC_ACCESS_MODE_MEMIC; 1353 start_addr -= pci_resource_start(dev->pdev, 0); 1354 break; 1355 case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM: 1356 case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM: 1357 case 
MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM: 1358 if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS) 1359 return ERR_PTR(-EINVAL); 1360 1361 mode = MLX5_MKC_ACCESS_MODE_SW_ICM; 1362 break; 1363 default: 1364 return ERR_PTR(-EINVAL); 1365 } 1366 1367 return mlx5_ib_get_dm_mr(pd, start_addr, attr->length, 1368 attr->access_flags, mode); 1369 } 1370 1371 static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, 1372 u64 iova, int access_flags) 1373 { 1374 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1375 struct mlx5_ib_mr *mr = NULL; 1376 bool xlt_with_umr; 1377 int err; 1378 1379 xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length); 1380 if (xlt_with_umr) { 1381 mr = alloc_cacheable_mr(pd, umem, iova, access_flags); 1382 } else { 1383 unsigned int page_size = mlx5_umem_find_best_pgsz( 1384 umem, mkc, log_page_size, 0, iova); 1385 1386 mutex_lock(&dev->slow_path_mutex); 1387 mr = reg_create(pd, umem, iova, access_flags, page_size, true); 1388 mutex_unlock(&dev->slow_path_mutex); 1389 } 1390 if (IS_ERR(mr)) { 1391 ib_umem_release(umem); 1392 return ERR_CAST(mr); 1393 } 1394 1395 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); 1396 1397 atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages); 1398 1399 if (xlt_with_umr) { 1400 /* 1401 * If the MR was created with reg_create then it will be 1402 * configured properly but left disabled. It is safe to go ahead 1403 * and configure it again via UMR while enabling it. 1404 */ 1405 err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE); 1406 if (err) { 1407 mlx5_ib_dereg_mr(&mr->ibmr, NULL); 1408 return ERR_PTR(err); 1409 } 1410 } 1411 return &mr->ibmr; 1412 } 1413 1414 static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, 1415 u64 iova, int access_flags, 1416 struct ib_udata *udata) 1417 { 1418 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1419 struct ib_umem_odp *odp; 1420 struct mlx5_ib_mr *mr; 1421 int err; 1422 1423 if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) 1424 return ERR_PTR(-EOPNOTSUPP); 1425 1426 err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq); 1427 if (err) 1428 return ERR_PTR(err); 1429 if (!start && length == U64_MAX) { 1430 if (iova != 0) 1431 return ERR_PTR(-EINVAL); 1432 if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) 1433 return ERR_PTR(-EINVAL); 1434 1435 mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags); 1436 if (IS_ERR(mr)) 1437 return ERR_CAST(mr); 1438 return &mr->ibmr; 1439 } 1440 1441 /* ODP requires xlt update via umr to work. 
*/ 1442 if (!mlx5r_umr_can_load_pas(dev, length)) 1443 return ERR_PTR(-EINVAL); 1444 1445 odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags, 1446 &mlx5_mn_ops); 1447 if (IS_ERR(odp)) 1448 return ERR_CAST(odp); 1449 1450 mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags); 1451 if (IS_ERR(mr)) { 1452 ib_umem_release(&odp->umem); 1453 return ERR_CAST(mr); 1454 } 1455 xa_init(&mr->implicit_children); 1456 1457 odp->private = mr; 1458 err = mlx5r_store_odp_mkey(dev, &mr->mmkey); 1459 if (err) 1460 goto err_dereg_mr; 1461 1462 err = mlx5_ib_init_odp_mr(mr); 1463 if (err) 1464 goto err_dereg_mr; 1465 return &mr->ibmr; 1466 1467 err_dereg_mr: 1468 mlx5_ib_dereg_mr(&mr->ibmr, NULL); 1469 return ERR_PTR(err); 1470 } 1471 1472 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, 1473 u64 iova, int access_flags, 1474 struct ib_udata *udata) 1475 { 1476 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1477 struct ib_umem *umem; 1478 1479 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) 1480 return ERR_PTR(-EOPNOTSUPP); 1481 1482 mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", 1483 start, iova, length, access_flags); 1484 1485 if (access_flags & IB_ACCESS_ON_DEMAND) 1486 return create_user_odp_mr(pd, start, length, iova, access_flags, 1487 udata); 1488 umem = ib_umem_get(&dev->ib_dev, start, length, access_flags); 1489 if (IS_ERR(umem)) 1490 return ERR_CAST(umem); 1491 return create_real_mr(pd, umem, iova, access_flags); 1492 } 1493 1494 static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach) 1495 { 1496 struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv; 1497 struct mlx5_ib_mr *mr = umem_dmabuf->private; 1498 1499 dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); 1500 1501 if (!umem_dmabuf->sgt) 1502 return; 1503 1504 mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP); 1505 ib_umem_dmabuf_unmap_pages(umem_dmabuf); 1506 } 1507 1508 static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { 1509 .allow_peer2peer = 1, 1510 .move_notify = mlx5_ib_dmabuf_invalidate_cb, 1511 }; 1512 1513 struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, 1514 u64 length, u64 virt_addr, 1515 int fd, int access_flags, 1516 struct ib_udata *udata) 1517 { 1518 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1519 struct mlx5_ib_mr *mr = NULL; 1520 struct ib_umem_dmabuf *umem_dmabuf; 1521 int err; 1522 1523 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || 1524 !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) 1525 return ERR_PTR(-EOPNOTSUPP); 1526 1527 mlx5_ib_dbg(dev, 1528 "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n", 1529 offset, virt_addr, length, fd, access_flags); 1530 1531 /* dmabuf requires xlt update via umr to work. 
*/ 1532 if (!mlx5r_umr_can_load_pas(dev, length)) 1533 return ERR_PTR(-EINVAL); 1534 1535 umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, 1536 access_flags, 1537 &mlx5_ib_dmabuf_attach_ops); 1538 if (IS_ERR(umem_dmabuf)) { 1539 mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n", 1540 PTR_ERR(umem_dmabuf)); 1541 return ERR_CAST(umem_dmabuf); 1542 } 1543 1544 mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr, 1545 access_flags); 1546 if (IS_ERR(mr)) { 1547 ib_umem_release(&umem_dmabuf->umem); 1548 return ERR_CAST(mr); 1549 } 1550 1551 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); 1552 1553 atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages); 1554 umem_dmabuf->private = mr; 1555 err = mlx5r_store_odp_mkey(dev, &mr->mmkey); 1556 if (err) 1557 goto err_dereg_mr; 1558 1559 err = mlx5_ib_init_dmabuf_mr(mr); 1560 if (err) 1561 goto err_dereg_mr; 1562 return &mr->ibmr; 1563 1564 err_dereg_mr: 1565 mlx5_ib_dereg_mr(&mr->ibmr, NULL); 1566 return ERR_PTR(err); 1567 } 1568 1569 /* 1570 * True if the change in access flags can be done via UMR, only some access 1571 * flags can be updated. 1572 */ 1573 static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev, 1574 unsigned int current_access_flags, 1575 unsigned int target_access_flags) 1576 { 1577 unsigned int diffs = current_access_flags ^ target_access_flags; 1578 1579 if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | 1580 IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING)) 1581 return false; 1582 return mlx5r_umr_can_reconfig(dev, current_access_flags, 1583 target_access_flags); 1584 } 1585 1586 static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, 1587 struct ib_umem *new_umem, 1588 int new_access_flags, u64 iova, 1589 unsigned long *page_size) 1590 { 1591 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1592 1593 /* We only track the allocated sizes of MRs from the cache */ 1594 if (!mr->mmkey.cache_ent) 1595 return false; 1596 if (!mlx5r_umr_can_load_pas(dev, new_umem->length)) 1597 return false; 1598 1599 *page_size = 1600 mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova); 1601 if (WARN_ON(!*page_size)) 1602 return false; 1603 return (mr->mmkey.cache_ent->rb_key.ndescs) >= 1604 ib_umem_num_dma_blocks(new_umem, *page_size); 1605 } 1606 1607 static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd, 1608 int access_flags, int flags, struct ib_umem *new_umem, 1609 u64 iova, unsigned long page_size) 1610 { 1611 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1612 int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE; 1613 struct ib_umem *old_umem = mr->umem; 1614 int err; 1615 1616 /* 1617 * To keep everything simple the MR is revoked before we start to mess 1618 * with it. This ensure the change is atomic relative to any use of the 1619 * MR. 1620 */ 1621 err = mlx5r_umr_revoke_mr(mr); 1622 if (err) 1623 return err; 1624 1625 if (flags & IB_MR_REREG_PD) { 1626 mr->ibmr.pd = pd; 1627 upd_flags |= MLX5_IB_UPD_XLT_PD; 1628 } 1629 if (flags & IB_MR_REREG_ACCESS) { 1630 mr->access_flags = access_flags; 1631 upd_flags |= MLX5_IB_UPD_XLT_ACCESS; 1632 } 1633 1634 mr->ibmr.iova = iova; 1635 mr->ibmr.length = new_umem->length; 1636 mr->page_shift = order_base_2(page_size); 1637 mr->umem = new_umem; 1638 err = mlx5r_umr_update_mr_pas(mr, upd_flags); 1639 if (err) { 1640 /* 1641 * The MR is revoked at this point so there is no issue to free 1642 * new_umem. 
1643 */ 1644 mr->umem = old_umem; 1645 return err; 1646 } 1647 1648 atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages); 1649 ib_umem_release(old_umem); 1650 atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages); 1651 return 0; 1652 } 1653 1654 struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, 1655 u64 length, u64 iova, int new_access_flags, 1656 struct ib_pd *new_pd, 1657 struct ib_udata *udata) 1658 { 1659 struct mlx5_ib_dev *dev = to_mdev(ib_mr->device); 1660 struct mlx5_ib_mr *mr = to_mmr(ib_mr); 1661 int err; 1662 1663 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) 1664 return ERR_PTR(-EOPNOTSUPP); 1665 1666 mlx5_ib_dbg( 1667 dev, 1668 "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", 1669 start, iova, length, new_access_flags); 1670 1671 if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) 1672 return ERR_PTR(-EOPNOTSUPP); 1673 1674 if (!(flags & IB_MR_REREG_ACCESS)) 1675 new_access_flags = mr->access_flags; 1676 if (!(flags & IB_MR_REREG_PD)) 1677 new_pd = ib_mr->pd; 1678 1679 if (!(flags & IB_MR_REREG_TRANS)) { 1680 struct ib_umem *umem; 1681 1682 /* Fast path for PD/access change */ 1683 if (can_use_umr_rereg_access(dev, mr->access_flags, 1684 new_access_flags)) { 1685 err = mlx5r_umr_rereg_pd_access(mr, new_pd, 1686 new_access_flags); 1687 if (err) 1688 return ERR_PTR(err); 1689 return NULL; 1690 } 1691 /* DM or ODP MR's don't have a normal umem so we can't re-use it */ 1692 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr)) 1693 goto recreate; 1694 1695 /* 1696 * Only one active MR can refer to a umem at one time, revoke 1697 * the old MR before assigning the umem to the new one. 1698 */ 1699 err = mlx5r_umr_revoke_mr(mr); 1700 if (err) 1701 return ERR_PTR(err); 1702 umem = mr->umem; 1703 mr->umem = NULL; 1704 atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages); 1705 1706 return create_real_mr(new_pd, umem, mr->ibmr.iova, 1707 new_access_flags); 1708 } 1709 1710 /* 1711 * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does 1712 * but the logic around releasing the umem is different 1713 */ 1714 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr)) 1715 goto recreate; 1716 1717 if (!(new_access_flags & IB_ACCESS_ON_DEMAND) && 1718 can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) { 1719 struct ib_umem *new_umem; 1720 unsigned long page_size; 1721 1722 new_umem = ib_umem_get(&dev->ib_dev, start, length, 1723 new_access_flags); 1724 if (IS_ERR(new_umem)) 1725 return ERR_CAST(new_umem); 1726 1727 /* Fast path for PAS change */ 1728 if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova, 1729 &page_size)) { 1730 err = umr_rereg_pas(mr, new_pd, new_access_flags, flags, 1731 new_umem, iova, page_size); 1732 if (err) { 1733 ib_umem_release(new_umem); 1734 return ERR_PTR(err); 1735 } 1736 return NULL; 1737 } 1738 return create_real_mr(new_pd, new_umem, iova, new_access_flags); 1739 } 1740 1741 /* 1742 * Everything else has no state we can preserve, just create a new MR 1743 * from scratch 1744 */ 1745 recreate: 1746 return mlx5_ib_reg_user_mr(new_pd, start, length, iova, 1747 new_access_flags, udata); 1748 } 1749 1750 static int 1751 mlx5_alloc_priv_descs(struct ib_device *device, 1752 struct mlx5_ib_mr *mr, 1753 int ndescs, 1754 int desc_size) 1755 { 1756 struct mlx5_ib_dev *dev = to_mdev(device); 1757 struct device *ddev = &dev->mdev->pdev->dev; 1758 int size = ndescs * desc_size; 1759 int add_size; 1760 int ret; 1761 1762 add_size = max_t(int, 
MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0); 1763 1764 mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL); 1765 if (!mr->descs_alloc) 1766 return -ENOMEM; 1767 1768 mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN); 1769 1770 mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE); 1771 if (dma_mapping_error(ddev, mr->desc_map)) { 1772 ret = -ENOMEM; 1773 goto err; 1774 } 1775 1776 return 0; 1777 err: 1778 kfree(mr->descs_alloc); 1779 1780 return ret; 1781 } 1782 1783 static void 1784 mlx5_free_priv_descs(struct mlx5_ib_mr *mr) 1785 { 1786 if (!mr->umem && mr->descs) { 1787 struct ib_device *device = mr->ibmr.device; 1788 int size = mr->max_descs * mr->desc_size; 1789 struct mlx5_ib_dev *dev = to_mdev(device); 1790 1791 dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size, 1792 DMA_TO_DEVICE); 1793 kfree(mr->descs_alloc); 1794 mr->descs = NULL; 1795 } 1796 } 1797 1798 static int cache_ent_find_and_store(struct mlx5_ib_dev *dev, 1799 struct mlx5_ib_mr *mr) 1800 { 1801 struct mlx5_mkey_cache *cache = &dev->cache; 1802 struct mlx5_cache_ent *ent; 1803 int ret; 1804 1805 if (mr->mmkey.cache_ent) { 1806 xa_lock_irq(&mr->mmkey.cache_ent->mkeys); 1807 mr->mmkey.cache_ent->in_use--; 1808 goto end; 1809 } 1810 1811 mutex_lock(&cache->rb_lock); 1812 ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key); 1813 if (ent) { 1814 if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) { 1815 if (ent->disabled) { 1816 mutex_unlock(&cache->rb_lock); 1817 return -EOPNOTSUPP; 1818 } 1819 mr->mmkey.cache_ent = ent; 1820 xa_lock_irq(&mr->mmkey.cache_ent->mkeys); 1821 mutex_unlock(&cache->rb_lock); 1822 goto end; 1823 } 1824 } 1825 1826 ent = mlx5r_cache_create_ent_locked(dev, mr->mmkey.rb_key, false); 1827 mutex_unlock(&cache->rb_lock); 1828 if (IS_ERR(ent)) 1829 return PTR_ERR(ent); 1830 1831 mr->mmkey.cache_ent = ent; 1832 xa_lock_irq(&mr->mmkey.cache_ent->mkeys); 1833 1834 end: 1835 ret = push_mkey_locked(mr->mmkey.cache_ent, false, 1836 xa_mk_value(mr->mmkey.key)); 1837 xa_unlock_irq(&mr->mmkey.cache_ent->mkeys); 1838 return ret; 1839 } 1840 1841 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) 1842 { 1843 struct mlx5_ib_mr *mr = to_mmr(ibmr); 1844 struct mlx5_ib_dev *dev = to_mdev(ibmr->device); 1845 int rc; 1846 1847 /* 1848 * Any async use of the mr must hold the refcount, once the refcount 1849 * goes to zero no other thread, such as ODP page faults, prefetch, any 1850 * UMR activity, etc can touch the mkey. Thus it is safe to destroy it. 
1851 */ 1852 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && 1853 refcount_read(&mr->mmkey.usecount) != 0 && 1854 xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key))) 1855 mlx5r_deref_wait_odp_mkey(&mr->mmkey); 1856 1857 if (ibmr->type == IB_MR_TYPE_INTEGRITY) { 1858 xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key), 1859 mr->sig, NULL, GFP_KERNEL); 1860 1861 if (mr->mtt_mr) { 1862 rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL); 1863 if (rc) 1864 return rc; 1865 mr->mtt_mr = NULL; 1866 } 1867 if (mr->klm_mr) { 1868 rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL); 1869 if (rc) 1870 return rc; 1871 mr->klm_mr = NULL; 1872 } 1873 1874 if (mlx5_core_destroy_psv(dev->mdev, 1875 mr->sig->psv_memory.psv_idx)) 1876 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", 1877 mr->sig->psv_memory.psv_idx); 1878 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx)) 1879 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", 1880 mr->sig->psv_wire.psv_idx); 1881 kfree(mr->sig); 1882 mr->sig = NULL; 1883 } 1884 1885 /* Stop DMA */ 1886 if (mr->umem && mlx5r_umr_can_load_pas(dev, mr->umem->length)) 1887 if (mlx5r_umr_revoke_mr(mr) || 1888 cache_ent_find_and_store(dev, mr)) 1889 mr->mmkey.cache_ent = NULL; 1890 1891 if (!mr->mmkey.cache_ent) { 1892 rc = destroy_mkey(to_mdev(mr->ibmr.device), mr); 1893 if (rc) 1894 return rc; 1895 } 1896 1897 if (mr->umem) { 1898 bool is_odp = is_odp_mr(mr); 1899 1900 if (!is_odp) 1901 atomic_sub(ib_umem_num_pages(mr->umem), 1902 &dev->mdev->priv.reg_pages); 1903 ib_umem_release(mr->umem); 1904 if (is_odp) 1905 mlx5_ib_free_odp_mr(mr); 1906 } 1907 1908 if (!mr->mmkey.cache_ent) 1909 mlx5_free_priv_descs(mr); 1910 1911 kfree(mr); 1912 return 0; 1913 } 1914 1915 static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs, 1916 int access_mode, int page_shift) 1917 { 1918 void *mkc; 1919 1920 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1921 1922 /* This is only used from the kernel, so setting the PD is OK. 
*/ 1923 set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd); 1924 MLX5_SET(mkc, mkc, free, 1); 1925 MLX5_SET(mkc, mkc, translations_octword_size, ndescs); 1926 MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3); 1927 MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7); 1928 MLX5_SET(mkc, mkc, umr_en, 1); 1929 MLX5_SET(mkc, mkc, log_page_size, page_shift); 1930 } 1931 1932 static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 1933 int ndescs, int desc_size, int page_shift, 1934 int access_mode, u32 *in, int inlen) 1935 { 1936 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1937 int err; 1938 1939 mr->access_mode = access_mode; 1940 mr->desc_size = desc_size; 1941 mr->max_descs = ndescs; 1942 1943 err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size); 1944 if (err) 1945 return err; 1946 1947 mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift); 1948 1949 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 1950 if (err) 1951 goto err_free_descs; 1952 1953 mr->mmkey.type = MLX5_MKEY_MR; 1954 mr->ibmr.lkey = mr->mmkey.key; 1955 mr->ibmr.rkey = mr->mmkey.key; 1956 1957 return 0; 1958 1959 err_free_descs: 1960 mlx5_free_priv_descs(mr); 1961 return err; 1962 } 1963 1964 static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd, 1965 u32 max_num_sg, u32 max_num_meta_sg, 1966 int desc_size, int access_mode) 1967 { 1968 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 1969 int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4); 1970 int page_shift = 0; 1971 struct mlx5_ib_mr *mr; 1972 u32 *in; 1973 int err; 1974 1975 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1976 if (!mr) 1977 return ERR_PTR(-ENOMEM); 1978 1979 mr->ibmr.pd = pd; 1980 mr->ibmr.device = pd->device; 1981 1982 in = kzalloc(inlen, GFP_KERNEL); 1983 if (!in) { 1984 err = -ENOMEM; 1985 goto err_free; 1986 } 1987 1988 if (access_mode == MLX5_MKC_ACCESS_MODE_MTT) 1989 page_shift = PAGE_SHIFT; 1990 1991 err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift, 1992 access_mode, in, inlen); 1993 if (err) 1994 goto err_free_in; 1995 1996 mr->umem = NULL; 1997 kfree(in); 1998 1999 return mr; 2000 2001 err_free_in: 2002 kfree(in); 2003 err_free: 2004 kfree(mr); 2005 return ERR_PTR(err); 2006 } 2007 2008 static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 2009 int ndescs, u32 *in, int inlen) 2010 { 2011 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt), 2012 PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in, 2013 inlen); 2014 } 2015 2016 static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 2017 int ndescs, u32 *in, int inlen) 2018 { 2019 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm), 2020 0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen); 2021 } 2022 2023 static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 2024 int max_num_sg, int max_num_meta_sg, 2025 u32 *in, int inlen) 2026 { 2027 struct mlx5_ib_dev *dev = to_mdev(pd->device); 2028 u32 psv_index[2]; 2029 void *mkc; 2030 int err; 2031 2032 mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); 2033 if (!mr->sig) 2034 return -ENOMEM; 2035 2036 /* create mem & wire PSVs */ 2037 err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index); 2038 if (err) 2039 goto err_free_sig; 2040 2041 mr->sig->psv_memory.psv_idx = psv_index[0]; 2042 mr->sig->psv_wire.psv_idx = psv_index[1]; 2043 2044 mr->sig->sig_status_checked = true; 2045 mr->sig->sig_err_exists = false; 2046 /* Next UMR, Arm SIGERR */ 2047 
	++mr->sig->sigerr_count;
	mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_klm),
					 MLX5_MKC_ACCESS_MODE_KLMS);
	if (IS_ERR(mr->klm_mr)) {
		err = PTR_ERR(mr->klm_mr);
		goto err_destroy_psv;
	}
	mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_mtt),
					 MLX5_MKC_ACCESS_MODE_MTT);
	if (IS_ERR(mr->mtt_mr)) {
		err = PTR_ERR(mr->mtt_mr);
		goto err_free_klm_mr;
	}

	/* Set bsf descriptors for mkey */
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, bsf_en, 1);
	MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);

	err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
				     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
	if (err)
		goto err_free_mtt_mr;

	err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			      mr->sig, GFP_KERNEL));
	if (err)
		goto err_free_descs;
	return 0;

err_free_descs:
	destroy_mkey(dev, mr);
	mlx5_free_priv_descs(mr);
err_free_mtt_mr:
	mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
	mr->mtt_mr = NULL;
err_free_klm_mr:
	mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
	mr->klm_mr = NULL;
err_destroy_psv:
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
			     mr->sig->psv_memory.psv_idx);
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
			     mr->sig->psv_wire.psv_idx);
err_free_sig:
	kfree(mr->sig);

	return err;
}

static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
					enum ib_mr_type mr_type, u32 max_num_sg,
					u32 max_num_meta_sg)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg, 4);
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mr->ibmr.device = pd->device;
	mr->umem = NULL;

	switch (mr_type) {
	case IB_MR_TYPE_MEM_REG:
		err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_SG_GAPS:
		err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_INTEGRITY:
		err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
						 max_num_meta_sg, in, inlen);
		break;
	default:
		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
		err = -EINVAL;
	}

	if (err)
		goto err_free_in;

	kfree(in);

	return &mr->ibmr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
			       u32 max_num_sg)
{
	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
}

struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
					 u32 max_num_sg, u32 max_num_meta_sg)
{
	return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
				  max_num_meta_sg);
}

int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mw *mw = to_mmw(ibmw);
	unsigned int ndescs;
	u32 *in = NULL;
	void *mkc;
	int err;
	struct mlx5_ib_alloc_mw req = {};
	struct {
		__u32	comp_mask;
		__u32	response_length;
	} resp = {};

	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
	if (err)
		return err;

	if (req.comp_mask || req.reserved1 || req.reserved2)
		return -EOPNOTSUPP;

	if (udata->inlen > sizeof(req) &&
	    !ib_is_udata_cleared(udata, sizeof(req),
				 udata->inlen - sizeof(req)))
		return -EOPNOTSUPP;

	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
	MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
	MLX5_SET(mkc, mkc, qpn, 0xffffff);

	err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
	if (err)
		goto free;

	mw->mmkey.type = MLX5_MKEY_MW;
	ibmw->rkey = mw->mmkey.key;
	mw->mmkey.ndescs = ndescs;

	resp.response_length =
		min(offsetofend(typeof(resp), response_length), udata->outlen);
	if (resp.response_length) {
		err = ib_copy_to_udata(udata, &resp, resp.response_length);
		if (err)
			goto free_mkey;
	}

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
		err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
		if (err)
			goto free_mkey;
	}

	kfree(in);
	return 0;

free_mkey:
	mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key);
free:
	kfree(in);
	return err;
}

int mlx5_ib_dealloc_mw(struct ib_mw *mw)
{
	struct mlx5_ib_dev *dev = to_mdev(mw->device);
	struct mlx5_ib_mw *mmw = to_mmw(mw);

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
		/*
		 * pagefault_single_data_segment() may be accessing mmw
		 * if the user bound an ODP MR to this MW.
		 */
		mlx5r_deref_wait_odp_mkey(&mmw->mmkey);

	return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key);
}

int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
			    struct ib_mr_status *mr_status)
{
	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
	int ret = 0;

	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
		pr_err("Invalid status check mask\n");
		ret = -EINVAL;
		goto done;
	}

	mr_status->fail_status = 0;
	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
		if (!mmr->sig) {
			ret = -EINVAL;
			pr_err("signature status check requested on a non-signature enabled MR\n");
			goto done;
		}

		mmr->sig->sig_status_checked = true;
		if (!mmr->sig->sig_err_exists)
			goto done;

		if (ibmr->lkey == mmr->sig->err_item.key)
			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
			       sizeof(mr_status->sig_err));
		else {
			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
			mr_status->sig_err.sig_err_offset = 0;
			mr_status->sig_err.key = mmr->sig->err_item.key;
		}

		mmr->sig->sig_err_exists = false;
		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
	}

done:
	return ret;
}

static int
mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			int data_sg_nents, unsigned int *data_sg_offset,
			struct scatterlist *meta_sg, int meta_sg_nents,
			unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	unsigned int sg_offset = 0;
	int n = 0;

	mr->meta_length = 0;
	if (data_sg_nents == 1) {
		n++;
		mr->mmkey.ndescs = 1;
		if (data_sg_offset)
			sg_offset = *data_sg_offset;
		mr->data_length = sg_dma_len(data_sg) - sg_offset;
		mr->data_iova = sg_dma_address(data_sg) + sg_offset;
		if (meta_sg_nents == 1) {
			n++;
			mr->meta_ndescs = 1;
			if (meta_sg_offset)
				sg_offset = *meta_sg_offset;
			else
				sg_offset = 0;
			mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
			mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
		}
		ibmr->length = mr->data_length + mr->meta_length;
	}

	return n;
}

static int
mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
		   struct scatterlist *sgl,
		   unsigned short sg_nents,
		   unsigned int *sg_offset_p,
		   struct scatterlist *meta_sgl,
		   unsigned short meta_sg_nents,
		   unsigned int *meta_sg_offset_p)
{
	struct scatterlist *sg = sgl;
	struct mlx5_klm *klms = mr->descs;
	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
	u32 lkey = mr->ibmr.pd->local_dma_lkey;
	int i, j = 0;

	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
	mr->ibmr.length = 0;

	for_each_sg(sgl, sg, sg_nents, i) {
		if (unlikely(i >= mr->max_descs))
			break;
		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
		klms[i].key = cpu_to_be32(lkey);
		mr->ibmr.length += sg_dma_len(sg) - sg_offset;

		sg_offset = 0;
	}

	if (sg_offset_p)
		*sg_offset_p = sg_offset;

	mr->mmkey.ndescs = i;
	mr->data_length = mr->ibmr.length;

	if (meta_sg_nents) {
		sg = meta_sgl;
		sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
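		/*
		 * Metadata KLMs are appended right after the i data KLMs,
		 * so a single descriptor table describes data and metadata
		 * back to back.
		 */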
		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
			if (unlikely(i + j >= mr->max_descs))
				break;
			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
						     sg_offset);
			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
							 sg_offset);
			klms[i + j].key = cpu_to_be32(lkey);
			mr->ibmr.length += sg_dma_len(sg) - sg_offset;

			sg_offset = 0;
		}
		if (meta_sg_offset_p)
			*meta_sg_offset_p = sg_offset;

		mr->meta_ndescs = j;
		mr->meta_length = mr->ibmr.length - mr->data_length;
	}

	return i + j;
}

static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
		cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int
mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	pi_mr->ibmr.page_size = ibmr->page_size;
	n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
			   mlx5_set_page);
	if (n != data_sg_nents)
		return n;

	pi_mr->data_iova = pi_mr->ibmr.iova;
	pi_mr->data_length = pi_mr->ibmr.length;
	pi_mr->ibmr.length = pi_mr->data_length;
	ibmr->length = pi_mr->data_length;

	if (meta_sg_nents) {
		u64 page_mask = ~((u64)ibmr->page_size - 1);
		u64 iova = pi_mr->data_iova;

		n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
				    meta_sg_offset, mlx5_set_page_pi);

		pi_mr->meta_length = pi_mr->ibmr.length;
		/*
		 * The PI address for the HW is the metadata address expressed
		 * relative to the first data page address: first data page
		 * address + size of the data pages + metadata offset within
		 * the first metadata page.
		 */
		pi_mr->pi_iova = (iova & page_mask) +
				 pi_mr->mmkey.ndescs * ibmr->page_size +
				 (pi_mr->ibmr.iova & ~page_mask);
		/*
		 * To use one MTT MR for both data and metadata, we also
		 * register the gap between the end of the data and the start
		 * of the metadata (the sig MR verifies that the HW accesses
		 * the right addresses). This mapping is safe because an
		 * internal mkey is used for the registration.
		 */
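		/*
		 * For example (illustrative numbers only): with a 4K page
		 * size, data_iova = 0x10200 mapped with ndescs = 3 MTT
		 * entries and a metadata buffer at DMA address 0x80010,
		 * pi_iova becomes 0x10000 + 3 * 0x1000 + 0x10 = 0x13010,
		 * and with meta_length = 0x200 the total length computed
		 * below is 0x13010 + 0x200 - 0x10200 = 0x3010.
		 */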
		pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
		pi_mr->ibmr.iova = iova;
		ibmr->length += pi_mr->meta_length;
	}

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}

static int
mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->klm_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
			       meta_sg, meta_sg_nents, meta_sg_offset);

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	/* This is a zero-based memory region */
	pi_mr->data_iova = 0;
	pi_mr->ibmr.iova = 0;
	pi_mr->pi_iova = pi_mr->data_length;
	ibmr->length = pi_mr->ibmr.length;

	return n;
}

int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = NULL;
	int n;

	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);

	mr->mmkey.ndescs = 0;
	mr->data_length = 0;
	mr->data_iova = 0;
	mr->meta_ndescs = 0;
	mr->pi_iova = 0;
	/*
	 * As a performance optimization, avoid a UMR operation to register
	 * the data/metadata buffers when possible. First try to map the sg
	 * lists to PA descriptors with local_dma_lkey, and fall back to UMR
	 * only if that fails.
	 */
	n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				    data_sg_offset, meta_sg, meta_sg_nents,
				    meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;
	/*
	 * As a performance optimization, avoid KLM descriptors when possible.
	 * First try to map the sg lists to MTT descriptors and fall back to
	 * KLM only if that fails. The HW works more efficiently with MTT
	 * descriptors (especially under high load); use KLM (indirect access)
	 * only when it is mandatory.
	 */
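	/*
	 * MTT entries describe whole, page-aligned chunks, so the MTT
	 * attempt below can fall short when intermediate SG entries are
	 * not page aligned; KLM entries carry arbitrary address/length
	 * pairs and can always describe the layout.
	 */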
	pi_mr = mr->mtt_mr;
	n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;

	pi_mr = mr->klm_mr;
	n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (unlikely(n != data_sg_nents + meta_sg_nents))
		return -ENOMEM;

out:
	/* This is a zero-based memory region */
	ibmr->iova = 0;
	mr->pi_mr = pi_mr;
	if (pi_mr)
		ibmr->sig_attrs->meta_length = pi_mr->meta_length;
	else
		ibmr->sig_attrs->meta_length = mr->meta_length;

	return 0;
}

int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
		      unsigned int *sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	int n;

	mr->mmkey.ndescs = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
				   mr->desc_size * mr->max_descs,
				   DMA_TO_DEVICE);

	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
				       NULL);
	else
		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
				   mlx5_set_page);

	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
				      mr->desc_size * mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}
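/*
 * Illustrative caller-side sketch (not part of this file) of how a ULP
 * typically drives the fast-registration path that ends up in
 * mlx5_ib_map_mr_sg() above:
 *
 *	struct ib_mr *mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, nents);
 *	int mapped;
 *
 *	// Map a DMA-mapped SG list into the MR at PAGE_SIZE granularity.
 *	mapped = ib_map_mr_sg(mr, sgl, nents, NULL, PAGE_SIZE);
 *	if (mapped < nents)
 *		; // handle a partial mapping, e.g. split the request
 *
 *	// The mapped MR is then registered to the HW with an IB_WR_REG_MR
 *	// work request and invalidated (IB_WR_LOCAL_INV or remote
 *	// send-with-invalidate) before ib_dereg_mr().
 */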