/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 * Copyright (c) 2020, Intel Corporation. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */


#include <linux/kref.h>
#include <linux/random.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include <rdma/ib_umem_odp.h>
#include "dm.h"
#include "mlx5_ib.h"
#include "umr.h"

enum {
	MAX_PENDING_REG_MR = 8,
};

#define MLX5_UMR_ALIGN 2048

static void
create_mkey_callback(int status, struct mlx5_async_work *context);
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
				     u64 iova, int access_flags,
				     unsigned int page_size, bool populate);

static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
					  struct ib_pd *pd)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);

	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
	MLX5_SET(mkc, mkc, lr, 1);

	if ((acc & IB_ACCESS_RELAXED_ORDERING) &&
	    pcie_relaxed_ordering_enabled(dev->mdev->pdev)) {
		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
			MLX5_SET(mkc, mkc, relaxed_ordering_write, 1);
		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
			MLX5_SET(mkc, mkc, relaxed_ordering_read, 1);
	}

	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET64(mkc, mkc, start_addr, start_addr);
}

static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in)
{
	u8 key = atomic_inc_return(&dev->mkey_var);
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, mkey_7_0, key);
	*mkey = key;
}

static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
			       struct mlx5_ib_mkey *mkey, u32 *in, int inlen)
{
	int ret;

	assign_mkey_variant(dev, &mkey->key, in);
	ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
	if (!ret)
		init_waitqueue_head(&mkey->wait);

	return ret;
}

static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create)
{
	struct mlx5_ib_dev *dev = async_create->ent->dev;
	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out);

	MLX5_SET(create_mkey_in, async_create->in, opcode,
		 MLX5_CMD_OP_CREATE_MKEY);
	assign_mkey_variant(dev, &async_create->mkey, async_create->in);
	return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen,
				async_create->out, outlen, create_mkey_callback,
				&async_create->cb_work);
}

static int mkey_cache_max_order(struct mlx5_ib_dev *dev);
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);

static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));

	return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
}

static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
{
	if (status == -ENXIO) /* core driver is not available */
		return;

	mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
	if (status != -EREMOTEIO) /* driver specific failure */
		return;

	/* Failed in FW, print cmd out failure details */
	mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
}

static int push_mkey_locked(struct mlx5_cache_ent *ent, bool limit_pendings,
			    void *to_store)
{
	XA_STATE(xas, &ent->mkeys, 0);
	void *curr;

	if (limit_pendings &&
	    (ent->reserved - ent->stored) > MAX_PENDING_REG_MR)
		return -EAGAIN;

	while (1) {
		/*
		 * This is cmpxchg (NULL, XA_ZERO_ENTRY) however this version
		 * doesn't transparently unlock. Instead we set the xas index to
		 * the current value of reserved every iteration.
		 */
		xas_set(&xas, ent->reserved);
		curr = xas_load(&xas);
		if (!curr) {
			if (to_store && ent->stored == ent->reserved)
				xas_store(&xas, to_store);
			else
				xas_store(&xas, XA_ZERO_ENTRY);
			if (xas_valid(&xas)) {
				ent->reserved++;
				if (to_store) {
					if (ent->stored != ent->reserved)
						__xa_store(&ent->mkeys,
							   ent->stored,
							   to_store,
							   GFP_KERNEL);
					ent->stored++;
					queue_adjust_cache_locked(ent);
					WRITE_ONCE(ent->dev->cache.last_add,
						   jiffies);
				}
			}
		}
		xa_unlock_irq(&ent->mkeys);

		/*
		 * Notice xas_nomem() must always be called as it cleans
		 * up any cached allocation.
		 */
		if (!xas_nomem(&xas, GFP_KERNEL))
			break;
		xa_lock_irq(&ent->mkeys);
	}
	xa_lock_irq(&ent->mkeys);
	if (xas_error(&xas))
		return xas_error(&xas);
	if (WARN_ON(curr))
		return -EINVAL;
	return 0;
}

static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings,
		     void *to_store)
{
	int ret;

	xa_lock_irq(&ent->mkeys);
	ret = push_mkey_locked(ent, limit_pendings, to_store);
	xa_unlock_irq(&ent->mkeys);
	return ret;
}

static void undo_push_reserve_mkey(struct mlx5_cache_ent *ent)
{
	void *old;

	ent->reserved--;
	old = __xa_erase(&ent->mkeys, ent->reserved);
	WARN_ON(old);
}

static void push_to_reserved(struct mlx5_cache_ent *ent, u32 mkey)
{
	void *old;

	old = __xa_store(&ent->mkeys, ent->stored, xa_mk_value(mkey), 0);
	WARN_ON(old);
	ent->stored++;
}

static u32 pop_stored_mkey(struct mlx5_cache_ent *ent)
{
	void *old, *xa_mkey;

	ent->stored--;
	ent->reserved--;

	if (ent->stored == ent->reserved) {
		xa_mkey = __xa_erase(&ent->mkeys, ent->stored);
		WARN_ON(!xa_mkey);
		return (u32)xa_to_value(xa_mkey);
	}

	xa_mkey = __xa_store(&ent->mkeys, ent->stored, XA_ZERO_ENTRY,
			     GFP_KERNEL);
	WARN_ON(!xa_mkey || xa_is_err(xa_mkey));
	old = __xa_erase(&ent->mkeys, ent->reserved);
	WARN_ON(old);
	return (u32)xa_to_value(xa_mkey);
}

static void create_mkey_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5r_async_create_mkey *mkey_out =
		container_of(context, struct mlx5r_async_create_mkey, cb_work);
	struct mlx5_cache_ent *ent = mkey_out->ent;
	struct mlx5_ib_dev *dev = ent->dev;
	unsigned long flags;

	if (status) {
		create_mkey_warn(dev, status, mkey_out->out);
		kfree(mkey_out);
		xa_lock_irqsave(&ent->mkeys, flags);
		undo_push_reserve_mkey(ent);
		WRITE_ONCE(dev->fill_delay, 1);
		xa_unlock_irqrestore(&ent->mkeys, flags);
		mod_timer(&dev->delay_timer, jiffies + HZ);
		return;
	}

	mkey_out->mkey |= mlx5_idx_to_mkey(
		MLX5_GET(create_mkey_out, mkey_out->out, mkey_index));
	WRITE_ONCE(dev->cache.last_add, jiffies);

	xa_lock_irqsave(&ent->mkeys, flags);
	push_to_reserved(ent, mkey_out->mkey);
	/* If we are doing fill_to_high_water then keep going. */
	queue_adjust_cache_locked(ent);
	xa_unlock_irqrestore(&ent->mkeys, flags);
	kfree(mkey_out);
}

static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
{
	int ret = 0;

	switch (access_mode) {
	case MLX5_MKC_ACCESS_MODE_MTT:
		ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
					   sizeof(struct mlx5_mtt));
		break;
	case MLX5_MKC_ACCESS_MODE_KSM:
		ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
					   sizeof(struct mlx5_klm));
		break;
	default:
		WARN_ON(1);
	}
	return ret;
}

static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
{
	set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2,
		 (ent->rb_key.access_mode >> 2) & 0x7);

	MLX5_SET(mkc, mkc, translations_octword_size,
		 get_mkc_octo_size(ent->rb_key.access_mode,
				   ent->rb_key.ndescs));
	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
}

/* Asynchronously schedule new MRs to be populated in the cache. */
static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
{
	struct mlx5r_async_create_mkey *async_create;
	void *mkc;
	int err = 0;
	int i;

	for (i = 0; i < num; i++) {
		async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey),
				       GFP_KERNEL);
		if (!async_create)
			return -ENOMEM;
		mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in,
				   memory_key_mkey_entry);
		set_cache_mkc(ent, mkc);
		async_create->ent = ent;

		err = push_mkey(ent, true, NULL);
		if (err)
			goto free_async_create;

		err = mlx5_ib_create_mkey_cb(async_create);
		if (err) {
			mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
			goto err_undo_reserve;
		}
	}

	return 0;

err_undo_reserve:
	xa_lock_irq(&ent->mkeys);
	undo_push_reserve_mkey(ent);
	xa_unlock_irq(&ent->mkeys);
free_async_create:
	kfree(async_create);
	return err;
}

/* Synchronously create a MR in the cache */
static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey)
{
	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	void *mkc;
	u32 *in;
	int err;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	set_cache_mkc(ent, mkc);

	err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen);
	if (err)
		goto free_in;

	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
free_in:
	kfree(in);
	return err;
}

static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
{
	u32 mkey;

	lockdep_assert_held(&ent->mkeys.xa_lock);
	if (!ent->stored)
		return;
	mkey = pop_stored_mkey(ent);
	xa_unlock_irq(&ent->mkeys);
	mlx5_core_destroy_mkey(ent->dev->mdev, mkey);
	xa_lock_irq(&ent->mkeys);
}

static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
				bool limit_fill)
	__acquires(&ent->mkeys) __releases(&ent->mkeys)
{
	int err;

	lockdep_assert_held(&ent->mkeys.xa_lock);

	while (true) {
		if (limit_fill)
			target = ent->limit * 2;
		if (target == ent->reserved)
			return 0;
		if (target > ent->reserved) {
			u32 todo = target - ent->reserved;

			xa_unlock_irq(&ent->mkeys);
			err = add_keys(ent, todo);
			if (err == -EAGAIN)
				usleep_range(3000, 5000);
			xa_lock_irq(&ent->mkeys);
			if (err) {
				if (err != -EAGAIN)
					return err;
			} else
				return 0;
		} else {
			remove_cache_mr_locked(ent);
		}
	}
}

static ssize_t size_write(struct file *filp, const char __user *buf,
			  size_t count, loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	u32 target;
	int err;

	err = kstrtou32_from_user(buf, count, 0, &target);
	if (err)
		return err;

	/*
	 * Target is the new value of total_mrs the user requests, however we
	 * cannot free MRs that are in use. Compute the target value for stored
	 * mkeys.
	 */
	xa_lock_irq(&ent->mkeys);
	if (target < ent->in_use) {
		err = -EINVAL;
		goto err_unlock;
	}
	target = target - ent->in_use;
	if (target < ent->limit || target > ent->limit*2) {
		err = -EINVAL;
		goto err_unlock;
	}
	err = resize_available_mrs(ent, target, false);
	if (err)
		goto err_unlock;
	xa_unlock_irq(&ent->mkeys);

	return count;

err_unlock:
	xa_unlock_irq(&ent->mkeys);
	return err;
}

static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
			 loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	char lbuf[20];
	int err;

	err = snprintf(lbuf, sizeof(lbuf), "%ld\n", ent->stored + ent->in_use);
	if (err < 0)
		return err;

	return simple_read_from_buffer(buf, count, pos, lbuf, err);
}

static const struct file_operations size_fops = {
	.owner	= THIS_MODULE,
	.open	= simple_open,
	.write	= size_write,
	.read	= size_read,
};

static ssize_t limit_write(struct file *filp, const char __user *buf,
			   size_t count, loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	u32 var;
	int err;

	err = kstrtou32_from_user(buf, count, 0, &var);
	if (err)
		return err;

	/*
	 * Upon set we immediately fill the cache to high water mark implied by
	 * the limit.
	 */
	xa_lock_irq(&ent->mkeys);
	ent->limit = var;
	err = resize_available_mrs(ent, 0, true);
	xa_unlock_irq(&ent->mkeys);
	if (err)
		return err;
	return count;
}

static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
			  loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	char lbuf[20];
	int err;

	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
	if (err < 0)
		return err;

	return simple_read_from_buffer(buf, count, pos, lbuf, err);
}

static const struct file_operations limit_fops = {
	.owner	= THIS_MODULE,
	.open	= simple_open,
	.write	= limit_write,
	.read	= limit_read,
};

static bool someone_adding(struct mlx5_mkey_cache *cache)
{
	struct mlx5_cache_ent *ent;
	struct rb_node *node;
	bool ret;

	mutex_lock(&cache->rb_lock);
	for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		xa_lock_irq(&ent->mkeys);
		ret = ent->stored < ent->limit;
		xa_unlock_irq(&ent->mkeys);
		if (ret) {
			mutex_unlock(&cache->rb_lock);
			return true;
		}
	}
	mutex_unlock(&cache->rb_lock);
	return false;
}

/*
 * Check if the bucket is outside the high/low water mark and schedule an async
 * update. The cache refill has hysteresis, once the low water mark is hit it is
 * refilled up to the high mark.
 */
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
{
	lockdep_assert_held(&ent->mkeys.xa_lock);

	if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp)
		return;
	if (ent->stored < ent->limit) {
		ent->fill_to_high_water = true;
		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	} else if (ent->fill_to_high_water &&
		   ent->reserved < 2 * ent->limit) {
		/*
		 * Once we start populating due to hitting a low water mark
		 * continue until we pass the high water mark.
		 */
		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	} else if (ent->stored == 2 * ent->limit) {
		ent->fill_to_high_water = false;
	} else if (ent->stored > 2 * ent->limit) {
		/* Queue deletion of excess entries */
		ent->fill_to_high_water = false;
		if (ent->stored != ent->reserved)
			queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
					   msecs_to_jiffies(1000));
		else
			mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	}
}

static void __cache_work_func(struct mlx5_cache_ent *ent)
{
	struct mlx5_ib_dev *dev = ent->dev;
	struct mlx5_mkey_cache *cache = &dev->cache;
	int err;

	xa_lock_irq(&ent->mkeys);
	if (ent->disabled)
		goto out;

	if (ent->fill_to_high_water && ent->reserved < 2 * ent->limit &&
	    !READ_ONCE(dev->fill_delay)) {
		xa_unlock_irq(&ent->mkeys);
		err = add_keys(ent, 1);
		xa_lock_irq(&ent->mkeys);
		if (ent->disabled)
			goto out;
		if (err) {
			/*
			 * EAGAIN only happens if there are pending MRs, so we
			 * will be rescheduled when storing them. The only
			 * failure path here is ENOMEM.
			 */
			if (err != -EAGAIN) {
				mlx5_ib_warn(
					dev,
					"add keys command failed, err %d\n",
					err);
				queue_delayed_work(cache->wq, &ent->dwork,
						   msecs_to_jiffies(1000));
			}
		}
	} else if (ent->stored > 2 * ent->limit) {
		bool need_delay;

		/*
		 * The remove_cache_mr() logic is performed as a garbage
		 * collection task. Such a task is intended to be run when no
		 * other active processes are running.
		 *
		 * need_resched() will return TRUE if there are user tasks
		 * to be activated in the near future.
		 *
		 * In such a case, we don't execute remove_cache_mr() and
		 * postpone the garbage collection work to try to run in the
		 * next cycle, in order to free CPU resources to other tasks.
		 */
		xa_unlock_irq(&ent->mkeys);
		need_delay = need_resched() || someone_adding(cache) ||
			     !time_after(jiffies,
					 READ_ONCE(cache->last_add) + 300 * HZ);
		xa_lock_irq(&ent->mkeys);
		if (ent->disabled)
			goto out;
		if (need_delay) {
			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
			goto out;
		}
		remove_cache_mr_locked(ent);
		queue_adjust_cache_locked(ent);
	}
out:
	xa_unlock_irq(&ent->mkeys);
}

static void delayed_cache_work_func(struct work_struct *work)
{
	struct mlx5_cache_ent *ent;

	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
	__cache_work_func(ent);
}

static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
			     struct mlx5r_cache_rb_key key2)
{
	int res;

	res = key1.ats - key2.ats;
	if (res)
		return res;

	res = key1.access_mode - key2.access_mode;
	if (res)
		return res;

	res = key1.access_flags - key2.access_flags;
	if (res)
		return res;

	/*
	 * keep ndescs the last in the compare table since the find function
	 * searches for an exact match on all properties and only closest
	 * match in size.
	 */
	return key1.ndescs - key2.ndescs;
}

static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
				 struct mlx5_cache_ent *ent)
{
	struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL;
	struct mlx5_cache_ent *cur;
	int cmp;

	/* Figure out where to put new node */
	while (*new) {
		cur = rb_entry(*new, struct mlx5_cache_ent, node);
		parent = *new;
		cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key);
		if (cmp > 0)
			new = &((*new)->rb_left);
		if (cmp < 0)
			new = &((*new)->rb_right);
		if (cmp == 0) {
			mutex_unlock(&cache->rb_lock);
			return -EEXIST;
		}
	}

	/* Add new node and rebalance tree. */
	rb_link_node(&ent->node, parent, new);
	rb_insert_color(&ent->node, &cache->rb_root);

	return 0;
}

static struct mlx5_cache_ent *
mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
			   struct mlx5r_cache_rb_key rb_key)
{
	struct rb_node *node = dev->cache.rb_root.rb_node;
	struct mlx5_cache_ent *cur, *smallest = NULL;
	int cmp;

	/*
	 * Find the smallest ent with order >= requested_order.
	 */
	while (node) {
		cur = rb_entry(node, struct mlx5_cache_ent, node);
		cmp = cache_ent_key_cmp(cur->rb_key, rb_key);
		if (cmp > 0) {
			smallest = cur;
			node = node->rb_left;
		}
		if (cmp < 0)
			node = node->rb_right;
		if (cmp == 0)
			return cur;
	}

	return (smallest &&
		smallest->rb_key.access_mode == rb_key.access_mode &&
		smallest->rb_key.access_flags == rb_key.access_flags &&
		smallest->rb_key.ats == rb_key.ats) ?
		       smallest :
		       NULL;
}

static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
					       struct mlx5_cache_ent *ent,
					       int access_flags)
{
	struct mlx5_ib_mr *mr;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	xa_lock_irq(&ent->mkeys);
	ent->in_use++;

	if (!ent->stored) {
		queue_adjust_cache_locked(ent);
		ent->miss++;
		xa_unlock_irq(&ent->mkeys);
		err = create_cache_mkey(ent, &mr->mmkey.key);
		if (err) {
			xa_lock_irq(&ent->mkeys);
			ent->in_use--;
			xa_unlock_irq(&ent->mkeys);
			kfree(mr);
			return ERR_PTR(err);
		}
	} else {
		mr->mmkey.key = pop_stored_mkey(ent);
		queue_adjust_cache_locked(ent);
		xa_unlock_irq(&ent->mkeys);
	}
	mr->mmkey.cache_ent = ent;
	mr->mmkey.type = MLX5_MKEY_MR;
	init_waitqueue_head(&mr->mmkey.wait);
	return mr;
}

static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev,
					 int access_flags)
{
	int ret = 0;

	if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
	    MLX5_CAP_GEN(dev->mdev, atomic) &&
	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
		ret |= IB_ACCESS_REMOTE_ATOMIC;

	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
	    MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
		ret |= IB_ACCESS_RELAXED_ORDERING;

	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
	    MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) &&
	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
		ret |= IB_ACCESS_RELAXED_ORDERING;

	return ret;
}

struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
				       int access_flags, int access_mode,
				       int ndescs)
{
	struct mlx5r_cache_rb_key rb_key = {
		.ndescs = ndescs,
		.access_mode = access_mode,
		.access_flags = get_unchangeable_access_flags(dev, access_flags)
	};
	struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key);

	if (!ent)
		return ERR_PTR(-EOPNOTSUPP);

	return _mlx5_mr_cache_alloc(dev, ent, access_flags);
}

static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
{
	u32 mkey;

	cancel_delayed_work(&ent->dwork);
	xa_lock_irq(&ent->mkeys);
	while (ent->stored) {
		mkey = pop_stored_mkey(ent);
		xa_unlock_irq(&ent->mkeys);
		mlx5_core_destroy_mkey(dev->mdev, mkey);
		xa_lock_irq(&ent->mkeys);
	}
	xa_unlock_irq(&ent->mkeys);
}

static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
{
	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	debugfs_remove_recursive(dev->cache.fs_root);
	dev->cache.fs_root = NULL;
}

static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev,
					    struct mlx5_cache_ent *ent)
{
	int order = order_base_2(ent->rb_key.ndescs);
	struct dentry *dir;

	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
		order = MLX5_IMR_KSM_CACHE_ENTRY + 2;

	sprintf(ent->name, "%d", order);
	dir = debugfs_create_dir(ent->name, dev->cache.fs_root);
	debugfs_create_file("size", 0600, dir, ent, &size_fops);
	debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
	debugfs_create_ulong("cur", 0400, dir, &ent->stored);
	debugfs_create_u32("miss", 0600, dir, &ent->miss);
}

static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
{
	struct dentry *dbg_root = mlx5_debugfs_get_dev_root(dev->mdev);
	struct mlx5_mkey_cache *cache = &dev->cache;

	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	cache->fs_root = debugfs_create_dir("mr_cache", dbg_root);
}

static void delay_time_func(struct timer_list *t)
{
	struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);

	WRITE_ONCE(dev->fill_delay, 0);
}

struct mlx5_cache_ent *
mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
			      struct mlx5r_cache_rb_key rb_key,
			      bool persistent_entry)
{
	struct mlx5_cache_ent *ent;
	int order;
	int ret;

	ent = kzalloc(sizeof(*ent), GFP_KERNEL);
	if (!ent)
		return ERR_PTR(-ENOMEM);

	xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
	ent->rb_key = rb_key;
	ent->dev = dev;
	ent->is_tmp = !persistent_entry;

	INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);

	ret = mlx5_cache_ent_insert(&dev->cache, ent);
	if (ret) {
		kfree(ent);
		return ERR_PTR(ret);
	}

	if (persistent_entry) {
		if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
			order = MLX5_IMR_KSM_CACHE_ENTRY;
		else
			order = order_base_2(rb_key.ndescs) - 2;

		if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
		    !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
		    mlx5r_umr_can_load_pas(dev, 0))
			ent->limit = dev->mdev->profile.mr_cache[order].limit;
		else
			ent->limit = 0;

		mlx5_mkey_cache_debugfs_add_ent(dev, ent);
	} else {
		mod_delayed_work(ent->dev->cache.wq,
				 &ent->dev->cache.remove_ent_dwork,
				 msecs_to_jiffies(30 * 1000));
	}

	return ent;
}

static void remove_ent_work_func(struct work_struct *work)
{
	struct mlx5_mkey_cache *cache;
	struct mlx5_cache_ent *ent;
	struct rb_node *cur;

	cache = container_of(work, struct mlx5_mkey_cache,
			     remove_ent_dwork.work);
	mutex_lock(&cache->rb_lock);
	cur = rb_last(&cache->rb_root);
	while (cur) {
		ent = rb_entry(cur, struct mlx5_cache_ent, node);
		cur = rb_prev(cur);
		mutex_unlock(&cache->rb_lock);

		xa_lock_irq(&ent->mkeys);
		if (!ent->is_tmp) {
			xa_unlock_irq(&ent->mkeys);
			mutex_lock(&cache->rb_lock);
			continue;
		}
		xa_unlock_irq(&ent->mkeys);

		clean_keys(ent->dev, ent);
		mutex_lock(&cache->rb_lock);
	}
	mutex_unlock(&cache->rb_lock);
}

int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
{
	struct mlx5_mkey_cache *cache = &dev->cache;
	struct rb_root *root = &dev->cache.rb_root;
	struct mlx5r_cache_rb_key rb_key = {
		.access_mode = MLX5_MKC_ACCESS_MODE_MTT,
	};
	struct mlx5_cache_ent *ent;
	struct rb_node *node;
	int ret;
	int i;

	mutex_init(&dev->slow_path_mutex);
	mutex_init(&dev->cache.rb_lock);
	dev->cache.rb_root = RB_ROOT;
	INIT_DELAYED_WORK(&dev->cache.remove_ent_dwork, remove_ent_work_func);
	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
	if (!cache->wq) {
		mlx5_ib_warn(dev, "failed to create work queue\n");
		return -ENOMEM;
	}

	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
	timer_setup(&dev->delay_timer, delay_time_func, 0);
	mlx5_mkey_cache_debugfs_init(dev);
	mutex_lock(&cache->rb_lock);
	for (i = 0; i <= mkey_cache_max_order(dev); i++) {
		rb_key.ndescs = 1 << (i + 2);
		ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
		if (IS_ERR(ent)) {
			ret = PTR_ERR(ent);
			goto err;
		}
	}

	ret = mlx5_odp_init_mkey_cache(dev);
	if (ret)
		goto err;

	mutex_unlock(&cache->rb_lock);
	for (node = rb_first(root); node; node = rb_next(node)) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		xa_lock_irq(&ent->mkeys);
		queue_adjust_cache_locked(ent);
		xa_unlock_irq(&ent->mkeys);
	}

	return 0;

err:
	mutex_unlock(&cache->rb_lock);
	mlx5_mkey_cache_debugfs_cleanup(dev);
	mlx5_ib_warn(dev, "failed to create mkey cache entry\n");
	return ret;
}

void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
{
	struct rb_root *root = &dev->cache.rb_root;
	struct mlx5_cache_ent *ent;
	struct rb_node *node;

	if (!dev->cache.wq)
		return;

	cancel_delayed_work_sync(&dev->cache.remove_ent_dwork);
	mutex_lock(&dev->cache.rb_lock);
	for (node = rb_first(root); node; node = rb_next(node)) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		xa_lock_irq(&ent->mkeys);
		ent->disabled = true;
		xa_unlock_irq(&ent->mkeys);
		cancel_delayed_work_sync(&ent->dwork);
	}

	mlx5_mkey_cache_debugfs_cleanup(dev);
	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);

	node = rb_first(root);
	while (node) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		node = rb_next(node);
		clean_keys(dev, ent);
		rb_erase(&ent->node, root);
		kfree(ent);
	}
	mutex_unlock(&dev->cache.rb_lock);

	destroy_workqueue(dev->cache.wq);
	del_timer_sync(&dev->delay_timer);
}

struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
	MLX5_SET(mkc, mkc, length64, 1);
	set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
				      pd);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_in;

	kfree(in);
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->umem = NULL;

	return &mr->ibmr;

err_in:
	kfree(in);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

static int get_octo_len(u64 addr, u64 len, int page_shift)
{
	u64 page_size = 1ULL << page_shift;
	u64 offset;
	int npages;

	offset = addr & (page_size - 1);
	npages = ALIGN(len + offset, page_size) >> page_shift;
	return (npages + 1) / 2;
}

static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
{
	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		return MKEY_CACHE_LAST_STD_ENTRY;
	return MLX5_MAX_UMR_SHIFT;
}

static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
			  u64 length, int access_flags, u64 iova)
{
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->ibmr.length = length;
	mr->ibmr.device = &dev->ib_dev;
	mr->ibmr.iova = iova;
	mr->access_flags = access_flags;
}

static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
						  u64 iova)
{
	/*
	 * The alignment of iova has already been checked upon entering
	 * UVERBS_METHOD_REG_DMABUF_MR
	 */
	umem->iova = iova;
	return PAGE_SIZE;
}

static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
					     struct ib_umem *umem, u64 iova,
					     int access_flags)
{
	struct mlx5r_cache_rb_key rb_key = {
		.access_mode = MLX5_MKC_ACCESS_MODE_MTT,
	};
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_cache_ent *ent;
	struct mlx5_ib_mr *mr;
	unsigned int page_size;

	if (umem->is_dmabuf)
		page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
	else
		page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size,
						     0, iova);
	if (WARN_ON(!page_size))
		return ERR_PTR(-EINVAL);

	rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
	rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
	rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
	ent = mkey_cache_ent_from_rb_key(dev, rb_key);
	/*
	 * If the MR can't come from the cache then synchronously create an
	 * uncached one.
	 */
	if (!ent) {
		mutex_lock(&dev->slow_path_mutex);
		mr = reg_create(pd, umem, iova, access_flags, page_size, false);
		mutex_unlock(&dev->slow_path_mutex);
		if (IS_ERR(mr))
			return mr;
		mr->mmkey.rb_key = rb_key;
		return mr;
	}

	mr = _mlx5_mr_cache_alloc(dev, ent, access_flags);
	if (IS_ERR(mr))
		return mr;

	mr->ibmr.pd = pd;
	mr->umem = umem;
	mr->page_shift = order_base_2(page_size);
	set_mr_fields(dev, mr, umem->length, access_flags, iova);

	return mr;
}

/*
 * If ibmr is NULL it will be allocated by reg_create.
 * Else, the given ibmr will be used.
 */
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
				     u64 iova, int access_flags,
				     unsigned int page_size, bool populate)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr;
	__be64 *pas;
	void *mkc;
	int inlen;
	u32 *in;
	int err;
	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));

	if (!page_size)
		return ERR_PTR(-EINVAL);
	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->ibmr.pd = pd;
	mr->access_flags = access_flags;
	mr->page_shift = order_base_2(page_size);

	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	if (populate)
		inlen += sizeof(*pas) *
			 roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_1;
	}
	pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
	if (populate) {
		if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) {
			err = -EINVAL;
			goto err_2;
		}
		mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
				     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
	}

	/* The pg_access bit allows setting the access flags
	 * in the page list submitted with the command. */
	MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
				      populate ? pd : dev->umrc.pd);
	MLX5_SET(mkc, mkc, free, !populate);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
	MLX5_SET(mkc, mkc, umr_en, 1);

	MLX5_SET64(mkc, mkc, len, umem->length);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	MLX5_SET(mkc, mkc, translations_octword_size,
		 get_octo_len(iova, umem->length, mr->page_shift));
	MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
	if (mlx5_umem_needs_ats(dev, umem, access_flags))
		MLX5_SET(mkc, mkc, ma_translation_mode, 1);
	if (populate) {
		MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
			 get_octo_len(iova, umem->length, mr->page_shift));
	}

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err) {
		mlx5_ib_warn(dev, "create mkey failed\n");
		goto err_2;
	}
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->mmkey.ndescs = get_octo_len(iova, umem->length, mr->page_shift);
	mr->umem = umem;
	set_mr_fields(dev, mr, umem->length, access_flags, iova);
	kvfree(in);

	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);

	return mr;

err_2:
	kvfree(in);
err_1:
	kfree(mr);
	return ERR_PTR(err);
}

static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
				       u64 length, int acc, int mode)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
	MLX5_SET64(mkc, mkc, len, length);
	set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_in;

	kfree(in);

	set_mr_fields(dev, mr, length, acc, start_addr);

	return &mr->ibmr;

err_in:
	kfree(in);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

int mlx5_ib_advise_mr(struct ib_pd *pd,
		      enum ib_uverbs_advise_mr_advice advice,
		      u32 flags,
		      struct ib_sge *sg_list,
		      u32 num_sge,
		      struct uverbs_attr_bundle *attrs)
{
	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
		return -EOPNOTSUPP;

	return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
					  sg_list, num_sge);
}

struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
				struct ib_dm_mr_attr *attr,
				struct uverbs_attr_bundle *attrs)
{
	struct mlx5_ib_dm *mdm = to_mdm(dm);
	struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
	u64 start_addr = mdm->dev_addr + attr->offset;
	int mode;

	switch (mdm->type) {
	case MLX5_IB_UAPI_DM_TYPE_MEMIC:
		if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
			return ERR_PTR(-EINVAL);

		mode = MLX5_MKC_ACCESS_MODE_MEMIC;
		start_addr -= pci_resource_start(dev->pdev, 0);
		break;
	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
		if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
			return ERR_PTR(-EINVAL);

		mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}

	return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
				 attr->access_flags, mode);
}

static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
				    u64 iova, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr = NULL;
	bool xlt_with_umr;
	int err;

	xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
	if (xlt_with_umr) {
		mr = alloc_cacheable_mr(pd, umem, iova, access_flags);
	} else {
		unsigned int page_size = mlx5_umem_find_best_pgsz(
			umem, mkc, log_page_size, 0, iova);

		mutex_lock(&dev->slow_path_mutex);
		mr = reg_create(pd, umem, iova, access_flags, page_size, true);
		mutex_unlock(&dev->slow_path_mutex);
	}
	if (IS_ERR(mr)) {
		ib_umem_release(umem);
		return ERR_CAST(mr);
	}

	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);

	atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);

	if (xlt_with_umr) {
		/*
		 * If the MR was created with reg_create then it will be
		 * configured properly but left disabled. It is safe to go ahead
		 * and configure it again via UMR while enabling it.
		 */
		err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
		if (err) {
			mlx5_ib_dereg_mr(&mr->ibmr, NULL);
			return ERR_PTR(err);
		}
	}
	return &mr->ibmr;
}

static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
					u64 iova, int access_flags,
					struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem_odp *odp;
	struct mlx5_ib_mr *mr;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
		return ERR_PTR(-EOPNOTSUPP);

	err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
	if (err)
		return ERR_PTR(err);
	if (!start && length == U64_MAX) {
		if (iova != 0)
			return ERR_PTR(-EINVAL);
		if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
			return ERR_PTR(-EINVAL);

		mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
		if (IS_ERR(mr))
			return ERR_CAST(mr);
		return &mr->ibmr;
	}

	/* ODP requires xlt update via umr to work. */
	if (!mlx5r_umr_can_load_pas(dev, length))
		return ERR_PTR(-EINVAL);

	odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
			      &mlx5_mn_ops);
	if (IS_ERR(odp))
		return ERR_CAST(odp);

	mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags);
	if (IS_ERR(mr)) {
		ib_umem_release(&odp->umem);
		return ERR_CAST(mr);
	}
	xa_init(&mr->implicit_children);

	odp->private = mr;
	err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
	if (err)
		goto err_dereg_mr;

	err = mlx5_ib_init_odp_mr(mr);
	if (err)
		goto err_dereg_mr;
	return &mr->ibmr;

err_dereg_mr:
	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
	return ERR_PTR(err);
}

struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
				  u64 iova, int access_flags,
				  struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem *umem;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
		return ERR_PTR(-EOPNOTSUPP);

	mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
		    start, iova, length, access_flags);

	if (access_flags & IB_ACCESS_ON_DEMAND)
		return create_user_odp_mr(pd, start, length, iova, access_flags,
					  udata);
	umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
	if (IS_ERR(umem))
		return ERR_CAST(umem);
	return create_real_mr(pd, umem, iova, access_flags);
}

static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
{
	struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
	struct mlx5_ib_mr *mr = umem_dmabuf->private;

	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);

	if (!umem_dmabuf->sgt)
		return;

	mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
}

static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
	.allow_peer2peer = 1,
	.move_notify = mlx5_ib_dmabuf_invalidate_cb,
};

struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
					 u64 length, u64 virt_addr,
					 int fd, int access_flags,
					 struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr = NULL;
	struct ib_umem_dmabuf *umem_dmabuf;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
	    !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
		return ERR_PTR(-EOPNOTSUPP);

	mlx5_ib_dbg(dev,
		    "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
		    offset, virt_addr, length, fd, access_flags);

	/* dmabuf requires xlt update via umr to work. */
	if (!mlx5r_umr_can_load_pas(dev, length))
		return ERR_PTR(-EINVAL);

	umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd,
					 access_flags,
					 &mlx5_ib_dmabuf_attach_ops);
	if (IS_ERR(umem_dmabuf)) {
		mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
			    PTR_ERR(umem_dmabuf));
		return ERR_CAST(umem_dmabuf);
	}

	mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
				access_flags);
	if (IS_ERR(mr)) {
		ib_umem_release(&umem_dmabuf->umem);
		return ERR_CAST(mr);
	}

	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);

	atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
	umem_dmabuf->private = mr;
	err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
	if (err)
		goto err_dereg_mr;

	err = mlx5_ib_init_dmabuf_mr(mr);
	if (err)
		goto err_dereg_mr;
	return &mr->ibmr;

err_dereg_mr:
	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
	return ERR_PTR(err);
}

/*
 * True if the change in access flags can be done via UMR, only some access
 * flags can be updated.
 */
static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
				     unsigned int current_access_flags,
				     unsigned int target_access_flags)
{
	unsigned int diffs = current_access_flags ^ target_access_flags;

	if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
		      IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING))
		return false;
	return mlx5r_umr_can_reconfig(dev, current_access_flags,
				      target_access_flags);
}

static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
				  struct ib_umem *new_umem,
				  int new_access_flags, u64 iova,
				  unsigned long *page_size)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);

	/* We only track the allocated sizes of MRs from the cache */
	if (!mr->mmkey.cache_ent)
		return false;
	if (!mlx5r_umr_can_load_pas(dev, new_umem->length))
		return false;

	*page_size =
		mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova);
	if (WARN_ON(!*page_size))
		return false;
	return (mr->mmkey.cache_ent->rb_key.ndescs) >=
	       ib_umem_num_dma_blocks(new_umem, *page_size);
}

static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
			 int access_flags, int flags, struct ib_umem *new_umem,
			 u64 iova, unsigned long page_size)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
	struct ib_umem *old_umem = mr->umem;
	int err;

	/*
	 * To keep everything simple the MR is revoked before we start to mess
	 * with it. This ensures the change is atomic relative to any use of
	 * the MR.
	 */
	err = mlx5r_umr_revoke_mr(mr);
	if (err)
		return err;

	if (flags & IB_MR_REREG_PD) {
		mr->ibmr.pd = pd;
		upd_flags |= MLX5_IB_UPD_XLT_PD;
	}
	if (flags & IB_MR_REREG_ACCESS) {
		mr->access_flags = access_flags;
		upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
	}

	mr->ibmr.iova = iova;
	mr->ibmr.length = new_umem->length;
	mr->page_shift = order_base_2(page_size);
	mr->umem = new_umem;
	err = mlx5r_umr_update_mr_pas(mr, upd_flags);
	if (err) {
		/*
		 * The MR is revoked at this point so there is no issue to free
		 * new_umem.
		 */
		mr->umem = old_umem;
		return err;
	}

	atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
	ib_umem_release(old_umem);
	atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
	return 0;
}

struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
				    u64 length, u64 iova, int new_access_flags,
				    struct ib_pd *new_pd,
				    struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
	struct mlx5_ib_mr *mr = to_mmr(ib_mr);
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
		return ERR_PTR(-EOPNOTSUPP);

	mlx5_ib_dbg(
		dev,
		"start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
		start, iova, length, new_access_flags);

	if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
		return ERR_PTR(-EOPNOTSUPP);

	if (!(flags & IB_MR_REREG_ACCESS))
		new_access_flags = mr->access_flags;
	if (!(flags & IB_MR_REREG_PD))
		new_pd = ib_mr->pd;

	if (!(flags & IB_MR_REREG_TRANS)) {
		struct ib_umem *umem;

		/* Fast path for PD/access change */
		if (can_use_umr_rereg_access(dev, mr->access_flags,
					     new_access_flags)) {
			err = mlx5r_umr_rereg_pd_access(mr, new_pd,
							new_access_flags);
			if (err)
				return ERR_PTR(err);
			return NULL;
		}
		/* DM or ODP MR's don't have a normal umem so we can't re-use it */
		if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
			goto recreate;

		/*
		 * Only one active MR can refer to a umem at one time, revoke
		 * the old MR before assigning the umem to the new one.
		 */
		err = mlx5r_umr_revoke_mr(mr);
		if (err)
			return ERR_PTR(err);
		umem = mr->umem;
		mr->umem = NULL;
		atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);

		return create_real_mr(new_pd, umem, mr->ibmr.iova,
				      new_access_flags);
	}

	/*
	 * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does
	 * but the logic around releasing the umem is different
	 */
	if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
		goto recreate;

	if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
	    can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
		struct ib_umem *new_umem;
		unsigned long page_size;

		new_umem = ib_umem_get(&dev->ib_dev, start, length,
				       new_access_flags);
		if (IS_ERR(new_umem))
			return ERR_CAST(new_umem);

		/* Fast path for PAS change */
		if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
					  &page_size)) {
			err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
					    new_umem, iova, page_size);
			if (err) {
				ib_umem_release(new_umem);
				return ERR_PTR(err);
			}
			return NULL;
		}
		return create_real_mr(new_pd, new_umem, iova, new_access_flags);
	}

	/*
	 * Everything else has no state we can preserve, just create a new MR
	 * from scratch
	 */
recreate:
	return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
				   new_access_flags, udata);
}

static int
mlx5_alloc_priv_descs(struct ib_device *device,
		      struct mlx5_ib_mr *mr,
		      int ndescs,
		      int desc_size)
{
	struct mlx5_ib_dev *dev = to_mdev(device);
	struct device *ddev = &dev->mdev->pdev->dev;
	int size = ndescs * desc_size;
	int add_size;
	int ret;

	add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);

	mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
	if (!mr->descs_alloc)
		return -ENOMEM;

	mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);

	mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
	if (dma_mapping_error(ddev, mr->desc_map)) {
		ret = -ENOMEM;
		goto err;
	}

	return 0;
err:
	kfree(mr->descs_alloc);

	return ret;
}

static void
mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
{
	if (!mr->umem && mr->descs) {
		struct ib_device *device = mr->ibmr.device;
		int size = mr->max_descs * mr->desc_size;
		struct mlx5_ib_dev *dev = to_mdev(device);

		dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
				 DMA_TO_DEVICE);
		kfree(mr->descs_alloc);
		mr->descs = NULL;
	}
}

static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
				    struct mlx5_ib_mr *mr)
{
	struct mlx5_mkey_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;
	int ret;

	if (mr->mmkey.cache_ent) {
		xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
		mr->mmkey.cache_ent->in_use--;
		goto end;
	}

	mutex_lock(&cache->rb_lock);
	ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key);
	if (ent) {
		if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) {
			if (ent->disabled) {
				mutex_unlock(&cache->rb_lock);
				return -EOPNOTSUPP;
			}
			mr->mmkey.cache_ent = ent;
			xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
			mutex_unlock(&cache->rb_lock);
			goto end;
		}
	}

	ent = mlx5r_cache_create_ent_locked(dev, mr->mmkey.rb_key, false);
	mutex_unlock(&cache->rb_lock);
	if (IS_ERR(ent))
		return PTR_ERR(ent);

	mr->mmkey.cache_ent = ent;
	xa_lock_irq(&mr->mmkey.cache_ent->mkeys);

end:
	ret = push_mkey_locked(mr->mmkey.cache_ent, false,
			       xa_mk_value(mr->mmkey.key));
	xa_unlock_irq(&mr->mmkey.cache_ent->mkeys);
	return ret;
}

int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
	int rc;

	/*
	 * Any async use of the mr must hold the refcount, once the refcount
	 * goes to zero no other thread, such as ODP page faults, prefetch, any
	 * UMR activity, etc can touch the mkey. Thus it is safe to destroy it.
	 */
	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    refcount_read(&mr->mmkey.usecount) != 0 &&
	    xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
		mlx5r_deref_wait_odp_mkey(&mr->mmkey);

	if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
		xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			   mr->sig, NULL, GFP_KERNEL);

		if (mr->mtt_mr) {
			rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
			if (rc)
				return rc;
			mr->mtt_mr = NULL;
		}
		if (mr->klm_mr) {
			rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
			if (rc)
				return rc;
			mr->klm_mr = NULL;
		}

		if (mlx5_core_destroy_psv(dev->mdev,
					  mr->sig->psv_memory.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
				     mr->sig->psv_memory.psv_idx);
		if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
				     mr->sig->psv_wire.psv_idx);
		kfree(mr->sig);
		mr->sig = NULL;
	}

	/* Stop DMA */
	if (mr->umem && mlx5r_umr_can_load_pas(dev, mr->umem->length))
		if (mlx5r_umr_revoke_mr(mr) ||
		    cache_ent_find_and_store(dev, mr))
			mr->mmkey.cache_ent = NULL;

	if (!mr->mmkey.cache_ent) {
		rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
		if (rc)
			return rc;
	}

	if (mr->umem) {
		bool is_odp = is_odp_mr(mr);

		if (!is_odp)
			atomic_sub(ib_umem_num_pages(mr->umem),
				   &dev->mdev->priv.reg_pages);
		ib_umem_release(mr->umem);
		if (is_odp)
			mlx5_ib_free_odp_mr(mr);
	}

	if (!mr->mmkey.cache_ent)
		mlx5_free_priv_descs(mr);

	kfree(mr);
	return 0;
}

static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
				   int access_mode, int page_shift)
{
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	/* This is only used from the kernel, so setting the PD is OK. */
	set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, log_page_size, page_shift);
}

static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				  int ndescs, int desc_size, int page_shift,
				  int access_mode, u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int err;

	mr->access_mode = access_mode;
	mr->desc_size = desc_size;
	mr->max_descs = ndescs;

	err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
	if (err)
		return err;

	mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_free_descs;

	mr->mmkey.type = MLX5_MKEY_MR;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;

	return 0;

err_free_descs:
	mlx5_free_priv_descs(mr);
	return err;
}

static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
				u32 max_num_sg, u32 max_num_meta_sg,
				int desc_size, int access_mode)
{
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
	int page_shift = 0;
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->ibmr.pd = pd;
	mr->ibmr.device = pd->device;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
		page_shift = PAGE_SHIFT;

	err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
				     access_mode, in, inlen);
	if (err)
		goto err_free_in;

	mr->umem = NULL;
	kfree(in);

	return mr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				    int ndescs, u32 *in, int inlen)
{
	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
				      PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
				      inlen);
}

static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				    int ndescs, u32 *in, int inlen)
{
	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
				      0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
}

static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				      int max_num_sg, int max_num_meta_sg,
				      u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	u32 psv_index[2];
	void *mkc;
	int err;

	mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
	if (!mr->sig)
		return -ENOMEM;

	/* create mem & wire PSVs */
	err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
	if (err)
		goto err_free_sig;

	mr->sig->psv_memory.psv_idx = psv_index[0];
	mr->sig->psv_wire.psv_idx = psv_index[1];

	mr->sig->sig_status_checked = true;
	mr->sig->sig_err_exists = false;
	/* Next UMR, Arm SIGERR */
static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				      int max_num_sg, int max_num_meta_sg,
				      u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	u32 psv_index[2];
	void *mkc;
	int err;

	mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
	if (!mr->sig)
		return -ENOMEM;

	/* create mem & wire PSVs */
	err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
	if (err)
		goto err_free_sig;

	mr->sig->psv_memory.psv_idx = psv_index[0];
	mr->sig->psv_wire.psv_idx = psv_index[1];

	mr->sig->sig_status_checked = true;
	mr->sig->sig_err_exists = false;
	/* Next UMR, Arm SIGERR */
	++mr->sig->sigerr_count;
	mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_klm),
					 MLX5_MKC_ACCESS_MODE_KLMS);
	if (IS_ERR(mr->klm_mr)) {
		err = PTR_ERR(mr->klm_mr);
		goto err_destroy_psv;
	}
	mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_mtt),
					 MLX5_MKC_ACCESS_MODE_MTT);
	if (IS_ERR(mr->mtt_mr)) {
		err = PTR_ERR(mr->mtt_mr);
		goto err_free_klm_mr;
	}

	/* Set bsf descriptors for mkey */
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, bsf_en, 1);
	MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);

	err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
				     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
	if (err)
		goto err_free_mtt_mr;

	err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			      mr->sig, GFP_KERNEL));
	if (err)
		goto err_free_descs;
	return 0;

err_free_descs:
	destroy_mkey(dev, mr);
	mlx5_free_priv_descs(mr);
err_free_mtt_mr:
	mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
	mr->mtt_mr = NULL;
err_free_klm_mr:
	mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
	mr->klm_mr = NULL;
err_destroy_psv:
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
			     mr->sig->psv_memory.psv_idx);
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
			     mr->sig->psv_wire.psv_idx);
err_free_sig:
	kfree(mr->sig);

	return err;
}

static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
					enum ib_mr_type mr_type, u32 max_num_sg,
					u32 max_num_meta_sg)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg, 4);
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mr->ibmr.device = pd->device;
	mr->umem = NULL;

	switch (mr_type) {
	case IB_MR_TYPE_MEM_REG:
		err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_SG_GAPS:
		err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_INTEGRITY:
		err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
						 max_num_meta_sg, in, inlen);
		break;
	default:
		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
		err = -EINVAL;
	}

	if (err)
		goto err_free_in;

	kfree(in);

	return &mr->ibmr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
			       u32 max_num_sg)
{
	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
}

struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
					 u32 max_num_sg, u32 max_num_meta_sg)
{
	return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
				  max_num_meta_sg);
}

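/*
 * Rough sketch of the kernel ULP call flow that reaches the allocators above
 * and the map_mr_sg helpers below (illustrative only; local variable names
 * are made up):
 *
 *	mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, nents);
 *	...
 *	ndesc = ib_map_mr_sg(mr, sgl, nents, NULL, PAGE_SIZE);
 *
 * and, for integrity MRs:
 *
 *	mr = ib_alloc_mr_integrity(pd, max_data_sg, max_meta_sg);
 *	...
 *	ndesc = ib_map_mr_sg_pi(mr, data_sg, data_nents, NULL,
 *				meta_sg, meta_nents, NULL, PAGE_SIZE);
 *
 * The RDMA core dispatches these calls to mlx5_ib_alloc_mr(),
 * mlx5_ib_alloc_mr_integrity(), mlx5_ib_map_mr_sg() and
 * mlx5_ib_map_mr_sg_pi() through the device's ib_device_ops.
 */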
int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mw *mw = to_mmw(ibmw);
	unsigned int ndescs;
	u32 *in = NULL;
	void *mkc;
	int err;
	struct mlx5_ib_alloc_mw req = {};
	struct {
		__u32	comp_mask;
		__u32	response_length;
	} resp = {};

	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
	if (err)
		return err;

	if (req.comp_mask || req.reserved1 || req.reserved2)
		return -EOPNOTSUPP;

	if (udata->inlen > sizeof(req) &&
	    !ib_is_udata_cleared(udata, sizeof(req),
				 udata->inlen - sizeof(req)))
		return -EOPNOTSUPP;

	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
	MLX5_SET(mkc, mkc, en_rinval, !!(ibmw->type == IB_MW_TYPE_2));
	MLX5_SET(mkc, mkc, qpn, 0xffffff);

	err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
	if (err)
		goto free;

	mw->mmkey.type = MLX5_MKEY_MW;
	ibmw->rkey = mw->mmkey.key;
	mw->mmkey.ndescs = ndescs;

	resp.response_length =
		min(offsetofend(typeof(resp), response_length), udata->outlen);
	if (resp.response_length) {
		err = ib_copy_to_udata(udata, &resp, resp.response_length);
		if (err)
			goto free_mkey;
	}

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
		err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
		if (err)
			goto free_mkey;
	}

	kfree(in);
	return 0;

free_mkey:
	mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key);
free:
	kfree(in);
	return err;
}

int mlx5_ib_dealloc_mw(struct ib_mw *mw)
{
	struct mlx5_ib_dev *dev = to_mdev(mw->device);
	struct mlx5_ib_mw *mmw = to_mmw(mw);

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
		/*
		 * pagefault_single_data_segment() may be accessing mmw
		 * if the user bound an ODP MR to this MW.
		 */
		mlx5r_deref_wait_odp_mkey(&mmw->mmkey);

	return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key);
}

int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
			    struct ib_mr_status *mr_status)
{
	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
	int ret = 0;

	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
		pr_err("Invalid status check mask\n");
		ret = -EINVAL;
		goto done;
	}

	mr_status->fail_status = 0;
	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
		if (!mmr->sig) {
			ret = -EINVAL;
			pr_err("signature status check requested on a non-signature enabled MR\n");
			goto done;
		}

		mmr->sig->sig_status_checked = true;
		if (!mmr->sig->sig_err_exists)
			goto done;

		if (ibmr->lkey == mmr->sig->err_item.key)
			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
			       sizeof(mr_status->sig_err));
		else {
			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
			mr_status->sig_err.sig_err_offset = 0;
			mr_status->sig_err.key = mmr->sig->err_item.key;
		}

		mmr->sig->sig_err_exists = false;
		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
	}

done:
	return ret;
}

static int
mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			int data_sg_nents, unsigned int *data_sg_offset,
			struct scatterlist *meta_sg, int meta_sg_nents,
			unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	unsigned int sg_offset = 0;
	int n = 0;

	mr->meta_length = 0;
	if (data_sg_nents == 1) {
		n++;
		mr->mmkey.ndescs = 1;
		if (data_sg_offset)
			sg_offset = *data_sg_offset;
		mr->data_length = sg_dma_len(data_sg) - sg_offset;
		mr->data_iova = sg_dma_address(data_sg) + sg_offset;
		if (meta_sg_nents == 1) {
			n++;
			mr->meta_ndescs = 1;
			if (meta_sg_offset)
				sg_offset = *meta_sg_offset;
			else
				sg_offset = 0;
			mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
			mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
		}
		ibmr->length = mr->data_length + mr->meta_length;
	}

	return n;
}

static int
mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
		   struct scatterlist *sgl,
		   unsigned short sg_nents,
		   unsigned int *sg_offset_p,
		   struct scatterlist *meta_sgl,
		   unsigned short meta_sg_nents,
		   unsigned int *meta_sg_offset_p)
{
	struct scatterlist *sg = sgl;
	struct mlx5_klm *klms = mr->descs;
	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
	u32 lkey = mr->ibmr.pd->local_dma_lkey;
	int i, j = 0;

	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
	mr->ibmr.length = 0;

	for_each_sg(sgl, sg, sg_nents, i) {
		if (unlikely(i >= mr->max_descs))
			break;
		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
		klms[i].key = cpu_to_be32(lkey);
		mr->ibmr.length += sg_dma_len(sg) - sg_offset;

		sg_offset = 0;
	}

	if (sg_offset_p)
		*sg_offset_p = sg_offset;

	mr->mmkey.ndescs = i;
	mr->data_length = mr->ibmr.length;

	if (meta_sg_nents) {
		sg = meta_sgl;
		sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
			if (unlikely(i + j >= mr->max_descs))
				break;
			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
						     sg_offset);
			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
							 sg_offset);
			klms[i + j].key = cpu_to_be32(lkey);
			mr->ibmr.length += sg_dma_len(sg) - sg_offset;

			sg_offset = 0;
		}
		if (meta_sg_offset_p)
			*meta_sg_offset_p = sg_offset;

		mr->meta_ndescs = j;
		mr->meta_length = mr->ibmr.length - mr->data_length;
	}

	return i + j;
}

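/*
 * Page-list builders passed to ib_sg_to_pages(): each call appends one page
 * address, with the read/write enable bits set, to the MR's descriptor
 * array. mlx5_set_page() fills the data descriptors; mlx5_set_page_pi()
 * continues after them with the metadata (protection information)
 * descriptors.
 */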
static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
		cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int
mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	pi_mr->ibmr.page_size = ibmr->page_size;
	n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
			   mlx5_set_page);
	if (n != data_sg_nents)
		return n;

	pi_mr->data_iova = pi_mr->ibmr.iova;
	pi_mr->data_length = pi_mr->ibmr.length;
	pi_mr->ibmr.length = pi_mr->data_length;
	ibmr->length = pi_mr->data_length;

	if (meta_sg_nents) {
		u64 page_mask = ~((u64)ibmr->page_size - 1);
		u64 iova = pi_mr->data_iova;

		n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
				    meta_sg_offset, mlx5_set_page_pi);

		pi_mr->meta_length = pi_mr->ibmr.length;
		/*
		 * The PI address the HW uses is the offset of the metadata
		 * relative to the first data page address. It equals the
		 * first data page address + the size of the data pages +
		 * the metadata offset within the first metadata page.
		 */
		pi_mr->pi_iova = (iova & page_mask) +
				 pi_mr->mmkey.ndescs * ibmr->page_size +
				 (pi_mr->ibmr.iova & ~page_mask);
		/*
		 * To use a single MTT MR for both data and metadata, we also
		 * register the gap between the end of the data and the start
		 * of the metadata (the sig MR verifies that the HW accesses
		 * the right addresses). This mapping is safe because we use
		 * an internal mkey for the registration.
		 */
		pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
		pi_mr->ibmr.iova = iova;
		ibmr->length += pi_mr->meta_length;
	}
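
	/*
	 * Worked example with hypothetical numbers: page_size = 4K, the data
	 * starts at iova 0x10000 and spans 8 pages, and the metadata begins
	 * 0x300 bytes into its first page. Then pi_iova = 0x10000 +
	 * 8 * 0x1000 + 0x300 = 0x18300, and the registered range covers
	 * everything from 0x10000 up to 0x18300 + meta_length.
	 */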

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}

static int
mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->klm_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
			       meta_sg, meta_sg_nents, meta_sg_offset);

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	/* This is a zero-based memory region */
	pi_mr->data_iova = 0;
	pi_mr->ibmr.iova = 0;
	pi_mr->pi_iova = pi_mr->data_length;
	ibmr->length = pi_mr->ibmr.length;

	return n;
}

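/*
 * ib_map_mr_sg_pi() entry point for integrity MRs: lay out the data and
 * metadata scatterlists using the cheapest descriptor format that can
 * describe them, trying a direct PA mapping first, then the MTT PI MR and
 * finally the KLM PI MR.
 */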
int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = NULL;
	int n;

	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);

	mr->mmkey.ndescs = 0;
	mr->data_length = 0;
	mr->data_iova = 0;
	mr->meta_ndescs = 0;
	mr->pi_iova = 0;
	/*
	 * As a performance optimization, if possible, there is no need to
	 * perform a UMR operation to register the data/metadata buffers.
	 * First try to map the sg lists to PA descriptors with local_dma_lkey.
	 * Fall back to UMR only in case of failure.
	 */
	n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				    data_sg_offset, meta_sg, meta_sg_nents,
				    meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;
	/*
	 * As a performance optimization, if possible, there is no need to map
	 * the sg lists to KLM descriptors. First try to map the sg lists to
	 * MTT descriptors and fall back to KLM only in case of failure.
	 * It's more efficient for the HW to work with MTT descriptors
	 * (especially under high load).
	 * Use KLM (indirect access) only if it's mandatory.
	 */
	pi_mr = mr->mtt_mr;
	n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;

	pi_mr = mr->klm_mr;
	n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (unlikely(n != data_sg_nents + meta_sg_nents))
		return -ENOMEM;

out:
	/* This is a zero-based memory region */
	ibmr->iova = 0;
	mr->pi_mr = pi_mr;
	if (pi_mr)
		ibmr->sig_attrs->meta_length = pi_mr->meta_length;
	else
		ibmr->sig_attrs->meta_length = mr->meta_length;

	return 0;
}

int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
		      unsigned int *sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	int n;

	mr->mmkey.ndescs = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
				   mr->desc_size * mr->max_descs,
				   DMA_TO_DEVICE);

	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
				       NULL);
	else
		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
				   mlx5_set_page);

	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
				      mr->desc_size * mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}