/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 * Copyright (c) 2020, Intel Corporation. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */


#include <linux/kref.h>
#include <linux/random.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include <rdma/ib_umem_odp.h>
#include "dm.h"
#include "mlx5_ib.h"
#include "umr.h"

enum {
	MAX_PENDING_REG_MR = 8,
};

#define MLX5_UMR_ALIGN 2048

static void
create_mkey_callback(int status, struct mlx5_async_work *context);
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
				     u64 iova, int access_flags,
				     unsigned int page_size, bool populate);

static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
					  struct ib_pd *pd)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);

	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
	MLX5_SET(mkc, mkc, lr, 1);

	if ((acc & IB_ACCESS_RELAXED_ORDERING) &&
	    pcie_relaxed_ordering_enabled(dev->mdev->pdev)) {
		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
			MLX5_SET(mkc, mkc, relaxed_ordering_write, 1);
		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
			MLX5_SET(mkc, mkc, relaxed_ordering_read, 1);
	}

	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET64(mkc, mkc, start_addr, start_addr);
}

static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in)
{
	u8 key = atomic_inc_return(&dev->mkey_var);
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, mkey_7_0, key);
	*mkey = key;
}

static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
			       struct mlx5_ib_mkey *mkey, u32 *in, int inlen)
{
	int ret;

	assign_mkey_variant(dev, &mkey->key, in);
	ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
	if (!ret)
		init_waitqueue_head(&mkey->wait);

	return ret;
}

static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create)
{
	struct mlx5_ib_dev *dev = async_create->ent->dev;
	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out);

	MLX5_SET(create_mkey_in, async_create->in, opcode,
		 MLX5_CMD_OP_CREATE_MKEY);
	assign_mkey_variant(dev, &async_create->mkey, async_create->in);
	return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen,
				async_create->out, outlen, create_mkey_callback,
				&async_create->cb_work);
}

static int mkey_cache_max_order(struct mlx5_ib_dev *dev);
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);

static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));

	return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
}

static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
{
	if (status == -ENXIO) /* core driver is not available */
		return;

	mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
	if (status != -EREMOTEIO) /* driver specific failure */
		return;

	/* Failed in FW, print cmd out failure details */
	mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
}

static int push_mkey(struct mlx5_cache_ent *ent, bool limit_pendings,
		     void *to_store)
{
	XA_STATE(xas, &ent->mkeys, 0);
	void *curr;

	xa_lock_irq(&ent->mkeys);
	if (limit_pendings &&
	    (ent->reserved - ent->stored) > MAX_PENDING_REG_MR) {
		xa_unlock_irq(&ent->mkeys);
		return -EAGAIN;
	}
	while (1) {
		/*
		 * This is cmpxchg (NULL, XA_ZERO_ENTRY) however this version
		 * doesn't transparently unlock. Instead we set the xas index to
		 * the current value of reserved every iteration.
		 */
		xas_set(&xas, ent->reserved);
		curr = xas_load(&xas);
		if (!curr) {
			if (to_store && ent->stored == ent->reserved)
				xas_store(&xas, to_store);
			else
				xas_store(&xas, XA_ZERO_ENTRY);
			if (xas_valid(&xas)) {
				ent->reserved++;
				if (to_store) {
					if (ent->stored != ent->reserved)
						__xa_store(&ent->mkeys,
							   ent->stored,
							   to_store,
							   GFP_KERNEL);
					ent->stored++;
					queue_adjust_cache_locked(ent);
					WRITE_ONCE(ent->dev->cache.last_add,
						   jiffies);
				}
			}
		}
		xa_unlock_irq(&ent->mkeys);

		/*
		 * Notice xas_nomem() must always be called as it cleans
		 * up any cached allocation.
		 */
		if (!xas_nomem(&xas, GFP_KERNEL))
			break;
		xa_lock_irq(&ent->mkeys);
	}
	if (xas_error(&xas))
		return xas_error(&xas);
	if (WARN_ON(curr))
		return -EINVAL;
	return 0;
}

static void undo_push_reserve_mkey(struct mlx5_cache_ent *ent)
{
	void *old;

	ent->reserved--;
	old = __xa_erase(&ent->mkeys, ent->reserved);
	WARN_ON(old);
}

static void push_to_reserved(struct mlx5_cache_ent *ent, u32 mkey)
{
	void *old;

	old = __xa_store(&ent->mkeys, ent->stored, xa_mk_value(mkey), 0);
	WARN_ON(old);
	ent->stored++;
}

static u32 pop_stored_mkey(struct mlx5_cache_ent *ent)
{
	void *old, *xa_mkey;

	ent->stored--;
	ent->reserved--;

	if (ent->stored == ent->reserved) {
		xa_mkey = __xa_erase(&ent->mkeys, ent->stored);
		WARN_ON(!xa_mkey);
		return (u32)xa_to_value(xa_mkey);
	}

	xa_mkey = __xa_store(&ent->mkeys, ent->stored, XA_ZERO_ENTRY,
			     GFP_KERNEL);
	WARN_ON(!xa_mkey || xa_is_err(xa_mkey));
	old = __xa_erase(&ent->mkeys, ent->reserved);
	WARN_ON(old);
	return (u32)xa_to_value(xa_mkey);
}

static void create_mkey_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5r_async_create_mkey *mkey_out =
		container_of(context, struct mlx5r_async_create_mkey, cb_work);
	struct mlx5_cache_ent *ent = mkey_out->ent;
	struct mlx5_ib_dev *dev = ent->dev;
	unsigned long flags;

	if (status) {
		create_mkey_warn(dev, status, mkey_out->out);
		kfree(mkey_out);
		xa_lock_irqsave(&ent->mkeys, flags);
		undo_push_reserve_mkey(ent);
		WRITE_ONCE(dev->fill_delay, 1);
		xa_unlock_irqrestore(&ent->mkeys, flags);
		mod_timer(&dev->delay_timer, jiffies + HZ);
		return;
	}

	mkey_out->mkey |= mlx5_idx_to_mkey(
		MLX5_GET(create_mkey_out, mkey_out->out, mkey_index));
	WRITE_ONCE(dev->cache.last_add, jiffies);

	xa_lock_irqsave(&ent->mkeys, flags);
	push_to_reserved(ent, mkey_out->mkey);
	/* If we are doing fill_to_high_water then keep going. */
	queue_adjust_cache_locked(ent);
	xa_unlock_irqrestore(&ent->mkeys, flags);
	kfree(mkey_out);
}

static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
{
	int ret = 0;

	switch (access_mode) {
	case MLX5_MKC_ACCESS_MODE_MTT:
		ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
						   sizeof(struct mlx5_mtt));
		break;
	case MLX5_MKC_ACCESS_MODE_KSM:
		ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
						   sizeof(struct mlx5_klm));
		break;
	default:
		WARN_ON(1);
	}
	return ret;
}

static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
{
	set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7);

	MLX5_SET(mkc, mkc, translations_octword_size,
		 get_mkc_octo_size(ent->access_mode, ent->ndescs));
	MLX5_SET(mkc, mkc, log_page_size, ent->page);
}

/* Asynchronously schedule new MRs to be populated in the cache. */
static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
{
	struct mlx5r_async_create_mkey *async_create;
	void *mkc;
	int err = 0;
	int i;

	for (i = 0; i < num; i++) {
		async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey),
				       GFP_KERNEL);
		if (!async_create)
			return -ENOMEM;
		mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in,
				   memory_key_mkey_entry);
		set_cache_mkc(ent, mkc);
		async_create->ent = ent;

		err = push_mkey(ent, true, NULL);
		if (err)
			goto free_async_create;

		err = mlx5_ib_create_mkey_cb(async_create);
		if (err) {
			mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
			goto err_undo_reserve;
		}
	}

	return 0;

err_undo_reserve:
	xa_lock_irq(&ent->mkeys);
	undo_push_reserve_mkey(ent);
	xa_unlock_irq(&ent->mkeys);
free_async_create:
	kfree(async_create);
	return err;
}

/* Synchronously create a MR in the cache */
static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey)
{
	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	void *mkc;
	u32 *in;
	int err;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	set_cache_mkc(ent, mkc);

	err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen);
	if (err)
		goto free_in;

	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
free_in:
	kfree(in);
	return err;
}

static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
{
	u32 mkey;

	lockdep_assert_held(&ent->mkeys.xa_lock);
	if (!ent->stored)
		return;
	mkey = pop_stored_mkey(ent);
	xa_unlock_irq(&ent->mkeys);
	mlx5_core_destroy_mkey(ent->dev->mdev, mkey);
	xa_lock_irq(&ent->mkeys);
}

static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
				bool limit_fill)
	__acquires(&ent->mkeys) __releases(&ent->mkeys)
{
	int err;

	lockdep_assert_held(&ent->mkeys.xa_lock);

	while (true) {
		if (limit_fill)
			target = ent->limit * 2;
		if (target == ent->reserved)
			return 0;
		if (target > ent->reserved) {
			u32 todo = target - ent->reserved;

			xa_unlock_irq(&ent->mkeys);
			err = add_keys(ent, todo);
			if (err == -EAGAIN)
				usleep_range(3000, 5000);
			xa_lock_irq(&ent->mkeys);
			if (err) {
				if (err != -EAGAIN)
					return err;
			} else
				return 0;
		} else {
			remove_cache_mr_locked(ent);
		}
	}
}

static ssize_t size_write(struct file *filp, const char __user *buf,
			  size_t count, loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	u32 target;
	int err;

	err = kstrtou32_from_user(buf, count, 0, &target);
	if (err)
		return err;

	/*
	 * Target is the new value of total_mrs the user requests, however we
	 * cannot free MRs that are in use. Compute the target value for stored
	 * mkeys.
	 */
	xa_lock_irq(&ent->mkeys);
	if (target < ent->in_use) {
		err = -EINVAL;
		goto err_unlock;
	}
	target = target - ent->in_use;
	if (target < ent->limit || target > ent->limit * 2) {
		err = -EINVAL;
		goto err_unlock;
	}
	err = resize_available_mrs(ent, target, false);
	if (err)
		goto err_unlock;
	xa_unlock_irq(&ent->mkeys);

	return count;

err_unlock:
	xa_unlock_irq(&ent->mkeys);
	return err;
}

static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
			 loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	char lbuf[20];
	int err;

	err = snprintf(lbuf, sizeof(lbuf), "%ld\n", ent->stored + ent->in_use);
	if (err < 0)
		return err;

	return simple_read_from_buffer(buf, count, pos, lbuf, err);
}

static const struct file_operations size_fops = {
	.owner	= THIS_MODULE,
	.open	= simple_open,
	.write	= size_write,
	.read	= size_read,
};

static ssize_t limit_write(struct file *filp, const char __user *buf,
			   size_t count, loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	u32 var;
	int err;

	err = kstrtou32_from_user(buf, count, 0, &var);
	if (err)
		return err;

	/*
	 * Upon set we immediately fill the cache to high water mark implied by
	 * the limit.
	 */
	xa_lock_irq(&ent->mkeys);
	ent->limit = var;
	err = resize_available_mrs(ent, 0, true);
	xa_unlock_irq(&ent->mkeys);
	if (err)
		return err;
	return count;
}

static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
			  loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	char lbuf[20];
	int err;

	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
	if (err < 0)
		return err;

	return simple_read_from_buffer(buf, count, pos, lbuf, err);
}

static const struct file_operations limit_fops = {
	.owner	= THIS_MODULE,
	.open	= simple_open,
	.write	= limit_write,
	.read	= limit_read,
};

static bool someone_adding(struct mlx5_mkey_cache *cache)
{
	unsigned int i;

	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
		struct mlx5_cache_ent *ent = &cache->ent[i];
		bool ret;

		xa_lock_irq(&ent->mkeys);
		ret = ent->stored < ent->limit;
		xa_unlock_irq(&ent->mkeys);
		if (ret)
			return true;
	}
	return false;
}

/*
 * Check if the bucket is outside the high/low water mark and schedule an async
 * update. The cache refill has hysteresis, once the low water mark is hit it is
 * refilled up to the high mark.
 */
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
{
	lockdep_assert_held(&ent->mkeys.xa_lock);

	if (ent->disabled || READ_ONCE(ent->dev->fill_delay))
		return;
	if (ent->stored < ent->limit) {
		ent->fill_to_high_water = true;
		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	} else if (ent->fill_to_high_water &&
		   ent->reserved < 2 * ent->limit) {
		/*
		 * Once we start populating due to hitting a low water mark
		 * continue until we pass the high water mark.
		 */
		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	} else if (ent->stored == 2 * ent->limit) {
		ent->fill_to_high_water = false;
	} else if (ent->stored > 2 * ent->limit) {
		/* Queue deletion of excess entries */
		ent->fill_to_high_water = false;
		if (ent->stored != ent->reserved)
			queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
					   msecs_to_jiffies(1000));
		else
			mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	}
}

static void __cache_work_func(struct mlx5_cache_ent *ent)
{
	struct mlx5_ib_dev *dev = ent->dev;
	struct mlx5_mkey_cache *cache = &dev->cache;
	int err;

	xa_lock_irq(&ent->mkeys);
	if (ent->disabled)
		goto out;

	if (ent->fill_to_high_water && ent->reserved < 2 * ent->limit &&
	    !READ_ONCE(dev->fill_delay)) {
		xa_unlock_irq(&ent->mkeys);
		err = add_keys(ent, 1);
		xa_lock_irq(&ent->mkeys);
		if (ent->disabled)
			goto out;
		if (err) {
			/*
			 * EAGAIN only happens if there are pending MRs, so we
			 * will be rescheduled when storing them. The only
			 * failure path here is ENOMEM.
			 */
			if (err != -EAGAIN) {
				mlx5_ib_warn(
					dev,
					"command failed order %d, err %d\n",
					ent->order, err);
				queue_delayed_work(cache->wq, &ent->dwork,
						   msecs_to_jiffies(1000));
			}
		}
	} else if (ent->stored > 2 * ent->limit) {
		bool need_delay;

		/*
		 * The remove_cache_mr() logic is performed as a garbage
		 * collection task. Such a task is intended to run when no
		 * other active processes are running.
		 *
		 * The need_resched() will return TRUE if there are user tasks
		 * to be activated in the near future.
		 *
		 * In such a case, we don't execute remove_cache_mr() and
		 * postpone the garbage collection work to try again in the
		 * next cycle, in order to free CPU resources to other tasks.
		 */
		xa_unlock_irq(&ent->mkeys);
		need_delay = need_resched() || someone_adding(cache) ||
			     !time_after(jiffies,
					 READ_ONCE(cache->last_add) + 300 * HZ);
		xa_lock_irq(&ent->mkeys);
		if (ent->disabled)
			goto out;
		if (need_delay) {
			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
			goto out;
		}
		remove_cache_mr_locked(ent);
		queue_adjust_cache_locked(ent);
	}
out:
	xa_unlock_irq(&ent->mkeys);
}

static void delayed_cache_work_func(struct work_struct *work)
{
	struct mlx5_cache_ent *ent;

	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
	__cache_work_func(ent);
}

struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
				       struct mlx5_cache_ent *ent,
				       int access_flags)
{
	struct mlx5_ib_mr *mr;
	int err;

	if (!mlx5r_umr_can_reconfig(dev, 0, access_flags))
		return ERR_PTR(-EOPNOTSUPP);

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	xa_lock_irq(&ent->mkeys);
	ent->in_use++;

	if (!ent->stored) {
		queue_adjust_cache_locked(ent);
		ent->miss++;
		xa_unlock_irq(&ent->mkeys);
		err = create_cache_mkey(ent, &mr->mmkey.key);
		if (err) {
			xa_lock_irq(&ent->mkeys);
			ent->in_use--;
			xa_unlock_irq(&ent->mkeys);
			kfree(mr);
			return ERR_PTR(err);
		}
	} else {
		mr->mmkey.key = pop_stored_mkey(ent);
		queue_adjust_cache_locked(ent);
		xa_unlock_irq(&ent->mkeys);
	}
	mr->mmkey.cache_ent = ent;
	mr->mmkey.type = MLX5_MKEY_MR;
	init_waitqueue_head(&mr->mmkey.wait);
	return mr;
}

static void clean_keys(struct mlx5_ib_dev *dev, int c)
{
	struct mlx5_mkey_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent = &cache->ent[c];
	u32 mkey;

	cancel_delayed_work(&ent->dwork);
	xa_lock_irq(&ent->mkeys);
	while (ent->stored) {
		mkey = pop_stored_mkey(ent);
		xa_unlock_irq(&ent->mkeys);
		mlx5_core_destroy_mkey(dev->mdev, mkey);
		xa_lock_irq(&ent->mkeys);
	}
	xa_unlock_irq(&ent->mkeys);
}

static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
{
	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	debugfs_remove_recursive(dev->cache.root);
	dev->cache.root = NULL;
}

static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
{
	struct mlx5_mkey_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;
	struct dentry *dir;
	int i;

	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	cache->root = debugfs_create_dir("mr_cache", mlx5_debugfs_get_dev_root(dev->mdev));

	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
		ent = &cache->ent[i];
		sprintf(ent->name, "%d", ent->order);
		dir = debugfs_create_dir(ent->name, cache->root);
		debugfs_create_file("size", 0600, dir, ent, &size_fops);
		debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
		debugfs_create_ulong("cur", 0400, dir, &ent->stored);
		debugfs_create_u32("miss", 0600, dir, &ent->miss);
	}
}

static void delay_time_func(struct timer_list *t)
{
	struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);

	WRITE_ONCE(dev->fill_delay, 0);
}

int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
{
	struct mlx5_mkey_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;
	int i;

	mutex_init(&dev->slow_path_mutex);
	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
	if (!cache->wq) {
		mlx5_ib_warn(dev, "failed to create work queue\n");
		return -ENOMEM;
	}

	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
	timer_setup(&dev->delay_timer, delay_time_func, 0);
	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
		ent = &cache->ent[i];
		xa_init_flags(&ent->mkeys, XA_FLAGS_LOCK_IRQ);
		ent->order = i + 2;
		ent->dev = dev;
		ent->limit = 0;

		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);

		if (i > MKEY_CACHE_LAST_STD_ENTRY) {
			mlx5_odp_init_mkey_cache_entry(ent);
			continue;
		}

		if (ent->order > mkey_cache_max_order(dev))
			continue;

		ent->page = PAGE_SHIFT;
		ent->ndescs = 1 << ent->order;
		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
		if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
		    !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
		    mlx5r_umr_can_load_pas(dev, 0))
			ent->limit = dev->mdev->profile.mr_cache[i].limit;
		else
			ent->limit = 0;
		xa_lock_irq(&ent->mkeys);
		queue_adjust_cache_locked(ent);
		xa_unlock_irq(&ent->mkeys);
	}

	mlx5_mkey_cache_debugfs_init(dev);

	return 0;
}

int mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
{
	unsigned int i;

	if (!dev->cache.wq)
		return 0;

	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++) {
		struct mlx5_cache_ent *ent = &dev->cache.ent[i];

		xa_lock_irq(&ent->mkeys);
		ent->disabled = true;
		xa_unlock_irq(&ent->mkeys);
		cancel_delayed_work_sync(&ent->dwork);
	}

	mlx5_mkey_cache_debugfs_cleanup(dev);
	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);

	for (i = 0; i < MAX_MKEY_CACHE_ENTRIES; i++)
		clean_keys(dev, i);

	destroy_workqueue(dev->cache.wq);
	del_timer_sync(&dev->delay_timer);

	return 0;
}

struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
	MLX5_SET(mkc, mkc, length64, 1);
	set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
				      pd);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_in;

	kfree(in);
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->umem = NULL;

	return &mr->ibmr;

err_in:
	kfree(in);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

static int get_octo_len(u64 addr, u64 len, int page_shift)
{
	u64 page_size = 1ULL << page_shift;
	u64 offset;
	int npages;

	offset = addr & (page_size - 1);
	npages = ALIGN(len + offset, page_size) >> page_shift;
	return (npages + 1) / 2;
}

static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
{
	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		return MKEY_CACHE_LAST_STD_ENTRY + 2;
	return MLX5_MAX_UMR_SHIFT;
}

static struct mlx5_cache_ent *mkey_cache_ent_from_order(struct mlx5_ib_dev *dev,
							 unsigned int order)
{
	struct mlx5_mkey_cache *cache = &dev->cache;

	if (order < cache->ent[0].order)
		return &cache->ent[0];
	order = order - cache->ent[0].order;
	if (order > MKEY_CACHE_LAST_STD_ENTRY)
		return NULL;
	return &cache->ent[order];
}

static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
			  u64 length, int access_flags, u64 iova)
{
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->ibmr.length = length;
	mr->ibmr.device = &dev->ib_dev;
	mr->ibmr.iova = iova;
	mr->access_flags = access_flags;
}

static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
						  u64 iova)
{
	/*
	 * The alignment of iova has already been checked upon entering
	 * UVERBS_METHOD_REG_DMABUF_MR
	 */
	umem->iova = iova;
	return PAGE_SIZE;
}

static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
					     struct ib_umem *umem, u64 iova,
					     int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_cache_ent *ent;
	struct mlx5_ib_mr *mr;
	unsigned int page_size;

	if (umem->is_dmabuf)
		page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
	else
		page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size,
						     0, iova);
	if (WARN_ON(!page_size))
		return ERR_PTR(-EINVAL);
	ent = mkey_cache_ent_from_order(
		dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
	/*
	 * Matches access in alloc_cache_mr(). If the MR can't come from the
	 * cache then synchronously create an uncached one.
	 */
	if (!ent || ent->limit == 0 ||
	    !mlx5r_umr_can_reconfig(dev, 0, access_flags) ||
	    mlx5_umem_needs_ats(dev, umem, access_flags)) {
		mutex_lock(&dev->slow_path_mutex);
		mr = reg_create(pd, umem, iova, access_flags, page_size, false);
		mutex_unlock(&dev->slow_path_mutex);
		return mr;
	}

	mr = mlx5_mr_cache_alloc(dev, ent, access_flags);
	if (IS_ERR(mr))
		return mr;

	mr->ibmr.pd = pd;
	mr->umem = umem;
	mr->page_shift = order_base_2(page_size);
	set_mr_fields(dev, mr, umem->length, access_flags, iova);

	return mr;
}

/*
 * If ibmr is NULL it will be allocated by reg_create.
 * Else, the given ibmr will be used.
 */
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
				     u64 iova, int access_flags,
				     unsigned int page_size, bool populate)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr;
	__be64 *pas;
	void *mkc;
	int inlen;
	u32 *in;
	int err;
	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));

	if (!page_size)
		return ERR_PTR(-EINVAL);
	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->ibmr.pd = pd;
	mr->access_flags = access_flags;
	mr->page_shift = order_base_2(page_size);

	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	if (populate)
		inlen += sizeof(*pas) *
			 roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_1;
	}
	pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
	if (populate) {
		if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) {
			err = -EINVAL;
			goto err_2;
		}
		mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
				     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
	}

	/* The pg_access bit allows setting the access flags
	 * in the page list submitted with the command. */
	MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
				      populate ? pd : dev->umrc.pd);
	MLX5_SET(mkc, mkc, free, !populate);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
	MLX5_SET(mkc, mkc, umr_en, 1);

	MLX5_SET64(mkc, mkc, len, umem->length);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	MLX5_SET(mkc, mkc, translations_octword_size,
		 get_octo_len(iova, umem->length, mr->page_shift));
	MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
	if (mlx5_umem_needs_ats(dev, umem, access_flags))
		MLX5_SET(mkc, mkc, ma_translation_mode, 1);
	if (populate) {
		MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
			 get_octo_len(iova, umem->length, mr->page_shift));
	}

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err) {
		mlx5_ib_warn(dev, "create mkey failed\n");
		goto err_2;
	}
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->umem = umem;
	set_mr_fields(dev, mr, umem->length, access_flags, iova);
	kvfree(in);

	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);

	return mr;

err_2:
	kvfree(in);
err_1:
	kfree(mr);
	return ERR_PTR(err);
}

static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
				       u64 length, int acc, int mode)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
	MLX5_SET64(mkc, mkc, len, length);
	set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_in;

	kfree(in);

	set_mr_fields(dev, mr, length, acc, start_addr);

	return &mr->ibmr;

err_in:
	kfree(in);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

int mlx5_ib_advise_mr(struct ib_pd *pd,
		      enum ib_uverbs_advise_mr_advice advice,
		      u32 flags,
		      struct ib_sge *sg_list,
		      u32 num_sge,
		      struct uverbs_attr_bundle *attrs)
{
	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
		return -EOPNOTSUPP;

	return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
					  sg_list, num_sge);
}

struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
				struct ib_dm_mr_attr *attr,
				struct uverbs_attr_bundle *attrs)
{
	struct mlx5_ib_dm *mdm = to_mdm(dm);
	struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
	u64 start_addr = mdm->dev_addr + attr->offset;
	int mode;

	switch (mdm->type) {
	case MLX5_IB_UAPI_DM_TYPE_MEMIC:
		if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
			return ERR_PTR(-EINVAL);

		mode = MLX5_MKC_ACCESS_MODE_MEMIC;
		start_addr -= pci_resource_start(dev->pdev, 0);
		break;
	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
		if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
			return ERR_PTR(-EINVAL);

		mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}

	return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
				 attr->access_flags, mode);
}

static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
				    u64 iova, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr = NULL;
	bool xlt_with_umr;
	int err;

	xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
	if (xlt_with_umr) {
		mr = alloc_cacheable_mr(pd, umem, iova, access_flags);
	} else {
		unsigned int page_size = mlx5_umem_find_best_pgsz(
			umem, mkc, log_page_size, 0, iova);

		mutex_lock(&dev->slow_path_mutex);
		mr = reg_create(pd, umem, iova, access_flags, page_size, true);
		mutex_unlock(&dev->slow_path_mutex);
	}
	if (IS_ERR(mr)) {
		ib_umem_release(umem);
		return ERR_CAST(mr);
	}

	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);

	atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);

	if (xlt_with_umr) {
		/*
		 * If the MR was created with reg_create then it will be
		 * configured properly but left disabled. It is safe to go ahead
		 * and configure it again via UMR while enabling it.
		 */
		err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
		if (err) {
			mlx5_ib_dereg_mr(&mr->ibmr, NULL);
			return ERR_PTR(err);
		}
	}
	return &mr->ibmr;
}

static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
					u64 iova, int access_flags,
					struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem_odp *odp;
	struct mlx5_ib_mr *mr;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
		return ERR_PTR(-EOPNOTSUPP);

	err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
	if (err)
		return ERR_PTR(err);
	if (!start && length == U64_MAX) {
		if (iova != 0)
			return ERR_PTR(-EINVAL);
		if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
			return ERR_PTR(-EINVAL);

		mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
		if (IS_ERR(mr))
			return ERR_CAST(mr);
		return &mr->ibmr;
	}

	/* ODP requires xlt update via umr to work. */
	if (!mlx5r_umr_can_load_pas(dev, length))
		return ERR_PTR(-EINVAL);

	odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
			      &mlx5_mn_ops);
	if (IS_ERR(odp))
		return ERR_CAST(odp);

	mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags);
	if (IS_ERR(mr)) {
		ib_umem_release(&odp->umem);
		return ERR_CAST(mr);
	}
	xa_init(&mr->implicit_children);

	odp->private = mr;
	err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
	if (err)
		goto err_dereg_mr;

	err = mlx5_ib_init_odp_mr(mr);
	if (err)
		goto err_dereg_mr;
	return &mr->ibmr;

err_dereg_mr:
	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
	return ERR_PTR(err);
}

struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
				  u64 iova, int access_flags,
				  struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem *umem;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
		return ERR_PTR(-EOPNOTSUPP);

	mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
		    start, iova, length, access_flags);

	if (access_flags & IB_ACCESS_ON_DEMAND)
		return create_user_odp_mr(pd, start, length, iova, access_flags,
					  udata);
	umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
	if (IS_ERR(umem))
		return ERR_CAST(umem);
	return create_real_mr(pd, umem, iova, access_flags);
}

static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
{
	struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
	struct mlx5_ib_mr *mr = umem_dmabuf->private;

	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);

	if (!umem_dmabuf->sgt)
		return;

	mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
}

static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
	.allow_peer2peer = 1,
	.move_notify = mlx5_ib_dmabuf_invalidate_cb,
};

struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
					 u64 length, u64 virt_addr,
					 int fd, int access_flags,
					 struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr = NULL;
	struct ib_umem_dmabuf *umem_dmabuf;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
	    !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
		return ERR_PTR(-EOPNOTSUPP);

	mlx5_ib_dbg(dev,
		    "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n",
		    offset, virt_addr, length, fd, access_flags);

	/* dmabuf requires xlt update via umr to work. */
	if (!mlx5r_umr_can_load_pas(dev, length))
		return ERR_PTR(-EINVAL);

	umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd,
					 access_flags,
					 &mlx5_ib_dmabuf_attach_ops);
	if (IS_ERR(umem_dmabuf)) {
		mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
			    PTR_ERR(umem_dmabuf));
		return ERR_CAST(umem_dmabuf);
	}

	mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
				access_flags);
	if (IS_ERR(mr)) {
		ib_umem_release(&umem_dmabuf->umem);
		return ERR_CAST(mr);
	}

	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);

	atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
	umem_dmabuf->private = mr;
	err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
	if (err)
		goto err_dereg_mr;

	err = mlx5_ib_init_dmabuf_mr(mr);
	if (err)
		goto err_dereg_mr;
	return &mr->ibmr;

err_dereg_mr:
	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
	return ERR_PTR(err);
}

/*
 * True if the change in access flags can be done via UMR, only some access
 * flags can be updated.
 */
static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
				     unsigned int current_access_flags,
				     unsigned int target_access_flags)
{
	unsigned int diffs = current_access_flags ^ target_access_flags;

	if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
		      IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING))
		return false;
	return mlx5r_umr_can_reconfig(dev, current_access_flags,
				      target_access_flags);
}

static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
				  struct ib_umem *new_umem,
				  int new_access_flags, u64 iova,
				  unsigned long *page_size)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);

	/* We only track the allocated sizes of MRs from the cache */
	if (!mr->mmkey.cache_ent)
		return false;
	if (!mlx5r_umr_can_load_pas(dev, new_umem->length))
		return false;

	*page_size =
		mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova);
	if (WARN_ON(!*page_size))
		return false;
	return (1ULL << mr->mmkey.cache_ent->order) >=
	       ib_umem_num_dma_blocks(new_umem, *page_size);
}

static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
			 int access_flags, int flags, struct ib_umem *new_umem,
			 u64 iova, unsigned long page_size)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
	struct ib_umem *old_umem = mr->umem;
	int err;

	/*
	 * To keep everything simple the MR is revoked before we start to mess
	 * with it. This ensures the change is atomic relative to any use of
	 * the MR.
	 */
	err = mlx5r_umr_revoke_mr(mr);
	if (err)
		return err;

	if (flags & IB_MR_REREG_PD) {
		mr->ibmr.pd = pd;
		upd_flags |= MLX5_IB_UPD_XLT_PD;
	}
	if (flags & IB_MR_REREG_ACCESS) {
		mr->access_flags = access_flags;
		upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
	}

	mr->ibmr.iova = iova;
	mr->ibmr.length = new_umem->length;
	mr->page_shift = order_base_2(page_size);
	mr->umem = new_umem;
	err = mlx5r_umr_update_mr_pas(mr, upd_flags);
	if (err) {
		/*
		 * The MR is revoked at this point so there is no issue to free
		 * new_umem.
		 */
		mr->umem = old_umem;
		return err;
	}

	atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
	ib_umem_release(old_umem);
	atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
	return 0;
}

struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
				    u64 length, u64 iova, int new_access_flags,
				    struct ib_pd *new_pd,
				    struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
	struct mlx5_ib_mr *mr = to_mmr(ib_mr);
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
		return ERR_PTR(-EOPNOTSUPP);

	mlx5_ib_dbg(
		dev,
		"start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
		start, iova, length, new_access_flags);

	if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
		return ERR_PTR(-EOPNOTSUPP);

	if (!(flags & IB_MR_REREG_ACCESS))
		new_access_flags = mr->access_flags;
	if (!(flags & IB_MR_REREG_PD))
		new_pd = ib_mr->pd;

	if (!(flags & IB_MR_REREG_TRANS)) {
		struct ib_umem *umem;

		/* Fast path for PD/access change */
		if (can_use_umr_rereg_access(dev, mr->access_flags,
					     new_access_flags)) {
			err = mlx5r_umr_rereg_pd_access(mr, new_pd,
							new_access_flags);
			if (err)
				return ERR_PTR(err);
			return NULL;
		}
		/* DM or ODP MR's don't have a normal umem so we can't re-use it */
		if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
			goto recreate;

		/*
		 * Only one active MR can refer to a umem at one time, revoke
		 * the old MR before assigning the umem to the new one.
		 */
		err = mlx5r_umr_revoke_mr(mr);
		if (err)
			return ERR_PTR(err);
		umem = mr->umem;
		mr->umem = NULL;
		atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);

		return create_real_mr(new_pd, umem, mr->ibmr.iova,
				      new_access_flags);
	}

	/*
	 * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does
	 * but the logic around releasing the umem is different
	 */
	if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
		goto recreate;

	if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
	    can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
		struct ib_umem *new_umem;
		unsigned long page_size;

		new_umem = ib_umem_get(&dev->ib_dev, start, length,
				       new_access_flags);
		if (IS_ERR(new_umem))
			return ERR_CAST(new_umem);

		/* Fast path for PAS change */
		if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
					  &page_size)) {
			err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
					    new_umem, iova, page_size);
			if (err) {
				ib_umem_release(new_umem);
				return ERR_PTR(err);
			}
			return NULL;
		}
		return create_real_mr(new_pd, new_umem, iova, new_access_flags);
	}

	/*
	 * Everything else has no state we can preserve, just create a new MR
	 * from scratch
	 */
recreate:
	return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
				   new_access_flags, udata);
}

static int
mlx5_alloc_priv_descs(struct ib_device *device,
		      struct mlx5_ib_mr *mr,
		      int ndescs,
		      int desc_size)
{
	struct mlx5_ib_dev *dev = to_mdev(device);
	struct device *ddev = &dev->mdev->pdev->dev;
	int size = ndescs * desc_size;
	int add_size;
	int ret;

	add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);

	mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
	if (!mr->descs_alloc)
		return -ENOMEM;

	mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);

	mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
	if (dma_mapping_error(ddev, mr->desc_map)) {
		ret = -ENOMEM;
		goto err;
	}

	return 0;
err:
	kfree(mr->descs_alloc);

	return ret;
}

static void
mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
{
	if (!mr->umem && mr->descs) {
		struct ib_device *device = mr->ibmr.device;
		int size = mr->max_descs * mr->desc_size;
		struct mlx5_ib_dev *dev = to_mdev(device);

		dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
				 DMA_TO_DEVICE);
		kfree(mr->descs_alloc);
		mr->descs = NULL;
	}
}

int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
	int rc;

	/*
	 * Any async use of the mr must hold the refcount, once the refcount
	 * goes to zero no other thread, such as ODP page faults, prefetch, any
	 * UMR activity, etc can touch the mkey. Thus it is safe to destroy it.
	 */
	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    refcount_read(&mr->mmkey.usecount) != 0 &&
	    xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
		mlx5r_deref_wait_odp_mkey(&mr->mmkey);

	if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
		xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			   mr->sig, NULL, GFP_KERNEL);

		if (mr->mtt_mr) {
			rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
			if (rc)
				return rc;
			mr->mtt_mr = NULL;
		}
		if (mr->klm_mr) {
			rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
			if (rc)
				return rc;
			mr->klm_mr = NULL;
		}

		if (mlx5_core_destroy_psv(dev->mdev,
					  mr->sig->psv_memory.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
				     mr->sig->psv_memory.psv_idx);
		if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
				     mr->sig->psv_wire.psv_idx);
		kfree(mr->sig);
		mr->sig = NULL;
	}

	/* Stop DMA */
	if (mr->mmkey.cache_ent) {
		xa_lock_irq(&mr->mmkey.cache_ent->mkeys);
		mr->mmkey.cache_ent->in_use--;
		xa_unlock_irq(&mr->mmkey.cache_ent->mkeys);

		if (mlx5r_umr_revoke_mr(mr) ||
		    push_mkey(mr->mmkey.cache_ent, false,
			      xa_mk_value(mr->mmkey.key)))
			mr->mmkey.cache_ent = NULL;
	}
	if (!mr->mmkey.cache_ent) {
		rc = destroy_mkey(to_mdev(mr->ibmr.device), mr);
		if (rc)
			return rc;
	}

	if (mr->umem) {
		bool is_odp = is_odp_mr(mr);

		if (!is_odp)
			atomic_sub(ib_umem_num_pages(mr->umem),
				   &dev->mdev->priv.reg_pages);
		ib_umem_release(mr->umem);
		if (is_odp)
			mlx5_ib_free_odp_mr(mr);
	}

	if (!mr->mmkey.cache_ent)
		mlx5_free_priv_descs(mr);

	kfree(mr);
	return 0;
}

static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
				   int access_mode, int page_shift)
{
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	/* This is only used from the kernel, so setting the PD is OK. */
	set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, log_page_size, page_shift);
}

static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				  int ndescs, int desc_size, int page_shift,
				  int access_mode, u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int err;

	mr->access_mode = access_mode;
	mr->desc_size = desc_size;
	mr->max_descs = ndescs;

	err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
	if (err)
		return err;

	mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_free_descs;

	mr->mmkey.type = MLX5_MKEY_MR;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;

	return 0;

err_free_descs:
	mlx5_free_priv_descs(mr);
	return err;
}

static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
				u32 max_num_sg, u32 max_num_meta_sg,
				int desc_size, int access_mode)
{
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
	int page_shift = 0;
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->ibmr.pd = pd;
	mr->ibmr.device = pd->device;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
		page_shift = PAGE_SHIFT;

	err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
				     access_mode, in, inlen);
	if (err)
		goto err_free_in;

	mr->umem = NULL;
	kfree(in);

	return mr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				    int ndescs, u32 *in, int inlen)
{
	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
				      PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
				      inlen);
}

static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				    int ndescs, u32 *in, int inlen)
{
	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
				      0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
}

static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				      int max_num_sg, int max_num_meta_sg,
				      u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	u32 psv_index[2];
	void *mkc;
	int err;

	mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
	if (!mr->sig)
		return -ENOMEM;

	/* create mem & wire PSVs */
	err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
	if (err)
		goto err_free_sig;

	mr->sig->psv_memory.psv_idx = psv_index[0];
	mr->sig->psv_wire.psv_idx = psv_index[1];

	mr->sig->sig_status_checked = true;
	mr->sig->sig_err_exists = false;
	/* Next UMR, Arm SIGERR */
	++mr->sig->sigerr_count;
	mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_klm),
					 MLX5_MKC_ACCESS_MODE_KLMS);
	if (IS_ERR(mr->klm_mr)) {
		err = PTR_ERR(mr->klm_mr);
		goto err_destroy_psv;
	}
	mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_mtt),
					 MLX5_MKC_ACCESS_MODE_MTT);
	if (IS_ERR(mr->mtt_mr)) {
		err = PTR_ERR(mr->mtt_mr);
		goto err_free_klm_mr;
	}

	/* Set bsf descriptors for mkey */
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, bsf_en, 1);
	MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);

	err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
				     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
	if (err)
		goto err_free_mtt_mr;

	err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			      mr->sig, GFP_KERNEL));
	if (err)
		goto err_free_descs;
	return 0;

err_free_descs:
	destroy_mkey(dev, mr);
	mlx5_free_priv_descs(mr);
err_free_mtt_mr:
	mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
	mr->mtt_mr = NULL;
err_free_klm_mr:
	mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
	mr->klm_mr = NULL;
err_destroy_psv:
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
			     mr->sig->psv_memory.psv_idx);
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
			     mr->sig->psv_wire.psv_idx);
err_free_sig:
	kfree(mr->sig);

	return err;
}

static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
					enum ib_mr_type mr_type, u32 max_num_sg,
					u32 max_num_meta_sg)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg, 4);
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mr->ibmr.device = pd->device;
	mr->umem = NULL;

	switch (mr_type) {
	case IB_MR_TYPE_MEM_REG:
		err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_SG_GAPS:
		err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_INTEGRITY:
		err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
						 max_num_meta_sg, in, inlen);
		break;
	default:
		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
		err = -EINVAL;
	}

	if (err)
		goto err_free_in;

	kfree(in);

	return &mr->ibmr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
			       u32 max_num_sg)
{
	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
}

struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
					 u32 max_num_sg, u32 max_num_meta_sg)
{
	return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
				  max_num_meta_sg);
}

int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mw *mw = to_mmw(ibmw);
	unsigned int ndescs;
	u32 *in = NULL;
	void *mkc;
	int err;
	struct mlx5_ib_alloc_mw req = {};
	struct {
		__u32	comp_mask;
		__u32	response_length;
	} resp = {};

	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
	if (err)
		return err;

	if (req.comp_mask || req.reserved1 || req.reserved2)
		return -EOPNOTSUPP;

	if (udata->inlen > sizeof(req) &&
	    !ib_is_udata_cleared(udata, sizeof(req),
				 udata->inlen - sizeof(req)))
		return -EOPNOTSUPP;

	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
	MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
	MLX5_SET(mkc, mkc, qpn, 0xffffff);

	err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
	if (err)
		goto free;

	mw->mmkey.type = MLX5_MKEY_MW;
	ibmw->rkey = mw->mmkey.key;
	mw->mmkey.ndescs = ndescs;

	resp.response_length =
		min(offsetofend(typeof(resp), response_length), udata->outlen);
	if (resp.response_length) {
		err = ib_copy_to_udata(udata, &resp, resp.response_length);
		if (err)
			goto free_mkey;
	}

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
		err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
		if (err)
			goto free_mkey;
	}

	kfree(in);
	return 0;

free_mkey:
	mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key);
free:
	kfree(in);
	return err;
}

int mlx5_ib_dealloc_mw(struct ib_mw *mw)
{
	struct mlx5_ib_dev *dev = to_mdev(mw->device);
	struct mlx5_ib_mw *mmw = to_mmw(mw);

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
		/*
		 * pagefault_single_data_segment() may be accessing mmw
		 * if the user bound an ODP MR to this MW.
int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
			    struct ib_mr_status *mr_status)
{
	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
	int ret = 0;

	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
		pr_err("Invalid status check mask\n");
		ret = -EINVAL;
		goto done;
	}

	mr_status->fail_status = 0;
	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
		if (!mmr->sig) {
			ret = -EINVAL;
			pr_err("signature status check requested on a non-signature enabled MR\n");
			goto done;
		}

		mmr->sig->sig_status_checked = true;
		if (!mmr->sig->sig_err_exists)
			goto done;

		if (ibmr->lkey == mmr->sig->err_item.key)
			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
			       sizeof(mr_status->sig_err));
		else {
			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
			mr_status->sig_err.sig_err_offset = 0;
			mr_status->sig_err.key = mmr->sig->err_item.key;
		}

		mmr->sig->sig_err_exists = false;
		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
	}

done:
	return ret;
}

static int
mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			int data_sg_nents, unsigned int *data_sg_offset,
			struct scatterlist *meta_sg, int meta_sg_nents,
			unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	unsigned int sg_offset = 0;
	int n = 0;

	mr->meta_length = 0;
	if (data_sg_nents == 1) {
		n++;
		mr->mmkey.ndescs = 1;
		if (data_sg_offset)
			sg_offset = *data_sg_offset;
		mr->data_length = sg_dma_len(data_sg) - sg_offset;
		mr->data_iova = sg_dma_address(data_sg) + sg_offset;
		if (meta_sg_nents == 1) {
			n++;
			mr->meta_ndescs = 1;
			if (meta_sg_offset)
				sg_offset = *meta_sg_offset;
			else
				sg_offset = 0;
			mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
			mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
		}
		ibmr->length = mr->data_length + mr->meta_length;
	}

	return n;
}

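/*
 * Build KLM descriptors for the data SG list and, optionally, the metadata
 * SG list. Each KLM entry references the PD's local_dma_lkey and points at
 * the DMA address of one SG element; the number of descriptors consumed is
 * capped by max_descs.
 */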
static int
mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
		   struct scatterlist *sgl,
		   unsigned short sg_nents,
		   unsigned int *sg_offset_p,
		   struct scatterlist *meta_sgl,
		   unsigned short meta_sg_nents,
		   unsigned int *meta_sg_offset_p)
{
	struct scatterlist *sg = sgl;
	struct mlx5_klm *klms = mr->descs;
	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
	u32 lkey = mr->ibmr.pd->local_dma_lkey;
	int i, j = 0;

	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
	mr->ibmr.length = 0;

	for_each_sg(sgl, sg, sg_nents, i) {
		if (unlikely(i >= mr->max_descs))
			break;
		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
		klms[i].key = cpu_to_be32(lkey);
		mr->ibmr.length += sg_dma_len(sg) - sg_offset;

		sg_offset = 0;
	}

	if (sg_offset_p)
		*sg_offset_p = sg_offset;

	mr->mmkey.ndescs = i;
	mr->data_length = mr->ibmr.length;

	if (meta_sg_nents) {
		sg = meta_sgl;
		sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
			if (unlikely(i + j >= mr->max_descs))
				break;
			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
						     sg_offset);
			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
							 sg_offset);
			klms[i + j].key = cpu_to_be32(lkey);
			mr->ibmr.length += sg_dma_len(sg) - sg_offset;

			sg_offset = 0;
		}
		if (meta_sg_offset_p)
			*meta_sg_offset_p = sg_offset;

		mr->meta_ndescs = j;
		mr->meta_length = mr->ibmr.length - mr->data_length;
	}

	return i + j;
}

static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
		cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

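/*
 * Map the data and metadata SG lists onto the internal MTT PI MR. Both
 * ranges share a single page-based mapping: the data pages are laid out
 * first and the metadata follows, with pi_iova computed as described in
 * the comments inside the function.
 */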
static int
mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	pi_mr->ibmr.page_size = ibmr->page_size;
	n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
			   mlx5_set_page);
	if (n != data_sg_nents)
		return n;

	pi_mr->data_iova = pi_mr->ibmr.iova;
	pi_mr->data_length = pi_mr->ibmr.length;
	pi_mr->ibmr.length = pi_mr->data_length;
	ibmr->length = pi_mr->data_length;

	if (meta_sg_nents) {
		u64 page_mask = ~((u64)ibmr->page_size - 1);
		u64 iova = pi_mr->data_iova;

		n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
				    meta_sg_offset, mlx5_set_page_pi);

		pi_mr->meta_length = pi_mr->ibmr.length;
		/*
		 * The PI address for the HW is the offset of the metadata
		 * address relative to the first data page address.
		 * It equals the first data page address + the size of the
		 * data pages + the metadata offset within the first metadata
		 * page.
		 */
		pi_mr->pi_iova = (iova & page_mask) +
				 pi_mr->mmkey.ndescs * ibmr->page_size +
				 (pi_mr->ibmr.iova & ~page_mask);
		/*
		 * In order to use one MTT MR for data and metadata, we also
		 * register the gaps between the end of the data and the start
		 * of the metadata (the sig MR will verify that the HW accesses
		 * the right addresses). This mapping is safe because we use an
		 * internal mkey for the registration.
		 */
		pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
		pi_mr->ibmr.iova = iova;
		ibmr->length += pi_mr->meta_length;
	}

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}

static int
mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->klm_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
			       meta_sg, meta_sg_nents, meta_sg_offset);

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	/* This is a zero-based memory region */
	pi_mr->data_iova = 0;
	pi_mr->ibmr.iova = 0;
	pi_mr->pi_iova = pi_mr->data_length;
	ibmr->length = pi_mr->ibmr.length;

	return n;
}

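/*
 * Top-level mapping entry point for integrity MRs. Mapping is attempted in
 * order of increasing cost: direct PA mapping with local_dma_lkey, then the
 * MTT PI MR, and finally the KLM PI MR; the first method that covers all
 * data and metadata SG entries is used.
 */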
int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = NULL;
	int n;

	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);

	mr->mmkey.ndescs = 0;
	mr->data_length = 0;
	mr->data_iova = 0;
	mr->meta_ndescs = 0;
	mr->pi_iova = 0;
	/*
	 * As a performance optimization, if possible, there is no need to
	 * perform a UMR operation to register the data/metadata buffers.
	 * First try to map the sg lists to PA descriptors with local_dma_lkey.
	 * Fall back to UMR only in case of a failure.
	 */
	n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				    data_sg_offset, meta_sg, meta_sg_nents,
				    meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;
	/*
	 * As a performance optimization, if possible, there is no need to map
	 * the sg lists to KLM descriptors. First try to map the sg lists to
	 * MTT descriptors and fall back to KLM only in case of a failure.
	 * It's more efficient for the HW to work with MTT descriptors
	 * (especially under high load).
	 * Use KLM (indirect access) only if it's mandatory.
	 */
	pi_mr = mr->mtt_mr;
	n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;

	pi_mr = mr->klm_mr;
	n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (unlikely(n != data_sg_nents + meta_sg_nents))
		return -ENOMEM;

out:
	/* This is a zero-based memory region */
	ibmr->iova = 0;
	mr->pi_mr = pi_mr;
	if (pi_mr)
		ibmr->sig_attrs->meta_length = pi_mr->meta_length;
	else
		ibmr->sig_attrs->meta_length = mr->meta_length;

	return 0;
}

int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
		      unsigned int *sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	int n;

	mr->mmkey.ndescs = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
				   mr->desc_size * mr->max_descs,
				   DMA_TO_DEVICE);

	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
				       NULL);
	else
		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
				   mlx5_set_page);

	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
				      mr->desc_size * mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}