/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 * Copyright (c) 2020, Intel Corporation. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */


#include <linux/kref.h>
#include <linux/random.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include <rdma/ib_verbs.h>
#include "dm.h"
#include "mlx5_ib.h"
#include "umr.h"

enum {
	MAX_PENDING_REG_MR = 8,
};

#define MLX5_UMR_ALIGN 2048

static void
create_mkey_callback(int status, struct mlx5_async_work *context);
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
				     u64 iova, int access_flags,
				     unsigned int page_size, bool populate);

static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
					  struct ib_pd *pd)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);

	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
	MLX5_SET(mkc, mkc, lr, 1);

	if ((acc & IB_ACCESS_RELAXED_ORDERING) &&
	    pcie_relaxed_ordering_enabled(dev->mdev->pdev)) {
		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
			MLX5_SET(mkc, mkc, relaxed_ordering_write, 1);
		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read))
			MLX5_SET(mkc, mkc, relaxed_ordering_read, 1);
	}

	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET64(mkc, mkc, start_addr, start_addr);
}

static void assign_mkey_variant(struct mlx5_ib_dev *dev,
				struct mlx5_ib_mkey *mkey, u32 *in)
{
	u8 key = atomic_inc_return(&dev->mkey_var);
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, mkey_7_0, key);
	mkey->key = key;
}

static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
			       struct mlx5_ib_mkey *mkey, u32 *in, int inlen)
{
	int ret;

	assign_mkey_variant(dev, mkey, in);
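	/*
	 * mkey->key so far holds only the 8-bit variant set by
	 * assign_mkey_variant(); the firmware-assigned mkey index is folded
	 * in once the command completes (see create_mkey_callback() for the
	 * async flavour of the same step).
	 */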
	ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
	if (!ret)
		init_waitqueue_head(&mkey->wait);

	return ret;
}

static int
mlx5_ib_create_mkey_cb(struct mlx5_ib_dev *dev,
		       struct mlx5_ib_mkey *mkey,
		       struct mlx5_async_ctx *async_ctx,
		       u32 *in, int inlen, u32 *out, int outlen,
		       struct mlx5_async_work *context)
{
	MLX5_SET(create_mkey_in, in, opcode, MLX5_CMD_OP_CREATE_MKEY);
	assign_mkey_variant(dev, mkey, in);
	return mlx5_cmd_exec_cb(async_ctx, in, inlen, out, outlen,
				create_mkey_callback, context);
}

static int mr_cache_max_order(struct mlx5_ib_dev *dev);
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);

static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));

	return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
}

static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
{
	if (status == -ENXIO) /* core driver is not available */
		return;

	mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
	if (status != -EREMOTEIO) /* driver specific failure */
		return;

	/* Failed in FW, print cmd out failure details */
	mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
}

static void create_mkey_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5_ib_mr *mr =
		container_of(context, struct mlx5_ib_mr, cb_work);
	struct mlx5_cache_ent *ent = mr->cache_ent;
	struct mlx5_ib_dev *dev = ent->dev;
	unsigned long flags;

	if (status) {
		create_mkey_warn(dev, status, mr->out);
		kfree(mr);
		spin_lock_irqsave(&ent->lock, flags);
		ent->pending--;
		WRITE_ONCE(dev->fill_delay, 1);
		spin_unlock_irqrestore(&ent->lock, flags);
		mod_timer(&dev->delay_timer, jiffies + HZ);
		return;
	}

	mr->mmkey.type = MLX5_MKEY_MR;
	mr->mmkey.key |= mlx5_idx_to_mkey(
		MLX5_GET(create_mkey_out, mr->out, mkey_index));
	init_waitqueue_head(&mr->mmkey.wait);

	WRITE_ONCE(dev->cache.last_add, jiffies);

	spin_lock_irqsave(&ent->lock, flags);
	list_add_tail(&mr->list, &ent->head);
	ent->available_mrs++;
	ent->total_mrs++;
	/* If we are doing fill_to_high_water then keep going.
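	 * queue_adjust_cache_locked() below re-checks the water marks and
	 * re-arms ent->dwork if the entry still needs filling (or trimming).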
*/ 176 queue_adjust_cache_locked(ent); 177 ent->pending--; 178 spin_unlock_irqrestore(&ent->lock, flags); 179 } 180 181 static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs) 182 { 183 int ret = 0; 184 185 switch (access_mode) { 186 case MLX5_MKC_ACCESS_MODE_MTT: 187 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD / 188 sizeof(struct mlx5_mtt)); 189 break; 190 case MLX5_MKC_ACCESS_MODE_KSM: 191 ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD / 192 sizeof(struct mlx5_klm)); 193 break; 194 default: 195 WARN_ON(1); 196 } 197 return ret; 198 } 199 200 static struct mlx5_ib_mr *alloc_cache_mr(struct mlx5_cache_ent *ent, void *mkc) 201 { 202 struct mlx5_ib_mr *mr; 203 204 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 205 if (!mr) 206 return NULL; 207 mr->cache_ent = ent; 208 209 set_mkc_access_pd_addr_fields(mkc, 0, 0, ent->dev->umrc.pd); 210 MLX5_SET(mkc, mkc, free, 1); 211 MLX5_SET(mkc, mkc, umr_en, 1); 212 MLX5_SET(mkc, mkc, access_mode_1_0, ent->access_mode & 0x3); 213 MLX5_SET(mkc, mkc, access_mode_4_2, (ent->access_mode >> 2) & 0x7); 214 215 MLX5_SET(mkc, mkc, translations_octword_size, 216 get_mkc_octo_size(ent->access_mode, ent->ndescs)); 217 MLX5_SET(mkc, mkc, log_page_size, ent->page); 218 return mr; 219 } 220 221 /* Asynchronously schedule new MRs to be populated in the cache. */ 222 static int add_keys(struct mlx5_cache_ent *ent, unsigned int num) 223 { 224 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 225 struct mlx5_ib_mr *mr; 226 void *mkc; 227 u32 *in; 228 int err = 0; 229 int i; 230 231 in = kzalloc(inlen, GFP_KERNEL); 232 if (!in) 233 return -ENOMEM; 234 235 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 236 for (i = 0; i < num; i++) { 237 mr = alloc_cache_mr(ent, mkc); 238 if (!mr) { 239 err = -ENOMEM; 240 break; 241 } 242 spin_lock_irq(&ent->lock); 243 if (ent->pending >= MAX_PENDING_REG_MR) { 244 err = -EAGAIN; 245 spin_unlock_irq(&ent->lock); 246 kfree(mr); 247 break; 248 } 249 ent->pending++; 250 spin_unlock_irq(&ent->lock); 251 err = mlx5_ib_create_mkey_cb(ent->dev, &mr->mmkey, 252 &ent->dev->async_ctx, in, inlen, 253 mr->out, sizeof(mr->out), 254 &mr->cb_work); 255 if (err) { 256 spin_lock_irq(&ent->lock); 257 ent->pending--; 258 spin_unlock_irq(&ent->lock); 259 mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err); 260 kfree(mr); 261 break; 262 } 263 } 264 265 kfree(in); 266 return err; 267 } 268 269 /* Synchronously create a MR in the cache */ 270 static struct mlx5_ib_mr *create_cache_mr(struct mlx5_cache_ent *ent) 271 { 272 size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 273 struct mlx5_ib_mr *mr; 274 void *mkc; 275 u32 *in; 276 int err; 277 278 in = kzalloc(inlen, GFP_KERNEL); 279 if (!in) 280 return ERR_PTR(-ENOMEM); 281 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 282 283 mr = alloc_cache_mr(ent, mkc); 284 if (!mr) { 285 err = -ENOMEM; 286 goto free_in; 287 } 288 289 err = mlx5_core_create_mkey(ent->dev->mdev, &mr->mmkey.key, in, inlen); 290 if (err) 291 goto free_mr; 292 293 init_waitqueue_head(&mr->mmkey.wait); 294 mr->mmkey.type = MLX5_MKEY_MR; 295 WRITE_ONCE(ent->dev->cache.last_add, jiffies); 296 spin_lock_irq(&ent->lock); 297 ent->total_mrs++; 298 spin_unlock_irq(&ent->lock); 299 kfree(in); 300 return mr; 301 free_mr: 302 kfree(mr); 303 free_in: 304 kfree(in); 305 return ERR_PTR(err); 306 } 307 308 static void remove_cache_mr_locked(struct mlx5_cache_ent *ent) 309 { 310 struct mlx5_ib_mr *mr; 311 312 lockdep_assert_held(&ent->lock); 313 if (list_empty(&ent->head)) 314 return; 315 mr = 
list_first_entry(&ent->head, struct mlx5_ib_mr, list); 316 list_del(&mr->list); 317 ent->available_mrs--; 318 ent->total_mrs--; 319 spin_unlock_irq(&ent->lock); 320 mlx5_core_destroy_mkey(ent->dev->mdev, mr->mmkey.key); 321 kfree(mr); 322 spin_lock_irq(&ent->lock); 323 } 324 325 static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target, 326 bool limit_fill) 327 { 328 int err; 329 330 lockdep_assert_held(&ent->lock); 331 332 while (true) { 333 if (limit_fill) 334 target = ent->limit * 2; 335 if (target == ent->available_mrs + ent->pending) 336 return 0; 337 if (target > ent->available_mrs + ent->pending) { 338 u32 todo = target - (ent->available_mrs + ent->pending); 339 340 spin_unlock_irq(&ent->lock); 341 err = add_keys(ent, todo); 342 if (err == -EAGAIN) 343 usleep_range(3000, 5000); 344 spin_lock_irq(&ent->lock); 345 if (err) { 346 if (err != -EAGAIN) 347 return err; 348 } else 349 return 0; 350 } else { 351 remove_cache_mr_locked(ent); 352 } 353 } 354 } 355 356 static ssize_t size_write(struct file *filp, const char __user *buf, 357 size_t count, loff_t *pos) 358 { 359 struct mlx5_cache_ent *ent = filp->private_data; 360 u32 target; 361 int err; 362 363 err = kstrtou32_from_user(buf, count, 0, &target); 364 if (err) 365 return err; 366 367 /* 368 * Target is the new value of total_mrs the user requests, however we 369 * cannot free MRs that are in use. Compute the target value for 370 * available_mrs. 371 */ 372 spin_lock_irq(&ent->lock); 373 if (target < ent->total_mrs - ent->available_mrs) { 374 err = -EINVAL; 375 goto err_unlock; 376 } 377 target = target - (ent->total_mrs - ent->available_mrs); 378 if (target < ent->limit || target > ent->limit*2) { 379 err = -EINVAL; 380 goto err_unlock; 381 } 382 err = resize_available_mrs(ent, target, false); 383 if (err) 384 goto err_unlock; 385 spin_unlock_irq(&ent->lock); 386 387 return count; 388 389 err_unlock: 390 spin_unlock_irq(&ent->lock); 391 return err; 392 } 393 394 static ssize_t size_read(struct file *filp, char __user *buf, size_t count, 395 loff_t *pos) 396 { 397 struct mlx5_cache_ent *ent = filp->private_data; 398 char lbuf[20]; 399 int err; 400 401 err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->total_mrs); 402 if (err < 0) 403 return err; 404 405 return simple_read_from_buffer(buf, count, pos, lbuf, err); 406 } 407 408 static const struct file_operations size_fops = { 409 .owner = THIS_MODULE, 410 .open = simple_open, 411 .write = size_write, 412 .read = size_read, 413 }; 414 415 static ssize_t limit_write(struct file *filp, const char __user *buf, 416 size_t count, loff_t *pos) 417 { 418 struct mlx5_cache_ent *ent = filp->private_data; 419 u32 var; 420 int err; 421 422 err = kstrtou32_from_user(buf, count, 0, &var); 423 if (err) 424 return err; 425 426 /* 427 * Upon set we immediately fill the cache to high water mark implied by 428 * the limit. 
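	 * (resize_available_mrs() with limit_fill == true keeps calling
	 * add_keys() until 2 * ent->limit MRs are available.)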
429 */ 430 spin_lock_irq(&ent->lock); 431 ent->limit = var; 432 err = resize_available_mrs(ent, 0, true); 433 spin_unlock_irq(&ent->lock); 434 if (err) 435 return err; 436 return count; 437 } 438 439 static ssize_t limit_read(struct file *filp, char __user *buf, size_t count, 440 loff_t *pos) 441 { 442 struct mlx5_cache_ent *ent = filp->private_data; 443 char lbuf[20]; 444 int err; 445 446 err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit); 447 if (err < 0) 448 return err; 449 450 return simple_read_from_buffer(buf, count, pos, lbuf, err); 451 } 452 453 static const struct file_operations limit_fops = { 454 .owner = THIS_MODULE, 455 .open = simple_open, 456 .write = limit_write, 457 .read = limit_read, 458 }; 459 460 static bool someone_adding(struct mlx5_mr_cache *cache) 461 { 462 unsigned int i; 463 464 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { 465 struct mlx5_cache_ent *ent = &cache->ent[i]; 466 bool ret; 467 468 spin_lock_irq(&ent->lock); 469 ret = ent->available_mrs < ent->limit; 470 spin_unlock_irq(&ent->lock); 471 if (ret) 472 return true; 473 } 474 return false; 475 } 476 477 /* 478 * Check if the bucket is outside the high/low water mark and schedule an async 479 * update. The cache refill has hysteresis, once the low water mark is hit it is 480 * refilled up to the high mark. 481 */ 482 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent) 483 { 484 lockdep_assert_held(&ent->lock); 485 486 if (ent->disabled || READ_ONCE(ent->dev->fill_delay)) 487 return; 488 if (ent->available_mrs < ent->limit) { 489 ent->fill_to_high_water = true; 490 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); 491 } else if (ent->fill_to_high_water && 492 ent->available_mrs + ent->pending < 2 * ent->limit) { 493 /* 494 * Once we start populating due to hitting a low water mark 495 * continue until we pass the high water mark. 496 */ 497 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); 498 } else if (ent->available_mrs == 2 * ent->limit) { 499 ent->fill_to_high_water = false; 500 } else if (ent->available_mrs > 2 * ent->limit) { 501 /* Queue deletion of excess entries */ 502 ent->fill_to_high_water = false; 503 if (ent->pending) 504 queue_delayed_work(ent->dev->cache.wq, &ent->dwork, 505 msecs_to_jiffies(1000)); 506 else 507 mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0); 508 } 509 } 510 511 static void __cache_work_func(struct mlx5_cache_ent *ent) 512 { 513 struct mlx5_ib_dev *dev = ent->dev; 514 struct mlx5_mr_cache *cache = &dev->cache; 515 int err; 516 517 spin_lock_irq(&ent->lock); 518 if (ent->disabled) 519 goto out; 520 521 if (ent->fill_to_high_water && 522 ent->available_mrs + ent->pending < 2 * ent->limit && 523 !READ_ONCE(dev->fill_delay)) { 524 spin_unlock_irq(&ent->lock); 525 err = add_keys(ent, 1); 526 spin_lock_irq(&ent->lock); 527 if (ent->disabled) 528 goto out; 529 if (err) { 530 /* 531 * EAGAIN only happens if pending is positive, so we 532 * will be rescheduled from reg_mr_callback(). The only 533 * failure path here is ENOMEM. 534 */ 535 if (err != -EAGAIN) { 536 mlx5_ib_warn( 537 dev, 538 "command failed order %d, err %d\n", 539 ent->order, err); 540 queue_delayed_work(cache->wq, &ent->dwork, 541 msecs_to_jiffies(1000)); 542 } 543 } 544 } else if (ent->available_mrs > 2 * ent->limit) { 545 bool need_delay; 546 547 /* 548 * The remove_cache_mr() logic is performed as garbage 549 * collection task. Such task is intended to be run when no 550 * other active processes are running. 
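		 * Each invocation destroys at most one cached MR via
		 * remove_cache_mr_locked() and then lets
		 * queue_adjust_cache_locked() decide whether to re-queue.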
551 * 552 * The need_resched() will return TRUE if there are user tasks 553 * to be activated in near future. 554 * 555 * In such case, we don't execute remove_cache_mr() and postpone 556 * the garbage collection work to try to run in next cycle, in 557 * order to free CPU resources to other tasks. 558 */ 559 spin_unlock_irq(&ent->lock); 560 need_delay = need_resched() || someone_adding(cache) || 561 !time_after(jiffies, 562 READ_ONCE(cache->last_add) + 300 * HZ); 563 spin_lock_irq(&ent->lock); 564 if (ent->disabled) 565 goto out; 566 if (need_delay) { 567 queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); 568 goto out; 569 } 570 remove_cache_mr_locked(ent); 571 queue_adjust_cache_locked(ent); 572 } 573 out: 574 spin_unlock_irq(&ent->lock); 575 } 576 577 static void delayed_cache_work_func(struct work_struct *work) 578 { 579 struct mlx5_cache_ent *ent; 580 581 ent = container_of(work, struct mlx5_cache_ent, dwork.work); 582 __cache_work_func(ent); 583 } 584 585 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, 586 struct mlx5_cache_ent *ent, 587 int access_flags) 588 { 589 struct mlx5_ib_mr *mr; 590 591 /* Matches access in alloc_cache_mr() */ 592 if (!mlx5r_umr_can_reconfig(dev, 0, access_flags)) 593 return ERR_PTR(-EOPNOTSUPP); 594 595 spin_lock_irq(&ent->lock); 596 if (list_empty(&ent->head)) { 597 queue_adjust_cache_locked(ent); 598 ent->miss++; 599 spin_unlock_irq(&ent->lock); 600 mr = create_cache_mr(ent); 601 if (IS_ERR(mr)) 602 return mr; 603 } else { 604 mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); 605 list_del(&mr->list); 606 ent->available_mrs--; 607 queue_adjust_cache_locked(ent); 608 spin_unlock_irq(&ent->lock); 609 610 mlx5_clear_mr(mr); 611 } 612 return mr; 613 } 614 615 static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) 616 { 617 struct mlx5_cache_ent *ent = mr->cache_ent; 618 619 WRITE_ONCE(dev->cache.last_add, jiffies); 620 spin_lock_irq(&ent->lock); 621 list_add_tail(&mr->list, &ent->head); 622 ent->available_mrs++; 623 queue_adjust_cache_locked(ent); 624 spin_unlock_irq(&ent->lock); 625 } 626 627 static void clean_keys(struct mlx5_ib_dev *dev, int c) 628 { 629 struct mlx5_mr_cache *cache = &dev->cache; 630 struct mlx5_cache_ent *ent = &cache->ent[c]; 631 struct mlx5_ib_mr *tmp_mr; 632 struct mlx5_ib_mr *mr; 633 LIST_HEAD(del_list); 634 635 cancel_delayed_work(&ent->dwork); 636 while (1) { 637 spin_lock_irq(&ent->lock); 638 if (list_empty(&ent->head)) { 639 spin_unlock_irq(&ent->lock); 640 break; 641 } 642 mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); 643 list_move(&mr->list, &del_list); 644 ent->available_mrs--; 645 ent->total_mrs--; 646 spin_unlock_irq(&ent->lock); 647 mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key); 648 } 649 650 list_for_each_entry_safe(mr, tmp_mr, &del_list, list) { 651 list_del(&mr->list); 652 kfree(mr); 653 } 654 } 655 656 static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) 657 { 658 if (!mlx5_debugfs_root || dev->is_rep) 659 return; 660 661 debugfs_remove_recursive(dev->cache.root); 662 dev->cache.root = NULL; 663 } 664 665 static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev) 666 { 667 struct mlx5_mr_cache *cache = &dev->cache; 668 struct mlx5_cache_ent *ent; 669 struct dentry *dir; 670 int i; 671 672 if (!mlx5_debugfs_root || dev->is_rep) 673 return; 674 675 cache->root = debugfs_create_dir("mr_cache", mlx5_debugfs_get_dev_root(dev->mdev)); 676 677 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { 678 ent = &cache->ent[i]; 679 sprintf(ent->name, "%d", 
ent->order); 680 dir = debugfs_create_dir(ent->name, cache->root); 681 debugfs_create_file("size", 0600, dir, ent, &size_fops); 682 debugfs_create_file("limit", 0600, dir, ent, &limit_fops); 683 debugfs_create_u32("cur", 0400, dir, &ent->available_mrs); 684 debugfs_create_u32("miss", 0600, dir, &ent->miss); 685 } 686 } 687 688 static void delay_time_func(struct timer_list *t) 689 { 690 struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer); 691 692 WRITE_ONCE(dev->fill_delay, 0); 693 } 694 695 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) 696 { 697 struct mlx5_mr_cache *cache = &dev->cache; 698 struct mlx5_cache_ent *ent; 699 int i; 700 701 mutex_init(&dev->slow_path_mutex); 702 cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM); 703 if (!cache->wq) { 704 mlx5_ib_warn(dev, "failed to create work queue\n"); 705 return -ENOMEM; 706 } 707 708 mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx); 709 timer_setup(&dev->delay_timer, delay_time_func, 0); 710 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { 711 ent = &cache->ent[i]; 712 INIT_LIST_HEAD(&ent->head); 713 spin_lock_init(&ent->lock); 714 ent->order = i + 2; 715 ent->dev = dev; 716 ent->limit = 0; 717 718 INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); 719 720 if (i > MR_CACHE_LAST_STD_ENTRY) { 721 mlx5_odp_init_mr_cache_entry(ent); 722 continue; 723 } 724 725 if (ent->order > mr_cache_max_order(dev)) 726 continue; 727 728 ent->page = PAGE_SHIFT; 729 ent->ndescs = 1 << ent->order; 730 ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT; 731 if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) && 732 !dev->is_rep && mlx5_core_is_pf(dev->mdev) && 733 mlx5r_umr_can_load_pas(dev, 0)) 734 ent->limit = dev->mdev->profile.mr_cache[i].limit; 735 else 736 ent->limit = 0; 737 spin_lock_irq(&ent->lock); 738 queue_adjust_cache_locked(ent); 739 spin_unlock_irq(&ent->lock); 740 } 741 742 mlx5_mr_cache_debugfs_init(dev); 743 744 return 0; 745 } 746 747 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) 748 { 749 unsigned int i; 750 751 if (!dev->cache.wq) 752 return 0; 753 754 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { 755 struct mlx5_cache_ent *ent = &dev->cache.ent[i]; 756 757 spin_lock_irq(&ent->lock); 758 ent->disabled = true; 759 spin_unlock_irq(&ent->lock); 760 cancel_delayed_work_sync(&ent->dwork); 761 } 762 763 mlx5_mr_cache_debugfs_cleanup(dev); 764 mlx5_cmd_cleanup_async_ctx(&dev->async_ctx); 765 766 for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) 767 clean_keys(dev, i); 768 769 destroy_workqueue(dev->cache.wq); 770 del_timer_sync(&dev->delay_timer); 771 772 return 0; 773 } 774 775 struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) 776 { 777 struct mlx5_ib_dev *dev = to_mdev(pd->device); 778 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 779 struct mlx5_ib_mr *mr; 780 void *mkc; 781 u32 *in; 782 int err; 783 784 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 785 if (!mr) 786 return ERR_PTR(-ENOMEM); 787 788 in = kzalloc(inlen, GFP_KERNEL); 789 if (!in) { 790 err = -ENOMEM; 791 goto err_free; 792 } 793 794 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 795 796 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); 797 MLX5_SET(mkc, mkc, length64, 1); 798 set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0, 799 pd); 800 801 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 802 if (err) 803 goto err_in; 804 805 kfree(in); 806 mr->mmkey.type = MLX5_MKEY_MR; 807 mr->ibmr.lkey = mr->mmkey.key; 808 mr->ibmr.rkey = mr->mmkey.key; 809 mr->umem = NULL; 810 811 return &mr->ibmr; 812 813 
err_in:
	kfree(in);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

static int get_octo_len(u64 addr, u64 len, int page_shift)
{
	u64 page_size = 1ULL << page_shift;
	u64 offset;
	int npages;

	offset = addr & (page_size - 1);
	npages = ALIGN(len + offset, page_size) >> page_shift;
	return (npages + 1) / 2;
}

static int mr_cache_max_order(struct mlx5_ib_dev *dev)
{
	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		return MR_CACHE_LAST_STD_ENTRY + 2;
	return MLX5_MAX_UMR_SHIFT;
}

static struct mlx5_cache_ent *mr_cache_ent_from_order(struct mlx5_ib_dev *dev,
						      unsigned int order)
{
	struct mlx5_mr_cache *cache = &dev->cache;

	if (order < cache->ent[0].order)
		return &cache->ent[0];
	order = order - cache->ent[0].order;
	if (order > MR_CACHE_LAST_STD_ENTRY)
		return NULL;
	return &cache->ent[order];
}

static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
			  u64 length, int access_flags, u64 iova)
{
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->ibmr.length = length;
	mr->ibmr.device = &dev->ib_dev;
	mr->ibmr.iova = iova;
	mr->access_flags = access_flags;
}

static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
						  u64 iova)
{
	/*
	 * The alignment of iova has already been checked upon entering
	 * UVERBS_METHOD_REG_DMABUF_MR
	 */
	umem->iova = iova;
	return PAGE_SIZE;
}

static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
					     struct ib_umem *umem, u64 iova,
					     int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_cache_ent *ent;
	struct mlx5_ib_mr *mr;
	unsigned int page_size;

	if (umem->is_dmabuf)
		page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
	else
		page_size = mlx5_umem_find_best_pgsz(umem, mkc, log_page_size,
						     0, iova);
	if (WARN_ON(!page_size))
		return ERR_PTR(-EINVAL);
	ent = mr_cache_ent_from_order(
		dev, order_base_2(ib_umem_num_dma_blocks(umem, page_size)));
	/*
	 * Matches access in alloc_cache_mr(). If the MR can't come from the
	 * cache then synchronously create an uncached one.
	 */
	if (!ent || ent->limit == 0 ||
	    !mlx5r_umr_can_reconfig(dev, 0, access_flags)) {
		mutex_lock(&dev->slow_path_mutex);
		mr = reg_create(pd, umem, iova, access_flags, page_size, false);
		mutex_unlock(&dev->slow_path_mutex);
		return mr;
	}

	mr = mlx5_mr_cache_alloc(dev, ent, access_flags);
	if (IS_ERR(mr))
		return mr;

	mr->ibmr.pd = pd;
	mr->umem = umem;
	mr->page_shift = order_base_2(page_size);
	set_mr_fields(dev, mr, umem->length, access_flags, iova);

	return mr;
}

/*
 * reg_create() builds a new mkey with a blocking CREATE_MKEY command. When
 * "populate" is true the translation entries for the umem are written inline
 * in the command; otherwise the mkey is created free (disabled) and is
 * expected to be populated and enabled later via UMR.
920 */ 921 static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, 922 u64 iova, int access_flags, 923 unsigned int page_size, bool populate) 924 { 925 struct mlx5_ib_dev *dev = to_mdev(pd->device); 926 struct mlx5_ib_mr *mr; 927 __be64 *pas; 928 void *mkc; 929 int inlen; 930 u32 *in; 931 int err; 932 bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); 933 934 if (!page_size) 935 return ERR_PTR(-EINVAL); 936 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 937 if (!mr) 938 return ERR_PTR(-ENOMEM); 939 940 mr->ibmr.pd = pd; 941 mr->access_flags = access_flags; 942 mr->page_shift = order_base_2(page_size); 943 944 inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 945 if (populate) 946 inlen += sizeof(*pas) * 947 roundup(ib_umem_num_dma_blocks(umem, page_size), 2); 948 in = kvzalloc(inlen, GFP_KERNEL); 949 if (!in) { 950 err = -ENOMEM; 951 goto err_1; 952 } 953 pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); 954 if (populate) { 955 if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) { 956 err = -EINVAL; 957 goto err_2; 958 } 959 mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas, 960 pg_cap ? MLX5_IB_MTT_PRESENT : 0); 961 } 962 963 /* The pg_access bit allows setting the access flags 964 * in the page list submitted with the command. */ 965 MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap)); 966 967 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 968 set_mkc_access_pd_addr_fields(mkc, access_flags, iova, 969 populate ? pd : dev->umrc.pd); 970 MLX5_SET(mkc, mkc, free, !populate); 971 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); 972 MLX5_SET(mkc, mkc, umr_en, 1); 973 974 MLX5_SET64(mkc, mkc, len, umem->length); 975 MLX5_SET(mkc, mkc, bsf_octword_size, 0); 976 MLX5_SET(mkc, mkc, translations_octword_size, 977 get_octo_len(iova, umem->length, mr->page_shift)); 978 MLX5_SET(mkc, mkc, log_page_size, mr->page_shift); 979 if (populate) { 980 MLX5_SET(create_mkey_in, in, translations_octword_actual_size, 981 get_octo_len(iova, umem->length, mr->page_shift)); 982 } 983 984 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 985 if (err) { 986 mlx5_ib_warn(dev, "create mkey failed\n"); 987 goto err_2; 988 } 989 mr->mmkey.type = MLX5_MKEY_MR; 990 mr->umem = umem; 991 set_mr_fields(dev, mr, umem->length, access_flags, iova); 992 kvfree(in); 993 994 mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key); 995 996 return mr; 997 998 err_2: 999 kvfree(in); 1000 err_1: 1001 kfree(mr); 1002 return ERR_PTR(err); 1003 } 1004 1005 static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr, 1006 u64 length, int acc, int mode) 1007 { 1008 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1009 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 1010 struct mlx5_ib_mr *mr; 1011 void *mkc; 1012 u32 *in; 1013 int err; 1014 1015 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1016 if (!mr) 1017 return ERR_PTR(-ENOMEM); 1018 1019 in = kzalloc(inlen, GFP_KERNEL); 1020 if (!in) { 1021 err = -ENOMEM; 1022 goto err_free; 1023 } 1024 1025 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1026 1027 MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3); 1028 MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7); 1029 MLX5_SET64(mkc, mkc, len, length); 1030 set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd); 1031 1032 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 1033 if (err) 1034 goto err_in; 1035 1036 kfree(in); 1037 1038 set_mr_fields(dev, mr, length, acc, start_addr); 1039 1040 return &mr->ibmr; 1041 1042 err_in: 1043 kfree(in); 1044 1045 err_free: 1046 kfree(mr); 
1047 1048 return ERR_PTR(err); 1049 } 1050 1051 int mlx5_ib_advise_mr(struct ib_pd *pd, 1052 enum ib_uverbs_advise_mr_advice advice, 1053 u32 flags, 1054 struct ib_sge *sg_list, 1055 u32 num_sge, 1056 struct uverbs_attr_bundle *attrs) 1057 { 1058 if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH && 1059 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE && 1060 advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT) 1061 return -EOPNOTSUPP; 1062 1063 return mlx5_ib_advise_mr_prefetch(pd, advice, flags, 1064 sg_list, num_sge); 1065 } 1066 1067 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, 1068 struct ib_dm_mr_attr *attr, 1069 struct uverbs_attr_bundle *attrs) 1070 { 1071 struct mlx5_ib_dm *mdm = to_mdm(dm); 1072 struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev; 1073 u64 start_addr = mdm->dev_addr + attr->offset; 1074 int mode; 1075 1076 switch (mdm->type) { 1077 case MLX5_IB_UAPI_DM_TYPE_MEMIC: 1078 if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS) 1079 return ERR_PTR(-EINVAL); 1080 1081 mode = MLX5_MKC_ACCESS_MODE_MEMIC; 1082 start_addr -= pci_resource_start(dev->pdev, 0); 1083 break; 1084 case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM: 1085 case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM: 1086 if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS) 1087 return ERR_PTR(-EINVAL); 1088 1089 mode = MLX5_MKC_ACCESS_MODE_SW_ICM; 1090 break; 1091 default: 1092 return ERR_PTR(-EINVAL); 1093 } 1094 1095 return mlx5_ib_get_dm_mr(pd, start_addr, attr->length, 1096 attr->access_flags, mode); 1097 } 1098 1099 static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, 1100 u64 iova, int access_flags) 1101 { 1102 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1103 struct mlx5_ib_mr *mr = NULL; 1104 bool xlt_with_umr; 1105 int err; 1106 1107 xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length); 1108 if (xlt_with_umr) { 1109 mr = alloc_cacheable_mr(pd, umem, iova, access_flags); 1110 } else { 1111 unsigned int page_size = mlx5_umem_find_best_pgsz( 1112 umem, mkc, log_page_size, 0, iova); 1113 1114 mutex_lock(&dev->slow_path_mutex); 1115 mr = reg_create(pd, umem, iova, access_flags, page_size, true); 1116 mutex_unlock(&dev->slow_path_mutex); 1117 } 1118 if (IS_ERR(mr)) { 1119 ib_umem_release(umem); 1120 return ERR_CAST(mr); 1121 } 1122 1123 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); 1124 1125 atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages); 1126 1127 if (xlt_with_umr) { 1128 /* 1129 * If the MR was created with reg_create then it will be 1130 * configured properly but left disabled. It is safe to go ahead 1131 * and configure it again via UMR while enabling it. 
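		 * (MLX5_IB_UPD_XLT_ENABLE makes the same UMR that writes the
		 * translation entries also enable the mkey, i.e. clear the
		 * "free" state it was created with.)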
1132 */ 1133 err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE); 1134 if (err) { 1135 mlx5_ib_dereg_mr(&mr->ibmr, NULL); 1136 return ERR_PTR(err); 1137 } 1138 } 1139 return &mr->ibmr; 1140 } 1141 1142 static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, 1143 u64 iova, int access_flags, 1144 struct ib_udata *udata) 1145 { 1146 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1147 struct ib_umem_odp *odp; 1148 struct mlx5_ib_mr *mr; 1149 int err; 1150 1151 if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) 1152 return ERR_PTR(-EOPNOTSUPP); 1153 1154 err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq); 1155 if (err) 1156 return ERR_PTR(err); 1157 if (!start && length == U64_MAX) { 1158 if (iova != 0) 1159 return ERR_PTR(-EINVAL); 1160 if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT)) 1161 return ERR_PTR(-EINVAL); 1162 1163 mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags); 1164 if (IS_ERR(mr)) 1165 return ERR_CAST(mr); 1166 return &mr->ibmr; 1167 } 1168 1169 /* ODP requires xlt update via umr to work. */ 1170 if (!mlx5r_umr_can_load_pas(dev, length)) 1171 return ERR_PTR(-EINVAL); 1172 1173 odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags, 1174 &mlx5_mn_ops); 1175 if (IS_ERR(odp)) 1176 return ERR_CAST(odp); 1177 1178 mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags); 1179 if (IS_ERR(mr)) { 1180 ib_umem_release(&odp->umem); 1181 return ERR_CAST(mr); 1182 } 1183 xa_init(&mr->implicit_children); 1184 1185 odp->private = mr; 1186 err = mlx5r_store_odp_mkey(dev, &mr->mmkey); 1187 if (err) 1188 goto err_dereg_mr; 1189 1190 err = mlx5_ib_init_odp_mr(mr); 1191 if (err) 1192 goto err_dereg_mr; 1193 return &mr->ibmr; 1194 1195 err_dereg_mr: 1196 mlx5_ib_dereg_mr(&mr->ibmr, NULL); 1197 return ERR_PTR(err); 1198 } 1199 1200 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, 1201 u64 iova, int access_flags, 1202 struct ib_udata *udata) 1203 { 1204 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1205 struct ib_umem *umem; 1206 1207 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) 1208 return ERR_PTR(-EOPNOTSUPP); 1209 1210 mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", 1211 start, iova, length, access_flags); 1212 1213 if (access_flags & IB_ACCESS_ON_DEMAND) 1214 return create_user_odp_mr(pd, start, length, iova, access_flags, 1215 udata); 1216 umem = ib_umem_get(&dev->ib_dev, start, length, access_flags); 1217 if (IS_ERR(umem)) 1218 return ERR_CAST(umem); 1219 return create_real_mr(pd, umem, iova, access_flags); 1220 } 1221 1222 static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach) 1223 { 1224 struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv; 1225 struct mlx5_ib_mr *mr = umem_dmabuf->private; 1226 1227 dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); 1228 1229 if (!umem_dmabuf->sgt) 1230 return; 1231 1232 mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP); 1233 ib_umem_dmabuf_unmap_pages(umem_dmabuf); 1234 } 1235 1236 static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { 1237 .allow_peer2peer = 1, 1238 .move_notify = mlx5_ib_dmabuf_invalidate_cb, 1239 }; 1240 1241 struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, 1242 u64 length, u64 virt_addr, 1243 int fd, int access_flags, 1244 struct ib_udata *udata) 1245 { 1246 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1247 struct mlx5_ib_mr *mr = NULL; 1248 struct ib_umem_dmabuf *umem_dmabuf; 1249 int err; 1250 1251 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || 1252 
!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) 1253 return ERR_PTR(-EOPNOTSUPP); 1254 1255 mlx5_ib_dbg(dev, 1256 "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n", 1257 offset, virt_addr, length, fd, access_flags); 1258 1259 /* dmabuf requires xlt update via umr to work. */ 1260 if (!mlx5r_umr_can_load_pas(dev, length)) 1261 return ERR_PTR(-EINVAL); 1262 1263 umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, 1264 access_flags, 1265 &mlx5_ib_dmabuf_attach_ops); 1266 if (IS_ERR(umem_dmabuf)) { 1267 mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n", 1268 PTR_ERR(umem_dmabuf)); 1269 return ERR_CAST(umem_dmabuf); 1270 } 1271 1272 mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr, 1273 access_flags); 1274 if (IS_ERR(mr)) { 1275 ib_umem_release(&umem_dmabuf->umem); 1276 return ERR_CAST(mr); 1277 } 1278 1279 mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); 1280 1281 atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages); 1282 umem_dmabuf->private = mr; 1283 err = mlx5r_store_odp_mkey(dev, &mr->mmkey); 1284 if (err) 1285 goto err_dereg_mr; 1286 1287 err = mlx5_ib_init_dmabuf_mr(mr); 1288 if (err) 1289 goto err_dereg_mr; 1290 return &mr->ibmr; 1291 1292 err_dereg_mr: 1293 mlx5_ib_dereg_mr(&mr->ibmr, NULL); 1294 return ERR_PTR(err); 1295 } 1296 1297 /* 1298 * True if the change in access flags can be done via UMR, only some access 1299 * flags can be updated. 1300 */ 1301 static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev, 1302 unsigned int current_access_flags, 1303 unsigned int target_access_flags) 1304 { 1305 unsigned int diffs = current_access_flags ^ target_access_flags; 1306 1307 if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | 1308 IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING)) 1309 return false; 1310 return mlx5r_umr_can_reconfig(dev, current_access_flags, 1311 target_access_flags); 1312 } 1313 1314 static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr, 1315 struct ib_umem *new_umem, 1316 int new_access_flags, u64 iova, 1317 unsigned long *page_size) 1318 { 1319 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1320 1321 /* We only track the allocated sizes of MRs from the cache */ 1322 if (!mr->cache_ent) 1323 return false; 1324 if (!mlx5r_umr_can_load_pas(dev, new_umem->length)) 1325 return false; 1326 1327 *page_size = 1328 mlx5_umem_find_best_pgsz(new_umem, mkc, log_page_size, 0, iova); 1329 if (WARN_ON(!*page_size)) 1330 return false; 1331 return (1ULL << mr->cache_ent->order) >= 1332 ib_umem_num_dma_blocks(new_umem, *page_size); 1333 } 1334 1335 static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd, 1336 int access_flags, int flags, struct ib_umem *new_umem, 1337 u64 iova, unsigned long page_size) 1338 { 1339 struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); 1340 int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE; 1341 struct ib_umem *old_umem = mr->umem; 1342 int err; 1343 1344 /* 1345 * To keep everything simple the MR is revoked before we start to mess 1346 * with it. This ensure the change is atomic relative to any use of the 1347 * MR. 
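	 * (The mkey is re-enabled together with the new translation by
	 * mlx5r_umr_update_mr_pas() below, since upd_flags includes
	 * MLX5_IB_UPD_XLT_ENABLE.)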
1348 */ 1349 err = mlx5r_umr_revoke_mr(mr); 1350 if (err) 1351 return err; 1352 1353 if (flags & IB_MR_REREG_PD) { 1354 mr->ibmr.pd = pd; 1355 upd_flags |= MLX5_IB_UPD_XLT_PD; 1356 } 1357 if (flags & IB_MR_REREG_ACCESS) { 1358 mr->access_flags = access_flags; 1359 upd_flags |= MLX5_IB_UPD_XLT_ACCESS; 1360 } 1361 1362 mr->ibmr.length = new_umem->length; 1363 mr->ibmr.iova = iova; 1364 mr->ibmr.length = new_umem->length; 1365 mr->page_shift = order_base_2(page_size); 1366 mr->umem = new_umem; 1367 err = mlx5r_umr_update_mr_pas(mr, upd_flags); 1368 if (err) { 1369 /* 1370 * The MR is revoked at this point so there is no issue to free 1371 * new_umem. 1372 */ 1373 mr->umem = old_umem; 1374 return err; 1375 } 1376 1377 atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages); 1378 ib_umem_release(old_umem); 1379 atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages); 1380 return 0; 1381 } 1382 1383 struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, 1384 u64 length, u64 iova, int new_access_flags, 1385 struct ib_pd *new_pd, 1386 struct ib_udata *udata) 1387 { 1388 struct mlx5_ib_dev *dev = to_mdev(ib_mr->device); 1389 struct mlx5_ib_mr *mr = to_mmr(ib_mr); 1390 int err; 1391 1392 if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) 1393 return ERR_PTR(-EOPNOTSUPP); 1394 1395 mlx5_ib_dbg( 1396 dev, 1397 "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n", 1398 start, iova, length, new_access_flags); 1399 1400 if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) 1401 return ERR_PTR(-EOPNOTSUPP); 1402 1403 if (!(flags & IB_MR_REREG_ACCESS)) 1404 new_access_flags = mr->access_flags; 1405 if (!(flags & IB_MR_REREG_PD)) 1406 new_pd = ib_mr->pd; 1407 1408 if (!(flags & IB_MR_REREG_TRANS)) { 1409 struct ib_umem *umem; 1410 1411 /* Fast path for PD/access change */ 1412 if (can_use_umr_rereg_access(dev, mr->access_flags, 1413 new_access_flags)) { 1414 err = mlx5r_umr_rereg_pd_access(mr, new_pd, 1415 new_access_flags); 1416 if (err) 1417 return ERR_PTR(err); 1418 return NULL; 1419 } 1420 /* DM or ODP MR's don't have a normal umem so we can't re-use it */ 1421 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr)) 1422 goto recreate; 1423 1424 /* 1425 * Only one active MR can refer to a umem at one time, revoke 1426 * the old MR before assigning the umem to the new one. 
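		 * Once the revoke succeeds the umem is detached from this MR
		 * and handed to create_real_mr(), which builds a completely
		 * new mkey around it.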
1427 */ 1428 err = mlx5r_umr_revoke_mr(mr); 1429 if (err) 1430 return ERR_PTR(err); 1431 umem = mr->umem; 1432 mr->umem = NULL; 1433 atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages); 1434 1435 return create_real_mr(new_pd, umem, mr->ibmr.iova, 1436 new_access_flags); 1437 } 1438 1439 /* 1440 * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does 1441 * but the logic around releasing the umem is different 1442 */ 1443 if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr)) 1444 goto recreate; 1445 1446 if (!(new_access_flags & IB_ACCESS_ON_DEMAND) && 1447 can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) { 1448 struct ib_umem *new_umem; 1449 unsigned long page_size; 1450 1451 new_umem = ib_umem_get(&dev->ib_dev, start, length, 1452 new_access_flags); 1453 if (IS_ERR(new_umem)) 1454 return ERR_CAST(new_umem); 1455 1456 /* Fast path for PAS change */ 1457 if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova, 1458 &page_size)) { 1459 err = umr_rereg_pas(mr, new_pd, new_access_flags, flags, 1460 new_umem, iova, page_size); 1461 if (err) { 1462 ib_umem_release(new_umem); 1463 return ERR_PTR(err); 1464 } 1465 return NULL; 1466 } 1467 return create_real_mr(new_pd, new_umem, iova, new_access_flags); 1468 } 1469 1470 /* 1471 * Everything else has no state we can preserve, just create a new MR 1472 * from scratch 1473 */ 1474 recreate: 1475 return mlx5_ib_reg_user_mr(new_pd, start, length, iova, 1476 new_access_flags, udata); 1477 } 1478 1479 static int 1480 mlx5_alloc_priv_descs(struct ib_device *device, 1481 struct mlx5_ib_mr *mr, 1482 int ndescs, 1483 int desc_size) 1484 { 1485 struct mlx5_ib_dev *dev = to_mdev(device); 1486 struct device *ddev = &dev->mdev->pdev->dev; 1487 int size = ndescs * desc_size; 1488 int add_size; 1489 int ret; 1490 1491 add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0); 1492 1493 mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL); 1494 if (!mr->descs_alloc) 1495 return -ENOMEM; 1496 1497 mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN); 1498 1499 mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE); 1500 if (dma_mapping_error(ddev, mr->desc_map)) { 1501 ret = -ENOMEM; 1502 goto err; 1503 } 1504 1505 return 0; 1506 err: 1507 kfree(mr->descs_alloc); 1508 1509 return ret; 1510 } 1511 1512 static void 1513 mlx5_free_priv_descs(struct mlx5_ib_mr *mr) 1514 { 1515 if (!mr->umem && mr->descs) { 1516 struct ib_device *device = mr->ibmr.device; 1517 int size = mr->max_descs * mr->desc_size; 1518 struct mlx5_ib_dev *dev = to_mdev(device); 1519 1520 dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size, 1521 DMA_TO_DEVICE); 1522 kfree(mr->descs_alloc); 1523 mr->descs = NULL; 1524 } 1525 } 1526 1527 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) 1528 { 1529 struct mlx5_ib_mr *mr = to_mmr(ibmr); 1530 struct mlx5_ib_dev *dev = to_mdev(ibmr->device); 1531 int rc; 1532 1533 /* 1534 * Any async use of the mr must hold the refcount, once the refcount 1535 * goes to zero no other thread, such as ODP page faults, prefetch, any 1536 * UMR activity, etc can touch the mkey. Thus it is safe to destroy it. 
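	 * mlx5r_deref_wait_odp_mkey() below sleeps until the last such
	 * reference is dropped before the teardown continues.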
1537 */ 1538 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && 1539 refcount_read(&mr->mmkey.usecount) != 0 && 1540 xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key))) 1541 mlx5r_deref_wait_odp_mkey(&mr->mmkey); 1542 1543 if (ibmr->type == IB_MR_TYPE_INTEGRITY) { 1544 xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key), 1545 mr->sig, NULL, GFP_KERNEL); 1546 1547 if (mr->mtt_mr) { 1548 rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL); 1549 if (rc) 1550 return rc; 1551 mr->mtt_mr = NULL; 1552 } 1553 if (mr->klm_mr) { 1554 rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL); 1555 if (rc) 1556 return rc; 1557 mr->klm_mr = NULL; 1558 } 1559 1560 if (mlx5_core_destroy_psv(dev->mdev, 1561 mr->sig->psv_memory.psv_idx)) 1562 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", 1563 mr->sig->psv_memory.psv_idx); 1564 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx)) 1565 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", 1566 mr->sig->psv_wire.psv_idx); 1567 kfree(mr->sig); 1568 mr->sig = NULL; 1569 } 1570 1571 /* Stop DMA */ 1572 if (mr->cache_ent) { 1573 if (mlx5r_umr_revoke_mr(mr)) { 1574 spin_lock_irq(&mr->cache_ent->lock); 1575 mr->cache_ent->total_mrs--; 1576 spin_unlock_irq(&mr->cache_ent->lock); 1577 mr->cache_ent = NULL; 1578 } 1579 } 1580 if (!mr->cache_ent) { 1581 rc = destroy_mkey(to_mdev(mr->ibmr.device), mr); 1582 if (rc) 1583 return rc; 1584 } 1585 1586 if (mr->umem) { 1587 bool is_odp = is_odp_mr(mr); 1588 1589 if (!is_odp) 1590 atomic_sub(ib_umem_num_pages(mr->umem), 1591 &dev->mdev->priv.reg_pages); 1592 ib_umem_release(mr->umem); 1593 if (is_odp) 1594 mlx5_ib_free_odp_mr(mr); 1595 } 1596 1597 if (mr->cache_ent) { 1598 mlx5_mr_cache_free(dev, mr); 1599 } else { 1600 mlx5_free_priv_descs(mr); 1601 kfree(mr); 1602 } 1603 return 0; 1604 } 1605 1606 static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs, 1607 int access_mode, int page_shift) 1608 { 1609 void *mkc; 1610 1611 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1612 1613 /* This is only used from the kernel, so setting the PD is OK. 
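	 * Relaxed ordering is requested unconditionally here;
	 * set_mkc_access_pd_addr_fields() only applies it when the PCIe
	 * device and FW capabilities actually allow it.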
*/ 1614 set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd); 1615 MLX5_SET(mkc, mkc, free, 1); 1616 MLX5_SET(mkc, mkc, translations_octword_size, ndescs); 1617 MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3); 1618 MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7); 1619 MLX5_SET(mkc, mkc, umr_en, 1); 1620 MLX5_SET(mkc, mkc, log_page_size, page_shift); 1621 } 1622 1623 static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 1624 int ndescs, int desc_size, int page_shift, 1625 int access_mode, u32 *in, int inlen) 1626 { 1627 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1628 int err; 1629 1630 mr->access_mode = access_mode; 1631 mr->desc_size = desc_size; 1632 mr->max_descs = ndescs; 1633 1634 err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size); 1635 if (err) 1636 return err; 1637 1638 mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift); 1639 1640 err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); 1641 if (err) 1642 goto err_free_descs; 1643 1644 mr->mmkey.type = MLX5_MKEY_MR; 1645 mr->ibmr.lkey = mr->mmkey.key; 1646 mr->ibmr.rkey = mr->mmkey.key; 1647 1648 return 0; 1649 1650 err_free_descs: 1651 mlx5_free_priv_descs(mr); 1652 return err; 1653 } 1654 1655 static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd, 1656 u32 max_num_sg, u32 max_num_meta_sg, 1657 int desc_size, int access_mode) 1658 { 1659 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 1660 int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4); 1661 int page_shift = 0; 1662 struct mlx5_ib_mr *mr; 1663 u32 *in; 1664 int err; 1665 1666 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1667 if (!mr) 1668 return ERR_PTR(-ENOMEM); 1669 1670 mr->ibmr.pd = pd; 1671 mr->ibmr.device = pd->device; 1672 1673 in = kzalloc(inlen, GFP_KERNEL); 1674 if (!in) { 1675 err = -ENOMEM; 1676 goto err_free; 1677 } 1678 1679 if (access_mode == MLX5_MKC_ACCESS_MODE_MTT) 1680 page_shift = PAGE_SHIFT; 1681 1682 err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift, 1683 access_mode, in, inlen); 1684 if (err) 1685 goto err_free_in; 1686 1687 mr->umem = NULL; 1688 kfree(in); 1689 1690 return mr; 1691 1692 err_free_in: 1693 kfree(in); 1694 err_free: 1695 kfree(mr); 1696 return ERR_PTR(err); 1697 } 1698 1699 static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 1700 int ndescs, u32 *in, int inlen) 1701 { 1702 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt), 1703 PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in, 1704 inlen); 1705 } 1706 1707 static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 1708 int ndescs, u32 *in, int inlen) 1709 { 1710 return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm), 1711 0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen); 1712 } 1713 1714 static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr, 1715 int max_num_sg, int max_num_meta_sg, 1716 u32 *in, int inlen) 1717 { 1718 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1719 u32 psv_index[2]; 1720 void *mkc; 1721 int err; 1722 1723 mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL); 1724 if (!mr->sig) 1725 return -ENOMEM; 1726 1727 /* create mem & wire PSVs */ 1728 err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index); 1729 if (err) 1730 goto err_free_sig; 1731 1732 mr->sig->psv_memory.psv_idx = psv_index[0]; 1733 mr->sig->psv_wire.psv_idx = psv_index[1]; 1734 1735 mr->sig->sig_status_checked = true; 1736 mr->sig->sig_err_exists = false; 1737 /* Next UMR, Arm SIGERR */ 1738 
++mr->sig->sigerr_count; 1739 mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg, 1740 sizeof(struct mlx5_klm), 1741 MLX5_MKC_ACCESS_MODE_KLMS); 1742 if (IS_ERR(mr->klm_mr)) { 1743 err = PTR_ERR(mr->klm_mr); 1744 goto err_destroy_psv; 1745 } 1746 mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg, 1747 sizeof(struct mlx5_mtt), 1748 MLX5_MKC_ACCESS_MODE_MTT); 1749 if (IS_ERR(mr->mtt_mr)) { 1750 err = PTR_ERR(mr->mtt_mr); 1751 goto err_free_klm_mr; 1752 } 1753 1754 /* Set bsf descriptors for mkey */ 1755 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1756 MLX5_SET(mkc, mkc, bsf_en, 1); 1757 MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE); 1758 1759 err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0, 1760 MLX5_MKC_ACCESS_MODE_KLMS, in, inlen); 1761 if (err) 1762 goto err_free_mtt_mr; 1763 1764 err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key), 1765 mr->sig, GFP_KERNEL)); 1766 if (err) 1767 goto err_free_descs; 1768 return 0; 1769 1770 err_free_descs: 1771 destroy_mkey(dev, mr); 1772 mlx5_free_priv_descs(mr); 1773 err_free_mtt_mr: 1774 mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL); 1775 mr->mtt_mr = NULL; 1776 err_free_klm_mr: 1777 mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL); 1778 mr->klm_mr = NULL; 1779 err_destroy_psv: 1780 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx)) 1781 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", 1782 mr->sig->psv_memory.psv_idx); 1783 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx)) 1784 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", 1785 mr->sig->psv_wire.psv_idx); 1786 err_free_sig: 1787 kfree(mr->sig); 1788 1789 return err; 1790 } 1791 1792 static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd, 1793 enum ib_mr_type mr_type, u32 max_num_sg, 1794 u32 max_num_meta_sg) 1795 { 1796 struct mlx5_ib_dev *dev = to_mdev(pd->device); 1797 int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); 1798 int ndescs = ALIGN(max_num_sg, 4); 1799 struct mlx5_ib_mr *mr; 1800 u32 *in; 1801 int err; 1802 1803 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1804 if (!mr) 1805 return ERR_PTR(-ENOMEM); 1806 1807 in = kzalloc(inlen, GFP_KERNEL); 1808 if (!in) { 1809 err = -ENOMEM; 1810 goto err_free; 1811 } 1812 1813 mr->ibmr.device = pd->device; 1814 mr->umem = NULL; 1815 1816 switch (mr_type) { 1817 case IB_MR_TYPE_MEM_REG: 1818 err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen); 1819 break; 1820 case IB_MR_TYPE_SG_GAPS: 1821 err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen); 1822 break; 1823 case IB_MR_TYPE_INTEGRITY: 1824 err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg, 1825 max_num_meta_sg, in, inlen); 1826 break; 1827 default: 1828 mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type); 1829 err = -EINVAL; 1830 } 1831 1832 if (err) 1833 goto err_free_in; 1834 1835 kfree(in); 1836 1837 return &mr->ibmr; 1838 1839 err_free_in: 1840 kfree(in); 1841 err_free: 1842 kfree(mr); 1843 return ERR_PTR(err); 1844 } 1845 1846 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, 1847 u32 max_num_sg) 1848 { 1849 return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0); 1850 } 1851 1852 struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd, 1853 u32 max_num_sg, u32 max_num_meta_sg) 1854 { 1855 return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg, 1856 max_num_meta_sg); 1857 } 1858 1859 int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata) 1860 { 1861 struct mlx5_ib_dev *dev = to_mdev(ibmw->device); 1862 int inlen = 
MLX5_ST_SZ_BYTES(create_mkey_in); 1863 struct mlx5_ib_mw *mw = to_mmw(ibmw); 1864 unsigned int ndescs; 1865 u32 *in = NULL; 1866 void *mkc; 1867 int err; 1868 struct mlx5_ib_alloc_mw req = {}; 1869 struct { 1870 __u32 comp_mask; 1871 __u32 response_length; 1872 } resp = {}; 1873 1874 err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req))); 1875 if (err) 1876 return err; 1877 1878 if (req.comp_mask || req.reserved1 || req.reserved2) 1879 return -EOPNOTSUPP; 1880 1881 if (udata->inlen > sizeof(req) && 1882 !ib_is_udata_cleared(udata, sizeof(req), 1883 udata->inlen - sizeof(req))) 1884 return -EOPNOTSUPP; 1885 1886 ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4); 1887 1888 in = kzalloc(inlen, GFP_KERNEL); 1889 if (!in) { 1890 err = -ENOMEM; 1891 goto free; 1892 } 1893 1894 mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); 1895 1896 MLX5_SET(mkc, mkc, free, 1); 1897 MLX5_SET(mkc, mkc, translations_octword_size, ndescs); 1898 MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn); 1899 MLX5_SET(mkc, mkc, umr_en, 1); 1900 MLX5_SET(mkc, mkc, lr, 1); 1901 MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS); 1902 MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2))); 1903 MLX5_SET(mkc, mkc, qpn, 0xffffff); 1904 1905 err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen); 1906 if (err) 1907 goto free; 1908 1909 mw->mmkey.type = MLX5_MKEY_MW; 1910 ibmw->rkey = mw->mmkey.key; 1911 mw->mmkey.ndescs = ndescs; 1912 1913 resp.response_length = 1914 min(offsetofend(typeof(resp), response_length), udata->outlen); 1915 if (resp.response_length) { 1916 err = ib_copy_to_udata(udata, &resp, resp.response_length); 1917 if (err) 1918 goto free_mkey; 1919 } 1920 1921 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { 1922 err = mlx5r_store_odp_mkey(dev, &mw->mmkey); 1923 if (err) 1924 goto free_mkey; 1925 } 1926 1927 kfree(in); 1928 return 0; 1929 1930 free_mkey: 1931 mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key); 1932 free: 1933 kfree(in); 1934 return err; 1935 } 1936 1937 int mlx5_ib_dealloc_mw(struct ib_mw *mw) 1938 { 1939 struct mlx5_ib_dev *dev = to_mdev(mw->device); 1940 struct mlx5_ib_mw *mmw = to_mmw(mw); 1941 1942 if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && 1943 xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key))) 1944 /* 1945 * pagefault_single_data_segment() may be accessing mmw 1946 * if the user bound an ODP MR to this MW. 
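		 * mlx5r_deref_wait_odp_mkey() waits for any such access to
		 * finish before the MW's mkey is destroyed below.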
1947 */ 1948 mlx5r_deref_wait_odp_mkey(&mmw->mmkey); 1949 1950 return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key); 1951 } 1952 1953 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, 1954 struct ib_mr_status *mr_status) 1955 { 1956 struct mlx5_ib_mr *mmr = to_mmr(ibmr); 1957 int ret = 0; 1958 1959 if (check_mask & ~IB_MR_CHECK_SIG_STATUS) { 1960 pr_err("Invalid status check mask\n"); 1961 ret = -EINVAL; 1962 goto done; 1963 } 1964 1965 mr_status->fail_status = 0; 1966 if (check_mask & IB_MR_CHECK_SIG_STATUS) { 1967 if (!mmr->sig) { 1968 ret = -EINVAL; 1969 pr_err("signature status check requested on a non-signature enabled MR\n"); 1970 goto done; 1971 } 1972 1973 mmr->sig->sig_status_checked = true; 1974 if (!mmr->sig->sig_err_exists) 1975 goto done; 1976 1977 if (ibmr->lkey == mmr->sig->err_item.key) 1978 memcpy(&mr_status->sig_err, &mmr->sig->err_item, 1979 sizeof(mr_status->sig_err)); 1980 else { 1981 mr_status->sig_err.err_type = IB_SIG_BAD_GUARD; 1982 mr_status->sig_err.sig_err_offset = 0; 1983 mr_status->sig_err.key = mmr->sig->err_item.key; 1984 } 1985 1986 mmr->sig->sig_err_exists = false; 1987 mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS; 1988 } 1989 1990 done: 1991 return ret; 1992 } 1993 1994 static int 1995 mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg, 1996 int data_sg_nents, unsigned int *data_sg_offset, 1997 struct scatterlist *meta_sg, int meta_sg_nents, 1998 unsigned int *meta_sg_offset) 1999 { 2000 struct mlx5_ib_mr *mr = to_mmr(ibmr); 2001 unsigned int sg_offset = 0; 2002 int n = 0; 2003 2004 mr->meta_length = 0; 2005 if (data_sg_nents == 1) { 2006 n++; 2007 mr->mmkey.ndescs = 1; 2008 if (data_sg_offset) 2009 sg_offset = *data_sg_offset; 2010 mr->data_length = sg_dma_len(data_sg) - sg_offset; 2011 mr->data_iova = sg_dma_address(data_sg) + sg_offset; 2012 if (meta_sg_nents == 1) { 2013 n++; 2014 mr->meta_ndescs = 1; 2015 if (meta_sg_offset) 2016 sg_offset = *meta_sg_offset; 2017 else 2018 sg_offset = 0; 2019 mr->meta_length = sg_dma_len(meta_sg) - sg_offset; 2020 mr->pi_iova = sg_dma_address(meta_sg) + sg_offset; 2021 } 2022 ibmr->length = mr->data_length + mr->meta_length; 2023 } 2024 2025 return n; 2026 } 2027 2028 static int 2029 mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr, 2030 struct scatterlist *sgl, 2031 unsigned short sg_nents, 2032 unsigned int *sg_offset_p, 2033 struct scatterlist *meta_sgl, 2034 unsigned short meta_sg_nents, 2035 unsigned int *meta_sg_offset_p) 2036 { 2037 struct scatterlist *sg = sgl; 2038 struct mlx5_klm *klms = mr->descs; 2039 unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; 2040 u32 lkey = mr->ibmr.pd->local_dma_lkey; 2041 int i, j = 0; 2042 2043 mr->ibmr.iova = sg_dma_address(sg) + sg_offset; 2044 mr->ibmr.length = 0; 2045 2046 for_each_sg(sgl, sg, sg_nents, i) { 2047 if (unlikely(i >= mr->max_descs)) 2048 break; 2049 klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset); 2050 klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset); 2051 klms[i].key = cpu_to_be32(lkey); 2052 mr->ibmr.length += sg_dma_len(sg) - sg_offset; 2053 2054 sg_offset = 0; 2055 } 2056 2057 if (sg_offset_p) 2058 *sg_offset_p = sg_offset; 2059 2060 mr->mmkey.ndescs = i; 2061 mr->data_length = mr->ibmr.length; 2062 2063 if (meta_sg_nents) { 2064 sg = meta_sgl; 2065 sg_offset = meta_sg_offset_p ? 
static int
mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
		   struct scatterlist *sgl,
		   unsigned short sg_nents,
		   unsigned int *sg_offset_p,
		   struct scatterlist *meta_sgl,
		   unsigned short meta_sg_nents,
		   unsigned int *meta_sg_offset_p)
{
	struct scatterlist *sg = sgl;
	struct mlx5_klm *klms = mr->descs;
	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
	u32 lkey = mr->ibmr.pd->local_dma_lkey;
	int i, j = 0;

	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
	mr->ibmr.length = 0;

	for_each_sg(sgl, sg, sg_nents, i) {
		if (unlikely(i >= mr->max_descs))
			break;
		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
		klms[i].key = cpu_to_be32(lkey);
		mr->ibmr.length += sg_dma_len(sg) - sg_offset;

		sg_offset = 0;
	}

	if (sg_offset_p)
		*sg_offset_p = sg_offset;

	mr->mmkey.ndescs = i;
	mr->data_length = mr->ibmr.length;

	if (meta_sg_nents) {
		sg = meta_sgl;
		sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
			if (unlikely(i + j >= mr->max_descs))
				break;
			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
						     sg_offset);
			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
							 sg_offset);
			klms[i + j].key = cpu_to_be32(lkey);
			mr->ibmr.length += sg_dma_len(sg) - sg_offset;

			sg_offset = 0;
		}
		if (meta_sg_offset_p)
			*meta_sg_offset_p = sg_offset;

		mr->meta_ndescs = j;
		mr->meta_length = mr->ibmr.length - mr->data_length;
	}

	return i + j;
}

static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
		cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}

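/*
 * Map the data and protection-information SG lists onto the internal MTT MR.
 * If the SG entries cannot all be expressed as page-sized MTT descriptors,
 * fewer entries than requested are mapped and the caller falls back to the
 * KLM path. Illustrative example of the pi_iova computation below, assuming
 * a 4 KB page size: with data starting at 0x10000, four data pages, and the
 * metadata beginning at offset 0x200 into its first page,
 * pi_iova = 0x10000 + 4 * 0x1000 + 0x200 = 0x14200.
 */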
static int
mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	pi_mr->ibmr.page_size = ibmr->page_size;
	n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
			   mlx5_set_page);
	if (n != data_sg_nents)
		return n;

	pi_mr->data_iova = pi_mr->ibmr.iova;
	pi_mr->data_length = pi_mr->ibmr.length;
	pi_mr->ibmr.length = pi_mr->data_length;
	ibmr->length = pi_mr->data_length;

	if (meta_sg_nents) {
		u64 page_mask = ~((u64)ibmr->page_size - 1);
		u64 iova = pi_mr->data_iova;

		n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
				    meta_sg_offset, mlx5_set_page_pi);

		pi_mr->meta_length = pi_mr->ibmr.length;
		/*
		 * The PI address for the HW is the offset of the metadata
		 * address relative to the first data page address.
		 * It equals the first data page address + the size of the
		 * data pages + the metadata offset within the first
		 * metadata page.
		 */
		pi_mr->pi_iova = (iova & page_mask) +
				 pi_mr->mmkey.ndescs * ibmr->page_size +
				 (pi_mr->ibmr.iova & ~page_mask);
		/*
		 * In order to use one MTT MR for both data and metadata, we
		 * also register the gaps between the end of the data and the
		 * start of the metadata (the sig MR will verify that the HW
		 * accesses the right addresses). This mapping is safe
		 * because we use an internal mkey for the registration.
		 */
		pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
		pi_mr->ibmr.iova = iova;
		ibmr->length += pi_mr->meta_length;
	}

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}

static int
mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->klm_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
			       meta_sg, meta_sg_nents, meta_sg_offset);

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	/* This is a zero-based memory region */
	pi_mr->data_iova = 0;
	pi_mr->ibmr.iova = 0;
	pi_mr->pi_iova = pi_mr->data_length;
	ibmr->length = pi_mr->ibmr.length;

	return n;
}

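/*
 * Map the data and protection-information SG lists for an integrity MR.
 * Mapping is attempted in order of increasing cost: direct PA mapping via
 * local_dma_lkey first, then the internal MTT MR, and finally the internal
 * KLM MR, which can describe layouts that are not page aligned.
 */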
int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = NULL;
	int n;

	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);

	mr->mmkey.ndescs = 0;
	mr->data_length = 0;
	mr->data_iova = 0;
	mr->meta_ndescs = 0;
	mr->pi_iova = 0;
	/*
	 * As a performance optimization, avoid the UMR operation needed to
	 * register the data/metadata buffers when possible. First try to map
	 * the sg lists to PA descriptors with local_dma_lkey and fall back
	 * to UMR only if that fails.
	 */
	n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				    data_sg_offset, meta_sg, meta_sg_nents,
				    meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;
	/*
	 * Likewise, avoid mapping the sg lists to KLM descriptors when
	 * possible. First try to map them to MTT descriptors and fall back
	 * to KLM only if that fails. The HW works more efficiently with MTT
	 * descriptors (especially under high load); use KLM (indirect
	 * access) only when it is mandatory.
	 */
	pi_mr = mr->mtt_mr;
	n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;

	pi_mr = mr->klm_mr;
	n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (unlikely(n != data_sg_nents + meta_sg_nents))
		return -ENOMEM;

out:
	/* This is a zero-based memory region */
	ibmr->iova = 0;
	mr->pi_mr = pi_mr;
	if (pi_mr)
		ibmr->sig_attrs->meta_length = pi_mr->meta_length;
	else
		ibmr->sig_attrs->meta_length = mr->meta_length;

	return 0;
}

int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
		      unsigned int *sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	int n;

	mr->mmkey.ndescs = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
				   mr->desc_size * mr->max_descs,
				   DMA_TO_DEVICE);

	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
				       NULL);
	else
		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
				   mlx5_set_page);

	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
				      mr->desc_size * mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}