// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/vfio_pci_core.h>
#include <linux/anon_inodes.h>

#include "cmd.h"

/* Arbitrary to prevent userspace from consuming endless memory */
#define MAX_MIGRATION_SIZE (512*1024*1024)

struct mlx5vf_pci_core_device {
	struct vfio_pci_core_device core_device;
	u16 vhca_id;
	u8 migrate_cap:1;
	u8 deferred_reset:1;
	/* protect migration state */
	struct mutex state_mutex;
	enum vfio_device_mig_state mig_state;
	/* protect the reset_done flow */
	spinlock_t reset_lock;
	struct mlx5_vf_migration_file *resuming_migf;
	struct mlx5_vf_migration_file *saving_migf;
};

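/*
 * Translate a byte offset into the migration data to the page backing it.
 *
 * The scatter-gather table is only ever walked sequentially (save data is
 * read front to back, resume data is written front to back), so the last
 * visited entry is cached in the migration file and the walk only restarts
 * from the head of the table when the requested offset moves backwards.
 */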
static struct page *
mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf,
			  unsigned long offset)
{
	unsigned long cur_offset = 0;
	struct scatterlist *sg;
	unsigned int i;

	/* All accesses are sequential */
	if (offset < migf->last_offset || !migf->last_offset_sg) {
		migf->last_offset = 0;
		migf->last_offset_sg = migf->table.sgt.sgl;
		migf->sg_last_entry = 0;
	}

	cur_offset = migf->last_offset;

	for_each_sg(migf->last_offset_sg, sg,
		    migf->table.sgt.orig_nents - migf->sg_last_entry, i) {
		if (offset < sg->length + cur_offset) {
			migf->last_offset_sg = sg;
			migf->sg_last_entry += i;
			migf->last_offset = cur_offset;
			return nth_page(sg_page(sg),
					(offset - cur_offset) / PAGE_SIZE);
		}
		cur_offset += sg->length;
	}
	return NULL;
}

static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf,
				      unsigned int npages)
{
	unsigned int to_alloc = npages;
	struct page **page_list;
	unsigned long filled;
	unsigned int to_fill;
	int ret;

	to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
	page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL);
	if (!page_list)
		return -ENOMEM;

	do {
		filled = alloc_pages_bulk_array(GFP_KERNEL, to_fill, page_list);
		if (!filled) {
			ret = -ENOMEM;
			goto err;
		}
		to_alloc -= filled;
		ret = sg_alloc_append_table_from_pages(
			&migf->table, page_list, filled, 0,
			filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
			GFP_KERNEL);

		if (ret)
			goto err;
		migf->allocated_length += filled * PAGE_SIZE;
		/* clean input for another bulk allocation */
		memset(page_list, 0, filled * sizeof(*page_list));
		to_fill = min_t(unsigned int, to_alloc,
				PAGE_SIZE / sizeof(*page_list));
	} while (to_alloc > 0);

	kvfree(page_list);
	return 0;

err:
	kvfree(page_list);
	return ret;
}

static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
	struct sg_page_iter sg_iter;

	mutex_lock(&migf->lock);
	/* Undo alloc_pages_bulk_array() */
	for_each_sgtable_page(&migf->table.sgt, &sg_iter, 0)
		__free_page(sg_page_iter_page(&sg_iter));
	sg_free_append_table(&migf->table);
	migf->disabled = true;
	migf->total_length = 0;
	migf->allocated_length = 0;
	migf->filp->f_pos = 0;
	mutex_unlock(&migf->lock);
}

static int mlx5vf_release_file(struct inode *inode, struct file *filp)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;

	mlx5vf_disable_fd(migf);
	mutex_destroy(&migf->lock);
	kfree(migf);
	return 0;
}

static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
				loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	ssize_t done = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	mutex_lock(&migf->lock);
	if (*pos > migf->total_length) {
		done = -EINVAL;
		goto out_unlock;
	}
	if (migf->disabled) {
		done = -ENODEV;
		goto out_unlock;
	}

	len = min_t(size_t, migf->total_length - *pos, len);
	while (len) {
		size_t page_offset;
		struct page *page;
		size_t page_len;
		u8 *from_buff;
		int ret;

		page_offset = (*pos) % PAGE_SIZE;
		page = mlx5vf_get_migration_page(migf, *pos - page_offset);
		if (!page) {
			if (done == 0)
				done = -EINVAL;
			goto out_unlock;
		}

		page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
		from_buff = kmap_local_page(page);
		ret = copy_to_user(buf, from_buff + page_offset, page_len);
		kunmap_local(from_buff);
		if (ret) {
			done = -EFAULT;
			goto out_unlock;
		}
		*pos += page_len;
		len -= page_len;
		done += page_len;
		buf += page_len;
	}

out_unlock:
	mutex_unlock(&migf->lock);
	return done;
}

static const struct file_operations mlx5vf_save_fops = {
	.owner = THIS_MODULE,
	.read = mlx5vf_save_read,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

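/*
 * Create the saving migration file: query the required state size from the
 * device, pre-allocate that many pages and let mlx5vf_cmd_save_vhca_state()
 * fill them. The returned file is read-only for userspace.
 */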
static struct mlx5_vf_migration_file *
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
					O_RDONLY);
	if (IS_ERR(migf->filp)) {
		int err = PTR_ERR(migf->filp);

		kfree(migf);
		return ERR_PTR(err);
	}

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);

	ret = mlx5vf_cmd_query_vhca_migration_state(
		mvdev->core_device.pdev, mvdev->vhca_id, &migf->total_length);
	if (ret)
		goto out_free;

	ret = mlx5vf_add_migration_pages(
		migf, DIV_ROUND_UP_ULL(migf->total_length, PAGE_SIZE));
	if (ret)
		goto out_free;

	ret = mlx5vf_cmd_save_vhca_state(mvdev->core_device.pdev,
					 mvdev->vhca_id, migf);
	if (ret)
		goto out_free;
	return migf;
out_free:
	fput(migf->filp);
	return ERR_PTR(ret);
}

static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
				   size_t len, loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	loff_t requested_length;
	ssize_t done = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (*pos < 0 ||
	    check_add_overflow((loff_t)len, *pos, &requested_length))
		return -EINVAL;

	if (requested_length > MAX_MIGRATION_SIZE)
		return -ENOMEM;

	mutex_lock(&migf->lock);
	if (migf->disabled) {
		done = -ENODEV;
		goto out_unlock;
	}

	if (migf->allocated_length < requested_length) {
		done = mlx5vf_add_migration_pages(
			migf,
			DIV_ROUND_UP(requested_length - migf->allocated_length,
				     PAGE_SIZE));
		if (done)
			goto out_unlock;
	}

	while (len) {
		size_t page_offset;
		struct page *page;
		size_t page_len;
		u8 *to_buff;
		int ret;

		page_offset = (*pos) % PAGE_SIZE;
		page = mlx5vf_get_migration_page(migf, *pos - page_offset);
		if (!page) {
			if (done == 0)
				done = -EINVAL;
			goto out_unlock;
		}

		page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
		to_buff = kmap_local_page(page);
		ret = copy_from_user(to_buff + page_offset, buf, page_len);
		kunmap_local(to_buff);
		if (ret) {
			done = -EFAULT;
			goto out_unlock;
		}
		*pos += page_len;
		len -= page_len;
		done += page_len;
		buf += page_len;
		migf->total_length += page_len;
	}
out_unlock:
	mutex_unlock(&migf->lock);
	return done;
}

static const struct file_operations mlx5vf_resume_fops = {
	.owner = THIS_MODULE,
	.write = mlx5vf_resume_write,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
					O_WRONLY);
	if (IS_ERR(migf->filp)) {
		int err = PTR_ERR(migf->filp);

		kfree(migf);
		return ERR_PTR(err);
	}
	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	return migf;
}

static void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
{
	if (mvdev->resuming_migf) {
		mlx5vf_disable_fd(mvdev->resuming_migf);
		fput(mvdev->resuming_migf->filp);
		mvdev->resuming_migf = NULL;
	}
	if (mvdev->saving_migf) {
		mlx5vf_disable_fd(mvdev->saving_migf);
		fput(mvdev->saving_migf->filp);
		mvdev->saving_migf = NULL;
	}
}

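/*
 * Execute a single migration FSM arc. Only the arcs below are implemented;
 * vfio_mig_get_next_state() decomposes any other requested transition into
 * a sequence of them:
 *
 *   RUNNING     <-> RUNNING_P2P   suspend/resume the initiator side
 *   RUNNING_P2P <-> STOP          suspend/resume the responder side
 *   STOP         -> STOP_COPY     create the saving file (returned to user)
 *   STOP_COPY    -> STOP          tear down the saving file
 *   STOP         -> RESUMING      create the resuming file (returned to user)
 *   RESUMING     -> STOP          load the written state into the device
 */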
static struct file *
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
				    u32 new)
{
	u32 cur = mvdev->mig_state;
	int ret;

	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
		ret = mlx5vf_cmd_suspend_vhca(
			mvdev->core_device.pdev, mvdev->vhca_id,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
		ret = mlx5vf_cmd_resume_vhca(
			mvdev->core_device.pdev, mvdev->vhca_id,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
		ret = mlx5vf_cmd_suspend_vhca(
			mvdev->core_device.pdev, mvdev->vhca_id,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) {
		ret = mlx5vf_cmd_resume_vhca(
			mvdev->core_device.pdev, mvdev->vhca_id,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
		mlx5vf_disable_fds(mvdev);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_resume_device_data(mvdev);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->resuming_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
		ret = mlx5vf_cmd_load_vhca_state(mvdev->core_device.pdev,
						 mvdev->vhca_id,
						 mvdev->resuming_migf);
		if (ret)
			return ERR_PTR(ret);
		mlx5vf_disable_fds(mvdev);
		return NULL;
	}

	/*
	 * vfio_mig_get_next_state() does not use arcs other than the above
	 */
	WARN_ON(true);
	return ERR_PTR(-EINVAL);
}

/*
 * This function is called in all state_mutex unlock cases to
 * handle a 'deferred_reset' if one exists.
 */
static void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
{
again:
	spin_lock(&mvdev->reset_lock);
	if (mvdev->deferred_reset) {
		mvdev->deferred_reset = false;
		spin_unlock(&mvdev->reset_lock);
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
		mlx5vf_disable_fds(mvdev);
		goto again;
	}
	mutex_unlock(&mvdev->state_mutex);
	spin_unlock(&mvdev->reset_lock);
}

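/*
 * The VFIO core hands us a single target state; walk towards it one arc at
 * a time using vfio_mig_get_next_state(). A file returned by an
 * intermediate arc is only valid if that arc reaches the requested state,
 * otherwise the transition is failed.
 */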
static struct file *
mlx5vf_pci_set_device_state(struct vfio_device *vdev,
			    enum vfio_device_mig_state new_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	enum vfio_device_mig_state next_state;
	struct file *res = NULL;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	while (new_state != mvdev->mig_state) {
		ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
					      new_state, &next_state);
		if (ret) {
			res = ERR_PTR(ret);
			break;
		}
		res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
		if (IS_ERR(res))
			break;
		mvdev->mig_state = next_state;
		if (WARN_ON(res && new_state != mvdev->mig_state)) {
			fput(res);
			res = ERR_PTR(-EINVAL);
			break;
		}
	}
	mlx5vf_state_mutex_unlock(mvdev);
	return res;
}

static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
				       enum vfio_device_mig_state *curr_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mutex_lock(&mvdev->state_mutex);
	*curr_state = mvdev->mig_state;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;
}

static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev);

	if (!mvdev->migrate_cap)
		return;

	/*
	 * As the higher VFIO layers are holding locks across reset and using
	 * those same locks with the mm_lock, we need to prevent an ABBA
	 * deadlock with the state_mutex and mm_lock.
	 * In case the state_mutex was already taken, we defer the cleanup work
	 * to the unlock flow of the other running context.
	 */
	spin_lock(&mvdev->reset_lock);
	mvdev->deferred_reset = true;
	if (!mutex_trylock(&mvdev->state_mutex)) {
		spin_unlock(&mvdev->reset_lock);
		return;
	}
	spin_unlock(&mvdev->reset_lock);
	mlx5vf_state_mutex_unlock(mvdev);
}

static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct vfio_pci_core_device *vdev = &mvdev->core_device;
	int vf_id;
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	if (!mvdev->migrate_cap) {
		vfio_pci_core_finish_enable(vdev);
		return 0;
	}

	vf_id = pci_iov_vf_id(vdev->pdev);
	if (vf_id < 0) {
		ret = vf_id;
		goto out_disable;
	}

	ret = mlx5vf_cmd_get_vhca_id(vdev->pdev, vf_id + 1, &mvdev->vhca_id);
	if (ret)
		goto out_disable;

	mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
	vfio_pci_core_finish_enable(vdev);
	return 0;
out_disable:
	vfio_pci_core_disable(vdev);
	return ret;
}

static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_disable_fds(mvdev);
	vfio_pci_core_close_device(core_vdev);
}

static const struct vfio_device_ops mlx5vf_pci_ops = {
	.name = "mlx5-vfio-pci",
	.open_device = mlx5vf_pci_open_device,
	.close_device = mlx5vf_pci_close_device,
	.ioctl = vfio_pci_core_ioctl,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = vfio_pci_core_read,
	.write = vfio_pci_core_write,
	.mmap = vfio_pci_core_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
	.migration_set_state = mlx5vf_pci_set_device_state,
	.migration_get_state = mlx5vf_pci_get_device_state,
};

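/*
 * Migration support is only advertised when the device is an SR-IOV VF and
 * the mlx5 core device reports the migration capability; otherwise the
 * device is exposed as a plain vfio-pci device without migration_flags.
 */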
static int mlx5vf_pci_probe(struct pci_dev *pdev,
			    const struct pci_device_id *id)
{
	struct mlx5vf_pci_core_device *mvdev;
	int ret;

	mvdev = kzalloc(sizeof(*mvdev), GFP_KERNEL);
	if (!mvdev)
		return -ENOMEM;
	vfio_pci_core_init_device(&mvdev->core_device, pdev, &mlx5vf_pci_ops);

	if (pdev->is_virtfn) {
		struct mlx5_core_dev *mdev =
			mlx5_vf_get_core_dev(pdev);

		if (mdev) {
			if (MLX5_CAP_GEN(mdev, migration)) {
				mvdev->migrate_cap = 1;
				mvdev->core_device.vdev.migration_flags =
					VFIO_MIGRATION_STOP_COPY |
					VFIO_MIGRATION_P2P;
				mutex_init(&mvdev->state_mutex);
				spin_lock_init(&mvdev->reset_lock);
			}
			mlx5_vf_put_core_dev(mdev);
		}
	}

	ret = vfio_pci_core_register_device(&mvdev->core_device);
	if (ret)
		goto out_free;

	dev_set_drvdata(&pdev->dev, mvdev);
	return 0;

out_free:
	vfio_pci_core_uninit_device(&mvdev->core_device);
	kfree(mvdev);
	return ret;
}

static void mlx5vf_pci_remove(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = dev_get_drvdata(&pdev->dev);

	vfio_pci_core_unregister_device(&mvdev->core_device);
	vfio_pci_core_uninit_device(&mvdev->core_device);
	kfree(mvdev);
}

static const struct pci_device_id mlx5vf_pci_table[] = {
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
	{}
};

MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);

static const struct pci_error_handlers mlx5vf_err_handlers = {
	.reset_done = mlx5vf_pci_aer_reset_done,
	.error_detected = vfio_pci_core_aer_err_detected,
};

static struct pci_driver mlx5vf_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = mlx5vf_pci_table,
	.probe = mlx5vf_pci_probe,
	.remove = mlx5vf_pci_remove,
	.err_handler = &mlx5vf_err_handlers,
};

static void __exit mlx5vf_pci_cleanup(void)
{
	pci_unregister_driver(&mlx5vf_pci_driver);
}

static int __init mlx5vf_pci_init(void)
{
	return pci_register_driver(&mlx5vf_pci_driver);
}

module_init(mlx5vf_pci_init);
module_exit(mlx5vf_pci_cleanup);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
MODULE_DESCRIPTION(
	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");
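
/*
 * Illustrative userspace flow for the save side (a rough sketch, not part
 * of this driver; it assumes the VFIO migration v2 uAPI declared in
 * <linux/vfio.h>):
 *
 *	size_t sz = sizeof(struct vfio_device_feature) +
 *		    sizeof(struct vfio_device_feature_mig_state);
 *	struct vfio_device_feature *feature = calloc(1, sz);
 *	struct vfio_device_feature_mig_state *mig = (void *)feature->data;
 *
 *	feature->argsz = sz;
 *	feature->flags = VFIO_DEVICE_FEATURE_SET |
 *			 VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE;
 *	mig->device_state = VFIO_DEVICE_STATE_STOP_COPY;
 *	ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);
 *
 * On success mig->data_fd refers to the saving file created by
 * mlx5vf_pci_save_device_data(); read() it to EOF to collect the device
 * state. On the destination, the same ioctl with VFIO_DEVICE_STATE_RESUMING
 * returns a write-only data_fd that accepts those bytes before the device
 * is moved to STOP and then RUNNING.
 */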