// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/anon_inodes.h>

#include "cmd.h"

/* Arbitrary to prevent userspace from consuming endless memory */
#define MAX_MIGRATION_SIZE (512*1024*1024)

static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
{
	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

	return container_of(core_device, struct mlx5vf_pci_core_device,
			    core_device);
}

static struct page *
mlx5vf_get_migration_page(struct mlx5_vf_migration_file *migf,
			  unsigned long offset)
{
	unsigned long cur_offset = 0;
	struct scatterlist *sg;
	unsigned int i;

	/* All accesses are sequential */
	if (offset < migf->last_offset || !migf->last_offset_sg) {
		migf->last_offset = 0;
		migf->last_offset_sg = migf->table.sgt.sgl;
		migf->sg_last_entry = 0;
	}

	cur_offset = migf->last_offset;

	for_each_sg(migf->last_offset_sg, sg,
		    migf->table.sgt.orig_nents - migf->sg_last_entry, i) {
		if (offset < sg->length + cur_offset) {
			migf->last_offset_sg = sg;
			migf->sg_last_entry += i;
			migf->last_offset = cur_offset;
			return nth_page(sg_page(sg),
					(offset - cur_offset) / PAGE_SIZE);
		}
		cur_offset += sg->length;
	}
	return NULL;
}

static int mlx5vf_add_migration_pages(struct mlx5_vf_migration_file *migf,
				      unsigned int npages)
{
	unsigned int to_alloc = npages;
	struct page **page_list;
	unsigned long filled;
	unsigned int to_fill;
	int ret;

	to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
	page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL);
	if (!page_list)
		return -ENOMEM;

	do {
		filled = alloc_pages_bulk_array(GFP_KERNEL, to_fill, page_list);
		if (!filled) {
			ret = -ENOMEM;
			goto err;
		}
		to_alloc -= filled;
		ret = sg_alloc_append_table_from_pages(
			&migf->table, page_list, filled, 0,
			filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
			GFP_KERNEL);

		if (ret)
			goto err;
		migf->allocated_length += filled * PAGE_SIZE;
		/* clean input for another bulk allocation */
		memset(page_list, 0, filled * sizeof(*page_list));
		to_fill = min_t(unsigned int, to_alloc,
				PAGE_SIZE / sizeof(*page_list));
	} while (to_alloc > 0);

	kvfree(page_list);
	return 0;

err:
	kvfree(page_list);
	return ret;
}

static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
	struct sg_page_iter sg_iter;

	mutex_lock(&migf->lock);
	/* Undo alloc_pages_bulk_array() */
	for_each_sgtable_page(&migf->table.sgt, &sg_iter, 0)
		__free_page(sg_page_iter_page(&sg_iter));
	sg_free_append_table(&migf->table);
	migf->disabled = true;
	migf->total_length = 0;
	migf->allocated_length = 0;
	migf->filp->f_pos = 0;
	mutex_unlock(&migf->lock);
}

static int mlx5vf_release_file(struct inode *inode, struct file *filp)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;

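	/* ->release() runs on the final fput(); drop the data pages along with the migf */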
	mlx5vf_disable_fd(migf);
	mutex_destroy(&migf->lock);
	kfree(migf);
	return 0;
}

static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
				loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	ssize_t done = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (!(filp->f_flags & O_NONBLOCK)) {
		if (wait_event_interruptible(migf->poll_wait,
			     READ_ONCE(migf->total_length) || migf->is_err))
			return -ERESTARTSYS;
	}

	mutex_lock(&migf->lock);
	if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(migf->total_length)) {
		done = -EAGAIN;
		goto out_unlock;
	}
	if (*pos > migf->total_length) {
		done = -EINVAL;
		goto out_unlock;
	}
	if (migf->disabled || migf->is_err) {
		done = -ENODEV;
		goto out_unlock;
	}

	len = min_t(size_t, migf->total_length - *pos, len);
	while (len) {
		size_t page_offset;
		struct page *page;
		size_t page_len;
		u8 *from_buff;
		int ret;

		page_offset = (*pos) % PAGE_SIZE;
		page = mlx5vf_get_migration_page(migf, *pos - page_offset);
		if (!page) {
			if (done == 0)
				done = -EINVAL;
			goto out_unlock;
		}

		page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
		from_buff = kmap_local_page(page);
		ret = copy_to_user(buf, from_buff + page_offset, page_len);
		kunmap_local(from_buff);
		if (ret) {
			done = -EFAULT;
			goto out_unlock;
		}
		*pos += page_len;
		len -= page_len;
		done += page_len;
		buf += page_len;
	}

out_unlock:
	mutex_unlock(&migf->lock);
	return done;
}

static __poll_t mlx5vf_save_poll(struct file *filp,
				 struct poll_table_struct *wait)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	__poll_t pollflags = 0;

	poll_wait(filp, &migf->poll_wait, wait);

	mutex_lock(&migf->lock);
	if (migf->disabled || migf->is_err)
		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
	else if (READ_ONCE(migf->total_length))
		pollflags = EPOLLIN | EPOLLRDNORM;
	mutex_unlock(&migf->lock);

	return pollflags;
}

static const struct file_operations mlx5vf_save_fops = {
	.owner = THIS_MODULE,
	.read = mlx5vf_save_read,
	.poll = mlx5vf_save_poll,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

static struct mlx5_vf_migration_file *
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
					O_RDONLY);
	if (IS_ERR(migf->filp)) {
		int err = PTR_ERR(migf->filp);

		kfree(migf);
		return ERR_PTR(err);
	}

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	init_waitqueue_head(&migf->poll_wait);
	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
						    &migf->total_length);
	if (ret)
		goto out_free;

	ret = mlx5vf_add_migration_pages(
		migf, DIV_ROUND_UP_ULL(migf->total_length, PAGE_SIZE));
	if (ret)
		goto out_free;

	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf);
	if (ret)
		goto out_free;
	return migf;
out_free:
	fput(migf->filp);
	return ERR_PTR(ret);
}

static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
				   size_t len, loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	loff_t requested_length;
	ssize_t done = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (*pos < 0 ||
	    check_add_overflow((loff_t)len, *pos, &requested_length))
		return -EINVAL;

	if (requested_length > MAX_MIGRATION_SIZE)
		return -ENOMEM;

	mutex_lock(&migf->lock);
	if (migf->disabled) {
		done = -ENODEV;
		goto out_unlock;
	}

	if (migf->allocated_length < requested_length) {
		done = mlx5vf_add_migration_pages(
			migf,
			DIV_ROUND_UP(requested_length - migf->allocated_length,
				     PAGE_SIZE));
		if (done)
			goto out_unlock;
	}

	while (len) {
		size_t page_offset;
		struct page *page;
		size_t page_len;
		u8 *to_buff;
		int ret;

		page_offset = (*pos) % PAGE_SIZE;
		page = mlx5vf_get_migration_page(migf, *pos - page_offset);
		if (!page) {
			if (done == 0)
				done = -EINVAL;
			goto out_unlock;
		}

		page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
		to_buff = kmap_local_page(page);
		ret = copy_from_user(to_buff + page_offset, buf, page_len);
		kunmap_local(to_buff);
		if (ret) {
			done = -EFAULT;
			goto out_unlock;
		}
		*pos += page_len;
		len -= page_len;
		done += page_len;
		buf += page_len;
		migf->total_length += page_len;
	}
out_unlock:
	mutex_unlock(&migf->lock);
	return done;
}

static const struct file_operations mlx5vf_resume_fops = {
	.owner = THIS_MODULE,
	.write = mlx5vf_resume_write,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
					O_WRONLY);
	if (IS_ERR(migf->filp)) {
		int err = PTR_ERR(migf->filp);

		kfree(migf);
		return ERR_PTR(err);
	}
	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	return migf;
}

void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
{
	if (mvdev->resuming_migf) {
		mlx5vf_disable_fd(mvdev->resuming_migf);
		fput(mvdev->resuming_migf->filp);
		mvdev->resuming_migf = NULL;
	}
	if (mvdev->saving_migf) {
		mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
		cancel_work_sync(&mvdev->saving_migf->async_data.work);
		mlx5vf_disable_fd(mvdev->saving_migf);
		fput(mvdev->saving_migf->filp);
		mvdev->saving_migf = NULL;
	}
}

static struct file *
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
				    u32 new)
{
	u32 cur = mvdev->mig_state;
	int ret;

	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

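	/*
	 * RUNNING -> RUNNING_P2P: suspend the vHCA on the initiator side so
	 * the device stops issuing new outbound (P2P) traffic; the responder
	 * side is only suspended later on the transition into STOP.
	 */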
	if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP)) {
		mlx5vf_disable_fds(mvdev);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_resume_device_data(mvdev);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->resuming_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
		ret = mlx5vf_cmd_load_vhca_state(mvdev,
						 mvdev->resuming_migf);
		if (ret)
			return ERR_PTR(ret);
		mlx5vf_disable_fds(mvdev);
		return NULL;
	}

	/*
	 * vfio_mig_get_next_state() does not use arcs other than the above
	 */
	WARN_ON(true);
	return ERR_PTR(-EINVAL);
}

/*
 * This function is called in all state_mutex unlock cases to
 * handle a 'deferred_reset' if exists.
 */
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
{
again:
	spin_lock(&mvdev->reset_lock);
	if (mvdev->deferred_reset) {
		mvdev->deferred_reset = false;
		spin_unlock(&mvdev->reset_lock);
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
		mlx5vf_disable_fds(mvdev);
		goto again;
	}
	mutex_unlock(&mvdev->state_mutex);
	spin_unlock(&mvdev->reset_lock);
}

static struct file *
mlx5vf_pci_set_device_state(struct vfio_device *vdev,
			    enum vfio_device_mig_state new_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	enum vfio_device_mig_state next_state;
	struct file *res = NULL;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	while (new_state != mvdev->mig_state) {
		ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
					      new_state, &next_state);
		if (ret) {
			res = ERR_PTR(ret);
			break;
		}
		res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
		if (IS_ERR(res))
			break;
		mvdev->mig_state = next_state;
		if (WARN_ON(res && new_state != mvdev->mig_state)) {
			fput(res);
			res = ERR_PTR(-EINVAL);
			break;
		}
	}
	mlx5vf_state_mutex_unlock(mvdev);
	return res;
}

static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
				       enum vfio_device_mig_state *curr_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mutex_lock(&mvdev->state_mutex);
	*curr_state = mvdev->mig_state;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;
}

static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

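	/* Devices without migration support have no migration state to reconcile */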
	if (!mvdev->migrate_cap)
		return;

	/*
	 * As the higher VFIO layers are holding locks across reset and using
	 * those same locks with the mm_lock we need to prevent ABBA deadlock
	 * with the state_mutex and mm_lock.
	 * In case the state_mutex was taken already we defer the cleanup work
	 * to the unlock flow of the other running context.
	 */
	spin_lock(&mvdev->reset_lock);
	mvdev->deferred_reset = true;
	if (!mutex_trylock(&mvdev->state_mutex)) {
		spin_unlock(&mvdev->reset_lock);
		return;
	}
	spin_unlock(&mvdev->reset_lock);
	mlx5vf_state_mutex_unlock(mvdev);
}

static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct vfio_pci_core_device *vdev = &mvdev->core_device;
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	if (mvdev->migrate_cap)
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
	vfio_pci_core_finish_enable(vdev);
	return 0;
}

static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_disable_fds(mvdev);
	vfio_pci_core_close_device(core_vdev);
}

static const struct vfio_device_ops mlx5vf_pci_ops = {
	.name = "mlx5-vfio-pci",
	.open_device = mlx5vf_pci_open_device,
	.close_device = mlx5vf_pci_close_device,
	.ioctl = vfio_pci_core_ioctl,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = vfio_pci_core_read,
	.write = vfio_pci_core_write,
	.mmap = vfio_pci_core_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
	.migration_set_state = mlx5vf_pci_set_device_state,
	.migration_get_state = mlx5vf_pci_get_device_state,
};

static int mlx5vf_pci_probe(struct pci_dev *pdev,
			    const struct pci_device_id *id)
{
	struct mlx5vf_pci_core_device *mvdev;
	int ret;

	mvdev = kzalloc(sizeof(*mvdev), GFP_KERNEL);
	if (!mvdev)
		return -ENOMEM;
	vfio_pci_core_init_device(&mvdev->core_device, pdev, &mlx5vf_pci_ops);
	mlx5vf_cmd_set_migratable(mvdev);
	dev_set_drvdata(&pdev->dev, &mvdev->core_device);
	ret = vfio_pci_core_register_device(&mvdev->core_device);
	if (ret)
		goto out_free;
	return 0;

out_free:
	mlx5vf_cmd_remove_migratable(mvdev);
	vfio_pci_core_uninit_device(&mvdev->core_device);
	kfree(mvdev);
	return ret;
}

static void mlx5vf_pci_remove(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	vfio_pci_core_unregister_device(&mvdev->core_device);
	mlx5vf_cmd_remove_migratable(mvdev);
	vfio_pci_core_uninit_device(&mvdev->core_device);
	kfree(mvdev);
}

static const struct pci_device_id mlx5vf_pci_table[] = {
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
	{}
};

MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);

static const struct pci_error_handlers mlx5vf_err_handlers = {
	.reset_done = mlx5vf_pci_aer_reset_done,
	.error_detected = vfio_pci_core_aer_err_detected,
};

static struct pci_driver mlx5vf_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = mlx5vf_pci_table,
	.probe = mlx5vf_pci_probe,
	.remove = mlx5vf_pci_remove,
	.err_handler = &mlx5vf_err_handlers,
	.driver_managed_dma = true,
};

static void __exit mlx5vf_pci_cleanup(void)
{
	pci_unregister_driver(&mlx5vf_pci_driver);
}

static int __init mlx5vf_pci_init(void)
{
	return pci_register_driver(&mlx5vf_pci_driver);
}

module_init(mlx5vf_pci_init);
module_exit(mlx5vf_pci_cleanup);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
MODULE_DESCRIPTION(
	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");
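
/*
 * Usage sketch (not part of the driver itself): a ConnectX VF can be rebound
 * to this driver through the standard sysfs driver_override flow. The PCI
 * address below is a hypothetical example and the driver name assumes the
 * module builds as mlx5_vfio_pci:
 *
 *   echo mlx5_vfio_pci > /sys/bus/pci/devices/0000:08:00.2/driver_override
 *   echo 0000:08:00.2 > /sys/bus/pci/devices/0000:08:00.2/driver/unbind
 *   echo 0000:08:00.2 > /sys/bus/pci/drivers_probe
 */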