// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/anon_inodes.h>

#include "cmd.h"

/* Arbitrary to prevent userspace from consuming endless memory */
#define MAX_MIGRATION_SIZE (512*1024*1024)

static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
{
	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

	return container_of(core_device, struct mlx5vf_pci_core_device,
			    core_device);
}

struct page *
mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
			  unsigned long offset)
{
	unsigned long cur_offset = 0;
	struct scatterlist *sg;
	unsigned int i;

	/* All accesses are sequential */
	if (offset < buf->last_offset || !buf->last_offset_sg) {
		buf->last_offset = 0;
		buf->last_offset_sg = buf->table.sgt.sgl;
		buf->sg_last_entry = 0;
	}

	cur_offset = buf->last_offset;

	for_each_sg(buf->last_offset_sg, sg,
		    buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
		if (offset < sg->length + cur_offset) {
			buf->last_offset_sg = sg;
			buf->sg_last_entry += i;
			buf->last_offset = cur_offset;
			return nth_page(sg_page(sg),
					(offset - cur_offset) / PAGE_SIZE);
		}
		cur_offset += sg->length;
	}
	return NULL;
}

int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
			       unsigned int npages)
{
	unsigned int to_alloc = npages;
	struct page **page_list;
	unsigned long filled;
	unsigned int to_fill;
	int ret;

	to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
	page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL);
	if (!page_list)
		return -ENOMEM;

	do {
		filled = alloc_pages_bulk_array(GFP_KERNEL, to_fill, page_list);
		if (!filled) {
			ret = -ENOMEM;
			goto err;
		}
		to_alloc -= filled;
		ret = sg_alloc_append_table_from_pages(
			&buf->table, page_list, filled, 0,
			filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
			GFP_KERNEL);

		if (ret)
			goto err;
		buf->allocated_length += filled * PAGE_SIZE;
		/* clean input for another bulk allocation */
		memset(page_list, 0, filled * sizeof(*page_list));
		to_fill = min_t(unsigned int, to_alloc,
				PAGE_SIZE / sizeof(*page_list));
	} while (to_alloc > 0);

	kvfree(page_list);
	return 0;

err:
	kvfree(page_list);
	return ret;
}

static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
	mutex_lock(&migf->lock);
	migf->state = MLX5_MIGF_STATE_ERROR;
	migf->filp->f_pos = 0;
	mutex_unlock(&migf->lock);
}

static int mlx5vf_release_file(struct inode *inode, struct file *filp)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;

	mlx5vf_disable_fd(migf);
	mutex_destroy(&migf->lock);
	kfree(migf);
	return 0;
}

static struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
			      bool *end_of_data)
{
	struct mlx5_vhca_data_buffer *buf;
	bool found = false;

	*end_of_data = false;
	spin_lock_irq(&migf->list_lock);
	if (list_empty(&migf->buf_list)) {
		*end_of_data = true;
		goto end;
	}

	buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
			       buf_elm);
	if (pos >= buf->start_pos &&
	    pos < buf->start_pos + buf->length) {
		found = true;
		goto end;
	}

	/*
	 * As this is a stream-based FD, the data is always expected to be
	 * found in the first chunk.
	 */
	migf->state = MLX5_MIGF_STATE_ERROR;

end:
	spin_unlock_irq(&migf->list_lock);
	return found ? buf : NULL;
}

static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
			       char __user **buf, size_t *len, loff_t *pos)
{
	unsigned long offset;
	ssize_t done = 0;
	size_t copy_len;

	copy_len = min_t(size_t,
			 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
	while (copy_len) {
		size_t page_offset;
		struct page *page;
		size_t page_len;
		u8 *from_buff;
		int ret;

		offset = *pos - vhca_buf->start_pos;
		page_offset = offset % PAGE_SIZE;
		offset -= page_offset;
		page = mlx5vf_get_migration_page(vhca_buf, offset);
		if (!page)
			return -EINVAL;
		page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
		from_buff = kmap_local_page(page);
		ret = copy_to_user(*buf, from_buff + page_offset, page_len);
		kunmap_local(from_buff);
		if (ret)
			return -EFAULT;
		*pos += page_len;
		*len -= page_len;
		*buf += page_len;
		done += page_len;
		copy_len -= page_len;
	}

	if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
		spin_lock_irq(&vhca_buf->migf->list_lock);
		list_del_init(&vhca_buf->buf_elm);
		list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
		spin_unlock_irq(&vhca_buf->migf->list_lock);
	}

	return done;
}

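/*
 * read() handler of the save (source side) migration FD.
 *
 * Data produced by the SAVE command is consumed sequentially from
 * migf->buf_list. With O_NONBLOCK the caller gets -EAGAIN while more data
 * is still being prepared; otherwise the read blocks on poll_wait until a
 * buffer is queued, the stream completes or an error occurs. A minimal
 * userspace sketch (illustrative only; 'data_fd', 'buffer' and 'to_read'
 * are placeholders and error handling is omitted):
 *
 *	struct pollfd pfd = { .fd = data_fd, .events = POLLIN };
 *
 *	poll(&pfd, 1, -1);
 *	read(data_fd, buffer, to_read);
 */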
static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
				loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf;
	bool first_loop_call = true;
	bool end_of_data;
	ssize_t done = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (!(filp->f_flags & O_NONBLOCK)) {
		if (wait_event_interruptible(migf->poll_wait,
				!list_empty(&migf->buf_list) ||
				migf->state == MLX5_MIGF_STATE_ERROR ||
				migf->state == MLX5_MIGF_STATE_COMPLETE))
			return -ERESTARTSYS;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		done = -ENODEV;
		goto out_unlock;
	}

	while (len) {
		ssize_t count;

		vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
							 &end_of_data);
		if (first_loop_call) {
			first_loop_call = false;
			if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
				if (filp->f_flags & O_NONBLOCK) {
					done = -EAGAIN;
					goto out_unlock;
				}
			}
		}

		if (end_of_data)
			goto out_unlock;

		if (!vhca_buf) {
			done = -EINVAL;
			goto out_unlock;
		}

		count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
		if (count < 0) {
			done = count;
			goto out_unlock;
		}
		done += count;
	}

out_unlock:
	mutex_unlock(&migf->lock);
	return done;
}

static __poll_t mlx5vf_save_poll(struct file *filp,
				 struct poll_table_struct *wait)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	__poll_t pollflags = 0;

	poll_wait(filp, &migf->poll_wait, wait);

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR)
		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
	else if (!list_empty(&migf->buf_list) ||
		 migf->state == MLX5_MIGF_STATE_COMPLETE)
		pollflags = EPOLLIN | EPOLLRDNORM;
	mutex_unlock(&migf->lock);

	return pollflags;
}

/*
 * The FD is exposed and userspace can keep using it after receiving an
 * error. Mark the migf as being in error and wake up any waiter.
 */
static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
{
	migf->state = MLX5_MIGF_STATE_ERROR;
	wake_up_interruptible(&migf->poll_wait);
}

static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
				 unsigned long arg)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;
	struct vfio_precopy_info info = {};
	loff_t *pos = &filp->f_pos;
	unsigned long minsz;
	size_t inc_length = 0;
	bool end_of_data;
	int ret;

	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
		return -ENOTTY;

	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);

	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
		ret = -EINVAL;
		goto err_state_unlock;
	}

	/*
	 * We can't issue a SAVE command when the device is suspended, so as
	 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P there is no reason to query
	 * for extra bytes that can't be read.
	 */
	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
		/*
		 * Once the query returns it is guaranteed that there is no
		 * active SAVE command.
		 * As such, the code below is safe with the proper locks.
		 */
		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
							    MLX5VF_QUERY_INC);
		if (ret)
			goto err_state_unlock;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto err_migf_unlock;
	}

	buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data);
	if (buf) {
		if (buf->start_pos == 0) {
			info.initial_bytes = buf->header_image_size - *pos;
		} else if (buf->start_pos ==
				sizeof(struct mlx5_vf_migration_header)) {
			/* First data buffer following the header */
			info.initial_bytes = buf->start_pos +
						buf->length - *pos;
		} else {
			info.dirty_bytes = buf->start_pos + buf->length - *pos;
		}
	} else {
		if (!end_of_data) {
			ret = -EINVAL;
			goto err_migf_unlock;
		}

		info.dirty_bytes = inc_length;
	}

	if (!end_of_data || !inc_length) {
		mutex_unlock(&migf->lock);
		goto done;
	}

	mutex_unlock(&migf->lock);
	/*
	 * We finished transferring the current state and the device has
	 * additional dirty state; trigger a new SAVE so the next chunk is
	 * ready to be read.
	 */
	buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		mlx5vf_mark_err(migf);
		goto err_state_unlock;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
	if (ret) {
		mlx5vf_mark_err(migf);
		mlx5vf_put_data_buffer(buf);
		goto err_state_unlock;
	}

done:
	mlx5vf_state_mutex_unlock(mvdev);
	return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
err_migf_unlock:
	mutex_unlock(&migf->lock);
err_state_unlock:
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static const struct file_operations mlx5vf_save_fops = {
	.owner = THIS_MODULE,
	.read = mlx5vf_save_read,
	.poll = mlx5vf_save_poll,
	.unlocked_ioctl = mlx5vf_precopy_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	int ret;

	if (migf->state == MLX5_MIGF_STATE_ERROR)
		return -ENODEV;

	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
						    MLX5VF_QUERY_INC);
	if (ret)
		goto err;

	buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto err;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
	if (ret)
		goto err_save;

	return 0;

err_save:
	mlx5vf_put_data_buffer(buf);
err:
	mlx5vf_mark_err(migf);
	return ret;
}

static struct mlx5_vf_migration_file *
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
					O_RDONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		goto end;
	}

	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	init_waitqueue_head(&migf->poll_wait);
	init_completion(&migf->save_comp);
	/*
	 * save_comp is being used as a binary semaphore built from
	 * a completion. A normal mutex cannot be used because the lock is
	 * passed between kernel threads and lockdep can't model this.
	 */
	complete(&migf->save_comp);
	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
	if (ret)
		goto out_pd;

	buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_pd;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
	if (ret)
		goto out_save;
	return migf;
out_save:
	mlx5vf_free_data_buffer(buf);
out_pd:
	mlx5vf_cmd_dealloc_pd(migf);
out_free:
	fput(migf->filp);
end:
	kfree(migf);
	return ERR_PTR(ret);
}

static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
				   size_t len, loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
	loff_t requested_length;
	ssize_t done = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (*pos < 0 ||
	    check_add_overflow((loff_t)len, *pos, &requested_length))
		return -EINVAL;

	if (requested_length > MAX_MIGRATION_SIZE)
		return -ENOMEM;

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		done = -ENODEV;
		goto out_unlock;
	}

	if (vhca_buf->allocated_length < requested_length) {
		done = mlx5vf_add_migration_pages(
			vhca_buf,
			DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
				     PAGE_SIZE));
		if (done)
			goto out_unlock;
	}

	while (len) {
		size_t page_offset;
		struct page *page;
		size_t page_len;
		u8 *to_buff;
		int ret;

		page_offset = (*pos) % PAGE_SIZE;
		page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset);
		if (!page) {
			if (done == 0)
				done = -EINVAL;
			goto out_unlock;
		}

		page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
		to_buff = kmap_local_page(page);
		ret = copy_from_user(to_buff + page_offset, buf, page_len);
		kunmap_local(to_buff);
		if (ret) {
			done = -EFAULT;
			goto out_unlock;
		}
		*pos += page_len;
		len -= page_len;
		done += page_len;
		buf += page_len;
		vhca_buf->length += page_len;
	}
out_unlock:
	mutex_unlock(&migf->lock);
	return done;
}

static const struct file_operations mlx5vf_resume_fops = {
	.owner = THIS_MODULE,
	.write = mlx5vf_resume_write,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

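/*
 * Allocate the migration file used on the resume (destination) side. The
 * FD is write-only: userspace streams the saved device state into an
 * initially empty DMA_TO_DEVICE buffer that mlx5vf_resume_write() grows on
 * demand, and the accumulated image is loaded into the device upon the
 * RESUMING -> STOP transition.
 */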
static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
					O_WRONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		goto end;
	}

	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_pd;
	}

	migf->buf = buf;
	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	return migf;
out_pd:
	mlx5vf_cmd_dealloc_pd(migf);
out_free:
	fput(migf->filp);
end:
	kfree(migf);
	return ERR_PTR(ret);
}

void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
{
	if (mvdev->resuming_migf) {
		mlx5vf_disable_fd(mvdev->resuming_migf);
		mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
		fput(mvdev->resuming_migf->filp);
		mvdev->resuming_migf = NULL;
	}
	if (mvdev->saving_migf) {
		mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
		cancel_work_sync(&mvdev->saving_migf->async_data.work);
		mlx5vf_disable_fd(mvdev->saving_migf);
		mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
		fput(mvdev->saving_migf->filp);
		mvdev->saving_migf = NULL;
	}
}

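/*
 * Execute a single transition (arc) of the device migration state machine.
 * Only the arcs that vfio_mig_get_next_state() can produce are handled; a
 * successful arc returns either NULL or, for the arcs that create a saving
 * or resuming migration file, the file to be handed back to userspace.
 */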
static struct file *
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
				    u32 new)
{
	u32 cur = mvdev->mig_state;
	int ret;

	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, false);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
	     new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
		mlx5vf_disable_fds(mvdev);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_resume_device_data(mvdev);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->resuming_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
		ret = mlx5vf_cmd_load_vhca_state(mvdev,
						 mvdev->resuming_migf,
						 mvdev->resuming_migf->buf);
		if (ret)
			return ERR_PTR(ret);
		mlx5vf_disable_fds(mvdev);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, true);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		ret = mlx5vf_pci_save_device_inc_data(mvdev);
		return ret ? ERR_PTR(ret) : NULL;
	}

	/*
	 * vfio_mig_get_next_state() does not use arcs other than the above
	 */
	WARN_ON(true);
	return ERR_PTR(-EINVAL);
}

/*
 * This function is called in all state_mutex unlock cases to
 * handle a 'deferred_reset', if one exists.
 */
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
{
again:
	spin_lock(&mvdev->reset_lock);
	if (mvdev->deferred_reset) {
		mvdev->deferred_reset = false;
		spin_unlock(&mvdev->reset_lock);
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
		mlx5vf_disable_fds(mvdev);
		goto again;
	}
	mutex_unlock(&mvdev->state_mutex);
	spin_unlock(&mvdev->reset_lock);
}

static struct file *
mlx5vf_pci_set_device_state(struct vfio_device *vdev,
			    enum vfio_device_mig_state new_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	enum vfio_device_mig_state next_state;
	struct file *res = NULL;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	while (new_state != mvdev->mig_state) {
		ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
					      new_state, &next_state);
		if (ret) {
			res = ERR_PTR(ret);
			break;
		}
		res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
		if (IS_ERR(res))
			break;
		mvdev->mig_state = next_state;
		if (WARN_ON(res && new_state != mvdev->mig_state)) {
			fput(res);
			res = ERR_PTR(-EINVAL);
			break;
		}
	}
	mlx5vf_state_mutex_unlock(mvdev);
	return res;
}

static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
				    unsigned long *stop_copy_length)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	size_t state_size;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
						    &state_size, 0);
	if (!ret)
		*stop_copy_length = state_size;
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
				       enum vfio_device_mig_state *curr_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mutex_lock(&mvdev->state_mutex);
	*curr_state = mvdev->mig_state;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;
}

static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	if (!mvdev->migrate_cap)
		return;

	/*
	 * As the higher VFIO layers are holding locks across reset and using
	 * those same locks with the mm_lock, we need to prevent an ABBA
	 * deadlock between the state_mutex and the mm_lock.
	 * In case the state_mutex was already taken, we defer the cleanup
	 * work to the unlock flow of the other running context.
	 */
	spin_lock(&mvdev->reset_lock);
	mvdev->deferred_reset = true;
	if (!mutex_trylock(&mvdev->state_mutex)) {
		spin_unlock(&mvdev->reset_lock);
		return;
	}
	spin_unlock(&mvdev->reset_lock);
	mlx5vf_state_mutex_unlock(mvdev);
}

static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct vfio_pci_core_device *vdev = &mvdev->core_device;
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	if (mvdev->migrate_cap)
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
	vfio_pci_core_finish_enable(vdev);
	return 0;
}

static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_close_migratable(mvdev);
	vfio_pci_core_close_device(core_vdev);
}

static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
	.migration_set_state = mlx5vf_pci_set_device_state,
	.migration_get_state = mlx5vf_pci_get_device_state,
	.migration_get_data_size = mlx5vf_pci_get_data_size,
};

static const struct vfio_log_ops mlx5vf_pci_log_ops = {
	.log_start = mlx5vf_start_page_tracker,
	.log_stop = mlx5vf_stop_page_tracker,
	.log_read_and_clear = mlx5vf_tracker_read_and_clear,
};

static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);
	int ret;

	ret = vfio_pci_core_init_dev(core_vdev);
	if (ret)
		return ret;

	mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
				  &mlx5vf_pci_log_ops);

	return 0;
}

static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_remove_migratable(mvdev);
	vfio_pci_core_release_dev(core_vdev);
}

static const struct vfio_device_ops mlx5vf_pci_ops = {
	.name = "mlx5-vfio-pci",
	.init = mlx5vf_pci_init_dev,
	.release = mlx5vf_pci_release_dev,
	.open_device = mlx5vf_pci_open_device,
	.close_device = mlx5vf_pci_close_device,
	.ioctl = vfio_pci_core_ioctl,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = vfio_pci_core_read,
	.write = vfio_pci_core_write,
	.mmap = vfio_pci_core_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
};

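/*
 * Bind the variant driver to a ConnectX VF: allocate the mlx5vf wrapper
 * around vfio_pci_core_device (its .init callback, mlx5vf_pci_init_dev(),
 * sets up migration support) and register the device with the VFIO core.
 */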
static int mlx5vf_pci_probe(struct pci_dev *pdev,
			    const struct pci_device_id *id)
{
	struct mlx5vf_pci_core_device *mvdev;
	int ret;

	mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
				  &pdev->dev, &mlx5vf_pci_ops);
	if (IS_ERR(mvdev))
		return PTR_ERR(mvdev);

	dev_set_drvdata(&pdev->dev, &mvdev->core_device);
	ret = vfio_pci_core_register_device(&mvdev->core_device);
	if (ret)
		goto out_put_vdev;
	return 0;

out_put_vdev:
	vfio_put_device(&mvdev->core_device.vdev);
	return ret;
}

static void mlx5vf_pci_remove(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	vfio_pci_core_unregister_device(&mvdev->core_device);
	vfio_put_device(&mvdev->core_device.vdev);
}

static const struct pci_device_id mlx5vf_pci_table[] = {
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
	{}
};

MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);

static const struct pci_error_handlers mlx5vf_err_handlers = {
	.reset_done = mlx5vf_pci_aer_reset_done,
	.error_detected = vfio_pci_core_aer_err_detected,
};

static struct pci_driver mlx5vf_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = mlx5vf_pci_table,
	.probe = mlx5vf_pci_probe,
	.remove = mlx5vf_pci_remove,
	.err_handler = &mlx5vf_err_handlers,
	.driver_managed_dma = true,
};

module_pci_driver(mlx5vf_pci_driver);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
MODULE_DESCRIPTION(
	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");