// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/anon_inodes.h>

#include "cmd.h"

/* Arbitrary to prevent userspace from consuming endless memory */
#define MAX_MIGRATION_SIZE (512*1024*1024)

static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
{
	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

	return container_of(core_device, struct mlx5vf_pci_core_device,
			    core_device);
}

struct page *
mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
			  unsigned long offset)
{
	unsigned long cur_offset = 0;
	struct scatterlist *sg;
	unsigned int i;

	/* All accesses are sequential */
	if (offset < buf->last_offset || !buf->last_offset_sg) {
		buf->last_offset = 0;
		buf->last_offset_sg = buf->table.sgt.sgl;
		buf->sg_last_entry = 0;
	}

	cur_offset = buf->last_offset;

	for_each_sg(buf->last_offset_sg, sg,
		    buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
		if (offset < sg->length + cur_offset) {
			buf->last_offset_sg = sg;
			buf->sg_last_entry += i;
			buf->last_offset = cur_offset;
			return nth_page(sg_page(sg),
					(offset - cur_offset) / PAGE_SIZE);
		}
		cur_offset += sg->length;
	}
	return NULL;
}

int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
			       unsigned int npages)
{
	unsigned int to_alloc = npages;
	struct page **page_list;
	unsigned long filled;
	unsigned int to_fill;
	int ret;

	to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
	page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL);
	if (!page_list)
		return -ENOMEM;

	do {
		filled = alloc_pages_bulk_array(GFP_KERNEL, to_fill, page_list);
		if (!filled) {
			ret = -ENOMEM;
			goto err;
		}
		to_alloc -= filled;
		ret = sg_alloc_append_table_from_pages(
			&buf->table, page_list, filled, 0,
			filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
			GFP_KERNEL);

		if (ret)
			goto err;
		buf->allocated_length += filled * PAGE_SIZE;
		/* clean input for another bulk allocation */
		memset(page_list, 0, filled * sizeof(*page_list));
		to_fill = min_t(unsigned int, to_alloc,
				PAGE_SIZE / sizeof(*page_list));
	} while (to_alloc > 0);

	kvfree(page_list);
	return 0;

err:
	kvfree(page_list);
	return ret;
}

static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
	mutex_lock(&migf->lock);
	migf->state = MLX5_MIGF_STATE_ERROR;
	migf->filp->f_pos = 0;
	mutex_unlock(&migf->lock);
}

static int mlx5vf_release_file(struct inode *inode, struct file *filp)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;

	mlx5vf_disable_fd(migf);
	mutex_destroy(&migf->lock);
	kfree(migf);
	return 0;
}

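/*
 * Return the buffer at the head of buf_list if it covers @pos, otherwise NULL.
 * *end_of_data is set when no buffer is queued, i.e. the reader has already
 * consumed everything produced so far.
 */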
static struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
			      bool *end_of_data)
{
	struct mlx5_vhca_data_buffer *buf;
	bool found = false;

	*end_of_data = false;
	spin_lock_irq(&migf->list_lock);
	if (list_empty(&migf->buf_list)) {
		*end_of_data = true;
		goto end;
	}

	buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
			       buf_elm);
	if (pos >= buf->start_pos &&
	    pos < buf->start_pos + buf->length) {
		found = true;
		goto end;
	}

	/*
	 * As we use a stream based FD we expect the data to always be in the
	 * first chunk.
	 */
	migf->state = MLX5_MIGF_STATE_ERROR;

end:
	spin_unlock_irq(&migf->list_lock);
	return found ? buf : NULL;
}

static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
			       char __user **buf, size_t *len, loff_t *pos)
{
	unsigned long offset;
	ssize_t done = 0;
	size_t copy_len;

	copy_len = min_t(size_t,
			 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
	while (copy_len) {
		size_t page_offset;
		struct page *page;
		size_t page_len;
		u8 *from_buff;
		int ret;

		offset = *pos - vhca_buf->start_pos;
		page_offset = offset % PAGE_SIZE;
		offset -= page_offset;
		page = mlx5vf_get_migration_page(vhca_buf, offset);
		if (!page)
			return -EINVAL;
		page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
		from_buff = kmap_local_page(page);
		ret = copy_to_user(*buf, from_buff + page_offset, page_len);
		kunmap_local(from_buff);
		if (ret)
			return -EFAULT;
		*pos += page_len;
		*len -= page_len;
		*buf += page_len;
		done += page_len;
		copy_len -= page_len;
	}

	if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
		spin_lock_irq(&vhca_buf->migf->list_lock);
		list_del_init(&vhca_buf->buf_elm);
		list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
		spin_unlock_irq(&vhca_buf->migf->list_lock);
	}

	return done;
}

static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
				loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf;
	bool first_loop_call = true;
	bool end_of_data;
	ssize_t done = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (!(filp->f_flags & O_NONBLOCK)) {
		if (wait_event_interruptible(migf->poll_wait,
			!list_empty(&migf->buf_list) ||
			migf->state == MLX5_MIGF_STATE_ERROR ||
			migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
			migf->state == MLX5_MIGF_STATE_PRE_COPY ||
			migf->state == MLX5_MIGF_STATE_COMPLETE))
			return -ERESTARTSYS;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		done = -ENODEV;
		goto out_unlock;
	}

	while (len) {
		ssize_t count;

		vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
							 &end_of_data);
		if (first_loop_call) {
			first_loop_call = false;
			/* Temporary end of file as part of PRE_COPY */
			if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
				done = -ENOMSG;
				goto out_unlock;
			}

			if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
				if (filp->f_flags & O_NONBLOCK) {
					done = -EAGAIN;
					goto out_unlock;
				}
			}
		}

		if (end_of_data)
			goto out_unlock;

		if (!vhca_buf) {
			done = -EINVAL;
			goto out_unlock;
		}

		count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
		if (count < 0) {
			done = count;
			goto out_unlock;
		}
		done += count;
	}

out_unlock:
	mutex_unlock(&migf->lock);
	return done;
}

static __poll_t mlx5vf_save_poll(struct file *filp,
				 struct poll_table_struct *wait)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	__poll_t pollflags = 0;

	poll_wait(filp, &migf->poll_wait, wait);

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR)
		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
	else if (!list_empty(&migf->buf_list) ||
		 migf->state == MLX5_MIGF_STATE_COMPLETE)
		pollflags = EPOLLIN | EPOLLRDNORM;
	mutex_unlock(&migf->lock);

	return pollflags;
}

/*
 * FD is exposed and user can use it after receiving an error.
 * Mark migf in error, and wake the user.
 */
static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
{
	migf->state = MLX5_MIGF_STATE_ERROR;
	wake_up_interruptible(&migf->poll_wait);
}

static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
				 unsigned long arg)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;
	struct vfio_precopy_info info = {};
	loff_t *pos = &filp->f_pos;
	unsigned long minsz;
	size_t inc_length = 0;
	bool end_of_data;
	int ret;

	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
		return -ENOTTY;

	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);

	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
		ret = -EINVAL;
		goto err_state_unlock;
	}

	/*
	 * We can't issue a SAVE command when the device is suspended, so while
	 * in VFIO_DEVICE_STATE_PRE_COPY_P2P there is no reason to query for
	 * extra bytes that can't be read.
	 */
	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
		/*
		 * Once the query returns it's guaranteed that there is no
		 * active SAVE command.
		 * As such, the code below is safe with the proper locks.
		 */
		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
							    MLX5VF_QUERY_INC);
		if (ret)
			goto err_state_unlock;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto err_migf_unlock;
	}

	buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data);
	if (buf) {
		if (buf->start_pos == 0) {
			info.initial_bytes = buf->header_image_size - *pos;
		} else if (buf->start_pos ==
				sizeof(struct mlx5_vf_migration_header)) {
			/* First data buffer following the header */
			info.initial_bytes = buf->start_pos +
						buf->length - *pos;
		} else {
			info.dirty_bytes = buf->start_pos + buf->length - *pos;
		}
	} else {
		if (!end_of_data) {
			ret = -EINVAL;
			goto err_migf_unlock;
		}

		info.dirty_bytes = inc_length;
	}

	if (!end_of_data || !inc_length) {
		mutex_unlock(&migf->lock);
		goto done;
	}

	mutex_unlock(&migf->lock);
	/*
	 * We finished transferring the current state and the device still has
	 * dirty state; save a new state so it is ready to be read.
	 */
	buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		mlx5vf_mark_err(migf);
		goto err_state_unlock;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
	if (ret) {
		mlx5vf_mark_err(migf);
		mlx5vf_put_data_buffer(buf);
		goto err_state_unlock;
	}

done:
	mlx5vf_state_mutex_unlock(mvdev);
	if (copy_to_user((void __user *)arg, &info, minsz))
		return -EFAULT;
	return 0;

err_migf_unlock:
	mutex_unlock(&migf->lock);
err_state_unlock:
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static const struct file_operations mlx5vf_save_fops = {
	.owner = THIS_MODULE,
	.read = mlx5vf_save_read,
	.poll = mlx5vf_save_poll,
	.unlocked_ioctl = mlx5vf_precopy_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	int ret;

	if (migf->state == MLX5_MIGF_STATE_ERROR)
		return -ENODEV;

	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
				MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
	if (ret)
		goto err;

	buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto err;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
	if (ret)
		goto err_save;

	return 0;

err_save:
	mlx5vf_put_data_buffer(buf);
err:
	mlx5vf_mark_err(migf);
	return ret;
}

static struct mlx5_vf_migration_file *
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
					O_RDONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		goto end;
	}

	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	init_waitqueue_head(&migf->poll_wait);
	init_completion(&migf->save_comp);
	/*
	 * save_comp is being used as a binary semaphore built from
	 * a completion. A normal mutex cannot be used because the lock is
	 * passed between kernel threads and lockdep can't model this.
	 */
	complete(&migf->save_comp);
	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
	if (ret)
		goto out_pd;

	buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_pd;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
	if (ret)
		goto out_save;
	return migf;
out_save:
	mlx5vf_free_data_buffer(buf);
out_pd:
	mlx5vf_cmd_dealloc_pd(migf);
out_free:
	fput(migf->filp);
end:
	kfree(migf);
	return ERR_PTR(ret);
}

static int
mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
			      const char __user **buf, size_t *len,
			      loff_t *pos, ssize_t *done)
{
	unsigned long offset;
	size_t page_offset;
	struct page *page;
	size_t page_len;
	u8 *to_buff;
	int ret;

	offset = *pos - vhca_buf->start_pos;
	page_offset = offset % PAGE_SIZE;

	page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
	if (!page)
		return -EINVAL;
	page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + page_offset, *buf, page_len);
	kunmap_local(to_buff);
	if (ret)
		return -EFAULT;

	*pos += page_len;
	*done += page_len;
	*buf += page_len;
	*len -= page_len;
	vhca_buf->length += page_len;
	return 0;
}

static int
mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf,
				   loff_t requested_length,
				   const char __user **buf, size_t *len,
				   loff_t *pos, ssize_t *done)
{
	int ret;

	if (requested_length > MAX_MIGRATION_SIZE)
		return -ENOMEM;

	if (vhca_buf->allocated_length < requested_length) {
		ret = mlx5vf_add_migration_pages(
			vhca_buf,
			DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
				     PAGE_SIZE));
		if (ret)
			return ret;
	}

	while (*len) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos,
						    done);
		if (ret)
			return ret;
	}

	return 0;
}

static ssize_t
mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
			 struct mlx5_vhca_data_buffer *vhca_buf,
			 size_t image_size, const char __user **buf,
			 size_t *len, loff_t *pos, ssize_t *done,
			 bool *has_work)
{
	size_t copy_len, to_copy;
	int ret;

	to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
	copy_len = to_copy;
	while (to_copy) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
						    done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == image_size) {
		migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
		migf->max_pos += image_size;
		*has_work = true;
	}

	return 0;
}

static int
mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
			  struct mlx5_vhca_data_buffer *vhca_buf,
			  const char __user **buf,
			  size_t *len, loff_t *pos,
			  ssize_t *done, bool *has_work)
{
	struct page *page;
	size_t copy_len;
	u8 *to_buff;
	int ret;

	copy_len = min_t(size_t, *len,
		sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
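	/*
	 * The migration header is smaller than a page, so it always lands in
	 * the first page of the header buffer.
	 */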
	page = mlx5vf_get_migration_page(vhca_buf, 0);
	if (!page)
		return -EINVAL;
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
	if (ret) {
		ret = -EFAULT;
		goto end;
	}

	*buf += copy_len;
	*pos += copy_len;
	*done += copy_len;
	*len -= copy_len;
	vhca_buf->length += copy_len;
	if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
		u64 flags;

		vhca_buf->header_image_size = le64_to_cpup((__le64 *)to_buff);
		if (vhca_buf->header_image_size > MAX_MIGRATION_SIZE) {
			ret = -ENOMEM;
			goto end;
		}

		flags = le64_to_cpup((__le64 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, flags)));
		if (flags) {
			ret = -EOPNOTSUPP;
			goto end;
		}

		migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
		migf->max_pos += vhca_buf->length;
		*has_work = true;
	}
end:
	kunmap_local(to_buff);
	return ret;
}

static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
				   size_t len, loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
	struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header;
	loff_t requested_length;
	bool has_work = false;
	ssize_t done = 0;
	int ret = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (*pos < 0 ||
	    check_add_overflow((loff_t)len, *pos, &requested_length))
		return -EINVAL;

	mutex_lock(&migf->mvdev->state_mutex);
	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto out_unlock;
	}

	while (len || has_work) {
		has_work = false;
		switch (migf->load_state) {
		case MLX5_VF_LOAD_STATE_READ_HEADER:
			ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
							&buf, &len, pos,
							&done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_IMAGE:
		{
			u64 size = vhca_buf_header->header_image_size;

			if (vhca_buf->allocated_length < size) {
				mlx5vf_free_data_buffer(vhca_buf);

				migf->buf = mlx5vf_alloc_data_buffer(migf,
							size, DMA_TO_DEVICE);
				if (IS_ERR(migf->buf)) {
					ret = PTR_ERR(migf->buf);
					migf->buf = NULL;
					goto out_unlock;
				}

				vhca_buf = migf->buf;
			}

			vhca_buf->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
			break;
		}
		case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER:
			ret = mlx5vf_resume_read_image_no_header(vhca_buf,
						requested_length,
						&buf, &len, pos, &done);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_READ_IMAGE:
			ret = mlx5vf_resume_read_image(migf, vhca_buf,
						vhca_buf_header->header_image_size,
						&buf, &len, pos, &done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
			ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
			if (ret)
				goto out_unlock;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

			/* prep header buf for next image */
			vhca_buf_header->length = 0;
			vhca_buf_header->header_image_size = 0;
			/* prep data buf for next image */
			vhca_buf->length = 0;

			break;
		default:
			break;
		}
	}

out_unlock:
	if (ret)
		migf->state = MLX5_MIGF_STATE_ERROR;
	mutex_unlock(&migf->lock);
	mlx5vf_state_mutex_unlock(migf->mvdev);
	return ret ? ret : done;
}

static const struct file_operations mlx5vf_resume_fops = {
	.owner = THIS_MODULE,
	.write = mlx5vf_resume_write,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
					O_WRONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		goto end;
	}

	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_pd;
	}

	migf->buf = buf;
	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
		buf = mlx5vf_alloc_data_buffer(migf,
			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto out_buf;
		}

		migf->buf_header = buf;
		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
	} else {
		/* Initial state will be to read the image */
		migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER;
	}

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	return migf;
out_buf:
	mlx5vf_free_data_buffer(migf->buf);
out_pd:
	mlx5vf_cmd_dealloc_pd(migf);
out_free:
	fput(migf->filp);
end:
	kfree(migf);
	return ERR_PTR(ret);
}

void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
{
	if (mvdev->resuming_migf) {
		mlx5vf_disable_fd(mvdev->resuming_migf);
		mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
		fput(mvdev->resuming_migf->filp);
		mvdev->resuming_migf = NULL;
	}
	if (mvdev->saving_migf) {
		mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
		cancel_work_sync(&mvdev->saving_migf->async_data.work);
		mlx5vf_disable_fd(mvdev->saving_migf);
		mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
		fput(mvdev->saving_migf->filp);
		mvdev->saving_migf = NULL;
	}
}

static struct file *
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
				    u32 new)
{
	u32 cur = mvdev->mig_state;
	int ret;

	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
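		/* Leaving a *_P2P state: allow the device to initiate outgoing traffic again. */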
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, false);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
	     new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
		mlx5vf_disable_fds(mvdev);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_resume_device_data(mvdev);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->resuming_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
		if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
			ret = mlx5vf_cmd_load_vhca_state(mvdev,
							 mvdev->resuming_migf,
							 mvdev->resuming_migf->buf);
			if (ret)
				return ERR_PTR(ret);
		}
		mlx5vf_disable_fds(mvdev);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, true);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		ret = mlx5vf_pci_save_device_inc_data(mvdev);
		return ret ? ERR_PTR(ret) : NULL;
	}

	/*
	 * vfio_mig_get_next_state() does not use arcs other than the above
	 */
	WARN_ON(true);
	return ERR_PTR(-EINVAL);
}

/*
 * This function is called in all state_mutex unlock cases to
 * handle a 'deferred_reset' if one exists.
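 * The reset itself cannot wait on state_mutex (see mlx5vf_pci_aer_reset_done()),
 * so the context that currently holds the mutex performs the deferred work
 * before dropping the lock.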
 */
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
{
again:
	spin_lock(&mvdev->reset_lock);
	if (mvdev->deferred_reset) {
		mvdev->deferred_reset = false;
		spin_unlock(&mvdev->reset_lock);
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
		mlx5vf_disable_fds(mvdev);
		goto again;
	}
	mutex_unlock(&mvdev->state_mutex);
	spin_unlock(&mvdev->reset_lock);
}

static struct file *
mlx5vf_pci_set_device_state(struct vfio_device *vdev,
			    enum vfio_device_mig_state new_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	enum vfio_device_mig_state next_state;
	struct file *res = NULL;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	while (new_state != mvdev->mig_state) {
		ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
					      new_state, &next_state);
		if (ret) {
			res = ERR_PTR(ret);
			break;
		}
		res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
		if (IS_ERR(res))
			break;
		mvdev->mig_state = next_state;
		if (WARN_ON(res && new_state != mvdev->mig_state)) {
			fput(res);
			res = ERR_PTR(-EINVAL);
			break;
		}
	}
	mlx5vf_state_mutex_unlock(mvdev);
	return res;
}

static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
				    unsigned long *stop_copy_length)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	size_t state_size;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
						    &state_size, 0);
	if (!ret)
		*stop_copy_length = state_size;
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
				       enum vfio_device_mig_state *curr_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mutex_lock(&mvdev->state_mutex);
	*curr_state = mvdev->mig_state;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;
}

static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	if (!mvdev->migrate_cap)
		return;

	/*
	 * As the higher VFIO layers are holding locks across reset and using
	 * those same locks with the mm_lock we need to prevent ABBA deadlock
	 * with the state_mutex and mm_lock.
	 * In case the state_mutex was taken already we defer the cleanup work
	 * to the unlock flow of the other running context.
	 */
	spin_lock(&mvdev->reset_lock);
	mvdev->deferred_reset = true;
	if (!mutex_trylock(&mvdev->state_mutex)) {
		spin_unlock(&mvdev->reset_lock);
		return;
	}
	spin_unlock(&mvdev->reset_lock);
	mlx5vf_state_mutex_unlock(mvdev);
}

static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct vfio_pci_core_device *vdev = &mvdev->core_device;
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	if (mvdev->migrate_cap)
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
	vfio_pci_core_finish_enable(vdev);
	return 0;
}

static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_close_migratable(mvdev);
	vfio_pci_core_close_device(core_vdev);
}

static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
	.migration_set_state = mlx5vf_pci_set_device_state,
	.migration_get_state = mlx5vf_pci_get_device_state,
	.migration_get_data_size = mlx5vf_pci_get_data_size,
};

static const struct vfio_log_ops mlx5vf_pci_log_ops = {
	.log_start = mlx5vf_start_page_tracker,
	.log_stop = mlx5vf_stop_page_tracker,
	.log_read_and_clear = mlx5vf_tracker_read_and_clear,
};

static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);
	int ret;

	ret = vfio_pci_core_init_dev(core_vdev);
	if (ret)
		return ret;

	mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
				  &mlx5vf_pci_log_ops);

	return 0;
}

static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_remove_migratable(mvdev);
	vfio_pci_core_release_dev(core_vdev);
}

static const struct vfio_device_ops mlx5vf_pci_ops = {
	.name = "mlx5-vfio-pci",
	.init = mlx5vf_pci_init_dev,
	.release = mlx5vf_pci_release_dev,
	.open_device = mlx5vf_pci_open_device,
	.close_device = mlx5vf_pci_close_device,
	.ioctl = vfio_pci_core_ioctl,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = vfio_pci_core_read,
	.write = vfio_pci_core_write,
	.mmap = vfio_pci_core_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
	.bind_iommufd = vfio_iommufd_physical_bind,
	.unbind_iommufd = vfio_iommufd_physical_unbind,
	.attach_ioas = vfio_iommufd_physical_attach_ioas,
};

static int mlx5vf_pci_probe(struct pci_dev *pdev,
			    const struct pci_device_id *id)
{
	struct mlx5vf_pci_core_device *mvdev;
	int ret;

	mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
				  &pdev->dev, &mlx5vf_pci_ops);
	if (IS_ERR(mvdev))
		return PTR_ERR(mvdev);

	dev_set_drvdata(&pdev->dev, &mvdev->core_device);
	ret = vfio_pci_core_register_device(&mvdev->core_device);
	if (ret)
		goto out_put_vdev;
	return 0;

out_put_vdev:
	vfio_put_device(&mvdev->core_device.vdev);
	return ret;
}

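/* Undo mlx5vf_pci_probe(): unregister the core device and drop its reference. */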
static void mlx5vf_pci_remove(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	vfio_pci_core_unregister_device(&mvdev->core_device);
	vfio_put_device(&mvdev->core_device.vdev);
}

static const struct pci_device_id mlx5vf_pci_table[] = {
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
	{}
};

MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);

static const struct pci_error_handlers mlx5vf_err_handlers = {
	.reset_done = mlx5vf_pci_aer_reset_done,
	.error_detected = vfio_pci_core_aer_err_detected,
};

static struct pci_driver mlx5vf_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = mlx5vf_pci_table,
	.probe = mlx5vf_pci_probe,
	.remove = mlx5vf_pci_remove,
	.err_handler = &mlx5vf_err_handlers,
	.driver_managed_dma = true,
};

module_pci_driver(mlx5vf_pci_driver);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
MODULE_DESCRIPTION(
	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");