1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4 */
5
6 #include <linux/device.h>
7 #include <linux/eventfd.h>
8 #include <linux/file.h>
9 #include <linux/interrupt.h>
10 #include <linux/iommu.h>
11 #include <linux/module.h>
12 #include <linux/mutex.h>
13 #include <linux/notifier.h>
14 #include <linux/pci.h>
15 #include <linux/pm_runtime.h>
16 #include <linux/types.h>
17 #include <linux/uaccess.h>
18 #include <linux/vfio.h>
19 #include <linux/sched/mm.h>
20 #include <linux/anon_inodes.h>
21
22 #include "cmd.h"
23
24 /* Device specification max LOAD size */
25 #define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)
26
27 static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
28 {
29 struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
30
31 return container_of(core_device, struct mlx5vf_pci_core_device,
32 core_device);
33 }
34
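/*
 * Return the page backing @offset within the migration data buffer.
 * Accesses are expected to be mostly sequential, so the last scatterlist
 * entry and its offset are cached to avoid walking the whole table from
 * the start on every call.
 */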
35 struct page *
36 mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
37 unsigned long offset)
38 {
39 unsigned long cur_offset = 0;
40 struct scatterlist *sg;
41 unsigned int i;
42
43 /* All accesses are sequential */
44 if (offset < buf->last_offset || !buf->last_offset_sg) {
45 buf->last_offset = 0;
46 buf->last_offset_sg = buf->table.sgt.sgl;
47 buf->sg_last_entry = 0;
48 }
49
50 cur_offset = buf->last_offset;
51
52 for_each_sg(buf->last_offset_sg, sg,
53 buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
54 if (offset < sg->length + cur_offset) {
55 buf->last_offset_sg = sg;
56 buf->sg_last_entry += i;
57 buf->last_offset = cur_offset;
58 return nth_page(sg_page(sg),
59 (offset - cur_offset) / PAGE_SIZE);
60 }
61 cur_offset += sg->length;
62 }
63 return NULL;
64 }
65
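/*
 * Grow the migration data buffer by @npages pages. Pages are allocated
 * in bulk, up to one PAGE_SIZE worth of page pointers per iteration, and
 * appended to the buffer's scatter-gather table.
 */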
66 int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
67 unsigned int npages)
68 {
69 unsigned int to_alloc = npages;
70 struct page **page_list;
71 unsigned long filled;
72 unsigned int to_fill;
73 int ret;
74
75 to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
76 page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
77 if (!page_list)
78 return -ENOMEM;
79
80 do {
81 filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
82 page_list);
83 if (!filled) {
84 ret = -ENOMEM;
85 goto err;
86 }
87 to_alloc -= filled;
88 ret = sg_alloc_append_table_from_pages(
89 &buf->table, page_list, filled, 0,
90 filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
91 GFP_KERNEL_ACCOUNT);
92
93 if (ret)
94 goto err;
95 buf->allocated_length += filled * PAGE_SIZE;
96 /* clean input for another bulk allocation */
97 memset(page_list, 0, filled * sizeof(*page_list));
98 to_fill = min_t(unsigned int, to_alloc,
99 PAGE_SIZE / sizeof(*page_list));
100 } while (to_alloc > 0);
101
102 kvfree(page_list);
103 return 0;
104
105 err:
106 kvfree(page_list);
107 return ret;
108 }
109
110 static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
111 {
112 mutex_lock(&migf->lock);
113 migf->state = MLX5_MIGF_STATE_ERROR;
114 migf->filp->f_pos = 0;
115 mutex_unlock(&migf->lock);
116 }
117
118 static int mlx5vf_release_file(struct inode *inode, struct file *filp)
119 {
120 struct mlx5_vf_migration_file *migf = filp->private_data;
121
122 mlx5vf_disable_fd(migf);
123 mutex_destroy(&migf->lock);
124 kfree(migf);
125 return 0;
126 }
127
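/*
 * Return the data buffer that covers @pos. An empty list reports
 * end_of_data; since the FD is stream based, @pos not falling within the
 * first buffer on the list is treated as an error.
 */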
128 static struct mlx5_vhca_data_buffer *
129 mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
130 bool *end_of_data)
131 {
132 struct mlx5_vhca_data_buffer *buf;
133 bool found = false;
134
135 *end_of_data = false;
136 spin_lock_irq(&migf->list_lock);
137 if (list_empty(&migf->buf_list)) {
138 *end_of_data = true;
139 goto end;
140 }
141
142 buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
143 buf_elm);
144 if (pos >= buf->start_pos &&
145 pos < buf->start_pos + buf->length) {
146 found = true;
147 goto end;
148 }
149
150 /*
151 * As we use a stream-based FD, the requested data is expected to
152 * always be found in the first chunk; anything else is an error.
153 */
154 migf->state = MLX5_MIGF_STATE_ERROR;
155
156 end:
157 spin_unlock_irq(&migf->list_lock);
158 return found ? buf : NULL;
159 }
160
161 static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
162 char __user **buf, size_t *len, loff_t *pos)
163 {
164 unsigned long offset;
165 ssize_t done = 0;
166 size_t copy_len;
167
168 copy_len = min_t(size_t,
169 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
170 while (copy_len) {
171 size_t page_offset;
172 struct page *page;
173 size_t page_len;
174 u8 *from_buff;
175 int ret;
176
177 offset = *pos - vhca_buf->start_pos;
178 page_offset = offset % PAGE_SIZE;
179 offset -= page_offset;
180 page = mlx5vf_get_migration_page(vhca_buf, offset);
181 if (!page)
182 return -EINVAL;
183 page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
184 from_buff = kmap_local_page(page);
185 ret = copy_to_user(*buf, from_buff + page_offset, page_len);
186 kunmap_local(from_buff);
187 if (ret)
188 return -EFAULT;
189 *pos += page_len;
190 *len -= page_len;
191 *buf += page_len;
192 done += page_len;
193 copy_len -= page_len;
194 }
195
196 if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
197 spin_lock_irq(&vhca_buf->migf->list_lock);
198 list_del_init(&vhca_buf->buf_elm);
199 list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
200 spin_unlock_irq(&vhca_buf->migf->list_lock);
201 }
202
203 return done;
204 }
205
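/*
 * read() handler of the save FD. Unless O_NONBLOCK was requested, wait
 * until data is queued or the save reaches a pre-copy, complete or error
 * state, then copy as much queued data as fits into the user buffer. A
 * temporary end of stream during PRE_COPY is reported as -ENOMSG.
 */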
206 static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
207 loff_t *pos)
208 {
209 struct mlx5_vf_migration_file *migf = filp->private_data;
210 struct mlx5_vhca_data_buffer *vhca_buf;
211 bool first_loop_call = true;
212 bool end_of_data;
213 ssize_t done = 0;
214
215 if (pos)
216 return -ESPIPE;
217 pos = &filp->f_pos;
218
219 if (!(filp->f_flags & O_NONBLOCK)) {
220 if (wait_event_interruptible(migf->poll_wait,
221 !list_empty(&migf->buf_list) ||
222 migf->state == MLX5_MIGF_STATE_ERROR ||
223 migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
224 migf->state == MLX5_MIGF_STATE_PRE_COPY ||
225 migf->state == MLX5_MIGF_STATE_COMPLETE))
226 return -ERESTARTSYS;
227 }
228
229 mutex_lock(&migf->lock);
230 if (migf->state == MLX5_MIGF_STATE_ERROR) {
231 done = -ENODEV;
232 goto out_unlock;
233 }
234
235 while (len) {
236 ssize_t count;
237
238 vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
239 &end_of_data);
240 if (first_loop_call) {
241 first_loop_call = false;
242 /* Temporary end of file as part of PRE_COPY */
243 if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
244 migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
245 done = -ENOMSG;
246 goto out_unlock;
247 }
248
249 if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
250 if (filp->f_flags & O_NONBLOCK) {
251 done = -EAGAIN;
252 goto out_unlock;
253 }
254 }
255 }
256
257 if (end_of_data)
258 goto out_unlock;
259
260 if (!vhca_buf) {
261 done = -EINVAL;
262 goto out_unlock;
263 }
264
265 count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
266 if (count < 0) {
267 done = count;
268 goto out_unlock;
269 }
270 done += count;
271 }
272
273 out_unlock:
274 mutex_unlock(&migf->lock);
275 return done;
276 }
277
278 static __poll_t mlx5vf_save_poll(struct file *filp,
279 struct poll_table_struct *wait)
280 {
281 struct mlx5_vf_migration_file *migf = filp->private_data;
282 __poll_t pollflags = 0;
283
284 poll_wait(filp, &migf->poll_wait, wait);
285
286 mutex_lock(&migf->lock);
287 if (migf->state == MLX5_MIGF_STATE_ERROR)
288 pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
289 else if (!list_empty(&migf->buf_list) ||
290 migf->state == MLX5_MIGF_STATE_COMPLETE)
291 pollflags = EPOLLIN | EPOLLRDNORM;
292 mutex_unlock(&migf->lock);
293
294 return pollflags;
295 }
296
297 /*
298 * The FD is exposed and the user can keep using it after receiving an
299 * error. Mark the migf as being in error, and wake the user.
300 */
301 static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
302 {
303 migf->state = MLX5_MIGF_STATE_ERROR;
304 wake_up_interruptible(&migf->poll_wait);
305 }
306
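/*
 * Queue an optional tagged record on the save FD describing the expected
 * stop_copy size; the resume side uses it to pre-size its data buffer
 * before the final state arrives.
 */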
307 static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf)
308 {
309 size_t size = sizeof(struct mlx5_vf_migration_header) +
310 sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
311 struct mlx5_vf_migration_tag_stop_copy_data data = {};
312 struct mlx5_vhca_data_buffer *header_buf = NULL;
313 struct mlx5_vf_migration_header header = {};
314 unsigned long flags;
315 struct page *page;
316 u8 *to_buff;
317 int ret;
318
319 header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
320 if (IS_ERR(header_buf))
321 return PTR_ERR(header_buf);
322
323 header.record_size = cpu_to_le64(sizeof(data));
324 header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
325 header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
326 page = mlx5vf_get_migration_page(header_buf, 0);
327 if (!page) {
328 ret = -EINVAL;
329 goto err;
330 }
331 to_buff = kmap_local_page(page);
332 memcpy(to_buff, &header, sizeof(header));
333 header_buf->length = sizeof(header);
334 data.stop_copy_size = cpu_to_le64(migf->buf->allocated_length);
335 memcpy(to_buff + sizeof(header), &data, sizeof(data));
336 header_buf->length += sizeof(data);
337 kunmap_local(to_buff);
338 header_buf->start_pos = header_buf->migf->max_pos;
339 migf->max_pos += header_buf->length;
340 spin_lock_irqsave(&migf->list_lock, flags);
341 list_add_tail(&header_buf->buf_elm, &migf->buf_list);
342 spin_unlock_irqrestore(&migf->list_lock, flags);
343 migf->pre_copy_initial_bytes = size;
344 return 0;
345 err:
346 mlx5vf_put_data_buffer(header_buf);
347 return ret;
348 }
349
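/*
 * Pre-allocate the stop_copy data and header buffers while still in
 * PRE_COPY, with some headroom over the reported state size, and queue
 * the stop_copy size record for userspace.
 */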
350 static int mlx5vf_prep_stop_copy(struct mlx5_vf_migration_file *migf,
351 size_t state_size)
352 {
353 struct mlx5_vhca_data_buffer *buf;
354 size_t inc_state_size;
355 int ret;
356
357 /* Be ready for a stop_copy size that might grow by up to 10 percent */
358 if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
359 inc_state_size = state_size;
360
361 buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
362 if (IS_ERR(buf))
363 return PTR_ERR(buf);
364
365 migf->buf = buf;
366 buf = mlx5vf_get_data_buffer(migf,
367 sizeof(struct mlx5_vf_migration_header), DMA_NONE);
368 if (IS_ERR(buf)) {
369 ret = PTR_ERR(buf);
370 goto err;
371 }
372
373 migf->buf_header = buf;
374 ret = mlx5vf_add_stop_copy_header(migf);
375 if (ret)
376 goto err_header;
377 return 0;
378
379 err_header:
380 mlx5vf_put_data_buffer(migf->buf_header);
381 migf->buf_header = NULL;
382 err:
383 mlx5vf_put_data_buffer(migf->buf);
384 migf->buf = NULL;
385 return ret;
386 }
387
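/*
 * VFIO_MIG_GET_PRECOPY_INFO handler: report how many initial and dirty
 * bytes are left to read on the save FD. If the current data was fully
 * read and the device reports more dirty state, trigger another
 * incremental SAVE so the extra bytes become readable.
 */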
388 static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
389 unsigned long arg)
390 {
391 struct mlx5_vf_migration_file *migf = filp->private_data;
392 struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
393 struct mlx5_vhca_data_buffer *buf;
394 struct vfio_precopy_info info = {};
395 loff_t *pos = &filp->f_pos;
396 unsigned long minsz;
397 size_t inc_length = 0;
398 bool end_of_data = false;
399 int ret;
400
401 if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
402 return -ENOTTY;
403
404 minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
405
406 if (copy_from_user(&info, (void __user *)arg, minsz))
407 return -EFAULT;
408
409 if (info.argsz < minsz)
410 return -EINVAL;
411
412 mutex_lock(&mvdev->state_mutex);
413 if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
414 mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
415 ret = -EINVAL;
416 goto err_state_unlock;
417 }
418
419 /*
420 * We can't issue a SAVE command when the device is suspended, so as
421 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P there is no reason to query
422 * for extra bytes that can't be read.
423 */
424 if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
425 /*
426 * Once the query returns it's guaranteed that there is no
427 * active SAVE command.
428 * As such, the code below is safe with the proper locks.
429 */
430 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
431 MLX5VF_QUERY_INC);
432 if (ret)
433 goto err_state_unlock;
434 }
435
436 mutex_lock(&migf->lock);
437 if (migf->state == MLX5_MIGF_STATE_ERROR) {
438 ret = -ENODEV;
439 goto err_migf_unlock;
440 }
441
442 if (migf->pre_copy_initial_bytes > *pos) {
443 info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
444 } else {
445 info.dirty_bytes = migf->max_pos - *pos;
446 if (!info.dirty_bytes)
447 end_of_data = true;
448 info.dirty_bytes += inc_length;
449 }
450
451 if (!end_of_data || !inc_length) {
452 mutex_unlock(&migf->lock);
453 goto done;
454 }
455
456 mutex_unlock(&migf->lock);
457 /*
458 * We finished transferring the current state and the device has
459 * additional dirty state; trigger a new SAVE so it is ready to be read.
460 */
461 buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
462 if (IS_ERR(buf)) {
463 ret = PTR_ERR(buf);
464 mlx5vf_mark_err(migf);
465 goto err_state_unlock;
466 }
467
468 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
469 if (ret) {
470 mlx5vf_mark_err(migf);
471 mlx5vf_put_data_buffer(buf);
472 goto err_state_unlock;
473 }
474
475 done:
476 mlx5vf_state_mutex_unlock(mvdev);
477 if (copy_to_user((void __user *)arg, &info, minsz))
478 return -EFAULT;
479 return 0;
480
481 err_migf_unlock:
482 mutex_unlock(&migf->lock);
483 err_state_unlock:
484 mlx5vf_state_mutex_unlock(mvdev);
485 return ret;
486 }
487
488 static const struct file_operations mlx5vf_save_fops = {
489 .owner = THIS_MODULE,
490 .read = mlx5vf_save_read,
491 .poll = mlx5vf_save_poll,
492 .unlocked_ioctl = mlx5vf_precopy_ioctl,
493 .compat_ioctl = compat_ptr_ioctl,
494 .release = mlx5vf_release_file,
495 .llseek = no_llseek,
496 };
497
498 static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
499 {
500 struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
501 struct mlx5_vhca_data_buffer *buf;
502 size_t length;
503 int ret;
504
505 if (migf->state == MLX5_MIGF_STATE_ERROR)
506 return -ENODEV;
507
508 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
509 MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
510 if (ret)
511 goto err;
512
513 /* Check whether we have a pre-allocated buffer that is large enough to fit */
514 if (migf->buf && migf->buf->allocated_length >= length) {
515 buf = migf->buf;
516 migf->buf = NULL;
517 } else {
518 buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
519 if (IS_ERR(buf)) {
520 ret = PTR_ERR(buf);
521 goto err;
522 }
523 }
524
525 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
526 if (ret)
527 goto err_save;
528
529 return 0;
530
531 err_save:
532 mlx5vf_put_data_buffer(buf);
533 err:
534 mlx5vf_mark_err(migf);
535 return ret;
536 }
537
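/*
 * Create the save migration file: allocate the FD and its PD, query the
 * required state size and issue the initial SAVE of the device state
 * into a newly allocated data buffer. With @track set (PRE_COPY), the
 * stop_copy buffers are prepared up front as well.
 */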
538 static struct mlx5_vf_migration_file *
539 mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
540 {
541 struct mlx5_vf_migration_file *migf;
542 struct mlx5_vhca_data_buffer *buf;
543 size_t length;
544 int ret;
545
546 migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
547 if (!migf)
548 return ERR_PTR(-ENOMEM);
549
550 migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
551 O_RDONLY);
552 if (IS_ERR(migf->filp)) {
553 ret = PTR_ERR(migf->filp);
554 goto end;
555 }
556
557 migf->mvdev = mvdev;
558 ret = mlx5vf_cmd_alloc_pd(migf);
559 if (ret)
560 goto out_free;
561
562 stream_open(migf->filp->f_inode, migf->filp);
563 mutex_init(&migf->lock);
564 init_waitqueue_head(&migf->poll_wait);
565 init_completion(&migf->save_comp);
566 /*
567 * save_comp is being used as a binary semaphore built from
568 * a completion. A normal mutex cannot be used because the lock is
569 * passed between kernel threads and lockdep can't model this.
570 */
571 complete(&migf->save_comp);
572 mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
573 INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
574 INIT_LIST_HEAD(&migf->buf_list);
575 INIT_LIST_HEAD(&migf->avail_list);
576 spin_lock_init(&migf->list_lock);
577 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
578 if (ret)
579 goto out_pd;
580
581 if (track) {
582 ret = mlx5vf_prep_stop_copy(migf, length);
583 if (ret)
584 goto out_pd;
585 }
586
587 buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
588 if (IS_ERR(buf)) {
589 ret = PTR_ERR(buf);
590 goto out_pd;
591 }
592
593 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
594 if (ret)
595 goto out_save;
596 return migf;
597 out_save:
598 mlx5vf_free_data_buffer(buf);
599 out_pd:
600 mlx5fv_cmd_clean_migf_resources(migf);
601 out_free:
602 fput(migf->filp);
603 end:
604 kfree(migf);
605 return ERR_PTR(ret);
606 }
607
608 static int
609 mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
610 const char __user **buf, size_t *len,
611 loff_t *pos, ssize_t *done)
612 {
613 unsigned long offset;
614 size_t page_offset;
615 struct page *page;
616 size_t page_len;
617 u8 *to_buff;
618 int ret;
619
620 offset = *pos - vhca_buf->start_pos;
621 page_offset = offset % PAGE_SIZE;
622
623 page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
624 if (!page)
625 return -EINVAL;
626 page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
627 to_buff = kmap_local_page(page);
628 ret = copy_from_user(to_buff + page_offset, *buf, page_len);
629 kunmap_local(to_buff);
630 if (ret)
631 return -EFAULT;
632
633 *pos += page_len;
634 *done += page_len;
635 *buf += page_len;
636 *len -= page_len;
637 vhca_buf->length += page_len;
638 return 0;
639 }
640
641 static int
642 mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf,
643 loff_t requested_length,
644 const char __user **buf, size_t *len,
645 loff_t *pos, ssize_t *done)
646 {
647 int ret;
648
649 if (requested_length > MAX_LOAD_SIZE)
650 return -ENOMEM;
651
652 if (vhca_buf->allocated_length < requested_length) {
653 ret = mlx5vf_add_migration_pages(
654 vhca_buf,
655 DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
656 PAGE_SIZE));
657 if (ret)
658 return ret;
659 }
660
661 while (*len) {
662 ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos,
663 done);
664 if (ret)
665 return ret;
666 }
667
668 return 0;
669 }
670
671 static ssize_t
672 mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
673 struct mlx5_vhca_data_buffer *vhca_buf,
674 size_t image_size, const char __user **buf,
675 size_t *len, loff_t *pos, ssize_t *done,
676 bool *has_work)
677 {
678 size_t copy_len, to_copy;
679 int ret;
680
681 to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
682 copy_len = to_copy;
683 while (to_copy) {
684 ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
685 done);
686 if (ret)
687 return ret;
688 }
689
690 *len -= copy_len;
691 if (vhca_buf->length == image_size) {
692 migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
693 migf->max_pos += image_size;
694 *has_work = true;
695 }
696
697 return 0;
698 }
699
700 static int
701 mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
702 struct mlx5_vhca_data_buffer *vhca_buf,
703 const char __user **buf, size_t *len,
704 loff_t *pos, ssize_t *done)
705 {
706 size_t copy_len, to_copy;
707 size_t required_data;
708 u8 *to_buff;
709 int ret;
710
711 required_data = migf->record_size - vhca_buf->length;
712 to_copy = min_t(size_t, *len, required_data);
713 copy_len = to_copy;
714 while (to_copy) {
715 ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
716 done);
717 if (ret)
718 return ret;
719 }
720
721 *len -= copy_len;
722 if (vhca_buf->length == migf->record_size) {
723 switch (migf->record_tag) {
724 case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
725 {
726 struct page *page;
727
728 page = mlx5vf_get_migration_page(vhca_buf, 0);
729 if (!page)
730 return -EINVAL;
731 to_buff = kmap_local_page(page);
732 migf->stop_copy_prep_size = min_t(u64,
733 le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
734 kunmap_local(to_buff);
735 break;
736 }
737 default:
738 /* Optional tag */
739 break;
740 }
741
742 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
743 migf->max_pos += migf->record_size;
744 vhca_buf->length = 0;
745 }
746
747 return 0;
748 }
749
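/*
 * Accumulate and parse the record header on the resume FD. Once the full
 * header has been copied in, decode its size, flags and tag and move the
 * load state machine to the matching PREP state; unknown tags are only
 * accepted when marked optional.
 */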
750 static int
751 mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
752 struct mlx5_vhca_data_buffer *vhca_buf,
753 const char __user **buf,
754 size_t *len, loff_t *pos,
755 ssize_t *done, bool *has_work)
756 {
757 struct page *page;
758 size_t copy_len;
759 u8 *to_buff;
760 int ret;
761
762 copy_len = min_t(size_t, *len,
763 sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
764 page = mlx5vf_get_migration_page(vhca_buf, 0);
765 if (!page)
766 return -EINVAL;
767 to_buff = kmap_local_page(page);
768 ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
769 if (ret) {
770 ret = -EFAULT;
771 goto end;
772 }
773
774 *buf += copy_len;
775 *pos += copy_len;
776 *done += copy_len;
777 *len -= copy_len;
778 vhca_buf->length += copy_len;
779 if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
780 u64 record_size;
781 u32 flags;
782
783 record_size = le64_to_cpup((__le64 *)to_buff);
784 if (record_size > MAX_LOAD_SIZE) {
785 ret = -ENOMEM;
786 goto end;
787 }
788
789 migf->record_size = record_size;
790 flags = le32_to_cpup((__le32 *)(to_buff +
791 offsetof(struct mlx5_vf_migration_header, flags)));
792 migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
793 offsetof(struct mlx5_vf_migration_header, tag)));
794 switch (migf->record_tag) {
795 case MLX5_MIGF_HEADER_TAG_FW_DATA:
796 migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
797 break;
798 case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
799 migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
800 break;
801 default:
802 if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
803 ret = -EOPNOTSUPP;
804 goto end;
805 }
806 /* We may read and skip this optional record data */
807 migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
808 }
809
810 migf->max_pos += vhca_buf->length;
811 vhca_buf->length = 0;
812 *has_work = true;
813 }
814 end:
815 kunmap_local(to_buff);
816 return ret;
817 }
818
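/*
 * write() handler of the resume FD. Runs a small state machine: read a
 * record header, size the header-data or image buffer as needed, copy
 * the payload in and finally load the completed image into the device.
 */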
819 static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
820 size_t len, loff_t *pos)
821 {
822 struct mlx5_vf_migration_file *migf = filp->private_data;
823 struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
824 struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header;
825 loff_t requested_length;
826 bool has_work = false;
827 ssize_t done = 0;
828 int ret = 0;
829
830 if (pos)
831 return -ESPIPE;
832 pos = &filp->f_pos;
833
834 if (*pos < 0 ||
835 check_add_overflow((loff_t)len, *pos, &requested_length))
836 return -EINVAL;
837
838 mutex_lock(&migf->mvdev->state_mutex);
839 mutex_lock(&migf->lock);
840 if (migf->state == MLX5_MIGF_STATE_ERROR) {
841 ret = -ENODEV;
842 goto out_unlock;
843 }
844
845 while (len || has_work) {
846 has_work = false;
847 switch (migf->load_state) {
848 case MLX5_VF_LOAD_STATE_READ_HEADER:
849 ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
850 &buf, &len, pos,
851 &done, &has_work);
852 if (ret)
853 goto out_unlock;
854 break;
855 case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
856 if (vhca_buf_header->allocated_length < migf->record_size) {
857 mlx5vf_free_data_buffer(vhca_buf_header);
858
859 migf->buf_header = mlx5vf_alloc_data_buffer(migf,
860 migf->record_size, DMA_NONE);
861 if (IS_ERR(migf->buf_header)) {
862 ret = PTR_ERR(migf->buf_header);
863 migf->buf_header = NULL;
864 goto out_unlock;
865 }
866
867 vhca_buf_header = migf->buf_header;
868 }
869
870 vhca_buf_header->start_pos = migf->max_pos;
871 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
872 break;
873 case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
874 ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
875 &buf, &len, pos, &done);
876 if (ret)
877 goto out_unlock;
878 break;
879 case MLX5_VF_LOAD_STATE_PREP_IMAGE:
880 {
881 u64 size = max(migf->record_size,
882 migf->stop_copy_prep_size);
883
884 if (vhca_buf->allocated_length < size) {
885 mlx5vf_free_data_buffer(vhca_buf);
886
887 migf->buf = mlx5vf_alloc_data_buffer(migf,
888 size, DMA_TO_DEVICE);
889 if (IS_ERR(migf->buf)) {
890 ret = PTR_ERR(migf->buf);
891 migf->buf = NULL;
892 goto out_unlock;
893 }
894
895 vhca_buf = migf->buf;
896 }
897
898 vhca_buf->start_pos = migf->max_pos;
899 migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
900 break;
901 }
902 case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER:
903 ret = mlx5vf_resume_read_image_no_header(vhca_buf,
904 requested_length,
905 &buf, &len, pos, &done);
906 if (ret)
907 goto out_unlock;
908 break;
909 case MLX5_VF_LOAD_STATE_READ_IMAGE:
910 ret = mlx5vf_resume_read_image(migf, vhca_buf,
911 migf->record_size,
912 &buf, &len, pos, &done, &has_work);
913 if (ret)
914 goto out_unlock;
915 break;
916 case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
917 ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
918 if (ret)
919 goto out_unlock;
920 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
921
922 /* prep header buf for next image */
923 vhca_buf_header->length = 0;
924 /* prep data buf for next image */
925 vhca_buf->length = 0;
926
927 break;
928 default:
929 break;
930 }
931 }
932
933 out_unlock:
934 if (ret)
935 migf->state = MLX5_MIGF_STATE_ERROR;
936 mutex_unlock(&migf->lock);
937 mlx5vf_state_mutex_unlock(migf->mvdev);
938 return ret ? ret : done;
939 }
940
941 static const struct file_operations mlx5vf_resume_fops = {
942 .owner = THIS_MODULE,
943 .write = mlx5vf_resume_write,
944 .release = mlx5vf_release_file,
945 .llseek = no_llseek,
946 };
947
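/*
 * Create the resume migration file: allocate the FD, PD and initial data
 * buffer. When PRE_COPY is supported the stream starts with a record
 * header, otherwise the raw image is read directly.
 */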
948 static struct mlx5_vf_migration_file *
949 mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
950 {
951 struct mlx5_vf_migration_file *migf;
952 struct mlx5_vhca_data_buffer *buf;
953 int ret;
954
955 migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
956 if (!migf)
957 return ERR_PTR(-ENOMEM);
958
959 migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
960 O_WRONLY);
961 if (IS_ERR(migf->filp)) {
962 ret = PTR_ERR(migf->filp);
963 goto end;
964 }
965
966 migf->mvdev = mvdev;
967 ret = mlx5vf_cmd_alloc_pd(migf);
968 if (ret)
969 goto out_free;
970
971 buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
972 if (IS_ERR(buf)) {
973 ret = PTR_ERR(buf);
974 goto out_pd;
975 }
976
977 migf->buf = buf;
978 if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
979 buf = mlx5vf_alloc_data_buffer(migf,
980 sizeof(struct mlx5_vf_migration_header), DMA_NONE);
981 if (IS_ERR(buf)) {
982 ret = PTR_ERR(buf);
983 goto out_buf;
984 }
985
986 migf->buf_header = buf;
987 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
988 } else {
989 /* Initial state will be to read the image */
990 migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER;
991 }
992
993 stream_open(migf->filp->f_inode, migf->filp);
994 mutex_init(&migf->lock);
995 INIT_LIST_HEAD(&migf->buf_list);
996 INIT_LIST_HEAD(&migf->avail_list);
997 spin_lock_init(&migf->list_lock);
998 return migf;
999 out_buf:
1000 mlx5vf_free_data_buffer(migf->buf);
1001 out_pd:
1002 mlx5vf_cmd_dealloc_pd(migf);
1003 out_free:
1004 fput(migf->filp);
1005 end:
1006 kfree(migf);
1007 return ERR_PTR(ret);
1008 }
1009
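/*
 * Tear down any open migration FDs: mark them unusable, cancel
 * outstanding async save work and release their resources.
 */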
1010 void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
1011 {
1012 if (mvdev->resuming_migf) {
1013 mlx5vf_disable_fd(mvdev->resuming_migf);
1014 mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
1015 fput(mvdev->resuming_migf->filp);
1016 mvdev->resuming_migf = NULL;
1017 }
1018 if (mvdev->saving_migf) {
1019 mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
1020 cancel_work_sync(&mvdev->saving_migf->async_data.work);
1021 mlx5vf_disable_fd(mvdev->saving_migf);
1022 mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
1023 fput(mvdev->saving_migf->filp);
1024 mvdev->saving_migf = NULL;
1025 }
1026 }
1027
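/*
 * Execute a single arc of the VFIO migration state machine. Each
 * recognized (cur, new) pair maps to a device command (suspend/resume
 * VHCA) and/or creation of a save or resume FD; unexpected arcs are
 * rejected with -EINVAL.
 */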
1028 static struct file *
1029 mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
1030 u32 new)
1031 {
1032 u32 cur = mvdev->mig_state;
1033 int ret;
1034
1035 if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
1036 ret = mlx5vf_cmd_suspend_vhca(mvdev,
1037 MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
1038 if (ret)
1039 return ERR_PTR(ret);
1040 return NULL;
1041 }
1042
1043 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
1044 ret = mlx5vf_cmd_resume_vhca(mvdev,
1045 MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
1046 if (ret)
1047 return ERR_PTR(ret);
1048 return NULL;
1049 }
1050
1051 if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
1052 (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
1053 ret = mlx5vf_cmd_suspend_vhca(mvdev,
1054 MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
1055 if (ret)
1056 return ERR_PTR(ret);
1057 return NULL;
1058 }
1059
1060 if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
1061 (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
1062 ret = mlx5vf_cmd_resume_vhca(mvdev,
1063 MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
1064 if (ret)
1065 return ERR_PTR(ret);
1066 return NULL;
1067 }
1068
1069 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
1070 struct mlx5_vf_migration_file *migf;
1071
1072 migf = mlx5vf_pci_save_device_data(mvdev, false);
1073 if (IS_ERR(migf))
1074 return ERR_CAST(migf);
1075 get_file(migf->filp);
1076 mvdev->saving_migf = migf;
1077 return migf->filp;
1078 }
1079
1080 if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
1081 (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
1082 (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
1083 new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
1084 mlx5vf_disable_fds(mvdev);
1085 return NULL;
1086 }
1087
1088 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
1089 struct mlx5_vf_migration_file *migf;
1090
1091 migf = mlx5vf_pci_resume_device_data(mvdev);
1092 if (IS_ERR(migf))
1093 return ERR_CAST(migf);
1094 get_file(migf->filp);
1095 mvdev->resuming_migf = migf;
1096 return migf->filp;
1097 }
1098
1099 if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
1100 if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
1101 ret = mlx5vf_cmd_load_vhca_state(mvdev,
1102 mvdev->resuming_migf,
1103 mvdev->resuming_migf->buf);
1104 if (ret)
1105 return ERR_PTR(ret);
1106 }
1107 mlx5vf_disable_fds(mvdev);
1108 return NULL;
1109 }
1110
1111 if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
1112 (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
1113 new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
1114 struct mlx5_vf_migration_file *migf;
1115
1116 migf = mlx5vf_pci_save_device_data(mvdev, true);
1117 if (IS_ERR(migf))
1118 return ERR_CAST(migf);
1119 get_file(migf->filp);
1120 mvdev->saving_migf = migf;
1121 return migf->filp;
1122 }
1123
1124 if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
1125 ret = mlx5vf_cmd_suspend_vhca(mvdev,
1126 MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
1127 if (ret)
1128 return ERR_PTR(ret);
1129 ret = mlx5vf_pci_save_device_inc_data(mvdev);
1130 return ret ? ERR_PTR(ret) : NULL;
1131 }
1132
1133 /*
1134 * vfio_mig_get_next_state() does not use arcs other than the above
1135 */
1136 WARN_ON(true);
1137 return ERR_PTR(-EINVAL);
1138 }
1139
1140 /*
1141 * This function is called in all state_mutex unlock cases to
1142 * handle a 'deferred_reset' if one exists.
1143 */
1144 void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
1145 {
1146 again:
1147 spin_lock(&mvdev->reset_lock);
1148 if (mvdev->deferred_reset) {
1149 mvdev->deferred_reset = false;
1150 spin_unlock(&mvdev->reset_lock);
1151 mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
1152 mlx5vf_disable_fds(mvdev);
1153 goto again;
1154 }
1155 mutex_unlock(&mvdev->state_mutex);
1156 spin_unlock(&mvdev->reset_lock);
1157 }
1158
1159 static struct file *
1160 mlx5vf_pci_set_device_state(struct vfio_device *vdev,
1161 enum vfio_device_mig_state new_state)
1162 {
1163 struct mlx5vf_pci_core_device *mvdev = container_of(
1164 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1165 enum vfio_device_mig_state next_state;
1166 struct file *res = NULL;
1167 int ret;
1168
1169 mutex_lock(&mvdev->state_mutex);
1170 while (new_state != mvdev->mig_state) {
1171 ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
1172 new_state, &next_state);
1173 if (ret) {
1174 res = ERR_PTR(ret);
1175 break;
1176 }
1177 res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
1178 if (IS_ERR(res))
1179 break;
1180 mvdev->mig_state = next_state;
1181 if (WARN_ON(res && new_state != mvdev->mig_state)) {
1182 fput(res);
1183 res = ERR_PTR(-EINVAL);
1184 break;
1185 }
1186 }
1187 mlx5vf_state_mutex_unlock(mvdev);
1188 return res;
1189 }
1190
1191 static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
1192 unsigned long *stop_copy_length)
1193 {
1194 struct mlx5vf_pci_core_device *mvdev = container_of(
1195 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1196 size_t state_size;
1197 int ret;
1198
1199 mutex_lock(&mvdev->state_mutex);
1200 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
1201 &state_size, 0);
1202 if (!ret)
1203 *stop_copy_length = state_size;
1204 mlx5vf_state_mutex_unlock(mvdev);
1205 return ret;
1206 }
1207
1208 static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
1209 enum vfio_device_mig_state *curr_state)
1210 {
1211 struct mlx5vf_pci_core_device *mvdev = container_of(
1212 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1213
1214 mutex_lock(&mvdev->state_mutex);
1215 *curr_state = mvdev->mig_state;
1216 mlx5vf_state_mutex_unlock(mvdev);
1217 return 0;
1218 }
1219
1220 static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
1221 {
1222 struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
1223
1224 if (!mvdev->migrate_cap)
1225 return;
1226
1227 /*
1228 * As the higher VFIO layers are holding locks across reset and using
1229 * those same locks with the mm_lock, we need to prevent an ABBA
1230 * deadlock between the state_mutex and the mm_lock.
1231 * In case the state_mutex was already taken, we defer the cleanup work
1232 * to the unlock flow of the other running context.
1233 */
1234 spin_lock(&mvdev->reset_lock);
1235 mvdev->deferred_reset = true;
1236 if (!mutex_trylock(&mvdev->state_mutex)) {
1237 spin_unlock(&mvdev->reset_lock);
1238 return;
1239 }
1240 spin_unlock(&mvdev->reset_lock);
1241 mlx5vf_state_mutex_unlock(mvdev);
1242 }
1243
1244 static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
1245 {
1246 struct mlx5vf_pci_core_device *mvdev = container_of(
1247 core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1248 struct vfio_pci_core_device *vdev = &mvdev->core_device;
1249 int ret;
1250
1251 ret = vfio_pci_core_enable(vdev);
1252 if (ret)
1253 return ret;
1254
1255 if (mvdev->migrate_cap)
1256 mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
1257 vfio_pci_core_finish_enable(vdev);
1258 return 0;
1259 }
1260
1261 static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
1262 {
1263 struct mlx5vf_pci_core_device *mvdev = container_of(
1264 core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1265
1266 mlx5vf_cmd_close_migratable(mvdev);
1267 vfio_pci_core_close_device(core_vdev);
1268 }
1269
1270 static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
1271 .migration_set_state = mlx5vf_pci_set_device_state,
1272 .migration_get_state = mlx5vf_pci_get_device_state,
1273 .migration_get_data_size = mlx5vf_pci_get_data_size,
1274 };
1275
1276 static const struct vfio_log_ops mlx5vf_pci_log_ops = {
1277 .log_start = mlx5vf_start_page_tracker,
1278 .log_stop = mlx5vf_stop_page_tracker,
1279 .log_read_and_clear = mlx5vf_tracker_read_and_clear,
1280 };
1281
1282 static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
1283 {
1284 struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
1285 struct mlx5vf_pci_core_device, core_device.vdev);
1286 int ret;
1287
1288 ret = vfio_pci_core_init_dev(core_vdev);
1289 if (ret)
1290 return ret;
1291
1292 mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
1293 &mlx5vf_pci_log_ops);
1294
1295 return 0;
1296 }
1297
1298 static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
1299 {
1300 struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
1301 struct mlx5vf_pci_core_device, core_device.vdev);
1302
1303 mlx5vf_cmd_remove_migratable(mvdev);
1304 vfio_pci_core_release_dev(core_vdev);
1305 }
1306
1307 static const struct vfio_device_ops mlx5vf_pci_ops = {
1308 .name = "mlx5-vfio-pci",
1309 .init = mlx5vf_pci_init_dev,
1310 .release = mlx5vf_pci_release_dev,
1311 .open_device = mlx5vf_pci_open_device,
1312 .close_device = mlx5vf_pci_close_device,
1313 .ioctl = vfio_pci_core_ioctl,
1314 .device_feature = vfio_pci_core_ioctl_feature,
1315 .read = vfio_pci_core_read,
1316 .write = vfio_pci_core_write,
1317 .mmap = vfio_pci_core_mmap,
1318 .request = vfio_pci_core_request,
1319 .match = vfio_pci_core_match,
1320 .bind_iommufd = vfio_iommufd_physical_bind,
1321 .unbind_iommufd = vfio_iommufd_physical_unbind,
1322 .attach_ioas = vfio_iommufd_physical_attach_ioas,
1323 .detach_ioas = vfio_iommufd_physical_detach_ioas,
1324 };
1325
1326 static int mlx5vf_pci_probe(struct pci_dev *pdev,
1327 const struct pci_device_id *id)
1328 {
1329 struct mlx5vf_pci_core_device *mvdev;
1330 int ret;
1331
1332 mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
1333 &pdev->dev, &mlx5vf_pci_ops);
1334 if (IS_ERR(mvdev))
1335 return PTR_ERR(mvdev);
1336
1337 dev_set_drvdata(&pdev->dev, &mvdev->core_device);
1338 ret = vfio_pci_core_register_device(&mvdev->core_device);
1339 if (ret)
1340 goto out_put_vdev;
1341 return 0;
1342
1343 out_put_vdev:
1344 vfio_put_device(&mvdev->core_device.vdev);
1345 return ret;
1346 }
1347
1348 static void mlx5vf_pci_remove(struct pci_dev *pdev)
1349 {
1350 struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
1351
1352 vfio_pci_core_unregister_device(&mvdev->core_device);
1353 vfio_put_device(&mvdev->core_device.vdev);
1354 }
1355
1356 static const struct pci_device_id mlx5vf_pci_table[] = {
1357 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
1358 {}
1359 };
1360
1361 MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
1362
1363 static const struct pci_error_handlers mlx5vf_err_handlers = {
1364 .reset_done = mlx5vf_pci_aer_reset_done,
1365 .error_detected = vfio_pci_core_aer_err_detected,
1366 };
1367
1368 static struct pci_driver mlx5vf_pci_driver = {
1369 .name = KBUILD_MODNAME,
1370 .id_table = mlx5vf_pci_table,
1371 .probe = mlx5vf_pci_probe,
1372 .remove = mlx5vf_pci_remove,
1373 .err_handler = &mlx5vf_err_handlers,
1374 .driver_managed_dma = true,
1375 };
1376
1377 module_pci_driver(mlx5vf_pci_driver);
1378
1379 MODULE_LICENSE("GPL");
1380 MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
1381 MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
1382 MODULE_DESCRIPTION(
1383 "MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");
1384