xref: /openbmc/linux/drivers/vfio/pci/mlx5/main.c (revision d1f0f50f)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4  */
5 
6 #include <linux/device.h>
7 #include <linux/eventfd.h>
8 #include <linux/file.h>
9 #include <linux/interrupt.h>
10 #include <linux/iommu.h>
11 #include <linux/module.h>
12 #include <linux/mutex.h>
13 #include <linux/notifier.h>
14 #include <linux/pci.h>
15 #include <linux/pm_runtime.h>
16 #include <linux/types.h>
17 #include <linux/uaccess.h>
18 #include <linux/vfio.h>
19 #include <linux/sched/mm.h>
20 #include <linux/anon_inodes.h>
21 
22 #include "cmd.h"
23 
24 /* Arbitrary limit to prevent userspace from consuming unbounded memory */
25 #define MAX_MIGRATION_SIZE (512*1024*1024)
26 
27 static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
28 {
29 	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
30 
31 	return container_of(core_device, struct mlx5vf_pci_core_device,
32 			    core_device);
33 }
34 
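/*
 * Return the page backing byte @offset within the buffer's scatter-gather
 * table. The last looked-up position is cached so that the expected
 * sequential access pattern is resolved without rescanning the table.
 */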
35 struct page *
36 mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
37 			  unsigned long offset)
38 {
39 	unsigned long cur_offset = 0;
40 	struct scatterlist *sg;
41 	unsigned int i;
42 
43 	/* All accesses are sequential */
44 	if (offset < buf->last_offset || !buf->last_offset_sg) {
45 		buf->last_offset = 0;
46 		buf->last_offset_sg = buf->table.sgt.sgl;
47 		buf->sg_last_entry = 0;
48 	}
49 
50 	cur_offset = buf->last_offset;
51 
52 	for_each_sg(buf->last_offset_sg, sg,
53 			buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
54 		if (offset < sg->length + cur_offset) {
55 			buf->last_offset_sg = sg;
56 			buf->sg_last_entry += i;
57 			buf->last_offset = cur_offset;
58 			return nth_page(sg_page(sg),
59 					(offset - cur_offset) / PAGE_SIZE);
60 		}
61 		cur_offset += sg->length;
62 	}
63 	return NULL;
64 }
65 
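/*
 * Bulk-allocate @npages pages and append them to the buffer's
 * scatter-gather table, growing its allocated_length accordingly.
 */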
66 int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
67 			       unsigned int npages)
68 {
69 	unsigned int to_alloc = npages;
70 	struct page **page_list;
71 	unsigned long filled;
72 	unsigned int to_fill;
73 	int ret;
74 
75 	to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
76 	page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL);
77 	if (!page_list)
78 		return -ENOMEM;
79 
80 	do {
81 		filled = alloc_pages_bulk_array(GFP_KERNEL, to_fill, page_list);
82 		if (!filled) {
83 			ret = -ENOMEM;
84 			goto err;
85 		}
86 		to_alloc -= filled;
87 		ret = sg_alloc_append_table_from_pages(
88 			&buf->table, page_list, filled, 0,
89 			filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
90 			GFP_KERNEL);
91 
92 		if (ret)
93 			goto err;
94 		buf->allocated_length += filled * PAGE_SIZE;
95 		/* clean input for another bulk allocation */
96 		memset(page_list, 0, filled * sizeof(*page_list));
97 		to_fill = min_t(unsigned int, to_alloc,
98 				PAGE_SIZE / sizeof(*page_list));
99 	} while (to_alloc > 0);
100 
101 	kvfree(page_list);
102 	return 0;
103 
104 err:
105 	kvfree(page_list);
106 	return ret;
107 }
108 
109 static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
110 {
111 	mutex_lock(&migf->lock);
112 	migf->state = MLX5_MIGF_STATE_ERROR;
113 	migf->filp->f_pos = 0;
114 	mutex_unlock(&migf->lock);
115 }
116 
117 static int mlx5vf_release_file(struct inode *inode, struct file *filp)
118 {
119 	struct mlx5_vf_migration_file *migf = filp->private_data;
120 
121 	mlx5vf_disable_fd(migf);
122 	mutex_destroy(&migf->lock);
123 	kfree(migf);
124 	return 0;
125 }
126 
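/*
 * Return the first queued data buffer if it covers @pos, or flag
 * end-of-data when no buffer is queued.
 */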
127 static struct mlx5_vhca_data_buffer *
128 mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
129 			      bool *end_of_data)
130 {
131 	struct mlx5_vhca_data_buffer *buf;
132 	bool found = false;
133 
134 	*end_of_data = false;
135 	spin_lock_irq(&migf->list_lock);
136 	if (list_empty(&migf->buf_list)) {
137 		*end_of_data = true;
138 		goto end;
139 	}
140 
141 	buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
142 			       buf_elm);
143 	if (pos >= buf->start_pos &&
144 	    pos < buf->start_pos + buf->length) {
145 		found = true;
146 		goto end;
147 	}
148 
149 	/*
150 	 * Since this is a stream-based FD, the data is always expected to be
151 	 * in the first chunk.
152 	 */
153 	migf->state = MLX5_MIGF_STATE_ERROR;
154 
155 end:
156 	spin_unlock_irq(&migf->list_lock);
157 	return found ? buf : NULL;
158 }
159 
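/*
 * Copy data from a vhca data buffer to userspace, one page at a time.
 * A fully consumed buffer is moved to the avail_list for reuse.
 */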
160 static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
161 			       char __user **buf, size_t *len, loff_t *pos)
162 {
163 	unsigned long offset;
164 	ssize_t done = 0;
165 	size_t copy_len;
166 
167 	copy_len = min_t(size_t,
168 			 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
169 	while (copy_len) {
170 		size_t page_offset;
171 		struct page *page;
172 		size_t page_len;
173 		u8 *from_buff;
174 		int ret;
175 
176 		offset = *pos - vhca_buf->start_pos;
177 		page_offset = offset % PAGE_SIZE;
178 		offset -= page_offset;
179 		page = mlx5vf_get_migration_page(vhca_buf, offset);
180 		if (!page)
181 			return -EINVAL;
182 		page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
183 		from_buff = kmap_local_page(page);
184 		ret = copy_to_user(*buf, from_buff + page_offset, page_len);
185 		kunmap_local(from_buff);
186 		if (ret)
187 			return -EFAULT;
188 		*pos += page_len;
189 		*len -= page_len;
190 		*buf += page_len;
191 		done += page_len;
192 		copy_len -= page_len;
193 	}
194 
195 	if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
196 		spin_lock_irq(&vhca_buf->migf->list_lock);
197 		list_del_init(&vhca_buf->buf_elm);
198 		list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
199 		spin_unlock_irq(&vhca_buf->migf->list_lock);
200 	}
201 
202 	return done;
203 }
204 
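/*
 * read() handler of the SAVE FD. Unless O_NONBLOCK is set, wait until data
 * is queued, the save completes or an error occurs, then drain the queued
 * data buffers in stream order.
 */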
205 static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
206 			       loff_t *pos)
207 {
208 	struct mlx5_vf_migration_file *migf = filp->private_data;
209 	struct mlx5_vhca_data_buffer *vhca_buf;
210 	bool first_loop_call = true;
211 	bool end_of_data;
212 	ssize_t done = 0;
213 
214 	if (pos)
215 		return -ESPIPE;
216 	pos = &filp->f_pos;
217 
218 	if (!(filp->f_flags & O_NONBLOCK)) {
219 		if (wait_event_interruptible(migf->poll_wait,
220 				!list_empty(&migf->buf_list) ||
221 				migf->state == MLX5_MIGF_STATE_ERROR ||
222 				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
223 				migf->state == MLX5_MIGF_STATE_PRE_COPY ||
224 				migf->state == MLX5_MIGF_STATE_COMPLETE))
225 			return -ERESTARTSYS;
226 	}
227 
228 	mutex_lock(&migf->lock);
229 	if (migf->state == MLX5_MIGF_STATE_ERROR) {
230 		done = -ENODEV;
231 		goto out_unlock;
232 	}
233 
234 	while (len) {
235 		ssize_t count;
236 
237 		vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
238 							 &end_of_data);
239 		if (first_loop_call) {
240 			first_loop_call = false;
241 			/* Temporary end of file as part of PRE_COPY */
242 			if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
243 				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
244 				done = -ENOMSG;
245 				goto out_unlock;
246 			}
247 
248 			if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
249 				if (filp->f_flags & O_NONBLOCK) {
250 					done = -EAGAIN;
251 					goto out_unlock;
252 				}
253 			}
254 		}
255 
256 		if (end_of_data)
257 			goto out_unlock;
258 
259 		if (!vhca_buf) {
260 			done = -EINVAL;
261 			goto out_unlock;
262 		}
263 
264 		count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
265 		if (count < 0) {
266 			done = count;
267 			goto out_unlock;
268 		}
269 		done += count;
270 	}
271 
272 out_unlock:
273 	mutex_unlock(&migf->lock);
274 	return done;
275 }
276 
277 static __poll_t mlx5vf_save_poll(struct file *filp,
278 				 struct poll_table_struct *wait)
279 {
280 	struct mlx5_vf_migration_file *migf = filp->private_data;
281 	__poll_t pollflags = 0;
282 
283 	poll_wait(filp, &migf->poll_wait, wait);
284 
285 	mutex_lock(&migf->lock);
286 	if (migf->state == MLX5_MIGF_STATE_ERROR)
287 		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
288 	else if (!list_empty(&migf->buf_list) ||
289 		 migf->state == MLX5_MIGF_STATE_COMPLETE)
290 		pollflags = EPOLLIN | EPOLLRDNORM;
291 	mutex_unlock(&migf->lock);
292 
293 	return pollflags;
294 }
295 
296 /*
297  * The FD stays exposed and the user can keep using it after an error.
298  * Mark migf as being in error and wake up any waiting reader.
299  */
300 static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
301 {
302 	migf->state = MLX5_MIGF_STATE_ERROR;
303 	wake_up_interruptible(&migf->poll_wait);
304 }
305 
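/*
 * VFIO_MIG_GET_PRECOPY_INFO: report how many initial/dirty bytes are
 * currently readable and, once the transferred data was fully consumed,
 * trigger saving of a further incremental state.
 */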
306 static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
307 				 unsigned long arg)
308 {
309 	struct mlx5_vf_migration_file *migf = filp->private_data;
310 	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
311 	struct mlx5_vhca_data_buffer *buf;
312 	struct vfio_precopy_info info = {};
313 	loff_t *pos = &filp->f_pos;
314 	unsigned long minsz;
315 	size_t inc_length = 0;
316 	bool end_of_data;
317 	int ret;
318 
319 	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
320 		return -ENOTTY;
321 
322 	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
323 
324 	if (copy_from_user(&info, (void __user *)arg, minsz))
325 		return -EFAULT;
326 
327 	if (info.argsz < minsz)
328 		return -EINVAL;
329 
330 	mutex_lock(&mvdev->state_mutex);
331 	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
332 	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
333 		ret = -EINVAL;
334 		goto err_state_unlock;
335 	}
336 
337 	/*
338 	 * We can't issue a SAVE command while the device is suspended, so in
339 	 * VFIO_DEVICE_STATE_PRE_COPY_P2P there is no reason to query for
340 	 * extra bytes that can't be read anyway.
341 	 */
342 	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
343 		/*
344 		 * Once the query returns it's guaranteed that there is no
345 		 * active SAVE command.
346 		 * Hence, the code below is safe with the proper locks held.
347 		 */
348 		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
349 							    MLX5VF_QUERY_INC);
350 		if (ret)
351 			goto err_state_unlock;
352 	}
353 
354 	mutex_lock(&migf->lock);
355 	if (migf->state == MLX5_MIGF_STATE_ERROR) {
356 		ret = -ENODEV;
357 		goto err_migf_unlock;
358 	}
359 
360 	buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data);
361 	if (buf) {
362 		if (buf->start_pos == 0) {
363 			info.initial_bytes = buf->header_image_size - *pos;
364 		} else if (buf->start_pos ==
365 				sizeof(struct mlx5_vf_migration_header)) {
366 			/* First data buffer following the header */
367 			info.initial_bytes = buf->start_pos +
368 						buf->length - *pos;
369 		} else {
370 			info.dirty_bytes = buf->start_pos + buf->length - *pos;
371 		}
372 	} else {
373 		if (!end_of_data) {
374 			ret = -EINVAL;
375 			goto err_migf_unlock;
376 		}
377 
378 		info.dirty_bytes = inc_length;
379 	}
380 
381 	if (!end_of_data || !inc_length) {
382 		mutex_unlock(&migf->lock);
383 		goto done;
384 	}
385 
386 	mutex_unlock(&migf->lock);
387 	/*
388 	 * The current state has been fully transferred but the device still
389 	 * has dirty state; save a new state so it is ready to be read.
390 	 */
391 	buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
392 	if (IS_ERR(buf)) {
393 		ret = PTR_ERR(buf);
394 		mlx5vf_mark_err(migf);
395 		goto err_state_unlock;
396 	}
397 
398 	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
399 	if (ret) {
400 		mlx5vf_mark_err(migf);
401 		mlx5vf_put_data_buffer(buf);
402 		goto err_state_unlock;
403 	}
404 
405 done:
406 	mlx5vf_state_mutex_unlock(mvdev);
407 	return copy_to_user((void __user *)arg, &info, minsz) ? -EFAULT : 0;
408 err_migf_unlock:
409 	mutex_unlock(&migf->lock);
410 err_state_unlock:
411 	mlx5vf_state_mutex_unlock(mvdev);
412 	return ret;
413 }
414 
415 static const struct file_operations mlx5vf_save_fops = {
416 	.owner = THIS_MODULE,
417 	.read = mlx5vf_save_read,
418 	.poll = mlx5vf_save_poll,
419 	.unlocked_ioctl = mlx5vf_precopy_ioctl,
420 	.compat_ioctl = compat_ptr_ioctl,
421 	.release = mlx5vf_release_file,
422 	.llseek = no_llseek,
423 };
424 
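/*
 * Save the final incremental device state, used when moving from
 * PRE_COPY_P2P to STOP_COPY.
 */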
425 static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
426 {
427 	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
428 	struct mlx5_vhca_data_buffer *buf;
429 	size_t length;
430 	int ret;
431 
432 	if (migf->state == MLX5_MIGF_STATE_ERROR)
433 		return -ENODEV;
434 
435 	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
436 				MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
437 	if (ret)
438 		goto err;
439 
440 	buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
441 	if (IS_ERR(buf)) {
442 		ret = PTR_ERR(buf);
443 		goto err;
444 	}
445 
446 	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
447 	if (ret)
448 		goto err_save;
449 
450 	return 0;
451 
452 err_save:
453 	mlx5vf_put_data_buffer(buf);
454 err:
455 	mlx5vf_mark_err(migf);
456 	return ret;
457 }
458 
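/*
 * Create the SAVE migration file: allocate the FD and the PD, query the
 * required state size and issue an asynchronous SAVE command. @track is
 * set when saving starts as part of PRE_COPY.
 */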
459 static struct mlx5_vf_migration_file *
460 mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
461 {
462 	struct mlx5_vf_migration_file *migf;
463 	struct mlx5_vhca_data_buffer *buf;
464 	size_t length;
465 	int ret;
466 
467 	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
468 	if (!migf)
469 		return ERR_PTR(-ENOMEM);
470 
471 	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
472 					O_RDONLY);
473 	if (IS_ERR(migf->filp)) {
474 		ret = PTR_ERR(migf->filp);
475 		goto end;
476 	}
477 
478 	migf->mvdev = mvdev;
479 	ret = mlx5vf_cmd_alloc_pd(migf);
480 	if (ret)
481 		goto out_free;
482 
483 	stream_open(migf->filp->f_inode, migf->filp);
484 	mutex_init(&migf->lock);
485 	init_waitqueue_head(&migf->poll_wait);
486 	init_completion(&migf->save_comp);
487 	/*
488 	 * save_comp is being used as a binary semaphore built from
489 	 * a completion. A normal mutex cannot be used because the lock is
490 	 * passed between kernel threads and lockdep can't model this.
491 	 */
492 	complete(&migf->save_comp);
493 	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
494 	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
495 	INIT_LIST_HEAD(&migf->buf_list);
496 	INIT_LIST_HEAD(&migf->avail_list);
497 	spin_lock_init(&migf->list_lock);
498 	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
499 	if (ret)
500 		goto out_pd;
501 
502 	buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
503 	if (IS_ERR(buf)) {
504 		ret = PTR_ERR(buf);
505 		goto out_pd;
506 	}
507 
508 	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
509 	if (ret)
510 		goto out_save;
511 	return migf;
512 out_save:
513 	mlx5vf_free_data_buffer(buf);
514 out_pd:
515 	mlx5vf_cmd_dealloc_pd(migf);
516 out_free:
517 	fput(migf->filp);
518 end:
519 	kfree(migf);
520 	return ERR_PTR(ret);
521 }
522 
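/*
 * Copy at most one page of user data into the migration buffer at the
 * current stream position.
 */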
523 static int
524 mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
525 			      const char __user **buf, size_t *len,
526 			      loff_t *pos, ssize_t *done)
527 {
528 	unsigned long offset;
529 	size_t page_offset;
530 	struct page *page;
531 	size_t page_len;
532 	u8 *to_buff;
533 	int ret;
534 
535 	offset = *pos - vhca_buf->start_pos;
536 	page_offset = offset % PAGE_SIZE;
537 
538 	page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
539 	if (!page)
540 		return -EINVAL;
541 	page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
542 	to_buff = kmap_local_page(page);
543 	ret = copy_from_user(to_buff + page_offset, *buf, page_len);
544 	kunmap_local(to_buff);
545 	if (ret)
546 		return -EFAULT;
547 
548 	*pos += page_len;
549 	*done += page_len;
550 	*buf += page_len;
551 	*len -= page_len;
552 	vhca_buf->length += page_len;
553 	return 0;
554 }
555 
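/*
 * Legacy load path (no migration header): grow the data buffer up to the
 * requested length and copy the image directly from userspace.
 */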
556 static int
557 mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf,
558 				   loff_t requested_length,
559 				   const char __user **buf, size_t *len,
560 				   loff_t *pos, ssize_t *done)
561 {
562 	int ret;
563 
564 	if (requested_length > MAX_MIGRATION_SIZE)
565 		return -ENOMEM;
566 
567 	if (vhca_buf->allocated_length < requested_length) {
568 		ret = mlx5vf_add_migration_pages(
569 			vhca_buf,
570 			DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
571 				     PAGE_SIZE));
572 		if (ret)
573 			return ret;
574 	}
575 
576 	while (*len) {
577 		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos,
578 						    done);
579 		if (ret)
580 			return ret;
581 	}
582 
583 	return 0;
584 }
585 
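/*
 * Accumulate user data into the image buffer; once @image_size bytes were
 * received, switch to the LOAD_IMAGE state.
 */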
586 static ssize_t
587 mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
588 			 struct mlx5_vhca_data_buffer *vhca_buf,
589 			 size_t image_size, const char __user **buf,
590 			 size_t *len, loff_t *pos, ssize_t *done,
591 			 bool *has_work)
592 {
593 	size_t copy_len, to_copy;
594 	int ret;
595 
596 	to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
597 	copy_len = to_copy;
598 	while (to_copy) {
599 		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
600 						    done);
601 		if (ret)
602 			return ret;
603 	}
604 
605 	*len -= copy_len;
606 	if (vhca_buf->length == image_size) {
607 		migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
608 		migf->max_pos += image_size;
609 		*has_work = true;
610 	}
611 
612 	return 0;
613 }
614 
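/*
 * Parse the migration stream header: extract the image size and reject
 * unknown flags before moving to the PREP_IMAGE state.
 */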
615 static int
616 mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
617 			  struct mlx5_vhca_data_buffer *vhca_buf,
618 			  const char __user **buf,
619 			  size_t *len, loff_t *pos,
620 			  ssize_t *done, bool *has_work)
621 {
622 	struct page *page;
623 	size_t copy_len;
624 	u8 *to_buff;
625 	int ret;
626 
627 	copy_len = min_t(size_t, *len,
628 		sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
629 	page = mlx5vf_get_migration_page(vhca_buf, 0);
630 	if (!page)
631 		return -EINVAL;
632 	to_buff = kmap_local_page(page);
633 	ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
634 	if (ret) {
635 		ret = -EFAULT;
636 		goto end;
637 	}
638 
639 	*buf += copy_len;
640 	*pos += copy_len;
641 	*done += copy_len;
642 	*len -= copy_len;
643 	vhca_buf->length += copy_len;
644 	if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
645 		u64 flags;
646 
647 		vhca_buf->header_image_size = le64_to_cpup((__le64 *)to_buff);
648 		if (vhca_buf->header_image_size > MAX_MIGRATION_SIZE) {
649 			ret = -ENOMEM;
650 			goto end;
651 		}
652 
653 		flags = le64_to_cpup((__le64 *)(to_buff +
654 			    offsetof(struct mlx5_vf_migration_header, flags)));
655 		if (flags) {
656 			ret = -EOPNOTSUPP;
657 			goto end;
658 		}
659 
660 		migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
661 		migf->max_pos += vhca_buf->length;
662 		*has_work = true;
663 	}
664 end:
665 	kunmap_local(to_buff);
666 	return ret;
667 }
668 
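/*
 * write() handler of the RESUME FD. Runs a small state machine that reads
 * the header (when PRE_COPY is supported), sizes the data buffer, reads
 * the image and finally loads it into the device.
 */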
669 static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
670 				   size_t len, loff_t *pos)
671 {
672 	struct mlx5_vf_migration_file *migf = filp->private_data;
673 	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
674 	struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header;
675 	loff_t requested_length;
676 	bool has_work = false;
677 	ssize_t done = 0;
678 	int ret = 0;
679 
680 	if (pos)
681 		return -ESPIPE;
682 	pos = &filp->f_pos;
683 
684 	if (*pos < 0 ||
685 	    check_add_overflow((loff_t)len, *pos, &requested_length))
686 		return -EINVAL;
687 
688 	mutex_lock(&migf->mvdev->state_mutex);
689 	mutex_lock(&migf->lock);
690 	if (migf->state == MLX5_MIGF_STATE_ERROR) {
691 		ret = -ENODEV;
692 		goto out_unlock;
693 	}
694 
695 	while (len || has_work) {
696 		has_work = false;
697 		switch (migf->load_state) {
698 		case MLX5_VF_LOAD_STATE_READ_HEADER:
699 			ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
700 							&buf, &len, pos,
701 							&done, &has_work);
702 			if (ret)
703 				goto out_unlock;
704 			break;
705 		case MLX5_VF_LOAD_STATE_PREP_IMAGE:
706 		{
707 			u64 size = vhca_buf_header->header_image_size;
708 
709 			if (vhca_buf->allocated_length < size) {
710 				mlx5vf_free_data_buffer(vhca_buf);
711 
712 				migf->buf = mlx5vf_alloc_data_buffer(migf,
713 							size, DMA_TO_DEVICE);
714 				if (IS_ERR(migf->buf)) {
715 					ret = PTR_ERR(migf->buf);
716 					migf->buf = NULL;
717 					goto out_unlock;
718 				}
719 
720 				vhca_buf = migf->buf;
721 			}
722 
723 			vhca_buf->start_pos = migf->max_pos;
724 			migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
725 			break;
726 		}
727 		case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER:
728 			ret = mlx5vf_resume_read_image_no_header(vhca_buf,
729 						requested_length,
730 						&buf, &len, pos, &done);
731 			if (ret)
732 				goto out_unlock;
733 			break;
734 		case MLX5_VF_LOAD_STATE_READ_IMAGE:
735 			ret = mlx5vf_resume_read_image(migf, vhca_buf,
736 						vhca_buf_header->header_image_size,
737 						&buf, &len, pos, &done, &has_work);
738 			if (ret)
739 				goto out_unlock;
740 			break;
741 		case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
742 			ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
743 			if (ret)
744 				goto out_unlock;
745 			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
746 
747 			/* prep header buf for next image */
748 			vhca_buf_header->length = 0;
749 			vhca_buf_header->header_image_size = 0;
750 			/* prep data buf for next image */
751 			vhca_buf->length = 0;
752 
753 			break;
754 		default:
755 			break;
756 		}
757 	}
758 
759 out_unlock:
760 	if (ret)
761 		migf->state = MLX5_MIGF_STATE_ERROR;
762 	mutex_unlock(&migf->lock);
763 	mlx5vf_state_mutex_unlock(migf->mvdev);
764 	return ret ? ret : done;
765 }
766 
767 static const struct file_operations mlx5vf_resume_fops = {
768 	.owner = THIS_MODULE,
769 	.write = mlx5vf_resume_write,
770 	.release = mlx5vf_release_file,
771 	.llseek = no_llseek,
772 };
773 
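/*
 * Create the RESUME migration file: allocate the FD, the PD and the
 * initial data/header buffers and set the starting load state.
 */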
774 static struct mlx5_vf_migration_file *
775 mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
776 {
777 	struct mlx5_vf_migration_file *migf;
778 	struct mlx5_vhca_data_buffer *buf;
779 	int ret;
780 
781 	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
782 	if (!migf)
783 		return ERR_PTR(-ENOMEM);
784 
785 	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
786 					O_WRONLY);
787 	if (IS_ERR(migf->filp)) {
788 		ret = PTR_ERR(migf->filp);
789 		goto end;
790 	}
791 
792 	migf->mvdev = mvdev;
793 	ret = mlx5vf_cmd_alloc_pd(migf);
794 	if (ret)
795 		goto out_free;
796 
797 	buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
798 	if (IS_ERR(buf)) {
799 		ret = PTR_ERR(buf);
800 		goto out_pd;
801 	}
802 
803 	migf->buf = buf;
804 	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
805 		buf = mlx5vf_alloc_data_buffer(migf,
806 			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
807 		if (IS_ERR(buf)) {
808 			ret = PTR_ERR(buf);
809 			goto out_buf;
810 		}
811 
812 		migf->buf_header = buf;
813 		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
814 	} else {
815 		/* No header without PRE_COPY; read the image directly */
816 		migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER;
817 	}
818 
819 	stream_open(migf->filp->f_inode, migf->filp);
820 	mutex_init(&migf->lock);
821 	INIT_LIST_HEAD(&migf->buf_list);
822 	INIT_LIST_HEAD(&migf->avail_list);
823 	spin_lock_init(&migf->list_lock);
824 	return migf;
825 out_buf:
826 	mlx5vf_free_data_buffer(buf);
827 out_pd:
828 	mlx5vf_cmd_dealloc_pd(migf);
829 out_free:
830 	fput(migf->filp);
831 end:
832 	kfree(migf);
833 	return ERR_PTR(ret);
834 }
835 
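/*
 * Tear down any open SAVE/RESUME migration files and release their
 * resources.
 */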
836 void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
837 {
838 	if (mvdev->resuming_migf) {
839 		mlx5vf_disable_fd(mvdev->resuming_migf);
840 		mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
841 		fput(mvdev->resuming_migf->filp);
842 		mvdev->resuming_migf = NULL;
843 	}
844 	if (mvdev->saving_migf) {
845 		mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
846 		cancel_work_sync(&mvdev->saving_migf->async_data.work);
847 		mlx5vf_disable_fd(mvdev->saving_migf);
848 		mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
849 		fput(mvdev->saving_migf->filp);
850 		mvdev->saving_migf = NULL;
851 	}
852 }
853 
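/*
 * Execute a single arc of the VFIO migration state machine. Arcs that
 * expose a data FD return the SAVE/RESUME file, all others return NULL.
 */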
854 static struct file *
855 mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
856 				    u32 new)
857 {
858 	u32 cur = mvdev->mig_state;
859 	int ret;
860 
861 	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
862 		ret = mlx5vf_cmd_suspend_vhca(mvdev,
863 			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
864 		if (ret)
865 			return ERR_PTR(ret);
866 		return NULL;
867 	}
868 
869 	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
870 		ret = mlx5vf_cmd_resume_vhca(mvdev,
871 			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
872 		if (ret)
873 			return ERR_PTR(ret);
874 		return NULL;
875 	}
876 
877 	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
878 	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
879 		ret = mlx5vf_cmd_suspend_vhca(mvdev,
880 			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
881 		if (ret)
882 			return ERR_PTR(ret);
883 		return NULL;
884 	}
885 
886 	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
887 	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
888 		ret = mlx5vf_cmd_resume_vhca(mvdev,
889 			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
890 		if (ret)
891 			return ERR_PTR(ret);
892 		return NULL;
893 	}
894 
895 	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
896 		struct mlx5_vf_migration_file *migf;
897 
898 		migf = mlx5vf_pci_save_device_data(mvdev, false);
899 		if (IS_ERR(migf))
900 			return ERR_CAST(migf);
901 		get_file(migf->filp);
902 		mvdev->saving_migf = migf;
903 		return migf->filp;
904 	}
905 
906 	if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
907 	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
908 	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
909 	     new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
910 		mlx5vf_disable_fds(mvdev);
911 		return NULL;
912 	}
913 
914 	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
915 		struct mlx5_vf_migration_file *migf;
916 
917 		migf = mlx5vf_pci_resume_device_data(mvdev);
918 		if (IS_ERR(migf))
919 			return ERR_CAST(migf);
920 		get_file(migf->filp);
921 		mvdev->resuming_migf = migf;
922 		return migf->filp;
923 	}
924 
925 	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
926 		if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
927 			ret = mlx5vf_cmd_load_vhca_state(mvdev,
928 							 mvdev->resuming_migf,
929 							 mvdev->resuming_migf->buf);
930 			if (ret)
931 				return ERR_PTR(ret);
932 		}
933 		mlx5vf_disable_fds(mvdev);
934 		return NULL;
935 	}
936 
937 	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
938 	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
939 	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
940 		struct mlx5_vf_migration_file *migf;
941 
942 		migf = mlx5vf_pci_save_device_data(mvdev, true);
943 		if (IS_ERR(migf))
944 			return ERR_CAST(migf);
945 		get_file(migf->filp);
946 		mvdev->saving_migf = migf;
947 		return migf->filp;
948 	}
949 
950 	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
951 		ret = mlx5vf_cmd_suspend_vhca(mvdev,
952 			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
953 		if (ret)
954 			return ERR_PTR(ret);
955 		ret = mlx5vf_pci_save_device_inc_data(mvdev);
956 		return ret ? ERR_PTR(ret) : NULL;
957 	}
958 
959 	/*
960 	 * vfio_mig_get_next_state() does not use arcs other than the above
961 	 */
962 	WARN_ON(true);
963 	return ERR_PTR(-EINVAL);
964 }
965 
966 /*
967  * This function is called in all state_mutex unlock cases to
968  * handle a 'deferred_reset', if one exists.
969  */
970 void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
971 {
972 again:
973 	spin_lock(&mvdev->reset_lock);
974 	if (mvdev->deferred_reset) {
975 		mvdev->deferred_reset = false;
976 		spin_unlock(&mvdev->reset_lock);
977 		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
978 		mlx5vf_disable_fds(mvdev);
979 		goto again;
980 	}
981 	mutex_unlock(&mvdev->state_mutex);
982 	spin_unlock(&mvdev->reset_lock);
983 }
984 
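/*
 * Step the device through the intermediate states returned by
 * vfio_mig_get_next_state() until @new_state is reached.
 */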
985 static struct file *
986 mlx5vf_pci_set_device_state(struct vfio_device *vdev,
987 			    enum vfio_device_mig_state new_state)
988 {
989 	struct mlx5vf_pci_core_device *mvdev = container_of(
990 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
991 	enum vfio_device_mig_state next_state;
992 	struct file *res = NULL;
993 	int ret;
994 
995 	mutex_lock(&mvdev->state_mutex);
996 	while (new_state != mvdev->mig_state) {
997 		ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
998 					      new_state, &next_state);
999 		if (ret) {
1000 			res = ERR_PTR(ret);
1001 			break;
1002 		}
1003 		res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
1004 		if (IS_ERR(res))
1005 			break;
1006 		mvdev->mig_state = next_state;
1007 		if (WARN_ON(res && new_state != mvdev->mig_state)) {
1008 			fput(res);
1009 			res = ERR_PTR(-EINVAL);
1010 			break;
1011 		}
1012 	}
1013 	mlx5vf_state_mutex_unlock(mvdev);
1014 	return res;
1015 }
1016 
1017 static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
1018 				    unsigned long *stop_copy_length)
1019 {
1020 	struct mlx5vf_pci_core_device *mvdev = container_of(
1021 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1022 	size_t state_size;
1023 	int ret;
1024 
1025 	mutex_lock(&mvdev->state_mutex);
1026 	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
1027 						    &state_size, 0);
1028 	if (!ret)
1029 		*stop_copy_length = state_size;
1030 	mlx5vf_state_mutex_unlock(mvdev);
1031 	return ret;
1032 }
1033 
1034 static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
1035 				       enum vfio_device_mig_state *curr_state)
1036 {
1037 	struct mlx5vf_pci_core_device *mvdev = container_of(
1038 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1039 
1040 	mutex_lock(&mvdev->state_mutex);
1041 	*curr_state = mvdev->mig_state;
1042 	mlx5vf_state_mutex_unlock(mvdev);
1043 	return 0;
1044 }
1045 
1046 static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
1047 {
1048 	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
1049 
1050 	if (!mvdev->migrate_cap)
1051 		return;
1052 
1053 	/*
1054 	 * The higher VFIO layers hold locks across reset and use those same
1055 	 * locks together with mm_lock, so we must prevent an ABBA deadlock
1056 	 * between the state_mutex and mm_lock.
1057 	 * If the state_mutex was already taken, defer the cleanup work to
1058 	 * the unlock flow of the other running context.
1059 	 */
1060 	spin_lock(&mvdev->reset_lock);
1061 	mvdev->deferred_reset = true;
1062 	if (!mutex_trylock(&mvdev->state_mutex)) {
1063 		spin_unlock(&mvdev->reset_lock);
1064 		return;
1065 	}
1066 	spin_unlock(&mvdev->reset_lock);
1067 	mlx5vf_state_mutex_unlock(mvdev);
1068 }
1069 
1070 static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
1071 {
1072 	struct mlx5vf_pci_core_device *mvdev = container_of(
1073 		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1074 	struct vfio_pci_core_device *vdev = &mvdev->core_device;
1075 	int ret;
1076 
1077 	ret = vfio_pci_core_enable(vdev);
1078 	if (ret)
1079 		return ret;
1080 
1081 	if (mvdev->migrate_cap)
1082 		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
1083 	vfio_pci_core_finish_enable(vdev);
1084 	return 0;
1085 }
1086 
1087 static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
1088 {
1089 	struct mlx5vf_pci_core_device *mvdev = container_of(
1090 		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1091 
1092 	mlx5vf_cmd_close_migratable(mvdev);
1093 	vfio_pci_core_close_device(core_vdev);
1094 }
1095 
1096 static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
1097 	.migration_set_state = mlx5vf_pci_set_device_state,
1098 	.migration_get_state = mlx5vf_pci_get_device_state,
1099 	.migration_get_data_size = mlx5vf_pci_get_data_size,
1100 };
1101 
1102 static const struct vfio_log_ops mlx5vf_pci_log_ops = {
1103 	.log_start = mlx5vf_start_page_tracker,
1104 	.log_stop = mlx5vf_stop_page_tracker,
1105 	.log_read_and_clear = mlx5vf_tracker_read_and_clear,
1106 };
1107 
1108 static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
1109 {
1110 	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
1111 			struct mlx5vf_pci_core_device, core_device.vdev);
1112 	int ret;
1113 
1114 	ret = vfio_pci_core_init_dev(core_vdev);
1115 	if (ret)
1116 		return ret;
1117 
1118 	mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
1119 				  &mlx5vf_pci_log_ops);
1120 
1121 	return 0;
1122 }
1123 
1124 static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
1125 {
1126 	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
1127 			struct mlx5vf_pci_core_device, core_device.vdev);
1128 
1129 	mlx5vf_cmd_remove_migratable(mvdev);
1130 	vfio_pci_core_release_dev(core_vdev);
1131 }
1132 
1133 static const struct vfio_device_ops mlx5vf_pci_ops = {
1134 	.name = "mlx5-vfio-pci",
1135 	.init = mlx5vf_pci_init_dev,
1136 	.release = mlx5vf_pci_release_dev,
1137 	.open_device = mlx5vf_pci_open_device,
1138 	.close_device = mlx5vf_pci_close_device,
1139 	.ioctl = vfio_pci_core_ioctl,
1140 	.device_feature = vfio_pci_core_ioctl_feature,
1141 	.read = vfio_pci_core_read,
1142 	.write = vfio_pci_core_write,
1143 	.mmap = vfio_pci_core_mmap,
1144 	.request = vfio_pci_core_request,
1145 	.match = vfio_pci_core_match,
1146 };
1147 
1148 static int mlx5vf_pci_probe(struct pci_dev *pdev,
1149 			    const struct pci_device_id *id)
1150 {
1151 	struct mlx5vf_pci_core_device *mvdev;
1152 	int ret;
1153 
1154 	mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
1155 				  &pdev->dev, &mlx5vf_pci_ops);
1156 	if (IS_ERR(mvdev))
1157 		return PTR_ERR(mvdev);
1158 
1159 	dev_set_drvdata(&pdev->dev, &mvdev->core_device);
1160 	ret = vfio_pci_core_register_device(&mvdev->core_device);
1161 	if (ret)
1162 		goto out_put_vdev;
1163 	return 0;
1164 
1165 out_put_vdev:
1166 	vfio_put_device(&mvdev->core_device.vdev);
1167 	return ret;
1168 }
1169 
1170 static void mlx5vf_pci_remove(struct pci_dev *pdev)
1171 {
1172 	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
1173 
1174 	vfio_pci_core_unregister_device(&mvdev->core_device);
1175 	vfio_put_device(&mvdev->core_device.vdev);
1176 }
1177 
1178 static const struct pci_device_id mlx5vf_pci_table[] = {
1179 	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
1180 	{}
1181 };
1182 
1183 MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
1184 
1185 static const struct pci_error_handlers mlx5vf_err_handlers = {
1186 	.reset_done = mlx5vf_pci_aer_reset_done,
1187 	.error_detected = vfio_pci_core_aer_err_detected,
1188 };
1189 
1190 static struct pci_driver mlx5vf_pci_driver = {
1191 	.name = KBUILD_MODNAME,
1192 	.id_table = mlx5vf_pci_table,
1193 	.probe = mlx5vf_pci_probe,
1194 	.remove = mlx5vf_pci_remove,
1195 	.err_handler = &mlx5vf_err_handlers,
1196 	.driver_managed_dma = true,
1197 };
1198 
1199 module_pci_driver(mlx5vf_pci_driver);
1200 
1201 MODULE_LICENSE("GPL");
1202 MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
1203 MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
1204 MODULE_DESCRIPTION(
1205 	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");
1206