xref: /openbmc/linux/drivers/vfio/pci/mlx5/main.c (revision c699ce1a)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4  */
5 
6 #include <linux/device.h>
7 #include <linux/eventfd.h>
8 #include <linux/file.h>
9 #include <linux/interrupt.h>
10 #include <linux/iommu.h>
11 #include <linux/module.h>
12 #include <linux/mutex.h>
13 #include <linux/notifier.h>
14 #include <linux/pci.h>
15 #include <linux/pm_runtime.h>
16 #include <linux/types.h>
17 #include <linux/uaccess.h>
18 #include <linux/vfio.h>
19 #include <linux/sched/mm.h>
20 #include <linux/anon_inodes.h>
21 
22 #include "cmd.h"
23 
24 /* Arbitrary limit to prevent userspace from consuming endless memory */
25 #define MAX_MIGRATION_SIZE (512*1024*1024)
26 
27 static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
28 {
29 	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);
30 
31 	return container_of(core_device, struct mlx5vf_pci_core_device,
32 			    core_device);
33 }
34 
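/*
 * Translate a byte offset within the migration data buffer into its backing
 * page. Lookups are expected to be sequential, so the last scatterlist entry
 * and offset are cached to avoid rescanning the table on every call; an
 * offset that moves backwards resets the cached position to the start.
 */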
35 struct page *
36 mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
37 			  unsigned long offset)
38 {
39 	unsigned long cur_offset = 0;
40 	struct scatterlist *sg;
41 	unsigned int i;
42 
43 	/* All accesses are sequential */
44 	if (offset < buf->last_offset || !buf->last_offset_sg) {
45 		buf->last_offset = 0;
46 		buf->last_offset_sg = buf->table.sgt.sgl;
47 		buf->sg_last_entry = 0;
48 	}
49 
50 	cur_offset = buf->last_offset;
51 
52 	for_each_sg(buf->last_offset_sg, sg,
53 			buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
54 		if (offset < sg->length + cur_offset) {
55 			buf->last_offset_sg = sg;
56 			buf->sg_last_entry += i;
57 			buf->last_offset = cur_offset;
58 			return nth_page(sg_page(sg),
59 					(offset - cur_offset) / PAGE_SIZE);
60 		}
61 		cur_offset += sg->length;
62 	}
63 	return NULL;
64 }
65 
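/*
 * Grow the migration data buffer by @npages pages. Pages are allocated in
 * bulk, one page-sized array of page pointers at a time, and appended to the
 * buffer's scatter-gather table; allocated_length tracks the resulting size.
 */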
66 int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
67 			       unsigned int npages)
68 {
69 	unsigned int to_alloc = npages;
70 	struct page **page_list;
71 	unsigned long filled;
72 	unsigned int to_fill;
73 	int ret;
74 
75 	to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
76 	page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL);
77 	if (!page_list)
78 		return -ENOMEM;
79 
80 	do {
81 		filled = alloc_pages_bulk_array(GFP_KERNEL, to_fill, page_list);
82 		if (!filled) {
83 			ret = -ENOMEM;
84 			goto err;
85 		}
86 		to_alloc -= filled;
87 		ret = sg_alloc_append_table_from_pages(
88 			&buf->table, page_list, filled, 0,
89 			filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
90 			GFP_KERNEL);
91 
92 		if (ret)
93 			goto err;
94 		buf->allocated_length += filled * PAGE_SIZE;
95 		/* clean input for another bulk allocation */
96 		memset(page_list, 0, filled * sizeof(*page_list));
97 		to_fill = min_t(unsigned int, to_alloc,
98 				PAGE_SIZE / sizeof(*page_list));
99 	} while (to_alloc > 0);
100 
101 	kvfree(page_list);
102 	return 0;
103 
104 err:
105 	kvfree(page_list);
106 	return ret;
107 }
108 
109 static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
110 {
111 	mutex_lock(&migf->lock);
112 	migf->state = MLX5_MIGF_STATE_ERROR;
113 	migf->filp->f_pos = 0;
114 	mutex_unlock(&migf->lock);
115 }
116 
117 static int mlx5vf_release_file(struct inode *inode, struct file *filp)
118 {
119 	struct mlx5_vf_migration_file *migf = filp->private_data;
120 
121 	mlx5vf_disable_fd(migf);
122 	mutex_destroy(&migf->lock);
123 	kfree(migf);
124 	return 0;
125 }
126 
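/*
 * Return the queued data buffer that covers @pos, or NULL. As the migration
 * FD is a stream, @pos is only expected to fall inside the first buffer on
 * the list; an empty list reports end-of-data, anything else is treated as
 * an error.
 */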
127 static struct mlx5_vhca_data_buffer *
128 mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
129 			      bool *end_of_data)
130 {
131 	struct mlx5_vhca_data_buffer *buf;
132 	bool found = false;
133 
134 	*end_of_data = false;
135 	spin_lock_irq(&migf->list_lock);
136 	if (list_empty(&migf->buf_list)) {
137 		*end_of_data = true;
138 		goto end;
139 	}
140 
141 	buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
142 			       buf_elm);
143 	if (pos >= buf->start_pos &&
144 	    pos < buf->start_pos + buf->length) {
145 		found = true;
146 		goto end;
147 	}
148 
149 	/*
150 	 * Since this is a stream-based FD, the data is expected to always
151 	 * be in the first chunk.
152 	 */
153 	migf->state = MLX5_MIGF_STATE_ERROR;
154 
155 end:
156 	spin_unlock_irq(&migf->list_lock);
157 	return found ? buf : NULL;
158 }
159 
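/*
 * Copy data from a single migration buffer to userspace, one page at a time.
 * A buffer that has been fully consumed is moved to the avail_list so it can
 * be reused.
 */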
160 static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
161 			       char __user **buf, size_t *len, loff_t *pos)
162 {
163 	unsigned long offset;
164 	ssize_t done = 0;
165 	size_t copy_len;
166 
167 	copy_len = min_t(size_t,
168 			 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
169 	while (copy_len) {
170 		size_t page_offset;
171 		struct page *page;
172 		size_t page_len;
173 		u8 *from_buff;
174 		int ret;
175 
176 		offset = *pos - vhca_buf->start_pos;
177 		page_offset = offset % PAGE_SIZE;
178 		offset -= page_offset;
179 		page = mlx5vf_get_migration_page(vhca_buf, offset);
180 		if (!page)
181 			return -EINVAL;
182 		page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
183 		from_buff = kmap_local_page(page);
184 		ret = copy_to_user(*buf, from_buff + page_offset, page_len);
185 		kunmap_local(from_buff);
186 		if (ret)
187 			return -EFAULT;
188 		*pos += page_len;
189 		*len -= page_len;
190 		*buf += page_len;
191 		done += page_len;
192 		copy_len -= page_len;
193 	}
194 
195 	if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
196 		spin_lock_irq(&vhca_buf->migf->list_lock);
197 		list_del_init(&vhca_buf->buf_elm);
198 		list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
199 		spin_unlock_irq(&vhca_buf->migf->list_lock);
200 	}
201 
202 	return done;
203 }
204 
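/*
 * read() handler for the save FD. Unless O_NONBLOCK is set, wait until data
 * is queued or the migration reaches a terminal state, then copy as much
 * queued data as fits into the user buffer. Running out of data while still
 * in PRE_COPY returns -ENOMSG to mark a temporary end of stream.
 */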
205 static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
206 			       loff_t *pos)
207 {
208 	struct mlx5_vf_migration_file *migf = filp->private_data;
209 	struct mlx5_vhca_data_buffer *vhca_buf;
210 	bool first_loop_call = true;
211 	bool end_of_data;
212 	ssize_t done = 0;
213 
214 	if (pos)
215 		return -ESPIPE;
216 	pos = &filp->f_pos;
217 
218 	if (!(filp->f_flags & O_NONBLOCK)) {
219 		if (wait_event_interruptible(migf->poll_wait,
220 				!list_empty(&migf->buf_list) ||
221 				migf->state == MLX5_MIGF_STATE_ERROR ||
222 				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
223 				migf->state == MLX5_MIGF_STATE_PRE_COPY ||
224 				migf->state == MLX5_MIGF_STATE_COMPLETE))
225 			return -ERESTARTSYS;
226 	}
227 
228 	mutex_lock(&migf->lock);
229 	if (migf->state == MLX5_MIGF_STATE_ERROR) {
230 		done = -ENODEV;
231 		goto out_unlock;
232 	}
233 
234 	while (len) {
235 		ssize_t count;
236 
237 		vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
238 							 &end_of_data);
239 		if (first_loop_call) {
240 			first_loop_call = false;
241 			/* Temporary end of file as part of PRE_COPY */
242 			if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
243 				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
244 				done = -ENOMSG;
245 				goto out_unlock;
246 			}
247 
248 			if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
249 				if (filp->f_flags & O_NONBLOCK) {
250 					done = -EAGAIN;
251 					goto out_unlock;
252 				}
253 			}
254 		}
255 
256 		if (end_of_data)
257 			goto out_unlock;
258 
259 		if (!vhca_buf) {
260 			done = -EINVAL;
261 			goto out_unlock;
262 		}
263 
264 		count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
265 		if (count < 0) {
266 			done = count;
267 			goto out_unlock;
268 		}
269 		done += count;
270 	}
271 
272 out_unlock:
273 	mutex_unlock(&migf->lock);
274 	return done;
275 }
276 
277 static __poll_t mlx5vf_save_poll(struct file *filp,
278 				 struct poll_table_struct *wait)
279 {
280 	struct mlx5_vf_migration_file *migf = filp->private_data;
281 	__poll_t pollflags = 0;
282 
283 	poll_wait(filp, &migf->poll_wait, wait);
284 
285 	mutex_lock(&migf->lock);
286 	if (migf->state == MLX5_MIGF_STATE_ERROR)
287 		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
288 	else if (!list_empty(&migf->buf_list) ||
289 		 migf->state == MLX5_MIGF_STATE_COMPLETE)
290 		pollflags = EPOLLIN | EPOLLRDNORM;
291 	mutex_unlock(&migf->lock);
292 
293 	return pollflags;
294 }
295 
296 /*
297  * The FD is exposed to userspace and may still be used after an error.
298  * Mark migf as being in error and wake up any waiting reader.
299  */
300 static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
301 {
302 	migf->state = MLX5_MIGF_STATE_ERROR;
303 	wake_up_interruptible(&migf->poll_wait);
304 }
305 
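/*
 * VFIO_MIG_GET_PRECOPY_INFO: report how many initial/dirty bytes are
 * currently available for reading. If the queued data was fully consumed and
 * the device still reports dirty state, kick off another incremental SAVE so
 * that the next chunk becomes readable.
 */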
306 static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
307 				 unsigned long arg)
308 {
309 	struct mlx5_vf_migration_file *migf = filp->private_data;
310 	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
311 	struct mlx5_vhca_data_buffer *buf;
312 	struct vfio_precopy_info info = {};
313 	loff_t *pos = &filp->f_pos;
314 	unsigned long minsz;
315 	size_t inc_length = 0;
316 	bool end_of_data;
317 	int ret;
318 
319 	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
320 		return -ENOTTY;
321 
322 	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
323 
324 	if (copy_from_user(&info, (void __user *)arg, minsz))
325 		return -EFAULT;
326 
327 	if (info.argsz < minsz)
328 		return -EINVAL;
329 
330 	mutex_lock(&mvdev->state_mutex);
331 	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
332 	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
333 		ret = -EINVAL;
334 		goto err_state_unlock;
335 	}
336 
337 	/*
338 	 * We can't issue a SAVE command while the device is suspended, so in
339 	 * VFIO_DEVICE_STATE_PRE_COPY_P2P there is no reason to query for
340 	 * extra bytes that can't be read.
341 	 */
342 	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
343 		/*
344 		 * Once the query returns it's guaranteed that there is no
345 		 * active SAVE command.
346 		 * As such, the code below is safe with the proper locks.
347 		 */
348 		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
349 							    MLX5VF_QUERY_INC);
350 		if (ret)
351 			goto err_state_unlock;
352 	}
353 
354 	mutex_lock(&migf->lock);
355 	if (migf->state == MLX5_MIGF_STATE_ERROR) {
356 		ret = -ENODEV;
357 		goto err_migf_unlock;
358 	}
359 
360 	buf = mlx5vf_get_data_buff_from_pos(migf, *pos, &end_of_data);
361 	if (buf) {
362 		if (buf->start_pos == 0) {
363 			info.initial_bytes = buf->header_image_size - *pos;
364 		} else if (buf->start_pos ==
365 				sizeof(struct mlx5_vf_migration_header)) {
366 			/* First data buffer following the header */
367 			info.initial_bytes = buf->start_pos +
368 						buf->length - *pos;
369 		} else {
370 			info.dirty_bytes = buf->start_pos + buf->length - *pos;
371 		}
372 	} else {
373 		if (!end_of_data) {
374 			ret = -EINVAL;
375 			goto err_migf_unlock;
376 		}
377 
378 		info.dirty_bytes = inc_length;
379 	}
380 
381 	if (!end_of_data || !inc_length) {
382 		mutex_unlock(&migf->lock);
383 		goto done;
384 	}
385 
386 	mutex_unlock(&migf->lock);
387 	/*
388 	 * We finished transferring the current state, and the device has new
389 	 * dirty state; start saving it so it is ready to be read next.
390 	 */
391 	buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
392 	if (IS_ERR(buf)) {
393 		ret = PTR_ERR(buf);
394 		mlx5vf_mark_err(migf);
395 		goto err_state_unlock;
396 	}
397 
398 	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
399 	if (ret) {
400 		mlx5vf_mark_err(migf);
401 		mlx5vf_put_data_buffer(buf);
402 		goto err_state_unlock;
403 	}
404 
405 done:
406 	mlx5vf_state_mutex_unlock(mvdev);
407 	if (copy_to_user((void __user *)arg, &info, minsz))
408 		return -EFAULT;
409 	return 0;
410 
411 err_migf_unlock:
412 	mutex_unlock(&migf->lock);
413 err_state_unlock:
414 	mlx5vf_state_mutex_unlock(mvdev);
415 	return ret;
416 }
417 
418 static const struct file_operations mlx5vf_save_fops = {
419 	.owner = THIS_MODULE,
420 	.read = mlx5vf_save_read,
421 	.poll = mlx5vf_save_poll,
422 	.unlocked_ioctl = mlx5vf_precopy_ioctl,
423 	.compat_ioctl = compat_ptr_ioctl,
424 	.release = mlx5vf_release_file,
425 	.llseek = no_llseek,
426 };
427 
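/*
 * Query the size of the final incremental device state and start saving it
 * into a freshly sized data buffer. Called on the PRE_COPY_P2P -> STOP_COPY
 * transition.
 */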
428 static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
429 {
430 	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
431 	struct mlx5_vhca_data_buffer *buf;
432 	size_t length;
433 	int ret;
434 
435 	if (migf->state == MLX5_MIGF_STATE_ERROR)
436 		return -ENODEV;
437 
438 	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length,
439 				MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
440 	if (ret)
441 		goto err;
442 
443 	buf = mlx5vf_get_data_buffer(migf, length, DMA_FROM_DEVICE);
444 	if (IS_ERR(buf)) {
445 		ret = PTR_ERR(buf);
446 		goto err;
447 	}
448 
449 	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
450 	if (ret)
451 		goto err_save;
452 
453 	return 0;
454 
455 err_save:
456 	mlx5vf_put_data_buffer(buf);
457 err:
458 	mlx5vf_mark_err(migf);
459 	return ret;
460 }
461 
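/*
 * Create the migration file used for saving device state and kick off the
 * initial SAVE. @track is set when entering the PRE_COPY states, where
 * further incremental chunks will be saved later on.
 */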
462 static struct mlx5_vf_migration_file *
463 mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
464 {
465 	struct mlx5_vf_migration_file *migf;
466 	struct mlx5_vhca_data_buffer *buf;
467 	size_t length;
468 	int ret;
469 
470 	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
471 	if (!migf)
472 		return ERR_PTR(-ENOMEM);
473 
474 	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
475 					O_RDONLY);
476 	if (IS_ERR(migf->filp)) {
477 		ret = PTR_ERR(migf->filp);
478 		goto end;
479 	}
480 
481 	migf->mvdev = mvdev;
482 	ret = mlx5vf_cmd_alloc_pd(migf);
483 	if (ret)
484 		goto out_free;
485 
486 	stream_open(migf->filp->f_inode, migf->filp);
487 	mutex_init(&migf->lock);
488 	init_waitqueue_head(&migf->poll_wait);
489 	init_completion(&migf->save_comp);
490 	/*
491 	 * save_comp is being used as a binary semaphore built from
492 	 * a completion. A normal mutex cannot be used because the lock is
493 	 * passed between kernel threads and lockdep can't model this.
494 	 */
495 	complete(&migf->save_comp);
496 	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
497 	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
498 	INIT_LIST_HEAD(&migf->buf_list);
499 	INIT_LIST_HEAD(&migf->avail_list);
500 	spin_lock_init(&migf->list_lock);
501 	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, 0);
502 	if (ret)
503 		goto out_pd;
504 
505 	buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
506 	if (IS_ERR(buf)) {
507 		ret = PTR_ERR(buf);
508 		goto out_pd;
509 	}
510 
511 	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
512 	if (ret)
513 		goto out_save;
514 	return migf;
515 out_save:
516 	mlx5vf_free_data_buffer(buf);
517 out_pd:
518 	mlx5vf_cmd_dealloc_pd(migf);
519 out_free:
520 	fput(migf->filp);
521 end:
522 	kfree(migf);
523 	return ERR_PTR(ret);
524 }
525 
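/*
 * Copy up to one page worth of user data into the migration buffer at *pos,
 * advancing the user pointer, position, length and done counters as it goes.
 */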
526 static int
527 mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
528 			      const char __user **buf, size_t *len,
529 			      loff_t *pos, ssize_t *done)
530 {
531 	unsigned long offset;
532 	size_t page_offset;
533 	struct page *page;
534 	size_t page_len;
535 	u8 *to_buff;
536 	int ret;
537 
538 	offset = *pos - vhca_buf->start_pos;
539 	page_offset = offset % PAGE_SIZE;
540 
541 	page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
542 	if (!page)
543 		return -EINVAL;
544 	page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
545 	to_buff = kmap_local_page(page);
546 	ret = copy_from_user(to_buff + page_offset, *buf, page_len);
547 	kunmap_local(to_buff);
548 	if (ret)
549 		return -EFAULT;
550 
551 	*pos += page_len;
552 	*done += page_len;
553 	*buf += page_len;
554 	*len -= page_len;
555 	vhca_buf->length += page_len;
556 	return 0;
557 }
558 
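/*
 * Resume path used when the device lacks PRE_COPY support: the stream has no
 * header, so the buffer is grown to the full requested length and the user
 * data is copied straight into it.
 */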
559 static int
560 mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf,
561 				   loff_t requested_length,
562 				   const char __user **buf, size_t *len,
563 				   loff_t *pos, ssize_t *done)
564 {
565 	int ret;
566 
567 	if (requested_length > MAX_MIGRATION_SIZE)
568 		return -ENOMEM;
569 
570 	if (vhca_buf->allocated_length < requested_length) {
571 		ret = mlx5vf_add_migration_pages(
572 			vhca_buf,
573 			DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
574 				     PAGE_SIZE));
575 		if (ret)
576 			return ret;
577 	}
578 
579 	while (*len) {
580 		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len, pos,
581 						    done);
582 		if (ret)
583 			return ret;
584 	}
585 
586 	return 0;
587 }
588 
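/*
 * Copy user data into the current image buffer. Once all @image_size bytes
 * have been received, advance the load state machine to LOAD_IMAGE so the
 * image is handed to the device.
 */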
589 static ssize_t
590 mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
591 			 struct mlx5_vhca_data_buffer *vhca_buf,
592 			 size_t image_size, const char __user **buf,
593 			 size_t *len, loff_t *pos, ssize_t *done,
594 			 bool *has_work)
595 {
596 	size_t copy_len, to_copy;
597 	int ret;
598 
599 	to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
600 	copy_len = to_copy;
601 	while (to_copy) {
602 		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
603 						    done);
604 		if (ret)
605 			return ret;
606 	}
607 
608 	*len -= copy_len;
609 	if (vhca_buf->length == image_size) {
610 		migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
611 		migf->max_pos += image_size;
612 		*has_work = true;
613 	}
614 
615 	return 0;
616 }
617 
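/*
 * Accumulate the fixed-size migration header that precedes each image.
 * Once complete, validate the advertised image size and flags and advance
 * the load state machine to PREP_IMAGE.
 */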
618 static int
619 mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
620 			  struct mlx5_vhca_data_buffer *vhca_buf,
621 			  const char __user **buf,
622 			  size_t *len, loff_t *pos,
623 			  ssize_t *done, bool *has_work)
624 {
625 	struct page *page;
626 	size_t copy_len;
627 	u8 *to_buff;
628 	int ret;
629 
630 	copy_len = min_t(size_t, *len,
631 		sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
632 	page = mlx5vf_get_migration_page(vhca_buf, 0);
633 	if (!page)
634 		return -EINVAL;
635 	to_buff = kmap_local_page(page);
636 	ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
637 	if (ret) {
638 		ret = -EFAULT;
639 		goto end;
640 	}
641 
642 	*buf += copy_len;
643 	*pos += copy_len;
644 	*done += copy_len;
645 	*len -= copy_len;
646 	vhca_buf->length += copy_len;
647 	if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
648 		u64 flags;
649 
650 		vhca_buf->header_image_size = le64_to_cpup((__le64 *)to_buff);
651 		if (vhca_buf->header_image_size > MAX_MIGRATION_SIZE) {
652 			ret = -ENOMEM;
653 			goto end;
654 		}
655 
656 		flags = le64_to_cpup((__le64 *)(to_buff +
657 			    offsetof(struct mlx5_vf_migration_header, flags)));
658 		if (flags) {
659 			ret = -EOPNOTSUPP;
660 			goto end;
661 		}
662 
663 		migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
664 		migf->max_pos += vhca_buf->length;
665 		*has_work = true;
666 	}
667 end:
668 	kunmap_local(to_buff);
669 	return ret;
670 }
671 
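/*
 * write() handler for the resume FD. Incoming data drives a small state
 * machine: read the header (when PRE_COPY is supported), size the data
 * buffer, read the image and finally load it into the device. A stream may
 * carry multiple header+image sequences.
 */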
672 static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
673 				   size_t len, loff_t *pos)
674 {
675 	struct mlx5_vf_migration_file *migf = filp->private_data;
676 	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
677 	struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header;
678 	loff_t requested_length;
679 	bool has_work = false;
680 	ssize_t done = 0;
681 	int ret = 0;
682 
683 	if (pos)
684 		return -ESPIPE;
685 	pos = &filp->f_pos;
686 
687 	if (*pos < 0 ||
688 	    check_add_overflow((loff_t)len, *pos, &requested_length))
689 		return -EINVAL;
690 
691 	mutex_lock(&migf->mvdev->state_mutex);
692 	mutex_lock(&migf->lock);
693 	if (migf->state == MLX5_MIGF_STATE_ERROR) {
694 		ret = -ENODEV;
695 		goto out_unlock;
696 	}
697 
698 	while (len || has_work) {
699 		has_work = false;
700 		switch (migf->load_state) {
701 		case MLX5_VF_LOAD_STATE_READ_HEADER:
702 			ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
703 							&buf, &len, pos,
704 							&done, &has_work);
705 			if (ret)
706 				goto out_unlock;
707 			break;
708 		case MLX5_VF_LOAD_STATE_PREP_IMAGE:
709 		{
710 			u64 size = vhca_buf_header->header_image_size;
711 
712 			if (vhca_buf->allocated_length < size) {
713 				mlx5vf_free_data_buffer(vhca_buf);
714 
715 				migf->buf = mlx5vf_alloc_data_buffer(migf,
716 							size, DMA_TO_DEVICE);
717 				if (IS_ERR(migf->buf)) {
718 					ret = PTR_ERR(migf->buf);
719 					migf->buf = NULL;
720 					goto out_unlock;
721 				}
722 
723 				vhca_buf = migf->buf;
724 			}
725 
726 			vhca_buf->start_pos = migf->max_pos;
727 			migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
728 			break;
729 		}
730 		case MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER:
731 			ret = mlx5vf_resume_read_image_no_header(vhca_buf,
732 						requested_length,
733 						&buf, &len, pos, &done);
734 			if (ret)
735 				goto out_unlock;
736 			break;
737 		case MLX5_VF_LOAD_STATE_READ_IMAGE:
738 			ret = mlx5vf_resume_read_image(migf, vhca_buf,
739 						vhca_buf_header->header_image_size,
740 						&buf, &len, pos, &done, &has_work);
741 			if (ret)
742 				goto out_unlock;
743 			break;
744 		case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
745 			ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
746 			if (ret)
747 				goto out_unlock;
748 			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
749 
750 			/* prep header buf for next image */
751 			vhca_buf_header->length = 0;
752 			vhca_buf_header->header_image_size = 0;
753 			/* prep data buf for next image */
754 			vhca_buf->length = 0;
755 
756 			break;
757 		default:
758 			break;
759 		}
760 	}
761 
762 out_unlock:
763 	if (ret)
764 		migf->state = MLX5_MIGF_STATE_ERROR;
765 	mutex_unlock(&migf->lock);
766 	mlx5vf_state_mutex_unlock(migf->mvdev);
767 	return ret ? ret : done;
768 }
769 
770 static const struct file_operations mlx5vf_resume_fops = {
771 	.owner = THIS_MODULE,
772 	.write = mlx5vf_resume_write,
773 	.release = mlx5vf_release_file,
774 	.llseek = no_llseek,
775 };
776 
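/*
 * Create the migration file used for resuming. With PRE_COPY support a
 * separate header buffer is allocated and the load state machine starts at
 * READ_HEADER; otherwise the raw image is read directly.
 */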
777 static struct mlx5_vf_migration_file *
778 mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
779 {
780 	struct mlx5_vf_migration_file *migf;
781 	struct mlx5_vhca_data_buffer *buf;
782 	int ret;
783 
784 	migf = kzalloc(sizeof(*migf), GFP_KERNEL);
785 	if (!migf)
786 		return ERR_PTR(-ENOMEM);
787 
788 	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
789 					O_WRONLY);
790 	if (IS_ERR(migf->filp)) {
791 		ret = PTR_ERR(migf->filp);
792 		goto end;
793 	}
794 
795 	migf->mvdev = mvdev;
796 	ret = mlx5vf_cmd_alloc_pd(migf);
797 	if (ret)
798 		goto out_free;
799 
800 	buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
801 	if (IS_ERR(buf)) {
802 		ret = PTR_ERR(buf);
803 		goto out_pd;
804 	}
805 
806 	migf->buf = buf;
807 	if (MLX5VF_PRE_COPY_SUPP(mvdev)) {
808 		buf = mlx5vf_alloc_data_buffer(migf,
809 			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
810 		if (IS_ERR(buf)) {
811 			ret = PTR_ERR(buf);
812 			goto out_buf;
813 		}
814 
815 		migf->buf_header = buf;
816 		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
817 	} else {
818 		/* Initial state will be to read the image */
819 		migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE_NO_HEADER;
820 	}
821 
822 	stream_open(migf->filp->f_inode, migf->filp);
823 	mutex_init(&migf->lock);
824 	INIT_LIST_HEAD(&migf->buf_list);
825 	INIT_LIST_HEAD(&migf->avail_list);
826 	spin_lock_init(&migf->list_lock);
827 	return migf;
828 out_buf:
829 	mlx5vf_free_data_buffer(migf->buf);
830 out_pd:
831 	mlx5vf_cmd_dealloc_pd(migf);
832 out_free:
833 	fput(migf->filp);
834 end:
835 	kfree(migf);
836 	return ERR_PTR(ret);
837 }
838 
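/* Tear down the save/resume migration files, if open, and their resources. */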
839 void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
840 {
841 	if (mvdev->resuming_migf) {
842 		mlx5vf_disable_fd(mvdev->resuming_migf);
843 		mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
844 		fput(mvdev->resuming_migf->filp);
845 		mvdev->resuming_migf = NULL;
846 	}
847 	if (mvdev->saving_migf) {
848 		mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
849 		cancel_work_sync(&mvdev->saving_migf->async_data.work);
850 		mlx5vf_disable_fd(mvdev->saving_migf);
851 		mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
852 		fput(mvdev->saving_migf->filp);
853 		mvdev->saving_migf = NULL;
854 	}
855 }
856 
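/*
 * Execute a single arc of the VFIO migration state machine. Each recognized
 * transition maps to suspend/resume VHCA commands and/or to creating or
 * tearing down the save/resume files; a returned file is handed back to
 * userspace as the migration data FD.
 */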
857 static struct file *
858 mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
859 				    u32 new)
860 {
861 	u32 cur = mvdev->mig_state;
862 	int ret;
863 
864 	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
865 		ret = mlx5vf_cmd_suspend_vhca(mvdev,
866 			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
867 		if (ret)
868 			return ERR_PTR(ret);
869 		return NULL;
870 	}
871 
872 	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
873 		ret = mlx5vf_cmd_resume_vhca(mvdev,
874 			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
875 		if (ret)
876 			return ERR_PTR(ret);
877 		return NULL;
878 	}
879 
880 	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
881 	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
882 		ret = mlx5vf_cmd_suspend_vhca(mvdev,
883 			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
884 		if (ret)
885 			return ERR_PTR(ret);
886 		return NULL;
887 	}
888 
889 	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
890 	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
891 		ret = mlx5vf_cmd_resume_vhca(mvdev,
892 			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
893 		if (ret)
894 			return ERR_PTR(ret);
895 		return NULL;
896 	}
897 
898 	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
899 		struct mlx5_vf_migration_file *migf;
900 
901 		migf = mlx5vf_pci_save_device_data(mvdev, false);
902 		if (IS_ERR(migf))
903 			return ERR_CAST(migf);
904 		get_file(migf->filp);
905 		mvdev->saving_migf = migf;
906 		return migf->filp;
907 	}
908 
909 	if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
910 	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
911 	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
912 	     new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
913 		mlx5vf_disable_fds(mvdev);
914 		return NULL;
915 	}
916 
917 	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
918 		struct mlx5_vf_migration_file *migf;
919 
920 		migf = mlx5vf_pci_resume_device_data(mvdev);
921 		if (IS_ERR(migf))
922 			return ERR_CAST(migf);
923 		get_file(migf->filp);
924 		mvdev->resuming_migf = migf;
925 		return migf->filp;
926 	}
927 
928 	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
929 		if (!MLX5VF_PRE_COPY_SUPP(mvdev)) {
930 			ret = mlx5vf_cmd_load_vhca_state(mvdev,
931 							 mvdev->resuming_migf,
932 							 mvdev->resuming_migf->buf);
933 			if (ret)
934 				return ERR_PTR(ret);
935 		}
936 		mlx5vf_disable_fds(mvdev);
937 		return NULL;
938 	}
939 
940 	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
941 	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
942 	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
943 		struct mlx5_vf_migration_file *migf;
944 
945 		migf = mlx5vf_pci_save_device_data(mvdev, true);
946 		if (IS_ERR(migf))
947 			return ERR_CAST(migf);
948 		get_file(migf->filp);
949 		mvdev->saving_migf = migf;
950 		return migf->filp;
951 	}
952 
953 	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
954 		ret = mlx5vf_cmd_suspend_vhca(mvdev,
955 			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
956 		if (ret)
957 			return ERR_PTR(ret);
958 		ret = mlx5vf_pci_save_device_inc_data(mvdev);
959 		return ret ? ERR_PTR(ret) : NULL;
960 	}
961 
962 	/*
963 	 * vfio_mig_get_next_state() does not use arcs other than the above
964 	 */
965 	WARN_ON(true);
966 	return ERR_PTR(-EINVAL);
967 }
968 
969 /*
970  * This function is called in all state_mutex unlock cases to
971  * handle a 'deferred_reset', if one exists.
972  */
973 void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
974 {
975 again:
976 	spin_lock(&mvdev->reset_lock);
977 	if (mvdev->deferred_reset) {
978 		mvdev->deferred_reset = false;
979 		spin_unlock(&mvdev->reset_lock);
980 		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
981 		mlx5vf_disable_fds(mvdev);
982 		goto again;
983 	}
984 	mutex_unlock(&mvdev->state_mutex);
985 	spin_unlock(&mvdev->reset_lock);
986 }
987 
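/*
 * Move the device from its current migration state to @new_state one arc at
 * a time, letting the VFIO core pick each intermediate step.
 */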
988 static struct file *
989 mlx5vf_pci_set_device_state(struct vfio_device *vdev,
990 			    enum vfio_device_mig_state new_state)
991 {
992 	struct mlx5vf_pci_core_device *mvdev = container_of(
993 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
994 	enum vfio_device_mig_state next_state;
995 	struct file *res = NULL;
996 	int ret;
997 
998 	mutex_lock(&mvdev->state_mutex);
999 	while (new_state != mvdev->mig_state) {
1000 		ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
1001 					      new_state, &next_state);
1002 		if (ret) {
1003 			res = ERR_PTR(ret);
1004 			break;
1005 		}
1006 		res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
1007 		if (IS_ERR(res))
1008 			break;
1009 		mvdev->mig_state = next_state;
1010 		if (WARN_ON(res && new_state != mvdev->mig_state)) {
1011 			fput(res);
1012 			res = ERR_PTR(-EINVAL);
1013 			break;
1014 		}
1015 	}
1016 	mlx5vf_state_mutex_unlock(mvdev);
1017 	return res;
1018 }
1019 
1020 static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
1021 				    unsigned long *stop_copy_length)
1022 {
1023 	struct mlx5vf_pci_core_device *mvdev = container_of(
1024 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1025 	size_t state_size;
1026 	int ret;
1027 
1028 	mutex_lock(&mvdev->state_mutex);
1029 	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
1030 						    &state_size, 0);
1031 	if (!ret)
1032 		*stop_copy_length = state_size;
1033 	mlx5vf_state_mutex_unlock(mvdev);
1034 	return ret;
1035 }
1036 
1037 static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
1038 				       enum vfio_device_mig_state *curr_state)
1039 {
1040 	struct mlx5vf_pci_core_device *mvdev = container_of(
1041 		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1042 
1043 	mutex_lock(&mvdev->state_mutex);
1044 	*curr_state = mvdev->mig_state;
1045 	mlx5vf_state_mutex_unlock(mvdev);
1046 	return 0;
1047 }
1048 
1049 static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
1050 {
1051 	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
1052 
1053 	if (!mvdev->migrate_cap)
1054 		return;
1055 
1056 	/*
1057 	 * The higher VFIO layers hold locks across reset and use those same
1058 	 * locks together with the mm_lock, so we must prevent an ABBA deadlock
1059 	 * between the state_mutex and the mm_lock.
1060 	 * If the state_mutex was already taken, we defer the cleanup work to
1061 	 * the unlock flow of the other running context.
1062 	 */
1063 	spin_lock(&mvdev->reset_lock);
1064 	mvdev->deferred_reset = true;
1065 	if (!mutex_trylock(&mvdev->state_mutex)) {
1066 		spin_unlock(&mvdev->reset_lock);
1067 		return;
1068 	}
1069 	spin_unlock(&mvdev->reset_lock);
1070 	mlx5vf_state_mutex_unlock(mvdev);
1071 }
1072 
1073 static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
1074 {
1075 	struct mlx5vf_pci_core_device *mvdev = container_of(
1076 		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1077 	struct vfio_pci_core_device *vdev = &mvdev->core_device;
1078 	int ret;
1079 
1080 	ret = vfio_pci_core_enable(vdev);
1081 	if (ret)
1082 		return ret;
1083 
1084 	if (mvdev->migrate_cap)
1085 		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
1086 	vfio_pci_core_finish_enable(vdev);
1087 	return 0;
1088 }
1089 
1090 static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
1091 {
1092 	struct mlx5vf_pci_core_device *mvdev = container_of(
1093 		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1094 
1095 	mlx5vf_cmd_close_migratable(mvdev);
1096 	vfio_pci_core_close_device(core_vdev);
1097 }
1098 
1099 static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
1100 	.migration_set_state = mlx5vf_pci_set_device_state,
1101 	.migration_get_state = mlx5vf_pci_get_device_state,
1102 	.migration_get_data_size = mlx5vf_pci_get_data_size,
1103 };
1104 
1105 static const struct vfio_log_ops mlx5vf_pci_log_ops = {
1106 	.log_start = mlx5vf_start_page_tracker,
1107 	.log_stop = mlx5vf_stop_page_tracker,
1108 	.log_read_and_clear = mlx5vf_tracker_read_and_clear,
1109 };
1110 
1111 static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
1112 {
1113 	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
1114 			struct mlx5vf_pci_core_device, core_device.vdev);
1115 	int ret;
1116 
1117 	ret = vfio_pci_core_init_dev(core_vdev);
1118 	if (ret)
1119 		return ret;
1120 
1121 	mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
1122 				  &mlx5vf_pci_log_ops);
1123 
1124 	return 0;
1125 }
1126 
1127 static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
1128 {
1129 	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
1130 			struct mlx5vf_pci_core_device, core_device.vdev);
1131 
1132 	mlx5vf_cmd_remove_migratable(mvdev);
1133 	vfio_pci_core_release_dev(core_vdev);
1134 }
1135 
1136 static const struct vfio_device_ops mlx5vf_pci_ops = {
1137 	.name = "mlx5-vfio-pci",
1138 	.init = mlx5vf_pci_init_dev,
1139 	.release = mlx5vf_pci_release_dev,
1140 	.open_device = mlx5vf_pci_open_device,
1141 	.close_device = mlx5vf_pci_close_device,
1142 	.ioctl = vfio_pci_core_ioctl,
1143 	.device_feature = vfio_pci_core_ioctl_feature,
1144 	.read = vfio_pci_core_read,
1145 	.write = vfio_pci_core_write,
1146 	.mmap = vfio_pci_core_mmap,
1147 	.request = vfio_pci_core_request,
1148 	.match = vfio_pci_core_match,
1149 	.bind_iommufd = vfio_iommufd_physical_bind,
1150 	.unbind_iommufd = vfio_iommufd_physical_unbind,
1151 	.attach_ioas = vfio_iommufd_physical_attach_ioas,
1152 };
1153 
1154 static int mlx5vf_pci_probe(struct pci_dev *pdev,
1155 			    const struct pci_device_id *id)
1156 {
1157 	struct mlx5vf_pci_core_device *mvdev;
1158 	int ret;
1159 
1160 	mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
1161 				  &pdev->dev, &mlx5vf_pci_ops);
1162 	if (IS_ERR(mvdev))
1163 		return PTR_ERR(mvdev);
1164 
1165 	dev_set_drvdata(&pdev->dev, &mvdev->core_device);
1166 	ret = vfio_pci_core_register_device(&mvdev->core_device);
1167 	if (ret)
1168 		goto out_put_vdev;
1169 	return 0;
1170 
1171 out_put_vdev:
1172 	vfio_put_device(&mvdev->core_device.vdev);
1173 	return ret;
1174 }
1175 
1176 static void mlx5vf_pci_remove(struct pci_dev *pdev)
1177 {
1178 	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
1179 
1180 	vfio_pci_core_unregister_device(&mvdev->core_device);
1181 	vfio_put_device(&mvdev->core_device.vdev);
1182 }
1183 
1184 static const struct pci_device_id mlx5vf_pci_table[] = {
1185 	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
1186 	{}
1187 };
1188 
1189 MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
1190 
1191 static const struct pci_error_handlers mlx5vf_err_handlers = {
1192 	.reset_done = mlx5vf_pci_aer_reset_done,
1193 	.error_detected = vfio_pci_core_aer_err_detected,
1194 };
1195 
1196 static struct pci_driver mlx5vf_pci_driver = {
1197 	.name = KBUILD_MODNAME,
1198 	.id_table = mlx5vf_pci_table,
1199 	.probe = mlx5vf_pci_probe,
1200 	.remove = mlx5vf_pci_remove,
1201 	.err_handler = &mlx5vf_err_handlers,
1202 	.driver_managed_dma = true,
1203 };
1204 
1205 module_pci_driver(mlx5vf_pci_driver);
1206 
1207 MODULE_LICENSE("GPL");
1208 MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
1209 MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
1210 MODULE_DESCRIPTION(
1211 	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");
1212