xref: /openbmc/linux/drivers/vfio/pci/mlx5/main.c (revision 8b599d14)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/anon_inodes.h>

#include "cmd.h"

/* Arbitrary limit to prevent userspace from consuming endless memory */
#define MAX_MIGRATION_SIZE (512*1024*1024)

static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
{
        struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

        return container_of(core_device, struct mlx5vf_pci_core_device,
                            core_device);
}

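/*
 * Translate @offset in the migration data buffer to its backing page.
 * The position of the previous lookup is cached in the buffer
 * (last_offset, last_offset_sg, sg_last_entry) so that the expected
 * sequential access pattern does not rescan the scatterlist from the
 * start on every call.
 */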
static struct page *
mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
                          unsigned long offset)
{
        unsigned long cur_offset = 0;
        struct scatterlist *sg;
        unsigned int i;

        /* All accesses are sequential */
        if (offset < buf->last_offset || !buf->last_offset_sg) {
                buf->last_offset = 0;
                buf->last_offset_sg = buf->table.sgt.sgl;
                buf->sg_last_entry = 0;
        }

        cur_offset = buf->last_offset;

        for_each_sg(buf->last_offset_sg, sg,
                        buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
                if (offset < sg->length + cur_offset) {
                        buf->last_offset_sg = sg;
                        buf->sg_last_entry += i;
                        buf->last_offset = cur_offset;
                        return nth_page(sg_page(sg),
                                        (offset - cur_offset) / PAGE_SIZE);
                }
                cur_offset += sg->length;
        }
        return NULL;
}

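/*
 * Grow the migration data buffer by @npages pages. Pages are obtained from
 * the bulk page allocator in batches of up to one page worth of pointers
 * and appended to the buffer's scatter-gather table, with allocated_length
 * advanced as each batch is added.
 */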
int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
                               unsigned int npages)
{
        unsigned int to_alloc = npages;
        struct page **page_list;
        unsigned long filled;
        unsigned int to_fill;
        int ret;

        to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
        page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL);
        if (!page_list)
                return -ENOMEM;

        do {
                filled = alloc_pages_bulk_array(GFP_KERNEL, to_fill, page_list);
                if (!filled) {
                        ret = -ENOMEM;
                        goto err;
                }
                to_alloc -= filled;
                ret = sg_alloc_append_table_from_pages(
                        &buf->table, page_list, filled, 0,
                        filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
                        GFP_KERNEL);

                if (ret)
                        goto err;
                buf->allocated_length += filled * PAGE_SIZE;
                /* clean input for another bulk allocation */
                memset(page_list, 0, filled * sizeof(*page_list));
                to_fill = min_t(unsigned int, to_alloc,
                                PAGE_SIZE / sizeof(*page_list));
        } while (to_alloc > 0);

        kvfree(page_list);
        return 0;

err:
        kvfree(page_list);
        return ret;
}

static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
        mutex_lock(&migf->lock);
        migf->state = MLX5_MIGF_STATE_ERROR;
        migf->filp->f_pos = 0;
        mutex_unlock(&migf->lock);
}

static int mlx5vf_release_file(struct inode *inode, struct file *filp)
{
        struct mlx5_vf_migration_file *migf = filp->private_data;

        mlx5vf_disable_fd(migf);
        mutex_destroy(&migf->lock);
        kfree(migf);
        return 0;
}

static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
                                loff_t *pos)
{
        struct mlx5_vf_migration_file *migf = filp->private_data;
        struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
        ssize_t done = 0;

        if (pos)
                return -ESPIPE;
        pos = &filp->f_pos;

        if (!(filp->f_flags & O_NONBLOCK)) {
                if (wait_event_interruptible(migf->poll_wait,
                             READ_ONCE(vhca_buf->length) ||
                             migf->state == MLX5_MIGF_STATE_ERROR))
                        return -ERESTARTSYS;
        }

        mutex_lock(&migf->lock);
        if ((filp->f_flags & O_NONBLOCK) && !READ_ONCE(vhca_buf->length)) {
                done = -EAGAIN;
                goto out_unlock;
        }
        if (*pos > vhca_buf->length) {
                done = -EINVAL;
                goto out_unlock;
        }
        if (migf->state == MLX5_MIGF_STATE_ERROR) {
                done = -ENODEV;
                goto out_unlock;
        }

        len = min_t(size_t, vhca_buf->length - *pos, len);
        while (len) {
                size_t page_offset;
                struct page *page;
                size_t page_len;
                u8 *from_buff;
                int ret;

                page_offset = (*pos) % PAGE_SIZE;
                page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset);
                if (!page) {
                        if (done == 0)
                                done = -EINVAL;
                        goto out_unlock;
                }

                page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
                from_buff = kmap_local_page(page);
                ret = copy_to_user(buf, from_buff + page_offset, page_len);
                kunmap_local(from_buff);
                if (ret) {
                        done = -EFAULT;
                        goto out_unlock;
                }
                *pos += page_len;
                len -= page_len;
                done += page_len;
                buf += page_len;
        }

out_unlock:
        mutex_unlock(&migf->lock);
        return done;
}

static __poll_t mlx5vf_save_poll(struct file *filp,
                                 struct poll_table_struct *wait)
{
        struct mlx5_vf_migration_file *migf = filp->private_data;
        __poll_t pollflags = 0;

        poll_wait(filp, &migf->poll_wait, wait);

        mutex_lock(&migf->lock);
        if (migf->state == MLX5_MIGF_STATE_ERROR)
                pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
        else if (READ_ONCE(migf->buf->length))
                pollflags = EPOLLIN | EPOLLRDNORM;
        mutex_unlock(&migf->lock);

        return pollflags;
}

static const struct file_operations mlx5vf_save_fops = {
        .owner = THIS_MODULE,
        .read = mlx5vf_save_read,
        .poll = mlx5vf_save_poll,
        .release = mlx5vf_release_file,
        .llseek = no_llseek,
};

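/*
 * Build the saving side of a migration: create an anonymous-inode backed
 * migration file, allocate a PD, query the required VHCA state size,
 * allocate a data buffer of that size and issue the save command.
 * Userspace then reads the captured device state through the returned file.
 */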
static struct mlx5_vf_migration_file *
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev)
{
        struct mlx5_vf_migration_file *migf;
        struct mlx5_vhca_data_buffer *buf;
        size_t length;
        int ret;

        migf = kzalloc(sizeof(*migf), GFP_KERNEL);
        if (!migf)
                return ERR_PTR(-ENOMEM);

        migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
                                        O_RDONLY);
        if (IS_ERR(migf->filp)) {
                ret = PTR_ERR(migf->filp);
                goto end;
        }

        migf->mvdev = mvdev;
        ret = mlx5vf_cmd_alloc_pd(migf);
        if (ret)
                goto out_free;

        stream_open(migf->filp->f_inode, migf->filp);
        mutex_init(&migf->lock);
        init_waitqueue_head(&migf->poll_wait);
        init_completion(&migf->save_comp);
        /*
         * save_comp is being used as a binary semaphore built from
         * a completion. A normal mutex cannot be used because the lock is
         * passed between kernel threads and lockdep can't model this.
         */
        complete(&migf->save_comp);
        mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
        INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
        ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length);
        if (ret)
                goto out_pd;

        buf = mlx5vf_alloc_data_buffer(migf, length, DMA_FROM_DEVICE);
        if (IS_ERR(buf)) {
                ret = PTR_ERR(buf);
                goto out_pd;
        }

        ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf);
        if (ret)
                goto out_save;
        migf->buf = buf;
        return migf;
out_save:
        mlx5vf_free_data_buffer(buf);
out_pd:
        mlx5vf_cmd_dealloc_pd(migf);
out_free:
        fput(migf->filp);
end:
        kfree(migf);
        return ERR_PTR(ret);
}

static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
                                   size_t len, loff_t *pos)
{
        struct mlx5_vf_migration_file *migf = filp->private_data;
        struct mlx5_vhca_data_buffer *vhca_buf = migf->buf;
        loff_t requested_length;
        ssize_t done = 0;

        if (pos)
                return -ESPIPE;
        pos = &filp->f_pos;

        if (*pos < 0 ||
            check_add_overflow((loff_t)len, *pos, &requested_length))
                return -EINVAL;

        if (requested_length > MAX_MIGRATION_SIZE)
                return -ENOMEM;

        mutex_lock(&migf->lock);
        if (migf->state == MLX5_MIGF_STATE_ERROR) {
                done = -ENODEV;
                goto out_unlock;
        }

        if (vhca_buf->allocated_length < requested_length) {
                done = mlx5vf_add_migration_pages(
                        vhca_buf,
                        DIV_ROUND_UP(requested_length - vhca_buf->allocated_length,
                                     PAGE_SIZE));
                if (done)
                        goto out_unlock;
        }

        while (len) {
                size_t page_offset;
                struct page *page;
                size_t page_len;
                u8 *to_buff;
                int ret;

                page_offset = (*pos) % PAGE_SIZE;
                page = mlx5vf_get_migration_page(vhca_buf, *pos - page_offset);
                if (!page) {
                        if (done == 0)
                                done = -EINVAL;
                        goto out_unlock;
                }

                page_len = min_t(size_t, len, PAGE_SIZE - page_offset);
                to_buff = kmap_local_page(page);
                ret = copy_from_user(to_buff + page_offset, buf, page_len);
                kunmap_local(to_buff);
                if (ret) {
                        done = -EFAULT;
                        goto out_unlock;
                }
                *pos += page_len;
                len -= page_len;
                done += page_len;
                buf += page_len;
                vhca_buf->length += page_len;
        }
out_unlock:
        mutex_unlock(&migf->lock);
        return done;
}

static const struct file_operations mlx5vf_resume_fops = {
        .owner = THIS_MODULE,
        .write = mlx5vf_resume_write,
        .release = mlx5vf_release_file,
        .llseek = no_llseek,
};

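/*
 * Build the resuming side of a migration: create an anonymous-inode backed
 * migration file, allocate a PD and an initially empty data buffer.
 * Userspace writes the previously saved state into the returned file and
 * the buffer grows on demand in mlx5vf_resume_write().
 */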
static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
{
        struct mlx5_vf_migration_file *migf;
        struct mlx5_vhca_data_buffer *buf;
        int ret;

        migf = kzalloc(sizeof(*migf), GFP_KERNEL);
        if (!migf)
                return ERR_PTR(-ENOMEM);

        migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
                                        O_WRONLY);
        if (IS_ERR(migf->filp)) {
                ret = PTR_ERR(migf->filp);
                goto end;
        }

        migf->mvdev = mvdev;
        ret = mlx5vf_cmd_alloc_pd(migf);
        if (ret)
                goto out_free;

        buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
        if (IS_ERR(buf)) {
                ret = PTR_ERR(buf);
                goto out_pd;
        }

        migf->buf = buf;
        stream_open(migf->filp->f_inode, migf->filp);
        mutex_init(&migf->lock);
        return migf;
out_pd:
        mlx5vf_cmd_dealloc_pd(migf);
out_free:
        fput(migf->filp);
end:
        kfree(migf);
        return ERR_PTR(ret);
}

void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev)
{
        if (mvdev->resuming_migf) {
                mlx5vf_disable_fd(mvdev->resuming_migf);
                mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
                fput(mvdev->resuming_migf->filp);
                mvdev->resuming_migf = NULL;
        }
        if (mvdev->saving_migf) {
                mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
                cancel_work_sync(&mvdev->saving_migf->async_data.work);
                mlx5vf_disable_fd(mvdev->saving_migf);
                mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
                fput(mvdev->saving_migf->filp);
                mvdev->saving_migf = NULL;
        }
}

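/*
 * Execute a single arc of the migration state machine. Only the arcs that
 * vfio_mig_get_next_state() can produce are handled here:
 * RUNNING <-> RUNNING_P2P, RUNNING_P2P <-> STOP, STOP <-> STOP_COPY and
 * STOP <-> RESUMING. Arcs that open a data transfer session return the
 * migration file; all others return NULL on success.
 */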
static struct file *
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
                                    u32 new)
{
        u32 cur = mvdev->mig_state;
        int ret;

        if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
                ret = mlx5vf_cmd_suspend_vhca(mvdev,
                        MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
                if (ret)
                        return ERR_PTR(ret);
                return NULL;
        }

        if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
                ret = mlx5vf_cmd_resume_vhca(mvdev,
                        MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
                if (ret)
                        return ERR_PTR(ret);
                return NULL;
        }

        if (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
                ret = mlx5vf_cmd_suspend_vhca(mvdev,
                        MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
                if (ret)
                        return ERR_PTR(ret);
                return NULL;
        }

        if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) {
                ret = mlx5vf_cmd_resume_vhca(mvdev,
                        MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
                if (ret)
                        return ERR_PTR(ret);
                return NULL;
        }

        if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
                struct mlx5_vf_migration_file *migf;

                migf = mlx5vf_pci_save_device_data(mvdev);
                if (IS_ERR(migf))
                        return ERR_CAST(migf);
                get_file(migf->filp);
                mvdev->saving_migf = migf;
                return migf->filp;
        }

        if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
                mlx5vf_disable_fds(mvdev);
                return NULL;
        }

        if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
                struct mlx5_vf_migration_file *migf;

                migf = mlx5vf_pci_resume_device_data(mvdev);
                if (IS_ERR(migf))
                        return ERR_CAST(migf);
                get_file(migf->filp);
                mvdev->resuming_migf = migf;
                return migf->filp;
        }

        if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
                ret = mlx5vf_cmd_load_vhca_state(mvdev,
                                                 mvdev->resuming_migf,
                                                 mvdev->resuming_migf->buf);
                if (ret)
                        return ERR_PTR(ret);
                mlx5vf_disable_fds(mvdev);
                return NULL;
        }

        /*
         * vfio_mig_get_next_state() does not use arcs other than the above
         */
        WARN_ON(true);
        return ERR_PTR(-EINVAL);
}

/*
 * This function is called in all state_mutex unlock cases to
 * handle a 'deferred_reset' if one exists.
 */
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
{
again:
        spin_lock(&mvdev->reset_lock);
        if (mvdev->deferred_reset) {
                mvdev->deferred_reset = false;
                spin_unlock(&mvdev->reset_lock);
                mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
                mlx5vf_disable_fds(mvdev);
                goto again;
        }
        mutex_unlock(&mvdev->state_mutex);
        spin_unlock(&mvdev->reset_lock);
}

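/*
 * Step from the current migration state to @new_state one arc at a time,
 * letting vfio_mig_get_next_state() choose each intermediate state. The
 * file created by a data transfer arc is returned to the caller; on error
 * the device is left in the last state that was reached successfully.
 */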
static struct file *
mlx5vf_pci_set_device_state(struct vfio_device *vdev,
                            enum vfio_device_mig_state new_state)
{
        struct mlx5vf_pci_core_device *mvdev = container_of(
                vdev, struct mlx5vf_pci_core_device, core_device.vdev);
        enum vfio_device_mig_state next_state;
        struct file *res = NULL;
        int ret;

        mutex_lock(&mvdev->state_mutex);
        while (new_state != mvdev->mig_state) {
                ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
                                              new_state, &next_state);
                if (ret) {
                        res = ERR_PTR(ret);
                        break;
                }
                res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
                if (IS_ERR(res))
                        break;
                mvdev->mig_state = next_state;
                if (WARN_ON(res && new_state != mvdev->mig_state)) {
                        fput(res);
                        res = ERR_PTR(-EINVAL);
                        break;
                }
        }
        mlx5vf_state_mutex_unlock(mvdev);
        return res;
}

static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
                                    unsigned long *stop_copy_length)
{
        struct mlx5vf_pci_core_device *mvdev = container_of(
                vdev, struct mlx5vf_pci_core_device, core_device.vdev);
        size_t state_size;
        int ret;

        mutex_lock(&mvdev->state_mutex);
        ret = mlx5vf_cmd_query_vhca_migration_state(mvdev,
                                                    &state_size);
        if (!ret)
                *stop_copy_length = state_size;
        mlx5vf_state_mutex_unlock(mvdev);
        return ret;
}

static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
                                       enum vfio_device_mig_state *curr_state)
{
        struct mlx5vf_pci_core_device *mvdev = container_of(
                vdev, struct mlx5vf_pci_core_device, core_device.vdev);

        mutex_lock(&mvdev->state_mutex);
        *curr_state = mvdev->mig_state;
        mlx5vf_state_mutex_unlock(mvdev);
        return 0;
}

static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
{
        struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

        if (!mvdev->migrate_cap)
                return;

        /*
         * As the higher VFIO layers are holding locks across reset and using
         * those same locks with the mm_lock, we need to prevent an ABBA
         * deadlock between the state_mutex and the mm_lock.
         * In case the state_mutex was already taken, we defer the cleanup
         * work to the unlock flow of the other running context.
         */
        spin_lock(&mvdev->reset_lock);
        mvdev->deferred_reset = true;
        if (!mutex_trylock(&mvdev->state_mutex)) {
                spin_unlock(&mvdev->reset_lock);
                return;
        }
        spin_unlock(&mvdev->reset_lock);
        mlx5vf_state_mutex_unlock(mvdev);
}

static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
{
        struct mlx5vf_pci_core_device *mvdev = container_of(
                core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
        struct vfio_pci_core_device *vdev = &mvdev->core_device;
        int ret;

        ret = vfio_pci_core_enable(vdev);
        if (ret)
                return ret;

        if (mvdev->migrate_cap)
                mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
        vfio_pci_core_finish_enable(vdev);
        return 0;
}

static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
{
        struct mlx5vf_pci_core_device *mvdev = container_of(
                core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);

        mlx5vf_cmd_close_migratable(mvdev);
        vfio_pci_core_close_device(core_vdev);
}

static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
        .migration_set_state = mlx5vf_pci_set_device_state,
        .migration_get_state = mlx5vf_pci_get_device_state,
        .migration_get_data_size = mlx5vf_pci_get_data_size,
};

static const struct vfio_log_ops mlx5vf_pci_log_ops = {
        .log_start = mlx5vf_start_page_tracker,
        .log_stop = mlx5vf_stop_page_tracker,
        .log_read_and_clear = mlx5vf_tracker_read_and_clear,
};

static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
{
        struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
                        struct mlx5vf_pci_core_device, core_device.vdev);
        int ret;

        ret = vfio_pci_core_init_dev(core_vdev);
        if (ret)
                return ret;

        mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
                                  &mlx5vf_pci_log_ops);

        return 0;
}

static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
{
        struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
                        struct mlx5vf_pci_core_device, core_device.vdev);

        mlx5vf_cmd_remove_migratable(mvdev);
        vfio_pci_core_release_dev(core_vdev);
}

static const struct vfio_device_ops mlx5vf_pci_ops = {
        .name = "mlx5-vfio-pci",
        .init = mlx5vf_pci_init_dev,
        .release = mlx5vf_pci_release_dev,
        .open_device = mlx5vf_pci_open_device,
        .close_device = mlx5vf_pci_close_device,
        .ioctl = vfio_pci_core_ioctl,
        .device_feature = vfio_pci_core_ioctl_feature,
        .read = vfio_pci_core_read,
        .write = vfio_pci_core_write,
        .mmap = vfio_pci_core_mmap,
        .request = vfio_pci_core_request,
        .match = vfio_pci_core_match,
};

static int mlx5vf_pci_probe(struct pci_dev *pdev,
                            const struct pci_device_id *id)
{
        struct mlx5vf_pci_core_device *mvdev;
        int ret;

        mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
                                  &pdev->dev, &mlx5vf_pci_ops);
        if (IS_ERR(mvdev))
                return PTR_ERR(mvdev);

        dev_set_drvdata(&pdev->dev, &mvdev->core_device);
        ret = vfio_pci_core_register_device(&mvdev->core_device);
        if (ret)
                goto out_put_vdev;
        return 0;

out_put_vdev:
        vfio_put_device(&mvdev->core_device.vdev);
        return ret;
}

static void mlx5vf_pci_remove(struct pci_dev *pdev)
{
        struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

        vfio_pci_core_unregister_device(&mvdev->core_device);
        vfio_put_device(&mvdev->core_device.vdev);
}

static const struct pci_device_id mlx5vf_pci_table[] = {
        { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
        {}
};

MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);

static const struct pci_error_handlers mlx5vf_err_handlers = {
        .reset_done = mlx5vf_pci_aer_reset_done,
        .error_detected = vfio_pci_core_aer_err_detected,
};

static struct pci_driver mlx5vf_pci_driver = {
        .name = KBUILD_MODNAME,
        .id_table = mlx5vf_pci_table,
        .probe = mlx5vf_pci_probe,
        .remove = mlx5vf_pci_remove,
        .err_handler = &mlx5vf_err_handlers,
        .driver_managed_dma = true,
};

module_pci_driver(mlx5vf_pci_driver);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
MODULE_DESCRIPTION(
        "MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");