xref: /openbmc/linux/drivers/vfio/pci/vfio_pci.c (revision 97da55fc)
1 /*
2  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
3  *     Author: Alex Williamson <alex.williamson@redhat.com>
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License version 2 as
7  * published by the Free Software Foundation.
8  *
9  * Derived from original vfio:
10  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
11  * Author: Tom Lyon, pugs@cisco.com
12  */
13 
14 #include <linux/device.h>
15 #include <linux/eventfd.h>
16 #include <linux/interrupt.h>
17 #include <linux/iommu.h>
18 #include <linux/module.h>
19 #include <linux/mutex.h>
20 #include <linux/notifier.h>
21 #include <linux/pci.h>
22 #include <linux/pm_runtime.h>
23 #include <linux/slab.h>
24 #include <linux/types.h>
25 #include <linux/uaccess.h>
26 #include <linux/vfio.h>
27 
28 #include "vfio_pci_private.h"
29 
30 #define DRIVER_VERSION  "0.2"
31 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
32 #define DRIVER_DESC     "VFIO PCI - User Level meta-driver"
33 
34 static bool nointxmask;
35 module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
36 MODULE_PARM_DESC(nointxmask,
37 		  "Disable support for PCI 2.3 style INTx masking.  If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");
38 
39 static int vfio_pci_enable(struct vfio_pci_device *vdev)
40 {
41 	struct pci_dev *pdev = vdev->pdev;
42 	int ret;
43 	u16 cmd;
44 	u8 msix_pos;
45 
46 	ret = pci_enable_device(pdev);
47 	if (ret)
48 		return ret;
49 
50 	vdev->reset_works = (pci_reset_function(pdev) == 0);
51 	pci_save_state(pdev);
52 	vdev->pci_saved_state = pci_store_saved_state(pdev);
53 	if (!vdev->pci_saved_state)
54 		pr_debug("%s: Couldn't store %s saved state\n",
55 			 __func__, dev_name(&pdev->dev));
56 
57 	ret = vfio_config_init(vdev);
58 	if (ret) {
59 		pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state);
60 		pci_disable_device(pdev);
61 		return ret;
62 	}
63 
64 	if (likely(!nointxmask))
65 		vdev->pci_2_3 = pci_intx_mask_supported(pdev);
66 
67 	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
68 	if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
69 		cmd &= ~PCI_COMMAND_INTX_DISABLE;
70 		pci_write_config_word(pdev, PCI_COMMAND, cmd);
71 	}
72 
73 	msix_pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
74 	if (msix_pos) {
75 		u16 flags;
76 		u32 table;
77 
78 		pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
79 		pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);
80 
81 		vdev->msix_bar = table & PCI_MSIX_FLAGS_BIRMASK;
82 		vdev->msix_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
83 		vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
84 	} else
85 		vdev->msix_bar = 0xFF;
86 
87 #ifdef CONFIG_VFIO_PCI_VGA
88 	if ((pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
89 		vdev->has_vga = true;
90 #endif
91 
92 	return 0;
93 }
94 
95 static void vfio_pci_disable(struct vfio_pci_device *vdev)
96 {
97 	struct pci_dev *pdev = vdev->pdev;
98 	int bar;
99 
100 	pci_disable_device(pdev);
101 
102 	vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
103 				VFIO_IRQ_SET_ACTION_TRIGGER,
104 				vdev->irq_type, 0, 0, NULL);
105 
106 	vdev->virq_disabled = false;
107 
108 	vfio_config_free(vdev);
109 
110 	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
111 		if (!vdev->barmap[bar])
112 			continue;
113 		pci_iounmap(pdev, vdev->barmap[bar]);
114 		pci_release_selected_regions(pdev, 1 << bar);
115 		vdev->barmap[bar] = NULL;
116 	}
117 
118 	/*
119 	 * If we have saved state, restore it.  If we can reset the device,
120 	 * even better.  Resetting with current state seems better than
121 	 * nothing, but saving and restoring current state without reset
122 	 * is just busy work.
123 	 */
124 	if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
125 		pr_info("%s: Couldn't reload %s saved state\n",
126 			__func__, dev_name(&pdev->dev));
127 
128 		if (!vdev->reset_works)
129 			return;
130 
131 		pci_save_state(pdev);
132 	}
133 
134 	/*
135 	 * Disable INTx and MSI, presumably to avoid spurious interrupts
136 	 * during reset.  Stolen from pci_reset_function()
137 	 */
138 	pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
139 
140 	if (vdev->reset_works)
141 		__pci_reset_function(pdev);
142 
143 	pci_restore_state(pdev);
144 }
145 
146 static void vfio_pci_release(void *device_data)
147 {
148 	struct vfio_pci_device *vdev = device_data;
149 
150 	if (atomic_dec_and_test(&vdev->refcnt))
151 		vfio_pci_disable(vdev);
152 
153 	module_put(THIS_MODULE);
154 }
155 
156 static int vfio_pci_open(void *device_data)
157 {
158 	struct vfio_pci_device *vdev = device_data;
159 
160 	if (!try_module_get(THIS_MODULE))
161 		return -ENODEV;
162 
163 	if (atomic_inc_return(&vdev->refcnt) == 1) {
164 		int ret = vfio_pci_enable(vdev);
165 		if (ret) {
166 			module_put(THIS_MODULE);
167 			return ret;
168 		}
169 	}
170 
171 	return 0;
172 }
173 
174 static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
175 {
176 	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
177 		u8 pin;
178 		pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
179 		if (pin)
180 			return 1;
181 
182 	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
183 		u8 pos;
184 		u16 flags;
185 
186 		pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSI);
187 		if (pos) {
188 			pci_read_config_word(vdev->pdev,
189 					     pos + PCI_MSI_FLAGS, &flags);
190 
191 			return 1 << (flags & PCI_MSI_FLAGS_QMASK);
192 		}
193 	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
194 		u8 pos;
195 		u16 flags;
196 
197 		pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSIX);
198 		if (pos) {
199 			pci_read_config_word(vdev->pdev,
200 					     pos + PCI_MSIX_FLAGS, &flags);
201 
202 			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
203 		}
204 	}
205 
206 	return 0;
207 }
208 
209 static long vfio_pci_ioctl(void *device_data,
210 			   unsigned int cmd, unsigned long arg)
211 {
212 	struct vfio_pci_device *vdev = device_data;
213 	unsigned long minsz;
214 
215 	if (cmd == VFIO_DEVICE_GET_INFO) {
216 		struct vfio_device_info info;
217 
218 		minsz = offsetofend(struct vfio_device_info, num_irqs);
219 
220 		if (copy_from_user(&info, (void __user *)arg, minsz))
221 			return -EFAULT;
222 
223 		if (info.argsz < minsz)
224 			return -EINVAL;
225 
226 		info.flags = VFIO_DEVICE_FLAGS_PCI;
227 
228 		if (vdev->reset_works)
229 			info.flags |= VFIO_DEVICE_FLAGS_RESET;
230 
231 		info.num_regions = VFIO_PCI_NUM_REGIONS;
232 		info.num_irqs = VFIO_PCI_NUM_IRQS;
233 
234 		return copy_to_user((void __user *)arg, &info, minsz);
235 
236 	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
237 		struct pci_dev *pdev = vdev->pdev;
238 		struct vfio_region_info info;
239 
240 		minsz = offsetofend(struct vfio_region_info, offset);
241 
242 		if (copy_from_user(&info, (void __user *)arg, minsz))
243 			return -EFAULT;
244 
245 		if (info.argsz < minsz)
246 			return -EINVAL;
247 
248 		switch (info.index) {
249 		case VFIO_PCI_CONFIG_REGION_INDEX:
250 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
251 			info.size = pdev->cfg_size;
252 			info.flags = VFIO_REGION_INFO_FLAG_READ |
253 				     VFIO_REGION_INFO_FLAG_WRITE;
254 			break;
255 		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
256 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
257 			info.size = pci_resource_len(pdev, info.index);
258 			if (!info.size) {
259 				info.flags = 0;
260 				break;
261 			}
262 
263 			info.flags = VFIO_REGION_INFO_FLAG_READ |
264 				     VFIO_REGION_INFO_FLAG_WRITE;
265 			if (pci_resource_flags(pdev, info.index) &
266 			    IORESOURCE_MEM && info.size >= PAGE_SIZE)
267 				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
268 			break;
269 		case VFIO_PCI_ROM_REGION_INDEX:
270 		{
271 			void __iomem *io;
272 			size_t size;
273 
274 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
275 			info.flags = 0;
276 
277 			/* Report the BAR size, not the ROM size */
278 			info.size = pci_resource_len(pdev, info.index);
279 			if (!info.size)
280 				break;
281 
282 			/* Is it really there? */
283 			io = pci_map_rom(pdev, &size);
284 			if (!io || !size) {
285 				info.size = 0;
286 				break;
287 			}
288 			pci_unmap_rom(pdev, io);
289 
290 			info.flags = VFIO_REGION_INFO_FLAG_READ;
291 			break;
292 		}
293 		case VFIO_PCI_VGA_REGION_INDEX:
294 			if (!vdev->has_vga)
295 				return -EINVAL;
296 
297 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
298 			info.size = 0xc0000;
299 			info.flags = VFIO_REGION_INFO_FLAG_READ |
300 				     VFIO_REGION_INFO_FLAG_WRITE;
301 
302 			break;
303 		default:
304 			return -EINVAL;
305 		}
306 
307 		return copy_to_user((void __user *)arg, &info, minsz);
308 
309 	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
310 		struct vfio_irq_info info;
311 
312 		minsz = offsetofend(struct vfio_irq_info, count);
313 
314 		if (copy_from_user(&info, (void __user *)arg, minsz))
315 			return -EFAULT;
316 
317 		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
318 			return -EINVAL;
319 
320 		info.flags = VFIO_IRQ_INFO_EVENTFD;
321 
322 		info.count = vfio_pci_get_irq_count(vdev, info.index);
323 
324 		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
325 			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
326 				       VFIO_IRQ_INFO_AUTOMASKED);
327 		else
328 			info.flags |= VFIO_IRQ_INFO_NORESIZE;
329 
330 		return copy_to_user((void __user *)arg, &info, minsz);
331 
332 	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
333 		struct vfio_irq_set hdr;
334 		u8 *data = NULL;
335 		int ret = 0;
336 
337 		minsz = offsetofend(struct vfio_irq_set, count);
338 
339 		if (copy_from_user(&hdr, (void __user *)arg, minsz))
340 			return -EFAULT;
341 
342 		if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
343 		    hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
344 				  VFIO_IRQ_SET_ACTION_TYPE_MASK))
345 			return -EINVAL;
346 
347 		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
348 			size_t size;
349 
350 			if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
351 				size = sizeof(uint8_t);
352 			else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
353 				size = sizeof(int32_t);
354 			else
355 				return -EINVAL;
356 
357 			if (hdr.argsz - minsz < hdr.count * size ||
358 			    hdr.count > vfio_pci_get_irq_count(vdev, hdr.index))
359 				return -EINVAL;
360 
361 			data = memdup_user((void __user *)(arg + minsz),
362 					   hdr.count * size);
363 			if (IS_ERR(data))
364 				return PTR_ERR(data);
365 		}
366 
367 		mutex_lock(&vdev->igate);
368 
369 		ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
370 					      hdr.start, hdr.count, data);
371 
372 		mutex_unlock(&vdev->igate);
373 		kfree(data);
374 
375 		return ret;
376 
377 	} else if (cmd == VFIO_DEVICE_RESET)
378 		return vdev->reset_works ?
379 			pci_reset_function(vdev->pdev) : -EINVAL;
380 
381 	return -ENOTTY;
382 }
383 
384 static ssize_t vfio_pci_rw(void *device_data, char __user *buf,
385 			   size_t count, loff_t *ppos, bool iswrite)
386 {
387 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
388 	struct vfio_pci_device *vdev = device_data;
389 
390 	if (index >= VFIO_PCI_NUM_REGIONS)
391 		return -EINVAL;
392 
393 	switch (index) {
394 	case VFIO_PCI_CONFIG_REGION_INDEX:
395 		return vfio_pci_config_rw(vdev, buf, count, ppos, iswrite);
396 
397 	case VFIO_PCI_ROM_REGION_INDEX:
398 		if (iswrite)
399 			return -EINVAL;
400 		return vfio_pci_bar_rw(vdev, buf, count, ppos, false);
401 
402 	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
403 		return vfio_pci_bar_rw(vdev, buf, count, ppos, iswrite);
404 
405 	case VFIO_PCI_VGA_REGION_INDEX:
406 		return vfio_pci_vga_rw(vdev, buf, count, ppos, iswrite);
407 	}
408 
409 	return -EINVAL;
410 }
411 
412 static ssize_t vfio_pci_read(void *device_data, char __user *buf,
413 			     size_t count, loff_t *ppos)
414 {
415 	if (!count)
416 		return 0;
417 
418 	return vfio_pci_rw(device_data, buf, count, ppos, false);
419 }
420 
421 static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
422 			      size_t count, loff_t *ppos)
423 {
424 	if (!count)
425 		return 0;
426 
427 	return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true);
428 }
429 
430 static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
431 {
432 	struct vfio_pci_device *vdev = device_data;
433 	struct pci_dev *pdev = vdev->pdev;
434 	unsigned int index;
435 	u64 phys_len, req_len, pgoff, req_start;
436 	int ret;
437 
438 	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
439 
440 	if (vma->vm_end < vma->vm_start)
441 		return -EINVAL;
442 	if ((vma->vm_flags & VM_SHARED) == 0)
443 		return -EINVAL;
444 	if (index >= VFIO_PCI_ROM_REGION_INDEX)
445 		return -EINVAL;
446 	if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
447 		return -EINVAL;
448 
449 	phys_len = pci_resource_len(pdev, index);
450 	req_len = vma->vm_end - vma->vm_start;
451 	pgoff = vma->vm_pgoff &
452 		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
453 	req_start = pgoff << PAGE_SHIFT;
454 
455 	if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
456 		return -EINVAL;
457 
458 	if (index == vdev->msix_bar) {
459 		/*
460 		 * Disallow mmaps overlapping the MSI-X table; users don't
461 		 * get to touch this directly.  We could find somewhere
462 		 * else to map the overlap, but page granularity is only
463 		 * a recommendation, not a requirement, so the user needs
464 		 * to know which bits are real.  Requiring them to mmap
465 		 * around the table makes that clear.
466 		 */
467 
468 		/* If neither entirely above nor below, then it overlaps */
469 		if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
470 		      req_start + req_len <= vdev->msix_offset))
471 			return -EINVAL;
472 	}
473 
474 	/*
475 	 * Even though we don't make use of the barmap for the mmap,
476 	 * we need to request the region and the barmap tracks that.
477 	 */
478 	if (!vdev->barmap[index]) {
479 		ret = pci_request_selected_regions(pdev,
480 						   1 << index, "vfio-pci");
481 		if (ret)
482 			return ret;
483 
484 		vdev->barmap[index] = pci_iomap(pdev, index, 0);
485 	}
486 
487 	vma->vm_private_data = vdev;
488 	vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
489 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
490 	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
491 
492 	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
493 			       req_len, vma->vm_page_prot);
494 }
495 
496 static const struct vfio_device_ops vfio_pci_ops = {
497 	.name		= "vfio-pci",
498 	.open		= vfio_pci_open,
499 	.release	= vfio_pci_release,
500 	.ioctl		= vfio_pci_ioctl,
501 	.read		= vfio_pci_read,
502 	.write		= vfio_pci_write,
503 	.mmap		= vfio_pci_mmap,
504 };
505 
506 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
507 {
508 	u8 type;
509 	struct vfio_pci_device *vdev;
510 	struct iommu_group *group;
511 	int ret;
512 
513 	pci_read_config_byte(pdev, PCI_HEADER_TYPE, &type);
514 	if ((type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL)
515 		return -EINVAL;
516 
517 	group = iommu_group_get(&pdev->dev);
518 	if (!group)
519 		return -EINVAL;
520 
521 	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
522 	if (!vdev) {
523 		iommu_group_put(group);
524 		return -ENOMEM;
525 	}
526 
527 	vdev->pdev = pdev;
528 	vdev->irq_type = VFIO_PCI_NUM_IRQS;
529 	mutex_init(&vdev->igate);
530 	spin_lock_init(&vdev->irqlock);
531 	atomic_set(&vdev->refcnt, 0);
532 
533 	ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
534 	if (ret) {
535 		iommu_group_put(group);
536 		kfree(vdev);
537 	}
538 
539 	return ret;
540 }
541 
542 static void vfio_pci_remove(struct pci_dev *pdev)
543 {
544 	struct vfio_pci_device *vdev;
545 
546 	vdev = vfio_del_group_dev(&pdev->dev);
547 	if (!vdev)
548 		return;
549 
550 	iommu_group_put(pdev->dev.iommu_group);
551 	kfree(vdev);
552 }
553 
554 static struct pci_driver vfio_pci_driver = {
555 	.name		= "vfio-pci",
556 	.id_table	= NULL, /* only dynamic ids */
557 	.probe		= vfio_pci_probe,
558 	.remove		= vfio_pci_remove,
559 };
560 
561 static void __exit vfio_pci_cleanup(void)
562 {
563 	pci_unregister_driver(&vfio_pci_driver);
564 	vfio_pci_virqfd_exit();
565 	vfio_pci_uninit_perm_bits();
566 }
567 
568 static int __init vfio_pci_init(void)
569 {
570 	int ret;
571 
572 	/* Allocate shared config space permision data used by all devices */
573 	ret = vfio_pci_init_perm_bits();
574 	if (ret)
575 		return ret;
576 
577 	/* Start the virqfd cleanup handler */
578 	ret = vfio_pci_virqfd_init();
579 	if (ret)
580 		goto out_virqfd;
581 
582 	/* Register and scan for devices */
583 	ret = pci_register_driver(&vfio_pci_driver);
584 	if (ret)
585 		goto out_driver;
586 
587 	return 0;
588 
589 out_driver:
590 	vfio_pci_virqfd_exit();
591 out_virqfd:
592 	vfio_pci_uninit_perm_bits();
593 	return ret;
594 }
595 
596 module_init(vfio_pci_init);
597 module_exit(vfio_pci_cleanup);
598 
599 MODULE_VERSION(DRIVER_VERSION);
600 MODULE_LICENSE("GPL v2");
601 MODULE_AUTHOR(DRIVER_AUTHOR);
602 MODULE_DESCRIPTION(DRIVER_DESC);
603