xref: /openbmc/linux/drivers/vfio/pci/vfio_pci.c (revision 9d749629)
1 /*
2  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
3  *     Author: Alex Williamson <alex.williamson@redhat.com>
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License version 2 as
7  * published by the Free Software Foundation.
8  *
9  * Derived from original vfio:
10  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
11  * Author: Tom Lyon, pugs@cisco.com
12  */
13 
14 #include <linux/device.h>
15 #include <linux/eventfd.h>
16 #include <linux/interrupt.h>
17 #include <linux/iommu.h>
18 #include <linux/module.h>
19 #include <linux/mutex.h>
20 #include <linux/notifier.h>
21 #include <linux/pci.h>
22 #include <linux/pm_runtime.h>
23 #include <linux/slab.h>
24 #include <linux/types.h>
25 #include <linux/uaccess.h>
26 #include <linux/vfio.h>
27 
28 #include "vfio_pci_private.h"
29 
30 #define DRIVER_VERSION  "0.2"
31 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
32 #define DRIVER_DESC     "VFIO PCI - User Level meta-driver"
33 
34 static bool nointxmask;
35 module_param_named(nointxmask, nointxmask, bool, S_IRUGO | S_IWUSR);
36 MODULE_PARM_DESC(nointxmask,
37 		  "Disable support for PCI 2.3 style INTx masking.  If this resolves problems for specific devices, report lspci -vvvxxx to linux-pci@vger.kernel.org so the device can be fixed automatically via the broken_intx_masking flag.");
38 
39 static int vfio_pci_enable(struct vfio_pci_device *vdev)
40 {
41 	struct pci_dev *pdev = vdev->pdev;
42 	int ret;
43 	u16 cmd;
44 	u8 msix_pos;
45 
46 	ret = pci_enable_device(pdev);
47 	if (ret)
48 		return ret;
49 
50 	vdev->reset_works = (pci_reset_function(pdev) == 0);
51 	pci_save_state(pdev);
52 	vdev->pci_saved_state = pci_store_saved_state(pdev);
53 	if (!vdev->pci_saved_state)
54 		pr_debug("%s: Couldn't store %s saved state\n",
55 			 __func__, dev_name(&pdev->dev));
56 
57 	ret = vfio_config_init(vdev);
58 	if (ret) {
59 		pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state);
60 		pci_disable_device(pdev);
61 		return ret;
62 	}
63 
64 	if (likely(!nointxmask))
65 		vdev->pci_2_3 = pci_intx_mask_supported(pdev);
66 
67 	pci_read_config_word(pdev, PCI_COMMAND, &cmd);
68 	if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) {
69 		cmd &= ~PCI_COMMAND_INTX_DISABLE;
70 		pci_write_config_word(pdev, PCI_COMMAND, cmd);
71 	}
72 
73 	msix_pos = pci_find_capability(pdev, PCI_CAP_ID_MSIX);
74 	if (msix_pos) {
75 		u16 flags;
76 		u32 table;
77 
78 		pci_read_config_word(pdev, msix_pos + PCI_MSIX_FLAGS, &flags);
79 		pci_read_config_dword(pdev, msix_pos + PCI_MSIX_TABLE, &table);
80 
81 		vdev->msix_bar = table & PCI_MSIX_FLAGS_BIRMASK;
82 		vdev->msix_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
83 		vdev->msix_size = ((flags & PCI_MSIX_FLAGS_QSIZE) + 1) * 16;
84 	} else
85 		vdev->msix_bar = 0xFF;
86 
87 	return 0;
88 }
89 
90 static void vfio_pci_disable(struct vfio_pci_device *vdev)
91 {
92 	struct pci_dev *pdev = vdev->pdev;
93 	int bar;
94 
95 	pci_disable_device(pdev);
96 
97 	vfio_pci_set_irqs_ioctl(vdev, VFIO_IRQ_SET_DATA_NONE |
98 				VFIO_IRQ_SET_ACTION_TRIGGER,
99 				vdev->irq_type, 0, 0, NULL);
100 
101 	vdev->virq_disabled = false;
102 
103 	vfio_config_free(vdev);
104 
105 	for (bar = PCI_STD_RESOURCES; bar <= PCI_STD_RESOURCE_END; bar++) {
106 		if (!vdev->barmap[bar])
107 			continue;
108 		pci_iounmap(pdev, vdev->barmap[bar]);
109 		pci_release_selected_regions(pdev, 1 << bar);
110 		vdev->barmap[bar] = NULL;
111 	}
112 
113 	/*
114 	 * If we have saved state, restore it.  If we can reset the device,
115 	 * even better.  Resetting with current state seems better than
116 	 * nothing, but saving and restoring current state without reset
117 	 * is just busy work.
118 	 */
119 	if (pci_load_and_free_saved_state(pdev, &vdev->pci_saved_state)) {
120 		pr_info("%s: Couldn't reload %s saved state\n",
121 			__func__, dev_name(&pdev->dev));
122 
123 		if (!vdev->reset_works)
124 			return;
125 
126 		pci_save_state(pdev);
127 	}
128 
129 	/*
130 	 * Disable INTx and MSI, presumably to avoid spurious interrupts
131 	 * during reset.  Stolen from pci_reset_function()
132 	 */
133 	pci_write_config_word(pdev, PCI_COMMAND, PCI_COMMAND_INTX_DISABLE);
134 
135 	if (vdev->reset_works)
136 		__pci_reset_function(pdev);
137 
138 	pci_restore_state(pdev);
139 }
140 
141 static void vfio_pci_release(void *device_data)
142 {
143 	struct vfio_pci_device *vdev = device_data;
144 
145 	if (atomic_dec_and_test(&vdev->refcnt))
146 		vfio_pci_disable(vdev);
147 
148 	module_put(THIS_MODULE);
149 }
150 
151 static int vfio_pci_open(void *device_data)
152 {
153 	struct vfio_pci_device *vdev = device_data;
154 
155 	if (!try_module_get(THIS_MODULE))
156 		return -ENODEV;
157 
158 	if (atomic_inc_return(&vdev->refcnt) == 1) {
159 		int ret = vfio_pci_enable(vdev);
160 		if (ret) {
161 			module_put(THIS_MODULE);
162 			return ret;
163 		}
164 	}
165 
166 	return 0;
167 }
168 
169 static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
170 {
171 	if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
172 		u8 pin;
173 		pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin);
174 		if (pin)
175 			return 1;
176 
177 	} else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
178 		u8 pos;
179 		u16 flags;
180 
181 		pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSI);
182 		if (pos) {
183 			pci_read_config_word(vdev->pdev,
184 					     pos + PCI_MSI_FLAGS, &flags);
185 
186 			return 1 << (flags & PCI_MSI_FLAGS_QMASK);
187 		}
188 	} else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
189 		u8 pos;
190 		u16 flags;
191 
192 		pos = pci_find_capability(vdev->pdev, PCI_CAP_ID_MSIX);
193 		if (pos) {
194 			pci_read_config_word(vdev->pdev,
195 					     pos + PCI_MSIX_FLAGS, &flags);
196 
197 			return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
198 		}
199 	}
200 
201 	return 0;
202 }
203 
204 static long vfio_pci_ioctl(void *device_data,
205 			   unsigned int cmd, unsigned long arg)
206 {
207 	struct vfio_pci_device *vdev = device_data;
208 	unsigned long minsz;
209 
210 	if (cmd == VFIO_DEVICE_GET_INFO) {
211 		struct vfio_device_info info;
212 
213 		minsz = offsetofend(struct vfio_device_info, num_irqs);
214 
215 		if (copy_from_user(&info, (void __user *)arg, minsz))
216 			return -EFAULT;
217 
218 		if (info.argsz < minsz)
219 			return -EINVAL;
220 
221 		info.flags = VFIO_DEVICE_FLAGS_PCI;
222 
223 		if (vdev->reset_works)
224 			info.flags |= VFIO_DEVICE_FLAGS_RESET;
225 
226 		info.num_regions = VFIO_PCI_NUM_REGIONS;
227 		info.num_irqs = VFIO_PCI_NUM_IRQS;
228 
229 		return copy_to_user((void __user *)arg, &info, minsz);
230 
231 	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
232 		struct pci_dev *pdev = vdev->pdev;
233 		struct vfio_region_info info;
234 
235 		minsz = offsetofend(struct vfio_region_info, offset);
236 
237 		if (copy_from_user(&info, (void __user *)arg, minsz))
238 			return -EFAULT;
239 
240 		if (info.argsz < minsz)
241 			return -EINVAL;
242 
243 		switch (info.index) {
244 		case VFIO_PCI_CONFIG_REGION_INDEX:
245 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
246 			info.size = pdev->cfg_size;
247 			info.flags = VFIO_REGION_INFO_FLAG_READ |
248 				     VFIO_REGION_INFO_FLAG_WRITE;
249 			break;
250 		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
251 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
252 			info.size = pci_resource_len(pdev, info.index);
253 			if (!info.size) {
254 				info.flags = 0;
255 				break;
256 			}
257 
258 			info.flags = VFIO_REGION_INFO_FLAG_READ |
259 				     VFIO_REGION_INFO_FLAG_WRITE;
260 			if (pci_resource_flags(pdev, info.index) &
261 			    IORESOURCE_MEM && info.size >= PAGE_SIZE)
262 				info.flags |= VFIO_REGION_INFO_FLAG_MMAP;
263 			break;
264 		case VFIO_PCI_ROM_REGION_INDEX:
265 		{
266 			void __iomem *io;
267 			size_t size;
268 
269 			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
270 			info.flags = 0;
271 
272 			/* Report the BAR size, not the ROM size */
273 			info.size = pci_resource_len(pdev, info.index);
274 			if (!info.size)
275 				break;
276 
277 			/* Is it really there? */
278 			io = pci_map_rom(pdev, &size);
279 			if (!io || !size) {
280 				info.size = 0;
281 				break;
282 			}
283 			pci_unmap_rom(pdev, io);
284 
285 			info.flags = VFIO_REGION_INFO_FLAG_READ;
286 			break;
287 		}
288 		default:
289 			return -EINVAL;
290 		}
291 
292 		return copy_to_user((void __user *)arg, &info, minsz);
293 
294 	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
295 		struct vfio_irq_info info;
296 
297 		minsz = offsetofend(struct vfio_irq_info, count);
298 
299 		if (copy_from_user(&info, (void __user *)arg, minsz))
300 			return -EFAULT;
301 
302 		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
303 			return -EINVAL;
304 
305 		info.flags = VFIO_IRQ_INFO_EVENTFD;
306 
307 		info.count = vfio_pci_get_irq_count(vdev, info.index);
308 
309 		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
310 			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
311 				       VFIO_IRQ_INFO_AUTOMASKED);
312 		else
313 			info.flags |= VFIO_IRQ_INFO_NORESIZE;
314 
315 		return copy_to_user((void __user *)arg, &info, minsz);
316 
317 	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
318 		struct vfio_irq_set hdr;
319 		u8 *data = NULL;
320 		int ret = 0;
321 
322 		minsz = offsetofend(struct vfio_irq_set, count);
323 
324 		if (copy_from_user(&hdr, (void __user *)arg, minsz))
325 			return -EFAULT;
326 
327 		if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
328 		    hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
329 				  VFIO_IRQ_SET_ACTION_TYPE_MASK))
330 			return -EINVAL;
331 
332 		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
333 			size_t size;
334 
335 			if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
336 				size = sizeof(uint8_t);
337 			else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
338 				size = sizeof(int32_t);
339 			else
340 				return -EINVAL;
341 
342 			if (hdr.argsz - minsz < hdr.count * size ||
343 			    hdr.count > vfio_pci_get_irq_count(vdev, hdr.index))
344 				return -EINVAL;
345 
346 			data = memdup_user((void __user *)(arg + minsz),
347 					   hdr.count * size);
348 			if (IS_ERR(data))
349 				return PTR_ERR(data);
350 		}
351 
352 		mutex_lock(&vdev->igate);
353 
354 		ret = vfio_pci_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
355 					      hdr.start, hdr.count, data);
356 
357 		mutex_unlock(&vdev->igate);
358 		kfree(data);
359 
360 		return ret;
361 
362 	} else if (cmd == VFIO_DEVICE_RESET)
363 		return vdev->reset_works ?
364 			pci_reset_function(vdev->pdev) : -EINVAL;
365 
366 	return -ENOTTY;
367 }
368 
369 static ssize_t vfio_pci_read(void *device_data, char __user *buf,
370 			     size_t count, loff_t *ppos)
371 {
372 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
373 	struct vfio_pci_device *vdev = device_data;
374 	struct pci_dev *pdev = vdev->pdev;
375 
376 	if (index >= VFIO_PCI_NUM_REGIONS)
377 		return -EINVAL;
378 
379 	if (index == VFIO_PCI_CONFIG_REGION_INDEX)
380 		return vfio_pci_config_readwrite(vdev, buf, count, ppos, false);
381 	else if (index == VFIO_PCI_ROM_REGION_INDEX)
382 		return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);
383 	else if (pci_resource_flags(pdev, index) & IORESOURCE_IO)
384 		return vfio_pci_io_readwrite(vdev, buf, count, ppos, false);
385 	else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM)
386 		return vfio_pci_mem_readwrite(vdev, buf, count, ppos, false);
387 
388 	return -EINVAL;
389 }
390 
391 static ssize_t vfio_pci_write(void *device_data, const char __user *buf,
392 			      size_t count, loff_t *ppos)
393 {
394 	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
395 	struct vfio_pci_device *vdev = device_data;
396 	struct pci_dev *pdev = vdev->pdev;
397 
398 	if (index >= VFIO_PCI_NUM_REGIONS)
399 		return -EINVAL;
400 
401 	if (index == VFIO_PCI_CONFIG_REGION_INDEX)
402 		return vfio_pci_config_readwrite(vdev, (char __user *)buf,
403 						 count, ppos, true);
404 	else if (index == VFIO_PCI_ROM_REGION_INDEX)
405 		return -EINVAL;
406 	else if (pci_resource_flags(pdev, index) & IORESOURCE_IO)
407 		return vfio_pci_io_readwrite(vdev, (char __user *)buf,
408 					     count, ppos, true);
409 	else if (pci_resource_flags(pdev, index) & IORESOURCE_MEM) {
410 		return vfio_pci_mem_readwrite(vdev, (char __user *)buf,
411 					      count, ppos, true);
412 	}
413 
414 	return -EINVAL;
415 }
416 
417 static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma)
418 {
419 	struct vfio_pci_device *vdev = device_data;
420 	struct pci_dev *pdev = vdev->pdev;
421 	unsigned int index;
422 	u64 phys_len, req_len, pgoff, req_start;
423 	int ret;
424 
425 	index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
426 
427 	if (vma->vm_end < vma->vm_start)
428 		return -EINVAL;
429 	if ((vma->vm_flags & VM_SHARED) == 0)
430 		return -EINVAL;
431 	if (index >= VFIO_PCI_ROM_REGION_INDEX)
432 		return -EINVAL;
433 	if (!(pci_resource_flags(pdev, index) & IORESOURCE_MEM))
434 		return -EINVAL;
435 
436 	phys_len = pci_resource_len(pdev, index);
437 	req_len = vma->vm_end - vma->vm_start;
438 	pgoff = vma->vm_pgoff &
439 		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
440 	req_start = pgoff << PAGE_SHIFT;
441 
442 	if (phys_len < PAGE_SIZE || req_start + req_len > phys_len)
443 		return -EINVAL;
444 
445 	if (index == vdev->msix_bar) {
446 		/*
447 		 * Disallow mmaps overlapping the MSI-X table; users don't
448 		 * get to touch this directly.  We could find somewhere
449 		 * else to map the overlap, but page granularity is only
450 		 * a recommendation, not a requirement, so the user needs
451 		 * to know which bits are real.  Requiring them to mmap
452 		 * around the table makes that clear.
453 		 */
454 
455 		/* If neither entirely above nor below, then it overlaps */
456 		if (!(req_start >= vdev->msix_offset + vdev->msix_size ||
457 		      req_start + req_len <= vdev->msix_offset))
458 			return -EINVAL;
459 	}
460 
461 	/*
462 	 * Even though we don't make use of the barmap for the mmap,
463 	 * we need to request the region and the barmap tracks that.
464 	 */
465 	if (!vdev->barmap[index]) {
466 		ret = pci_request_selected_regions(pdev,
467 						   1 << index, "vfio-pci");
468 		if (ret)
469 			return ret;
470 
471 		vdev->barmap[index] = pci_iomap(pdev, index, 0);
472 	}
473 
474 	vma->vm_private_data = vdev;
475 	vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP;
476 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
477 	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
478 
479 	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
480 			       req_len, vma->vm_page_prot);
481 }
482 
483 static const struct vfio_device_ops vfio_pci_ops = {
484 	.name		= "vfio-pci",
485 	.open		= vfio_pci_open,
486 	.release	= vfio_pci_release,
487 	.ioctl		= vfio_pci_ioctl,
488 	.read		= vfio_pci_read,
489 	.write		= vfio_pci_write,
490 	.mmap		= vfio_pci_mmap,
491 };
492 
493 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
494 {
495 	u8 type;
496 	struct vfio_pci_device *vdev;
497 	struct iommu_group *group;
498 	int ret;
499 
500 	pci_read_config_byte(pdev, PCI_HEADER_TYPE, &type);
501 	if ((type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL)
502 		return -EINVAL;
503 
504 	group = iommu_group_get(&pdev->dev);
505 	if (!group)
506 		return -EINVAL;
507 
508 	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
509 	if (!vdev) {
510 		iommu_group_put(group);
511 		return -ENOMEM;
512 	}
513 
514 	vdev->pdev = pdev;
515 	vdev->irq_type = VFIO_PCI_NUM_IRQS;
516 	mutex_init(&vdev->igate);
517 	spin_lock_init(&vdev->irqlock);
518 	atomic_set(&vdev->refcnt, 0);
519 
520 	ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev);
521 	if (ret) {
522 		iommu_group_put(group);
523 		kfree(vdev);
524 	}
525 
526 	return ret;
527 }
528 
529 static void vfio_pci_remove(struct pci_dev *pdev)
530 {
531 	struct vfio_pci_device *vdev;
532 
533 	vdev = vfio_del_group_dev(&pdev->dev);
534 	if (!vdev)
535 		return;
536 
537 	iommu_group_put(pdev->dev.iommu_group);
538 	kfree(vdev);
539 }
540 
541 static struct pci_driver vfio_pci_driver = {
542 	.name		= "vfio-pci",
543 	.id_table	= NULL, /* only dynamic ids */
544 	.probe		= vfio_pci_probe,
545 	.remove		= vfio_pci_remove,
546 };
547 
548 static void __exit vfio_pci_cleanup(void)
549 {
550 	pci_unregister_driver(&vfio_pci_driver);
551 	vfio_pci_virqfd_exit();
552 	vfio_pci_uninit_perm_bits();
553 }
554 
555 static int __init vfio_pci_init(void)
556 {
557 	int ret;
558 
559 	/* Allocate shared config space permision data used by all devices */
560 	ret = vfio_pci_init_perm_bits();
561 	if (ret)
562 		return ret;
563 
564 	/* Start the virqfd cleanup handler */
565 	ret = vfio_pci_virqfd_init();
566 	if (ret)
567 		goto out_virqfd;
568 
569 	/* Register and scan for devices */
570 	ret = pci_register_driver(&vfio_pci_driver);
571 	if (ret)
572 		goto out_driver;
573 
574 	return 0;
575 
576 out_driver:
577 	vfio_pci_virqfd_exit();
578 out_virqfd:
579 	vfio_pci_uninit_perm_bits();
580 	return ret;
581 }
582 
583 module_init(vfio_pci_init);
584 module_exit(vfio_pci_cleanup);
585 
586 MODULE_VERSION(DRIVER_VERSION);
587 MODULE_LICENSE("GPL v2");
588 MODULE_AUTHOR(DRIVER_AUTHOR);
589 MODULE_DESCRIPTION(DRIVER_DESC);
590