/*
 * VFIO regions
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Alex Williamson <alex.williamson@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Based on qemu-kvm device-assignment:
 *  Adapted for KVM by Qumranet.
 *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
 *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
 *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
 *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
 *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>

#include "hw/vfio/vfio-common.h"
#include "hw/vfio/pci.h"
#include "hw/hw.h"
#include "trace.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/units.h"
#include "monitor/monitor.h"

/*
 * IO Port/MMIO - Beware of the endians, VFIO is always little endian
 */
void vfio_region_write(void *opaque, hwaddr addr,
                       uint64_t data, unsigned size)
{
    VFIORegion *region = opaque;
    VFIODevice *vbasedev = region->vbasedev;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;

    switch (size) {
    case 1:
        buf.byte = data;
        break;
    case 2:
        buf.word = cpu_to_le16(data);
        break;
    case 4:
        buf.dword = cpu_to_le32(data);
        break;
    case 8:
        buf.qword = cpu_to_le64(data);
        break;
    default:
        hw_error("vfio: unsupported write size, %u bytes", size);
        break;
    }

    if (pwrite(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", 0x%"PRIx64
                     ",%d) failed: %m",
                     __func__, vbasedev->name, region->nr,
                     addr, data, size);
    }

    trace_vfio_region_write(vbasedev->name, region->nr, addr, data, size);

    /*
     * A read or write to a BAR always signals an INTx EOI. This will
     * do nothing if not pending (including not in INTx mode). We assume
     * that a BAR access is in response to an interrupt and that BAR
     * accesses will service the interrupt. Unfortunately, we don't know
     * which access will service the interrupt, so we're potentially
     * getting quite a few host interrupts per guest interrupt.
     */
    vbasedev->ops->vfio_eoi(vbasedev);
}
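
/*
 * Illustrative example of the endianness handling above: a 4-byte guest
 * write of 0x12345678 at region offset 0x10 is stored as
 * cpu_to_le32(0x12345678) and pwritten at region->fd_offset + 0x10;
 * vfio_region_read() below does the inverse le32_to_cpu() on the bytes
 * returned by pread(), so the guest always sees the device's
 * little-endian layout regardless of host byte order.
 */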

uint64_t vfio_region_read(void *opaque,
                          hwaddr addr, unsigned size)
{
    VFIORegion *region = opaque;
    VFIODevice *vbasedev = region->vbasedev;
    union {
        uint8_t byte;
        uint16_t word;
        uint32_t dword;
        uint64_t qword;
    } buf;
    uint64_t data = 0;

    if (pread(vbasedev->fd, &buf, size, region->fd_offset + addr) != size) {
        error_report("%s(%s:region%d+0x%"HWADDR_PRIx", %d) failed: %m",
                     __func__, vbasedev->name, region->nr,
                     addr, size);
        return (uint64_t)-1;
    }
    switch (size) {
    case 1:
        data = buf.byte;
        break;
    case 2:
        data = le16_to_cpu(buf.word);
        break;
    case 4:
        data = le32_to_cpu(buf.dword);
        break;
    case 8:
        data = le64_to_cpu(buf.qword);
        break;
    default:
        hw_error("vfio: unsupported read size, %u bytes", size);
        break;
    }

    trace_vfio_region_read(vbasedev->name, region->nr, addr, size, data);

    /* Same as write above */
    vbasedev->ops->vfio_eoi(vbasedev);

    return data;
}

static const MemoryRegionOps vfio_region_ops = {
    .read = vfio_region_read,
    .write = vfio_region_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .valid = {
        .min_access_size = 1,
        .max_access_size = 8,
    },
    .impl = {
        .min_access_size = 1,
        .max_access_size = 8,
    },
};

static int vfio_setup_region_sparse_mmaps(VFIORegion *region,
                                          struct vfio_region_info *info)
{
    struct vfio_info_cap_header *hdr;
    struct vfio_region_info_cap_sparse_mmap *sparse;
    int i, j;

    hdr = vfio_get_region_info_cap(info, VFIO_REGION_INFO_CAP_SPARSE_MMAP);
    if (!hdr) {
        return -ENODEV;
    }

    sparse = container_of(hdr, struct vfio_region_info_cap_sparse_mmap, header);

    trace_vfio_region_sparse_mmap_header(region->vbasedev->name,
                                         region->nr, sparse->nr_areas);

    region->mmaps = g_new0(VFIOMmap, sparse->nr_areas);

    for (i = 0, j = 0; i < sparse->nr_areas; i++) {
        if (sparse->areas[i].size) {
            trace_vfio_region_sparse_mmap_entry(i, sparse->areas[i].offset,
                                                sparse->areas[i].offset +
                                                sparse->areas[i].size - 1);
            region->mmaps[j].offset = sparse->areas[i].offset;
            region->mmaps[j].size = sparse->areas[i].size;
            j++;
        }
    }

    region->nr_mmaps = j;
    region->mmaps = g_realloc(region->mmaps, j * sizeof(VFIOMmap));

    return 0;
}

int vfio_region_setup(Object *obj, VFIODevice *vbasedev, VFIORegion *region,
                      int index, const char *name)
{
    g_autofree struct vfio_region_info *info = NULL;
    int ret;

    ret = vfio_get_region_info(vbasedev, index, &info);
    if (ret) {
        return ret;
    }

    region->vbasedev = vbasedev;
    region->flags = info->flags;
    region->size = info->size;
    region->fd_offset = info->offset;
    region->nr = index;

    if (region->size) {
        region->mem = g_new0(MemoryRegion, 1);
        memory_region_init_io(region->mem, obj, &vfio_region_ops,
                              region, name, region->size);

        if (!vbasedev->no_mmap &&
            region->flags & VFIO_REGION_INFO_FLAG_MMAP) {

            ret = vfio_setup_region_sparse_mmaps(region, info);

            if (ret) {
                region->nr_mmaps = 1;
                region->mmaps = g_new0(VFIOMmap, region->nr_mmaps);
                region->mmaps[0].offset = 0;
                region->mmaps[0].size = region->size;
            }
        }
    }

    trace_vfio_region_setup(vbasedev->name, index, name,
                            region->flags, region->fd_offset, region->size);
    return 0;
}
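
/*
 * A rough sketch of how a caller typically drives these helpers; the
 * vdev/bar field names below are illustrative and error handling is
 * omitted:
 *
 *     VFIORegion *region = &vdev->bars[nr].region;
 *
 *     vfio_region_setup(OBJECT(vdev), &vdev->vbasedev, region, nr, "BARn");
 *     vfio_region_mmap(region);         // optional fast path; on failure,
 *                                       // accesses fall back to the slow
 *                                       // vfio_region_ops read/write path
 *     ...
 *     vfio_region_exit(region);         // on unrealize
 *     vfio_region_finalize(region);     // when the region is torn down
 */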

static void vfio_subregion_unmap(VFIORegion *region, int index)
{
    trace_vfio_region_unmap(memory_region_name(&region->mmaps[index].mem),
                            region->mmaps[index].offset,
                            region->mmaps[index].offset +
                            region->mmaps[index].size - 1);
    memory_region_del_subregion(region->mem, &region->mmaps[index].mem);
    munmap(region->mmaps[index].mmap, region->mmaps[index].size);
    object_unparent(OBJECT(&region->mmaps[index].mem));
    region->mmaps[index].mmap = NULL;
}

int vfio_region_mmap(VFIORegion *region)
{
    int i, ret, prot = 0;
    char *name;

    if (!region->mem) {
        return 0;
    }

    prot |= region->flags & VFIO_REGION_INFO_FLAG_READ ? PROT_READ : 0;
    prot |= region->flags & VFIO_REGION_INFO_FLAG_WRITE ? PROT_WRITE : 0;

    for (i = 0; i < region->nr_mmaps; i++) {
        size_t align = MIN(1ULL << ctz64(region->mmaps[i].size), 1 * GiB);
        void *map_base, *map_align;

        /*
         * Align the mmap for more efficient mapping in the kernel. Ideally
         * we'd know the PMD and PUD mapping sizes to use as discrete alignment
         * intervals, but we don't. As of Linux v6.12, the largest PUD size
         * supporting huge pfnmap is 1GiB (ARCH_SUPPORTS_PUD_PFNMAP is only set
         * on x86_64). Align by power-of-two size, capped at 1GiB.
         *
         * NB. qemu_memalign() and friends actually allocate memory, whereas
         * the region size here can exceed host memory, therefore we manually
         * create an oversized anonymous mapping and clean it up for alignment.
         */
        map_base = mmap(0, region->mmaps[i].size + align, PROT_NONE,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (map_base == MAP_FAILED) {
            ret = -errno;
            goto no_mmap;
        }

        map_align = (void *)ROUND_UP((uintptr_t)map_base, (uintptr_t)align);
        munmap(map_base, map_align - map_base);
        munmap(map_align + region->mmaps[i].size,
               align - (map_align - map_base));

        region->mmaps[i].mmap = mmap(map_align, region->mmaps[i].size, prot,
                                     MAP_SHARED | MAP_FIXED,
                                     region->vbasedev->fd,
                                     region->fd_offset +
                                     region->mmaps[i].offset);
        if (region->mmaps[i].mmap == MAP_FAILED) {
            ret = -errno;
            goto no_mmap;
        }

        name = g_strdup_printf("%s mmaps[%d]",
                               memory_region_name(region->mem), i);
        memory_region_init_ram_device_ptr(&region->mmaps[i].mem,
                                          memory_region_owner(region->mem),
                                          name, region->mmaps[i].size,
                                          region->mmaps[i].mmap);
        g_free(name);
        memory_region_add_subregion(region->mem, region->mmaps[i].offset,
                                    &region->mmaps[i].mem);

        trace_vfio_region_mmap(memory_region_name(&region->mmaps[i].mem),
                               region->mmaps[i].offset,
                               region->mmaps[i].offset +
                               region->mmaps[i].size - 1);
    }

    return 0;

no_mmap:
    trace_vfio_region_mmap_fault(memory_region_name(region->mem), i,
                                 region->fd_offset + region->mmaps[i].offset,
                                 region->fd_offset + region->mmaps[i].offset +
                                 region->mmaps[i].size - 1, ret);

    region->mmaps[i].mmap = NULL;

    for (i--; i >= 0; i--) {
        vfio_subregion_unmap(region, i);
    }

    return ret;
}
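
/*
 * Worked example of the alignment logic above, with illustrative numbers:
 * for a 512 MiB sparse area, align = MIN(512 MiB, 1 GiB) = 512 MiB. We
 * reserve 512 MiB + 512 MiB of PROT_NONE anonymous memory, round map_base
 * up to the next 512 MiB boundary, unmap the leading and trailing slack,
 * and then MAP_FIXED the real device mapping into the naturally aligned
 * hole that remains. A 3 MiB area would instead get 1 MiB alignment,
 * since ctz64() keys off the largest power of two dividing the size.
 */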

void vfio_region_unmap(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            vfio_subregion_unmap(region, i);
        }
    }
}

void vfio_region_exit(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            memory_region_del_subregion(region->mem, &region->mmaps[i].mem);
        }
    }

    trace_vfio_region_exit(region->vbasedev->name, region->nr);
}

void vfio_region_finalize(VFIORegion *region)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            munmap(region->mmaps[i].mmap, region->mmaps[i].size);
            object_unparent(OBJECT(&region->mmaps[i].mem));
        }
    }

    object_unparent(OBJECT(region->mem));

    g_free(region->mem);
    g_free(region->mmaps);

    trace_vfio_region_finalize(region->vbasedev->name, region->nr);

    region->mem = NULL;
    region->mmaps = NULL;
    region->nr_mmaps = 0;
    region->size = 0;
    region->flags = 0;
    region->nr = 0;
}

void vfio_region_mmaps_set_enabled(VFIORegion *region, bool enabled)
{
    int i;

    if (!region->mem) {
        return;
    }

    for (i = 0; i < region->nr_mmaps; i++) {
        if (region->mmaps[i].mmap) {
            memory_region_set_enabled(&region->mmaps[i].mem, enabled);
        }
    }

    trace_vfio_region_mmaps_set_enabled(memory_region_name(region->mem),
                                        enabled);
}