/*
 * QEMU Xen emulation: Grant table support
 *
 * Copyright © 2022 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "qemu/host-utils.h"
#include "qemu/module.h"
#include "qemu/lockable.h"
#include "qemu/main-loop.h"
#include "qapi/error.h"
#include "qom/object.h"
#include "exec/target_page.h"
#include "exec/address-spaces.h"
#include "migration/vmstate.h"

#include "hw/sysbus.h"
#include "hw/xen/xen.h"
#include "hw/xen/xen_backend_ops.h"
#include "xen_overlay.h"
#include "xen_gnttab.h"
#include "xen_primary_console.h"

#include "sysemu/kvm.h"
#include "sysemu/kvm_xen.h"

#include "hw/xen/interface/memory.h"
#include "hw/xen/interface/grant_table.h"

#define TYPE_XEN_GNTTAB "xen-gnttab"
OBJECT_DECLARE_SIMPLE_TYPE(XenGnttabState, XEN_GNTTAB)

#define ENTRIES_PER_FRAME_V1 (XEN_PAGE_SIZE / sizeof(grant_entry_v1_t))

static struct gnttab_backend_ops emu_gnttab_backend_ops;

struct XenGnttabState {
    /*< private >*/
    SysBusDevice busdev;
    /*< public >*/

    QemuMutex gnt_lock;

    uint32_t nr_frames;
    uint32_t max_frames;

    union {
        grant_entry_v1_t *v1;
        /* Theoretically, v2 support could be added here. */
    } entries;

    MemoryRegion gnt_frames;
    MemoryRegion *gnt_aliases;
    uint64_t *gnt_frame_gpas;

    uint8_t *map_track;
};

struct XenGnttabState *xen_gnttab_singleton;

static void xen_gnttab_realize(DeviceState *dev, Error **errp)
{
    XenGnttabState *s = XEN_GNTTAB(dev);
    int i;

    if (xen_mode != XEN_EMULATE) {
        error_setg(errp, "Xen grant table support is for Xen emulation");
        return;
    }
    s->max_frames = kvm_xen_get_gnttab_max_frames();
    memory_region_init_ram(&s->gnt_frames, OBJECT(dev), "xen:grant_table",
                           XEN_PAGE_SIZE * s->max_frames, &error_abort);
    memory_region_set_enabled(&s->gnt_frames, true);
    s->entries.v1 = memory_region_get_ram_ptr(&s->gnt_frames);

    /* Create individual page-sized aliases for overlays */
    s->gnt_aliases = (void *)g_new0(MemoryRegion, s->max_frames);
    s->gnt_frame_gpas = (void *)g_new(uint64_t, s->max_frames);
    for (i = 0; i < s->max_frames; i++) {
        memory_region_init_alias(&s->gnt_aliases[i], OBJECT(dev),
                                 NULL, &s->gnt_frames,
                                 i * XEN_PAGE_SIZE, XEN_PAGE_SIZE);
        s->gnt_frame_gpas[i] = INVALID_GPA;
    }

    s->nr_frames = 0;
    memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames);
    s->entries.v1[GNTTAB_RESERVED_XENSTORE].flags = GTF_permit_access;
    s->entries.v1[GNTTAB_RESERVED_XENSTORE].frame = XEN_SPECIAL_PFN(XENSTORE);

    qemu_mutex_init(&s->gnt_lock);

    xen_gnttab_singleton = s;

    s->map_track = g_new0(uint8_t, s->max_frames * ENTRIES_PER_FRAME_V1);

    xen_gnttab_ops = &emu_gnttab_backend_ops;
}

static int xen_gnttab_post_load(void *opaque, int version_id)
{
    XenGnttabState *s = XEN_GNTTAB(opaque);
    uint32_t i;

    for (i = 0; i < s->nr_frames; i++) {
        if (s->gnt_frame_gpas[i] != INVALID_GPA) {
            xen_overlay_do_map_page(&s->gnt_aliases[i], s->gnt_frame_gpas[i]);
        }
    }
    return 0;
}

static bool xen_gnttab_is_needed(void *opaque)
{
    return xen_mode == XEN_EMULATE;
}
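
/*
 * Note (editorial, from reading the code below): only nr_frames and the
 * per-frame GPAs are carried in the vmstate. The frame contents live in
 * the "xen:grant_table" RAM region created in realize() and so travel
 * with RAM migration; post_load() then re-establishes the overlay
 * mappings at the recorded guest physical addresses.
 */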
static const VMStateDescription xen_gnttab_vmstate = {
    .name = "xen_gnttab",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = xen_gnttab_is_needed,
    .post_load = xen_gnttab_post_load,
    .fields = (const VMStateField[]) {
        VMSTATE_UINT32(nr_frames, XenGnttabState),
        VMSTATE_VARRAY_UINT32(gnt_frame_gpas, XenGnttabState, nr_frames, 0,
                              vmstate_info_uint64, uint64_t),
        VMSTATE_END_OF_LIST()
    }
};

static void xen_gnttab_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);

    dc->realize = xen_gnttab_realize;
    dc->vmsd = &xen_gnttab_vmstate;
}

static const TypeInfo xen_gnttab_info = {
    .name          = TYPE_XEN_GNTTAB,
    .parent        = TYPE_SYS_BUS_DEVICE,
    .instance_size = sizeof(XenGnttabState),
    .class_init    = xen_gnttab_class_init,
};

void xen_gnttab_create(void)
{
    xen_gnttab_singleton = XEN_GNTTAB(sysbus_create_simple(TYPE_XEN_GNTTAB,
                                                           -1, NULL));
}

static void xen_gnttab_register_types(void)
{
    type_register_static(&xen_gnttab_info);
}

type_init(xen_gnttab_register_types)

int xen_gnttab_map_page(uint64_t idx, uint64_t gfn)
{
    XenGnttabState *s = xen_gnttab_singleton;
    uint64_t gpa = gfn << XEN_PAGE_SHIFT;

    if (!s) {
        return -ENOTSUP;
    }

    if (idx >= s->max_frames) {
        return -EINVAL;
    }

    QEMU_IOTHREAD_LOCK_GUARD();
    QEMU_LOCK_GUARD(&s->gnt_lock);

    xen_overlay_do_map_page(&s->gnt_aliases[idx], gpa);

    s->gnt_frame_gpas[idx] = gpa;

    if (s->nr_frames <= idx) {
        s->nr_frames = idx + 1;
    }

    return 0;
}

int xen_gnttab_set_version_op(struct gnttab_set_version *set)
{
    int ret;

    switch (set->version) {
    case 1:
        ret = 0;
        break;

    case 2:
        /* Behave as before set_version was introduced. */
        ret = -ENOSYS;
        break;

    default:
        ret = -EINVAL;
    }

    set->version = 1;
    return ret;
}

int xen_gnttab_get_version_op(struct gnttab_get_version *get)
{
    if (get->dom != DOMID_SELF && get->dom != xen_domid) {
        return -ESRCH;
    }

    get->version = 1;
    return 0;
}

int xen_gnttab_query_size_op(struct gnttab_query_size *size)
{
    XenGnttabState *s = xen_gnttab_singleton;

    if (!s) {
        return -ENOTSUP;
    }

    if (size->dom != DOMID_SELF && size->dom != xen_domid) {
        size->status = GNTST_bad_domain;
        return 0;
    }

    size->status = GNTST_okay;
    size->nr_frames = s->nr_frames;
    size->max_nr_frames = s->max_frames;
    return 0;
}
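
/*
 * Note (editorial): the functions below implement an emulated gntdev-style
 * interface. They are installed as xen_gnttab_ops in realize(), so QEMU's
 * in-process PV backends resolve grant references of the local guest
 * through the emulated v1 table above, one XEN_PAGE_SIZE page at a time.
 */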
/* Track per-open refs, to allow close() to clean up. */
struct active_ref {
    MemoryRegionSection mrs;
    void *virtaddr;
    uint32_t refcnt;
    int prot;
};

static void gnt_unref(XenGnttabState *s, grant_ref_t ref,
                      MemoryRegionSection *mrs, int prot)
{
    if (mrs && mrs->mr) {
        if (prot & PROT_WRITE) {
            memory_region_set_dirty(mrs->mr, mrs->offset_within_region,
                                    XEN_PAGE_SIZE);
        }
        memory_region_unref(mrs->mr);
        mrs->mr = NULL;
    }
    assert(s->map_track[ref] != 0);

    if (--s->map_track[ref] == 0) {
        grant_entry_v1_t *gnt_p = &s->entries.v1[ref];
        qatomic_and(&gnt_p->flags, (uint16_t)~(GTF_reading | GTF_writing));
    }
}

static uint64_t gnt_ref(XenGnttabState *s, grant_ref_t ref, int prot)
{
    uint16_t mask = GTF_type_mask | GTF_sub_page;
    grant_entry_v1_t gnt, *gnt_p;
    int retries = 0;

    if (ref >= s->max_frames * ENTRIES_PER_FRAME_V1 ||
        s->map_track[ref] == UINT8_MAX) {
        return INVALID_GPA;
    }

    if (prot & PROT_WRITE) {
        mask |= GTF_readonly;
    }

    gnt_p = &s->entries.v1[ref];

    /*
     * The guest can legitimately be changing the GTF_readonly flag. Allow
     * that, but don't let a malicious guest cause a livelock.
     */
    for (retries = 0; retries < 5; retries++) {
        uint16_t new_flags;

        /* Read the entry before an atomic operation on its flags */
        gnt = *(volatile grant_entry_v1_t *)gnt_p;

        if ((gnt.flags & mask) != GTF_permit_access ||
            gnt.domid != DOMID_QEMU) {
            return INVALID_GPA;
        }

        new_flags = gnt.flags | GTF_reading;
        if (prot & PROT_WRITE) {
            new_flags |= GTF_writing;
        }

        if (qatomic_cmpxchg(&gnt_p->flags, gnt.flags, new_flags) == gnt.flags) {
            return (uint64_t)gnt.frame << XEN_PAGE_SHIFT;
        }
    }

    return INVALID_GPA;
}

struct xengntdev_handle {
    GHashTable *active_maps;
};

static int xen_be_gnttab_set_max_grants(struct xengntdev_handle *xgt,
                                        uint32_t nr_grants)
{
    return 0;
}

static void *xen_be_gnttab_map_refs(struct xengntdev_handle *xgt,
                                    uint32_t count, uint32_t domid,
                                    uint32_t *refs, int prot)
{
    XenGnttabState *s = xen_gnttab_singleton;
    struct active_ref *act;

    if (!s) {
        errno = ENOTSUP;
        return NULL;
    }

    if (domid != xen_domid) {
        errno = EINVAL;
        return NULL;
    }

    if (!count || count > 4096) {
        errno = EINVAL;
        return NULL;
    }

    /*
     * Making a contiguous mapping from potentially discontiguous grant
     * references would be... distinctly non-trivial. We don't support it.
     * Even changing the API to return an array of pointers, one per page,
     * wouldn't be simple to use in PV backends because some structures
     * actually cross page boundaries (e.g. 32-bit blkif_response ring
     * entries are 12 bytes).
     */
355 */ 356 if (count != 1) { 357 errno = EINVAL; 358 return NULL; 359 } 360 361 QEMU_LOCK_GUARD(&s->gnt_lock); 362 363 act = g_hash_table_lookup(xgt->active_maps, GINT_TO_POINTER(refs[0])); 364 if (act) { 365 if ((prot & PROT_WRITE) && !(act->prot & PROT_WRITE)) { 366 if (gnt_ref(s, refs[0], prot) == INVALID_GPA) { 367 return NULL; 368 } 369 act->prot |= PROT_WRITE; 370 } 371 act->refcnt++; 372 } else { 373 uint64_t gpa = gnt_ref(s, refs[0], prot); 374 if (gpa == INVALID_GPA) { 375 errno = EINVAL; 376 return NULL; 377 } 378 379 act = g_new0(struct active_ref, 1); 380 act->prot = prot; 381 act->refcnt = 1; 382 act->mrs = memory_region_find(get_system_memory(), gpa, XEN_PAGE_SIZE); 383 384 if (act->mrs.mr && 385 !int128_lt(act->mrs.size, int128_make64(XEN_PAGE_SIZE)) && 386 memory_region_get_ram_addr(act->mrs.mr) != RAM_ADDR_INVALID) { 387 act->virtaddr = qemu_map_ram_ptr(act->mrs.mr->ram_block, 388 act->mrs.offset_within_region); 389 } 390 if (!act->virtaddr) { 391 gnt_unref(s, refs[0], &act->mrs, 0); 392 g_free(act); 393 errno = EINVAL; 394 return NULL; 395 } 396 397 s->map_track[refs[0]]++; 398 g_hash_table_insert(xgt->active_maps, GINT_TO_POINTER(refs[0]), act); 399 } 400 401 return act->virtaddr; 402 } 403 404 static gboolean do_unmap(gpointer key, gpointer value, gpointer user_data) 405 { 406 XenGnttabState *s = user_data; 407 grant_ref_t gref = GPOINTER_TO_INT(key); 408 struct active_ref *act = value; 409 410 gnt_unref(s, gref, &act->mrs, act->prot); 411 g_free(act); 412 return true; 413 } 414 415 static int xen_be_gnttab_unmap(struct xengntdev_handle *xgt, 416 void *start_address, uint32_t *refs, 417 uint32_t count) 418 { 419 XenGnttabState *s = xen_gnttab_singleton; 420 struct active_ref *act; 421 422 if (!s) { 423 return -ENOTSUP; 424 } 425 426 if (count != 1) { 427 return -EINVAL; 428 } 429 430 QEMU_LOCK_GUARD(&s->gnt_lock); 431 432 act = g_hash_table_lookup(xgt->active_maps, GINT_TO_POINTER(refs[0])); 433 if (!act) { 434 return -ENOENT; 435 } 436 437 if (act->virtaddr != start_address) { 438 return -EINVAL; 439 } 440 441 if (!--act->refcnt) { 442 do_unmap(GINT_TO_POINTER(refs[0]), act, s); 443 g_hash_table_remove(xgt->active_maps, GINT_TO_POINTER(refs[0])); 444 } 445 446 return 0; 447 } 448 449 /* 450 * This looks a bit like the one for true Xen in xen-operations.c but 451 * in emulation we don't support multi-page mappings. And under Xen we 452 * *want* the multi-page mappings so we have fewer bounces through the 453 * kernel and the hypervisor. So the code paths end up being similar, 454 * but different. 455 */ 456 static int xen_be_gnttab_copy(struct xengntdev_handle *xgt, bool to_domain, 457 uint32_t domid, XenGrantCopySegment *segs, 458 uint32_t nr_segs, Error **errp) 459 { 460 int prot = to_domain ? PROT_WRITE : PROT_READ; 461 unsigned int i; 462 463 for (i = 0; i < nr_segs; i++) { 464 XenGrantCopySegment *seg = &segs[i]; 465 void *page; 466 uint32_t ref = to_domain ? 
            seg->source.foreign.ref;

        page = xen_be_gnttab_map_refs(xgt, 1, domid, &ref, prot);
        if (!page) {
            if (errp) {
                error_setg_errno(errp, errno,
                                 "xen_be_gnttab_map_refs failed");
            }
            return -errno;
        }

        if (to_domain) {
            memcpy(page + seg->dest.foreign.offset, seg->source.virt,
                   seg->len);
        } else {
            memcpy(seg->dest.virt, page + seg->source.foreign.offset,
                   seg->len);
        }

        if (xen_be_gnttab_unmap(xgt, page, &ref, 1)) {
            if (errp) {
                error_setg_errno(errp, errno, "xen_be_gnttab_unmap failed");
            }
            return -errno;
        }
    }

    return 0;
}

static struct xengntdev_handle *xen_be_gnttab_open(void)
{
    struct xengntdev_handle *xgt = g_new0(struct xengntdev_handle, 1);

    xgt->active_maps = g_hash_table_new(g_direct_hash, g_direct_equal);
    return xgt;
}

static int xen_be_gnttab_close(struct xengntdev_handle *xgt)
{
    XenGnttabState *s = xen_gnttab_singleton;

    if (!s) {
        return -ENOTSUP;
    }

    g_hash_table_foreach_remove(xgt->active_maps, do_unmap, s);
    g_hash_table_destroy(xgt->active_maps);
    g_free(xgt);
    return 0;
}

static struct gnttab_backend_ops emu_gnttab_backend_ops = {
    .open = xen_be_gnttab_open,
    .close = xen_be_gnttab_close,
    .grant_copy = xen_be_gnttab_copy,
    .set_max_grants = xen_be_gnttab_set_max_grants,
    .map_refs = xen_be_gnttab_map_refs,
    .unmap = xen_be_gnttab_unmap,
};

int xen_gnttab_reset(void)
{
    XenGnttabState *s = xen_gnttab_singleton;

    if (!s) {
        return -ENOTSUP;
    }

    QEMU_LOCK_GUARD(&s->gnt_lock);

    s->nr_frames = 0;

    memset(s->entries.v1, 0, XEN_PAGE_SIZE * s->max_frames);
    s->entries.v1[GNTTAB_RESERVED_XENSTORE].flags = GTF_permit_access;
    s->entries.v1[GNTTAB_RESERVED_XENSTORE].frame = XEN_SPECIAL_PFN(XENSTORE);

    if (xen_primary_console_get_pfn()) {
        s->entries.v1[GNTTAB_RESERVED_CONSOLE].flags = GTF_permit_access;
        s->entries.v1[GNTTAB_RESERVED_CONSOLE].frame = XEN_SPECIAL_PFN(CONSOLE);
    }

    return 0;
}