/*
 * Copyright (C) 2011 Citrix Ltd.
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qemu/units.h"
#include "qemu/error-report.h"

#include <sys/resource.h>

#include "hw/xen/xen_native.h"
#include "qemu/bitmap.h"

#include "sysemu/runstate.h"
#include "sysemu/xen-mapcache.h"
#include "trace.h"


#if HOST_LONG_BITS == 32
# define MCACHE_BUCKET_SHIFT 16
# define MCACHE_MAX_SIZE     (1UL<<31) /* 2GB Cap */
#else
# define MCACHE_BUCKET_SHIFT 20
# define MCACHE_MAX_SIZE     (1UL<<35) /* 32GB Cap */
#endif
#define MCACHE_BUCKET_SIZE (1UL << MCACHE_BUCKET_SHIFT)

/*
 * This is the size of the virtual address space reserved for QEMU that will
 * not be used by the map cache.
 * Empirical tests have shown that QEMU uses about 75MB more than
 * max_mcache_size.
 */
#define NON_MCACHE_MEMORY_SIZE (80 * MiB)
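
/*
 * The map cache keeps chunks of guest memory mapped into QEMU's address
 * space in MCACHE_BUCKET_SIZE-sized buckets.  MapCache.entry is a hash
 * table indexed by (paddr_index % nr_buckets); colliding buckets are
 * chained through MapCacheEntry.next.  Each entry records the mapped
 * virtual address, a bitmap of the XC_PAGE_SIZE pages that were actually
 * mapped (valid_mapping) and a lock count.  Locked mappings are also
 * tracked in MapCache.locked_entries (as MapCacheRev records) so that a
 * pointer handed out by xen_map_cache() can later be translated back to a
 * guest address or explicitly invalidated.
 */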
typedef struct MapCacheEntry {
    hwaddr paddr_index;
    uint8_t *vaddr_base;
    unsigned long *valid_mapping;
    uint32_t lock;
#define XEN_MAPCACHE_ENTRY_DUMMY (1 << 0)
    uint8_t flags;
    hwaddr size;
    struct MapCacheEntry *next;
} MapCacheEntry;

typedef struct MapCacheRev {
    uint8_t *vaddr_req;
    hwaddr paddr_index;
    hwaddr size;
    QTAILQ_ENTRY(MapCacheRev) next;
    bool dma;
} MapCacheRev;

typedef struct MapCache {
    MapCacheEntry *entry;
    unsigned long nr_buckets;
    QTAILQ_HEAD(, MapCacheRev) locked_entries;

    /* For most cases (>99.9%), the page address is the same. */
    MapCacheEntry *last_entry;
    unsigned long max_mcache_size;
    unsigned int mcache_bucket_shift;

    phys_offset_to_gaddr_t phys_offset_to_gaddr;
    QemuMutex lock;
    void *opaque;
} MapCache;

static MapCache *mapcache;

static inline void mapcache_lock(MapCache *mc)
{
    qemu_mutex_lock(&mc->lock);
}

static inline void mapcache_unlock(MapCache *mc)
{
    qemu_mutex_unlock(&mc->lock);
}

static inline int test_bits(int nr, int size, const unsigned long *addr)
{
    unsigned long res = find_next_zero_bit(addr, size + nr, nr);
    if (res >= nr + size) {
        return 1;
    } else {
        return 0;
    }
}

static MapCache *xen_map_cache_init_single(phys_offset_to_gaddr_t f,
                                           void *opaque,
                                           unsigned long max_size)
{
    unsigned long size;
    MapCache *mc;

    mc = g_new0(MapCache, 1);

    mc->phys_offset_to_gaddr = f;
    mc->opaque = opaque;
    qemu_mutex_init(&mc->lock);

    QTAILQ_INIT(&mc->locked_entries);

    mc->max_mcache_size = max_size;

    mc->nr_buckets =
        (((mc->max_mcache_size >> XC_PAGE_SHIFT) +
          (1UL << (MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT)) - 1) >>
         (MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT));

    size = mc->nr_buckets * sizeof(MapCacheEntry);
    size = (size + XC_PAGE_SIZE - 1) & ~(XC_PAGE_SIZE - 1);
    trace_xen_map_cache_init(mc->nr_buckets, size);
    mc->entry = g_malloc0(size);
    return mc;
}

void xen_map_cache_init(phys_offset_to_gaddr_t f, void *opaque)
{
    struct rlimit rlimit_as;
    unsigned long max_mcache_size;

    if (geteuid() == 0) {
        rlimit_as.rlim_cur = RLIM_INFINITY;
        rlimit_as.rlim_max = RLIM_INFINITY;
        max_mcache_size = MCACHE_MAX_SIZE;
    } else {
        getrlimit(RLIMIT_AS, &rlimit_as);
        rlimit_as.rlim_cur = rlimit_as.rlim_max;

        if (rlimit_as.rlim_max != RLIM_INFINITY) {
            warn_report("QEMU's maximum size of virtual"
                        " memory is not infinity");
        }
        if (rlimit_as.rlim_max < MCACHE_MAX_SIZE + NON_MCACHE_MEMORY_SIZE) {
            max_mcache_size = rlimit_as.rlim_max - NON_MCACHE_MEMORY_SIZE;
        } else {
            max_mcache_size = MCACHE_MAX_SIZE;
        }
    }

    mapcache = xen_map_cache_init_single(f, opaque, max_mcache_size);
    setrlimit(RLIMIT_AS, &rlimit_as);
}
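
/*
 * (Re)populate @entry: map the guest frames that make up the bucket at
 * @address_index into QEMU's address space with xenforeignmemory_map2(),
 * or, when @dummy is set, back the bucket with an anonymous mapping
 * because a foreign mapping cannot be created yet.  Any previous mapping
 * held by the entry is released first, and per-page mapping failures are
 * recorded in entry->valid_mapping.  If @vaddr is non-NULL the new mapping
 * is placed at that address with MAP_FIXED.
 */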
static void xen_remap_bucket(MapCache *mc,
                             MapCacheEntry *entry,
                             void *vaddr,
                             hwaddr size,
                             hwaddr address_index,
                             bool dummy)
{
    uint8_t *vaddr_base;
    xen_pfn_t *pfns;
    int *err;
    unsigned int i;
    hwaddr nb_pfn = size >> XC_PAGE_SHIFT;

    trace_xen_remap_bucket(address_index);

    pfns = g_new0(xen_pfn_t, nb_pfn);
    err = g_new0(int, nb_pfn);

    if (entry->vaddr_base != NULL) {
        if (!(entry->flags & XEN_MAPCACHE_ENTRY_DUMMY)) {
            ram_block_notify_remove(entry->vaddr_base, entry->size,
                                    entry->size);
        }

        /*
         * If an entry is being replaced by another mapping and we're using
         * the MAP_FIXED flag for it - there is a possibility of a race for
         * the vaddr address with another thread doing an mmap call itself
         * (see man 2 mmap). To avoid that we skip explicit unmapping here
         * and allow the kernel to destroy the previous mappings by replacing
         * them in the later mmap call.
         *
         * Non-identical replacements are therefore not allowed.
         */
        assert(!vaddr || (entry->vaddr_base == vaddr && entry->size == size));

        if (!vaddr && munmap(entry->vaddr_base, entry->size) != 0) {
            perror("unmap fails");
            exit(-1);
        }
    }
    g_free(entry->valid_mapping);
    entry->valid_mapping = NULL;

    for (i = 0; i < nb_pfn; i++) {
        pfns[i] = (address_index << (MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT)) + i;
    }

    /*
     * If the caller has requested the mapping at a specific address use
     * MAP_FIXED to make sure it's honored.
     */
    if (!dummy) {
        vaddr_base = xenforeignmemory_map2(xen_fmem, xen_domid, vaddr,
                                           PROT_READ | PROT_WRITE,
                                           vaddr ? MAP_FIXED : 0,
                                           nb_pfn, pfns, err);
        if (vaddr_base == NULL) {
            perror("xenforeignmemory_map2");
            exit(-1);
        }
    } else {
        /*
         * We create dummy mappings where we are unable to create a foreign
         * mapping immediately due to certain circumstances (e.g. right
         * after resume).
         */
        vaddr_base = mmap(vaddr, size, PROT_READ | PROT_WRITE,
                          MAP_ANON | MAP_SHARED | (vaddr ? MAP_FIXED : 0),
                          -1, 0);
        if (vaddr_base == MAP_FAILED) {
            perror("mmap");
            exit(-1);
        }
    }

    if (!(entry->flags & XEN_MAPCACHE_ENTRY_DUMMY)) {
        ram_block_notify_add(vaddr_base, size, size);
    }

    entry->vaddr_base = vaddr_base;
    entry->paddr_index = address_index;
    entry->size = size;
    entry->valid_mapping = g_new0(unsigned long,
                                  BITS_TO_LONGS(size >> XC_PAGE_SHIFT));

    if (dummy) {
        entry->flags |= XEN_MAPCACHE_ENTRY_DUMMY;
    } else {
        entry->flags &= ~(XEN_MAPCACHE_ENTRY_DUMMY);
    }

    bitmap_zero(entry->valid_mapping, nb_pfn);
    for (i = 0; i < nb_pfn; i++) {
        if (!err[i]) {
            bitmap_set(entry->valid_mapping, i, 1);
        }
    }

    g_free(pfns);
    g_free(err);
}
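
/*
 * Look up (and, if necessary, create) the mapping for @phys_addr.  The
 * bucket chain is searched for an entry that covers the requested range
 * with all of its pages valid; an unlocked entry that does not match is
 * remapped in place, otherwise a new chain element is allocated.  If the
 * mapping is still not valid, the lookup is retried once after translating
 * the address via phys_offset_to_gaddr() (Xen compat physmap) and, while
 * an incoming migration is in progress, with a dummy mapping.  When @lock
 * is set, the entry's lock count is bumped and a MapCacheRev is queued on
 * locked_entries so the returned pointer can later be passed to
 * xen_ram_addr_from_mapcache() or xen_invalidate_map_cache_entry().
 *
 * An illustrative locked-mapping sequence for a caller of the public
 * wrapper (not taken from this file) would be:
 *
 *     uint8_t *p = xen_map_cache(mr, addr, len, 1, true, is_write);
 *     ... access len bytes at p ...
 *     xen_invalidate_map_cache_entry(p);
 */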
static uint8_t *xen_map_cache_unlocked(MapCache *mc,
                                       hwaddr phys_addr, hwaddr size,
                                       uint8_t lock, bool dma, bool is_write)
{
    MapCacheEntry *entry, *pentry = NULL,
                  *free_entry = NULL, *free_pentry = NULL;
    hwaddr address_index;
    hwaddr address_offset;
    hwaddr cache_size = size;
    hwaddr test_bit_size;
    bool translated G_GNUC_UNUSED = false;
    bool dummy = false;

tryagain:
    address_index = phys_addr >> MCACHE_BUCKET_SHIFT;
    address_offset = phys_addr & (MCACHE_BUCKET_SIZE - 1);

    trace_xen_map_cache(phys_addr);

    /* test_bit_size is always a multiple of XC_PAGE_SIZE */
    if (size) {
        test_bit_size = size + (phys_addr & (XC_PAGE_SIZE - 1));

        if (test_bit_size % XC_PAGE_SIZE) {
            test_bit_size += XC_PAGE_SIZE - (test_bit_size % XC_PAGE_SIZE);
        }
    } else {
        test_bit_size = XC_PAGE_SIZE;
    }

    if (mc->last_entry != NULL &&
        mc->last_entry->paddr_index == address_index &&
        !lock && !size &&
        test_bits(address_offset >> XC_PAGE_SHIFT,
                  test_bit_size >> XC_PAGE_SHIFT,
                  mc->last_entry->valid_mapping)) {
        trace_xen_map_cache_return(
            mc->last_entry->vaddr_base + address_offset
        );
        return mc->last_entry->vaddr_base + address_offset;
    }

    /* size is always a multiple of MCACHE_BUCKET_SIZE */
    if (size) {
        cache_size = size + address_offset;
        if (cache_size % MCACHE_BUCKET_SIZE) {
            cache_size += MCACHE_BUCKET_SIZE - (cache_size % MCACHE_BUCKET_SIZE);
        }
    } else {
        cache_size = MCACHE_BUCKET_SIZE;
    }

    entry = &mc->entry[address_index % mc->nr_buckets];

    while (entry && (lock || entry->lock) && entry->vaddr_base &&
            (entry->paddr_index != address_index ||
             entry->size != cache_size ||
             !test_bits(address_offset >> XC_PAGE_SHIFT,
                        test_bit_size >> XC_PAGE_SHIFT,
                        entry->valid_mapping))) {
        if (!free_entry && !entry->lock) {
            free_entry = entry;
            free_pentry = pentry;
        }
        pentry = entry;
        entry = entry->next;
    }
    if (!entry && free_entry) {
        entry = free_entry;
        pentry = free_pentry;
    }
    if (!entry) {
        entry = g_new0(MapCacheEntry, 1);
        pentry->next = entry;
        xen_remap_bucket(mc, entry, NULL, cache_size, address_index, dummy);
    } else if (!entry->lock) {
        if (!entry->vaddr_base || entry->paddr_index != address_index ||
            entry->size != cache_size ||
            !test_bits(address_offset >> XC_PAGE_SHIFT,
                       test_bit_size >> XC_PAGE_SHIFT,
                       entry->valid_mapping)) {
            xen_remap_bucket(mc, entry, NULL, cache_size, address_index, dummy);
        }
    }

    if (!test_bits(address_offset >> XC_PAGE_SHIFT,
                   test_bit_size >> XC_PAGE_SHIFT,
                   entry->valid_mapping)) {
        mc->last_entry = NULL;
#ifdef XEN_COMPAT_PHYSMAP
        if (!translated && mc->phys_offset_to_gaddr) {
            phys_addr = mc->phys_offset_to_gaddr(phys_addr, size);
            translated = true;
            goto tryagain;
        }
#endif
        if (!dummy && runstate_check(RUN_STATE_INMIGRATE)) {
            dummy = true;
            goto tryagain;
        }
        trace_xen_map_cache_return(NULL);
        return NULL;
    }

    mc->last_entry = entry;
    if (lock) {
        MapCacheRev *reventry = g_new0(MapCacheRev, 1);
        entry->lock++;
        if (entry->lock == 0) {
            error_report("mapcache entry lock overflow: "HWADDR_FMT_plx" -> %p",
                         entry->paddr_index, entry->vaddr_base);
            abort();
        }
        reventry->dma = dma;
        reventry->vaddr_req = mc->last_entry->vaddr_base + address_offset;
        reventry->paddr_index = mc->last_entry->paddr_index;
        reventry->size = entry->size;
        QTAILQ_INSERT_HEAD(&mc->locked_entries, reventry, next);
    }

    trace_xen_map_cache_return(
        mc->last_entry->vaddr_base + address_offset
    );
    return mc->last_entry->vaddr_base + address_offset;
}

uint8_t *xen_map_cache(MemoryRegion *mr,
                       hwaddr phys_addr, hwaddr size,
                       uint8_t lock, bool dma,
                       bool is_write)
{
    uint8_t *p;

    mapcache_lock(mapcache);
    p = xen_map_cache_unlocked(mapcache, phys_addr, size, lock, dma, is_write);
    mapcache_unlock(mapcache);
    return p;
}
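
/*
 * Translate a pointer previously returned by xen_map_cache() back into a
 * guest RAM address.  Only locked mappings can be translated: the pointer
 * is looked up in the locked_entries reverse list to recover the bucket
 * index and size, the matching cache entry provides vaddr_base, and the
 * offset within the bucket is added to the bucket's guest address.
 * Returns RAM_ADDR_INVALID if the pointer is not known to the cache.
 */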
static ram_addr_t xen_ram_addr_from_mapcache_single(MapCache *mc, void *ptr)
{
    MapCacheEntry *entry = NULL;
    MapCacheRev *reventry;
    hwaddr paddr_index;
    hwaddr size;
    ram_addr_t raddr;
    int found = 0;

    mapcache_lock(mc);
    QTAILQ_FOREACH(reventry, &mc->locked_entries, next) {
        if (reventry->vaddr_req == ptr) {
            paddr_index = reventry->paddr_index;
            size = reventry->size;
            found = 1;
            break;
        }
    }
    if (!found) {
        trace_xen_ram_addr_from_mapcache_not_found(ptr);
        mapcache_unlock(mc);
        return RAM_ADDR_INVALID;
    }

    entry = &mc->entry[paddr_index % mc->nr_buckets];
    while (entry && (entry->paddr_index != paddr_index ||
                     entry->size != size)) {
        entry = entry->next;
    }
    if (!entry) {
        trace_xen_ram_addr_from_mapcache_not_in_cache(ptr);
        raddr = RAM_ADDR_INVALID;
    } else {
        raddr = (reventry->paddr_index << MCACHE_BUCKET_SHIFT) +
             ((unsigned long) ptr - (unsigned long) entry->vaddr_base);
    }
    mapcache_unlock(mc);
    return raddr;
}

ram_addr_t xen_ram_addr_from_mapcache(void *ptr)
{
    return xen_ram_addr_from_mapcache_single(mapcache, ptr);
}

static void xen_invalidate_map_cache_entry_unlocked(MapCache *mc,
                                                    uint8_t *buffer)
{
    MapCacheEntry *entry = NULL, *pentry = NULL;
    MapCacheRev *reventry;
    hwaddr paddr_index;
    hwaddr size;
    int found = 0;

    QTAILQ_FOREACH(reventry, &mc->locked_entries, next) {
        if (reventry->vaddr_req == buffer) {
            paddr_index = reventry->paddr_index;
            size = reventry->size;
            found = 1;
            break;
        }
    }
    if (!found) {
        trace_xen_invalidate_map_cache_entry_unlocked_not_found(buffer);
        QTAILQ_FOREACH(reventry, &mc->locked_entries, next) {
            trace_xen_invalidate_map_cache_entry_unlocked_found(
                reventry->paddr_index,
                reventry->vaddr_req
            );
        }
        return;
    }
    QTAILQ_REMOVE(&mc->locked_entries, reventry, next);
    g_free(reventry);

    if (mc->last_entry != NULL &&
        mc->last_entry->paddr_index == paddr_index) {
        mc->last_entry = NULL;
    }

    entry = &mc->entry[paddr_index % mc->nr_buckets];
    while (entry && (entry->paddr_index != paddr_index ||
                     entry->size != size)) {
        pentry = entry;
        entry = entry->next;
    }
    if (!entry) {
        trace_xen_invalidate_map_cache_entry_unlocked_miss(buffer);
        return;
    }
    entry->lock--;
    if (entry->lock > 0 || pentry == NULL) {
        return;
    }

    pentry->next = entry->next;
    ram_block_notify_remove(entry->vaddr_base, entry->size, entry->size);
    if (munmap(entry->vaddr_base, entry->size) != 0) {
        perror("unmap fails");
        exit(-1);
    }
    g_free(entry->valid_mapping);
    g_free(entry);
}
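
/*
 * xen_invalidate_map_cache_entry() drops the lock taken by xen_map_cache()
 * on the entry backing @buffer; chained entries whose lock count drops to
 * zero are unmapped and freed.  When it is called from coroutine context
 * the work is deferred to a bottom half on the caller's AioContext and the
 * coroutine yields until that bottom half has run; otherwise the entry is
 * invalidated directly under the mapcache lock.
 */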
typedef struct XenMapCacheData {
    Coroutine *co;
    uint8_t *buffer;
} XenMapCacheData;

static void xen_invalidate_map_cache_entry_bh(void *opaque)
{
    XenMapCacheData *data = opaque;

    mapcache_lock(mapcache);
    xen_invalidate_map_cache_entry_unlocked(mapcache, data->buffer);
    mapcache_unlock(mapcache);

    aio_co_wake(data->co);
}

void coroutine_mixed_fn xen_invalidate_map_cache_entry(uint8_t *buffer)
{
    if (qemu_in_coroutine()) {
        XenMapCacheData data = {
            .co = qemu_coroutine_self(),
            .buffer = buffer,
        };
        aio_bh_schedule_oneshot(qemu_get_current_aio_context(),
                                xen_invalidate_map_cache_entry_bh, &data);
        qemu_coroutine_yield();
    } else {
        mapcache_lock(mapcache);
        xen_invalidate_map_cache_entry_unlocked(mapcache, buffer);
        mapcache_unlock(mapcache);
    }
}

static void xen_invalidate_map_cache_single(MapCache *mc)
{
    unsigned long i;
    MapCacheRev *reventry;

    mapcache_lock(mc);

    QTAILQ_FOREACH(reventry, &mc->locked_entries, next) {
        if (!reventry->dma) {
            continue;
        }
        trace_xen_invalidate_map_cache(reventry->paddr_index,
                                       reventry->vaddr_req);
    }

    for (i = 0; i < mc->nr_buckets; i++) {
        MapCacheEntry *entry = &mc->entry[i];

        if (entry->vaddr_base == NULL) {
            continue;
        }
        if (entry->lock > 0) {
            continue;
        }

        if (munmap(entry->vaddr_base, entry->size) != 0) {
            perror("unmap fails");
            exit(-1);
        }

        entry->paddr_index = 0;
        entry->vaddr_base = NULL;
        entry->size = 0;
        g_free(entry->valid_mapping);
        entry->valid_mapping = NULL;
    }

    mc->last_entry = NULL;

    mapcache_unlock(mc);
}

void xen_invalidate_map_cache(void)
{
    /* Flush pending AIO before destroying the mapcache */
    bdrv_drain_all();

    xen_invalidate_map_cache_single(mapcache);
}

static uint8_t *xen_replace_cache_entry_unlocked(MapCache *mc,
                                                 hwaddr old_phys_addr,
                                                 hwaddr new_phys_addr,
                                                 hwaddr size)
{
    MapCacheEntry *entry;
    hwaddr address_index, address_offset;
    hwaddr test_bit_size, cache_size = size;

    address_index = old_phys_addr >> MCACHE_BUCKET_SHIFT;
    address_offset = old_phys_addr & (MCACHE_BUCKET_SIZE - 1);

    assert(size);
    /* test_bit_size is always a multiple of XC_PAGE_SIZE */
    test_bit_size = size + (old_phys_addr & (XC_PAGE_SIZE - 1));
    if (test_bit_size % XC_PAGE_SIZE) {
        test_bit_size += XC_PAGE_SIZE - (test_bit_size % XC_PAGE_SIZE);
    }
    cache_size = size + address_offset;
    if (cache_size % MCACHE_BUCKET_SIZE) {
        cache_size += MCACHE_BUCKET_SIZE - (cache_size % MCACHE_BUCKET_SIZE);
    }

    entry = &mc->entry[address_index % mc->nr_buckets];
    while (entry && !(entry->paddr_index == address_index &&
                      entry->size == cache_size)) {
        entry = entry->next;
    }
    if (!entry) {
        trace_xen_replace_cache_entry_unlocked(old_phys_addr);
        return NULL;
    }

    address_index = new_phys_addr >> MCACHE_BUCKET_SHIFT;
    address_offset = new_phys_addr & (MCACHE_BUCKET_SIZE - 1);

    trace_xen_replace_cache_entry_dummy(old_phys_addr, new_phys_addr);

    xen_remap_bucket(mc, entry, entry->vaddr_base,
                     cache_size, address_index, false);
    if (!test_bits(address_offset >> XC_PAGE_SHIFT,
                   test_bit_size >> XC_PAGE_SHIFT,
                   entry->valid_mapping)) {
        trace_xen_replace_cache_entry_unlocked_could_not_update_entry(
            old_phys_addr
        );
        return NULL;
    }

    return entry->vaddr_base + address_offset;
}

uint8_t *xen_replace_cache_entry(hwaddr old_phys_addr,
                                 hwaddr new_phys_addr,
                                 hwaddr size)
{
    uint8_t *p;

    mapcache_lock(mapcache);
    p = xen_replace_cache_entry_unlocked(mapcache, old_phys_addr,
                                         new_phys_addr, size);
    mapcache_unlock(mapcache);
    return p;
}