1 /* 2 * Memory Device Interface 3 * 4 * Copyright ProfitBricks GmbH 2012 5 * Copyright (C) 2014 Red Hat Inc 6 * Copyright (c) 2018 Red Hat Inc 7 * 8 * This work is licensed under the terms of the GNU GPL, version 2 or later. 9 * See the COPYING file in the top-level directory. 10 */ 11 12 #include "qemu/osdep.h" 13 #include "qemu/error-report.h" 14 #include "hw/mem/memory-device.h" 15 #include "qapi/error.h" 16 #include "hw/boards.h" 17 #include "qemu/range.h" 18 #include "hw/virtio/vhost.h" 19 #include "sysemu/kvm.h" 20 #include "exec/address-spaces.h" 21 #include "trace.h" 22 23 static bool memory_device_is_empty(const MemoryDeviceState *md) 24 { 25 const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md); 26 Error *local_err = NULL; 27 MemoryRegion *mr; 28 29 /* dropping const here is fine as we don't touch the memory region */ 30 mr = mdc->get_memory_region((MemoryDeviceState *)md, &local_err); 31 if (local_err) { 32 /* Not empty, we'll report errors later when containing the MR again. */ 33 error_free(local_err); 34 return false; 35 } 36 return !mr; 37 } 38 39 static gint memory_device_addr_sort(gconstpointer a, gconstpointer b) 40 { 41 const MemoryDeviceState *md_a = MEMORY_DEVICE(a); 42 const MemoryDeviceState *md_b = MEMORY_DEVICE(b); 43 const MemoryDeviceClass *mdc_a = MEMORY_DEVICE_GET_CLASS(a); 44 const MemoryDeviceClass *mdc_b = MEMORY_DEVICE_GET_CLASS(b); 45 const uint64_t addr_a = mdc_a->get_addr(md_a); 46 const uint64_t addr_b = mdc_b->get_addr(md_b); 47 48 if (addr_a > addr_b) { 49 return 1; 50 } else if (addr_a < addr_b) { 51 return -1; 52 } 53 return 0; 54 } 55 56 static int memory_device_build_list(Object *obj, void *opaque) 57 { 58 GSList **list = opaque; 59 60 if (object_dynamic_cast(obj, TYPE_MEMORY_DEVICE)) { 61 DeviceState *dev = DEVICE(obj); 62 if (dev->realized) { /* only realized memory devices matter */ 63 *list = g_slist_insert_sorted(*list, dev, memory_device_addr_sort); 64 } 65 } 66 67 object_child_foreach(obj, memory_device_build_list, opaque); 68 return 0; 69 } 70 71 static unsigned int memory_device_get_memslots(MemoryDeviceState *md) 72 { 73 const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md); 74 75 if (mdc->get_memslots) { 76 return mdc->get_memslots(md); 77 } 78 return 1; 79 } 80 81 /* 82 * Memslots that are reserved by memory devices (required but still reported 83 * as free from KVM / vhost). 84 */ 85 static unsigned int get_reserved_memslots(MachineState *ms) 86 { 87 if (ms->device_memory->used_memslots > 88 ms->device_memory->required_memslots) { 89 /* This is unexpected, and we warned already in the memory notifier. */ 90 return 0; 91 } 92 return ms->device_memory->required_memslots - 93 ms->device_memory->used_memslots; 94 } 95 96 unsigned int memory_devices_get_reserved_memslots(void) 97 { 98 if (!current_machine->device_memory) { 99 return 0; 100 } 101 return get_reserved_memslots(current_machine); 102 } 103 104 bool memory_devices_memslot_auto_decision_active(void) 105 { 106 if (!current_machine->device_memory) { 107 return false; 108 } 109 110 return current_machine->device_memory->memslot_auto_decision_active; 111 } 112 113 static unsigned int memory_device_memslot_decision_limit(MachineState *ms, 114 MemoryRegion *mr) 115 { 116 const unsigned int reserved = get_reserved_memslots(ms); 117 const uint64_t size = memory_region_size(mr); 118 unsigned int max = vhost_get_max_memslots(); 119 unsigned int free = vhost_get_free_memslots(); 120 uint64_t available_space; 121 unsigned int memslots; 122 123 if (kvm_enabled()) { 124 max = MIN(max, kvm_get_max_memslots()); 125 free = MIN(free, kvm_get_free_memslots()); 126 } 127 128 /* 129 * If we only have less overall memslots than what we consider reasonable, 130 * just keep it to a minimum. 131 */ 132 if (max < MEMORY_DEVICES_SAFE_MAX_MEMSLOTS) { 133 return 1; 134 } 135 136 /* 137 * Consider our soft-limit across all memory devices. We don't really 138 * expect to exceed this limit in reasonable configurations. 139 */ 140 if (MEMORY_DEVICES_SOFT_MEMSLOT_LIMIT <= 141 ms->device_memory->required_memslots) { 142 return 1; 143 } 144 memslots = MEMORY_DEVICES_SOFT_MEMSLOT_LIMIT - 145 ms->device_memory->required_memslots; 146 147 /* 148 * Consider the actually still free memslots. This is only relevant if 149 * other memslot consumers would consume *significantly* more memslots than 150 * what we prepared for (> 253). Unlikely, but let's just handle it 151 * cleanly. 152 */ 153 memslots = MIN(memslots, free - reserved); 154 if (memslots < 1 || unlikely(free < reserved)) { 155 return 1; 156 } 157 158 /* We cannot have any other memory devices? So give all to this device. */ 159 if (size == ms->maxram_size - ms->ram_size) { 160 return memslots; 161 } 162 163 /* 164 * Simple heuristic: equally distribute the memslots over the space 165 * still available for memory devices. 166 */ 167 available_space = ms->maxram_size - ms->ram_size - 168 ms->device_memory->used_region_size; 169 memslots = (double)memslots * size / available_space; 170 return memslots < 1 ? 1 : memslots; 171 } 172 173 static void memory_device_check_addable(MachineState *ms, MemoryDeviceState *md, 174 MemoryRegion *mr, Error **errp) 175 { 176 const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md); 177 const uint64_t used_region_size = ms->device_memory->used_region_size; 178 const uint64_t size = memory_region_size(mr); 179 const unsigned int reserved_memslots = get_reserved_memslots(ms); 180 unsigned int required_memslots, memslot_limit; 181 182 /* 183 * Instruct the device to decide how many memslots to use, if applicable, 184 * before we query the number of required memslots the first time. 185 */ 186 if (mdc->decide_memslots) { 187 memslot_limit = memory_device_memslot_decision_limit(ms, mr); 188 mdc->decide_memslots(md, memslot_limit); 189 } 190 required_memslots = memory_device_get_memslots(md); 191 192 /* we will need memory slots for kvm and vhost */ 193 if (kvm_enabled() && 194 kvm_get_free_memslots() < required_memslots + reserved_memslots) { 195 error_setg(errp, "hypervisor has not enough free memory slots left"); 196 return; 197 } 198 if (vhost_get_free_memslots() < required_memslots + reserved_memslots) { 199 error_setg(errp, "a used vhost backend has not enough free memory slots left"); 200 return; 201 } 202 203 /* will we exceed the total amount of memory specified */ 204 if (used_region_size + size < used_region_size || 205 used_region_size + size > ms->maxram_size - ms->ram_size) { 206 error_setg(errp, "not enough space, currently 0x%" PRIx64 207 " in use of total space for memory devices 0x" RAM_ADDR_FMT, 208 used_region_size, ms->maxram_size - ms->ram_size); 209 return; 210 } 211 212 } 213 214 static uint64_t memory_device_get_free_addr(MachineState *ms, 215 const uint64_t *hint, 216 uint64_t align, uint64_t size, 217 Error **errp) 218 { 219 GSList *list = NULL, *item; 220 Range as, new = range_empty; 221 222 range_init_nofail(&as, ms->device_memory->base, 223 memory_region_size(&ms->device_memory->mr)); 224 225 /* start of address space indicates the maximum alignment we expect */ 226 if (!QEMU_IS_ALIGNED(range_lob(&as), align)) { 227 warn_report("the alignment (0x%" PRIx64 ") exceeds the expected" 228 " maximum alignment, memory will get fragmented and not" 229 " all 'maxmem' might be usable for memory devices.", 230 align); 231 } 232 233 if (hint && !QEMU_IS_ALIGNED(*hint, align)) { 234 error_setg(errp, "address must be aligned to 0x%" PRIx64 " bytes", 235 align); 236 return 0; 237 } 238 239 if (hint) { 240 if (range_init(&new, *hint, size) || !range_contains_range(&as, &new)) { 241 error_setg(errp, "can't add memory device [0x%" PRIx64 ":0x%" PRIx64 242 "], usable range for memory devices [0x%" PRIx64 ":0x%" 243 PRIx64 "]", *hint, size, range_lob(&as), 244 range_size(&as)); 245 return 0; 246 } 247 } else { 248 if (range_init(&new, QEMU_ALIGN_UP(range_lob(&as), align), size)) { 249 error_setg(errp, "can't add memory device, device too big"); 250 return 0; 251 } 252 } 253 254 /* find address range that will fit new memory device */ 255 object_child_foreach(OBJECT(ms), memory_device_build_list, &list); 256 for (item = list; item; item = g_slist_next(item)) { 257 const MemoryDeviceState *md = item->data; 258 const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(OBJECT(md)); 259 uint64_t next_addr; 260 Range tmp; 261 262 if (memory_device_is_empty(md)) { 263 continue; 264 } 265 266 range_init_nofail(&tmp, mdc->get_addr(md), 267 memory_device_get_region_size(md, &error_abort)); 268 269 if (range_overlaps_range(&tmp, &new)) { 270 if (hint) { 271 const DeviceState *d = DEVICE(md); 272 error_setg(errp, "address range conflicts with memory device" 273 " id='%s'", d->id ? d->id : "(unnamed)"); 274 goto out; 275 } 276 277 next_addr = QEMU_ALIGN_UP(range_upb(&tmp) + 1, align); 278 if (!next_addr || range_init(&new, next_addr, range_size(&new))) { 279 range_make_empty(&new); 280 break; 281 } 282 } else if (range_lob(&tmp) > range_upb(&new)) { 283 break; 284 } 285 } 286 287 if (!range_contains_range(&as, &new)) { 288 error_setg(errp, "could not find position in guest address space for " 289 "memory device - memory fragmented due to alignments"); 290 } 291 out: 292 g_slist_free(list); 293 return range_lob(&new); 294 } 295 296 MemoryDeviceInfoList *qmp_memory_device_list(void) 297 { 298 GSList *devices = NULL, *item; 299 MemoryDeviceInfoList *list = NULL, **tail = &list; 300 301 object_child_foreach(qdev_get_machine(), memory_device_build_list, 302 &devices); 303 304 for (item = devices; item; item = g_slist_next(item)) { 305 const MemoryDeviceState *md = MEMORY_DEVICE(item->data); 306 const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(item->data); 307 MemoryDeviceInfo *info = g_new0(MemoryDeviceInfo, 1); 308 309 /* Let's query infotmation even for empty memory devices. */ 310 mdc->fill_device_info(md, info); 311 312 QAPI_LIST_APPEND(tail, info); 313 } 314 315 g_slist_free(devices); 316 317 return list; 318 } 319 320 static int memory_device_plugged_size(Object *obj, void *opaque) 321 { 322 uint64_t *size = opaque; 323 324 if (object_dynamic_cast(obj, TYPE_MEMORY_DEVICE)) { 325 const DeviceState *dev = DEVICE(obj); 326 const MemoryDeviceState *md = MEMORY_DEVICE(obj); 327 const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(obj); 328 329 if (dev->realized && !memory_device_is_empty(md)) { 330 *size += mdc->get_plugged_size(md, &error_abort); 331 } 332 } 333 334 object_child_foreach(obj, memory_device_plugged_size, opaque); 335 return 0; 336 } 337 338 uint64_t get_plugged_memory_size(void) 339 { 340 uint64_t size = 0; 341 342 memory_device_plugged_size(qdev_get_machine(), &size); 343 344 return size; 345 } 346 347 void memory_device_pre_plug(MemoryDeviceState *md, MachineState *ms, 348 const uint64_t *legacy_align, Error **errp) 349 { 350 const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md); 351 Error *local_err = NULL; 352 uint64_t addr, align = 0; 353 MemoryRegion *mr; 354 355 /* We support empty memory devices even without device memory. */ 356 if (memory_device_is_empty(md)) { 357 return; 358 } 359 360 if (!ms->device_memory) { 361 error_setg(errp, "the configuration is not prepared for memory devices" 362 " (e.g., for memory hotplug), consider specifying the" 363 " maxmem option"); 364 return; 365 } 366 367 mr = mdc->get_memory_region(md, &local_err); 368 if (local_err) { 369 goto out; 370 } 371 372 memory_device_check_addable(ms, md, mr, &local_err); 373 if (local_err) { 374 goto out; 375 } 376 377 /* 378 * We always want the memory region size to be multiples of the memory 379 * region alignment: for example, DIMMs with 1G+1byte size don't make 380 * any sense. Note that we don't check that the size is multiples 381 * of any additional alignment requirements the memory device might 382 * have when it comes to the address in physical address space. 383 */ 384 if (!QEMU_IS_ALIGNED(memory_region_size(mr), 385 memory_region_get_alignment(mr))) { 386 error_setg(errp, "backend memory size must be multiple of 0x%" 387 PRIx64, memory_region_get_alignment(mr)); 388 return; 389 } 390 391 if (legacy_align) { 392 align = *legacy_align; 393 } else { 394 if (mdc->get_min_alignment) { 395 align = mdc->get_min_alignment(md); 396 } 397 align = MAX(align, memory_region_get_alignment(mr)); 398 } 399 addr = mdc->get_addr(md); 400 addr = memory_device_get_free_addr(ms, !addr ? NULL : &addr, align, 401 memory_region_size(mr), &local_err); 402 if (local_err) { 403 goto out; 404 } 405 mdc->set_addr(md, addr, &local_err); 406 if (!local_err) { 407 trace_memory_device_pre_plug(DEVICE(md)->id ? DEVICE(md)->id : "", 408 addr); 409 } 410 out: 411 error_propagate(errp, local_err); 412 } 413 414 void memory_device_plug(MemoryDeviceState *md, MachineState *ms) 415 { 416 const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md); 417 unsigned int memslots; 418 uint64_t addr; 419 MemoryRegion *mr; 420 421 if (memory_device_is_empty(md)) { 422 return; 423 } 424 425 memslots = memory_device_get_memslots(md); 426 addr = mdc->get_addr(md); 427 428 /* 429 * We expect that a previous call to memory_device_pre_plug() succeeded, so 430 * it can't fail at this point. 431 */ 432 mr = mdc->get_memory_region(md, &error_abort); 433 g_assert(ms->device_memory); 434 435 ms->device_memory->used_region_size += memory_region_size(mr); 436 ms->device_memory->required_memslots += memslots; 437 if (mdc->decide_memslots && memslots > 1) { 438 ms->device_memory->memslot_auto_decision_active++; 439 } 440 441 memory_region_add_subregion(&ms->device_memory->mr, 442 addr - ms->device_memory->base, mr); 443 trace_memory_device_plug(DEVICE(md)->id ? DEVICE(md)->id : "", addr); 444 } 445 446 void memory_device_unplug(MemoryDeviceState *md, MachineState *ms) 447 { 448 const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md); 449 const unsigned int memslots = memory_device_get_memslots(md); 450 MemoryRegion *mr; 451 452 if (memory_device_is_empty(md)) { 453 return; 454 } 455 456 /* 457 * We expect that a previous call to memory_device_pre_plug() succeeded, so 458 * it can't fail at this point. 459 */ 460 mr = mdc->get_memory_region(md, &error_abort); 461 g_assert(ms->device_memory); 462 463 memory_region_del_subregion(&ms->device_memory->mr, mr); 464 465 if (mdc->decide_memslots && memslots > 1) { 466 ms->device_memory->memslot_auto_decision_active--; 467 } 468 ms->device_memory->used_region_size -= memory_region_size(mr); 469 ms->device_memory->required_memslots -= memslots; 470 trace_memory_device_unplug(DEVICE(md)->id ? DEVICE(md)->id : "", 471 mdc->get_addr(md)); 472 } 473 474 uint64_t memory_device_get_region_size(const MemoryDeviceState *md, 475 Error **errp) 476 { 477 const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md); 478 MemoryRegion *mr; 479 480 /* dropping const here is fine as we don't touch the memory region */ 481 mr = mdc->get_memory_region((MemoryDeviceState *)md, errp); 482 if (!mr) { 483 return 0; 484 } 485 486 return memory_region_size(mr); 487 } 488 489 static void memory_devices_region_mod(MemoryListener *listener, 490 MemoryRegionSection *mrs, bool add) 491 { 492 DeviceMemoryState *dms = container_of(listener, DeviceMemoryState, 493 listener); 494 495 if (!memory_region_is_ram(mrs->mr)) { 496 warn_report("Unexpected memory region mapped into device memory region."); 497 return; 498 } 499 500 /* 501 * The expectation is that each distinct RAM memory region section in 502 * our region for memory devices consumes exactly one memslot in KVM 503 * and in vhost. For vhost, this is true, except: 504 * * ROM memory regions don't consume a memslot. These get used very 505 * rarely for memory devices (R/O NVDIMMs). 506 * * Memslots without a fd (memory-backend-ram) don't necessarily 507 * consume a memslot. Such setups are quite rare and possibly bogus: 508 * the memory would be inaccessible by such vhost devices. 509 * 510 * So for vhost, in corner cases we might over-estimate the number of 511 * memslots that are currently used or that might still be reserved 512 * (required - used). 513 */ 514 dms->used_memslots += add ? 1 : -1; 515 516 if (dms->used_memslots > dms->required_memslots) { 517 warn_report("Memory devices use more memory slots than indicated as required."); 518 } 519 } 520 521 static void memory_devices_region_add(MemoryListener *listener, 522 MemoryRegionSection *mrs) 523 { 524 return memory_devices_region_mod(listener, mrs, true); 525 } 526 527 static void memory_devices_region_del(MemoryListener *listener, 528 MemoryRegionSection *mrs) 529 { 530 return memory_devices_region_mod(listener, mrs, false); 531 } 532 533 void machine_memory_devices_init(MachineState *ms, hwaddr base, uint64_t size) 534 { 535 g_assert(size); 536 g_assert(!ms->device_memory); 537 ms->device_memory = g_new0(DeviceMemoryState, 1); 538 ms->device_memory->base = base; 539 540 memory_region_init(&ms->device_memory->mr, OBJECT(ms), "device-memory", 541 size); 542 address_space_init(&ms->device_memory->as, &ms->device_memory->mr, 543 "device-memory"); 544 memory_region_add_subregion(get_system_memory(), ms->device_memory->base, 545 &ms->device_memory->mr); 546 547 /* Track the number of memslots used by memory devices. */ 548 ms->device_memory->listener.region_add = memory_devices_region_add; 549 ms->device_memory->listener.region_del = memory_devices_region_del; 550 memory_listener_register(&ms->device_memory->listener, 551 &ms->device_memory->as); 552 } 553 554 static const TypeInfo memory_device_info = { 555 .name = TYPE_MEMORY_DEVICE, 556 .parent = TYPE_INTERFACE, 557 .class_size = sizeof(MemoryDeviceClass), 558 }; 559 560 static void memory_device_register_types(void) 561 { 562 type_register_static(&memory_device_info); 563 } 564 565 type_init(memory_device_register_types) 566