1 /* 2 * Memory Device Interface 3 * 4 * Copyright ProfitBricks GmbH 2012 5 * Copyright (C) 2014 Red Hat Inc 6 * Copyright (c) 2018 Red Hat Inc 7 * 8 * This work is licensed under the terms of the GNU GPL, version 2 or later. 9 * See the COPYING file in the top-level directory. 10 */ 11 12 #include "qemu/osdep.h" 13 #include "qemu/error-report.h" 14 #include "hw/mem/memory-device.h" 15 #include "qapi/error.h" 16 #include "hw/boards.h" 17 #include "qemu/range.h" 18 #include "hw/virtio/vhost.h" 19 #include "sysemu/kvm.h" 20 #include "exec/address-spaces.h" 21 #include "trace.h" 22 23 static gint memory_device_addr_sort(gconstpointer a, gconstpointer b) 24 { 25 const MemoryDeviceState *md_a = MEMORY_DEVICE(a); 26 const MemoryDeviceState *md_b = MEMORY_DEVICE(b); 27 const MemoryDeviceClass *mdc_a = MEMORY_DEVICE_GET_CLASS(a); 28 const MemoryDeviceClass *mdc_b = MEMORY_DEVICE_GET_CLASS(b); 29 const uint64_t addr_a = mdc_a->get_addr(md_a); 30 const uint64_t addr_b = mdc_b->get_addr(md_b); 31 32 if (addr_a > addr_b) { 33 return 1; 34 } else if (addr_a < addr_b) { 35 return -1; 36 } 37 return 0; 38 } 39 40 static int memory_device_build_list(Object *obj, void *opaque) 41 { 42 GSList **list = opaque; 43 44 if (object_dynamic_cast(obj, TYPE_MEMORY_DEVICE)) { 45 DeviceState *dev = DEVICE(obj); 46 if (dev->realized) { /* only realized memory devices matter */ 47 *list = g_slist_insert_sorted(*list, dev, memory_device_addr_sort); 48 } 49 } 50 51 object_child_foreach(obj, memory_device_build_list, opaque); 52 return 0; 53 } 54 55 static unsigned int memory_device_get_memslots(MemoryDeviceState *md) 56 { 57 const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md); 58 59 if (mdc->get_memslots) { 60 return mdc->get_memslots(md); 61 } 62 return 1; 63 } 64 65 /* 66 * Memslots that are reserved by memory devices (required but still reported 67 * as free from KVM / vhost). 68 */ 69 static unsigned int get_reserved_memslots(MachineState *ms) 70 { 71 if (ms->device_memory->used_memslots > 72 ms->device_memory->required_memslots) { 73 /* This is unexpected, and we warned already in the memory notifier. */ 74 return 0; 75 } 76 return ms->device_memory->required_memslots - 77 ms->device_memory->used_memslots; 78 } 79 80 unsigned int memory_devices_get_reserved_memslots(void) 81 { 82 if (!current_machine->device_memory) { 83 return 0; 84 } 85 return get_reserved_memslots(current_machine); 86 } 87 88 bool memory_devices_memslot_auto_decision_active(void) 89 { 90 if (!current_machine->device_memory) { 91 return false; 92 } 93 94 return current_machine->device_memory->memslot_auto_decision_active; 95 } 96 97 static unsigned int memory_device_memslot_decision_limit(MachineState *ms, 98 MemoryRegion *mr) 99 { 100 const unsigned int reserved = get_reserved_memslots(ms); 101 const uint64_t size = memory_region_size(mr); 102 unsigned int max = vhost_get_max_memslots(); 103 unsigned int free = vhost_get_free_memslots(); 104 uint64_t available_space; 105 unsigned int memslots; 106 107 if (kvm_enabled()) { 108 max = MIN(max, kvm_get_max_memslots()); 109 free = MIN(free, kvm_get_free_memslots()); 110 } 111 112 /* 113 * If we only have less overall memslots than what we consider reasonable, 114 * just keep it to a minimum. 115 */ 116 if (max < MEMORY_DEVICES_SAFE_MAX_MEMSLOTS) { 117 return 1; 118 } 119 120 /* 121 * Consider our soft-limit across all memory devices. We don't really 122 * expect to exceed this limit in reasonable configurations. 123 */ 124 if (MEMORY_DEVICES_SOFT_MEMSLOT_LIMIT <= 125 ms->device_memory->required_memslots) { 126 return 1; 127 } 128 memslots = MEMORY_DEVICES_SOFT_MEMSLOT_LIMIT - 129 ms->device_memory->required_memslots; 130 131 /* 132 * Consider the actually still free memslots. This is only relevant if 133 * other memslot consumers would consume *significantly* more memslots than 134 * what we prepared for (> 253). Unlikely, but let's just handle it 135 * cleanly. 136 */ 137 memslots = MIN(memslots, free - reserved); 138 if (memslots < 1 || unlikely(free < reserved)) { 139 return 1; 140 } 141 142 /* We cannot have any other memory devices? So give all to this device. */ 143 if (size == ms->maxram_size - ms->ram_size) { 144 return memslots; 145 } 146 147 /* 148 * Simple heuristic: equally distribute the memslots over the space 149 * still available for memory devices. 150 */ 151 available_space = ms->maxram_size - ms->ram_size - 152 ms->device_memory->used_region_size; 153 memslots = (double)memslots * size / available_space; 154 return memslots < 1 ? 1 : memslots; 155 } 156 157 static void memory_device_check_addable(MachineState *ms, MemoryDeviceState *md, 158 MemoryRegion *mr, Error **errp) 159 { 160 const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md); 161 const uint64_t used_region_size = ms->device_memory->used_region_size; 162 const uint64_t size = memory_region_size(mr); 163 const unsigned int reserved_memslots = get_reserved_memslots(ms); 164 unsigned int required_memslots, memslot_limit; 165 166 /* 167 * Instruct the device to decide how many memslots to use, if applicable, 168 * before we query the number of required memslots the first time. 169 */ 170 if (mdc->decide_memslots) { 171 memslot_limit = memory_device_memslot_decision_limit(ms, mr); 172 mdc->decide_memslots(md, memslot_limit); 173 } 174 required_memslots = memory_device_get_memslots(md); 175 176 /* we will need memory slots for kvm and vhost */ 177 if (kvm_enabled() && 178 kvm_get_free_memslots() < required_memslots + reserved_memslots) { 179 error_setg(errp, "hypervisor has not enough free memory slots left"); 180 return; 181 } 182 if (vhost_get_free_memslots() < required_memslots + reserved_memslots) { 183 error_setg(errp, "a used vhost backend has not enough free memory slots left"); 184 return; 185 } 186 187 /* will we exceed the total amount of memory specified */ 188 if (used_region_size + size < used_region_size || 189 used_region_size + size > ms->maxram_size - ms->ram_size) { 190 error_setg(errp, "not enough space, currently 0x%" PRIx64 191 " in use of total space for memory devices 0x" RAM_ADDR_FMT, 192 used_region_size, ms->maxram_size - ms->ram_size); 193 return; 194 } 195 196 } 197 198 static uint64_t memory_device_get_free_addr(MachineState *ms, 199 const uint64_t *hint, 200 uint64_t align, uint64_t size, 201 Error **errp) 202 { 203 GSList *list = NULL, *item; 204 Range as, new = range_empty; 205 206 range_init_nofail(&as, ms->device_memory->base, 207 memory_region_size(&ms->device_memory->mr)); 208 209 /* start of address space indicates the maximum alignment we expect */ 210 if (!QEMU_IS_ALIGNED(range_lob(&as), align)) { 211 warn_report("the alignment (0x%" PRIx64 ") exceeds the expected" 212 " maximum alignment, memory will get fragmented and not" 213 " all 'maxmem' might be usable for memory devices.", 214 align); 215 } 216 217 if (hint && !QEMU_IS_ALIGNED(*hint, align)) { 218 error_setg(errp, "address must be aligned to 0x%" PRIx64 " bytes", 219 align); 220 return 0; 221 } 222 223 if (!QEMU_IS_ALIGNED(size, align)) { 224 error_setg(errp, "backend memory size must be multiple of 0x%" 225 PRIx64, align); 226 return 0; 227 } 228 229 if (hint) { 230 if (range_init(&new, *hint, size) || !range_contains_range(&as, &new)) { 231 error_setg(errp, "can't add memory device [0x%" PRIx64 ":0x%" PRIx64 232 "], usable range for memory devices [0x%" PRIx64 ":0x%" 233 PRIx64 "]", *hint, size, range_lob(&as), 234 range_size(&as)); 235 return 0; 236 } 237 } else { 238 if (range_init(&new, QEMU_ALIGN_UP(range_lob(&as), align), size)) { 239 error_setg(errp, "can't add memory device, device too big"); 240 return 0; 241 } 242 } 243 244 /* find address range that will fit new memory device */ 245 object_child_foreach(OBJECT(ms), memory_device_build_list, &list); 246 for (item = list; item; item = g_slist_next(item)) { 247 const MemoryDeviceState *md = item->data; 248 const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(OBJECT(md)); 249 uint64_t next_addr; 250 Range tmp; 251 252 range_init_nofail(&tmp, mdc->get_addr(md), 253 memory_device_get_region_size(md, &error_abort)); 254 255 if (range_overlaps_range(&tmp, &new)) { 256 if (hint) { 257 const DeviceState *d = DEVICE(md); 258 error_setg(errp, "address range conflicts with memory device" 259 " id='%s'", d->id ? d->id : "(unnamed)"); 260 goto out; 261 } 262 263 next_addr = QEMU_ALIGN_UP(range_upb(&tmp) + 1, align); 264 if (!next_addr || range_init(&new, next_addr, range_size(&new))) { 265 range_make_empty(&new); 266 break; 267 } 268 } else if (range_lob(&tmp) > range_upb(&new)) { 269 break; 270 } 271 } 272 273 if (!range_contains_range(&as, &new)) { 274 error_setg(errp, "could not find position in guest address space for " 275 "memory device - memory fragmented due to alignments"); 276 } 277 out: 278 g_slist_free(list); 279 return range_lob(&new); 280 } 281 282 MemoryDeviceInfoList *qmp_memory_device_list(void) 283 { 284 GSList *devices = NULL, *item; 285 MemoryDeviceInfoList *list = NULL, **tail = &list; 286 287 object_child_foreach(qdev_get_machine(), memory_device_build_list, 288 &devices); 289 290 for (item = devices; item; item = g_slist_next(item)) { 291 const MemoryDeviceState *md = MEMORY_DEVICE(item->data); 292 const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(item->data); 293 MemoryDeviceInfo *info = g_new0(MemoryDeviceInfo, 1); 294 295 mdc->fill_device_info(md, info); 296 297 QAPI_LIST_APPEND(tail, info); 298 } 299 300 g_slist_free(devices); 301 302 return list; 303 } 304 305 static int memory_device_plugged_size(Object *obj, void *opaque) 306 { 307 uint64_t *size = opaque; 308 309 if (object_dynamic_cast(obj, TYPE_MEMORY_DEVICE)) { 310 const DeviceState *dev = DEVICE(obj); 311 const MemoryDeviceState *md = MEMORY_DEVICE(obj); 312 const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(obj); 313 314 if (dev->realized) { 315 *size += mdc->get_plugged_size(md, &error_abort); 316 } 317 } 318 319 object_child_foreach(obj, memory_device_plugged_size, opaque); 320 return 0; 321 } 322 323 uint64_t get_plugged_memory_size(void) 324 { 325 uint64_t size = 0; 326 327 memory_device_plugged_size(qdev_get_machine(), &size); 328 329 return size; 330 } 331 332 void memory_device_pre_plug(MemoryDeviceState *md, MachineState *ms, 333 const uint64_t *legacy_align, Error **errp) 334 { 335 const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md); 336 Error *local_err = NULL; 337 uint64_t addr, align = 0; 338 MemoryRegion *mr; 339 340 if (!ms->device_memory) { 341 error_setg(errp, "the configuration is not prepared for memory devices" 342 " (e.g., for memory hotplug), consider specifying the" 343 " maxmem option"); 344 return; 345 } 346 347 mr = mdc->get_memory_region(md, &local_err); 348 if (local_err) { 349 goto out; 350 } 351 352 memory_device_check_addable(ms, md, mr, &local_err); 353 if (local_err) { 354 goto out; 355 } 356 357 if (legacy_align) { 358 align = *legacy_align; 359 } else { 360 if (mdc->get_min_alignment) { 361 align = mdc->get_min_alignment(md); 362 } 363 align = MAX(align, memory_region_get_alignment(mr)); 364 } 365 addr = mdc->get_addr(md); 366 addr = memory_device_get_free_addr(ms, !addr ? NULL : &addr, align, 367 memory_region_size(mr), &local_err); 368 if (local_err) { 369 goto out; 370 } 371 mdc->set_addr(md, addr, &local_err); 372 if (!local_err) { 373 trace_memory_device_pre_plug(DEVICE(md)->id ? DEVICE(md)->id : "", 374 addr); 375 } 376 out: 377 error_propagate(errp, local_err); 378 } 379 380 void memory_device_plug(MemoryDeviceState *md, MachineState *ms) 381 { 382 const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md); 383 const unsigned int memslots = memory_device_get_memslots(md); 384 const uint64_t addr = mdc->get_addr(md); 385 MemoryRegion *mr; 386 387 /* 388 * We expect that a previous call to memory_device_pre_plug() succeeded, so 389 * it can't fail at this point. 390 */ 391 mr = mdc->get_memory_region(md, &error_abort); 392 g_assert(ms->device_memory); 393 394 ms->device_memory->used_region_size += memory_region_size(mr); 395 ms->device_memory->required_memslots += memslots; 396 if (mdc->decide_memslots && memslots > 1) { 397 ms->device_memory->memslot_auto_decision_active++; 398 } 399 400 memory_region_add_subregion(&ms->device_memory->mr, 401 addr - ms->device_memory->base, mr); 402 trace_memory_device_plug(DEVICE(md)->id ? DEVICE(md)->id : "", addr); 403 } 404 405 void memory_device_unplug(MemoryDeviceState *md, MachineState *ms) 406 { 407 const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md); 408 const unsigned int memslots = memory_device_get_memslots(md); 409 MemoryRegion *mr; 410 411 /* 412 * We expect that a previous call to memory_device_pre_plug() succeeded, so 413 * it can't fail at this point. 414 */ 415 mr = mdc->get_memory_region(md, &error_abort); 416 g_assert(ms->device_memory); 417 418 memory_region_del_subregion(&ms->device_memory->mr, mr); 419 420 if (mdc->decide_memslots && memslots > 1) { 421 ms->device_memory->memslot_auto_decision_active--; 422 } 423 ms->device_memory->used_region_size -= memory_region_size(mr); 424 ms->device_memory->required_memslots -= memslots; 425 trace_memory_device_unplug(DEVICE(md)->id ? DEVICE(md)->id : "", 426 mdc->get_addr(md)); 427 } 428 429 uint64_t memory_device_get_region_size(const MemoryDeviceState *md, 430 Error **errp) 431 { 432 const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md); 433 MemoryRegion *mr; 434 435 /* dropping const here is fine as we don't touch the memory region */ 436 mr = mdc->get_memory_region((MemoryDeviceState *)md, errp); 437 if (!mr) { 438 return 0; 439 } 440 441 return memory_region_size(mr); 442 } 443 444 static void memory_devices_region_mod(MemoryListener *listener, 445 MemoryRegionSection *mrs, bool add) 446 { 447 DeviceMemoryState *dms = container_of(listener, DeviceMemoryState, 448 listener); 449 450 if (!memory_region_is_ram(mrs->mr)) { 451 warn_report("Unexpected memory region mapped into device memory region."); 452 return; 453 } 454 455 /* 456 * The expectation is that each distinct RAM memory region section in 457 * our region for memory devices consumes exactly one memslot in KVM 458 * and in vhost. For vhost, this is true, except: 459 * * ROM memory regions don't consume a memslot. These get used very 460 * rarely for memory devices (R/O NVDIMMs). 461 * * Memslots without a fd (memory-backend-ram) don't necessarily 462 * consume a memslot. Such setups are quite rare and possibly bogus: 463 * the memory would be inaccessible by such vhost devices. 464 * 465 * So for vhost, in corner cases we might over-estimate the number of 466 * memslots that are currently used or that might still be reserved 467 * (required - used). 468 */ 469 dms->used_memslots += add ? 1 : -1; 470 471 if (dms->used_memslots > dms->required_memslots) { 472 warn_report("Memory devices use more memory slots than indicated as required."); 473 } 474 } 475 476 static void memory_devices_region_add(MemoryListener *listener, 477 MemoryRegionSection *mrs) 478 { 479 return memory_devices_region_mod(listener, mrs, true); 480 } 481 482 static void memory_devices_region_del(MemoryListener *listener, 483 MemoryRegionSection *mrs) 484 { 485 return memory_devices_region_mod(listener, mrs, false); 486 } 487 488 void machine_memory_devices_init(MachineState *ms, hwaddr base, uint64_t size) 489 { 490 g_assert(size); 491 g_assert(!ms->device_memory); 492 ms->device_memory = g_new0(DeviceMemoryState, 1); 493 ms->device_memory->base = base; 494 495 memory_region_init(&ms->device_memory->mr, OBJECT(ms), "device-memory", 496 size); 497 address_space_init(&ms->device_memory->as, &ms->device_memory->mr, 498 "device-memory"); 499 memory_region_add_subregion(get_system_memory(), ms->device_memory->base, 500 &ms->device_memory->mr); 501 502 /* Track the number of memslots used by memory devices. */ 503 ms->device_memory->listener.region_add = memory_devices_region_add; 504 ms->device_memory->listener.region_del = memory_devices_region_del; 505 memory_listener_register(&ms->device_memory->listener, 506 &ms->device_memory->as); 507 } 508 509 static const TypeInfo memory_device_info = { 510 .name = TYPE_MEMORY_DEVICE, 511 .parent = TYPE_INTERFACE, 512 .class_size = sizeof(MemoryDeviceClass), 513 }; 514 515 static void memory_device_register_types(void) 516 { 517 type_register_static(&memory_device_info); 518 } 519 520 type_init(memory_device_register_types) 521