/*
 * QEMU Host Memory Backend
 *
 * Copyright (C) 2013-2014 Red Hat Inc
 *
 * Authors:
 *   Igor Mammedov <imammedo@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "sysemu/hostmem.h"
#include "hw/boards.h"
#include "qapi/error.h"
#include "qapi/qapi-builtin-visit.h"
#include "qapi/visitor.h"
#include "qemu/config-file.h"
#include "qom/object_interfaces.h"
#include "qemu/mmap-alloc.h"
#include "qemu/madvise.h"
#include "qemu/cutils.h"
#include "hw/qdev-core.h"

#ifdef CONFIG_NUMA
#include <numaif.h>
#include <numa.h>
QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_DEFAULT != MPOL_DEFAULT);
/*
 * HOST_MEM_POLICY_PREFERRED may either translate to MPOL_PREFERRED or
 * MPOL_PREFERRED_MANY, see comments further below.
 */
QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_PREFERRED != MPOL_PREFERRED);
QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_BIND != MPOL_BIND);
QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_INTERLEAVE != MPOL_INTERLEAVE);
#endif

char *
host_memory_backend_get_name(HostMemoryBackend *backend)
{
    if (!backend->use_canonical_path) {
        return g_strdup(object_get_canonical_path_component(OBJECT(backend)));
    }

    return object_get_canonical_path(OBJECT(backend));
}

static void
host_memory_backend_get_size(Object *obj, Visitor *v, const char *name,
                             void *opaque, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    uint64_t value = backend->size;

    visit_type_size(v, name, &value, errp);
}

static void
host_memory_backend_set_size(Object *obj, Visitor *v, const char *name,
                             void *opaque, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    uint64_t value;

    if (host_memory_backend_mr_inited(backend)) {
        error_setg(errp, "cannot change property %s of %s", name,
                   object_get_typename(obj));
        return;
    }

    if (!visit_type_size(v, name, &value, errp)) {
        return;
    }
    if (!value) {
        error_setg(errp,
                   "property '%s' of %s doesn't take value '%" PRIu64 "'",
                   name, object_get_typename(obj), value);
        return;
    }
    backend->size = value;
}

static void
host_memory_backend_get_host_nodes(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    uint16List *host_nodes = NULL;
    uint16List **tail = &host_nodes;
    unsigned long value;

    value = find_first_bit(backend->host_nodes, MAX_NODES);
    if (value == MAX_NODES) {
        goto ret;
    }

    QAPI_LIST_APPEND(tail, value);

    do {
        value = find_next_bit(backend->host_nodes, MAX_NODES, value + 1);
        if (value == MAX_NODES) {
            break;
        }

        QAPI_LIST_APPEND(tail, value);
    } while (true);

ret:
    visit_type_uint16List(v, name, &host_nodes, errp);
    qapi_free_uint16List(host_nodes);
}
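/*
 * Note on the setter below: every value is validated against MAX_NODES
 * before any bit is touched, and bitmap_set() only ever adds bits, so
 * nodes set by an earlier assignment of this property are not cleared
 * by a later one.
 */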
static void
host_memory_backend_set_host_nodes(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
{
#ifdef CONFIG_NUMA
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    uint16List *l, *host_nodes = NULL;

    visit_type_uint16List(v, name, &host_nodes, errp);

    for (l = host_nodes; l; l = l->next) {
        if (l->value >= MAX_NODES) {
            error_setg(errp, "Invalid host-nodes value: %d", l->value);
            goto out;
        }
    }

    for (l = host_nodes; l; l = l->next) {
        bitmap_set(backend->host_nodes, l->value, 1);
    }

out:
    qapi_free_uint16List(host_nodes);
#else
    error_setg(errp, "NUMA node binding is not supported by this QEMU");
#endif
}

static int
host_memory_backend_get_policy(Object *obj, Error **errp G_GNUC_UNUSED)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    return backend->policy;
}

static void
host_memory_backend_set_policy(Object *obj, int policy, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    backend->policy = policy;

#ifndef CONFIG_NUMA
    if (policy != HOST_MEM_POLICY_DEFAULT) {
        error_setg(errp, "NUMA policies are not supported by this QEMU");
    }
#endif
}
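/*
 * The "merge" property controls KSM page merging: QEMU_MADV_MERGEABLE
 * maps to madvise(MADV_MERGEABLE) on Linux and to QEMU_MADV_INVALID on
 * hosts without such support, which is exactly what the setter below
 * checks for before applying or rejecting the request.
 */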
static bool host_memory_backend_get_merge(Object *obj, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    return backend->merge;
}

static void host_memory_backend_set_merge(Object *obj, bool value, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    if (QEMU_MADV_MERGEABLE == QEMU_MADV_INVALID) {
        if (value) {
            error_setg(errp, "Memory merging is not supported on this host");
        }
        assert(!backend->merge);
        return;
    }

    if (host_memory_backend_mr_inited(backend) &&
        value != backend->merge) {
        void *ptr = memory_region_get_ram_ptr(&backend->mr);
        uint64_t sz = memory_region_size(&backend->mr);

        qemu_madvise(ptr, sz,
                     value ? QEMU_MADV_MERGEABLE : QEMU_MADV_UNMERGEABLE);
    }

    backend->merge = value;
}

static bool host_memory_backend_get_dump(Object *obj, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    return backend->dump;
}

static void host_memory_backend_set_dump(Object *obj, bool value, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    if (QEMU_MADV_DONTDUMP == QEMU_MADV_INVALID) {
        if (!value) {
            error_setg(errp,
                       "Dumping guest memory cannot be disabled on this host");
        }
        assert(backend->dump);
        return;
    }

    if (host_memory_backend_mr_inited(backend) &&
        value != backend->dump) {
        void *ptr = memory_region_get_ram_ptr(&backend->mr);
        uint64_t sz = memory_region_size(&backend->mr);

        qemu_madvise(ptr, sz,
                     value ? QEMU_MADV_DODUMP : QEMU_MADV_DONTDUMP);
    }

    backend->dump = value;
}

static bool host_memory_backend_get_prealloc(Object *obj, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    return backend->prealloc;
}

static void host_memory_backend_set_prealloc(Object *obj, bool value,
                                             Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    if (!backend->reserve && value) {
        error_setg(errp, "'prealloc=on' and 'reserve=off' are incompatible");
        return;
    }

    if (!host_memory_backend_mr_inited(backend)) {
        backend->prealloc = value;
        return;
    }

    if (value && !backend->prealloc) {
        int fd = memory_region_get_fd(&backend->mr);
        void *ptr = memory_region_get_ram_ptr(&backend->mr);
        uint64_t sz = memory_region_size(&backend->mr);

        if (!qemu_prealloc_mem(fd, ptr, sz, backend->prealloc_threads,
                               backend->prealloc_context, false, errp)) {
            return;
        }
        backend->prealloc = true;
    }
}

static void host_memory_backend_get_prealloc_threads(Object *obj, Visitor *v,
    const char *name, void *opaque, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    visit_type_uint32(v, name, &backend->prealloc_threads, errp);
}

static void host_memory_backend_set_prealloc_threads(Object *obj, Visitor *v,
    const char *name, void *opaque, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    uint32_t value;

    if (!visit_type_uint32(v, name, &value, errp)) {
        return;
    }
    if (value == 0) {
        error_setg(errp,
                   "property '%s' of %s doesn't take value '%" PRIu32 "'",
                   name, object_get_typename(obj), value);
        return;
    }
    backend->prealloc_threads = value;
}

static void host_memory_backend_init(Object *obj)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    MachineState *machine = MACHINE(qdev_get_machine());

    /* TODO: convert access to globals to compat properties */
    backend->merge = machine_mem_merge(machine);
    backend->dump = machine_dump_guest_core(machine);
    backend->guest_memfd = machine_require_guest_memfd(machine);
    backend->reserve = true;
    backend->prealloc_threads = machine->smp.cpus;
}

static void host_memory_backend_post_init(Object *obj)
{
    object_apply_compat_props(obj);
}

bool host_memory_backend_mr_inited(HostMemoryBackend *backend)
{
    /*
     * NOTE: We forbid zero-length memory backend, so here zero means
     * "we haven't inited the backend memory region yet".
     */
    return memory_region_size(&backend->mr) != 0;
}
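/*
 * Typical lifecycle (sketch): a concrete backend such as
 * memory-backend-ram is created via QOM, e.g. with
 * "-object memory-backend-ram,id=mem0,size=4G" on the command line, its
 * properties are set, and completion (host_memory_backend_memory_complete()
 * below) allocates the RAM; only then does
 * host_memory_backend_get_memory() return a usable MemoryRegion.
 */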
MemoryRegion *host_memory_backend_get_memory(HostMemoryBackend *backend)
{
    return host_memory_backend_mr_inited(backend) ? &backend->mr : NULL;
}

void host_memory_backend_set_mapped(HostMemoryBackend *backend, bool mapped)
{
    backend->is_mapped = mapped;
}

bool host_memory_backend_is_mapped(HostMemoryBackend *backend)
{
    return backend->is_mapped;
}

size_t host_memory_backend_pagesize(HostMemoryBackend *memdev)
{
    size_t pagesize = qemu_ram_pagesize(memdev->mr.ram_block);
    g_assert(pagesize >= qemu_real_host_page_size());
    return pagesize;
}
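/*
 * UserCreatable completion hook: allocate the backing memory through the
 * subclass alloc() hook, apply the merge/dump madvise hints and the NUMA
 * policy, then preallocate if requested, in that order.
 */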
static void
host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(uc);
    HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc);
    void *ptr;
    uint64_t sz;
    size_t pagesize;
    bool async = !phase_check(PHASE_LATE_BACKENDS_CREATED);

    if (!bc->alloc) {
        return;
    }
    if (!bc->alloc(backend, errp)) {
        return;
    }

    ptr = memory_region_get_ram_ptr(&backend->mr);
    sz = memory_region_size(&backend->mr);
    pagesize = qemu_ram_pagesize(backend->mr.ram_block);

    if (backend->aligned && !QEMU_IS_ALIGNED(sz, pagesize)) {
        g_autofree char *pagesize_str = size_to_str(pagesize);
        error_setg(errp, "backend '%s' memory size must be multiple of %s",
                   object_get_typename(OBJECT(uc)), pagesize_str);
        return;
    }

    if (backend->merge) {
        qemu_madvise(ptr, sz, QEMU_MADV_MERGEABLE);
    }
    if (!backend->dump) {
        qemu_madvise(ptr, sz, QEMU_MADV_DONTDUMP);
    }
#ifdef CONFIG_NUMA
    unsigned long lastbit = find_last_bit(backend->host_nodes, MAX_NODES);
    /* lastbit == MAX_NODES means maxnode = 0 */
    unsigned long maxnode = (lastbit + 1) % (MAX_NODES + 1);
    /*
     * Ensure policy won't be ignored in case memory is preallocated
     * before mbind(). Note: MPOL_MF_STRICT is ignored on hugepages, so
     * this doesn't catch the hugepage case.
     */
    unsigned flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
    int mode = backend->policy;

    /*
     * Check for invalid host-nodes and policies and give more verbose
     * error messages than mbind().
     */
    if (maxnode && backend->policy == MPOL_DEFAULT) {
        error_setg(errp, "host-nodes must be empty for policy default,"
                   " or you should explicitly specify a policy other"
                   " than default");
        return;
    } else if (maxnode == 0 && backend->policy != MPOL_DEFAULT) {
        error_setg(errp, "host-nodes must be set for policy %s",
                   HostMemPolicy_str(backend->policy));
        return;
    }

    /*
     * We can have up to MAX_NODES nodes, but we need to pass maxnode+1
     * as argument to mbind() due to an old Linux bug (feature?) which
     * cuts off the last specified node. This means backend->host_nodes
     * must have MAX_NODES+1 bits available.
     */
    assert(sizeof(backend->host_nodes) >=
           BITS_TO_LONGS(MAX_NODES + 1) * sizeof(unsigned long));
    assert(maxnode <= MAX_NODES);

#ifdef HAVE_NUMA_HAS_PREFERRED_MANY
    if (mode == MPOL_PREFERRED && numa_has_preferred_many() > 0) {
        /*
         * Replace with MPOL_PREFERRED_MANY otherwise the mbind() below
         * silently picks the first node.
         */
        mode = MPOL_PREFERRED_MANY;
    }
#endif

    if (maxnode &&
        mbind(ptr, sz, mode, backend->host_nodes, maxnode + 1, flags)) {
        if (backend->policy != MPOL_DEFAULT || errno != ENOSYS) {
            error_setg_errno(errp, errno,
                             "cannot bind memory to host NUMA nodes");
            return;
        }
    }
#endif
    /*
     * Preallocate memory after the NUMA policy has been instantiated.
     * This is necessary to guarantee memory is allocated with
     * specified NUMA policy in place.
     */
    if (backend->prealloc &&
        !qemu_prealloc_mem(memory_region_get_fd(&backend->mr),
                           ptr, sz,
                           backend->prealloc_threads,
                           backend->prealloc_context,
                           async, errp)) {
        return;
    }
}

static bool
host_memory_backend_can_be_deleted(UserCreatable *uc)
{
    return !host_memory_backend_is_mapped(MEMORY_BACKEND(uc));
}

static bool host_memory_backend_get_share(Object *o, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(o);

    return backend->share;
}

static void host_memory_backend_set_share(Object *o, bool value, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(o);

    if (host_memory_backend_mr_inited(backend)) {
        error_setg(errp, "cannot change property value");
        return;
    }
    backend->share = value;
}

#ifdef CONFIG_LINUX
static bool host_memory_backend_get_reserve(Object *o, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(o);

    return backend->reserve;
}

static void host_memory_backend_set_reserve(Object *o, bool value, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(o);

    if (host_memory_backend_mr_inited(backend)) {
        error_setg(errp, "cannot change property value");
        return;
    }
    if (backend->prealloc && !value) {
        error_setg(errp, "'prealloc=on' and 'reserve=off' are incompatible");
        return;
    }
    backend->reserve = value;
}
#endif /* CONFIG_LINUX */

static bool
host_memory_backend_get_use_canonical_path(Object *obj, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    return backend->use_canonical_path;
}

static void
host_memory_backend_set_use_canonical_path(Object *obj, bool value,
                                           Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    backend->use_canonical_path = value;
}

static void
host_memory_backend_class_init(ObjectClass *oc, void *data)
{
    UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc);

    ucc->complete = host_memory_backend_memory_complete;
    ucc->can_be_deleted = host_memory_backend_can_be_deleted;

    object_class_property_add_bool(oc, "merge",
        host_memory_backend_get_merge,
        host_memory_backend_set_merge);
    object_class_property_set_description(oc, "merge",
        "Mark memory as mergeable");
    object_class_property_add_bool(oc, "dump",
        host_memory_backend_get_dump,
        host_memory_backend_set_dump);
    object_class_property_set_description(oc, "dump",
        "Set to 'off' to exclude from core dump");
    object_class_property_add_bool(oc, "prealloc",
        host_memory_backend_get_prealloc,
        host_memory_backend_set_prealloc);
    object_class_property_set_description(oc, "prealloc",
        "Preallocate memory");
    object_class_property_add(oc, "prealloc-threads", "int",
        host_memory_backend_get_prealloc_threads,
        host_memory_backend_set_prealloc_threads,
        NULL, NULL);
    object_class_property_set_description(oc, "prealloc-threads",
        "Number of CPU threads to use for prealloc");
    object_class_property_add_link(oc, "prealloc-context",
        TYPE_THREAD_CONTEXT, offsetof(HostMemoryBackend, prealloc_context),
        object_property_allow_set_link, OBJ_PROP_LINK_STRONG);
    object_class_property_set_description(oc, "prealloc-context",
        "Context to use for creating CPU threads for preallocation");
    object_class_property_add(oc, "size", "int",
        host_memory_backend_get_size,
        host_memory_backend_set_size,
        NULL, NULL);
    object_class_property_set_description(oc, "size",
        "Size of the memory region (ex: 500M)");
    object_class_property_add(oc, "host-nodes", "int",
        host_memory_backend_get_host_nodes,
        host_memory_backend_set_host_nodes,
        NULL, NULL);
    object_class_property_set_description(oc, "host-nodes",
        "Binds memory to the list of NUMA host nodes");
    object_class_property_add_enum(oc, "policy", "HostMemPolicy",
        &HostMemPolicy_lookup,
        host_memory_backend_get_policy,
        host_memory_backend_set_policy);
    object_class_property_set_description(oc, "policy",
        "Set the NUMA policy");
    object_class_property_add_bool(oc, "share",
        host_memory_backend_get_share, host_memory_backend_set_share);
    object_class_property_set_description(oc, "share",
        "Mark the memory as private to QEMU or shared");
#ifdef CONFIG_LINUX
    object_class_property_add_bool(oc, "reserve",
        host_memory_backend_get_reserve, host_memory_backend_set_reserve);
    object_class_property_set_description(oc, "reserve",
        "Reserve swap space (or huge pages) if applicable");
#endif /* CONFIG_LINUX */
    /*
     * Do not delete/rename this option. It must be considered stable
     * (as if it didn't have the 'x-' prefix, including a deprecation
     * period) as long as 4.0 and older machine types exist. The option
     * is used by upper layers to override (disable) the canonical path
     * for the ramblock-id set by compat properties on old machine types
     * (<= 4.0), to keep migration working when the backend is used for
     * main RAM with the "-machine memory-backend=" option (main RAM
     * historically used a prefix-less ramblock-id).
     */
    object_class_property_add_bool(oc, "x-use-canonical-path-for-ramblock-id",
        host_memory_backend_get_use_canonical_path,
        host_memory_backend_set_use_canonical_path);
}
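/*
 * The type is abstract (.abstract = true below): users instantiate one of
 * the concrete subclasses, e.g. memory-backend-ram, memory-backend-file or
 * memory-backend-memfd, each of which supplies the alloc() class hook
 * consumed by host_memory_backend_memory_complete().
 */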
static const TypeInfo host_memory_backend_info = {
    .name = TYPE_MEMORY_BACKEND,
    .parent = TYPE_OBJECT,
    .abstract = true,
    .class_size = sizeof(HostMemoryBackendClass),
    .class_init = host_memory_backend_class_init,
    .instance_size = sizeof(HostMemoryBackend),
    .instance_init = host_memory_backend_init,
    .instance_post_init = host_memory_backend_post_init,
    .interfaces = (InterfaceInfo[]) {
        { TYPE_USER_CREATABLE },
        { }
    }
};

static void register_types(void)
{
    type_register_static(&host_memory_backend_info);
}

type_init(register_types);