xref: /openbmc/qemu/backends/hostmem.c (revision ba379542)
1 /*
2  * QEMU Host Memory Backend
3  *
4  * Copyright (C) 2013-2014 Red Hat Inc
5  *
6  * Authors:
7  *   Igor Mammedov <imammedo@redhat.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2 or later.
10  * See the COPYING file in the top-level directory.
11  */
12 
13 #include "qemu/osdep.h"
14 #include "sysemu/hostmem.h"
15 #include "hw/boards.h"
16 #include "qapi/error.h"
17 #include "qapi/qapi-builtin-visit.h"
18 #include "qapi/visitor.h"
19 #include "qemu/config-file.h"
20 #include "qom/object_interfaces.h"
21 #include "qemu/mmap-alloc.h"
22 #include "qemu/madvise.h"
23 #include "qemu/cutils.h"
24 #include "hw/qdev-core.h"
25 
#ifdef CONFIG_NUMA
#include <numaif.h>
#include <numa.h>
/*
 * The code below stores a HOST_MEM_POLICY_* value in backend->policy and
 * later hands it to mbind() as the mode argument, so the QAPI enum values
 * must be numerically identical to the kernel's MPOL_* constants.  Fail the
 * build if they ever diverge.
 */
QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_DEFAULT != MPOL_DEFAULT);
/*
 * HOST_MEM_POLICY_PREFERRED may either translate to MPOL_PREFERRED or
 * MPOL_PREFERRED_MANY, see comments further below.
 */
QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_PREFERRED != MPOL_PREFERRED);
QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_BIND != MPOL_BIND);
QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_INTERLEAVE != MPOL_INTERLEAVE);
#endif
38 
39 char *
40 host_memory_backend_get_name(HostMemoryBackend *backend)
41 {
42     if (!backend->use_canonical_path) {
43         return g_strdup(object_get_canonical_path_component(OBJECT(backend)));
44     }
45 
46     return object_get_canonical_path(OBJECT(backend));
47 }
48 
49 static void
50 host_memory_backend_get_size(Object *obj, Visitor *v, const char *name,
51                              void *opaque, Error **errp)
52 {
53     HostMemoryBackend *backend = MEMORY_BACKEND(obj);
54     uint64_t value = backend->size;
55 
56     visit_type_size(v, name, &value, errp);
57 }
58 
59 static void
60 host_memory_backend_set_size(Object *obj, Visitor *v, const char *name,
61                              void *opaque, Error **errp)
62 {
63     HostMemoryBackend *backend = MEMORY_BACKEND(obj);
64     uint64_t value;
65 
66     if (host_memory_backend_mr_inited(backend)) {
67         error_setg(errp, "cannot change property %s of %s ", name,
68                    object_get_typename(obj));
69         return;
70     }
71 
72     if (!visit_type_size(v, name, &value, errp)) {
73         return;
74     }
75     if (!value) {
76         error_setg(errp,
77                    "property '%s' of %s doesn't take value '%" PRIu64 "'",
78                    name, object_get_typename(obj), value);
79         return;
80     }
81     backend->size = value;
82 }
83 
84 static void
85 host_memory_backend_get_host_nodes(Object *obj, Visitor *v, const char *name,
86                                    void *opaque, Error **errp)
87 {
88     HostMemoryBackend *backend = MEMORY_BACKEND(obj);
89     uint16List *host_nodes = NULL;
90     uint16List **tail = &host_nodes;
91     unsigned long value;
92 
93     value = find_first_bit(backend->host_nodes, MAX_NODES);
94     if (value == MAX_NODES) {
95         goto ret;
96     }
97 
98     QAPI_LIST_APPEND(tail, value);
99 
100     do {
101         value = find_next_bit(backend->host_nodes, MAX_NODES, value + 1);
102         if (value == MAX_NODES) {
103             break;
104         }
105 
106         QAPI_LIST_APPEND(tail, value);
107     } while (true);
108 
109 ret:
110     visit_type_uint16List(v, name, &host_nodes, errp);
111     qapi_free_uint16List(host_nodes);
112 }
113 
114 static void
115 host_memory_backend_set_host_nodes(Object *obj, Visitor *v, const char *name,
116                                    void *opaque, Error **errp)
117 {
118 #ifdef CONFIG_NUMA
119     HostMemoryBackend *backend = MEMORY_BACKEND(obj);
120     uint16List *l, *host_nodes = NULL;
121 
122     visit_type_uint16List(v, name, &host_nodes, errp);
123 
124     for (l = host_nodes; l; l = l->next) {
125         if (l->value >= MAX_NODES) {
126             error_setg(errp, "Invalid host-nodes value: %d", l->value);
127             goto out;
128         }
129     }
130 
131     for (l = host_nodes; l; l = l->next) {
132         bitmap_set(backend->host_nodes, l->value, 1);
133     }
134 
135 out:
136     qapi_free_uint16List(host_nodes);
137 #else
138     error_setg(errp, "NUMA node binding are not supported by this QEMU");
139 #endif
140 }
141 
142 static int
143 host_memory_backend_get_policy(Object *obj, Error **errp G_GNUC_UNUSED)
144 {
145     HostMemoryBackend *backend = MEMORY_BACKEND(obj);
146     return backend->policy;
147 }
148 
149 static void
150 host_memory_backend_set_policy(Object *obj, int policy, Error **errp)
151 {
152     HostMemoryBackend *backend = MEMORY_BACKEND(obj);
153     backend->policy = policy;
154 
155 #ifndef CONFIG_NUMA
156     if (policy != HOST_MEM_POLICY_DEFAULT) {
157         error_setg(errp, "NUMA policies are not supported by this QEMU");
158     }
159 #endif
160 }
161 
162 static bool host_memory_backend_get_merge(Object *obj, Error **errp)
163 {
164     HostMemoryBackend *backend = MEMORY_BACKEND(obj);
165 
166     return backend->merge;
167 }
168 
169 static void host_memory_backend_set_merge(Object *obj, bool value, Error **errp)
170 {
171     HostMemoryBackend *backend = MEMORY_BACKEND(obj);
172 
173     if (QEMU_MADV_MERGEABLE == QEMU_MADV_INVALID) {
174         if (value) {
175             error_setg(errp, "Memory merging is not supported on this host");
176         }
177         assert(!backend->merge);
178         return;
179     }
180 
181     if (!host_memory_backend_mr_inited(backend) &&
182         value != backend->merge) {
183         void *ptr = memory_region_get_ram_ptr(&backend->mr);
184         uint64_t sz = memory_region_size(&backend->mr);
185 
186         qemu_madvise(ptr, sz,
187                      value ? QEMU_MADV_MERGEABLE : QEMU_MADV_UNMERGEABLE);
188     }
189 
190     backend->merge = value;
191 }
192 
193 static bool host_memory_backend_get_dump(Object *obj, Error **errp)
194 {
195     HostMemoryBackend *backend = MEMORY_BACKEND(obj);
196 
197     return backend->dump;
198 }
199 
200 static void host_memory_backend_set_dump(Object *obj, bool value, Error **errp)
201 {
202     HostMemoryBackend *backend = MEMORY_BACKEND(obj);
203 
204     if (QEMU_MADV_DONTDUMP == QEMU_MADV_INVALID) {
205         if (!value) {
206             error_setg(errp, "Dumping guest memory cannot be disabled on this host");
207         }
208         assert(backend->dump);
209         return;
210     }
211 
212     if (host_memory_backend_mr_inited(backend) &&
213         value != backend->dump) {
214         void *ptr = memory_region_get_ram_ptr(&backend->mr);
215         uint64_t sz = memory_region_size(&backend->mr);
216 
217         qemu_madvise(ptr, sz,
218                      value ? QEMU_MADV_DODUMP : QEMU_MADV_DONTDUMP);
219     }
220 
221     backend->dump = value;
222 }
223 
224 static bool host_memory_backend_get_prealloc(Object *obj, Error **errp)
225 {
226     HostMemoryBackend *backend = MEMORY_BACKEND(obj);
227 
228     return backend->prealloc;
229 }
230 
231 static void host_memory_backend_set_prealloc(Object *obj, bool value,
232                                              Error **errp)
233 {
234     HostMemoryBackend *backend = MEMORY_BACKEND(obj);
235 
236     if (!backend->reserve && value) {
237         error_setg(errp, "'prealloc=on' and 'reserve=off' are incompatible");
238         return;
239     }
240 
241     if (!host_memory_backend_mr_inited(backend)) {
242         backend->prealloc = value;
243         return;
244     }
245 
246     if (value && !backend->prealloc) {
247         int fd = memory_region_get_fd(&backend->mr);
248         void *ptr = memory_region_get_ram_ptr(&backend->mr);
249         uint64_t sz = memory_region_size(&backend->mr);
250 
251         if (!qemu_prealloc_mem(fd, ptr, sz, backend->prealloc_threads,
252                                backend->prealloc_context, false, errp)) {
253             return;
254         }
255         backend->prealloc = true;
256     }
257 }
258 
259 static void host_memory_backend_get_prealloc_threads(Object *obj, Visitor *v,
260     const char *name, void *opaque, Error **errp)
261 {
262     HostMemoryBackend *backend = MEMORY_BACKEND(obj);
263     visit_type_uint32(v, name, &backend->prealloc_threads, errp);
264 }
265 
266 static void host_memory_backend_set_prealloc_threads(Object *obj, Visitor *v,
267     const char *name, void *opaque, Error **errp)
268 {
269     HostMemoryBackend *backend = MEMORY_BACKEND(obj);
270     uint32_t value;
271 
272     if (!visit_type_uint32(v, name, &value, errp)) {
273         return;
274     }
275     if (value <= 0) {
276         error_setg(errp, "property '%s' of %s doesn't take value '%d'", name,
277                    object_get_typename(obj), value);
278         return;
279     }
280     backend->prealloc_threads = value;
281 }
282 
283 static void host_memory_backend_init(Object *obj)
284 {
285     HostMemoryBackend *backend = MEMORY_BACKEND(obj);
286     MachineState *machine = MACHINE(qdev_get_machine());
287 
288     /* TODO: convert access to globals to compat properties */
289     backend->merge = machine_mem_merge(machine);
290     backend->dump = machine_dump_guest_core(machine);
291     backend->guest_memfd = machine_require_guest_memfd(machine);
292     backend->reserve = true;
293     backend->prealloc_threads = machine->smp.cpus;
294 }
295 
/*
 * Instance post-init hook: applies compat properties to the freshly
 * initialized object, after host_memory_backend_init() has set defaults.
 */
static void host_memory_backend_post_init(Object *obj)
{
    object_apply_compat_props(obj);
}
300 
301 bool host_memory_backend_mr_inited(HostMemoryBackend *backend)
302 {
303     /*
304      * NOTE: We forbid zero-length memory backend, so here zero means
305      * "we haven't inited the backend memory region yet".
306      */
307     return memory_region_size(&backend->mr) != 0;
308 }
309 
310 MemoryRegion *host_memory_backend_get_memory(HostMemoryBackend *backend)
311 {
312     return host_memory_backend_mr_inited(backend) ? &backend->mr : NULL;
313 }
314 
/*
 * Record whether @backend is currently mapped.  The flag is consulted by
 * host_memory_backend_can_be_deleted() to refuse deletion while mapped.
 */
void host_memory_backend_set_mapped(HostMemoryBackend *backend, bool mapped)
{
    backend->is_mapped = mapped;
}
319 
/* Return the mapped flag last stored by host_memory_backend_set_mapped(). */
bool host_memory_backend_is_mapped(HostMemoryBackend *backend)
{
    return backend->is_mapped;
}
324 
325 size_t host_memory_backend_pagesize(HostMemoryBackend *memdev)
326 {
327     size_t pagesize = qemu_ram_pagesize(memdev->mr.ram_block);
328     g_assert(pagesize >= qemu_real_host_page_size());
329     return pagesize;
330 }
331 
/*
 * UserCreatable "complete" handler, installed as ucc->complete by the
 * class initializer.
 *
 * Asks the concrete backend class to allocate the memory region through
 * bc->alloc() (a class without an alloc hook makes this a no-op), then
 * applies the configured properties to the new RAM in this order:
 *   1. size alignment check (only when backend->aligned is set),
 *   2. merge / dump madvise() hints,
 *   3. NUMA policy via mbind() (CONFIG_NUMA builds only),
 *   4. preallocation.
 *
 * On failure @errp is set and the function returns early.  Nothing in this
 * function undoes the allocation performed by bc->alloc().
 */
static void
host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(uc);
    HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc);
    void *ptr;
    uint64_t sz;
    size_t pagesize;
    /*
     * Passed as the second-to-last argument of qemu_prealloc_mem() below;
     * true only while PHASE_LATE_BACKENDS_CREATED has not been reached.
     */
    bool async = !phase_check(PHASE_LATE_BACKENDS_CREATED);

    if (!bc->alloc) {
        return;
    }
    if (!bc->alloc(backend, errp)) {
        return;
    }

    /* The region exists from here on; fetch what the steps below need. */
    ptr = memory_region_get_ram_ptr(&backend->mr);
    sz = memory_region_size(&backend->mr);
    pagesize = qemu_ram_pagesize(backend->mr.ram_block);

    if (backend->aligned && !QEMU_IS_ALIGNED(sz, pagesize)) {
        g_autofree char *pagesize_str = size_to_str(pagesize);
        error_setg(errp, "backend '%s' memory size must be multiple of %s",
                   object_get_typename(OBJECT(uc)), pagesize_str);
        return;
    }

    /* Apply the stored merge/dump flags; madvise() results are ignored. */
    if (backend->merge) {
        qemu_madvise(ptr, sz, QEMU_MADV_MERGEABLE);
    }
    if (!backend->dump) {
        qemu_madvise(ptr, sz, QEMU_MADV_DONTDUMP);
    }
#ifdef CONFIG_NUMA
    unsigned long lastbit = find_last_bit(backend->host_nodes, MAX_NODES);
    /* lastbit == MAX_NODES means maxnode = 0 */
    unsigned long maxnode = (lastbit + 1) % (MAX_NODES + 1);
    /*
     * Ensure policy won't be ignored in case memory is preallocated
     * before mbind(). note: MPOL_MF_STRICT is ignored on hugepages so
     * this doesn't catch hugepage case.
     */
    unsigned flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
    int mode = backend->policy;

    /* check for invalid host-nodes and policies and give more verbose
     * error messages than mbind(). */
    if (maxnode && backend->policy == MPOL_DEFAULT) {
        error_setg(errp, "host-nodes must be empty for policy default,"
                   " or you should explicitly specify a policy other"
                   " than default");
        return;
    } else if (maxnode == 0 && backend->policy != MPOL_DEFAULT) {
        error_setg(errp, "host-nodes must be set for policy %s",
                   HostMemPolicy_str(backend->policy));
        return;
    }

    /*
     * We can have up to MAX_NODES nodes, but we need to pass maxnode+1
     * as argument to mbind() due to an old Linux bug (feature?) which
     * cuts off the last specified node. This means backend->host_nodes
     * must have MAX_NODES+1 bits available.
     */
    assert(sizeof(backend->host_nodes) >=
           BITS_TO_LONGS(MAX_NODES + 1) * sizeof(unsigned long));
    assert(maxnode <= MAX_NODES);

#ifdef HAVE_NUMA_HAS_PREFERRED_MANY
    if (mode == MPOL_PREFERRED && numa_has_preferred_many() > 0) {
        /*
         * Replace with MPOL_PREFERRED_MANY otherwise the mbind() below
         * silently picks the first node.
         */
        mode = MPOL_PREFERRED_MANY;
    }
#endif

    /*
     * mbind() is only attempted when at least one host node is set.  After
     * the checks above that implies a non-default policy, so any mbind()
     * failure is reported.
     */
    if (maxnode &&
        mbind(ptr, sz, mode, backend->host_nodes, maxnode + 1, flags)) {
        if (backend->policy != MPOL_DEFAULT || errno != ENOSYS) {
            error_setg_errno(errp, errno,
                             "cannot bind memory to host NUMA nodes");
            return;
        }
    }
#endif
    /*
     * Preallocate memory after the NUMA policy has been instantiated.
     * This is necessary to guarantee memory is allocated with
     * specified NUMA policy in place.
     */
    if (backend->prealloc && !qemu_prealloc_mem(memory_region_get_fd(&backend->mr),
                                                ptr, sz,
                                                backend->prealloc_threads,
                                                backend->prealloc_context,
                                                async, errp)) {
        return;
    }
}
433 
434 static bool
435 host_memory_backend_can_be_deleted(UserCreatable *uc)
436 {
437     if (host_memory_backend_is_mapped(MEMORY_BACKEND(uc))) {
438         return false;
439     } else {
440         return true;
441     }
442 }
443 
444 static bool host_memory_backend_get_share(Object *o, Error **errp)
445 {
446     HostMemoryBackend *backend = MEMORY_BACKEND(o);
447 
448     return backend->share;
449 }
450 
451 static void host_memory_backend_set_share(Object *o, bool value, Error **errp)
452 {
453     HostMemoryBackend *backend = MEMORY_BACKEND(o);
454 
455     if (host_memory_backend_mr_inited(backend)) {
456         error_setg(errp, "cannot change property value");
457         return;
458     }
459     backend->share = value;
460 }
461 
462 #ifdef CONFIG_LINUX
463 static bool host_memory_backend_get_reserve(Object *o, Error **errp)
464 {
465     HostMemoryBackend *backend = MEMORY_BACKEND(o);
466 
467     return backend->reserve;
468 }
469 
470 static void host_memory_backend_set_reserve(Object *o, bool value, Error **errp)
471 {
472     HostMemoryBackend *backend = MEMORY_BACKEND(o);
473 
474     if (host_memory_backend_mr_inited(backend)) {
475         error_setg(errp, "cannot change property value");
476         return;
477     }
478     if (backend->prealloc && !value) {
479         error_setg(errp, "'prealloc=on' and 'reserve=off' are incompatible");
480         return;
481     }
482     backend->reserve = value;
483 }
484 #endif /* CONFIG_LINUX */
485 
486 static bool
487 host_memory_backend_get_use_canonical_path(Object *obj, Error **errp)
488 {
489     HostMemoryBackend *backend = MEMORY_BACKEND(obj);
490 
491     return backend->use_canonical_path;
492 }
493 
494 static void
495 host_memory_backend_set_use_canonical_path(Object *obj, bool value,
496                                            Error **errp)
497 {
498     HostMemoryBackend *backend = MEMORY_BACKEND(obj);
499 
500     backend->use_canonical_path = value;
501 }
502 
503 static void
504 host_memory_backend_class_init(ObjectClass *oc, void *data)
505 {
506     UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc);
507 
508     ucc->complete = host_memory_backend_memory_complete;
509     ucc->can_be_deleted = host_memory_backend_can_be_deleted;
510 
511     object_class_property_add_bool(oc, "merge",
512         host_memory_backend_get_merge,
513         host_memory_backend_set_merge);
514     object_class_property_set_description(oc, "merge",
515         "Mark memory as mergeable");
516     object_class_property_add_bool(oc, "dump",
517         host_memory_backend_get_dump,
518         host_memory_backend_set_dump);
519     object_class_property_set_description(oc, "dump",
520         "Set to 'off' to exclude from core dump");
521     object_class_property_add_bool(oc, "prealloc",
522         host_memory_backend_get_prealloc,
523         host_memory_backend_set_prealloc);
524     object_class_property_set_description(oc, "prealloc",
525         "Preallocate memory");
526     object_class_property_add(oc, "prealloc-threads", "int",
527         host_memory_backend_get_prealloc_threads,
528         host_memory_backend_set_prealloc_threads,
529         NULL, NULL);
530     object_class_property_set_description(oc, "prealloc-threads",
531         "Number of CPU threads to use for prealloc");
532     object_class_property_add_link(oc, "prealloc-context",
533         TYPE_THREAD_CONTEXT, offsetof(HostMemoryBackend, prealloc_context),
534         object_property_allow_set_link, OBJ_PROP_LINK_STRONG);
535     object_class_property_set_description(oc, "prealloc-context",
536         "Context to use for creating CPU threads for preallocation");
537     object_class_property_add(oc, "size", "int",
538         host_memory_backend_get_size,
539         host_memory_backend_set_size,
540         NULL, NULL);
541     object_class_property_set_description(oc, "size",
542         "Size of the memory region (ex: 500M)");
543     object_class_property_add(oc, "host-nodes", "int",
544         host_memory_backend_get_host_nodes,
545         host_memory_backend_set_host_nodes,
546         NULL, NULL);
547     object_class_property_set_description(oc, "host-nodes",
548         "Binds memory to the list of NUMA host nodes");
549     object_class_property_add_enum(oc, "policy", "HostMemPolicy",
550         &HostMemPolicy_lookup,
551         host_memory_backend_get_policy,
552         host_memory_backend_set_policy);
553     object_class_property_set_description(oc, "policy",
554         "Set the NUMA policy");
555     object_class_property_add_bool(oc, "share",
556         host_memory_backend_get_share, host_memory_backend_set_share);
557     object_class_property_set_description(oc, "share",
558         "Mark the memory as private to QEMU or shared");
559 #ifdef CONFIG_LINUX
560     object_class_property_add_bool(oc, "reserve",
561         host_memory_backend_get_reserve, host_memory_backend_set_reserve);
562     object_class_property_set_description(oc, "reserve",
563         "Reserve swap space (or huge pages) if applicable");
564 #endif /* CONFIG_LINUX */
565     /*
566      * Do not delete/rename option. This option must be considered stable
567      * (as if it didn't have the 'x-' prefix including deprecation period) as
568      * long as 4.0 and older machine types exists.
569      * Option will be used by upper layers to override (disable) canonical path
570      * for ramblock-id set by compat properties on old machine types ( <= 4.0),
571      * to keep migration working when backend is used for main RAM with
572      * -machine memory-backend= option (main RAM historically used prefix-less
573      * ramblock-id).
574      */
575     object_class_property_add_bool(oc, "x-use-canonical-path-for-ramblock-id",
576         host_memory_backend_get_use_canonical_path,
577         host_memory_backend_set_use_canonical_path);
578 }
579 
/*
 * QOM type description for TYPE_MEMORY_BACKEND.  The type is marked
 * abstract, derives from TYPE_OBJECT and implements the TYPE_USER_CREATABLE
 * interface; the hooks point at the init functions defined in this file.
 */
static const TypeInfo host_memory_backend_info = {
    .name = TYPE_MEMORY_BACKEND,
    .parent = TYPE_OBJECT,
    .abstract = true,
    .class_size = sizeof(HostMemoryBackendClass),
    .class_init = host_memory_backend_class_init,
    .instance_size = sizeof(HostMemoryBackend),
    .instance_init = host_memory_backend_init,
    .instance_post_init = host_memory_backend_post_init,
    .interfaces = (InterfaceInfo[]) {
        { TYPE_USER_CREATABLE },
        { }     /* empty entry terminates the interface list */
    }
};
594 
/* Register the host memory backend base type with QOM. */
static void register_types(void)
{
    type_register_static(&host_memory_backend_info);
}

/* Hook register_types() into QEMU's type initialization. */
type_init(register_types);
601