/*
 * QEMU Host Memory Backend
 *
 * Copyright (C) 2013-2014 Red Hat Inc
 *
 * Authors:
 *   Igor Mammedov <imammedo@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "sysemu/hostmem.h"
#include "hw/boards.h"
#include "qapi/error.h"
#include "qapi/qapi-builtin-visit.h"
#include "qapi/visitor.h"
#include "qemu/config-file.h"
#include "qom/object_interfaces.h"
#include "qemu/mmap-alloc.h"
#include "qemu/madvise.h"
#include "hw/qdev-core.h"

#ifdef CONFIG_NUMA
#include <numaif.h>
#include <numa.h>
QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_DEFAULT != MPOL_DEFAULT);
/*
 * HOST_MEM_POLICY_PREFERRED may either translate to MPOL_PREFERRED or
 * MPOL_PREFERRED_MANY, see comments further below.
 */
QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_PREFERRED != MPOL_PREFERRED);
QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_BIND != MPOL_BIND);
QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_INTERLEAVE != MPOL_INTERLEAVE);
#endif

char *
host_memory_backend_get_name(HostMemoryBackend *backend)
{
    if (!backend->use_canonical_path) {
        return g_strdup(object_get_canonical_path_component(OBJECT(backend)));
    }

    return object_get_canonical_path(OBJECT(backend));
}
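
/*
 * Illustrative note (editorial, not from the original source): for a
 * backend created with "-object memory-backend-ram,id=mem0", the canonical
 * path is "/objects/mem0" while the path component is just "mem0"; the
 * short component is used unless x-use-canonical-path-for-ramblock-id is
 * enabled (see the property registration at the bottom of this file).
 */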

static void
host_memory_backend_get_size(Object *obj, Visitor *v, const char *name,
                             void *opaque, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    uint64_t value = backend->size;

    visit_type_size(v, name, &value, errp);
}

static void
host_memory_backend_set_size(Object *obj, Visitor *v, const char *name,
                             void *opaque, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    uint64_t value;

    if (host_memory_backend_mr_inited(backend)) {
        error_setg(errp, "cannot change property %s of %s", name,
                   object_get_typename(obj));
        return;
    }

    if (!visit_type_size(v, name, &value, errp)) {
        return;
    }
    if (!value) {
        error_setg(errp,
                   "property '%s' of %s doesn't take value '%" PRIu64 "'",
                   name, object_get_typename(obj), value);
        return;
    }
    backend->size = value;
}

static void
host_memory_backend_get_host_nodes(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    uint16List *host_nodes = NULL;
    uint16List **tail = &host_nodes;
    unsigned long value;

    value = find_first_bit(backend->host_nodes, MAX_NODES);
    if (value == MAX_NODES) {
        goto ret;
    }

    QAPI_LIST_APPEND(tail, value);

    do {
        value = find_next_bit(backend->host_nodes, MAX_NODES, value + 1);
        if (value == MAX_NODES) {
            break;
        }

        QAPI_LIST_APPEND(tail, value);
    } while (true);

ret:
    visit_type_uint16List(v, name, &host_nodes, errp);
    qapi_free_uint16List(host_nodes);
}
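
/*
 * Example (editorial): with "host-nodes=0,2" bits 0 and 2 are set in
 * backend->host_nodes, so the getter above walks the bitmap and emits
 * the QAPI list [0, 2].
 */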

static void
host_memory_backend_set_host_nodes(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
{
#ifdef CONFIG_NUMA
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    uint16List *l, *host_nodes = NULL;

    if (!visit_type_uint16List(v, name, &host_nodes, errp)) {
        return;
    }

    for (l = host_nodes; l; l = l->next) {
        if (l->value >= MAX_NODES) {
            error_setg(errp, "Invalid host-nodes value: %d", l->value);
            goto out;
        }
    }

    for (l = host_nodes; l; l = l->next) {
        bitmap_set(backend->host_nodes, l->value, 1);
    }

out:
    qapi_free_uint16List(host_nodes);
#else
    error_setg(errp, "NUMA node binding is not supported by this QEMU");
#endif
}

static int
host_memory_backend_get_policy(Object *obj, Error **errp G_GNUC_UNUSED)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    return backend->policy;
}

static void
host_memory_backend_set_policy(Object *obj, int policy, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    backend->policy = policy;

#ifndef CONFIG_NUMA
    if (policy != HOST_MEM_POLICY_DEFAULT) {
        error_setg(errp, "NUMA policies are not supported by this QEMU");
    }
#endif
}

static bool host_memory_backend_get_merge(Object *obj, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    return backend->merge;
}

static void host_memory_backend_set_merge(Object *obj, bool value, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    if (!host_memory_backend_mr_inited(backend)) {
        backend->merge = value;
        return;
    }

    if (value != backend->merge) {
        void *ptr = memory_region_get_ram_ptr(&backend->mr);
        uint64_t sz = memory_region_size(&backend->mr);

        qemu_madvise(ptr, sz,
                     value ? QEMU_MADV_MERGEABLE : QEMU_MADV_UNMERGEABLE);
        backend->merge = value;
    }
}

static bool host_memory_backend_get_dump(Object *obj, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    return backend->dump;
}

static void host_memory_backend_set_dump(Object *obj, bool value, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    if (!host_memory_backend_mr_inited(backend)) {
        backend->dump = value;
        return;
    }

    if (value != backend->dump) {
        void *ptr = memory_region_get_ram_ptr(&backend->mr);
        uint64_t sz = memory_region_size(&backend->mr);

        qemu_madvise(ptr, sz,
                     value ? QEMU_MADV_DODUMP : QEMU_MADV_DONTDUMP);
        backend->dump = value;
    }
}

static bool host_memory_backend_get_prealloc(Object *obj, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    return backend->prealloc;
}

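/*
 * Editorial note: once the memory region exists, the setter below only
 * handles the off-to-on transition, preallocating synchronously; setting
 * prealloc back to off leaves already-preallocated pages (and the flag)
 * untouched.
 */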
static void host_memory_backend_set_prealloc(Object *obj, bool value,
                                             Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    if (!backend->reserve && value) {
        error_setg(errp, "'prealloc=on' and 'reserve=off' are incompatible");
        return;
    }

    if (!host_memory_backend_mr_inited(backend)) {
        backend->prealloc = value;
        return;
    }

    if (value && !backend->prealloc) {
        int fd = memory_region_get_fd(&backend->mr);
        void *ptr = memory_region_get_ram_ptr(&backend->mr);
        uint64_t sz = memory_region_size(&backend->mr);

        if (!qemu_prealloc_mem(fd, ptr, sz, backend->prealloc_threads,
                               backend->prealloc_context, false, errp)) {
            return;
        }
        backend->prealloc = true;
    }
}

static void host_memory_backend_get_prealloc_threads(Object *obj, Visitor *v,
    const char *name, void *opaque, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    visit_type_uint32(v, name, &backend->prealloc_threads, errp);
}

static void host_memory_backend_set_prealloc_threads(Object *obj, Visitor *v,
    const char *name, void *opaque, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    uint32_t value;

    if (!visit_type_uint32(v, name, &value, errp)) {
        return;
    }
    if (value == 0) {
        error_setg(errp, "property '%s' of %s doesn't take value '%u'", name,
                   object_get_typename(obj), value);
        return;
    }
    backend->prealloc_threads = value;
}

static void host_memory_backend_init(Object *obj)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    MachineState *machine = MACHINE(qdev_get_machine());

    /* TODO: convert access to globals to compat properties */
    backend->merge = machine_mem_merge(machine);
    backend->dump = machine_dump_guest_core(machine);
    backend->reserve = true;
    backend->prealloc_threads = machine->smp.cpus;
}

static void host_memory_backend_post_init(Object *obj)
{
    object_apply_compat_props(obj);
}

bool host_memory_backend_mr_inited(HostMemoryBackend *backend)
{
    /*
     * NOTE: We forbid zero-length memory backends, so here zero means
     * "we haven't inited the backend memory region yet".
     */
    return memory_region_size(&backend->mr) != 0;
}

MemoryRegion *host_memory_backend_get_memory(HostMemoryBackend *backend)
{
    return host_memory_backend_mr_inited(backend) ? &backend->mr : NULL;
}

void host_memory_backend_set_mapped(HostMemoryBackend *backend, bool mapped)
{
    backend->is_mapped = mapped;
}

bool host_memory_backend_is_mapped(HostMemoryBackend *backend)
{
    return backend->is_mapped;
}

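/*
 * Editorial example: for a file-backed backend on a 2 MiB hugetlbfs
 * mount, the helper below returns 2 MiB; for anonymous RAM it returns
 * the host's base page size (e.g. 4 KiB on x86-64).
 */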
size_t host_memory_backend_pagesize(HostMemoryBackend *memdev)
{
    size_t pagesize = qemu_ram_pagesize(memdev->mr.ram_block);
    g_assert(pagesize >= qemu_real_host_page_size());
    return pagesize;
}

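/*
 * Editorial summary: ordering matters below. The RAM block is allocated
 * first, merge/dump madvise hints and the NUMA policy (mbind) are applied
 * next, and only then is memory preallocated, so that pages are faulted
 * in on the requested host nodes.
 */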
static void
host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(uc);
    HostMemoryBackendClass *bc = MEMORY_BACKEND_GET_CLASS(uc);
    void *ptr;
    uint64_t sz;
    bool async = !phase_check(PHASE_LATE_BACKENDS_CREATED);

    if (!bc->alloc) {
        return;
    }
    if (!bc->alloc(backend, errp)) {
        return;
    }

    ptr = memory_region_get_ram_ptr(&backend->mr);
    sz = memory_region_size(&backend->mr);

    if (backend->merge) {
        qemu_madvise(ptr, sz, QEMU_MADV_MERGEABLE);
    }
    if (!backend->dump) {
        qemu_madvise(ptr, sz, QEMU_MADV_DONTDUMP);
    }
#ifdef CONFIG_NUMA
    unsigned long lastbit = find_last_bit(backend->host_nodes, MAX_NODES);
    /* lastbit == MAX_NODES means maxnode = 0 */
    unsigned long maxnode = (lastbit + 1) % (MAX_NODES + 1);
    /*
     * Ensure the policy won't be ignored in case memory is preallocated
     * before mbind(). Note: MPOL_MF_STRICT is ignored on hugepages, so
     * this doesn't catch the hugepage case.
     */
    unsigned flags = MPOL_MF_STRICT | MPOL_MF_MOVE;
    int mode = backend->policy;

    /*
     * Check for invalid host-nodes and policies and give more verbose
     * error messages than mbind().
     */
    if (maxnode && backend->policy == MPOL_DEFAULT) {
        error_setg(errp, "host-nodes must be empty for policy default,"
                   " or you should explicitly specify a policy other"
                   " than default");
        return;
    } else if (maxnode == 0 && backend->policy != MPOL_DEFAULT) {
        error_setg(errp, "host-nodes must be set for policy %s",
                   HostMemPolicy_str(backend->policy));
        return;
    }

    /*
     * We can have up to MAX_NODES nodes, but we need to pass maxnode+1
     * as argument to mbind() due to an old Linux bug (feature?) which
     * cuts off the last specified node. This means backend->host_nodes
     * must have MAX_NODES+1 bits available.
     */
    assert(sizeof(backend->host_nodes) >=
           BITS_TO_LONGS(MAX_NODES + 1) * sizeof(unsigned long));
    assert(maxnode <= MAX_NODES);

#ifdef HAVE_NUMA_HAS_PREFERRED_MANY
    if (mode == MPOL_PREFERRED && numa_has_preferred_many() > 0) {
        /*
         * Replace with MPOL_PREFERRED_MANY, otherwise the mbind() below
         * silently picks the first node.
         */
        mode = MPOL_PREFERRED_MANY;
    }
#endif

    if (maxnode &&
        mbind(ptr, sz, mode, backend->host_nodes, maxnode + 1, flags)) {
        if (backend->policy != MPOL_DEFAULT || errno != ENOSYS) {
            error_setg_errno(errp, errno,
                             "cannot bind memory to host NUMA nodes");
            return;
        }
    }
#endif
    /*
     * Preallocate memory after the NUMA policy has been instantiated.
     * This is necessary to guarantee memory is allocated with the
     * specified NUMA policy in place.
     */
    if (backend->prealloc &&
        !qemu_prealloc_mem(memory_region_get_fd(&backend->mr), ptr, sz,
                           backend->prealloc_threads,
                           backend->prealloc_context, async, errp)) {
        return;
    }
}

static bool
host_memory_backend_can_be_deleted(UserCreatable *uc)
{
    return !host_memory_backend_is_mapped(MEMORY_BACKEND(uc));
}

static bool host_memory_backend_get_share(Object *o, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(o);

    return backend->share;
}

static void host_memory_backend_set_share(Object *o, bool value, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(o);

    if (host_memory_backend_mr_inited(backend)) {
        error_setg(errp, "cannot change property value");
        return;
    }
    backend->share = value;
}

#ifdef CONFIG_LINUX
static bool host_memory_backend_get_reserve(Object *o, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(o);

    return backend->reserve;
}

static void host_memory_backend_set_reserve(Object *o, bool value, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(o);

    if (host_memory_backend_mr_inited(backend)) {
        error_setg(errp, "cannot change property value");
        return;
    }
    if (backend->prealloc && !value) {
        error_setg(errp, "'prealloc=on' and 'reserve=off' are incompatible");
        return;
    }
    backend->reserve = value;
}
#endif /* CONFIG_LINUX */

static bool
host_memory_backend_get_use_canonical_path(Object *obj, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    return backend->use_canonical_path;
}

static void
host_memory_backend_set_use_canonical_path(Object *obj, bool value,
                                           Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);

    backend->use_canonical_path = value;
}

static void
host_memory_backend_class_init(ObjectClass *oc, void *data)
{
    UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc);

    ucc->complete = host_memory_backend_memory_complete;
    ucc->can_be_deleted = host_memory_backend_can_be_deleted;

    object_class_property_add_bool(oc, "merge",
        host_memory_backend_get_merge,
        host_memory_backend_set_merge);
    object_class_property_set_description(oc, "merge",
        "Mark memory as mergeable");
    object_class_property_add_bool(oc, "dump",
        host_memory_backend_get_dump,
        host_memory_backend_set_dump);
    object_class_property_set_description(oc, "dump",
        "Set to 'off' to exclude from core dump");
    object_class_property_add_bool(oc, "prealloc",
        host_memory_backend_get_prealloc,
        host_memory_backend_set_prealloc);
    object_class_property_set_description(oc, "prealloc",
        "Preallocate memory");
    object_class_property_add(oc, "prealloc-threads", "int",
        host_memory_backend_get_prealloc_threads,
        host_memory_backend_set_prealloc_threads,
        NULL, NULL);
    object_class_property_set_description(oc, "prealloc-threads",
        "Number of CPU threads to use for prealloc");
    object_class_property_add_link(oc, "prealloc-context",
        TYPE_THREAD_CONTEXT, offsetof(HostMemoryBackend, prealloc_context),
        object_property_allow_set_link, OBJ_PROP_LINK_STRONG);
    object_class_property_set_description(oc, "prealloc-context",
        "Context to use for creating CPU threads for preallocation");
    object_class_property_add(oc, "size", "int",
        host_memory_backend_get_size,
        host_memory_backend_set_size,
        NULL, NULL);
    object_class_property_set_description(oc, "size",
        "Size of the memory region (ex: 500M)");
    object_class_property_add(oc, "host-nodes", "int",
        host_memory_backend_get_host_nodes,
        host_memory_backend_set_host_nodes,
        NULL, NULL);
    object_class_property_set_description(oc, "host-nodes",
        "Binds memory to the list of NUMA host nodes");
    object_class_property_add_enum(oc, "policy", "HostMemPolicy",
        &HostMemPolicy_lookup,
        host_memory_backend_get_policy,
        host_memory_backend_set_policy);
    object_class_property_set_description(oc, "policy",
        "Set the NUMA policy");
    object_class_property_add_bool(oc, "share",
        host_memory_backend_get_share, host_memory_backend_set_share);
    object_class_property_set_description(oc, "share",
        "Mark the memory as private to QEMU or shared");
#ifdef CONFIG_LINUX
    object_class_property_add_bool(oc, "reserve",
        host_memory_backend_get_reserve, host_memory_backend_set_reserve);
    object_class_property_set_description(oc, "reserve",
        "Reserve swap space (or huge pages) if applicable");
#endif /* CONFIG_LINUX */
    /*
     * Do not delete or rename this option. It must be considered stable
     * (as if it did not have the 'x-' prefix, including the deprecation
     * period) as long as 4.0 and older machine types exist.
     * The option is used by upper layers to override (disable) the
     * canonical path for ramblock-id set by compat properties on old
     * machine types (<= 4.0), to keep migration working when the backend
     * is used for main RAM with the -machine memory-backend= option
     * (main RAM historically used a prefix-less ramblock-id).
     */
    object_class_property_add_bool(oc, "x-use-canonical-path-for-ramblock-id",
        host_memory_backend_get_use_canonical_path,
        host_memory_backend_set_use_canonical_path);
}
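
/*
 * Usage sketch (editorial, illustrative only): TYPE_MEMORY_BACKEND is
 * abstract; a concrete subclass such as memory-backend-ram exposes the
 * properties registered above on the command line, e.g.
 *
 *   -object memory-backend-ram,id=mem0,size=4G,policy=bind,host-nodes=0,prealloc=on
 *
 * "size" must be non-zero, and a policy other than "default" requires
 * host-nodes (enforced in the setters and in
 * host_memory_backend_memory_complete() above).
 */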

static const TypeInfo host_memory_backend_info = {
    .name = TYPE_MEMORY_BACKEND,
    .parent = TYPE_OBJECT,
    .abstract = true,
    .class_size = sizeof(HostMemoryBackendClass),
    .class_init = host_memory_backend_class_init,
    .instance_size = sizeof(HostMemoryBackend),
    .instance_init = host_memory_backend_init,
    .instance_post_init = host_memory_backend_post_init,
    .interfaces = (InterfaceInfo[]) {
        { TYPE_USER_CREATABLE },
        { }
    }
};

static void register_types(void)
{
    type_register_static(&host_memory_backend_info);
}

type_init(register_types);