xref: /openbmc/qemu/hw/mem/memory-device.c (revision 0d67249c6d30a626434815c4fc39ab6bc60708f6)
1 /*
2  * Memory Device Interface
3  *
4  * Copyright ProfitBricks GmbH 2012
5  * Copyright (C) 2014 Red Hat Inc
6  * Copyright (c) 2018 Red Hat Inc
7  *
8  * This work is licensed under the terms of the GNU GPL, version 2 or later.
9  * See the COPYING file in the top-level directory.
10  */
11 
12 #include "qemu/osdep.h"
13 #include "qemu/error-report.h"
14 #include "hw/mem/memory-device.h"
15 #include "qapi/error.h"
16 #include "hw/boards.h"
17 #include "qemu/range.h"
18 #include "hw/virtio/vhost.h"
19 #include "sysemu/kvm.h"
20 #include "exec/address-spaces.h"
21 #include "trace.h"
22 
23 static gint memory_device_addr_sort(gconstpointer a, gconstpointer b)
24 {
25     const MemoryDeviceState *md_a = MEMORY_DEVICE(a);
26     const MemoryDeviceState *md_b = MEMORY_DEVICE(b);
27     const MemoryDeviceClass *mdc_a = MEMORY_DEVICE_GET_CLASS(a);
28     const MemoryDeviceClass *mdc_b = MEMORY_DEVICE_GET_CLASS(b);
29     const uint64_t addr_a = mdc_a->get_addr(md_a);
30     const uint64_t addr_b = mdc_b->get_addr(md_b);
31 
32     if (addr_a > addr_b) {
33         return 1;
34     } else if (addr_a < addr_b) {
35         return -1;
36     }
37     return 0;
38 }
39 
40 static int memory_device_build_list(Object *obj, void *opaque)
41 {
42     GSList **list = opaque;
43 
44     if (object_dynamic_cast(obj, TYPE_MEMORY_DEVICE)) {
45         DeviceState *dev = DEVICE(obj);
46         if (dev->realized) { /* only realized memory devices matter */
47             *list = g_slist_insert_sorted(*list, dev, memory_device_addr_sort);
48         }
49     }
50 
51     object_child_foreach(obj, memory_device_build_list, opaque);
52     return 0;
53 }
54 
55 static unsigned int memory_device_get_memslots(MemoryDeviceState *md)
56 {
57     const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
58 
59     if (mdc->get_memslots) {
60         return mdc->get_memslots(md);
61     }
62     return 1;
63 }
64 
65 /*
66  * Memslots that are reserved by memory devices (required but still reported
67  * as free from KVM / vhost).
68  */
69 static unsigned int get_reserved_memslots(MachineState *ms)
70 {
71     if (ms->device_memory->used_memslots >
72         ms->device_memory->required_memslots) {
73         /* This is unexpected, and we warned already in the memory notifier. */
74         return 0;
75     }
76     return ms->device_memory->required_memslots -
77            ms->device_memory->used_memslots;
78 }
79 
80 unsigned int memory_devices_get_reserved_memslots(void)
81 {
82     if (!current_machine->device_memory) {
83         return 0;
84     }
85     return get_reserved_memslots(current_machine);
86 }
87 
88 bool memory_devices_memslot_auto_decision_active(void)
89 {
90     if (!current_machine->device_memory) {
91         return false;
92     }
93 
94     return current_machine->device_memory->memslot_auto_decision_active;
95 }
96 
97 static unsigned int memory_device_memslot_decision_limit(MachineState *ms,
98                                                          MemoryRegion *mr)
99 {
100     const unsigned int reserved = get_reserved_memslots(ms);
101     const uint64_t size = memory_region_size(mr);
102     unsigned int max = vhost_get_max_memslots();
103     unsigned int free = vhost_get_free_memslots();
104     uint64_t available_space;
105     unsigned int memslots;
106 
107     if (kvm_enabled()) {
108         max = MIN(max, kvm_get_max_memslots());
109         free = MIN(free, kvm_get_free_memslots());
110     }
111 
112     /*
113      * If we only have less overall memslots than what we consider reasonable,
114      * just keep it to a minimum.
115      */
116     if (max < MEMORY_DEVICES_SAFE_MAX_MEMSLOTS) {
117         return 1;
118     }
119 
120     /*
121      * Consider our soft-limit across all memory devices. We don't really
122      * expect to exceed this limit in reasonable configurations.
123      */
124     if (MEMORY_DEVICES_SOFT_MEMSLOT_LIMIT <=
125         ms->device_memory->required_memslots) {
126         return 1;
127     }
128     memslots = MEMORY_DEVICES_SOFT_MEMSLOT_LIMIT -
129                ms->device_memory->required_memslots;
130 
131     /*
132      * Consider the actually still free memslots. This is only relevant if
133      * other memslot consumers would consume *significantly* more memslots than
134      * what we prepared for (> 253). Unlikely, but let's just handle it
135      * cleanly.
136      */
137     memslots = MIN(memslots, free - reserved);
138     if (memslots < 1 || unlikely(free < reserved)) {
139         return 1;
140     }
141 
142     /* We cannot have any other memory devices? So give all to this device. */
143     if (size == ms->maxram_size - ms->ram_size) {
144         return memslots;
145     }
146 
147     /*
148      * Simple heuristic: equally distribute the memslots over the space
149      * still available for memory devices.
150      */
151     available_space = ms->maxram_size - ms->ram_size -
152                       ms->device_memory->used_region_size;
153     memslots = (double)memslots * size / available_space;
154     return memslots < 1 ? 1 : memslots;
155 }
156 
157 static void memory_device_check_addable(MachineState *ms, MemoryDeviceState *md,
158                                         MemoryRegion *mr, Error **errp)
159 {
160     const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
161     const uint64_t used_region_size = ms->device_memory->used_region_size;
162     const uint64_t size = memory_region_size(mr);
163     const unsigned int reserved_memslots = get_reserved_memslots(ms);
164     unsigned int required_memslots, memslot_limit;
165 
166     /*
167      * Instruct the device to decide how many memslots to use, if applicable,
168      * before we query the number of required memslots the first time.
169      */
170     if (mdc->decide_memslots) {
171         memslot_limit = memory_device_memslot_decision_limit(ms, mr);
172         mdc->decide_memslots(md, memslot_limit);
173     }
174     required_memslots = memory_device_get_memslots(md);
175 
176     /* we will need memory slots for kvm and vhost */
177     if (kvm_enabled() &&
178         kvm_get_free_memslots() < required_memslots + reserved_memslots) {
179         error_setg(errp, "hypervisor has not enough free memory slots left");
180         return;
181     }
182     if (vhost_get_free_memslots() < required_memslots + reserved_memslots) {
183         error_setg(errp, "a used vhost backend has not enough free memory slots left");
184         return;
185     }
186 
187     /* will we exceed the total amount of memory specified */
188     if (used_region_size + size < used_region_size ||
189         used_region_size + size > ms->maxram_size - ms->ram_size) {
190         error_setg(errp, "not enough space, currently 0x%" PRIx64
191                    " in use of total space for memory devices 0x" RAM_ADDR_FMT,
192                    used_region_size, ms->maxram_size - ms->ram_size);
193         return;
194     }
195 
196 }
197 
198 static uint64_t memory_device_get_free_addr(MachineState *ms,
199                                             const uint64_t *hint,
200                                             uint64_t align, uint64_t size,
201                                             Error **errp)
202 {
203     GSList *list = NULL, *item;
204     Range as, new = range_empty;
205 
206     range_init_nofail(&as, ms->device_memory->base,
207                       memory_region_size(&ms->device_memory->mr));
208 
209     /* start of address space indicates the maximum alignment we expect */
210     if (!QEMU_IS_ALIGNED(range_lob(&as), align)) {
211         warn_report("the alignment (0x%" PRIx64 ") exceeds the expected"
212                     " maximum alignment, memory will get fragmented and not"
213                     " all 'maxmem' might be usable for memory devices.",
214                     align);
215     }
216 
217     if (hint && !QEMU_IS_ALIGNED(*hint, align)) {
218         error_setg(errp, "address must be aligned to 0x%" PRIx64 " bytes",
219                    align);
220         return 0;
221     }
222 
223     if (!QEMU_IS_ALIGNED(size, align)) {
224         error_setg(errp, "backend memory size must be multiple of 0x%"
225                    PRIx64, align);
226         return 0;
227     }
228 
229     if (hint) {
230         if (range_init(&new, *hint, size) || !range_contains_range(&as, &new)) {
231             error_setg(errp, "can't add memory device [0x%" PRIx64 ":0x%" PRIx64
232                        "], usable range for memory devices [0x%" PRIx64 ":0x%"
233                        PRIx64 "]", *hint, size, range_lob(&as),
234                        range_size(&as));
235             return 0;
236         }
237     } else {
238         if (range_init(&new, QEMU_ALIGN_UP(range_lob(&as), align), size)) {
239             error_setg(errp, "can't add memory device, device too big");
240             return 0;
241         }
242     }
243 
244     /* find address range that will fit new memory device */
245     object_child_foreach(OBJECT(ms), memory_device_build_list, &list);
246     for (item = list; item; item = g_slist_next(item)) {
247         const MemoryDeviceState *md = item->data;
248         const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(OBJECT(md));
249         uint64_t next_addr;
250         Range tmp;
251 
252         range_init_nofail(&tmp, mdc->get_addr(md),
253                           memory_device_get_region_size(md, &error_abort));
254 
255         if (range_overlaps_range(&tmp, &new)) {
256             if (hint) {
257                 const DeviceState *d = DEVICE(md);
258                 error_setg(errp, "address range conflicts with memory device"
259                            " id='%s'", d->id ? d->id : "(unnamed)");
260                 goto out;
261             }
262 
263             next_addr = QEMU_ALIGN_UP(range_upb(&tmp) + 1, align);
264             if (!next_addr || range_init(&new, next_addr, range_size(&new))) {
265                 range_make_empty(&new);
266                 break;
267             }
268         } else if (range_lob(&tmp) > range_upb(&new)) {
269             break;
270         }
271     }
272 
273     if (!range_contains_range(&as, &new)) {
274         error_setg(errp, "could not find position in guest address space for "
275                    "memory device - memory fragmented due to alignments");
276     }
277 out:
278     g_slist_free(list);
279     return range_lob(&new);
280 }
281 
282 MemoryDeviceInfoList *qmp_memory_device_list(void)
283 {
284     GSList *devices = NULL, *item;
285     MemoryDeviceInfoList *list = NULL, **tail = &list;
286 
287     object_child_foreach(qdev_get_machine(), memory_device_build_list,
288                          &devices);
289 
290     for (item = devices; item; item = g_slist_next(item)) {
291         const MemoryDeviceState *md = MEMORY_DEVICE(item->data);
292         const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(item->data);
293         MemoryDeviceInfo *info = g_new0(MemoryDeviceInfo, 1);
294 
295         mdc->fill_device_info(md, info);
296 
297         QAPI_LIST_APPEND(tail, info);
298     }
299 
300     g_slist_free(devices);
301 
302     return list;
303 }
304 
305 static int memory_device_plugged_size(Object *obj, void *opaque)
306 {
307     uint64_t *size = opaque;
308 
309     if (object_dynamic_cast(obj, TYPE_MEMORY_DEVICE)) {
310         const DeviceState *dev = DEVICE(obj);
311         const MemoryDeviceState *md = MEMORY_DEVICE(obj);
312         const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(obj);
313 
314         if (dev->realized) {
315             *size += mdc->get_plugged_size(md, &error_abort);
316         }
317     }
318 
319     object_child_foreach(obj, memory_device_plugged_size, opaque);
320     return 0;
321 }
322 
323 uint64_t get_plugged_memory_size(void)
324 {
325     uint64_t size = 0;
326 
327     memory_device_plugged_size(qdev_get_machine(), &size);
328 
329     return size;
330 }
331 
332 void memory_device_pre_plug(MemoryDeviceState *md, MachineState *ms,
333                             const uint64_t *legacy_align, Error **errp)
334 {
335     const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
336     Error *local_err = NULL;
337     uint64_t addr, align = 0;
338     MemoryRegion *mr;
339 
340     if (!ms->device_memory) {
341         error_setg(errp, "the configuration is not prepared for memory devices"
342                          " (e.g., for memory hotplug), consider specifying the"
343                          " maxmem option");
344         return;
345     }
346 
347     mr = mdc->get_memory_region(md, &local_err);
348     if (local_err) {
349         goto out;
350     }
351 
352     memory_device_check_addable(ms, md, mr, &local_err);
353     if (local_err) {
354         goto out;
355     }
356 
357     if (legacy_align) {
358         align = *legacy_align;
359     } else {
360         if (mdc->get_min_alignment) {
361             align = mdc->get_min_alignment(md);
362         }
363         align = MAX(align, memory_region_get_alignment(mr));
364     }
365     addr = mdc->get_addr(md);
366     addr = memory_device_get_free_addr(ms, !addr ? NULL : &addr, align,
367                                        memory_region_size(mr), &local_err);
368     if (local_err) {
369         goto out;
370     }
371     mdc->set_addr(md, addr, &local_err);
372     if (!local_err) {
373         trace_memory_device_pre_plug(DEVICE(md)->id ? DEVICE(md)->id : "",
374                                      addr);
375     }
376 out:
377     error_propagate(errp, local_err);
378 }
379 
380 void memory_device_plug(MemoryDeviceState *md, MachineState *ms)
381 {
382     const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
383     const unsigned int memslots = memory_device_get_memslots(md);
384     const uint64_t addr = mdc->get_addr(md);
385     MemoryRegion *mr;
386 
387     /*
388      * We expect that a previous call to memory_device_pre_plug() succeeded, so
389      * it can't fail at this point.
390      */
391     mr = mdc->get_memory_region(md, &error_abort);
392     g_assert(ms->device_memory);
393 
394     ms->device_memory->used_region_size += memory_region_size(mr);
395     ms->device_memory->required_memslots += memslots;
396     if (mdc->decide_memslots && memslots > 1) {
397         ms->device_memory->memslot_auto_decision_active++;
398     }
399 
400     memory_region_add_subregion(&ms->device_memory->mr,
401                                 addr - ms->device_memory->base, mr);
402     trace_memory_device_plug(DEVICE(md)->id ? DEVICE(md)->id : "", addr);
403 }
404 
405 void memory_device_unplug(MemoryDeviceState *md, MachineState *ms)
406 {
407     const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
408     const unsigned int memslots = memory_device_get_memslots(md);
409     MemoryRegion *mr;
410 
411     /*
412      * We expect that a previous call to memory_device_pre_plug() succeeded, so
413      * it can't fail at this point.
414      */
415     mr = mdc->get_memory_region(md, &error_abort);
416     g_assert(ms->device_memory);
417 
418     memory_region_del_subregion(&ms->device_memory->mr, mr);
419 
420     if (mdc->decide_memslots && memslots > 1) {
421         ms->device_memory->memslot_auto_decision_active--;
422     }
423     ms->device_memory->used_region_size -= memory_region_size(mr);
424     ms->device_memory->required_memslots -= memslots;
425     trace_memory_device_unplug(DEVICE(md)->id ? DEVICE(md)->id : "",
426                                mdc->get_addr(md));
427 }
428 
429 uint64_t memory_device_get_region_size(const MemoryDeviceState *md,
430                                        Error **errp)
431 {
432     const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
433     MemoryRegion *mr;
434 
435     /* dropping const here is fine as we don't touch the memory region */
436     mr = mdc->get_memory_region((MemoryDeviceState *)md, errp);
437     if (!mr) {
438         return 0;
439     }
440 
441     return memory_region_size(mr);
442 }
443 
444 static void memory_devices_region_mod(MemoryListener *listener,
445                                       MemoryRegionSection *mrs, bool add)
446 {
447     DeviceMemoryState *dms = container_of(listener, DeviceMemoryState,
448                                           listener);
449 
450     if (!memory_region_is_ram(mrs->mr)) {
451         warn_report("Unexpected memory region mapped into device memory region.");
452         return;
453     }
454 
455     /*
456      * The expectation is that each distinct RAM memory region section in
457      * our region for memory devices consumes exactly one memslot in KVM
458      * and in vhost. For vhost, this is true, except:
459      * * ROM memory regions don't consume a memslot. These get used very
460      *   rarely for memory devices (R/O NVDIMMs).
461      * * Memslots without a fd (memory-backend-ram) don't necessarily
462      *   consume a memslot. Such setups are quite rare and possibly bogus:
463      *   the memory would be inaccessible by such vhost devices.
464      *
465      * So for vhost, in corner cases we might over-estimate the number of
466      * memslots that are currently used or that might still be reserved
467      * (required - used).
468      */
469     dms->used_memslots += add ? 1 : -1;
470 
471     if (dms->used_memslots > dms->required_memslots) {
472         warn_report("Memory devices use more memory slots than indicated as required.");
473     }
474 }
475 
476 static void memory_devices_region_add(MemoryListener *listener,
477                                       MemoryRegionSection *mrs)
478 {
479     return memory_devices_region_mod(listener, mrs, true);
480 }
481 
482 static void memory_devices_region_del(MemoryListener *listener,
483                                       MemoryRegionSection *mrs)
484 {
485     return memory_devices_region_mod(listener, mrs, false);
486 }
487 
488 void machine_memory_devices_init(MachineState *ms, hwaddr base, uint64_t size)
489 {
490     g_assert(size);
491     g_assert(!ms->device_memory);
492     ms->device_memory = g_new0(DeviceMemoryState, 1);
493     ms->device_memory->base = base;
494 
495     memory_region_init(&ms->device_memory->mr, OBJECT(ms), "device-memory",
496                        size);
497     address_space_init(&ms->device_memory->as, &ms->device_memory->mr,
498                        "device-memory");
499     memory_region_add_subregion(get_system_memory(), ms->device_memory->base,
500                                 &ms->device_memory->mr);
501 
502     /* Track the number of memslots used by memory devices. */
503     ms->device_memory->listener.region_add = memory_devices_region_add;
504     ms->device_memory->listener.region_del = memory_devices_region_del;
505     memory_listener_register(&ms->device_memory->listener,
506                              &ms->device_memory->as);
507 }
508 
509 static const TypeInfo memory_device_info = {
510     .name          = TYPE_MEMORY_DEVICE,
511     .parent        = TYPE_INTERFACE,
512     .class_size = sizeof(MemoryDeviceClass),
513 };
514 
515 static void memory_device_register_types(void)
516 {
517     type_register_static(&memory_device_info);
518 }
519 
520 type_init(memory_device_register_types)
521