xref: /openbmc/qemu/hw/mem/memory-device.c (revision 2df1eb27)
1 /*
2  * Memory Device Interface
3  *
4  * Copyright ProfitBricks GmbH 2012
5  * Copyright (C) 2014 Red Hat Inc
6  * Copyright (c) 2018 Red Hat Inc
7  *
8  * This work is licensed under the terms of the GNU GPL, version 2 or later.
9  * See the COPYING file in the top-level directory.
10  */
11 
12 #include "qemu/osdep.h"
13 #include "qemu/error-report.h"
14 #include "hw/mem/memory-device.h"
15 #include "qapi/error.h"
16 #include "hw/boards.h"
17 #include "qemu/range.h"
18 #include "hw/virtio/vhost.h"
19 #include "sysemu/kvm.h"
20 #include "exec/address-spaces.h"
21 #include "trace.h"
22 
23 static bool memory_device_is_empty(const MemoryDeviceState *md)
24 {
25     const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
26     Error *local_err = NULL;
27     MemoryRegion *mr;
28 
29     /* dropping const here is fine as we don't touch the memory region */
30     mr = mdc->get_memory_region((MemoryDeviceState *)md, &local_err);
31     if (local_err) {
32         /* Not empty, we'll report errors later when containing the MR again. */
33         error_free(local_err);
34         return false;
35     }
36     return !mr;
37 }
38 
39 static gint memory_device_addr_sort(gconstpointer a, gconstpointer b)
40 {
41     const MemoryDeviceState *md_a = MEMORY_DEVICE(a);
42     const MemoryDeviceState *md_b = MEMORY_DEVICE(b);
43     const MemoryDeviceClass *mdc_a = MEMORY_DEVICE_GET_CLASS(a);
44     const MemoryDeviceClass *mdc_b = MEMORY_DEVICE_GET_CLASS(b);
45     const uint64_t addr_a = mdc_a->get_addr(md_a);
46     const uint64_t addr_b = mdc_b->get_addr(md_b);
47 
48     if (addr_a > addr_b) {
49         return 1;
50     } else if (addr_a < addr_b) {
51         return -1;
52     }
53     return 0;
54 }
55 
56 static int memory_device_build_list(Object *obj, void *opaque)
57 {
58     GSList **list = opaque;
59 
60     if (object_dynamic_cast(obj, TYPE_MEMORY_DEVICE)) {
61         DeviceState *dev = DEVICE(obj);
62         if (dev->realized) { /* only realized memory devices matter */
63             *list = g_slist_insert_sorted(*list, dev, memory_device_addr_sort);
64         }
65     }
66 
67     object_child_foreach(obj, memory_device_build_list, opaque);
68     return 0;
69 }
70 
71 static unsigned int memory_device_get_memslots(MemoryDeviceState *md)
72 {
73     const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
74 
75     if (mdc->get_memslots) {
76         return mdc->get_memslots(md);
77     }
78     return 1;
79 }
80 
81 /*
82  * Memslots that are reserved by memory devices (required but still reported
83  * as free from KVM / vhost).
84  */
85 static unsigned int get_reserved_memslots(MachineState *ms)
86 {
87     if (ms->device_memory->used_memslots >
88         ms->device_memory->required_memslots) {
89         /* This is unexpected, and we warned already in the memory notifier. */
90         return 0;
91     }
92     return ms->device_memory->required_memslots -
93            ms->device_memory->used_memslots;
94 }
95 
96 unsigned int memory_devices_get_reserved_memslots(void)
97 {
98     if (!current_machine->device_memory) {
99         return 0;
100     }
101     return get_reserved_memslots(current_machine);
102 }
103 
104 bool memory_devices_memslot_auto_decision_active(void)
105 {
106     if (!current_machine->device_memory) {
107         return false;
108     }
109 
110     return current_machine->device_memory->memslot_auto_decision_active;
111 }
112 
113 static unsigned int memory_device_memslot_decision_limit(MachineState *ms,
114                                                          MemoryRegion *mr)
115 {
116     const unsigned int reserved = get_reserved_memslots(ms);
117     const uint64_t size = memory_region_size(mr);
118     unsigned int max = vhost_get_max_memslots();
119     unsigned int free = vhost_get_free_memslots();
120     uint64_t available_space;
121     unsigned int memslots;
122 
123     if (kvm_enabled()) {
124         max = MIN(max, kvm_get_max_memslots());
125         free = MIN(free, kvm_get_free_memslots());
126     }
127 
128     /*
129      * If we only have less overall memslots than what we consider reasonable,
130      * just keep it to a minimum.
131      */
132     if (max < MEMORY_DEVICES_SAFE_MAX_MEMSLOTS) {
133         return 1;
134     }
135 
136     /*
137      * Consider our soft-limit across all memory devices. We don't really
138      * expect to exceed this limit in reasonable configurations.
139      */
140     if (MEMORY_DEVICES_SOFT_MEMSLOT_LIMIT <=
141         ms->device_memory->required_memslots) {
142         return 1;
143     }
144     memslots = MEMORY_DEVICES_SOFT_MEMSLOT_LIMIT -
145                ms->device_memory->required_memslots;
146 
147     /*
148      * Consider the actually still free memslots. This is only relevant if
149      * other memslot consumers would consume *significantly* more memslots than
150      * what we prepared for (> 253). Unlikely, but let's just handle it
151      * cleanly.
152      */
153     memslots = MIN(memslots, free - reserved);
154     if (memslots < 1 || unlikely(free < reserved)) {
155         return 1;
156     }
157 
158     /* We cannot have any other memory devices? So give all to this device. */
159     if (size == ms->maxram_size - ms->ram_size) {
160         return memslots;
161     }
162 
163     /*
164      * Simple heuristic: equally distribute the memslots over the space
165      * still available for memory devices.
166      */
167     available_space = ms->maxram_size - ms->ram_size -
168                       ms->device_memory->used_region_size;
169     memslots = (double)memslots * size / available_space;
170     return memslots < 1 ? 1 : memslots;
171 }
172 
173 static void memory_device_check_addable(MachineState *ms, MemoryDeviceState *md,
174                                         MemoryRegion *mr, Error **errp)
175 {
176     const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
177     const uint64_t used_region_size = ms->device_memory->used_region_size;
178     const uint64_t size = memory_region_size(mr);
179     const unsigned int reserved_memslots = get_reserved_memslots(ms);
180     unsigned int required_memslots, memslot_limit;
181 
182     /*
183      * Instruct the device to decide how many memslots to use, if applicable,
184      * before we query the number of required memslots the first time.
185      */
186     if (mdc->decide_memslots) {
187         memslot_limit = memory_device_memslot_decision_limit(ms, mr);
188         mdc->decide_memslots(md, memslot_limit);
189     }
190     required_memslots = memory_device_get_memslots(md);
191 
192     /* we will need memory slots for kvm and vhost */
193     if (kvm_enabled() &&
194         kvm_get_free_memslots() < required_memslots + reserved_memslots) {
195         error_setg(errp, "hypervisor has not enough free memory slots left");
196         return;
197     }
198     if (vhost_get_free_memslots() < required_memslots + reserved_memslots) {
199         error_setg(errp, "a used vhost backend has not enough free memory slots left");
200         return;
201     }
202 
203     /* will we exceed the total amount of memory specified */
204     if (used_region_size + size < used_region_size ||
205         used_region_size + size > ms->maxram_size - ms->ram_size) {
206         error_setg(errp, "not enough space, currently 0x%" PRIx64
207                    " in use of total space for memory devices 0x" RAM_ADDR_FMT,
208                    used_region_size, ms->maxram_size - ms->ram_size);
209         return;
210     }
211 
212 }
213 
214 static uint64_t memory_device_get_free_addr(MachineState *ms,
215                                             const uint64_t *hint,
216                                             uint64_t align, uint64_t size,
217                                             Error **errp)
218 {
219     GSList *list = NULL, *item;
220     Range as, new = range_empty;
221 
222     range_init_nofail(&as, ms->device_memory->base,
223                       memory_region_size(&ms->device_memory->mr));
224 
225     /* start of address space indicates the maximum alignment we expect */
226     if (!QEMU_IS_ALIGNED(range_lob(&as), align)) {
227         warn_report("the alignment (0x%" PRIx64 ") exceeds the expected"
228                     " maximum alignment, memory will get fragmented and not"
229                     " all 'maxmem' might be usable for memory devices.",
230                     align);
231     }
232 
233     if (hint && !QEMU_IS_ALIGNED(*hint, align)) {
234         error_setg(errp, "address must be aligned to 0x%" PRIx64 " bytes",
235                    align);
236         return 0;
237     }
238 
239     if (hint) {
240         if (range_init(&new, *hint, size) || !range_contains_range(&as, &new)) {
241             error_setg(errp, "can't add memory device [0x%" PRIx64 ":0x%" PRIx64
242                        "], usable range for memory devices [0x%" PRIx64 ":0x%"
243                        PRIx64 "]", *hint, size, range_lob(&as),
244                        range_size(&as));
245             return 0;
246         }
247     } else {
248         if (range_init(&new, QEMU_ALIGN_UP(range_lob(&as), align), size)) {
249             error_setg(errp, "can't add memory device, device too big");
250             return 0;
251         }
252     }
253 
254     /* find address range that will fit new memory device */
255     object_child_foreach(OBJECT(ms), memory_device_build_list, &list);
256     for (item = list; item; item = g_slist_next(item)) {
257         const MemoryDeviceState *md = item->data;
258         const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(OBJECT(md));
259         uint64_t next_addr;
260         Range tmp;
261 
262         if (memory_device_is_empty(md)) {
263             continue;
264         }
265 
266         range_init_nofail(&tmp, mdc->get_addr(md),
267                           memory_device_get_region_size(md, &error_abort));
268 
269         if (range_overlaps_range(&tmp, &new)) {
270             if (hint) {
271                 const DeviceState *d = DEVICE(md);
272                 error_setg(errp, "address range conflicts with memory device"
273                            " id='%s'", d->id ? d->id : "(unnamed)");
274                 goto out;
275             }
276 
277             next_addr = QEMU_ALIGN_UP(range_upb(&tmp) + 1, align);
278             if (!next_addr || range_init(&new, next_addr, range_size(&new))) {
279                 range_make_empty(&new);
280                 break;
281             }
282         } else if (range_lob(&tmp) > range_upb(&new)) {
283             break;
284         }
285     }
286 
287     if (!range_contains_range(&as, &new)) {
288         error_setg(errp, "could not find position in guest address space for "
289                    "memory device - memory fragmented due to alignments");
290     }
291 out:
292     g_slist_free(list);
293     return range_lob(&new);
294 }
295 
296 MemoryDeviceInfoList *qmp_memory_device_list(void)
297 {
298     GSList *devices = NULL, *item;
299     MemoryDeviceInfoList *list = NULL, **tail = &list;
300 
301     object_child_foreach(qdev_get_machine(), memory_device_build_list,
302                          &devices);
303 
304     for (item = devices; item; item = g_slist_next(item)) {
305         const MemoryDeviceState *md = MEMORY_DEVICE(item->data);
306         const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(item->data);
307         MemoryDeviceInfo *info = g_new0(MemoryDeviceInfo, 1);
308 
309         /* Let's query infotmation even for empty memory devices. */
310         mdc->fill_device_info(md, info);
311 
312         QAPI_LIST_APPEND(tail, info);
313     }
314 
315     g_slist_free(devices);
316 
317     return list;
318 }
319 
320 static int memory_device_plugged_size(Object *obj, void *opaque)
321 {
322     uint64_t *size = opaque;
323 
324     if (object_dynamic_cast(obj, TYPE_MEMORY_DEVICE)) {
325         const DeviceState *dev = DEVICE(obj);
326         const MemoryDeviceState *md = MEMORY_DEVICE(obj);
327         const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(obj);
328 
329         if (dev->realized && !memory_device_is_empty(md)) {
330             *size += mdc->get_plugged_size(md, &error_abort);
331         }
332     }
333 
334     object_child_foreach(obj, memory_device_plugged_size, opaque);
335     return 0;
336 }
337 
/*
 * Sum of the plugged sizes of all realized, non-empty memory devices found
 * at or below the machine object.
 */
uint64_t get_plugged_memory_size(void)
{
    uint64_t total = 0;

    memory_device_plugged_size(qdev_get_machine(), &total);
    return total;
}
346 
347 void memory_device_pre_plug(MemoryDeviceState *md, MachineState *ms,
348                             const uint64_t *legacy_align, Error **errp)
349 {
350     const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
351     Error *local_err = NULL;
352     uint64_t addr, align = 0;
353     MemoryRegion *mr;
354 
355     /* We support empty memory devices even without device memory. */
356     if (memory_device_is_empty(md)) {
357         return;
358     }
359 
360     if (!ms->device_memory) {
361         error_setg(errp, "the configuration is not prepared for memory devices"
362                          " (e.g., for memory hotplug), consider specifying the"
363                          " maxmem option");
364         return;
365     }
366 
367     mr = mdc->get_memory_region(md, &local_err);
368     if (local_err) {
369         goto out;
370     }
371 
372     memory_device_check_addable(ms, md, mr, &local_err);
373     if (local_err) {
374         goto out;
375     }
376 
377     /*
378      * We always want the memory region size to be multiples of the memory
379      * region alignment: for example, DIMMs with 1G+1byte size don't make
380      * any sense. Note that we don't check that the size is multiples
381      * of any additional alignment requirements the memory device might
382      * have when it comes to the address in physical address space.
383      */
384     if (!QEMU_IS_ALIGNED(memory_region_size(mr),
385                          memory_region_get_alignment(mr))) {
386         error_setg(errp, "backend memory size must be multiple of 0x%"
387                    PRIx64, memory_region_get_alignment(mr));
388         return;
389     }
390 
391     if (legacy_align) {
392         align = *legacy_align;
393     } else {
394         if (mdc->get_min_alignment) {
395             align = mdc->get_min_alignment(md);
396         }
397         align = MAX(align, memory_region_get_alignment(mr));
398     }
399     addr = mdc->get_addr(md);
400     addr = memory_device_get_free_addr(ms, !addr ? NULL : &addr, align,
401                                        memory_region_size(mr), &local_err);
402     if (local_err) {
403         goto out;
404     }
405     mdc->set_addr(md, addr, &local_err);
406     if (!local_err) {
407         trace_memory_device_pre_plug(DEVICE(md)->id ? DEVICE(md)->id : "",
408                                      addr);
409     }
410 out:
411     error_propagate(errp, local_err);
412 }
413 
414 void memory_device_plug(MemoryDeviceState *md, MachineState *ms)
415 {
416     const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
417     unsigned int memslots;
418     uint64_t addr;
419     MemoryRegion *mr;
420 
421     if (memory_device_is_empty(md)) {
422         return;
423     }
424 
425     memslots = memory_device_get_memslots(md);
426     addr = mdc->get_addr(md);
427 
428     /*
429      * We expect that a previous call to memory_device_pre_plug() succeeded, so
430      * it can't fail at this point.
431      */
432     mr = mdc->get_memory_region(md, &error_abort);
433     g_assert(ms->device_memory);
434 
435     ms->device_memory->used_region_size += memory_region_size(mr);
436     ms->device_memory->required_memslots += memslots;
437     if (mdc->decide_memslots && memslots > 1) {
438         ms->device_memory->memslot_auto_decision_active++;
439     }
440 
441     memory_region_add_subregion(&ms->device_memory->mr,
442                                 addr - ms->device_memory->base, mr);
443     trace_memory_device_plug(DEVICE(md)->id ? DEVICE(md)->id : "", addr);
444 }
445 
446 void memory_device_unplug(MemoryDeviceState *md, MachineState *ms)
447 {
448     const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
449     const unsigned int memslots = memory_device_get_memslots(md);
450     MemoryRegion *mr;
451 
452     if (memory_device_is_empty(md)) {
453         return;
454     }
455 
456     /*
457      * We expect that a previous call to memory_device_pre_plug() succeeded, so
458      * it can't fail at this point.
459      */
460     mr = mdc->get_memory_region(md, &error_abort);
461     g_assert(ms->device_memory);
462 
463     memory_region_del_subregion(&ms->device_memory->mr, mr);
464 
465     if (mdc->decide_memslots && memslots > 1) {
466         ms->device_memory->memslot_auto_decision_active--;
467     }
468     ms->device_memory->used_region_size -= memory_region_size(mr);
469     ms->device_memory->required_memslots -= memslots;
470     trace_memory_device_unplug(DEVICE(md)->id ? DEVICE(md)->id : "",
471                                mdc->get_addr(md));
472 }
473 
474 uint64_t memory_device_get_region_size(const MemoryDeviceState *md,
475                                        Error **errp)
476 {
477     const MemoryDeviceClass *mdc = MEMORY_DEVICE_GET_CLASS(md);
478     MemoryRegion *mr;
479 
480     /* dropping const here is fine as we don't touch the memory region */
481     mr = mdc->get_memory_region((MemoryDeviceState *)md, errp);
482     if (!mr) {
483         return 0;
484     }
485 
486     return memory_region_size(mr);
487 }
488 
489 static void memory_devices_region_mod(MemoryListener *listener,
490                                       MemoryRegionSection *mrs, bool add)
491 {
492     DeviceMemoryState *dms = container_of(listener, DeviceMemoryState,
493                                           listener);
494 
495     if (!memory_region_is_ram(mrs->mr)) {
496         warn_report("Unexpected memory region mapped into device memory region.");
497         return;
498     }
499 
500     /*
501      * The expectation is that each distinct RAM memory region section in
502      * our region for memory devices consumes exactly one memslot in KVM
503      * and in vhost. For vhost, this is true, except:
504      * * ROM memory regions don't consume a memslot. These get used very
505      *   rarely for memory devices (R/O NVDIMMs).
506      * * Memslots without a fd (memory-backend-ram) don't necessarily
507      *   consume a memslot. Such setups are quite rare and possibly bogus:
508      *   the memory would be inaccessible by such vhost devices.
509      *
510      * So for vhost, in corner cases we might over-estimate the number of
511      * memslots that are currently used or that might still be reserved
512      * (required - used).
513      */
514     dms->used_memslots += add ? 1 : -1;
515 
516     if (dms->used_memslots > dms->required_memslots) {
517         warn_report("Memory devices use more memory slots than indicated as required.");
518     }
519 }
520 
521 static void memory_devices_region_add(MemoryListener *listener,
522                                       MemoryRegionSection *mrs)
523 {
524     return memory_devices_region_mod(listener, mrs, true);
525 }
526 
527 static void memory_devices_region_del(MemoryListener *listener,
528                                       MemoryRegionSection *mrs)
529 {
530     return memory_devices_region_mod(listener, mrs, false);
531 }
532 
533 void machine_memory_devices_init(MachineState *ms, hwaddr base, uint64_t size)
534 {
535     g_assert(size);
536     g_assert(!ms->device_memory);
537     ms->device_memory = g_new0(DeviceMemoryState, 1);
538     ms->device_memory->base = base;
539 
540     memory_region_init(&ms->device_memory->mr, OBJECT(ms), "device-memory",
541                        size);
542     address_space_init(&ms->device_memory->as, &ms->device_memory->mr,
543                        "device-memory");
544     memory_region_add_subregion(get_system_memory(), ms->device_memory->base,
545                                 &ms->device_memory->mr);
546 
547     /* Track the number of memslots used by memory devices. */
548     ms->device_memory->listener.region_add = memory_devices_region_add;
549     ms->device_memory->listener.region_del = memory_devices_region_del;
550     memory_listener_register(&ms->device_memory->listener,
551                              &ms->device_memory->as);
552 }
553 
554 static const TypeInfo memory_device_info = {
555     .name          = TYPE_MEMORY_DEVICE,
556     .parent        = TYPE_INTERFACE,
557     .class_size = sizeof(MemoryDeviceClass),
558 };
559 
560 static void memory_device_register_types(void)
561 {
562     type_register_static(&memory_device_info);
563 }
564 
565 type_init(memory_device_register_types)
566