xref: /openbmc/qemu/hw/xen/xen-mapcache.c (revision 4fd71d19)
/*
 * Copyright (C) 2011       Citrix Ltd.
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qemu/units.h"
#include "qemu/error-report.h"

#include <sys/resource.h>

#include "hw/xen/xen_native.h"
#include "qemu/bitmap.h"

#include "sysemu/runstate.h"
#include "sysemu/xen-mapcache.h"
#include "trace.h"


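/*
 * Guest physical address space is carved into fixed-size "buckets":
 * 64 KiB buckets (shift 16) on 32-bit hosts and 1 MiB buckets (shift 20)
 * on 64-bit hosts, with the total mapcache capped at 2 GiB or 32 GiB
 * respectively.
 */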
#if HOST_LONG_BITS == 32
#  define MCACHE_BUCKET_SHIFT 16
#  define MCACHE_MAX_SIZE     (1UL<<31) /* 2GB Cap */
#else
#  define MCACHE_BUCKET_SHIFT 20
#  define MCACHE_MAX_SIZE     (1UL<<35) /* 32GB Cap */
#endif
#define MCACHE_BUCKET_SIZE (1UL << MCACHE_BUCKET_SHIFT)

/* This is the size of the virtual address space reserved for QEMU that will
 * not be used by the mapcache.
 * From empirical tests, QEMU uses about 75 MiB more than max_mcache_size.
 */
#define NON_MCACHE_MEMORY_SIZE (80 * MiB)

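/*
 * One mapcache entry covers a single bucket of guest physical address
 * space.  Entries live in a hash table indexed by paddr_index, with
 * collisions chained through 'next'.  'valid_mapping' has one bit per
 * XC_PAGE_SIZE page telling whether the foreign mapping of that page
 * succeeded, and 'lock' counts outstanding locked users of the entry.
 */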
typedef struct MapCacheEntry {
    hwaddr paddr_index;
    uint8_t *vaddr_base;
    unsigned long *valid_mapping;
    uint32_t lock;
#define XEN_MAPCACHE_ENTRY_DUMMY (1 << 0)
    uint8_t flags;
    hwaddr size;
    struct MapCacheEntry *next;
} MapCacheEntry;

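/*
 * Reverse-mapping record for a locked mapping: it remembers which bucket
 * and size a returned virtual address belongs to, so that
 * xen_ram_addr_from_mapcache() and xen_invalidate_map_cache_entry() can
 * find the owning entry again from a pointer alone.
 */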
typedef struct MapCacheRev {
    uint8_t *vaddr_req;
    hwaddr paddr_index;
    hwaddr size;
    QTAILQ_ENTRY(MapCacheRev) next;
    bool dma;
} MapCacheRev;

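/*
 * Top-level mapcache state: the hash table of entries, the list of
 * currently locked mappings, a one-entry lookup cache (last_entry) and the
 * sizing and locking bookkeeping.
 */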
typedef struct MapCache {
    MapCacheEntry *entry;
    unsigned long nr_buckets;
    QTAILQ_HEAD(, MapCacheRev) locked_entries;

    /* For most cases (>99.9%), the page address is the same. */
    MapCacheEntry *last_entry;
    unsigned long max_mcache_size;
    unsigned int mcache_bucket_shift;

    phys_offset_to_gaddr_t phys_offset_to_gaddr;
    QemuMutex lock;
    void *opaque;
} MapCache;

static MapCache *mapcache;

static inline void mapcache_lock(MapCache *mc)
{
    qemu_mutex_lock(&mc->lock);
}

static inline void mapcache_unlock(MapCache *mc)
{
    qemu_mutex_unlock(&mc->lock);
}

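/*
 * Return 1 if all 'size' bits starting at bit 'nr' are set in the bitmap,
 * i.e. every page in the requested range has a valid mapping.
 */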
static inline int test_bits(int nr, int size, const unsigned long *addr)
{
    unsigned long res = find_next_zero_bit(addr, size + nr, nr);

    return res >= nr + size;
}

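/*
 * Allocate and initialise one MapCache instance: set up the lock and the
 * locked-entries list, and allocate a zeroed hash table sized so that
 * max_size bytes of guest memory fit into nr_buckets buckets.
 */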
static MapCache *xen_map_cache_init_single(phys_offset_to_gaddr_t f,
                                           void *opaque,
                                           unsigned long max_size)
{
    unsigned long size;
    MapCache *mc;

    mc = g_new0(MapCache, 1);

    mc->phys_offset_to_gaddr = f;
    mc->opaque = opaque;
    qemu_mutex_init(&mc->lock);

    QTAILQ_INIT(&mc->locked_entries);

    mc->max_mcache_size = max_size;

    mc->nr_buckets =
        (((mc->max_mcache_size >> XC_PAGE_SHIFT) +
          (1UL << (MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT)) - 1) >>
         (MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT));

    size = mc->nr_buckets * sizeof(MapCacheEntry);
    size = (size + XC_PAGE_SIZE - 1) & ~(XC_PAGE_SIZE - 1);
    trace_xen_map_cache_init(mc->nr_buckets, size);
    mc->entry = g_malloc0(size);
    return mc;
}

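/*
 * Global mapcache initialisation.  The cache is sized from RLIMIT_AS: when
 * running as root the address space limit is raised to RLIM_INFINITY and
 * the full MCACHE_MAX_SIZE is used; otherwise the cache is shrunk so that
 * the mapcache plus NON_MCACHE_MEMORY_SIZE of other QEMU usage stays
 * within the existing limit.
 */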
void xen_map_cache_init(phys_offset_to_gaddr_t f, void *opaque)
{
    struct rlimit rlimit_as;
    unsigned long max_mcache_size;

    if (geteuid() == 0) {
        rlimit_as.rlim_cur = RLIM_INFINITY;
        rlimit_as.rlim_max = RLIM_INFINITY;
        max_mcache_size = MCACHE_MAX_SIZE;
    } else {
        getrlimit(RLIMIT_AS, &rlimit_as);
        rlimit_as.rlim_cur = rlimit_as.rlim_max;

        if (rlimit_as.rlim_max != RLIM_INFINITY) {
            warn_report("QEMU's maximum size of virtual"
                        " memory is not infinity");
        }
        if (rlimit_as.rlim_max < MCACHE_MAX_SIZE + NON_MCACHE_MEMORY_SIZE) {
            max_mcache_size = rlimit_as.rlim_max - NON_MCACHE_MEMORY_SIZE;
        } else {
            max_mcache_size = MCACHE_MAX_SIZE;
        }
    }

    mapcache = xen_map_cache_init_single(f, opaque, max_mcache_size);
    setrlimit(RLIMIT_AS, &rlimit_as);
}

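/*
 * (Re)map one bucket worth of guest memory into QEMU's address space.  An
 * existing mapping is munmap'ed first, unless the caller asked for the
 * same fixed vaddr, in which case the kernel replaces it atomically via
 * MAP_FIXED.  Real buckets are mapped with xenforeignmemory_map2(); dummy
 * buckets get anonymous memory instead.  The entry's valid_mapping bitmap
 * is rebuilt from the per-page error array.
 */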
static void xen_remap_bucket(MapCache *mc,
                             MapCacheEntry *entry,
                             void *vaddr,
                             hwaddr size,
                             hwaddr address_index,
                             bool dummy)
{
    uint8_t *vaddr_base;
    xen_pfn_t *pfns;
    int *err;
    unsigned int i;
    hwaddr nb_pfn = size >> XC_PAGE_SHIFT;

    trace_xen_remap_bucket(address_index);

    pfns = g_new0(xen_pfn_t, nb_pfn);
    err = g_new0(int, nb_pfn);

    if (entry->vaddr_base != NULL) {
        if (!(entry->flags & XEN_MAPCACHE_ENTRY_DUMMY)) {
            ram_block_notify_remove(entry->vaddr_base, entry->size,
                                    entry->size);
        }

        /*
         * If an entry is being replaced by another mapping and we're using
         * the MAP_FIXED flag for it, there is a possibility of a race for
         * the vaddr address with another thread doing an mmap call itself
         * (see man 2 mmap).  To avoid that, we skip explicit unmapping here
         * and let the kernel destroy the previous mappings by replacing
         * them in the later mmap call.
         *
         * Non-identical replacements are therefore not allowed.
         */
        assert(!vaddr || (entry->vaddr_base == vaddr && entry->size == size));

        if (!vaddr && munmap(entry->vaddr_base, entry->size) != 0) {
            perror("unmap fails");
            exit(-1);
        }
    }
    g_free(entry->valid_mapping);
    entry->valid_mapping = NULL;

    for (i = 0; i < nb_pfn; i++) {
        pfns[i] = (address_index << (MCACHE_BUCKET_SHIFT-XC_PAGE_SHIFT)) + i;
    }

    /*
     * If the caller has requested the mapping at a specific address, use
     * MAP_FIXED to make sure it's honored.
     */
    if (!dummy) {
        vaddr_base = xenforeignmemory_map2(xen_fmem, xen_domid, vaddr,
                                           PROT_READ | PROT_WRITE,
                                           vaddr ? MAP_FIXED : 0,
                                           nb_pfn, pfns, err);
        if (vaddr_base == NULL) {
            perror("xenforeignmemory_map2");
            exit(-1);
        }
    } else {
        /*
         * We create dummy mappings where we are unable to create a foreign
         * mapping immediately due to certain circumstances (e.g. on resume).
         */
        vaddr_base = mmap(vaddr, size, PROT_READ | PROT_WRITE,
                          MAP_ANON | MAP_SHARED | (vaddr ? MAP_FIXED : 0),
                          -1, 0);
        if (vaddr_base == MAP_FAILED) {
            perror("mmap");
            exit(-1);
        }
    }

    if (!(entry->flags & XEN_MAPCACHE_ENTRY_DUMMY)) {
        ram_block_notify_add(vaddr_base, size, size);
    }

    entry->vaddr_base = vaddr_base;
    entry->paddr_index = address_index;
    entry->size = size;
    entry->valid_mapping = g_new0(unsigned long,
                                  BITS_TO_LONGS(size >> XC_PAGE_SHIFT));

    if (dummy) {
        entry->flags |= XEN_MAPCACHE_ENTRY_DUMMY;
    } else {
        entry->flags &= ~(XEN_MAPCACHE_ENTRY_DUMMY);
    }

    bitmap_zero(entry->valid_mapping, nb_pfn);
    for (i = 0; i < nb_pfn; i++) {
        if (!err[i]) {
            bitmap_set(entry->valid_mapping, i, 1);
        }
    }

    g_free(pfns);
    g_free(err);
}

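/*
 * Core lookup: translate a guest physical address into a pointer inside a
 * mapped bucket.  Fast path: the last entry used.  Otherwise walk the hash
 * chain for a matching, fully valid entry, reusing an unlocked entry or
 * appending a new one and (re)mapping it as needed.  With 'lock' set, the
 * entry's reference count is bumped and a MapCacheRev is recorded so the
 * mapping can later be looked up and released by pointer.  Returns NULL if
 * the requested range could not be mapped.
 */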
static uint8_t *xen_map_cache_unlocked(MapCache *mc,
                                       hwaddr phys_addr, hwaddr size,
                                       uint8_t lock, bool dma, bool is_write)
{
    MapCacheEntry *entry, *pentry = NULL,
                  *free_entry = NULL, *free_pentry = NULL;
    hwaddr address_index;
    hwaddr address_offset;
    hwaddr cache_size = size;
    hwaddr test_bit_size;
    bool translated G_GNUC_UNUSED = false;
    bool dummy = false;

tryagain:
    address_index  = phys_addr >> MCACHE_BUCKET_SHIFT;
    address_offset = phys_addr & (MCACHE_BUCKET_SIZE - 1);

    trace_xen_map_cache(phys_addr);

    /* test_bit_size is always a multiple of XC_PAGE_SIZE */
    if (size) {
        test_bit_size = size + (phys_addr & (XC_PAGE_SIZE - 1));

        if (test_bit_size % XC_PAGE_SIZE) {
            test_bit_size += XC_PAGE_SIZE - (test_bit_size % XC_PAGE_SIZE);
        }
    } else {
        test_bit_size = XC_PAGE_SIZE;
    }

    if (mc->last_entry != NULL &&
        mc->last_entry->paddr_index == address_index &&
        !lock && !size &&
        test_bits(address_offset >> XC_PAGE_SHIFT,
                  test_bit_size >> XC_PAGE_SHIFT,
                  mc->last_entry->valid_mapping)) {
        trace_xen_map_cache_return(
            mc->last_entry->vaddr_base + address_offset
        );
        return mc->last_entry->vaddr_base + address_offset;
    }

    /* cache_size is always a multiple of MCACHE_BUCKET_SIZE */
    if (size) {
        cache_size = size + address_offset;
        if (cache_size % MCACHE_BUCKET_SIZE) {
            cache_size += MCACHE_BUCKET_SIZE - (cache_size % MCACHE_BUCKET_SIZE);
        }
    } else {
        cache_size = MCACHE_BUCKET_SIZE;
    }

    entry = &mc->entry[address_index % mc->nr_buckets];

    while (entry && (lock || entry->lock) && entry->vaddr_base &&
            (entry->paddr_index != address_index || entry->size != cache_size ||
             !test_bits(address_offset >> XC_PAGE_SHIFT,
                 test_bit_size >> XC_PAGE_SHIFT,
                 entry->valid_mapping))) {
        if (!free_entry && !entry->lock) {
            free_entry = entry;
            free_pentry = pentry;
        }
        pentry = entry;
        entry = entry->next;
    }
    if (!entry && free_entry) {
        entry = free_entry;
        pentry = free_pentry;
    }
    if (!entry) {
        entry = g_new0(MapCacheEntry, 1);
        pentry->next = entry;
        xen_remap_bucket(mc, entry, NULL, cache_size, address_index, dummy);
    } else if (!entry->lock) {
        if (!entry->vaddr_base || entry->paddr_index != address_index ||
                entry->size != cache_size ||
                !test_bits(address_offset >> XC_PAGE_SHIFT,
                    test_bit_size >> XC_PAGE_SHIFT,
                    entry->valid_mapping)) {
            xen_remap_bucket(mc, entry, NULL, cache_size, address_index, dummy);
        }
    }

    if (!test_bits(address_offset >> XC_PAGE_SHIFT,
                test_bit_size >> XC_PAGE_SHIFT,
                entry->valid_mapping)) {
        mc->last_entry = NULL;
#ifdef XEN_COMPAT_PHYSMAP
        if (!translated && mc->phys_offset_to_gaddr) {
            phys_addr = mc->phys_offset_to_gaddr(phys_addr, size);
            translated = true;
            goto tryagain;
        }
#endif
        if (!dummy && runstate_check(RUN_STATE_INMIGRATE)) {
            dummy = true;
            goto tryagain;
        }
        trace_xen_map_cache_return(NULL);
        return NULL;
    }

    mc->last_entry = entry;
    if (lock) {
        MapCacheRev *reventry = g_new0(MapCacheRev, 1);
        entry->lock++;
        if (entry->lock == 0) {
            error_report("mapcache entry lock overflow: "HWADDR_FMT_plx" -> %p",
                         entry->paddr_index, entry->vaddr_base);
            abort();
        }
        reventry->dma = dma;
        reventry->vaddr_req = mc->last_entry->vaddr_base + address_offset;
        reventry->paddr_index = mc->last_entry->paddr_index;
        reventry->size = entry->size;
        QTAILQ_INSERT_HEAD(&mc->locked_entries, reventry, next);
    }

    trace_xen_map_cache_return(
        mc->last_entry->vaddr_base + address_offset
    );
    return mc->last_entry->vaddr_base + address_offset;
}

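/*
 * Public wrapper around xen_map_cache_unlocked() that takes the mapcache
 * lock around the lookup.
 */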
uint8_t *xen_map_cache(MemoryRegion *mr,
                       hwaddr phys_addr, hwaddr size,
                       uint8_t lock, bool dma,
                       bool is_write)
{
    uint8_t *p;

    mapcache_lock(mapcache);
    p = xen_map_cache_unlocked(mapcache, phys_addr, size, lock, dma, is_write);
    mapcache_unlock(mapcache);
    return p;
}

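/*
 * Translate a pointer previously returned by a locked xen_map_cache() call
 * back into a guest RAM address.  The pointer is looked up in the
 * locked_entries list, then the matching bucket entry provides the offset.
 * Returns RAM_ADDR_INVALID if the pointer is not a locked mapping.
 */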
static ram_addr_t xen_ram_addr_from_mapcache_single(MapCache *mc, void *ptr)
{
    MapCacheEntry *entry = NULL;
    MapCacheRev *reventry;
    hwaddr paddr_index;
    hwaddr size;
    ram_addr_t raddr;
    int found = 0;

    mapcache_lock(mc);
    QTAILQ_FOREACH(reventry, &mc->locked_entries, next) {
        if (reventry->vaddr_req == ptr) {
            paddr_index = reventry->paddr_index;
            size = reventry->size;
            found = 1;
            break;
        }
    }
    if (!found) {
        trace_xen_ram_addr_from_mapcache_not_found(ptr);
        mapcache_unlock(mc);
        return RAM_ADDR_INVALID;
    }

    entry = &mc->entry[paddr_index % mc->nr_buckets];
    while (entry && (entry->paddr_index != paddr_index || entry->size != size)) {
        entry = entry->next;
    }
    if (!entry) {
        trace_xen_ram_addr_from_mapcache_not_in_cache(ptr);
        raddr = RAM_ADDR_INVALID;
    } else {
        raddr = (reventry->paddr_index << MCACHE_BUCKET_SHIFT) +
             ((unsigned long) ptr - (unsigned long) entry->vaddr_base);
    }
    mapcache_unlock(mc);
    return raddr;
}

ram_addr_t xen_ram_addr_from_mapcache(void *ptr)
{
    return xen_ram_addr_from_mapcache_single(mapcache, ptr);
}

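/*
 * Release one locked mapping identified by its pointer: drop the matching
 * MapCacheRev, decrement the entry's lock count and, once fully unlocked,
 * unmap the bucket and free the entry (unless it is the first entry of its
 * hash chain, which stays in place).
 */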
static void xen_invalidate_map_cache_entry_unlocked(MapCache *mc,
                                                    uint8_t *buffer)
{
    MapCacheEntry *entry = NULL, *pentry = NULL;
    MapCacheRev *reventry;
    hwaddr paddr_index;
    hwaddr size;
    int found = 0;

    QTAILQ_FOREACH(reventry, &mc->locked_entries, next) {
        if (reventry->vaddr_req == buffer) {
            paddr_index = reventry->paddr_index;
            size = reventry->size;
            found = 1;
            break;
        }
    }
    if (!found) {
        trace_xen_invalidate_map_cache_entry_unlocked_not_found(buffer);
        QTAILQ_FOREACH(reventry, &mc->locked_entries, next) {
            trace_xen_invalidate_map_cache_entry_unlocked_found(
                reventry->paddr_index,
                reventry->vaddr_req
            );
        }
        return;
    }
    QTAILQ_REMOVE(&mc->locked_entries, reventry, next);
    g_free(reventry);

    if (mc->last_entry != NULL &&
        mc->last_entry->paddr_index == paddr_index) {
        mc->last_entry = NULL;
    }

    entry = &mc->entry[paddr_index % mc->nr_buckets];
    while (entry && (entry->paddr_index != paddr_index || entry->size != size)) {
        pentry = entry;
        entry = entry->next;
    }
    if (!entry) {
        trace_xen_invalidate_map_cache_entry_unlocked_miss(buffer);
        return;
    }
    entry->lock--;
    if (entry->lock > 0 || pentry == NULL) {
        return;
    }

    pentry->next = entry->next;
    ram_block_notify_remove(entry->vaddr_base, entry->size, entry->size);
    if (munmap(entry->vaddr_base, entry->size) != 0) {
        perror("unmap fails");
        exit(-1);
    }
    g_free(entry->valid_mapping);
    g_free(entry);
}

typedef struct XenMapCacheData {
    Coroutine *co;
    uint8_t *buffer;
} XenMapCacheData;

static void xen_invalidate_map_cache_entry_bh(void *opaque)
{
    XenMapCacheData *data = opaque;

    mapcache_lock(mapcache);
    xen_invalidate_map_cache_entry_unlocked(mapcache, data->buffer);
    mapcache_unlock(mapcache);

    aio_co_wake(data->co);
}

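/*
 * Release a locked mapping.  From coroutine context the work is deferred
 * to a bottom half (the coroutine yields until the BH has run); otherwise
 * the entry is invalidated directly under the mapcache lock.
 */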
void coroutine_mixed_fn xen_invalidate_map_cache_entry(uint8_t *buffer)
{
    if (qemu_in_coroutine()) {
        XenMapCacheData data = {
            .co = qemu_coroutine_self(),
            .buffer = buffer,
        };
        aio_bh_schedule_oneshot(qemu_get_current_aio_context(),
                                xen_invalidate_map_cache_entry_bh, &data);
        qemu_coroutine_yield();
    } else {
        mapcache_lock(mapcache);
        xen_invalidate_map_cache_entry_unlocked(mapcache, buffer);
        mapcache_unlock(mapcache);
    }
}

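/*
 * Drop every unlocked mapping from the cache: locked entries (still in
 * use, e.g. by in-flight DMA) are left alone, everything else is unmapped
 * and its bucket entry reset.
 */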
static void xen_invalidate_map_cache_single(MapCache *mc)
{
    unsigned long i;
    MapCacheRev *reventry;

    mapcache_lock(mc);

    QTAILQ_FOREACH(reventry, &mc->locked_entries, next) {
        if (!reventry->dma) {
            continue;
        }
        trace_xen_invalidate_map_cache(reventry->paddr_index,
                                       reventry->vaddr_req);
    }

    for (i = 0; i < mc->nr_buckets; i++) {
        MapCacheEntry *entry = &mc->entry[i];

        if (entry->vaddr_base == NULL) {
            continue;
        }
        if (entry->lock > 0) {
            continue;
        }

        if (munmap(entry->vaddr_base, entry->size) != 0) {
            perror("unmap fails");
            exit(-1);
        }

        entry->paddr_index = 0;
        entry->vaddr_base = NULL;
        entry->size = 0;
        g_free(entry->valid_mapping);
        entry->valid_mapping = NULL;
    }

    mc->last_entry = NULL;

    mapcache_unlock(mc);
}

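/*
 * Flush all unlocked mappings from the global mapcache.  Pending AIO is
 * drained first so that no in-flight block I/O still uses mappings that
 * are about to disappear.
 */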
void xen_invalidate_map_cache(void)
{
    /* Flush pending AIO before destroying the mapcache */
    bdrv_drain_all();

    xen_invalidate_map_cache_single(mapcache);
}

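/*
 * Remap an existing cache entry in place: find the entry covering
 * old_phys_addr and remap its virtual address range onto new_phys_addr via
 * xen_remap_bucket().  Returns a pointer at the requested offset within the
 * remapped entry, or NULL if no matching entry exists or the new mapping is
 * not fully valid.
 */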
static uint8_t *xen_replace_cache_entry_unlocked(MapCache *mc,
                                                 hwaddr old_phys_addr,
                                                 hwaddr new_phys_addr,
                                                 hwaddr size)
{
    MapCacheEntry *entry;
    hwaddr address_index, address_offset;
    hwaddr test_bit_size, cache_size = size;

    address_index  = old_phys_addr >> MCACHE_BUCKET_SHIFT;
    address_offset = old_phys_addr & (MCACHE_BUCKET_SIZE - 1);

    assert(size);
    /* test_bit_size is always a multiple of XC_PAGE_SIZE */
    test_bit_size = size + (old_phys_addr & (XC_PAGE_SIZE - 1));
    if (test_bit_size % XC_PAGE_SIZE) {
        test_bit_size += XC_PAGE_SIZE - (test_bit_size % XC_PAGE_SIZE);
    }
    cache_size = size + address_offset;
    if (cache_size % MCACHE_BUCKET_SIZE) {
        cache_size += MCACHE_BUCKET_SIZE - (cache_size % MCACHE_BUCKET_SIZE);
    }

    entry = &mc->entry[address_index % mc->nr_buckets];
    while (entry && !(entry->paddr_index == address_index &&
                      entry->size == cache_size)) {
        entry = entry->next;
    }
    if (!entry) {
        trace_xen_replace_cache_entry_unlocked(old_phys_addr);
        return NULL;
    }

    address_index  = new_phys_addr >> MCACHE_BUCKET_SHIFT;
    address_offset = new_phys_addr & (MCACHE_BUCKET_SIZE - 1);

    trace_xen_replace_cache_entry_dummy(old_phys_addr, new_phys_addr);

    xen_remap_bucket(mc, entry, entry->vaddr_base,
                     cache_size, address_index, false);
    if (!test_bits(address_offset >> XC_PAGE_SHIFT,
                test_bit_size >> XC_PAGE_SHIFT,
                entry->valid_mapping)) {
        trace_xen_replace_cache_entry_unlocked_could_not_update_entry(
            old_phys_addr
        );
        return NULL;
    }

    return entry->vaddr_base + address_offset;
}

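/*
 * Public wrapper around xen_replace_cache_entry_unlocked() that takes the
 * mapcache lock.
 */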
uint8_t *xen_replace_cache_entry(hwaddr old_phys_addr,
                                 hwaddr new_phys_addr,
                                 hwaddr size)
{
    uint8_t *p;

    mapcache_lock(mapcache);
    p = xen_replace_cache_entry_unlocked(mapcache, old_phys_addr,
                                         new_phys_addr, size);
    mapcache_unlock(mapcache);
    return p;
}
643