xref: /openbmc/qemu/hw/xen/xen-mapcache.c (revision ecd6f6a8)
1 /*
2  * Copyright (C) 2011       Citrix Ltd.
3  *
4  * This work is licensed under the terms of the GNU GPL, version 2.  See
5  * the COPYING file in the top-level directory.
6  *
7  * Contributions after 2012-01-13 are licensed under the terms of the
8  * GNU GPL, version 2 or (at your option) any later version.
9  */
10 
11 #include "qemu/osdep.h"
12 #include "qemu/units.h"
13 #include "qemu/error-report.h"
14 
15 #include <sys/resource.h>
16 
17 #include "hw/xen/xen_native.h"
18 #include "qemu/bitmap.h"
19 
20 #include "sysemu/runstate.h"
21 #include "sysemu/xen-mapcache.h"
22 #include "trace.h"
23 
24 
/*
 * Bucket granularity (as a shift) and the cap on the total amount of guest
 * memory the map cache may keep mapped, selected by the width of the host's
 * long.
 */
#if HOST_LONG_BITS == 32
#  define MCACHE_BUCKET_SHIFT 16
#  define MCACHE_MAX_SIZE     (1UL<<31) /* 2GB Cap */
#else
#  define MCACHE_BUCKET_SHIFT 20
#  define MCACHE_MAX_SIZE     (1UL<<35) /* 32GB Cap */
#endif
/* Size in bytes of a single bucket. */
#define MCACHE_BUCKET_SIZE (1UL << MCACHE_BUCKET_SHIFT)
33 
/*
 * Size of the virtual address space reserved for QEMU itself, i.e. the part
 * that will not be used by the MapCache.
 * Empirical tests showed that QEMU uses about 75MB more than
 * max_mcache_size.
 */
#define NON_MCACHE_MEMORY_SIZE (80 * MiB)
40 
/*
 * One cached mapping of guest memory.  The head entry of every bucket lives
 * in the MapCache::entry array; additional entries hashing to the same bucket
 * are chained through @next.
 */
typedef struct MapCacheEntry {
    /* Guest physical address of the mapping >> MCACHE_BUCKET_SHIFT. */
    hwaddr paddr_index;
    /* Host virtual address of the mapping, NULL while nothing is mapped. */
    uint8_t *vaddr_base;
    /* Bitmap with one bit per XC page; set when the page mapped without error. */
    unsigned long *valid_mapping;
    /* Number of outstanding locked users of this entry. */
    uint32_t lock;
/* Entry is backed by an anonymous placeholder mmap, not a foreign mapping. */
#define XEN_MAPCACHE_ENTRY_DUMMY (1 << 0)
    uint8_t flags;
    /* Size of the mapping in bytes. */
    hwaddr size;
    /* Next entry in the same bucket's chain, or NULL. */
    struct MapCacheEntry *next;
} MapCacheEntry;
51 
/*
 * Reverse-lookup record created for every locked mapping; it lets a host
 * pointer handed out by xen_map_cache() be traced back to its cache entry.
 */
typedef struct MapCacheRev {
    /* Exact host pointer that was returned to the requester. */
    uint8_t *vaddr_req;
    /* paddr_index of the MapCacheEntry backing the pointer. */
    hwaddr paddr_index;
    /* Size of the MapCacheEntry backing the pointer. */
    hwaddr size;
    /* Link in MapCache::locked_entries. */
    QTAILQ_ENTRY(MapCacheRev) next;
    /* Value of the 'dma' argument given when the mapping was requested. */
    bool dma;
} MapCacheRev;
59 
/* State of the whole map cache. */
typedef struct MapCache {
    /* Array of nr_buckets bucket head entries. */
    MapCacheEntry *entry;
    unsigned long nr_buckets;
    /* Reverse-lookup records of all currently locked mappings. */
    QTAILQ_HEAD(, MapCacheRev) locked_entries;

    /* For most cases (>99.9%), the page address is the same. */
    MapCacheEntry *last_entry;
    /* Upper bound used to size the bucket array, see xen_map_cache_init(). */
    unsigned long max_mcache_size;
    /* NOTE(review): not referenced by the code visible in this file. */
    unsigned int mcache_bucket_shift;

    /* Optional address translation hook, used under XEN_COMPAT_PHYSMAP. */
    phys_offset_to_gaddr_t phys_offset_to_gaddr;
    /* Protects the cache; taken through mapcache_lock()/mapcache_unlock(). */
    QemuMutex lock;
    /* Caller-supplied pointer stored by xen_map_cache_init(). */
    void *opaque;
} MapCache;

/* The single map cache instance, allocated by xen_map_cache_init(). */
static MapCache *mapcache;
76 
77 static inline void mapcache_lock(void)
78 {
79     qemu_mutex_lock(&mapcache->lock);
80 }
81 
82 static inline void mapcache_unlock(void)
83 {
84     qemu_mutex_unlock(&mapcache->lock);
85 }
86 
/*
 * Check a run of @size bits starting at bit @nr in the bitmap @addr.
 *
 * Returns 1 when find_next_zero_bit() reports no position below @nr + @size,
 * 0 otherwise.
 */
static inline int test_bits(int nr, int size, const unsigned long *addr)
{
    unsigned long first_clear = find_next_zero_bit(addr, size + nr, nr);

    return (first_clear >= nr + size) ? 1 : 0;
}
95 
96 void xen_map_cache_init(phys_offset_to_gaddr_t f, void *opaque)
97 {
98     unsigned long size;
99     struct rlimit rlimit_as;
100 
101     mapcache = g_new0(MapCache, 1);
102 
103     mapcache->phys_offset_to_gaddr = f;
104     mapcache->opaque = opaque;
105     qemu_mutex_init(&mapcache->lock);
106 
107     QTAILQ_INIT(&mapcache->locked_entries);
108 
109     if (geteuid() == 0) {
110         rlimit_as.rlim_cur = RLIM_INFINITY;
111         rlimit_as.rlim_max = RLIM_INFINITY;
112         mapcache->max_mcache_size = MCACHE_MAX_SIZE;
113     } else {
114         getrlimit(RLIMIT_AS, &rlimit_as);
115         rlimit_as.rlim_cur = rlimit_as.rlim_max;
116 
117         if (rlimit_as.rlim_max != RLIM_INFINITY) {
118             warn_report("QEMU's maximum size of virtual"
119                         " memory is not infinity");
120         }
121         if (rlimit_as.rlim_max < MCACHE_MAX_SIZE + NON_MCACHE_MEMORY_SIZE) {
122             mapcache->max_mcache_size = rlimit_as.rlim_max -
123                 NON_MCACHE_MEMORY_SIZE;
124         } else {
125             mapcache->max_mcache_size = MCACHE_MAX_SIZE;
126         }
127     }
128 
129     setrlimit(RLIMIT_AS, &rlimit_as);
130 
131     mapcache->nr_buckets =
132         (((mapcache->max_mcache_size >> XC_PAGE_SHIFT) +
133           (1UL << (MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT)) - 1) >>
134          (MCACHE_BUCKET_SHIFT - XC_PAGE_SHIFT));
135 
136     size = mapcache->nr_buckets * sizeof (MapCacheEntry);
137     size = (size + XC_PAGE_SIZE - 1) & ~(XC_PAGE_SIZE - 1);
138     trace_xen_map_cache_init(mapcache->nr_buckets, size);
139     mapcache->entry = g_malloc0(size);
140 }
141 
142 static void xen_remap_bucket(MapCacheEntry *entry,
143                              void *vaddr,
144                              hwaddr size,
145                              hwaddr address_index,
146                              bool dummy)
147 {
148     uint8_t *vaddr_base;
149     xen_pfn_t *pfns;
150     int *err;
151     unsigned int i;
152     hwaddr nb_pfn = size >> XC_PAGE_SHIFT;
153 
154     trace_xen_remap_bucket(address_index);
155 
156     pfns = g_new0(xen_pfn_t, nb_pfn);
157     err = g_new0(int, nb_pfn);
158 
159     if (entry->vaddr_base != NULL) {
160         if (!(entry->flags & XEN_MAPCACHE_ENTRY_DUMMY)) {
161             ram_block_notify_remove(entry->vaddr_base, entry->size,
162                                     entry->size);
163         }
164 
165         /*
166          * If an entry is being replaced by another mapping and we're using
167          * MAP_FIXED flag for it - there is possibility of a race for vaddr
168          * address with another thread doing an mmap call itself
169          * (see man 2 mmap). To avoid that we skip explicit unmapping here
170          * and allow the kernel to destroy the previous mappings by replacing
171          * them in mmap call later.
172          *
173          * Non-identical replacements are not allowed therefore.
174          */
175         assert(!vaddr || (entry->vaddr_base == vaddr && entry->size == size));
176 
177         if (!vaddr && munmap(entry->vaddr_base, entry->size) != 0) {
178             perror("unmap fails");
179             exit(-1);
180         }
181     }
182     g_free(entry->valid_mapping);
183     entry->valid_mapping = NULL;
184 
185     for (i = 0; i < nb_pfn; i++) {
186         pfns[i] = (address_index << (MCACHE_BUCKET_SHIFT-XC_PAGE_SHIFT)) + i;
187     }
188 
189     /*
190      * If the caller has requested the mapping at a specific address use
191      * MAP_FIXED to make sure it's honored.
192      */
193     if (!dummy) {
194         vaddr_base = xenforeignmemory_map2(xen_fmem, xen_domid, vaddr,
195                                            PROT_READ | PROT_WRITE,
196                                            vaddr ? MAP_FIXED : 0,
197                                            nb_pfn, pfns, err);
198         if (vaddr_base == NULL) {
199             perror("xenforeignmemory_map2");
200             exit(-1);
201         }
202     } else {
203         /*
204          * We create dummy mappings where we are unable to create a foreign
205          * mapping immediately due to certain circumstances (i.e. on resume now)
206          */
207         vaddr_base = mmap(vaddr, size, PROT_READ | PROT_WRITE,
208                           MAP_ANON | MAP_SHARED | (vaddr ? MAP_FIXED : 0),
209                           -1, 0);
210         if (vaddr_base == MAP_FAILED) {
211             perror("mmap");
212             exit(-1);
213         }
214     }
215 
216     if (!(entry->flags & XEN_MAPCACHE_ENTRY_DUMMY)) {
217         ram_block_notify_add(vaddr_base, size, size);
218     }
219 
220     entry->vaddr_base = vaddr_base;
221     entry->paddr_index = address_index;
222     entry->size = size;
223     entry->valid_mapping = g_new0(unsigned long,
224                                   BITS_TO_LONGS(size >> XC_PAGE_SHIFT));
225 
226     if (dummy) {
227         entry->flags |= XEN_MAPCACHE_ENTRY_DUMMY;
228     } else {
229         entry->flags &= ~(XEN_MAPCACHE_ENTRY_DUMMY);
230     }
231 
232     bitmap_zero(entry->valid_mapping, nb_pfn);
233     for (i = 0; i < nb_pfn; i++) {
234         if (!err[i]) {
235             bitmap_set(entry->valid_mapping, i, 1);
236         }
237     }
238 
239     g_free(pfns);
240     g_free(err);
241 }
242 
/*
 * Look up, or create, a host mapping for guest physical address @phys_addr.
 * xen_map_cache() calls this with the mapcache lock held.
 *
 * @phys_addr: guest physical address to map
 * @size:      number of bytes needed; 0 requests a single bucket and
 *             validates one page
 * @lock:      when non-zero the entry's lock count is incremented and a
 *             MapCacheRev record is queued on mapcache->locked_entries
 * @dma:       stored in the MapCacheRev record of a locked mapping
 *
 * Returns the host address corresponding to @phys_addr, or NULL when the
 * requested pages are not all marked valid in the entry's bitmap.
 */
static uint8_t *xen_map_cache_unlocked(hwaddr phys_addr, hwaddr size,
                                       uint8_t lock, bool dma)
{
    MapCacheEntry *entry, *pentry = NULL,
                  *free_entry = NULL, *free_pentry = NULL;
    hwaddr address_index;
    hwaddr address_offset;
    hwaddr cache_size = size;
    hwaddr test_bit_size;
    /* Only read and written when XEN_COMPAT_PHYSMAP is defined. */
    bool translated G_GNUC_UNUSED = false;
    bool dummy = false;

tryagain:
    address_index  = phys_addr >> MCACHE_BUCKET_SHIFT;
    address_offset = phys_addr & (MCACHE_BUCKET_SIZE - 1);

    trace_xen_map_cache(phys_addr);

    /* test_bit_size is always a multiple of XC_PAGE_SIZE */
    if (size) {
        test_bit_size = size + (phys_addr & (XC_PAGE_SIZE - 1));

        if (test_bit_size % XC_PAGE_SIZE) {
            test_bit_size += XC_PAGE_SIZE - (test_bit_size % XC_PAGE_SIZE);
        }
    } else {
        test_bit_size = XC_PAGE_SIZE;
    }

    /*
     * Fast path: an unlocked, size-less request hitting the bucket that was
     * returned last time, with the requested page valid.
     */
    if (mapcache->last_entry != NULL &&
        mapcache->last_entry->paddr_index == address_index &&
        !lock && !size &&
        test_bits(address_offset >> XC_PAGE_SHIFT,
                  test_bit_size >> XC_PAGE_SHIFT,
                  mapcache->last_entry->valid_mapping)) {
        trace_xen_map_cache_return(
            mapcache->last_entry->vaddr_base + address_offset
        );
        return mapcache->last_entry->vaddr_base + address_offset;
    }

    /* cache_size is rounded up to a multiple of MCACHE_BUCKET_SIZE */
    if (size) {
        cache_size = size + address_offset;
        if (cache_size % MCACHE_BUCKET_SIZE) {
            cache_size += MCACHE_BUCKET_SIZE - (cache_size % MCACHE_BUCKET_SIZE);
        }
    } else {
        cache_size = MCACHE_BUCKET_SIZE;
    }

    entry = &mapcache->entry[address_index % mapcache->nr_buckets];

    /*
     * Walk the bucket's chain.  The walk stops at the first entry that
     * matches the request, is not mapped at all, or - for an unlocked
     * request - is itself unlocked.  The first unlocked entry that was
     * skipped is remembered as a candidate for reuse.
     */
    while (entry && (lock || entry->lock) && entry->vaddr_base &&
            (entry->paddr_index != address_index || entry->size != cache_size ||
             !test_bits(address_offset >> XC_PAGE_SHIFT,
                 test_bit_size >> XC_PAGE_SHIFT,
                 entry->valid_mapping))) {
        if (!free_entry && !entry->lock) {
            free_entry = entry;
            free_pentry = pentry;
        }
        pentry = entry;
        entry = entry->next;
    }
    /* Nothing suitable in the chain: fall back to the reusable entry. */
    if (!entry && free_entry) {
        entry = free_entry;
        pentry = free_pentry;
    }
    if (!entry) {
        /*
         * Append a fresh entry.  pentry is non-NULL here because the bucket
         * head is an array element, so the loop above ran at least once.
         */
        entry = g_new0(MapCacheEntry, 1);
        pentry->next = entry;
        xen_remap_bucket(entry, NULL, cache_size, address_index, dummy);
    } else if (!entry->lock) {
        /* An unlocked entry may be remapped when it does not fit. */
        if (!entry->vaddr_base || entry->paddr_index != address_index ||
                entry->size != cache_size ||
                !test_bits(address_offset >> XC_PAGE_SHIFT,
                    test_bit_size >> XC_PAGE_SHIFT,
                    entry->valid_mapping)) {
            xen_remap_bucket(entry, NULL, cache_size, address_index, dummy);
        }
    }

    /* The requested pages are still not all valid: retry or give up. */
    if(!test_bits(address_offset >> XC_PAGE_SHIFT,
                test_bit_size >> XC_PAGE_SHIFT,
                entry->valid_mapping)) {
        mapcache->last_entry = NULL;
#ifdef XEN_COMPAT_PHYSMAP
        /* Retry once with the address translated by the registered hook. */
        if (!translated && mapcache->phys_offset_to_gaddr) {
            phys_addr = mapcache->phys_offset_to_gaddr(phys_addr, size);
            translated = true;
            goto tryagain;
        }
#endif
        /* During incoming migration retry once with a dummy mapping. */
        if (!dummy && runstate_check(RUN_STATE_INMIGRATE)) {
            dummy = true;
            goto tryagain;
        }
        trace_xen_map_cache_return(NULL);
        return NULL;
    }

    mapcache->last_entry = entry;
    if (lock) {
        MapCacheRev *reventry = g_new0(MapCacheRev, 1);
        entry->lock++;
        /* The counter wrapped around to zero. */
        if (entry->lock == 0) {
            error_report("mapcache entry lock overflow: "HWADDR_FMT_plx" -> %p",
                         entry->paddr_index, entry->vaddr_base);
            abort();
        }
        reventry->dma = dma;
        reventry->vaddr_req = mapcache->last_entry->vaddr_base + address_offset;
        reventry->paddr_index = mapcache->last_entry->paddr_index;
        reventry->size = entry->size;
        QTAILQ_INSERT_HEAD(&mapcache->locked_entries, reventry, next);
    }

    trace_xen_map_cache_return(
        mapcache->last_entry->vaddr_base + address_offset
    );
    return mapcache->last_entry->vaddr_base + address_offset;
}
366 
367 uint8_t *xen_map_cache(hwaddr phys_addr, hwaddr size,
368                        uint8_t lock, bool dma)
369 {
370     uint8_t *p;
371 
372     mapcache_lock();
373     p = xen_map_cache_unlocked(phys_addr, size, lock, dma);
374     mapcache_unlock();
375     return p;
376 }
377 
378 ram_addr_t xen_ram_addr_from_mapcache(void *ptr)
379 {
380     MapCacheEntry *entry = NULL;
381     MapCacheRev *reventry;
382     hwaddr paddr_index;
383     hwaddr size;
384     ram_addr_t raddr;
385     int found = 0;
386 
387     mapcache_lock();
388     QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
389         if (reventry->vaddr_req == ptr) {
390             paddr_index = reventry->paddr_index;
391             size = reventry->size;
392             found = 1;
393             break;
394         }
395     }
396     if (!found) {
397         trace_xen_ram_addr_from_mapcache_not_found(ptr);
398         QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
399             trace_xen_ram_addr_from_mapcache_found(reventry->paddr_index,
400                                                    reventry->vaddr_req);
401         }
402         abort();
403         return 0;
404     }
405 
406     entry = &mapcache->entry[paddr_index % mapcache->nr_buckets];
407     while (entry && (entry->paddr_index != paddr_index || entry->size != size)) {
408         entry = entry->next;
409     }
410     if (!entry) {
411         trace_xen_ram_addr_from_mapcache_not_in_cache(ptr);
412         raddr = 0;
413     } else {
414         raddr = (reventry->paddr_index << MCACHE_BUCKET_SHIFT) +
415              ((unsigned long) ptr - (unsigned long) entry->vaddr_base);
416     }
417     mapcache_unlock();
418     return raddr;
419 }
420 
421 static void xen_invalidate_map_cache_entry_unlocked(uint8_t *buffer)
422 {
423     MapCacheEntry *entry = NULL, *pentry = NULL;
424     MapCacheRev *reventry;
425     hwaddr paddr_index;
426     hwaddr size;
427     int found = 0;
428 
429     QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
430         if (reventry->vaddr_req == buffer) {
431             paddr_index = reventry->paddr_index;
432             size = reventry->size;
433             found = 1;
434             break;
435         }
436     }
437     if (!found) {
438         trace_xen_invalidate_map_cache_entry_unlocked_not_found(buffer);
439         QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
440             trace_xen_invalidate_map_cache_entry_unlocked_found(
441                 reventry->paddr_index,
442                 reventry->vaddr_req
443             );
444         }
445         return;
446     }
447     QTAILQ_REMOVE(&mapcache->locked_entries, reventry, next);
448     g_free(reventry);
449 
450     if (mapcache->last_entry != NULL &&
451         mapcache->last_entry->paddr_index == paddr_index) {
452         mapcache->last_entry = NULL;
453     }
454 
455     entry = &mapcache->entry[paddr_index % mapcache->nr_buckets];
456     while (entry && (entry->paddr_index != paddr_index || entry->size != size)) {
457         pentry = entry;
458         entry = entry->next;
459     }
460     if (!entry) {
461         trace_xen_invalidate_map_cache_entry_unlocked_miss(buffer);
462         return;
463     }
464     entry->lock--;
465     if (entry->lock > 0 || pentry == NULL) {
466         return;
467     }
468 
469     pentry->next = entry->next;
470     ram_block_notify_remove(entry->vaddr_base, entry->size, entry->size);
471     if (munmap(entry->vaddr_base, entry->size) != 0) {
472         perror("unmap fails");
473         exit(-1);
474     }
475     g_free(entry->valid_mapping);
476     g_free(entry);
477 }
478 
/*
 * Locked wrapper around xen_invalidate_map_cache_entry_unlocked(): releases
 * the mapping associated with @buffer while holding the mapcache lock.
 */
void xen_invalidate_map_cache_entry(uint8_t *buffer)
{
    uint8_t *host_addr = buffer;

    mapcache_lock();
    xen_invalidate_map_cache_entry_unlocked(host_addr);
    mapcache_unlock();
}
485 
486 void xen_invalidate_map_cache(void)
487 {
488     unsigned long i;
489     MapCacheRev *reventry;
490 
491     /* Flush pending AIO before destroying the mapcache */
492     bdrv_drain_all();
493 
494     mapcache_lock();
495 
496     QTAILQ_FOREACH(reventry, &mapcache->locked_entries, next) {
497         if (!reventry->dma) {
498             continue;
499         }
500         trace_xen_invalidate_map_cache(reventry->paddr_index,
501                                        reventry->vaddr_req);
502     }
503 
504     for (i = 0; i < mapcache->nr_buckets; i++) {
505         MapCacheEntry *entry = &mapcache->entry[i];
506 
507         if (entry->vaddr_base == NULL) {
508             continue;
509         }
510         if (entry->lock > 0) {
511             continue;
512         }
513 
514         if (munmap(entry->vaddr_base, entry->size) != 0) {
515             perror("unmap fails");
516             exit(-1);
517         }
518 
519         entry->paddr_index = 0;
520         entry->vaddr_base = NULL;
521         entry->size = 0;
522         g_free(entry->valid_mapping);
523         entry->valid_mapping = NULL;
524     }
525 
526     mapcache->last_entry = NULL;
527 
528     mapcache_unlock();
529 }
530 
531 static uint8_t *xen_replace_cache_entry_unlocked(hwaddr old_phys_addr,
532                                                  hwaddr new_phys_addr,
533                                                  hwaddr size)
534 {
535     MapCacheEntry *entry;
536     hwaddr address_index, address_offset;
537     hwaddr test_bit_size, cache_size = size;
538 
539     address_index  = old_phys_addr >> MCACHE_BUCKET_SHIFT;
540     address_offset = old_phys_addr & (MCACHE_BUCKET_SIZE - 1);
541 
542     assert(size);
543     /* test_bit_size is always a multiple of XC_PAGE_SIZE */
544     test_bit_size = size + (old_phys_addr & (XC_PAGE_SIZE - 1));
545     if (test_bit_size % XC_PAGE_SIZE) {
546         test_bit_size += XC_PAGE_SIZE - (test_bit_size % XC_PAGE_SIZE);
547     }
548     cache_size = size + address_offset;
549     if (cache_size % MCACHE_BUCKET_SIZE) {
550         cache_size += MCACHE_BUCKET_SIZE - (cache_size % MCACHE_BUCKET_SIZE);
551     }
552 
553     entry = &mapcache->entry[address_index % mapcache->nr_buckets];
554     while (entry && !(entry->paddr_index == address_index &&
555                       entry->size == cache_size)) {
556         entry = entry->next;
557     }
558     if (!entry) {
559         trace_xen_replace_cache_entry_unlocked(old_phys_addr);
560         return NULL;
561     }
562 
563     address_index  = new_phys_addr >> MCACHE_BUCKET_SHIFT;
564     address_offset = new_phys_addr & (MCACHE_BUCKET_SIZE - 1);
565 
566     trace_xen_replace_cache_entry_dummy(old_phys_addr, new_phys_addr);
567 
568     xen_remap_bucket(entry, entry->vaddr_base,
569                      cache_size, address_index, false);
570     if (!test_bits(address_offset >> XC_PAGE_SHIFT,
571                 test_bit_size >> XC_PAGE_SHIFT,
572                 entry->valid_mapping)) {
573         trace_xen_replace_cache_entry_unlocked_could_not_update_entry(
574             old_phys_addr
575         );
576         return NULL;
577     }
578 
579     return entry->vaddr_base + address_offset;
580 }
581 
582 uint8_t *xen_replace_cache_entry(hwaddr old_phys_addr,
583                                  hwaddr new_phys_addr,
584                                  hwaddr size)
585 {
586     uint8_t *p;
587 
588     mapcache_lock();
589     p = xen_replace_cache_entry_unlocked(old_phys_addr, new_phys_addr, size);
590     mapcache_unlock();
591     return p;
592 }
593