/*
 * Support for RAM backed by mmapped host memory.
 *
 * Copyright (c) 2015 Red Hat, Inc.
 *
 * Authors:
 *  Michael S. Tsirkin <mst@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later.  See the COPYING file in the top-level directory.
 */

#ifdef CONFIG_LINUX
#include <linux/mman.h>
#else  /* !CONFIG_LINUX */
#define MAP_SYNC              0x0
#define MAP_SHARED_VALIDATE   0x0
#endif /* CONFIG_LINUX */
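
/*
 * On non-Linux hosts the two flags above are defined to 0, so OR-ing them
 * into an mmap() flag set later in this file is a harmless no-op; MAP_SYNC
 * semantics are simply unavailable there.
 */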

#include "qemu/osdep.h"
#include "qemu/mmap-alloc.h"
#include "qemu/host-utils.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"

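/* Superblock magic for hugetlbfs, as reported by fstatfs() in statfs.f_type */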
#define HUGETLBFS_MAGIC       0x958458f6

#ifdef CONFIG_LINUX
#include <sys/vfs.h>
#include <linux/magic.h>
#endif

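/*
 * Determine the filesystem type backing @fd by matching the statfs magic
 * number. Only tmpfs and hugetlbfs are distinguished; everything else,
 * including errors and non-Linux hosts, maps to QEMU_FS_TYPE_UNKNOWN.
 */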
QemuFsType qemu_fd_getfs(int fd)
{
#ifdef CONFIG_LINUX
    struct statfs fs;
    int ret;

    if (fd < 0) {
        return QEMU_FS_TYPE_UNKNOWN;
    }

    do {
        ret = fstatfs(fd, &fs);
    } while (ret != 0 && errno == EINTR);

    if (ret != 0) {
        /* fstatfs() failed: fs is uninitialized, so don't inspect it */
        return QEMU_FS_TYPE_UNKNOWN;
    }

    switch (fs.f_type) {
    case TMPFS_MAGIC:
        return QEMU_FS_TYPE_TMPFS;
    case HUGETLBFS_MAGIC:
        return QEMU_FS_TYPE_HUGETLBFS;
    default:
        return QEMU_FS_TYPE_UNKNOWN;
    }
#else
    return QEMU_FS_TYPE_UNKNOWN;
#endif
}

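/*
 * Return the page size to assume for mappings of @fd: the hugetlbfs block
 * size when @fd lives on hugetlbfs, otherwise the host page size (or
 * QEMU_VMALLOC_ALIGN on SPARC Linux, which needs greater alignment).
 */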
size_t qemu_fd_getpagesize(int fd)
{
#ifdef CONFIG_LINUX
    struct statfs fs;
    int ret;

    if (fd != -1) {
        do {
            ret = fstatfs(fd, &fs);
        } while (ret != 0 && errno == EINTR);

        if (ret == 0 && fs.f_type == HUGETLBFS_MAGIC) {
            return fs.f_bsize;
        }
    }
#ifdef __sparc__
    /* SPARC Linux needs greater alignment than the pagesize */
    return QEMU_VMALLOC_ALIGN;
#endif
#endif

    return qemu_real_host_page_size();
}

#define OVERCOMMIT_MEMORY_PATH "/proc/sys/vm/overcommit_memory"
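/*
 * Check whether a request for MAP_NORESERVE can actually take effect for
 * the given mapping, so callers can fail loudly instead of silently
 * reserving swap space the user asked to skip.
 */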
static bool map_noreserve_effective(int fd, uint32_t qemu_map_flags)
{
#if defined(__linux__)
    const bool readonly = qemu_map_flags & QEMU_MAP_READONLY;
    const bool shared = qemu_map_flags & QEMU_MAP_SHARED;
    g_autofree gchar *content = NULL; /* freed on every return path */
    const char *endptr;
    unsigned int tmp;

    /*
     * hugetlb accounting is different from ordinary swap reservation:
     * a) Hugetlb pages from the pool are reserved for both private and
     *    shared mappings. For shared mappings, all mappers have to specify
     *    MAP_NORESERVE.
     * b) MAP_NORESERVE is not affected by /proc/sys/vm/overcommit_memory.
     */
    if (qemu_fd_getpagesize(fd) != qemu_real_host_page_size()) {
        return true;
    }

    /*
     * Accountable mappings in the kernel that can be affected by
     * MAP_NORESERVE are private writable mappings (see
     * mm/mmap.c:accountable_mapping() in Linux). For all shared or readonly
     * mappings, MAP_NORESERVE is always implicitly active -- no reservation;
     * this includes shmem. The only exception is shared anonymous memory,
     * which is accounted like private anonymous memory.
     */
    if (readonly || (shared && fd >= 0)) {
        return true;
    }

    /*
     * MAP_NORESERVE is globally ignored for applicable !hugetlb mappings when
     * memory overcommit is set to "never". Sparse memory regions aren't really
     * possible in this system configuration.
     *
     * Bail out now instead of silently committing way more memory than
     * currently desired by the user.
     */
    if (g_file_get_contents(OVERCOMMIT_MEMORY_PATH, &content, NULL, NULL) &&
        !qemu_strtoui(content, &endptr, 0, &tmp) &&
        (!endptr || *endptr == '\n')) {
        if (tmp == 2) {
            error_report("Skipping reservation of swap space is not supported:"
                         " \"" OVERCOMMIT_MEMORY_PATH "\" is \"2\"");
            return false;
        }
        return true;
    }
    /* this interface has been around since Linux 2.6 */
    error_report("Skipping reservation of swap space is not supported:"
                 " Could not read: \"" OVERCOMMIT_MEMORY_PATH "\"");
    return false;
#endif
    /*
     * E.g., FreeBSD used to define MAP_NORESERVE, never implemented it,
     * and removed it a while ago.
     */
    error_report("Skipping reservation of swap space is not supported");
    return false;
}

/*
 * Reserve a new memory region of the requested size to be used for mapping
 * from the given fd (if any).
 */
static void *mmap_reserve(size_t size, int fd)
{
    int flags = MAP_PRIVATE;

#if defined(__powerpc64__) && defined(__linux__)
    /*
     * On ppc64 mappings in the same segment (aka slice) must share the same
     * page size. Since we will be re-allocating part of this segment
     * from the supplied fd, we should make sure to use the same page size; to
     * this end we mmap the supplied fd.  In this case, set MAP_NORESERVE to
     * avoid allocating backing store memory.
     * We do this unless we are using the system page size, in which case
     * anonymous memory is OK.
     */
    if (fd == -1 || qemu_fd_getpagesize(fd) == qemu_real_host_page_size()) {
        fd = -1;
        flags |= MAP_ANONYMOUS;
    } else {
        flags |= MAP_NORESERVE;
    }
#else
    fd = -1;
    flags |= MAP_ANONYMOUS;
#endif

    return mmap(0, size, PROT_NONE, flags, fd, 0);
}

/*
 * Activate memory in a reserved region from the given fd (if any), to make
 * it accessible.
 */
static void *mmap_activate(void *ptr, size_t size, int fd,
                           uint32_t qemu_map_flags, off_t map_offset)
{
    const bool noreserve = qemu_map_flags & QEMU_MAP_NORESERVE;
    const bool readonly = qemu_map_flags & QEMU_MAP_READONLY;
    const bool shared = qemu_map_flags & QEMU_MAP_SHARED;
    const bool sync = qemu_map_flags & QEMU_MAP_SYNC;
    const int prot = PROT_READ | (readonly ? 0 : PROT_WRITE);
    int map_sync_flags = 0;
    int flags = MAP_FIXED;
    void *activated_ptr;

    if (noreserve && !map_noreserve_effective(fd, qemu_map_flags)) {
        return MAP_FAILED;
    }

    flags |= fd == -1 ? MAP_ANONYMOUS : 0;
    flags |= shared ? MAP_SHARED : MAP_PRIVATE;
    flags |= noreserve ? MAP_NORESERVE : 0;
    if (shared && sync) {
        map_sync_flags = MAP_SYNC | MAP_SHARED_VALIDATE;
    }

    activated_ptr = mmap(ptr, size, prot, flags | map_sync_flags, fd,
                         map_offset);
    if (activated_ptr == MAP_FAILED && map_sync_flags) {
        if (errno == ENOTSUP) {
            char *proc_link = g_strdup_printf("/proc/self/fd/%d", fd);
            char *file_name = g_malloc0(PATH_MAX);
            int len = readlink(proc_link, file_name, PATH_MAX - 1);

            if (len < 0) {
                len = 0;
            }
            file_name[len] = '\0';
            fprintf(stderr, "Warning: requesting persistence across crashes "
                    "for backend file %s failed. Proceeding without "
                    "persistence, data might become corrupted in case of host "
                    "crash.\n", file_name);
            g_free(proc_link);
            g_free(file_name);
            warn_report("Using non DAX backing file with 'pmem=on' option"
                        " is deprecated");
        }
        /*
         * If mmap failed with MAP_SHARED_VALIDATE | MAP_SYNC, we will try
         * again without these flags to handle backwards compatibility.
         */
        activated_ptr = mmap(ptr, size, prot, flags, fd, map_offset);
    }
    return activated_ptr;
}

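/*
 * Size of the PROT_NONE guard page left after the RAM block; on ppc64 it
 * must match the page size of the neighbouring mapping.
 */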
static inline size_t mmap_guard_pagesize(int fd)
{
#if defined(__powerpc64__) && defined(__linux__)
    /* Mappings in the same segment must share the same page size */
    return qemu_fd_getpagesize(fd);
#else
    return qemu_real_host_page_size();
#endif
}

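/*
 * Allocate RAM in two steps: mmap_reserve() grabs size + align bytes of
 * PROT_NONE address space, then mmap_activate() maps the usable region
 * MAP_FIXED at the first @align-aligned address inside it. The slack in
 * front of and behind the aligned region is unmapped again, except for one
 * trailing guard page:
 *
 *   guardptr               ptr = guardptr + offset
 *   |<-- offset (freed) -->|<-- size (usable) -->|<- guard page ->| (freed)
 */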
void *qemu_ram_mmap(int fd,
                    size_t size,
                    size_t align,
                    uint32_t qemu_map_flags,
                    off_t map_offset)
{
    const size_t guard_pagesize = mmap_guard_pagesize(fd);
    size_t offset, total;
    void *ptr, *guardptr;

    /*
     * Note: this always allocates at least one extra page of virtual address
     * space, even if size is already aligned.
     */
    total = size + align;

    guardptr = mmap_reserve(total, fd);
    if (guardptr == MAP_FAILED) {
        return MAP_FAILED;
    }

    assert(is_power_of_2(align));
    /* Always align to host page size */
    assert(align >= guard_pagesize);

    offset = QEMU_ALIGN_UP((uintptr_t)guardptr, align) - (uintptr_t)guardptr;

    ptr = mmap_activate(guardptr + offset, size, fd, qemu_map_flags,
                        map_offset);
    if (ptr == MAP_FAILED) {
        munmap(guardptr, total);
        return MAP_FAILED;
    }

    if (offset > 0) {
        munmap(guardptr, offset);
    }

    /*
     * Leave a single PROT_NONE page allocated after the RAM block, to serve
     * as a guard page against potential buffer overflows.
     */
    total -= offset;
    if (total > size + guard_pagesize) {
        munmap(ptr + size + guard_pagesize, total - size - guard_pagesize);
    }

    return ptr;
}

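/*
 * Undo qemu_ram_mmap(): @fd and @size must match the original call so the
 * trailing guard page is sized and unmapped correctly.
 */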
void qemu_ram_munmap(int fd, void *ptr, size_t size)
{
    if (ptr) {
        /* Unmap both the RAM block and the guard page */
        munmap(ptr, size + mmap_guard_pagesize(fd));
    }
}
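
/*
 * Usage sketch (hypothetical caller, not part of this file): a private,
 * anonymous RAM block aligned to 2 MiB, torn down again afterwards. A
 * qemu_map_flags value of 0 means read-write, private, and reserved;
 * error handling beyond the MAP_FAILED check is elided.
 *
 *     size_t size = 16 * 1024 * 1024;
 *     void *block = qemu_ram_mmap(-1, size, 2 * 1024 * 1024, 0, 0);
 *
 *     if (block == MAP_FAILED) {
 *         error_report("RAM allocation failed: %s", strerror(errno));
 *     } else {
 *         // ... use the block as guest RAM ...
 *         qemu_ram_munmap(-1, block, size);
 *     }
 */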