1794e8f30SMichael S. Tsirkin /*
2794e8f30SMichael S. Tsirkin * Support for RAM backed by mmaped host memory.
3794e8f30SMichael S. Tsirkin *
4794e8f30SMichael S. Tsirkin * Copyright (c) 2015 Red Hat, Inc.
5794e8f30SMichael S. Tsirkin *
6794e8f30SMichael S. Tsirkin * Authors:
7794e8f30SMichael S. Tsirkin * Michael S. Tsirkin <mst@redhat.com>
8794e8f30SMichael S. Tsirkin *
9794e8f30SMichael S. Tsirkin * This work is licensed under the terms of the GNU GPL, version 2 or
10794e8f30SMichael S. Tsirkin * later. See the COPYING file in the top-level directory.
11794e8f30SMichael S. Tsirkin */
12a9c94277SMarkus Armbruster
13119906afSZhang Yi #ifdef CONFIG_LINUX
14119906afSZhang Yi #include <linux/mman.h>
15119906afSZhang Yi #else /* !CONFIG_LINUX */
16119906afSZhang Yi #define MAP_SYNC 0x0
17119906afSZhang Yi #define MAP_SHARED_VALIDATE 0x0
18119906afSZhang Yi #endif /* CONFIG_LINUX */
19119906afSZhang Yi
20aafd7584SPeter Maydell #include "qemu/osdep.h"
21a9c94277SMarkus Armbruster #include "qemu/mmap-alloc.h"
224a3ecf20SCao jin #include "qemu/host-utils.h"
23d94e0bc9SDavid Hildenbrand #include "qemu/cutils.h"
248dbe22c6SDavid Hildenbrand #include "qemu/error-report.h"
25794e8f30SMichael S. Tsirkin
267197fb40SMichael S. Tsirkin #define HUGETLBFS_MAGIC 0x958458f6
277197fb40SMichael S. Tsirkin
287197fb40SMichael S. Tsirkin #ifdef CONFIG_LINUX
297197fb40SMichael S. Tsirkin #include <sys/vfs.h>
30*fa45f8daSPeter Xu #include <linux/magic.h>
317197fb40SMichael S. Tsirkin #endif
327197fb40SMichael S. Tsirkin
qemu_fd_getfs(int fd)33*fa45f8daSPeter Xu QemuFsType qemu_fd_getfs(int fd)
34*fa45f8daSPeter Xu {
35*fa45f8daSPeter Xu #ifdef CONFIG_LINUX
36*fa45f8daSPeter Xu struct statfs fs;
37*fa45f8daSPeter Xu int ret;
38*fa45f8daSPeter Xu
39*fa45f8daSPeter Xu if (fd < 0) {
40*fa45f8daSPeter Xu return QEMU_FS_TYPE_UNKNOWN;
41*fa45f8daSPeter Xu }
42*fa45f8daSPeter Xu
43*fa45f8daSPeter Xu do {
44*fa45f8daSPeter Xu ret = fstatfs(fd, &fs);
45*fa45f8daSPeter Xu } while (ret != 0 && errno == EINTR);
46*fa45f8daSPeter Xu
47*fa45f8daSPeter Xu switch (fs.f_type) {
48*fa45f8daSPeter Xu case TMPFS_MAGIC:
49*fa45f8daSPeter Xu return QEMU_FS_TYPE_TMPFS;
50*fa45f8daSPeter Xu case HUGETLBFS_MAGIC:
51*fa45f8daSPeter Xu return QEMU_FS_TYPE_HUGETLBFS;
52*fa45f8daSPeter Xu default:
53*fa45f8daSPeter Xu return QEMU_FS_TYPE_UNKNOWN;
54*fa45f8daSPeter Xu }
55*fa45f8daSPeter Xu #else
56*fa45f8daSPeter Xu return QEMU_FS_TYPE_UNKNOWN;
57*fa45f8daSPeter Xu #endif
58*fa45f8daSPeter Xu }
59*fa45f8daSPeter Xu
qemu_fd_getpagesize(int fd)607197fb40SMichael S. Tsirkin size_t qemu_fd_getpagesize(int fd)
617197fb40SMichael S. Tsirkin {
627197fb40SMichael S. Tsirkin #ifdef CONFIG_LINUX
637197fb40SMichael S. Tsirkin struct statfs fs;
647197fb40SMichael S. Tsirkin int ret;
657197fb40SMichael S. Tsirkin
667197fb40SMichael S. Tsirkin if (fd != -1) {
677197fb40SMichael S. Tsirkin do {
687197fb40SMichael S. Tsirkin ret = fstatfs(fd, &fs);
697197fb40SMichael S. Tsirkin } while (ret != 0 && errno == EINTR);
707197fb40SMichael S. Tsirkin
717197fb40SMichael S. Tsirkin if (ret == 0 && fs.f_type == HUGETLBFS_MAGIC) {
727197fb40SMichael S. Tsirkin return fs.f_bsize;
737197fb40SMichael S. Tsirkin }
747197fb40SMichael S. Tsirkin }
7557d1f6d7SPeter Maydell #ifdef __sparc__
7657d1f6d7SPeter Maydell /* SPARC Linux needs greater alignment than the pagesize */
7757d1f6d7SPeter Maydell return QEMU_VMALLOC_ALIGN;
7857d1f6d7SPeter Maydell #endif
797197fb40SMichael S. Tsirkin #endif
807197fb40SMichael S. Tsirkin
818e3b0cbbSMarc-André Lureau return qemu_real_host_page_size();
827197fb40SMichael S. Tsirkin }
837197fb40SMichael S. Tsirkin
84d94e0bc9SDavid Hildenbrand #define OVERCOMMIT_MEMORY_PATH "/proc/sys/vm/overcommit_memory"
map_noreserve_effective(int fd,uint32_t qemu_map_flags)85d94e0bc9SDavid Hildenbrand static bool map_noreserve_effective(int fd, uint32_t qemu_map_flags)
86d94e0bc9SDavid Hildenbrand {
87d94e0bc9SDavid Hildenbrand #if defined(__linux__)
88d94e0bc9SDavid Hildenbrand const bool readonly = qemu_map_flags & QEMU_MAP_READONLY;
89d94e0bc9SDavid Hildenbrand const bool shared = qemu_map_flags & QEMU_MAP_SHARED;
90d94e0bc9SDavid Hildenbrand gchar *content = NULL;
91d94e0bc9SDavid Hildenbrand const char *endptr;
92d94e0bc9SDavid Hildenbrand unsigned int tmp;
93d94e0bc9SDavid Hildenbrand
94d94e0bc9SDavid Hildenbrand /*
95d94e0bc9SDavid Hildenbrand * hugeltb accounting is different than ordinary swap reservation:
96d94e0bc9SDavid Hildenbrand * a) Hugetlb pages from the pool are reserved for both private and
97d94e0bc9SDavid Hildenbrand * shared mappings. For shared mappings, all mappers have to specify
98d94e0bc9SDavid Hildenbrand * MAP_NORESERVE.
99d94e0bc9SDavid Hildenbrand * b) MAP_NORESERVE is not affected by /proc/sys/vm/overcommit_memory.
100d94e0bc9SDavid Hildenbrand */
1018e3b0cbbSMarc-André Lureau if (qemu_fd_getpagesize(fd) != qemu_real_host_page_size()) {
102d94e0bc9SDavid Hildenbrand return true;
103d94e0bc9SDavid Hildenbrand }
104d94e0bc9SDavid Hildenbrand
105d94e0bc9SDavid Hildenbrand /*
106d94e0bc9SDavid Hildenbrand * Accountable mappings in the kernel that can be affected by MAP_NORESEVE
107d94e0bc9SDavid Hildenbrand * are private writable mappings (see mm/mmap.c:accountable_mapping() in
108d94e0bc9SDavid Hildenbrand * Linux). For all shared or readonly mappings, MAP_NORESERVE is always
109d94e0bc9SDavid Hildenbrand * implicitly active -- no reservation; this includes shmem. The only
110d94e0bc9SDavid Hildenbrand * exception is shared anonymous memory, it is accounted like private
111d94e0bc9SDavid Hildenbrand * anonymous memory.
112d94e0bc9SDavid Hildenbrand */
113d94e0bc9SDavid Hildenbrand if (readonly || (shared && fd >= 0)) {
114d94e0bc9SDavid Hildenbrand return true;
115d94e0bc9SDavid Hildenbrand }
116d94e0bc9SDavid Hildenbrand
117d94e0bc9SDavid Hildenbrand /*
118d94e0bc9SDavid Hildenbrand * MAP_NORESERVE is globally ignored for applicable !hugetlb mappings when
119d94e0bc9SDavid Hildenbrand * memory overcommit is set to "never". Sparse memory regions aren't really
120d94e0bc9SDavid Hildenbrand * possible in this system configuration.
121d94e0bc9SDavid Hildenbrand *
122d94e0bc9SDavid Hildenbrand * Bail out now instead of silently committing way more memory than
123d94e0bc9SDavid Hildenbrand * currently desired by the user.
124d94e0bc9SDavid Hildenbrand */
125d94e0bc9SDavid Hildenbrand if (g_file_get_contents(OVERCOMMIT_MEMORY_PATH, &content, NULL, NULL) &&
126d94e0bc9SDavid Hildenbrand !qemu_strtoui(content, &endptr, 0, &tmp) &&
127d94e0bc9SDavid Hildenbrand (!endptr || *endptr == '\n')) {
128d94e0bc9SDavid Hildenbrand if (tmp == 2) {
129d94e0bc9SDavid Hildenbrand error_report("Skipping reservation of swap space is not supported:"
130d94e0bc9SDavid Hildenbrand " \"" OVERCOMMIT_MEMORY_PATH "\" is \"2\"");
131d94e0bc9SDavid Hildenbrand return false;
132d94e0bc9SDavid Hildenbrand }
133d94e0bc9SDavid Hildenbrand return true;
134d94e0bc9SDavid Hildenbrand }
135d94e0bc9SDavid Hildenbrand /* this interface has been around since Linux 2.6 */
136d94e0bc9SDavid Hildenbrand error_report("Skipping reservation of swap space is not supported:"
137d94e0bc9SDavid Hildenbrand " Could not read: \"" OVERCOMMIT_MEMORY_PATH "\"");
138d94e0bc9SDavid Hildenbrand return false;
139d94e0bc9SDavid Hildenbrand #endif
140d94e0bc9SDavid Hildenbrand /*
141d94e0bc9SDavid Hildenbrand * E.g., FreeBSD used to define MAP_NORESERVE, never implemented it,
142d94e0bc9SDavid Hildenbrand * and removed it a while ago.
143d94e0bc9SDavid Hildenbrand */
144d94e0bc9SDavid Hildenbrand error_report("Skipping reservation of swap space is not supported");
145d94e0bc9SDavid Hildenbrand return false;
146d94e0bc9SDavid Hildenbrand }
147d94e0bc9SDavid Hildenbrand
14801c26ad6SDavid Hildenbrand /*
14901c26ad6SDavid Hildenbrand * Reserve a new memory region of the requested size to be used for mapping
15001c26ad6SDavid Hildenbrand * from the given fd (if any).
15101c26ad6SDavid Hildenbrand */
mmap_reserve(size_t size,int fd)15201c26ad6SDavid Hildenbrand static void *mmap_reserve(size_t size, int fd)
15301c26ad6SDavid Hildenbrand {
15401c26ad6SDavid Hildenbrand int flags = MAP_PRIVATE;
15501c26ad6SDavid Hildenbrand
15601c26ad6SDavid Hildenbrand #if defined(__powerpc64__) && defined(__linux__)
15701c26ad6SDavid Hildenbrand /*
15801c26ad6SDavid Hildenbrand * On ppc64 mappings in the same segment (aka slice) must share the same
15901c26ad6SDavid Hildenbrand * page size. Since we will be re-allocating part of this segment
16001c26ad6SDavid Hildenbrand * from the supplied fd, we should make sure to use the same page size, to
16101c26ad6SDavid Hildenbrand * this end we mmap the supplied fd. In this case, set MAP_NORESERVE to
16201c26ad6SDavid Hildenbrand * avoid allocating backing store memory.
16301c26ad6SDavid Hildenbrand * We do this unless we are using the system page size, in which case
16401c26ad6SDavid Hildenbrand * anonymous memory is OK.
16501c26ad6SDavid Hildenbrand */
1668e3b0cbbSMarc-André Lureau if (fd == -1 || qemu_fd_getpagesize(fd) == qemu_real_host_page_size()) {
16701c26ad6SDavid Hildenbrand fd = -1;
16801c26ad6SDavid Hildenbrand flags |= MAP_ANONYMOUS;
16901c26ad6SDavid Hildenbrand } else {
17001c26ad6SDavid Hildenbrand flags |= MAP_NORESERVE;
17101c26ad6SDavid Hildenbrand }
17201c26ad6SDavid Hildenbrand #else
17301c26ad6SDavid Hildenbrand fd = -1;
17401c26ad6SDavid Hildenbrand flags |= MAP_ANONYMOUS;
17501c26ad6SDavid Hildenbrand #endif
17601c26ad6SDavid Hildenbrand
17701c26ad6SDavid Hildenbrand return mmap(0, size, PROT_NONE, flags, fd, 0);
17801c26ad6SDavid Hildenbrand }
17901c26ad6SDavid Hildenbrand
180d01cbf82SDavid Hildenbrand /*
181d01cbf82SDavid Hildenbrand * Activate memory in a reserved region from the given fd (if any), to make
182d01cbf82SDavid Hildenbrand * it accessible.
183d01cbf82SDavid Hildenbrand */
mmap_activate(void * ptr,size_t size,int fd,uint32_t qemu_map_flags,off_t map_offset)184b444f5c0SDavid Hildenbrand static void *mmap_activate(void *ptr, size_t size, int fd,
185b444f5c0SDavid Hildenbrand uint32_t qemu_map_flags, off_t map_offset)
186d01cbf82SDavid Hildenbrand {
1878dbe22c6SDavid Hildenbrand const bool noreserve = qemu_map_flags & QEMU_MAP_NORESERVE;
188b444f5c0SDavid Hildenbrand const bool readonly = qemu_map_flags & QEMU_MAP_READONLY;
189b444f5c0SDavid Hildenbrand const bool shared = qemu_map_flags & QEMU_MAP_SHARED;
190b444f5c0SDavid Hildenbrand const bool sync = qemu_map_flags & QEMU_MAP_SYNC;
191d01cbf82SDavid Hildenbrand const int prot = PROT_READ | (readonly ? 0 : PROT_WRITE);
192d01cbf82SDavid Hildenbrand int map_sync_flags = 0;
193d01cbf82SDavid Hildenbrand int flags = MAP_FIXED;
194d01cbf82SDavid Hildenbrand void *activated_ptr;
195d01cbf82SDavid Hildenbrand
196d94e0bc9SDavid Hildenbrand if (noreserve && !map_noreserve_effective(fd, qemu_map_flags)) {
1978dbe22c6SDavid Hildenbrand return MAP_FAILED;
1988dbe22c6SDavid Hildenbrand }
1998dbe22c6SDavid Hildenbrand
200d01cbf82SDavid Hildenbrand flags |= fd == -1 ? MAP_ANONYMOUS : 0;
201d01cbf82SDavid Hildenbrand flags |= shared ? MAP_SHARED : MAP_PRIVATE;
202d94e0bc9SDavid Hildenbrand flags |= noreserve ? MAP_NORESERVE : 0;
203b444f5c0SDavid Hildenbrand if (shared && sync) {
204d01cbf82SDavid Hildenbrand map_sync_flags = MAP_SYNC | MAP_SHARED_VALIDATE;
205d01cbf82SDavid Hildenbrand }
206d01cbf82SDavid Hildenbrand
207d01cbf82SDavid Hildenbrand activated_ptr = mmap(ptr, size, prot, flags | map_sync_flags, fd,
208d01cbf82SDavid Hildenbrand map_offset);
209d01cbf82SDavid Hildenbrand if (activated_ptr == MAP_FAILED && map_sync_flags) {
210d01cbf82SDavid Hildenbrand if (errno == ENOTSUP) {
211d01cbf82SDavid Hildenbrand char *proc_link = g_strdup_printf("/proc/self/fd/%d", fd);
212d01cbf82SDavid Hildenbrand char *file_name = g_malloc0(PATH_MAX);
213d01cbf82SDavid Hildenbrand int len = readlink(proc_link, file_name, PATH_MAX - 1);
214d01cbf82SDavid Hildenbrand
215d01cbf82SDavid Hildenbrand if (len < 0) {
216d01cbf82SDavid Hildenbrand len = 0;
217d01cbf82SDavid Hildenbrand }
218d01cbf82SDavid Hildenbrand file_name[len] = '\0';
219d01cbf82SDavid Hildenbrand fprintf(stderr, "Warning: requesting persistence across crashes "
220d01cbf82SDavid Hildenbrand "for backend file %s failed. Proceeding without "
221d01cbf82SDavid Hildenbrand "persistence, data might become corrupted in case of host "
222d01cbf82SDavid Hildenbrand "crash.\n", file_name);
223d01cbf82SDavid Hildenbrand g_free(proc_link);
224d01cbf82SDavid Hildenbrand g_free(file_name);
225cdcf766dSIgor Mammedov warn_report("Using non DAX backing file with 'pmem=on' option"
226cdcf766dSIgor Mammedov " is deprecated");
227d01cbf82SDavid Hildenbrand }
228d01cbf82SDavid Hildenbrand /*
229d01cbf82SDavid Hildenbrand * If mmap failed with MAP_SHARED_VALIDATE | MAP_SYNC, we will try
230d01cbf82SDavid Hildenbrand * again without these flags to handle backwards compatibility.
231d01cbf82SDavid Hildenbrand */
232d01cbf82SDavid Hildenbrand activated_ptr = mmap(ptr, size, prot, flags, fd, map_offset);
233d01cbf82SDavid Hildenbrand }
234d01cbf82SDavid Hildenbrand return activated_ptr;
235d01cbf82SDavid Hildenbrand }
236d01cbf82SDavid Hildenbrand
/*
 * Size of the PROT_NONE guard page left after a RAM block mapped via
 * qemu_ram_mmap(); also used as the minimum alignment for such mappings.
 */
static inline size_t mmap_guard_pagesize(int fd)
{
#if defined(__powerpc64__) && defined(__linux__)
    /* Mappings in the same segment must share the same page size */
    return qemu_fd_getpagesize(fd);
#else
    return qemu_real_host_page_size();
#endif
}
246adad0b3aSDavid Hildenbrand
/*
 * Map RAM of @size bytes at an @align-aligned address, backed by @fd (or
 * anonymous memory when @fd is -1), honouring @qemu_map_flags and starting
 * at @map_offset within the fd. A PROT_NONE guard page is kept mapped
 * directly after the block. Returns MAP_FAILED on error.
 */
void *qemu_ram_mmap(int fd,
                    size_t size,
                    size_t align,
                    uint32_t qemu_map_flags,
                    off_t map_offset)
{
    const size_t guard_pagesize = mmap_guard_pagesize(fd);
    size_t reserved, head_pad;
    void *base, *aligned;

    /*
     * Over-reserve by @align bytes so an aligned address can be carved out
     * of the reservation. Note that this always reserves at least one
     * extra page of virtual address space, even if @size is already
     * aligned.
     */
    reserved = size + align;

    base = mmap_reserve(reserved, fd);
    if (base == MAP_FAILED) {
        return MAP_FAILED;
    }

    assert(is_power_of_2(align));
    /* Always align to host page size */
    assert(align >= guard_pagesize);

    /* Bytes between the reservation start and the first aligned address. */
    head_pad = QEMU_ALIGN_UP((uintptr_t)base, align) - (uintptr_t)base;

    aligned = mmap_activate(base + head_pad, size, fd, qemu_map_flags,
                            map_offset);
    if (aligned == MAP_FAILED) {
        munmap(base, reserved);
        return MAP_FAILED;
    }

    /* Release the unused head of the reservation, if any. */
    if (head_pad > 0) {
        munmap(base, head_pad);
    }

    /*
     * Leave a single PROT_NONE page allocated after the RAM block, to serve
     * as a guard page guarding against potential buffer overflows; unmap
     * whatever remains of the reservation beyond that.
     */
    reserved -= head_pad;
    if (reserved > size + guard_pagesize) {
        munmap(aligned + size + guard_pagesize,
               reserved - size - guard_pagesize);
    }

    return aligned;
}
296794e8f30SMichael S. Tsirkin
/*
 * Release a RAM block previously returned by qemu_ram_mmap(); @fd and
 * @size must match the original call. A NULL @ptr is a no-op.
 */
void qemu_ram_munmap(int fd, void *ptr, size_t size)
{
    if (!ptr) {
        return;
    }
    /* Unmap both the RAM block and the trailing guard page in one go. */
    munmap(ptr, size + mmap_guard_pagesize(fd));
}
304