xref: /openbmc/qemu/linux-user/mmap.c (revision 3871be753f3351c21c8e384432f7798c3eed9de9)
1 /*
2  *  mmap support for qemu
3  *
4  *  Copyright (c) 2003 Fabrice Bellard
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation; either version 2 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This program is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with this program; if not, see <http://www.gnu.org/licenses/>.
18  */
19 #include "qemu/osdep.h"
20 #include <sys/shm.h>
21 #include "trace.h"
22 #include "exec/log.h"
23 #include "qemu.h"
24 #include "user-internals.h"
25 #include "user-mmap.h"
26 #include "target_mman.h"
27 #include "qemu/interval-tree.h"
28 
29 #ifdef TARGET_ARM
30 #include "target/arm/cpu-features.h"
31 #endif
32 
/*
 * State behind mmap_lock()/mmap_unlock().
 *
 * mmap_mutex is the mutex itself.  mmap_lock_count is a per-thread nesting
 * depth: mmap_lock() only locks the mutex when the depth goes from 0 to 1,
 * and mmap_unlock() only unlocks it when the depth returns to 0, so a
 * thread may take the lock again while already holding it.
 */
static pthread_mutex_t mmap_mutex = PTHREAD_MUTEX_INITIALIZER;
static __thread int mmap_lock_count;
35 
36 void mmap_lock(void)
37 {
38     if (mmap_lock_count++ == 0) {
39         pthread_mutex_lock(&mmap_mutex);
40     }
41 }
42 
43 void mmap_unlock(void)
44 {
45     assert(mmap_lock_count > 0);
46     if (--mmap_lock_count == 0) {
47         pthread_mutex_unlock(&mmap_mutex);
48     }
49 }
50 
51 bool have_mmap_lock(void)
52 {
53     return mmap_lock_count > 0 ? true : false;
54 }
55 
56 /* Grab lock to make sure things are in a consistent state after fork().  */
57 void mmap_fork_start(void)
58 {
59     if (mmap_lock_count)
60         abort();
61     pthread_mutex_lock(&mmap_mutex);
62 }
63 
64 void mmap_fork_end(int child)
65 {
66     if (child) {
67         pthread_mutex_init(&mmap_mutex, NULL);
68     } else {
69         pthread_mutex_unlock(&mmap_mutex);
70     }
71 }
72 
/*
 * Interval tree of [start, last] guest address ranges, filled by
 * shm_region_add(), queried by shm_region_find() and pruned by
 * shm_region_rm_complete().
 * Protected by mmap_lock.
 */
static IntervalTreeRoot shm_regions;
75 
76 static void shm_region_add(abi_ptr start, abi_ptr last)
77 {
78     IntervalTreeNode *i = g_new0(IntervalTreeNode, 1);
79 
80     i->start = start;
81     i->last = last;
82     interval_tree_insert(i, &shm_regions);
83 }
84 
85 static abi_ptr shm_region_find(abi_ptr start)
86 {
87     IntervalTreeNode *i;
88 
89     for (i = interval_tree_iter_first(&shm_regions, start, start); i;
90          i = interval_tree_iter_next(i, start, start)) {
91         if (i->start == start) {
92             return i->last;
93         }
94     }
95     return 0;
96 }
97 
98 static void shm_region_rm_complete(abi_ptr start, abi_ptr last)
99 {
100     IntervalTreeNode *i, *n;
101 
102     for (i = interval_tree_iter_first(&shm_regions, start, last); i; i = n) {
103         n = interval_tree_iter_next(i, start, last);
104         if (i->start >= start && i->last <= last) {
105             interval_tree_remove(i, &shm_regions);
106             g_free(i);
107         }
108     }
109 }
110 
111 /*
112  * Validate target prot bitmask.
113  * Return the prot bitmask for the host in *HOST_PROT.
114  * Return 0 if the target prot bitmask is invalid, otherwise
115  * the internal qemu page_flags (which will include PAGE_VALID).
116  */
117 static int validate_prot_to_pageflags(int prot)
118 {
119     int valid = PROT_READ | PROT_WRITE | PROT_EXEC | TARGET_PROT_SEM;
120     int page_flags = (prot & PAGE_BITS) | PAGE_VALID;
121 
122 #ifdef TARGET_AARCH64
123     {
124         ARMCPU *cpu = ARM_CPU(thread_cpu);
125 
126         /*
127          * The PROT_BTI bit is only accepted if the cpu supports the feature.
128          * Since this is the unusual case, don't bother checking unless
129          * the bit has been requested.  If set and valid, record the bit
130          * within QEMU's page_flags.
131          */
132         if ((prot & TARGET_PROT_BTI) && cpu_isar_feature(aa64_bti, cpu)) {
133             valid |= TARGET_PROT_BTI;
134             page_flags |= PAGE_BTI;
135         }
136         /* Similarly for the PROT_MTE bit. */
137         if ((prot & TARGET_PROT_MTE) && cpu_isar_feature(aa64_mte, cpu)) {
138             valid |= TARGET_PROT_MTE;
139             page_flags |= PAGE_MTE;
140         }
141     }
142 #elif defined(TARGET_HPPA)
143     valid |= PROT_GROWSDOWN | PROT_GROWSUP;
144 #endif
145 
146     return prot & ~valid ? 0 : page_flags;
147 }
148 
/*
 * Reduce a guest prot value to what the host mapping needs.
 *
 * Only read/write/exec matter to the host.  PROT_SEM is accepted by all
 * hosts but also ignored by them, so it is not forwarded.  Any other
 * target-specific prot bit is unknown to the host and has to live in
 * page_flags for qemu emulation instead.
 *
 * Guest-executable pages are never executed by the host, but the host
 * must be able to read them, so PROT_EXEC becomes PROT_READ.
 */
static int target_to_host_prot(int prot)
{
    int host_prot = prot & (PROT_READ | PROT_WRITE);

    if (prot & PROT_EXEC) {
        host_prot |= PROT_READ;
    }
    return host_prot;
}
164 
165 /* NOTE: all the constants are the HOST ones, but addresses are target. */
166 int target_mprotect(abi_ulong start, abi_ulong len, int target_prot)
167 {
168     abi_ulong starts[3];
169     abi_ulong lens[3];
170     int prots[3];
171     abi_ulong host_start, host_last, last;
172     int prot1, ret, page_flags, nranges;
173 
174     trace_target_mprotect(start, len, target_prot);
175 
176     if ((start & ~TARGET_PAGE_MASK) != 0) {
177         return -TARGET_EINVAL;
178     }
179     page_flags = validate_prot_to_pageflags(target_prot);
180     if (!page_flags) {
181         return -TARGET_EINVAL;
182     }
183     if (len == 0) {
184         return 0;
185     }
186     len = TARGET_PAGE_ALIGN(len);
187     if (!guest_range_valid_untagged(start, len)) {
188         return -TARGET_ENOMEM;
189     }
190 
191     last = start + len - 1;
192     host_start = start & qemu_host_page_mask;
193     host_last = HOST_PAGE_ALIGN(last) - 1;
194     nranges = 0;
195 
196     mmap_lock();
197 
198     if (host_last - host_start < qemu_host_page_size) {
199         /* Single host page contains all guest pages: sum the prot. */
200         prot1 = target_prot;
201         for (abi_ulong a = host_start; a < start; a += TARGET_PAGE_SIZE) {
202             prot1 |= page_get_flags(a);
203         }
204         for (abi_ulong a = last; a < host_last; a += TARGET_PAGE_SIZE) {
205             prot1 |= page_get_flags(a + 1);
206         }
207         starts[nranges] = host_start;
208         lens[nranges] = qemu_host_page_size;
209         prots[nranges] = prot1;
210         nranges++;
211     } else {
212         if (host_start < start) {
213             /* Host page contains more than one guest page: sum the prot. */
214             prot1 = target_prot;
215             for (abi_ulong a = host_start; a < start; a += TARGET_PAGE_SIZE) {
216                 prot1 |= page_get_flags(a);
217             }
218             /* If the resulting sum differs, create a new range. */
219             if (prot1 != target_prot) {
220                 starts[nranges] = host_start;
221                 lens[nranges] = qemu_host_page_size;
222                 prots[nranges] = prot1;
223                 nranges++;
224                 host_start += qemu_host_page_size;
225             }
226         }
227 
228         if (last < host_last) {
229             /* Host page contains more than one guest page: sum the prot. */
230             prot1 = target_prot;
231             for (abi_ulong a = last; a < host_last; a += TARGET_PAGE_SIZE) {
232                 prot1 |= page_get_flags(a + 1);
233             }
234             /* If the resulting sum differs, create a new range. */
235             if (prot1 != target_prot) {
236                 host_last -= qemu_host_page_size;
237                 starts[nranges] = host_last + 1;
238                 lens[nranges] = qemu_host_page_size;
239                 prots[nranges] = prot1;
240                 nranges++;
241             }
242         }
243 
244         /* Create a range for the middle, if any remains. */
245         if (host_start < host_last) {
246             starts[nranges] = host_start;
247             lens[nranges] = host_last - host_start + 1;
248             prots[nranges] = target_prot;
249             nranges++;
250         }
251     }
252 
253     for (int i = 0; i < nranges; ++i) {
254         ret = mprotect(g2h_untagged(starts[i]), lens[i],
255                        target_to_host_prot(prots[i]));
256         if (ret != 0) {
257             goto error;
258         }
259     }
260 
261     page_set_flags(start, last, page_flags);
262     ret = 0;
263 
264  error:
265     mmap_unlock();
266     return ret;
267 }
268 
269 /* map an incomplete host page */
270 static bool mmap_frag(abi_ulong real_start, abi_ulong start, abi_ulong last,
271                       int prot, int flags, int fd, off_t offset)
272 {
273     abi_ulong real_last;
274     void *host_start;
275     int prot_old, prot_new;
276     int host_prot_old, host_prot_new;
277 
278     if (!(flags & MAP_ANONYMOUS)
279         && (flags & MAP_TYPE) == MAP_SHARED
280         && (prot & PROT_WRITE)) {
281         /*
282          * msync() won't work with the partial page, so we return an
283          * error if write is possible while it is a shared mapping.
284          */
285         errno = EINVAL;
286         return false;
287     }
288 
289     real_last = real_start + qemu_host_page_size - 1;
290     host_start = g2h_untagged(real_start);
291 
292     /* Get the protection of the target pages outside the mapping. */
293     prot_old = 0;
294     for (abi_ulong a = real_start; a < start; a += TARGET_PAGE_SIZE) {
295         prot_old |= page_get_flags(a);
296     }
297     for (abi_ulong a = real_last; a > last; a -= TARGET_PAGE_SIZE) {
298         prot_old |= page_get_flags(a);
299     }
300 
301     if (prot_old == 0) {
302         /*
303          * Since !(prot_old & PAGE_VALID), there were no guest pages
304          * outside of the fragment we need to map.  Allocate a new host
305          * page to cover, discarding whatever else may have been present.
306          */
307         void *p = mmap(host_start, qemu_host_page_size,
308                        target_to_host_prot(prot),
309                        flags | MAP_ANONYMOUS, -1, 0);
310         if (p != host_start) {
311             if (p != MAP_FAILED) {
312                 munmap(p, qemu_host_page_size);
313                 errno = EEXIST;
314             }
315             return false;
316         }
317         prot_old = prot;
318     }
319     prot_new = prot | prot_old;
320 
321     host_prot_old = target_to_host_prot(prot_old);
322     host_prot_new = target_to_host_prot(prot_new);
323 
324     /* Adjust protection to be able to write. */
325     if (!(host_prot_old & PROT_WRITE)) {
326         host_prot_old |= PROT_WRITE;
327         mprotect(host_start, qemu_host_page_size, host_prot_old);
328     }
329 
330     /* Read or zero the new guest pages. */
331     if (flags & MAP_ANONYMOUS) {
332         memset(g2h_untagged(start), 0, last - start + 1);
333     } else {
334         if (pread(fd, g2h_untagged(start), last - start + 1, offset) == -1) {
335             return false;
336         }
337     }
338 
339     /* Put final protection */
340     if (host_prot_new != host_prot_old) {
341         mprotect(host_start, qemu_host_page_size, host_prot_new);
342     }
343     return true;
344 }
345 
/*
 * Guest address-space layout values.
 *
 * mmap_next_start is the search start mmap_find_vma() uses when called
 * with start == 0; it is moved past a successful allocation only when the
 * search began there and the result is at or above task_unmapped_base.
 * elf_et_dyn_base is not referenced by the code visible here; presumably
 * it is consumed by the ELF loader -- TODO confirm.
 */
abi_ulong task_unmapped_base;
abi_ulong elf_et_dyn_base;
abi_ulong mmap_next_start;
349 
350 /*
351  * Subroutine of mmap_find_vma, used when we have pre-allocated
352  * a chunk of guest address space.
353  */
354 static abi_ulong mmap_find_vma_reserved(abi_ulong start, abi_ulong size,
355                                         abi_ulong align)
356 {
357     target_ulong ret;
358 
359     ret = page_find_range_empty(start, reserved_va, size, align);
360     if (ret == -1 && start > mmap_min_addr) {
361         /* Restart at the beginning of the address space. */
362         ret = page_find_range_empty(mmap_min_addr, start - 1, size, align);
363     }
364 
365     return ret;
366 }
367 
/*
 * Find and reserve a free memory area of size 'size'. The search
 * starts at 'start'.
 * It must be called with mmap_lock() held.
 * Return -1 if error.
 *
 * 'start' == 0 selects mmap_next_start as the starting point.  'align' is
 * the required alignment of the result and is raised to at least
 * qemu_host_page_size; 'size' is rounded up to a host page multiple.
 *
 * With reserved_va set, the search is delegated to
 * mmap_find_vma_reserved().  Otherwise the area is probed with a host
 * PROT_NONE mapping, and on success that mapping is left in place for the
 * caller to replace.
 */
abi_ulong mmap_find_vma(abi_ulong start, abi_ulong size, abi_ulong align)
{
    void *ptr, *prev;
    abi_ulong addr;
    int wrapped, repeat;

    align = MAX(align, qemu_host_page_size);

    /* If 'start' == 0, then a default start address is used. */
    if (start == 0) {
        start = mmap_next_start;
    } else {
        start &= qemu_host_page_mask;
    }
    start = ROUND_UP(start, align);

    size = HOST_PAGE_ALIGN(size);

    if (reserved_va) {
        return mmap_find_vma_reserved(start, size, align);
    }

    addr = start;
    /*
     * wrapped: set once the search has restarted from low memory.
     * repeat:  how many times in a row the host returned the same address.
     */
    wrapped = repeat = 0;
    prev = 0;

    for (;; prev = ptr) {
        /*
         * Reserve needed memory area to avoid a race.
         * It should be discarded using:
         *  - mmap() with MAP_FIXED flag
         *  - mremap() with MREMAP_FIXED flag
         *  - shmat() with SHM_REMAP flag
         */
        ptr = mmap(g2h_untagged(addr), size, PROT_NONE,
                   MAP_ANONYMOUS | MAP_PRIVATE | MAP_NORESERVE, -1, 0);

        /* ENOMEM, if host address space has no memory */
        if (ptr == MAP_FAILED) {
            return (abi_ulong)-1;
        }

        /*
         * Count the number of sequential returns of the same address.
         * This is used to modify the search algorithm below.
         */
        repeat = (ptr == prev ? repeat + 1 : 0);

        /* Only usable if the whole block maps back to a guest address. */
        if (h2g_valid(ptr + size - 1)) {
            addr = h2g(ptr);

            if ((addr & (align - 1)) == 0) {
                /* Success.  */
                if (start == mmap_next_start && addr >= task_unmapped_base) {
                    mmap_next_start = addr + size;
                }
                return addr;
            }

            /* The address is not properly aligned for the target.  */
            switch (repeat) {
            case 0:
                /*
                 * Assume the result that the kernel gave us is the
                 * first with enough free space, so start again at the
                 * next higher target page.
                 */
                addr = ROUND_UP(addr, align);
                break;
            case 1:
                /*
                 * Sometimes the kernel decides to perform the allocation
                 * at the top end of memory instead.
                 */
                addr &= -align;
                break;
            case 2:
                /* Start over at low memory.  */
                addr = 0;
                break;
            default:
                /* Fail.  This unaligned block must be the last.  */
                addr = -1;
                break;
            }
        } else {
            /*
             * Since the result the kernel gave didn't fit, start
             * again at low memory.  If any repetition, fail.
             */
            addr = (repeat ? -1 : 0);
        }

        /* Unmap and try again.  */
        munmap(ptr, size);

        /* ENOMEM if we checked the whole of the target address space.  */
        if (addr == (abi_ulong)-1) {
            return (abi_ulong)-1;
        } else if (addr == 0) {
            if (wrapped) {
                return (abi_ulong)-1;
            }
            wrapped = 1;
            /*
             * Don't actually use 0 when wrapping, instead indicate
             * that we'd truly like an allocation in low memory.
             */
            addr = (mmap_min_addr > TARGET_PAGE_SIZE
                     ? TARGET_PAGE_ALIGN(mmap_min_addr)
                     : TARGET_PAGE_SIZE);
        } else if (wrapped && addr >= start) {
            /* Already wrapped once and back at the original start: give up. */
            return (abi_ulong)-1;
        }
    }
}
490 
/*
 * Implement mmap() for the guest.
 *
 * start/len/offset are guest values; target_prot is validated with
 * validate_prot_to_pageflags().  The whole operation runs under
 * mmap_lock().
 *
 * On success returns the guest address of the mapping, after recording
 * the new page flags.  On failure returns -1 with errno set.
 *
 * NOTE: all the constants are the HOST ones
 */
abi_long target_mmap(abi_ulong start, abi_ulong len, int target_prot,
                     int flags, int fd, off_t offset)
{
    abi_ulong ret, last, real_start, real_last, retaddr, host_len;
    /*
     * Sub-range that was mapped by a direct host mmap() call and is tagged
     * PAGE_PASSTHROUGH below.  start > last (the initial state) means
     * there is no such range.
     */
    abi_ulong passthrough_start = -1, passthrough_last = 0;
    int page_flags;
    off_t host_offset;

    mmap_lock();
    trace_target_mmap(start, len, target_prot, flags, fd, offset);

    if (!len) {
        errno = EINVAL;
        goto fail;
    }

    page_flags = validate_prot_to_pageflags(target_prot);
    if (!page_flags) {
        errno = EINVAL;
        goto fail;
    }

    /* Also check for overflows... */
    len = TARGET_PAGE_ALIGN(len);
    if (!len) {
        errno = ENOMEM;
        goto fail;
    }

    if (offset & ~TARGET_PAGE_MASK) {
        errno = EINVAL;
        goto fail;
    }

    /*
     * If we're mapping shared memory, ensure we generate code for parallel
     * execution and flush old translations.  This will work up to the level
     * supported by the host -- anything that requires EXCP_ATOMIC will not
     * be atomic with respect to an external process.
     */
    if (flags & MAP_SHARED) {
        CPUState *cpu = thread_cpu;
        if (!(cpu->tcg_cflags & CF_PARALLEL)) {
            cpu->tcg_cflags |= CF_PARALLEL;
            tb_flush(cpu);
        }
    }

    /* Round the address and file offset down to host page boundaries. */
    real_start = start & qemu_host_page_mask;
    host_offset = offset & qemu_host_page_mask;

    /*
     * If the user is asking for the kernel to find a location, do that
     * before we truncate the length for mapping files below.
     */
    if (!(flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))) {
        host_len = len + offset - host_offset;
        host_len = HOST_PAGE_ALIGN(host_len);
        start = mmap_find_vma(real_start, host_len, TARGET_PAGE_SIZE);
        if (start == (abi_ulong)-1) {
            errno = ENOMEM;
            goto fail;
        }
    }

    /*
     * When mapping files into a memory area larger than the file, accesses
     * to pages beyond the file size will cause a SIGBUS.
     *
     * For example, if mmaping a file of 100 bytes on a host with 4K pages
     * emulating a target with 8K pages, the target expects to be able to
     * access the first 8K. But the host will trap us on any access beyond
     * 4K.
     *
     * When emulating a target with a larger page-size than the hosts, we
     * may need to truncate file maps at EOF and add extra anonymous pages
     * up to the targets page boundary.
     */
    if ((qemu_real_host_page_size() < qemu_host_page_size) &&
        !(flags & MAP_ANONYMOUS)) {
        struct stat sb;

        if (fstat(fd, &sb) == -1) {
            goto fail;
        }

        /* Are we trying to create a map beyond EOF?.  */
        if (offset + len > sb.st_size) {
            /*
             * If so, truncate the file map at eof aligned with
             * the hosts real pagesize. Additional anonymous maps
             * will be created beyond EOF.
             */
            len = REAL_HOST_PAGE_ALIGN(sb.st_size - offset);
        }
    }

    if (!(flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))) {
        /* No fixed address requested: use the area found above. */
        uintptr_t host_start;
        int host_prot;
        void *p;

        host_len = len + offset - host_offset;
        host_len = HOST_PAGE_ALIGN(host_len);
        host_prot = target_to_host_prot(target_prot);

        /*
         * Note: we prefer to control the mapping address. It is
         * especially important if qemu_host_page_size >
         * qemu_real_host_page_size.
         */
        p = mmap(g2h_untagged(start), host_len, host_prot,
                 flags | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
            goto fail;
        }
        /* update start so that it points to the file position at 'offset' */
        host_start = (uintptr_t)p;
        if (!(flags & MAP_ANONYMOUS)) {
            /* Overlay the file on top of the anonymous mapping. */
            p = mmap(g2h_untagged(start), len, host_prot,
                     flags | MAP_FIXED, fd, host_offset);
            if (p == MAP_FAILED) {
                munmap(g2h_untagged(start), host_len);
                goto fail;
            }
            host_start += offset - host_offset;
        }
        start = h2g(host_start);
        last = start + len - 1;
        passthrough_start = start;
        passthrough_last = last;
    } else {
        /* MAP_FIXED or MAP_FIXED_NOREPLACE: the guest chose the address. */
        if (start & ~TARGET_PAGE_MASK) {
            errno = EINVAL;
            goto fail;
        }
        last = start + len - 1;
        real_last = HOST_PAGE_ALIGN(last) - 1;

        /*
         * Test if requested memory area fits target address space
         * It can fail only on 64-bit host with 32-bit target.
         * On any other target/host host mmap() handles this error correctly.
         */
        if (last < start || !guest_range_valid_untagged(start, len)) {
            errno = ENOMEM;
            goto fail;
        }

        if (flags & MAP_FIXED_NOREPLACE) {
            /* Validate that the chosen range is empty. */
            if (!page_check_range_empty(start, last)) {
                errno = EEXIST;
                goto fail;
            }

            /*
             * With reserved_va, the entire address space is mmaped in the
             * host to ensure it isn't accidentally used for something else.
             * We have just checked that the guest address is not mapped
             * within the guest, but need to replace the host reservation.
             *
             * Without reserved_va, despite the guest address check above,
             * keep MAP_FIXED_NOREPLACE so that the guest does not overwrite
             * any host address mappings.
             */
            if (reserved_va) {
                flags = (flags & ~MAP_FIXED_NOREPLACE) | MAP_FIXED;
            }
        }

        /*
         * worst case: we cannot map the file because the offset is not
         * aligned, so we read it
         */
        if (!(flags & MAP_ANONYMOUS) &&
            (offset & ~qemu_host_page_mask) != (start & ~qemu_host_page_mask)) {
            /*
             * msync() won't work here, so we return an error if write is
             * possible while it is a shared mapping
             */
            if ((flags & MAP_TYPE) == MAP_SHARED
                && (target_prot & PROT_WRITE)) {
                errno = EINVAL;
                goto fail;
            }
            /*
             * Create writable anonymous memory at the same place (the
             * recursive call nests inside the mmap lock already held),
             * copy the file contents in, then drop write permission again
             * if the guest did not ask for it.
             */
            retaddr = target_mmap(start, len, target_prot | PROT_WRITE,
                                  (flags & (MAP_FIXED | MAP_FIXED_NOREPLACE))
                                  | MAP_PRIVATE | MAP_ANONYMOUS,
                                  -1, 0);
            if (retaddr == -1) {
                goto fail;
            }
            if (pread(fd, g2h_untagged(start), len, offset) == -1) {
                goto fail;
            }
            if (!(target_prot & PROT_WRITE)) {
                ret = target_mprotect(start, len, target_prot);
                assert(ret == 0);
            }
            goto the_end;
        }

        /* handle the start of the mapping */
        if (start > real_start) {
            if (real_last == real_start + qemu_host_page_size - 1) {
                /* one single host page */
                if (!mmap_frag(real_start, start, last,
                               target_prot, flags, fd, offset)) {
                    goto fail;
                }
                goto the_end1;
            }
            if (!mmap_frag(real_start, start,
                           real_start + qemu_host_page_size - 1,
                           target_prot, flags, fd, offset)) {
                goto fail;
            }
            real_start += qemu_host_page_size;
        }
        /* handle the end of the mapping */
        if (last < real_last) {
            abi_ulong real_page = real_last - qemu_host_page_size + 1;
            if (!mmap_frag(real_page, real_page, last,
                           target_prot, flags, fd,
                           offset + real_page - start)) {
                goto fail;
            }
            real_last -= qemu_host_page_size;
        }

        /* map the middle (easier) */
        if (real_start < real_last) {
            void *p, *want_p;
            off_t offset1;
            size_t len1;

            if (flags & MAP_ANONYMOUS) {
                offset1 = 0;
            } else {
                offset1 = offset + real_start - start;
            }
            len1 = real_last - real_start + 1;
            want_p = g2h_untagged(real_start);

            p = mmap(want_p, len1, target_to_host_prot(target_prot),
                     flags, fd, offset1);
            if (p != want_p) {
                /* Mapped, but not where requested: undo and report EEXIST. */
                if (p != MAP_FAILED) {
                    munmap(p, len1);
                    errno = EEXIST;
                }
                goto fail;
            }
            passthrough_start = real_start;
            passthrough_last = real_last;
        }
    }
 the_end1:
    /* Record the guest page flags for the new mapping. */
    if (flags & MAP_ANONYMOUS) {
        page_flags |= PAGE_ANON;
    }
    page_flags |= PAGE_RESET;
    if (passthrough_start > passthrough_last) {
        /* No directly mapped part: plain flags for the whole range. */
        page_set_flags(start, last, page_flags);
    } else {
        /* Only the directly mapped part is tagged PAGE_PASSTHROUGH. */
        if (start < passthrough_start) {
            page_set_flags(start, passthrough_start - 1, page_flags);
        }
        page_set_flags(passthrough_start, passthrough_last,
                       page_flags | PAGE_PASSTHROUGH);
        if (passthrough_last < last) {
            page_set_flags(passthrough_last + 1, last, page_flags);
        }
    }
    shm_region_rm_complete(start, last);
 the_end:
    /*
     * The read-into-anonymous-memory path jumps straight here: its page
     * flags were already set by the recursive target_mmap() call (and
     * target_mprotect(), if used).
     */
    trace_target_mmap_complete(start);
    if (qemu_loglevel_mask(CPU_LOG_PAGE)) {
        FILE *f = qemu_log_trylock();
        if (f) {
            fprintf(f, "page layout changed following mmap\n");
            page_dump(f);
            qemu_log_unlock(f);
        }
    }
    mmap_unlock();
    return start;
fail:
    mmap_unlock();
    return -1;
}
784 
785 static int mmap_reserve_or_unmap(abi_ulong start, abi_ulong len)
786 {
787     abi_ulong real_start;
788     abi_ulong real_last;
789     abi_ulong real_len;
790     abi_ulong last;
791     abi_ulong a;
792     void *host_start;
793     int prot;
794 
795     last = start + len - 1;
796     real_start = start & qemu_host_page_mask;
797     real_last = HOST_PAGE_ALIGN(last) - 1;
798 
799     /*
800      * If guest pages remain on the first or last host pages,
801      * adjust the deallocation to retain those guest pages.
802      * The single page special case is required for the last page,
803      * lest real_start overflow to zero.
804      */
805     if (real_last - real_start < qemu_host_page_size) {
806         prot = 0;
807         for (a = real_start; a < start; a += TARGET_PAGE_SIZE) {
808             prot |= page_get_flags(a);
809         }
810         for (a = last; a < real_last; a += TARGET_PAGE_SIZE) {
811             prot |= page_get_flags(a + 1);
812         }
813         if (prot != 0) {
814             return 0;
815         }
816     } else {
817         for (prot = 0, a = real_start; a < start; a += TARGET_PAGE_SIZE) {
818             prot |= page_get_flags(a);
819         }
820         if (prot != 0) {
821             real_start += qemu_host_page_size;
822         }
823 
824         for (prot = 0, a = last; a < real_last; a += TARGET_PAGE_SIZE) {
825             prot |= page_get_flags(a + 1);
826         }
827         if (prot != 0) {
828             real_last -= qemu_host_page_size;
829         }
830 
831         if (real_last < real_start) {
832             return 0;
833         }
834     }
835 
836     real_len = real_last - real_start + 1;
837     host_start = g2h_untagged(real_start);
838 
839     if (reserved_va) {
840         void *ptr = mmap(host_start, real_len, PROT_NONE,
841                          MAP_FIXED | MAP_ANONYMOUS
842                          | MAP_PRIVATE | MAP_NORESERVE, -1, 0);
843         return ptr == host_start ? 0 : -1;
844     }
845     return munmap(host_start, real_len);
846 }
847 
848 int target_munmap(abi_ulong start, abi_ulong len)
849 {
850     int ret;
851 
852     trace_target_munmap(start, len);
853 
854     if (start & ~TARGET_PAGE_MASK) {
855         errno = EINVAL;
856         return -1;
857     }
858     len = TARGET_PAGE_ALIGN(len);
859     if (len == 0 || !guest_range_valid_untagged(start, len)) {
860         errno = EINVAL;
861         return -1;
862     }
863 
864     mmap_lock();
865     ret = mmap_reserve_or_unmap(start, len);
866     if (likely(ret == 0)) {
867         page_set_flags(start, start + len - 1, 0);
868         shm_region_rm_complete(start, start + len - 1);
869     }
870     mmap_unlock();
871 
872     return ret;
873 }
874 
875 abi_long target_mremap(abi_ulong old_addr, abi_ulong old_size,
876                        abi_ulong new_size, unsigned long flags,
877                        abi_ulong new_addr)
878 {
879     int prot;
880     void *host_addr;
881 
882     if (!guest_range_valid_untagged(old_addr, old_size) ||
883         ((flags & MREMAP_FIXED) &&
884          !guest_range_valid_untagged(new_addr, new_size)) ||
885         ((flags & MREMAP_MAYMOVE) == 0 &&
886          !guest_range_valid_untagged(old_addr, new_size))) {
887         errno = ENOMEM;
888         return -1;
889     }
890 
891     mmap_lock();
892 
893     if (flags & MREMAP_FIXED) {
894         host_addr = mremap(g2h_untagged(old_addr), old_size, new_size,
895                            flags, g2h_untagged(new_addr));
896 
897         if (reserved_va && host_addr != MAP_FAILED) {
898             /*
899              * If new and old addresses overlap then the above mremap will
900              * already have failed with EINVAL.
901              */
902             mmap_reserve_or_unmap(old_addr, old_size);
903         }
904     } else if (flags & MREMAP_MAYMOVE) {
905         abi_ulong mmap_start;
906 
907         mmap_start = mmap_find_vma(0, new_size, TARGET_PAGE_SIZE);
908 
909         if (mmap_start == -1) {
910             errno = ENOMEM;
911             host_addr = MAP_FAILED;
912         } else {
913             host_addr = mremap(g2h_untagged(old_addr), old_size, new_size,
914                                flags | MREMAP_FIXED,
915                                g2h_untagged(mmap_start));
916             if (reserved_va) {
917                 mmap_reserve_or_unmap(old_addr, old_size);
918             }
919         }
920     } else {
921         int page_flags = 0;
922         if (reserved_va && old_size < new_size) {
923             abi_ulong addr;
924             for (addr = old_addr + old_size;
925                  addr < old_addr + new_size;
926                  addr++) {
927                 page_flags |= page_get_flags(addr);
928             }
929         }
930         if (page_flags == 0) {
931             host_addr = mremap(g2h_untagged(old_addr),
932                                old_size, new_size, flags);
933 
934             if (host_addr != MAP_FAILED) {
935                 /* Check if address fits target address space */
936                 if (!guest_range_valid_untagged(h2g(host_addr), new_size)) {
937                     /* Revert mremap() changes */
938                     host_addr = mremap(g2h_untagged(old_addr),
939                                        new_size, old_size, flags);
940                     errno = ENOMEM;
941                     host_addr = MAP_FAILED;
942                 } else if (reserved_va && old_size > new_size) {
943                     mmap_reserve_or_unmap(old_addr + old_size,
944                                           old_size - new_size);
945                 }
946             }
947         } else {
948             errno = ENOMEM;
949             host_addr = MAP_FAILED;
950         }
951     }
952 
953     if (host_addr == MAP_FAILED) {
954         new_addr = -1;
955     } else {
956         new_addr = h2g(host_addr);
957         prot = page_get_flags(old_addr);
958         page_set_flags(old_addr, old_addr + old_size - 1, 0);
959         shm_region_rm_complete(old_addr, old_addr + old_size - 1);
960         page_set_flags(new_addr, new_addr + new_size - 1,
961                        prot | PAGE_VALID | PAGE_RESET);
962         shm_region_rm_complete(new_addr, new_addr + new_size - 1);
963     }
964     mmap_unlock();
965     return new_addr;
966 }
967 
968 abi_long target_madvise(abi_ulong start, abi_ulong len_in, int advice)
969 {
970     abi_ulong len;
971     int ret = 0;
972 
973     if (start & ~TARGET_PAGE_MASK) {
974         return -TARGET_EINVAL;
975     }
976     if (len_in == 0) {
977         return 0;
978     }
979     len = TARGET_PAGE_ALIGN(len_in);
980     if (len == 0 || !guest_range_valid_untagged(start, len)) {
981         return -TARGET_EINVAL;
982     }
983 
984     /* Translate for some architectures which have different MADV_xxx values */
985     switch (advice) {
986     case TARGET_MADV_DONTNEED:      /* alpha */
987         advice = MADV_DONTNEED;
988         break;
989     case TARGET_MADV_WIPEONFORK:    /* parisc */
990         advice = MADV_WIPEONFORK;
991         break;
992     case TARGET_MADV_KEEPONFORK:    /* parisc */
993         advice = MADV_KEEPONFORK;
994         break;
995     /* we do not care about the other MADV_xxx values yet */
996     }
997 
998     /*
999      * Most advice values are hints, so ignoring and returning success is ok.
1000      *
1001      * However, some advice values such as MADV_DONTNEED, MADV_WIPEONFORK and
1002      * MADV_KEEPONFORK are not hints and need to be emulated.
1003      *
1004      * A straight passthrough for those may not be safe because qemu sometimes
1005      * turns private file-backed mappings into anonymous mappings.
1006      * If all guest pages have PAGE_PASSTHROUGH set, mappings have the
1007      * same semantics for the host as for the guest.
1008      *
1009      * We pass through MADV_WIPEONFORK and MADV_KEEPONFORK if possible and
1010      * return failure if not.
1011      *
1012      * MADV_DONTNEED is passed through as well, if possible.
1013      * If passthrough isn't possible, we nevertheless (wrongly!) return
1014      * success, which is broken but some userspace programs fail to work
1015      * otherwise. Completely implementing such emulation is quite complicated
1016      * though.
1017      */
1018     mmap_lock();
1019     switch (advice) {
1020     case MADV_WIPEONFORK:
1021     case MADV_KEEPONFORK:
1022         ret = -EINVAL;
1023         /* fall through */
1024     case MADV_DONTNEED:
1025         if (page_check_range(start, len, PAGE_PASSTHROUGH)) {
1026             ret = get_errno(madvise(g2h_untagged(start), len, advice));
1027             if ((advice == MADV_DONTNEED) && (ret == 0)) {
1028                 page_reset_target_data(start, start + len - 1);
1029             }
1030         }
1031     }
1032     mmap_unlock();
1033 
1034     return ret;
1035 }
1036 
1037 #ifndef TARGET_FORCE_SHMLBA
1038 /*
1039  * For most architectures, SHMLBA is the same as the page size;
1040  * some architectures have larger values, in which case they should
1041  * define TARGET_FORCE_SHMLBA and provide a target_shmlba() function.
1042  * This corresponds to the kernel arch code defining __ARCH_FORCE_SHMLBA
1043  * and defining its own value for SHMLBA.
1044  *
1045  * The kernel also permits SHMLBA to be set by the architecture to a
1046  * value larger than the page size without setting __ARCH_FORCE_SHMLBA;
1047  * this means that addresses are rounded to the large size if
1048  * SHM_RND is set but addresses not aligned to that size are not rejected
1049  * as long as they are at least page-aligned. Since the only architecture
1050  * which uses this is ia64 this code doesn't provide for that oddity.
1051  */
1052 static inline abi_ulong target_shmlba(CPUArchState *cpu_env)
1053 {
1054     return TARGET_PAGE_SIZE;
1055 }
1056 #endif
1057 
1058 abi_ulong target_shmat(CPUArchState *cpu_env, int shmid,
1059                        abi_ulong shmaddr, int shmflg)
1060 {
1061     CPUState *cpu = env_cpu(cpu_env);
1062     abi_ulong raddr;
1063     struct shmid_ds shm_info;
1064     int ret;
1065     abi_ulong shmlba;
1066 
1067     /* shmat pointers are always untagged */
1068 
1069     /* find out the length of the shared memory segment */
1070     ret = get_errno(shmctl(shmid, IPC_STAT, &shm_info));
1071     if (is_error(ret)) {
1072         /* can't get length, bail out */
1073         return ret;
1074     }
1075 
1076     shmlba = target_shmlba(cpu_env);
1077 
1078     if (shmaddr & (shmlba - 1)) {
1079         if (shmflg & SHM_RND) {
1080             shmaddr &= ~(shmlba - 1);
1081         } else {
1082             return -TARGET_EINVAL;
1083         }
1084     }
1085     if (!guest_range_valid_untagged(shmaddr, shm_info.shm_segsz)) {
1086         return -TARGET_EINVAL;
1087     }
1088 
1089     WITH_MMAP_LOCK_GUARD() {
1090         void *host_raddr;
1091         abi_ulong last;
1092 
1093         if (shmaddr) {
1094             host_raddr = shmat(shmid, (void *)g2h_untagged(shmaddr), shmflg);
1095         } else {
1096             abi_ulong mmap_start;
1097 
1098             /* In order to use the host shmat, we need to honor host SHMLBA.  */
1099             mmap_start = mmap_find_vma(0, shm_info.shm_segsz,
1100                                        MAX(SHMLBA, shmlba));
1101 
1102             if (mmap_start == -1) {
1103                 return -TARGET_ENOMEM;
1104             }
1105             host_raddr = shmat(shmid, g2h_untagged(mmap_start),
1106                                shmflg | SHM_REMAP);
1107         }
1108 
1109         if (host_raddr == (void *)-1) {
1110             return get_errno(-1);
1111         }
1112         raddr = h2g(host_raddr);
1113         last = raddr + shm_info.shm_segsz - 1;
1114 
1115         page_set_flags(raddr, last,
1116                        PAGE_VALID | PAGE_RESET | PAGE_READ |
1117                        (shmflg & SHM_RDONLY ? 0 : PAGE_WRITE));
1118 
1119         shm_region_rm_complete(raddr, last);
1120         shm_region_add(raddr, last);
1121     }
1122 
1123     /*
1124      * We're mapping shared memory, so ensure we generate code for parallel
1125      * execution and flush old translations.  This will work up to the level
1126      * supported by the host -- anything that requires EXCP_ATOMIC will not
1127      * be atomic with respect to an external process.
1128      */
1129     if (!(cpu->tcg_cflags & CF_PARALLEL)) {
1130         cpu->tcg_cflags |= CF_PARALLEL;
1131         tb_flush(cpu);
1132     }
1133 
1134     return raddr;
1135 }
1136 
1137 abi_long target_shmdt(abi_ulong shmaddr)
1138 {
1139     abi_long rv;
1140 
1141     /* shmdt pointers are always untagged */
1142 
1143     WITH_MMAP_LOCK_GUARD() {
1144         abi_ulong last = shm_region_find(shmaddr);
1145         if (last == 0) {
1146             return -TARGET_EINVAL;
1147         }
1148 
1149         rv = get_errno(shmdt(g2h_untagged(shmaddr)));
1150         if (rv == 0) {
1151             abi_ulong size = last - shmaddr + 1;
1152 
1153             page_set_flags(shmaddr, last, 0);
1154             shm_region_rm_complete(shmaddr, last);
1155             mmap_reserve_or_unmap(shmaddr, size);
1156         }
1157     }
1158     return rv;
1159 }
1160