xref: /openbmc/qemu/util/oslib-posix.c (revision bd6aa0d1e59d71218c3eee055bc8d222c6e1a628)
1 /*
2  * os-posix-lib.c
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2010 Red Hat, Inc.
6  *
7  * QEMU library functions on POSIX which are shared between QEMU and
8  * the QEMU tools.
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include <termios.h>
31 
32 #include <glib/gprintf.h>
33 
34 #include "system/system.h"
35 #include "trace.h"
36 #include "qapi/error.h"
37 #include "qemu/error-report.h"
38 #include "qemu/madvise.h"
39 #include "qemu/sockets.h"
40 #include "qemu/thread.h"
41 #include <libgen.h>
42 #include "qemu/cutils.h"
43 #include "qemu/units.h"
44 #include "qemu/thread-context.h"
45 #include "qemu/main-loop.h"
46 
47 #ifdef CONFIG_LINUX
48 #include <sys/syscall.h>
49 #endif
50 
51 #ifdef __FreeBSD__
52 #include <sys/thr.h>
53 #include <sys/user.h>
54 #include <libutil.h>
55 #endif
56 
57 #ifdef __NetBSD__
58 #include <lwp.h>
59 #endif
60 
61 #include "qemu/memalign.h"
62 #include "qemu/mmap-alloc.h"
63 
/* Upper bound on the number of preallocation worker threads. */
64 #define MAX_MEM_PREALLOC_THREAD_COUNT 16
65 
66 struct MemsetThread;
67 
/*
 * Contexts queued by asynchronous preallocation requests; they are drained
 * by qemu_finish_async_prealloc_mem().
 */
68 static QLIST_HEAD(, MemsetContext) memset_contexts =
69     QLIST_HEAD_INITIALIZER(memset_contexts);
70 
/* State shared by all worker threads of one preallocation request. */
71 typedef struct MemsetContext {
    /* Set under page_mutex; workers wait on page_cond until it is true. */
72     bool all_threads_created;
    /* NOTE(review): not referenced in the code visible here. */
73     bool any_thread_failed;
    /* Array of num_threads per-thread descriptors. */
74     struct MemsetThread *threads;
75     int num_threads;
    /* Linkage into memset_contexts (asynchronous requests only). */
76     QLIST_ENTRY(MemsetContext) next;
77 } MemsetContext;
78 
/* Per-thread work description: numpages pages of hpagesize bytes at addr. */
79 struct MemsetThread {
80     char *addr;
81     size_t numpages;
82     size_t hpagesize;
83     QemuThread pgthread;
    /* Jump target armed by do_touch_pages() and used by sigbus_handler(). */
84     sigjmp_buf env;
    /* Back pointer to the owning request. */
85     MemsetContext *context;
86 };
87 typedef struct MemsetThread MemsetThread;
88 
89 /* used by sigbus_handler() */
90 static MemsetContext *sigbus_memset_context;
/* SIGBUS disposition saved by qemu_prealloc_mem() and restored afterwards. */
91 struct sigaction sigbus_oldact;
/* Held by qemu_prealloc_mem() while its SIGBUS handler is installed. */
92 static QemuMutex sigbus_mutex;
93 
/* Protect and signal MemsetContext::all_threads_created. */
94 static QemuMutex page_mutex;
95 static QemuCond page_cond;
96 
/*
 * Return an identifier for the calling thread.
 *
 * Linux, FreeBSD, NetBSD and OpenBSD each use their native thread-id
 * primitive; any other host falls back to the process id.
 */
int qemu_get_thread_id(void)
{
    int id;

#if defined(__linux__)
    id = syscall(SYS_gettid);
#elif defined(__FreeBSD__)
    long freebsd_tid;

    /* thread id is up to INT_MAX */
    thr_self(&freebsd_tid);
    id = (int)freebsd_tid;
#elif defined(__NetBSD__)
    id = _lwp_self();
#elif defined(__OpenBSD__)
    id = getthrid();
#else
    id = getpid();
#endif

    return id;
}
114 
/*
 * Send signal @sig to the thread identified by @tid, where @tid is a value
 * of the kind returned by qemu_get_thread_id().
 *
 * Returns whatever the underlying host primitive returns.  On hosts without
 * a per-thread primitive, @tid is passed to kill() as a process id.
 */
int qemu_kill_thread(int tid, int sig)
{
    int rc;

#if defined(__linux__)
    rc = syscall(__NR_tgkill, getpid(), tid, sig);
#elif defined(__FreeBSD__)
    rc = thr_kill2(getpid(), tid, sig);
#elif defined(__NetBSD__)
    rc = _lwp_kill(tid, sig);
#elif defined(__OpenBSD__)
    rc = thrkill(tid, sig, NULL);
#else
    rc = kill(tid, sig);
#endif

    return rc;
}
129 
/*
 * Thin wrapper around daemon(3): both arguments are forwarded unchanged and
 * its return value is handed back to the caller.
 */
int qemu_daemon(int nochdir, int noclose)
{
    int rc = daemon(nochdir, noclose);

    return rc;
}
134 
135 bool qemu_write_pidfile(const char *path, Error **errp)
136 {
137     int fd;
138     char pidstr[32];
139 
140     while (1) {
141         struct stat a, b;
142         struct flock lock = {
143             .l_type = F_WRLCK,
144             .l_whence = SEEK_SET,
145             .l_len = 0,
146         };
147 
148         fd = qemu_create(path, O_WRONLY, S_IRUSR | S_IWUSR, errp);
149         if (fd == -1) {
150             return false;
151         }
152 
153         if (fstat(fd, &b) < 0) {
154             error_setg_errno(errp, errno, "Cannot stat file");
155             goto fail_close;
156         }
157 
158         if (fcntl(fd, F_SETLK, &lock)) {
159             error_setg_errno(errp, errno, "Cannot lock pid file");
160             goto fail_close;
161         }
162 
163         /*
164          * Now make sure the path we locked is the same one that now
165          * exists on the filesystem.
166          */
167         if (stat(path, &a) < 0) {
168             /*
169              * PID file disappeared, someone else must be racing with
170              * us, so try again.
171              */
172             close(fd);
173             continue;
174         }
175 
176         if (a.st_ino == b.st_ino) {
177             break;
178         }
179 
180         /*
181          * PID file was recreated, someone else must be racing with
182          * us, so try again.
183          */
184         close(fd);
185     }
186 
187     if (ftruncate(fd, 0) < 0) {
188         error_setg_errno(errp, errno, "Failed to truncate pid file");
189         goto fail_unlink;
190     }
191 
192     snprintf(pidstr, sizeof(pidstr), FMT_pid "\n", getpid());
193     if (qemu_write_full(fd, pidstr, strlen(pidstr)) != strlen(pidstr)) {
194         error_setg(errp, "Failed to write pid file");
195         goto fail_unlink;
196     }
197 
198     return true;
199 
200 fail_unlink:
201     unlink(path);
202 fail_close:
203     close(fd);
204     return false;
205 }
206 
207 /* alloc shared memory pages */
208 void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared,
209                           bool noreserve)
210 {
211     const uint32_t qemu_map_flags = (shared ? QEMU_MAP_SHARED : 0) |
212                                     (noreserve ? QEMU_MAP_NORESERVE : 0);
213     size_t align = QEMU_VMALLOC_ALIGN;
214 #ifndef EMSCRIPTEN
215     void *ptr = qemu_ram_mmap(-1, size, align, qemu_map_flags, 0);
216 
217     if (ptr == MAP_FAILED) {
218         return NULL;
219     }
220 #else
221     /*
222      * qemu_ram_mmap is not implemented for Emscripten. Use qemu_memalign
223      * for the anonymous allocation. noreserve is ignored as there is no swap
224      * space on Emscripten, and shared is ignored as there is no other
225      * processes on Emscripten.
226      */
227     void *ptr = qemu_memalign(align, size);
228 #endif
229 
230     if (alignment) {
231         *alignment = align;
232     }
233 
234     trace_qemu_anon_ram_alloc(size, ptr);
235     return ptr;
236 }
237 
/*
 * Release a region obtained from qemu_anon_ram_alloc().
 *
 * @ptr:  start of the region
 * @size: size that was passed at allocation time
 */
void qemu_anon_ram_free(void *ptr, size_t size)
{
    trace_qemu_anon_ram_free(ptr, size);

#ifdef EMSCRIPTEN
    /*
     * qemu_ram_munmap is not implemented for Emscripten and the memory came
     * from qemu_memalign, so it has to go back through the matching
     * freeing function.
     */
    qemu_vfree(ptr);
#else
    qemu_ram_munmap(-1, ptr, size);
#endif
}
252 
253 bool qemu_set_blocking(int fd, bool block, Error **errp)
254 {
255     g_autoptr(GError) err = NULL;
256 
257     if (!g_unix_set_fd_nonblocking(fd, !block, &err)) {
258         error_setg_errno(errp, errno,
259                          "Can't set file descriptor %d %s: %s", fd,
260                          block ? "blocking" : "non-blocking",
261                          err->message);
262         return false;
263     }
264 
265     return true;
266 }
267 
/*
 * Enable SO_REUSEADDR on socket @fd.
 *
 * Failure of setsockopt() is treated as a programming error and trips an
 * assertion, so the value handed back is 0 whenever assertions are enabled.
 */
int socket_set_fast_reuse(int fd)
{
    const int enable = 1;
    int ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
                         (const char *)&enable, sizeof(enable));

    assert(ret == 0);

    return ret;
}
279 
/*
 * Set the close-on-exec flag on @fd, keeping the other descriptor flags.
 * Any fcntl() failure trips an assertion.
 */
void qemu_set_cloexec(int fd)
{
    int flags = fcntl(fd, F_GETFD);
    int rc;

    assert(flags != -1);

    rc = fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
    assert(rc != -1);
}
288 
/*
 * socketpair() wrapper that makes both ends close-on-exec.
 *
 * Where SOCK_CLOEXEC exists the flag is requested at creation time.  Only
 * when that attempt fails with EINVAL is the pair created without the flag
 * and marked afterwards.
 *
 * Returns the socketpair() result.
 */
int qemu_socketpair(int domain, int type, int protocol, int sv[2])
{
    int rc;

#ifdef SOCK_CLOEXEC
    rc = socketpair(domain, type | SOCK_CLOEXEC, protocol, sv);
    if (!(rc == -1 && errno == EINVAL)) {
        return rc;
    }
#endif

    rc = socketpair(domain, type, protocol, sv);
    if (rc != 0) {
        return rc;
    }

    qemu_set_cloexec(sv[0]);
    qemu_set_cloexec(sv[1]);
    return rc;
}
307 
/*
 * Clear the close-on-exec flag on @fd, keeping the other descriptor flags.
 * Any fcntl() failure trips an assertion.
 */
void qemu_clear_cloexec(int fd)
{
    int flags = fcntl(fd, F_GETFD);
    int rc;

    assert(flags != -1);

    rc = fcntl(fd, F_SETFD, flags & ~FD_CLOEXEC);
    assert(rc != -1);
}
316 
317 char *
318 qemu_get_local_state_dir(void)
319 {
320     return get_relocated_path(CONFIG_QEMU_LOCALSTATEDIR);
321 }
322 
/*
 * Turn terminal echo and canonical input processing on or off for @fd.
 *
 * @echo: true sets ECHO, ECHONL, ICANON and IEXTEN; false clears them.
 *
 * If the current terminal attributes cannot be read (for example @fd is
 * not a terminal) the function does nothing: there is no valid state to
 * modify, and writing back an uninitialised struct termios would apply
 * arbitrary settings.
 */
void qemu_set_tty_echo(int fd, bool echo)
{
    const tcflag_t echo_bits = ECHO | ECHONL | ICANON | IEXTEN;
    struct termios tty;

    if (tcgetattr(fd, &tty) != 0) {
        return;
    }

    if (echo) {
        tty.c_lflag |= echo_bits;
    } else {
        tty.c_lflag &= ~echo_bits;
    }

    tcsetattr(fd, TCSANOW, &tty);
}
337 
/*
 * SIGBUS handler installed by qemu_prealloc_mem() while pages are being
 * touched manually.
 *
 * If the signal arrived on one of the worker threads of the request
 * published in sigbus_memset_context, control jumps back into that
 * thread's do_touch_pages() through its saved sigjmp_buf. Otherwise the
 * signal is forwarded to the previously installed SA_SIGINFO handler
 * (Linux only) or reported and ignored.
 */
338 #ifdef CONFIG_LINUX
339 static void sigbus_handler(int signal, siginfo_t *siginfo, void *ctx)
340 #else /* CONFIG_LINUX */
341 static void sigbus_handler(int signal)
342 #endif /* CONFIG_LINUX */
343 {
344     int i;
345 
346     if (sigbus_memset_context) {
        /* Find out whether the faulting thread is one of our workers. */
347         for (i = 0; i < sigbus_memset_context->num_threads; i++) {
348             MemsetThread *thread = &sigbus_memset_context->threads[i];
349 
350             if (qemu_thread_is_self(&thread->pgthread)) {
                /* Does not return: resumes at sigsetjmp() in the worker. */
351                 siglongjmp(thread->env, 1);
352             }
353         }
354     }
355 
356 #ifdef CONFIG_LINUX
357     /*
358      * We assume that the MCE SIGBUS handler could have been registered. We
359      * should never receive BUS_MCEERR_AO on any of our threads, but only on
360      * the main thread registered for PR_MCE_KILL_EARLY. Further, we should not
361      * receive BUS_MCEERR_AR triggered by action of other threads on one of
362      * our threads. So, no need to check for unrelated SIGBUS when seeing one
363      * for our threads.
364      *
365      * We will forward to the MCE handler, which will either handle the SIGBUS
366      * or reinstall the default SIGBUS handler and reraise the SIGBUS. The
367      * default SIGBUS handler will crash the process, so we don't care.
368      */
369     if (sigbus_oldact.sa_flags & SA_SIGINFO) {
370         sigbus_oldact.sa_sigaction(signal, siginfo, ctx);
371         return;
372     }
373 #endif /* CONFIG_LINUX */
374     warn_report("qemu_prealloc_mem: unrelated SIGBUS detected and ignored");
375 }
376 
/*
 * Worker thread body that populates pages by touching them.
 *
 * @arg: the MemsetThread describing the range (addr, numpages, hpagesize).
 *
 * Waits until the owning context reports that all threads were created,
 * then reads and writes back one byte per page. A SIGBUS raised while
 * touching is turned into an error by sigbus_handler() jumping back to the
 * sigsetjmp() below.
 *
 * Returns 0 or -EFAULT, encoded in the thread's void * return value.
 */
377 static void *do_touch_pages(void *arg)
378 {
379     MemsetThread *memset_args = (MemsetThread *)arg;
380     sigset_t set, oldset;
381     int ret = 0;
382 
383     /*
384      * On Linux, the page faults from the loop below can cause mmap_sem
385      * contention with allocation of the thread stacks.  Do not start
386      * clearing until all threads have been created.
387      */
388     qemu_mutex_lock(&page_mutex);
389     while (!memset_args->context->all_threads_created) {
390         qemu_cond_wait(&page_cond, &page_mutex);
391     }
392     qemu_mutex_unlock(&page_mutex);
393 
394     /* unblock SIGBUS */
395     sigemptyset(&set);
396     sigaddset(&set, SIGBUS);
397     pthread_sigmask(SIG_UNBLOCK, &set, &oldset);
398 
    /*
     * A non-zero result means sigbus_handler() jumped back here with
     * siglongjmp() after a SIGBUS hit this thread inside the loop.
     */
399     if (sigsetjmp(memset_args->env, 1)) {
400         ret = -EFAULT;
401     } else {
402         char *addr = memset_args->addr;
403         size_t numpages = memset_args->numpages;
404         size_t hpagesize = memset_args->hpagesize;
405         size_t i;
406         for (i = 0; i < numpages; i++) {
407             /*
408              * Read & write back the same value, so we don't
409              * corrupt existing user/app data that might be
410              * stored.
411              *
412              * 'volatile' to stop compiler optimizing this away
413              * to a no-op
414              */
415             *(volatile char *)addr = *addr;
416             addr += hpagesize;
417         }
418     }
    /* Restore the signal mask that was in effect before the unblock above. */
419     pthread_sigmask(SIG_SETMASK, &oldset, NULL);
420     return (void *)(uintptr_t)ret;
421 }
422 
423 static void *do_madv_populate_write_pages(void *arg)
424 {
425     MemsetThread *memset_args = (MemsetThread *)arg;
426     const size_t size = memset_args->numpages * memset_args->hpagesize;
427     char * const addr = memset_args->addr;
428     int ret = 0;
429 
430     /* See do_touch_pages(). */
431     qemu_mutex_lock(&page_mutex);
432     while (!memset_args->context->all_threads_created) {
433         qemu_cond_wait(&page_cond, &page_mutex);
434     }
435     qemu_mutex_unlock(&page_mutex);
436 
437     if (size && qemu_madvise(addr, size, QEMU_MADV_POPULATE_WRITE)) {
438         ret = -errno;
439     }
440     return (void *)(uintptr_t)ret;
441 }
442 
443 static inline int get_memset_num_threads(size_t hpagesize, size_t numpages,
444                                          int max_threads)
445 {
446     long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
447     int ret = 1;
448 
449     if (host_procs > 0) {
450         ret = MIN(MIN(host_procs, MAX_MEM_PREALLOC_THREAD_COUNT), max_threads);
451     }
452 
453     /* Especially with gigantic pages, don't create more threads than pages. */
454     ret = MIN(ret, numpages);
455     /* Don't start threads to prealloc comparatively little memory. */
456     ret = MIN(ret, MAX(1, hpagesize * numpages / (64 * MiB)));
457 
458     /* In case sysconf() fails, we fall back to single threaded */
459     return ret;
460 }
461 
462 static int wait_and_free_mem_prealloc_context(MemsetContext *context)
463 {
464     int i, ret = 0, tmp;
465 
466     for (i = 0; i < context->num_threads; i++) {
467         tmp = (uintptr_t)qemu_thread_join(&context->threads[i].pgthread);
468 
469         if (tmp) {
470             ret = tmp;
471         }
472     }
473     g_free(context->threads);
474     g_free(context);
475     return ret;
476 }
477 
/*
 * Populate @numpages pages of @hpagesize bytes starting at @area, spreading
 * the work over several threads.
 *
 * @max_threads:             upper limit passed to get_memset_num_threads()
 * @tc:                      if non-NULL, threads are created through this
 *                           thread context
 * @async:                   queue the request instead of waiting; only
 *                           honoured with MADV_POPULATE_WRITE and a @tc
 * @use_madv_populate_write: use MADV_POPULATE_WRITE instead of touching
 *                           each page by hand
 *
 * Returns 0 on success (or when the request was queued asynchronously),
 * otherwise a negative errno value.
 */
478 static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
479                            int max_threads, ThreadContext *tc, bool async,
480                            bool use_madv_populate_write)
481 {
482     static gsize initialized = 0;
483     MemsetContext *context = g_malloc0(sizeof(MemsetContext));
484     size_t numpages_per_thread, leftover;
485     void *(*touch_fn)(void *);
486     int ret, i = 0;
487     char *addr = area;
488 
489     /*
490      * Asynchronous preallocation is only allowed when using MADV_POPULATE_WRITE
491      * and prealloc context for thread placement.
492      */
493     if (!use_madv_populate_write || !tc) {
494         async = false;
495     }
496 
497     context->num_threads =
498         get_memset_num_threads(hpagesize, numpages, max_threads);
499 
    /* One-time initialisation of the mutex/condvar the workers wait on. */
500     if (g_once_init_enter(&initialized)) {
501         qemu_mutex_init(&page_mutex);
502         qemu_cond_init(&page_cond);
503         g_once_init_leave(&initialized, 1);
504     }
505 
506     if (use_madv_populate_write) {
507         /*
508          * Avoid creating a single thread for MADV_POPULATE_WRITE when
509          * preallocating synchronously.
510          */
511         if (context->num_threads == 1 && !async) {
512             ret = 0;
513             if (qemu_madvise(area, hpagesize * numpages,
514                              QEMU_MADV_POPULATE_WRITE)) {
515                 ret = -errno;
516             }
517             g_free(context);
518             return ret;
519         }
520         touch_fn = do_madv_populate_write_pages;
521     } else {
522         touch_fn = do_touch_pages;
523     }
524 
    /*
     * Split the pages evenly; the first 'leftover' threads take one extra.
     * NOTE(review): this assumes num_threads >= 1 -- a value of 0 would
     * divide by zero here. Confirm get_memset_num_threads() can never
     * return 0 (e.g. for numpages == 0).
     */
525     context->threads = g_new0(MemsetThread, context->num_threads);
526     numpages_per_thread = numpages / context->num_threads;
527     leftover = numpages % context->num_threads;
528     for (i = 0; i < context->num_threads; i++) {
529         context->threads[i].addr = addr;
530         context->threads[i].numpages = numpages_per_thread + (i < leftover);
531         context->threads[i].hpagesize = hpagesize;
532         context->threads[i].context = context;
533         if (tc) {
534             thread_context_create_thread(tc, &context->threads[i].pgthread,
535                                          "touch_pages",
536                                          touch_fn, &context->threads[i],
537                                          QEMU_THREAD_JOINABLE);
538         } else {
539             qemu_thread_create(&context->threads[i].pgthread, "touch_pages",
540                                touch_fn, &context->threads[i],
541                                QEMU_THREAD_JOINABLE);
542         }
        /* The next thread starts right after this thread's range. */
543         addr += context->threads[i].numpages * hpagesize;
544     }
545 
546     if (async) {
547         /*
548          * async requests currently require the BQL. Add it to the list and kick
549          * preallocation off during qemu_finish_async_prealloc_mem().
550          */
551         assert(bql_locked());
552         QLIST_INSERT_HEAD(&memset_contexts, context, next);
553         return 0;
554     }
555 
    /* Let sigbus_handler() recognise our workers while they touch pages. */
556     if (!use_madv_populate_write) {
557         sigbus_memset_context = context;
558     }
559 
    /* Release the workers, which are parked on page_cond. */
560     qemu_mutex_lock(&page_mutex);
561     context->all_threads_created = true;
562     qemu_cond_broadcast(&page_cond);
563     qemu_mutex_unlock(&page_mutex);
564 
    /* Joins all workers and frees 'context'; it must not be used afterwards. */
565     ret = wait_and_free_mem_prealloc_context(context);
566 
567     if (!use_madv_populate_write) {
568         sigbus_memset_context = NULL;
569     }
570     return ret;
571 }
572 
573 bool qemu_finish_async_prealloc_mem(Error **errp)
574 {
575     int ret = 0, tmp;
576     MemsetContext *context, *next_context;
577 
578     /* Waiting for preallocation requires the BQL. */
579     assert(bql_locked());
580     if (QLIST_EMPTY(&memset_contexts)) {
581         return true;
582     }
583 
584     qemu_mutex_lock(&page_mutex);
585     QLIST_FOREACH(context, &memset_contexts, next) {
586         context->all_threads_created = true;
587     }
588     qemu_cond_broadcast(&page_cond);
589     qemu_mutex_unlock(&page_mutex);
590 
591     QLIST_FOREACH_SAFE(context, &memset_contexts, next, next_context) {
592         QLIST_REMOVE(context, next);
593         tmp = wait_and_free_mem_prealloc_context(context);
594         if (tmp) {
595             ret = tmp;
596         }
597     }
598 
599     if (ret) {
600         error_setg_errno(errp, -ret,
601                          "qemu_prealloc_mem: preallocating memory failed");
602         return false;
603     }
604     return true;
605 }
606 
607 static bool madv_populate_write_possible(char *area, size_t pagesize)
608 {
609     return !qemu_madvise(area, pagesize, QEMU_MADV_POPULATE_WRITE) ||
610            errno != EINVAL;
611 }
612 
/*
 * Preallocate (populate) @sz bytes of memory starting at @area.
 *
 * @fd:          file descriptor used to look up the page size of the mapping
 * @max_threads: upper limit on the number of worker threads
 * @tc:          optional thread context used to create the workers
 * @async:       request asynchronous preallocation (see touch_all_pages())
 * @errp:        set on failure
 *
 * MADV_POPULATE_WRITE is used when the probe on the first page allows it.
 * Otherwise pages are touched by hand with a temporary SIGBUS handler
 * installed; sigbus_mutex is held for as long as that handler is in place.
 *
 * Returns true on success, false with @errp set on failure. If the previous
 * SIGBUS handler cannot be restored the process exits.
 */
613 bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
614                        ThreadContext *tc, bool async, Error **errp)
615 {
616     static gsize initialized;
617     int ret;
618 #ifndef EMSCRIPTEN
619     size_t hpagesize = qemu_fd_getpagesize(fd);
620 #else
621     /*
622      * mmap-alloc.c is excluded from Emscripten build, so qemu_fd_getpagesize
623      * is unavailable. Fallback to the lower level implementation.
624      */
625     size_t hpagesize = qemu_real_host_page_size();
626 #endif
    /* A trailing partial page still counts as a page to populate. */
627     size_t numpages = DIV_ROUND_UP(sz, hpagesize);
628     bool use_madv_populate_write;
629     struct sigaction act;
630     bool rv = true;
631 
632     /*
633      * Sense on every invocation, as MADV_POPULATE_WRITE cannot be used for
634      * some special mappings, such as mapping /dev/mem.
635      */
636     use_madv_populate_write = madv_populate_write_possible(area, hpagesize);
637 
638     if (!use_madv_populate_write) {
        /* One-time initialisation of sigbus_mutex. */
639         if (g_once_init_enter(&initialized)) {
640             qemu_mutex_init(&sigbus_mutex);
641             g_once_init_leave(&initialized, 1);
642         }
643 
        /* Held until the previous SIGBUS handler is restored below. */
644         qemu_mutex_lock(&sigbus_mutex);
645         memset(&act, 0, sizeof(act));
646 #ifdef CONFIG_LINUX
647         act.sa_sigaction = &sigbus_handler;
648         act.sa_flags = SA_SIGINFO;
649 #else /* CONFIG_LINUX */
650         act.sa_handler = &sigbus_handler;
651         act.sa_flags = 0;
652 #endif /* CONFIG_LINUX */
653 
        /* The previous disposition is kept in sigbus_oldact. */
654         ret = sigaction(SIGBUS, &act, &sigbus_oldact);
655         if (ret) {
656             qemu_mutex_unlock(&sigbus_mutex);
657             error_setg_errno(errp, errno,
658                 "qemu_prealloc_mem: failed to install signal handler");
659             return false;
660         }
661     }
662 
663     /* touch pages simultaneously */
664     ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc, async,
665                           use_madv_populate_write);
666     if (ret) {
667         error_setg_errno(errp, -ret,
668                          "qemu_prealloc_mem: preallocating memory failed");
669         rv = false;
670     }
671 
672     if (!use_madv_populate_write) {
673         ret = sigaction(SIGBUS, &sigbus_oldact, NULL);
674         if (ret) {
675             /* Terminate QEMU since it can't recover from error */
676             perror("qemu_prealloc_mem: failed to reinstall signal handler");
677             exit(1);
678         }
679         qemu_mutex_unlock(&sigbus_mutex);
680     }
681     return rv;
682 }
683 
/*
 * Look up a name for process @pid.
 *
 * On FreeBSD this is a copy of the ki_comm field reported by
 * kinfo_getproc(); elsewhere it is the content of /proc/<pid>/cmdline.
 *
 * Returns a newly allocated string, or NULL if the lookup failed.
 */
char *qemu_get_pid_name(pid_t pid)
{
#if defined(__FreeBSD__)
    /* BSDs don't have /proc, but they provide a nice substitute */
    struct kinfo_proc *proc = kinfo_getproc(pid);
    char *name;

    if (!proc) {
        return NULL;
    }

    name = g_strdup(proc->ki_comm);
    free(proc);
    return name;
#else
    /* Assume a system with reasonable procfs */
    char *name = NULL;
    size_t len;
    char *pid_path = g_strdup_printf("/proc/%d/cmdline", pid);

    g_file_get_contents(pid_path, &name, &len, NULL);
    g_free(pid_path);
    return name;
#endif
}
708 
709 
710 void *qemu_alloc_stack(size_t *sz)
711 {
712     void *ptr;
713     int flags;
714 #ifdef CONFIG_DEBUG_STACK_USAGE
715     void *ptr2;
716 #endif
717     size_t pagesz = qemu_real_host_page_size();
718 #ifdef _SC_THREAD_STACK_MIN
719     /* avoid stacks smaller than _SC_THREAD_STACK_MIN */
720     long min_stack_sz = sysconf(_SC_THREAD_STACK_MIN);
721     *sz = MAX(MAX(min_stack_sz, 0), *sz);
722 #endif
723     /* adjust stack size to a multiple of the page size */
724     *sz = ROUND_UP(*sz, pagesz);
725     /* allocate one extra page for the guard page */
726     *sz += pagesz;
727 
728     flags = MAP_PRIVATE | MAP_ANONYMOUS;
729 #if defined(MAP_STACK) && defined(__OpenBSD__)
730     /* Only enable MAP_STACK on OpenBSD. Other OS's such as
731      * Linux/FreeBSD/NetBSD have a flag with the same name
732      * but have differing functionality. OpenBSD will SEGV
733      * if it spots execution with a stack pointer pointing
734      * at memory that was not allocated with MAP_STACK.
735      */
736     flags |= MAP_STACK;
737 #endif
738 
739     ptr = mmap(NULL, *sz, PROT_READ | PROT_WRITE, flags, -1, 0);
740     if (ptr == MAP_FAILED) {
741         perror("failed to allocate memory for stack");
742         abort();
743     }
744 
745     /* Stack grows down -- guard page at the bottom. */
746     if (mprotect(ptr, pagesz, PROT_NONE) != 0) {
747         perror("failed to set up stack guard page");
748         abort();
749     }
750 
751 #ifdef CONFIG_DEBUG_STACK_USAGE
752     for (ptr2 = ptr + pagesz; ptr2 < ptr + *sz; ptr2 += sizeof(uint32_t)) {
753         *(uint32_t *)ptr2 = 0xdeadbeaf;
754     }
755 #endif
756 
757     return ptr;
758 }
759 
#ifdef CONFIG_DEBUG_STACK_USAGE
/* Largest stack usage seen so far by qemu_free_stack() on this thread. */
static __thread unsigned int max_stack_usage;
#endif

/*
 * Release a stack mapping of @sz bytes starting at @stack.
 *
 * With CONFIG_DEBUG_STACK_USAGE, first scan upwards from just above the
 * guard page for the first word that no longer holds the 0xdeadbeaf marker
 * and report when the resulting usage exceeds the previous per-thread
 * maximum.
 */
void qemu_free_stack(void *stack, size_t sz)
{
#ifdef CONFIG_DEBUG_STACK_USAGE
    char *base = stack;
    char *probe = base + qemu_real_host_page_size();
    unsigned int used;

    while (probe < base + sz && *(uint32_t *)probe == 0xdeadbeaf) {
        probe += sizeof(uint32_t);
    }

    used = sz - (uintptr_t) (probe - base);
    if (used > max_stack_usage) {
        error_report("thread %d max stack usage increased from %u to %u",
                     qemu_get_thread_id(), max_stack_usage, used);
        max_stack_usage = used;
    }
#endif

    munmap(stack, sz);
}
786 
787 /*
788  * Disable CFI checks.
789  * We are going to call a signal handler directly. Such handler may or may not
790  * have been defined in our binary, so there's no guarantee that the pointer
791  * used to set the handler is a cfi-valid pointer. Since the handlers are
792  * stored in kernel memory, changing the handler to an attacker-defined
793  * function requires being able to call a sigaction() syscall,
794  * which is not as easy as overwriting a pointer in memory.
795  */
796 QEMU_DISABLE_CFI
797 void sigaction_invoke(struct sigaction *action,
798                       struct qemu_signalfd_siginfo *info)
799 {
800     siginfo_t si = {};
801     si.si_signo = info->ssi_signo;
802     si.si_errno = info->ssi_errno;
803     si.si_code = info->ssi_code;
804 
805     /* Convert the minimal set of fields defined by POSIX.
806      * Positive si_code values are reserved for kernel-generated
807      * signals, where the valid siginfo fields are determined by
808      * the signal number.  But according to POSIX, it is unspecified
809      * whether SI_USER and SI_QUEUE have values less than or equal to
810      * zero.
811      */
812     if (info->ssi_code == SI_USER || info->ssi_code == SI_QUEUE ||
813         info->ssi_code <= 0) {
814         /* SIGTERM, etc.  */
815         si.si_pid = info->ssi_pid;
816         si.si_uid = info->ssi_uid;
817     } else if (info->ssi_signo == SIGILL || info->ssi_signo == SIGFPE ||
818                info->ssi_signo == SIGSEGV || info->ssi_signo == SIGBUS) {
819         si.si_addr = (void *)(uintptr_t)info->ssi_addr;
820     } else if (info->ssi_signo == SIGCHLD) {
821         si.si_pid = info->ssi_pid;
822         si.si_status = info->ssi_status;
823         si.si_uid = info->ssi_uid;
824     }
825     action->sa_sigaction(info->ssi_signo, &si, NULL);
826 }
827 
/*
 * Return the amount of physical memory of the host in bytes.
 *
 * The value saturates at SIZE_MAX if the byte count does not fit in a
 * size_t. Returns 0 if the page count is unavailable.
 */
size_t qemu_get_host_physmem(void)
{
#ifdef _SC_PHYS_PAGES
    long pages = sysconf(_SC_PHYS_PAGES);

    if (pages > 0) {
        if (pages <= SIZE_MAX / qemu_real_host_page_size()) {
            return pages * qemu_real_host_page_size();
        }
        return SIZE_MAX;
    }
#endif
    return 0;
}
842 
/*
 * Synchronously flush the mapping that covers [@addr, @addr + @length).
 *
 * msync() is given a page-aligned address, so the start is rounded down to
 * a page boundary. The length is extended by the same amount and then
 * rounded up to a whole number of pages. @fd is not used here.
 *
 * Returns the msync() result.
 */
int qemu_msync(void *addr, size_t length, int fd)
{
    const size_t offset_mask = qemu_real_host_page_size() - 1;
    const uintptr_t start = (uintptr_t)addr;

    /* account for the bytes between the page boundary and addr */
    length += start & offset_mask;
    /* round up to the multiple of PAGE_SIZE */
    length = (length + offset_mask) & ~offset_mask;

    return msync((void *)(start & ~offset_mask), length, MS_SYNC);
}
860 
/*
 * Close every open file descriptor listed in /proc/self/fd, except those in
 * @skip and the descriptor used for the directory scan itself.
 *
 * @skip:  descriptors to keep open; the early-exit test below relies on the
 *         array being sorted in ascending order
 * @nskip: number of entries in @skip
 *
 * Returns false if /proc/self/fd cannot be opened, true otherwise.
 */
static bool qemu_close_all_open_fd_proc(const int *skip, unsigned int nskip)
{
    struct dirent *de;
    int fd, dfd;
    DIR *dir;
    unsigned int skip_start = 0, skip_end = nskip;

    dir = opendir("/proc/self/fd");
    if (!dir) {
        /* If /proc is not mounted, there is nothing that can be done. */
        return false;
    }
    /* Avoid closing the directory. */
    dfd = dirfd(dir);

    for (de = readdir(dir); de; de = readdir(dir)) {
        bool close_fd = true;

        if (de->d_name[0] == '.') {
            continue;
        }
        fd = atoi(de->d_name);
        if (fd == dfd) {
            continue;
        }

        for (unsigned int i = skip_start; i < skip_end; i++) {
            if (fd < skip[i]) {
                /* We are below the next skipped fd, break */
                break;
            } else if (fd == skip[i]) {
                close_fd = false;
                /*
                 * Restrict the range as we found fds matching start/end.
                 * The last candidate is skip_end - 1 (the loop never
                 * reaches skip_end itself), so that is the index to test
                 * when shrinking the window from the top.
                 */
                if (i == skip_start) {
                    skip_start++;
                } else if (i == skip_end - 1) {
                    skip_end--;
                }
                break;
            }
        }

        if (close_fd) {
            close(fd);
        }
    }
    closedir(dir);

    return true;
}
911 
/*
 * Close all descriptors in [0, open_max - 1] except those in @skip, using
 * close_range() on the gaps between skipped descriptors.
 *
 * @skip:     descriptors to keep open
 * @nskip:    number of entries in @skip
 * @open_max: one past the highest descriptor to consider
 *
 * NOTE(review): the walk only moves forward through @skip, so it appears to
 * rely on the array being sorted ascending without duplicates -- confirm
 * with callers.
 *
 * Returns true when every range was closed. Returns false if a
 * close_range() call fails or CONFIG_CLOSE_RANGE is not defined, which lets
 * the caller fall back to another method.
 */
912 static bool qemu_close_all_open_fd_close_range(const int *skip,
913                                                unsigned int nskip,
914                                                int open_max)
915 {
916 #ifdef CONFIG_CLOSE_RANGE
917     int max_fd = open_max - 1;
918     int first = 0, last;
919     unsigned int cur_skip = 0;
920     int ret;
921 
922     do {
923         /* Find the start boundary of the range to close */
924         while (cur_skip < nskip && first == skip[cur_skip]) {
925             cur_skip++;
926             first++;
927         }
928 
929         /* Find the upper boundary of the range to close */
930         last = max_fd;
931         if (cur_skip < nskip) {
            /* Stop just below the next descriptor that must stay open. */
932             last = skip[cur_skip] - 1;
933             last = MIN(last, max_fd);
934         }
935 
936         /* With the adjustments to the range, we might be done. */
937         if (first > last) {
938             break;
939         }
940 
941         ret = close_range(first, last, 0);
942         if (ret < 0) {
943             return false;
944         }
945 
        /* Continue after the closed range; it lands on a skipped fd, if any. */
946         first = last + 1;
947     } while (last < max_fd);
948 
949     return true;
950 #else
951     return false;
952 #endif
953 }
954 
/*
 * Last-resort implementation: call close() on every descriptor number in
 * [0, open_max) that is not the next entry of @skip.
 *
 * @skip is consumed front to back, one entry per match.
 */
static void qemu_close_all_open_fd_fallback(const int *skip, unsigned int nskip,
                                            int open_max)
{
    unsigned int next_skip = 0;
    int fd = 0;

    while (fd < open_max) {
        if (next_skip < nskip && skip[next_skip] == fd) {
            next_skip++;
        } else {
            close(fd);
        }
        fd++;
    }
}
969 
/*
 * Close all open file descriptors, except the @nskip descriptors listed in
 * @skip (which may be NULL only when @nskip is 0).
 *
 * Strategies are tried in order: close_range(), then scanning
 * /proc/self/fd, then closing every possible descriptor number one by one.
 */
void qemu_close_all_open_fd(const int *skip, unsigned int nskip)
{
    int open_max = sysconf(_SC_OPEN_MAX);

    assert(!(skip == NULL && nskip != 0));

    if (qemu_close_all_open_fd_close_range(skip, nskip, open_max)) {
        return;
    }
    if (qemu_close_all_open_fd_proc(skip, nskip)) {
        return;
    }
    qemu_close_all_open_fd_fallback(skip, nskip, open_max);
}
984 
985 int qemu_shm_alloc(size_t size, Error **errp)
986 {
987     g_autoptr(GString) shm_name = g_string_new(NULL);
988     int fd, oflag, cur_sequence;
989     static int sequence;
990     mode_t mode;
991 
992     cur_sequence = qatomic_fetch_inc(&sequence);
993 
994     /*
995      * Let's use `mode = 0` because we don't want other processes to open our
996      * memory unless we share the file descriptor with them.
997      */
998     mode = 0;
999     oflag = O_RDWR | O_CREAT | O_EXCL;
1000 
1001     /*
1002      * Some operating systems allow creating anonymous POSIX shared memory
1003      * objects (e.g. FreeBSD provides the SHM_ANON constant), but this is not
1004      * defined by POSIX, so let's create a unique name.
1005      *
1006      * From Linux's shm_open(3) man-page:
1007      *   For  portable  use,  a shared  memory  object should be identified
1008      *   by a name of the form /somename;"
1009      */
1010     g_string_printf(shm_name, "/qemu-" FMT_pid "-shm-%d", getpid(),
1011                     cur_sequence);
1012 
1013     fd = shm_open(shm_name->str, oflag, mode);
1014     if (fd < 0) {
1015         error_setg_errno(errp, errno,
1016                          "failed to create POSIX shared memory");
1017         return -1;
1018     }
1019 
1020     /*
1021      * We have the file descriptor, so we no longer need to expose the
1022      * POSIX shared memory object. However it will remain allocated as long as
1023      * there are file descriptors pointing to it.
1024      */
1025     shm_unlink(shm_name->str);
1026 
1027     if (ftruncate(fd, size) == -1) {
1028         error_setg_errno(errp, errno,
1029                          "failed to resize POSIX shared memory to %zu", size);
1030         close(fd);
1031         return -1;
1032     }
1033 
1034     return fd;
1035 }
1036