xref: /openbmc/qemu/util/oslib-posix.c (revision 0f64fb674360393ae09605d8d53bf81c02c78a3e)
1 /*
2  * os-posix-lib.c
3  *
4  * Copyright (c) 2003-2008 Fabrice Bellard
5  * Copyright (c) 2010 Red Hat, Inc.
6  *
7  * QEMU library functions on POSIX which are shared between QEMU and
8  * the QEMU tools.
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining a copy
11  * of this software and associated documentation files (the "Software"), to deal
12  * in the Software without restriction, including without limitation the rights
13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14  * copies of the Software, and to permit persons to whom the Software is
15  * furnished to do so, subject to the following conditions:
16  *
17  * The above copyright notice and this permission notice shall be included in
18  * all copies or substantial portions of the Software.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26  * THE SOFTWARE.
27  */
28 
29 #include "qemu/osdep.h"
30 #include <termios.h>
31 
32 #include <glib/gprintf.h>
33 
34 #include "system/system.h"
35 #include "trace.h"
36 #include "qapi/error.h"
37 #include "qemu/error-report.h"
38 #include "qemu/madvise.h"
39 #include "qemu/sockets.h"
40 #include "qemu/thread.h"
41 #include <libgen.h>
42 #include "qemu/cutils.h"
43 #include "qemu/units.h"
44 #include "qemu/thread-context.h"
45 #include "qemu/main-loop.h"
46 
47 #ifdef CONFIG_LINUX
48 #include <sys/syscall.h>
49 #endif
50 
51 #ifdef __FreeBSD__
52 #include <sys/thr.h>
53 #include <sys/user.h>
54 #include <libutil.h>
55 #endif
56 
57 #ifdef __NetBSD__
58 #include <lwp.h>
59 #endif
60 
61 #include "qemu/memalign.h"
62 #include "qemu/mmap-alloc.h"
63 
/* Cap applied by get_memset_num_threads() to the preallocation thread count. */
#define MAX_MEM_PREALLOC_THREAD_COUNT 16

/* Forward declaration: MemsetContext below holds a pointer to this type. */
struct MemsetThread;

/*
 * Contexts of asynchronous preallocation requests. touch_all_pages() inserts
 * them and qemu_finish_async_prealloc_mem() removes and frees them.
 */
static QLIST_HEAD(, MemsetContext) memset_contexts =
    QLIST_HEAD_INITIALIZER(memset_contexts);
70 
/* State shared by all worker threads preallocating one memory area. */
typedef struct MemsetContext {
    /* Workers wait on page_cond (under page_mutex) until this is true. */
    bool all_threads_created;
    /* NOTE(review): not read or written anywhere in the visible code. */
    bool any_thread_failed;
    /* Array of num_threads per-thread work descriptors. */
    struct MemsetThread *threads;
    int num_threads;
    /* Linkage in memset_contexts (used for asynchronous requests). */
    QLIST_ENTRY(MemsetContext) next;
} MemsetContext;
78 
/* Work descriptor for a single preallocation thread. */
struct MemsetThread {
    /* Start of the slice this thread populates. */
    char *addr;
    /* Number of pages in the slice. */
    size_t numpages;
    /* Page size; used as the stride between touched addresses. */
    size_t hpagesize;
    /* The worker thread; joined in wait_and_free_mem_prealloc_context(). */
    QemuThread pgthread;
    /* Jump buffer that sigbus_handler() longjmps to for this thread. */
    sigjmp_buf env;
    /* Back pointer to the owning context. */
    MemsetContext *context;
};
typedef struct MemsetThread MemsetThread;
88 
/* used by sigbus_handler() */
/* Context whose threads sigbus_handler() matches against; NULL when idle. */
static MemsetContext *sigbus_memset_context;
/*
 * SIGBUS disposition saved by qemu_prealloc_mem() and restored afterwards.
 * NOTE(review): this has external linkage, unlike its neighbours -- confirm
 * whether another translation unit depends on it before making it static.
 */
struct sigaction sigbus_oldact;
/* Held by qemu_prealloc_mem() while its SIGBUS handler is installed. */
static QemuMutex sigbus_mutex;

/* Protect and signal MemsetContext::all_threads_created. */
static QemuMutex page_mutex;
static QemuCond page_cond;
96 
/*
 * Return an integer identifying the calling thread, using the native
 * per-thread id where the host provides one and the process id otherwise.
 */
int qemu_get_thread_id(void)
{
    int id;

#if defined(__linux__)
    id = syscall(SYS_gettid);
#elif defined(__FreeBSD__)
    /* thr_self() reports a long, but thread ids never exceed INT_MAX */
    long tid;
    thr_self(&tid);
    id = (int)tid;
#elif defined(__NetBSD__)
    id = _lwp_self();
#elif defined(__OpenBSD__)
    id = getthrid();
#else
    id = getpid();
#endif

    return id;
}
114 
/*
 * Send signal @sig to the thread identified by @tid (as returned by
 * qemu_get_thread_id()). Returns whatever the host primitive returns.
 */
int qemu_kill_thread(int tid, int sig)
{
    int rc;

#if defined(__linux__)
    rc = syscall(__NR_tgkill, getpid(), tid, sig);
#elif defined(__FreeBSD__)
    rc = thr_kill2(getpid(), tid, sig);
#elif defined(__NetBSD__)
    rc = _lwp_kill(tid, sig);
#elif defined(__OpenBSD__)
    rc = thrkill(tid, sig, NULL);
#else
    rc = kill(tid, sig);
#endif

    return rc;
}
129 
/* Thin wrapper around daemon(3); arguments and result are passed through. */
int qemu_daemon(int nochdir, int noclose)
{
    const int rc = daemon(nochdir, noclose);

    return rc;
}
134 
135 bool qemu_write_pidfile(const char *path, Error **errp)
136 {
137     int fd;
138     char pidstr[32];
139 
140     while (1) {
141         struct stat a, b;
142         struct flock lock = {
143             .l_type = F_WRLCK,
144             .l_whence = SEEK_SET,
145             .l_len = 0,
146         };
147 
148         fd = qemu_create(path, O_WRONLY, S_IRUSR | S_IWUSR, errp);
149         if (fd == -1) {
150             return false;
151         }
152 
153         if (fstat(fd, &b) < 0) {
154             error_setg_errno(errp, errno, "Cannot stat file");
155             goto fail_close;
156         }
157 
158         if (fcntl(fd, F_SETLK, &lock)) {
159             error_setg_errno(errp, errno, "Cannot lock pid file");
160             goto fail_close;
161         }
162 
163         /*
164          * Now make sure the path we locked is the same one that now
165          * exists on the filesystem.
166          */
167         if (stat(path, &a) < 0) {
168             /*
169              * PID file disappeared, someone else must be racing with
170              * us, so try again.
171              */
172             close(fd);
173             continue;
174         }
175 
176         if (a.st_ino == b.st_ino) {
177             break;
178         }
179 
180         /*
181          * PID file was recreated, someone else must be racing with
182          * us, so try again.
183          */
184         close(fd);
185     }
186 
187     if (ftruncate(fd, 0) < 0) {
188         error_setg_errno(errp, errno, "Failed to truncate pid file");
189         goto fail_unlink;
190     }
191 
192     snprintf(pidstr, sizeof(pidstr), FMT_pid "\n", getpid());
193     if (qemu_write_full(fd, pidstr, strlen(pidstr)) != strlen(pidstr)) {
194         error_setg(errp, "Failed to write pid file");
195         goto fail_unlink;
196     }
197 
198     return true;
199 
200 fail_unlink:
201     unlink(path);
202 fail_close:
203     close(fd);
204     return false;
205 }
206 
207 /* alloc shared memory pages */
208 void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared,
209                           bool noreserve)
210 {
211     const uint32_t qemu_map_flags = (shared ? QEMU_MAP_SHARED : 0) |
212                                     (noreserve ? QEMU_MAP_NORESERVE : 0);
213     size_t align = QEMU_VMALLOC_ALIGN;
214 #ifndef EMSCRIPTEN
215     void *ptr = qemu_ram_mmap(-1, size, align, qemu_map_flags, 0);
216 
217     if (ptr == MAP_FAILED) {
218         return NULL;
219     }
220 #else
221     /*
222      * qemu_ram_mmap is not implemented for Emscripten. Use qemu_memalign
223      * for the anonymous allocation. noreserve is ignored as there is no swap
224      * space on Emscripten, and shared is ignored as there is no other
225      * processes on Emscripten.
226      */
227     void *ptr = qemu_memalign(align, size);
228 #endif
229 
230     if (alignment) {
231         *alignment = align;
232     }
233 
234     trace_qemu_anon_ram_alloc(size, ptr);
235     return ptr;
236 }
237 
/* Release memory obtained from qemu_anon_ram_alloc(). */
void qemu_anon_ram_free(void *ptr, size_t size)
{
    trace_qemu_anon_ram_free(ptr, size);

#ifdef EMSCRIPTEN
    /*
     * On Emscripten the allocation came from qemu_memalign (there is no
     * qemu_ram_munmap), so it must be released with the matching call.
     */
    qemu_vfree(ptr);
#else
    qemu_ram_munmap(-1, ptr, size);
#endif
}
252 
253 void qemu_socket_set_block(int fd)
254 {
255     g_unix_set_fd_nonblocking(fd, false, NULL);
256 }
257 
258 int qemu_socket_try_set_nonblock(int fd)
259 {
260     return g_unix_set_fd_nonblocking(fd, true, NULL) ? 0 : -errno;
261 }
262 
/* Put @fd into non-blocking mode; asserts that the change succeeded. */
void qemu_socket_set_nonblock(int fd)
{
    int rc = qemu_socket_try_set_nonblock(fd);

    assert(rc == 0);
}
269 
/*
 * Enable SO_REUSEADDR on @fd. The call is asserted to succeed, so the
 * returned value is always 0 when assertions are enabled.
 */
int socket_set_fast_reuse(int fd)
{
    int enable = 1;
    int rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
                        (const char *)&enable, sizeof(enable));

    assert(rc == 0);

    return rc;
}
281 
/*
 * Set FD_CLOEXEC on @fd while keeping its other descriptor flags.
 * Both fcntl() calls are asserted to succeed.
 */
void qemu_set_cloexec(int fd)
{
    int flags = fcntl(fd, F_GETFD);
    int rc;

    assert(flags != -1);

    rc = fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
    assert(rc != -1);
}
290 
/*
 * socketpair() wrapper that marks both descriptors close-on-exec.
 *
 * Where SOCK_CLOEXEC exists it is tried first; only if the host rejects it
 * with EINVAL is the flag applied afterwards with qemu_set_cloexec().
 * Returns the socketpair() result.
 */
int qemu_socketpair(int domain, int type, int protocol, int sv[2])
{
    int rc;

#ifdef SOCK_CLOEXEC
    rc = socketpair(domain, type | SOCK_CLOEXEC, protocol, sv);
    if (!(rc == -1 && errno == EINVAL)) {
        /* Either it worked, or it failed for an unrelated reason. */
        return rc;
    }
#endif

    rc = socketpair(domain, type, protocol, sv);
    if (rc != 0) {
        return rc;
    }

    qemu_set_cloexec(sv[0]);
    qemu_set_cloexec(sv[1]);
    return rc;
}
309 
310 char *
311 qemu_get_local_state_dir(void)
312 {
313     return get_relocated_path(CONFIG_QEMU_LOCALSTATEDIR);
314 }
315 
/*
 * Switch terminal echo on @fd on or off.
 *
 * With @echo true, ECHO, ECHONL, ICANON and IEXTEN are set in the local
 * mode flags; with @echo false they are cleared. The change is applied
 * immediately (TCSANOW).
 *
 * If the current attributes cannot be read (for example @fd is not a
 * terminal) the function does nothing, rather than modifying and writing
 * back an uninitialized termios structure.
 */
void qemu_set_tty_echo(int fd, bool echo)
{
    const tcflag_t echo_flags = ECHO | ECHONL | ICANON | IEXTEN;
    struct termios tty;

    if (tcgetattr(fd, &tty) < 0) {
        return;
    }

    if (echo) {
        tty.c_lflag |= echo_flags;
    } else {
        tty.c_lflag &= ~echo_flags;
    }

    tcsetattr(fd, TCSANOW, &tty);
}
330 
331 #ifdef CONFIG_LINUX
332 static void sigbus_handler(int signal, siginfo_t *siginfo, void *ctx)
333 #else /* CONFIG_LINUX */
334 static void sigbus_handler(int signal)
335 #endif /* CONFIG_LINUX */
336 {
337     int i;
338 
339     if (sigbus_memset_context) {
340         for (i = 0; i < sigbus_memset_context->num_threads; i++) {
341             MemsetThread *thread = &sigbus_memset_context->threads[i];
342 
343             if (qemu_thread_is_self(&thread->pgthread)) {
344                 siglongjmp(thread->env, 1);
345             }
346         }
347     }
348 
349 #ifdef CONFIG_LINUX
350     /*
351      * We assume that the MCE SIGBUS handler could have been registered. We
352      * should never receive BUS_MCEERR_AO on any of our threads, but only on
353      * the main thread registered for PR_MCE_KILL_EARLY. Further, we should not
354      * receive BUS_MCEERR_AR triggered by action of other threads on one of
355      * our threads. So, no need to check for unrelated SIGBUS when seeing one
356      * for our threads.
357      *
358      * We will forward to the MCE handler, which will either handle the SIGBUS
359      * or reinstall the default SIGBUS handler and reraise the SIGBUS. The
360      * default SIGBUS handler will crash the process, so we don't care.
361      */
362     if (sigbus_oldact.sa_flags & SA_SIGINFO) {
363         sigbus_oldact.sa_sigaction(signal, siginfo, ctx);
364         return;
365     }
366 #endif /* CONFIG_LINUX */
367     warn_report("qemu_prealloc_mem: unrelated SIGBUS detected and ignored");
368 }
369 
/*
 * Thread function: populate one slice of memory by touching every page.
 *
 * @arg is the MemsetThread describing the slice. The thread first waits
 * until its context has all_threads_created set, then unblocks SIGBUS and
 * reads and rewrites one byte per page.
 *
 * Returns 0 on success or -EFAULT (cast to a pointer) when sigbus_handler()
 * jumped back here because touching a page raised SIGBUS.
 */
static void *do_touch_pages(void *arg)
{
    MemsetThread *memset_args = (MemsetThread *)arg;
    sigset_t set, oldset;
    int ret = 0;

    /*
     * On Linux, the page faults from the loop below can cause mmap_sem
     * contention with allocation of the thread stacks.  Do not start
     * clearing until all threads have been created.
     */
    qemu_mutex_lock(&page_mutex);
    while (!memset_args->context->all_threads_created) {
        qemu_cond_wait(&page_cond, &page_mutex);
    }
    qemu_mutex_unlock(&page_mutex);

    /* unblock SIGBUS */
    sigemptyset(&set);
    sigaddset(&set, SIGBUS);
    pthread_sigmask(SIG_UNBLOCK, &set, &oldset);

    /*
     * A non-zero return means we arrived here via siglongjmp() from
     * sigbus_handler(); the second argument (1) makes sigsetjmp() save the
     * signal mask so that it is restored on that jump.
     */
    if (sigsetjmp(memset_args->env, 1)) {
        ret = -EFAULT;
    } else {
        char *addr = memset_args->addr;
        size_t numpages = memset_args->numpages;
        size_t hpagesize = memset_args->hpagesize;
        size_t i;
        for (i = 0; i < numpages; i++) {
            /*
             * Read & write back the same value, so we don't
             * corrupt existing user/app data that might be
             * stored.
             *
             * 'volatile' to stop compiler optimizing this away
             * to a no-op
             */
            *(volatile char *)addr = *addr;
            addr += hpagesize;
        }
    }
    /* Put back the signal mask that was in effect before the unblock. */
    pthread_sigmask(SIG_SETMASK, &oldset, NULL);
    return (void *)(uintptr_t)ret;
}
415 
416 static void *do_madv_populate_write_pages(void *arg)
417 {
418     MemsetThread *memset_args = (MemsetThread *)arg;
419     const size_t size = memset_args->numpages * memset_args->hpagesize;
420     char * const addr = memset_args->addr;
421     int ret = 0;
422 
423     /* See do_touch_pages(). */
424     qemu_mutex_lock(&page_mutex);
425     while (!memset_args->context->all_threads_created) {
426         qemu_cond_wait(&page_cond, &page_mutex);
427     }
428     qemu_mutex_unlock(&page_mutex);
429 
430     if (size && qemu_madvise(addr, size, QEMU_MADV_POPULATE_WRITE)) {
431         ret = -errno;
432     }
433     return (void *)(uintptr_t)ret;
434 }
435 
436 static inline int get_memset_num_threads(size_t hpagesize, size_t numpages,
437                                          int max_threads)
438 {
439     long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
440     int ret = 1;
441 
442     if (host_procs > 0) {
443         ret = MIN(MIN(host_procs, MAX_MEM_PREALLOC_THREAD_COUNT), max_threads);
444     }
445 
446     /* Especially with gigantic pages, don't create more threads than pages. */
447     ret = MIN(ret, numpages);
448     /* Don't start threads to prealloc comparatively little memory. */
449     ret = MIN(ret, MAX(1, hpagesize * numpages / (64 * MiB)));
450 
451     /* In case sysconf() fails, we fall back to single threaded */
452     return ret;
453 }
454 
455 static int wait_and_free_mem_prealloc_context(MemsetContext *context)
456 {
457     int i, ret = 0, tmp;
458 
459     for (i = 0; i < context->num_threads; i++) {
460         tmp = (uintptr_t)qemu_thread_join(&context->threads[i].pgthread);
461 
462         if (tmp) {
463             ret = tmp;
464         }
465     }
466     g_free(context->threads);
467     g_free(context);
468     return ret;
469 }
470 
/*
 * Populate @numpages pages of @hpagesize bytes starting at @area, splitting
 * the work across worker threads.
 *
 * @max_threads: upper bound passed to get_memset_num_threads().
 * @tc: if non-NULL, threads are created through this thread context.
 * @async: queue the request instead of waiting for it; silently downgraded
 *         to synchronous unless both @use_madv_populate_write and @tc are set.
 * @use_madv_populate_write: use MADV_POPULATE_WRITE rather than touching
 *                           pages by hand.
 *
 * Returns 0 on success (or when the request was queued asynchronously) and
 * a negative errno value on failure.
 */
static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
                           int max_threads, ThreadContext *tc, bool async,
                           bool use_madv_populate_write)
{
    static gsize initialized = 0;
    MemsetContext *context = g_malloc0(sizeof(MemsetContext));
    size_t numpages_per_thread, leftover;
    void *(*touch_fn)(void *);
    int ret, i = 0;
    char *addr = area;

    /*
     * Asynchronous preallocation is only allowed when using MADV_POPULATE_WRITE
     * and prealloc context for thread placement.
     */
    if (!use_madv_populate_write || !tc) {
        async = false;
    }

    context->num_threads =
        get_memset_num_threads(hpagesize, numpages, max_threads);

    /* One-time initialisation of the start barrier shared by all workers. */
    if (g_once_init_enter(&initialized)) {
        qemu_mutex_init(&page_mutex);
        qemu_cond_init(&page_cond);
        g_once_init_leave(&initialized, 1);
    }

    if (use_madv_populate_write) {
        /*
         * Avoid creating a single thread for MADV_POPULATE_WRITE when
         * preallocating synchronously.
         */
        if (context->num_threads == 1 && !async) {
            ret = 0;
            if (qemu_madvise(area, hpagesize * numpages,
                             QEMU_MADV_POPULATE_WRITE)) {
                ret = -errno;
            }
            g_free(context);
            return ret;
        }
        touch_fn = do_madv_populate_write_pages;
    } else {
        touch_fn = do_touch_pages;
    }

    context->threads = g_new0(MemsetThread, context->num_threads);
    /* Split evenly; the first 'leftover' threads take one extra page each. */
    numpages_per_thread = numpages / context->num_threads;
    leftover = numpages % context->num_threads;
    for (i = 0; i < context->num_threads; i++) {
        context->threads[i].addr = addr;
        context->threads[i].numpages = numpages_per_thread + (i < leftover);
        context->threads[i].hpagesize = hpagesize;
        context->threads[i].context = context;
        if (tc) {
            thread_context_create_thread(tc, &context->threads[i].pgthread,
                                         "touch_pages",
                                         touch_fn, &context->threads[i],
                                         QEMU_THREAD_JOINABLE);
        } else {
            qemu_thread_create(&context->threads[i].pgthread, "touch_pages",
                               touch_fn, &context->threads[i],
                               QEMU_THREAD_JOINABLE);
        }
        addr += context->threads[i].numpages * hpagesize;
    }

    if (async) {
        /*
         * async requests currently require the BQL. Add it to the list and kick
         * preallocation off during qemu_finish_async_prealloc_mem().
         */
        assert(bql_locked());
        QLIST_INSERT_HEAD(&memset_contexts, context, next);
        return 0;
    }

    /*
     * Publish the context for sigbus_handler() before the workers are
     * released; only the page-touching workers rely on SIGBUS.
     */
    if (!use_madv_populate_write) {
        sigbus_memset_context = context;
    }

    /* Release the workers waiting on the start barrier. */
    qemu_mutex_lock(&page_mutex);
    context->all_threads_created = true;
    qemu_cond_broadcast(&page_cond);
    qemu_mutex_unlock(&page_mutex);

    /* Joins all workers and frees 'context'; it must not be used afterwards. */
    ret = wait_and_free_mem_prealloc_context(context);

    if (!use_madv_populate_write) {
        sigbus_memset_context = NULL;
    }
    return ret;
}
565 
566 bool qemu_finish_async_prealloc_mem(Error **errp)
567 {
568     int ret = 0, tmp;
569     MemsetContext *context, *next_context;
570 
571     /* Waiting for preallocation requires the BQL. */
572     assert(bql_locked());
573     if (QLIST_EMPTY(&memset_contexts)) {
574         return true;
575     }
576 
577     qemu_mutex_lock(&page_mutex);
578     QLIST_FOREACH(context, &memset_contexts, next) {
579         context->all_threads_created = true;
580     }
581     qemu_cond_broadcast(&page_cond);
582     qemu_mutex_unlock(&page_mutex);
583 
584     QLIST_FOREACH_SAFE(context, &memset_contexts, next, next_context) {
585         QLIST_REMOVE(context, next);
586         tmp = wait_and_free_mem_prealloc_context(context);
587         if (tmp) {
588             ret = tmp;
589         }
590     }
591 
592     if (ret) {
593         error_setg_errno(errp, -ret,
594                          "qemu_prealloc_mem: preallocating memory failed");
595         return false;
596     }
597     return true;
598 }
599 
600 static bool madv_populate_write_possible(char *area, size_t pagesize)
601 {
602     return !qemu_madvise(area, pagesize, QEMU_MADV_POPULATE_WRITE) ||
603            errno != EINVAL;
604 }
605 
606 bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
607                        ThreadContext *tc, bool async, Error **errp)
608 {
609     static gsize initialized;
610     int ret;
611 #ifndef EMSCRIPTEN
612     size_t hpagesize = qemu_fd_getpagesize(fd);
613 #else
614     /*
615      * mmap-alloc.c is excluded from Emscripten build, so qemu_fd_getpagesize
616      * is unavailable. Fallback to the lower level implementation.
617      */
618     size_t hpagesize = qemu_real_host_page_size();
619 #endif
620     size_t numpages = DIV_ROUND_UP(sz, hpagesize);
621     bool use_madv_populate_write;
622     struct sigaction act;
623     bool rv = true;
624 
625     /*
626      * Sense on every invocation, as MADV_POPULATE_WRITE cannot be used for
627      * some special mappings, such as mapping /dev/mem.
628      */
629     use_madv_populate_write = madv_populate_write_possible(area, hpagesize);
630 
631     if (!use_madv_populate_write) {
632         if (g_once_init_enter(&initialized)) {
633             qemu_mutex_init(&sigbus_mutex);
634             g_once_init_leave(&initialized, 1);
635         }
636 
637         qemu_mutex_lock(&sigbus_mutex);
638         memset(&act, 0, sizeof(act));
639 #ifdef CONFIG_LINUX
640         act.sa_sigaction = &sigbus_handler;
641         act.sa_flags = SA_SIGINFO;
642 #else /* CONFIG_LINUX */
643         act.sa_handler = &sigbus_handler;
644         act.sa_flags = 0;
645 #endif /* CONFIG_LINUX */
646 
647         ret = sigaction(SIGBUS, &act, &sigbus_oldact);
648         if (ret) {
649             qemu_mutex_unlock(&sigbus_mutex);
650             error_setg_errno(errp, errno,
651                 "qemu_prealloc_mem: failed to install signal handler");
652             return false;
653         }
654     }
655 
656     /* touch pages simultaneously */
657     ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc, async,
658                           use_madv_populate_write);
659     if (ret) {
660         error_setg_errno(errp, -ret,
661                          "qemu_prealloc_mem: preallocating memory failed");
662         rv = false;
663     }
664 
665     if (!use_madv_populate_write) {
666         ret = sigaction(SIGBUS, &sigbus_oldact, NULL);
667         if (ret) {
668             /* Terminate QEMU since it can't recover from error */
669             perror("qemu_prealloc_mem: failed to reinstall signal handler");
670             exit(1);
671         }
672         qemu_mutex_unlock(&sigbus_mutex);
673     }
674     return rv;
675 }
676 
/*
 * Look up a name for process @pid.
 *
 * On FreeBSD this is the ki_comm field reported by kinfo_getproc();
 * elsewhere it is the content of /proc/<pid>/cmdline. The caller owns the
 * returned string. NULL is returned when the lookup fails.
 */
char *qemu_get_pid_name(pid_t pid)
{
    char *result = NULL;

#if defined(__FreeBSD__)
    /* No /proc on the BSDs, but kinfo_getproc() gives us what we need. */
    struct kinfo_proc *kp = kinfo_getproc(pid);

    if (kp != NULL) {
        result = g_strdup(kp->ki_comm);
        free(kp);
    }
#else
    /* Everything else is assumed to offer a usable procfs. */
    size_t length;
    char *cmdline_path = g_strdup_printf("/proc/%d/cmdline", pid);

    g_file_get_contents(cmdline_path, &result, &length, NULL);
    g_free(cmdline_path);
#endif

    return result;
}
701 
702 
703 void *qemu_alloc_stack(size_t *sz)
704 {
705     void *ptr;
706     int flags;
707 #ifdef CONFIG_DEBUG_STACK_USAGE
708     void *ptr2;
709 #endif
710     size_t pagesz = qemu_real_host_page_size();
711 #ifdef _SC_THREAD_STACK_MIN
712     /* avoid stacks smaller than _SC_THREAD_STACK_MIN */
713     long min_stack_sz = sysconf(_SC_THREAD_STACK_MIN);
714     *sz = MAX(MAX(min_stack_sz, 0), *sz);
715 #endif
716     /* adjust stack size to a multiple of the page size */
717     *sz = ROUND_UP(*sz, pagesz);
718     /* allocate one extra page for the guard page */
719     *sz += pagesz;
720 
721     flags = MAP_PRIVATE | MAP_ANONYMOUS;
722 #if defined(MAP_STACK) && defined(__OpenBSD__)
723     /* Only enable MAP_STACK on OpenBSD. Other OS's such as
724      * Linux/FreeBSD/NetBSD have a flag with the same name
725      * but have differing functionality. OpenBSD will SEGV
726      * if it spots execution with a stack pointer pointing
727      * at memory that was not allocated with MAP_STACK.
728      */
729     flags |= MAP_STACK;
730 #endif
731 
732     ptr = mmap(NULL, *sz, PROT_READ | PROT_WRITE, flags, -1, 0);
733     if (ptr == MAP_FAILED) {
734         perror("failed to allocate memory for stack");
735         abort();
736     }
737 
738     /* Stack grows down -- guard page at the bottom. */
739     if (mprotect(ptr, pagesz, PROT_NONE) != 0) {
740         perror("failed to set up stack guard page");
741         abort();
742     }
743 
744 #ifdef CONFIG_DEBUG_STACK_USAGE
745     for (ptr2 = ptr + pagesz; ptr2 < ptr + *sz; ptr2 += sizeof(uint32_t)) {
746         *(uint32_t *)ptr2 = 0xdeadbeaf;
747     }
748 #endif
749 
750     return ptr;
751 }
752 
#ifdef CONFIG_DEBUG_STACK_USAGE
/* Largest stack usage reported so far by qemu_free_stack() on this thread. */
static __thread unsigned int max_stack_usage;
#endif

/*
 * Unmap a stack of @sz bytes returned by qemu_alloc_stack().
 *
 * With CONFIG_DEBUG_STACK_USAGE the marker pattern written at allocation
 * time is scanned from the lowest usable address upwards; the first
 * overwritten word marks how deep the stack was used, and a new per-thread
 * maximum is reported.
 */
void qemu_free_stack(void *stack, size_t sz)
{
#ifdef CONFIG_DEBUG_STACK_USAGE
    void *scan = stack + qemu_real_host_page_size();
    unsigned int usage;

    while (scan < stack + sz && *(uint32_t *)scan == 0xdeadbeaf) {
        scan += sizeof(uint32_t);
    }

    usage = sz - (uintptr_t) (scan - stack);
    if (usage > max_stack_usage) {
        error_report("thread %d max stack usage increased from %u to %u",
                     qemu_get_thread_id(), max_stack_usage, usage);
        max_stack_usage = usage;
    }
#endif

    munmap(stack, sz);
}
779 
780 /*
781  * Disable CFI checks.
782  * We are going to call a signal handler directly. Such handler may or may not
783  * have been defined in our binary, so there's no guarantee that the pointer
784  * used to set the handler is a cfi-valid pointer. Since the handlers are
785  * stored in kernel memory, changing the handler to an attacker-defined
786  * function requires being able to call a sigaction() syscall,
787  * which is not as easy as overwriting a pointer in memory.
788  */
789 QEMU_DISABLE_CFI
790 void sigaction_invoke(struct sigaction *action,
791                       struct qemu_signalfd_siginfo *info)
792 {
793     siginfo_t si = {};
794     si.si_signo = info->ssi_signo;
795     si.si_errno = info->ssi_errno;
796     si.si_code = info->ssi_code;
797 
798     /* Convert the minimal set of fields defined by POSIX.
799      * Positive si_code values are reserved for kernel-generated
800      * signals, where the valid siginfo fields are determined by
801      * the signal number.  But according to POSIX, it is unspecified
802      * whether SI_USER and SI_QUEUE have values less than or equal to
803      * zero.
804      */
805     if (info->ssi_code == SI_USER || info->ssi_code == SI_QUEUE ||
806         info->ssi_code <= 0) {
807         /* SIGTERM, etc.  */
808         si.si_pid = info->ssi_pid;
809         si.si_uid = info->ssi_uid;
810     } else if (info->ssi_signo == SIGILL || info->ssi_signo == SIGFPE ||
811                info->ssi_signo == SIGSEGV || info->ssi_signo == SIGBUS) {
812         si.si_addr = (void *)(uintptr_t)info->ssi_addr;
813     } else if (info->ssi_signo == SIGCHLD) {
814         si.si_pid = info->ssi_pid;
815         si.si_status = info->ssi_status;
816         si.si_uid = info->ssi_uid;
817     }
818     action->sa_sigaction(info->ssi_signo, &si, NULL);
819 }
820 
/*
 * Return the amount of physical memory on the host in bytes, saturated at
 * SIZE_MAX. Returns 0 when the page count is unavailable.
 */
size_t qemu_get_host_physmem(void)
{
#ifdef _SC_PHYS_PAGES
    long pages = sysconf(_SC_PHYS_PAGES);

    if (pages > 0) {
        const size_t pagesz = qemu_real_host_page_size();

        /* Saturate rather than overflow the multiplication. */
        return pages > SIZE_MAX / pagesz ? SIZE_MAX : pages * pagesz;
    }
#endif
    return 0;
}
835 
/*
 * Synchronously flush the mapping range [@addr, @addr + @length) with
 * msync(MS_SYNC). @fd is not used by this implementation.
 *
 * The start address is rounded down to a page boundary and the length is
 * extended by the same amount, then rounded up to whole pages.
 * Returns the msync() result.
 */
int qemu_msync(void *addr, size_t length, int fd)
{
    const size_t offset_mask = qemu_real_host_page_size() - 1;
    const size_t align_mask = ~offset_mask;
    const uintptr_t start = (uintptr_t)addr;

    /* Account for the bytes gained by moving the start down ... */
    length += start & offset_mask;
    /* ... and cover complete pages. */
    length = (length + offset_mask) & align_mask;

    return msync((void *)(start & align_mask), length, MS_SYNC);
}
853 
/*
 * Close every open file descriptor listed in /proc/self/fd, except those in
 * @skip and the descriptor used to read the directory itself.
 *
 * @skip:  array of @nskip descriptors to keep open. The early exit on
 *         "fd < skip[i]" relies on it being sorted in ascending order.
 * @nskip: number of entries in @skip.
 *
 * Returns false if /proc/self/fd cannot be opened (nothing was closed),
 * true otherwise.
 */
static bool qemu_close_all_open_fd_proc(const int *skip, unsigned int nskip)
{
    struct dirent *de;
    int fd, dfd;
    DIR *dir;
    /* Window [skip_start, skip_end) of skip[] entries not yet matched. */
    unsigned int skip_start = 0, skip_end = nskip;

    dir = opendir("/proc/self/fd");
    if (!dir) {
        /* If /proc is not mounted, there is nothing that can be done. */
        return false;
    }
    /* Avoid closing the directory. */
    dfd = dirfd(dir);

    for (de = readdir(dir); de; de = readdir(dir)) {
        bool close_fd = true;

        /* Skip "." and "..". */
        if (de->d_name[0] == '.') {
            continue;
        }
        fd = atoi(de->d_name);
        if (fd == dfd) {
            continue;
        }

        for (unsigned int i = skip_start; i < skip_end; i++) {
            if (fd < skip[i]) {
                /* We are below the next skipped fd, break */
                break;
            } else if (fd == skip[i]) {
                close_fd = false;
                /*
                 * Each fd is listed once, so a match at either edge of the
                 * window lets us shrink it. The last valid index is
                 * skip_end - 1, since the loop never reaches skip_end.
                 */
                if (i == skip_start) {
                    skip_start++;
                } else if (i == skip_end - 1) {
                    skip_end--;
                }
                break;
            }
        }

        if (close_fd) {
            close(fd);
        }
    }
    closedir(dir);

    return true;
}
904 
/*
 * Close all descriptors in [0, @open_max) except those listed in @skip,
 * using close_range() on the gaps between skipped descriptors.
 *
 * @skip / @nskip: descriptors to keep; the walk below consumes them in
 *                 array order and expects them to be ascending.
 *
 * Returns true on success. Returns false if close_range() fails, or if the
 * build has no close_range() support (CONFIG_CLOSE_RANGE undefined).
 */
static bool qemu_close_all_open_fd_close_range(const int *skip,
                                               unsigned int nskip,
                                               int open_max)
{
#ifdef CONFIG_CLOSE_RANGE
    const int max_fd = open_max - 1;
    unsigned int cur_skip = 0;
    int first = 0;

    for (;;) {
        int last = max_fd;

        /* Move the lower bound past any run of consecutive skipped fds. */
        while (cur_skip < nskip && first == skip[cur_skip]) {
            cur_skip++;
            first++;
        }

        /* The upper bound stops right before the next skipped fd, if any. */
        if (cur_skip < nskip) {
            last = skip[cur_skip] - 1;
            last = MIN(last, max_fd);
        }

        /* The adjustments may have left nothing to close. */
        if (first > last) {
            break;
        }

        if (close_range(first, last, 0) < 0) {
            return false;
        }

        first = last + 1;
        if (last >= max_fd) {
            break;
        }
    }

    return true;
#else
    return false;
#endif
}
947 
/*
 * Last-resort implementation: call close() on every descriptor number in
 * [0, @open_max) that is not the next pending entry of @skip. Entries of
 * @skip are consumed in array order, so they are expected to be ascending.
 */
static void qemu_close_all_open_fd_fallback(const int *skip, unsigned int nskip,
                                            int open_max)
{
    unsigned int next_skip = 0;
    int fd = 0;

    while (fd < open_max) {
        if (next_skip < nskip && fd == skip[next_skip]) {
            next_skip++;
        } else {
            close(fd);
        }
        fd++;
    }
}
962 
/*
 * Close all open file descriptors except the @nskip descriptors in @skip
 * (@skip may be NULL only when @nskip is 0).
 *
 * Strategies are tried in order until one succeeds: close_range(),
 * enumeration of /proc/self/fd, and finally a plain loop up to the
 * _SC_OPEN_MAX limit.
 */
void qemu_close_all_open_fd(const int *skip, unsigned int nskip)
{
    int open_max = sysconf(_SC_OPEN_MAX);

    assert(skip != NULL || nskip == 0);

    if (qemu_close_all_open_fd_close_range(skip, nskip, open_max)) {
        return;
    }
    if (qemu_close_all_open_fd_proc(skip, nskip)) {
        return;
    }
    qemu_close_all_open_fd_fallback(skip, nskip, open_max);
}
977 
978 int qemu_shm_alloc(size_t size, Error **errp)
979 {
980     g_autoptr(GString) shm_name = g_string_new(NULL);
981     int fd, oflag, cur_sequence;
982     static int sequence;
983     mode_t mode;
984 
985     cur_sequence = qatomic_fetch_inc(&sequence);
986 
987     /*
988      * Let's use `mode = 0` because we don't want other processes to open our
989      * memory unless we share the file descriptor with them.
990      */
991     mode = 0;
992     oflag = O_RDWR | O_CREAT | O_EXCL;
993 
994     /*
995      * Some operating systems allow creating anonymous POSIX shared memory
996      * objects (e.g. FreeBSD provides the SHM_ANON constant), but this is not
997      * defined by POSIX, so let's create a unique name.
998      *
999      * From Linux's shm_open(3) man-page:
1000      *   For  portable  use,  a shared  memory  object should be identified
1001      *   by a name of the form /somename;"
1002      */
1003     g_string_printf(shm_name, "/qemu-" FMT_pid "-shm-%d", getpid(),
1004                     cur_sequence);
1005 
1006     fd = shm_open(shm_name->str, oflag, mode);
1007     if (fd < 0) {
1008         error_setg_errno(errp, errno,
1009                          "failed to create POSIX shared memory");
1010         return -1;
1011     }
1012 
1013     /*
1014      * We have the file descriptor, so we no longer need to expose the
1015      * POSIX shared memory object. However it will remain allocated as long as
1016      * there are file descriptors pointing to it.
1017      */
1018     shm_unlink(shm_name->str);
1019 
1020     if (ftruncate(fd, size) == -1) {
1021         error_setg_errno(errp, errno,
1022                          "failed to resize POSIX shared memory to %zu", size);
1023         close(fd);
1024         return -1;
1025     }
1026 
1027     return fd;
1028 }
1029