xref: /openbmc/qemu/util/oslib-posix.c (revision 1b34d08f)
1baacf047SPaolo Bonzini /*
2baacf047SPaolo Bonzini  * os-posix-lib.c
3baacf047SPaolo Bonzini  *
4baacf047SPaolo Bonzini  * Copyright (c) 2003-2008 Fabrice Bellard
5baacf047SPaolo Bonzini  * Copyright (c) 2010 Red Hat, Inc.
6baacf047SPaolo Bonzini  *
7baacf047SPaolo Bonzini  * QEMU library functions on POSIX which are shared between QEMU and
8baacf047SPaolo Bonzini  * the QEMU tools.
9baacf047SPaolo Bonzini  *
10baacf047SPaolo Bonzini  * Permission is hereby granted, free of charge, to any person obtaining a copy
11baacf047SPaolo Bonzini  * of this software and associated documentation files (the "Software"), to deal
12baacf047SPaolo Bonzini  * in the Software without restriction, including without limitation the rights
13baacf047SPaolo Bonzini  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14baacf047SPaolo Bonzini  * copies of the Software, and to permit persons to whom the Software is
15baacf047SPaolo Bonzini  * furnished to do so, subject to the following conditions:
16baacf047SPaolo Bonzini  *
17baacf047SPaolo Bonzini  * The above copyright notice and this permission notice shall be included in
18baacf047SPaolo Bonzini  * all copies or substantial portions of the Software.
19baacf047SPaolo Bonzini  *
20baacf047SPaolo Bonzini  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21baacf047SPaolo Bonzini  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22baacf047SPaolo Bonzini  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23baacf047SPaolo Bonzini  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24baacf047SPaolo Bonzini  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25baacf047SPaolo Bonzini  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26baacf047SPaolo Bonzini  * THE SOFTWARE.
27baacf047SPaolo Bonzini  */
28baacf047SPaolo Bonzini 
29aafd7584SPeter Maydell #include "qemu/osdep.h"
3013401ba0SStefan Hajnoczi #include <termios.h>
3113401ba0SStefan Hajnoczi 
32e2ea3515SLaszlo Ersek #include <glib/gprintf.h>
33e2ea3515SLaszlo Ersek 
34baacf047SPaolo Bonzini #include "sysemu/sysemu.h"
35baacf047SPaolo Bonzini #include "trace.h"
36da34e65cSMarkus Armbruster #include "qapi/error.h"
3729b838c0SDavid Hildenbrand #include "qemu/error-report.h"
38b85ea5faSPeter Maydell #include "qemu/madvise.h"
39baacf047SPaolo Bonzini #include "qemu/sockets.h"
40db725815SMarkus Armbruster #include "qemu/thread.h"
4110f5bff6SFam Zheng #include <libgen.h>
42f348b6d1SVeronia Bahaa #include "qemu/cutils.h"
43c905a368SDaniele Buono #include "qemu/compiler.h"
4489aec641SDavid Hildenbrand #include "qemu/units.h"
45baacf047SPaolo Bonzini 
46baacf047SPaolo Bonzini #ifdef CONFIG_LINUX
47baacf047SPaolo Bonzini #include <sys/syscall.h>
48baacf047SPaolo Bonzini #endif
49baacf047SPaolo Bonzini 
5041975b26SAndreas Färber #ifdef __FreeBSD__
5141975b26SAndreas Färber #include <sys/sysctl.h>
52a7764f15SEd Maste #include <sys/user.h>
539548a891SDavid Carlier #include <sys/thr.h>
547dc9ae43SMichal Privoznik #include <libutil.h>
5541975b26SAndreas Färber #endif
5641975b26SAndreas Färber 
57094611b4SKamil Rytarowski #ifdef __NetBSD__
58094611b4SKamil Rytarowski #include <sys/sysctl.h>
599548a891SDavid Carlier #include <lwp.h>
60094611b4SKamil Rytarowski #endif
61094611b4SKamil Rytarowski 
622032e243SDavid CARLIER #ifdef __APPLE__
632032e243SDavid CARLIER #include <mach-o/dyld.h>
642032e243SDavid CARLIER #endif
652032e243SDavid CARLIER 
662b9b9e70SDavid CARLIER #ifdef __HAIKU__
672b9b9e70SDavid CARLIER #include <kernel/image.h>
682b9b9e70SDavid CARLIER #endif
692b9b9e70SDavid CARLIER 
70a9c94277SMarkus Armbruster #include "qemu/mmap-alloc.h"
71794e8f30SMichael S. Tsirkin 
727d992e4dSPeter Lieven #ifdef CONFIG_DEBUG_STACK_USAGE
737d992e4dSPeter Lieven #include "qemu/error-report.h"
747d992e4dSPeter Lieven #endif
757d992e4dSPeter Lieven 
76dfd0dcc7SJitendra Kolhe #define MAX_MEM_PREALLOC_THREAD_COUNT 16
771e356fc1SJitendra Kolhe 
78dba50678SDavid Hildenbrand struct MemsetThread;
79dba50678SDavid Hildenbrand 
/*
 * State shared by every worker thread of one touch_all_pages() run.
 */
typedef struct MemsetContext {
    /* Set under page_mutex once all workers exist; workers wait for it. */
    bool all_threads_created;
    /* NOTE(review): not referenced in the visible part of this file. */
    bool any_thread_failed;
    /* Array of num_threads worker descriptors. */
    struct MemsetThread *threads;
    int num_threads;
} MemsetContext;
86dba50678SDavid Hildenbrand 
871e356fc1SJitendra Kolhe struct MemsetThread {
881e356fc1SJitendra Kolhe     char *addr;
89e947d47dSStefan Weil     size_t numpages;
90e947d47dSStefan Weil     size_t hpagesize;
911e356fc1SJitendra Kolhe     QemuThread pgthread;
921e356fc1SJitendra Kolhe     sigjmp_buf env;
93dba50678SDavid Hildenbrand     MemsetContext *context;
941e356fc1SJitendra Kolhe };
951e356fc1SJitendra Kolhe typedef struct MemsetThread MemsetThread;
961e356fc1SJitendra Kolhe 
97dba50678SDavid Hildenbrand /* used by sigbus_handler() */
98dba50678SDavid Hildenbrand static MemsetContext *sigbus_memset_context;
9929b838c0SDavid Hildenbrand struct sigaction sigbus_oldact;
100a960d664SDavid Hildenbrand static QemuMutex sigbus_mutex;
1011e356fc1SJitendra Kolhe 
102037fb5ebSbauerchen static QemuMutex page_mutex;
103037fb5ebSbauerchen static QemuCond page_cond;
104037fb5ebSbauerchen 
/*
 * Return an identifier for the calling thread.
 *
 * Platforms with a native thread-id call use it; every other platform
 * gets the process id instead.
 */
int qemu_get_thread_id(void)
{
    int id;

#if defined(__linux__)
    id = syscall(SYS_gettid);
#elif defined(__FreeBSD__)
    long native_tid;

    /* thread id is up to INT_MAX */
    thr_self(&native_tid);
    id = (int)native_tid;
#elif defined(__NetBSD__)
    id = _lwp_self();
#elif defined(__OpenBSD__)
    id = getthrid();
#else
    id = getpid();
#endif

    return id;
}
122baacf047SPaolo Bonzini 
/*
 * Forward both flags unchanged to daemon() and hand back its result.
 */
int qemu_daemon(int nochdir, int noclose)
{
    int rc = daemon(nochdir, noclose);

    return rc;
}
127baacf047SPaolo Bonzini 
1289e6bdef2SMarc-André Lureau bool qemu_write_pidfile(const char *path, Error **errp)
1299e6bdef2SMarc-André Lureau {
1309e6bdef2SMarc-André Lureau     int fd;
1319e6bdef2SMarc-André Lureau     char pidstr[32];
1329e6bdef2SMarc-André Lureau 
1339e6bdef2SMarc-André Lureau     while (1) {
1349e6bdef2SMarc-André Lureau         struct stat a, b;
13535f7f3fbSMarc-André Lureau         struct flock lock = {
13635f7f3fbSMarc-André Lureau             .l_type = F_WRLCK,
13735f7f3fbSMarc-André Lureau             .l_whence = SEEK_SET,
13835f7f3fbSMarc-André Lureau             .l_len = 0,
13935f7f3fbSMarc-André Lureau         };
1409e6bdef2SMarc-André Lureau 
141*1b34d08fSMarc-André Lureau         fd = qemu_create(path, O_WRONLY, S_IRUSR | S_IWUSR, errp);
1429e6bdef2SMarc-André Lureau         if (fd == -1) {
1439e6bdef2SMarc-André Lureau             return false;
1449e6bdef2SMarc-André Lureau         }
1459e6bdef2SMarc-André Lureau 
1469e6bdef2SMarc-André Lureau         if (fstat(fd, &b) < 0) {
1479e6bdef2SMarc-André Lureau             error_setg_errno(errp, errno, "Cannot stat file");
1489e6bdef2SMarc-André Lureau             goto fail_close;
1499e6bdef2SMarc-André Lureau         }
1509e6bdef2SMarc-André Lureau 
15135f7f3fbSMarc-André Lureau         if (fcntl(fd, F_SETLK, &lock)) {
1529e6bdef2SMarc-André Lureau             error_setg_errno(errp, errno, "Cannot lock pid file");
1539e6bdef2SMarc-André Lureau             goto fail_close;
1549e6bdef2SMarc-André Lureau         }
1559e6bdef2SMarc-André Lureau 
1569e6bdef2SMarc-André Lureau         /*
1579e6bdef2SMarc-André Lureau          * Now make sure the path we locked is the same one that now
1589e6bdef2SMarc-André Lureau          * exists on the filesystem.
1599e6bdef2SMarc-André Lureau          */
1609e6bdef2SMarc-André Lureau         if (stat(path, &a) < 0) {
1619e6bdef2SMarc-André Lureau             /*
1629e6bdef2SMarc-André Lureau              * PID file disappeared, someone else must be racing with
1639e6bdef2SMarc-André Lureau              * us, so try again.
1649e6bdef2SMarc-André Lureau              */
1659e6bdef2SMarc-André Lureau             close(fd);
1669e6bdef2SMarc-André Lureau             continue;
1679e6bdef2SMarc-André Lureau         }
1689e6bdef2SMarc-André Lureau 
1699e6bdef2SMarc-André Lureau         if (a.st_ino == b.st_ino) {
1709e6bdef2SMarc-André Lureau             break;
1719e6bdef2SMarc-André Lureau         }
1729e6bdef2SMarc-André Lureau 
1739e6bdef2SMarc-André Lureau         /*
1749e6bdef2SMarc-André Lureau          * PID file was recreated, someone else must be racing with
1759e6bdef2SMarc-André Lureau          * us, so try again.
1769e6bdef2SMarc-André Lureau          */
1779e6bdef2SMarc-André Lureau         close(fd);
1789e6bdef2SMarc-André Lureau     }
1799e6bdef2SMarc-André Lureau 
1809e6bdef2SMarc-André Lureau     if (ftruncate(fd, 0) < 0) {
1819e6bdef2SMarc-André Lureau         error_setg_errno(errp, errno, "Failed to truncate pid file");
1829e6bdef2SMarc-André Lureau         goto fail_unlink;
1839e6bdef2SMarc-André Lureau     }
1849e6bdef2SMarc-André Lureau 
1859e6bdef2SMarc-André Lureau     snprintf(pidstr, sizeof(pidstr), FMT_pid "\n", getpid());
18696eb9b2bSMarc-André Lureau     if (qemu_write_full(fd, pidstr, strlen(pidstr)) != strlen(pidstr)) {
1879e6bdef2SMarc-André Lureau         error_setg(errp, "Failed to write pid file");
1889e6bdef2SMarc-André Lureau         goto fail_unlink;
1899e6bdef2SMarc-André Lureau     }
1909e6bdef2SMarc-André Lureau 
1919e6bdef2SMarc-André Lureau     return true;
1929e6bdef2SMarc-André Lureau 
1939e6bdef2SMarc-André Lureau fail_unlink:
1949e6bdef2SMarc-André Lureau     unlink(path);
1959e6bdef2SMarc-André Lureau fail_close:
1969e6bdef2SMarc-André Lureau     close(fd);
1979e6bdef2SMarc-André Lureau     return false;
1989e6bdef2SMarc-André Lureau }
1999e6bdef2SMarc-André Lureau 
200baacf047SPaolo Bonzini /* alloc shared memory pages */
2018dbe22c6SDavid Hildenbrand void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared,
2028dbe22c6SDavid Hildenbrand                           bool noreserve)
203baacf047SPaolo Bonzini {
2048dbe22c6SDavid Hildenbrand     const uint32_t qemu_map_flags = (shared ? QEMU_MAP_SHARED : 0) |
2058dbe22c6SDavid Hildenbrand                                     (noreserve ? QEMU_MAP_NORESERVE : 0);
206baacf047SPaolo Bonzini     size_t align = QEMU_VMALLOC_ALIGN;
207b444f5c0SDavid Hildenbrand     void *ptr = qemu_ram_mmap(-1, size, align, qemu_map_flags, 0);
208baacf047SPaolo Bonzini 
2097dda5dc8SPaolo Bonzini     if (ptr == MAP_FAILED) {
21039228250SMarkus Armbruster         return NULL;
211baacf047SPaolo Bonzini     }
212baacf047SPaolo Bonzini 
213a2b257d6SIgor Mammedov     if (alignment) {
214a2b257d6SIgor Mammedov         *alignment = align;
215a2b257d6SIgor Mammedov     }
216c2dfc5baSMichael S. Tsirkin 
2176eebf958SPaolo Bonzini     trace_qemu_anon_ram_alloc(size, ptr);
218baacf047SPaolo Bonzini     return ptr;
219baacf047SPaolo Bonzini }
220baacf047SPaolo Bonzini 
/*
 * Trace and then unmap a region of @size bytes starting at @ptr.
 */
void qemu_anon_ram_free(void *ptr, size_t size)
{
    trace_qemu_anon_ram_free(ptr, size);

    /* -1: no file descriptor is associated with the mapping */
    qemu_ram_munmap(-1, ptr, size);
}
226e7a09b92SPaolo Bonzini 
/*
 * Clear O_NONBLOCK on @fd.  Both fcntl() calls are asserted to succeed.
 */
void qemu_set_block(int fd)
{
    int flags = fcntl(fd, F_GETFL);
    int rc;

    assert(flags != -1);
    rc = fcntl(fd, F_SETFL, flags & ~O_NONBLOCK);
    assert(rc != -1);
}
235baacf047SPaolo Bonzini 
/*
 * Set O_NONBLOCK on @fd.
 *
 * Returns 0 on success, or -errno from whichever fcntl() call failed.
 */
int qemu_try_set_nonblock(int fd)
{
    int flags = fcntl(fd, F_GETFL);

    if (flags == -1 || fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) {
        return -errno;
    }
    return 0;
}
248894022e6SLaurent Vivier 
/*
 * Like qemu_try_set_nonblock(), but failure is asserted never to happen.
 */
void qemu_set_nonblock(int fd)
{
    int rc = qemu_try_set_nonblock(fd);

    assert(rc == 0);
}
255baacf047SPaolo Bonzini 
/*
 * Enable SO_REUSEADDR on socket @fd.
 *
 * The setsockopt() call is asserted to succeed, so the value returned
 * is 0.
 */
int socket_set_fast_reuse(int fd)
{
    int enable = 1;
    int rc;

    rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
                    (const char *)&enable, sizeof(enable));
    assert(rc == 0);

    return rc;
}
267606600a1SSebastian Ottlik 
/*
 * Add FD_CLOEXEC to the descriptor flags of @fd.  Both fcntl() calls are
 * asserted to succeed.
 */
void qemu_set_cloexec(int fd)
{
    int fd_flags = fcntl(fd, F_GETFD);
    int rc;

    assert(fd_flags != -1);
    rc = fcntl(fd, F_SETFD, fd_flags | FD_CLOEXEC);
    assert(rc != -1);
}
276baacf047SPaolo Bonzini 
/*
 * Open a pipe whose two descriptors both carry FD_CLOEXEC.
 *
 * Where pipe2() is configured it is tried first; plain pipe() plus
 * qemu_set_cloexec() is used only if pipe2() reports ENOSYS (or is not
 * configured at all).  Returns what the underlying call returned.
 */
int qemu_pipe(int pipefd[2])
{
    int rc;

#ifdef CONFIG_PIPE2
    rc = pipe2(pipefd, O_CLOEXEC);
    if (!(rc == -1 && errno == ENOSYS)) {
        return rc;
    }
#endif
    rc = pipe(pipefd);
    if (rc != 0) {
        return rc;
    }

    qemu_set_cloexec(pipefd[0]);
    qemu_set_cloexec(pipefd[1]);
    return rc;
}
298baacf047SPaolo Bonzini 
299e2ea3515SLaszlo Ersek char *
300e2ea3515SLaszlo Ersek qemu_get_local_state_pathname(const char *relative_pathname)
301e2ea3515SLaszlo Ersek {
302fcb4f59cSPaolo Bonzini     g_autofree char *dir = g_strdup_printf("%s/%s",
303fcb4f59cSPaolo Bonzini                                            CONFIG_QEMU_LOCALSTATEDIR,
304e2ea3515SLaszlo Ersek                                            relative_pathname);
305fcb4f59cSPaolo Bonzini     return get_relocated_path(dir);
306e2ea3515SLaszlo Ersek }
30713401ba0SStefan Hajnoczi 
/*
 * Switch the ECHO, ECHONL, ICANON and IEXTEN local modes of the terminal
 * behind @fd on (@echo true) or off (@echo false).
 *
 * Best effort: if the current attributes cannot be read (for example
 * because @fd is not a terminal) nothing is changed.
 */
void qemu_set_tty_echo(int fd, bool echo)
{
    const tcflag_t mode_bits = ECHO | ECHONL | ICANON | IEXTEN;
    struct termios tty;

    /*
     * Without this check a failed tcgetattr() would leave 'tty'
     * uninitialized and the garbage would be handed to tcsetattr().
     */
    if (tcgetattr(fd, &tty) < 0) {
        return;
    }

    if (echo) {
        tty.c_lflag |= mode_bits;
    } else {
        tty.c_lflag &= ~mode_bits;
    }

    tcsetattr(fd, TCSANOW, &tty);
}
32210f5bff6SFam Zheng 
3239386a4a7SPaolo Bonzini static const char *exec_dir;
32410f5bff6SFam Zheng 
32510f5bff6SFam Zheng void qemu_init_exec_dir(const char *argv0)
32610f5bff6SFam Zheng {
32710f5bff6SFam Zheng     char *p = NULL;
32810f5bff6SFam Zheng     char buf[PATH_MAX];
32910f5bff6SFam Zheng 
3309386a4a7SPaolo Bonzini     if (exec_dir) {
331a4c13869SPaolo Bonzini         return;
332a4c13869SPaolo Bonzini     }
33310f5bff6SFam Zheng 
33410f5bff6SFam Zheng #if defined(__linux__)
33510f5bff6SFam Zheng     {
33610f5bff6SFam Zheng         int len;
33710f5bff6SFam Zheng         len = readlink("/proc/self/exe", buf, sizeof(buf) - 1);
33810f5bff6SFam Zheng         if (len > 0) {
33910f5bff6SFam Zheng             buf[len] = 0;
34010f5bff6SFam Zheng             p = buf;
34110f5bff6SFam Zheng         }
34210f5bff6SFam Zheng     }
343094611b4SKamil Rytarowski #elif defined(__FreeBSD__) \
344094611b4SKamil Rytarowski       || (defined(__NetBSD__) && defined(KERN_PROC_PATHNAME))
34510f5bff6SFam Zheng     {
346094611b4SKamil Rytarowski #if defined(__FreeBSD__)
34710f5bff6SFam Zheng         static int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1};
348094611b4SKamil Rytarowski #else
349094611b4SKamil Rytarowski         static int mib[4] = {CTL_KERN, KERN_PROC_ARGS, -1, KERN_PROC_PATHNAME};
350094611b4SKamil Rytarowski #endif
35110f5bff6SFam Zheng         size_t len = sizeof(buf) - 1;
35210f5bff6SFam Zheng 
35310f5bff6SFam Zheng         *buf = '\0';
35410f5bff6SFam Zheng         if (!sysctl(mib, ARRAY_SIZE(mib), buf, &len, NULL, 0) &&
35510f5bff6SFam Zheng             *buf) {
35610f5bff6SFam Zheng             buf[sizeof(buf) - 1] = '\0';
35710f5bff6SFam Zheng             p = buf;
35810f5bff6SFam Zheng         }
35910f5bff6SFam Zheng     }
3602032e243SDavid CARLIER #elif defined(__APPLE__)
3612032e243SDavid CARLIER     {
3622032e243SDavid CARLIER         char fpath[PATH_MAX];
3632032e243SDavid CARLIER         uint32_t len = sizeof(fpath);
3642032e243SDavid CARLIER         if (_NSGetExecutablePath(fpath, &len) == 0) {
3652032e243SDavid CARLIER             p = realpath(fpath, buf);
3662032e243SDavid CARLIER             if (!p) {
3672032e243SDavid CARLIER                 return;
3682032e243SDavid CARLIER             }
3692032e243SDavid CARLIER         }
3702032e243SDavid CARLIER     }
3712b9b9e70SDavid CARLIER #elif defined(__HAIKU__)
3722b9b9e70SDavid CARLIER     {
3732b9b9e70SDavid CARLIER         image_info ii;
3742b9b9e70SDavid CARLIER         int32_t c = 0;
3752b9b9e70SDavid CARLIER 
3762b9b9e70SDavid CARLIER         *buf = '\0';
3772b9b9e70SDavid CARLIER         while (get_next_image_info(0, &c, &ii) == B_OK) {
3782b9b9e70SDavid CARLIER             if (ii.type == B_APP_IMAGE) {
3792b9b9e70SDavid CARLIER                 strncpy(buf, ii.name, sizeof(buf));
3802b9b9e70SDavid CARLIER                 buf[sizeof(buf) - 1] = 0;
3812b9b9e70SDavid CARLIER                 p = buf;
3822b9b9e70SDavid CARLIER                 break;
3832b9b9e70SDavid CARLIER             }
3842b9b9e70SDavid CARLIER         }
3852b9b9e70SDavid CARLIER     }
38610f5bff6SFam Zheng #endif
38710f5bff6SFam Zheng     /* If we don't have any way of figuring out the actual executable
38810f5bff6SFam Zheng        location then try argv[0].  */
3899386a4a7SPaolo Bonzini     if (!p && argv0) {
39010f5bff6SFam Zheng         p = realpath(argv0, buf);
39110f5bff6SFam Zheng     }
3929386a4a7SPaolo Bonzini     if (p) {
3939386a4a7SPaolo Bonzini         exec_dir = g_path_get_dirname(p);
3949386a4a7SPaolo Bonzini     } else {
3959386a4a7SPaolo Bonzini         exec_dir = CONFIG_BINDIR;
39610f5bff6SFam Zheng     }
39710f5bff6SFam Zheng }
39810f5bff6SFam Zheng 
399a4c13869SPaolo Bonzini const char *qemu_get_exec_dir(void)
40010f5bff6SFam Zheng {
401a4c13869SPaolo Bonzini     return exec_dir;
40210f5bff6SFam Zheng }
40338183310SPaolo Bonzini 
40429b838c0SDavid Hildenbrand #ifdef CONFIG_LINUX
40529b838c0SDavid Hildenbrand static void sigbus_handler(int signal, siginfo_t *siginfo, void *ctx)
40629b838c0SDavid Hildenbrand #else /* CONFIG_LINUX */
40738183310SPaolo Bonzini static void sigbus_handler(int signal)
40829b838c0SDavid Hildenbrand #endif /* CONFIG_LINUX */
40938183310SPaolo Bonzini {
4101e356fc1SJitendra Kolhe     int i;
411dba50678SDavid Hildenbrand 
412dba50678SDavid Hildenbrand     if (sigbus_memset_context) {
413dba50678SDavid Hildenbrand         for (i = 0; i < sigbus_memset_context->num_threads; i++) {
414dba50678SDavid Hildenbrand             MemsetThread *thread = &sigbus_memset_context->threads[i];
415dba50678SDavid Hildenbrand 
416dba50678SDavid Hildenbrand             if (qemu_thread_is_self(&thread->pgthread)) {
417dba50678SDavid Hildenbrand                 siglongjmp(thread->env, 1);
4181e356fc1SJitendra Kolhe             }
4191e356fc1SJitendra Kolhe         }
4201e356fc1SJitendra Kolhe     }
42129b838c0SDavid Hildenbrand 
42229b838c0SDavid Hildenbrand #ifdef CONFIG_LINUX
42329b838c0SDavid Hildenbrand     /*
42429b838c0SDavid Hildenbrand      * We assume that the MCE SIGBUS handler could have been registered. We
42529b838c0SDavid Hildenbrand      * should never receive BUS_MCEERR_AO on any of our threads, but only on
42629b838c0SDavid Hildenbrand      * the main thread registered for PR_MCE_KILL_EARLY. Further, we should not
42729b838c0SDavid Hildenbrand      * receive BUS_MCEERR_AR triggered by action of other threads on one of
42829b838c0SDavid Hildenbrand      * our threads. So, no need to check for unrelated SIGBUS when seeing one
42929b838c0SDavid Hildenbrand      * for our threads.
43029b838c0SDavid Hildenbrand      *
43129b838c0SDavid Hildenbrand      * We will forward to the MCE handler, which will either handle the SIGBUS
43229b838c0SDavid Hildenbrand      * or reinstall the default SIGBUS handler and reraise the SIGBUS. The
43329b838c0SDavid Hildenbrand      * default SIGBUS handler will crash the process, so we don't care.
43429b838c0SDavid Hildenbrand      */
43529b838c0SDavid Hildenbrand     if (sigbus_oldact.sa_flags & SA_SIGINFO) {
43629b838c0SDavid Hildenbrand         sigbus_oldact.sa_sigaction(signal, siginfo, ctx);
43729b838c0SDavid Hildenbrand         return;
43829b838c0SDavid Hildenbrand     }
43929b838c0SDavid Hildenbrand #endif /* CONFIG_LINUX */
44029b838c0SDavid Hildenbrand     warn_report("os_mem_prealloc: unrelated SIGBUS detected and ignored");
44138183310SPaolo Bonzini }
44238183310SPaolo Bonzini 
4431e356fc1SJitendra Kolhe static void *do_touch_pages(void *arg)
4441e356fc1SJitendra Kolhe {
4451e356fc1SJitendra Kolhe     MemsetThread *memset_args = (MemsetThread *)arg;
4461e356fc1SJitendra Kolhe     sigset_t set, oldset;
4476c427ab9SDavid Hildenbrand     int ret = 0;
4481e356fc1SJitendra Kolhe 
449037fb5ebSbauerchen     /*
450037fb5ebSbauerchen      * On Linux, the page faults from the loop below can cause mmap_sem
451037fb5ebSbauerchen      * contention with allocation of the thread stacks.  Do not start
452037fb5ebSbauerchen      * clearing until all threads have been created.
453037fb5ebSbauerchen      */
454037fb5ebSbauerchen     qemu_mutex_lock(&page_mutex);
455dba50678SDavid Hildenbrand     while (!memset_args->context->all_threads_created) {
456037fb5ebSbauerchen         qemu_cond_wait(&page_cond, &page_mutex);
457037fb5ebSbauerchen     }
458037fb5ebSbauerchen     qemu_mutex_unlock(&page_mutex);
459037fb5ebSbauerchen 
4601e356fc1SJitendra Kolhe     /* unblock SIGBUS */
4611e356fc1SJitendra Kolhe     sigemptyset(&set);
4621e356fc1SJitendra Kolhe     sigaddset(&set, SIGBUS);
4631e356fc1SJitendra Kolhe     pthread_sigmask(SIG_UNBLOCK, &set, &oldset);
4641e356fc1SJitendra Kolhe 
4651e356fc1SJitendra Kolhe     if (sigsetjmp(memset_args->env, 1)) {
4666c427ab9SDavid Hildenbrand         ret = -EFAULT;
4671e356fc1SJitendra Kolhe     } else {
468e947d47dSStefan Weil         char *addr = memset_args->addr;
469e947d47dSStefan Weil         size_t numpages = memset_args->numpages;
470e947d47dSStefan Weil         size_t hpagesize = memset_args->hpagesize;
471e947d47dSStefan Weil         size_t i;
4721e356fc1SJitendra Kolhe         for (i = 0; i < numpages; i++) {
4739dc44aa5SDaniel P. Berrange             /*
4749dc44aa5SDaniel P. Berrange              * Read & write back the same value, so we don't
4759dc44aa5SDaniel P. Berrange              * corrupt existing user/app data that might be
4769dc44aa5SDaniel P. Berrange              * stored.
4779dc44aa5SDaniel P. Berrange              *
4789dc44aa5SDaniel P. Berrange              * 'volatile' to stop compiler optimizing this away
4799dc44aa5SDaniel P. Berrange              * to a no-op
4809dc44aa5SDaniel P. Berrange              */
4819dc44aa5SDaniel P. Berrange             *(volatile char *)addr = *addr;
4821e356fc1SJitendra Kolhe             addr += hpagesize;
4831e356fc1SJitendra Kolhe         }
4841e356fc1SJitendra Kolhe     }
4851e356fc1SJitendra Kolhe     pthread_sigmask(SIG_SETMASK, &oldset, NULL);
4866c427ab9SDavid Hildenbrand     return (void *)(uintptr_t)ret;
4871e356fc1SJitendra Kolhe }
4881e356fc1SJitendra Kolhe 
489a384bfa3SDavid Hildenbrand static void *do_madv_populate_write_pages(void *arg)
490a384bfa3SDavid Hildenbrand {
491a384bfa3SDavid Hildenbrand     MemsetThread *memset_args = (MemsetThread *)arg;
492a384bfa3SDavid Hildenbrand     const size_t size = memset_args->numpages * memset_args->hpagesize;
493a384bfa3SDavid Hildenbrand     char * const addr = memset_args->addr;
494a384bfa3SDavid Hildenbrand     int ret = 0;
495a384bfa3SDavid Hildenbrand 
496a384bfa3SDavid Hildenbrand     /* See do_touch_pages(). */
497a384bfa3SDavid Hildenbrand     qemu_mutex_lock(&page_mutex);
498dba50678SDavid Hildenbrand     while (!memset_args->context->all_threads_created) {
499a384bfa3SDavid Hildenbrand         qemu_cond_wait(&page_cond, &page_mutex);
500a384bfa3SDavid Hildenbrand     }
501a384bfa3SDavid Hildenbrand     qemu_mutex_unlock(&page_mutex);
502a384bfa3SDavid Hildenbrand 
503a384bfa3SDavid Hildenbrand     if (size && qemu_madvise(addr, size, QEMU_MADV_POPULATE_WRITE)) {
504a384bfa3SDavid Hildenbrand         ret = -errno;
505a384bfa3SDavid Hildenbrand     }
506a384bfa3SDavid Hildenbrand     return (void *)(uintptr_t)ret;
507a384bfa3SDavid Hildenbrand }
508a384bfa3SDavid Hildenbrand 
50989aec641SDavid Hildenbrand static inline int get_memset_num_threads(size_t hpagesize, size_t numpages,
51089aec641SDavid Hildenbrand                                          int smp_cpus)
511dfd0dcc7SJitendra Kolhe {
512dfd0dcc7SJitendra Kolhe     long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
513dfd0dcc7SJitendra Kolhe     int ret = 1;
514dfd0dcc7SJitendra Kolhe 
515dfd0dcc7SJitendra Kolhe     if (host_procs > 0) {
516dfd0dcc7SJitendra Kolhe         ret = MIN(MIN(host_procs, MAX_MEM_PREALLOC_THREAD_COUNT), smp_cpus);
517dfd0dcc7SJitendra Kolhe     }
51889aec641SDavid Hildenbrand 
51989aec641SDavid Hildenbrand     /* Especially with gigantic pages, don't create more threads than pages. */
52089aec641SDavid Hildenbrand     ret = MIN(ret, numpages);
52189aec641SDavid Hildenbrand     /* Don't start threads to prealloc comparatively little memory. */
52289aec641SDavid Hildenbrand     ret = MIN(ret, MAX(1, hpagesize * numpages / (64 * MiB)));
52389aec641SDavid Hildenbrand 
524dfd0dcc7SJitendra Kolhe     /* In case sysconf() fails, we fall back to single threaded */
525dfd0dcc7SJitendra Kolhe     return ret;
526dfd0dcc7SJitendra Kolhe }
527dfd0dcc7SJitendra Kolhe 
5286c427ab9SDavid Hildenbrand static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
529a384bfa3SDavid Hildenbrand                            int smp_cpus, bool use_madv_populate_write)
5301e356fc1SJitendra Kolhe {
53178b3f67aSPaolo Bonzini     static gsize initialized = 0;
532dba50678SDavid Hildenbrand     MemsetContext context = {
53389aec641SDavid Hildenbrand         .num_threads = get_memset_num_threads(hpagesize, numpages, smp_cpus),
534dba50678SDavid Hildenbrand     };
535037fb5ebSbauerchen     size_t numpages_per_thread, leftover;
536a384bfa3SDavid Hildenbrand     void *(*touch_fn)(void *);
5376c427ab9SDavid Hildenbrand     int ret = 0, i = 0;
5381e356fc1SJitendra Kolhe     char *addr = area;
5391e356fc1SJitendra Kolhe 
54078b3f67aSPaolo Bonzini     if (g_once_init_enter(&initialized)) {
54178b3f67aSPaolo Bonzini         qemu_mutex_init(&page_mutex);
54278b3f67aSPaolo Bonzini         qemu_cond_init(&page_cond);
54378b3f67aSPaolo Bonzini         g_once_init_leave(&initialized, 1);
54478b3f67aSPaolo Bonzini     }
54578b3f67aSPaolo Bonzini 
546a384bfa3SDavid Hildenbrand     if (use_madv_populate_write) {
547ac86e5c3SDavid Hildenbrand         /* Avoid creating a single thread for MADV_POPULATE_WRITE */
548ac86e5c3SDavid Hildenbrand         if (context.num_threads == 1) {
549ac86e5c3SDavid Hildenbrand             if (qemu_madvise(area, hpagesize * numpages,
550ac86e5c3SDavid Hildenbrand                              QEMU_MADV_POPULATE_WRITE)) {
551ac86e5c3SDavid Hildenbrand                 return -errno;
552ac86e5c3SDavid Hildenbrand             }
553ac86e5c3SDavid Hildenbrand             return 0;
554ac86e5c3SDavid Hildenbrand         }
555a384bfa3SDavid Hildenbrand         touch_fn = do_madv_populate_write_pages;
556a384bfa3SDavid Hildenbrand     } else {
557a384bfa3SDavid Hildenbrand         touch_fn = do_touch_pages;
558a384bfa3SDavid Hildenbrand     }
559a384bfa3SDavid Hildenbrand 
560dba50678SDavid Hildenbrand     context.threads = g_new0(MemsetThread, context.num_threads);
561dba50678SDavid Hildenbrand     numpages_per_thread = numpages / context.num_threads;
562dba50678SDavid Hildenbrand     leftover = numpages % context.num_threads;
563dba50678SDavid Hildenbrand     for (i = 0; i < context.num_threads; i++) {
564dba50678SDavid Hildenbrand         context.threads[i].addr = addr;
565dba50678SDavid Hildenbrand         context.threads[i].numpages = numpages_per_thread + (i < leftover);
566dba50678SDavid Hildenbrand         context.threads[i].hpagesize = hpagesize;
567dba50678SDavid Hildenbrand         context.threads[i].context = &context;
568dba50678SDavid Hildenbrand         qemu_thread_create(&context.threads[i].pgthread, "touch_pages",
569dba50678SDavid Hildenbrand                            touch_fn, &context.threads[i],
5701e356fc1SJitendra Kolhe                            QEMU_THREAD_JOINABLE);
571dba50678SDavid Hildenbrand         addr += context.threads[i].numpages * hpagesize;
572dba50678SDavid Hildenbrand     }
573dba50678SDavid Hildenbrand 
574dba50678SDavid Hildenbrand     if (!use_madv_populate_write) {
575dba50678SDavid Hildenbrand         sigbus_memset_context = &context;
5761e356fc1SJitendra Kolhe     }
577278fb162SBauerchen 
578278fb162SBauerchen     qemu_mutex_lock(&page_mutex);
579dba50678SDavid Hildenbrand     context.all_threads_created = true;
580037fb5ebSbauerchen     qemu_cond_broadcast(&page_cond);
581278fb162SBauerchen     qemu_mutex_unlock(&page_mutex);
582037fb5ebSbauerchen 
583dba50678SDavid Hildenbrand     for (i = 0; i < context.num_threads; i++) {
584dba50678SDavid Hildenbrand         int tmp = (uintptr_t)qemu_thread_join(&context.threads[i].pgthread);
5856c427ab9SDavid Hildenbrand 
5866c427ab9SDavid Hildenbrand         if (tmp) {
5876c427ab9SDavid Hildenbrand             ret = tmp;
5886c427ab9SDavid Hildenbrand         }
5891e356fc1SJitendra Kolhe     }
590dba50678SDavid Hildenbrand 
591dba50678SDavid Hildenbrand     if (!use_madv_populate_write) {
592dba50678SDavid Hildenbrand         sigbus_memset_context = NULL;
593dba50678SDavid Hildenbrand     }
594dba50678SDavid Hildenbrand     g_free(context.threads);
5951e356fc1SJitendra Kolhe 
5966c427ab9SDavid Hildenbrand     return ret;
5971e356fc1SJitendra Kolhe }
5981e356fc1SJitendra Kolhe 
599a384bfa3SDavid Hildenbrand static bool madv_populate_write_possible(char *area, size_t pagesize)
600a384bfa3SDavid Hildenbrand {
601a384bfa3SDavid Hildenbrand     return !qemu_madvise(area, pagesize, QEMU_MADV_POPULATE_WRITE) ||
602a384bfa3SDavid Hildenbrand            errno != EINVAL;
603a384bfa3SDavid Hildenbrand }
604a384bfa3SDavid Hildenbrand 
6051e356fc1SJitendra Kolhe void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
6061e356fc1SJitendra Kolhe                      Error **errp)
60738183310SPaolo Bonzini {
608a960d664SDavid Hildenbrand     static gsize initialized;
609b7bf8f56SStefan Weil     int ret;
6101e356fc1SJitendra Kolhe     size_t hpagesize = qemu_fd_getpagesize(fd);
6111e356fc1SJitendra Kolhe     size_t numpages = DIV_ROUND_UP(memory, hpagesize);
612a384bfa3SDavid Hildenbrand     bool use_madv_populate_write;
61329b838c0SDavid Hildenbrand     struct sigaction act;
61438183310SPaolo Bonzini 
615a384bfa3SDavid Hildenbrand     /*
616a384bfa3SDavid Hildenbrand      * Sense on every invocation, as MADV_POPULATE_WRITE cannot be used for
617a384bfa3SDavid Hildenbrand      * some special mappings, such as mapping /dev/mem.
618a384bfa3SDavid Hildenbrand      */
619a384bfa3SDavid Hildenbrand     use_madv_populate_write = madv_populate_write_possible(area, hpagesize);
620a384bfa3SDavid Hildenbrand 
621a384bfa3SDavid Hildenbrand     if (!use_madv_populate_write) {
622a960d664SDavid Hildenbrand         if (g_once_init_enter(&initialized)) {
623a960d664SDavid Hildenbrand             qemu_mutex_init(&sigbus_mutex);
624a960d664SDavid Hildenbrand             g_once_init_leave(&initialized, 1);
625a960d664SDavid Hildenbrand         }
626a960d664SDavid Hildenbrand 
627a960d664SDavid Hildenbrand         qemu_mutex_lock(&sigbus_mutex);
62838183310SPaolo Bonzini         memset(&act, 0, sizeof(act));
62929b838c0SDavid Hildenbrand #ifdef CONFIG_LINUX
63029b838c0SDavid Hildenbrand         act.sa_sigaction = &sigbus_handler;
63129b838c0SDavid Hildenbrand         act.sa_flags = SA_SIGINFO;
63229b838c0SDavid Hildenbrand #else /* CONFIG_LINUX */
63338183310SPaolo Bonzini         act.sa_handler = &sigbus_handler;
63438183310SPaolo Bonzini         act.sa_flags = 0;
63529b838c0SDavid Hildenbrand #endif /* CONFIG_LINUX */
63638183310SPaolo Bonzini 
63729b838c0SDavid Hildenbrand         ret = sigaction(SIGBUS, &act, &sigbus_oldact);
63838183310SPaolo Bonzini         if (ret) {
639dd4fc605SDavid Hildenbrand             qemu_mutex_unlock(&sigbus_mutex);
640056b68afSIgor Mammedov             error_setg_errno(errp, errno,
641056b68afSIgor Mammedov                 "os_mem_prealloc: failed to install signal handler");
642056b68afSIgor Mammedov             return;
64338183310SPaolo Bonzini         }
644a384bfa3SDavid Hildenbrand     }
64538183310SPaolo Bonzini 
6461e356fc1SJitendra Kolhe     /* touch pages simultaneously */
647a384bfa3SDavid Hildenbrand     ret = touch_all_pages(area, hpagesize, numpages, smp_cpus,
648a384bfa3SDavid Hildenbrand                           use_madv_populate_write);
6496c427ab9SDavid Hildenbrand     if (ret) {
6506c427ab9SDavid Hildenbrand         error_setg_errno(errp, -ret,
6516c427ab9SDavid Hildenbrand                          "os_mem_prealloc: preallocating memory failed");
652056b68afSIgor Mammedov     }
65338183310SPaolo Bonzini 
654a384bfa3SDavid Hildenbrand     if (!use_madv_populate_write) {
65529b838c0SDavid Hildenbrand         ret = sigaction(SIGBUS, &sigbus_oldact, NULL);
65638183310SPaolo Bonzini         if (ret) {
657056b68afSIgor Mammedov             /* Terminate QEMU since it can't recover from error */
65838183310SPaolo Bonzini             perror("os_mem_prealloc: failed to reinstall signal handler");
65938183310SPaolo Bonzini             exit(1);
66038183310SPaolo Bonzini         }
661a960d664SDavid Hildenbrand         qemu_mutex_unlock(&sigbus_mutex);
66238183310SPaolo Bonzini     }
663a384bfa3SDavid Hildenbrand }
664d57e4e48SDaniel P. Berrange 
/*
 * Look up a name for the process identified by @pid.
 *
 * On FreeBSD the name is copied from the kinfo_proc record; elsewhere it is
 * the contents of /proc/<pid>/cmdline as read by g_file_get_contents().
 *
 * Returns: a GLib-allocated string owned by the caller, or NULL if the
 * lookup failed.
 */
char *qemu_get_pid_name(pid_t pid)
{
    char *name = NULL;

#if defined(__FreeBSD__)
    /* BSDs don't have /proc, but they provide a nice substitute */
    struct kinfo_proc *proc = kinfo_getproc(pid);

    if (proc) {
        name = g_strdup(proc->ki_comm);
        free(proc);
    }
#else
    /* Assume a system with reasonable procfs */
    char *pid_path;

    pid_path = g_strdup_printf("/proc/%d/cmdline", (int)pid);
    /*
     * The length is not needed, so pass NULL for it.  The return value is
     * deliberately ignored: if the read fails, NULL is returned to the caller.
     */
    g_file_get_contents(pid_path, &name, NULL, NULL);
    g_free(pid_path);
#endif

    return name;
}
6897dc9ae43SMichal Privoznik 
6907dc9ae43SMichal Privoznik 
69157cb38b3SDaniel P. Berrange pid_t qemu_fork(Error **errp)
69257cb38b3SDaniel P. Berrange {
69357cb38b3SDaniel P. Berrange     sigset_t oldmask, newmask;
69457cb38b3SDaniel P. Berrange     struct sigaction sig_action;
69557cb38b3SDaniel P. Berrange     int saved_errno;
69657cb38b3SDaniel P. Berrange     pid_t pid;
69757cb38b3SDaniel P. Berrange 
69857cb38b3SDaniel P. Berrange     /*
69957cb38b3SDaniel P. Berrange      * Need to block signals now, so that child process can safely
70057cb38b3SDaniel P. Berrange      * kill off caller's signal handlers without a race.
70157cb38b3SDaniel P. Berrange      */
70257cb38b3SDaniel P. Berrange     sigfillset(&newmask);
70357cb38b3SDaniel P. Berrange     if (pthread_sigmask(SIG_SETMASK, &newmask, &oldmask) != 0) {
70457cb38b3SDaniel P. Berrange         error_setg_errno(errp, errno,
70557cb38b3SDaniel P. Berrange                          "cannot block signals");
70657cb38b3SDaniel P. Berrange         return -1;
70757cb38b3SDaniel P. Berrange     }
70857cb38b3SDaniel P. Berrange 
70957cb38b3SDaniel P. Berrange     pid = fork();
71057cb38b3SDaniel P. Berrange     saved_errno = errno;
71157cb38b3SDaniel P. Berrange 
71257cb38b3SDaniel P. Berrange     if (pid < 0) {
71357cb38b3SDaniel P. Berrange         /* attempt to restore signal mask, but ignore failure, to
71457cb38b3SDaniel P. Berrange          * avoid obscuring the fork failure */
71557cb38b3SDaniel P. Berrange         (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL);
71657cb38b3SDaniel P. Berrange         error_setg_errno(errp, saved_errno,
71757cb38b3SDaniel P. Berrange                          "cannot fork child process");
71857cb38b3SDaniel P. Berrange         errno = saved_errno;
71957cb38b3SDaniel P. Berrange         return -1;
72057cb38b3SDaniel P. Berrange     } else if (pid) {
72157cb38b3SDaniel P. Berrange         /* parent process */
72257cb38b3SDaniel P. Berrange 
72357cb38b3SDaniel P. Berrange         /* Restore our original signal mask now that the child is
72457cb38b3SDaniel P. Berrange          * safely running. Only documented failures are EFAULT (not
72557cb38b3SDaniel P. Berrange          * possible, since we are using just-grabbed mask) or EINVAL
72657cb38b3SDaniel P. Berrange          * (not possible, since we are using correct arguments).  */
72757cb38b3SDaniel P. Berrange         (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL);
72857cb38b3SDaniel P. Berrange     } else {
72957cb38b3SDaniel P. Berrange         /* child process */
73057cb38b3SDaniel P. Berrange         size_t i;
73157cb38b3SDaniel P. Berrange 
73257cb38b3SDaniel P. Berrange         /* Clear out all signal handlers from parent so nothing
73357cb38b3SDaniel P. Berrange          * unexpected can happen in our child once we unblock
73457cb38b3SDaniel P. Berrange          * signals */
73557cb38b3SDaniel P. Berrange         sig_action.sa_handler = SIG_DFL;
73657cb38b3SDaniel P. Berrange         sig_action.sa_flags = 0;
73757cb38b3SDaniel P. Berrange         sigemptyset(&sig_action.sa_mask);
73857cb38b3SDaniel P. Berrange 
73957cb38b3SDaniel P. Berrange         for (i = 1; i < NSIG; i++) {
74057cb38b3SDaniel P. Berrange             /* Only possible errors are EFAULT or EINVAL The former
74157cb38b3SDaniel P. Berrange              * won't happen, the latter we expect, so no need to check
74257cb38b3SDaniel P. Berrange              * return value */
74357cb38b3SDaniel P. Berrange             (void)sigaction(i, &sig_action, NULL);
74457cb38b3SDaniel P. Berrange         }
74557cb38b3SDaniel P. Berrange 
74657cb38b3SDaniel P. Berrange         /* Unmask all signals in child, since we've no idea what the
74757cb38b3SDaniel P. Berrange          * caller's done with their signal mask and don't want to
74857cb38b3SDaniel P. Berrange          * propagate that to children */
74957cb38b3SDaniel P. Berrange         sigemptyset(&newmask);
75057cb38b3SDaniel P. Berrange         if (pthread_sigmask(SIG_SETMASK, &newmask, NULL) != 0) {
75157cb38b3SDaniel P. Berrange             Error *local_err = NULL;
75257cb38b3SDaniel P. Berrange             error_setg_errno(&local_err, errno,
75357cb38b3SDaniel P. Berrange                              "cannot unblock signals");
75457cb38b3SDaniel P. Berrange             error_report_err(local_err);
75557cb38b3SDaniel P. Berrange             _exit(1);
75657cb38b3SDaniel P. Berrange         }
75757cb38b3SDaniel P. Berrange     }
75857cb38b3SDaniel P. Berrange     return pid;
75957cb38b3SDaniel P. Berrange }
7608737d9e0SPeter Lieven 
7618737d9e0SPeter Lieven void *qemu_alloc_stack(size_t *sz)
7628737d9e0SPeter Lieven {
7638737d9e0SPeter Lieven     void *ptr, *guardpage;
764fc3d1badSBrad Smith     int flags;
7657d992e4dSPeter Lieven #ifdef CONFIG_DEBUG_STACK_USAGE
7667d992e4dSPeter Lieven     void *ptr2;
7677d992e4dSPeter Lieven #endif
7688e3b0cbbSMarc-André Lureau     size_t pagesz = qemu_real_host_page_size();
7698737d9e0SPeter Lieven #ifdef _SC_THREAD_STACK_MIN
7708737d9e0SPeter Lieven     /* avoid stacks smaller than _SC_THREAD_STACK_MIN */
7718737d9e0SPeter Lieven     long min_stack_sz = sysconf(_SC_THREAD_STACK_MIN);
7728737d9e0SPeter Lieven     *sz = MAX(MAX(min_stack_sz, 0), *sz);
7738737d9e0SPeter Lieven #endif
7748737d9e0SPeter Lieven     /* adjust stack size to a multiple of the page size */
7758737d9e0SPeter Lieven     *sz = ROUND_UP(*sz, pagesz);
7768737d9e0SPeter Lieven     /* allocate one extra page for the guard page */
7778737d9e0SPeter Lieven     *sz += pagesz;
7788737d9e0SPeter Lieven 
779fc3d1badSBrad Smith     flags = MAP_PRIVATE | MAP_ANONYMOUS;
780fc3d1badSBrad Smith #if defined(MAP_STACK) && defined(__OpenBSD__)
781fc3d1badSBrad Smith     /* Only enable MAP_STACK on OpenBSD. Other OS's such as
782fc3d1badSBrad Smith      * Linux/FreeBSD/NetBSD have a flag with the same name
783fc3d1badSBrad Smith      * but have differing functionality. OpenBSD will SEGV
784fc3d1badSBrad Smith      * if it spots execution with a stack pointer pointing
785fc3d1badSBrad Smith      * at memory that was not allocated with MAP_STACK.
786fc3d1badSBrad Smith      */
787fc3d1badSBrad Smith     flags |= MAP_STACK;
788fc3d1badSBrad Smith #endif
789fc3d1badSBrad Smith 
790fc3d1badSBrad Smith     ptr = mmap(NULL, *sz, PROT_READ | PROT_WRITE, flags, -1, 0);
7918737d9e0SPeter Lieven     if (ptr == MAP_FAILED) {
792e916a6e8SEduardo Habkost         perror("failed to allocate memory for stack");
7938737d9e0SPeter Lieven         abort();
7948737d9e0SPeter Lieven     }
7958737d9e0SPeter Lieven 
7968737d9e0SPeter Lieven #if defined(HOST_IA64)
7978737d9e0SPeter Lieven     /* separate register stack */
7988737d9e0SPeter Lieven     guardpage = ptr + (((*sz - pagesz) / 2) & ~pagesz);
7998737d9e0SPeter Lieven #elif defined(HOST_HPPA)
8008737d9e0SPeter Lieven     /* stack grows up */
8018737d9e0SPeter Lieven     guardpage = ptr + *sz - pagesz;
8028737d9e0SPeter Lieven #else
8038737d9e0SPeter Lieven     /* stack grows down */
8048737d9e0SPeter Lieven     guardpage = ptr;
8058737d9e0SPeter Lieven #endif
8068737d9e0SPeter Lieven     if (mprotect(guardpage, pagesz, PROT_NONE) != 0) {
807e916a6e8SEduardo Habkost         perror("failed to set up stack guard page");
8088737d9e0SPeter Lieven         abort();
8098737d9e0SPeter Lieven     }
8108737d9e0SPeter Lieven 
8117d992e4dSPeter Lieven #ifdef CONFIG_DEBUG_STACK_USAGE
8127d992e4dSPeter Lieven     for (ptr2 = ptr + pagesz; ptr2 < ptr + *sz; ptr2 += sizeof(uint32_t)) {
8137d992e4dSPeter Lieven         *(uint32_t *)ptr2 = 0xdeadbeaf;
8147d992e4dSPeter Lieven     }
8157d992e4dSPeter Lieven #endif
8167d992e4dSPeter Lieven 
8178737d9e0SPeter Lieven     return ptr;
8188737d9e0SPeter Lieven }
8198737d9e0SPeter Lieven 
#ifdef CONFIG_DEBUG_STACK_USAGE
/* Per-thread high-water mark of the stack usage reported so far. */
static __thread unsigned int max_stack_usage;
#endif

/*
 * Unmap a stack mapping of @sz bytes starting at @stack.
 *
 * With CONFIG_DEBUG_STACK_USAGE, the region above the first page is scanned
 * upwards for the first word that no longer holds the 0xdeadbeaf fill
 * pattern.  The distance from that word to the end of the mapping is taken
 * as the stack usage, and is reported whenever it exceeds the thread's
 * previous maximum.
 */
void qemu_free_stack(void *stack, size_t sz)
{
#ifdef CONFIG_DEBUG_STACK_USAGE
    void *cursor = stack + qemu_real_host_page_size();
    void *const limit = stack + sz;
    unsigned int usage;

    /* Skip over words that still hold the fill pattern. */
    while (cursor < limit && *(uint32_t *)cursor == 0xdeadbeaf) {
        cursor += sizeof(uint32_t);
    }

    usage = sz - (uintptr_t) (cursor - stack);
    if (usage > max_stack_usage) {
        error_report("thread %d max stack usage increased from %u to %u",
                     qemu_get_thread_id(), max_stack_usage, usage);
        max_stack_usage = usage;
    }
#endif

    munmap(stack, sz);
}
846d98d4072SPaolo Bonzini 
847c905a368SDaniele Buono /*
848c905a368SDaniele Buono  * Disable CFI checks.
849c905a368SDaniele Buono  * We are going to call a signal hander directly. Such handler may or may not
850c905a368SDaniele Buono  * have been defined in our binary, so there's no guarantee that the pointer
851c905a368SDaniele Buono  * used to set the handler is a cfi-valid pointer. Since the handlers are
852c905a368SDaniele Buono  * stored in kernel memory, changing the handler to an attacker-defined
853c905a368SDaniele Buono  * function requires being able to call a sigaction() syscall,
854c905a368SDaniele Buono  * which is not as easy as overwriting a pointer in memory.
855c905a368SDaniele Buono  */
856c905a368SDaniele Buono QEMU_DISABLE_CFI
857d98d4072SPaolo Bonzini void sigaction_invoke(struct sigaction *action,
858d98d4072SPaolo Bonzini                       struct qemu_signalfd_siginfo *info)
859d98d4072SPaolo Bonzini {
86002ffa034SPeter Maydell     siginfo_t si = {};
861d98d4072SPaolo Bonzini     si.si_signo = info->ssi_signo;
862d98d4072SPaolo Bonzini     si.si_errno = info->ssi_errno;
863d98d4072SPaolo Bonzini     si.si_code = info->ssi_code;
864d98d4072SPaolo Bonzini 
865d98d4072SPaolo Bonzini     /* Convert the minimal set of fields defined by POSIX.
866d98d4072SPaolo Bonzini      * Positive si_code values are reserved for kernel-generated
867d98d4072SPaolo Bonzini      * signals, where the valid siginfo fields are determined by
868d98d4072SPaolo Bonzini      * the signal number.  But according to POSIX, it is unspecified
869d98d4072SPaolo Bonzini      * whether SI_USER and SI_QUEUE have values less than or equal to
870d98d4072SPaolo Bonzini      * zero.
871d98d4072SPaolo Bonzini      */
872d98d4072SPaolo Bonzini     if (info->ssi_code == SI_USER || info->ssi_code == SI_QUEUE ||
873d98d4072SPaolo Bonzini         info->ssi_code <= 0) {
874d98d4072SPaolo Bonzini         /* SIGTERM, etc.  */
875d98d4072SPaolo Bonzini         si.si_pid = info->ssi_pid;
876d98d4072SPaolo Bonzini         si.si_uid = info->ssi_uid;
877d98d4072SPaolo Bonzini     } else if (info->ssi_signo == SIGILL || info->ssi_signo == SIGFPE ||
878d98d4072SPaolo Bonzini                info->ssi_signo == SIGSEGV || info->ssi_signo == SIGBUS) {
879d98d4072SPaolo Bonzini         si.si_addr = (void *)(uintptr_t)info->ssi_addr;
880d98d4072SPaolo Bonzini     } else if (info->ssi_signo == SIGCHLD) {
881d98d4072SPaolo Bonzini         si.si_pid = info->ssi_pid;
882d98d4072SPaolo Bonzini         si.si_status = info->ssi_status;
883d98d4072SPaolo Bonzini         si.si_uid = info->ssi_uid;
884d98d4072SPaolo Bonzini     }
885d98d4072SPaolo Bonzini     action->sa_sigaction(info->ssi_signo, &si, NULL);
886d98d4072SPaolo Bonzini }
887e47f4765SMichal Privoznik 
/*
 * Report the amount of physical memory on the host, in bytes.
 *
 * Returns: page count from sysconf(_SC_PHYS_PAGES) multiplied by the host
 * page size, saturated at SIZE_MAX if the product would not fit in a size_t.
 * Returns 0 if _SC_PHYS_PAGES is not available or sysconf() does not report
 * a positive page count.
 */
size_t qemu_get_host_physmem(void)
{
#ifdef _SC_PHYS_PAGES
    const long nr_pages = sysconf(_SC_PHYS_PAGES);

    if (nr_pages > 0) {
        const size_t page_size = qemu_real_host_page_size();

        /* saturate instead of overflowing the multiplication */
        return nr_pages > SIZE_MAX / page_size ? SIZE_MAX
                                               : nr_pages * page_size;
    }
#endif
    return 0;
}
902e9c4e0a8SMarc-André Lureau 
/*
 * Add @flag to the file status flags of @fd, keeping the flags already set.
 *
 * Returns: 0 on success, or -errno if reading or writing the flags fails.
 */
int fcntl_setfl(int fd, int flag)
{
    int current = fcntl(fd, F_GETFL);

    /*
     * The second fcntl() is only attempted when the first one succeeded, so
     * errno always belongs to whichever call failed.
     */
    if (current != -1 && fcntl(fd, F_SETFL, current | flag) != -1) {
        return 0;
    }

    return -errno;
}
91773991a92SMarc-André Lureau 
/*
 * Synchronously flush (MS_SYNC) the mapping range [@addr, @addr + @length).
 *
 * The start address is rounded down to a host page boundary, and the length
 * is grown by the same amount and then rounded up to a whole number of
 * pages, so the synced range always covers the requested one.
 *
 * @fd is not used by this implementation.
 *
 * Returns: the result of msync().
 */
int qemu_msync(void *addr, size_t length, int fd)
{
    const size_t offset_mask = qemu_real_host_page_size() - 1;
    const uintptr_t start = (uintptr_t)addr;

    /* account for the bytes gained by aligning the start address down */
    length += start & offset_mask;
    /* round the size up to a multiple of the page size */
    length = (length + offset_mask) & ~offset_mask;

    return msync((void *)(start & ~offset_mask), length, MS_SYNC);
}
935