xref: /openbmc/qemu/util/oslib-posix.c (revision 8db6e33d9b6b24b39281dc9e5a2a668f09f4ad7d)
1baacf047SPaolo Bonzini /*
2baacf047SPaolo Bonzini  * os-posix-lib.c
3baacf047SPaolo Bonzini  *
4baacf047SPaolo Bonzini  * Copyright (c) 2003-2008 Fabrice Bellard
5baacf047SPaolo Bonzini  * Copyright (c) 2010 Red Hat, Inc.
6baacf047SPaolo Bonzini  *
7baacf047SPaolo Bonzini  * QEMU library functions on POSIX which are shared between QEMU and
8baacf047SPaolo Bonzini  * the QEMU tools.
9baacf047SPaolo Bonzini  *
10baacf047SPaolo Bonzini  * Permission is hereby granted, free of charge, to any person obtaining a copy
11baacf047SPaolo Bonzini  * of this software and associated documentation files (the "Software"), to deal
12baacf047SPaolo Bonzini  * in the Software without restriction, including without limitation the rights
13baacf047SPaolo Bonzini  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14baacf047SPaolo Bonzini  * copies of the Software, and to permit persons to whom the Software is
15baacf047SPaolo Bonzini  * furnished to do so, subject to the following conditions:
16baacf047SPaolo Bonzini  *
17baacf047SPaolo Bonzini  * The above copyright notice and this permission notice shall be included in
18baacf047SPaolo Bonzini  * all copies or substantial portions of the Software.
19baacf047SPaolo Bonzini  *
20baacf047SPaolo Bonzini  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21baacf047SPaolo Bonzini  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22baacf047SPaolo Bonzini  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23baacf047SPaolo Bonzini  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24baacf047SPaolo Bonzini  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25baacf047SPaolo Bonzini  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26baacf047SPaolo Bonzini  * THE SOFTWARE.
27baacf047SPaolo Bonzini  */
28baacf047SPaolo Bonzini 
29aafd7584SPeter Maydell #include "qemu/osdep.h"
3013401ba0SStefan Hajnoczi #include <termios.h>
3113401ba0SStefan Hajnoczi 
32e2ea3515SLaszlo Ersek #include <glib/gprintf.h>
33e2ea3515SLaszlo Ersek 
34baacf047SPaolo Bonzini #include "sysemu/sysemu.h"
35baacf047SPaolo Bonzini #include "trace.h"
36da34e65cSMarkus Armbruster #include "qapi/error.h"
3729b838c0SDavid Hildenbrand #include "qemu/error-report.h"
38b85ea5faSPeter Maydell #include "qemu/madvise.h"
39baacf047SPaolo Bonzini #include "qemu/sockets.h"
40db725815SMarkus Armbruster #include "qemu/thread.h"
4110f5bff6SFam Zheng #include <libgen.h>
42f348b6d1SVeronia Bahaa #include "qemu/cutils.h"
4389aec641SDavid Hildenbrand #include "qemu/units.h"
44e2de2c49SDavid Hildenbrand #include "qemu/thread-context.h"
4504accf43SMark Kanda #include "qemu/main-loop.h"
46baacf047SPaolo Bonzini 
47baacf047SPaolo Bonzini #ifdef CONFIG_LINUX
48baacf047SPaolo Bonzini #include <sys/syscall.h>
49baacf047SPaolo Bonzini #endif
50baacf047SPaolo Bonzini 
5141975b26SAndreas Färber #ifdef __FreeBSD__
529548a891SDavid Carlier #include <sys/thr.h>
5306680b15SMarc-André Lureau #include <sys/user.h>
547dc9ae43SMichal Privoznik #include <libutil.h>
5541975b26SAndreas Färber #endif
5641975b26SAndreas Färber 
57094611b4SKamil Rytarowski #ifdef __NetBSD__
589548a891SDavid Carlier #include <lwp.h>
59094611b4SKamil Rytarowski #endif
60094611b4SKamil Rytarowski 
61a9c94277SMarkus Armbruster #include "qemu/mmap-alloc.h"
62794e8f30SMichael S. Tsirkin 
63dfd0dcc7SJitendra Kolhe #define MAX_MEM_PREALLOC_THREAD_COUNT 16
641e356fc1SJitendra Kolhe 
struct MemsetThread;

/* All async (deferred) preallocation requests waiting to be completed. */
static QLIST_HEAD(, MemsetContext) memset_contexts =
    QLIST_HEAD_INITIALIZER(memset_contexts);

/*
 * Per-preallocation-request state shared by all worker threads that
 * touch/populate one memory range.
 */
typedef struct MemsetContext {
    bool all_threads_created;       /* start gate: workers wait until set */
    bool any_thread_failed;
    struct MemsetThread *threads;   /* array of num_threads worker states */
    int num_threads;
    QLIST_ENTRY(MemsetContext) next;  /* link in memset_contexts (async) */
} MemsetContext;

/* State for one worker thread touching a sub-range of the area. */
struct MemsetThread {
    char *addr;         /* start of this thread's sub-range */
    size_t numpages;    /* number of pages this thread touches */
    size_t hpagesize;   /* page size, used as the touch stride */
    QemuThread pgthread;
    sigjmp_buf env;     /* siglongjmp() target for sigbus_handler() */
    MemsetContext *context;
};
typedef struct MemsetThread MemsetThread;

/* used by sigbus_handler() */
static MemsetContext *sigbus_memset_context;
struct sigaction sigbus_oldact;  /* NOTE(review): looks file-local — consider 'static' */
static QemuMutex sigbus_mutex;

/* Serialize worker start-up until all threads exist (see do_touch_pages()). */
static QemuMutex page_mutex;
static QemuCond page_cond;
95037fb5ebSbauerchen 
/*
 * Return a kernel-visible identifier for the calling thread.
 *
 * Uses the platform-specific thread-id facility where one exists and
 * falls back to getpid() on platforms without one.
 */
int qemu_get_thread_id(void)
{
#if defined(__linux__)
    return syscall(SYS_gettid);
#elif defined(__FreeBSD__)
    /* thread id is up to INT_MAX */
    long tid;
    thr_self(&tid);
    return (int)tid;
#elif defined(__NetBSD__)
    return _lwp_self();
#elif defined(__OpenBSD__)
    return getthrid();
#else
    return getpid();
#endif
}
113baacf047SPaolo Bonzini 
/* Thin wrapper around daemon(3): returns 0 on success, -1 on error. */
int qemu_daemon(int nochdir, int noclose)
{
    return daemon(nochdir, noclose);
}
118baacf047SPaolo Bonzini 
/*
 * Create and lock the pid file at @path, then write our PID into it.
 *
 * An fcntl() write lock is taken on the open file, after which the path
 * is stat()ed again to verify the file we locked is still the one on
 * disk — another process may have unlinked or recreated it between our
 * open and lock.  On such races we retry; the lock itself is held for
 * the lifetime of the process.
 *
 * Returns true on success; sets @errp and returns false on failure.
 */
bool qemu_write_pidfile(const char *path, Error **errp)
{
    int fd;
    char pidstr[32];

    while (1) {
        struct stat a, b;
        struct flock lock = {
            .l_type = F_WRLCK,
            .l_whence = SEEK_SET,
            .l_len = 0,             /* l_len == 0: lock the whole file */
        };

        fd = qemu_create(path, O_WRONLY, S_IRUSR | S_IWUSR, errp);
        if (fd == -1) {
            return false;
        }

        if (fstat(fd, &b) < 0) {
            error_setg_errno(errp, errno, "Cannot stat file");
            goto fail_close;
        }

        if (fcntl(fd, F_SETLK, &lock)) {
            error_setg_errno(errp, errno, "Cannot lock pid file");
            goto fail_close;
        }

        /*
         * Now make sure the path we locked is the same one that now
         * exists on the filesystem.
         */
        if (stat(path, &a) < 0) {
            /*
             * PID file disappeared, someone else must be racing with
             * us, so try again.
             */
            close(fd);
            continue;
        }

        if (a.st_ino == b.st_ino) {
            break;
        }

        /*
         * PID file was recreated, someone else must be racing with
         * us, so try again.
         */
        close(fd);
    }

    if (ftruncate(fd, 0) < 0) {
        error_setg_errno(errp, errno, "Failed to truncate pid file");
        goto fail_unlink;
    }

    snprintf(pidstr, sizeof(pidstr), FMT_pid "\n", getpid());
    if (qemu_write_full(fd, pidstr, strlen(pidstr)) != strlen(pidstr)) {
        error_setg(errp, "Failed to write pid file");
        goto fail_unlink;
    }

    return true;

fail_unlink:
    unlink(path);
fail_close:
    close(fd);
    return false;
}
1909e6bdef2SMarc-André Lureau 
/* alloc shared memory pages */
/*
 * Allocate @size bytes of anonymous RAM via qemu_ram_mmap(), aligned to
 * QEMU_VMALLOC_ALIGN.  @shared and @noreserve select the corresponding
 * QEMU_MAP_* flags.  If @alignment is non-NULL, the alignment actually
 * used is stored there.  Returns NULL if the mapping failed.
 */
void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared,
                          bool noreserve)
{
    /* Translate the bool options into qemu_ram_mmap() mapping flags. */
    const uint32_t qemu_map_flags = (shared ? QEMU_MAP_SHARED : 0) |
                                    (noreserve ? QEMU_MAP_NORESERVE : 0);
    size_t align = QEMU_VMALLOC_ALIGN;
    void *ptr = qemu_ram_mmap(-1, size, align, qemu_map_flags, 0);

    if (ptr == MAP_FAILED) {
        return NULL;
    }

    /* Report the alignment used, if the caller asked for it. */
    if (alignment) {
        *alignment = align;
    }

    trace_qemu_anon_ram_alloc(size, ptr);
    return ptr;
}
211baacf047SPaolo Bonzini 
/* Release a region previously obtained from qemu_anon_ram_alloc(). */
void qemu_anon_ram_free(void *ptr, size_t size)
{
    trace_qemu_anon_ram_free(ptr, size);
    qemu_ram_munmap(-1, ptr, size);
}
217e7a09b92SPaolo Bonzini 
/* Put @fd into blocking mode; any error is deliberately ignored. */
void qemu_socket_set_block(int fd)
{
    g_unix_set_fd_nonblocking(fd, false, NULL);
}
222baacf047SPaolo Bonzini 
/* Try to put @fd into non-blocking mode; returns 0 on success, -errno on failure. */
int qemu_socket_try_set_nonblock(int fd)
{
    return g_unix_set_fd_nonblocking(fd, true, NULL) ? 0 : -errno;
}
227894022e6SLaurent Vivier 
/* Put @fd into non-blocking mode; aborts if the operation fails. */
void qemu_socket_set_nonblock(int fd)
{
    int rc = qemu_socket_try_set_nonblock(fd);

    assert(rc == 0);
}
234baacf047SPaolo Bonzini 
/*
 * Enable SO_REUSEADDR on @fd so a listening address can be rebound
 * quickly after a restart.  Aborts on failure; always returns 0.
 */
int socket_set_fast_reuse(int fd)
{
    const int on = 1;
    int rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
                        (const char *)&on, sizeof(on));

    assert(rc == 0);

    return rc;
}
246606600a1SSebastian Ottlik 
/* Mark @fd close-on-exec.  Aborts if @fd is not a valid descriptor. */
void qemu_set_cloexec(int fd)
{
    int flags = fcntl(fd, F_GETFD);

    assert(flags != -1);
    flags = fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
    assert(flags != -1);
}
255baacf047SPaolo Bonzini 
/*
 * socketpair() wrapper that leaves both descriptors close-on-exec,
 * using SOCK_CLOEXEC atomically where the platform supports it.
 * Returns 0 on success, -1 (with errno set) on failure.
 */
int qemu_socketpair(int domain, int type, int protocol, int sv[2])
{
    int rc;

#ifdef SOCK_CLOEXEC
    rc = socketpair(domain, type | SOCK_CLOEXEC, protocol, sv);
    /* Only fall through when the kernel rejects SOCK_CLOEXEC itself. */
    if (rc != -1 || errno != EINVAL) {
        return rc;
    }
#endif
    rc = socketpair(domain, type, protocol, sv);
    if (rc != 0) {
        return rc;
    }

    /* Non-atomic fallback: mark both ends cloexec after the fact. */
    qemu_set_cloexec(sv[0]);
    qemu_set_cloexec(sv[1]);
    return 0;
}
2743c63b4e9SGuoyi Tu 
/*
 * Return the path of QEMU's local state directory, relocated relative
 * to the installation prefix.  Presumably a newly allocated string the
 * caller must g_free() — confirm against get_relocated_path().
 */
char *
qemu_get_local_state_dir(void)
{
    return get_relocated_path(CONFIG_QEMU_LOCALSTATEDIR);
}
28013401ba0SStefan Hajnoczi 
/*
 * Enable or disable terminal echo (plus canonical mode, NL echo and
 * extended input processing) on the tty open at @fd.
 *
 * Fix: the original ignored tcgetattr()'s return value, so on a
 * non-tty or invalid @fd it handed an uninitialized struct termios to
 * tcsetattr() (indeterminate values — undefined behavior).  Bail out
 * early instead; as before, errors cannot be reported to the caller.
 */
void qemu_set_tty_echo(int fd, bool echo)
{
    struct termios tty;

    if (tcgetattr(fd, &tty) < 0) {
        /* Not a tty (or bad fd): nothing sensible we can do. */
        return;
    }

    if (echo) {
        tty.c_lflag |= ECHO | ECHONL | ICANON | IEXTEN;
    } else {
        tty.c_lflag &= ~(ECHO | ECHONL | ICANON | IEXTEN);
    }

    tcsetattr(fd, TCSANOW, &tty);
}
29510f5bff6SFam Zheng 
/*
 * SIGBUS handler installed while preallocating without
 * MADV_POPULATE_WRITE.  If the faulting thread belongs to the active
 * preallocation request, jump back into that thread so it can fail
 * gracefully; otherwise chain to the previously installed handler
 * (Linux) or warn and ignore.
 */
#ifdef CONFIG_LINUX
static void sigbus_handler(int signal, siginfo_t *siginfo, void *ctx)
#else /* CONFIG_LINUX */
static void sigbus_handler(int signal)
#endif /* CONFIG_LINUX */
{
    int i;

    if (sigbus_memset_context) {
        for (i = 0; i < sigbus_memset_context->num_threads; i++) {
            MemsetThread *thread = &sigbus_memset_context->threads[i];

            /* Only the thread that actually faulted jumps out. */
            if (qemu_thread_is_self(&thread->pgthread)) {
                siglongjmp(thread->env, 1);
            }
        }
    }

#ifdef CONFIG_LINUX
    /*
     * We assume that the MCE SIGBUS handler could have been registered. We
     * should never receive BUS_MCEERR_AO on any of our threads, but only on
     * the main thread registered for PR_MCE_KILL_EARLY. Further, we should not
     * receive BUS_MCEERR_AR triggered by action of other threads on one of
     * our threads. So, no need to check for unrelated SIGBUS when seeing one
     * for our threads.
     *
     * We will forward to the MCE handler, which will either handle the SIGBUS
     * or reinstall the default SIGBUS handler and reraise the SIGBUS. The
     * default SIGBUS handler will crash the process, so we don't care.
     */
    if (sigbus_oldact.sa_flags & SA_SIGINFO) {
        sigbus_oldact.sa_sigaction(signal, siginfo, ctx);
        return;
    }
#endif /* CONFIG_LINUX */
    warn_report("qemu_prealloc_mem: unrelated SIGBUS detected and ignored");
}
33438183310SPaolo Bonzini 
/*
 * Preallocation worker (fallback when MADV_POPULATE_WRITE is unusable):
 * fault in every page of this thread's sub-range by reading one byte
 * per page and writing the same value back.
 *
 * Returns 0, or -EFAULT (cast to void *) if a SIGBUS arrived while
 * touching, i.e. the range could not be fully backed.
 */
static void *do_touch_pages(void *arg)
{
    MemsetThread *memset_args = (MemsetThread *)arg;
    sigset_t set, oldset;
    int ret = 0;

    /*
     * On Linux, the page faults from the loop below can cause mmap_sem
     * contention with allocation of the thread stacks.  Do not start
     * clearing until all threads have been created.
     */
    qemu_mutex_lock(&page_mutex);
    while (!memset_args->context->all_threads_created) {
        qemu_cond_wait(&page_cond, &page_mutex);
    }
    qemu_mutex_unlock(&page_mutex);

    /* unblock SIGBUS */
    sigemptyset(&set);
    sigaddset(&set, SIGBUS);
    pthread_sigmask(SIG_UNBLOCK, &set, &oldset);

    /* Nonzero means we were re-entered via siglongjmp() from sigbus_handler(). */
    if (sigsetjmp(memset_args->env, 1)) {
        ret = -EFAULT;
    } else {
        char *addr = memset_args->addr;
        size_t numpages = memset_args->numpages;
        size_t hpagesize = memset_args->hpagesize;
        size_t i;
        for (i = 0; i < numpages; i++) {
            /*
             * Read & write back the same value, so we don't
             * corrupt existing user/app data that might be
             * stored.
             *
             * 'volatile' to stop compiler optimizing this away
             * to a no-op
             */
            *(volatile char *)addr = *addr;
            addr += hpagesize;
        }
    }
    /* Restore the signal mask we entered with. */
    pthread_sigmask(SIG_SETMASK, &oldset, NULL);
    return (void *)(uintptr_t)ret;
}
3801e356fc1SJitendra Kolhe 
/*
 * Preallocation worker: populate this thread's entire sub-range with a
 * single MADV_POPULATE_WRITE madvise call.
 *
 * Returns 0 on success or -errno (cast to void *) on failure.
 */
static void *do_madv_populate_write_pages(void *arg)
{
    MemsetThread *memset_args = (MemsetThread *)arg;
    const size_t size = memset_args->numpages * memset_args->hpagesize;
    char * const addr = memset_args->addr;
    int ret = 0;

    /* See do_touch_pages(). */
    qemu_mutex_lock(&page_mutex);
    while (!memset_args->context->all_threads_created) {
        qemu_cond_wait(&page_cond, &page_mutex);
    }
    qemu_mutex_unlock(&page_mutex);

    if (size && qemu_madvise(addr, size, QEMU_MADV_POPULATE_WRITE)) {
        ret = -errno;
    }
    return (void *)(uintptr_t)ret;
}
400a384bfa3SDavid Hildenbrand 
/*
 * Decide how many preallocation threads to use for @numpages pages of
 * @hpagesize bytes each, capped by @max_threads, the number of online
 * CPUs and MAX_MEM_PREALLOC_THREAD_COUNT.  Always returns at least 1.
 */
static inline int get_memset_num_threads(size_t hpagesize, size_t numpages,
                                         int max_threads)
{
    long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
    int ret = 1;

    if (host_procs > 0) {
        ret = MIN(MIN(host_procs, MAX_MEM_PREALLOC_THREAD_COUNT), max_threads);
    }

    /* Especially with gigantic pages, don't create more threads than pages. */
    ret = MIN(ret, numpages);
    /* Don't start threads to prealloc comparatively little memory. */
    ret = MIN(ret, MAX(1, hpagesize * numpages / (64 * MiB)));

    /* In case sysconf() fails, we fall back to single threaded */
    return ret;
}
419dfd0dcc7SJitendra Kolhe 
/*
 * Join every worker thread of @context, free the context, and return
 * the last nonzero per-thread status (0 if all workers succeeded).
 */
static int wait_and_free_mem_prealloc_context(MemsetContext *context)
{
    int err = 0;

    for (int i = 0; i < context->num_threads; i++) {
        int status = (uintptr_t)qemu_thread_join(&context->threads[i].pgthread);

        if (status) {
            err = status;
        }
    }

    g_free(context->threads);
    g_free(context);
    return err;
}
43504accf43SMark Kanda 
/*
 * Touch or populate [area, area + hpagesize * numpages) with up to
 * @max_threads worker threads, created via thread context @tc when one
 * is given.
 *
 * With @async set (only honoured together with MADV_POPULATE_WRITE and
 * a thread context), the workers are created but held at their start
 * gate; the request is queued on memset_contexts and completed later by
 * qemu_finish_async_prealloc_mem().
 *
 * Returns 0 on success or a negative errno value.
 */
static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
                           int max_threads, ThreadContext *tc, bool async,
                           bool use_madv_populate_write)
{
    static gsize initialized = 0;
    MemsetContext *context = g_malloc0(sizeof(MemsetContext));
    size_t numpages_per_thread, leftover;
    void *(*touch_fn)(void *);
    int ret, i = 0;
    char *addr = area;

    /*
     * Asynchronous preallocation is only allowed when using MADV_POPULATE_WRITE
     * and prealloc context for thread placement.
     */
    if (!use_madv_populate_write || !tc) {
        async = false;
    }

    context->num_threads =
        get_memset_num_threads(hpagesize, numpages, max_threads);

    /* One-time init of the start-gate mutex/condvar shared by all workers. */
    if (g_once_init_enter(&initialized)) {
        qemu_mutex_init(&page_mutex);
        qemu_cond_init(&page_cond);
        g_once_init_leave(&initialized, 1);
    }

    if (use_madv_populate_write) {
        /*
         * Avoid creating a single thread for MADV_POPULATE_WRITE when
         * preallocating synchronously.
         */
        if (context->num_threads == 1 && !async) {
            ret = 0;
            if (qemu_madvise(area, hpagesize * numpages,
                             QEMU_MADV_POPULATE_WRITE)) {
                ret = -errno;
            }
            g_free(context);
            return ret;
        }
        touch_fn = do_madv_populate_write_pages;
    } else {
        touch_fn = do_touch_pages;
    }

    /* Split the pages across workers; the first 'leftover' get one extra. */
    context->threads = g_new0(MemsetThread, context->num_threads);
    numpages_per_thread = numpages / context->num_threads;
    leftover = numpages % context->num_threads;
    for (i = 0; i < context->num_threads; i++) {
        context->threads[i].addr = addr;
        context->threads[i].numpages = numpages_per_thread + (i < leftover);
        context->threads[i].hpagesize = hpagesize;
        context->threads[i].context = context;
        if (tc) {
            thread_context_create_thread(tc, &context->threads[i].pgthread,
                                         "touch_pages",
                                         touch_fn, &context->threads[i],
                                         QEMU_THREAD_JOINABLE);
        } else {
            qemu_thread_create(&context->threads[i].pgthread, "touch_pages",
                               touch_fn, &context->threads[i],
                               QEMU_THREAD_JOINABLE);
        }
        addr += context->threads[i].numpages * hpagesize;
    }

    if (async) {
        /*
         * async requests currently require the BQL. Add it to the list and kick
         * preallocation off during qemu_finish_async_prealloc_mem().
         */
        assert(bql_locked());
        QLIST_INSERT_HEAD(&memset_contexts, context, next);
        return 0;
    }

    /* Let sigbus_handler() find our threads while they are touching pages. */
    if (!use_madv_populate_write) {
        sigbus_memset_context = context;
    }

    /* Open the start gate: wake all workers waiting in their touch_fn. */
    qemu_mutex_lock(&page_mutex);
    context->all_threads_created = true;
    qemu_cond_broadcast(&page_cond);
    qemu_mutex_unlock(&page_mutex);

    ret = wait_and_free_mem_prealloc_context(context);

    if (!use_madv_populate_write) {
        sigbus_memset_context = NULL;
    }
    return ret;
}
53004accf43SMark Kanda 
/*
 * Complete all queued asynchronous preallocation requests: open every
 * context's start gate, join all workers, and report failure if any
 * worker returned an error.
 *
 * Requires the BQL.  Returns true on success; sets @errp and returns
 * false if any preallocation failed.
 */
bool qemu_finish_async_prealloc_mem(Error **errp)
{
    int ret = 0, tmp;
    MemsetContext *context, *next_context;

    /* Waiting for preallocation requires the BQL. */
    assert(bql_locked());
    if (QLIST_EMPTY(&memset_contexts)) {
        return true;
    }

    /* Release the start gate of every queued context in one broadcast. */
    qemu_mutex_lock(&page_mutex);
    QLIST_FOREACH(context, &memset_contexts, next) {
        context->all_threads_created = true;
    }
    qemu_cond_broadcast(&page_cond);
    qemu_mutex_unlock(&page_mutex);

    /* Join and free each context; remember the last nonzero status. */
    QLIST_FOREACH_SAFE(context, &memset_contexts, next, next_context) {
        QLIST_REMOVE(context, next);
        tmp = wait_and_free_mem_prealloc_context(context);
        if (tmp) {
            ret = tmp;
        }
    }

    if (ret) {
        error_setg_errno(errp, -ret,
                         "qemu_prealloc_mem: preallocating memory failed");
        return false;
    }
    return true;
}
5641e356fc1SJitendra Kolhe 
/*
 * Probe whether MADV_POPULATE_WRITE works on @area: the kernel rejects
 * it with EINVAL for some special mappings (e.g. mapping /dev/mem).
 */
static bool madv_populate_write_possible(char *area, size_t pagesize)
{
    return !qemu_madvise(area, pagesize, QEMU_MADV_POPULATE_WRITE) ||
           errno != EINVAL;
}
570a384bfa3SDavid Hildenbrand 
/*
 * Preallocate (touch) all pages backing the range [area, area + sz).
 *
 * Two strategies are used: MADV_POPULATE_WRITE when the mapping supports
 * it, otherwise the pages are written to directly with a temporary SIGBUS
 * handler installed to catch allocation failures (e.g. out of hugepages).
 *
 * @fd: file descriptor backing the mapping (used to pick the page size)
 * @area: start of the range; expected to be page-aligned
 * @sz: size of the range in bytes
 * @max_threads: maximum number of touching threads to spawn
 * @tc: thread context to create the touching threads in, may be NULL
 * @async: if true, request asynchronous preallocation from touch_all_pages()
 * @errp: error object, set on failure
 *
 * Returns true on success, false on failure (with @errp set).
 */
bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
                       ThreadContext *tc, bool async, Error **errp)
{
    static gsize initialized;
    int ret;
    size_t hpagesize = qemu_fd_getpagesize(fd);
    size_t numpages = DIV_ROUND_UP(sz, hpagesize);
    bool use_madv_populate_write;
    struct sigaction act;
    bool rv = true;

    /*
     * Sense on every invocation, as MADV_POPULATE_WRITE cannot be used for
     * some special mappings, such as mapping /dev/mem.
     */
    use_madv_populate_write = madv_populate_write_possible(area, hpagesize);

    if (!use_madv_populate_write) {
        /* One-time, thread-safe initialization of the SIGBUS serialization
         * mutex. */
        if (g_once_init_enter(&initialized)) {
            qemu_mutex_init(&sigbus_mutex);
            g_once_init_leave(&initialized, 1);
        }

        /* Serialize concurrent preallocations: the process-wide SIGBUS
         * handler is swapped below and must not be changed underneath us. */
        qemu_mutex_lock(&sigbus_mutex);
        memset(&act, 0, sizeof(act));
#ifdef CONFIG_LINUX
        /* SA_SIGINFO variant: the handler inspects siginfo on Linux. */
        act.sa_sigaction = &sigbus_handler;
        act.sa_flags = SA_SIGINFO;
#else /* CONFIG_LINUX */
        act.sa_handler = &sigbus_handler;
        act.sa_flags = 0;
#endif /* CONFIG_LINUX */

        /* Install our SIGBUS handler, remembering the previous one. */
        ret = sigaction(SIGBUS, &act, &sigbus_oldact);
        if (ret) {
            qemu_mutex_unlock(&sigbus_mutex);
            error_setg_errno(errp, errno,
                "qemu_prealloc_mem: failed to install signal handler");
            return false;
        }
    }

    /* touch pages simultaneously */
    ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc, async,
                          use_madv_populate_write);
    if (ret) {
        error_setg_errno(errp, -ret,
                         "qemu_prealloc_mem: preallocating memory failed");
        rv = false;
    }

    if (!use_madv_populate_write) {
        /* Restore the original SIGBUS disposition before unlocking. */
        ret = sigaction(SIGBUS, &sigbus_oldact, NULL);
        if (ret) {
            /* Terminate QEMU since it can't recover from error */
            perror("qemu_prealloc_mem: failed to reinstall signal handler");
            exit(1);
        }
        qemu_mutex_unlock(&sigbus_mutex);
    }
    return rv;
}
633d57e4e48SDaniel P. Berrange 
/*
 * Return a newly allocated string naming the process @pid, or NULL if it
 * cannot be determined.  The caller owns the returned string (g_free()).
 */
char *qemu_get_pid_name(pid_t pid)
{
    char *name = NULL;

#if defined(__FreeBSD__)
    /* BSDs don't have /proc, but they provide a nice substitute */
    struct kinfo_proc *proc = kinfo_getproc(pid);

    if (proc != NULL) {
        name = g_strdup(proc->ki_comm);
        free(proc);
    }
#else
    /* Assume a system with reasonable procfs */
    size_t len;
    char *path = g_strdup_printf("/proc/%d/cmdline", pid);

    /* On any read failure, name simply stays NULL. */
    g_file_get_contents(path, &name, &len, NULL);
    g_free(path);
#endif

    return name;
}
6587dc9ae43SMichal Privoznik 
6597dc9ae43SMichal Privoznik 
qemu_alloc_stack(size_t * sz)6608737d9e0SPeter Lieven void *qemu_alloc_stack(size_t *sz)
6618737d9e0SPeter Lieven {
662a1eaa628SAkihiko Odaki     void *ptr;
663fc3d1badSBrad Smith     int flags;
6647d992e4dSPeter Lieven #ifdef CONFIG_DEBUG_STACK_USAGE
6657d992e4dSPeter Lieven     void *ptr2;
6667d992e4dSPeter Lieven #endif
6678e3b0cbbSMarc-André Lureau     size_t pagesz = qemu_real_host_page_size();
6688737d9e0SPeter Lieven #ifdef _SC_THREAD_STACK_MIN
6698737d9e0SPeter Lieven     /* avoid stacks smaller than _SC_THREAD_STACK_MIN */
6708737d9e0SPeter Lieven     long min_stack_sz = sysconf(_SC_THREAD_STACK_MIN);
6718737d9e0SPeter Lieven     *sz = MAX(MAX(min_stack_sz, 0), *sz);
6728737d9e0SPeter Lieven #endif
6738737d9e0SPeter Lieven     /* adjust stack size to a multiple of the page size */
6748737d9e0SPeter Lieven     *sz = ROUND_UP(*sz, pagesz);
6758737d9e0SPeter Lieven     /* allocate one extra page for the guard page */
6768737d9e0SPeter Lieven     *sz += pagesz;
6778737d9e0SPeter Lieven 
678fc3d1badSBrad Smith     flags = MAP_PRIVATE | MAP_ANONYMOUS;
679fc3d1badSBrad Smith #if defined(MAP_STACK) && defined(__OpenBSD__)
680fc3d1badSBrad Smith     /* Only enable MAP_STACK on OpenBSD. Other OS's such as
681fc3d1badSBrad Smith      * Linux/FreeBSD/NetBSD have a flag with the same name
682fc3d1badSBrad Smith      * but have differing functionality. OpenBSD will SEGV
683fc3d1badSBrad Smith      * if it spots execution with a stack pointer pointing
684fc3d1badSBrad Smith      * at memory that was not allocated with MAP_STACK.
685fc3d1badSBrad Smith      */
686fc3d1badSBrad Smith     flags |= MAP_STACK;
687fc3d1badSBrad Smith #endif
688fc3d1badSBrad Smith 
689fc3d1badSBrad Smith     ptr = mmap(NULL, *sz, PROT_READ | PROT_WRITE, flags, -1, 0);
6908737d9e0SPeter Lieven     if (ptr == MAP_FAILED) {
691e916a6e8SEduardo Habkost         perror("failed to allocate memory for stack");
6928737d9e0SPeter Lieven         abort();
6938737d9e0SPeter Lieven     }
6948737d9e0SPeter Lieven 
695a1eaa628SAkihiko Odaki     /* Stack grows down -- guard page at the bottom. */
696a1eaa628SAkihiko Odaki     if (mprotect(ptr, pagesz, PROT_NONE) != 0) {
697e916a6e8SEduardo Habkost         perror("failed to set up stack guard page");
6988737d9e0SPeter Lieven         abort();
6998737d9e0SPeter Lieven     }
7008737d9e0SPeter Lieven 
7017d992e4dSPeter Lieven #ifdef CONFIG_DEBUG_STACK_USAGE
7027d992e4dSPeter Lieven     for (ptr2 = ptr + pagesz; ptr2 < ptr + *sz; ptr2 += sizeof(uint32_t)) {
7037d992e4dSPeter Lieven         *(uint32_t *)ptr2 = 0xdeadbeaf;
7047d992e4dSPeter Lieven     }
7057d992e4dSPeter Lieven #endif
7067d992e4dSPeter Lieven 
7078737d9e0SPeter Lieven     return ptr;
7088737d9e0SPeter Lieven }
7098737d9e0SPeter Lieven 
7107d992e4dSPeter Lieven #ifdef CONFIG_DEBUG_STACK_USAGE
7117d992e4dSPeter Lieven static __thread unsigned int max_stack_usage;
7127d992e4dSPeter Lieven #endif
7137d992e4dSPeter Lieven 
qemu_free_stack(void * stack,size_t sz)7148737d9e0SPeter Lieven void qemu_free_stack(void *stack, size_t sz)
7158737d9e0SPeter Lieven {
7167d992e4dSPeter Lieven #ifdef CONFIG_DEBUG_STACK_USAGE
7177d992e4dSPeter Lieven     unsigned int usage;
7187d992e4dSPeter Lieven     void *ptr;
7197d992e4dSPeter Lieven 
7208e3b0cbbSMarc-André Lureau     for (ptr = stack + qemu_real_host_page_size(); ptr < stack + sz;
7217d992e4dSPeter Lieven          ptr += sizeof(uint32_t)) {
7227d992e4dSPeter Lieven         if (*(uint32_t *)ptr != 0xdeadbeaf) {
7237d992e4dSPeter Lieven             break;
7247d992e4dSPeter Lieven         }
7257d992e4dSPeter Lieven     }
7267d992e4dSPeter Lieven     usage = sz - (uintptr_t) (ptr - stack);
7277d992e4dSPeter Lieven     if (usage > max_stack_usage) {
7287d992e4dSPeter Lieven         error_report("thread %d max stack usage increased from %u to %u",
7297d992e4dSPeter Lieven                      qemu_get_thread_id(), max_stack_usage, usage);
7307d992e4dSPeter Lieven         max_stack_usage = usage;
7317d992e4dSPeter Lieven     }
7327d992e4dSPeter Lieven #endif
7337d992e4dSPeter Lieven 
7348737d9e0SPeter Lieven     munmap(stack, sz);
7358737d9e0SPeter Lieven }
736d98d4072SPaolo Bonzini 
737c905a368SDaniele Buono /*
738c905a368SDaniele Buono  * Disable CFI checks.
739d02d06f8SMichael Tokarev  * We are going to call a signal handler directly. Such handler may or may not
740c905a368SDaniele Buono  * have been defined in our binary, so there's no guarantee that the pointer
741c905a368SDaniele Buono  * used to set the handler is a cfi-valid pointer. Since the handlers are
742c905a368SDaniele Buono  * stored in kernel memory, changing the handler to an attacker-defined
743c905a368SDaniele Buono  * function requires being able to call a sigaction() syscall,
744c905a368SDaniele Buono  * which is not as easy as overwriting a pointer in memory.
745c905a368SDaniele Buono  */
746c905a368SDaniele Buono QEMU_DISABLE_CFI
sigaction_invoke(struct sigaction * action,struct qemu_signalfd_siginfo * info)747d98d4072SPaolo Bonzini void sigaction_invoke(struct sigaction *action,
748d98d4072SPaolo Bonzini                       struct qemu_signalfd_siginfo *info)
749d98d4072SPaolo Bonzini {
75002ffa034SPeter Maydell     siginfo_t si = {};
751d98d4072SPaolo Bonzini     si.si_signo = info->ssi_signo;
752d98d4072SPaolo Bonzini     si.si_errno = info->ssi_errno;
753d98d4072SPaolo Bonzini     si.si_code = info->ssi_code;
754d98d4072SPaolo Bonzini 
755d98d4072SPaolo Bonzini     /* Convert the minimal set of fields defined by POSIX.
756d98d4072SPaolo Bonzini      * Positive si_code values are reserved for kernel-generated
757d98d4072SPaolo Bonzini      * signals, where the valid siginfo fields are determined by
758d98d4072SPaolo Bonzini      * the signal number.  But according to POSIX, it is unspecified
759d98d4072SPaolo Bonzini      * whether SI_USER and SI_QUEUE have values less than or equal to
760d98d4072SPaolo Bonzini      * zero.
761d98d4072SPaolo Bonzini      */
762d98d4072SPaolo Bonzini     if (info->ssi_code == SI_USER || info->ssi_code == SI_QUEUE ||
763d98d4072SPaolo Bonzini         info->ssi_code <= 0) {
764d98d4072SPaolo Bonzini         /* SIGTERM, etc.  */
765d98d4072SPaolo Bonzini         si.si_pid = info->ssi_pid;
766d98d4072SPaolo Bonzini         si.si_uid = info->ssi_uid;
767d98d4072SPaolo Bonzini     } else if (info->ssi_signo == SIGILL || info->ssi_signo == SIGFPE ||
768d98d4072SPaolo Bonzini                info->ssi_signo == SIGSEGV || info->ssi_signo == SIGBUS) {
769d98d4072SPaolo Bonzini         si.si_addr = (void *)(uintptr_t)info->ssi_addr;
770d98d4072SPaolo Bonzini     } else if (info->ssi_signo == SIGCHLD) {
771d98d4072SPaolo Bonzini         si.si_pid = info->ssi_pid;
772d98d4072SPaolo Bonzini         si.si_status = info->ssi_status;
773d98d4072SPaolo Bonzini         si.si_uid = info->ssi_uid;
774d98d4072SPaolo Bonzini     }
775d98d4072SPaolo Bonzini     action->sa_sigaction(info->ssi_signo, &si, NULL);
776d98d4072SPaolo Bonzini }
777e47f4765SMichal Privoznik 
/*
 * Return the total physical memory of the host in bytes, saturating to
 * SIZE_MAX if the product would overflow, or 0 if it cannot be queried.
 */
size_t qemu_get_host_physmem(void)
{
#ifdef _SC_PHYS_PAGES
    long pages = sysconf(_SC_PHYS_PAGES);

    if (pages > 0) {
        size_t pagesz = qemu_real_host_page_size();

        /* Saturate rather than overflow on very large hosts. */
        if ((size_t)pages > SIZE_MAX / pagesz) {
            return SIZE_MAX;
        }
        return pages * pagesz;
    }
#endif
    return 0;
}
792e9c4e0a8SMarc-André Lureau 
/*
 * msync() a possibly unaligned range.  The start address is aligned down
 * to a page boundary and the length grown accordingly, then rounded up to
 * a whole number of pages, since both must share the same alignment.
 *
 * NOTE(review): @fd is unused in this POSIX implementation; kept for
 * interface compatibility.
 */
int qemu_msync(void *addr, size_t length, int fd)
{
    size_t pagesz = qemu_real_host_page_size();
    uintptr_t start = (uintptr_t)addr;
    size_t head = start & (pagesz - 1);

    /* Extend the range down to the page boundary and round the total
     * length up to a multiple of the page size. */
    length = (length + head + pagesz - 1) & ~(pagesz - 1);
    addr = (void *)(start - head);

    return msync(addr, length, MS_SYNC);
}
8104ec5ebeaSClément Léger 
/*
 * Close every open file descriptor listed in /proc/self/fd, except the fd
 * used to read that directory and the fds in @skip.
 *
 * @skip: array of fds to preserve, sorted in ascending order; may be NULL
 *        only when @nskip == 0.
 *
 * Returns false if /proc is not available (nothing was closed), true
 * otherwise.
 */
static bool qemu_close_all_open_fd_proc(const int *skip, unsigned int nskip)
{
    struct dirent *de;
    int fd, dfd;
    DIR *dir;
    unsigned int skip_start = 0, skip_end = nskip;

    dir = opendir("/proc/self/fd");
    if (!dir) {
        /* If /proc is not mounted, there is nothing that can be done. */
        return false;
    }
    /* Avoid closing the directory. */
    dfd = dirfd(dir);

    for (de = readdir(dir); de; de = readdir(dir)) {
        bool close_fd = true;

        if (de->d_name[0] == '.') {
            continue;
        }
        fd = atoi(de->d_name);
        if (fd == dfd) {
            continue;
        }

        /* Scan the (sorted) remaining skip window for this fd. */
        for (unsigned int i = skip_start; i < skip_end; i++) {
            if (fd < skip[i]) {
                /* We are below the next skipped fd, break */
                break;
            } else if (fd == skip[i]) {
                close_fd = false;
                /*
                 * Restrict the range as we found fds matching start/end.
                 * Fix: the upper-boundary check previously compared against
                 * skip_end itself, which the loop condition (i < skip_end)
                 * makes unreachable; compare against the last valid index
                 * so the window actually shrinks from the top as intended.
                 */
                if (i == skip_start) {
                    skip_start++;
                } else if (i == skip_end - 1) {
                    skip_end--;
                }
                break;
            }
        }

        if (close_fd) {
            close(fd);
        }
    }
    closedir(dir);

    return true;
}
861ffa28f9cSClément Léger 
/*
 * Close every fd in [0, open_max) except those in @skip, using
 * close_range() so whole runs of fds are closed per syscall.
 *
 * @skip: array of fds to preserve, sorted in ascending order; consumed
 *        front to back while walking the fd space upwards.
 *
 * Returns false when close_range() is not compiled in or a call fails
 * (so the caller can fall back to another method), true on success.
 */
static bool qemu_close_all_open_fd_close_range(const int *skip,
                                               unsigned int nskip,
                                               int open_max)
{
#ifdef CONFIG_CLOSE_RANGE
    int max_fd = open_max - 1;
    int first = 0, last;
    unsigned int cur_skip = 0;
    int ret;

    do {
        /* Find the start boundary of the range to close */
        while (cur_skip < nskip && first == skip[cur_skip]) {
            cur_skip++;
            first++;
        }

        /* Find the upper boundary of the range to close */
        last = max_fd;
        if (cur_skip < nskip) {
            last = skip[cur_skip] - 1;
            last = MIN(last, max_fd);
        }

        /* With the adjustments to the range, we might be done. */
        if (first > last) {
            break;
        }

        ret = close_range(first, last, 0);
        if (ret < 0) {
            return false;
        }

        /* Resume just past the range we closed (i.e. at the skipped fd). */
        first = last + 1;
    } while (last < max_fd);

    return true;
#else
    return false;
#endif
}
904*7532ca57SClément Léger 
/*
 * Brute-force fallback: attempt to close every fd below @open_max except
 * those in @skip (sorted ascending).  Closing an fd that is not open is
 * harmless (close() just fails with EBADF).
 */
static void qemu_close_all_open_fd_fallback(const int *skip, unsigned int nskip,
                                            int open_max)
{
    unsigned int next_skip = 0;

    /* Fallback */
    for (int fd = 0; fd < open_max; fd++) {
        if (next_skip < nskip && fd == skip[next_skip]) {
            next_skip++;
        } else {
            close(fd);
        }
    }
}
919ffa28f9cSClément Léger 
/*
 * Close all open file descriptors except those in @skip (which must be
 * sorted in ascending order; may be NULL only when @nskip == 0).
 *
 * Tries close_range() first, then walking /proc/self/fd, and finally a
 * brute-force sweep over every possible fd.
 */
void qemu_close_all_open_fd(const int *skip, unsigned int nskip)
{
    int open_max = sysconf(_SC_OPEN_MAX);

    assert(skip != NULL || nskip == 0);

    if (qemu_close_all_open_fd_close_range(skip, nskip, open_max)) {
        return;
    }
    if (qemu_close_all_open_fd_proc(skip, nskip)) {
        return;
    }
    qemu_close_all_open_fd_fallback(skip, nskip, open_max);
}
934