/*
 * os-posix-lib.c
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2010 Red Hat, Inc.
 *
 * QEMU library functions on POSIX which are shared between QEMU and
 * the QEMU tools.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include <termios.h>

#include <glib/gprintf.h>

#include "system/system.h"
#include "trace.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/madvise.h"
#include "qemu/sockets.h"
#include "qemu/thread.h"
#include <libgen.h>
#include "qemu/cutils.h"
#include "qemu/units.h"
#include "qemu/thread-context.h"
#include "qemu/main-loop.h"

#ifdef CONFIG_LINUX
#include <sys/syscall.h>
#endif

#ifdef __FreeBSD__
#include <sys/thr.h>
#include <sys/user.h>
#include <libutil.h>
#endif

#ifdef __NetBSD__
#include <lwp.h>
#endif

#include "qemu/memalign.h"
#include "qemu/mmap-alloc.h"

#define MAX_MEM_PREALLOC_THREAD_COUNT 16

struct MemsetThread;

static QLIST_HEAD(, MemsetContext) memset_contexts =
    QLIST_HEAD_INITIALIZER(memset_contexts);

typedef struct MemsetContext {
    bool all_threads_created;
    bool any_thread_failed;
    struct MemsetThread *threads;
    int num_threads;
    QLIST_ENTRY(MemsetContext) next;
} MemsetContext;

struct MemsetThread {
    char *addr;
    size_t numpages;
    size_t hpagesize;
    QemuThread pgthread;
    sigjmp_buf env;
    MemsetContext *context;
};
typedef struct MemsetThread MemsetThread;

/* used by sigbus_handler() */
static MemsetContext *sigbus_memset_context;
static struct sigaction sigbus_oldact;
static QemuMutex sigbus_mutex;

static QemuMutex page_mutex;
static QemuCond page_cond;
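/*
 * Preallocation coordination: each memset thread waits on page_cond until
 * its context's all_threads_created flag is set, so that page touching
 * cannot contend with the allocation of sibling thread stacks (see
 * do_touch_pages()). sigbus_memset_context lets sigbus_handler() route a
 * SIGBUS back to the memset thread that triggered it.
 */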
int qemu_get_thread_id(void)
{
#if defined(__linux__)
    return syscall(SYS_gettid);
#elif defined(__FreeBSD__)
    /* thread id is up to INT_MAX */
    long tid;
    thr_self(&tid);
    return (int)tid;
#elif defined(__NetBSD__)
    return _lwp_self();
#elif defined(__OpenBSD__)
    return getthrid();
#else
    return getpid();
#endif
}

int qemu_kill_thread(int tid, int sig)
{
#if defined(__linux__)
    return syscall(__NR_tgkill, getpid(), tid, sig);
#elif defined(__FreeBSD__)
    return thr_kill2(getpid(), tid, sig);
#elif defined(__NetBSD__)
    return _lwp_kill(tid, sig);
#elif defined(__OpenBSD__)
    return thrkill(tid, sig, NULL);
#else
    return kill(tid, sig);
#endif
}

int qemu_daemon(int nochdir, int noclose)
{
    return daemon(nochdir, noclose);
}

bool qemu_write_pidfile(const char *path, Error **errp)
{
    int fd;
    char pidstr[32];

    while (1) {
        struct stat a, b;
        struct flock lock = {
            .l_type = F_WRLCK,
            .l_whence = SEEK_SET,
            .l_len = 0,
        };

        fd = qemu_create(path, O_WRONLY, S_IRUSR | S_IWUSR, errp);
        if (fd == -1) {
            return false;
        }

        if (fstat(fd, &b) < 0) {
            error_setg_errno(errp, errno, "Cannot stat file");
            goto fail_close;
        }

        if (fcntl(fd, F_SETLK, &lock)) {
            error_setg_errno(errp, errno, "Cannot lock pid file");
            goto fail_close;
        }

        /*
         * Now make sure the path we locked is the same one that now
         * exists on the filesystem.
         */
        if (stat(path, &a) < 0) {
            /*
             * PID file disappeared, someone else must be racing with
             * us, so try again.
             */
            close(fd);
            continue;
        }

        if (a.st_ino == b.st_ino) {
            break;
        }

        /*
         * PID file was recreated, someone else must be racing with
         * us, so try again.
         */
        close(fd);
    }

    if (ftruncate(fd, 0) < 0) {
        error_setg_errno(errp, errno, "Failed to truncate pid file");
        goto fail_unlink;
    }

    snprintf(pidstr, sizeof(pidstr), FMT_pid "\n", getpid());
    if (qemu_write_full(fd, pidstr, strlen(pidstr)) != strlen(pidstr)) {
        error_setg(errp, "Failed to write pid file");
        goto fail_unlink;
    }

    return true;

fail_unlink:
    unlink(path);
fail_close:
    close(fd);
    return false;
}

/* alloc shared memory pages */
void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared,
                          bool noreserve)
{
    const uint32_t qemu_map_flags = (shared ? QEMU_MAP_SHARED : 0) |
                                    (noreserve ? QEMU_MAP_NORESERVE : 0);
    size_t align = QEMU_VMALLOC_ALIGN;
#ifndef EMSCRIPTEN
    void *ptr = qemu_ram_mmap(-1, size, align, qemu_map_flags, 0);

    if (ptr == MAP_FAILED) {
        return NULL;
    }
#else
    /*
     * qemu_ram_mmap is not implemented for Emscripten. Use qemu_memalign
     * for the anonymous allocation. noreserve is ignored as there is no swap
     * space on Emscripten, and shared is ignored as there are no other
     * processes on Emscripten.
     */
    void *ptr = qemu_memalign(align, size);
#endif

    if (alignment) {
        *alignment = align;
    }

    trace_qemu_anon_ram_alloc(size, ptr);
    return ptr;
}

void qemu_anon_ram_free(void *ptr, size_t size)
{
    trace_qemu_anon_ram_free(ptr, size);
#ifndef EMSCRIPTEN
    qemu_ram_munmap(-1, ptr, size);
#else
    /*
     * qemu_ram_munmap is not implemented for Emscripten and qemu_memalign
     * was used for the allocation. Use the corresponding freeing function
     * here.
     */
    qemu_vfree(ptr);
#endif
}

bool qemu_set_blocking(int fd, bool block, Error **errp)
{
    g_autoptr(GError) err = NULL;

    if (!g_unix_set_fd_nonblocking(fd, !block, &err)) {
        error_setg_errno(errp, errno,
                         "Can't set file descriptor %d %s: %s", fd,
                         block ? "blocking" : "non-blocking",
                         err->message);
        return false;
    }

    return true;
}

int socket_set_fast_reuse(int fd)
{
    int val = 1, ret;

    ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
                     (const char *)&val, sizeof(val));

    assert(ret == 0);

    return ret;
}

void qemu_set_cloexec(int fd)
{
    int f;
    f = fcntl(fd, F_GETFD);
    assert(f != -1);
    f = fcntl(fd, F_SETFD, f | FD_CLOEXEC);
    assert(f != -1);
}

int qemu_socketpair(int domain, int type, int protocol, int sv[2])
{
    int ret;

#ifdef SOCK_CLOEXEC
    ret = socketpair(domain, type | SOCK_CLOEXEC, protocol, sv);
    if (ret != -1 || errno != EINVAL) {
        return ret;
    }
#endif
    ret = socketpair(domain, type, protocol, sv);
    if (ret == 0) {
        qemu_set_cloexec(sv[0]);
        qemu_set_cloexec(sv[1]);
    }

    return ret;
}

void qemu_clear_cloexec(int fd)
{
    int f;
    f = fcntl(fd, F_GETFD);
    assert(f != -1);
    f = fcntl(fd, F_SETFD, f & ~FD_CLOEXEC);
    assert(f != -1);
}

char *
qemu_get_local_state_dir(void)
{
    return get_relocated_path(CONFIG_QEMU_LOCALSTATEDIR);
}

void qemu_set_tty_echo(int fd, bool echo)
{
    struct termios tty;

    tcgetattr(fd, &tty);

    if (echo) {
        tty.c_lflag |= ECHO | ECHONL | ICANON | IEXTEN;
    } else {
        tty.c_lflag &= ~(ECHO | ECHONL | ICANON | IEXTEN);
    }

    tcsetattr(fd, TCSANOW, &tty);
}

#ifdef CONFIG_LINUX
static void sigbus_handler(int signal, siginfo_t *siginfo, void *ctx)
#else /* CONFIG_LINUX */
static void sigbus_handler(int signal)
#endif /* CONFIG_LINUX */
{
    int i;

    if (sigbus_memset_context) {
        for (i = 0; i < sigbus_memset_context->num_threads; i++) {
            MemsetThread *thread = &sigbus_memset_context->threads[i];

            if (qemu_thread_is_self(&thread->pgthread)) {
                siglongjmp(thread->env, 1);
            }
        }
    }

#ifdef CONFIG_LINUX
    /*
     * We assume that the MCE SIGBUS handler could have been registered. We
     * should never receive BUS_MCEERR_AO on any of our threads, but only on
     * the main thread registered for PR_MCE_KILL_EARLY. Further, we should not
     * receive BUS_MCEERR_AR triggered by action of other threads on one of
     * our threads. So, no need to check for unrelated SIGBUS when seeing one
     * for our threads.
     *
     * We will forward to the MCE handler, which will either handle the SIGBUS
     * or reinstall the default SIGBUS handler and reraise the SIGBUS. The
     * default SIGBUS handler will crash the process, so we don't care.
     */
    if (sigbus_oldact.sa_flags & SA_SIGINFO) {
        sigbus_oldact.sa_sigaction(signal, siginfo, ctx);
        return;
    }
#endif /* CONFIG_LINUX */
    warn_report("qemu_prealloc_mem: unrelated SIGBUS detected and ignored");
}

static void *do_touch_pages(void *arg)
{
    MemsetThread *memset_args = (MemsetThread *)arg;
    sigset_t set, oldset;
    int ret = 0;

    /*
     * On Linux, the page faults from the loop below can cause mmap_sem
     * contention with allocation of the thread stacks. Do not start
     * clearing until all threads have been created.
     */
    qemu_mutex_lock(&page_mutex);
    while (!memset_args->context->all_threads_created) {
        qemu_cond_wait(&page_cond, &page_mutex);
    }
    qemu_mutex_unlock(&page_mutex);

    /* unblock SIGBUS */
    sigemptyset(&set);
    sigaddset(&set, SIGBUS);
    pthread_sigmask(SIG_UNBLOCK, &set, &oldset);

    if (sigsetjmp(memset_args->env, 1)) {
        ret = -EFAULT;
    } else {
        char *addr = memset_args->addr;
        size_t numpages = memset_args->numpages;
        size_t hpagesize = memset_args->hpagesize;
        size_t i;
        for (i = 0; i < numpages; i++) {
            /*
             * Read & write back the same value, so we don't
             * corrupt existing user/app data that might be
             * stored.
             *
             * 'volatile' to stop compiler optimizing this away
             * to a no-op
             */
            *(volatile char *)addr = *addr;
            addr += hpagesize;
        }
    }
    pthread_sigmask(SIG_SETMASK, &oldset, NULL);
    return (void *)(uintptr_t)ret;
}

static void *do_madv_populate_write_pages(void *arg)
{
    MemsetThread *memset_args = (MemsetThread *)arg;
    const size_t size = memset_args->numpages * memset_args->hpagesize;
    char * const addr = memset_args->addr;
    int ret = 0;

    /* See do_touch_pages(). */
    qemu_mutex_lock(&page_mutex);
    while (!memset_args->context->all_threads_created) {
        qemu_cond_wait(&page_cond, &page_mutex);
    }
    qemu_mutex_unlock(&page_mutex);

    if (size && qemu_madvise(addr, size, QEMU_MADV_POPULATE_WRITE)) {
        ret = -errno;
    }
    return (void *)(uintptr_t)ret;
}

static inline int get_memset_num_threads(size_t hpagesize, size_t numpages,
                                         int max_threads)
{
    long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
    int ret = 1;

    if (host_procs > 0) {
        ret = MIN(MIN(host_procs, MAX_MEM_PREALLOC_THREAD_COUNT), max_threads);
    }

    /* Especially with gigantic pages, don't create more threads than pages. */
    ret = MIN(ret, numpages);
    /* Don't start threads to prealloc comparatively little memory. */
    ret = MIN(ret, MAX(1, hpagesize * numpages / (64 * MiB)));

    /* In case sysconf() fails, we fall back to single-threaded */
    return ret;
}

static int wait_and_free_mem_prealloc_context(MemsetContext *context)
{
    int i, ret = 0, tmp;

    for (i = 0; i < context->num_threads; i++) {
        tmp = (uintptr_t)qemu_thread_join(&context->threads[i].pgthread);

        if (tmp) {
            ret = tmp;
        }
    }
    g_free(context->threads);
    g_free(context);
    return ret;
}

static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
                           int max_threads, ThreadContext *tc, bool async,
                           bool use_madv_populate_write)
{
    static gsize initialized = 0;
    MemsetContext *context = g_malloc0(sizeof(MemsetContext));
    size_t numpages_per_thread, leftover;
    void *(*touch_fn)(void *);
    int ret, i = 0;
    char *addr = area;

    /*
     * Asynchronous preallocation is only allowed when using
     * MADV_POPULATE_WRITE and a prealloc context for thread placement.
     */
    if (!use_madv_populate_write || !tc) {
        async = false;
    }

    context->num_threads =
        get_memset_num_threads(hpagesize, numpages, max_threads);

    if (g_once_init_enter(&initialized)) {
        qemu_mutex_init(&page_mutex);
        qemu_cond_init(&page_cond);
        g_once_init_leave(&initialized, 1);
    }

    if (use_madv_populate_write) {
        /*
         * Avoid creating a single thread for MADV_POPULATE_WRITE when
         * preallocating synchronously.
         */
        if (context->num_threads == 1 && !async) {
            ret = 0;
            if (qemu_madvise(area, hpagesize * numpages,
                             QEMU_MADV_POPULATE_WRITE)) {
                ret = -errno;
            }
            g_free(context);
            return ret;
        }
        touch_fn = do_madv_populate_write_pages;
    } else {
        touch_fn = do_touch_pages;
    }

    context->threads = g_new0(MemsetThread, context->num_threads);
    numpages_per_thread = numpages / context->num_threads;
    leftover = numpages % context->num_threads;
    for (i = 0; i < context->num_threads; i++) {
        context->threads[i].addr = addr;
        context->threads[i].numpages = numpages_per_thread + (i < leftover);
        context->threads[i].hpagesize = hpagesize;
        context->threads[i].context = context;
        if (tc) {
            thread_context_create_thread(tc, &context->threads[i].pgthread,
                                         "touch_pages",
                                         touch_fn, &context->threads[i],
                                         QEMU_THREAD_JOINABLE);
        } else {
            qemu_thread_create(&context->threads[i].pgthread, "touch_pages",
                               touch_fn, &context->threads[i],
                               QEMU_THREAD_JOINABLE);
        }
        addr += context->threads[i].numpages * hpagesize;
    }

    if (async) {
        /*
         * async requests currently require the BQL. Add it to the list and
         * kick preallocation off during qemu_finish_async_prealloc_mem().
         */
        assert(bql_locked());
        QLIST_INSERT_HEAD(&memset_contexts, context, next);
        return 0;
    }

    if (!use_madv_populate_write) {
        sigbus_memset_context = context;
    }

    qemu_mutex_lock(&page_mutex);
    context->all_threads_created = true;
    qemu_cond_broadcast(&page_cond);
    qemu_mutex_unlock(&page_mutex);

    ret = wait_and_free_mem_prealloc_context(context);

    if (!use_madv_populate_write) {
        sigbus_memset_context = NULL;
    }
    return ret;
}

bool qemu_finish_async_prealloc_mem(Error **errp)
{
    int ret = 0, tmp;
    MemsetContext *context, *next_context;

    /* Waiting for preallocation requires the BQL. */
    assert(bql_locked());
    if (QLIST_EMPTY(&memset_contexts)) {
        return true;
    }

    qemu_mutex_lock(&page_mutex);
    QLIST_FOREACH(context, &memset_contexts, next) {
        context->all_threads_created = true;
    }
    qemu_cond_broadcast(&page_cond);
    qemu_mutex_unlock(&page_mutex);

    QLIST_FOREACH_SAFE(context, &memset_contexts, next, next_context) {
        QLIST_REMOVE(context, next);
        tmp = wait_and_free_mem_prealloc_context(context);
        if (tmp) {
            ret = tmp;
        }
    }

    if (ret) {
        error_setg_errno(errp, -ret,
                         "qemu_prealloc_mem: preallocating memory failed");
        return false;
    }
    return true;
}

static bool madv_populate_write_possible(char *area, size_t pagesize)
{
    return !qemu_madvise(area, pagesize, QEMU_MADV_POPULATE_WRITE) ||
            errno != EINVAL;
}

bool qemu_prealloc_mem(int fd, char *area, size_t sz, int max_threads,
                       ThreadContext *tc, bool async, Error **errp)
{
    static gsize initialized;
    int ret;
#ifndef EMSCRIPTEN
    size_t hpagesize = qemu_fd_getpagesize(fd);
#else
    /*
     * mmap-alloc.c is excluded from the Emscripten build, so
     * qemu_fd_getpagesize is unavailable. Fall back to the lower-level
     * implementation.
     */
    size_t hpagesize = qemu_real_host_page_size();
#endif
    size_t numpages = DIV_ROUND_UP(sz, hpagesize);
    bool use_madv_populate_write;
    struct sigaction act;
    bool rv = true;

    /*
     * Sense on every invocation, as MADV_POPULATE_WRITE cannot be used for
     * some special mappings, such as mapping /dev/mem.
     */
    use_madv_populate_write = madv_populate_write_possible(area, hpagesize);

    if (!use_madv_populate_write) {
        if (g_once_init_enter(&initialized)) {
            qemu_mutex_init(&sigbus_mutex);
            g_once_init_leave(&initialized, 1);
        }

        qemu_mutex_lock(&sigbus_mutex);
        memset(&act, 0, sizeof(act));
#ifdef CONFIG_LINUX
        act.sa_sigaction = &sigbus_handler;
        act.sa_flags = SA_SIGINFO;
#else /* CONFIG_LINUX */
        act.sa_handler = &sigbus_handler;
        act.sa_flags = 0;
#endif /* CONFIG_LINUX */

        ret = sigaction(SIGBUS, &act, &sigbus_oldact);
        if (ret) {
            qemu_mutex_unlock(&sigbus_mutex);
            error_setg_errno(errp, errno,
                "qemu_prealloc_mem: failed to install signal handler");
            return false;
        }
    }

    /* touch pages simultaneously */
    ret = touch_all_pages(area, hpagesize, numpages, max_threads, tc, async,
                          use_madv_populate_write);
    if (ret) {
        error_setg_errno(errp, -ret,
                         "qemu_prealloc_mem: preallocating memory failed");
        rv = false;
    }

    if (!use_madv_populate_write) {
        ret = sigaction(SIGBUS, &sigbus_oldact, NULL);
        if (ret) {
            /* Terminate QEMU since it can't recover from error */
            perror("qemu_prealloc_mem: failed to reinstall signal handler");
            exit(1);
        }
        qemu_mutex_unlock(&sigbus_mutex);
    }
    return rv;
}

char *qemu_get_pid_name(pid_t pid)
{
    char *name = NULL;

#if defined(__FreeBSD__)
    /* BSDs don't have /proc, but they provide a nice substitute */
    struct kinfo_proc *proc = kinfo_getproc(pid);

    if (proc) {
        name = g_strdup(proc->ki_comm);
        free(proc);
    }
#else
    /* Assume a system with reasonable procfs */
    char *pid_path;
    size_t len;

    pid_path = g_strdup_printf("/proc/%d/cmdline", pid);
    g_file_get_contents(pid_path, &name, &len, NULL);
    g_free(pid_path);
#endif

    return name;
}


void *qemu_alloc_stack(size_t *sz)
{
    void *ptr;
    int flags;
#ifdef CONFIG_DEBUG_STACK_USAGE
    void *ptr2;
#endif
    size_t pagesz = qemu_real_host_page_size();
#ifdef _SC_THREAD_STACK_MIN
    /* avoid stacks smaller than _SC_THREAD_STACK_MIN */
    long min_stack_sz = sysconf(_SC_THREAD_STACK_MIN);
    *sz = MAX(MAX(min_stack_sz, 0), *sz);
#endif
    /* adjust stack size to a multiple of the page size */
    *sz = ROUND_UP(*sz, pagesz);
    /* allocate one extra page for the guard page */
    *sz += pagesz;

    flags = MAP_PRIVATE | MAP_ANONYMOUS;
#if defined(MAP_STACK) && defined(__OpenBSD__)
    /* Only enable MAP_STACK on OpenBSD. Other OSes such as
     * Linux/FreeBSD/NetBSD have a flag with the same name
     * but different functionality. OpenBSD will SEGV
     * if it spots execution with a stack pointer pointing
     * at memory that was not allocated with MAP_STACK.
     */
    flags |= MAP_STACK;
#endif

    ptr = mmap(NULL, *sz, PROT_READ | PROT_WRITE, flags, -1, 0);
    if (ptr == MAP_FAILED) {
        perror("failed to allocate memory for stack");
        abort();
    }

    /* Stack grows down -- guard page at the bottom. */
    if (mprotect(ptr, pagesz, PROT_NONE) != 0) {
        perror("failed to set up stack guard page");
        abort();
    }

#ifdef CONFIG_DEBUG_STACK_USAGE
    for (ptr2 = ptr + pagesz; ptr2 < ptr + *sz; ptr2 += sizeof(uint32_t)) {
        *(uint32_t *)ptr2 = 0xdeadbeaf;
    }
#endif

    return ptr;
}

#ifdef CONFIG_DEBUG_STACK_USAGE
static __thread unsigned int max_stack_usage;
#endif

void qemu_free_stack(void *stack, size_t sz)
{
#ifdef CONFIG_DEBUG_STACK_USAGE
    unsigned int usage;
    void *ptr;

    for (ptr = stack + qemu_real_host_page_size(); ptr < stack + sz;
         ptr += sizeof(uint32_t)) {
        if (*(uint32_t *)ptr != 0xdeadbeaf) {
            break;
        }
    }
    usage = sz - (uintptr_t) (ptr - stack);
    if (usage > max_stack_usage) {
        error_report("thread %d max stack usage increased from %u to %u",
                     qemu_get_thread_id(), max_stack_usage, usage);
        max_stack_usage = usage;
    }
#endif

    munmap(stack, sz);
}

/*
 * Disable CFI checks.
 * We are going to call a signal handler directly. Such handler may or may not
 * have been defined in our binary, so there's no guarantee that the pointer
 * used to set the handler is a cfi-valid pointer. Since the handlers are
 * stored in kernel memory, changing the handler to an attacker-defined
 * function requires being able to call a sigaction() syscall,
 * which is not as easy as overwriting a pointer in memory.
 */
QEMU_DISABLE_CFI
void sigaction_invoke(struct sigaction *action,
                      struct qemu_signalfd_siginfo *info)
{
    siginfo_t si = {};
    si.si_signo = info->ssi_signo;
    si.si_errno = info->ssi_errno;
    si.si_code = info->ssi_code;

    /* Convert the minimal set of fields defined by POSIX.
     * Positive si_code values are reserved for kernel-generated
     * signals, where the valid siginfo fields are determined by
     * the signal number. But according to POSIX, it is unspecified
     * whether SI_USER and SI_QUEUE have values less than or equal to
     * zero.
     */
    if (info->ssi_code == SI_USER || info->ssi_code == SI_QUEUE ||
        info->ssi_code <= 0) {
        /* SIGTERM, etc. */
        si.si_pid = info->ssi_pid;
        si.si_uid = info->ssi_uid;
    } else if (info->ssi_signo == SIGILL || info->ssi_signo == SIGFPE ||
               info->ssi_signo == SIGSEGV || info->ssi_signo == SIGBUS) {
        si.si_addr = (void *)(uintptr_t)info->ssi_addr;
    } else if (info->ssi_signo == SIGCHLD) {
        si.si_pid = info->ssi_pid;
        si.si_status = info->ssi_status;
        si.si_uid = info->ssi_uid;
    }
    action->sa_sigaction(info->ssi_signo, &si, NULL);
}

size_t qemu_get_host_physmem(void)
{
#ifdef _SC_PHYS_PAGES
    long pages = sysconf(_SC_PHYS_PAGES);
    if (pages > 0) {
        if (pages > SIZE_MAX / qemu_real_host_page_size()) {
            return SIZE_MAX;
        } else {
            return pages * qemu_real_host_page_size();
        }
    }
#endif
    return 0;
}
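/*
 * Example: with a 4 KiB host page size, syncing 0x100 bytes at address
 * 0x1234 is widened to whole pages below: length grows by the in-page
 * offset 0x234 to 0x334, is rounded up to 0x1000, and addr is aligned
 * down to 0x1000, so msync() covers 0x1000..0x1FFF.
 */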
int qemu_msync(void *addr, size_t length, int fd)
{
    size_t align_mask = ~(qemu_real_host_page_size() - 1);

    /**
     * There are no strict requirements on the length of the mapping to
     * be synced, but it must grow by however much the start address is
     * aligned down, and then be rounded up to a multiple of the host
     * page size.
     */
    length += ((uintptr_t)addr & (qemu_real_host_page_size() - 1));
    length = (length + ~align_mask) & align_mask;

    addr = (void *)((uintptr_t)addr & align_mask);

    return msync(addr, length, MS_SYNC);
}

static bool qemu_close_all_open_fd_proc(const int *skip, unsigned int nskip)
{
    struct dirent *de;
    int fd, dfd;
    DIR *dir;
    unsigned int skip_start = 0, skip_end = nskip;

    dir = opendir("/proc/self/fd");
    if (!dir) {
        /* If /proc is not mounted, there is nothing that can be done. */
        return false;
    }
    /* Avoid closing the directory. */
    dfd = dirfd(dir);

    for (de = readdir(dir); de; de = readdir(dir)) {
        bool close_fd = true;

        if (de->d_name[0] == '.') {
            continue;
        }
        fd = atoi(de->d_name);
        if (fd == dfd) {
            continue;
        }

        for (unsigned int i = skip_start; i < skip_end; i++) {
            if (fd < skip[i]) {
                /* We are below the next skipped fd, break */
                break;
            } else if (fd == skip[i]) {
                close_fd = false;
                /* Restrict the range as we found fds matching start/end */
                if (i == skip_start) {
                    skip_start++;
                } else if (i == skip_end - 1) {
                    skip_end--;
                }
                break;
            }
        }

        if (close_fd) {
            close(fd);
        }
    }
    closedir(dir);

    return true;
}

static bool qemu_close_all_open_fd_close_range(const int *skip,
                                               unsigned int nskip,
                                               int open_max)
{
#ifdef CONFIG_CLOSE_RANGE
    int max_fd = open_max - 1;
    int first = 0, last;
    unsigned int cur_skip = 0;
    int ret;

    do {
        /* Find the start boundary of the range to close */
        while (cur_skip < nskip && first == skip[cur_skip]) {
            cur_skip++;
            first++;
        }

        /* Find the upper boundary of the range to close */
        last = max_fd;
        if (cur_skip < nskip) {
            last = skip[cur_skip] - 1;
            last = MIN(last, max_fd);
        }

        /* With the adjustments to the range, we might be done. */
        if (first > last) {
            break;
        }

        ret = close_range(first, last, 0);
        if (ret < 0) {
            return false;
        }

        first = last + 1;
    } while (last < max_fd);

    return true;
#else
    return false;
#endif
}

static void qemu_close_all_open_fd_fallback(const int *skip, unsigned int nskip,
                                            int open_max)
{
    unsigned int cur_skip = 0;

    /* Fallback */
    for (int i = 0; i < open_max; i++) {
        if (cur_skip < nskip && i == skip[cur_skip]) {
            cur_skip++;
            continue;
        }
        close(i);
    }
}

/*
 * Close all open file descriptors.
 */
void qemu_close_all_open_fd(const int *skip, unsigned int nskip)
{
    int open_max = sysconf(_SC_OPEN_MAX);

    assert(skip != NULL || nskip == 0);

    if (!qemu_close_all_open_fd_close_range(skip, nskip, open_max) &&
        !qemu_close_all_open_fd_proc(skip, nskip)) {
        qemu_close_all_open_fd_fallback(skip, nskip, open_max);
    }
}

int qemu_shm_alloc(size_t size, Error **errp)
{
    g_autoptr(GString) shm_name = g_string_new(NULL);
    int fd, oflag, cur_sequence;
    static int sequence;
    mode_t mode;

    cur_sequence = qatomic_fetch_inc(&sequence);

    /*
     * Let's use `mode = 0` because we don't want other processes to open our
     * memory unless we share the file descriptor with them.
     */
    mode = 0;
    oflag = O_RDWR | O_CREAT | O_EXCL;

    /*
     * Some operating systems allow creating anonymous POSIX shared memory
     * objects (e.g. FreeBSD provides the SHM_ANON constant), but this is not
     * defined by POSIX, so let's create a unique name.
     *
     * From Linux's shm_open(3) man-page:
     *   "For portable use, a shared memory object should be identified
     *   by a name of the form /somename."
     */
    g_string_printf(shm_name, "/qemu-" FMT_pid "-shm-%d", getpid(),
                    cur_sequence);

    fd = shm_open(shm_name->str, oflag, mode);
    if (fd < 0) {
        error_setg_errno(errp, errno,
                         "failed to create POSIX shared memory");
        return -1;
    }

    /*
     * We have the file descriptor, so we no longer need to expose the
     * POSIX shared memory object. However it will remain allocated as long as
     * there are file descriptors pointing to it.
     */
    shm_unlink(shm_name->str);

    if (ftruncate(fd, size) == -1) {
        error_setg_errno(errp, errno,
                         "failed to resize POSIX shared memory to %zu", size);
        close(fd);
        return -1;
    }

    return fd;
}