1 /* 2 * os-posix-lib.c 3 * 4 * Copyright (c) 2003-2008 Fabrice Bellard 5 * Copyright (c) 2010 Red Hat, Inc. 6 * 7 * QEMU library functions on POSIX which are shared between QEMU and 8 * the QEMU tools. 9 * 10 * Permission is hereby granted, free of charge, to any person obtaining a copy 11 * of this software and associated documentation files (the "Software"), to deal 12 * in the Software without restriction, including without limitation the rights 13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 * copies of the Software, and to permit persons to whom the Software is 15 * furnished to do so, subject to the following conditions: 16 * 17 * The above copyright notice and this permission notice shall be included in 18 * all copies or substantial portions of the Software. 19 * 20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 26 * THE SOFTWARE. 27 */ 28 29 #include "qemu/osdep.h" 30 #include <termios.h> 31 32 #include <glib/gprintf.h> 33 34 #include "qemu-common.h" 35 #include "sysemu/sysemu.h" 36 #include "trace.h" 37 #include "qapi/error.h" 38 #include "qemu/sockets.h" 39 #include "qemu/thread.h" 40 #include <libgen.h> 41 #include <sys/signal.h> 42 #include "qemu/cutils.h" 43 44 #ifdef CONFIG_LINUX 45 #include <sys/syscall.h> 46 #endif 47 48 #ifdef __FreeBSD__ 49 #include <sys/sysctl.h> 50 #include <sys/user.h> 51 #include <libutil.h> 52 #endif 53 54 #ifdef __NetBSD__ 55 #include <sys/sysctl.h> 56 #endif 57 58 #include "qemu/mmap-alloc.h" 59 60 #ifdef CONFIG_DEBUG_STACK_USAGE 61 #include "qemu/error-report.h" 62 #endif 63 64 #define MAX_MEM_PREALLOC_THREAD_COUNT 16 65 66 struct MemsetThread { 67 char *addr; 68 size_t numpages; 69 size_t hpagesize; 70 QemuThread pgthread; 71 sigjmp_buf env; 72 }; 73 typedef struct MemsetThread MemsetThread; 74 75 static MemsetThread *memset_thread; 76 static int memset_num_threads; 77 static bool memset_thread_failed; 78 79 int qemu_get_thread_id(void) 80 { 81 #if defined(__linux__) 82 return syscall(SYS_gettid); 83 #else 84 return getpid(); 85 #endif 86 } 87 88 int qemu_daemon(int nochdir, int noclose) 89 { 90 return daemon(nochdir, noclose); 91 } 92 93 bool qemu_write_pidfile(const char *path, Error **errp) 94 { 95 int fd; 96 char pidstr[32]; 97 98 while (1) { 99 struct stat a, b; 100 struct flock lock = { 101 .l_type = F_WRLCK, 102 .l_whence = SEEK_SET, 103 .l_len = 0, 104 }; 105 106 fd = qemu_open(path, O_CREAT | O_WRONLY, S_IRUSR | S_IWUSR); 107 if (fd == -1) { 108 error_setg_errno(errp, errno, "Cannot open pid file"); 109 return false; 110 } 111 112 if (fstat(fd, &b) < 0) { 113 error_setg_errno(errp, errno, "Cannot stat file"); 114 goto fail_close; 115 } 116 117 if (fcntl(fd, F_SETLK, &lock)) { 118 error_setg_errno(errp, errno, "Cannot lock pid file"); 119 goto fail_close; 120 } 121 122 /* 123 * Now make sure the path we locked is the same one that now 124 * exists on the filesystem. 125 */ 126 if (stat(path, &a) < 0) { 127 /* 128 * PID file disappeared, someone else must be racing with 129 * us, so try again. 130 */ 131 close(fd); 132 continue; 133 } 134 135 if (a.st_ino == b.st_ino) { 136 break; 137 } 138 139 /* 140 * PID file was recreated, someone else must be racing with 141 * us, so try again. 142 */ 143 close(fd); 144 } 145 146 if (ftruncate(fd, 0) < 0) { 147 error_setg_errno(errp, errno, "Failed to truncate pid file"); 148 goto fail_unlink; 149 } 150 151 snprintf(pidstr, sizeof(pidstr), FMT_pid "\n", getpid()); 152 if (write(fd, pidstr, strlen(pidstr)) != strlen(pidstr)) { 153 error_setg(errp, "Failed to write pid file"); 154 goto fail_unlink; 155 } 156 157 return true; 158 159 fail_unlink: 160 unlink(path); 161 fail_close: 162 close(fd); 163 return false; 164 } 165 166 void *qemu_oom_check(void *ptr) 167 { 168 if (ptr == NULL) { 169 fprintf(stderr, "Failed to allocate memory: %s\n", strerror(errno)); 170 abort(); 171 } 172 return ptr; 173 } 174 175 void *qemu_try_memalign(size_t alignment, size_t size) 176 { 177 void *ptr; 178 179 if (alignment < sizeof(void*)) { 180 alignment = sizeof(void*); 181 } 182 183 #if defined(CONFIG_POSIX_MEMALIGN) 184 int ret; 185 ret = posix_memalign(&ptr, alignment, size); 186 if (ret != 0) { 187 errno = ret; 188 ptr = NULL; 189 } 190 #elif defined(CONFIG_BSD) 191 ptr = valloc(size); 192 #else 193 ptr = memalign(alignment, size); 194 #endif 195 trace_qemu_memalign(alignment, size, ptr); 196 return ptr; 197 } 198 199 void *qemu_memalign(size_t alignment, size_t size) 200 { 201 return qemu_oom_check(qemu_try_memalign(alignment, size)); 202 } 203 204 /* alloc shared memory pages */ 205 void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared) 206 { 207 size_t align = QEMU_VMALLOC_ALIGN; 208 void *ptr = qemu_ram_mmap(-1, size, align, shared, false); 209 210 if (ptr == MAP_FAILED) { 211 return NULL; 212 } 213 214 if (alignment) { 215 *alignment = align; 216 } 217 218 trace_qemu_anon_ram_alloc(size, ptr); 219 return ptr; 220 } 221 222 void qemu_vfree(void *ptr) 223 { 224 trace_qemu_vfree(ptr); 225 free(ptr); 226 } 227 228 void qemu_anon_ram_free(void *ptr, size_t size) 229 { 230 trace_qemu_anon_ram_free(ptr, size); 231 qemu_ram_munmap(-1, ptr, size); 232 } 233 234 void qemu_set_block(int fd) 235 { 236 int f; 237 f = fcntl(fd, F_GETFL); 238 assert(f != -1); 239 f = fcntl(fd, F_SETFL, f & ~O_NONBLOCK); 240 assert(f != -1); 241 } 242 243 void qemu_set_nonblock(int fd) 244 { 245 int f; 246 f = fcntl(fd, F_GETFL); 247 assert(f != -1); 248 f = fcntl(fd, F_SETFL, f | O_NONBLOCK); 249 #ifdef __OpenBSD__ 250 if (f == -1) { 251 /* 252 * Previous to OpenBSD 6.3, fcntl(F_SETFL) is not permitted on 253 * memory devices and sets errno to ENODEV. 254 * It's OK if we fail to set O_NONBLOCK on devices like /dev/null, 255 * because they will never block anyway. 256 */ 257 assert(errno == ENODEV); 258 } 259 #else 260 assert(f != -1); 261 #endif 262 } 263 264 int socket_set_fast_reuse(int fd) 265 { 266 int val = 1, ret; 267 268 ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, 269 (const char *)&val, sizeof(val)); 270 271 assert(ret == 0); 272 273 return ret; 274 } 275 276 void qemu_set_cloexec(int fd) 277 { 278 int f; 279 f = fcntl(fd, F_GETFD); 280 assert(f != -1); 281 f = fcntl(fd, F_SETFD, f | FD_CLOEXEC); 282 assert(f != -1); 283 } 284 285 /* 286 * Creates a pipe with FD_CLOEXEC set on both file descriptors 287 */ 288 int qemu_pipe(int pipefd[2]) 289 { 290 int ret; 291 292 #ifdef CONFIG_PIPE2 293 ret = pipe2(pipefd, O_CLOEXEC); 294 if (ret != -1 || errno != ENOSYS) { 295 return ret; 296 } 297 #endif 298 ret = pipe(pipefd); 299 if (ret == 0) { 300 qemu_set_cloexec(pipefd[0]); 301 qemu_set_cloexec(pipefd[1]); 302 } 303 304 return ret; 305 } 306 307 char * 308 qemu_get_local_state_pathname(const char *relative_pathname) 309 { 310 return g_strdup_printf("%s/%s", CONFIG_QEMU_LOCALSTATEDIR, 311 relative_pathname); 312 } 313 314 void qemu_set_tty_echo(int fd, bool echo) 315 { 316 struct termios tty; 317 318 tcgetattr(fd, &tty); 319 320 if (echo) { 321 tty.c_lflag |= ECHO | ECHONL | ICANON | IEXTEN; 322 } else { 323 tty.c_lflag &= ~(ECHO | ECHONL | ICANON | IEXTEN); 324 } 325 326 tcsetattr(fd, TCSANOW, &tty); 327 } 328 329 static char exec_dir[PATH_MAX]; 330 331 void qemu_init_exec_dir(const char *argv0) 332 { 333 char *dir; 334 char *p = NULL; 335 char buf[PATH_MAX]; 336 337 assert(!exec_dir[0]); 338 339 #if defined(__linux__) 340 { 341 int len; 342 len = readlink("/proc/self/exe", buf, sizeof(buf) - 1); 343 if (len > 0) { 344 buf[len] = 0; 345 p = buf; 346 } 347 } 348 #elif defined(__FreeBSD__) \ 349 || (defined(__NetBSD__) && defined(KERN_PROC_PATHNAME)) 350 { 351 #if defined(__FreeBSD__) 352 static int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1}; 353 #else 354 static int mib[4] = {CTL_KERN, KERN_PROC_ARGS, -1, KERN_PROC_PATHNAME}; 355 #endif 356 size_t len = sizeof(buf) - 1; 357 358 *buf = '\0'; 359 if (!sysctl(mib, ARRAY_SIZE(mib), buf, &len, NULL, 0) && 360 *buf) { 361 buf[sizeof(buf) - 1] = '\0'; 362 p = buf; 363 } 364 } 365 #endif 366 /* If we don't have any way of figuring out the actual executable 367 location then try argv[0]. */ 368 if (!p) { 369 if (!argv0) { 370 return; 371 } 372 p = realpath(argv0, buf); 373 if (!p) { 374 return; 375 } 376 } 377 dir = g_path_get_dirname(p); 378 379 pstrcpy(exec_dir, sizeof(exec_dir), dir); 380 381 g_free(dir); 382 } 383 384 char *qemu_get_exec_dir(void) 385 { 386 return g_strdup(exec_dir); 387 } 388 389 static void sigbus_handler(int signal) 390 { 391 int i; 392 if (memset_thread) { 393 for (i = 0; i < memset_num_threads; i++) { 394 if (qemu_thread_is_self(&memset_thread[i].pgthread)) { 395 siglongjmp(memset_thread[i].env, 1); 396 } 397 } 398 } 399 } 400 401 static void *do_touch_pages(void *arg) 402 { 403 MemsetThread *memset_args = (MemsetThread *)arg; 404 sigset_t set, oldset; 405 406 /* unblock SIGBUS */ 407 sigemptyset(&set); 408 sigaddset(&set, SIGBUS); 409 pthread_sigmask(SIG_UNBLOCK, &set, &oldset); 410 411 if (sigsetjmp(memset_args->env, 1)) { 412 memset_thread_failed = true; 413 } else { 414 char *addr = memset_args->addr; 415 size_t numpages = memset_args->numpages; 416 size_t hpagesize = memset_args->hpagesize; 417 size_t i; 418 for (i = 0; i < numpages; i++) { 419 /* 420 * Read & write back the same value, so we don't 421 * corrupt existing user/app data that might be 422 * stored. 423 * 424 * 'volatile' to stop compiler optimizing this away 425 * to a no-op 426 * 427 * TODO: get a better solution from kernel so we 428 * don't need to write at all so we don't cause 429 * wear on the storage backing the region... 430 */ 431 *(volatile char *)addr = *addr; 432 addr += hpagesize; 433 } 434 } 435 pthread_sigmask(SIG_SETMASK, &oldset, NULL); 436 return NULL; 437 } 438 439 static inline int get_memset_num_threads(int smp_cpus) 440 { 441 long host_procs = sysconf(_SC_NPROCESSORS_ONLN); 442 int ret = 1; 443 444 if (host_procs > 0) { 445 ret = MIN(MIN(host_procs, MAX_MEM_PREALLOC_THREAD_COUNT), smp_cpus); 446 } 447 /* In case sysconf() fails, we fall back to single threaded */ 448 return ret; 449 } 450 451 static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages, 452 int smp_cpus) 453 { 454 size_t numpages_per_thread; 455 size_t size_per_thread; 456 char *addr = area; 457 int i = 0; 458 459 memset_thread_failed = false; 460 memset_num_threads = get_memset_num_threads(smp_cpus); 461 memset_thread = g_new0(MemsetThread, memset_num_threads); 462 numpages_per_thread = (numpages / memset_num_threads); 463 size_per_thread = (hpagesize * numpages_per_thread); 464 for (i = 0; i < memset_num_threads; i++) { 465 memset_thread[i].addr = addr; 466 memset_thread[i].numpages = (i == (memset_num_threads - 1)) ? 467 numpages : numpages_per_thread; 468 memset_thread[i].hpagesize = hpagesize; 469 qemu_thread_create(&memset_thread[i].pgthread, "touch_pages", 470 do_touch_pages, &memset_thread[i], 471 QEMU_THREAD_JOINABLE); 472 addr += size_per_thread; 473 numpages -= numpages_per_thread; 474 } 475 for (i = 0; i < memset_num_threads; i++) { 476 qemu_thread_join(&memset_thread[i].pgthread); 477 } 478 g_free(memset_thread); 479 memset_thread = NULL; 480 481 return memset_thread_failed; 482 } 483 484 void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus, 485 Error **errp) 486 { 487 int ret; 488 struct sigaction act, oldact; 489 size_t hpagesize = qemu_fd_getpagesize(fd); 490 size_t numpages = DIV_ROUND_UP(memory, hpagesize); 491 492 memset(&act, 0, sizeof(act)); 493 act.sa_handler = &sigbus_handler; 494 act.sa_flags = 0; 495 496 ret = sigaction(SIGBUS, &act, &oldact); 497 if (ret) { 498 error_setg_errno(errp, errno, 499 "os_mem_prealloc: failed to install signal handler"); 500 return; 501 } 502 503 /* touch pages simultaneously */ 504 if (touch_all_pages(area, hpagesize, numpages, smp_cpus)) { 505 error_setg(errp, "os_mem_prealloc: Insufficient free host memory " 506 "pages available to allocate guest RAM"); 507 } 508 509 ret = sigaction(SIGBUS, &oldact, NULL); 510 if (ret) { 511 /* Terminate QEMU since it can't recover from error */ 512 perror("os_mem_prealloc: failed to reinstall signal handler"); 513 exit(1); 514 } 515 } 516 517 char *qemu_get_pid_name(pid_t pid) 518 { 519 char *name = NULL; 520 521 #if defined(__FreeBSD__) 522 /* BSDs don't have /proc, but they provide a nice substitute */ 523 struct kinfo_proc *proc = kinfo_getproc(pid); 524 525 if (proc) { 526 name = g_strdup(proc->ki_comm); 527 free(proc); 528 } 529 #else 530 /* Assume a system with reasonable procfs */ 531 char *pid_path; 532 size_t len; 533 534 pid_path = g_strdup_printf("/proc/%d/cmdline", pid); 535 g_file_get_contents(pid_path, &name, &len, NULL); 536 g_free(pid_path); 537 #endif 538 539 return name; 540 } 541 542 543 pid_t qemu_fork(Error **errp) 544 { 545 sigset_t oldmask, newmask; 546 struct sigaction sig_action; 547 int saved_errno; 548 pid_t pid; 549 550 /* 551 * Need to block signals now, so that child process can safely 552 * kill off caller's signal handlers without a race. 553 */ 554 sigfillset(&newmask); 555 if (pthread_sigmask(SIG_SETMASK, &newmask, &oldmask) != 0) { 556 error_setg_errno(errp, errno, 557 "cannot block signals"); 558 return -1; 559 } 560 561 pid = fork(); 562 saved_errno = errno; 563 564 if (pid < 0) { 565 /* attempt to restore signal mask, but ignore failure, to 566 * avoid obscuring the fork failure */ 567 (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL); 568 error_setg_errno(errp, saved_errno, 569 "cannot fork child process"); 570 errno = saved_errno; 571 return -1; 572 } else if (pid) { 573 /* parent process */ 574 575 /* Restore our original signal mask now that the child is 576 * safely running. Only documented failures are EFAULT (not 577 * possible, since we are using just-grabbed mask) or EINVAL 578 * (not possible, since we are using correct arguments). */ 579 (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL); 580 } else { 581 /* child process */ 582 size_t i; 583 584 /* Clear out all signal handlers from parent so nothing 585 * unexpected can happen in our child once we unblock 586 * signals */ 587 sig_action.sa_handler = SIG_DFL; 588 sig_action.sa_flags = 0; 589 sigemptyset(&sig_action.sa_mask); 590 591 for (i = 1; i < NSIG; i++) { 592 /* Only possible errors are EFAULT or EINVAL The former 593 * won't happen, the latter we expect, so no need to check 594 * return value */ 595 (void)sigaction(i, &sig_action, NULL); 596 } 597 598 /* Unmask all signals in child, since we've no idea what the 599 * caller's done with their signal mask and don't want to 600 * propagate that to children */ 601 sigemptyset(&newmask); 602 if (pthread_sigmask(SIG_SETMASK, &newmask, NULL) != 0) { 603 Error *local_err = NULL; 604 error_setg_errno(&local_err, errno, 605 "cannot unblock signals"); 606 error_report_err(local_err); 607 _exit(1); 608 } 609 } 610 return pid; 611 } 612 613 void *qemu_alloc_stack(size_t *sz) 614 { 615 void *ptr, *guardpage; 616 int flags; 617 #ifdef CONFIG_DEBUG_STACK_USAGE 618 void *ptr2; 619 #endif 620 size_t pagesz = qemu_real_host_page_size; 621 #ifdef _SC_THREAD_STACK_MIN 622 /* avoid stacks smaller than _SC_THREAD_STACK_MIN */ 623 long min_stack_sz = sysconf(_SC_THREAD_STACK_MIN); 624 *sz = MAX(MAX(min_stack_sz, 0), *sz); 625 #endif 626 /* adjust stack size to a multiple of the page size */ 627 *sz = ROUND_UP(*sz, pagesz); 628 /* allocate one extra page for the guard page */ 629 *sz += pagesz; 630 631 flags = MAP_PRIVATE | MAP_ANONYMOUS; 632 #if defined(MAP_STACK) && defined(__OpenBSD__) 633 /* Only enable MAP_STACK on OpenBSD. Other OS's such as 634 * Linux/FreeBSD/NetBSD have a flag with the same name 635 * but have differing functionality. OpenBSD will SEGV 636 * if it spots execution with a stack pointer pointing 637 * at memory that was not allocated with MAP_STACK. 638 */ 639 flags |= MAP_STACK; 640 #endif 641 642 ptr = mmap(NULL, *sz, PROT_READ | PROT_WRITE, flags, -1, 0); 643 if (ptr == MAP_FAILED) { 644 perror("failed to allocate memory for stack"); 645 abort(); 646 } 647 648 #if defined(HOST_IA64) 649 /* separate register stack */ 650 guardpage = ptr + (((*sz - pagesz) / 2) & ~pagesz); 651 #elif defined(HOST_HPPA) 652 /* stack grows up */ 653 guardpage = ptr + *sz - pagesz; 654 #else 655 /* stack grows down */ 656 guardpage = ptr; 657 #endif 658 if (mprotect(guardpage, pagesz, PROT_NONE) != 0) { 659 perror("failed to set up stack guard page"); 660 abort(); 661 } 662 663 #ifdef CONFIG_DEBUG_STACK_USAGE 664 for (ptr2 = ptr + pagesz; ptr2 < ptr + *sz; ptr2 += sizeof(uint32_t)) { 665 *(uint32_t *)ptr2 = 0xdeadbeaf; 666 } 667 #endif 668 669 return ptr; 670 } 671 672 #ifdef CONFIG_DEBUG_STACK_USAGE 673 static __thread unsigned int max_stack_usage; 674 #endif 675 676 void qemu_free_stack(void *stack, size_t sz) 677 { 678 #ifdef CONFIG_DEBUG_STACK_USAGE 679 unsigned int usage; 680 void *ptr; 681 682 for (ptr = stack + qemu_real_host_page_size; ptr < stack + sz; 683 ptr += sizeof(uint32_t)) { 684 if (*(uint32_t *)ptr != 0xdeadbeaf) { 685 break; 686 } 687 } 688 usage = sz - (uintptr_t) (ptr - stack); 689 if (usage > max_stack_usage) { 690 error_report("thread %d max stack usage increased from %u to %u", 691 qemu_get_thread_id(), max_stack_usage, usage); 692 max_stack_usage = usage; 693 } 694 #endif 695 696 munmap(stack, sz); 697 } 698 699 void sigaction_invoke(struct sigaction *action, 700 struct qemu_signalfd_siginfo *info) 701 { 702 siginfo_t si = {}; 703 si.si_signo = info->ssi_signo; 704 si.si_errno = info->ssi_errno; 705 si.si_code = info->ssi_code; 706 707 /* Convert the minimal set of fields defined by POSIX. 708 * Positive si_code values are reserved for kernel-generated 709 * signals, where the valid siginfo fields are determined by 710 * the signal number. But according to POSIX, it is unspecified 711 * whether SI_USER and SI_QUEUE have values less than or equal to 712 * zero. 713 */ 714 if (info->ssi_code == SI_USER || info->ssi_code == SI_QUEUE || 715 info->ssi_code <= 0) { 716 /* SIGTERM, etc. */ 717 si.si_pid = info->ssi_pid; 718 si.si_uid = info->ssi_uid; 719 } else if (info->ssi_signo == SIGILL || info->ssi_signo == SIGFPE || 720 info->ssi_signo == SIGSEGV || info->ssi_signo == SIGBUS) { 721 si.si_addr = (void *)(uintptr_t)info->ssi_addr; 722 } else if (info->ssi_signo == SIGCHLD) { 723 si.si_pid = info->ssi_pid; 724 si.si_status = info->ssi_status; 725 si.si_uid = info->ssi_uid; 726 } 727 action->sa_sigaction(info->ssi_signo, &si, NULL); 728 } 729