xref: /openbmc/qemu/linux-user/elfload.c (revision 86ce6e07739b98137e9401076f54a3e89f6a5fbd)
/* This is the Linux kernel elf-loading code, ported into user space */
#include "qemu/osdep.h"
#include <sys/param.h>

#include <sys/prctl.h>
#include <sys/resource.h>
#include <sys/shm.h>

#include "qemu.h"
#include "user/tswap-target.h"
#include "user/page-protection.h"
#include "exec/page-protection.h"
#include "exec/mmap-lock.h"
#include "exec/translation-block.h"
#include "exec/tswap.h"
#include "user/guest-base.h"
#include "user-internals.h"
#include "signal-common.h"
#include "loader.h"
#include "user-mmap.h"
#include "disas/disas.h"
#include "qemu/bitops.h"
#include "qemu/path.h"
#include "qemu/queue.h"
#include "qemu/guest-random.h"
#include "qemu/units.h"
#include "qemu/selfmap.h"
#include "qemu/lockable.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "target_elf.h"
#include "target_signal.h"
#include "tcg/debuginfo.h"

#ifdef TARGET_ARM
#include "target/arm/cpu-features.h"
#endif

#ifndef TARGET_ARCH_HAS_SIGTRAMP_PAGE
#define TARGET_ARCH_HAS_SIGTRAMP_PAGE 0
#endif

#define ELF_OSABI   ELFOSABI_SYSV

/* from personality.h */

/*
 * Flags for bug emulation.
 *
 * These occupy the top three bytes.
 */
enum {
    ADDR_NO_RANDOMIZE = 0x0040000,      /* disable randomization of VA space */
    FDPIC_FUNCPTRS =    0x0080000,      /* userspace function ptrs point to
                                           descriptors (signal handling) */
    MMAP_PAGE_ZERO =    0x0100000,
    ADDR_COMPAT_LAYOUT = 0x0200000,
    READ_IMPLIES_EXEC = 0x0400000,
    ADDR_LIMIT_32BIT =  0x0800000,
    SHORT_INODE =       0x1000000,
    WHOLE_SECONDS =     0x2000000,
    STICKY_TIMEOUTS =   0x4000000,
    ADDR_LIMIT_3GB =    0x8000000,
};

/*
 * Personality types.
 *
 * These go in the low byte.  Avoid using the top bit, it will
 * conflict with error returns.
 */
enum {
    PER_LINUX =         0x0000,
    PER_LINUX_32BIT =   0x0000 | ADDR_LIMIT_32BIT,
    PER_LINUX_FDPIC =   0x0000 | FDPIC_FUNCPTRS,
    PER_SVR4 =          0x0001 | STICKY_TIMEOUTS | MMAP_PAGE_ZERO,
    PER_SVR3 =          0x0002 | STICKY_TIMEOUTS | SHORT_INODE,
    PER_SCOSVR3 =       0x0003 | STICKY_TIMEOUTS | WHOLE_SECONDS | SHORT_INODE,
    PER_OSR5 =          0x0003 | STICKY_TIMEOUTS | WHOLE_SECONDS,
    PER_WYSEV386 =      0x0004 | STICKY_TIMEOUTS | SHORT_INODE,
    PER_ISCR4 =         0x0005 | STICKY_TIMEOUTS,
    PER_BSD =           0x0006,
    PER_SUNOS =         0x0006 | STICKY_TIMEOUTS,
    PER_XENIX =         0x0007 | STICKY_TIMEOUTS | SHORT_INODE,
    PER_LINUX32 =       0x0008,
    PER_LINUX32_3GB =   0x0008 | ADDR_LIMIT_3GB,
    PER_IRIX32 =        0x0009 | STICKY_TIMEOUTS,/* IRIX5 32-bit */
    PER_IRIXN32 =       0x000a | STICKY_TIMEOUTS,/* IRIX6 new 32-bit */
    PER_IRIX64 =        0x000b | STICKY_TIMEOUTS,/* IRIX6 64-bit */
    PER_RISCOS =        0x000c,
    PER_SOLARIS =       0x000d | STICKY_TIMEOUTS,
    PER_UW7 =           0x000e | STICKY_TIMEOUTS | MMAP_PAGE_ZERO,
    PER_OSF4 =          0x000f,                  /* OSF/1 v4 */
    PER_HPUX =          0x0010,
    PER_MASK =          0x00ff,
};

/*
 * Return the base personality without flags.
 */
#define personality(pers)       (pers & PER_MASK)

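/*
 * For example, personality(PER_SCOSVR3) yields 0x0003: the
 * STICKY_TIMEOUTS, WHOLE_SECONDS and SHORT_INODE bug-emulation bits
 * all sit above PER_MASK and are stripped by the macro.
 */
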
int info_is_fdpic(struct image_info *info)
{
    return info->personality == PER_LINUX_FDPIC;
}

#if TARGET_BIG_ENDIAN
#define ELF_DATA        ELFDATA2MSB
#else
#define ELF_DATA        ELFDATA2LSB
#endif

#ifdef USE_UID16
typedef abi_ushort      target_uid_t;
typedef abi_ushort      target_gid_t;
#else
typedef abi_uint        target_uid_t;
typedef abi_uint        target_gid_t;
#endif
typedef abi_int         target_pid_t;

#ifndef elf_check_machine
#define elf_check_machine(x) ((x) == ELF_MACHINE)
#endif

#ifndef elf_check_abi
#define elf_check_abi(x) (1)
#endif

#ifndef STACK_GROWS_DOWN
#define STACK_GROWS_DOWN 1
#endif
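/*
 * As of this writing, hppa is the only linux-user target whose stack
 * grows upward; see the guard-page handling in setup_arg_pages() below.
 */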

#ifndef STACK_ALIGNMENT
#define STACK_ALIGNMENT 16
#endif

#ifdef TARGET_ABI32
#undef ELF_CLASS
#define ELF_CLASS ELFCLASS32
#undef bswaptls
#define bswaptls(ptr) bswap32s(ptr)
#endif

#ifndef EXSTACK_DEFAULT
#define EXSTACK_DEFAULT false
#endif

/*
 * Provide fallback definitions that the target may omit.
 * One way or another, we'll get a link error if the setting of
 * HAVE_* doesn't match the implementation.
 */
#ifndef HAVE_ELF_HWCAP
abi_ulong get_elf_hwcap(CPUState *cs) { return 0; }
#endif
#ifndef HAVE_ELF_HWCAP2
abi_ulong get_elf_hwcap2(CPUState *cs) { g_assert_not_reached(); }
#define HAVE_ELF_HWCAP2 0
#endif
#ifndef HAVE_ELF_PLATFORM
const char *get_elf_platform(CPUState *cs) { return NULL; }
#endif
#ifndef HAVE_ELF_BASE_PLATFORM
const char *get_elf_base_platform(CPUState *cs) { return NULL; }
#endif

#ifndef HAVE_ELF_GNU_PROPERTY
bool arch_parse_elf_property(uint32_t pr_type, uint32_t pr_datasz,
                             const uint32_t *data, struct image_info *info,
                             Error **errp)
{
    g_assert_not_reached();
}
#define HAVE_ELF_GNU_PROPERTY 0
#endif

#include "elf.h"

#define DLINFO_ITEMS 16

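/*
 * Kernel-heritage helper: in the kernel this copied from the user-space
 * 'fs' segment, but in QEMU user emulation both buffers are ordinary
 * host memory, so a plain memcpy suffices.
 */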
static inline void memcpy_fromfs(void * to, const void * from, unsigned long n)
{
    memcpy(to, from, n);
}

static void bswap_ehdr(struct elfhdr *ehdr)
{
    if (!target_needs_bswap()) {
        return;
    }

    bswap16s(&ehdr->e_type);            /* Object file type */
    bswap16s(&ehdr->e_machine);         /* Architecture */
    bswap32s(&ehdr->e_version);         /* Object file version */
    bswaptls(&ehdr->e_entry);           /* Entry point virtual address */
    bswaptls(&ehdr->e_phoff);           /* Program header table file offset */
    bswaptls(&ehdr->e_shoff);           /* Section header table file offset */
    bswap32s(&ehdr->e_flags);           /* Processor-specific flags */
    bswap16s(&ehdr->e_ehsize);          /* ELF header size in bytes */
    bswap16s(&ehdr->e_phentsize);       /* Program header table entry size */
    bswap16s(&ehdr->e_phnum);           /* Program header table entry count */
    bswap16s(&ehdr->e_shentsize);       /* Section header table entry size */
    bswap16s(&ehdr->e_shnum);           /* Section header table entry count */
    bswap16s(&ehdr->e_shstrndx);        /* Section header string table index */
}

static void bswap_phdr(struct elf_phdr *phdr, int phnum)
{
    if (!target_needs_bswap()) {
        return;
    }

    for (int i = 0; i < phnum; ++i, ++phdr) {
        bswap32s(&phdr->p_type);        /* Segment type */
        bswap32s(&phdr->p_flags);       /* Segment flags */
        bswaptls(&phdr->p_offset);      /* Segment file offset */
        bswaptls(&phdr->p_vaddr);       /* Segment virtual address */
        bswaptls(&phdr->p_paddr);       /* Segment physical address */
        bswaptls(&phdr->p_filesz);      /* Segment size in file */
        bswaptls(&phdr->p_memsz);       /* Segment size in memory */
        bswaptls(&phdr->p_align);       /* Segment alignment */
    }
}

static void bswap_shdr(struct elf_shdr *shdr, int shnum)
{
    if (!target_needs_bswap()) {
        return;
    }

    for (int i = 0; i < shnum; ++i, ++shdr) {
        bswap32s(&shdr->sh_name);
        bswap32s(&shdr->sh_type);
        bswaptls(&shdr->sh_flags);
        bswaptls(&shdr->sh_addr);
        bswaptls(&shdr->sh_offset);
        bswaptls(&shdr->sh_size);
        bswap32s(&shdr->sh_link);
        bswap32s(&shdr->sh_info);
        bswaptls(&shdr->sh_addralign);
        bswaptls(&shdr->sh_entsize);
    }
}

static void bswap_sym(struct elf_sym *sym)
{
    if (!target_needs_bswap()) {
        return;
    }

    bswap32s(&sym->st_name);
    bswaptls(&sym->st_value);
    bswaptls(&sym->st_size);
    bswap16s(&sym->st_shndx);
}

#ifdef TARGET_MIPS
static void bswap_mips_abiflags(Mips_elf_abiflags_v0 *abiflags)
{
    if (!target_needs_bswap()) {
        return;
    }

    bswap16s(&abiflags->version);
    bswap32s(&abiflags->ases);
    bswap32s(&abiflags->isa_ext);
    bswap32s(&abiflags->flags1);
    bswap32s(&abiflags->flags2);
}
#endif

#ifdef HAVE_ELF_CORE_DUMP
static int elf_core_dump(int, const CPUArchState *);
#endif /* HAVE_ELF_CORE_DUMP */
static void load_symbols(struct elfhdr *hdr, const ImageSource *src,
                         abi_ulong load_bias);

/* Verify the portions of EHDR within E_IDENT for the target.
   This can be performed before bswapping the entire header.  */
static bool elf_check_ident(struct elfhdr *ehdr)
{
    return (ehdr->e_ident[EI_MAG0] == ELFMAG0
            && ehdr->e_ident[EI_MAG1] == ELFMAG1
            && ehdr->e_ident[EI_MAG2] == ELFMAG2
            && ehdr->e_ident[EI_MAG3] == ELFMAG3
            && ehdr->e_ident[EI_CLASS] == ELF_CLASS
            && ehdr->e_ident[EI_DATA] == ELF_DATA
            && ehdr->e_ident[EI_VERSION] == EV_CURRENT);
}

/* Verify the portions of EHDR outside of E_IDENT for the target.
   This has to wait until after bswapping the header.  */
static bool elf_check_ehdr(struct elfhdr *ehdr)
{
    return (elf_check_machine(ehdr->e_machine)
            && elf_check_abi(ehdr->e_flags)
            && ehdr->e_ehsize == sizeof(struct elfhdr)
            && ehdr->e_phentsize == sizeof(struct elf_phdr)
            && (ehdr->e_type == ET_EXEC || ehdr->e_type == ET_DYN));
}

/*
 * 'copy_elf_strings()' copies argument/environment strings from user
 * memory to free pages in kernel mem. These are in a format ready
 * to be put directly into the top of new user memory.
 */
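/*
 * A sketch of the STACK_GROWS_DOWN case below: arguments are copied
 * last-to-first, and each string tail-first, staged through the
 * page-sized 'scratch' buffer; whenever a full target page has been
 * assembled (offset reaches 0), it is flushed to guest memory with a
 * single memcpy_to_target() call.
 */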
static abi_ulong copy_elf_strings(int argc, char **argv, char *scratch,
                                  abi_ulong p, abi_ulong stack_limit)
{
    char *tmp;
    int len, i;
    abi_ulong top = p;

    if (!p) {
        return 0;       /* bullet-proofing */
    }

    if (STACK_GROWS_DOWN) {
        int offset = ((p - 1) % TARGET_PAGE_SIZE) + 1;
        for (i = argc - 1; i >= 0; --i) {
            tmp = argv[i];
            if (!tmp) {
                fprintf(stderr, "VFS: argc is wrong");
                exit(-1);
            }
            len = strlen(tmp) + 1;
            tmp += len;

            if (len > (p - stack_limit)) {
                return 0;
            }
            while (len) {
                int bytes_to_copy = (len > offset) ? offset : len;
                tmp -= bytes_to_copy;
                p -= bytes_to_copy;
                offset -= bytes_to_copy;
                len -= bytes_to_copy;

                memcpy_fromfs(scratch + offset, tmp, bytes_to_copy);

                if (offset == 0) {
                    memcpy_to_target(p, scratch, top - p);
                    top = p;
                    offset = TARGET_PAGE_SIZE;
                }
            }
        }
        if (p != top) {
            memcpy_to_target(p, scratch + offset, top - p);
        }
    } else {
        int remaining = TARGET_PAGE_SIZE - (p % TARGET_PAGE_SIZE);
        for (i = 0; i < argc; ++i) {
            tmp = argv[i];
            if (!tmp) {
                fprintf(stderr, "VFS: argc is wrong");
                exit(-1);
            }
            len = strlen(tmp) + 1;
            if (len > (stack_limit - p)) {
                return 0;
            }
            while (len) {
                int bytes_to_copy = (len > remaining) ? remaining : len;

                memcpy_fromfs(scratch + (p - top), tmp, bytes_to_copy);

                tmp += bytes_to_copy;
                remaining -= bytes_to_copy;
                p += bytes_to_copy;
                len -= bytes_to_copy;

                if (remaining == 0) {
                    memcpy_to_target(top, scratch, p - top);
                    top = p;
                    remaining = TARGET_PAGE_SIZE;
                }
            }
        }
        if (p != top) {
            memcpy_to_target(top, scratch, p - top);
        }
    }

    return p;
}

/* Older linux kernels provide up to MAX_ARG_PAGES (default: 32) of
 * argument/environment space. Newer kernels (>2.6.33) allow more,
 * dependent on stack size, but guarantee at least 32 pages for
 * backwards compatibility.
 */
#define STACK_LOWER_LIMIT (32 * TARGET_PAGE_SIZE)
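/* E.g. with 4 KiB target pages this guarantees 128 KiB of argument space. */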

static abi_ulong setup_arg_pages(struct linux_binprm *bprm,
                                 struct image_info *info)
{
    abi_ulong size, error, guard;
    int prot;

    size = guest_stack_size;
    if (size < STACK_LOWER_LIMIT) {
        size = STACK_LOWER_LIMIT;
    }

    if (STACK_GROWS_DOWN) {
        guard = TARGET_PAGE_SIZE;
        if (guard < qemu_real_host_page_size()) {
            guard = qemu_real_host_page_size();
        }
    } else {
        /* no guard page for hppa target where stack grows upwards. */
        guard = 0;
    }

    prot = PROT_READ | PROT_WRITE;
    if (info->exec_stack) {
        prot |= PROT_EXEC;
    }
    error = target_mmap(0, size + guard, prot,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (error == -1) {
        perror("mmap stack");
        exit(-1);
    }

    /* We reserve one extra page at the top of the stack as guard.  */
    if (STACK_GROWS_DOWN) {
        target_mprotect(error, guard, PROT_NONE);
        info->stack_limit = error + guard;
        return info->stack_limit + size - sizeof(void *);
    } else {
        info->stack_limit = error + size;
        return error;
    }
}

/**
 * zero_bss:
 *
 * Map and zero the bss.  We need to explicitly zero any fractional pages
 * after the data section (i.e. bss).  Return false on mapping failure.
 */
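/*
 * For example, with 4 KiB target pages, start_bss = 0x41234 and
 * end_bss = 0x43000: bytes 0x41234..0x41fff share the last data page
 * and must be cleared by hand (or folded into the anonymous mapping if
 * that page is unmapped), while 0x42000..0x42fff can come from a fresh
 * anonymous mapping that the kernel zeroes for us.
 */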
static bool zero_bss(abi_ulong start_bss, abi_ulong end_bss,
                     int prot, Error **errp)
{
    abi_ulong align_bss;

    /* We only expect writable bss; the code segment shouldn't need this. */
    if (!(prot & PROT_WRITE)) {
        error_setg(errp, "PT_LOAD with non-writable bss");
        return false;
    }

    align_bss = TARGET_PAGE_ALIGN(start_bss);
    end_bss = TARGET_PAGE_ALIGN(end_bss);

    if (start_bss < align_bss) {
        int flags = page_get_flags(start_bss);

        if (!(flags & PAGE_RWX)) {
            /*
             * The whole address space of the executable was reserved
             * at the start, therefore all pages will be VALID.
             * But assuming there are no PROT_NONE PT_LOAD segments,
             * a PROT_NONE page means no data, all bss, and we can
             * simply extend the new anon mapping back to the start
             * of the page of bss.
             */
            align_bss -= TARGET_PAGE_SIZE;
        } else {
            /*
             * The start of the bss shares a page with something.
             * The only thing that we expect is the data section,
             * which would already be marked writable.
             * Overlapping the RX code segment seems malformed.
             */
            if (!(flags & PAGE_WRITE)) {
                error_setg(errp, "PT_LOAD with bss overlapping "
                           "non-writable page");
                return false;
            }

            /* The page is already mapped and writable. */
            memset(g2h_untagged(start_bss), 0, align_bss - start_bss);
        }
    }

    if (align_bss < end_bss &&
        target_mmap(align_bss, end_bss - align_bss, prot,
                    MAP_FIXED | MAP_PRIVATE | MAP_ANON, -1, 0) == -1) {
        error_setg_errno(errp, errno, "Error mapping bss");
        return false;
    }
    return true;
}

#if defined(TARGET_ARM)
static int elf_is_fdpic(struct elfhdr *exec)
{
    return exec->e_ident[EI_OSABI] == ELFOSABI_ARM_FDPIC;
}
#elif defined(TARGET_XTENSA)
static int elf_is_fdpic(struct elfhdr *exec)
{
    return exec->e_ident[EI_OSABI] == ELFOSABI_XTENSA_FDPIC;
}
#else
/* Default implementation, always false.  */
static int elf_is_fdpic(struct elfhdr *exec)
{
    return 0;
}
#endif

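/*
 * The loadmap is pushed onto the guest stack in the layout FDPIC
 * userspace expects (a sketch, using the elf32_fdpic_* structures
 * from elf.h; all fields are 32-bit words or 16-bit halves):
 *
 *   loadmap:  u16 version; u16 nsegs;
 *   loadseg:  u32 addr; u32 p_vaddr; u32 p_memsz;   (nsegs times)
 *
 * which matches the 12- and 4-byte stack adjustments below.
 */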
static abi_ulong loader_build_fdpic_loadmap(struct image_info *info, abi_ulong sp)
{
    uint16_t n;
    struct elf32_fdpic_loadseg *loadsegs = info->loadsegs;

    /* elf32_fdpic_loadseg */
    n = info->nsegs;
    while (n--) {
        sp -= 12;
        put_user_u32(loadsegs[n].addr, sp+0);
        put_user_u32(loadsegs[n].p_vaddr, sp+4);
        put_user_u32(loadsegs[n].p_memsz, sp+8);
    }

    /* elf32_fdpic_loadmap */
    sp -= 4;
    put_user_u16(0, sp+0); /* version */
    put_user_u16(info->nsegs, sp+2); /* nsegs */

    info->personality = PER_LINUX_FDPIC;
    info->loadmap_addr = sp;

    return sp;
}

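/*
 * A sketch of the initial stack this function builds, for the
 * STACK_GROWS_DOWN case (higher addresses first):
 *
 *   platform string(s), 16 random bytes    <- copied first, above the tables
 *   auxv entries, terminated by AT_NULL
 *   envp[0..envc-1], NULL
 *   argv[0..argc-1], NULL
 *   argc                                   <- sp on entry to the guest
 */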
static abi_ulong create_elf_tables(abi_ulong p, int argc, int envc,
                                   struct elfhdr *exec,
                                   struct image_info *info,
                                   struct image_info *interp_info,
                                   struct image_info *vdso_info)
{
    abi_ulong sp;
    abi_ulong u_argc, u_argv, u_envp, u_auxv;
    int size;
    int i;
    abi_ulong u_rand_bytes;
    uint8_t k_rand_bytes[16];
    abi_ulong u_platform, u_base_platform;
    const char *k_platform, *k_base_platform;
    const int n = sizeof(elf_addr_t);

    sp = p;

    /* Needs to be before we load the env/argc/... */
    if (elf_is_fdpic(exec)) {
        /* Need 4 byte alignment for these structs */
        sp &= ~3;
        sp = loader_build_fdpic_loadmap(info, sp);
        info->other_info = interp_info;
        if (interp_info) {
            interp_info->other_info = info;
            sp = loader_build_fdpic_loadmap(interp_info, sp);
            info->interpreter_loadmap_addr = interp_info->loadmap_addr;
            info->interpreter_pt_dynamic_addr = interp_info->pt_dynamic_addr;
        } else {
            info->interpreter_loadmap_addr = 0;
            info->interpreter_pt_dynamic_addr = 0;
        }
    }

    u_base_platform = 0;
    k_base_platform = get_elf_base_platform(thread_cpu);
    if (k_base_platform) {
        size_t len = strlen(k_base_platform) + 1;
        if (STACK_GROWS_DOWN) {
            sp -= (len + n - 1) & ~(n - 1);
            u_base_platform = sp;
            /* FIXME - check return value of memcpy_to_target() for failure */
            memcpy_to_target(sp, k_base_platform, len);
        } else {
            memcpy_to_target(sp, k_base_platform, len);
            u_base_platform = sp;
            sp += len + 1;
        }
    }

    u_platform = 0;
    k_platform = get_elf_platform(thread_cpu);
    if (k_platform) {
        size_t len = strlen(k_platform) + 1;
        if (STACK_GROWS_DOWN) {
            sp -= (len + n - 1) & ~(n - 1);
            u_platform = sp;
            /* FIXME - check return value of memcpy_to_target() for failure */
            memcpy_to_target(sp, k_platform, len);
        } else {
            memcpy_to_target(sp, k_platform, len);
            u_platform = sp;
            sp += len + 1;
        }
    }

    /* Provide 16 byte alignment for the PRNG, and basic alignment for
     * the argv and envp pointers.
     */
    if (STACK_GROWS_DOWN) {
        sp = QEMU_ALIGN_DOWN(sp, 16);
    } else {
        sp = QEMU_ALIGN_UP(sp, 16);
    }

    /*
     * Generate 16 random bytes for userspace PRNG seeding.
     */
    qemu_guest_getrandom_nofail(k_rand_bytes, sizeof(k_rand_bytes));
    if (STACK_GROWS_DOWN) {
        sp -= 16;
        u_rand_bytes = sp;
        /* FIXME - check return value of memcpy_to_target() for failure */
        memcpy_to_target(sp, k_rand_bytes, 16);
    } else {
        memcpy_to_target(sp, k_rand_bytes, 16);
        u_rand_bytes = sp;
        sp += 16;
    }

    size = (DLINFO_ITEMS + 1) * 2;
    if (k_base_platform) {
        size += 2;
    }
    if (k_platform) {
        size += 2;
    }
    if (vdso_info) {
        size += 2;
    }
#ifdef DLINFO_ARCH_ITEMS
    size += DLINFO_ARCH_ITEMS * 2;
#endif
    if (HAVE_ELF_HWCAP2) {
        size += 2;
    }
    info->auxv_len = size * n;

    size += envc + argc + 2;
    size += 1;  /* argc itself */
    size *= n;

    /* Allocate space and finalize stack alignment for entry now.  */
    if (STACK_GROWS_DOWN) {
        u_argc = QEMU_ALIGN_DOWN(sp - size, STACK_ALIGNMENT);
        sp = u_argc;
    } else {
        u_argc = sp;
        sp = QEMU_ALIGN_UP(sp + size, STACK_ALIGNMENT);
    }

    u_argv = u_argc + n;
    u_envp = u_argv + (argc + 1) * n;
    u_auxv = u_envp + (envc + 1) * n;
    info->saved_auxv = u_auxv;
    info->argc = argc;
    info->envc = envc;
    info->argv = u_argv;
    info->envp = u_envp;

    /* This is correct because Linux defines
     * elf_addr_t as Elf32_Off / Elf64_Off
     */
#define NEW_AUX_ENT(id, val) do {               \
        put_user_ual(id, u_auxv);  u_auxv += n; \
        put_user_ual(val, u_auxv); u_auxv += n; \
    } while(0)

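    /*
     * Each NEW_AUX_ENT stores an (id, value) pair of target longs, which
     * is why the size accounting above counted two words per auxv entry.
     */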
#ifdef ARCH_DLINFO
    /*
     * ARCH_DLINFO must come first so platform specific code can enforce
     * special alignment requirements on the AUXV if necessary (eg. PPC).
     */
    ARCH_DLINFO;
#endif
    /* There must be exactly DLINFO_ITEMS entries here, or the assert
     * on info->auxv_len will trigger.
     */
    NEW_AUX_ENT(AT_PHDR, (abi_ulong)(info->load_addr + exec->e_phoff));
    NEW_AUX_ENT(AT_PHENT, (abi_ulong)(sizeof (struct elf_phdr)));
    NEW_AUX_ENT(AT_PHNUM, (abi_ulong)(exec->e_phnum));
    NEW_AUX_ENT(AT_PAGESZ, (abi_ulong)(TARGET_PAGE_SIZE));
    NEW_AUX_ENT(AT_BASE, (abi_ulong)(interp_info ? interp_info->load_addr : 0));
    NEW_AUX_ENT(AT_FLAGS, (abi_ulong)0);
    NEW_AUX_ENT(AT_ENTRY, info->entry);
    NEW_AUX_ENT(AT_UID, (abi_ulong) getuid());
    NEW_AUX_ENT(AT_EUID, (abi_ulong) geteuid());
    NEW_AUX_ENT(AT_GID, (abi_ulong) getgid());
    NEW_AUX_ENT(AT_EGID, (abi_ulong) getegid());
    NEW_AUX_ENT(AT_HWCAP, get_elf_hwcap(thread_cpu));
    NEW_AUX_ENT(AT_CLKTCK, (abi_ulong) sysconf(_SC_CLK_TCK));
    NEW_AUX_ENT(AT_RANDOM, (abi_ulong) u_rand_bytes);
    NEW_AUX_ENT(AT_SECURE, (abi_ulong) qemu_getauxval(AT_SECURE));
    NEW_AUX_ENT(AT_EXECFN, info->file_string);

    if (HAVE_ELF_HWCAP2) {
        NEW_AUX_ENT(AT_HWCAP2, get_elf_hwcap2(thread_cpu));
    }
    if (u_base_platform) {
        NEW_AUX_ENT(AT_BASE_PLATFORM, u_base_platform);
    }
    if (u_platform) {
        NEW_AUX_ENT(AT_PLATFORM, u_platform);
    }
    if (vdso_info) {
        NEW_AUX_ENT(AT_SYSINFO_EHDR, vdso_info->load_addr);
    }
    NEW_AUX_ENT (AT_NULL, 0);
#undef NEW_AUX_ENT

    /* Check that our initial calculation of the auxv length matches how much
     * we actually put into it.
     */
    assert(info->auxv_len == u_auxv - info->saved_auxv);

    put_user_ual(argc, u_argc);

    p = info->arg_strings;
    for (i = 0; i < argc; ++i) {
        put_user_ual(p, u_argv);
        u_argv += n;
        p += target_strlen(p) + 1;
    }
    put_user_ual(0, u_argv);

    p = info->env_strings;
    for (i = 0; i < envc; ++i) {
        put_user_ual(p, u_envp);
        u_envp += n;
        p += target_strlen(p) + 1;
    }
    put_user_ual(0, u_envp);

    return sp;
}

#if defined(HI_COMMPAGE)
#define LO_COMMPAGE -1
#elif defined(LO_COMMPAGE)
#define HI_COMMPAGE 0
#else
#define HI_COMMPAGE 0
#define LO_COMMPAGE -1
#ifndef HAVE_GUEST_COMMPAGE
bool init_guest_commpage(void) { return true; }
#endif
#endif
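/*
 * HI_COMMPAGE/LO_COMMPAGE name a kernel-provided page that must remain
 * reachable in the guest address space: for example, 32-bit Arm places
 * its kuser helpers near the top of the address space (HI_COMMPAGE),
 * while some targets require a page at or near address 0 (LO_COMMPAGE).
 * Targets that need neither get the no-op init_guest_commpage() above.
 */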

/**
 * pgb_try_mmap:
 * @addr: host start address
 * @addr_last: host last address
 * @keep: do not unmap the probe region
 *
 * Return 1 if [@addr, @addr_last] is not mapped in the host,
 * return 0 if it is not available to map, and -1 on mmap error.
 * If @keep, the region is left mapped on success, otherwise unmapped.
 */
static int pgb_try_mmap(uintptr_t addr, uintptr_t addr_last, bool keep)
{
    size_t size = addr_last - addr + 1;
    void *p = mmap((void *)addr, size, PROT_NONE,
                   MAP_ANONYMOUS | MAP_PRIVATE |
                   MAP_NORESERVE | MAP_FIXED_NOREPLACE, -1, 0);
    int ret;

    if (p == MAP_FAILED) {
        return errno == EEXIST ? 0 : -1;
    }
    ret = p == (void *)addr;
    if (!keep || !ret) {
        munmap(p, size);
    }
    return ret;
}

/**
 * pgb_try_mmap_skip_brk:
 * @addr: host start address
 * @addr_last: host last address
 * @brk: host brk
 * @keep: do not unmap the probe region
 *
 * Like pgb_try_mmap, but additionally reserve some memory following brk.
 */
static int pgb_try_mmap_skip_brk(uintptr_t addr, uintptr_t addr_last,
                                 uintptr_t brk, bool keep)
{
    uintptr_t brk_last = brk + 16 * MiB - 1;

    /* Do not map anything close to the host brk. */
    if (addr <= brk_last && brk <= addr_last) {
        return 0;
    }
    return pgb_try_mmap(addr, addr_last, keep);
}

/**
 * pgb_try_mmap_set:
 * @ga: set of guest addrs
 * @base: guest_base
 * @brk: host brk
 *
 * Return true if all @ga can be mapped by the host at @base.
 * On success, retain the mapping at index 0 for reserved_va.
 */

typedef struct PGBAddrs {
    uintptr_t bounds[3][2]; /* start/last pairs */
    int nbounds;
} PGBAddrs;

static bool pgb_try_mmap_set(const PGBAddrs *ga, uintptr_t base, uintptr_t brk)
{
    for (int i = ga->nbounds - 1; i >= 0; --i) {
        if (pgb_try_mmap_skip_brk(ga->bounds[i][0] + base,
                                  ga->bounds[i][1] + base,
                                  brk, i == 0 && reserved_va) <= 0) {
            return false;
        }
    }
    return true;
}

/**
 * pgb_addr_set:
 * @ga: output set of guest addrs
 * @guest_loaddr: guest image low address
 * @guest_hiaddr: guest image high address
 * @try_identity: create for identity mapping
 *
 * Fill in @ga with the image, COMMPAGE and NULL page.
 */
static bool pgb_addr_set(PGBAddrs *ga, abi_ulong guest_loaddr,
                         abi_ulong guest_hiaddr, bool try_identity)
{
    int n;

    /*
     * With a low commpage, or a guest mapped very low,
     * we may not be able to use the identity map.
     */
    if (try_identity) {
        if (LO_COMMPAGE != -1 && LO_COMMPAGE < mmap_min_addr) {
            return false;
        }
        if (guest_loaddr != 0 && guest_loaddr < mmap_min_addr) {
            return false;
        }
    }

    memset(ga, 0, sizeof(*ga));
    n = 0;

    if (reserved_va) {
        ga->bounds[n][0] = try_identity ? mmap_min_addr : 0;
        ga->bounds[n][1] = reserved_va;
        n++;
        /* LO_COMMPAGE and NULL handled by reserving from 0. */
    } else {
        /* Add any LO_COMMPAGE or NULL page. */
        if (LO_COMMPAGE != -1) {
            ga->bounds[n][0] = 0;
            ga->bounds[n][1] = LO_COMMPAGE + TARGET_PAGE_SIZE - 1;
            n++;
        } else if (!try_identity) {
            ga->bounds[n][0] = 0;
            ga->bounds[n][1] = TARGET_PAGE_SIZE - 1;
            n++;
        }

        /* Add the guest image for ET_EXEC. */
        if (guest_loaddr) {
            ga->bounds[n][0] = guest_loaddr;
            ga->bounds[n][1] = guest_hiaddr;
            n++;
        }
    }

    /*
     * Temporarily disable
     *   "comparison is always false due to limited range of data type"
     * due to comparison between unsigned and (possible) 0.
     */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wtype-limits"

    /* Add any HI_COMMPAGE not covered by reserved_va. */
    if (reserved_va < HI_COMMPAGE) {
        ga->bounds[n][0] = HI_COMMPAGE & qemu_real_host_page_mask();
        ga->bounds[n][1] = HI_COMMPAGE + TARGET_PAGE_SIZE - 1;
        n++;
    }

#pragma GCC diagnostic pop

    ga->nbounds = n;
    return true;
}

static void pgb_fail_in_use(const char *image_name)
{
    error_report("%s: requires virtual address space that is in use "
                 "(omit the -B option or choose a different value)",
                 image_name);
    exit(EXIT_FAILURE);
}

static void pgb_fixed(const char *image_name, uintptr_t guest_loaddr,
                      uintptr_t guest_hiaddr, uintptr_t align)
{
    PGBAddrs ga;
    uintptr_t brk = (uintptr_t)sbrk(0);

    if (!QEMU_IS_ALIGNED(guest_base, align)) {
        fprintf(stderr, "Requested guest base %p does not satisfy "
                "host minimum alignment (0x%" PRIxPTR ")\n",
                (void *)guest_base, align);
        exit(EXIT_FAILURE);
    }

    if (!pgb_addr_set(&ga, guest_loaddr, guest_hiaddr, !guest_base)
        || !pgb_try_mmap_set(&ga, guest_base, brk)) {
        pgb_fail_in_use(image_name);
    }
}

/**
 * pgb_find_fallback:
 *
 * This is a fallback method for finding holes in the host address space
 * if we don't have the benefit of being able to access /proc/self/maps.
 * It can potentially take a very long time as we can only dumbly iterate
 * up the host address space seeing if the allocation would work.
 */
static uintptr_t pgb_find_fallback(const PGBAddrs *ga, uintptr_t align,
                                   uintptr_t brk)
{
    /* TODO: come up with a better estimate of how much to skip. */
    uintptr_t skip = sizeof(uintptr_t) == 4 ? MiB : GiB;

    for (uintptr_t base = skip; ; base += skip) {
        base = ROUND_UP(base, align);
        if (pgb_try_mmap_set(ga, base, brk)) {
            return base;
        }
        if (base >= -skip) {
            return -1;
        }
    }
}

static uintptr_t pgb_try_itree(const PGBAddrs *ga, uintptr_t base,
                               IntervalTreeRoot *root)
{
    for (int i = ga->nbounds - 1; i >= 0; --i) {
        uintptr_t s = base + ga->bounds[i][0];
        uintptr_t l = base + ga->bounds[i][1];
        IntervalTreeNode *n;

        if (l < s) {
            /* Wraparound. Skip to advance S to mmap_min_addr. */
            return mmap_min_addr - s;
        }

        n = interval_tree_iter_first(root, s, l);
        if (n != NULL) {
            /* Conflict.  Skip to advance S to LAST + 1. */
            return n->last - s + 1;
        }
    }
    return 0;  /* success */
}

static uintptr_t pgb_find_itree(const PGBAddrs *ga, IntervalTreeRoot *root,
                                uintptr_t align, uintptr_t brk)
{
    uintptr_t last = sizeof(uintptr_t) == 4 ? MiB : GiB;
    uintptr_t base, skip;

    while (true) {
        base = ROUND_UP(last, align);
        if (base < last) {
            return -1;
        }

        skip = pgb_try_itree(ga, base, root);
        if (skip == 0) {
            break;
        }

        last = base + skip;
        if (last < base) {
            return -1;
        }
    }

    /*
     * We've chosen 'base' based on holes in the interval tree,
     * but we don't yet know if it is a valid host address.
     * Because it is the first matching hole, if the host addresses
     * are invalid we know there are no further matches.
     */
    return pgb_try_mmap_set(ga, base, brk) ? base : -1;
}

static void pgb_dynamic(const char *image_name, uintptr_t guest_loaddr,
                        uintptr_t guest_hiaddr, uintptr_t align)
{
    IntervalTreeRoot *root;
    uintptr_t brk, ret;
    PGBAddrs ga;

    /* Try the identity map first. */
    if (pgb_addr_set(&ga, guest_loaddr, guest_hiaddr, true)) {
        brk = (uintptr_t)sbrk(0);
        if (pgb_try_mmap_set(&ga, 0, brk)) {
            guest_base = 0;
            return;
        }
    }

    /*
     * Rebuild the address set for non-identity map.
     * This differs in the mapping of the guest NULL page.
     */
    pgb_addr_set(&ga, guest_loaddr, guest_hiaddr, false);

    root = read_self_maps();

    /* Read brk after we've read the maps, which will malloc. */
    brk = (uintptr_t)sbrk(0);

    if (!root) {
        ret = pgb_find_fallback(&ga, align, brk);
    } else {
        /*
         * Reserve the area close to the host brk.
         * This will be freed with the rest of the tree.
         */
        IntervalTreeNode *b = g_new0(IntervalTreeNode, 1);
        b->start = brk;
        b->last = brk + 16 * MiB - 1;
        interval_tree_insert(b, root);

        ret = pgb_find_itree(&ga, root, align, brk);
        free_self_maps(root);
    }

    if (ret == -1) {
        int w = TARGET_LONG_BITS / 4;

        error_report("%s: Unable to find a guest_base to satisfy all "
                     "guest address mapping requirements", image_name);

        for (int i = 0; i < ga.nbounds; ++i) {
            error_printf("  %0*" PRIx64 "-%0*" PRIx64 "\n",
                         w, (uint64_t)ga.bounds[i][0],
                         w, (uint64_t)ga.bounds[i][1]);
        }
        exit(EXIT_FAILURE);
    }
    guest_base = ret;
}

void probe_guest_base(const char *image_name, abi_ulong guest_loaddr,
                      abi_ulong guest_hiaddr)
{
    /* In order to use host shmat, we must be able to honor SHMLBA.  */
    uintptr_t align = MAX(SHMLBA, TARGET_PAGE_SIZE);

    /* Sanity check the guest binary. */
    if (reserved_va) {
        if (guest_hiaddr > reserved_va) {
            error_report("%s: requires more than reserved virtual "
                         "address space (0x%" PRIx64 " > 0x%lx)",
                         image_name, (uint64_t)guest_hiaddr, reserved_va);
            exit(EXIT_FAILURE);
        }
    } else {
        if (guest_hiaddr != (uintptr_t)guest_hiaddr) {
            error_report("%s: requires more virtual address space "
                         "than the host can provide (0x%" PRIx64 ")",
                         image_name, (uint64_t)guest_hiaddr + 1);
            exit(EXIT_FAILURE);
        }
    }

    if (have_guest_base) {
        pgb_fixed(image_name, guest_loaddr, guest_hiaddr, align);
    } else {
        pgb_dynamic(image_name, guest_loaddr, guest_hiaddr, align);
    }

    /* Reserve and initialize the commpage. */
    if (!init_guest_commpage()) {
        /* We have already probed for the commpage being free. */
        g_assert_not_reached();
    }

    assert(QEMU_IS_ALIGNED(guest_base, align));
    qemu_log_mask(CPU_LOG_PAGE, "Locating guest address space "
                  "@ 0x%" PRIx64 "\n", (uint64_t)guest_base);
}

enum {
    /* The string "GNU\0" as a magic number. */
    GNU0_MAGIC = const_le32('G' | 'N' << 8 | 'U' << 16),
    NOTE_DATA_SZ = 1 * KiB,
    NOTE_NAME_SZ = 4,
    ELF_GNU_PROPERTY_ALIGN = ELF_CLASS == ELFCLASS32 ? 4 : 8,
};

/*
 * Process a single gnu_property entry.
 * Return false for error.
 */
static bool parse_elf_property(const uint32_t *data, int *off, int datasz,
                               struct image_info *info, bool have_prev_type,
                               uint32_t *prev_type, Error **errp)
{
    uint32_t pr_type, pr_datasz, step;

    if (*off > datasz || !QEMU_IS_ALIGNED(*off, ELF_GNU_PROPERTY_ALIGN)) {
        goto error_data;
    }
    datasz -= *off;
    data += *off / sizeof(uint32_t);

    if (datasz < 2 * sizeof(uint32_t)) {
        goto error_data;
    }
    pr_type = data[0];
    pr_datasz = data[1];
    data += 2;
    datasz -= 2 * sizeof(uint32_t);
    step = ROUND_UP(pr_datasz, ELF_GNU_PROPERTY_ALIGN);
    if (step > datasz) {
        goto error_data;
    }

    /* Properties are supposed to be unique and sorted on pr_type. */
    if (have_prev_type && pr_type <= *prev_type) {
        if (pr_type == *prev_type) {
            error_setg(errp, "Duplicate property in PT_GNU_PROPERTY");
        } else {
            error_setg(errp, "Unsorted property in PT_GNU_PROPERTY");
        }
        return false;
    }
    *prev_type = pr_type;

    if (!arch_parse_elf_property(pr_type, pr_datasz, data, info, errp)) {
        return false;
    }

    *off += 2 * sizeof(uint32_t) + step;
    return true;

 error_data:
    error_setg(errp, "Ill-formed property in PT_GNU_PROPERTY");
    return false;
}

/* Process NT_GNU_PROPERTY_TYPE_0. */
static bool parse_elf_properties(const ImageSource *src,
                                 struct image_info *info,
                                 const struct elf_phdr *phdr,
                                 Error **errp)
{
    union {
        struct elf_note nhdr;
        uint32_t data[NOTE_DATA_SZ / sizeof(uint32_t)];
    } note;

    int n, off, datasz;
    bool have_prev_type;
    uint32_t prev_type;

    /* Unless the arch requires properties, ignore them. */
    if (!HAVE_ELF_GNU_PROPERTY) {
        return true;
    }

    /* If the properties are crazy large, that's too bad. */
    n = phdr->p_filesz;
    if (n > sizeof(note)) {
        error_setg(errp, "PT_GNU_PROPERTY too large");
        return false;
    }
    if (n < sizeof(note.nhdr)) {
        error_setg(errp, "PT_GNU_PROPERTY too small");
        return false;
    }

    if (!imgsrc_read(&note, phdr->p_offset, n, src, errp)) {
        return false;
    }

    /*
     * The contents of a valid PT_GNU_PROPERTY is a sequence of uint32_t.
     * Swap most of them now, beyond the header and namesz.
     */
    if (target_needs_bswap()) {
        for (int i = 4; i < n / 4; i++) {
            bswap32s(note.data + i);
        }
    }

    /*
     * Note that nhdr is 3 words, and that the "name" described by namesz
     * immediately follows nhdr and is thus at the 4th word.  Further, all
     * of the inputs to the kernel's round_up are multiples of 4.
     */
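    /*
     * Word layout of the note just read, as a reminder:
     *   data[0] = n_namesz (must be 4)    } still in target byte order,
     *   data[1] = n_descsz                } hence the tswap32()s below
     *   data[2] = n_type (NT_GNU_PROPERTY_TYPE_0)
     *   data[3] = the name "GNU\0" (GNU0_MAGIC, endian-invariant bytes)
     *   data[4...] = property data, already swapped by the loop above
     */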
    if (tswap32(note.nhdr.n_type) != NT_GNU_PROPERTY_TYPE_0 ||
        tswap32(note.nhdr.n_namesz) != NOTE_NAME_SZ ||
        note.data[3] != GNU0_MAGIC) {
        error_setg(errp, "Invalid note in PT_GNU_PROPERTY");
        return false;
    }
    off = sizeof(note.nhdr) + NOTE_NAME_SZ;

    datasz = tswap32(note.nhdr.n_descsz) + off;
    if (datasz > n) {
        error_setg(errp, "Invalid note size in PT_GNU_PROPERTY");
        return false;
    }

    have_prev_type = false;
    prev_type = 0;
    while (1) {
        if (off == datasz) {
            return true;  /* end, exit ok */
        }
        if (!parse_elf_property(note.data, &off, datasz, info,
                                have_prev_type, &prev_type, errp)) {
            return false;
        }
        have_prev_type = true;
    }
}

/**
 * load_elf_image: Load an ELF image into the address space.
 * @image_name: the filename of the image, to use in error messages.
 * @src: the ImageSource from which to read.
 * @info: info collected from the loaded image.
 * @ehdr: the ELF header, not yet bswapped.
 * @pinterp_name: record any PT_INTERP string found.
 *
 * On return: @info values will be filled in, as necessary or available.
 */

static void load_elf_image(const char *image_name, const ImageSource *src,
                           struct image_info *info, struct elfhdr *ehdr,
                           char **pinterp_name)
{
    g_autofree struct elf_phdr *phdr = NULL;
    abi_ulong load_addr, load_bias, loaddr, hiaddr, error, align;
    size_t reserve_size, align_size;
    int i, prot_exec;
    Error *err = NULL;

    /*
     * First of all, some simple consistency checks.
     * Note that we rely on the bswapped ehdr staying in bprm_buf,
     * for later use by load_elf_binary and create_elf_tables.
     */
    if (!imgsrc_read(ehdr, 0, sizeof(*ehdr), src, &err)) {
        goto exit_errmsg;
    }
    if (!elf_check_ident(ehdr)) {
        error_setg(&err, "Invalid ELF image for this architecture");
        goto exit_errmsg;
    }
    bswap_ehdr(ehdr);
    if (!elf_check_ehdr(ehdr)) {
        error_setg(&err, "Invalid ELF image for this architecture");
        goto exit_errmsg;
    }

    phdr = imgsrc_read_alloc(ehdr->e_phoff,
                             ehdr->e_phnum * sizeof(struct elf_phdr),
                             src, &err);
    if (phdr == NULL) {
        goto exit_errmsg;
    }
    bswap_phdr(phdr, ehdr->e_phnum);

    info->nsegs = 0;
    info->pt_dynamic_addr = 0;

    mmap_lock();

    /*
     * Find the maximum size of the image and allocate an appropriate
     * amount of memory to handle that.  Locate the interpreter, if any.
     */
    loaddr = -1, hiaddr = 0;
    align = 0;
    info->exec_stack = EXSTACK_DEFAULT;
    for (i = 0; i < ehdr->e_phnum; ++i) {
        struct elf_phdr *eppnt = phdr + i;
        if (eppnt->p_type == PT_LOAD) {
            abi_ulong a = eppnt->p_vaddr & TARGET_PAGE_MASK;
            if (a < loaddr) {
                loaddr = a;
            }
            a = eppnt->p_vaddr + eppnt->p_memsz - 1;
            if (a > hiaddr) {
                hiaddr = a;
            }
            ++info->nsegs;
            align |= eppnt->p_align;
        } else if (eppnt->p_type == PT_INTERP && pinterp_name) {
            g_autofree char *interp_name = NULL;

            if (*pinterp_name) {
                error_setg(&err, "Multiple PT_INTERP entries");
                goto exit_errmsg;
            }

            interp_name = imgsrc_read_alloc(eppnt->p_offset, eppnt->p_filesz,
                                            src, &err);
            if (interp_name == NULL) {
                goto exit_errmsg;
            }
            if (interp_name[eppnt->p_filesz - 1] != 0) {
                error_setg(&err, "Invalid PT_INTERP entry");
                goto exit_errmsg;
            }
            *pinterp_name = g_steal_pointer(&interp_name);
        } else if (eppnt->p_type == PT_GNU_PROPERTY) {
            if (!parse_elf_properties(src, info, eppnt, &err)) {
                goto exit_errmsg;
            }
        } else if (eppnt->p_type == PT_GNU_STACK) {
            info->exec_stack = eppnt->p_flags & PF_X;
        }
    }

    load_addr = loaddr;

    align = pow2ceil(align);

    if (pinterp_name != NULL) {
        if (ehdr->e_type == ET_EXEC) {
            /*
             * Make sure that the low address does not conflict with
             * MMAP_MIN_ADDR or the QEMU application itself.
             */
            probe_guest_base(image_name, loaddr, hiaddr);
        } else {
            /*
             * The binary is dynamic, but we still need to
             * select guest_base.  In this case we pass a size.
             */
            probe_guest_base(image_name, 0, hiaddr - loaddr);

            /*
             * Avoid collision with the loader by providing a different
             * default load address.
             */
            load_addr += elf_et_dyn_base;

            /*
             * TODO: Better support for mmap alignment is desirable.
             * Since we do not have complete control over the guest
             * address space, we prefer the kernel to choose some address
             * rather than force the use of LOAD_ADDR via MAP_FIXED.
             */
            if (align) {
                load_addr &= -align;
            }
        }
    }

    /*
     * Reserve address space for all of this.
     *
     * In the case of ET_EXEC, we supply MAP_FIXED_NOREPLACE so that we get
     * exactly the address range that is required.  Without reserved_va,
     * the guest address space is not isolated.  We have attempted to avoid
     * conflict with the host program itself via probe_guest_base, but using
     * MAP_FIXED_NOREPLACE instead of MAP_FIXED provides an extra check.
     *
     * Otherwise this is ET_DYN, and we are searching for a location
     * that can hold the memory space required.  If the image is
     * pre-linked, LOAD_ADDR will be non-zero, and the kernel should
     * honor that address if it happens to be free.
     *
     * In both cases, we will overwrite pages in this range with mappings
     * from the executable.
     */
    reserve_size = (size_t)hiaddr - loaddr + 1;
    align_size = reserve_size;

    if (ehdr->e_type != ET_EXEC && align > qemu_real_host_page_size()) {
        align_size += align - 1;
    }

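    /*
     * Over-allocate by align - 1 bytes so that an ALIGN-aligned start
     * address is guaranteed to lie within the mapping; the unaligned
     * head and tail are trimmed again below.
     */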
1415     load_addr = target_mmap(load_addr, align_size, PROT_NONE,
1416                             MAP_PRIVATE | MAP_ANON | MAP_NORESERVE |
1417                             (ehdr->e_type == ET_EXEC ? MAP_FIXED_NOREPLACE : 0),
1418                             -1, 0);
1419     if (load_addr == -1) {
1420         goto exit_mmap;
1421     }
1422 
1423     if (align_size != reserve_size) {
1424         abi_ulong align_addr = ROUND_UP(load_addr, align);
1425         abi_ulong align_end = TARGET_PAGE_ALIGN(align_addr + reserve_size);
1426         abi_ulong load_end = TARGET_PAGE_ALIGN(load_addr + align_size);
1427 
1428         if (align_addr != load_addr) {
1429             target_munmap(load_addr, align_addr - load_addr);
1430         }
1431         if (align_end != load_end) {
1432             target_munmap(align_end, load_end - align_end);
1433         }
1434         load_addr = align_addr;
1435     }
1436 
1437     load_bias = load_addr - loaddr;
1438 
1439     if (elf_is_fdpic(ehdr)) {
1440         struct elf32_fdpic_loadseg *loadsegs = info->loadsegs =
1441             g_malloc(sizeof(*loadsegs) * info->nsegs);
1442 
1443         for (i = 0; i < ehdr->e_phnum; ++i) {
1444             switch (phdr[i].p_type) {
1445             case PT_DYNAMIC:
1446                 info->pt_dynamic_addr = phdr[i].p_vaddr + load_bias;
1447                 break;
1448             case PT_LOAD:
1449                 loadsegs->addr = phdr[i].p_vaddr + load_bias;
1450                 loadsegs->p_vaddr = phdr[i].p_vaddr;
1451                 loadsegs->p_memsz = phdr[i].p_memsz;
1452                 ++loadsegs;
1453                 break;
1454             }
1455         }
1456     }
1457 
1458     info->load_bias = load_bias;
1459     info->code_offset = load_bias;
1460     info->data_offset = load_bias;
1461     info->load_addr = load_addr;
1462     info->entry = ehdr->e_entry + load_bias;
1463     info->start_code = -1;
1464     info->end_code = 0;
1465     info->start_data = -1;
1466     info->end_data = 0;
1467     /* Usual start for brk is after all sections of the main executable. */
1468     info->brk = TARGET_PAGE_ALIGN(hiaddr + load_bias);
1469     info->elf_flags = ehdr->e_flags;
1470 
1471     prot_exec = PROT_EXEC;
1472 #ifdef TARGET_AARCH64
1473     /*
1474      * If the BTI feature is present, this indicates that the executable
1475      * pages of the startup binary should be mapped with PROT_BTI, so that
1476      * branch targets are enforced.
1477      *
1478      * The startup binary is either the interpreter or the static executable.
1479      * The interpreter is responsible for all pages of a dynamic executable.
1480      *
1481      * Elf notes are backward compatible to older cpus.
1482      * Do not enable BTI unless it is supported.
1483      */
1484     if ((info->note_flags & GNU_PROPERTY_AARCH64_FEATURE_1_BTI)
1485         && (pinterp_name == NULL || *pinterp_name == 0)
1486         && cpu_isar_feature(aa64_bti, ARM_CPU(thread_cpu))) {
1487         prot_exec |= TARGET_PROT_BTI;
1488     }
1489 #endif
1490 
1491     for (i = 0; i < ehdr->e_phnum; i++) {
1492         struct elf_phdr *eppnt = phdr + i;
1493         if (eppnt->p_type == PT_LOAD) {
1494             abi_ulong vaddr, vaddr_po, vaddr_ps, vaddr_ef, vaddr_em;
1495             int elf_prot = 0;
1496 
1497             if (eppnt->p_flags & PF_R) {
1498                 elf_prot |= PROT_READ;
1499             }
1500             if (eppnt->p_flags & PF_W) {
1501                 elf_prot |= PROT_WRITE;
1502             }
1503             if (eppnt->p_flags & PF_X) {
1504                 elf_prot |= prot_exec;
1505             }
1506 
1507             vaddr = load_bias + eppnt->p_vaddr;
1508             vaddr_po = vaddr & ~TARGET_PAGE_MASK;
1509             vaddr_ps = vaddr & TARGET_PAGE_MASK;
1510 
1511             vaddr_ef = vaddr + eppnt->p_filesz;
1512             vaddr_em = vaddr + eppnt->p_memsz;
1513 
1514             /*
1515              * Some segments may be completely empty, with a non-zero p_memsz
1516              * but no backing file segment.
1517              */
1518             if (eppnt->p_filesz != 0) {
1519                 error = imgsrc_mmap(vaddr_ps, eppnt->p_filesz + vaddr_po,
1520                                     elf_prot, MAP_PRIVATE | MAP_FIXED,
1521                                     src, eppnt->p_offset - vaddr_po);
1522                 if (error == -1) {
1523                     goto exit_mmap;
1524                 }
1525             }
1526 
1527             /* If the load segment requests extra zeros (e.g. bss), map it. */
1528             if (vaddr_ef < vaddr_em &&
1529                 !zero_bss(vaddr_ef, vaddr_em, elf_prot, &err)) {
1530                 goto exit_errmsg;
1531             }
1532 
1533             /* Find the full program boundaries.  */
1534             if (elf_prot & PROT_EXEC) {
1535                 if (vaddr < info->start_code) {
1536                     info->start_code = vaddr;
1537                 }
1538                 if (vaddr_ef > info->end_code) {
1539                     info->end_code = vaddr_ef;
1540                 }
1541             }
1542             if (elf_prot & PROT_WRITE) {
1543                 if (vaddr < info->start_data) {
1544                     info->start_data = vaddr;
1545                 }
1546                 if (vaddr_ef > info->end_data) {
1547                     info->end_data = vaddr_ef;
1548                 }
1549             }
1550 #ifdef TARGET_MIPS
1551         } else if (eppnt->p_type == PT_MIPS_ABIFLAGS) {
1552             Mips_elf_abiflags_v0 abiflags;
1553 
1554             if (!imgsrc_read(&abiflags, eppnt->p_offset, sizeof(abiflags),
1555                              src, &err)) {
1556                 goto exit_errmsg;
1557             }
1558             bswap_mips_abiflags(&abiflags);
1559             info->fp_abi = abiflags.fp_abi;
1560 #endif
1561         }
1562     }
1563 
1564     if (info->end_data == 0) {
1565         info->start_data = info->end_code;
1566         info->end_data = info->end_code;
1567     }
1568 
1569     if (qemu_log_enabled()) {
1570         load_symbols(ehdr, src, load_bias);
1571     }
1572 
1573     debuginfo_report_elf(image_name, src->fd, load_bias);
1574 
1575     mmap_unlock();
1576 
1577     close(src->fd);
1578     return;
1579 
1580  exit_mmap:
1581     error_setg_errno(&err, errno, "Error mapping file");
1582     goto exit_errmsg;
1583  exit_errmsg:
1584     error_reportf_err(err, "%s: ", image_name);
1585     exit(-1);
1586 }
1587 
1588 static void load_elf_interp(const char *filename, struct image_info *info,
1589                             char bprm_buf[BPRM_BUF_SIZE])
1590 {
1591     struct elfhdr ehdr;
1592     ImageSource src;
1593     int fd, retval;
1594     Error *err = NULL;
1595 
1596     fd = open(path(filename), O_RDONLY);
1597     if (fd < 0) {
1598         error_setg_file_open(&err, errno, filename);
1599         error_report_err(err);
1600         exit(-1);
1601     }
1602 
1603     retval = read(fd, bprm_buf, BPRM_BUF_SIZE);
1604     if (retval < 0) {
1605         error_setg_errno(&err, errno, "Error reading file header");
1606         error_reportf_err(err, "%s: ", filename);
1607         exit(-1);
1608     }
1609 
1610     src.fd = fd;
1611     src.cache = bprm_buf;
1612     src.cache_size = retval;
1613 
1614     load_elf_image(filename, &src, info, &ehdr, NULL);
1615 }
1616 
1617 #ifndef HAVE_VDSO_IMAGE_INFO
1618 const VdsoImageInfo *get_vdso_image_info(uint32_t elf_flags)
1619 {
1620 #ifdef VDSO_HEADER
1621 #include VDSO_HEADER
1622     return &vdso_image_info;
1623 #else
1624     return NULL;
1625 #endif
1626 }
1627 #endif /* HAVE_VDSO_IMAGE_INFO */
1628 
1629 static void load_elf_vdso(struct image_info *info, const VdsoImageInfo *vdso)
1630 {
1631     ImageSource src;
1632     struct elfhdr ehdr;
1633     abi_ulong load_bias, load_addr;
1634 
1635     src.fd = -1;
1636     src.cache = vdso->image;
1637     src.cache_size = vdso->image_size;
1638 
1639     load_elf_image("<internal-vdso>", &src, info, &ehdr, NULL);
1640     load_addr = info->load_addr;
1641     load_bias = info->load_bias;
1642 
1643     /*
1644      * We need to relocate the VDSO image.  The one built into the kernel
1645      * is built for a fixed address.  The one built for QEMU is not, since
1646      * that requires close control of the guest address space.
1647      * We pre-processed the image to locate all of the addresses that need
1648      * to be updated.
1649      */
1650     for (unsigned i = 0, n = vdso->reloc_count; i < n; i++) {
1651         abi_ulong *addr = g2h_untagged(load_addr + vdso->relocs[i]);
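        /*
         * The inner tswapal() brings the target-endian value into host
         * order; after the bias is added, the outer tswapal() converts
         * it back to target order for the store.
         */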
1652         *addr = tswapal(tswapal(*addr) + load_bias);
1653     }
1654 
1655     /* Install signal trampolines, if present. */
1656     if (vdso->sigreturn_ofs) {
1657         default_sigreturn = load_addr + vdso->sigreturn_ofs;
1658     }
1659     if (vdso->rt_sigreturn_ofs) {
1660         default_rt_sigreturn = load_addr + vdso->rt_sigreturn_ofs;
1661     }
1662     if (vdso->sigreturn_region_start_ofs) {
1663         vdso_sigreturn_region_start =
1664             load_addr + vdso->sigreturn_region_start_ofs;
1665         vdso_sigreturn_region_end = load_addr + vdso->sigreturn_region_end_ofs;
1666     }
1667 
1668     /* Remove write from VDSO segment. */
1669     target_mprotect(info->start_data, info->end_data - info->start_data,
1670                     PROT_READ | PROT_EXEC);
1671 }
1672 
1673 static int symfind(const void *s0, const void *s1)
1674 {
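    /*
     * bsearch() comparator: s0 is the key (the address being looked up)
     * and s1 is an array element.  A symbol matches when the address
     * falls within [st_value, st_value + st_size).
     */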
1675     struct elf_sym *sym = (struct elf_sym *)s1;
1676     __typeof(sym->st_value) addr = *(uint64_t *)s0;
1677     int result = 0;
1678 
1679     if (addr < sym->st_value) {
1680         result = -1;
1681     } else if (addr >= sym->st_value + sym->st_size) {
1682         result = 1;
1683     }
1684     return result;
1685 }
1686 
1687 static const char *lookup_symbolxx(struct syminfo *s, uint64_t orig_addr)
1688 {
1689 #if ELF_CLASS == ELFCLASS32
1690     struct elf_sym *syms = s->disas_symtab.elf32;
1691 #else
1692     struct elf_sym *syms = s->disas_symtab.elf64;
1693 #endif
1694 
1695     /* binary search */
1696     struct elf_sym *sym;
1697 
1698     sym = bsearch(&orig_addr, syms, s->disas_num_syms, sizeof(*syms), symfind);
1699     if (sym != NULL) {
1700         return s->disas_strtab + sym->st_name;
1701     }
1702 
1703     return "";
1704 }
1705 
1706 /* FIXME: This should use elf_ops.h.inc  */
1707 static int symcmp(const void *s0, const void *s1)
1708 {
1709     struct elf_sym *sym0 = (struct elf_sym *)s0;
1710     struct elf_sym *sym1 = (struct elf_sym *)s1;
1711     return (sym0->st_value < sym1->st_value)
1712         ? -1
1713         : ((sym0->st_value > sym1->st_value) ? 1 : 0);
1714 }
1715 
1716 /* Best attempt to load symbols from this ELF object. */
1717 static void load_symbols(struct elfhdr *hdr, const ImageSource *src,
1718                          abi_ulong load_bias)
1719 {
1720     int i, shnum, nsyms, sym_idx = 0, str_idx = 0;
1721     g_autofree struct elf_shdr *shdr = NULL;
1722     char *strings = NULL;
1723     struct elf_sym *syms = NULL;
1724     struct elf_sym *new_syms;
1725     uint64_t segsz;
1726 
1727     shnum = hdr->e_shnum;
1728     shdr = imgsrc_read_alloc(hdr->e_shoff, shnum * sizeof(struct elf_shdr),
1729                              src, NULL);
1730     if (shdr == NULL) {
1731         return;
1732     }
1733 
1734     bswap_shdr(shdr, shnum);
1735     for (i = 0; i < shnum; ++i) {
1736         if (shdr[i].sh_type == SHT_SYMTAB) {
1737             sym_idx = i;
1738             str_idx = shdr[i].sh_link;
1739             goto found;
1740         }
1741     }
1742 
1743     /* There will be no symbol table if the file was stripped.  */
1744     return;
1745 
1746  found:
1747     /* Now we know where the strtab and symtab are.  Snarf them.  */
1748 
1749     segsz = shdr[str_idx].sh_size;
1750     strings = g_try_malloc(segsz);
1751     if (!strings) {
1752         goto give_up;
1753     }
1754     if (!imgsrc_read(strings, shdr[str_idx].sh_offset, segsz, src, NULL)) {
1755         goto give_up;
1756     }
1757 
1758     segsz = shdr[sym_idx].sh_size;
1759     if (segsz / sizeof(struct elf_sym) > INT_MAX) {
1760         /*
1761          * Implausibly large symbol table: give up rather than ploughing
1762          * on with the number of symbols calculation overflowing.
1763          */
1764         goto give_up;
1765     }
1766     nsyms = segsz / sizeof(struct elf_sym);
1767     syms = g_try_malloc(segsz);
1768     if (!syms) {
1769         goto give_up;
1770     }
1771     if (!imgsrc_read(syms, shdr[sym_idx].sh_offset, segsz, src, NULL)) {
1772         goto give_up;
1773     }
1774 
1775     for (i = 0; i < nsyms; ) {
1776         bswap_sym(syms + i);
1777         /* Throw away entries which we do not need.  */
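        /*
         * Compact in place: a rejected entry is overwritten with the
         * last element and slot i is re-examined; ordering is restored
         * by the qsort below.
         */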
1778         if (syms[i].st_shndx == SHN_UNDEF
1779             || syms[i].st_shndx >= SHN_LORESERVE
1780             || ELF_ST_TYPE(syms[i].st_info) != STT_FUNC) {
1781             if (i < --nsyms) {
1782                 syms[i] = syms[nsyms];
1783             }
1784         } else {
1785 #if defined(TARGET_ARM) || defined (TARGET_MIPS)
1786             /* The bottom address bit marks a Thumb or MIPS16 symbol.  */
1787             syms[i].st_value &= ~(target_ulong)1;
1788 #endif
1789             syms[i].st_value += load_bias;
1790             i++;
1791         }
1792     }
1793 
1794     /* No "useful" symbols.  */
1795     if (nsyms == 0) {
1796         goto give_up;
1797     }
1798 
1799     /*
1800      * Attempt to free the storage associated with the local symbols
1801      * that we threw away.  Whether or not this has any effect on the
1802      * memory allocation depends on the malloc implementation and how
1803      * many symbols we managed to discard.
1804      */
1805     new_syms = g_try_renew(struct elf_sym, syms, nsyms);
1806     if (new_syms == NULL) {
1807         goto give_up;
1808     }
1809     syms = new_syms;
1810 
1811     qsort(syms, nsyms, sizeof(*syms), symcmp);
1812 
1813     {
1814         struct syminfo *s = g_new(struct syminfo, 1);
1815 
1816         s->disas_strtab = strings;
1817         s->disas_num_syms = nsyms;
1818 #if ELF_CLASS == ELFCLASS32
1819         s->disas_symtab.elf32 = syms;
1820 #else
1821         s->disas_symtab.elf64 = syms;
1822 #endif
1823         s->lookup_symbol = lookup_symbolxx;
1824         s->next = syminfos;
1825         syminfos = s;
1826     }
1827     return;
1828 
1829  give_up:
1830     g_free(strings);
1831     g_free(syms);
1832 }
1833 
1834 uint32_t get_elf_eflags(int fd)
1835 {
1836     struct elfhdr ehdr;
1837     off_t offset;
1838     int ret;
1839 
1840     /* Read ELF header */
1841     offset = lseek(fd, 0, SEEK_SET);
1842     if (offset == (off_t) -1) {
1843         return 0;
1844     }
1845     ret = read(fd, &ehdr, sizeof(ehdr));
1846     if (ret != sizeof(ehdr)) {
1847         return 0;
1848     }
1849     offset = lseek(fd, offset, SEEK_SET);
1850     if (offset == (off_t) -1) {
1851         return 0;
1852     }
1853 
1854     /* Check ELF signature */
1855     if (!elf_check_ident(&ehdr)) {
1856         return 0;
1857     }
1858 
1859     /* check header */
1860     bswap_ehdr(&ehdr);
1861     if (!elf_check_ehdr(&ehdr)) {
1862         return 0;
1863     }
1864 
1865     /* return the processor-specific e_flags */
1866     return ehdr.e_flags;
1867 }
1868 
1869 int load_elf_binary(struct linux_binprm *bprm, struct image_info *info)
1870 {
1871     /*
1872      * We need a copy of the elf header for passing to create_elf_tables.
1873      * We will have overwritten the original when we re-use bprm->buf
1874      * while loading the interpreter.  Allocate the storage for this now
1875      * and let elf_load_image do any swapping that may be required.
1876      */
1877     struct elfhdr ehdr;
1878     struct image_info interp_info, vdso_info;
1879     char *elf_interpreter = NULL;
1880     char *scratch;
1881 
1882     memset(&interp_info, 0, sizeof(interp_info));
1883 #ifdef TARGET_MIPS
1884     interp_info.fp_abi = MIPS_ABI_FP_UNKNOWN;
1885 #endif
1886 
1887     load_elf_image(bprm->filename, &bprm->src, info, &ehdr, &elf_interpreter);
1888 
1889     /* Do this so that we can load the interpreter, if need be.  We will
1890        change some of these later */
1891     bprm->p = setup_arg_pages(bprm, info);
1892 
1893     scratch = g_new0(char, TARGET_PAGE_SIZE);
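
    /*
     * The two branches below copy the same strings in opposite orders
     * so that the final layout is identical for both stack directions:
     * argv strings lowest, envp strings above them, filename highest.
     */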
1894     if (STACK_GROWS_DOWN) {
1895         bprm->p = copy_elf_strings(1, &bprm->filename, scratch,
1896                                    bprm->p, info->stack_limit);
1897         info->file_string = bprm->p;
1898         bprm->p = copy_elf_strings(bprm->envc, bprm->envp, scratch,
1899                                    bprm->p, info->stack_limit);
1900         info->env_strings = bprm->p;
1901         bprm->p = copy_elf_strings(bprm->argc, bprm->argv, scratch,
1902                                    bprm->p, info->stack_limit);
1903         info->arg_strings = bprm->p;
1904     } else {
1905         info->arg_strings = bprm->p;
1906         bprm->p = copy_elf_strings(bprm->argc, bprm->argv, scratch,
1907                                    bprm->p, info->stack_limit);
1908         info->env_strings = bprm->p;
1909         bprm->p = copy_elf_strings(bprm->envc, bprm->envp, scratch,
1910                                    bprm->p, info->stack_limit);
1911         info->file_string = bprm->p;
1912         bprm->p = copy_elf_strings(1, &bprm->filename, scratch,
1913                                    bprm->p, info->stack_limit);
1914     }
1915 
1916     g_free(scratch);
1917 
1918     if (!bprm->p) {
1919         fprintf(stderr, "%s: %s\n", bprm->filename, strerror(E2BIG));
1920         exit(-1);
1921     }
1922 
1923     if (elf_interpreter) {
1924         load_elf_interp(elf_interpreter, &interp_info, bprm->buf);
1925 
1926         /*
1927          * While unusual because of ELF_ET_DYN_BASE, if we are unlucky
1928          * with the mappings the interpreter can be loaded above but
1929          * near the main executable, which can leave very little room
1930          * for the heap.
1931          * If the current brk has less than 16MB, use the end of the
1932          * interpreter.
1933          */
1934         if (interp_info.brk > info->brk &&
1935             interp_info.load_bias - info->brk < 16 * MiB)  {
1936             info->brk = interp_info.brk;
1937         }
1938 
1939         /* If the program interpreter is one of these two, then assume
1940            an iBCS2 image.  Otherwise assume a native linux image.  */
1941 
1942         if (strcmp(elf_interpreter, "/usr/lib/libc.so.1") == 0
1943             || strcmp(elf_interpreter, "/usr/lib/ld.so.1") == 0) {
1944             info->personality = PER_SVR4;
1945 
1946             /* Why this, you ask???  Well SVr4 maps page 0 as read-only,
1947                and some applications "depend" upon this behavior.  Since
1948                we do not have the power to recompile these, we emulate
1949                the SVr4 behavior.  Sigh.  */
1950             target_mmap(0, TARGET_PAGE_SIZE, PROT_READ | PROT_EXEC,
1951                         MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_ANONYMOUS,
1952                         -1, 0);
1953         }
1954 #ifdef TARGET_MIPS
1955         info->interp_fp_abi = interp_info.fp_abi;
1956 #endif
1957     }
1958 
1959     /*
1960      * Load a vdso if available, which will amongst other things contain the
1961      * signal trampolines.  Otherwise, allocate a separate page for them.
1962      */
1963     const VdsoImageInfo *vdso = get_vdso_image_info(info->elf_flags);
1964     if (vdso) {
1965         load_elf_vdso(&vdso_info, vdso);
1966         info->vdso = vdso_info.load_bias;
1967     } else if (TARGET_ARCH_HAS_SIGTRAMP_PAGE) {
1968         abi_long tramp_page = target_mmap(0, TARGET_PAGE_SIZE,
1969                                           PROT_READ | PROT_WRITE,
1970                                           MAP_PRIVATE | MAP_ANON, -1, 0);
1971         if (tramp_page == -1) {
1972             return -errno;
1973         }
1974 
1975         setup_sigtramp(tramp_page);
1976         target_mprotect(tramp_page, TARGET_PAGE_SIZE, PROT_READ | PROT_EXEC);
1977         vdso_sigreturn_region_start = tramp_page;
1978         vdso_sigreturn_region_end = tramp_page + TARGET_PAGE_SIZE;
1979     }
1980 
1981     bprm->p = create_elf_tables(bprm->p, bprm->argc, bprm->envc, &ehdr, info,
1982                                 elf_interpreter ? &interp_info : NULL,
1983                                 vdso ? &vdso_info : NULL);
1984     info->start_stack = bprm->p;
1985 
1986     /* If we have an interpreter, set that as the program's entry point.
1987        Copy the load_bias as well, to help PPC64 interpret the entry
1988        point as a function descriptor.  Do this after creating elf tables
1989        so that we copy the original program entry point into the AUXV.  */
1990     if (elf_interpreter) {
1991         info->load_bias = interp_info.load_bias;
1992         info->entry = interp_info.entry;
1993         g_free(elf_interpreter);
1994     }
1995 
1996 #ifdef HAVE_ELF_CORE_DUMP
1997     bprm->core_dump = &elf_core_dump;
1998 #endif
1999 
2000     return 0;
2001 }
2002 
2003 #ifdef HAVE_ELF_CORE_DUMP
2004 
2005 /*
2006  * Definitions to generate Intel SVR4-like core files.
2007  * These mostly have the same names as the SVR4 types with "target_elf_"
2008  * tacked on the front to prevent clashes with linux definitions,
2009  * and the typedef forms have been avoided.  This is mostly like
2010  * the SVR4 structure, but more Linuxy, with things that Linux does
2011  * not support and which gdb doesn't really use excluded.
2012  *
2013  * Fields we don't dump (their contents are zero) in linux-user qemu
2014  * are marked with XXX.
2015  *
2016  * The core dump code is copied from the Linux kernel (fs/binfmt_elf.c).
2017  *
2018  * Porting the ELF coredump to a target is a (quite) simple process.  First
2019  * you define HAVE_ELF_CORE_DUMP in the target ELF code (where init_thread()
2020  * for the target resides):
2021  *
2022  * #define HAVE_ELF_CORE_DUMP
2023  *
2024  * Next you define the type of the register set used for dumping:
2025  * typedef struct target_elf_gregset_t { ... } target_elf_gregset_t;
2026  *
2027  * The last step is to implement a target-specific function that copies
2028  * registers from the given cpu into the register set above.  Prototype is:
2029  *
2030  * void elf_core_copy_regs(target_elf_gregset_t *regs, const CPUArchState *env);
2031  *
2032  * Parameters:
2033  *     regs - copy register values into here (allocated and zeroed by caller)
2034  *     env - copy registers from here
2035  *
2036  * An example for the ARM target is provided in this file.
2037  */
2038 
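/*
 * A minimal sketch of the three steps above, for a hypothetical target
 * with 32 general-purpose registers and a pc; the register names and
 * layout are illustrative only, hence not compiled:
 */
#if 0
#define HAVE_ELF_CORE_DUMP

typedef struct target_elf_gregset_t {
    abi_ulong regs[32];
    abi_ulong pc;
} target_elf_gregset_t;

static void elf_core_copy_regs(target_elf_gregset_t *regs,
                               const CPUArchState *env)
{
    for (int i = 0; i < 32; i++) {
        regs->regs[i] = tswapal(env->gpr[i]);   /* store target-endian */
    }
    regs->pc = tswapal(env->pc);
}
#endif
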
2039 struct target_elf_siginfo {
2040     abi_int    si_signo; /* signal number */
2041     abi_int    si_code;  /* extra code */
2042     abi_int    si_errno; /* errno */
2043 };
2044 
2045 struct target_elf_prstatus {
2046     struct target_elf_siginfo pr_info;      /* Info associated with signal */
2047     abi_short          pr_cursig;    /* Current signal */
2048     abi_ulong          pr_sigpend;   /* XXX */
2049     abi_ulong          pr_sighold;   /* XXX */
2050     target_pid_t       pr_pid;
2051     target_pid_t       pr_ppid;
2052     target_pid_t       pr_pgrp;
2053     target_pid_t       pr_sid;
2054     struct target_timeval pr_utime;  /* XXX User time */
2055     struct target_timeval pr_stime;  /* XXX System time */
2056     struct target_timeval pr_cutime; /* XXX Cumulative user time */
2057     struct target_timeval pr_cstime; /* XXX Cumulative system time */
2058     target_elf_gregset_t      pr_reg;       /* GP registers */
2059     abi_int            pr_fpvalid;   /* XXX */
2060 };
2061 
2062 #define ELF_PRARGSZ     (80) /* Number of chars for args */
2063 
2064 struct target_elf_prpsinfo {
2065     char         pr_state;       /* numeric process state */
2066     char         pr_sname;       /* char for pr_state */
2067     char         pr_zomb;        /* zombie */
2068     char         pr_nice;        /* nice val */
2069     abi_ulong    pr_flag;        /* flags */
2070     target_uid_t pr_uid;
2071     target_gid_t pr_gid;
2072     target_pid_t pr_pid, pr_ppid, pr_pgrp, pr_sid;
2073     /* Lots missing */
2074     char    pr_fname[16] QEMU_NONSTRING; /* filename of executable */
2075     char    pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */
2076 };
2077 
2078 static void bswap_prstatus(struct target_elf_prstatus *prstatus)
2079 {
2080     if (!target_needs_bswap()) {
2081         return;
2082     }
2083 
2084     prstatus->pr_info.si_signo = tswap32(prstatus->pr_info.si_signo);
2085     prstatus->pr_info.si_code = tswap32(prstatus->pr_info.si_code);
2086     prstatus->pr_info.si_errno = tswap32(prstatus->pr_info.si_errno);
2087     prstatus->pr_cursig = tswap16(prstatus->pr_cursig);
2088     prstatus->pr_sigpend = tswapal(prstatus->pr_sigpend);
2089     prstatus->pr_sighold = tswapal(prstatus->pr_sighold);
2090     prstatus->pr_pid = tswap32(prstatus->pr_pid);
2091     prstatus->pr_ppid = tswap32(prstatus->pr_ppid);
2092     prstatus->pr_pgrp = tswap32(prstatus->pr_pgrp);
2093     prstatus->pr_sid = tswap32(prstatus->pr_sid);
2094     /* cpu times are not filled, so we skip them */
2095     /* regs should be in correct format already */
2096     prstatus->pr_fpvalid = tswap32(prstatus->pr_fpvalid);
2097 }
2098 
2099 static void bswap_psinfo(struct target_elf_prpsinfo *psinfo)
2100 {
2101     if (!target_needs_bswap()) {
2102         return;
2103     }
2104 
2105     psinfo->pr_flag = tswapal(psinfo->pr_flag);
2106     psinfo->pr_uid = tswap16(psinfo->pr_uid);
2107     psinfo->pr_gid = tswap16(psinfo->pr_gid);
2108     psinfo->pr_pid = tswap32(psinfo->pr_pid);
2109     psinfo->pr_ppid = tswap32(psinfo->pr_ppid);
2110     psinfo->pr_pgrp = tswap32(psinfo->pr_pgrp);
2111     psinfo->pr_sid = tswap32(psinfo->pr_sid);
2112 }
2113 
2114 static void bswap_note(struct elf_note *en)
2115 {
2116     if (!target_needs_bswap()) {
2117         return;
2118     }
2119 
2120     bswap32s(&en->n_namesz);
2121     bswap32s(&en->n_descsz);
2122     bswap32s(&en->n_type);
2123 }
2124 
2125 /*
2126  * Calculate the file (dump) size of the given memory region.
2127  */
2128 static size_t vma_dump_size(vaddr start, vaddr end, int flags)
2129 {
2130     /* The area must be readable and dumpable. */
2131     if (!(flags & PAGE_READ) || (flags & PAGE_DONTDUMP)) {
2132         return 0;
2133     }
2134 
2135     /*
2136      * Usually we don't dump executable pages, as they contain non-writable
2137      * code that the debugger can read directly from the target library etc.
2138      * If the region does not start with an ELF header, we dump it anyway.
2139      */
2140     if (!(flags & PAGE_WRITE_ORG) &&
2141         (flags & PAGE_EXEC) &&
2142         memcmp(g2h_untagged(start), ELFMAG, SELFMAG) == 0) {
2143         return 0;
2144     }
2145 
2146     return end - start;
2147 }
2148 
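/*
 * Total bytes for one note: sizeof(struct elf_note) (12 bytes for either
 * ELF class) plus the name and descriptor, each padded to 4 bytes.  For
 * example, size_note("CORE", 6) is 12 + 8 + 8 = 28: "CORE" occupies
 * 5 bytes padded to 8, and the 6-byte descriptor also pads to 8.
 */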
2149 static size_t size_note(const char *name, size_t datasz)
2150 {
2151     size_t namesz = strlen(name) + 1;
2152 
2153     namesz = ROUND_UP(namesz, 4);
2154     datasz = ROUND_UP(datasz, 4);
2155 
2156     return sizeof(struct elf_note) + namesz + datasz;
2157 }
2158 
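/*
 * Write the note header and name at *pptr, advance *pptr past the whole
 * (padded) note, and return a pointer to the descriptor area for the
 * caller to fill in, e.g.:
 *
 *     dptr = fill_note(&hptr, NT_AUXV, "CORE", ts->info->auxv_len);
 *     fill_auxv_note(dptr, ts);
 */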
2159 static void *fill_note(void **pptr, int type, const char *name, size_t datasz)
2160 {
2161     void *ptr = *pptr;
2162     struct elf_note *n = ptr;
2163     size_t namesz = strlen(name) + 1;
2164 
2165     n->n_namesz = namesz;
2166     n->n_descsz = datasz;
2167     n->n_type = type;
2168     bswap_note(n);
2169 
2170     ptr += sizeof(*n);
2171     memcpy(ptr, name, namesz);
2172 
2173     namesz = ROUND_UP(namesz, 4);
2174     datasz = ROUND_UP(datasz, 4);
2175 
2176     *pptr = ptr + namesz + datasz;
2177     return ptr + namesz;
2178 }
2179 
2180 static void fill_elf_header(struct elfhdr *elf, int segs, uint16_t machine,
2181                             uint32_t flags)
2182 {
2183     memcpy(elf->e_ident, ELFMAG, SELFMAG);
2184 
2185     elf->e_ident[EI_CLASS] = ELF_CLASS;
2186     elf->e_ident[EI_DATA] = ELF_DATA;
2187     elf->e_ident[EI_VERSION] = EV_CURRENT;
2188     elf->e_ident[EI_OSABI] = ELF_OSABI;
2189 
2190     elf->e_type = ET_CORE;
2191     elf->e_machine = machine;
2192     elf->e_version = EV_CURRENT;
2193     elf->e_phoff = sizeof(struct elfhdr);
2194     elf->e_flags = flags;
2195     elf->e_ehsize = sizeof(struct elfhdr);
2196     elf->e_phentsize = sizeof(struct elf_phdr);
2197     elf->e_phnum = segs;
2198 
2199     bswap_ehdr(elf);
2200 }
2201 
2202 static void fill_elf_note_phdr(struct elf_phdr *phdr, size_t sz, off_t offset)
2203 {
2204     phdr->p_type = PT_NOTE;
2205     phdr->p_offset = offset;
2206     phdr->p_filesz = sz;
2207 
2208     bswap_phdr(phdr, 1);
2209 }
2210 
2211 static void fill_prstatus_note(void *data, CPUState *cpu, int signr)
2212 {
2213     /*
2214      * Because note memory is only aligned to 4, and target_elf_prstatus
2215      * may well have higher alignment requirements, fill locally and
2216      * memcpy to the destination afterward.
2217      */
2218     struct target_elf_prstatus prstatus = {
2219         .pr_info.si_signo = signr,
2220         .pr_cursig = signr,
2221         .pr_pid = get_task_state(cpu)->ts_tid,
2222         .pr_ppid = getppid(),
2223         .pr_pgrp = getpgrp(),
2224         .pr_sid = getsid(0),
2225     };
2226 
2227     elf_core_copy_regs(&prstatus.pr_reg, cpu_env(cpu));
2228     bswap_prstatus(&prstatus);
2229     memcpy(data, &prstatus, sizeof(prstatus));
2230 }
2231 
2232 static void fill_prpsinfo_note(void *data, const TaskState *ts)
2233 {
2234     /*
2235      * Because note memory is only aligned to 4, and target_elf_prpsinfo
2236      * may well have higher alignment requirements, fill locally and
2237      * memcpy to the destination afterward.
2238      */
2239     struct target_elf_prpsinfo psinfo = {
2240         .pr_pid = getpid(),
2241         .pr_ppid = getppid(),
2242         .pr_pgrp = getpgrp(),
2243         .pr_sid = getsid(0),
2244         .pr_uid = getuid(),
2245         .pr_gid = getgid(),
2246     };
2247     char *base_filename;
2248     size_t len;
2249 
2250     len = ts->info->env_strings - ts->info->arg_strings;
2251     len = MIN(len, ELF_PRARGSZ);
2252     memcpy(&psinfo.pr_psargs, g2h_untagged(ts->info->arg_strings), len);
2253     for (size_t i = 0; i < len; i++) {
2254         if (psinfo.pr_psargs[i] == 0) {
2255             psinfo.pr_psargs[i] = ' ';
2256         }
2257     }
2258 
2259     base_filename = g_path_get_basename(ts->bprm->filename);
2260     /*
2261      * Using strncpy here is fine: at max-length,
2262      * this field is not NUL-terminated.
2263      */
2264     strncpy(psinfo.pr_fname, base_filename, sizeof(psinfo.pr_fname));
2265     g_free(base_filename);
2266 
2267     bswap_psinfo(&psinfo);
2268     memcpy(data, &psinfo, sizeof(psinfo));
2269 }
2270 
2271 static void fill_auxv_note(void *data, const TaskState *ts)
2272 {
2273     memcpy(data, g2h_untagged(ts->info->saved_auxv), ts->info->auxv_len);
2274 }
2275 
2276 /*
2277  * Constructs the name of the coredump file.  We have the following
2278  * convention for the name:
2279  *     qemu_<basename-of-target-binary>_<date>-<time>_<pid>.core
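 *     (e.g. qemu_a.out_20240101-093000_12345.core; values illustrative)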
2280  *
2281  * Returns the filename; the caller must g_free() it.
2282  */
2283 static char *core_dump_filename(const TaskState *ts)
2284 {
2285     g_autoptr(GDateTime) now = g_date_time_new_now_local();
2286     g_autofree char *nowstr = g_date_time_format(now, "%Y%m%d-%H%M%S");
2287     g_autofree char *base_filename = g_path_get_basename(ts->bprm->filename);
2288 
2289     return g_strdup_printf("qemu_%s_%s_%d.core",
2290                            base_filename, nowstr, (int)getpid());
2291 }
2292 
2293 static int dump_write(int fd, const void *ptr, size_t size)
2294 {
2295     const char *bufp = (const char *)ptr;
2296     ssize_t bytes_written, bytes_left;
2297 
2298     bytes_written = 0;
2299     bytes_left = size;
2300 
2301     /*
2302      * Under normal conditions a single write(2) should suffice, but
2303      * for sockets etc. this retry loop is more portable.
2304      */
2305     do {
2306         bytes_written = write(fd, bufp, bytes_left);
2307         if (bytes_written < 0) {
2308             if (errno == EINTR)
2309                 continue;
2310             return (-1);
2311         } else if (bytes_written == 0) { /* eof */
2312             return (-1);
2313         }
2314         bufp += bytes_written;
2315         bytes_left -= bytes_written;
2316     } while (bytes_left > 0);
2317 
2318     return (0);
2319 }
2320 
2321 static int wmr_page_unprotect_regions(void *opaque, vaddr start,
2322                                       vaddr end, int flags)
2323 {
2324     if ((flags & (PAGE_WRITE | PAGE_WRITE_ORG)) == PAGE_WRITE_ORG) {
2325         size_t step = MAX(TARGET_PAGE_SIZE, qemu_real_host_page_size());
2326 
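        /*
         * Step by the larger of the target and host page sizes so that
         * each page_unprotect() call lands on a page not yet visited.
         */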
2327         while (1) {
2328             page_unprotect(NULL, start, 0);
2329             if (end - start <= step) {
2330                 break;
2331             }
2332             start += step;
2333         }
2334     }
2335     return 0;
2336 }
2337 
2338 typedef struct {
2339     unsigned count;
2340     size_t size;
2341 } CountAndSizeRegions;
2342 
2343 static int wmr_count_and_size_regions(void *opaque, vaddr start,
2344                                       vaddr end, int flags)
2345 {
2346     CountAndSizeRegions *css = opaque;
2347 
2348     css->count++;
2349     css->size += vma_dump_size(start, end, flags);
2350     return 0;
2351 }
2352 
2353 typedef struct {
2354     struct elf_phdr *phdr;
2355     off_t offset;
2356 } FillRegionPhdr;
2357 
2358 static int wmr_fill_region_phdr(void *opaque, vaddr start,
2359                                 vaddr end, int flags)
2360 {
2361     FillRegionPhdr *d = opaque;
2362     struct elf_phdr *phdr = d->phdr;
2363 
2364     phdr->p_type = PT_LOAD;
2365     phdr->p_vaddr = start;
2366     phdr->p_paddr = 0;
2367     phdr->p_filesz = vma_dump_size(start, end, flags);
2368     phdr->p_offset = d->offset;
2369     d->offset += phdr->p_filesz;
2370     phdr->p_memsz = end - start;
2371     phdr->p_flags = (flags & PAGE_READ ? PF_R : 0)
2372                   | (flags & PAGE_WRITE_ORG ? PF_W : 0)
2373                   | (flags & PAGE_EXEC ? PF_X : 0);
2374     phdr->p_align = TARGET_PAGE_SIZE;
2375 
2376     bswap_phdr(phdr, 1);
2377     d->phdr = phdr + 1;
2378     return 0;
2379 }
2380 
2381 static int wmr_write_region(void *opaque, vaddr start,
2382                             vaddr end, int flags)
2383 {
2384     int fd = *(int *)opaque;
2385     size_t size = vma_dump_size(start, end, flags);
2386 
2387     if (!size) {
2388         return 0;
2389     }
2390     return dump_write(fd, g2h_untagged(start), size);
2391 }
2392 
2393 /*
2394  * Write out ELF coredump.
2395  *
2396  * See documentation of ELF object file format in:
2397  * http://www.caldera.com/developers/devspecs/gabi41.pdf
2398  *
2399  * The coredump format in Linux is as follows:
2400  *
2401  * 0   +----------------------+         \
2402  *     | ELF header           | ET_CORE  |
2403  *     +----------------------+          |
2404  *     | ELF program headers  |          |--- headers
2405  *     | - NOTE section       |          |
2406  *     | - PT_LOAD sections   |          |
2407  *     +----------------------+         /
2408  *     | NOTEs:               |
2409  *     | - NT_PRSTATUS        |
2410  *     | - NT_PRPSINFO        |
2411  *     | - NT_AUXV            |
2412  *     +----------------------+ <-- aligned to target page
2413  *     | Process memory dump  |
2414  *     :                      :
2415  *     .                      .
2416  *     :                      :
2417  *     |                      |
2418  *     +----------------------+
2419  *
2420  * NT_PRSTATUS -> struct elf_prstatus (per thread)
2421  * NT_PRPSINFO -> struct elf_prpsinfo
2422  * NT_AUXV is an array of { type, value } pairs (see fill_auxv_note()).
2423  *
2424  * The format follows the System V format as closely as possible.
2425  * Current limitations of this version are as follows:
2426  *     - no floating point registers are dumped
2427  *
2428  * Function returns 0 in case of success, negative errno otherwise.
2429  *
2430  * TODO: make this work at runtime too: it should be possible to
2431  * force a coredump from a running process and then continue
2432  * processing.  For example qemu could set up a SIGUSR2 handler
2433  * (provided that the target process hasn't registered a handler
2434  * for it) that does the dump when the signal is received.
2435  */
2436 static int elf_core_dump(int signr, const CPUArchState *env)
2437 {
2438     const CPUState *cpu = env_cpu_const(env);
2439     const TaskState *ts = (const TaskState *)get_task_state((CPUState *)cpu);
2440     struct rlimit dumpsize;
2441     CountAndSizeRegions css;
2442     off_t offset, note_offset, data_offset;
2443     size_t note_size;
2444     int cpus, ret;
2445     int fd = -1;
2446     CPUState *cpu_iter;
2447 
2448     if (prctl(PR_GET_DUMPABLE) == 0) {
2449         return 0;
2450     }
2451 
2452     if (getrlimit(RLIMIT_CORE, &dumpsize) < 0 || dumpsize.rlim_cur == 0) {
2453         return 0;
2454     }
2455 
2456     cpu_list_lock();
2457     mmap_lock();
2458 
2459     /* By unprotecting, we merge vmas that might be split. */
2460     walk_memory_regions(NULL, wmr_page_unprotect_regions);
2461 
2462     /*
2463      * Walk through the target process memory mappings and
2464      * set up a structure describing them.
2465      */
2466     memset(&css, 0, sizeof(css));
2467     walk_memory_regions(&css, wmr_count_and_size_regions);
2468 
2469     cpus = 0;
2470     CPU_FOREACH(cpu_iter) {
2471         cpus++;
2472     }
2473 
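    /*
     * Compute the file layout: ELF header, one PT_NOTE phdr plus one
     * PT_LOAD phdr per region, then the notes, then the process memory
     * starting at the next target page boundary.
     */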
2474     offset = sizeof(struct elfhdr);
2475     offset += (css.count + 1) * sizeof(struct elf_phdr);
2476     note_offset = offset;
2477 
2478     offset += size_note("CORE", ts->info->auxv_len);
2479     offset += size_note("CORE", sizeof(struct target_elf_prpsinfo));
2480     offset += size_note("CORE", sizeof(struct target_elf_prstatus)) * cpus;
2481     note_size = offset - note_offset;
2482     data_offset = TARGET_PAGE_ALIGN(offset);
2483 
2484     /* Do not dump if the corefile size exceeds the limit. */
2485     if (dumpsize.rlim_cur != RLIM_INFINITY
2486         && dumpsize.rlim_cur < data_offset + css.size) {
2487         errno = 0;
2488         goto out;
2489     }
2490 
2491     {
2492         g_autofree char *corefile = core_dump_filename(ts);
2493         fd = open(corefile, O_WRONLY | O_CREAT | O_TRUNC,
2494                   S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
2495     }
2496     if (fd < 0) {
2497         goto out;
2498     }
2499 
2500     /*
2501      * There is a fair amount of alignment padding within the notes
2502      * as well as preceding the process memory.  Allocate a zeroed
2503      * block to hold it all.  Write all of the headers directly into
2504      * this buffer and then write it out as a block.
2505      */
2506     {
2507         g_autofree void *header = g_malloc0(data_offset);
2508         FillRegionPhdr frp;
2509         void *hptr, *dptr;
2510 
2511         /* Create elf file header. */
2512         hptr = header;
2513         fill_elf_header(hptr, css.count + 1, ELF_MACHINE, 0);
2514         hptr += sizeof(struct elfhdr);
2515 
2516         /* Create elf program headers. */
2517         fill_elf_note_phdr(hptr, note_size, note_offset);
2518         hptr += sizeof(struct elf_phdr);
2519 
2520         frp.phdr = hptr;
2521         frp.offset = data_offset;
2522         walk_memory_regions(&frp, wmr_fill_region_phdr);
2523         hptr = frp.phdr;
2524 
2525         /* Create the notes. */
2526         dptr = fill_note(&hptr, NT_AUXV, "CORE", ts->info->auxv_len);
2527         fill_auxv_note(dptr, ts);
2528 
2529         dptr = fill_note(&hptr, NT_PRPSINFO, "CORE",
2530                          sizeof(struct target_elf_prpsinfo));
2531         fill_prpsinfo_note(dptr, ts);
2532 
2533         CPU_FOREACH(cpu_iter) {
2534             dptr = fill_note(&hptr, NT_PRSTATUS, "CORE",
2535                              sizeof(struct target_elf_prstatus));
2536             fill_prstatus_note(dptr, cpu_iter, cpu_iter == cpu ? signr : 0);
2537         }
2538 
2539         if (dump_write(fd, header, data_offset) < 0) {
2540             goto out;
2541         }
2542     }
2543 
2544     /*
2545      * Finally write process memory into the corefile as well.
2546      */
2547     if (walk_memory_regions(&fd, wmr_write_region) < 0) {
2548         goto out;
2549     }
2550     errno = 0;
2551 
2552  out:
2553     ret = -errno;
2554     mmap_unlock();
2555     cpu_list_unlock();
2556     if (fd >= 0) {
2557         close(fd);
2558     }
2559     return ret;
2560 }
2561 #endif /* HAVE_ELF_CORE_DUMP */
2562