1 // SPDX-License-Identifier: LGPL-2.1
2 /*
3 * rseq.c
4 *
5 * Copyright (C) 2016 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; only
10 * version 2.1 of the License.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 */
17
18 #define _GNU_SOURCE
19 #include <errno.h>
20 #include <sched.h>
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <unistd.h>
25 #include <syscall.h>
26 #include <assert.h>
27 #include <signal.h>
28 #include <limits.h>
29 #include <dlfcn.h>
30 #include <stddef.h>
31 #include <sys/auxv.h>
32 #include <linux/auxvec.h>
33
34 #include <linux/compiler.h>
35
36 #include "../kselftest.h"
37 #include "rseq.h"
38
39 /*
40 * Define weak versions to play nice with binaries that are statically linked
41 * against a libc that doesn't support registering its own rseq.
42 */
43 __weak ptrdiff_t __rseq_offset;
44 __weak unsigned int __rseq_size;
45 __weak unsigned int __rseq_flags;
46
47 static const ptrdiff_t *libc_rseq_offset_p = &__rseq_offset;
48 static const unsigned int *libc_rseq_size_p = &__rseq_size;
49 static const unsigned int *libc_rseq_flags_p = &__rseq_flags;
50
/* Offset of the rseq area relative to the thread pointer. */
ptrdiff_t rseq_offset;

/*
 * Size of the registered rseq area, or 0 when registration was not
 * successful. Holds -1U until rseq_init() has assigned a value.
 */
unsigned int rseq_size = -1U;

/* Flags that were used for the rseq registration. */
unsigned int rseq_flags;

/* Non-zero when this file, rather than libc, owns the rseq registration. */
static int rseq_ownership;
64
/* Feature size of the original struct rseq: 20 bytes. */
#define ORIG_RSEQ_FEATURE_SIZE		20

/* Allocation size of the original struct rseq: 32 bytes. */
#define ORIG_RSEQ_ALLOC_SIZE		32

/* Generously sized (and aligned) allocation for the TLS rseq area. */
#define RSEQ_THREAD_AREA_ALLOC_SIZE	1024
73
74 static
75 __thread struct rseq_abi __rseq_abi __attribute__((tls_model("initial-exec"), aligned(RSEQ_THREAD_AREA_ALLOC_SIZE))) = {
76 .cpu_id = RSEQ_ABI_CPU_ID_UNINITIALIZED,
77 };
78
/*
 * Invoke the rseq system call directly through syscall(2).
 *
 * All four arguments are forwarded unchanged. The syscall() result is
 * returned narrowed to int; on failure that is -1 with errno set.
 */
static int sys_rseq(struct rseq_abi *rseq_abi, uint32_t rseq_len,
		    int flags, uint32_t sig)
{
	long ret = syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig);

	return ret;
}
84
/*
 * Invoke the getcpu system call directly through syscall(2).
 *
 * @cpu and @node are forwarded as the first two arguments; the third
 * argument is always NULL. Returns the syscall() result narrowed to int.
 */
static int sys_getcpu(unsigned *cpu, unsigned *node)
{
	long ret = syscall(__NR_getcpu, cpu, node, NULL);

	return ret;
}
89
/*
 * Probe whether the rseq system call exists.
 *
 * The call is issued with a NULL area and zero length, flags and signature,
 * so it is expected to fail. The errno value tells the two cases apart:
 *
 *   ENOSYS: return 0.
 *   EINVAL: return 1.
 *
 * A call that does not return -1, or any other errno value, aborts the
 * process.
 */
int rseq_available(void)
{
	if (sys_rseq(NULL, 0, 0, 0) != -1)
		abort();

	if (errno == ENOSYS)
		return 0;
	if (errno == EINVAL)
		return 1;

	abort();
}
106
107 /* The rseq areas need to be at least 32 bytes. */
108 static
get_rseq_min_alloc_size(void)109 unsigned int get_rseq_min_alloc_size(void)
110 {
111 unsigned int alloc_size = rseq_size;
112
113 if (alloc_size < ORIG_RSEQ_ALLOC_SIZE)
114 alloc_size = ORIG_RSEQ_ALLOC_SIZE;
115 return alloc_size;
116 }
117
118 /*
119 * Return the feature size supported by the kernel.
120 *
121 * Depending on the value returned by getauxval(AT_RSEQ_FEATURE_SIZE):
122 *
123 * 0: Return ORIG_RSEQ_FEATURE_SIZE (20)
124 * > 0: Return the value from getauxval(AT_RSEQ_FEATURE_SIZE).
125 *
126 * It should never return a value below ORIG_RSEQ_FEATURE_SIZE.
127 */
128 static
get_rseq_kernel_feature_size(void)129 unsigned int get_rseq_kernel_feature_size(void)
130 {
131 unsigned long auxv_rseq_feature_size, auxv_rseq_align;
132
133 auxv_rseq_align = getauxval(AT_RSEQ_ALIGN);
134 assert(!auxv_rseq_align || auxv_rseq_align <= RSEQ_THREAD_AREA_ALLOC_SIZE);
135
136 auxv_rseq_feature_size = getauxval(AT_RSEQ_FEATURE_SIZE);
137 assert(!auxv_rseq_feature_size || auxv_rseq_feature_size <= RSEQ_THREAD_AREA_ALLOC_SIZE);
138 if (auxv_rseq_feature_size)
139 return auxv_rseq_feature_size;
140 else
141 return ORIG_RSEQ_FEATURE_SIZE;
142 }
143
rseq_register_current_thread(void)144 int rseq_register_current_thread(void)
145 {
146 int rc;
147
148 if (!rseq_ownership) {
149 /* Treat libc's ownership as a successful registration. */
150 return 0;
151 }
152 rc = sys_rseq(&__rseq_abi, get_rseq_min_alloc_size(), 0, RSEQ_SIG);
153 if (rc) {
154 /*
155 * After at least one thread has registered successfully
156 * (rseq_size > 0), the registration of other threads should
157 * never fail.
158 */
159 if (RSEQ_READ_ONCE(rseq_size) > 0) {
160 /* Incoherent success/failure within process. */
161 abort();
162 }
163 return -1;
164 }
165 assert(rseq_current_cpu_raw() >= 0);
166
167 /*
168 * The first thread to register sets the rseq_size to mimic the libc
169 * behavior.
170 */
171 if (RSEQ_READ_ONCE(rseq_size) == 0) {
172 RSEQ_WRITE_ONCE(rseq_size, get_rseq_kernel_feature_size());
173 }
174
175 return 0;
176 }
177
rseq_unregister_current_thread(void)178 int rseq_unregister_current_thread(void)
179 {
180 int rc;
181
182 if (!rseq_ownership) {
183 /* Treat libc's ownership as a successful unregistration. */
184 return 0;
185 }
186 rc = sys_rseq(&__rseq_abi, get_rseq_min_alloc_size(), RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG);
187 if (rc)
188 return -1;
189 return 0;
190 }
191
192 static __attribute__((constructor))
rseq_init(void)193 void rseq_init(void)
194 {
195 /*
196 * If the libc's registered rseq size isn't already valid, it may be
197 * because the binary is dynamically linked and not necessarily due to
198 * libc not having registered a restartable sequence. Try to find the
199 * symbols if that's the case.
200 */
201 if (!*libc_rseq_size_p) {
202 libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset");
203 libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size");
204 libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags");
205 }
206 if (libc_rseq_size_p && libc_rseq_offset_p && libc_rseq_flags_p &&
207 *libc_rseq_size_p != 0) {
208 unsigned int libc_rseq_size;
209
210 /* rseq registration owned by glibc */
211 rseq_offset = *libc_rseq_offset_p;
212 libc_rseq_size = *libc_rseq_size_p;
213 rseq_flags = *libc_rseq_flags_p;
214
215 /*
216 * Previous versions of glibc expose the value
217 * 32 even though the kernel only supported 20
218 * bytes initially. Therefore treat 32 as a
219 * special-case. glibc 2.40 exposes a 20 bytes
220 * __rseq_size without using getauxval(3) to
221 * query the supported size, while still allocating a 32
222 * bytes area. Also treat 20 as a special-case.
223 *
224 * Special-cases are handled by using the following
225 * value as active feature set size:
226 *
227 * rseq_size = min(32, get_rseq_kernel_feature_size())
228 */
229 switch (libc_rseq_size) {
230 case ORIG_RSEQ_FEATURE_SIZE:
231 fallthrough;
232 case ORIG_RSEQ_ALLOC_SIZE:
233 {
234 unsigned int rseq_kernel_feature_size = get_rseq_kernel_feature_size();
235
236 if (rseq_kernel_feature_size < ORIG_RSEQ_ALLOC_SIZE)
237 rseq_size = rseq_kernel_feature_size;
238 else
239 rseq_size = ORIG_RSEQ_ALLOC_SIZE;
240 break;
241 }
242 default:
243 /* Otherwise just use the __rseq_size from libc as rseq_size. */
244 rseq_size = libc_rseq_size;
245 break;
246 }
247 return;
248 }
249 rseq_ownership = 1;
250
251 /* Calculate the offset of the rseq area from the thread pointer. */
252 rseq_offset = (void *)&__rseq_abi - rseq_thread_pointer();
253
254 /* rseq flags are deprecated, always set to 0. */
255 rseq_flags = 0;
256
257 /*
258 * Set the size to 0 until at least one thread registers to mimic the
259 * libc behavior.
260 */
261 rseq_size = 0;
262 }
263
264 static __attribute__((destructor))
rseq_exit(void)265 void rseq_exit(void)
266 {
267 if (!rseq_ownership)
268 return;
269 rseq_offset = 0;
270 rseq_size = -1U;
271 rseq_ownership = 0;
272 }
273
/*
 * Return the CPU number reported by sched_getcpu().
 *
 * A negative result from sched_getcpu() is reported with perror() and
 * aborts the process, so the returned value is never negative.
 */
int32_t rseq_fallback_current_cpu(void)
{
	int32_t cpu = sched_getcpu();

	if (cpu >= 0)
		return cpu;

	perror("sched_getcpu()");
	abort();
}
285
/*
 * Return the node id reported by the getcpu system call.
 *
 * When sys_getcpu() fails, the error is reported with perror() and the
 * non-zero sys_getcpu() result is returned instead of a node id.
 */
int32_t rseq_fallback_current_node(void)
{
	uint32_t cpu, node;
	int rc = sys_getcpu(&cpu, &node);

	if (rc == 0)
		return (int32_t) node;

	perror("sys_getcpu()");
	return rc;
}
298