1 // SPDX-License-Identifier: LGPL-2.1
2 /*
3  * rseq.c
4  *
5  * Copyright (C) 2016 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
6  *
7  * This library is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; only
10  * version 2.1 of the License.
11  *
12  * This library is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  */
17 
18 #define _GNU_SOURCE
19 #include <errno.h>
20 #include <sched.h>
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <unistd.h>
25 #include <syscall.h>
26 #include <assert.h>
27 #include <signal.h>
28 #include <limits.h>
29 #include <dlfcn.h>
30 #include <stddef.h>
31 #include <sys/auxv.h>
32 #include <linux/auxvec.h>
33 
34 #include <linux/compiler.h>
35 
36 #include "../kselftest.h"
37 #include "rseq.h"
38 
39 /*
40  * Define weak versions to play nice with binaries that are statically linked
41  * against a libc that doesn't support registering its own rseq.
42  */
43 __weak ptrdiff_t __rseq_offset;
44 __weak unsigned int __rseq_size;
45 __weak unsigned int __rseq_flags;
46 
47 static const ptrdiff_t *libc_rseq_offset_p = &__rseq_offset;
48 static const unsigned int *libc_rseq_size_p = &__rseq_size;
49 static const unsigned int *libc_rseq_flags_p = &__rseq_flags;
50 
/* Offset from the thread pointer to the rseq area. */
ptrdiff_t rseq_offset;

/*
 * Size of the registered rseq area. 0 if the registration was
 * unsuccessful. Statically initialized to -1U; rseq_exit() restores
 * that value when this library owns the registration.
 */
unsigned int rseq_size = -1U;

/* Flags used during rseq registration. */
unsigned int rseq_flags;

/* Non-zero when this library, rather than libc, owns the rseq registration. */
static int rseq_ownership;

/* Set once at least one rseq registration has succeeded. */
static int rseq_reg_success;
65 
66 /* Allocate a large area for the TLS. */
67 #define RSEQ_THREAD_AREA_ALLOC_SIZE	1024
68 
69 /* Original struct rseq feature size is 20 bytes. */
70 #define ORIG_RSEQ_FEATURE_SIZE		20
71 
72 /* Original struct rseq allocation size is 32 bytes. */
73 #define ORIG_RSEQ_ALLOC_SIZE		32
74 
75 static
76 __thread struct rseq_abi __rseq_abi __attribute__((tls_model("initial-exec"), aligned(RSEQ_THREAD_AREA_ALLOC_SIZE))) = {
77 	.cpu_id = RSEQ_ABI_CPU_ID_UNINITIALIZED,
78 };
79 
/*
 * Thin wrapper around the rseq system call, issued through syscall()
 * with __NR_rseq. The arguments are forwarded unchanged and the
 * syscall() result is returned narrowed to int.
 */
static int sys_rseq(struct rseq_abi *rseq_abi, uint32_t rseq_len,
		    int flags, uint32_t sig)
{
	long ret;

	ret = syscall(__NR_rseq, rseq_abi, rseq_len, flags, sig);
	return (int) ret;
}
85 
/*
 * Thin wrapper around the getcpu system call, issued through syscall()
 * with __NR_getcpu. The third syscall argument is always NULL. The
 * syscall() result is returned narrowed to int.
 */
static int sys_getcpu(unsigned *cpu, unsigned *node)
{
	long ret;

	ret = syscall(__NR_getcpu, cpu, node, NULL);
	return (int) ret;
}
90 
/*
 * Probe for the rseq system call by invoking it with all-zero arguments.
 *
 * Returns 0 when the call fails with ENOSYS and 1 when it fails with
 * EINVAL. Any other outcome (the call not returning -1, or a different
 * errno) aborts the process.
 */
int rseq_available(void)
{
	if (sys_rseq(NULL, 0, 0, 0) != -1)
		abort();

	if (errno == ENOSYS)
		return 0;
	if (errno == EINVAL)
		return 1;

	abort();
}
107 
108 /* The rseq areas need to be at least 32 bytes. */
109 static
110 unsigned int get_rseq_min_alloc_size(void)
111 {
112 	unsigned int alloc_size = rseq_size;
113 
114 	if (alloc_size < ORIG_RSEQ_ALLOC_SIZE)
115 		alloc_size = ORIG_RSEQ_ALLOC_SIZE;
116 	return alloc_size;
117 }
118 
119 /*
120  * Return the feature size supported by the kernel.
121  *
122  * Depending on the value returned by getauxval(AT_RSEQ_FEATURE_SIZE):
123  *
124  * 0:   Return ORIG_RSEQ_FEATURE_SIZE (20)
125  * > 0: Return the value from getauxval(AT_RSEQ_FEATURE_SIZE).
126  *
127  * It should never return a value below ORIG_RSEQ_FEATURE_SIZE.
128  */
129 static
130 unsigned int get_rseq_kernel_feature_size(void)
131 {
132 	unsigned long auxv_rseq_feature_size, auxv_rseq_align;
133 
134 	auxv_rseq_align = getauxval(AT_RSEQ_ALIGN);
135 	assert(!auxv_rseq_align || auxv_rseq_align <= RSEQ_THREAD_AREA_ALLOC_SIZE);
136 
137 	auxv_rseq_feature_size = getauxval(AT_RSEQ_FEATURE_SIZE);
138 	assert(!auxv_rseq_feature_size || auxv_rseq_feature_size <= RSEQ_THREAD_AREA_ALLOC_SIZE);
139 	if (auxv_rseq_feature_size)
140 		return auxv_rseq_feature_size;
141 	else
142 		return ORIG_RSEQ_FEATURE_SIZE;
143 }
144 
145 int rseq_register_current_thread(void)
146 {
147 	int rc;
148 
149 	if (!rseq_ownership) {
150 		/* Treat libc's ownership as a successful registration. */
151 		return 0;
152 	}
153 	rc = sys_rseq(&__rseq_abi, get_rseq_min_alloc_size(), 0, RSEQ_SIG);
154 	if (rc) {
155 		if (RSEQ_READ_ONCE(rseq_reg_success)) {
156 			/* Incoherent success/failure within process. */
157 			abort();
158 		}
159 		return -1;
160 	}
161 	assert(rseq_current_cpu_raw() >= 0);
162 	RSEQ_WRITE_ONCE(rseq_reg_success, 1);
163 	return 0;
164 }
165 
166 int rseq_unregister_current_thread(void)
167 {
168 	int rc;
169 
170 	if (!rseq_ownership) {
171 		/* Treat libc's ownership as a successful unregistration. */
172 		return 0;
173 	}
174 	rc = sys_rseq(&__rseq_abi, get_rseq_min_alloc_size(), RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG);
175 	if (rc)
176 		return -1;
177 	return 0;
178 }
179 
180 static __attribute__((constructor))
181 void rseq_init(void)
182 {
183 	/*
184 	 * If the libc's registered rseq size isn't already valid, it may be
185 	 * because the binary is dynamically linked and not necessarily due to
186 	 * libc not having registered a restartable sequence.  Try to find the
187 	 * symbols if that's the case.
188 	 */
189 	if (!*libc_rseq_size_p) {
190 		libc_rseq_offset_p = dlsym(RTLD_NEXT, "__rseq_offset");
191 		libc_rseq_size_p = dlsym(RTLD_NEXT, "__rseq_size");
192 		libc_rseq_flags_p = dlsym(RTLD_NEXT, "__rseq_flags");
193 	}
194 	if (libc_rseq_size_p && libc_rseq_offset_p && libc_rseq_flags_p &&
195 			*libc_rseq_size_p != 0) {
196 		unsigned int libc_rseq_size;
197 
198 		/* rseq registration owned by glibc */
199 		rseq_offset = *libc_rseq_offset_p;
200 		libc_rseq_size = *libc_rseq_size_p;
201 		rseq_flags = *libc_rseq_flags_p;
202 
203 		/*
204 		 * Previous versions of glibc expose the value
205 		 * 32 even though the kernel only supported 20
206 		 * bytes initially. Therefore treat 32 as a
207 		 * special-case. glibc 2.40 exposes a 20 bytes
208 		 * __rseq_size without using getauxval(3) to
209 		 * query the supported size, while still allocating a 32
210 		 * bytes area. Also treat 20 as a special-case.
211 		 *
212 		 * Special-cases are handled by using the following
213 		 * value as active feature set size:
214 		 *
215 		 *   rseq_size = min(32, get_rseq_kernel_feature_size())
216 		 */
217 		switch (libc_rseq_size) {
218 		case ORIG_RSEQ_FEATURE_SIZE:
219 			fallthrough;
220 		case ORIG_RSEQ_ALLOC_SIZE:
221 		{
222 			unsigned int rseq_kernel_feature_size = get_rseq_kernel_feature_size();
223 
224 			if (rseq_kernel_feature_size < ORIG_RSEQ_ALLOC_SIZE)
225 				rseq_size = rseq_kernel_feature_size;
226 			else
227 				rseq_size = ORIG_RSEQ_ALLOC_SIZE;
228 			break;
229 		}
230 		default:
231 			/* Otherwise just use the __rseq_size from libc as rseq_size. */
232 			rseq_size = libc_rseq_size;
233 			break;
234 		}
235 		return;
236 	}
237 	rseq_ownership = 1;
238 	if (!rseq_available()) {
239 		rseq_size = 0;
240 		return;
241 	}
242 	rseq_offset = (void *)&__rseq_abi - rseq_thread_pointer();
243 	rseq_flags = 0;
244 }
245 
246 static __attribute__((destructor))
247 void rseq_exit(void)
248 {
249 	if (!rseq_ownership)
250 		return;
251 	rseq_offset = 0;
252 	rseq_size = -1U;
253 	rseq_ownership = 0;
254 }
255 
/*
 * Return the current CPU number as reported by sched_getcpu().
 * If sched_getcpu() reports a negative value, print the error with
 * perror() and abort the process.
 */
int32_t rseq_fallback_current_cpu(void)
{
	int32_t cpu = sched_getcpu();

	if (cpu >= 0)
		return cpu;

	perror("sched_getcpu()");
	abort();
}
267 
/*
 * Return the current node id as reported by sys_getcpu().
 * On failure, print the error with perror() and return the non-zero
 * value that sys_getcpu() returned.
 */
int32_t rseq_fallback_current_node(void)
{
	uint32_t cpu_id, node_id;
	int ret = sys_getcpu(&cpu_id, &node_id);

	if (ret == 0)
		return (int32_t) node_id;

	perror("sys_getcpu()");
	return ret;
}
280