1 /*
2 * Linux UFFD-WP support
3 *
4 * Copyright Virtuozzo GmbH, 2020
5 *
6 * Authors:
7 * Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
8 *
9 * This work is licensed under the terms of the GNU GPL, version 2 or
10 * later. See the COPYING file in the top-level directory.
11 */
12
13 #include "qemu/osdep.h"
14 #include "qemu/bitops.h"
15 #include "qemu/error-report.h"
16 #include "qemu/userfaultfd.h"
17 #include "trace.h"
18 #include <poll.h>
19 #include <sys/syscall.h>
20 #include <sys/ioctl.h>
21
/*
 * Strategy used by uffd_open() to obtain new userfaultfd descriptors.
 */
typedef enum {
    /* Not probed yet; this is the value of a zero-initialized variable. */
    UFFD_UNINITIALIZED = 0,
    /* Descriptors are created via an ioctl on /dev/userfaultfd. */
    UFFD_USE_DEV_PATH = 1,
    /* Descriptors are created via the userfaultfd system call. */
    UFFD_USE_SYSCALL = 2,
} uffd_open_mode;
27
uffd_open(int flags)28 int uffd_open(int flags)
29 {
30 #if defined(__NR_userfaultfd)
31 static uffd_open_mode open_mode;
32 static int uffd_dev;
33
34 /* Detect how to generate uffd desc when run the 1st time */
35 if (open_mode == UFFD_UNINITIALIZED) {
36 /*
37 * Make /dev/userfaultfd the default approach because it has better
38 * permission controls, meanwhile allows kernel faults without any
39 * privilege requirement (e.g. SYS_CAP_PTRACE).
40 */
41 uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
42 if (uffd_dev >= 0) {
43 open_mode = UFFD_USE_DEV_PATH;
44 } else {
45 /* Fallback to the system call */
46 open_mode = UFFD_USE_SYSCALL;
47 }
48 trace_uffd_detect_open_mode(open_mode);
49 }
50
51 if (open_mode == UFFD_USE_DEV_PATH) {
52 assert(uffd_dev >= 0);
53 return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags);
54 }
55
56 return syscall(__NR_userfaultfd, flags);
57 #else
58 return -EINVAL;
59 #endif
60 }
61
62 /**
63 * uffd_query_features: query UFFD features
64 *
65 * Returns: 0 on success, negative value in case of an error
66 *
67 * @features: parameter to receive 'uffdio_api.features'
68 */
uffd_query_features(uint64_t * features)69 int uffd_query_features(uint64_t *features)
70 {
71 int uffd_fd;
72 struct uffdio_api api_struct = { 0 };
73 int ret = -1;
74
75 uffd_fd = uffd_open(O_CLOEXEC);
76 if (uffd_fd < 0) {
77 trace_uffd_query_features_nosys(errno);
78 return -1;
79 }
80
81 api_struct.api = UFFD_API;
82 api_struct.features = 0;
83
84 if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
85 trace_uffd_query_features_api_failed(errno);
86 goto out;
87 }
88 *features = api_struct.features;
89 ret = 0;
90
91 out:
92 close(uffd_fd);
93 return ret;
94 }
95
96 /**
97 * uffd_create_fd: create UFFD file descriptor
98 *
99 * Returns non-negative file descriptor or negative value in case of an error
100 *
101 * @features: UFFD features to request
102 * @non_blocking: create UFFD file descriptor for non-blocking operation
103 */
uffd_create_fd(uint64_t features,bool non_blocking)104 int uffd_create_fd(uint64_t features, bool non_blocking)
105 {
106 int uffd_fd;
107 int flags;
108 struct uffdio_api api_struct = { 0 };
109 uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
110
111 flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0);
112 uffd_fd = uffd_open(flags);
113 if (uffd_fd < 0) {
114 trace_uffd_create_fd_nosys(errno);
115 return -1;
116 }
117
118 api_struct.api = UFFD_API;
119 api_struct.features = features;
120 if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
121 trace_uffd_create_fd_api_failed(errno);
122 goto fail;
123 }
124 if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
125 trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls);
126 goto fail;
127 }
128
129 return uffd_fd;
130
131 fail:
132 close(uffd_fd);
133 return -1;
134 }
135
/**
 * uffd_close_fd: close UFFD file descriptor
 *
 * Passing a negative descriptor trips an assertion.
 *
 * @uffd_fd: UFFD file descriptor
 */
void uffd_close_fd(int uffd_fd)
{
    /* A negative descriptor is a caller bug, not a runtime condition */
    assert(!(uffd_fd < 0));

    close(uffd_fd);
}
146
/**
 * uffd_register_memory: register memory range via UFFD-IO
 *
 * Issues UFFDIO_REGISTER for the given range and mode.
 *
 * Returns 0 in case of success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...)
 * @ioctls: optional pointer to receive supported IOCTL mask; may be NULL,
 *          written only on success
 */
int uffd_register_memory(int uffd_fd, void *addr, uint64_t length,
                         uint64_t mode, uint64_t *ioctls)
{
    struct uffdio_register reg;

    reg.mode = mode;
    reg.range.len = length;
    reg.range.start = (uintptr_t) addr;

    if (ioctl(uffd_fd, UFFDIO_REGISTER, &reg) == 0) {
        /* Hand back the ioctl mask only to callers that asked for it */
        if (ioctls != NULL) {
            *ioctls = reg.ioctls;
        }
        return 0;
    }

    trace_uffd_register_memory_failed(addr, length, mode, errno);
    return -1;
}
177
/**
 * uffd_unregister_memory: un-register memory range with UFFD-IO
 *
 * Issues UFFDIO_UNREGISTER for the given range.
 *
 * Returns 0 in case of success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 */
int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length)
{
    struct uffdio_range range = {
        .start = (uintptr_t) addr,
        .len = length,
    };

    if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &range) == 0) {
        return 0;
    }

    trace_uffd_unregister_memory_failed(addr, length, errno);
    return -1;
}
201
/**
 * uffd_change_protection: protect/un-protect memory range for writes via UFFD-IO
 *
 * Issues UFFDIO_WRITEPROTECT for the given range.  @dont_wake is honoured
 * only when @wp is false; when protecting, it is ignored.
 *
 * Returns 0 on success, negative value in case of error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 * @wp: write-protect/unprotect
 * @dont_wake: do not wake threads waiting on wr-protected page
 */
int uffd_change_protection(int uffd_fd, void *addr, uint64_t length,
                           bool wp, bool dont_wake)
{
    struct uffdio_writeprotect req;

    if (wp) {
        req.mode = UFFDIO_WRITEPROTECT_MODE_WP;
    } else if (dont_wake) {
        /* DONTWAKE is meaningful only on protection release */
        req.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
    } else {
        req.mode = 0;
    }
    req.range.len = length;
    req.range.start = (uintptr_t) addr;

    if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &req) == 0) {
        return 0;
    }

    error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64
                 " mode=%" PRIx64 " errno=%i", addr, length,
                 (uint64_t) req.mode, errno);
    return -1;
}
236
/**
 * uffd_copy_page: copy range of pages to destination via UFFD-IO
 *
 * Issues UFFDIO_COPY to copy a range of source pages to the destination,
 * resolving a missing page fault somewhere in the destination range.
 *
 * Returns 0 on success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @dst_addr: destination base address
 * @src_addr: source base address
 * @length: length of the range to copy
 * @dont_wake: do not wake threads waiting on missing page
 */
int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr,
                   uint64_t length, bool dont_wake)
{
    struct uffdio_copy req;

    req.mode = 0;
    if (dont_wake) {
        req.mode = UFFDIO_COPY_MODE_DONTWAKE;
    }
    req.len = length;
    req.src = (uintptr_t) src_addr;
    req.dst = (uintptr_t) dst_addr;

    if (ioctl(uffd_fd, UFFDIO_COPY, &req) == 0) {
        return 0;
    }

    error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64
                 " mode=%" PRIx64 " errno=%i", dst_addr, src_addr,
                 length, (uint64_t) req.mode, errno);
    return -1;
}
270
/**
 * uffd_zero_page: fill range of pages with zeroes via UFFD-IO
 *
 * Issues UFFDIO_ZEROPAGE to fill the range with zeroes, resolving a
 * missing page fault within the range.
 *
 * Returns 0 on success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address
 * @length: length of the range to fill with zeroes
 * @dont_wake: do not wake threads waiting on missing page
 */
int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake)
{
    struct uffdio_zeropage req;

    req.mode = 0;
    if (dont_wake) {
        req.mode = UFFDIO_ZEROPAGE_MODE_DONTWAKE;
    }
    req.range.len = length;
    req.range.start = (uintptr_t) addr;

    if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &req) == 0) {
        return 0;
    }

    error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64
                 " mode=%" PRIx64 " errno=%i", addr, length,
                 (uint64_t) req.mode, errno);
    return -1;
}
300
/**
 * uffd_wakeup: wake up threads waiting on UFFD-managed page fault resolution
 *
 * Wake up threads waiting on any page/pages from the designated range.
 * The main use case is when, during some period, page faults are resolved
 * via UFFD-IO IOCTLs with the MODE_DONTWAKE flag set; afterwards all waits
 * for the whole memory range are satisfied by a single uffd_wakeup() call.
 *
 * Returns 0 on success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address
 * @length: length of the range
 */
int uffd_wakeup(int uffd_fd, void *addr, uint64_t length)
{
    struct uffdio_range range = {
        .start = (uintptr_t) addr,
        .len = length,
    };

    if (ioctl(uffd_fd, UFFDIO_WAKE, &range) == 0) {
        return 0;
    }

    error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i",
                 addr, length, errno);
    return -1;
}
330
/**
 * uffd_read_events: read pending UFFD events
 *
 * Reads up to @count messages from the descriptor, retrying the read
 * when it is interrupted (EINTR).  EAGAIN is reported as "no messages"
 * rather than as an error.
 *
 * Returns number of fetched messages, 0 if none is available or
 * negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @msgs: pointer to message buffer
 * @count: number of messages that can fit in the buffer
 */
int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count)
{
    ssize_t nread;

    for (;;) {
        nread = read(uffd_fd, msgs, count * sizeof(struct uffd_msg));
        if (nread >= 0) {
            /* Convert the byte count into a number of whole messages */
            return (int) (nread / sizeof(struct uffd_msg));
        }
        if (errno != EINTR) {
            break;
        }
    }

    if (errno == EAGAIN) {
        return 0;
    }

    error_report("uffd_read_events() failed: errno=%i", errno);
    return -1;
}
358
/**
 * uffd_poll_events: poll UFFD file descriptor for read
 *
 * Polls the descriptor for POLLIN, retrying when the call is interrupted
 * (EINTR).  A timeout and a poll failure both yield false; the failure is
 * additionally reported.
 *
 * Returns true if events are available for read, false otherwise
 *
 * @uffd_fd: UFFD file descriptor
 * @tmo: timeout value, passed unchanged to poll()
 */
bool uffd_poll_events(int uffd_fd, int tmo)
{
    struct pollfd pfd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 };
    int nready;

    for (;;) {
        nready = poll(&pfd, 1, tmo);
        if (nready >= 0) {
            break;
        }
        if (errno != EINTR) {
            error_report("uffd_poll_events() failed: errno=%i", errno);
            return false;
        }
    }

    if (nready == 0) {
        /* Timed out without any event */
        return false;
    }

    return (pfd.revents & POLLIN) != 0;
}
386