xref: /openbmc/qemu/util/userfaultfd.c (revision b4b9a0e3)
1 /*
2  * Linux UFFD-WP support
3  *
4  * Copyright Virtuozzo GmbH, 2020
5  *
6  * Authors:
7  *  Andrey Gruzdev   <andrey.gruzdev@virtuozzo.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2 or
10  * later.  See the COPYING file in the top-level directory.
11  */
12 
13 #include "qemu/osdep.h"
14 #include "qemu/bitops.h"
15 #include "qemu/error-report.h"
16 #include "qemu/userfaultfd.h"
17 #include "trace.h"
18 #include <poll.h>
19 #include <sys/syscall.h>
20 #include <sys/ioctl.h>
21 
22 /**
23  * uffd_query_features: query UFFD features
24  *
25  * Returns: 0 on success, negative value in case of an error
26  *
27  * @features: parameter to receive 'uffdio_api.features'
28  */
29 int uffd_query_features(uint64_t *features)
30 {
31     int uffd_fd;
32     struct uffdio_api api_struct = { 0 };
33     int ret = -1;
34 
35     uffd_fd = syscall(__NR_userfaultfd, O_CLOEXEC);
36     if (uffd_fd < 0) {
37         trace_uffd_query_features_nosys(errno);
38         return -1;
39     }
40 
41     api_struct.api = UFFD_API;
42     api_struct.features = 0;
43 
44     if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
45         trace_uffd_query_features_api_failed(errno);
46         goto out;
47     }
48     *features = api_struct.features;
49     ret = 0;
50 
51 out:
52     close(uffd_fd);
53     return ret;
54 }
55 
56 /**
57  * uffd_create_fd: create UFFD file descriptor
58  *
59  * Returns non-negative file descriptor or negative value in case of an error
60  *
61  * @features: UFFD features to request
62  * @non_blocking: create UFFD file descriptor for non-blocking operation
63  */
64 int uffd_create_fd(uint64_t features, bool non_blocking)
65 {
66     int uffd_fd;
67     int flags;
68     struct uffdio_api api_struct = { 0 };
69     uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
70 
71     flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0);
72     uffd_fd = syscall(__NR_userfaultfd, flags);
73     if (uffd_fd < 0) {
74         trace_uffd_create_fd_nosys(errno);
75         return -1;
76     }
77 
78     api_struct.api = UFFD_API;
79     api_struct.features = features;
80     if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
81         trace_uffd_create_fd_api_failed(errno);
82         goto fail;
83     }
84     if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
85         trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls);
86         goto fail;
87     }
88 
89     return uffd_fd;
90 
91 fail:
92     close(uffd_fd);
93     return -1;
94 }
95 
/**
 * uffd_close_fd: close UFFD file descriptor
 *
 * Asserts that the descriptor is non-negative and then closes it.
 * The result of close() is not reported to the caller.
 *
 * @uffd_fd: UFFD file descriptor, must be non-negative
 */
void uffd_close_fd(int uffd_fd)
{
    /* Passing a negative descriptor is treated as a programming error */
    assert(uffd_fd >= 0);

    close(uffd_fd);
}
106 
/**
 * uffd_register_memory: register memory range via UFFD-IO
 *
 * Issues UFFDIO_REGISTER for the given range and mode. When the caller
 * supplies @ioctls, the IOCTL mask filled in by the kernel is passed back.
 *
 * Returns 0 in case of success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...)
 * @ioctls: optional pointer to receive supported IOCTL mask; may be NULL
 *          and is written only on success
 */
int uffd_register_memory(int uffd_fd, void *addr, uint64_t length,
        uint64_t mode, uint64_t *ioctls)
{
    struct uffdio_register reg = {
        .range = { .start = (uintptr_t) addr, .len = length },
        .mode = mode,
    };

    if (ioctl(uffd_fd, UFFDIO_REGISTER, &reg) != 0) {
        trace_uffd_register_memory_failed(addr, length, mode, errno);
        return -1;
    }

    if (ioctls != NULL) {
        *ioctls = reg.ioctls;
    }
    return 0;
}
137 
/**
 * uffd_unregister_memory: un-register memory range with UFFD-IO
 *
 * Issues UFFDIO_UNREGISTER for the given range; a failure is traced
 * together with errno.
 *
 * Returns 0 in case of success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 */
int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length)
{
    struct uffdio_range range = {
        .start = (uintptr_t) addr,
        .len = length,
    };

    if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &range) != 0) {
        trace_uffd_unregister_memory_failed(addr, length, errno);
        return -1;
    }
    return 0;
}
161 
/**
 * uffd_change_protection: protect/un-protect memory range for writes via UFFD-IO
 *
 * Issues UFFDIO_WRITEPROTECT for the range. The mode is WP when @wp is
 * set, DONTWAKE when protection is being released with @dont_wake set,
 * and 0 otherwise; @dont_wake is ignored when @wp is set.
 *
 * Returns 0 on success, negative value in case of error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 * @wp: write-protect/unprotect
 * @dont_wake: do not wake threads waiting on wr-protected page
 */
int uffd_change_protection(int uffd_fd, void *addr, uint64_t length,
        bool wp, bool dont_wake)
{
    struct uffdio_writeprotect wp_args = {
        .range = { .start = (uintptr_t) addr, .len = length },
        .mode = 0,
    };

    if (wp) {
        wp_args.mode = UFFDIO_WRITEPROTECT_MODE_WP;
    } else if (dont_wake) {
        /* DONTWAKE is meaningful only on protection release */
        wp_args.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
    }

    if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &wp_args) != 0) {
        error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64
                " mode=%" PRIx64 " errno=%i",
                addr, length, (uint64_t) wp_args.mode, errno);
        return -1;
    }
    return 0;
}
196 
/**
 * uffd_copy_page: copy range of pages to destination via UFFD-IO
 *
 * Issues UFFDIO_COPY so that source pages are copied into the destination
 * range, resolving a missing page fault somewhere in that range.
 * A failure is reported through error_report() together with errno.
 *
 * Returns 0 on success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @dst_addr: destination base address
 * @src_addr: source base address
 * @length: length of the range to copy
 * @dont_wake: do not wake threads waiting on missing page
 */
int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr,
        uint64_t length, bool dont_wake)
{
    struct uffdio_copy copy_args = {
        .dst = (uintptr_t) dst_addr,
        .src = (uintptr_t) src_addr,
        .len = length,
        .mode = 0,
    };

    if (dont_wake) {
        copy_args.mode = UFFDIO_COPY_MODE_DONTWAKE;
    }

    if (ioctl(uffd_fd, UFFDIO_COPY, &copy_args) != 0) {
        error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64
                " mode=%" PRIx64 " errno=%i",
                dst_addr, src_addr, length, (uint64_t) copy_args.mode, errno);
        return -1;
    }
    return 0;
}
230 
/**
 * uffd_zero_page: fill range of pages with zeroes via UFFD-IO
 *
 * Issues UFFDIO_ZEROPAGE for the range in order to resolve a missing page
 * fault within it. A failure is reported through error_report() together
 * with errno.
 *
 * Returns 0 on success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address
 * @length: length of the range to fill with zeroes
 * @dont_wake: do not wake threads waiting on missing page
 */
int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake)
{
    struct uffdio_zeropage zero_args = {
        .range = { .start = (uintptr_t) addr, .len = length },
        .mode = 0,
    };

    if (dont_wake) {
        zero_args.mode = UFFDIO_ZEROPAGE_MODE_DONTWAKE;
    }

    if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &zero_args) != 0) {
        error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64
                " mode=%" PRIx64 " errno=%i",
                addr, length, (uint64_t) zero_args.mode, errno);
        return -1;
    }
    return 0;
}
260 
/**
 * uffd_wakeup: wake up threads waiting on UFFD-managed page fault resolution
 *
 * Issues UFFDIO_WAKE for the designated range so that threads waiting on
 * any page of it are woken up. The intended use is to resolve a batch of
 * faults with the MODE_DONTWAKE flag set and then satisfy all waits for
 * the whole range with a single call to this function.
 *
 * Returns 0 on success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address
 * @length: length of the range
 */
int uffd_wakeup(int uffd_fd, void *addr, uint64_t length)
{
    struct uffdio_range range = {
        .start = (uintptr_t) addr,
        .len = length,
    };

    if (ioctl(uffd_fd, UFFDIO_WAKE, &range) != 0) {
        error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i",
                addr, length, errno);
        return -1;
    }
    return 0;
}
290 
/**
 * uffd_read_events: read pending UFFD events
 *
 * Reads up to @count messages from the descriptor, retrying when the
 * read is interrupted by a signal (EINTR). EAGAIN is not treated as an
 * error: it is reported as "no messages available".
 *
 * Returns number of fetched messages, 0 if none is available or
 * negative value in case of an error (including a non-positive @count)
 *
 * @uffd_fd: UFFD file descriptor
 * @msgs: pointer to message buffer
 * @count: number of messages that can fit in the buffer, must be positive
 */
int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count)
{
    ssize_t res;

    /*
     * Reject bogus sizes up front: a negative count would be converted to
     * a huge size_t below and let the read run past the caller's buffer.
     */
    if (count <= 0) {
        error_report("uffd_read_events() failed: invalid count=%i", count);
        return -1;
    }

    do {
        res = read(uffd_fd, msgs, (size_t) count * sizeof(struct uffd_msg));
    } while (res < 0 && errno == EINTR);

    if (res < 0) {
        if (errno == EAGAIN) {
            /* Nothing pending at the moment */
            return 0;
        }
        error_report("uffd_read_events() failed: errno=%i", errno);
        return -1;
    }

    return (int) (res / sizeof(struct uffd_msg));
}
318 
/**
 * uffd_poll_events: poll UFFD file descriptor for read
 *
 * Polls the descriptor for POLLIN, restarting the wait when it is
 * interrupted by a signal (EINTR). Any other poll() failure is reported
 * through error_report() and yields false, the same as a timeout.
 *
 * Returns true if events are available for read, false otherwise
 *
 * @uffd_fd: UFFD file descriptor
 * @tmo: timeout value, passed to poll() unchanged
 */
bool uffd_poll_events(int uffd_fd, int tmo)
{
    struct pollfd pfd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 };
    int nready;

    for (;;) {
        nready = poll(&pfd, 1, tmo);
        if (nready >= 0 || errno != EINTR) {
            break;
        }
    }

    if (nready < 0) {
        error_report("uffd_poll_events() failed: errno=%i", errno);
        return false;
    }

    /* nready == 0 means the timeout expired with nothing to read */
    return nready > 0 && (pfd.revents & POLLIN) != 0;
}
346