1 /* 2 * Linux UFFD-WP support 3 * 4 * Copyright Virtuozzo GmbH, 2020 5 * 6 * Authors: 7 * Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2 or 10 * later. See the COPYING file in the top-level directory. 11 */ 12 13 #include "qemu/osdep.h" 14 #include "qemu/bitops.h" 15 #include "qemu/error-report.h" 16 #include "qemu/userfaultfd.h" 17 #include "trace.h" 18 #include <poll.h> 19 #include <sys/syscall.h> 20 #include <sys/ioctl.h> 21 #include <fcntl.h> 22 23 typedef enum { 24 UFFD_UNINITIALIZED = 0, 25 UFFD_USE_DEV_PATH, 26 UFFD_USE_SYSCALL, 27 } uffd_open_mode; 28 29 int uffd_open(int flags) 30 { 31 #if defined(__NR_userfaultfd) 32 static uffd_open_mode open_mode; 33 static int uffd_dev; 34 35 /* Detect how to generate uffd desc when run the 1st time */ 36 if (open_mode == UFFD_UNINITIALIZED) { 37 /* 38 * Make /dev/userfaultfd the default approach because it has better 39 * permission controls, meanwhile allows kernel faults without any 40 * privilege requirement (e.g. SYS_CAP_PTRACE). 41 */ 42 uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); 43 if (uffd_dev >= 0) { 44 open_mode = UFFD_USE_DEV_PATH; 45 } else { 46 /* Fallback to the system call */ 47 open_mode = UFFD_USE_SYSCALL; 48 } 49 trace_uffd_detect_open_mode(open_mode); 50 } 51 52 if (open_mode == UFFD_USE_DEV_PATH) { 53 assert(uffd_dev >= 0); 54 return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags); 55 } 56 57 return syscall(__NR_userfaultfd, flags); 58 #else 59 return -EINVAL; 60 #endif 61 } 62 63 /** 64 * uffd_query_features: query UFFD features 65 * 66 * Returns: 0 on success, negative value in case of an error 67 * 68 * @features: parameter to receive 'uffdio_api.features' 69 */ 70 int uffd_query_features(uint64_t *features) 71 { 72 int uffd_fd; 73 struct uffdio_api api_struct = { 0 }; 74 int ret = -1; 75 76 uffd_fd = uffd_open(O_CLOEXEC); 77 if (uffd_fd < 0) { 78 trace_uffd_query_features_nosys(errno); 79 return -1; 80 } 81 82 api_struct.api = UFFD_API; 83 api_struct.features = 0; 84 85 if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) { 86 trace_uffd_query_features_api_failed(errno); 87 goto out; 88 } 89 *features = api_struct.features; 90 ret = 0; 91 92 out: 93 close(uffd_fd); 94 return ret; 95 } 96 97 /** 98 * uffd_create_fd: create UFFD file descriptor 99 * 100 * Returns non-negative file descriptor or negative value in case of an error 101 * 102 * @features: UFFD features to request 103 * @non_blocking: create UFFD file descriptor for non-blocking operation 104 */ 105 int uffd_create_fd(uint64_t features, bool non_blocking) 106 { 107 int uffd_fd; 108 int flags; 109 struct uffdio_api api_struct = { 0 }; 110 uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER); 111 112 flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0); 113 uffd_fd = uffd_open(flags); 114 if (uffd_fd < 0) { 115 trace_uffd_create_fd_nosys(errno); 116 return -1; 117 } 118 119 api_struct.api = UFFD_API; 120 api_struct.features = features; 121 if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) { 122 trace_uffd_create_fd_api_failed(errno); 123 goto fail; 124 } 125 if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) { 126 trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls); 127 goto fail; 128 } 129 130 return uffd_fd; 131 132 fail: 133 close(uffd_fd); 134 return -1; 135 } 136 137 /** 138 * uffd_close_fd: close UFFD file descriptor 139 * 140 * @uffd_fd: UFFD file descriptor 141 */ 142 void uffd_close_fd(int uffd_fd) 143 { 144 assert(uffd_fd >= 0); 145 close(uffd_fd); 146 } 147 148 /** 149 * uffd_register_memory: register memory range via UFFD-IO 150 * 151 * Returns 0 in case of success, negative value in case of an error 152 * 153 * @uffd_fd: UFFD file descriptor 154 * @addr: base address of memory range 155 * @length: length of memory range 156 * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...) 157 * @ioctls: optional pointer to receive supported IOCTL mask 158 */ 159 int uffd_register_memory(int uffd_fd, void *addr, uint64_t length, 160 uint64_t mode, uint64_t *ioctls) 161 { 162 struct uffdio_register uffd_register; 163 164 uffd_register.range.start = (uintptr_t) addr; 165 uffd_register.range.len = length; 166 uffd_register.mode = mode; 167 168 if (ioctl(uffd_fd, UFFDIO_REGISTER, &uffd_register)) { 169 trace_uffd_register_memory_failed(addr, length, mode, errno); 170 return -1; 171 } 172 if (ioctls) { 173 *ioctls = uffd_register.ioctls; 174 } 175 176 return 0; 177 } 178 179 /** 180 * uffd_unregister_memory: un-register memory range with UFFD-IO 181 * 182 * Returns 0 in case of success, negative value in case of an error 183 * 184 * @uffd_fd: UFFD file descriptor 185 * @addr: base address of memory range 186 * @length: length of memory range 187 */ 188 int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length) 189 { 190 struct uffdio_range uffd_range; 191 192 uffd_range.start = (uintptr_t) addr; 193 uffd_range.len = length; 194 195 if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &uffd_range)) { 196 trace_uffd_unregister_memory_failed(addr, length, errno); 197 return -1; 198 } 199 200 return 0; 201 } 202 203 /** 204 * uffd_change_protection: protect/un-protect memory range for writes via UFFD-IO 205 * 206 * Returns 0 on success, negative value in case of error 207 * 208 * @uffd_fd: UFFD file descriptor 209 * @addr: base address of memory range 210 * @length: length of memory range 211 * @wp: write-protect/unprotect 212 * @dont_wake: do not wake threads waiting on wr-protected page 213 */ 214 int uffd_change_protection(int uffd_fd, void *addr, uint64_t length, 215 bool wp, bool dont_wake) 216 { 217 struct uffdio_writeprotect uffd_writeprotect; 218 219 uffd_writeprotect.range.start = (uintptr_t) addr; 220 uffd_writeprotect.range.len = length; 221 if (!wp && dont_wake) { 222 /* DONTWAKE is meaningful only on protection release */ 223 uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE; 224 } else { 225 uffd_writeprotect.mode = (wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0); 226 } 227 228 if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) { 229 error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64 230 " mode=%" PRIx64 " errno=%i", addr, length, 231 (uint64_t) uffd_writeprotect.mode, errno); 232 return -1; 233 } 234 235 return 0; 236 } 237 238 /** 239 * uffd_copy_page: copy range of pages to destination via UFFD-IO 240 * 241 * Copy range of source pages to the destination to resolve 242 * missing page fault somewhere in the destination range. 243 * 244 * Returns 0 on success, negative value in case of an error 245 * 246 * @uffd_fd: UFFD file descriptor 247 * @dst_addr: destination base address 248 * @src_addr: source base address 249 * @length: length of the range to copy 250 * @dont_wake: do not wake threads waiting on missing page 251 */ 252 int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr, 253 uint64_t length, bool dont_wake) 254 { 255 struct uffdio_copy uffd_copy; 256 257 uffd_copy.dst = (uintptr_t) dst_addr; 258 uffd_copy.src = (uintptr_t) src_addr; 259 uffd_copy.len = length; 260 uffd_copy.mode = dont_wake ? UFFDIO_COPY_MODE_DONTWAKE : 0; 261 262 if (ioctl(uffd_fd, UFFDIO_COPY, &uffd_copy)) { 263 error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64 264 " mode=%" PRIx64 " errno=%i", dst_addr, src_addr, 265 length, (uint64_t) uffd_copy.mode, errno); 266 return -1; 267 } 268 269 return 0; 270 } 271 272 /** 273 * uffd_zero_page: fill range of pages with zeroes via UFFD-IO 274 * 275 * Fill range pages with zeroes to resolve missing page fault within the range. 276 * 277 * Returns 0 on success, negative value in case of an error 278 * 279 * @uffd_fd: UFFD file descriptor 280 * @addr: base address 281 * @length: length of the range to fill with zeroes 282 * @dont_wake: do not wake threads waiting on missing page 283 */ 284 int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake) 285 { 286 struct uffdio_zeropage uffd_zeropage; 287 288 uffd_zeropage.range.start = (uintptr_t) addr; 289 uffd_zeropage.range.len = length; 290 uffd_zeropage.mode = dont_wake ? UFFDIO_ZEROPAGE_MODE_DONTWAKE : 0; 291 292 if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &uffd_zeropage)) { 293 error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64 294 " mode=%" PRIx64 " errno=%i", addr, length, 295 (uint64_t) uffd_zeropage.mode, errno); 296 return -1; 297 } 298 299 return 0; 300 } 301 302 /** 303 * uffd_wakeup: wake up threads waiting on page UFFD-managed page fault resolution 304 * 305 * Wake up threads waiting on any page/pages from the designated range. 306 * The main use case is when during some period, page faults are resolved 307 * via UFFD-IO IOCTLs with MODE_DONTWAKE flag set, then after that all waits 308 * for the whole memory range are satisfied in a single call to uffd_wakeup(). 309 * 310 * Returns 0 on success, negative value in case of an error 311 * 312 * @uffd_fd: UFFD file descriptor 313 * @addr: base address 314 * @length: length of the range 315 */ 316 int uffd_wakeup(int uffd_fd, void *addr, uint64_t length) 317 { 318 struct uffdio_range uffd_range; 319 320 uffd_range.start = (uintptr_t) addr; 321 uffd_range.len = length; 322 323 if (ioctl(uffd_fd, UFFDIO_WAKE, &uffd_range)) { 324 error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i", 325 addr, length, errno); 326 return -1; 327 } 328 329 return 0; 330 } 331 332 /** 333 * uffd_read_events: read pending UFFD events 334 * 335 * Returns number of fetched messages, 0 if non is available or 336 * negative value in case of an error 337 * 338 * @uffd_fd: UFFD file descriptor 339 * @msgs: pointer to message buffer 340 * @count: number of messages that can fit in the buffer 341 */ 342 int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count) 343 { 344 ssize_t res; 345 do { 346 res = read(uffd_fd, msgs, count * sizeof(struct uffd_msg)); 347 } while (res < 0 && errno == EINTR); 348 349 if ((res < 0 && errno == EAGAIN)) { 350 return 0; 351 } 352 if (res < 0) { 353 error_report("uffd_read_events() failed: errno=%i", errno); 354 return -1; 355 } 356 357 return (int) (res / sizeof(struct uffd_msg)); 358 } 359 360 /** 361 * uffd_poll_events: poll UFFD file descriptor for read 362 * 363 * Returns true if events are available for read, false otherwise 364 * 365 * @uffd_fd: UFFD file descriptor 366 * @tmo: timeout value 367 */ 368 bool uffd_poll_events(int uffd_fd, int tmo) 369 { 370 int res; 371 struct pollfd poll_fd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 }; 372 373 do { 374 res = poll(&poll_fd, 1, tmo); 375 } while (res < 0 && errno == EINTR); 376 377 if (res == 0) { 378 return false; 379 } 380 if (res < 0) { 381 error_report("uffd_poll_events() failed: errno=%i", errno); 382 return false; 383 } 384 385 return (poll_fd.revents & POLLIN) != 0; 386 } 387