1 /* 2 * Linux UFFD-WP support 3 * 4 * Copyright Virtuozzo GmbH, 2020 5 * 6 * Authors: 7 * Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> 8 * 9 * This work is licensed under the terms of the GNU GPL, version 2 or 10 * later. See the COPYING file in the top-level directory. 11 */ 12 13 #include "qemu/osdep.h" 14 #include "qemu/bitops.h" 15 #include "qemu/error-report.h" 16 #include "qemu/userfaultfd.h" 17 #include "trace.h" 18 #include <poll.h> 19 #include <sys/syscall.h> 20 #include <sys/ioctl.h> 21 22 typedef enum { 23 UFFD_UNINITIALIZED = 0, 24 UFFD_USE_DEV_PATH, 25 UFFD_USE_SYSCALL, 26 } uffd_open_mode; 27 28 int uffd_open(int flags) 29 { 30 #if defined(__NR_userfaultfd) 31 static uffd_open_mode open_mode; 32 static int uffd_dev; 33 34 /* Detect how to generate uffd desc when run the 1st time */ 35 if (open_mode == UFFD_UNINITIALIZED) { 36 /* 37 * Make /dev/userfaultfd the default approach because it has better 38 * permission controls, meanwhile allows kernel faults without any 39 * privilege requirement (e.g. SYS_CAP_PTRACE). 40 */ 41 uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); 42 if (uffd_dev >= 0) { 43 open_mode = UFFD_USE_DEV_PATH; 44 } else { 45 /* Fallback to the system call */ 46 open_mode = UFFD_USE_SYSCALL; 47 } 48 trace_uffd_detect_open_mode(open_mode); 49 } 50 51 if (open_mode == UFFD_USE_DEV_PATH) { 52 assert(uffd_dev >= 0); 53 return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags); 54 } 55 56 return syscall(__NR_userfaultfd, flags); 57 #else 58 return -EINVAL; 59 #endif 60 } 61 62 /** 63 * uffd_query_features: query UFFD features 64 * 65 * Returns: 0 on success, negative value in case of an error 66 * 67 * @features: parameter to receive 'uffdio_api.features' 68 */ 69 int uffd_query_features(uint64_t *features) 70 { 71 int uffd_fd; 72 struct uffdio_api api_struct = { 0 }; 73 int ret = -1; 74 75 uffd_fd = uffd_open(O_CLOEXEC); 76 if (uffd_fd < 0) { 77 trace_uffd_query_features_nosys(errno); 78 return -1; 79 } 80 81 api_struct.api = UFFD_API; 82 api_struct.features = 0; 83 84 if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) { 85 trace_uffd_query_features_api_failed(errno); 86 goto out; 87 } 88 *features = api_struct.features; 89 ret = 0; 90 91 out: 92 close(uffd_fd); 93 return ret; 94 } 95 96 /** 97 * uffd_create_fd: create UFFD file descriptor 98 * 99 * Returns non-negative file descriptor or negative value in case of an error 100 * 101 * @features: UFFD features to request 102 * @non_blocking: create UFFD file descriptor for non-blocking operation 103 */ 104 int uffd_create_fd(uint64_t features, bool non_blocking) 105 { 106 int uffd_fd; 107 int flags; 108 struct uffdio_api api_struct = { 0 }; 109 uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER); 110 111 flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0); 112 uffd_fd = uffd_open(flags); 113 if (uffd_fd < 0) { 114 trace_uffd_create_fd_nosys(errno); 115 return -1; 116 } 117 118 api_struct.api = UFFD_API; 119 api_struct.features = features; 120 if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) { 121 trace_uffd_create_fd_api_failed(errno); 122 goto fail; 123 } 124 if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) { 125 trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls); 126 goto fail; 127 } 128 129 return uffd_fd; 130 131 fail: 132 close(uffd_fd); 133 return -1; 134 } 135 136 /** 137 * uffd_close_fd: close UFFD file descriptor 138 * 139 * @uffd_fd: UFFD file descriptor 140 */ 141 void uffd_close_fd(int uffd_fd) 142 { 143 assert(uffd_fd >= 0); 144 close(uffd_fd); 145 } 146 147 /** 148 * uffd_register_memory: register memory range via UFFD-IO 149 * 150 * Returns 0 in case of success, negative value in case of an error 151 * 152 * @uffd_fd: UFFD file descriptor 153 * @addr: base address of memory range 154 * @length: length of memory range 155 * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...) 156 * @ioctls: optional pointer to receive supported IOCTL mask 157 */ 158 int uffd_register_memory(int uffd_fd, void *addr, uint64_t length, 159 uint64_t mode, uint64_t *ioctls) 160 { 161 struct uffdio_register uffd_register; 162 163 uffd_register.range.start = (uintptr_t) addr; 164 uffd_register.range.len = length; 165 uffd_register.mode = mode; 166 167 if (ioctl(uffd_fd, UFFDIO_REGISTER, &uffd_register)) { 168 trace_uffd_register_memory_failed(addr, length, mode, errno); 169 return -1; 170 } 171 if (ioctls) { 172 *ioctls = uffd_register.ioctls; 173 } 174 175 return 0; 176 } 177 178 /** 179 * uffd_unregister_memory: un-register memory range with UFFD-IO 180 * 181 * Returns 0 in case of success, negative value in case of an error 182 * 183 * @uffd_fd: UFFD file descriptor 184 * @addr: base address of memory range 185 * @length: length of memory range 186 */ 187 int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length) 188 { 189 struct uffdio_range uffd_range; 190 191 uffd_range.start = (uintptr_t) addr; 192 uffd_range.len = length; 193 194 if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &uffd_range)) { 195 trace_uffd_unregister_memory_failed(addr, length, errno); 196 return -1; 197 } 198 199 return 0; 200 } 201 202 /** 203 * uffd_change_protection: protect/un-protect memory range for writes via UFFD-IO 204 * 205 * Returns 0 on success, negative value in case of error 206 * 207 * @uffd_fd: UFFD file descriptor 208 * @addr: base address of memory range 209 * @length: length of memory range 210 * @wp: write-protect/unprotect 211 * @dont_wake: do not wake threads waiting on wr-protected page 212 */ 213 int uffd_change_protection(int uffd_fd, void *addr, uint64_t length, 214 bool wp, bool dont_wake) 215 { 216 struct uffdio_writeprotect uffd_writeprotect; 217 218 uffd_writeprotect.range.start = (uintptr_t) addr; 219 uffd_writeprotect.range.len = length; 220 if (!wp && dont_wake) { 221 /* DONTWAKE is meaningful only on protection release */ 222 uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE; 223 } else { 224 uffd_writeprotect.mode = (wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0); 225 } 226 227 if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) { 228 error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64 229 " mode=%" PRIx64 " errno=%i", addr, length, 230 (uint64_t) uffd_writeprotect.mode, errno); 231 return -1; 232 } 233 234 return 0; 235 } 236 237 /** 238 * uffd_copy_page: copy range of pages to destination via UFFD-IO 239 * 240 * Copy range of source pages to the destination to resolve 241 * missing page fault somewhere in the destination range. 242 * 243 * Returns 0 on success, negative value in case of an error 244 * 245 * @uffd_fd: UFFD file descriptor 246 * @dst_addr: destination base address 247 * @src_addr: source base address 248 * @length: length of the range to copy 249 * @dont_wake: do not wake threads waiting on missing page 250 */ 251 int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr, 252 uint64_t length, bool dont_wake) 253 { 254 struct uffdio_copy uffd_copy; 255 256 uffd_copy.dst = (uintptr_t) dst_addr; 257 uffd_copy.src = (uintptr_t) src_addr; 258 uffd_copy.len = length; 259 uffd_copy.mode = dont_wake ? UFFDIO_COPY_MODE_DONTWAKE : 0; 260 261 if (ioctl(uffd_fd, UFFDIO_COPY, &uffd_copy)) { 262 error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64 263 " mode=%" PRIx64 " errno=%i", dst_addr, src_addr, 264 length, (uint64_t) uffd_copy.mode, errno); 265 return -1; 266 } 267 268 return 0; 269 } 270 271 /** 272 * uffd_zero_page: fill range of pages with zeroes via UFFD-IO 273 * 274 * Fill range pages with zeroes to resolve missing page fault within the range. 275 * 276 * Returns 0 on success, negative value in case of an error 277 * 278 * @uffd_fd: UFFD file descriptor 279 * @addr: base address 280 * @length: length of the range to fill with zeroes 281 * @dont_wake: do not wake threads waiting on missing page 282 */ 283 int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake) 284 { 285 struct uffdio_zeropage uffd_zeropage; 286 287 uffd_zeropage.range.start = (uintptr_t) addr; 288 uffd_zeropage.range.len = length; 289 uffd_zeropage.mode = dont_wake ? UFFDIO_ZEROPAGE_MODE_DONTWAKE : 0; 290 291 if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &uffd_zeropage)) { 292 error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64 293 " mode=%" PRIx64 " errno=%i", addr, length, 294 (uint64_t) uffd_zeropage.mode, errno); 295 return -1; 296 } 297 298 return 0; 299 } 300 301 /** 302 * uffd_wakeup: wake up threads waiting on page UFFD-managed page fault resolution 303 * 304 * Wake up threads waiting on any page/pages from the designated range. 305 * The main use case is when during some period, page faults are resolved 306 * via UFFD-IO IOCTLs with MODE_DONTWAKE flag set, then after that all waits 307 * for the whole memory range are satisfied in a single call to uffd_wakeup(). 308 * 309 * Returns 0 on success, negative value in case of an error 310 * 311 * @uffd_fd: UFFD file descriptor 312 * @addr: base address 313 * @length: length of the range 314 */ 315 int uffd_wakeup(int uffd_fd, void *addr, uint64_t length) 316 { 317 struct uffdio_range uffd_range; 318 319 uffd_range.start = (uintptr_t) addr; 320 uffd_range.len = length; 321 322 if (ioctl(uffd_fd, UFFDIO_WAKE, &uffd_range)) { 323 error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i", 324 addr, length, errno); 325 return -1; 326 } 327 328 return 0; 329 } 330 331 /** 332 * uffd_read_events: read pending UFFD events 333 * 334 * Returns number of fetched messages, 0 if non is available or 335 * negative value in case of an error 336 * 337 * @uffd_fd: UFFD file descriptor 338 * @msgs: pointer to message buffer 339 * @count: number of messages that can fit in the buffer 340 */ 341 int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count) 342 { 343 ssize_t res; 344 do { 345 res = read(uffd_fd, msgs, count * sizeof(struct uffd_msg)); 346 } while (res < 0 && errno == EINTR); 347 348 if ((res < 0 && errno == EAGAIN)) { 349 return 0; 350 } 351 if (res < 0) { 352 error_report("uffd_read_events() failed: errno=%i", errno); 353 return -1; 354 } 355 356 return (int) (res / sizeof(struct uffd_msg)); 357 } 358 359 /** 360 * uffd_poll_events: poll UFFD file descriptor for read 361 * 362 * Returns true if events are available for read, false otherwise 363 * 364 * @uffd_fd: UFFD file descriptor 365 * @tmo: timeout value 366 */ 367 bool uffd_poll_events(int uffd_fd, int tmo) 368 { 369 int res; 370 struct pollfd poll_fd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 }; 371 372 do { 373 res = poll(&poll_fd, 1, tmo); 374 } while (res < 0 && errno == EINTR); 375 376 if (res == 0) { 377 return false; 378 } 379 if (res < 0) { 380 error_report("uffd_poll_events() failed: errno=%i", errno); 381 return false; 382 } 383 384 return (poll_fd.revents & POLLIN) != 0; 385 } 386