// SPDX-License-Identifier: GPL-2.0-or-later
/* kiocb-using read/write
 *
 * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/mount.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/falloc.h>
#include <linux/sched/mm.h>
#include <trace/events/fscache.h>
#include "internal.h"

struct cachefiles_kiocb {
	struct kiocb		iocb;
	refcount_t		ki_refcnt;
	loff_t			start;
	union {
		size_t		skipped;
		size_t		len;
	};
	struct cachefiles_object *object;
	netfs_io_terminated_t	term_func;
	void			*term_func_priv;
	bool			was_async;
	unsigned int		inval_counter;	/* Copy of cookie->inval_counter */
	u64			b_writing;
};

static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki)
{
	if (refcount_dec_and_test(&ki->ki_refcnt)) {
		cachefiles_put_object(ki->object, cachefiles_obj_put_ioreq);
		fput(ki->iocb.ki_filp);
		kfree(ki);
	}
}

/*
 * Handle completion of a read from the cache.
 */
static void cachefiles_read_complete(struct kiocb *iocb, long ret)
{
	struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
	struct inode *inode = file_inode(ki->iocb.ki_filp);

	_enter("%ld", ret);

	if (ret < 0)
		trace_cachefiles_io_error(ki->object, inode, ret,
					  cachefiles_trace_read_error);

	if (ki->term_func) {
		if (ret >= 0) {
			if (ki->object->cookie->inval_counter == ki->inval_counter)
				ki->skipped += ret;
			else
				ret = -ESTALE;
		}

		ki->term_func(ki->term_func_priv, ret, ki->was_async);
	}

	cachefiles_put_kiocb(ki);
}
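
/*
 * Note on kiocb lifetime: each request takes two references on its
 * cachefiles_kiocb (refcount_set(&ki->ki_refcnt, 2) in the submission paths
 * below) - one for the submitting thread and one for the completion handler -
 * so whichever side finishes last does the freeing.  A minimal sketch of the
 * pattern:
 *
 *	refcount_set(&ki->ki_refcnt, 2);
 *	...submit the kiocb...		// completion path drops one ref
 *	cachefiles_put_kiocb(ki);	// submitter drops the other
 *
 * cachefiles_put_kiocb() only puts the object and file and frees the kiocb
 * once both references are gone, so it is safe whichever path runs first.
 */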
/*
 * Initiate a read from the cache.
 */
static int cachefiles_read(struct netfs_cache_resources *cres,
			   loff_t start_pos,
			   struct iov_iter *iter,
			   enum netfs_read_from_hole read_hole,
			   netfs_io_terminated_t term_func,
			   void *term_func_priv)
{
	struct cachefiles_object *object;
	struct cachefiles_kiocb *ki;
	struct file *file;
	unsigned int old_nofs;
	ssize_t ret = -ENOBUFS;
	size_t len = iov_iter_count(iter), skipped = 0;

	if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ))
		goto presubmission_error;

	fscache_count_read();
	object = cachefiles_cres_object(cres);
	file = cachefiles_cres_file(cres);

	_enter("%pD,%li,%llx,%zx/%llx",
	       file, file_inode(file)->i_ino, start_pos, len,
	       i_size_read(file_inode(file)));

	/* If the caller asked us to seek for data before doing the read, then
	 * we should do that now.  If we find a gap, we fill it with zeros.
	 */
	if (read_hole != NETFS_READ_HOLE_IGNORE) {
		loff_t off = start_pos, off2;

		off2 = cachefiles_inject_read_error();
		if (off2 == 0)
			off2 = vfs_llseek(file, off, SEEK_DATA);
		if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO && off2 != -ENXIO) {
			skipped = 0;
			ret = off2;
			goto presubmission_error;
		}

		if (off2 == -ENXIO || off2 >= start_pos + len) {
			/* The region is beyond the EOF or there's no more data
			 * in the region, so clear the rest of the buffer and
			 * return success.
			 */
			ret = -ENODATA;
			if (read_hole == NETFS_READ_HOLE_FAIL)
				goto presubmission_error;

			iov_iter_zero(len, iter);
			skipped = len;
			ret = 0;
			goto presubmission_error;
		}

		skipped = off2 - off;
		iov_iter_zero(skipped, iter);
	}

	ret = -ENOMEM;
	ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL);
	if (!ki)
		goto presubmission_error;

	refcount_set(&ki->ki_refcnt, 2);
	ki->iocb.ki_filp	= file;
	ki->iocb.ki_pos		= start_pos + skipped;
	ki->iocb.ki_flags	= IOCB_DIRECT;
	ki->iocb.ki_ioprio	= get_current_ioprio();
	ki->skipped		= skipped;
	ki->object		= object;
	ki->inval_counter	= cres->inval_counter;
	ki->term_func		= term_func;
	ki->term_func_priv	= term_func_priv;
	ki->was_async		= true;

	if (ki->term_func)
		ki->iocb.ki_complete = cachefiles_read_complete;

	get_file(ki->iocb.ki_filp);
	cachefiles_grab_object(object, cachefiles_obj_get_ioreq);

	trace_cachefiles_read(object, file_inode(file), ki->iocb.ki_pos, len - skipped);
	old_nofs = memalloc_nofs_save();
	ret = cachefiles_inject_read_error();
	if (ret == 0)
		ret = vfs_iocb_iter_read(file, &ki->iocb, iter);
	memalloc_nofs_restore(old_nofs);
	switch (ret) {
	case -EIOCBQUEUED:
		goto in_progress;

	case -ERESTARTSYS:
	case -ERESTARTNOINTR:
	case -ERESTARTNOHAND:
	case -ERESTART_RESTARTBLOCK:
		/* There's no easy way to restart the syscall since other AIO's
		 * may be already running. Just fail this IO with EINTR.
		 */
		ret = -EINTR;
		fallthrough;
	default:
		ki->was_async = false;
		cachefiles_read_complete(&ki->iocb, ret);
		if (ret > 0)
			ret = 0;
		break;
	}

in_progress:
	cachefiles_put_kiocb(ki);
	_leave(" = %zd", ret);
	return ret;

presubmission_error:
	if (term_func)
		term_func(term_func_priv, ret < 0 ? ret : skipped, false);
	return ret;
}
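
/*
 * For reference, how cachefiles_read() treats a hole at the start of the
 * requested region, per the netfs_read_from_hole mode it was given:
 *
 *	NETFS_READ_HOLE_IGNORE - issue the DIO read as-is; unwritten blocks
 *		in the backing file simply read back as zeroes.
 *	NETFS_READ_HOLE_CLEAR  - SEEK_DATA first; a leading hole is
 *		zero-filled and skipped, and a wholly absent region counts
 *		as a successful read of zeroes.
 *	NETFS_READ_HOLE_FAIL   - as CLEAR, but fail with -ENODATA if the
 *		region turns out to be entirely a hole.
 */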
/*
 * Query the occupancy of the cache in a region, returning where the next chunk
 * of data starts and how long it is.
 */
static int cachefiles_query_occupancy(struct netfs_cache_resources *cres,
				      loff_t start, size_t len, size_t granularity,
				      loff_t *_data_start, size_t *_data_len)
{
	struct cachefiles_object *object;
	struct file *file;
	loff_t off, off2;

	*_data_start = -1;
	*_data_len = 0;

	if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ))
		return -ENOBUFS;

	object = cachefiles_cres_object(cres);
	file = cachefiles_cres_file(cres);
	granularity = max_t(size_t, object->volume->cache->bsize, granularity);

	_enter("%pD,%li,%llx,%zx/%llx",
	       file, file_inode(file)->i_ino, start, len,
	       i_size_read(file_inode(file)));

	off = cachefiles_inject_read_error();
	if (off == 0)
		off = vfs_llseek(file, start, SEEK_DATA);
	if (off == -ENXIO)
		return -ENODATA; /* Beyond EOF */
	if (off < 0 && off >= (loff_t)-MAX_ERRNO)
		return -ENOBUFS; /* Error. */
	if (round_up(off, granularity) >= start + len)
		return -ENODATA; /* No data in range */

	off2 = cachefiles_inject_read_error();
	if (off2 == 0)
		off2 = vfs_llseek(file, off, SEEK_HOLE);
	if (off2 == -ENXIO)
		return -ENODATA; /* Beyond EOF */
	if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO)
		return -ENOBUFS; /* Error. */

	/* Round away partial blocks */
	off = round_up(off, granularity);
	off2 = round_down(off2, granularity);
	if (off2 <= off)
		return -ENODATA;

	*_data_start = off;
	if (off2 > start + len)
		*_data_len = len;
	else
		*_data_len = off2 - off;
	return 0;
}

/*
 * Handle completion of a write to the cache.
 */
static void cachefiles_write_complete(struct kiocb *iocb, long ret)
{
	struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
	struct cachefiles_object *object = ki->object;
	struct inode *inode = file_inode(ki->iocb.ki_filp);

	_enter("%ld", ret);

	/* Tell lockdep we inherited freeze protection from submission thread */
	__sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
	__sb_end_write(inode->i_sb, SB_FREEZE_WRITE);

	if (ret < 0)
		trace_cachefiles_io_error(object, inode, ret,
					  cachefiles_trace_write_error);

	atomic_long_sub(ki->b_writing, &object->volume->cache->b_writing);
	set_bit(FSCACHE_COOKIE_HAVE_DATA, &object->cookie->flags);
	if (ki->term_func)
		ki->term_func(ki->term_func_priv, ret, ki->was_async);
	cachefiles_put_kiocb(ki);
}
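
/*
 * Note on freeze protection: __cachefiles_write() below open-codes
 * file_start_write() and then tells lockdep the lock was released, while
 * cachefiles_write_complete() above re-tells lockdep it holds the lock and
 * then really drops it.  The write thus carries SB_FREEZE_WRITE protection
 * from submission to completion even though the two may run in different
 * threads:
 *
 *	submission:	__sb_start_write(sb, SB_FREEZE_WRITE);
 *			__sb_writers_release(sb, SB_FREEZE_WRITE);
 *	completion:	__sb_writers_acquired(sb, SB_FREEZE_WRITE);
 *			__sb_end_write(sb, SB_FREEZE_WRITE);
 */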
/*
 * Initiate a write to the cache.
 */
int __cachefiles_write(struct cachefiles_object *object,
		       struct file *file,
		       loff_t start_pos,
		       struct iov_iter *iter,
		       netfs_io_terminated_t term_func,
		       void *term_func_priv)
{
	struct cachefiles_cache *cache;
	struct cachefiles_kiocb *ki;
	struct inode *inode;
	unsigned int old_nofs;
	ssize_t ret;
	size_t len = iov_iter_count(iter);

	fscache_count_write();
	cache = object->volume->cache;

	_enter("%pD,%li,%llx,%zx/%llx",
	       file, file_inode(file)->i_ino, start_pos, len,
	       i_size_read(file_inode(file)));

	ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL);
	if (!ki) {
		if (term_func)
			term_func(term_func_priv, -ENOMEM, false);
		return -ENOMEM;
	}

	refcount_set(&ki->ki_refcnt, 2);
	ki->iocb.ki_filp	= file;
	ki->iocb.ki_pos		= start_pos;
	ki->iocb.ki_flags	= IOCB_DIRECT | IOCB_WRITE;
	ki->iocb.ki_ioprio	= get_current_ioprio();
	ki->object		= object;
	ki->start		= start_pos;
	ki->len			= len;
	ki->term_func		= term_func;
	ki->term_func_priv	= term_func_priv;
	ki->was_async		= true;
	ki->b_writing		= (len + (1 << cache->bshift) - 1) >> cache->bshift;

	if (ki->term_func)
		ki->iocb.ki_complete = cachefiles_write_complete;
	atomic_long_add(ki->b_writing, &cache->b_writing);

	/* Open-code file_start_write here to grab freeze protection, which
	 * will be released by another thread in aio_complete_rw().  Fool
	 * lockdep by telling it the lock got released so that it doesn't
	 * complain about the held lock when we return to userspace.
	 */
	inode = file_inode(file);
	__sb_start_write(inode->i_sb, SB_FREEZE_WRITE);
	__sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);

	get_file(ki->iocb.ki_filp);
	cachefiles_grab_object(object, cachefiles_obj_get_ioreq);

	trace_cachefiles_write(object, inode, ki->iocb.ki_pos, len);
	old_nofs = memalloc_nofs_save();
	ret = cachefiles_inject_write_error();
	if (ret == 0)
		ret = vfs_iocb_iter_write(file, &ki->iocb, iter);
	memalloc_nofs_restore(old_nofs);
	switch (ret) {
	case -EIOCBQUEUED:
		goto in_progress;

	case -ERESTARTSYS:
	case -ERESTARTNOINTR:
	case -ERESTARTNOHAND:
	case -ERESTART_RESTARTBLOCK:
		/* There's no easy way to restart the syscall since other AIO's
		 * may be already running. Just fail this IO with EINTR.
		 */
		ret = -EINTR;
		fallthrough;
	default:
		ki->was_async = false;
		cachefiles_write_complete(&ki->iocb, ret);
		if (ret > 0)
			ret = 0;
		break;
	}

in_progress:
	cachefiles_put_kiocb(ki);
	_leave(" = %zd", ret);
	return ret;
}

static int cachefiles_write(struct netfs_cache_resources *cres,
			    loff_t start_pos,
			    struct iov_iter *iter,
			    netfs_io_terminated_t term_func,
			    void *term_func_priv)
{
	if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) {
		if (term_func)
			term_func(term_func_priv, -ENOBUFS, false);
		return -ENOBUFS;
	}

	return __cachefiles_write(cachefiles_cres_object(cres),
				  cachefiles_cres_file(cres),
				  start_pos, iter,
				  term_func, term_func_priv);
}
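
/*
 * Both submission paths above bracket the VFS call with
 * memalloc_nofs_save()/memalloc_nofs_restore().  That marks the task so that
 * any allocation made by the backing filesystem is implicitly GFP_NOFS and
 * memory reclaim cannot recurse back into filesystem writeback while the
 * cache I/O is being submitted:
 *
 *	old_nofs = memalloc_nofs_save();
 *	ret = vfs_iocb_iter_write(file, &ki->iocb, iter);
 *	memalloc_nofs_restore(old_nofs);
 */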
static inline enum netfs_io_source
cachefiles_do_prepare_read(struct netfs_cache_resources *cres,
			   loff_t start, size_t *_len, loff_t i_size,
			   unsigned long *_flags, ino_t netfs_ino)
{
	enum cachefiles_prepare_read_trace why;
	struct cachefiles_object *object = NULL;
	struct cachefiles_cache *cache;
	struct fscache_cookie *cookie = fscache_cres_cookie(cres);
	const struct cred *saved_cred;
	struct file *file = cachefiles_cres_file(cres);
	enum netfs_io_source ret = NETFS_DOWNLOAD_FROM_SERVER;
	size_t len = *_len;
	loff_t off, to;
	ino_t ino = file ? file_inode(file)->i_ino : 0;
	int rc;

	_enter("%zx @%llx/%llx", len, start, i_size);

	if (start >= i_size) {
		ret = NETFS_FILL_WITH_ZEROES;
		why = cachefiles_trace_read_after_eof;
		goto out_no_object;
	}

	if (test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) {
		__set_bit(NETFS_SREQ_COPY_TO_CACHE, _flags);
		why = cachefiles_trace_read_no_data;
		if (!test_bit(NETFS_SREQ_ONDEMAND, _flags))
			goto out_no_object;
	}

	/* The object and the file may be being created in the background. */
	if (!file) {
		why = cachefiles_trace_read_no_file;
		if (!fscache_wait_for_operation(cres, FSCACHE_WANT_READ))
			goto out_no_object;
		file = cachefiles_cres_file(cres);
		if (!file)
			goto out_no_object;
		ino = file_inode(file)->i_ino;
	}

	object = cachefiles_cres_object(cres);
	cache = object->volume->cache;
	cachefiles_begin_secure(cache, &saved_cred);
retry:
	off = cachefiles_inject_read_error();
	if (off == 0)
		off = vfs_llseek(file, start, SEEK_DATA);
	if (off < 0 && off >= (loff_t)-MAX_ERRNO) {
		if (off == (loff_t)-ENXIO) {
			why = cachefiles_trace_read_seek_nxio;
			goto download_and_store;
		}
		trace_cachefiles_io_error(object, file_inode(file), off,
					  cachefiles_trace_seek_error);
		why = cachefiles_trace_read_seek_error;
		goto out;
	}

	if (off >= start + len) {
		why = cachefiles_trace_read_found_hole;
		goto download_and_store;
	}

	if (off > start) {
		off = round_up(off, cache->bsize);
		len = off - start;
		*_len = len;
		why = cachefiles_trace_read_found_part;
		goto download_and_store;
	}

	to = cachefiles_inject_read_error();
	if (to == 0)
		to = vfs_llseek(file, start, SEEK_HOLE);
	if (to < 0 && to >= (loff_t)-MAX_ERRNO) {
		trace_cachefiles_io_error(object, file_inode(file), to,
					  cachefiles_trace_seek_error);
		why = cachefiles_trace_read_seek_error;
		goto out;
	}

	if (to < start + len) {
		if (start + len >= i_size)
			to = round_up(to, cache->bsize);
		else
			to = round_down(to, cache->bsize);
		len = to - start;
		*_len = len;
	}

	why = cachefiles_trace_read_have_data;
	ret = NETFS_READ_FROM_CACHE;
	goto out;

download_and_store:
	__set_bit(NETFS_SREQ_COPY_TO_CACHE, _flags);
	if (test_bit(NETFS_SREQ_ONDEMAND, _flags)) {
		rc = cachefiles_ondemand_read(object, start, len);
		if (!rc) {
			__clear_bit(NETFS_SREQ_ONDEMAND, _flags);
			goto retry;
		}
		ret = NETFS_INVALID_READ;
	}
out:
	cachefiles_end_secure(cache, saved_cred);
out_no_object:
	trace_cachefiles_prep_read(object, start, len, *_flags, ret, why, ino, netfs_ino);
	return ret;
}

/*
 * Prepare a read operation, shortening it to a cached/uncached
 * boundary as appropriate.
 */
static enum netfs_io_source cachefiles_prepare_read(struct netfs_io_subrequest *subreq,
						    loff_t i_size)
{
	return cachefiles_do_prepare_read(&subreq->rreq->cache_resources,
					  subreq->start, &subreq->len, i_size,
					  &subreq->flags, subreq->rreq->inode->i_ino);
}

/*
 * Prepare an on-demand read operation, shortening it to a cached/uncached
 * boundary as appropriate.
 */
static enum netfs_io_source
cachefiles_prepare_ondemand_read(struct netfs_cache_resources *cres,
				 loff_t start, size_t *_len, loff_t i_size,
				 unsigned long *_flags, ino_t ino)
{
	return cachefiles_do_prepare_read(cres, start, _len, i_size, _flags, ino);
}
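
/*
 * In summary, cachefiles_do_prepare_read() picks the I/O source from a
 * SEEK_DATA/SEEK_HOLE probe of the backing file:
 *
 *	start beyond i_size		-> NETFS_FILL_WITH_ZEROES
 *	no data within the request	-> NETFS_DOWNLOAD_FROM_SERVER
 *					   (and set NETFS_SREQ_COPY_TO_CACHE)
 *	data starts partway in		-> shorten *_len to the uncached part,
 *					   then download and store
 *	data present at the start	-> shorten *_len to the cached extent
 *					   and NETFS_READ_FROM_CACHE
 */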
/*
 * Prepare for a write to occur.
 */
int __cachefiles_prepare_write(struct cachefiles_object *object,
			       struct file *file,
			       loff_t *_start, size_t *_len,
			       bool no_space_allocated_yet)
{
	struct cachefiles_cache *cache = object->volume->cache;
	loff_t start = *_start, pos;
	size_t len = *_len, down;
	int ret;

	/* Round to DIO size */
	down = start - round_down(start, PAGE_SIZE);
	*_start = start - down;
	*_len = round_up(down + len, PAGE_SIZE);

	/* We need to work out whether there's sufficient disk space to perform
	 * the write - but we can skip that check if we have space already
	 * allocated.
	 */
	if (no_space_allocated_yet)
		goto check_space;

	pos = cachefiles_inject_read_error();
	if (pos == 0)
		pos = vfs_llseek(file, *_start, SEEK_DATA);
	if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) {
		if (pos == -ENXIO)
			goto check_space; /* Unallocated tail */
		trace_cachefiles_io_error(object, file_inode(file), pos,
					  cachefiles_trace_seek_error);
		return pos;
	}
	if ((u64)pos >= (u64)*_start + *_len)
		goto check_space; /* Unallocated region */

	/* We have a block that's at least partially filled - if we're low on
	 * space, we need to see if it's fully allocated.  If it's not, we may
	 * want to cull it.
	 */
	if (cachefiles_has_space(cache, 0, *_len / PAGE_SIZE,
				 cachefiles_has_space_check) == 0)
		return 0; /* Enough space to simply overwrite the whole block */

	pos = cachefiles_inject_read_error();
	if (pos == 0)
		pos = vfs_llseek(file, *_start, SEEK_HOLE);
	if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) {
		trace_cachefiles_io_error(object, file_inode(file), pos,
					  cachefiles_trace_seek_error);
		return pos;
	}
	if ((u64)pos >= (u64)*_start + *_len)
		return 0; /* Fully allocated */

	/* Partially allocated, but insufficient space: cull. */
	fscache_count_no_write_space();
	ret = cachefiles_inject_remove_error();
	if (ret == 0)
		ret = vfs_fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				    *_start, *_len);
	if (ret < 0) {
		trace_cachefiles_io_error(object, file_inode(file), ret,
					  cachefiles_trace_fallocate_error);
		cachefiles_io_error_obj(object,
					"CacheFiles: fallocate failed (%d)\n", ret);
		ret = -EIO;
	}

	return ret;

check_space:
	return cachefiles_has_space(cache, 0, *_len / PAGE_SIZE,
				    cachefiles_has_space_for_write);
}

static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
				    loff_t *_start, size_t *_len, loff_t i_size,
				    bool no_space_allocated_yet)
{
	struct cachefiles_object *object = cachefiles_cres_object(cres);
	struct cachefiles_cache *cache = object->volume->cache;
	const struct cred *saved_cred;
	int ret;

	if (!cachefiles_cres_file(cres)) {
		if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE))
			return -ENOBUFS;
		if (!cachefiles_cres_file(cres))
			return -ENOBUFS;
	}

	cachefiles_begin_secure(cache, &saved_cred);
	ret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres),
					 _start, _len,
					 no_space_allocated_yet);
	cachefiles_end_secure(cache, saved_cred);
	return ret;
}
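
/*
 * Worked example of the DIO rounding in __cachefiles_prepare_write() with a
 * 4KiB PAGE_SIZE: for a write of 100 bytes at file offset 5000,
 *
 *	down    = 5000 - round_down(5000, 4096) = 904
 *	*_start = 5000 - 904                    = 4096
 *	*_len   = round_up(904 + 100, 4096)     = 4096
 *
 * i.e. the request is widened to the page-aligned block containing it, as
 * direct I/O on the backing file requires.
 */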
/*
 * Clean up an operation.
 */
static void cachefiles_end_operation(struct netfs_cache_resources *cres)
{
	struct file *file = cachefiles_cres_file(cres);

	if (file)
		fput(file);
	fscache_end_cookie_access(fscache_cres_cookie(cres), fscache_access_io_end);
}

static const struct netfs_cache_ops cachefiles_netfs_cache_ops = {
	.end_operation		= cachefiles_end_operation,
	.read			= cachefiles_read,
	.write			= cachefiles_write,
	.prepare_read		= cachefiles_prepare_read,
	.prepare_write		= cachefiles_prepare_write,
	.prepare_ondemand_read	= cachefiles_prepare_ondemand_read,
	.query_occupancy	= cachefiles_query_occupancy,
};

/*
 * Open the cache file when beginning a cache operation.
 */
bool cachefiles_begin_operation(struct netfs_cache_resources *cres,
				enum fscache_want_state want_state)
{
	struct cachefiles_object *object = cachefiles_cres_object(cres);

	if (!cachefiles_cres_file(cres)) {
		cres->ops = &cachefiles_netfs_cache_ops;
		if (object->file) {
			spin_lock(&object->lock);
			if (!cres->cache_priv2 && object->file)
				cres->cache_priv2 = get_file(object->file);
			spin_unlock(&object->lock);
		}
	}

	if (!cachefiles_cres_file(cres) && want_state != FSCACHE_WANT_PARAMS) {
		pr_err("failed to get cres->file\n");
		return false;
	}

	return true;
}
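
/*
 * For orientation, a rough sketch of how one cache operation is driven
 * through this vtable (the actual callers live in the fscache/netfs core,
 * not here):
 *
 *	cachefiles_begin_operation(cres, FSCACHE_WANT_READ);
 *	source = cres->ops->prepare_read(subreq, i_size);
 *	if (source == NETFS_READ_FROM_CACHE)
 *		cres->ops->read(cres, pos, iter, read_hole, term_func, priv);
 *	cres->ops->end_operation(cres);
 *
 * cachefiles_begin_operation() pins the backing file in cres->cache_priv2;
 * cachefiles_end_operation() puts it again and ends the cookie's I/O access.
 */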