1 /* 2 FUSE: Filesystem in Userspace 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> 4 5 This program can be distributed under the terms of the GNU GPL. 6 See the file COPYING. 7 */ 8 9 #include "fuse_i.h" 10 11 #include <linux/pagemap.h> 12 #include <linux/slab.h> 13 #include <linux/kernel.h> 14 #include <linux/sched.h> 15 #include <linux/sched/signal.h> 16 #include <linux/module.h> 17 #include <linux/swap.h> 18 #include <linux/falloc.h> 19 #include <linux/uio.h> 20 #include <linux/fs.h> 21 22 static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, 23 unsigned int open_flags, int opcode, 24 struct fuse_open_out *outargp) 25 { 26 struct fuse_open_in inarg; 27 FUSE_ARGS(args); 28 29 memset(&inarg, 0, sizeof(inarg)); 30 inarg.flags = open_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); 31 if (!fm->fc->atomic_o_trunc) 32 inarg.flags &= ~O_TRUNC; 33 34 if (fm->fc->handle_killpriv_v2 && 35 (inarg.flags & O_TRUNC) && !capable(CAP_FSETID)) { 36 inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; 37 } 38 39 args.opcode = opcode; 40 args.nodeid = nodeid; 41 args.in_numargs = 1; 42 args.in_args[0].size = sizeof(inarg); 43 args.in_args[0].value = &inarg; 44 args.out_numargs = 1; 45 args.out_args[0].size = sizeof(*outargp); 46 args.out_args[0].value = outargp; 47 48 return fuse_simple_request(fm, &args); 49 } 50 51 struct fuse_release_args { 52 struct fuse_args args; 53 struct fuse_release_in inarg; 54 struct inode *inode; 55 }; 56 57 struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) 58 { 59 struct fuse_file *ff; 60 61 ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT); 62 if (unlikely(!ff)) 63 return NULL; 64 65 ff->fm = fm; 66 ff->release_args = kzalloc(sizeof(*ff->release_args), 67 GFP_KERNEL_ACCOUNT); 68 if (!ff->release_args) { 69 kfree(ff); 70 return NULL; 71 } 72 73 INIT_LIST_HEAD(&ff->write_entry); 74 mutex_init(&ff->readdir.lock); 75 refcount_set(&ff->count, 1); 76 RB_CLEAR_NODE(&ff->polled_node); 77 init_waitqueue_head(&ff->poll_wait); 78 79 ff->kh = atomic64_inc_return(&fm->fc->khctr); 80 81 return ff; 82 } 83 84 void fuse_file_free(struct fuse_file *ff) 85 { 86 kfree(ff->release_args); 87 mutex_destroy(&ff->readdir.lock); 88 kfree(ff); 89 } 90 91 static struct fuse_file *fuse_file_get(struct fuse_file *ff) 92 { 93 refcount_inc(&ff->count); 94 return ff; 95 } 96 97 static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args, 98 int error) 99 { 100 struct fuse_release_args *ra = container_of(args, typeof(*ra), args); 101 102 iput(ra->inode); 103 kfree(ra); 104 } 105 106 static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) 107 { 108 if (refcount_dec_and_test(&ff->count)) { 109 struct fuse_args *args = &ff->release_args->args; 110 111 if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) { 112 /* Do nothing when client does not implement 'open' */ 113 fuse_release_end(ff->fm, args, 0); 114 } else if (sync) { 115 fuse_simple_request(ff->fm, args); 116 fuse_release_end(ff->fm, args, 0); 117 } else { 118 args->end = fuse_release_end; 119 if (fuse_simple_background(ff->fm, args, 120 GFP_KERNEL | __GFP_NOFAIL)) 121 fuse_release_end(ff->fm, args, -ENOTCONN); 122 } 123 kfree(ff); 124 } 125 } 126 127 struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, 128 unsigned int open_flags, bool isdir) 129 { 130 struct fuse_conn *fc = fm->fc; 131 struct fuse_file *ff; 132 int opcode = isdir ? 
FUSE_OPENDIR : FUSE_OPEN; 133 134 ff = fuse_file_alloc(fm); 135 if (!ff) 136 return ERR_PTR(-ENOMEM); 137 138 ff->fh = 0; 139 /* Default for no-open */ 140 ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0); 141 if (isdir ? !fc->no_opendir : !fc->no_open) { 142 struct fuse_open_out outarg; 143 int err; 144 145 err = fuse_send_open(fm, nodeid, open_flags, opcode, &outarg); 146 if (!err) { 147 ff->fh = outarg.fh; 148 ff->open_flags = outarg.open_flags; 149 150 } else if (err != -ENOSYS) { 151 fuse_file_free(ff); 152 return ERR_PTR(err); 153 } else { 154 if (isdir) 155 fc->no_opendir = 1; 156 else 157 fc->no_open = 1; 158 } 159 } 160 161 if (isdir) 162 ff->open_flags &= ~FOPEN_DIRECT_IO; 163 164 ff->nodeid = nodeid; 165 166 return ff; 167 } 168 169 int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, 170 bool isdir) 171 { 172 struct fuse_file *ff = fuse_file_open(fm, nodeid, file->f_flags, isdir); 173 174 if (!IS_ERR(ff)) 175 file->private_data = ff; 176 177 return PTR_ERR_OR_ZERO(ff); 178 } 179 EXPORT_SYMBOL_GPL(fuse_do_open); 180 181 static void fuse_link_write_file(struct file *file) 182 { 183 struct inode *inode = file_inode(file); 184 struct fuse_inode *fi = get_fuse_inode(inode); 185 struct fuse_file *ff = file->private_data; 186 /* 187 * file may be written through mmap, so chain it onto the 188 * inodes's write_file list 189 */ 190 spin_lock(&fi->lock); 191 if (list_empty(&ff->write_entry)) 192 list_add(&ff->write_entry, &fi->write_files); 193 spin_unlock(&fi->lock); 194 } 195 196 void fuse_finish_open(struct inode *inode, struct file *file) 197 { 198 struct fuse_file *ff = file->private_data; 199 struct fuse_conn *fc = get_fuse_conn(inode); 200 201 if (ff->open_flags & FOPEN_STREAM) 202 stream_open(inode, file); 203 else if (ff->open_flags & FOPEN_NONSEEKABLE) 204 nonseekable_open(inode, file); 205 206 if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { 207 struct fuse_inode *fi = get_fuse_inode(inode); 208 209 spin_lock(&fi->lock); 210 fi->attr_version = atomic64_inc_return(&fc->attr_version); 211 i_size_write(inode, 0); 212 spin_unlock(&fi->lock); 213 truncate_pagecache(inode, 0); 214 fuse_invalidate_attr(inode); 215 if (fc->writeback_cache) 216 file_update_time(file); 217 } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) { 218 invalidate_inode_pages2(inode->i_mapping); 219 } 220 221 if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) 222 fuse_link_write_file(file); 223 } 224 225 int fuse_open_common(struct inode *inode, struct file *file, bool isdir) 226 { 227 struct fuse_mount *fm = get_fuse_mount(inode); 228 struct fuse_conn *fc = fm->fc; 229 int err; 230 bool is_wb_truncate = (file->f_flags & O_TRUNC) && 231 fc->atomic_o_trunc && 232 fc->writeback_cache; 233 bool dax_truncate = (file->f_flags & O_TRUNC) && 234 fc->atomic_o_trunc && FUSE_IS_DAX(inode); 235 236 if (fuse_is_bad(inode)) 237 return -EIO; 238 239 err = generic_file_open(inode, file); 240 if (err) 241 return err; 242 243 if (is_wb_truncate || dax_truncate) { 244 inode_lock(inode); 245 fuse_set_nowrite(inode); 246 } 247 248 if (dax_truncate) { 249 filemap_invalidate_lock(inode->i_mapping); 250 err = fuse_dax_break_layouts(inode, 0, 0); 251 if (err) 252 goto out; 253 } 254 255 err = fuse_do_open(fm, get_node_id(inode), file, isdir); 256 if (!err) 257 fuse_finish_open(inode, file); 258 259 out: 260 if (dax_truncate) 261 filemap_invalidate_unlock(inode->i_mapping); 262 263 if (is_wb_truncate | dax_truncate) { 264 fuse_release_nowrite(inode); 265 inode_unlock(inode); 266 } 267 268 
	return err;
}

static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
				 unsigned int flags, int opcode)
{
	struct fuse_conn *fc = ff->fm->fc;
	struct fuse_release_args *ra = ff->release_args;

	/* Inode is NULL on error path of fuse_create_open() */
	if (likely(fi)) {
		spin_lock(&fi->lock);
		list_del(&ff->write_entry);
		spin_unlock(&fi->lock);
	}
	spin_lock(&fc->lock);
	if (!RB_EMPTY_NODE(&ff->polled_node))
		rb_erase(&ff->polled_node, &fc->polled_files);
	spin_unlock(&fc->lock);

	wake_up_interruptible_all(&ff->poll_wait);

	ra->inarg.fh = ff->fh;
	ra->inarg.flags = flags;
	ra->args.in_numargs = 1;
	ra->args.in_args[0].size = sizeof(struct fuse_release_in);
	ra->args.in_args[0].value = &ra->inarg;
	ra->args.opcode = opcode;
	ra->args.nodeid = ff->nodeid;
	ra->args.force = true;
	ra->args.nocreds = true;
}

void fuse_file_release(struct inode *inode, struct fuse_file *ff,
		       unsigned int open_flags, fl_owner_t id, bool isdir)
{
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_release_args *ra = ff->release_args;
	int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;

	fuse_prepare_release(fi, ff, open_flags, opcode);

	if (ff->flock) {
		ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
		ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id);
	}
	/* Hold inode until release is finished */
	ra->inode = igrab(inode);

	/*
	 * Normally this will send the RELEASE request, however if
	 * some asynchronous READ or WRITE requests are outstanding,
	 * the sending will be delayed.
	 *
	 * Make the release synchronous if this is a fuseblk mount,
	 * synchronous RELEASE is allowed (and desirable) in this case
	 * because the server can be trusted not to screw up.
	 */
	fuse_file_put(ff, ff->fm->fc->destroy, isdir);
}

void fuse_release_common(struct file *file, bool isdir)
{
	fuse_file_release(file_inode(file), file->private_data, file->f_flags,
			  (fl_owner_t) file, isdir);
}

static int fuse_open(struct inode *inode, struct file *file)
{
	return fuse_open_common(inode, file, false);
}

static int fuse_release(struct inode *inode, struct file *file)
{
	struct fuse_conn *fc = get_fuse_conn(inode);

	/* see fuse_vma_close() for !writeback_cache case */
	if (fc->writeback_cache)
		write_inode_now(inode, 1);

	fuse_release_common(file, false);

	/* return value is ignored by VFS */
	return 0;
}

void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff,
		       unsigned int flags)
{
	WARN_ON(refcount_read(&ff->count) > 1);
	fuse_prepare_release(fi, ff, flags, FUSE_RELEASE);
	/*
	 * iput(NULL) is a no-op and since the refcount is 1 and everything's
	 * synchronous, we are fine with not doing igrab() here.
	 */
	fuse_file_put(ff, true, false);
}
EXPORT_SYMBOL_GPL(fuse_sync_release);

/*
 * Scramble the ID space with XTEA, so that the value of the files_struct
 * pointer is not exposed to userspace.
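 *
 * (The loop below is the standard 32-round XTEA encipher, keyed with the
 * 128-bit per-connection fc->scramble_key.)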
370 */ 371 u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) 372 { 373 u32 *k = fc->scramble_key; 374 u64 v = (unsigned long) id; 375 u32 v0 = v; 376 u32 v1 = v >> 32; 377 u32 sum = 0; 378 int i; 379 380 for (i = 0; i < 32; i++) { 381 v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]); 382 sum += 0x9E3779B9; 383 v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]); 384 } 385 386 return (u64) v0 + ((u64) v1 << 32); 387 } 388 389 struct fuse_writepage_args { 390 struct fuse_io_args ia; 391 struct rb_node writepages_entry; 392 struct list_head queue_entry; 393 struct fuse_writepage_args *next; 394 struct inode *inode; 395 struct fuse_sync_bucket *bucket; 396 }; 397 398 static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, 399 pgoff_t idx_from, pgoff_t idx_to) 400 { 401 struct rb_node *n; 402 403 n = fi->writepages.rb_node; 404 405 while (n) { 406 struct fuse_writepage_args *wpa; 407 pgoff_t curr_index; 408 409 wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); 410 WARN_ON(get_fuse_inode(wpa->inode) != fi); 411 curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; 412 if (idx_from >= curr_index + wpa->ia.ap.num_pages) 413 n = n->rb_right; 414 else if (idx_to < curr_index) 415 n = n->rb_left; 416 else 417 return wpa; 418 } 419 return NULL; 420 } 421 422 /* 423 * Check if any page in a range is under writeback 424 * 425 * This is currently done by walking the list of writepage requests 426 * for the inode, which can be pretty inefficient. 427 */ 428 static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, 429 pgoff_t idx_to) 430 { 431 struct fuse_inode *fi = get_fuse_inode(inode); 432 bool found; 433 434 spin_lock(&fi->lock); 435 found = fuse_find_writeback(fi, idx_from, idx_to); 436 spin_unlock(&fi->lock); 437 438 return found; 439 } 440 441 static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) 442 { 443 return fuse_range_is_writeback(inode, index, index); 444 } 445 446 /* 447 * Wait for page writeback to be completed. 448 * 449 * Since fuse doesn't rely on the VM writeback tracking, this has to 450 * use some other means. 451 */ 452 static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) 453 { 454 struct fuse_inode *fi = get_fuse_inode(inode); 455 456 wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); 457 } 458 459 /* 460 * Wait for all pending writepages on the inode to finish. 461 * 462 * This is currently done by blocking further writes with FUSE_NOWRITE 463 * and waiting for all sent writes to complete. 464 * 465 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage 466 * could conflict with truncation. 
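 *
 * (fuse_set_nowrite() blocks new writepage requests and waits for all
 * in-flight ones to finish; fuse_release_nowrite() then lifts the block and
 * flushes anything that was queued in the meantime.)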
467 */ 468 static void fuse_sync_writes(struct inode *inode) 469 { 470 fuse_set_nowrite(inode); 471 fuse_release_nowrite(inode); 472 } 473 474 static int fuse_flush(struct file *file, fl_owner_t id) 475 { 476 struct inode *inode = file_inode(file); 477 struct fuse_mount *fm = get_fuse_mount(inode); 478 struct fuse_file *ff = file->private_data; 479 struct fuse_flush_in inarg; 480 FUSE_ARGS(args); 481 int err; 482 483 if (fuse_is_bad(inode)) 484 return -EIO; 485 486 err = write_inode_now(inode, 1); 487 if (err) 488 return err; 489 490 inode_lock(inode); 491 fuse_sync_writes(inode); 492 inode_unlock(inode); 493 494 err = filemap_check_errors(file->f_mapping); 495 if (err) 496 return err; 497 498 err = 0; 499 if (fm->fc->no_flush) 500 goto inval_attr_out; 501 502 memset(&inarg, 0, sizeof(inarg)); 503 inarg.fh = ff->fh; 504 inarg.lock_owner = fuse_lock_owner_id(fm->fc, id); 505 args.opcode = FUSE_FLUSH; 506 args.nodeid = get_node_id(inode); 507 args.in_numargs = 1; 508 args.in_args[0].size = sizeof(inarg); 509 args.in_args[0].value = &inarg; 510 args.force = true; 511 512 err = fuse_simple_request(fm, &args); 513 if (err == -ENOSYS) { 514 fm->fc->no_flush = 1; 515 err = 0; 516 } 517 518 inval_attr_out: 519 /* 520 * In memory i_blocks is not maintained by fuse, if writeback cache is 521 * enabled, i_blocks from cached attr may not be accurate. 522 */ 523 if (!err && fm->fc->writeback_cache) 524 fuse_invalidate_attr(inode); 525 return err; 526 } 527 528 int fuse_fsync_common(struct file *file, loff_t start, loff_t end, 529 int datasync, int opcode) 530 { 531 struct inode *inode = file->f_mapping->host; 532 struct fuse_mount *fm = get_fuse_mount(inode); 533 struct fuse_file *ff = file->private_data; 534 FUSE_ARGS(args); 535 struct fuse_fsync_in inarg; 536 537 memset(&inarg, 0, sizeof(inarg)); 538 inarg.fh = ff->fh; 539 inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0; 540 args.opcode = opcode; 541 args.nodeid = get_node_id(inode); 542 args.in_numargs = 1; 543 args.in_args[0].size = sizeof(inarg); 544 args.in_args[0].value = &inarg; 545 return fuse_simple_request(fm, &args); 546 } 547 548 static int fuse_fsync(struct file *file, loff_t start, loff_t end, 549 int datasync) 550 { 551 struct inode *inode = file->f_mapping->host; 552 struct fuse_conn *fc = get_fuse_conn(inode); 553 int err; 554 555 if (fuse_is_bad(inode)) 556 return -EIO; 557 558 inode_lock(inode); 559 560 /* 561 * Start writeback against all dirty pages of the inode, then 562 * wait for all outstanding writes, before sending the FSYNC 563 * request. 564 */ 565 err = file_write_and_wait_range(file, start, end); 566 if (err) 567 goto out; 568 569 fuse_sync_writes(inode); 570 571 /* 572 * Due to implementation of fuse writeback 573 * file_write_and_wait_range() does not catch errors. 
574 * We have to do this directly after fuse_sync_writes() 575 */ 576 err = file_check_and_advance_wb_err(file); 577 if (err) 578 goto out; 579 580 err = sync_inode_metadata(inode, 1); 581 if (err) 582 goto out; 583 584 if (fc->no_fsync) 585 goto out; 586 587 err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNC); 588 if (err == -ENOSYS) { 589 fc->no_fsync = 1; 590 err = 0; 591 } 592 out: 593 inode_unlock(inode); 594 595 return err; 596 } 597 598 void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, 599 size_t count, int opcode) 600 { 601 struct fuse_file *ff = file->private_data; 602 struct fuse_args *args = &ia->ap.args; 603 604 ia->read.in.fh = ff->fh; 605 ia->read.in.offset = pos; 606 ia->read.in.size = count; 607 ia->read.in.flags = file->f_flags; 608 args->opcode = opcode; 609 args->nodeid = ff->nodeid; 610 args->in_numargs = 1; 611 args->in_args[0].size = sizeof(ia->read.in); 612 args->in_args[0].value = &ia->read.in; 613 args->out_argvar = true; 614 args->out_numargs = 1; 615 args->out_args[0].size = count; 616 } 617 618 static void fuse_release_user_pages(struct fuse_args_pages *ap, 619 bool should_dirty) 620 { 621 unsigned int i; 622 623 for (i = 0; i < ap->num_pages; i++) { 624 if (should_dirty) 625 set_page_dirty_lock(ap->pages[i]); 626 put_page(ap->pages[i]); 627 } 628 } 629 630 static void fuse_io_release(struct kref *kref) 631 { 632 kfree(container_of(kref, struct fuse_io_priv, refcnt)); 633 } 634 635 static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) 636 { 637 if (io->err) 638 return io->err; 639 640 if (io->bytes >= 0 && io->write) 641 return -EIO; 642 643 return io->bytes < 0 ? io->size : io->bytes; 644 } 645 646 /** 647 * In case of short read, the caller sets 'pos' to the position of 648 * actual end of fuse request in IO request. Otherwise, if bytes_requested 649 * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1. 650 * 651 * An example: 652 * User requested DIO read of 64K. It was split into two 32K fuse requests, 653 * both submitted asynchronously. The first of them was ACKed by userspace as 654 * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The 655 * second request was ACKed as short, e.g. only 1K was read, resulting in 656 * pos == 33K. 657 * 658 * Thus, when all fuse requests are completed, the minimal non-negative 'pos' 659 * will be equal to the length of the longest contiguous fragment of 660 * transferred data starting from the beginning of IO request. 661 */ 662 static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) 663 { 664 int left; 665 666 spin_lock(&io->lock); 667 if (err) 668 io->err = io->err ? 
: err; 669 else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes)) 670 io->bytes = pos; 671 672 left = --io->reqs; 673 if (!left && io->blocking) 674 complete(io->done); 675 spin_unlock(&io->lock); 676 677 if (!left && !io->blocking) { 678 ssize_t res = fuse_get_res_by_io(io); 679 680 if (res >= 0) { 681 struct inode *inode = file_inode(io->iocb->ki_filp); 682 struct fuse_conn *fc = get_fuse_conn(inode); 683 struct fuse_inode *fi = get_fuse_inode(inode); 684 685 spin_lock(&fi->lock); 686 fi->attr_version = atomic64_inc_return(&fc->attr_version); 687 spin_unlock(&fi->lock); 688 } 689 690 io->iocb->ki_complete(io->iocb, res, 0); 691 } 692 693 kref_put(&io->refcnt, fuse_io_release); 694 } 695 696 static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io, 697 unsigned int npages) 698 { 699 struct fuse_io_args *ia; 700 701 ia = kzalloc(sizeof(*ia), GFP_KERNEL); 702 if (ia) { 703 ia->io = io; 704 ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL, 705 &ia->ap.descs); 706 if (!ia->ap.pages) { 707 kfree(ia); 708 ia = NULL; 709 } 710 } 711 return ia; 712 } 713 714 static void fuse_io_free(struct fuse_io_args *ia) 715 { 716 kfree(ia->ap.pages); 717 kfree(ia); 718 } 719 720 static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args, 721 int err) 722 { 723 struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); 724 struct fuse_io_priv *io = ia->io; 725 ssize_t pos = -1; 726 727 fuse_release_user_pages(&ia->ap, io->should_dirty); 728 729 if (err) { 730 /* Nothing */ 731 } else if (io->write) { 732 if (ia->write.out.size > ia->write.in.size) { 733 err = -EIO; 734 } else if (ia->write.in.size != ia->write.out.size) { 735 pos = ia->write.in.offset - io->offset + 736 ia->write.out.size; 737 } 738 } else { 739 u32 outsize = args->out_args[0].size; 740 741 if (ia->read.in.size != outsize) 742 pos = ia->read.in.offset - io->offset + outsize; 743 } 744 745 fuse_aio_complete(io, err, pos); 746 fuse_io_free(ia); 747 } 748 749 static ssize_t fuse_async_req_send(struct fuse_mount *fm, 750 struct fuse_io_args *ia, size_t num_bytes) 751 { 752 ssize_t err; 753 struct fuse_io_priv *io = ia->io; 754 755 spin_lock(&io->lock); 756 kref_get(&io->refcnt); 757 io->size += num_bytes; 758 io->reqs++; 759 spin_unlock(&io->lock); 760 761 ia->ap.args.end = fuse_aio_complete_req; 762 ia->ap.args.may_block = io->should_dirty; 763 err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL); 764 if (err) 765 fuse_aio_complete_req(fm, &ia->ap.args, err); 766 767 return num_bytes; 768 } 769 770 static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count, 771 fl_owner_t owner) 772 { 773 struct file *file = ia->io->iocb->ki_filp; 774 struct fuse_file *ff = file->private_data; 775 struct fuse_mount *fm = ff->fm; 776 777 fuse_read_args_fill(ia, file, pos, count, FUSE_READ); 778 if (owner != NULL) { 779 ia->read.in.read_flags |= FUSE_READ_LOCKOWNER; 780 ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner); 781 } 782 783 if (ia->io->async) 784 return fuse_async_req_send(fm, ia, count); 785 786 return fuse_simple_request(fm, &ia->ap.args); 787 } 788 789 static void fuse_read_update_size(struct inode *inode, loff_t size, 790 u64 attr_ver) 791 { 792 struct fuse_conn *fc = get_fuse_conn(inode); 793 struct fuse_inode *fi = get_fuse_inode(inode); 794 795 spin_lock(&fi->lock); 796 if (attr_ver == fi->attr_version && size < inode->i_size && 797 !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { 798 fi->attr_version = atomic64_inc_return(&fc->attr_version); 799 i_size_write(inode, size); 
800 } 801 spin_unlock(&fi->lock); 802 } 803 804 static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, 805 struct fuse_args_pages *ap) 806 { 807 struct fuse_conn *fc = get_fuse_conn(inode); 808 809 /* 810 * If writeback_cache is enabled, a short read means there's a hole in 811 * the file. Some data after the hole is in page cache, but has not 812 * reached the client fs yet. So the hole is not present there. 813 */ 814 if (!fc->writeback_cache) { 815 loff_t pos = page_offset(ap->pages[0]) + num_read; 816 fuse_read_update_size(inode, pos, attr_ver); 817 } 818 } 819 820 static int fuse_do_readpage(struct file *file, struct page *page) 821 { 822 struct inode *inode = page->mapping->host; 823 struct fuse_mount *fm = get_fuse_mount(inode); 824 loff_t pos = page_offset(page); 825 struct fuse_page_desc desc = { .length = PAGE_SIZE }; 826 struct fuse_io_args ia = { 827 .ap.args.page_zeroing = true, 828 .ap.args.out_pages = true, 829 .ap.num_pages = 1, 830 .ap.pages = &page, 831 .ap.descs = &desc, 832 }; 833 ssize_t res; 834 u64 attr_ver; 835 836 /* 837 * Page writeback can extend beyond the lifetime of the 838 * page-cache page, so make sure we read a properly synced 839 * page. 840 */ 841 fuse_wait_on_page_writeback(inode, page->index); 842 843 attr_ver = fuse_get_attr_version(fm->fc); 844 845 /* Don't overflow end offset */ 846 if (pos + (desc.length - 1) == LLONG_MAX) 847 desc.length--; 848 849 fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ); 850 res = fuse_simple_request(fm, &ia.ap.args); 851 if (res < 0) 852 return res; 853 /* 854 * Short read means EOF. If file size is larger, truncate it 855 */ 856 if (res < desc.length) 857 fuse_short_read(inode, attr_ver, res, &ia.ap); 858 859 SetPageUptodate(page); 860 861 return 0; 862 } 863 864 static int fuse_readpage(struct file *file, struct page *page) 865 { 866 struct inode *inode = page->mapping->host; 867 int err; 868 869 err = -EIO; 870 if (fuse_is_bad(inode)) 871 goto out; 872 873 err = fuse_do_readpage(file, page); 874 fuse_invalidate_atime(inode); 875 out: 876 unlock_page(page); 877 return err; 878 } 879 880 static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, 881 int err) 882 { 883 int i; 884 struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); 885 struct fuse_args_pages *ap = &ia->ap; 886 size_t count = ia->read.in.size; 887 size_t num_read = args->out_args[0].size; 888 struct address_space *mapping = NULL; 889 890 for (i = 0; mapping == NULL && i < ap->num_pages; i++) 891 mapping = ap->pages[i]->mapping; 892 893 if (mapping) { 894 struct inode *inode = mapping->host; 895 896 /* 897 * Short read means EOF. 
If file size is larger, truncate it 898 */ 899 if (!err && num_read < count) 900 fuse_short_read(inode, ia->read.attr_ver, num_read, ap); 901 902 fuse_invalidate_atime(inode); 903 } 904 905 for (i = 0; i < ap->num_pages; i++) { 906 struct page *page = ap->pages[i]; 907 908 if (!err) 909 SetPageUptodate(page); 910 else 911 SetPageError(page); 912 unlock_page(page); 913 put_page(page); 914 } 915 if (ia->ff) 916 fuse_file_put(ia->ff, false, false); 917 918 fuse_io_free(ia); 919 } 920 921 static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) 922 { 923 struct fuse_file *ff = file->private_data; 924 struct fuse_mount *fm = ff->fm; 925 struct fuse_args_pages *ap = &ia->ap; 926 loff_t pos = page_offset(ap->pages[0]); 927 size_t count = ap->num_pages << PAGE_SHIFT; 928 ssize_t res; 929 int err; 930 931 ap->args.out_pages = true; 932 ap->args.page_zeroing = true; 933 ap->args.page_replace = true; 934 935 /* Don't overflow end offset */ 936 if (pos + (count - 1) == LLONG_MAX) { 937 count--; 938 ap->descs[ap->num_pages - 1].length--; 939 } 940 WARN_ON((loff_t) (pos + count) < 0); 941 942 fuse_read_args_fill(ia, file, pos, count, FUSE_READ); 943 ia->read.attr_ver = fuse_get_attr_version(fm->fc); 944 if (fm->fc->async_read) { 945 ia->ff = fuse_file_get(ff); 946 ap->args.end = fuse_readpages_end; 947 err = fuse_simple_background(fm, &ap->args, GFP_KERNEL); 948 if (!err) 949 return; 950 } else { 951 res = fuse_simple_request(fm, &ap->args); 952 err = res < 0 ? res : 0; 953 } 954 fuse_readpages_end(fm, &ap->args, err); 955 } 956 957 static void fuse_readahead(struct readahead_control *rac) 958 { 959 struct inode *inode = rac->mapping->host; 960 struct fuse_conn *fc = get_fuse_conn(inode); 961 unsigned int i, max_pages, nr_pages = 0; 962 963 if (fuse_is_bad(inode)) 964 return; 965 966 max_pages = min_t(unsigned int, fc->max_pages, 967 fc->max_read / PAGE_SIZE); 968 969 for (;;) { 970 struct fuse_io_args *ia; 971 struct fuse_args_pages *ap; 972 973 nr_pages = readahead_count(rac) - nr_pages; 974 if (nr_pages > max_pages) 975 nr_pages = max_pages; 976 if (nr_pages == 0) 977 break; 978 ia = fuse_io_alloc(NULL, nr_pages); 979 if (!ia) 980 return; 981 ap = &ia->ap; 982 nr_pages = __readahead_batch(rac, ap->pages, nr_pages); 983 for (i = 0; i < nr_pages; i++) { 984 fuse_wait_on_page_writeback(inode, 985 readahead_index(rac) + i); 986 ap->descs[i].length = PAGE_SIZE; 987 } 988 ap->num_pages = nr_pages; 989 fuse_send_readpages(ia, rac->file); 990 } 991 } 992 993 static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) 994 { 995 struct inode *inode = iocb->ki_filp->f_mapping->host; 996 struct fuse_conn *fc = get_fuse_conn(inode); 997 998 /* 999 * In auto invalidate mode, always update attributes on read. 1000 * Otherwise, only update if we attempt to read past EOF (to ensure 1001 * i_size is up to date). 
1002 */ 1003 if (fc->auto_inval_data || 1004 (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) { 1005 int err; 1006 err = fuse_update_attributes(inode, iocb->ki_filp); 1007 if (err) 1008 return err; 1009 } 1010 1011 return generic_file_read_iter(iocb, to); 1012 } 1013 1014 static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff, 1015 loff_t pos, size_t count) 1016 { 1017 struct fuse_args *args = &ia->ap.args; 1018 1019 ia->write.in.fh = ff->fh; 1020 ia->write.in.offset = pos; 1021 ia->write.in.size = count; 1022 args->opcode = FUSE_WRITE; 1023 args->nodeid = ff->nodeid; 1024 args->in_numargs = 2; 1025 if (ff->fm->fc->minor < 9) 1026 args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; 1027 else 1028 args->in_args[0].size = sizeof(ia->write.in); 1029 args->in_args[0].value = &ia->write.in; 1030 args->in_args[1].size = count; 1031 args->out_numargs = 1; 1032 args->out_args[0].size = sizeof(ia->write.out); 1033 args->out_args[0].value = &ia->write.out; 1034 } 1035 1036 static unsigned int fuse_write_flags(struct kiocb *iocb) 1037 { 1038 unsigned int flags = iocb->ki_filp->f_flags; 1039 1040 if (iocb->ki_flags & IOCB_DSYNC) 1041 flags |= O_DSYNC; 1042 if (iocb->ki_flags & IOCB_SYNC) 1043 flags |= O_SYNC; 1044 1045 return flags; 1046 } 1047 1048 static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, 1049 size_t count, fl_owner_t owner) 1050 { 1051 struct kiocb *iocb = ia->io->iocb; 1052 struct file *file = iocb->ki_filp; 1053 struct fuse_file *ff = file->private_data; 1054 struct fuse_mount *fm = ff->fm; 1055 struct fuse_write_in *inarg = &ia->write.in; 1056 ssize_t err; 1057 1058 fuse_write_args_fill(ia, ff, pos, count); 1059 inarg->flags = fuse_write_flags(iocb); 1060 if (owner != NULL) { 1061 inarg->write_flags |= FUSE_WRITE_LOCKOWNER; 1062 inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner); 1063 } 1064 1065 if (ia->io->async) 1066 return fuse_async_req_send(fm, ia, count); 1067 1068 err = fuse_simple_request(fm, &ia->ap.args); 1069 if (!err && ia->write.out.size > count) 1070 err = -EIO; 1071 1072 return err ?: ia->write.out.size; 1073 } 1074 1075 bool fuse_write_update_size(struct inode *inode, loff_t pos) 1076 { 1077 struct fuse_conn *fc = get_fuse_conn(inode); 1078 struct fuse_inode *fi = get_fuse_inode(inode); 1079 bool ret = false; 1080 1081 spin_lock(&fi->lock); 1082 fi->attr_version = atomic64_inc_return(&fc->attr_version); 1083 if (pos > inode->i_size) { 1084 i_size_write(inode, pos); 1085 ret = true; 1086 } 1087 spin_unlock(&fi->lock); 1088 1089 return ret; 1090 } 1091 1092 static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, 1093 struct kiocb *iocb, struct inode *inode, 1094 loff_t pos, size_t count) 1095 { 1096 struct fuse_args_pages *ap = &ia->ap; 1097 struct file *file = iocb->ki_filp; 1098 struct fuse_file *ff = file->private_data; 1099 struct fuse_mount *fm = ff->fm; 1100 unsigned int offset, i; 1101 bool short_write; 1102 int err; 1103 1104 for (i = 0; i < ap->num_pages; i++) 1105 fuse_wait_on_page_writeback(inode, ap->pages[i]->index); 1106 1107 fuse_write_args_fill(ia, ff, pos, count); 1108 ia->write.in.flags = fuse_write_flags(iocb); 1109 if (fm->fc->handle_killpriv_v2 && !capable(CAP_FSETID)) 1110 ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID; 1111 1112 err = fuse_simple_request(fm, &ap->args); 1113 if (!err && ia->write.out.size > count) 1114 err = -EIO; 1115 1116 short_write = ia->write.out.size < count; 1117 offset = ap->descs[0].offset; 1118 count = ia->write.out.size; 1119 for (i = 0; i < ap->num_pages; 
i++) { 1120 struct page *page = ap->pages[i]; 1121 1122 if (err) { 1123 ClearPageUptodate(page); 1124 } else { 1125 if (count >= PAGE_SIZE - offset) 1126 count -= PAGE_SIZE - offset; 1127 else { 1128 if (short_write) 1129 ClearPageUptodate(page); 1130 count = 0; 1131 } 1132 offset = 0; 1133 } 1134 if (ia->write.page_locked && (i == ap->num_pages - 1)) 1135 unlock_page(page); 1136 put_page(page); 1137 } 1138 1139 return err; 1140 } 1141 1142 static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, 1143 struct address_space *mapping, 1144 struct iov_iter *ii, loff_t pos, 1145 unsigned int max_pages) 1146 { 1147 struct fuse_args_pages *ap = &ia->ap; 1148 struct fuse_conn *fc = get_fuse_conn(mapping->host); 1149 unsigned offset = pos & (PAGE_SIZE - 1); 1150 size_t count = 0; 1151 int err; 1152 1153 ap->args.in_pages = true; 1154 ap->descs[0].offset = offset; 1155 1156 do { 1157 size_t tmp; 1158 struct page *page; 1159 pgoff_t index = pos >> PAGE_SHIFT; 1160 size_t bytes = min_t(size_t, PAGE_SIZE - offset, 1161 iov_iter_count(ii)); 1162 1163 bytes = min_t(size_t, bytes, fc->max_write - count); 1164 1165 again: 1166 err = -EFAULT; 1167 if (iov_iter_fault_in_readable(ii, bytes)) 1168 break; 1169 1170 err = -ENOMEM; 1171 page = grab_cache_page_write_begin(mapping, index, 0); 1172 if (!page) 1173 break; 1174 1175 if (mapping_writably_mapped(mapping)) 1176 flush_dcache_page(page); 1177 1178 tmp = copy_page_from_iter_atomic(page, offset, bytes, ii); 1179 flush_dcache_page(page); 1180 1181 if (!tmp) { 1182 unlock_page(page); 1183 put_page(page); 1184 goto again; 1185 } 1186 1187 err = 0; 1188 ap->pages[ap->num_pages] = page; 1189 ap->descs[ap->num_pages].length = tmp; 1190 ap->num_pages++; 1191 1192 count += tmp; 1193 pos += tmp; 1194 offset += tmp; 1195 if (offset == PAGE_SIZE) 1196 offset = 0; 1197 1198 /* If we copied full page, mark it uptodate */ 1199 if (tmp == PAGE_SIZE) 1200 SetPageUptodate(page); 1201 1202 if (PageUptodate(page)) { 1203 unlock_page(page); 1204 } else { 1205 ia->write.page_locked = true; 1206 break; 1207 } 1208 if (!fc->big_writes) 1209 break; 1210 } while (iov_iter_count(ii) && count < fc->max_write && 1211 ap->num_pages < max_pages && offset == 0); 1212 1213 return count > 0 ? 
count : err; 1214 } 1215 1216 static inline unsigned int fuse_wr_pages(loff_t pos, size_t len, 1217 unsigned int max_pages) 1218 { 1219 return min_t(unsigned int, 1220 ((pos + len - 1) >> PAGE_SHIFT) - 1221 (pos >> PAGE_SHIFT) + 1, 1222 max_pages); 1223 } 1224 1225 static ssize_t fuse_perform_write(struct kiocb *iocb, 1226 struct address_space *mapping, 1227 struct iov_iter *ii, loff_t pos) 1228 { 1229 struct inode *inode = mapping->host; 1230 struct fuse_conn *fc = get_fuse_conn(inode); 1231 struct fuse_inode *fi = get_fuse_inode(inode); 1232 int err = 0; 1233 ssize_t res = 0; 1234 1235 if (inode->i_size < pos + iov_iter_count(ii)) 1236 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); 1237 1238 do { 1239 ssize_t count; 1240 struct fuse_io_args ia = {}; 1241 struct fuse_args_pages *ap = &ia.ap; 1242 unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii), 1243 fc->max_pages); 1244 1245 ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs); 1246 if (!ap->pages) { 1247 err = -ENOMEM; 1248 break; 1249 } 1250 1251 count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages); 1252 if (count <= 0) { 1253 err = count; 1254 } else { 1255 err = fuse_send_write_pages(&ia, iocb, inode, 1256 pos, count); 1257 if (!err) { 1258 size_t num_written = ia.write.out.size; 1259 1260 res += num_written; 1261 pos += num_written; 1262 1263 /* break out of the loop on short write */ 1264 if (num_written != count) 1265 err = -EIO; 1266 } 1267 } 1268 kfree(ap->pages); 1269 } while (!err && iov_iter_count(ii)); 1270 1271 if (res > 0) 1272 fuse_write_update_size(inode, pos); 1273 1274 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); 1275 fuse_invalidate_attr(inode); 1276 1277 return res > 0 ? res : err; 1278 } 1279 1280 static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) 1281 { 1282 struct file *file = iocb->ki_filp; 1283 struct address_space *mapping = file->f_mapping; 1284 ssize_t written = 0; 1285 ssize_t written_buffered = 0; 1286 struct inode *inode = mapping->host; 1287 ssize_t err; 1288 struct fuse_conn *fc = get_fuse_conn(inode); 1289 loff_t endbyte = 0; 1290 1291 if (fc->writeback_cache) { 1292 /* Update size (EOF optimization) and mode (SUID clearing) */ 1293 err = fuse_update_attributes(mapping->host, file); 1294 if (err) 1295 return err; 1296 1297 if (fc->handle_killpriv_v2 && 1298 should_remove_suid(file_dentry(file))) { 1299 goto writethrough; 1300 } 1301 1302 return generic_file_write_iter(iocb, from); 1303 } 1304 1305 writethrough: 1306 inode_lock(inode); 1307 1308 /* We can write back this queue in page reclaim */ 1309 current->backing_dev_info = inode_to_bdi(inode); 1310 1311 err = generic_write_checks(iocb, from); 1312 if (err <= 0) 1313 goto out; 1314 1315 err = file_remove_privs(file); 1316 if (err) 1317 goto out; 1318 1319 err = file_update_time(file); 1320 if (err) 1321 goto out; 1322 1323 if (iocb->ki_flags & IOCB_DIRECT) { 1324 loff_t pos = iocb->ki_pos; 1325 written = generic_file_direct_write(iocb, from); 1326 if (written < 0 || !iov_iter_count(from)) 1327 goto out; 1328 1329 pos += written; 1330 1331 written_buffered = fuse_perform_write(iocb, mapping, from, pos); 1332 if (written_buffered < 0) { 1333 err = written_buffered; 1334 goto out; 1335 } 1336 endbyte = pos + written_buffered - 1; 1337 1338 err = filemap_write_and_wait_range(file->f_mapping, pos, 1339 endbyte); 1340 if (err) 1341 goto out; 1342 1343 invalidate_mapping_pages(file->f_mapping, 1344 pos >> PAGE_SHIFT, 1345 endbyte >> PAGE_SHIFT); 1346 1347 written += written_buffered; 1348 
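		/* Advance the file position past the buffered tail as well */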
iocb->ki_pos = pos + written_buffered; 1349 } else { 1350 written = fuse_perform_write(iocb, mapping, from, iocb->ki_pos); 1351 if (written >= 0) 1352 iocb->ki_pos += written; 1353 } 1354 out: 1355 current->backing_dev_info = NULL; 1356 inode_unlock(inode); 1357 if (written > 0) 1358 written = generic_write_sync(iocb, written); 1359 1360 return written ? written : err; 1361 } 1362 1363 static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) 1364 { 1365 return (unsigned long)ii->iov->iov_base + ii->iov_offset; 1366 } 1367 1368 static inline size_t fuse_get_frag_size(const struct iov_iter *ii, 1369 size_t max_size) 1370 { 1371 return min(iov_iter_single_seg_count(ii), max_size); 1372 } 1373 1374 static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, 1375 size_t *nbytesp, int write, 1376 unsigned int max_pages) 1377 { 1378 size_t nbytes = 0; /* # bytes already packed in req */ 1379 ssize_t ret = 0; 1380 1381 /* Special case for kernel I/O: can copy directly into the buffer */ 1382 if (iov_iter_is_kvec(ii)) { 1383 unsigned long user_addr = fuse_get_user_addr(ii); 1384 size_t frag_size = fuse_get_frag_size(ii, *nbytesp); 1385 1386 if (write) 1387 ap->args.in_args[1].value = (void *) user_addr; 1388 else 1389 ap->args.out_args[0].value = (void *) user_addr; 1390 1391 iov_iter_advance(ii, frag_size); 1392 *nbytesp = frag_size; 1393 return 0; 1394 } 1395 1396 while (nbytes < *nbytesp && ap->num_pages < max_pages) { 1397 unsigned npages; 1398 size_t start; 1399 ret = iov_iter_get_pages(ii, &ap->pages[ap->num_pages], 1400 *nbytesp - nbytes, 1401 max_pages - ap->num_pages, 1402 &start); 1403 if (ret < 0) 1404 break; 1405 1406 iov_iter_advance(ii, ret); 1407 nbytes += ret; 1408 1409 ret += start; 1410 npages = DIV_ROUND_UP(ret, PAGE_SIZE); 1411 1412 ap->descs[ap->num_pages].offset = start; 1413 fuse_page_descs_length_init(ap->descs, ap->num_pages, npages); 1414 1415 ap->num_pages += npages; 1416 ap->descs[ap->num_pages - 1].length -= 1417 (PAGE_SIZE - ret) & (PAGE_SIZE - 1); 1418 } 1419 1420 if (write) 1421 ap->args.in_pages = true; 1422 else 1423 ap->args.out_pages = true; 1424 1425 *nbytesp = nbytes; 1426 1427 return ret < 0 ? ret : 0; 1428 } 1429 1430 ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, 1431 loff_t *ppos, int flags) 1432 { 1433 int write = flags & FUSE_DIO_WRITE; 1434 int cuse = flags & FUSE_DIO_CUSE; 1435 struct file *file = io->iocb->ki_filp; 1436 struct inode *inode = file->f_mapping->host; 1437 struct fuse_file *ff = file->private_data; 1438 struct fuse_conn *fc = ff->fm->fc; 1439 size_t nmax = write ? 
fc->max_write : fc->max_read; 1440 loff_t pos = *ppos; 1441 size_t count = iov_iter_count(iter); 1442 pgoff_t idx_from = pos >> PAGE_SHIFT; 1443 pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT; 1444 ssize_t res = 0; 1445 int err = 0; 1446 struct fuse_io_args *ia; 1447 unsigned int max_pages; 1448 1449 max_pages = iov_iter_npages(iter, fc->max_pages); 1450 ia = fuse_io_alloc(io, max_pages); 1451 if (!ia) 1452 return -ENOMEM; 1453 1454 ia->io = io; 1455 if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { 1456 if (!write) 1457 inode_lock(inode); 1458 fuse_sync_writes(inode); 1459 if (!write) 1460 inode_unlock(inode); 1461 } 1462 1463 io->should_dirty = !write && iter_is_iovec(iter); 1464 while (count) { 1465 ssize_t nres; 1466 fl_owner_t owner = current->files; 1467 size_t nbytes = min(count, nmax); 1468 1469 err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write, 1470 max_pages); 1471 if (err && !nbytes) 1472 break; 1473 1474 if (write) { 1475 if (!capable(CAP_FSETID)) 1476 ia->write.in.write_flags |= FUSE_WRITE_KILL_SUIDGID; 1477 1478 nres = fuse_send_write(ia, pos, nbytes, owner); 1479 } else { 1480 nres = fuse_send_read(ia, pos, nbytes, owner); 1481 } 1482 1483 if (!io->async || nres < 0) { 1484 fuse_release_user_pages(&ia->ap, io->should_dirty); 1485 fuse_io_free(ia); 1486 } 1487 ia = NULL; 1488 if (nres < 0) { 1489 iov_iter_revert(iter, nbytes); 1490 err = nres; 1491 break; 1492 } 1493 WARN_ON(nres > nbytes); 1494 1495 count -= nres; 1496 res += nres; 1497 pos += nres; 1498 if (nres != nbytes) { 1499 iov_iter_revert(iter, nbytes - nres); 1500 break; 1501 } 1502 if (count) { 1503 max_pages = iov_iter_npages(iter, fc->max_pages); 1504 ia = fuse_io_alloc(io, max_pages); 1505 if (!ia) 1506 break; 1507 } 1508 } 1509 if (ia) 1510 fuse_io_free(ia); 1511 if (res > 0) 1512 *ppos = pos; 1513 1514 return res > 0 ? 
res : err; 1515 } 1516 EXPORT_SYMBOL_GPL(fuse_direct_io); 1517 1518 static ssize_t __fuse_direct_read(struct fuse_io_priv *io, 1519 struct iov_iter *iter, 1520 loff_t *ppos) 1521 { 1522 ssize_t res; 1523 struct inode *inode = file_inode(io->iocb->ki_filp); 1524 1525 res = fuse_direct_io(io, iter, ppos, 0); 1526 1527 fuse_invalidate_atime(inode); 1528 1529 return res; 1530 } 1531 1532 static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter); 1533 1534 static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) 1535 { 1536 ssize_t res; 1537 1538 if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { 1539 res = fuse_direct_IO(iocb, to); 1540 } else { 1541 struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); 1542 1543 res = __fuse_direct_read(&io, to, &iocb->ki_pos); 1544 } 1545 1546 return res; 1547 } 1548 1549 static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) 1550 { 1551 struct inode *inode = file_inode(iocb->ki_filp); 1552 struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); 1553 ssize_t res; 1554 1555 /* Don't allow parallel writes to the same file */ 1556 inode_lock(inode); 1557 res = generic_write_checks(iocb, from); 1558 if (res > 0) { 1559 if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { 1560 res = fuse_direct_IO(iocb, from); 1561 } else { 1562 res = fuse_direct_io(&io, from, &iocb->ki_pos, 1563 FUSE_DIO_WRITE); 1564 } 1565 } 1566 fuse_invalidate_attr(inode); 1567 if (res > 0) 1568 fuse_write_update_size(inode, iocb->ki_pos); 1569 inode_unlock(inode); 1570 1571 return res; 1572 } 1573 1574 static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 1575 { 1576 struct file *file = iocb->ki_filp; 1577 struct fuse_file *ff = file->private_data; 1578 struct inode *inode = file_inode(file); 1579 1580 if (fuse_is_bad(inode)) 1581 return -EIO; 1582 1583 if (FUSE_IS_DAX(inode)) 1584 return fuse_dax_read_iter(iocb, to); 1585 1586 if (!(ff->open_flags & FOPEN_DIRECT_IO)) 1587 return fuse_cache_read_iter(iocb, to); 1588 else 1589 return fuse_direct_read_iter(iocb, to); 1590 } 1591 1592 static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 1593 { 1594 struct file *file = iocb->ki_filp; 1595 struct fuse_file *ff = file->private_data; 1596 struct inode *inode = file_inode(file); 1597 1598 if (fuse_is_bad(inode)) 1599 return -EIO; 1600 1601 if (FUSE_IS_DAX(inode)) 1602 return fuse_dax_write_iter(iocb, from); 1603 1604 if (!(ff->open_flags & FOPEN_DIRECT_IO)) 1605 return fuse_cache_write_iter(iocb, from); 1606 else 1607 return fuse_direct_write_iter(iocb, from); 1608 } 1609 1610 static void fuse_writepage_free(struct fuse_writepage_args *wpa) 1611 { 1612 struct fuse_args_pages *ap = &wpa->ia.ap; 1613 int i; 1614 1615 if (wpa->bucket) 1616 fuse_sync_bucket_dec(wpa->bucket); 1617 1618 for (i = 0; i < ap->num_pages; i++) 1619 __free_page(ap->pages[i]); 1620 1621 if (wpa->ia.ff) 1622 fuse_file_put(wpa->ia.ff, false, false); 1623 1624 kfree(ap->pages); 1625 kfree(wpa); 1626 } 1627 1628 static void fuse_writepage_finish(struct fuse_mount *fm, 1629 struct fuse_writepage_args *wpa) 1630 { 1631 struct fuse_args_pages *ap = &wpa->ia.ap; 1632 struct inode *inode = wpa->inode; 1633 struct fuse_inode *fi = get_fuse_inode(inode); 1634 struct backing_dev_info *bdi = inode_to_bdi(inode); 1635 int i; 1636 1637 for (i = 0; i < ap->num_pages; i++) { 1638 dec_wb_stat(&bdi->wb, WB_WRITEBACK); 1639 dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP); 1640 wb_writeout_inc(&bdi->wb); 1641 } 1642 
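	/* Wake up waiters in fuse_wait_on_page_writeback() and fuse_set_nowrite() */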
wake_up(&fi->page_waitq); 1643 } 1644 1645 /* Called under fi->lock, may release and reacquire it */ 1646 static void fuse_send_writepage(struct fuse_mount *fm, 1647 struct fuse_writepage_args *wpa, loff_t size) 1648 __releases(fi->lock) 1649 __acquires(fi->lock) 1650 { 1651 struct fuse_writepage_args *aux, *next; 1652 struct fuse_inode *fi = get_fuse_inode(wpa->inode); 1653 struct fuse_write_in *inarg = &wpa->ia.write.in; 1654 struct fuse_args *args = &wpa->ia.ap.args; 1655 __u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE; 1656 int err; 1657 1658 fi->writectr++; 1659 if (inarg->offset + data_size <= size) { 1660 inarg->size = data_size; 1661 } else if (inarg->offset < size) { 1662 inarg->size = size - inarg->offset; 1663 } else { 1664 /* Got truncated off completely */ 1665 goto out_free; 1666 } 1667 1668 args->in_args[1].size = inarg->size; 1669 args->force = true; 1670 args->nocreds = true; 1671 1672 err = fuse_simple_background(fm, args, GFP_ATOMIC); 1673 if (err == -ENOMEM) { 1674 spin_unlock(&fi->lock); 1675 err = fuse_simple_background(fm, args, GFP_NOFS | __GFP_NOFAIL); 1676 spin_lock(&fi->lock); 1677 } 1678 1679 /* Fails on broken connection only */ 1680 if (unlikely(err)) 1681 goto out_free; 1682 1683 return; 1684 1685 out_free: 1686 fi->writectr--; 1687 rb_erase(&wpa->writepages_entry, &fi->writepages); 1688 fuse_writepage_finish(fm, wpa); 1689 spin_unlock(&fi->lock); 1690 1691 /* After fuse_writepage_finish() aux request list is private */ 1692 for (aux = wpa->next; aux; aux = next) { 1693 next = aux->next; 1694 aux->next = NULL; 1695 fuse_writepage_free(aux); 1696 } 1697 1698 fuse_writepage_free(wpa); 1699 spin_lock(&fi->lock); 1700 } 1701 1702 /* 1703 * If fi->writectr is positive (no truncate or fsync going on) send 1704 * all queued writepage requests. 
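 * (fuse_set_nowrite() drives writectr negative while truncate or fsync is in
 * progress, so new requests park on fi->queued_writes until
 * fuse_release_nowrite() calls back here.)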
1705 * 1706 * Called with fi->lock 1707 */ 1708 void fuse_flush_writepages(struct inode *inode) 1709 __releases(fi->lock) 1710 __acquires(fi->lock) 1711 { 1712 struct fuse_mount *fm = get_fuse_mount(inode); 1713 struct fuse_inode *fi = get_fuse_inode(inode); 1714 loff_t crop = i_size_read(inode); 1715 struct fuse_writepage_args *wpa; 1716 1717 while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { 1718 wpa = list_entry(fi->queued_writes.next, 1719 struct fuse_writepage_args, queue_entry); 1720 list_del_init(&wpa->queue_entry); 1721 fuse_send_writepage(fm, wpa, crop); 1722 } 1723 } 1724 1725 static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, 1726 struct fuse_writepage_args *wpa) 1727 { 1728 pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; 1729 pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1; 1730 struct rb_node **p = &root->rb_node; 1731 struct rb_node *parent = NULL; 1732 1733 WARN_ON(!wpa->ia.ap.num_pages); 1734 while (*p) { 1735 struct fuse_writepage_args *curr; 1736 pgoff_t curr_index; 1737 1738 parent = *p; 1739 curr = rb_entry(parent, struct fuse_writepage_args, 1740 writepages_entry); 1741 WARN_ON(curr->inode != wpa->inode); 1742 curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; 1743 1744 if (idx_from >= curr_index + curr->ia.ap.num_pages) 1745 p = &(*p)->rb_right; 1746 else if (idx_to < curr_index) 1747 p = &(*p)->rb_left; 1748 else 1749 return curr; 1750 } 1751 1752 rb_link_node(&wpa->writepages_entry, parent, p); 1753 rb_insert_color(&wpa->writepages_entry, root); 1754 return NULL; 1755 } 1756 1757 static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) 1758 { 1759 WARN_ON(fuse_insert_writeback(root, wpa)); 1760 } 1761 1762 static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, 1763 int error) 1764 { 1765 struct fuse_writepage_args *wpa = 1766 container_of(args, typeof(*wpa), ia.ap.args); 1767 struct inode *inode = wpa->inode; 1768 struct fuse_inode *fi = get_fuse_inode(inode); 1769 struct fuse_conn *fc = get_fuse_conn(inode); 1770 1771 mapping_set_error(inode->i_mapping, error); 1772 /* 1773 * A writeback finished and this might have updated mtime/ctime on 1774 * server making local mtime/ctime stale. Hence invalidate attrs. 1775 * Do this only if writeback_cache is not enabled. If writeback_cache 1776 * is enabled, we trust local ctime/mtime. 1777 */ 1778 if (!fc->writeback_cache) 1779 fuse_invalidate_attr(inode); 1780 spin_lock(&fi->lock); 1781 rb_erase(&wpa->writepages_entry, &fi->writepages); 1782 while (wpa->next) { 1783 struct fuse_mount *fm = get_fuse_mount(inode); 1784 struct fuse_write_in *inarg = &wpa->ia.write.in; 1785 struct fuse_writepage_args *next = wpa->next; 1786 1787 wpa->next = next->next; 1788 next->next = NULL; 1789 next->ia.ff = fuse_file_get(wpa->ia.ff); 1790 tree_insert(&fi->writepages, next); 1791 1792 /* 1793 * Skip fuse_flush_writepages() to make it easy to crop requests 1794 * based on primary request size. 1795 * 1796 * 1st case (trivial): there are no concurrent activities using 1797 * fuse_set/release_nowrite. Then we're on safe side because 1798 * fuse_flush_writepages() would call fuse_send_writepage() 1799 * anyway. 1800 * 1801 * 2nd case: someone called fuse_set_nowrite and it is waiting 1802 * now for completion of all in-flight requests. This happens 1803 * rarely and no more than once per page, so this should be 1804 * okay. 1805 * 1806 * 3rd case: someone (e.g. 
fuse_do_setattr()) is in the middle 1807 * of fuse_set_nowrite..fuse_release_nowrite section. The fact 1808 * that fuse_set_nowrite returned implies that all in-flight 1809 * requests were completed along with all of their secondary 1810 * requests. Further primary requests are blocked by negative 1811 * writectr. Hence there cannot be any in-flight requests and 1812 * no invocations of fuse_writepage_end() while we're in 1813 * fuse_set_nowrite..fuse_release_nowrite section. 1814 */ 1815 fuse_send_writepage(fm, next, inarg->offset + inarg->size); 1816 } 1817 fi->writectr--; 1818 fuse_writepage_finish(fm, wpa); 1819 spin_unlock(&fi->lock); 1820 fuse_writepage_free(wpa); 1821 } 1822 1823 static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi) 1824 { 1825 struct fuse_file *ff = NULL; 1826 1827 spin_lock(&fi->lock); 1828 if (!list_empty(&fi->write_files)) { 1829 ff = list_entry(fi->write_files.next, struct fuse_file, 1830 write_entry); 1831 fuse_file_get(ff); 1832 } 1833 spin_unlock(&fi->lock); 1834 1835 return ff; 1836 } 1837 1838 static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi) 1839 { 1840 struct fuse_file *ff = __fuse_write_file_get(fi); 1841 WARN_ON(!ff); 1842 return ff; 1843 } 1844 1845 int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) 1846 { 1847 struct fuse_inode *fi = get_fuse_inode(inode); 1848 struct fuse_file *ff; 1849 int err; 1850 1851 ff = __fuse_write_file_get(fi); 1852 err = fuse_flush_times(inode, ff); 1853 if (ff) 1854 fuse_file_put(ff, false, false); 1855 1856 return err; 1857 } 1858 1859 static struct fuse_writepage_args *fuse_writepage_args_alloc(void) 1860 { 1861 struct fuse_writepage_args *wpa; 1862 struct fuse_args_pages *ap; 1863 1864 wpa = kzalloc(sizeof(*wpa), GFP_NOFS); 1865 if (wpa) { 1866 ap = &wpa->ia.ap; 1867 ap->num_pages = 0; 1868 ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs); 1869 if (!ap->pages) { 1870 kfree(wpa); 1871 wpa = NULL; 1872 } 1873 } 1874 return wpa; 1875 1876 } 1877 1878 static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, 1879 struct fuse_writepage_args *wpa) 1880 { 1881 if (!fc->sync_fs) 1882 return; 1883 1884 rcu_read_lock(); 1885 /* Prevent resurrection of dead bucket in unlikely race with syncfs */ 1886 do { 1887 wpa->bucket = rcu_dereference(fc->curr_bucket); 1888 } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count))); 1889 rcu_read_unlock(); 1890 } 1891 1892 static int fuse_writepage_locked(struct page *page) 1893 { 1894 struct address_space *mapping = page->mapping; 1895 struct inode *inode = mapping->host; 1896 struct fuse_conn *fc = get_fuse_conn(inode); 1897 struct fuse_inode *fi = get_fuse_inode(inode); 1898 struct fuse_writepage_args *wpa; 1899 struct fuse_args_pages *ap; 1900 struct page *tmp_page; 1901 int error = -ENOMEM; 1902 1903 set_page_writeback(page); 1904 1905 wpa = fuse_writepage_args_alloc(); 1906 if (!wpa) 1907 goto err; 1908 ap = &wpa->ia.ap; 1909 1910 tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 1911 if (!tmp_page) 1912 goto err_free; 1913 1914 error = -EIO; 1915 wpa->ia.ff = fuse_write_file_get(fi); 1916 if (!wpa->ia.ff) 1917 goto err_nofile; 1918 1919 fuse_writepage_add_to_bucket(fc, wpa); 1920 fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0); 1921 1922 copy_highpage(tmp_page, page); 1923 wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; 1924 wpa->next = NULL; 1925 ap->args.in_pages = true; 1926 ap->num_pages = 1; 1927 ap->pages[0] = tmp_page; 1928 ap->descs[0].offset = 0; 1929 ap->descs[0].length = PAGE_SIZE; 1930 
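	/* fuse_writepage_end() is called once the userspace reply arrives */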
ap->args.end = fuse_writepage_end; 1931 wpa->inode = inode; 1932 1933 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); 1934 inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); 1935 1936 spin_lock(&fi->lock); 1937 tree_insert(&fi->writepages, wpa); 1938 list_add_tail(&wpa->queue_entry, &fi->queued_writes); 1939 fuse_flush_writepages(inode); 1940 spin_unlock(&fi->lock); 1941 1942 end_page_writeback(page); 1943 1944 return 0; 1945 1946 err_nofile: 1947 __free_page(tmp_page); 1948 err_free: 1949 kfree(wpa); 1950 err: 1951 mapping_set_error(page->mapping, error); 1952 end_page_writeback(page); 1953 return error; 1954 } 1955 1956 static int fuse_writepage(struct page *page, struct writeback_control *wbc) 1957 { 1958 int err; 1959 1960 if (fuse_page_is_writeback(page->mapping->host, page->index)) { 1961 /* 1962 * ->writepages() should be called for sync() and friends. We 1963 * should only get here on direct reclaim and then we are 1964 * allowed to skip a page which is already in flight 1965 */ 1966 WARN_ON(wbc->sync_mode == WB_SYNC_ALL); 1967 1968 redirty_page_for_writepage(wbc, page); 1969 unlock_page(page); 1970 return 0; 1971 } 1972 1973 err = fuse_writepage_locked(page); 1974 unlock_page(page); 1975 1976 return err; 1977 } 1978 1979 struct fuse_fill_wb_data { 1980 struct fuse_writepage_args *wpa; 1981 struct fuse_file *ff; 1982 struct inode *inode; 1983 struct page **orig_pages; 1984 unsigned int max_pages; 1985 }; 1986 1987 static bool fuse_pages_realloc(struct fuse_fill_wb_data *data) 1988 { 1989 struct fuse_args_pages *ap = &data->wpa->ia.ap; 1990 struct fuse_conn *fc = get_fuse_conn(data->inode); 1991 struct page **pages; 1992 struct fuse_page_desc *descs; 1993 unsigned int npages = min_t(unsigned int, 1994 max_t(unsigned int, data->max_pages * 2, 1995 FUSE_DEFAULT_MAX_PAGES_PER_REQ), 1996 fc->max_pages); 1997 WARN_ON(npages <= data->max_pages); 1998 1999 pages = fuse_pages_alloc(npages, GFP_NOFS, &descs); 2000 if (!pages) 2001 return false; 2002 2003 memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages); 2004 memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages); 2005 kfree(ap->pages); 2006 ap->pages = pages; 2007 ap->descs = descs; 2008 data->max_pages = npages; 2009 2010 return true; 2011 } 2012 2013 static void fuse_writepages_send(struct fuse_fill_wb_data *data) 2014 { 2015 struct fuse_writepage_args *wpa = data->wpa; 2016 struct inode *inode = data->inode; 2017 struct fuse_inode *fi = get_fuse_inode(inode); 2018 int num_pages = wpa->ia.ap.num_pages; 2019 int i; 2020 2021 wpa->ia.ff = fuse_file_get(data->ff); 2022 spin_lock(&fi->lock); 2023 list_add_tail(&wpa->queue_entry, &fi->queued_writes); 2024 fuse_flush_writepages(inode); 2025 spin_unlock(&fi->lock); 2026 2027 for (i = 0; i < num_pages; i++) 2028 end_page_writeback(data->orig_pages[i]); 2029 } 2030 2031 /* 2032 * Check under fi->lock if the page is under writeback, and insert it onto the 2033 * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's 2034 * one already added for a page at this offset. If there's none, then insert 2035 * this new request onto the auxiliary list, otherwise reuse the existing one by 2036 * swapping the new temp page with the old one. 
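 * Swapping is sufficient because the auxiliary request has not been sent to
 * userspace yet, so only the most recent copy of the page needs to reach it.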
2037 */ 2038 static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, 2039 struct page *page) 2040 { 2041 struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); 2042 struct fuse_writepage_args *tmp; 2043 struct fuse_writepage_args *old_wpa; 2044 struct fuse_args_pages *new_ap = &new_wpa->ia.ap; 2045 2046 WARN_ON(new_ap->num_pages != 0); 2047 new_ap->num_pages = 1; 2048 2049 spin_lock(&fi->lock); 2050 old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa); 2051 if (!old_wpa) { 2052 spin_unlock(&fi->lock); 2053 return true; 2054 } 2055 2056 for (tmp = old_wpa->next; tmp; tmp = tmp->next) { 2057 pgoff_t curr_index; 2058 2059 WARN_ON(tmp->inode != new_wpa->inode); 2060 curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT; 2061 if (curr_index == page->index) { 2062 WARN_ON(tmp->ia.ap.num_pages != 1); 2063 swap(tmp->ia.ap.pages[0], new_ap->pages[0]); 2064 break; 2065 } 2066 } 2067 2068 if (!tmp) { 2069 new_wpa->next = old_wpa->next; 2070 old_wpa->next = new_wpa; 2071 } 2072 2073 spin_unlock(&fi->lock); 2074 2075 if (tmp) { 2076 struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode); 2077 2078 dec_wb_stat(&bdi->wb, WB_WRITEBACK); 2079 dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP); 2080 wb_writeout_inc(&bdi->wb); 2081 fuse_writepage_free(new_wpa); 2082 } 2083 2084 return false; 2085 } 2086 2087 static bool fuse_writepage_need_send(struct fuse_conn *fc, struct page *page, 2088 struct fuse_args_pages *ap, 2089 struct fuse_fill_wb_data *data) 2090 { 2091 WARN_ON(!ap->num_pages); 2092 2093 /* 2094 * Being under writeback is unlikely but possible. For example direct 2095 * read to an mmaped fuse file will set the page dirty twice; once when 2096 * the pages are faulted with get_user_pages(), and then after the read 2097 * completed. 2098 */ 2099 if (fuse_page_is_writeback(data->inode, page->index)) 2100 return true; 2101 2102 /* Reached max pages */ 2103 if (ap->num_pages == fc->max_pages) 2104 return true; 2105 2106 /* Reached max write bytes */ 2107 if ((ap->num_pages + 1) * PAGE_SIZE > fc->max_write) 2108 return true; 2109 2110 /* Discontinuity */ 2111 if (data->orig_pages[ap->num_pages - 1]->index + 1 != page->index) 2112 return true; 2113 2114 /* Need to grow the pages array? If so, did the expansion fail? */ 2115 if (ap->num_pages == data->max_pages && !fuse_pages_realloc(data)) 2116 return true; 2117 2118 return false; 2119 } 2120 2121 static int fuse_writepages_fill(struct page *page, 2122 struct writeback_control *wbc, void *_data) 2123 { 2124 struct fuse_fill_wb_data *data = _data; 2125 struct fuse_writepage_args *wpa = data->wpa; 2126 struct fuse_args_pages *ap = &wpa->ia.ap; 2127 struct inode *inode = data->inode; 2128 struct fuse_inode *fi = get_fuse_inode(inode); 2129 struct fuse_conn *fc = get_fuse_conn(inode); 2130 struct page *tmp_page; 2131 int err; 2132 2133 if (!data->ff) { 2134 err = -EIO; 2135 data->ff = fuse_write_file_get(fi); 2136 if (!data->ff) 2137 goto out_unlock; 2138 } 2139 2140 if (wpa && fuse_writepage_need_send(fc, page, ap, data)) { 2141 fuse_writepages_send(data); 2142 data->wpa = NULL; 2143 } 2144 2145 err = -ENOMEM; 2146 tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 2147 if (!tmp_page) 2148 goto out_unlock; 2149 2150 /* 2151 * The page must not be redirtied until the writeout is completed 2152 * (i.e. userspace has sent a reply to the write request). Otherwise 2153 * there could be more than one temporary page instance for each real 2154 * page. 
2155 * 2156 * This is ensured by holding the page lock in page_mkwrite() while 2157 * checking fuse_page_is_writeback(). We already hold the page lock 2158 * since clear_page_dirty_for_io() and keep it held until we add the 2159 * request to the fi->writepages list and increment ap->num_pages. 2160 * After this fuse_page_is_writeback() will indicate that the page is 2161 * under writeback, so we can release the page lock. 2162 */ 2163 if (data->wpa == NULL) { 2164 err = -ENOMEM; 2165 wpa = fuse_writepage_args_alloc(); 2166 if (!wpa) { 2167 __free_page(tmp_page); 2168 goto out_unlock; 2169 } 2170 fuse_writepage_add_to_bucket(fc, wpa); 2171 2172 data->max_pages = 1; 2173 2174 ap = &wpa->ia.ap; 2175 fuse_write_args_fill(&wpa->ia, data->ff, page_offset(page), 0); 2176 wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; 2177 wpa->next = NULL; 2178 ap->args.in_pages = true; 2179 ap->args.end = fuse_writepage_end; 2180 ap->num_pages = 0; 2181 wpa->inode = inode; 2182 } 2183 set_page_writeback(page); 2184 2185 copy_highpage(tmp_page, page); 2186 ap->pages[ap->num_pages] = tmp_page; 2187 ap->descs[ap->num_pages].offset = 0; 2188 ap->descs[ap->num_pages].length = PAGE_SIZE; 2189 data->orig_pages[ap->num_pages] = page; 2190 2191 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); 2192 inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); 2193 2194 err = 0; 2195 if (data->wpa) { 2196 /* 2197 * Protected by fi->lock against concurrent access by 2198 * fuse_page_is_writeback(). 2199 */ 2200 spin_lock(&fi->lock); 2201 ap->num_pages++; 2202 spin_unlock(&fi->lock); 2203 } else if (fuse_writepage_add(wpa, page)) { 2204 data->wpa = wpa; 2205 } else { 2206 end_page_writeback(page); 2207 } 2208 out_unlock: 2209 unlock_page(page); 2210 2211 return err; 2212 } 2213 2214 static int fuse_writepages(struct address_space *mapping, 2215 struct writeback_control *wbc) 2216 { 2217 struct inode *inode = mapping->host; 2218 struct fuse_conn *fc = get_fuse_conn(inode); 2219 struct fuse_fill_wb_data data; 2220 int err; 2221 2222 err = -EIO; 2223 if (fuse_is_bad(inode)) 2224 goto out; 2225 2226 data.inode = inode; 2227 data.wpa = NULL; 2228 data.ff = NULL; 2229 2230 err = -ENOMEM; 2231 data.orig_pages = kcalloc(fc->max_pages, 2232 sizeof(struct page *), 2233 GFP_NOFS); 2234 if (!data.orig_pages) 2235 goto out; 2236 2237 err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); 2238 if (data.wpa) { 2239 WARN_ON(!data.wpa->ia.ap.num_pages); 2240 fuse_writepages_send(&data); 2241 } 2242 if (data.ff) 2243 fuse_file_put(data.ff, false, false); 2244 2245 kfree(data.orig_pages); 2246 out: 2247 return err; 2248 } 2249 2250 /* 2251 * It's worthy to make sure that space is reserved on disk for the write, 2252 * but how to implement it without killing performance need more thinking. 
2253 */ 2254 static int fuse_write_begin(struct file *file, struct address_space *mapping, 2255 loff_t pos, unsigned len, unsigned flags, 2256 struct page **pagep, void **fsdata) 2257 { 2258 pgoff_t index = pos >> PAGE_SHIFT; 2259 struct fuse_conn *fc = get_fuse_conn(file_inode(file)); 2260 struct page *page; 2261 loff_t fsize; 2262 int err = -ENOMEM; 2263 2264 WARN_ON(!fc->writeback_cache); 2265 2266 page = grab_cache_page_write_begin(mapping, index, flags); 2267 if (!page) 2268 goto error; 2269 2270 fuse_wait_on_page_writeback(mapping->host, page->index); 2271 2272 if (PageUptodate(page) || len == PAGE_SIZE) 2273 goto success; 2274 /* 2275 * Check if the start this page comes after the end of file, in which 2276 * case the readpage can be optimized away. 2277 */ 2278 fsize = i_size_read(mapping->host); 2279 if (fsize <= (pos & PAGE_MASK)) { 2280 size_t off = pos & ~PAGE_MASK; 2281 if (off) 2282 zero_user_segment(page, 0, off); 2283 goto success; 2284 } 2285 err = fuse_do_readpage(file, page); 2286 if (err) 2287 goto cleanup; 2288 success: 2289 *pagep = page; 2290 return 0; 2291 2292 cleanup: 2293 unlock_page(page); 2294 put_page(page); 2295 error: 2296 return err; 2297 } 2298 2299 static int fuse_write_end(struct file *file, struct address_space *mapping, 2300 loff_t pos, unsigned len, unsigned copied, 2301 struct page *page, void *fsdata) 2302 { 2303 struct inode *inode = page->mapping->host; 2304 2305 /* Haven't copied anything? Skip zeroing, size extending, dirtying. */ 2306 if (!copied) 2307 goto unlock; 2308 2309 if (!PageUptodate(page)) { 2310 /* Zero any unwritten bytes at the end of the page */ 2311 size_t endoff = (pos + copied) & ~PAGE_MASK; 2312 if (endoff) 2313 zero_user_segment(page, endoff, PAGE_SIZE); 2314 SetPageUptodate(page); 2315 } 2316 2317 fuse_write_update_size(inode, pos + copied); 2318 set_page_dirty(page); 2319 2320 unlock: 2321 unlock_page(page); 2322 put_page(page); 2323 2324 return copied; 2325 } 2326 2327 static int fuse_launder_page(struct page *page) 2328 { 2329 int err = 0; 2330 if (clear_page_dirty_for_io(page)) { 2331 struct inode *inode = page->mapping->host; 2332 2333 /* Serialize with pending writeback for the same page */ 2334 fuse_wait_on_page_writeback(inode, page->index); 2335 err = fuse_writepage_locked(page); 2336 if (!err) 2337 fuse_wait_on_page_writeback(inode, page->index); 2338 } 2339 return err; 2340 } 2341 2342 /* 2343 * Write back dirty pages now, because there may not be any suitable 2344 * open files later 2345 */ 2346 static void fuse_vma_close(struct vm_area_struct *vma) 2347 { 2348 filemap_write_and_wait(vma->vm_file->f_mapping); 2349 } 2350 2351 /* 2352 * Wait for writeback against this page to complete before allowing it 2353 * to be marked dirty again, and hence written back again, possibly 2354 * before the previous writepage completed. 2355 * 2356 * Block here, instead of in ->writepage(), so that the userspace fs 2357 * can only block processes actually operating on the filesystem. 
2358 * 2359 * Otherwise unprivileged userspace fs would be able to block 2360 * unrelated: 2361 * 2362 * - page migration 2363 * - sync(2) 2364 * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER 2365 */ 2366 static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) 2367 { 2368 struct page *page = vmf->page; 2369 struct inode *inode = file_inode(vmf->vma->vm_file); 2370 2371 file_update_time(vmf->vma->vm_file); 2372 lock_page(page); 2373 if (page->mapping != inode->i_mapping) { 2374 unlock_page(page); 2375 return VM_FAULT_NOPAGE; 2376 } 2377 2378 fuse_wait_on_page_writeback(inode, page->index); 2379 return VM_FAULT_LOCKED; 2380 } 2381 2382 static const struct vm_operations_struct fuse_file_vm_ops = { 2383 .close = fuse_vma_close, 2384 .fault = filemap_fault, 2385 .map_pages = filemap_map_pages, 2386 .page_mkwrite = fuse_page_mkwrite, 2387 }; 2388 2389 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) 2390 { 2391 struct fuse_file *ff = file->private_data; 2392 2393 /* DAX mmap is superior to direct_io mmap */ 2394 if (FUSE_IS_DAX(file_inode(file))) 2395 return fuse_dax_mmap(file, vma); 2396 2397 if (ff->open_flags & FOPEN_DIRECT_IO) { 2398 /* Can't provide the coherency needed for MAP_SHARED */ 2399 if (vma->vm_flags & VM_MAYSHARE) 2400 return -ENODEV; 2401 2402 invalidate_inode_pages2(file->f_mapping); 2403 2404 return generic_file_mmap(file, vma); 2405 } 2406 2407 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) 2408 fuse_link_write_file(file); 2409 2410 file_accessed(file); 2411 vma->vm_ops = &fuse_file_vm_ops; 2412 return 0; 2413 } 2414 2415 static int convert_fuse_file_lock(struct fuse_conn *fc, 2416 const struct fuse_file_lock *ffl, 2417 struct file_lock *fl) 2418 { 2419 switch (ffl->type) { 2420 case F_UNLCK: 2421 break; 2422 2423 case F_RDLCK: 2424 case F_WRLCK: 2425 if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX || 2426 ffl->end < ffl->start) 2427 return -EIO; 2428 2429 fl->fl_start = ffl->start; 2430 fl->fl_end = ffl->end; 2431 2432 /* 2433 * Convert pid into init's pid namespace. The locks API will 2434 * translate it into the caller's pid namespace. 
2435 */ 2436 rcu_read_lock(); 2437 fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns); 2438 rcu_read_unlock(); 2439 break; 2440 2441 default: 2442 return -EIO; 2443 } 2444 fl->fl_type = ffl->type; 2445 return 0; 2446 } 2447 2448 static void fuse_lk_fill(struct fuse_args *args, struct file *file, 2449 const struct file_lock *fl, int opcode, pid_t pid, 2450 int flock, struct fuse_lk_in *inarg) 2451 { 2452 struct inode *inode = file_inode(file); 2453 struct fuse_conn *fc = get_fuse_conn(inode); 2454 struct fuse_file *ff = file->private_data; 2455 2456 memset(inarg, 0, sizeof(*inarg)); 2457 inarg->fh = ff->fh; 2458 inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner); 2459 inarg->lk.start = fl->fl_start; 2460 inarg->lk.end = fl->fl_end; 2461 inarg->lk.type = fl->fl_type; 2462 inarg->lk.pid = pid; 2463 if (flock) 2464 inarg->lk_flags |= FUSE_LK_FLOCK; 2465 args->opcode = opcode; 2466 args->nodeid = get_node_id(inode); 2467 args->in_numargs = 1; 2468 args->in_args[0].size = sizeof(*inarg); 2469 args->in_args[0].value = inarg; 2470 } 2471 2472 static int fuse_getlk(struct file *file, struct file_lock *fl) 2473 { 2474 struct inode *inode = file_inode(file); 2475 struct fuse_mount *fm = get_fuse_mount(inode); 2476 FUSE_ARGS(args); 2477 struct fuse_lk_in inarg; 2478 struct fuse_lk_out outarg; 2479 int err; 2480 2481 fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg); 2482 args.out_numargs = 1; 2483 args.out_args[0].size = sizeof(outarg); 2484 args.out_args[0].value = &outarg; 2485 err = fuse_simple_request(fm, &args); 2486 if (!err) 2487 err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl); 2488 2489 return err; 2490 } 2491 2492 static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) 2493 { 2494 struct inode *inode = file_inode(file); 2495 struct fuse_mount *fm = get_fuse_mount(inode); 2496 FUSE_ARGS(args); 2497 struct fuse_lk_in inarg; 2498 int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; 2499 struct pid *pid = fl->fl_type != F_UNLCK ? 
task_tgid(current) : NULL; 2500 pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns); 2501 int err; 2502 2503 if (fl->fl_lmops && fl->fl_lmops->lm_grant) { 2504 /* NLM needs asynchronous locks, which we don't support yet */ 2505 return -ENOLCK; 2506 } 2507 2508 /* Unlock on close is handled by the flush method */ 2509 if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX) 2510 return 0; 2511 2512 fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); 2513 err = fuse_simple_request(fm, &args); 2514 2515 /* locking is restartable */ 2516 if (err == -EINTR) 2517 err = -ERESTARTSYS; 2518 2519 return err; 2520 } 2521 2522 static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) 2523 { 2524 struct inode *inode = file_inode(file); 2525 struct fuse_conn *fc = get_fuse_conn(inode); 2526 int err; 2527 2528 if (cmd == F_CANCELLK) { 2529 err = 0; 2530 } else if (cmd == F_GETLK) { 2531 if (fc->no_lock) { 2532 posix_test_lock(file, fl); 2533 err = 0; 2534 } else 2535 err = fuse_getlk(file, fl); 2536 } else { 2537 if (fc->no_lock) 2538 err = posix_lock_file(file, fl, NULL); 2539 else 2540 err = fuse_setlk(file, fl, 0); 2541 } 2542 return err; 2543 } 2544 2545 static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) 2546 { 2547 struct inode *inode = file_inode(file); 2548 struct fuse_conn *fc = get_fuse_conn(inode); 2549 int err; 2550 2551 if (fc->no_flock) { 2552 err = locks_lock_file_wait(file, fl); 2553 } else { 2554 struct fuse_file *ff = file->private_data; 2555 2556 /* emulate flock with POSIX locks */ 2557 ff->flock = true; 2558 err = fuse_setlk(file, fl, 1); 2559 } 2560 2561 return err; 2562 } 2563 2564 static sector_t fuse_bmap(struct address_space *mapping, sector_t block) 2565 { 2566 struct inode *inode = mapping->host; 2567 struct fuse_mount *fm = get_fuse_mount(inode); 2568 FUSE_ARGS(args); 2569 struct fuse_bmap_in inarg; 2570 struct fuse_bmap_out outarg; 2571 int err; 2572 2573 if (!inode->i_sb->s_bdev || fm->fc->no_bmap) 2574 return 0; 2575 2576 memset(&inarg, 0, sizeof(inarg)); 2577 inarg.block = block; 2578 inarg.blocksize = inode->i_sb->s_blocksize; 2579 args.opcode = FUSE_BMAP; 2580 args.nodeid = get_node_id(inode); 2581 args.in_numargs = 1; 2582 args.in_args[0].size = sizeof(inarg); 2583 args.in_args[0].value = &inarg; 2584 args.out_numargs = 1; 2585 args.out_args[0].size = sizeof(outarg); 2586 args.out_args[0].value = &outarg; 2587 err = fuse_simple_request(fm, &args); 2588 if (err == -ENOSYS) 2589 fm->fc->no_bmap = 1; 2590 2591 return err ? 
0 : outarg.block; 2592 } 2593 2594 static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) 2595 { 2596 struct inode *inode = file->f_mapping->host; 2597 struct fuse_mount *fm = get_fuse_mount(inode); 2598 struct fuse_file *ff = file->private_data; 2599 FUSE_ARGS(args); 2600 struct fuse_lseek_in inarg = { 2601 .fh = ff->fh, 2602 .offset = offset, 2603 .whence = whence 2604 }; 2605 struct fuse_lseek_out outarg; 2606 int err; 2607 2608 if (fm->fc->no_lseek) 2609 goto fallback; 2610 2611 args.opcode = FUSE_LSEEK; 2612 args.nodeid = ff->nodeid; 2613 args.in_numargs = 1; 2614 args.in_args[0].size = sizeof(inarg); 2615 args.in_args[0].value = &inarg; 2616 args.out_numargs = 1; 2617 args.out_args[0].size = sizeof(outarg); 2618 args.out_args[0].value = &outarg; 2619 err = fuse_simple_request(fm, &args); 2620 if (err) { 2621 if (err == -ENOSYS) { 2622 fm->fc->no_lseek = 1; 2623 goto fallback; 2624 } 2625 return err; 2626 } 2627 2628 return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes); 2629 2630 fallback: 2631 err = fuse_update_attributes(inode, file); 2632 if (!err) 2633 return generic_file_llseek(file, offset, whence); 2634 else 2635 return err; 2636 } 2637 2638 static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) 2639 { 2640 loff_t retval; 2641 struct inode *inode = file_inode(file); 2642 2643 switch (whence) { 2644 case SEEK_SET: 2645 case SEEK_CUR: 2646 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */ 2647 retval = generic_file_llseek(file, offset, whence); 2648 break; 2649 case SEEK_END: 2650 inode_lock(inode); 2651 retval = fuse_update_attributes(inode, file); 2652 if (!retval) 2653 retval = generic_file_llseek(file, offset, whence); 2654 inode_unlock(inode); 2655 break; 2656 case SEEK_HOLE: 2657 case SEEK_DATA: 2658 inode_lock(inode); 2659 retval = fuse_lseek(file, offset, whence); 2660 inode_unlock(inode); 2661 break; 2662 default: 2663 retval = -EINVAL; 2664 } 2665 2666 return retval; 2667 } 2668 2669 /* 2670 * All files which have been polled are linked to RB tree 2671 * fuse_conn->polled_files which is indexed by kh. Walk the tree and 2672 * find the matching one. 2673 */ 2674 static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh, 2675 struct rb_node **parent_out) 2676 { 2677 struct rb_node **link = &fc->polled_files.rb_node; 2678 struct rb_node *last = NULL; 2679 2680 while (*link) { 2681 struct fuse_file *ff; 2682 2683 last = *link; 2684 ff = rb_entry(last, struct fuse_file, polled_node); 2685 2686 if (kh < ff->kh) 2687 link = &last->rb_left; 2688 else if (kh > ff->kh) 2689 link = &last->rb_right; 2690 else 2691 return link; 2692 } 2693 2694 if (parent_out) 2695 *parent_out = last; 2696 return link; 2697 } 2698 2699 /* 2700 * The file is about to be polled. Make sure it's on the polled_files 2701 * RB tree. Note that files once added to the polled_files tree are 2702 * not removed before the file is released. This is because a file 2703 * polled once is likely to be polled again. 
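 *
 * Editor's note: on the application side, this path is driven by ordinary
 * poll(2)/select(2)/epoll(7) calls on a file in the FUSE mount.  A stand-alone
 * user-space sketch (the path is hypothetical; whether POLLIN is ever reported
 * depends on the server implementing FUSE_POLL, or on the default mask when it
 * does not):
 *
 *	#include <fcntl.h>
 *	#include <poll.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *
 *	int main(void)
 *	{
 *		struct pollfd pfd;
 *
 *		pfd.fd = open("/mnt/fuse/events", O_RDONLY);
 *		if (pfd.fd < 0) {
 *			perror("open");
 *			return 1;
 *		}
 *		pfd.events = POLLIN;
 *		// Each poll() may turn into a FUSE_POLL request; the first one
 *		// also registers the file on the polled_files tree described
 *		// above.
 *		if (poll(&pfd, 1, 5000) > 0 && (pfd.revents & POLLIN))
 *			printf("data ready\n");
 *		close(pfd.fd);
 *		return 0;
 *	}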
2704 */ 2705 static void fuse_register_polled_file(struct fuse_conn *fc, 2706 struct fuse_file *ff) 2707 { 2708 spin_lock(&fc->lock); 2709 if (RB_EMPTY_NODE(&ff->polled_node)) { 2710 struct rb_node **link, *parent; 2711 2712 link = fuse_find_polled_node(fc, ff->kh, &parent); 2713 BUG_ON(*link); 2714 rb_link_node(&ff->polled_node, parent, link); 2715 rb_insert_color(&ff->polled_node, &fc->polled_files); 2716 } 2717 spin_unlock(&fc->lock); 2718 } 2719 2720 __poll_t fuse_file_poll(struct file *file, poll_table *wait) 2721 { 2722 struct fuse_file *ff = file->private_data; 2723 struct fuse_mount *fm = ff->fm; 2724 struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; 2725 struct fuse_poll_out outarg; 2726 FUSE_ARGS(args); 2727 int err; 2728 2729 if (fm->fc->no_poll) 2730 return DEFAULT_POLLMASK; 2731 2732 poll_wait(file, &ff->poll_wait, wait); 2733 inarg.events = mangle_poll(poll_requested_events(wait)); 2734 2735 /* 2736 * Ask for notification iff there's someone waiting for it. 2737 * The client may ignore the flag and always notify. 2738 */ 2739 if (waitqueue_active(&ff->poll_wait)) { 2740 inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY; 2741 fuse_register_polled_file(fm->fc, ff); 2742 } 2743 2744 args.opcode = FUSE_POLL; 2745 args.nodeid = ff->nodeid; 2746 args.in_numargs = 1; 2747 args.in_args[0].size = sizeof(inarg); 2748 args.in_args[0].value = &inarg; 2749 args.out_numargs = 1; 2750 args.out_args[0].size = sizeof(outarg); 2751 args.out_args[0].value = &outarg; 2752 err = fuse_simple_request(fm, &args); 2753 2754 if (!err) 2755 return demangle_poll(outarg.revents); 2756 if (err == -ENOSYS) { 2757 fm->fc->no_poll = 1; 2758 return DEFAULT_POLLMASK; 2759 } 2760 return EPOLLERR; 2761 } 2762 EXPORT_SYMBOL_GPL(fuse_file_poll); 2763 2764 /* 2765 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and 2766 * wakes up the poll waiters. 
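 *
 * Editor's note: the notification itself comes from the userspace server.
 * Assuming libfuse's low-level API (the function names below are libfuse's,
 * not this driver's; saved_ph and current_revents are placeholder variables),
 * a server stashes the poll handle it got in its .poll handler and kicks
 * waiters later, roughly:
 *
 *	// in the server's .poll handler (ph may be NULL if the kernel did
 *	// not ask for notification):
 *	if (ph)
 *		saved_ph = ph;			// keep it for later wakeups
 *	fuse_reply_poll(req, current_revents);
 *
 *	// later, when new data arrives for that file:
 *	fuse_lowlevel_notify_poll(saved_ph);	// arrives here as FUSE_NOTIFY_POLL
 *	fuse_pollhandle_destroy(saved_ph);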
2767 */ 2768 int fuse_notify_poll_wakeup(struct fuse_conn *fc, 2769 struct fuse_notify_poll_wakeup_out *outarg) 2770 { 2771 u64 kh = outarg->kh; 2772 struct rb_node **link; 2773 2774 spin_lock(&fc->lock); 2775 2776 link = fuse_find_polled_node(fc, kh, NULL); 2777 if (*link) { 2778 struct fuse_file *ff; 2779 2780 ff = rb_entry(*link, struct fuse_file, polled_node); 2781 wake_up_interruptible_sync(&ff->poll_wait); 2782 } 2783 2784 spin_unlock(&fc->lock); 2785 return 0; 2786 } 2787 2788 static void fuse_do_truncate(struct file *file) 2789 { 2790 struct inode *inode = file->f_mapping->host; 2791 struct iattr attr; 2792 2793 attr.ia_valid = ATTR_SIZE; 2794 attr.ia_size = i_size_read(inode); 2795 2796 attr.ia_file = file; 2797 attr.ia_valid |= ATTR_FILE; 2798 2799 fuse_do_setattr(file_dentry(file), &attr, file); 2800 } 2801 2802 static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off) 2803 { 2804 return round_up(off, fc->max_pages << PAGE_SHIFT); 2805 } 2806 2807 static ssize_t 2808 fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) 2809 { 2810 DECLARE_COMPLETION_ONSTACK(wait); 2811 ssize_t ret = 0; 2812 struct file *file = iocb->ki_filp; 2813 struct fuse_file *ff = file->private_data; 2814 loff_t pos = 0; 2815 struct inode *inode; 2816 loff_t i_size; 2817 size_t count = iov_iter_count(iter), shortened = 0; 2818 loff_t offset = iocb->ki_pos; 2819 struct fuse_io_priv *io; 2820 2821 pos = offset; 2822 inode = file->f_mapping->host; 2823 i_size = i_size_read(inode); 2824 2825 if ((iov_iter_rw(iter) == READ) && (offset >= i_size)) 2826 return 0; 2827 2828 io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL); 2829 if (!io) 2830 return -ENOMEM; 2831 spin_lock_init(&io->lock); 2832 kref_init(&io->refcnt); 2833 io->reqs = 1; 2834 io->bytes = -1; 2835 io->size = 0; 2836 io->offset = offset; 2837 io->write = (iov_iter_rw(iter) == WRITE); 2838 io->err = 0; 2839 /* 2840 * By default, we want to optimize all I/Os with async request 2841 * submission to the client filesystem if supported. 2842 */ 2843 io->async = ff->fm->fc->async_dio; 2844 io->iocb = iocb; 2845 io->blocking = is_sync_kiocb(iocb); 2846 2847 /* optimization for short read */ 2848 if (io->async && !io->write && offset + count > i_size) { 2849 iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset)); 2850 shortened = count - iov_iter_count(iter); 2851 count -= shortened; 2852 } 2853 2854 /* 2855 * We cannot asynchronously extend the size of a file. 2856 * In such case the aio will behave exactly like sync io. 2857 */ 2858 if ((offset + count > i_size) && io->write) 2859 io->blocking = true; 2860 2861 if (io->async && io->blocking) { 2862 /* 2863 * Additional reference to keep io around after 2864 * calling fuse_aio_complete() 2865 */ 2866 kref_get(&io->refcnt); 2867 io->done = &wait; 2868 } 2869 2870 if (iov_iter_rw(iter) == WRITE) { 2871 ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE); 2872 fuse_invalidate_attr(inode); 2873 } else { 2874 ret = __fuse_direct_read(io, iter, &pos); 2875 } 2876 iov_iter_reexpand(iter, iov_iter_count(iter) + shortened); 2877 2878 if (io->async) { 2879 bool blocking = io->blocking; 2880 2881 fuse_aio_complete(io, ret < 0 ? 
ret : 0, -1); 2882 2883 /* we have a non-extending, async request, so return */ 2884 if (!blocking) 2885 return -EIOCBQUEUED; 2886 2887 wait_for_completion(&wait); 2888 ret = fuse_get_res_by_io(io); 2889 } 2890 2891 kref_put(&io->refcnt, fuse_io_release); 2892 2893 if (iov_iter_rw(iter) == WRITE) { 2894 if (ret > 0) 2895 fuse_write_update_size(inode, pos); 2896 else if (ret < 0 && offset + count > i_size) 2897 fuse_do_truncate(file); 2898 } 2899 2900 return ret; 2901 } 2902 2903 static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end) 2904 { 2905 int err = filemap_write_and_wait_range(inode->i_mapping, start, -1); 2906 2907 if (!err) 2908 fuse_sync_writes(inode); 2909 2910 return err; 2911 } 2912 2913 static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, 2914 loff_t length) 2915 { 2916 struct fuse_file *ff = file->private_data; 2917 struct inode *inode = file_inode(file); 2918 struct fuse_inode *fi = get_fuse_inode(inode); 2919 struct fuse_mount *fm = ff->fm; 2920 FUSE_ARGS(args); 2921 struct fuse_fallocate_in inarg = { 2922 .fh = ff->fh, 2923 .offset = offset, 2924 .length = length, 2925 .mode = mode 2926 }; 2927 int err; 2928 bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || 2929 (mode & (FALLOC_FL_PUNCH_HOLE | 2930 FALLOC_FL_ZERO_RANGE)); 2931 2932 bool block_faults = FUSE_IS_DAX(inode) && lock_inode; 2933 2934 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | 2935 FALLOC_FL_ZERO_RANGE)) 2936 return -EOPNOTSUPP; 2937 2938 if (fm->fc->no_fallocate) 2939 return -EOPNOTSUPP; 2940 2941 if (lock_inode) { 2942 inode_lock(inode); 2943 if (block_faults) { 2944 filemap_invalidate_lock(inode->i_mapping); 2945 err = fuse_dax_break_layouts(inode, 0, 0); 2946 if (err) 2947 goto out; 2948 } 2949 2950 if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) { 2951 loff_t endbyte = offset + length - 1; 2952 2953 err = fuse_writeback_range(inode, offset, endbyte); 2954 if (err) 2955 goto out; 2956 } 2957 } 2958 2959 if (!(mode & FALLOC_FL_KEEP_SIZE) && 2960 offset + length > i_size_read(inode)) { 2961 err = inode_newsize_ok(inode, offset + length); 2962 if (err) 2963 goto out; 2964 } 2965 2966 if (!(mode & FALLOC_FL_KEEP_SIZE)) 2967 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); 2968 2969 args.opcode = FUSE_FALLOCATE; 2970 args.nodeid = ff->nodeid; 2971 args.in_numargs = 1; 2972 args.in_args[0].size = sizeof(inarg); 2973 args.in_args[0].value = &inarg; 2974 err = fuse_simple_request(fm, &args); 2975 if (err == -ENOSYS) { 2976 fm->fc->no_fallocate = 1; 2977 err = -EOPNOTSUPP; 2978 } 2979 if (err) 2980 goto out; 2981 2982 /* we could have extended the file */ 2983 if (!(mode & FALLOC_FL_KEEP_SIZE)) { 2984 bool changed = fuse_write_update_size(inode, offset + length); 2985 2986 if (changed && fm->fc->writeback_cache) 2987 file_update_time(file); 2988 } 2989 2990 if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) 2991 truncate_pagecache_range(inode, offset, offset + length - 1); 2992 2993 fuse_invalidate_attr(inode); 2994 2995 out: 2996 if (!(mode & FALLOC_FL_KEEP_SIZE)) 2997 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); 2998 2999 if (block_faults) 3000 filemap_invalidate_unlock(inode->i_mapping); 3001 3002 if (lock_inode) 3003 inode_unlock(inode); 3004 3005 return err; 3006 } 3007 3008 static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, 3009 struct file *file_out, loff_t pos_out, 3010 size_t len, unsigned int flags) 3011 { 3012 struct fuse_file *ff_in = file_in->private_data; 3013 struct fuse_file *ff_out = 
file_out->private_data; 3014 struct inode *inode_in = file_inode(file_in); 3015 struct inode *inode_out = file_inode(file_out); 3016 struct fuse_inode *fi_out = get_fuse_inode(inode_out); 3017 struct fuse_mount *fm = ff_in->fm; 3018 struct fuse_conn *fc = fm->fc; 3019 FUSE_ARGS(args); 3020 struct fuse_copy_file_range_in inarg = { 3021 .fh_in = ff_in->fh, 3022 .off_in = pos_in, 3023 .nodeid_out = ff_out->nodeid, 3024 .fh_out = ff_out->fh, 3025 .off_out = pos_out, 3026 .len = len, 3027 .flags = flags 3028 }; 3029 struct fuse_write_out outarg; 3030 ssize_t err; 3031 /* mark unstable when write-back is not used, and file_out gets 3032 * extended */ 3033 bool is_unstable = (!fc->writeback_cache) && 3034 ((pos_out + len) > inode_out->i_size); 3035 3036 if (fc->no_copy_file_range) 3037 return -EOPNOTSUPP; 3038 3039 if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) 3040 return -EXDEV; 3041 3042 inode_lock(inode_in); 3043 err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1); 3044 inode_unlock(inode_in); 3045 if (err) 3046 return err; 3047 3048 inode_lock(inode_out); 3049 3050 err = file_modified(file_out); 3051 if (err) 3052 goto out; 3053 3054 /* 3055 * Write out dirty pages in the destination file before sending the COPY 3056 * request to userspace. After the request is completed, truncate off 3057 * pages (including partial ones) from the cache that have been copied, 3058 * since these contain stale data at that point. 3059 * 3060 * This should be mostly correct, but if the COPY writes to partial 3061 * pages (at the start or end) and the parts not covered by the COPY are 3062 * written through a memory map after calling fuse_writeback_range(), 3063 * then these partial page modifications will be lost on truncation. 3064 * 3065 * It is unlikely that someone would rely on such mixed style 3066 * modifications. Yet this does give less guarantees than if the 3067 * copying was performed with write(2). 3068 * 3069 * To fix this a mapping->invalidate_lock could be used to prevent new 3070 * faults while the copy is ongoing. 
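 *
 * Editor's note, a worked example of the truncation below: with 4 KiB pages,
 * pos_out = 5000 and a server reply of outarg.size = 3000, the copied bytes
 * are 5000..7999, and truncate_inode_pages_range() is called on
 * ALIGN_DOWN(5000, 4096) = 4096 through ALIGN(8000, 4096) - 1 = 8191.  That
 * drops the whole containing page, including the uncopied bytes 4096..4999
 * and 8000..8191, which is exactly the "partial pages" case mentioned above.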
3071 */ 3072 err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1); 3073 if (err) 3074 goto out; 3075 3076 if (is_unstable) 3077 set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); 3078 3079 args.opcode = FUSE_COPY_FILE_RANGE; 3080 args.nodeid = ff_in->nodeid; 3081 args.in_numargs = 1; 3082 args.in_args[0].size = sizeof(inarg); 3083 args.in_args[0].value = &inarg; 3084 args.out_numargs = 1; 3085 args.out_args[0].size = sizeof(outarg); 3086 args.out_args[0].value = &outarg; 3087 err = fuse_simple_request(fm, &args); 3088 if (err == -ENOSYS) { 3089 fc->no_copy_file_range = 1; 3090 err = -EOPNOTSUPP; 3091 } 3092 if (err) 3093 goto out; 3094 3095 truncate_inode_pages_range(inode_out->i_mapping, 3096 ALIGN_DOWN(pos_out, PAGE_SIZE), 3097 ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1); 3098 3099 if (fc->writeback_cache) { 3100 fuse_write_update_size(inode_out, pos_out + outarg.size); 3101 file_update_time(file_out); 3102 } 3103 3104 fuse_invalidate_attr(inode_out); 3105 3106 err = outarg.size; 3107 out: 3108 if (is_unstable) 3109 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); 3110 3111 inode_unlock(inode_out); 3112 file_accessed(file_in); 3113 3114 return err; 3115 } 3116 3117 static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off, 3118 struct file *dst_file, loff_t dst_off, 3119 size_t len, unsigned int flags) 3120 { 3121 ssize_t ret; 3122 3123 ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off, 3124 len, flags); 3125 3126 if (ret == -EOPNOTSUPP || ret == -EXDEV) 3127 ret = generic_copy_file_range(src_file, src_off, dst_file, 3128 dst_off, len, flags); 3129 return ret; 3130 } 3131 3132 static const struct file_operations fuse_file_operations = { 3133 .llseek = fuse_file_llseek, 3134 .read_iter = fuse_file_read_iter, 3135 .write_iter = fuse_file_write_iter, 3136 .mmap = fuse_file_mmap, 3137 .open = fuse_open, 3138 .flush = fuse_flush, 3139 .release = fuse_release, 3140 .fsync = fuse_fsync, 3141 .lock = fuse_file_lock, 3142 .get_unmapped_area = thp_get_unmapped_area, 3143 .flock = fuse_file_flock, 3144 .splice_read = generic_file_splice_read, 3145 .splice_write = iter_file_splice_write, 3146 .unlocked_ioctl = fuse_file_ioctl, 3147 .compat_ioctl = fuse_file_compat_ioctl, 3148 .poll = fuse_file_poll, 3149 .fallocate = fuse_file_fallocate, 3150 .copy_file_range = fuse_copy_file_range, 3151 }; 3152 3153 static const struct address_space_operations fuse_file_aops = { 3154 .readpage = fuse_readpage, 3155 .readahead = fuse_readahead, 3156 .writepage = fuse_writepage, 3157 .writepages = fuse_writepages, 3158 .launder_page = fuse_launder_page, 3159 .set_page_dirty = __set_page_dirty_nobuffers, 3160 .bmap = fuse_bmap, 3161 .direct_IO = fuse_direct_IO, 3162 .write_begin = fuse_write_begin, 3163 .write_end = fuse_write_end, 3164 }; 3165 3166 void fuse_init_file_inode(struct inode *inode) 3167 { 3168 struct fuse_inode *fi = get_fuse_inode(inode); 3169 3170 inode->i_fop = &fuse_file_operations; 3171 inode->i_data.a_ops = &fuse_file_aops; 3172 3173 INIT_LIST_HEAD(&fi->write_files); 3174 INIT_LIST_HEAD(&fi->queued_writes); 3175 fi->writectr = 0; 3176 init_waitqueue_head(&fi->page_waitq); 3177 fi->writepages = RB_ROOT; 3178 3179 if (IS_ENABLED(CONFIG_FUSE_DAX)) 3180 fuse_dax_inode_init(inode); 3181 } 3182
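
/*
 * Editor's note: a stand-alone user-space illustration of the behaviour wired
 * up in fuse_copy_file_range() above; it is not part of this driver.  The
 * program tries copy_file_range(2) and, much like the kernel falls back to
 * generic_copy_file_range() on -EOPNOTSUPP/-EXDEV, falls back to a plain
 * read/write loop.  Paths are hypothetical; glibc >= 2.27 is assumed for the
 * copy_file_range() wrapper.
 */
#if 0	/* illustration only, never built with this file */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

/* Plain read/write copy, used when copy_file_range() is not supported */
static int copy_fallback(int in, int out)
{
	char buf[65536];
	ssize_t n;

	while ((n = read(in, buf, sizeof(buf))) > 0)
		if (write(out, buf, n) != n)
			return -1;
	return n < 0 ? -1 : 0;
}

int main(void)
{
	int in = open("/mnt/fuse/src", O_RDONLY);
	int out = open("/mnt/fuse/dst", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	struct stat st;
	off_t left;
	ssize_t n = 0;

	if (in < 0 || out < 0 || fstat(in, &st) < 0) {
		perror("setup");
		return 1;
	}
	for (left = st.st_size; left > 0; left -= n) {
		n = copy_file_range(in, NULL, out, NULL, left, 0);
		if (n <= 0)
			break;
	}
	/* Like the kernel's generic fallback, do it by hand if unsupported */
	if (left > 0 && n < 0 && (errno == EOPNOTSUPP || errno == EXDEV))
		n = copy_fallback(in, out);

	close(in);
	close(out);
	return n < 0 ? 1 : 0;
}
#endif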