1 /* 2 FUSE: Filesystem in Userspace 3 Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> 4 5 This program can be distributed under the terms of the GNU GPL. 6 See the file COPYING. 7 */ 8 9 #include "fuse_i.h" 10 11 #include <linux/pagemap.h> 12 #include <linux/slab.h> 13 #include <linux/kernel.h> 14 #include <linux/sched.h> 15 #include <linux/sched/signal.h> 16 #include <linux/module.h> 17 #include <linux/compat.h> 18 #include <linux/swap.h> 19 #include <linux/falloc.h> 20 #include <linux/uio.h> 21 22 static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, 23 struct fuse_page_desc **desc) 24 { 25 struct page **pages; 26 27 pages = kzalloc(npages * (sizeof(struct page *) + 28 sizeof(struct fuse_page_desc)), flags); 29 *desc = (void *) (pages + npages); 30 31 return pages; 32 } 33 34 static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, 35 int opcode, struct fuse_open_out *outargp) 36 { 37 struct fuse_open_in inarg; 38 FUSE_ARGS(args); 39 40 memset(&inarg, 0, sizeof(inarg)); 41 inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); 42 if (!fc->atomic_o_trunc) 43 inarg.flags &= ~O_TRUNC; 44 args.opcode = opcode; 45 args.nodeid = nodeid; 46 args.in_numargs = 1; 47 args.in_args[0].size = sizeof(inarg); 48 args.in_args[0].value = &inarg; 49 args.out_numargs = 1; 50 args.out_args[0].size = sizeof(*outargp); 51 args.out_args[0].value = outargp; 52 53 return fuse_simple_request(fc, &args); 54 } 55 56 struct fuse_release_args { 57 struct fuse_args args; 58 struct fuse_release_in inarg; 59 struct inode *inode; 60 }; 61 62 struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) 63 { 64 struct fuse_file *ff; 65 66 ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT); 67 if (unlikely(!ff)) 68 return NULL; 69 70 ff->fc = fc; 71 ff->release_args = kzalloc(sizeof(*ff->release_args), 72 GFP_KERNEL_ACCOUNT); 73 if (!ff->release_args) { 74 kfree(ff); 75 return NULL; 76 } 77 78 INIT_LIST_HEAD(&ff->write_entry); 79 mutex_init(&ff->readdir.lock); 80 refcount_set(&ff->count, 1); 81 RB_CLEAR_NODE(&ff->polled_node); 82 init_waitqueue_head(&ff->poll_wait); 83 84 ff->kh = atomic64_inc_return(&fc->khctr); 85 86 return ff; 87 } 88 89 void fuse_file_free(struct fuse_file *ff) 90 { 91 kfree(ff->release_args); 92 mutex_destroy(&ff->readdir.lock); 93 kfree(ff); 94 } 95 96 static struct fuse_file *fuse_file_get(struct fuse_file *ff) 97 { 98 refcount_inc(&ff->count); 99 return ff; 100 } 101 102 static void fuse_release_end(struct fuse_conn *fc, struct fuse_args *args, 103 int error) 104 { 105 struct fuse_release_args *ra = container_of(args, typeof(*ra), args); 106 107 iput(ra->inode); 108 kfree(ra); 109 } 110 111 static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) 112 { 113 if (refcount_dec_and_test(&ff->count)) { 114 struct fuse_args *args = &ff->release_args->args; 115 116 if (isdir ? ff->fc->no_opendir : ff->fc->no_open) { 117 /* Do nothing when client does not implement 'open' */ 118 fuse_release_end(ff->fc, args, 0); 119 } else if (sync) { 120 fuse_simple_request(ff->fc, args); 121 fuse_release_end(ff->fc, args, 0); 122 } else { 123 args->end = fuse_release_end; 124 if (fuse_simple_background(ff->fc, args, 125 GFP_KERNEL | __GFP_NOFAIL)) 126 fuse_release_end(ff->fc, args, -ENOTCONN); 127 } 128 kfree(ff); 129 } 130 } 131 132 int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, 133 bool isdir) 134 { 135 struct fuse_file *ff; 136 int opcode = isdir ? 
FUSE_OPENDIR : FUSE_OPEN; 137 138 ff = fuse_file_alloc(fc); 139 if (!ff) 140 return -ENOMEM; 141 142 ff->fh = 0; 143 /* Default for no-open */ 144 ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0); 145 if (isdir ? !fc->no_opendir : !fc->no_open) { 146 struct fuse_open_out outarg; 147 int err; 148 149 err = fuse_send_open(fc, nodeid, file, opcode, &outarg); 150 if (!err) { 151 ff->fh = outarg.fh; 152 ff->open_flags = outarg.open_flags; 153 154 } else if (err != -ENOSYS) { 155 fuse_file_free(ff); 156 return err; 157 } else { 158 if (isdir) 159 fc->no_opendir = 1; 160 else 161 fc->no_open = 1; 162 } 163 } 164 165 if (isdir) 166 ff->open_flags &= ~FOPEN_DIRECT_IO; 167 168 ff->nodeid = nodeid; 169 file->private_data = ff; 170 171 return 0; 172 } 173 EXPORT_SYMBOL_GPL(fuse_do_open); 174 175 static void fuse_link_write_file(struct file *file) 176 { 177 struct inode *inode = file_inode(file); 178 struct fuse_inode *fi = get_fuse_inode(inode); 179 struct fuse_file *ff = file->private_data; 180 /* 181 * file may be written through mmap, so chain it onto the 182 * inodes's write_file list 183 */ 184 spin_lock(&fi->lock); 185 if (list_empty(&ff->write_entry)) 186 list_add(&ff->write_entry, &fi->write_files); 187 spin_unlock(&fi->lock); 188 } 189 190 void fuse_finish_open(struct inode *inode, struct file *file) 191 { 192 struct fuse_file *ff = file->private_data; 193 struct fuse_conn *fc = get_fuse_conn(inode); 194 195 if (!(ff->open_flags & FOPEN_KEEP_CACHE)) 196 invalidate_inode_pages2(inode->i_mapping); 197 if (ff->open_flags & FOPEN_STREAM) 198 stream_open(inode, file); 199 else if (ff->open_flags & FOPEN_NONSEEKABLE) 200 nonseekable_open(inode, file); 201 if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { 202 struct fuse_inode *fi = get_fuse_inode(inode); 203 204 spin_lock(&fi->lock); 205 fi->attr_version = atomic64_inc_return(&fc->attr_version); 206 i_size_write(inode, 0); 207 spin_unlock(&fi->lock); 208 fuse_invalidate_attr(inode); 209 if (fc->writeback_cache) 210 file_update_time(file); 211 } 212 if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) 213 fuse_link_write_file(file); 214 } 215 216 int fuse_open_common(struct inode *inode, struct file *file, bool isdir) 217 { 218 struct fuse_conn *fc = get_fuse_conn(inode); 219 int err; 220 bool is_wb_truncate = (file->f_flags & O_TRUNC) && 221 fc->atomic_o_trunc && 222 fc->writeback_cache; 223 224 err = generic_file_open(inode, file); 225 if (err) 226 return err; 227 228 if (is_wb_truncate) { 229 inode_lock(inode); 230 fuse_set_nowrite(inode); 231 } 232 233 err = fuse_do_open(fc, get_node_id(inode), file, isdir); 234 235 if (!err) 236 fuse_finish_open(inode, file); 237 238 if (is_wb_truncate) { 239 fuse_release_nowrite(inode); 240 inode_unlock(inode); 241 } 242 243 return err; 244 } 245 246 static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, 247 int flags, int opcode) 248 { 249 struct fuse_conn *fc = ff->fc; 250 struct fuse_release_args *ra = ff->release_args; 251 252 /* Inode is NULL on error path of fuse_create_open() */ 253 if (likely(fi)) { 254 spin_lock(&fi->lock); 255 list_del(&ff->write_entry); 256 spin_unlock(&fi->lock); 257 } 258 spin_lock(&fc->lock); 259 if (!RB_EMPTY_NODE(&ff->polled_node)) 260 rb_erase(&ff->polled_node, &fc->polled_files); 261 spin_unlock(&fc->lock); 262 263 wake_up_interruptible_all(&ff->poll_wait); 264 265 ra->inarg.fh = ff->fh; 266 ra->inarg.flags = flags; 267 ra->args.in_numargs = 1; 268 ra->args.in_args[0].size = sizeof(struct fuse_release_in); 269 
ra->args.in_args[0].value = &ra->inarg; 270 ra->args.opcode = opcode; 271 ra->args.nodeid = ff->nodeid; 272 ra->args.force = true; 273 ra->args.nocreds = true; 274 } 275 276 void fuse_release_common(struct file *file, bool isdir) 277 { 278 struct fuse_inode *fi = get_fuse_inode(file_inode(file)); 279 struct fuse_file *ff = file->private_data; 280 struct fuse_release_args *ra = ff->release_args; 281 int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; 282 283 fuse_prepare_release(fi, ff, file->f_flags, opcode); 284 285 if (ff->flock) { 286 ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; 287 ra->inarg.lock_owner = fuse_lock_owner_id(ff->fc, 288 (fl_owner_t) file); 289 } 290 /* Hold inode until release is finished */ 291 ra->inode = igrab(file_inode(file)); 292 293 /* 294 * Normally this will send the RELEASE request, however if 295 * some asynchronous READ or WRITE requests are outstanding, 296 * the sending will be delayed. 297 * 298 * Make the release synchronous if this is a fuseblk mount, 299 * synchronous RELEASE is allowed (and desirable) in this case 300 * because the server can be trusted not to screw up. 301 */ 302 fuse_file_put(ff, ff->fc->destroy, isdir); 303 } 304 305 static int fuse_open(struct inode *inode, struct file *file) 306 { 307 return fuse_open_common(inode, file, false); 308 } 309 310 static int fuse_release(struct inode *inode, struct file *file) 311 { 312 struct fuse_conn *fc = get_fuse_conn(inode); 313 314 /* see fuse_vma_close() for !writeback_cache case */ 315 if (fc->writeback_cache) 316 write_inode_now(inode, 1); 317 318 fuse_release_common(file, false); 319 320 /* return value is ignored by VFS */ 321 return 0; 322 } 323 324 void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags) 325 { 326 WARN_ON(refcount_read(&ff->count) > 1); 327 fuse_prepare_release(fi, ff, flags, FUSE_RELEASE); 328 /* 329 * iput(NULL) is a no-op and since the refcount is 1 and everything's 330 * synchronous, we are fine with not doing igrab() here" 331 */ 332 fuse_file_put(ff, true, false); 333 } 334 EXPORT_SYMBOL_GPL(fuse_sync_release); 335 336 /* 337 * Scramble the ID space with XTEA, so that the value of the files_struct 338 * pointer is not exposed to userspace. 
339 */ 340 u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) 341 { 342 u32 *k = fc->scramble_key; 343 u64 v = (unsigned long) id; 344 u32 v0 = v; 345 u32 v1 = v >> 32; 346 u32 sum = 0; 347 int i; 348 349 for (i = 0; i < 32; i++) { 350 v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]); 351 sum += 0x9E3779B9; 352 v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]); 353 } 354 355 return (u64) v0 + ((u64) v1 << 32); 356 } 357 358 struct fuse_writepage_args { 359 struct fuse_io_args ia; 360 struct rb_node writepages_entry; 361 struct list_head queue_entry; 362 struct fuse_writepage_args *next; 363 struct inode *inode; 364 }; 365 366 static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, 367 pgoff_t idx_from, pgoff_t idx_to) 368 { 369 struct rb_node *n; 370 371 n = fi->writepages.rb_node; 372 373 while (n) { 374 struct fuse_writepage_args *wpa; 375 pgoff_t curr_index; 376 377 wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); 378 WARN_ON(get_fuse_inode(wpa->inode) != fi); 379 curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; 380 if (idx_from >= curr_index + wpa->ia.ap.num_pages) 381 n = n->rb_right; 382 else if (idx_to < curr_index) 383 n = n->rb_left; 384 else 385 return wpa; 386 } 387 return NULL; 388 } 389 390 /* 391 * Check if any page in a range is under writeback 392 * 393 * This is currently done by walking the list of writepage requests 394 * for the inode, which can be pretty inefficient. 395 */ 396 static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, 397 pgoff_t idx_to) 398 { 399 struct fuse_inode *fi = get_fuse_inode(inode); 400 bool found; 401 402 spin_lock(&fi->lock); 403 found = fuse_find_writeback(fi, idx_from, idx_to); 404 spin_unlock(&fi->lock); 405 406 return found; 407 } 408 409 static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) 410 { 411 return fuse_range_is_writeback(inode, index, index); 412 } 413 414 /* 415 * Wait for page writeback to be completed. 416 * 417 * Since fuse doesn't rely on the VM writeback tracking, this has to 418 * use some other means. 419 */ 420 static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) 421 { 422 struct fuse_inode *fi = get_fuse_inode(inode); 423 424 wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); 425 } 426 427 /* 428 * Wait for all pending writepages on the inode to finish. 429 * 430 * This is currently done by blocking further writes with FUSE_NOWRITE 431 * and waiting for all sent writes to complete. 432 * 433 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage 434 * could conflict with truncation. 
435 */ 436 static void fuse_sync_writes(struct inode *inode) 437 { 438 fuse_set_nowrite(inode); 439 fuse_release_nowrite(inode); 440 } 441 442 static int fuse_flush(struct file *file, fl_owner_t id) 443 { 444 struct inode *inode = file_inode(file); 445 struct fuse_conn *fc = get_fuse_conn(inode); 446 struct fuse_file *ff = file->private_data; 447 struct fuse_flush_in inarg; 448 FUSE_ARGS(args); 449 int err; 450 451 if (is_bad_inode(inode)) 452 return -EIO; 453 454 err = write_inode_now(inode, 1); 455 if (err) 456 return err; 457 458 inode_lock(inode); 459 fuse_sync_writes(inode); 460 inode_unlock(inode); 461 462 err = filemap_check_errors(file->f_mapping); 463 if (err) 464 return err; 465 466 err = 0; 467 if (fc->no_flush) 468 goto inval_attr_out; 469 470 memset(&inarg, 0, sizeof(inarg)); 471 inarg.fh = ff->fh; 472 inarg.lock_owner = fuse_lock_owner_id(fc, id); 473 args.opcode = FUSE_FLUSH; 474 args.nodeid = get_node_id(inode); 475 args.in_numargs = 1; 476 args.in_args[0].size = sizeof(inarg); 477 args.in_args[0].value = &inarg; 478 args.force = true; 479 480 err = fuse_simple_request(fc, &args); 481 if (err == -ENOSYS) { 482 fc->no_flush = 1; 483 err = 0; 484 } 485 486 inval_attr_out: 487 /* 488 * In memory i_blocks is not maintained by fuse, if writeback cache is 489 * enabled, i_blocks from cached attr may not be accurate. 490 */ 491 if (!err && fc->writeback_cache) 492 fuse_invalidate_attr(inode); 493 return err; 494 } 495 496 int fuse_fsync_common(struct file *file, loff_t start, loff_t end, 497 int datasync, int opcode) 498 { 499 struct inode *inode = file->f_mapping->host; 500 struct fuse_conn *fc = get_fuse_conn(inode); 501 struct fuse_file *ff = file->private_data; 502 FUSE_ARGS(args); 503 struct fuse_fsync_in inarg; 504 505 memset(&inarg, 0, sizeof(inarg)); 506 inarg.fh = ff->fh; 507 inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0; 508 args.opcode = opcode; 509 args.nodeid = get_node_id(inode); 510 args.in_numargs = 1; 511 args.in_args[0].size = sizeof(inarg); 512 args.in_args[0].value = &inarg; 513 return fuse_simple_request(fc, &args); 514 } 515 516 static int fuse_fsync(struct file *file, loff_t start, loff_t end, 517 int datasync) 518 { 519 struct inode *inode = file->f_mapping->host; 520 struct fuse_conn *fc = get_fuse_conn(inode); 521 int err; 522 523 if (is_bad_inode(inode)) 524 return -EIO; 525 526 inode_lock(inode); 527 528 /* 529 * Start writeback against all dirty pages of the inode, then 530 * wait for all outstanding writes, before sending the FSYNC 531 * request. 532 */ 533 err = file_write_and_wait_range(file, start, end); 534 if (err) 535 goto out; 536 537 fuse_sync_writes(inode); 538 539 /* 540 * Due to implementation of fuse writeback 541 * file_write_and_wait_range() does not catch errors. 
542 * We have to do this directly after fuse_sync_writes() 543 */ 544 err = file_check_and_advance_wb_err(file); 545 if (err) 546 goto out; 547 548 err = sync_inode_metadata(inode, 1); 549 if (err) 550 goto out; 551 552 if (fc->no_fsync) 553 goto out; 554 555 err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNC); 556 if (err == -ENOSYS) { 557 fc->no_fsync = 1; 558 err = 0; 559 } 560 out: 561 inode_unlock(inode); 562 563 return err; 564 } 565 566 void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, 567 size_t count, int opcode) 568 { 569 struct fuse_file *ff = file->private_data; 570 struct fuse_args *args = &ia->ap.args; 571 572 ia->read.in.fh = ff->fh; 573 ia->read.in.offset = pos; 574 ia->read.in.size = count; 575 ia->read.in.flags = file->f_flags; 576 args->opcode = opcode; 577 args->nodeid = ff->nodeid; 578 args->in_numargs = 1; 579 args->in_args[0].size = sizeof(ia->read.in); 580 args->in_args[0].value = &ia->read.in; 581 args->out_argvar = true; 582 args->out_numargs = 1; 583 args->out_args[0].size = count; 584 } 585 586 static void fuse_release_user_pages(struct fuse_args_pages *ap, 587 bool should_dirty) 588 { 589 unsigned int i; 590 591 for (i = 0; i < ap->num_pages; i++) { 592 if (should_dirty) 593 set_page_dirty_lock(ap->pages[i]); 594 put_page(ap->pages[i]); 595 } 596 } 597 598 static void fuse_io_release(struct kref *kref) 599 { 600 kfree(container_of(kref, struct fuse_io_priv, refcnt)); 601 } 602 603 static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) 604 { 605 if (io->err) 606 return io->err; 607 608 if (io->bytes >= 0 && io->write) 609 return -EIO; 610 611 return io->bytes < 0 ? io->size : io->bytes; 612 } 613 614 /** 615 * In case of short read, the caller sets 'pos' to the position of 616 * actual end of fuse request in IO request. Otherwise, if bytes_requested 617 * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1. 618 * 619 * An example: 620 * User requested DIO read of 64K. It was splitted into two 32K fuse requests, 621 * both submitted asynchronously. The first of them was ACKed by userspace as 622 * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The 623 * second request was ACKed as short, e.g. only 1K was read, resulting in 624 * pos == 33K. 625 * 626 * Thus, when all fuse requests are completed, the minimal non-negative 'pos' 627 * will be equal to the length of the longest contiguous fragment of 628 * transferred data starting from the beginning of IO request. 629 */ 630 static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) 631 { 632 int left; 633 634 spin_lock(&io->lock); 635 if (err) 636 io->err = io->err ? 
: err; 637 else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes)) 638 io->bytes = pos; 639 640 left = --io->reqs; 641 if (!left && io->blocking) 642 complete(io->done); 643 spin_unlock(&io->lock); 644 645 if (!left && !io->blocking) { 646 ssize_t res = fuse_get_res_by_io(io); 647 648 if (res >= 0) { 649 struct inode *inode = file_inode(io->iocb->ki_filp); 650 struct fuse_conn *fc = get_fuse_conn(inode); 651 struct fuse_inode *fi = get_fuse_inode(inode); 652 653 spin_lock(&fi->lock); 654 fi->attr_version = atomic64_inc_return(&fc->attr_version); 655 spin_unlock(&fi->lock); 656 } 657 658 io->iocb->ki_complete(io->iocb, res, 0); 659 } 660 661 kref_put(&io->refcnt, fuse_io_release); 662 } 663 664 static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io, 665 unsigned int npages) 666 { 667 struct fuse_io_args *ia; 668 669 ia = kzalloc(sizeof(*ia), GFP_KERNEL); 670 if (ia) { 671 ia->io = io; 672 ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL, 673 &ia->ap.descs); 674 if (!ia->ap.pages) { 675 kfree(ia); 676 ia = NULL; 677 } 678 } 679 return ia; 680 } 681 682 static void fuse_io_free(struct fuse_io_args *ia) 683 { 684 kfree(ia->ap.pages); 685 kfree(ia); 686 } 687 688 static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args, 689 int err) 690 { 691 struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); 692 struct fuse_io_priv *io = ia->io; 693 ssize_t pos = -1; 694 695 fuse_release_user_pages(&ia->ap, io->should_dirty); 696 697 if (err) { 698 /* Nothing */ 699 } else if (io->write) { 700 if (ia->write.out.size > ia->write.in.size) { 701 err = -EIO; 702 } else if (ia->write.in.size != ia->write.out.size) { 703 pos = ia->write.in.offset - io->offset + 704 ia->write.out.size; 705 } 706 } else { 707 u32 outsize = args->out_args[0].size; 708 709 if (ia->read.in.size != outsize) 710 pos = ia->read.in.offset - io->offset + outsize; 711 } 712 713 fuse_aio_complete(io, err, pos); 714 fuse_io_free(ia); 715 } 716 717 static ssize_t fuse_async_req_send(struct fuse_conn *fc, 718 struct fuse_io_args *ia, size_t num_bytes) 719 { 720 ssize_t err; 721 struct fuse_io_priv *io = ia->io; 722 723 spin_lock(&io->lock); 724 kref_get(&io->refcnt); 725 io->size += num_bytes; 726 io->reqs++; 727 spin_unlock(&io->lock); 728 729 ia->ap.args.end = fuse_aio_complete_req; 730 ia->ap.args.may_block = io->should_dirty; 731 err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL); 732 if (err) 733 fuse_aio_complete_req(fc, &ia->ap.args, err); 734 735 return num_bytes; 736 } 737 738 static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count, 739 fl_owner_t owner) 740 { 741 struct file *file = ia->io->iocb->ki_filp; 742 struct fuse_file *ff = file->private_data; 743 struct fuse_conn *fc = ff->fc; 744 745 fuse_read_args_fill(ia, file, pos, count, FUSE_READ); 746 if (owner != NULL) { 747 ia->read.in.read_flags |= FUSE_READ_LOCKOWNER; 748 ia->read.in.lock_owner = fuse_lock_owner_id(fc, owner); 749 } 750 751 if (ia->io->async) 752 return fuse_async_req_send(fc, ia, count); 753 754 return fuse_simple_request(fc, &ia->ap.args); 755 } 756 757 static void fuse_read_update_size(struct inode *inode, loff_t size, 758 u64 attr_ver) 759 { 760 struct fuse_conn *fc = get_fuse_conn(inode); 761 struct fuse_inode *fi = get_fuse_inode(inode); 762 763 spin_lock(&fi->lock); 764 if (attr_ver == fi->attr_version && size < inode->i_size && 765 !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { 766 fi->attr_version = atomic64_inc_return(&fc->attr_version); 767 i_size_write(inode, size); 768 } 
769 spin_unlock(&fi->lock); 770 } 771 772 static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, 773 struct fuse_args_pages *ap) 774 { 775 struct fuse_conn *fc = get_fuse_conn(inode); 776 777 if (fc->writeback_cache) { 778 /* 779 * A hole in a file. Some data after the hole are in page cache, 780 * but have not reached the client fs yet. So, the hole is not 781 * present there. 782 */ 783 int i; 784 int start_idx = num_read >> PAGE_SHIFT; 785 size_t off = num_read & (PAGE_SIZE - 1); 786 787 for (i = start_idx; i < ap->num_pages; i++) { 788 zero_user_segment(ap->pages[i], off, PAGE_SIZE); 789 off = 0; 790 } 791 } else { 792 loff_t pos = page_offset(ap->pages[0]) + num_read; 793 fuse_read_update_size(inode, pos, attr_ver); 794 } 795 } 796 797 static int fuse_do_readpage(struct file *file, struct page *page) 798 { 799 struct inode *inode = page->mapping->host; 800 struct fuse_conn *fc = get_fuse_conn(inode); 801 loff_t pos = page_offset(page); 802 struct fuse_page_desc desc = { .length = PAGE_SIZE }; 803 struct fuse_io_args ia = { 804 .ap.args.page_zeroing = true, 805 .ap.args.out_pages = true, 806 .ap.num_pages = 1, 807 .ap.pages = &page, 808 .ap.descs = &desc, 809 }; 810 ssize_t res; 811 u64 attr_ver; 812 813 /* 814 * Page writeback can extend beyond the lifetime of the 815 * page-cache page, so make sure we read a properly synced 816 * page. 817 */ 818 fuse_wait_on_page_writeback(inode, page->index); 819 820 attr_ver = fuse_get_attr_version(fc); 821 822 /* Don't overflow end offset */ 823 if (pos + (desc.length - 1) == LLONG_MAX) 824 desc.length--; 825 826 fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ); 827 res = fuse_simple_request(fc, &ia.ap.args); 828 if (res < 0) 829 return res; 830 /* 831 * Short read means EOF. If file size is larger, truncate it 832 */ 833 if (res < desc.length) 834 fuse_short_read(inode, attr_ver, res, &ia.ap); 835 836 SetPageUptodate(page); 837 838 return 0; 839 } 840 841 static int fuse_readpage(struct file *file, struct page *page) 842 { 843 struct inode *inode = page->mapping->host; 844 int err; 845 846 err = -EIO; 847 if (is_bad_inode(inode)) 848 goto out; 849 850 err = fuse_do_readpage(file, page); 851 fuse_invalidate_atime(inode); 852 out: 853 unlock_page(page); 854 return err; 855 } 856 857 static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args, 858 int err) 859 { 860 int i; 861 struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); 862 struct fuse_args_pages *ap = &ia->ap; 863 size_t count = ia->read.in.size; 864 size_t num_read = args->out_args[0].size; 865 struct address_space *mapping = NULL; 866 867 for (i = 0; mapping == NULL && i < ap->num_pages; i++) 868 mapping = ap->pages[i]->mapping; 869 870 if (mapping) { 871 struct inode *inode = mapping->host; 872 873 /* 874 * Short read means EOF. 
If file size is larger, truncate it 875 */ 876 if (!err && num_read < count) 877 fuse_short_read(inode, ia->read.attr_ver, num_read, ap); 878 879 fuse_invalidate_atime(inode); 880 } 881 882 for (i = 0; i < ap->num_pages; i++) { 883 struct page *page = ap->pages[i]; 884 885 if (!err) 886 SetPageUptodate(page); 887 else 888 SetPageError(page); 889 unlock_page(page); 890 put_page(page); 891 } 892 if (ia->ff) 893 fuse_file_put(ia->ff, false, false); 894 895 fuse_io_free(ia); 896 } 897 898 static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) 899 { 900 struct fuse_file *ff = file->private_data; 901 struct fuse_conn *fc = ff->fc; 902 struct fuse_args_pages *ap = &ia->ap; 903 loff_t pos = page_offset(ap->pages[0]); 904 size_t count = ap->num_pages << PAGE_SHIFT; 905 ssize_t res; 906 int err; 907 908 ap->args.out_pages = true; 909 ap->args.page_zeroing = true; 910 ap->args.page_replace = true; 911 912 /* Don't overflow end offset */ 913 if (pos + (count - 1) == LLONG_MAX) { 914 count--; 915 ap->descs[ap->num_pages - 1].length--; 916 } 917 WARN_ON((loff_t) (pos + count) < 0); 918 919 fuse_read_args_fill(ia, file, pos, count, FUSE_READ); 920 ia->read.attr_ver = fuse_get_attr_version(fc); 921 if (fc->async_read) { 922 ia->ff = fuse_file_get(ff); 923 ap->args.end = fuse_readpages_end; 924 err = fuse_simple_background(fc, &ap->args, GFP_KERNEL); 925 if (!err) 926 return; 927 } else { 928 res = fuse_simple_request(fc, &ap->args); 929 err = res < 0 ? res : 0; 930 } 931 fuse_readpages_end(fc, &ap->args, err); 932 } 933 934 static void fuse_readahead(struct readahead_control *rac) 935 { 936 struct inode *inode = rac->mapping->host; 937 struct fuse_conn *fc = get_fuse_conn(inode); 938 unsigned int i, max_pages, nr_pages = 0; 939 940 if (is_bad_inode(inode)) 941 return; 942 943 max_pages = min_t(unsigned int, fc->max_pages, 944 fc->max_read / PAGE_SIZE); 945 946 for (;;) { 947 struct fuse_io_args *ia; 948 struct fuse_args_pages *ap; 949 950 nr_pages = readahead_count(rac) - nr_pages; 951 if (nr_pages > max_pages) 952 nr_pages = max_pages; 953 if (nr_pages == 0) 954 break; 955 ia = fuse_io_alloc(NULL, nr_pages); 956 if (!ia) 957 return; 958 ap = &ia->ap; 959 nr_pages = __readahead_batch(rac, ap->pages, nr_pages); 960 for (i = 0; i < nr_pages; i++) { 961 fuse_wait_on_page_writeback(inode, 962 readahead_index(rac) + i); 963 ap->descs[i].length = PAGE_SIZE; 964 } 965 ap->num_pages = nr_pages; 966 fuse_send_readpages(ia, rac->file); 967 } 968 } 969 970 static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) 971 { 972 struct inode *inode = iocb->ki_filp->f_mapping->host; 973 struct fuse_conn *fc = get_fuse_conn(inode); 974 975 /* 976 * In auto invalidate mode, always update attributes on read. 977 * Otherwise, only update if we attempt to read past EOF (to ensure 978 * i_size is up to date). 
979 */ 980 if (fc->auto_inval_data || 981 (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) { 982 int err; 983 err = fuse_update_attributes(inode, iocb->ki_filp); 984 if (err) 985 return err; 986 } 987 988 return generic_file_read_iter(iocb, to); 989 } 990 991 static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff, 992 loff_t pos, size_t count) 993 { 994 struct fuse_args *args = &ia->ap.args; 995 996 ia->write.in.fh = ff->fh; 997 ia->write.in.offset = pos; 998 ia->write.in.size = count; 999 args->opcode = FUSE_WRITE; 1000 args->nodeid = ff->nodeid; 1001 args->in_numargs = 2; 1002 if (ff->fc->minor < 9) 1003 args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; 1004 else 1005 args->in_args[0].size = sizeof(ia->write.in); 1006 args->in_args[0].value = &ia->write.in; 1007 args->in_args[1].size = count; 1008 args->out_numargs = 1; 1009 args->out_args[0].size = sizeof(ia->write.out); 1010 args->out_args[0].value = &ia->write.out; 1011 } 1012 1013 static unsigned int fuse_write_flags(struct kiocb *iocb) 1014 { 1015 unsigned int flags = iocb->ki_filp->f_flags; 1016 1017 if (iocb->ki_flags & IOCB_DSYNC) 1018 flags |= O_DSYNC; 1019 if (iocb->ki_flags & IOCB_SYNC) 1020 flags |= O_SYNC; 1021 1022 return flags; 1023 } 1024 1025 static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, 1026 size_t count, fl_owner_t owner) 1027 { 1028 struct kiocb *iocb = ia->io->iocb; 1029 struct file *file = iocb->ki_filp; 1030 struct fuse_file *ff = file->private_data; 1031 struct fuse_conn *fc = ff->fc; 1032 struct fuse_write_in *inarg = &ia->write.in; 1033 ssize_t err; 1034 1035 fuse_write_args_fill(ia, ff, pos, count); 1036 inarg->flags = fuse_write_flags(iocb); 1037 if (owner != NULL) { 1038 inarg->write_flags |= FUSE_WRITE_LOCKOWNER; 1039 inarg->lock_owner = fuse_lock_owner_id(fc, owner); 1040 } 1041 1042 if (ia->io->async) 1043 return fuse_async_req_send(fc, ia, count); 1044 1045 err = fuse_simple_request(fc, &ia->ap.args); 1046 if (!err && ia->write.out.size > count) 1047 err = -EIO; 1048 1049 return err ?: ia->write.out.size; 1050 } 1051 1052 bool fuse_write_update_size(struct inode *inode, loff_t pos) 1053 { 1054 struct fuse_conn *fc = get_fuse_conn(inode); 1055 struct fuse_inode *fi = get_fuse_inode(inode); 1056 bool ret = false; 1057 1058 spin_lock(&fi->lock); 1059 fi->attr_version = atomic64_inc_return(&fc->attr_version); 1060 if (pos > inode->i_size) { 1061 i_size_write(inode, pos); 1062 ret = true; 1063 } 1064 spin_unlock(&fi->lock); 1065 1066 return ret; 1067 } 1068 1069 static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, 1070 struct kiocb *iocb, struct inode *inode, 1071 loff_t pos, size_t count) 1072 { 1073 struct fuse_args_pages *ap = &ia->ap; 1074 struct file *file = iocb->ki_filp; 1075 struct fuse_file *ff = file->private_data; 1076 struct fuse_conn *fc = ff->fc; 1077 unsigned int offset, i; 1078 int err; 1079 1080 for (i = 0; i < ap->num_pages; i++) 1081 fuse_wait_on_page_writeback(inode, ap->pages[i]->index); 1082 1083 fuse_write_args_fill(ia, ff, pos, count); 1084 ia->write.in.flags = fuse_write_flags(iocb); 1085 1086 err = fuse_simple_request(fc, &ap->args); 1087 if (!err && ia->write.out.size > count) 1088 err = -EIO; 1089 1090 offset = ap->descs[0].offset; 1091 count = ia->write.out.size; 1092 for (i = 0; i < ap->num_pages; i++) { 1093 struct page *page = ap->pages[i]; 1094 1095 if (!err && !offset && count >= PAGE_SIZE) 1096 SetPageUptodate(page); 1097 1098 if (count > PAGE_SIZE - offset) 1099 count -= PAGE_SIZE - offset; 1100 else 1101 count 
= 0; 1102 offset = 0; 1103 1104 unlock_page(page); 1105 put_page(page); 1106 } 1107 1108 return err; 1109 } 1110 1111 static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap, 1112 struct address_space *mapping, 1113 struct iov_iter *ii, loff_t pos, 1114 unsigned int max_pages) 1115 { 1116 struct fuse_conn *fc = get_fuse_conn(mapping->host); 1117 unsigned offset = pos & (PAGE_SIZE - 1); 1118 size_t count = 0; 1119 int err; 1120 1121 ap->args.in_pages = true; 1122 ap->descs[0].offset = offset; 1123 1124 do { 1125 size_t tmp; 1126 struct page *page; 1127 pgoff_t index = pos >> PAGE_SHIFT; 1128 size_t bytes = min_t(size_t, PAGE_SIZE - offset, 1129 iov_iter_count(ii)); 1130 1131 bytes = min_t(size_t, bytes, fc->max_write - count); 1132 1133 again: 1134 err = -EFAULT; 1135 if (iov_iter_fault_in_readable(ii, bytes)) 1136 break; 1137 1138 err = -ENOMEM; 1139 page = grab_cache_page_write_begin(mapping, index, 0); 1140 if (!page) 1141 break; 1142 1143 if (mapping_writably_mapped(mapping)) 1144 flush_dcache_page(page); 1145 1146 tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); 1147 flush_dcache_page(page); 1148 1149 iov_iter_advance(ii, tmp); 1150 if (!tmp) { 1151 unlock_page(page); 1152 put_page(page); 1153 bytes = min(bytes, iov_iter_single_seg_count(ii)); 1154 goto again; 1155 } 1156 1157 err = 0; 1158 ap->pages[ap->num_pages] = page; 1159 ap->descs[ap->num_pages].length = tmp; 1160 ap->num_pages++; 1161 1162 count += tmp; 1163 pos += tmp; 1164 offset += tmp; 1165 if (offset == PAGE_SIZE) 1166 offset = 0; 1167 1168 if (!fc->big_writes) 1169 break; 1170 } while (iov_iter_count(ii) && count < fc->max_write && 1171 ap->num_pages < max_pages && offset == 0); 1172 1173 return count > 0 ? count : err; 1174 } 1175 1176 static inline unsigned int fuse_wr_pages(loff_t pos, size_t len, 1177 unsigned int max_pages) 1178 { 1179 return min_t(unsigned int, 1180 ((pos + len - 1) >> PAGE_SHIFT) - 1181 (pos >> PAGE_SHIFT) + 1, 1182 max_pages); 1183 } 1184 1185 static ssize_t fuse_perform_write(struct kiocb *iocb, 1186 struct address_space *mapping, 1187 struct iov_iter *ii, loff_t pos) 1188 { 1189 struct inode *inode = mapping->host; 1190 struct fuse_conn *fc = get_fuse_conn(inode); 1191 struct fuse_inode *fi = get_fuse_inode(inode); 1192 int err = 0; 1193 ssize_t res = 0; 1194 1195 if (inode->i_size < pos + iov_iter_count(ii)) 1196 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); 1197 1198 do { 1199 ssize_t count; 1200 struct fuse_io_args ia = {}; 1201 struct fuse_args_pages *ap = &ia.ap; 1202 unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii), 1203 fc->max_pages); 1204 1205 ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs); 1206 if (!ap->pages) { 1207 err = -ENOMEM; 1208 break; 1209 } 1210 1211 count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages); 1212 if (count <= 0) { 1213 err = count; 1214 } else { 1215 err = fuse_send_write_pages(&ia, iocb, inode, 1216 pos, count); 1217 if (!err) { 1218 size_t num_written = ia.write.out.size; 1219 1220 res += num_written; 1221 pos += num_written; 1222 1223 /* break out of the loop on short write */ 1224 if (num_written != count) 1225 err = -EIO; 1226 } 1227 } 1228 kfree(ap->pages); 1229 } while (!err && iov_iter_count(ii)); 1230 1231 if (res > 0) 1232 fuse_write_update_size(inode, pos); 1233 1234 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); 1235 fuse_invalidate_attr(inode); 1236 1237 return res > 0 ? 
res : err; 1238 } 1239 1240 static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) 1241 { 1242 struct file *file = iocb->ki_filp; 1243 struct address_space *mapping = file->f_mapping; 1244 ssize_t written = 0; 1245 ssize_t written_buffered = 0; 1246 struct inode *inode = mapping->host; 1247 ssize_t err; 1248 loff_t endbyte = 0; 1249 1250 if (get_fuse_conn(inode)->writeback_cache) { 1251 /* Update size (EOF optimization) and mode (SUID clearing) */ 1252 err = fuse_update_attributes(mapping->host, file); 1253 if (err) 1254 return err; 1255 1256 return generic_file_write_iter(iocb, from); 1257 } 1258 1259 inode_lock(inode); 1260 1261 /* We can write back this queue in page reclaim */ 1262 current->backing_dev_info = inode_to_bdi(inode); 1263 1264 err = generic_write_checks(iocb, from); 1265 if (err <= 0) 1266 goto out; 1267 1268 err = file_remove_privs(file); 1269 if (err) 1270 goto out; 1271 1272 err = file_update_time(file); 1273 if (err) 1274 goto out; 1275 1276 if (iocb->ki_flags & IOCB_DIRECT) { 1277 loff_t pos = iocb->ki_pos; 1278 written = generic_file_direct_write(iocb, from); 1279 if (written < 0 || !iov_iter_count(from)) 1280 goto out; 1281 1282 pos += written; 1283 1284 written_buffered = fuse_perform_write(iocb, mapping, from, pos); 1285 if (written_buffered < 0) { 1286 err = written_buffered; 1287 goto out; 1288 } 1289 endbyte = pos + written_buffered - 1; 1290 1291 err = filemap_write_and_wait_range(file->f_mapping, pos, 1292 endbyte); 1293 if (err) 1294 goto out; 1295 1296 invalidate_mapping_pages(file->f_mapping, 1297 pos >> PAGE_SHIFT, 1298 endbyte >> PAGE_SHIFT); 1299 1300 written += written_buffered; 1301 iocb->ki_pos = pos + written_buffered; 1302 } else { 1303 written = fuse_perform_write(iocb, mapping, from, iocb->ki_pos); 1304 if (written >= 0) 1305 iocb->ki_pos += written; 1306 } 1307 out: 1308 current->backing_dev_info = NULL; 1309 inode_unlock(inode); 1310 if (written > 0) 1311 written = generic_write_sync(iocb, written); 1312 1313 return written ? 
written : err; 1314 } 1315 1316 static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs, 1317 unsigned int index, 1318 unsigned int nr_pages) 1319 { 1320 int i; 1321 1322 for (i = index; i < index + nr_pages; i++) 1323 descs[i].length = PAGE_SIZE - descs[i].offset; 1324 } 1325 1326 static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) 1327 { 1328 return (unsigned long)ii->iov->iov_base + ii->iov_offset; 1329 } 1330 1331 static inline size_t fuse_get_frag_size(const struct iov_iter *ii, 1332 size_t max_size) 1333 { 1334 return min(iov_iter_single_seg_count(ii), max_size); 1335 } 1336 1337 static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, 1338 size_t *nbytesp, int write, 1339 unsigned int max_pages) 1340 { 1341 size_t nbytes = 0; /* # bytes already packed in req */ 1342 ssize_t ret = 0; 1343 1344 /* Special case for kernel I/O: can copy directly into the buffer */ 1345 if (iov_iter_is_kvec(ii)) { 1346 unsigned long user_addr = fuse_get_user_addr(ii); 1347 size_t frag_size = fuse_get_frag_size(ii, *nbytesp); 1348 1349 if (write) 1350 ap->args.in_args[1].value = (void *) user_addr; 1351 else 1352 ap->args.out_args[0].value = (void *) user_addr; 1353 1354 iov_iter_advance(ii, frag_size); 1355 *nbytesp = frag_size; 1356 return 0; 1357 } 1358 1359 while (nbytes < *nbytesp && ap->num_pages < max_pages) { 1360 unsigned npages; 1361 size_t start; 1362 ret = iov_iter_get_pages(ii, &ap->pages[ap->num_pages], 1363 *nbytesp - nbytes, 1364 max_pages - ap->num_pages, 1365 &start); 1366 if (ret < 0) 1367 break; 1368 1369 iov_iter_advance(ii, ret); 1370 nbytes += ret; 1371 1372 ret += start; 1373 npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE; 1374 1375 ap->descs[ap->num_pages].offset = start; 1376 fuse_page_descs_length_init(ap->descs, ap->num_pages, npages); 1377 1378 ap->num_pages += npages; 1379 ap->descs[ap->num_pages - 1].length -= 1380 (PAGE_SIZE - ret) & (PAGE_SIZE - 1); 1381 } 1382 1383 if (write) 1384 ap->args.in_pages = true; 1385 else 1386 ap->args.out_pages = true; 1387 1388 *nbytesp = nbytes; 1389 1390 return ret < 0 ? ret : 0; 1391 } 1392 1393 ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, 1394 loff_t *ppos, int flags) 1395 { 1396 int write = flags & FUSE_DIO_WRITE; 1397 int cuse = flags & FUSE_DIO_CUSE; 1398 struct file *file = io->iocb->ki_filp; 1399 struct inode *inode = file->f_mapping->host; 1400 struct fuse_file *ff = file->private_data; 1401 struct fuse_conn *fc = ff->fc; 1402 size_t nmax = write ? 
fc->max_write : fc->max_read; 1403 loff_t pos = *ppos; 1404 size_t count = iov_iter_count(iter); 1405 pgoff_t idx_from = pos >> PAGE_SHIFT; 1406 pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT; 1407 ssize_t res = 0; 1408 int err = 0; 1409 struct fuse_io_args *ia; 1410 unsigned int max_pages; 1411 1412 max_pages = iov_iter_npages(iter, fc->max_pages); 1413 ia = fuse_io_alloc(io, max_pages); 1414 if (!ia) 1415 return -ENOMEM; 1416 1417 ia->io = io; 1418 if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { 1419 if (!write) 1420 inode_lock(inode); 1421 fuse_sync_writes(inode); 1422 if (!write) 1423 inode_unlock(inode); 1424 } 1425 1426 io->should_dirty = !write && iter_is_iovec(iter); 1427 while (count) { 1428 ssize_t nres; 1429 fl_owner_t owner = current->files; 1430 size_t nbytes = min(count, nmax); 1431 1432 err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write, 1433 max_pages); 1434 if (err && !nbytes) 1435 break; 1436 1437 if (write) { 1438 if (!capable(CAP_FSETID)) 1439 ia->write.in.write_flags |= FUSE_WRITE_KILL_PRIV; 1440 1441 nres = fuse_send_write(ia, pos, nbytes, owner); 1442 } else { 1443 nres = fuse_send_read(ia, pos, nbytes, owner); 1444 } 1445 1446 if (!io->async || nres < 0) { 1447 fuse_release_user_pages(&ia->ap, io->should_dirty); 1448 fuse_io_free(ia); 1449 } 1450 ia = NULL; 1451 if (nres < 0) { 1452 iov_iter_revert(iter, nbytes); 1453 err = nres; 1454 break; 1455 } 1456 WARN_ON(nres > nbytes); 1457 1458 count -= nres; 1459 res += nres; 1460 pos += nres; 1461 if (nres != nbytes) { 1462 iov_iter_revert(iter, nbytes - nres); 1463 break; 1464 } 1465 if (count) { 1466 max_pages = iov_iter_npages(iter, fc->max_pages); 1467 ia = fuse_io_alloc(io, max_pages); 1468 if (!ia) 1469 break; 1470 } 1471 } 1472 if (ia) 1473 fuse_io_free(ia); 1474 if (res > 0) 1475 *ppos = pos; 1476 1477 return res > 0 ? 
res : err; 1478 } 1479 EXPORT_SYMBOL_GPL(fuse_direct_io); 1480 1481 static ssize_t __fuse_direct_read(struct fuse_io_priv *io, 1482 struct iov_iter *iter, 1483 loff_t *ppos) 1484 { 1485 ssize_t res; 1486 struct inode *inode = file_inode(io->iocb->ki_filp); 1487 1488 res = fuse_direct_io(io, iter, ppos, 0); 1489 1490 fuse_invalidate_atime(inode); 1491 1492 return res; 1493 } 1494 1495 static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter); 1496 1497 static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) 1498 { 1499 ssize_t res; 1500 1501 if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { 1502 res = fuse_direct_IO(iocb, to); 1503 } else { 1504 struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); 1505 1506 res = __fuse_direct_read(&io, to, &iocb->ki_pos); 1507 } 1508 1509 return res; 1510 } 1511 1512 static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) 1513 { 1514 struct inode *inode = file_inode(iocb->ki_filp); 1515 struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); 1516 ssize_t res; 1517 1518 /* Don't allow parallel writes to the same file */ 1519 inode_lock(inode); 1520 res = generic_write_checks(iocb, from); 1521 if (res > 0) { 1522 if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { 1523 res = fuse_direct_IO(iocb, from); 1524 } else { 1525 res = fuse_direct_io(&io, from, &iocb->ki_pos, 1526 FUSE_DIO_WRITE); 1527 } 1528 } 1529 fuse_invalidate_attr(inode); 1530 if (res > 0) 1531 fuse_write_update_size(inode, iocb->ki_pos); 1532 inode_unlock(inode); 1533 1534 return res; 1535 } 1536 1537 static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 1538 { 1539 struct file *file = iocb->ki_filp; 1540 struct fuse_file *ff = file->private_data; 1541 1542 if (is_bad_inode(file_inode(file))) 1543 return -EIO; 1544 1545 if (!(ff->open_flags & FOPEN_DIRECT_IO)) 1546 return fuse_cache_read_iter(iocb, to); 1547 else 1548 return fuse_direct_read_iter(iocb, to); 1549 } 1550 1551 static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 1552 { 1553 struct file *file = iocb->ki_filp; 1554 struct fuse_file *ff = file->private_data; 1555 1556 if (is_bad_inode(file_inode(file))) 1557 return -EIO; 1558 1559 if (!(ff->open_flags & FOPEN_DIRECT_IO)) 1560 return fuse_cache_write_iter(iocb, from); 1561 else 1562 return fuse_direct_write_iter(iocb, from); 1563 } 1564 1565 static void fuse_writepage_free(struct fuse_writepage_args *wpa) 1566 { 1567 struct fuse_args_pages *ap = &wpa->ia.ap; 1568 int i; 1569 1570 for (i = 0; i < ap->num_pages; i++) 1571 __free_page(ap->pages[i]); 1572 1573 if (wpa->ia.ff) 1574 fuse_file_put(wpa->ia.ff, false, false); 1575 1576 kfree(ap->pages); 1577 kfree(wpa); 1578 } 1579 1580 static void fuse_writepage_finish(struct fuse_conn *fc, 1581 struct fuse_writepage_args *wpa) 1582 { 1583 struct fuse_args_pages *ap = &wpa->ia.ap; 1584 struct inode *inode = wpa->inode; 1585 struct fuse_inode *fi = get_fuse_inode(inode); 1586 struct backing_dev_info *bdi = inode_to_bdi(inode); 1587 int i; 1588 1589 rb_erase(&wpa->writepages_entry, &fi->writepages); 1590 for (i = 0; i < ap->num_pages; i++) { 1591 dec_wb_stat(&bdi->wb, WB_WRITEBACK); 1592 dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP); 1593 wb_writeout_inc(&bdi->wb); 1594 } 1595 wake_up(&fi->page_waitq); 1596 } 1597 1598 /* Called under fi->lock, may release and reacquire it */ 1599 static void fuse_send_writepage(struct fuse_conn *fc, 1600 struct fuse_writepage_args *wpa, loff_t size) 1601 __releases(fi->lock) 1602 
__acquires(fi->lock) 1603 { 1604 struct fuse_writepage_args *aux, *next; 1605 struct fuse_inode *fi = get_fuse_inode(wpa->inode); 1606 struct fuse_write_in *inarg = &wpa->ia.write.in; 1607 struct fuse_args *args = &wpa->ia.ap.args; 1608 __u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE; 1609 int err; 1610 1611 fi->writectr++; 1612 if (inarg->offset + data_size <= size) { 1613 inarg->size = data_size; 1614 } else if (inarg->offset < size) { 1615 inarg->size = size - inarg->offset; 1616 } else { 1617 /* Got truncated off completely */ 1618 goto out_free; 1619 } 1620 1621 args->in_args[1].size = inarg->size; 1622 args->force = true; 1623 args->nocreds = true; 1624 1625 err = fuse_simple_background(fc, args, GFP_ATOMIC); 1626 if (err == -ENOMEM) { 1627 spin_unlock(&fi->lock); 1628 err = fuse_simple_background(fc, args, GFP_NOFS | __GFP_NOFAIL); 1629 spin_lock(&fi->lock); 1630 } 1631 1632 /* Fails on broken connection only */ 1633 if (unlikely(err)) 1634 goto out_free; 1635 1636 return; 1637 1638 out_free: 1639 fi->writectr--; 1640 fuse_writepage_finish(fc, wpa); 1641 spin_unlock(&fi->lock); 1642 1643 /* After fuse_writepage_finish() aux request list is private */ 1644 for (aux = wpa->next; aux; aux = next) { 1645 next = aux->next; 1646 aux->next = NULL; 1647 fuse_writepage_free(aux); 1648 } 1649 1650 fuse_writepage_free(wpa); 1651 spin_lock(&fi->lock); 1652 } 1653 1654 /* 1655 * If fi->writectr is positive (no truncate or fsync going on) send 1656 * all queued writepage requests. 1657 * 1658 * Called with fi->lock 1659 */ 1660 void fuse_flush_writepages(struct inode *inode) 1661 __releases(fi->lock) 1662 __acquires(fi->lock) 1663 { 1664 struct fuse_conn *fc = get_fuse_conn(inode); 1665 struct fuse_inode *fi = get_fuse_inode(inode); 1666 loff_t crop = i_size_read(inode); 1667 struct fuse_writepage_args *wpa; 1668 1669 while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { 1670 wpa = list_entry(fi->queued_writes.next, 1671 struct fuse_writepage_args, queue_entry); 1672 list_del_init(&wpa->queue_entry); 1673 fuse_send_writepage(fc, wpa, crop); 1674 } 1675 } 1676 1677 static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) 1678 { 1679 pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; 1680 pgoff_t idx_to = idx_from + wpa->ia.ap.num_pages - 1; 1681 struct rb_node **p = &root->rb_node; 1682 struct rb_node *parent = NULL; 1683 1684 WARN_ON(!wpa->ia.ap.num_pages); 1685 while (*p) { 1686 struct fuse_writepage_args *curr; 1687 pgoff_t curr_index; 1688 1689 parent = *p; 1690 curr = rb_entry(parent, struct fuse_writepage_args, 1691 writepages_entry); 1692 WARN_ON(curr->inode != wpa->inode); 1693 curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; 1694 1695 if (idx_from >= curr_index + curr->ia.ap.num_pages) 1696 p = &(*p)->rb_right; 1697 else if (idx_to < curr_index) 1698 p = &(*p)->rb_left; 1699 else 1700 return (void) WARN_ON(true); 1701 } 1702 1703 rb_link_node(&wpa->writepages_entry, parent, p); 1704 rb_insert_color(&wpa->writepages_entry, root); 1705 } 1706 1707 static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, 1708 int error) 1709 { 1710 struct fuse_writepage_args *wpa = 1711 container_of(args, typeof(*wpa), ia.ap.args); 1712 struct inode *inode = wpa->inode; 1713 struct fuse_inode *fi = get_fuse_inode(inode); 1714 1715 mapping_set_error(inode->i_mapping, error); 1716 spin_lock(&fi->lock); 1717 while (wpa->next) { 1718 struct fuse_conn *fc = get_fuse_conn(inode); 1719 struct fuse_write_in *inarg = &wpa->ia.write.in; 1720 struct 
fuse_writepage_args *next = wpa->next; 1721 1722 wpa->next = next->next; 1723 next->next = NULL; 1724 next->ia.ff = fuse_file_get(wpa->ia.ff); 1725 tree_insert(&fi->writepages, next); 1726 1727 /* 1728 * Skip fuse_flush_writepages() to make it easy to crop requests 1729 * based on primary request size. 1730 * 1731 * 1st case (trivial): there are no concurrent activities using 1732 * fuse_set/release_nowrite. Then we're on safe side because 1733 * fuse_flush_writepages() would call fuse_send_writepage() 1734 * anyway. 1735 * 1736 * 2nd case: someone called fuse_set_nowrite and it is waiting 1737 * now for completion of all in-flight requests. This happens 1738 * rarely and no more than once per page, so this should be 1739 * okay. 1740 * 1741 * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle 1742 * of fuse_set_nowrite..fuse_release_nowrite section. The fact 1743 * that fuse_set_nowrite returned implies that all in-flight 1744 * requests were completed along with all of their secondary 1745 * requests. Further primary requests are blocked by negative 1746 * writectr. Hence there cannot be any in-flight requests and 1747 * no invocations of fuse_writepage_end() while we're in 1748 * fuse_set_nowrite..fuse_release_nowrite section. 1749 */ 1750 fuse_send_writepage(fc, next, inarg->offset + inarg->size); 1751 } 1752 fi->writectr--; 1753 fuse_writepage_finish(fc, wpa); 1754 spin_unlock(&fi->lock); 1755 fuse_writepage_free(wpa); 1756 } 1757 1758 static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, 1759 struct fuse_inode *fi) 1760 { 1761 struct fuse_file *ff = NULL; 1762 1763 spin_lock(&fi->lock); 1764 if (!list_empty(&fi->write_files)) { 1765 ff = list_entry(fi->write_files.next, struct fuse_file, 1766 write_entry); 1767 fuse_file_get(ff); 1768 } 1769 spin_unlock(&fi->lock); 1770 1771 return ff; 1772 } 1773 1774 static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, 1775 struct fuse_inode *fi) 1776 { 1777 struct fuse_file *ff = __fuse_write_file_get(fc, fi); 1778 WARN_ON(!ff); 1779 return ff; 1780 } 1781 1782 int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) 1783 { 1784 struct fuse_conn *fc = get_fuse_conn(inode); 1785 struct fuse_inode *fi = get_fuse_inode(inode); 1786 struct fuse_file *ff; 1787 int err; 1788 1789 ff = __fuse_write_file_get(fc, fi); 1790 err = fuse_flush_times(inode, ff); 1791 if (ff) 1792 fuse_file_put(ff, false, false); 1793 1794 return err; 1795 } 1796 1797 static struct fuse_writepage_args *fuse_writepage_args_alloc(void) 1798 { 1799 struct fuse_writepage_args *wpa; 1800 struct fuse_args_pages *ap; 1801 1802 wpa = kzalloc(sizeof(*wpa), GFP_NOFS); 1803 if (wpa) { 1804 ap = &wpa->ia.ap; 1805 ap->num_pages = 0; 1806 ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs); 1807 if (!ap->pages) { 1808 kfree(wpa); 1809 wpa = NULL; 1810 } 1811 } 1812 return wpa; 1813 1814 } 1815 1816 static int fuse_writepage_locked(struct page *page) 1817 { 1818 struct address_space *mapping = page->mapping; 1819 struct inode *inode = mapping->host; 1820 struct fuse_conn *fc = get_fuse_conn(inode); 1821 struct fuse_inode *fi = get_fuse_inode(inode); 1822 struct fuse_writepage_args *wpa; 1823 struct fuse_args_pages *ap; 1824 struct page *tmp_page; 1825 int error = -ENOMEM; 1826 1827 set_page_writeback(page); 1828 1829 wpa = fuse_writepage_args_alloc(); 1830 if (!wpa) 1831 goto err; 1832 ap = &wpa->ia.ap; 1833 1834 tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 1835 if (!tmp_page) 1836 goto err_free; 1837 1838 error = -EIO; 1839 
wpa->ia.ff = fuse_write_file_get(fc, fi); 1840 if (!wpa->ia.ff) 1841 goto err_nofile; 1842 1843 fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0); 1844 1845 copy_highpage(tmp_page, page); 1846 wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; 1847 wpa->next = NULL; 1848 ap->args.in_pages = true; 1849 ap->num_pages = 1; 1850 ap->pages[0] = tmp_page; 1851 ap->descs[0].offset = 0; 1852 ap->descs[0].length = PAGE_SIZE; 1853 ap->args.end = fuse_writepage_end; 1854 wpa->inode = inode; 1855 1856 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); 1857 inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); 1858 1859 spin_lock(&fi->lock); 1860 tree_insert(&fi->writepages, wpa); 1861 list_add_tail(&wpa->queue_entry, &fi->queued_writes); 1862 fuse_flush_writepages(inode); 1863 spin_unlock(&fi->lock); 1864 1865 end_page_writeback(page); 1866 1867 return 0; 1868 1869 err_nofile: 1870 __free_page(tmp_page); 1871 err_free: 1872 kfree(wpa); 1873 err: 1874 mapping_set_error(page->mapping, error); 1875 end_page_writeback(page); 1876 return error; 1877 } 1878 1879 static int fuse_writepage(struct page *page, struct writeback_control *wbc) 1880 { 1881 int err; 1882 1883 if (fuse_page_is_writeback(page->mapping->host, page->index)) { 1884 /* 1885 * ->writepages() should be called for sync() and friends. We 1886 * should only get here on direct reclaim and then we are 1887 * allowed to skip a page which is already in flight 1888 */ 1889 WARN_ON(wbc->sync_mode == WB_SYNC_ALL); 1890 1891 redirty_page_for_writepage(wbc, page); 1892 unlock_page(page); 1893 return 0; 1894 } 1895 1896 err = fuse_writepage_locked(page); 1897 unlock_page(page); 1898 1899 return err; 1900 } 1901 1902 struct fuse_fill_wb_data { 1903 struct fuse_writepage_args *wpa; 1904 struct fuse_file *ff; 1905 struct inode *inode; 1906 struct page **orig_pages; 1907 unsigned int max_pages; 1908 }; 1909 1910 static bool fuse_pages_realloc(struct fuse_fill_wb_data *data) 1911 { 1912 struct fuse_args_pages *ap = &data->wpa->ia.ap; 1913 struct fuse_conn *fc = get_fuse_conn(data->inode); 1914 struct page **pages; 1915 struct fuse_page_desc *descs; 1916 unsigned int npages = min_t(unsigned int, 1917 max_t(unsigned int, data->max_pages * 2, 1918 FUSE_DEFAULT_MAX_PAGES_PER_REQ), 1919 fc->max_pages); 1920 WARN_ON(npages <= data->max_pages); 1921 1922 pages = fuse_pages_alloc(npages, GFP_NOFS, &descs); 1923 if (!pages) 1924 return false; 1925 1926 memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages); 1927 memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages); 1928 kfree(ap->pages); 1929 ap->pages = pages; 1930 ap->descs = descs; 1931 data->max_pages = npages; 1932 1933 return true; 1934 } 1935 1936 static void fuse_writepages_send(struct fuse_fill_wb_data *data) 1937 { 1938 struct fuse_writepage_args *wpa = data->wpa; 1939 struct inode *inode = data->inode; 1940 struct fuse_inode *fi = get_fuse_inode(inode); 1941 int num_pages = wpa->ia.ap.num_pages; 1942 int i; 1943 1944 wpa->ia.ff = fuse_file_get(data->ff); 1945 spin_lock(&fi->lock); 1946 list_add_tail(&wpa->queue_entry, &fi->queued_writes); 1947 fuse_flush_writepages(inode); 1948 spin_unlock(&fi->lock); 1949 1950 for (i = 0; i < num_pages; i++) 1951 end_page_writeback(data->orig_pages[i]); 1952 } 1953 1954 /* 1955 * First recheck under fi->lock if the offending offset is still under 1956 * writeback. If yes, then iterate auxiliary write requests, to see if there's 1957 * one already added for a page at this offset. 
If there's none, then insert 1958 * this new request onto the auxiliary list, otherwise reuse the existing one by 1959 * copying the new page contents over to the old temporary page. 1960 */ 1961 static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa, 1962 struct page *page) 1963 { 1964 struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); 1965 struct fuse_writepage_args *tmp; 1966 struct fuse_writepage_args *old_wpa; 1967 struct fuse_args_pages *new_ap = &new_wpa->ia.ap; 1968 1969 WARN_ON(new_ap->num_pages != 0); 1970 1971 spin_lock(&fi->lock); 1972 rb_erase(&new_wpa->writepages_entry, &fi->writepages); 1973 old_wpa = fuse_find_writeback(fi, page->index, page->index); 1974 if (!old_wpa) { 1975 tree_insert(&fi->writepages, new_wpa); 1976 spin_unlock(&fi->lock); 1977 return false; 1978 } 1979 1980 new_ap->num_pages = 1; 1981 for (tmp = old_wpa->next; tmp; tmp = tmp->next) { 1982 pgoff_t curr_index; 1983 1984 WARN_ON(tmp->inode != new_wpa->inode); 1985 curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT; 1986 if (curr_index == page->index) { 1987 WARN_ON(tmp->ia.ap.num_pages != 1); 1988 swap(tmp->ia.ap.pages[0], new_ap->pages[0]); 1989 break; 1990 } 1991 } 1992 1993 if (!tmp) { 1994 new_wpa->next = old_wpa->next; 1995 old_wpa->next = new_wpa; 1996 } 1997 1998 spin_unlock(&fi->lock); 1999 2000 if (tmp) { 2001 struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode); 2002 2003 dec_wb_stat(&bdi->wb, WB_WRITEBACK); 2004 dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP); 2005 wb_writeout_inc(&bdi->wb); 2006 fuse_writepage_free(new_wpa); 2007 } 2008 2009 return true; 2010 } 2011 2012 static int fuse_writepages_fill(struct page *page, 2013 struct writeback_control *wbc, void *_data) 2014 { 2015 struct fuse_fill_wb_data *data = _data; 2016 struct fuse_writepage_args *wpa = data->wpa; 2017 struct fuse_args_pages *ap = &wpa->ia.ap; 2018 struct inode *inode = data->inode; 2019 struct fuse_inode *fi = get_fuse_inode(inode); 2020 struct fuse_conn *fc = get_fuse_conn(inode); 2021 struct page *tmp_page; 2022 bool is_writeback; 2023 int err; 2024 2025 if (!data->ff) { 2026 err = -EIO; 2027 data->ff = fuse_write_file_get(fc, fi); 2028 if (!data->ff) 2029 goto out_unlock; 2030 } 2031 2032 /* 2033 * Being under writeback is unlikely but possible. For example direct 2034 * read to an mmaped fuse file will set the page dirty twice; once when 2035 * the pages are faulted with get_user_pages(), and then after the read 2036 * completed. 2037 */ 2038 is_writeback = fuse_page_is_writeback(inode, page->index); 2039 2040 if (wpa && ap->num_pages && 2041 (is_writeback || ap->num_pages == fc->max_pages || 2042 (ap->num_pages + 1) * PAGE_SIZE > fc->max_write || 2043 data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)) { 2044 fuse_writepages_send(data); 2045 data->wpa = NULL; 2046 } else if (wpa && ap->num_pages == data->max_pages) { 2047 if (!fuse_pages_realloc(data)) { 2048 fuse_writepages_send(data); 2049 data->wpa = NULL; 2050 } 2051 } 2052 2053 err = -ENOMEM; 2054 tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 2055 if (!tmp_page) 2056 goto out_unlock; 2057 2058 /* 2059 * The page must not be redirtied until the writeout is completed 2060 * (i.e. userspace has sent a reply to the write request). Otherwise 2061 * there could be more than one temporary page instance for each real 2062 * page. 2063 * 2064 * This is ensured by holding the page lock in page_mkwrite() while 2065 * checking fuse_page_is_writeback(). 
 * We already hold the page lock from clear_page_dirty_for_io() and keep
 * it held until we add the request to the fi->writepages list and
 * increment ap->num_pages.  After this fuse_page_is_writeback() will
 * indicate that the page is under writeback, so we can release the page
 * lock.
 */
	if (data->wpa == NULL) {
		err = -ENOMEM;
		wpa = fuse_writepage_args_alloc();
		if (!wpa) {
			__free_page(tmp_page);
			goto out_unlock;
		}
		data->max_pages = 1;

		ap = &wpa->ia.ap;
		fuse_write_args_fill(&wpa->ia, data->ff, page_offset(page), 0);
		wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
		wpa->next = NULL;
		ap->args.in_pages = true;
		ap->args.end = fuse_writepage_end;
		ap->num_pages = 0;
		wpa->inode = inode;

		spin_lock(&fi->lock);
		tree_insert(&fi->writepages, wpa);
		spin_unlock(&fi->lock);

		data->wpa = wpa;
	}
	set_page_writeback(page);

	copy_highpage(tmp_page, page);
	ap->pages[ap->num_pages] = tmp_page;
	ap->descs[ap->num_pages].offset = 0;
	ap->descs[ap->num_pages].length = PAGE_SIZE;

	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);

	err = 0;
	if (is_writeback && fuse_writepage_in_flight(wpa, page)) {
		end_page_writeback(page);
		data->wpa = NULL;
		goto out_unlock;
	}
	data->orig_pages[ap->num_pages] = page;

	/*
	 * Protected by fi->lock against concurrent access by
	 * fuse_page_is_writeback().
	 */
	spin_lock(&fi->lock);
	ap->num_pages++;
	spin_unlock(&fi->lock);

out_unlock:
	unlock_page(page);

	return err;
}

static int fuse_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_fill_wb_data data;
	int err;

	err = -EIO;
	if (is_bad_inode(inode))
		goto out;

	data.inode = inode;
	data.wpa = NULL;
	data.ff = NULL;

	err = -ENOMEM;
	data.orig_pages = kcalloc(fc->max_pages,
				  sizeof(struct page *),
				  GFP_NOFS);
	if (!data.orig_pages)
		goto out;

	err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
	if (data.wpa) {
		/* Ignore errors if we can write at least one page */
		WARN_ON(!data.wpa->ia.ap.num_pages);
		fuse_writepages_send(&data);
		err = 0;
	}
	if (data.ff)
		fuse_file_put(data.ff, false, false);

	kfree(data.orig_pages);
out:
	return err;
}

/*
 * It would be worthwhile to make sure that space is reserved on disk for
 * the write, but how to implement that without hurting performance needs
 * more thought.
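 *
 * For now fuse_write_begin() below only brings the page up to date (or
 * skips the read when the page is fully overwritten or starts at or past
 * EOF) before the data is copied in; no space is reserved.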
 */
static int fuse_write_begin(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned flags,
		struct page **pagep, void **fsdata)
{
	pgoff_t index = pos >> PAGE_SHIFT;
	struct fuse_conn *fc = get_fuse_conn(file_inode(file));
	struct page *page;
	loff_t fsize;
	int err = -ENOMEM;

	WARN_ON(!fc->writeback_cache);

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		goto error;

	fuse_wait_on_page_writeback(mapping->host, page->index);

	if (PageUptodate(page) || len == PAGE_SIZE)
		goto success;
	/*
	 * Check if the start of this page comes after the end of file, in
	 * which case the readpage can be optimized away.
	 */
	fsize = i_size_read(mapping->host);
	if (fsize <= (pos & PAGE_MASK)) {
		size_t off = pos & ~PAGE_MASK;
		if (off)
			zero_user_segment(page, 0, off);
		goto success;
	}
	err = fuse_do_readpage(file, page);
	if (err)
		goto cleanup;
success:
	*pagep = page;
	return 0;

cleanup:
	unlock_page(page);
	put_page(page);
error:
	return err;
}

static int fuse_write_end(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned copied,
		struct page *page, void *fsdata)
{
	struct inode *inode = page->mapping->host;

	/* Haven't copied anything?  Skip zeroing, size extending, dirtying. */
	if (!copied)
		goto unlock;

	if (!PageUptodate(page)) {
		/* Zero any unwritten bytes at the end of the page */
		size_t endoff = (pos + copied) & ~PAGE_MASK;
		if (endoff)
			zero_user_segment(page, endoff, PAGE_SIZE);
		SetPageUptodate(page);
	}

	fuse_write_update_size(inode, pos + copied);
	set_page_dirty(page);

unlock:
	unlock_page(page);
	put_page(page);

	return copied;
}

static int fuse_launder_page(struct page *page)
{
	int err = 0;
	if (clear_page_dirty_for_io(page)) {
		struct inode *inode = page->mapping->host;
		err = fuse_writepage_locked(page);
		if (!err)
			fuse_wait_on_page_writeback(inode, page->index);
	}
	return err;
}

/*
 * Write back dirty pages now, because there may not be any suitable
 * open files later.
 */
static void fuse_vma_close(struct vm_area_struct *vma)
{
	filemap_write_and_wait(vma->vm_file->f_mapping);
}

/*
 * Wait for writeback against this page to complete before allowing it
 * to be marked dirty again, and hence written back again, possibly
 * before the previous writepage completed.
 *
 * Block here, instead of in ->writepage(), so that the userspace fs
 * can only block processes actually operating on the filesystem.
2270 * 2271 * Otherwise unprivileged userspace fs would be able to block 2272 * unrelated: 2273 * 2274 * - page migration 2275 * - sync(2) 2276 * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER 2277 */ 2278 static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) 2279 { 2280 struct page *page = vmf->page; 2281 struct inode *inode = file_inode(vmf->vma->vm_file); 2282 2283 file_update_time(vmf->vma->vm_file); 2284 lock_page(page); 2285 if (page->mapping != inode->i_mapping) { 2286 unlock_page(page); 2287 return VM_FAULT_NOPAGE; 2288 } 2289 2290 fuse_wait_on_page_writeback(inode, page->index); 2291 return VM_FAULT_LOCKED; 2292 } 2293 2294 static const struct vm_operations_struct fuse_file_vm_ops = { 2295 .close = fuse_vma_close, 2296 .fault = filemap_fault, 2297 .map_pages = filemap_map_pages, 2298 .page_mkwrite = fuse_page_mkwrite, 2299 }; 2300 2301 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) 2302 { 2303 struct fuse_file *ff = file->private_data; 2304 2305 if (ff->open_flags & FOPEN_DIRECT_IO) { 2306 /* Can't provide the coherency needed for MAP_SHARED */ 2307 if (vma->vm_flags & VM_MAYSHARE) 2308 return -ENODEV; 2309 2310 invalidate_inode_pages2(file->f_mapping); 2311 2312 return generic_file_mmap(file, vma); 2313 } 2314 2315 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) 2316 fuse_link_write_file(file); 2317 2318 file_accessed(file); 2319 vma->vm_ops = &fuse_file_vm_ops; 2320 return 0; 2321 } 2322 2323 static int convert_fuse_file_lock(struct fuse_conn *fc, 2324 const struct fuse_file_lock *ffl, 2325 struct file_lock *fl) 2326 { 2327 switch (ffl->type) { 2328 case F_UNLCK: 2329 break; 2330 2331 case F_RDLCK: 2332 case F_WRLCK: 2333 if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX || 2334 ffl->end < ffl->start) 2335 return -EIO; 2336 2337 fl->fl_start = ffl->start; 2338 fl->fl_end = ffl->end; 2339 2340 /* 2341 * Convert pid into init's pid namespace. The locks API will 2342 * translate it into the caller's pid namespace. 
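 *
 * ffl->pid as reported by the server is a number in fc->pid_ns (the pid
 * namespace the connection was set up in), so look it up there and store
 * the corresponding number in init_pid_ns.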
2343 */ 2344 rcu_read_lock(); 2345 fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns); 2346 rcu_read_unlock(); 2347 break; 2348 2349 default: 2350 return -EIO; 2351 } 2352 fl->fl_type = ffl->type; 2353 return 0; 2354 } 2355 2356 static void fuse_lk_fill(struct fuse_args *args, struct file *file, 2357 const struct file_lock *fl, int opcode, pid_t pid, 2358 int flock, struct fuse_lk_in *inarg) 2359 { 2360 struct inode *inode = file_inode(file); 2361 struct fuse_conn *fc = get_fuse_conn(inode); 2362 struct fuse_file *ff = file->private_data; 2363 2364 memset(inarg, 0, sizeof(*inarg)); 2365 inarg->fh = ff->fh; 2366 inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner); 2367 inarg->lk.start = fl->fl_start; 2368 inarg->lk.end = fl->fl_end; 2369 inarg->lk.type = fl->fl_type; 2370 inarg->lk.pid = pid; 2371 if (flock) 2372 inarg->lk_flags |= FUSE_LK_FLOCK; 2373 args->opcode = opcode; 2374 args->nodeid = get_node_id(inode); 2375 args->in_numargs = 1; 2376 args->in_args[0].size = sizeof(*inarg); 2377 args->in_args[0].value = inarg; 2378 } 2379 2380 static int fuse_getlk(struct file *file, struct file_lock *fl) 2381 { 2382 struct inode *inode = file_inode(file); 2383 struct fuse_conn *fc = get_fuse_conn(inode); 2384 FUSE_ARGS(args); 2385 struct fuse_lk_in inarg; 2386 struct fuse_lk_out outarg; 2387 int err; 2388 2389 fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg); 2390 args.out_numargs = 1; 2391 args.out_args[0].size = sizeof(outarg); 2392 args.out_args[0].value = &outarg; 2393 err = fuse_simple_request(fc, &args); 2394 if (!err) 2395 err = convert_fuse_file_lock(fc, &outarg.lk, fl); 2396 2397 return err; 2398 } 2399 2400 static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) 2401 { 2402 struct inode *inode = file_inode(file); 2403 struct fuse_conn *fc = get_fuse_conn(inode); 2404 FUSE_ARGS(args); 2405 struct fuse_lk_in inarg; 2406 int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; 2407 struct pid *pid = fl->fl_type != F_UNLCK ? 
task_tgid(current) : NULL; 2408 pid_t pid_nr = pid_nr_ns(pid, fc->pid_ns); 2409 int err; 2410 2411 if (fl->fl_lmops && fl->fl_lmops->lm_grant) { 2412 /* NLM needs asynchronous locks, which we don't support yet */ 2413 return -ENOLCK; 2414 } 2415 2416 /* Unlock on close is handled by the flush method */ 2417 if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX) 2418 return 0; 2419 2420 fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); 2421 err = fuse_simple_request(fc, &args); 2422 2423 /* locking is restartable */ 2424 if (err == -EINTR) 2425 err = -ERESTARTSYS; 2426 2427 return err; 2428 } 2429 2430 static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl) 2431 { 2432 struct inode *inode = file_inode(file); 2433 struct fuse_conn *fc = get_fuse_conn(inode); 2434 int err; 2435 2436 if (cmd == F_CANCELLK) { 2437 err = 0; 2438 } else if (cmd == F_GETLK) { 2439 if (fc->no_lock) { 2440 posix_test_lock(file, fl); 2441 err = 0; 2442 } else 2443 err = fuse_getlk(file, fl); 2444 } else { 2445 if (fc->no_lock) 2446 err = posix_lock_file(file, fl, NULL); 2447 else 2448 err = fuse_setlk(file, fl, 0); 2449 } 2450 return err; 2451 } 2452 2453 static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) 2454 { 2455 struct inode *inode = file_inode(file); 2456 struct fuse_conn *fc = get_fuse_conn(inode); 2457 int err; 2458 2459 if (fc->no_flock) { 2460 err = locks_lock_file_wait(file, fl); 2461 } else { 2462 struct fuse_file *ff = file->private_data; 2463 2464 /* emulate flock with POSIX locks */ 2465 ff->flock = true; 2466 err = fuse_setlk(file, fl, 1); 2467 } 2468 2469 return err; 2470 } 2471 2472 static sector_t fuse_bmap(struct address_space *mapping, sector_t block) 2473 { 2474 struct inode *inode = mapping->host; 2475 struct fuse_conn *fc = get_fuse_conn(inode); 2476 FUSE_ARGS(args); 2477 struct fuse_bmap_in inarg; 2478 struct fuse_bmap_out outarg; 2479 int err; 2480 2481 if (!inode->i_sb->s_bdev || fc->no_bmap) 2482 return 0; 2483 2484 memset(&inarg, 0, sizeof(inarg)); 2485 inarg.block = block; 2486 inarg.blocksize = inode->i_sb->s_blocksize; 2487 args.opcode = FUSE_BMAP; 2488 args.nodeid = get_node_id(inode); 2489 args.in_numargs = 1; 2490 args.in_args[0].size = sizeof(inarg); 2491 args.in_args[0].value = &inarg; 2492 args.out_numargs = 1; 2493 args.out_args[0].size = sizeof(outarg); 2494 args.out_args[0].value = &outarg; 2495 err = fuse_simple_request(fc, &args); 2496 if (err == -ENOSYS) 2497 fc->no_bmap = 1; 2498 2499 return err ? 
0 : outarg.block; 2500 } 2501 2502 static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) 2503 { 2504 struct inode *inode = file->f_mapping->host; 2505 struct fuse_conn *fc = get_fuse_conn(inode); 2506 struct fuse_file *ff = file->private_data; 2507 FUSE_ARGS(args); 2508 struct fuse_lseek_in inarg = { 2509 .fh = ff->fh, 2510 .offset = offset, 2511 .whence = whence 2512 }; 2513 struct fuse_lseek_out outarg; 2514 int err; 2515 2516 if (fc->no_lseek) 2517 goto fallback; 2518 2519 args.opcode = FUSE_LSEEK; 2520 args.nodeid = ff->nodeid; 2521 args.in_numargs = 1; 2522 args.in_args[0].size = sizeof(inarg); 2523 args.in_args[0].value = &inarg; 2524 args.out_numargs = 1; 2525 args.out_args[0].size = sizeof(outarg); 2526 args.out_args[0].value = &outarg; 2527 err = fuse_simple_request(fc, &args); 2528 if (err) { 2529 if (err == -ENOSYS) { 2530 fc->no_lseek = 1; 2531 goto fallback; 2532 } 2533 return err; 2534 } 2535 2536 return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes); 2537 2538 fallback: 2539 err = fuse_update_attributes(inode, file); 2540 if (!err) 2541 return generic_file_llseek(file, offset, whence); 2542 else 2543 return err; 2544 } 2545 2546 static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) 2547 { 2548 loff_t retval; 2549 struct inode *inode = file_inode(file); 2550 2551 switch (whence) { 2552 case SEEK_SET: 2553 case SEEK_CUR: 2554 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */ 2555 retval = generic_file_llseek(file, offset, whence); 2556 break; 2557 case SEEK_END: 2558 inode_lock(inode); 2559 retval = fuse_update_attributes(inode, file); 2560 if (!retval) 2561 retval = generic_file_llseek(file, offset, whence); 2562 inode_unlock(inode); 2563 break; 2564 case SEEK_HOLE: 2565 case SEEK_DATA: 2566 inode_lock(inode); 2567 retval = fuse_lseek(file, offset, whence); 2568 inode_unlock(inode); 2569 break; 2570 default: 2571 retval = -EINVAL; 2572 } 2573 2574 return retval; 2575 } 2576 2577 /* 2578 * CUSE servers compiled on 32bit broke on 64bit kernels because the 2579 * ABI was defined to be 'struct iovec' which is different on 32bit 2580 * and 64bit. Fortunately we can determine which structure the server 2581 * used from the size of the reply. 2582 */ 2583 static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src, 2584 size_t transferred, unsigned count, 2585 bool is_compat) 2586 { 2587 #ifdef CONFIG_COMPAT 2588 if (count * sizeof(struct compat_iovec) == transferred) { 2589 struct compat_iovec *ciov = src; 2590 unsigned i; 2591 2592 /* 2593 * With this interface a 32bit server cannot support 2594 * non-compat (i.e. 
ones coming from 64bit apps) ioctl 2595 * requests 2596 */ 2597 if (!is_compat) 2598 return -EINVAL; 2599 2600 for (i = 0; i < count; i++) { 2601 dst[i].iov_base = compat_ptr(ciov[i].iov_base); 2602 dst[i].iov_len = ciov[i].iov_len; 2603 } 2604 return 0; 2605 } 2606 #endif 2607 2608 if (count * sizeof(struct iovec) != transferred) 2609 return -EIO; 2610 2611 memcpy(dst, src, transferred); 2612 return 0; 2613 } 2614 2615 /* Make sure iov_length() won't overflow */ 2616 static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov, 2617 size_t count) 2618 { 2619 size_t n; 2620 u32 max = fc->max_pages << PAGE_SHIFT; 2621 2622 for (n = 0; n < count; n++, iov++) { 2623 if (iov->iov_len > (size_t) max) 2624 return -ENOMEM; 2625 max -= iov->iov_len; 2626 } 2627 return 0; 2628 } 2629 2630 static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst, 2631 void *src, size_t transferred, unsigned count, 2632 bool is_compat) 2633 { 2634 unsigned i; 2635 struct fuse_ioctl_iovec *fiov = src; 2636 2637 if (fc->minor < 16) { 2638 return fuse_copy_ioctl_iovec_old(dst, src, transferred, 2639 count, is_compat); 2640 } 2641 2642 if (count * sizeof(struct fuse_ioctl_iovec) != transferred) 2643 return -EIO; 2644 2645 for (i = 0; i < count; i++) { 2646 /* Did the server supply an inappropriate value? */ 2647 if (fiov[i].base != (unsigned long) fiov[i].base || 2648 fiov[i].len != (unsigned long) fiov[i].len) 2649 return -EIO; 2650 2651 dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base; 2652 dst[i].iov_len = (size_t) fiov[i].len; 2653 2654 #ifdef CONFIG_COMPAT 2655 if (is_compat && 2656 (ptr_to_compat(dst[i].iov_base) != fiov[i].base || 2657 (compat_size_t) dst[i].iov_len != fiov[i].len)) 2658 return -EIO; 2659 #endif 2660 } 2661 2662 return 0; 2663 } 2664 2665 2666 /* 2667 * For ioctls, there is no generic way to determine how much memory 2668 * needs to be read and/or written. Furthermore, ioctls are allowed 2669 * to dereference the passed pointer, so the parameter requires deep 2670 * copying but FUSE has no idea whatsoever about what to copy in or 2671 * out. 2672 * 2673 * This is solved by allowing FUSE server to retry ioctl with 2674 * necessary in/out iovecs. Let's assume the ioctl implementation 2675 * needs to read in the following structure. 2676 * 2677 * struct a { 2678 * char *buf; 2679 * size_t buflen; 2680 * } 2681 * 2682 * On the first callout to FUSE server, inarg->in_size and 2683 * inarg->out_size will be NULL; then, the server completes the ioctl 2684 * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and 2685 * the actual iov array to 2686 * 2687 * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } } 2688 * 2689 * which tells FUSE to copy in the requested area and retry the ioctl. 2690 * On the second round, the server has access to the structure and 2691 * from that it can tell what to look for next, so on the invocation, 2692 * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to 2693 * 2694 * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) }, 2695 * { .iov_base = a.buf, .iov_len = a.buflen } } 2696 * 2697 * FUSE will copy both struct a and the pointed buffer from the 2698 * process doing the ioctl and retry ioctl with both struct a and the 2699 * buffer. 2700 * 2701 * This time, FUSE server has everything it needs and completes ioctl 2702 * without FUSE_IOCTL_RETRY which finishes the ioctl call. 2703 * 2704 * Copying data out works the same way. 
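 *
 * Put together, the exchange for the example above is (sketch only):
 *
 *   round 1: reply with FUSE_IOCTL_RETRY, in_iovs = 1,
 *            iov = { { inarg.arg, sizeof(struct a) } }
 *   round 2: reply with FUSE_IOCTL_RETRY, in_iovs = 2,
 *            iov = { { inarg.arg, sizeof(struct a) },
 *                    { a.buf, a.buflen } }
 *   round 3: reply with the final result, without FUSE_IOCTL_RETRY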
 *
 * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
 * automatically initializes in and out iovs by decoding @cmd with
 * _IOC_* macros and the server is not allowed to request RETRY.  This
 * limits ioctl data transfers to well-formed ioctls and is the forced
 * behavior for all FUSE servers.
 */
long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
		   unsigned int flags)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;
	struct fuse_ioctl_in inarg = {
		.fh = ff->fh,
		.cmd = cmd,
		.arg = arg,
		.flags = flags
	};
	struct fuse_ioctl_out outarg;
	struct iovec *iov_page = NULL;
	struct iovec *in_iov = NULL, *out_iov = NULL;
	unsigned int in_iovs = 0, out_iovs = 0, max_pages;
	size_t in_size, out_size, c;
	ssize_t transferred;
	int err, i;
	struct iov_iter ii;
	struct fuse_args_pages ap = {};

#if BITS_PER_LONG == 32
	inarg.flags |= FUSE_IOCTL_32BIT;
#else
	if (flags & FUSE_IOCTL_COMPAT) {
		inarg.flags |= FUSE_IOCTL_32BIT;
#ifdef CONFIG_X86_X32
		if (in_x32_syscall())
			inarg.flags |= FUSE_IOCTL_COMPAT_X32;
#endif
	}
#endif

	/* assume all the iovs returned by the client always fit in a page */
	BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);

	err = -ENOMEM;
	ap.pages = fuse_pages_alloc(fc->max_pages, GFP_KERNEL, &ap.descs);
	iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
	if (!ap.pages || !iov_page)
		goto out;

	fuse_page_descs_length_init(ap.descs, 0, fc->max_pages);

	/*
	 * If restricted, initialize IO parameters as encoded in @cmd.
	 * RETRY from server is not allowed.
	 */
	if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
		struct iovec *iov = iov_page;

		iov->iov_base = (void __user *)arg;
		iov->iov_len = _IOC_SIZE(cmd);

		if (_IOC_DIR(cmd) & _IOC_WRITE) {
			in_iov = iov;
			in_iovs = 1;
		}

		if (_IOC_DIR(cmd) & _IOC_READ) {
			out_iov = iov;
			out_iovs = 1;
		}
	}

retry:
	inarg.in_size = in_size = iov_length(in_iov, in_iovs);
	inarg.out_size = out_size = iov_length(out_iov, out_iovs);

	/*
	 * Out data can be used either for actual out data or iovs,
	 * make sure there always is at least one page.
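	 *
	 * A FUSE_IOCTL_RETRY reply returns its iovec arrays through this
	 * same out buffer even when inarg.out_size is zero, hence the
	 * PAGE_SIZE floor below.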
2784 */ 2785 out_size = max_t(size_t, out_size, PAGE_SIZE); 2786 max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE); 2787 2788 /* make sure there are enough buffer pages and init request with them */ 2789 err = -ENOMEM; 2790 if (max_pages > fc->max_pages) 2791 goto out; 2792 while (ap.num_pages < max_pages) { 2793 ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); 2794 if (!ap.pages[ap.num_pages]) 2795 goto out; 2796 ap.num_pages++; 2797 } 2798 2799 2800 /* okay, let's send it to the client */ 2801 ap.args.opcode = FUSE_IOCTL; 2802 ap.args.nodeid = ff->nodeid; 2803 ap.args.in_numargs = 1; 2804 ap.args.in_args[0].size = sizeof(inarg); 2805 ap.args.in_args[0].value = &inarg; 2806 if (in_size) { 2807 ap.args.in_numargs++; 2808 ap.args.in_args[1].size = in_size; 2809 ap.args.in_pages = true; 2810 2811 err = -EFAULT; 2812 iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size); 2813 for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { 2814 c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii); 2815 if (c != PAGE_SIZE && iov_iter_count(&ii)) 2816 goto out; 2817 } 2818 } 2819 2820 ap.args.out_numargs = 2; 2821 ap.args.out_args[0].size = sizeof(outarg); 2822 ap.args.out_args[0].value = &outarg; 2823 ap.args.out_args[1].size = out_size; 2824 ap.args.out_pages = true; 2825 ap.args.out_argvar = true; 2826 2827 transferred = fuse_simple_request(fc, &ap.args); 2828 err = transferred; 2829 if (transferred < 0) 2830 goto out; 2831 2832 /* did it ask for retry? */ 2833 if (outarg.flags & FUSE_IOCTL_RETRY) { 2834 void *vaddr; 2835 2836 /* no retry if in restricted mode */ 2837 err = -EIO; 2838 if (!(flags & FUSE_IOCTL_UNRESTRICTED)) 2839 goto out; 2840 2841 in_iovs = outarg.in_iovs; 2842 out_iovs = outarg.out_iovs; 2843 2844 /* 2845 * Make sure things are in boundary, separate checks 2846 * are to protect against overflow. 2847 */ 2848 err = -ENOMEM; 2849 if (in_iovs > FUSE_IOCTL_MAX_IOV || 2850 out_iovs > FUSE_IOCTL_MAX_IOV || 2851 in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) 2852 goto out; 2853 2854 vaddr = kmap_atomic(ap.pages[0]); 2855 err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr, 2856 transferred, in_iovs + out_iovs, 2857 (flags & FUSE_IOCTL_COMPAT) != 0); 2858 kunmap_atomic(vaddr); 2859 if (err) 2860 goto out; 2861 2862 in_iov = iov_page; 2863 out_iov = in_iov + in_iovs; 2864 2865 err = fuse_verify_ioctl_iov(fc, in_iov, in_iovs); 2866 if (err) 2867 goto out; 2868 2869 err = fuse_verify_ioctl_iov(fc, out_iov, out_iovs); 2870 if (err) 2871 goto out; 2872 2873 goto retry; 2874 } 2875 2876 err = -EIO; 2877 if (transferred > inarg.out_size) 2878 goto out; 2879 2880 err = -EFAULT; 2881 iov_iter_init(&ii, READ, out_iov, out_iovs, transferred); 2882 for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { 2883 c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii); 2884 if (c != PAGE_SIZE && iov_iter_count(&ii)) 2885 goto out; 2886 } 2887 err = 0; 2888 out: 2889 free_page((unsigned long) iov_page); 2890 while (ap.num_pages) 2891 __free_page(ap.pages[--ap.num_pages]); 2892 kfree(ap.pages); 2893 2894 return err ? 
err : outarg.result; 2895 } 2896 EXPORT_SYMBOL_GPL(fuse_do_ioctl); 2897 2898 long fuse_ioctl_common(struct file *file, unsigned int cmd, 2899 unsigned long arg, unsigned int flags) 2900 { 2901 struct inode *inode = file_inode(file); 2902 struct fuse_conn *fc = get_fuse_conn(inode); 2903 2904 if (!fuse_allow_current_process(fc)) 2905 return -EACCES; 2906 2907 if (is_bad_inode(inode)) 2908 return -EIO; 2909 2910 return fuse_do_ioctl(file, cmd, arg, flags); 2911 } 2912 2913 static long fuse_file_ioctl(struct file *file, unsigned int cmd, 2914 unsigned long arg) 2915 { 2916 return fuse_ioctl_common(file, cmd, arg, 0); 2917 } 2918 2919 static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, 2920 unsigned long arg) 2921 { 2922 return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT); 2923 } 2924 2925 /* 2926 * All files which have been polled are linked to RB tree 2927 * fuse_conn->polled_files which is indexed by kh. Walk the tree and 2928 * find the matching one. 2929 */ 2930 static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh, 2931 struct rb_node **parent_out) 2932 { 2933 struct rb_node **link = &fc->polled_files.rb_node; 2934 struct rb_node *last = NULL; 2935 2936 while (*link) { 2937 struct fuse_file *ff; 2938 2939 last = *link; 2940 ff = rb_entry(last, struct fuse_file, polled_node); 2941 2942 if (kh < ff->kh) 2943 link = &last->rb_left; 2944 else if (kh > ff->kh) 2945 link = &last->rb_right; 2946 else 2947 return link; 2948 } 2949 2950 if (parent_out) 2951 *parent_out = last; 2952 return link; 2953 } 2954 2955 /* 2956 * The file is about to be polled. Make sure it's on the polled_files 2957 * RB tree. Note that files once added to the polled_files tree are 2958 * not removed before the file is released. This is because a file 2959 * polled once is likely to be polled again. 2960 */ 2961 static void fuse_register_polled_file(struct fuse_conn *fc, 2962 struct fuse_file *ff) 2963 { 2964 spin_lock(&fc->lock); 2965 if (RB_EMPTY_NODE(&ff->polled_node)) { 2966 struct rb_node **link, *uninitialized_var(parent); 2967 2968 link = fuse_find_polled_node(fc, ff->kh, &parent); 2969 BUG_ON(*link); 2970 rb_link_node(&ff->polled_node, parent, link); 2971 rb_insert_color(&ff->polled_node, &fc->polled_files); 2972 } 2973 spin_unlock(&fc->lock); 2974 } 2975 2976 __poll_t fuse_file_poll(struct file *file, poll_table *wait) 2977 { 2978 struct fuse_file *ff = file->private_data; 2979 struct fuse_conn *fc = ff->fc; 2980 struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; 2981 struct fuse_poll_out outarg; 2982 FUSE_ARGS(args); 2983 int err; 2984 2985 if (fc->no_poll) 2986 return DEFAULT_POLLMASK; 2987 2988 poll_wait(file, &ff->poll_wait, wait); 2989 inarg.events = mangle_poll(poll_requested_events(wait)); 2990 2991 /* 2992 * Ask for notification iff there's someone waiting for it. 2993 * The client may ignore the flag and always notify. 
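 *
 * Registering the file keeps it on the fc->polled_files rbtree, indexed
 * by kh, so that a later FUSE_NOTIFY_POLL from the server can be matched
 * back to this file by fuse_notify_poll_wakeup().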
2994 */ 2995 if (waitqueue_active(&ff->poll_wait)) { 2996 inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY; 2997 fuse_register_polled_file(fc, ff); 2998 } 2999 3000 args.opcode = FUSE_POLL; 3001 args.nodeid = ff->nodeid; 3002 args.in_numargs = 1; 3003 args.in_args[0].size = sizeof(inarg); 3004 args.in_args[0].value = &inarg; 3005 args.out_numargs = 1; 3006 args.out_args[0].size = sizeof(outarg); 3007 args.out_args[0].value = &outarg; 3008 err = fuse_simple_request(fc, &args); 3009 3010 if (!err) 3011 return demangle_poll(outarg.revents); 3012 if (err == -ENOSYS) { 3013 fc->no_poll = 1; 3014 return DEFAULT_POLLMASK; 3015 } 3016 return EPOLLERR; 3017 } 3018 EXPORT_SYMBOL_GPL(fuse_file_poll); 3019 3020 /* 3021 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and 3022 * wakes up the poll waiters. 3023 */ 3024 int fuse_notify_poll_wakeup(struct fuse_conn *fc, 3025 struct fuse_notify_poll_wakeup_out *outarg) 3026 { 3027 u64 kh = outarg->kh; 3028 struct rb_node **link; 3029 3030 spin_lock(&fc->lock); 3031 3032 link = fuse_find_polled_node(fc, kh, NULL); 3033 if (*link) { 3034 struct fuse_file *ff; 3035 3036 ff = rb_entry(*link, struct fuse_file, polled_node); 3037 wake_up_interruptible_sync(&ff->poll_wait); 3038 } 3039 3040 spin_unlock(&fc->lock); 3041 return 0; 3042 } 3043 3044 static void fuse_do_truncate(struct file *file) 3045 { 3046 struct inode *inode = file->f_mapping->host; 3047 struct iattr attr; 3048 3049 attr.ia_valid = ATTR_SIZE; 3050 attr.ia_size = i_size_read(inode); 3051 3052 attr.ia_file = file; 3053 attr.ia_valid |= ATTR_FILE; 3054 3055 fuse_do_setattr(file_dentry(file), &attr, file); 3056 } 3057 3058 static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off) 3059 { 3060 return round_up(off, fc->max_pages << PAGE_SHIFT); 3061 } 3062 3063 static ssize_t 3064 fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) 3065 { 3066 DECLARE_COMPLETION_ONSTACK(wait); 3067 ssize_t ret = 0; 3068 struct file *file = iocb->ki_filp; 3069 struct fuse_file *ff = file->private_data; 3070 bool async_dio = ff->fc->async_dio; 3071 loff_t pos = 0; 3072 struct inode *inode; 3073 loff_t i_size; 3074 size_t count = iov_iter_count(iter); 3075 loff_t offset = iocb->ki_pos; 3076 struct fuse_io_priv *io; 3077 3078 pos = offset; 3079 inode = file->f_mapping->host; 3080 i_size = i_size_read(inode); 3081 3082 if ((iov_iter_rw(iter) == READ) && (offset > i_size)) 3083 return 0; 3084 3085 /* optimization for short read */ 3086 if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) { 3087 if (offset >= i_size) 3088 return 0; 3089 iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset)); 3090 count = iov_iter_count(iter); 3091 } 3092 3093 io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL); 3094 if (!io) 3095 return -ENOMEM; 3096 spin_lock_init(&io->lock); 3097 kref_init(&io->refcnt); 3098 io->reqs = 1; 3099 io->bytes = -1; 3100 io->size = 0; 3101 io->offset = offset; 3102 io->write = (iov_iter_rw(iter) == WRITE); 3103 io->err = 0; 3104 /* 3105 * By default, we want to optimize all I/Os with async request 3106 * submission to the client filesystem if supported. 3107 */ 3108 io->async = async_dio; 3109 io->iocb = iocb; 3110 io->blocking = is_sync_kiocb(iocb); 3111 3112 /* 3113 * We cannot asynchronously extend the size of a file. 3114 * In such case the aio will behave exactly like sync io. 
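 *
 * "Exactly like sync io" means the request is still submitted through
 * the async path, but an extra reference is taken and we wait for the
 * completion below before returning to the caller.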
3115 */ 3116 if ((offset + count > i_size) && iov_iter_rw(iter) == WRITE) 3117 io->blocking = true; 3118 3119 if (io->async && io->blocking) { 3120 /* 3121 * Additional reference to keep io around after 3122 * calling fuse_aio_complete() 3123 */ 3124 kref_get(&io->refcnt); 3125 io->done = &wait; 3126 } 3127 3128 if (iov_iter_rw(iter) == WRITE) { 3129 ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE); 3130 fuse_invalidate_attr(inode); 3131 } else { 3132 ret = __fuse_direct_read(io, iter, &pos); 3133 } 3134 3135 if (io->async) { 3136 bool blocking = io->blocking; 3137 3138 fuse_aio_complete(io, ret < 0 ? ret : 0, -1); 3139 3140 /* we have a non-extending, async request, so return */ 3141 if (!blocking) 3142 return -EIOCBQUEUED; 3143 3144 wait_for_completion(&wait); 3145 ret = fuse_get_res_by_io(io); 3146 } 3147 3148 kref_put(&io->refcnt, fuse_io_release); 3149 3150 if (iov_iter_rw(iter) == WRITE) { 3151 if (ret > 0) 3152 fuse_write_update_size(inode, pos); 3153 else if (ret < 0 && offset + count > i_size) 3154 fuse_do_truncate(file); 3155 } 3156 3157 return ret; 3158 } 3159 3160 static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end) 3161 { 3162 int err = filemap_write_and_wait_range(inode->i_mapping, start, end); 3163 3164 if (!err) 3165 fuse_sync_writes(inode); 3166 3167 return err; 3168 } 3169 3170 static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, 3171 loff_t length) 3172 { 3173 struct fuse_file *ff = file->private_data; 3174 struct inode *inode = file_inode(file); 3175 struct fuse_inode *fi = get_fuse_inode(inode); 3176 struct fuse_conn *fc = ff->fc; 3177 FUSE_ARGS(args); 3178 struct fuse_fallocate_in inarg = { 3179 .fh = ff->fh, 3180 .offset = offset, 3181 .length = length, 3182 .mode = mode 3183 }; 3184 int err; 3185 bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || 3186 (mode & FALLOC_FL_PUNCH_HOLE); 3187 3188 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) 3189 return -EOPNOTSUPP; 3190 3191 if (fc->no_fallocate) 3192 return -EOPNOTSUPP; 3193 3194 if (lock_inode) { 3195 inode_lock(inode); 3196 if (mode & FALLOC_FL_PUNCH_HOLE) { 3197 loff_t endbyte = offset + length - 1; 3198 3199 err = fuse_writeback_range(inode, offset, endbyte); 3200 if (err) 3201 goto out; 3202 } 3203 } 3204 3205 if (!(mode & FALLOC_FL_KEEP_SIZE) && 3206 offset + length > i_size_read(inode)) { 3207 err = inode_newsize_ok(inode, offset + length); 3208 if (err) 3209 goto out; 3210 } 3211 3212 if (!(mode & FALLOC_FL_KEEP_SIZE)) 3213 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); 3214 3215 args.opcode = FUSE_FALLOCATE; 3216 args.nodeid = ff->nodeid; 3217 args.in_numargs = 1; 3218 args.in_args[0].size = sizeof(inarg); 3219 args.in_args[0].value = &inarg; 3220 err = fuse_simple_request(fc, &args); 3221 if (err == -ENOSYS) { 3222 fc->no_fallocate = 1; 3223 err = -EOPNOTSUPP; 3224 } 3225 if (err) 3226 goto out; 3227 3228 /* we could have extended the file */ 3229 if (!(mode & FALLOC_FL_KEEP_SIZE)) { 3230 bool changed = fuse_write_update_size(inode, offset + length); 3231 3232 if (changed && fc->writeback_cache) 3233 file_update_time(file); 3234 } 3235 3236 if (mode & FALLOC_FL_PUNCH_HOLE) 3237 truncate_pagecache_range(inode, offset, offset + length - 1); 3238 3239 fuse_invalidate_attr(inode); 3240 3241 out: 3242 if (!(mode & FALLOC_FL_KEEP_SIZE)) 3243 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); 3244 3245 if (lock_inode) 3246 inode_unlock(inode); 3247 3248 return err; 3249 } 3250 3251 static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, 3252 
				      struct file *file_out, loff_t pos_out,
				      size_t len, unsigned int flags)
{
	struct fuse_file *ff_in = file_in->private_data;
	struct fuse_file *ff_out = file_out->private_data;
	struct inode *inode_in = file_inode(file_in);
	struct inode *inode_out = file_inode(file_out);
	struct fuse_inode *fi_out = get_fuse_inode(inode_out);
	struct fuse_conn *fc = ff_in->fc;
	FUSE_ARGS(args);
	struct fuse_copy_file_range_in inarg = {
		.fh_in = ff_in->fh,
		.off_in = pos_in,
		.nodeid_out = ff_out->nodeid,
		.fh_out = ff_out->fh,
		.off_out = pos_out,
		.len = len,
		.flags = flags
	};
	struct fuse_write_out outarg;
	ssize_t err;
	/* mark unstable when write-back is not used, and file_out gets
	 * extended */
	bool is_unstable = (!fc->writeback_cache) &&
			   ((pos_out + len) > inode_out->i_size);

	if (fc->no_copy_file_range)
		return -EOPNOTSUPP;

	if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
		return -EXDEV;

	inode_lock(inode_in);
	err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1);
	inode_unlock(inode_in);
	if (err)
		return err;

	inode_lock(inode_out);

	err = file_modified(file_out);
	if (err)
		goto out;

	/*
	 * Write out dirty pages in the destination file before sending the
	 * COPY request to userspace.  After the request is completed,
	 * truncate off pages (including partial ones) from the cache that
	 * have been copied, since these contain stale data at that point.
	 *
	 * This should be mostly correct, but if the COPY writes to partial
	 * pages (at the start or end) and the parts not covered by the COPY
	 * are written through a memory map after calling
	 * fuse_writeback_range(), then these partial page modifications will
	 * be lost on truncation.
	 *
	 * It is unlikely that someone would rely on such mixed style
	 * modifications.  Yet this gives fewer guarantees than if the
	 * copying were performed with write(2).
	 *
	 * To fix this, an i_mmap_sem-style lock could be used to prevent new
	 * faults while the copy is ongoing.
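	 *
	 * Accordingly, the truncate_inode_pages_range() call below rounds
	 * pos_out down and pos_out + outarg.size up to page boundaries,
	 * dropping every page cache page the copy may have touched.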
3313 */ 3314 err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1); 3315 if (err) 3316 goto out; 3317 3318 if (is_unstable) 3319 set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); 3320 3321 args.opcode = FUSE_COPY_FILE_RANGE; 3322 args.nodeid = ff_in->nodeid; 3323 args.in_numargs = 1; 3324 args.in_args[0].size = sizeof(inarg); 3325 args.in_args[0].value = &inarg; 3326 args.out_numargs = 1; 3327 args.out_args[0].size = sizeof(outarg); 3328 args.out_args[0].value = &outarg; 3329 err = fuse_simple_request(fc, &args); 3330 if (err == -ENOSYS) { 3331 fc->no_copy_file_range = 1; 3332 err = -EOPNOTSUPP; 3333 } 3334 if (err) 3335 goto out; 3336 3337 truncate_inode_pages_range(inode_out->i_mapping, 3338 ALIGN_DOWN(pos_out, PAGE_SIZE), 3339 ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1); 3340 3341 if (fc->writeback_cache) { 3342 fuse_write_update_size(inode_out, pos_out + outarg.size); 3343 file_update_time(file_out); 3344 } 3345 3346 fuse_invalidate_attr(inode_out); 3347 3348 err = outarg.size; 3349 out: 3350 if (is_unstable) 3351 clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); 3352 3353 inode_unlock(inode_out); 3354 file_accessed(file_in); 3355 3356 return err; 3357 } 3358 3359 static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off, 3360 struct file *dst_file, loff_t dst_off, 3361 size_t len, unsigned int flags) 3362 { 3363 ssize_t ret; 3364 3365 ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off, 3366 len, flags); 3367 3368 if (ret == -EOPNOTSUPP || ret == -EXDEV) 3369 ret = generic_copy_file_range(src_file, src_off, dst_file, 3370 dst_off, len, flags); 3371 return ret; 3372 } 3373 3374 static const struct file_operations fuse_file_operations = { 3375 .llseek = fuse_file_llseek, 3376 .read_iter = fuse_file_read_iter, 3377 .write_iter = fuse_file_write_iter, 3378 .mmap = fuse_file_mmap, 3379 .open = fuse_open, 3380 .flush = fuse_flush, 3381 .release = fuse_release, 3382 .fsync = fuse_fsync, 3383 .lock = fuse_file_lock, 3384 .flock = fuse_file_flock, 3385 .splice_read = generic_file_splice_read, 3386 .splice_write = iter_file_splice_write, 3387 .unlocked_ioctl = fuse_file_ioctl, 3388 .compat_ioctl = fuse_file_compat_ioctl, 3389 .poll = fuse_file_poll, 3390 .fallocate = fuse_file_fallocate, 3391 .copy_file_range = fuse_copy_file_range, 3392 }; 3393 3394 static const struct address_space_operations fuse_file_aops = { 3395 .readpage = fuse_readpage, 3396 .readahead = fuse_readahead, 3397 .writepage = fuse_writepage, 3398 .writepages = fuse_writepages, 3399 .launder_page = fuse_launder_page, 3400 .set_page_dirty = __set_page_dirty_nobuffers, 3401 .bmap = fuse_bmap, 3402 .direct_IO = fuse_direct_IO, 3403 .write_begin = fuse_write_begin, 3404 .write_end = fuse_write_end, 3405 }; 3406 3407 void fuse_init_file_inode(struct inode *inode) 3408 { 3409 struct fuse_inode *fi = get_fuse_inode(inode); 3410 3411 inode->i_fop = &fuse_file_operations; 3412 inode->i_data.a_ops = &fuse_file_aops; 3413 3414 INIT_LIST_HEAD(&fi->write_files); 3415 INIT_LIST_HEAD(&fi->queued_writes); 3416 fi->writectr = 0; 3417 init_waitqueue_head(&fi->page_waitq); 3418 fi->writepages = RB_ROOT; 3419 } 3420