1 /* handling of writes to regular files and writing back to the server 2 * 3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 4 * Written by David Howells (dhowells@redhat.com) 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public License 8 * as published by the Free Software Foundation; either version 9 * 2 of the License, or (at your option) any later version. 10 */ 11 #include <linux/backing-dev.h> 12 #include <linux/slab.h> 13 #include <linux/fs.h> 14 #include <linux/pagemap.h> 15 #include <linux/writeback.h> 16 #include <linux/pagevec.h> 17 #include "internal.h" 18 19 static int afs_write_back_from_locked_page(struct afs_writeback *wb, 20 struct page *page); 21 22 /* 23 * mark a page as having been made dirty and thus needing writeback 24 */ 25 int afs_set_page_dirty(struct page *page) 26 { 27 _enter(""); 28 return __set_page_dirty_nobuffers(page); 29 } 30 31 /* 32 * unlink a writeback record because its usage has reached zero 33 * - must be called with the wb->vnode->writeback_lock held 34 */ 35 static void afs_unlink_writeback(struct afs_writeback *wb) 36 { 37 struct afs_writeback *front; 38 struct afs_vnode *vnode = wb->vnode; 39 40 list_del_init(&wb->link); 41 if (!list_empty(&vnode->writebacks)) { 42 /* if an fsync rises to the front of the queue then wake it 43 * up */ 44 front = list_entry(vnode->writebacks.next, 45 struct afs_writeback, link); 46 if (front->state == AFS_WBACK_SYNCING) { 47 _debug("wake up sync"); 48 front->state = AFS_WBACK_COMPLETE; 49 wake_up(&front->waitq); 50 } 51 } 52 } 53 54 /* 55 * free a writeback record 56 */ 57 static void afs_free_writeback(struct afs_writeback *wb) 58 { 59 _enter(""); 60 key_put(wb->key); 61 kfree(wb); 62 } 63 64 /* 65 * dispose of a reference to a writeback record 66 */ 67 void afs_put_writeback(struct afs_writeback *wb) 68 { 69 struct afs_vnode *vnode = wb->vnode; 70 71 _enter("{%d}", wb->usage); 72 73 spin_lock(&vnode->writeback_lock); 74 if (--wb->usage == 0) 75 afs_unlink_writeback(wb); 76 else 77 wb = NULL; 78 spin_unlock(&vnode->writeback_lock); 79 if (wb) 80 afs_free_writeback(wb); 81 } 82 83 /* 84 * partly or wholly fill a page that's under preparation for writing 85 */ 86 static int afs_fill_page(struct afs_vnode *vnode, struct key *key, 87 loff_t pos, unsigned len, struct page *page) 88 { 89 loff_t i_size; 90 unsigned eof; 91 int ret; 92 93 _enter(",,%llu,%u", (unsigned long long)pos, len); 94 95 ASSERTCMP(len, <=, PAGE_CACHE_SIZE); 96 97 i_size = i_size_read(&vnode->vfs_inode); 98 if (pos + len > i_size) 99 eof = i_size; 100 else 101 eof = PAGE_CACHE_SIZE; 102 103 ret = afs_vnode_fetch_data(vnode, key, 0, eof, page); 104 if (ret < 0) { 105 if (ret == -ENOENT) { 106 _debug("got NOENT from server" 107 " - marking file deleted and stale"); 108 set_bit(AFS_VNODE_DELETED, &vnode->flags); 109 ret = -ESTALE; 110 } 111 } 112 113 _leave(" = %d", ret); 114 return ret; 115 } 116 117 /* 118 * prepare to perform part of a write to a page 119 */ 120 int afs_write_begin(struct file *file, struct address_space *mapping, 121 loff_t pos, unsigned len, unsigned flags, 122 struct page **pagep, void **fsdata) 123 { 124 struct afs_writeback *candidate, *wb; 125 struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); 126 struct page *page; 127 struct key *key = file->private_data; 128 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 129 unsigned to = from + len; 130 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 131 int ret; 132 133 _enter("{%x:%u},{%lx},%u,%u", 134 vnode->fid.vid, vnode->fid.vnode, index, from, to); 135 136 candidate = kzalloc(sizeof(*candidate), GFP_KERNEL); 137 if (!candidate) 138 return -ENOMEM; 139 candidate->vnode = vnode; 140 candidate->first = candidate->last = index; 141 candidate->offset_first = from; 142 candidate->to_last = to; 143 candidate->usage = 1; 144 candidate->state = AFS_WBACK_PENDING; 145 init_waitqueue_head(&candidate->waitq); 146 147 page = grab_cache_page_write_begin(mapping, index, flags); 148 if (!page) { 149 kfree(candidate); 150 return -ENOMEM; 151 } 152 *pagep = page; 153 /* page won't leak in error case: it eventually gets cleaned off LRU */ 154 155 if (!PageUptodate(page)) { 156 _debug("not up to date"); 157 ret = afs_fill_page(vnode, key, pos, len, page); 158 if (ret < 0) { 159 kfree(candidate); 160 _leave(" = %d [prep]", ret); 161 return ret; 162 } 163 SetPageUptodate(page); 164 } 165 166 try_again: 167 spin_lock(&vnode->writeback_lock); 168 169 /* see if this page is already pending a writeback under a suitable key 170 * - if so we can just join onto that one */ 171 wb = (struct afs_writeback *) page_private(page); 172 if (wb) { 173 if (wb->key == key && wb->state == AFS_WBACK_PENDING) 174 goto subsume_in_current_wb; 175 goto flush_conflicting_wb; 176 } 177 178 if (index > 0) { 179 /* see if we can find an already pending writeback that we can 180 * append this page to */ 181 list_for_each_entry(wb, &vnode->writebacks, link) { 182 if (wb->last == index - 1 && wb->key == key && 183 wb->state == AFS_WBACK_PENDING) 184 goto append_to_previous_wb; 185 } 186 } 187 188 list_add_tail(&candidate->link, &vnode->writebacks); 189 candidate->key = key_get(key); 190 spin_unlock(&vnode->writeback_lock); 191 SetPagePrivate(page); 192 set_page_private(page, (unsigned long) candidate); 193 _leave(" = 0 [new]"); 194 return 0; 195 196 subsume_in_current_wb: 197 _debug("subsume"); 198 ASSERTRANGE(wb->first, <=, index, <=, wb->last); 199 if (index == wb->first && from < wb->offset_first) 200 wb->offset_first = from; 201 if (index == wb->last && to > wb->to_last) 202 wb->to_last = to; 203 spin_unlock(&vnode->writeback_lock); 204 kfree(candidate); 205 _leave(" = 0 [sub]"); 206 return 0; 207 208 append_to_previous_wb: 209 _debug("append into %lx-%lx", wb->first, wb->last); 210 wb->usage++; 211 wb->last++; 212 wb->to_last = to; 213 spin_unlock(&vnode->writeback_lock); 214 SetPagePrivate(page); 215 set_page_private(page, (unsigned long) wb); 216 kfree(candidate); 217 _leave(" = 0 [app]"); 218 return 0; 219 220 /* the page is currently bound to another context, so if it's dirty we 221 * need to flush it before we can use the new context */ 222 flush_conflicting_wb: 223 _debug("flush conflict"); 224 if (wb->state == AFS_WBACK_PENDING) 225 wb->state = AFS_WBACK_CONFLICTING; 226 spin_unlock(&vnode->writeback_lock); 227 if (PageDirty(page)) { 228 ret = afs_write_back_from_locked_page(wb, page); 229 if (ret < 0) { 230 afs_put_writeback(candidate); 231 _leave(" = %d", ret); 232 return ret; 233 } 234 } 235 236 /* the page holds a ref on the writeback record */ 237 afs_put_writeback(wb); 238 set_page_private(page, 0); 239 ClearPagePrivate(page); 240 goto try_again; 241 } 242 243 /* 244 * finalise part of a write to a page 245 */ 246 int afs_write_end(struct file *file, struct address_space *mapping, 247 loff_t pos, unsigned len, unsigned copied, 248 struct page *page, void *fsdata) 249 { 250 struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); 251 loff_t i_size, maybe_i_size; 252 253 _enter("{%x:%u},{%lx}", 254 vnode->fid.vid, vnode->fid.vnode, page->index); 255 256 maybe_i_size = pos + copied; 257 258 i_size = i_size_read(&vnode->vfs_inode); 259 if (maybe_i_size > i_size) { 260 spin_lock(&vnode->writeback_lock); 261 i_size = i_size_read(&vnode->vfs_inode); 262 if (maybe_i_size > i_size) 263 i_size_write(&vnode->vfs_inode, maybe_i_size); 264 spin_unlock(&vnode->writeback_lock); 265 } 266 267 set_page_dirty(page); 268 if (PageDirty(page)) 269 _debug("dirtied"); 270 unlock_page(page); 271 page_cache_release(page); 272 273 return copied; 274 } 275 276 /* 277 * kill all the pages in the given range 278 */ 279 static void afs_kill_pages(struct afs_vnode *vnode, bool error, 280 pgoff_t first, pgoff_t last) 281 { 282 struct pagevec pv; 283 unsigned count, loop; 284 285 _enter("{%x:%u},%lx-%lx", 286 vnode->fid.vid, vnode->fid.vnode, first, last); 287 288 pagevec_init(&pv, 0); 289 290 do { 291 _debug("kill %lx-%lx", first, last); 292 293 count = last - first + 1; 294 if (count > PAGEVEC_SIZE) 295 count = PAGEVEC_SIZE; 296 pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping, 297 first, count, pv.pages); 298 ASSERTCMP(pv.nr, ==, count); 299 300 for (loop = 0; loop < count; loop++) { 301 ClearPageUptodate(pv.pages[loop]); 302 if (error) 303 SetPageError(pv.pages[loop]); 304 end_page_writeback(pv.pages[loop]); 305 } 306 307 __pagevec_release(&pv); 308 } while (first < last); 309 310 _leave(""); 311 } 312 313 /* 314 * synchronously write back the locked page and any subsequent non-locked dirty 315 * pages also covered by the same writeback record 316 */ 317 static int afs_write_back_from_locked_page(struct afs_writeback *wb, 318 struct page *primary_page) 319 { 320 struct page *pages[8], *page; 321 unsigned long count; 322 unsigned n, offset, to; 323 pgoff_t start, first, last; 324 int loop, ret; 325 326 _enter(",%lx", primary_page->index); 327 328 count = 1; 329 if (!clear_page_dirty_for_io(primary_page)) 330 BUG(); 331 if (test_set_page_writeback(primary_page)) 332 BUG(); 333 334 /* find all consecutive lockable dirty pages, stopping when we find a 335 * page that is not immediately lockable, is not dirty or is missing, 336 * or we reach the end of the range */ 337 start = primary_page->index; 338 if (start >= wb->last) 339 goto no_more; 340 start++; 341 do { 342 _debug("more %lx [%lx]", start, count); 343 n = wb->last - start + 1; 344 if (n > ARRAY_SIZE(pages)) 345 n = ARRAY_SIZE(pages); 346 n = find_get_pages_contig(wb->vnode->vfs_inode.i_mapping, 347 start, n, pages); 348 _debug("fgpc %u", n); 349 if (n == 0) 350 goto no_more; 351 if (pages[0]->index != start) { 352 do { 353 put_page(pages[--n]); 354 } while (n > 0); 355 goto no_more; 356 } 357 358 for (loop = 0; loop < n; loop++) { 359 page = pages[loop]; 360 if (page->index > wb->last) 361 break; 362 if (!trylock_page(page)) 363 break; 364 if (!PageDirty(page) || 365 page_private(page) != (unsigned long) wb) { 366 unlock_page(page); 367 break; 368 } 369 if (!clear_page_dirty_for_io(page)) 370 BUG(); 371 if (test_set_page_writeback(page)) 372 BUG(); 373 unlock_page(page); 374 put_page(page); 375 } 376 count += loop; 377 if (loop < n) { 378 for (; loop < n; loop++) 379 put_page(pages[loop]); 380 goto no_more; 381 } 382 383 start += loop; 384 } while (start <= wb->last && count < 65536); 385 386 no_more: 387 /* we now have a contiguous set of dirty pages, each with writeback set 388 * and the dirty mark cleared; the first page is locked and must remain 389 * so, all the rest are unlocked */ 390 first = primary_page->index; 391 last = first + count - 1; 392 393 offset = (first == wb->first) ? wb->offset_first : 0; 394 to = (last == wb->last) ? wb->to_last : PAGE_SIZE; 395 396 _debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to); 397 398 ret = afs_vnode_store_data(wb, first, last, offset, to); 399 if (ret < 0) { 400 switch (ret) { 401 case -EDQUOT: 402 case -ENOSPC: 403 set_bit(AS_ENOSPC, 404 &wb->vnode->vfs_inode.i_mapping->flags); 405 break; 406 case -EROFS: 407 case -EIO: 408 case -EREMOTEIO: 409 case -EFBIG: 410 case -ENOENT: 411 case -ENOMEDIUM: 412 case -ENXIO: 413 afs_kill_pages(wb->vnode, true, first, last); 414 set_bit(AS_EIO, &wb->vnode->vfs_inode.i_mapping->flags); 415 break; 416 case -EACCES: 417 case -EPERM: 418 case -ENOKEY: 419 case -EKEYEXPIRED: 420 case -EKEYREJECTED: 421 case -EKEYREVOKED: 422 afs_kill_pages(wb->vnode, false, first, last); 423 break; 424 default: 425 break; 426 } 427 } else { 428 ret = count; 429 } 430 431 _leave(" = %d", ret); 432 return ret; 433 } 434 435 /* 436 * write a page back to the server 437 * - the caller locked the page for us 438 */ 439 int afs_writepage(struct page *page, struct writeback_control *wbc) 440 { 441 struct afs_writeback *wb; 442 int ret; 443 444 _enter("{%lx},", page->index); 445 446 wb = (struct afs_writeback *) page_private(page); 447 ASSERT(wb != NULL); 448 449 ret = afs_write_back_from_locked_page(wb, page); 450 unlock_page(page); 451 if (ret < 0) { 452 _leave(" = %d", ret); 453 return 0; 454 } 455 456 wbc->nr_to_write -= ret; 457 458 _leave(" = 0"); 459 return 0; 460 } 461 462 /* 463 * write a region of pages back to the server 464 */ 465 static int afs_writepages_region(struct address_space *mapping, 466 struct writeback_control *wbc, 467 pgoff_t index, pgoff_t end, pgoff_t *_next) 468 { 469 struct afs_writeback *wb; 470 struct page *page; 471 int ret, n; 472 473 _enter(",,%lx,%lx,", index, end); 474 475 do { 476 n = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY, 477 1, &page); 478 if (!n) 479 break; 480 481 _debug("wback %lx", page->index); 482 483 if (page->index > end) { 484 *_next = index; 485 page_cache_release(page); 486 _leave(" = 0 [%lx]", *_next); 487 return 0; 488 } 489 490 /* at this point we hold neither mapping->tree_lock nor lock on 491 * the page itself: the page may be truncated or invalidated 492 * (changing page->mapping to NULL), or even swizzled back from 493 * swapper_space to tmpfs file mapping 494 */ 495 lock_page(page); 496 497 if (page->mapping != mapping) { 498 unlock_page(page); 499 page_cache_release(page); 500 continue; 501 } 502 503 if (wbc->sync_mode != WB_SYNC_NONE) 504 wait_on_page_writeback(page); 505 506 if (PageWriteback(page) || !PageDirty(page)) { 507 unlock_page(page); 508 continue; 509 } 510 511 wb = (struct afs_writeback *) page_private(page); 512 ASSERT(wb != NULL); 513 514 spin_lock(&wb->vnode->writeback_lock); 515 wb->state = AFS_WBACK_WRITING; 516 spin_unlock(&wb->vnode->writeback_lock); 517 518 ret = afs_write_back_from_locked_page(wb, page); 519 unlock_page(page); 520 page_cache_release(page); 521 if (ret < 0) { 522 _leave(" = %d", ret); 523 return ret; 524 } 525 526 wbc->nr_to_write -= ret; 527 528 cond_resched(); 529 } while (index < end && wbc->nr_to_write > 0); 530 531 *_next = index; 532 _leave(" = 0 [%lx]", *_next); 533 return 0; 534 } 535 536 /* 537 * write some of the pending data back to the server 538 */ 539 int afs_writepages(struct address_space *mapping, 540 struct writeback_control *wbc) 541 { 542 pgoff_t start, end, next; 543 int ret; 544 545 _enter(""); 546 547 if (wbc->range_cyclic) { 548 start = mapping->writeback_index; 549 end = -1; 550 ret = afs_writepages_region(mapping, wbc, start, end, &next); 551 if (start > 0 && wbc->nr_to_write > 0 && ret == 0) 552 ret = afs_writepages_region(mapping, wbc, 0, start, 553 &next); 554 mapping->writeback_index = next; 555 } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { 556 end = (pgoff_t)(LLONG_MAX >> PAGE_CACHE_SHIFT); 557 ret = afs_writepages_region(mapping, wbc, 0, end, &next); 558 if (wbc->nr_to_write > 0) 559 mapping->writeback_index = next; 560 } else { 561 start = wbc->range_start >> PAGE_CACHE_SHIFT; 562 end = wbc->range_end >> PAGE_CACHE_SHIFT; 563 ret = afs_writepages_region(mapping, wbc, start, end, &next); 564 } 565 566 _leave(" = %d", ret); 567 return ret; 568 } 569 570 /* 571 * completion of write to server 572 */ 573 void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) 574 { 575 struct afs_writeback *wb = call->wb; 576 struct pagevec pv; 577 unsigned count, loop; 578 pgoff_t first = call->first, last = call->last; 579 bool free_wb; 580 581 _enter("{%x:%u},{%lx-%lx}", 582 vnode->fid.vid, vnode->fid.vnode, first, last); 583 584 ASSERT(wb != NULL); 585 586 pagevec_init(&pv, 0); 587 588 do { 589 _debug("done %lx-%lx", first, last); 590 591 count = last - first + 1; 592 if (count > PAGEVEC_SIZE) 593 count = PAGEVEC_SIZE; 594 pv.nr = find_get_pages_contig(call->mapping, first, count, 595 pv.pages); 596 ASSERTCMP(pv.nr, ==, count); 597 598 spin_lock(&vnode->writeback_lock); 599 for (loop = 0; loop < count; loop++) { 600 struct page *page = pv.pages[loop]; 601 end_page_writeback(page); 602 if (page_private(page) == (unsigned long) wb) { 603 set_page_private(page, 0); 604 ClearPagePrivate(page); 605 wb->usage--; 606 } 607 } 608 free_wb = false; 609 if (wb->usage == 0) { 610 afs_unlink_writeback(wb); 611 free_wb = true; 612 } 613 spin_unlock(&vnode->writeback_lock); 614 first += count; 615 if (free_wb) { 616 afs_free_writeback(wb); 617 wb = NULL; 618 } 619 620 __pagevec_release(&pv); 621 } while (first <= last); 622 623 _leave(""); 624 } 625 626 /* 627 * write to an AFS file 628 */ 629 ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov, 630 unsigned long nr_segs, loff_t pos) 631 { 632 struct dentry *dentry = iocb->ki_filp->f_path.dentry; 633 struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); 634 ssize_t result; 635 size_t count = iov_length(iov, nr_segs); 636 637 _enter("{%x.%u},{%zu},%lu,", 638 vnode->fid.vid, vnode->fid.vnode, count, nr_segs); 639 640 if (IS_SWAPFILE(&vnode->vfs_inode)) { 641 printk(KERN_INFO 642 "AFS: Attempt to write to active swap file!\n"); 643 return -EBUSY; 644 } 645 646 if (!count) 647 return 0; 648 649 result = generic_file_aio_write(iocb, iov, nr_segs, pos); 650 if (IS_ERR_VALUE(result)) { 651 _leave(" = %zd", result); 652 return result; 653 } 654 655 _leave(" = %zd", result); 656 return result; 657 } 658 659 /* 660 * flush the vnode to the fileserver 661 */ 662 int afs_writeback_all(struct afs_vnode *vnode) 663 { 664 struct address_space *mapping = vnode->vfs_inode.i_mapping; 665 struct writeback_control wbc = { 666 .sync_mode = WB_SYNC_ALL, 667 .nr_to_write = LONG_MAX, 668 .range_cyclic = 1, 669 }; 670 int ret; 671 672 _enter(""); 673 674 ret = mapping->a_ops->writepages(mapping, &wbc); 675 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 676 677 _leave(" = %d", ret); 678 return ret; 679 } 680 681 /* 682 * flush any dirty pages for this process, and check for write errors. 683 * - the return status from this call provides a reliable indication of 684 * whether any write errors occurred for this process. 685 */ 686 int afs_fsync(struct file *file, int datasync) 687 { 688 struct dentry *dentry = file->f_path.dentry; 689 struct afs_writeback *wb, *xwb; 690 struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); 691 int ret; 692 693 _enter("{%x:%u},{n=%s},%d", 694 vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name, 695 datasync); 696 697 /* use a writeback record as a marker in the queue - when this reaches 698 * the front of the queue, all the outstanding writes are either 699 * completed or rejected */ 700 wb = kzalloc(sizeof(*wb), GFP_KERNEL); 701 if (!wb) 702 return -ENOMEM; 703 wb->vnode = vnode; 704 wb->first = 0; 705 wb->last = -1; 706 wb->offset_first = 0; 707 wb->to_last = PAGE_SIZE; 708 wb->usage = 1; 709 wb->state = AFS_WBACK_SYNCING; 710 init_waitqueue_head(&wb->waitq); 711 712 spin_lock(&vnode->writeback_lock); 713 list_for_each_entry(xwb, &vnode->writebacks, link) { 714 if (xwb->state == AFS_WBACK_PENDING) 715 xwb->state = AFS_WBACK_CONFLICTING; 716 } 717 list_add_tail(&wb->link, &vnode->writebacks); 718 spin_unlock(&vnode->writeback_lock); 719 720 /* push all the outstanding writebacks to the server */ 721 ret = afs_writeback_all(vnode); 722 if (ret < 0) { 723 afs_put_writeback(wb); 724 _leave(" = %d [wb]", ret); 725 return ret; 726 } 727 728 /* wait for the preceding writes to actually complete */ 729 ret = wait_event_interruptible(wb->waitq, 730 wb->state == AFS_WBACK_COMPLETE || 731 vnode->writebacks.next == &wb->link); 732 afs_put_writeback(wb); 733 _leave(" = %d", ret); 734 return ret; 735 } 736 737 /* 738 * notification that a previously read-only page is about to become writable 739 * - if it returns an error, the caller will deliver a bus error signal 740 */ 741 int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page) 742 { 743 struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host); 744 745 _enter("{{%x:%u}},{%lx}", 746 vnode->fid.vid, vnode->fid.vnode, page->index); 747 748 /* wait for the page to be written to the cache before we allow it to 749 * be modified */ 750 #ifdef CONFIG_AFS_FSCACHE 751 fscache_wait_on_page_write(vnode->cache, page); 752 #endif 753 754 _leave(" = 0"); 755 return 0; 756 } 757