1 /* handling of writes to regular files and writing back to the server 2 * 3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 4 * Written by David Howells (dhowells@redhat.com) 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public License 8 * as published by the Free Software Foundation; either version 9 * 2 of the License, or (at your option) any later version. 10 */ 11 #include <linux/backing-dev.h> 12 #include <linux/slab.h> 13 #include <linux/fs.h> 14 #include <linux/pagemap.h> 15 #include <linux/writeback.h> 16 #include <linux/pagevec.h> 17 #include "internal.h" 18 19 static int afs_write_back_from_locked_page(struct afs_writeback *wb, 20 struct page *page); 21 22 /* 23 * mark a page as having been made dirty and thus needing writeback 24 */ 25 int afs_set_page_dirty(struct page *page) 26 { 27 _enter(""); 28 return __set_page_dirty_nobuffers(page); 29 } 30 31 /* 32 * unlink a writeback record because its usage has reached zero 33 * - must be called with the wb->vnode->writeback_lock held 34 */ 35 static void afs_unlink_writeback(struct afs_writeback *wb) 36 { 37 struct afs_writeback *front; 38 struct afs_vnode *vnode = wb->vnode; 39 40 list_del_init(&wb->link); 41 if (!list_empty(&vnode->writebacks)) { 42 /* if an fsync rises to the front of the queue then wake it 43 * up */ 44 front = list_entry(vnode->writebacks.next, 45 struct afs_writeback, link); 46 if (front->state == AFS_WBACK_SYNCING) { 47 _debug("wake up sync"); 48 front->state = AFS_WBACK_COMPLETE; 49 wake_up(&front->waitq); 50 } 51 } 52 } 53 54 /* 55 * free a writeback record 56 */ 57 static void afs_free_writeback(struct afs_writeback *wb) 58 { 59 _enter(""); 60 key_put(wb->key); 61 kfree(wb); 62 } 63 64 /* 65 * dispose of a reference to a writeback record 66 */ 67 void afs_put_writeback(struct afs_writeback *wb) 68 { 69 struct afs_vnode *vnode = wb->vnode; 70 71 _enter("{%d}", wb->usage); 72 73 spin_lock(&vnode->writeback_lock); 74 if (--wb->usage == 0) 75 afs_unlink_writeback(wb); 76 else 77 wb = NULL; 78 spin_unlock(&vnode->writeback_lock); 79 if (wb) 80 afs_free_writeback(wb); 81 } 82 83 /* 84 * partly or wholly fill a page that's under preparation for writing 85 */ 86 static int afs_fill_page(struct afs_vnode *vnode, struct key *key, 87 loff_t pos, unsigned len, struct page *page) 88 { 89 loff_t i_size; 90 unsigned eof; 91 int ret; 92 93 _enter(",,%llu,%u", (unsigned long long)pos, len); 94 95 ASSERTCMP(len, <=, PAGE_CACHE_SIZE); 96 97 i_size = i_size_read(&vnode->vfs_inode); 98 if (pos + len > i_size) 99 eof = i_size; 100 else 101 eof = PAGE_CACHE_SIZE; 102 103 ret = afs_vnode_fetch_data(vnode, key, 0, eof, page); 104 if (ret < 0) { 105 if (ret == -ENOENT) { 106 _debug("got NOENT from server" 107 " - marking file deleted and stale"); 108 set_bit(AFS_VNODE_DELETED, &vnode->flags); 109 ret = -ESTALE; 110 } 111 } 112 113 _leave(" = %d", ret); 114 return ret; 115 } 116 117 /* 118 * prepare to perform part of a write to a page 119 */ 120 int afs_write_begin(struct file *file, struct address_space *mapping, 121 loff_t pos, unsigned len, unsigned flags, 122 struct page **pagep, void **fsdata) 123 { 124 struct afs_writeback *candidate, *wb; 125 struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); 126 struct page *page; 127 struct key *key = file->private_data; 128 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 129 unsigned to = from + len; 130 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 131 int ret; 132 133 _enter("{%x:%u},{%lx},%u,%u", 134 vnode->fid.vid, vnode->fid.vnode, index, from, to); 135 136 candidate = kzalloc(sizeof(*candidate), GFP_KERNEL); 137 if (!candidate) 138 return -ENOMEM; 139 candidate->vnode = vnode; 140 candidate->first = candidate->last = index; 141 candidate->offset_first = from; 142 candidate->to_last = to; 143 INIT_LIST_HEAD(&candidate->link); 144 candidate->usage = 1; 145 candidate->state = AFS_WBACK_PENDING; 146 init_waitqueue_head(&candidate->waitq); 147 148 page = grab_cache_page_write_begin(mapping, index, flags); 149 if (!page) { 150 kfree(candidate); 151 return -ENOMEM; 152 } 153 *pagep = page; 154 /* page won't leak in error case: it eventually gets cleaned off LRU */ 155 156 if (!PageUptodate(page)) { 157 _debug("not up to date"); 158 ret = afs_fill_page(vnode, key, pos, len, page); 159 if (ret < 0) { 160 kfree(candidate); 161 _leave(" = %d [prep]", ret); 162 return ret; 163 } 164 SetPageUptodate(page); 165 } 166 167 try_again: 168 spin_lock(&vnode->writeback_lock); 169 170 /* see if this page is already pending a writeback under a suitable key 171 * - if so we can just join onto that one */ 172 wb = (struct afs_writeback *) page_private(page); 173 if (wb) { 174 if (wb->key == key && wb->state == AFS_WBACK_PENDING) 175 goto subsume_in_current_wb; 176 goto flush_conflicting_wb; 177 } 178 179 if (index > 0) { 180 /* see if we can find an already pending writeback that we can 181 * append this page to */ 182 list_for_each_entry(wb, &vnode->writebacks, link) { 183 if (wb->last == index - 1 && wb->key == key && 184 wb->state == AFS_WBACK_PENDING) 185 goto append_to_previous_wb; 186 } 187 } 188 189 list_add_tail(&candidate->link, &vnode->writebacks); 190 candidate->key = key_get(key); 191 spin_unlock(&vnode->writeback_lock); 192 SetPagePrivate(page); 193 set_page_private(page, (unsigned long) candidate); 194 _leave(" = 0 [new]"); 195 return 0; 196 197 subsume_in_current_wb: 198 _debug("subsume"); 199 ASSERTRANGE(wb->first, <=, index, <=, wb->last); 200 if (index == wb->first && from < wb->offset_first) 201 wb->offset_first = from; 202 if (index == wb->last && to > wb->to_last) 203 wb->to_last = to; 204 spin_unlock(&vnode->writeback_lock); 205 kfree(candidate); 206 _leave(" = 0 [sub]"); 207 return 0; 208 209 append_to_previous_wb: 210 _debug("append into %lx-%lx", wb->first, wb->last); 211 wb->usage++; 212 wb->last++; 213 wb->to_last = to; 214 spin_unlock(&vnode->writeback_lock); 215 SetPagePrivate(page); 216 set_page_private(page, (unsigned long) wb); 217 kfree(candidate); 218 _leave(" = 0 [app]"); 219 return 0; 220 221 /* the page is currently bound to another context, so if it's dirty we 222 * need to flush it before we can use the new context */ 223 flush_conflicting_wb: 224 _debug("flush conflict"); 225 if (wb->state == AFS_WBACK_PENDING) 226 wb->state = AFS_WBACK_CONFLICTING; 227 spin_unlock(&vnode->writeback_lock); 228 if (PageDirty(page)) { 229 ret = afs_write_back_from_locked_page(wb, page); 230 if (ret < 0) { 231 afs_put_writeback(candidate); 232 _leave(" = %d", ret); 233 return ret; 234 } 235 } 236 237 /* the page holds a ref on the writeback record */ 238 afs_put_writeback(wb); 239 set_page_private(page, 0); 240 ClearPagePrivate(page); 241 goto try_again; 242 } 243 244 /* 245 * finalise part of a write to a page 246 */ 247 int afs_write_end(struct file *file, struct address_space *mapping, 248 loff_t pos, unsigned len, unsigned copied, 249 struct page *page, void *fsdata) 250 { 251 struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); 252 loff_t i_size, maybe_i_size; 253 254 _enter("{%x:%u},{%lx}", 255 vnode->fid.vid, vnode->fid.vnode, page->index); 256 257 maybe_i_size = pos + copied; 258 259 i_size = i_size_read(&vnode->vfs_inode); 260 if (maybe_i_size > i_size) { 261 spin_lock(&vnode->writeback_lock); 262 i_size = i_size_read(&vnode->vfs_inode); 263 if (maybe_i_size > i_size) 264 i_size_write(&vnode->vfs_inode, maybe_i_size); 265 spin_unlock(&vnode->writeback_lock); 266 } 267 268 set_page_dirty(page); 269 if (PageDirty(page)) 270 _debug("dirtied"); 271 unlock_page(page); 272 page_cache_release(page); 273 274 return copied; 275 } 276 277 /* 278 * kill all the pages in the given range 279 */ 280 static void afs_kill_pages(struct afs_vnode *vnode, bool error, 281 pgoff_t first, pgoff_t last) 282 { 283 struct pagevec pv; 284 unsigned count, loop; 285 286 _enter("{%x:%u},%lx-%lx", 287 vnode->fid.vid, vnode->fid.vnode, first, last); 288 289 pagevec_init(&pv, 0); 290 291 do { 292 _debug("kill %lx-%lx", first, last); 293 294 count = last - first + 1; 295 if (count > PAGEVEC_SIZE) 296 count = PAGEVEC_SIZE; 297 pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping, 298 first, count, pv.pages); 299 ASSERTCMP(pv.nr, ==, count); 300 301 for (loop = 0; loop < count; loop++) { 302 ClearPageUptodate(pv.pages[loop]); 303 if (error) 304 SetPageError(pv.pages[loop]); 305 end_page_writeback(pv.pages[loop]); 306 } 307 308 __pagevec_release(&pv); 309 } while (first < last); 310 311 _leave(""); 312 } 313 314 /* 315 * synchronously write back the locked page and any subsequent non-locked dirty 316 * pages also covered by the same writeback record 317 */ 318 static int afs_write_back_from_locked_page(struct afs_writeback *wb, 319 struct page *primary_page) 320 { 321 struct page *pages[8], *page; 322 unsigned long count; 323 unsigned n, offset, to; 324 pgoff_t start, first, last; 325 int loop, ret; 326 327 _enter(",%lx", primary_page->index); 328 329 count = 1; 330 if (!clear_page_dirty_for_io(primary_page)) 331 BUG(); 332 if (test_set_page_writeback(primary_page)) 333 BUG(); 334 335 /* find all consecutive lockable dirty pages, stopping when we find a 336 * page that is not immediately lockable, is not dirty or is missing, 337 * or we reach the end of the range */ 338 start = primary_page->index; 339 if (start >= wb->last) 340 goto no_more; 341 start++; 342 do { 343 _debug("more %lx [%lx]", start, count); 344 n = wb->last - start + 1; 345 if (n > ARRAY_SIZE(pages)) 346 n = ARRAY_SIZE(pages); 347 n = find_get_pages_contig(wb->vnode->vfs_inode.i_mapping, 348 start, n, pages); 349 _debug("fgpc %u", n); 350 if (n == 0) 351 goto no_more; 352 if (pages[0]->index != start) { 353 do { 354 put_page(pages[--n]); 355 } while (n > 0); 356 goto no_more; 357 } 358 359 for (loop = 0; loop < n; loop++) { 360 page = pages[loop]; 361 if (page->index > wb->last) 362 break; 363 if (!trylock_page(page)) 364 break; 365 if (!PageDirty(page) || 366 page_private(page) != (unsigned long) wb) { 367 unlock_page(page); 368 break; 369 } 370 if (!clear_page_dirty_for_io(page)) 371 BUG(); 372 if (test_set_page_writeback(page)) 373 BUG(); 374 unlock_page(page); 375 put_page(page); 376 } 377 count += loop; 378 if (loop < n) { 379 for (; loop < n; loop++) 380 put_page(pages[loop]); 381 goto no_more; 382 } 383 384 start += loop; 385 } while (start <= wb->last && count < 65536); 386 387 no_more: 388 /* we now have a contiguous set of dirty pages, each with writeback set 389 * and the dirty mark cleared; the first page is locked and must remain 390 * so, all the rest are unlocked */ 391 first = primary_page->index; 392 last = first + count - 1; 393 394 offset = (first == wb->first) ? wb->offset_first : 0; 395 to = (last == wb->last) ? wb->to_last : PAGE_SIZE; 396 397 _debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to); 398 399 ret = afs_vnode_store_data(wb, first, last, offset, to); 400 if (ret < 0) { 401 switch (ret) { 402 case -EDQUOT: 403 case -ENOSPC: 404 set_bit(AS_ENOSPC, 405 &wb->vnode->vfs_inode.i_mapping->flags); 406 break; 407 case -EROFS: 408 case -EIO: 409 case -EREMOTEIO: 410 case -EFBIG: 411 case -ENOENT: 412 case -ENOMEDIUM: 413 case -ENXIO: 414 afs_kill_pages(wb->vnode, true, first, last); 415 set_bit(AS_EIO, &wb->vnode->vfs_inode.i_mapping->flags); 416 break; 417 case -EACCES: 418 case -EPERM: 419 case -ENOKEY: 420 case -EKEYEXPIRED: 421 case -EKEYREJECTED: 422 case -EKEYREVOKED: 423 afs_kill_pages(wb->vnode, false, first, last); 424 break; 425 default: 426 break; 427 } 428 } else { 429 ret = count; 430 } 431 432 _leave(" = %d", ret); 433 return ret; 434 } 435 436 /* 437 * write a page back to the server 438 * - the caller locked the page for us 439 */ 440 int afs_writepage(struct page *page, struct writeback_control *wbc) 441 { 442 struct afs_writeback *wb; 443 int ret; 444 445 _enter("{%lx},", page->index); 446 447 wb = (struct afs_writeback *) page_private(page); 448 ASSERT(wb != NULL); 449 450 ret = afs_write_back_from_locked_page(wb, page); 451 unlock_page(page); 452 if (ret < 0) { 453 _leave(" = %d", ret); 454 return 0; 455 } 456 457 wbc->nr_to_write -= ret; 458 459 _leave(" = 0"); 460 return 0; 461 } 462 463 /* 464 * write a region of pages back to the server 465 */ 466 static int afs_writepages_region(struct address_space *mapping, 467 struct writeback_control *wbc, 468 pgoff_t index, pgoff_t end, pgoff_t *_next) 469 { 470 struct afs_writeback *wb; 471 struct page *page; 472 int ret, n; 473 474 _enter(",,%lx,%lx,", index, end); 475 476 do { 477 n = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY, 478 1, &page); 479 if (!n) 480 break; 481 482 _debug("wback %lx", page->index); 483 484 if (page->index > end) { 485 *_next = index; 486 page_cache_release(page); 487 _leave(" = 0 [%lx]", *_next); 488 return 0; 489 } 490 491 /* at this point we hold neither mapping->tree_lock nor lock on 492 * the page itself: the page may be truncated or invalidated 493 * (changing page->mapping to NULL), or even swizzled back from 494 * swapper_space to tmpfs file mapping 495 */ 496 lock_page(page); 497 498 if (page->mapping != mapping) { 499 unlock_page(page); 500 page_cache_release(page); 501 continue; 502 } 503 504 if (wbc->sync_mode != WB_SYNC_NONE) 505 wait_on_page_writeback(page); 506 507 if (PageWriteback(page) || !PageDirty(page)) { 508 unlock_page(page); 509 continue; 510 } 511 512 wb = (struct afs_writeback *) page_private(page); 513 ASSERT(wb != NULL); 514 515 spin_lock(&wb->vnode->writeback_lock); 516 wb->state = AFS_WBACK_WRITING; 517 spin_unlock(&wb->vnode->writeback_lock); 518 519 ret = afs_write_back_from_locked_page(wb, page); 520 unlock_page(page); 521 page_cache_release(page); 522 if (ret < 0) { 523 _leave(" = %d", ret); 524 return ret; 525 } 526 527 wbc->nr_to_write -= ret; 528 529 cond_resched(); 530 } while (index < end && wbc->nr_to_write > 0); 531 532 *_next = index; 533 _leave(" = 0 [%lx]", *_next); 534 return 0; 535 } 536 537 /* 538 * write some of the pending data back to the server 539 */ 540 int afs_writepages(struct address_space *mapping, 541 struct writeback_control *wbc) 542 { 543 pgoff_t start, end, next; 544 int ret; 545 546 _enter(""); 547 548 if (wbc->range_cyclic) { 549 start = mapping->writeback_index; 550 end = -1; 551 ret = afs_writepages_region(mapping, wbc, start, end, &next); 552 if (start > 0 && wbc->nr_to_write > 0 && ret == 0) 553 ret = afs_writepages_region(mapping, wbc, 0, start, 554 &next); 555 mapping->writeback_index = next; 556 } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { 557 end = (pgoff_t)(LLONG_MAX >> PAGE_CACHE_SHIFT); 558 ret = afs_writepages_region(mapping, wbc, 0, end, &next); 559 if (wbc->nr_to_write > 0) 560 mapping->writeback_index = next; 561 } else { 562 start = wbc->range_start >> PAGE_CACHE_SHIFT; 563 end = wbc->range_end >> PAGE_CACHE_SHIFT; 564 ret = afs_writepages_region(mapping, wbc, start, end, &next); 565 } 566 567 _leave(" = %d", ret); 568 return ret; 569 } 570 571 /* 572 * completion of write to server 573 */ 574 void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) 575 { 576 struct afs_writeback *wb = call->wb; 577 struct pagevec pv; 578 unsigned count, loop; 579 pgoff_t first = call->first, last = call->last; 580 bool free_wb; 581 582 _enter("{%x:%u},{%lx-%lx}", 583 vnode->fid.vid, vnode->fid.vnode, first, last); 584 585 ASSERT(wb != NULL); 586 587 pagevec_init(&pv, 0); 588 589 do { 590 _debug("done %lx-%lx", first, last); 591 592 count = last - first + 1; 593 if (count > PAGEVEC_SIZE) 594 count = PAGEVEC_SIZE; 595 pv.nr = find_get_pages_contig(call->mapping, first, count, 596 pv.pages); 597 ASSERTCMP(pv.nr, ==, count); 598 599 spin_lock(&vnode->writeback_lock); 600 for (loop = 0; loop < count; loop++) { 601 struct page *page = pv.pages[loop]; 602 end_page_writeback(page); 603 if (page_private(page) == (unsigned long) wb) { 604 set_page_private(page, 0); 605 ClearPagePrivate(page); 606 wb->usage--; 607 } 608 } 609 free_wb = false; 610 if (wb->usage == 0) { 611 afs_unlink_writeback(wb); 612 free_wb = true; 613 } 614 spin_unlock(&vnode->writeback_lock); 615 first += count; 616 if (free_wb) { 617 afs_free_writeback(wb); 618 wb = NULL; 619 } 620 621 __pagevec_release(&pv); 622 } while (first <= last); 623 624 _leave(""); 625 } 626 627 /* 628 * write to an AFS file 629 */ 630 ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov, 631 unsigned long nr_segs, loff_t pos) 632 { 633 struct dentry *dentry = iocb->ki_filp->f_path.dentry; 634 struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); 635 ssize_t result; 636 size_t count = iov_length(iov, nr_segs); 637 638 _enter("{%x.%u},{%zu},%lu,", 639 vnode->fid.vid, vnode->fid.vnode, count, nr_segs); 640 641 if (IS_SWAPFILE(&vnode->vfs_inode)) { 642 printk(KERN_INFO 643 "AFS: Attempt to write to active swap file!\n"); 644 return -EBUSY; 645 } 646 647 if (!count) 648 return 0; 649 650 result = generic_file_aio_write(iocb, iov, nr_segs, pos); 651 if (IS_ERR_VALUE(result)) { 652 _leave(" = %zd", result); 653 return result; 654 } 655 656 _leave(" = %zd", result); 657 return result; 658 } 659 660 /* 661 * flush the vnode to the fileserver 662 */ 663 int afs_writeback_all(struct afs_vnode *vnode) 664 { 665 struct address_space *mapping = vnode->vfs_inode.i_mapping; 666 struct writeback_control wbc = { 667 .sync_mode = WB_SYNC_ALL, 668 .nr_to_write = LONG_MAX, 669 .range_cyclic = 1, 670 }; 671 int ret; 672 673 _enter(""); 674 675 ret = mapping->a_ops->writepages(mapping, &wbc); 676 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 677 678 _leave(" = %d", ret); 679 return ret; 680 } 681 682 /* 683 * flush any dirty pages for this process, and check for write errors. 684 * - the return status from this call provides a reliable indication of 685 * whether any write errors occurred for this process. 686 */ 687 int afs_fsync(struct file *file, int datasync) 688 { 689 struct dentry *dentry = file->f_path.dentry; 690 struct afs_writeback *wb, *xwb; 691 struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); 692 int ret; 693 694 _enter("{%x:%u},{n=%s},%d", 695 vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name, 696 datasync); 697 698 /* use a writeback record as a marker in the queue - when this reaches 699 * the front of the queue, all the outstanding writes are either 700 * completed or rejected */ 701 wb = kzalloc(sizeof(*wb), GFP_KERNEL); 702 if (!wb) 703 return -ENOMEM; 704 wb->vnode = vnode; 705 wb->first = 0; 706 wb->last = -1; 707 wb->offset_first = 0; 708 wb->to_last = PAGE_SIZE; 709 wb->usage = 1; 710 wb->state = AFS_WBACK_SYNCING; 711 init_waitqueue_head(&wb->waitq); 712 713 spin_lock(&vnode->writeback_lock); 714 list_for_each_entry(xwb, &vnode->writebacks, link) { 715 if (xwb->state == AFS_WBACK_PENDING) 716 xwb->state = AFS_WBACK_CONFLICTING; 717 } 718 list_add_tail(&wb->link, &vnode->writebacks); 719 spin_unlock(&vnode->writeback_lock); 720 721 /* push all the outstanding writebacks to the server */ 722 ret = afs_writeback_all(vnode); 723 if (ret < 0) { 724 afs_put_writeback(wb); 725 _leave(" = %d [wb]", ret); 726 return ret; 727 } 728 729 /* wait for the preceding writes to actually complete */ 730 ret = wait_event_interruptible(wb->waitq, 731 wb->state == AFS_WBACK_COMPLETE || 732 vnode->writebacks.next == &wb->link); 733 afs_put_writeback(wb); 734 _leave(" = %d", ret); 735 return ret; 736 } 737 738 /* 739 * notification that a previously read-only page is about to become writable 740 * - if it returns an error, the caller will deliver a bus error signal 741 */ 742 int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page) 743 { 744 struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host); 745 746 _enter("{{%x:%u}},{%lx}", 747 vnode->fid.vid, vnode->fid.vnode, page->index); 748 749 /* wait for the page to be written to the cache before we allow it to 750 * be modified */ 751 #ifdef CONFIG_AFS_FSCACHE 752 fscache_wait_on_page_write(vnode->cache, page); 753 #endif 754 755 _leave(" = 0"); 756 return 0; 757 } 758