1 /* handling of writes to regular files and writing back to the server 2 * 3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. 4 * Written by David Howells (dhowells@redhat.com) 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public License 8 * as published by the Free Software Foundation; either version 9 * 2 of the License, or (at your option) any later version. 10 */ 11 #include <linux/backing-dev.h> 12 #include <linux/slab.h> 13 #include <linux/fs.h> 14 #include <linux/pagemap.h> 15 #include <linux/writeback.h> 16 #include <linux/pagevec.h> 17 #include "internal.h" 18 19 static int afs_write_back_from_locked_page(struct afs_writeback *wb, 20 struct page *page); 21 22 /* 23 * mark a page as having been made dirty and thus needing writeback 24 */ 25 int afs_set_page_dirty(struct page *page) 26 { 27 _enter(""); 28 return __set_page_dirty_nobuffers(page); 29 } 30 31 /* 32 * unlink a writeback record because its usage has reached zero 33 * - must be called with the wb->vnode->writeback_lock held 34 */ 35 static void afs_unlink_writeback(struct afs_writeback *wb) 36 { 37 struct afs_writeback *front; 38 struct afs_vnode *vnode = wb->vnode; 39 40 list_del_init(&wb->link); 41 if (!list_empty(&vnode->writebacks)) { 42 /* if an fsync rises to the front of the queue then wake it 43 * up */ 44 front = list_entry(vnode->writebacks.next, 45 struct afs_writeback, link); 46 if (front->state == AFS_WBACK_SYNCING) { 47 _debug("wake up sync"); 48 front->state = AFS_WBACK_COMPLETE; 49 wake_up(&front->waitq); 50 } 51 } 52 } 53 54 /* 55 * free a writeback record 56 */ 57 static void afs_free_writeback(struct afs_writeback *wb) 58 { 59 _enter(""); 60 key_put(wb->key); 61 kfree(wb); 62 } 63 64 /* 65 * dispose of a reference to a writeback record 66 */ 67 void afs_put_writeback(struct afs_writeback *wb) 68 { 69 struct afs_vnode *vnode = wb->vnode; 70 71 _enter("{%d}", wb->usage); 72 73 spin_lock(&vnode->writeback_lock); 74 if (--wb->usage == 0) 75 afs_unlink_writeback(wb); 76 else 77 wb = NULL; 78 spin_unlock(&vnode->writeback_lock); 79 if (wb) 80 afs_free_writeback(wb); 81 } 82 83 /* 84 * partly or wholly fill a page that's under preparation for writing 85 */ 86 static int afs_fill_page(struct afs_vnode *vnode, struct key *key, 87 loff_t pos, struct page *page) 88 { 89 loff_t i_size; 90 int ret; 91 int len; 92 93 _enter(",,%llu", (unsigned long long)pos); 94 95 i_size = i_size_read(&vnode->vfs_inode); 96 if (pos + PAGE_CACHE_SIZE > i_size) 97 len = i_size - pos; 98 else 99 len = PAGE_CACHE_SIZE; 100 101 ret = afs_vnode_fetch_data(vnode, key, pos, len, page); 102 if (ret < 0) { 103 if (ret == -ENOENT) { 104 _debug("got NOENT from server" 105 " - marking file deleted and stale"); 106 set_bit(AFS_VNODE_DELETED, &vnode->flags); 107 ret = -ESTALE; 108 } 109 } 110 111 _leave(" = %d", ret); 112 return ret; 113 } 114 115 /* 116 * prepare to perform part of a write to a page 117 */ 118 int afs_write_begin(struct file *file, struct address_space *mapping, 119 loff_t pos, unsigned len, unsigned flags, 120 struct page **pagep, void **fsdata) 121 { 122 struct afs_writeback *candidate, *wb; 123 struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); 124 struct page *page; 125 struct key *key = file->private_data; 126 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 127 unsigned to = from + len; 128 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 129 int ret; 130 131 _enter("{%x:%u},{%lx},%u,%u", 132 vnode->fid.vid, vnode->fid.vnode, index, from, to); 133 134 candidate = kzalloc(sizeof(*candidate), GFP_KERNEL); 135 if (!candidate) 136 return -ENOMEM; 137 candidate->vnode = vnode; 138 candidate->first = candidate->last = index; 139 candidate->offset_first = from; 140 candidate->to_last = to; 141 INIT_LIST_HEAD(&candidate->link); 142 candidate->usage = 1; 143 candidate->state = AFS_WBACK_PENDING; 144 init_waitqueue_head(&candidate->waitq); 145 146 page = grab_cache_page_write_begin(mapping, index, flags); 147 if (!page) { 148 kfree(candidate); 149 return -ENOMEM; 150 } 151 *pagep = page; 152 /* page won't leak in error case: it eventually gets cleaned off LRU */ 153 154 if (!PageUptodate(page) && len != PAGE_CACHE_SIZE) { 155 ret = afs_fill_page(vnode, key, index << PAGE_CACHE_SHIFT, page); 156 if (ret < 0) { 157 kfree(candidate); 158 _leave(" = %d [prep]", ret); 159 return ret; 160 } 161 SetPageUptodate(page); 162 } 163 164 try_again: 165 spin_lock(&vnode->writeback_lock); 166 167 /* see if this page is already pending a writeback under a suitable key 168 * - if so we can just join onto that one */ 169 wb = (struct afs_writeback *) page_private(page); 170 if (wb) { 171 if (wb->key == key && wb->state == AFS_WBACK_PENDING) 172 goto subsume_in_current_wb; 173 goto flush_conflicting_wb; 174 } 175 176 if (index > 0) { 177 /* see if we can find an already pending writeback that we can 178 * append this page to */ 179 list_for_each_entry(wb, &vnode->writebacks, link) { 180 if (wb->last == index - 1 && wb->key == key && 181 wb->state == AFS_WBACK_PENDING) 182 goto append_to_previous_wb; 183 } 184 } 185 186 list_add_tail(&candidate->link, &vnode->writebacks); 187 candidate->key = key_get(key); 188 spin_unlock(&vnode->writeback_lock); 189 SetPagePrivate(page); 190 set_page_private(page, (unsigned long) candidate); 191 _leave(" = 0 [new]"); 192 return 0; 193 194 subsume_in_current_wb: 195 _debug("subsume"); 196 ASSERTRANGE(wb->first, <=, index, <=, wb->last); 197 if (index == wb->first && from < wb->offset_first) 198 wb->offset_first = from; 199 if (index == wb->last && to > wb->to_last) 200 wb->to_last = to; 201 spin_unlock(&vnode->writeback_lock); 202 kfree(candidate); 203 _leave(" = 0 [sub]"); 204 return 0; 205 206 append_to_previous_wb: 207 _debug("append into %lx-%lx", wb->first, wb->last); 208 wb->usage++; 209 wb->last++; 210 wb->to_last = to; 211 spin_unlock(&vnode->writeback_lock); 212 SetPagePrivate(page); 213 set_page_private(page, (unsigned long) wb); 214 kfree(candidate); 215 _leave(" = 0 [app]"); 216 return 0; 217 218 /* the page is currently bound to another context, so if it's dirty we 219 * need to flush it before we can use the new context */ 220 flush_conflicting_wb: 221 _debug("flush conflict"); 222 if (wb->state == AFS_WBACK_PENDING) 223 wb->state = AFS_WBACK_CONFLICTING; 224 spin_unlock(&vnode->writeback_lock); 225 if (PageDirty(page)) { 226 ret = afs_write_back_from_locked_page(wb, page); 227 if (ret < 0) { 228 afs_put_writeback(candidate); 229 _leave(" = %d", ret); 230 return ret; 231 } 232 } 233 234 /* the page holds a ref on the writeback record */ 235 afs_put_writeback(wb); 236 set_page_private(page, 0); 237 ClearPagePrivate(page); 238 goto try_again; 239 } 240 241 /* 242 * finalise part of a write to a page 243 */ 244 int afs_write_end(struct file *file, struct address_space *mapping, 245 loff_t pos, unsigned len, unsigned copied, 246 struct page *page, void *fsdata) 247 { 248 struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); 249 loff_t i_size, maybe_i_size; 250 251 _enter("{%x:%u},{%lx}", 252 vnode->fid.vid, vnode->fid.vnode, page->index); 253 254 maybe_i_size = pos + copied; 255 256 i_size = i_size_read(&vnode->vfs_inode); 257 if (maybe_i_size > i_size) { 258 spin_lock(&vnode->writeback_lock); 259 i_size = i_size_read(&vnode->vfs_inode); 260 if (maybe_i_size > i_size) 261 i_size_write(&vnode->vfs_inode, maybe_i_size); 262 spin_unlock(&vnode->writeback_lock); 263 } 264 265 set_page_dirty(page); 266 if (PageDirty(page)) 267 _debug("dirtied"); 268 unlock_page(page); 269 page_cache_release(page); 270 271 return copied; 272 } 273 274 /* 275 * kill all the pages in the given range 276 */ 277 static void afs_kill_pages(struct afs_vnode *vnode, bool error, 278 pgoff_t first, pgoff_t last) 279 { 280 struct pagevec pv; 281 unsigned count, loop; 282 283 _enter("{%x:%u},%lx-%lx", 284 vnode->fid.vid, vnode->fid.vnode, first, last); 285 286 pagevec_init(&pv, 0); 287 288 do { 289 _debug("kill %lx-%lx", first, last); 290 291 count = last - first + 1; 292 if (count > PAGEVEC_SIZE) 293 count = PAGEVEC_SIZE; 294 pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping, 295 first, count, pv.pages); 296 ASSERTCMP(pv.nr, ==, count); 297 298 for (loop = 0; loop < count; loop++) { 299 ClearPageUptodate(pv.pages[loop]); 300 if (error) 301 SetPageError(pv.pages[loop]); 302 end_page_writeback(pv.pages[loop]); 303 } 304 305 __pagevec_release(&pv); 306 } while (first < last); 307 308 _leave(""); 309 } 310 311 /* 312 * synchronously write back the locked page and any subsequent non-locked dirty 313 * pages also covered by the same writeback record 314 */ 315 static int afs_write_back_from_locked_page(struct afs_writeback *wb, 316 struct page *primary_page) 317 { 318 struct page *pages[8], *page; 319 unsigned long count; 320 unsigned n, offset, to; 321 pgoff_t start, first, last; 322 int loop, ret; 323 324 _enter(",%lx", primary_page->index); 325 326 count = 1; 327 if (!clear_page_dirty_for_io(primary_page)) 328 BUG(); 329 if (test_set_page_writeback(primary_page)) 330 BUG(); 331 332 /* find all consecutive lockable dirty pages, stopping when we find a 333 * page that is not immediately lockable, is not dirty or is missing, 334 * or we reach the end of the range */ 335 start = primary_page->index; 336 if (start >= wb->last) 337 goto no_more; 338 start++; 339 do { 340 _debug("more %lx [%lx]", start, count); 341 n = wb->last - start + 1; 342 if (n > ARRAY_SIZE(pages)) 343 n = ARRAY_SIZE(pages); 344 n = find_get_pages_contig(wb->vnode->vfs_inode.i_mapping, 345 start, n, pages); 346 _debug("fgpc %u", n); 347 if (n == 0) 348 goto no_more; 349 if (pages[0]->index != start) { 350 do { 351 put_page(pages[--n]); 352 } while (n > 0); 353 goto no_more; 354 } 355 356 for (loop = 0; loop < n; loop++) { 357 page = pages[loop]; 358 if (page->index > wb->last) 359 break; 360 if (!trylock_page(page)) 361 break; 362 if (!PageDirty(page) || 363 page_private(page) != (unsigned long) wb) { 364 unlock_page(page); 365 break; 366 } 367 if (!clear_page_dirty_for_io(page)) 368 BUG(); 369 if (test_set_page_writeback(page)) 370 BUG(); 371 unlock_page(page); 372 put_page(page); 373 } 374 count += loop; 375 if (loop < n) { 376 for (; loop < n; loop++) 377 put_page(pages[loop]); 378 goto no_more; 379 } 380 381 start += loop; 382 } while (start <= wb->last && count < 65536); 383 384 no_more: 385 /* we now have a contiguous set of dirty pages, each with writeback set 386 * and the dirty mark cleared; the first page is locked and must remain 387 * so, all the rest are unlocked */ 388 first = primary_page->index; 389 last = first + count - 1; 390 391 offset = (first == wb->first) ? wb->offset_first : 0; 392 to = (last == wb->last) ? wb->to_last : PAGE_SIZE; 393 394 _debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to); 395 396 ret = afs_vnode_store_data(wb, first, last, offset, to); 397 if (ret < 0) { 398 switch (ret) { 399 case -EDQUOT: 400 case -ENOSPC: 401 set_bit(AS_ENOSPC, 402 &wb->vnode->vfs_inode.i_mapping->flags); 403 break; 404 case -EROFS: 405 case -EIO: 406 case -EREMOTEIO: 407 case -EFBIG: 408 case -ENOENT: 409 case -ENOMEDIUM: 410 case -ENXIO: 411 afs_kill_pages(wb->vnode, true, first, last); 412 set_bit(AS_EIO, &wb->vnode->vfs_inode.i_mapping->flags); 413 break; 414 case -EACCES: 415 case -EPERM: 416 case -ENOKEY: 417 case -EKEYEXPIRED: 418 case -EKEYREJECTED: 419 case -EKEYREVOKED: 420 afs_kill_pages(wb->vnode, false, first, last); 421 break; 422 default: 423 break; 424 } 425 } else { 426 ret = count; 427 } 428 429 _leave(" = %d", ret); 430 return ret; 431 } 432 433 /* 434 * write a page back to the server 435 * - the caller locked the page for us 436 */ 437 int afs_writepage(struct page *page, struct writeback_control *wbc) 438 { 439 struct afs_writeback *wb; 440 int ret; 441 442 _enter("{%lx},", page->index); 443 444 wb = (struct afs_writeback *) page_private(page); 445 ASSERT(wb != NULL); 446 447 ret = afs_write_back_from_locked_page(wb, page); 448 unlock_page(page); 449 if (ret < 0) { 450 _leave(" = %d", ret); 451 return 0; 452 } 453 454 wbc->nr_to_write -= ret; 455 456 _leave(" = 0"); 457 return 0; 458 } 459 460 /* 461 * write a region of pages back to the server 462 */ 463 static int afs_writepages_region(struct address_space *mapping, 464 struct writeback_control *wbc, 465 pgoff_t index, pgoff_t end, pgoff_t *_next) 466 { 467 struct afs_writeback *wb; 468 struct page *page; 469 int ret, n; 470 471 _enter(",,%lx,%lx,", index, end); 472 473 do { 474 n = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY, 475 1, &page); 476 if (!n) 477 break; 478 479 _debug("wback %lx", page->index); 480 481 if (page->index > end) { 482 *_next = index; 483 page_cache_release(page); 484 _leave(" = 0 [%lx]", *_next); 485 return 0; 486 } 487 488 /* at this point we hold neither mapping->tree_lock nor lock on 489 * the page itself: the page may be truncated or invalidated 490 * (changing page->mapping to NULL), or even swizzled back from 491 * swapper_space to tmpfs file mapping 492 */ 493 lock_page(page); 494 495 if (page->mapping != mapping) { 496 unlock_page(page); 497 page_cache_release(page); 498 continue; 499 } 500 501 if (wbc->sync_mode != WB_SYNC_NONE) 502 wait_on_page_writeback(page); 503 504 if (PageWriteback(page) || !PageDirty(page)) { 505 unlock_page(page); 506 continue; 507 } 508 509 wb = (struct afs_writeback *) page_private(page); 510 ASSERT(wb != NULL); 511 512 spin_lock(&wb->vnode->writeback_lock); 513 wb->state = AFS_WBACK_WRITING; 514 spin_unlock(&wb->vnode->writeback_lock); 515 516 ret = afs_write_back_from_locked_page(wb, page); 517 unlock_page(page); 518 page_cache_release(page); 519 if (ret < 0) { 520 _leave(" = %d", ret); 521 return ret; 522 } 523 524 wbc->nr_to_write -= ret; 525 526 cond_resched(); 527 } while (index < end && wbc->nr_to_write > 0); 528 529 *_next = index; 530 _leave(" = 0 [%lx]", *_next); 531 return 0; 532 } 533 534 /* 535 * write some of the pending data back to the server 536 */ 537 int afs_writepages(struct address_space *mapping, 538 struct writeback_control *wbc) 539 { 540 pgoff_t start, end, next; 541 int ret; 542 543 _enter(""); 544 545 if (wbc->range_cyclic) { 546 start = mapping->writeback_index; 547 end = -1; 548 ret = afs_writepages_region(mapping, wbc, start, end, &next); 549 if (start > 0 && wbc->nr_to_write > 0 && ret == 0) 550 ret = afs_writepages_region(mapping, wbc, 0, start, 551 &next); 552 mapping->writeback_index = next; 553 } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { 554 end = (pgoff_t)(LLONG_MAX >> PAGE_CACHE_SHIFT); 555 ret = afs_writepages_region(mapping, wbc, 0, end, &next); 556 if (wbc->nr_to_write > 0) 557 mapping->writeback_index = next; 558 } else { 559 start = wbc->range_start >> PAGE_CACHE_SHIFT; 560 end = wbc->range_end >> PAGE_CACHE_SHIFT; 561 ret = afs_writepages_region(mapping, wbc, start, end, &next); 562 } 563 564 _leave(" = %d", ret); 565 return ret; 566 } 567 568 /* 569 * completion of write to server 570 */ 571 void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) 572 { 573 struct afs_writeback *wb = call->wb; 574 struct pagevec pv; 575 unsigned count, loop; 576 pgoff_t first = call->first, last = call->last; 577 bool free_wb; 578 579 _enter("{%x:%u},{%lx-%lx}", 580 vnode->fid.vid, vnode->fid.vnode, first, last); 581 582 ASSERT(wb != NULL); 583 584 pagevec_init(&pv, 0); 585 586 do { 587 _debug("done %lx-%lx", first, last); 588 589 count = last - first + 1; 590 if (count > PAGEVEC_SIZE) 591 count = PAGEVEC_SIZE; 592 pv.nr = find_get_pages_contig(call->mapping, first, count, 593 pv.pages); 594 ASSERTCMP(pv.nr, ==, count); 595 596 spin_lock(&vnode->writeback_lock); 597 for (loop = 0; loop < count; loop++) { 598 struct page *page = pv.pages[loop]; 599 end_page_writeback(page); 600 if (page_private(page) == (unsigned long) wb) { 601 set_page_private(page, 0); 602 ClearPagePrivate(page); 603 wb->usage--; 604 } 605 } 606 free_wb = false; 607 if (wb->usage == 0) { 608 afs_unlink_writeback(wb); 609 free_wb = true; 610 } 611 spin_unlock(&vnode->writeback_lock); 612 first += count; 613 if (free_wb) { 614 afs_free_writeback(wb); 615 wb = NULL; 616 } 617 618 __pagevec_release(&pv); 619 } while (first <= last); 620 621 _leave(""); 622 } 623 624 /* 625 * write to an AFS file 626 */ 627 ssize_t afs_file_write(struct kiocb *iocb, const struct iovec *iov, 628 unsigned long nr_segs, loff_t pos) 629 { 630 struct dentry *dentry = iocb->ki_filp->f_path.dentry; 631 struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); 632 ssize_t result; 633 size_t count = iov_length(iov, nr_segs); 634 635 _enter("{%x.%u},{%zu},%lu,", 636 vnode->fid.vid, vnode->fid.vnode, count, nr_segs); 637 638 if (IS_SWAPFILE(&vnode->vfs_inode)) { 639 printk(KERN_INFO 640 "AFS: Attempt to write to active swap file!\n"); 641 return -EBUSY; 642 } 643 644 if (!count) 645 return 0; 646 647 result = generic_file_aio_write(iocb, iov, nr_segs, pos); 648 if (IS_ERR_VALUE(result)) { 649 _leave(" = %zd", result); 650 return result; 651 } 652 653 _leave(" = %zd", result); 654 return result; 655 } 656 657 /* 658 * flush the vnode to the fileserver 659 */ 660 int afs_writeback_all(struct afs_vnode *vnode) 661 { 662 struct address_space *mapping = vnode->vfs_inode.i_mapping; 663 struct writeback_control wbc = { 664 .sync_mode = WB_SYNC_ALL, 665 .nr_to_write = LONG_MAX, 666 .range_cyclic = 1, 667 }; 668 int ret; 669 670 _enter(""); 671 672 ret = mapping->a_ops->writepages(mapping, &wbc); 673 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 674 675 _leave(" = %d", ret); 676 return ret; 677 } 678 679 /* 680 * flush any dirty pages for this process, and check for write errors. 681 * - the return status from this call provides a reliable indication of 682 * whether any write errors occurred for this process. 683 */ 684 int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) 685 { 686 struct dentry *dentry = file->f_path.dentry; 687 struct inode *inode = file->f_mapping->host; 688 struct afs_writeback *wb, *xwb; 689 struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); 690 int ret; 691 692 _enter("{%x:%u},{n=%s},%d", 693 vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name, 694 datasync); 695 696 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 697 if (ret) 698 return ret; 699 mutex_lock(&inode->i_mutex); 700 701 /* use a writeback record as a marker in the queue - when this reaches 702 * the front of the queue, all the outstanding writes are either 703 * completed or rejected */ 704 wb = kzalloc(sizeof(*wb), GFP_KERNEL); 705 if (!wb) { 706 ret = -ENOMEM; 707 goto out; 708 } 709 wb->vnode = vnode; 710 wb->first = 0; 711 wb->last = -1; 712 wb->offset_first = 0; 713 wb->to_last = PAGE_SIZE; 714 wb->usage = 1; 715 wb->state = AFS_WBACK_SYNCING; 716 init_waitqueue_head(&wb->waitq); 717 718 spin_lock(&vnode->writeback_lock); 719 list_for_each_entry(xwb, &vnode->writebacks, link) { 720 if (xwb->state == AFS_WBACK_PENDING) 721 xwb->state = AFS_WBACK_CONFLICTING; 722 } 723 list_add_tail(&wb->link, &vnode->writebacks); 724 spin_unlock(&vnode->writeback_lock); 725 726 /* push all the outstanding writebacks to the server */ 727 ret = afs_writeback_all(vnode); 728 if (ret < 0) { 729 afs_put_writeback(wb); 730 _leave(" = %d [wb]", ret); 731 goto out; 732 } 733 734 /* wait for the preceding writes to actually complete */ 735 ret = wait_event_interruptible(wb->waitq, 736 wb->state == AFS_WBACK_COMPLETE || 737 vnode->writebacks.next == &wb->link); 738 afs_put_writeback(wb); 739 _leave(" = %d", ret); 740 out: 741 mutex_unlock(&inode->i_mutex); 742 return ret; 743 } 744 745 /* 746 * notification that a previously read-only page is about to become writable 747 * - if it returns an error, the caller will deliver a bus error signal 748 */ 749 int afs_page_mkwrite(struct vm_area_struct *vma, struct page *page) 750 { 751 struct afs_vnode *vnode = AFS_FS_I(vma->vm_file->f_mapping->host); 752 753 _enter("{{%x:%u}},{%lx}", 754 vnode->fid.vid, vnode->fid.vnode, page->index); 755 756 /* wait for the page to be written to the cache before we allow it to 757 * be modified */ 758 #ifdef CONFIG_AFS_FSCACHE 759 fscache_wait_on_page_write(vnode->cache, page); 760 #endif 761 762 _leave(" = 0"); 763 return 0; 764 } 765