1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public 8 * License as published by the Free Software Foundation; either 9 * version 2 of the License, or (at your option) any later version. 10 * 11 * This program is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public 17 * License along with this program; if not, write to the 18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 19 * Boston, MA 021110-1307, USA. 20 */ 21 22 #include <linux/fs.h> 23 #include <linux/slab.h> 24 #include <linux/highmem.h> 25 #include <linux/pagemap.h> 26 #include <asm/byteorder.h> 27 #include <linux/swap.h> 28 #include <linux/pipe_fs_i.h> 29 30 #define MLOG_MASK_PREFIX ML_FILE_IO 31 #include <cluster/masklog.h> 32 33 #include "ocfs2.h" 34 35 #include "alloc.h" 36 #include "aops.h" 37 #include "dlmglue.h" 38 #include "extent_map.h" 39 #include "file.h" 40 #include "inode.h" 41 #include "journal.h" 42 #include "suballoc.h" 43 #include "super.h" 44 #include "symlink.h" 45 46 #include "buffer_head_io.h" 47 48 static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, 49 struct buffer_head *bh_result, int create) 50 { 51 int err = -EIO; 52 int status; 53 struct ocfs2_dinode *fe = NULL; 54 struct buffer_head *bh = NULL; 55 struct buffer_head *buffer_cache_bh = NULL; 56 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 57 void *kaddr; 58 59 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, 60 (unsigned long long)iblock, bh_result, create); 61 62 BUG_ON(ocfs2_inode_is_fast_symlink(inode)); 63 64 if ((iblock << inode->i_sb->s_blocksize_bits) > PATH_MAX + 1) { 65 mlog(ML_ERROR, "block offset > PATH_MAX: %llu", 66 (unsigned long long)iblock); 67 goto bail; 68 } 69 70 status = ocfs2_read_block(OCFS2_SB(inode->i_sb), 71 OCFS2_I(inode)->ip_blkno, 72 &bh, OCFS2_BH_CACHED, inode); 73 if (status < 0) { 74 mlog_errno(status); 75 goto bail; 76 } 77 fe = (struct ocfs2_dinode *) bh->b_data; 78 79 if (!OCFS2_IS_VALID_DINODE(fe)) { 80 mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", 81 (unsigned long long)le64_to_cpu(fe->i_blkno), 7, 82 fe->i_signature); 83 goto bail; 84 } 85 86 if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, 87 le32_to_cpu(fe->i_clusters))) { 88 mlog(ML_ERROR, "block offset is outside the allocated size: " 89 "%llu\n", (unsigned long long)iblock); 90 goto bail; 91 } 92 93 /* We don't use the page cache to create symlink data, so if 94 * need be, copy it over from the buffer cache. */ 95 if (!buffer_uptodate(bh_result) && ocfs2_inode_is_new(inode)) { 96 u64 blkno = le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + 97 iblock; 98 buffer_cache_bh = sb_getblk(osb->sb, blkno); 99 if (!buffer_cache_bh) { 100 mlog(ML_ERROR, "couldn't getblock for symlink!\n"); 101 goto bail; 102 } 103 104 /* we haven't locked out transactions, so a commit 105 * could've happened. Since we've got a reference on 106 * the bh, even if it commits while we're doing the 107 * copy, the data is still good. */ 108 if (buffer_jbd(buffer_cache_bh) 109 && ocfs2_inode_is_new(inode)) { 110 kaddr = kmap_atomic(bh_result->b_page, KM_USER0); 111 if (!kaddr) { 112 mlog(ML_ERROR, "couldn't kmap!\n"); 113 goto bail; 114 } 115 memcpy(kaddr + (bh_result->b_size * iblock), 116 buffer_cache_bh->b_data, 117 bh_result->b_size); 118 kunmap_atomic(kaddr, KM_USER0); 119 set_buffer_uptodate(bh_result); 120 } 121 brelse(buffer_cache_bh); 122 } 123 124 map_bh(bh_result, inode->i_sb, 125 le64_to_cpu(fe->id2.i_list.l_recs[0].e_blkno) + iblock); 126 127 err = 0; 128 129 bail: 130 if (bh) 131 brelse(bh); 132 133 mlog_exit(err); 134 return err; 135 } 136 137 static int ocfs2_get_block(struct inode *inode, sector_t iblock, 138 struct buffer_head *bh_result, int create) 139 { 140 int err = 0; 141 unsigned int ext_flags; 142 u64 p_blkno, past_eof; 143 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 144 145 mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, 146 (unsigned long long)iblock, bh_result, create); 147 148 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) 149 mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n", 150 inode, inode->i_ino); 151 152 if (S_ISLNK(inode->i_mode)) { 153 /* this always does I/O for some reason. */ 154 err = ocfs2_symlink_get_block(inode, iblock, bh_result, create); 155 goto bail; 156 } 157 158 err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL, 159 &ext_flags); 160 if (err) { 161 mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " 162 "%llu, NULL)\n", err, inode, (unsigned long long)iblock, 163 (unsigned long long)p_blkno); 164 goto bail; 165 } 166 167 /* 168 * ocfs2 never allocates in this function - the only time we 169 * need to use BH_New is when we're extending i_size on a file 170 * system which doesn't support holes, in which case BH_New 171 * allows block_prepare_write() to zero. 172 */ 173 mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb), 174 "ino %lu, iblock %llu\n", inode->i_ino, 175 (unsigned long long)iblock); 176 177 /* Treat the unwritten extent as a hole for zeroing purposes. */ 178 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 179 map_bh(bh_result, inode->i_sb, p_blkno); 180 181 if (!ocfs2_sparse_alloc(osb)) { 182 if (p_blkno == 0) { 183 err = -EIO; 184 mlog(ML_ERROR, 185 "iblock = %llu p_blkno = %llu blkno=(%llu)\n", 186 (unsigned long long)iblock, 187 (unsigned long long)p_blkno, 188 (unsigned long long)OCFS2_I(inode)->ip_blkno); 189 mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters); 190 dump_stack(); 191 } 192 193 past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 194 mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, 195 (unsigned long long)past_eof); 196 197 if (create && (iblock >= past_eof)) 198 set_buffer_new(bh_result); 199 } 200 201 bail: 202 if (err < 0) 203 err = -EIO; 204 205 mlog_exit(err); 206 return err; 207 } 208 209 static int ocfs2_readpage(struct file *file, struct page *page) 210 { 211 struct inode *inode = page->mapping->host; 212 loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT; 213 int ret, unlock = 1; 214 215 mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0)); 216 217 ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page); 218 if (ret != 0) { 219 if (ret == AOP_TRUNCATED_PAGE) 220 unlock = 0; 221 mlog_errno(ret); 222 goto out; 223 } 224 225 if (down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem) == 0) { 226 ret = AOP_TRUNCATED_PAGE; 227 goto out_meta_unlock; 228 } 229 230 /* 231 * i_size might have just been updated as we grabed the meta lock. We 232 * might now be discovering a truncate that hit on another node. 233 * block_read_full_page->get_block freaks out if it is asked to read 234 * beyond the end of a file, so we check here. Callers 235 * (generic_file_read, fault->nopage) are clever enough to check i_size 236 * and notice that the page they just read isn't needed. 237 * 238 * XXX sys_readahead() seems to get that wrong? 239 */ 240 if (start >= i_size_read(inode)) { 241 zero_user_page(page, 0, PAGE_SIZE, KM_USER0); 242 SetPageUptodate(page); 243 ret = 0; 244 goto out_alloc; 245 } 246 247 ret = ocfs2_data_lock_with_page(inode, 0, page); 248 if (ret != 0) { 249 if (ret == AOP_TRUNCATED_PAGE) 250 unlock = 0; 251 mlog_errno(ret); 252 goto out_alloc; 253 } 254 255 ret = block_read_full_page(page, ocfs2_get_block); 256 unlock = 0; 257 258 ocfs2_data_unlock(inode, 0); 259 out_alloc: 260 up_read(&OCFS2_I(inode)->ip_alloc_sem); 261 out_meta_unlock: 262 ocfs2_meta_unlock(inode, 0); 263 out: 264 if (unlock) 265 unlock_page(page); 266 mlog_exit(ret); 267 return ret; 268 } 269 270 /* Note: Because we don't support holes, our allocation has 271 * already happened (allocation writes zeros to the file data) 272 * so we don't have to worry about ordered writes in 273 * ocfs2_writepage. 274 * 275 * ->writepage is called during the process of invalidating the page cache 276 * during blocked lock processing. It can't block on any cluster locks 277 * to during block mapping. It's relying on the fact that the block 278 * mapping can't have disappeared under the dirty pages that it is 279 * being asked to write back. 280 */ 281 static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) 282 { 283 int ret; 284 285 mlog_entry("(0x%p)\n", page); 286 287 ret = block_write_full_page(page, ocfs2_get_block, wbc); 288 289 mlog_exit(ret); 290 291 return ret; 292 } 293 294 /* 295 * This is called from ocfs2_write_zero_page() which has handled it's 296 * own cluster locking and has ensured allocation exists for those 297 * blocks to be written. 298 */ 299 int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, 300 unsigned from, unsigned to) 301 { 302 int ret; 303 304 down_read(&OCFS2_I(inode)->ip_alloc_sem); 305 306 ret = block_prepare_write(page, from, to, ocfs2_get_block); 307 308 up_read(&OCFS2_I(inode)->ip_alloc_sem); 309 310 return ret; 311 } 312 313 /* Taken from ext3. We don't necessarily need the full blown 314 * functionality yet, but IMHO it's better to cut and paste the whole 315 * thing so we can avoid introducing our own bugs (and easily pick up 316 * their fixes when they happen) --Mark */ 317 int walk_page_buffers( handle_t *handle, 318 struct buffer_head *head, 319 unsigned from, 320 unsigned to, 321 int *partial, 322 int (*fn)( handle_t *handle, 323 struct buffer_head *bh)) 324 { 325 struct buffer_head *bh; 326 unsigned block_start, block_end; 327 unsigned blocksize = head->b_size; 328 int err, ret = 0; 329 struct buffer_head *next; 330 331 for ( bh = head, block_start = 0; 332 ret == 0 && (bh != head || !block_start); 333 block_start = block_end, bh = next) 334 { 335 next = bh->b_this_page; 336 block_end = block_start + blocksize; 337 if (block_end <= from || block_start >= to) { 338 if (partial && !buffer_uptodate(bh)) 339 *partial = 1; 340 continue; 341 } 342 err = (*fn)(handle, bh); 343 if (!ret) 344 ret = err; 345 } 346 return ret; 347 } 348 349 handle_t *ocfs2_start_walk_page_trans(struct inode *inode, 350 struct page *page, 351 unsigned from, 352 unsigned to) 353 { 354 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 355 handle_t *handle = NULL; 356 int ret = 0; 357 358 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 359 if (!handle) { 360 ret = -ENOMEM; 361 mlog_errno(ret); 362 goto out; 363 } 364 365 if (ocfs2_should_order_data(inode)) { 366 ret = walk_page_buffers(handle, 367 page_buffers(page), 368 from, to, NULL, 369 ocfs2_journal_dirty_data); 370 if (ret < 0) 371 mlog_errno(ret); 372 } 373 out: 374 if (ret) { 375 if (handle) 376 ocfs2_commit_trans(osb, handle); 377 handle = ERR_PTR(ret); 378 } 379 return handle; 380 } 381 382 static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) 383 { 384 sector_t status; 385 u64 p_blkno = 0; 386 int err = 0; 387 struct inode *inode = mapping->host; 388 389 mlog_entry("(block = %llu)\n", (unsigned long long)block); 390 391 /* We don't need to lock journal system files, since they aren't 392 * accessed concurrently from multiple nodes. 393 */ 394 if (!INODE_JOURNAL(inode)) { 395 err = ocfs2_meta_lock(inode, NULL, 0); 396 if (err) { 397 if (err != -ENOENT) 398 mlog_errno(err); 399 goto bail; 400 } 401 down_read(&OCFS2_I(inode)->ip_alloc_sem); 402 } 403 404 err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL); 405 406 if (!INODE_JOURNAL(inode)) { 407 up_read(&OCFS2_I(inode)->ip_alloc_sem); 408 ocfs2_meta_unlock(inode, 0); 409 } 410 411 if (err) { 412 mlog(ML_ERROR, "get_blocks() failed, block = %llu\n", 413 (unsigned long long)block); 414 mlog_errno(err); 415 goto bail; 416 } 417 418 419 bail: 420 status = err ? 0 : p_blkno; 421 422 mlog_exit((int)status); 423 424 return status; 425 } 426 427 /* 428 * TODO: Make this into a generic get_blocks function. 429 * 430 * From do_direct_io in direct-io.c: 431 * "So what we do is to permit the ->get_blocks function to populate 432 * bh.b_size with the size of IO which is permitted at this offset and 433 * this i_blkbits." 434 * 435 * This function is called directly from get_more_blocks in direct-io.c. 436 * 437 * called like this: dio->get_blocks(dio->inode, fs_startblk, 438 * fs_count, map_bh, dio->rw == WRITE); 439 */ 440 static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, 441 struct buffer_head *bh_result, int create) 442 { 443 int ret; 444 u64 p_blkno, inode_blocks, contig_blocks; 445 unsigned int ext_flags; 446 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 447 unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; 448 449 /* This function won't even be called if the request isn't all 450 * nicely aligned and of the right size, so there's no need 451 * for us to check any of that. */ 452 453 inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); 454 455 /* 456 * Any write past EOF is not allowed because we'd be extending. 457 */ 458 if (create && (iblock + max_blocks) > inode_blocks) { 459 ret = -EIO; 460 goto bail; 461 } 462 463 /* This figures out the size of the next contiguous block, and 464 * our logical offset */ 465 ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, 466 &contig_blocks, &ext_flags); 467 if (ret) { 468 mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", 469 (unsigned long long)iblock); 470 ret = -EIO; 471 goto bail; 472 } 473 474 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) { 475 ocfs2_error(inode->i_sb, 476 "Inode %llu has a hole at block %llu\n", 477 (unsigned long long)OCFS2_I(inode)->ip_blkno, 478 (unsigned long long)iblock); 479 ret = -EROFS; 480 goto bail; 481 } 482 483 /* 484 * get_more_blocks() expects us to describe a hole by clearing 485 * the mapped bit on bh_result(). 486 * 487 * Consider an unwritten extent as a hole. 488 */ 489 if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) 490 map_bh(bh_result, inode->i_sb, p_blkno); 491 else { 492 /* 493 * ocfs2_prepare_inode_for_write() should have caught 494 * the case where we'd be filling a hole and triggered 495 * a buffered write instead. 496 */ 497 if (create) { 498 ret = -EIO; 499 mlog_errno(ret); 500 goto bail; 501 } 502 503 clear_buffer_mapped(bh_result); 504 } 505 506 /* make sure we don't map more than max_blocks blocks here as 507 that's all the kernel will handle at this point. */ 508 if (max_blocks < contig_blocks) 509 contig_blocks = max_blocks; 510 bh_result->b_size = contig_blocks << blocksize_bits; 511 bail: 512 return ret; 513 } 514 515 /* 516 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're 517 * particularly interested in the aio/dio case. Like the core uses 518 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from 519 * truncation on another. 520 */ 521 static void ocfs2_dio_end_io(struct kiocb *iocb, 522 loff_t offset, 523 ssize_t bytes, 524 void *private) 525 { 526 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 527 int level; 528 529 /* this io's submitter should not have unlocked this before we could */ 530 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 531 532 ocfs2_iocb_clear_rw_locked(iocb); 533 534 level = ocfs2_iocb_rw_locked_level(iocb); 535 if (!level) 536 up_read(&inode->i_alloc_sem); 537 ocfs2_rw_unlock(inode, level); 538 } 539 540 /* 541 * ocfs2_invalidatepage() and ocfs2_releasepage() are shamelessly stolen 542 * from ext3. PageChecked() bits have been removed as OCFS2 does not 543 * do journalled data. 544 */ 545 static void ocfs2_invalidatepage(struct page *page, unsigned long offset) 546 { 547 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; 548 549 journal_invalidatepage(journal, page, offset); 550 } 551 552 static int ocfs2_releasepage(struct page *page, gfp_t wait) 553 { 554 journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; 555 556 if (!page_has_buffers(page)) 557 return 0; 558 return journal_try_to_free_buffers(journal, page, wait); 559 } 560 561 static ssize_t ocfs2_direct_IO(int rw, 562 struct kiocb *iocb, 563 const struct iovec *iov, 564 loff_t offset, 565 unsigned long nr_segs) 566 { 567 struct file *file = iocb->ki_filp; 568 struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host; 569 int ret; 570 571 mlog_entry_void(); 572 573 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { 574 /* 575 * We get PR data locks even for O_DIRECT. This 576 * allows concurrent O_DIRECT I/O but doesn't let 577 * O_DIRECT with extending and buffered zeroing writes 578 * race. If they did race then the buffered zeroing 579 * could be written back after the O_DIRECT I/O. It's 580 * one thing to tell people not to mix buffered and 581 * O_DIRECT writes, but expecting them to understand 582 * that file extension is also an implicit buffered 583 * write is too much. By getting the PR we force 584 * writeback of the buffered zeroing before 585 * proceeding. 586 */ 587 ret = ocfs2_data_lock(inode, 0); 588 if (ret < 0) { 589 mlog_errno(ret); 590 goto out; 591 } 592 ocfs2_data_unlock(inode, 0); 593 } 594 595 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, 596 inode->i_sb->s_bdev, iov, offset, 597 nr_segs, 598 ocfs2_direct_IO_get_blocks, 599 ocfs2_dio_end_io); 600 out: 601 mlog_exit(ret); 602 return ret; 603 } 604 605 static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, 606 u32 cpos, 607 unsigned int *start, 608 unsigned int *end) 609 { 610 unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE; 611 612 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) { 613 unsigned int cpp; 614 615 cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits); 616 617 cluster_start = cpos % cpp; 618 cluster_start = cluster_start << osb->s_clustersize_bits; 619 620 cluster_end = cluster_start + osb->s_clustersize; 621 } 622 623 BUG_ON(cluster_start > PAGE_SIZE); 624 BUG_ON(cluster_end > PAGE_SIZE); 625 626 if (start) 627 *start = cluster_start; 628 if (end) 629 *end = cluster_end; 630 } 631 632 /* 633 * 'from' and 'to' are the region in the page to avoid zeroing. 634 * 635 * If pagesize > clustersize, this function will avoid zeroing outside 636 * of the cluster boundary. 637 * 638 * from == to == 0 is code for "zero the entire cluster region" 639 */ 640 static void ocfs2_clear_page_regions(struct page *page, 641 struct ocfs2_super *osb, u32 cpos, 642 unsigned from, unsigned to) 643 { 644 void *kaddr; 645 unsigned int cluster_start, cluster_end; 646 647 ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); 648 649 kaddr = kmap_atomic(page, KM_USER0); 650 651 if (from || to) { 652 if (from > cluster_start) 653 memset(kaddr + cluster_start, 0, from - cluster_start); 654 if (to < cluster_end) 655 memset(kaddr + to, 0, cluster_end - to); 656 } else { 657 memset(kaddr + cluster_start, 0, cluster_end - cluster_start); 658 } 659 660 kunmap_atomic(kaddr, KM_USER0); 661 } 662 663 /* 664 * Some of this taken from block_prepare_write(). We already have our 665 * mapping by now though, and the entire write will be allocating or 666 * it won't, so not much need to use BH_New. 667 * 668 * This will also skip zeroing, which is handled externally. 669 */ 670 int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, 671 struct inode *inode, unsigned int from, 672 unsigned int to, int new) 673 { 674 int ret = 0; 675 struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; 676 unsigned int block_end, block_start; 677 unsigned int bsize = 1 << inode->i_blkbits; 678 679 if (!page_has_buffers(page)) 680 create_empty_buffers(page, bsize, 0); 681 682 head = page_buffers(page); 683 for (bh = head, block_start = 0; bh != head || !block_start; 684 bh = bh->b_this_page, block_start += bsize) { 685 block_end = block_start + bsize; 686 687 /* 688 * Ignore blocks outside of our i/o range - 689 * they may belong to unallocated clusters. 690 */ 691 if (block_start >= to || block_end <= from) { 692 if (PageUptodate(page)) 693 set_buffer_uptodate(bh); 694 continue; 695 } 696 697 /* 698 * For an allocating write with cluster size >= page 699 * size, we always write the entire page. 700 */ 701 702 if (buffer_new(bh)) 703 clear_buffer_new(bh); 704 705 if (!buffer_mapped(bh)) { 706 map_bh(bh, inode->i_sb, *p_blkno); 707 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); 708 } 709 710 if (PageUptodate(page)) { 711 if (!buffer_uptodate(bh)) 712 set_buffer_uptodate(bh); 713 } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && 714 (block_start < from || block_end > to)) { 715 ll_rw_block(READ, 1, &bh); 716 *wait_bh++=bh; 717 } 718 719 *p_blkno = *p_blkno + 1; 720 } 721 722 /* 723 * If we issued read requests - let them complete. 724 */ 725 while(wait_bh > wait) { 726 wait_on_buffer(*--wait_bh); 727 if (!buffer_uptodate(*wait_bh)) 728 ret = -EIO; 729 } 730 731 if (ret == 0 || !new) 732 return ret; 733 734 /* 735 * If we get -EIO above, zero out any newly allocated blocks 736 * to avoid exposing stale data. 737 */ 738 bh = head; 739 block_start = 0; 740 do { 741 void *kaddr; 742 743 block_end = block_start + bsize; 744 if (block_end <= from) 745 goto next_bh; 746 if (block_start >= to) 747 break; 748 749 kaddr = kmap_atomic(page, KM_USER0); 750 memset(kaddr+block_start, 0, bh->b_size); 751 flush_dcache_page(page); 752 kunmap_atomic(kaddr, KM_USER0); 753 set_buffer_uptodate(bh); 754 mark_buffer_dirty(bh); 755 756 next_bh: 757 block_start = block_end; 758 bh = bh->b_this_page; 759 } while (bh != head); 760 761 return ret; 762 } 763 764 /* 765 * This will copy user data from the buffer page in the splice 766 * context. 767 * 768 * For now, we ignore SPLICE_F_MOVE as that would require some extra 769 * communication out all the way to ocfs2_write(). 770 */ 771 int ocfs2_map_and_write_splice_data(struct inode *inode, 772 struct ocfs2_write_ctxt *wc, u64 *p_blkno, 773 unsigned int *ret_from, unsigned int *ret_to) 774 { 775 int ret; 776 unsigned int to, from, cluster_start, cluster_end; 777 char *src, *dst; 778 struct ocfs2_splice_write_priv *sp = wc->w_private; 779 struct pipe_buffer *buf = sp->s_buf; 780 unsigned long bytes, src_from; 781 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 782 783 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, 784 &cluster_end); 785 786 from = sp->s_offset; 787 src_from = sp->s_buf_offset; 788 bytes = wc->w_count; 789 790 if (wc->w_large_pages) { 791 /* 792 * For cluster size < page size, we have to 793 * calculate pos within the cluster and obey 794 * the rightmost boundary. 795 */ 796 bytes = min(bytes, (unsigned long)(osb->s_clustersize 797 - (wc->w_pos & (osb->s_clustersize - 1)))); 798 } 799 to = from + bytes; 800 801 BUG_ON(from > PAGE_CACHE_SIZE); 802 BUG_ON(to > PAGE_CACHE_SIZE); 803 BUG_ON(from < cluster_start); 804 BUG_ON(to > cluster_end); 805 806 if (wc->w_this_page_new) 807 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 808 cluster_start, cluster_end, 1); 809 else 810 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 811 from, to, 0); 812 if (ret) { 813 mlog_errno(ret); 814 goto out; 815 } 816 817 src = buf->ops->map(sp->s_pipe, buf, 1); 818 dst = kmap_atomic(wc->w_this_page, KM_USER1); 819 memcpy(dst + from, src + src_from, bytes); 820 kunmap_atomic(wc->w_this_page, KM_USER1); 821 buf->ops->unmap(sp->s_pipe, buf, src); 822 823 wc->w_finished_copy = 1; 824 825 *ret_from = from; 826 *ret_to = to; 827 out: 828 829 return bytes ? (unsigned int)bytes : ret; 830 } 831 832 /* 833 * This will copy user data from the iovec in the buffered write 834 * context. 835 */ 836 int ocfs2_map_and_write_user_data(struct inode *inode, 837 struct ocfs2_write_ctxt *wc, u64 *p_blkno, 838 unsigned int *ret_from, unsigned int *ret_to) 839 { 840 int ret; 841 unsigned int to, from, cluster_start, cluster_end; 842 unsigned long bytes, src_from; 843 char *dst; 844 struct ocfs2_buffered_write_priv *bp = wc->w_private; 845 const struct iovec *cur_iov = bp->b_cur_iov; 846 char __user *buf; 847 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 848 849 ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, 850 &cluster_end); 851 852 buf = cur_iov->iov_base + bp->b_cur_off; 853 src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; 854 855 from = wc->w_pos & (PAGE_CACHE_SIZE - 1); 856 857 /* 858 * This is a lot of comparisons, but it reads quite 859 * easily, which is important here. 860 */ 861 /* Stay within the src page */ 862 bytes = PAGE_SIZE - src_from; 863 /* Stay within the vector */ 864 bytes = min(bytes, 865 (unsigned long)(cur_iov->iov_len - bp->b_cur_off)); 866 /* Stay within count */ 867 bytes = min(bytes, (unsigned long)wc->w_count); 868 /* 869 * For clustersize > page size, just stay within 870 * target page, otherwise we have to calculate pos 871 * within the cluster and obey the rightmost 872 * boundary. 873 */ 874 if (wc->w_large_pages) { 875 /* 876 * For cluster size < page size, we have to 877 * calculate pos within the cluster and obey 878 * the rightmost boundary. 879 */ 880 bytes = min(bytes, (unsigned long)(osb->s_clustersize 881 - (wc->w_pos & (osb->s_clustersize - 1)))); 882 } else { 883 /* 884 * cluster size > page size is the most common 885 * case - we just stay within the target page 886 * boundary. 887 */ 888 bytes = min(bytes, PAGE_CACHE_SIZE - from); 889 } 890 891 to = from + bytes; 892 893 BUG_ON(from > PAGE_CACHE_SIZE); 894 BUG_ON(to > PAGE_CACHE_SIZE); 895 BUG_ON(from < cluster_start); 896 BUG_ON(to > cluster_end); 897 898 if (wc->w_this_page_new) 899 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 900 cluster_start, cluster_end, 1); 901 else 902 ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, 903 from, to, 0); 904 if (ret) { 905 mlog_errno(ret); 906 goto out; 907 } 908 909 dst = kmap(wc->w_this_page); 910 memcpy(dst + from, bp->b_src_buf + src_from, bytes); 911 kunmap(wc->w_this_page); 912 913 /* 914 * XXX: This is slow, but simple. The caller of 915 * ocfs2_buffered_write_cluster() is responsible for 916 * passing through the iovecs, so it's difficult to 917 * predict what our next step is in here after our 918 * initial write. A future version should be pushing 919 * that iovec manipulation further down. 920 * 921 * By setting this, we indicate that a copy from user 922 * data was done, and subsequent calls for this 923 * cluster will skip copying more data. 924 */ 925 wc->w_finished_copy = 1; 926 927 *ret_from = from; 928 *ret_to = to; 929 out: 930 931 return bytes ? (unsigned int)bytes : ret; 932 } 933 934 /* 935 * Map, fill and write a page to disk. 936 * 937 * The work of copying data is done via callback. Newly allocated 938 * pages which don't take user data will be zero'd (set 'new' to 939 * indicate an allocating write) 940 * 941 * Returns a negative error code or the number of bytes copied into 942 * the page. 943 */ 944 static int ocfs2_write_data_page(struct inode *inode, handle_t *handle, 945 u64 *p_blkno, struct page *page, 946 struct ocfs2_write_ctxt *wc, int new) 947 { 948 int ret, copied = 0; 949 unsigned int from = 0, to = 0; 950 unsigned int cluster_start, cluster_end; 951 unsigned int zero_from = 0, zero_to = 0; 952 953 ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, 954 &cluster_start, &cluster_end); 955 956 if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index 957 && !wc->w_finished_copy) { 958 959 wc->w_this_page = page; 960 wc->w_this_page_new = new; 961 ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); 962 if (ret < 0) { 963 mlog_errno(ret); 964 goto out; 965 } 966 967 copied = ret; 968 969 zero_from = from; 970 zero_to = to; 971 if (new) { 972 from = cluster_start; 973 to = cluster_end; 974 } 975 } else { 976 /* 977 * If we haven't allocated the new page yet, we 978 * shouldn't be writing it out without copying user 979 * data. This is likely a math error from the caller. 980 */ 981 BUG_ON(!new); 982 983 from = cluster_start; 984 to = cluster_end; 985 986 ret = ocfs2_map_page_blocks(page, p_blkno, inode, 987 cluster_start, cluster_end, 1); 988 if (ret) { 989 mlog_errno(ret); 990 goto out; 991 } 992 } 993 994 /* 995 * Parts of newly allocated pages need to be zero'd. 996 * 997 * Above, we have also rewritten 'to' and 'from' - as far as 998 * the rest of the function is concerned, the entire cluster 999 * range inside of a page needs to be written. 1000 * 1001 * We can skip this if the page is up to date - it's already 1002 * been zero'd from being read in as a hole. 1003 */ 1004 if (new && !PageUptodate(page)) 1005 ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), 1006 wc->w_cpos, zero_from, zero_to); 1007 1008 flush_dcache_page(page); 1009 1010 if (ocfs2_should_order_data(inode)) { 1011 ret = walk_page_buffers(handle, 1012 page_buffers(page), 1013 from, to, NULL, 1014 ocfs2_journal_dirty_data); 1015 if (ret < 0) 1016 mlog_errno(ret); 1017 } 1018 1019 /* 1020 * We don't use generic_commit_write() because we need to 1021 * handle our own i_size update. 1022 */ 1023 ret = block_commit_write(page, from, to); 1024 if (ret) 1025 mlog_errno(ret); 1026 out: 1027 1028 return copied ? copied : ret; 1029 } 1030 1031 /* 1032 * Do the actual write of some data into an inode. Optionally allocate 1033 * in order to fulfill the write. 1034 * 1035 * cpos is the logical cluster offset within the file to write at 1036 * 1037 * 'phys' is the physical mapping of that offset. a 'phys' value of 1038 * zero indicates that allocation is required. In this case, data_ac 1039 * and meta_ac should be valid (meta_ac can be null if metadata 1040 * allocation isn't required). 1041 */ 1042 static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, 1043 struct buffer_head *di_bh, 1044 struct ocfs2_alloc_context *data_ac, 1045 struct ocfs2_alloc_context *meta_ac, 1046 struct ocfs2_write_ctxt *wc) 1047 { 1048 int ret, i, numpages = 1, new; 1049 unsigned int copied = 0; 1050 u32 tmp_pos; 1051 u64 v_blkno, p_blkno; 1052 struct address_space *mapping = file->f_mapping; 1053 struct inode *inode = mapping->host; 1054 unsigned long index, start; 1055 struct page **cpages; 1056 1057 new = phys == 0 ? 1 : 0; 1058 1059 /* 1060 * Figure out how many pages we'll be manipulating here. For 1061 * non allocating write, we just change the one 1062 * page. Otherwise, we'll need a whole clusters worth. 1063 */ 1064 if (new) 1065 numpages = ocfs2_pages_per_cluster(inode->i_sb); 1066 1067 cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS); 1068 if (!cpages) { 1069 ret = -ENOMEM; 1070 mlog_errno(ret); 1071 return ret; 1072 } 1073 1074 /* 1075 * Fill our page array first. That way we've grabbed enough so 1076 * that we can zero and flush if we error after adding the 1077 * extent. 1078 */ 1079 if (new) { 1080 start = ocfs2_align_clusters_to_page_index(inode->i_sb, 1081 wc->w_cpos); 1082 v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos); 1083 } else { 1084 start = wc->w_pos >> PAGE_CACHE_SHIFT; 1085 v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; 1086 } 1087 1088 for(i = 0; i < numpages; i++) { 1089 index = start + i; 1090 1091 cpages[i] = find_or_create_page(mapping, index, GFP_NOFS); 1092 if (!cpages[i]) { 1093 ret = -ENOMEM; 1094 mlog_errno(ret); 1095 goto out; 1096 } 1097 } 1098 1099 if (new) { 1100 /* 1101 * This is safe to call with the page locks - it won't take 1102 * any additional semaphores or cluster locks. 1103 */ 1104 tmp_pos = wc->w_cpos; 1105 ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, 1106 &tmp_pos, 1, di_bh, handle, 1107 data_ac, meta_ac, NULL); 1108 /* 1109 * This shouldn't happen because we must have already 1110 * calculated the correct meta data allocation required. The 1111 * internal tree allocation code should know how to increase 1112 * transaction credits itself. 1113 * 1114 * If need be, we could handle -EAGAIN for a 1115 * RESTART_TRANS here. 1116 */ 1117 mlog_bug_on_msg(ret == -EAGAIN, 1118 "Inode %llu: EAGAIN return during allocation.\n", 1119 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1120 if (ret < 0) { 1121 mlog_errno(ret); 1122 goto out; 1123 } 1124 } 1125 1126 ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, 1127 NULL); 1128 if (ret < 0) { 1129 1130 /* 1131 * XXX: Should we go readonly here? 1132 */ 1133 1134 mlog_errno(ret); 1135 goto out; 1136 } 1137 1138 BUG_ON(p_blkno == 0); 1139 1140 for(i = 0; i < numpages; i++) { 1141 ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], 1142 wc, new); 1143 if (ret < 0) { 1144 mlog_errno(ret); 1145 goto out; 1146 } 1147 1148 copied += ret; 1149 } 1150 1151 out: 1152 for(i = 0; i < numpages; i++) { 1153 unlock_page(cpages[i]); 1154 mark_page_accessed(cpages[i]); 1155 page_cache_release(cpages[i]); 1156 } 1157 kfree(cpages); 1158 1159 return copied ? copied : ret; 1160 } 1161 1162 static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, 1163 struct ocfs2_super *osb, loff_t pos, 1164 size_t count, ocfs2_page_writer *cb, 1165 void *cb_priv) 1166 { 1167 wc->w_count = count; 1168 wc->w_pos = pos; 1169 wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits; 1170 wc->w_finished_copy = 0; 1171 1172 if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) 1173 wc->w_large_pages = 1; 1174 else 1175 wc->w_large_pages = 0; 1176 1177 wc->w_write_data_page = cb; 1178 wc->w_private = cb_priv; 1179 } 1180 1181 /* 1182 * Write a cluster to an inode. The cluster may not be allocated yet, 1183 * in which case it will be. This only exists for buffered writes - 1184 * O_DIRECT takes a more "traditional" path through the kernel. 1185 * 1186 * The caller is responsible for incrementing pos, written counts, etc 1187 * 1188 * For file systems that don't support sparse files, pre-allocation 1189 * and page zeroing up until cpos should be done prior to this 1190 * function call. 1191 * 1192 * Callers should be holding i_sem, and the rw cluster lock. 1193 * 1194 * Returns the number of user bytes written, or less than zero for 1195 * error. 1196 */ 1197 ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, 1198 size_t count, ocfs2_page_writer *actor, 1199 void *priv) 1200 { 1201 int ret, credits = OCFS2_INODE_UPDATE_CREDITS; 1202 ssize_t written = 0; 1203 u32 phys; 1204 struct inode *inode = file->f_mapping->host; 1205 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1206 struct buffer_head *di_bh = NULL; 1207 struct ocfs2_dinode *di; 1208 struct ocfs2_alloc_context *data_ac = NULL; 1209 struct ocfs2_alloc_context *meta_ac = NULL; 1210 handle_t *handle; 1211 struct ocfs2_write_ctxt wc; 1212 1213 ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv); 1214 1215 ret = ocfs2_meta_lock(inode, &di_bh, 1); 1216 if (ret) { 1217 mlog_errno(ret); 1218 goto out; 1219 } 1220 di = (struct ocfs2_dinode *)di_bh->b_data; 1221 1222 /* 1223 * Take alloc sem here to prevent concurrent lookups. That way 1224 * the mapping, zeroing and tree manipulation within 1225 * ocfs2_write() will be safe against ->readpage(). This 1226 * should also serve to lock out allocation from a shared 1227 * writeable region. 1228 */ 1229 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1230 1231 ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL); 1232 if (ret) { 1233 mlog_errno(ret); 1234 goto out_meta; 1235 } 1236 1237 /* phys == 0 means that allocation is required. */ 1238 if (phys == 0) { 1239 ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); 1240 if (ret) { 1241 mlog_errno(ret); 1242 goto out_meta; 1243 } 1244 1245 credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); 1246 } 1247 1248 ret = ocfs2_data_lock(inode, 1); 1249 if (ret) { 1250 mlog_errno(ret); 1251 goto out_meta; 1252 } 1253 1254 handle = ocfs2_start_trans(osb, credits); 1255 if (IS_ERR(handle)) { 1256 ret = PTR_ERR(handle); 1257 mlog_errno(ret); 1258 goto out_data; 1259 } 1260 1261 written = ocfs2_write(file, phys, handle, di_bh, data_ac, 1262 meta_ac, &wc); 1263 if (written < 0) { 1264 ret = written; 1265 mlog_errno(ret); 1266 goto out_commit; 1267 } 1268 1269 ret = ocfs2_journal_access(handle, inode, di_bh, 1270 OCFS2_JOURNAL_ACCESS_WRITE); 1271 if (ret) { 1272 mlog_errno(ret); 1273 goto out_commit; 1274 } 1275 1276 pos += written; 1277 if (pos > inode->i_size) { 1278 i_size_write(inode, pos); 1279 mark_inode_dirty(inode); 1280 } 1281 inode->i_blocks = ocfs2_inode_sector_count(inode); 1282 di->i_size = cpu_to_le64((u64)i_size_read(inode)); 1283 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1284 di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); 1285 di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 1286 1287 ret = ocfs2_journal_dirty(handle, di_bh); 1288 if (ret) 1289 mlog_errno(ret); 1290 1291 out_commit: 1292 ocfs2_commit_trans(osb, handle); 1293 1294 out_data: 1295 ocfs2_data_unlock(inode, 1); 1296 1297 out_meta: 1298 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1299 ocfs2_meta_unlock(inode, 1); 1300 1301 out: 1302 brelse(di_bh); 1303 if (data_ac) 1304 ocfs2_free_alloc_context(data_ac); 1305 if (meta_ac) 1306 ocfs2_free_alloc_context(meta_ac); 1307 1308 return written ? written : ret; 1309 } 1310 1311 const struct address_space_operations ocfs2_aops = { 1312 .readpage = ocfs2_readpage, 1313 .writepage = ocfs2_writepage, 1314 .bmap = ocfs2_bmap, 1315 .sync_page = block_sync_page, 1316 .direct_IO = ocfs2_direct_IO, 1317 .invalidatepage = ocfs2_invalidatepage, 1318 .releasepage = ocfs2_releasepage, 1319 .migratepage = buffer_migrate_page, 1320 }; 1321