1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * refcounttree.c 5 * 6 * Copyright (C) 2009 Oracle. All rights reserved. 7 * 8 * This program is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU General Public 10 * License version 2 as published by the Free Software Foundation. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * General Public License for more details. 16 */ 17 18 #include <linux/sort.h> 19 #define MLOG_MASK_PREFIX ML_REFCOUNT 20 #include <cluster/masklog.h> 21 #include "ocfs2.h" 22 #include "inode.h" 23 #include "alloc.h" 24 #include "suballoc.h" 25 #include "journal.h" 26 #include "uptodate.h" 27 #include "super.h" 28 #include "buffer_head_io.h" 29 #include "blockcheck.h" 30 #include "refcounttree.h" 31 #include "sysfile.h" 32 #include "dlmglue.h" 33 #include "extent_map.h" 34 #include "aops.h" 35 #include "xattr.h" 36 #include "namei.h" 37 38 #include <linux/bio.h> 39 #include <linux/blkdev.h> 40 #include <linux/gfp.h> 41 #include <linux/slab.h> 42 #include <linux/writeback.h> 43 #include <linux/pagevec.h> 44 #include <linux/swap.h> 45 #include <linux/security.h> 46 #include <linux/fsnotify.h> 47 #include <linux/quotaops.h> 48 #include <linux/namei.h> 49 #include <linux/mount.h> 50 51 struct ocfs2_cow_context { 52 struct inode *inode; 53 u32 cow_start; 54 u32 cow_len; 55 struct ocfs2_extent_tree data_et; 56 struct ocfs2_refcount_tree *ref_tree; 57 struct buffer_head *ref_root_bh; 58 struct ocfs2_alloc_context *meta_ac; 59 struct ocfs2_alloc_context *data_ac; 60 struct ocfs2_cached_dealloc_ctxt dealloc; 61 void *cow_object; 62 struct ocfs2_post_refcount *post_refcount; 63 int extra_credits; 64 int (*get_clusters)(struct ocfs2_cow_context *context, 65 u32 v_cluster, u32 *p_cluster, 66 u32 *num_clusters, 67 
					    unsigned int *extent_flags);
	int (*cow_duplicate_clusters)(handle_t *handle,
				      struct ocfs2_cow_context *context,
				      u32 cpos, u32 old_cluster,
				      u32 new_cluster, u32 new_len);
};

/* Map a caching_info back to the refcount tree that embeds it. */
static inline struct ocfs2_refcount_tree *
cache_info_to_refcount(struct ocfs2_caching_info *ci)
{
	return container_of(ci, struct ocfs2_refcount_tree, rf_ci);
}

/*
 * Sanity-check a refcount block read off disk: ecc checksum, block
 * signature, self-pointing rf_blkno and filesystem generation.  Used as
 * the validation callback passed to ocfs2_read_block().
 */
static int ocfs2_validate_refcount_block(struct super_block *sb,
					 struct buffer_head *bh)
{
	int rc;
	struct ocfs2_refcount_block *rb =
		(struct ocfs2_refcount_block *)bh->b_data;

	mlog(0, "Validating refcount block %llu\n",
	     (unsigned long long)bh->b_blocknr);

	BUG_ON(!buffer_uptodate(bh));

	/*
	 * If the ecc fails, we return the error but otherwise
	 * leave the filesystem running. We know any error is
	 * local to this block.
	 */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check);
	if (rc) {
		mlog(ML_ERROR, "Checksum failed for refcount block %llu\n",
		     (unsigned long long)bh->b_blocknr);
		return rc;
	}

	if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
		ocfs2_error(sb,
			    "Refcount block #%llu has bad signature %.*s",
			    (unsigned long long)bh->b_blocknr, 7,
			    rb->rf_signature);
		return -EINVAL;
	}

	if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
		ocfs2_error(sb,
			    "Refcount block #%llu has an invalid rf_blkno "
			    "of %llu",
			    (unsigned long long)bh->b_blocknr,
			    (unsigned long long)le64_to_cpu(rb->rf_blkno));
		return -EINVAL;
	}

	if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
		ocfs2_error(sb,
			    "Refcount block #%llu has an invalid "
			    "rf_fs_generation of #%u",
			    (unsigned long long)bh->b_blocknr,
			    le32_to_cpu(rb->rf_fs_generation));
		return -EINVAL;
	}

	return 0;
}

/*
 * Read (or fetch from cache) the refcount block at rb_blkno, running
 * ocfs2_validate_refcount_block() on it.  *bh may be NULL on entry, in
 * which case a new buffer_head is returned and owned by the caller.
 */
static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
				     u64 rb_blkno,
				     struct buffer_head **bh)
{
	int rc;
	struct buffer_head *tmp = *bh;

	rc = ocfs2_read_block(ci, rb_blkno, &tmp,
			      ocfs2_validate_refcount_block);

	/* If ocfs2_read_block() got us a new bh, pass it up. */
	if (!rc && !*bh)
		*bh = tmp;

	return rc;
}

/* Metadata-cache callbacks: the cache "owner" is the tree's block number. */
static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
{
	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);

	return rf->rf_blkno;
}

static struct super_block *
ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
{
	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);

	return rf->rf_sb;
}

/* Short-term cache metadata protection: a spinlock. */
static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
{
	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);

	spin_lock(&rf->rf_lock);
}

static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
{
	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);

	spin_unlock(&rf->rf_lock);
}

/* I/O-time protection: a mutex, since block I/O can sleep. */
static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
{
	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);

	mutex_lock(&rf->rf_io_mutex);
}

static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
{
	struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);

	mutex_unlock(&rf->rf_io_mutex);
}

static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
	.co_owner		= ocfs2_refcount_cache_owner,
	.co_get_super		= ocfs2_refcount_cache_get_super,
	.co_cache_lock		= ocfs2_refcount_cache_lock,
	.co_cache_unlock	= ocfs2_refcount_cache_unlock,
	.co_io_lock		= ocfs2_refcount_cache_io_lock,
	.co_io_unlock		= ocfs2_refcount_cache_io_unlock,
};

/*
 * Look up the in-memory refcount tree for blkno in the per-super rb-tree.
 * Callers in this file invoke this under osb->osb_lock.
 */
static struct ocfs2_refcount_tree *
ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
{
	struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
	struct ocfs2_refcount_tree *tree = NULL;

	while (n) {
		tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);

		if (blkno < tree->rf_blkno)
			n = n->rb_left;
		else if (blkno > tree->rf_blkno)
			n = n->rb_right;
		else
			return tree;
	}

	return NULL;
}

/* osb_lock is already locked. */
static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
				       struct ocfs2_refcount_tree *new)
{
	u64 rf_blkno = new->rf_blkno;
	struct rb_node *parent = NULL;
	struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
	struct ocfs2_refcount_tree *tmp;

	while (*p) {
		parent = *p;

		tmp = rb_entry(parent, struct ocfs2_refcount_tree,
			       rf_node);

		if (rf_blkno < tmp->rf_blkno)
			p = &(*p)->rb_left;
		else if (rf_blkno > tmp->rf_blkno)
			p = &(*p)->rb_right;
		else {
			/* This should never happen! */
			mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
			     (unsigned long long)rf_blkno);
			BUG();
		}
	}

	rb_link_node(&new->rf_node, parent, p);
	rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
}

/*
 * Final teardown of a refcount tree object: drop its metadata cache,
 * release its cluster lock resource, then free the memory.
 */
static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree)
{
	ocfs2_metadata_cache_exit(&tree->rf_ci);
	ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres);
	ocfs2_lock_res_free(&tree->rf_lockres);
	kfree(tree);
}

/* Remove tree from the rb-tree; caller must hold osb->osb_lock. */
static inline void
ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb,
					struct ocfs2_refcount_tree *tree)
{
	rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
	/* Also invalidate the one-entry LRU if it points at this tree. */
	if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree)
		osb->osb_ref_tree_lru = NULL;
}

static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
					struct ocfs2_refcount_tree *tree)
{
	spin_lock(&osb->osb_lock);
	ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
	spin_unlock(&osb->osb_lock);
}

/* kref release callback: last reference gone, destroy the tree. */
static void ocfs2_kref_remove_refcount_tree(struct kref *kref)
{
	struct ocfs2_refcount_tree *tree =
		container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);

	ocfs2_free_refcount_tree(tree);
}

static inline void
ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree)
{
	kref_get(&tree->rf_getcnt);
}

static inline void
ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree)
{
	kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree);
}

/* Wire up the metadata cache and its locks for a new tree object. */
static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new,
					       struct super_block *sb)
{
	ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops);
	mutex_init(&new->rf_io_mutex);
	new->rf_sb = sb;
	spin_lock_init(&new->rf_lock);
}

/*
 * Set up the local rwsem and the cluster lock resource.  Kept separate
 * from _ci init because the lock res needs the on-disk generation,
 * which is not always known at allocation time.
 */
static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
					struct ocfs2_refcount_tree *new,
					u64 rf_blkno, u32 generation)
{
	init_rwsem(&new->rf_sem);
	ocfs2_refcount_lock_res_init(&new->rf_lockres, osb,
				     rf_blkno, generation);
}

/* Allocate and partially initialize a tree object; returns NULL on OOM. */
static struct ocfs2_refcount_tree*
ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno)
{
	struct ocfs2_refcount_tree *new;

	new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
	if (!new)
		return NULL;

	new->rf_blkno = rf_blkno;
	kref_init(&new->rf_getcnt);
	ocfs2_init_refcount_tree_ci(new, osb->sb);

	return new;
}

/*
 * Find-or-create the in-memory tree for rf_blkno.  Fast path checks the
 * one-entry LRU, then the rb-tree, both under osb_lock; slow path drops
 * the lock, builds a new object and re-checks for a racing insert.
 */
static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
				   struct ocfs2_refcount_tree **ret_tree)
{
	int ret = 0;
	struct ocfs2_refcount_tree *tree, *new = NULL;
	struct buffer_head *ref_root_bh = NULL;
	struct ocfs2_refcount_block *ref_rb;

	spin_lock(&osb->osb_lock);
	if (osb->osb_ref_tree_lru &&
	    osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
		tree = osb->osb_ref_tree_lru;
	else
		tree = ocfs2_find_refcount_tree(osb, rf_blkno);
	if (tree)
		goto out;

	spin_unlock(&osb->osb_lock);

	new =
ocfs2_allocate_refcount_tree(osb, rf_blkno);
	if (!new) {
		ret = -ENOMEM;
		mlog_errno(ret);
		return ret;
	}
	/*
	 * We need the generation to create the refcount tree lock and since
	 * it isn't changed during the tree modification, we are safe here to
	 * read without protection.
	 * We also have to purge the cache after we create the lock since the
	 * refcount block may have the stale data. It can only be trusted when
	 * we hold the refcount lock.
	 */
	ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh);
	if (ret) {
		mlog_errno(ret);
		ocfs2_metadata_cache_exit(&new->rf_ci);
		kfree(new);
		return ret;
	}

	ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
	new->rf_generation = le32_to_cpu(ref_rb->rf_generation);
	ocfs2_init_refcount_tree_lock(osb, new, rf_blkno,
				      new->rf_generation);
	ocfs2_metadata_cache_purge(&new->rf_ci);

	/* Re-check under osb_lock in case someone inserted while we slept. */
	spin_lock(&osb->osb_lock);
	tree = ocfs2_find_refcount_tree(osb, rf_blkno);
	if (tree)
		goto out;

	ocfs2_insert_refcount_tree(osb, new);

	tree = new;
	new = NULL;

out:
	*ret_tree = tree;

	osb->osb_ref_tree_lru = tree;

	spin_unlock(&osb->osb_lock);

	/* We lost the insert race; free our unused copy. */
	if (new)
		ocfs2_free_refcount_tree(new);

	brelse(ref_root_bh);
	return ret;
}

/* Fetch i_refcount_loc from the inode's dinode block. */
static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
{
	int ret;
	struct buffer_head *di_bh = NULL;
	struct ocfs2_dinode *di;

	ret = ocfs2_read_inode_block(inode, &di_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));

	di = (struct ocfs2_dinode *)di_bh->b_data;
	*ref_blkno = le64_to_cpu(di->i_refcount_loc);
	brelse(di_bh);
out:
	return ret;
}

/* Take the cluster lock, then the local rwsem (write if rw, else read). */
static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
				      struct ocfs2_refcount_tree *tree, int rw)
{
	int ret;

	ret = ocfs2_refcount_lock(tree, rw);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (rw)
		down_write(&tree->rf_sem);
	else
		down_read(&tree->rf_sem);

out:
	return ret;
}

/*
 * Lock the refcount tree pointed by ref_blkno and return the tree.
 * In most case, we lock the tree and read the refcount block.
 * So read it here if the caller really needs it.
 *
 * If the tree has been re-created by other node, it will free the
 * old one and re-create it.
 */
int ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
			     u64 ref_blkno, int rw,
			     struct ocfs2_refcount_tree **ret_tree,
			     struct buffer_head **ref_bh)
{
	int ret, delete_tree = 0;
	struct ocfs2_refcount_tree *tree = NULL;
	struct buffer_head *ref_root_bh = NULL;
	struct ocfs2_refcount_block *rb;

again:
	ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
	if (ret) {
		mlog_errno(ret);
		return ret;
	}

	ocfs2_refcount_tree_get(tree);

	ret = __ocfs2_lock_refcount_tree(osb, tree, rw);
	if (ret) {
		mlog_errno(ret);
		ocfs2_refcount_tree_put(tree);
		goto out;
	}

	ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
					&ref_root_bh);
	if (ret) {
		mlog_errno(ret);
		ocfs2_unlock_refcount_tree(osb, tree, rw);
		ocfs2_refcount_tree_put(tree);
		goto out;
	}

	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
	/*
	 * If the refcount block has been freed and re-created, we may need
	 * to recreate the refcount tree also.
	 *
	 * Here we just remove the tree from the rb-tree, and the last
	 * kref holder will unlock and delete this refcount_tree.
	 * Then we goto "again" and ocfs2_get_refcount_tree will create
	 * the new refcount tree for us.
	 */
	if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) {
		if (!tree->rf_removed) {
			ocfs2_erase_refcount_tree_from_list(osb, tree);
			tree->rf_removed = 1;
			delete_tree = 1;
		}

		ocfs2_unlock_refcount_tree(osb, tree, rw);
		/*
		 * We get an extra reference when we create the refcount
		 * tree, so another put will destroy it.
		 */
		if (delete_tree)
			ocfs2_refcount_tree_put(tree);
		brelse(ref_root_bh);
		ref_root_bh = NULL;
		goto again;
	}

	*ret_tree = tree;
	if (ref_bh) {
		/* Hand the root bh to the caller, who now owns it. */
		*ref_bh = ref_root_bh;
		ref_root_bh = NULL;
	}
out:
	brelse(ref_root_bh);
	return ret;
}

/* Drop the rwsem, the cluster lock, and the reference taken at lock time. */
void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
				struct ocfs2_refcount_tree *tree, int rw)
{
	if (rw)
		up_write(&tree->rf_sem);
	else
		up_read(&tree->rf_sem);

	ocfs2_refcount_unlock(tree, rw);
	ocfs2_refcount_tree_put(tree);
}

/* Tear down every cached refcount tree; called at unmount time. */
void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
{
	struct rb_node *node;
	struct ocfs2_refcount_tree *tree;
	struct rb_root *root = &osb->osb_rf_lock_tree;

	while ((node = rb_last(root)) != NULL) {
		tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);

		mlog(0, "Purge tree %llu\n",
		     (unsigned long long) tree->rf_blkno);

		rb_erase(&tree->rf_node, root);
		ocfs2_free_refcount_tree(tree);
	}
}

/*
 * Create a refcount tree for an inode.
 * We take for granted that the inode is already locked.
559 */ 560 static int ocfs2_create_refcount_tree(struct inode *inode, 561 struct buffer_head *di_bh) 562 { 563 int ret; 564 handle_t *handle = NULL; 565 struct ocfs2_alloc_context *meta_ac = NULL; 566 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 567 struct ocfs2_inode_info *oi = OCFS2_I(inode); 568 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 569 struct buffer_head *new_bh = NULL; 570 struct ocfs2_refcount_block *rb; 571 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL; 572 u16 suballoc_bit_start; 573 u32 num_got; 574 u64 first_blkno; 575 576 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); 577 578 mlog(0, "create tree for inode %lu\n", inode->i_ino); 579 580 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); 581 if (ret) { 582 mlog_errno(ret); 583 goto out; 584 } 585 586 handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS); 587 if (IS_ERR(handle)) { 588 ret = PTR_ERR(handle); 589 mlog_errno(ret); 590 goto out; 591 } 592 593 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 594 OCFS2_JOURNAL_ACCESS_WRITE); 595 if (ret) { 596 mlog_errno(ret); 597 goto out_commit; 598 } 599 600 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, 601 &suballoc_bit_start, &num_got, 602 &first_blkno); 603 if (ret) { 604 mlog_errno(ret); 605 goto out_commit; 606 } 607 608 new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno); 609 if (!new_tree) { 610 ret = -ENOMEM; 611 mlog_errno(ret); 612 goto out_commit; 613 } 614 615 new_bh = sb_getblk(inode->i_sb, first_blkno); 616 ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh); 617 618 ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh, 619 OCFS2_JOURNAL_ACCESS_CREATE); 620 if (ret) { 621 mlog_errno(ret); 622 goto out_commit; 623 } 624 625 /* Initialize ocfs2_refcount_block. 
*/ 626 rb = (struct ocfs2_refcount_block *)new_bh->b_data; 627 memset(rb, 0, inode->i_sb->s_blocksize); 628 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 629 rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num); 630 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 631 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); 632 rb->rf_blkno = cpu_to_le64(first_blkno); 633 rb->rf_count = cpu_to_le32(1); 634 rb->rf_records.rl_count = 635 cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb)); 636 spin_lock(&osb->osb_lock); 637 rb->rf_generation = osb->s_next_generation++; 638 spin_unlock(&osb->osb_lock); 639 640 ocfs2_journal_dirty(handle, new_bh); 641 642 spin_lock(&oi->ip_lock); 643 oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL; 644 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 645 di->i_refcount_loc = cpu_to_le64(first_blkno); 646 spin_unlock(&oi->ip_lock); 647 648 mlog(0, "created tree for inode %lu, refblock %llu\n", 649 inode->i_ino, (unsigned long long)first_blkno); 650 651 ocfs2_journal_dirty(handle, di_bh); 652 653 /* 654 * We have to init the tree lock here since it will use 655 * the generation number to create it. 656 */ 657 new_tree->rf_generation = le32_to_cpu(rb->rf_generation); 658 ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno, 659 new_tree->rf_generation); 660 661 spin_lock(&osb->osb_lock); 662 tree = ocfs2_find_refcount_tree(osb, first_blkno); 663 664 /* 665 * We've just created a new refcount tree in this block. If 666 * we found a refcount tree on the ocfs2_super, it must be 667 * one we just deleted. We free the old tree before 668 * inserting the new tree. 
669 */ 670 BUG_ON(tree && tree->rf_generation == new_tree->rf_generation); 671 if (tree) 672 ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree); 673 ocfs2_insert_refcount_tree(osb, new_tree); 674 spin_unlock(&osb->osb_lock); 675 new_tree = NULL; 676 if (tree) 677 ocfs2_refcount_tree_put(tree); 678 679 out_commit: 680 ocfs2_commit_trans(osb, handle); 681 682 out: 683 if (new_tree) { 684 ocfs2_metadata_cache_exit(&new_tree->rf_ci); 685 kfree(new_tree); 686 } 687 688 brelse(new_bh); 689 if (meta_ac) 690 ocfs2_free_alloc_context(meta_ac); 691 692 return ret; 693 } 694 695 static int ocfs2_set_refcount_tree(struct inode *inode, 696 struct buffer_head *di_bh, 697 u64 refcount_loc) 698 { 699 int ret; 700 handle_t *handle = NULL; 701 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 702 struct ocfs2_inode_info *oi = OCFS2_I(inode); 703 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 704 struct buffer_head *ref_root_bh = NULL; 705 struct ocfs2_refcount_block *rb; 706 struct ocfs2_refcount_tree *ref_tree; 707 708 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); 709 710 ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, 711 &ref_tree, &ref_root_bh); 712 if (ret) { 713 mlog_errno(ret); 714 return ret; 715 } 716 717 handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS); 718 if (IS_ERR(handle)) { 719 ret = PTR_ERR(handle); 720 mlog_errno(ret); 721 goto out; 722 } 723 724 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 725 OCFS2_JOURNAL_ACCESS_WRITE); 726 if (ret) { 727 mlog_errno(ret); 728 goto out_commit; 729 } 730 731 ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh, 732 OCFS2_JOURNAL_ACCESS_WRITE); 733 if (ret) { 734 mlog_errno(ret); 735 goto out_commit; 736 } 737 738 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; 739 le32_add_cpu(&rb->rf_count, 1); 740 741 ocfs2_journal_dirty(handle, ref_root_bh); 742 743 spin_lock(&oi->ip_lock); 744 oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL; 745 
	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
	di->i_refcount_loc = cpu_to_le64(refcount_loc);
	spin_unlock(&oi->ip_lock);
	ocfs2_journal_dirty(handle, di_bh);

out_commit:
	ocfs2_commit_trans(osb, handle);
out:
	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
	brelse(ref_root_bh);

	return ret;
}

/*
 * Detach the refcount tree from this inode.  If we are the last user
 * (rf_count drops to zero) the refcount block itself is freed back to
 * the extent allocator, which is locked ahead of the transaction.
 */
int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
{
	int ret, delete_tree = 0;
	handle_t *handle = NULL;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_refcount_block *rb;
	struct inode *alloc_inode = NULL;
	struct buffer_head *alloc_bh = NULL;
	struct buffer_head *blk_bh = NULL;
	struct ocfs2_refcount_tree *ref_tree;
	int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS;
	u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
	u16 bit = 0;

	/* Nothing to do if the inode never had a refcount tree. */
	if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL))
		return 0;

	BUG_ON(!ref_blkno);
	ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh);
	if (ret) {
		mlog_errno(ret);
		return ret;
	}

	rb = (struct ocfs2_refcount_block *)blk_bh->b_data;

	/*
	 * If we are the last user, we need to free the block.
	 * So lock the allocator ahead.
	 */
	if (le32_to_cpu(rb->rf_count) == 1) {
		blk = le64_to_cpu(rb->rf_blkno);
		bit = le16_to_cpu(rb->rf_suballoc_bit);
		bg_blkno = ocfs2_which_suballoc_group(blk, bit);

		alloc_inode = ocfs2_get_system_file_inode(osb,
					EXTENT_ALLOC_SYSTEM_INODE,
					le16_to_cpu(rb->rf_suballoc_slot));
		if (!alloc_inode) {
			ret = -ENOMEM;
			mlog_errno(ret);
			goto out;
		}
		mutex_lock(&alloc_inode->i_mutex);

		ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
		if (ret) {
			mlog_errno(ret);
			goto out_mutex;
		}

		/* One more transaction credit for the suballoc free. */
		credits += OCFS2_SUBALLOC_FREE;
	}

	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_unlock;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/* Clear the refcount link in the dinode. */
	spin_lock(&oi->ip_lock);
	oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL;
	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
	di->i_refcount_loc = 0;
	spin_unlock(&oi->ip_lock);
	ocfs2_journal_dirty(handle, di_bh);

	le32_add_cpu(&rb->rf_count, -1);
	ocfs2_journal_dirty(handle, blk_bh);

	if (!rb->rf_count) {
		/* Last user: free the refcount block itself. */
		delete_tree = 1;
		ocfs2_erase_refcount_tree_from_list(osb, ref_tree);
		ret = ocfs2_free_suballoc_bits(handle, alloc_inode,
					       alloc_bh, bit, bg_blkno, 1);
		if (ret)
			mlog_errno(ret);
	}

out_commit:
	ocfs2_commit_trans(osb, handle);
out_unlock:
	if (alloc_inode) {
		ocfs2_inode_unlock(alloc_inode, 1);
		brelse(alloc_bh);
	}
out_mutex:
	if (alloc_inode) {
		mutex_unlock(&alloc_inode->i_mutex);
		iput(alloc_inode);
	}
out:
	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
	/* Drop the creation-time reference so the tree object dies. */
	if (delete_tree)
		ocfs2_refcount_tree_put(ref_tree);
	brelse(blk_bh);

	return ret;
}

/*
 * Linear scan of the record list in ref_leaf_bh for the record covering
 * cpos.  If cpos falls in a hole, *ret_rec is faked with r_refcount = 0
 * and a length clamped to the start of the next record (or len).
 * *index is set to the position scanned to in either case.
 */
static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci,
					  struct buffer_head *ref_leaf_bh,
					  u64 cpos, unsigned int len,
					  struct ocfs2_refcount_rec *ret_rec,
					  int *index)
{
	int i = 0;
	struct ocfs2_refcount_block *rb =
		(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
	struct ocfs2_refcount_rec *rec = NULL;

	for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) {
		rec = &rb->rf_records.rl_recs[i];

		if (le64_to_cpu(rec->r_cpos) +
		    le32_to_cpu(rec->r_clusters) <= cpos)
			continue;
		else if (le64_to_cpu(rec->r_cpos) > cpos)
			break;

		/* ok, cpos fail in this rec. Just return. */
		if (ret_rec)
			*ret_rec = *rec;
		goto out;
	}

	if (ret_rec) {
		/* We meet with a hole here, so fake the rec. */
		ret_rec->r_cpos = cpu_to_le64(cpos);
		ret_rec->r_refcount = 0;
		if (i < le16_to_cpu(rb->rf_records.rl_used) &&
		    le64_to_cpu(rec->r_cpos) < cpos + len)
			ret_rec->r_clusters =
				cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos);
		else
			ret_rec->r_clusters = cpu_to_le32(len);
	}

out:
	*index = i;
}

/*
 * Try to remove refcount tree. The mechanism is:
 * 1) Check whether i_clusters == 0, if no, exit.
 * 2) check whether we have i_xattr_loc in dinode. if yes, exit.
 * 3) Check whether we have inline xattr stored outside, if yes, exit.
 * 4) Remove the tree.
924 */ 925 int ocfs2_try_remove_refcount_tree(struct inode *inode, 926 struct buffer_head *di_bh) 927 { 928 int ret; 929 struct ocfs2_inode_info *oi = OCFS2_I(inode); 930 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 931 932 down_write(&oi->ip_xattr_sem); 933 down_write(&oi->ip_alloc_sem); 934 935 if (oi->ip_clusters) 936 goto out; 937 938 if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc) 939 goto out; 940 941 if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL && 942 ocfs2_has_inline_xattr_value_outside(inode, di)) 943 goto out; 944 945 ret = ocfs2_remove_refcount_tree(inode, di_bh); 946 if (ret) 947 mlog_errno(ret); 948 out: 949 up_write(&oi->ip_alloc_sem); 950 up_write(&oi->ip_xattr_sem); 951 return 0; 952 } 953 954 /* 955 * Find the end range for a leaf refcount block indicated by 956 * el->l_recs[index].e_blkno. 957 */ 958 static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci, 959 struct buffer_head *ref_root_bh, 960 struct ocfs2_extent_block *eb, 961 struct ocfs2_extent_list *el, 962 int index, u32 *cpos_end) 963 { 964 int ret, i, subtree_root; 965 u32 cpos; 966 u64 blkno; 967 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 968 struct ocfs2_path *left_path = NULL, *right_path = NULL; 969 struct ocfs2_extent_tree et; 970 struct ocfs2_extent_list *tmp_el; 971 972 if (index < le16_to_cpu(el->l_next_free_rec) - 1) { 973 /* 974 * We have a extent rec after index, so just use the e_cpos 975 * of the next extent rec. 976 */ 977 *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos); 978 return 0; 979 } 980 981 if (!eb || (eb && !eb->h_next_leaf_blk)) { 982 /* 983 * We are the last extent rec, so any high cpos should 984 * be stored in this leaf refcount block. 
985 */ 986 *cpos_end = UINT_MAX; 987 return 0; 988 } 989 990 /* 991 * If the extent block isn't the last one, we have to find 992 * the subtree root between this extent block and the next 993 * leaf extent block and get the corresponding e_cpos from 994 * the subroot. Otherwise we may corrupt the b-tree. 995 */ 996 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); 997 998 left_path = ocfs2_new_path_from_et(&et); 999 if (!left_path) { 1000 ret = -ENOMEM; 1001 mlog_errno(ret); 1002 goto out; 1003 } 1004 1005 cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos); 1006 ret = ocfs2_find_path(ci, left_path, cpos); 1007 if (ret) { 1008 mlog_errno(ret); 1009 goto out; 1010 } 1011 1012 right_path = ocfs2_new_path_from_path(left_path); 1013 if (!right_path) { 1014 ret = -ENOMEM; 1015 mlog_errno(ret); 1016 goto out; 1017 } 1018 1019 ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos); 1020 if (ret) { 1021 mlog_errno(ret); 1022 goto out; 1023 } 1024 1025 ret = ocfs2_find_path(ci, right_path, cpos); 1026 if (ret) { 1027 mlog_errno(ret); 1028 goto out; 1029 } 1030 1031 subtree_root = ocfs2_find_subtree_root(&et, left_path, 1032 right_path); 1033 1034 tmp_el = left_path->p_node[subtree_root].el; 1035 blkno = left_path->p_node[subtree_root+1].bh->b_blocknr; 1036 for (i = 0; i < le32_to_cpu(tmp_el->l_next_free_rec); i++) { 1037 if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) { 1038 *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos); 1039 break; 1040 } 1041 } 1042 1043 BUG_ON(i == le32_to_cpu(tmp_el->l_next_free_rec)); 1044 1045 out: 1046 ocfs2_free_path(left_path); 1047 ocfs2_free_path(right_path); 1048 return ret; 1049 } 1050 1051 /* 1052 * Given a cpos and len, try to find the refcount record which contains cpos. 1053 * 1. If cpos can be found in one refcount record, return the record. 1054 * 2. If cpos can't be found, return a fake record which start from cpos 1055 * and end at a small value between cpos+len and start of the next record. 
 *    This fake record has r_refcount = 0.
 */
static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
				  struct buffer_head *ref_root_bh,
				  u64 cpos, unsigned int len,
				  struct ocfs2_refcount_rec *ret_rec,
				  int *index,
				  struct buffer_head **ret_bh)
{
	int ret = 0, i, found;
	u32 low_cpos, uninitialized_var(cpos_end);
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec = NULL;
	struct ocfs2_extent_block *eb = NULL;
	struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
	struct ocfs2_refcount_block *rb =
		(struct ocfs2_refcount_block *)ref_root_bh->b_data;

	if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) {
		/* Records live inline in the root block; no b-tree walk. */
		ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len,
					      ret_rec, index);
		*ret_bh = ref_root_bh;
		get_bh(ref_root_bh);
		return 0;
	}

	el = &rb->rf_list;
	low_cpos = cpos & OCFS2_32BIT_POS_MASK;

	if (el->l_tree_depth) {
		ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
		el = &eb->h_list;

		if (el->l_tree_depth) {
			ocfs2_error(sb,
				    "refcount tree %llu has non zero tree "
				    "depth in leaf btree tree block %llu\n",
				    (unsigned long long)ocfs2_metadata_cache_owner(ci),
				    (unsigned long long)eb_bh->b_blocknr);
			ret = -EROFS;
			goto out;
		}
	}

	/* Scan backwards for the last extent rec starting at or before cpos. */
	found = 0;
	for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
		rec = &el->l_recs[i];

		if (le32_to_cpu(rec->e_cpos) <= low_cpos) {
			found = 1;
			break;
		}
	}

	if (found) {
		ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh,
						  eb, el, i, &cpos_end);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* Clamp len so it does not run past this leaf's range. */
		if (cpos_end < low_cpos + len)
			len = cpos_end - low_cpos;
	}

	ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
					&ref_leaf_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len,
				      ret_rec, index);
	*ret_bh = ref_leaf_bh;
out:
	brelse(eb_bh);
	return ret;
}

/* How a record relates to its neighbours for merging purposes. */
enum ocfs2_ref_rec_contig {
	REF_CONTIG_NONE = 0,
	REF_CONTIG_LEFT,
	REF_CONTIG_RIGHT,
	REF_CONTIG_LEFTRIGHT,
};

/*
 * Report whether rec[index] and rec[index + 1] have the same refcount
 * and cover adjacent cluster ranges (i.e. can be merged).
 */
static enum ocfs2_ref_rec_contig
ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb,
			    int index)
{
	if ((rb->rf_records.rl_recs[index].r_refcount ==
	     rb->rf_records.rl_recs[index + 1].r_refcount) &&
	    (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) +
	     le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) ==
	     le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos)))
		return REF_CONTIG_RIGHT;

	return REF_CONTIG_NONE;
}

/*
 * Classify rec[index]'s contiguity with both neighbours:
 * LEFT, RIGHT, both (LEFTRIGHT), or NONE.
 */
static enum ocfs2_ref_rec_contig
ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb,
			  int index)
{
	enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE;

	if (index < le16_to_cpu(rb->rf_records.rl_used) - 1)
		ret = ocfs2_refcount_rec_adjacent(rb, index);

	if (index > 0) {
		enum ocfs2_ref_rec_contig tmp;

		tmp = ocfs2_refcount_rec_adjacent(rb, index - 1);

		if (tmp == REF_CONTIG_RIGHT) {
			if (ret == REF_CONTIG_RIGHT)
				ret = REF_CONTIG_LEFTRIGHT;
			else
				ret = REF_CONTIG_LEFT;
		}
	}

	return ret;
}

/*
 * Absorb rec[index + 1] into rec[index] (which must have the same
 * refcount), shift the remaining records down, and shrink rl_used.
 */
static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb,
					   int index)
{
	BUG_ON(rb->rf_records.rl_recs[index].r_refcount !=
	       rb->rf_records.rl_recs[index+1].r_refcount);

	le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters,
		     le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters));

	if (index < le16_to_cpu(rb->rf_records.rl_used) - 2)
		memmove(&rb->rf_records.rl_recs[index + 1],
			&rb->rf_records.rl_recs[index + 2],
			sizeof(struct ocfs2_refcount_rec) *
			(le16_to_cpu(rb->rf_records.rl_used) - index - 2));

	memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1],
	       0, sizeof(struct ocfs2_refcount_rec));
	le16_add_cpu(&rb->rf_records.rl_used, -1);
}

/*
 * Merge the refcount rec if we are contiguous with the adjacent recs.
 */
static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb,
				     int index)
{
	enum ocfs2_ref_rec_contig contig =
				ocfs2_refcount_rec_contig(rb, index);

	if (contig == REF_CONTIG_NONE)
		return;

	if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) {
		BUG_ON(index == 0);
		index--;
	}

	ocfs2_rotate_refcount_rec_left(rb, index);

	/* LEFTRIGHT merges twice: left neighbour, then right. */
	if (contig == REF_CONTIG_LEFTRIGHT)
		ocfs2_rotate_refcount_rec_left(rb, index);
}

/*
 * Change the refcount indexed by "index" in ref_bh.
 * If refcount reaches 0, remove it.
1236 */ 1237 static int ocfs2_change_refcount_rec(handle_t *handle, 1238 struct ocfs2_caching_info *ci, 1239 struct buffer_head *ref_leaf_bh, 1240 int index, int merge, int change) 1241 { 1242 int ret; 1243 struct ocfs2_refcount_block *rb = 1244 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 1245 struct ocfs2_refcount_list *rl = &rb->rf_records; 1246 struct ocfs2_refcount_rec *rec = &rl->rl_recs[index]; 1247 1248 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, 1249 OCFS2_JOURNAL_ACCESS_WRITE); 1250 if (ret) { 1251 mlog_errno(ret); 1252 goto out; 1253 } 1254 1255 mlog(0, "change index %d, old count %u, change %d\n", index, 1256 le32_to_cpu(rec->r_refcount), change); 1257 le32_add_cpu(&rec->r_refcount, change); 1258 1259 if (!rec->r_refcount) { 1260 if (index != le16_to_cpu(rl->rl_used) - 1) { 1261 memmove(rec, rec + 1, 1262 (le16_to_cpu(rl->rl_used) - index - 1) * 1263 sizeof(struct ocfs2_refcount_rec)); 1264 memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1], 1265 0, sizeof(struct ocfs2_refcount_rec)); 1266 } 1267 1268 le16_add_cpu(&rl->rl_used, -1); 1269 } else if (merge) 1270 ocfs2_refcount_rec_merge(rb, index); 1271 1272 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1273 if (ret) 1274 mlog_errno(ret); 1275 out: 1276 return ret; 1277 } 1278 1279 static int ocfs2_expand_inline_ref_root(handle_t *handle, 1280 struct ocfs2_caching_info *ci, 1281 struct buffer_head *ref_root_bh, 1282 struct buffer_head **ref_leaf_bh, 1283 struct ocfs2_alloc_context *meta_ac) 1284 { 1285 int ret; 1286 u16 suballoc_bit_start; 1287 u32 num_got; 1288 u64 blkno; 1289 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1290 struct buffer_head *new_bh = NULL; 1291 struct ocfs2_refcount_block *new_rb; 1292 struct ocfs2_refcount_block *root_rb = 1293 (struct ocfs2_refcount_block *)ref_root_bh->b_data; 1294 1295 ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, 1296 OCFS2_JOURNAL_ACCESS_WRITE); 1297 if (ret) { 1298 mlog_errno(ret); 1299 goto out; 1300 } 1301 1302 ret 
= ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, 1303 &suballoc_bit_start, &num_got, 1304 &blkno); 1305 if (ret) { 1306 mlog_errno(ret); 1307 goto out; 1308 } 1309 1310 new_bh = sb_getblk(sb, blkno); 1311 if (new_bh == NULL) { 1312 ret = -EIO; 1313 mlog_errno(ret); 1314 goto out; 1315 } 1316 ocfs2_set_new_buffer_uptodate(ci, new_bh); 1317 1318 ret = ocfs2_journal_access_rb(handle, ci, new_bh, 1319 OCFS2_JOURNAL_ACCESS_CREATE); 1320 if (ret) { 1321 mlog_errno(ret); 1322 goto out; 1323 } 1324 1325 /* 1326 * Initialize ocfs2_refcount_block. 1327 * It should contain the same information as the old root. 1328 * so just memcpy it and change the corresponding field. 1329 */ 1330 memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize); 1331 1332 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1333 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); 1334 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1335 new_rb->rf_blkno = cpu_to_le64(blkno); 1336 new_rb->rf_cpos = cpu_to_le32(0); 1337 new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr); 1338 new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL); 1339 ocfs2_journal_dirty(handle, new_bh); 1340 1341 /* Now change the root. 
*/ 1342 memset(&root_rb->rf_list, 0, sb->s_blocksize - 1343 offsetof(struct ocfs2_refcount_block, rf_list)); 1344 root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb)); 1345 root_rb->rf_clusters = cpu_to_le32(1); 1346 root_rb->rf_list.l_next_free_rec = cpu_to_le16(1); 1347 root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno); 1348 root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1); 1349 root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL); 1350 1351 ocfs2_journal_dirty(handle, ref_root_bh); 1352 1353 mlog(0, "new leaf block %llu, used %u\n", (unsigned long long)blkno, 1354 le16_to_cpu(new_rb->rf_records.rl_used)); 1355 1356 *ref_leaf_bh = new_bh; 1357 new_bh = NULL; 1358 out: 1359 brelse(new_bh); 1360 return ret; 1361 } 1362 1363 static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev, 1364 struct ocfs2_refcount_rec *next) 1365 { 1366 if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <= 1367 ocfs2_get_ref_rec_low_cpos(next)) 1368 return 1; 1369 1370 return 0; 1371 } 1372 1373 static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b) 1374 { 1375 const struct ocfs2_refcount_rec *l = a, *r = b; 1376 u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l); 1377 u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r); 1378 1379 if (l_cpos > r_cpos) 1380 return 1; 1381 if (l_cpos < r_cpos) 1382 return -1; 1383 return 0; 1384 } 1385 1386 static int cmp_refcount_rec_by_cpos(const void *a, const void *b) 1387 { 1388 const struct ocfs2_refcount_rec *l = a, *r = b; 1389 u64 l_cpos = le64_to_cpu(l->r_cpos); 1390 u64 r_cpos = le64_to_cpu(r->r_cpos); 1391 1392 if (l_cpos > r_cpos) 1393 return 1; 1394 if (l_cpos < r_cpos) 1395 return -1; 1396 return 0; 1397 } 1398 1399 static void swap_refcount_rec(void *a, void *b, int size) 1400 { 1401 struct ocfs2_refcount_rec *l = a, *r = b, tmp; 1402 1403 tmp = *(struct ocfs2_refcount_rec *)l; 1404 *(struct ocfs2_refcount_rec *)l = 1405 *(struct ocfs2_refcount_rec *)r; 1406 *(struct 
ocfs2_refcount_rec *)r = tmp; 1407 } 1408 1409 /* 1410 * The refcount cpos are ordered by their 64bit cpos, 1411 * But we will use the low 32 bit to be the e_cpos in the b-tree. 1412 * So we need to make sure that this pos isn't intersected with others. 1413 * 1414 * Note: The refcount block is already sorted by their low 32 bit cpos, 1415 * So just try the middle pos first, and we will exit when we find 1416 * the good position. 1417 */ 1418 static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl, 1419 u32 *split_pos, int *split_index) 1420 { 1421 int num_used = le16_to_cpu(rl->rl_used); 1422 int delta, middle = num_used / 2; 1423 1424 for (delta = 0; delta < middle; delta++) { 1425 /* Let's check delta earlier than middle */ 1426 if (ocfs2_refcount_rec_no_intersect( 1427 &rl->rl_recs[middle - delta - 1], 1428 &rl->rl_recs[middle - delta])) { 1429 *split_index = middle - delta; 1430 break; 1431 } 1432 1433 /* For even counts, don't walk off the end */ 1434 if ((middle + delta + 1) == num_used) 1435 continue; 1436 1437 /* Now try delta past middle */ 1438 if (ocfs2_refcount_rec_no_intersect( 1439 &rl->rl_recs[middle + delta], 1440 &rl->rl_recs[middle + delta + 1])) { 1441 *split_index = middle + delta + 1; 1442 break; 1443 } 1444 } 1445 1446 if (delta >= middle) 1447 return -ENOSPC; 1448 1449 *split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]); 1450 return 0; 1451 } 1452 1453 static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, 1454 struct buffer_head *new_bh, 1455 u32 *split_cpos) 1456 { 1457 int split_index = 0, num_moved, ret; 1458 u32 cpos = 0; 1459 struct ocfs2_refcount_block *rb = 1460 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 1461 struct ocfs2_refcount_list *rl = &rb->rf_records; 1462 struct ocfs2_refcount_block *new_rb = 1463 (struct ocfs2_refcount_block *)new_bh->b_data; 1464 struct ocfs2_refcount_list *new_rl = &new_rb->rf_records; 1465 1466 mlog(0, "split old leaf refcount block %llu, 
count = %u, used = %u\n", 1467 (unsigned long long)ref_leaf_bh->b_blocknr, 1468 le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used)); 1469 1470 /* 1471 * XXX: Improvement later. 1472 * If we know all the high 32 bit cpos is the same, no need to sort. 1473 * 1474 * In order to make the whole process safe, we do: 1475 * 1. sort the entries by their low 32 bit cpos first so that we can 1476 * find the split cpos easily. 1477 * 2. call ocfs2_insert_extent to insert the new refcount block. 1478 * 3. move the refcount rec to the new block. 1479 * 4. sort the entries by their 64 bit cpos. 1480 * 5. dirty the new_rb and rb. 1481 */ 1482 sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), 1483 sizeof(struct ocfs2_refcount_rec), 1484 cmp_refcount_rec_by_low_cpos, swap_refcount_rec); 1485 1486 ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index); 1487 if (ret) { 1488 mlog_errno(ret); 1489 return ret; 1490 } 1491 1492 new_rb->rf_cpos = cpu_to_le32(cpos); 1493 1494 /* move refcount records starting from split_index to the new block. */ 1495 num_moved = le16_to_cpu(rl->rl_used) - split_index; 1496 memcpy(new_rl->rl_recs, &rl->rl_recs[split_index], 1497 num_moved * sizeof(struct ocfs2_refcount_rec)); 1498 1499 /*ok, remove the entries we just moved over to the other block. */ 1500 memset(&rl->rl_recs[split_index], 0, 1501 num_moved * sizeof(struct ocfs2_refcount_rec)); 1502 1503 /* change old and new rl_used accordingly. 
	 */
	le16_add_cpu(&rl->rl_used, -num_moved);
	new_rl->rl_used = cpu_to_le16(num_moved);

	/* Restore 64-bit cpos ordering in both blocks after the move. */
	sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
	     sizeof(struct ocfs2_refcount_rec),
	     cmp_refcount_rec_by_cpos, swap_refcount_rec);

	sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used),
	     sizeof(struct ocfs2_refcount_rec),
	     cmp_refcount_rec_by_cpos, swap_refcount_rec);

	*split_cpos = cpos;
	return 0;
}

/*
 * Allocate and initialize a brand-new leaf refcount block, split half of
 * ref_leaf_bh's records into it, and insert it into the refcount b-tree
 * rooted at ref_root_bh at the split cpos.
 */
static int ocfs2_new_leaf_refcount_block(handle_t *handle,
					 struct ocfs2_caching_info *ci,
					 struct buffer_head *ref_root_bh,
					 struct buffer_head *ref_leaf_bh,
					 struct ocfs2_alloc_context *meta_ac)
{
	int ret;
	u16 suballoc_bit_start;
	u32 num_got, new_cpos;
	u64 blkno;
	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
	struct ocfs2_refcount_block *root_rb =
		(struct ocfs2_refcount_block *)ref_root_bh->b_data;
	struct buffer_head *new_bh = NULL;
	struct ocfs2_refcount_block *new_rb;
	struct ocfs2_extent_tree ref_et;

	/* The root must already be in b-tree (non-inline) form here. */
	BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL));

	ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1,
				   &suballoc_bit_start, &num_got,
				   &blkno);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	new_bh = sb_getblk(sb, blkno);
	if (new_bh == NULL) {
		ret = -EIO;
		mlog_errno(ret);
		goto out;
	}
	ocfs2_set_new_buffer_uptodate(ci, new_bh);

	ret = ocfs2_journal_access_rb(handle, ci, new_bh,
				      OCFS2_JOURNAL_ACCESS_CREATE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Initialize ocfs2_refcount_block. */
	new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
	memset(new_rb, 0, sb->s_blocksize);
	strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
	new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num);
	new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
	new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
	new_rb->rf_blkno = cpu_to_le64(blkno);
	new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
	new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
	new_rb->rf_records.rl_count =
				cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
	new_rb->rf_generation = root_rb->rf_generation;

	ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ocfs2_journal_dirty(handle, ref_leaf_bh);
	ocfs2_journal_dirty(handle, new_bh);

	ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);

	mlog(0, "insert new leaf block %llu at %u\n",
	     (unsigned long long)new_bh->b_blocknr, new_cpos);

	/* Insert the new leaf block with the specific offset cpos. */
	ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
				  1, 0, meta_ac);
	if (ret)
		mlog_errno(ret);

out:
	brelse(new_bh);
	return ret;
}

/*
 * Grow the refcount tree so a full leaf (or inline root) gains room:
 * expand an inline root into a b-tree if needed, then add a new leaf.
 */
static int ocfs2_expand_refcount_tree(handle_t *handle,
				      struct ocfs2_caching_info *ci,
				      struct buffer_head *ref_root_bh,
				      struct buffer_head *ref_leaf_bh,
				      struct ocfs2_alloc_context *meta_ac)
{
	int ret;
	struct buffer_head *expand_bh = NULL;

	if (ref_root_bh == ref_leaf_bh) {
		/*
		 * the old root bh hasn't been expanded to a b-tree,
		 * so expand it first.
		 */
		ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh,
						   &expand_bh, meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	} else {
		/* Root is already a b-tree; split the given leaf. */
		expand_bh = ref_leaf_bh;
		get_bh(expand_bh);
	}


	/* Now add a new refcount block into the tree.*/
	ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh,
					    expand_bh, meta_ac);
	if (ret)
		mlog_errno(ret);
out:
	brelse(expand_bh);
	return ret;
}

/*
 * Adjust the extent rec in b-tree representing ref_leaf_bh.
 *
 * Only called when we have inserted a new refcount rec at index 0
 * which means ocfs2_extent_rec.e_cpos may need some change.
 */
static int ocfs2_adjust_refcount_rec(handle_t *handle,
				     struct ocfs2_caching_info *ci,
				     struct buffer_head *ref_root_bh,
				     struct buffer_head *ref_leaf_bh,
				     struct ocfs2_refcount_rec *rec)
{
	int ret = 0, i;
	u32 new_cpos, old_cpos;
	struct ocfs2_path *path = NULL;
	struct ocfs2_extent_tree et;
	struct ocfs2_refcount_block *rb =
		(struct ocfs2_refcount_block *)ref_root_bh->b_data;
	struct ocfs2_extent_list *el;

	/* Inline root has no extent recs to adjust - nothing to do. */
	if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL))
		goto out;

	rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
	old_cpos = le32_to_cpu(rb->rf_cpos);
	new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
	/* Only a smaller starting cpos forces an e_cpos update. */
	if (old_cpos <= new_cpos)
		goto out;

	ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);

	path = ocfs2_new_path_from_et(&et);
	if (!path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_path(ci, path, old_cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * 2 more credits, one for the leaf refcount block, one for
	 * the extent block contains the extent rec.
	 */
	ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path),
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

	/* change the leaf extent block first. */
	el = path_leaf_el(path);

	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++)
		if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos)
			break;

	BUG_ON(i == le16_to_cpu(el->l_next_free_rec));

	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);

	/* change the r_cpos in the leaf block. */
	rb->rf_cpos = cpu_to_le32(new_cpos);

	ocfs2_journal_dirty(handle, path_leaf_bh(path));
	ocfs2_journal_dirty(handle, ref_leaf_bh);

out:
	ocfs2_free_path(path);
	return ret;
}

/*
 * Insert *rec at "index" in the leaf ref_leaf_bh, expanding the tree
 * first if the leaf is full. If merge is set, try to coalesce with
 * neighbouring records afterwards; an insert at index 0 may also require
 * fixing up the b-tree's e_cpos via ocfs2_adjust_refcount_rec().
 */
static int ocfs2_insert_refcount_rec(handle_t *handle,
				     struct ocfs2_caching_info *ci,
				     struct buffer_head *ref_root_bh,
				     struct buffer_head *ref_leaf_bh,
				     struct ocfs2_refcount_rec *rec,
				     int index, int merge,
				     struct ocfs2_alloc_context *meta_ac)
{
	int ret;
	struct ocfs2_refcount_block *rb =
		(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
	struct ocfs2_refcount_list *rf_list = &rb->rf_records;
	struct buffer_head *new_bh = NULL;

	BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);

	if (rf_list->rl_used == rf_list->rl_count) {
		/* Leaf is full: expand, then re-find the target leaf. */
		u64 cpos = le64_to_cpu(rec->r_cpos);
		u32 len = le32_to_cpu(rec->r_clusters);

		ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
						 ref_leaf_bh, meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
					     cpos, len, NULL, &index,
					     &new_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ref_leaf_bh = new_bh;
		rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
		rf_list = &rb->rf_records;
	}

	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Open a slot at "index" by shifting later records up by one. */
	if (index < le16_to_cpu(rf_list->rl_used))
		memmove(&rf_list->rl_recs[index + 1],
			&rf_list->rl_recs[index],
			(le16_to_cpu(rf_list->rl_used) - index) *
			 sizeof(struct ocfs2_refcount_rec));

	mlog(0, "insert refcount record start %llu, len %u, count %u "
	     "to leaf block %llu at index %d\n",
	     (unsigned long long)le64_to_cpu(rec->r_cpos),
	     le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount),
	     (unsigned long long)ref_leaf_bh->b_blocknr, index);

	rf_list->rl_recs[index] = *rec;

	le16_add_cpu(&rf_list->rl_used, 1);

	if (merge)
		ocfs2_refcount_rec_merge(rb, index);

	ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (index == 0) {
		ret = ocfs2_adjust_refcount_rec(handle, ci,
						ref_root_bh,
						ref_leaf_bh, rec);
		if (ret)
			mlog_errno(ret);
	}
out:
	brelse(new_bh);
	return ret;
}

/*
 * Split the refcount_rec indexed by "index" in ref_leaf_bh.
 * This is much simple than our b-tree code.
 * split_rec is the new refcount rec we want to insert.
 * If split_rec->r_refcount > 0, we are changing the refcount(in case we
 * increase refcount or decrease a refcount to non-zero).
 * If split_rec->r_refcount == 0, we are punching a hole in current refcount
 * rec( in case we decrease a refcount to zero).
 */
static int ocfs2_split_refcount_rec(handle_t *handle,
				    struct ocfs2_caching_info *ci,
				    struct buffer_head *ref_root_bh,
				    struct buffer_head *ref_leaf_bh,
				    struct ocfs2_refcount_rec *split_rec,
				    int index, int merge,
				    struct ocfs2_alloc_context *meta_ac,
				    struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret, recs_need;
	u32 len;
	struct ocfs2_refcount_block *rb =
		(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
	struct ocfs2_refcount_list *rf_list = &rb->rf_records;
	struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index];
	struct ocfs2_refcount_rec *tail_rec = NULL;
	struct buffer_head *new_bh = NULL;

	BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);

	mlog(0, "original r_pos %llu, cluster %u, split %llu, cluster %u\n",
	     le64_to_cpu(orig_rec->r_cpos), le32_to_cpu(orig_rec->r_clusters),
	     le64_to_cpu(split_rec->r_cpos),
	     le32_to_cpu(split_rec->r_clusters));

	/*
	 * If we just need to split the header or tail clusters,
	 * no more recs are needed, just split is OK.
	 * Otherwise we at least need one new recs.
	 */
	if (!split_rec->r_refcount &&
	    (split_rec->r_cpos == orig_rec->r_cpos ||
	     le64_to_cpu(split_rec->r_cpos) +
	     le32_to_cpu(split_rec->r_clusters) ==
	     le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
		recs_need = 0;
	else
		recs_need = 1;

	/*
	 * We need one more rec if we split in the middle and the new rec have
	 * some refcount in it.
	 */
	if (split_rec->r_refcount &&
	    (split_rec->r_cpos != orig_rec->r_cpos &&
	     le64_to_cpu(split_rec->r_cpos) +
	     le32_to_cpu(split_rec->r_clusters) !=
	     le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
		recs_need++;

	/* If the leaf block don't have enough record, expand it. */
	if (le16_to_cpu(rf_list->rl_used) + recs_need >
	    le16_to_cpu(rf_list->rl_count)) {
		struct ocfs2_refcount_rec tmp_rec;
		u64 cpos = le64_to_cpu(orig_rec->r_cpos);
		len = le32_to_cpu(orig_rec->r_clusters);
		ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
						 ref_leaf_bh, meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * We have to re-get it since now cpos may be moved to
		 * another leaf block.
		 */
		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
					     cpos, len, &tmp_rec, &index,
					     &new_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ref_leaf_bh = new_bh;
		rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
		rf_list = &rb->rf_records;
		orig_rec = &rf_list->rl_recs[index];
	}

	ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * We have calculated out how many new records we need and store
	 * in recs_need, so spare enough space first by moving the records
	 * after "index" to the end.
	 */
	if (index != le16_to_cpu(rf_list->rl_used) - 1)
		memmove(&rf_list->rl_recs[index + 1 + recs_need],
			&rf_list->rl_recs[index + 1],
			(le16_to_cpu(rf_list->rl_used) - index - 1) *
			sizeof(struct ocfs2_refcount_rec));

	/* len = clusters of orig_rec remaining after split_rec's range. */
	len = (le64_to_cpu(orig_rec->r_cpos) +
	      le32_to_cpu(orig_rec->r_clusters)) -
	      (le64_to_cpu(split_rec->r_cpos) +
	      le32_to_cpu(split_rec->r_clusters));

	/*
	 * If we have "len", the we will split in the tail and move it
	 * to the end of the space we have just spared.
	 */
	if (len) {
		tail_rec = &rf_list->rl_recs[index + recs_need];

		memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
		le64_add_cpu(&tail_rec->r_cpos,
			     le32_to_cpu(tail_rec->r_clusters) - len);
		tail_rec->r_clusters = cpu_to_le32(len);
	}

	/*
	 * If the split pos isn't the same as the original one, we need to
	 * split in the head.
	 *
	 * Note: We have the chance that split_rec.r_refcount = 0,
	 * recs_need = 0 and len > 0, which means we just cut the head from
	 * the orig_rec and in that case we have done some modification in
	 * orig_rec above, so the check for r_cpos is faked.
	 */
	if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) {
		len = le64_to_cpu(split_rec->r_cpos) -
		      le64_to_cpu(orig_rec->r_cpos);
		orig_rec->r_clusters = cpu_to_le32(len);
		index++;
	}

	le16_add_cpu(&rf_list->rl_used, recs_need);

	if (split_rec->r_refcount) {
		rf_list->rl_recs[index] = *split_rec;
		mlog(0, "insert refcount record start %llu, len %u, count %u "
		     "to leaf block %llu at index %d\n",
		     (unsigned long long)le64_to_cpu(split_rec->r_cpos),
		     le32_to_cpu(split_rec->r_clusters),
		     le32_to_cpu(split_rec->r_refcount),
		     (unsigned long long)ref_leaf_bh->b_blocknr, index);

		if (merge)
			ocfs2_refcount_rec_merge(rb, index);
	}

	ret = ocfs2_journal_dirty(handle, ref_leaf_bh);
	if (ret)
		mlog_errno(ret);

out:
	brelse(new_bh);
	return ret;
}

/*
 * Walk (cpos, len) one refcount record at a time and bump each covered
 * record's refcount by 1, inserting new records for holes and splitting
 * partially-covered records as needed.
 */
static int __ocfs2_increase_refcount(handle_t *handle,
				     struct ocfs2_caching_info *ci,
				     struct buffer_head *ref_root_bh,
				     u64 cpos, u32 len, int merge,
				     struct ocfs2_alloc_context *meta_ac,
				     struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret = 0, index;
	struct buffer_head *ref_leaf_bh = NULL;
	struct ocfs2_refcount_rec rec;
	unsigned int set_len = 0;

	mlog(0, "Tree owner %llu, add refcount start %llu, len %u\n",
	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
	     (unsigned long long)cpos, len);

	while (len) {
		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
					     cpos, len, &rec, &index,
					     &ref_leaf_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		set_len = le32_to_cpu(rec.r_clusters);

		/*
		 * Here we may meet with 3 situations:
		 *
		 * 1. If we find an already existing record, and the length
		 *    is the same, cool, we just need to increase the r_refcount
		 *    and it is OK.
		 * 2. If we find a hole, just insert it with r_refcount = 1.
		 * 3. If we are in the middle of one extent record, split
		 *    it.
		 */
		if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
		    set_len <= len) {
			mlog(0, "increase refcount rec, start %llu, len %u, "
			     "count %u\n", (unsigned long long)cpos, set_len,
			     le32_to_cpu(rec.r_refcount));
			ret = ocfs2_change_refcount_rec(handle, ci,
							ref_leaf_bh, index,
							merge, 1);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		} else if (!rec.r_refcount) {
			rec.r_refcount = cpu_to_le32(1);

			mlog(0, "insert refcount rec, start %llu, len %u\n",
			     (unsigned long long)le64_to_cpu(rec.r_cpos),
			     set_len);
			ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
							ref_leaf_bh,
							&rec, index,
							merge, meta_ac);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		} else {
			set_len = min((u64)(cpos + len),
				      le64_to_cpu(rec.r_cpos) + set_len) - cpos;
			rec.r_cpos = cpu_to_le64(cpos);
			rec.r_clusters = cpu_to_le32(set_len);
			le32_add_cpu(&rec.r_refcount, 1);

			mlog(0, "split refcount rec, start %llu, "
			     "len %u, count %u\n",
			     (unsigned long long)le64_to_cpu(rec.r_cpos),
			     set_len, le32_to_cpu(rec.r_refcount));
			ret = ocfs2_split_refcount_rec(handle, ci,
						       ref_root_bh, ref_leaf_bh,
						       &rec, index, merge,
						       meta_ac, dealloc);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}

		/* Advance past the range handled in this iteration. */
		cpos += set_len;
		len -= set_len;
		brelse(ref_leaf_bh);
		ref_leaf_bh = NULL;
	}

out:
	brelse(ref_leaf_bh);
	return ret;
}

/*
 * Remove an empty leaf refcount block from the refcount b-tree: drop its
 * extent rec, schedule the block itself for dealloc, and shrink (or reset
 * to inline form) the root.  Caller guarantees the leaf has no records.
 */
static int ocfs2_remove_refcount_extent(handle_t *handle,
					struct ocfs2_caching_info *ci,
					struct buffer_head *ref_root_bh,
					struct buffer_head *ref_leaf_bh,
					struct ocfs2_alloc_context *meta_ac,
					struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret;
	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
	struct ocfs2_refcount_block *rb =
		(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
	struct ocfs2_extent_tree et;

	BUG_ON(rb->rf_records.rl_used);

	ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
	ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
				  1, meta_ac, dealloc);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ocfs2_remove_from_cache(ci, ref_leaf_bh);

	/*
	 * add the freed block to the dealloc so that it will be freed
	 * when we run dealloc.
	 */
	ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
					le16_to_cpu(rb->rf_suballoc_slot),
					le64_to_cpu(rb->rf_blkno),
					le16_to_cpu(rb->rf_suballoc_bit));
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;

	le32_add_cpu(&rb->rf_clusters, -1);

	/*
	 * check whether we need to restore the root refcount block if
	 * there is no leaf extent block at all.
	 */
	if (!rb->rf_list.l_next_free_rec) {
		BUG_ON(rb->rf_clusters);

		mlog(0, "reset refcount tree root %llu to be a record block.\n",
		     (unsigned long long)ref_root_bh->b_blocknr);

		/* Convert the root back to an inline record list. */
		rb->rf_flags = 0;
		rb->rf_parent = 0;
		rb->rf_cpos = 0;
		memset(&rb->rf_records, 0, sb->s_blocksize -
		       offsetof(struct ocfs2_refcount_block, rf_records));
		rb->rf_records.rl_count =
				cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
	}

	ocfs2_journal_dirty(handle, ref_root_bh);

out:
	return ret;
}

/* Public wrapper: increase refcounts for (cpos, len) with merging on. */
int ocfs2_increase_refcount(handle_t *handle,
			    struct ocfs2_caching_info *ci,
			    struct buffer_head *ref_root_bh,
			    u64 cpos, u32 len,
			    struct ocfs2_alloc_context *meta_ac,
			    struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	return __ocfs2_increase_refcount(handle, ci, ref_root_bh,
					 cpos, len, 1,
					 meta_ac, dealloc);
}

/*
 * Decrease by one the refcount of the record at "index" over the
 * sub-range (cpos, len), splitting the record when the range only
 * partially covers it.  Removes the leaf from the tree if it ends up
 * with no records at all.
 */
static int ocfs2_decrease_refcount_rec(handle_t *handle,
				struct ocfs2_caching_info *ci,
				struct buffer_head *ref_root_bh,
				struct buffer_head *ref_leaf_bh,
				int index, u64 cpos, unsigned int len,
				struct ocfs2_alloc_context *meta_ac,
				struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret;
	struct ocfs2_refcount_block *rb =
		(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
	struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];

	/* (cpos, len) must lie entirely within the record. */
	BUG_ON(cpos < le64_to_cpu(rec->r_cpos));
	BUG_ON(cpos + len >
	       le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));

	if (cpos == le64_to_cpu(rec->r_cpos) &&
	    len == le32_to_cpu(rec->r_clusters))
		ret = ocfs2_change_refcount_rec(handle, ci,
						ref_leaf_bh, index, 1, -1);
	else {
		struct ocfs2_refcount_rec split = *rec;
		split.r_cpos = cpu_to_le64(cpos);
		split.r_clusters = cpu_to_le32(len);

		le32_add_cpu(&split.r_refcount, -1);

		mlog(0, "split refcount rec, start %llu, "
		     "len %u, count %u, original start %llu, len %u\n",
		     (unsigned long long)le64_to_cpu(split.r_cpos),
		     len, le32_to_cpu(split.r_refcount),
		     (unsigned long long)le64_to_cpu(rec->r_cpos),
		     le32_to_cpu(rec->r_clusters));
		ret = ocfs2_split_refcount_rec(handle, ci,
					       ref_root_bh, ref_leaf_bh,
					       &split, index, 1,
					       meta_ac, dealloc);
	}

	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Remove the leaf refcount block if it contains no refcount record. */
	if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) {
		ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh,
						   ref_leaf_bh, meta_ac,
						   dealloc);
		if (ret)
			mlog_errno(ret);
	}

out:
	return ret;
}

/*
 * Walk (cpos, len) one record at a time and decrease each covered
 * refcount by one.  With "delete" set, clusters whose count drops to
 * zero are queued for deallocation.
 */
static int __ocfs2_decrease_refcount(handle_t *handle,
				     struct ocfs2_caching_info *ci,
				     struct buffer_head *ref_root_bh,
				     u64 cpos, u32 len,
				     struct ocfs2_alloc_context *meta_ac,
				     struct ocfs2_cached_dealloc_ctxt *dealloc,
				     int delete)
{
	int ret = 0, index = 0;
	struct ocfs2_refcount_rec rec;
	unsigned int r_count = 0, r_len;
	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
	struct buffer_head *ref_leaf_bh = NULL;

	mlog(0, "Tree owner %llu, decrease refcount start %llu, "
	     "len %u, delete %u\n",
	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
	     (unsigned long long)cpos, len, delete);

	while (len) {
		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
					     cpos, len, &rec, &index,
					     &ref_leaf_bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		r_count = le32_to_cpu(rec.r_refcount);
		BUG_ON(r_count == 0);
		/* A non-delete decrease must only ever see count == 1. */
		if (!delete)
			BUG_ON(r_count > 1);

		r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) +
			    le32_to_cpu(rec.r_clusters)) - cpos;

		ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh,
						  ref_leaf_bh, index,
						  cpos, r_len,
						  meta_ac, dealloc);
		if
(ret) { 2270 mlog_errno(ret); 2271 goto out; 2272 } 2273 2274 if (le32_to_cpu(rec.r_refcount) == 1 && delete) { 2275 ret = ocfs2_cache_cluster_dealloc(dealloc, 2276 ocfs2_clusters_to_blocks(sb, cpos), 2277 r_len); 2278 if (ret) { 2279 mlog_errno(ret); 2280 goto out; 2281 } 2282 } 2283 2284 cpos += r_len; 2285 len -= r_len; 2286 brelse(ref_leaf_bh); 2287 ref_leaf_bh = NULL; 2288 } 2289 2290 out: 2291 brelse(ref_leaf_bh); 2292 return ret; 2293 } 2294 2295 /* Caller must hold refcount tree lock. */ 2296 int ocfs2_decrease_refcount(struct inode *inode, 2297 handle_t *handle, u32 cpos, u32 len, 2298 struct ocfs2_alloc_context *meta_ac, 2299 struct ocfs2_cached_dealloc_ctxt *dealloc, 2300 int delete) 2301 { 2302 int ret; 2303 u64 ref_blkno; 2304 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2305 struct buffer_head *ref_root_bh = NULL; 2306 struct ocfs2_refcount_tree *tree; 2307 2308 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 2309 2310 ret = ocfs2_get_refcount_block(inode, &ref_blkno); 2311 if (ret) { 2312 mlog_errno(ret); 2313 goto out; 2314 } 2315 2316 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree); 2317 if (ret) { 2318 mlog_errno(ret); 2319 goto out; 2320 } 2321 2322 ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno, 2323 &ref_root_bh); 2324 if (ret) { 2325 mlog_errno(ret); 2326 goto out; 2327 } 2328 2329 ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh, 2330 cpos, len, meta_ac, dealloc, delete); 2331 if (ret) 2332 mlog_errno(ret); 2333 out: 2334 brelse(ref_root_bh); 2335 return ret; 2336 } 2337 2338 /* 2339 * Mark the already-existing extent at cpos as refcounted for len clusters. 2340 * This adds the refcount extent flag. 2341 * 2342 * If the existing extent is larger than the request, initiate a 2343 * split. An attempt will be made at merging with adjacent extents. 2344 * 2345 * The caller is responsible for passing down meta_ac if we'll need it. 
2346 */ 2347 static int ocfs2_mark_extent_refcounted(struct inode *inode, 2348 struct ocfs2_extent_tree *et, 2349 handle_t *handle, u32 cpos, 2350 u32 len, u32 phys, 2351 struct ocfs2_alloc_context *meta_ac, 2352 struct ocfs2_cached_dealloc_ctxt *dealloc) 2353 { 2354 int ret; 2355 2356 mlog(0, "Inode %lu refcount tree cpos %u, len %u, phys cluster %u\n", 2357 inode->i_ino, cpos, len, phys); 2358 2359 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { 2360 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " 2361 "tree, but the feature bit is not set in the " 2362 "super block.", inode->i_ino); 2363 ret = -EROFS; 2364 goto out; 2365 } 2366 2367 ret = ocfs2_change_extent_flag(handle, et, cpos, 2368 len, phys, meta_ac, dealloc, 2369 OCFS2_EXT_REFCOUNTED, 0); 2370 if (ret) 2371 mlog_errno(ret); 2372 2373 out: 2374 return ret; 2375 } 2376 2377 /* 2378 * Given some contiguous physical clusters, calculate what we need 2379 * for modifying their refcount. 2380 */ 2381 static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, 2382 struct ocfs2_caching_info *ci, 2383 struct buffer_head *ref_root_bh, 2384 u64 start_cpos, 2385 u32 clusters, 2386 int *meta_add, 2387 int *credits) 2388 { 2389 int ret = 0, index, ref_blocks = 0, recs_add = 0; 2390 u64 cpos = start_cpos; 2391 struct ocfs2_refcount_block *rb; 2392 struct ocfs2_refcount_rec rec; 2393 struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL; 2394 u32 len; 2395 2396 mlog(0, "start_cpos %llu, clusters %u\n", 2397 (unsigned long long)start_cpos, clusters); 2398 while (clusters) { 2399 ret = ocfs2_get_refcount_rec(ci, ref_root_bh, 2400 cpos, clusters, &rec, 2401 &index, &ref_leaf_bh); 2402 if (ret) { 2403 mlog_errno(ret); 2404 goto out; 2405 } 2406 2407 if (ref_leaf_bh != prev_bh) { 2408 /* 2409 * Now we encounter a new leaf block, so calculate 2410 * whether we need to extend the old leaf. 
2411 */ 2412 if (prev_bh) { 2413 rb = (struct ocfs2_refcount_block *) 2414 prev_bh->b_data; 2415 2416 if (le64_to_cpu(rb->rf_records.rl_used) + 2417 recs_add > 2418 le16_to_cpu(rb->rf_records.rl_count)) 2419 ref_blocks++; 2420 } 2421 2422 recs_add = 0; 2423 *credits += 1; 2424 brelse(prev_bh); 2425 prev_bh = ref_leaf_bh; 2426 get_bh(prev_bh); 2427 } 2428 2429 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 2430 2431 mlog(0, "recs_add %d,cpos %llu, clusters %u, rec->r_cpos %llu," 2432 "rec->r_clusters %u, rec->r_refcount %u, index %d\n", 2433 recs_add, (unsigned long long)cpos, clusters, 2434 (unsigned long long)le64_to_cpu(rec.r_cpos), 2435 le32_to_cpu(rec.r_clusters), 2436 le32_to_cpu(rec.r_refcount), index); 2437 2438 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) + 2439 le32_to_cpu(rec.r_clusters)) - cpos; 2440 /* 2441 * If the refcount rec already exist, cool. We just need 2442 * to check whether there is a split. Otherwise we just need 2443 * to increase the refcount. 2444 * If we will insert one, increases recs_add. 2445 * 2446 * We record all the records which will be inserted to the 2447 * same refcount block, so that we can tell exactly whether 2448 * we need a new refcount block or not. 2449 */ 2450 if (rec.r_refcount) { 2451 /* Check whether we need a split at the beginning. */ 2452 if (cpos == start_cpos && 2453 cpos != le64_to_cpu(rec.r_cpos)) 2454 recs_add++; 2455 2456 /* Check whether we need a split in the end. 
*/ 2457 if (cpos + clusters < le64_to_cpu(rec.r_cpos) + 2458 le32_to_cpu(rec.r_clusters)) 2459 recs_add++; 2460 } else 2461 recs_add++; 2462 2463 brelse(ref_leaf_bh); 2464 ref_leaf_bh = NULL; 2465 clusters -= len; 2466 cpos += len; 2467 } 2468 2469 if (prev_bh) { 2470 rb = (struct ocfs2_refcount_block *)prev_bh->b_data; 2471 2472 if (le64_to_cpu(rb->rf_records.rl_used) + recs_add > 2473 le16_to_cpu(rb->rf_records.rl_count)) 2474 ref_blocks++; 2475 2476 *credits += 1; 2477 } 2478 2479 if (!ref_blocks) 2480 goto out; 2481 2482 mlog(0, "we need ref_blocks %d\n", ref_blocks); 2483 *meta_add += ref_blocks; 2484 *credits += ref_blocks; 2485 2486 /* 2487 * So we may need ref_blocks to insert into the tree. 2488 * That also means we need to change the b-tree and add that number 2489 * of records since we never merge them. 2490 * We need one more block for expansion since the new created leaf 2491 * block is also full and needs split. 2492 */ 2493 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; 2494 if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) { 2495 struct ocfs2_extent_tree et; 2496 2497 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); 2498 *meta_add += ocfs2_extend_meta_needed(et.et_root_el); 2499 *credits += ocfs2_calc_extend_credits(sb, 2500 et.et_root_el, 2501 ref_blocks); 2502 } else { 2503 *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; 2504 *meta_add += 1; 2505 } 2506 2507 out: 2508 brelse(ref_leaf_bh); 2509 brelse(prev_bh); 2510 return ret; 2511 } 2512 2513 /* 2514 * For refcount tree, we will decrease some contiguous clusters 2515 * refcount count, so just go through it to see how many blocks 2516 * we gonna touch and whether we need to create new blocks. 2517 * 2518 * Normally the refcount blocks store these refcount should be 2519 * contiguous also, so that we can get the number easily. 2520 * As for meta_ac, we will at most add split 2 refcount record and 2521 * 2 more refcount block, so just check it in a rough way. 
2522 * 2523 * Caller must hold refcount tree lock. 2524 */ 2525 int ocfs2_prepare_refcount_change_for_del(struct inode *inode, 2526 struct buffer_head *di_bh, 2527 u64 phys_blkno, 2528 u32 clusters, 2529 int *credits, 2530 struct ocfs2_alloc_context **meta_ac) 2531 { 2532 int ret, ref_blocks = 0; 2533 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 2534 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2535 struct buffer_head *ref_root_bh = NULL; 2536 struct ocfs2_refcount_tree *tree; 2537 u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); 2538 2539 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { 2540 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " 2541 "tree, but the feature bit is not set in the " 2542 "super block.", inode->i_ino); 2543 ret = -EROFS; 2544 goto out; 2545 } 2546 2547 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 2548 2549 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), 2550 le64_to_cpu(di->i_refcount_loc), &tree); 2551 if (ret) { 2552 mlog_errno(ret); 2553 goto out; 2554 } 2555 2556 ret = ocfs2_read_refcount_block(&tree->rf_ci, 2557 le64_to_cpu(di->i_refcount_loc), 2558 &ref_root_bh); 2559 if (ret) { 2560 mlog_errno(ret); 2561 goto out; 2562 } 2563 2564 ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, 2565 &tree->rf_ci, 2566 ref_root_bh, 2567 start_cpos, clusters, 2568 &ref_blocks, credits); 2569 if (ret) { 2570 mlog_errno(ret); 2571 goto out; 2572 } 2573 2574 mlog(0, "reserve new metadata %d, credits = %d\n", 2575 ref_blocks, *credits); 2576 2577 if (ref_blocks) { 2578 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), 2579 ref_blocks, meta_ac); 2580 if (ret) 2581 mlog_errno(ret); 2582 } 2583 2584 out: 2585 brelse(ref_root_bh); 2586 return ret; 2587 } 2588 2589 #define MAX_CONTIG_BYTES 1048576 2590 2591 static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb) 2592 { 2593 return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES); 2594 } 2595 2596 static inline 
unsigned int ocfs2_cow_contig_mask(struct super_block *sb) 2597 { 2598 return ~(ocfs2_cow_contig_clusters(sb) - 1); 2599 } 2600 2601 /* 2602 * Given an extent that starts at 'start' and an I/O that starts at 'cpos', 2603 * find an offset (start + (n * contig_clusters)) that is closest to cpos 2604 * while still being less than or equal to it. 2605 * 2606 * The goal is to break the extent at a multiple of contig_clusters. 2607 */ 2608 static inline unsigned int ocfs2_cow_align_start(struct super_block *sb, 2609 unsigned int start, 2610 unsigned int cpos) 2611 { 2612 BUG_ON(start > cpos); 2613 2614 return start + ((cpos - start) & ocfs2_cow_contig_mask(sb)); 2615 } 2616 2617 /* 2618 * Given a cluster count of len, pad it out so that it is a multiple 2619 * of contig_clusters. 2620 */ 2621 static inline unsigned int ocfs2_cow_align_length(struct super_block *sb, 2622 unsigned int len) 2623 { 2624 unsigned int padded = 2625 (len + (ocfs2_cow_contig_clusters(sb) - 1)) & 2626 ocfs2_cow_contig_mask(sb); 2627 2628 /* Did we wrap? */ 2629 if (padded < len) 2630 padded = UINT_MAX; 2631 2632 return padded; 2633 } 2634 2635 /* 2636 * Calculate out the start and number of virtual clusters we need to to CoW. 2637 * 2638 * cpos is vitual start cluster position we want to do CoW in a 2639 * file and write_len is the cluster length. 2640 * max_cpos is the place where we want to stop CoW intentionally. 2641 * 2642 * Normal we will start CoW from the beginning of extent record cotaining cpos. 2643 * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we 2644 * get good I/O from the resulting extent tree. 
2645 */ 2646 static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, 2647 struct ocfs2_extent_list *el, 2648 u32 cpos, 2649 u32 write_len, 2650 u32 max_cpos, 2651 u32 *cow_start, 2652 u32 *cow_len) 2653 { 2654 int ret = 0; 2655 int tree_height = le16_to_cpu(el->l_tree_depth), i; 2656 struct buffer_head *eb_bh = NULL; 2657 struct ocfs2_extent_block *eb = NULL; 2658 struct ocfs2_extent_rec *rec; 2659 unsigned int want_clusters, rec_end = 0; 2660 int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb); 2661 int leaf_clusters; 2662 2663 BUG_ON(cpos + write_len > max_cpos); 2664 2665 if (tree_height > 0) { 2666 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh); 2667 if (ret) { 2668 mlog_errno(ret); 2669 goto out; 2670 } 2671 2672 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 2673 el = &eb->h_list; 2674 2675 if (el->l_tree_depth) { 2676 ocfs2_error(inode->i_sb, 2677 "Inode %lu has non zero tree depth in " 2678 "leaf block %llu\n", inode->i_ino, 2679 (unsigned long long)eb_bh->b_blocknr); 2680 ret = -EROFS; 2681 goto out; 2682 } 2683 } 2684 2685 *cow_len = 0; 2686 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 2687 rec = &el->l_recs[i]; 2688 2689 if (ocfs2_is_empty_extent(rec)) { 2690 mlog_bug_on_msg(i != 0, "Inode %lu has empty record in " 2691 "index %d\n", inode->i_ino, i); 2692 continue; 2693 } 2694 2695 if (le32_to_cpu(rec->e_cpos) + 2696 le16_to_cpu(rec->e_leaf_clusters) <= cpos) 2697 continue; 2698 2699 if (*cow_len == 0) { 2700 /* 2701 * We should find a refcounted record in the 2702 * first pass. 2703 */ 2704 BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED)); 2705 *cow_start = le32_to_cpu(rec->e_cpos); 2706 } 2707 2708 /* 2709 * If we encounter a hole, a non-refcounted record or 2710 * pass the max_cpos, stop the search. 
2711 */ 2712 if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) || 2713 (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) || 2714 (max_cpos <= le32_to_cpu(rec->e_cpos))) 2715 break; 2716 2717 leaf_clusters = le16_to_cpu(rec->e_leaf_clusters); 2718 rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters; 2719 if (rec_end > max_cpos) { 2720 rec_end = max_cpos; 2721 leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos); 2722 } 2723 2724 /* 2725 * How many clusters do we actually need from 2726 * this extent? First we see how many we actually 2727 * need to complete the write. If that's smaller 2728 * than contig_clusters, we try for contig_clusters. 2729 */ 2730 if (!*cow_len) 2731 want_clusters = write_len; 2732 else 2733 want_clusters = (cpos + write_len) - 2734 (*cow_start + *cow_len); 2735 if (want_clusters < contig_clusters) 2736 want_clusters = contig_clusters; 2737 2738 /* 2739 * If the write does not cover the whole extent, we 2740 * need to calculate how we're going to split the extent. 2741 * We try to do it on contig_clusters boundaries. 2742 * 2743 * Any extent smaller than contig_clusters will be 2744 * CoWed in its entirety. 2745 */ 2746 if (leaf_clusters <= contig_clusters) 2747 *cow_len += leaf_clusters; 2748 else if (*cow_len || (*cow_start == cpos)) { 2749 /* 2750 * This extent needs to be CoW'd from its 2751 * beginning, so all we have to do is compute 2752 * how many clusters to grab. We align 2753 * want_clusters to the edge of contig_clusters 2754 * to get better I/O. 2755 */ 2756 want_clusters = ocfs2_cow_align_length(inode->i_sb, 2757 want_clusters); 2758 2759 if (leaf_clusters < want_clusters) 2760 *cow_len += leaf_clusters; 2761 else 2762 *cow_len += want_clusters; 2763 } else if ((*cow_start + contig_clusters) >= 2764 (cpos + write_len)) { 2765 /* 2766 * Breaking off contig_clusters at the front 2767 * of the extent will cover our write. That's 2768 * easy. 
2769 */ 2770 *cow_len = contig_clusters; 2771 } else if ((rec_end - cpos) <= contig_clusters) { 2772 /* 2773 * Breaking off contig_clusters at the tail of 2774 * this extent will cover cpos. 2775 */ 2776 *cow_start = rec_end - contig_clusters; 2777 *cow_len = contig_clusters; 2778 } else if ((rec_end - cpos) <= want_clusters) { 2779 /* 2780 * While we can't fit the entire write in this 2781 * extent, we know that the write goes from cpos 2782 * to the end of the extent. Break that off. 2783 * We try to break it at some multiple of 2784 * contig_clusters from the front of the extent. 2785 * Failing that (ie, cpos is within 2786 * contig_clusters of the front), we'll CoW the 2787 * entire extent. 2788 */ 2789 *cow_start = ocfs2_cow_align_start(inode->i_sb, 2790 *cow_start, cpos); 2791 *cow_len = rec_end - *cow_start; 2792 } else { 2793 /* 2794 * Ok, the entire write lives in the middle of 2795 * this extent. Let's try to slice the extent up 2796 * nicely. Optimally, our CoW region starts at 2797 * m*contig_clusters from the beginning of the 2798 * extent and goes for n*contig_clusters, 2799 * covering the entire write. 2800 */ 2801 *cow_start = ocfs2_cow_align_start(inode->i_sb, 2802 *cow_start, cpos); 2803 2804 want_clusters = (cpos + write_len) - *cow_start; 2805 want_clusters = ocfs2_cow_align_length(inode->i_sb, 2806 want_clusters); 2807 if (*cow_start + want_clusters <= rec_end) 2808 *cow_len = want_clusters; 2809 else 2810 *cow_len = rec_end - *cow_start; 2811 } 2812 2813 /* Have we covered our entire write yet? */ 2814 if ((*cow_start + *cow_len) >= (cpos + write_len)) 2815 break; 2816 2817 /* 2818 * If we reach the end of the extent block and don't get enough 2819 * clusters, continue with the next extent block if possible. 
2820 */ 2821 if (i + 1 == le16_to_cpu(el->l_next_free_rec) && 2822 eb && eb->h_next_leaf_blk) { 2823 brelse(eb_bh); 2824 eb_bh = NULL; 2825 2826 ret = ocfs2_read_extent_block(INODE_CACHE(inode), 2827 le64_to_cpu(eb->h_next_leaf_blk), 2828 &eb_bh); 2829 if (ret) { 2830 mlog_errno(ret); 2831 goto out; 2832 } 2833 2834 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 2835 el = &eb->h_list; 2836 i = -1; 2837 } 2838 } 2839 2840 out: 2841 brelse(eb_bh); 2842 return ret; 2843 } 2844 2845 /* 2846 * Prepare meta_ac, data_ac and calculate credits when we want to add some 2847 * num_clusters in data_tree "et" and change the refcount for the old 2848 * clusters(starting form p_cluster) in the refcount tree. 2849 * 2850 * Note: 2851 * 1. since we may split the old tree, so we at most will need num_clusters + 2 2852 * more new leaf records. 2853 * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so 2854 * just give data_ac = NULL. 2855 */ 2856 static int ocfs2_lock_refcount_allocators(struct super_block *sb, 2857 u32 p_cluster, u32 num_clusters, 2858 struct ocfs2_extent_tree *et, 2859 struct ocfs2_caching_info *ref_ci, 2860 struct buffer_head *ref_root_bh, 2861 struct ocfs2_alloc_context **meta_ac, 2862 struct ocfs2_alloc_context **data_ac, 2863 int *credits) 2864 { 2865 int ret = 0, meta_add = 0; 2866 int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et); 2867 2868 if (num_free_extents < 0) { 2869 ret = num_free_extents; 2870 mlog_errno(ret); 2871 goto out; 2872 } 2873 2874 if (num_free_extents < num_clusters + 2) 2875 meta_add = 2876 ocfs2_extend_meta_needed(et->et_root_el); 2877 2878 *credits += ocfs2_calc_extend_credits(sb, et->et_root_el, 2879 num_clusters + 2); 2880 2881 ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh, 2882 p_cluster, num_clusters, 2883 &meta_add, credits); 2884 if (ret) { 2885 mlog_errno(ret); 2886 goto out; 2887 } 2888 2889 mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n", 2890 meta_add, 
num_clusters, *credits); 2891 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add, 2892 meta_ac); 2893 if (ret) { 2894 mlog_errno(ret); 2895 goto out; 2896 } 2897 2898 if (data_ac) { 2899 ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters, 2900 data_ac); 2901 if (ret) 2902 mlog_errno(ret); 2903 } 2904 2905 out: 2906 if (ret) { 2907 if (*meta_ac) { 2908 ocfs2_free_alloc_context(*meta_ac); 2909 *meta_ac = NULL; 2910 } 2911 } 2912 2913 return ret; 2914 } 2915 2916 static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh) 2917 { 2918 BUG_ON(buffer_dirty(bh)); 2919 2920 clear_buffer_mapped(bh); 2921 2922 return 0; 2923 } 2924 2925 static int ocfs2_duplicate_clusters_by_page(handle_t *handle, 2926 struct ocfs2_cow_context *context, 2927 u32 cpos, u32 old_cluster, 2928 u32 new_cluster, u32 new_len) 2929 { 2930 int ret = 0, partial; 2931 struct ocfs2_caching_info *ci = context->data_et.et_ci; 2932 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 2933 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 2934 struct page *page; 2935 pgoff_t page_index; 2936 unsigned int from, to; 2937 loff_t offset, end, map_end; 2938 struct address_space *mapping = context->inode->i_mapping; 2939 2940 mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster, 2941 new_cluster, new_len, cpos); 2942 2943 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; 2944 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); 2945 2946 while (offset < end) { 2947 page_index = offset >> PAGE_CACHE_SHIFT; 2948 map_end = (page_index + 1) << PAGE_CACHE_SHIFT; 2949 if (map_end > end) 2950 map_end = end; 2951 2952 /* from, to is the offset within the page. */ 2953 from = offset & (PAGE_CACHE_SIZE - 1); 2954 to = PAGE_CACHE_SIZE; 2955 if (map_end & (PAGE_CACHE_SIZE - 1)) 2956 to = map_end & (PAGE_CACHE_SIZE - 1); 2957 2958 page = grab_cache_page(mapping, page_index); 2959 2960 /* This page can't be dirtied before we CoW it out. 
*/ 2961 BUG_ON(PageDirty(page)); 2962 2963 if (!PageUptodate(page)) { 2964 ret = block_read_full_page(page, ocfs2_get_block); 2965 if (ret) { 2966 mlog_errno(ret); 2967 goto unlock; 2968 } 2969 lock_page(page); 2970 } 2971 2972 if (page_has_buffers(page)) { 2973 ret = walk_page_buffers(handle, page_buffers(page), 2974 from, to, &partial, 2975 ocfs2_clear_cow_buffer); 2976 if (ret) { 2977 mlog_errno(ret); 2978 goto unlock; 2979 } 2980 } 2981 2982 ocfs2_map_and_dirty_page(context->inode, 2983 handle, from, to, 2984 page, 0, &new_block); 2985 mark_page_accessed(page); 2986 unlock: 2987 unlock_page(page); 2988 page_cache_release(page); 2989 page = NULL; 2990 offset = map_end; 2991 if (ret) 2992 break; 2993 } 2994 2995 return ret; 2996 } 2997 2998 static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle, 2999 struct ocfs2_cow_context *context, 3000 u32 cpos, u32 old_cluster, 3001 u32 new_cluster, u32 new_len) 3002 { 3003 int ret = 0; 3004 struct super_block *sb = context->inode->i_sb; 3005 struct ocfs2_caching_info *ci = context->data_et.et_ci; 3006 int i, blocks = ocfs2_clusters_to_blocks(sb, new_len); 3007 u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster); 3008 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 3009 struct ocfs2_super *osb = OCFS2_SB(sb); 3010 struct buffer_head *old_bh = NULL; 3011 struct buffer_head *new_bh = NULL; 3012 3013 mlog(0, "old_cluster %u, new %u, len %u\n", old_cluster, 3014 new_cluster, new_len); 3015 3016 for (i = 0; i < blocks; i++, old_block++, new_block++) { 3017 new_bh = sb_getblk(osb->sb, new_block); 3018 if (new_bh == NULL) { 3019 ret = -EIO; 3020 mlog_errno(ret); 3021 break; 3022 } 3023 3024 ocfs2_set_new_buffer_uptodate(ci, new_bh); 3025 3026 ret = ocfs2_read_block(ci, old_block, &old_bh, NULL); 3027 if (ret) { 3028 mlog_errno(ret); 3029 break; 3030 } 3031 3032 ret = ocfs2_journal_access(handle, ci, new_bh, 3033 OCFS2_JOURNAL_ACCESS_CREATE); 3034 if (ret) { 3035 mlog_errno(ret); 3036 break; 3037 } 3038 3039 
memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize); 3040 ret = ocfs2_journal_dirty(handle, new_bh); 3041 if (ret) { 3042 mlog_errno(ret); 3043 break; 3044 } 3045 3046 brelse(new_bh); 3047 brelse(old_bh); 3048 new_bh = NULL; 3049 old_bh = NULL; 3050 } 3051 3052 brelse(new_bh); 3053 brelse(old_bh); 3054 return ret; 3055 } 3056 3057 static int ocfs2_clear_ext_refcount(handle_t *handle, 3058 struct ocfs2_extent_tree *et, 3059 u32 cpos, u32 p_cluster, u32 len, 3060 unsigned int ext_flags, 3061 struct ocfs2_alloc_context *meta_ac, 3062 struct ocfs2_cached_dealloc_ctxt *dealloc) 3063 { 3064 int ret, index; 3065 struct ocfs2_extent_rec replace_rec; 3066 struct ocfs2_path *path = NULL; 3067 struct ocfs2_extent_list *el; 3068 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); 3069 u64 ino = ocfs2_metadata_cache_owner(et->et_ci); 3070 3071 mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n", 3072 (unsigned long long)ino, cpos, len, p_cluster, ext_flags); 3073 3074 memset(&replace_rec, 0, sizeof(replace_rec)); 3075 replace_rec.e_cpos = cpu_to_le32(cpos); 3076 replace_rec.e_leaf_clusters = cpu_to_le16(len); 3077 replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb, 3078 p_cluster)); 3079 replace_rec.e_flags = ext_flags; 3080 replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED; 3081 3082 path = ocfs2_new_path_from_et(et); 3083 if (!path) { 3084 ret = -ENOMEM; 3085 mlog_errno(ret); 3086 goto out; 3087 } 3088 3089 ret = ocfs2_find_path(et->et_ci, path, cpos); 3090 if (ret) { 3091 mlog_errno(ret); 3092 goto out; 3093 } 3094 3095 el = path_leaf_el(path); 3096 3097 index = ocfs2_search_extent_list(el, cpos); 3098 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 3099 ocfs2_error(sb, 3100 "Inode %llu has an extent at cpos %u which can no " 3101 "longer be found.\n", 3102 (unsigned long long)ino, cpos); 3103 ret = -EROFS; 3104 goto out; 3105 } 3106 3107 ret = ocfs2_split_extent(handle, et, path, index, 3108 &replace_rec, meta_ac, 
dealloc); 3109 if (ret) 3110 mlog_errno(ret); 3111 3112 out: 3113 ocfs2_free_path(path); 3114 return ret; 3115 } 3116 3117 static int ocfs2_replace_clusters(handle_t *handle, 3118 struct ocfs2_cow_context *context, 3119 u32 cpos, u32 old, 3120 u32 new, u32 len, 3121 unsigned int ext_flags) 3122 { 3123 int ret; 3124 struct ocfs2_caching_info *ci = context->data_et.et_ci; 3125 u64 ino = ocfs2_metadata_cache_owner(ci); 3126 3127 mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n", 3128 (unsigned long long)ino, cpos, old, new, len, ext_flags); 3129 3130 /*If the old clusters is unwritten, no need to duplicate. */ 3131 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { 3132 ret = context->cow_duplicate_clusters(handle, context, cpos, 3133 old, new, len); 3134 if (ret) { 3135 mlog_errno(ret); 3136 goto out; 3137 } 3138 } 3139 3140 ret = ocfs2_clear_ext_refcount(handle, &context->data_et, 3141 cpos, new, len, ext_flags, 3142 context->meta_ac, &context->dealloc); 3143 if (ret) 3144 mlog_errno(ret); 3145 out: 3146 return ret; 3147 } 3148 3149 static int ocfs2_cow_sync_writeback(struct super_block *sb, 3150 struct ocfs2_cow_context *context, 3151 u32 cpos, u32 num_clusters) 3152 { 3153 int ret = 0; 3154 loff_t offset, end, map_end; 3155 pgoff_t page_index; 3156 struct page *page; 3157 3158 if (ocfs2_should_order_data(context->inode)) 3159 return 0; 3160 3161 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; 3162 end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); 3163 3164 ret = filemap_fdatawrite_range(context->inode->i_mapping, 3165 offset, end - 1); 3166 if (ret < 0) { 3167 mlog_errno(ret); 3168 return ret; 3169 } 3170 3171 while (offset < end) { 3172 page_index = offset >> PAGE_CACHE_SHIFT; 3173 map_end = (page_index + 1) << PAGE_CACHE_SHIFT; 3174 if (map_end > end) 3175 map_end = end; 3176 3177 page = grab_cache_page(context->inode->i_mapping, page_index); 3178 BUG_ON(!page); 3179 3180 wait_on_page_writeback(page); 3181 if 
(PageError(page)) { 3182 ret = -EIO; 3183 mlog_errno(ret); 3184 } else 3185 mark_page_accessed(page); 3186 3187 unlock_page(page); 3188 page_cache_release(page); 3189 page = NULL; 3190 offset = map_end; 3191 if (ret) 3192 break; 3193 } 3194 3195 return ret; 3196 } 3197 3198 static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context, 3199 u32 v_cluster, u32 *p_cluster, 3200 u32 *num_clusters, 3201 unsigned int *extent_flags) 3202 { 3203 return ocfs2_get_clusters(context->inode, v_cluster, p_cluster, 3204 num_clusters, extent_flags); 3205 } 3206 3207 static int ocfs2_make_clusters_writable(struct super_block *sb, 3208 struct ocfs2_cow_context *context, 3209 u32 cpos, u32 p_cluster, 3210 u32 num_clusters, unsigned int e_flags) 3211 { 3212 int ret, delete, index, credits = 0; 3213 u32 new_bit, new_len; 3214 unsigned int set_len; 3215 struct ocfs2_super *osb = OCFS2_SB(sb); 3216 handle_t *handle; 3217 struct buffer_head *ref_leaf_bh = NULL; 3218 struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci; 3219 struct ocfs2_refcount_rec rec; 3220 3221 mlog(0, "cpos %u, p_cluster %u, num_clusters %u, e_flags %u\n", 3222 cpos, p_cluster, num_clusters, e_flags); 3223 3224 ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters, 3225 &context->data_et, 3226 ref_ci, 3227 context->ref_root_bh, 3228 &context->meta_ac, 3229 &context->data_ac, &credits); 3230 if (ret) { 3231 mlog_errno(ret); 3232 return ret; 3233 } 3234 3235 if (context->post_refcount) 3236 credits += context->post_refcount->credits; 3237 3238 credits += context->extra_credits; 3239 handle = ocfs2_start_trans(osb, credits); 3240 if (IS_ERR(handle)) { 3241 ret = PTR_ERR(handle); 3242 mlog_errno(ret); 3243 goto out; 3244 } 3245 3246 while (num_clusters) { 3247 ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh, 3248 p_cluster, num_clusters, 3249 &rec, &index, &ref_leaf_bh); 3250 if (ret) { 3251 mlog_errno(ret); 3252 goto out_commit; 3253 } 3254 3255 BUG_ON(!rec.r_refcount); 3256 set_len = 
min((u64)p_cluster + num_clusters, 3257 le64_to_cpu(rec.r_cpos) + 3258 le32_to_cpu(rec.r_clusters)) - p_cluster; 3259 3260 /* 3261 * There are many different situation here. 3262 * 1. If refcount == 1, remove the flag and don't COW. 3263 * 2. If refcount > 1, allocate clusters. 3264 * Here we may not allocate r_len once at a time, so continue 3265 * until we reach num_clusters. 3266 */ 3267 if (le32_to_cpu(rec.r_refcount) == 1) { 3268 delete = 0; 3269 ret = ocfs2_clear_ext_refcount(handle, 3270 &context->data_et, 3271 cpos, p_cluster, 3272 set_len, e_flags, 3273 context->meta_ac, 3274 &context->dealloc); 3275 if (ret) { 3276 mlog_errno(ret); 3277 goto out_commit; 3278 } 3279 } else { 3280 delete = 1; 3281 3282 ret = __ocfs2_claim_clusters(osb, handle, 3283 context->data_ac, 3284 1, set_len, 3285 &new_bit, &new_len); 3286 if (ret) { 3287 mlog_errno(ret); 3288 goto out_commit; 3289 } 3290 3291 ret = ocfs2_replace_clusters(handle, context, 3292 cpos, p_cluster, new_bit, 3293 new_len, e_flags); 3294 if (ret) { 3295 mlog_errno(ret); 3296 goto out_commit; 3297 } 3298 set_len = new_len; 3299 } 3300 3301 ret = __ocfs2_decrease_refcount(handle, ref_ci, 3302 context->ref_root_bh, 3303 p_cluster, set_len, 3304 context->meta_ac, 3305 &context->dealloc, delete); 3306 if (ret) { 3307 mlog_errno(ret); 3308 goto out_commit; 3309 } 3310 3311 cpos += set_len; 3312 p_cluster += set_len; 3313 num_clusters -= set_len; 3314 brelse(ref_leaf_bh); 3315 ref_leaf_bh = NULL; 3316 } 3317 3318 /* handle any post_cow action. */ 3319 if (context->post_refcount && context->post_refcount->func) { 3320 ret = context->post_refcount->func(context->inode, handle, 3321 context->post_refcount->para); 3322 if (ret) { 3323 mlog_errno(ret); 3324 goto out_commit; 3325 } 3326 } 3327 3328 /* 3329 * Here we should write the new page out first if we are 3330 * in write-back mode. 
3331 */ 3332 if (context->get_clusters == ocfs2_di_get_clusters) { 3333 ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters); 3334 if (ret) 3335 mlog_errno(ret); 3336 } 3337 3338 out_commit: 3339 ocfs2_commit_trans(osb, handle); 3340 3341 out: 3342 if (context->data_ac) { 3343 ocfs2_free_alloc_context(context->data_ac); 3344 context->data_ac = NULL; 3345 } 3346 if (context->meta_ac) { 3347 ocfs2_free_alloc_context(context->meta_ac); 3348 context->meta_ac = NULL; 3349 } 3350 brelse(ref_leaf_bh); 3351 3352 return ret; 3353 } 3354 3355 static int ocfs2_replace_cow(struct ocfs2_cow_context *context) 3356 { 3357 int ret = 0; 3358 struct inode *inode = context->inode; 3359 u32 cow_start = context->cow_start, cow_len = context->cow_len; 3360 u32 p_cluster, num_clusters; 3361 unsigned int ext_flags; 3362 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3363 3364 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { 3365 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " 3366 "tree, but the feature bit is not set in the " 3367 "super block.", inode->i_ino); 3368 return -EROFS; 3369 } 3370 3371 ocfs2_init_dealloc_ctxt(&context->dealloc); 3372 3373 while (cow_len) { 3374 ret = context->get_clusters(context, cow_start, &p_cluster, 3375 &num_clusters, &ext_flags); 3376 if (ret) { 3377 mlog_errno(ret); 3378 break; 3379 } 3380 3381 BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED)); 3382 3383 if (cow_len < num_clusters) 3384 num_clusters = cow_len; 3385 3386 ret = ocfs2_make_clusters_writable(inode->i_sb, context, 3387 cow_start, p_cluster, 3388 num_clusters, ext_flags); 3389 if (ret) { 3390 mlog_errno(ret); 3391 break; 3392 } 3393 3394 cow_len -= num_clusters; 3395 cow_start += num_clusters; 3396 } 3397 3398 if (ocfs2_dealloc_has_cluster(&context->dealloc)) { 3399 ocfs2_schedule_truncate_log_flush(osb, 1); 3400 ocfs2_run_deallocs(osb, &context->dealloc); 3401 } 3402 3403 return ret; 3404 } 3405 3406 /* 3407 * Starting at cpos, try to CoW write_len clusters. 
 *	Don't CoW
 * past max_cpos. This will stop when it runs into a hole or an
 * unrefcounted extent.
 */
static int ocfs2_refcount_cow_hunk(struct inode *inode,
				   struct buffer_head *di_bh,
				   u32 cpos, u32 write_len, u32 max_cpos)
{
	int ret;
	u32 cow_start = 0, cow_len = 0;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct buffer_head *ref_root_bh = NULL;
	struct ocfs2_refcount_tree *ref_tree;
	struct ocfs2_cow_context *context = NULL;

	BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));

	/* Work out the full CoW range covering this write. */
	ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
					      cpos, write_len, max_cpos,
					      &cow_start, &cow_len);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, "
	     "cow_len %u\n", inode->i_ino,
	     cpos, write_len, cow_start, cow_len);

	BUG_ON(cow_len == 0);

	context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
	if (!context) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
				       1, &ref_tree, &ref_root_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	context->inode = inode;
	context->cow_start = cow_start;
	context->cow_len = cow_len;
	context->ref_tree = ref_tree;
	context->ref_root_bh = ref_root_bh;
	/* Regular file data is duplicated through the page cache. */
	context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
	context->get_clusters = ocfs2_di_get_clusters;

	ocfs2_init_dinode_extent_tree(&context->data_et,
				      INODE_CACHE(inode), di_bh);

	ret = ocfs2_replace_cow(context);
	if (ret)
		mlog_errno(ret);

	/*
	 * truncate the extent map here since no matter whether we meet with
	 * any error during the action, we shouldn't trust cached extent map
	 * any more.
	 */
	ocfs2_extent_map_trunc(inode, cow_start);

	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
	brelse(ref_root_bh);
out:
	kfree(context);
	return ret;
}

/*
 * CoW any and all clusters between cpos and cpos+write_len.
 * Don't CoW past max_cpos. If this returns successfully, all
 * clusters between cpos and cpos+write_len are safe to modify.
 */
int ocfs2_refcount_cow(struct inode *inode,
		       struct buffer_head *di_bh,
		       u32 cpos, u32 write_len, u32 max_cpos)
{
	int ret = 0;
	u32 p_cluster, num_clusters;
	unsigned int ext_flags;

	while (write_len) {
		ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
					 &num_clusters, &ext_flags);
		if (ret) {
			mlog_errno(ret);
			break;
		}

		if (write_len < num_clusters)
			num_clusters = write_len;

		/* Only refcounted extents need the CoW treatment. */
		if (ext_flags & OCFS2_EXT_REFCOUNTED) {
			ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
						      num_clusters, max_cpos);
			if (ret) {
				mlog_errno(ret);
				break;
			}
		}

		write_len -= num_clusters;
		cpos += num_clusters;
	}

	return ret;
}

/*
 * get_clusters callback for xattr-value CoW: map a virtual cluster of
 * the xattr value tree (context->cow_object) to its physical location.
 */
static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context,
					  u32 v_cluster, u32 *p_cluster,
					  u32 *num_clusters,
					  unsigned int *extent_flags)
{
	struct inode *inode = context->inode;
	struct ocfs2_xattr_value_root *xv = context->cow_object;

	return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster,
					num_clusters, &xv->xr_list,
					extent_flags);
}

/*
 * Given a xattr value root, calculate the most meta/credits we need for
 * refcount tree change if we truncate it to 0.
3539 */ 3540 int ocfs2_refcounted_xattr_delete_need(struct inode *inode, 3541 struct ocfs2_caching_info *ref_ci, 3542 struct buffer_head *ref_root_bh, 3543 struct ocfs2_xattr_value_root *xv, 3544 int *meta_add, int *credits) 3545 { 3546 int ret = 0, index, ref_blocks = 0; 3547 u32 p_cluster, num_clusters; 3548 u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters); 3549 struct ocfs2_refcount_block *rb; 3550 struct ocfs2_refcount_rec rec; 3551 struct buffer_head *ref_leaf_bh = NULL; 3552 3553 while (cpos < clusters) { 3554 ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, 3555 &num_clusters, &xv->xr_list, 3556 NULL); 3557 if (ret) { 3558 mlog_errno(ret); 3559 goto out; 3560 } 3561 3562 cpos += num_clusters; 3563 3564 while (num_clusters) { 3565 ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh, 3566 p_cluster, num_clusters, 3567 &rec, &index, 3568 &ref_leaf_bh); 3569 if (ret) { 3570 mlog_errno(ret); 3571 goto out; 3572 } 3573 3574 BUG_ON(!rec.r_refcount); 3575 3576 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 3577 3578 /* 3579 * We really don't know whether the other clusters is in 3580 * this refcount block or not, so just take the worst 3581 * case that all the clusters are in this block and each 3582 * one will split a refcount rec, so totally we need 3583 * clusters * 2 new refcount rec. 
3584 */ 3585 if (le64_to_cpu(rb->rf_records.rl_used) + clusters * 2 > 3586 le16_to_cpu(rb->rf_records.rl_count)) 3587 ref_blocks++; 3588 3589 *credits += 1; 3590 brelse(ref_leaf_bh); 3591 ref_leaf_bh = NULL; 3592 3593 if (num_clusters <= le32_to_cpu(rec.r_clusters)) 3594 break; 3595 else 3596 num_clusters -= le32_to_cpu(rec.r_clusters); 3597 p_cluster += num_clusters; 3598 } 3599 } 3600 3601 *meta_add += ref_blocks; 3602 if (!ref_blocks) 3603 goto out; 3604 3605 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; 3606 if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) 3607 *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; 3608 else { 3609 struct ocfs2_extent_tree et; 3610 3611 ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh); 3612 *credits += ocfs2_calc_extend_credits(inode->i_sb, 3613 et.et_root_el, 3614 ref_blocks); 3615 } 3616 3617 out: 3618 brelse(ref_leaf_bh); 3619 return ret; 3620 } 3621 3622 /* 3623 * Do CoW for xattr. 3624 */ 3625 int ocfs2_refcount_cow_xattr(struct inode *inode, 3626 struct ocfs2_dinode *di, 3627 struct ocfs2_xattr_value_buf *vb, 3628 struct ocfs2_refcount_tree *ref_tree, 3629 struct buffer_head *ref_root_bh, 3630 u32 cpos, u32 write_len, 3631 struct ocfs2_post_refcount *post) 3632 { 3633 int ret; 3634 struct ocfs2_xattr_value_root *xv = vb->vb_xv; 3635 struct ocfs2_inode_info *oi = OCFS2_I(inode); 3636 struct ocfs2_cow_context *context = NULL; 3637 u32 cow_start, cow_len; 3638 3639 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 3640 3641 ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list, 3642 cpos, write_len, UINT_MAX, 3643 &cow_start, &cow_len); 3644 if (ret) { 3645 mlog_errno(ret); 3646 goto out; 3647 } 3648 3649 BUG_ON(cow_len == 0); 3650 3651 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); 3652 if (!context) { 3653 ret = -ENOMEM; 3654 mlog_errno(ret); 3655 goto out; 3656 } 3657 3658 context->inode = inode; 3659 context->cow_start = cow_start; 3660 context->cow_len = cow_len; 3661 
context->ref_tree = ref_tree; 3662 context->ref_root_bh = ref_root_bh;; 3663 context->cow_object = xv; 3664 3665 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd; 3666 /* We need the extra credits for duplicate_clusters by jbd. */ 3667 context->extra_credits = 3668 ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len; 3669 context->get_clusters = ocfs2_xattr_value_get_clusters; 3670 context->post_refcount = post; 3671 3672 ocfs2_init_xattr_value_extent_tree(&context->data_et, 3673 INODE_CACHE(inode), vb); 3674 3675 ret = ocfs2_replace_cow(context); 3676 if (ret) 3677 mlog_errno(ret); 3678 3679 out: 3680 kfree(context); 3681 return ret; 3682 } 3683 3684 /* 3685 * Insert a new extent into refcount tree and mark a extent rec 3686 * as refcounted in the dinode tree. 3687 */ 3688 int ocfs2_add_refcount_flag(struct inode *inode, 3689 struct ocfs2_extent_tree *data_et, 3690 struct ocfs2_caching_info *ref_ci, 3691 struct buffer_head *ref_root_bh, 3692 u32 cpos, u32 p_cluster, u32 num_clusters, 3693 struct ocfs2_cached_dealloc_ctxt *dealloc, 3694 struct ocfs2_post_refcount *post) 3695 { 3696 int ret; 3697 handle_t *handle; 3698 int credits = 1, ref_blocks = 0; 3699 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3700 struct ocfs2_alloc_context *meta_ac = NULL; 3701 3702 ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, 3703 ref_ci, ref_root_bh, 3704 p_cluster, num_clusters, 3705 &ref_blocks, &credits); 3706 if (ret) { 3707 mlog_errno(ret); 3708 goto out; 3709 } 3710 3711 mlog(0, "reserve new metadata %d, credits = %d\n", 3712 ref_blocks, credits); 3713 3714 if (ref_blocks) { 3715 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), 3716 ref_blocks, &meta_ac); 3717 if (ret) { 3718 mlog_errno(ret); 3719 goto out; 3720 } 3721 } 3722 3723 if (post) 3724 credits += post->credits; 3725 3726 handle = ocfs2_start_trans(osb, credits); 3727 if (IS_ERR(handle)) { 3728 ret = PTR_ERR(handle); 3729 mlog_errno(ret); 3730 goto out; 3731 } 3732 3733 ret = 
	      ocfs2_mark_extent_refcounted(inode, data_et, handle,
					   cpos, num_clusters, p_cluster,
					   meta_ac, dealloc);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/* Bump the refcount for the newly refcounted clusters. */
	ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
					p_cluster, num_clusters, 0,
					meta_ac, dealloc);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/* Caller-supplied hook, run inside this transaction. */
	if (post && post->func) {
		ret = post->func(inode, handle, post->para);
		if (ret)
			mlog_errno(ret);
	}

out_commit:
	ocfs2_commit_trans(osb, handle);
out:
	if (meta_ac)
		ocfs2_free_alloc_context(meta_ac);
	return ret;
}

/*
 * Update the inode's ctime (in memory and on disk) in a small
 * standalone transaction.
 */
static int ocfs2_change_ctime(struct inode *inode,
			      struct buffer_head *di_bh)
{
	int ret;
	handle_t *handle;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;

	handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
				   OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	inode->i_ctime = CURRENT_TIME;
	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);

	ocfs2_journal_dirty(handle, di_bh);

out_commit:
	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
	return ret;
}

/*
 * Attach a refcount tree to the inode (creating one if needed) and mark
 * every allocated data extent, and any refcounted xattrs, as refcounted.
 */
static int ocfs2_attach_refcount_tree(struct inode *inode,
				      struct buffer_head *di_bh)
{
	int ret, data_changed = 0;
	struct buffer_head *ref_root_bh = NULL;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_refcount_tree *ref_tree;
	unsigned int ext_flags;
	loff_t size;
u32 cpos, num_clusters, clusters, p_cluster; 3809 struct ocfs2_cached_dealloc_ctxt dealloc; 3810 struct ocfs2_extent_tree di_et; 3811 3812 ocfs2_init_dealloc_ctxt(&dealloc); 3813 3814 if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) { 3815 ret = ocfs2_create_refcount_tree(inode, di_bh); 3816 if (ret) { 3817 mlog_errno(ret); 3818 goto out; 3819 } 3820 } 3821 3822 BUG_ON(!di->i_refcount_loc); 3823 ret = ocfs2_lock_refcount_tree(osb, 3824 le64_to_cpu(di->i_refcount_loc), 1, 3825 &ref_tree, &ref_root_bh); 3826 if (ret) { 3827 mlog_errno(ret); 3828 goto out; 3829 } 3830 3831 if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) 3832 goto attach_xattr; 3833 3834 ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh); 3835 3836 size = i_size_read(inode); 3837 clusters = ocfs2_clusters_for_bytes(inode->i_sb, size); 3838 3839 cpos = 0; 3840 while (cpos < clusters) { 3841 ret = ocfs2_get_clusters(inode, cpos, &p_cluster, 3842 &num_clusters, &ext_flags); 3843 3844 if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) { 3845 ret = ocfs2_add_refcount_flag(inode, &di_et, 3846 &ref_tree->rf_ci, 3847 ref_root_bh, cpos, 3848 p_cluster, num_clusters, 3849 &dealloc, NULL); 3850 if (ret) { 3851 mlog_errno(ret); 3852 goto unlock; 3853 } 3854 3855 data_changed = 1; 3856 } 3857 cpos += num_clusters; 3858 } 3859 3860 attach_xattr: 3861 if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) { 3862 ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh, 3863 &ref_tree->rf_ci, 3864 ref_root_bh, 3865 &dealloc); 3866 if (ret) { 3867 mlog_errno(ret); 3868 goto unlock; 3869 } 3870 } 3871 3872 if (data_changed) { 3873 ret = ocfs2_change_ctime(inode, di_bh); 3874 if (ret) 3875 mlog_errno(ret); 3876 } 3877 3878 unlock: 3879 ocfs2_unlock_refcount_tree(osb, ref_tree, 1); 3880 brelse(ref_root_bh); 3881 3882 if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) { 3883 ocfs2_schedule_truncate_log_flush(osb, 1); 3884 ocfs2_run_deallocs(osb, &dealloc); 3885 } 3886 out: 3887 /* 3888 * Empty the extent map so that we 
may get the right extent 3889 * record from the disk. 3890 */ 3891 ocfs2_extent_map_trunc(inode, 0); 3892 3893 return ret; 3894 } 3895 3896 static int ocfs2_add_refcounted_extent(struct inode *inode, 3897 struct ocfs2_extent_tree *et, 3898 struct ocfs2_caching_info *ref_ci, 3899 struct buffer_head *ref_root_bh, 3900 u32 cpos, u32 p_cluster, u32 num_clusters, 3901 unsigned int ext_flags, 3902 struct ocfs2_cached_dealloc_ctxt *dealloc) 3903 { 3904 int ret; 3905 handle_t *handle; 3906 int credits = 0; 3907 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3908 struct ocfs2_alloc_context *meta_ac = NULL; 3909 3910 ret = ocfs2_lock_refcount_allocators(inode->i_sb, 3911 p_cluster, num_clusters, 3912 et, ref_ci, 3913 ref_root_bh, &meta_ac, 3914 NULL, &credits); 3915 if (ret) { 3916 mlog_errno(ret); 3917 goto out; 3918 } 3919 3920 handle = ocfs2_start_trans(osb, credits); 3921 if (IS_ERR(handle)) { 3922 ret = PTR_ERR(handle); 3923 mlog_errno(ret); 3924 goto out; 3925 } 3926 3927 ret = ocfs2_insert_extent(handle, et, cpos, 3928 ocfs2_clusters_to_blocks(inode->i_sb, p_cluster), 3929 num_clusters, ext_flags, meta_ac); 3930 if (ret) { 3931 mlog_errno(ret); 3932 goto out_commit; 3933 } 3934 3935 ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh, 3936 p_cluster, num_clusters, 3937 meta_ac, dealloc); 3938 if (ret) 3939 mlog_errno(ret); 3940 3941 out_commit: 3942 ocfs2_commit_trans(osb, handle); 3943 out: 3944 if (meta_ac) 3945 ocfs2_free_alloc_context(meta_ac); 3946 return ret; 3947 } 3948 3949 static int ocfs2_duplicate_inline_data(struct inode *s_inode, 3950 struct buffer_head *s_bh, 3951 struct inode *t_inode, 3952 struct buffer_head *t_bh) 3953 { 3954 int ret; 3955 handle_t *handle; 3956 struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); 3957 struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data; 3958 struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data; 3959 3960 BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); 3961 3962 handle 
= ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 3963 if (IS_ERR(handle)) { 3964 ret = PTR_ERR(handle); 3965 mlog_errno(ret); 3966 goto out; 3967 } 3968 3969 ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh, 3970 OCFS2_JOURNAL_ACCESS_WRITE); 3971 if (ret) { 3972 mlog_errno(ret); 3973 goto out_commit; 3974 } 3975 3976 t_di->id2.i_data.id_count = s_di->id2.i_data.id_count; 3977 memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data, 3978 le16_to_cpu(s_di->id2.i_data.id_count)); 3979 spin_lock(&OCFS2_I(t_inode)->ip_lock); 3980 OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL; 3981 t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features); 3982 spin_unlock(&OCFS2_I(t_inode)->ip_lock); 3983 3984 ocfs2_journal_dirty(handle, t_bh); 3985 3986 out_commit: 3987 ocfs2_commit_trans(osb, handle); 3988 out: 3989 return ret; 3990 } 3991 3992 static int ocfs2_duplicate_extent_list(struct inode *s_inode, 3993 struct inode *t_inode, 3994 struct buffer_head *t_bh, 3995 struct ocfs2_caching_info *ref_ci, 3996 struct buffer_head *ref_root_bh, 3997 struct ocfs2_cached_dealloc_ctxt *dealloc) 3998 { 3999 int ret = 0; 4000 u32 p_cluster, num_clusters, clusters, cpos; 4001 loff_t size; 4002 unsigned int ext_flags; 4003 struct ocfs2_extent_tree et; 4004 4005 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh); 4006 4007 size = i_size_read(s_inode); 4008 clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size); 4009 4010 cpos = 0; 4011 while (cpos < clusters) { 4012 ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster, 4013 &num_clusters, &ext_flags); 4014 4015 if (p_cluster) { 4016 ret = ocfs2_add_refcounted_extent(t_inode, &et, 4017 ref_ci, ref_root_bh, 4018 cpos, p_cluster, 4019 num_clusters, 4020 ext_flags, 4021 dealloc); 4022 if (ret) { 4023 mlog_errno(ret); 4024 goto out; 4025 } 4026 } 4027 4028 cpos += num_clusters; 4029 } 4030 4031 out: 4032 return ret; 4033 } 4034 4035 /* 4036 * change the new file's attributes to the src. 
 *
 * reflink creates a snapshot of a file, that means the attributes
 * must be identical except for three exceptions - nlink, ino, and ctime.
 */
static int ocfs2_complete_reflink(struct inode *s_inode,
				  struct buffer_head *s_bh,
				  struct inode *t_inode,
				  struct buffer_head *t_bh,
				  bool preserve)
{
	int ret;
	handle_t *handle;
	struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data;
	loff_t size = i_size_read(s_inode);

	handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb),
				   OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		return ret;
	}

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/* Mirror the in-memory inode state under ip_lock. */
	spin_lock(&OCFS2_I(t_inode)->ip_lock);
	OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters;
	OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr;
	OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
	spin_unlock(&OCFS2_I(t_inode)->ip_lock);
	i_size_write(t_inode, size);

	/* Mirror the on-disk inode fields. */
	di->i_xattr_inline_size = s_di->i_xattr_inline_size;
	di->i_clusters = s_di->i_clusters;
	di->i_size = s_di->i_size;
	di->i_dyn_features = s_di->i_dyn_features;
	di->i_attr = s_di->i_attr;

	if (preserve) {
		di->i_uid = s_di->i_uid;
		di->i_gid = s_di->i_gid;
		di->i_mode = s_di->i_mode;

		/*
		 * update time.
		 * we want mtime to appear identical to the source and
		 * update ctime.
		 */
		t_inode->i_ctime = CURRENT_TIME;

		di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
		di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);

		t_inode->i_mtime = s_inode->i_mtime;
		di->i_mtime = s_di->i_mtime;
		di->i_mtime_nsec = s_di->i_mtime_nsec;
	}

	ocfs2_journal_dirty(handle, t_bh);

out_commit:
	ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle);
	return ret;
}

/*
 * Share all of s_inode's data with t_inode: point the target at the
 * source's refcount tree, then duplicate either the inline data or the
 * extent list.
 */
static int ocfs2_create_reflink_node(struct inode *s_inode,
				     struct buffer_head *s_bh,
				     struct inode *t_inode,
				     struct buffer_head *t_bh,
				     bool preserve)
{
	int ret;
	struct buffer_head *ref_root_bh = NULL;
	struct ocfs2_cached_dealloc_ctxt dealloc;
	struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
	struct ocfs2_refcount_block *rb;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
	struct ocfs2_refcount_tree *ref_tree;

	ocfs2_init_dealloc_ctxt(&dealloc);

	/* The target shares the source's refcount tree. */
	ret = ocfs2_set_refcount_tree(t_inode, t_bh,
				      le64_to_cpu(di->i_refcount_loc));
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		ret = ocfs2_duplicate_inline_data(s_inode, s_bh,
						  t_inode, t_bh);
		if (ret)
			mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
				       1, &ref_tree, &ref_root_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}
	rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;

	ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
					  &ref_tree->rf_ci, ref_root_bh,
					  &dealloc);
	if (ret) {
		mlog_errno(ret);
		goto out_unlock_refcount;
	}

out_unlock_refcount:
	ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
	brelse(ref_root_bh);
out:
	if (ocfs2_dealloc_has_cluster(&dealloc)) {
		ocfs2_schedule_truncate_log_flush(osb,
						  1);
		ocfs2_run_deallocs(osb, &dealloc);
	}

	return ret;
}

/*
 * Core of reflink: flush the source's dirty data, make sure it has a
 * refcount tree, then build the target node under the target inode's
 * cluster lock.  Caller holds the source's inode lock and alloc/xattr
 * semaphores.
 */
static int __ocfs2_reflink(struct dentry *old_dentry,
			   struct buffer_head *old_bh,
			   struct inode *new_inode,
			   bool preserve)
{
	int ret;
	struct inode *inode = old_dentry->d_inode;
	struct buffer_head *new_bh = NULL;

	/* Start writeback so shared clusters hit disk before sharing. */
	ret = filemap_fdatawrite(inode->i_mapping);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_attach_refcount_tree(inode, old_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	mutex_lock(&new_inode->i_mutex);
	ret = ocfs2_inode_lock(new_inode, &new_bh, 1);
	if (ret) {
		mlog_errno(ret);
		goto out_unlock;
	}

	ret = ocfs2_create_reflink_node(inode, old_bh,
					new_inode, new_bh, preserve);
	if (ret) {
		mlog_errno(ret);
		goto inode_unlock;
	}

	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
		ret = ocfs2_reflink_xattrs(inode, old_bh,
					   new_inode, new_bh,
					   preserve);
		if (ret) {
			mlog_errno(ret);
			goto inode_unlock;
		}
	}

	ret = ocfs2_complete_reflink(inode, old_bh,
				     new_inode, new_bh, preserve);
	if (ret)
		mlog_errno(ret);

inode_unlock:
	ocfs2_inode_unlock(new_inode, 1);
	brelse(new_bh);
out_unlock:
	mutex_unlock(&new_inode->i_mutex);
out:
	/* Wait for the writeback kicked off above to finish. */
	if (!ret) {
		ret = filemap_fdatawait(inode->i_mapping);
		if (ret)
			mlog_errno(ret);
	}
	return ret;
}

/*
 * Create the reflink target in the orphan dir first, then move it to
 * its final name once the data is fully shared.
 */
static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
			 struct dentry *new_dentry, bool preserve)
{
	int error;
	struct inode *inode = old_dentry->d_inode;
	struct buffer_head *old_bh = NULL;
	struct inode *new_orphan_inode = NULL;

	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
		return -EOPNOTSUPP;

	error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
					     &new_orphan_inode);
	if
	   (error) {
		mlog_errno(error);
		goto out;
	}

	error = ocfs2_inode_lock(inode, &old_bh, 1);
	if (error) {
		mlog_errno(error);
		goto out;
	}

	/* xattr_sem before alloc_sem, matching the rest of ocfs2. */
	down_write(&OCFS2_I(inode)->ip_xattr_sem);
	down_write(&OCFS2_I(inode)->ip_alloc_sem);
	error = __ocfs2_reflink(old_dentry, old_bh,
				new_orphan_inode, preserve);
	up_write(&OCFS2_I(inode)->ip_alloc_sem);
	up_write(&OCFS2_I(inode)->ip_xattr_sem);

	ocfs2_inode_unlock(inode, 1);
	brelse(old_bh);

	if (error) {
		mlog_errno(error);
		goto out;
	}

	/* If the security isn't preserved, we need to re-initialize them. */
	if (!preserve) {
		error = ocfs2_init_security_and_acl(dir, new_orphan_inode);
		if (error)
			mlog_errno(error);
	}
out:
	/* On success, move the new inode out of the orphan dir. */
	if (!error) {
		error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
						       new_dentry);
		if (error)
			mlog_errno(error);
	}

	if (new_orphan_inode) {
		/*
		 * We need to open_unlock the inode no matter whether we
		 * succeed or not, so that other nodes can delete it later.
		 */
		ocfs2_open_unlock(new_orphan_inode);
		if (error)
			iput(new_orphan_inode);
	}

	return error;
}

/*
 * Below here are the bits used by OCFS2_IOC_REFLINK() to fake
 * sys_reflink(). This will go away when vfs_reflink() exists in
 * fs/namei.c.
 */

/* copied from may_create in VFS. */
static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
{
	if (child->d_inode)
		return -EEXIST;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
}

/* copied from user_path_parent.
 */
static int ocfs2_user_path_parent(const char __user *path,
				  struct nameidata *nd, char **name)
{
	char *s = getname(path);
	int error;

	if (IS_ERR(s))
		return PTR_ERR(s);

	error = path_lookup(s, LOOKUP_PARENT, nd);
	if (error)
		putname(s);
	else
		*name = s;	/* caller must putname(*name) */

	return error;
}

/**
 * ocfs2_vfs_reflink - Create a reference-counted link
 *
 * @old_dentry:        source dentry + inode
 * @dir:       directory to create the target
 * @new_dentry:        target dentry
 * @preserve:  if true, preserve all file attributes
 */
static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
			     struct dentry *new_dentry, bool preserve)
{
	struct inode *inode = old_dentry->d_inode;
	int error;

	if (!inode)
		return -ENOENT;

	error = ocfs2_may_create(dir, new_dentry);
	if (error)
		return error;

	/* Reflink never crosses filesystems. */
	if (dir->i_sb != inode->i_sb)
		return -EXDEV;

	/*
	 * A reflink to an append-only or immutable file cannot be created.
	 */
	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
		return -EPERM;

	/* Only regular files can be reflinked. */
	if (!S_ISREG(inode->i_mode))
		return -EPERM;

	/*
	 * If the caller wants to preserve ownership, they require the
	 * rights to do so.
	 */
	if (preserve) {
		if ((current_fsuid() != inode->i_uid) && !capable(CAP_CHOWN))
			return -EPERM;
		if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN))
			return -EPERM;
	}

	/*
	 * If the caller is modifying any aspect of the attributes, they
	 * are not creating a snapshot. They need read permission on the
	 * file.
	 */
	if (!preserve) {
		error = inode_permission(inode, MAY_READ);
		if (error)
			return error;
	}

	mutex_lock(&inode->i_mutex);
	vfs_dq_init(dir);
	error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
	mutex_unlock(&inode->i_mutex);
	if (!error)
		fsnotify_create(dir, new_dentry);
	return error;
}
/*
 * Most codes are copied from sys_linkat.
 */
int ocfs2_reflink_ioctl(struct inode *inode,
			const char __user *oldname,
			const char __user *newname,
			bool preserve)
{
	struct dentry *new_dentry;
	struct nameidata nd;
	struct path old_path;
	int error;
	char *to = NULL;

	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
		return -EOPNOTSUPP;

	/* Resolve the source path. */
	error = user_path_at(AT_FDCWD, oldname, 0, &old_path);
	if (error) {
		mlog_errno(error);
		return error;
	}

	/* Resolve the target's parent; holds its i_mutex on success. */
	error = ocfs2_user_path_parent(newname, &nd, &to);
	if (error) {
		mlog_errno(error);
		goto out;
	}

	error = -EXDEV;
	if (old_path.mnt != nd.path.mnt)
		goto out_release;
	new_dentry = lookup_create(&nd, 0);
	error = PTR_ERR(new_dentry);
	if (IS_ERR(new_dentry)) {
		mlog_errno(error);
		goto out_unlock;
	}

	error = mnt_want_write(nd.path.mnt);
	if (error) {
		mlog_errno(error);
		goto out_dput;
	}

	error = ocfs2_vfs_reflink(old_path.dentry,
				  nd.path.dentry->d_inode,
				  new_dentry, preserve);
	mnt_drop_write(nd.path.mnt);
out_dput:
	dput(new_dentry);
out_unlock:
	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
out_release:
	path_put(&nd.path);
	putname(to);
out:
	path_put(&old_path);

	return error;
}