1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * refcounttree.c 5 * 6 * Copyright (C) 2009 Oracle. All rights reserved. 7 * 8 * This program is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU General Public 10 * License version 2 as published by the Free Software Foundation. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * General Public License for more details. 16 */ 17 18 #include <linux/sort.h> 19 #define MLOG_MASK_PREFIX ML_REFCOUNT 20 #include <cluster/masklog.h> 21 #include "ocfs2.h" 22 #include "inode.h" 23 #include "alloc.h" 24 #include "suballoc.h" 25 #include "journal.h" 26 #include "uptodate.h" 27 #include "super.h" 28 #include "buffer_head_io.h" 29 #include "blockcheck.h" 30 #include "refcounttree.h" 31 #include "sysfile.h" 32 #include "dlmglue.h" 33 #include "extent_map.h" 34 #include "aops.h" 35 36 #include <linux/bio.h> 37 #include <linux/blkdev.h> 38 #include <linux/gfp.h> 39 #include <linux/slab.h> 40 #include <linux/writeback.h> 41 #include <linux/pagevec.h> 42 #include <linux/swap.h> 43 44 struct ocfs2_cow_context { 45 struct inode *inode; 46 u32 cow_start; 47 u32 cow_len; 48 struct ocfs2_extent_tree data_et; 49 struct ocfs2_refcount_tree *ref_tree; 50 struct buffer_head *ref_root_bh; 51 struct ocfs2_alloc_context *meta_ac; 52 struct ocfs2_alloc_context *data_ac; 53 struct ocfs2_cached_dealloc_ctxt dealloc; 54 int (*get_clusters)(struct ocfs2_cow_context *context, 55 u32 v_cluster, u32 *p_cluster, 56 u32 *num_clusters, 57 unsigned int *extent_flags); 58 int (*cow_duplicate_clusters)(handle_t *handle, 59 struct ocfs2_cow_context *context, 60 u32 cpos, u32 old_cluster, 61 u32 new_cluster, u32 new_len); 62 }; 63 64 static inline struct ocfs2_refcount_tree * 65 cache_info_to_refcount(struct ocfs2_caching_info *ci) 66 { 67 return container_of(ci, struct ocfs2_refcount_tree, rf_ci); 68 } 69 70 static int ocfs2_validate_refcount_block(struct super_block *sb, 71 struct buffer_head *bh) 72 { 73 int rc; 74 struct ocfs2_refcount_block *rb = 75 (struct ocfs2_refcount_block *)bh->b_data; 76 77 mlog(0, "Validating refcount block %llu\n", 78 (unsigned long long)bh->b_blocknr); 79 80 BUG_ON(!buffer_uptodate(bh)); 81 82 /* 83 * If the ecc fails, we return the error but otherwise 84 * leave the filesystem running. We know any error is 85 * local to this block. 86 */ 87 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check); 88 if (rc) { 89 mlog(ML_ERROR, "Checksum failed for refcount block %llu\n", 90 (unsigned long long)bh->b_blocknr); 91 return rc; 92 } 93 94 95 if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) { 96 ocfs2_error(sb, 97 "Refcount block #%llu has bad signature %.*s", 98 (unsigned long long)bh->b_blocknr, 7, 99 rb->rf_signature); 100 return -EINVAL; 101 } 102 103 if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) { 104 ocfs2_error(sb, 105 "Refcount block #%llu has an invalid rf_blkno " 106 "of %llu", 107 (unsigned long long)bh->b_blocknr, 108 (unsigned long long)le64_to_cpu(rb->rf_blkno)); 109 return -EINVAL; 110 } 111 112 if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) { 113 ocfs2_error(sb, 114 "Refcount block #%llu has an invalid " 115 "rf_fs_generation of #%u", 116 (unsigned long long)bh->b_blocknr, 117 le32_to_cpu(rb->rf_fs_generation)); 118 return -EINVAL; 119 } 120 121 return 0; 122 } 123 124 static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci, 125 u64 rb_blkno, 126 struct buffer_head **bh) 127 { 128 int rc; 129 struct buffer_head *tmp = *bh; 130 131 rc = ocfs2_read_block(ci, rb_blkno, &tmp, 132 ocfs2_validate_refcount_block); 133 134 /* If ocfs2_read_block() got us a new bh, pass it up. */ 135 if (!rc && !*bh) 136 *bh = tmp; 137 138 return rc; 139 } 140 141 static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci) 142 { 143 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); 144 145 return rf->rf_blkno; 146 } 147 148 static struct super_block * 149 ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci) 150 { 151 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); 152 153 return rf->rf_sb; 154 } 155 156 static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci) 157 { 158 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); 159 160 spin_lock(&rf->rf_lock); 161 } 162 163 static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci) 164 { 165 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); 166 167 spin_unlock(&rf->rf_lock); 168 } 169 170 static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci) 171 { 172 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); 173 174 mutex_lock(&rf->rf_io_mutex); 175 } 176 177 static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci) 178 { 179 struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci); 180 181 mutex_unlock(&rf->rf_io_mutex); 182 } 183 184 static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = { 185 .co_owner = ocfs2_refcount_cache_owner, 186 .co_get_super = ocfs2_refcount_cache_get_super, 187 .co_cache_lock = ocfs2_refcount_cache_lock, 188 .co_cache_unlock = ocfs2_refcount_cache_unlock, 189 .co_io_lock = ocfs2_refcount_cache_io_lock, 190 .co_io_unlock = ocfs2_refcount_cache_io_unlock, 191 }; 192 193 static struct ocfs2_refcount_tree * 194 ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno) 195 { 196 struct rb_node *n = osb->osb_rf_lock_tree.rb_node; 197 struct ocfs2_refcount_tree *tree = NULL; 198 199 while (n) { 200 tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node); 201 202 if (blkno < tree->rf_blkno) 203 n = n->rb_left; 204 else if (blkno > tree->rf_blkno) 205 n = n->rb_right; 206 else 207 return tree; 208 } 209 210 return NULL; 211 } 212 213 /* osb_lock is already locked. */ 214 static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb, 215 struct ocfs2_refcount_tree *new) 216 { 217 u64 rf_blkno = new->rf_blkno; 218 struct rb_node *parent = NULL; 219 struct rb_node **p = &osb->osb_rf_lock_tree.rb_node; 220 struct ocfs2_refcount_tree *tmp; 221 222 while (*p) { 223 parent = *p; 224 225 tmp = rb_entry(parent, struct ocfs2_refcount_tree, 226 rf_node); 227 228 if (rf_blkno < tmp->rf_blkno) 229 p = &(*p)->rb_left; 230 else if (rf_blkno > tmp->rf_blkno) 231 p = &(*p)->rb_right; 232 else { 233 /* This should never happen! */ 234 mlog(ML_ERROR, "Duplicate refcount block %llu found!\n", 235 (unsigned long long)rf_blkno); 236 BUG(); 237 } 238 } 239 240 rb_link_node(&new->rf_node, parent, p); 241 rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree); 242 } 243 244 static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree) 245 { 246 ocfs2_metadata_cache_exit(&tree->rf_ci); 247 ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres); 248 ocfs2_lock_res_free(&tree->rf_lockres); 249 kfree(tree); 250 } 251 252 static inline void 253 ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb, 254 struct ocfs2_refcount_tree *tree) 255 { 256 rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree); 257 if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree) 258 osb->osb_ref_tree_lru = NULL; 259 } 260 261 static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb, 262 struct ocfs2_refcount_tree *tree) 263 { 264 spin_lock(&osb->osb_lock); 265 ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree); 266 spin_unlock(&osb->osb_lock); 267 } 268 269 void ocfs2_kref_remove_refcount_tree(struct kref *kref) 270 { 271 struct ocfs2_refcount_tree *tree = 272 container_of(kref, struct ocfs2_refcount_tree, rf_getcnt); 273 274 ocfs2_free_refcount_tree(tree); 275 } 276 277 static inline void 278 ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree) 279 { 280 kref_get(&tree->rf_getcnt); 281 } 282 283 static inline void 284 ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree) 285 { 286 kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree); 287 } 288 289 static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new, 290 struct super_block *sb) 291 { 292 ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops); 293 mutex_init(&new->rf_io_mutex); 294 new->rf_sb = sb; 295 spin_lock_init(&new->rf_lock); 296 } 297 298 static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb, 299 struct ocfs2_refcount_tree *new, 300 u64 rf_blkno, u32 generation) 301 { 302 init_rwsem(&new->rf_sem); 303 ocfs2_refcount_lock_res_init(&new->rf_lockres, osb, 304 rf_blkno, generation); 305 } 306 307 static struct ocfs2_refcount_tree* 308 ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno) 309 { 310 struct ocfs2_refcount_tree *new; 311 312 new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS); 313 if (!new) 314 return NULL; 315 316 new->rf_blkno = rf_blkno; 317 kref_init(&new->rf_getcnt); 318 ocfs2_init_refcount_tree_ci(new, osb->sb); 319 320 return new; 321 } 322 323 static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno, 324 struct ocfs2_refcount_tree **ret_tree) 325 { 326 int ret = 0; 327 struct ocfs2_refcount_tree *tree, *new = NULL; 328 struct buffer_head *ref_root_bh = NULL; 329 struct ocfs2_refcount_block *ref_rb; 330 331 spin_lock(&osb->osb_lock); 332 if (osb->osb_ref_tree_lru && 333 osb->osb_ref_tree_lru->rf_blkno == rf_blkno) 334 tree = osb->osb_ref_tree_lru; 335 else 336 tree = ocfs2_find_refcount_tree(osb, rf_blkno); 337 if (tree) 338 goto out; 339 340 spin_unlock(&osb->osb_lock); 341 342 new = ocfs2_allocate_refcount_tree(osb, rf_blkno); 343 if (!new) { 344 ret = -ENOMEM; 345 mlog_errno(ret); 346 return ret; 347 } 348 /* 349 * We need the generation to create the refcount tree lock and since 350 * it isn't changed during the tree modification, we are safe here to 351 * read without protection. 352 * We also have to purge the cache after we create the lock since the 353 * refcount block may have the stale data. It can only be trusted when 354 * we hold the refcount lock. 355 */ 356 ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh); 357 if (ret) { 358 mlog_errno(ret); 359 ocfs2_metadata_cache_exit(&new->rf_ci); 360 kfree(new); 361 return ret; 362 } 363 364 ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; 365 new->rf_generation = le32_to_cpu(ref_rb->rf_generation); 366 ocfs2_init_refcount_tree_lock(osb, new, rf_blkno, 367 new->rf_generation); 368 ocfs2_metadata_cache_purge(&new->rf_ci); 369 370 spin_lock(&osb->osb_lock); 371 tree = ocfs2_find_refcount_tree(osb, rf_blkno); 372 if (tree) 373 goto out; 374 375 ocfs2_insert_refcount_tree(osb, new); 376 377 tree = new; 378 new = NULL; 379 380 out: 381 *ret_tree = tree; 382 383 osb->osb_ref_tree_lru = tree; 384 385 spin_unlock(&osb->osb_lock); 386 387 if (new) 388 ocfs2_free_refcount_tree(new); 389 390 brelse(ref_root_bh); 391 return ret; 392 } 393 394 static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno) 395 { 396 int ret; 397 struct buffer_head *di_bh = NULL; 398 struct ocfs2_dinode *di; 399 400 ret = ocfs2_read_inode_block(inode, &di_bh); 401 if (ret) { 402 mlog_errno(ret); 403 goto out; 404 } 405 406 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 407 408 di = (struct ocfs2_dinode *)di_bh->b_data; 409 *ref_blkno = le64_to_cpu(di->i_refcount_loc); 410 brelse(di_bh); 411 out: 412 return ret; 413 } 414 415 static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb, 416 struct ocfs2_refcount_tree *tree, int rw) 417 { 418 int ret; 419 420 ret = ocfs2_refcount_lock(tree, rw); 421 if (ret) { 422 mlog_errno(ret); 423 goto out; 424 } 425 426 if (rw) 427 down_write(&tree->rf_sem); 428 else 429 down_read(&tree->rf_sem); 430 431 out: 432 return ret; 433 } 434 435 /* 436 * Lock the refcount tree pointed by ref_blkno and return the tree. 437 * In most case, we lock the tree and read the refcount block. 438 * So read it here if the caller really needs it. 439 * 440 * If the tree has been re-created by other node, it will free the 441 * old one and re-create it. 442 */ 443 int ocfs2_lock_refcount_tree(struct ocfs2_super *osb, 444 u64 ref_blkno, int rw, 445 struct ocfs2_refcount_tree **ret_tree, 446 struct buffer_head **ref_bh) 447 { 448 int ret, delete_tree = 0; 449 struct ocfs2_refcount_tree *tree = NULL; 450 struct buffer_head *ref_root_bh = NULL; 451 struct ocfs2_refcount_block *rb; 452 453 again: 454 ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree); 455 if (ret) { 456 mlog_errno(ret); 457 return ret; 458 } 459 460 ocfs2_refcount_tree_get(tree); 461 462 ret = __ocfs2_lock_refcount_tree(osb, tree, rw); 463 if (ret) { 464 mlog_errno(ret); 465 ocfs2_refcount_tree_put(tree); 466 goto out; 467 } 468 469 ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno, 470 &ref_root_bh); 471 if (ret) { 472 mlog_errno(ret); 473 ocfs2_unlock_refcount_tree(osb, tree, rw); 474 ocfs2_refcount_tree_put(tree); 475 goto out; 476 } 477 478 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; 479 /* 480 * If the refcount block has been freed and re-created, we may need 481 * to recreate the refcount tree also. 482 * 483 * Here we just remove the tree from the rb-tree, and the last 484 * kref holder will unlock and delete this refcount_tree. 485 * Then we goto "again" and ocfs2_get_refcount_tree will create 486 * the new refcount tree for us. 487 */ 488 if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) { 489 if (!tree->rf_removed) { 490 ocfs2_erase_refcount_tree_from_list(osb, tree); 491 tree->rf_removed = 1; 492 delete_tree = 1; 493 } 494 495 ocfs2_unlock_refcount_tree(osb, tree, rw); 496 /* 497 * We get an extra reference when we create the refcount 498 * tree, so another put will destroy it. 499 */ 500 if (delete_tree) 501 ocfs2_refcount_tree_put(tree); 502 brelse(ref_root_bh); 503 ref_root_bh = NULL; 504 goto again; 505 } 506 507 *ret_tree = tree; 508 if (ref_bh) { 509 *ref_bh = ref_root_bh; 510 ref_root_bh = NULL; 511 } 512 out: 513 brelse(ref_root_bh); 514 return ret; 515 } 516 517 int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw, 518 struct ocfs2_refcount_tree **ret_tree, 519 struct buffer_head **ref_bh) 520 { 521 int ret; 522 u64 ref_blkno; 523 524 ret = ocfs2_get_refcount_block(inode, &ref_blkno); 525 if (ret) { 526 mlog_errno(ret); 527 return ret; 528 } 529 530 return ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, 531 rw, ret_tree, ref_bh); 532 } 533 534 void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, 535 struct ocfs2_refcount_tree *tree, int rw) 536 { 537 if (rw) 538 up_write(&tree->rf_sem); 539 else 540 up_read(&tree->rf_sem); 541 542 ocfs2_refcount_unlock(tree, rw); 543 ocfs2_refcount_tree_put(tree); 544 } 545 546 void ocfs2_purge_refcount_trees(struct ocfs2_super *osb) 547 { 548 struct rb_node *node; 549 struct ocfs2_refcount_tree *tree; 550 struct rb_root *root = &osb->osb_rf_lock_tree; 551 552 while ((node = rb_last(root)) != NULL) { 553 tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node); 554 555 mlog(0, "Purge tree %llu\n", 556 (unsigned long long) tree->rf_blkno); 557 558 rb_erase(&tree->rf_node, root); 559 ocfs2_free_refcount_tree(tree); 560 } 561 } 562 563 /* 564 * Create a refcount tree for an inode. 565 * We take for granted that the inode is already locked. 566 */ 567 static int ocfs2_create_refcount_tree(struct inode *inode, 568 struct buffer_head *di_bh) 569 { 570 int ret; 571 handle_t *handle = NULL; 572 struct ocfs2_alloc_context *meta_ac = NULL; 573 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 574 struct ocfs2_inode_info *oi = OCFS2_I(inode); 575 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 576 struct buffer_head *new_bh = NULL; 577 struct ocfs2_refcount_block *rb; 578 struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL; 579 u16 suballoc_bit_start; 580 u32 num_got; 581 u64 first_blkno; 582 583 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); 584 585 mlog(0, "create tree for inode %lu\n", inode->i_ino); 586 587 ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); 588 if (ret) { 589 mlog_errno(ret); 590 goto out; 591 } 592 593 handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS); 594 if (IS_ERR(handle)) { 595 ret = PTR_ERR(handle); 596 mlog_errno(ret); 597 goto out; 598 } 599 600 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 601 OCFS2_JOURNAL_ACCESS_WRITE); 602 if (ret) { 603 mlog_errno(ret); 604 goto out_commit; 605 } 606 607 ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, 608 &suballoc_bit_start, &num_got, 609 &first_blkno); 610 if (ret) { 611 mlog_errno(ret); 612 goto out_commit; 613 } 614 615 new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno); 616 if (!new_tree) { 617 ret = -ENOMEM; 618 mlog_errno(ret); 619 goto out_commit; 620 } 621 622 new_bh = sb_getblk(inode->i_sb, first_blkno); 623 ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh); 624 625 ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh, 626 OCFS2_JOURNAL_ACCESS_CREATE); 627 if (ret) { 628 mlog_errno(ret); 629 goto out_commit; 630 } 631 632 /* Initialize ocfs2_refcount_block. */ 633 rb = (struct ocfs2_refcount_block *)new_bh->b_data; 634 memset(rb, 0, inode->i_sb->s_blocksize); 635 strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 636 rb->rf_suballoc_slot = cpu_to_le16(osb->slot_num); 637 rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 638 rb->rf_fs_generation = cpu_to_le32(osb->fs_generation); 639 rb->rf_blkno = cpu_to_le64(first_blkno); 640 rb->rf_count = cpu_to_le32(1); 641 rb->rf_records.rl_count = 642 cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb)); 643 spin_lock(&osb->osb_lock); 644 rb->rf_generation = osb->s_next_generation++; 645 spin_unlock(&osb->osb_lock); 646 647 ocfs2_journal_dirty(handle, new_bh); 648 649 spin_lock(&oi->ip_lock); 650 oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL; 651 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 652 di->i_refcount_loc = cpu_to_le64(first_blkno); 653 spin_unlock(&oi->ip_lock); 654 655 mlog(0, "created tree for inode %lu, refblock %llu\n", 656 inode->i_ino, (unsigned long long)first_blkno); 657 658 ocfs2_journal_dirty(handle, di_bh); 659 660 /* 661 * We have to init the tree lock here since it will use 662 * the generation number to create it. 663 */ 664 new_tree->rf_generation = le32_to_cpu(rb->rf_generation); 665 ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno, 666 new_tree->rf_generation); 667 668 spin_lock(&osb->osb_lock); 669 tree = ocfs2_find_refcount_tree(osb, first_blkno); 670 671 /* 672 * We've just created a new refcount tree in this block. If 673 * we found a refcount tree on the ocfs2_super, it must be 674 * one we just deleted. We free the old tree before 675 * inserting the new tree. 676 */ 677 BUG_ON(tree && tree->rf_generation == new_tree->rf_generation); 678 if (tree) 679 ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree); 680 ocfs2_insert_refcount_tree(osb, new_tree); 681 spin_unlock(&osb->osb_lock); 682 new_tree = NULL; 683 if (tree) 684 ocfs2_refcount_tree_put(tree); 685 686 out_commit: 687 ocfs2_commit_trans(osb, handle); 688 689 out: 690 if (new_tree) { 691 ocfs2_metadata_cache_exit(&new_tree->rf_ci); 692 kfree(new_tree); 693 } 694 695 brelse(new_bh); 696 if (meta_ac) 697 ocfs2_free_alloc_context(meta_ac); 698 699 return ret; 700 } 701 702 static int ocfs2_set_refcount_tree(struct inode *inode, 703 struct buffer_head *di_bh, 704 u64 refcount_loc) 705 { 706 int ret; 707 handle_t *handle = NULL; 708 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 709 struct ocfs2_inode_info *oi = OCFS2_I(inode); 710 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 711 struct buffer_head *ref_root_bh = NULL; 712 struct ocfs2_refcount_block *rb; 713 struct ocfs2_refcount_tree *ref_tree; 714 715 BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); 716 717 ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, 718 &ref_tree, &ref_root_bh); 719 if (ret) { 720 mlog_errno(ret); 721 return ret; 722 } 723 724 handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS); 725 if (IS_ERR(handle)) { 726 ret = PTR_ERR(handle); 727 mlog_errno(ret); 728 goto out; 729 } 730 731 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 732 OCFS2_JOURNAL_ACCESS_WRITE); 733 if (ret) { 734 mlog_errno(ret); 735 goto out_commit; 736 } 737 738 ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh, 739 OCFS2_JOURNAL_ACCESS_WRITE); 740 if (ret) { 741 mlog_errno(ret); 742 goto out_commit; 743 } 744 745 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; 746 le32_add_cpu(&rb->rf_count, 1); 747 748 ocfs2_journal_dirty(handle, ref_root_bh); 749 750 spin_lock(&oi->ip_lock); 751 oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL; 752 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 753 di->i_refcount_loc = cpu_to_le64(refcount_loc); 754 spin_unlock(&oi->ip_lock); 755 ocfs2_journal_dirty(handle, di_bh); 756 757 out_commit: 758 ocfs2_commit_trans(osb, handle); 759 out: 760 ocfs2_unlock_refcount_tree(osb, ref_tree, 1); 761 brelse(ref_root_bh); 762 763 return ret; 764 } 765 766 int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) 767 { 768 int ret, delete_tree = 0; 769 handle_t *handle = NULL; 770 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 771 struct ocfs2_inode_info *oi = OCFS2_I(inode); 772 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 773 struct ocfs2_refcount_block *rb; 774 struct inode *alloc_inode = NULL; 775 struct buffer_head *alloc_bh = NULL; 776 struct buffer_head *blk_bh = NULL; 777 struct ocfs2_refcount_tree *ref_tree; 778 int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS; 779 u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc); 780 u16 bit = 0; 781 782 if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) 783 return 0; 784 785 BUG_ON(!ref_blkno); 786 ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh); 787 if (ret) { 788 mlog_errno(ret); 789 return ret; 790 } 791 792 rb = (struct ocfs2_refcount_block *)blk_bh->b_data; 793 794 /* 795 * If we are the last user, we need to free the block. 796 * So lock the allocator ahead. 797 */ 798 if (le32_to_cpu(rb->rf_count) == 1) { 799 blk = le64_to_cpu(rb->rf_blkno); 800 bit = le16_to_cpu(rb->rf_suballoc_bit); 801 bg_blkno = ocfs2_which_suballoc_group(blk, bit); 802 803 alloc_inode = ocfs2_get_system_file_inode(osb, 804 EXTENT_ALLOC_SYSTEM_INODE, 805 le16_to_cpu(rb->rf_suballoc_slot)); 806 if (!alloc_inode) { 807 ret = -ENOMEM; 808 mlog_errno(ret); 809 goto out; 810 } 811 mutex_lock(&alloc_inode->i_mutex); 812 813 ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1); 814 if (ret) { 815 mlog_errno(ret); 816 goto out_mutex; 817 } 818 819 credits += OCFS2_SUBALLOC_FREE; 820 } 821 822 handle = ocfs2_start_trans(osb, credits); 823 if (IS_ERR(handle)) { 824 ret = PTR_ERR(handle); 825 mlog_errno(ret); 826 goto out_unlock; 827 } 828 829 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 830 OCFS2_JOURNAL_ACCESS_WRITE); 831 if (ret) { 832 mlog_errno(ret); 833 goto out_commit; 834 } 835 836 ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh, 837 OCFS2_JOURNAL_ACCESS_WRITE); 838 if (ret) { 839 mlog_errno(ret); 840 goto out_commit; 841 } 842 843 spin_lock(&oi->ip_lock); 844 oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL; 845 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 846 di->i_refcount_loc = 0; 847 spin_unlock(&oi->ip_lock); 848 ocfs2_journal_dirty(handle, di_bh); 849 850 le32_add_cpu(&rb->rf_count , -1); 851 ocfs2_journal_dirty(handle, blk_bh); 852 853 if (!rb->rf_count) { 854 delete_tree = 1; 855 ocfs2_erase_refcount_tree_from_list(osb, ref_tree); 856 ret = ocfs2_free_suballoc_bits(handle, alloc_inode, 857 alloc_bh, bit, bg_blkno, 1); 858 if (ret) 859 mlog_errno(ret); 860 } 861 862 out_commit: 863 ocfs2_commit_trans(osb, handle); 864 out_unlock: 865 if (alloc_inode) { 866 ocfs2_inode_unlock(alloc_inode, 1); 867 brelse(alloc_bh); 868 } 869 out_mutex: 870 if (alloc_inode) { 871 mutex_unlock(&alloc_inode->i_mutex); 872 iput(alloc_inode); 873 } 874 out: 875 ocfs2_unlock_refcount_tree(osb, ref_tree, 1); 876 if (delete_tree) 877 ocfs2_refcount_tree_put(ref_tree); 878 brelse(blk_bh); 879 880 return ret; 881 } 882 883 static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci, 884 struct buffer_head *ref_leaf_bh, 885 u64 cpos, unsigned int len, 886 struct ocfs2_refcount_rec *ret_rec, 887 int *index) 888 { 889 int i = 0; 890 struct ocfs2_refcount_block *rb = 891 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 892 struct ocfs2_refcount_rec *rec = NULL; 893 894 for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) { 895 rec = &rb->rf_records.rl_recs[i]; 896 897 if (le64_to_cpu(rec->r_cpos) + 898 le32_to_cpu(rec->r_clusters) <= cpos) 899 continue; 900 else if (le64_to_cpu(rec->r_cpos) > cpos) 901 break; 902 903 /* ok, cpos fail in this rec. Just return. */ 904 if (ret_rec) 905 *ret_rec = *rec; 906 goto out; 907 } 908 909 if (ret_rec) { 910 /* We meet with a hole here, so fake the rec. */ 911 ret_rec->r_cpos = cpu_to_le64(cpos); 912 ret_rec->r_refcount = 0; 913 if (i < le16_to_cpu(rb->rf_records.rl_used) && 914 le64_to_cpu(rec->r_cpos) < cpos + len) 915 ret_rec->r_clusters = 916 cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos); 917 else 918 ret_rec->r_clusters = cpu_to_le32(len); 919 } 920 921 out: 922 *index = i; 923 } 924 925 /* 926 * Given a cpos and len, try to find the refcount record which contains cpos. 927 * 1. If cpos can be found in one refcount record, return the record. 928 * 2. If cpos can't be found, return a fake record which start from cpos 929 * and end at a small value between cpos+len and start of the next record. 930 * This fake record has r_refcount = 0. 931 */ 932 static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, 933 struct buffer_head *ref_root_bh, 934 u64 cpos, unsigned int len, 935 struct ocfs2_refcount_rec *ret_rec, 936 int *index, 937 struct buffer_head **ret_bh) 938 { 939 int ret = 0, i, found; 940 u32 low_cpos; 941 struct ocfs2_extent_list *el; 942 struct ocfs2_extent_rec *tmp, *rec = NULL; 943 struct ocfs2_extent_block *eb; 944 struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL; 945 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 946 struct ocfs2_refcount_block *rb = 947 (struct ocfs2_refcount_block *)ref_root_bh->b_data; 948 949 if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) { 950 ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len, 951 ret_rec, index); 952 *ret_bh = ref_root_bh; 953 get_bh(ref_root_bh); 954 return 0; 955 } 956 957 el = &rb->rf_list; 958 low_cpos = cpos & OCFS2_32BIT_POS_MASK; 959 960 if (el->l_tree_depth) { 961 ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh); 962 if (ret) { 963 mlog_errno(ret); 964 goto out; 965 } 966 967 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 968 el = &eb->h_list; 969 970 if (el->l_tree_depth) { 971 ocfs2_error(sb, 972 "refcount tree %llu has non zero tree " 973 "depth in leaf btree tree block %llu\n", 974 (unsigned long long)ocfs2_metadata_cache_owner(ci), 975 (unsigned long long)eb_bh->b_blocknr); 976 ret = -EROFS; 977 goto out; 978 } 979 } 980 981 found = 0; 982 for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { 983 rec = &el->l_recs[i]; 984 985 if (le32_to_cpu(rec->e_cpos) <= low_cpos) { 986 found = 1; 987 break; 988 } 989 } 990 991 /* adjust len when we have ocfs2_extent_rec after it. */ 992 if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) { 993 tmp = &el->l_recs[i+1]; 994 995 if (le32_to_cpu(tmp->e_cpos) < cpos + len) 996 len = le32_to_cpu(tmp->e_cpos) - cpos; 997 } 998 999 ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno), 1000 &ref_leaf_bh); 1001 if (ret) { 1002 mlog_errno(ret); 1003 goto out; 1004 } 1005 1006 ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len, 1007 ret_rec, index); 1008 *ret_bh = ref_leaf_bh; 1009 out: 1010 brelse(eb_bh); 1011 return ret; 1012 } 1013 1014 enum ocfs2_ref_rec_contig { 1015 REF_CONTIG_NONE = 0, 1016 REF_CONTIG_LEFT, 1017 REF_CONTIG_RIGHT, 1018 REF_CONTIG_LEFTRIGHT, 1019 }; 1020 1021 static enum ocfs2_ref_rec_contig 1022 ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb, 1023 int index) 1024 { 1025 if ((rb->rf_records.rl_recs[index].r_refcount == 1026 rb->rf_records.rl_recs[index + 1].r_refcount) && 1027 (le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) + 1028 le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) == 1029 le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos))) 1030 return REF_CONTIG_RIGHT; 1031 1032 return REF_CONTIG_NONE; 1033 } 1034 1035 static enum ocfs2_ref_rec_contig 1036 ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb, 1037 int index) 1038 { 1039 enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE; 1040 1041 if (index < le16_to_cpu(rb->rf_records.rl_used) - 1) 1042 ret = ocfs2_refcount_rec_adjacent(rb, index); 1043 1044 if (index > 0) { 1045 enum ocfs2_ref_rec_contig tmp; 1046 1047 tmp = ocfs2_refcount_rec_adjacent(rb, index - 1); 1048 1049 if (tmp == REF_CONTIG_RIGHT) { 1050 if (ret == REF_CONTIG_RIGHT) 1051 ret = REF_CONTIG_LEFTRIGHT; 1052 else 1053 ret = REF_CONTIG_LEFT; 1054 } 1055 } 1056 1057 return ret; 1058 } 1059 1060 static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb, 1061 int index) 1062 { 1063 BUG_ON(rb->rf_records.rl_recs[index].r_refcount != 1064 rb->rf_records.rl_recs[index+1].r_refcount); 1065 1066 le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters, 1067 le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters)); 1068 1069 if (index < le16_to_cpu(rb->rf_records.rl_used) - 2) 1070 memmove(&rb->rf_records.rl_recs[index + 1], 1071 &rb->rf_records.rl_recs[index + 2], 1072 sizeof(struct ocfs2_refcount_rec) * 1073 (le16_to_cpu(rb->rf_records.rl_used) - index - 2)); 1074 1075 memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1], 1076 0, sizeof(struct ocfs2_refcount_rec)); 1077 le16_add_cpu(&rb->rf_records.rl_used, -1); 1078 } 1079 1080 /* 1081 * Merge the refcount rec if we are contiguous with the adjacent recs. 1082 */ 1083 static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb, 1084 int index) 1085 { 1086 enum ocfs2_ref_rec_contig contig = 1087 ocfs2_refcount_rec_contig(rb, index); 1088 1089 if (contig == REF_CONTIG_NONE) 1090 return; 1091 1092 if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) { 1093 BUG_ON(index == 0); 1094 index--; 1095 } 1096 1097 ocfs2_rotate_refcount_rec_left(rb, index); 1098 1099 if (contig == REF_CONTIG_LEFTRIGHT) 1100 ocfs2_rotate_refcount_rec_left(rb, index); 1101 } 1102 1103 /* 1104 * Change the refcount indexed by "index" in ref_bh. 1105 * If refcount reaches 0, remove it. 1106 */ 1107 static int ocfs2_change_refcount_rec(handle_t *handle, 1108 struct ocfs2_caching_info *ci, 1109 struct buffer_head *ref_leaf_bh, 1110 int index, int change) 1111 { 1112 int ret; 1113 struct ocfs2_refcount_block *rb = 1114 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 1115 struct ocfs2_refcount_list *rl = &rb->rf_records; 1116 struct ocfs2_refcount_rec *rec = &rl->rl_recs[index]; 1117 1118 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, 1119 OCFS2_JOURNAL_ACCESS_WRITE); 1120 if (ret) { 1121 mlog_errno(ret); 1122 goto out; 1123 } 1124 1125 mlog(0, "change index %d, old count %u, change %d\n", index, 1126 le32_to_cpu(rec->r_refcount), change); 1127 le32_add_cpu(&rec->r_refcount, change); 1128 1129 if (!rec->r_refcount) { 1130 if (index != le16_to_cpu(rl->rl_used) - 1) { 1131 memmove(rec, rec + 1, 1132 (le16_to_cpu(rl->rl_used) - index - 1) * 1133 sizeof(struct ocfs2_refcount_rec)); 1134 memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1], 1135 0, sizeof(struct ocfs2_refcount_rec)); 1136 } 1137 1138 le16_add_cpu(&rl->rl_used, -1); 1139 } else 1140 ocfs2_refcount_rec_merge(rb, index); 1141 1142 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1143 if (ret) 1144 mlog_errno(ret); 1145 out: 1146 return ret; 1147 } 1148 1149 static int ocfs2_expand_inline_ref_root(handle_t *handle, 1150 struct ocfs2_caching_info *ci, 1151 struct buffer_head *ref_root_bh, 1152 struct buffer_head **ref_leaf_bh, 1153 struct ocfs2_alloc_context *meta_ac) 1154 { 1155 int ret; 1156 u16 suballoc_bit_start; 1157 u32 num_got; 1158 u64 blkno; 1159 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1160 struct buffer_head *new_bh = NULL; 1161 struct ocfs2_refcount_block *new_rb; 1162 struct ocfs2_refcount_block *root_rb = 1163 (struct ocfs2_refcount_block *)ref_root_bh->b_data; 1164 1165 ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, 1166 OCFS2_JOURNAL_ACCESS_WRITE); 1167 if (ret) { 1168 mlog_errno(ret); 1169 goto out; 1170 } 1171 1172 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, 1173 &suballoc_bit_start, &num_got, 1174 &blkno); 1175 if (ret) { 1176 mlog_errno(ret); 1177 goto out; 1178 } 1179 1180 new_bh = sb_getblk(sb, blkno); 1181 if (new_bh == NULL) { 1182 ret = -EIO; 1183 mlog_errno(ret); 1184 goto out; 1185 } 1186 ocfs2_set_new_buffer_uptodate(ci, new_bh); 1187 1188 ret = ocfs2_journal_access_rb(handle, ci, new_bh, 1189 OCFS2_JOURNAL_ACCESS_CREATE); 1190 if (ret) { 1191 mlog_errno(ret); 1192 goto out; 1193 } 1194 1195 /* 1196 * Initialize ocfs2_refcount_block. 1197 * It should contain the same information as the old root. 1198 * so just memcpy it and change the corresponding field. 1199 */ 1200 memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize); 1201 1202 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1203 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); 1204 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1205 new_rb->rf_blkno = cpu_to_le64(blkno); 1206 new_rb->rf_cpos = cpu_to_le32(0); 1207 new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr); 1208 new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL); 1209 ocfs2_journal_dirty(handle, new_bh); 1210 1211 /* Now change the root. */ 1212 memset(&root_rb->rf_list, 0, sb->s_blocksize - 1213 offsetof(struct ocfs2_refcount_block, rf_list)); 1214 root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb)); 1215 root_rb->rf_clusters = cpu_to_le32(1); 1216 root_rb->rf_list.l_next_free_rec = cpu_to_le16(1); 1217 root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno); 1218 root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1); 1219 root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL); 1220 1221 ocfs2_journal_dirty(handle, ref_root_bh); 1222 1223 mlog(0, "new leaf block %llu, used %u\n", (unsigned long long)blkno, 1224 le16_to_cpu(new_rb->rf_records.rl_used)); 1225 1226 *ref_leaf_bh = new_bh; 1227 new_bh = NULL; 1228 out: 1229 brelse(new_bh); 1230 return ret; 1231 } 1232 1233 static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev, 1234 struct ocfs2_refcount_rec *next) 1235 { 1236 if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <= 1237 ocfs2_get_ref_rec_low_cpos(next)) 1238 return 1; 1239 1240 return 0; 1241 } 1242 1243 static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b) 1244 { 1245 const struct ocfs2_refcount_rec *l = a, *r = b; 1246 u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l); 1247 u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r); 1248 1249 if (l_cpos > r_cpos) 1250 return 1; 1251 if (l_cpos < r_cpos) 1252 return -1; 1253 return 0; 1254 } 1255 1256 static int cmp_refcount_rec_by_cpos(const void *a, const void *b) 1257 { 1258 const struct ocfs2_refcount_rec *l = a, *r = b; 1259 u64 l_cpos = le64_to_cpu(l->r_cpos); 1260 u64 r_cpos = le64_to_cpu(r->r_cpos); 1261 1262 if (l_cpos > r_cpos) 1263 return 1; 1264 if (l_cpos < r_cpos) 1265 return -1; 1266 return 0; 1267 } 1268 1269 static void swap_refcount_rec(void *a, void *b, int size) 1270 { 1271 struct ocfs2_refcount_rec *l = a, *r = b, tmp; 1272 1273 tmp = *(struct ocfs2_refcount_rec *)l; 1274 *(struct ocfs2_refcount_rec *)l = 1275 *(struct ocfs2_refcount_rec *)r; 1276 *(struct ocfs2_refcount_rec *)r = tmp; 1277 } 1278 1279 /* 1280 * The refcount cpos are ordered by their 64bit cpos, 1281 * But we will use the low 32 bit to be the e_cpos in the b-tree. 1282 * So we need to make sure that this pos isn't intersected with others. 1283 * 1284 * Note: The refcount block is already sorted by their low 32 bit cpos, 1285 * So just try the middle pos first, and we will exit when we find 1286 * the good position. 1287 */ 1288 static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl, 1289 u32 *split_pos, int *split_index) 1290 { 1291 int num_used = le16_to_cpu(rl->rl_used); 1292 int delta, middle = num_used / 2; 1293 1294 for (delta = 0; delta < middle; delta++) { 1295 /* Let's check delta earlier than middle */ 1296 if (ocfs2_refcount_rec_no_intersect( 1297 &rl->rl_recs[middle - delta - 1], 1298 &rl->rl_recs[middle - delta])) { 1299 *split_index = middle - delta; 1300 break; 1301 } 1302 1303 /* For even counts, don't walk off the end */ 1304 if ((middle + delta + 1) == num_used) 1305 continue; 1306 1307 /* Now try delta past middle */ 1308 if (ocfs2_refcount_rec_no_intersect( 1309 &rl->rl_recs[middle + delta], 1310 &rl->rl_recs[middle + delta + 1])) { 1311 *split_index = middle + delta + 1; 1312 break; 1313 } 1314 } 1315 1316 if (delta >= middle) 1317 return -ENOSPC; 1318 1319 *split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]); 1320 return 0; 1321 } 1322 1323 static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, 1324 struct buffer_head *new_bh, 1325 u32 *split_cpos) 1326 { 1327 int split_index = 0, num_moved, ret; 1328 u32 cpos = 0; 1329 struct ocfs2_refcount_block *rb = 1330 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 1331 struct ocfs2_refcount_list *rl = &rb->rf_records; 1332 struct ocfs2_refcount_block *new_rb = 1333 (struct ocfs2_refcount_block *)new_bh->b_data; 1334 struct ocfs2_refcount_list *new_rl = &new_rb->rf_records; 1335 1336 mlog(0, "split old leaf refcount block %llu, count = %u, used = %u\n", 1337 (unsigned long long)ref_leaf_bh->b_blocknr, 1338 le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used)); 1339 1340 /* 1341 * XXX: Improvement later. 1342 * If we know all the high 32 bit cpos is the same, no need to sort. 1343 * 1344 * In order to make the whole process safe, we do: 1345 * 1. sort the entries by their low 32 bit cpos first so that we can 1346 * find the split cpos easily. 1347 * 2. call ocfs2_insert_extent to insert the new refcount block. 1348 * 3. move the refcount rec to the new block. 1349 * 4. sort the entries by their 64 bit cpos. 1350 * 5. dirty the new_rb and rb. 1351 */ 1352 sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), 1353 sizeof(struct ocfs2_refcount_rec), 1354 cmp_refcount_rec_by_low_cpos, swap_refcount_rec); 1355 1356 ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index); 1357 if (ret) { 1358 mlog_errno(ret); 1359 return ret; 1360 } 1361 1362 new_rb->rf_cpos = cpu_to_le32(cpos); 1363 1364 /* move refcount records starting from split_index to the new block. */ 1365 num_moved = le16_to_cpu(rl->rl_used) - split_index; 1366 memcpy(new_rl->rl_recs, &rl->rl_recs[split_index], 1367 num_moved * sizeof(struct ocfs2_refcount_rec)); 1368 1369 /*ok, remove the entries we just moved over to the other block. */ 1370 memset(&rl->rl_recs[split_index], 0, 1371 num_moved * sizeof(struct ocfs2_refcount_rec)); 1372 1373 /* change old and new rl_used accordingly. */ 1374 le16_add_cpu(&rl->rl_used, -num_moved); 1375 new_rl->rl_used = cpu_to_le32(num_moved); 1376 1377 sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), 1378 sizeof(struct ocfs2_refcount_rec), 1379 cmp_refcount_rec_by_cpos, swap_refcount_rec); 1380 1381 sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used), 1382 sizeof(struct ocfs2_refcount_rec), 1383 cmp_refcount_rec_by_cpos, swap_refcount_rec); 1384 1385 *split_cpos = cpos; 1386 return 0; 1387 } 1388 1389 static int ocfs2_new_leaf_refcount_block(handle_t *handle, 1390 struct ocfs2_caching_info *ci, 1391 struct buffer_head *ref_root_bh, 1392 struct buffer_head *ref_leaf_bh, 1393 struct ocfs2_alloc_context *meta_ac) 1394 { 1395 int ret; 1396 u16 suballoc_bit_start; 1397 u32 num_got, new_cpos; 1398 u64 blkno; 1399 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1400 struct ocfs2_refcount_block *root_rb = 1401 (struct ocfs2_refcount_block *)ref_root_bh->b_data; 1402 struct buffer_head *new_bh = NULL; 1403 struct ocfs2_refcount_block *new_rb; 1404 struct ocfs2_extent_tree ref_et; 1405 1406 BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)); 1407 1408 ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, 1409 OCFS2_JOURNAL_ACCESS_WRITE); 1410 if (ret) { 1411 mlog_errno(ret); 1412 goto out; 1413 } 1414 1415 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, 1416 OCFS2_JOURNAL_ACCESS_WRITE); 1417 if (ret) { 1418 mlog_errno(ret); 1419 goto out; 1420 } 1421 1422 ret = ocfs2_claim_metadata(OCFS2_SB(sb), handle, meta_ac, 1, 1423 &suballoc_bit_start, &num_got, 1424 &blkno); 1425 if (ret) { 1426 mlog_errno(ret); 1427 goto out; 1428 } 1429 1430 new_bh = sb_getblk(sb, blkno); 1431 if (new_bh == NULL) { 1432 ret = -EIO; 1433 mlog_errno(ret); 1434 goto out; 1435 } 1436 ocfs2_set_new_buffer_uptodate(ci, new_bh); 1437 1438 ret = ocfs2_journal_access_rb(handle, ci, new_bh, 1439 OCFS2_JOURNAL_ACCESS_CREATE); 1440 if (ret) { 1441 mlog_errno(ret); 1442 goto out; 1443 } 1444 1445 /* Initialize ocfs2_refcount_block. */ 1446 new_rb = (struct ocfs2_refcount_block *)new_bh->b_data; 1447 memset(new_rb, 0, sb->s_blocksize); 1448 strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE); 1449 new_rb->rf_suballoc_slot = cpu_to_le16(OCFS2_SB(sb)->slot_num); 1450 new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start); 1451 new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 1452 new_rb->rf_blkno = cpu_to_le64(blkno); 1453 new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr); 1454 new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL); 1455 new_rb->rf_records.rl_count = 1456 cpu_to_le16(ocfs2_refcount_recs_per_rb(sb)); 1457 new_rb->rf_generation = root_rb->rf_generation; 1458 1459 ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos); 1460 if (ret) { 1461 mlog_errno(ret); 1462 goto out; 1463 } 1464 1465 ocfs2_journal_dirty(handle, ref_leaf_bh); 1466 ocfs2_journal_dirty(handle, new_bh); 1467 1468 ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh); 1469 1470 mlog(0, "insert new leaf block %llu at %u\n", 1471 (unsigned long long)new_bh->b_blocknr, new_cpos); 1472 1473 /* Insert the new leaf block with the specific offset cpos. */ 1474 ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr, 1475 1, 0, meta_ac); 1476 if (ret) 1477 mlog_errno(ret); 1478 1479 out: 1480 brelse(new_bh); 1481 return ret; 1482 } 1483 1484 static int ocfs2_expand_refcount_tree(handle_t *handle, 1485 struct ocfs2_caching_info *ci, 1486 struct buffer_head *ref_root_bh, 1487 struct buffer_head *ref_leaf_bh, 1488 struct ocfs2_alloc_context *meta_ac) 1489 { 1490 int ret; 1491 struct buffer_head *expand_bh = NULL; 1492 1493 if (ref_root_bh == ref_leaf_bh) { 1494 /* 1495 * the old root bh hasn't been expanded to a b-tree, 1496 * so expand it first. 1497 */ 1498 ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh, 1499 &expand_bh, meta_ac); 1500 if (ret) { 1501 mlog_errno(ret); 1502 goto out; 1503 } 1504 } else { 1505 expand_bh = ref_leaf_bh; 1506 get_bh(expand_bh); 1507 } 1508 1509 1510 /* Now add a new refcount block into the tree.*/ 1511 ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh, 1512 expand_bh, meta_ac); 1513 if (ret) 1514 mlog_errno(ret); 1515 out: 1516 brelse(expand_bh); 1517 return ret; 1518 } 1519 1520 /* 1521 * Adjust the extent rec in b-tree representing ref_leaf_bh. 1522 * 1523 * Only called when we have inserted a new refcount rec at index 0 1524 * which means ocfs2_extent_rec.e_cpos may need some change. 1525 */ 1526 static int ocfs2_adjust_refcount_rec(handle_t *handle, 1527 struct ocfs2_caching_info *ci, 1528 struct buffer_head *ref_root_bh, 1529 struct buffer_head *ref_leaf_bh, 1530 struct ocfs2_refcount_rec *rec) 1531 { 1532 int ret = 0, i; 1533 u32 new_cpos, old_cpos; 1534 struct ocfs2_path *path = NULL; 1535 struct ocfs2_extent_tree et; 1536 struct ocfs2_refcount_block *rb = 1537 (struct ocfs2_refcount_block *)ref_root_bh->b_data; 1538 struct ocfs2_extent_list *el; 1539 1540 if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) 1541 goto out; 1542 1543 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 1544 old_cpos = le32_to_cpu(rb->rf_cpos); 1545 new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK; 1546 if (old_cpos <= new_cpos) 1547 goto out; 1548 1549 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); 1550 1551 path = ocfs2_new_path_from_et(&et); 1552 if (!path) { 1553 ret = -ENOMEM; 1554 mlog_errno(ret); 1555 goto out; 1556 } 1557 1558 ret = ocfs2_find_path(ci, path, old_cpos); 1559 if (ret) { 1560 mlog_errno(ret); 1561 goto out; 1562 } 1563 1564 /* 1565 * 2 more credits, one for the leaf refcount block, one for 1566 * the extent block contains the extent rec. 1567 */ 1568 ret = ocfs2_extend_trans(handle, handle->h_buffer_credits + 2); 1569 if (ret < 0) { 1570 mlog_errno(ret); 1571 goto out; 1572 } 1573 1574 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, 1575 OCFS2_JOURNAL_ACCESS_WRITE); 1576 if (ret < 0) { 1577 mlog_errno(ret); 1578 goto out; 1579 } 1580 1581 ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path), 1582 OCFS2_JOURNAL_ACCESS_WRITE); 1583 if (ret < 0) { 1584 mlog_errno(ret); 1585 goto out; 1586 } 1587 1588 /* change the leaf extent block first. */ 1589 el = path_leaf_el(path); 1590 1591 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) 1592 if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos) 1593 break; 1594 1595 BUG_ON(i == le16_to_cpu(el->l_next_free_rec)); 1596 1597 el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); 1598 1599 /* change the r_cpos in the leaf block. */ 1600 rb->rf_cpos = cpu_to_le32(new_cpos); 1601 1602 ocfs2_journal_dirty(handle, path_leaf_bh(path)); 1603 ocfs2_journal_dirty(handle, ref_leaf_bh); 1604 1605 out: 1606 ocfs2_free_path(path); 1607 return ret; 1608 } 1609 1610 static int ocfs2_insert_refcount_rec(handle_t *handle, 1611 struct ocfs2_caching_info *ci, 1612 struct buffer_head *ref_root_bh, 1613 struct buffer_head *ref_leaf_bh, 1614 struct ocfs2_refcount_rec *rec, 1615 int index, 1616 struct ocfs2_alloc_context *meta_ac) 1617 { 1618 int ret; 1619 struct ocfs2_refcount_block *rb = 1620 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 1621 struct ocfs2_refcount_list *rf_list = &rb->rf_records; 1622 struct buffer_head *new_bh = NULL; 1623 1624 BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL); 1625 1626 if (rf_list->rl_used == rf_list->rl_count) { 1627 u64 cpos = le64_to_cpu(rec->r_cpos); 1628 u32 len = le32_to_cpu(rec->r_clusters); 1629 1630 ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh, 1631 ref_leaf_bh, meta_ac); 1632 if (ret) { 1633 mlog_errno(ret); 1634 goto out; 1635 } 1636 1637 ret = ocfs2_get_refcount_rec(ci, ref_root_bh, 1638 cpos, len, NULL, &index, 1639 &new_bh); 1640 if (ret) { 1641 mlog_errno(ret); 1642 goto out; 1643 } 1644 1645 ref_leaf_bh = new_bh; 1646 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 1647 rf_list = &rb->rf_records; 1648 } 1649 1650 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, 1651 OCFS2_JOURNAL_ACCESS_WRITE); 1652 if (ret) { 1653 mlog_errno(ret); 1654 goto out; 1655 } 1656 1657 if (index < le16_to_cpu(rf_list->rl_used)) 1658 memmove(&rf_list->rl_recs[index + 1], 1659 &rf_list->rl_recs[index], 1660 (le16_to_cpu(rf_list->rl_used) - index) * 1661 sizeof(struct ocfs2_refcount_rec)); 1662 1663 mlog(0, "insert refcount record start %llu, len %u, count %u " 1664 "to leaf block %llu at index %d\n", 1665 (unsigned long long)le64_to_cpu(rec->r_cpos), 1666 le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount), 1667 (unsigned long long)ref_leaf_bh->b_blocknr, index); 1668 1669 rf_list->rl_recs[index] = *rec; 1670 1671 le16_add_cpu(&rf_list->rl_used, 1); 1672 1673 ocfs2_refcount_rec_merge(rb, index); 1674 1675 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1676 if (ret) { 1677 mlog_errno(ret); 1678 goto out; 1679 } 1680 1681 if (index == 0) { 1682 ret = ocfs2_adjust_refcount_rec(handle, ci, 1683 ref_root_bh, 1684 ref_leaf_bh, rec); 1685 if (ret) 1686 mlog_errno(ret); 1687 } 1688 out: 1689 brelse(new_bh); 1690 return ret; 1691 } 1692 1693 /* 1694 * Split the refcount_rec indexed by "index" in ref_leaf_bh. 1695 * This is much simple than our b-tree code. 1696 * split_rec is the new refcount rec we want to insert. 1697 * If split_rec->r_refcount > 0, we are changing the refcount(in case we 1698 * increase refcount or decrease a refcount to non-zero). 1699 * If split_rec->r_refcount == 0, we are punching a hole in current refcount 1700 * rec( in case we decrease a refcount to zero). 1701 */ 1702 static int ocfs2_split_refcount_rec(handle_t *handle, 1703 struct ocfs2_caching_info *ci, 1704 struct buffer_head *ref_root_bh, 1705 struct buffer_head *ref_leaf_bh, 1706 struct ocfs2_refcount_rec *split_rec, 1707 int index, 1708 struct ocfs2_alloc_context *meta_ac, 1709 struct ocfs2_cached_dealloc_ctxt *dealloc) 1710 { 1711 int ret, recs_need; 1712 u32 len; 1713 struct ocfs2_refcount_block *rb = 1714 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 1715 struct ocfs2_refcount_list *rf_list = &rb->rf_records; 1716 struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index]; 1717 struct ocfs2_refcount_rec *tail_rec = NULL; 1718 struct buffer_head *new_bh = NULL; 1719 1720 BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL); 1721 1722 mlog(0, "original r_pos %llu, cluster %u, split %llu, cluster %u\n", 1723 le64_to_cpu(orig_rec->r_cpos), le32_to_cpu(orig_rec->r_clusters), 1724 le64_to_cpu(split_rec->r_cpos), 1725 le32_to_cpu(split_rec->r_clusters)); 1726 1727 /* 1728 * If we just need to split the header or tail clusters, 1729 * no more recs are needed, just split is OK. 1730 * Otherwise we at least need one new recs. 1731 */ 1732 if (!split_rec->r_refcount && 1733 (split_rec->r_cpos == orig_rec->r_cpos || 1734 le64_to_cpu(split_rec->r_cpos) + 1735 le32_to_cpu(split_rec->r_clusters) == 1736 le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters))) 1737 recs_need = 0; 1738 else 1739 recs_need = 1; 1740 1741 /* 1742 * We need one more rec if we split in the middle and the new rec have 1743 * some refcount in it. 1744 */ 1745 if (split_rec->r_refcount && 1746 (split_rec->r_cpos != orig_rec->r_cpos && 1747 le64_to_cpu(split_rec->r_cpos) + 1748 le32_to_cpu(split_rec->r_clusters) != 1749 le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters))) 1750 recs_need++; 1751 1752 /* If the leaf block don't have enough record, expand it. */ 1753 if (le16_to_cpu(rf_list->rl_used) + recs_need > rf_list->rl_count) { 1754 struct ocfs2_refcount_rec tmp_rec; 1755 u64 cpos = le64_to_cpu(orig_rec->r_cpos); 1756 len = le32_to_cpu(orig_rec->r_clusters); 1757 ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh, 1758 ref_leaf_bh, meta_ac); 1759 if (ret) { 1760 mlog_errno(ret); 1761 goto out; 1762 } 1763 1764 /* 1765 * We have to re-get it since now cpos may be moved to 1766 * another leaf block. 1767 */ 1768 ret = ocfs2_get_refcount_rec(ci, ref_root_bh, 1769 cpos, len, &tmp_rec, &index, 1770 &new_bh); 1771 if (ret) { 1772 mlog_errno(ret); 1773 goto out; 1774 } 1775 1776 ref_leaf_bh = new_bh; 1777 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 1778 rf_list = &rb->rf_records; 1779 orig_rec = &rf_list->rl_recs[index]; 1780 } 1781 1782 ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh, 1783 OCFS2_JOURNAL_ACCESS_WRITE); 1784 if (ret) { 1785 mlog_errno(ret); 1786 goto out; 1787 } 1788 1789 /* 1790 * We have calculated out how many new records we need and store 1791 * in recs_need, so spare enough space first by moving the records 1792 * after "index" to the end. 1793 */ 1794 if (index != le16_to_cpu(rf_list->rl_used) - 1) 1795 memmove(&rf_list->rl_recs[index + 1 + recs_need], 1796 &rf_list->rl_recs[index + 1], 1797 (le16_to_cpu(rf_list->rl_used) - index - 1) * 1798 sizeof(struct ocfs2_refcount_rec)); 1799 1800 len = (le64_to_cpu(orig_rec->r_cpos) + 1801 le32_to_cpu(orig_rec->r_clusters)) - 1802 (le64_to_cpu(split_rec->r_cpos) + 1803 le32_to_cpu(split_rec->r_clusters)); 1804 1805 /* 1806 * If we have "len", the we will split in the tail and move it 1807 * to the end of the space we have just spared. 1808 */ 1809 if (len) { 1810 tail_rec = &rf_list->rl_recs[index + recs_need]; 1811 1812 memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec)); 1813 le64_add_cpu(&tail_rec->r_cpos, 1814 le32_to_cpu(tail_rec->r_clusters) - len); 1815 tail_rec->r_clusters = le32_to_cpu(len); 1816 } 1817 1818 /* 1819 * If the split pos isn't the same as the original one, we need to 1820 * split in the head. 1821 * 1822 * Note: We have the chance that split_rec.r_refcount = 0, 1823 * recs_need = 0 and len > 0, which means we just cut the head from 1824 * the orig_rec and in that case we have done some modification in 1825 * orig_rec above, so the check for r_cpos is faked. 1826 */ 1827 if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) { 1828 len = le64_to_cpu(split_rec->r_cpos) - 1829 le64_to_cpu(orig_rec->r_cpos); 1830 orig_rec->r_clusters = cpu_to_le32(len); 1831 index++; 1832 } 1833 1834 le16_add_cpu(&rf_list->rl_used, recs_need); 1835 1836 if (split_rec->r_refcount) { 1837 rf_list->rl_recs[index] = *split_rec; 1838 mlog(0, "insert refcount record start %llu, len %u, count %u " 1839 "to leaf block %llu at index %d\n", 1840 (unsigned long long)le64_to_cpu(split_rec->r_cpos), 1841 le32_to_cpu(split_rec->r_clusters), 1842 le32_to_cpu(split_rec->r_refcount), 1843 (unsigned long long)ref_leaf_bh->b_blocknr, index); 1844 1845 ocfs2_refcount_rec_merge(rb, index); 1846 } 1847 1848 ret = ocfs2_journal_dirty(handle, ref_leaf_bh); 1849 if (ret) 1850 mlog_errno(ret); 1851 1852 out: 1853 brelse(new_bh); 1854 return ret; 1855 } 1856 1857 static int __ocfs2_increase_refcount(handle_t *handle, 1858 struct ocfs2_caching_info *ci, 1859 struct buffer_head *ref_root_bh, 1860 u64 cpos, u32 len, 1861 struct ocfs2_alloc_context *meta_ac, 1862 struct ocfs2_cached_dealloc_ctxt *dealloc) 1863 { 1864 int ret = 0, index; 1865 struct buffer_head *ref_leaf_bh = NULL; 1866 struct ocfs2_refcount_rec rec; 1867 unsigned int set_len = 0; 1868 1869 mlog(0, "Tree owner %llu, add refcount start %llu, len %u\n", 1870 (unsigned long long)ocfs2_metadata_cache_owner(ci), 1871 (unsigned long long)cpos, len); 1872 1873 while (len) { 1874 ret = ocfs2_get_refcount_rec(ci, ref_root_bh, 1875 cpos, len, &rec, &index, 1876 &ref_leaf_bh); 1877 if (ret) { 1878 mlog_errno(ret); 1879 goto out; 1880 } 1881 1882 set_len = le32_to_cpu(rec.r_clusters); 1883 1884 /* 1885 * Here we may meet with 3 situations: 1886 * 1887 * 1. If we find an already existing record, and the length 1888 * is the same, cool, we just need to increase the r_refcount 1889 * and it is OK. 1890 * 2. If we find a hole, just insert it with r_refcount = 1. 1891 * 3. If we are in the middle of one extent record, split 1892 * it. 1893 */ 1894 if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos && 1895 set_len <= len) { 1896 mlog(0, "increase refcount rec, start %llu, len %u, " 1897 "count %u\n", (unsigned long long)cpos, set_len, 1898 le32_to_cpu(rec.r_refcount)); 1899 ret = ocfs2_change_refcount_rec(handle, ci, 1900 ref_leaf_bh, index, 1); 1901 if (ret) { 1902 mlog_errno(ret); 1903 goto out; 1904 } 1905 } else if (!rec.r_refcount) { 1906 rec.r_refcount = cpu_to_le32(1); 1907 1908 mlog(0, "insert refcount rec, start %llu, len %u\n", 1909 (unsigned long long)le64_to_cpu(rec.r_cpos), 1910 set_len); 1911 ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh, 1912 ref_leaf_bh, 1913 &rec, index, meta_ac); 1914 if (ret) { 1915 mlog_errno(ret); 1916 goto out; 1917 } 1918 } else { 1919 set_len = min((u64)(cpos + len), 1920 le64_to_cpu(rec.r_cpos) + set_len) - cpos; 1921 rec.r_cpos = cpu_to_le64(cpos); 1922 rec.r_clusters = cpu_to_le32(set_len); 1923 le32_add_cpu(&rec.r_refcount, 1); 1924 1925 mlog(0, "split refcount rec, start %llu, " 1926 "len %u, count %u\n", 1927 (unsigned long long)le64_to_cpu(rec.r_cpos), 1928 set_len, le32_to_cpu(rec.r_refcount)); 1929 ret = ocfs2_split_refcount_rec(handle, ci, 1930 ref_root_bh, ref_leaf_bh, 1931 &rec, index, 1932 meta_ac, dealloc); 1933 if (ret) { 1934 mlog_errno(ret); 1935 goto out; 1936 } 1937 } 1938 1939 cpos += set_len; 1940 len -= set_len; 1941 brelse(ref_leaf_bh); 1942 ref_leaf_bh = NULL; 1943 } 1944 1945 out: 1946 brelse(ref_leaf_bh); 1947 return ret; 1948 } 1949 1950 static int ocfs2_remove_refcount_extent(handle_t *handle, 1951 struct ocfs2_caching_info *ci, 1952 struct buffer_head *ref_root_bh, 1953 struct buffer_head *ref_leaf_bh, 1954 struct ocfs2_alloc_context *meta_ac, 1955 struct ocfs2_cached_dealloc_ctxt *dealloc) 1956 { 1957 int ret; 1958 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 1959 struct ocfs2_refcount_block *rb = 1960 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 1961 struct ocfs2_extent_tree et; 1962 1963 BUG_ON(rb->rf_records.rl_used); 1964 1965 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); 1966 ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos), 1967 1, meta_ac, dealloc); 1968 if (ret) { 1969 mlog_errno(ret); 1970 goto out; 1971 } 1972 1973 ocfs2_remove_from_cache(ci, ref_leaf_bh); 1974 1975 /* 1976 * add the freed block to the dealloc so that it will be freed 1977 * when we run dealloc. 1978 */ 1979 ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE, 1980 le16_to_cpu(rb->rf_suballoc_slot), 1981 le64_to_cpu(rb->rf_blkno), 1982 le16_to_cpu(rb->rf_suballoc_bit)); 1983 if (ret) { 1984 mlog_errno(ret); 1985 goto out; 1986 } 1987 1988 ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh, 1989 OCFS2_JOURNAL_ACCESS_WRITE); 1990 if (ret) { 1991 mlog_errno(ret); 1992 goto out; 1993 } 1994 1995 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; 1996 1997 le32_add_cpu(&rb->rf_clusters, -1); 1998 1999 /* 2000 * check whether we need to restore the root refcount block if 2001 * there is no leaf extent block at atll. 2002 */ 2003 if (!rb->rf_list.l_next_free_rec) { 2004 BUG_ON(rb->rf_clusters); 2005 2006 mlog(0, "reset refcount tree root %llu to be a record block.\n", 2007 (unsigned long long)ref_root_bh->b_blocknr); 2008 2009 rb->rf_flags = 0; 2010 rb->rf_parent = 0; 2011 rb->rf_cpos = 0; 2012 memset(&rb->rf_records, 0, sb->s_blocksize - 2013 offsetof(struct ocfs2_refcount_block, rf_records)); 2014 rb->rf_records.rl_count = 2015 cpu_to_le16(ocfs2_refcount_recs_per_rb(sb)); 2016 } 2017 2018 ocfs2_journal_dirty(handle, ref_root_bh); 2019 2020 out: 2021 return ret; 2022 } 2023 2024 static int ocfs2_decrease_refcount_rec(handle_t *handle, 2025 struct ocfs2_caching_info *ci, 2026 struct buffer_head *ref_root_bh, 2027 struct buffer_head *ref_leaf_bh, 2028 int index, u64 cpos, unsigned int len, 2029 struct ocfs2_alloc_context *meta_ac, 2030 struct ocfs2_cached_dealloc_ctxt *dealloc) 2031 { 2032 int ret; 2033 struct ocfs2_refcount_block *rb = 2034 (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 2035 struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index]; 2036 2037 BUG_ON(cpos < le64_to_cpu(rec->r_cpos)); 2038 BUG_ON(cpos + len > 2039 le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters)); 2040 2041 if (cpos == le64_to_cpu(rec->r_cpos) && 2042 len == le32_to_cpu(rec->r_clusters)) 2043 ret = ocfs2_change_refcount_rec(handle, ci, 2044 ref_leaf_bh, index, -1); 2045 else { 2046 struct ocfs2_refcount_rec split = *rec; 2047 split.r_cpos = cpu_to_le64(cpos); 2048 split.r_clusters = cpu_to_le32(len); 2049 2050 le32_add_cpu(&split.r_refcount, -1); 2051 2052 mlog(0, "split refcount rec, start %llu, " 2053 "len %u, count %u, original start %llu, len %u\n", 2054 (unsigned long long)le64_to_cpu(split.r_cpos), 2055 len, le32_to_cpu(split.r_refcount), 2056 (unsigned long long)le64_to_cpu(rec->r_cpos), 2057 le32_to_cpu(rec->r_clusters)); 2058 ret = ocfs2_split_refcount_rec(handle, ci, 2059 ref_root_bh, ref_leaf_bh, 2060 &split, index, 2061 meta_ac, dealloc); 2062 } 2063 2064 if (ret) { 2065 mlog_errno(ret); 2066 goto out; 2067 } 2068 2069 /* Remove the leaf refcount block if it contains no refcount record. */ 2070 if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) { 2071 ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh, 2072 ref_leaf_bh, meta_ac, 2073 dealloc); 2074 if (ret) 2075 mlog_errno(ret); 2076 } 2077 2078 out: 2079 return ret; 2080 } 2081 2082 static int __ocfs2_decrease_refcount(handle_t *handle, 2083 struct ocfs2_caching_info *ci, 2084 struct buffer_head *ref_root_bh, 2085 u64 cpos, u32 len, 2086 struct ocfs2_alloc_context *meta_ac, 2087 struct ocfs2_cached_dealloc_ctxt *dealloc, 2088 int delete) 2089 { 2090 int ret = 0, index = 0; 2091 struct ocfs2_refcount_rec rec; 2092 unsigned int r_count = 0, r_len; 2093 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 2094 struct buffer_head *ref_leaf_bh = NULL; 2095 2096 mlog(0, "Tree owner %llu, decrease refcount start %llu, " 2097 "len %u, delete %u\n", 2098 (unsigned long long)ocfs2_metadata_cache_owner(ci), 2099 (unsigned long long)cpos, len, delete); 2100 2101 while (len) { 2102 ret = ocfs2_get_refcount_rec(ci, ref_root_bh, 2103 cpos, len, &rec, &index, 2104 &ref_leaf_bh); 2105 if (ret) { 2106 mlog_errno(ret); 2107 goto out; 2108 } 2109 2110 r_count = le32_to_cpu(rec.r_refcount); 2111 BUG_ON(r_count == 0); 2112 if (!delete) 2113 BUG_ON(r_count > 1); 2114 2115 r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) + 2116 le32_to_cpu(rec.r_clusters)) - cpos; 2117 2118 ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh, 2119 ref_leaf_bh, index, 2120 cpos, r_len, 2121 meta_ac, dealloc); 2122 if (ret) { 2123 mlog_errno(ret); 2124 goto out; 2125 } 2126 2127 if (le32_to_cpu(rec.r_refcount) == 1 && delete) { 2128 ret = ocfs2_cache_cluster_dealloc(dealloc, 2129 ocfs2_clusters_to_blocks(sb, cpos), 2130 r_len); 2131 if (ret) { 2132 mlog_errno(ret); 2133 goto out; 2134 } 2135 } 2136 2137 cpos += r_len; 2138 len -= r_len; 2139 brelse(ref_leaf_bh); 2140 ref_leaf_bh = NULL; 2141 } 2142 2143 out: 2144 brelse(ref_leaf_bh); 2145 return ret; 2146 } 2147 2148 /* Caller must hold refcount tree lock. */ 2149 int ocfs2_decrease_refcount(struct inode *inode, 2150 handle_t *handle, u32 cpos, u32 len, 2151 struct ocfs2_alloc_context *meta_ac, 2152 struct ocfs2_cached_dealloc_ctxt *dealloc, 2153 int delete) 2154 { 2155 int ret; 2156 u64 ref_blkno; 2157 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2158 struct buffer_head *ref_root_bh = NULL; 2159 struct ocfs2_refcount_tree *tree; 2160 2161 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 2162 2163 ret = ocfs2_get_refcount_block(inode, &ref_blkno); 2164 if (ret) { 2165 mlog_errno(ret); 2166 goto out; 2167 } 2168 2169 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree); 2170 if (ret) { 2171 mlog_errno(ret); 2172 goto out; 2173 } 2174 2175 ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno, 2176 &ref_root_bh); 2177 if (ret) { 2178 mlog_errno(ret); 2179 goto out; 2180 } 2181 2182 ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh, 2183 cpos, len, meta_ac, dealloc, delete); 2184 if (ret) 2185 mlog_errno(ret); 2186 out: 2187 brelse(ref_root_bh); 2188 return ret; 2189 } 2190 2191 /* 2192 * Mark the already-existing extent at cpos as refcounted for len clusters. 2193 * This adds the refcount extent flag. 2194 * 2195 * If the existing extent is larger than the request, initiate a 2196 * split. An attempt will be made at merging with adjacent extents. 2197 * 2198 * The caller is responsible for passing down meta_ac if we'll need it. 2199 */ 2200 static int ocfs2_mark_extent_refcounted(struct inode *inode, 2201 struct ocfs2_extent_tree *et, 2202 handle_t *handle, u32 cpos, 2203 u32 len, u32 phys, 2204 struct ocfs2_alloc_context *meta_ac, 2205 struct ocfs2_cached_dealloc_ctxt *dealloc) 2206 { 2207 int ret; 2208 2209 mlog(0, "Inode %lu refcount tree cpos %u, len %u, phys cluster %u\n", 2210 inode->i_ino, cpos, len, phys); 2211 2212 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { 2213 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " 2214 "tree, but the feature bit is not set in the " 2215 "super block.", inode->i_ino); 2216 ret = -EROFS; 2217 goto out; 2218 } 2219 2220 ret = ocfs2_change_extent_flag(handle, et, cpos, 2221 len, phys, meta_ac, dealloc, 2222 OCFS2_EXT_REFCOUNTED, 0); 2223 if (ret) 2224 mlog_errno(ret); 2225 2226 out: 2227 return ret; 2228 } 2229 2230 /* 2231 * Given some contiguous physical clusters, calculate what we need 2232 * for modifying their refcount. 2233 */ 2234 static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, 2235 struct ocfs2_caching_info *ci, 2236 struct buffer_head *ref_root_bh, 2237 u64 start_cpos, 2238 u32 clusters, 2239 int *meta_add, 2240 int *credits) 2241 { 2242 int ret = 0, index, ref_blocks = 0, recs_add = 0; 2243 u64 cpos = start_cpos; 2244 struct ocfs2_refcount_block *rb; 2245 struct ocfs2_refcount_rec rec; 2246 struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL; 2247 u32 len; 2248 2249 mlog(0, "start_cpos %llu, clusters %u\n", 2250 (unsigned long long)start_cpos, clusters); 2251 while (clusters) { 2252 ret = ocfs2_get_refcount_rec(ci, ref_root_bh, 2253 cpos, clusters, &rec, 2254 &index, &ref_leaf_bh); 2255 if (ret) { 2256 mlog_errno(ret); 2257 goto out; 2258 } 2259 2260 if (ref_leaf_bh != prev_bh) { 2261 /* 2262 * Now we encounter a new leaf block, so calculate 2263 * whether we need to extend the old leaf. 2264 */ 2265 if (prev_bh) { 2266 rb = (struct ocfs2_refcount_block *) 2267 prev_bh->b_data; 2268 2269 if (le64_to_cpu(rb->rf_records.rl_used) + 2270 recs_add > 2271 le16_to_cpu(rb->rf_records.rl_count)) 2272 ref_blocks++; 2273 } 2274 2275 recs_add = 0; 2276 *credits += 1; 2277 brelse(prev_bh); 2278 prev_bh = ref_leaf_bh; 2279 get_bh(prev_bh); 2280 } 2281 2282 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data; 2283 2284 mlog(0, "recs_add %d,cpos %llu, clusters %u, rec->r_cpos %llu," 2285 "rec->r_clusters %u, rec->r_refcount %u, index %d\n", 2286 recs_add, (unsigned long long)cpos, clusters, 2287 (unsigned long long)le64_to_cpu(rec.r_cpos), 2288 le32_to_cpu(rec.r_clusters), 2289 le32_to_cpu(rec.r_refcount), index); 2290 2291 len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) + 2292 le32_to_cpu(rec.r_clusters)) - cpos; 2293 /* 2294 * If the refcount rec already exist, cool. We just need 2295 * to check whether there is a split. Otherwise we just need 2296 * to increase the refcount. 2297 * If we will insert one, increases recs_add. 2298 * 2299 * We record all the records which will be inserted to the 2300 * same refcount block, so that we can tell exactly whether 2301 * we need a new refcount block or not. 2302 */ 2303 if (rec.r_refcount) { 2304 /* Check whether we need a split at the beginning. */ 2305 if (cpos == start_cpos && 2306 cpos != le64_to_cpu(rec.r_cpos)) 2307 recs_add++; 2308 2309 /* Check whether we need a split in the end. */ 2310 if (cpos + clusters < le64_to_cpu(rec.r_cpos) + 2311 le32_to_cpu(rec.r_clusters)) 2312 recs_add++; 2313 } else 2314 recs_add++; 2315 2316 brelse(ref_leaf_bh); 2317 ref_leaf_bh = NULL; 2318 clusters -= len; 2319 cpos += len; 2320 } 2321 2322 if (prev_bh) { 2323 rb = (struct ocfs2_refcount_block *)prev_bh->b_data; 2324 2325 if (le64_to_cpu(rb->rf_records.rl_used) + recs_add > 2326 le16_to_cpu(rb->rf_records.rl_count)) 2327 ref_blocks++; 2328 2329 *credits += 1; 2330 } 2331 2332 if (!ref_blocks) 2333 goto out; 2334 2335 mlog(0, "we need ref_blocks %d\n", ref_blocks); 2336 *meta_add += ref_blocks; 2337 *credits += ref_blocks; 2338 2339 /* 2340 * So we may need ref_blocks to insert into the tree. 2341 * That also means we need to change the b-tree and add that number 2342 * of records since we never merge them. 2343 * We need one more block for expansion since the new created leaf 2344 * block is also full and needs split. 2345 */ 2346 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; 2347 if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) { 2348 struct ocfs2_extent_tree et; 2349 2350 ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); 2351 *meta_add += ocfs2_extend_meta_needed(et.et_root_el); 2352 *credits += ocfs2_calc_extend_credits(sb, 2353 et.et_root_el, 2354 ref_blocks); 2355 } else { 2356 *credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS; 2357 *meta_add += 1; 2358 } 2359 2360 out: 2361 brelse(ref_leaf_bh); 2362 brelse(prev_bh); 2363 return ret; 2364 } 2365 2366 /* 2367 * For refcount tree, we will decrease some contiguous clusters 2368 * refcount count, so just go through it to see how many blocks 2369 * we gonna touch and whether we need to create new blocks. 2370 * 2371 * Normally the refcount blocks store these refcount should be 2372 * continguous also, so that we can get the number easily. 2373 * As for meta_ac, we will at most add split 2 refcount record and 2374 * 2 more refcount block, so just check it in a rough way. 2375 * 2376 * Caller must hold refcount tree lock. 2377 */ 2378 int ocfs2_prepare_refcount_change_for_del(struct inode *inode, 2379 struct buffer_head *di_bh, 2380 u64 phys_blkno, 2381 u32 clusters, 2382 int *credits, 2383 struct ocfs2_alloc_context **meta_ac) 2384 { 2385 int ret, ref_blocks = 0; 2386 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 2387 struct ocfs2_inode_info *oi = OCFS2_I(inode); 2388 struct buffer_head *ref_root_bh = NULL; 2389 struct ocfs2_refcount_tree *tree; 2390 u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); 2391 2392 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { 2393 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " 2394 "tree, but the feature bit is not set in the " 2395 "super block.", inode->i_ino); 2396 ret = -EROFS; 2397 goto out; 2398 } 2399 2400 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 2401 2402 ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), 2403 le64_to_cpu(di->i_refcount_loc), &tree); 2404 if (ret) { 2405 mlog_errno(ret); 2406 goto out; 2407 } 2408 2409 ret = ocfs2_read_refcount_block(&tree->rf_ci, 2410 le64_to_cpu(di->i_refcount_loc), 2411 &ref_root_bh); 2412 if (ret) { 2413 mlog_errno(ret); 2414 goto out; 2415 } 2416 2417 ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, 2418 &tree->rf_ci, 2419 ref_root_bh, 2420 start_cpos, clusters, 2421 &ref_blocks, credits); 2422 if (ret) { 2423 mlog_errno(ret); 2424 goto out; 2425 } 2426 2427 mlog(0, "reserve new metadata %d, credits = %d\n", 2428 ref_blocks, *credits); 2429 2430 if (ref_blocks) { 2431 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), 2432 ref_blocks, meta_ac); 2433 if (ret) 2434 mlog_errno(ret); 2435 } 2436 2437 out: 2438 brelse(ref_root_bh); 2439 return ret; 2440 } 2441 2442 #define MAX_CONTIG_BYTES 1048576 2443 2444 static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb) 2445 { 2446 return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES); 2447 } 2448 2449 static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb) 2450 { 2451 return ~(ocfs2_cow_contig_clusters(sb) - 1); 2452 } 2453 2454 /* 2455 * Given an extent that starts at 'start' and an I/O that starts at 'cpos', 2456 * find an offset (start + (n * contig_clusters)) that is closest to cpos 2457 * while still being less than or equal to it. 2458 * 2459 * The goal is to break the extent at a multiple of contig_clusters. 2460 */ 2461 static inline unsigned int ocfs2_cow_align_start(struct super_block *sb, 2462 unsigned int start, 2463 unsigned int cpos) 2464 { 2465 BUG_ON(start > cpos); 2466 2467 return start + ((cpos - start) & ocfs2_cow_contig_mask(sb)); 2468 } 2469 2470 /* 2471 * Given a cluster count of len, pad it out so that it is a multiple 2472 * of contig_clusters. 2473 */ 2474 static inline unsigned int ocfs2_cow_align_length(struct super_block *sb, 2475 unsigned int len) 2476 { 2477 unsigned int padded = 2478 (len + (ocfs2_cow_contig_clusters(sb) - 1)) & 2479 ocfs2_cow_contig_mask(sb); 2480 2481 /* Did we wrap? */ 2482 if (padded < len) 2483 padded = UINT_MAX; 2484 2485 return padded; 2486 } 2487 2488 /* 2489 * Calculate out the start and number of virtual clusters we need to to CoW. 2490 * 2491 * cpos is vitual start cluster position we want to do CoW in a 2492 * file and write_len is the cluster length. 2493 * max_cpos is the place where we want to stop CoW intentionally. 2494 * 2495 * Normal we will start CoW from the beginning of extent record cotaining cpos. 2496 * We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we 2497 * get good I/O from the resulting extent tree. 2498 */ 2499 static int ocfs2_refcount_cal_cow_clusters(struct inode *inode, 2500 struct ocfs2_extent_list *el, 2501 u32 cpos, 2502 u32 write_len, 2503 u32 max_cpos, 2504 u32 *cow_start, 2505 u32 *cow_len) 2506 { 2507 int ret = 0; 2508 int tree_height = le16_to_cpu(el->l_tree_depth), i; 2509 struct buffer_head *eb_bh = NULL; 2510 struct ocfs2_extent_block *eb = NULL; 2511 struct ocfs2_extent_rec *rec; 2512 unsigned int want_clusters, rec_end = 0; 2513 int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb); 2514 int leaf_clusters; 2515 2516 BUG_ON(cpos + write_len > max_cpos); 2517 2518 if (tree_height > 0) { 2519 ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh); 2520 if (ret) { 2521 mlog_errno(ret); 2522 goto out; 2523 } 2524 2525 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 2526 el = &eb->h_list; 2527 2528 if (el->l_tree_depth) { 2529 ocfs2_error(inode->i_sb, 2530 "Inode %lu has non zero tree depth in " 2531 "leaf block %llu\n", inode->i_ino, 2532 (unsigned long long)eb_bh->b_blocknr); 2533 ret = -EROFS; 2534 goto out; 2535 } 2536 } 2537 2538 *cow_len = 0; 2539 for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 2540 rec = &el->l_recs[i]; 2541 2542 if (ocfs2_is_empty_extent(rec)) { 2543 mlog_bug_on_msg(i != 0, "Inode %lu has empty record in " 2544 "index %d\n", inode->i_ino, i); 2545 continue; 2546 } 2547 2548 if (le32_to_cpu(rec->e_cpos) + 2549 le16_to_cpu(rec->e_leaf_clusters) <= cpos) 2550 continue; 2551 2552 if (*cow_len == 0) { 2553 /* 2554 * We should find a refcounted record in the 2555 * first pass. 2556 */ 2557 BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED)); 2558 *cow_start = le32_to_cpu(rec->e_cpos); 2559 } 2560 2561 /* 2562 * If we encounter a hole, a non-refcounted record or 2563 * pass the max_cpos, stop the search. 2564 */ 2565 if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) || 2566 (*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) || 2567 (max_cpos <= le32_to_cpu(rec->e_cpos))) 2568 break; 2569 2570 leaf_clusters = le16_to_cpu(rec->e_leaf_clusters); 2571 rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters; 2572 if (rec_end > max_cpos) { 2573 rec_end = max_cpos; 2574 leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos); 2575 } 2576 2577 /* 2578 * How many clusters do we actually need from 2579 * this extent? First we see how many we actually 2580 * need to complete the write. If that's smaller 2581 * than contig_clusters, we try for contig_clusters. 2582 */ 2583 if (!*cow_len) 2584 want_clusters = write_len; 2585 else 2586 want_clusters = (cpos + write_len) - 2587 (*cow_start + *cow_len); 2588 if (want_clusters < contig_clusters) 2589 want_clusters = contig_clusters; 2590 2591 /* 2592 * If the write does not cover the whole extent, we 2593 * need to calculate how we're going to split the extent. 2594 * We try to do it on contig_clusters boundaries. 2595 * 2596 * Any extent smaller than contig_clusters will be 2597 * CoWed in its entirety. 2598 */ 2599 if (leaf_clusters <= contig_clusters) 2600 *cow_len += leaf_clusters; 2601 else if (*cow_len || (*cow_start == cpos)) { 2602 /* 2603 * This extent needs to be CoW'd from its 2604 * beginning, so all we have to do is compute 2605 * how many clusters to grab. We align 2606 * want_clusters to the edge of contig_clusters 2607 * to get better I/O. 2608 */ 2609 want_clusters = ocfs2_cow_align_length(inode->i_sb, 2610 want_clusters); 2611 2612 if (leaf_clusters < want_clusters) 2613 *cow_len += leaf_clusters; 2614 else 2615 *cow_len += want_clusters; 2616 } else if ((*cow_start + contig_clusters) >= 2617 (cpos + write_len)) { 2618 /* 2619 * Breaking off contig_clusters at the front 2620 * of the extent will cover our write. That's 2621 * easy. 2622 */ 2623 *cow_len = contig_clusters; 2624 } else if ((rec_end - cpos) <= contig_clusters) { 2625 /* 2626 * Breaking off contig_clusters at the tail of 2627 * this extent will cover cpos. 2628 */ 2629 *cow_start = rec_end - contig_clusters; 2630 *cow_len = contig_clusters; 2631 } else if ((rec_end - cpos) <= want_clusters) { 2632 /* 2633 * While we can't fit the entire write in this 2634 * extent, we know that the write goes from cpos 2635 * to the end of the extent. Break that off. 2636 * We try to break it at some multiple of 2637 * contig_clusters from the front of the extent. 2638 * Failing that (ie, cpos is within 2639 * contig_clusters of the front), we'll CoW the 2640 * entire extent. 2641 */ 2642 *cow_start = ocfs2_cow_align_start(inode->i_sb, 2643 *cow_start, cpos); 2644 *cow_len = rec_end - *cow_start; 2645 } else { 2646 /* 2647 * Ok, the entire write lives in the middle of 2648 * this extent. Let's try to slice the extent up 2649 * nicely. Optimally, our CoW region starts at 2650 * m*contig_clusters from the beginning of the 2651 * extent and goes for n*contig_clusters, 2652 * covering the entire write. 2653 */ 2654 *cow_start = ocfs2_cow_align_start(inode->i_sb, 2655 *cow_start, cpos); 2656 2657 want_clusters = (cpos + write_len) - *cow_start; 2658 want_clusters = ocfs2_cow_align_length(inode->i_sb, 2659 want_clusters); 2660 if (*cow_start + want_clusters <= rec_end) 2661 *cow_len = want_clusters; 2662 else 2663 *cow_len = rec_end - *cow_start; 2664 } 2665 2666 /* Have we covered our entire write yet? */ 2667 if ((*cow_start + *cow_len) >= (cpos + write_len)) 2668 break; 2669 2670 /* 2671 * If we reach the end of the extent block and don't get enough 2672 * clusters, continue with the next extent block if possible. 2673 */ 2674 if (i + 1 == le16_to_cpu(el->l_next_free_rec) && 2675 eb && eb->h_next_leaf_blk) { 2676 brelse(eb_bh); 2677 eb_bh = NULL; 2678 2679 ret = ocfs2_read_extent_block(INODE_CACHE(inode), 2680 le64_to_cpu(eb->h_next_leaf_blk), 2681 &eb_bh); 2682 if (ret) { 2683 mlog_errno(ret); 2684 goto out; 2685 } 2686 2687 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 2688 el = &eb->h_list; 2689 i = -1; 2690 } 2691 } 2692 2693 out: 2694 brelse(eb_bh); 2695 return ret; 2696 } 2697 2698 /* 2699 * Prepare meta_ac, data_ac and calculate credits when we want to add some 2700 * num_clusters in data_tree "et" and change the refcount for the old 2701 * clusters(starting form p_cluster) in the refcount tree. 2702 * 2703 * Note: 2704 * 1. since we may split the old tree, so we at most will need num_clusters + 2 2705 * more new leaf records. 2706 * 2. In some case, we may not need to reserve new clusters(e.g, reflink), so 2707 * just give data_ac = NULL. 2708 */ 2709 static int ocfs2_lock_refcount_allocators(struct super_block *sb, 2710 u32 p_cluster, u32 num_clusters, 2711 struct ocfs2_extent_tree *et, 2712 struct ocfs2_caching_info *ref_ci, 2713 struct buffer_head *ref_root_bh, 2714 struct ocfs2_alloc_context **meta_ac, 2715 struct ocfs2_alloc_context **data_ac, 2716 int *credits) 2717 { 2718 int ret = 0, meta_add = 0; 2719 int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et); 2720 2721 if (num_free_extents < 0) { 2722 ret = num_free_extents; 2723 mlog_errno(ret); 2724 goto out; 2725 } 2726 2727 if (num_free_extents < num_clusters + 2) 2728 meta_add = 2729 ocfs2_extend_meta_needed(et->et_root_el); 2730 2731 *credits += ocfs2_calc_extend_credits(sb, et->et_root_el, 2732 num_clusters + 2); 2733 2734 ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh, 2735 p_cluster, num_clusters, 2736 &meta_add, credits); 2737 if (ret) { 2738 mlog_errno(ret); 2739 goto out; 2740 } 2741 2742 mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n", 2743 meta_add, num_clusters, *credits); 2744 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add, 2745 meta_ac); 2746 if (ret) { 2747 mlog_errno(ret); 2748 goto out; 2749 } 2750 2751 if (data_ac) { 2752 ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters, 2753 data_ac); 2754 if (ret) 2755 mlog_errno(ret); 2756 } 2757 2758 out: 2759 if (ret) { 2760 if (*meta_ac) { 2761 ocfs2_free_alloc_context(*meta_ac); 2762 *meta_ac = NULL; 2763 } 2764 } 2765 2766 return ret; 2767 } 2768 2769 static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh) 2770 { 2771 BUG_ON(buffer_dirty(bh)); 2772 2773 clear_buffer_mapped(bh); 2774 2775 return 0; 2776 } 2777 2778 static int ocfs2_duplicate_clusters_by_page(handle_t *handle, 2779 struct ocfs2_cow_context *context, 2780 u32 cpos, u32 old_cluster, 2781 u32 new_cluster, u32 new_len) 2782 { 2783 int ret = 0, partial; 2784 struct ocfs2_caching_info *ci = context->data_et.et_ci; 2785 struct super_block *sb = ocfs2_metadata_cache_get_super(ci); 2786 u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster); 2787 struct page *page; 2788 pgoff_t page_index; 2789 unsigned int from, to; 2790 loff_t offset, end, map_end; 2791 struct address_space *mapping = context->inode->i_mapping; 2792 2793 mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster, 2794 new_cluster, new_len, cpos); 2795 2796 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; 2797 end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits); 2798 2799 while (offset < end) { 2800 page_index = offset >> PAGE_CACHE_SHIFT; 2801 map_end = (page_index + 1) << PAGE_CACHE_SHIFT; 2802 if (map_end > end) 2803 map_end = end; 2804 2805 /* from, to is the offset within the page. */ 2806 from = offset & (PAGE_CACHE_SIZE - 1); 2807 to = PAGE_CACHE_SIZE; 2808 if (map_end & (PAGE_CACHE_SIZE - 1)) 2809 to = map_end & (PAGE_CACHE_SIZE - 1); 2810 2811 page = grab_cache_page(mapping, page_index); 2812 2813 /* This page can't be dirtied before we CoW it out. */ 2814 BUG_ON(PageDirty(page)); 2815 2816 if (!PageUptodate(page)) { 2817 ret = block_read_full_page(page, ocfs2_get_block); 2818 if (ret) { 2819 mlog_errno(ret); 2820 goto unlock; 2821 } 2822 lock_page(page); 2823 } 2824 2825 if (page_has_buffers(page)) { 2826 ret = walk_page_buffers(handle, page_buffers(page), 2827 from, to, &partial, 2828 ocfs2_clear_cow_buffer); 2829 if (ret) { 2830 mlog_errno(ret); 2831 goto unlock; 2832 } 2833 } 2834 2835 ocfs2_map_and_dirty_page(context->inode, 2836 handle, from, to, 2837 page, 0, &new_block); 2838 mark_page_accessed(page); 2839 unlock: 2840 unlock_page(page); 2841 page_cache_release(page); 2842 page = NULL; 2843 offset = map_end; 2844 if (ret) 2845 break; 2846 } 2847 2848 return ret; 2849 } 2850 2851 static int ocfs2_clear_ext_refcount(handle_t *handle, 2852 struct ocfs2_extent_tree *et, 2853 u32 cpos, u32 p_cluster, u32 len, 2854 unsigned int ext_flags, 2855 struct ocfs2_alloc_context *meta_ac, 2856 struct ocfs2_cached_dealloc_ctxt *dealloc) 2857 { 2858 int ret, index; 2859 struct ocfs2_extent_rec replace_rec; 2860 struct ocfs2_path *path = NULL; 2861 struct ocfs2_extent_list *el; 2862 struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); 2863 u64 ino = ocfs2_metadata_cache_owner(et->et_ci); 2864 2865 mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n", 2866 (unsigned long long)ino, cpos, len, p_cluster, ext_flags); 2867 2868 memset(&replace_rec, 0, sizeof(replace_rec)); 2869 replace_rec.e_cpos = cpu_to_le32(cpos); 2870 replace_rec.e_leaf_clusters = cpu_to_le16(len); 2871 replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb, 2872 p_cluster)); 2873 replace_rec.e_flags = ext_flags; 2874 replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED; 2875 2876 path = ocfs2_new_path_from_et(et); 2877 if (!path) { 2878 ret = -ENOMEM; 2879 mlog_errno(ret); 2880 goto out; 2881 } 2882 2883 ret = ocfs2_find_path(et->et_ci, path, cpos); 2884 if (ret) { 2885 mlog_errno(ret); 2886 goto out; 2887 } 2888 2889 el = path_leaf_el(path); 2890 2891 index = ocfs2_search_extent_list(el, cpos); 2892 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 2893 ocfs2_error(sb, 2894 "Inode %llu has an extent at cpos %u which can no " 2895 "longer be found.\n", 2896 (unsigned long long)ino, cpos); 2897 ret = -EROFS; 2898 goto out; 2899 } 2900 2901 ret = ocfs2_split_extent(handle, et, path, index, 2902 &replace_rec, meta_ac, dealloc); 2903 if (ret) 2904 mlog_errno(ret); 2905 2906 out: 2907 ocfs2_free_path(path); 2908 return ret; 2909 } 2910 2911 static int ocfs2_replace_clusters(handle_t *handle, 2912 struct ocfs2_cow_context *context, 2913 u32 cpos, u32 old, 2914 u32 new, u32 len, 2915 unsigned int ext_flags) 2916 { 2917 int ret; 2918 struct ocfs2_caching_info *ci = context->data_et.et_ci; 2919 u64 ino = ocfs2_metadata_cache_owner(ci); 2920 2921 mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n", 2922 (unsigned long long)ino, cpos, old, new, len, ext_flags); 2923 2924 /*If the old clusters is unwritten, no need to duplicate. */ 2925 if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) { 2926 ret = context->cow_duplicate_clusters(handle, context, cpos, 2927 old, new, len); 2928 if (ret) { 2929 mlog_errno(ret); 2930 goto out; 2931 } 2932 } 2933 2934 ret = ocfs2_clear_ext_refcount(handle, &context->data_et, 2935 cpos, new, len, ext_flags, 2936 context->meta_ac, &context->dealloc); 2937 if (ret) 2938 mlog_errno(ret); 2939 out: 2940 return ret; 2941 } 2942 2943 static int ocfs2_cow_sync_writeback(struct super_block *sb, 2944 struct ocfs2_cow_context *context, 2945 u32 cpos, u32 num_clusters) 2946 { 2947 int ret = 0; 2948 loff_t offset, end, map_end; 2949 pgoff_t page_index; 2950 struct page *page; 2951 2952 if (ocfs2_should_order_data(context->inode)) 2953 return 0; 2954 2955 offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits; 2956 end = offset + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits); 2957 2958 ret = filemap_fdatawrite_range(context->inode->i_mapping, 2959 offset, end - 1); 2960 if (ret < 0) { 2961 mlog_errno(ret); 2962 return ret; 2963 } 2964 2965 while (offset < end) { 2966 page_index = offset >> PAGE_CACHE_SHIFT; 2967 map_end = (page_index + 1) << PAGE_CACHE_SHIFT; 2968 if (map_end > end) 2969 map_end = end; 2970 2971 page = grab_cache_page(context->inode->i_mapping, page_index); 2972 BUG_ON(!page); 2973 2974 wait_on_page_writeback(page); 2975 if (PageError(page)) { 2976 ret = -EIO; 2977 mlog_errno(ret); 2978 } else 2979 mark_page_accessed(page); 2980 2981 unlock_page(page); 2982 page_cache_release(page); 2983 page = NULL; 2984 offset = map_end; 2985 if (ret) 2986 break; 2987 } 2988 2989 return ret; 2990 } 2991 2992 static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context, 2993 u32 v_cluster, u32 *p_cluster, 2994 u32 *num_clusters, 2995 unsigned int *extent_flags) 2996 { 2997 return ocfs2_get_clusters(context->inode, v_cluster, p_cluster, 2998 num_clusters, extent_flags); 2999 } 3000 3001 static int ocfs2_make_clusters_writable(struct super_block *sb, 3002 struct ocfs2_cow_context *context, 3003 u32 cpos, u32 p_cluster, 3004 u32 num_clusters, unsigned int e_flags) 3005 { 3006 int ret, delete, index, credits = 0; 3007 u32 new_bit, new_len; 3008 unsigned int set_len; 3009 struct ocfs2_super *osb = OCFS2_SB(sb); 3010 handle_t *handle; 3011 struct buffer_head *ref_leaf_bh = NULL; 3012 struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci; 3013 struct ocfs2_refcount_rec rec; 3014 3015 mlog(0, "cpos %u, p_cluster %u, num_clusters %u, e_flags %u\n", 3016 cpos, p_cluster, num_clusters, e_flags); 3017 3018 ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters, 3019 &context->data_et, 3020 ref_ci, 3021 context->ref_root_bh, 3022 &context->meta_ac, 3023 &context->data_ac, &credits); 3024 if (ret) { 3025 mlog_errno(ret); 3026 return ret; 3027 } 3028 3029 handle = ocfs2_start_trans(osb, credits); 3030 if (IS_ERR(handle)) { 3031 ret = PTR_ERR(handle); 3032 mlog_errno(ret); 3033 goto out; 3034 } 3035 3036 while (num_clusters) { 3037 ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh, 3038 p_cluster, num_clusters, 3039 &rec, &index, &ref_leaf_bh); 3040 if (ret) { 3041 mlog_errno(ret); 3042 goto out_commit; 3043 } 3044 3045 BUG_ON(!rec.r_refcount); 3046 set_len = min((u64)p_cluster + num_clusters, 3047 le64_to_cpu(rec.r_cpos) + 3048 le32_to_cpu(rec.r_clusters)) - p_cluster; 3049 3050 /* 3051 * There are many different situation here. 3052 * 1. If refcount == 1, remove the flag and don't COW. 3053 * 2. If refcount > 1, allocate clusters. 3054 * Here we may not allocate r_len once at a time, so continue 3055 * until we reach num_clusters. 3056 */ 3057 if (le32_to_cpu(rec.r_refcount) == 1) { 3058 delete = 0; 3059 ret = ocfs2_clear_ext_refcount(handle, 3060 &context->data_et, 3061 cpos, p_cluster, 3062 set_len, e_flags, 3063 context->meta_ac, 3064 &context->dealloc); 3065 if (ret) { 3066 mlog_errno(ret); 3067 goto out_commit; 3068 } 3069 } else { 3070 delete = 1; 3071 3072 ret = __ocfs2_claim_clusters(osb, handle, 3073 context->data_ac, 3074 1, set_len, 3075 &new_bit, &new_len); 3076 if (ret) { 3077 mlog_errno(ret); 3078 goto out_commit; 3079 } 3080 3081 ret = ocfs2_replace_clusters(handle, context, 3082 cpos, p_cluster, new_bit, 3083 new_len, e_flags); 3084 if (ret) { 3085 mlog_errno(ret); 3086 goto out_commit; 3087 } 3088 set_len = new_len; 3089 } 3090 3091 ret = __ocfs2_decrease_refcount(handle, ref_ci, 3092 context->ref_root_bh, 3093 p_cluster, set_len, 3094 context->meta_ac, 3095 &context->dealloc, delete); 3096 if (ret) { 3097 mlog_errno(ret); 3098 goto out_commit; 3099 } 3100 3101 cpos += set_len; 3102 p_cluster += set_len; 3103 num_clusters -= set_len; 3104 brelse(ref_leaf_bh); 3105 ref_leaf_bh = NULL; 3106 } 3107 3108 /* 3109 * Here we should write the new page out first if we are 3110 * in write-back mode. 3111 */ 3112 ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters); 3113 if (ret) 3114 mlog_errno(ret); 3115 3116 out_commit: 3117 ocfs2_commit_trans(osb, handle); 3118 3119 out: 3120 if (context->data_ac) { 3121 ocfs2_free_alloc_context(context->data_ac); 3122 context->data_ac = NULL; 3123 } 3124 if (context->meta_ac) { 3125 ocfs2_free_alloc_context(context->meta_ac); 3126 context->meta_ac = NULL; 3127 } 3128 brelse(ref_leaf_bh); 3129 3130 return ret; 3131 } 3132 3133 static int ocfs2_replace_cow(struct ocfs2_cow_context *context) 3134 { 3135 int ret = 0; 3136 struct inode *inode = context->inode; 3137 u32 cow_start = context->cow_start, cow_len = context->cow_len; 3138 u32 p_cluster, num_clusters; 3139 unsigned int ext_flags; 3140 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3141 3142 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) { 3143 ocfs2_error(inode->i_sb, "Inode %lu want to use refcount " 3144 "tree, but the feature bit is not set in the " 3145 "super block.", inode->i_ino); 3146 return -EROFS; 3147 } 3148 3149 ocfs2_init_dealloc_ctxt(&context->dealloc); 3150 3151 while (cow_len) { 3152 ret = context->get_clusters(context, cow_start, &p_cluster, 3153 &num_clusters, &ext_flags); 3154 if (ret) { 3155 mlog_errno(ret); 3156 break; 3157 } 3158 3159 BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED)); 3160 3161 if (cow_len < num_clusters) 3162 num_clusters = cow_len; 3163 3164 ret = ocfs2_make_clusters_writable(inode->i_sb, context, 3165 cow_start, p_cluster, 3166 num_clusters, ext_flags); 3167 if (ret) { 3168 mlog_errno(ret); 3169 break; 3170 } 3171 3172 cow_len -= num_clusters; 3173 cow_start += num_clusters; 3174 } 3175 3176 if (ocfs2_dealloc_has_cluster(&context->dealloc)) { 3177 ocfs2_schedule_truncate_log_flush(osb, 1); 3178 ocfs2_run_deallocs(osb, &context->dealloc); 3179 } 3180 3181 return ret; 3182 } 3183 3184 /* 3185 * Starting at cpos, try to CoW write_len clusters. Don't CoW 3186 * past max_cpos. This will stop when it runs into a hole or an 3187 * unrefcounted extent. 3188 */ 3189 static int ocfs2_refcount_cow_hunk(struct inode *inode, 3190 struct buffer_head *di_bh, 3191 u32 cpos, u32 write_len, u32 max_cpos) 3192 { 3193 int ret; 3194 u32 cow_start = 0, cow_len = 0; 3195 struct ocfs2_inode_info *oi = OCFS2_I(inode); 3196 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3197 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 3198 struct buffer_head *ref_root_bh = NULL; 3199 struct ocfs2_refcount_tree *ref_tree; 3200 struct ocfs2_cow_context *context = NULL; 3201 3202 BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); 3203 3204 ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list, 3205 cpos, write_len, max_cpos, 3206 &cow_start, &cow_len); 3207 if (ret) { 3208 mlog_errno(ret); 3209 goto out; 3210 } 3211 3212 mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, " 3213 "cow_len %u\n", inode->i_ino, 3214 cpos, write_len, cow_start, cow_len); 3215 3216 BUG_ON(cow_len == 0); 3217 3218 context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS); 3219 if (!context) { 3220 ret = -ENOMEM; 3221 mlog_errno(ret); 3222 goto out; 3223 } 3224 3225 ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), 3226 1, &ref_tree, &ref_root_bh); 3227 if (ret) { 3228 mlog_errno(ret); 3229 goto out; 3230 } 3231 3232 context->inode = inode; 3233 context->cow_start = cow_start; 3234 context->cow_len = cow_len; 3235 context->ref_tree = ref_tree; 3236 context->ref_root_bh = ref_root_bh; 3237 context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page; 3238 context->get_clusters = ocfs2_di_get_clusters; 3239 3240 ocfs2_init_dinode_extent_tree(&context->data_et, 3241 INODE_CACHE(inode), di_bh); 3242 3243 ret = ocfs2_replace_cow(context); 3244 if (ret) 3245 mlog_errno(ret); 3246 3247 /* 3248 * truncate the extent map here since no matter whether we meet with 3249 * any error during the action, we shouldn't trust cached extent map 3250 * any more. 3251 */ 3252 ocfs2_extent_map_trunc(inode, cow_start); 3253 3254 ocfs2_unlock_refcount_tree(osb, ref_tree, 1); 3255 brelse(ref_root_bh); 3256 out: 3257 kfree(context); 3258 return ret; 3259 } 3260 3261 /* 3262 * CoW any and all clusters between cpos and cpos+write_len. 3263 * Don't CoW past max_cpos. If this returns successfully, all 3264 * clusters between cpos and cpos+write_len are safe to modify. 3265 */ 3266 int ocfs2_refcount_cow(struct inode *inode, 3267 struct buffer_head *di_bh, 3268 u32 cpos, u32 write_len, u32 max_cpos) 3269 { 3270 int ret = 0; 3271 u32 p_cluster, num_clusters; 3272 unsigned int ext_flags; 3273 3274 while (write_len) { 3275 ret = ocfs2_get_clusters(inode, cpos, &p_cluster, 3276 &num_clusters, &ext_flags); 3277 if (ret) { 3278 mlog_errno(ret); 3279 break; 3280 } 3281 3282 if (write_len < num_clusters) 3283 num_clusters = write_len; 3284 3285 if (ext_flags & OCFS2_EXT_REFCOUNTED) { 3286 ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos, 3287 num_clusters, max_cpos); 3288 if (ret) { 3289 mlog_errno(ret); 3290 break; 3291 } 3292 } 3293 3294 write_len -= num_clusters; 3295 cpos += num_clusters; 3296 } 3297 3298 return ret; 3299 } 3300 3301 /* 3302 * Insert a new extent into refcount tree and mark a extent rec 3303 * as refcounted in the dinode tree. 3304 */ 3305 int ocfs2_add_refcount_flag(struct inode *inode, 3306 struct ocfs2_extent_tree *data_et, 3307 struct ocfs2_caching_info *ref_ci, 3308 struct buffer_head *ref_root_bh, 3309 u32 cpos, u32 p_cluster, u32 num_clusters, 3310 struct ocfs2_cached_dealloc_ctxt *dealloc) 3311 { 3312 int ret; 3313 handle_t *handle; 3314 int credits = 1, ref_blocks = 0; 3315 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3316 struct ocfs2_alloc_context *meta_ac = NULL; 3317 3318 ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, 3319 ref_ci, ref_root_bh, 3320 p_cluster, num_clusters, 3321 &ref_blocks, &credits); 3322 if (ret) { 3323 mlog_errno(ret); 3324 goto out; 3325 } 3326 3327 mlog(0, "reserve new metadata %d, credits = %d\n", 3328 ref_blocks, credits); 3329 3330 if (ref_blocks) { 3331 ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb), 3332 ref_blocks, &meta_ac); 3333 if (ret) { 3334 mlog_errno(ret); 3335 goto out; 3336 } 3337 } 3338 3339 handle = ocfs2_start_trans(osb, credits); 3340 if (IS_ERR(handle)) { 3341 ret = PTR_ERR(handle); 3342 mlog_errno(ret); 3343 goto out; 3344 } 3345 3346 ret = ocfs2_mark_extent_refcounted(inode, data_et, handle, 3347 cpos, num_clusters, p_cluster, 3348 meta_ac, dealloc); 3349 if (ret) { 3350 mlog_errno(ret); 3351 goto out_commit; 3352 } 3353 3354 ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh, 3355 p_cluster, num_clusters, 3356 meta_ac, dealloc); 3357 if (ret) 3358 mlog_errno(ret); 3359 3360 out_commit: 3361 ocfs2_commit_trans(osb, handle); 3362 out: 3363 if (meta_ac) 3364 ocfs2_free_alloc_context(meta_ac); 3365 return ret; 3366 } 3367 3368 static int ocfs2_change_ctime(struct inode *inode, 3369 struct buffer_head *di_bh) 3370 { 3371 int ret; 3372 handle_t *handle; 3373 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 3374 3375 handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), 3376 OCFS2_INODE_UPDATE_CREDITS); 3377 if (IS_ERR(handle)) { 3378 ret = PTR_ERR(handle); 3379 mlog_errno(ret); 3380 goto out; 3381 } 3382 3383 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 3384 OCFS2_JOURNAL_ACCESS_WRITE); 3385 if (ret) { 3386 mlog_errno(ret); 3387 goto out_commit; 3388 } 3389 3390 inode->i_ctime = CURRENT_TIME; 3391 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 3392 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 3393 3394 ocfs2_journal_dirty(handle, di_bh); 3395 3396 out_commit: 3397 ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); 3398 out: 3399 return ret; 3400 } 3401 3402 static int ocfs2_attach_refcount_tree(struct inode *inode, 3403 struct buffer_head *di_bh) 3404 { 3405 int ret, data_changed = 0; 3406 struct buffer_head *ref_root_bh = NULL; 3407 struct ocfs2_inode_info *oi = OCFS2_I(inode); 3408 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 3409 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3410 struct ocfs2_refcount_tree *ref_tree; 3411 unsigned int ext_flags; 3412 loff_t size; 3413 u32 cpos, num_clusters, clusters, p_cluster; 3414 struct ocfs2_cached_dealloc_ctxt dealloc; 3415 struct ocfs2_extent_tree di_et; 3416 3417 ocfs2_init_dealloc_ctxt(&dealloc); 3418 3419 if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) { 3420 ret = ocfs2_create_refcount_tree(inode, di_bh); 3421 if (ret) { 3422 mlog_errno(ret); 3423 goto out; 3424 } 3425 } 3426 3427 BUG_ON(!di->i_refcount_loc); 3428 ret = ocfs2_lock_refcount_tree(osb, 3429 le64_to_cpu(di->i_refcount_loc), 1, 3430 &ref_tree, &ref_root_bh); 3431 if (ret) { 3432 mlog_errno(ret); 3433 goto out; 3434 } 3435 3436 ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh); 3437 3438 size = i_size_read(inode); 3439 clusters = ocfs2_clusters_for_bytes(inode->i_sb, size); 3440 3441 cpos = 0; 3442 while (cpos < clusters) { 3443 ret = ocfs2_get_clusters(inode, cpos, &p_cluster, 3444 &num_clusters, &ext_flags); 3445 3446 if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) { 3447 ret = ocfs2_add_refcount_flag(inode, &di_et, 3448 &ref_tree->rf_ci, 3449 ref_root_bh, cpos, 3450 p_cluster, num_clusters, 3451 &dealloc); 3452 if (ret) { 3453 mlog_errno(ret); 3454 goto unlock; 3455 } 3456 3457 data_changed = 1; 3458 } 3459 cpos += num_clusters; 3460 } 3461 3462 if (data_changed) { 3463 ret = ocfs2_change_ctime(inode, di_bh); 3464 if (ret) 3465 mlog_errno(ret); 3466 } 3467 3468 unlock: 3469 ocfs2_unlock_refcount_tree(osb, ref_tree, 1); 3470 brelse(ref_root_bh); 3471 3472 if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) { 3473 ocfs2_schedule_truncate_log_flush(osb, 1); 3474 ocfs2_run_deallocs(osb, &dealloc); 3475 } 3476 out: 3477 /* 3478 * Empty the extent map so that we may get the right extent 3479 * record from the disk. 3480 */ 3481 ocfs2_extent_map_trunc(inode, 0); 3482 3483 return ret; 3484 } 3485 3486 static int ocfs2_add_refcounted_extent(struct inode *inode, 3487 struct ocfs2_extent_tree *et, 3488 struct ocfs2_caching_info *ref_ci, 3489 struct buffer_head *ref_root_bh, 3490 u32 cpos, u32 p_cluster, u32 num_clusters, 3491 unsigned int ext_flags, 3492 struct ocfs2_cached_dealloc_ctxt *dealloc) 3493 { 3494 int ret; 3495 handle_t *handle; 3496 int credits = 0; 3497 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3498 struct ocfs2_alloc_context *meta_ac = NULL; 3499 3500 ret = ocfs2_lock_refcount_allocators(inode->i_sb, 3501 p_cluster, num_clusters, 3502 et, ref_ci, 3503 ref_root_bh, &meta_ac, 3504 NULL, &credits); 3505 if (ret) { 3506 mlog_errno(ret); 3507 goto out; 3508 } 3509 3510 handle = ocfs2_start_trans(osb, credits); 3511 if (IS_ERR(handle)) { 3512 ret = PTR_ERR(handle); 3513 mlog_errno(ret); 3514 goto out; 3515 } 3516 3517 ret = ocfs2_insert_extent(handle, et, cpos, 3518 cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, 3519 p_cluster)), 3520 num_clusters, ext_flags, meta_ac); 3521 if (ret) { 3522 mlog_errno(ret); 3523 goto out_commit; 3524 } 3525 3526 ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh, 3527 p_cluster, num_clusters, 3528 meta_ac, dealloc); 3529 if (ret) 3530 mlog_errno(ret); 3531 3532 out_commit: 3533 ocfs2_commit_trans(osb, handle); 3534 out: 3535 if (meta_ac) 3536 ocfs2_free_alloc_context(meta_ac); 3537 return ret; 3538 } 3539 3540 static int ocfs2_duplicate_extent_list(struct inode *s_inode, 3541 struct inode *t_inode, 3542 struct buffer_head *t_bh, 3543 struct ocfs2_caching_info *ref_ci, 3544 struct buffer_head *ref_root_bh, 3545 struct ocfs2_cached_dealloc_ctxt *dealloc) 3546 { 3547 int ret = 0; 3548 u32 p_cluster, num_clusters, clusters, cpos; 3549 loff_t size; 3550 unsigned int ext_flags; 3551 struct ocfs2_extent_tree et; 3552 3553 ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh); 3554 3555 size = i_size_read(s_inode); 3556 clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size); 3557 3558 cpos = 0; 3559 while (cpos < clusters) { 3560 ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster, 3561 &num_clusters, &ext_flags); 3562 3563 if (p_cluster) { 3564 ret = ocfs2_add_refcounted_extent(t_inode, &et, 3565 ref_ci, ref_root_bh, 3566 cpos, p_cluster, 3567 num_clusters, 3568 ext_flags, 3569 dealloc); 3570 if (ret) { 3571 mlog_errno(ret); 3572 goto out; 3573 } 3574 } 3575 3576 cpos += num_clusters; 3577 } 3578 3579 out: 3580 return ret; 3581 } 3582 3583 /* 3584 * change the new file's attributes to the src. 3585 * 3586 * reflink creates a snapshot of a file, that means the attributes 3587 * must be identical except for three exceptions - nlink, ino, and ctime. 3588 */ 3589 static int ocfs2_complete_reflink(struct inode *s_inode, 3590 struct buffer_head *s_bh, 3591 struct inode *t_inode, 3592 struct buffer_head *t_bh) 3593 { 3594 int ret; 3595 handle_t *handle; 3596 struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data; 3597 struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data; 3598 loff_t size = i_size_read(s_inode); 3599 3600 handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb), 3601 OCFS2_INODE_UPDATE_CREDITS); 3602 if (IS_ERR(handle)) { 3603 ret = PTR_ERR(handle); 3604 mlog_errno(ret); 3605 return ret; 3606 } 3607 3608 ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh, 3609 OCFS2_JOURNAL_ACCESS_WRITE); 3610 if (ret) { 3611 mlog_errno(ret); 3612 goto out_commit; 3613 } 3614 3615 spin_lock(&OCFS2_I(t_inode)->ip_lock); 3616 OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters; 3617 OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr; 3618 OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features; 3619 spin_unlock(&OCFS2_I(t_inode)->ip_lock); 3620 i_size_write(t_inode, size); 3621 3622 di->i_xattr_inline_size = s_di->i_xattr_inline_size; 3623 di->i_clusters = s_di->i_clusters; 3624 di->i_size = s_di->i_size; 3625 di->i_dyn_features = s_di->i_dyn_features; 3626 di->i_attr = s_di->i_attr; 3627 di->i_uid = s_di->i_uid; 3628 di->i_gid = s_di->i_gid; 3629 di->i_mode = s_di->i_mode; 3630 3631 /* 3632 * update time. 3633 * we want mtime to appear identical to the source and update ctime. 3634 */ 3635 t_inode->i_ctime = CURRENT_TIME; 3636 3637 di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec); 3638 di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec); 3639 3640 t_inode->i_mtime = s_inode->i_mtime; 3641 di->i_mtime = s_di->i_mtime; 3642 di->i_mtime_nsec = s_di->i_mtime_nsec; 3643 3644 ocfs2_journal_dirty(handle, t_bh); 3645 3646 out_commit: 3647 ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle); 3648 return ret; 3649 } 3650 3651 static int ocfs2_create_reflink_node(struct inode *s_inode, 3652 struct buffer_head *s_bh, 3653 struct inode *t_inode, 3654 struct buffer_head *t_bh) 3655 { 3656 int ret; 3657 struct buffer_head *ref_root_bh = NULL; 3658 struct ocfs2_cached_dealloc_ctxt dealloc; 3659 struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb); 3660 struct ocfs2_refcount_block *rb; 3661 struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data; 3662 struct ocfs2_refcount_tree *ref_tree; 3663 3664 ocfs2_init_dealloc_ctxt(&dealloc); 3665 3666 ret = ocfs2_set_refcount_tree(t_inode, t_bh, 3667 le64_to_cpu(di->i_refcount_loc)); 3668 if (ret) { 3669 mlog_errno(ret); 3670 goto out; 3671 } 3672 3673 ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc), 3674 1, &ref_tree, &ref_root_bh); 3675 if (ret) { 3676 mlog_errno(ret); 3677 goto out; 3678 } 3679 rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data; 3680 3681 ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh, 3682 &ref_tree->rf_ci, ref_root_bh, 3683 &dealloc); 3684 if (ret) { 3685 mlog_errno(ret); 3686 goto out_unlock_refcount; 3687 } 3688 3689 ret = ocfs2_complete_reflink(s_inode, s_bh, t_inode, t_bh); 3690 if (ret) 3691 mlog_errno(ret); 3692 3693 out_unlock_refcount: 3694 ocfs2_unlock_refcount_tree(osb, ref_tree, 1); 3695 brelse(ref_root_bh); 3696 out: 3697 if (ocfs2_dealloc_has_cluster(&dealloc)) { 3698 ocfs2_schedule_truncate_log_flush(osb, 1); 3699 ocfs2_run_deallocs(osb, &dealloc); 3700 } 3701 3702 return ret; 3703 } 3704