/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * move_extents.c
 *
 * Copyright (C) 2011 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/mount.h>
#include <linux/swap.h>

#include <cluster/masklog.h>

#include "ocfs2.h"
#include "ocfs2_ioctl.h"

#include "alloc.h"
#include "aops.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
#include "journal.h"
#include "suballoc.h"
#include "uptodate.h"
#include "super.h"
#include "dir.h"
#include "buffer_head_io.h"
#include "sysfile.h"
#include "refcounttree.h"
#include "move_extents.h"

struct ocfs2_move_extents_context {
	struct inode *inode;
	struct file *file;
	int auto_defrag;
	int partial;
	int credits;
	u32 new_phys_cpos;
	u32 clusters_moved;
	u64 refcount_loc;
	struct ocfs2_move_extents *range;
	struct ocfs2_extent_tree et;
	struct ocfs2_alloc_context *meta_ac;
	struct ocfs2_alloc_context *data_ac;
	struct ocfs2_cached_dealloc_ctxt dealloc;
};

static int __ocfs2_move_extent(handle_t *handle,
			       struct ocfs2_move_extents_context *context,
			       u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
			       int ext_flags)
{
	int ret = 0, index;
	struct inode *inode = context->inode;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_extent_rec *rec, replace_rec;
	struct ocfs2_path *path = NULL;
	struct ocfs2_extent_list *el;
	u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
	u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);

	ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos,
					       p_cpos, new_p_cpos, len);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	memset(&replace_rec, 0, sizeof(replace_rec));
	replace_rec.e_cpos = cpu_to_le32(cpos);
	replace_rec.e_leaf_clusters = cpu_to_le16(len);
	replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
								   new_p_cpos));

	path = ocfs2_new_path_from_et(&context->et);
	if (!path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	el = path_leaf_el(path);

	index = ocfs2_search_extent_list(el, cpos);
	if (index == -1) {
		ret = ocfs2_error(inode->i_sb,
				  "Inode %llu has an extent at cpos %u which can no longer be found\n",
				  (unsigned long long)ino, cpos);
		goto out;
	}

	rec = &el->l_recs[index];

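	/*
	 * The caller sampled ext_flags from this same extent before
	 * calling in; a mismatch here means the extent tree changed
	 * underneath us.
	 */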
	BUG_ON(ext_flags != rec->e_flags);
	/*
	 * After moving/defragging to the new location, the extent is
	 * not going to be refcounted anymore.
	 */
	replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
				      context->et.et_root_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_split_extent(handle, &context->et, path, index,
				 &replace_rec, context->meta_ac,
				 &context->dealloc);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ocfs2_journal_dirty(handle, context->et.et_root_bh);

	context->new_phys_cpos = new_p_cpos;

	/*
	 * Do we need to append the truncate log for the old clusters?
	 */
	if (old_blkno) {
		if (ext_flags & OCFS2_EXT_REFCOUNTED)
			ret = ocfs2_decrease_refcount(inode, handle,
					ocfs2_blocks_to_clusters(osb->sb,
								 old_blkno),
					len, context->meta_ac,
					&context->dealloc, 1);
		else
			ret = ocfs2_truncate_log_append(osb, handle,
							old_blkno, len);
	}

	ocfs2_update_inode_fsync_trans(handle, inode, 0);
out:
	ocfs2_free_path(path);
	return ret;
}

/*
 * Lock the allocators and reserve the appropriate number of bits for
 * metadata blocks and data clusters.
 *
 * In some cases we don't need to reserve clusters; the caller then
 * just passes a NULL data_ac.
 */
static int ocfs2_lock_allocators_move_extents(struct inode *inode,
					struct ocfs2_extent_tree *et,
					u32 clusters_to_move,
					u32 extents_to_split,
					struct ocfs2_alloc_context **meta_ac,
					struct ocfs2_alloc_context **data_ac,
					int extra_blocks,
					int *credits)
{
	int ret, num_free_extents;
	unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	num_free_extents = ocfs2_num_free_extents(osb, et);
	if (num_free_extents < 0) {
		ret = num_free_extents;
		mlog_errno(ret);
		goto out;
	}

	if (!num_free_extents ||
	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
		extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);

	ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (data_ac) {
		ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	*credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el);

	mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
	     extra_blocks, clusters_to_move, *credits);
out:
	if (ret) {
		if (*meta_ac) {
			ocfs2_free_alloc_context(*meta_ac);
			*meta_ac = NULL;
		}
	}

	return ret;
}

/*
 * Use one journal handle to guarantee data consistency in case a
 * crash happens anywhere.
 *
 * XXX: defrag can end up moving only part of an extent, when not
 * enough contiguous clusters can be found in the allocator.
 */
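/*
 * On a partial move, *len is shrunk to the number of clusters that
 * were actually relocated before this returns.
 */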
static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
			       u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
{
	int ret, credits = 0, extra_blocks = 0, partial = context->partial;
	handle_t *handle;
	struct inode *inode = context->inode;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct inode *tl_inode = osb->osb_tl_inode;
	struct ocfs2_refcount_tree *ref_tree = NULL;
	u32 new_phys_cpos, new_len;
	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);

	if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {

		BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
			 OCFS2_HAS_REFCOUNT_FL));

		BUG_ON(!context->refcount_loc);

		ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
					       &ref_tree, NULL);
		if (ret) {
			mlog_errno(ret);
			return ret;
		}

		ret = ocfs2_prepare_refcount_change_for_del(inode,
							context->refcount_loc,
							phys_blkno,
							*len,
							&credits,
							&extra_blocks);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
						 &context->meta_ac,
						 &context->data_ac,
						 extra_blocks, &credits);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Should we be using the allocation reservation strategy here?
	 *
	 * if (context->data_ac)
	 *	context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
	 */

	inode_lock(tl_inode);

	if (ocfs2_truncate_log_needs_flush(osb)) {
		ret = __ocfs2_flush_truncate_log(osb);
		if (ret < 0) {
			mlog_errno(ret);
			goto out_unlock_mutex;
		}
	}

	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_unlock_mutex;
	}

	ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
				     &new_phys_cpos, &new_len);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/*
	 * Allowing partial extent moves has pros and cons: it makes the
	 * whole defragmentation less likely to fail, but it may also
	 * leave the fs even more fragmented after moving; let userspace
	 * make a good decision here.
	 */
	if (new_len != *len) {
		mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
		if (!partial) {
			context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
			ret = -ENOSPC;
			goto out_commit;
		}
	}

	mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
	     phys_cpos, new_phys_cpos);

	ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
				  new_phys_cpos, ext_flags);
	if (ret)
		mlog_errno(ret);

	if (partial && (new_len != *len))
		*len = new_len;

	/*
	 * Here we should write the new page out first if we are
	 * in write-back mode.
	 */
	ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
	if (ret)
		mlog_errno(ret);

out_commit:
	ocfs2_commit_trans(osb, handle);

out_unlock_mutex:
	inode_unlock(tl_inode);

	if (context->data_ac) {
		ocfs2_free_alloc_context(context->data_ac);
		context->data_ac = NULL;
	}

	if (context->meta_ac) {
		ocfs2_free_alloc_context(context->meta_ac);
		context->meta_ac = NULL;
	}

out:
	if (ref_tree)
		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);

	return ret;
}

/*
 * Find the victim alloc group where 'vict_blkno' fits.
 */
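/*
 * On success, *ret_bh holds the group descriptor containing vict_blkno
 * and *vict_bit is its bit offset within that group; the caller must
 * brelse() *ret_bh.
 */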
static int ocfs2_find_victim_alloc_group(struct inode *inode,
					 u64 vict_blkno,
					 int type, int slot,
					 int *vict_bit,
					 struct buffer_head **ret_bh)
{
	int ret, i, bits_per_unit = 0;
	u64 blkno;
	char namebuf[40];

	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
	struct ocfs2_chain_list *cl;
	struct ocfs2_chain_rec *rec;
	struct ocfs2_dinode *ac_dinode;
	struct ocfs2_group_desc *bg;

	ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
	ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
					 strlen(namebuf), &blkno);
	if (ret) {
		ret = -ENOENT;
		goto out;
	}

	ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
	cl = &(ac_dinode->id2.i_chain);
	rec = &(cl->cl_recs[0]);

	if (type == GLOBAL_BITMAP_SYSTEM_INODE)
		bits_per_unit = osb->s_clustersize_bits -
					inode->i_sb->s_blocksize_bits;
	/*
	 * 'vict_blkno' is out of the valid range.
	 */
	if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
	    (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
				bits_per_unit))) {
		ret = -EINVAL;
		goto out;
	}

	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {

		rec = &(cl->cl_recs[i]);
		if (!rec)
			continue;

		bg = NULL;

		do {
			if (!bg)
				blkno = le64_to_cpu(rec->c_blkno);
			else
				blkno = le64_to_cpu(bg->bg_next_group);

			if (gd_bh) {
				brelse(gd_bh);
				gd_bh = NULL;
			}

			ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			bg = (struct ocfs2_group_desc *)gd_bh->b_data;

			if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
						le16_to_cpu(bg->bg_bits))) {

				*ret_bh = gd_bh;
				*vict_bit = (vict_blkno - blkno) >>
							bits_per_unit;
				mlog(0, "find the victim group: #%llu, "
				     "total_bits: %u, vict_bit: %u\n",
				     blkno, le16_to_cpu(bg->bg_bits),
				     *vict_bit);
				goto out;
			}

		} while (le64_to_cpu(bg->bg_next_group));
	}

	ret = -EINVAL;
out:
	brelse(ac_bh);

	/*
	 * The caller has to release gd_bh properly.
	 */
	return ret;
}

/*
 * XXX: helper to validate and adjust the moving goal.
 */
static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
					       struct ocfs2_move_extents *range)
{
	int ret, goal_bit = 0;

	struct buffer_head *gd_bh = NULL;
	struct ocfs2_group_desc *bg;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	int c_to_b = 1 << (osb->s_clustersize_bits -
					inode->i_sb->s_blocksize_bits);

	/*
	 * Make the goal cluster-aligned.
	 */
	range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb,
						      range->me_goal);
	/*
	 * Validate that the goal sits within the global_bitmap, and
	 * return the victim group descriptor.
	 */
	ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
					    GLOBAL_BITMAP_SYSTEM_INODE,
					    OCFS2_INVALID_SLOT,
					    &goal_bit, &gd_bh);
	if (ret)
		goto out;

	bg = (struct ocfs2_group_desc *)gd_bh->b_data;

	/*
	 * The moving goal is not allowed to start at a group descriptor
	 * block (bit #0), so compromise to the next cluster.
	 */
	if (range->me_goal == le64_to_cpu(bg->bg_blkno))
		range->me_goal += c_to_b;

	/*
	 * The movement is not allowed to cross two groups.
	 */
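	/*
	 * bg_bits counts clusters in the global bitmap, so multiplying
	 * by s_clustersize compares the remaining room against me_len
	 * in bytes.
	 */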
	if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
								range->me_len) {
		ret = -EINVAL;
		goto out;
	}
	/*
	 * More exact validations/adjustments will be performed later
	 * during the moving operation for each extent range.
	 */
	mlog(0, "extents get ready to be moved to #%llu block\n",
	     range->me_goal);

out:
	brelse(gd_bh);

	return ret;
}

static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
				    int *goal_bit, u32 move_len, u32 max_hop,
				    u32 *phys_cpos)
{
	int i, used, last_free_bits = 0, base_bit = *goal_bit;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
	u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
						 le64_to_cpu(gd->bg_blkno));

	for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {

		used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
		if (used) {
			/*
			 * We even tried searching for the free chunk by
			 * jumping as far as 'max_hop' distance, but
			 * still failed.
			 */
			if ((i - base_bit) > max_hop) {
				*phys_cpos = 0;
				break;
			}

			if (last_free_bits)
				last_free_bits = 0;

			continue;
		} else
			last_free_bits++;

		if (last_free_bits == move_len) {
			*goal_bit = i;
			*phys_cpos = base_cpos + i;
			break;
		}
	}

	mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
}

static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
			     u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
			     u32 len, int ext_flags)
{
	int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
	handle_t *handle;
	struct inode *inode = context->inode;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct inode *tl_inode = osb->osb_tl_inode;
	struct inode *gb_inode = NULL;
	struct buffer_head *gb_bh = NULL;
	struct buffer_head *gd_bh = NULL;
	struct ocfs2_group_desc *gd;
	struct ocfs2_refcount_tree *ref_tree = NULL;
	u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
						    context->range->me_threshold);
	u64 phys_blkno, new_phys_blkno;

	phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);

	if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {

		BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
			 OCFS2_HAS_REFCOUNT_FL));

		BUG_ON(!context->refcount_loc);

		ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
					       &ref_tree, NULL);
		if (ret) {
			mlog_errno(ret);
			return ret;
		}

		ret = ocfs2_prepare_refcount_change_for_del(inode,
							context->refcount_loc,
							phys_blkno,
							len,
							&credits,
							&extra_blocks);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
						 &context->meta_ac,
						 NULL, extra_blocks, &credits);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Count 2 extra credits for the global_bitmap inode and the
	 * group descriptor.
	 */
	credits += OCFS2_INODE_UPDATE_CREDITS + 1;

	/*
	 * ocfs2_move_extent() didn't reserve any clusters in
	 * ocfs2_lock_allocators_move_extents(), but we still need to
	 * lock the global_bitmap.
	 */
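	/*
	 * Lock ordering below: global bitmap inode lock -> global bitmap
	 * cluster lock -> truncate log inode lock -> journal transaction.
	 */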
	gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
					       OCFS2_INVALID_SLOT);
	if (!gb_inode) {
		mlog(ML_ERROR, "unable to get global_bitmap inode\n");
		ret = -EIO;
		goto out;
	}

	inode_lock(gb_inode);

	ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
	if (ret) {
		mlog_errno(ret);
		goto out_unlock_gb_mutex;
	}

	inode_lock(tl_inode);

	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_unlock_tl_inode;
	}

	new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
	ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
					    GLOBAL_BITMAP_SYSTEM_INODE,
					    OCFS2_INVALID_SLOT,
					    &goal_bit, &gd_bh);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/*
	 * Probe the victim cluster group to find a proper region to fit
	 * the wanted movement; it will even perform a best-effort
	 * attempt by compromising to a threshold around the goal.
	 */
	ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
				new_phys_cpos);
	if (!*new_phys_cpos) {
		ret = -ENOSPC;
		goto out_commit;
	}

	ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
				  *new_phys_cpos, ext_flags);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	gd = (struct ocfs2_group_desc *)gd_bh->b_data;
	ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
					       le16_to_cpu(gd->bg_chain));
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
					 goal_bit, len);
	if (ret) {
		ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len,
						   le16_to_cpu(gd->bg_chain));
		mlog_errno(ret);
	}

	/*
	 * Here we should write the new page out first if we are
	 * in write-back mode.
	 */
	ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
	if (ret)
		mlog_errno(ret);

out_commit:
	ocfs2_commit_trans(osb, handle);
	brelse(gd_bh);

out_unlock_tl_inode:
	inode_unlock(tl_inode);

	ocfs2_inode_unlock(gb_inode, 1);
out_unlock_gb_mutex:
	inode_unlock(gb_inode);
	brelse(gb_bh);
	iput(gb_inode);

out:
	if (context->meta_ac) {
		ocfs2_free_alloc_context(context->meta_ac);
		context->meta_ac = NULL;
	}

	if (ref_tree)
		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);

	return ret;
}

/*
 * Helper to calculate the defragging length in one run according to
 * the threshold.
 */
static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
					 u32 threshold, int *skip)
{
	if ((*alloc_size + *len_defraged) < threshold) {
		/*
		 * Proceed with defragmentation until we meet the
		 * threshold.
		 */
		*len_defraged += *alloc_size;
	} else if (*len_defraged == 0) {
		/*
		 * XXX: skip a large extent.
		 */
		*skip = 1;
	} else {
		/*
		 * Split this extent so it coalesces with the former
		 * pieces to reach the threshold.
		 *
		 * We're done here with one cycle of defragmentation of
		 * size 'threshold'; resetting 'len_defraged' forces a
		 * new cycle.
		 */
		*alloc_size = threshold - *len_defraged;
		*len_defraged = 0;
	}
}
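
/*
 * Example for ocfs2_calc_extent_defrag_len(): with a threshold of 8
 * clusters, extents of 3, 3 and 5 clusters defrag as 3 + 3 + 2: the
 * third extent is split so the pieces coalesce into exactly one
 * threshold-sized run, then len_defraged resets and a new cycle begins.
 */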

static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
				      struct ocfs2_move_extents_context *context)
{
	int ret = 0, flags, do_defrag, skip = 0;
	u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
	u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;

	struct inode *inode = context->inode;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_move_extents *range = context->range;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	if ((i_size_read(inode) == 0) || (range->me_len == 0))
		return 0;

	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
		return 0;

	context->refcount_loc = le64_to_cpu(di->i_refcount_loc);

	ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
	ocfs2_init_dealloc_ctxt(&context->dealloc);

	/*
	 * TO-DO XXX:
	 *
	 * - xattr extents.
	 */

	do_defrag = context->auto_defrag;

	/*
	 * Extent moving happens in units of clusters; for the sake of
	 * simplicity, we may ignore the two partial clusters that
	 * 'me_start' and 'me_start + me_len' fall within.
	 */
	move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
	len_to_move = (range->me_start + range->me_len) >>
						osb->s_clustersize_bits;
	if (len_to_move >= move_start)
		len_to_move -= move_start;
	else
		len_to_move = 0;

	if (do_defrag) {
		defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
		if (defrag_thresh <= 1)
			goto done;
	} else
		new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
							 range->me_goal);

	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
	     "thresh: %u\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     (unsigned long long)range->me_start,
	     (unsigned long long)range->me_len,
	     move_start, len_to_move, defrag_thresh);

	cpos = move_start;
	while (len_to_move) {
		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
					 &flags);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		if (alloc_size > len_to_move)
			alloc_size = len_to_move;

		/*
		 * XXX: how to deal with a hole:
		 *
		 * - skip the hole of course
		 * - force a new defragmentation
		 */
		if (!phys_cpos) {
			if (do_defrag)
				len_defraged = 0;

			goto next;
		}

		if (do_defrag) {
			ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
						     defrag_thresh, &skip);
			/*
			 * Skip large extents.
			 */
			if (skip) {
				skip = 0;
				goto next;
			}

			mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
			     "alloc_size: %u, len_defraged: %u\n",
			     cpos, phys_cpos, alloc_size, len_defraged);

			ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
						  &alloc_size, flags);
		} else {
			ret = ocfs2_move_extent(context, cpos, phys_cpos,
						&new_phys_cpos, alloc_size,
						flags);

			new_phys_cpos += alloc_size;
		}

		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		context->clusters_moved += alloc_size;
next:
		cpos += alloc_size;
		len_to_move -= alloc_size;
	}

done:
	range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;

out:
	range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
						      context->clusters_moved);
	range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
						       context->new_phys_cpos);

	ocfs2_schedule_truncate_log_flush(osb, 1);
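	/*
	 * Old clusters were queued on the truncate log / dealloc context
	 * during the moves above; kick off their actual release now.
	 */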
	ocfs2_run_deallocs(osb, &context->dealloc);

	return ret;
}

static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
{
	int status;
	handle_t *handle;
	struct inode *inode = context->inode;
	struct ocfs2_dinode *di;
	struct buffer_head *di_bh = NULL;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
		return -EROFS;

	inode_lock(inode);

	/*
	 * This prevents concurrent writes from other nodes.
	 */
	status = ocfs2_rw_lock(inode, 1);
	if (status) {
		mlog_errno(status);
		goto out;
	}

	status = ocfs2_inode_lock(inode, &di_bh, 1);
	if (status) {
		mlog_errno(status);
		goto out_rw_unlock;
	}

	/*
	 * Remember that ip_xattr_sem also needs to be held if necessary.
	 */
	down_write(&OCFS2_I(inode)->ip_alloc_sem);

	status = __ocfs2_move_extents_range(di_bh, context);

	up_write(&OCFS2_I(inode)->ip_alloc_sem);
	if (status) {
		mlog_errno(status);
		goto out_inode_unlock;
	}

	/*
	 * We update ctime for these changes.
	 */
	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto out_inode_unlock;
	}

	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status) {
		mlog_errno(status);
		goto out_commit;
	}

	di = (struct ocfs2_dinode *)di_bh->b_data;
	inode->i_ctime = CURRENT_TIME;
	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
	ocfs2_update_inode_fsync_trans(handle, inode, 0);

	ocfs2_journal_dirty(handle, di_bh);

out_commit:
	ocfs2_commit_trans(osb, handle);

out_inode_unlock:
	brelse(di_bh);
	ocfs2_inode_unlock(inode, 1);
out_rw_unlock:
	ocfs2_rw_unlock(inode, 1);
out:
	inode_unlock(inode);

	return status;
}

int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
{
	int status;

	struct inode *inode = file_inode(filp);
	struct ocfs2_move_extents range;
	struct ocfs2_move_extents_context *context;

	if (!argp)
		return -EINVAL;

	status = mnt_want_write_file(filp);
	if (status)
		return status;

	if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) {
		status = -EPERM;
		goto out_drop;
	}

	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
		status = -EPERM;
		goto out_drop;
	}

	context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
	if (!context) {
		status = -ENOMEM;
		mlog_errno(status);
		goto out_drop;
	}

	context->inode = inode;
	context->file = filp;

	if (copy_from_user(&range, argp, sizeof(range))) {
		status = -EFAULT;
		goto out_free;
	}

	if (range.me_start > i_size_read(inode)) {
		status = -EINVAL;
		goto out_free;
	}

	if (range.me_start + range.me_len > i_size_read(inode))
		range.me_len = i_size_read(inode) - range.me_start;

	context->range = &range;

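	/*
	 * Two modes of operation: automatic defragmentation, or an
	 * explicit move of the extents towards a user-supplied goal.
	 */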
	if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
		context->auto_defrag = 1;
		/*
		 * OK, the default threshold for defragmentation is 1M,
		 * since our maximum clustersize is also 1M. Any thoughts?
		 */
		if (!range.me_threshold)
			range.me_threshold = 1024 * 1024;

		if (range.me_threshold > i_size_read(inode))
			range.me_threshold = i_size_read(inode);

		if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
			context->partial = 1;
	} else {
		/*
		 * First, a best-effort attempt to validate and adjust
		 * the goal (a physical block address). This can't
		 * guarantee that the later operation will always
		 * succeed, since the global_bitmap may change a bit
		 * over time.
		 */
		status = ocfs2_validate_and_adjust_move_goal(inode, &range);
		if (status)
			goto out_copy;
	}

	status = ocfs2_move_extents(context);
	if (status)
		mlog_errno(status);
out_copy:
	/*
	 * The move/defragmentation may end up being partially completed;
	 * that's why we need to return the finished length and
	 * new_offset to userspace even if a failure happens somewhere.
	 */
	if (copy_to_user(argp, &range, sizeof(range)))
		status = -EFAULT;

out_free:
	kfree(context);
out_drop:
	mnt_drop_write_file(filp);

	return status;
}