1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * move_extents.c 5 * 6 * Copyright (C) 2011 Oracle. All rights reserved. 7 * 8 * This program is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU General Public 10 * License version 2 as published by the Free Software Foundation. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * General Public License for more details. 16 */ 17 #include <linux/fs.h> 18 #include <linux/types.h> 19 #include <linux/mount.h> 20 #include <linux/swap.h> 21 22 #include <cluster/masklog.h> 23 24 #include "ocfs2.h" 25 #include "ocfs2_ioctl.h" 26 27 #include "alloc.h" 28 #include "aops.h" 29 #include "dlmglue.h" 30 #include "extent_map.h" 31 #include "inode.h" 32 #include "journal.h" 33 #include "suballoc.h" 34 #include "uptodate.h" 35 #include "super.h" 36 #include "dir.h" 37 #include "buffer_head_io.h" 38 #include "sysfile.h" 39 #include "refcounttree.h" 40 #include "move_extents.h" 41 42 struct ocfs2_move_extents_context { 43 struct inode *inode; 44 struct file *file; 45 int auto_defrag; 46 int partial; 47 int credits; 48 u32 new_phys_cpos; 49 u32 clusters_moved; 50 u64 refcount_loc; 51 struct ocfs2_move_extents *range; 52 struct ocfs2_extent_tree et; 53 struct ocfs2_alloc_context *meta_ac; 54 struct ocfs2_alloc_context *data_ac; 55 struct ocfs2_cached_dealloc_ctxt dealloc; 56 }; 57 58 static int __ocfs2_move_extent(handle_t *handle, 59 struct ocfs2_move_extents_context *context, 60 u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos, 61 int ext_flags) 62 { 63 int ret = 0, index; 64 struct inode *inode = context->inode; 65 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 66 struct ocfs2_extent_rec *rec, replace_rec; 67 struct ocfs2_path *path = NULL; 68 struct ocfs2_extent_list *el; 69 u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci); 70 u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos); 71 72 ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos, 73 p_cpos, new_p_cpos, len); 74 if (ret) { 75 mlog_errno(ret); 76 goto out; 77 } 78 79 memset(&replace_rec, 0, sizeof(replace_rec)); 80 replace_rec.e_cpos = cpu_to_le32(cpos); 81 replace_rec.e_leaf_clusters = cpu_to_le16(len); 82 replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, 83 new_p_cpos)); 84 85 path = ocfs2_new_path_from_et(&context->et); 86 if (!path) { 87 ret = -ENOMEM; 88 mlog_errno(ret); 89 goto out; 90 } 91 92 ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos); 93 if (ret) { 94 mlog_errno(ret); 95 goto out; 96 } 97 98 el = path_leaf_el(path); 99 100 index = ocfs2_search_extent_list(el, cpos); 101 if (index == -1) { 102 ocfs2_error(inode->i_sb, 103 "Inode %llu has an extent at cpos %u which can no " 104 "longer be found.\n", 105 (unsigned long long)ino, cpos); 106 ret = -EROFS; 107 goto out; 108 } 109 110 rec = &el->l_recs[index]; 111 112 BUG_ON(ext_flags != rec->e_flags); 113 /* 114 * after moving/defraging to new location, the extent is not going 115 * to be refcounted anymore. 116 */ 117 replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED; 118 119 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), 120 context->et.et_root_bh, 121 OCFS2_JOURNAL_ACCESS_WRITE); 122 if (ret) { 123 mlog_errno(ret); 124 goto out; 125 } 126 127 ret = ocfs2_split_extent(handle, &context->et, path, index, 128 &replace_rec, context->meta_ac, 129 &context->dealloc); 130 if (ret) { 131 mlog_errno(ret); 132 goto out; 133 } 134 135 ocfs2_journal_dirty(handle, context->et.et_root_bh); 136 137 context->new_phys_cpos = new_p_cpos; 138 139 /* 140 * need I to append truncate log for old clusters? 141 */ 142 if (old_blkno) { 143 if (ext_flags & OCFS2_EXT_REFCOUNTED) 144 ret = ocfs2_decrease_refcount(inode, handle, 145 ocfs2_blocks_to_clusters(osb->sb, 146 old_blkno), 147 len, context->meta_ac, 148 &context->dealloc, 1); 149 else 150 ret = ocfs2_truncate_log_append(osb, handle, 151 old_blkno, len); 152 } 153 154 ocfs2_update_inode_fsync_trans(handle, inode, 0); 155 out: 156 ocfs2_free_path(path); 157 return ret; 158 } 159 160 /* 161 * lock allocators, and reserving appropriate number of bits for 162 * meta blocks and data clusters. 163 * 164 * in some cases, we don't need to reserve clusters, just let data_ac 165 * be NULL. 166 */ 167 static int ocfs2_lock_allocators_move_extents(struct inode *inode, 168 struct ocfs2_extent_tree *et, 169 u32 clusters_to_move, 170 u32 extents_to_split, 171 struct ocfs2_alloc_context **meta_ac, 172 struct ocfs2_alloc_context **data_ac, 173 int extra_blocks, 174 int *credits) 175 { 176 int ret, num_free_extents; 177 unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; 178 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 179 180 num_free_extents = ocfs2_num_free_extents(osb, et); 181 if (num_free_extents < 0) { 182 ret = num_free_extents; 183 mlog_errno(ret); 184 goto out; 185 } 186 187 if (!num_free_extents || 188 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) 189 extra_blocks += ocfs2_extend_meta_needed(et->et_root_el); 190 191 ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac); 192 if (ret) { 193 mlog_errno(ret); 194 goto out; 195 } 196 197 if (data_ac) { 198 ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac); 199 if (ret) { 200 mlog_errno(ret); 201 goto out; 202 } 203 } 204 205 *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el); 206 207 mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n", 208 extra_blocks, clusters_to_move, *credits); 209 out: 210 if (ret) { 211 if (*meta_ac) { 212 ocfs2_free_alloc_context(*meta_ac); 213 *meta_ac = NULL; 214 } 215 } 216 217 return ret; 218 } 219 220 /* 221 * Using one journal handle to guarantee the data consistency in case 222 * crash happens anywhere. 223 * 224 * XXX: defrag can end up with finishing partial extent as requested, 225 * due to not enough contiguous clusters can be found in allocator. 226 */ 227 static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, 228 u32 cpos, u32 phys_cpos, u32 *len, int ext_flags) 229 { 230 int ret, credits = 0, extra_blocks = 0, partial = context->partial; 231 handle_t *handle; 232 struct inode *inode = context->inode; 233 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 234 struct inode *tl_inode = osb->osb_tl_inode; 235 struct ocfs2_refcount_tree *ref_tree = NULL; 236 u32 new_phys_cpos, new_len; 237 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 238 239 if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { 240 241 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & 242 OCFS2_HAS_REFCOUNT_FL)); 243 244 BUG_ON(!context->refcount_loc); 245 246 ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, 247 &ref_tree, NULL); 248 if (ret) { 249 mlog_errno(ret); 250 return ret; 251 } 252 253 ret = ocfs2_prepare_refcount_change_for_del(inode, 254 context->refcount_loc, 255 phys_blkno, 256 *len, 257 &credits, 258 &extra_blocks); 259 if (ret) { 260 mlog_errno(ret); 261 goto out; 262 } 263 } 264 265 ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1, 266 &context->meta_ac, 267 &context->data_ac, 268 extra_blocks, &credits); 269 if (ret) { 270 mlog_errno(ret); 271 goto out; 272 } 273 274 /* 275 * should be using allocation reservation strategy there? 276 * 277 * if (context->data_ac) 278 * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; 279 */ 280 281 mutex_lock(&tl_inode->i_mutex); 282 283 if (ocfs2_truncate_log_needs_flush(osb)) { 284 ret = __ocfs2_flush_truncate_log(osb); 285 if (ret < 0) { 286 mlog_errno(ret); 287 goto out_unlock_mutex; 288 } 289 } 290 291 handle = ocfs2_start_trans(osb, credits); 292 if (IS_ERR(handle)) { 293 ret = PTR_ERR(handle); 294 mlog_errno(ret); 295 goto out_unlock_mutex; 296 } 297 298 ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len, 299 &new_phys_cpos, &new_len); 300 if (ret) { 301 mlog_errno(ret); 302 goto out_commit; 303 } 304 305 /* 306 * allowing partial extent moving is kind of 'pros and cons', it makes 307 * whole defragmentation less likely to fail, on the contrary, the bad 308 * thing is it may make the fs even more fragmented after moving, let 309 * userspace make a good decision here. 310 */ 311 if (new_len != *len) { 312 mlog(0, "len_claimed: %u, len: %u\n", new_len, *len); 313 if (!partial) { 314 context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE; 315 ret = -ENOSPC; 316 goto out_commit; 317 } 318 } 319 320 mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos, 321 phys_cpos, new_phys_cpos); 322 323 ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos, 324 new_phys_cpos, ext_flags); 325 if (ret) 326 mlog_errno(ret); 327 328 if (partial && (new_len != *len)) 329 *len = new_len; 330 331 /* 332 * Here we should write the new page out first if we are 333 * in write-back mode. 334 */ 335 ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len); 336 if (ret) 337 mlog_errno(ret); 338 339 out_commit: 340 ocfs2_commit_trans(osb, handle); 341 342 out_unlock_mutex: 343 mutex_unlock(&tl_inode->i_mutex); 344 345 if (context->data_ac) { 346 ocfs2_free_alloc_context(context->data_ac); 347 context->data_ac = NULL; 348 } 349 350 if (context->meta_ac) { 351 ocfs2_free_alloc_context(context->meta_ac); 352 context->meta_ac = NULL; 353 } 354 355 out: 356 if (ref_tree) 357 ocfs2_unlock_refcount_tree(osb, ref_tree, 1); 358 359 return ret; 360 } 361 362 /* 363 * find the victim alloc group, where #blkno fits. 364 */ 365 static int ocfs2_find_victim_alloc_group(struct inode *inode, 366 u64 vict_blkno, 367 int type, int slot, 368 int *vict_bit, 369 struct buffer_head **ret_bh) 370 { 371 int ret, i, bits_per_unit = 0; 372 u64 blkno; 373 char namebuf[40]; 374 375 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 376 struct buffer_head *ac_bh = NULL, *gd_bh = NULL; 377 struct ocfs2_chain_list *cl; 378 struct ocfs2_chain_rec *rec; 379 struct ocfs2_dinode *ac_dinode; 380 struct ocfs2_group_desc *bg; 381 382 ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); 383 ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, 384 strlen(namebuf), &blkno); 385 if (ret) { 386 ret = -ENOENT; 387 goto out; 388 } 389 390 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh); 391 if (ret) { 392 mlog_errno(ret); 393 goto out; 394 } 395 396 ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data; 397 cl = &(ac_dinode->id2.i_chain); 398 rec = &(cl->cl_recs[0]); 399 400 if (type == GLOBAL_BITMAP_SYSTEM_INODE) 401 bits_per_unit = osb->s_clustersize_bits - 402 inode->i_sb->s_blocksize_bits; 403 /* 404 * 'vict_blkno' was out of the valid range. 405 */ 406 if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || 407 (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) << 408 bits_per_unit))) { 409 ret = -EINVAL; 410 goto out; 411 } 412 413 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { 414 415 rec = &(cl->cl_recs[i]); 416 if (!rec) 417 continue; 418 419 bg = NULL; 420 421 do { 422 if (!bg) 423 blkno = le64_to_cpu(rec->c_blkno); 424 else 425 blkno = le64_to_cpu(bg->bg_next_group); 426 427 if (gd_bh) { 428 brelse(gd_bh); 429 gd_bh = NULL; 430 } 431 432 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh); 433 if (ret) { 434 mlog_errno(ret); 435 goto out; 436 } 437 438 bg = (struct ocfs2_group_desc *)gd_bh->b_data; 439 440 if (vict_blkno < (le64_to_cpu(bg->bg_blkno) + 441 le16_to_cpu(bg->bg_bits))) { 442 443 *ret_bh = gd_bh; 444 *vict_bit = (vict_blkno - blkno) >> 445 bits_per_unit; 446 mlog(0, "find the victim group: #%llu, " 447 "total_bits: %u, vict_bit: %u\n", 448 blkno, le16_to_cpu(bg->bg_bits), 449 *vict_bit); 450 goto out; 451 } 452 453 } while (le64_to_cpu(bg->bg_next_group)); 454 } 455 456 ret = -EINVAL; 457 out: 458 brelse(ac_bh); 459 460 /* 461 * caller has to release the gd_bh properly. 462 */ 463 return ret; 464 } 465 466 /* 467 * XXX: helper to validate and adjust moving goal. 468 */ 469 static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, 470 struct ocfs2_move_extents *range) 471 { 472 int ret, goal_bit = 0; 473 474 struct buffer_head *gd_bh = NULL; 475 struct ocfs2_group_desc *bg; 476 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 477 int c_to_b = 1 << (osb->s_clustersize_bits - 478 inode->i_sb->s_blocksize_bits); 479 480 /* 481 * make goal become cluster aligned. 482 */ 483 range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb, 484 range->me_goal); 485 /* 486 * validate goal sits within global_bitmap, and return the victim 487 * group desc 488 */ 489 ret = ocfs2_find_victim_alloc_group(inode, range->me_goal, 490 GLOBAL_BITMAP_SYSTEM_INODE, 491 OCFS2_INVALID_SLOT, 492 &goal_bit, &gd_bh); 493 if (ret) 494 goto out; 495 496 bg = (struct ocfs2_group_desc *)gd_bh->b_data; 497 498 /* 499 * moving goal is not allowd to start with a group desc blok(#0 blk) 500 * let's compromise to the latter cluster. 501 */ 502 if (range->me_goal == le64_to_cpu(bg->bg_blkno)) 503 range->me_goal += c_to_b; 504 505 /* 506 * movement is not gonna cross two groups. 507 */ 508 if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize < 509 range->me_len) { 510 ret = -EINVAL; 511 goto out; 512 } 513 /* 514 * more exact validations/adjustments will be performed later during 515 * moving operation for each extent range. 516 */ 517 mlog(0, "extents get ready to be moved to #%llu block\n", 518 range->me_goal); 519 520 out: 521 brelse(gd_bh); 522 523 return ret; 524 } 525 526 static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, 527 int *goal_bit, u32 move_len, u32 max_hop, 528 u32 *phys_cpos) 529 { 530 int i, used, last_free_bits = 0, base_bit = *goal_bit; 531 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 532 u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb, 533 le64_to_cpu(gd->bg_blkno)); 534 535 for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) { 536 537 used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap); 538 if (used) { 539 /* 540 * we even tried searching the free chunk by jumping 541 * a 'max_hop' distance, but still failed. 542 */ 543 if ((i - base_bit) > max_hop) { 544 *phys_cpos = 0; 545 break; 546 } 547 548 if (last_free_bits) 549 last_free_bits = 0; 550 551 continue; 552 } else 553 last_free_bits++; 554 555 if (last_free_bits == move_len) { 556 *goal_bit = i; 557 *phys_cpos = base_cpos + i; 558 break; 559 } 560 } 561 562 mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); 563 } 564 565 static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, 566 u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, 567 u32 len, int ext_flags) 568 { 569 int ret, credits = 0, extra_blocks = 0, goal_bit = 0; 570 handle_t *handle; 571 struct inode *inode = context->inode; 572 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 573 struct inode *tl_inode = osb->osb_tl_inode; 574 struct inode *gb_inode = NULL; 575 struct buffer_head *gb_bh = NULL; 576 struct buffer_head *gd_bh = NULL; 577 struct ocfs2_group_desc *gd; 578 struct ocfs2_refcount_tree *ref_tree = NULL; 579 u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb, 580 context->range->me_threshold); 581 u64 phys_blkno, new_phys_blkno; 582 583 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 584 585 if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) { 586 587 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & 588 OCFS2_HAS_REFCOUNT_FL)); 589 590 BUG_ON(!context->refcount_loc); 591 592 ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, 593 &ref_tree, NULL); 594 if (ret) { 595 mlog_errno(ret); 596 return ret; 597 } 598 599 ret = ocfs2_prepare_refcount_change_for_del(inode, 600 context->refcount_loc, 601 phys_blkno, 602 len, 603 &credits, 604 &extra_blocks); 605 if (ret) { 606 mlog_errno(ret); 607 goto out; 608 } 609 } 610 611 ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1, 612 &context->meta_ac, 613 NULL, extra_blocks, &credits); 614 if (ret) { 615 mlog_errno(ret); 616 goto out; 617 } 618 619 /* 620 * need to count 2 extra credits for global_bitmap inode and 621 * group descriptor. 622 */ 623 credits += OCFS2_INODE_UPDATE_CREDITS + 1; 624 625 /* 626 * ocfs2_move_extent() didn't reserve any clusters in lock_allocators() 627 * logic, while we still need to lock the global_bitmap. 628 */ 629 gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, 630 OCFS2_INVALID_SLOT); 631 if (!gb_inode) { 632 mlog(ML_ERROR, "unable to get global_bitmap inode\n"); 633 ret = -EIO; 634 goto out; 635 } 636 637 mutex_lock(&gb_inode->i_mutex); 638 639 ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); 640 if (ret) { 641 mlog_errno(ret); 642 goto out_unlock_gb_mutex; 643 } 644 645 mutex_lock(&tl_inode->i_mutex); 646 647 handle = ocfs2_start_trans(osb, credits); 648 if (IS_ERR(handle)) { 649 ret = PTR_ERR(handle); 650 mlog_errno(ret); 651 goto out_unlock_tl_inode; 652 } 653 654 new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos); 655 ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno, 656 GLOBAL_BITMAP_SYSTEM_INODE, 657 OCFS2_INVALID_SLOT, 658 &goal_bit, &gd_bh); 659 if (ret) { 660 mlog_errno(ret); 661 goto out_commit; 662 } 663 664 /* 665 * probe the victim cluster group to find a proper 666 * region to fit wanted movement, it even will perfrom 667 * a best-effort attempt by compromising to a threshold 668 * around the goal. 669 */ 670 ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop, 671 new_phys_cpos); 672 if (!*new_phys_cpos) { 673 ret = -ENOSPC; 674 goto out_commit; 675 } 676 677 ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos, 678 *new_phys_cpos, ext_flags); 679 if (ret) { 680 mlog_errno(ret); 681 goto out_commit; 682 } 683 684 gd = (struct ocfs2_group_desc *)gd_bh->b_data; 685 ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len, 686 le16_to_cpu(gd->bg_chain)); 687 if (ret) { 688 mlog_errno(ret); 689 goto out_commit; 690 } 691 692 ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, 693 goal_bit, len); 694 if (ret) { 695 ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len, 696 le16_to_cpu(gd->bg_chain)); 697 mlog_errno(ret); 698 } 699 700 /* 701 * Here we should write the new page out first if we are 702 * in write-back mode. 703 */ 704 ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len); 705 if (ret) 706 mlog_errno(ret); 707 708 out_commit: 709 ocfs2_commit_trans(osb, handle); 710 brelse(gd_bh); 711 712 out_unlock_tl_inode: 713 mutex_unlock(&tl_inode->i_mutex); 714 715 ocfs2_inode_unlock(gb_inode, 1); 716 out_unlock_gb_mutex: 717 mutex_unlock(&gb_inode->i_mutex); 718 brelse(gb_bh); 719 iput(gb_inode); 720 721 out: 722 if (context->meta_ac) { 723 ocfs2_free_alloc_context(context->meta_ac); 724 context->meta_ac = NULL; 725 } 726 727 if (ref_tree) 728 ocfs2_unlock_refcount_tree(osb, ref_tree, 1); 729 730 return ret; 731 } 732 733 /* 734 * Helper to calculate the defraging length in one run according to threshold. 735 */ 736 static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged, 737 u32 threshold, int *skip) 738 { 739 if ((*alloc_size + *len_defraged) < threshold) { 740 /* 741 * proceed defragmentation until we meet the thresh 742 */ 743 *len_defraged += *alloc_size; 744 } else if (*len_defraged == 0) { 745 /* 746 * XXX: skip a large extent. 747 */ 748 *skip = 1; 749 } else { 750 /* 751 * split this extent to coalesce with former pieces as 752 * to reach the threshold. 753 * 754 * we're done here with one cycle of defragmentation 755 * in a size of 'thresh', resetting 'len_defraged' 756 * forces a new defragmentation. 757 */ 758 *alloc_size = threshold - *len_defraged; 759 *len_defraged = 0; 760 } 761 } 762 763 static int __ocfs2_move_extents_range(struct buffer_head *di_bh, 764 struct ocfs2_move_extents_context *context) 765 { 766 int ret = 0, flags, do_defrag, skip = 0; 767 u32 cpos, phys_cpos, move_start, len_to_move, alloc_size; 768 u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0; 769 770 struct inode *inode = context->inode; 771 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 772 struct ocfs2_move_extents *range = context->range; 773 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 774 775 if ((i_size_read(inode) == 0) || (range->me_len == 0)) 776 return 0; 777 778 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 779 return 0; 780 781 context->refcount_loc = le64_to_cpu(di->i_refcount_loc); 782 783 ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh); 784 ocfs2_init_dealloc_ctxt(&context->dealloc); 785 786 /* 787 * TO-DO XXX: 788 * 789 * - xattr extents. 790 */ 791 792 do_defrag = context->auto_defrag; 793 794 /* 795 * extents moving happens in unit of clusters, for the sake 796 * of simplicity, we may ignore two clusters where 'byte_start' 797 * and 'byte_start + len' were within. 798 */ 799 move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start); 800 len_to_move = (range->me_start + range->me_len) >> 801 osb->s_clustersize_bits; 802 if (len_to_move >= move_start) 803 len_to_move -= move_start; 804 else 805 len_to_move = 0; 806 807 if (do_defrag) { 808 defrag_thresh = range->me_threshold >> osb->s_clustersize_bits; 809 if (defrag_thresh <= 1) 810 goto done; 811 } else 812 new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, 813 range->me_goal); 814 815 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, " 816 "thresh: %u\n", 817 (unsigned long long)OCFS2_I(inode)->ip_blkno, 818 (unsigned long long)range->me_start, 819 (unsigned long long)range->me_len, 820 move_start, len_to_move, defrag_thresh); 821 822 cpos = move_start; 823 while (len_to_move) { 824 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size, 825 &flags); 826 if (ret) { 827 mlog_errno(ret); 828 goto out; 829 } 830 831 if (alloc_size > len_to_move) 832 alloc_size = len_to_move; 833 834 /* 835 * XXX: how to deal with a hole: 836 * 837 * - skip the hole of course 838 * - force a new defragmentation 839 */ 840 if (!phys_cpos) { 841 if (do_defrag) 842 len_defraged = 0; 843 844 goto next; 845 } 846 847 if (do_defrag) { 848 ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged, 849 defrag_thresh, &skip); 850 /* 851 * skip large extents 852 */ 853 if (skip) { 854 skip = 0; 855 goto next; 856 } 857 858 mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, " 859 "alloc_size: %u, len_defraged: %u\n", 860 cpos, phys_cpos, alloc_size, len_defraged); 861 862 ret = ocfs2_defrag_extent(context, cpos, phys_cpos, 863 &alloc_size, flags); 864 } else { 865 ret = ocfs2_move_extent(context, cpos, phys_cpos, 866 &new_phys_cpos, alloc_size, 867 flags); 868 869 new_phys_cpos += alloc_size; 870 } 871 872 if (ret < 0) { 873 mlog_errno(ret); 874 goto out; 875 } 876 877 context->clusters_moved += alloc_size; 878 next: 879 cpos += alloc_size; 880 len_to_move -= alloc_size; 881 } 882 883 done: 884 range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE; 885 886 out: 887 range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb, 888 context->clusters_moved); 889 range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb, 890 context->new_phys_cpos); 891 892 ocfs2_schedule_truncate_log_flush(osb, 1); 893 ocfs2_run_deallocs(osb, &context->dealloc); 894 895 return ret; 896 } 897 898 static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) 899 { 900 int status; 901 handle_t *handle; 902 struct inode *inode = context->inode; 903 struct ocfs2_dinode *di; 904 struct buffer_head *di_bh = NULL; 905 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 906 907 if (!inode) 908 return -ENOENT; 909 910 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 911 return -EROFS; 912 913 mutex_lock(&inode->i_mutex); 914 915 /* 916 * This prevents concurrent writes from other nodes 917 */ 918 status = ocfs2_rw_lock(inode, 1); 919 if (status) { 920 mlog_errno(status); 921 goto out; 922 } 923 924 status = ocfs2_inode_lock(inode, &di_bh, 1); 925 if (status) { 926 mlog_errno(status); 927 goto out_rw_unlock; 928 } 929 930 /* 931 * rememer ip_xattr_sem also needs to be held if necessary 932 */ 933 down_write(&OCFS2_I(inode)->ip_alloc_sem); 934 935 status = __ocfs2_move_extents_range(di_bh, context); 936 937 up_write(&OCFS2_I(inode)->ip_alloc_sem); 938 if (status) { 939 mlog_errno(status); 940 goto out_inode_unlock; 941 } 942 943 /* 944 * We update ctime for these changes 945 */ 946 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 947 if (IS_ERR(handle)) { 948 status = PTR_ERR(handle); 949 mlog_errno(status); 950 goto out_inode_unlock; 951 } 952 953 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 954 OCFS2_JOURNAL_ACCESS_WRITE); 955 if (status) { 956 mlog_errno(status); 957 goto out_commit; 958 } 959 960 di = (struct ocfs2_dinode *)di_bh->b_data; 961 inode->i_ctime = CURRENT_TIME; 962 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 963 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 964 ocfs2_update_inode_fsync_trans(handle, inode, 0); 965 966 ocfs2_journal_dirty(handle, di_bh); 967 968 out_commit: 969 ocfs2_commit_trans(osb, handle); 970 971 out_inode_unlock: 972 brelse(di_bh); 973 ocfs2_inode_unlock(inode, 1); 974 out_rw_unlock: 975 ocfs2_rw_unlock(inode, 1); 976 out: 977 mutex_unlock(&inode->i_mutex); 978 979 return status; 980 } 981 982 int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) 983 { 984 int status; 985 986 struct inode *inode = file_inode(filp); 987 struct ocfs2_move_extents range; 988 struct ocfs2_move_extents_context *context; 989 990 if (!argp) 991 return -EINVAL; 992 993 status = mnt_want_write_file(filp); 994 if (status) 995 return status; 996 997 if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) { 998 status = -EPERM; 999 goto out_drop; 1000 } 1001 1002 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { 1003 status = -EPERM; 1004 goto out_drop; 1005 } 1006 1007 context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS); 1008 if (!context) { 1009 status = -ENOMEM; 1010 mlog_errno(status); 1011 goto out_drop; 1012 } 1013 1014 context->inode = inode; 1015 context->file = filp; 1016 1017 if (copy_from_user(&range, argp, sizeof(range))) { 1018 status = -EFAULT; 1019 goto out_free; 1020 } 1021 1022 if (range.me_start > i_size_read(inode)) { 1023 status = -EINVAL; 1024 goto out_free; 1025 } 1026 1027 if (range.me_start + range.me_len > i_size_read(inode)) 1028 range.me_len = i_size_read(inode) - range.me_start; 1029 1030 context->range = ⦥ 1031 1032 if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) { 1033 context->auto_defrag = 1; 1034 /* 1035 * ok, the default theshold for the defragmentation 1036 * is 1M, since our maximum clustersize was 1M also. 1037 * any thought? 1038 */ 1039 if (!range.me_threshold) 1040 range.me_threshold = 1024 * 1024; 1041 1042 if (range.me_threshold > i_size_read(inode)) 1043 range.me_threshold = i_size_read(inode); 1044 1045 if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG) 1046 context->partial = 1; 1047 } else { 1048 /* 1049 * first best-effort attempt to validate and adjust the goal 1050 * (physical address in block), while it can't guarantee later 1051 * operation can succeed all the time since global_bitmap may 1052 * change a bit over time. 1053 */ 1054 1055 status = ocfs2_validate_and_adjust_move_goal(inode, &range); 1056 if (status) 1057 goto out_copy; 1058 } 1059 1060 status = ocfs2_move_extents(context); 1061 if (status) 1062 mlog_errno(status); 1063 out_copy: 1064 /* 1065 * movement/defragmentation may end up being partially completed, 1066 * that's the reason why we need to return userspace the finished 1067 * length and new_offset even if failure happens somewhere. 1068 */ 1069 if (copy_to_user(argp, &range, sizeof(range))) 1070 status = -EFAULT; 1071 1072 out_free: 1073 kfree(context); 1074 out_drop: 1075 mnt_drop_write_file(filp); 1076 1077 return status; 1078 } 1079