/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * move_extents.c
 *
 * Copyright (C) 2011 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 */
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/mount.h>
#include <linux/swap.h>

#include <cluster/masklog.h>

#include "ocfs2.h"
#include "ocfs2_ioctl.h"

#include "alloc.h"
#include "localalloc.h"
#include "aops.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
#include "journal.h"
#include "suballoc.h"
#include "uptodate.h"
#include "super.h"
#include "dir.h"
#include "buffer_head_io.h"
#include "sysfile.h"
#include "refcounttree.h"
#include "move_extents.h"

struct ocfs2_move_extents_context {
	struct inode *inode;
	struct file *file;
	int auto_defrag;
	int partial;
	int credits;
	u32 new_phys_cpos;
	u32 clusters_moved;
	u64 refcount_loc;
	struct ocfs2_move_extents *range;
	struct ocfs2_extent_tree et;
	struct ocfs2_alloc_context *meta_ac;
	struct ocfs2_alloc_context *data_ac;
	struct ocfs2_cached_dealloc_ctxt dealloc;
};

static int __ocfs2_move_extent(handle_t *handle,
			       struct ocfs2_move_extents_context *context,
			       u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
			       int ext_flags)
{
	int ret = 0, index;
	struct inode *inode = context->inode;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_extent_rec *rec, replace_rec;
	struct ocfs2_path *path = NULL;
	struct ocfs2_extent_list *el;
	u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
	u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);

	ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos,
					       p_cpos, new_p_cpos, len);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	memset(&replace_rec, 0, sizeof(replace_rec));
	replace_rec.e_cpos = cpu_to_le32(cpos);
	replace_rec.e_leaf_clusters = cpu_to_le16(len);
	replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
								   new_p_cpos));

	path = ocfs2_new_path_from_et(&context->et);
	if (!path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	el = path_leaf_el(path);

	index = ocfs2_search_extent_list(el, cpos);
	if (index == -1) {
		ret = ocfs2_error(inode->i_sb,
				  "Inode %llu has an extent at cpos %u which can no longer be found\n",
				  (unsigned long long)ino, cpos);
		goto out;
	}

	rec = &el->l_recs[index];

	BUG_ON(ext_flags != rec->e_flags);
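
	/*
	 * Note: the replacement record below keeps the logical range
	 * (cpos/len) of the old extent but points at the newly claimed
	 * clusters; ocfs2_split_extent() then swaps it into the tree in
	 * place.
	 */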
	/*
	 * After moving/defragging to the new location, the extent is no
	 * longer refcounted.
	 */
	replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;

	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
				      context->et.et_root_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_split_extent(handle, &context->et, path, index,
				 &replace_rec, context->meta_ac,
				 &context->dealloc);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ocfs2_journal_dirty(handle, context->et.et_root_bh);

	context->new_phys_cpos = new_p_cpos;

	/*
	 * Do we need to append the old clusters to the truncate log?
	 */
	if (old_blkno) {
		if (ext_flags & OCFS2_EXT_REFCOUNTED)
			ret = ocfs2_decrease_refcount(inode, handle,
					ocfs2_blocks_to_clusters(osb->sb,
								 old_blkno),
					len, context->meta_ac,
					&context->dealloc, 1);
		else
			ret = ocfs2_truncate_log_append(osb, handle,
							old_blkno, len);
	}

	ocfs2_update_inode_fsync_trans(handle, inode, 0);
out:
	ocfs2_free_path(path);
	return ret;
}

/*
 * Lock the allocator and reserve the appropriate number of bits for
 * metadata blocks.
 */
static int ocfs2_lock_meta_allocator_move_extents(struct inode *inode,
					struct ocfs2_extent_tree *et,
					u32 clusters_to_move,
					u32 extents_to_split,
					struct ocfs2_alloc_context **meta_ac,
					int extra_blocks,
					int *credits)
{
	int ret, num_free_extents;
	unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	num_free_extents = ocfs2_num_free_extents(et);
	if (num_free_extents < 0) {
		ret = num_free_extents;
		mlog_errno(ret);
		goto out;
	}

	if (!num_free_extents ||
	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
		extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);

	ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	*credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el);

	mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
	     extra_blocks, clusters_to_move, *credits);
out:
	if (ret) {
		if (*meta_ac) {
			ocfs2_free_alloc_context(*meta_ac);
			*meta_ac = NULL;
		}
	}

	return ret;
}
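
/*
 * Worked example for the reservation above: moving N clusters can, in the
 * worst case, end up as N discontiguous extent records, and each split may
 * add two more.  So a defrag of 8 clusters with one split needs room for up
 * to 2 * 1 + 8 = 10 records, and the tree is extended when fewer are free.
 */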

/*
 * Use one journal handle to guarantee data consistency in case a crash
 * happens at any point.
 *
 * XXX: defrag may finish having moved only part of an extent, when not
 * enough contiguous clusters can be found in the allocator.
 */
static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
			       u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
{
	int ret, credits = 0, extra_blocks = 0, partial = context->partial;
	handle_t *handle;
	struct inode *inode = context->inode;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct inode *tl_inode = osb->osb_tl_inode;
	struct ocfs2_refcount_tree *ref_tree = NULL;
	u32 new_phys_cpos, new_len;
	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
	int need_free = 0;

	if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
		BUG_ON(!ocfs2_is_refcount_inode(inode));
		BUG_ON(!context->refcount_loc);

		ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
					       &ref_tree, NULL);
		if (ret) {
			mlog_errno(ret);
			return ret;
		}

		ret = ocfs2_prepare_refcount_change_for_del(inode,
							context->refcount_loc,
							phys_blkno,
							*len,
							&credits,
							&extra_blocks);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et,
						     *len, 1,
						     &context->meta_ac,
						     extra_blocks, &credits);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Should we be using the allocation reservation strategy here?
	 *
	 * if (context->data_ac)
	 *	context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
	 */

	inode_lock(tl_inode);

	if (ocfs2_truncate_log_needs_flush(osb)) {
		ret = __ocfs2_flush_truncate_log(osb);
		if (ret < 0) {
			mlog_errno(ret);
			goto out_unlock_mutex;
		}
	}

	/*
	 * Make sure ocfs2_reserve_clusters() is called after
	 * __ocfs2_flush_truncate_log(); otherwise we may deadlock on the
	 * global bitmap.
	 */
	ret = ocfs2_reserve_clusters(osb, *len, &context->data_ac);
	if (ret) {
		mlog_errno(ret);
		goto out_unlock_mutex;
	}

	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_unlock_mutex;
	}

	ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
				     &new_phys_cpos, &new_len);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/*
	 * Allowing partial extent moving is a trade-off: it makes the
	 * defragmentation as a whole less likely to fail, but on the other
	 * hand it may leave the fs even more fragmented after moving.  Let
	 * userspace make the call here.
	 */
	if (new_len != *len) {
		mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
		if (!partial) {
			context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
			ret = -ENOSPC;
			need_free = 1;
			goto out_commit;
		}
	}

	mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
	     phys_cpos, new_phys_cpos);

	ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
				  new_phys_cpos, ext_flags);
	if (ret)
		mlog_errno(ret);
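
	/*
	 * On a partial claim, report the shortened length back to the
	 * caller below, so it only advances by what was actually moved.
	 */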
	if (partial && (new_len != *len))
		*len = new_len;

	/*
	 * Here we should write the new pages out first if we are in
	 * writeback mode.
	 */
	ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
	if (ret)
		mlog_errno(ret);

out_commit:
	if (need_free && context->data_ac) {
		struct ocfs2_alloc_context *data_ac = context->data_ac;

		if (context->data_ac->ac_which == OCFS2_AC_USE_LOCAL)
			ocfs2_free_local_alloc_bits(osb, handle, data_ac,
					new_phys_cpos, new_len);
		else
			ocfs2_free_clusters(handle,
					data_ac->ac_inode,
					data_ac->ac_bh,
					ocfs2_clusters_to_blocks(osb->sb, new_phys_cpos),
					new_len);
	}

	ocfs2_commit_trans(osb, handle);

out_unlock_mutex:
	inode_unlock(tl_inode);

	if (context->data_ac) {
		ocfs2_free_alloc_context(context->data_ac);
		context->data_ac = NULL;
	}

	if (context->meta_ac) {
		ocfs2_free_alloc_context(context->meta_ac);
		context->meta_ac = NULL;
	}

out:
	if (ref_tree)
		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);

	return ret;
}

/*
 * Find the victim alloc group, where 'vict_blkno' fits.
 */
static int ocfs2_find_victim_alloc_group(struct inode *inode,
					 u64 vict_blkno,
					 int type, int slot,
					 int *vict_bit,
					 struct buffer_head **ret_bh)
{
	int ret, i, bits_per_unit = 0;
	u64 blkno;
	char namebuf[40];

	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
	struct ocfs2_chain_list *cl;
	struct ocfs2_chain_rec *rec;
	struct ocfs2_dinode *ac_dinode;
	struct ocfs2_group_desc *bg;

	ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
	ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
					 strlen(namebuf), &blkno);
	if (ret) {
		ret = -ENOENT;
		goto out;
	}

	ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
	cl = &(ac_dinode->id2.i_chain);
	rec = &(cl->cl_recs[0]);

	if (type == GLOBAL_BITMAP_SYSTEM_INODE)
		bits_per_unit = osb->s_clustersize_bits -
					inode->i_sb->s_blocksize_bits;
	/*
	 * Reject a 'vict_blkno' that is out of the valid range.
	 */
	if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
	    (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
				bits_per_unit))) {
		ret = -EINVAL;
		goto out;
	}

	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {

		rec = &(cl->cl_recs[i]);
		if (!rec)
			continue;

		bg = NULL;

		do {
			if (!bg)
				blkno = le64_to_cpu(rec->c_blkno);
			else
				blkno = le64_to_cpu(bg->bg_next_group);

			if (gd_bh) {
				brelse(gd_bh);
				gd_bh = NULL;
			}

			ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			bg = (struct ocfs2_group_desc *)gd_bh->b_data;

			if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
						le16_to_cpu(bg->bg_bits))) {

				*ret_bh = gd_bh;
				*vict_bit = (vict_blkno - blkno) >>
							bits_per_unit;
				mlog(0, "find the victim group: #%llu, "
					"total_bits: %u, vict_bit: %u\n",
					blkno, le16_to_cpu(bg->bg_bits),
					*vict_bit);
				goto out;
			}

		} while (le64_to_cpu(bg->bg_next_group));
	}

	ret = -EINVAL;
out:
	brelse(ac_bh);

	/*
	 * The caller has to release gd_bh properly.
	 */
	return ret;
}
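
/*
 * Note on the scaling above: each global-bitmap bit covers one cluster, so
 * block numbers are shifted down by (s_clustersize_bits - s_blocksize_bits);
 * e.g. 4K blocks with 1M clusters give a shift of 8, i.e. 256 blocks per bit.
 */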

/*
 * XXX: helper to validate and adjust the moving goal.
 */
static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
					       struct ocfs2_move_extents *range)
{
	int ret, goal_bit = 0;

	struct buffer_head *gd_bh = NULL;
	struct ocfs2_group_desc *bg;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	int c_to_b = 1 << (osb->s_clustersize_bits -
					inode->i_sb->s_blocksize_bits);

	/*
	 * Make the goal cluster aligned.
	 */
	range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb,
						      range->me_goal);
	/*
	 * Validate that the goal sits within the global bitmap, and return
	 * the victim group descriptor.
	 */
	ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
					    GLOBAL_BITMAP_SYSTEM_INODE,
					    OCFS2_INVALID_SLOT,
					    &goal_bit, &gd_bh);
	if (ret)
		goto out;

	bg = (struct ocfs2_group_desc *)gd_bh->b_data;

	/*
	 * The moving goal is not allowed to start at a group descriptor
	 * block (block #0 of the group); compromise by advancing to the
	 * next cluster.
	 */
	if (range->me_goal == le64_to_cpu(bg->bg_blkno))
		range->me_goal += c_to_b;

	/*
	 * The movement must not cross group boundaries.
	 */
	if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
								range->me_len) {
		ret = -EINVAL;
		goto out;
	}
	/*
	 * More exact validations/adjustments will be performed later,
	 * during the moving operation for each extent range.
	 */
	mlog(0, "extents get ready to be moved to #%llu block\n",
	     range->me_goal);

out:
	brelse(gd_bh);

	return ret;
}
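
/*
 * Scan the victim group's bitmap from *goal_bit, looking for 'move_len'
 * consecutive free bits.  The search zeroes *phys_cpos and gives up once it
 * has wandered more than 'max_hop' bits past the original goal.
 */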
static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
				    int *goal_bit, u32 move_len, u32 max_hop,
				    u32 *phys_cpos)
{
	int i, used, last_free_bits = 0, base_bit = *goal_bit;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
	u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
						 le64_to_cpu(gd->bg_blkno));

	for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {

		used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
		if (used) {
			/*
			 * We even tried searching for the free chunk by
			 * hopping up to a 'max_hop' distance, but still
			 * failed.
			 */
			if ((i - base_bit) > max_hop) {
				*phys_cpos = 0;
				break;
			}

			if (last_free_bits)
				last_free_bits = 0;

			continue;
		} else
			last_free_bits++;

		if (last_free_bits == move_len) {
			*goal_bit = i;
			*phys_cpos = base_cpos + i;
			break;
		}
	}

	mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
}

static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
			     u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
			     u32 len, int ext_flags)
{
	int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
	handle_t *handle;
	struct inode *inode = context->inode;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct inode *tl_inode = osb->osb_tl_inode;
	struct inode *gb_inode = NULL;
	struct buffer_head *gb_bh = NULL;
	struct buffer_head *gd_bh = NULL;
	struct ocfs2_group_desc *gd;
	struct ocfs2_refcount_tree *ref_tree = NULL;
	u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
						    context->range->me_threshold);
	u64 phys_blkno, new_phys_blkno;

	phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);

	if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
		BUG_ON(!ocfs2_is_refcount_inode(inode));
		BUG_ON(!context->refcount_loc);

		ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
					       &ref_tree, NULL);
		if (ret) {
			mlog_errno(ret);
			return ret;
		}

		ret = ocfs2_prepare_refcount_change_for_del(inode,
							context->refcount_loc,
							phys_blkno,
							len,
							&credits,
							&extra_blocks);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et,
						     len, 1,
						     &context->meta_ac,
						     extra_blocks, &credits);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * We need to count 2 extra credits for the global bitmap inode and
	 * its group descriptor.
	 */
	credits += OCFS2_INODE_UPDATE_CREDITS + 1;

	/*
	 * ocfs2_move_extent() did not reserve any clusters in the allocator
	 * locking above, but we still need to lock the global bitmap.
	 */
	gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
					       OCFS2_INVALID_SLOT);
	if (!gb_inode) {
		mlog(ML_ERROR, "unable to get global_bitmap inode\n");
		ret = -EIO;
		goto out;
	}

	inode_lock(gb_inode);

	ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
	if (ret) {
		mlog_errno(ret);
		goto out_unlock_gb_mutex;
	}

	inode_lock(tl_inode);

	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_unlock_tl_inode;
	}
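
	/*
	 * Re-resolve the victim group here, inside the transaction and with
	 * the global bitmap locked: the bitmap may have changed since the
	 * goal was validated at ioctl time.
	 */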
	new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
	ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
					    GLOBAL_BITMAP_SYSTEM_INODE,
					    OCFS2_INVALID_SLOT,
					    &goal_bit, &gd_bh);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/*
	 * Probe the victim cluster group to find a proper region to fit the
	 * wanted movement; it will even perform a best-effort attempt by
	 * compromising to a threshold around the goal.
	 */
	ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
				new_phys_cpos);
	if (!*new_phys_cpos) {
		ret = -ENOSPC;
		goto out_commit;
	}

	ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
				  *new_phys_cpos, ext_flags);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	gd = (struct ocfs2_group_desc *)gd_bh->b_data;
	ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
					       le16_to_cpu(gd->bg_chain));
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
					 goal_bit, len);
	if (ret) {
		ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len,
					       le16_to_cpu(gd->bg_chain));
		mlog_errno(ret);
	}

	/*
	 * Here we should write the new pages out first if we are in
	 * writeback mode.
	 */
	ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
	if (ret)
		mlog_errno(ret);

out_commit:
	ocfs2_commit_trans(osb, handle);
	brelse(gd_bh);

out_unlock_tl_inode:
	inode_unlock(tl_inode);

	ocfs2_inode_unlock(gb_inode, 1);
out_unlock_gb_mutex:
	inode_unlock(gb_inode);
	brelse(gb_bh);
	iput(gb_inode);

out:
	if (context->meta_ac) {
		ocfs2_free_alloc_context(context->meta_ac);
		context->meta_ac = NULL;
	}

	if (ref_tree)
		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);

	return ret;
}

/*
 * Helper to calculate the defragging length in one run according to the
 * threshold.
 */
static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
					 u32 threshold, int *skip)
{
	if ((*alloc_size + *len_defraged) < threshold) {
		/*
		 * Proceed with defragmentation until we meet the threshold.
		 */
		*len_defraged += *alloc_size;
	} else if (*len_defraged == 0) {
		/*
		 * XXX: skip a large extent.
		 */
		*skip = 1;
	} else {
		/*
		 * Split this extent to coalesce with former pieces so as
		 * to reach the threshold.
		 *
		 * We're done here with one cycle of defragmentation in a
		 * size of 'threshold'; resetting 'len_defraged' forces a
		 * new defragmentation.
		 */
		*alloc_size = threshold - *len_defraged;
		*len_defraged = 0;
	}
}

static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
				      struct ocfs2_move_extents_context *context)
{
	int ret = 0, flags, do_defrag, skip = 0;
	u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
	u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;

	struct inode *inode = context->inode;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_move_extents *range = context->range;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	if ((i_size_read(inode) == 0) || (range->me_len == 0))
		return 0;

	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
		return 0;

	context->refcount_loc = le64_to_cpu(di->i_refcount_loc);

	ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
	ocfs2_init_dealloc_ctxt(&context->dealloc);

	/*
	 * TO-DO XXX:
	 *
	 * - xattr extents.
	 */

	do_defrag = context->auto_defrag;

	/*
	 * Extent moving happens in units of clusters; for simplicity we
	 * skip the partial clusters that 'me_start' and 'me_start + me_len'
	 * fall within.
	 */
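	/*
	 * (Worked example, assuming 1M clusters: me_start = 0.5M with
	 * me_len = 2.25M rounds inward to cluster 1 only; the partial head
	 * [0.5M, 1M) and tail [2M, 2.75M) are left untouched.)
	 */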
	move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
	len_to_move = (range->me_start + range->me_len) >>
						osb->s_clustersize_bits;
	if (len_to_move >= move_start)
		len_to_move -= move_start;
	else
		len_to_move = 0;

	if (do_defrag) {
		defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
		if (defrag_thresh <= 1)
			goto done;
	} else
		new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
							 range->me_goal);

	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
	     "thresh: %u\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     (unsigned long long)range->me_start,
	     (unsigned long long)range->me_len,
	     move_start, len_to_move, defrag_thresh);

	cpos = move_start;
	while (len_to_move) {
		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
					 &flags);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		if (alloc_size > len_to_move)
			alloc_size = len_to_move;

		/*
		 * XXX: how to deal with a hole:
		 *
		 * - skip the hole of course
		 * - force a new defragmentation
		 */
		if (!phys_cpos) {
			if (do_defrag)
				len_defraged = 0;

			goto next;
		}

		if (do_defrag) {
			ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
						     defrag_thresh, &skip);
			/*
			 * Skip large extents.
			 */
			if (skip) {
				skip = 0;
				goto next;
			}

			mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
			     "alloc_size: %u, len_defraged: %u\n",
			     cpos, phys_cpos, alloc_size, len_defraged);

			ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
						  &alloc_size, flags);
		} else {
			ret = ocfs2_move_extent(context, cpos, phys_cpos,
						&new_phys_cpos, alloc_size,
						flags);

			new_phys_cpos += alloc_size;
		}

		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		context->clusters_moved += alloc_size;
next:
		cpos += alloc_size;
		len_to_move -= alloc_size;
	}

done:
	range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;

out:
	range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
						      context->clusters_moved);
	range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
						       context->new_phys_cpos);

	ocfs2_schedule_truncate_log_flush(osb, 1);
	ocfs2_run_deallocs(osb, &context->dealloc);

	return ret;
}

static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
{
	int status;
	handle_t *handle;
	struct inode *inode = context->inode;
	struct ocfs2_dinode *di;
	struct buffer_head *di_bh = NULL;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
		return -EROFS;

	inode_lock(inode);

	/*
	 * This prevents concurrent writes from other nodes.
	 */
	status = ocfs2_rw_lock(inode, 1);
	if (status) {
		mlog_errno(status);
		goto out;
	}

	status = ocfs2_inode_lock(inode, &di_bh, 1);
	if (status) {
		mlog_errno(status);
		goto out_rw_unlock;
	}

	/*
	 * Remember that ip_xattr_sem also needs to be held if necessary.
	 */
	down_write(&OCFS2_I(inode)->ip_alloc_sem);

	status = __ocfs2_move_extents_range(di_bh, context);

	up_write(&OCFS2_I(inode)->ip_alloc_sem);
	if (status) {
		mlog_errno(status);
		goto out_inode_unlock;
	}
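
	/*
	 * Note: OCFS2_INODE_UPDATE_CREDITS suffices below because only the
	 * inode block is dirtied here; each extent move above ran under its
	 * own, larger transaction.
	 */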
	/*
	 * We update ctime for these changes.
	 */
	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto out_inode_unlock;
	}

	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status) {
		mlog_errno(status);
		goto out_commit;
	}

	di = (struct ocfs2_dinode *)di_bh->b_data;
	inode->i_ctime = current_time(inode);
	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
	ocfs2_update_inode_fsync_trans(handle, inode, 0);

	ocfs2_journal_dirty(handle, di_bh);

out_commit:
	ocfs2_commit_trans(osb, handle);

out_inode_unlock:
	brelse(di_bh);
	ocfs2_inode_unlock(inode, 1);
out_rw_unlock:
	ocfs2_rw_unlock(inode, 1);
out:
	inode_unlock(inode);

	return status;
}

int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
{
	int status;

	struct inode *inode = file_inode(filp);
	struct ocfs2_move_extents range;
	struct ocfs2_move_extents_context *context;

	if (!argp)
		return -EINVAL;

	status = mnt_want_write_file(filp);
	if (status)
		return status;

	if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) {
		status = -EPERM;
		goto out_drop;
	}

	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
		status = -EPERM;
		goto out_drop;
	}

	context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
	if (!context) {
		status = -ENOMEM;
		mlog_errno(status);
		goto out_drop;
	}

	context->inode = inode;
	context->file = filp;

	if (copy_from_user(&range, argp, sizeof(range))) {
		status = -EFAULT;
		goto out_free;
	}

	if (range.me_start > i_size_read(inode)) {
		status = -EINVAL;
		goto out_free;
	}

	if (range.me_start + range.me_len > i_size_read(inode))
		range.me_len = i_size_read(inode) - range.me_start;

	context->range = &range;

	if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
		context->auto_defrag = 1;
		/*
		 * The default threshold for defragmentation is 1M, since
		 * our maximum cluster size is 1M as well.  Any thoughts?
		 */
		if (!range.me_threshold)
			range.me_threshold = 1024 * 1024;

		if (range.me_threshold > i_size_read(inode))
			range.me_threshold = i_size_read(inode);

		if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
			context->partial = 1;
	} else {
		/*
		 * A first, best-effort attempt to validate and adjust the
		 * goal (physical address in blocks).  It cannot guarantee
		 * that the later operation will always succeed, since the
		 * global bitmap may change a bit over time.
		 */
		status = ocfs2_validate_and_adjust_move_goal(inode, &range);
		if (status)
			goto out_copy;
	}

	status = ocfs2_move_extents(context);
	if (status)
		mlog_errno(status);
out_copy:
	/*
	 * The movement/defragmentation may end up only partially completed;
	 * that's why we need to return the finished length and new_offset
	 * to userspace even if a failure happens somewhere.
	 */
	if (copy_to_user(argp, &range, sizeof(range)))
		status = -EFAULT;

out_free:
	kfree(context);
out_drop:
	mnt_drop_write_file(filp);

	return status;
}
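
/*
 * Illustrative sketch (not part of the driver): how userspace might invoke
 * this ioctl.  OCFS2_IOC_MOVE_EXT and struct ocfs2_move_extents come from
 * ocfs2_ioctl.h; the file path below is hypothetical.  me_start/me_len
 * select the byte range, OCFS2_MOVE_EXT_FL_AUTO_DEFRAG asks the kernel to
 * pick the targets itself, and OCFS2_MOVE_EXT_FL_PART_DEFRAG tolerates
 * partial moves (see ocfs2_defrag_extent() above).
 *
 *	struct ocfs2_move_extents me = {
 *		.me_start = 0,
 *		.me_len = 16 * 1024 * 1024,
 *		.me_flags = OCFS2_MOVE_EXT_FL_AUTO_DEFRAG |
 *			    OCFS2_MOVE_EXT_FL_PART_DEFRAG,
 *	};
 *	int fd = open("/mnt/ocfs2/somefile", O_RDWR);
 *
 *	if (fd >= 0 && ioctl(fd, OCFS2_IOC_MOVE_EXT, &me) == 0 &&
 *	    (me.me_flags & OCFS2_MOVE_EXT_FL_COMPLETE))
 *		printf("defragmented %llu bytes\n",
 *		       (unsigned long long)me.me_moved_len);
 */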