1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * move_extents.c 5 * 6 * Copyright (C) 2011 Oracle. All rights reserved. 7 * 8 * This program is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU General Public 10 * License version 2 as published by the Free Software Foundation. 11 * 12 * This program is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * General Public License for more details. 16 */ 17 #include <linux/fs.h> 18 #include <linux/types.h> 19 #include <linux/mount.h> 20 #include <linux/swap.h> 21 22 #include <cluster/masklog.h> 23 24 #include "ocfs2.h" 25 #include "ocfs2_ioctl.h" 26 27 #include "alloc.h" 28 #include "aops.h" 29 #include "dlmglue.h" 30 #include "extent_map.h" 31 #include "inode.h" 32 #include "journal.h" 33 #include "suballoc.h" 34 #include "uptodate.h" 35 #include "super.h" 36 #include "dir.h" 37 #include "buffer_head_io.h" 38 #include "sysfile.h" 39 #include "refcounttree.h" 40 #include "move_extents.h" 41 42 struct ocfs2_move_extents_context { 43 struct inode *inode; 44 struct file *file; 45 int auto_defrag; 46 int partial; 47 int credits; 48 u32 new_phys_cpos; 49 u32 clusters_moved; 50 u64 refcount_loc; 51 struct ocfs2_move_extents *range; 52 struct ocfs2_extent_tree et; 53 struct ocfs2_alloc_context *meta_ac; 54 struct ocfs2_alloc_context *data_ac; 55 struct ocfs2_cached_dealloc_ctxt dealloc; 56 }; 57 58 static int __ocfs2_move_extent(handle_t *handle, 59 struct ocfs2_move_extents_context *context, 60 u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos, 61 int ext_flags) 62 { 63 int ret = 0, index; 64 struct inode *inode = context->inode; 65 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 66 struct ocfs2_extent_rec *rec, replace_rec; 67 struct ocfs2_path *path = NULL; 68 struct ocfs2_extent_list *el; 69 u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci); 70 u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos); 71 72 ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos, 73 p_cpos, new_p_cpos, len); 74 if (ret) { 75 mlog_errno(ret); 76 goto out; 77 } 78 79 memset(&replace_rec, 0, sizeof(replace_rec)); 80 replace_rec.e_cpos = cpu_to_le32(cpos); 81 replace_rec.e_leaf_clusters = cpu_to_le16(len); 82 replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, 83 new_p_cpos)); 84 85 path = ocfs2_new_path_from_et(&context->et); 86 if (!path) { 87 ret = -ENOMEM; 88 mlog_errno(ret); 89 goto out; 90 } 91 92 ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos); 93 if (ret) { 94 mlog_errno(ret); 95 goto out; 96 } 97 98 el = path_leaf_el(path); 99 100 index = ocfs2_search_extent_list(el, cpos); 101 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 102 ocfs2_error(inode->i_sb, 103 "Inode %llu has an extent at cpos %u which can no " 104 "longer be found.\n", 105 (unsigned long long)ino, cpos); 106 ret = -EROFS; 107 goto out; 108 } 109 110 rec = &el->l_recs[index]; 111 112 BUG_ON(ext_flags != rec->e_flags); 113 /* 114 * after moving/defraging to new location, the extent is not going 115 * to be refcounted anymore. 116 */ 117 replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED; 118 119 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), 120 context->et.et_root_bh, 121 OCFS2_JOURNAL_ACCESS_WRITE); 122 if (ret) { 123 mlog_errno(ret); 124 goto out; 125 } 126 127 ret = ocfs2_split_extent(handle, &context->et, path, index, 128 &replace_rec, context->meta_ac, 129 &context->dealloc); 130 if (ret) { 131 mlog_errno(ret); 132 goto out; 133 } 134 135 ocfs2_journal_dirty(handle, context->et.et_root_bh); 136 137 context->new_phys_cpos = new_p_cpos; 138 139 /* 140 * need I to append truncate log for old clusters? 141 */ 142 if (old_blkno) { 143 if (ext_flags & OCFS2_EXT_REFCOUNTED) 144 ret = ocfs2_decrease_refcount(inode, handle, 145 ocfs2_blocks_to_clusters(osb->sb, 146 old_blkno), 147 len, context->meta_ac, 148 &context->dealloc, 1); 149 else 150 ret = ocfs2_truncate_log_append(osb, handle, 151 old_blkno, len); 152 } 153 154 out: 155 ocfs2_free_path(path); 156 return ret; 157 } 158 159 /* 160 * lock allocators, and reserving appropriate number of bits for 161 * meta blocks and data clusters. 162 * 163 * in some cases, we don't need to reserve clusters, just let data_ac 164 * be NULL. 165 */ 166 static int ocfs2_lock_allocators_move_extents(struct inode *inode, 167 struct ocfs2_extent_tree *et, 168 u32 clusters_to_move, 169 u32 extents_to_split, 170 struct ocfs2_alloc_context **meta_ac, 171 struct ocfs2_alloc_context **data_ac, 172 int extra_blocks, 173 int *credits) 174 { 175 int ret, num_free_extents; 176 unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; 177 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 178 179 num_free_extents = ocfs2_num_free_extents(osb, et); 180 if (num_free_extents < 0) { 181 ret = num_free_extents; 182 mlog_errno(ret); 183 goto out; 184 } 185 186 if (!num_free_extents || 187 (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) 188 extra_blocks += ocfs2_extend_meta_needed(et->et_root_el); 189 190 ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac); 191 if (ret) { 192 mlog_errno(ret); 193 goto out; 194 } 195 196 if (data_ac) { 197 ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac); 198 if (ret) { 199 mlog_errno(ret); 200 goto out; 201 } 202 } 203 204 *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el, 205 clusters_to_move + 2); 206 207 mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n", 208 extra_blocks, clusters_to_move, *credits); 209 out: 210 if (ret) { 211 if (*meta_ac) { 212 ocfs2_free_alloc_context(*meta_ac); 213 *meta_ac = NULL; 214 } 215 } 216 217 return ret; 218 } 219 220 /* 221 * Using one journal handle to guarantee the data consistency in case 222 * crash happens anywhere. 223 * 224 * XXX: defrag can end up with finishing partial extent as requested, 225 * due to not enough contiguous clusters can be found in allocator. 226 */ 227 static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, 228 u32 cpos, u32 phys_cpos, u32 *len, int ext_flags) 229 { 230 int ret, credits = 0, extra_blocks = 0, partial = context->partial; 231 handle_t *handle; 232 struct inode *inode = context->inode; 233 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 234 struct inode *tl_inode = osb->osb_tl_inode; 235 struct ocfs2_refcount_tree *ref_tree = NULL; 236 u32 new_phys_cpos, new_len; 237 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 238 239 if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { 240 241 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & 242 OCFS2_HAS_REFCOUNT_FL)); 243 244 BUG_ON(!context->refcount_loc); 245 246 ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, 247 &ref_tree, NULL); 248 if (ret) { 249 mlog_errno(ret); 250 return ret; 251 } 252 253 ret = ocfs2_prepare_refcount_change_for_del(inode, 254 context->refcount_loc, 255 phys_blkno, 256 *len, 257 &credits, 258 &extra_blocks); 259 if (ret) { 260 mlog_errno(ret); 261 goto out; 262 } 263 } 264 265 ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1, 266 &context->meta_ac, 267 &context->data_ac, 268 extra_blocks, &credits); 269 if (ret) { 270 mlog_errno(ret); 271 goto out; 272 } 273 274 /* 275 * should be using allocation reservation strategy there? 276 * 277 * if (context->data_ac) 278 * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; 279 */ 280 281 mutex_lock(&tl_inode->i_mutex); 282 283 if (ocfs2_truncate_log_needs_flush(osb)) { 284 ret = __ocfs2_flush_truncate_log(osb); 285 if (ret < 0) { 286 mlog_errno(ret); 287 goto out_unlock_mutex; 288 } 289 } 290 291 handle = ocfs2_start_trans(osb, credits); 292 if (IS_ERR(handle)) { 293 ret = PTR_ERR(handle); 294 mlog_errno(ret); 295 goto out_unlock_mutex; 296 } 297 298 ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len, 299 &new_phys_cpos, &new_len); 300 if (ret) { 301 mlog_errno(ret); 302 goto out_commit; 303 } 304 305 /* 306 * allowing partial extent moving is kind of 'pros and cons', it makes 307 * whole defragmentation less likely to fail, on the contrary, the bad 308 * thing is it may make the fs even more fragmented after moving, let 309 * userspace make a good decision here. 310 */ 311 if (new_len != *len) { 312 mlog(0, "len_claimed: %u, len: %u\n", new_len, *len); 313 if (!partial) { 314 context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE; 315 ret = -ENOSPC; 316 goto out_commit; 317 } 318 } 319 320 mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos, 321 phys_cpos, new_phys_cpos); 322 323 ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos, 324 new_phys_cpos, ext_flags); 325 if (ret) 326 mlog_errno(ret); 327 328 if (partial && (new_len != *len)) 329 *len = new_len; 330 331 /* 332 * Here we should write the new page out first if we are 333 * in write-back mode. 334 */ 335 ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len); 336 if (ret) 337 mlog_errno(ret); 338 339 out_commit: 340 ocfs2_commit_trans(osb, handle); 341 342 out_unlock_mutex: 343 mutex_unlock(&tl_inode->i_mutex); 344 345 if (context->data_ac) { 346 ocfs2_free_alloc_context(context->data_ac); 347 context->data_ac = NULL; 348 } 349 350 if (context->meta_ac) { 351 ocfs2_free_alloc_context(context->meta_ac); 352 context->meta_ac = NULL; 353 } 354 355 out: 356 if (ref_tree) 357 ocfs2_unlock_refcount_tree(osb, ref_tree, 1); 358 359 return ret; 360 } 361 362 /* 363 * find the victim alloc group, where #blkno fits. 364 */ 365 static int ocfs2_find_victim_alloc_group(struct inode *inode, 366 u64 vict_blkno, 367 int type, int slot, 368 int *vict_bit, 369 struct buffer_head **ret_bh) 370 { 371 int ret, i, bits_per_unit = 0; 372 u64 blkno; 373 char namebuf[40]; 374 375 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 376 struct buffer_head *ac_bh = NULL, *gd_bh = NULL; 377 struct ocfs2_chain_list *cl; 378 struct ocfs2_chain_rec *rec; 379 struct ocfs2_dinode *ac_dinode; 380 struct ocfs2_group_desc *bg; 381 382 ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); 383 ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, 384 strlen(namebuf), &blkno); 385 if (ret) { 386 ret = -ENOENT; 387 goto out; 388 } 389 390 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh); 391 if (ret) { 392 mlog_errno(ret); 393 goto out; 394 } 395 396 ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data; 397 cl = &(ac_dinode->id2.i_chain); 398 rec = &(cl->cl_recs[0]); 399 400 if (type == GLOBAL_BITMAP_SYSTEM_INODE) 401 bits_per_unit = osb->s_clustersize_bits - 402 inode->i_sb->s_blocksize_bits; 403 /* 404 * 'vict_blkno' was out of the valid range. 405 */ 406 if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || 407 (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) << 408 bits_per_unit))) { 409 ret = -EINVAL; 410 goto out; 411 } 412 413 for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { 414 415 rec = &(cl->cl_recs[i]); 416 if (!rec) 417 continue; 418 419 bg = NULL; 420 421 do { 422 if (!bg) 423 blkno = le64_to_cpu(rec->c_blkno); 424 else 425 blkno = le64_to_cpu(bg->bg_next_group); 426 427 if (gd_bh) { 428 brelse(gd_bh); 429 gd_bh = NULL; 430 } 431 432 ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh); 433 if (ret) { 434 mlog_errno(ret); 435 goto out; 436 } 437 438 bg = (struct ocfs2_group_desc *)gd_bh->b_data; 439 440 if (vict_blkno < (le64_to_cpu(bg->bg_blkno) + 441 le16_to_cpu(bg->bg_bits))) { 442 443 *ret_bh = gd_bh; 444 *vict_bit = (vict_blkno - blkno) >> 445 bits_per_unit; 446 mlog(0, "find the victim group: #%llu, " 447 "total_bits: %u, vict_bit: %u\n", 448 blkno, le16_to_cpu(bg->bg_bits), 449 *vict_bit); 450 goto out; 451 } 452 453 } while (le64_to_cpu(bg->bg_next_group)); 454 } 455 456 ret = -EINVAL; 457 out: 458 brelse(ac_bh); 459 460 /* 461 * caller has to release the gd_bh properly. 462 */ 463 return ret; 464 } 465 466 /* 467 * XXX: helper to validate and adjust moving goal. 468 */ 469 static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, 470 struct ocfs2_move_extents *range) 471 { 472 int ret, goal_bit = 0; 473 474 struct buffer_head *gd_bh = NULL; 475 struct ocfs2_group_desc *bg; 476 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 477 int c_to_b = 1 << (osb->s_clustersize_bits - 478 inode->i_sb->s_blocksize_bits); 479 480 /* 481 * make goal become cluster aligned. 482 */ 483 range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb, 484 range->me_goal); 485 /* 486 * validate goal sits within global_bitmap, and return the victim 487 * group desc 488 */ 489 ret = ocfs2_find_victim_alloc_group(inode, range->me_goal, 490 GLOBAL_BITMAP_SYSTEM_INODE, 491 OCFS2_INVALID_SLOT, 492 &goal_bit, &gd_bh); 493 if (ret) 494 goto out; 495 496 bg = (struct ocfs2_group_desc *)gd_bh->b_data; 497 498 /* 499 * moving goal is not allowd to start with a group desc blok(#0 blk) 500 * let's compromise to the latter cluster. 501 */ 502 if (range->me_goal == le64_to_cpu(bg->bg_blkno)) 503 range->me_goal += c_to_b; 504 505 /* 506 * movement is not gonna cross two groups. 507 */ 508 if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize < 509 range->me_len) { 510 ret = -EINVAL; 511 goto out; 512 } 513 /* 514 * more exact validations/adjustments will be performed later during 515 * moving operation for each extent range. 516 */ 517 mlog(0, "extents get ready to be moved to #%llu block\n", 518 range->me_goal); 519 520 out: 521 brelse(gd_bh); 522 523 return ret; 524 } 525 526 static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, 527 int *goal_bit, u32 move_len, u32 max_hop, 528 u32 *phys_cpos) 529 { 530 int i, used, last_free_bits = 0, base_bit = *goal_bit; 531 struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 532 u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb, 533 le64_to_cpu(gd->bg_blkno)); 534 535 for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) { 536 537 used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap); 538 if (used) { 539 /* 540 * we even tried searching the free chunk by jumping 541 * a 'max_hop' distance, but still failed. 542 */ 543 if ((i - base_bit) > max_hop) { 544 *phys_cpos = 0; 545 break; 546 } 547 548 if (last_free_bits) 549 last_free_bits = 0; 550 551 continue; 552 } else 553 last_free_bits++; 554 555 if (last_free_bits == move_len) { 556 *goal_bit = i; 557 *phys_cpos = base_cpos + i; 558 break; 559 } 560 } 561 562 mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); 563 } 564 565 static int ocfs2_alloc_dinode_update_counts(struct inode *inode, 566 handle_t *handle, 567 struct buffer_head *di_bh, 568 u32 num_bits, 569 u16 chain) 570 { 571 int ret; 572 u32 tmp_used; 573 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 574 struct ocfs2_chain_list *cl = 575 (struct ocfs2_chain_list *) &di->id2.i_chain; 576 577 ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 578 OCFS2_JOURNAL_ACCESS_WRITE); 579 if (ret < 0) { 580 mlog_errno(ret); 581 goto out; 582 } 583 584 tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); 585 di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); 586 le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); 587 ocfs2_journal_dirty(handle, di_bh); 588 589 out: 590 return ret; 591 } 592 593 static inline int ocfs2_block_group_set_bits(handle_t *handle, 594 struct inode *alloc_inode, 595 struct ocfs2_group_desc *bg, 596 struct buffer_head *group_bh, 597 unsigned int bit_off, 598 unsigned int num_bits) 599 { 600 int status; 601 void *bitmap = bg->bg_bitmap; 602 int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; 603 604 /* All callers get the descriptor via 605 * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ 606 BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); 607 BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); 608 609 mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, 610 num_bits); 611 612 if (ocfs2_is_cluster_bitmap(alloc_inode)) 613 journal_type = OCFS2_JOURNAL_ACCESS_UNDO; 614 615 status = ocfs2_journal_access_gd(handle, 616 INODE_CACHE(alloc_inode), 617 group_bh, 618 journal_type); 619 if (status < 0) { 620 mlog_errno(status); 621 goto bail; 622 } 623 624 le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 625 if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { 626 ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" 627 " count %u but claims %u are freed. num_bits %d", 628 (unsigned long long)le64_to_cpu(bg->bg_blkno), 629 le16_to_cpu(bg->bg_bits), 630 le16_to_cpu(bg->bg_free_bits_count), num_bits); 631 return -EROFS; 632 } 633 while (num_bits--) 634 ocfs2_set_bit(bit_off++, bitmap); 635 636 ocfs2_journal_dirty(handle, group_bh); 637 638 bail: 639 return status; 640 } 641 642 static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, 643 u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, 644 u32 len, int ext_flags) 645 { 646 int ret, credits = 0, extra_blocks = 0, goal_bit = 0; 647 handle_t *handle; 648 struct inode *inode = context->inode; 649 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 650 struct inode *tl_inode = osb->osb_tl_inode; 651 struct inode *gb_inode = NULL; 652 struct buffer_head *gb_bh = NULL; 653 struct buffer_head *gd_bh = NULL; 654 struct ocfs2_group_desc *gd; 655 struct ocfs2_refcount_tree *ref_tree = NULL; 656 u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb, 657 context->range->me_threshold); 658 u64 phys_blkno, new_phys_blkno; 659 660 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 661 662 if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) { 663 664 BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & 665 OCFS2_HAS_REFCOUNT_FL)); 666 667 BUG_ON(!context->refcount_loc); 668 669 ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, 670 &ref_tree, NULL); 671 if (ret) { 672 mlog_errno(ret); 673 return ret; 674 } 675 676 ret = ocfs2_prepare_refcount_change_for_del(inode, 677 context->refcount_loc, 678 phys_blkno, 679 len, 680 &credits, 681 &extra_blocks); 682 if (ret) { 683 mlog_errno(ret); 684 goto out; 685 } 686 } 687 688 ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1, 689 &context->meta_ac, 690 NULL, extra_blocks, &credits); 691 if (ret) { 692 mlog_errno(ret); 693 goto out; 694 } 695 696 /* 697 * need to count 2 extra credits for global_bitmap inode and 698 * group descriptor. 699 */ 700 credits += OCFS2_INODE_UPDATE_CREDITS + 1; 701 702 /* 703 * ocfs2_move_extent() didn't reserve any clusters in lock_allocators() 704 * logic, while we still need to lock the global_bitmap. 705 */ 706 gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, 707 OCFS2_INVALID_SLOT); 708 if (!gb_inode) { 709 mlog(ML_ERROR, "unable to get global_bitmap inode\n"); 710 ret = -EIO; 711 goto out; 712 } 713 714 mutex_lock(&gb_inode->i_mutex); 715 716 ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); 717 if (ret) { 718 mlog_errno(ret); 719 goto out_unlock_gb_mutex; 720 } 721 722 mutex_lock(&tl_inode->i_mutex); 723 724 handle = ocfs2_start_trans(osb, credits); 725 if (IS_ERR(handle)) { 726 ret = PTR_ERR(handle); 727 mlog_errno(ret); 728 goto out_unlock_tl_inode; 729 } 730 731 new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos); 732 ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno, 733 GLOBAL_BITMAP_SYSTEM_INODE, 734 OCFS2_INVALID_SLOT, 735 &goal_bit, &gd_bh); 736 if (ret) { 737 mlog_errno(ret); 738 goto out_commit; 739 } 740 741 /* 742 * probe the victim cluster group to find a proper 743 * region to fit wanted movement, it even will perfrom 744 * a best-effort attempt by compromising to a threshold 745 * around the goal. 746 */ 747 ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop, 748 new_phys_cpos); 749 if (!*new_phys_cpos) { 750 ret = -ENOSPC; 751 goto out_commit; 752 } 753 754 ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos, 755 *new_phys_cpos, ext_flags); 756 if (ret) { 757 mlog_errno(ret); 758 goto out_commit; 759 } 760 761 gd = (struct ocfs2_group_desc *)gd_bh->b_data; 762 ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len, 763 le16_to_cpu(gd->bg_chain)); 764 if (ret) { 765 mlog_errno(ret); 766 goto out_commit; 767 } 768 769 ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, 770 goal_bit, len); 771 if (ret) 772 mlog_errno(ret); 773 774 /* 775 * Here we should write the new page out first if we are 776 * in write-back mode. 777 */ 778 ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len); 779 if (ret) 780 mlog_errno(ret); 781 782 out_commit: 783 ocfs2_commit_trans(osb, handle); 784 brelse(gd_bh); 785 786 out_unlock_tl_inode: 787 mutex_unlock(&tl_inode->i_mutex); 788 789 ocfs2_inode_unlock(gb_inode, 1); 790 out_unlock_gb_mutex: 791 mutex_unlock(&gb_inode->i_mutex); 792 brelse(gb_bh); 793 iput(gb_inode); 794 795 out: 796 if (context->meta_ac) { 797 ocfs2_free_alloc_context(context->meta_ac); 798 context->meta_ac = NULL; 799 } 800 801 if (ref_tree) 802 ocfs2_unlock_refcount_tree(osb, ref_tree, 1); 803 804 return ret; 805 } 806 807 /* 808 * Helper to calculate the defraging length in one run according to threshold. 809 */ 810 static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged, 811 u32 threshold, int *skip) 812 { 813 if ((*alloc_size + *len_defraged) < threshold) { 814 /* 815 * proceed defragmentation until we meet the thresh 816 */ 817 *len_defraged += *alloc_size; 818 } else if (*len_defraged == 0) { 819 /* 820 * XXX: skip a large extent. 821 */ 822 *skip = 1; 823 } else { 824 /* 825 * split this extent to coalesce with former pieces as 826 * to reach the threshold. 827 * 828 * we're done here with one cycle of defragmentation 829 * in a size of 'thresh', resetting 'len_defraged' 830 * forces a new defragmentation. 831 */ 832 *alloc_size = threshold - *len_defraged; 833 *len_defraged = 0; 834 } 835 } 836 837 static int __ocfs2_move_extents_range(struct buffer_head *di_bh, 838 struct ocfs2_move_extents_context *context) 839 { 840 int ret = 0, flags, do_defrag, skip = 0; 841 u32 cpos, phys_cpos, move_start, len_to_move, alloc_size; 842 u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0; 843 844 struct inode *inode = context->inode; 845 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 846 struct ocfs2_move_extents *range = context->range; 847 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 848 849 if ((i_size_read(inode) == 0) || (range->me_len == 0)) 850 return 0; 851 852 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 853 return 0; 854 855 context->refcount_loc = le64_to_cpu(di->i_refcount_loc); 856 857 ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh); 858 ocfs2_init_dealloc_ctxt(&context->dealloc); 859 860 /* 861 * TO-DO XXX: 862 * 863 * - xattr extents. 864 */ 865 866 do_defrag = context->auto_defrag; 867 868 /* 869 * extents moving happens in unit of clusters, for the sake 870 * of simplicity, we may ignore two clusters where 'byte_start' 871 * and 'byte_start + len' were within. 872 */ 873 move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start); 874 len_to_move = (range->me_start + range->me_len) >> 875 osb->s_clustersize_bits; 876 if (len_to_move >= move_start) 877 len_to_move -= move_start; 878 else 879 len_to_move = 0; 880 881 if (do_defrag) { 882 defrag_thresh = range->me_threshold >> osb->s_clustersize_bits; 883 if (defrag_thresh <= 1) 884 goto done; 885 } else 886 new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, 887 range->me_goal); 888 889 mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, " 890 "thresh: %u\n", 891 (unsigned long long)OCFS2_I(inode)->ip_blkno, 892 (unsigned long long)range->me_start, 893 (unsigned long long)range->me_len, 894 move_start, len_to_move, defrag_thresh); 895 896 cpos = move_start; 897 while (len_to_move) { 898 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size, 899 &flags); 900 if (ret) { 901 mlog_errno(ret); 902 goto out; 903 } 904 905 if (alloc_size > len_to_move) 906 alloc_size = len_to_move; 907 908 /* 909 * XXX: how to deal with a hole: 910 * 911 * - skip the hole of course 912 * - force a new defragmentation 913 */ 914 if (!phys_cpos) { 915 if (do_defrag) 916 len_defraged = 0; 917 918 goto next; 919 } 920 921 if (do_defrag) { 922 ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged, 923 defrag_thresh, &skip); 924 /* 925 * skip large extents 926 */ 927 if (skip) { 928 skip = 0; 929 goto next; 930 } 931 932 mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, " 933 "alloc_size: %u, len_defraged: %u\n", 934 cpos, phys_cpos, alloc_size, len_defraged); 935 936 ret = ocfs2_defrag_extent(context, cpos, phys_cpos, 937 &alloc_size, flags); 938 } else { 939 ret = ocfs2_move_extent(context, cpos, phys_cpos, 940 &new_phys_cpos, alloc_size, 941 flags); 942 943 new_phys_cpos += alloc_size; 944 } 945 946 if (ret < 0) { 947 mlog_errno(ret); 948 goto out; 949 } 950 951 context->clusters_moved += alloc_size; 952 next: 953 cpos += alloc_size; 954 len_to_move -= alloc_size; 955 } 956 957 done: 958 range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE; 959 960 out: 961 range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb, 962 context->clusters_moved); 963 range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb, 964 context->new_phys_cpos); 965 966 ocfs2_schedule_truncate_log_flush(osb, 1); 967 ocfs2_run_deallocs(osb, &context->dealloc); 968 969 return ret; 970 } 971 972 static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) 973 { 974 int status; 975 handle_t *handle; 976 struct inode *inode = context->inode; 977 struct ocfs2_dinode *di; 978 struct buffer_head *di_bh = NULL; 979 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 980 981 if (!inode) 982 return -ENOENT; 983 984 if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 985 return -EROFS; 986 987 mutex_lock(&inode->i_mutex); 988 989 /* 990 * This prevents concurrent writes from other nodes 991 */ 992 status = ocfs2_rw_lock(inode, 1); 993 if (status) { 994 mlog_errno(status); 995 goto out; 996 } 997 998 status = ocfs2_inode_lock(inode, &di_bh, 1); 999 if (status) { 1000 mlog_errno(status); 1001 goto out_rw_unlock; 1002 } 1003 1004 /* 1005 * rememer ip_xattr_sem also needs to be held if necessary 1006 */ 1007 down_write(&OCFS2_I(inode)->ip_alloc_sem); 1008 1009 status = __ocfs2_move_extents_range(di_bh, context); 1010 1011 up_write(&OCFS2_I(inode)->ip_alloc_sem); 1012 if (status) { 1013 mlog_errno(status); 1014 goto out_inode_unlock; 1015 } 1016 1017 /* 1018 * We update ctime for these changes 1019 */ 1020 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 1021 if (IS_ERR(handle)) { 1022 status = PTR_ERR(handle); 1023 mlog_errno(status); 1024 goto out_inode_unlock; 1025 } 1026 1027 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 1028 OCFS2_JOURNAL_ACCESS_WRITE); 1029 if (status) { 1030 mlog_errno(status); 1031 goto out_commit; 1032 } 1033 1034 di = (struct ocfs2_dinode *)di_bh->b_data; 1035 inode->i_ctime = CURRENT_TIME; 1036 di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 1037 di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 1038 1039 ocfs2_journal_dirty(handle, di_bh); 1040 1041 out_commit: 1042 ocfs2_commit_trans(osb, handle); 1043 1044 out_inode_unlock: 1045 brelse(di_bh); 1046 ocfs2_inode_unlock(inode, 1); 1047 out_rw_unlock: 1048 ocfs2_rw_unlock(inode, 1); 1049 out: 1050 mutex_unlock(&inode->i_mutex); 1051 1052 return status; 1053 } 1054 1055 int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) 1056 { 1057 int status; 1058 1059 struct inode *inode = file_inode(filp); 1060 struct ocfs2_move_extents range; 1061 struct ocfs2_move_extents_context *context; 1062 1063 if (!argp) 1064 return -EINVAL; 1065 1066 status = mnt_want_write_file(filp); 1067 if (status) 1068 return status; 1069 1070 if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) 1071 goto out_drop; 1072 1073 if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { 1074 status = -EPERM; 1075 goto out_drop; 1076 } 1077 1078 context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS); 1079 if (!context) { 1080 status = -ENOMEM; 1081 mlog_errno(status); 1082 goto out_drop; 1083 } 1084 1085 context->inode = inode; 1086 context->file = filp; 1087 1088 if (copy_from_user(&range, argp, sizeof(range))) { 1089 status = -EFAULT; 1090 goto out_free; 1091 } 1092 1093 if (range.me_start > i_size_read(inode)) 1094 goto out_free; 1095 1096 if (range.me_start + range.me_len > i_size_read(inode)) 1097 range.me_len = i_size_read(inode) - range.me_start; 1098 1099 context->range = ⦥ 1100 1101 if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) { 1102 context->auto_defrag = 1; 1103 /* 1104 * ok, the default theshold for the defragmentation 1105 * is 1M, since our maximum clustersize was 1M also. 1106 * any thought? 1107 */ 1108 if (!range.me_threshold) 1109 range.me_threshold = 1024 * 1024; 1110 1111 if (range.me_threshold > i_size_read(inode)) 1112 range.me_threshold = i_size_read(inode); 1113 1114 if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG) 1115 context->partial = 1; 1116 } else { 1117 /* 1118 * first best-effort attempt to validate and adjust the goal 1119 * (physical address in block), while it can't guarantee later 1120 * operation can succeed all the time since global_bitmap may 1121 * change a bit over time. 1122 */ 1123 1124 status = ocfs2_validate_and_adjust_move_goal(inode, &range); 1125 if (status) 1126 goto out_copy; 1127 } 1128 1129 status = ocfs2_move_extents(context); 1130 if (status) 1131 mlog_errno(status); 1132 out_copy: 1133 /* 1134 * movement/defragmentation may end up being partially completed, 1135 * that's the reason why we need to return userspace the finished 1136 * length and new_offset even if failure happens somewhere. 1137 */ 1138 if (copy_to_user(argp, &range, sizeof(range))) 1139 status = -EFAULT; 1140 1141 out_free: 1142 kfree(context); 1143 out_drop: 1144 mnt_drop_write_file(filp); 1145 1146 return status; 1147 } 1148