1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2007 Oracle. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include "ctree.h" 8 #include "disk-io.h" 9 #include "print-tree.h" 10 #include "transaction.h" 11 #include "locking.h" 12 #include "accessors.h" 13 14 static struct kmem_cache *btrfs_inode_defrag_cachep; 15 16 /* 17 * When auto defrag is enabled we queue up these defrag structs to remember 18 * which inodes need defragging passes. 19 */ 20 struct inode_defrag { 21 struct rb_node rb_node; 22 /* Inode number */ 23 u64 ino; 24 /* 25 * Transid where the defrag was added, we search for extents newer than 26 * this. 27 */ 28 u64 transid; 29 30 /* Root objectid */ 31 u64 root; 32 33 /* 34 * The extent size threshold for autodefrag. 35 * 36 * This value is different for compressed/non-compressed extents, thus 37 * needs to be passed from higher layer. 38 * (aka, inode_should_defrag()) 39 */ 40 u32 extent_thresh; 41 }; 42 43 static int __compare_inode_defrag(struct inode_defrag *defrag1, 44 struct inode_defrag *defrag2) 45 { 46 if (defrag1->root > defrag2->root) 47 return 1; 48 else if (defrag1->root < defrag2->root) 49 return -1; 50 else if (defrag1->ino > defrag2->ino) 51 return 1; 52 else if (defrag1->ino < defrag2->ino) 53 return -1; 54 else 55 return 0; 56 } 57 58 /* 59 * Pop a record for an inode into the defrag tree. The lock must be held 60 * already. 61 * 62 * If you're inserting a record for an older transid than an existing record, 63 * the transid already in the tree is lowered. 64 * 65 * If an existing record is found the defrag item you pass in is freed. 66 */ 67 static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, 68 struct inode_defrag *defrag) 69 { 70 struct btrfs_fs_info *fs_info = inode->root->fs_info; 71 struct inode_defrag *entry; 72 struct rb_node **p; 73 struct rb_node *parent = NULL; 74 int ret; 75 76 p = &fs_info->defrag_inodes.rb_node; 77 while (*p) { 78 parent = *p; 79 entry = rb_entry(parent, struct inode_defrag, rb_node); 80 81 ret = __compare_inode_defrag(defrag, entry); 82 if (ret < 0) 83 p = &parent->rb_left; 84 else if (ret > 0) 85 p = &parent->rb_right; 86 else { 87 /* 88 * If we're reinserting an entry for an old defrag run, 89 * make sure to lower the transid of our existing 90 * record. 91 */ 92 if (defrag->transid < entry->transid) 93 entry->transid = defrag->transid; 94 entry->extent_thresh = min(defrag->extent_thresh, 95 entry->extent_thresh); 96 return -EEXIST; 97 } 98 } 99 set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); 100 rb_link_node(&defrag->rb_node, parent, p); 101 rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes); 102 return 0; 103 } 104 105 static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) 106 { 107 if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) 108 return 0; 109 110 if (btrfs_fs_closing(fs_info)) 111 return 0; 112 113 return 1; 114 } 115 116 /* 117 * Insert a defrag record for this inode if auto defrag is enabled. 118 */ 119 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, 120 struct btrfs_inode *inode, u32 extent_thresh) 121 { 122 struct btrfs_root *root = inode->root; 123 struct btrfs_fs_info *fs_info = root->fs_info; 124 struct inode_defrag *defrag; 125 u64 transid; 126 int ret; 127 128 if (!__need_auto_defrag(fs_info)) 129 return 0; 130 131 if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) 132 return 0; 133 134 if (trans) 135 transid = trans->transid; 136 else 137 transid = inode->root->last_trans; 138 139 defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); 140 if (!defrag) 141 return -ENOMEM; 142 143 defrag->ino = btrfs_ino(inode); 144 defrag->transid = transid; 145 defrag->root = root->root_key.objectid; 146 defrag->extent_thresh = extent_thresh; 147 148 spin_lock(&fs_info->defrag_inodes_lock); 149 if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) { 150 /* 151 * If we set IN_DEFRAG flag and evict the inode from memory, 152 * and then re-read this inode, this new inode doesn't have 153 * IN_DEFRAG flag. At the case, we may find the existed defrag. 154 */ 155 ret = __btrfs_add_inode_defrag(inode, defrag); 156 if (ret) 157 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 158 } else { 159 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 160 } 161 spin_unlock(&fs_info->defrag_inodes_lock); 162 return 0; 163 } 164 165 /* 166 * Pick the defragable inode that we want, if it doesn't exist, we will get the 167 * next one. 168 */ 169 static struct inode_defrag *btrfs_pick_defrag_inode( 170 struct btrfs_fs_info *fs_info, u64 root, u64 ino) 171 { 172 struct inode_defrag *entry = NULL; 173 struct inode_defrag tmp; 174 struct rb_node *p; 175 struct rb_node *parent = NULL; 176 int ret; 177 178 tmp.ino = ino; 179 tmp.root = root; 180 181 spin_lock(&fs_info->defrag_inodes_lock); 182 p = fs_info->defrag_inodes.rb_node; 183 while (p) { 184 parent = p; 185 entry = rb_entry(parent, struct inode_defrag, rb_node); 186 187 ret = __compare_inode_defrag(&tmp, entry); 188 if (ret < 0) 189 p = parent->rb_left; 190 else if (ret > 0) 191 p = parent->rb_right; 192 else 193 goto out; 194 } 195 196 if (parent && __compare_inode_defrag(&tmp, entry) > 0) { 197 parent = rb_next(parent); 198 if (parent) 199 entry = rb_entry(parent, struct inode_defrag, rb_node); 200 else 201 entry = NULL; 202 } 203 out: 204 if (entry) 205 rb_erase(parent, &fs_info->defrag_inodes); 206 spin_unlock(&fs_info->defrag_inodes_lock); 207 return entry; 208 } 209 210 void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) 211 { 212 struct inode_defrag *defrag; 213 struct rb_node *node; 214 215 spin_lock(&fs_info->defrag_inodes_lock); 216 node = rb_first(&fs_info->defrag_inodes); 217 while (node) { 218 rb_erase(node, &fs_info->defrag_inodes); 219 defrag = rb_entry(node, struct inode_defrag, rb_node); 220 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 221 222 cond_resched_lock(&fs_info->defrag_inodes_lock); 223 224 node = rb_first(&fs_info->defrag_inodes); 225 } 226 spin_unlock(&fs_info->defrag_inodes_lock); 227 } 228 229 #define BTRFS_DEFRAG_BATCH 1024 230 231 static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, 232 struct inode_defrag *defrag) 233 { 234 struct btrfs_root *inode_root; 235 struct inode *inode; 236 struct btrfs_ioctl_defrag_range_args range; 237 int ret = 0; 238 u64 cur = 0; 239 240 again: 241 if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) 242 goto cleanup; 243 if (!__need_auto_defrag(fs_info)) 244 goto cleanup; 245 246 /* Get the inode */ 247 inode_root = btrfs_get_fs_root(fs_info, defrag->root, true); 248 if (IS_ERR(inode_root)) { 249 ret = PTR_ERR(inode_root); 250 goto cleanup; 251 } 252 253 inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root); 254 btrfs_put_root(inode_root); 255 if (IS_ERR(inode)) { 256 ret = PTR_ERR(inode); 257 goto cleanup; 258 } 259 260 if (cur >= i_size_read(inode)) { 261 iput(inode); 262 goto cleanup; 263 } 264 265 /* Do a chunk of defrag */ 266 clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); 267 memset(&range, 0, sizeof(range)); 268 range.len = (u64)-1; 269 range.start = cur; 270 range.extent_thresh = defrag->extent_thresh; 271 272 sb_start_write(fs_info->sb); 273 ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, 274 BTRFS_DEFRAG_BATCH); 275 sb_end_write(fs_info->sb); 276 iput(inode); 277 278 if (ret < 0) 279 goto cleanup; 280 281 cur = max(cur + fs_info->sectorsize, range.start); 282 goto again; 283 284 cleanup: 285 kmem_cache_free(btrfs_inode_defrag_cachep, defrag); 286 return ret; 287 } 288 289 /* 290 * Run through the list of inodes in the FS that need defragging. 291 */ 292 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) 293 { 294 struct inode_defrag *defrag; 295 u64 first_ino = 0; 296 u64 root_objectid = 0; 297 298 atomic_inc(&fs_info->defrag_running); 299 while (1) { 300 /* Pause the auto defragger. */ 301 if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) 302 break; 303 304 if (!__need_auto_defrag(fs_info)) 305 break; 306 307 /* find an inode to defrag */ 308 defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino); 309 if (!defrag) { 310 if (root_objectid || first_ino) { 311 root_objectid = 0; 312 first_ino = 0; 313 continue; 314 } else { 315 break; 316 } 317 } 318 319 first_ino = defrag->ino + 1; 320 root_objectid = defrag->root; 321 322 __btrfs_run_defrag_inode(fs_info, defrag); 323 } 324 atomic_dec(&fs_info->defrag_running); 325 326 /* 327 * During unmount, we use the transaction_wait queue to wait for the 328 * defragger to stop. 329 */ 330 wake_up(&fs_info->transaction_wait); 331 return 0; 332 } 333 334 /* 335 * Defrag all the leaves in a given btree. 336 * Read all the leaves and try to get key order to 337 * better reflect disk order 338 */ 339 340 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, 341 struct btrfs_root *root) 342 { 343 struct btrfs_path *path = NULL; 344 struct btrfs_key key; 345 int ret = 0; 346 int wret; 347 int level; 348 int next_key_ret = 0; 349 u64 last_ret = 0; 350 351 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) 352 goto out; 353 354 path = btrfs_alloc_path(); 355 if (!path) 356 return -ENOMEM; 357 358 level = btrfs_header_level(root->node); 359 360 if (level == 0) 361 goto out; 362 363 if (root->defrag_progress.objectid == 0) { 364 struct extent_buffer *root_node; 365 u32 nritems; 366 367 root_node = btrfs_lock_root_node(root); 368 nritems = btrfs_header_nritems(root_node); 369 root->defrag_max.objectid = 0; 370 /* from above we know this is not a leaf */ 371 btrfs_node_key_to_cpu(root_node, &root->defrag_max, 372 nritems - 1); 373 btrfs_tree_unlock(root_node); 374 free_extent_buffer(root_node); 375 memset(&key, 0, sizeof(key)); 376 } else { 377 memcpy(&key, &root->defrag_progress, sizeof(key)); 378 } 379 380 path->keep_locks = 1; 381 382 ret = btrfs_search_forward(root, &key, path, BTRFS_OLDEST_GENERATION); 383 if (ret < 0) 384 goto out; 385 if (ret > 0) { 386 ret = 0; 387 goto out; 388 } 389 btrfs_release_path(path); 390 /* 391 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all 392 * leafs from path->nodes[1], so set lowest_level to 1 to avoid later 393 * a deadlock (attempting to write lock an already write locked leaf). 394 */ 395 path->lowest_level = 1; 396 wret = btrfs_search_slot(trans, root, &key, path, 0, 1); 397 398 if (wret < 0) { 399 ret = wret; 400 goto out; 401 } 402 if (!path->nodes[1]) { 403 ret = 0; 404 goto out; 405 } 406 /* 407 * The node at level 1 must always be locked when our path has 408 * keep_locks set and lowest_level is 1, regardless of the value of 409 * path->slots[1]. 410 */ 411 BUG_ON(path->locks[1] == 0); 412 ret = btrfs_realloc_node(trans, root, 413 path->nodes[1], 0, 414 &last_ret, 415 &root->defrag_progress); 416 if (ret) { 417 WARN_ON(ret == -EAGAIN); 418 goto out; 419 } 420 /* 421 * Now that we reallocated the node we can find the next key. Note that 422 * btrfs_find_next_key() can release our path and do another search 423 * without COWing, this is because even with path->keep_locks = 1, 424 * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a 425 * node when path->slots[node_level - 1] does not point to the last 426 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore 427 * we search for the next key after reallocating our node. 428 */ 429 path->slots[1] = btrfs_header_nritems(path->nodes[1]); 430 next_key_ret = btrfs_find_next_key(root, path, &key, 1, 431 BTRFS_OLDEST_GENERATION); 432 if (next_key_ret == 0) { 433 memcpy(&root->defrag_progress, &key, sizeof(key)); 434 ret = -EAGAIN; 435 } 436 out: 437 btrfs_free_path(path); 438 if (ret == -EAGAIN) { 439 if (root->defrag_max.objectid > root->defrag_progress.objectid) 440 goto done; 441 if (root->defrag_max.type > root->defrag_progress.type) 442 goto done; 443 if (root->defrag_max.offset > root->defrag_progress.offset) 444 goto done; 445 ret = 0; 446 } 447 done: 448 if (ret != -EAGAIN) 449 memset(&root->defrag_progress, 0, 450 sizeof(root->defrag_progress)); 451 452 return ret; 453 } 454 455 void __cold btrfs_auto_defrag_exit(void) 456 { 457 kmem_cache_destroy(btrfs_inode_defrag_cachep); 458 } 459 460 int __init btrfs_auto_defrag_init(void) 461 { 462 btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", 463 sizeof(struct inode_defrag), 0, 464 SLAB_MEM_SPREAD, 465 NULL); 466 if (!btrfs_inode_defrag_cachep) 467 return -ENOMEM; 468 469 return 0; 470 } 471