1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2011 STRATO. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/pagemap.h> 8 #include <linux/writeback.h> 9 #include <linux/blkdev.h> 10 #include <linux/rbtree.h> 11 #include <linux/slab.h> 12 #include <linux/workqueue.h> 13 #include <linux/btrfs.h> 14 #include <linux/sched/mm.h> 15 16 #include "ctree.h" 17 #include "transaction.h" 18 #include "disk-io.h" 19 #include "locking.h" 20 #include "ulist.h" 21 #include "backref.h" 22 #include "extent_io.h" 23 #include "qgroup.h" 24 #include "block-group.h" 25 #include "sysfs.h" 26 #include "tree-mod-log.h" 27 #include "fs.h" 28 #include "accessors.h" 29 #include "extent-tree.h" 30 #include "root-tree.h" 31 #include "tree-checker.h" 32 33 /* 34 * Helpers to access qgroup reservation 35 * 36 * Callers should ensure the lock context and type are valid 37 */ 38 39 static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup) 40 { 41 u64 ret = 0; 42 int i; 43 44 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) 45 ret += qgroup->rsv.values[i]; 46 47 return ret; 48 } 49 50 #ifdef CONFIG_BTRFS_DEBUG 51 static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type) 52 { 53 if (type == BTRFS_QGROUP_RSV_DATA) 54 return "data"; 55 if (type == BTRFS_QGROUP_RSV_META_PERTRANS) 56 return "meta_pertrans"; 57 if (type == BTRFS_QGROUP_RSV_META_PREALLOC) 58 return "meta_prealloc"; 59 return NULL; 60 } 61 #endif 62 63 static void qgroup_rsv_add(struct btrfs_fs_info *fs_info, 64 struct btrfs_qgroup *qgroup, u64 num_bytes, 65 enum btrfs_qgroup_rsv_type type) 66 { 67 trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type); 68 qgroup->rsv.values[type] += num_bytes; 69 } 70 71 static void qgroup_rsv_release(struct btrfs_fs_info *fs_info, 72 struct btrfs_qgroup *qgroup, u64 num_bytes, 73 enum btrfs_qgroup_rsv_type type) 74 { 75 trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type); 76 if (qgroup->rsv.values[type] >= num_bytes) { 77 
qgroup->rsv.values[type] -= num_bytes; 78 return; 79 } 80 #ifdef CONFIG_BTRFS_DEBUG 81 WARN_RATELIMIT(1, 82 "qgroup %llu %s reserved space underflow, have %llu to free %llu", 83 qgroup->qgroupid, qgroup_rsv_type_str(type), 84 qgroup->rsv.values[type], num_bytes); 85 #endif 86 qgroup->rsv.values[type] = 0; 87 } 88 89 static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info, 90 struct btrfs_qgroup *dest, 91 struct btrfs_qgroup *src) 92 { 93 int i; 94 95 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) 96 qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i); 97 } 98 99 static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info, 100 struct btrfs_qgroup *dest, 101 struct btrfs_qgroup *src) 102 { 103 int i; 104 105 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) 106 qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i); 107 } 108 109 static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, 110 int mod) 111 { 112 if (qg->old_refcnt < seq) 113 qg->old_refcnt = seq; 114 qg->old_refcnt += mod; 115 } 116 117 static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq, 118 int mod) 119 { 120 if (qg->new_refcnt < seq) 121 qg->new_refcnt = seq; 122 qg->new_refcnt += mod; 123 } 124 125 static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq) 126 { 127 if (qg->old_refcnt < seq) 128 return 0; 129 return qg->old_refcnt - seq; 130 } 131 132 static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq) 133 { 134 if (qg->new_refcnt < seq) 135 return 0; 136 return qg->new_refcnt - seq; 137 } 138 139 /* 140 * glue structure to represent the relations between qgroups. 
141 */ 142 struct btrfs_qgroup_list { 143 struct list_head next_group; 144 struct list_head next_member; 145 struct btrfs_qgroup *group; 146 struct btrfs_qgroup *member; 147 }; 148 149 static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg) 150 { 151 return (u64)(uintptr_t)qg; 152 } 153 154 static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n) 155 { 156 return (struct btrfs_qgroup *)(uintptr_t)n->aux; 157 } 158 159 static int 160 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, 161 int init_flags); 162 static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info); 163 164 /* must be called with qgroup_ioctl_lock held */ 165 static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, 166 u64 qgroupid) 167 { 168 struct rb_node *n = fs_info->qgroup_tree.rb_node; 169 struct btrfs_qgroup *qgroup; 170 171 while (n) { 172 qgroup = rb_entry(n, struct btrfs_qgroup, node); 173 if (qgroup->qgroupid < qgroupid) 174 n = n->rb_left; 175 else if (qgroup->qgroupid > qgroupid) 176 n = n->rb_right; 177 else 178 return qgroup; 179 } 180 return NULL; 181 } 182 183 /* must be called with qgroup_lock held */ 184 static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, 185 u64 qgroupid) 186 { 187 struct rb_node **p = &fs_info->qgroup_tree.rb_node; 188 struct rb_node *parent = NULL; 189 struct btrfs_qgroup *qgroup; 190 191 while (*p) { 192 parent = *p; 193 qgroup = rb_entry(parent, struct btrfs_qgroup, node); 194 195 if (qgroup->qgroupid < qgroupid) 196 p = &(*p)->rb_left; 197 else if (qgroup->qgroupid > qgroupid) 198 p = &(*p)->rb_right; 199 else 200 return qgroup; 201 } 202 203 qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC); 204 if (!qgroup) 205 return ERR_PTR(-ENOMEM); 206 207 qgroup->qgroupid = qgroupid; 208 INIT_LIST_HEAD(&qgroup->groups); 209 INIT_LIST_HEAD(&qgroup->members); 210 INIT_LIST_HEAD(&qgroup->dirty); 211 INIT_LIST_HEAD(&qgroup->iterator); 212 213 rb_link_node(&qgroup->node, parent, p); 
214 rb_insert_color(&qgroup->node, &fs_info->qgroup_tree); 215 216 return qgroup; 217 } 218 219 static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, 220 struct btrfs_qgroup *qgroup) 221 { 222 struct btrfs_qgroup_list *list; 223 224 list_del(&qgroup->dirty); 225 while (!list_empty(&qgroup->groups)) { 226 list = list_first_entry(&qgroup->groups, 227 struct btrfs_qgroup_list, next_group); 228 list_del(&list->next_group); 229 list_del(&list->next_member); 230 kfree(list); 231 } 232 233 while (!list_empty(&qgroup->members)) { 234 list = list_first_entry(&qgroup->members, 235 struct btrfs_qgroup_list, next_member); 236 list_del(&list->next_group); 237 list_del(&list->next_member); 238 kfree(list); 239 } 240 } 241 242 /* must be called with qgroup_lock held */ 243 static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) 244 { 245 struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid); 246 247 if (!qgroup) 248 return -ENOENT; 249 250 rb_erase(&qgroup->node, &fs_info->qgroup_tree); 251 __del_qgroup_rb(fs_info, qgroup); 252 return 0; 253 } 254 255 /* 256 * Add relation specified by two qgroups. 257 * 258 * Must be called with qgroup_lock held. 259 * 260 * Return: 0 on success 261 * -ENOENT if one of the qgroups is NULL 262 * <0 other errors 263 */ 264 static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent) 265 { 266 struct btrfs_qgroup_list *list; 267 268 if (!member || !parent) 269 return -ENOENT; 270 271 list = kzalloc(sizeof(*list), GFP_ATOMIC); 272 if (!list) 273 return -ENOMEM; 274 275 list->group = parent; 276 list->member = member; 277 list_add_tail(&list->next_group, &member->groups); 278 list_add_tail(&list->next_member, &parent->members); 279 280 return 0; 281 } 282 283 /* 284 * Add relation specified by two qgroup ids. 285 * 286 * Must be called with qgroup_lock held. 
287 * 288 * Return: 0 on success 289 * -ENOENT if one of the ids does not exist 290 * <0 other errors 291 */ 292 static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid) 293 { 294 struct btrfs_qgroup *member; 295 struct btrfs_qgroup *parent; 296 297 member = find_qgroup_rb(fs_info, memberid); 298 parent = find_qgroup_rb(fs_info, parentid); 299 300 return __add_relation_rb(member, parent); 301 } 302 303 /* Must be called with qgroup_lock held */ 304 static int del_relation_rb(struct btrfs_fs_info *fs_info, 305 u64 memberid, u64 parentid) 306 { 307 struct btrfs_qgroup *member; 308 struct btrfs_qgroup *parent; 309 struct btrfs_qgroup_list *list; 310 311 member = find_qgroup_rb(fs_info, memberid); 312 parent = find_qgroup_rb(fs_info, parentid); 313 if (!member || !parent) 314 return -ENOENT; 315 316 list_for_each_entry(list, &member->groups, next_group) { 317 if (list->group == parent) { 318 list_del(&list->next_group); 319 list_del(&list->next_member); 320 kfree(list); 321 return 0; 322 } 323 } 324 return -ENOENT; 325 } 326 327 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 328 int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, 329 u64 rfer, u64 excl) 330 { 331 struct btrfs_qgroup *qgroup; 332 333 qgroup = find_qgroup_rb(fs_info, qgroupid); 334 if (!qgroup) 335 return -EINVAL; 336 if (qgroup->rfer != rfer || qgroup->excl != excl) 337 return -EINVAL; 338 return 0; 339 } 340 #endif 341 342 static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info) 343 { 344 fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT | 345 BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | 346 BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); 347 } 348 349 /* 350 * The full config is read in one go, only called from open_ctree() 351 * It doesn't use any locking, as at this point we're still single-threaded 352 */ 353 int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) 354 { 355 struct btrfs_key key; 356 struct btrfs_key found_key; 357 
struct btrfs_root *quota_root = fs_info->quota_root; 358 struct btrfs_path *path = NULL; 359 struct extent_buffer *l; 360 int slot; 361 int ret = 0; 362 u64 flags = 0; 363 u64 rescan_progress = 0; 364 365 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 366 return 0; 367 368 fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); 369 if (!fs_info->qgroup_ulist) { 370 ret = -ENOMEM; 371 goto out; 372 } 373 374 path = btrfs_alloc_path(); 375 if (!path) { 376 ret = -ENOMEM; 377 goto out; 378 } 379 380 ret = btrfs_sysfs_add_qgroups(fs_info); 381 if (ret < 0) 382 goto out; 383 /* default this to quota off, in case no status key is found */ 384 fs_info->qgroup_flags = 0; 385 386 /* 387 * pass 1: read status, all qgroup infos and limits 388 */ 389 key.objectid = 0; 390 key.type = 0; 391 key.offset = 0; 392 ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1); 393 if (ret) 394 goto out; 395 396 while (1) { 397 struct btrfs_qgroup *qgroup; 398 399 slot = path->slots[0]; 400 l = path->nodes[0]; 401 btrfs_item_key_to_cpu(l, &found_key, slot); 402 403 if (found_key.type == BTRFS_QGROUP_STATUS_KEY) { 404 struct btrfs_qgroup_status_item *ptr; 405 406 ptr = btrfs_item_ptr(l, slot, 407 struct btrfs_qgroup_status_item); 408 409 if (btrfs_qgroup_status_version(l, ptr) != 410 BTRFS_QGROUP_STATUS_VERSION) { 411 btrfs_err(fs_info, 412 "old qgroup version, quota disabled"); 413 goto out; 414 } 415 if (btrfs_qgroup_status_generation(l, ptr) != 416 fs_info->generation) { 417 qgroup_mark_inconsistent(fs_info); 418 btrfs_err(fs_info, 419 "qgroup generation mismatch, marked as inconsistent"); 420 } 421 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, 422 ptr); 423 rescan_progress = btrfs_qgroup_status_rescan(l, ptr); 424 goto next1; 425 } 426 427 if (found_key.type != BTRFS_QGROUP_INFO_KEY && 428 found_key.type != BTRFS_QGROUP_LIMIT_KEY) 429 goto next1; 430 431 qgroup = find_qgroup_rb(fs_info, found_key.offset); 432 if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || 433 
(!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { 434 btrfs_err(fs_info, "inconsistent qgroup config"); 435 qgroup_mark_inconsistent(fs_info); 436 } 437 if (!qgroup) { 438 qgroup = add_qgroup_rb(fs_info, found_key.offset); 439 if (IS_ERR(qgroup)) { 440 ret = PTR_ERR(qgroup); 441 goto out; 442 } 443 } 444 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 445 if (ret < 0) 446 goto out; 447 448 switch (found_key.type) { 449 case BTRFS_QGROUP_INFO_KEY: { 450 struct btrfs_qgroup_info_item *ptr; 451 452 ptr = btrfs_item_ptr(l, slot, 453 struct btrfs_qgroup_info_item); 454 qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr); 455 qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr); 456 qgroup->excl = btrfs_qgroup_info_excl(l, ptr); 457 qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr); 458 /* generation currently unused */ 459 break; 460 } 461 case BTRFS_QGROUP_LIMIT_KEY: { 462 struct btrfs_qgroup_limit_item *ptr; 463 464 ptr = btrfs_item_ptr(l, slot, 465 struct btrfs_qgroup_limit_item); 466 qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr); 467 qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr); 468 qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr); 469 qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr); 470 qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr); 471 break; 472 } 473 } 474 next1: 475 ret = btrfs_next_item(quota_root, path); 476 if (ret < 0) 477 goto out; 478 if (ret) 479 break; 480 } 481 btrfs_release_path(path); 482 483 /* 484 * pass 2: read all qgroup relations 485 */ 486 key.objectid = 0; 487 key.type = BTRFS_QGROUP_RELATION_KEY; 488 key.offset = 0; 489 ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0); 490 if (ret) 491 goto out; 492 while (1) { 493 slot = path->slots[0]; 494 l = path->nodes[0]; 495 btrfs_item_key_to_cpu(l, &found_key, slot); 496 497 if (found_key.type != BTRFS_QGROUP_RELATION_KEY) 498 goto next2; 499 500 if (found_key.objectid > found_key.offset) { 501 /* parent <- member, not needed to build 
config */ 502 /* FIXME should we omit the key completely? */ 503 goto next2; 504 } 505 506 ret = add_relation_rb(fs_info, found_key.objectid, 507 found_key.offset); 508 if (ret == -ENOENT) { 509 btrfs_warn(fs_info, 510 "orphan qgroup relation 0x%llx->0x%llx", 511 found_key.objectid, found_key.offset); 512 ret = 0; /* ignore the error */ 513 } 514 if (ret) 515 goto out; 516 next2: 517 ret = btrfs_next_item(quota_root, path); 518 if (ret < 0) 519 goto out; 520 if (ret) 521 break; 522 } 523 out: 524 btrfs_free_path(path); 525 fs_info->qgroup_flags |= flags; 526 if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) 527 clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 528 else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN && 529 ret >= 0) 530 ret = qgroup_rescan_init(fs_info, rescan_progress, 0); 531 532 if (ret < 0) { 533 ulist_free(fs_info->qgroup_ulist); 534 fs_info->qgroup_ulist = NULL; 535 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 536 btrfs_sysfs_del_qgroups(fs_info); 537 } 538 539 return ret < 0 ? ret : 0; 540 } 541 542 /* 543 * Called in close_ctree() when quota is still enabled. This verifies we don't 544 * leak some reserved space. 545 * 546 * Return false if no reserved space is left. 547 * Return true if some reserved space is leaked. 548 */ 549 bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info) 550 { 551 struct rb_node *node; 552 bool ret = false; 553 554 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 555 return ret; 556 /* 557 * Since we're unmounting, there is no race and no need to grab qgroup 558 * lock. And here we don't go post-order to provide a more user 559 * friendly sorted result. 
560 */ 561 for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) { 562 struct btrfs_qgroup *qgroup; 563 int i; 564 565 qgroup = rb_entry(node, struct btrfs_qgroup, node); 566 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) { 567 if (qgroup->rsv.values[i]) { 568 ret = true; 569 btrfs_warn(fs_info, 570 "qgroup %hu/%llu has unreleased space, type %d rsv %llu", 571 btrfs_qgroup_level(qgroup->qgroupid), 572 btrfs_qgroup_subvolid(qgroup->qgroupid), 573 i, qgroup->rsv.values[i]); 574 } 575 } 576 } 577 return ret; 578 } 579 580 /* 581 * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(), 582 * first two are in single-threaded paths.And for the third one, we have set 583 * quota_root to be null with qgroup_lock held before, so it is safe to clean 584 * up the in-memory structures without qgroup_lock held. 585 */ 586 void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) 587 { 588 struct rb_node *n; 589 struct btrfs_qgroup *qgroup; 590 591 while ((n = rb_first(&fs_info->qgroup_tree))) { 592 qgroup = rb_entry(n, struct btrfs_qgroup, node); 593 rb_erase(n, &fs_info->qgroup_tree); 594 __del_qgroup_rb(fs_info, qgroup); 595 btrfs_sysfs_del_one_qgroup(fs_info, qgroup); 596 kfree(qgroup); 597 } 598 /* 599 * We call btrfs_free_qgroup_config() when unmounting 600 * filesystem and disabling quota, so we set qgroup_ulist 601 * to be null here to avoid double free. 
602 */ 603 ulist_free(fs_info->qgroup_ulist); 604 fs_info->qgroup_ulist = NULL; 605 btrfs_sysfs_del_qgroups(fs_info); 606 } 607 608 static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, 609 u64 dst) 610 { 611 int ret; 612 struct btrfs_root *quota_root = trans->fs_info->quota_root; 613 struct btrfs_path *path; 614 struct btrfs_key key; 615 616 path = btrfs_alloc_path(); 617 if (!path) 618 return -ENOMEM; 619 620 key.objectid = src; 621 key.type = BTRFS_QGROUP_RELATION_KEY; 622 key.offset = dst; 623 624 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); 625 626 btrfs_mark_buffer_dirty(trans, path->nodes[0]); 627 628 btrfs_free_path(path); 629 return ret; 630 } 631 632 static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, 633 u64 dst) 634 { 635 int ret; 636 struct btrfs_root *quota_root = trans->fs_info->quota_root; 637 struct btrfs_path *path; 638 struct btrfs_key key; 639 640 path = btrfs_alloc_path(); 641 if (!path) 642 return -ENOMEM; 643 644 key.objectid = src; 645 key.type = BTRFS_QGROUP_RELATION_KEY; 646 key.offset = dst; 647 648 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); 649 if (ret < 0) 650 goto out; 651 652 if (ret > 0) { 653 ret = -ENOENT; 654 goto out; 655 } 656 657 ret = btrfs_del_item(trans, quota_root, path); 658 out: 659 btrfs_free_path(path); 660 return ret; 661 } 662 663 static int add_qgroup_item(struct btrfs_trans_handle *trans, 664 struct btrfs_root *quota_root, u64 qgroupid) 665 { 666 int ret; 667 struct btrfs_path *path; 668 struct btrfs_qgroup_info_item *qgroup_info; 669 struct btrfs_qgroup_limit_item *qgroup_limit; 670 struct extent_buffer *leaf; 671 struct btrfs_key key; 672 673 if (btrfs_is_testing(quota_root->fs_info)) 674 return 0; 675 676 path = btrfs_alloc_path(); 677 if (!path) 678 return -ENOMEM; 679 680 key.objectid = 0; 681 key.type = BTRFS_QGROUP_INFO_KEY; 682 key.offset = qgroupid; 683 684 /* 685 * Avoid a transaction abort by catching -EEXIST here. 
In that 686 * case, we proceed by re-initializing the existing structure 687 * on disk. 688 */ 689 690 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 691 sizeof(*qgroup_info)); 692 if (ret && ret != -EEXIST) 693 goto out; 694 695 leaf = path->nodes[0]; 696 qgroup_info = btrfs_item_ptr(leaf, path->slots[0], 697 struct btrfs_qgroup_info_item); 698 btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid); 699 btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0); 700 btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0); 701 btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0); 702 btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0); 703 704 btrfs_mark_buffer_dirty(trans, leaf); 705 706 btrfs_release_path(path); 707 708 key.type = BTRFS_QGROUP_LIMIT_KEY; 709 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 710 sizeof(*qgroup_limit)); 711 if (ret && ret != -EEXIST) 712 goto out; 713 714 leaf = path->nodes[0]; 715 qgroup_limit = btrfs_item_ptr(leaf, path->slots[0], 716 struct btrfs_qgroup_limit_item); 717 btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0); 718 btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0); 719 btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0); 720 btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); 721 btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); 722 723 btrfs_mark_buffer_dirty(trans, leaf); 724 725 ret = 0; 726 out: 727 btrfs_free_path(path); 728 return ret; 729 } 730 731 static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid) 732 { 733 int ret; 734 struct btrfs_root *quota_root = trans->fs_info->quota_root; 735 struct btrfs_path *path; 736 struct btrfs_key key; 737 738 path = btrfs_alloc_path(); 739 if (!path) 740 return -ENOMEM; 741 742 key.objectid = 0; 743 key.type = BTRFS_QGROUP_INFO_KEY; 744 key.offset = qgroupid; 745 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); 746 if (ret < 0) 747 goto out; 748 749 if (ret > 0) { 750 ret = -ENOENT; 751 goto out; 752 
} 753 754 ret = btrfs_del_item(trans, quota_root, path); 755 if (ret) 756 goto out; 757 758 btrfs_release_path(path); 759 760 key.type = BTRFS_QGROUP_LIMIT_KEY; 761 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); 762 if (ret < 0) 763 goto out; 764 765 if (ret > 0) { 766 ret = -ENOENT; 767 goto out; 768 } 769 770 ret = btrfs_del_item(trans, quota_root, path); 771 772 out: 773 btrfs_free_path(path); 774 return ret; 775 } 776 777 static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, 778 struct btrfs_qgroup *qgroup) 779 { 780 struct btrfs_root *quota_root = trans->fs_info->quota_root; 781 struct btrfs_path *path; 782 struct btrfs_key key; 783 struct extent_buffer *l; 784 struct btrfs_qgroup_limit_item *qgroup_limit; 785 int ret; 786 int slot; 787 788 key.objectid = 0; 789 key.type = BTRFS_QGROUP_LIMIT_KEY; 790 key.offset = qgroup->qgroupid; 791 792 path = btrfs_alloc_path(); 793 if (!path) 794 return -ENOMEM; 795 796 ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); 797 if (ret > 0) 798 ret = -ENOENT; 799 800 if (ret) 801 goto out; 802 803 l = path->nodes[0]; 804 slot = path->slots[0]; 805 qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); 806 btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags); 807 btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer); 808 btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl); 809 btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); 810 btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); 811 812 btrfs_mark_buffer_dirty(trans, l); 813 814 out: 815 btrfs_free_path(path); 816 return ret; 817 } 818 819 static int update_qgroup_info_item(struct btrfs_trans_handle *trans, 820 struct btrfs_qgroup *qgroup) 821 { 822 struct btrfs_fs_info *fs_info = trans->fs_info; 823 struct btrfs_root *quota_root = fs_info->quota_root; 824 struct btrfs_path *path; 825 struct btrfs_key key; 826 struct extent_buffer *l; 827 
struct btrfs_qgroup_info_item *qgroup_info; 828 int ret; 829 int slot; 830 831 if (btrfs_is_testing(fs_info)) 832 return 0; 833 834 key.objectid = 0; 835 key.type = BTRFS_QGROUP_INFO_KEY; 836 key.offset = qgroup->qgroupid; 837 838 path = btrfs_alloc_path(); 839 if (!path) 840 return -ENOMEM; 841 842 ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); 843 if (ret > 0) 844 ret = -ENOENT; 845 846 if (ret) 847 goto out; 848 849 l = path->nodes[0]; 850 slot = path->slots[0]; 851 qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item); 852 btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid); 853 btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer); 854 btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); 855 btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); 856 btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); 857 858 btrfs_mark_buffer_dirty(trans, l); 859 860 out: 861 btrfs_free_path(path); 862 return ret; 863 } 864 865 static int update_qgroup_status_item(struct btrfs_trans_handle *trans) 866 { 867 struct btrfs_fs_info *fs_info = trans->fs_info; 868 struct btrfs_root *quota_root = fs_info->quota_root; 869 struct btrfs_path *path; 870 struct btrfs_key key; 871 struct extent_buffer *l; 872 struct btrfs_qgroup_status_item *ptr; 873 int ret; 874 int slot; 875 876 key.objectid = 0; 877 key.type = BTRFS_QGROUP_STATUS_KEY; 878 key.offset = 0; 879 880 path = btrfs_alloc_path(); 881 if (!path) 882 return -ENOMEM; 883 884 ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); 885 if (ret > 0) 886 ret = -ENOENT; 887 888 if (ret) 889 goto out; 890 891 l = path->nodes[0]; 892 slot = path->slots[0]; 893 ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); 894 btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags & 895 BTRFS_QGROUP_STATUS_FLAGS_MASK); 896 btrfs_set_qgroup_status_generation(l, ptr, trans->transid); 897 btrfs_set_qgroup_status_rescan(l, ptr, 898 
fs_info->qgroup_rescan_progress.objectid); 899 900 btrfs_mark_buffer_dirty(trans, l); 901 902 out: 903 btrfs_free_path(path); 904 return ret; 905 } 906 907 /* 908 * called with qgroup_lock held 909 */ 910 static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, 911 struct btrfs_root *root) 912 { 913 struct btrfs_path *path; 914 struct btrfs_key key; 915 struct extent_buffer *leaf = NULL; 916 int ret; 917 int nr = 0; 918 919 path = btrfs_alloc_path(); 920 if (!path) 921 return -ENOMEM; 922 923 key.objectid = 0; 924 key.offset = 0; 925 key.type = 0; 926 927 while (1) { 928 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 929 if (ret < 0) 930 goto out; 931 leaf = path->nodes[0]; 932 nr = btrfs_header_nritems(leaf); 933 if (!nr) 934 break; 935 /* 936 * delete the leaf one by one 937 * since the whole tree is going 938 * to be deleted. 939 */ 940 path->slots[0] = 0; 941 ret = btrfs_del_items(trans, root, path, 0, nr); 942 if (ret) 943 goto out; 944 945 btrfs_release_path(path); 946 } 947 ret = 0; 948 out: 949 btrfs_free_path(path); 950 return ret; 951 } 952 953 int btrfs_quota_enable(struct btrfs_fs_info *fs_info) 954 { 955 struct btrfs_root *quota_root; 956 struct btrfs_root *tree_root = fs_info->tree_root; 957 struct btrfs_path *path = NULL; 958 struct btrfs_qgroup_status_item *ptr; 959 struct extent_buffer *leaf; 960 struct btrfs_key key; 961 struct btrfs_key found_key; 962 struct btrfs_qgroup *qgroup = NULL; 963 struct btrfs_trans_handle *trans = NULL; 964 struct ulist *ulist = NULL; 965 int ret = 0; 966 int slot; 967 968 /* 969 * We need to have subvol_sem write locked, to prevent races between 970 * concurrent tasks trying to enable quotas, because we will unlock 971 * and relock qgroup_ioctl_lock before setting fs_info->quota_root 972 * and before setting BTRFS_FS_QUOTA_ENABLED. 
973 */ 974 lockdep_assert_held_write(&fs_info->subvol_sem); 975 976 if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 977 btrfs_err(fs_info, 978 "qgroups are currently unsupported in extent tree v2"); 979 return -EINVAL; 980 } 981 982 mutex_lock(&fs_info->qgroup_ioctl_lock); 983 if (fs_info->quota_root) 984 goto out; 985 986 ulist = ulist_alloc(GFP_KERNEL); 987 if (!ulist) { 988 ret = -ENOMEM; 989 goto out; 990 } 991 992 ret = btrfs_sysfs_add_qgroups(fs_info); 993 if (ret < 0) 994 goto out; 995 996 /* 997 * Unlock qgroup_ioctl_lock before starting the transaction. This is to 998 * avoid lock acquisition inversion problems (reported by lockdep) between 999 * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we 1000 * start a transaction. 1001 * After we started the transaction lock qgroup_ioctl_lock again and 1002 * check if someone else created the quota root in the meanwhile. If so, 1003 * just return success and release the transaction handle. 1004 * 1005 * Also we don't need to worry about someone else calling 1006 * btrfs_sysfs_add_qgroups() after we unlock and getting an error because 1007 * that function returns 0 (success) when the sysfs entries already exist. 1008 */ 1009 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1010 1011 /* 1012 * 1 for quota root item 1013 * 1 for BTRFS_QGROUP_STATUS item 1014 * 1015 * Yet we also need 2*n items for a QGROUP_INFO/QGROUP_LIMIT items 1016 * per subvolume. However those are not currently reserved since it 1017 * would be a lot of overkill. 
1018 */ 1019 trans = btrfs_start_transaction(tree_root, 2); 1020 1021 mutex_lock(&fs_info->qgroup_ioctl_lock); 1022 if (IS_ERR(trans)) { 1023 ret = PTR_ERR(trans); 1024 trans = NULL; 1025 goto out; 1026 } 1027 1028 if (fs_info->quota_root) 1029 goto out; 1030 1031 fs_info->qgroup_ulist = ulist; 1032 ulist = NULL; 1033 1034 /* 1035 * initially create the quota tree 1036 */ 1037 quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID); 1038 if (IS_ERR(quota_root)) { 1039 ret = PTR_ERR(quota_root); 1040 btrfs_abort_transaction(trans, ret); 1041 goto out; 1042 } 1043 1044 path = btrfs_alloc_path(); 1045 if (!path) { 1046 ret = -ENOMEM; 1047 btrfs_abort_transaction(trans, ret); 1048 goto out_free_root; 1049 } 1050 1051 key.objectid = 0; 1052 key.type = BTRFS_QGROUP_STATUS_KEY; 1053 key.offset = 0; 1054 1055 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 1056 sizeof(*ptr)); 1057 if (ret) { 1058 btrfs_abort_transaction(trans, ret); 1059 goto out_free_path; 1060 } 1061 1062 leaf = path->nodes[0]; 1063 ptr = btrfs_item_ptr(leaf, path->slots[0], 1064 struct btrfs_qgroup_status_item); 1065 btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid); 1066 btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); 1067 fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON | 1068 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 1069 btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags & 1070 BTRFS_QGROUP_STATUS_FLAGS_MASK); 1071 btrfs_set_qgroup_status_rescan(leaf, ptr, 0); 1072 1073 btrfs_mark_buffer_dirty(trans, leaf); 1074 1075 key.objectid = 0; 1076 key.type = BTRFS_ROOT_REF_KEY; 1077 key.offset = 0; 1078 1079 btrfs_release_path(path); 1080 ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0); 1081 if (ret > 0) 1082 goto out_add_root; 1083 if (ret < 0) { 1084 btrfs_abort_transaction(trans, ret); 1085 goto out_free_path; 1086 } 1087 1088 while (1) { 1089 slot = path->slots[0]; 1090 leaf = path->nodes[0]; 1091 
btrfs_item_key_to_cpu(leaf, &found_key, slot); 1092 1093 if (found_key.type == BTRFS_ROOT_REF_KEY) { 1094 1095 /* Release locks on tree_root before we access quota_root */ 1096 btrfs_release_path(path); 1097 1098 ret = add_qgroup_item(trans, quota_root, 1099 found_key.offset); 1100 if (ret) { 1101 btrfs_abort_transaction(trans, ret); 1102 goto out_free_path; 1103 } 1104 1105 qgroup = add_qgroup_rb(fs_info, found_key.offset); 1106 if (IS_ERR(qgroup)) { 1107 ret = PTR_ERR(qgroup); 1108 btrfs_abort_transaction(trans, ret); 1109 goto out_free_path; 1110 } 1111 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 1112 if (ret < 0) { 1113 btrfs_abort_transaction(trans, ret); 1114 goto out_free_path; 1115 } 1116 ret = btrfs_search_slot_for_read(tree_root, &found_key, 1117 path, 1, 0); 1118 if (ret < 0) { 1119 btrfs_abort_transaction(trans, ret); 1120 goto out_free_path; 1121 } 1122 if (ret > 0) { 1123 /* 1124 * Shouldn't happen, but in case it does we 1125 * don't need to do the btrfs_next_item, just 1126 * continue. 
1127 */ 1128 continue; 1129 } 1130 } 1131 ret = btrfs_next_item(tree_root, path); 1132 if (ret < 0) { 1133 btrfs_abort_transaction(trans, ret); 1134 goto out_free_path; 1135 } 1136 if (ret) 1137 break; 1138 } 1139 1140 out_add_root: 1141 btrfs_release_path(path); 1142 ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID); 1143 if (ret) { 1144 btrfs_abort_transaction(trans, ret); 1145 goto out_free_path; 1146 } 1147 1148 qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID); 1149 if (IS_ERR(qgroup)) { 1150 ret = PTR_ERR(qgroup); 1151 btrfs_abort_transaction(trans, ret); 1152 goto out_free_path; 1153 } 1154 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 1155 if (ret < 0) { 1156 btrfs_abort_transaction(trans, ret); 1157 goto out_free_path; 1158 } 1159 1160 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1161 /* 1162 * Commit the transaction while not holding qgroup_ioctl_lock, to avoid 1163 * a deadlock with tasks concurrently doing other qgroup operations, such 1164 * adding/removing qgroups or adding/deleting qgroup relations for example, 1165 * because all qgroup operations first start or join a transaction and then 1166 * lock the qgroup_ioctl_lock mutex. 1167 * We are safe from a concurrent task trying to enable quotas, by calling 1168 * this function, since we are serialized by fs_info->subvol_sem. 1169 */ 1170 ret = btrfs_commit_transaction(trans); 1171 trans = NULL; 1172 mutex_lock(&fs_info->qgroup_ioctl_lock); 1173 if (ret) 1174 goto out_free_path; 1175 1176 /* 1177 * Set quota enabled flag after committing the transaction, to avoid 1178 * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot 1179 * creation. 
1180 */ 1181 spin_lock(&fs_info->qgroup_lock); 1182 fs_info->quota_root = quota_root; 1183 set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 1184 spin_unlock(&fs_info->qgroup_lock); 1185 1186 ret = qgroup_rescan_init(fs_info, 0, 1); 1187 if (!ret) { 1188 qgroup_rescan_zero_tracking(fs_info); 1189 fs_info->qgroup_rescan_running = true; 1190 btrfs_queue_work(fs_info->qgroup_rescan_workers, 1191 &fs_info->qgroup_rescan_work); 1192 } else { 1193 /* 1194 * We have set both BTRFS_FS_QUOTA_ENABLED and 1195 * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with 1196 * -EINPROGRESS. That can happen because someone started the 1197 * rescan worker by calling quota rescan ioctl before we 1198 * attempted to initialize the rescan worker. Failure due to 1199 * quotas disabled in the meanwhile is not possible, because 1200 * we are holding a write lock on fs_info->subvol_sem, which 1201 * is also acquired when disabling quotas. 1202 * Ignore such error, and any other error would need to undo 1203 * everything we did in the transaction we just committed. 1204 */ 1205 ASSERT(ret == -EINPROGRESS); 1206 ret = 0; 1207 } 1208 1209 out_free_path: 1210 btrfs_free_path(path); 1211 out_free_root: 1212 if (ret) 1213 btrfs_put_root(quota_root); 1214 out: 1215 if (ret) { 1216 ulist_free(fs_info->qgroup_ulist); 1217 fs_info->qgroup_ulist = NULL; 1218 btrfs_sysfs_del_qgroups(fs_info); 1219 } 1220 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1221 if (ret && trans) 1222 btrfs_end_transaction(trans); 1223 else if (trans) 1224 ret = btrfs_end_transaction(trans); 1225 ulist_free(ulist); 1226 return ret; 1227 } 1228 1229 int btrfs_quota_disable(struct btrfs_fs_info *fs_info) 1230 { 1231 struct btrfs_root *quota_root; 1232 struct btrfs_trans_handle *trans = NULL; 1233 int ret = 0; 1234 1235 /* 1236 * We need to have subvol_sem write locked to prevent races with 1237 * snapshot creation. 
1238 */ 1239 lockdep_assert_held_write(&fs_info->subvol_sem); 1240 1241 /* 1242 * Lock the cleaner mutex to prevent races with concurrent relocation, 1243 * because relocation may be building backrefs for blocks of the quota 1244 * root while we are deleting the root. This is like dropping fs roots 1245 * of deleted snapshots/subvolumes, we need the same protection. 1246 * 1247 * This also prevents races between concurrent tasks trying to disable 1248 * quotas, because we will unlock and relock qgroup_ioctl_lock across 1249 * BTRFS_FS_QUOTA_ENABLED changes. 1250 */ 1251 mutex_lock(&fs_info->cleaner_mutex); 1252 1253 mutex_lock(&fs_info->qgroup_ioctl_lock); 1254 if (!fs_info->quota_root) 1255 goto out; 1256 1257 /* 1258 * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to 1259 * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs 1260 * to lock that mutex while holding a transaction handle and the rescan 1261 * worker needs to commit a transaction. 1262 */ 1263 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1264 1265 /* 1266 * Request qgroup rescan worker to complete and wait for it. This wait 1267 * must be done before transaction start for quota disable since it may 1268 * deadlock with transaction by the qgroup rescan worker. 1269 */ 1270 clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 1271 btrfs_qgroup_wait_for_completion(fs_info, false); 1272 1273 /* 1274 * 1 For the root item 1275 * 1276 * We should also reserve enough items for the quota tree deletion in 1277 * btrfs_clean_quota_tree but this is not done. 1278 * 1279 * Also, we must always start a transaction without holding the mutex 1280 * qgroup_ioctl_lock, see btrfs_quota_enable(). 
1281 */ 1282 trans = btrfs_start_transaction(fs_info->tree_root, 1); 1283 1284 mutex_lock(&fs_info->qgroup_ioctl_lock); 1285 if (IS_ERR(trans)) { 1286 ret = PTR_ERR(trans); 1287 trans = NULL; 1288 set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 1289 goto out; 1290 } 1291 1292 if (!fs_info->quota_root) 1293 goto out; 1294 1295 spin_lock(&fs_info->qgroup_lock); 1296 quota_root = fs_info->quota_root; 1297 fs_info->quota_root = NULL; 1298 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; 1299 fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; 1300 spin_unlock(&fs_info->qgroup_lock); 1301 1302 btrfs_free_qgroup_config(fs_info); 1303 1304 ret = btrfs_clean_quota_tree(trans, quota_root); 1305 if (ret) { 1306 btrfs_abort_transaction(trans, ret); 1307 goto out; 1308 } 1309 1310 ret = btrfs_del_root(trans, "a_root->root_key); 1311 if (ret) { 1312 btrfs_abort_transaction(trans, ret); 1313 goto out; 1314 } 1315 1316 spin_lock(&fs_info->trans_lock); 1317 list_del("a_root->dirty_list); 1318 spin_unlock(&fs_info->trans_lock); 1319 1320 btrfs_tree_lock(quota_root->node); 1321 btrfs_clear_buffer_dirty(trans, quota_root->node); 1322 btrfs_tree_unlock(quota_root->node); 1323 btrfs_free_tree_block(trans, btrfs_root_id(quota_root), 1324 quota_root->node, 0, 1); 1325 1326 btrfs_put_root(quota_root); 1327 1328 out: 1329 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1330 if (ret && trans) 1331 btrfs_end_transaction(trans); 1332 else if (trans) 1333 ret = btrfs_end_transaction(trans); 1334 mutex_unlock(&fs_info->cleaner_mutex); 1335 1336 return ret; 1337 } 1338 1339 static void qgroup_dirty(struct btrfs_fs_info *fs_info, 1340 struct btrfs_qgroup *qgroup) 1341 { 1342 if (list_empty(&qgroup->dirty)) 1343 list_add(&qgroup->dirty, &fs_info->dirty_qgroups); 1344 } 1345 1346 static void qgroup_iterator_add(struct list_head *head, struct btrfs_qgroup *qgroup) 1347 { 1348 if (!list_empty(&qgroup->iterator)) 1349 return; 1350 1351 list_add_tail(&qgroup->iterator, head); 1352 } 1353 1354 
static void qgroup_iterator_clean(struct list_head *head) 1355 { 1356 while (!list_empty(head)) { 1357 struct btrfs_qgroup *qgroup; 1358 1359 qgroup = list_first_entry(head, struct btrfs_qgroup, iterator); 1360 list_del_init(&qgroup->iterator); 1361 } 1362 } 1363 1364 /* 1365 * The easy accounting, we're updating qgroup relationship whose child qgroup 1366 * only has exclusive extents. 1367 * 1368 * In this case, all exclusive extents will also be exclusive for parent, so 1369 * excl/rfer just get added/removed. 1370 * 1371 * So is qgroup reservation space, which should also be added/removed to 1372 * parent. 1373 * Or when child tries to release reservation space, parent will underflow its 1374 * reservation (for relationship adding case). 1375 * 1376 * Caller should hold fs_info->qgroup_lock. 1377 */ 1378 static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, 1379 struct ulist *tmp, u64 ref_root, 1380 struct btrfs_qgroup *src, int sign) 1381 { 1382 struct btrfs_qgroup *qgroup; 1383 struct btrfs_qgroup_list *glist; 1384 struct ulist_node *unode; 1385 struct ulist_iterator uiter; 1386 u64 num_bytes = src->excl; 1387 int ret = 0; 1388 1389 qgroup = find_qgroup_rb(fs_info, ref_root); 1390 if (!qgroup) 1391 goto out; 1392 1393 qgroup->rfer += sign * num_bytes; 1394 qgroup->rfer_cmpr += sign * num_bytes; 1395 1396 WARN_ON(sign < 0 && qgroup->excl < num_bytes); 1397 qgroup->excl += sign * num_bytes; 1398 qgroup->excl_cmpr += sign * num_bytes; 1399 1400 if (sign > 0) 1401 qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); 1402 else 1403 qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); 1404 1405 qgroup_dirty(fs_info, qgroup); 1406 1407 /* Get all of the parent groups that contain this qgroup */ 1408 list_for_each_entry(glist, &qgroup->groups, next_group) { 1409 ret = ulist_add(tmp, glist->group->qgroupid, 1410 qgroup_to_aux(glist->group), GFP_ATOMIC); 1411 if (ret < 0) 1412 goto out; 1413 } 1414 1415 /* Iterate all of the parents and adjust their reference 
counts */ 1416 ULIST_ITER_INIT(&uiter); 1417 while ((unode = ulist_next(tmp, &uiter))) { 1418 qgroup = unode_aux_to_qgroup(unode); 1419 qgroup->rfer += sign * num_bytes; 1420 qgroup->rfer_cmpr += sign * num_bytes; 1421 WARN_ON(sign < 0 && qgroup->excl < num_bytes); 1422 qgroup->excl += sign * num_bytes; 1423 if (sign > 0) 1424 qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); 1425 else 1426 qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); 1427 qgroup->excl_cmpr += sign * num_bytes; 1428 qgroup_dirty(fs_info, qgroup); 1429 1430 /* Add any parents of the parents */ 1431 list_for_each_entry(glist, &qgroup->groups, next_group) { 1432 ret = ulist_add(tmp, glist->group->qgroupid, 1433 qgroup_to_aux(glist->group), GFP_ATOMIC); 1434 if (ret < 0) 1435 goto out; 1436 } 1437 } 1438 ret = 0; 1439 out: 1440 return ret; 1441 } 1442 1443 1444 /* 1445 * Quick path for updating qgroup with only excl refs. 1446 * 1447 * In that case, just update all parent will be enough. 1448 * Or we needs to do a full rescan. 1449 * Caller should also hold fs_info->qgroup_lock. 1450 * 1451 * Return 0 for quick update, return >0 for need to full rescan 1452 * and mark INCONSISTENT flag. 1453 * Return < 0 for other error. 
1454 */ 1455 static int quick_update_accounting(struct btrfs_fs_info *fs_info, 1456 struct ulist *tmp, u64 src, u64 dst, 1457 int sign) 1458 { 1459 struct btrfs_qgroup *qgroup; 1460 int ret = 1; 1461 int err = 0; 1462 1463 qgroup = find_qgroup_rb(fs_info, src); 1464 if (!qgroup) 1465 goto out; 1466 if (qgroup->excl == qgroup->rfer) { 1467 ret = 0; 1468 err = __qgroup_excl_accounting(fs_info, tmp, dst, 1469 qgroup, sign); 1470 if (err < 0) { 1471 ret = err; 1472 goto out; 1473 } 1474 } 1475 out: 1476 if (ret) 1477 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 1478 return ret; 1479 } 1480 1481 int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, 1482 u64 dst) 1483 { 1484 struct btrfs_fs_info *fs_info = trans->fs_info; 1485 struct btrfs_qgroup *parent; 1486 struct btrfs_qgroup *member; 1487 struct btrfs_qgroup_list *list; 1488 struct ulist *tmp; 1489 unsigned int nofs_flag; 1490 int ret = 0; 1491 1492 /* Check the level of src and dst first */ 1493 if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) 1494 return -EINVAL; 1495 1496 /* We hold a transaction handle open, must do a NOFS allocation. 
*/ 1497 nofs_flag = memalloc_nofs_save(); 1498 tmp = ulist_alloc(GFP_KERNEL); 1499 memalloc_nofs_restore(nofs_flag); 1500 if (!tmp) 1501 return -ENOMEM; 1502 1503 mutex_lock(&fs_info->qgroup_ioctl_lock); 1504 if (!fs_info->quota_root) { 1505 ret = -ENOTCONN; 1506 goto out; 1507 } 1508 member = find_qgroup_rb(fs_info, src); 1509 parent = find_qgroup_rb(fs_info, dst); 1510 if (!member || !parent) { 1511 ret = -EINVAL; 1512 goto out; 1513 } 1514 1515 /* check if such qgroup relation exist firstly */ 1516 list_for_each_entry(list, &member->groups, next_group) { 1517 if (list->group == parent) { 1518 ret = -EEXIST; 1519 goto out; 1520 } 1521 } 1522 1523 ret = add_qgroup_relation_item(trans, src, dst); 1524 if (ret) 1525 goto out; 1526 1527 ret = add_qgroup_relation_item(trans, dst, src); 1528 if (ret) { 1529 del_qgroup_relation_item(trans, src, dst); 1530 goto out; 1531 } 1532 1533 spin_lock(&fs_info->qgroup_lock); 1534 ret = __add_relation_rb(member, parent); 1535 if (ret < 0) { 1536 spin_unlock(&fs_info->qgroup_lock); 1537 goto out; 1538 } 1539 ret = quick_update_accounting(fs_info, tmp, src, dst, 1); 1540 spin_unlock(&fs_info->qgroup_lock); 1541 out: 1542 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1543 ulist_free(tmp); 1544 return ret; 1545 } 1546 1547 static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, 1548 u64 dst) 1549 { 1550 struct btrfs_fs_info *fs_info = trans->fs_info; 1551 struct btrfs_qgroup *parent; 1552 struct btrfs_qgroup *member; 1553 struct btrfs_qgroup_list *list; 1554 struct ulist *tmp; 1555 bool found = false; 1556 unsigned int nofs_flag; 1557 int ret = 0; 1558 int ret2; 1559 1560 /* We hold a transaction handle open, must do a NOFS allocation. 
*/ 1561 nofs_flag = memalloc_nofs_save(); 1562 tmp = ulist_alloc(GFP_KERNEL); 1563 memalloc_nofs_restore(nofs_flag); 1564 if (!tmp) 1565 return -ENOMEM; 1566 1567 if (!fs_info->quota_root) { 1568 ret = -ENOTCONN; 1569 goto out; 1570 } 1571 1572 member = find_qgroup_rb(fs_info, src); 1573 parent = find_qgroup_rb(fs_info, dst); 1574 /* 1575 * The parent/member pair doesn't exist, then try to delete the dead 1576 * relation items only. 1577 */ 1578 if (!member || !parent) 1579 goto delete_item; 1580 1581 /* check if such qgroup relation exist firstly */ 1582 list_for_each_entry(list, &member->groups, next_group) { 1583 if (list->group == parent) { 1584 found = true; 1585 break; 1586 } 1587 } 1588 1589 delete_item: 1590 ret = del_qgroup_relation_item(trans, src, dst); 1591 if (ret < 0 && ret != -ENOENT) 1592 goto out; 1593 ret2 = del_qgroup_relation_item(trans, dst, src); 1594 if (ret2 < 0 && ret2 != -ENOENT) 1595 goto out; 1596 1597 /* At least one deletion succeeded, return 0 */ 1598 if (!ret || !ret2) 1599 ret = 0; 1600 1601 if (found) { 1602 spin_lock(&fs_info->qgroup_lock); 1603 del_relation_rb(fs_info, src, dst); 1604 ret = quick_update_accounting(fs_info, tmp, src, dst, -1); 1605 spin_unlock(&fs_info->qgroup_lock); 1606 } 1607 out: 1608 ulist_free(tmp); 1609 return ret; 1610 } 1611 1612 int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, 1613 u64 dst) 1614 { 1615 struct btrfs_fs_info *fs_info = trans->fs_info; 1616 int ret = 0; 1617 1618 mutex_lock(&fs_info->qgroup_ioctl_lock); 1619 ret = __del_qgroup_relation(trans, src, dst); 1620 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1621 1622 return ret; 1623 } 1624 1625 int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) 1626 { 1627 struct btrfs_fs_info *fs_info = trans->fs_info; 1628 struct btrfs_root *quota_root; 1629 struct btrfs_qgroup *qgroup; 1630 int ret = 0; 1631 1632 mutex_lock(&fs_info->qgroup_ioctl_lock); 1633 if (!fs_info->quota_root) { 1634 ret = -ENOTCONN; 1635 
goto out; 1636 } 1637 quota_root = fs_info->quota_root; 1638 qgroup = find_qgroup_rb(fs_info, qgroupid); 1639 if (qgroup) { 1640 ret = -EEXIST; 1641 goto out; 1642 } 1643 1644 ret = add_qgroup_item(trans, quota_root, qgroupid); 1645 if (ret) 1646 goto out; 1647 1648 spin_lock(&fs_info->qgroup_lock); 1649 qgroup = add_qgroup_rb(fs_info, qgroupid); 1650 spin_unlock(&fs_info->qgroup_lock); 1651 1652 if (IS_ERR(qgroup)) { 1653 ret = PTR_ERR(qgroup); 1654 goto out; 1655 } 1656 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 1657 out: 1658 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1659 return ret; 1660 } 1661 1662 int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) 1663 { 1664 struct btrfs_fs_info *fs_info = trans->fs_info; 1665 struct btrfs_qgroup *qgroup; 1666 struct btrfs_qgroup_list *list; 1667 int ret = 0; 1668 1669 mutex_lock(&fs_info->qgroup_ioctl_lock); 1670 if (!fs_info->quota_root) { 1671 ret = -ENOTCONN; 1672 goto out; 1673 } 1674 1675 qgroup = find_qgroup_rb(fs_info, qgroupid); 1676 if (!qgroup) { 1677 ret = -ENOENT; 1678 goto out; 1679 } 1680 1681 /* Check if there are no children of this qgroup */ 1682 if (!list_empty(&qgroup->members)) { 1683 ret = -EBUSY; 1684 goto out; 1685 } 1686 1687 ret = del_qgroup_item(trans, qgroupid); 1688 if (ret && ret != -ENOENT) 1689 goto out; 1690 1691 while (!list_empty(&qgroup->groups)) { 1692 list = list_first_entry(&qgroup->groups, 1693 struct btrfs_qgroup_list, next_group); 1694 ret = __del_qgroup_relation(trans, qgroupid, 1695 list->group->qgroupid); 1696 if (ret) 1697 goto out; 1698 } 1699 1700 spin_lock(&fs_info->qgroup_lock); 1701 del_qgroup_rb(fs_info, qgroupid); 1702 spin_unlock(&fs_info->qgroup_lock); 1703 1704 /* 1705 * Remove the qgroup from sysfs now without holding the qgroup_lock 1706 * spinlock, since the sysfs_remove_group() function needs to take 1707 * the mutex kernfs_mutex through kernfs_remove_by_name_ns(). 
1708 */ 1709 btrfs_sysfs_del_one_qgroup(fs_info, qgroup); 1710 kfree(qgroup); 1711 out: 1712 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1713 return ret; 1714 } 1715 1716 int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, 1717 struct btrfs_qgroup_limit *limit) 1718 { 1719 struct btrfs_fs_info *fs_info = trans->fs_info; 1720 struct btrfs_qgroup *qgroup; 1721 int ret = 0; 1722 /* Sometimes we would want to clear the limit on this qgroup. 1723 * To meet this requirement, we treat the -1 as a special value 1724 * which tell kernel to clear the limit on this qgroup. 1725 */ 1726 const u64 CLEAR_VALUE = -1; 1727 1728 mutex_lock(&fs_info->qgroup_ioctl_lock); 1729 if (!fs_info->quota_root) { 1730 ret = -ENOTCONN; 1731 goto out; 1732 } 1733 1734 qgroup = find_qgroup_rb(fs_info, qgroupid); 1735 if (!qgroup) { 1736 ret = -ENOENT; 1737 goto out; 1738 } 1739 1740 spin_lock(&fs_info->qgroup_lock); 1741 if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) { 1742 if (limit->max_rfer == CLEAR_VALUE) { 1743 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; 1744 limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; 1745 qgroup->max_rfer = 0; 1746 } else { 1747 qgroup->max_rfer = limit->max_rfer; 1748 } 1749 } 1750 if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { 1751 if (limit->max_excl == CLEAR_VALUE) { 1752 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; 1753 limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; 1754 qgroup->max_excl = 0; 1755 } else { 1756 qgroup->max_excl = limit->max_excl; 1757 } 1758 } 1759 if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) { 1760 if (limit->rsv_rfer == CLEAR_VALUE) { 1761 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; 1762 limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; 1763 qgroup->rsv_rfer = 0; 1764 } else { 1765 qgroup->rsv_rfer = limit->rsv_rfer; 1766 } 1767 } 1768 if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) { 1769 if (limit->rsv_excl == CLEAR_VALUE) { 1770 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; 1771 limit->flags &= 
~BTRFS_QGROUP_LIMIT_RSV_EXCL; 1772 qgroup->rsv_excl = 0; 1773 } else { 1774 qgroup->rsv_excl = limit->rsv_excl; 1775 } 1776 } 1777 qgroup->lim_flags |= limit->flags; 1778 1779 spin_unlock(&fs_info->qgroup_lock); 1780 1781 ret = update_qgroup_limit_item(trans, qgroup); 1782 if (ret) { 1783 qgroup_mark_inconsistent(fs_info); 1784 btrfs_info(fs_info, "unable to update quota limit for %llu", 1785 qgroupid); 1786 } 1787 1788 out: 1789 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1790 return ret; 1791 } 1792 1793 int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, 1794 struct btrfs_delayed_ref_root *delayed_refs, 1795 struct btrfs_qgroup_extent_record *record) 1796 { 1797 struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; 1798 struct rb_node *parent_node = NULL; 1799 struct btrfs_qgroup_extent_record *entry; 1800 u64 bytenr = record->bytenr; 1801 1802 lockdep_assert_held(&delayed_refs->lock); 1803 trace_btrfs_qgroup_trace_extent(fs_info, record); 1804 1805 while (*p) { 1806 parent_node = *p; 1807 entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, 1808 node); 1809 if (bytenr < entry->bytenr) { 1810 p = &(*p)->rb_left; 1811 } else if (bytenr > entry->bytenr) { 1812 p = &(*p)->rb_right; 1813 } else { 1814 if (record->data_rsv && !entry->data_rsv) { 1815 entry->data_rsv = record->data_rsv; 1816 entry->data_rsv_refroot = 1817 record->data_rsv_refroot; 1818 } 1819 return 1; 1820 } 1821 } 1822 1823 rb_link_node(&record->node, parent_node, p); 1824 rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); 1825 return 0; 1826 } 1827 1828 int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, 1829 struct btrfs_qgroup_extent_record *qrecord) 1830 { 1831 struct btrfs_backref_walk_ctx ctx = { 0 }; 1832 int ret; 1833 1834 /* 1835 * We are always called in a context where we are already holding a 1836 * transaction handle. 
Often we are called when adding a data delayed 1837 * reference from btrfs_truncate_inode_items() (truncating or unlinking), 1838 * in which case we will be holding a write lock on extent buffer from a 1839 * subvolume tree. In this case we can't allow btrfs_find_all_roots() to 1840 * acquire fs_info->commit_root_sem, because that is a higher level lock 1841 * that must be acquired before locking any extent buffers. 1842 * 1843 * So we want btrfs_find_all_roots() to not acquire the commit_root_sem 1844 * but we can't pass it a non-NULL transaction handle, because otherwise 1845 * it would not use commit roots and would lock extent buffers, causing 1846 * a deadlock if it ends up trying to read lock the same extent buffer 1847 * that was previously write locked at btrfs_truncate_inode_items(). 1848 * 1849 * So pass a NULL transaction handle to btrfs_find_all_roots() and 1850 * explicitly tell it to not acquire the commit_root_sem - if we are 1851 * holding a transaction handle we don't need its protection. 
1852 */ 1853 ASSERT(trans != NULL); 1854 1855 if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) 1856 return 0; 1857 1858 ctx.bytenr = qrecord->bytenr; 1859 ctx.fs_info = trans->fs_info; 1860 1861 ret = btrfs_find_all_roots(&ctx, true); 1862 if (ret < 0) { 1863 qgroup_mark_inconsistent(trans->fs_info); 1864 btrfs_warn(trans->fs_info, 1865 "error accounting new delayed refs extent (err code: %d), quota inconsistent", 1866 ret); 1867 return 0; 1868 } 1869 1870 /* 1871 * Here we don't need to get the lock of 1872 * trans->transaction->delayed_refs, since inserted qrecord won't 1873 * be deleted, only qrecord->node may be modified (new qrecord insert) 1874 * 1875 * So modifying qrecord->old_roots is safe here 1876 */ 1877 qrecord->old_roots = ctx.roots; 1878 return 0; 1879 } 1880 1881 int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, 1882 u64 num_bytes) 1883 { 1884 struct btrfs_fs_info *fs_info = trans->fs_info; 1885 struct btrfs_qgroup_extent_record *record; 1886 struct btrfs_delayed_ref_root *delayed_refs; 1887 int ret; 1888 1889 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) 1890 || bytenr == 0 || num_bytes == 0) 1891 return 0; 1892 record = kzalloc(sizeof(*record), GFP_NOFS); 1893 if (!record) 1894 return -ENOMEM; 1895 1896 delayed_refs = &trans->transaction->delayed_refs; 1897 record->bytenr = bytenr; 1898 record->num_bytes = num_bytes; 1899 record->old_roots = NULL; 1900 1901 spin_lock(&delayed_refs->lock); 1902 ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record); 1903 spin_unlock(&delayed_refs->lock); 1904 if (ret > 0) { 1905 kfree(record); 1906 return 0; 1907 } 1908 return btrfs_qgroup_trace_extent_post(trans, record); 1909 } 1910 1911 int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, 1912 struct extent_buffer *eb) 1913 { 1914 struct btrfs_fs_info *fs_info = trans->fs_info; 1915 int nr = btrfs_header_nritems(eb); 1916 int i, extent_type, ret; 1917 struct btrfs_key 
key; 1918 struct btrfs_file_extent_item *fi; 1919 u64 bytenr, num_bytes; 1920 1921 /* We can be called directly from walk_up_proc() */ 1922 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 1923 return 0; 1924 1925 for (i = 0; i < nr; i++) { 1926 btrfs_item_key_to_cpu(eb, &key, i); 1927 1928 if (key.type != BTRFS_EXTENT_DATA_KEY) 1929 continue; 1930 1931 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 1932 /* filter out non qgroup-accountable extents */ 1933 extent_type = btrfs_file_extent_type(eb, fi); 1934 1935 if (extent_type == BTRFS_FILE_EXTENT_INLINE) 1936 continue; 1937 1938 bytenr = btrfs_file_extent_disk_bytenr(eb, fi); 1939 if (!bytenr) 1940 continue; 1941 1942 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); 1943 1944 ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes); 1945 if (ret) 1946 return ret; 1947 } 1948 cond_resched(); 1949 return 0; 1950 } 1951 1952 /* 1953 * Walk up the tree from the bottom, freeing leaves and any interior 1954 * nodes which have had all slots visited. If a node (leaf or 1955 * interior) is freed, the node above it will have it's slot 1956 * incremented. The root node will never be freed. 1957 * 1958 * At the end of this function, we should have a path which has all 1959 * slots incremented to the next position for a search. If we need to 1960 * read a new node it will be NULL and the node above it will have the 1961 * correct slot selected for a later read. 1962 * 1963 * If we increment the root nodes slot counter past the number of 1964 * elements, 1 is returned to signal completion of the search. 
1965 */ 1966 static int adjust_slots_upwards(struct btrfs_path *path, int root_level) 1967 { 1968 int level = 0; 1969 int nr, slot; 1970 struct extent_buffer *eb; 1971 1972 if (root_level == 0) 1973 return 1; 1974 1975 while (level <= root_level) { 1976 eb = path->nodes[level]; 1977 nr = btrfs_header_nritems(eb); 1978 path->slots[level]++; 1979 slot = path->slots[level]; 1980 if (slot >= nr || level == 0) { 1981 /* 1982 * Don't free the root - we will detect this 1983 * condition after our loop and return a 1984 * positive value for caller to stop walking the tree. 1985 */ 1986 if (level != root_level) { 1987 btrfs_tree_unlock_rw(eb, path->locks[level]); 1988 path->locks[level] = 0; 1989 1990 free_extent_buffer(eb); 1991 path->nodes[level] = NULL; 1992 path->slots[level] = 0; 1993 } 1994 } else { 1995 /* 1996 * We have a valid slot to walk back down 1997 * from. Stop here so caller can process these 1998 * new nodes. 1999 */ 2000 break; 2001 } 2002 2003 level++; 2004 } 2005 2006 eb = path->nodes[root_level]; 2007 if (path->slots[root_level] >= btrfs_header_nritems(eb)) 2008 return 1; 2009 2010 return 0; 2011 } 2012 2013 /* 2014 * Helper function to trace a subtree tree block swap. 2015 * 2016 * The swap will happen in highest tree block, but there may be a lot of 2017 * tree blocks involved. 2018 * 2019 * For example: 2020 * OO = Old tree blocks 2021 * NN = New tree blocks allocated during balance 2022 * 2023 * File tree (257) Reloc tree for 257 2024 * L2 OO NN 2025 * / \ / \ 2026 * L1 OO OO (a) OO NN (a) 2027 * / \ / \ / \ / \ 2028 * L0 OO OO OO OO OO OO NN NN 2029 * (b) (c) (b) (c) 2030 * 2031 * When calling qgroup_trace_extent_swap(), we will pass: 2032 * @src_eb = OO(a) 2033 * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ] 2034 * @dst_level = 0 2035 * @root_level = 1 2036 * 2037 * In that case, qgroup_trace_extent_swap() will search from OO(a) to 2038 * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty. 
*
 * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
 *
 * 1) Tree search from @src_eb
 *    It should act as a simplified btrfs_search_slot().
 *    The key for search can be extracted from @dst_path->nodes[dst_level]
 *    (first key).
 *
 * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
 *    NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
 *    They should be marked during previous (@dst_level = 1) iteration.
 *
 * 3) Mark file extents in leaves dirty
 *    We don't have a good way to pick out new file extents only.
 *    So we still follow the old method by scanning all file extents in
 *    the leaf.
 *
 * This function can free us from keeping two paths, thus later we only need
 * to care about how to iterate all new tree blocks in reloc tree.
 *
 * Return -EINVAL if the level of @src_eb is not @root_level, -ENOMEM if the
 * source path cannot be allocated, -ENOENT if a key in the source tree does
 * not match the key at the same slot of @dst_path, or the error of reading a
 * tree block or tracing an extent.
 */
static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
				    struct extent_buffer *src_eb,
				    struct btrfs_path *dst_path,
				    int dst_level, int root_level,
				    bool trace_leaf)
{
	struct btrfs_key key;
	struct btrfs_path *src_path;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	u32 nodesize = fs_info->nodesize;
	int cur_level = root_level;
	int ret;

	BUG_ON(dst_level > root_level);
	/* Level mismatch */
	if (btrfs_header_level(src_eb) != root_level)
		return -EINVAL;

	src_path = btrfs_alloc_path();
	if (!src_path) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * First key of the destination block.
	 * NOTE(review): @key is filled in here but not read again anywhere
	 * in this function.
	 */
	if (dst_level)
		btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
	else
		btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);

	/* For src_path */
	/* Extra reference on @src_eb for its slot in @src_path. */
	atomic_inc(&src_eb->refs);
	src_path->nodes[root_level] = src_eb;
	src_path->slots[root_level] = dst_path->slots[root_level];
	src_path->locks[root_level] = 0;

	/* A simplified version of btrfs_search_slot() */
	while (cur_level >= dst_level) {
		struct btrfs_key src_key;
		struct btrfs_key dst_key;

		/* Read and read-lock the child selected one level up. */
		if (src_path->nodes[cur_level] == NULL) {
			struct extent_buffer *eb;
			int parent_slot;

			eb = src_path->nodes[cur_level + 1];
			parent_slot = src_path->slots[cur_level + 1];

			eb = btrfs_read_node_slot(eb, parent_slot);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				goto out;
			}

			src_path->nodes[cur_level] = eb;

			btrfs_tree_read_lock(eb);
			src_path->locks[cur_level] = BTRFS_READ_LOCK;
		}

		/* Follow the same slots as @dst_path on every level. */
		src_path->slots[cur_level] = dst_path->slots[cur_level];
		if (cur_level) {
			btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
					&dst_key, dst_path->slots[cur_level]);
			btrfs_node_key_to_cpu(src_path->nodes[cur_level],
					&src_key, src_path->slots[cur_level]);
		} else {
			btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
					&dst_key, dst_path->slots[cur_level]);
			btrfs_item_key_to_cpu(src_path->nodes[cur_level],
					&src_key, src_path->slots[cur_level]);
		}
		/* Content mismatch, something went wrong */
		if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
			ret = -ENOENT;
			goto out;
		}
		cur_level--;
	}

	/*
	 * Now both @dst_path and @src_path have been populated, record the tree
	 * blocks for qgroup accounting.
	 */
	ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
					nodesize);
	if (ret < 0)
		goto out;
	ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start,
					nodesize);
	if (ret < 0)
		goto out;

	/* Record leaf file extents */
	if (dst_level == 0 && trace_leaf) {
		ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
		if (ret < 0)
			goto out;
		ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
	}
out:
	/* Also reached with a NULL @src_path when the allocation failed. */
	btrfs_free_path(src_path);
	return ret;
}

/*
 * Helper function to do recursive generation-aware depth-first search, to
 * locate all new tree blocks in a subtree of reloc tree.
 *
 * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
 *         reloc tree
 * L2         NN (a)
 *          /    \
 * L1    OO        NN (b)
 *      /  \      /  \
 * L0  OO  OO    OO  NN
 *               (c) (d)
 * If we pass:
 * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
 * @cur_level = 1
 * @root_level = 1
 *
 * We will iterate through tree blocks NN(b), NN(d) and inform qgroup to trace
 * above tree blocks along with their counterparts in file tree.
 * During the search, old tree blocks like OO(c) will be skipped as tree block
 * swap won't affect OO(c).
*/
static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
					   struct extent_buffer *src_eb,
					   struct btrfs_path *dst_path,
					   int cur_level, int root_level,
					   u64 last_snapshot, bool trace_leaf)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct extent_buffer *eb;
	/* Set only when this invocation read and locked nodes[cur_level] itself */
	bool need_cleanup = false;
	int ret = 0;
	int i;

	/* Level sanity check */
	if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 ||
	    root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 ||
	    root_level < cur_level) {
		btrfs_err_rl(fs_info,
			"%s: bad levels, cur_level=%d root_level=%d",
			__func__, cur_level, root_level);
		return -EUCLEAN;
	}

	/* Read the tree block if needed */
	if (dst_path->nodes[cur_level] == NULL) {
		int parent_slot;
		u64 child_gen;

		/*
		 * dst_path->nodes[root_level] must be initialized before
		 * calling this function.
		 */
		if (cur_level == root_level) {
			btrfs_err_rl(fs_info,
	"%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
				__func__, root_level, root_level, cur_level);
			return -EUCLEAN;
		}

		/*
		 * We need to get child blockptr/gen from parent before we can
		 * read it.
		 */
		eb = dst_path->nodes[cur_level + 1];
		parent_slot = dst_path->slots[cur_level + 1];
		child_gen = btrfs_node_ptr_generation(eb, parent_slot);

		/* This node is old, no need to trace */
		if (child_gen < last_snapshot)
			goto out;

		eb = btrfs_read_node_slot(eb, parent_slot);
		if (IS_ERR(eb)) {
			ret = PTR_ERR(eb);
			goto out;
		}

		dst_path->nodes[cur_level] = eb;
		dst_path->slots[cur_level] = 0;

		btrfs_tree_read_lock(eb);
		dst_path->locks[cur_level] = BTRFS_READ_LOCK;
		need_cleanup = true;
	}

	/* Now record this tree block and its counter part for qgroups */
	ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
				       root_level, trace_leaf);
	if (ret < 0)
		goto cleanup;

	eb = dst_path->nodes[cur_level];

	if (cur_level > 0) {
		/* Iterate all child tree blocks */
		for (i = 0; i < btrfs_header_nritems(eb); i++) {
			/* Skip old tree blocks as they won't be swapped */
			if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
				continue;
			dst_path->slots[cur_level] = i;

			/* Recursive call (at most 7 times) */
			ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
					dst_path, cur_level - 1, root_level,
					last_snapshot, trace_leaf);
			if (ret < 0)
				goto cleanup;
		}
	}

cleanup:
	if (need_cleanup) {
		/*
		 * Clean up: unlock and drop the block read above and clear
		 * this level of the path again.
		 */
		btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
				     dst_path->locks[cur_level]);
		free_extent_buffer(dst_path->nodes[cur_level]);
		dst_path->nodes[cur_level] = NULL;
		dst_path->slots[cur_level] = 0;
		dst_path->locks[cur_level] = 0;
	}
out:
	return ret;
}

/*
 * Trace every new tree block (generation >= @last_snapshot) in the subtree
 * rooted at @dst_eb, together with its counterpart reached through @src_eb.
 *
 * @src_eb must not have a newer generation than @dst_eb, and both buffers
 * must be uptodate.
 *
 * Returns 0 on success or when quota is disabled. A wrong parameter order
 * returns -EUCLEAN directly; every failure that leaves through the out label
 * (-EIO, -ENOMEM, errors from the walk) also marks qgroups inconsistent.
 */
static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
				struct extent_buffer *src_eb,
				struct extent_buffer *dst_eb,
				u64 last_snapshot, bool trace_leaf)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_path *dst_path = NULL;
	int level;
	int ret;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;

	/* Wrong parameter order */
	if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) {
		btrfs_err_rl(fs_info,
		"%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
			     btrfs_header_generation(src_eb),
			     btrfs_header_generation(dst_eb));
		return -EUCLEAN;
	}

	if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
		ret = -EIO;
		goto out;
	}

	level = btrfs_header_level(dst_eb);
	dst_path = btrfs_alloc_path();
	if (!dst_path) {
		ret = -ENOMEM;
		goto out;
	}
	/* For dst_path: the path holds its own reference on dst_eb */
	atomic_inc(&dst_eb->refs);
	dst_path->nodes[level] = dst_eb;
	dst_path->slots[level] = 0;
	dst_path->locks[level] = 0;

	/* Do the generation aware depth-first search */
	ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
					      level, last_snapshot, trace_leaf);
	if (ret < 0)
		goto out;
	ret = 0;

out:
	/* dst_path is still NULL on the early -EIO/-ENOMEM paths */
	btrfs_free_path(dst_path);
	if (ret < 0)
		qgroup_mark_inconsistent(fs_info);
	return ret;
}

/*
 * Trace all tree blocks below @root_eb and the items of every leaf in that
 * subtree.
 *
 * @root_gen and @root_level are used to verify @root_eb when it still has to
 * be read from disk.
 *
 * Returns 0 on success, when quota is disabled, or when @root_level reaches
 * the drop-subtree threshold (in that case qgroups are only marked
 * inconsistent). Returns a negative errno on read, allocation or tracing
 * failure.
 */
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
			       struct extent_buffer *root_eb,
			       u64 root_gen, int root_level)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret = 0;
	int level;
	u8 drop_subptree_thres;
	struct extent_buffer *eb = root_eb;
	struct btrfs_path *path = NULL;

	BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
	BUG_ON(root_eb == NULL);

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;

	/* Snapshot the threshold under qgroup_lock */
	spin_lock(&fs_info->qgroup_lock);
	drop_subptree_thres = fs_info->qgroup_drop_subtree_thres;
	spin_unlock(&fs_info->qgroup_lock);

	/*
	 * This function only gets called for snapshot drop, if we hit a high
	 * node here, it means we are going to change ownership for quite a lot
	 * of extents, which will greatly slow down btrfs_commit_transaction().
	 *
	 * So here if we find a high tree here, we just skip the accounting and
	 * mark qgroup inconsistent.
	 */
	if (root_level >= drop_subptree_thres) {
		qgroup_mark_inconsistent(fs_info);
		return 0;
	}

	if (!extent_buffer_uptodate(root_eb)) {
		struct btrfs_tree_parent_check check = {
			.has_first_key = false,
			.transid = root_gen,
			.level = root_level
		};

		ret = btrfs_read_extent_buffer(root_eb, &check);
		if (ret)
			goto out;
	}

	/* A single leaf needs no path walk */
	if (root_level == 0) {
		ret = btrfs_qgroup_trace_leaf_items(trans, root_eb);
		goto out;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * Walk down the tree.  Missing extent blocks are filled in as
	 * we go. Metadata is accounted every time we read a new
	 * extent block.
	 *
	 * When we reach a leaf, we account for file extent items in it,
	 * walk back up the tree (adjusting slot pointers as we go)
	 * and restart the search process.
	 */
	atomic_inc(&root_eb->refs);	/* For path */
	path->nodes[root_level] = root_eb;
	path->slots[root_level] = 0;
	path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
walk_down:
	level = root_level;
	while (level >= 0) {
		if (path->nodes[level] == NULL) {
			int parent_slot;
			u64 child_bytenr;

			/*
			 * We need to get child blockptr from parent before we
			 * can read it.
			 */
			eb = path->nodes[level + 1];
			parent_slot = path->slots[level + 1];
			child_bytenr = btrfs_node_blockptr(eb, parent_slot);

			eb = btrfs_read_node_slot(eb, parent_slot);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				goto out;
			}

			path->nodes[level] = eb;
			path->slots[level] = 0;

			btrfs_tree_read_lock(eb);
			path->locks[level] = BTRFS_READ_LOCK;

			ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
							fs_info->nodesize);
			if (ret)
				goto out;
		}

		if (level == 0) {
			ret = btrfs_qgroup_trace_leaf_items(trans,
							    path->nodes[level]);
			if (ret)
				goto out;

			/* Nonzero return here means we completed our search */
			ret = adjust_slots_upwards(path, root_level);
			if (ret)
				break;

			/* Restart search with new slots */
			goto walk_down;
		}

		level--;
	}

	/* The nonzero "search completed" value is not an error */
	ret = 0;
out:
	btrfs_free_path(path);

	return ret;
}

/* Values for the update_old argument of qgroup_update_refcnt() */
#define UPDATE_NEW	0
#define UPDATE_OLD	1
/*
 * Walk all of the roots that points to the bytenr and adjust their refcnts.
2471 */ 2472 static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, 2473 struct ulist *roots, struct ulist *tmp, 2474 struct ulist *qgroups, u64 seq, int update_old) 2475 { 2476 struct ulist_node *unode; 2477 struct ulist_iterator uiter; 2478 struct ulist_node *tmp_unode; 2479 struct ulist_iterator tmp_uiter; 2480 struct btrfs_qgroup *qg; 2481 int ret = 0; 2482 2483 if (!roots) 2484 return 0; 2485 ULIST_ITER_INIT(&uiter); 2486 while ((unode = ulist_next(roots, &uiter))) { 2487 qg = find_qgroup_rb(fs_info, unode->val); 2488 if (!qg) 2489 continue; 2490 2491 ulist_reinit(tmp); 2492 ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg), 2493 GFP_ATOMIC); 2494 if (ret < 0) 2495 return ret; 2496 ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC); 2497 if (ret < 0) 2498 return ret; 2499 ULIST_ITER_INIT(&tmp_uiter); 2500 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { 2501 struct btrfs_qgroup_list *glist; 2502 2503 qg = unode_aux_to_qgroup(tmp_unode); 2504 if (update_old) 2505 btrfs_qgroup_update_old_refcnt(qg, seq, 1); 2506 else 2507 btrfs_qgroup_update_new_refcnt(qg, seq, 1); 2508 list_for_each_entry(glist, &qg->groups, next_group) { 2509 ret = ulist_add(qgroups, glist->group->qgroupid, 2510 qgroup_to_aux(glist->group), 2511 GFP_ATOMIC); 2512 if (ret < 0) 2513 return ret; 2514 ret = ulist_add(tmp, glist->group->qgroupid, 2515 qgroup_to_aux(glist->group), 2516 GFP_ATOMIC); 2517 if (ret < 0) 2518 return ret; 2519 } 2520 } 2521 } 2522 return 0; 2523 } 2524 2525 /* 2526 * Update qgroup rfer/excl counters. 2527 * Rfer update is easy, codes can explain themselves. 2528 * 2529 * Excl update is tricky, the update is split into 2 parts. 
* Part 1: Possible exclusive <-> sharing detect:
 *       |       A       |       !A      |
 *  -------------------------------------
 *  B    |       *       |       -       |
 *  -------------------------------------
 *  !B   |       +       |       **      |
 *  -------------------------------------
 *
 * Conditions:
 * A:	cur_old_count < nr_old_roots	(not exclusive before)
 * !A:	cur_old_count == nr_old_roots	(possibly exclusive before)
 * B:	cur_new_count < nr_new_roots	(not exclusive now)
 * !B:	cur_new_count == nr_new_roots	(possibly exclusive now)
 *
 * Results:
 * +: Possible sharing -> exclusive	-: Possible exclusive -> sharing
 * *: Definitely not changed.		**: Possibly unchanged.
 *
 * For the !A and !B conditions, the exception is the
 * cur_old_count/cur_new_count == 0 case.
 *
 * To make the logic clear, we first use conditions A and B to split the
 * combinations into 4 results.
 *
 * Then, for results "+" and "-", check the old/new count == 0 case, as in
 * them only one variant may be 0.
 *
 * Lastly, check result **: since there are 2 variants that may be 0, split
 * them again (2x2).
 * But this time we don't need to consider anything else, so the code and
 * logic are easy to understand.
*/
static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
				  struct ulist *qgroups,
				  u64 nr_old_roots,
				  u64 nr_new_roots,
				  u64 num_bytes, u64 seq)
{
	struct ulist_node *unode;
	struct ulist_iterator uiter;
	struct btrfs_qgroup *qg;
	u64 cur_new_count, cur_old_count;

	ULIST_ITER_INIT(&uiter);
	while ((unode = ulist_next(qgroups, &uiter))) {
		/* Only queue the qgroup for writeback if a counter changed */
		bool dirty = false;

		qg = unode_aux_to_qgroup(unode);
		cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
		cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);

		trace_qgroup_update_counters(fs_info, qg, cur_old_count,
					     cur_new_count);

		/* Rfer update part */
		if (cur_old_count == 0 && cur_new_count > 0) {
			qg->rfer += num_bytes;
			qg->rfer_cmpr += num_bytes;
			dirty = true;
		}
		if (cur_old_count > 0 && cur_new_count == 0) {
			qg->rfer -= num_bytes;
			qg->rfer_cmpr -= num_bytes;
			dirty = true;
		}

		/* Excl update part */
		/* Exclusive/none -> shared case */
		if (cur_old_count == nr_old_roots &&
		    cur_new_count < nr_new_roots) {
			/* Exclusive -> shared */
			if (cur_old_count != 0) {
				qg->excl -= num_bytes;
				qg->excl_cmpr -= num_bytes;
				dirty = true;
			}
		}

		/* Shared -> exclusive/none case */
		if (cur_old_count < nr_old_roots &&
		    cur_new_count == nr_new_roots) {
			/* Shared->exclusive */
			if (cur_new_count != 0) {
				qg->excl += num_bytes;
				qg->excl_cmpr += num_bytes;
				dirty = true;
			}
		}

		/* Exclusive/none -> exclusive/none case */
		if (cur_old_count == nr_old_roots &&
		    cur_new_count == nr_new_roots) {
			if (cur_old_count == 0) {
				/* None -> exclusive/none */

				if (cur_new_count != 0) {
					/* None -> exclusive */
					qg->excl += num_bytes;
					qg->excl_cmpr += num_bytes;
					dirty = true;
				}
				/* None -> none, nothing changed */
			} else {
				/* Exclusive -> exclusive/none */

				if (cur_new_count == 0) {
					/* Exclusive -> none */
					qg->excl -= num_bytes;
					qg->excl_cmpr -= num_bytes;
					dirty = true;
				}
				/* Exclusive -> exclusive, nothing changed */
			}
		}

		if (dirty)
			qgroup_dirty(fs_info, qg);
	}
	/* Always 0: nothing in the loop can fail */
	return 0;
}

/*
 * Check if the @roots potentially is a list of fs tree roots
 *
 * Only the first entry of the list is inspected.
 *
 * Return 0 for definitely not a fs/subvol tree roots ulist
 * Return 1 for possible fs/subvol tree roots in the list (considering an empty
 *          one as well)
 */
static int maybe_fs_roots(struct ulist *roots)
{
	struct ulist_node *unode;
	struct ulist_iterator uiter;

	/* Empty one, still possible for fs roots */
	if (!roots || roots->nnodes == 0)
		return 1;

	ULIST_ITER_INIT(&uiter);
	unode = ulist_next(roots, &uiter);
	if (!unode)
		return 1;

	/*
	 * If it contains fs tree roots, then it must belong to fs/subvol
	 * trees.
	 * If it contains a non-fs tree, it won't be shared with fs/subvol trees.
	 */
	return is_fstree(unode->val);
}

/*
 * Account one extent of @num_bytes at @bytenr, given the roots referring to
 * it before (@old_roots) and after (@new_roots) the change.
 *
 * Both @old_roots and @new_roots are freed here on every path, including the
 * early exits, so the caller must not use them afterwards.
 *
 * Returns 0 on success or when there is nothing to account, -ENOMEM when a
 * temporary ulist cannot be allocated, or the negative error returned by
 * qgroup_update_refcnt().
 */
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
				u64 num_bytes, struct ulist *old_roots,
				struct ulist *new_roots)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct ulist *qgroups = NULL;
	struct ulist *tmp = NULL;
	u64 seq;
	u64 nr_new_roots = 0;
	u64 nr_old_roots = 0;
	int ret = 0;

	/*
	 * If quotas get disabled meanwhile, the resources need to be freed and
	 * we can't just exit here.
	 */
	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
	    fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
		goto out_free;

	if (new_roots) {
		if (!maybe_fs_roots(new_roots))
			goto out_free;
		nr_new_roots = new_roots->nnodes;
	}
	if (old_roots) {
		if (!maybe_fs_roots(old_roots))
			goto out_free;
		nr_old_roots = old_roots->nnodes;
	}

	/* Quick exit, either not fs tree roots, or won't affect any qgroup */
	if (nr_old_roots == 0 && nr_new_roots == 0)
		goto out_free;

	BUG_ON(!fs_info->quota_root);

	trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr,
					num_bytes, nr_old_roots, nr_new_roots);

	/* Allocate before taking qgroup_lock so GFP_NOFS may be used */
	qgroups = ulist_alloc(GFP_NOFS);
	if (!qgroups) {
		ret = -ENOMEM;
		goto out_free;
	}
	tmp = ulist_alloc(GFP_NOFS);
	if (!tmp) {
		ret = -ENOMEM;
		goto out_free;
	}

	/*
	 * While a rescan is running, extents at or beyond its progress key
	 * are not accounted here.
	 * NOTE(review): presumably the rescan accounts them when it reaches
	 * them - confirm.
	 */
	mutex_lock(&fs_info->qgroup_rescan_lock);
	if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
		if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
			mutex_unlock(&fs_info->qgroup_rescan_lock);
			ret = 0;
			goto out_free;
		}
	}
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	spin_lock(&fs_info->qgroup_lock);
	seq = fs_info->qgroup_seq;

	/* Update old refcnts using old_roots */
	ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
				   UPDATE_OLD);
	if (ret < 0)
		goto out;

	/* Update new refcnts using new_roots */
	ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
				   UPDATE_NEW);
	if (ret < 0)
		goto out;

	qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
			       num_bytes, seq);

	/*
	 * Bump qgroup_seq to avoid seq overlap
	 */
	fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
out:
	spin_unlock(&fs_info->qgroup_lock);
out_free:
	ulist_free(tmp);
	ulist_free(qgroups);
	ulist_free(old_roots);
	ulist_free(new_roots);
	return ret;
}

/*
 * Drain the dirty extent records of the current transaction and account each
 * of them through btrfs_qgroup_account_extent().
 *
 * Every record is erased from the tree and freed, even after a failure; once
 * an error has been recorded the remaining records are dropped without being
 * accounted. Returns the last value stored in ret: 0 or a negative errno.
 */
int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup_extent_record *record;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct ulist *new_roots = NULL;
	struct rb_node *node;
	u64 num_dirty_extents = 0;
	u64 qgroup_to_skip;
	int ret = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	qgroup_to_skip = delayed_refs->qgroup_to_skip;
	while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
		record = rb_entry(node, struct btrfs_qgroup_extent_record,
				  node);

		num_dirty_extents++;
		trace_btrfs_qgroup_account_extents(fs_info, record);

		if (!ret && !(fs_info->qgroup_flags &
			      BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) {
			struct btrfs_backref_walk_ctx ctx = { 0 };

			ctx.bytenr = record->bytenr;
			ctx.fs_info = fs_info;

			/*
			 * Old roots should be searched when inserting qgroup
			 * extent record.
			 *
			 * But for INCONSISTENT (NO_ACCOUNTING) -> rescan case,
			 * we may have some record inserted during
			 * NO_ACCOUNTING (thus no old_roots populated), but
			 * later we start rescan, which clears NO_ACCOUNTING,
			 * leaving some inserted records without old_roots
			 * populated.
			 *
			 * Those cases are rare and should not cause too much
			 * time spent during commit_transaction().
			 */
			if (!record->old_roots) {
				/* Search commit root to find old_roots */
				ret = btrfs_find_all_roots(&ctx, false);
				if (ret < 0)
					goto cleanup;
				record->old_roots = ctx.roots;
				ctx.roots = NULL;
			}

			/* Free the reserved data space */
			btrfs_qgroup_free_refroot(fs_info,
					record->data_rsv_refroot,
					record->data_rsv,
					BTRFS_QGROUP_RSV_DATA);
			/*
			 * Use BTRFS_SEQ_LAST as time_seq to do special search,
			 * which doesn't lock tree or delayed_refs and search
			 * current root. It's safe inside commit_transaction().
			 */
			ctx.trans = trans;
			ctx.time_seq = BTRFS_SEQ_LAST;
			ret = btrfs_find_all_roots(&ctx, false);
			if (ret < 0)
				goto cleanup;
			new_roots = ctx.roots;
			if (qgroup_to_skip) {
				ulist_del(new_roots, qgroup_to_skip, 0);
				ulist_del(record->old_roots, qgroup_to_skip,
					  0);
			}
			ret = btrfs_qgroup_account_extent(trans, record->bytenr,
							  record->num_bytes,
							  record->old_roots,
							  new_roots);
			/*
			 * btrfs_qgroup_account_extent() freed both lists,
			 * forget them so cleanup does not free them again.
			 */
			record->old_roots = NULL;
			new_roots = NULL;
		}
cleanup:
		ulist_free(record->old_roots);
		ulist_free(new_roots);
		new_roots = NULL;
		rb_erase(node, &delayed_refs->dirty_extent_root);
		kfree(record);

	}
	trace_qgroup_num_dirty_extents(fs_info, trans->transid,
				       num_dirty_extents);
	return ret;
}

/*
 * Writes all changed qgroups to disk.
 * Called by the transaction commit path and the qgroup assign ioctl.
 *
 * Failures to update an individual info or limit item only mark qgroups
 * inconsistent; the value returned is that of the final status item update.
 */
int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret = 0;

	/*
	 * In case we are called from the qgroup assign ioctl, assert that we
	 * are holding the qgroup_ioctl_lock, otherwise we can race with a quota
	 * disable operation (ioctl) and access a freed quota root.
	 */
	if (trans->transaction->state != TRANS_STATE_COMMIT_DOING)
		lockdep_assert_held(&fs_info->qgroup_ioctl_lock);

	if (!fs_info->quota_root)
		return ret;

	spin_lock(&fs_info->qgroup_lock);
	while (!list_empty(&fs_info->dirty_qgroups)) {
		struct btrfs_qgroup *qgroup;
		qgroup = list_first_entry(&fs_info->dirty_qgroups,
					  struct btrfs_qgroup, dirty);
		list_del_init(&qgroup->dirty);
		/* The item updates run with qgroup_lock dropped */
		spin_unlock(&fs_info->qgroup_lock);
		ret = update_qgroup_info_item(trans, qgroup);
		if (ret)
			qgroup_mark_inconsistent(fs_info);
		ret = update_qgroup_limit_item(trans, qgroup);
		if (ret)
			qgroup_mark_inconsistent(fs_info);
		spin_lock(&fs_info->qgroup_lock);
	}
	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON;
	else
		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
	spin_unlock(&fs_info->qgroup_lock);

	ret = update_qgroup_status_item(trans);
	if (ret)
		qgroup_mark_inconsistent(fs_info);

	return ret;
}

/*
 * Copy the accounting information between qgroups. This is necessary
 * when a snapshot or a subvolume is created. Throwing an error will
 * cause a transaction abort so we take extra care here to only error
 * when a readonly fs is a reasonable outcome.
 */
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
			 u64 objectid, struct btrfs_qgroup_inherit *inherit)
{
	int ret = 0;
	int i;
	u64 *i_qgroups;
	bool committing = false;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root;
	struct btrfs_qgroup *srcgroup;
	struct btrfs_qgroup *dstgroup;
	bool need_rescan = false;
	u32 level_size = 0;
	u64 nums;

	/*
	 * There are only two callers of this function.
	 *
	 * One in create_subvol() in the ioctl context, which needs to hold
	 * the qgroup_ioctl_lock.
	 *
	 * The other one in create_pending_snapshot() where no other qgroup
	 * code can modify the fs as they all need to either start a new trans
	 * or hold a trans handler, thus we don't need to hold
	 * qgroup_ioctl_lock.
	 * This would avoid long and complex lock chain and make lockdep happy.
	 */
	spin_lock(&fs_info->trans_lock);
	if (trans->transaction->state == TRANS_STATE_COMMIT_DOING)
		committing = true;
	spin_unlock(&fs_info->trans_lock);

	if (!committing)
		mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		goto out;

	quota_root = fs_info->quota_root;
	if (!quota_root) {
		ret = -EINVAL;
		goto out;
	}

	if (inherit) {
		/*
		 * The qgroup ids follow the header directly: first
		 * num_qgroups single ids, then num_ref_copies and
		 * num_excl_copies (src, dst) pairs.
		 */
		i_qgroups = (u64 *)(inherit + 1);
		nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
		       2 * inherit->num_excl_copies;
		for (i = 0; i < nums; ++i) {
			srcgroup = find_qgroup_rb(fs_info, *i_qgroups);

			/*
			 * Zero out invalid groups so we can ignore
			 * them later.
			 */
			if (!srcgroup ||
			    ((srcgroup->qgroupid >> 48) <= (objectid >> 48)))
				*i_qgroups = 0ULL;

			++i_qgroups;
		}
	}

	/*
	 * create a tracking group for the subvol itself
	 */
	ret = add_qgroup_item(trans, quota_root, objectid);
	if (ret)
		goto out;

	/*
	 * add qgroup to all inherited groups
	 */
	if (inherit) {
		i_qgroups = (u64 *)(inherit + 1);
		for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) {
			if (*i_qgroups == 0)
				continue;
			ret = add_qgroup_relation_item(trans, objectid,
						       *i_qgroups);
			if (ret && ret != -EEXIST)
				goto out;
			ret = add_qgroup_relation_item(trans, *i_qgroups,
						       objectid);
			if (ret && ret != -EEXIST)
				goto out;
		}
		/* A leftover -EEXIST from the last relation is not an error */
		ret = 0;
	}


	spin_lock(&fs_info->qgroup_lock);

	dstgroup = add_qgroup_rb(fs_info, objectid);
	if (IS_ERR(dstgroup)) {
		ret = PTR_ERR(dstgroup);
		goto unlock;
	}

	if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) {
		dstgroup->lim_flags = inherit->lim.flags;
		dstgroup->max_rfer = inherit->lim.max_rfer;
		dstgroup->max_excl = inherit->lim.max_excl;
		dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
		dstgroup->rsv_excl = inherit->lim.rsv_excl;

		qgroup_dirty(fs_info, dstgroup);
	}

	if (srcid) {
		srcgroup = find_qgroup_rb(fs_info, srcid);
		if (!srcgroup)
			goto unlock;

		/*
		 * We call inherit after we clone the root in order to make sure
		 * our counts don't go crazy, so at this point the only
		 * difference between the two roots should be the root node.
		 */
		level_size = fs_info->nodesize;
		dstgroup->rfer = srcgroup->rfer;
		dstgroup->rfer_cmpr = srcgroup->rfer_cmpr;
		dstgroup->excl = level_size;
		dstgroup->excl_cmpr = level_size;
		srcgroup->excl = level_size;
		srcgroup->excl_cmpr = level_size;

		/*
		 * inherit the limit info
		 * (this overwrites limits set from @inherit just above)
		 */
		dstgroup->lim_flags = srcgroup->lim_flags;
		dstgroup->max_rfer = srcgroup->max_rfer;
		dstgroup->max_excl = srcgroup->max_excl;
		dstgroup->rsv_rfer = srcgroup->rsv_rfer;
		dstgroup->rsv_excl = srcgroup->rsv_excl;

		qgroup_dirty(fs_info, dstgroup);
		qgroup_dirty(fs_info, srcgroup);
	}

	if (!inherit)
		goto unlock;

	i_qgroups = (u64 *)(inherit + 1);
	for (i = 0; i < inherit->num_qgroups; ++i) {
		if (*i_qgroups) {
			ret = add_relation_rb(fs_info, objectid, *i_qgroups);
			if (ret)
				goto unlock;
		}
		++i_qgroups;

		/*
		 * If we're doing a snapshot, and adding the snapshot to a new
		 * qgroup, the numbers are guaranteed to be incorrect.
		 */
		if (srcid)
			need_rescan = true;
	}

	/* i_qgroups now points at the (src, dst) pairs for rfer copies */
	for (i = 0; i <  inherit->num_ref_copies; ++i, i_qgroups += 2) {
		struct btrfs_qgroup *src;
		struct btrfs_qgroup *dst;

		if (!i_qgroups[0] || !i_qgroups[1])
			continue;

		src = find_qgroup_rb(fs_info, i_qgroups[0]);
		dst = find_qgroup_rb(fs_info, i_qgroups[1]);

		if (!src || !dst) {
			ret = -EINVAL;
			goto unlock;
		}

		dst->rfer = src->rfer - level_size;
		dst->rfer_cmpr = src->rfer_cmpr - level_size;

		/* Manually tweaking numbers certainly needs a rescan */
		need_rescan = true;
	}
	/* ... followed by the (src, dst) pairs for excl copies */
	for (i = 0; i <  inherit->num_excl_copies; ++i, i_qgroups += 2) {
		struct btrfs_qgroup *src;
		struct btrfs_qgroup *dst;

		if (!i_qgroups[0] || !i_qgroups[1])
			continue;

		src = find_qgroup_rb(fs_info, i_qgroups[0]);
		dst = find_qgroup_rb(fs_info, i_qgroups[1]);

		if (!src || !dst) {
			ret = -EINVAL;
			goto unlock;
		}

		dst->excl = src->excl + level_size;
		dst->excl_cmpr = src->excl_cmpr + level_size;
		need_rescan = true;
	}

unlock:
	spin_unlock(&fs_info->qgroup_lock);
	/* Only reached with a valid dstgroup when ret is still 0 */
	if (!ret)
		ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup);
out:
	if (!committing)
		mutex_unlock(&fs_info->qgroup_ioctl_lock);
	if (need_rescan)
		qgroup_mark_inconsistent(fs_info);
	return ret;
}

/*
 * Return true if adding @num_bytes on top of the current reservations and
 * usage of @qg stays within each of the max_rfer/max_excl limits enabled in
 * its lim_flags, false as soon as one enabled limit would be exceeded.
 */
static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
{
	if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
	    qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
		return false;

	if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) &&
	    qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
		return false;

	return true;
}

/*
 * Reserve @num_bytes of @type in the qgroup of @root and in every qgroup
 * reachable from it through the ->groups lists.
 *
 * Limits are checked for all of those qgroups first (unless @enforce is false
 * or overridden below); the reservation is recorded only if none is exceeded.
 *
 * Returns 0 on success or when there is nothing to reserve (non-fs tree, zero
 * bytes, no quota root, no qgroup for the root), -EDQUOT when a limit would be
 * exceeded.
 */
static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
			  enum btrfs_qgroup_rsv_type type)
{
	struct btrfs_qgroup *qgroup;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 ref_root = root->root_key.objectid;
	int ret = 0;
	LIST_HEAD(qgroup_list);

	if (!is_fstree(ref_root))
		return 0;

	if (num_bytes == 0)
		return 0;

	/* The quota override flag lets CAP_SYS_RESOURCE tasks skip limits */
	if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) &&
	    capable(CAP_SYS_RESOURCE))
		enforce = false;

	spin_lock(&fs_info->qgroup_lock);
	if (!fs_info->quota_root)
		goto out;

	qgroup = find_qgroup_rb(fs_info, ref_root);
	if (!qgroup)
		goto out;

	/* The list grows while it is walked, pulling in the parent groups */
	qgroup_iterator_add(&qgroup_list, qgroup);
	list_for_each_entry(qgroup, &qgroup_list, iterator) {
		struct btrfs_qgroup_list *glist;

		if (enforce && !qgroup_check_limits(qgroup, num_bytes)) {
			ret = -EDQUOT;
			goto out;
		}

		list_for_each_entry(glist, &qgroup->groups, next_group)
			qgroup_iterator_add(&qgroup_list, glist->group);
	}

	ret = 0;
	/*
	 * no limits exceeded, now record the reservation into all qgroups
	 */
	list_for_each_entry(qgroup, &qgroup_list, iterator)
		qgroup_rsv_add(fs_info, qgroup, num_bytes, type);

out:
	qgroup_iterator_clean(&qgroup_list);
	spin_unlock(&fs_info->qgroup_lock);
	return ret;
}

/*
 * Free @num_bytes of reserved space with @type for qgroup.  (Normally level 0
 * qgroup).
 *
 * Will handle all higher level qgroup too.
 *
 * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup.
 * This special case is only used for META_PERTRANS type.
3201 */ 3202 void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, 3203 u64 ref_root, u64 num_bytes, 3204 enum btrfs_qgroup_rsv_type type) 3205 { 3206 struct btrfs_qgroup *qgroup; 3207 struct ulist_node *unode; 3208 struct ulist_iterator uiter; 3209 int ret = 0; 3210 3211 if (!is_fstree(ref_root)) 3212 return; 3213 3214 if (num_bytes == 0) 3215 return; 3216 3217 if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) { 3218 WARN(1, "%s: Invalid type to free", __func__); 3219 return; 3220 } 3221 spin_lock(&fs_info->qgroup_lock); 3222 3223 if (!fs_info->quota_root) 3224 goto out; 3225 3226 qgroup = find_qgroup_rb(fs_info, ref_root); 3227 if (!qgroup) 3228 goto out; 3229 3230 if (num_bytes == (u64)-1) 3231 /* 3232 * We're freeing all pertrans rsv, get reserved value from 3233 * level 0 qgroup as real num_bytes to free. 3234 */ 3235 num_bytes = qgroup->rsv.values[type]; 3236 3237 ulist_reinit(fs_info->qgroup_ulist); 3238 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, 3239 qgroup_to_aux(qgroup), GFP_ATOMIC); 3240 if (ret < 0) 3241 goto out; 3242 ULIST_ITER_INIT(&uiter); 3243 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { 3244 struct btrfs_qgroup *qg; 3245 struct btrfs_qgroup_list *glist; 3246 3247 qg = unode_aux_to_qgroup(unode); 3248 3249 qgroup_rsv_release(fs_info, qg, num_bytes, type); 3250 3251 list_for_each_entry(glist, &qg->groups, next_group) { 3252 ret = ulist_add(fs_info->qgroup_ulist, 3253 glist->group->qgroupid, 3254 qgroup_to_aux(glist->group), GFP_ATOMIC); 3255 if (ret < 0) 3256 goto out; 3257 } 3258 } 3259 3260 out: 3261 spin_unlock(&fs_info->qgroup_lock); 3262 } 3263 3264 /* 3265 * Check if the leaf is the last leaf. Which means all node pointers 3266 * are at their last position. 
*/
static bool is_last_leaf(struct btrfs_path *path)
{
	int i;

	/* Start at level 1: only the node slots above the leaf matter */
	for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
		if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1)
			return false;
	}
	return true;
}

/*
 * returns < 0 on error, 0 when more leafs are to be scanned.
 * returns 1 when done.
 */
static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
			      struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *extent_root;
	struct btrfs_key found;
	struct extent_buffer *scratch_leaf = NULL;
	u64 num_bytes;
	bool done;
	int slot;
	int ret;

	mutex_lock(&fs_info->qgroup_rescan_lock);
	extent_root = btrfs_extent_root(fs_info,
				fs_info->qgroup_rescan_progress.objectid);
	ret = btrfs_search_slot_for_read(extent_root,
					 &fs_info->qgroup_rescan_progress,
					 path, 1, 0);

	btrfs_debug(fs_info,
		"current progress key (%llu %u %llu), search_slot ret %d",
		fs_info->qgroup_rescan_progress.objectid,
		fs_info->qgroup_rescan_progress.type,
		fs_info->qgroup_rescan_progress.offset, ret);

	if (ret) {
		/*
		 * The rescan is about to end, we will not be scanning any
		 * further blocks. We cannot unset the RESCAN flag here, because
		 * we want to commit the transaction if everything went well.
		 * To make the live accounting work in this phase, we set our
		 * scan progress pointer such that every real extent objectid
		 * will be smaller.
		 */
		fs_info->qgroup_rescan_progress.objectid = (u64)-1;
		btrfs_release_path(path);
		mutex_unlock(&fs_info->qgroup_rescan_lock);
		return ret;
	}
	done = is_last_leaf(path);

	/* Advance the progress key past the last item of this leaf */
	btrfs_item_key_to_cpu(path->nodes[0], &found,
			      btrfs_header_nritems(path->nodes[0]) - 1);
	fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;

	/* Work on a private copy so the path can be released before the walk */
	scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
	if (!scratch_leaf) {
		ret = -ENOMEM;
		mutex_unlock(&fs_info->qgroup_rescan_lock);
		goto out;
	}
	slot = path->slots[0];
	btrfs_release_path(path);
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
		struct btrfs_backref_walk_ctx ctx = { 0 };

		btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
		if (found.type != BTRFS_EXTENT_ITEM_KEY &&
		    found.type != BTRFS_METADATA_ITEM_KEY)
			continue;
		/* Metadata items are accounted as one node, not by key offset */
		if (found.type == BTRFS_METADATA_ITEM_KEY)
			num_bytes = fs_info->nodesize;
		else
			num_bytes = found.offset;

		ctx.bytenr = found.objectid;
		ctx.fs_info = fs_info;

		ret = btrfs_find_all_roots(&ctx, false);
		if (ret < 0)
			goto out;
		/* For rescan, just pass old_roots as NULL */
		ret = btrfs_qgroup_account_extent(trans, found.objectid,
						  num_bytes, NULL, ctx.roots);
		if (ret < 0)
			goto out;
	}
out:
	if (scratch_leaf)
		free_extent_buffer(scratch_leaf);

	if (done && !ret) {
		ret = 1;
		fs_info->qgroup_rescan_progress.objectid = (u64)-1;
	}
	return ret;
}

/*
 * Return true when the rescan loop must stop early: the filesystem is
 * closing or remounting, quota got disabled, or a cancel was requested.
 */
static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
{
	return btrfs_fs_closing(fs_info) ||
		test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
		!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
		  fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
}

/*
 * Work function of a qgroup rescan.
 *
 * Calls qgroup_rescan_leaf() once per transaction until it reports
 * completion (> 0), fails (< 0) or rescan_should_stop() fires, then updates
 * the INCONSISTENT and RESCAN flags, writes the status item if any leaf was
 * scanned, and wakes everyone waiting on qgroup_rescan_completion.
 */
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
{
	struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info,
						     qgroup_rescan_work);
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans = NULL;
	int err = -ENOMEM;
	int ret = 0;
	bool stopped = false;
	bool did_leaf_rescans = false;

	path = btrfs_alloc_path();
	if (!path)
		goto out;
	/*
	 * Rescan should only search for commit root, and any later difference
	 * should be recorded by qgroup
	 */
	path->search_commit_root = 1;
	path->skip_locking = 1;

	err = 0;
	while (!err && !(stopped = rescan_should_stop(fs_info))) {
		trans = btrfs_start_transaction(fs_info->fs_root, 0);
		if (IS_ERR(trans)) {
			err = PTR_ERR(trans);
			break;
		}

		err = qgroup_rescan_leaf(trans, path);
		did_leaf_rescans = true;

		/* err > 0 means the scan finished: commit the final state */
		if (err > 0)
			btrfs_commit_transaction(trans);
		else
			btrfs_end_transaction(trans);
	}

out:
	btrfs_free_path(path);

	mutex_lock(&fs_info->qgroup_rescan_lock);
	if (err > 0 &&
	    fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	} else if (err < 0 || stopped) {
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	}
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	/*
	 * Only update status, since the previous part has already updated the
	 * qgroup info, and only if we did any actual work. This also prevents
	 * race with a concurrent quota disable, which has already set
	 * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at
	 * btrfs_quota_disable().
	 */
	if (did_leaf_rescans) {
		trans = btrfs_start_transaction(fs_info->quota_root, 1);
		if (IS_ERR(trans)) {
			err = PTR_ERR(trans);
			trans = NULL;
			btrfs_err(fs_info,
				  "fail to start transaction for status update: %d",
				  err);
		}
	} else {
		trans = NULL;
	}

	mutex_lock(&fs_info->qgroup_rescan_lock);
	/* A merely paused scan keeps the RESCAN flag; finished or cancelled clears it */
	if (!stopped ||
	    fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN)
		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
	if (trans) {
		ret = update_qgroup_status_item(trans);
		if (ret < 0) {
			err = ret;
			btrfs_err(fs_info, "fail to update qgroup status: %d",
				  err);
		}
	}
	fs_info->qgroup_rescan_running = false;
	fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN;
	complete_all(&fs_info->qgroup_rescan_completion);
	mutex_unlock(&fs_info->qgroup_rescan_lock);

	/* Without a status transaction nothing below is logged */
	if (!trans)
		return;

	btrfs_end_transaction(trans);

	/*
	 * NOTE(review): the CANCEL_RESCAN flag was cleared just above and a
	 * cancel also makes rescan_should_stop() set @stopped, so the
	 * "cancelled" branch looks unreachable from this function - confirm.
	 */
	if (stopped) {
		btrfs_info(fs_info, "qgroup scan paused");
	} else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) {
		btrfs_info(fs_info, "qgroup scan cancelled");
	} else if (err >= 0) {
		btrfs_info(fs_info, "qgroup scan completed%s",
			err > 0 ? " (inconsistency flag cleared)" : "");
	} else {
		btrfs_err(fs_info, "qgroup scan failed with %d", err);
	}
}

/*
 * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all
 * memory required for the rescan context.
3488 */ 3489 static int 3490 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, 3491 int init_flags) 3492 { 3493 int ret = 0; 3494 3495 if (!init_flags) { 3496 /* we're resuming qgroup rescan at mount time */ 3497 if (!(fs_info->qgroup_flags & 3498 BTRFS_QGROUP_STATUS_FLAG_RESCAN)) { 3499 btrfs_warn(fs_info, 3500 "qgroup rescan init failed, qgroup rescan is not queued"); 3501 ret = -EINVAL; 3502 } else if (!(fs_info->qgroup_flags & 3503 BTRFS_QGROUP_STATUS_FLAG_ON)) { 3504 btrfs_warn(fs_info, 3505 "qgroup rescan init failed, qgroup is not enabled"); 3506 ret = -EINVAL; 3507 } 3508 3509 if (ret) 3510 return ret; 3511 } 3512 3513 mutex_lock(&fs_info->qgroup_rescan_lock); 3514 3515 if (init_flags) { 3516 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 3517 btrfs_warn(fs_info, 3518 "qgroup rescan is already in progress"); 3519 ret = -EINPROGRESS; 3520 } else if (!(fs_info->qgroup_flags & 3521 BTRFS_QGROUP_STATUS_FLAG_ON)) { 3522 btrfs_warn(fs_info, 3523 "qgroup rescan init failed, qgroup is not enabled"); 3524 ret = -EINVAL; 3525 } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 3526 /* Quota disable is in progress */ 3527 ret = -EBUSY; 3528 } 3529 3530 if (ret) { 3531 mutex_unlock(&fs_info->qgroup_rescan_lock); 3532 return ret; 3533 } 3534 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3535 } 3536 3537 memset(&fs_info->qgroup_rescan_progress, 0, 3538 sizeof(fs_info->qgroup_rescan_progress)); 3539 fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | 3540 BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); 3541 fs_info->qgroup_rescan_progress.objectid = progress_objectid; 3542 init_completion(&fs_info->qgroup_rescan_completion); 3543 mutex_unlock(&fs_info->qgroup_rescan_lock); 3544 3545 btrfs_init_work(&fs_info->qgroup_rescan_work, 3546 btrfs_qgroup_rescan_worker, NULL, NULL); 3547 return 0; 3548 } 3549 3550 static void 3551 qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info) 3552 { 3553 struct 
rb_node *n; 3554 struct btrfs_qgroup *qgroup; 3555 3556 spin_lock(&fs_info->qgroup_lock); 3557 /* clear all current qgroup tracking information */ 3558 for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { 3559 qgroup = rb_entry(n, struct btrfs_qgroup, node); 3560 qgroup->rfer = 0; 3561 qgroup->rfer_cmpr = 0; 3562 qgroup->excl = 0; 3563 qgroup->excl_cmpr = 0; 3564 qgroup_dirty(fs_info, qgroup); 3565 } 3566 spin_unlock(&fs_info->qgroup_lock); 3567 } 3568 3569 int 3570 btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) 3571 { 3572 int ret = 0; 3573 struct btrfs_trans_handle *trans; 3574 3575 ret = qgroup_rescan_init(fs_info, 0, 1); 3576 if (ret) 3577 return ret; 3578 3579 /* 3580 * We have set the rescan_progress to 0, which means no more 3581 * delayed refs will be accounted by btrfs_qgroup_account_ref. 3582 * However, btrfs_qgroup_account_ref may be right after its call 3583 * to btrfs_find_all_roots, in which case it would still do the 3584 * accounting. 3585 * To solve this, we're committing the transaction, which will 3586 * ensure we run all delayed refs and only after that, we are 3587 * going to clear all tracking information for a clean start. 
3588 */ 3589 3590 trans = btrfs_attach_transaction_barrier(fs_info->fs_root); 3591 if (IS_ERR(trans) && trans != ERR_PTR(-ENOENT)) { 3592 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3593 return PTR_ERR(trans); 3594 } else if (trans != ERR_PTR(-ENOENT)) { 3595 ret = btrfs_commit_transaction(trans); 3596 if (ret) { 3597 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3598 return ret; 3599 } 3600 } 3601 3602 qgroup_rescan_zero_tracking(fs_info); 3603 3604 mutex_lock(&fs_info->qgroup_rescan_lock); 3605 fs_info->qgroup_rescan_running = true; 3606 btrfs_queue_work(fs_info->qgroup_rescan_workers, 3607 &fs_info->qgroup_rescan_work); 3608 mutex_unlock(&fs_info->qgroup_rescan_lock); 3609 3610 return 0; 3611 } 3612 3613 int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, 3614 bool interruptible) 3615 { 3616 int running; 3617 int ret = 0; 3618 3619 mutex_lock(&fs_info->qgroup_rescan_lock); 3620 running = fs_info->qgroup_rescan_running; 3621 mutex_unlock(&fs_info->qgroup_rescan_lock); 3622 3623 if (!running) 3624 return 0; 3625 3626 if (interruptible) 3627 ret = wait_for_completion_interruptible( 3628 &fs_info->qgroup_rescan_completion); 3629 else 3630 wait_for_completion(&fs_info->qgroup_rescan_completion); 3631 3632 return ret; 3633 } 3634 3635 /* 3636 * this is only called from open_ctree where we're still single threaded, thus 3637 * locking is omitted here. 
3638 */ 3639 void 3640 btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) 3641 { 3642 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 3643 mutex_lock(&fs_info->qgroup_rescan_lock); 3644 fs_info->qgroup_rescan_running = true; 3645 btrfs_queue_work(fs_info->qgroup_rescan_workers, 3646 &fs_info->qgroup_rescan_work); 3647 mutex_unlock(&fs_info->qgroup_rescan_lock); 3648 } 3649 } 3650 3651 #define rbtree_iterate_from_safe(node, next, start) \ 3652 for (node = start; node && ({ next = rb_next(node); 1;}); node = next) 3653 3654 static int qgroup_unreserve_range(struct btrfs_inode *inode, 3655 struct extent_changeset *reserved, u64 start, 3656 u64 len) 3657 { 3658 struct rb_node *node; 3659 struct rb_node *next; 3660 struct ulist_node *entry; 3661 int ret = 0; 3662 3663 node = reserved->range_changed.root.rb_node; 3664 if (!node) 3665 return 0; 3666 while (node) { 3667 entry = rb_entry(node, struct ulist_node, rb_node); 3668 if (entry->val < start) 3669 node = node->rb_right; 3670 else 3671 node = node->rb_left; 3672 } 3673 3674 if (entry->val > start && rb_prev(&entry->rb_node)) 3675 entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node, 3676 rb_node); 3677 3678 rbtree_iterate_from_safe(node, next, &entry->rb_node) { 3679 u64 entry_start; 3680 u64 entry_end; 3681 u64 entry_len; 3682 int clear_ret; 3683 3684 entry = rb_entry(node, struct ulist_node, rb_node); 3685 entry_start = entry->val; 3686 entry_end = entry->aux; 3687 entry_len = entry_end - entry_start + 1; 3688 3689 if (entry_start >= start + len) 3690 break; 3691 if (entry_start + entry_len <= start) 3692 continue; 3693 /* 3694 * Now the entry is in [start, start + len), revert the 3695 * EXTENT_QGROUP_RESERVED bit. 
3696 */ 3697 clear_ret = clear_extent_bits(&inode->io_tree, entry_start, 3698 entry_end, EXTENT_QGROUP_RESERVED); 3699 if (!ret && clear_ret < 0) 3700 ret = clear_ret; 3701 3702 ulist_del(&reserved->range_changed, entry->val, entry->aux); 3703 if (likely(reserved->bytes_changed >= entry_len)) { 3704 reserved->bytes_changed -= entry_len; 3705 } else { 3706 WARN_ON(1); 3707 reserved->bytes_changed = 0; 3708 } 3709 } 3710 3711 return ret; 3712 } 3713 3714 /* 3715 * Try to free some space for qgroup. 3716 * 3717 * For qgroup, there are only 3 ways to free qgroup space: 3718 * - Flush nodatacow write 3719 * Any nodatacow write will free its reserved data space at run_delalloc_range(). 3720 * In theory, we should only flush nodatacow inodes, but it's not yet 3721 * possible, so we need to flush the whole root. 3722 * 3723 * - Wait for ordered extents 3724 * When ordered extents are finished, their reserved metadata is finally 3725 * converted to per_trans status, which can be freed by later commit 3726 * transaction. 3727 * 3728 * - Commit transaction 3729 * This would free the meta_per_trans space. 3730 * In theory this shouldn't provide much space, but any more qgroup space 3731 * is needed. 3732 */ 3733 static int try_flush_qgroup(struct btrfs_root *root) 3734 { 3735 struct btrfs_trans_handle *trans; 3736 int ret; 3737 3738 /* Can't hold an open transaction or we run the risk of deadlocking. */ 3739 ASSERT(current->journal_info == NULL); 3740 if (WARN_ON(current->journal_info)) 3741 return 0; 3742 3743 /* 3744 * We don't want to run flush again and again, so if there is a running 3745 * one, we won't try to start a new flush, but exit directly. 
3746 */ 3747 if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) { 3748 wait_event(root->qgroup_flush_wait, 3749 !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)); 3750 return 0; 3751 } 3752 3753 ret = btrfs_start_delalloc_snapshot(root, true); 3754 if (ret < 0) 3755 goto out; 3756 btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); 3757 3758 trans = btrfs_attach_transaction_barrier(root); 3759 if (IS_ERR(trans)) { 3760 ret = PTR_ERR(trans); 3761 if (ret == -ENOENT) 3762 ret = 0; 3763 goto out; 3764 } 3765 3766 ret = btrfs_commit_transaction(trans); 3767 out: 3768 clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); 3769 wake_up(&root->qgroup_flush_wait); 3770 return ret; 3771 } 3772 3773 static int qgroup_reserve_data(struct btrfs_inode *inode, 3774 struct extent_changeset **reserved_ret, u64 start, 3775 u64 len) 3776 { 3777 struct btrfs_root *root = inode->root; 3778 struct extent_changeset *reserved; 3779 bool new_reserved = false; 3780 u64 orig_reserved; 3781 u64 to_reserve; 3782 int ret; 3783 3784 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || 3785 !is_fstree(root->root_key.objectid) || len == 0) 3786 return 0; 3787 3788 /* @reserved parameter is mandatory for qgroup */ 3789 if (WARN_ON(!reserved_ret)) 3790 return -EINVAL; 3791 if (!*reserved_ret) { 3792 new_reserved = true; 3793 *reserved_ret = extent_changeset_alloc(); 3794 if (!*reserved_ret) 3795 return -ENOMEM; 3796 } 3797 reserved = *reserved_ret; 3798 /* Record already reserved space */ 3799 orig_reserved = reserved->bytes_changed; 3800 ret = set_record_extent_bits(&inode->io_tree, start, 3801 start + len -1, EXTENT_QGROUP_RESERVED, reserved); 3802 3803 /* Newly reserved space */ 3804 to_reserve = reserved->bytes_changed - orig_reserved; 3805 trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len, 3806 to_reserve, QGROUP_RESERVE); 3807 if (ret < 0) 3808 goto out; 3809 ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA); 3810 if (ret < 0) 3811 
goto cleanup; 3812 3813 return ret; 3814 3815 cleanup: 3816 qgroup_unreserve_range(inode, reserved, start, len); 3817 out: 3818 if (new_reserved) { 3819 extent_changeset_free(reserved); 3820 *reserved_ret = NULL; 3821 } 3822 return ret; 3823 } 3824 3825 /* 3826 * Reserve qgroup space for range [start, start + len). 3827 * 3828 * This function will either reserve space from related qgroups or do nothing 3829 * if the range is already reserved. 3830 * 3831 * Return 0 for successful reservation 3832 * Return <0 for error (including -EQUOT) 3833 * 3834 * NOTE: This function may sleep for memory allocation, dirty page flushing and 3835 * commit transaction. So caller should not hold any dirty page locked. 3836 */ 3837 int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, 3838 struct extent_changeset **reserved_ret, u64 start, 3839 u64 len) 3840 { 3841 int ret; 3842 3843 ret = qgroup_reserve_data(inode, reserved_ret, start, len); 3844 if (ret <= 0 && ret != -EDQUOT) 3845 return ret; 3846 3847 ret = try_flush_qgroup(inode->root); 3848 if (ret < 0) 3849 return ret; 3850 return qgroup_reserve_data(inode, reserved_ret, start, len); 3851 } 3852 3853 /* Free ranges specified by @reserved, normally in error path */ 3854 static int qgroup_free_reserved_data(struct btrfs_inode *inode, 3855 struct extent_changeset *reserved, 3856 u64 start, u64 len, u64 *freed_ret) 3857 { 3858 struct btrfs_root *root = inode->root; 3859 struct ulist_node *unode; 3860 struct ulist_iterator uiter; 3861 struct extent_changeset changeset; 3862 u64 freed = 0; 3863 int ret; 3864 3865 extent_changeset_init(&changeset); 3866 len = round_up(start + len, root->fs_info->sectorsize); 3867 start = round_down(start, root->fs_info->sectorsize); 3868 3869 ULIST_ITER_INIT(&uiter); 3870 while ((unode = ulist_next(&reserved->range_changed, &uiter))) { 3871 u64 range_start = unode->val; 3872 /* unode->aux is the inclusive end */ 3873 u64 range_len = unode->aux - range_start + 1; 3874 u64 free_start; 3875 u64 
free_len; 3876 3877 extent_changeset_release(&changeset); 3878 3879 /* Only free range in range [start, start + len) */ 3880 if (range_start >= start + len || 3881 range_start + range_len <= start) 3882 continue; 3883 free_start = max(range_start, start); 3884 free_len = min(start + len, range_start + range_len) - 3885 free_start; 3886 /* 3887 * TODO: To also modify reserved->ranges_reserved to reflect 3888 * the modification. 3889 * 3890 * However as long as we free qgroup reserved according to 3891 * EXTENT_QGROUP_RESERVED, we won't double free. 3892 * So not need to rush. 3893 */ 3894 ret = clear_record_extent_bits(&inode->io_tree, free_start, 3895 free_start + free_len - 1, 3896 EXTENT_QGROUP_RESERVED, &changeset); 3897 if (ret < 0) 3898 goto out; 3899 freed += changeset.bytes_changed; 3900 } 3901 btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed, 3902 BTRFS_QGROUP_RSV_DATA); 3903 if (freed_ret) 3904 *freed_ret = freed; 3905 ret = 0; 3906 out: 3907 extent_changeset_release(&changeset); 3908 return ret; 3909 } 3910 3911 static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, 3912 struct extent_changeset *reserved, u64 start, u64 len, 3913 u64 *released, int free) 3914 { 3915 struct extent_changeset changeset; 3916 int trace_op = QGROUP_RELEASE; 3917 int ret; 3918 3919 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags)) 3920 return 0; 3921 3922 /* In release case, we shouldn't have @reserved */ 3923 WARN_ON(!free && reserved); 3924 if (free && reserved) 3925 return qgroup_free_reserved_data(inode, reserved, start, len, released); 3926 extent_changeset_init(&changeset); 3927 ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, 3928 EXTENT_QGROUP_RESERVED, &changeset); 3929 if (ret < 0) 3930 goto out; 3931 3932 if (free) 3933 trace_op = QGROUP_FREE; 3934 trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len, 3935 changeset.bytes_changed, trace_op); 3936 if (free) 3937 
btrfs_qgroup_free_refroot(inode->root->fs_info, 3938 inode->root->root_key.objectid, 3939 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); 3940 if (released) 3941 *released = changeset.bytes_changed; 3942 out: 3943 extent_changeset_release(&changeset); 3944 return ret; 3945 } 3946 3947 /* 3948 * Free a reserved space range from io_tree and related qgroups 3949 * 3950 * Should be called when a range of pages get invalidated before reaching disk. 3951 * Or for error cleanup case. 3952 * if @reserved is given, only reserved range in [@start, @start + @len) will 3953 * be freed. 3954 * 3955 * For data written to disk, use btrfs_qgroup_release_data(). 3956 * 3957 * NOTE: This function may sleep for memory allocation. 3958 */ 3959 int btrfs_qgroup_free_data(struct btrfs_inode *inode, 3960 struct extent_changeset *reserved, 3961 u64 start, u64 len, u64 *freed) 3962 { 3963 return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1); 3964 } 3965 3966 /* 3967 * Release a reserved space range from io_tree only. 3968 * 3969 * Should be called when a range of pages get written to disk and corresponding 3970 * FILE_EXTENT is inserted into corresponding root. 3971 * 3972 * Since new qgroup accounting framework will only update qgroup numbers at 3973 * commit_transaction() time, its reserved space shouldn't be freed from 3974 * related qgroups. 3975 * 3976 * But we should release the range from io_tree, to allow further write to be 3977 * COWed. 3978 * 3979 * NOTE: This function may sleep for memory allocation. 
3980 */ 3981 int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released) 3982 { 3983 return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0); 3984 } 3985 3986 static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes, 3987 enum btrfs_qgroup_rsv_type type) 3988 { 3989 if (type != BTRFS_QGROUP_RSV_META_PREALLOC && 3990 type != BTRFS_QGROUP_RSV_META_PERTRANS) 3991 return; 3992 if (num_bytes == 0) 3993 return; 3994 3995 spin_lock(&root->qgroup_meta_rsv_lock); 3996 if (type == BTRFS_QGROUP_RSV_META_PREALLOC) 3997 root->qgroup_meta_rsv_prealloc += num_bytes; 3998 else 3999 root->qgroup_meta_rsv_pertrans += num_bytes; 4000 spin_unlock(&root->qgroup_meta_rsv_lock); 4001 } 4002 4003 static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes, 4004 enum btrfs_qgroup_rsv_type type) 4005 { 4006 if (type != BTRFS_QGROUP_RSV_META_PREALLOC && 4007 type != BTRFS_QGROUP_RSV_META_PERTRANS) 4008 return 0; 4009 if (num_bytes == 0) 4010 return 0; 4011 4012 spin_lock(&root->qgroup_meta_rsv_lock); 4013 if (type == BTRFS_QGROUP_RSV_META_PREALLOC) { 4014 num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc, 4015 num_bytes); 4016 root->qgroup_meta_rsv_prealloc -= num_bytes; 4017 } else { 4018 num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans, 4019 num_bytes); 4020 root->qgroup_meta_rsv_pertrans -= num_bytes; 4021 } 4022 spin_unlock(&root->qgroup_meta_rsv_lock); 4023 return num_bytes; 4024 } 4025 4026 int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, 4027 enum btrfs_qgroup_rsv_type type, bool enforce) 4028 { 4029 struct btrfs_fs_info *fs_info = root->fs_info; 4030 int ret; 4031 4032 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 4033 !is_fstree(root->root_key.objectid) || num_bytes == 0) 4034 return 0; 4035 4036 BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); 4037 trace_qgroup_meta_reserve(root, (s64)num_bytes, type); 4038 ret = qgroup_reserve(root, num_bytes, enforce, 
type); 4039 if (ret < 0) 4040 return ret; 4041 /* 4042 * Record what we have reserved into root. 4043 * 4044 * To avoid quota disabled->enabled underflow. 4045 * In that case, we may try to free space we haven't reserved 4046 * (since quota was disabled), so record what we reserved into root. 4047 * And ensure later release won't underflow this number. 4048 */ 4049 add_root_meta_rsv(root, num_bytes, type); 4050 return ret; 4051 } 4052 4053 int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, 4054 enum btrfs_qgroup_rsv_type type, bool enforce, 4055 bool noflush) 4056 { 4057 int ret; 4058 4059 ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); 4060 if ((ret <= 0 && ret != -EDQUOT) || noflush) 4061 return ret; 4062 4063 ret = try_flush_qgroup(root); 4064 if (ret < 0) 4065 return ret; 4066 return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); 4067 } 4068 4069 void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) 4070 { 4071 struct btrfs_fs_info *fs_info = root->fs_info; 4072 4073 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 4074 !is_fstree(root->root_key.objectid)) 4075 return; 4076 4077 /* TODO: Update trace point to handle such free */ 4078 trace_qgroup_meta_free_all_pertrans(root); 4079 /* Special value -1 means to free all reserved space */ 4080 btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1, 4081 BTRFS_QGROUP_RSV_META_PERTRANS); 4082 } 4083 4084 void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, 4085 enum btrfs_qgroup_rsv_type type) 4086 { 4087 struct btrfs_fs_info *fs_info = root->fs_info; 4088 4089 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 4090 !is_fstree(root->root_key.objectid)) 4091 return; 4092 4093 /* 4094 * reservation for META_PREALLOC can happen before quota is enabled, 4095 * which can lead to underflow. 4096 * Here ensure we will only free what we really have reserved. 
4097 */ 4098 num_bytes = sub_root_meta_rsv(root, num_bytes, type); 4099 BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); 4100 trace_qgroup_meta_reserve(root, -(s64)num_bytes, type); 4101 btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, 4102 num_bytes, type); 4103 } 4104 4105 static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, 4106 int num_bytes) 4107 { 4108 struct btrfs_qgroup *qgroup; 4109 LIST_HEAD(qgroup_list); 4110 4111 if (num_bytes == 0) 4112 return; 4113 if (!fs_info->quota_root) 4114 return; 4115 4116 spin_lock(&fs_info->qgroup_lock); 4117 qgroup = find_qgroup_rb(fs_info, ref_root); 4118 if (!qgroup) 4119 goto out; 4120 4121 qgroup_iterator_add(&qgroup_list, qgroup); 4122 list_for_each_entry(qgroup, &qgroup_list, iterator) { 4123 struct btrfs_qgroup_list *glist; 4124 4125 qgroup_rsv_release(fs_info, qgroup, num_bytes, 4126 BTRFS_QGROUP_RSV_META_PREALLOC); 4127 if (!sb_rdonly(fs_info->sb)) 4128 qgroup_rsv_add(fs_info, qgroup, num_bytes, 4129 BTRFS_QGROUP_RSV_META_PERTRANS); 4130 4131 list_for_each_entry(glist, &qgroup->groups, next_group) 4132 qgroup_iterator_add(&qgroup_list, glist->group); 4133 } 4134 out: 4135 qgroup_iterator_clean(&qgroup_list); 4136 spin_unlock(&fs_info->qgroup_lock); 4137 } 4138 4139 void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) 4140 { 4141 struct btrfs_fs_info *fs_info = root->fs_info; 4142 4143 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 4144 !is_fstree(root->root_key.objectid)) 4145 return; 4146 /* Same as btrfs_qgroup_free_meta_prealloc() */ 4147 num_bytes = sub_root_meta_rsv(root, num_bytes, 4148 BTRFS_QGROUP_RSV_META_PREALLOC); 4149 trace_qgroup_meta_convert(root, num_bytes); 4150 qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes); 4151 } 4152 4153 /* 4154 * Check qgroup reserved space leaking, normally at destroy inode 4155 * time 4156 */ 4157 void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode) 4158 { 4159 
struct extent_changeset changeset; 4160 struct ulist_node *unode; 4161 struct ulist_iterator iter; 4162 int ret; 4163 4164 extent_changeset_init(&changeset); 4165 ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1, 4166 EXTENT_QGROUP_RESERVED, &changeset); 4167 4168 WARN_ON(ret < 0); 4169 if (WARN_ON(changeset.bytes_changed)) { 4170 ULIST_ITER_INIT(&iter); 4171 while ((unode = ulist_next(&changeset.range_changed, &iter))) { 4172 btrfs_warn(inode->root->fs_info, 4173 "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu", 4174 btrfs_ino(inode), unode->val, unode->aux); 4175 } 4176 btrfs_qgroup_free_refroot(inode->root->fs_info, 4177 inode->root->root_key.objectid, 4178 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); 4179 4180 } 4181 extent_changeset_release(&changeset); 4182 } 4183 4184 void btrfs_qgroup_init_swapped_blocks( 4185 struct btrfs_qgroup_swapped_blocks *swapped_blocks) 4186 { 4187 int i; 4188 4189 spin_lock_init(&swapped_blocks->lock); 4190 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 4191 swapped_blocks->blocks[i] = RB_ROOT; 4192 swapped_blocks->swapped = false; 4193 } 4194 4195 /* 4196 * Delete all swapped blocks record of @root. 4197 * Every record here means we skipped a full subtree scan for qgroup. 4198 * 4199 * Gets called when committing one transaction. 
4200 */ 4201 void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root) 4202 { 4203 struct btrfs_qgroup_swapped_blocks *swapped_blocks; 4204 int i; 4205 4206 swapped_blocks = &root->swapped_blocks; 4207 4208 spin_lock(&swapped_blocks->lock); 4209 if (!swapped_blocks->swapped) 4210 goto out; 4211 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 4212 struct rb_root *cur_root = &swapped_blocks->blocks[i]; 4213 struct btrfs_qgroup_swapped_block *entry; 4214 struct btrfs_qgroup_swapped_block *next; 4215 4216 rbtree_postorder_for_each_entry_safe(entry, next, cur_root, 4217 node) 4218 kfree(entry); 4219 swapped_blocks->blocks[i] = RB_ROOT; 4220 } 4221 swapped_blocks->swapped = false; 4222 out: 4223 spin_unlock(&swapped_blocks->lock); 4224 } 4225 4226 /* 4227 * Add subtree roots record into @subvol_root. 4228 * 4229 * @subvol_root: tree root of the subvolume tree get swapped 4230 * @bg: block group under balance 4231 * @subvol_parent/slot: pointer to the subtree root in subvolume tree 4232 * @reloc_parent/slot: pointer to the subtree root in reloc tree 4233 * BOTH POINTERS ARE BEFORE TREE SWAP 4234 * @last_snapshot: last snapshot generation of the subvolume tree 4235 */ 4236 int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, 4237 struct btrfs_root *subvol_root, 4238 struct btrfs_block_group *bg, 4239 struct extent_buffer *subvol_parent, int subvol_slot, 4240 struct extent_buffer *reloc_parent, int reloc_slot, 4241 u64 last_snapshot) 4242 { 4243 struct btrfs_fs_info *fs_info = subvol_root->fs_info; 4244 struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks; 4245 struct btrfs_qgroup_swapped_block *block; 4246 struct rb_node **cur; 4247 struct rb_node *parent = NULL; 4248 int level = btrfs_header_level(subvol_parent) - 1; 4249 int ret = 0; 4250 4251 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 4252 return 0; 4253 4254 if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) > 4255 btrfs_node_ptr_generation(reloc_parent, 
reloc_slot)) { 4256 btrfs_err_rl(fs_info, 4257 "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu", 4258 __func__, 4259 btrfs_node_ptr_generation(subvol_parent, subvol_slot), 4260 btrfs_node_ptr_generation(reloc_parent, reloc_slot)); 4261 return -EUCLEAN; 4262 } 4263 4264 block = kmalloc(sizeof(*block), GFP_NOFS); 4265 if (!block) { 4266 ret = -ENOMEM; 4267 goto out; 4268 } 4269 4270 /* 4271 * @reloc_parent/slot is still before swap, while @block is going to 4272 * record the bytenr after swap, so we do the swap here. 4273 */ 4274 block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot); 4275 block->subvol_generation = btrfs_node_ptr_generation(reloc_parent, 4276 reloc_slot); 4277 block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot); 4278 block->reloc_generation = btrfs_node_ptr_generation(subvol_parent, 4279 subvol_slot); 4280 block->last_snapshot = last_snapshot; 4281 block->level = level; 4282 4283 /* 4284 * If we have bg == NULL, we're called from btrfs_recover_relocation(), 4285 * no one else can modify tree blocks thus we qgroup will not change 4286 * no matter the value of trace_leaf. 
4287 */ 4288 if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA) 4289 block->trace_leaf = true; 4290 else 4291 block->trace_leaf = false; 4292 btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot); 4293 4294 /* Insert @block into @blocks */ 4295 spin_lock(&blocks->lock); 4296 cur = &blocks->blocks[level].rb_node; 4297 while (*cur) { 4298 struct btrfs_qgroup_swapped_block *entry; 4299 4300 parent = *cur; 4301 entry = rb_entry(parent, struct btrfs_qgroup_swapped_block, 4302 node); 4303 4304 if (entry->subvol_bytenr < block->subvol_bytenr) { 4305 cur = &(*cur)->rb_left; 4306 } else if (entry->subvol_bytenr > block->subvol_bytenr) { 4307 cur = &(*cur)->rb_right; 4308 } else { 4309 if (entry->subvol_generation != 4310 block->subvol_generation || 4311 entry->reloc_bytenr != block->reloc_bytenr || 4312 entry->reloc_generation != 4313 block->reloc_generation) { 4314 /* 4315 * Duplicated but mismatch entry found. 4316 * Shouldn't happen. 4317 * 4318 * Marking qgroup inconsistent should be enough 4319 * for end users. 4320 */ 4321 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); 4322 ret = -EEXIST; 4323 } 4324 kfree(block); 4325 goto out_unlock; 4326 } 4327 } 4328 rb_link_node(&block->node, parent, cur); 4329 rb_insert_color(&block->node, &blocks->blocks[level]); 4330 blocks->swapped = true; 4331 out_unlock: 4332 spin_unlock(&blocks->lock); 4333 out: 4334 if (ret < 0) 4335 qgroup_mark_inconsistent(fs_info); 4336 return ret; 4337 } 4338 4339 /* 4340 * Check if the tree block is a subtree root, and if so do the needed 4341 * delayed subtree trace for qgroup. 4342 * 4343 * This is called during btrfs_cow_block(). 
4344 */ 4345 int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, 4346 struct btrfs_root *root, 4347 struct extent_buffer *subvol_eb) 4348 { 4349 struct btrfs_fs_info *fs_info = root->fs_info; 4350 struct btrfs_tree_parent_check check = { 0 }; 4351 struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks; 4352 struct btrfs_qgroup_swapped_block *block; 4353 struct extent_buffer *reloc_eb = NULL; 4354 struct rb_node *node; 4355 bool found = false; 4356 bool swapped = false; 4357 int level = btrfs_header_level(subvol_eb); 4358 int ret = 0; 4359 int i; 4360 4361 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 4362 return 0; 4363 if (!is_fstree(root->root_key.objectid) || !root->reloc_root) 4364 return 0; 4365 4366 spin_lock(&blocks->lock); 4367 if (!blocks->swapped) { 4368 spin_unlock(&blocks->lock); 4369 return 0; 4370 } 4371 node = blocks->blocks[level].rb_node; 4372 4373 while (node) { 4374 block = rb_entry(node, struct btrfs_qgroup_swapped_block, node); 4375 if (block->subvol_bytenr < subvol_eb->start) { 4376 node = node->rb_left; 4377 } else if (block->subvol_bytenr > subvol_eb->start) { 4378 node = node->rb_right; 4379 } else { 4380 found = true; 4381 break; 4382 } 4383 } 4384 if (!found) { 4385 spin_unlock(&blocks->lock); 4386 goto out; 4387 } 4388 /* Found one, remove it from @blocks first and update blocks->swapped */ 4389 rb_erase(&block->node, &blocks->blocks[level]); 4390 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 4391 if (RB_EMPTY_ROOT(&blocks->blocks[i])) { 4392 swapped = true; 4393 break; 4394 } 4395 } 4396 blocks->swapped = swapped; 4397 spin_unlock(&blocks->lock); 4398 4399 check.level = block->level; 4400 check.transid = block->reloc_generation; 4401 check.has_first_key = true; 4402 memcpy(&check.first_key, &block->first_key, sizeof(check.first_key)); 4403 4404 /* Read out reloc subtree root */ 4405 reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, &check); 4406 if (IS_ERR(reloc_eb)) { 4407 ret = 
PTR_ERR(reloc_eb); 4408 reloc_eb = NULL; 4409 goto free_out; 4410 } 4411 if (!extent_buffer_uptodate(reloc_eb)) { 4412 ret = -EIO; 4413 goto free_out; 4414 } 4415 4416 ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb, 4417 block->last_snapshot, block->trace_leaf); 4418 free_out: 4419 kfree(block); 4420 free_extent_buffer(reloc_eb); 4421 out: 4422 if (ret < 0) { 4423 btrfs_err_rl(fs_info, 4424 "failed to account subtree at bytenr %llu: %d", 4425 subvol_eb->start, ret); 4426 qgroup_mark_inconsistent(fs_info); 4427 } 4428 return ret; 4429 } 4430 4431 void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) 4432 { 4433 struct btrfs_qgroup_extent_record *entry; 4434 struct btrfs_qgroup_extent_record *next; 4435 struct rb_root *root; 4436 4437 root = &trans->delayed_refs.dirty_extent_root; 4438 rbtree_postorder_for_each_entry_safe(entry, next, root, node) { 4439 ulist_free(entry->old_roots); 4440 kfree(entry); 4441 } 4442 *root = RB_ROOT; 4443 } 4444