1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2011 STRATO. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/pagemap.h> 8 #include <linux/writeback.h> 9 #include <linux/blkdev.h> 10 #include <linux/rbtree.h> 11 #include <linux/slab.h> 12 #include <linux/workqueue.h> 13 #include <linux/btrfs.h> 14 #include <linux/sched/mm.h> 15 16 #include "ctree.h" 17 #include "transaction.h" 18 #include "disk-io.h" 19 #include "locking.h" 20 #include "ulist.h" 21 #include "backref.h" 22 #include "extent_io.h" 23 #include "qgroup.h" 24 #include "block-group.h" 25 #include "sysfs.h" 26 #include "tree-mod-log.h" 27 #include "fs.h" 28 #include "accessors.h" 29 #include "extent-tree.h" 30 #include "root-tree.h" 31 #include "tree-checker.h" 32 33 /* 34 * Helpers to access qgroup reservation 35 * 36 * Callers should ensure the lock context and type are valid 37 */ 38 39 static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup) 40 { 41 u64 ret = 0; 42 int i; 43 44 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) 45 ret += qgroup->rsv.values[i]; 46 47 return ret; 48 } 49 50 #ifdef CONFIG_BTRFS_DEBUG 51 static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type) 52 { 53 if (type == BTRFS_QGROUP_RSV_DATA) 54 return "data"; 55 if (type == BTRFS_QGROUP_RSV_META_PERTRANS) 56 return "meta_pertrans"; 57 if (type == BTRFS_QGROUP_RSV_META_PREALLOC) 58 return "meta_prealloc"; 59 return NULL; 60 } 61 #endif 62 63 static void qgroup_rsv_add(struct btrfs_fs_info *fs_info, 64 struct btrfs_qgroup *qgroup, u64 num_bytes, 65 enum btrfs_qgroup_rsv_type type) 66 { 67 trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type); 68 qgroup->rsv.values[type] += num_bytes; 69 } 70 71 static void qgroup_rsv_release(struct btrfs_fs_info *fs_info, 72 struct btrfs_qgroup *qgroup, u64 num_bytes, 73 enum btrfs_qgroup_rsv_type type) 74 { 75 trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type); 76 if (qgroup->rsv.values[type] >= num_bytes) { 77 qgroup->rsv.values[type] -= num_bytes; 78 return; 79 } 80 #ifdef CONFIG_BTRFS_DEBUG 81 WARN_RATELIMIT(1, 82 "qgroup %llu %s reserved space underflow, have %llu to free %llu", 83 qgroup->qgroupid, qgroup_rsv_type_str(type), 84 qgroup->rsv.values[type], num_bytes); 85 #endif 86 qgroup->rsv.values[type] = 0; 87 } 88 89 static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info, 90 struct btrfs_qgroup *dest, 91 struct btrfs_qgroup *src) 92 { 93 int i; 94 95 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) 96 qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i); 97 } 98 99 static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info, 100 struct btrfs_qgroup *dest, 101 struct btrfs_qgroup *src) 102 { 103 int i; 104 105 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) 106 qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i); 107 } 108 109 static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, 110 int mod) 111 { 112 if (qg->old_refcnt < seq) 113 qg->old_refcnt = seq; 114 qg->old_refcnt += mod; 115 } 116 117 static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq, 118 int mod) 119 { 120 if (qg->new_refcnt < seq) 121 qg->new_refcnt = seq; 122 qg->new_refcnt += mod; 123 } 124 125 static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq) 126 { 127 if (qg->old_refcnt < seq) 128 return 0; 129 return qg->old_refcnt - seq; 130 } 131 132 static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq) 133 { 134 if (qg->new_refcnt < seq) 135 return 0; 136 return qg->new_refcnt - seq; 137 } 138 139 /* 140 * glue structure to represent the relations between qgroups. 141 */ 142 struct btrfs_qgroup_list { 143 struct list_head next_group; 144 struct list_head next_member; 145 struct btrfs_qgroup *group; 146 struct btrfs_qgroup *member; 147 }; 148 149 static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg) 150 { 151 return (u64)(uintptr_t)qg; 152 } 153 154 static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n) 155 { 156 return (struct btrfs_qgroup *)(uintptr_t)n->aux; 157 } 158 159 static int 160 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, 161 int init_flags); 162 static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info); 163 164 /* must be called with qgroup_ioctl_lock held */ 165 static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, 166 u64 qgroupid) 167 { 168 struct rb_node *n = fs_info->qgroup_tree.rb_node; 169 struct btrfs_qgroup *qgroup; 170 171 while (n) { 172 qgroup = rb_entry(n, struct btrfs_qgroup, node); 173 if (qgroup->qgroupid < qgroupid) 174 n = n->rb_left; 175 else if (qgroup->qgroupid > qgroupid) 176 n = n->rb_right; 177 else 178 return qgroup; 179 } 180 return NULL; 181 } 182 183 /* must be called with qgroup_lock held */ 184 static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, 185 u64 qgroupid) 186 { 187 struct rb_node **p = &fs_info->qgroup_tree.rb_node; 188 struct rb_node *parent = NULL; 189 struct btrfs_qgroup *qgroup; 190 191 while (*p) { 192 parent = *p; 193 qgroup = rb_entry(parent, struct btrfs_qgroup, node); 194 195 if (qgroup->qgroupid < qgroupid) 196 p = &(*p)->rb_left; 197 else if (qgroup->qgroupid > qgroupid) 198 p = &(*p)->rb_right; 199 else 200 return qgroup; 201 } 202 203 qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC); 204 if (!qgroup) 205 return ERR_PTR(-ENOMEM); 206 207 qgroup->qgroupid = qgroupid; 208 INIT_LIST_HEAD(&qgroup->groups); 209 INIT_LIST_HEAD(&qgroup->members); 210 INIT_LIST_HEAD(&qgroup->dirty); 211 INIT_LIST_HEAD(&qgroup->iterator); 212 213 rb_link_node(&qgroup->node, parent, p); 214 rb_insert_color(&qgroup->node, &fs_info->qgroup_tree); 215 216 return qgroup; 217 } 218 219 static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, 220 struct btrfs_qgroup *qgroup) 221 { 222 struct btrfs_qgroup_list *list; 223 224 list_del(&qgroup->dirty); 225 while (!list_empty(&qgroup->groups)) { 226 list = list_first_entry(&qgroup->groups, 227 struct btrfs_qgroup_list, next_group); 228 list_del(&list->next_group); 229 list_del(&list->next_member); 230 kfree(list); 231 } 232 233 while (!list_empty(&qgroup->members)) { 234 list = list_first_entry(&qgroup->members, 235 struct btrfs_qgroup_list, next_member); 236 list_del(&list->next_group); 237 list_del(&list->next_member); 238 kfree(list); 239 } 240 } 241 242 /* must be called with qgroup_lock held */ 243 static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) 244 { 245 struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid); 246 247 if (!qgroup) 248 return -ENOENT; 249 250 rb_erase(&qgroup->node, &fs_info->qgroup_tree); 251 __del_qgroup_rb(fs_info, qgroup); 252 return 0; 253 } 254 255 /* 256 * Add relation specified by two qgroups. 257 * 258 * Must be called with qgroup_lock held. 259 * 260 * Return: 0 on success 261 * -ENOENT if one of the qgroups is NULL 262 * <0 other errors 263 */ 264 static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent) 265 { 266 struct btrfs_qgroup_list *list; 267 268 if (!member || !parent) 269 return -ENOENT; 270 271 list = kzalloc(sizeof(*list), GFP_ATOMIC); 272 if (!list) 273 return -ENOMEM; 274 275 list->group = parent; 276 list->member = member; 277 list_add_tail(&list->next_group, &member->groups); 278 list_add_tail(&list->next_member, &parent->members); 279 280 return 0; 281 } 282 283 /* 284 * Add relation specified by two qgroup ids. 285 * 286 * Must be called with qgroup_lock held. 287 * 288 * Return: 0 on success 289 * -ENOENT if one of the ids does not exist 290 * <0 other errors 291 */ 292 static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid) 293 { 294 struct btrfs_qgroup *member; 295 struct btrfs_qgroup *parent; 296 297 member = find_qgroup_rb(fs_info, memberid); 298 parent = find_qgroup_rb(fs_info, parentid); 299 300 return __add_relation_rb(member, parent); 301 } 302 303 /* Must be called with qgroup_lock held */ 304 static int del_relation_rb(struct btrfs_fs_info *fs_info, 305 u64 memberid, u64 parentid) 306 { 307 struct btrfs_qgroup *member; 308 struct btrfs_qgroup *parent; 309 struct btrfs_qgroup_list *list; 310 311 member = find_qgroup_rb(fs_info, memberid); 312 parent = find_qgroup_rb(fs_info, parentid); 313 if (!member || !parent) 314 return -ENOENT; 315 316 list_for_each_entry(list, &member->groups, next_group) { 317 if (list->group == parent) { 318 list_del(&list->next_group); 319 list_del(&list->next_member); 320 kfree(list); 321 return 0; 322 } 323 } 324 return -ENOENT; 325 } 326 327 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 328 int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, 329 u64 rfer, u64 excl) 330 { 331 struct btrfs_qgroup *qgroup; 332 333 qgroup = find_qgroup_rb(fs_info, qgroupid); 334 if (!qgroup) 335 return -EINVAL; 336 if (qgroup->rfer != rfer || qgroup->excl != excl) 337 return -EINVAL; 338 return 0; 339 } 340 #endif 341 342 static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info) 343 { 344 fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT | 345 BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | 346 BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); 347 } 348 349 /* 350 * The full config is read in one go, only called from open_ctree() 351 * It doesn't use any locking, as at this point we're still single-threaded 352 */ 353 int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) 354 { 355 struct btrfs_key key; 356 struct btrfs_key found_key; 357 struct btrfs_root *quota_root = fs_info->quota_root; 358 struct btrfs_path *path = NULL; 359 struct extent_buffer *l; 360 int slot; 361 int ret = 0; 362 u64 flags = 0; 363 u64 rescan_progress = 0; 364 365 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 366 return 0; 367 368 fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); 369 if (!fs_info->qgroup_ulist) { 370 ret = -ENOMEM; 371 goto out; 372 } 373 374 path = btrfs_alloc_path(); 375 if (!path) { 376 ret = -ENOMEM; 377 goto out; 378 } 379 380 ret = btrfs_sysfs_add_qgroups(fs_info); 381 if (ret < 0) 382 goto out; 383 /* default this to quota off, in case no status key is found */ 384 fs_info->qgroup_flags = 0; 385 386 /* 387 * pass 1: read status, all qgroup infos and limits 388 */ 389 key.objectid = 0; 390 key.type = 0; 391 key.offset = 0; 392 ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1); 393 if (ret) 394 goto out; 395 396 while (1) { 397 struct btrfs_qgroup *qgroup; 398 399 slot = path->slots[0]; 400 l = path->nodes[0]; 401 btrfs_item_key_to_cpu(l, &found_key, slot); 402 403 if (found_key.type == BTRFS_QGROUP_STATUS_KEY) { 404 struct btrfs_qgroup_status_item *ptr; 405 406 ptr = btrfs_item_ptr(l, slot, 407 struct btrfs_qgroup_status_item); 408 409 if (btrfs_qgroup_status_version(l, ptr) != 410 BTRFS_QGROUP_STATUS_VERSION) { 411 btrfs_err(fs_info, 412 "old qgroup version, quota disabled"); 413 goto out; 414 } 415 if (btrfs_qgroup_status_generation(l, ptr) != 416 fs_info->generation) { 417 qgroup_mark_inconsistent(fs_info); 418 btrfs_err(fs_info, 419 "qgroup generation mismatch, marked as inconsistent"); 420 } 421 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, 422 ptr); 423 rescan_progress = btrfs_qgroup_status_rescan(l, ptr); 424 goto next1; 425 } 426 427 if (found_key.type != BTRFS_QGROUP_INFO_KEY && 428 found_key.type != BTRFS_QGROUP_LIMIT_KEY) 429 goto next1; 430 431 qgroup = find_qgroup_rb(fs_info, found_key.offset); 432 if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || 433 (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { 434 btrfs_err(fs_info, "inconsistent qgroup config"); 435 qgroup_mark_inconsistent(fs_info); 436 } 437 if (!qgroup) { 438 qgroup = add_qgroup_rb(fs_info, found_key.offset); 439 if (IS_ERR(qgroup)) { 440 ret = PTR_ERR(qgroup); 441 goto out; 442 } 443 } 444 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 445 if (ret < 0) 446 goto out; 447 448 switch (found_key.type) { 449 case BTRFS_QGROUP_INFO_KEY: { 450 struct btrfs_qgroup_info_item *ptr; 451 452 ptr = btrfs_item_ptr(l, slot, 453 struct btrfs_qgroup_info_item); 454 qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr); 455 qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr); 456 qgroup->excl = btrfs_qgroup_info_excl(l, ptr); 457 qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr); 458 /* generation currently unused */ 459 break; 460 } 461 case BTRFS_QGROUP_LIMIT_KEY: { 462 struct btrfs_qgroup_limit_item *ptr; 463 464 ptr = btrfs_item_ptr(l, slot, 465 struct btrfs_qgroup_limit_item); 466 qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr); 467 qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr); 468 qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr); 469 qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr); 470 qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr); 471 break; 472 } 473 } 474 next1: 475 ret = btrfs_next_item(quota_root, path); 476 if (ret < 0) 477 goto out; 478 if (ret) 479 break; 480 } 481 btrfs_release_path(path); 482 483 /* 484 * pass 2: read all qgroup relations 485 */ 486 key.objectid = 0; 487 key.type = BTRFS_QGROUP_RELATION_KEY; 488 key.offset = 0; 489 ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0); 490 if (ret) 491 goto out; 492 while (1) { 493 slot = path->slots[0]; 494 l = path->nodes[0]; 495 btrfs_item_key_to_cpu(l, &found_key, slot); 496 497 if (found_key.type != BTRFS_QGROUP_RELATION_KEY) 498 goto next2; 499 500 if (found_key.objectid > found_key.offset) { 501 /* parent <- member, not needed to build config */ 502 /* FIXME should we omit the key completely? */ 503 goto next2; 504 } 505 506 ret = add_relation_rb(fs_info, found_key.objectid, 507 found_key.offset); 508 if (ret == -ENOENT) { 509 btrfs_warn(fs_info, 510 "orphan qgroup relation 0x%llx->0x%llx", 511 found_key.objectid, found_key.offset); 512 ret = 0; /* ignore the error */ 513 } 514 if (ret) 515 goto out; 516 next2: 517 ret = btrfs_next_item(quota_root, path); 518 if (ret < 0) 519 goto out; 520 if (ret) 521 break; 522 } 523 out: 524 btrfs_free_path(path); 525 fs_info->qgroup_flags |= flags; 526 if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) 527 clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 528 else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN && 529 ret >= 0) 530 ret = qgroup_rescan_init(fs_info, rescan_progress, 0); 531 532 if (ret < 0) { 533 ulist_free(fs_info->qgroup_ulist); 534 fs_info->qgroup_ulist = NULL; 535 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 536 btrfs_sysfs_del_qgroups(fs_info); 537 } 538 539 return ret < 0 ? ret : 0; 540 } 541 542 /* 543 * Called in close_ctree() when quota is still enabled. This verifies we don't 544 * leak some reserved space. 545 * 546 * Return false if no reserved space is left. 547 * Return true if some reserved space is leaked. 548 */ 549 bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info) 550 { 551 struct rb_node *node; 552 bool ret = false; 553 554 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 555 return ret; 556 /* 557 * Since we're unmounting, there is no race and no need to grab qgroup 558 * lock. And here we don't go post-order to provide a more user 559 * friendly sorted result. 560 */ 561 for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) { 562 struct btrfs_qgroup *qgroup; 563 int i; 564 565 qgroup = rb_entry(node, struct btrfs_qgroup, node); 566 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) { 567 if (qgroup->rsv.values[i]) { 568 ret = true; 569 btrfs_warn(fs_info, 570 "qgroup %hu/%llu has unreleased space, type %d rsv %llu", 571 btrfs_qgroup_level(qgroup->qgroupid), 572 btrfs_qgroup_subvolid(qgroup->qgroupid), 573 i, qgroup->rsv.values[i]); 574 } 575 } 576 } 577 return ret; 578 } 579 580 /* 581 * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(), 582 * first two are in single-threaded paths.And for the third one, we have set 583 * quota_root to be null with qgroup_lock held before, so it is safe to clean 584 * up the in-memory structures without qgroup_lock held. 585 */ 586 void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) 587 { 588 struct rb_node *n; 589 struct btrfs_qgroup *qgroup; 590 591 while ((n = rb_first(&fs_info->qgroup_tree))) { 592 qgroup = rb_entry(n, struct btrfs_qgroup, node); 593 rb_erase(n, &fs_info->qgroup_tree); 594 __del_qgroup_rb(fs_info, qgroup); 595 btrfs_sysfs_del_one_qgroup(fs_info, qgroup); 596 kfree(qgroup); 597 } 598 /* 599 * We call btrfs_free_qgroup_config() when unmounting 600 * filesystem and disabling quota, so we set qgroup_ulist 601 * to be null here to avoid double free. 602 */ 603 ulist_free(fs_info->qgroup_ulist); 604 fs_info->qgroup_ulist = NULL; 605 btrfs_sysfs_del_qgroups(fs_info); 606 } 607 608 static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, 609 u64 dst) 610 { 611 int ret; 612 struct btrfs_root *quota_root = trans->fs_info->quota_root; 613 struct btrfs_path *path; 614 struct btrfs_key key; 615 616 path = btrfs_alloc_path(); 617 if (!path) 618 return -ENOMEM; 619 620 key.objectid = src; 621 key.type = BTRFS_QGROUP_RELATION_KEY; 622 key.offset = dst; 623 624 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); 625 626 btrfs_mark_buffer_dirty(trans, path->nodes[0]); 627 628 btrfs_free_path(path); 629 return ret; 630 } 631 632 static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, 633 u64 dst) 634 { 635 int ret; 636 struct btrfs_root *quota_root = trans->fs_info->quota_root; 637 struct btrfs_path *path; 638 struct btrfs_key key; 639 640 path = btrfs_alloc_path(); 641 if (!path) 642 return -ENOMEM; 643 644 key.objectid = src; 645 key.type = BTRFS_QGROUP_RELATION_KEY; 646 key.offset = dst; 647 648 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); 649 if (ret < 0) 650 goto out; 651 652 if (ret > 0) { 653 ret = -ENOENT; 654 goto out; 655 } 656 657 ret = btrfs_del_item(trans, quota_root, path); 658 out: 659 btrfs_free_path(path); 660 return ret; 661 } 662 663 static int add_qgroup_item(struct btrfs_trans_handle *trans, 664 struct btrfs_root *quota_root, u64 qgroupid) 665 { 666 int ret; 667 struct btrfs_path *path; 668 struct btrfs_qgroup_info_item *qgroup_info; 669 struct btrfs_qgroup_limit_item *qgroup_limit; 670 struct extent_buffer *leaf; 671 struct btrfs_key key; 672 673 if (btrfs_is_testing(quota_root->fs_info)) 674 return 0; 675 676 path = btrfs_alloc_path(); 677 if (!path) 678 return -ENOMEM; 679 680 key.objectid = 0; 681 key.type = BTRFS_QGROUP_INFO_KEY; 682 key.offset = qgroupid; 683 684 /* 685 * Avoid a transaction abort by catching -EEXIST here. In that 686 * case, we proceed by re-initializing the existing structure 687 * on disk. 688 */ 689 690 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 691 sizeof(*qgroup_info)); 692 if (ret && ret != -EEXIST) 693 goto out; 694 695 leaf = path->nodes[0]; 696 qgroup_info = btrfs_item_ptr(leaf, path->slots[0], 697 struct btrfs_qgroup_info_item); 698 btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid); 699 btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0); 700 btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0); 701 btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0); 702 btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0); 703 704 btrfs_mark_buffer_dirty(trans, leaf); 705 706 btrfs_release_path(path); 707 708 key.type = BTRFS_QGROUP_LIMIT_KEY; 709 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 710 sizeof(*qgroup_limit)); 711 if (ret && ret != -EEXIST) 712 goto out; 713 714 leaf = path->nodes[0]; 715 qgroup_limit = btrfs_item_ptr(leaf, path->slots[0], 716 struct btrfs_qgroup_limit_item); 717 btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0); 718 btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0); 719 btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0); 720 btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); 721 btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); 722 723 btrfs_mark_buffer_dirty(trans, leaf); 724 725 ret = 0; 726 out: 727 btrfs_free_path(path); 728 return ret; 729 } 730 731 static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid) 732 { 733 int ret; 734 struct btrfs_root *quota_root = trans->fs_info->quota_root; 735 struct btrfs_path *path; 736 struct btrfs_key key; 737 738 path = btrfs_alloc_path(); 739 if (!path) 740 return -ENOMEM; 741 742 key.objectid = 0; 743 key.type = BTRFS_QGROUP_INFO_KEY; 744 key.offset = qgroupid; 745 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); 746 if (ret < 0) 747 goto out; 748 749 if (ret > 0) { 750 ret = -ENOENT; 751 goto out; 752 } 753 754 ret = btrfs_del_item(trans, quota_root, path); 755 if (ret) 756 goto out; 757 758 btrfs_release_path(path); 759 760 key.type = BTRFS_QGROUP_LIMIT_KEY; 761 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); 762 if (ret < 0) 763 goto out; 764 765 if (ret > 0) { 766 ret = -ENOENT; 767 goto out; 768 } 769 770 ret = btrfs_del_item(trans, quota_root, path); 771 772 out: 773 btrfs_free_path(path); 774 return ret; 775 } 776 777 static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, 778 struct btrfs_qgroup *qgroup) 779 { 780 struct btrfs_root *quota_root = trans->fs_info->quota_root; 781 struct btrfs_path *path; 782 struct btrfs_key key; 783 struct extent_buffer *l; 784 struct btrfs_qgroup_limit_item *qgroup_limit; 785 int ret; 786 int slot; 787 788 key.objectid = 0; 789 key.type = BTRFS_QGROUP_LIMIT_KEY; 790 key.offset = qgroup->qgroupid; 791 792 path = btrfs_alloc_path(); 793 if (!path) 794 return -ENOMEM; 795 796 ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); 797 if (ret > 0) 798 ret = -ENOENT; 799 800 if (ret) 801 goto out; 802 803 l = path->nodes[0]; 804 slot = path->slots[0]; 805 qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); 806 btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags); 807 btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer); 808 btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl); 809 btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); 810 btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); 811 812 btrfs_mark_buffer_dirty(trans, l); 813 814 out: 815 btrfs_free_path(path); 816 return ret; 817 } 818 819 static int update_qgroup_info_item(struct btrfs_trans_handle *trans, 820 struct btrfs_qgroup *qgroup) 821 { 822 struct btrfs_fs_info *fs_info = trans->fs_info; 823 struct btrfs_root *quota_root = fs_info->quota_root; 824 struct btrfs_path *path; 825 struct btrfs_key key; 826 struct extent_buffer *l; 827 struct btrfs_qgroup_info_item *qgroup_info; 828 int ret; 829 int slot; 830 831 if (btrfs_is_testing(fs_info)) 832 return 0; 833 834 key.objectid = 0; 835 key.type = BTRFS_QGROUP_INFO_KEY; 836 key.offset = qgroup->qgroupid; 837 838 path = btrfs_alloc_path(); 839 if (!path) 840 return -ENOMEM; 841 842 ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); 843 if (ret > 0) 844 ret = -ENOENT; 845 846 if (ret) 847 goto out; 848 849 l = path->nodes[0]; 850 slot = path->slots[0]; 851 qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item); 852 btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid); 853 btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer); 854 btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); 855 btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); 856 btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); 857 858 btrfs_mark_buffer_dirty(trans, l); 859 860 out: 861 btrfs_free_path(path); 862 return ret; 863 } 864 865 static int update_qgroup_status_item(struct btrfs_trans_handle *trans) 866 { 867 struct btrfs_fs_info *fs_info = trans->fs_info; 868 struct btrfs_root *quota_root = fs_info->quota_root; 869 struct btrfs_path *path; 870 struct btrfs_key key; 871 struct extent_buffer *l; 872 struct btrfs_qgroup_status_item *ptr; 873 int ret; 874 int slot; 875 876 key.objectid = 0; 877 key.type = BTRFS_QGROUP_STATUS_KEY; 878 key.offset = 0; 879 880 path = btrfs_alloc_path(); 881 if (!path) 882 return -ENOMEM; 883 884 ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); 885 if (ret > 0) 886 ret = -ENOENT; 887 888 if (ret) 889 goto out; 890 891 l = path->nodes[0]; 892 slot = path->slots[0]; 893 ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); 894 btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags & 895 BTRFS_QGROUP_STATUS_FLAGS_MASK); 896 btrfs_set_qgroup_status_generation(l, ptr, trans->transid); 897 btrfs_set_qgroup_status_rescan(l, ptr, 898 fs_info->qgroup_rescan_progress.objectid); 899 900 btrfs_mark_buffer_dirty(trans, l); 901 902 out: 903 btrfs_free_path(path); 904 return ret; 905 } 906 907 /* 908 * called with qgroup_lock held 909 */ 910 static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, 911 struct btrfs_root *root) 912 { 913 struct btrfs_path *path; 914 struct btrfs_key key; 915 struct extent_buffer *leaf = NULL; 916 int ret; 917 int nr = 0; 918 919 path = btrfs_alloc_path(); 920 if (!path) 921 return -ENOMEM; 922 923 key.objectid = 0; 924 key.offset = 0; 925 key.type = 0; 926 927 while (1) { 928 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 929 if (ret < 0) 930 goto out; 931 leaf = path->nodes[0]; 932 nr = btrfs_header_nritems(leaf); 933 if (!nr) 934 break; 935 /* 936 * delete the leaf one by one 937 * since the whole tree is going 938 * to be deleted. 939 */ 940 path->slots[0] = 0; 941 ret = btrfs_del_items(trans, root, path, 0, nr); 942 if (ret) 943 goto out; 944 945 btrfs_release_path(path); 946 } 947 ret = 0; 948 out: 949 btrfs_free_path(path); 950 return ret; 951 } 952 953 int btrfs_quota_enable(struct btrfs_fs_info *fs_info) 954 { 955 struct btrfs_root *quota_root; 956 struct btrfs_root *tree_root = fs_info->tree_root; 957 struct btrfs_path *path = NULL; 958 struct btrfs_qgroup_status_item *ptr; 959 struct extent_buffer *leaf; 960 struct btrfs_key key; 961 struct btrfs_key found_key; 962 struct btrfs_qgroup *qgroup = NULL; 963 struct btrfs_trans_handle *trans = NULL; 964 struct ulist *ulist = NULL; 965 int ret = 0; 966 int slot; 967 968 /* 969 * We need to have subvol_sem write locked, to prevent races between 970 * concurrent tasks trying to enable quotas, because we will unlock 971 * and relock qgroup_ioctl_lock before setting fs_info->quota_root 972 * and before setting BTRFS_FS_QUOTA_ENABLED. 973 */ 974 lockdep_assert_held_write(&fs_info->subvol_sem); 975 976 if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 977 btrfs_err(fs_info, 978 "qgroups are currently unsupported in extent tree v2"); 979 return -EINVAL; 980 } 981 982 mutex_lock(&fs_info->qgroup_ioctl_lock); 983 if (fs_info->quota_root) 984 goto out; 985 986 ulist = ulist_alloc(GFP_KERNEL); 987 if (!ulist) { 988 ret = -ENOMEM; 989 goto out; 990 } 991 992 ret = btrfs_sysfs_add_qgroups(fs_info); 993 if (ret < 0) 994 goto out; 995 996 /* 997 * Unlock qgroup_ioctl_lock before starting the transaction. This is to 998 * avoid lock acquisition inversion problems (reported by lockdep) between 999 * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we 1000 * start a transaction. 1001 * After we started the transaction lock qgroup_ioctl_lock again and 1002 * check if someone else created the quota root in the meanwhile. If so, 1003 * just return success and release the transaction handle. 1004 * 1005 * Also we don't need to worry about someone else calling 1006 * btrfs_sysfs_add_qgroups() after we unlock and getting an error because 1007 * that function returns 0 (success) when the sysfs entries already exist. 1008 */ 1009 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1010 1011 /* 1012 * 1 for quota root item 1013 * 1 for BTRFS_QGROUP_STATUS item 1014 * 1015 * Yet we also need 2*n items for a QGROUP_INFO/QGROUP_LIMIT items 1016 * per subvolume. However those are not currently reserved since it 1017 * would be a lot of overkill. 1018 */ 1019 trans = btrfs_start_transaction(tree_root, 2); 1020 1021 mutex_lock(&fs_info->qgroup_ioctl_lock); 1022 if (IS_ERR(trans)) { 1023 ret = PTR_ERR(trans); 1024 trans = NULL; 1025 goto out; 1026 } 1027 1028 if (fs_info->quota_root) 1029 goto out; 1030 1031 fs_info->qgroup_ulist = ulist; 1032 ulist = NULL; 1033 1034 /* 1035 * initially create the quota tree 1036 */ 1037 quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID); 1038 if (IS_ERR(quota_root)) { 1039 ret = PTR_ERR(quota_root); 1040 btrfs_abort_transaction(trans, ret); 1041 goto out; 1042 } 1043 1044 path = btrfs_alloc_path(); 1045 if (!path) { 1046 ret = -ENOMEM; 1047 btrfs_abort_transaction(trans, ret); 1048 goto out_free_root; 1049 } 1050 1051 key.objectid = 0; 1052 key.type = BTRFS_QGROUP_STATUS_KEY; 1053 key.offset = 0; 1054 1055 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 1056 sizeof(*ptr)); 1057 if (ret) { 1058 btrfs_abort_transaction(trans, ret); 1059 goto out_free_path; 1060 } 1061 1062 leaf = path->nodes[0]; 1063 ptr = btrfs_item_ptr(leaf, path->slots[0], 1064 struct btrfs_qgroup_status_item); 1065 btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid); 1066 btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); 1067 fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON | 1068 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 1069 btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags & 1070 BTRFS_QGROUP_STATUS_FLAGS_MASK); 1071 btrfs_set_qgroup_status_rescan(leaf, ptr, 0); 1072 1073 btrfs_mark_buffer_dirty(trans, leaf); 1074 1075 key.objectid = 0; 1076 key.type = BTRFS_ROOT_REF_KEY; 1077 key.offset = 0; 1078 1079 btrfs_release_path(path); 1080 ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0); 1081 if (ret > 0) 1082 goto out_add_root; 1083 if (ret < 0) { 1084 btrfs_abort_transaction(trans, ret); 1085 goto out_free_path; 1086 } 1087 1088 while (1) { 1089 slot = path->slots[0]; 1090 leaf = path->nodes[0]; 1091 btrfs_item_key_to_cpu(leaf, &found_key, slot); 1092 1093 if (found_key.type == BTRFS_ROOT_REF_KEY) { 1094 1095 /* Release locks on tree_root before we access quota_root */ 1096 btrfs_release_path(path); 1097 1098 ret = add_qgroup_item(trans, quota_root, 1099 found_key.offset); 1100 if (ret) { 1101 btrfs_abort_transaction(trans, ret); 1102 goto out_free_path; 1103 } 1104 1105 qgroup = add_qgroup_rb(fs_info, found_key.offset); 1106 if (IS_ERR(qgroup)) { 1107 ret = PTR_ERR(qgroup); 1108 btrfs_abort_transaction(trans, ret); 1109 goto out_free_path; 1110 } 1111 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 1112 if (ret < 0) { 1113 btrfs_abort_transaction(trans, ret); 1114 goto out_free_path; 1115 } 1116 ret = btrfs_search_slot_for_read(tree_root, &found_key, 1117 path, 1, 0); 1118 if (ret < 0) { 1119 btrfs_abort_transaction(trans, ret); 1120 goto out_free_path; 1121 } 1122 if (ret > 0) { 1123 /* 1124 * Shouldn't happen, but in case it does we 1125 * don't need to do the btrfs_next_item, just 1126 * continue. 1127 */ 1128 continue; 1129 } 1130 } 1131 ret = btrfs_next_item(tree_root, path); 1132 if (ret < 0) { 1133 btrfs_abort_transaction(trans, ret); 1134 goto out_free_path; 1135 } 1136 if (ret) 1137 break; 1138 } 1139 1140 out_add_root: 1141 btrfs_release_path(path); 1142 ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID); 1143 if (ret) { 1144 btrfs_abort_transaction(trans, ret); 1145 goto out_free_path; 1146 } 1147 1148 qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID); 1149 if (IS_ERR(qgroup)) { 1150 ret = PTR_ERR(qgroup); 1151 btrfs_abort_transaction(trans, ret); 1152 goto out_free_path; 1153 } 1154 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 1155 if (ret < 0) { 1156 btrfs_abort_transaction(trans, ret); 1157 goto out_free_path; 1158 } 1159 1160 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1161 /* 1162 * Commit the transaction while not holding qgroup_ioctl_lock, to avoid 1163 * a deadlock with tasks concurrently doing other qgroup operations, such 1164 * adding/removing qgroups or adding/deleting qgroup relations for example, 1165 * because all qgroup operations first start or join a transaction and then 1166 * lock the qgroup_ioctl_lock mutex. 1167 * We are safe from a concurrent task trying to enable quotas, by calling 1168 * this function, since we are serialized by fs_info->subvol_sem. 1169 */ 1170 ret = btrfs_commit_transaction(trans); 1171 trans = NULL; 1172 mutex_lock(&fs_info->qgroup_ioctl_lock); 1173 if (ret) 1174 goto out_free_path; 1175 1176 /* 1177 * Set quota enabled flag after committing the transaction, to avoid 1178 * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot 1179 * creation. 1180 */ 1181 spin_lock(&fs_info->qgroup_lock); 1182 fs_info->quota_root = quota_root; 1183 set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 1184 spin_unlock(&fs_info->qgroup_lock); 1185 1186 ret = qgroup_rescan_init(fs_info, 0, 1); 1187 if (!ret) { 1188 qgroup_rescan_zero_tracking(fs_info); 1189 fs_info->qgroup_rescan_running = true; 1190 btrfs_queue_work(fs_info->qgroup_rescan_workers, 1191 &fs_info->qgroup_rescan_work); 1192 } else { 1193 /* 1194 * We have set both BTRFS_FS_QUOTA_ENABLED and 1195 * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with 1196 * -EINPROGRESS. That can happen because someone started the 1197 * rescan worker by calling quota rescan ioctl before we 1198 * attempted to initialize the rescan worker. Failure due to 1199 * quotas disabled in the meanwhile is not possible, because 1200 * we are holding a write lock on fs_info->subvol_sem, which 1201 * is also acquired when disabling quotas. 1202 * Ignore such error, and any other error would need to undo 1203 * everything we did in the transaction we just committed. 1204 */ 1205 ASSERT(ret == -EINPROGRESS); 1206 ret = 0; 1207 } 1208 1209 out_free_path: 1210 btrfs_free_path(path); 1211 out_free_root: 1212 if (ret) 1213 btrfs_put_root(quota_root); 1214 out: 1215 if (ret) { 1216 ulist_free(fs_info->qgroup_ulist); 1217 fs_info->qgroup_ulist = NULL; 1218 btrfs_sysfs_del_qgroups(fs_info); 1219 } 1220 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1221 if (ret && trans) 1222 btrfs_end_transaction(trans); 1223 else if (trans) 1224 ret = btrfs_end_transaction(trans); 1225 ulist_free(ulist); 1226 return ret; 1227 } 1228 1229 int btrfs_quota_disable(struct btrfs_fs_info *fs_info) 1230 { 1231 struct btrfs_root *quota_root = NULL; 1232 struct btrfs_trans_handle *trans = NULL; 1233 int ret = 0; 1234 1235 /* 1236 * We need to have subvol_sem write locked to prevent races with 1237 * snapshot creation. 1238 */ 1239 lockdep_assert_held_write(&fs_info->subvol_sem); 1240 1241 /* 1242 * Lock the cleaner mutex to prevent races with concurrent relocation, 1243 * because relocation may be building backrefs for blocks of the quota 1244 * root while we are deleting the root. This is like dropping fs roots 1245 * of deleted snapshots/subvolumes, we need the same protection. 1246 * 1247 * This also prevents races between concurrent tasks trying to disable 1248 * quotas, because we will unlock and relock qgroup_ioctl_lock across 1249 * BTRFS_FS_QUOTA_ENABLED changes. 1250 */ 1251 mutex_lock(&fs_info->cleaner_mutex); 1252 1253 mutex_lock(&fs_info->qgroup_ioctl_lock); 1254 if (!fs_info->quota_root) 1255 goto out; 1256 1257 /* 1258 * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to 1259 * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs 1260 * to lock that mutex while holding a transaction handle and the rescan 1261 * worker needs to commit a transaction. 1262 */ 1263 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1264 1265 /* 1266 * Request qgroup rescan worker to complete and wait for it. This wait 1267 * must be done before transaction start for quota disable since it may 1268 * deadlock with transaction by the qgroup rescan worker. 1269 */ 1270 clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 1271 btrfs_qgroup_wait_for_completion(fs_info, false); 1272 1273 /* 1274 * 1 For the root item 1275 * 1276 * We should also reserve enough items for the quota tree deletion in 1277 * btrfs_clean_quota_tree but this is not done. 1278 * 1279 * Also, we must always start a transaction without holding the mutex 1280 * qgroup_ioctl_lock, see btrfs_quota_enable(). 1281 */ 1282 trans = btrfs_start_transaction(fs_info->tree_root, 1); 1283 1284 mutex_lock(&fs_info->qgroup_ioctl_lock); 1285 if (IS_ERR(trans)) { 1286 ret = PTR_ERR(trans); 1287 trans = NULL; 1288 set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 1289 goto out; 1290 } 1291 1292 if (!fs_info->quota_root) 1293 goto out; 1294 1295 spin_lock(&fs_info->qgroup_lock); 1296 quota_root = fs_info->quota_root; 1297 fs_info->quota_root = NULL; 1298 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; 1299 fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; 1300 spin_unlock(&fs_info->qgroup_lock); 1301 1302 btrfs_free_qgroup_config(fs_info); 1303 1304 ret = btrfs_clean_quota_tree(trans, quota_root); 1305 if (ret) { 1306 btrfs_abort_transaction(trans, ret); 1307 goto out; 1308 } 1309 1310 ret = btrfs_del_root(trans, "a_root->root_key); 1311 if (ret) { 1312 btrfs_abort_transaction(trans, ret); 1313 goto out; 1314 } 1315 1316 spin_lock(&fs_info->trans_lock); 1317 list_del("a_root->dirty_list); 1318 spin_unlock(&fs_info->trans_lock); 1319 1320 btrfs_tree_lock(quota_root->node); 1321 btrfs_clear_buffer_dirty(trans, quota_root->node); 1322 btrfs_tree_unlock(quota_root->node); 1323 btrfs_free_tree_block(trans, btrfs_root_id(quota_root), 1324 quota_root->node, 0, 1); 1325 1326 1327 out: 1328 btrfs_put_root(quota_root); 1329 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1330 if (ret && trans) 1331 btrfs_end_transaction(trans); 1332 else if (trans) 1333 ret = btrfs_end_transaction(trans); 1334 mutex_unlock(&fs_info->cleaner_mutex); 1335 1336 return ret; 1337 } 1338 1339 static void qgroup_dirty(struct btrfs_fs_info *fs_info, 1340 struct btrfs_qgroup *qgroup) 1341 { 1342 if (list_empty(&qgroup->dirty)) 1343 list_add(&qgroup->dirty, &fs_info->dirty_qgroups); 1344 } 1345 1346 static void qgroup_iterator_add(struct list_head *head, struct btrfs_qgroup *qgroup) 1347 { 1348 if (!list_empty(&qgroup->iterator)) 1349 return; 1350 1351 list_add_tail(&qgroup->iterator, head); 1352 } 1353 1354 static void qgroup_iterator_clean(struct list_head *head) 1355 { 1356 while (!list_empty(head)) { 1357 struct btrfs_qgroup *qgroup; 1358 1359 qgroup = list_first_entry(head, struct btrfs_qgroup, iterator); 1360 list_del_init(&qgroup->iterator); 1361 } 1362 } 1363 1364 /* 1365 * The easy accounting, we're updating qgroup relationship whose child qgroup 1366 * only has exclusive extents. 1367 * 1368 * In this case, all exclusive extents will also be exclusive for parent, so 1369 * excl/rfer just get added/removed. 1370 * 1371 * So is qgroup reservation space, which should also be added/removed to 1372 * parent. 1373 * Or when child tries to release reservation space, parent will underflow its 1374 * reservation (for relationship adding case). 1375 * 1376 * Caller should hold fs_info->qgroup_lock. 1377 */ 1378 static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, 1379 struct ulist *tmp, u64 ref_root, 1380 struct btrfs_qgroup *src, int sign) 1381 { 1382 struct btrfs_qgroup *qgroup; 1383 struct btrfs_qgroup_list *glist; 1384 struct ulist_node *unode; 1385 struct ulist_iterator uiter; 1386 u64 num_bytes = src->excl; 1387 int ret = 0; 1388 1389 qgroup = find_qgroup_rb(fs_info, ref_root); 1390 if (!qgroup) 1391 goto out; 1392 1393 qgroup->rfer += sign * num_bytes; 1394 qgroup->rfer_cmpr += sign * num_bytes; 1395 1396 WARN_ON(sign < 0 && qgroup->excl < num_bytes); 1397 qgroup->excl += sign * num_bytes; 1398 qgroup->excl_cmpr += sign * num_bytes; 1399 1400 if (sign > 0) 1401 qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); 1402 else 1403 qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); 1404 1405 qgroup_dirty(fs_info, qgroup); 1406 1407 /* Get all of the parent groups that contain this qgroup */ 1408 list_for_each_entry(glist, &qgroup->groups, next_group) { 1409 ret = ulist_add(tmp, glist->group->qgroupid, 1410 qgroup_to_aux(glist->group), GFP_ATOMIC); 1411 if (ret < 0) 1412 goto out; 1413 } 1414 1415 /* Iterate all of the parents and adjust their reference counts */ 1416 ULIST_ITER_INIT(&uiter); 1417 while ((unode = ulist_next(tmp, &uiter))) { 1418 qgroup = unode_aux_to_qgroup(unode); 1419 qgroup->rfer += sign * num_bytes; 1420 qgroup->rfer_cmpr += sign * num_bytes; 1421 WARN_ON(sign < 0 && qgroup->excl < num_bytes); 1422 qgroup->excl += sign * num_bytes; 1423 if (sign > 0) 1424 qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); 1425 else 1426 qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); 1427 qgroup->excl_cmpr += sign * num_bytes; 1428 qgroup_dirty(fs_info, qgroup); 1429 1430 /* Add any parents of the parents */ 1431 list_for_each_entry(glist, &qgroup->groups, next_group) { 1432 ret = ulist_add(tmp, glist->group->qgroupid, 1433 qgroup_to_aux(glist->group), GFP_ATOMIC); 1434 if (ret < 0) 1435 goto out; 1436 } 1437 } 1438 ret = 0; 1439 out: 1440 return ret; 1441 } 1442 1443 1444 /* 1445 * Quick path for updating qgroup with only excl refs. 1446 * 1447 * In that case, just update all parent will be enough. 1448 * Or we needs to do a full rescan. 1449 * Caller should also hold fs_info->qgroup_lock. 1450 * 1451 * Return 0 for quick update, return >0 for need to full rescan 1452 * and mark INCONSISTENT flag. 1453 * Return < 0 for other error. 1454 */ 1455 static int quick_update_accounting(struct btrfs_fs_info *fs_info, 1456 struct ulist *tmp, u64 src, u64 dst, 1457 int sign) 1458 { 1459 struct btrfs_qgroup *qgroup; 1460 int ret = 1; 1461 int err = 0; 1462 1463 qgroup = find_qgroup_rb(fs_info, src); 1464 if (!qgroup) 1465 goto out; 1466 if (qgroup->excl == qgroup->rfer) { 1467 ret = 0; 1468 err = __qgroup_excl_accounting(fs_info, tmp, dst, 1469 qgroup, sign); 1470 if (err < 0) { 1471 ret = err; 1472 goto out; 1473 } 1474 } 1475 out: 1476 if (ret) 1477 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 1478 return ret; 1479 } 1480 1481 int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, 1482 u64 dst) 1483 { 1484 struct btrfs_fs_info *fs_info = trans->fs_info; 1485 struct btrfs_qgroup *parent; 1486 struct btrfs_qgroup *member; 1487 struct btrfs_qgroup_list *list; 1488 struct ulist *tmp; 1489 unsigned int nofs_flag; 1490 int ret = 0; 1491 1492 /* Check the level of src and dst first */ 1493 if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) 1494 return -EINVAL; 1495 1496 /* We hold a transaction handle open, must do a NOFS allocation. */ 1497 nofs_flag = memalloc_nofs_save(); 1498 tmp = ulist_alloc(GFP_KERNEL); 1499 memalloc_nofs_restore(nofs_flag); 1500 if (!tmp) 1501 return -ENOMEM; 1502 1503 mutex_lock(&fs_info->qgroup_ioctl_lock); 1504 if (!fs_info->quota_root) { 1505 ret = -ENOTCONN; 1506 goto out; 1507 } 1508 member = find_qgroup_rb(fs_info, src); 1509 parent = find_qgroup_rb(fs_info, dst); 1510 if (!member || !parent) { 1511 ret = -EINVAL; 1512 goto out; 1513 } 1514 1515 /* check if such qgroup relation exist firstly */ 1516 list_for_each_entry(list, &member->groups, next_group) { 1517 if (list->group == parent) { 1518 ret = -EEXIST; 1519 goto out; 1520 } 1521 } 1522 1523 ret = add_qgroup_relation_item(trans, src, dst); 1524 if (ret) 1525 goto out; 1526 1527 ret = add_qgroup_relation_item(trans, dst, src); 1528 if (ret) { 1529 del_qgroup_relation_item(trans, src, dst); 1530 goto out; 1531 } 1532 1533 spin_lock(&fs_info->qgroup_lock); 1534 ret = __add_relation_rb(member, parent); 1535 if (ret < 0) { 1536 spin_unlock(&fs_info->qgroup_lock); 1537 goto out; 1538 } 1539 ret = quick_update_accounting(fs_info, tmp, src, dst, 1); 1540 spin_unlock(&fs_info->qgroup_lock); 1541 out: 1542 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1543 ulist_free(tmp); 1544 return ret; 1545 } 1546 1547 static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, 1548 u64 dst) 1549 { 1550 struct btrfs_fs_info *fs_info = trans->fs_info; 1551 struct btrfs_qgroup *parent; 1552 struct btrfs_qgroup *member; 1553 struct btrfs_qgroup_list *list; 1554 struct ulist *tmp; 1555 bool found = false; 1556 unsigned int nofs_flag; 1557 int ret = 0; 1558 int ret2; 1559 1560 /* We hold a transaction handle open, must do a NOFS allocation. */ 1561 nofs_flag = memalloc_nofs_save(); 1562 tmp = ulist_alloc(GFP_KERNEL); 1563 memalloc_nofs_restore(nofs_flag); 1564 if (!tmp) 1565 return -ENOMEM; 1566 1567 if (!fs_info->quota_root) { 1568 ret = -ENOTCONN; 1569 goto out; 1570 } 1571 1572 member = find_qgroup_rb(fs_info, src); 1573 parent = find_qgroup_rb(fs_info, dst); 1574 /* 1575 * The parent/member pair doesn't exist, then try to delete the dead 1576 * relation items only. 1577 */ 1578 if (!member || !parent) 1579 goto delete_item; 1580 1581 /* check if such qgroup relation exist firstly */ 1582 list_for_each_entry(list, &member->groups, next_group) { 1583 if (list->group == parent) { 1584 found = true; 1585 break; 1586 } 1587 } 1588 1589 delete_item: 1590 ret = del_qgroup_relation_item(trans, src, dst); 1591 if (ret < 0 && ret != -ENOENT) 1592 goto out; 1593 ret2 = del_qgroup_relation_item(trans, dst, src); 1594 if (ret2 < 0 && ret2 != -ENOENT) 1595 goto out; 1596 1597 /* At least one deletion succeeded, return 0 */ 1598 if (!ret || !ret2) 1599 ret = 0; 1600 1601 if (found) { 1602 spin_lock(&fs_info->qgroup_lock); 1603 del_relation_rb(fs_info, src, dst); 1604 ret = quick_update_accounting(fs_info, tmp, src, dst, -1); 1605 spin_unlock(&fs_info->qgroup_lock); 1606 } 1607 out: 1608 ulist_free(tmp); 1609 return ret; 1610 } 1611 1612 int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, 1613 u64 dst) 1614 { 1615 struct btrfs_fs_info *fs_info = trans->fs_info; 1616 int ret = 0; 1617 1618 mutex_lock(&fs_info->qgroup_ioctl_lock); 1619 ret = __del_qgroup_relation(trans, src, dst); 1620 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1621 1622 return ret; 1623 } 1624 1625 int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) 1626 { 1627 struct btrfs_fs_info *fs_info = trans->fs_info; 1628 struct btrfs_root *quota_root; 1629 struct btrfs_qgroup *qgroup; 1630 int ret = 0; 1631 1632 mutex_lock(&fs_info->qgroup_ioctl_lock); 1633 if (!fs_info->quota_root) { 1634 ret = -ENOTCONN; 1635 goto out; 1636 } 1637 quota_root = fs_info->quota_root; 1638 qgroup = find_qgroup_rb(fs_info, qgroupid); 1639 if (qgroup) { 1640 ret = -EEXIST; 1641 goto out; 1642 } 1643 1644 ret = add_qgroup_item(trans, quota_root, qgroupid); 1645 if (ret) 1646 goto out; 1647 1648 spin_lock(&fs_info->qgroup_lock); 1649 qgroup = add_qgroup_rb(fs_info, qgroupid); 1650 spin_unlock(&fs_info->qgroup_lock); 1651 1652 if (IS_ERR(qgroup)) { 1653 ret = PTR_ERR(qgroup); 1654 goto out; 1655 } 1656 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 1657 out: 1658 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1659 return ret; 1660 } 1661 1662 static bool qgroup_has_usage(struct btrfs_qgroup *qgroup) 1663 { 1664 return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 || 1665 qgroup->excl > 0 || qgroup->excl_cmpr > 0 || 1666 qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 || 1667 qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 || 1668 qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0); 1669 } 1670 1671 int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) 1672 { 1673 struct btrfs_fs_info *fs_info = trans->fs_info; 1674 struct btrfs_qgroup *qgroup; 1675 struct btrfs_qgroup_list *list; 1676 int ret = 0; 1677 1678 mutex_lock(&fs_info->qgroup_ioctl_lock); 1679 if (!fs_info->quota_root) { 1680 ret = -ENOTCONN; 1681 goto out; 1682 } 1683 1684 qgroup = find_qgroup_rb(fs_info, qgroupid); 1685 if (!qgroup) { 1686 ret = -ENOENT; 1687 goto out; 1688 } 1689 1690 if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) { 1691 ret = -EBUSY; 1692 goto out; 1693 } 1694 1695 /* Check if there are no children of this qgroup */ 1696 if (!list_empty(&qgroup->members)) { 1697 ret = -EBUSY; 1698 goto out; 1699 } 1700 1701 ret = del_qgroup_item(trans, qgroupid); 1702 if (ret && ret != -ENOENT) 1703 goto out; 1704 1705 while (!list_empty(&qgroup->groups)) { 1706 list = list_first_entry(&qgroup->groups, 1707 struct btrfs_qgroup_list, next_group); 1708 ret = __del_qgroup_relation(trans, qgroupid, 1709 list->group->qgroupid); 1710 if (ret) 1711 goto out; 1712 } 1713 1714 spin_lock(&fs_info->qgroup_lock); 1715 del_qgroup_rb(fs_info, qgroupid); 1716 spin_unlock(&fs_info->qgroup_lock); 1717 1718 /* 1719 * Remove the qgroup from sysfs now without holding the qgroup_lock 1720 * spinlock, since the sysfs_remove_group() function needs to take 1721 * the mutex kernfs_mutex through kernfs_remove_by_name_ns(). 1722 */ 1723 btrfs_sysfs_del_one_qgroup(fs_info, qgroup); 1724 kfree(qgroup); 1725 out: 1726 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1727 return ret; 1728 } 1729 1730 int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, 1731 struct btrfs_qgroup_limit *limit) 1732 { 1733 struct btrfs_fs_info *fs_info = trans->fs_info; 1734 struct btrfs_qgroup *qgroup; 1735 int ret = 0; 1736 /* Sometimes we would want to clear the limit on this qgroup. 1737 * To meet this requirement, we treat the -1 as a special value 1738 * which tell kernel to clear the limit on this qgroup. 1739 */ 1740 const u64 CLEAR_VALUE = -1; 1741 1742 mutex_lock(&fs_info->qgroup_ioctl_lock); 1743 if (!fs_info->quota_root) { 1744 ret = -ENOTCONN; 1745 goto out; 1746 } 1747 1748 qgroup = find_qgroup_rb(fs_info, qgroupid); 1749 if (!qgroup) { 1750 ret = -ENOENT; 1751 goto out; 1752 } 1753 1754 spin_lock(&fs_info->qgroup_lock); 1755 if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) { 1756 if (limit->max_rfer == CLEAR_VALUE) { 1757 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; 1758 limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; 1759 qgroup->max_rfer = 0; 1760 } else { 1761 qgroup->max_rfer = limit->max_rfer; 1762 } 1763 } 1764 if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { 1765 if (limit->max_excl == CLEAR_VALUE) { 1766 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; 1767 limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; 1768 qgroup->max_excl = 0; 1769 } else { 1770 qgroup->max_excl = limit->max_excl; 1771 } 1772 } 1773 if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) { 1774 if (limit->rsv_rfer == CLEAR_VALUE) { 1775 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; 1776 limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; 1777 qgroup->rsv_rfer = 0; 1778 } else { 1779 qgroup->rsv_rfer = limit->rsv_rfer; 1780 } 1781 } 1782 if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) { 1783 if (limit->rsv_excl == CLEAR_VALUE) { 1784 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; 1785 limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; 1786 qgroup->rsv_excl = 0; 1787 } else { 1788 qgroup->rsv_excl = limit->rsv_excl; 1789 } 1790 } 1791 qgroup->lim_flags |= limit->flags; 1792 1793 spin_unlock(&fs_info->qgroup_lock); 1794 1795 ret = update_qgroup_limit_item(trans, qgroup); 1796 if (ret) { 1797 qgroup_mark_inconsistent(fs_info); 1798 btrfs_info(fs_info, "unable to update quota limit for %llu", 1799 qgroupid); 1800 } 1801 1802 out: 1803 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1804 return ret; 1805 } 1806 1807 int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, 1808 struct btrfs_delayed_ref_root *delayed_refs, 1809 struct btrfs_qgroup_extent_record *record) 1810 { 1811 struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; 1812 struct rb_node *parent_node = NULL; 1813 struct btrfs_qgroup_extent_record *entry; 1814 u64 bytenr = record->bytenr; 1815 1816 lockdep_assert_held(&delayed_refs->lock); 1817 trace_btrfs_qgroup_trace_extent(fs_info, record); 1818 1819 while (*p) { 1820 parent_node = *p; 1821 entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, 1822 node); 1823 if (bytenr < entry->bytenr) { 1824 p = &(*p)->rb_left; 1825 } else if (bytenr > entry->bytenr) { 1826 p = &(*p)->rb_right; 1827 } else { 1828 if (record->data_rsv && !entry->data_rsv) { 1829 entry->data_rsv = record->data_rsv; 1830 entry->data_rsv_refroot = 1831 record->data_rsv_refroot; 1832 } 1833 return 1; 1834 } 1835 } 1836 1837 rb_link_node(&record->node, parent_node, p); 1838 rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); 1839 return 0; 1840 } 1841 1842 int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, 1843 struct btrfs_qgroup_extent_record *qrecord) 1844 { 1845 struct btrfs_backref_walk_ctx ctx = { 0 }; 1846 int ret; 1847 1848 /* 1849 * We are always called in a context where we are already holding a 1850 * transaction handle. Often we are called when adding a data delayed 1851 * reference from btrfs_truncate_inode_items() (truncating or unlinking), 1852 * in which case we will be holding a write lock on extent buffer from a 1853 * subvolume tree. In this case we can't allow btrfs_find_all_roots() to 1854 * acquire fs_info->commit_root_sem, because that is a higher level lock 1855 * that must be acquired before locking any extent buffers. 1856 * 1857 * So we want btrfs_find_all_roots() to not acquire the commit_root_sem 1858 * but we can't pass it a non-NULL transaction handle, because otherwise 1859 * it would not use commit roots and would lock extent buffers, causing 1860 * a deadlock if it ends up trying to read lock the same extent buffer 1861 * that was previously write locked at btrfs_truncate_inode_items(). 1862 * 1863 * So pass a NULL transaction handle to btrfs_find_all_roots() and 1864 * explicitly tell it to not acquire the commit_root_sem - if we are 1865 * holding a transaction handle we don't need its protection. 1866 */ 1867 ASSERT(trans != NULL); 1868 1869 if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) 1870 return 0; 1871 1872 ctx.bytenr = qrecord->bytenr; 1873 ctx.fs_info = trans->fs_info; 1874 1875 ret = btrfs_find_all_roots(&ctx, true); 1876 if (ret < 0) { 1877 qgroup_mark_inconsistent(trans->fs_info); 1878 btrfs_warn(trans->fs_info, 1879 "error accounting new delayed refs extent (err code: %d), quota inconsistent", 1880 ret); 1881 return 0; 1882 } 1883 1884 /* 1885 * Here we don't need to get the lock of 1886 * trans->transaction->delayed_refs, since inserted qrecord won't 1887 * be deleted, only qrecord->node may be modified (new qrecord insert) 1888 * 1889 * So modifying qrecord->old_roots is safe here 1890 */ 1891 qrecord->old_roots = ctx.roots; 1892 return 0; 1893 } 1894 1895 int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, 1896 u64 num_bytes) 1897 { 1898 struct btrfs_fs_info *fs_info = trans->fs_info; 1899 struct btrfs_qgroup_extent_record *record; 1900 struct btrfs_delayed_ref_root *delayed_refs; 1901 int ret; 1902 1903 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) 1904 || bytenr == 0 || num_bytes == 0) 1905 return 0; 1906 record = kzalloc(sizeof(*record), GFP_NOFS); 1907 if (!record) 1908 return -ENOMEM; 1909 1910 delayed_refs = &trans->transaction->delayed_refs; 1911 record->bytenr = bytenr; 1912 record->num_bytes = num_bytes; 1913 record->old_roots = NULL; 1914 1915 spin_lock(&delayed_refs->lock); 1916 ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record); 1917 spin_unlock(&delayed_refs->lock); 1918 if (ret > 0) { 1919 kfree(record); 1920 return 0; 1921 } 1922 return btrfs_qgroup_trace_extent_post(trans, record); 1923 } 1924 1925 int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, 1926 struct extent_buffer *eb) 1927 { 1928 struct btrfs_fs_info *fs_info = trans->fs_info; 1929 int nr = btrfs_header_nritems(eb); 1930 int i, extent_type, ret; 1931 struct btrfs_key key; 1932 struct btrfs_file_extent_item *fi; 1933 u64 bytenr, num_bytes; 1934 1935 /* We can be called directly from walk_up_proc() */ 1936 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 1937 return 0; 1938 1939 for (i = 0; i < nr; i++) { 1940 btrfs_item_key_to_cpu(eb, &key, i); 1941 1942 if (key.type != BTRFS_EXTENT_DATA_KEY) 1943 continue; 1944 1945 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 1946 /* filter out non qgroup-accountable extents */ 1947 extent_type = btrfs_file_extent_type(eb, fi); 1948 1949 if (extent_type == BTRFS_FILE_EXTENT_INLINE) 1950 continue; 1951 1952 bytenr = btrfs_file_extent_disk_bytenr(eb, fi); 1953 if (!bytenr) 1954 continue; 1955 1956 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); 1957 1958 ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes); 1959 if (ret) 1960 return ret; 1961 } 1962 cond_resched(); 1963 return 0; 1964 } 1965 1966 /* 1967 * Walk up the tree from the bottom, freeing leaves and any interior 1968 * nodes which have had all slots visited. If a node (leaf or 1969 * interior) is freed, the node above it will have it's slot 1970 * incremented. The root node will never be freed. 1971 * 1972 * At the end of this function, we should have a path which has all 1973 * slots incremented to the next position for a search. If we need to 1974 * read a new node it will be NULL and the node above it will have the 1975 * correct slot selected for a later read. 1976 * 1977 * If we increment the root nodes slot counter past the number of 1978 * elements, 1 is returned to signal completion of the search. 1979 */ 1980 static int adjust_slots_upwards(struct btrfs_path *path, int root_level) 1981 { 1982 int level = 0; 1983 int nr, slot; 1984 struct extent_buffer *eb; 1985 1986 if (root_level == 0) 1987 return 1; 1988 1989 while (level <= root_level) { 1990 eb = path->nodes[level]; 1991 nr = btrfs_header_nritems(eb); 1992 path->slots[level]++; 1993 slot = path->slots[level]; 1994 if (slot >= nr || level == 0) { 1995 /* 1996 * Don't free the root - we will detect this 1997 * condition after our loop and return a 1998 * positive value for caller to stop walking the tree. 1999 */ 2000 if (level != root_level) { 2001 btrfs_tree_unlock_rw(eb, path->locks[level]); 2002 path->locks[level] = 0; 2003 2004 free_extent_buffer(eb); 2005 path->nodes[level] = NULL; 2006 path->slots[level] = 0; 2007 } 2008 } else { 2009 /* 2010 * We have a valid slot to walk back down 2011 * from. Stop here so caller can process these 2012 * new nodes. 2013 */ 2014 break; 2015 } 2016 2017 level++; 2018 } 2019 2020 eb = path->nodes[root_level]; 2021 if (path->slots[root_level] >= btrfs_header_nritems(eb)) 2022 return 1; 2023 2024 return 0; 2025 } 2026 2027 /* 2028 * Helper function to trace a subtree tree block swap. 2029 * 2030 * The swap will happen in highest tree block, but there may be a lot of 2031 * tree blocks involved. 2032 * 2033 * For example: 2034 * OO = Old tree blocks 2035 * NN = New tree blocks allocated during balance 2036 * 2037 * File tree (257) Reloc tree for 257 2038 * L2 OO NN 2039 * / \ / \ 2040 * L1 OO OO (a) OO NN (a) 2041 * / \ / \ / \ / \ 2042 * L0 OO OO OO OO OO OO NN NN 2043 * (b) (c) (b) (c) 2044 * 2045 * When calling qgroup_trace_extent_swap(), we will pass: 2046 * @src_eb = OO(a) 2047 * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ] 2048 * @dst_level = 0 2049 * @root_level = 1 2050 * 2051 * In that case, qgroup_trace_extent_swap() will search from OO(a) to 2052 * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty. 2053 * 2054 * The main work of qgroup_trace_extent_swap() can be split into 3 parts: 2055 * 2056 * 1) Tree search from @src_eb 2057 * It should acts as a simplified btrfs_search_slot(). 2058 * The key for search can be extracted from @dst_path->nodes[dst_level] 2059 * (first key). 2060 * 2061 * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty 2062 * NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty. 2063 * They should be marked during previous (@dst_level = 1) iteration. 2064 * 2065 * 3) Mark file extents in leaves dirty 2066 * We don't have good way to pick out new file extents only. 2067 * So we still follow the old method by scanning all file extents in 2068 * the leave. 2069 * 2070 * This function can free us from keeping two paths, thus later we only need 2071 * to care about how to iterate all new tree blocks in reloc tree. 2072 */ 2073 static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, 2074 struct extent_buffer *src_eb, 2075 struct btrfs_path *dst_path, 2076 int dst_level, int root_level, 2077 bool trace_leaf) 2078 { 2079 struct btrfs_key key; 2080 struct btrfs_path *src_path; 2081 struct btrfs_fs_info *fs_info = trans->fs_info; 2082 u32 nodesize = fs_info->nodesize; 2083 int cur_level = root_level; 2084 int ret; 2085 2086 BUG_ON(dst_level > root_level); 2087 /* Level mismatch */ 2088 if (btrfs_header_level(src_eb) != root_level) 2089 return -EINVAL; 2090 2091 src_path = btrfs_alloc_path(); 2092 if (!src_path) { 2093 ret = -ENOMEM; 2094 goto out; 2095 } 2096 2097 if (dst_level) 2098 btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0); 2099 else 2100 btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0); 2101 2102 /* For src_path */ 2103 atomic_inc(&src_eb->refs); 2104 src_path->nodes[root_level] = src_eb; 2105 src_path->slots[root_level] = dst_path->slots[root_level]; 2106 src_path->locks[root_level] = 0; 2107 2108 /* A simplified version of btrfs_search_slot() */ 2109 while (cur_level >= dst_level) { 2110 struct btrfs_key src_key; 2111 struct btrfs_key dst_key; 2112 2113 if (src_path->nodes[cur_level] == NULL) { 2114 struct extent_buffer *eb; 2115 int parent_slot; 2116 2117 eb = src_path->nodes[cur_level + 1]; 2118 parent_slot = src_path->slots[cur_level + 1]; 2119 2120 eb = btrfs_read_node_slot(eb, parent_slot); 2121 if (IS_ERR(eb)) { 2122 ret = PTR_ERR(eb); 2123 goto out; 2124 } 2125 2126 src_path->nodes[cur_level] = eb; 2127 2128 btrfs_tree_read_lock(eb); 2129 src_path->locks[cur_level] = BTRFS_READ_LOCK; 2130 } 2131 2132 src_path->slots[cur_level] = dst_path->slots[cur_level]; 2133 if (cur_level) { 2134 btrfs_node_key_to_cpu(dst_path->nodes[cur_level], 2135 &dst_key, dst_path->slots[cur_level]); 2136 btrfs_node_key_to_cpu(src_path->nodes[cur_level], 2137 &src_key, src_path->slots[cur_level]); 2138 } else { 2139 btrfs_item_key_to_cpu(dst_path->nodes[cur_level], 2140 &dst_key, dst_path->slots[cur_level]); 2141 btrfs_item_key_to_cpu(src_path->nodes[cur_level], 2142 &src_key, src_path->slots[cur_level]); 2143 } 2144 /* Content mismatch, something went wrong */ 2145 if (btrfs_comp_cpu_keys(&dst_key, &src_key)) { 2146 ret = -ENOENT; 2147 goto out; 2148 } 2149 cur_level--; 2150 } 2151 2152 /* 2153 * Now both @dst_path and @src_path have been populated, record the tree 2154 * blocks for qgroup accounting. 2155 */ 2156 ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start, 2157 nodesize); 2158 if (ret < 0) 2159 goto out; 2160 ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start, 2161 nodesize); 2162 if (ret < 0) 2163 goto out; 2164 2165 /* Record leaf file extents */ 2166 if (dst_level == 0 && trace_leaf) { 2167 ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]); 2168 if (ret < 0) 2169 goto out; 2170 ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]); 2171 } 2172 out: 2173 btrfs_free_path(src_path); 2174 return ret; 2175 } 2176 2177 /* 2178 * Helper function to do recursive generation-aware depth-first search, to 2179 * locate all new tree blocks in a subtree of reloc tree. 2180 * 2181 * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot) 2182 * reloc tree 2183 * L2 NN (a) 2184 * / \ 2185 * L1 OO NN (b) 2186 * / \ / \ 2187 * L0 OO OO OO NN 2188 * (c) (d) 2189 * If we pass: 2190 * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ], 2191 * @cur_level = 1 2192 * @root_level = 1 2193 * 2194 * We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace 2195 * above tree blocks along with their counter parts in file tree. 2196 * While during search, old tree blocks OO(c) will be skipped as tree block swap 2197 * won't affect OO(c). 2198 */ 2199 static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, 2200 struct extent_buffer *src_eb, 2201 struct btrfs_path *dst_path, 2202 int cur_level, int root_level, 2203 u64 last_snapshot, bool trace_leaf) 2204 { 2205 struct btrfs_fs_info *fs_info = trans->fs_info; 2206 struct extent_buffer *eb; 2207 bool need_cleanup = false; 2208 int ret = 0; 2209 int i; 2210 2211 /* Level sanity check */ 2212 if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 || 2213 root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 || 2214 root_level < cur_level) { 2215 btrfs_err_rl(fs_info, 2216 "%s: bad levels, cur_level=%d root_level=%d", 2217 __func__, cur_level, root_level); 2218 return -EUCLEAN; 2219 } 2220 2221 /* Read the tree block if needed */ 2222 if (dst_path->nodes[cur_level] == NULL) { 2223 int parent_slot; 2224 u64 child_gen; 2225 2226 /* 2227 * dst_path->nodes[root_level] must be initialized before 2228 * calling this function. 2229 */ 2230 if (cur_level == root_level) { 2231 btrfs_err_rl(fs_info, 2232 "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d", 2233 __func__, root_level, root_level, cur_level); 2234 return -EUCLEAN; 2235 } 2236 2237 /* 2238 * We need to get child blockptr/gen from parent before we can 2239 * read it. 2240 */ 2241 eb = dst_path->nodes[cur_level + 1]; 2242 parent_slot = dst_path->slots[cur_level + 1]; 2243 child_gen = btrfs_node_ptr_generation(eb, parent_slot); 2244 2245 /* This node is old, no need to trace */ 2246 if (child_gen < last_snapshot) 2247 goto out; 2248 2249 eb = btrfs_read_node_slot(eb, parent_slot); 2250 if (IS_ERR(eb)) { 2251 ret = PTR_ERR(eb); 2252 goto out; 2253 } 2254 2255 dst_path->nodes[cur_level] = eb; 2256 dst_path->slots[cur_level] = 0; 2257 2258 btrfs_tree_read_lock(eb); 2259 dst_path->locks[cur_level] = BTRFS_READ_LOCK; 2260 need_cleanup = true; 2261 } 2262 2263 /* Now record this tree block and its counter part for qgroups */ 2264 ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level, 2265 root_level, trace_leaf); 2266 if (ret < 0) 2267 goto cleanup; 2268 2269 eb = dst_path->nodes[cur_level]; 2270 2271 if (cur_level > 0) { 2272 /* Iterate all child tree blocks */ 2273 for (i = 0; i < btrfs_header_nritems(eb); i++) { 2274 /* Skip old tree blocks as they won't be swapped */ 2275 if (btrfs_node_ptr_generation(eb, i) < last_snapshot) 2276 continue; 2277 dst_path->slots[cur_level] = i; 2278 2279 /* Recursive call (at most 7 times) */ 2280 ret = qgroup_trace_new_subtree_blocks(trans, src_eb, 2281 dst_path, cur_level - 1, root_level, 2282 last_snapshot, trace_leaf); 2283 if (ret < 0) 2284 goto cleanup; 2285 } 2286 } 2287 2288 cleanup: 2289 if (need_cleanup) { 2290 /* Clean up */ 2291 btrfs_tree_unlock_rw(dst_path->nodes[cur_level], 2292 dst_path->locks[cur_level]); 2293 free_extent_buffer(dst_path->nodes[cur_level]); 2294 dst_path->nodes[cur_level] = NULL; 2295 dst_path->slots[cur_level] = 0; 2296 dst_path->locks[cur_level] = 0; 2297 } 2298 out: 2299 return ret; 2300 } 2301 2302 static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, 2303 struct extent_buffer *src_eb, 2304 struct extent_buffer *dst_eb, 2305 u64 last_snapshot, bool trace_leaf) 2306 { 2307 struct btrfs_fs_info *fs_info = trans->fs_info; 2308 struct btrfs_path *dst_path = NULL; 2309 int level; 2310 int ret; 2311 2312 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2313 return 0; 2314 2315 /* Wrong parameter order */ 2316 if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) { 2317 btrfs_err_rl(fs_info, 2318 "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__, 2319 btrfs_header_generation(src_eb), 2320 btrfs_header_generation(dst_eb)); 2321 return -EUCLEAN; 2322 } 2323 2324 if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) { 2325 ret = -EIO; 2326 goto out; 2327 } 2328 2329 level = btrfs_header_level(dst_eb); 2330 dst_path = btrfs_alloc_path(); 2331 if (!dst_path) { 2332 ret = -ENOMEM; 2333 goto out; 2334 } 2335 /* For dst_path */ 2336 atomic_inc(&dst_eb->refs); 2337 dst_path->nodes[level] = dst_eb; 2338 dst_path->slots[level] = 0; 2339 dst_path->locks[level] = 0; 2340 2341 /* Do the generation aware breadth-first search */ 2342 ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level, 2343 level, last_snapshot, trace_leaf); 2344 if (ret < 0) 2345 goto out; 2346 ret = 0; 2347 2348 out: 2349 btrfs_free_path(dst_path); 2350 if (ret < 0) 2351 qgroup_mark_inconsistent(fs_info); 2352 return ret; 2353 } 2354 2355 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, 2356 struct extent_buffer *root_eb, 2357 u64 root_gen, int root_level) 2358 { 2359 struct btrfs_fs_info *fs_info = trans->fs_info; 2360 int ret = 0; 2361 int level; 2362 u8 drop_subptree_thres; 2363 struct extent_buffer *eb = root_eb; 2364 struct btrfs_path *path = NULL; 2365 2366 BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL); 2367 BUG_ON(root_eb == NULL); 2368 2369 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2370 return 0; 2371 2372 spin_lock(&fs_info->qgroup_lock); 2373 drop_subptree_thres = fs_info->qgroup_drop_subtree_thres; 2374 spin_unlock(&fs_info->qgroup_lock); 2375 2376 /* 2377 * This function only gets called for snapshot drop, if we hit a high 2378 * node here, it means we are going to change ownership for quite a lot 2379 * of extents, which will greatly slow down btrfs_commit_transaction(). 2380 * 2381 * So here if we find a high tree here, we just skip the accounting and 2382 * mark qgroup inconsistent. 2383 */ 2384 if (root_level >= drop_subptree_thres) { 2385 qgroup_mark_inconsistent(fs_info); 2386 return 0; 2387 } 2388 2389 if (!extent_buffer_uptodate(root_eb)) { 2390 struct btrfs_tree_parent_check check = { 2391 .has_first_key = false, 2392 .transid = root_gen, 2393 .level = root_level 2394 }; 2395 2396 ret = btrfs_read_extent_buffer(root_eb, &check); 2397 if (ret) 2398 goto out; 2399 } 2400 2401 if (root_level == 0) { 2402 ret = btrfs_qgroup_trace_leaf_items(trans, root_eb); 2403 goto out; 2404 } 2405 2406 path = btrfs_alloc_path(); 2407 if (!path) 2408 return -ENOMEM; 2409 2410 /* 2411 * Walk down the tree. Missing extent blocks are filled in as 2412 * we go. Metadata is accounted every time we read a new 2413 * extent block. 2414 * 2415 * When we reach a leaf, we account for file extent items in it, 2416 * walk back up the tree (adjusting slot pointers as we go) 2417 * and restart the search process. 2418 */ 2419 atomic_inc(&root_eb->refs); /* For path */ 2420 path->nodes[root_level] = root_eb; 2421 path->slots[root_level] = 0; 2422 path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ 2423 walk_down: 2424 level = root_level; 2425 while (level >= 0) { 2426 if (path->nodes[level] == NULL) { 2427 int parent_slot; 2428 u64 child_bytenr; 2429 2430 /* 2431 * We need to get child blockptr from parent before we 2432 * can read it. 2433 */ 2434 eb = path->nodes[level + 1]; 2435 parent_slot = path->slots[level + 1]; 2436 child_bytenr = btrfs_node_blockptr(eb, parent_slot); 2437 2438 eb = btrfs_read_node_slot(eb, parent_slot); 2439 if (IS_ERR(eb)) { 2440 ret = PTR_ERR(eb); 2441 goto out; 2442 } 2443 2444 path->nodes[level] = eb; 2445 path->slots[level] = 0; 2446 2447 btrfs_tree_read_lock(eb); 2448 path->locks[level] = BTRFS_READ_LOCK; 2449 2450 ret = btrfs_qgroup_trace_extent(trans, child_bytenr, 2451 fs_info->nodesize); 2452 if (ret) 2453 goto out; 2454 } 2455 2456 if (level == 0) { 2457 ret = btrfs_qgroup_trace_leaf_items(trans, 2458 path->nodes[level]); 2459 if (ret) 2460 goto out; 2461 2462 /* Nonzero return here means we completed our search */ 2463 ret = adjust_slots_upwards(path, root_level); 2464 if (ret) 2465 break; 2466 2467 /* Restart search with new slots */ 2468 goto walk_down; 2469 } 2470 2471 level--; 2472 } 2473 2474 ret = 0; 2475 out: 2476 btrfs_free_path(path); 2477 2478 return ret; 2479 } 2480 2481 #define UPDATE_NEW 0 2482 #define UPDATE_OLD 1 2483 /* 2484 * Walk all of the roots that points to the bytenr and adjust their refcnts. 2485 */ 2486 static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, 2487 struct ulist *roots, struct ulist *tmp, 2488 struct ulist *qgroups, u64 seq, int update_old) 2489 { 2490 struct ulist_node *unode; 2491 struct ulist_iterator uiter; 2492 struct ulist_node *tmp_unode; 2493 struct ulist_iterator tmp_uiter; 2494 struct btrfs_qgroup *qg; 2495 int ret = 0; 2496 2497 if (!roots) 2498 return 0; 2499 ULIST_ITER_INIT(&uiter); 2500 while ((unode = ulist_next(roots, &uiter))) { 2501 qg = find_qgroup_rb(fs_info, unode->val); 2502 if (!qg) 2503 continue; 2504 2505 ulist_reinit(tmp); 2506 ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg), 2507 GFP_ATOMIC); 2508 if (ret < 0) 2509 return ret; 2510 ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC); 2511 if (ret < 0) 2512 return ret; 2513 ULIST_ITER_INIT(&tmp_uiter); 2514 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { 2515 struct btrfs_qgroup_list *glist; 2516 2517 qg = unode_aux_to_qgroup(tmp_unode); 2518 if (update_old) 2519 btrfs_qgroup_update_old_refcnt(qg, seq, 1); 2520 else 2521 btrfs_qgroup_update_new_refcnt(qg, seq, 1); 2522 list_for_each_entry(glist, &qg->groups, next_group) { 2523 ret = ulist_add(qgroups, glist->group->qgroupid, 2524 qgroup_to_aux(glist->group), 2525 GFP_ATOMIC); 2526 if (ret < 0) 2527 return ret; 2528 ret = ulist_add(tmp, glist->group->qgroupid, 2529 qgroup_to_aux(glist->group), 2530 GFP_ATOMIC); 2531 if (ret < 0) 2532 return ret; 2533 } 2534 } 2535 } 2536 return 0; 2537 } 2538 2539 /* 2540 * Update qgroup rfer/excl counters. 2541 * Rfer update is easy, codes can explain themselves. 2542 * 2543 * Excl update is tricky, the update is split into 2 parts. 2544 * Part 1: Possible exclusive <-> sharing detect: 2545 * | A | !A | 2546 * ------------------------------------- 2547 * B | * | - | 2548 * ------------------------------------- 2549 * !B | + | ** | 2550 * ------------------------------------- 2551 * 2552 * Conditions: 2553 * A: cur_old_roots < nr_old_roots (not exclusive before) 2554 * !A: cur_old_roots == nr_old_roots (possible exclusive before) 2555 * B: cur_new_roots < nr_new_roots (not exclusive now) 2556 * !B: cur_new_roots == nr_new_roots (possible exclusive now) 2557 * 2558 * Results: 2559 * +: Possible sharing -> exclusive -: Possible exclusive -> sharing 2560 * *: Definitely not changed. **: Possible unchanged. 2561 * 2562 * For !A and !B condition, the exception is cur_old/new_roots == 0 case. 2563 * 2564 * To make the logic clear, we first use condition A and B to split 2565 * combination into 4 results. 2566 * 2567 * Then, for result "+" and "-", check old/new_roots == 0 case, as in them 2568 * only on variant maybe 0. 2569 * 2570 * Lastly, check result **, since there are 2 variants maybe 0, split them 2571 * again(2x2). 2572 * But this time we don't need to consider other things, the codes and logic 2573 * is easy to understand now. 2574 */ 2575 static int qgroup_update_counters(struct btrfs_fs_info *fs_info, 2576 struct ulist *qgroups, 2577 u64 nr_old_roots, 2578 u64 nr_new_roots, 2579 u64 num_bytes, u64 seq) 2580 { 2581 struct ulist_node *unode; 2582 struct ulist_iterator uiter; 2583 struct btrfs_qgroup *qg; 2584 u64 cur_new_count, cur_old_count; 2585 2586 ULIST_ITER_INIT(&uiter); 2587 while ((unode = ulist_next(qgroups, &uiter))) { 2588 bool dirty = false; 2589 2590 qg = unode_aux_to_qgroup(unode); 2591 cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); 2592 cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); 2593 2594 trace_qgroup_update_counters(fs_info, qg, cur_old_count, 2595 cur_new_count); 2596 2597 /* Rfer update part */ 2598 if (cur_old_count == 0 && cur_new_count > 0) { 2599 qg->rfer += num_bytes; 2600 qg->rfer_cmpr += num_bytes; 2601 dirty = true; 2602 } 2603 if (cur_old_count > 0 && cur_new_count == 0) { 2604 qg->rfer -= num_bytes; 2605 qg->rfer_cmpr -= num_bytes; 2606 dirty = true; 2607 } 2608 2609 /* Excl update part */ 2610 /* Exclusive/none -> shared case */ 2611 if (cur_old_count == nr_old_roots && 2612 cur_new_count < nr_new_roots) { 2613 /* Exclusive -> shared */ 2614 if (cur_old_count != 0) { 2615 qg->excl -= num_bytes; 2616 qg->excl_cmpr -= num_bytes; 2617 dirty = true; 2618 } 2619 } 2620 2621 /* Shared -> exclusive/none case */ 2622 if (cur_old_count < nr_old_roots && 2623 cur_new_count == nr_new_roots) { 2624 /* Shared->exclusive */ 2625 if (cur_new_count != 0) { 2626 qg->excl += num_bytes; 2627 qg->excl_cmpr += num_bytes; 2628 dirty = true; 2629 } 2630 } 2631 2632 /* Exclusive/none -> exclusive/none case */ 2633 if (cur_old_count == nr_old_roots && 2634 cur_new_count == nr_new_roots) { 2635 if (cur_old_count == 0) { 2636 /* None -> exclusive/none */ 2637 2638 if (cur_new_count != 0) { 2639 /* None -> exclusive */ 2640 qg->excl += num_bytes; 2641 qg->excl_cmpr += num_bytes; 2642 dirty = true; 2643 } 2644 /* None -> none, nothing changed */ 2645 } else { 2646 /* Exclusive -> exclusive/none */ 2647 2648 if (cur_new_count == 0) { 2649 /* Exclusive -> none */ 2650 qg->excl -= num_bytes; 2651 qg->excl_cmpr -= num_bytes; 2652 dirty = true; 2653 } 2654 /* Exclusive -> exclusive, nothing changed */ 2655 } 2656 } 2657 2658 if (dirty) 2659 qgroup_dirty(fs_info, qg); 2660 } 2661 return 0; 2662 } 2663 2664 /* 2665 * Check if the @roots potentially is a list of fs tree roots 2666 * 2667 * Return 0 for definitely not a fs/subvol tree roots ulist 2668 * Return 1 for possible fs/subvol tree roots in the list (considering an empty 2669 * one as well) 2670 */ 2671 static int maybe_fs_roots(struct ulist *roots) 2672 { 2673 struct ulist_node *unode; 2674 struct ulist_iterator uiter; 2675 2676 /* Empty one, still possible for fs roots */ 2677 if (!roots || roots->nnodes == 0) 2678 return 1; 2679 2680 ULIST_ITER_INIT(&uiter); 2681 unode = ulist_next(roots, &uiter); 2682 if (!unode) 2683 return 1; 2684 2685 /* 2686 * If it contains fs tree roots, then it must belong to fs/subvol 2687 * trees. 2688 * If it contains a non-fs tree, it won't be shared with fs/subvol trees. 2689 */ 2690 return is_fstree(unode->val); 2691 } 2692 2693 int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, 2694 u64 num_bytes, struct ulist *old_roots, 2695 struct ulist *new_roots) 2696 { 2697 struct btrfs_fs_info *fs_info = trans->fs_info; 2698 struct ulist *qgroups = NULL; 2699 struct ulist *tmp = NULL; 2700 u64 seq; 2701 u64 nr_new_roots = 0; 2702 u64 nr_old_roots = 0; 2703 int ret = 0; 2704 2705 /* 2706 * If quotas get disabled meanwhile, the resources need to be freed and 2707 * we can't just exit here. 2708 */ 2709 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 2710 fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) 2711 goto out_free; 2712 2713 if (new_roots) { 2714 if (!maybe_fs_roots(new_roots)) 2715 goto out_free; 2716 nr_new_roots = new_roots->nnodes; 2717 } 2718 if (old_roots) { 2719 if (!maybe_fs_roots(old_roots)) 2720 goto out_free; 2721 nr_old_roots = old_roots->nnodes; 2722 } 2723 2724 /* Quick exit, either not fs tree roots, or won't affect any qgroup */ 2725 if (nr_old_roots == 0 && nr_new_roots == 0) 2726 goto out_free; 2727 2728 BUG_ON(!fs_info->quota_root); 2729 2730 trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr, 2731 num_bytes, nr_old_roots, nr_new_roots); 2732 2733 qgroups = ulist_alloc(GFP_NOFS); 2734 if (!qgroups) { 2735 ret = -ENOMEM; 2736 goto out_free; 2737 } 2738 tmp = ulist_alloc(GFP_NOFS); 2739 if (!tmp) { 2740 ret = -ENOMEM; 2741 goto out_free; 2742 } 2743 2744 mutex_lock(&fs_info->qgroup_rescan_lock); 2745 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 2746 if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { 2747 mutex_unlock(&fs_info->qgroup_rescan_lock); 2748 ret = 0; 2749 goto out_free; 2750 } 2751 } 2752 mutex_unlock(&fs_info->qgroup_rescan_lock); 2753 2754 spin_lock(&fs_info->qgroup_lock); 2755 seq = fs_info->qgroup_seq; 2756 2757 /* Update old refcnts using old_roots */ 2758 ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq, 2759 UPDATE_OLD); 2760 if (ret < 0) 2761 goto out; 2762 2763 /* Update new refcnts using new_roots */ 2764 ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq, 2765 UPDATE_NEW); 2766 if (ret < 0) 2767 goto out; 2768 2769 qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots, 2770 num_bytes, seq); 2771 2772 /* 2773 * Bump qgroup_seq to avoid seq overlap 2774 */ 2775 fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; 2776 out: 2777 spin_unlock(&fs_info->qgroup_lock); 2778 out_free: 2779 ulist_free(tmp); 2780 ulist_free(qgroups); 2781 ulist_free(old_roots); 2782 ulist_free(new_roots); 2783 return ret; 2784 } 2785 2786 int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) 2787 { 2788 struct btrfs_fs_info *fs_info = trans->fs_info; 2789 struct btrfs_qgroup_extent_record *record; 2790 struct btrfs_delayed_ref_root *delayed_refs; 2791 struct ulist *new_roots = NULL; 2792 struct rb_node *node; 2793 u64 num_dirty_extents = 0; 2794 u64 qgroup_to_skip; 2795 int ret = 0; 2796 2797 delayed_refs = &trans->transaction->delayed_refs; 2798 qgroup_to_skip = delayed_refs->qgroup_to_skip; 2799 while ((node = rb_first(&delayed_refs->dirty_extent_root))) { 2800 record = rb_entry(node, struct btrfs_qgroup_extent_record, 2801 node); 2802 2803 num_dirty_extents++; 2804 trace_btrfs_qgroup_account_extents(fs_info, record); 2805 2806 if (!ret && !(fs_info->qgroup_flags & 2807 BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { 2808 struct btrfs_backref_walk_ctx ctx = { 0 }; 2809 2810 ctx.bytenr = record->bytenr; 2811 ctx.fs_info = fs_info; 2812 2813 /* 2814 * Old roots should be searched when inserting qgroup 2815 * extent record. 2816 * 2817 * But for INCONSISTENT (NO_ACCOUNTING) -> rescan case, 2818 * we may have some record inserted during 2819 * NO_ACCOUNTING (thus no old_roots populated), but 2820 * later we start rescan, which clears NO_ACCOUNTING, 2821 * leaving some inserted records without old_roots 2822 * populated. 2823 * 2824 * Those cases are rare and should not cause too much 2825 * time spent during commit_transaction(). 2826 */ 2827 if (!record->old_roots) { 2828 /* Search commit root to find old_roots */ 2829 ret = btrfs_find_all_roots(&ctx, false); 2830 if (ret < 0) 2831 goto cleanup; 2832 record->old_roots = ctx.roots; 2833 ctx.roots = NULL; 2834 } 2835 2836 /* 2837 * Use BTRFS_SEQ_LAST as time_seq to do special search, 2838 * which doesn't lock tree or delayed_refs and search 2839 * current root. It's safe inside commit_transaction(). 2840 */ 2841 ctx.trans = trans; 2842 ctx.time_seq = BTRFS_SEQ_LAST; 2843 ret = btrfs_find_all_roots(&ctx, false); 2844 if (ret < 0) 2845 goto cleanup; 2846 new_roots = ctx.roots; 2847 if (qgroup_to_skip) { 2848 ulist_del(new_roots, qgroup_to_skip, 0); 2849 ulist_del(record->old_roots, qgroup_to_skip, 2850 0); 2851 } 2852 ret = btrfs_qgroup_account_extent(trans, record->bytenr, 2853 record->num_bytes, 2854 record->old_roots, 2855 new_roots); 2856 record->old_roots = NULL; 2857 new_roots = NULL; 2858 } 2859 /* Free the reserved data space */ 2860 btrfs_qgroup_free_refroot(fs_info, 2861 record->data_rsv_refroot, 2862 record->data_rsv, 2863 BTRFS_QGROUP_RSV_DATA); 2864 cleanup: 2865 ulist_free(record->old_roots); 2866 ulist_free(new_roots); 2867 new_roots = NULL; 2868 rb_erase(node, &delayed_refs->dirty_extent_root); 2869 kfree(record); 2870 2871 } 2872 trace_qgroup_num_dirty_extents(fs_info, trans->transid, 2873 num_dirty_extents); 2874 return ret; 2875 } 2876 2877 /* 2878 * Writes all changed qgroups to disk. 2879 * Called by the transaction commit path and the qgroup assign ioctl. 2880 */ 2881 int btrfs_run_qgroups(struct btrfs_trans_handle *trans) 2882 { 2883 struct btrfs_fs_info *fs_info = trans->fs_info; 2884 int ret = 0; 2885 2886 /* 2887 * In case we are called from the qgroup assign ioctl, assert that we 2888 * are holding the qgroup_ioctl_lock, otherwise we can race with a quota 2889 * disable operation (ioctl) and access a freed quota root. 2890 */ 2891 if (trans->transaction->state != TRANS_STATE_COMMIT_DOING) 2892 lockdep_assert_held(&fs_info->qgroup_ioctl_lock); 2893 2894 if (!fs_info->quota_root) 2895 return ret; 2896 2897 spin_lock(&fs_info->qgroup_lock); 2898 while (!list_empty(&fs_info->dirty_qgroups)) { 2899 struct btrfs_qgroup *qgroup; 2900 qgroup = list_first_entry(&fs_info->dirty_qgroups, 2901 struct btrfs_qgroup, dirty); 2902 list_del_init(&qgroup->dirty); 2903 spin_unlock(&fs_info->qgroup_lock); 2904 ret = update_qgroup_info_item(trans, qgroup); 2905 if (ret) 2906 qgroup_mark_inconsistent(fs_info); 2907 ret = update_qgroup_limit_item(trans, qgroup); 2908 if (ret) 2909 qgroup_mark_inconsistent(fs_info); 2910 spin_lock(&fs_info->qgroup_lock); 2911 } 2912 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2913 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON; 2914 else 2915 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; 2916 spin_unlock(&fs_info->qgroup_lock); 2917 2918 ret = update_qgroup_status_item(trans); 2919 if (ret) 2920 qgroup_mark_inconsistent(fs_info); 2921 2922 return ret; 2923 } 2924 2925 /* 2926 * Copy the accounting information between qgroups. This is necessary 2927 * when a snapshot or a subvolume is created. Throwing an error will 2928 * cause a transaction abort so we take extra care here to only error 2929 * when a readonly fs is a reasonable outcome. 2930 */ 2931 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, 2932 u64 objectid, struct btrfs_qgroup_inherit *inherit) 2933 { 2934 int ret = 0; 2935 int i; 2936 u64 *i_qgroups; 2937 bool committing = false; 2938 struct btrfs_fs_info *fs_info = trans->fs_info; 2939 struct btrfs_root *quota_root; 2940 struct btrfs_qgroup *srcgroup; 2941 struct btrfs_qgroup *dstgroup; 2942 bool need_rescan = false; 2943 u32 level_size = 0; 2944 u64 nums; 2945 2946 /* 2947 * There are only two callers of this function. 2948 * 2949 * One in create_subvol() in the ioctl context, which needs to hold 2950 * the qgroup_ioctl_lock. 2951 * 2952 * The other one in create_pending_snapshot() where no other qgroup 2953 * code can modify the fs as they all need to either start a new trans 2954 * or hold a trans handler, thus we don't need to hold 2955 * qgroup_ioctl_lock. 2956 * This would avoid long and complex lock chain and make lockdep happy. 2957 */ 2958 spin_lock(&fs_info->trans_lock); 2959 if (trans->transaction->state == TRANS_STATE_COMMIT_DOING) 2960 committing = true; 2961 spin_unlock(&fs_info->trans_lock); 2962 2963 if (!committing) 2964 mutex_lock(&fs_info->qgroup_ioctl_lock); 2965 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2966 goto out; 2967 2968 quota_root = fs_info->quota_root; 2969 if (!quota_root) { 2970 ret = -EINVAL; 2971 goto out; 2972 } 2973 2974 if (inherit) { 2975 i_qgroups = (u64 *)(inherit + 1); 2976 nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + 2977 2 * inherit->num_excl_copies; 2978 for (i = 0; i < nums; ++i) { 2979 srcgroup = find_qgroup_rb(fs_info, *i_qgroups); 2980 2981 /* 2982 * Zero out invalid groups so we can ignore 2983 * them later. 2984 */ 2985 if (!srcgroup || 2986 ((srcgroup->qgroupid >> 48) <= (objectid >> 48))) 2987 *i_qgroups = 0ULL; 2988 2989 ++i_qgroups; 2990 } 2991 } 2992 2993 /* 2994 * create a tracking group for the subvol itself 2995 */ 2996 ret = add_qgroup_item(trans, quota_root, objectid); 2997 if (ret) 2998 goto out; 2999 3000 /* 3001 * add qgroup to all inherited groups 3002 */ 3003 if (inherit) { 3004 i_qgroups = (u64 *)(inherit + 1); 3005 for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) { 3006 if (*i_qgroups == 0) 3007 continue; 3008 ret = add_qgroup_relation_item(trans, objectid, 3009 *i_qgroups); 3010 if (ret && ret != -EEXIST) 3011 goto out; 3012 ret = add_qgroup_relation_item(trans, *i_qgroups, 3013 objectid); 3014 if (ret && ret != -EEXIST) 3015 goto out; 3016 } 3017 ret = 0; 3018 } 3019 3020 3021 spin_lock(&fs_info->qgroup_lock); 3022 3023 dstgroup = add_qgroup_rb(fs_info, objectid); 3024 if (IS_ERR(dstgroup)) { 3025 ret = PTR_ERR(dstgroup); 3026 goto unlock; 3027 } 3028 3029 if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { 3030 dstgroup->lim_flags = inherit->lim.flags; 3031 dstgroup->max_rfer = inherit->lim.max_rfer; 3032 dstgroup->max_excl = inherit->lim.max_excl; 3033 dstgroup->rsv_rfer = inherit->lim.rsv_rfer; 3034 dstgroup->rsv_excl = inherit->lim.rsv_excl; 3035 3036 qgroup_dirty(fs_info, dstgroup); 3037 } 3038 3039 if (srcid) { 3040 srcgroup = find_qgroup_rb(fs_info, srcid); 3041 if (!srcgroup) 3042 goto unlock; 3043 3044 /* 3045 * We call inherit after we clone the root in order to make sure 3046 * our counts don't go crazy, so at this point the only 3047 * difference between the two roots should be the root node. 3048 */ 3049 level_size = fs_info->nodesize; 3050 dstgroup->rfer = srcgroup->rfer; 3051 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr; 3052 dstgroup->excl = level_size; 3053 dstgroup->excl_cmpr = level_size; 3054 srcgroup->excl = level_size; 3055 srcgroup->excl_cmpr = level_size; 3056 3057 /* inherit the limit info */ 3058 dstgroup->lim_flags = srcgroup->lim_flags; 3059 dstgroup->max_rfer = srcgroup->max_rfer; 3060 dstgroup->max_excl = srcgroup->max_excl; 3061 dstgroup->rsv_rfer = srcgroup->rsv_rfer; 3062 dstgroup->rsv_excl = srcgroup->rsv_excl; 3063 3064 qgroup_dirty(fs_info, dstgroup); 3065 qgroup_dirty(fs_info, srcgroup); 3066 } 3067 3068 if (!inherit) 3069 goto unlock; 3070 3071 i_qgroups = (u64 *)(inherit + 1); 3072 for (i = 0; i < inherit->num_qgroups; ++i) { 3073 if (*i_qgroups) { 3074 ret = add_relation_rb(fs_info, objectid, *i_qgroups); 3075 if (ret) 3076 goto unlock; 3077 } 3078 ++i_qgroups; 3079 3080 /* 3081 * If we're doing a snapshot, and adding the snapshot to a new 3082 * qgroup, the numbers are guaranteed to be incorrect. 3083 */ 3084 if (srcid) 3085 need_rescan = true; 3086 } 3087 3088 for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) { 3089 struct btrfs_qgroup *src; 3090 struct btrfs_qgroup *dst; 3091 3092 if (!i_qgroups[0] || !i_qgroups[1]) 3093 continue; 3094 3095 src = find_qgroup_rb(fs_info, i_qgroups[0]); 3096 dst = find_qgroup_rb(fs_info, i_qgroups[1]); 3097 3098 if (!src || !dst) { 3099 ret = -EINVAL; 3100 goto unlock; 3101 } 3102 3103 dst->rfer = src->rfer - level_size; 3104 dst->rfer_cmpr = src->rfer_cmpr - level_size; 3105 3106 /* Manually tweaking numbers certainly needs a rescan */ 3107 need_rescan = true; 3108 } 3109 for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) { 3110 struct btrfs_qgroup *src; 3111 struct btrfs_qgroup *dst; 3112 3113 if (!i_qgroups[0] || !i_qgroups[1]) 3114 continue; 3115 3116 src = find_qgroup_rb(fs_info, i_qgroups[0]); 3117 dst = find_qgroup_rb(fs_info, i_qgroups[1]); 3118 3119 if (!src || !dst) { 3120 ret = -EINVAL; 3121 goto unlock; 3122 } 3123 3124 dst->excl = src->excl + level_size; 3125 dst->excl_cmpr = src->excl_cmpr + level_size; 3126 need_rescan = true; 3127 } 3128 3129 unlock: 3130 spin_unlock(&fs_info->qgroup_lock); 3131 if (!ret) 3132 ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup); 3133 out: 3134 if (!committing) 3135 mutex_unlock(&fs_info->qgroup_ioctl_lock); 3136 if (need_rescan) 3137 qgroup_mark_inconsistent(fs_info); 3138 return ret; 3139 } 3140 3141 static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) 3142 { 3143 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && 3144 qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer) 3145 return false; 3146 3147 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && 3148 qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl) 3149 return false; 3150 3151 return true; 3152 } 3153 3154 static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, 3155 enum btrfs_qgroup_rsv_type type) 3156 { 3157 struct btrfs_qgroup *qgroup; 3158 struct btrfs_fs_info *fs_info = root->fs_info; 3159 u64 ref_root = root->root_key.objectid; 3160 int ret = 0; 3161 LIST_HEAD(qgroup_list); 3162 3163 if (!is_fstree(ref_root)) 3164 return 0; 3165 3166 if (num_bytes == 0) 3167 return 0; 3168 3169 if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) && 3170 capable(CAP_SYS_RESOURCE)) 3171 enforce = false; 3172 3173 spin_lock(&fs_info->qgroup_lock); 3174 if (!fs_info->quota_root) 3175 goto out; 3176 3177 qgroup = find_qgroup_rb(fs_info, ref_root); 3178 if (!qgroup) 3179 goto out; 3180 3181 qgroup_iterator_add(&qgroup_list, qgroup); 3182 list_for_each_entry(qgroup, &qgroup_list, iterator) { 3183 struct btrfs_qgroup_list *glist; 3184 3185 if (enforce && !qgroup_check_limits(qgroup, num_bytes)) { 3186 ret = -EDQUOT; 3187 goto out; 3188 } 3189 3190 list_for_each_entry(glist, &qgroup->groups, next_group) 3191 qgroup_iterator_add(&qgroup_list, glist->group); 3192 } 3193 3194 ret = 0; 3195 /* 3196 * no limits exceeded, now record the reservation into all qgroups 3197 */ 3198 list_for_each_entry(qgroup, &qgroup_list, iterator) 3199 qgroup_rsv_add(fs_info, qgroup, num_bytes, type); 3200 3201 out: 3202 qgroup_iterator_clean(&qgroup_list); 3203 spin_unlock(&fs_info->qgroup_lock); 3204 return ret; 3205 } 3206 3207 /* 3208 * Free @num_bytes of reserved space with @type for qgroup. (Normally level 0 3209 * qgroup). 3210 * 3211 * Will handle all higher level qgroup too. 3212 * 3213 * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup. 3214 * This special case is only used for META_PERTRANS type. 3215 */ 3216 void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, 3217 u64 ref_root, u64 num_bytes, 3218 enum btrfs_qgroup_rsv_type type) 3219 { 3220 struct btrfs_qgroup *qgroup; 3221 struct ulist_node *unode; 3222 struct ulist_iterator uiter; 3223 int ret = 0; 3224 3225 if (!is_fstree(ref_root)) 3226 return; 3227 3228 if (num_bytes == 0) 3229 return; 3230 3231 if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) { 3232 WARN(1, "%s: Invalid type to free", __func__); 3233 return; 3234 } 3235 spin_lock(&fs_info->qgroup_lock); 3236 3237 if (!fs_info->quota_root) 3238 goto out; 3239 3240 qgroup = find_qgroup_rb(fs_info, ref_root); 3241 if (!qgroup) 3242 goto out; 3243 3244 if (num_bytes == (u64)-1) 3245 /* 3246 * We're freeing all pertrans rsv, get reserved value from 3247 * level 0 qgroup as real num_bytes to free. 3248 */ 3249 num_bytes = qgroup->rsv.values[type]; 3250 3251 ulist_reinit(fs_info->qgroup_ulist); 3252 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, 3253 qgroup_to_aux(qgroup), GFP_ATOMIC); 3254 if (ret < 0) 3255 goto out; 3256 ULIST_ITER_INIT(&uiter); 3257 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { 3258 struct btrfs_qgroup *qg; 3259 struct btrfs_qgroup_list *glist; 3260 3261 qg = unode_aux_to_qgroup(unode); 3262 3263 qgroup_rsv_release(fs_info, qg, num_bytes, type); 3264 3265 list_for_each_entry(glist, &qg->groups, next_group) { 3266 ret = ulist_add(fs_info->qgroup_ulist, 3267 glist->group->qgroupid, 3268 qgroup_to_aux(glist->group), GFP_ATOMIC); 3269 if (ret < 0) 3270 goto out; 3271 } 3272 } 3273 3274 out: 3275 spin_unlock(&fs_info->qgroup_lock); 3276 } 3277 3278 /* 3279 * Check if the leaf is the last leaf. Which means all node pointers 3280 * are at their last position. 3281 */ 3282 static bool is_last_leaf(struct btrfs_path *path) 3283 { 3284 int i; 3285 3286 for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { 3287 if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1) 3288 return false; 3289 } 3290 return true; 3291 } 3292 3293 /* 3294 * returns < 0 on error, 0 when more leafs are to be scanned. 3295 * returns 1 when done. 3296 */ 3297 static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, 3298 struct btrfs_path *path) 3299 { 3300 struct btrfs_fs_info *fs_info = trans->fs_info; 3301 struct btrfs_root *extent_root; 3302 struct btrfs_key found; 3303 struct extent_buffer *scratch_leaf = NULL; 3304 u64 num_bytes; 3305 bool done; 3306 int slot; 3307 int ret; 3308 3309 mutex_lock(&fs_info->qgroup_rescan_lock); 3310 extent_root = btrfs_extent_root(fs_info, 3311 fs_info->qgroup_rescan_progress.objectid); 3312 ret = btrfs_search_slot_for_read(extent_root, 3313 &fs_info->qgroup_rescan_progress, 3314 path, 1, 0); 3315 3316 btrfs_debug(fs_info, 3317 "current progress key (%llu %u %llu), search_slot ret %d", 3318 fs_info->qgroup_rescan_progress.objectid, 3319 fs_info->qgroup_rescan_progress.type, 3320 fs_info->qgroup_rescan_progress.offset, ret); 3321 3322 if (ret) { 3323 /* 3324 * The rescan is about to end, we will not be scanning any 3325 * further blocks. We cannot unset the RESCAN flag here, because 3326 * we want to commit the transaction if everything went well. 3327 * To make the live accounting work in this phase, we set our 3328 * scan progress pointer such that every real extent objectid 3329 * will be smaller. 3330 */ 3331 fs_info->qgroup_rescan_progress.objectid = (u64)-1; 3332 btrfs_release_path(path); 3333 mutex_unlock(&fs_info->qgroup_rescan_lock); 3334 return ret; 3335 } 3336 done = is_last_leaf(path); 3337 3338 btrfs_item_key_to_cpu(path->nodes[0], &found, 3339 btrfs_header_nritems(path->nodes[0]) - 1); 3340 fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; 3341 3342 scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]); 3343 if (!scratch_leaf) { 3344 ret = -ENOMEM; 3345 mutex_unlock(&fs_info->qgroup_rescan_lock); 3346 goto out; 3347 } 3348 slot = path->slots[0]; 3349 btrfs_release_path(path); 3350 mutex_unlock(&fs_info->qgroup_rescan_lock); 3351 3352 for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { 3353 struct btrfs_backref_walk_ctx ctx = { 0 }; 3354 3355 btrfs_item_key_to_cpu(scratch_leaf, &found, slot); 3356 if (found.type != BTRFS_EXTENT_ITEM_KEY && 3357 found.type != BTRFS_METADATA_ITEM_KEY) 3358 continue; 3359 if (found.type == BTRFS_METADATA_ITEM_KEY) 3360 num_bytes = fs_info->nodesize; 3361 else 3362 num_bytes = found.offset; 3363 3364 ctx.bytenr = found.objectid; 3365 ctx.fs_info = fs_info; 3366 3367 ret = btrfs_find_all_roots(&ctx, false); 3368 if (ret < 0) 3369 goto out; 3370 /* For rescan, just pass old_roots as NULL */ 3371 ret = btrfs_qgroup_account_extent(trans, found.objectid, 3372 num_bytes, NULL, ctx.roots); 3373 if (ret < 0) 3374 goto out; 3375 } 3376 out: 3377 if (scratch_leaf) 3378 free_extent_buffer(scratch_leaf); 3379 3380 if (done && !ret) { 3381 ret = 1; 3382 fs_info->qgroup_rescan_progress.objectid = (u64)-1; 3383 } 3384 return ret; 3385 } 3386 3387 static bool rescan_should_stop(struct btrfs_fs_info *fs_info) 3388 { 3389 return btrfs_fs_closing(fs_info) || 3390 test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) || 3391 !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 3392 fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN; 3393 } 3394 3395 static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) 3396 { 3397 struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info, 3398 qgroup_rescan_work); 3399 struct btrfs_path *path; 3400 struct btrfs_trans_handle *trans = NULL; 3401 int err = -ENOMEM; 3402 int ret = 0; 3403 bool stopped = false; 3404 bool did_leaf_rescans = false; 3405 3406 path = btrfs_alloc_path(); 3407 if (!path) 3408 goto out; 3409 /* 3410 * Rescan should only search for commit root, and any later difference 3411 * should be recorded by qgroup 3412 */ 3413 path->search_commit_root = 1; 3414 path->skip_locking = 1; 3415 3416 err = 0; 3417 while (!err && !(stopped = rescan_should_stop(fs_info))) { 3418 trans = btrfs_start_transaction(fs_info->fs_root, 0); 3419 if (IS_ERR(trans)) { 3420 err = PTR_ERR(trans); 3421 break; 3422 } 3423 3424 err = qgroup_rescan_leaf(trans, path); 3425 did_leaf_rescans = true; 3426 3427 if (err > 0) 3428 btrfs_commit_transaction(trans); 3429 else 3430 btrfs_end_transaction(trans); 3431 } 3432 3433 out: 3434 btrfs_free_path(path); 3435 3436 mutex_lock(&fs_info->qgroup_rescan_lock); 3437 if (err > 0 && 3438 fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { 3439 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 3440 } else if (err < 0 || stopped) { 3441 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 3442 } 3443 mutex_unlock(&fs_info->qgroup_rescan_lock); 3444 3445 /* 3446 * Only update status, since the previous part has already updated the 3447 * qgroup info, and only if we did any actual work. This also prevents 3448 * race with a concurrent quota disable, which has already set 3449 * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at 3450 * btrfs_quota_disable(). 3451 */ 3452 if (did_leaf_rescans) { 3453 trans = btrfs_start_transaction(fs_info->quota_root, 1); 3454 if (IS_ERR(trans)) { 3455 err = PTR_ERR(trans); 3456 trans = NULL; 3457 btrfs_err(fs_info, 3458 "fail to start transaction for status update: %d", 3459 err); 3460 } 3461 } else { 3462 trans = NULL; 3463 } 3464 3465 mutex_lock(&fs_info->qgroup_rescan_lock); 3466 if (!stopped || 3467 fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) 3468 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3469 if (trans) { 3470 ret = update_qgroup_status_item(trans); 3471 if (ret < 0) { 3472 err = ret; 3473 btrfs_err(fs_info, "fail to update qgroup status: %d", 3474 err); 3475 } 3476 } 3477 fs_info->qgroup_rescan_running = false; 3478 fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN; 3479 complete_all(&fs_info->qgroup_rescan_completion); 3480 mutex_unlock(&fs_info->qgroup_rescan_lock); 3481 3482 if (!trans) 3483 return; 3484 3485 btrfs_end_transaction(trans); 3486 3487 if (stopped) { 3488 btrfs_info(fs_info, "qgroup scan paused"); 3489 } else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) { 3490 btrfs_info(fs_info, "qgroup scan cancelled"); 3491 } else if (err >= 0) { 3492 btrfs_info(fs_info, "qgroup scan completed%s", 3493 err > 0 ? " (inconsistency flag cleared)" : ""); 3494 } else { 3495 btrfs_err(fs_info, "qgroup scan failed with %d", err); 3496 } 3497 } 3498 3499 /* 3500 * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all 3501 * memory required for the rescan context. 3502 */ 3503 static int 3504 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, 3505 int init_flags) 3506 { 3507 int ret = 0; 3508 3509 if (!init_flags) { 3510 /* we're resuming qgroup rescan at mount time */ 3511 if (!(fs_info->qgroup_flags & 3512 BTRFS_QGROUP_STATUS_FLAG_RESCAN)) { 3513 btrfs_warn(fs_info, 3514 "qgroup rescan init failed, qgroup rescan is not queued"); 3515 ret = -EINVAL; 3516 } else if (!(fs_info->qgroup_flags & 3517 BTRFS_QGROUP_STATUS_FLAG_ON)) { 3518 btrfs_warn(fs_info, 3519 "qgroup rescan init failed, qgroup is not enabled"); 3520 ret = -EINVAL; 3521 } 3522 3523 if (ret) 3524 return ret; 3525 } 3526 3527 mutex_lock(&fs_info->qgroup_rescan_lock); 3528 3529 if (init_flags) { 3530 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 3531 btrfs_warn(fs_info, 3532 "qgroup rescan is already in progress"); 3533 ret = -EINPROGRESS; 3534 } else if (!(fs_info->qgroup_flags & 3535 BTRFS_QGROUP_STATUS_FLAG_ON)) { 3536 btrfs_warn(fs_info, 3537 "qgroup rescan init failed, qgroup is not enabled"); 3538 ret = -EINVAL; 3539 } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 3540 /* Quota disable is in progress */ 3541 ret = -EBUSY; 3542 } 3543 3544 if (ret) { 3545 mutex_unlock(&fs_info->qgroup_rescan_lock); 3546 return ret; 3547 } 3548 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3549 } 3550 3551 memset(&fs_info->qgroup_rescan_progress, 0, 3552 sizeof(fs_info->qgroup_rescan_progress)); 3553 fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | 3554 BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); 3555 fs_info->qgroup_rescan_progress.objectid = progress_objectid; 3556 init_completion(&fs_info->qgroup_rescan_completion); 3557 mutex_unlock(&fs_info->qgroup_rescan_lock); 3558 3559 btrfs_init_work(&fs_info->qgroup_rescan_work, 3560 btrfs_qgroup_rescan_worker, NULL, NULL); 3561 return 0; 3562 } 3563 3564 static void 3565 qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info) 3566 { 3567 struct rb_node *n; 3568 struct btrfs_qgroup *qgroup; 3569 3570 spin_lock(&fs_info->qgroup_lock); 3571 /* clear all current qgroup tracking information */ 3572 for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { 3573 qgroup = rb_entry(n, struct btrfs_qgroup, node); 3574 qgroup->rfer = 0; 3575 qgroup->rfer_cmpr = 0; 3576 qgroup->excl = 0; 3577 qgroup->excl_cmpr = 0; 3578 qgroup_dirty(fs_info, qgroup); 3579 } 3580 spin_unlock(&fs_info->qgroup_lock); 3581 } 3582 3583 int 3584 btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) 3585 { 3586 int ret = 0; 3587 struct btrfs_trans_handle *trans; 3588 3589 ret = qgroup_rescan_init(fs_info, 0, 1); 3590 if (ret) 3591 return ret; 3592 3593 /* 3594 * We have set the rescan_progress to 0, which means no more 3595 * delayed refs will be accounted by btrfs_qgroup_account_ref. 3596 * However, btrfs_qgroup_account_ref may be right after its call 3597 * to btrfs_find_all_roots, in which case it would still do the 3598 * accounting. 3599 * To solve this, we're committing the transaction, which will 3600 * ensure we run all delayed refs and only after that, we are 3601 * going to clear all tracking information for a clean start. 3602 */ 3603 3604 trans = btrfs_attach_transaction_barrier(fs_info->fs_root); 3605 if (IS_ERR(trans) && trans != ERR_PTR(-ENOENT)) { 3606 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3607 return PTR_ERR(trans); 3608 } else if (trans != ERR_PTR(-ENOENT)) { 3609 ret = btrfs_commit_transaction(trans); 3610 if (ret) { 3611 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3612 return ret; 3613 } 3614 } 3615 3616 qgroup_rescan_zero_tracking(fs_info); 3617 3618 mutex_lock(&fs_info->qgroup_rescan_lock); 3619 fs_info->qgroup_rescan_running = true; 3620 btrfs_queue_work(fs_info->qgroup_rescan_workers, 3621 &fs_info->qgroup_rescan_work); 3622 mutex_unlock(&fs_info->qgroup_rescan_lock); 3623 3624 return 0; 3625 } 3626 3627 int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, 3628 bool interruptible) 3629 { 3630 int running; 3631 int ret = 0; 3632 3633 mutex_lock(&fs_info->qgroup_rescan_lock); 3634 running = fs_info->qgroup_rescan_running; 3635 mutex_unlock(&fs_info->qgroup_rescan_lock); 3636 3637 if (!running) 3638 return 0; 3639 3640 if (interruptible) 3641 ret = wait_for_completion_interruptible( 3642 &fs_info->qgroup_rescan_completion); 3643 else 3644 wait_for_completion(&fs_info->qgroup_rescan_completion); 3645 3646 return ret; 3647 } 3648 3649 /* 3650 * this is only called from open_ctree where we're still single threaded, thus 3651 * locking is omitted here. 3652 */ 3653 void 3654 btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) 3655 { 3656 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 3657 mutex_lock(&fs_info->qgroup_rescan_lock); 3658 fs_info->qgroup_rescan_running = true; 3659 btrfs_queue_work(fs_info->qgroup_rescan_workers, 3660 &fs_info->qgroup_rescan_work); 3661 mutex_unlock(&fs_info->qgroup_rescan_lock); 3662 } 3663 } 3664 3665 #define rbtree_iterate_from_safe(node, next, start) \ 3666 for (node = start; node && ({ next = rb_next(node); 1;}); node = next) 3667 3668 static int qgroup_unreserve_range(struct btrfs_inode *inode, 3669 struct extent_changeset *reserved, u64 start, 3670 u64 len) 3671 { 3672 struct rb_node *node; 3673 struct rb_node *next; 3674 struct ulist_node *entry; 3675 int ret = 0; 3676 3677 node = reserved->range_changed.root.rb_node; 3678 if (!node) 3679 return 0; 3680 while (node) { 3681 entry = rb_entry(node, struct ulist_node, rb_node); 3682 if (entry->val < start) 3683 node = node->rb_right; 3684 else 3685 node = node->rb_left; 3686 } 3687 3688 if (entry->val > start && rb_prev(&entry->rb_node)) 3689 entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node, 3690 rb_node); 3691 3692 rbtree_iterate_from_safe(node, next, &entry->rb_node) { 3693 u64 entry_start; 3694 u64 entry_end; 3695 u64 entry_len; 3696 int clear_ret; 3697 3698 entry = rb_entry(node, struct ulist_node, rb_node); 3699 entry_start = entry->val; 3700 entry_end = entry->aux; 3701 entry_len = entry_end - entry_start + 1; 3702 3703 if (entry_start >= start + len) 3704 break; 3705 if (entry_start + entry_len <= start) 3706 continue; 3707 /* 3708 * Now the entry is in [start, start + len), revert the 3709 * EXTENT_QGROUP_RESERVED bit. 3710 */ 3711 clear_ret = clear_extent_bits(&inode->io_tree, entry_start, 3712 entry_end, EXTENT_QGROUP_RESERVED); 3713 if (!ret && clear_ret < 0) 3714 ret = clear_ret; 3715 3716 ulist_del(&reserved->range_changed, entry->val, entry->aux); 3717 if (likely(reserved->bytes_changed >= entry_len)) { 3718 reserved->bytes_changed -= entry_len; 3719 } else { 3720 WARN_ON(1); 3721 reserved->bytes_changed = 0; 3722 } 3723 } 3724 3725 return ret; 3726 } 3727 3728 /* 3729 * Try to free some space for qgroup. 3730 * 3731 * For qgroup, there are only 3 ways to free qgroup space: 3732 * - Flush nodatacow write 3733 * Any nodatacow write will free its reserved data space at run_delalloc_range(). 3734 * In theory, we should only flush nodatacow inodes, but it's not yet 3735 * possible, so we need to flush the whole root. 3736 * 3737 * - Wait for ordered extents 3738 * When ordered extents are finished, their reserved metadata is finally 3739 * converted to per_trans status, which can be freed by later commit 3740 * transaction. 3741 * 3742 * - Commit transaction 3743 * This would free the meta_per_trans space. 3744 * In theory this shouldn't provide much space, but any more qgroup space 3745 * is needed. 3746 */ 3747 static int try_flush_qgroup(struct btrfs_root *root) 3748 { 3749 struct btrfs_trans_handle *trans; 3750 int ret; 3751 3752 /* Can't hold an open transaction or we run the risk of deadlocking. */ 3753 ASSERT(current->journal_info == NULL); 3754 if (WARN_ON(current->journal_info)) 3755 return 0; 3756 3757 /* 3758 * We don't want to run flush again and again, so if there is a running 3759 * one, we won't try to start a new flush, but exit directly. 3760 */ 3761 if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) { 3762 wait_event(root->qgroup_flush_wait, 3763 !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)); 3764 return 0; 3765 } 3766 3767 ret = btrfs_start_delalloc_snapshot(root, true); 3768 if (ret < 0) 3769 goto out; 3770 btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); 3771 3772 trans = btrfs_attach_transaction_barrier(root); 3773 if (IS_ERR(trans)) { 3774 ret = PTR_ERR(trans); 3775 if (ret == -ENOENT) 3776 ret = 0; 3777 goto out; 3778 } 3779 3780 ret = btrfs_commit_transaction(trans); 3781 out: 3782 clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); 3783 wake_up(&root->qgroup_flush_wait); 3784 return ret; 3785 } 3786 3787 static int qgroup_reserve_data(struct btrfs_inode *inode, 3788 struct extent_changeset **reserved_ret, u64 start, 3789 u64 len) 3790 { 3791 struct btrfs_root *root = inode->root; 3792 struct extent_changeset *reserved; 3793 bool new_reserved = false; 3794 u64 orig_reserved; 3795 u64 to_reserve; 3796 int ret; 3797 3798 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || 3799 !is_fstree(root->root_key.objectid) || len == 0) 3800 return 0; 3801 3802 /* @reserved parameter is mandatory for qgroup */ 3803 if (WARN_ON(!reserved_ret)) 3804 return -EINVAL; 3805 if (!*reserved_ret) { 3806 new_reserved = true; 3807 *reserved_ret = extent_changeset_alloc(); 3808 if (!*reserved_ret) 3809 return -ENOMEM; 3810 } 3811 reserved = *reserved_ret; 3812 /* Record already reserved space */ 3813 orig_reserved = reserved->bytes_changed; 3814 ret = set_record_extent_bits(&inode->io_tree, start, 3815 start + len -1, EXTENT_QGROUP_RESERVED, reserved); 3816 3817 /* Newly reserved space */ 3818 to_reserve = reserved->bytes_changed - orig_reserved; 3819 trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len, 3820 to_reserve, QGROUP_RESERVE); 3821 if (ret < 0) 3822 goto out; 3823 ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA); 3824 if (ret < 0) 3825 goto cleanup; 3826 3827 return ret; 3828 3829 cleanup: 3830 qgroup_unreserve_range(inode, reserved, start, len); 3831 out: 3832 if (new_reserved) { 3833 extent_changeset_free(reserved); 3834 *reserved_ret = NULL; 3835 } 3836 return ret; 3837 } 3838 3839 /* 3840 * Reserve qgroup space for range [start, start + len). 3841 * 3842 * This function will either reserve space from related qgroups or do nothing 3843 * if the range is already reserved. 3844 * 3845 * Return 0 for successful reservation 3846 * Return <0 for error (including -EQUOT) 3847 * 3848 * NOTE: This function may sleep for memory allocation, dirty page flushing and 3849 * commit transaction. So caller should not hold any dirty page locked. 3850 */ 3851 int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, 3852 struct extent_changeset **reserved_ret, u64 start, 3853 u64 len) 3854 { 3855 int ret; 3856 3857 ret = qgroup_reserve_data(inode, reserved_ret, start, len); 3858 if (ret <= 0 && ret != -EDQUOT) 3859 return ret; 3860 3861 ret = try_flush_qgroup(inode->root); 3862 if (ret < 0) 3863 return ret; 3864 return qgroup_reserve_data(inode, reserved_ret, start, len); 3865 } 3866 3867 /* Free ranges specified by @reserved, normally in error path */ 3868 static int qgroup_free_reserved_data(struct btrfs_inode *inode, 3869 struct extent_changeset *reserved, 3870 u64 start, u64 len, u64 *freed_ret) 3871 { 3872 struct btrfs_root *root = inode->root; 3873 struct ulist_node *unode; 3874 struct ulist_iterator uiter; 3875 struct extent_changeset changeset; 3876 u64 freed = 0; 3877 int ret; 3878 3879 extent_changeset_init(&changeset); 3880 len = round_up(start + len, root->fs_info->sectorsize); 3881 start = round_down(start, root->fs_info->sectorsize); 3882 3883 ULIST_ITER_INIT(&uiter); 3884 while ((unode = ulist_next(&reserved->range_changed, &uiter))) { 3885 u64 range_start = unode->val; 3886 /* unode->aux is the inclusive end */ 3887 u64 range_len = unode->aux - range_start + 1; 3888 u64 free_start; 3889 u64 free_len; 3890 3891 extent_changeset_release(&changeset); 3892 3893 /* Only free range in range [start, start + len) */ 3894 if (range_start >= start + len || 3895 range_start + range_len <= start) 3896 continue; 3897 free_start = max(range_start, start); 3898 free_len = min(start + len, range_start + range_len) - 3899 free_start; 3900 /* 3901 * TODO: To also modify reserved->ranges_reserved to reflect 3902 * the modification. 3903 * 3904 * However as long as we free qgroup reserved according to 3905 * EXTENT_QGROUP_RESERVED, we won't double free. 3906 * So not need to rush. 3907 */ 3908 ret = clear_record_extent_bits(&inode->io_tree, free_start, 3909 free_start + free_len - 1, 3910 EXTENT_QGROUP_RESERVED, &changeset); 3911 if (ret < 0) 3912 goto out; 3913 freed += changeset.bytes_changed; 3914 } 3915 btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed, 3916 BTRFS_QGROUP_RSV_DATA); 3917 if (freed_ret) 3918 *freed_ret = freed; 3919 ret = 0; 3920 out: 3921 extent_changeset_release(&changeset); 3922 return ret; 3923 } 3924 3925 static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, 3926 struct extent_changeset *reserved, u64 start, u64 len, 3927 u64 *released, int free) 3928 { 3929 struct extent_changeset changeset; 3930 int trace_op = QGROUP_RELEASE; 3931 int ret; 3932 3933 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags)) 3934 return 0; 3935 3936 /* In release case, we shouldn't have @reserved */ 3937 WARN_ON(!free && reserved); 3938 if (free && reserved) 3939 return qgroup_free_reserved_data(inode, reserved, start, len, released); 3940 extent_changeset_init(&changeset); 3941 ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, 3942 EXTENT_QGROUP_RESERVED, &changeset); 3943 if (ret < 0) 3944 goto out; 3945 3946 if (free) 3947 trace_op = QGROUP_FREE; 3948 trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len, 3949 changeset.bytes_changed, trace_op); 3950 if (free) 3951 btrfs_qgroup_free_refroot(inode->root->fs_info, 3952 inode->root->root_key.objectid, 3953 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); 3954 if (released) 3955 *released = changeset.bytes_changed; 3956 out: 3957 extent_changeset_release(&changeset); 3958 return ret; 3959 } 3960 3961 /* 3962 * Free a reserved space range from io_tree and related qgroups 3963 * 3964 * Should be called when a range of pages get invalidated before reaching disk. 3965 * Or for error cleanup case. 3966 * if @reserved is given, only reserved range in [@start, @start + @len) will 3967 * be freed. 3968 * 3969 * For data written to disk, use btrfs_qgroup_release_data(). 3970 * 3971 * NOTE: This function may sleep for memory allocation. 3972 */ 3973 int btrfs_qgroup_free_data(struct btrfs_inode *inode, 3974 struct extent_changeset *reserved, 3975 u64 start, u64 len, u64 *freed) 3976 { 3977 return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1); 3978 } 3979 3980 /* 3981 * Release a reserved space range from io_tree only. 3982 * 3983 * Should be called when a range of pages get written to disk and corresponding 3984 * FILE_EXTENT is inserted into corresponding root. 3985 * 3986 * Since new qgroup accounting framework will only update qgroup numbers at 3987 * commit_transaction() time, its reserved space shouldn't be freed from 3988 * related qgroups. 3989 * 3990 * But we should release the range from io_tree, to allow further write to be 3991 * COWed. 3992 * 3993 * NOTE: This function may sleep for memory allocation. 3994 */ 3995 int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released) 3996 { 3997 return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0); 3998 } 3999 4000 static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes, 4001 enum btrfs_qgroup_rsv_type type) 4002 { 4003 if (type != BTRFS_QGROUP_RSV_META_PREALLOC && 4004 type != BTRFS_QGROUP_RSV_META_PERTRANS) 4005 return; 4006 if (num_bytes == 0) 4007 return; 4008 4009 spin_lock(&root->qgroup_meta_rsv_lock); 4010 if (type == BTRFS_QGROUP_RSV_META_PREALLOC) 4011 root->qgroup_meta_rsv_prealloc += num_bytes; 4012 else 4013 root->qgroup_meta_rsv_pertrans += num_bytes; 4014 spin_unlock(&root->qgroup_meta_rsv_lock); 4015 } 4016 4017 static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes, 4018 enum btrfs_qgroup_rsv_type type) 4019 { 4020 if (type != BTRFS_QGROUP_RSV_META_PREALLOC && 4021 type != BTRFS_QGROUP_RSV_META_PERTRANS) 4022 return 0; 4023 if (num_bytes == 0) 4024 return 0; 4025 4026 spin_lock(&root->qgroup_meta_rsv_lock); 4027 if (type == BTRFS_QGROUP_RSV_META_PREALLOC) { 4028 num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc, 4029 num_bytes); 4030 root->qgroup_meta_rsv_prealloc -= num_bytes; 4031 } else { 4032 num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans, 4033 num_bytes); 4034 root->qgroup_meta_rsv_pertrans -= num_bytes; 4035 } 4036 spin_unlock(&root->qgroup_meta_rsv_lock); 4037 return num_bytes; 4038 } 4039 4040 int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, 4041 enum btrfs_qgroup_rsv_type type, bool enforce) 4042 { 4043 struct btrfs_fs_info *fs_info = root->fs_info; 4044 int ret; 4045 4046 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 4047 !is_fstree(root->root_key.objectid) || num_bytes == 0) 4048 return 0; 4049 4050 BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); 4051 trace_qgroup_meta_reserve(root, (s64)num_bytes, type); 4052 ret = qgroup_reserve(root, num_bytes, enforce, type); 4053 if (ret < 0) 4054 return ret; 4055 /* 4056 * Record what we have reserved into root. 4057 * 4058 * To avoid quota disabled->enabled underflow. 4059 * In that case, we may try to free space we haven't reserved 4060 * (since quota was disabled), so record what we reserved into root. 4061 * And ensure later release won't underflow this number. 4062 */ 4063 add_root_meta_rsv(root, num_bytes, type); 4064 return ret; 4065 } 4066 4067 int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, 4068 enum btrfs_qgroup_rsv_type type, bool enforce, 4069 bool noflush) 4070 { 4071 int ret; 4072 4073 ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); 4074 if ((ret <= 0 && ret != -EDQUOT) || noflush) 4075 return ret; 4076 4077 ret = try_flush_qgroup(root); 4078 if (ret < 0) 4079 return ret; 4080 return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); 4081 } 4082 4083 void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) 4084 { 4085 struct btrfs_fs_info *fs_info = root->fs_info; 4086 4087 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 4088 !is_fstree(root->root_key.objectid)) 4089 return; 4090 4091 /* TODO: Update trace point to handle such free */ 4092 trace_qgroup_meta_free_all_pertrans(root); 4093 /* Special value -1 means to free all reserved space */ 4094 btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1, 4095 BTRFS_QGROUP_RSV_META_PERTRANS); 4096 } 4097 4098 void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, 4099 enum btrfs_qgroup_rsv_type type) 4100 { 4101 struct btrfs_fs_info *fs_info = root->fs_info; 4102 4103 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 4104 !is_fstree(root->root_key.objectid)) 4105 return; 4106 4107 /* 4108 * reservation for META_PREALLOC can happen before quota is enabled, 4109 * which can lead to underflow. 4110 * Here ensure we will only free what we really have reserved. 4111 */ 4112 num_bytes = sub_root_meta_rsv(root, num_bytes, type); 4113 BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); 4114 trace_qgroup_meta_reserve(root, -(s64)num_bytes, type); 4115 btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, 4116 num_bytes, type); 4117 } 4118 4119 static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, 4120 int num_bytes) 4121 { 4122 struct btrfs_qgroup *qgroup; 4123 LIST_HEAD(qgroup_list); 4124 4125 if (num_bytes == 0) 4126 return; 4127 if (!fs_info->quota_root) 4128 return; 4129 4130 spin_lock(&fs_info->qgroup_lock); 4131 qgroup = find_qgroup_rb(fs_info, ref_root); 4132 if (!qgroup) 4133 goto out; 4134 4135 qgroup_iterator_add(&qgroup_list, qgroup); 4136 list_for_each_entry(qgroup, &qgroup_list, iterator) { 4137 struct btrfs_qgroup_list *glist; 4138 4139 qgroup_rsv_release(fs_info, qgroup, num_bytes, 4140 BTRFS_QGROUP_RSV_META_PREALLOC); 4141 if (!sb_rdonly(fs_info->sb)) 4142 qgroup_rsv_add(fs_info, qgroup, num_bytes, 4143 BTRFS_QGROUP_RSV_META_PERTRANS); 4144 4145 list_for_each_entry(glist, &qgroup->groups, next_group) 4146 qgroup_iterator_add(&qgroup_list, glist->group); 4147 } 4148 out: 4149 qgroup_iterator_clean(&qgroup_list); 4150 spin_unlock(&fs_info->qgroup_lock); 4151 } 4152 4153 void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) 4154 { 4155 struct btrfs_fs_info *fs_info = root->fs_info; 4156 4157 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 4158 !is_fstree(root->root_key.objectid)) 4159 return; 4160 /* Same as btrfs_qgroup_free_meta_prealloc() */ 4161 num_bytes = sub_root_meta_rsv(root, num_bytes, 4162 BTRFS_QGROUP_RSV_META_PREALLOC); 4163 trace_qgroup_meta_convert(root, num_bytes); 4164 qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes); 4165 if (!sb_rdonly(fs_info->sb)) 4166 add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS); 4167 } 4168 4169 /* 4170 * Check qgroup reserved space leaking, normally at destroy inode 4171 * time 4172 */ 4173 void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode) 4174 { 4175 struct extent_changeset changeset; 4176 struct ulist_node *unode; 4177 struct ulist_iterator iter; 4178 int ret; 4179 4180 extent_changeset_init(&changeset); 4181 ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1, 4182 EXTENT_QGROUP_RESERVED, &changeset); 4183 4184 WARN_ON(ret < 0); 4185 if (WARN_ON(changeset.bytes_changed)) { 4186 ULIST_ITER_INIT(&iter); 4187 while ((unode = ulist_next(&changeset.range_changed, &iter))) { 4188 btrfs_warn(inode->root->fs_info, 4189 "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu", 4190 btrfs_ino(inode), unode->val, unode->aux); 4191 } 4192 btrfs_qgroup_free_refroot(inode->root->fs_info, 4193 inode->root->root_key.objectid, 4194 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); 4195 4196 } 4197 extent_changeset_release(&changeset); 4198 } 4199 4200 void btrfs_qgroup_init_swapped_blocks( 4201 struct btrfs_qgroup_swapped_blocks *swapped_blocks) 4202 { 4203 int i; 4204 4205 spin_lock_init(&swapped_blocks->lock); 4206 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 4207 swapped_blocks->blocks[i] = RB_ROOT; 4208 swapped_blocks->swapped = false; 4209 } 4210 4211 /* 4212 * Delete all swapped blocks record of @root. 4213 * Every record here means we skipped a full subtree scan for qgroup. 4214 * 4215 * Gets called when committing one transaction. 4216 */ 4217 void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root) 4218 { 4219 struct btrfs_qgroup_swapped_blocks *swapped_blocks; 4220 int i; 4221 4222 swapped_blocks = &root->swapped_blocks; 4223 4224 spin_lock(&swapped_blocks->lock); 4225 if (!swapped_blocks->swapped) 4226 goto out; 4227 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 4228 struct rb_root *cur_root = &swapped_blocks->blocks[i]; 4229 struct btrfs_qgroup_swapped_block *entry; 4230 struct btrfs_qgroup_swapped_block *next; 4231 4232 rbtree_postorder_for_each_entry_safe(entry, next, cur_root, 4233 node) 4234 kfree(entry); 4235 swapped_blocks->blocks[i] = RB_ROOT; 4236 } 4237 swapped_blocks->swapped = false; 4238 out: 4239 spin_unlock(&swapped_blocks->lock); 4240 } 4241 4242 /* 4243 * Add subtree roots record into @subvol_root. 4244 * 4245 * @subvol_root: tree root of the subvolume tree get swapped 4246 * @bg: block group under balance 4247 * @subvol_parent/slot: pointer to the subtree root in subvolume tree 4248 * @reloc_parent/slot: pointer to the subtree root in reloc tree 4249 * BOTH POINTERS ARE BEFORE TREE SWAP 4250 * @last_snapshot: last snapshot generation of the subvolume tree 4251 */ 4252 int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, 4253 struct btrfs_root *subvol_root, 4254 struct btrfs_block_group *bg, 4255 struct extent_buffer *subvol_parent, int subvol_slot, 4256 struct extent_buffer *reloc_parent, int reloc_slot, 4257 u64 last_snapshot) 4258 { 4259 struct btrfs_fs_info *fs_info = subvol_root->fs_info; 4260 struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks; 4261 struct btrfs_qgroup_swapped_block *block; 4262 struct rb_node **cur; 4263 struct rb_node *parent = NULL; 4264 int level = btrfs_header_level(subvol_parent) - 1; 4265 int ret = 0; 4266 4267 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 4268 return 0; 4269 4270 if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) > 4271 btrfs_node_ptr_generation(reloc_parent, reloc_slot)) { 4272 btrfs_err_rl(fs_info, 4273 "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu", 4274 __func__, 4275 btrfs_node_ptr_generation(subvol_parent, subvol_slot), 4276 btrfs_node_ptr_generation(reloc_parent, reloc_slot)); 4277 return -EUCLEAN; 4278 } 4279 4280 block = kmalloc(sizeof(*block), GFP_NOFS); 4281 if (!block) { 4282 ret = -ENOMEM; 4283 goto out; 4284 } 4285 4286 /* 4287 * @reloc_parent/slot is still before swap, while @block is going to 4288 * record the bytenr after swap, so we do the swap here. 4289 */ 4290 block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot); 4291 block->subvol_generation = btrfs_node_ptr_generation(reloc_parent, 4292 reloc_slot); 4293 block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot); 4294 block->reloc_generation = btrfs_node_ptr_generation(subvol_parent, 4295 subvol_slot); 4296 block->last_snapshot = last_snapshot; 4297 block->level = level; 4298 4299 /* 4300 * If we have bg == NULL, we're called from btrfs_recover_relocation(), 4301 * no one else can modify tree blocks thus we qgroup will not change 4302 * no matter the value of trace_leaf. 4303 */ 4304 if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA) 4305 block->trace_leaf = true; 4306 else 4307 block->trace_leaf = false; 4308 btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot); 4309 4310 /* Insert @block into @blocks */ 4311 spin_lock(&blocks->lock); 4312 cur = &blocks->blocks[level].rb_node; 4313 while (*cur) { 4314 struct btrfs_qgroup_swapped_block *entry; 4315 4316 parent = *cur; 4317 entry = rb_entry(parent, struct btrfs_qgroup_swapped_block, 4318 node); 4319 4320 if (entry->subvol_bytenr < block->subvol_bytenr) { 4321 cur = &(*cur)->rb_left; 4322 } else if (entry->subvol_bytenr > block->subvol_bytenr) { 4323 cur = &(*cur)->rb_right; 4324 } else { 4325 if (entry->subvol_generation != 4326 block->subvol_generation || 4327 entry->reloc_bytenr != block->reloc_bytenr || 4328 entry->reloc_generation != 4329 block->reloc_generation) { 4330 /* 4331 * Duplicated but mismatch entry found. 4332 * Shouldn't happen. 4333 * 4334 * Marking qgroup inconsistent should be enough 4335 * for end users. 4336 */ 4337 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); 4338 ret = -EEXIST; 4339 } 4340 kfree(block); 4341 goto out_unlock; 4342 } 4343 } 4344 rb_link_node(&block->node, parent, cur); 4345 rb_insert_color(&block->node, &blocks->blocks[level]); 4346 blocks->swapped = true; 4347 out_unlock: 4348 spin_unlock(&blocks->lock); 4349 out: 4350 if (ret < 0) 4351 qgroup_mark_inconsistent(fs_info); 4352 return ret; 4353 } 4354 4355 /* 4356 * Check if the tree block is a subtree root, and if so do the needed 4357 * delayed subtree trace for qgroup. 4358 * 4359 * This is called during btrfs_cow_block(). 4360 */ 4361 int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, 4362 struct btrfs_root *root, 4363 struct extent_buffer *subvol_eb) 4364 { 4365 struct btrfs_fs_info *fs_info = root->fs_info; 4366 struct btrfs_tree_parent_check check = { 0 }; 4367 struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks; 4368 struct btrfs_qgroup_swapped_block *block; 4369 struct extent_buffer *reloc_eb = NULL; 4370 struct rb_node *node; 4371 bool found = false; 4372 bool swapped = false; 4373 int level = btrfs_header_level(subvol_eb); 4374 int ret = 0; 4375 int i; 4376 4377 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 4378 return 0; 4379 if (!is_fstree(root->root_key.objectid) || !root->reloc_root) 4380 return 0; 4381 4382 spin_lock(&blocks->lock); 4383 if (!blocks->swapped) { 4384 spin_unlock(&blocks->lock); 4385 return 0; 4386 } 4387 node = blocks->blocks[level].rb_node; 4388 4389 while (node) { 4390 block = rb_entry(node, struct btrfs_qgroup_swapped_block, node); 4391 if (block->subvol_bytenr < subvol_eb->start) { 4392 node = node->rb_left; 4393 } else if (block->subvol_bytenr > subvol_eb->start) { 4394 node = node->rb_right; 4395 } else { 4396 found = true; 4397 break; 4398 } 4399 } 4400 if (!found) { 4401 spin_unlock(&blocks->lock); 4402 goto out; 4403 } 4404 /* Found one, remove it from @blocks first and update blocks->swapped */ 4405 rb_erase(&block->node, &blocks->blocks[level]); 4406 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 4407 if (RB_EMPTY_ROOT(&blocks->blocks[i])) { 4408 swapped = true; 4409 break; 4410 } 4411 } 4412 blocks->swapped = swapped; 4413 spin_unlock(&blocks->lock); 4414 4415 check.level = block->level; 4416 check.transid = block->reloc_generation; 4417 check.has_first_key = true; 4418 memcpy(&check.first_key, &block->first_key, sizeof(check.first_key)); 4419 4420 /* Read out reloc subtree root */ 4421 reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, &check); 4422 if (IS_ERR(reloc_eb)) { 4423 ret = PTR_ERR(reloc_eb); 4424 reloc_eb = NULL; 4425 goto free_out; 4426 } 4427 if (!extent_buffer_uptodate(reloc_eb)) { 4428 ret = -EIO; 4429 goto free_out; 4430 } 4431 4432 ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb, 4433 block->last_snapshot, block->trace_leaf); 4434 free_out: 4435 kfree(block); 4436 free_extent_buffer(reloc_eb); 4437 out: 4438 if (ret < 0) { 4439 btrfs_err_rl(fs_info, 4440 "failed to account subtree at bytenr %llu: %d", 4441 subvol_eb->start, ret); 4442 qgroup_mark_inconsistent(fs_info); 4443 } 4444 return ret; 4445 } 4446 4447 void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) 4448 { 4449 struct btrfs_qgroup_extent_record *entry; 4450 struct btrfs_qgroup_extent_record *next; 4451 struct rb_root *root; 4452 4453 root = &trans->delayed_refs.dirty_extent_root; 4454 rbtree_postorder_for_each_entry_safe(entry, next, root, node) { 4455 ulist_free(entry->old_roots); 4456 kfree(entry); 4457 } 4458 *root = RB_ROOT; 4459 } 4460