1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2011 STRATO. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/pagemap.h> 8 #include <linux/writeback.h> 9 #include <linux/blkdev.h> 10 #include <linux/rbtree.h> 11 #include <linux/slab.h> 12 #include <linux/workqueue.h> 13 #include <linux/btrfs.h> 14 #include <linux/sched/mm.h> 15 16 #include "ctree.h" 17 #include "transaction.h" 18 #include "disk-io.h" 19 #include "locking.h" 20 #include "ulist.h" 21 #include "backref.h" 22 #include "extent_io.h" 23 #include "qgroup.h" 24 #include "block-group.h" 25 #include "sysfs.h" 26 #include "tree-mod-log.h" 27 #include "fs.h" 28 #include "accessors.h" 29 #include "extent-tree.h" 30 #include "root-tree.h" 31 #include "tree-checker.h" 32 33 /* 34 * Helpers to access qgroup reservation 35 * 36 * Callers should ensure the lock context and type are valid 37 */ 38 39 static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup) 40 { 41 u64 ret = 0; 42 int i; 43 44 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) 45 ret += qgroup->rsv.values[i]; 46 47 return ret; 48 } 49 50 #ifdef CONFIG_BTRFS_DEBUG 51 static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type) 52 { 53 if (type == BTRFS_QGROUP_RSV_DATA) 54 return "data"; 55 if (type == BTRFS_QGROUP_RSV_META_PERTRANS) 56 return "meta_pertrans"; 57 if (type == BTRFS_QGROUP_RSV_META_PREALLOC) 58 return "meta_prealloc"; 59 return NULL; 60 } 61 #endif 62 63 static void qgroup_rsv_add(struct btrfs_fs_info *fs_info, 64 struct btrfs_qgroup *qgroup, u64 num_bytes, 65 enum btrfs_qgroup_rsv_type type) 66 { 67 trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type); 68 qgroup->rsv.values[type] += num_bytes; 69 } 70 71 static void qgroup_rsv_release(struct btrfs_fs_info *fs_info, 72 struct btrfs_qgroup *qgroup, u64 num_bytes, 73 enum btrfs_qgroup_rsv_type type) 74 { 75 trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type); 76 if (qgroup->rsv.values[type] >= num_bytes) { 77 qgroup->rsv.values[type] -= num_bytes; 78 return; 79 } 80 #ifdef CONFIG_BTRFS_DEBUG 81 WARN_RATELIMIT(1, 82 "qgroup %llu %s reserved space underflow, have %llu to free %llu", 83 qgroup->qgroupid, qgroup_rsv_type_str(type), 84 qgroup->rsv.values[type], num_bytes); 85 #endif 86 qgroup->rsv.values[type] = 0; 87 } 88 89 static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info, 90 struct btrfs_qgroup *dest, 91 struct btrfs_qgroup *src) 92 { 93 int i; 94 95 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) 96 qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i); 97 } 98 99 static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info, 100 struct btrfs_qgroup *dest, 101 struct btrfs_qgroup *src) 102 { 103 int i; 104 105 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) 106 qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i); 107 } 108 109 static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, 110 int mod) 111 { 112 if (qg->old_refcnt < seq) 113 qg->old_refcnt = seq; 114 qg->old_refcnt += mod; 115 } 116 117 static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq, 118 int mod) 119 { 120 if (qg->new_refcnt < seq) 121 qg->new_refcnt = seq; 122 qg->new_refcnt += mod; 123 } 124 125 static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq) 126 { 127 if (qg->old_refcnt < seq) 128 return 0; 129 return qg->old_refcnt - seq; 130 } 131 132 static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq) 133 { 134 if (qg->new_refcnt < seq) 135 return 0; 136 return 
qg->new_refcnt - seq; 137 } 138 139 /* 140 * glue structure to represent the relations between qgroups. 141 */ 142 struct btrfs_qgroup_list { 143 struct list_head next_group; 144 struct list_head next_member; 145 struct btrfs_qgroup *group; 146 struct btrfs_qgroup *member; 147 }; 148 149 static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg) 150 { 151 return (u64)(uintptr_t)qg; 152 } 153 154 static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n) 155 { 156 return (struct btrfs_qgroup *)(uintptr_t)n->aux; 157 } 158 159 static int 160 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, 161 int init_flags); 162 static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info); 163 164 /* must be called with qgroup_ioctl_lock held */ 165 static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, 166 u64 qgroupid) 167 { 168 struct rb_node *n = fs_info->qgroup_tree.rb_node; 169 struct btrfs_qgroup *qgroup; 170 171 while (n) { 172 qgroup = rb_entry(n, struct btrfs_qgroup, node); 173 if (qgroup->qgroupid < qgroupid) 174 n = n->rb_left; 175 else if (qgroup->qgroupid > qgroupid) 176 n = n->rb_right; 177 else 178 return qgroup; 179 } 180 return NULL; 181 } 182 183 /* must be called with qgroup_lock held */ 184 static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, 185 u64 qgroupid) 186 { 187 struct rb_node **p = &fs_info->qgroup_tree.rb_node; 188 struct rb_node *parent = NULL; 189 struct btrfs_qgroup *qgroup; 190 191 while (*p) { 192 parent = *p; 193 qgroup = rb_entry(parent, struct btrfs_qgroup, node); 194 195 if (qgroup->qgroupid < qgroupid) 196 p = &(*p)->rb_left; 197 else if (qgroup->qgroupid > qgroupid) 198 p = &(*p)->rb_right; 199 else 200 return qgroup; 201 } 202 203 qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC); 204 if (!qgroup) 205 return ERR_PTR(-ENOMEM); 206 207 qgroup->qgroupid = qgroupid; 208 INIT_LIST_HEAD(&qgroup->groups); 209 INIT_LIST_HEAD(&qgroup->members); 210 INIT_LIST_HEAD(&qgroup->dirty); 211 INIT_LIST_HEAD(&qgroup->iterator); 212 213 rb_link_node(&qgroup->node, parent, p); 214 rb_insert_color(&qgroup->node, &fs_info->qgroup_tree); 215 216 return qgroup; 217 } 218 219 static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, 220 struct btrfs_qgroup *qgroup) 221 { 222 struct btrfs_qgroup_list *list; 223 224 list_del(&qgroup->dirty); 225 while (!list_empty(&qgroup->groups)) { 226 list = list_first_entry(&qgroup->groups, 227 struct btrfs_qgroup_list, next_group); 228 list_del(&list->next_group); 229 list_del(&list->next_member); 230 kfree(list); 231 } 232 233 while (!list_empty(&qgroup->members)) { 234 list = list_first_entry(&qgroup->members, 235 struct btrfs_qgroup_list, next_member); 236 list_del(&list->next_group); 237 list_del(&list->next_member); 238 kfree(list); 239 } 240 } 241 242 /* must be called with qgroup_lock held */ 243 static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) 244 { 245 struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid); 246 247 if (!qgroup) 248 return -ENOENT; 249 250 rb_erase(&qgroup->node, &fs_info->qgroup_tree); 251 __del_qgroup_rb(fs_info, qgroup); 252 return 0; 253 } 254 255 /* 256 * Add relation specified by two qgroups. 257 * 258 * Must be called with qgroup_lock held. 
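 *
 * Each relation is one struct btrfs_qgroup_list linked into both
 * member->groups and parent->members, so e.g. making 0/257 a member of
 * 1/100 allocates a single list node reachable from either qgroup.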
259 * 260 * Return: 0 on success 261 * -ENOENT if one of the qgroups is NULL 262 * <0 other errors 263 */ 264 static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent) 265 { 266 struct btrfs_qgroup_list *list; 267 268 if (!member || !parent) 269 return -ENOENT; 270 271 list = kzalloc(sizeof(*list), GFP_ATOMIC); 272 if (!list) 273 return -ENOMEM; 274 275 list->group = parent; 276 list->member = member; 277 list_add_tail(&list->next_group, &member->groups); 278 list_add_tail(&list->next_member, &parent->members); 279 280 return 0; 281 } 282 283 /* 284 * Add relation specified by two qgroup ids. 285 * 286 * Must be called with qgroup_lock held. 287 * 288 * Return: 0 on success 289 * -ENOENT if one of the ids does not exist 290 * <0 other errors 291 */ 292 static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid) 293 { 294 struct btrfs_qgroup *member; 295 struct btrfs_qgroup *parent; 296 297 member = find_qgroup_rb(fs_info, memberid); 298 parent = find_qgroup_rb(fs_info, parentid); 299 300 return __add_relation_rb(member, parent); 301 } 302 303 /* Must be called with qgroup_lock held */ 304 static int del_relation_rb(struct btrfs_fs_info *fs_info, 305 u64 memberid, u64 parentid) 306 { 307 struct btrfs_qgroup *member; 308 struct btrfs_qgroup *parent; 309 struct btrfs_qgroup_list *list; 310 311 member = find_qgroup_rb(fs_info, memberid); 312 parent = find_qgroup_rb(fs_info, parentid); 313 if (!member || !parent) 314 return -ENOENT; 315 316 list_for_each_entry(list, &member->groups, next_group) { 317 if (list->group == parent) { 318 list_del(&list->next_group); 319 list_del(&list->next_member); 320 kfree(list); 321 return 0; 322 } 323 } 324 return -ENOENT; 325 } 326 327 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 328 int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, 329 u64 rfer, u64 excl) 330 { 331 struct btrfs_qgroup *qgroup; 332 333 qgroup = find_qgroup_rb(fs_info, qgroupid); 334 if (!qgroup) 335 return -EINVAL; 336 if (qgroup->rfer != rfer || qgroup->excl != excl) 337 return -EINVAL; 338 return 0; 339 } 340 #endif 341 342 static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info) 343 { 344 fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT | 345 BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | 346 BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); 347 } 348 349 /* 350 * The full config is read in one go, only called from open_ctree() 351 * It doesn't use any locking, as at this point we're still single-threaded 352 */ 353 int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) 354 { 355 struct btrfs_key key; 356 struct btrfs_key found_key; 357 struct btrfs_root *quota_root = fs_info->quota_root; 358 struct btrfs_path *path = NULL; 359 struct extent_buffer *l; 360 int slot; 361 int ret = 0; 362 u64 flags = 0; 363 u64 rescan_progress = 0; 364 365 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 366 return 0; 367 368 fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); 369 if (!fs_info->qgroup_ulist) { 370 ret = -ENOMEM; 371 goto out; 372 } 373 374 path = btrfs_alloc_path(); 375 if (!path) { 376 ret = -ENOMEM; 377 goto out; 378 } 379 380 ret = btrfs_sysfs_add_qgroups(fs_info); 381 if (ret < 0) 382 goto out; 383 /* default this to quota off, in case no status key is found */ 384 fs_info->qgroup_flags = 0; 385 386 /* 387 * pass 1: read status, all qgroup infos and limits 388 */ 389 key.objectid = 0; 390 key.type = 0; 391 key.offset = 0; 392 ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1); 393 
if (ret) 394 goto out; 395 396 while (1) { 397 struct btrfs_qgroup *qgroup; 398 399 slot = path->slots[0]; 400 l = path->nodes[0]; 401 btrfs_item_key_to_cpu(l, &found_key, slot); 402 403 if (found_key.type == BTRFS_QGROUP_STATUS_KEY) { 404 struct btrfs_qgroup_status_item *ptr; 405 406 ptr = btrfs_item_ptr(l, slot, 407 struct btrfs_qgroup_status_item); 408 409 if (btrfs_qgroup_status_version(l, ptr) != 410 BTRFS_QGROUP_STATUS_VERSION) { 411 btrfs_err(fs_info, 412 "old qgroup version, quota disabled"); 413 goto out; 414 } 415 if (btrfs_qgroup_status_generation(l, ptr) != 416 fs_info->generation) { 417 qgroup_mark_inconsistent(fs_info); 418 btrfs_err(fs_info, 419 "qgroup generation mismatch, marked as inconsistent"); 420 } 421 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, 422 ptr); 423 rescan_progress = btrfs_qgroup_status_rescan(l, ptr); 424 goto next1; 425 } 426 427 if (found_key.type != BTRFS_QGROUP_INFO_KEY && 428 found_key.type != BTRFS_QGROUP_LIMIT_KEY) 429 goto next1; 430 431 qgroup = find_qgroup_rb(fs_info, found_key.offset); 432 if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || 433 (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { 434 btrfs_err(fs_info, "inconsistent qgroup config"); 435 qgroup_mark_inconsistent(fs_info); 436 } 437 if (!qgroup) { 438 qgroup = add_qgroup_rb(fs_info, found_key.offset); 439 if (IS_ERR(qgroup)) { 440 ret = PTR_ERR(qgroup); 441 goto out; 442 } 443 } 444 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 445 if (ret < 0) 446 goto out; 447 448 switch (found_key.type) { 449 case BTRFS_QGROUP_INFO_KEY: { 450 struct btrfs_qgroup_info_item *ptr; 451 452 ptr = btrfs_item_ptr(l, slot, 453 struct btrfs_qgroup_info_item); 454 qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr); 455 qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr); 456 qgroup->excl = btrfs_qgroup_info_excl(l, ptr); 457 qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr); 458 /* generation currently unused */ 459 break; 460 } 461 case BTRFS_QGROUP_LIMIT_KEY: { 462 struct btrfs_qgroup_limit_item *ptr; 463 464 ptr = btrfs_item_ptr(l, slot, 465 struct btrfs_qgroup_limit_item); 466 qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr); 467 qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr); 468 qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr); 469 qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr); 470 qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr); 471 break; 472 } 473 } 474 next1: 475 ret = btrfs_next_item(quota_root, path); 476 if (ret < 0) 477 goto out; 478 if (ret) 479 break; 480 } 481 btrfs_release_path(path); 482 483 /* 484 * pass 2: read all qgroup relations 485 */ 486 key.objectid = 0; 487 key.type = BTRFS_QGROUP_RELATION_KEY; 488 key.offset = 0; 489 ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0); 490 if (ret) 491 goto out; 492 while (1) { 493 slot = path->slots[0]; 494 l = path->nodes[0]; 495 btrfs_item_key_to_cpu(l, &found_key, slot); 496 497 if (found_key.type != BTRFS_QGROUP_RELATION_KEY) 498 goto next2; 499 500 if (found_key.objectid > found_key.offset) { 501 /* parent <- member, not needed to build config */ 502 /* FIXME should we omit the key completely? 
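			 * (Each relation is stored twice, as
			 * (member, BTRFS_QGROUP_RELATION_KEY, parent) and
			 * (parent, BTRFS_QGROUP_RELATION_KEY, member), so the
			 * member -> parent items alone are enough to rebuild
			 * the in-memory config.)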
*/ 503 goto next2; 504 } 505 506 ret = add_relation_rb(fs_info, found_key.objectid, 507 found_key.offset); 508 if (ret == -ENOENT) { 509 btrfs_warn(fs_info, 510 "orphan qgroup relation 0x%llx->0x%llx", 511 found_key.objectid, found_key.offset); 512 ret = 0; /* ignore the error */ 513 } 514 if (ret) 515 goto out; 516 next2: 517 ret = btrfs_next_item(quota_root, path); 518 if (ret < 0) 519 goto out; 520 if (ret) 521 break; 522 } 523 out: 524 btrfs_free_path(path); 525 fs_info->qgroup_flags |= flags; 526 if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) 527 clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 528 else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN && 529 ret >= 0) 530 ret = qgroup_rescan_init(fs_info, rescan_progress, 0); 531 532 if (ret < 0) { 533 ulist_free(fs_info->qgroup_ulist); 534 fs_info->qgroup_ulist = NULL; 535 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 536 btrfs_sysfs_del_qgroups(fs_info); 537 } 538 539 return ret < 0 ? ret : 0; 540 } 541 542 /* 543 * Called in close_ctree() when quota is still enabled. This verifies we don't 544 * leak some reserved space. 545 * 546 * Return false if no reserved space is left. 547 * Return true if some reserved space is leaked. 548 */ 549 bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info) 550 { 551 struct rb_node *node; 552 bool ret = false; 553 554 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 555 return ret; 556 /* 557 * Since we're unmounting, there is no race and no need to grab qgroup 558 * lock. And here we don't go post-order to provide a more user 559 * friendly sorted result. 560 */ 561 for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) { 562 struct btrfs_qgroup *qgroup; 563 int i; 564 565 qgroup = rb_entry(node, struct btrfs_qgroup, node); 566 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) { 567 if (qgroup->rsv.values[i]) { 568 ret = true; 569 btrfs_warn(fs_info, 570 "qgroup %hu/%llu has unreleased space, type %d rsv %llu", 571 btrfs_qgroup_level(qgroup->qgroupid), 572 btrfs_qgroup_subvolid(qgroup->qgroupid), 573 i, qgroup->rsv.values[i]); 574 } 575 } 576 } 577 return ret; 578 } 579 580 /* 581 * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(), 582 * first two are in single-threaded paths.And for the third one, we have set 583 * quota_root to be null with qgroup_lock held before, so it is safe to clean 584 * up the in-memory structures without qgroup_lock held. 585 */ 586 void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) 587 { 588 struct rb_node *n; 589 struct btrfs_qgroup *qgroup; 590 591 while ((n = rb_first(&fs_info->qgroup_tree))) { 592 qgroup = rb_entry(n, struct btrfs_qgroup, node); 593 rb_erase(n, &fs_info->qgroup_tree); 594 __del_qgroup_rb(fs_info, qgroup); 595 btrfs_sysfs_del_one_qgroup(fs_info, qgroup); 596 kfree(qgroup); 597 } 598 /* 599 * We call btrfs_free_qgroup_config() when unmounting 600 * filesystem and disabling quota, so we set qgroup_ulist 601 * to be null here to avoid double free. 
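	 * (ulist_free() tolerates a NULL pointer, so any later call simply
	 * becomes a no-op.)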
602 */ 603 ulist_free(fs_info->qgroup_ulist); 604 fs_info->qgroup_ulist = NULL; 605 btrfs_sysfs_del_qgroups(fs_info); 606 } 607 608 static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, 609 u64 dst) 610 { 611 int ret; 612 struct btrfs_root *quota_root = trans->fs_info->quota_root; 613 struct btrfs_path *path; 614 struct btrfs_key key; 615 616 path = btrfs_alloc_path(); 617 if (!path) 618 return -ENOMEM; 619 620 key.objectid = src; 621 key.type = BTRFS_QGROUP_RELATION_KEY; 622 key.offset = dst; 623 624 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); 625 626 btrfs_mark_buffer_dirty(trans, path->nodes[0]); 627 628 btrfs_free_path(path); 629 return ret; 630 } 631 632 static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, 633 u64 dst) 634 { 635 int ret; 636 struct btrfs_root *quota_root = trans->fs_info->quota_root; 637 struct btrfs_path *path; 638 struct btrfs_key key; 639 640 path = btrfs_alloc_path(); 641 if (!path) 642 return -ENOMEM; 643 644 key.objectid = src; 645 key.type = BTRFS_QGROUP_RELATION_KEY; 646 key.offset = dst; 647 648 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); 649 if (ret < 0) 650 goto out; 651 652 if (ret > 0) { 653 ret = -ENOENT; 654 goto out; 655 } 656 657 ret = btrfs_del_item(trans, quota_root, path); 658 out: 659 btrfs_free_path(path); 660 return ret; 661 } 662 663 static int add_qgroup_item(struct btrfs_trans_handle *trans, 664 struct btrfs_root *quota_root, u64 qgroupid) 665 { 666 int ret; 667 struct btrfs_path *path; 668 struct btrfs_qgroup_info_item *qgroup_info; 669 struct btrfs_qgroup_limit_item *qgroup_limit; 670 struct extent_buffer *leaf; 671 struct btrfs_key key; 672 673 if (btrfs_is_testing(quota_root->fs_info)) 674 return 0; 675 676 path = btrfs_alloc_path(); 677 if (!path) 678 return -ENOMEM; 679 680 key.objectid = 0; 681 key.type = BTRFS_QGROUP_INFO_KEY; 682 key.offset = qgroupid; 683 684 /* 685 * Avoid a transaction abort by catching -EEXIST here. In that 686 * case, we proceed by re-initializing the existing structure 687 * on disk. 
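	 *
	 * The info item is keyed (0, BTRFS_QGROUP_INFO_KEY, qgroupid), so a
	 * duplicate insert returns -EEXIST and we simply rewrite its fields
	 * below.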
688 */ 689 690 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 691 sizeof(*qgroup_info)); 692 if (ret && ret != -EEXIST) 693 goto out; 694 695 leaf = path->nodes[0]; 696 qgroup_info = btrfs_item_ptr(leaf, path->slots[0], 697 struct btrfs_qgroup_info_item); 698 btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid); 699 btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0); 700 btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0); 701 btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0); 702 btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0); 703 704 btrfs_mark_buffer_dirty(trans, leaf); 705 706 btrfs_release_path(path); 707 708 key.type = BTRFS_QGROUP_LIMIT_KEY; 709 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 710 sizeof(*qgroup_limit)); 711 if (ret && ret != -EEXIST) 712 goto out; 713 714 leaf = path->nodes[0]; 715 qgroup_limit = btrfs_item_ptr(leaf, path->slots[0], 716 struct btrfs_qgroup_limit_item); 717 btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0); 718 btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0); 719 btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0); 720 btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); 721 btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); 722 723 btrfs_mark_buffer_dirty(trans, leaf); 724 725 ret = 0; 726 out: 727 btrfs_free_path(path); 728 return ret; 729 } 730 731 static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid) 732 { 733 int ret; 734 struct btrfs_root *quota_root = trans->fs_info->quota_root; 735 struct btrfs_path *path; 736 struct btrfs_key key; 737 738 path = btrfs_alloc_path(); 739 if (!path) 740 return -ENOMEM; 741 742 key.objectid = 0; 743 key.type = BTRFS_QGROUP_INFO_KEY; 744 key.offset = qgroupid; 745 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); 746 if (ret < 0) 747 goto out; 748 749 if (ret > 0) { 750 ret = -ENOENT; 751 goto out; 752 } 753 754 ret = btrfs_del_item(trans, quota_root, path); 755 if (ret) 756 goto out; 757 758 btrfs_release_path(path); 759 760 key.type = BTRFS_QGROUP_LIMIT_KEY; 761 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); 762 if (ret < 0) 763 goto out; 764 765 if (ret > 0) { 766 ret = -ENOENT; 767 goto out; 768 } 769 770 ret = btrfs_del_item(trans, quota_root, path); 771 772 out: 773 btrfs_free_path(path); 774 return ret; 775 } 776 777 static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, 778 struct btrfs_qgroup *qgroup) 779 { 780 struct btrfs_root *quota_root = trans->fs_info->quota_root; 781 struct btrfs_path *path; 782 struct btrfs_key key; 783 struct extent_buffer *l; 784 struct btrfs_qgroup_limit_item *qgroup_limit; 785 int ret; 786 int slot; 787 788 key.objectid = 0; 789 key.type = BTRFS_QGROUP_LIMIT_KEY; 790 key.offset = qgroup->qgroupid; 791 792 path = btrfs_alloc_path(); 793 if (!path) 794 return -ENOMEM; 795 796 ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); 797 if (ret > 0) 798 ret = -ENOENT; 799 800 if (ret) 801 goto out; 802 803 l = path->nodes[0]; 804 slot = path->slots[0]; 805 qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); 806 btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags); 807 btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer); 808 btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl); 809 btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); 810 btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); 811 812 btrfs_mark_buffer_dirty(trans, l); 813 814 out: 815 
btrfs_free_path(path); 816 return ret; 817 } 818 819 static int update_qgroup_info_item(struct btrfs_trans_handle *trans, 820 struct btrfs_qgroup *qgroup) 821 { 822 struct btrfs_fs_info *fs_info = trans->fs_info; 823 struct btrfs_root *quota_root = fs_info->quota_root; 824 struct btrfs_path *path; 825 struct btrfs_key key; 826 struct extent_buffer *l; 827 struct btrfs_qgroup_info_item *qgroup_info; 828 int ret; 829 int slot; 830 831 if (btrfs_is_testing(fs_info)) 832 return 0; 833 834 key.objectid = 0; 835 key.type = BTRFS_QGROUP_INFO_KEY; 836 key.offset = qgroup->qgroupid; 837 838 path = btrfs_alloc_path(); 839 if (!path) 840 return -ENOMEM; 841 842 ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); 843 if (ret > 0) 844 ret = -ENOENT; 845 846 if (ret) 847 goto out; 848 849 l = path->nodes[0]; 850 slot = path->slots[0]; 851 qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item); 852 btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid); 853 btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer); 854 btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); 855 btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); 856 btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); 857 858 btrfs_mark_buffer_dirty(trans, l); 859 860 out: 861 btrfs_free_path(path); 862 return ret; 863 } 864 865 static int update_qgroup_status_item(struct btrfs_trans_handle *trans) 866 { 867 struct btrfs_fs_info *fs_info = trans->fs_info; 868 struct btrfs_root *quota_root = fs_info->quota_root; 869 struct btrfs_path *path; 870 struct btrfs_key key; 871 struct extent_buffer *l; 872 struct btrfs_qgroup_status_item *ptr; 873 int ret; 874 int slot; 875 876 key.objectid = 0; 877 key.type = BTRFS_QGROUP_STATUS_KEY; 878 key.offset = 0; 879 880 path = btrfs_alloc_path(); 881 if (!path) 882 return -ENOMEM; 883 884 ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); 885 if (ret > 0) 886 ret = -ENOENT; 887 888 if (ret) 889 goto out; 890 891 l = path->nodes[0]; 892 slot = path->slots[0]; 893 ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); 894 btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags & 895 BTRFS_QGROUP_STATUS_FLAGS_MASK); 896 btrfs_set_qgroup_status_generation(l, ptr, trans->transid); 897 btrfs_set_qgroup_status_rescan(l, ptr, 898 fs_info->qgroup_rescan_progress.objectid); 899 900 btrfs_mark_buffer_dirty(trans, l); 901 902 out: 903 btrfs_free_path(path); 904 return ret; 905 } 906 907 /* 908 * called with qgroup_lock held 909 */ 910 static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, 911 struct btrfs_root *root) 912 { 913 struct btrfs_path *path; 914 struct btrfs_key key; 915 struct extent_buffer *leaf = NULL; 916 int ret; 917 int nr = 0; 918 919 path = btrfs_alloc_path(); 920 if (!path) 921 return -ENOMEM; 922 923 key.objectid = 0; 924 key.offset = 0; 925 key.type = 0; 926 927 while (1) { 928 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 929 if (ret < 0) 930 goto out; 931 leaf = path->nodes[0]; 932 nr = btrfs_header_nritems(leaf); 933 if (!nr) 934 break; 935 /* 936 * delete the leaf one by one 937 * since the whole tree is going 938 * to be deleted. 
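		 *
		 * Deleting all nr items of the first leaf at once (from slot
		 * 0) empties that leaf; the next btrfs_search_slot() then
		 * starts over from the beginning of the now smaller tree.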
939 */ 940 path->slots[0] = 0; 941 ret = btrfs_del_items(trans, root, path, 0, nr); 942 if (ret) 943 goto out; 944 945 btrfs_release_path(path); 946 } 947 ret = 0; 948 out: 949 btrfs_free_path(path); 950 return ret; 951 } 952 953 int btrfs_quota_enable(struct btrfs_fs_info *fs_info) 954 { 955 struct btrfs_root *quota_root; 956 struct btrfs_root *tree_root = fs_info->tree_root; 957 struct btrfs_path *path = NULL; 958 struct btrfs_qgroup_status_item *ptr; 959 struct extent_buffer *leaf; 960 struct btrfs_key key; 961 struct btrfs_key found_key; 962 struct btrfs_qgroup *qgroup = NULL; 963 struct btrfs_trans_handle *trans = NULL; 964 struct ulist *ulist = NULL; 965 int ret = 0; 966 int slot; 967 968 /* 969 * We need to have subvol_sem write locked, to prevent races between 970 * concurrent tasks trying to enable quotas, because we will unlock 971 * and relock qgroup_ioctl_lock before setting fs_info->quota_root 972 * and before setting BTRFS_FS_QUOTA_ENABLED. 973 */ 974 lockdep_assert_held_write(&fs_info->subvol_sem); 975 976 if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 977 btrfs_err(fs_info, 978 "qgroups are currently unsupported in extent tree v2"); 979 return -EINVAL; 980 } 981 982 mutex_lock(&fs_info->qgroup_ioctl_lock); 983 if (fs_info->quota_root) 984 goto out; 985 986 ulist = ulist_alloc(GFP_KERNEL); 987 if (!ulist) { 988 ret = -ENOMEM; 989 goto out; 990 } 991 992 ret = btrfs_sysfs_add_qgroups(fs_info); 993 if (ret < 0) 994 goto out; 995 996 /* 997 * Unlock qgroup_ioctl_lock before starting the transaction. This is to 998 * avoid lock acquisition inversion problems (reported by lockdep) between 999 * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we 1000 * start a transaction. 1001 * After we started the transaction lock qgroup_ioctl_lock again and 1002 * check if someone else created the quota root in the meanwhile. If so, 1003 * just return success and release the transaction handle. 1004 * 1005 * Also we don't need to worry about someone else calling 1006 * btrfs_sysfs_add_qgroups() after we unlock and getting an error because 1007 * that function returns 0 (success) when the sysfs entries already exist. 1008 */ 1009 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1010 1011 /* 1012 * 1 for quota root item 1013 * 1 for BTRFS_QGROUP_STATUS item 1014 * 1015 * Yet we also need 2*n items for a QGROUP_INFO/QGROUP_LIMIT items 1016 * per subvolume. However those are not currently reserved since it 1017 * would be a lot of overkill. 
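	 *
	 * (Those two reserved items are what the "2" passed to
	 * btrfs_start_transaction() below accounts for.)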
1018 */ 1019 trans = btrfs_start_transaction(tree_root, 2); 1020 1021 mutex_lock(&fs_info->qgroup_ioctl_lock); 1022 if (IS_ERR(trans)) { 1023 ret = PTR_ERR(trans); 1024 trans = NULL; 1025 goto out; 1026 } 1027 1028 if (fs_info->quota_root) 1029 goto out; 1030 1031 fs_info->qgroup_ulist = ulist; 1032 ulist = NULL; 1033 1034 /* 1035 * initially create the quota tree 1036 */ 1037 quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID); 1038 if (IS_ERR(quota_root)) { 1039 ret = PTR_ERR(quota_root); 1040 btrfs_abort_transaction(trans, ret); 1041 goto out; 1042 } 1043 1044 path = btrfs_alloc_path(); 1045 if (!path) { 1046 ret = -ENOMEM; 1047 btrfs_abort_transaction(trans, ret); 1048 goto out_free_root; 1049 } 1050 1051 key.objectid = 0; 1052 key.type = BTRFS_QGROUP_STATUS_KEY; 1053 key.offset = 0; 1054 1055 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 1056 sizeof(*ptr)); 1057 if (ret) { 1058 btrfs_abort_transaction(trans, ret); 1059 goto out_free_path; 1060 } 1061 1062 leaf = path->nodes[0]; 1063 ptr = btrfs_item_ptr(leaf, path->slots[0], 1064 struct btrfs_qgroup_status_item); 1065 btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid); 1066 btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); 1067 fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON | 1068 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 1069 btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags & 1070 BTRFS_QGROUP_STATUS_FLAGS_MASK); 1071 btrfs_set_qgroup_status_rescan(leaf, ptr, 0); 1072 1073 btrfs_mark_buffer_dirty(trans, leaf); 1074 1075 key.objectid = 0; 1076 key.type = BTRFS_ROOT_REF_KEY; 1077 key.offset = 0; 1078 1079 btrfs_release_path(path); 1080 ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0); 1081 if (ret > 0) 1082 goto out_add_root; 1083 if (ret < 0) { 1084 btrfs_abort_transaction(trans, ret); 1085 goto out_free_path; 1086 } 1087 1088 while (1) { 1089 slot = path->slots[0]; 1090 leaf = path->nodes[0]; 1091 btrfs_item_key_to_cpu(leaf, &found_key, slot); 1092 1093 if (found_key.type == BTRFS_ROOT_REF_KEY) { 1094 1095 /* Release locks on tree_root before we access quota_root */ 1096 btrfs_release_path(path); 1097 1098 ret = add_qgroup_item(trans, quota_root, 1099 found_key.offset); 1100 if (ret) { 1101 btrfs_abort_transaction(trans, ret); 1102 goto out_free_path; 1103 } 1104 1105 qgroup = add_qgroup_rb(fs_info, found_key.offset); 1106 if (IS_ERR(qgroup)) { 1107 ret = PTR_ERR(qgroup); 1108 btrfs_abort_transaction(trans, ret); 1109 goto out_free_path; 1110 } 1111 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 1112 if (ret < 0) { 1113 btrfs_abort_transaction(trans, ret); 1114 goto out_free_path; 1115 } 1116 ret = btrfs_search_slot_for_read(tree_root, &found_key, 1117 path, 1, 0); 1118 if (ret < 0) { 1119 btrfs_abort_transaction(trans, ret); 1120 goto out_free_path; 1121 } 1122 if (ret > 0) { 1123 /* 1124 * Shouldn't happen, but in case it does we 1125 * don't need to do the btrfs_next_item, just 1126 * continue. 
1127 */ 1128 continue; 1129 } 1130 } 1131 ret = btrfs_next_item(tree_root, path); 1132 if (ret < 0) { 1133 btrfs_abort_transaction(trans, ret); 1134 goto out_free_path; 1135 } 1136 if (ret) 1137 break; 1138 } 1139 1140 out_add_root: 1141 btrfs_release_path(path); 1142 ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID); 1143 if (ret) { 1144 btrfs_abort_transaction(trans, ret); 1145 goto out_free_path; 1146 } 1147 1148 qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID); 1149 if (IS_ERR(qgroup)) { 1150 ret = PTR_ERR(qgroup); 1151 btrfs_abort_transaction(trans, ret); 1152 goto out_free_path; 1153 } 1154 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 1155 if (ret < 0) { 1156 btrfs_abort_transaction(trans, ret); 1157 goto out_free_path; 1158 } 1159 1160 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1161 /* 1162 * Commit the transaction while not holding qgroup_ioctl_lock, to avoid 1163 * a deadlock with tasks concurrently doing other qgroup operations, such 1164 * adding/removing qgroups or adding/deleting qgroup relations for example, 1165 * because all qgroup operations first start or join a transaction and then 1166 * lock the qgroup_ioctl_lock mutex. 1167 * We are safe from a concurrent task trying to enable quotas, by calling 1168 * this function, since we are serialized by fs_info->subvol_sem. 1169 */ 1170 ret = btrfs_commit_transaction(trans); 1171 trans = NULL; 1172 mutex_lock(&fs_info->qgroup_ioctl_lock); 1173 if (ret) 1174 goto out_free_path; 1175 1176 /* 1177 * Set quota enabled flag after committing the transaction, to avoid 1178 * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot 1179 * creation. 1180 */ 1181 spin_lock(&fs_info->qgroup_lock); 1182 fs_info->quota_root = quota_root; 1183 set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 1184 spin_unlock(&fs_info->qgroup_lock); 1185 1186 ret = qgroup_rescan_init(fs_info, 0, 1); 1187 if (!ret) { 1188 qgroup_rescan_zero_tracking(fs_info); 1189 fs_info->qgroup_rescan_running = true; 1190 btrfs_queue_work(fs_info->qgroup_rescan_workers, 1191 &fs_info->qgroup_rescan_work); 1192 } else { 1193 /* 1194 * We have set both BTRFS_FS_QUOTA_ENABLED and 1195 * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with 1196 * -EINPROGRESS. That can happen because someone started the 1197 * rescan worker by calling quota rescan ioctl before we 1198 * attempted to initialize the rescan worker. Failure due to 1199 * quotas disabled in the meanwhile is not possible, because 1200 * we are holding a write lock on fs_info->subvol_sem, which 1201 * is also acquired when disabling quotas. 1202 * Ignore such error, and any other error would need to undo 1203 * everything we did in the transaction we just committed. 
1204 */ 1205 ASSERT(ret == -EINPROGRESS); 1206 ret = 0; 1207 } 1208 1209 out_free_path: 1210 btrfs_free_path(path); 1211 out_free_root: 1212 if (ret) 1213 btrfs_put_root(quota_root); 1214 out: 1215 if (ret) { 1216 ulist_free(fs_info->qgroup_ulist); 1217 fs_info->qgroup_ulist = NULL; 1218 btrfs_sysfs_del_qgroups(fs_info); 1219 } 1220 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1221 if (ret && trans) 1222 btrfs_end_transaction(trans); 1223 else if (trans) 1224 ret = btrfs_end_transaction(trans); 1225 ulist_free(ulist); 1226 return ret; 1227 } 1228 1229 int btrfs_quota_disable(struct btrfs_fs_info *fs_info) 1230 { 1231 struct btrfs_root *quota_root = NULL; 1232 struct btrfs_trans_handle *trans = NULL; 1233 int ret = 0; 1234 1235 /* 1236 * We need to have subvol_sem write locked to prevent races with 1237 * snapshot creation. 1238 */ 1239 lockdep_assert_held_write(&fs_info->subvol_sem); 1240 1241 /* 1242 * Lock the cleaner mutex to prevent races with concurrent relocation, 1243 * because relocation may be building backrefs for blocks of the quota 1244 * root while we are deleting the root. This is like dropping fs roots 1245 * of deleted snapshots/subvolumes, we need the same protection. 1246 * 1247 * This also prevents races between concurrent tasks trying to disable 1248 * quotas, because we will unlock and relock qgroup_ioctl_lock across 1249 * BTRFS_FS_QUOTA_ENABLED changes. 1250 */ 1251 mutex_lock(&fs_info->cleaner_mutex); 1252 1253 mutex_lock(&fs_info->qgroup_ioctl_lock); 1254 if (!fs_info->quota_root) 1255 goto out; 1256 1257 /* 1258 * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to 1259 * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs 1260 * to lock that mutex while holding a transaction handle and the rescan 1261 * worker needs to commit a transaction. 1262 */ 1263 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1264 1265 /* 1266 * Request qgroup rescan worker to complete and wait for it. This wait 1267 * must be done before transaction start for quota disable since it may 1268 * deadlock with transaction by the qgroup rescan worker. 1269 */ 1270 clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 1271 btrfs_qgroup_wait_for_completion(fs_info, false); 1272 1273 /* 1274 * 1 For the root item 1275 * 1276 * We should also reserve enough items for the quota tree deletion in 1277 * btrfs_clean_quota_tree but this is not done. 1278 * 1279 * Also, we must always start a transaction without holding the mutex 1280 * qgroup_ioctl_lock, see btrfs_quota_enable(). 
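	 *
	 * (The root item is the single unit passed to the
	 * btrfs_start_transaction() call below.)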
	 */
	trans = btrfs_start_transaction(fs_info->tree_root, 1);

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
		goto out;
	}

	if (!fs_info->quota_root)
		goto out;

	spin_lock(&fs_info->qgroup_lock);
	quota_root = fs_info->quota_root;
	fs_info->quota_root = NULL;
	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
	fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
	spin_unlock(&fs_info->qgroup_lock);

	btrfs_free_qgroup_config(fs_info);

	ret = btrfs_clean_quota_tree(trans, quota_root);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	ret = btrfs_del_root(trans, &quota_root->root_key);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	spin_lock(&fs_info->trans_lock);
	list_del(&quota_root->dirty_list);
	spin_unlock(&fs_info->trans_lock);

	btrfs_tree_lock(quota_root->node);
	btrfs_clear_buffer_dirty(trans, quota_root->node);
	btrfs_tree_unlock(quota_root->node);
	ret = btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
				    quota_root->node, 0, 1);

	if (ret < 0)
		btrfs_abort_transaction(trans, ret);

out:
	btrfs_put_root(quota_root);
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	if (ret && trans)
		btrfs_end_transaction(trans);
	else if (trans)
		ret = btrfs_end_transaction(trans);
	mutex_unlock(&fs_info->cleaner_mutex);

	return ret;
}

static void qgroup_dirty(struct btrfs_fs_info *fs_info,
			 struct btrfs_qgroup *qgroup)
{
	if (list_empty(&qgroup->dirty))
		list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
}

static void qgroup_iterator_add(struct list_head *head, struct btrfs_qgroup *qgroup)
{
	if (!list_empty(&qgroup->iterator))
		return;

	list_add_tail(&qgroup->iterator, head);
}

static void qgroup_iterator_clean(struct list_head *head)
{
	while (!list_empty(head)) {
		struct btrfs_qgroup *qgroup;

		qgroup = list_first_entry(head, struct btrfs_qgroup, iterator);
		list_del_init(&qgroup->iterator);
	}
}

/*
 * The easy accounting case: we are updating a qgroup relationship whose child
 * qgroup only has exclusive extents.
 *
 * In this case all of the child's exclusive extents are also exclusive for the
 * parent, so excl/rfer just get added or removed.
 *
 * The same applies to the qgroup reservation space, which must be added to or
 * released from the parent as well; otherwise, once the child releases its
 * reservation, the parent would underflow its own reservation (in the
 * relation adding case).
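 *
 * For example, if child qgroup 0/257 has rfer == excl == 1 MiB, attaching it
 * to parent 1/100 (sign > 0) adds 1 MiB to the parent's rfer and excl and adds
 * the child's reservations to the parent; detaching (sign < 0) subtracts the
 * same amounts again.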
 *
 * Caller must hold fs_info->qgroup_lock.
 */
static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
				    struct ulist *tmp, u64 ref_root,
				    struct btrfs_qgroup *src, int sign)
{
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup_list *glist;
	struct ulist_node *unode;
	struct ulist_iterator uiter;
	u64 num_bytes = src->excl;
	int ret = 0;

	qgroup = find_qgroup_rb(fs_info, ref_root);
	if (!qgroup)
		goto out;

	qgroup->rfer += sign * num_bytes;
	qgroup->rfer_cmpr += sign * num_bytes;

	WARN_ON(sign < 0 && qgroup->excl < num_bytes);
	qgroup->excl += sign * num_bytes;
	qgroup->excl_cmpr += sign * num_bytes;

	if (sign > 0)
		qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
	else
		qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);

	qgroup_dirty(fs_info, qgroup);

	/* Get all of the parent groups that contain this qgroup */
	list_for_each_entry(glist, &qgroup->groups, next_group) {
		ret = ulist_add(tmp, glist->group->qgroupid,
				qgroup_to_aux(glist->group), GFP_ATOMIC);
		if (ret < 0)
			goto out;
	}

	/* Iterate all of the parents and adjust their reference counts */
	ULIST_ITER_INIT(&uiter);
	while ((unode = ulist_next(tmp, &uiter))) {
		qgroup = unode_aux_to_qgroup(unode);
		qgroup->rfer += sign * num_bytes;
		qgroup->rfer_cmpr += sign * num_bytes;
		WARN_ON(sign < 0 && qgroup->excl < num_bytes);
		qgroup->excl += sign * num_bytes;
		if (sign > 0)
			qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
		else
			qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
		qgroup->excl_cmpr += sign * num_bytes;
		qgroup_dirty(fs_info, qgroup);

		/* Add any parents of the parents */
		list_for_each_entry(glist, &qgroup->groups, next_group) {
			ret = ulist_add(tmp, glist->group->qgroupid,
					qgroup_to_aux(glist->group), GFP_ATOMIC);
			if (ret < 0)
				goto out;
		}
	}
	ret = 0;
out:
	return ret;
}

/*
 * Quick path for updating a qgroup that has only exclusive refs.
 *
 * In that case updating all parents is enough; otherwise a full rescan is
 * needed.
 * Caller must also hold fs_info->qgroup_lock.
 *
 * Return 0 if the quick update was done.
 * Return >0 if a full rescan is needed; the INCONSISTENT flag gets set.
 * Return <0 for other errors.
 */
static int quick_update_accounting(struct btrfs_fs_info *fs_info,
				   struct ulist *tmp, u64 src, u64 dst,
				   int sign)
{
	struct btrfs_qgroup *qgroup;
	int ret = 1;
	int err = 0;

	qgroup = find_qgroup_rb(fs_info, src);
	if (!qgroup)
		goto out;
	if (qgroup->excl == qgroup->rfer) {
		ret = 0;
		err = __qgroup_excl_accounting(fs_info, tmp, dst,
					       qgroup, sign);
		if (err < 0) {
			ret = err;
			goto out;
		}
	}
out:
	if (ret)
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	return ret;
}

int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
			      u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup *member;
	struct btrfs_qgroup_list *list;
	struct ulist *tmp;
	unsigned int nofs_flag;
	int ret = 0;

	/* Check the level of src and dst first */
	if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
		return -EINVAL;

	/* We hold a transaction handle open, must do a NOFS allocation.
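	 * memalloc_nofs_save() makes the GFP_KERNEL ulist allocation below
	 * behave as GFP_NOFS, so memory reclaim cannot re-enter the
	 * filesystem while the transaction handle is held.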
*/ 1499 nofs_flag = memalloc_nofs_save(); 1500 tmp = ulist_alloc(GFP_KERNEL); 1501 memalloc_nofs_restore(nofs_flag); 1502 if (!tmp) 1503 return -ENOMEM; 1504 1505 mutex_lock(&fs_info->qgroup_ioctl_lock); 1506 if (!fs_info->quota_root) { 1507 ret = -ENOTCONN; 1508 goto out; 1509 } 1510 member = find_qgroup_rb(fs_info, src); 1511 parent = find_qgroup_rb(fs_info, dst); 1512 if (!member || !parent) { 1513 ret = -EINVAL; 1514 goto out; 1515 } 1516 1517 /* check if such qgroup relation exist firstly */ 1518 list_for_each_entry(list, &member->groups, next_group) { 1519 if (list->group == parent) { 1520 ret = -EEXIST; 1521 goto out; 1522 } 1523 } 1524 1525 ret = add_qgroup_relation_item(trans, src, dst); 1526 if (ret) 1527 goto out; 1528 1529 ret = add_qgroup_relation_item(trans, dst, src); 1530 if (ret) { 1531 del_qgroup_relation_item(trans, src, dst); 1532 goto out; 1533 } 1534 1535 spin_lock(&fs_info->qgroup_lock); 1536 ret = __add_relation_rb(member, parent); 1537 if (ret < 0) { 1538 spin_unlock(&fs_info->qgroup_lock); 1539 goto out; 1540 } 1541 ret = quick_update_accounting(fs_info, tmp, src, dst, 1); 1542 spin_unlock(&fs_info->qgroup_lock); 1543 out: 1544 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1545 ulist_free(tmp); 1546 return ret; 1547 } 1548 1549 static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, 1550 u64 dst) 1551 { 1552 struct btrfs_fs_info *fs_info = trans->fs_info; 1553 struct btrfs_qgroup *parent; 1554 struct btrfs_qgroup *member; 1555 struct btrfs_qgroup_list *list; 1556 struct ulist *tmp; 1557 bool found = false; 1558 unsigned int nofs_flag; 1559 int ret = 0; 1560 int ret2; 1561 1562 /* We hold a transaction handle open, must do a NOFS allocation. */ 1563 nofs_flag = memalloc_nofs_save(); 1564 tmp = ulist_alloc(GFP_KERNEL); 1565 memalloc_nofs_restore(nofs_flag); 1566 if (!tmp) 1567 return -ENOMEM; 1568 1569 if (!fs_info->quota_root) { 1570 ret = -ENOTCONN; 1571 goto out; 1572 } 1573 1574 member = find_qgroup_rb(fs_info, src); 1575 parent = find_qgroup_rb(fs_info, dst); 1576 /* 1577 * The parent/member pair doesn't exist, then try to delete the dead 1578 * relation items only. 
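	 *
	 * Both the (src, dst) and the mirrored (dst, src) relation items are
	 * deleted below, and -ENOENT from either deletion is tolerated.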
1579 */ 1580 if (!member || !parent) 1581 goto delete_item; 1582 1583 /* check if such qgroup relation exist firstly */ 1584 list_for_each_entry(list, &member->groups, next_group) { 1585 if (list->group == parent) { 1586 found = true; 1587 break; 1588 } 1589 } 1590 1591 delete_item: 1592 ret = del_qgroup_relation_item(trans, src, dst); 1593 if (ret < 0 && ret != -ENOENT) 1594 goto out; 1595 ret2 = del_qgroup_relation_item(trans, dst, src); 1596 if (ret2 < 0 && ret2 != -ENOENT) 1597 goto out; 1598 1599 /* At least one deletion succeeded, return 0 */ 1600 if (!ret || !ret2) 1601 ret = 0; 1602 1603 if (found) { 1604 spin_lock(&fs_info->qgroup_lock); 1605 del_relation_rb(fs_info, src, dst); 1606 ret = quick_update_accounting(fs_info, tmp, src, dst, -1); 1607 spin_unlock(&fs_info->qgroup_lock); 1608 } 1609 out: 1610 ulist_free(tmp); 1611 return ret; 1612 } 1613 1614 int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, 1615 u64 dst) 1616 { 1617 struct btrfs_fs_info *fs_info = trans->fs_info; 1618 int ret = 0; 1619 1620 mutex_lock(&fs_info->qgroup_ioctl_lock); 1621 ret = __del_qgroup_relation(trans, src, dst); 1622 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1623 1624 return ret; 1625 } 1626 1627 int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) 1628 { 1629 struct btrfs_fs_info *fs_info = trans->fs_info; 1630 struct btrfs_root *quota_root; 1631 struct btrfs_qgroup *qgroup; 1632 int ret = 0; 1633 1634 mutex_lock(&fs_info->qgroup_ioctl_lock); 1635 if (!fs_info->quota_root) { 1636 ret = -ENOTCONN; 1637 goto out; 1638 } 1639 quota_root = fs_info->quota_root; 1640 qgroup = find_qgroup_rb(fs_info, qgroupid); 1641 if (qgroup) { 1642 ret = -EEXIST; 1643 goto out; 1644 } 1645 1646 ret = add_qgroup_item(trans, quota_root, qgroupid); 1647 if (ret) 1648 goto out; 1649 1650 spin_lock(&fs_info->qgroup_lock); 1651 qgroup = add_qgroup_rb(fs_info, qgroupid); 1652 spin_unlock(&fs_info->qgroup_lock); 1653 1654 if (IS_ERR(qgroup)) { 1655 ret = PTR_ERR(qgroup); 1656 goto out; 1657 } 1658 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 1659 out: 1660 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1661 return ret; 1662 } 1663 1664 static bool qgroup_has_usage(struct btrfs_qgroup *qgroup) 1665 { 1666 return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 || 1667 qgroup->excl > 0 || qgroup->excl_cmpr > 0 || 1668 qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 || 1669 qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 || 1670 qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0); 1671 } 1672 1673 int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) 1674 { 1675 struct btrfs_fs_info *fs_info = trans->fs_info; 1676 struct btrfs_qgroup *qgroup; 1677 struct btrfs_qgroup_list *list; 1678 int ret = 0; 1679 1680 mutex_lock(&fs_info->qgroup_ioctl_lock); 1681 if (!fs_info->quota_root) { 1682 ret = -ENOTCONN; 1683 goto out; 1684 } 1685 1686 qgroup = find_qgroup_rb(fs_info, qgroupid); 1687 if (!qgroup) { 1688 ret = -ENOENT; 1689 goto out; 1690 } 1691 1692 if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) { 1693 ret = -EBUSY; 1694 goto out; 1695 } 1696 1697 /* Check if there are no children of this qgroup */ 1698 if (!list_empty(&qgroup->members)) { 1699 ret = -EBUSY; 1700 goto out; 1701 } 1702 1703 ret = del_qgroup_item(trans, qgroupid); 1704 if (ret && ret != -ENOENT) 1705 goto out; 1706 1707 while (!list_empty(&qgroup->groups)) { 1708 list = list_first_entry(&qgroup->groups, 1709 struct btrfs_qgroup_list, next_group); 1710 ret = __del_qgroup_relation(trans, qgroupid, 1711 
list->group->qgroupid); 1712 if (ret) 1713 goto out; 1714 } 1715 1716 spin_lock(&fs_info->qgroup_lock); 1717 del_qgroup_rb(fs_info, qgroupid); 1718 spin_unlock(&fs_info->qgroup_lock); 1719 1720 /* 1721 * Remove the qgroup from sysfs now without holding the qgroup_lock 1722 * spinlock, since the sysfs_remove_group() function needs to take 1723 * the mutex kernfs_mutex through kernfs_remove_by_name_ns(). 1724 */ 1725 btrfs_sysfs_del_one_qgroup(fs_info, qgroup); 1726 kfree(qgroup); 1727 out: 1728 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1729 return ret; 1730 } 1731 1732 int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, 1733 struct btrfs_qgroup_limit *limit) 1734 { 1735 struct btrfs_fs_info *fs_info = trans->fs_info; 1736 struct btrfs_qgroup *qgroup; 1737 int ret = 0; 1738 /* Sometimes we would want to clear the limit on this qgroup. 1739 * To meet this requirement, we treat the -1 as a special value 1740 * which tell kernel to clear the limit on this qgroup. 1741 */ 1742 const u64 CLEAR_VALUE = -1; 1743 1744 mutex_lock(&fs_info->qgroup_ioctl_lock); 1745 if (!fs_info->quota_root) { 1746 ret = -ENOTCONN; 1747 goto out; 1748 } 1749 1750 qgroup = find_qgroup_rb(fs_info, qgroupid); 1751 if (!qgroup) { 1752 ret = -ENOENT; 1753 goto out; 1754 } 1755 1756 spin_lock(&fs_info->qgroup_lock); 1757 if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) { 1758 if (limit->max_rfer == CLEAR_VALUE) { 1759 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; 1760 limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; 1761 qgroup->max_rfer = 0; 1762 } else { 1763 qgroup->max_rfer = limit->max_rfer; 1764 } 1765 } 1766 if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { 1767 if (limit->max_excl == CLEAR_VALUE) { 1768 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; 1769 limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; 1770 qgroup->max_excl = 0; 1771 } else { 1772 qgroup->max_excl = limit->max_excl; 1773 } 1774 } 1775 if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) { 1776 if (limit->rsv_rfer == CLEAR_VALUE) { 1777 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; 1778 limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; 1779 qgroup->rsv_rfer = 0; 1780 } else { 1781 qgroup->rsv_rfer = limit->rsv_rfer; 1782 } 1783 } 1784 if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) { 1785 if (limit->rsv_excl == CLEAR_VALUE) { 1786 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; 1787 limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; 1788 qgroup->rsv_excl = 0; 1789 } else { 1790 qgroup->rsv_excl = limit->rsv_excl; 1791 } 1792 } 1793 qgroup->lim_flags |= limit->flags; 1794 1795 spin_unlock(&fs_info->qgroup_lock); 1796 1797 ret = update_qgroup_limit_item(trans, qgroup); 1798 if (ret) { 1799 qgroup_mark_inconsistent(fs_info); 1800 btrfs_info(fs_info, "unable to update quota limit for %llu", 1801 qgroupid); 1802 } 1803 1804 out: 1805 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1806 return ret; 1807 } 1808 1809 int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, 1810 struct btrfs_delayed_ref_root *delayed_refs, 1811 struct btrfs_qgroup_extent_record *record) 1812 { 1813 struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; 1814 struct rb_node *parent_node = NULL; 1815 struct btrfs_qgroup_extent_record *entry; 1816 u64 bytenr = record->bytenr; 1817 1818 lockdep_assert_held(&delayed_refs->lock); 1819 trace_btrfs_qgroup_trace_extent(fs_info, record); 1820 1821 while (*p) { 1822 parent_node = *p; 1823 entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, 1824 node); 1825 if (bytenr < entry->bytenr) { 1826 p = 
&(*p)->rb_left; 1827 } else if (bytenr > entry->bytenr) { 1828 p = &(*p)->rb_right; 1829 } else { 1830 if (record->data_rsv && !entry->data_rsv) { 1831 entry->data_rsv = record->data_rsv; 1832 entry->data_rsv_refroot = 1833 record->data_rsv_refroot; 1834 } 1835 return 1; 1836 } 1837 } 1838 1839 rb_link_node(&record->node, parent_node, p); 1840 rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); 1841 return 0; 1842 } 1843 1844 int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, 1845 struct btrfs_qgroup_extent_record *qrecord) 1846 { 1847 struct btrfs_backref_walk_ctx ctx = { 0 }; 1848 int ret; 1849 1850 /* 1851 * We are always called in a context where we are already holding a 1852 * transaction handle. Often we are called when adding a data delayed 1853 * reference from btrfs_truncate_inode_items() (truncating or unlinking), 1854 * in which case we will be holding a write lock on extent buffer from a 1855 * subvolume tree. In this case we can't allow btrfs_find_all_roots() to 1856 * acquire fs_info->commit_root_sem, because that is a higher level lock 1857 * that must be acquired before locking any extent buffers. 1858 * 1859 * So we want btrfs_find_all_roots() to not acquire the commit_root_sem 1860 * but we can't pass it a non-NULL transaction handle, because otherwise 1861 * it would not use commit roots and would lock extent buffers, causing 1862 * a deadlock if it ends up trying to read lock the same extent buffer 1863 * that was previously write locked at btrfs_truncate_inode_items(). 1864 * 1865 * So pass a NULL transaction handle to btrfs_find_all_roots() and 1866 * explicitly tell it to not acquire the commit_root_sem - if we are 1867 * holding a transaction handle we don't need its protection. 1868 */ 1869 ASSERT(trans != NULL); 1870 1871 if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) 1872 return 0; 1873 1874 ctx.bytenr = qrecord->bytenr; 1875 ctx.fs_info = trans->fs_info; 1876 1877 ret = btrfs_find_all_roots(&ctx, true); 1878 if (ret < 0) { 1879 qgroup_mark_inconsistent(trans->fs_info); 1880 btrfs_warn(trans->fs_info, 1881 "error accounting new delayed refs extent (err code: %d), quota inconsistent", 1882 ret); 1883 return 0; 1884 } 1885 1886 /* 1887 * Here we don't need to get the lock of 1888 * trans->transaction->delayed_refs, since inserted qrecord won't 1889 * be deleted, only qrecord->node may be modified (new qrecord insert) 1890 * 1891 * So modifying qrecord->old_roots is safe here 1892 */ 1893 qrecord->old_roots = ctx.roots; 1894 return 0; 1895 } 1896 1897 int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, 1898 u64 num_bytes) 1899 { 1900 struct btrfs_fs_info *fs_info = trans->fs_info; 1901 struct btrfs_qgroup_extent_record *record; 1902 struct btrfs_delayed_ref_root *delayed_refs; 1903 int ret; 1904 1905 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) 1906 || bytenr == 0 || num_bytes == 0) 1907 return 0; 1908 record = kzalloc(sizeof(*record), GFP_NOFS); 1909 if (!record) 1910 return -ENOMEM; 1911 1912 delayed_refs = &trans->transaction->delayed_refs; 1913 record->bytenr = bytenr; 1914 record->num_bytes = num_bytes; 1915 record->old_roots = NULL; 1916 1917 spin_lock(&delayed_refs->lock); 1918 ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record); 1919 spin_unlock(&delayed_refs->lock); 1920 if (ret > 0) { 1921 kfree(record); 1922 return 0; 1923 } 1924 return btrfs_qgroup_trace_extent_post(trans, record); 1925 } 1926 1927 int btrfs_qgroup_trace_leaf_items(struct 
btrfs_trans_handle *trans, 1928 struct extent_buffer *eb) 1929 { 1930 struct btrfs_fs_info *fs_info = trans->fs_info; 1931 int nr = btrfs_header_nritems(eb); 1932 int i, extent_type, ret; 1933 struct btrfs_key key; 1934 struct btrfs_file_extent_item *fi; 1935 u64 bytenr, num_bytes; 1936 1937 /* We can be called directly from walk_up_proc() */ 1938 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 1939 return 0; 1940 1941 for (i = 0; i < nr; i++) { 1942 btrfs_item_key_to_cpu(eb, &key, i); 1943 1944 if (key.type != BTRFS_EXTENT_DATA_KEY) 1945 continue; 1946 1947 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 1948 /* filter out non qgroup-accountable extents */ 1949 extent_type = btrfs_file_extent_type(eb, fi); 1950 1951 if (extent_type == BTRFS_FILE_EXTENT_INLINE) 1952 continue; 1953 1954 bytenr = btrfs_file_extent_disk_bytenr(eb, fi); 1955 if (!bytenr) 1956 continue; 1957 1958 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); 1959 1960 ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes); 1961 if (ret) 1962 return ret; 1963 } 1964 cond_resched(); 1965 return 0; 1966 } 1967 1968 /* 1969 * Walk up the tree from the bottom, freeing leaves and any interior 1970 * nodes which have had all slots visited. If a node (leaf or 1971 * interior) is freed, the node above it will have it's slot 1972 * incremented. The root node will never be freed. 1973 * 1974 * At the end of this function, we should have a path which has all 1975 * slots incremented to the next position for a search. If we need to 1976 * read a new node it will be NULL and the node above it will have the 1977 * correct slot selected for a later read. 1978 * 1979 * If we increment the root nodes slot counter past the number of 1980 * elements, 1 is returned to signal completion of the search. 1981 */ 1982 static int adjust_slots_upwards(struct btrfs_path *path, int root_level) 1983 { 1984 int level = 0; 1985 int nr, slot; 1986 struct extent_buffer *eb; 1987 1988 if (root_level == 0) 1989 return 1; 1990 1991 while (level <= root_level) { 1992 eb = path->nodes[level]; 1993 nr = btrfs_header_nritems(eb); 1994 path->slots[level]++; 1995 slot = path->slots[level]; 1996 if (slot >= nr || level == 0) { 1997 /* 1998 * Don't free the root - we will detect this 1999 * condition after our loop and return a 2000 * positive value for caller to stop walking the tree. 2001 */ 2002 if (level != root_level) { 2003 btrfs_tree_unlock_rw(eb, path->locks[level]); 2004 path->locks[level] = 0; 2005 2006 free_extent_buffer(eb); 2007 path->nodes[level] = NULL; 2008 path->slots[level] = 0; 2009 } 2010 } else { 2011 /* 2012 * We have a valid slot to walk back down 2013 * from. Stop here so caller can process these 2014 * new nodes. 2015 */ 2016 break; 2017 } 2018 2019 level++; 2020 } 2021 2022 eb = path->nodes[root_level]; 2023 if (path->slots[root_level] >= btrfs_header_nritems(eb)) 2024 return 1; 2025 2026 return 0; 2027 } 2028 2029 /* 2030 * Helper function to trace a subtree tree block swap. 2031 * 2032 * The swap will happen in highest tree block, but there may be a lot of 2033 * tree blocks involved. 
2034 * 2035 * For example: 2036 * OO = Old tree blocks 2037 * NN = New tree blocks allocated during balance 2038 * 2039 * File tree (257) Reloc tree for 257 2040 * L2 OO NN 2041 * / \ / \ 2042 * L1 OO OO (a) OO NN (a) 2043 * / \ / \ / \ / \ 2044 * L0 OO OO OO OO OO OO NN NN 2045 * (b) (c) (b) (c) 2046 * 2047 * When calling qgroup_trace_extent_swap(), we will pass: 2048 * @src_eb = OO(a) 2049 * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ] 2050 * @dst_level = 0 2051 * @root_level = 1 2052 * 2053 * In that case, qgroup_trace_extent_swap() will search from OO(a) to 2054 * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty. 2055 * 2056 * The main work of qgroup_trace_extent_swap() can be split into 3 parts: 2057 * 2058 * 1) Tree search from @src_eb 2059 * It should act as a simplified btrfs_search_slot(). 2060 * The key for the search can be extracted from @dst_path->nodes[dst_level] 2061 * (first key). 2062 * 2063 * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty 2064 * NOTE: In the above case, OO(a) and NN(a) won't be marked qgroup dirty. 2065 * They should be marked during the previous (@dst_level = 1) iteration. 2066 * 2067 * 3) Mark file extents in leaves dirty 2068 * We don't have a good way to pick out only the new file extents. 2069 * So we still follow the old method of scanning all file extents in 2070 * the leaf. 2071 * 2072 * This function frees us from keeping two paths, thus later we only need 2073 * to care about how to iterate all new tree blocks in the reloc tree. 2074 */ 2075 static int qgroup_trace_extent_swap(struct btrfs_trans_handle *trans, 2076 struct extent_buffer *src_eb, 2077 struct btrfs_path *dst_path, 2078 int dst_level, int root_level, 2079 bool trace_leaf) 2080 { 2081 struct btrfs_key key; 2082 struct btrfs_path *src_path; 2083 struct btrfs_fs_info *fs_info = trans->fs_info; 2084 u32 nodesize = fs_info->nodesize; 2085 int cur_level = root_level; 2086 int ret; 2087 2088 BUG_ON(dst_level > root_level); 2089 /* Level mismatch */ 2090 if (btrfs_header_level(src_eb) != root_level) 2091 return -EINVAL; 2092 2093 src_path = btrfs_alloc_path(); 2094 if (!src_path) { 2095 ret = -ENOMEM; 2096 goto out; 2097 } 2098 2099 if (dst_level) 2100 btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0); 2101 else 2102 btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0); 2103 2104 /* For src_path */ 2105 atomic_inc(&src_eb->refs); 2106 src_path->nodes[root_level] = src_eb; 2107 src_path->slots[root_level] = dst_path->slots[root_level]; 2108 src_path->locks[root_level] = 0; 2109 2110 /* A simplified version of btrfs_search_slot() */ 2111 while (cur_level >= dst_level) { 2112 struct btrfs_key src_key; 2113 struct btrfs_key dst_key; 2114 2115 if (src_path->nodes[cur_level] == NULL) { 2116 struct extent_buffer *eb; 2117 int parent_slot; 2118 2119 eb = src_path->nodes[cur_level + 1]; 2120 parent_slot = src_path->slots[cur_level + 1]; 2121 2122 eb = btrfs_read_node_slot(eb, parent_slot); 2123 if (IS_ERR(eb)) { 2124 ret = PTR_ERR(eb); 2125 goto out; 2126 } 2127 2128 src_path->nodes[cur_level] = eb; 2129 2130 btrfs_tree_read_lock(eb); 2131 src_path->locks[cur_level] = BTRFS_READ_LOCK; 2132 } 2133 2134 src_path->slots[cur_level] = dst_path->slots[cur_level]; 2135 if (cur_level) { 2136 btrfs_node_key_to_cpu(dst_path->nodes[cur_level], 2137 &dst_key, dst_path->slots[cur_level]); 2138 btrfs_node_key_to_cpu(src_path->nodes[cur_level], 2139 &src_key, src_path->slots[cur_level]); 2140 } else { 2141 btrfs_item_key_to_cpu(dst_path->nodes[cur_level], 2142 &dst_key,
dst_path->slots[cur_level]); 2143 btrfs_item_key_to_cpu(src_path->nodes[cur_level], 2144 &src_key, src_path->slots[cur_level]); 2145 } 2146 /* Content mismatch, something went wrong */ 2147 if (btrfs_comp_cpu_keys(&dst_key, &src_key)) { 2148 ret = -ENOENT; 2149 goto out; 2150 } 2151 cur_level--; 2152 } 2153 2154 /* 2155 * Now both @dst_path and @src_path have been populated, record the tree 2156 * blocks for qgroup accounting. 2157 */ 2158 ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start, 2159 nodesize); 2160 if (ret < 0) 2161 goto out; 2162 ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start, 2163 nodesize); 2164 if (ret < 0) 2165 goto out; 2166 2167 /* Record leaf file extents */ 2168 if (dst_level == 0 && trace_leaf) { 2169 ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]); 2170 if (ret < 0) 2171 goto out; 2172 ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]); 2173 } 2174 out: 2175 btrfs_free_path(src_path); 2176 return ret; 2177 } 2178 2179 /* 2180 * Helper function to do recursive generation-aware depth-first search, to 2181 * locate all new tree blocks in a subtree of reloc tree. 2182 * 2183 * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot) 2184 * reloc tree 2185 * L2 NN (a) 2186 * / \ 2187 * L1 OO NN (b) 2188 * / \ / \ 2189 * L0 OO OO OO NN 2190 * (c) (d) 2191 * If we pass: 2192 * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ], 2193 * @cur_level = 1 2194 * @root_level = 1 2195 * 2196 * We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace 2197 * above tree blocks along with their counter parts in file tree. 2198 * While during search, old tree blocks OO(c) will be skipped as tree block swap 2199 * won't affect OO(c). 2200 */ 2201 static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, 2202 struct extent_buffer *src_eb, 2203 struct btrfs_path *dst_path, 2204 int cur_level, int root_level, 2205 u64 last_snapshot, bool trace_leaf) 2206 { 2207 struct btrfs_fs_info *fs_info = trans->fs_info; 2208 struct extent_buffer *eb; 2209 bool need_cleanup = false; 2210 int ret = 0; 2211 int i; 2212 2213 /* Level sanity check */ 2214 if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 || 2215 root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 || 2216 root_level < cur_level) { 2217 btrfs_err_rl(fs_info, 2218 "%s: bad levels, cur_level=%d root_level=%d", 2219 __func__, cur_level, root_level); 2220 return -EUCLEAN; 2221 } 2222 2223 /* Read the tree block if needed */ 2224 if (dst_path->nodes[cur_level] == NULL) { 2225 int parent_slot; 2226 u64 child_gen; 2227 2228 /* 2229 * dst_path->nodes[root_level] must be initialized before 2230 * calling this function. 2231 */ 2232 if (cur_level == root_level) { 2233 btrfs_err_rl(fs_info, 2234 "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d", 2235 __func__, root_level, root_level, cur_level); 2236 return -EUCLEAN; 2237 } 2238 2239 /* 2240 * We need to get child blockptr/gen from parent before we can 2241 * read it. 
2242 */ 2243 eb = dst_path->nodes[cur_level + 1]; 2244 parent_slot = dst_path->slots[cur_level + 1]; 2245 child_gen = btrfs_node_ptr_generation(eb, parent_slot); 2246 2247 /* This node is old, no need to trace */ 2248 if (child_gen < last_snapshot) 2249 goto out; 2250 2251 eb = btrfs_read_node_slot(eb, parent_slot); 2252 if (IS_ERR(eb)) { 2253 ret = PTR_ERR(eb); 2254 goto out; 2255 } 2256 2257 dst_path->nodes[cur_level] = eb; 2258 dst_path->slots[cur_level] = 0; 2259 2260 btrfs_tree_read_lock(eb); 2261 dst_path->locks[cur_level] = BTRFS_READ_LOCK; 2262 need_cleanup = true; 2263 } 2264 2265 /* Now record this tree block and its counter part for qgroups */ 2266 ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level, 2267 root_level, trace_leaf); 2268 if (ret < 0) 2269 goto cleanup; 2270 2271 eb = dst_path->nodes[cur_level]; 2272 2273 if (cur_level > 0) { 2274 /* Iterate all child tree blocks */ 2275 for (i = 0; i < btrfs_header_nritems(eb); i++) { 2276 /* Skip old tree blocks as they won't be swapped */ 2277 if (btrfs_node_ptr_generation(eb, i) < last_snapshot) 2278 continue; 2279 dst_path->slots[cur_level] = i; 2280 2281 /* Recursive call (at most 7 times) */ 2282 ret = qgroup_trace_new_subtree_blocks(trans, src_eb, 2283 dst_path, cur_level - 1, root_level, 2284 last_snapshot, trace_leaf); 2285 if (ret < 0) 2286 goto cleanup; 2287 } 2288 } 2289 2290 cleanup: 2291 if (need_cleanup) { 2292 /* Clean up */ 2293 btrfs_tree_unlock_rw(dst_path->nodes[cur_level], 2294 dst_path->locks[cur_level]); 2295 free_extent_buffer(dst_path->nodes[cur_level]); 2296 dst_path->nodes[cur_level] = NULL; 2297 dst_path->slots[cur_level] = 0; 2298 dst_path->locks[cur_level] = 0; 2299 } 2300 out: 2301 return ret; 2302 } 2303 2304 static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, 2305 struct extent_buffer *src_eb, 2306 struct extent_buffer *dst_eb, 2307 u64 last_snapshot, bool trace_leaf) 2308 { 2309 struct btrfs_fs_info *fs_info = trans->fs_info; 2310 struct btrfs_path *dst_path = NULL; 2311 int level; 2312 int ret; 2313 2314 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2315 return 0; 2316 2317 /* Wrong parameter order */ 2318 if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) { 2319 btrfs_err_rl(fs_info, 2320 "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__, 2321 btrfs_header_generation(src_eb), 2322 btrfs_header_generation(dst_eb)); 2323 return -EUCLEAN; 2324 } 2325 2326 if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) { 2327 ret = -EIO; 2328 goto out; 2329 } 2330 2331 level = btrfs_header_level(dst_eb); 2332 dst_path = btrfs_alloc_path(); 2333 if (!dst_path) { 2334 ret = -ENOMEM; 2335 goto out; 2336 } 2337 /* For dst_path */ 2338 atomic_inc(&dst_eb->refs); 2339 dst_path->nodes[level] = dst_eb; 2340 dst_path->slots[level] = 0; 2341 dst_path->locks[level] = 0; 2342 2343 /* Do the generation aware breadth-first search */ 2344 ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level, 2345 level, last_snapshot, trace_leaf); 2346 if (ret < 0) 2347 goto out; 2348 ret = 0; 2349 2350 out: 2351 btrfs_free_path(dst_path); 2352 if (ret < 0) 2353 qgroup_mark_inconsistent(fs_info); 2354 return ret; 2355 } 2356 2357 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, 2358 struct extent_buffer *root_eb, 2359 u64 root_gen, int root_level) 2360 { 2361 struct btrfs_fs_info *fs_info = trans->fs_info; 2362 int ret = 0; 2363 int level; 2364 u8 drop_subptree_thres; 2365 struct extent_buffer *eb = root_eb; 2366 
struct btrfs_path *path = NULL; 2367 2368 BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL); 2369 BUG_ON(root_eb == NULL); 2370 2371 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2372 return 0; 2373 2374 spin_lock(&fs_info->qgroup_lock); 2375 drop_subptree_thres = fs_info->qgroup_drop_subtree_thres; 2376 spin_unlock(&fs_info->qgroup_lock); 2377 2378 /* 2379 * This function only gets called for snapshot drop, if we hit a high 2380 * node here, it means we are going to change ownership for quite a lot 2381 * of extents, which will greatly slow down btrfs_commit_transaction(). 2382 * 2383 * So here if we find a high tree here, we just skip the accounting and 2384 * mark qgroup inconsistent. 2385 */ 2386 if (root_level >= drop_subptree_thres) { 2387 qgroup_mark_inconsistent(fs_info); 2388 return 0; 2389 } 2390 2391 if (!extent_buffer_uptodate(root_eb)) { 2392 struct btrfs_tree_parent_check check = { 2393 .has_first_key = false, 2394 .transid = root_gen, 2395 .level = root_level 2396 }; 2397 2398 ret = btrfs_read_extent_buffer(root_eb, &check); 2399 if (ret) 2400 goto out; 2401 } 2402 2403 if (root_level == 0) { 2404 ret = btrfs_qgroup_trace_leaf_items(trans, root_eb); 2405 goto out; 2406 } 2407 2408 path = btrfs_alloc_path(); 2409 if (!path) 2410 return -ENOMEM; 2411 2412 /* 2413 * Walk down the tree. Missing extent blocks are filled in as 2414 * we go. Metadata is accounted every time we read a new 2415 * extent block. 2416 * 2417 * When we reach a leaf, we account for file extent items in it, 2418 * walk back up the tree (adjusting slot pointers as we go) 2419 * and restart the search process. 2420 */ 2421 atomic_inc(&root_eb->refs); /* For path */ 2422 path->nodes[root_level] = root_eb; 2423 path->slots[root_level] = 0; 2424 path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ 2425 walk_down: 2426 level = root_level; 2427 while (level >= 0) { 2428 if (path->nodes[level] == NULL) { 2429 int parent_slot; 2430 u64 child_bytenr; 2431 2432 /* 2433 * We need to get child blockptr from parent before we 2434 * can read it. 2435 */ 2436 eb = path->nodes[level + 1]; 2437 parent_slot = path->slots[level + 1]; 2438 child_bytenr = btrfs_node_blockptr(eb, parent_slot); 2439 2440 eb = btrfs_read_node_slot(eb, parent_slot); 2441 if (IS_ERR(eb)) { 2442 ret = PTR_ERR(eb); 2443 goto out; 2444 } 2445 2446 path->nodes[level] = eb; 2447 path->slots[level] = 0; 2448 2449 btrfs_tree_read_lock(eb); 2450 path->locks[level] = BTRFS_READ_LOCK; 2451 2452 ret = btrfs_qgroup_trace_extent(trans, child_bytenr, 2453 fs_info->nodesize); 2454 if (ret) 2455 goto out; 2456 } 2457 2458 if (level == 0) { 2459 ret = btrfs_qgroup_trace_leaf_items(trans, 2460 path->nodes[level]); 2461 if (ret) 2462 goto out; 2463 2464 /* Nonzero return here means we completed our search */ 2465 ret = adjust_slots_upwards(path, root_level); 2466 if (ret) 2467 break; 2468 2469 /* Restart search with new slots */ 2470 goto walk_down; 2471 } 2472 2473 level--; 2474 } 2475 2476 ret = 0; 2477 out: 2478 btrfs_free_path(path); 2479 2480 return ret; 2481 } 2482 2483 #define UPDATE_NEW 0 2484 #define UPDATE_OLD 1 2485 /* 2486 * Walk all of the roots that points to the bytenr and adjust their refcnts. 
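 *
 * This is called twice from btrfs_qgroup_account_extent(): once with
 * UPDATE_OLD over @roots = old_roots and once with UPDATE_NEW over
 * @roots = new_roots.  Every qgroup reachable from those roots, directly or
 * through parent relations, gets its per-@seq refcnt bumped and is also
 * collected into @qgroups for qgroup_update_counters() to walk afterwards.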
2487 */ 2488 static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, 2489 struct ulist *roots, struct ulist *tmp, 2490 struct ulist *qgroups, u64 seq, int update_old) 2491 { 2492 struct ulist_node *unode; 2493 struct ulist_iterator uiter; 2494 struct ulist_node *tmp_unode; 2495 struct ulist_iterator tmp_uiter; 2496 struct btrfs_qgroup *qg; 2497 int ret = 0; 2498 2499 if (!roots) 2500 return 0; 2501 ULIST_ITER_INIT(&uiter); 2502 while ((unode = ulist_next(roots, &uiter))) { 2503 qg = find_qgroup_rb(fs_info, unode->val); 2504 if (!qg) 2505 continue; 2506 2507 ulist_reinit(tmp); 2508 ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg), 2509 GFP_ATOMIC); 2510 if (ret < 0) 2511 return ret; 2512 ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC); 2513 if (ret < 0) 2514 return ret; 2515 ULIST_ITER_INIT(&tmp_uiter); 2516 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { 2517 struct btrfs_qgroup_list *glist; 2518 2519 qg = unode_aux_to_qgroup(tmp_unode); 2520 if (update_old) 2521 btrfs_qgroup_update_old_refcnt(qg, seq, 1); 2522 else 2523 btrfs_qgroup_update_new_refcnt(qg, seq, 1); 2524 list_for_each_entry(glist, &qg->groups, next_group) { 2525 ret = ulist_add(qgroups, glist->group->qgroupid, 2526 qgroup_to_aux(glist->group), 2527 GFP_ATOMIC); 2528 if (ret < 0) 2529 return ret; 2530 ret = ulist_add(tmp, glist->group->qgroupid, 2531 qgroup_to_aux(glist->group), 2532 GFP_ATOMIC); 2533 if (ret < 0) 2534 return ret; 2535 } 2536 } 2537 } 2538 return 0; 2539 } 2540 2541 /* 2542 * Update qgroup rfer/excl counters. 2543 * Rfer update is easy, the code can explain itself. 2544 * 2545 * Excl update is tricky, so the update is split into 2 parts. 2546 * Part 1: Possible exclusive <-> sharing detection: 2547 * | A | !A | 2548 * ------------------------------------- 2549 * B | * | - | 2550 * ------------------------------------- 2551 * !B | + | ** | 2552 * ------------------------------------- 2553 * 2554 * Conditions: 2555 * A: cur_old_roots < nr_old_roots (not exclusive before) 2556 * !A: cur_old_roots == nr_old_roots (possible exclusive before) 2557 * B: cur_new_roots < nr_new_roots (not exclusive now) 2558 * !B: cur_new_roots == nr_new_roots (possible exclusive now) 2559 * 2560 * Results: 2561 * +: Possible sharing -> exclusive -: Possible exclusive -> sharing 2562 * *: Definitely not changed. **: Possible unchanged. 2563 * 2564 * For the !A and !B conditions, the exception is the cur_old/new_roots == 0 case. 2565 * 2566 * To make the logic clear, we first use conditions A and B to split the 2567 * combinations into 4 results. 2568 * 2569 * Then, for results "+" and "-", check the old/new_roots == 0 case, as in 2570 * those results only one of the two counts may be 0. 2571 * 2572 * Lastly, check result **; since there both counts may be 0, split it 2573 * again (2x2). 2574 * But this time we don't need to consider other things, and the code and 2575 * logic are easy to follow now.
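 *
 * A worked example (illustrative numbers): an extent was referenced by 2
 * roots before and 1 root after the operation (nr_old_roots = 2,
 * nr_new_roots = 1).  A qgroup with cur_old_count = 1 and cur_new_count = 1
 * matches A (shared before) and !B (possibly exclusive now) with a non-zero
 * new count, so it gains num_bytes of excl; its rfer stays unchanged because
 * both counts are non-zero.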
2576 */ 2577 static int qgroup_update_counters(struct btrfs_fs_info *fs_info, 2578 struct ulist *qgroups, 2579 u64 nr_old_roots, 2580 u64 nr_new_roots, 2581 u64 num_bytes, u64 seq) 2582 { 2583 struct ulist_node *unode; 2584 struct ulist_iterator uiter; 2585 struct btrfs_qgroup *qg; 2586 u64 cur_new_count, cur_old_count; 2587 2588 ULIST_ITER_INIT(&uiter); 2589 while ((unode = ulist_next(qgroups, &uiter))) { 2590 bool dirty = false; 2591 2592 qg = unode_aux_to_qgroup(unode); 2593 cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); 2594 cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); 2595 2596 trace_qgroup_update_counters(fs_info, qg, cur_old_count, 2597 cur_new_count); 2598 2599 /* Rfer update part */ 2600 if (cur_old_count == 0 && cur_new_count > 0) { 2601 qg->rfer += num_bytes; 2602 qg->rfer_cmpr += num_bytes; 2603 dirty = true; 2604 } 2605 if (cur_old_count > 0 && cur_new_count == 0) { 2606 qg->rfer -= num_bytes; 2607 qg->rfer_cmpr -= num_bytes; 2608 dirty = true; 2609 } 2610 2611 /* Excl update part */ 2612 /* Exclusive/none -> shared case */ 2613 if (cur_old_count == nr_old_roots && 2614 cur_new_count < nr_new_roots) { 2615 /* Exclusive -> shared */ 2616 if (cur_old_count != 0) { 2617 qg->excl -= num_bytes; 2618 qg->excl_cmpr -= num_bytes; 2619 dirty = true; 2620 } 2621 } 2622 2623 /* Shared -> exclusive/none case */ 2624 if (cur_old_count < nr_old_roots && 2625 cur_new_count == nr_new_roots) { 2626 /* Shared->exclusive */ 2627 if (cur_new_count != 0) { 2628 qg->excl += num_bytes; 2629 qg->excl_cmpr += num_bytes; 2630 dirty = true; 2631 } 2632 } 2633 2634 /* Exclusive/none -> exclusive/none case */ 2635 if (cur_old_count == nr_old_roots && 2636 cur_new_count == nr_new_roots) { 2637 if (cur_old_count == 0) { 2638 /* None -> exclusive/none */ 2639 2640 if (cur_new_count != 0) { 2641 /* None -> exclusive */ 2642 qg->excl += num_bytes; 2643 qg->excl_cmpr += num_bytes; 2644 dirty = true; 2645 } 2646 /* None -> none, nothing changed */ 2647 } else { 2648 /* Exclusive -> exclusive/none */ 2649 2650 if (cur_new_count == 0) { 2651 /* Exclusive -> none */ 2652 qg->excl -= num_bytes; 2653 qg->excl_cmpr -= num_bytes; 2654 dirty = true; 2655 } 2656 /* Exclusive -> exclusive, nothing changed */ 2657 } 2658 } 2659 2660 if (dirty) 2661 qgroup_dirty(fs_info, qg); 2662 } 2663 return 0; 2664 } 2665 2666 /* 2667 * Check if the @roots potentially is a list of fs tree roots 2668 * 2669 * Return 0 for definitely not a fs/subvol tree roots ulist 2670 * Return 1 for possible fs/subvol tree roots in the list (considering an empty 2671 * one as well) 2672 */ 2673 static int maybe_fs_roots(struct ulist *roots) 2674 { 2675 struct ulist_node *unode; 2676 struct ulist_iterator uiter; 2677 2678 /* Empty one, still possible for fs roots */ 2679 if (!roots || roots->nnodes == 0) 2680 return 1; 2681 2682 ULIST_ITER_INIT(&uiter); 2683 unode = ulist_next(roots, &uiter); 2684 if (!unode) 2685 return 1; 2686 2687 /* 2688 * If it contains fs tree roots, then it must belong to fs/subvol 2689 * trees. 2690 * If it contains a non-fs tree, it won't be shared with fs/subvol trees. 
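 *
 * E.g. a ulist whose first entry is the extent tree objectid cannot contain
 * any subvolume root, so 0 is returned and the caller skips accounting for
 * that extent entirely.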
2691 */ 2692 return is_fstree(unode->val); 2693 } 2694 2695 int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, 2696 u64 num_bytes, struct ulist *old_roots, 2697 struct ulist *new_roots) 2698 { 2699 struct btrfs_fs_info *fs_info = trans->fs_info; 2700 struct ulist *qgroups = NULL; 2701 struct ulist *tmp = NULL; 2702 u64 seq; 2703 u64 nr_new_roots = 0; 2704 u64 nr_old_roots = 0; 2705 int ret = 0; 2706 2707 /* 2708 * If quotas get disabled meanwhile, the resources need to be freed and 2709 * we can't just exit here. 2710 */ 2711 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 2712 fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) 2713 goto out_free; 2714 2715 if (new_roots) { 2716 if (!maybe_fs_roots(new_roots)) 2717 goto out_free; 2718 nr_new_roots = new_roots->nnodes; 2719 } 2720 if (old_roots) { 2721 if (!maybe_fs_roots(old_roots)) 2722 goto out_free; 2723 nr_old_roots = old_roots->nnodes; 2724 } 2725 2726 /* Quick exit, either not fs tree roots, or won't affect any qgroup */ 2727 if (nr_old_roots == 0 && nr_new_roots == 0) 2728 goto out_free; 2729 2730 trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr, 2731 num_bytes, nr_old_roots, nr_new_roots); 2732 2733 qgroups = ulist_alloc(GFP_NOFS); 2734 if (!qgroups) { 2735 ret = -ENOMEM; 2736 goto out_free; 2737 } 2738 tmp = ulist_alloc(GFP_NOFS); 2739 if (!tmp) { 2740 ret = -ENOMEM; 2741 goto out_free; 2742 } 2743 2744 mutex_lock(&fs_info->qgroup_rescan_lock); 2745 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 2746 if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { 2747 mutex_unlock(&fs_info->qgroup_rescan_lock); 2748 ret = 0; 2749 goto out_free; 2750 } 2751 } 2752 mutex_unlock(&fs_info->qgroup_rescan_lock); 2753 2754 spin_lock(&fs_info->qgroup_lock); 2755 seq = fs_info->qgroup_seq; 2756 2757 /* Update old refcnts using old_roots */ 2758 ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq, 2759 UPDATE_OLD); 2760 if (ret < 0) 2761 goto out; 2762 2763 /* Update new refcnts using new_roots */ 2764 ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq, 2765 UPDATE_NEW); 2766 if (ret < 0) 2767 goto out; 2768 2769 qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots, 2770 num_bytes, seq); 2771 2772 /* 2773 * Bump qgroup_seq to avoid seq overlap 2774 */ 2775 fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; 2776 out: 2777 spin_unlock(&fs_info->qgroup_lock); 2778 out_free: 2779 ulist_free(tmp); 2780 ulist_free(qgroups); 2781 ulist_free(old_roots); 2782 ulist_free(new_roots); 2783 return ret; 2784 } 2785 2786 int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) 2787 { 2788 struct btrfs_fs_info *fs_info = trans->fs_info; 2789 struct btrfs_qgroup_extent_record *record; 2790 struct btrfs_delayed_ref_root *delayed_refs; 2791 struct ulist *new_roots = NULL; 2792 struct rb_node *node; 2793 u64 num_dirty_extents = 0; 2794 u64 qgroup_to_skip; 2795 int ret = 0; 2796 2797 delayed_refs = &trans->transaction->delayed_refs; 2798 qgroup_to_skip = delayed_refs->qgroup_to_skip; 2799 while ((node = rb_first(&delayed_refs->dirty_extent_root))) { 2800 record = rb_entry(node, struct btrfs_qgroup_extent_record, 2801 node); 2802 2803 num_dirty_extents++; 2804 trace_btrfs_qgroup_account_extents(fs_info, record); 2805 2806 if (!ret && !(fs_info->qgroup_flags & 2807 BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { 2808 struct btrfs_backref_walk_ctx ctx = { 0 }; 2809 2810 ctx.bytenr = record->bytenr; 2811 ctx.fs_info = fs_info; 2812 
2813 /* 2814 * Old roots should be searched when inserting qgroup 2815 * extent record. 2816 * 2817 * But for INCONSISTENT (NO_ACCOUNTING) -> rescan case, 2818 * we may have some record inserted during 2819 * NO_ACCOUNTING (thus no old_roots populated), but 2820 * later we start rescan, which clears NO_ACCOUNTING, 2821 * leaving some inserted records without old_roots 2822 * populated. 2823 * 2824 * Those cases are rare and should not cause too much 2825 * time spent during commit_transaction(). 2826 */ 2827 if (!record->old_roots) { 2828 /* Search commit root to find old_roots */ 2829 ret = btrfs_find_all_roots(&ctx, false); 2830 if (ret < 0) 2831 goto cleanup; 2832 record->old_roots = ctx.roots; 2833 ctx.roots = NULL; 2834 } 2835 2836 /* 2837 * Use BTRFS_SEQ_LAST as time_seq to do special search, 2838 * which doesn't lock tree or delayed_refs and search 2839 * current root. It's safe inside commit_transaction(). 2840 */ 2841 ctx.trans = trans; 2842 ctx.time_seq = BTRFS_SEQ_LAST; 2843 ret = btrfs_find_all_roots(&ctx, false); 2844 if (ret < 0) 2845 goto cleanup; 2846 new_roots = ctx.roots; 2847 if (qgroup_to_skip) { 2848 ulist_del(new_roots, qgroup_to_skip, 0); 2849 ulist_del(record->old_roots, qgroup_to_skip, 2850 0); 2851 } 2852 ret = btrfs_qgroup_account_extent(trans, record->bytenr, 2853 record->num_bytes, 2854 record->old_roots, 2855 new_roots); 2856 record->old_roots = NULL; 2857 new_roots = NULL; 2858 } 2859 /* Free the reserved data space */ 2860 btrfs_qgroup_free_refroot(fs_info, 2861 record->data_rsv_refroot, 2862 record->data_rsv, 2863 BTRFS_QGROUP_RSV_DATA); 2864 cleanup: 2865 ulist_free(record->old_roots); 2866 ulist_free(new_roots); 2867 new_roots = NULL; 2868 rb_erase(node, &delayed_refs->dirty_extent_root); 2869 kfree(record); 2870 2871 } 2872 trace_qgroup_num_dirty_extents(fs_info, trans->transid, 2873 num_dirty_extents); 2874 return ret; 2875 } 2876 2877 /* 2878 * Writes all changed qgroups to disk. 2879 * Called by the transaction commit path and the qgroup assign ioctl. 2880 */ 2881 int btrfs_run_qgroups(struct btrfs_trans_handle *trans) 2882 { 2883 struct btrfs_fs_info *fs_info = trans->fs_info; 2884 int ret = 0; 2885 2886 /* 2887 * In case we are called from the qgroup assign ioctl, assert that we 2888 * are holding the qgroup_ioctl_lock, otherwise we can race with a quota 2889 * disable operation (ioctl) and access a freed quota root. 
2890 */ 2891 if (trans->transaction->state != TRANS_STATE_COMMIT_DOING) 2892 lockdep_assert_held(&fs_info->qgroup_ioctl_lock); 2893 2894 if (!fs_info->quota_root) 2895 return ret; 2896 2897 spin_lock(&fs_info->qgroup_lock); 2898 while (!list_empty(&fs_info->dirty_qgroups)) { 2899 struct btrfs_qgroup *qgroup; 2900 qgroup = list_first_entry(&fs_info->dirty_qgroups, 2901 struct btrfs_qgroup, dirty); 2902 list_del_init(&qgroup->dirty); 2903 spin_unlock(&fs_info->qgroup_lock); 2904 ret = update_qgroup_info_item(trans, qgroup); 2905 if (ret) 2906 qgroup_mark_inconsistent(fs_info); 2907 ret = update_qgroup_limit_item(trans, qgroup); 2908 if (ret) 2909 qgroup_mark_inconsistent(fs_info); 2910 spin_lock(&fs_info->qgroup_lock); 2911 } 2912 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2913 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON; 2914 else 2915 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; 2916 spin_unlock(&fs_info->qgroup_lock); 2917 2918 ret = update_qgroup_status_item(trans); 2919 if (ret) 2920 qgroup_mark_inconsistent(fs_info); 2921 2922 return ret; 2923 } 2924 2925 /* 2926 * Copy the accounting information between qgroups. This is necessary 2927 * when a snapshot or a subvolume is created. Throwing an error will 2928 * cause a transaction abort so we take extra care here to only error 2929 * when a readonly fs is a reasonable outcome. 2930 */ 2931 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, 2932 u64 objectid, struct btrfs_qgroup_inherit *inherit) 2933 { 2934 int ret = 0; 2935 int i; 2936 u64 *i_qgroups; 2937 bool committing = false; 2938 struct btrfs_fs_info *fs_info = trans->fs_info; 2939 struct btrfs_root *quota_root; 2940 struct btrfs_qgroup *srcgroup; 2941 struct btrfs_qgroup *dstgroup; 2942 bool need_rescan = false; 2943 u32 level_size = 0; 2944 u64 nums; 2945 2946 /* 2947 * There are only two callers of this function. 2948 * 2949 * One in create_subvol() in the ioctl context, which needs to hold 2950 * the qgroup_ioctl_lock. 2951 * 2952 * The other one in create_pending_snapshot() where no other qgroup 2953 * code can modify the fs as they all need to either start a new trans 2954 * or hold a trans handler, thus we don't need to hold 2955 * qgroup_ioctl_lock. 2956 * This would avoid long and complex lock chain and make lockdep happy. 2957 */ 2958 spin_lock(&fs_info->trans_lock); 2959 if (trans->transaction->state == TRANS_STATE_COMMIT_DOING) 2960 committing = true; 2961 spin_unlock(&fs_info->trans_lock); 2962 2963 if (!committing) 2964 mutex_lock(&fs_info->qgroup_ioctl_lock); 2965 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2966 goto out; 2967 2968 quota_root = fs_info->quota_root; 2969 if (!quota_root) { 2970 ret = -EINVAL; 2971 goto out; 2972 } 2973 2974 if (inherit) { 2975 i_qgroups = (u64 *)(inherit + 1); 2976 nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + 2977 2 * inherit->num_excl_copies; 2978 for (i = 0; i < nums; ++i) { 2979 srcgroup = find_qgroup_rb(fs_info, *i_qgroups); 2980 2981 /* 2982 * Zero out invalid groups so we can ignore 2983 * them later. 
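 *
 * (A qgroup id encodes its level in the upper 16 bits, so the check
 * below only accepts inherit sources of a higher level than the new
 * subvolume itself, e.g. 1/100, and rejects plain subvolume ids.)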
2984 */ 2985 if (!srcgroup || 2986 ((srcgroup->qgroupid >> 48) <= (objectid >> 48))) 2987 *i_qgroups = 0ULL; 2988 2989 ++i_qgroups; 2990 } 2991 } 2992 2993 /* 2994 * create a tracking group for the subvol itself 2995 */ 2996 ret = add_qgroup_item(trans, quota_root, objectid); 2997 if (ret) 2998 goto out; 2999 3000 /* 3001 * add qgroup to all inherited groups 3002 */ 3003 if (inherit) { 3004 i_qgroups = (u64 *)(inherit + 1); 3005 for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) { 3006 if (*i_qgroups == 0) 3007 continue; 3008 ret = add_qgroup_relation_item(trans, objectid, 3009 *i_qgroups); 3010 if (ret && ret != -EEXIST) 3011 goto out; 3012 ret = add_qgroup_relation_item(trans, *i_qgroups, 3013 objectid); 3014 if (ret && ret != -EEXIST) 3015 goto out; 3016 } 3017 ret = 0; 3018 } 3019 3020 3021 spin_lock(&fs_info->qgroup_lock); 3022 3023 dstgroup = add_qgroup_rb(fs_info, objectid); 3024 if (IS_ERR(dstgroup)) { 3025 ret = PTR_ERR(dstgroup); 3026 goto unlock; 3027 } 3028 3029 if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { 3030 dstgroup->lim_flags = inherit->lim.flags; 3031 dstgroup->max_rfer = inherit->lim.max_rfer; 3032 dstgroup->max_excl = inherit->lim.max_excl; 3033 dstgroup->rsv_rfer = inherit->lim.rsv_rfer; 3034 dstgroup->rsv_excl = inherit->lim.rsv_excl; 3035 3036 qgroup_dirty(fs_info, dstgroup); 3037 } 3038 3039 if (srcid) { 3040 srcgroup = find_qgroup_rb(fs_info, srcid); 3041 if (!srcgroup) 3042 goto unlock; 3043 3044 /* 3045 * We call inherit after we clone the root in order to make sure 3046 * our counts don't go crazy, so at this point the only 3047 * difference between the two roots should be the root node. 3048 */ 3049 level_size = fs_info->nodesize; 3050 dstgroup->rfer = srcgroup->rfer; 3051 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr; 3052 dstgroup->excl = level_size; 3053 dstgroup->excl_cmpr = level_size; 3054 srcgroup->excl = level_size; 3055 srcgroup->excl_cmpr = level_size; 3056 3057 /* inherit the limit info */ 3058 dstgroup->lim_flags = srcgroup->lim_flags; 3059 dstgroup->max_rfer = srcgroup->max_rfer; 3060 dstgroup->max_excl = srcgroup->max_excl; 3061 dstgroup->rsv_rfer = srcgroup->rsv_rfer; 3062 dstgroup->rsv_excl = srcgroup->rsv_excl; 3063 3064 qgroup_dirty(fs_info, dstgroup); 3065 qgroup_dirty(fs_info, srcgroup); 3066 } 3067 3068 if (!inherit) 3069 goto unlock; 3070 3071 i_qgroups = (u64 *)(inherit + 1); 3072 for (i = 0; i < inherit->num_qgroups; ++i) { 3073 if (*i_qgroups) { 3074 ret = add_relation_rb(fs_info, objectid, *i_qgroups); 3075 if (ret) 3076 goto unlock; 3077 } 3078 ++i_qgroups; 3079 3080 /* 3081 * If we're doing a snapshot, and adding the snapshot to a new 3082 * qgroup, the numbers are guaranteed to be incorrect. 
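 *
 * A fresh snapshot shares every extent with its source, so whatever numbers
 * the extra qgroup ends up with here cannot reflect reality; only a full
 * rescan can fix them, hence need_rescan below.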
3083 */ 3084 if (srcid) 3085 need_rescan = true; 3086 } 3087 3088 for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) { 3089 struct btrfs_qgroup *src; 3090 struct btrfs_qgroup *dst; 3091 3092 if (!i_qgroups[0] || !i_qgroups[1]) 3093 continue; 3094 3095 src = find_qgroup_rb(fs_info, i_qgroups[0]); 3096 dst = find_qgroup_rb(fs_info, i_qgroups[1]); 3097 3098 if (!src || !dst) { 3099 ret = -EINVAL; 3100 goto unlock; 3101 } 3102 3103 dst->rfer = src->rfer - level_size; 3104 dst->rfer_cmpr = src->rfer_cmpr - level_size; 3105 3106 /* Manually tweaking numbers certainly needs a rescan */ 3107 need_rescan = true; 3108 } 3109 for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) { 3110 struct btrfs_qgroup *src; 3111 struct btrfs_qgroup *dst; 3112 3113 if (!i_qgroups[0] || !i_qgroups[1]) 3114 continue; 3115 3116 src = find_qgroup_rb(fs_info, i_qgroups[0]); 3117 dst = find_qgroup_rb(fs_info, i_qgroups[1]); 3118 3119 if (!src || !dst) { 3120 ret = -EINVAL; 3121 goto unlock; 3122 } 3123 3124 dst->excl = src->excl + level_size; 3125 dst->excl_cmpr = src->excl_cmpr + level_size; 3126 need_rescan = true; 3127 } 3128 3129 unlock: 3130 spin_unlock(&fs_info->qgroup_lock); 3131 if (!ret) 3132 ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup); 3133 out: 3134 if (!committing) 3135 mutex_unlock(&fs_info->qgroup_ioctl_lock); 3136 if (need_rescan) 3137 qgroup_mark_inconsistent(fs_info); 3138 return ret; 3139 } 3140 3141 static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) 3142 { 3143 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && 3144 qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer) 3145 return false; 3146 3147 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && 3148 qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl) 3149 return false; 3150 3151 return true; 3152 } 3153 3154 static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, 3155 enum btrfs_qgroup_rsv_type type) 3156 { 3157 struct btrfs_qgroup *qgroup; 3158 struct btrfs_fs_info *fs_info = root->fs_info; 3159 u64 ref_root = root->root_key.objectid; 3160 int ret = 0; 3161 LIST_HEAD(qgroup_list); 3162 3163 if (!is_fstree(ref_root)) 3164 return 0; 3165 3166 if (num_bytes == 0) 3167 return 0; 3168 3169 if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) && 3170 capable(CAP_SYS_RESOURCE)) 3171 enforce = false; 3172 3173 spin_lock(&fs_info->qgroup_lock); 3174 if (!fs_info->quota_root) 3175 goto out; 3176 3177 qgroup = find_qgroup_rb(fs_info, ref_root); 3178 if (!qgroup) 3179 goto out; 3180 3181 qgroup_iterator_add(&qgroup_list, qgroup); 3182 list_for_each_entry(qgroup, &qgroup_list, iterator) { 3183 struct btrfs_qgroup_list *glist; 3184 3185 if (enforce && !qgroup_check_limits(qgroup, num_bytes)) { 3186 ret = -EDQUOT; 3187 goto out; 3188 } 3189 3190 list_for_each_entry(glist, &qgroup->groups, next_group) 3191 qgroup_iterator_add(&qgroup_list, glist->group); 3192 } 3193 3194 ret = 0; 3195 /* 3196 * no limits exceeded, now record the reservation into all qgroups 3197 */ 3198 list_for_each_entry(qgroup, &qgroup_list, iterator) 3199 qgroup_rsv_add(fs_info, qgroup, num_bytes, type); 3200 3201 out: 3202 qgroup_iterator_clean(&qgroup_list); 3203 spin_unlock(&fs_info->qgroup_lock); 3204 return ret; 3205 } 3206 3207 /* 3208 * Free @num_bytes of reserved space with @type for qgroup. (Normally level 0 3209 * qgroup). 3210 * 3211 * Will handle all higher level qgroup too. 3212 * 3213 * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup. 
3214 * This special case is only used for META_PERTRANS type. 3215 */ 3216 void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, 3217 u64 ref_root, u64 num_bytes, 3218 enum btrfs_qgroup_rsv_type type) 3219 { 3220 struct btrfs_qgroup *qgroup; 3221 struct ulist_node *unode; 3222 struct ulist_iterator uiter; 3223 int ret = 0; 3224 3225 if (!is_fstree(ref_root)) 3226 return; 3227 3228 if (num_bytes == 0) 3229 return; 3230 3231 if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) { 3232 WARN(1, "%s: Invalid type to free", __func__); 3233 return; 3234 } 3235 spin_lock(&fs_info->qgroup_lock); 3236 3237 if (!fs_info->quota_root) 3238 goto out; 3239 3240 qgroup = find_qgroup_rb(fs_info, ref_root); 3241 if (!qgroup) 3242 goto out; 3243 3244 if (num_bytes == (u64)-1) 3245 /* 3246 * We're freeing all pertrans rsv, get reserved value from 3247 * level 0 qgroup as real num_bytes to free. 3248 */ 3249 num_bytes = qgroup->rsv.values[type]; 3250 3251 ulist_reinit(fs_info->qgroup_ulist); 3252 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, 3253 qgroup_to_aux(qgroup), GFP_ATOMIC); 3254 if (ret < 0) 3255 goto out; 3256 ULIST_ITER_INIT(&uiter); 3257 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { 3258 struct btrfs_qgroup *qg; 3259 struct btrfs_qgroup_list *glist; 3260 3261 qg = unode_aux_to_qgroup(unode); 3262 3263 qgroup_rsv_release(fs_info, qg, num_bytes, type); 3264 3265 list_for_each_entry(glist, &qg->groups, next_group) { 3266 ret = ulist_add(fs_info->qgroup_ulist, 3267 glist->group->qgroupid, 3268 qgroup_to_aux(glist->group), GFP_ATOMIC); 3269 if (ret < 0) 3270 goto out; 3271 } 3272 } 3273 3274 out: 3275 spin_unlock(&fs_info->qgroup_lock); 3276 } 3277 3278 /* 3279 * Check if the leaf is the last leaf. Which means all node pointers 3280 * are at their last position. 3281 */ 3282 static bool is_last_leaf(struct btrfs_path *path) 3283 { 3284 int i; 3285 3286 for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { 3287 if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1) 3288 return false; 3289 } 3290 return true; 3291 } 3292 3293 /* 3294 * returns < 0 on error, 0 when more leafs are to be scanned. 3295 * returns 1 when done. 3296 */ 3297 static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, 3298 struct btrfs_path *path) 3299 { 3300 struct btrfs_fs_info *fs_info = trans->fs_info; 3301 struct btrfs_root *extent_root; 3302 struct btrfs_key found; 3303 struct extent_buffer *scratch_leaf = NULL; 3304 u64 num_bytes; 3305 bool done; 3306 int slot; 3307 int ret; 3308 3309 mutex_lock(&fs_info->qgroup_rescan_lock); 3310 extent_root = btrfs_extent_root(fs_info, 3311 fs_info->qgroup_rescan_progress.objectid); 3312 ret = btrfs_search_slot_for_read(extent_root, 3313 &fs_info->qgroup_rescan_progress, 3314 path, 1, 0); 3315 3316 btrfs_debug(fs_info, 3317 "current progress key (%llu %u %llu), search_slot ret %d", 3318 fs_info->qgroup_rescan_progress.objectid, 3319 fs_info->qgroup_rescan_progress.type, 3320 fs_info->qgroup_rescan_progress.offset, ret); 3321 3322 if (ret) { 3323 /* 3324 * The rescan is about to end, we will not be scanning any 3325 * further blocks. We cannot unset the RESCAN flag here, because 3326 * we want to commit the transaction if everything went well. 3327 * To make the live accounting work in this phase, we set our 3328 * scan progress pointer such that every real extent objectid 3329 * will be smaller. 
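 *
 * (This pairs with the check against qgroup_rescan_progress.objectid in
 * btrfs_qgroup_account_extent(): with the progress set to (u64)-1 below,
 * no extent is skipped by the live accounting path anymore.)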
3330 */ 3331 fs_info->qgroup_rescan_progress.objectid = (u64)-1; 3332 btrfs_release_path(path); 3333 mutex_unlock(&fs_info->qgroup_rescan_lock); 3334 return ret; 3335 } 3336 done = is_last_leaf(path); 3337 3338 btrfs_item_key_to_cpu(path->nodes[0], &found, 3339 btrfs_header_nritems(path->nodes[0]) - 1); 3340 fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; 3341 3342 scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]); 3343 if (!scratch_leaf) { 3344 ret = -ENOMEM; 3345 mutex_unlock(&fs_info->qgroup_rescan_lock); 3346 goto out; 3347 } 3348 slot = path->slots[0]; 3349 btrfs_release_path(path); 3350 mutex_unlock(&fs_info->qgroup_rescan_lock); 3351 3352 for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { 3353 struct btrfs_backref_walk_ctx ctx = { 0 }; 3354 3355 btrfs_item_key_to_cpu(scratch_leaf, &found, slot); 3356 if (found.type != BTRFS_EXTENT_ITEM_KEY && 3357 found.type != BTRFS_METADATA_ITEM_KEY) 3358 continue; 3359 if (found.type == BTRFS_METADATA_ITEM_KEY) 3360 num_bytes = fs_info->nodesize; 3361 else 3362 num_bytes = found.offset; 3363 3364 ctx.bytenr = found.objectid; 3365 ctx.fs_info = fs_info; 3366 3367 ret = btrfs_find_all_roots(&ctx, false); 3368 if (ret < 0) 3369 goto out; 3370 /* For rescan, just pass old_roots as NULL */ 3371 ret = btrfs_qgroup_account_extent(trans, found.objectid, 3372 num_bytes, NULL, ctx.roots); 3373 if (ret < 0) 3374 goto out; 3375 } 3376 out: 3377 if (scratch_leaf) 3378 free_extent_buffer(scratch_leaf); 3379 3380 if (done && !ret) { 3381 ret = 1; 3382 fs_info->qgroup_rescan_progress.objectid = (u64)-1; 3383 } 3384 return ret; 3385 } 3386 3387 static bool rescan_should_stop(struct btrfs_fs_info *fs_info) 3388 { 3389 return btrfs_fs_closing(fs_info) || 3390 test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) || 3391 !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 3392 fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN; 3393 } 3394 3395 static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) 3396 { 3397 struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info, 3398 qgroup_rescan_work); 3399 struct btrfs_path *path; 3400 struct btrfs_trans_handle *trans = NULL; 3401 int err = -ENOMEM; 3402 int ret = 0; 3403 bool stopped = false; 3404 bool did_leaf_rescans = false; 3405 3406 path = btrfs_alloc_path(); 3407 if (!path) 3408 goto out; 3409 /* 3410 * Rescan should only search for commit root, and any later difference 3411 * should be recorded by qgroup 3412 */ 3413 path->search_commit_root = 1; 3414 path->skip_locking = 1; 3415 3416 err = 0; 3417 while (!err && !(stopped = rescan_should_stop(fs_info))) { 3418 trans = btrfs_start_transaction(fs_info->fs_root, 0); 3419 if (IS_ERR(trans)) { 3420 err = PTR_ERR(trans); 3421 break; 3422 } 3423 3424 err = qgroup_rescan_leaf(trans, path); 3425 did_leaf_rescans = true; 3426 3427 if (err > 0) 3428 btrfs_commit_transaction(trans); 3429 else 3430 btrfs_end_transaction(trans); 3431 } 3432 3433 out: 3434 btrfs_free_path(path); 3435 3436 mutex_lock(&fs_info->qgroup_rescan_lock); 3437 if (err > 0 && 3438 fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { 3439 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 3440 } else if (err < 0 || stopped) { 3441 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 3442 } 3443 mutex_unlock(&fs_info->qgroup_rescan_lock); 3444 3445 /* 3446 * Only update status, since the previous part has already updated the 3447 * qgroup info, and only if we did any actual work. 
This also prevents 3448 * race with a concurrent quota disable, which has already set 3449 * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at 3450 * btrfs_quota_disable(). 3451 */ 3452 if (did_leaf_rescans) { 3453 trans = btrfs_start_transaction(fs_info->quota_root, 1); 3454 if (IS_ERR(trans)) { 3455 err = PTR_ERR(trans); 3456 trans = NULL; 3457 btrfs_err(fs_info, 3458 "fail to start transaction for status update: %d", 3459 err); 3460 } 3461 } else { 3462 trans = NULL; 3463 } 3464 3465 mutex_lock(&fs_info->qgroup_rescan_lock); 3466 if (!stopped || 3467 fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) 3468 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3469 if (trans) { 3470 ret = update_qgroup_status_item(trans); 3471 if (ret < 0) { 3472 err = ret; 3473 btrfs_err(fs_info, "fail to update qgroup status: %d", 3474 err); 3475 } 3476 } 3477 fs_info->qgroup_rescan_running = false; 3478 fs_info->qgroup_flags &= ~BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN; 3479 complete_all(&fs_info->qgroup_rescan_completion); 3480 mutex_unlock(&fs_info->qgroup_rescan_lock); 3481 3482 if (!trans) 3483 return; 3484 3485 btrfs_end_transaction(trans); 3486 3487 if (stopped) { 3488 btrfs_info(fs_info, "qgroup scan paused"); 3489 } else if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) { 3490 btrfs_info(fs_info, "qgroup scan cancelled"); 3491 } else if (err >= 0) { 3492 btrfs_info(fs_info, "qgroup scan completed%s", 3493 err > 0 ? " (inconsistency flag cleared)" : ""); 3494 } else { 3495 btrfs_err(fs_info, "qgroup scan failed with %d", err); 3496 } 3497 } 3498 3499 /* 3500 * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all 3501 * memory required for the rescan context. 3502 */ 3503 static int 3504 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, 3505 int init_flags) 3506 { 3507 int ret = 0; 3508 3509 if (!init_flags) { 3510 /* we're resuming qgroup rescan at mount time */ 3511 if (!(fs_info->qgroup_flags & 3512 BTRFS_QGROUP_STATUS_FLAG_RESCAN)) { 3513 btrfs_warn(fs_info, 3514 "qgroup rescan init failed, qgroup rescan is not queued"); 3515 ret = -EINVAL; 3516 } else if (!(fs_info->qgroup_flags & 3517 BTRFS_QGROUP_STATUS_FLAG_ON)) { 3518 btrfs_warn(fs_info, 3519 "qgroup rescan init failed, qgroup is not enabled"); 3520 ret = -EINVAL; 3521 } 3522 3523 if (ret) 3524 return ret; 3525 } 3526 3527 mutex_lock(&fs_info->qgroup_rescan_lock); 3528 3529 if (init_flags) { 3530 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 3531 btrfs_warn(fs_info, 3532 "qgroup rescan is already in progress"); 3533 ret = -EINPROGRESS; 3534 } else if (!(fs_info->qgroup_flags & 3535 BTRFS_QGROUP_STATUS_FLAG_ON)) { 3536 btrfs_warn(fs_info, 3537 "qgroup rescan init failed, qgroup is not enabled"); 3538 ret = -EINVAL; 3539 } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 3540 /* Quota disable is in progress */ 3541 ret = -EBUSY; 3542 } 3543 3544 if (ret) { 3545 mutex_unlock(&fs_info->qgroup_rescan_lock); 3546 return ret; 3547 } 3548 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3549 } 3550 3551 memset(&fs_info->qgroup_rescan_progress, 0, 3552 sizeof(fs_info->qgroup_rescan_progress)); 3553 fs_info->qgroup_flags &= ~(BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | 3554 BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); 3555 fs_info->qgroup_rescan_progress.objectid = progress_objectid; 3556 init_completion(&fs_info->qgroup_rescan_completion); 3557 mutex_unlock(&fs_info->qgroup_rescan_lock); 3558 3559 
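	/*
	 * The work item is only initialized here; it is queued later by
	 * btrfs_qgroup_rescan() or btrfs_qgroup_rescan_resume(), which also
	 * set qgroup_rescan_running under qgroup_rescan_lock.
	 */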
btrfs_init_work(&fs_info->qgroup_rescan_work, 3560 btrfs_qgroup_rescan_worker, NULL, NULL); 3561 return 0; 3562 } 3563 3564 static void 3565 qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info) 3566 { 3567 struct rb_node *n; 3568 struct btrfs_qgroup *qgroup; 3569 3570 spin_lock(&fs_info->qgroup_lock); 3571 /* clear all current qgroup tracking information */ 3572 for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { 3573 qgroup = rb_entry(n, struct btrfs_qgroup, node); 3574 qgroup->rfer = 0; 3575 qgroup->rfer_cmpr = 0; 3576 qgroup->excl = 0; 3577 qgroup->excl_cmpr = 0; 3578 qgroup_dirty(fs_info, qgroup); 3579 } 3580 spin_unlock(&fs_info->qgroup_lock); 3581 } 3582 3583 int 3584 btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) 3585 { 3586 int ret = 0; 3587 struct btrfs_trans_handle *trans; 3588 3589 ret = qgroup_rescan_init(fs_info, 0, 1); 3590 if (ret) 3591 return ret; 3592 3593 /* 3594 * We have set the rescan_progress to 0, which means no more 3595 * delayed refs will be accounted by btrfs_qgroup_account_ref. 3596 * However, btrfs_qgroup_account_ref may be right after its call 3597 * to btrfs_find_all_roots, in which case it would still do the 3598 * accounting. 3599 * To solve this, we're committing the transaction, which will 3600 * ensure we run all delayed refs and only after that, we are 3601 * going to clear all tracking information for a clean start. 3602 */ 3603 3604 trans = btrfs_attach_transaction_barrier(fs_info->fs_root); 3605 if (IS_ERR(trans) && trans != ERR_PTR(-ENOENT)) { 3606 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3607 return PTR_ERR(trans); 3608 } else if (trans != ERR_PTR(-ENOENT)) { 3609 ret = btrfs_commit_transaction(trans); 3610 if (ret) { 3611 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3612 return ret; 3613 } 3614 } 3615 3616 qgroup_rescan_zero_tracking(fs_info); 3617 3618 mutex_lock(&fs_info->qgroup_rescan_lock); 3619 fs_info->qgroup_rescan_running = true; 3620 btrfs_queue_work(fs_info->qgroup_rescan_workers, 3621 &fs_info->qgroup_rescan_work); 3622 mutex_unlock(&fs_info->qgroup_rescan_lock); 3623 3624 return 0; 3625 } 3626 3627 int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, 3628 bool interruptible) 3629 { 3630 int running; 3631 int ret = 0; 3632 3633 mutex_lock(&fs_info->qgroup_rescan_lock); 3634 running = fs_info->qgroup_rescan_running; 3635 mutex_unlock(&fs_info->qgroup_rescan_lock); 3636 3637 if (!running) 3638 return 0; 3639 3640 if (interruptible) 3641 ret = wait_for_completion_interruptible( 3642 &fs_info->qgroup_rescan_completion); 3643 else 3644 wait_for_completion(&fs_info->qgroup_rescan_completion); 3645 3646 return ret; 3647 } 3648 3649 /* 3650 * this is only called from open_ctree where we're still single threaded, thus 3651 * locking is omitted here. 
3652 */ 3653 void 3654 btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) 3655 { 3656 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 3657 mutex_lock(&fs_info->qgroup_rescan_lock); 3658 fs_info->qgroup_rescan_running = true; 3659 btrfs_queue_work(fs_info->qgroup_rescan_workers, 3660 &fs_info->qgroup_rescan_work); 3661 mutex_unlock(&fs_info->qgroup_rescan_lock); 3662 } 3663 } 3664 3665 #define rbtree_iterate_from_safe(node, next, start) \ 3666 for (node = start; node && ({ next = rb_next(node); 1;}); node = next) 3667 3668 static int qgroup_unreserve_range(struct btrfs_inode *inode, 3669 struct extent_changeset *reserved, u64 start, 3670 u64 len) 3671 { 3672 struct rb_node *node; 3673 struct rb_node *next; 3674 struct ulist_node *entry; 3675 int ret = 0; 3676 3677 node = reserved->range_changed.root.rb_node; 3678 if (!node) 3679 return 0; 3680 while (node) { 3681 entry = rb_entry(node, struct ulist_node, rb_node); 3682 if (entry->val < start) 3683 node = node->rb_right; 3684 else 3685 node = node->rb_left; 3686 } 3687 3688 if (entry->val > start && rb_prev(&entry->rb_node)) 3689 entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node, 3690 rb_node); 3691 3692 rbtree_iterate_from_safe(node, next, &entry->rb_node) { 3693 u64 entry_start; 3694 u64 entry_end; 3695 u64 entry_len; 3696 int clear_ret; 3697 3698 entry = rb_entry(node, struct ulist_node, rb_node); 3699 entry_start = entry->val; 3700 entry_end = entry->aux; 3701 entry_len = entry_end - entry_start + 1; 3702 3703 if (entry_start >= start + len) 3704 break; 3705 if (entry_start + entry_len <= start) 3706 continue; 3707 /* 3708 * Now the entry is in [start, start + len), revert the 3709 * EXTENT_QGROUP_RESERVED bit. 3710 */ 3711 clear_ret = clear_extent_bits(&inode->io_tree, entry_start, 3712 entry_end, EXTENT_QGROUP_RESERVED); 3713 if (!ret && clear_ret < 0) 3714 ret = clear_ret; 3715 3716 ulist_del(&reserved->range_changed, entry->val, entry->aux); 3717 if (likely(reserved->bytes_changed >= entry_len)) { 3718 reserved->bytes_changed -= entry_len; 3719 } else { 3720 WARN_ON(1); 3721 reserved->bytes_changed = 0; 3722 } 3723 } 3724 3725 return ret; 3726 } 3727 3728 /* 3729 * Try to free some space for qgroup. 3730 * 3731 * For qgroup, there are only 3 ways to free qgroup space: 3732 * - Flush nodatacow write 3733 * Any nodatacow write will free its reserved data space at run_delalloc_range(). 3734 * In theory, we should only flush nodatacow inodes, but it's not yet 3735 * possible, so we need to flush the whole root. 3736 * 3737 * - Wait for ordered extents 3738 * When ordered extents are finished, their reserved metadata is finally 3739 * converted to per_trans status, which can be freed by later commit 3740 * transaction. 3741 * 3742 * - Commit transaction 3743 * This would free the meta_per_trans space. 3744 * In theory this shouldn't provide much space, but any more qgroup space 3745 * is needed. 3746 */ 3747 static int try_flush_qgroup(struct btrfs_root *root) 3748 { 3749 struct btrfs_trans_handle *trans; 3750 int ret; 3751 3752 /* Can't hold an open transaction or we run the risk of deadlocking. */ 3753 ASSERT(current->journal_info == NULL); 3754 if (WARN_ON(current->journal_info)) 3755 return 0; 3756 3757 /* 3758 * We don't want to run flush again and again, so if there is a running 3759 * one, we won't try to start a new flush, but exit directly. 
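 *
 * The BTRFS_ROOT_QGROUP_FLUSHING bit acts as that gate: the loser of the
 * test_and_set_bit() below simply waits on root->qgroup_flush_wait until
 * the winner clears the bit and wakes up the waiters.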
3760 */ 3761 if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) { 3762 wait_event(root->qgroup_flush_wait, 3763 !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)); 3764 return 0; 3765 } 3766 3767 btrfs_run_delayed_iputs(root->fs_info); 3768 btrfs_wait_on_delayed_iputs(root->fs_info); 3769 ret = btrfs_start_delalloc_snapshot(root, true); 3770 if (ret < 0) 3771 goto out; 3772 btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); 3773 3774 trans = btrfs_attach_transaction_barrier(root); 3775 if (IS_ERR(trans)) { 3776 ret = PTR_ERR(trans); 3777 if (ret == -ENOENT) 3778 ret = 0; 3779 goto out; 3780 } 3781 3782 ret = btrfs_commit_transaction(trans); 3783 out: 3784 clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); 3785 wake_up(&root->qgroup_flush_wait); 3786 return ret; 3787 } 3788 3789 static int qgroup_reserve_data(struct btrfs_inode *inode, 3790 struct extent_changeset **reserved_ret, u64 start, 3791 u64 len) 3792 { 3793 struct btrfs_root *root = inode->root; 3794 struct extent_changeset *reserved; 3795 bool new_reserved = false; 3796 u64 orig_reserved; 3797 u64 to_reserve; 3798 int ret; 3799 3800 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || 3801 !is_fstree(root->root_key.objectid) || len == 0) 3802 return 0; 3803 3804 /* @reserved parameter is mandatory for qgroup */ 3805 if (WARN_ON(!reserved_ret)) 3806 return -EINVAL; 3807 if (!*reserved_ret) { 3808 new_reserved = true; 3809 *reserved_ret = extent_changeset_alloc(); 3810 if (!*reserved_ret) 3811 return -ENOMEM; 3812 } 3813 reserved = *reserved_ret; 3814 /* Record already reserved space */ 3815 orig_reserved = reserved->bytes_changed; 3816 ret = set_record_extent_bits(&inode->io_tree, start, 3817 start + len -1, EXTENT_QGROUP_RESERVED, reserved); 3818 3819 /* Newly reserved space */ 3820 to_reserve = reserved->bytes_changed - orig_reserved; 3821 trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len, 3822 to_reserve, QGROUP_RESERVE); 3823 if (ret < 0) 3824 goto out; 3825 ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA); 3826 if (ret < 0) 3827 goto cleanup; 3828 3829 return ret; 3830 3831 cleanup: 3832 qgroup_unreserve_range(inode, reserved, start, len); 3833 out: 3834 if (new_reserved) { 3835 extent_changeset_free(reserved); 3836 *reserved_ret = NULL; 3837 } 3838 return ret; 3839 } 3840 3841 /* 3842 * Reserve qgroup space for range [start, start + len). 3843 * 3844 * This function will either reserve space from related qgroups or do nothing 3845 * if the range is already reserved. 3846 * 3847 * Return 0 for successful reservation 3848 * Return <0 for error (including -EQUOT) 3849 * 3850 * NOTE: This function may sleep for memory allocation, dirty page flushing and 3851 * commit transaction. So caller should not hold any dirty page locked. 
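 *
 * Illustrative call sequence (a sketch only; @inode, @pos and @len are
 * placeholders and error handling is trimmed):
 *
 *	struct extent_changeset *data_reserved = NULL;
 *
 *	ret = btrfs_qgroup_reserve_data(inode, &data_reserved, pos, len);
 *	// ... dirty the pages ...
 *	// on success, after the file extent item is inserted:
 *	//	btrfs_qgroup_release_data(inode, pos, len, NULL);
 *	// on error or invalidation:
 *	//	btrfs_qgroup_free_data(inode, data_reserved, pos, len, NULL);
 *	extent_changeset_free(data_reserved);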
3852 */ 3853 int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, 3854 struct extent_changeset **reserved_ret, u64 start, 3855 u64 len) 3856 { 3857 int ret; 3858 3859 ret = qgroup_reserve_data(inode, reserved_ret, start, len); 3860 if (ret <= 0 && ret != -EDQUOT) 3861 return ret; 3862 3863 ret = try_flush_qgroup(inode->root); 3864 if (ret < 0) 3865 return ret; 3866 return qgroup_reserve_data(inode, reserved_ret, start, len); 3867 } 3868 3869 /* Free ranges specified by @reserved, normally in error path */ 3870 static int qgroup_free_reserved_data(struct btrfs_inode *inode, 3871 struct extent_changeset *reserved, 3872 u64 start, u64 len, u64 *freed_ret) 3873 { 3874 struct btrfs_root *root = inode->root; 3875 struct ulist_node *unode; 3876 struct ulist_iterator uiter; 3877 struct extent_changeset changeset; 3878 u64 freed = 0; 3879 int ret; 3880 3881 extent_changeset_init(&changeset); 3882 len = round_up(start + len, root->fs_info->sectorsize); 3883 start = round_down(start, root->fs_info->sectorsize); 3884 3885 ULIST_ITER_INIT(&uiter); 3886 while ((unode = ulist_next(&reserved->range_changed, &uiter))) { 3887 u64 range_start = unode->val; 3888 /* unode->aux is the inclusive end */ 3889 u64 range_len = unode->aux - range_start + 1; 3890 u64 free_start; 3891 u64 free_len; 3892 3893 extent_changeset_release(&changeset); 3894 3895 /* Only free range in range [start, start + len) */ 3896 if (range_start >= start + len || 3897 range_start + range_len <= start) 3898 continue; 3899 free_start = max(range_start, start); 3900 free_len = min(start + len, range_start + range_len) - 3901 free_start; 3902 /* 3903 * TODO: To also modify reserved->ranges_reserved to reflect 3904 * the modification. 3905 * 3906 * However as long as we free qgroup reserved according to 3907 * EXTENT_QGROUP_RESERVED, we won't double free. 3908 * So not need to rush. 
3909 */ 3910 ret = clear_record_extent_bits(&inode->io_tree, free_start, 3911 free_start + free_len - 1, 3912 EXTENT_QGROUP_RESERVED, &changeset); 3913 if (ret < 0) 3914 goto out; 3915 freed += changeset.bytes_changed; 3916 } 3917 btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed, 3918 BTRFS_QGROUP_RSV_DATA); 3919 if (freed_ret) 3920 *freed_ret = freed; 3921 ret = 0; 3922 out: 3923 extent_changeset_release(&changeset); 3924 return ret; 3925 } 3926 3927 static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, 3928 struct extent_changeset *reserved, u64 start, u64 len, 3929 u64 *released, int free) 3930 { 3931 struct extent_changeset changeset; 3932 int trace_op = QGROUP_RELEASE; 3933 int ret; 3934 3935 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags)) 3936 return 0; 3937 3938 /* In release case, we shouldn't have @reserved */ 3939 WARN_ON(!free && reserved); 3940 if (free && reserved) 3941 return qgroup_free_reserved_data(inode, reserved, start, len, released); 3942 extent_changeset_init(&changeset); 3943 ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, 3944 EXTENT_QGROUP_RESERVED, &changeset); 3945 if (ret < 0) 3946 goto out; 3947 3948 if (free) 3949 trace_op = QGROUP_FREE; 3950 trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len, 3951 changeset.bytes_changed, trace_op); 3952 if (free) 3953 btrfs_qgroup_free_refroot(inode->root->fs_info, 3954 inode->root->root_key.objectid, 3955 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); 3956 if (released) 3957 *released = changeset.bytes_changed; 3958 out: 3959 extent_changeset_release(&changeset); 3960 return ret; 3961 } 3962 3963 /* 3964 * Free a reserved space range from io_tree and related qgroups 3965 * 3966 * Should be called when a range of pages get invalidated before reaching disk. 3967 * Or for error cleanup case. 3968 * if @reserved is given, only reserved range in [@start, @start + @len) will 3969 * be freed. 3970 * 3971 * For data written to disk, use btrfs_qgroup_release_data(). 3972 * 3973 * NOTE: This function may sleep for memory allocation. 3974 */ 3975 int btrfs_qgroup_free_data(struct btrfs_inode *inode, 3976 struct extent_changeset *reserved, 3977 u64 start, u64 len, u64 *freed) 3978 { 3979 return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1); 3980 } 3981 3982 /* 3983 * Release a reserved space range from io_tree only. 3984 * 3985 * Should be called when a range of pages get written to disk and corresponding 3986 * FILE_EXTENT is inserted into corresponding root. 3987 * 3988 * Since new qgroup accounting framework will only update qgroup numbers at 3989 * commit_transaction() time, its reserved space shouldn't be freed from 3990 * related qgroups. 3991 * 3992 * But we should release the range from io_tree, to allow further write to be 3993 * COWed. 3994 * 3995 * NOTE: This function may sleep for memory allocation. 
 */
int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released)
{
	return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0);
}

static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
			      enum btrfs_qgroup_rsv_type type)
{
	if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
	    type != BTRFS_QGROUP_RSV_META_PERTRANS)
		return;
	if (num_bytes == 0)
		return;

	spin_lock(&root->qgroup_meta_rsv_lock);
	if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
		root->qgroup_meta_rsv_prealloc += num_bytes;
	else
		root->qgroup_meta_rsv_pertrans += num_bytes;
	spin_unlock(&root->qgroup_meta_rsv_lock);
}

static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
			     enum btrfs_qgroup_rsv_type type)
{
	if (type != BTRFS_QGROUP_RSV_META_PREALLOC &&
	    type != BTRFS_QGROUP_RSV_META_PERTRANS)
		return 0;
	if (num_bytes == 0)
		return 0;

	spin_lock(&root->qgroup_meta_rsv_lock);
	if (type == BTRFS_QGROUP_RSV_META_PREALLOC) {
		num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc,
				  num_bytes);
		root->qgroup_meta_rsv_prealloc -= num_bytes;
	} else {
		num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans,
				  num_bytes);
		root->qgroup_meta_rsv_pertrans -= num_bytes;
	}
	spin_unlock(&root->qgroup_meta_rsv_lock);
	return num_bytes;
}

int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
			      enum btrfs_qgroup_rsv_type type, bool enforce)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
	    !is_fstree(root->root_key.objectid) || num_bytes == 0)
		return 0;

	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
	trace_qgroup_meta_reserve(root, (s64)num_bytes, type);
	ret = qgroup_reserve(root, num_bytes, enforce, type);
	if (ret < 0)
		return ret;
	/*
	 * Record what we have reserved into the root, to avoid underflow when
	 * quota goes from disabled to enabled.
	 *
	 * In that case we may try to free space we haven't reserved (since
	 * quota was disabled), so record what we reserved into the root and
	 * ensure a later release won't underflow this number.
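	 *
	 * (sub_root_meta_rsv() clamps the amount to be freed against what is
	 * recorded here; see __btrfs_qgroup_free_meta() below.)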
	 */
	add_root_meta_rsv(root, num_bytes, type);
	return ret;
}

int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
				enum btrfs_qgroup_rsv_type type, bool enforce,
				bool noflush)
{
	int ret;

	ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
	if ((ret <= 0 && ret != -EDQUOT) || noflush)
		return ret;

	ret = try_flush_qgroup(root);
	if (ret < 0)
		return ret;
	return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
}

void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
	    !is_fstree(root->root_key.objectid))
		return;

	/* TODO: Update the trace point to handle such a free */
	trace_qgroup_meta_free_all_pertrans(root);
	/* Special value -1 means to free all reserved space */
	btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
				  BTRFS_QGROUP_RSV_META_PERTRANS);
}

void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
			      enum btrfs_qgroup_rsv_type type)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
	    !is_fstree(root->root_key.objectid))
		return;

	/*
	 * Reservation for META_PREALLOC can happen before quota is enabled,
	 * which can lead to underflow.
	 * Here we make sure we only free what we really have reserved.
	 */
	num_bytes = sub_root_meta_rsv(root, num_bytes, type);
	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
	trace_qgroup_meta_reserve(root, -(s64)num_bytes, type);
	btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
				  num_bytes, type);
}

static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
				int num_bytes)
{
	struct btrfs_qgroup *qgroup;
	LIST_HEAD(qgroup_list);

	if (num_bytes == 0)
		return;
	if (!fs_info->quota_root)
		return;

	spin_lock(&fs_info->qgroup_lock);
	qgroup = find_qgroup_rb(fs_info, ref_root);
	if (!qgroup)
		goto out;

	qgroup_iterator_add(&qgroup_list, qgroup);
	list_for_each_entry(qgroup, &qgroup_list, iterator) {
		struct btrfs_qgroup_list *glist;

		qgroup_rsv_release(fs_info, qgroup, num_bytes,
				   BTRFS_QGROUP_RSV_META_PREALLOC);
		if (!sb_rdonly(fs_info->sb))
			qgroup_rsv_add(fs_info, qgroup, num_bytes,
				       BTRFS_QGROUP_RSV_META_PERTRANS);

		list_for_each_entry(glist, &qgroup->groups, next_group)
			qgroup_iterator_add(&qgroup_list, glist->group);
	}
out:
	qgroup_iterator_clean(&qgroup_list);
	spin_unlock(&fs_info->qgroup_lock);
}

void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
	    !is_fstree(root->root_key.objectid))
		return;

	/* Same as btrfs_qgroup_free_meta_prealloc() */
	num_bytes = sub_root_meta_rsv(root, num_bytes,
				      BTRFS_QGROUP_RSV_META_PREALLOC);
	trace_qgroup_meta_convert(root, num_bytes);
	qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
	if (!sb_rdonly(fs_info->sb))
		add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS);
}

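/*
 * Rough life cycle of the two metadata reservation types handled above
 * (simplified, for orientation only):
 *
 *	btrfs_qgroup_reserve_meta(..., META_PREALLOC, ...)
 *		reserved before we know whether the metadata will be used
 *	btrfs_qgroup_convert_reserved_meta()
 *		the reservation got used, account it as META_PERTRANS so it is
 *		released at transaction commit
 *	__btrfs_qgroup_free_meta(..., META_PREALLOC)
 *		the unused part is returned immediately
 *	btrfs_qgroup_free_meta_all_pertrans()
 *		drops all PERTRANS reservation at transaction commit
 */
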
/*
 * Check for leaked qgroup reserved space, normally at inode destruction time.
 */
void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
{
	struct extent_changeset changeset;
	struct ulist_node *unode;
	struct ulist_iterator iter;
	int ret;

	extent_changeset_init(&changeset);
	ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
				       EXTENT_QGROUP_RESERVED, &changeset);

	WARN_ON(ret < 0);
	if (WARN_ON(changeset.bytes_changed)) {
		ULIST_ITER_INIT(&iter);
		while ((unode = ulist_next(&changeset.range_changed, &iter))) {
			btrfs_warn(inode->root->fs_info,
		"leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
				   btrfs_ino(inode), unode->val, unode->aux);
		}
		btrfs_qgroup_free_refroot(inode->root->fs_info,
					  inode->root->root_key.objectid,
					  changeset.bytes_changed,
					  BTRFS_QGROUP_RSV_DATA);
	}
	extent_changeset_release(&changeset);
}

void btrfs_qgroup_init_swapped_blocks(
	struct btrfs_qgroup_swapped_blocks *swapped_blocks)
{
	int i;

	spin_lock_init(&swapped_blocks->lock);
	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
		swapped_blocks->blocks[i] = RB_ROOT;
	swapped_blocks->swapped = false;
}

/*
 * Delete all swapped block records of @root.
 * Every record here means we skipped a full subtree scan for qgroup.
 *
 * Gets called when committing one transaction.
 */
void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
{
	struct btrfs_qgroup_swapped_blocks *swapped_blocks;
	int i;

	swapped_blocks = &root->swapped_blocks;

	spin_lock(&swapped_blocks->lock);
	if (!swapped_blocks->swapped)
		goto out;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		struct rb_root *cur_root = &swapped_blocks->blocks[i];
		struct btrfs_qgroup_swapped_block *entry;
		struct btrfs_qgroup_swapped_block *next;

		rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
						     node)
			kfree(entry);
		swapped_blocks->blocks[i] = RB_ROOT;
	}
	swapped_blocks->swapped = false;
out:
	spin_unlock(&swapped_blocks->lock);
}

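/*
 * Informal overview of the swapped blocks mechanism:
 *
 * During balance the subtree roots of the subvolume tree and the reloc tree
 * get swapped. Instead of tracing both subtrees at swap time,
 * btrfs_qgroup_add_swapped_blocks() only records the swapped pair, and the
 * expensive subtree tracing is delayed until the recorded subvolume block is
 * COWed again, handled by btrfs_qgroup_trace_subtree_after_cow().
 */
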
/*
 * Add subtree root records into @subvol_root.
 *
 * @subvol_root:	tree root of the subvolume tree that got swapped
 * @bg:			block group under balance
 * @subvol_parent/slot:	pointer to the subtree root in the subvolume tree
 * @reloc_parent/slot:	pointer to the subtree root in the reloc tree
 *			BOTH POINTERS ARE BEFORE TREE SWAP
 * @last_snapshot:	last snapshot generation of the subvolume tree
 */
int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
		struct btrfs_root *subvol_root,
		struct btrfs_block_group *bg,
		struct extent_buffer *subvol_parent, int subvol_slot,
		struct extent_buffer *reloc_parent, int reloc_slot,
		u64 last_snapshot)
{
	struct btrfs_fs_info *fs_info = subvol_root->fs_info;
	struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
	struct btrfs_qgroup_swapped_block *block;
	struct rb_node **cur;
	struct rb_node *parent = NULL;
	int level = btrfs_header_level(subvol_parent) - 1;
	int ret = 0;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;

	if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
	    btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
		btrfs_err_rl(fs_info,
		"%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
			__func__,
			btrfs_node_ptr_generation(subvol_parent, subvol_slot),
			btrfs_node_ptr_generation(reloc_parent, reloc_slot));
		return -EUCLEAN;
	}

	block = kmalloc(sizeof(*block), GFP_NOFS);
	if (!block) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * @reloc_parent/slot is still before the swap, while @block is going
	 * to record the bytenr after the swap, so we do the swap here.
	 */
	block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
	block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
							     reloc_slot);
	block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
	block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
							    subvol_slot);
	block->last_snapshot = last_snapshot;
	block->level = level;

	/*
	 * If bg == NULL, we're called from btrfs_recover_relocation(); no one
	 * else can modify tree blocks, thus the qgroup numbers will not change
	 * no matter the value of trace_leaf.
	 */
	if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
		block->trace_leaf = true;
	else
		block->trace_leaf = false;
	btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);

	/* Insert @block into @blocks */
	spin_lock(&blocks->lock);
	cur = &blocks->blocks[level].rb_node;
	while (*cur) {
		struct btrfs_qgroup_swapped_block *entry;

		parent = *cur;
		entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
				 node);

		if (entry->subvol_bytenr < block->subvol_bytenr) {
			cur = &(*cur)->rb_left;
		} else if (entry->subvol_bytenr > block->subvol_bytenr) {
			cur = &(*cur)->rb_right;
		} else {
			if (entry->subvol_generation !=
					block->subvol_generation ||
			    entry->reloc_bytenr != block->reloc_bytenr ||
			    entry->reloc_generation !=
					block->reloc_generation) {
				/*
				 * Duplicate but mismatched entry found.
				 * This shouldn't happen.
				 *
				 * Marking the qgroup inconsistent should be
				 * enough for end users.
				 */
				WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
				ret = -EEXIST;
			}
			kfree(block);
			goto out_unlock;
		}
	}
	rb_link_node(&block->node, parent, cur);
	rb_insert_color(&block->node, &blocks->blocks[level]);
	blocks->swapped = true;
out_unlock:
	spin_unlock(&blocks->lock);
out:
	if (ret < 0)
		qgroup_mark_inconsistent(fs_info);
	return ret;
}

/*
 * Check if the tree block is a subtree root, and if so do the needed
 * delayed subtree tracing for qgroup.
 *
 * This is called during btrfs_cow_block().
 */
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct extent_buffer *subvol_eb)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_tree_parent_check check = { 0 };
	struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
	struct btrfs_qgroup_swapped_block *block;
	struct extent_buffer *reloc_eb = NULL;
	struct rb_node *node;
	bool found = false;
	bool swapped = false;
	int level = btrfs_header_level(subvol_eb);
	int ret = 0;
	int i;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;
	if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
		return 0;

	spin_lock(&blocks->lock);
	if (!blocks->swapped) {
		spin_unlock(&blocks->lock);
		return 0;
	}
	node = blocks->blocks[level].rb_node;

	while (node) {
		block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
		if (block->subvol_bytenr < subvol_eb->start) {
			node = node->rb_left;
		} else if (block->subvol_bytenr > subvol_eb->start) {
			node = node->rb_right;
		} else {
			found = true;
			break;
		}
	}
	if (!found) {
		spin_unlock(&blocks->lock);
		goto out;
	}
	/* Found one, remove it from @blocks first and update blocks->swapped */
	rb_erase(&block->node, &blocks->blocks[level]);
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!RB_EMPTY_ROOT(&blocks->blocks[i])) {
			swapped = true;
			break;
		}
	}
	blocks->swapped = swapped;
	spin_unlock(&blocks->lock);

	check.level = block->level;
	check.transid = block->reloc_generation;
	check.has_first_key = true;
	memcpy(&check.first_key, &block->first_key, sizeof(check.first_key));

	/* Read out the reloc subtree root */
	reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, &check);
	if (IS_ERR(reloc_eb)) {
		ret = PTR_ERR(reloc_eb);
		reloc_eb = NULL;
		goto free_out;
	}
	if (!extent_buffer_uptodate(reloc_eb)) {
		ret = -EIO;
		goto free_out;
	}

	ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
			block->last_snapshot, block->trace_leaf);
free_out:
	kfree(block);
	free_extent_buffer(reloc_eb);
out:
	if (ret < 0) {
		btrfs_err_rl(fs_info,
			     "failed to account subtree at bytenr %llu: %d",
			     subvol_eb->start, ret);
		qgroup_mark_inconsistent(fs_info);
	}
	return ret;
}

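/*
 * Free any queued qgroup extent records of @trans without accounting them
 * (e.g. when a transaction is cleaned up after an abort).
 */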
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
{
	struct btrfs_qgroup_extent_record *entry;
	struct btrfs_qgroup_extent_record *next;
	struct rb_root *root;

	root = &trans->delayed_refs.dirty_extent_root;
	rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
		ulist_free(entry->old_roots);
		kfree(entry);
	}
	*root = RB_ROOT;
}