// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011 STRATO. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/btrfs.h>
#include <linux/sched/mm.h>

#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"
#include "ulist.h"
#include "backref.h"
#include "extent_io.h"
#include "qgroup.h"
#include "block-group.h"
#include "sysfs.h"
#include "tree-mod-log.h"

/* TODO XXX FIXME
 *  - subvol delete -> delete when ref goes to 0? delete limits also?
 *  - reorganize keys
 *  - compressed
 *  - sync
 *  - copy also limits on subvol creation
 *  - limit
 *  - caches for ulists
 *  - performance benchmarks
 *  - check all ioctl parameters
 */

/*
 * Helpers to access qgroup reservation
 *
 * Callers should ensure the lock context and type are valid
 */

static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
{
	u64 ret = 0;
	int i;

	for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
		ret += qgroup->rsv.values[i];

	return ret;
}

#ifdef CONFIG_BTRFS_DEBUG
static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
{
	if (type == BTRFS_QGROUP_RSV_DATA)
		return "data";
	if (type == BTRFS_QGROUP_RSV_META_PERTRANS)
		return "meta_pertrans";
	if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
		return "meta_prealloc";
	return NULL;
}
#endif

static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
			   struct btrfs_qgroup *qgroup, u64 num_bytes,
			   enum btrfs_qgroup_rsv_type type)
{
	trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
	qgroup->rsv.values[type] += num_bytes;
}

static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
			       struct btrfs_qgroup *qgroup, u64 num_bytes,
			       enum btrfs_qgroup_rsv_type type)
{
	trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
	if (qgroup->rsv.values[type] >= num_bytes) {
		qgroup->rsv.values[type] -= num_bytes;
		return;
	}
#ifdef CONFIG_BTRFS_DEBUG
	WARN_RATELIMIT(1,
		"qgroup %llu %s reserved space underflow, have %llu to free %llu",
		qgroup->qgroupid, qgroup_rsv_type_str(type),
		qgroup->rsv.values[type], num_bytes);
#endif
	qgroup->rsv.values[type] = 0;
}

static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
				     struct btrfs_qgroup *dest,
				     struct btrfs_qgroup *src)
{
	int i;

	for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
		qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i);
}

static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
					 struct btrfs_qgroup *dest,
					 struct btrfs_qgroup *src)
{
	int i;

	for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
		qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i);
}
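/*
 * Usage sketch (illustrative only, not part of the original file): the
 * add/release helpers above are meant to be used in strict pairs under
 * qgroup_lock. A minimal sketch of a hypothetical caller, showing the
 * underflow clamping behavior of qgroup_rsv_release():
 */
#if 0	/* usage sketch, not compiled */
static void qgroup_rsv_example(struct btrfs_fs_info *fs_info,
			       struct btrfs_qgroup *qgroup)
{
	/* Reserve 64K of data space, then release it again. */
	spin_lock(&fs_info->qgroup_lock);
	qgroup_rsv_add(fs_info, qgroup, SZ_64K, BTRFS_QGROUP_RSV_DATA);
	/*
	 * Releasing more than was added does not underflow: the release
	 * helper clamps rsv.values[type] to 0 and warns (rate limited)
	 * under CONFIG_BTRFS_DEBUG.
	 */
	qgroup_rsv_release(fs_info, qgroup, SZ_128K, BTRFS_QGROUP_RSV_DATA);
	spin_unlock(&fs_info->qgroup_lock);
}
#endif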
static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
					   int mod)
{
	if (qg->old_refcnt < seq)
		qg->old_refcnt = seq;
	qg->old_refcnt += mod;
}

static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
					   int mod)
{
	if (qg->new_refcnt < seq)
		qg->new_refcnt = seq;
	qg->new_refcnt += mod;
}

static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
{
	if (qg->old_refcnt < seq)
		return 0;
	return qg->old_refcnt - seq;
}

static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
{
	if (qg->new_refcnt < seq)
		return 0;
	return qg->new_refcnt - seq;
}
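/*
 * Worked example (illustrative, not from the original file): the refcnt
 * helpers above store "seq + count" so that stale counters left over from
 * earlier accounting rounds read back as zero without an explicit reset
 * pass over all qgroups.
 */
#if 0	/* usage sketch, not compiled */
static void qgroup_refcnt_example(struct btrfs_qgroup *qg)
{
	u64 seq = 1000;	/* sequence number of the current round */

	/* qg->old_refcnt may hold e.g. 42 from a previous round ... */
	qg->old_refcnt = 42;
	/* ... so before any update, the effective count reads as 0: */
	WARN_ON(btrfs_qgroup_get_old_refcnt(qg, seq) != 0);

	/* Two updates in this round store seq + 2 ... */
	btrfs_qgroup_update_old_refcnt(qg, seq, 1);
	btrfs_qgroup_update_old_refcnt(qg, seq, 1);
	/* ... which reads back as an effective count of 2. */
	WARN_ON(btrfs_qgroup_get_old_refcnt(qg, seq) != 2);
}
#endif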
/*
 * Glue structure to represent the relations between qgroups.
 */
struct btrfs_qgroup_list {
	struct list_head next_group;
	struct list_head next_member;
	struct btrfs_qgroup *group;
	struct btrfs_qgroup *member;
};

static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
{
	return (u64)(uintptr_t)qg;
}

static inline struct btrfs_qgroup *unode_aux_to_qgroup(struct ulist_node *n)
{
	return (struct btrfs_qgroup *)(uintptr_t)n->aux;
}

static int qgroup_rescan_init(struct btrfs_fs_info *fs_info,
			      u64 progress_objectid, int init_flags);
static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);

/* must be called with qgroup_ioctl_lock held */
static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
					   u64 qgroupid)
{
	struct rb_node *n = fs_info->qgroup_tree.rb_node;
	struct btrfs_qgroup *qgroup;

	while (n) {
		qgroup = rb_entry(n, struct btrfs_qgroup, node);
		if (qgroup->qgroupid < qgroupid)
			n = n->rb_left;
		else if (qgroup->qgroupid > qgroupid)
			n = n->rb_right;
		else
			return qgroup;
	}
	return NULL;
}

/* must be called with qgroup_lock held */
static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
					  u64 qgroupid)
{
	struct rb_node **p = &fs_info->qgroup_tree.rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_qgroup *qgroup;

	while (*p) {
		parent = *p;
		qgroup = rb_entry(parent, struct btrfs_qgroup, node);

		if (qgroup->qgroupid < qgroupid)
			p = &(*p)->rb_left;
		else if (qgroup->qgroupid > qgroupid)
			p = &(*p)->rb_right;
		else
			return qgroup;
	}

	qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
	if (!qgroup)
		return ERR_PTR(-ENOMEM);

	qgroup->qgroupid = qgroupid;
	INIT_LIST_HEAD(&qgroup->groups);
	INIT_LIST_HEAD(&qgroup->members);
	INIT_LIST_HEAD(&qgroup->dirty);

	rb_link_node(&qgroup->node, parent, p);
	rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);

	return qgroup;
}

static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
			    struct btrfs_qgroup *qgroup)
{
	struct btrfs_qgroup_list *list;

	list_del(&qgroup->dirty);
	while (!list_empty(&qgroup->groups)) {
		list = list_first_entry(&qgroup->groups,
					struct btrfs_qgroup_list, next_group);
		list_del(&list->next_group);
		list_del(&list->next_member);
		kfree(list);
	}

	while (!list_empty(&qgroup->members)) {
		list = list_first_entry(&qgroup->members,
					struct btrfs_qgroup_list, next_member);
		list_del(&list->next_group);
		list_del(&list->next_member);
		kfree(list);
	}
}

/* must be called with qgroup_lock held */
static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
{
	struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);

	if (!qgroup)
		return -ENOENT;

	rb_erase(&qgroup->node, &fs_info->qgroup_tree);
	__del_qgroup_rb(fs_info, qgroup);
	return 0;
}

/* must be called with qgroup_lock held */
static int add_relation_rb(struct btrfs_fs_info *fs_info,
			   u64 memberid, u64 parentid)
{
	struct btrfs_qgroup *member;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup_list *list;

	member = find_qgroup_rb(fs_info, memberid);
	parent = find_qgroup_rb(fs_info, parentid);
	if (!member || !parent)
		return -ENOENT;

	list = kzalloc(sizeof(*list), GFP_ATOMIC);
	if (!list)
		return -ENOMEM;

	list->group = parent;
	list->member = member;
	list_add_tail(&list->next_group, &member->groups);
	list_add_tail(&list->next_member, &parent->members);

	return 0;
}

/* must be called with qgroup_lock held */
static int del_relation_rb(struct btrfs_fs_info *fs_info,
			   u64 memberid, u64 parentid)
{
	struct btrfs_qgroup *member;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup_list *list;

	member = find_qgroup_rb(fs_info, memberid);
	parent = find_qgroup_rb(fs_info, parentid);
	if (!member || !parent)
		return -ENOENT;

	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			list_del(&list->next_group);
			list_del(&list->next_member);
			kfree(list);
			return 0;
		}
	}
	return -ENOENT;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
			       u64 rfer, u64 excl)
{
	struct btrfs_qgroup *qgroup;

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup)
		return -EINVAL;
	if (qgroup->rfer != rfer || qgroup->excl != excl)
		return -EINVAL;
	return 0;
}
#endif

/*
 * The full config is read in one go, only called from open_ctree().
 * It doesn't use any locking, as at this point we're still single-threaded.
 */
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
{
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *quota_root = fs_info->quota_root;
	struct btrfs_path *path = NULL;
	struct extent_buffer *l;
	int slot;
	int ret = 0;
	u64 flags = 0;
	u64 rescan_progress = 0;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;

	fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
	if (!fs_info->qgroup_ulist) {
		ret = -ENOMEM;
		goto out;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	ret = btrfs_sysfs_add_qgroups(fs_info);
	if (ret < 0)
		goto out;
	/* default this to quota off, in case no status key is found */
	fs_info->qgroup_flags = 0;

	/*
	 * pass 1: read status, all qgroup infos and limits
	 */
	key.objectid = 0;
	key.type = 0;
	key.offset = 0;
	ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
	if (ret)
		goto out;

	while (1) {
		struct btrfs_qgroup *qgroup;

		slot = path->slots[0];
		l = path->nodes[0];
		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
			struct btrfs_qgroup_status_item *ptr;

			ptr = btrfs_item_ptr(l, slot,
					     struct btrfs_qgroup_status_item);

			if (btrfs_qgroup_status_version(l, ptr) !=
			    BTRFS_QGROUP_STATUS_VERSION) {
				btrfs_err(fs_info,
					  "old qgroup version, quota disabled");
				goto out;
			}
			if (btrfs_qgroup_status_generation(l, ptr) !=
			    fs_info->generation) {
				flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
				btrfs_err(fs_info,
					  "qgroup generation mismatch, marked as inconsistent");
			}
			fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
									  ptr);
			rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
			goto next1;
		}

		if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
		    found_key.type != BTRFS_QGROUP_LIMIT_KEY)
			goto next1;

		qgroup = find_qgroup_rb(fs_info, found_key.offset);
		if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
		    (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
			btrfs_err(fs_info, "inconsistent qgroup config");
			flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
		}
		if (!qgroup) {
			qgroup = add_qgroup_rb(fs_info, found_key.offset);
			if (IS_ERR(qgroup)) {
				ret = PTR_ERR(qgroup);
				goto out;
			}
		}
		ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
		if (ret < 0)
			goto out;

		switch (found_key.type) {
		case BTRFS_QGROUP_INFO_KEY: {
			struct btrfs_qgroup_info_item *ptr;

			ptr = btrfs_item_ptr(l, slot,
					     struct btrfs_qgroup_info_item);
			qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
			qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
			qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
			qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
			/* generation currently unused */
			break;
		}
		case BTRFS_QGROUP_LIMIT_KEY: {
			struct btrfs_qgroup_limit_item *ptr;

			ptr = btrfs_item_ptr(l, slot,
					     struct btrfs_qgroup_limit_item);
			qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
			qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
			qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
			qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
			qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
			break;
		}
		}
next1:
		ret = btrfs_next_item(quota_root, path);
		if (ret < 0)
			goto out;
		if (ret)
			break;
	}
	btrfs_release_path(path);

	/*
	 * pass 2: read all qgroup relations
	 */
	key.objectid = 0;
	key.type = BTRFS_QGROUP_RELATION_KEY;
	key.offset = 0;
	ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
	if (ret)
		goto out;
	while (1) {
		slot = path->slots[0];
		l = path->nodes[0];
		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
			goto next2;

		if (found_key.objectid > found_key.offset) {
			/* parent <- member, not needed to build config */
			/* FIXME should we omit the key completely? */
			goto next2;
		}

		ret = add_relation_rb(fs_info, found_key.objectid,
				      found_key.offset);
		if (ret == -ENOENT) {
			btrfs_warn(fs_info,
				   "orphan qgroup relation 0x%llx->0x%llx",
				   found_key.objectid, found_key.offset);
			ret = 0;	/* ignore the error */
		}
		if (ret)
			goto out;
next2:
		ret = btrfs_next_item(quota_root, path);
		if (ret < 0)
			goto out;
		if (ret)
			break;
	}
out:
	btrfs_free_path(path);
	fs_info->qgroup_flags |= flags;
	if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
		clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
	else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
		 ret >= 0)
		ret = qgroup_rescan_init(fs_info, rescan_progress, 0);

	if (ret < 0) {
		ulist_free(fs_info->qgroup_ulist);
		fs_info->qgroup_ulist = NULL;
		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
		btrfs_sysfs_del_qgroups(fs_info);
	}

	return ret < 0 ? ret : 0;
}
/*
 * Called in close_ctree() when quota is still enabled. This verifies we don't
 * leak some reserved space.
 *
 * Return false if no reserved space is left.
 * Return true if some reserved space is leaked.
 */
bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
{
	struct rb_node *node;
	bool ret = false;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return ret;
	/*
	 * Since we're unmounting, there is no race and no need to grab qgroup
	 * lock. And here we don't go post-order to provide a more user
	 * friendly sorted result.
	 */
	for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
		struct btrfs_qgroup *qgroup;
		int i;

		qgroup = rb_entry(node, struct btrfs_qgroup, node);
		for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
			if (qgroup->rsv.values[i]) {
				ret = true;
				btrfs_warn(fs_info,
		"qgroup %hu/%llu has unreleased space, type %d rsv %llu",
					   btrfs_qgroup_level(qgroup->qgroupid),
					   btrfs_qgroup_subvolid(qgroup->qgroupid),
					   i, qgroup->rsv.values[i]);
			}
		}
	}
	return ret;
}
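/*
 * Note (illustrative, not from the original file): a qgroup ID packs the
 * level and the subvolume ID into a single u64, which is what the
 * btrfs_qgroup_level()/btrfs_qgroup_subvolid() helpers used above decode.
 * A minimal sketch of that encoding, assuming the 48-bit shift from
 * BTRFS_QGROUP_LEVEL_SHIFT:
 */
#if 0	/* usage sketch, not compiled */
static u64 qgroup_id_example(void)
{
	/* "1/257": a level 1 qgroup for subvolume ID 257 */
	u64 qgroupid = ((u64)1 << BTRFS_QGROUP_LEVEL_SHIFT) | 257;

	WARN_ON(btrfs_qgroup_level(qgroupid) != 1);
	WARN_ON(btrfs_qgroup_subvolid(qgroupid) != 257);
	return qgroupid;
}
#endif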
/*
 * This is called from close_ctree() or open_ctree() or btrfs_quota_disable();
 * the first two are single-threaded paths. For the third one, we have already
 * set quota_root to NULL with qgroup_lock held, so it is safe to clean up the
 * in-memory structures without taking qgroup_lock.
 */
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
{
	struct rb_node *n;
	struct btrfs_qgroup *qgroup;

	while ((n = rb_first(&fs_info->qgroup_tree))) {
		qgroup = rb_entry(n, struct btrfs_qgroup, node);
		rb_erase(n, &fs_info->qgroup_tree);
		__del_qgroup_rb(fs_info, qgroup);
		btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
		kfree(qgroup);
	}
	/*
	 * This function is called both when unmounting the filesystem and
	 * when disabling quota, so set qgroup_ulist to NULL here to avoid a
	 * double free.
	 */
	ulist_free(fs_info->qgroup_ulist);
	fs_info->qgroup_ulist = NULL;
	btrfs_sysfs_del_qgroups(fs_info);
}

static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
				    u64 dst)
{
	int ret;
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = src;
	key.type = BTRFS_QGROUP_RELATION_KEY;
	key.offset = dst;

	ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);

	btrfs_mark_buffer_dirty(path->nodes[0]);

	btrfs_free_path(path);
	return ret;
}

static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
				    u64 dst)
{
	int ret;
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = src;
	key.type = BTRFS_QGROUP_RELATION_KEY;
	key.offset = dst;

	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, quota_root, path);
out:
	btrfs_free_path(path);
	return ret;
}

static int add_qgroup_item(struct btrfs_trans_handle *trans,
			   struct btrfs_root *quota_root, u64 qgroupid)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_qgroup_info_item *qgroup_info;
	struct btrfs_qgroup_limit_item *qgroup_limit;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	if (btrfs_is_testing(quota_root->fs_info))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroupid;

	/*
	 * Avoid a transaction abort by catching -EEXIST here. In that case,
	 * we proceed by re-initializing the existing structure on disk.
	 */

	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
				      sizeof(*qgroup_info));
	if (ret && ret != -EEXIST)
		goto out;

	leaf = path->nodes[0];
	qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_qgroup_info_item);
	btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
	btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
	btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
	btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
	btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);

	btrfs_mark_buffer_dirty(leaf);

	btrfs_release_path(path);

	key.type = BTRFS_QGROUP_LIMIT_KEY;
	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
				      sizeof(*qgroup_limit));
	if (ret && ret != -EEXIST)
		goto out;

	leaf = path->nodes[0];
	qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_qgroup_limit_item);
	btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);

	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	int ret;
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroupid;
	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, quota_root, path);
	if (ret)
		goto out;

	btrfs_release_path(path);

	key.type = BTRFS_QGROUP_LIMIT_KEY;
	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, quota_root, path);

out:
	btrfs_free_path(path);
	return ret;
}

static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
				    struct btrfs_qgroup *qgroup)
{
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *l;
	struct btrfs_qgroup_limit_item *qgroup_limit;
	int ret;
	int slot;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_LIMIT_KEY;
	key.offset = qgroup->qgroupid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;

	if (ret)
		goto out;

	l = path->nodes[0];
	slot = path->slots[0];
	qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
	btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
	btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
	btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
	btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
	btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);

	btrfs_mark_buffer_dirty(l);

out:
	btrfs_free_path(path);
	return ret;
}

static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
				   struct btrfs_qgroup *qgroup)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root = fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *l;
	struct btrfs_qgroup_info_item *qgroup_info;
	int ret;
	int slot;

	if (btrfs_is_testing(fs_info))
		return 0;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroup->qgroupid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;

	if (ret)
		goto out;

	l = path->nodes[0];
	slot = path->slots[0];
	qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
	btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
	btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
	btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
	btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
	btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);

	btrfs_mark_buffer_dirty(l);

out:
	btrfs_free_path(path);
	return ret;
}

static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root = fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *l;
	struct btrfs_qgroup_status_item *ptr;
	int ret;
	int slot;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_STATUS_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;

	if (ret)
		goto out;

	l = path->nodes[0];
	slot = path->slots[0];
	ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
	btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
	btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
	btrfs_set_qgroup_status_rescan(l, ptr,
				       fs_info->qgroup_rescan_progress.objectid);

	btrfs_mark_buffer_dirty(l);

out:
	btrfs_free_path(path);
	return ret;
}
/*
 * Called with qgroup_lock held.
 */
static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *leaf = NULL;
	int ret;
	int nr = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 0;
	key.offset = 0;
	key.type = 0;

	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			goto out;
		leaf = path->nodes[0];
		nr = btrfs_header_nritems(leaf);
		if (!nr)
			break;
		/*
		 * Delete the leaves one by one, since the whole tree is
		 * going to be deleted.
		 */
		path->slots[0] = 0;
		ret = btrfs_del_items(trans, root, path, 0, nr);
		if (ret)
			goto out;

		btrfs_release_path(path);
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *quota_root;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_path *path = NULL;
	struct btrfs_qgroup_status_item *ptr;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_qgroup *qgroup = NULL;
	struct btrfs_trans_handle *trans = NULL;
	struct ulist *ulist = NULL;
	int ret = 0;
	int slot;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (fs_info->quota_root)
		goto out;

	ulist = ulist_alloc(GFP_KERNEL);
	if (!ulist) {
		ret = -ENOMEM;
		goto out;
	}

	ret = btrfs_sysfs_add_qgroups(fs_info);
	if (ret < 0)
		goto out;

	/*
	 * Unlock qgroup_ioctl_lock before starting the transaction. This is to
	 * avoid lock acquisition inversion problems (reported by lockdep) between
	 * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
	 * start a transaction.
	 * After we started the transaction lock qgroup_ioctl_lock again and
	 * check if someone else created the quota root in the meanwhile. If so,
	 * just return success and release the transaction handle.
	 *
	 * Also we don't need to worry about someone else calling
	 * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
	 * that function returns 0 (success) when the sysfs entries already exist.
	 */
	mutex_unlock(&fs_info->qgroup_ioctl_lock);

	/*
	 * 1 for quota root item
	 * 1 for BTRFS_QGROUP_STATUS item
	 *
	 * Yet we also need 2*n items for the QGROUP_INFO/QGROUP_LIMIT items
	 * per subvolume. However those are not currently reserved since it
	 * would be a lot of overkill.
	 */
	trans = btrfs_start_transaction(tree_root, 2);

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		goto out;
	}

	if (fs_info->quota_root)
		goto out;

	fs_info->qgroup_ulist = ulist;
	ulist = NULL;

	/*
	 * Initially create the quota tree.
	 */
	quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
	if (IS_ERR(quota_root)) {
		ret = PTR_ERR(quota_root);
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		btrfs_abort_transaction(trans, ret);
		goto out_free_root;
	}

	key.objectid = 0;
	key.type = BTRFS_QGROUP_STATUS_KEY;
	key.offset = 0;

	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
				      sizeof(*ptr));
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	leaf = path->nodes[0];
	ptr = btrfs_item_ptr(leaf, path->slots[0],
			     struct btrfs_qgroup_status_item);
	btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
	btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
	fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
				BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
	btrfs_set_qgroup_status_rescan(leaf, ptr, 0);

	btrfs_mark_buffer_dirty(leaf);

	key.objectid = 0;
	key.type = BTRFS_ROOT_REF_KEY;
	key.offset = 0;

	btrfs_release_path(path);
	ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
	if (ret > 0)
		goto out_add_root;
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	while (1) {
		slot = path->slots[0];
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.type == BTRFS_ROOT_REF_KEY) {

			/* Release locks on tree_root before we access quota_root */
			btrfs_release_path(path);

			ret = add_qgroup_item(trans, quota_root,
					      found_key.offset);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}

			qgroup = add_qgroup_rb(fs_info, found_key.offset);
			if (IS_ERR(qgroup)) {
				ret = PTR_ERR(qgroup);
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}
			ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}
			ret = btrfs_search_slot_for_read(tree_root, &found_key,
							 path, 1, 0);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}
			if (ret > 0) {
				/*
				 * Shouldn't happen, but in case it does we
				 * don't need to do the btrfs_next_item, just
				 * continue.
				 */
				continue;
			}
		}
		ret = btrfs_next_item(tree_root, path);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out_free_path;
		}
		if (ret)
			break;
	}

out_add_root:
	btrfs_release_path(path);
	ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
	if (IS_ERR(qgroup)) {
		ret = PTR_ERR(qgroup);
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}
	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	ret = btrfs_commit_transaction(trans);
	trans = NULL;
	if (ret)
		goto out_free_path;

	/*
	 * Set quota enabled flag after committing the transaction, to avoid
	 * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
	 * creation.
	 */
	spin_lock(&fs_info->qgroup_lock);
	fs_info->quota_root = quota_root;
	set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
	spin_unlock(&fs_info->qgroup_lock);

	ret = qgroup_rescan_init(fs_info, 0, 1);
	if (!ret) {
		qgroup_rescan_zero_tracking(fs_info);
		fs_info->qgroup_rescan_running = true;
		btrfs_queue_work(fs_info->qgroup_rescan_workers,
				 &fs_info->qgroup_rescan_work);
	}

out_free_path:
	btrfs_free_path(path);
out_free_root:
	if (ret)
		btrfs_put_root(quota_root);
out:
	if (ret) {
		ulist_free(fs_info->qgroup_ulist);
		fs_info->qgroup_ulist = NULL;
		btrfs_sysfs_del_qgroups(fs_info);
	}
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	if (ret && trans)
		btrfs_end_transaction(trans);
	else if (trans)
		ret = btrfs_end_transaction(trans);
	ulist_free(ulist);
	return ret;
}
int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *quota_root;
	struct btrfs_trans_handle *trans = NULL;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root)
		goto out;
	mutex_unlock(&fs_info->qgroup_ioctl_lock);

	/*
	 * 1 For the root item
	 *
	 * We should also reserve enough items for the quota tree deletion in
	 * btrfs_clean_quota_tree but this is not done.
	 *
	 * Also, we must always start a transaction without holding the mutex
	 * qgroup_ioctl_lock, see btrfs_quota_enable().
	 */
	trans = btrfs_start_transaction(fs_info->tree_root, 1);

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		goto out;
	}

	if (!fs_info->quota_root)
		goto out;

	clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
	btrfs_qgroup_wait_for_completion(fs_info, false);
	spin_lock(&fs_info->qgroup_lock);
	quota_root = fs_info->quota_root;
	fs_info->quota_root = NULL;
	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
	spin_unlock(&fs_info->qgroup_lock);

	btrfs_free_qgroup_config(fs_info);

	ret = btrfs_clean_quota_tree(trans, quota_root);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	ret = btrfs_del_root(trans, &quota_root->root_key);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	list_del(&quota_root->dirty_list);

	btrfs_tree_lock(quota_root->node);
	btrfs_clean_tree_block(quota_root->node);
	btrfs_tree_unlock(quota_root->node);
	btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
			      quota_root->node, 0, 1);

	btrfs_put_root(quota_root);

out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	if (ret && trans)
		btrfs_end_transaction(trans);
	else if (trans)
		ret = btrfs_end_transaction(trans);

	return ret;
}

static void qgroup_dirty(struct btrfs_fs_info *fs_info,
			 struct btrfs_qgroup *qgroup)
{
	if (list_empty(&qgroup->dirty))
		list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
}

/*
 * The easy accounting: we're updating a qgroup relationship whose child
 * qgroup only has exclusive extents.
 *
 * In this case, all exclusive extents will also be exclusive for the parent,
 * so excl/rfer just get added/removed.
 *
 * The same applies to qgroup reservation space: it must also be added to or
 * removed from the parent, otherwise the parent would underflow its
 * reservation when the child releases it (in the relation-adding case).
 *
 * Caller should hold fs_info->qgroup_lock.
 */
static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
				    struct ulist *tmp, u64 ref_root,
				    struct btrfs_qgroup *src, int sign)
{
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup_list *glist;
	struct ulist_node *unode;
	struct ulist_iterator uiter;
	u64 num_bytes = src->excl;
	int ret = 0;

	qgroup = find_qgroup_rb(fs_info, ref_root);
	if (!qgroup)
		goto out;

	qgroup->rfer += sign * num_bytes;
	qgroup->rfer_cmpr += sign * num_bytes;

	WARN_ON(sign < 0 && qgroup->excl < num_bytes);
	qgroup->excl += sign * num_bytes;
	qgroup->excl_cmpr += sign * num_bytes;

	if (sign > 0)
		qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
	else
		qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);

	qgroup_dirty(fs_info, qgroup);

	/* Get all of the parent groups that contain this qgroup */
	list_for_each_entry(glist, &qgroup->groups, next_group) {
		ret = ulist_add(tmp, glist->group->qgroupid,
				qgroup_to_aux(glist->group), GFP_ATOMIC);
		if (ret < 0)
			goto out;
	}

	/* Iterate all of the parents and adjust their reference counts */
	ULIST_ITER_INIT(&uiter);
	while ((unode = ulist_next(tmp, &uiter))) {
		qgroup = unode_aux_to_qgroup(unode);
		qgroup->rfer += sign * num_bytes;
		qgroup->rfer_cmpr += sign * num_bytes;
		WARN_ON(sign < 0 && qgroup->excl < num_bytes);
		qgroup->excl += sign * num_bytes;
		if (sign > 0)
			qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
		else
			qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
		qgroup->excl_cmpr += sign * num_bytes;
		qgroup_dirty(fs_info, qgroup);

		/* Add any parents of the parents */
		list_for_each_entry(glist, &qgroup->groups, next_group) {
			ret = ulist_add(tmp, glist->group->qgroupid,
					qgroup_to_aux(glist->group), GFP_ATOMIC);
			if (ret < 0)
				goto out;
		}
	}
	ret = 0;
out:
	return ret;
}


/*
 * Quick path for updating qgroup with only excl refs.
 *
 * In that case, updating all parents is enough. Otherwise a full rescan is
 * needed.
 * Caller should also hold fs_info->qgroup_lock.
 *
 * Return 0 for a quick update. Return >0 when a full rescan is needed, and
 * the INCONSISTENT flag has been set.
 * Return < 0 for other errors.
 */
static int quick_update_accounting(struct btrfs_fs_info *fs_info,
				   struct ulist *tmp, u64 src, u64 dst,
				   int sign)
{
	struct btrfs_qgroup *qgroup;
	int ret = 1;
	int err = 0;

	qgroup = find_qgroup_rb(fs_info, src);
	if (!qgroup)
		goto out;
	if (qgroup->excl == qgroup->rfer) {
		ret = 0;
		err = __qgroup_excl_accounting(fs_info, tmp, dst,
					       qgroup, sign);
		if (err < 0) {
			ret = err;
			goto out;
		}
	}
out:
	if (ret)
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	return ret;
}
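/*
 * Worked example (illustrative, not from the original file): assume child
 * qgroup 0/257 has rfer == excl == 1 MiB and is made a member of parent
 * 1/100 (sign == 1). The quick path above bumps the parent's rfer/excl by
 * 1 MiB and copies the child's reservations up; removing the relation later
 * (sign == -1) subtracts the same amounts again.
 */
#if 0	/* usage sketch, not compiled; assumes qgroup_lock is held */
static void quick_update_example(struct btrfs_fs_info *fs_info,
				 struct ulist *tmp)
{
	u64 child = 257;					/* 0/257 */
	u64 parent = ((u64)1 << BTRFS_QGROUP_LEVEL_SHIFT) | 100;	/* 1/100 */

	/* Child is fully exclusive, so this returns 0 (quick path taken). */
	WARN_ON(quick_update_accounting(fs_info, tmp, child, parent, 1) != 0);
}
#endif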
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
			      u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup *member;
	struct btrfs_qgroup_list *list;
	struct ulist *tmp;
	unsigned int nofs_flag;
	int ret = 0;

	/* Check the level of src and dst first */
	if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
		return -EINVAL;

	/* We hold a transaction handle open, must do a NOFS allocation. */
	nofs_flag = memalloc_nofs_save();
	tmp = ulist_alloc(GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);
	if (!tmp)
		return -ENOMEM;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}
	member = find_qgroup_rb(fs_info, src);
	parent = find_qgroup_rb(fs_info, dst);
	if (!member || !parent) {
		ret = -EINVAL;
		goto out;
	}

	/* First check whether this qgroup relation already exists. */
	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			ret = -EEXIST;
			goto out;
		}
	}

	ret = add_qgroup_relation_item(trans, src, dst);
	if (ret)
		goto out;

	ret = add_qgroup_relation_item(trans, dst, src);
	if (ret) {
		del_qgroup_relation_item(trans, src, dst);
		goto out;
	}

	spin_lock(&fs_info->qgroup_lock);
	ret = add_relation_rb(fs_info, src, dst);
	if (ret < 0) {
		spin_unlock(&fs_info->qgroup_lock);
		goto out;
	}
	ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
	spin_unlock(&fs_info->qgroup_lock);
out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	ulist_free(tmp);
	return ret;
}
static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
				 u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup *member;
	struct btrfs_qgroup_list *list;
	struct ulist *tmp;
	bool found = false;
	unsigned int nofs_flag;
	int ret = 0;
	int ret2;

	/* We hold a transaction handle open, must do a NOFS allocation. */
	nofs_flag = memalloc_nofs_save();
	tmp = ulist_alloc(GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);
	if (!tmp)
		return -ENOMEM;

	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	member = find_qgroup_rb(fs_info, src);
	parent = find_qgroup_rb(fs_info, dst);
	/*
	 * If the parent/member pair doesn't exist, only try to delete the
	 * dead relation items.
	 */
	if (!member || !parent)
		goto delete_item;

	/* First check whether this qgroup relation exists. */
	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			found = true;
			break;
		}
	}

delete_item:
	ret = del_qgroup_relation_item(trans, src, dst);
	if (ret < 0 && ret != -ENOENT)
		goto out;
	ret2 = del_qgroup_relation_item(trans, dst, src);
	if (ret2 < 0 && ret2 != -ENOENT)
		goto out;

	/* At least one deletion succeeded, return 0 */
	if (!ret || !ret2)
		ret = 0;

	if (found) {
		spin_lock(&fs_info->qgroup_lock);
		del_relation_rb(fs_info, src, dst);
		ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
		spin_unlock(&fs_info->qgroup_lock);
	}
out:
	ulist_free(tmp);
	return ret;
}

int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
			      u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	ret = __del_qgroup_relation(trans, src, dst);
	mutex_unlock(&fs_info->qgroup_ioctl_lock);

	return ret;
}

int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root;
	struct btrfs_qgroup *qgroup;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}
	quota_root = fs_info->quota_root;
	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (qgroup) {
		ret = -EEXIST;
		goto out;
	}

	ret = add_qgroup_item(trans, quota_root, qgroupid);
	if (ret)
		goto out;

	spin_lock(&fs_info->qgroup_lock);
	qgroup = add_qgroup_rb(fs_info, qgroupid);
	spin_unlock(&fs_info->qgroup_lock);

	if (IS_ERR(qgroup)) {
		ret = PTR_ERR(qgroup);
		goto out;
	}
	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}

int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup_list *list;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup) {
		ret = -ENOENT;
		goto out;
	}

	/* Check if there are no children of this qgroup */
	if (!list_empty(&qgroup->members)) {
		ret = -EBUSY;
		goto out;
	}

	ret = del_qgroup_item(trans, qgroupid);
	if (ret && ret != -ENOENT)
		goto out;

	while (!list_empty(&qgroup->groups)) {
		list = list_first_entry(&qgroup->groups,
					struct btrfs_qgroup_list, next_group);
		ret = __del_qgroup_relation(trans, qgroupid,
					    list->group->qgroupid);
		if (ret)
			goto out;
	}

	spin_lock(&fs_info->qgroup_lock);
	del_qgroup_rb(fs_info, qgroupid);
	spin_unlock(&fs_info->qgroup_lock);

	/*
	 * Remove the qgroup from sysfs now without holding the qgroup_lock
	 * spinlock, since the sysfs_remove_group() function needs to take
	 * the mutex kernfs_mutex through kernfs_remove_by_name_ns().
	 */
	btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
	kfree(qgroup);
out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
		       struct btrfs_qgroup_limit *limit)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *qgroup;
	int ret = 0;
	/*
	 * Sometimes we want to clear the limit on this qgroup. To meet this
	 * requirement, we treat -1 as a special value which tells the kernel
	 * to clear the limit on this qgroup.
	 */
	const u64 CLEAR_VALUE = -1;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup) {
		ret = -ENOENT;
		goto out;
	}

	spin_lock(&fs_info->qgroup_lock);
	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
		if (limit->max_rfer == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
			qgroup->max_rfer = 0;
		} else {
			qgroup->max_rfer = limit->max_rfer;
		}
	}
	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
		if (limit->max_excl == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
			qgroup->max_excl = 0;
		} else {
			qgroup->max_excl = limit->max_excl;
		}
	}
	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
		if (limit->rsv_rfer == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
			qgroup->rsv_rfer = 0;
		} else {
			qgroup->rsv_rfer = limit->rsv_rfer;
		}
	}
	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
		if (limit->rsv_excl == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
			qgroup->rsv_excl = 0;
		} else {
			qgroup->rsv_excl = limit->rsv_excl;
		}
	}
	qgroup->lim_flags |= limit->flags;

	spin_unlock(&fs_info->qgroup_lock);

	ret = update_qgroup_limit_item(trans, qgroup);
	if (ret) {
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
		btrfs_info(fs_info, "unable to update quota limit for %llu",
			   qgroupid);
	}

out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}
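/*
 * Usage sketch (illustrative, not from the original file): a hypothetical
 * caller sets a 1 GiB referenced limit and later clears it again via the -1
 * sentinel described above. The transaction handle is assumed to come from
 * the surrounding ioctl context.
 */
#if 0	/* usage sketch, not compiled */
static int limit_qgroup_example(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_qgroup_limit limit = {};
	int ret;

	/* Set max referenced bytes to 1 GiB. */
	limit.flags = BTRFS_QGROUP_LIMIT_MAX_RFER;
	limit.max_rfer = SZ_1G;
	ret = btrfs_limit_qgroup(trans, qgroupid, &limit);
	if (ret)
		return ret;

	/* Clear it again: -1 means "remove this limit". */
	limit.max_rfer = (u64)-1;
	return btrfs_limit_qgroup(trans, qgroupid, &limit);
}
#endif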
int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
				     struct btrfs_delayed_ref_root *delayed_refs,
				     struct btrfs_qgroup_extent_record *record)
{
	struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
	struct rb_node *parent_node = NULL;
	struct btrfs_qgroup_extent_record *entry;
	u64 bytenr = record->bytenr;

	lockdep_assert_held(&delayed_refs->lock);
	trace_btrfs_qgroup_trace_extent(fs_info, record);

	while (*p) {
		parent_node = *p;
		entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
				 node);
		if (bytenr < entry->bytenr) {
			p = &(*p)->rb_left;
		} else if (bytenr > entry->bytenr) {
			p = &(*p)->rb_right;
		} else {
			if (record->data_rsv && !entry->data_rsv) {
				entry->data_rsv = record->data_rsv;
				entry->data_rsv_refroot =
					record->data_rsv_refroot;
			}
			return 1;
		}
	}

	rb_link_node(&record->node, parent_node, p);
	rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
	return 0;
}

int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
				   struct btrfs_qgroup_extent_record *qrecord)
{
	struct ulist *old_root;
	u64 bytenr = qrecord->bytenr;
	int ret;

	/*
	 * We are always called in a context where we are already holding a
	 * transaction handle. Often we are called when adding a data delayed
	 * reference from btrfs_truncate_inode_items() (truncating or unlinking),
	 * in which case we will be holding a write lock on extent buffer from a
	 * subvolume tree. In this case we can't allow btrfs_find_all_roots() to
	 * acquire fs_info->commit_root_sem, because that is a higher level lock
	 * that must be acquired before locking any extent buffers.
	 *
	 * So we want btrfs_find_all_roots() to not acquire the commit_root_sem
	 * but we can't pass it a non-NULL transaction handle, because otherwise
	 * it would not use commit roots and would lock extent buffers, causing
	 * a deadlock if it ends up trying to read lock the same extent buffer
	 * that was previously write locked at btrfs_truncate_inode_items().
	 *
	 * So pass a NULL transaction handle to btrfs_find_all_roots() and
	 * explicitly tell it to not acquire the commit_root_sem - if we are
	 * holding a transaction handle we don't need its protection.
	 */
	ASSERT(trans != NULL);

	ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
				   true);
	if (ret < 0) {
		trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
		btrfs_warn(trans->fs_info,
"error accounting new delayed refs extent (err code: %d), quota inconsistent",
			   ret);
		return 0;
	}

	/*
	 * Here we don't need to get the lock of
	 * trans->transaction->delayed_refs, since inserted qrecord won't
	 * be deleted, only qrecord->node may be modified (new qrecord insert)
	 *
	 * So modifying qrecord->old_roots is safe here
	 */
	qrecord->old_roots = old_root;
	return 0;
}

int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
			      u64 num_bytes, gfp_t gfp_flag)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup_extent_record *record;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
	    || bytenr == 0 || num_bytes == 0)
		return 0;
	record = kzalloc(sizeof(*record), gfp_flag);
	if (!record)
		return -ENOMEM;

	delayed_refs = &trans->transaction->delayed_refs;
	record->bytenr = bytenr;
	record->num_bytes = num_bytes;
	record->old_roots = NULL;

	spin_lock(&delayed_refs->lock);
	ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
	spin_unlock(&delayed_refs->lock);
	if (ret > 0) {
		kfree(record);
		return 0;
	}
	return btrfs_qgroup_trace_extent_post(trans, record);
}
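/*
 * Usage sketch (illustrative, not from the original file): callers in the
 * delayed-ref code record every extent whose references change; the actual
 * accounting happens later at transaction commit. A minimal sketch, assuming
 * a hypothetical 16 KiB metadata extent:
 */
#if 0	/* usage sketch, not compiled */
static int trace_extent_example(struct btrfs_trans_handle *trans)
{
	u64 bytenr = 30408704;	/* hypothetical extent start */
	u64 num_bytes = SZ_16K;	/* hypothetical extent size */

	/*
	 * Returns 0 on success; the record is either inserted into the
	 * per-transaction dirty extent tree or freed if one for the same
	 * bytenr already exists.
	 */
	return btrfs_qgroup_trace_extent(trans, bytenr, num_bytes, GFP_NOFS);
}
#endif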
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
				  struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int nr = btrfs_header_nritems(eb);
	int i, extent_type, ret;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	u64 bytenr, num_bytes;

	/* We can be called directly from walk_up_proc() */
	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;

	for (i = 0; i < nr; i++) {
		btrfs_item_key_to_cpu(eb, &key, i);

		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;

		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
		/* filter out non qgroup-accountable extents  */
		extent_type = btrfs_file_extent_type(eb, fi);

		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
			continue;

		bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
		if (!bytenr)
			continue;

		num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);

		ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes,
						GFP_NOFS);
		if (ret)
			return ret;
	}
	cond_resched();
	return 0;
}

/*
 * Walk up the tree from the bottom, freeing leaves and any interior
 * nodes which have had all slots visited. If a node (leaf or
 * interior) is freed, the node above it will have its slot
 * incremented. The root node will never be freed.
 *
 * At the end of this function, we should have a path which has all
 * slots incremented to the next position for a search. If we need to
 * read a new node it will be NULL and the node above it will have the
 * correct slot selected for a later read.
 *
 * If we increment the root node's slot counter past the number of
 * elements, 1 is returned to signal completion of the search.
 */
static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
{
	int level = 0;
	int nr, slot;
	struct extent_buffer *eb;

	if (root_level == 0)
		return 1;

	while (level <= root_level) {
		eb = path->nodes[level];
		nr = btrfs_header_nritems(eb);
		path->slots[level]++;
		slot = path->slots[level];
		if (slot >= nr || level == 0) {
			/*
			 * Don't free the root - we will detect this
			 * condition after our loop and return a
			 * positive value for caller to stop walking the tree.
			 */
			if (level != root_level) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;

				free_extent_buffer(eb);
				path->nodes[level] = NULL;
				path->slots[level] = 0;
			}
		} else {
			/*
			 * We have a valid slot to walk back down
			 * from. Stop here so caller can process these
			 * new nodes.
			 */
			break;
		}

		level++;
	}

	eb = path->nodes[root_level];
	if (path->slots[root_level] >= btrfs_header_nritems(eb))
		return 1;

	return 0;
}
/*
 * Helper function to trace a subtree tree block swap.
 *
 * The swap will happen in highest tree block, but there may be a lot of
 * tree blocks involved.
 *
 * For example:
 *  OO = Old tree blocks
 *  NN = New tree blocks allocated during balance
 *
 *           File tree (257)                  Reloc tree for 257
 * L2              OO                                NN
 *               /    \                            /    \
 * L1          OO      OO (a)                    OO      NN (a)
 *            / \     / \                       / \     / \
 * L0       OO   OO OO   OO                   OO   OO NN   NN
 *                  (b)  (c)                          (b)  (c)
 *
 * When calling qgroup_trace_extent_swap(), we will pass:
 * @src_eb = OO(a)
 * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
 * @dst_level = 0
 * @root_level = 1
 *
 * In that case, qgroup_trace_extent_swap() will search from OO(a) to
 * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
 *
 * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
 *
 * 1) Tree search from @src_eb
 *    It acts as a simplified btrfs_search_slot().
 *    The key for the search can be extracted from @dst_path->nodes[dst_level]
 *    (first key).
 *
 * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
 *    NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
 *    They should be marked during the previous (@dst_level = 1) iteration.
 *
 * 3) Mark file extents in leaves dirty
 *    We don't have a good way to pick out new file extents only.
 *    So we still follow the old method by scanning all file extents in
 *    the leaf.
 *
 * This function can free us from keeping two paths, thus later we only need
 * to care about how to iterate all new tree blocks in reloc tree.
 */
static int qgroup_trace_extent_swap(struct btrfs_trans_handle *trans,
				    struct extent_buffer *src_eb,
				    struct btrfs_path *dst_path,
				    int dst_level, int root_level,
				    bool trace_leaf)
{
	struct btrfs_key key;
	struct btrfs_path *src_path;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	u32 nodesize = fs_info->nodesize;
	int cur_level = root_level;
	int ret;

	BUG_ON(dst_level > root_level);
	/* Level mismatch */
	if (btrfs_header_level(src_eb) != root_level)
		return -EINVAL;

	src_path = btrfs_alloc_path();
	if (!src_path) {
		ret = -ENOMEM;
		goto out;
	}

	if (dst_level)
		btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
	else
		btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);

	/* For src_path */
	atomic_inc(&src_eb->refs);
	src_path->nodes[root_level] = src_eb;
	src_path->slots[root_level] = dst_path->slots[root_level];
	src_path->locks[root_level] = 0;

	/* A simplified version of btrfs_search_slot() */
	while (cur_level >= dst_level) {
		struct btrfs_key src_key;
		struct btrfs_key dst_key;

		if (src_path->nodes[cur_level] == NULL) {
			struct extent_buffer *eb;
			int parent_slot;

			eb = src_path->nodes[cur_level + 1];
			parent_slot = src_path->slots[cur_level + 1];

			eb = btrfs_read_node_slot(eb, parent_slot);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				goto out;
			}

			src_path->nodes[cur_level] = eb;

			btrfs_tree_read_lock(eb);
			src_path->locks[cur_level] = BTRFS_READ_LOCK;
		}

		src_path->slots[cur_level] = dst_path->slots[cur_level];
		if (cur_level) {
			btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
					&dst_key, dst_path->slots[cur_level]);
			btrfs_node_key_to_cpu(src_path->nodes[cur_level],
					&src_key, src_path->slots[cur_level]);
		} else {
			btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
					&dst_key, dst_path->slots[cur_level]);
			btrfs_item_key_to_cpu(src_path->nodes[cur_level],
					&src_key, src_path->slots[cur_level]);
		}
		/* Content mismatch, something went wrong */
		if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
			ret = -ENOENT;
			goto out;
		}
		cur_level--;
	}
2018 */ 2019 ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start, 2020 nodesize, GFP_NOFS); 2021 if (ret < 0) 2022 goto out; 2023 ret = btrfs_qgroup_trace_extent(trans, 2024 dst_path->nodes[dst_level]->start, 2025 nodesize, GFP_NOFS); 2026 if (ret < 0) 2027 goto out; 2028 2029 /* Record leaf file extents */ 2030 if (dst_level == 0 && trace_leaf) { 2031 ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]); 2032 if (ret < 0) 2033 goto out; 2034 ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]); 2035 } 2036 out: 2037 btrfs_free_path(src_path); 2038 return ret; 2039 } 2040 2041 /* 2042 * Helper function to do recursive generation-aware depth-first search, to 2043 * locate all new tree blocks in a subtree of reloc tree. 2044 * 2045 * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot) 2046 * reloc tree 2047 * L2 NN (a) 2048 * / \ 2049 * L1 OO NN (b) 2050 * / \ / \ 2051 * L0 OO OO OO NN 2052 * (c) (d) 2053 * If we pass: 2054 * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ], 2055 * @cur_level = 1 2056 * @root_level = 1 2057 * 2058 * We will iterate through tree blocks NN(b), NN(d) and inform qgroup to trace 2059 * the above tree blocks along with their counterparts in the file tree. 2060 * During the search, old tree blocks like OO(c) will be skipped, as tree block 2061 * swap won't affect OO(c). 2062 */ 2063 static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle *trans, 2064 struct extent_buffer *src_eb, 2065 struct btrfs_path *dst_path, 2066 int cur_level, int root_level, 2067 u64 last_snapshot, bool trace_leaf) 2068 { 2069 struct btrfs_fs_info *fs_info = trans->fs_info; 2070 struct extent_buffer *eb; 2071 bool need_cleanup = false; 2072 int ret = 0; 2073 int i; 2074 2075 /* Level sanity check */ 2076 if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 || 2077 root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 || 2078 root_level < cur_level) { 2079 btrfs_err_rl(fs_info, 2080 "%s: bad levels, cur_level=%d root_level=%d", 2081 __func__, cur_level, root_level); 2082 return -EUCLEAN; 2083 } 2084 2085 /* Read the tree block if needed */ 2086 if (dst_path->nodes[cur_level] == NULL) { 2087 int parent_slot; 2088 u64 child_gen; 2089 2090 /* 2091 * dst_path->nodes[root_level] must be initialized before 2092 * calling this function. 2093 */ 2094 if (cur_level == root_level) { 2095 btrfs_err_rl(fs_info, 2096 "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d", 2097 __func__, root_level, root_level, cur_level); 2098 return -EUCLEAN; 2099 } 2100 2101 /* 2102 * We need to get child blockptr/gen from parent before we can 2103 * read it.
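 *
 * E.g. with last_snapshot = 100 (a hypothetical generation): a child
 * whose generation is 80 predates the snapshot, can never have been
 * swapped by balance, and is pruned right below; a child with
 * generation 120 is read, traced and recursed into.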
2104 */ 2105 eb = dst_path->nodes[cur_level + 1]; 2106 parent_slot = dst_path->slots[cur_level + 1]; 2107 child_gen = btrfs_node_ptr_generation(eb, parent_slot); 2108 2109 /* This node is old, no need to trace */ 2110 if (child_gen < last_snapshot) 2111 goto out; 2112 2113 eb = btrfs_read_node_slot(eb, parent_slot); 2114 if (IS_ERR(eb)) { 2115 ret = PTR_ERR(eb); 2116 goto out; 2117 } 2118 2119 dst_path->nodes[cur_level] = eb; 2120 dst_path->slots[cur_level] = 0; 2121 2122 btrfs_tree_read_lock(eb); 2123 dst_path->locks[cur_level] = BTRFS_READ_LOCK; 2124 need_cleanup = true; 2125 } 2126 2127 /* Now record this tree block and its counterpart for qgroups */ 2128 ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level, 2129 root_level, trace_leaf); 2130 if (ret < 0) 2131 goto cleanup; 2132 2133 eb = dst_path->nodes[cur_level]; 2134 2135 if (cur_level > 0) { 2136 /* Iterate all child tree blocks */ 2137 for (i = 0; i < btrfs_header_nritems(eb); i++) { 2138 /* Skip old tree blocks as they won't be swapped */ 2139 if (btrfs_node_ptr_generation(eb, i) < last_snapshot) 2140 continue; 2141 dst_path->slots[cur_level] = i; 2142 2143 /* Recursive call (at most 7 times) */ 2144 ret = qgroup_trace_new_subtree_blocks(trans, src_eb, 2145 dst_path, cur_level - 1, root_level, 2146 last_snapshot, trace_leaf); 2147 if (ret < 0) 2148 goto cleanup; 2149 } 2150 } 2151 2152 cleanup: 2153 if (need_cleanup) { 2154 /* Clean up */ 2155 btrfs_tree_unlock_rw(dst_path->nodes[cur_level], 2156 dst_path->locks[cur_level]); 2157 free_extent_buffer(dst_path->nodes[cur_level]); 2158 dst_path->nodes[cur_level] = NULL; 2159 dst_path->slots[cur_level] = 0; 2160 dst_path->locks[cur_level] = 0; 2161 } 2162 out: 2163 return ret; 2164 } 2165 2166 static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, 2167 struct extent_buffer *src_eb, 2168 struct extent_buffer *dst_eb, 2169 u64 last_snapshot, bool trace_leaf) 2170 { 2171 struct btrfs_fs_info *fs_info = trans->fs_info; 2172 struct btrfs_path *dst_path = NULL; 2173 int level; 2174 int ret; 2175 2176 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2177 return 0; 2178 2179 /* Wrong parameter order */ 2180 if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) { 2181 btrfs_err_rl(fs_info, 2182 "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__, 2183 btrfs_header_generation(src_eb), 2184 btrfs_header_generation(dst_eb)); 2185 return -EUCLEAN; 2186 } 2187 2188 if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) { 2189 ret = -EIO; 2190 goto out; 2191 } 2192 2193 level = btrfs_header_level(dst_eb); 2194 dst_path = btrfs_alloc_path(); 2195 if (!dst_path) { 2196 ret = -ENOMEM; 2197 goto out; 2198 } 2199 /* For dst_path */ 2200 atomic_inc(&dst_eb->refs); 2201 dst_path->nodes[level] = dst_eb; 2202 dst_path->slots[level] = 0; 2203 dst_path->locks[level] = 0; 2204 2205 /* Do the generation-aware depth-first search */ 2206 ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level, 2207 level, last_snapshot, trace_leaf); 2208 if (ret < 0) 2209 goto out; 2210 ret = 0; 2211 2212 out: 2213 btrfs_free_path(dst_path); 2214 if (ret < 0) 2215 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 2216 return ret; 2217 } 2218 2219 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, 2220 struct extent_buffer *root_eb, 2221 u64 root_gen, int root_level) 2222 { 2223 struct btrfs_fs_info *fs_info = trans->fs_info; 2224 int ret = 0; 2225 int level; 2226 struct extent_buffer *eb = root_eb; 2227
struct btrfs_path *path = NULL; 2228 2229 BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL); 2230 BUG_ON(root_eb == NULL); 2231 2232 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2233 return 0; 2234 2235 if (!extent_buffer_uptodate(root_eb)) { 2236 ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL); 2237 if (ret) 2238 goto out; 2239 } 2240 2241 if (root_level == 0) { 2242 ret = btrfs_qgroup_trace_leaf_items(trans, root_eb); 2243 goto out; 2244 } 2245 2246 path = btrfs_alloc_path(); 2247 if (!path) 2248 return -ENOMEM; 2249 2250 /* 2251 * Walk down the tree. Missing extent blocks are filled in as 2252 * we go. Metadata is accounted every time we read a new 2253 * extent block. 2254 * 2255 * When we reach a leaf, we account for file extent items in it, 2256 * walk back up the tree (adjusting slot pointers as we go) 2257 * and restart the search process. 2258 */ 2259 atomic_inc(&root_eb->refs); /* For path */ 2260 path->nodes[root_level] = root_eb; 2261 path->slots[root_level] = 0; 2262 path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ 2263 walk_down: 2264 level = root_level; 2265 while (level >= 0) { 2266 if (path->nodes[level] == NULL) { 2267 int parent_slot; 2268 u64 child_bytenr; 2269 2270 /* 2271 * We need to get child blockptr from parent before we 2272 * can read it. 2273 */ 2274 eb = path->nodes[level + 1]; 2275 parent_slot = path->slots[level + 1]; 2276 child_bytenr = btrfs_node_blockptr(eb, parent_slot); 2277 2278 eb = btrfs_read_node_slot(eb, parent_slot); 2279 if (IS_ERR(eb)) { 2280 ret = PTR_ERR(eb); 2281 goto out; 2282 } 2283 2284 path->nodes[level] = eb; 2285 path->slots[level] = 0; 2286 2287 btrfs_tree_read_lock(eb); 2288 path->locks[level] = BTRFS_READ_LOCK; 2289 2290 ret = btrfs_qgroup_trace_extent(trans, child_bytenr, 2291 fs_info->nodesize, 2292 GFP_NOFS); 2293 if (ret) 2294 goto out; 2295 } 2296 2297 if (level == 0) { 2298 ret = btrfs_qgroup_trace_leaf_items(trans, 2299 path->nodes[level]); 2300 if (ret) 2301 goto out; 2302 2303 /* Nonzero return here means we completed our search */ 2304 ret = adjust_slots_upwards(path, root_level); 2305 if (ret) 2306 break; 2307 2308 /* Restart search with new slots */ 2309 goto walk_down; 2310 } 2311 2312 level--; 2313 } 2314 2315 ret = 0; 2316 out: 2317 btrfs_free_path(path); 2318 2319 return ret; 2320 } 2321 2322 #define UPDATE_NEW 0 2323 #define UPDATE_OLD 1 2324 /* 2325 * Walk all of the roots that point to the bytenr and adjust their refcnts.
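 *
 * E.g. (hypothetical hierarchy): with @roots = {257}, where qgroup 0/257
 * is a member of 1/100, both 0/257 and 1/100 get their old (or new,
 * depending on @update_old) refcnt bumped by one, and both end up in
 * @qgroups for the qgroup_update_counters() pass that follows.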
2326 */ 2327 static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, 2328 struct ulist *roots, struct ulist *tmp, 2329 struct ulist *qgroups, u64 seq, int update_old) 2330 { 2331 struct ulist_node *unode; 2332 struct ulist_iterator uiter; 2333 struct ulist_node *tmp_unode; 2334 struct ulist_iterator tmp_uiter; 2335 struct btrfs_qgroup *qg; 2336 int ret = 0; 2337 2338 if (!roots) 2339 return 0; 2340 ULIST_ITER_INIT(&uiter); 2341 while ((unode = ulist_next(roots, &uiter))) { 2342 qg = find_qgroup_rb(fs_info, unode->val); 2343 if (!qg) 2344 continue; 2345 2346 ulist_reinit(tmp); 2347 ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg), 2348 GFP_ATOMIC); 2349 if (ret < 0) 2350 return ret; 2351 ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC); 2352 if (ret < 0) 2353 return ret; 2354 ULIST_ITER_INIT(&tmp_uiter); 2355 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { 2356 struct btrfs_qgroup_list *glist; 2357 2358 qg = unode_aux_to_qgroup(tmp_unode); 2359 if (update_old) 2360 btrfs_qgroup_update_old_refcnt(qg, seq, 1); 2361 else 2362 btrfs_qgroup_update_new_refcnt(qg, seq, 1); 2363 list_for_each_entry(glist, &qg->groups, next_group) { 2364 ret = ulist_add(qgroups, glist->group->qgroupid, 2365 qgroup_to_aux(glist->group), 2366 GFP_ATOMIC); 2367 if (ret < 0) 2368 return ret; 2369 ret = ulist_add(tmp, glist->group->qgroupid, 2370 qgroup_to_aux(glist->group), 2371 GFP_ATOMIC); 2372 if (ret < 0) 2373 return ret; 2374 } 2375 } 2376 } 2377 return 0; 2378 } 2379 2380 /* 2381 * Update qgroup rfer/excl counters. 2382 * Rfer update is easy, the code can explain itself. 2383 * 2384 * Excl update is tricky, the update is split into 2 parts. 2385 * Part 1: Possible exclusive <-> sharing detection: 2386 * | A | !A | 2387 * ------------------------------------- 2388 * B | * | - | 2389 * ------------------------------------- 2390 * !B | + | ** | 2391 * ------------------------------------- 2392 * 2393 * Conditions: 2394 * A: cur_old_roots < nr_old_roots (not exclusive before) 2395 * !A: cur_old_roots == nr_old_roots (possible exclusive before) 2396 * B: cur_new_roots < nr_new_roots (not exclusive now) 2397 * !B: cur_new_roots == nr_new_roots (possible exclusive now) 2398 * 2399 * Results: 2400 * +: Possible sharing -> exclusive -: Possible exclusive -> sharing 2401 * *: Definitely not changed. **: Possible unchanged. 2402 * 2403 * For the !A and !B conditions, the exception is the cur_old/new_roots == 0 case. 2404 * 2405 * To make the logic clear, we first use conditions A and B to split the 2406 * combination into 4 results. 2407 * 2408 * Then, for results "+" and "-", check the old/new_roots == 0 case, as in them 2409 * only one variant may be 0. 2410 * 2411 * Lastly, check result **, since there are 2 variants that may be 0, split them 2412 * again (2x2). 2413 * But this time we don't need to consider other things, the code and logic 2414 * are easy to understand now.
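 *
 * A worked example with hypothetical numbers, matching the code below:
 * let nr_old_roots = 2, nr_new_roots = 2, num_bytes = 16K.
 * - cur_old_count = 2, cur_new_count = 1: hits !A && B ("-"), possibly
 *   exclusive before and shared now, so excl/excl_cmpr shrink by 16K
 *   (cur_old_count != 0); rfer is untouched since both counts are > 0.
 * - cur_old_count = 1, cur_new_count = 2: hits A && !B ("+"), possibly
 *   exclusive now, so excl/excl_cmpr grow by 16K (cur_new_count != 0).
 * - cur_old_count = 2, cur_new_count = 2: hits !A && !B ("**"), neither
 *   count is 0, so nothing changes.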
2415 */ 2416 static int qgroup_update_counters(struct btrfs_fs_info *fs_info, 2417 struct ulist *qgroups, 2418 u64 nr_old_roots, 2419 u64 nr_new_roots, 2420 u64 num_bytes, u64 seq) 2421 { 2422 struct ulist_node *unode; 2423 struct ulist_iterator uiter; 2424 struct btrfs_qgroup *qg; 2425 u64 cur_new_count, cur_old_count; 2426 2427 ULIST_ITER_INIT(&uiter); 2428 while ((unode = ulist_next(qgroups, &uiter))) { 2429 bool dirty = false; 2430 2431 qg = unode_aux_to_qgroup(unode); 2432 cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); 2433 cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); 2434 2435 trace_qgroup_update_counters(fs_info, qg, cur_old_count, 2436 cur_new_count); 2437 2438 /* Rfer update part */ 2439 if (cur_old_count == 0 && cur_new_count > 0) { 2440 qg->rfer += num_bytes; 2441 qg->rfer_cmpr += num_bytes; 2442 dirty = true; 2443 } 2444 if (cur_old_count > 0 && cur_new_count == 0) { 2445 qg->rfer -= num_bytes; 2446 qg->rfer_cmpr -= num_bytes; 2447 dirty = true; 2448 } 2449 2450 /* Excl update part */ 2451 /* Exclusive/none -> shared case */ 2452 if (cur_old_count == nr_old_roots && 2453 cur_new_count < nr_new_roots) { 2454 /* Exclusive -> shared */ 2455 if (cur_old_count != 0) { 2456 qg->excl -= num_bytes; 2457 qg->excl_cmpr -= num_bytes; 2458 dirty = true; 2459 } 2460 } 2461 2462 /* Shared -> exclusive/none case */ 2463 if (cur_old_count < nr_old_roots && 2464 cur_new_count == nr_new_roots) { 2465 /* Shared->exclusive */ 2466 if (cur_new_count != 0) { 2467 qg->excl += num_bytes; 2468 qg->excl_cmpr += num_bytes; 2469 dirty = true; 2470 } 2471 } 2472 2473 /* Exclusive/none -> exclusive/none case */ 2474 if (cur_old_count == nr_old_roots && 2475 cur_new_count == nr_new_roots) { 2476 if (cur_old_count == 0) { 2477 /* None -> exclusive/none */ 2478 2479 if (cur_new_count != 0) { 2480 /* None -> exclusive */ 2481 qg->excl += num_bytes; 2482 qg->excl_cmpr += num_bytes; 2483 dirty = true; 2484 } 2485 /* None -> none, nothing changed */ 2486 } else { 2487 /* Exclusive -> exclusive/none */ 2488 2489 if (cur_new_count == 0) { 2490 /* Exclusive -> none */ 2491 qg->excl -= num_bytes; 2492 qg->excl_cmpr -= num_bytes; 2493 dirty = true; 2494 } 2495 /* Exclusive -> exclusive, nothing changed */ 2496 } 2497 } 2498 2499 if (dirty) 2500 qgroup_dirty(fs_info, qg); 2501 } 2502 return 0; 2503 } 2504 2505 /* 2506 * Check if the @roots potentially is a list of fs tree roots 2507 * 2508 * Return 0 for definitely not a fs/subvol tree roots ulist 2509 * Return 1 for possible fs/subvol tree roots in the list (considering an empty 2510 * one as well) 2511 */ 2512 static int maybe_fs_roots(struct ulist *roots) 2513 { 2514 struct ulist_node *unode; 2515 struct ulist_iterator uiter; 2516 2517 /* Empty one, still possible for fs roots */ 2518 if (!roots || roots->nnodes == 0) 2519 return 1; 2520 2521 ULIST_ITER_INIT(&uiter); 2522 unode = ulist_next(roots, &uiter); 2523 if (!unode) 2524 return 1; 2525 2526 /* 2527 * If it contains fs tree roots, then it must belong to fs/subvol 2528 * trees. 2529 * If it contains a non-fs tree, it won't be shared with fs/subvol trees. 
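 *
 * E.g. a ulist whose first entry is the extent tree (objectid 2) returns
 * 0 here, while one whose first entry is a subvolume id such as 257
 * (hypothetical) returns 1; only the first entry needs checking, since
 * fs and non-fs trees never share a roots list.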
2530 */ 2531 return is_fstree(unode->val); 2532 } 2533 2534 int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, 2535 u64 num_bytes, struct ulist *old_roots, 2536 struct ulist *new_roots) 2537 { 2538 struct btrfs_fs_info *fs_info = trans->fs_info; 2539 struct ulist *qgroups = NULL; 2540 struct ulist *tmp = NULL; 2541 u64 seq; 2542 u64 nr_new_roots = 0; 2543 u64 nr_old_roots = 0; 2544 int ret = 0; 2545 2546 /* 2547 * If quotas get disabled meanwhile, the resources need to be freed and 2548 * we can't just exit here. 2549 */ 2550 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2551 goto out_free; 2552 2553 if (new_roots) { 2554 if (!maybe_fs_roots(new_roots)) 2555 goto out_free; 2556 nr_new_roots = new_roots->nnodes; 2557 } 2558 if (old_roots) { 2559 if (!maybe_fs_roots(old_roots)) 2560 goto out_free; 2561 nr_old_roots = old_roots->nnodes; 2562 } 2563 2564 /* Quick exit, either not fs tree roots, or won't affect any qgroup */ 2565 if (nr_old_roots == 0 && nr_new_roots == 0) 2566 goto out_free; 2567 2568 BUG_ON(!fs_info->quota_root); 2569 2570 trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr, 2571 num_bytes, nr_old_roots, nr_new_roots); 2572 2573 qgroups = ulist_alloc(GFP_NOFS); 2574 if (!qgroups) { 2575 ret = -ENOMEM; 2576 goto out_free; 2577 } 2578 tmp = ulist_alloc(GFP_NOFS); 2579 if (!tmp) { 2580 ret = -ENOMEM; 2581 goto out_free; 2582 } 2583 2584 mutex_lock(&fs_info->qgroup_rescan_lock); 2585 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 2586 if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { 2587 mutex_unlock(&fs_info->qgroup_rescan_lock); 2588 ret = 0; 2589 goto out_free; 2590 } 2591 } 2592 mutex_unlock(&fs_info->qgroup_rescan_lock); 2593 2594 spin_lock(&fs_info->qgroup_lock); 2595 seq = fs_info->qgroup_seq; 2596 2597 /* Update old refcnts using old_roots */ 2598 ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq, 2599 UPDATE_OLD); 2600 if (ret < 0) 2601 goto out; 2602 2603 /* Update new refcnts using new_roots */ 2604 ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq, 2605 UPDATE_NEW); 2606 if (ret < 0) 2607 goto out; 2608 2609 qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots, 2610 num_bytes, seq); 2611 2612 /* 2613 * Bump qgroup_seq to avoid seq overlap 2614 */ 2615 fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; 2616 out: 2617 spin_unlock(&fs_info->qgroup_lock); 2618 out_free: 2619 ulist_free(tmp); 2620 ulist_free(qgroups); 2621 ulist_free(old_roots); 2622 ulist_free(new_roots); 2623 return ret; 2624 } 2625 2626 int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) 2627 { 2628 struct btrfs_fs_info *fs_info = trans->fs_info; 2629 struct btrfs_qgroup_extent_record *record; 2630 struct btrfs_delayed_ref_root *delayed_refs; 2631 struct ulist *new_roots = NULL; 2632 struct rb_node *node; 2633 u64 num_dirty_extents = 0; 2634 u64 qgroup_to_skip; 2635 int ret = 0; 2636 2637 delayed_refs = &trans->transaction->delayed_refs; 2638 qgroup_to_skip = delayed_refs->qgroup_to_skip; 2639 while ((node = rb_first(&delayed_refs->dirty_extent_root))) { 2640 record = rb_entry(node, struct btrfs_qgroup_extent_record, 2641 node); 2642 2643 num_dirty_extents++; 2644 trace_btrfs_qgroup_account_extents(fs_info, record); 2645 2646 if (!ret) { 2647 /* 2648 * Old roots should be searched when inserting qgroup 2649 * extent record 2650 */ 2651 if (WARN_ON(!record->old_roots)) { 2652 /* Search commit root to find old_roots */ 2653 ret = btrfs_find_all_roots(NULL, 
fs_info, 2654 record->bytenr, 0, 2655 &record->old_roots, false); 2656 if (ret < 0) 2657 goto cleanup; 2658 } 2659 2660 /* Free the reserved data space */ 2661 btrfs_qgroup_free_refroot(fs_info, 2662 record->data_rsv_refroot, 2663 record->data_rsv, 2664 BTRFS_QGROUP_RSV_DATA); 2665 /* 2666 * Use BTRFS_SEQ_LAST as time_seq to do special search, 2667 * which doesn't lock tree or delayed_refs and search 2668 * current root. It's safe inside commit_transaction(). 2669 */ 2670 ret = btrfs_find_all_roots(trans, fs_info, 2671 record->bytenr, BTRFS_SEQ_LAST, &new_roots, false); 2672 if (ret < 0) 2673 goto cleanup; 2674 if (qgroup_to_skip) { 2675 ulist_del(new_roots, qgroup_to_skip, 0); 2676 ulist_del(record->old_roots, qgroup_to_skip, 2677 0); 2678 } 2679 ret = btrfs_qgroup_account_extent(trans, record->bytenr, 2680 record->num_bytes, 2681 record->old_roots, 2682 new_roots); 2683 record->old_roots = NULL; 2684 new_roots = NULL; 2685 } 2686 cleanup: 2687 ulist_free(record->old_roots); 2688 ulist_free(new_roots); 2689 new_roots = NULL; 2690 rb_erase(node, &delayed_refs->dirty_extent_root); 2691 kfree(record); 2692 2693 } 2694 trace_qgroup_num_dirty_extents(fs_info, trans->transid, 2695 num_dirty_extents); 2696 return ret; 2697 } 2698 2699 /* 2700 * called from commit_transaction. Writes all changed qgroups to disk. 2701 */ 2702 int btrfs_run_qgroups(struct btrfs_trans_handle *trans) 2703 { 2704 struct btrfs_fs_info *fs_info = trans->fs_info; 2705 int ret = 0; 2706 2707 if (!fs_info->quota_root) 2708 return ret; 2709 2710 spin_lock(&fs_info->qgroup_lock); 2711 while (!list_empty(&fs_info->dirty_qgroups)) { 2712 struct btrfs_qgroup *qgroup; 2713 qgroup = list_first_entry(&fs_info->dirty_qgroups, 2714 struct btrfs_qgroup, dirty); 2715 list_del_init(&qgroup->dirty); 2716 spin_unlock(&fs_info->qgroup_lock); 2717 ret = update_qgroup_info_item(trans, qgroup); 2718 if (ret) 2719 fs_info->qgroup_flags |= 2720 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 2721 ret = update_qgroup_limit_item(trans, qgroup); 2722 if (ret) 2723 fs_info->qgroup_flags |= 2724 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 2725 spin_lock(&fs_info->qgroup_lock); 2726 } 2727 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2728 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON; 2729 else 2730 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; 2731 spin_unlock(&fs_info->qgroup_lock); 2732 2733 ret = update_qgroup_status_item(trans); 2734 if (ret) 2735 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 2736 2737 return ret; 2738 } 2739 2740 /* 2741 * Copy the accounting information between qgroups. This is necessary 2742 * when a snapshot or a subvolume is created. Throwing an error will 2743 * cause a transaction abort so we take extra care here to only error 2744 * when a readonly fs is a reasonable outcome. 2745 */ 2746 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, 2747 u64 objectid, struct btrfs_qgroup_inherit *inherit) 2748 { 2749 int ret = 0; 2750 int i; 2751 u64 *i_qgroups; 2752 bool committing = false; 2753 struct btrfs_fs_info *fs_info = trans->fs_info; 2754 struct btrfs_root *quota_root; 2755 struct btrfs_qgroup *srcgroup; 2756 struct btrfs_qgroup *dstgroup; 2757 bool need_rescan = false; 2758 u32 level_size = 0; 2759 u64 nums; 2760 2761 /* 2762 * There are only two callers of this function. 2763 * 2764 * One in create_subvol() in the ioctl context, which needs to hold 2765 * the qgroup_ioctl_lock. 
2766 * 2767 * The other one in create_pending_snapshot() where no other qgroup 2768 * code can modify the fs as they all need to either start a new trans 2769 * or hold a trans handle, thus we don't need to hold 2770 * qgroup_ioctl_lock. 2771 * This avoids a long and complex lock chain and makes lockdep happy. 2772 */ 2773 spin_lock(&fs_info->trans_lock); 2774 if (trans->transaction->state == TRANS_STATE_COMMIT_DOING) 2775 committing = true; 2776 spin_unlock(&fs_info->trans_lock); 2777 2778 if (!committing) 2779 mutex_lock(&fs_info->qgroup_ioctl_lock); 2780 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2781 goto out; 2782 2783 quota_root = fs_info->quota_root; 2784 if (!quota_root) { 2785 ret = -EINVAL; 2786 goto out; 2787 } 2788 2789 if (inherit) { 2790 i_qgroups = (u64 *)(inherit + 1); 2791 nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + 2792 2 * inherit->num_excl_copies; 2793 for (i = 0; i < nums; ++i) { 2794 srcgroup = find_qgroup_rb(fs_info, *i_qgroups); 2795 2796 /* 2797 * Zero out invalid groups so we can ignore 2798 * them later. 2799 */ 2800 if (!srcgroup || 2801 ((srcgroup->qgroupid >> 48) <= (objectid >> 48))) 2802 *i_qgroups = 0ULL; 2803 2804 ++i_qgroups; 2805 } 2806 } 2807 2808 /* 2809 * create a tracking group for the subvol itself 2810 */ 2811 ret = add_qgroup_item(trans, quota_root, objectid); 2812 if (ret) 2813 goto out; 2814 2815 /* 2816 * add qgroup to all inherited groups 2817 */ 2818 if (inherit) { 2819 i_qgroups = (u64 *)(inherit + 1); 2820 for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) { 2821 if (*i_qgroups == 0) 2822 continue; 2823 ret = add_qgroup_relation_item(trans, objectid, 2824 *i_qgroups); 2825 if (ret && ret != -EEXIST) 2826 goto out; 2827 ret = add_qgroup_relation_item(trans, *i_qgroups, 2828 objectid); 2829 if (ret && ret != -EEXIST) 2830 goto out; 2831 } 2832 ret = 0; 2833 } 2834 2835 2836 spin_lock(&fs_info->qgroup_lock); 2837 2838 dstgroup = add_qgroup_rb(fs_info, objectid); 2839 if (IS_ERR(dstgroup)) { 2840 ret = PTR_ERR(dstgroup); 2841 goto unlock; 2842 } 2843 2844 if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { 2845 dstgroup->lim_flags = inherit->lim.flags; 2846 dstgroup->max_rfer = inherit->lim.max_rfer; 2847 dstgroup->max_excl = inherit->lim.max_excl; 2848 dstgroup->rsv_rfer = inherit->lim.rsv_rfer; 2849 dstgroup->rsv_excl = inherit->lim.rsv_excl; 2850 2851 ret = update_qgroup_limit_item(trans, dstgroup); 2852 if (ret) { 2853 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 2854 btrfs_info(fs_info, 2855 "unable to update quota limit for %llu", 2856 dstgroup->qgroupid); 2857 goto unlock; 2858 } 2859 } 2860 2861 if (srcid) { 2862 srcgroup = find_qgroup_rb(fs_info, srcid); 2863 if (!srcgroup) 2864 goto unlock; 2865 2866 /* 2867 * We call inherit after we clone the root in order to make sure 2868 * our counts don't go crazy, so at this point the only 2869 * difference between the two roots should be the root node.
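 *
 * E.g. with a 16K nodesize (hypothetical): below, dstgroup copies
 * srcgroup's rfer unchanged, while excl on both sides becomes exactly
 * one tree block (level_size), because the snapshot shares everything
 * with the source except its freshly COWed root node.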
2870 */ 2871 level_size = fs_info->nodesize; 2872 dstgroup->rfer = srcgroup->rfer; 2873 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr; 2874 dstgroup->excl = level_size; 2875 dstgroup->excl_cmpr = level_size; 2876 srcgroup->excl = level_size; 2877 srcgroup->excl_cmpr = level_size; 2878 2879 /* inherit the limit info */ 2880 dstgroup->lim_flags = srcgroup->lim_flags; 2881 dstgroup->max_rfer = srcgroup->max_rfer; 2882 dstgroup->max_excl = srcgroup->max_excl; 2883 dstgroup->rsv_rfer = srcgroup->rsv_rfer; 2884 dstgroup->rsv_excl = srcgroup->rsv_excl; 2885 2886 qgroup_dirty(fs_info, dstgroup); 2887 qgroup_dirty(fs_info, srcgroup); 2888 } 2889 2890 if (!inherit) 2891 goto unlock; 2892 2893 i_qgroups = (u64 *)(inherit + 1); 2894 for (i = 0; i < inherit->num_qgroups; ++i) { 2895 if (*i_qgroups) { 2896 ret = add_relation_rb(fs_info, objectid, *i_qgroups); 2897 if (ret) 2898 goto unlock; 2899 } 2900 ++i_qgroups; 2901 2902 /* 2903 * If we're doing a snapshot, and adding the snapshot to a new 2904 * qgroup, the numbers are guaranteed to be incorrect. 2905 */ 2906 if (srcid) 2907 need_rescan = true; 2908 } 2909 2910 for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) { 2911 struct btrfs_qgroup *src; 2912 struct btrfs_qgroup *dst; 2913 2914 if (!i_qgroups[0] || !i_qgroups[1]) 2915 continue; 2916 2917 src = find_qgroup_rb(fs_info, i_qgroups[0]); 2918 dst = find_qgroup_rb(fs_info, i_qgroups[1]); 2919 2920 if (!src || !dst) { 2921 ret = -EINVAL; 2922 goto unlock; 2923 } 2924 2925 dst->rfer = src->rfer - level_size; 2926 dst->rfer_cmpr = src->rfer_cmpr - level_size; 2927 2928 /* Manually tweaking numbers certainly needs a rescan */ 2929 need_rescan = true; 2930 } 2931 for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) { 2932 struct btrfs_qgroup *src; 2933 struct btrfs_qgroup *dst; 2934 2935 if (!i_qgroups[0] || !i_qgroups[1]) 2936 continue; 2937 2938 src = find_qgroup_rb(fs_info, i_qgroups[0]); 2939 dst = find_qgroup_rb(fs_info, i_qgroups[1]); 2940 2941 if (!src || !dst) { 2942 ret = -EINVAL; 2943 goto unlock; 2944 } 2945 2946 dst->excl = src->excl + level_size; 2947 dst->excl_cmpr = src->excl_cmpr + level_size; 2948 need_rescan = true; 2949 } 2950 2951 unlock: 2952 spin_unlock(&fs_info->qgroup_lock); 2953 if (!ret) 2954 ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup); 2955 out: 2956 if (!committing) 2957 mutex_unlock(&fs_info->qgroup_ioctl_lock); 2958 if (need_rescan) 2959 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 2960 return ret; 2961 } 2962 2963 static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) 2964 { 2965 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && 2966 qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer) 2967 return false; 2968 2969 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && 2970 qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl) 2971 return false; 2972 2973 return true; 2974 } 2975 2976 static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, 2977 enum btrfs_qgroup_rsv_type type) 2978 { 2979 struct btrfs_qgroup *qgroup; 2980 struct btrfs_fs_info *fs_info = root->fs_info; 2981 u64 ref_root = root->root_key.objectid; 2982 int ret = 0; 2983 struct ulist_node *unode; 2984 struct ulist_iterator uiter; 2985 2986 if (!is_fstree(ref_root)) 2987 return 0; 2988 2989 if (num_bytes == 0) 2990 return 0; 2991 2992 if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) && 2993 capable(CAP_SYS_RESOURCE)) 2994 enforce = false; 2995 2996 spin_lock(&fs_info->qgroup_lock); 2997 if 
(!fs_info->quota_root) 2998 goto out; 2999 3000 qgroup = find_qgroup_rb(fs_info, ref_root); 3001 if (!qgroup) 3002 goto out; 3003 3004 /* 3005 * In the first step, we check all affected qgroups to see if any limits 3006 * would be exceeded 3007 */ 3008 ulist_reinit(fs_info->qgroup_ulist); 3009 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, 3010 qgroup_to_aux(qgroup), GFP_ATOMIC); 3011 if (ret < 0) 3012 goto out; 3013 ULIST_ITER_INIT(&uiter); 3014 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { 3015 struct btrfs_qgroup *qg; 3016 struct btrfs_qgroup_list *glist; 3017 3018 qg = unode_aux_to_qgroup(unode); 3019 3020 if (enforce && !qgroup_check_limits(qg, num_bytes)) { 3021 ret = -EDQUOT; 3022 goto out; 3023 } 3024 3025 list_for_each_entry(glist, &qg->groups, next_group) { 3026 ret = ulist_add(fs_info->qgroup_ulist, 3027 glist->group->qgroupid, 3028 qgroup_to_aux(glist->group), GFP_ATOMIC); 3029 if (ret < 0) 3030 goto out; 3031 } 3032 } 3033 ret = 0; 3034 /* 3035 * No limits exceeded, now record the reservation into all qgroups 3036 */ 3037 ULIST_ITER_INIT(&uiter); 3038 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { 3039 struct btrfs_qgroup *qg; 3040 3041 qg = unode_aux_to_qgroup(unode); 3042 3043 qgroup_rsv_add(fs_info, qg, num_bytes, type); 3044 } 3045 3046 out: 3047 spin_unlock(&fs_info->qgroup_lock); 3048 return ret; 3049 } 3050 3051 /* 3052 * Free @num_bytes of reserved space with @type for qgroup. (Normally level 0 3053 * qgroup). 3054 * 3055 * Will handle all higher-level qgroups too. 3056 * 3057 * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup. 3058 * This special case is only used for META_PERTRANS type. 3059 */ 3060 void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, 3061 u64 ref_root, u64 num_bytes, 3062 enum btrfs_qgroup_rsv_type type) 3063 { 3064 struct btrfs_qgroup *qgroup; 3065 struct ulist_node *unode; 3066 struct ulist_iterator uiter; 3067 int ret = 0; 3068 3069 if (!is_fstree(ref_root)) 3070 return; 3071 3072 if (num_bytes == 0) 3073 return; 3074 3075 if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) { 3076 WARN(1, "%s: Invalid type to free", __func__); 3077 return; 3078 } 3079 spin_lock(&fs_info->qgroup_lock); 3080 3081 if (!fs_info->quota_root) 3082 goto out; 3083 3084 qgroup = find_qgroup_rb(fs_info, ref_root); 3085 if (!qgroup) 3086 goto out; 3087 3088 if (num_bytes == (u64)-1) 3089 /* 3090 * We're freeing all pertrans rsv, get reserved value from 3091 * level 0 qgroup as real num_bytes to free. 3092 */ 3093 num_bytes = qgroup->rsv.values[type]; 3094 3095 ulist_reinit(fs_info->qgroup_ulist); 3096 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, 3097 qgroup_to_aux(qgroup), GFP_ATOMIC); 3098 if (ret < 0) 3099 goto out; 3100 ULIST_ITER_INIT(&uiter); 3101 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { 3102 struct btrfs_qgroup *qg; 3103 struct btrfs_qgroup_list *glist; 3104 3105 qg = unode_aux_to_qgroup(unode); 3106 3107 qgroup_rsv_release(fs_info, qg, num_bytes, type); 3108 3109 list_for_each_entry(glist, &qg->groups, next_group) { 3110 ret = ulist_add(fs_info->qgroup_ulist, 3111 glist->group->qgroupid, 3112 qgroup_to_aux(glist->group), GFP_ATOMIC); 3113 if (ret < 0) 3114 goto out; 3115 } 3116 } 3117 3118 out: 3119 spin_unlock(&fs_info->qgroup_lock); 3120 } 3121 3122 /* 3123 * Check if the leaf is the last leaf, which means all node pointers 3124 * are at their last position.
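 *
 * E.g. for a two-level tree (a sketch): the current leaf is the last
 * one iff path->slots[1] == btrfs_header_nritems(path->nodes[1]) - 1.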
3125 */ 3126 static bool is_last_leaf(struct btrfs_path *path) 3127 { 3128 int i; 3129 3130 for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { 3131 if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1) 3132 return false; 3133 } 3134 return true; 3135 } 3136 3137 /* 3138 * returns < 0 on error, 0 when more leaves are to be scanned. 3139 * returns 1 when done. 3140 */ 3141 static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, 3142 struct btrfs_path *path) 3143 { 3144 struct btrfs_fs_info *fs_info = trans->fs_info; 3145 struct btrfs_key found; 3146 struct extent_buffer *scratch_leaf = NULL; 3147 struct ulist *roots = NULL; 3148 u64 num_bytes; 3149 bool done; 3150 int slot; 3151 int ret; 3152 3153 mutex_lock(&fs_info->qgroup_rescan_lock); 3154 ret = btrfs_search_slot_for_read(fs_info->extent_root, 3155 &fs_info->qgroup_rescan_progress, 3156 path, 1, 0); 3157 3158 btrfs_debug(fs_info, 3159 "current progress key (%llu %u %llu), search_slot ret %d", 3160 fs_info->qgroup_rescan_progress.objectid, 3161 fs_info->qgroup_rescan_progress.type, 3162 fs_info->qgroup_rescan_progress.offset, ret); 3163 3164 if (ret) { 3165 /* 3166 * The rescan is about to end, we will not be scanning any 3167 * further blocks. We cannot unset the RESCAN flag here, because 3168 * we want to commit the transaction if everything went well. 3169 * To make the live accounting work in this phase, we set our 3170 * scan progress pointer such that every real extent objectid 3171 * will be smaller. 3172 */ 3173 fs_info->qgroup_rescan_progress.objectid = (u64)-1; 3174 btrfs_release_path(path); 3175 mutex_unlock(&fs_info->qgroup_rescan_lock); 3176 return ret; 3177 } 3178 done = is_last_leaf(path); 3179 3180 btrfs_item_key_to_cpu(path->nodes[0], &found, 3181 btrfs_header_nritems(path->nodes[0]) - 1); 3182 fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; 3183 3184 scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]); 3185 if (!scratch_leaf) { 3186 ret = -ENOMEM; 3187 mutex_unlock(&fs_info->qgroup_rescan_lock); 3188 goto out; 3189 } 3190 slot = path->slots[0]; 3191 btrfs_release_path(path); 3192 mutex_unlock(&fs_info->qgroup_rescan_lock); 3193 3194 for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { 3195 btrfs_item_key_to_cpu(scratch_leaf, &found, slot); 3196 if (found.type != BTRFS_EXTENT_ITEM_KEY && 3197 found.type != BTRFS_METADATA_ITEM_KEY) 3198 continue; 3199 if (found.type == BTRFS_METADATA_ITEM_KEY) 3200 num_bytes = fs_info->nodesize; 3201 else 3202 num_bytes = found.offset; 3203 3204 ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, 3205 &roots, false); 3206 if (ret < 0) 3207 goto out; 3208 /* For rescan, just pass old_roots as NULL */ 3209 ret = btrfs_qgroup_account_extent(trans, found.objectid, 3210 num_bytes, NULL, roots); 3211 if (ret < 0) 3212 goto out; 3213 } 3214 out: 3215 if (scratch_leaf) 3216 free_extent_buffer(scratch_leaf); 3217 3218 if (done && !ret) { 3219 ret = 1; 3220 fs_info->qgroup_rescan_progress.objectid = (u64)-1; 3221 } 3222 return ret; 3223 } 3224 3225 static bool rescan_should_stop(struct btrfs_fs_info *fs_info) 3226 { 3227 return btrfs_fs_closing(fs_info) || 3228 test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); 3229 } 3230 3231 static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) 3232 { 3233 struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info, 3234 qgroup_rescan_work); 3235 struct btrfs_path *path; 3236 struct btrfs_trans_handle *trans = NULL; 3237 int err = -ENOMEM; 3238 int ret = 0; 3239 bool stopped =
false; 3240 3241 path = btrfs_alloc_path(); 3242 if (!path) 3243 goto out; 3244 /* 3245 * Rescan should only search for commit root, and any later difference 3246 * should be recorded by qgroup 3247 */ 3248 path->search_commit_root = 1; 3249 path->skip_locking = 1; 3250 3251 err = 0; 3252 while (!err && !(stopped = rescan_should_stop(fs_info))) { 3253 trans = btrfs_start_transaction(fs_info->fs_root, 0); 3254 if (IS_ERR(trans)) { 3255 err = PTR_ERR(trans); 3256 break; 3257 } 3258 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 3259 err = -EINTR; 3260 } else { 3261 err = qgroup_rescan_leaf(trans, path); 3262 } 3263 if (err > 0) 3264 btrfs_commit_transaction(trans); 3265 else 3266 btrfs_end_transaction(trans); 3267 } 3268 3269 out: 3270 btrfs_free_path(path); 3271 3272 mutex_lock(&fs_info->qgroup_rescan_lock); 3273 if (err > 0 && 3274 fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { 3275 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 3276 } else if (err < 0) { 3277 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 3278 } 3279 mutex_unlock(&fs_info->qgroup_rescan_lock); 3280 3281 /* 3282 * only update status, since the previous part has already updated the 3283 * qgroup info. 3284 */ 3285 trans = btrfs_start_transaction(fs_info->quota_root, 1); 3286 if (IS_ERR(trans)) { 3287 err = PTR_ERR(trans); 3288 trans = NULL; 3289 btrfs_err(fs_info, 3290 "fail to start transaction for status update: %d", 3291 err); 3292 } 3293 3294 mutex_lock(&fs_info->qgroup_rescan_lock); 3295 if (!stopped) 3296 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3297 if (trans) { 3298 ret = update_qgroup_status_item(trans); 3299 if (ret < 0) { 3300 err = ret; 3301 btrfs_err(fs_info, "fail to update qgroup status: %d", 3302 err); 3303 } 3304 } 3305 fs_info->qgroup_rescan_running = false; 3306 complete_all(&fs_info->qgroup_rescan_completion); 3307 mutex_unlock(&fs_info->qgroup_rescan_lock); 3308 3309 if (!trans) 3310 return; 3311 3312 btrfs_end_transaction(trans); 3313 3314 if (stopped) { 3315 btrfs_info(fs_info, "qgroup scan paused"); 3316 } else if (err >= 0) { 3317 btrfs_info(fs_info, "qgroup scan completed%s", 3318 err > 0 ? " (inconsistency flag cleared)" : ""); 3319 } else { 3320 btrfs_err(fs_info, "qgroup scan failed with %d", err); 3321 } 3322 } 3323 3324 /* 3325 * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all 3326 * memory required for the rescan context. 
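 *
 * A sketch of the @init_flags contract, as used below: 0 means we are
 * resuming a rescan that was found on disk at mount time, so the RESCAN
 * status flag must already be set; any non-zero value means a fresh
 * rescan request, which fails with -EINPROGRESS if one is already
 * running and otherwise sets BTRFS_QGROUP_STATUS_FLAG_RESCAN itself.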
3327 */ 3328 static int 3329 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, 3330 int init_flags) 3331 { 3332 int ret = 0; 3333 3334 if (!init_flags) { 3335 /* we're resuming qgroup rescan at mount time */ 3336 if (!(fs_info->qgroup_flags & 3337 BTRFS_QGROUP_STATUS_FLAG_RESCAN)) { 3338 btrfs_warn(fs_info, 3339 "qgroup rescan init failed, qgroup rescan is not queued"); 3340 ret = -EINVAL; 3341 } else if (!(fs_info->qgroup_flags & 3342 BTRFS_QGROUP_STATUS_FLAG_ON)) { 3343 btrfs_warn(fs_info, 3344 "qgroup rescan init failed, qgroup is not enabled"); 3345 ret = -EINVAL; 3346 } 3347 3348 if (ret) 3349 return ret; 3350 } 3351 3352 mutex_lock(&fs_info->qgroup_rescan_lock); 3353 3354 if (init_flags) { 3355 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 3356 btrfs_warn(fs_info, 3357 "qgroup rescan is already in progress"); 3358 ret = -EINPROGRESS; 3359 } else if (!(fs_info->qgroup_flags & 3360 BTRFS_QGROUP_STATUS_FLAG_ON)) { 3361 btrfs_warn(fs_info, 3362 "qgroup rescan init failed, qgroup is not enabled"); 3363 ret = -EINVAL; 3364 } 3365 3366 if (ret) { 3367 mutex_unlock(&fs_info->qgroup_rescan_lock); 3368 return ret; 3369 } 3370 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3371 } 3372 3373 memset(&fs_info->qgroup_rescan_progress, 0, 3374 sizeof(fs_info->qgroup_rescan_progress)); 3375 fs_info->qgroup_rescan_progress.objectid = progress_objectid; 3376 init_completion(&fs_info->qgroup_rescan_completion); 3377 mutex_unlock(&fs_info->qgroup_rescan_lock); 3378 3379 btrfs_init_work(&fs_info->qgroup_rescan_work, 3380 btrfs_qgroup_rescan_worker, NULL, NULL); 3381 return 0; 3382 } 3383 3384 static void 3385 qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info) 3386 { 3387 struct rb_node *n; 3388 struct btrfs_qgroup *qgroup; 3389 3390 spin_lock(&fs_info->qgroup_lock); 3391 /* clear all current qgroup tracking information */ 3392 for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { 3393 qgroup = rb_entry(n, struct btrfs_qgroup, node); 3394 qgroup->rfer = 0; 3395 qgroup->rfer_cmpr = 0; 3396 qgroup->excl = 0; 3397 qgroup->excl_cmpr = 0; 3398 qgroup_dirty(fs_info, qgroup); 3399 } 3400 spin_unlock(&fs_info->qgroup_lock); 3401 } 3402 3403 int 3404 btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) 3405 { 3406 int ret = 0; 3407 struct btrfs_trans_handle *trans; 3408 3409 ret = qgroup_rescan_init(fs_info, 0, 1); 3410 if (ret) 3411 return ret; 3412 3413 /* 3414 * We have set the rescan_progress to 0, which means no more 3415 * delayed refs will be accounted by btrfs_qgroup_account_ref. 3416 * However, btrfs_qgroup_account_ref may be right after its call 3417 * to btrfs_find_all_roots, in which case it would still do the 3418 * accounting. 3419 * To solve this, we're committing the transaction, which will 3420 * ensure we run all delayed refs and only after that, we are 3421 * going to clear all tracking information for a clean start. 
3422 */ 3423 3424 trans = btrfs_join_transaction(fs_info->fs_root); 3425 if (IS_ERR(trans)) { 3426 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3427 return PTR_ERR(trans); 3428 } 3429 ret = btrfs_commit_transaction(trans); 3430 if (ret) { 3431 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3432 return ret; 3433 } 3434 3435 qgroup_rescan_zero_tracking(fs_info); 3436 3437 mutex_lock(&fs_info->qgroup_rescan_lock); 3438 fs_info->qgroup_rescan_running = true; 3439 btrfs_queue_work(fs_info->qgroup_rescan_workers, 3440 &fs_info->qgroup_rescan_work); 3441 mutex_unlock(&fs_info->qgroup_rescan_lock); 3442 3443 return 0; 3444 } 3445 3446 int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, 3447 bool interruptible) 3448 { 3449 int running; 3450 int ret = 0; 3451 3452 mutex_lock(&fs_info->qgroup_rescan_lock); 3453 running = fs_info->qgroup_rescan_running; 3454 mutex_unlock(&fs_info->qgroup_rescan_lock); 3455 3456 if (!running) 3457 return 0; 3458 3459 if (interruptible) 3460 ret = wait_for_completion_interruptible( 3461 &fs_info->qgroup_rescan_completion); 3462 else 3463 wait_for_completion(&fs_info->qgroup_rescan_completion); 3464 3465 return ret; 3466 } 3467 3468 /* 3469 * this is only called from open_ctree where we're still single threaded, thus 3470 * locking is omitted here. 3471 */ 3472 void 3473 btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) 3474 { 3475 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 3476 mutex_lock(&fs_info->qgroup_rescan_lock); 3477 fs_info->qgroup_rescan_running = true; 3478 btrfs_queue_work(fs_info->qgroup_rescan_workers, 3479 &fs_info->qgroup_rescan_work); 3480 mutex_unlock(&fs_info->qgroup_rescan_lock); 3481 } 3482 } 3483 3484 #define rbtree_iterate_from_safe(node, next, start) \ 3485 for (node = start; node && ({ next = rb_next(node); 1;}); node = next) 3486 3487 static int qgroup_unreserve_range(struct btrfs_inode *inode, 3488 struct extent_changeset *reserved, u64 start, 3489 u64 len) 3490 { 3491 struct rb_node *node; 3492 struct rb_node *next; 3493 struct ulist_node *entry; 3494 int ret = 0; 3495 3496 node = reserved->range_changed.root.rb_node; 3497 if (!node) 3498 return 0; 3499 while (node) { 3500 entry = rb_entry(node, struct ulist_node, rb_node); 3501 if (entry->val < start) 3502 node = node->rb_right; 3503 else 3504 node = node->rb_left; 3505 } 3506 3507 if (entry->val > start && rb_prev(&entry->rb_node)) 3508 entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node, 3509 rb_node); 3510 3511 rbtree_iterate_from_safe(node, next, &entry->rb_node) { 3512 u64 entry_start; 3513 u64 entry_end; 3514 u64 entry_len; 3515 int clear_ret; 3516 3517 entry = rb_entry(node, struct ulist_node, rb_node); 3518 entry_start = entry->val; 3519 entry_end = entry->aux; 3520 entry_len = entry_end - entry_start + 1; 3521 3522 if (entry_start >= start + len) 3523 break; 3524 if (entry_start + entry_len <= start) 3525 continue; 3526 /* 3527 * Now the entry is in [start, start + len), revert the 3528 * EXTENT_QGROUP_RESERVED bit. 
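 *
 * Note the clear below covers the whole entry, not just the overlap:
 * e.g. unreserving [4K, 8K) against a reserved entry [0, 16K)
 * (hypothetical ranges) clears and unaccounts the entire 16K entry,
 * which matches the full entry_len subtracted from bytes_changed.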
3529 */ 3530 clear_ret = clear_extent_bits(&inode->io_tree, entry_start, 3531 entry_end, EXTENT_QGROUP_RESERVED); 3532 if (!ret && clear_ret < 0) 3533 ret = clear_ret; 3534 3535 ulist_del(&reserved->range_changed, entry->val, entry->aux); 3536 if (likely(reserved->bytes_changed >= entry_len)) { 3537 reserved->bytes_changed -= entry_len; 3538 } else { 3539 WARN_ON(1); 3540 reserved->bytes_changed = 0; 3541 } 3542 } 3543 3544 return ret; 3545 } 3546 3547 /* 3548 * Try to free some space for qgroup. 3549 * 3550 * For qgroup, there are only 3 ways to free qgroup space: 3551 * - Flush nodatacow write 3552 * Any nodatacow write will free its reserved data space at run_delalloc_range(). 3553 * In theory, we should only flush nodatacow inodes, but it's not yet 3554 * possible, so we need to flush the whole root. 3555 * 3556 * - Wait for ordered extents 3557 * When ordered extents are finished, their reserved metadata is finally 3558 * converted to per_trans status, which can be freed by a later commit 3559 * transaction. 3560 * 3561 * - Commit transaction 3562 * This would free the meta_per_trans space. 3563 * In theory this shouldn't provide much space, but any extra qgroup space 3564 * is better than none. 3565 */ 3566 static int try_flush_qgroup(struct btrfs_root *root) 3567 { 3568 struct btrfs_trans_handle *trans; 3569 int ret; 3570 3571 /* Can't hold an open transaction or we run the risk of deadlocking. */ 3572 ASSERT(current->journal_info == NULL); 3573 if (WARN_ON(current->journal_info)) 3574 return 0; 3575 3576 /* 3577 * We don't want to run flush again and again, so if there is a running 3578 * one, we won't try to start a new flush, but exit directly. 3579 */ 3580 if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) { 3581 wait_event(root->qgroup_flush_wait, 3582 !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)); 3583 return 0; 3584 } 3585 3586 ret = btrfs_start_delalloc_snapshot(root, true); 3587 if (ret < 0) 3588 goto out; 3589 btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); 3590 3591 trans = btrfs_join_transaction(root); 3592 if (IS_ERR(trans)) { 3593 ret = PTR_ERR(trans); 3594 goto out; 3595 } 3596 3597 ret = btrfs_commit_transaction(trans); 3598 out: 3599 clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); 3600 wake_up(&root->qgroup_flush_wait); 3601 return ret; 3602 } 3603 3604 static int qgroup_reserve_data(struct btrfs_inode *inode, 3605 struct extent_changeset **reserved_ret, u64 start, 3606 u64 len) 3607 { 3608 struct btrfs_root *root = inode->root; 3609 struct extent_changeset *reserved; 3610 bool new_reserved = false; 3611 u64 orig_reserved; 3612 u64 to_reserve; 3613 int ret; 3614 3615 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || 3616 !is_fstree(root->root_key.objectid) || len == 0) 3617 return 0; 3618 3619 /* @reserved parameter is mandatory for qgroup */ 3620 if (WARN_ON(!reserved_ret)) 3621 return -EINVAL; 3622 if (!*reserved_ret) { 3623 new_reserved = true; 3624 *reserved_ret = extent_changeset_alloc(); 3625 if (!*reserved_ret) 3626 return -ENOMEM; 3627 } 3628 reserved = *reserved_ret; 3629 /* Record already reserved space */ 3630 orig_reserved = reserved->bytes_changed; 3631 ret = set_record_extent_bits(&inode->io_tree, start, 3632 start + len - 1, EXTENT_QGROUP_RESERVED, reserved); 3633 3634 /* Newly reserved space */ 3635 to_reserve = reserved->bytes_changed - orig_reserved; 3636 trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len, 3637 to_reserve, QGROUP_RESERVE); 3638 if (ret < 0) 3639 goto out; 3640 ret = qgroup_reserve(root,
to_reserve, true, BTRFS_QGROUP_RSV_DATA); 3641 if (ret < 0) 3642 goto cleanup; 3643 3644 return ret; 3645 3646 cleanup: 3647 qgroup_unreserve_range(inode, reserved, start, len); 3648 out: 3649 if (new_reserved) { 3650 extent_changeset_free(reserved); 3651 *reserved_ret = NULL; 3652 } 3653 return ret; 3654 } 3655 3656 /* 3657 * Reserve qgroup space for range [start, start + len). 3658 * 3659 * This function will either reserve space from related qgroups or do nothing 3660 * if the range is already reserved. 3661 * 3662 * Return 0 for successful reservation 3663 * Return <0 for error (including -EDQUOT) 3664 * 3665 * NOTE: This function may sleep for memory allocation, dirty page flushing and 3666 * commit transaction. So the caller should not hold any dirty page locked. 3667 */ 3668 int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, 3669 struct extent_changeset **reserved_ret, u64 start, 3670 u64 len) 3671 { 3672 int ret; 3673 3674 ret = qgroup_reserve_data(inode, reserved_ret, start, len); 3675 if (ret <= 0 && ret != -EDQUOT) 3676 return ret; 3677 3678 ret = try_flush_qgroup(inode->root); 3679 if (ret < 0) 3680 return ret; 3681 return qgroup_reserve_data(inode, reserved_ret, start, len); 3682 } 3683 3684 /* Free ranges specified by @reserved, normally in error path */ 3685 static int qgroup_free_reserved_data(struct btrfs_inode *inode, 3686 struct extent_changeset *reserved, u64 start, u64 len) 3687 { 3688 struct btrfs_root *root = inode->root; 3689 struct ulist_node *unode; 3690 struct ulist_iterator uiter; 3691 struct extent_changeset changeset; 3692 int freed = 0; 3693 int ret; 3694 3695 extent_changeset_init(&changeset); 3696 len = round_up(start + len, root->fs_info->sectorsize); 3697 start = round_down(start, root->fs_info->sectorsize); /* @len was rounded up to the aligned end above, convert it back to a length so the [start, start + len) checks below stay correct */ len -= start; 3698 3699 ULIST_ITER_INIT(&uiter); 3700 while ((unode = ulist_next(&reserved->range_changed, &uiter))) { 3701 u64 range_start = unode->val; 3702 /* unode->aux is the inclusive end */ 3703 u64 range_len = unode->aux - range_start + 1; 3704 u64 free_start; 3705 u64 free_len; 3706 3707 extent_changeset_release(&changeset); 3708 3709 /* Only free range in range [start, start + len) */ 3710 if (range_start >= start + len || 3711 range_start + range_len <= start) 3712 continue; 3713 free_start = max(range_start, start); 3714 free_len = min(start + len, range_start + range_len) - 3715 free_start; 3716 /* 3717 * TODO: Also modify reserved->ranges_reserved to reflect 3718 * the modification. 3719 * 3720 * However as long as we free qgroup reserved according to 3721 * EXTENT_QGROUP_RESERVED, we won't double free. 3722 * So no need to rush.
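 *
 * E.g. freeing [0, 4K) against a reserved entry [0, 16K) (hypothetical
 * ranges): free_start/free_len above clamp the clear to the overlap, so
 * only 4K is cleared and counted into @freed; the remaining 12K stays
 * reserved.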
3723 */ 3724 ret = clear_record_extent_bits(&inode->io_tree, free_start, 3725 free_start + free_len - 1, 3726 EXTENT_QGROUP_RESERVED, &changeset); 3727 if (ret < 0) 3728 goto out; 3729 freed += changeset.bytes_changed; 3730 } 3731 btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed, 3732 BTRFS_QGROUP_RSV_DATA); 3733 ret = freed; 3734 out: 3735 extent_changeset_release(&changeset); 3736 return ret; 3737 } 3738 3739 static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, 3740 struct extent_changeset *reserved, u64 start, u64 len, 3741 int free) 3742 { 3743 struct extent_changeset changeset; 3744 int trace_op = QGROUP_RELEASE; 3745 int ret; 3746 3747 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags)) 3748 return 0; 3749 3750 /* In release case, we shouldn't have @reserved */ 3751 WARN_ON(!free && reserved); 3752 if (free && reserved) 3753 return qgroup_free_reserved_data(inode, reserved, start, len); 3754 extent_changeset_init(&changeset); 3755 ret = clear_record_extent_bits(&inode->io_tree, start, start + len - 1, 3756 EXTENT_QGROUP_RESERVED, &changeset); 3757 if (ret < 0) 3758 goto out; 3759 3760 if (free) 3761 trace_op = QGROUP_FREE; 3762 trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len, 3763 changeset.bytes_changed, trace_op); 3764 if (free) 3765 btrfs_qgroup_free_refroot(inode->root->fs_info, 3766 inode->root->root_key.objectid, 3767 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); 3768 ret = changeset.bytes_changed; 3769 out: 3770 extent_changeset_release(&changeset); 3771 return ret; 3772 } 3773 3774 /* 3775 * Free a reserved space range from io_tree and related qgroups 3776 * 3777 * Should be called when a range of pages gets invalidated before reaching disk. 3778 * Or for the error cleanup case. 3779 * If @reserved is given, only the reserved range in [@start, @start + @len) 3780 * will be freed. 3781 * 3782 * For data written to disk, use btrfs_qgroup_release_data(). 3783 * 3784 * NOTE: This function may sleep for memory allocation. 3785 */ 3786 int btrfs_qgroup_free_data(struct btrfs_inode *inode, 3787 struct extent_changeset *reserved, u64 start, u64 len) 3788 { 3789 return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); 3790 } 3791 3792 /* 3793 * Release a reserved space range from io_tree only. 3794 * 3795 * Should be called when a range of pages gets written to disk and the 3796 * corresponding FILE_EXTENT is inserted into the corresponding root. 3797 * 3798 * Since the new qgroup accounting framework will only update qgroup numbers at 3799 * commit_transaction() time, its reserved space shouldn't be freed from 3800 * related qgroups. 3801 * 3802 * But we should release the range from io_tree, to allow further writes to be 3803 * COWed. 3804 * 3805 * NOTE: This function may sleep for memory allocation.
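 *
 * A sketch of the intended hand-over: the bytes released here are
 * expected to be carried by the matching qgroup extent record
 * (data_rsv) and finally dropped via btrfs_qgroup_free_refroot() in
 * btrfs_qgroup_account_extents() above, once the extent is accounted.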
3806 */ 3807 int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len) 3808 { 3809 return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); 3810 } 3811 3812 static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes, 3813 enum btrfs_qgroup_rsv_type type) 3814 { 3815 if (type != BTRFS_QGROUP_RSV_META_PREALLOC && 3816 type != BTRFS_QGROUP_RSV_META_PERTRANS) 3817 return; 3818 if (num_bytes == 0) 3819 return; 3820 3821 spin_lock(&root->qgroup_meta_rsv_lock); 3822 if (type == BTRFS_QGROUP_RSV_META_PREALLOC) 3823 root->qgroup_meta_rsv_prealloc += num_bytes; 3824 else 3825 root->qgroup_meta_rsv_pertrans += num_bytes; 3826 spin_unlock(&root->qgroup_meta_rsv_lock); 3827 } 3828 3829 static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes, 3830 enum btrfs_qgroup_rsv_type type) 3831 { 3832 if (type != BTRFS_QGROUP_RSV_META_PREALLOC && 3833 type != BTRFS_QGROUP_RSV_META_PERTRANS) 3834 return 0; 3835 if (num_bytes == 0) 3836 return 0; 3837 3838 spin_lock(&root->qgroup_meta_rsv_lock); 3839 if (type == BTRFS_QGROUP_RSV_META_PREALLOC) { 3840 num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc, 3841 num_bytes); 3842 root->qgroup_meta_rsv_prealloc -= num_bytes; 3843 } else { 3844 num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans, 3845 num_bytes); 3846 root->qgroup_meta_rsv_pertrans -= num_bytes; 3847 } 3848 spin_unlock(&root->qgroup_meta_rsv_lock); 3849 return num_bytes; 3850 } 3851 3852 int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, 3853 enum btrfs_qgroup_rsv_type type, bool enforce) 3854 { 3855 struct btrfs_fs_info *fs_info = root->fs_info; 3856 int ret; 3857 3858 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 3859 !is_fstree(root->root_key.objectid) || num_bytes == 0) 3860 return 0; 3861 3862 BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); 3863 trace_qgroup_meta_reserve(root, (s64)num_bytes, type); 3864 ret = qgroup_reserve(root, num_bytes, enforce, type); 3865 if (ret < 0) 3866 return ret; 3867 /* 3868 * Record what we have reserved into root. 3869 * 3870 * To avoid quota disabled->enabled underflow. 3871 * In that case, we may try to free space we haven't reserved 3872 * (since quota was disabled), so record what we reserved into root. 3873 * And ensure later release won't underflow this number. 
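 *
 * E.g. (hypothetical sequence): a META_PREALLOC reservation made while
 * quota was disabled records nothing here; if quota is enabled before
 * the matching free, sub_root_meta_rsv() clamps the free to what was
 * actually recorded, so neither the per-root counter nor the qgroup
 * rsv underflows.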
3874 */ 3875 add_root_meta_rsv(root, num_bytes, type); 3876 return ret; 3877 } 3878 3879 int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, 3880 enum btrfs_qgroup_rsv_type type, bool enforce) 3881 { 3882 int ret; 3883 3884 ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); 3885 if (ret <= 0 && ret != -EDQUOT) 3886 return ret; 3887 3888 ret = try_flush_qgroup(root); 3889 if (ret < 0) 3890 return ret; 3891 return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); 3892 } 3893 3894 void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) 3895 { 3896 struct btrfs_fs_info *fs_info = root->fs_info; 3897 3898 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 3899 !is_fstree(root->root_key.objectid)) 3900 return; 3901 3902 /* TODO: Update trace point to handle such free */ 3903 trace_qgroup_meta_free_all_pertrans(root); 3904 /* Special value -1 means to free all reserved space */ 3905 btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1, 3906 BTRFS_QGROUP_RSV_META_PERTRANS); 3907 } 3908 3909 void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, 3910 enum btrfs_qgroup_rsv_type type) 3911 { 3912 struct btrfs_fs_info *fs_info = root->fs_info; 3913 3914 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 3915 !is_fstree(root->root_key.objectid)) 3916 return; 3917 3918 /* 3919 * reservation for META_PREALLOC can happen before quota is enabled, 3920 * which can lead to underflow. 3921 * Here ensure we will only free what we really have reserved. 3922 */ 3923 num_bytes = sub_root_meta_rsv(root, num_bytes, type); 3924 BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); 3925 trace_qgroup_meta_reserve(root, -(s64)num_bytes, type); 3926 btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, 3927 num_bytes, type); 3928 } 3929 3930 static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, 3931 int num_bytes) 3932 { 3933 struct btrfs_qgroup *qgroup; 3934 struct ulist_node *unode; 3935 struct ulist_iterator uiter; 3936 int ret = 0; 3937 3938 if (num_bytes == 0) 3939 return; 3940 if (!fs_info->quota_root) 3941 return; 3942 3943 spin_lock(&fs_info->qgroup_lock); 3944 qgroup = find_qgroup_rb(fs_info, ref_root); 3945 if (!qgroup) 3946 goto out; 3947 ulist_reinit(fs_info->qgroup_ulist); 3948 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, 3949 qgroup_to_aux(qgroup), GFP_ATOMIC); 3950 if (ret < 0) 3951 goto out; 3952 ULIST_ITER_INIT(&uiter); 3953 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { 3954 struct btrfs_qgroup *qg; 3955 struct btrfs_qgroup_list *glist; 3956 3957 qg = unode_aux_to_qgroup(unode); 3958 3959 qgroup_rsv_release(fs_info, qg, num_bytes, 3960 BTRFS_QGROUP_RSV_META_PREALLOC); 3961 qgroup_rsv_add(fs_info, qg, num_bytes, 3962 BTRFS_QGROUP_RSV_META_PERTRANS); 3963 list_for_each_entry(glist, &qg->groups, next_group) { 3964 ret = ulist_add(fs_info->qgroup_ulist, 3965 glist->group->qgroupid, 3966 qgroup_to_aux(glist->group), GFP_ATOMIC); 3967 if (ret < 0) 3968 goto out; 3969 } 3970 } 3971 out: 3972 spin_unlock(&fs_info->qgroup_lock); 3973 } 3974 3975 void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) 3976 { 3977 struct btrfs_fs_info *fs_info = root->fs_info; 3978 3979 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 3980 !is_fstree(root->root_key.objectid)) 3981 return; 3982 /* Same as btrfs_qgroup_free_meta_prealloc() */ 3983 num_bytes = sub_root_meta_rsv(root, num_bytes, 3984 BTRFS_QGROUP_RSV_META_PREALLOC); 3985 
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
	    !is_fstree(root->root_key.objectid))
		return;
	/* Same as btrfs_qgroup_free_meta_prealloc() */
	num_bytes = sub_root_meta_rsv(root, num_bytes,
				      BTRFS_QGROUP_RSV_META_PREALLOC);
	trace_qgroup_meta_convert(root, num_bytes);
	qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
}

/*
 * Check for leaked qgroup reserved space, normally at inode destruction
 * time.
 */
void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
{
	struct extent_changeset changeset;
	struct ulist_node *unode;
	struct ulist_iterator iter;
	int ret;

	extent_changeset_init(&changeset);
	ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
				       EXTENT_QGROUP_RESERVED, &changeset);

	WARN_ON(ret < 0);
	if (WARN_ON(changeset.bytes_changed)) {
		ULIST_ITER_INIT(&iter);
		while ((unode = ulist_next(&changeset.range_changed, &iter))) {
			btrfs_warn(inode->root->fs_info,
		"leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
				   btrfs_ino(inode), unode->val, unode->aux);
		}
		btrfs_qgroup_free_refroot(inode->root->fs_info,
					  inode->root->root_key.objectid,
					  changeset.bytes_changed,
					  BTRFS_QGROUP_RSV_DATA);
	}
	extent_changeset_release(&changeset);
}

void btrfs_qgroup_init_swapped_blocks(
	struct btrfs_qgroup_swapped_blocks *swapped_blocks)
{
	int i;

	spin_lock_init(&swapped_blocks->lock);
	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
		swapped_blocks->blocks[i] = RB_ROOT;
	swapped_blocks->swapped = false;
}

/*
 * Delete all swapped block records of @root.
 * Every record here means we skipped a full subtree scan for qgroup.
 *
 * Gets called when committing one transaction.
 */
void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
{
	struct btrfs_qgroup_swapped_blocks *swapped_blocks;
	int i;

	swapped_blocks = &root->swapped_blocks;

	spin_lock(&swapped_blocks->lock);
	if (!swapped_blocks->swapped)
		goto out;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		struct rb_root *cur_root = &swapped_blocks->blocks[i];
		struct btrfs_qgroup_swapped_block *entry;
		struct btrfs_qgroup_swapped_block *next;

		rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
						     node)
			kfree(entry);
		swapped_blocks->blocks[i] = RB_ROOT;
	}
	swapped_blocks->swapped = false;
out:
	spin_unlock(&swapped_blocks->lock);
}
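/*
 * Lifecycle sketch (a summary for illustration, drawn only from the
 * functions in this file): swapped-block tracking for a subvolume root
 * goes through four stages:
 *
 *	btrfs_qgroup_init_swapped_blocks()	 at root setup
 *	btrfs_qgroup_add_swapped_blocks()	 relocation swaps a subtree
 *	btrfs_qgroup_trace_subtree_after_cow()	 COW consumes one record
 *	btrfs_qgroup_clean_swapped_blocks()	 transaction commit drops
 *						 whatever is left
 */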
/*
 * Add subtree root records into @subvol_root.
 *
 * @subvol_root:	tree root of the subvolume tree that got swapped
 * @bg:			block group under balance
 * @subvol_parent/slot:	pointer to the subtree root in the subvolume tree
 * @reloc_parent/slot:	pointer to the subtree root in the reloc tree
 *			BOTH POINTERS ARE BEFORE TREE SWAP
 * @last_snapshot:	last snapshot generation of the subvolume tree
 */
int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
		struct btrfs_root *subvol_root,
		struct btrfs_block_group *bg,
		struct extent_buffer *subvol_parent, int subvol_slot,
		struct extent_buffer *reloc_parent, int reloc_slot,
		u64 last_snapshot)
{
	struct btrfs_fs_info *fs_info = subvol_root->fs_info;
	struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
	struct btrfs_qgroup_swapped_block *block;
	struct rb_node **cur;
	struct rb_node *parent = NULL;
	int level = btrfs_header_level(subvol_parent) - 1;
	int ret = 0;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;

	if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
	    btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
		btrfs_err_rl(fs_info,
		"%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
			__func__,
			btrfs_node_ptr_generation(subvol_parent, subvol_slot),
			btrfs_node_ptr_generation(reloc_parent, reloc_slot));
		return -EUCLEAN;
	}

	block = kmalloc(sizeof(*block), GFP_NOFS);
	if (!block) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * @reloc_parent/slot is still before the swap, while @block is going
	 * to record the bytenr after the swap, so we do the swap here.
	 */
	block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
	block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
							     reloc_slot);
	block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
	block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
							    subvol_slot);
	block->last_snapshot = last_snapshot;
	block->level = level;

	/*
	 * If we have bg == NULL, we're called from btrfs_recover_relocation();
	 * no one else can modify tree blocks, thus the qgroup numbers will
	 * not change no matter the value of trace_leaf.
	 */
	if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
		block->trace_leaf = true;
	else
		block->trace_leaf = false;
	btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);

	/* Insert @block into @blocks */
	spin_lock(&blocks->lock);
	cur = &blocks->blocks[level].rb_node;
	while (*cur) {
		struct btrfs_qgroup_swapped_block *entry;

		parent = *cur;
		entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
				 node);

		if (entry->subvol_bytenr < block->subvol_bytenr) {
			cur = &(*cur)->rb_left;
		} else if (entry->subvol_bytenr > block->subvol_bytenr) {
			cur = &(*cur)->rb_right;
		} else {
			if (entry->subvol_generation !=
					block->subvol_generation ||
			    entry->reloc_bytenr != block->reloc_bytenr ||
			    entry->reloc_generation !=
					block->reloc_generation) {
				/*
				 * Duplicated but mismatched entry found.
				 * Shouldn't happen.
				 *
				 * Marking qgroup inconsistent should be enough
				 * for end users.
				 */
				WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
				ret = -EEXIST;
			}
			kfree(block);
			goto out_unlock;
		}
	}
	rb_link_node(&block->node, parent, cur);
	rb_insert_color(&block->node, &blocks->blocks[level]);
	blocks->swapped = true;
out_unlock:
	spin_unlock(&blocks->lock);
out:
	if (ret < 0)
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	return ret;
}
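/*
 * Worked example for the bytenr swap above (illustrative, bytenrs made up):
 * before relocation swaps the two tree blocks, the subvolume tree points at
 * a subtree root at bytenr X and the reloc tree at one at bytenr Y.  After
 * the swap the subvolume tree points at Y, which is why the record keys
 * block->subvol_bytenr by the *reloc* parent's blockptr: a later COW of the
 * block at Y can then look the record up under its new owner.
 */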
/*
 * Check if the tree block is a subtree root, and if so do the needed
 * delayed subtree tracing for qgroup.
 *
 * This is called during btrfs_cow_block().
 */
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct extent_buffer *subvol_eb)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
	struct btrfs_qgroup_swapped_block *block;
	struct extent_buffer *reloc_eb = NULL;
	struct rb_node *node;
	bool found = false;
	bool swapped = false;
	int level = btrfs_header_level(subvol_eb);
	int ret = 0;
	int i;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;
	if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
		return 0;

	spin_lock(&blocks->lock);
	if (!blocks->swapped) {
		spin_unlock(&blocks->lock);
		return 0;
	}
	node = blocks->blocks[level].rb_node;

	while (node) {
		block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
		if (block->subvol_bytenr < subvol_eb->start) {
			node = node->rb_left;
		} else if (block->subvol_bytenr > subvol_eb->start) {
			node = node->rb_right;
		} else {
			found = true;
			break;
		}
	}
	if (!found) {
		spin_unlock(&blocks->lock);
		goto out;
	}
	/* Found one, remove it from @blocks first and update blocks->swapped */
	rb_erase(&block->node, &blocks->blocks[level]);
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!RB_EMPTY_ROOT(&blocks->blocks[i])) {
			swapped = true;
			break;
		}
	}
	blocks->swapped = swapped;
	spin_unlock(&blocks->lock);

	/* Read out reloc subtree root */
	reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0,
				   block->reloc_generation, block->level,
				   &block->first_key);
	if (IS_ERR(reloc_eb)) {
		ret = PTR_ERR(reloc_eb);
		reloc_eb = NULL;
		goto free_out;
	}
	if (!extent_buffer_uptodate(reloc_eb)) {
		ret = -EIO;
		goto free_out;
	}

	ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
					block->last_snapshot,
					block->trace_leaf);
free_out:
	kfree(block);
	free_extent_buffer(reloc_eb);
out:
	if (ret < 0) {
		btrfs_err_rl(fs_info,
			     "failed to account subtree at bytenr %llu: %d",
			     subvol_eb->start, ret);
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	}
	return ret;
}

void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
{
	struct btrfs_qgroup_extent_record *entry;
	struct btrfs_qgroup_extent_record *next;
	struct rb_root *root;

	root = &trans->delayed_refs.dirty_extent_root;
	rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
		ulist_free(entry->old_roots);
		kfree(entry);
	}
}
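/*
 * Note on the teardown pattern above (illustration): the postorder walk is
 * what makes freeing while iterating safe.  The _safe variant caches the
 * next entry and visits children before their parent, so the tree is never
 * rebalanced or consulted again after an entry is freed, which is why no
 * rb_erase() is needed.  A minimal sketch of the same pattern for a
 * hypothetical struct foo { struct rb_node node; } tree rooted at
 * some_root:
 *
 *	struct foo *entry, *next;
 *
 *	rbtree_postorder_for_each_entry_safe(entry, next, &some_root, node)
 *		kfree(entry);
 *	some_root = RB_ROOT;
 */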