// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2011 STRATO. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/btrfs.h>
#include <linux/sched/mm.h>

#include "ctree.h"
#include "transaction.h"
#include "disk-io.h"
#include "locking.h"
#include "ulist.h"
#include "backref.h"
#include "extent_io.h"
#include "qgroup.h"
#include "block-group.h"
#include "sysfs.h"
#include "tree-mod-log.h"

/* TODO XXX FIXME
 *  - subvol delete -> delete when ref goes to 0? delete limits also?
 *  - reorganize keys
 *  - compressed
 *  - sync
 *  - copy also limits on subvol creation
 *  - limit
 *  - caches for ulists
 *  - performance benchmarks
 *  - check all ioctl parameters
 */

/*
 * Helpers to access qgroup reservation
 *
 * Callers should ensure the lock context and type are valid
 */

static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup)
{
	u64 ret = 0;
	int i;

	for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
		ret += qgroup->rsv.values[i];

	return ret;
}

#ifdef CONFIG_BTRFS_DEBUG
static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type)
{
	if (type == BTRFS_QGROUP_RSV_DATA)
		return "data";
	if (type == BTRFS_QGROUP_RSV_META_PERTRANS)
		return "meta_pertrans";
	if (type == BTRFS_QGROUP_RSV_META_PREALLOC)
		return "meta_prealloc";
	return NULL;
}
#endif

static void qgroup_rsv_add(struct btrfs_fs_info *fs_info,
			   struct btrfs_qgroup *qgroup, u64 num_bytes,
			   enum btrfs_qgroup_rsv_type type)
{
	trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type);
	qgroup->rsv.values[type] += num_bytes;
}

static void qgroup_rsv_release(struct btrfs_fs_info *fs_info,
			       struct btrfs_qgroup *qgroup, u64 num_bytes,
			       enum btrfs_qgroup_rsv_type type)
{
	trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type);
	if (qgroup->rsv.values[type] >= num_bytes) {
		qgroup->rsv.values[type] -= num_bytes;
		return;
	}
#ifdef CONFIG_BTRFS_DEBUG
	WARN_RATELIMIT(1,
		"qgroup %llu %s reserved space underflow, have %llu to free %llu",
		qgroup->qgroupid, qgroup_rsv_type_str(type),
		qgroup->rsv.values[type], num_bytes);
#endif
	qgroup->rsv.values[type] = 0;
}

static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info,
				     struct btrfs_qgroup *dest,
				     struct btrfs_qgroup *src)
{
	int i;

	for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
		qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i);
}

static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info,
					 struct btrfs_qgroup *dest,
					 struct btrfs_qgroup *src)
{
	int i;

	for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++)
		qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i);
}

static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
					   int mod)
{
	if (qg->old_refcnt < seq)
		qg->old_refcnt = seq;
	qg->old_refcnt += mod;
}

static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
					   int mod)
{
	if (qg->new_refcnt < seq)
		qg->new_refcnt = seq;
	qg->new_refcnt += mod;
}

static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
{
	if (qg->old_refcnt < seq)
		return 0;
	return qg->old_refcnt - seq;
}

static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
{
	if (qg->new_refcnt < seq)
		return 0;
	return qg->new_refcnt - seq;
}
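
/*
 * A minimal worked example of the sequence-based refcounts above (the
 * numbers are purely illustrative):
 *
 *   seq = 1000 when accounting of an extent starts, and qg->old_refcnt is
 *   still 3, a stale value from an earlier round (i.e. < seq).
 *
 *   btrfs_qgroup_update_old_refcnt(qg, 1000, 1) first snaps old_refcnt up
 *   to seq (1000) and then adds 1 -> 1001; a second call adds 1 -> 1002.
 *
 *   btrfs_qgroup_get_old_refcnt(qg, 1000) then returns 1002 - 1000 = 2,
 *   while a qgroup never touched in this round still has old_refcnt < seq
 *   and reads back as 0.  This is what lets the counters be reused without
 *   clearing them between accounting runs.
 */
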
/*
 * glue structure to represent the relations between qgroups.
 */
struct btrfs_qgroup_list {
	struct list_head next_group;
	struct list_head next_member;
	struct btrfs_qgroup *group;
	struct btrfs_qgroup *member;
};

static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
{
	return (u64)(uintptr_t)qg;
}

static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n)
{
	return (struct btrfs_qgroup *)(uintptr_t)n->aux;
}

static int
qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
		   int init_flags);
static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info);

/* must be called with qgroup_ioctl_lock held */
static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
					   u64 qgroupid)
{
	struct rb_node *n = fs_info->qgroup_tree.rb_node;
	struct btrfs_qgroup *qgroup;

	while (n) {
		qgroup = rb_entry(n, struct btrfs_qgroup, node);
		if (qgroup->qgroupid < qgroupid)
			n = n->rb_left;
		else if (qgroup->qgroupid > qgroupid)
			n = n->rb_right;
		else
			return qgroup;
	}
	return NULL;
}

/* must be called with qgroup_lock held */
static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
					  u64 qgroupid)
{
	struct rb_node **p = &fs_info->qgroup_tree.rb_node;
	struct rb_node *parent = NULL;
	struct btrfs_qgroup *qgroup;

	while (*p) {
		parent = *p;
		qgroup = rb_entry(parent, struct btrfs_qgroup, node);

		if (qgroup->qgroupid < qgroupid)
			p = &(*p)->rb_left;
		else if (qgroup->qgroupid > qgroupid)
			p = &(*p)->rb_right;
		else
			return qgroup;
	}

	qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC);
	if (!qgroup)
		return ERR_PTR(-ENOMEM);

	qgroup->qgroupid = qgroupid;
	INIT_LIST_HEAD(&qgroup->groups);
	INIT_LIST_HEAD(&qgroup->members);
	INIT_LIST_HEAD(&qgroup->dirty);

	rb_link_node(&qgroup->node, parent, p);
	rb_insert_color(&qgroup->node, &fs_info->qgroup_tree);

	return qgroup;
}

static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
			    struct btrfs_qgroup *qgroup)
{
	struct btrfs_qgroup_list *list;

	list_del(&qgroup->dirty);
	while (!list_empty(&qgroup->groups)) {
		list = list_first_entry(&qgroup->groups,
					struct btrfs_qgroup_list, next_group);
		list_del(&list->next_group);
		list_del(&list->next_member);
		kfree(list);
	}

	while (!list_empty(&qgroup->members)) {
		list = list_first_entry(&qgroup->members,
					struct btrfs_qgroup_list, next_member);
		list_del(&list->next_group);
		list_del(&list->next_member);
		kfree(list);
	}
}

/* must be called with qgroup_lock held */
static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
{
	struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);

	if (!qgroup)
		return -ENOENT;

	rb_erase(&qgroup->node, &fs_info->qgroup_tree);
	__del_qgroup_rb(fs_info, qgroup);
	return 0;
}

/* must be called with qgroup_lock held */
static int add_relation_rb(struct btrfs_fs_info *fs_info,
			   u64 memberid, u64 parentid)
{
	struct btrfs_qgroup *member;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup_list *list;

	member = find_qgroup_rb(fs_info, memberid);
	parent = find_qgroup_rb(fs_info, parentid);
	if (!member || !parent)
		return -ENOENT;

	list = kzalloc(sizeof(*list), GFP_ATOMIC);
	if (!list)
		return -ENOMEM;

	list->group = parent;
	list->member = member;
	list_add_tail(&list->next_group, &member->groups);
	list_add_tail(&list->next_member, &parent->members);

	return 0;
}
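
/*
 * A made-up example of how a relation is represented in memory: making
 * qgroup 0/257 a member of 1/100 allocates exactly one btrfs_qgroup_list
 * with ->member pointing at 0/257 and ->group pointing at 1/100.  The same
 * node is linked into both member->groups and parent->members, so the
 * relation can be walked from either side without a second allocation.
 */
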
/* must be called with qgroup_lock held */
static int del_relation_rb(struct btrfs_fs_info *fs_info,
			   u64 memberid, u64 parentid)
{
	struct btrfs_qgroup *member;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup_list *list;

	member = find_qgroup_rb(fs_info, memberid);
	parent = find_qgroup_rb(fs_info, parentid);
	if (!member || !parent)
		return -ENOENT;

	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			list_del(&list->next_group);
			list_del(&list->next_member);
			kfree(list);
			return 0;
		}
	}
	return -ENOENT;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
			       u64 rfer, u64 excl)
{
	struct btrfs_qgroup *qgroup;

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup)
		return -EINVAL;
	if (qgroup->rfer != rfer || qgroup->excl != excl)
		return -EINVAL;
	return 0;
}
#endif

/*
 * The full config is read in one go, only called from open_ctree().
 * It doesn't use any locking, as at this point we're still single-threaded.
 */
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
{
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_root *quota_root = fs_info->quota_root;
	struct btrfs_path *path = NULL;
	struct extent_buffer *l;
	int slot;
	int ret = 0;
	u64 flags = 0;
	u64 rescan_progress = 0;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;

	fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
	if (!fs_info->qgroup_ulist) {
		ret = -ENOMEM;
		goto out;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	ret = btrfs_sysfs_add_qgroups(fs_info);
	if (ret < 0)
		goto out;
	/* default this to quota off, in case no status key is found */
	fs_info->qgroup_flags = 0;

	/*
	 * pass 1: read status, all qgroup infos and limits
	 */
	key.objectid = 0;
	key.type = 0;
	key.offset = 0;
	ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1);
	if (ret)
		goto out;

	while (1) {
		struct btrfs_qgroup *qgroup;

		slot = path->slots[0];
		l = path->nodes[0];
		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.type == BTRFS_QGROUP_STATUS_KEY) {
			struct btrfs_qgroup_status_item *ptr;

			ptr = btrfs_item_ptr(l, slot,
					     struct btrfs_qgroup_status_item);

			if (btrfs_qgroup_status_version(l, ptr) !=
			    BTRFS_QGROUP_STATUS_VERSION) {
				btrfs_err(fs_info,
					  "old qgroup version, quota disabled");
				goto out;
			}
			if (btrfs_qgroup_status_generation(l, ptr) !=
			    fs_info->generation) {
				flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
				btrfs_err(fs_info,
					"qgroup generation mismatch, marked as inconsistent");
			}
			fs_info->qgroup_flags = btrfs_qgroup_status_flags(l,
									  ptr);
			rescan_progress = btrfs_qgroup_status_rescan(l, ptr);
			goto next1;
		}

		if (found_key.type != BTRFS_QGROUP_INFO_KEY &&
		    found_key.type != BTRFS_QGROUP_LIMIT_KEY)
			goto next1;

		qgroup = find_qgroup_rb(fs_info, found_key.offset);
		if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
		    (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
			btrfs_err(fs_info, "inconsistent qgroup config");
			flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
		}
		if (!qgroup) {
			qgroup = add_qgroup_rb(fs_info, found_key.offset);
			if (IS_ERR(qgroup)) {
				ret = PTR_ERR(qgroup);
				goto out;
			}
		}
		ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
		if (ret < 0)
			goto out;

		switch (found_key.type) {
		case BTRFS_QGROUP_INFO_KEY: {
			struct btrfs_qgroup_info_item *ptr;

			ptr = btrfs_item_ptr(l, slot,
					     struct btrfs_qgroup_info_item);
			qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr);
			qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr);
			qgroup->excl = btrfs_qgroup_info_excl(l, ptr);
			qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr);
			/* generation currently unused */
			break;
		}
		case BTRFS_QGROUP_LIMIT_KEY: {
			struct btrfs_qgroup_limit_item *ptr;

			ptr = btrfs_item_ptr(l, slot,
					     struct btrfs_qgroup_limit_item);
			qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr);
			qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr);
			qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr);
			qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr);
			qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr);
			break;
		}
		}
next1:
		ret = btrfs_next_item(quota_root, path);
		if (ret < 0)
			goto out;
		if (ret)
			break;
	}
	btrfs_release_path(path);

	/*
	 * pass 2: read all qgroup relations
	 */
	key.objectid = 0;
	key.type = BTRFS_QGROUP_RELATION_KEY;
	key.offset = 0;
	ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0);
	if (ret)
		goto out;
	while (1) {
		slot = path->slots[0];
		l = path->nodes[0];
		btrfs_item_key_to_cpu(l, &found_key, slot);

		if (found_key.type != BTRFS_QGROUP_RELATION_KEY)
			goto next2;

		if (found_key.objectid > found_key.offset) {
			/* parent <- member, not needed to build config */
			/* FIXME should we omit the key completely? */
			goto next2;
		}

		ret = add_relation_rb(fs_info, found_key.objectid,
				      found_key.offset);
		if (ret == -ENOENT) {
			btrfs_warn(fs_info,
				   "orphan qgroup relation 0x%llx->0x%llx",
				   found_key.objectid, found_key.offset);
			ret = 0;	/* ignore the error */
		}
		if (ret)
			goto out;
next2:
		ret = btrfs_next_item(quota_root, path);
		if (ret < 0)
			goto out;
		if (ret)
			break;
	}
out:
	btrfs_free_path(path);
	fs_info->qgroup_flags |= flags;
	if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON))
		clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
	else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN &&
		 ret >= 0)
		ret = qgroup_rescan_init(fs_info, rescan_progress, 0);

	if (ret < 0) {
		ulist_free(fs_info->qgroup_ulist);
		fs_info->qgroup_ulist = NULL;
		fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
		btrfs_sysfs_del_qgroups(fs_info);
	}

	return ret < 0 ? ret : 0;
}

/*
 * Called in close_ctree() when quota is still enabled. This verifies we don't
 * leak some reserved space.
 *
 * Return false if no reserved space is left.
 * Return true if some reserved space is leaked.
 */
bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
{
	struct rb_node *node;
	bool ret = false;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return ret;
	/*
	 * Since we're unmounting, there is no race and no need to grab qgroup
	 * lock. And here we don't go post-order to provide a more user-friendly
	 * sorted result.
	 */
	for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
		struct btrfs_qgroup *qgroup;
		int i;

		qgroup = rb_entry(node, struct btrfs_qgroup, node);
		for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
			if (qgroup->rsv.values[i]) {
				ret = true;
				btrfs_warn(fs_info,
		"qgroup %hu/%llu has unreleased space, type %d rsv %llu",
					   btrfs_qgroup_level(qgroup->qgroupid),
					   btrfs_qgroup_subvolid(qgroup->qgroupid),
					   i, qgroup->rsv.values[i]);
			}
		}
	}
	return ret;
}
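
/*
 * For illustration only (made-up values): a leaked 64K prealloc metadata
 * reservation on subvolume 257 would show up from the warning above as
 *
 *   qgroup 0/257 has unreleased space, type 2 rsv 65536
 *
 * where type 2 corresponds to BTRFS_QGROUP_RSV_META_PREALLOC.
 */
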
/*
 * This is called from close_ctree() or open_ctree() or btrfs_quota_disable().
 * The first two are in single-threaded paths. And for the third one, we have
 * set quota_root to be null with qgroup_lock held before, so it is safe to
 * clean up the in-memory structures without qgroup_lock held.
 */
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
{
	struct rb_node *n;
	struct btrfs_qgroup *qgroup;

	while ((n = rb_first(&fs_info->qgroup_tree))) {
		qgroup = rb_entry(n, struct btrfs_qgroup, node);
		rb_erase(n, &fs_info->qgroup_tree);
		__del_qgroup_rb(fs_info, qgroup);
		btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
		kfree(qgroup);
	}
	/*
	 * We call btrfs_free_qgroup_config() when unmounting
	 * filesystem and disabling quota, so we set qgroup_ulist
	 * to be null here to avoid double free.
	 */
	ulist_free(fs_info->qgroup_ulist);
	fs_info->qgroup_ulist = NULL;
	btrfs_sysfs_del_qgroups(fs_info);
}

static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
				    u64 dst)
{
	int ret;
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = src;
	key.type = BTRFS_QGROUP_RELATION_KEY;
	key.offset = dst;

	ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0);

	btrfs_mark_buffer_dirty(path->nodes[0]);

	btrfs_free_path(path);
	return ret;
}

static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src,
				    u64 dst)
{
	int ret;
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = src;
	key.type = BTRFS_QGROUP_RELATION_KEY;
	key.offset = dst;

	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, quota_root, path);
out:
	btrfs_free_path(path);
	return ret;
}
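
/*
 * A quick sketch of the on-disk key layout in the quota tree, as used by the
 * item helpers around here:
 *
 *   (0,   BTRFS_QGROUP_STATUS_KEY,   0)         global status item
 *   (0,   BTRFS_QGROUP_INFO_KEY,     qgroupid)  usage counters
 *   (0,   BTRFS_QGROUP_LIMIT_KEY,    qgroupid)  limits
 *   (src, BTRFS_QGROUP_RELATION_KEY, dst)       one item per direction, i.e.
 *                                               stored for both src->dst and
 *                                               dst->src
 *
 * All of these items live in the dedicated quota root (fs_info->quota_root).
 */
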
static int add_qgroup_item(struct btrfs_trans_handle *trans,
			   struct btrfs_root *quota_root, u64 qgroupid)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_qgroup_info_item *qgroup_info;
	struct btrfs_qgroup_limit_item *qgroup_limit;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	if (btrfs_is_testing(quota_root->fs_info))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroupid;

	/*
	 * Avoid a transaction abort by catching -EEXIST here. In that case, we
	 * proceed by re-initializing the existing structure on disk.
	 */

	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
				      sizeof(*qgroup_info));
	if (ret && ret != -EEXIST)
		goto out;

	leaf = path->nodes[0];
	qgroup_info = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_qgroup_info_item);
	btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid);
	btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0);
	btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0);
	btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0);
	btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0);

	btrfs_mark_buffer_dirty(leaf);

	btrfs_release_path(path);

	key.type = BTRFS_QGROUP_LIMIT_KEY;
	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
				      sizeof(*qgroup_limit));
	if (ret && ret != -EEXIST)
		goto out;

	leaf = path->nodes[0];
	qgroup_limit = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_qgroup_limit_item);
	btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0);
	btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0);

	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	int ret;
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroupid;
	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, quota_root, path);
	if (ret)
		goto out;

	btrfs_release_path(path);

	key.type = BTRFS_QGROUP_LIMIT_KEY;
	ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, quota_root, path);

out:
	btrfs_free_path(path);
	return ret;
}

static int update_qgroup_limit_item(struct btrfs_trans_handle *trans,
				    struct btrfs_qgroup *qgroup)
{
	struct btrfs_root *quota_root = trans->fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *l;
	struct btrfs_qgroup_limit_item *qgroup_limit;
	int ret;
	int slot;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_LIMIT_KEY;
	key.offset = qgroup->qgroupid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;

	if (ret)
		goto out;

	l = path->nodes[0];
	slot = path->slots[0];
	qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item);
	btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags);
	btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer);
	btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl);
	btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer);
	btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl);

	btrfs_mark_buffer_dirty(l);

out:
	btrfs_free_path(path);
	return ret;
}

static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
				   struct btrfs_qgroup *qgroup)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root = fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *l;
	struct btrfs_qgroup_info_item *qgroup_info;
	int ret;
	int slot;

	if (btrfs_is_testing(fs_info))
		return 0;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_INFO_KEY;
	key.offset = qgroup->qgroupid;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;

	if (ret)
		goto out;

	l = path->nodes[0];
	slot = path->slots[0];
	qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item);
	btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid);
	btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer);
	btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr);
	btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl);
	btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr);

	btrfs_mark_buffer_dirty(l);

out:
	btrfs_free_path(path);
	return ret;
}

static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root = fs_info->quota_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *l;
	struct btrfs_qgroup_status_item *ptr;
	int ret;
	int slot;

	key.objectid = 0;
	key.type = BTRFS_QGROUP_STATUS_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
	if (ret > 0)
		ret = -ENOENT;

	if (ret)
		goto out;

	l = path->nodes[0];
	slot = path->slots[0];
	ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item);
	btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags);
	btrfs_set_qgroup_status_generation(l, ptr, trans->transid);
	btrfs_set_qgroup_status_rescan(l, ptr,
				fs_info->qgroup_rescan_progress.objectid);

	btrfs_mark_buffer_dirty(l);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * called with qgroup_lock held
 */
static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *leaf = NULL;
	int ret;
	int nr = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = 0;
	key.offset = 0;
	key.type = 0;

	while (1) {
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0)
			goto out;
		leaf = path->nodes[0];
		nr = btrfs_header_nritems(leaf);
		if (!nr)
			break;
		/*
		 * Delete each leaf, one by one, since the whole tree is
		 * going to be deleted.
		 */
		path->slots[0] = 0;
		ret = btrfs_del_items(trans, root, path, 0, nr);
		if (ret)
			goto out;

		btrfs_release_path(path);
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *quota_root;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_path *path = NULL;
	struct btrfs_qgroup_status_item *ptr;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_qgroup *qgroup = NULL;
	struct btrfs_trans_handle *trans = NULL;
	struct ulist *ulist = NULL;
	int ret = 0;
	int slot;

	/*
	 * We need to have subvol_sem write locked, to prevent races between
	 * concurrent tasks trying to enable quotas, because we will unlock
	 * and relock qgroup_ioctl_lock before setting fs_info->quota_root
	 * and before setting BTRFS_FS_QUOTA_ENABLED.
	 */
	lockdep_assert_held_write(&fs_info->subvol_sem);

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (fs_info->quota_root)
		goto out;

	ulist = ulist_alloc(GFP_KERNEL);
	if (!ulist) {
		ret = -ENOMEM;
		goto out;
	}

	ret = btrfs_sysfs_add_qgroups(fs_info);
	if (ret < 0)
		goto out;

	/*
	 * Unlock qgroup_ioctl_lock before starting the transaction. This is to
	 * avoid lock acquisition inversion problems (reported by lockdep) between
	 * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
	 * start a transaction.
	 * After we started the transaction lock qgroup_ioctl_lock again and
	 * check if someone else created the quota root in the meanwhile. If so,
	 * just return success and release the transaction handle.
	 *
	 * Also we don't need to worry about someone else calling
	 * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
	 * that function returns 0 (success) when the sysfs entries already exist.
	 */
	mutex_unlock(&fs_info->qgroup_ioctl_lock);

	/*
	 * 1 for quota root item
	 * 1 for BTRFS_QGROUP_STATUS item
	 *
	 * Yet we also need 2*n items for QGROUP_INFO/QGROUP_LIMIT items
	 * per subvolume. However those are not currently reserved since it
	 * would be a lot of overkill.
	 */
	trans = btrfs_start_transaction(tree_root, 2);

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		goto out;
	}

	if (fs_info->quota_root)
		goto out;

	fs_info->qgroup_ulist = ulist;
	ulist = NULL;

	/*
	 * initially create the quota tree
	 */
	quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID);
	if (IS_ERR(quota_root)) {
		ret = PTR_ERR(quota_root);
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		btrfs_abort_transaction(trans, ret);
		goto out_free_root;
	}

	key.objectid = 0;
	key.type = BTRFS_QGROUP_STATUS_KEY;
	key.offset = 0;

	ret = btrfs_insert_empty_item(trans, quota_root, path, &key,
				      sizeof(*ptr));
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	leaf = path->nodes[0];
	ptr = btrfs_item_ptr(leaf, path->slots[0],
			     struct btrfs_qgroup_status_item);
	btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid);
	btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION);
	fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON |
				BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags);
	btrfs_set_qgroup_status_rescan(leaf, ptr, 0);

	btrfs_mark_buffer_dirty(leaf);

	key.objectid = 0;
	key.type = BTRFS_ROOT_REF_KEY;
	key.offset = 0;

	btrfs_release_path(path);
	ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0);
	if (ret > 0)
		goto out_add_root;
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	while (1) {
		slot = path->slots[0];
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		if (found_key.type == BTRFS_ROOT_REF_KEY) {

			/* Release locks on tree_root before we access quota_root */
			btrfs_release_path(path);

			ret = add_qgroup_item(trans, quota_root,
					      found_key.offset);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}

			qgroup = add_qgroup_rb(fs_info, found_key.offset);
			if (IS_ERR(qgroup)) {
				ret = PTR_ERR(qgroup);
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}
			ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}
			ret = btrfs_search_slot_for_read(tree_root, &found_key,
							 path, 1, 0);
			if (ret < 0) {
				btrfs_abort_transaction(trans, ret);
				goto out_free_path;
			}
			if (ret > 0) {
				/*
				 * Shouldn't happen, but in case it does we
				 * don't need to do the btrfs_next_item, just
				 * continue.
				 */
				continue;
			}
		}
		ret = btrfs_next_item(tree_root, path);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			goto out_free_path;
		}
		if (ret)
			break;
	}

out_add_root:
	btrfs_release_path(path);
	ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID);
	if (IS_ERR(qgroup)) {
		ret = PTR_ERR(qgroup);
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}
	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		goto out_free_path;
	}

	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	/*
	 * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
	 * a deadlock with tasks concurrently doing other qgroup operations, such
	 * as adding/removing qgroups or adding/deleting qgroup relations for
	 * example, because all qgroup operations first start or join a
	 * transaction and then lock the qgroup_ioctl_lock mutex.
	 * We are safe from a concurrent task trying to enable quotas, by calling
	 * this function, since we are serialized by fs_info->subvol_sem.
	 */
	ret = btrfs_commit_transaction(trans);
	trans = NULL;
	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (ret)
		goto out_free_path;

	/*
	 * Set quota enabled flag after committing the transaction, to avoid
	 * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot
	 * creation.
	 */
	spin_lock(&fs_info->qgroup_lock);
	fs_info->quota_root = quota_root;
	set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
	spin_unlock(&fs_info->qgroup_lock);

	ret = qgroup_rescan_init(fs_info, 0, 1);
	if (!ret) {
		qgroup_rescan_zero_tracking(fs_info);
		fs_info->qgroup_rescan_running = true;
		btrfs_queue_work(fs_info->qgroup_rescan_workers,
				 &fs_info->qgroup_rescan_work);
	}

out_free_path:
	btrfs_free_path(path);
out_free_root:
	if (ret)
		btrfs_put_root(quota_root);
out:
	if (ret) {
		ulist_free(fs_info->qgroup_ulist);
		fs_info->qgroup_ulist = NULL;
		btrfs_sysfs_del_qgroups(fs_info);
	}
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	if (ret && trans)
		btrfs_end_transaction(trans);
	else if (trans)
		ret = btrfs_end_transaction(trans);
	ulist_free(ulist);
	return ret;
}

int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *quota_root;
	struct btrfs_trans_handle *trans = NULL;
	int ret = 0;

	/*
	 * We need to have subvol_sem write locked, to prevent races between
	 * concurrent tasks trying to disable quotas, because we will unlock
	 * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes.
	 */
	lockdep_assert_held_write(&fs_info->subvol_sem);

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root)
		goto out;

	/*
	 * Request qgroup rescan worker to complete and wait for it. This wait
	 * must be done before transaction start for quota disable since it may
	 * deadlock with transaction by the qgroup rescan worker.
	 */
	clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
	btrfs_qgroup_wait_for_completion(fs_info, false);
	mutex_unlock(&fs_info->qgroup_ioctl_lock);

	/*
	 * 1 For the root item
	 *
	 * We should also reserve enough items for the quota tree deletion in
	 * btrfs_clean_quota_tree but this is not done.
	 *
	 * Also, we must always start a transaction without holding the mutex
	 * qgroup_ioctl_lock, see btrfs_quota_enable().
	 */
	trans = btrfs_start_transaction(fs_info->tree_root, 1);

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
		goto out;
	}

	if (!fs_info->quota_root)
		goto out;

	spin_lock(&fs_info->qgroup_lock);
	quota_root = fs_info->quota_root;
	fs_info->quota_root = NULL;
	fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
	spin_unlock(&fs_info->qgroup_lock);

	btrfs_free_qgroup_config(fs_info);

	ret = btrfs_clean_quota_tree(trans, quota_root);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	ret = btrfs_del_root(trans, &quota_root->root_key);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	list_del(&quota_root->dirty_list);

	btrfs_tree_lock(quota_root->node);
	btrfs_clean_tree_block(quota_root->node);
	btrfs_tree_unlock(quota_root->node);
	btrfs_free_tree_block(trans, btrfs_root_id(quota_root),
			      quota_root->node, 0, 1);

	btrfs_put_root(quota_root);

out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	if (ret && trans)
		btrfs_end_transaction(trans);
	else if (trans)
		ret = btrfs_end_transaction(trans);

	return ret;
}

static void qgroup_dirty(struct btrfs_fs_info *fs_info,
			 struct btrfs_qgroup *qgroup)
{
	if (list_empty(&qgroup->dirty))
		list_add(&qgroup->dirty, &fs_info->dirty_qgroups);
}

/*
 * The easy accounting, we're updating qgroup relationship whose child qgroup
 * only has exclusive extents.
 *
 * In this case, all exclusive extents will also be exclusive for the parent,
 * so excl/rfer just get added/removed.
 *
 * The same goes for qgroup reservation space, which should also be
 * added/removed to the parent.  Otherwise, when the child later releases its
 * reservation, the parent would underflow its reservation (for the
 * relationship-adding case).
 *
 * Caller should hold fs_info->qgroup_lock.
 */
static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
				    struct ulist *tmp, u64 ref_root,
				    struct btrfs_qgroup *src, int sign)
{
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup_list *glist;
	struct ulist_node *unode;
	struct ulist_iterator uiter;
	u64 num_bytes = src->excl;
	int ret = 0;

	qgroup = find_qgroup_rb(fs_info, ref_root);
	if (!qgroup)
		goto out;

	qgroup->rfer += sign * num_bytes;
	qgroup->rfer_cmpr += sign * num_bytes;

	WARN_ON(sign < 0 && qgroup->excl < num_bytes);
	qgroup->excl += sign * num_bytes;
	qgroup->excl_cmpr += sign * num_bytes;

	if (sign > 0)
		qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
	else
		qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);

	qgroup_dirty(fs_info, qgroup);

	/* Get all of the parent groups that contain this qgroup */
	list_for_each_entry(glist, &qgroup->groups, next_group) {
		ret = ulist_add(tmp, glist->group->qgroupid,
				qgroup_to_aux(glist->group), GFP_ATOMIC);
		if (ret < 0)
			goto out;
	}

	/* Iterate all of the parents and adjust their reference counts */
	ULIST_ITER_INIT(&uiter);
	while ((unode = ulist_next(tmp, &uiter))) {
		qgroup = unode_aux_to_qgroup(unode);
		qgroup->rfer += sign * num_bytes;
		qgroup->rfer_cmpr += sign * num_bytes;
		WARN_ON(sign < 0 && qgroup->excl < num_bytes);
		qgroup->excl += sign * num_bytes;
		if (sign > 0)
			qgroup_rsv_add_by_qgroup(fs_info, qgroup, src);
		else
			qgroup_rsv_release_by_qgroup(fs_info, qgroup, src);
		qgroup->excl_cmpr += sign * num_bytes;
		qgroup_dirty(fs_info, qgroup);

		/* Add any parents of the parents */
		list_for_each_entry(glist, &qgroup->groups, next_group) {
			ret = ulist_add(tmp, glist->group->qgroupid,
					qgroup_to_aux(glist->group), GFP_ATOMIC);
			if (ret < 0)
				goto out;
		}
	}
	ret = 0;
out:
	return ret;
}

/*
 * Quick path for updating qgroup with only excl refs.
 *
 * In that case, just updating all parents will be enough.
 * Otherwise we need to do a full rescan.
 * Caller should also hold fs_info->qgroup_lock.
 *
 * Return 0 for quick update, return >0 when a full rescan is needed
 * and the INCONSISTENT flag gets set.
 * Return < 0 for other error.
 */
static int quick_update_accounting(struct btrfs_fs_info *fs_info,
				   struct ulist *tmp, u64 src, u64 dst,
				   int sign)
{
	struct btrfs_qgroup *qgroup;
	int ret = 1;
	int err = 0;

	qgroup = find_qgroup_rb(fs_info, src);
	if (!qgroup)
		goto out;
	if (qgroup->excl == qgroup->rfer) {
		ret = 0;
		err = __qgroup_excl_accounting(fs_info, tmp, dst,
					       qgroup, sign);
		if (err < 0) {
			ret = err;
			goto out;
		}
	}
out:
	if (ret)
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	return ret;
}
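
/*
 * A worked example for the exclusive-only fast path above (the numbers are
 * just for illustration): assume qgroup 0/257 has rfer == excl == 1M, i.e.
 * it shares nothing, and it gets assigned to parent 1/100.
 *
 * quick_update_accounting() sees excl == rfer and calls
 * __qgroup_excl_accounting() with sign == 1, so 1/100 (and any of its own
 * parents) simply has rfer/excl increased by 1M, with the child's per-type
 * reservations added on top; no full rescan is needed.  If 0/257 did share
 * extents (excl != rfer), the function instead sets
 * BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT and returns 1, telling the caller
 * that only a rescan can make the numbers correct again.
 */
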
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
			      u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup *member;
	struct btrfs_qgroup_list *list;
	struct ulist *tmp;
	unsigned int nofs_flag;
	int ret = 0;

	/* Check the level of src and dst first */
	if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
		return -EINVAL;

	/* We hold a transaction handle open, must do a NOFS allocation. */
	nofs_flag = memalloc_nofs_save();
	tmp = ulist_alloc(GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);
	if (!tmp)
		return -ENOMEM;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}
	member = find_qgroup_rb(fs_info, src);
	parent = find_qgroup_rb(fs_info, dst);
	if (!member || !parent) {
		ret = -EINVAL;
		goto out;
	}

	/* Check if such a qgroup relation already exists */
	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			ret = -EEXIST;
			goto out;
		}
	}

	ret = add_qgroup_relation_item(trans, src, dst);
	if (ret)
		goto out;

	ret = add_qgroup_relation_item(trans, dst, src);
	if (ret) {
		del_qgroup_relation_item(trans, src, dst);
		goto out;
	}

	spin_lock(&fs_info->qgroup_lock);
	ret = add_relation_rb(fs_info, src, dst);
	if (ret < 0) {
		spin_unlock(&fs_info->qgroup_lock);
		goto out;
	}
	ret = quick_update_accounting(fs_info, tmp, src, dst, 1);
	spin_unlock(&fs_info->qgroup_lock);
out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	ulist_free(tmp);
	return ret;
}

static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
				 u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *parent;
	struct btrfs_qgroup *member;
	struct btrfs_qgroup_list *list;
	struct ulist *tmp;
	bool found = false;
	unsigned int nofs_flag;
	int ret = 0;
	int ret2;

	/* We hold a transaction handle open, must do a NOFS allocation. */
	nofs_flag = memalloc_nofs_save();
	tmp = ulist_alloc(GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);
	if (!tmp)
		return -ENOMEM;

	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	member = find_qgroup_rb(fs_info, src);
	parent = find_qgroup_rb(fs_info, dst);
	/*
	 * If the parent/member pair doesn't exist, just try to delete the
	 * stale relation items.
	 */
	if (!member || !parent)
		goto delete_item;

	/* Check if such a qgroup relation already exists */
	list_for_each_entry(list, &member->groups, next_group) {
		if (list->group == parent) {
			found = true;
			break;
		}
	}

delete_item:
	ret = del_qgroup_relation_item(trans, src, dst);
	if (ret < 0 && ret != -ENOENT)
		goto out;
	ret2 = del_qgroup_relation_item(trans, dst, src);
	if (ret2 < 0 && ret2 != -ENOENT)
		goto out;

	/* At least one deletion succeeded, return 0 */
	if (!ret || !ret2)
		ret = 0;

	if (found) {
		spin_lock(&fs_info->qgroup_lock);
		del_relation_rb(fs_info, src, dst);
		ret = quick_update_accounting(fs_info, tmp, src, dst, -1);
		spin_unlock(&fs_info->qgroup_lock);
	}
out:
	ulist_free(tmp);
	return ret;
}

int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
			      u64 dst)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	ret = __del_qgroup_relation(trans, src, dst);
	mutex_unlock(&fs_info->qgroup_ioctl_lock);

	return ret;
}

int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *quota_root;
	struct btrfs_qgroup *qgroup;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}
	quota_root = fs_info->quota_root;
	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (qgroup) {
		ret = -EEXIST;
		goto out;
	}

	ret = add_qgroup_item(trans, quota_root, qgroupid);
	if (ret)
		goto out;

	spin_lock(&fs_info->qgroup_lock);
	qgroup = add_qgroup_rb(fs_info, qgroupid);
	spin_unlock(&fs_info->qgroup_lock);

	if (IS_ERR(qgroup)) {
		ret = PTR_ERR(qgroup);
		goto out;
	}
	ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup);
out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}

int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *qgroup;
	struct btrfs_qgroup_list *list;
	int ret = 0;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup) {
		ret = -ENOENT;
		goto out;
	}

	/* Check if there are no children of this qgroup */
	if (!list_empty(&qgroup->members)) {
		ret = -EBUSY;
		goto out;
	}

	ret = del_qgroup_item(trans, qgroupid);
	if (ret && ret != -ENOENT)
		goto out;

	while (!list_empty(&qgroup->groups)) {
		list = list_first_entry(&qgroup->groups,
					struct btrfs_qgroup_list, next_group);
		ret = __del_qgroup_relation(trans, qgroupid,
					    list->group->qgroupid);
		if (ret)
			goto out;
	}

	spin_lock(&fs_info->qgroup_lock);
	del_qgroup_rb(fs_info, qgroupid);
	spin_unlock(&fs_info->qgroup_lock);

	/*
	 * Remove the qgroup from sysfs now without holding the qgroup_lock
	 * spinlock, since the sysfs_remove_group() function needs to take
	 * the mutex kernfs_mutex through kernfs_remove_by_name_ns().
	 */
	btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
	kfree(qgroup);
out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}
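
/*
 * Usage sketch for the limit ioctl handled below (an assumed example, not
 * taken verbatim from userspace tooling): to drop a previously configured
 * referenced-bytes limit, the caller sets BTRFS_QGROUP_LIMIT_MAX_RFER in
 * limit->flags and passes limit->max_rfer == (u64)-1.  btrfs_limit_qgroup()
 * then clears both the in-memory limit and the corresponding lim_flags bit
 * rather than storing the huge value as a real limit.
 */
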
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
		       struct btrfs_qgroup_limit *limit)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup *qgroup;
	int ret = 0;
	/*
	 * Sometimes we would want to clear the limit on this qgroup.
	 * To meet this requirement, we treat -1 as a special value
	 * which tells the kernel to clear the limit on this qgroup.
	 */
	const u64 CLEAR_VALUE = -1;

	mutex_lock(&fs_info->qgroup_ioctl_lock);
	if (!fs_info->quota_root) {
		ret = -ENOTCONN;
		goto out;
	}

	qgroup = find_qgroup_rb(fs_info, qgroupid);
	if (!qgroup) {
		ret = -ENOENT;
		goto out;
	}

	spin_lock(&fs_info->qgroup_lock);
	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
		if (limit->max_rfer == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
			qgroup->max_rfer = 0;
		} else {
			qgroup->max_rfer = limit->max_rfer;
		}
	}
	if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
		if (limit->max_excl == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
			qgroup->max_excl = 0;
		} else {
			qgroup->max_excl = limit->max_excl;
		}
	}
	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
		if (limit->rsv_rfer == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
			qgroup->rsv_rfer = 0;
		} else {
			qgroup->rsv_rfer = limit->rsv_rfer;
		}
	}
	if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
		if (limit->rsv_excl == CLEAR_VALUE) {
			qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
			limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
			qgroup->rsv_excl = 0;
		} else {
			qgroup->rsv_excl = limit->rsv_excl;
		}
	}
	qgroup->lim_flags |= limit->flags;

	spin_unlock(&fs_info->qgroup_lock);

	ret = update_qgroup_limit_item(trans, qgroup);
	if (ret) {
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
		btrfs_info(fs_info, "unable to update quota limit for %llu",
			   qgroupid);
	}

out:
	mutex_unlock(&fs_info->qgroup_ioctl_lock);
	return ret;
}

int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
				struct btrfs_delayed_ref_root *delayed_refs,
				struct btrfs_qgroup_extent_record *record)
{
	struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
	struct rb_node *parent_node = NULL;
	struct btrfs_qgroup_extent_record *entry;
	u64 bytenr = record->bytenr;

	lockdep_assert_held(&delayed_refs->lock);
	trace_btrfs_qgroup_trace_extent(fs_info, record);

	while (*p) {
		parent_node = *p;
		entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
				 node);
		if (bytenr < entry->bytenr) {
			p = &(*p)->rb_left;
		} else if (bytenr > entry->bytenr) {
			p = &(*p)->rb_right;
		} else {
			if (record->data_rsv && !entry->data_rsv) {
				entry->data_rsv = record->data_rsv;
				entry->data_rsv_refroot =
					record->data_rsv_refroot;
			}
			return 1;
		}
	}

	rb_link_node(&record->node, parent_node, p);
	rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
	return 0;
}
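
/*
 * Note on the return value above: when a record for the same bytenr already
 * sits in dirty_extent_root, the new one is not inserted;
 * btrfs_qgroup_trace_extent_nolock() only merges the data reservation info
 * into the existing record and returns 1, so the caller (see
 * btrfs_qgroup_trace_extent() below) is expected to free its own record in
 * that case.
 */
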
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
				   struct btrfs_qgroup_extent_record *qrecord)
{
	struct ulist *old_root;
	u64 bytenr = qrecord->bytenr;
	int ret;

	/*
	 * We are always called in a context where we are already holding a
	 * transaction handle. Often we are called when adding a data delayed
	 * reference from btrfs_truncate_inode_items() (truncating or unlinking),
	 * in which case we will be holding a write lock on extent buffer from a
	 * subvolume tree. In this case we can't allow btrfs_find_all_roots() to
	 * acquire fs_info->commit_root_sem, because that is a higher level lock
	 * that must be acquired before locking any extent buffers.
	 *
	 * So we want btrfs_find_all_roots() to not acquire the commit_root_sem
	 * but we can't pass it a non-NULL transaction handle, because otherwise
	 * it would not use commit roots and would lock extent buffers, causing
	 * a deadlock if it ends up trying to read lock the same extent buffer
	 * that was previously write locked at btrfs_truncate_inode_items().
	 *
	 * So pass a NULL transaction handle to btrfs_find_all_roots() and
	 * explicitly tell it to not acquire the commit_root_sem - if we are
	 * holding a transaction handle we don't need its protection.
	 */
	ASSERT(trans != NULL);

	ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
				   true);
	if (ret < 0) {
		trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
		btrfs_warn(trans->fs_info,
"error accounting new delayed refs extent (err code: %d), quota inconsistent",
			   ret);
		return 0;
	}

	/*
	 * Here we don't need to get the lock of
	 * trans->transaction->delayed_refs, since inserted qrecord won't
	 * be deleted, only qrecord->node may be modified (new qrecord insert)
	 *
	 * So modifying qrecord->old_roots is safe here
	 */
	qrecord->old_roots = old_root;
	return 0;
}

int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
			      u64 num_bytes, gfp_t gfp_flag)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_qgroup_extent_record *record;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
	    || bytenr == 0 || num_bytes == 0)
		return 0;
	record = kzalloc(sizeof(*record), gfp_flag);
	if (!record)
		return -ENOMEM;

	delayed_refs = &trans->transaction->delayed_refs;
	record->bytenr = bytenr;
	record->num_bytes = num_bytes;
	record->old_roots = NULL;

	spin_lock(&delayed_refs->lock);
	ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
	spin_unlock(&delayed_refs->lock);
	if (ret > 0) {
		kfree(record);
		return 0;
	}
	return btrfs_qgroup_trace_extent_post(trans, record);
}

int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
				  struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int nr = btrfs_header_nritems(eb);
	int i, extent_type, ret;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	u64 bytenr, num_bytes;

	/* We can be called directly from walk_up_proc() */
	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;

	for (i = 0; i < nr; i++) {
		btrfs_item_key_to_cpu(eb, &key, i);

		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;

		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
		/* filter out non qgroup-accountable extents  */
		extent_type = btrfs_file_extent_type(eb, fi);

		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
			continue;

		bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
		if (!bytenr)
			continue;

		num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);

		ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes,
						GFP_NOFS);
		if (ret)
			return ret;
	}
	cond_resched();
	return 0;
}

/*
 * Walk up the tree from the bottom, freeing leaves and any interior
 * nodes which have had all slots visited. If a node (leaf or
 * interior) is freed, the node above it will have its slot
 * incremented. The root node will never be freed.
 *
 * At the end of this function, we should have a path which has all
 * slots incremented to the next position for a search. If we need to
 * read a new node it will be NULL and the node above it will have the
 * correct slot selected for a later read.
 *
 * If we increment the root nodes slot counter past the number of
 * elements, 1 is returned to signal completion of the search.
 */
static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
{
	int level = 0;
	int nr, slot;
	struct extent_buffer *eb;

	if (root_level == 0)
		return 1;

	while (level <= root_level) {
		eb = path->nodes[level];
		nr = btrfs_header_nritems(eb);
		path->slots[level]++;
		slot = path->slots[level];
		if (slot >= nr || level == 0) {
			/*
			 * Don't free the root - we will detect this
			 * condition after our loop and return a
			 * positive value for caller to stop walking the tree.
			 */
			if (level != root_level) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;

				free_extent_buffer(eb);
				path->nodes[level] = NULL;
				path->slots[level] = 0;
			}
		} else {
			/*
			 * We have a valid slot to walk back down
			 * from. Stop here so caller can process these
			 * new nodes.
			 */
			break;
		}

		level++;
	}

	eb = path->nodes[root_level];
	if (path->slots[root_level] >= btrfs_header_nritems(eb))
		return 1;

	return 0;
}

/*
 * Helper function to trace a subtree tree block swap.
 *
 * The swap will happen in highest tree block, but there may be a lot of
 * tree blocks involved.
 *
 * For example:
 *  OO = Old tree blocks
 *  NN = New tree blocks allocated during balance
 *
 *           File tree (257)                  Reloc tree for 257
 * L2              OO                                NN
 *               /    \                            /    \
 * L1          OO      OO (a)                    OO      NN (a)
 *            / \     / \                       / \     / \
 * L0       OO   OO OO   OO                   OO   OO NN   NN
 *                  (b)  (c)                          (b)  (c)
 *
 * When calling qgroup_trace_extent_swap(), we will pass:
 * @src_eb = OO(a)
 * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
 * @dst_level = 0
 * @root_level = 1
 *
 * In that case, qgroup_trace_extent_swap() will search from OO(a) to
 * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
 *
 * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
 *
 * 1) Tree search from @src_eb
 *    It should act as a simplified btrfs_search_slot().
 *    The key for search can be extracted from @dst_path->nodes[dst_level]
 *    (first key).
 *
 * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
 *    NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
 *    They should be marked during previous (@dst_level = 1) iteration.
 *
 * 3) Mark file extents in leaves dirty
 *    We don't have a good way to pick out new file extents only.
 *    So we still follow the old method by scanning all file extents in
 *    the leaf.
 *
 * This function can free us from keeping two paths, thus later we only need
 * to care about how to iterate all new tree blocks in reloc tree.
 */
static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
				    struct extent_buffer *src_eb,
				    struct btrfs_path *dst_path,
				    int dst_level, int root_level,
				    bool trace_leaf)
{
	struct btrfs_key key;
	struct btrfs_path *src_path;
	struct btrfs_fs_info *fs_info = trans->fs_info;
	u32 nodesize = fs_info->nodesize;
	int cur_level = root_level;
	int ret;

	BUG_ON(dst_level > root_level);
	/* Level mismatch */
	if (btrfs_header_level(src_eb) != root_level)
		return -EINVAL;

	src_path = btrfs_alloc_path();
	if (!src_path) {
		ret = -ENOMEM;
		goto out;
	}

	if (dst_level)
		btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
	else
		btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);

	/* For src_path */
	atomic_inc(&src_eb->refs);
	src_path->nodes[root_level] = src_eb;
	src_path->slots[root_level] = dst_path->slots[root_level];
	src_path->locks[root_level] = 0;

	/* A simplified version of btrfs_search_slot() */
	while (cur_level >= dst_level) {
		struct btrfs_key src_key;
		struct btrfs_key dst_key;

		if (src_path->nodes[cur_level] == NULL) {
			struct extent_buffer *eb;
			int parent_slot;

			eb = src_path->nodes[cur_level + 1];
			parent_slot = src_path->slots[cur_level + 1];

			eb = btrfs_read_node_slot(eb, parent_slot);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				goto out;
			}

			src_path->nodes[cur_level] = eb;

			btrfs_tree_read_lock(eb);
			src_path->locks[cur_level] = BTRFS_READ_LOCK;
		}

		src_path->slots[cur_level] = dst_path->slots[cur_level];
		if (cur_level) {
			btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
					&dst_key, dst_path->slots[cur_level]);
			btrfs_node_key_to_cpu(src_path->nodes[cur_level],
					&src_key, src_path->slots[cur_level]);
		} else {
			btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
					&dst_key, dst_path->slots[cur_level]);
			btrfs_item_key_to_cpu(src_path->nodes[cur_level],
					&src_key, src_path->slots[cur_level]);
		}
		/* Content mismatch, something went wrong */
		if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
			ret = -ENOENT;
			goto out;
		}
		cur_level--;
	}

	/*
	 * Now both @dst_path and @src_path have been populated, record the tree
	 * blocks for qgroup accounting.
2051 */ 2052 ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start, 2053 nodesize, GFP_NOFS); 2054 if (ret < 0) 2055 goto out; 2056 ret = btrfs_qgroup_trace_extent(trans, 2057 dst_path->nodes[dst_level]->start, 2058 nodesize, GFP_NOFS); 2059 if (ret < 0) 2060 goto out; 2061 2062 /* Record leaf file extents */ 2063 if (dst_level == 0 && trace_leaf) { 2064 ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]); 2065 if (ret < 0) 2066 goto out; 2067 ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]); 2068 } 2069 out: 2070 btrfs_free_path(src_path); 2071 return ret; 2072 } 2073 2074 /* 2075 * Helper function to do recursive generation-aware depth-first search, to 2076 * locate all new tree blocks in a subtree of the reloc tree. 2077 * 2078 * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen >= last_snapshot) 2079 * reloc tree 2080 * L2 NN (a) 2081 * / \ 2082 * L1 OO NN (b) 2083 * / \ / \ 2084 * L0 OO OO OO NN 2085 * (c) (d) 2086 * If we pass: 2087 * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ], 2088 * @cur_level = 1 2089 * @root_level = 1 2090 * 2091 * We will iterate through tree blocks NN(b), NN(d) and inform qgroup to trace 2092 * the above tree blocks along with their counterparts in the file tree. 2093 * During the search, old tree blocks like OO(c) will be skipped as the tree 2094 * block swap won't affect OO(c). 2095 */ 2096 static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, 2097 struct extent_buffer *src_eb, 2098 struct btrfs_path *dst_path, 2099 int cur_level, int root_level, 2100 u64 last_snapshot, bool trace_leaf) 2101 { 2102 struct btrfs_fs_info *fs_info = trans->fs_info; 2103 struct extent_buffer *eb; 2104 bool need_cleanup = false; 2105 int ret = 0; 2106 int i; 2107 2108 /* Level sanity check */ 2109 if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 || 2110 root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 || 2111 root_level < cur_level) { 2112 btrfs_err_rl(fs_info, 2113 "%s: bad levels, cur_level=%d root_level=%d", 2114 __func__, cur_level, root_level); 2115 return -EUCLEAN; 2116 } 2117 2118 /* Read the tree block if needed */ 2119 if (dst_path->nodes[cur_level] == NULL) { 2120 int parent_slot; 2121 u64 child_gen; 2122 2123 /* 2124 * dst_path->nodes[root_level] must be initialized before 2125 * calling this function. 2126 */ 2127 if (cur_level == root_level) { 2128 btrfs_err_rl(fs_info, 2129 "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d", 2130 __func__, root_level, root_level, cur_level); 2131 return -EUCLEAN; 2132 } 2133 2134 /* 2135 * We need to get child blockptr/gen from parent before we can 2136 * read it.
2137 */ 2138 eb = dst_path->nodes[cur_level + 1]; 2139 parent_slot = dst_path->slots[cur_level + 1]; 2140 child_gen = btrfs_node_ptr_generation(eb, parent_slot); 2141 2142 /* This node is old, no need to trace */ 2143 if (child_gen < last_snapshot) 2144 goto out; 2145 2146 eb = btrfs_read_node_slot(eb, parent_slot); 2147 if (IS_ERR(eb)) { 2148 ret = PTR_ERR(eb); 2149 goto out; 2150 } 2151 2152 dst_path->nodes[cur_level] = eb; 2153 dst_path->slots[cur_level] = 0; 2154 2155 btrfs_tree_read_lock(eb); 2156 dst_path->locks[cur_level] = BTRFS_READ_LOCK; 2157 need_cleanup = true; 2158 } 2159 2160 /* Now record this tree block and its counter part for qgroups */ 2161 ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level, 2162 root_level, trace_leaf); 2163 if (ret < 0) 2164 goto cleanup; 2165 2166 eb = dst_path->nodes[cur_level]; 2167 2168 if (cur_level > 0) { 2169 /* Iterate all child tree blocks */ 2170 for (i = 0; i < btrfs_header_nritems(eb); i++) { 2171 /* Skip old tree blocks as they won't be swapped */ 2172 if (btrfs_node_ptr_generation(eb, i) < last_snapshot) 2173 continue; 2174 dst_path->slots[cur_level] = i; 2175 2176 /* Recursive call (at most 7 times) */ 2177 ret = qgroup_trace_new_subtree_blocks(trans, src_eb, 2178 dst_path, cur_level - 1, root_level, 2179 last_snapshot, trace_leaf); 2180 if (ret < 0) 2181 goto cleanup; 2182 } 2183 } 2184 2185 cleanup: 2186 if (need_cleanup) { 2187 /* Clean up */ 2188 btrfs_tree_unlock_rw(dst_path->nodes[cur_level], 2189 dst_path->locks[cur_level]); 2190 free_extent_buffer(dst_path->nodes[cur_level]); 2191 dst_path->nodes[cur_level] = NULL; 2192 dst_path->slots[cur_level] = 0; 2193 dst_path->locks[cur_level] = 0; 2194 } 2195 out: 2196 return ret; 2197 } 2198 2199 static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, 2200 struct extent_buffer *src_eb, 2201 struct extent_buffer *dst_eb, 2202 u64 last_snapshot, bool trace_leaf) 2203 { 2204 struct btrfs_fs_info *fs_info = trans->fs_info; 2205 struct btrfs_path *dst_path = NULL; 2206 int level; 2207 int ret; 2208 2209 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2210 return 0; 2211 2212 /* Wrong parameter order */ 2213 if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) { 2214 btrfs_err_rl(fs_info, 2215 "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__, 2216 btrfs_header_generation(src_eb), 2217 btrfs_header_generation(dst_eb)); 2218 return -EUCLEAN; 2219 } 2220 2221 if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) { 2222 ret = -EIO; 2223 goto out; 2224 } 2225 2226 level = btrfs_header_level(dst_eb); 2227 dst_path = btrfs_alloc_path(); 2228 if (!dst_path) { 2229 ret = -ENOMEM; 2230 goto out; 2231 } 2232 /* For dst_path */ 2233 atomic_inc(&dst_eb->refs); 2234 dst_path->nodes[level] = dst_eb; 2235 dst_path->slots[level] = 0; 2236 dst_path->locks[level] = 0; 2237 2238 /* Do the generation aware breadth-first search */ 2239 ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level, 2240 level, last_snapshot, trace_leaf); 2241 if (ret < 0) 2242 goto out; 2243 ret = 0; 2244 2245 out: 2246 btrfs_free_path(dst_path); 2247 if (ret < 0) 2248 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 2249 return ret; 2250 } 2251 2252 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, 2253 struct extent_buffer *root_eb, 2254 u64 root_gen, int root_level) 2255 { 2256 struct btrfs_fs_info *fs_info = trans->fs_info; 2257 int ret = 0; 2258 int level; 2259 struct extent_buffer *eb = root_eb; 2260 
struct btrfs_path *path = NULL; 2261 2262 BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL); 2263 BUG_ON(root_eb == NULL); 2264 2265 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2266 return 0; 2267 2268 if (!extent_buffer_uptodate(root_eb)) { 2269 ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL); 2270 if (ret) 2271 goto out; 2272 } 2273 2274 if (root_level == 0) { 2275 ret = btrfs_qgroup_trace_leaf_items(trans, root_eb); 2276 goto out; 2277 } 2278 2279 path = btrfs_alloc_path(); 2280 if (!path) 2281 return -ENOMEM; 2282 2283 /* 2284 * Walk down the tree. Missing extent blocks are filled in as 2285 * we go. Metadata is accounted every time we read a new 2286 * extent block. 2287 * 2288 * When we reach a leaf, we account for file extent items in it, 2289 * walk back up the tree (adjusting slot pointers as we go) 2290 * and restart the search process. 2291 */ 2292 atomic_inc(&root_eb->refs); /* For path */ 2293 path->nodes[root_level] = root_eb; 2294 path->slots[root_level] = 0; 2295 path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ 2296 walk_down: 2297 level = root_level; 2298 while (level >= 0) { 2299 if (path->nodes[level] == NULL) { 2300 int parent_slot; 2301 u64 child_bytenr; 2302 2303 /* 2304 * We need to get child blockptr from parent before we 2305 * can read it. 2306 */ 2307 eb = path->nodes[level + 1]; 2308 parent_slot = path->slots[level + 1]; 2309 child_bytenr = btrfs_node_blockptr(eb, parent_slot); 2310 2311 eb = btrfs_read_node_slot(eb, parent_slot); 2312 if (IS_ERR(eb)) { 2313 ret = PTR_ERR(eb); 2314 goto out; 2315 } 2316 2317 path->nodes[level] = eb; 2318 path->slots[level] = 0; 2319 2320 btrfs_tree_read_lock(eb); 2321 path->locks[level] = BTRFS_READ_LOCK; 2322 2323 ret = btrfs_qgroup_trace_extent(trans, child_bytenr, 2324 fs_info->nodesize, 2325 GFP_NOFS); 2326 if (ret) 2327 goto out; 2328 } 2329 2330 if (level == 0) { 2331 ret = btrfs_qgroup_trace_leaf_items(trans, 2332 path->nodes[level]); 2333 if (ret) 2334 goto out; 2335 2336 /* Nonzero return here means we completed our search */ 2337 ret = adjust_slots_upwards(path, root_level); 2338 if (ret) 2339 break; 2340 2341 /* Restart search with new slots */ 2342 goto walk_down; 2343 } 2344 2345 level--; 2346 } 2347 2348 ret = 0; 2349 out: 2350 btrfs_free_path(path); 2351 2352 return ret; 2353 } 2354 2355 #define UPDATE_NEW 0 2356 #define UPDATE_OLD 1 2357 /* 2358 * Walk all of the roots that points to the bytenr and adjust their refcnts. 
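 * The refcnts are stored relative to @seq (see btrfs_qgroup_update_old_refcnt()
 * and btrfs_qgroup_update_new_refcnt()): a qgroup whose refcnt is still below
 * @seq counts as 0 for this accounting round. The counts are also propagated
 * to all parent qgroups through the next_group lists, using @tmp to walk the
 * hierarchy and @qgroups to collect every qgroup touched, so that
 * qgroup_update_counters() can visit them later.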
2359 */ 2360 static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, 2361 struct ulist *roots, struct ulist *tmp, 2362 struct ulist *qgroups, u64 seq, int update_old) 2363 { 2364 struct ulist_node *unode; 2365 struct ulist_iterator uiter; 2366 struct ulist_node *tmp_unode; 2367 struct ulist_iterator tmp_uiter; 2368 struct btrfs_qgroup *qg; 2369 int ret = 0; 2370 2371 if (!roots) 2372 return 0; 2373 ULIST_ITER_INIT(&uiter); 2374 while ((unode = ulist_next(roots, &uiter))) { 2375 qg = find_qgroup_rb(fs_info, unode->val); 2376 if (!qg) 2377 continue; 2378 2379 ulist_reinit(tmp); 2380 ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg), 2381 GFP_ATOMIC); 2382 if (ret < 0) 2383 return ret; 2384 ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC); 2385 if (ret < 0) 2386 return ret; 2387 ULIST_ITER_INIT(&tmp_uiter); 2388 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { 2389 struct btrfs_qgroup_list *glist; 2390 2391 qg = unode_aux_to_qgroup(tmp_unode); 2392 if (update_old) 2393 btrfs_qgroup_update_old_refcnt(qg, seq, 1); 2394 else 2395 btrfs_qgroup_update_new_refcnt(qg, seq, 1); 2396 list_for_each_entry(glist, &qg->groups, next_group) { 2397 ret = ulist_add(qgroups, glist->group->qgroupid, 2398 qgroup_to_aux(glist->group), 2399 GFP_ATOMIC); 2400 if (ret < 0) 2401 return ret; 2402 ret = ulist_add(tmp, glist->group->qgroupid, 2403 qgroup_to_aux(glist->group), 2404 GFP_ATOMIC); 2405 if (ret < 0) 2406 return ret; 2407 } 2408 } 2409 } 2410 return 0; 2411 } 2412 2413 /* 2414 * Update qgroup rfer/excl counters. 2415 * Rfer update is easy, the code explains itself. 2416 * 2417 * Excl update is tricky, the update is split into 2 parts. 2418 * Part 1: Possible exclusive <-> sharing detection: 2419 * | A | !A | 2420 * ------------------------------------- 2421 * B | * | - | 2422 * ------------------------------------- 2423 * !B | + | ** | 2424 * ------------------------------------- 2425 * 2426 * Conditions: 2427 * A: cur_old_roots < nr_old_roots (not exclusive before) 2428 * !A: cur_old_roots == nr_old_roots (possible exclusive before) 2429 * B: cur_new_roots < nr_new_roots (not exclusive now) 2430 * !B: cur_new_roots == nr_new_roots (possible exclusive now) 2431 * 2432 * Results: 2433 * +: Possible sharing -> exclusive -: Possible exclusive -> sharing 2434 * *: Definitely not changed. **: Possible unchanged. 2435 * 2436 * For the !A and !B conditions, the exception is the cur_old/new_roots == 0 case. 2437 * 2438 * To make the logic clear, we first use conditions A and B to split the 2439 * combinations into 4 results. 2440 * 2441 * Then, for results "+" and "-", check the old/new_roots == 0 case, as in those 2442 * cells only one of the two counts may be 0. 2443 * 2444 * Lastly, check result **; since there both counts may be 0, split them 2445 * again (2x2). 2446 * But this time we don't need to consider other things, the code and logic 2447 * are easy to understand now.
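 *
 * A worked example (numbers purely for illustration): an extent was referenced
 * by 2 roots before the operation and by 1 root afterwards, i.e.
 * nr_old_roots = 2 and nr_new_roots = 1. For the qgroup of the surviving root,
 * cur_old_count = 1 and cur_new_count = 1, matching condition A (1 < 2) and
 * !B (1 == 1), the "+" cell: the extent went from shared to exclusive, so excl
 * grows by num_bytes while rfer stays unchanged.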
2448 */ 2449 static int qgroup_update_counters(struct btrfs_fs_info *fs_info, 2450 struct ulist *qgroups, 2451 u64 nr_old_roots, 2452 u64 nr_new_roots, 2453 u64 num_bytes, u64 seq) 2454 { 2455 struct ulist_node *unode; 2456 struct ulist_iterator uiter; 2457 struct btrfs_qgroup *qg; 2458 u64 cur_new_count, cur_old_count; 2459 2460 ULIST_ITER_INIT(&uiter); 2461 while ((unode = ulist_next(qgroups, &uiter))) { 2462 bool dirty = false; 2463 2464 qg = unode_aux_to_qgroup(unode); 2465 cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); 2466 cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); 2467 2468 trace_qgroup_update_counters(fs_info, qg, cur_old_count, 2469 cur_new_count); 2470 2471 /* Rfer update part */ 2472 if (cur_old_count == 0 && cur_new_count > 0) { 2473 qg->rfer += num_bytes; 2474 qg->rfer_cmpr += num_bytes; 2475 dirty = true; 2476 } 2477 if (cur_old_count > 0 && cur_new_count == 0) { 2478 qg->rfer -= num_bytes; 2479 qg->rfer_cmpr -= num_bytes; 2480 dirty = true; 2481 } 2482 2483 /* Excl update part */ 2484 /* Exclusive/none -> shared case */ 2485 if (cur_old_count == nr_old_roots && 2486 cur_new_count < nr_new_roots) { 2487 /* Exclusive -> shared */ 2488 if (cur_old_count != 0) { 2489 qg->excl -= num_bytes; 2490 qg->excl_cmpr -= num_bytes; 2491 dirty = true; 2492 } 2493 } 2494 2495 /* Shared -> exclusive/none case */ 2496 if (cur_old_count < nr_old_roots && 2497 cur_new_count == nr_new_roots) { 2498 /* Shared->exclusive */ 2499 if (cur_new_count != 0) { 2500 qg->excl += num_bytes; 2501 qg->excl_cmpr += num_bytes; 2502 dirty = true; 2503 } 2504 } 2505 2506 /* Exclusive/none -> exclusive/none case */ 2507 if (cur_old_count == nr_old_roots && 2508 cur_new_count == nr_new_roots) { 2509 if (cur_old_count == 0) { 2510 /* None -> exclusive/none */ 2511 2512 if (cur_new_count != 0) { 2513 /* None -> exclusive */ 2514 qg->excl += num_bytes; 2515 qg->excl_cmpr += num_bytes; 2516 dirty = true; 2517 } 2518 /* None -> none, nothing changed */ 2519 } else { 2520 /* Exclusive -> exclusive/none */ 2521 2522 if (cur_new_count == 0) { 2523 /* Exclusive -> none */ 2524 qg->excl -= num_bytes; 2525 qg->excl_cmpr -= num_bytes; 2526 dirty = true; 2527 } 2528 /* Exclusive -> exclusive, nothing changed */ 2529 } 2530 } 2531 2532 if (dirty) 2533 qgroup_dirty(fs_info, qg); 2534 } 2535 return 0; 2536 } 2537 2538 /* 2539 * Check if the @roots potentially is a list of fs tree roots 2540 * 2541 * Return 0 for definitely not a fs/subvol tree roots ulist 2542 * Return 1 for possible fs/subvol tree roots in the list (considering an empty 2543 * one as well) 2544 */ 2545 static int maybe_fs_roots(struct ulist *roots) 2546 { 2547 struct ulist_node *unode; 2548 struct ulist_iterator uiter; 2549 2550 /* Empty one, still possible for fs roots */ 2551 if (!roots || roots->nnodes == 0) 2552 return 1; 2553 2554 ULIST_ITER_INIT(&uiter); 2555 unode = ulist_next(roots, &uiter); 2556 if (!unode) 2557 return 1; 2558 2559 /* 2560 * If it contains fs tree roots, then it must belong to fs/subvol 2561 * trees. 2562 * If it contains a non-fs tree, it won't be shared with fs/subvol trees. 
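 * E.g. a list holding only the extent tree root id means the extent is
 * metadata of a non-fs tree and needs no qgroup accounting, while a subvolume
 * id such as 257 keeps the accounting path going.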
2563 */ 2564 return is_fstree(unode->val); 2565 } 2566 2567 int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, 2568 u64 num_bytes, struct ulist *old_roots, 2569 struct ulist *new_roots) 2570 { 2571 struct btrfs_fs_info *fs_info = trans->fs_info; 2572 struct ulist *qgroups = NULL; 2573 struct ulist *tmp = NULL; 2574 u64 seq; 2575 u64 nr_new_roots = 0; 2576 u64 nr_old_roots = 0; 2577 int ret = 0; 2578 2579 /* 2580 * If quotas get disabled meanwhile, the resources need to be freed and 2581 * we can't just exit here. 2582 */ 2583 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2584 goto out_free; 2585 2586 if (new_roots) { 2587 if (!maybe_fs_roots(new_roots)) 2588 goto out_free; 2589 nr_new_roots = new_roots->nnodes; 2590 } 2591 if (old_roots) { 2592 if (!maybe_fs_roots(old_roots)) 2593 goto out_free; 2594 nr_old_roots = old_roots->nnodes; 2595 } 2596 2597 /* Quick exit, either not fs tree roots, or won't affect any qgroup */ 2598 if (nr_old_roots == 0 && nr_new_roots == 0) 2599 goto out_free; 2600 2601 BUG_ON(!fs_info->quota_root); 2602 2603 trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr, 2604 num_bytes, nr_old_roots, nr_new_roots); 2605 2606 qgroups = ulist_alloc(GFP_NOFS); 2607 if (!qgroups) { 2608 ret = -ENOMEM; 2609 goto out_free; 2610 } 2611 tmp = ulist_alloc(GFP_NOFS); 2612 if (!tmp) { 2613 ret = -ENOMEM; 2614 goto out_free; 2615 } 2616 2617 mutex_lock(&fs_info->qgroup_rescan_lock); 2618 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 2619 if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { 2620 mutex_unlock(&fs_info->qgroup_rescan_lock); 2621 ret = 0; 2622 goto out_free; 2623 } 2624 } 2625 mutex_unlock(&fs_info->qgroup_rescan_lock); 2626 2627 spin_lock(&fs_info->qgroup_lock); 2628 seq = fs_info->qgroup_seq; 2629 2630 /* Update old refcnts using old_roots */ 2631 ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq, 2632 UPDATE_OLD); 2633 if (ret < 0) 2634 goto out; 2635 2636 /* Update new refcnts using new_roots */ 2637 ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq, 2638 UPDATE_NEW); 2639 if (ret < 0) 2640 goto out; 2641 2642 qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots, 2643 num_bytes, seq); 2644 2645 /* 2646 * Bump qgroup_seq to avoid seq overlap 2647 */ 2648 fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; 2649 out: 2650 spin_unlock(&fs_info->qgroup_lock); 2651 out_free: 2652 ulist_free(tmp); 2653 ulist_free(qgroups); 2654 ulist_free(old_roots); 2655 ulist_free(new_roots); 2656 return ret; 2657 } 2658 2659 int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) 2660 { 2661 struct btrfs_fs_info *fs_info = trans->fs_info; 2662 struct btrfs_qgroup_extent_record *record; 2663 struct btrfs_delayed_ref_root *delayed_refs; 2664 struct ulist *new_roots = NULL; 2665 struct rb_node *node; 2666 u64 num_dirty_extents = 0; 2667 u64 qgroup_to_skip; 2668 int ret = 0; 2669 2670 delayed_refs = &trans->transaction->delayed_refs; 2671 qgroup_to_skip = delayed_refs->qgroup_to_skip; 2672 while ((node = rb_first(&delayed_refs->dirty_extent_root))) { 2673 record = rb_entry(node, struct btrfs_qgroup_extent_record, 2674 node); 2675 2676 num_dirty_extents++; 2677 trace_btrfs_qgroup_account_extents(fs_info, record); 2678 2679 if (!ret) { 2680 /* 2681 * Old roots should be searched when inserting qgroup 2682 * extent record 2683 */ 2684 if (WARN_ON(!record->old_roots)) { 2685 /* Search commit root to find old_roots */ 2686 ret = btrfs_find_all_roots(NULL, 
fs_info, 2687 record->bytenr, 0, 2688 &record->old_roots, false); 2689 if (ret < 0) 2690 goto cleanup; 2691 } 2692 2693 /* Free the reserved data space */ 2694 btrfs_qgroup_free_refroot(fs_info, 2695 record->data_rsv_refroot, 2696 record->data_rsv, 2697 BTRFS_QGROUP_RSV_DATA); 2698 /* 2699 * Use BTRFS_SEQ_LAST as time_seq to do special search, 2700 * which doesn't lock tree or delayed_refs and search 2701 * current root. It's safe inside commit_transaction(). 2702 */ 2703 ret = btrfs_find_all_roots(trans, fs_info, 2704 record->bytenr, BTRFS_SEQ_LAST, &new_roots, false); 2705 if (ret < 0) 2706 goto cleanup; 2707 if (qgroup_to_skip) { 2708 ulist_del(new_roots, qgroup_to_skip, 0); 2709 ulist_del(record->old_roots, qgroup_to_skip, 2710 0); 2711 } 2712 ret = btrfs_qgroup_account_extent(trans, record->bytenr, 2713 record->num_bytes, 2714 record->old_roots, 2715 new_roots); 2716 record->old_roots = NULL; 2717 new_roots = NULL; 2718 } 2719 cleanup: 2720 ulist_free(record->old_roots); 2721 ulist_free(new_roots); 2722 new_roots = NULL; 2723 rb_erase(node, &delayed_refs->dirty_extent_root); 2724 kfree(record); 2725 2726 } 2727 trace_qgroup_num_dirty_extents(fs_info, trans->transid, 2728 num_dirty_extents); 2729 return ret; 2730 } 2731 2732 /* 2733 * called from commit_transaction. Writes all changed qgroups to disk. 2734 */ 2735 int btrfs_run_qgroups(struct btrfs_trans_handle *trans) 2736 { 2737 struct btrfs_fs_info *fs_info = trans->fs_info; 2738 int ret = 0; 2739 2740 if (!fs_info->quota_root) 2741 return ret; 2742 2743 spin_lock(&fs_info->qgroup_lock); 2744 while (!list_empty(&fs_info->dirty_qgroups)) { 2745 struct btrfs_qgroup *qgroup; 2746 qgroup = list_first_entry(&fs_info->dirty_qgroups, 2747 struct btrfs_qgroup, dirty); 2748 list_del_init(&qgroup->dirty); 2749 spin_unlock(&fs_info->qgroup_lock); 2750 ret = update_qgroup_info_item(trans, qgroup); 2751 if (ret) 2752 fs_info->qgroup_flags |= 2753 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 2754 ret = update_qgroup_limit_item(trans, qgroup); 2755 if (ret) 2756 fs_info->qgroup_flags |= 2757 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 2758 spin_lock(&fs_info->qgroup_lock); 2759 } 2760 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2761 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON; 2762 else 2763 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; 2764 spin_unlock(&fs_info->qgroup_lock); 2765 2766 ret = update_qgroup_status_item(trans); 2767 if (ret) 2768 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 2769 2770 return ret; 2771 } 2772 2773 /* 2774 * Copy the accounting information between qgroups. This is necessary 2775 * when a snapshot or a subvolume is created. Throwing an error will 2776 * cause a transaction abort so we take extra care here to only error 2777 * when a readonly fs is a reasonable outcome. 2778 */ 2779 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, 2780 u64 objectid, struct btrfs_qgroup_inherit *inherit) 2781 { 2782 int ret = 0; 2783 int i; 2784 u64 *i_qgroups; 2785 bool committing = false; 2786 struct btrfs_fs_info *fs_info = trans->fs_info; 2787 struct btrfs_root *quota_root; 2788 struct btrfs_qgroup *srcgroup; 2789 struct btrfs_qgroup *dstgroup; 2790 bool need_rescan = false; 2791 u32 level_size = 0; 2792 u64 nums; 2793 2794 /* 2795 * There are only two callers of this function. 2796 * 2797 * One in create_subvol() in the ioctl context, which needs to hold 2798 * the qgroup_ioctl_lock. 
2799 * 2800 * The other one in create_pending_snapshot() where no other qgroup 2801 * code can modify the fs as they all need to either start a new trans 2802 * or hold a trans handler, thus we don't need to hold 2803 * qgroup_ioctl_lock. 2804 * This would avoid long and complex lock chain and make lockdep happy. 2805 */ 2806 spin_lock(&fs_info->trans_lock); 2807 if (trans->transaction->state == TRANS_STATE_COMMIT_DOING) 2808 committing = true; 2809 spin_unlock(&fs_info->trans_lock); 2810 2811 if (!committing) 2812 mutex_lock(&fs_info->qgroup_ioctl_lock); 2813 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2814 goto out; 2815 2816 quota_root = fs_info->quota_root; 2817 if (!quota_root) { 2818 ret = -EINVAL; 2819 goto out; 2820 } 2821 2822 if (inherit) { 2823 i_qgroups = (u64 *)(inherit + 1); 2824 nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + 2825 2 * inherit->num_excl_copies; 2826 for (i = 0; i < nums; ++i) { 2827 srcgroup = find_qgroup_rb(fs_info, *i_qgroups); 2828 2829 /* 2830 * Zero out invalid groups so we can ignore 2831 * them later. 2832 */ 2833 if (!srcgroup || 2834 ((srcgroup->qgroupid >> 48) <= (objectid >> 48))) 2835 *i_qgroups = 0ULL; 2836 2837 ++i_qgroups; 2838 } 2839 } 2840 2841 /* 2842 * create a tracking group for the subvol itself 2843 */ 2844 ret = add_qgroup_item(trans, quota_root, objectid); 2845 if (ret) 2846 goto out; 2847 2848 /* 2849 * add qgroup to all inherited groups 2850 */ 2851 if (inherit) { 2852 i_qgroups = (u64 *)(inherit + 1); 2853 for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) { 2854 if (*i_qgroups == 0) 2855 continue; 2856 ret = add_qgroup_relation_item(trans, objectid, 2857 *i_qgroups); 2858 if (ret && ret != -EEXIST) 2859 goto out; 2860 ret = add_qgroup_relation_item(trans, *i_qgroups, 2861 objectid); 2862 if (ret && ret != -EEXIST) 2863 goto out; 2864 } 2865 ret = 0; 2866 } 2867 2868 2869 spin_lock(&fs_info->qgroup_lock); 2870 2871 dstgroup = add_qgroup_rb(fs_info, objectid); 2872 if (IS_ERR(dstgroup)) { 2873 ret = PTR_ERR(dstgroup); 2874 goto unlock; 2875 } 2876 2877 if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { 2878 dstgroup->lim_flags = inherit->lim.flags; 2879 dstgroup->max_rfer = inherit->lim.max_rfer; 2880 dstgroup->max_excl = inherit->lim.max_excl; 2881 dstgroup->rsv_rfer = inherit->lim.rsv_rfer; 2882 dstgroup->rsv_excl = inherit->lim.rsv_excl; 2883 2884 ret = update_qgroup_limit_item(trans, dstgroup); 2885 if (ret) { 2886 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 2887 btrfs_info(fs_info, 2888 "unable to update quota limit for %llu", 2889 dstgroup->qgroupid); 2890 goto unlock; 2891 } 2892 } 2893 2894 if (srcid) { 2895 srcgroup = find_qgroup_rb(fs_info, srcid); 2896 if (!srcgroup) 2897 goto unlock; 2898 2899 /* 2900 * We call inherit after we clone the root in order to make sure 2901 * our counts don't go crazy, so at this point the only 2902 * difference between the two roots should be the root node. 
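 * That is also why both qgroups get excl set to exactly one nodesize below:
 * right after the snapshot each tree exclusively owns only its own root node,
 * everything else is still shared between the two.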
2903 */ 2904 level_size = fs_info->nodesize; 2905 dstgroup->rfer = srcgroup->rfer; 2906 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr; 2907 dstgroup->excl = level_size; 2908 dstgroup->excl_cmpr = level_size; 2909 srcgroup->excl = level_size; 2910 srcgroup->excl_cmpr = level_size; 2911 2912 /* inherit the limit info */ 2913 dstgroup->lim_flags = srcgroup->lim_flags; 2914 dstgroup->max_rfer = srcgroup->max_rfer; 2915 dstgroup->max_excl = srcgroup->max_excl; 2916 dstgroup->rsv_rfer = srcgroup->rsv_rfer; 2917 dstgroup->rsv_excl = srcgroup->rsv_excl; 2918 2919 qgroup_dirty(fs_info, dstgroup); 2920 qgroup_dirty(fs_info, srcgroup); 2921 } 2922 2923 if (!inherit) 2924 goto unlock; 2925 2926 i_qgroups = (u64 *)(inherit + 1); 2927 for (i = 0; i < inherit->num_qgroups; ++i) { 2928 if (*i_qgroups) { 2929 ret = add_relation_rb(fs_info, objectid, *i_qgroups); 2930 if (ret) 2931 goto unlock; 2932 } 2933 ++i_qgroups; 2934 2935 /* 2936 * If we're doing a snapshot, and adding the snapshot to a new 2937 * qgroup, the numbers are guaranteed to be incorrect. 2938 */ 2939 if (srcid) 2940 need_rescan = true; 2941 } 2942 2943 for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) { 2944 struct btrfs_qgroup *src; 2945 struct btrfs_qgroup *dst; 2946 2947 if (!i_qgroups[0] || !i_qgroups[1]) 2948 continue; 2949 2950 src = find_qgroup_rb(fs_info, i_qgroups[0]); 2951 dst = find_qgroup_rb(fs_info, i_qgroups[1]); 2952 2953 if (!src || !dst) { 2954 ret = -EINVAL; 2955 goto unlock; 2956 } 2957 2958 dst->rfer = src->rfer - level_size; 2959 dst->rfer_cmpr = src->rfer_cmpr - level_size; 2960 2961 /* Manually tweaking numbers certainly needs a rescan */ 2962 need_rescan = true; 2963 } 2964 for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) { 2965 struct btrfs_qgroup *src; 2966 struct btrfs_qgroup *dst; 2967 2968 if (!i_qgroups[0] || !i_qgroups[1]) 2969 continue; 2970 2971 src = find_qgroup_rb(fs_info, i_qgroups[0]); 2972 dst = find_qgroup_rb(fs_info, i_qgroups[1]); 2973 2974 if (!src || !dst) { 2975 ret = -EINVAL; 2976 goto unlock; 2977 } 2978 2979 dst->excl = src->excl + level_size; 2980 dst->excl_cmpr = src->excl_cmpr + level_size; 2981 need_rescan = true; 2982 } 2983 2984 unlock: 2985 spin_unlock(&fs_info->qgroup_lock); 2986 if (!ret) 2987 ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup); 2988 out: 2989 if (!committing) 2990 mutex_unlock(&fs_info->qgroup_ioctl_lock); 2991 if (need_rescan) 2992 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 2993 return ret; 2994 } 2995 2996 static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) 2997 { 2998 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && 2999 qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer) 3000 return false; 3001 3002 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && 3003 qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl) 3004 return false; 3005 3006 return true; 3007 } 3008 3009 static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, 3010 enum btrfs_qgroup_rsv_type type) 3011 { 3012 struct btrfs_qgroup *qgroup; 3013 struct btrfs_fs_info *fs_info = root->fs_info; 3014 u64 ref_root = root->root_key.objectid; 3015 int ret = 0; 3016 struct ulist_node *unode; 3017 struct ulist_iterator uiter; 3018 3019 if (!is_fstree(ref_root)) 3020 return 0; 3021 3022 if (num_bytes == 0) 3023 return 0; 3024 3025 if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) && 3026 capable(CAP_SYS_RESOURCE)) 3027 enforce = false; 3028 3029 spin_lock(&fs_info->qgroup_lock); 3030 if 
(!fs_info->quota_root) 3031 goto out; 3032 3033 qgroup = find_qgroup_rb(fs_info, ref_root); 3034 if (!qgroup) 3035 goto out; 3036 3037 /* 3038 * in a first step, we check all affected qgroups if any limits would 3039 * be exceeded 3040 */ 3041 ulist_reinit(fs_info->qgroup_ulist); 3042 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, 3043 qgroup_to_aux(qgroup), GFP_ATOMIC); 3044 if (ret < 0) 3045 goto out; 3046 ULIST_ITER_INIT(&uiter); 3047 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { 3048 struct btrfs_qgroup *qg; 3049 struct btrfs_qgroup_list *glist; 3050 3051 qg = unode_aux_to_qgroup(unode); 3052 3053 if (enforce && !qgroup_check_limits(qg, num_bytes)) { 3054 ret = -EDQUOT; 3055 goto out; 3056 } 3057 3058 list_for_each_entry(glist, &qg->groups, next_group) { 3059 ret = ulist_add(fs_info->qgroup_ulist, 3060 glist->group->qgroupid, 3061 qgroup_to_aux(glist->group), GFP_ATOMIC); 3062 if (ret < 0) 3063 goto out; 3064 } 3065 } 3066 ret = 0; 3067 /* 3068 * no limits exceeded, now record the reservation into all qgroups 3069 */ 3070 ULIST_ITER_INIT(&uiter); 3071 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { 3072 struct btrfs_qgroup *qg; 3073 3074 qg = unode_aux_to_qgroup(unode); 3075 3076 qgroup_rsv_add(fs_info, qg, num_bytes, type); 3077 } 3078 3079 out: 3080 spin_unlock(&fs_info->qgroup_lock); 3081 return ret; 3082 } 3083 3084 /* 3085 * Free @num_bytes of reserved space with @type for qgroup. (Normally level 0 3086 * qgroup). 3087 * 3088 * Will handle all higher level qgroup too. 3089 * 3090 * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup. 3091 * This special case is only used for META_PERTRANS type. 3092 */ 3093 void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, 3094 u64 ref_root, u64 num_bytes, 3095 enum btrfs_qgroup_rsv_type type) 3096 { 3097 struct btrfs_qgroup *qgroup; 3098 struct ulist_node *unode; 3099 struct ulist_iterator uiter; 3100 int ret = 0; 3101 3102 if (!is_fstree(ref_root)) 3103 return; 3104 3105 if (num_bytes == 0) 3106 return; 3107 3108 if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) { 3109 WARN(1, "%s: Invalid type to free", __func__); 3110 return; 3111 } 3112 spin_lock(&fs_info->qgroup_lock); 3113 3114 if (!fs_info->quota_root) 3115 goto out; 3116 3117 qgroup = find_qgroup_rb(fs_info, ref_root); 3118 if (!qgroup) 3119 goto out; 3120 3121 if (num_bytes == (u64)-1) 3122 /* 3123 * We're freeing all pertrans rsv, get reserved value from 3124 * level 0 qgroup as real num_bytes to free. 3125 */ 3126 num_bytes = qgroup->rsv.values[type]; 3127 3128 ulist_reinit(fs_info->qgroup_ulist); 3129 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, 3130 qgroup_to_aux(qgroup), GFP_ATOMIC); 3131 if (ret < 0) 3132 goto out; 3133 ULIST_ITER_INIT(&uiter); 3134 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { 3135 struct btrfs_qgroup *qg; 3136 struct btrfs_qgroup_list *glist; 3137 3138 qg = unode_aux_to_qgroup(unode); 3139 3140 qgroup_rsv_release(fs_info, qg, num_bytes, type); 3141 3142 list_for_each_entry(glist, &qg->groups, next_group) { 3143 ret = ulist_add(fs_info->qgroup_ulist, 3144 glist->group->qgroupid, 3145 qgroup_to_aux(glist->group), GFP_ATOMIC); 3146 if (ret < 0) 3147 goto out; 3148 } 3149 } 3150 3151 out: 3152 spin_unlock(&fs_info->qgroup_lock); 3153 } 3154 3155 /* 3156 * Check if the leaf is the last leaf. Which means all node pointers 3157 * are at their last position. 
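 * Used by qgroup_rescan_leaf() to detect that the current leaf is the last one
 * of the extent tree being scanned, so the rescan can finish after it is
 * processed.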
3158 */ 3159 static bool is_last_leaf(struct btrfs_path *path) 3160 { 3161 int i; 3162 3163 for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { 3164 if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1) 3165 return false; 3166 } 3167 return true; 3168 } 3169 3170 /* 3171 * returns < 0 on error, 0 when more leafs are to be scanned. 3172 * returns 1 when done. 3173 */ 3174 static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, 3175 struct btrfs_path *path) 3176 { 3177 struct btrfs_fs_info *fs_info = trans->fs_info; 3178 struct btrfs_root *extent_root; 3179 struct btrfs_key found; 3180 struct extent_buffer *scratch_leaf = NULL; 3181 struct ulist *roots = NULL; 3182 u64 num_bytes; 3183 bool done; 3184 int slot; 3185 int ret; 3186 3187 mutex_lock(&fs_info->qgroup_rescan_lock); 3188 extent_root = btrfs_extent_root(fs_info, 3189 fs_info->qgroup_rescan_progress.objectid); 3190 ret = btrfs_search_slot_for_read(extent_root, 3191 &fs_info->qgroup_rescan_progress, 3192 path, 1, 0); 3193 3194 btrfs_debug(fs_info, 3195 "current progress key (%llu %u %llu), search_slot ret %d", 3196 fs_info->qgroup_rescan_progress.objectid, 3197 fs_info->qgroup_rescan_progress.type, 3198 fs_info->qgroup_rescan_progress.offset, ret); 3199 3200 if (ret) { 3201 /* 3202 * The rescan is about to end, we will not be scanning any 3203 * further blocks. We cannot unset the RESCAN flag here, because 3204 * we want to commit the transaction if everything went well. 3205 * To make the live accounting work in this phase, we set our 3206 * scan progress pointer such that every real extent objectid 3207 * will be smaller. 3208 */ 3209 fs_info->qgroup_rescan_progress.objectid = (u64)-1; 3210 btrfs_release_path(path); 3211 mutex_unlock(&fs_info->qgroup_rescan_lock); 3212 return ret; 3213 } 3214 done = is_last_leaf(path); 3215 3216 btrfs_item_key_to_cpu(path->nodes[0], &found, 3217 btrfs_header_nritems(path->nodes[0]) - 1); 3218 fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; 3219 3220 scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]); 3221 if (!scratch_leaf) { 3222 ret = -ENOMEM; 3223 mutex_unlock(&fs_info->qgroup_rescan_lock); 3224 goto out; 3225 } 3226 slot = path->slots[0]; 3227 btrfs_release_path(path); 3228 mutex_unlock(&fs_info->qgroup_rescan_lock); 3229 3230 for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { 3231 btrfs_item_key_to_cpu(scratch_leaf, &found, slot); 3232 if (found.type != BTRFS_EXTENT_ITEM_KEY && 3233 found.type != BTRFS_METADATA_ITEM_KEY) 3234 continue; 3235 if (found.type == BTRFS_METADATA_ITEM_KEY) 3236 num_bytes = fs_info->nodesize; 3237 else 3238 num_bytes = found.offset; 3239 3240 ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, 3241 &roots, false); 3242 if (ret < 0) 3243 goto out; 3244 /* For rescan, just pass old_roots as NULL */ 3245 ret = btrfs_qgroup_account_extent(trans, found.objectid, 3246 num_bytes, NULL, roots); 3247 if (ret < 0) 3248 goto out; 3249 } 3250 out: 3251 if (scratch_leaf) 3252 free_extent_buffer(scratch_leaf); 3253 3254 if (done && !ret) { 3255 ret = 1; 3256 fs_info->qgroup_rescan_progress.objectid = (u64)-1; 3257 } 3258 return ret; 3259 } 3260 3261 static bool rescan_should_stop(struct btrfs_fs_info *fs_info) 3262 { 3263 return btrfs_fs_closing(fs_info) || 3264 test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); 3265 } 3266 3267 static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) 3268 { 3269 struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info, 3270 qgroup_rescan_work); 3271 struct 
btrfs_path *path; 3272 struct btrfs_trans_handle *trans = NULL; 3273 int err = -ENOMEM; 3274 int ret = 0; 3275 bool stopped = false; 3276 3277 path = btrfs_alloc_path(); 3278 if (!path) 3279 goto out; 3280 /* 3281 * Rescan should only search for commit root, and any later difference 3282 * should be recorded by qgroup 3283 */ 3284 path->search_commit_root = 1; 3285 path->skip_locking = 1; 3286 3287 err = 0; 3288 while (!err && !(stopped = rescan_should_stop(fs_info))) { 3289 trans = btrfs_start_transaction(fs_info->fs_root, 0); 3290 if (IS_ERR(trans)) { 3291 err = PTR_ERR(trans); 3292 break; 3293 } 3294 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 3295 err = -EINTR; 3296 } else { 3297 err = qgroup_rescan_leaf(trans, path); 3298 } 3299 if (err > 0) 3300 btrfs_commit_transaction(trans); 3301 else 3302 btrfs_end_transaction(trans); 3303 } 3304 3305 out: 3306 btrfs_free_path(path); 3307 3308 mutex_lock(&fs_info->qgroup_rescan_lock); 3309 if (err > 0 && 3310 fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { 3311 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 3312 } else if (err < 0) { 3313 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 3314 } 3315 mutex_unlock(&fs_info->qgroup_rescan_lock); 3316 3317 /* 3318 * only update status, since the previous part has already updated the 3319 * qgroup info. 3320 */ 3321 trans = btrfs_start_transaction(fs_info->quota_root, 1); 3322 if (IS_ERR(trans)) { 3323 err = PTR_ERR(trans); 3324 trans = NULL; 3325 btrfs_err(fs_info, 3326 "fail to start transaction for status update: %d", 3327 err); 3328 } 3329 3330 mutex_lock(&fs_info->qgroup_rescan_lock); 3331 if (!stopped) 3332 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3333 if (trans) { 3334 ret = update_qgroup_status_item(trans); 3335 if (ret < 0) { 3336 err = ret; 3337 btrfs_err(fs_info, "fail to update qgroup status: %d", 3338 err); 3339 } 3340 } 3341 fs_info->qgroup_rescan_running = false; 3342 complete_all(&fs_info->qgroup_rescan_completion); 3343 mutex_unlock(&fs_info->qgroup_rescan_lock); 3344 3345 if (!trans) 3346 return; 3347 3348 btrfs_end_transaction(trans); 3349 3350 if (stopped) { 3351 btrfs_info(fs_info, "qgroup scan paused"); 3352 } else if (err >= 0) { 3353 btrfs_info(fs_info, "qgroup scan completed%s", 3354 err > 0 ? " (inconsistency flag cleared)" : ""); 3355 } else { 3356 btrfs_err(fs_info, "qgroup scan failed with %d", err); 3357 } 3358 } 3359 3360 /* 3361 * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all 3362 * memory required for the rescan context. 
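 * A zero @init_flags means we are resuming a rescan that was already recorded
 * as in progress (mount time); a non-zero value, as passed by
 * btrfs_qgroup_rescan(), starts a fresh rescan and sets
 * BTRFS_QGROUP_STATUS_FLAG_RESCAN.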
3363 */ 3364 static int 3365 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, 3366 int init_flags) 3367 { 3368 int ret = 0; 3369 3370 if (!init_flags) { 3371 /* we're resuming qgroup rescan at mount time */ 3372 if (!(fs_info->qgroup_flags & 3373 BTRFS_QGROUP_STATUS_FLAG_RESCAN)) { 3374 btrfs_warn(fs_info, 3375 "qgroup rescan init failed, qgroup rescan is not queued"); 3376 ret = -EINVAL; 3377 } else if (!(fs_info->qgroup_flags & 3378 BTRFS_QGROUP_STATUS_FLAG_ON)) { 3379 btrfs_warn(fs_info, 3380 "qgroup rescan init failed, qgroup is not enabled"); 3381 ret = -EINVAL; 3382 } 3383 3384 if (ret) 3385 return ret; 3386 } 3387 3388 mutex_lock(&fs_info->qgroup_rescan_lock); 3389 3390 if (init_flags) { 3391 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 3392 btrfs_warn(fs_info, 3393 "qgroup rescan is already in progress"); 3394 ret = -EINPROGRESS; 3395 } else if (!(fs_info->qgroup_flags & 3396 BTRFS_QGROUP_STATUS_FLAG_ON)) { 3397 btrfs_warn(fs_info, 3398 "qgroup rescan init failed, qgroup is not enabled"); 3399 ret = -EINVAL; 3400 } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 3401 /* Quota disable is in progress */ 3402 ret = -EBUSY; 3403 } 3404 3405 if (ret) { 3406 mutex_unlock(&fs_info->qgroup_rescan_lock); 3407 return ret; 3408 } 3409 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3410 } 3411 3412 memset(&fs_info->qgroup_rescan_progress, 0, 3413 sizeof(fs_info->qgroup_rescan_progress)); 3414 fs_info->qgroup_rescan_progress.objectid = progress_objectid; 3415 init_completion(&fs_info->qgroup_rescan_completion); 3416 mutex_unlock(&fs_info->qgroup_rescan_lock); 3417 3418 btrfs_init_work(&fs_info->qgroup_rescan_work, 3419 btrfs_qgroup_rescan_worker, NULL, NULL); 3420 return 0; 3421 } 3422 3423 static void 3424 qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info) 3425 { 3426 struct rb_node *n; 3427 struct btrfs_qgroup *qgroup; 3428 3429 spin_lock(&fs_info->qgroup_lock); 3430 /* clear all current qgroup tracking information */ 3431 for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { 3432 qgroup = rb_entry(n, struct btrfs_qgroup, node); 3433 qgroup->rfer = 0; 3434 qgroup->rfer_cmpr = 0; 3435 qgroup->excl = 0; 3436 qgroup->excl_cmpr = 0; 3437 qgroup_dirty(fs_info, qgroup); 3438 } 3439 spin_unlock(&fs_info->qgroup_lock); 3440 } 3441 3442 int 3443 btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) 3444 { 3445 int ret = 0; 3446 struct btrfs_trans_handle *trans; 3447 3448 ret = qgroup_rescan_init(fs_info, 0, 1); 3449 if (ret) 3450 return ret; 3451 3452 /* 3453 * We have set the rescan_progress to 0, which means no more 3454 * delayed refs will be accounted by btrfs_qgroup_account_ref. 3455 * However, btrfs_qgroup_account_ref may be right after its call 3456 * to btrfs_find_all_roots, in which case it would still do the 3457 * accounting. 3458 * To solve this, we're committing the transaction, which will 3459 * ensure we run all delayed refs and only after that, we are 3460 * going to clear all tracking information for a clean start. 
3461 */ 3462 3463 trans = btrfs_join_transaction(fs_info->fs_root); 3464 if (IS_ERR(trans)) { 3465 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3466 return PTR_ERR(trans); 3467 } 3468 ret = btrfs_commit_transaction(trans); 3469 if (ret) { 3470 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3471 return ret; 3472 } 3473 3474 qgroup_rescan_zero_tracking(fs_info); 3475 3476 mutex_lock(&fs_info->qgroup_rescan_lock); 3477 fs_info->qgroup_rescan_running = true; 3478 btrfs_queue_work(fs_info->qgroup_rescan_workers, 3479 &fs_info->qgroup_rescan_work); 3480 mutex_unlock(&fs_info->qgroup_rescan_lock); 3481 3482 return 0; 3483 } 3484 3485 int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, 3486 bool interruptible) 3487 { 3488 int running; 3489 int ret = 0; 3490 3491 mutex_lock(&fs_info->qgroup_rescan_lock); 3492 running = fs_info->qgroup_rescan_running; 3493 mutex_unlock(&fs_info->qgroup_rescan_lock); 3494 3495 if (!running) 3496 return 0; 3497 3498 if (interruptible) 3499 ret = wait_for_completion_interruptible( 3500 &fs_info->qgroup_rescan_completion); 3501 else 3502 wait_for_completion(&fs_info->qgroup_rescan_completion); 3503 3504 return ret; 3505 } 3506 3507 /* 3508 * this is only called from open_ctree where we're still single threaded, thus 3509 * locking is omitted here. 3510 */ 3511 void 3512 btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) 3513 { 3514 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 3515 mutex_lock(&fs_info->qgroup_rescan_lock); 3516 fs_info->qgroup_rescan_running = true; 3517 btrfs_queue_work(fs_info->qgroup_rescan_workers, 3518 &fs_info->qgroup_rescan_work); 3519 mutex_unlock(&fs_info->qgroup_rescan_lock); 3520 } 3521 } 3522 3523 #define rbtree_iterate_from_safe(node, next, start) \ 3524 for (node = start; node && ({ next = rb_next(node); 1;}); node = next) 3525 3526 static int qgroup_unreserve_range(struct btrfs_inode *inode, 3527 struct extent_changeset *reserved, u64 start, 3528 u64 len) 3529 { 3530 struct rb_node *node; 3531 struct rb_node *next; 3532 struct ulist_node *entry; 3533 int ret = 0; 3534 3535 node = reserved->range_changed.root.rb_node; 3536 if (!node) 3537 return 0; 3538 while (node) { 3539 entry = rb_entry(node, struct ulist_node, rb_node); 3540 if (entry->val < start) 3541 node = node->rb_right; 3542 else 3543 node = node->rb_left; 3544 } 3545 3546 if (entry->val > start && rb_prev(&entry->rb_node)) 3547 entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node, 3548 rb_node); 3549 3550 rbtree_iterate_from_safe(node, next, &entry->rb_node) { 3551 u64 entry_start; 3552 u64 entry_end; 3553 u64 entry_len; 3554 int clear_ret; 3555 3556 entry = rb_entry(node, struct ulist_node, rb_node); 3557 entry_start = entry->val; 3558 entry_end = entry->aux; 3559 entry_len = entry_end - entry_start + 1; 3560 3561 if (entry_start >= start + len) 3562 break; 3563 if (entry_start + entry_len <= start) 3564 continue; 3565 /* 3566 * Now the entry is in [start, start + len), revert the 3567 * EXTENT_QGROUP_RESERVED bit. 
3568 */ 3569 clear_ret = clear_extent_bits(&inode->io_tree, entry_start, 3570 entry_end, EXTENT_QGROUP_RESERVED); 3571 if (!ret && clear_ret < 0) 3572 ret = clear_ret; 3573 3574 ulist_del(&reserved->range_changed, entry->val, entry->aux); 3575 if (likely(reserved->bytes_changed >= entry_len)) { 3576 reserved->bytes_changed -= entry_len; 3577 } else { 3578 WARN_ON(1); 3579 reserved->bytes_changed = 0; 3580 } 3581 } 3582 3583 return ret; 3584 } 3585 3586 /* 3587 * Try to free some space for qgroup. 3588 * 3589 * For qgroup, there are only 3 ways to free qgroup space: 3590 * - Flush nodatacow write 3591 * Any nodatacow write will free its reserved data space at run_delalloc_range(). 3592 * In theory, we should only flush nodatacow inodes, but it's not yet 3593 * possible, so we need to flush the whole root. 3594 * 3595 * - Wait for ordered extents 3596 * When ordered extents are finished, their reserved metadata is finally 3597 * converted to per_trans status, which can be freed by later commit 3598 * transaction. 3599 * 3600 * - Commit transaction 3601 * This would free the meta_per_trans space. 3602 * In theory this shouldn't provide much space, but any more qgroup space 3603 * is needed. 3604 */ 3605 static int try_flush_qgroup(struct btrfs_root *root) 3606 { 3607 struct btrfs_trans_handle *trans; 3608 int ret; 3609 3610 /* Can't hold an open transaction or we run the risk of deadlocking. */ 3611 ASSERT(current->journal_info == NULL); 3612 if (WARN_ON(current->journal_info)) 3613 return 0; 3614 3615 /* 3616 * We don't want to run flush again and again, so if there is a running 3617 * one, we won't try to start a new flush, but exit directly. 3618 */ 3619 if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) { 3620 wait_event(root->qgroup_flush_wait, 3621 !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)); 3622 return 0; 3623 } 3624 3625 ret = btrfs_start_delalloc_snapshot(root, true); 3626 if (ret < 0) 3627 goto out; 3628 btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); 3629 3630 trans = btrfs_join_transaction(root); 3631 if (IS_ERR(trans)) { 3632 ret = PTR_ERR(trans); 3633 goto out; 3634 } 3635 3636 ret = btrfs_commit_transaction(trans); 3637 out: 3638 clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); 3639 wake_up(&root->qgroup_flush_wait); 3640 return ret; 3641 } 3642 3643 static int qgroup_reserve_data(struct btrfs_inode *inode, 3644 struct extent_changeset **reserved_ret, u64 start, 3645 u64 len) 3646 { 3647 struct btrfs_root *root = inode->root; 3648 struct extent_changeset *reserved; 3649 bool new_reserved = false; 3650 u64 orig_reserved; 3651 u64 to_reserve; 3652 int ret; 3653 3654 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || 3655 !is_fstree(root->root_key.objectid) || len == 0) 3656 return 0; 3657 3658 /* @reserved parameter is mandatory for qgroup */ 3659 if (WARN_ON(!reserved_ret)) 3660 return -EINVAL; 3661 if (!*reserved_ret) { 3662 new_reserved = true; 3663 *reserved_ret = extent_changeset_alloc(); 3664 if (!*reserved_ret) 3665 return -ENOMEM; 3666 } 3667 reserved = *reserved_ret; 3668 /* Record already reserved space */ 3669 orig_reserved = reserved->bytes_changed; 3670 ret = set_record_extent_bits(&inode->io_tree, start, 3671 start + len -1, EXTENT_QGROUP_RESERVED, reserved); 3672 3673 /* Newly reserved space */ 3674 to_reserve = reserved->bytes_changed - orig_reserved; 3675 trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len, 3676 to_reserve, QGROUP_RESERVE); 3677 if (ret < 0) 3678 goto out; 3679 ret = qgroup_reserve(root, 
to_reserve, true, BTRFS_QGROUP_RSV_DATA); 3680 if (ret < 0) 3681 goto cleanup; 3682 3683 return ret; 3684 3685 cleanup: 3686 qgroup_unreserve_range(inode, reserved, start, len); 3687 out: 3688 if (new_reserved) { 3689 extent_changeset_free(reserved); 3690 *reserved_ret = NULL; 3691 } 3692 return ret; 3693 } 3694 3695 /* 3696 * Reserve qgroup space for range [start, start + len). 3697 * 3698 * This function will either reserve space from related qgroups or do nothing 3699 * if the range is already reserved. 3700 * 3701 * Return 0 for successful reservation 3702 * Return <0 for error (including -EDQUOT) 3703 * 3704 * NOTE: This function may sleep for memory allocation, dirty page flushing and 3705 * transaction commit. So the caller should not hold any dirty page locked. 3706 */ 3707 int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, 3708 struct extent_changeset **reserved_ret, u64 start, 3709 u64 len) 3710 { 3711 int ret; 3712 3713 ret = qgroup_reserve_data(inode, reserved_ret, start, len); 3714 if (ret <= 0 && ret != -EDQUOT) 3715 return ret; 3716 3717 ret = try_flush_qgroup(inode->root); 3718 if (ret < 0) 3719 return ret; 3720 return qgroup_reserve_data(inode, reserved_ret, start, len); 3721 } 3722 3723 /* Free ranges specified by @reserved, normally in error path */ 3724 static int qgroup_free_reserved_data(struct btrfs_inode *inode, 3725 struct extent_changeset *reserved, u64 start, u64 len) 3726 { 3727 struct btrfs_root *root = inode->root; 3728 struct ulist_node *unode; 3729 struct ulist_iterator uiter; 3730 struct extent_changeset changeset; 3731 int freed = 0; 3732 int ret; 3733 3734 extent_changeset_init(&changeset); 3735 len = round_up(start + len, root->fs_info->sectorsize); 3736 start = round_down(start, root->fs_info->sectorsize); 3737 3738 ULIST_ITER_INIT(&uiter); 3739 while ((unode = ulist_next(&reserved->range_changed, &uiter))) { 3740 u64 range_start = unode->val; 3741 /* unode->aux is the inclusive end */ 3742 u64 range_len = unode->aux - range_start + 1; 3743 u64 free_start; 3744 u64 free_len; 3745 3746 extent_changeset_release(&changeset); 3747 3748 /* Only free ranges that overlap [start, start + len) */ 3749 if (range_start >= start + len || 3750 range_start + range_len <= start) 3751 continue; 3752 free_start = max(range_start, start); 3753 free_len = min(start + len, range_start + range_len) - 3754 free_start; 3755 /* 3756 * TODO: Also modify reserved->ranges_reserved to reflect 3757 * the modification. 3758 * 3759 * However as long as we free qgroup reserved according to 3760 * EXTENT_QGROUP_RESERVED, we won't double free. 3761 * So no need to rush.
3762 */ 3763 ret = clear_record_extent_bits(&inode->io_tree, free_start, 3764 free_start + free_len - 1, 3765 EXTENT_QGROUP_RESERVED, &changeset); 3766 if (ret < 0) 3767 goto out; 3768 freed += changeset.bytes_changed; 3769 } 3770 btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed, 3771 BTRFS_QGROUP_RSV_DATA); 3772 ret = freed; 3773 out: 3774 extent_changeset_release(&changeset); 3775 return ret; 3776 } 3777 3778 static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, 3779 struct extent_changeset *reserved, u64 start, u64 len, 3780 int free) 3781 { 3782 struct extent_changeset changeset; 3783 int trace_op = QGROUP_RELEASE; 3784 int ret; 3785 3786 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags)) 3787 return 0; 3788 3789 /* In release case, we shouldn't have @reserved */ 3790 WARN_ON(!free && reserved); 3791 if (free && reserved) 3792 return qgroup_free_reserved_data(inode, reserved, start, len); 3793 extent_changeset_init(&changeset); 3794 ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, 3795 EXTENT_QGROUP_RESERVED, &changeset); 3796 if (ret < 0) 3797 goto out; 3798 3799 if (free) 3800 trace_op = QGROUP_FREE; 3801 trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len, 3802 changeset.bytes_changed, trace_op); 3803 if (free) 3804 btrfs_qgroup_free_refroot(inode->root->fs_info, 3805 inode->root->root_key.objectid, 3806 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); 3807 ret = changeset.bytes_changed; 3808 out: 3809 extent_changeset_release(&changeset); 3810 return ret; 3811 } 3812 3813 /* 3814 * Free a reserved space range from io_tree and related qgroups 3815 * 3816 * Should be called when a range of pages get invalidated before reaching disk. 3817 * Or for error cleanup case. 3818 * if @reserved is given, only reserved range in [@start, @start + @len) will 3819 * be freed. 3820 * 3821 * For data written to disk, use btrfs_qgroup_release_data(). 3822 * 3823 * NOTE: This function may sleep for memory allocation. 3824 */ 3825 int btrfs_qgroup_free_data(struct btrfs_inode *inode, 3826 struct extent_changeset *reserved, u64 start, u64 len) 3827 { 3828 return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); 3829 } 3830 3831 /* 3832 * Release a reserved space range from io_tree only. 3833 * 3834 * Should be called when a range of pages get written to disk and corresponding 3835 * FILE_EXTENT is inserted into corresponding root. 3836 * 3837 * Since new qgroup accounting framework will only update qgroup numbers at 3838 * commit_transaction() time, its reserved space shouldn't be freed from 3839 * related qgroups. 3840 * 3841 * But we should release the range from io_tree, to allow further write to be 3842 * COWed. 3843 * 3844 * NOTE: This function may sleep for memory allocation. 
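 *
 * A rough lifecycle sketch of data reservations, using only helpers from this
 * file: the write path reserves with btrfs_qgroup_reserve_data(); once the
 * matching file extent item is on disk the range is released with
 * btrfs_qgroup_release_data(); error and invalidation paths instead call
 * btrfs_qgroup_free_data(), which also returns the bytes to the qgroup
 * counters.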
3845 */ 3846 int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len) 3847 { 3848 return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); 3849 } 3850 3851 static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes, 3852 enum btrfs_qgroup_rsv_type type) 3853 { 3854 if (type != BTRFS_QGROUP_RSV_META_PREALLOC && 3855 type != BTRFS_QGROUP_RSV_META_PERTRANS) 3856 return; 3857 if (num_bytes == 0) 3858 return; 3859 3860 spin_lock(&root->qgroup_meta_rsv_lock); 3861 if (type == BTRFS_QGROUP_RSV_META_PREALLOC) 3862 root->qgroup_meta_rsv_prealloc += num_bytes; 3863 else 3864 root->qgroup_meta_rsv_pertrans += num_bytes; 3865 spin_unlock(&root->qgroup_meta_rsv_lock); 3866 } 3867 3868 static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes, 3869 enum btrfs_qgroup_rsv_type type) 3870 { 3871 if (type != BTRFS_QGROUP_RSV_META_PREALLOC && 3872 type != BTRFS_QGROUP_RSV_META_PERTRANS) 3873 return 0; 3874 if (num_bytes == 0) 3875 return 0; 3876 3877 spin_lock(&root->qgroup_meta_rsv_lock); 3878 if (type == BTRFS_QGROUP_RSV_META_PREALLOC) { 3879 num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc, 3880 num_bytes); 3881 root->qgroup_meta_rsv_prealloc -= num_bytes; 3882 } else { 3883 num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans, 3884 num_bytes); 3885 root->qgroup_meta_rsv_pertrans -= num_bytes; 3886 } 3887 spin_unlock(&root->qgroup_meta_rsv_lock); 3888 return num_bytes; 3889 } 3890 3891 int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, 3892 enum btrfs_qgroup_rsv_type type, bool enforce) 3893 { 3894 struct btrfs_fs_info *fs_info = root->fs_info; 3895 int ret; 3896 3897 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 3898 !is_fstree(root->root_key.objectid) || num_bytes == 0) 3899 return 0; 3900 3901 BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); 3902 trace_qgroup_meta_reserve(root, (s64)num_bytes, type); 3903 ret = qgroup_reserve(root, num_bytes, enforce, type); 3904 if (ret < 0) 3905 return ret; 3906 /* 3907 * Record what we have reserved into root. 3908 * 3909 * To avoid quota disabled->enabled underflow. 3910 * In that case, we may try to free space we haven't reserved 3911 * (since quota was disabled), so record what we reserved into root. 3912 * And ensure later release won't underflow this number. 
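 *
 * Example of the sequence this guards against: metadata is reserved while
 * quota is disabled (so nothing is recorded), quota gets enabled, and then the
 * old reservation is released. Without the per-root bookkeeping in
 * add_root_meta_rsv()/sub_root_meta_rsv() we would free bytes that were never
 * added to the qgroup.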
3913 */ 3914 add_root_meta_rsv(root, num_bytes, type); 3915 return ret; 3916 } 3917 3918 int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, 3919 enum btrfs_qgroup_rsv_type type, bool enforce) 3920 { 3921 int ret; 3922 3923 ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); 3924 if (ret <= 0 && ret != -EDQUOT) 3925 return ret; 3926 3927 ret = try_flush_qgroup(root); 3928 if (ret < 0) 3929 return ret; 3930 return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); 3931 } 3932 3933 void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) 3934 { 3935 struct btrfs_fs_info *fs_info = root->fs_info; 3936 3937 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 3938 !is_fstree(root->root_key.objectid)) 3939 return; 3940 3941 /* TODO: Update trace point to handle such free */ 3942 trace_qgroup_meta_free_all_pertrans(root); 3943 /* Special value -1 means to free all reserved space */ 3944 btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1, 3945 BTRFS_QGROUP_RSV_META_PERTRANS); 3946 } 3947 3948 void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, 3949 enum btrfs_qgroup_rsv_type type) 3950 { 3951 struct btrfs_fs_info *fs_info = root->fs_info; 3952 3953 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 3954 !is_fstree(root->root_key.objectid)) 3955 return; 3956 3957 /* 3958 * reservation for META_PREALLOC can happen before quota is enabled, 3959 * which can lead to underflow. 3960 * Here ensure we will only free what we really have reserved. 3961 */ 3962 num_bytes = sub_root_meta_rsv(root, num_bytes, type); 3963 BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); 3964 trace_qgroup_meta_reserve(root, -(s64)num_bytes, type); 3965 btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, 3966 num_bytes, type); 3967 } 3968 3969 static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, 3970 int num_bytes) 3971 { 3972 struct btrfs_qgroup *qgroup; 3973 struct ulist_node *unode; 3974 struct ulist_iterator uiter; 3975 int ret = 0; 3976 3977 if (num_bytes == 0) 3978 return; 3979 if (!fs_info->quota_root) 3980 return; 3981 3982 spin_lock(&fs_info->qgroup_lock); 3983 qgroup = find_qgroup_rb(fs_info, ref_root); 3984 if (!qgroup) 3985 goto out; 3986 ulist_reinit(fs_info->qgroup_ulist); 3987 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, 3988 qgroup_to_aux(qgroup), GFP_ATOMIC); 3989 if (ret < 0) 3990 goto out; 3991 ULIST_ITER_INIT(&uiter); 3992 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { 3993 struct btrfs_qgroup *qg; 3994 struct btrfs_qgroup_list *glist; 3995 3996 qg = unode_aux_to_qgroup(unode); 3997 3998 qgroup_rsv_release(fs_info, qg, num_bytes, 3999 BTRFS_QGROUP_RSV_META_PREALLOC); 4000 qgroup_rsv_add(fs_info, qg, num_bytes, 4001 BTRFS_QGROUP_RSV_META_PERTRANS); 4002 list_for_each_entry(glist, &qg->groups, next_group) { 4003 ret = ulist_add(fs_info->qgroup_ulist, 4004 glist->group->qgroupid, 4005 qgroup_to_aux(glist->group), GFP_ATOMIC); 4006 if (ret < 0) 4007 goto out; 4008 } 4009 } 4010 out: 4011 spin_unlock(&fs_info->qgroup_lock); 4012 } 4013 4014 void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) 4015 { 4016 struct btrfs_fs_info *fs_info = root->fs_info; 4017 4018 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 4019 !is_fstree(root->root_key.objectid)) 4020 return; 4021 /* Same as btrfs_qgroup_free_meta_prealloc() */ 4022 num_bytes = sub_root_meta_rsv(root, num_bytes, 4023 BTRFS_QGROUP_RSV_META_PREALLOC); 4024 
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
	    !is_fstree(root->root_key.objectid))
		return;
	/* Same as btrfs_qgroup_free_meta_prealloc() */
	num_bytes = sub_root_meta_rsv(root, num_bytes,
				      BTRFS_QGROUP_RSV_META_PREALLOC);
	trace_qgroup_meta_convert(root, num_bytes);
	qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
}

/*
 * Check for leaked qgroup reserved space, normally at inode destruction time.
 */
void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
{
	struct extent_changeset changeset;
	struct ulist_node *unode;
	struct ulist_iterator iter;
	int ret;

	extent_changeset_init(&changeset);
	ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
				       EXTENT_QGROUP_RESERVED, &changeset);

	WARN_ON(ret < 0);
	if (WARN_ON(changeset.bytes_changed)) {
		ULIST_ITER_INIT(&iter);
		while ((unode = ulist_next(&changeset.range_changed, &iter))) {
			btrfs_warn(inode->root->fs_info,
		"leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
				   btrfs_ino(inode), unode->val, unode->aux);
		}
		btrfs_qgroup_free_refroot(inode->root->fs_info,
					  inode->root->root_key.objectid,
					  changeset.bytes_changed,
					  BTRFS_QGROUP_RSV_DATA);
	}
	extent_changeset_release(&changeset);
}

void btrfs_qgroup_init_swapped_blocks(
	struct btrfs_qgroup_swapped_blocks *swapped_blocks)
{
	int i;

	spin_lock_init(&swapped_blocks->lock);
	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
		swapped_blocks->blocks[i] = RB_ROOT;
	swapped_blocks->swapped = false;
}

/*
 * Delete all swapped block records of @root.
 * Every record here means we skipped a full subtree scan for qgroup.
 *
 * Gets called when committing one transaction.
 */
void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root)
{
	struct btrfs_qgroup_swapped_blocks *swapped_blocks;
	int i;

	swapped_blocks = &root->swapped_blocks;

	spin_lock(&swapped_blocks->lock);
	if (!swapped_blocks->swapped)
		goto out;
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		struct rb_root *cur_root = &swapped_blocks->blocks[i];
		struct btrfs_qgroup_swapped_block *entry;
		struct btrfs_qgroup_swapped_block *next;

		rbtree_postorder_for_each_entry_safe(entry, next, cur_root,
						     node)
			kfree(entry);
		swapped_blocks->blocks[i] = RB_ROOT;
	}
	swapped_blocks->swapped = false;
out:
	spin_unlock(&swapped_blocks->lock);
}
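/*
 * A rough lifecycle sketch (illustrative only) of the swapped blocks
 * mechanism used by the functions above and below.  All variable names
 * (trans, subvol_root, bg, subvol_eb, ...) are hypothetical caller context:
 *
 *	btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
 *		// when the root structure is set up
 *
 *	btrfs_qgroup_add_swapped_blocks(trans, subvol_root, bg,
 *					subvol_parent, subvol_slot,
 *					reloc_parent, reloc_slot,
 *					last_snapshot);
 *		// during balance, when two subtree roots get swapped,
 *		// instead of tracing the whole subtree right away
 *
 *	btrfs_qgroup_trace_subtree_after_cow(trans, root, subvol_eb);
 *		// later, when the recorded subtree root is COWed, the
 *		// delayed trace is performed and the record is consumed
 *
 *	btrfs_qgroup_clean_swapped_blocks(root);
 *		// at transaction commit, any remaining records are dropped
 */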
/*
 * Add a record of the swapped subtree roots for @subvol_root.
 *
 * @subvol_root:	tree root of the subvolume tree that got swapped
 * @bg:			block group under balance
 * @subvol_parent/slot:	pointer to the subtree root in the subvolume tree
 * @reloc_parent/slot:	pointer to the subtree root in the reloc tree
 *			BOTH POINTERS ARE BEFORE TREE SWAP
 * @last_snapshot:	last snapshot generation of the subvolume tree
 */
int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
		struct btrfs_root *subvol_root,
		struct btrfs_block_group *bg,
		struct extent_buffer *subvol_parent, int subvol_slot,
		struct extent_buffer *reloc_parent, int reloc_slot,
		u64 last_snapshot)
{
	struct btrfs_fs_info *fs_info = subvol_root->fs_info;
	struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks;
	struct btrfs_qgroup_swapped_block *block;
	struct rb_node **cur;
	struct rb_node *parent = NULL;
	int level = btrfs_header_level(subvol_parent) - 1;
	int ret = 0;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;

	if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) >
	    btrfs_node_ptr_generation(reloc_parent, reloc_slot)) {
		btrfs_err_rl(fs_info,
		"%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu",
			__func__,
			btrfs_node_ptr_generation(subvol_parent, subvol_slot),
			btrfs_node_ptr_generation(reloc_parent, reloc_slot));
		return -EUCLEAN;
	}

	block = kmalloc(sizeof(*block), GFP_NOFS);
	if (!block) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * @reloc_parent/slot is still before swap, while @block is going to
	 * record the bytenr after swap, so we do the swap here.
	 */
	block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot);
	block->subvol_generation = btrfs_node_ptr_generation(reloc_parent,
							     reloc_slot);
	block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot);
	block->reloc_generation = btrfs_node_ptr_generation(subvol_parent,
							    subvol_slot);
	block->last_snapshot = last_snapshot;
	block->level = level;

	/*
	 * If we have bg == NULL, we're called from btrfs_recover_relocation()
	 * and no one else can modify tree blocks, so the qgroup numbers will
	 * not change no matter what trace_leaf is set to.
	 */
	if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA)
		block->trace_leaf = true;
	else
		block->trace_leaf = false;
	btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot);

	/* Insert @block into @blocks */
	spin_lock(&blocks->lock);
	cur = &blocks->blocks[level].rb_node;
	while (*cur) {
		struct btrfs_qgroup_swapped_block *entry;

		parent = *cur;
		entry = rb_entry(parent, struct btrfs_qgroup_swapped_block,
				 node);

		if (entry->subvol_bytenr < block->subvol_bytenr) {
			cur = &(*cur)->rb_left;
		} else if (entry->subvol_bytenr > block->subvol_bytenr) {
			cur = &(*cur)->rb_right;
		} else {
			if (entry->subvol_generation !=
					block->subvol_generation ||
			    entry->reloc_bytenr != block->reloc_bytenr ||
			    entry->reloc_generation !=
					block->reloc_generation) {
				/*
				 * Duplicate entry found, but its content does
				 * not match.  This shouldn't happen.
				 *
				 * Marking the qgroup inconsistent should be
				 * enough for end users.
				 */
				WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
				ret = -EEXIST;
			}
			kfree(block);
			goto out_unlock;
		}
	}
	rb_link_node(&block->node, parent, cur);
	rb_insert_color(&block->node, &blocks->blocks[level]);
	blocks->swapped = true;
out_unlock:
	spin_unlock(&blocks->lock);
out:
	if (ret < 0)
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	return ret;
}
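/*
 * A concrete, hypothetical example of the "crossed over" recording above:
 * suppose balance swaps a subtree rooted at bytenr A in the subvolume tree
 * with a reloc tree subtree rooted at bytenr B.  Both @subvol_parent and
 * @reloc_parent still describe the pre-swap layout, so the record stores:
 *
 *	block->subvol_bytenr = B;	// what the subvolume tree points to
 *					// after the swap
 *	block->reloc_bytenr  = A;	// what the reloc tree points to
 *					// after the swap
 *
 * The record is keyed by block->subvol_bytenr, so a later COW of the block
 * at bytenr B in the subvolume tree can find it and trigger the delayed
 * subtree trace below.
 */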
/*
 * Check if the tree block is a subtree root, and if so do the needed
 * delayed subtree trace for qgroup.
 *
 * This is called during btrfs_cow_block().
 */
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct extent_buffer *subvol_eb)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
	struct btrfs_qgroup_swapped_block *block;
	struct extent_buffer *reloc_eb = NULL;
	struct rb_node *node;
	bool found = false;
	bool swapped = false;
	int level = btrfs_header_level(subvol_eb);
	int ret = 0;
	int i;

	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return 0;
	if (!is_fstree(root->root_key.objectid) || !root->reloc_root)
		return 0;

	spin_lock(&blocks->lock);
	if (!blocks->swapped) {
		spin_unlock(&blocks->lock);
		return 0;
	}
	node = blocks->blocks[level].rb_node;

	while (node) {
		block = rb_entry(node, struct btrfs_qgroup_swapped_block, node);
		if (block->subvol_bytenr < subvol_eb->start) {
			node = node->rb_left;
		} else if (block->subvol_bytenr > subvol_eb->start) {
			node = node->rb_right;
		} else {
			found = true;
			break;
		}
	}
	if (!found) {
		spin_unlock(&blocks->lock);
		goto out;
	}
	/* Found one, remove it from @blocks first and update blocks->swapped */
	rb_erase(&block->node, &blocks->blocks[level]);
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		if (!RB_EMPTY_ROOT(&blocks->blocks[i])) {
			swapped = true;
			break;
		}
	}
	blocks->swapped = swapped;
	spin_unlock(&blocks->lock);

	/* Read out reloc subtree root */
	reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0,
				   block->reloc_generation, block->level,
				   &block->first_key);
	if (IS_ERR(reloc_eb)) {
		ret = PTR_ERR(reloc_eb);
		reloc_eb = NULL;
		goto free_out;
	}
	if (!extent_buffer_uptodate(reloc_eb)) {
		ret = -EIO;
		goto free_out;
	}

	ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb,
					block->last_snapshot, block->trace_leaf);
free_out:
	kfree(block);
	free_extent_buffer(reloc_eb);
out:
	if (ret < 0) {
		btrfs_err_rl(fs_info,
			     "failed to account subtree at bytenr %llu: %d",
			     subvol_eb->start, ret);
		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
	}
	return ret;
}

void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
{
	struct btrfs_qgroup_extent_record *entry;
	struct btrfs_qgroup_extent_record *next;
	struct rb_root *root;

	root = &trans->delayed_refs.dirty_extent_root;
	rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
		ulist_free(entry->old_roots);
		kfree(entry);
	}
}
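/*
 * A rough sketch (illustrative only) of where
 * btrfs_qgroup_destroy_extent_records() fits.  It is meant for a transaction
 * that is being torn down after an abort rather than committed, so the
 * recorded extents will never go through qgroup accounting:
 *
 *	// transaction abort/cleanup path
 *	btrfs_qgroup_destroy_extent_records(trans);
 *	// frees every dirty extent record and its old_roots ulist instead
 *	// of leaking them
 */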