1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2011 STRATO. All rights reserved. 4 */ 5 6 #include <linux/sched.h> 7 #include <linux/pagemap.h> 8 #include <linux/writeback.h> 9 #include <linux/blkdev.h> 10 #include <linux/rbtree.h> 11 #include <linux/slab.h> 12 #include <linux/workqueue.h> 13 #include <linux/btrfs.h> 14 #include <linux/sched/mm.h> 15 16 #include "ctree.h" 17 #include "transaction.h" 18 #include "disk-io.h" 19 #include "locking.h" 20 #include "ulist.h" 21 #include "backref.h" 22 #include "extent_io.h" 23 #include "qgroup.h" 24 #include "block-group.h" 25 #include "sysfs.h" 26 #include "tree-mod-log.h" 27 28 /* TODO XXX FIXME 29 * - subvol delete -> delete when ref goes to 0? delete limits also? 30 * - reorganize keys 31 * - compressed 32 * - sync 33 * - copy also limits on subvol creation 34 * - limit 35 * - caches for ulists 36 * - performance benchmarks 37 * - check all ioctl parameters 38 */ 39 40 /* 41 * Helpers to access qgroup reservation 42 * 43 * Callers should ensure the lock context and type are valid 44 */ 45 46 static u64 qgroup_rsv_total(const struct btrfs_qgroup *qgroup) 47 { 48 u64 ret = 0; 49 int i; 50 51 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) 52 ret += qgroup->rsv.values[i]; 53 54 return ret; 55 } 56 57 #ifdef CONFIG_BTRFS_DEBUG 58 static const char *qgroup_rsv_type_str(enum btrfs_qgroup_rsv_type type) 59 { 60 if (type == BTRFS_QGROUP_RSV_DATA) 61 return "data"; 62 if (type == BTRFS_QGROUP_RSV_META_PERTRANS) 63 return "meta_pertrans"; 64 if (type == BTRFS_QGROUP_RSV_META_PREALLOC) 65 return "meta_prealloc"; 66 return NULL; 67 } 68 #endif 69 70 static void qgroup_rsv_add(struct btrfs_fs_info *fs_info, 71 struct btrfs_qgroup *qgroup, u64 num_bytes, 72 enum btrfs_qgroup_rsv_type type) 73 { 74 trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type); 75 qgroup->rsv.values[type] += num_bytes; 76 } 77 78 static void qgroup_rsv_release(struct btrfs_fs_info *fs_info, 79 struct btrfs_qgroup *qgroup, u64 num_bytes, 80 enum btrfs_qgroup_rsv_type type) 81 { 82 trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type); 83 if (qgroup->rsv.values[type] >= num_bytes) { 84 qgroup->rsv.values[type] -= num_bytes; 85 return; 86 } 87 #ifdef CONFIG_BTRFS_DEBUG 88 WARN_RATELIMIT(1, 89 "qgroup %llu %s reserved space underflow, have %llu to free %llu", 90 qgroup->qgroupid, qgroup_rsv_type_str(type), 91 qgroup->rsv.values[type], num_bytes); 92 #endif 93 qgroup->rsv.values[type] = 0; 94 } 95 96 static void qgroup_rsv_add_by_qgroup(struct btrfs_fs_info *fs_info, 97 struct btrfs_qgroup *dest, 98 struct btrfs_qgroup *src) 99 { 100 int i; 101 102 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) 103 qgroup_rsv_add(fs_info, dest, src->rsv.values[i], i); 104 } 105 106 static void qgroup_rsv_release_by_qgroup(struct btrfs_fs_info *fs_info, 107 struct btrfs_qgroup *dest, 108 struct btrfs_qgroup *src) 109 { 110 int i; 111 112 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) 113 qgroup_rsv_release(fs_info, dest, src->rsv.values[i], i); 114 } 115 116 static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, 117 int mod) 118 { 119 if (qg->old_refcnt < seq) 120 qg->old_refcnt = seq; 121 qg->old_refcnt += mod; 122 } 123 124 static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq, 125 int mod) 126 { 127 if (qg->new_refcnt < seq) 128 qg->new_refcnt = seq; 129 qg->new_refcnt += mod; 130 } 131 132 static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq) 133 { 134 if (qg->old_refcnt < seq) 135 return 0; 136 return qg->old_refcnt - seq; 137 } 138 139 static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq) 140 { 141 if (qg->new_refcnt < seq) 142 return 0; 143 return qg->new_refcnt - seq; 144 } 145 146 /* 147 * glue structure to represent the relations between qgroups. 148 */ 149 struct btrfs_qgroup_list { 150 struct list_head next_group; 151 struct list_head next_member; 152 struct btrfs_qgroup *group; 153 struct btrfs_qgroup *member; 154 }; 155 156 static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg) 157 { 158 return (u64)(uintptr_t)qg; 159 } 160 161 static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n) 162 { 163 return (struct btrfs_qgroup *)(uintptr_t)n->aux; 164 } 165 166 static int 167 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, 168 int init_flags); 169 static void qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info); 170 171 /* must be called with qgroup_ioctl_lock held */ 172 static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, 173 u64 qgroupid) 174 { 175 struct rb_node *n = fs_info->qgroup_tree.rb_node; 176 struct btrfs_qgroup *qgroup; 177 178 while (n) { 179 qgroup = rb_entry(n, struct btrfs_qgroup, node); 180 if (qgroup->qgroupid < qgroupid) 181 n = n->rb_left; 182 else if (qgroup->qgroupid > qgroupid) 183 n = n->rb_right; 184 else 185 return qgroup; 186 } 187 return NULL; 188 } 189 190 /* must be called with qgroup_lock held */ 191 static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, 192 u64 qgroupid) 193 { 194 struct rb_node **p = &fs_info->qgroup_tree.rb_node; 195 struct rb_node *parent = NULL; 196 struct btrfs_qgroup *qgroup; 197 198 while (*p) { 199 parent = *p; 200 qgroup = rb_entry(parent, struct btrfs_qgroup, node); 201 202 if (qgroup->qgroupid < qgroupid) 203 p = &(*p)->rb_left; 204 else if (qgroup->qgroupid > qgroupid) 205 p = &(*p)->rb_right; 206 else 207 return qgroup; 208 } 209 210 qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC); 211 if (!qgroup) 212 return ERR_PTR(-ENOMEM); 213 214 qgroup->qgroupid = qgroupid; 215 INIT_LIST_HEAD(&qgroup->groups); 216 INIT_LIST_HEAD(&qgroup->members); 217 INIT_LIST_HEAD(&qgroup->dirty); 218 219 rb_link_node(&qgroup->node, parent, p); 220 rb_insert_color(&qgroup->node, &fs_info->qgroup_tree); 221 222 return qgroup; 223 } 224 225 static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, 226 struct btrfs_qgroup *qgroup) 227 { 228 struct btrfs_qgroup_list *list; 229 230 list_del(&qgroup->dirty); 231 while (!list_empty(&qgroup->groups)) { 232 list = list_first_entry(&qgroup->groups, 233 struct btrfs_qgroup_list, next_group); 234 list_del(&list->next_group); 235 list_del(&list->next_member); 236 kfree(list); 237 } 238 239 while (!list_empty(&qgroup->members)) { 240 list = list_first_entry(&qgroup->members, 241 struct btrfs_qgroup_list, next_member); 242 list_del(&list->next_group); 243 list_del(&list->next_member); 244 kfree(list); 245 } 246 } 247 248 /* must be called with qgroup_lock held */ 249 static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) 250 { 251 struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid); 252 253 if (!qgroup) 254 return -ENOENT; 255 256 rb_erase(&qgroup->node, &fs_info->qgroup_tree); 257 __del_qgroup_rb(fs_info, qgroup); 258 return 0; 259 } 260 261 /* must be called with qgroup_lock held */ 262 static int add_relation_rb(struct btrfs_fs_info *fs_info, 263 u64 memberid, u64 parentid) 264 { 265 struct btrfs_qgroup *member; 266 struct btrfs_qgroup *parent; 267 struct btrfs_qgroup_list *list; 268 269 member = find_qgroup_rb(fs_info, memberid); 270 parent = find_qgroup_rb(fs_info, parentid); 271 if (!member || !parent) 272 return -ENOENT; 273 274 list = kzalloc(sizeof(*list), GFP_ATOMIC); 275 if (!list) 276 return -ENOMEM; 277 278 list->group = parent; 279 list->member = member; 280 list_add_tail(&list->next_group, &member->groups); 281 list_add_tail(&list->next_member, &parent->members); 282 283 return 0; 284 } 285 286 /* must be called with qgroup_lock held */ 287 static int del_relation_rb(struct btrfs_fs_info *fs_info, 288 u64 memberid, u64 parentid) 289 { 290 struct btrfs_qgroup *member; 291 struct btrfs_qgroup *parent; 292 struct btrfs_qgroup_list *list; 293 294 member = find_qgroup_rb(fs_info, memberid); 295 parent = find_qgroup_rb(fs_info, parentid); 296 if (!member || !parent) 297 return -ENOENT; 298 299 list_for_each_entry(list, &member->groups, next_group) { 300 if (list->group == parent) { 301 list_del(&list->next_group); 302 list_del(&list->next_member); 303 kfree(list); 304 return 0; 305 } 306 } 307 return -ENOENT; 308 } 309 310 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 311 int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, 312 u64 rfer, u64 excl) 313 { 314 struct btrfs_qgroup *qgroup; 315 316 qgroup = find_qgroup_rb(fs_info, qgroupid); 317 if (!qgroup) 318 return -EINVAL; 319 if (qgroup->rfer != rfer || qgroup->excl != excl) 320 return -EINVAL; 321 return 0; 322 } 323 #endif 324 325 /* 326 * The full config is read in one go, only called from open_ctree() 327 * It doesn't use any locking, as at this point we're still single-threaded 328 */ 329 int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) 330 { 331 struct btrfs_key key; 332 struct btrfs_key found_key; 333 struct btrfs_root *quota_root = fs_info->quota_root; 334 struct btrfs_path *path = NULL; 335 struct extent_buffer *l; 336 int slot; 337 int ret = 0; 338 u64 flags = 0; 339 u64 rescan_progress = 0; 340 341 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 342 return 0; 343 344 fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); 345 if (!fs_info->qgroup_ulist) { 346 ret = -ENOMEM; 347 goto out; 348 } 349 350 path = btrfs_alloc_path(); 351 if (!path) { 352 ret = -ENOMEM; 353 goto out; 354 } 355 356 ret = btrfs_sysfs_add_qgroups(fs_info); 357 if (ret < 0) 358 goto out; 359 /* default this to quota off, in case no status key is found */ 360 fs_info->qgroup_flags = 0; 361 362 /* 363 * pass 1: read status, all qgroup infos and limits 364 */ 365 key.objectid = 0; 366 key.type = 0; 367 key.offset = 0; 368 ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 1); 369 if (ret) 370 goto out; 371 372 while (1) { 373 struct btrfs_qgroup *qgroup; 374 375 slot = path->slots[0]; 376 l = path->nodes[0]; 377 btrfs_item_key_to_cpu(l, &found_key, slot); 378 379 if (found_key.type == BTRFS_QGROUP_STATUS_KEY) { 380 struct btrfs_qgroup_status_item *ptr; 381 382 ptr = btrfs_item_ptr(l, slot, 383 struct btrfs_qgroup_status_item); 384 385 if (btrfs_qgroup_status_version(l, ptr) != 386 BTRFS_QGROUP_STATUS_VERSION) { 387 btrfs_err(fs_info, 388 "old qgroup version, quota disabled"); 389 goto out; 390 } 391 if (btrfs_qgroup_status_generation(l, ptr) != 392 fs_info->generation) { 393 flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 394 btrfs_err(fs_info, 395 "qgroup generation mismatch, marked as inconsistent"); 396 } 397 fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, 398 ptr); 399 rescan_progress = btrfs_qgroup_status_rescan(l, ptr); 400 goto next1; 401 } 402 403 if (found_key.type != BTRFS_QGROUP_INFO_KEY && 404 found_key.type != BTRFS_QGROUP_LIMIT_KEY) 405 goto next1; 406 407 qgroup = find_qgroup_rb(fs_info, found_key.offset); 408 if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) || 409 (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) { 410 btrfs_err(fs_info, "inconsistent qgroup config"); 411 flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 412 } 413 if (!qgroup) { 414 qgroup = add_qgroup_rb(fs_info, found_key.offset); 415 if (IS_ERR(qgroup)) { 416 ret = PTR_ERR(qgroup); 417 goto out; 418 } 419 } 420 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 421 if (ret < 0) 422 goto out; 423 424 switch (found_key.type) { 425 case BTRFS_QGROUP_INFO_KEY: { 426 struct btrfs_qgroup_info_item *ptr; 427 428 ptr = btrfs_item_ptr(l, slot, 429 struct btrfs_qgroup_info_item); 430 qgroup->rfer = btrfs_qgroup_info_rfer(l, ptr); 431 qgroup->rfer_cmpr = btrfs_qgroup_info_rfer_cmpr(l, ptr); 432 qgroup->excl = btrfs_qgroup_info_excl(l, ptr); 433 qgroup->excl_cmpr = btrfs_qgroup_info_excl_cmpr(l, ptr); 434 /* generation currently unused */ 435 break; 436 } 437 case BTRFS_QGROUP_LIMIT_KEY: { 438 struct btrfs_qgroup_limit_item *ptr; 439 440 ptr = btrfs_item_ptr(l, slot, 441 struct btrfs_qgroup_limit_item); 442 qgroup->lim_flags = btrfs_qgroup_limit_flags(l, ptr); 443 qgroup->max_rfer = btrfs_qgroup_limit_max_rfer(l, ptr); 444 qgroup->max_excl = btrfs_qgroup_limit_max_excl(l, ptr); 445 qgroup->rsv_rfer = btrfs_qgroup_limit_rsv_rfer(l, ptr); 446 qgroup->rsv_excl = btrfs_qgroup_limit_rsv_excl(l, ptr); 447 break; 448 } 449 } 450 next1: 451 ret = btrfs_next_item(quota_root, path); 452 if (ret < 0) 453 goto out; 454 if (ret) 455 break; 456 } 457 btrfs_release_path(path); 458 459 /* 460 * pass 2: read all qgroup relations 461 */ 462 key.objectid = 0; 463 key.type = BTRFS_QGROUP_RELATION_KEY; 464 key.offset = 0; 465 ret = btrfs_search_slot_for_read(quota_root, &key, path, 1, 0); 466 if (ret) 467 goto out; 468 while (1) { 469 slot = path->slots[0]; 470 l = path->nodes[0]; 471 btrfs_item_key_to_cpu(l, &found_key, slot); 472 473 if (found_key.type != BTRFS_QGROUP_RELATION_KEY) 474 goto next2; 475 476 if (found_key.objectid > found_key.offset) { 477 /* parent <- member, not needed to build config */ 478 /* FIXME should we omit the key completely? */ 479 goto next2; 480 } 481 482 ret = add_relation_rb(fs_info, found_key.objectid, 483 found_key.offset); 484 if (ret == -ENOENT) { 485 btrfs_warn(fs_info, 486 "orphan qgroup relation 0x%llx->0x%llx", 487 found_key.objectid, found_key.offset); 488 ret = 0; /* ignore the error */ 489 } 490 if (ret) 491 goto out; 492 next2: 493 ret = btrfs_next_item(quota_root, path); 494 if (ret < 0) 495 goto out; 496 if (ret) 497 break; 498 } 499 out: 500 btrfs_free_path(path); 501 fs_info->qgroup_flags |= flags; 502 if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) 503 clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 504 else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN && 505 ret >= 0) 506 ret = qgroup_rescan_init(fs_info, rescan_progress, 0); 507 508 if (ret < 0) { 509 ulist_free(fs_info->qgroup_ulist); 510 fs_info->qgroup_ulist = NULL; 511 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 512 btrfs_sysfs_del_qgroups(fs_info); 513 } 514 515 return ret < 0 ? ret : 0; 516 } 517 518 /* 519 * Called in close_ctree() when quota is still enabled. This verifies we don't 520 * leak some reserved space. 521 * 522 * Return false if no reserved space is left. 523 * Return true if some reserved space is leaked. 524 */ 525 bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info) 526 { 527 struct rb_node *node; 528 bool ret = false; 529 530 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 531 return ret; 532 /* 533 * Since we're unmounting, there is no race and no need to grab qgroup 534 * lock. And here we don't go post-order to provide a more user 535 * friendly sorted result. 536 */ 537 for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) { 538 struct btrfs_qgroup *qgroup; 539 int i; 540 541 qgroup = rb_entry(node, struct btrfs_qgroup, node); 542 for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) { 543 if (qgroup->rsv.values[i]) { 544 ret = true; 545 btrfs_warn(fs_info, 546 "qgroup %hu/%llu has unreleased space, type %d rsv %llu", 547 btrfs_qgroup_level(qgroup->qgroupid), 548 btrfs_qgroup_subvolid(qgroup->qgroupid), 549 i, qgroup->rsv.values[i]); 550 } 551 } 552 } 553 return ret; 554 } 555 556 /* 557 * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(), 558 * first two are in single-threaded paths.And for the third one, we have set 559 * quota_root to be null with qgroup_lock held before, so it is safe to clean 560 * up the in-memory structures without qgroup_lock held. 561 */ 562 void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) 563 { 564 struct rb_node *n; 565 struct btrfs_qgroup *qgroup; 566 567 while ((n = rb_first(&fs_info->qgroup_tree))) { 568 qgroup = rb_entry(n, struct btrfs_qgroup, node); 569 rb_erase(n, &fs_info->qgroup_tree); 570 __del_qgroup_rb(fs_info, qgroup); 571 btrfs_sysfs_del_one_qgroup(fs_info, qgroup); 572 kfree(qgroup); 573 } 574 /* 575 * We call btrfs_free_qgroup_config() when unmounting 576 * filesystem and disabling quota, so we set qgroup_ulist 577 * to be null here to avoid double free. 578 */ 579 ulist_free(fs_info->qgroup_ulist); 580 fs_info->qgroup_ulist = NULL; 581 btrfs_sysfs_del_qgroups(fs_info); 582 } 583 584 static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, 585 u64 dst) 586 { 587 int ret; 588 struct btrfs_root *quota_root = trans->fs_info->quota_root; 589 struct btrfs_path *path; 590 struct btrfs_key key; 591 592 path = btrfs_alloc_path(); 593 if (!path) 594 return -ENOMEM; 595 596 key.objectid = src; 597 key.type = BTRFS_QGROUP_RELATION_KEY; 598 key.offset = dst; 599 600 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); 601 602 btrfs_mark_buffer_dirty(path->nodes[0]); 603 604 btrfs_free_path(path); 605 return ret; 606 } 607 608 static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, 609 u64 dst) 610 { 611 int ret; 612 struct btrfs_root *quota_root = trans->fs_info->quota_root; 613 struct btrfs_path *path; 614 struct btrfs_key key; 615 616 path = btrfs_alloc_path(); 617 if (!path) 618 return -ENOMEM; 619 620 key.objectid = src; 621 key.type = BTRFS_QGROUP_RELATION_KEY; 622 key.offset = dst; 623 624 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); 625 if (ret < 0) 626 goto out; 627 628 if (ret > 0) { 629 ret = -ENOENT; 630 goto out; 631 } 632 633 ret = btrfs_del_item(trans, quota_root, path); 634 out: 635 btrfs_free_path(path); 636 return ret; 637 } 638 639 static int add_qgroup_item(struct btrfs_trans_handle *trans, 640 struct btrfs_root *quota_root, u64 qgroupid) 641 { 642 int ret; 643 struct btrfs_path *path; 644 struct btrfs_qgroup_info_item *qgroup_info; 645 struct btrfs_qgroup_limit_item *qgroup_limit; 646 struct extent_buffer *leaf; 647 struct btrfs_key key; 648 649 if (btrfs_is_testing(quota_root->fs_info)) 650 return 0; 651 652 path = btrfs_alloc_path(); 653 if (!path) 654 return -ENOMEM; 655 656 key.objectid = 0; 657 key.type = BTRFS_QGROUP_INFO_KEY; 658 key.offset = qgroupid; 659 660 /* 661 * Avoid a transaction abort by catching -EEXIST here. In that 662 * case, we proceed by re-initializing the existing structure 663 * on disk. 664 */ 665 666 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 667 sizeof(*qgroup_info)); 668 if (ret && ret != -EEXIST) 669 goto out; 670 671 leaf = path->nodes[0]; 672 qgroup_info = btrfs_item_ptr(leaf, path->slots[0], 673 struct btrfs_qgroup_info_item); 674 btrfs_set_qgroup_info_generation(leaf, qgroup_info, trans->transid); 675 btrfs_set_qgroup_info_rfer(leaf, qgroup_info, 0); 676 btrfs_set_qgroup_info_rfer_cmpr(leaf, qgroup_info, 0); 677 btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0); 678 btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0); 679 680 btrfs_mark_buffer_dirty(leaf); 681 682 btrfs_release_path(path); 683 684 key.type = BTRFS_QGROUP_LIMIT_KEY; 685 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 686 sizeof(*qgroup_limit)); 687 if (ret && ret != -EEXIST) 688 goto out; 689 690 leaf = path->nodes[0]; 691 qgroup_limit = btrfs_item_ptr(leaf, path->slots[0], 692 struct btrfs_qgroup_limit_item); 693 btrfs_set_qgroup_limit_flags(leaf, qgroup_limit, 0); 694 btrfs_set_qgroup_limit_max_rfer(leaf, qgroup_limit, 0); 695 btrfs_set_qgroup_limit_max_excl(leaf, qgroup_limit, 0); 696 btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); 697 btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); 698 699 btrfs_mark_buffer_dirty(leaf); 700 701 ret = 0; 702 out: 703 btrfs_free_path(path); 704 return ret; 705 } 706 707 static int del_qgroup_item(struct btrfs_trans_handle *trans, u64 qgroupid) 708 { 709 int ret; 710 struct btrfs_root *quota_root = trans->fs_info->quota_root; 711 struct btrfs_path *path; 712 struct btrfs_key key; 713 714 path = btrfs_alloc_path(); 715 if (!path) 716 return -ENOMEM; 717 718 key.objectid = 0; 719 key.type = BTRFS_QGROUP_INFO_KEY; 720 key.offset = qgroupid; 721 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); 722 if (ret < 0) 723 goto out; 724 725 if (ret > 0) { 726 ret = -ENOENT; 727 goto out; 728 } 729 730 ret = btrfs_del_item(trans, quota_root, path); 731 if (ret) 732 goto out; 733 734 btrfs_release_path(path); 735 736 key.type = BTRFS_QGROUP_LIMIT_KEY; 737 ret = btrfs_search_slot(trans, quota_root, &key, path, -1, 1); 738 if (ret < 0) 739 goto out; 740 741 if (ret > 0) { 742 ret = -ENOENT; 743 goto out; 744 } 745 746 ret = btrfs_del_item(trans, quota_root, path); 747 748 out: 749 btrfs_free_path(path); 750 return ret; 751 } 752 753 static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, 754 struct btrfs_qgroup *qgroup) 755 { 756 struct btrfs_root *quota_root = trans->fs_info->quota_root; 757 struct btrfs_path *path; 758 struct btrfs_key key; 759 struct extent_buffer *l; 760 struct btrfs_qgroup_limit_item *qgroup_limit; 761 int ret; 762 int slot; 763 764 key.objectid = 0; 765 key.type = BTRFS_QGROUP_LIMIT_KEY; 766 key.offset = qgroup->qgroupid; 767 768 path = btrfs_alloc_path(); 769 if (!path) 770 return -ENOMEM; 771 772 ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); 773 if (ret > 0) 774 ret = -ENOENT; 775 776 if (ret) 777 goto out; 778 779 l = path->nodes[0]; 780 slot = path->slots[0]; 781 qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); 782 btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags); 783 btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer); 784 btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl); 785 btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); 786 btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); 787 788 btrfs_mark_buffer_dirty(l); 789 790 out: 791 btrfs_free_path(path); 792 return ret; 793 } 794 795 static int update_qgroup_info_item(struct btrfs_trans_handle *trans, 796 struct btrfs_qgroup *qgroup) 797 { 798 struct btrfs_fs_info *fs_info = trans->fs_info; 799 struct btrfs_root *quota_root = fs_info->quota_root; 800 struct btrfs_path *path; 801 struct btrfs_key key; 802 struct extent_buffer *l; 803 struct btrfs_qgroup_info_item *qgroup_info; 804 int ret; 805 int slot; 806 807 if (btrfs_is_testing(fs_info)) 808 return 0; 809 810 key.objectid = 0; 811 key.type = BTRFS_QGROUP_INFO_KEY; 812 key.offset = qgroup->qgroupid; 813 814 path = btrfs_alloc_path(); 815 if (!path) 816 return -ENOMEM; 817 818 ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); 819 if (ret > 0) 820 ret = -ENOENT; 821 822 if (ret) 823 goto out; 824 825 l = path->nodes[0]; 826 slot = path->slots[0]; 827 qgroup_info = btrfs_item_ptr(l, slot, struct btrfs_qgroup_info_item); 828 btrfs_set_qgroup_info_generation(l, qgroup_info, trans->transid); 829 btrfs_set_qgroup_info_rfer(l, qgroup_info, qgroup->rfer); 830 btrfs_set_qgroup_info_rfer_cmpr(l, qgroup_info, qgroup->rfer_cmpr); 831 btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); 832 btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); 833 834 btrfs_mark_buffer_dirty(l); 835 836 out: 837 btrfs_free_path(path); 838 return ret; 839 } 840 841 static int update_qgroup_status_item(struct btrfs_trans_handle *trans) 842 { 843 struct btrfs_fs_info *fs_info = trans->fs_info; 844 struct btrfs_root *quota_root = fs_info->quota_root; 845 struct btrfs_path *path; 846 struct btrfs_key key; 847 struct extent_buffer *l; 848 struct btrfs_qgroup_status_item *ptr; 849 int ret; 850 int slot; 851 852 key.objectid = 0; 853 key.type = BTRFS_QGROUP_STATUS_KEY; 854 key.offset = 0; 855 856 path = btrfs_alloc_path(); 857 if (!path) 858 return -ENOMEM; 859 860 ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1); 861 if (ret > 0) 862 ret = -ENOENT; 863 864 if (ret) 865 goto out; 866 867 l = path->nodes[0]; 868 slot = path->slots[0]; 869 ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); 870 btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags); 871 btrfs_set_qgroup_status_generation(l, ptr, trans->transid); 872 btrfs_set_qgroup_status_rescan(l, ptr, 873 fs_info->qgroup_rescan_progress.objectid); 874 875 btrfs_mark_buffer_dirty(l); 876 877 out: 878 btrfs_free_path(path); 879 return ret; 880 } 881 882 /* 883 * called with qgroup_lock held 884 */ 885 static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, 886 struct btrfs_root *root) 887 { 888 struct btrfs_path *path; 889 struct btrfs_key key; 890 struct extent_buffer *leaf = NULL; 891 int ret; 892 int nr = 0; 893 894 path = btrfs_alloc_path(); 895 if (!path) 896 return -ENOMEM; 897 898 key.objectid = 0; 899 key.offset = 0; 900 key.type = 0; 901 902 while (1) { 903 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 904 if (ret < 0) 905 goto out; 906 leaf = path->nodes[0]; 907 nr = btrfs_header_nritems(leaf); 908 if (!nr) 909 break; 910 /* 911 * delete the leaf one by one 912 * since the whole tree is going 913 * to be deleted. 914 */ 915 path->slots[0] = 0; 916 ret = btrfs_del_items(trans, root, path, 0, nr); 917 if (ret) 918 goto out; 919 920 btrfs_release_path(path); 921 } 922 ret = 0; 923 out: 924 btrfs_free_path(path); 925 return ret; 926 } 927 928 int btrfs_quota_enable(struct btrfs_fs_info *fs_info) 929 { 930 struct btrfs_root *quota_root; 931 struct btrfs_root *tree_root = fs_info->tree_root; 932 struct btrfs_path *path = NULL; 933 struct btrfs_qgroup_status_item *ptr; 934 struct extent_buffer *leaf; 935 struct btrfs_key key; 936 struct btrfs_key found_key; 937 struct btrfs_qgroup *qgroup = NULL; 938 struct btrfs_trans_handle *trans = NULL; 939 struct ulist *ulist = NULL; 940 int ret = 0; 941 int slot; 942 943 /* 944 * We need to have subvol_sem write locked, to prevent races between 945 * concurrent tasks trying to enable quotas, because we will unlock 946 * and relock qgroup_ioctl_lock before setting fs_info->quota_root 947 * and before setting BTRFS_FS_QUOTA_ENABLED. 948 */ 949 lockdep_assert_held_write(&fs_info->subvol_sem); 950 951 mutex_lock(&fs_info->qgroup_ioctl_lock); 952 if (fs_info->quota_root) 953 goto out; 954 955 ulist = ulist_alloc(GFP_KERNEL); 956 if (!ulist) { 957 ret = -ENOMEM; 958 goto out; 959 } 960 961 ret = btrfs_sysfs_add_qgroups(fs_info); 962 if (ret < 0) 963 goto out; 964 965 /* 966 * Unlock qgroup_ioctl_lock before starting the transaction. This is to 967 * avoid lock acquisition inversion problems (reported by lockdep) between 968 * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we 969 * start a transaction. 970 * After we started the transaction lock qgroup_ioctl_lock again and 971 * check if someone else created the quota root in the meanwhile. If so, 972 * just return success and release the transaction handle. 973 * 974 * Also we don't need to worry about someone else calling 975 * btrfs_sysfs_add_qgroups() after we unlock and getting an error because 976 * that function returns 0 (success) when the sysfs entries already exist. 977 */ 978 mutex_unlock(&fs_info->qgroup_ioctl_lock); 979 980 /* 981 * 1 for quota root item 982 * 1 for BTRFS_QGROUP_STATUS item 983 * 984 * Yet we also need 2*n items for a QGROUP_INFO/QGROUP_LIMIT items 985 * per subvolume. However those are not currently reserved since it 986 * would be a lot of overkill. 987 */ 988 trans = btrfs_start_transaction(tree_root, 2); 989 990 mutex_lock(&fs_info->qgroup_ioctl_lock); 991 if (IS_ERR(trans)) { 992 ret = PTR_ERR(trans); 993 trans = NULL; 994 goto out; 995 } 996 997 if (fs_info->quota_root) 998 goto out; 999 1000 fs_info->qgroup_ulist = ulist; 1001 ulist = NULL; 1002 1003 /* 1004 * initially create the quota tree 1005 */ 1006 quota_root = btrfs_create_tree(trans, BTRFS_QUOTA_TREE_OBJECTID); 1007 if (IS_ERR(quota_root)) { 1008 ret = PTR_ERR(quota_root); 1009 btrfs_abort_transaction(trans, ret); 1010 goto out; 1011 } 1012 1013 path = btrfs_alloc_path(); 1014 if (!path) { 1015 ret = -ENOMEM; 1016 btrfs_abort_transaction(trans, ret); 1017 goto out_free_root; 1018 } 1019 1020 key.objectid = 0; 1021 key.type = BTRFS_QGROUP_STATUS_KEY; 1022 key.offset = 0; 1023 1024 ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 1025 sizeof(*ptr)); 1026 if (ret) { 1027 btrfs_abort_transaction(trans, ret); 1028 goto out_free_path; 1029 } 1030 1031 leaf = path->nodes[0]; 1032 ptr = btrfs_item_ptr(leaf, path->slots[0], 1033 struct btrfs_qgroup_status_item); 1034 btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid); 1035 btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); 1036 fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON | 1037 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 1038 btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags); 1039 btrfs_set_qgroup_status_rescan(leaf, ptr, 0); 1040 1041 btrfs_mark_buffer_dirty(leaf); 1042 1043 key.objectid = 0; 1044 key.type = BTRFS_ROOT_REF_KEY; 1045 key.offset = 0; 1046 1047 btrfs_release_path(path); 1048 ret = btrfs_search_slot_for_read(tree_root, &key, path, 1, 0); 1049 if (ret > 0) 1050 goto out_add_root; 1051 if (ret < 0) { 1052 btrfs_abort_transaction(trans, ret); 1053 goto out_free_path; 1054 } 1055 1056 while (1) { 1057 slot = path->slots[0]; 1058 leaf = path->nodes[0]; 1059 btrfs_item_key_to_cpu(leaf, &found_key, slot); 1060 1061 if (found_key.type == BTRFS_ROOT_REF_KEY) { 1062 1063 /* Release locks on tree_root before we access quota_root */ 1064 btrfs_release_path(path); 1065 1066 ret = add_qgroup_item(trans, quota_root, 1067 found_key.offset); 1068 if (ret) { 1069 btrfs_abort_transaction(trans, ret); 1070 goto out_free_path; 1071 } 1072 1073 qgroup = add_qgroup_rb(fs_info, found_key.offset); 1074 if (IS_ERR(qgroup)) { 1075 ret = PTR_ERR(qgroup); 1076 btrfs_abort_transaction(trans, ret); 1077 goto out_free_path; 1078 } 1079 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 1080 if (ret < 0) { 1081 btrfs_abort_transaction(trans, ret); 1082 goto out_free_path; 1083 } 1084 ret = btrfs_search_slot_for_read(tree_root, &found_key, 1085 path, 1, 0); 1086 if (ret < 0) { 1087 btrfs_abort_transaction(trans, ret); 1088 goto out_free_path; 1089 } 1090 if (ret > 0) { 1091 /* 1092 * Shouldn't happen, but in case it does we 1093 * don't need to do the btrfs_next_item, just 1094 * continue. 1095 */ 1096 continue; 1097 } 1098 } 1099 ret = btrfs_next_item(tree_root, path); 1100 if (ret < 0) { 1101 btrfs_abort_transaction(trans, ret); 1102 goto out_free_path; 1103 } 1104 if (ret) 1105 break; 1106 } 1107 1108 out_add_root: 1109 btrfs_release_path(path); 1110 ret = add_qgroup_item(trans, quota_root, BTRFS_FS_TREE_OBJECTID); 1111 if (ret) { 1112 btrfs_abort_transaction(trans, ret); 1113 goto out_free_path; 1114 } 1115 1116 qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID); 1117 if (IS_ERR(qgroup)) { 1118 ret = PTR_ERR(qgroup); 1119 btrfs_abort_transaction(trans, ret); 1120 goto out_free_path; 1121 } 1122 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 1123 if (ret < 0) { 1124 btrfs_abort_transaction(trans, ret); 1125 goto out_free_path; 1126 } 1127 1128 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1129 /* 1130 * Commit the transaction while not holding qgroup_ioctl_lock, to avoid 1131 * a deadlock with tasks concurrently doing other qgroup operations, such 1132 * adding/removing qgroups or adding/deleting qgroup relations for example, 1133 * because all qgroup operations first start or join a transaction and then 1134 * lock the qgroup_ioctl_lock mutex. 1135 * We are safe from a concurrent task trying to enable quotas, by calling 1136 * this function, since we are serialized by fs_info->subvol_sem. 1137 */ 1138 ret = btrfs_commit_transaction(trans); 1139 trans = NULL; 1140 mutex_lock(&fs_info->qgroup_ioctl_lock); 1141 if (ret) 1142 goto out_free_path; 1143 1144 /* 1145 * Set quota enabled flag after committing the transaction, to avoid 1146 * deadlocks on fs_info->qgroup_ioctl_lock with concurrent snapshot 1147 * creation. 1148 */ 1149 spin_lock(&fs_info->qgroup_lock); 1150 fs_info->quota_root = quota_root; 1151 set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 1152 spin_unlock(&fs_info->qgroup_lock); 1153 1154 ret = qgroup_rescan_init(fs_info, 0, 1); 1155 if (!ret) { 1156 qgroup_rescan_zero_tracking(fs_info); 1157 fs_info->qgroup_rescan_running = true; 1158 btrfs_queue_work(fs_info->qgroup_rescan_workers, 1159 &fs_info->qgroup_rescan_work); 1160 } 1161 1162 out_free_path: 1163 btrfs_free_path(path); 1164 out_free_root: 1165 if (ret) 1166 btrfs_put_root(quota_root); 1167 out: 1168 if (ret) { 1169 ulist_free(fs_info->qgroup_ulist); 1170 fs_info->qgroup_ulist = NULL; 1171 btrfs_sysfs_del_qgroups(fs_info); 1172 } 1173 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1174 if (ret && trans) 1175 btrfs_end_transaction(trans); 1176 else if (trans) 1177 ret = btrfs_end_transaction(trans); 1178 ulist_free(ulist); 1179 return ret; 1180 } 1181 1182 int btrfs_quota_disable(struct btrfs_fs_info *fs_info) 1183 { 1184 struct btrfs_root *quota_root; 1185 struct btrfs_trans_handle *trans = NULL; 1186 int ret = 0; 1187 1188 /* 1189 * We need to have subvol_sem write locked, to prevent races between 1190 * concurrent tasks trying to disable quotas, because we will unlock 1191 * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes. 1192 */ 1193 lockdep_assert_held_write(&fs_info->subvol_sem); 1194 1195 mutex_lock(&fs_info->qgroup_ioctl_lock); 1196 if (!fs_info->quota_root) 1197 goto out; 1198 1199 /* 1200 * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to 1201 * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs 1202 * to lock that mutex while holding a transaction handle and the rescan 1203 * worker needs to commit a transaction. 1204 */ 1205 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1206 1207 /* 1208 * Request qgroup rescan worker to complete and wait for it. This wait 1209 * must be done before transaction start for quota disable since it may 1210 * deadlock with transaction by the qgroup rescan worker. 1211 */ 1212 clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 1213 btrfs_qgroup_wait_for_completion(fs_info, false); 1214 1215 /* 1216 * 1 For the root item 1217 * 1218 * We should also reserve enough items for the quota tree deletion in 1219 * btrfs_clean_quota_tree but this is not done. 1220 * 1221 * Also, we must always start a transaction without holding the mutex 1222 * qgroup_ioctl_lock, see btrfs_quota_enable(). 1223 */ 1224 trans = btrfs_start_transaction(fs_info->tree_root, 1); 1225 1226 mutex_lock(&fs_info->qgroup_ioctl_lock); 1227 if (IS_ERR(trans)) { 1228 ret = PTR_ERR(trans); 1229 trans = NULL; 1230 set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 1231 goto out; 1232 } 1233 1234 if (!fs_info->quota_root) 1235 goto out; 1236 1237 spin_lock(&fs_info->qgroup_lock); 1238 quota_root = fs_info->quota_root; 1239 fs_info->quota_root = NULL; 1240 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; 1241 spin_unlock(&fs_info->qgroup_lock); 1242 1243 btrfs_free_qgroup_config(fs_info); 1244 1245 ret = btrfs_clean_quota_tree(trans, quota_root); 1246 if (ret) { 1247 btrfs_abort_transaction(trans, ret); 1248 goto out; 1249 } 1250 1251 ret = btrfs_del_root(trans, "a_root->root_key); 1252 if (ret) { 1253 btrfs_abort_transaction(trans, ret); 1254 goto out; 1255 } 1256 1257 list_del("a_root->dirty_list); 1258 1259 btrfs_tree_lock(quota_root->node); 1260 btrfs_clean_tree_block(quota_root->node); 1261 btrfs_tree_unlock(quota_root->node); 1262 btrfs_free_tree_block(trans, btrfs_root_id(quota_root), 1263 quota_root->node, 0, 1); 1264 1265 btrfs_put_root(quota_root); 1266 1267 out: 1268 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1269 if (ret && trans) 1270 btrfs_end_transaction(trans); 1271 else if (trans) 1272 ret = btrfs_end_transaction(trans); 1273 1274 return ret; 1275 } 1276 1277 static void qgroup_dirty(struct btrfs_fs_info *fs_info, 1278 struct btrfs_qgroup *qgroup) 1279 { 1280 if (list_empty(&qgroup->dirty)) 1281 list_add(&qgroup->dirty, &fs_info->dirty_qgroups); 1282 } 1283 1284 /* 1285 * The easy accounting, we're updating qgroup relationship whose child qgroup 1286 * only has exclusive extents. 1287 * 1288 * In this case, all exclusive extents will also be exclusive for parent, so 1289 * excl/rfer just get added/removed. 1290 * 1291 * So is qgroup reservation space, which should also be added/removed to 1292 * parent. 1293 * Or when child tries to release reservation space, parent will underflow its 1294 * reservation (for relationship adding case). 1295 * 1296 * Caller should hold fs_info->qgroup_lock. 1297 */ 1298 static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, 1299 struct ulist *tmp, u64 ref_root, 1300 struct btrfs_qgroup *src, int sign) 1301 { 1302 struct btrfs_qgroup *qgroup; 1303 struct btrfs_qgroup_list *glist; 1304 struct ulist_node *unode; 1305 struct ulist_iterator uiter; 1306 u64 num_bytes = src->excl; 1307 int ret = 0; 1308 1309 qgroup = find_qgroup_rb(fs_info, ref_root); 1310 if (!qgroup) 1311 goto out; 1312 1313 qgroup->rfer += sign * num_bytes; 1314 qgroup->rfer_cmpr += sign * num_bytes; 1315 1316 WARN_ON(sign < 0 && qgroup->excl < num_bytes); 1317 qgroup->excl += sign * num_bytes; 1318 qgroup->excl_cmpr += sign * num_bytes; 1319 1320 if (sign > 0) 1321 qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); 1322 else 1323 qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); 1324 1325 qgroup_dirty(fs_info, qgroup); 1326 1327 /* Get all of the parent groups that contain this qgroup */ 1328 list_for_each_entry(glist, &qgroup->groups, next_group) { 1329 ret = ulist_add(tmp, glist->group->qgroupid, 1330 qgroup_to_aux(glist->group), GFP_ATOMIC); 1331 if (ret < 0) 1332 goto out; 1333 } 1334 1335 /* Iterate all of the parents and adjust their reference counts */ 1336 ULIST_ITER_INIT(&uiter); 1337 while ((unode = ulist_next(tmp, &uiter))) { 1338 qgroup = unode_aux_to_qgroup(unode); 1339 qgroup->rfer += sign * num_bytes; 1340 qgroup->rfer_cmpr += sign * num_bytes; 1341 WARN_ON(sign < 0 && qgroup->excl < num_bytes); 1342 qgroup->excl += sign * num_bytes; 1343 if (sign > 0) 1344 qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); 1345 else 1346 qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); 1347 qgroup->excl_cmpr += sign * num_bytes; 1348 qgroup_dirty(fs_info, qgroup); 1349 1350 /* Add any parents of the parents */ 1351 list_for_each_entry(glist, &qgroup->groups, next_group) { 1352 ret = ulist_add(tmp, glist->group->qgroupid, 1353 qgroup_to_aux(glist->group), GFP_ATOMIC); 1354 if (ret < 0) 1355 goto out; 1356 } 1357 } 1358 ret = 0; 1359 out: 1360 return ret; 1361 } 1362 1363 1364 /* 1365 * Quick path for updating qgroup with only excl refs. 1366 * 1367 * In that case, just update all parent will be enough. 1368 * Or we needs to do a full rescan. 1369 * Caller should also hold fs_info->qgroup_lock. 1370 * 1371 * Return 0 for quick update, return >0 for need to full rescan 1372 * and mark INCONSISTENT flag. 1373 * Return < 0 for other error. 1374 */ 1375 static int quick_update_accounting(struct btrfs_fs_info *fs_info, 1376 struct ulist *tmp, u64 src, u64 dst, 1377 int sign) 1378 { 1379 struct btrfs_qgroup *qgroup; 1380 int ret = 1; 1381 int err = 0; 1382 1383 qgroup = find_qgroup_rb(fs_info, src); 1384 if (!qgroup) 1385 goto out; 1386 if (qgroup->excl == qgroup->rfer) { 1387 ret = 0; 1388 err = __qgroup_excl_accounting(fs_info, tmp, dst, 1389 qgroup, sign); 1390 if (err < 0) { 1391 ret = err; 1392 goto out; 1393 } 1394 } 1395 out: 1396 if (ret) 1397 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 1398 return ret; 1399 } 1400 1401 int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, 1402 u64 dst) 1403 { 1404 struct btrfs_fs_info *fs_info = trans->fs_info; 1405 struct btrfs_qgroup *parent; 1406 struct btrfs_qgroup *member; 1407 struct btrfs_qgroup_list *list; 1408 struct ulist *tmp; 1409 unsigned int nofs_flag; 1410 int ret = 0; 1411 1412 /* Check the level of src and dst first */ 1413 if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) 1414 return -EINVAL; 1415 1416 /* We hold a transaction handle open, must do a NOFS allocation. */ 1417 nofs_flag = memalloc_nofs_save(); 1418 tmp = ulist_alloc(GFP_KERNEL); 1419 memalloc_nofs_restore(nofs_flag); 1420 if (!tmp) 1421 return -ENOMEM; 1422 1423 mutex_lock(&fs_info->qgroup_ioctl_lock); 1424 if (!fs_info->quota_root) { 1425 ret = -ENOTCONN; 1426 goto out; 1427 } 1428 member = find_qgroup_rb(fs_info, src); 1429 parent = find_qgroup_rb(fs_info, dst); 1430 if (!member || !parent) { 1431 ret = -EINVAL; 1432 goto out; 1433 } 1434 1435 /* check if such qgroup relation exist firstly */ 1436 list_for_each_entry(list, &member->groups, next_group) { 1437 if (list->group == parent) { 1438 ret = -EEXIST; 1439 goto out; 1440 } 1441 } 1442 1443 ret = add_qgroup_relation_item(trans, src, dst); 1444 if (ret) 1445 goto out; 1446 1447 ret = add_qgroup_relation_item(trans, dst, src); 1448 if (ret) { 1449 del_qgroup_relation_item(trans, src, dst); 1450 goto out; 1451 } 1452 1453 spin_lock(&fs_info->qgroup_lock); 1454 ret = add_relation_rb(fs_info, src, dst); 1455 if (ret < 0) { 1456 spin_unlock(&fs_info->qgroup_lock); 1457 goto out; 1458 } 1459 ret = quick_update_accounting(fs_info, tmp, src, dst, 1); 1460 spin_unlock(&fs_info->qgroup_lock); 1461 out: 1462 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1463 ulist_free(tmp); 1464 return ret; 1465 } 1466 1467 static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, 1468 u64 dst) 1469 { 1470 struct btrfs_fs_info *fs_info = trans->fs_info; 1471 struct btrfs_qgroup *parent; 1472 struct btrfs_qgroup *member; 1473 struct btrfs_qgroup_list *list; 1474 struct ulist *tmp; 1475 bool found = false; 1476 unsigned int nofs_flag; 1477 int ret = 0; 1478 int ret2; 1479 1480 /* We hold a transaction handle open, must do a NOFS allocation. */ 1481 nofs_flag = memalloc_nofs_save(); 1482 tmp = ulist_alloc(GFP_KERNEL); 1483 memalloc_nofs_restore(nofs_flag); 1484 if (!tmp) 1485 return -ENOMEM; 1486 1487 if (!fs_info->quota_root) { 1488 ret = -ENOTCONN; 1489 goto out; 1490 } 1491 1492 member = find_qgroup_rb(fs_info, src); 1493 parent = find_qgroup_rb(fs_info, dst); 1494 /* 1495 * The parent/member pair doesn't exist, then try to delete the dead 1496 * relation items only. 1497 */ 1498 if (!member || !parent) 1499 goto delete_item; 1500 1501 /* check if such qgroup relation exist firstly */ 1502 list_for_each_entry(list, &member->groups, next_group) { 1503 if (list->group == parent) { 1504 found = true; 1505 break; 1506 } 1507 } 1508 1509 delete_item: 1510 ret = del_qgroup_relation_item(trans, src, dst); 1511 if (ret < 0 && ret != -ENOENT) 1512 goto out; 1513 ret2 = del_qgroup_relation_item(trans, dst, src); 1514 if (ret2 < 0 && ret2 != -ENOENT) 1515 goto out; 1516 1517 /* At least one deletion succeeded, return 0 */ 1518 if (!ret || !ret2) 1519 ret = 0; 1520 1521 if (found) { 1522 spin_lock(&fs_info->qgroup_lock); 1523 del_relation_rb(fs_info, src, dst); 1524 ret = quick_update_accounting(fs_info, tmp, src, dst, -1); 1525 spin_unlock(&fs_info->qgroup_lock); 1526 } 1527 out: 1528 ulist_free(tmp); 1529 return ret; 1530 } 1531 1532 int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, 1533 u64 dst) 1534 { 1535 struct btrfs_fs_info *fs_info = trans->fs_info; 1536 int ret = 0; 1537 1538 mutex_lock(&fs_info->qgroup_ioctl_lock); 1539 ret = __del_qgroup_relation(trans, src, dst); 1540 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1541 1542 return ret; 1543 } 1544 1545 int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) 1546 { 1547 struct btrfs_fs_info *fs_info = trans->fs_info; 1548 struct btrfs_root *quota_root; 1549 struct btrfs_qgroup *qgroup; 1550 int ret = 0; 1551 1552 mutex_lock(&fs_info->qgroup_ioctl_lock); 1553 if (!fs_info->quota_root) { 1554 ret = -ENOTCONN; 1555 goto out; 1556 } 1557 quota_root = fs_info->quota_root; 1558 qgroup = find_qgroup_rb(fs_info, qgroupid); 1559 if (qgroup) { 1560 ret = -EEXIST; 1561 goto out; 1562 } 1563 1564 ret = add_qgroup_item(trans, quota_root, qgroupid); 1565 if (ret) 1566 goto out; 1567 1568 spin_lock(&fs_info->qgroup_lock); 1569 qgroup = add_qgroup_rb(fs_info, qgroupid); 1570 spin_unlock(&fs_info->qgroup_lock); 1571 1572 if (IS_ERR(qgroup)) { 1573 ret = PTR_ERR(qgroup); 1574 goto out; 1575 } 1576 ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); 1577 out: 1578 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1579 return ret; 1580 } 1581 1582 int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) 1583 { 1584 struct btrfs_fs_info *fs_info = trans->fs_info; 1585 struct btrfs_qgroup *qgroup; 1586 struct btrfs_qgroup_list *list; 1587 int ret = 0; 1588 1589 mutex_lock(&fs_info->qgroup_ioctl_lock); 1590 if (!fs_info->quota_root) { 1591 ret = -ENOTCONN; 1592 goto out; 1593 } 1594 1595 qgroup = find_qgroup_rb(fs_info, qgroupid); 1596 if (!qgroup) { 1597 ret = -ENOENT; 1598 goto out; 1599 } 1600 1601 /* Check if there are no children of this qgroup */ 1602 if (!list_empty(&qgroup->members)) { 1603 ret = -EBUSY; 1604 goto out; 1605 } 1606 1607 ret = del_qgroup_item(trans, qgroupid); 1608 if (ret && ret != -ENOENT) 1609 goto out; 1610 1611 while (!list_empty(&qgroup->groups)) { 1612 list = list_first_entry(&qgroup->groups, 1613 struct btrfs_qgroup_list, next_group); 1614 ret = __del_qgroup_relation(trans, qgroupid, 1615 list->group->qgroupid); 1616 if (ret) 1617 goto out; 1618 } 1619 1620 spin_lock(&fs_info->qgroup_lock); 1621 del_qgroup_rb(fs_info, qgroupid); 1622 spin_unlock(&fs_info->qgroup_lock); 1623 1624 /* 1625 * Remove the qgroup from sysfs now without holding the qgroup_lock 1626 * spinlock, since the sysfs_remove_group() function needs to take 1627 * the mutex kernfs_mutex through kernfs_remove_by_name_ns(). 1628 */ 1629 btrfs_sysfs_del_one_qgroup(fs_info, qgroup); 1630 kfree(qgroup); 1631 out: 1632 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1633 return ret; 1634 } 1635 1636 int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, 1637 struct btrfs_qgroup_limit *limit) 1638 { 1639 struct btrfs_fs_info *fs_info = trans->fs_info; 1640 struct btrfs_qgroup *qgroup; 1641 int ret = 0; 1642 /* Sometimes we would want to clear the limit on this qgroup. 1643 * To meet this requirement, we treat the -1 as a special value 1644 * which tell kernel to clear the limit on this qgroup. 1645 */ 1646 const u64 CLEAR_VALUE = -1; 1647 1648 mutex_lock(&fs_info->qgroup_ioctl_lock); 1649 if (!fs_info->quota_root) { 1650 ret = -ENOTCONN; 1651 goto out; 1652 } 1653 1654 qgroup = find_qgroup_rb(fs_info, qgroupid); 1655 if (!qgroup) { 1656 ret = -ENOENT; 1657 goto out; 1658 } 1659 1660 spin_lock(&fs_info->qgroup_lock); 1661 if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) { 1662 if (limit->max_rfer == CLEAR_VALUE) { 1663 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; 1664 limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; 1665 qgroup->max_rfer = 0; 1666 } else { 1667 qgroup->max_rfer = limit->max_rfer; 1668 } 1669 } 1670 if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { 1671 if (limit->max_excl == CLEAR_VALUE) { 1672 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; 1673 limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; 1674 qgroup->max_excl = 0; 1675 } else { 1676 qgroup->max_excl = limit->max_excl; 1677 } 1678 } 1679 if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) { 1680 if (limit->rsv_rfer == CLEAR_VALUE) { 1681 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; 1682 limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; 1683 qgroup->rsv_rfer = 0; 1684 } else { 1685 qgroup->rsv_rfer = limit->rsv_rfer; 1686 } 1687 } 1688 if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) { 1689 if (limit->rsv_excl == CLEAR_VALUE) { 1690 qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; 1691 limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; 1692 qgroup->rsv_excl = 0; 1693 } else { 1694 qgroup->rsv_excl = limit->rsv_excl; 1695 } 1696 } 1697 qgroup->lim_flags |= limit->flags; 1698 1699 spin_unlock(&fs_info->qgroup_lock); 1700 1701 ret = update_qgroup_limit_item(trans, qgroup); 1702 if (ret) { 1703 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 1704 btrfs_info(fs_info, "unable to update quota limit for %llu", 1705 qgroupid); 1706 } 1707 1708 out: 1709 mutex_unlock(&fs_info->qgroup_ioctl_lock); 1710 return ret; 1711 } 1712 1713 int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, 1714 struct btrfs_delayed_ref_root *delayed_refs, 1715 struct btrfs_qgroup_extent_record *record) 1716 { 1717 struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; 1718 struct rb_node *parent_node = NULL; 1719 struct btrfs_qgroup_extent_record *entry; 1720 u64 bytenr = record->bytenr; 1721 1722 lockdep_assert_held(&delayed_refs->lock); 1723 trace_btrfs_qgroup_trace_extent(fs_info, record); 1724 1725 while (*p) { 1726 parent_node = *p; 1727 entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, 1728 node); 1729 if (bytenr < entry->bytenr) { 1730 p = &(*p)->rb_left; 1731 } else if (bytenr > entry->bytenr) { 1732 p = &(*p)->rb_right; 1733 } else { 1734 if (record->data_rsv && !entry->data_rsv) { 1735 entry->data_rsv = record->data_rsv; 1736 entry->data_rsv_refroot = 1737 record->data_rsv_refroot; 1738 } 1739 return 1; 1740 } 1741 } 1742 1743 rb_link_node(&record->node, parent_node, p); 1744 rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); 1745 return 0; 1746 } 1747 1748 int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, 1749 struct btrfs_qgroup_extent_record *qrecord) 1750 { 1751 struct ulist *old_root; 1752 u64 bytenr = qrecord->bytenr; 1753 int ret; 1754 1755 /* 1756 * We are always called in a context where we are already holding a 1757 * transaction handle. Often we are called when adding a data delayed 1758 * reference from btrfs_truncate_inode_items() (truncating or unlinking), 1759 * in which case we will be holding a write lock on extent buffer from a 1760 * subvolume tree. In this case we can't allow btrfs_find_all_roots() to 1761 * acquire fs_info->commit_root_sem, because that is a higher level lock 1762 * that must be acquired before locking any extent buffers. 1763 * 1764 * So we want btrfs_find_all_roots() to not acquire the commit_root_sem 1765 * but we can't pass it a non-NULL transaction handle, because otherwise 1766 * it would not use commit roots and would lock extent buffers, causing 1767 * a deadlock if it ends up trying to read lock the same extent buffer 1768 * that was previously write locked at btrfs_truncate_inode_items(). 1769 * 1770 * So pass a NULL transaction handle to btrfs_find_all_roots() and 1771 * explicitly tell it to not acquire the commit_root_sem - if we are 1772 * holding a transaction handle we don't need its protection. 1773 */ 1774 ASSERT(trans != NULL); 1775 1776 ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root, 1777 true); 1778 if (ret < 0) { 1779 trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 1780 btrfs_warn(trans->fs_info, 1781 "error accounting new delayed refs extent (err code: %d), quota inconsistent", 1782 ret); 1783 return 0; 1784 } 1785 1786 /* 1787 * Here we don't need to get the lock of 1788 * trans->transaction->delayed_refs, since inserted qrecord won't 1789 * be deleted, only qrecord->node may be modified (new qrecord insert) 1790 * 1791 * So modifying qrecord->old_roots is safe here 1792 */ 1793 qrecord->old_roots = old_root; 1794 return 0; 1795 } 1796 1797 int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, 1798 u64 num_bytes, gfp_t gfp_flag) 1799 { 1800 struct btrfs_fs_info *fs_info = trans->fs_info; 1801 struct btrfs_qgroup_extent_record *record; 1802 struct btrfs_delayed_ref_root *delayed_refs; 1803 int ret; 1804 1805 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) 1806 || bytenr == 0 || num_bytes == 0) 1807 return 0; 1808 record = kzalloc(sizeof(*record), gfp_flag); 1809 if (!record) 1810 return -ENOMEM; 1811 1812 delayed_refs = &trans->transaction->delayed_refs; 1813 record->bytenr = bytenr; 1814 record->num_bytes = num_bytes; 1815 record->old_roots = NULL; 1816 1817 spin_lock(&delayed_refs->lock); 1818 ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record); 1819 spin_unlock(&delayed_refs->lock); 1820 if (ret > 0) { 1821 kfree(record); 1822 return 0; 1823 } 1824 return btrfs_qgroup_trace_extent_post(trans, record); 1825 } 1826 1827 int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, 1828 struct extent_buffer *eb) 1829 { 1830 struct btrfs_fs_info *fs_info = trans->fs_info; 1831 int nr = btrfs_header_nritems(eb); 1832 int i, extent_type, ret; 1833 struct btrfs_key key; 1834 struct btrfs_file_extent_item *fi; 1835 u64 bytenr, num_bytes; 1836 1837 /* We can be called directly from walk_up_proc() */ 1838 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 1839 return 0; 1840 1841 for (i = 0; i < nr; i++) { 1842 btrfs_item_key_to_cpu(eb, &key, i); 1843 1844 if (key.type != BTRFS_EXTENT_DATA_KEY) 1845 continue; 1846 1847 fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); 1848 /* filter out non qgroup-accountable extents */ 1849 extent_type = btrfs_file_extent_type(eb, fi); 1850 1851 if (extent_type == BTRFS_FILE_EXTENT_INLINE) 1852 continue; 1853 1854 bytenr = btrfs_file_extent_disk_bytenr(eb, fi); 1855 if (!bytenr) 1856 continue; 1857 1858 num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); 1859 1860 ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes, 1861 GFP_NOFS); 1862 if (ret) 1863 return ret; 1864 } 1865 cond_resched(); 1866 return 0; 1867 } 1868 1869 /* 1870 * Walk up the tree from the bottom, freeing leaves and any interior 1871 * nodes which have had all slots visited. If a node (leaf or 1872 * interior) is freed, the node above it will have it's slot 1873 * incremented. The root node will never be freed. 1874 * 1875 * At the end of this function, we should have a path which has all 1876 * slots incremented to the next position for a search. If we need to 1877 * read a new node it will be NULL and the node above it will have the 1878 * correct slot selected for a later read. 1879 * 1880 * If we increment the root nodes slot counter past the number of 1881 * elements, 1 is returned to signal completion of the search. 1882 */ 1883 static int adjust_slots_upwards(struct btrfs_path *path, int root_level) 1884 { 1885 int level = 0; 1886 int nr, slot; 1887 struct extent_buffer *eb; 1888 1889 if (root_level == 0) 1890 return 1; 1891 1892 while (level <= root_level) { 1893 eb = path->nodes[level]; 1894 nr = btrfs_header_nritems(eb); 1895 path->slots[level]++; 1896 slot = path->slots[level]; 1897 if (slot >= nr || level == 0) { 1898 /* 1899 * Don't free the root - we will detect this 1900 * condition after our loop and return a 1901 * positive value for caller to stop walking the tree. 1902 */ 1903 if (level != root_level) { 1904 btrfs_tree_unlock_rw(eb, path->locks[level]); 1905 path->locks[level] = 0; 1906 1907 free_extent_buffer(eb); 1908 path->nodes[level] = NULL; 1909 path->slots[level] = 0; 1910 } 1911 } else { 1912 /* 1913 * We have a valid slot to walk back down 1914 * from. Stop here so caller can process these 1915 * new nodes. 1916 */ 1917 break; 1918 } 1919 1920 level++; 1921 } 1922 1923 eb = path->nodes[root_level]; 1924 if (path->slots[root_level] >= btrfs_header_nritems(eb)) 1925 return 1; 1926 1927 return 0; 1928 } 1929 1930 /* 1931 * Helper function to trace a subtree tree block swap. 1932 * 1933 * The swap will happen in highest tree block, but there may be a lot of 1934 * tree blocks involved. 1935 * 1936 * For example: 1937 * OO = Old tree blocks 1938 * NN = New tree blocks allocated during balance 1939 * 1940 * File tree (257) Reloc tree for 257 1941 * L2 OO NN 1942 * / \ / \ 1943 * L1 OO OO (a) OO NN (a) 1944 * / \ / \ / \ / \ 1945 * L0 OO OO OO OO OO OO NN NN 1946 * (b) (c) (b) (c) 1947 * 1948 * When calling qgroup_trace_extent_swap(), we will pass: 1949 * @src_eb = OO(a) 1950 * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ] 1951 * @dst_level = 0 1952 * @root_level = 1 1953 * 1954 * In that case, qgroup_trace_extent_swap() will search from OO(a) to 1955 * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty. 1956 * 1957 * The main work of qgroup_trace_extent_swap() can be split into 3 parts: 1958 * 1959 * 1) Tree search from @src_eb 1960 * It should acts as a simplified btrfs_search_slot(). 1961 * The key for search can be extracted from @dst_path->nodes[dst_level] 1962 * (first key). 1963 * 1964 * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty 1965 * NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty. 1966 * They should be marked during previous (@dst_level = 1) iteration. 1967 * 1968 * 3) Mark file extents in leaves dirty 1969 * We don't have good way to pick out new file extents only. 1970 * So we still follow the old method by scanning all file extents in 1971 * the leave. 1972 * 1973 * This function can free us from keeping two paths, thus later we only need 1974 * to care about how to iterate all new tree blocks in reloc tree. 1975 */ 1976 static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, 1977 struct extent_buffer *src_eb, 1978 struct btrfs_path *dst_path, 1979 int dst_level, int root_level, 1980 bool trace_leaf) 1981 { 1982 struct btrfs_key key; 1983 struct btrfs_path *src_path; 1984 struct btrfs_fs_info *fs_info = trans->fs_info; 1985 u32 nodesize = fs_info->nodesize; 1986 int cur_level = root_level; 1987 int ret; 1988 1989 BUG_ON(dst_level > root_level); 1990 /* Level mismatch */ 1991 if (btrfs_header_level(src_eb) != root_level) 1992 return -EINVAL; 1993 1994 src_path = btrfs_alloc_path(); 1995 if (!src_path) { 1996 ret = -ENOMEM; 1997 goto out; 1998 } 1999 2000 if (dst_level) 2001 btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0); 2002 else 2003 btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0); 2004 2005 /* For src_path */ 2006 atomic_inc(&src_eb->refs); 2007 src_path->nodes[root_level] = src_eb; 2008 src_path->slots[root_level] = dst_path->slots[root_level]; 2009 src_path->locks[root_level] = 0; 2010 2011 /* A simplified version of btrfs_search_slot() */ 2012 while (cur_level >= dst_level) { 2013 struct btrfs_key src_key; 2014 struct btrfs_key dst_key; 2015 2016 if (src_path->nodes[cur_level] == NULL) { 2017 struct extent_buffer *eb; 2018 int parent_slot; 2019 2020 eb = src_path->nodes[cur_level + 1]; 2021 parent_slot = src_path->slots[cur_level + 1]; 2022 2023 eb = btrfs_read_node_slot(eb, parent_slot); 2024 if (IS_ERR(eb)) { 2025 ret = PTR_ERR(eb); 2026 goto out; 2027 } 2028 2029 src_path->nodes[cur_level] = eb; 2030 2031 btrfs_tree_read_lock(eb); 2032 src_path->locks[cur_level] = BTRFS_READ_LOCK; 2033 } 2034 2035 src_path->slots[cur_level] = dst_path->slots[cur_level]; 2036 if (cur_level) { 2037 btrfs_node_key_to_cpu(dst_path->nodes[cur_level], 2038 &dst_key, dst_path->slots[cur_level]); 2039 btrfs_node_key_to_cpu(src_path->nodes[cur_level], 2040 &src_key, src_path->slots[cur_level]); 2041 } else { 2042 btrfs_item_key_to_cpu(dst_path->nodes[cur_level], 2043 &dst_key, dst_path->slots[cur_level]); 2044 btrfs_item_key_to_cpu(src_path->nodes[cur_level], 2045 &src_key, src_path->slots[cur_level]); 2046 } 2047 /* Content mismatch, something went wrong */ 2048 if (btrfs_comp_cpu_keys(&dst_key, &src_key)) { 2049 ret = -ENOENT; 2050 goto out; 2051 } 2052 cur_level--; 2053 } 2054 2055 /* 2056 * Now both @dst_path and @src_path have been populated, record the tree 2057 * blocks for qgroup accounting. 2058 */ 2059 ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start, 2060 nodesize, GFP_NOFS); 2061 if (ret < 0) 2062 goto out; 2063 ret = btrfs_qgroup_trace_extent(trans, 2064 dst_path->nodes[dst_level]->start, 2065 nodesize, GFP_NOFS); 2066 if (ret < 0) 2067 goto out; 2068 2069 /* Record leaf file extents */ 2070 if (dst_level == 0 && trace_leaf) { 2071 ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]); 2072 if (ret < 0) 2073 goto out; 2074 ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]); 2075 } 2076 out: 2077 btrfs_free_path(src_path); 2078 return ret; 2079 } 2080 2081 /* 2082 * Helper function to do recursive generation-aware depth-first search, to 2083 * locate all new tree blocks in a subtree of reloc tree. 2084 * 2085 * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot) 2086 * reloc tree 2087 * L2 NN (a) 2088 * / \ 2089 * L1 OO NN (b) 2090 * / \ / \ 2091 * L0 OO OO OO NN 2092 * (c) (d) 2093 * If we pass: 2094 * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ], 2095 * @cur_level = 1 2096 * @root_level = 1 2097 * 2098 * We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace 2099 * above tree blocks along with their counter parts in file tree. 2100 * While during search, old tree blocks OO(c) will be skipped as tree block swap 2101 * won't affect OO(c). 2102 */ 2103 static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans, 2104 struct extent_buffer *src_eb, 2105 struct btrfs_path *dst_path, 2106 int cur_level, int root_level, 2107 u64 last_snapshot, bool trace_leaf) 2108 { 2109 struct btrfs_fs_info *fs_info = trans->fs_info; 2110 struct extent_buffer *eb; 2111 bool need_cleanup = false; 2112 int ret = 0; 2113 int i; 2114 2115 /* Level sanity check */ 2116 if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL - 1 || 2117 root_level < 0 || root_level >= BTRFS_MAX_LEVEL - 1 || 2118 root_level < cur_level) { 2119 btrfs_err_rl(fs_info, 2120 "%s: bad levels, cur_level=%d root_level=%d", 2121 __func__, cur_level, root_level); 2122 return -EUCLEAN; 2123 } 2124 2125 /* Read the tree block if needed */ 2126 if (dst_path->nodes[cur_level] == NULL) { 2127 int parent_slot; 2128 u64 child_gen; 2129 2130 /* 2131 * dst_path->nodes[root_level] must be initialized before 2132 * calling this function. 2133 */ 2134 if (cur_level == root_level) { 2135 btrfs_err_rl(fs_info, 2136 "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d", 2137 __func__, root_level, root_level, cur_level); 2138 return -EUCLEAN; 2139 } 2140 2141 /* 2142 * We need to get child blockptr/gen from parent before we can 2143 * read it. 2144 */ 2145 eb = dst_path->nodes[cur_level + 1]; 2146 parent_slot = dst_path->slots[cur_level + 1]; 2147 child_gen = btrfs_node_ptr_generation(eb, parent_slot); 2148 2149 /* This node is old, no need to trace */ 2150 if (child_gen < last_snapshot) 2151 goto out; 2152 2153 eb = btrfs_read_node_slot(eb, parent_slot); 2154 if (IS_ERR(eb)) { 2155 ret = PTR_ERR(eb); 2156 goto out; 2157 } 2158 2159 dst_path->nodes[cur_level] = eb; 2160 dst_path->slots[cur_level] = 0; 2161 2162 btrfs_tree_read_lock(eb); 2163 dst_path->locks[cur_level] = BTRFS_READ_LOCK; 2164 need_cleanup = true; 2165 } 2166 2167 /* Now record this tree block and its counter part for qgroups */ 2168 ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level, 2169 root_level, trace_leaf); 2170 if (ret < 0) 2171 goto cleanup; 2172 2173 eb = dst_path->nodes[cur_level]; 2174 2175 if (cur_level > 0) { 2176 /* Iterate all child tree blocks */ 2177 for (i = 0; i < btrfs_header_nritems(eb); i++) { 2178 /* Skip old tree blocks as they won't be swapped */ 2179 if (btrfs_node_ptr_generation(eb, i) < last_snapshot) 2180 continue; 2181 dst_path->slots[cur_level] = i; 2182 2183 /* Recursive call (at most 7 times) */ 2184 ret = qgroup_trace_new_subtree_blocks(trans, src_eb, 2185 dst_path, cur_level - 1, root_level, 2186 last_snapshot, trace_leaf); 2187 if (ret < 0) 2188 goto cleanup; 2189 } 2190 } 2191 2192 cleanup: 2193 if (need_cleanup) { 2194 /* Clean up */ 2195 btrfs_tree_unlock_rw(dst_path->nodes[cur_level], 2196 dst_path->locks[cur_level]); 2197 free_extent_buffer(dst_path->nodes[cur_level]); 2198 dst_path->nodes[cur_level] = NULL; 2199 dst_path->slots[cur_level] = 0; 2200 dst_path->locks[cur_level] = 0; 2201 } 2202 out: 2203 return ret; 2204 } 2205 2206 static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, 2207 struct extent_buffer *src_eb, 2208 struct extent_buffer *dst_eb, 2209 u64 last_snapshot, bool trace_leaf) 2210 { 2211 struct btrfs_fs_info *fs_info = trans->fs_info; 2212 struct btrfs_path *dst_path = NULL; 2213 int level; 2214 int ret; 2215 2216 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2217 return 0; 2218 2219 /* Wrong parameter order */ 2220 if (btrfs_header_generation(src_eb) > btrfs_header_generation(dst_eb)) { 2221 btrfs_err_rl(fs_info, 2222 "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__, 2223 btrfs_header_generation(src_eb), 2224 btrfs_header_generation(dst_eb)); 2225 return -EUCLEAN; 2226 } 2227 2228 if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) { 2229 ret = -EIO; 2230 goto out; 2231 } 2232 2233 level = btrfs_header_level(dst_eb); 2234 dst_path = btrfs_alloc_path(); 2235 if (!dst_path) { 2236 ret = -ENOMEM; 2237 goto out; 2238 } 2239 /* For dst_path */ 2240 atomic_inc(&dst_eb->refs); 2241 dst_path->nodes[level] = dst_eb; 2242 dst_path->slots[level] = 0; 2243 dst_path->locks[level] = 0; 2244 2245 /* Do the generation aware breadth-first search */ 2246 ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level, 2247 level, last_snapshot, trace_leaf); 2248 if (ret < 0) 2249 goto out; 2250 ret = 0; 2251 2252 out: 2253 btrfs_free_path(dst_path); 2254 if (ret < 0) 2255 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 2256 return ret; 2257 } 2258 2259 int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, 2260 struct extent_buffer *root_eb, 2261 u64 root_gen, int root_level) 2262 { 2263 struct btrfs_fs_info *fs_info = trans->fs_info; 2264 int ret = 0; 2265 int level; 2266 struct extent_buffer *eb = root_eb; 2267 struct btrfs_path *path = NULL; 2268 2269 BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL); 2270 BUG_ON(root_eb == NULL); 2271 2272 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2273 return 0; 2274 2275 if (!extent_buffer_uptodate(root_eb)) { 2276 ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL); 2277 if (ret) 2278 goto out; 2279 } 2280 2281 if (root_level == 0) { 2282 ret = btrfs_qgroup_trace_leaf_items(trans, root_eb); 2283 goto out; 2284 } 2285 2286 path = btrfs_alloc_path(); 2287 if (!path) 2288 return -ENOMEM; 2289 2290 /* 2291 * Walk down the tree. Missing extent blocks are filled in as 2292 * we go. Metadata is accounted every time we read a new 2293 * extent block. 2294 * 2295 * When we reach a leaf, we account for file extent items in it, 2296 * walk back up the tree (adjusting slot pointers as we go) 2297 * and restart the search process. 2298 */ 2299 atomic_inc(&root_eb->refs); /* For path */ 2300 path->nodes[root_level] = root_eb; 2301 path->slots[root_level] = 0; 2302 path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ 2303 walk_down: 2304 level = root_level; 2305 while (level >= 0) { 2306 if (path->nodes[level] == NULL) { 2307 int parent_slot; 2308 u64 child_bytenr; 2309 2310 /* 2311 * We need to get child blockptr from parent before we 2312 * can read it. 2313 */ 2314 eb = path->nodes[level + 1]; 2315 parent_slot = path->slots[level + 1]; 2316 child_bytenr = btrfs_node_blockptr(eb, parent_slot); 2317 2318 eb = btrfs_read_node_slot(eb, parent_slot); 2319 if (IS_ERR(eb)) { 2320 ret = PTR_ERR(eb); 2321 goto out; 2322 } 2323 2324 path->nodes[level] = eb; 2325 path->slots[level] = 0; 2326 2327 btrfs_tree_read_lock(eb); 2328 path->locks[level] = BTRFS_READ_LOCK; 2329 2330 ret = btrfs_qgroup_trace_extent(trans, child_bytenr, 2331 fs_info->nodesize, 2332 GFP_NOFS); 2333 if (ret) 2334 goto out; 2335 } 2336 2337 if (level == 0) { 2338 ret = btrfs_qgroup_trace_leaf_items(trans, 2339 path->nodes[level]); 2340 if (ret) 2341 goto out; 2342 2343 /* Nonzero return here means we completed our search */ 2344 ret = adjust_slots_upwards(path, root_level); 2345 if (ret) 2346 break; 2347 2348 /* Restart search with new slots */ 2349 goto walk_down; 2350 } 2351 2352 level--; 2353 } 2354 2355 ret = 0; 2356 out: 2357 btrfs_free_path(path); 2358 2359 return ret; 2360 } 2361 2362 #define UPDATE_NEW 0 2363 #define UPDATE_OLD 1 2364 /* 2365 * Walk all of the roots that points to the bytenr and adjust their refcnts. 2366 */ 2367 static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, 2368 struct ulist *roots, struct ulist *tmp, 2369 struct ulist *qgroups, u64 seq, int update_old) 2370 { 2371 struct ulist_node *unode; 2372 struct ulist_iterator uiter; 2373 struct ulist_node *tmp_unode; 2374 struct ulist_iterator tmp_uiter; 2375 struct btrfs_qgroup *qg; 2376 int ret = 0; 2377 2378 if (!roots) 2379 return 0; 2380 ULIST_ITER_INIT(&uiter); 2381 while ((unode = ulist_next(roots, &uiter))) { 2382 qg = find_qgroup_rb(fs_info, unode->val); 2383 if (!qg) 2384 continue; 2385 2386 ulist_reinit(tmp); 2387 ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg), 2388 GFP_ATOMIC); 2389 if (ret < 0) 2390 return ret; 2391 ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC); 2392 if (ret < 0) 2393 return ret; 2394 ULIST_ITER_INIT(&tmp_uiter); 2395 while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { 2396 struct btrfs_qgroup_list *glist; 2397 2398 qg = unode_aux_to_qgroup(tmp_unode); 2399 if (update_old) 2400 btrfs_qgroup_update_old_refcnt(qg, seq, 1); 2401 else 2402 btrfs_qgroup_update_new_refcnt(qg, seq, 1); 2403 list_for_each_entry(glist, &qg->groups, next_group) { 2404 ret = ulist_add(qgroups, glist->group->qgroupid, 2405 qgroup_to_aux(glist->group), 2406 GFP_ATOMIC); 2407 if (ret < 0) 2408 return ret; 2409 ret = ulist_add(tmp, glist->group->qgroupid, 2410 qgroup_to_aux(glist->group), 2411 GFP_ATOMIC); 2412 if (ret < 0) 2413 return ret; 2414 } 2415 } 2416 } 2417 return 0; 2418 } 2419 2420 /* 2421 * Update qgroup rfer/excl counters. 2422 * Rfer update is easy, codes can explain themselves. 2423 * 2424 * Excl update is tricky, the update is split into 2 parts. 2425 * Part 1: Possible exclusive <-> sharing detect: 2426 * | A | !A | 2427 * ------------------------------------- 2428 * B | * | - | 2429 * ------------------------------------- 2430 * !B | + | ** | 2431 * ------------------------------------- 2432 * 2433 * Conditions: 2434 * A: cur_old_roots < nr_old_roots (not exclusive before) 2435 * !A: cur_old_roots == nr_old_roots (possible exclusive before) 2436 * B: cur_new_roots < nr_new_roots (not exclusive now) 2437 * !B: cur_new_roots == nr_new_roots (possible exclusive now) 2438 * 2439 * Results: 2440 * +: Possible sharing -> exclusive -: Possible exclusive -> sharing 2441 * *: Definitely not changed. **: Possible unchanged. 2442 * 2443 * For !A and !B condition, the exception is cur_old/new_roots == 0 case. 2444 * 2445 * To make the logic clear, we first use condition A and B to split 2446 * combination into 4 results. 2447 * 2448 * Then, for result "+" and "-", check old/new_roots == 0 case, as in them 2449 * only on variant maybe 0. 2450 * 2451 * Lastly, check result **, since there are 2 variants maybe 0, split them 2452 * again(2x2). 2453 * But this time we don't need to consider other things, the codes and logic 2454 * is easy to understand now. 2455 */ 2456 static int qgroup_update_counters(struct btrfs_fs_info *fs_info, 2457 struct ulist *qgroups, 2458 u64 nr_old_roots, 2459 u64 nr_new_roots, 2460 u64 num_bytes, u64 seq) 2461 { 2462 struct ulist_node *unode; 2463 struct ulist_iterator uiter; 2464 struct btrfs_qgroup *qg; 2465 u64 cur_new_count, cur_old_count; 2466 2467 ULIST_ITER_INIT(&uiter); 2468 while ((unode = ulist_next(qgroups, &uiter))) { 2469 bool dirty = false; 2470 2471 qg = unode_aux_to_qgroup(unode); 2472 cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); 2473 cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); 2474 2475 trace_qgroup_update_counters(fs_info, qg, cur_old_count, 2476 cur_new_count); 2477 2478 /* Rfer update part */ 2479 if (cur_old_count == 0 && cur_new_count > 0) { 2480 qg->rfer += num_bytes; 2481 qg->rfer_cmpr += num_bytes; 2482 dirty = true; 2483 } 2484 if (cur_old_count > 0 && cur_new_count == 0) { 2485 qg->rfer -= num_bytes; 2486 qg->rfer_cmpr -= num_bytes; 2487 dirty = true; 2488 } 2489 2490 /* Excl update part */ 2491 /* Exclusive/none -> shared case */ 2492 if (cur_old_count == nr_old_roots && 2493 cur_new_count < nr_new_roots) { 2494 /* Exclusive -> shared */ 2495 if (cur_old_count != 0) { 2496 qg->excl -= num_bytes; 2497 qg->excl_cmpr -= num_bytes; 2498 dirty = true; 2499 } 2500 } 2501 2502 /* Shared -> exclusive/none case */ 2503 if (cur_old_count < nr_old_roots && 2504 cur_new_count == nr_new_roots) { 2505 /* Shared->exclusive */ 2506 if (cur_new_count != 0) { 2507 qg->excl += num_bytes; 2508 qg->excl_cmpr += num_bytes; 2509 dirty = true; 2510 } 2511 } 2512 2513 /* Exclusive/none -> exclusive/none case */ 2514 if (cur_old_count == nr_old_roots && 2515 cur_new_count == nr_new_roots) { 2516 if (cur_old_count == 0) { 2517 /* None -> exclusive/none */ 2518 2519 if (cur_new_count != 0) { 2520 /* None -> exclusive */ 2521 qg->excl += num_bytes; 2522 qg->excl_cmpr += num_bytes; 2523 dirty = true; 2524 } 2525 /* None -> none, nothing changed */ 2526 } else { 2527 /* Exclusive -> exclusive/none */ 2528 2529 if (cur_new_count == 0) { 2530 /* Exclusive -> none */ 2531 qg->excl -= num_bytes; 2532 qg->excl_cmpr -= num_bytes; 2533 dirty = true; 2534 } 2535 /* Exclusive -> exclusive, nothing changed */ 2536 } 2537 } 2538 2539 if (dirty) 2540 qgroup_dirty(fs_info, qg); 2541 } 2542 return 0; 2543 } 2544 2545 /* 2546 * Check if the @roots potentially is a list of fs tree roots 2547 * 2548 * Return 0 for definitely not a fs/subvol tree roots ulist 2549 * Return 1 for possible fs/subvol tree roots in the list (considering an empty 2550 * one as well) 2551 */ 2552 static int maybe_fs_roots(struct ulist *roots) 2553 { 2554 struct ulist_node *unode; 2555 struct ulist_iterator uiter; 2556 2557 /* Empty one, still possible for fs roots */ 2558 if (!roots || roots->nnodes == 0) 2559 return 1; 2560 2561 ULIST_ITER_INIT(&uiter); 2562 unode = ulist_next(roots, &uiter); 2563 if (!unode) 2564 return 1; 2565 2566 /* 2567 * If it contains fs tree roots, then it must belong to fs/subvol 2568 * trees. 2569 * If it contains a non-fs tree, it won't be shared with fs/subvol trees. 2570 */ 2571 return is_fstree(unode->val); 2572 } 2573 2574 int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, 2575 u64 num_bytes, struct ulist *old_roots, 2576 struct ulist *new_roots) 2577 { 2578 struct btrfs_fs_info *fs_info = trans->fs_info; 2579 struct ulist *qgroups = NULL; 2580 struct ulist *tmp = NULL; 2581 u64 seq; 2582 u64 nr_new_roots = 0; 2583 u64 nr_old_roots = 0; 2584 int ret = 0; 2585 2586 /* 2587 * If quotas get disabled meanwhile, the resources need to be freed and 2588 * we can't just exit here. 2589 */ 2590 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2591 goto out_free; 2592 2593 if (new_roots) { 2594 if (!maybe_fs_roots(new_roots)) 2595 goto out_free; 2596 nr_new_roots = new_roots->nnodes; 2597 } 2598 if (old_roots) { 2599 if (!maybe_fs_roots(old_roots)) 2600 goto out_free; 2601 nr_old_roots = old_roots->nnodes; 2602 } 2603 2604 /* Quick exit, either not fs tree roots, or won't affect any qgroup */ 2605 if (nr_old_roots == 0 && nr_new_roots == 0) 2606 goto out_free; 2607 2608 BUG_ON(!fs_info->quota_root); 2609 2610 trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr, 2611 num_bytes, nr_old_roots, nr_new_roots); 2612 2613 qgroups = ulist_alloc(GFP_NOFS); 2614 if (!qgroups) { 2615 ret = -ENOMEM; 2616 goto out_free; 2617 } 2618 tmp = ulist_alloc(GFP_NOFS); 2619 if (!tmp) { 2620 ret = -ENOMEM; 2621 goto out_free; 2622 } 2623 2624 mutex_lock(&fs_info->qgroup_rescan_lock); 2625 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 2626 if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { 2627 mutex_unlock(&fs_info->qgroup_rescan_lock); 2628 ret = 0; 2629 goto out_free; 2630 } 2631 } 2632 mutex_unlock(&fs_info->qgroup_rescan_lock); 2633 2634 spin_lock(&fs_info->qgroup_lock); 2635 seq = fs_info->qgroup_seq; 2636 2637 /* Update old refcnts using old_roots */ 2638 ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq, 2639 UPDATE_OLD); 2640 if (ret < 0) 2641 goto out; 2642 2643 /* Update new refcnts using new_roots */ 2644 ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq, 2645 UPDATE_NEW); 2646 if (ret < 0) 2647 goto out; 2648 2649 qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots, 2650 num_bytes, seq); 2651 2652 /* 2653 * Bump qgroup_seq to avoid seq overlap 2654 */ 2655 fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; 2656 out: 2657 spin_unlock(&fs_info->qgroup_lock); 2658 out_free: 2659 ulist_free(tmp); 2660 ulist_free(qgroups); 2661 ulist_free(old_roots); 2662 ulist_free(new_roots); 2663 return ret; 2664 } 2665 2666 int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) 2667 { 2668 struct btrfs_fs_info *fs_info = trans->fs_info; 2669 struct btrfs_qgroup_extent_record *record; 2670 struct btrfs_delayed_ref_root *delayed_refs; 2671 struct ulist *new_roots = NULL; 2672 struct rb_node *node; 2673 u64 num_dirty_extents = 0; 2674 u64 qgroup_to_skip; 2675 int ret = 0; 2676 2677 delayed_refs = &trans->transaction->delayed_refs; 2678 qgroup_to_skip = delayed_refs->qgroup_to_skip; 2679 while ((node = rb_first(&delayed_refs->dirty_extent_root))) { 2680 record = rb_entry(node, struct btrfs_qgroup_extent_record, 2681 node); 2682 2683 num_dirty_extents++; 2684 trace_btrfs_qgroup_account_extents(fs_info, record); 2685 2686 if (!ret) { 2687 /* 2688 * Old roots should be searched when inserting qgroup 2689 * extent record 2690 */ 2691 if (WARN_ON(!record->old_roots)) { 2692 /* Search commit root to find old_roots */ 2693 ret = btrfs_find_all_roots(NULL, fs_info, 2694 record->bytenr, 0, 2695 &record->old_roots, false); 2696 if (ret < 0) 2697 goto cleanup; 2698 } 2699 2700 /* Free the reserved data space */ 2701 btrfs_qgroup_free_refroot(fs_info, 2702 record->data_rsv_refroot, 2703 record->data_rsv, 2704 BTRFS_QGROUP_RSV_DATA); 2705 /* 2706 * Use BTRFS_SEQ_LAST as time_seq to do special search, 2707 * which doesn't lock tree or delayed_refs and search 2708 * current root. It's safe inside commit_transaction(). 2709 */ 2710 ret = btrfs_find_all_roots(trans, fs_info, 2711 record->bytenr, BTRFS_SEQ_LAST, &new_roots, false); 2712 if (ret < 0) 2713 goto cleanup; 2714 if (qgroup_to_skip) { 2715 ulist_del(new_roots, qgroup_to_skip, 0); 2716 ulist_del(record->old_roots, qgroup_to_skip, 2717 0); 2718 } 2719 ret = btrfs_qgroup_account_extent(trans, record->bytenr, 2720 record->num_bytes, 2721 record->old_roots, 2722 new_roots); 2723 record->old_roots = NULL; 2724 new_roots = NULL; 2725 } 2726 cleanup: 2727 ulist_free(record->old_roots); 2728 ulist_free(new_roots); 2729 new_roots = NULL; 2730 rb_erase(node, &delayed_refs->dirty_extent_root); 2731 kfree(record); 2732 2733 } 2734 trace_qgroup_num_dirty_extents(fs_info, trans->transid, 2735 num_dirty_extents); 2736 return ret; 2737 } 2738 2739 /* 2740 * called from commit_transaction. Writes all changed qgroups to disk. 2741 */ 2742 int btrfs_run_qgroups(struct btrfs_trans_handle *trans) 2743 { 2744 struct btrfs_fs_info *fs_info = trans->fs_info; 2745 int ret = 0; 2746 2747 if (!fs_info->quota_root) 2748 return ret; 2749 2750 spin_lock(&fs_info->qgroup_lock); 2751 while (!list_empty(&fs_info->dirty_qgroups)) { 2752 struct btrfs_qgroup *qgroup; 2753 qgroup = list_first_entry(&fs_info->dirty_qgroups, 2754 struct btrfs_qgroup, dirty); 2755 list_del_init(&qgroup->dirty); 2756 spin_unlock(&fs_info->qgroup_lock); 2757 ret = update_qgroup_info_item(trans, qgroup); 2758 if (ret) 2759 fs_info->qgroup_flags |= 2760 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 2761 ret = update_qgroup_limit_item(trans, qgroup); 2762 if (ret) 2763 fs_info->qgroup_flags |= 2764 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 2765 spin_lock(&fs_info->qgroup_lock); 2766 } 2767 if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2768 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON; 2769 else 2770 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; 2771 spin_unlock(&fs_info->qgroup_lock); 2772 2773 ret = update_qgroup_status_item(trans); 2774 if (ret) 2775 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 2776 2777 return ret; 2778 } 2779 2780 /* 2781 * Copy the accounting information between qgroups. This is necessary 2782 * when a snapshot or a subvolume is created. Throwing an error will 2783 * cause a transaction abort so we take extra care here to only error 2784 * when a readonly fs is a reasonable outcome. 2785 */ 2786 int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, 2787 u64 objectid, struct btrfs_qgroup_inherit *inherit) 2788 { 2789 int ret = 0; 2790 int i; 2791 u64 *i_qgroups; 2792 bool committing = false; 2793 struct btrfs_fs_info *fs_info = trans->fs_info; 2794 struct btrfs_root *quota_root; 2795 struct btrfs_qgroup *srcgroup; 2796 struct btrfs_qgroup *dstgroup; 2797 bool need_rescan = false; 2798 u32 level_size = 0; 2799 u64 nums; 2800 2801 /* 2802 * There are only two callers of this function. 2803 * 2804 * One in create_subvol() in the ioctl context, which needs to hold 2805 * the qgroup_ioctl_lock. 2806 * 2807 * The other one in create_pending_snapshot() where no other qgroup 2808 * code can modify the fs as they all need to either start a new trans 2809 * or hold a trans handler, thus we don't need to hold 2810 * qgroup_ioctl_lock. 2811 * This would avoid long and complex lock chain and make lockdep happy. 2812 */ 2813 spin_lock(&fs_info->trans_lock); 2814 if (trans->transaction->state == TRANS_STATE_COMMIT_DOING) 2815 committing = true; 2816 spin_unlock(&fs_info->trans_lock); 2817 2818 if (!committing) 2819 mutex_lock(&fs_info->qgroup_ioctl_lock); 2820 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 2821 goto out; 2822 2823 quota_root = fs_info->quota_root; 2824 if (!quota_root) { 2825 ret = -EINVAL; 2826 goto out; 2827 } 2828 2829 if (inherit) { 2830 i_qgroups = (u64 *)(inherit + 1); 2831 nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + 2832 2 * inherit->num_excl_copies; 2833 for (i = 0; i < nums; ++i) { 2834 srcgroup = find_qgroup_rb(fs_info, *i_qgroups); 2835 2836 /* 2837 * Zero out invalid groups so we can ignore 2838 * them later. 2839 */ 2840 if (!srcgroup || 2841 ((srcgroup->qgroupid >> 48) <= (objectid >> 48))) 2842 *i_qgroups = 0ULL; 2843 2844 ++i_qgroups; 2845 } 2846 } 2847 2848 /* 2849 * create a tracking group for the subvol itself 2850 */ 2851 ret = add_qgroup_item(trans, quota_root, objectid); 2852 if (ret) 2853 goto out; 2854 2855 /* 2856 * add qgroup to all inherited groups 2857 */ 2858 if (inherit) { 2859 i_qgroups = (u64 *)(inherit + 1); 2860 for (i = 0; i < inherit->num_qgroups; ++i, ++i_qgroups) { 2861 if (*i_qgroups == 0) 2862 continue; 2863 ret = add_qgroup_relation_item(trans, objectid, 2864 *i_qgroups); 2865 if (ret && ret != -EEXIST) 2866 goto out; 2867 ret = add_qgroup_relation_item(trans, *i_qgroups, 2868 objectid); 2869 if (ret && ret != -EEXIST) 2870 goto out; 2871 } 2872 ret = 0; 2873 } 2874 2875 2876 spin_lock(&fs_info->qgroup_lock); 2877 2878 dstgroup = add_qgroup_rb(fs_info, objectid); 2879 if (IS_ERR(dstgroup)) { 2880 ret = PTR_ERR(dstgroup); 2881 goto unlock; 2882 } 2883 2884 if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { 2885 dstgroup->lim_flags = inherit->lim.flags; 2886 dstgroup->max_rfer = inherit->lim.max_rfer; 2887 dstgroup->max_excl = inherit->lim.max_excl; 2888 dstgroup->rsv_rfer = inherit->lim.rsv_rfer; 2889 dstgroup->rsv_excl = inherit->lim.rsv_excl; 2890 2891 ret = update_qgroup_limit_item(trans, dstgroup); 2892 if (ret) { 2893 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 2894 btrfs_info(fs_info, 2895 "unable to update quota limit for %llu", 2896 dstgroup->qgroupid); 2897 goto unlock; 2898 } 2899 } 2900 2901 if (srcid) { 2902 srcgroup = find_qgroup_rb(fs_info, srcid); 2903 if (!srcgroup) 2904 goto unlock; 2905 2906 /* 2907 * We call inherit after we clone the root in order to make sure 2908 * our counts don't go crazy, so at this point the only 2909 * difference between the two roots should be the root node. 2910 */ 2911 level_size = fs_info->nodesize; 2912 dstgroup->rfer = srcgroup->rfer; 2913 dstgroup->rfer_cmpr = srcgroup->rfer_cmpr; 2914 dstgroup->excl = level_size; 2915 dstgroup->excl_cmpr = level_size; 2916 srcgroup->excl = level_size; 2917 srcgroup->excl_cmpr = level_size; 2918 2919 /* inherit the limit info */ 2920 dstgroup->lim_flags = srcgroup->lim_flags; 2921 dstgroup->max_rfer = srcgroup->max_rfer; 2922 dstgroup->max_excl = srcgroup->max_excl; 2923 dstgroup->rsv_rfer = srcgroup->rsv_rfer; 2924 dstgroup->rsv_excl = srcgroup->rsv_excl; 2925 2926 qgroup_dirty(fs_info, dstgroup); 2927 qgroup_dirty(fs_info, srcgroup); 2928 } 2929 2930 if (!inherit) 2931 goto unlock; 2932 2933 i_qgroups = (u64 *)(inherit + 1); 2934 for (i = 0; i < inherit->num_qgroups; ++i) { 2935 if (*i_qgroups) { 2936 ret = add_relation_rb(fs_info, objectid, *i_qgroups); 2937 if (ret) 2938 goto unlock; 2939 } 2940 ++i_qgroups; 2941 2942 /* 2943 * If we're doing a snapshot, and adding the snapshot to a new 2944 * qgroup, the numbers are guaranteed to be incorrect. 2945 */ 2946 if (srcid) 2947 need_rescan = true; 2948 } 2949 2950 for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) { 2951 struct btrfs_qgroup *src; 2952 struct btrfs_qgroup *dst; 2953 2954 if (!i_qgroups[0] || !i_qgroups[1]) 2955 continue; 2956 2957 src = find_qgroup_rb(fs_info, i_qgroups[0]); 2958 dst = find_qgroup_rb(fs_info, i_qgroups[1]); 2959 2960 if (!src || !dst) { 2961 ret = -EINVAL; 2962 goto unlock; 2963 } 2964 2965 dst->rfer = src->rfer - level_size; 2966 dst->rfer_cmpr = src->rfer_cmpr - level_size; 2967 2968 /* Manually tweaking numbers certainly needs a rescan */ 2969 need_rescan = true; 2970 } 2971 for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) { 2972 struct btrfs_qgroup *src; 2973 struct btrfs_qgroup *dst; 2974 2975 if (!i_qgroups[0] || !i_qgroups[1]) 2976 continue; 2977 2978 src = find_qgroup_rb(fs_info, i_qgroups[0]); 2979 dst = find_qgroup_rb(fs_info, i_qgroups[1]); 2980 2981 if (!src || !dst) { 2982 ret = -EINVAL; 2983 goto unlock; 2984 } 2985 2986 dst->excl = src->excl + level_size; 2987 dst->excl_cmpr = src->excl_cmpr + level_size; 2988 need_rescan = true; 2989 } 2990 2991 unlock: 2992 spin_unlock(&fs_info->qgroup_lock); 2993 if (!ret) 2994 ret = btrfs_sysfs_add_one_qgroup(fs_info, dstgroup); 2995 out: 2996 if (!committing) 2997 mutex_unlock(&fs_info->qgroup_ioctl_lock); 2998 if (need_rescan) 2999 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 3000 return ret; 3001 } 3002 3003 static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes) 3004 { 3005 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && 3006 qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer) 3007 return false; 3008 3009 if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) && 3010 qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl) 3011 return false; 3012 3013 return true; 3014 } 3015 3016 static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, 3017 enum btrfs_qgroup_rsv_type type) 3018 { 3019 struct btrfs_qgroup *qgroup; 3020 struct btrfs_fs_info *fs_info = root->fs_info; 3021 u64 ref_root = root->root_key.objectid; 3022 int ret = 0; 3023 struct ulist_node *unode; 3024 struct ulist_iterator uiter; 3025 3026 if (!is_fstree(ref_root)) 3027 return 0; 3028 3029 if (num_bytes == 0) 3030 return 0; 3031 3032 if (test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags) && 3033 capable(CAP_SYS_RESOURCE)) 3034 enforce = false; 3035 3036 spin_lock(&fs_info->qgroup_lock); 3037 if (!fs_info->quota_root) 3038 goto out; 3039 3040 qgroup = find_qgroup_rb(fs_info, ref_root); 3041 if (!qgroup) 3042 goto out; 3043 3044 /* 3045 * in a first step, we check all affected qgroups if any limits would 3046 * be exceeded 3047 */ 3048 ulist_reinit(fs_info->qgroup_ulist); 3049 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, 3050 qgroup_to_aux(qgroup), GFP_ATOMIC); 3051 if (ret < 0) 3052 goto out; 3053 ULIST_ITER_INIT(&uiter); 3054 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { 3055 struct btrfs_qgroup *qg; 3056 struct btrfs_qgroup_list *glist; 3057 3058 qg = unode_aux_to_qgroup(unode); 3059 3060 if (enforce && !qgroup_check_limits(qg, num_bytes)) { 3061 ret = -EDQUOT; 3062 goto out; 3063 } 3064 3065 list_for_each_entry(glist, &qg->groups, next_group) { 3066 ret = ulist_add(fs_info->qgroup_ulist, 3067 glist->group->qgroupid, 3068 qgroup_to_aux(glist->group), GFP_ATOMIC); 3069 if (ret < 0) 3070 goto out; 3071 } 3072 } 3073 ret = 0; 3074 /* 3075 * no limits exceeded, now record the reservation into all qgroups 3076 */ 3077 ULIST_ITER_INIT(&uiter); 3078 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { 3079 struct btrfs_qgroup *qg; 3080 3081 qg = unode_aux_to_qgroup(unode); 3082 3083 qgroup_rsv_add(fs_info, qg, num_bytes, type); 3084 } 3085 3086 out: 3087 spin_unlock(&fs_info->qgroup_lock); 3088 return ret; 3089 } 3090 3091 /* 3092 * Free @num_bytes of reserved space with @type for qgroup. (Normally level 0 3093 * qgroup). 3094 * 3095 * Will handle all higher level qgroup too. 3096 * 3097 * NOTE: If @num_bytes is (u64)-1, this means to free all bytes of this qgroup. 3098 * This special case is only used for META_PERTRANS type. 3099 */ 3100 void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, 3101 u64 ref_root, u64 num_bytes, 3102 enum btrfs_qgroup_rsv_type type) 3103 { 3104 struct btrfs_qgroup *qgroup; 3105 struct ulist_node *unode; 3106 struct ulist_iterator uiter; 3107 int ret = 0; 3108 3109 if (!is_fstree(ref_root)) 3110 return; 3111 3112 if (num_bytes == 0) 3113 return; 3114 3115 if (num_bytes == (u64)-1 && type != BTRFS_QGROUP_RSV_META_PERTRANS) { 3116 WARN(1, "%s: Invalid type to free", __func__); 3117 return; 3118 } 3119 spin_lock(&fs_info->qgroup_lock); 3120 3121 if (!fs_info->quota_root) 3122 goto out; 3123 3124 qgroup = find_qgroup_rb(fs_info, ref_root); 3125 if (!qgroup) 3126 goto out; 3127 3128 if (num_bytes == (u64)-1) 3129 /* 3130 * We're freeing all pertrans rsv, get reserved value from 3131 * level 0 qgroup as real num_bytes to free. 3132 */ 3133 num_bytes = qgroup->rsv.values[type]; 3134 3135 ulist_reinit(fs_info->qgroup_ulist); 3136 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, 3137 qgroup_to_aux(qgroup), GFP_ATOMIC); 3138 if (ret < 0) 3139 goto out; 3140 ULIST_ITER_INIT(&uiter); 3141 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { 3142 struct btrfs_qgroup *qg; 3143 struct btrfs_qgroup_list *glist; 3144 3145 qg = unode_aux_to_qgroup(unode); 3146 3147 qgroup_rsv_release(fs_info, qg, num_bytes, type); 3148 3149 list_for_each_entry(glist, &qg->groups, next_group) { 3150 ret = ulist_add(fs_info->qgroup_ulist, 3151 glist->group->qgroupid, 3152 qgroup_to_aux(glist->group), GFP_ATOMIC); 3153 if (ret < 0) 3154 goto out; 3155 } 3156 } 3157 3158 out: 3159 spin_unlock(&fs_info->qgroup_lock); 3160 } 3161 3162 /* 3163 * Check if the leaf is the last leaf. Which means all node pointers 3164 * are at their last position. 3165 */ 3166 static bool is_last_leaf(struct btrfs_path *path) 3167 { 3168 int i; 3169 3170 for (i = 1; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) { 3171 if (path->slots[i] != btrfs_header_nritems(path->nodes[i]) - 1) 3172 return false; 3173 } 3174 return true; 3175 } 3176 3177 /* 3178 * returns < 0 on error, 0 when more leafs are to be scanned. 3179 * returns 1 when done. 3180 */ 3181 static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, 3182 struct btrfs_path *path) 3183 { 3184 struct btrfs_fs_info *fs_info = trans->fs_info; 3185 struct btrfs_root *extent_root; 3186 struct btrfs_key found; 3187 struct extent_buffer *scratch_leaf = NULL; 3188 struct ulist *roots = NULL; 3189 u64 num_bytes; 3190 bool done; 3191 int slot; 3192 int ret; 3193 3194 mutex_lock(&fs_info->qgroup_rescan_lock); 3195 extent_root = btrfs_extent_root(fs_info, 3196 fs_info->qgroup_rescan_progress.objectid); 3197 ret = btrfs_search_slot_for_read(extent_root, 3198 &fs_info->qgroup_rescan_progress, 3199 path, 1, 0); 3200 3201 btrfs_debug(fs_info, 3202 "current progress key (%llu %u %llu), search_slot ret %d", 3203 fs_info->qgroup_rescan_progress.objectid, 3204 fs_info->qgroup_rescan_progress.type, 3205 fs_info->qgroup_rescan_progress.offset, ret); 3206 3207 if (ret) { 3208 /* 3209 * The rescan is about to end, we will not be scanning any 3210 * further blocks. We cannot unset the RESCAN flag here, because 3211 * we want to commit the transaction if everything went well. 3212 * To make the live accounting work in this phase, we set our 3213 * scan progress pointer such that every real extent objectid 3214 * will be smaller. 3215 */ 3216 fs_info->qgroup_rescan_progress.objectid = (u64)-1; 3217 btrfs_release_path(path); 3218 mutex_unlock(&fs_info->qgroup_rescan_lock); 3219 return ret; 3220 } 3221 done = is_last_leaf(path); 3222 3223 btrfs_item_key_to_cpu(path->nodes[0], &found, 3224 btrfs_header_nritems(path->nodes[0]) - 1); 3225 fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; 3226 3227 scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]); 3228 if (!scratch_leaf) { 3229 ret = -ENOMEM; 3230 mutex_unlock(&fs_info->qgroup_rescan_lock); 3231 goto out; 3232 } 3233 slot = path->slots[0]; 3234 btrfs_release_path(path); 3235 mutex_unlock(&fs_info->qgroup_rescan_lock); 3236 3237 for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { 3238 btrfs_item_key_to_cpu(scratch_leaf, &found, slot); 3239 if (found.type != BTRFS_EXTENT_ITEM_KEY && 3240 found.type != BTRFS_METADATA_ITEM_KEY) 3241 continue; 3242 if (found.type == BTRFS_METADATA_ITEM_KEY) 3243 num_bytes = fs_info->nodesize; 3244 else 3245 num_bytes = found.offset; 3246 3247 ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, 3248 &roots, false); 3249 if (ret < 0) 3250 goto out; 3251 /* For rescan, just pass old_roots as NULL */ 3252 ret = btrfs_qgroup_account_extent(trans, found.objectid, 3253 num_bytes, NULL, roots); 3254 if (ret < 0) 3255 goto out; 3256 } 3257 out: 3258 if (scratch_leaf) 3259 free_extent_buffer(scratch_leaf); 3260 3261 if (done && !ret) { 3262 ret = 1; 3263 fs_info->qgroup_rescan_progress.objectid = (u64)-1; 3264 } 3265 return ret; 3266 } 3267 3268 static bool rescan_should_stop(struct btrfs_fs_info *fs_info) 3269 { 3270 return btrfs_fs_closing(fs_info) || 3271 test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); 3272 } 3273 3274 static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) 3275 { 3276 struct btrfs_fs_info *fs_info = container_of(work, struct btrfs_fs_info, 3277 qgroup_rescan_work); 3278 struct btrfs_path *path; 3279 struct btrfs_trans_handle *trans = NULL; 3280 int err = -ENOMEM; 3281 int ret = 0; 3282 bool stopped = false; 3283 3284 path = btrfs_alloc_path(); 3285 if (!path) 3286 goto out; 3287 /* 3288 * Rescan should only search for commit root, and any later difference 3289 * should be recorded by qgroup 3290 */ 3291 path->search_commit_root = 1; 3292 path->skip_locking = 1; 3293 3294 err = 0; 3295 while (!err && !(stopped = rescan_should_stop(fs_info))) { 3296 trans = btrfs_start_transaction(fs_info->fs_root, 0); 3297 if (IS_ERR(trans)) { 3298 err = PTR_ERR(trans); 3299 break; 3300 } 3301 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 3302 err = -EINTR; 3303 } else { 3304 err = qgroup_rescan_leaf(trans, path); 3305 } 3306 if (err > 0) 3307 btrfs_commit_transaction(trans); 3308 else 3309 btrfs_end_transaction(trans); 3310 } 3311 3312 out: 3313 btrfs_free_path(path); 3314 3315 mutex_lock(&fs_info->qgroup_rescan_lock); 3316 if (err > 0 && 3317 fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { 3318 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 3319 } else if (err < 0) { 3320 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 3321 } 3322 mutex_unlock(&fs_info->qgroup_rescan_lock); 3323 3324 /* 3325 * only update status, since the previous part has already updated the 3326 * qgroup info. 3327 */ 3328 trans = btrfs_start_transaction(fs_info->quota_root, 1); 3329 if (IS_ERR(trans)) { 3330 err = PTR_ERR(trans); 3331 trans = NULL; 3332 btrfs_err(fs_info, 3333 "fail to start transaction for status update: %d", 3334 err); 3335 } 3336 3337 mutex_lock(&fs_info->qgroup_rescan_lock); 3338 if (!stopped) 3339 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3340 if (trans) { 3341 ret = update_qgroup_status_item(trans); 3342 if (ret < 0) { 3343 err = ret; 3344 btrfs_err(fs_info, "fail to update qgroup status: %d", 3345 err); 3346 } 3347 } 3348 fs_info->qgroup_rescan_running = false; 3349 complete_all(&fs_info->qgroup_rescan_completion); 3350 mutex_unlock(&fs_info->qgroup_rescan_lock); 3351 3352 if (!trans) 3353 return; 3354 3355 btrfs_end_transaction(trans); 3356 3357 if (stopped) { 3358 btrfs_info(fs_info, "qgroup scan paused"); 3359 } else if (err >= 0) { 3360 btrfs_info(fs_info, "qgroup scan completed%s", 3361 err > 0 ? " (inconsistency flag cleared)" : ""); 3362 } else { 3363 btrfs_err(fs_info, "qgroup scan failed with %d", err); 3364 } 3365 } 3366 3367 /* 3368 * Checks that (a) no rescan is running and (b) quota is enabled. Allocates all 3369 * memory required for the rescan context. 3370 */ 3371 static int 3372 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, 3373 int init_flags) 3374 { 3375 int ret = 0; 3376 3377 if (!init_flags) { 3378 /* we're resuming qgroup rescan at mount time */ 3379 if (!(fs_info->qgroup_flags & 3380 BTRFS_QGROUP_STATUS_FLAG_RESCAN)) { 3381 btrfs_warn(fs_info, 3382 "qgroup rescan init failed, qgroup rescan is not queued"); 3383 ret = -EINVAL; 3384 } else if (!(fs_info->qgroup_flags & 3385 BTRFS_QGROUP_STATUS_FLAG_ON)) { 3386 btrfs_warn(fs_info, 3387 "qgroup rescan init failed, qgroup is not enabled"); 3388 ret = -EINVAL; 3389 } 3390 3391 if (ret) 3392 return ret; 3393 } 3394 3395 mutex_lock(&fs_info->qgroup_rescan_lock); 3396 3397 if (init_flags) { 3398 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 3399 btrfs_warn(fs_info, 3400 "qgroup rescan is already in progress"); 3401 ret = -EINPROGRESS; 3402 } else if (!(fs_info->qgroup_flags & 3403 BTRFS_QGROUP_STATUS_FLAG_ON)) { 3404 btrfs_warn(fs_info, 3405 "qgroup rescan init failed, qgroup is not enabled"); 3406 ret = -EINVAL; 3407 } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { 3408 /* Quota disable is in progress */ 3409 ret = -EBUSY; 3410 } 3411 3412 if (ret) { 3413 mutex_unlock(&fs_info->qgroup_rescan_lock); 3414 return ret; 3415 } 3416 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3417 } 3418 3419 memset(&fs_info->qgroup_rescan_progress, 0, 3420 sizeof(fs_info->qgroup_rescan_progress)); 3421 fs_info->qgroup_rescan_progress.objectid = progress_objectid; 3422 init_completion(&fs_info->qgroup_rescan_completion); 3423 mutex_unlock(&fs_info->qgroup_rescan_lock); 3424 3425 btrfs_init_work(&fs_info->qgroup_rescan_work, 3426 btrfs_qgroup_rescan_worker, NULL, NULL); 3427 return 0; 3428 } 3429 3430 static void 3431 qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info) 3432 { 3433 struct rb_node *n; 3434 struct btrfs_qgroup *qgroup; 3435 3436 spin_lock(&fs_info->qgroup_lock); 3437 /* clear all current qgroup tracking information */ 3438 for (n = rb_first(&fs_info->qgroup_tree); n; n = rb_next(n)) { 3439 qgroup = rb_entry(n, struct btrfs_qgroup, node); 3440 qgroup->rfer = 0; 3441 qgroup->rfer_cmpr = 0; 3442 qgroup->excl = 0; 3443 qgroup->excl_cmpr = 0; 3444 qgroup_dirty(fs_info, qgroup); 3445 } 3446 spin_unlock(&fs_info->qgroup_lock); 3447 } 3448 3449 int 3450 btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) 3451 { 3452 int ret = 0; 3453 struct btrfs_trans_handle *trans; 3454 3455 ret = qgroup_rescan_init(fs_info, 0, 1); 3456 if (ret) 3457 return ret; 3458 3459 /* 3460 * We have set the rescan_progress to 0, which means no more 3461 * delayed refs will be accounted by btrfs_qgroup_account_ref. 3462 * However, btrfs_qgroup_account_ref may be right after its call 3463 * to btrfs_find_all_roots, in which case it would still do the 3464 * accounting. 3465 * To solve this, we're committing the transaction, which will 3466 * ensure we run all delayed refs and only after that, we are 3467 * going to clear all tracking information for a clean start. 3468 */ 3469 3470 trans = btrfs_join_transaction(fs_info->fs_root); 3471 if (IS_ERR(trans)) { 3472 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3473 return PTR_ERR(trans); 3474 } 3475 ret = btrfs_commit_transaction(trans); 3476 if (ret) { 3477 fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; 3478 return ret; 3479 } 3480 3481 qgroup_rescan_zero_tracking(fs_info); 3482 3483 mutex_lock(&fs_info->qgroup_rescan_lock); 3484 fs_info->qgroup_rescan_running = true; 3485 btrfs_queue_work(fs_info->qgroup_rescan_workers, 3486 &fs_info->qgroup_rescan_work); 3487 mutex_unlock(&fs_info->qgroup_rescan_lock); 3488 3489 return 0; 3490 } 3491 3492 int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, 3493 bool interruptible) 3494 { 3495 int running; 3496 int ret = 0; 3497 3498 mutex_lock(&fs_info->qgroup_rescan_lock); 3499 running = fs_info->qgroup_rescan_running; 3500 mutex_unlock(&fs_info->qgroup_rescan_lock); 3501 3502 if (!running) 3503 return 0; 3504 3505 if (interruptible) 3506 ret = wait_for_completion_interruptible( 3507 &fs_info->qgroup_rescan_completion); 3508 else 3509 wait_for_completion(&fs_info->qgroup_rescan_completion); 3510 3511 return ret; 3512 } 3513 3514 /* 3515 * this is only called from open_ctree where we're still single threaded, thus 3516 * locking is omitted here. 3517 */ 3518 void 3519 btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info) 3520 { 3521 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { 3522 mutex_lock(&fs_info->qgroup_rescan_lock); 3523 fs_info->qgroup_rescan_running = true; 3524 btrfs_queue_work(fs_info->qgroup_rescan_workers, 3525 &fs_info->qgroup_rescan_work); 3526 mutex_unlock(&fs_info->qgroup_rescan_lock); 3527 } 3528 } 3529 3530 #define rbtree_iterate_from_safe(node, next, start) \ 3531 for (node = start; node && ({ next = rb_next(node); 1;}); node = next) 3532 3533 static int qgroup_unreserve_range(struct btrfs_inode *inode, 3534 struct extent_changeset *reserved, u64 start, 3535 u64 len) 3536 { 3537 struct rb_node *node; 3538 struct rb_node *next; 3539 struct ulist_node *entry; 3540 int ret = 0; 3541 3542 node = reserved->range_changed.root.rb_node; 3543 if (!node) 3544 return 0; 3545 while (node) { 3546 entry = rb_entry(node, struct ulist_node, rb_node); 3547 if (entry->val < start) 3548 node = node->rb_right; 3549 else 3550 node = node->rb_left; 3551 } 3552 3553 if (entry->val > start && rb_prev(&entry->rb_node)) 3554 entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node, 3555 rb_node); 3556 3557 rbtree_iterate_from_safe(node, next, &entry->rb_node) { 3558 u64 entry_start; 3559 u64 entry_end; 3560 u64 entry_len; 3561 int clear_ret; 3562 3563 entry = rb_entry(node, struct ulist_node, rb_node); 3564 entry_start = entry->val; 3565 entry_end = entry->aux; 3566 entry_len = entry_end - entry_start + 1; 3567 3568 if (entry_start >= start + len) 3569 break; 3570 if (entry_start + entry_len <= start) 3571 continue; 3572 /* 3573 * Now the entry is in [start, start + len), revert the 3574 * EXTENT_QGROUP_RESERVED bit. 3575 */ 3576 clear_ret = clear_extent_bits(&inode->io_tree, entry_start, 3577 entry_end, EXTENT_QGROUP_RESERVED); 3578 if (!ret && clear_ret < 0) 3579 ret = clear_ret; 3580 3581 ulist_del(&reserved->range_changed, entry->val, entry->aux); 3582 if (likely(reserved->bytes_changed >= entry_len)) { 3583 reserved->bytes_changed -= entry_len; 3584 } else { 3585 WARN_ON(1); 3586 reserved->bytes_changed = 0; 3587 } 3588 } 3589 3590 return ret; 3591 } 3592 3593 /* 3594 * Try to free some space for qgroup. 3595 * 3596 * For qgroup, there are only 3 ways to free qgroup space: 3597 * - Flush nodatacow write 3598 * Any nodatacow write will free its reserved data space at run_delalloc_range(). 3599 * In theory, we should only flush nodatacow inodes, but it's not yet 3600 * possible, so we need to flush the whole root. 3601 * 3602 * - Wait for ordered extents 3603 * When ordered extents are finished, their reserved metadata is finally 3604 * converted to per_trans status, which can be freed by later commit 3605 * transaction. 3606 * 3607 * - Commit transaction 3608 * This would free the meta_per_trans space. 3609 * In theory this shouldn't provide much space, but any more qgroup space 3610 * is needed. 3611 */ 3612 static int try_flush_qgroup(struct btrfs_root *root) 3613 { 3614 struct btrfs_trans_handle *trans; 3615 int ret; 3616 3617 /* Can't hold an open transaction or we run the risk of deadlocking. */ 3618 ASSERT(current->journal_info == NULL); 3619 if (WARN_ON(current->journal_info)) 3620 return 0; 3621 3622 /* 3623 * We don't want to run flush again and again, so if there is a running 3624 * one, we won't try to start a new flush, but exit directly. 3625 */ 3626 if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) { 3627 wait_event(root->qgroup_flush_wait, 3628 !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)); 3629 return 0; 3630 } 3631 3632 ret = btrfs_start_delalloc_snapshot(root, true); 3633 if (ret < 0) 3634 goto out; 3635 btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); 3636 3637 trans = btrfs_join_transaction(root); 3638 if (IS_ERR(trans)) { 3639 ret = PTR_ERR(trans); 3640 goto out; 3641 } 3642 3643 ret = btrfs_commit_transaction(trans); 3644 out: 3645 clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); 3646 wake_up(&root->qgroup_flush_wait); 3647 return ret; 3648 } 3649 3650 static int qgroup_reserve_data(struct btrfs_inode *inode, 3651 struct extent_changeset **reserved_ret, u64 start, 3652 u64 len) 3653 { 3654 struct btrfs_root *root = inode->root; 3655 struct extent_changeset *reserved; 3656 bool new_reserved = false; 3657 u64 orig_reserved; 3658 u64 to_reserve; 3659 int ret; 3660 3661 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || 3662 !is_fstree(root->root_key.objectid) || len == 0) 3663 return 0; 3664 3665 /* @reserved parameter is mandatory for qgroup */ 3666 if (WARN_ON(!reserved_ret)) 3667 return -EINVAL; 3668 if (!*reserved_ret) { 3669 new_reserved = true; 3670 *reserved_ret = extent_changeset_alloc(); 3671 if (!*reserved_ret) 3672 return -ENOMEM; 3673 } 3674 reserved = *reserved_ret; 3675 /* Record already reserved space */ 3676 orig_reserved = reserved->bytes_changed; 3677 ret = set_record_extent_bits(&inode->io_tree, start, 3678 start + len -1, EXTENT_QGROUP_RESERVED, reserved); 3679 3680 /* Newly reserved space */ 3681 to_reserve = reserved->bytes_changed - orig_reserved; 3682 trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len, 3683 to_reserve, QGROUP_RESERVE); 3684 if (ret < 0) 3685 goto out; 3686 ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA); 3687 if (ret < 0) 3688 goto cleanup; 3689 3690 return ret; 3691 3692 cleanup: 3693 qgroup_unreserve_range(inode, reserved, start, len); 3694 out: 3695 if (new_reserved) { 3696 extent_changeset_free(reserved); 3697 *reserved_ret = NULL; 3698 } 3699 return ret; 3700 } 3701 3702 /* 3703 * Reserve qgroup space for range [start, start + len). 3704 * 3705 * This function will either reserve space from related qgroups or do nothing 3706 * if the range is already reserved. 3707 * 3708 * Return 0 for successful reservation 3709 * Return <0 for error (including -EQUOT) 3710 * 3711 * NOTE: This function may sleep for memory allocation, dirty page flushing and 3712 * commit transaction. So caller should not hold any dirty page locked. 3713 */ 3714 int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, 3715 struct extent_changeset **reserved_ret, u64 start, 3716 u64 len) 3717 { 3718 int ret; 3719 3720 ret = qgroup_reserve_data(inode, reserved_ret, start, len); 3721 if (ret <= 0 && ret != -EDQUOT) 3722 return ret; 3723 3724 ret = try_flush_qgroup(inode->root); 3725 if (ret < 0) 3726 return ret; 3727 return qgroup_reserve_data(inode, reserved_ret, start, len); 3728 } 3729 3730 /* Free ranges specified by @reserved, normally in error path */ 3731 static int qgroup_free_reserved_data(struct btrfs_inode *inode, 3732 struct extent_changeset *reserved, u64 start, u64 len) 3733 { 3734 struct btrfs_root *root = inode->root; 3735 struct ulist_node *unode; 3736 struct ulist_iterator uiter; 3737 struct extent_changeset changeset; 3738 int freed = 0; 3739 int ret; 3740 3741 extent_changeset_init(&changeset); 3742 len = round_up(start + len, root->fs_info->sectorsize); 3743 start = round_down(start, root->fs_info->sectorsize); 3744 3745 ULIST_ITER_INIT(&uiter); 3746 while ((unode = ulist_next(&reserved->range_changed, &uiter))) { 3747 u64 range_start = unode->val; 3748 /* unode->aux is the inclusive end */ 3749 u64 range_len = unode->aux - range_start + 1; 3750 u64 free_start; 3751 u64 free_len; 3752 3753 extent_changeset_release(&changeset); 3754 3755 /* Only free range in range [start, start + len) */ 3756 if (range_start >= start + len || 3757 range_start + range_len <= start) 3758 continue; 3759 free_start = max(range_start, start); 3760 free_len = min(start + len, range_start + range_len) - 3761 free_start; 3762 /* 3763 * TODO: To also modify reserved->ranges_reserved to reflect 3764 * the modification. 3765 * 3766 * However as long as we free qgroup reserved according to 3767 * EXTENT_QGROUP_RESERVED, we won't double free. 3768 * So not need to rush. 3769 */ 3770 ret = clear_record_extent_bits(&inode->io_tree, free_start, 3771 free_start + free_len - 1, 3772 EXTENT_QGROUP_RESERVED, &changeset); 3773 if (ret < 0) 3774 goto out; 3775 freed += changeset.bytes_changed; 3776 } 3777 btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed, 3778 BTRFS_QGROUP_RSV_DATA); 3779 ret = freed; 3780 out: 3781 extent_changeset_release(&changeset); 3782 return ret; 3783 } 3784 3785 static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, 3786 struct extent_changeset *reserved, u64 start, u64 len, 3787 int free) 3788 { 3789 struct extent_changeset changeset; 3790 int trace_op = QGROUP_RELEASE; 3791 int ret; 3792 3793 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags)) 3794 return 0; 3795 3796 /* In release case, we shouldn't have @reserved */ 3797 WARN_ON(!free && reserved); 3798 if (free && reserved) 3799 return qgroup_free_reserved_data(inode, reserved, start, len); 3800 extent_changeset_init(&changeset); 3801 ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, 3802 EXTENT_QGROUP_RESERVED, &changeset); 3803 if (ret < 0) 3804 goto out; 3805 3806 if (free) 3807 trace_op = QGROUP_FREE; 3808 trace_btrfs_qgroup_release_data(&inode->vfs_inode, start, len, 3809 changeset.bytes_changed, trace_op); 3810 if (free) 3811 btrfs_qgroup_free_refroot(inode->root->fs_info, 3812 inode->root->root_key.objectid, 3813 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); 3814 ret = changeset.bytes_changed; 3815 out: 3816 extent_changeset_release(&changeset); 3817 return ret; 3818 } 3819 3820 /* 3821 * Free a reserved space range from io_tree and related qgroups 3822 * 3823 * Should be called when a range of pages get invalidated before reaching disk. 3824 * Or for error cleanup case. 3825 * if @reserved is given, only reserved range in [@start, @start + @len) will 3826 * be freed. 3827 * 3828 * For data written to disk, use btrfs_qgroup_release_data(). 3829 * 3830 * NOTE: This function may sleep for memory allocation. 3831 */ 3832 int btrfs_qgroup_free_data(struct btrfs_inode *inode, 3833 struct extent_changeset *reserved, u64 start, u64 len) 3834 { 3835 return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); 3836 } 3837 3838 /* 3839 * Release a reserved space range from io_tree only. 3840 * 3841 * Should be called when a range of pages get written to disk and corresponding 3842 * FILE_EXTENT is inserted into corresponding root. 3843 * 3844 * Since new qgroup accounting framework will only update qgroup numbers at 3845 * commit_transaction() time, its reserved space shouldn't be freed from 3846 * related qgroups. 3847 * 3848 * But we should release the range from io_tree, to allow further write to be 3849 * COWed. 3850 * 3851 * NOTE: This function may sleep for memory allocation. 3852 */ 3853 int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len) 3854 { 3855 return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); 3856 } 3857 3858 static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes, 3859 enum btrfs_qgroup_rsv_type type) 3860 { 3861 if (type != BTRFS_QGROUP_RSV_META_PREALLOC && 3862 type != BTRFS_QGROUP_RSV_META_PERTRANS) 3863 return; 3864 if (num_bytes == 0) 3865 return; 3866 3867 spin_lock(&root->qgroup_meta_rsv_lock); 3868 if (type == BTRFS_QGROUP_RSV_META_PREALLOC) 3869 root->qgroup_meta_rsv_prealloc += num_bytes; 3870 else 3871 root->qgroup_meta_rsv_pertrans += num_bytes; 3872 spin_unlock(&root->qgroup_meta_rsv_lock); 3873 } 3874 3875 static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes, 3876 enum btrfs_qgroup_rsv_type type) 3877 { 3878 if (type != BTRFS_QGROUP_RSV_META_PREALLOC && 3879 type != BTRFS_QGROUP_RSV_META_PERTRANS) 3880 return 0; 3881 if (num_bytes == 0) 3882 return 0; 3883 3884 spin_lock(&root->qgroup_meta_rsv_lock); 3885 if (type == BTRFS_QGROUP_RSV_META_PREALLOC) { 3886 num_bytes = min_t(u64, root->qgroup_meta_rsv_prealloc, 3887 num_bytes); 3888 root->qgroup_meta_rsv_prealloc -= num_bytes; 3889 } else { 3890 num_bytes = min_t(u64, root->qgroup_meta_rsv_pertrans, 3891 num_bytes); 3892 root->qgroup_meta_rsv_pertrans -= num_bytes; 3893 } 3894 spin_unlock(&root->qgroup_meta_rsv_lock); 3895 return num_bytes; 3896 } 3897 3898 int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, 3899 enum btrfs_qgroup_rsv_type type, bool enforce) 3900 { 3901 struct btrfs_fs_info *fs_info = root->fs_info; 3902 int ret; 3903 3904 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 3905 !is_fstree(root->root_key.objectid) || num_bytes == 0) 3906 return 0; 3907 3908 BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); 3909 trace_qgroup_meta_reserve(root, (s64)num_bytes, type); 3910 ret = qgroup_reserve(root, num_bytes, enforce, type); 3911 if (ret < 0) 3912 return ret; 3913 /* 3914 * Record what we have reserved into root. 3915 * 3916 * To avoid quota disabled->enabled underflow. 3917 * In that case, we may try to free space we haven't reserved 3918 * (since quota was disabled), so record what we reserved into root. 3919 * And ensure later release won't underflow this number. 3920 */ 3921 add_root_meta_rsv(root, num_bytes, type); 3922 return ret; 3923 } 3924 3925 int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, 3926 enum btrfs_qgroup_rsv_type type, bool enforce) 3927 { 3928 int ret; 3929 3930 ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); 3931 if (ret <= 0 && ret != -EDQUOT) 3932 return ret; 3933 3934 ret = try_flush_qgroup(root); 3935 if (ret < 0) 3936 return ret; 3937 return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); 3938 } 3939 3940 void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) 3941 { 3942 struct btrfs_fs_info *fs_info = root->fs_info; 3943 3944 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 3945 !is_fstree(root->root_key.objectid)) 3946 return; 3947 3948 /* TODO: Update trace point to handle such free */ 3949 trace_qgroup_meta_free_all_pertrans(root); 3950 /* Special value -1 means to free all reserved space */ 3951 btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1, 3952 BTRFS_QGROUP_RSV_META_PERTRANS); 3953 } 3954 3955 void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, 3956 enum btrfs_qgroup_rsv_type type) 3957 { 3958 struct btrfs_fs_info *fs_info = root->fs_info; 3959 3960 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 3961 !is_fstree(root->root_key.objectid)) 3962 return; 3963 3964 /* 3965 * reservation for META_PREALLOC can happen before quota is enabled, 3966 * which can lead to underflow. 3967 * Here ensure we will only free what we really have reserved. 3968 */ 3969 num_bytes = sub_root_meta_rsv(root, num_bytes, type); 3970 BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); 3971 trace_qgroup_meta_reserve(root, -(s64)num_bytes, type); 3972 btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, 3973 num_bytes, type); 3974 } 3975 3976 static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, 3977 int num_bytes) 3978 { 3979 struct btrfs_qgroup *qgroup; 3980 struct ulist_node *unode; 3981 struct ulist_iterator uiter; 3982 int ret = 0; 3983 3984 if (num_bytes == 0) 3985 return; 3986 if (!fs_info->quota_root) 3987 return; 3988 3989 spin_lock(&fs_info->qgroup_lock); 3990 qgroup = find_qgroup_rb(fs_info, ref_root); 3991 if (!qgroup) 3992 goto out; 3993 ulist_reinit(fs_info->qgroup_ulist); 3994 ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, 3995 qgroup_to_aux(qgroup), GFP_ATOMIC); 3996 if (ret < 0) 3997 goto out; 3998 ULIST_ITER_INIT(&uiter); 3999 while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { 4000 struct btrfs_qgroup *qg; 4001 struct btrfs_qgroup_list *glist; 4002 4003 qg = unode_aux_to_qgroup(unode); 4004 4005 qgroup_rsv_release(fs_info, qg, num_bytes, 4006 BTRFS_QGROUP_RSV_META_PREALLOC); 4007 qgroup_rsv_add(fs_info, qg, num_bytes, 4008 BTRFS_QGROUP_RSV_META_PERTRANS); 4009 list_for_each_entry(glist, &qg->groups, next_group) { 4010 ret = ulist_add(fs_info->qgroup_ulist, 4011 glist->group->qgroupid, 4012 qgroup_to_aux(glist->group), GFP_ATOMIC); 4013 if (ret < 0) 4014 goto out; 4015 } 4016 } 4017 out: 4018 spin_unlock(&fs_info->qgroup_lock); 4019 } 4020 4021 void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) 4022 { 4023 struct btrfs_fs_info *fs_info = root->fs_info; 4024 4025 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 4026 !is_fstree(root->root_key.objectid)) 4027 return; 4028 /* Same as btrfs_qgroup_free_meta_prealloc() */ 4029 num_bytes = sub_root_meta_rsv(root, num_bytes, 4030 BTRFS_QGROUP_RSV_META_PREALLOC); 4031 trace_qgroup_meta_convert(root, num_bytes); 4032 qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes); 4033 } 4034 4035 /* 4036 * Check qgroup reserved space leaking, normally at destroy inode 4037 * time 4038 */ 4039 void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode) 4040 { 4041 struct extent_changeset changeset; 4042 struct ulist_node *unode; 4043 struct ulist_iterator iter; 4044 int ret; 4045 4046 extent_changeset_init(&changeset); 4047 ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1, 4048 EXTENT_QGROUP_RESERVED, &changeset); 4049 4050 WARN_ON(ret < 0); 4051 if (WARN_ON(changeset.bytes_changed)) { 4052 ULIST_ITER_INIT(&iter); 4053 while ((unode = ulist_next(&changeset.range_changed, &iter))) { 4054 btrfs_warn(inode->root->fs_info, 4055 "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu", 4056 btrfs_ino(inode), unode->val, unode->aux); 4057 } 4058 btrfs_qgroup_free_refroot(inode->root->fs_info, 4059 inode->root->root_key.objectid, 4060 changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); 4061 4062 } 4063 extent_changeset_release(&changeset); 4064 } 4065 4066 void btrfs_qgroup_init_swapped_blocks( 4067 struct btrfs_qgroup_swapped_blocks *swapped_blocks) 4068 { 4069 int i; 4070 4071 spin_lock_init(&swapped_blocks->lock); 4072 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 4073 swapped_blocks->blocks[i] = RB_ROOT; 4074 swapped_blocks->swapped = false; 4075 } 4076 4077 /* 4078 * Delete all swapped blocks record of @root. 4079 * Every record here means we skipped a full subtree scan for qgroup. 4080 * 4081 * Gets called when committing one transaction. 4082 */ 4083 void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root) 4084 { 4085 struct btrfs_qgroup_swapped_blocks *swapped_blocks; 4086 int i; 4087 4088 swapped_blocks = &root->swapped_blocks; 4089 4090 spin_lock(&swapped_blocks->lock); 4091 if (!swapped_blocks->swapped) 4092 goto out; 4093 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 4094 struct rb_root *cur_root = &swapped_blocks->blocks[i]; 4095 struct btrfs_qgroup_swapped_block *entry; 4096 struct btrfs_qgroup_swapped_block *next; 4097 4098 rbtree_postorder_for_each_entry_safe(entry, next, cur_root, 4099 node) 4100 kfree(entry); 4101 swapped_blocks->blocks[i] = RB_ROOT; 4102 } 4103 swapped_blocks->swapped = false; 4104 out: 4105 spin_unlock(&swapped_blocks->lock); 4106 } 4107 4108 /* 4109 * Add subtree roots record into @subvol_root. 4110 * 4111 * @subvol_root: tree root of the subvolume tree get swapped 4112 * @bg: block group under balance 4113 * @subvol_parent/slot: pointer to the subtree root in subvolume tree 4114 * @reloc_parent/slot: pointer to the subtree root in reloc tree 4115 * BOTH POINTERS ARE BEFORE TREE SWAP 4116 * @last_snapshot: last snapshot generation of the subvolume tree 4117 */ 4118 int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, 4119 struct btrfs_root *subvol_root, 4120 struct btrfs_block_group *bg, 4121 struct extent_buffer *subvol_parent, int subvol_slot, 4122 struct extent_buffer *reloc_parent, int reloc_slot, 4123 u64 last_snapshot) 4124 { 4125 struct btrfs_fs_info *fs_info = subvol_root->fs_info; 4126 struct btrfs_qgroup_swapped_blocks *blocks = &subvol_root->swapped_blocks; 4127 struct btrfs_qgroup_swapped_block *block; 4128 struct rb_node **cur; 4129 struct rb_node *parent = NULL; 4130 int level = btrfs_header_level(subvol_parent) - 1; 4131 int ret = 0; 4132 4133 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 4134 return 0; 4135 4136 if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) > 4137 btrfs_node_ptr_generation(reloc_parent, reloc_slot)) { 4138 btrfs_err_rl(fs_info, 4139 "%s: bad parameter order, subvol_gen=%llu reloc_gen=%llu", 4140 __func__, 4141 btrfs_node_ptr_generation(subvol_parent, subvol_slot), 4142 btrfs_node_ptr_generation(reloc_parent, reloc_slot)); 4143 return -EUCLEAN; 4144 } 4145 4146 block = kmalloc(sizeof(*block), GFP_NOFS); 4147 if (!block) { 4148 ret = -ENOMEM; 4149 goto out; 4150 } 4151 4152 /* 4153 * @reloc_parent/slot is still before swap, while @block is going to 4154 * record the bytenr after swap, so we do the swap here. 4155 */ 4156 block->subvol_bytenr = btrfs_node_blockptr(reloc_parent, reloc_slot); 4157 block->subvol_generation = btrfs_node_ptr_generation(reloc_parent, 4158 reloc_slot); 4159 block->reloc_bytenr = btrfs_node_blockptr(subvol_parent, subvol_slot); 4160 block->reloc_generation = btrfs_node_ptr_generation(subvol_parent, 4161 subvol_slot); 4162 block->last_snapshot = last_snapshot; 4163 block->level = level; 4164 4165 /* 4166 * If we have bg == NULL, we're called from btrfs_recover_relocation(), 4167 * no one else can modify tree blocks thus we qgroup will not change 4168 * no matter the value of trace_leaf. 4169 */ 4170 if (bg && bg->flags & BTRFS_BLOCK_GROUP_DATA) 4171 block->trace_leaf = true; 4172 else 4173 block->trace_leaf = false; 4174 btrfs_node_key_to_cpu(reloc_parent, &block->first_key, reloc_slot); 4175 4176 /* Insert @block into @blocks */ 4177 spin_lock(&blocks->lock); 4178 cur = &blocks->blocks[level].rb_node; 4179 while (*cur) { 4180 struct btrfs_qgroup_swapped_block *entry; 4181 4182 parent = *cur; 4183 entry = rb_entry(parent, struct btrfs_qgroup_swapped_block, 4184 node); 4185 4186 if (entry->subvol_bytenr < block->subvol_bytenr) { 4187 cur = &(*cur)->rb_left; 4188 } else if (entry->subvol_bytenr > block->subvol_bytenr) { 4189 cur = &(*cur)->rb_right; 4190 } else { 4191 if (entry->subvol_generation != 4192 block->subvol_generation || 4193 entry->reloc_bytenr != block->reloc_bytenr || 4194 entry->reloc_generation != 4195 block->reloc_generation) { 4196 /* 4197 * Duplicated but mismatch entry found. 4198 * Shouldn't happen. 4199 * 4200 * Marking qgroup inconsistent should be enough 4201 * for end users. 4202 */ 4203 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); 4204 ret = -EEXIST; 4205 } 4206 kfree(block); 4207 goto out_unlock; 4208 } 4209 } 4210 rb_link_node(&block->node, parent, cur); 4211 rb_insert_color(&block->node, &blocks->blocks[level]); 4212 blocks->swapped = true; 4213 out_unlock: 4214 spin_unlock(&blocks->lock); 4215 out: 4216 if (ret < 0) 4217 fs_info->qgroup_flags |= 4218 BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 4219 return ret; 4220 } 4221 4222 /* 4223 * Check if the tree block is a subtree root, and if so do the needed 4224 * delayed subtree trace for qgroup. 4225 * 4226 * This is called during btrfs_cow_block(). 4227 */ 4228 int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, 4229 struct btrfs_root *root, 4230 struct extent_buffer *subvol_eb) 4231 { 4232 struct btrfs_fs_info *fs_info = root->fs_info; 4233 struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks; 4234 struct btrfs_qgroup_swapped_block *block; 4235 struct extent_buffer *reloc_eb = NULL; 4236 struct rb_node *node; 4237 bool found = false; 4238 bool swapped = false; 4239 int level = btrfs_header_level(subvol_eb); 4240 int ret = 0; 4241 int i; 4242 4243 if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) 4244 return 0; 4245 if (!is_fstree(root->root_key.objectid) || !root->reloc_root) 4246 return 0; 4247 4248 spin_lock(&blocks->lock); 4249 if (!blocks->swapped) { 4250 spin_unlock(&blocks->lock); 4251 return 0; 4252 } 4253 node = blocks->blocks[level].rb_node; 4254 4255 while (node) { 4256 block = rb_entry(node, struct btrfs_qgroup_swapped_block, node); 4257 if (block->subvol_bytenr < subvol_eb->start) { 4258 node = node->rb_left; 4259 } else if (block->subvol_bytenr > subvol_eb->start) { 4260 node = node->rb_right; 4261 } else { 4262 found = true; 4263 break; 4264 } 4265 } 4266 if (!found) { 4267 spin_unlock(&blocks->lock); 4268 goto out; 4269 } 4270 /* Found one, remove it from @blocks first and update blocks->swapped */ 4271 rb_erase(&block->node, &blocks->blocks[level]); 4272 for (i = 0; i < BTRFS_MAX_LEVEL; i++) { 4273 if (RB_EMPTY_ROOT(&blocks->blocks[i])) { 4274 swapped = true; 4275 break; 4276 } 4277 } 4278 blocks->swapped = swapped; 4279 spin_unlock(&blocks->lock); 4280 4281 /* Read out reloc subtree root */ 4282 reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0, 4283 block->reloc_generation, block->level, 4284 &block->first_key); 4285 if (IS_ERR(reloc_eb)) { 4286 ret = PTR_ERR(reloc_eb); 4287 reloc_eb = NULL; 4288 goto free_out; 4289 } 4290 if (!extent_buffer_uptodate(reloc_eb)) { 4291 ret = -EIO; 4292 goto free_out; 4293 } 4294 4295 ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb, 4296 block->last_snapshot, block->trace_leaf); 4297 free_out: 4298 kfree(block); 4299 free_extent_buffer(reloc_eb); 4300 out: 4301 if (ret < 0) { 4302 btrfs_err_rl(fs_info, 4303 "failed to account subtree at bytenr %llu: %d", 4304 subvol_eb->start, ret); 4305 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; 4306 } 4307 return ret; 4308 } 4309 4310 void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) 4311 { 4312 struct btrfs_qgroup_extent_record *entry; 4313 struct btrfs_qgroup_extent_record *next; 4314 struct rb_root *root; 4315 4316 root = &trans->delayed_refs.dirty_extent_root; 4317 rbtree_postorder_for_each_entry_safe(entry, next, root, node) { 4318 ulist_free(entry->old_roots); 4319 kfree(entry); 4320 } 4321 } 4322