/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright (C) 2014 Facebook.  All rights reserved.
 */

#ifndef BTRFS_QGROUP_H
#define BTRFS_QGROUP_H

#include <linux/spinlock.h>
#include <linux/rbtree.h>
#include "ulist.h"
#include "delayed-ref.h"

/*
 * Btrfs qgroup overview
 *
 * Btrfs qgroup splits into 3 main parts:
 * 1) Reserve
 *    Reserve metadata/data space for incoming operations.
 *    Affects how qgroup limits work.
 *
 * 2) Trace
 *    Tell btrfs qgroup to trace dirty extents.
 *
 *    Dirty extents include:
 *    - Newly allocated extents
 *    - Extents going to be deleted (in this trans)
 *    - Extents whose owner is going to be modified
 *
 *    This is the main part that affects whether qgroup numbers will stay
 *    consistent.
 *    Btrfs qgroup can trace clean extents without causing any problem,
 *    but doing so consumes extra CPU time, so it should be avoided if
 *    possible.
 *
 * 3) Account
 *    Btrfs qgroup updates its numbers based on the dirty extents traced
 *    in the previous step.
 *
 *    Normally done at qgroup rescan and transaction commit time.
 */

/*
 * Special performance optimization for balance.
 *
 * For balance, we need to swap the subtrees of the subvolume and reloc
 * trees.
 * In theory, we need to trace all subtree blocks of both the subvolume
 * and reloc trees, since their owner has changed during such a swap.
 *
 * However, since balance has ensured that both subtrees contain the same
 * contents and have the same tree structures, such a swap won't change
 * the qgroup numbers.
 *
 * But there is a race window between the subtree swap and transaction
 * commit; during that window, if we increase/decrease the tree level or
 * merge/split tree blocks, we still need to trace the original subtrees.
 *
 * So for balance, we use delayed subtree tracing, whose workflow is:
 *
 * 1) Record the subtree root block that gets swapped.
 *
 *    During subtree swap:
 *    O = Old tree blocks
 *    N = New tree blocks
 *          reloc tree                     subvolume tree X
 *             Root                               Root
 *            /    \                             /    \
 *          NA     OB                          OA      OB
 *        /  |     |  \                      /  |      |  \
 *      NC  ND     OE  OF                   OC  OD     OE  OF
 *
 *    In this case, NA and OA are going to be swapped, so record (NA, OA)
 *    into subvolume tree X.
 *
 * 2) After subtree swap.
 *          reloc tree                     subvolume tree X
 *             Root                               Root
 *            /    \                             /    \
 *          OA     OB                          NA      OB
 *        /  |     |  \                      /  |      |  \
 *      OC  OD     OE  OF                   NC  ND     OE  OF
 *
 * 3a) COW happens for OB
 *     If we are going to COW tree block OB, we check OB's bytenr against
 *     tree X's swapped_blocks structure.
 *     If it doesn't match any record, nothing happens.
 *
 * 3b) COW happens for NA
 *     Check NA's bytenr against tree X's swapped_blocks and get a hit.
 *     Then we do a subtree scan on both subtrees OA and NA.
 *     This results in 6 tree blocks to be scanned (OA, OC, OD, NA, NC,
 *     ND).
 *
 *     After that, no matter what we do to subvolume tree X, the qgroup
 *     numbers will still be correct.
 *     Then NA's record gets removed from X's swapped_blocks.
 *
 * 4) Transaction commit
 *    Any record left in X's swapped_blocks gets removed, since there was
 *    no modification to the swapped subtrees and thus no need to trigger
 *    a heavy qgroup subtree rescan for them.
 */
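
/*
 * A minimal sketch of the delayed subtree tracing flow described above,
 * using only helpers declared later in this header (error handling
 * omitted, all arguments assumed to be prepared by relocation):
 *
 *	1) At subtree swap time, record the swapped root blocks:
 *		btrfs_qgroup_add_swapped_blocks(trans, subvol_root, bg,
 *				subvol_parent, subvol_slot,
 *				reloc_parent, reloc_slot, last_snapshot);
 *
 *	3b) When a later COW hits a recorded block, both subtrees get
 *	    traced (e.g. via btrfs_qgroup_trace_subtree()).
 *
 *	4) At transaction commit, drop any remaining records:
 *		btrfs_qgroup_clean_swapped_blocks(root);
 */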

/*
 * Record a dirty extent, and inform qgroup to update quota on it.
 * TODO: Use kmem cache to alloc it.
 */
struct btrfs_qgroup_extent_record {
	struct rb_node node;
	u64 bytenr;
	u64 num_bytes;
	struct ulist *old_roots;
};

struct btrfs_qgroup_swapped_block {
	struct rb_node node;

	int level;
	bool trace_leaf;

	/* bytenr/generation of the tree block in subvolume tree after swap */
	u64 subvol_bytenr;
	u64 subvol_generation;

	/* bytenr/generation of the tree block in reloc tree after swap */
	u64 reloc_bytenr;
	u64 reloc_generation;

	u64 last_snapshot;
	struct btrfs_key first_key;
};

/*
 * Qgroup reservation types:
 *
 * DATA:
 *	Space reserved for data.
 *
 * META_PERTRANS:
 *	Space reserved for metadata (per-transaction).
 *	Since qgroup data is only updated at transaction commit time,
 *	reserved space for metadata must be kept until the transaction
 *	commits.
 *	Any metadata reserved for use in btrfs_start_transaction() should
 *	be of this type.
 *
 * META_PREALLOC:
 *	There are cases where metadata space is reserved before starting
 *	a transaction, and then btrfs_join_transaction() is called to get
 *	a trans handle.
 *	Any metadata reserved for such usage should be of this type.
 *	After btrfs_join_transaction(), part (or all) of such a
 *	reservation should be converted into META_PERTRANS (see the usage
 *	sketch below).
 */
enum btrfs_qgroup_rsv_type {
	BTRFS_QGROUP_RSV_DATA,
	BTRFS_QGROUP_RSV_META_PERTRANS,
	BTRFS_QGROUP_RSV_META_PREALLOC,
	BTRFS_QGROUP_RSV_LAST,
};

/*
 * Represents how many bytes we have reserved for this qgroup.
 *
 * Each type has its own reservation behavior.
 * E.g. data reservation follows its io_tree flag modifications, while
 * metadata reservation is *currently* just reserve-and-clear during a
 * transaction.
 *
 * TODO: Add a new type for reservations which can survive a transaction
 * commit.  The current metadata reservation behavior is not suitable for
 * such a case.
 */
struct btrfs_qgroup_rsv {
	u64 values[BTRFS_QGROUP_RSV_LAST];
};
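
/*
 * Usage sketch of the META_PREALLOC -> META_PERTRANS conversion
 * described above (hypothetical caller, error handling trimmed):
 *
 *	ret = btrfs_qgroup_reserve_meta_prealloc(root, nbytes, true);
 *	if (ret < 0)
 *		return ret;
 *	trans = btrfs_join_transaction(root);
 *	...
 *	btrfs_qgroup_convert_reserved_meta(root, nbytes);
 */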

/*
 * One struct for each qgroup, organized in fs_info->qgroup_tree.
 */
struct btrfs_qgroup {
	u64 qgroupid;

	/*
	 * state
	 */
	u64 rfer;	/* referenced */
	u64 rfer_cmpr;	/* referenced compressed */
	u64 excl;	/* exclusive */
	u64 excl_cmpr;	/* exclusive compressed */

	/*
	 * limits
	 */
	u64 lim_flags;	/* which limits are set */
	u64 max_rfer;
	u64 max_excl;
	u64 rsv_rfer;
	u64 rsv_excl;

	/*
	 * reservation tracking
	 */
	struct btrfs_qgroup_rsv rsv;

	/*
	 * lists
	 */
	struct list_head groups;  /* groups this group is member of */
	struct list_head members; /* groups that are members of this group */
	struct list_head dirty;   /* dirty groups */
	struct rb_node node;	  /* tree of qgroups */

	/*
	 * Temp variables for accounting operations.
	 * Refer to qgroup_shared_accounting() for details.
	 */
	u64 old_refcnt;
	u64 new_refcnt;
};

/*
 * For qgroup event trace points only
 */
#define QGROUP_RESERVE		(1<<0)
#define QGROUP_RELEASE		(1<<1)
#define QGROUP_FREE		(1<<2)

int btrfs_quota_enable(struct btrfs_fs_info *fs_info);
int btrfs_quota_disable(struct btrfs_fs_info *fs_info);
int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
				     bool interruptible);
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
			      u64 dst);
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
			      u64 dst);
int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
		       struct btrfs_qgroup_limit *limit);
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
struct btrfs_delayed_extent_op;

/*
 * Inform qgroup to trace one dirty extent, whose info is recorded in
 * @record, so qgroup can account for it at transaction commit time.
 *
 * Lockless version; the caller must hold the delayed ref lock and have
 * allocated the memory, then call btrfs_qgroup_trace_extent_post() after
 * exiting the lock context.
 *
 * Return 0 for a successful insert.
 * Return >0 if the record already exists; the caller can free @record
 * safely.
 * No error can occur.
 */
int btrfs_qgroup_trace_extent_nolock(
		struct btrfs_fs_info *fs_info,
		struct btrfs_delayed_ref_root *delayed_refs,
		struct btrfs_qgroup_extent_record *record);

/*
 * Post handler after qgroup_trace_extent_nolock().
 *
 * NOTE: Currently qgroup does the expensive backref walk at transaction
 * commit time with TRANS_STATE_COMMIT_DOING, which blocks incoming new
 * transactions.
 * This is designed to allow btrfs_find_all_roots() to get a correct
 * new_roots result.
 *
 * However, for old_roots there is no need to do the backref walk at that
 * time, since we search commit roots to walk backrefs and the result
 * will always be correct.
 *
 * Due to the lockless nature of btrfs_qgroup_trace_extent_nolock(), we
 * can't do the backref walk there, so we must call
 * btrfs_qgroup_trace_extent_post() after exiting the spinlock context.
 *
 * TODO: If we can fix and prove that btrfs_find_all_roots() can get a
 * correct result using the current root, then we can move all the
 * expensive backref walks out of transaction commit, but not now, as
 * qgroup accounting would be wrong again.
 */
int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
				   struct btrfs_qgroup_extent_record *qrecord);

/*
 * Inform qgroup to trace one dirty extent, specified by @bytenr and
 * @num_bytes, so qgroup can account for it at transaction commit time.
 *
 * Better encapsulated version, with memory allocation and backref walk
 * for commit roots, so this can sleep.
 *
 * Return 0 if the operation is done.
 * Return <0 for error, like memory allocation failure or invalid
 * parameter (NULL trans).
 */
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
			      u64 num_bytes, gfp_t gfp_flag);
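
/*
 * A usage sketch for the tracing helper above (hypothetical caller,
 * @trans held, values are placeholders):
 *
 *	ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes, GFP_NOFS);
 *	if (ret < 0)
 *		return ret;
 */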

/*
 * Inform qgroup to trace all data extents referenced in the leaf @eb.
 *
 * Return 0 for success.
 * Return <0 for error (ENOMEM).
 */
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
				  struct extent_buffer *eb);
/*
 * Inform qgroup to trace a whole subtree, including all its child tree
 * blocks and data.
 * The root tree block is specified by @root_eb.
 *
 * Normally used by relocation (tree block swap) and subvolume deletion.
 *
 * Return 0 for success.
 * Return <0 for error (ENOMEM or tree search error).
 */
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
			       struct extent_buffer *root_eb,
			       u64 root_gen, int root_level);

int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
				struct btrfs_block_group_cache *bg_cache,
				struct extent_buffer *src_parent, int src_slot,
				struct extent_buffer *dst_parent, int dst_slot,
				u64 last_snapshot);
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
				u64 num_bytes, struct ulist *old_roots,
				struct ulist *new_roots);
int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans);
int btrfs_run_qgroups(struct btrfs_trans_handle *trans);
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
			 u64 objectid, struct btrfs_qgroup_inherit *inherit);
void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
			       u64 ref_root, u64 num_bytes,
			       enum btrfs_qgroup_rsv_type type);
static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
						 u64 ref_root, u64 num_bytes)
{
	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
		return;
	trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
	btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes,
				  BTRFS_QGROUP_RSV_DATA);
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
			       u64 rfer, u64 excl);
#endif

/* New io_tree based accurate qgroup reserve API */
int btrfs_qgroup_reserve_data(struct inode *inode,
			struct extent_changeset **reserved, u64 start, u64 len);
int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
int btrfs_qgroup_free_data(struct inode *inode,
			struct extent_changeset *reserved, u64 start, u64 len);
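
/*
 * Sketch of the data reservation flow above (hypothetical write path;
 * assumes the common pattern of releasing on success and freeing on
 * error, with error handling trimmed):
 *
 *	struct extent_changeset *reserved = NULL;
 *
 *	ret = btrfs_qgroup_reserve_data(inode, &reserved, start, len);
 *	...
 *	if (write succeeded)
 *		btrfs_qgroup_release_data(inode, start, len);
 *	else
 *		btrfs_qgroup_free_data(inode, reserved, start, len);
 */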

int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
				enum btrfs_qgroup_rsv_type type, bool enforce);
/* Reserve metadata space for pertrans and prealloc types */
static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root,
						     int num_bytes, bool enforce)
{
	return __btrfs_qgroup_reserve_meta(root, num_bytes,
			BTRFS_QGROUP_RSV_META_PERTRANS, enforce);
}
static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root,
						     int num_bytes, bool enforce)
{
	return __btrfs_qgroup_reserve_meta(root, num_bytes,
			BTRFS_QGROUP_RSV_META_PREALLOC, enforce);
}

void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
			      enum btrfs_qgroup_rsv_type type);

/* Free per-transaction meta reservation for error handling */
static inline void btrfs_qgroup_free_meta_pertrans(struct btrfs_root *root,
						   int num_bytes)
{
	__btrfs_qgroup_free_meta(root, num_bytes,
			BTRFS_QGROUP_RSV_META_PERTRANS);
}

/* Pre-allocated meta reservation can be freed as needed */
static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root,
						   int num_bytes)
{
	__btrfs_qgroup_free_meta(root, num_bytes,
			BTRFS_QGROUP_RSV_META_PREALLOC);
}

/*
 * Per-transaction meta reservations should all be freed at transaction
 * commit time.
 */
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root);

/*
 * Convert @num_bytes of META_PREALLOC reservation to META_PERTRANS.
 *
 * This is called when a preallocated meta reservation needs to be used,
 * normally after a btrfs_join_transaction() call.
 */
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);

void btrfs_qgroup_check_reserved_leak(struct inode *inode);

/* btrfs_qgroup_swapped_blocks related functions */
void btrfs_qgroup_init_swapped_blocks(
	struct btrfs_qgroup_swapped_blocks *swapped_blocks);

void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
		struct btrfs_root *subvol_root,
		struct btrfs_block_group_cache *bg,
		struct extent_buffer *subvol_parent, int subvol_slot,
		struct extent_buffer *reloc_parent, int reloc_slot,
		u64 last_snapshot);

#endif