xref: /openbmc/linux/fs/btrfs/block-rsv.c (revision a89aa749ece9c6fee7932163472d2ee0efd6ddd3)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include "misc.h"
4 #include "ctree.h"
5 #include "block-rsv.h"
6 #include "space-info.h"
7 #include "transaction.h"
8 
9 /*
10  * HOW DO BLOCK RESERVES WORK
11  *
12  *   Think of block_rsv's as buckets for logically grouped metadata
13  *   reservations.  Each block_rsv has a ->size and a ->reserved.  ->size is
14  *   how large we want our block rsv to be, ->reserved is how much space is
15  *   currently reserved for this block reserve.
16  *
17  *   ->failfast exists for the truncate case, and is described below.
18  *
19  * NORMAL OPERATION
20  *
21  *   -> Reserve
22  *     Entrance: btrfs_block_rsv_add, btrfs_block_rsv_refill
23  *
24  *     We call into btrfs_reserve_metadata_bytes() with our bytes, which is
25  *     accounted for in space_info->bytes_may_use, and then add the bytes to
26  *     ->reserved, and ->size in the case of btrfs_block_rsv_add.
27  *
28  *     ->size is an over-estimation of how much we may use for a particular
29  *     operation.
30  *
31  *   -> Use
32  *     Entrance: btrfs_use_block_rsv
33  *
34  *     When we do a btrfs_alloc_tree_block() we call into btrfs_use_block_rsv()
35  *     to determine the appropriate block_rsv to use, and then verify that
36  *     ->reserved has enough space for our tree block allocation.  Once
37  *     successful we subtract fs_info->nodesize from ->reserved.
38  *
39  *   -> Finish
40  *     Entrance: btrfs_block_rsv_release
41  *
42  *     We are finished with our operation, subtract our individual reservation
43  *     from ->size, and then subtract ->size from ->reserved and free up the
44  *     excess if there is any.
45  *
46  *     There is some logic here to refill the delayed refs rsv or the global rsv
47  *     as needed, otherwise the excess is subtracted from
48  *     space_info->bytes_may_use.
49  *
50  * TYPES OF BLOCK RESERVES
51  *
52  * BLOCK_RSV_TRANS, BLOCK_RSV_DELOPS, BLOCK_RSV_CHUNK
53  *   These behave normally, as described above, just within the confines of the
54  *   lifetime of their particular operation (transaction for the whole trans
55  *   handle lifetime, for example).
56  *
57  * BLOCK_RSV_GLOBAL
58  *   It is impossible to properly account for all the space that may be required
59  *   to make our extent tree updates.  This block reserve acts as an overflow
60  *   buffer in case our delayed refs reserve does not reserve enough space to
61  *   update the extent tree.
62  *
63  *   We can steal from this in some cases as well, notably on evict() or
64  *   truncate() in order to help users recover from ENOSPC conditions.
65  *
66  * BLOCK_RSV_DELALLOC
67  *   The individual item sizes are determined by the per-inode size
68  *   calculations, which are described with the delalloc code.  This is pretty
69  *   straightforward, it's just the calculation of ->size encodes a lot of
70  *   different items, and thus it gets used when updating inodes, inserting file
71  *   extents, and inserting checksums.
72  *
73  * BLOCK_RSV_DELREFS
74  *   We keep a running tally of how many delayed refs we have on the system.
75  *   We assume each one of these delayed refs are going to use a full
76  *   reservation.  We use the transaction items and pre-reserve space for every
77  *   operation, and use this reservation to refill any gap between ->size and
78  *   ->reserved that may exist.
79  *
80  *   From there it's straightforward, removing a delayed ref means we remove its
81  *   count from ->size and free up reservations as necessary.  Since this is
82  *   the most dynamic block reserve in the system, we will try to refill this
83  *   block reserve first with any excess returned by any other block reserve.
84  *
85  * BLOCK_RSV_EMPTY
86  *   This is the fallback block reserve to make us try to reserve space if we
87  *   don't have a specific bucket for this allocation.  It is mostly used for
88  *   updating the device tree and such, since that is a separate pool we're
89  *   content to just reserve space from the space_info on demand.
90  *
91  * BLOCK_RSV_TEMP
92  *   This is used by things like truncate and iput.  We will temporarily
93  *   allocate a block reserve, set it to some size, and then truncate bytes
94  *   until we have no space left.  With ->failfast set we'll simply return
95  *   ENOSPC from btrfs_use_block_rsv() to signal that we need to unwind and try
96  *   to make a new reservation.  This is because these operations are
97  *   unbounded, so we want to do as much work as we can, and then back off and
98  *   re-reserve.
99  */
100 
101 static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
102 				    struct btrfs_block_rsv *block_rsv,
103 				    struct btrfs_block_rsv *dest, u64 num_bytes,
104 				    u64 *qgroup_to_release_ret)
105 {
106 	struct btrfs_space_info *space_info = block_rsv->space_info;
107 	u64 qgroup_to_release = 0;
108 	u64 ret;
109 
110 	spin_lock(&block_rsv->lock);
111 	if (num_bytes == (u64)-1) {
112 		num_bytes = block_rsv->size;
113 		qgroup_to_release = block_rsv->qgroup_rsv_size;
114 	}
115 	block_rsv->size -= num_bytes;
116 	if (block_rsv->reserved >= block_rsv->size) {
117 		num_bytes = block_rsv->reserved - block_rsv->size;
118 		block_rsv->reserved = block_rsv->size;
119 		block_rsv->full = 1;
120 	} else {
121 		num_bytes = 0;
122 	}
123 	if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
124 		qgroup_to_release = block_rsv->qgroup_rsv_reserved -
125 				    block_rsv->qgroup_rsv_size;
126 		block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
127 	} else {
128 		qgroup_to_release = 0;
129 	}
130 	spin_unlock(&block_rsv->lock);
131 
132 	ret = num_bytes;
133 	if (num_bytes > 0) {
134 		if (dest) {
135 			spin_lock(&dest->lock);
136 			if (!dest->full) {
137 				u64 bytes_to_add;
138 
139 				bytes_to_add = dest->size - dest->reserved;
140 				bytes_to_add = min(num_bytes, bytes_to_add);
141 				dest->reserved += bytes_to_add;
142 				if (dest->reserved >= dest->size)
143 					dest->full = 1;
144 				num_bytes -= bytes_to_add;
145 			}
146 			spin_unlock(&dest->lock);
147 		}
148 		if (num_bytes)
149 			btrfs_space_info_free_bytes_may_use(fs_info,
150 							    space_info,
151 							    num_bytes);
152 	}
153 	if (qgroup_to_release_ret)
154 		*qgroup_to_release_ret = qgroup_to_release;
155 	return ret;
156 }
157 
158 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
159 			    struct btrfs_block_rsv *dst, u64 num_bytes,
160 			    bool update_size)
161 {
162 	int ret;
163 
164 	ret = btrfs_block_rsv_use_bytes(src, num_bytes);
165 	if (ret)
166 		return ret;
167 
168 	btrfs_block_rsv_add_bytes(dst, num_bytes, update_size);
169 	return 0;
170 }
171 
172 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
173 {
174 	memset(rsv, 0, sizeof(*rsv));
175 	spin_lock_init(&rsv->lock);
176 	rsv->type = type;
177 }
178 
179 void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
180 				   struct btrfs_block_rsv *rsv,
181 				   unsigned short type)
182 {
183 	btrfs_init_block_rsv(rsv, type);
184 	rsv->space_info = btrfs_find_space_info(fs_info,
185 					    BTRFS_BLOCK_GROUP_METADATA);
186 }
187 
188 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
189 					      unsigned short type)
190 {
191 	struct btrfs_block_rsv *block_rsv;
192 
193 	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
194 	if (!block_rsv)
195 		return NULL;
196 
197 	btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
198 	return block_rsv;
199 }
200 
201 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
202 			  struct btrfs_block_rsv *rsv)
203 {
204 	if (!rsv)
205 		return;
206 	btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL);
207 	kfree(rsv);
208 }
209 
210 int btrfs_block_rsv_add(struct btrfs_root *root,
211 			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
212 			enum btrfs_reserve_flush_enum flush)
213 {
214 	int ret;
215 
216 	if (num_bytes == 0)
217 		return 0;
218 
219 	ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
220 	if (!ret)
221 		btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true);
222 
223 	return ret;
224 }
225 
226 int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
227 {
228 	u64 num_bytes = 0;
229 	int ret = -ENOSPC;
230 
231 	if (!block_rsv)
232 		return 0;
233 
234 	spin_lock(&block_rsv->lock);
235 	num_bytes = div_factor(block_rsv->size, min_factor);
236 	if (block_rsv->reserved >= num_bytes)
237 		ret = 0;
238 	spin_unlock(&block_rsv->lock);
239 
240 	return ret;
241 }
242 
243 int btrfs_block_rsv_refill(struct btrfs_root *root,
244 			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
245 			   enum btrfs_reserve_flush_enum flush)
246 {
247 	u64 num_bytes = 0;
248 	int ret = -ENOSPC;
249 
250 	if (!block_rsv)
251 		return 0;
252 
253 	spin_lock(&block_rsv->lock);
254 	num_bytes = min_reserved;
255 	if (block_rsv->reserved >= num_bytes)
256 		ret = 0;
257 	else
258 		num_bytes -= block_rsv->reserved;
259 	spin_unlock(&block_rsv->lock);
260 
261 	if (!ret)
262 		return 0;
263 
264 	ret = btrfs_reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
265 	if (!ret) {
266 		btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false);
267 		return 0;
268 	}
269 
270 	return ret;
271 }
272 
273 u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
274 			    struct btrfs_block_rsv *block_rsv, u64 num_bytes,
275 			    u64 *qgroup_to_release)
276 {
277 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
278 	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
279 	struct btrfs_block_rsv *target = NULL;
280 
281 	/*
282 	 * If we are the delayed_rsv then push to the global rsv, otherwise dump
283 	 * into the delayed rsv if it is not full.
284 	 */
285 	if (block_rsv == delayed_rsv)
286 		target = global_rsv;
287 	else if (block_rsv != global_rsv && !delayed_rsv->full)
288 		target = delayed_rsv;
289 
290 	if (target && block_rsv->space_info != target->space_info)
291 		target = NULL;
292 
293 	return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
294 				       qgroup_to_release);
295 }
296 
297 int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes)
298 {
299 	int ret = -ENOSPC;
300 
301 	spin_lock(&block_rsv->lock);
302 	if (block_rsv->reserved >= num_bytes) {
303 		block_rsv->reserved -= num_bytes;
304 		if (block_rsv->reserved < block_rsv->size)
305 			block_rsv->full = 0;
306 		ret = 0;
307 	}
308 	spin_unlock(&block_rsv->lock);
309 	return ret;
310 }
311 
312 void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
313 			       u64 num_bytes, bool update_size)
314 {
315 	spin_lock(&block_rsv->lock);
316 	block_rsv->reserved += num_bytes;
317 	if (update_size)
318 		block_rsv->size += num_bytes;
319 	else if (block_rsv->reserved >= block_rsv->size)
320 		block_rsv->full = 1;
321 	spin_unlock(&block_rsv->lock);
322 }
323 
324 int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
325 			     struct btrfs_block_rsv *dest, u64 num_bytes,
326 			     int min_factor)
327 {
328 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
329 	u64 min_bytes;
330 
331 	if (global_rsv->space_info != dest->space_info)
332 		return -ENOSPC;
333 
334 	spin_lock(&global_rsv->lock);
335 	min_bytes = div_factor(global_rsv->size, min_factor);
336 	if (global_rsv->reserved < min_bytes + num_bytes) {
337 		spin_unlock(&global_rsv->lock);
338 		return -ENOSPC;
339 	}
340 	global_rsv->reserved -= num_bytes;
341 	if (global_rsv->reserved < global_rsv->size)
342 		global_rsv->full = 0;
343 	spin_unlock(&global_rsv->lock);
344 
345 	btrfs_block_rsv_add_bytes(dest, num_bytes, true);
346 	return 0;
347 }
348 
349 void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
350 {
351 	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
352 	struct btrfs_space_info *sinfo = block_rsv->space_info;
353 	u64 num_bytes;
354 	unsigned min_items;
355 
356 	/*
357 	 * The global block rsv is based on the size of the extent tree, the
358 	 * checksum tree and the root tree.  If the fs is empty we want to set
359 	 * it to a minimal amount for safety.
360 	 */
361 	num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
362 		btrfs_root_used(&fs_info->csum_root->root_item) +
363 		btrfs_root_used(&fs_info->tree_root->root_item);
364 
365 	/*
366 	 * We at a minimum are going to modify the csum root, the tree root, and
367 	 * the extent root.
368 	 */
369 	min_items = 3;
370 
371 	/*
372 	 * But we also want to reserve enough space so we can do the fallback
373 	 * global reserve for an unlink, which is an additional 5 items (see the
374 	 * comment in __unlink_start_trans for what we're modifying.)
375 	 *
376 	 * But we also need space for the delayed ref updates from the unlink,
377 	 * so its 10, 5 for the actual operation, and 5 for the delayed ref
378 	 * updates.
379 	 */
380 	min_items += 10;
381 
382 	num_bytes = max_t(u64, num_bytes,
383 			  btrfs_calc_insert_metadata_size(fs_info, min_items));
384 
385 	spin_lock(&sinfo->lock);
386 	spin_lock(&block_rsv->lock);
387 
388 	block_rsv->size = min_t(u64, num_bytes, SZ_512M);
389 
390 	if (block_rsv->reserved < block_rsv->size) {
391 		num_bytes = block_rsv->size - block_rsv->reserved;
392 		btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
393 						      num_bytes);
394 		block_rsv->reserved = block_rsv->size;
395 	} else if (block_rsv->reserved > block_rsv->size) {
396 		num_bytes = block_rsv->reserved - block_rsv->size;
397 		btrfs_space_info_update_bytes_may_use(fs_info, sinfo,
398 						      -num_bytes);
399 		block_rsv->reserved = block_rsv->size;
400 		btrfs_try_granting_tickets(fs_info, sinfo);
401 	}
402 
403 	if (block_rsv->reserved == block_rsv->size)
404 		block_rsv->full = 1;
405 	else
406 		block_rsv->full = 0;
407 
408 	spin_unlock(&block_rsv->lock);
409 	spin_unlock(&sinfo->lock);
410 }
411 
412 void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info)
413 {
414 	struct btrfs_space_info *space_info;
415 
416 	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
417 	fs_info->chunk_block_rsv.space_info = space_info;
418 
419 	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
420 	fs_info->global_block_rsv.space_info = space_info;
421 	fs_info->trans_block_rsv.space_info = space_info;
422 	fs_info->empty_block_rsv.space_info = space_info;
423 	fs_info->delayed_block_rsv.space_info = space_info;
424 	fs_info->delayed_refs_rsv.space_info = space_info;
425 
426 	fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
427 	fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
428 	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
429 	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
430 	if (fs_info->quota_root)
431 		fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
432 	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
433 
434 	btrfs_update_global_block_rsv(fs_info);
435 }
436 
437 void btrfs_release_global_block_rsv(struct btrfs_fs_info *fs_info)
438 {
439 	btrfs_block_rsv_release(fs_info, &fs_info->global_block_rsv, (u64)-1,
440 				NULL);
441 	WARN_ON(fs_info->trans_block_rsv.size > 0);
442 	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
443 	WARN_ON(fs_info->chunk_block_rsv.size > 0);
444 	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
445 	WARN_ON(fs_info->delayed_block_rsv.size > 0);
446 	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
447 	WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
448 	WARN_ON(fs_info->delayed_refs_rsv.size > 0);
449 }
450 
451 static struct btrfs_block_rsv *get_block_rsv(
452 					const struct btrfs_trans_handle *trans,
453 					const struct btrfs_root *root)
454 {
455 	struct btrfs_fs_info *fs_info = root->fs_info;
456 	struct btrfs_block_rsv *block_rsv = NULL;
457 
458 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
459 	    (root == fs_info->csum_root && trans->adding_csums) ||
460 	    (root == fs_info->uuid_root))
461 		block_rsv = trans->block_rsv;
462 
463 	if (!block_rsv)
464 		block_rsv = root->block_rsv;
465 
466 	if (!block_rsv)
467 		block_rsv = &fs_info->empty_block_rsv;
468 
469 	return block_rsv;
470 }
471 
472 struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
473 					    struct btrfs_root *root,
474 					    u32 blocksize)
475 {
476 	struct btrfs_fs_info *fs_info = root->fs_info;
477 	struct btrfs_block_rsv *block_rsv;
478 	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
479 	int ret;
480 	bool global_updated = false;
481 
482 	block_rsv = get_block_rsv(trans, root);
483 
484 	if (unlikely(block_rsv->size == 0))
485 		goto try_reserve;
486 again:
487 	ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
488 	if (!ret)
489 		return block_rsv;
490 
491 	if (block_rsv->failfast)
492 		return ERR_PTR(ret);
493 
494 	if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
495 		global_updated = true;
496 		btrfs_update_global_block_rsv(fs_info);
497 		goto again;
498 	}
499 
500 	/*
501 	 * The global reserve still exists to save us from ourselves, so don't
502 	 * warn_on if we are short on our delayed refs reserve.
503 	 */
504 	if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
505 	    btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
506 		static DEFINE_RATELIMIT_STATE(_rs,
507 				DEFAULT_RATELIMIT_INTERVAL * 10,
508 				/*DEFAULT_RATELIMIT_BURST*/ 1);
509 		if (__ratelimit(&_rs))
510 			WARN(1, KERN_DEBUG
511 				"BTRFS: block rsv returned %d\n", ret);
512 	}
513 try_reserve:
514 	ret = btrfs_reserve_metadata_bytes(root, block_rsv, blocksize,
515 					   BTRFS_RESERVE_NO_FLUSH);
516 	if (!ret)
517 		return block_rsv;
518 	/*
519 	 * If we couldn't reserve metadata bytes try and use some from
520 	 * the global reserve if its space type is the same as the global
521 	 * reservation.
522 	 */
523 	if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
524 	    block_rsv->space_info == global_rsv->space_info) {
525 		ret = btrfs_block_rsv_use_bytes(global_rsv, blocksize);
526 		if (!ret)
527 			return global_rsv;
528 	}
529 	return ERR_PTR(ret);
530 }
531