// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * Copyright (c) 2010 David Chinner.
 * Copyright (c) 2011 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_log.h"
#include "xfs_ag.h"

static void
xfs_extent_busy_insert_list(
	struct xfs_perag	*pag,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	unsigned int		flags,
	struct list_head	*busy_list)
{
	struct xfs_extent_busy	*new;
	struct xfs_extent_busy	*busyp;
	struct rb_node		**rbp;
	struct rb_node		*parent = NULL;

	new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0);
	new->agno = pag->pag_agno;
	new->bno = bno;
	new->length = len;
	INIT_LIST_HEAD(&new->list);
	new->flags = flags;

	/* trace before insert to be able to see failed inserts */
	trace_xfs_extent_busy(pag->pag_mount, pag->pag_agno, bno, len);

	spin_lock(&pag->pagb_lock);
	rbp = &pag->pagb_tree.rb_node;
	while (*rbp) {
		parent = *rbp;
		busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);

		if (new->bno < busyp->bno) {
			rbp = &(*rbp)->rb_left;
			ASSERT(new->bno + new->length <= busyp->bno);
		} else if (new->bno > busyp->bno) {
			rbp = &(*rbp)->rb_right;
			ASSERT(bno >= busyp->bno + busyp->length);
		} else {
			ASSERT(0);
		}
	}

	rb_link_node(&new->rb_node, parent, rbp);
	rb_insert_color(&new->rb_node, &pag->pagb_tree);

	list_add(&new->list, busy_list);
	spin_unlock(&pag->pagb_lock);
}

void
xfs_extent_busy_insert(
	struct xfs_trans	*tp,
	struct xfs_perag	*pag,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	unsigned int		flags)
{
	xfs_extent_busy_insert_list(pag, bno, len, flags, &tp->t_busy);
}

void
xfs_extent_busy_insert_discard(
	struct xfs_perag	*pag,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	struct list_head	*busy_list)
{
	xfs_extent_busy_insert_list(pag, bno, len, XFS_EXTENT_BUSY_DISCARDED,
			busy_list);
}
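
/*
 * Illustrative call (a hedged sketch; the caller shown here is hypothetical
 * and not part of this file): a transaction that frees an extent records the
 * range as busy so it is not handed out again before the free is committed:
 *
 *	xfs_extent_busy_insert(tp, pag, bno, len, 0);
 *
 * The new entry is linked onto tp->t_busy and remains in the per-AG rbtree
 * until xfs_extent_busy_clear() is called on that list.
 */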

/*
 * Search for a busy extent within the range of the extent we are about to
 * allocate.  xfs_extent_busy_search() takes the per-AG busy extent tree lock
 * internally.  It returns 0 for no overlapping busy extent, -1 for an
 * overlapping but not exact busy extent, and 1 for an exact match.  A
 * non-zero return therefore indicates an overlap that will require a
 * synchronous transaction, while still letting the caller distinguish
 * between a partial and an exact match.
 */
int
xfs_extent_busy_search(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	xfs_agblock_t		bno,
	xfs_extlen_t		len)
{
	struct rb_node		*rbp;
	struct xfs_extent_busy	*busyp;
	int			match = 0;

	/* find closest start bno overlap */
	spin_lock(&pag->pagb_lock);
	rbp = pag->pagb_tree.rb_node;
	while (rbp) {
		busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
		if (bno < busyp->bno) {
			/* may overlap, but exact start block is lower */
			if (bno + len > busyp->bno)
				match = -1;
			rbp = rbp->rb_left;
		} else if (bno > busyp->bno) {
			/* may overlap, but exact start block is higher */
			if (bno < busyp->bno + busyp->length)
				match = -1;
			rbp = rbp->rb_right;
		} else {
			/* bno matches busyp, length determines exact match */
			match = (busyp->length == len) ? 1 : -1;
			break;
		}
	}
	spin_unlock(&pag->pagb_lock);
	return match;
}
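
/*
 * Illustrative interpretation of the return value (hypothetical caller, not
 * taken from this file):
 *
 *	switch (xfs_extent_busy_search(mp, pag, bno, len)) {
 *	case 0:		// [bno, bno + len) overlaps no busy extent
 *		break;
 *	case 1:		// exact match with a busy extent
 *	case -1:	// partial overlap with a busy extent
 *		break;	// non-zero: the range is still busy
 *	}
 */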

/*
 * The found free extent [fbno, fend] overlaps part or all of the given busy
 * extent.  If the overlap covers the beginning, the end, or all of the busy
 * extent, the overlapping portion can be made unbusy and used for the
 * allocation.  We can't split a busy extent because we can't modify a
 * transaction/CIL context busy list, but we can update an entry's block
 * number or length.
 *
 * Returns true if the extent can safely be reused, or false if the search
 * needs to be restarted.
 */
STATIC bool
xfs_extent_busy_update_extent(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	struct xfs_extent_busy	*busyp,
	xfs_agblock_t		fbno,
	xfs_extlen_t		flen,
	bool			userdata) __releases(&pag->pagb_lock)
					  __acquires(&pag->pagb_lock)
{
	xfs_agblock_t		fend = fbno + flen;
	xfs_agblock_t		bbno = busyp->bno;
	xfs_agblock_t		bend = bbno + busyp->length;

	/*
	 * This extent is currently being discarded.  Give the thread
	 * performing the discard a chance to mark the extent unbusy
	 * and retry.
	 */
	if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
		spin_unlock(&pag->pagb_lock);
		delay(1);
		spin_lock(&pag->pagb_lock);
		return false;
	}

	/*
	 * If there is a busy extent overlapping a user allocation, we have
	 * no choice but to force the log and retry the search.
	 *
	 * Fortunately this does not happen during normal operation, but
	 * only if the filesystem is very low on space and has to dip into
	 * the AGFL for normal allocations.
	 */
	if (userdata)
		goto out_force_log;

	if (bbno < fbno && bend > fend) {
		/*
		 * Case 1:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +---------+
		 *        fbno   fend
		 */

		/*
		 * We would have to split the busy extent to be able to track
		 * it correctly, which we cannot do because we would have to
		 * modify the list of busy extents attached to the transaction
		 * or CIL context, which is immutable.
		 *
		 * Force out the log to clear the busy extent and retry the
		 * search.
		 */
		goto out_force_log;
	} else if (bbno >= fbno && bend <= fend) {
		/*
		 * Case 2:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *    +-----------------+
		 *    fbno           fend
		 *
		 * Case 3:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *    +--------------------------+
		 *    fbno                    fend
		 *
		 * Case 4:
		 *             bbno           bend
		 *             +BBBBBBBBBBBBBBBBB+
		 *    +--------------------------+
		 *    fbno                    fend
		 *
		 * Case 5:
		 *             bbno           bend
		 *             +BBBBBBBBBBBBBBBBB+
		 *    +-----------------------------------+
		 *    fbno                             fend
		 *
		 */

		/*
		 * The busy extent is fully covered by the extent we are
		 * allocating, and can simply be removed from the rbtree.
		 * However we cannot remove it from the immutable list
		 * tracking busy extents in the transaction or CIL context,
		 * so set the length to zero to mark it invalid.
		 *
		 * We also need to restart the busy extent search from the
		 * tree root, because erasing the node can rearrange the
		 * tree topology.
		 */
		rb_erase(&busyp->rb_node, &pag->pagb_tree);
		busyp->length = 0;
		return false;
	} else if (fend < bend) {
		/*
		 * Case 6:
		 *              bbno           bend
		 *             +BBBBBBBBBBBBBBBBB+
		 *             +---------+
		 *             fbno   fend
		 *
		 * Case 7:
		 *             bbno           bend
		 *             +BBBBBBBBBBBBBBBBB+
		 *    +------------------+
		 *    fbno            fend
		 *
		 */
		busyp->bno = fend;
		busyp->length = bend - fend;
	} else if (bbno < fbno) {
		/*
		 * Case 8:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +-------------+
		 *        fbno       fend
		 *
		 * Case 9:
		 *    bbno           bend
		 *    +BBBBBBBBBBBBBBBBB+
		 *        +----------------------+
		 *        fbno                fend
		 */
		busyp->length = fbno - busyp->bno;
	} else {
		ASSERT(0);
	}

	trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
	return true;

out_force_log:
	spin_unlock(&pag->pagb_lock);
	xfs_log_force(mp, XFS_LOG_SYNC);
	trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
	spin_lock(&pag->pagb_lock);
	return false;
}
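
/*
 * Worked example of the case handling above (the numbers are hypothetical,
 * chosen only for illustration): with a busy extent [bbno, bend) = [10, 30)
 * and a free extent [fbno, fend) = [20, 40), cases 8/9 apply and the busy
 * record is trimmed to [10, 20) by setting busyp->length = fbno - bbno = 10.
 * If instead the free extent were [5, 40), cases 2-5 apply: the busy record
 * is erased from the rbtree, its length is set to 0, and the caller's search
 * restarts from the tree root.
 */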


/*
 * For a given extent [fbno, flen], make sure we can reuse it safely.
 */
void
xfs_extent_busy_reuse(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	xfs_agblock_t		fbno,
	xfs_extlen_t		flen,
	bool			userdata)
{
	struct rb_node		*rbp;

	ASSERT(flen > 0);
	spin_lock(&pag->pagb_lock);
restart:
	rbp = pag->pagb_tree.rb_node;
	while (rbp) {
		struct xfs_extent_busy *busyp =
			rb_entry(rbp, struct xfs_extent_busy, rb_node);
		xfs_agblock_t	bbno = busyp->bno;
		xfs_agblock_t	bend = bbno + busyp->length;

		if (fbno + flen <= bbno) {
			rbp = rbp->rb_left;
			continue;
		} else if (fbno >= bend) {
			rbp = rbp->rb_right;
			continue;
		}

		if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
						  userdata))
			goto restart;
	}
	spin_unlock(&pag->pagb_lock);
}

/*
 * For a given extent [fbno, flen], search the busy extent list to find a
 * subset of the extent that is not busy.  If the trimmed *len is smaller
 * than args->minlen, no suitable extent could be found and the higher level
 * code needs to force out the log and retry the allocation.
 *
 * If the extent was trimmed because of busy extents, return true and hand
 * back the AG's current busy generation in *busy_gen.  This value can be
 * used to wait for at least one of the currently busy extents to be
 * cleared.  Note that the busy list is not guaranteed to be empty when a
 * waiter is woken after the generation changes, so the state of a specific
 * extent must always be confirmed with another call to
 * xfs_extent_busy_trim() before it can be used.
 */
bool
xfs_extent_busy_trim(
	struct xfs_alloc_arg	*args,
	xfs_agblock_t		*bno,
	xfs_extlen_t		*len,
	unsigned		*busy_gen)
{
	xfs_agblock_t		fbno;
	xfs_extlen_t		flen;
	struct rb_node		*rbp;
	bool			ret = false;

	ASSERT(*len > 0);

	spin_lock(&args->pag->pagb_lock);
	fbno = *bno;
	flen = *len;
	rbp = args->pag->pagb_tree.rb_node;
	while (rbp && flen >= args->minlen) {
		struct xfs_extent_busy *busyp =
			rb_entry(rbp, struct xfs_extent_busy, rb_node);
		xfs_agblock_t	fend = fbno + flen;
		xfs_agblock_t	bbno = busyp->bno;
		xfs_agblock_t	bend = bbno + busyp->length;

		if (fend <= bbno) {
			rbp = rbp->rb_left;
			continue;
		} else if (fbno >= bend) {
			rbp = rbp->rb_right;
			continue;
		}

		if (bbno <= fbno) {
			/* start overlap */

			/*
			 * Case 1:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +---------+
			 *        fbno   fend
			 *
			 * Case 2:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +-------------+
			 *    fbno       fend
			 *
			 * Case 3:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +-------------+
			 *        fbno       fend
			 *
			 * Case 4:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +-----------------+
			 *    fbno           fend
			 *
			 * No unbusy region in extent, return failure.
			 */
			if (fend <= bend)
				goto fail;

			/*
			 * Case 5:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *        +----------------------+
			 *        fbno                fend
			 *
			 * Case 6:
			 *    bbno           bend
			 *    +BBBBBBBBBBBBBBBBB+
			 *    +--------------------------+
			 *    fbno                    fend
			 *
			 * Needs to be trimmed to:
			 *                       +-------+
			 *                       fbno fend
			 */
			fbno = bend;
		} else if (bend >= fend) {
			/* end overlap */

			/*
			 * Case 7:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +------------------+
			 *    fbno            fend
			 *
			 * Case 8:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +--------------------------+
			 *    fbno                    fend
			 *
			 * Needs to be trimmed to:
			 *    +-------+
			 *    fbno fend
			 */
			fend = bbno;
		} else {
			/* middle overlap */

			/*
			 * Case 9:
			 *             bbno           bend
			 *             +BBBBBBBBBBBBBBBBB+
			 *    +-----------------------------------+
			 *    fbno                             fend
			 *
			 * Can be trimmed to:
			 *    +-------+        OR         +-------+
			 *    fbno fend                   fbno fend
			 *
			 * Backward allocation leads to significant
			 * fragmentation of directories, which degrades
			 * directory performance, so we always want to
			 * choose the option that produces forward allocation
			 * patterns.
			 * Preferring the lower bno extent will make the next
			 * request use "fend" as the start of the next
			 * allocation; if the segment is no longer busy at
			 * that point, we'll get a contiguous allocation, but
			 * even if it is still busy, we will get a forward
			 * allocation.
			 * We try to avoid choosing the segment at "bend",
			 * because that can lead to the next allocation
			 * taking the segment at "fbno", which would be a
			 * backward allocation.  We only use the segment at
			 * "fbno" if it is much larger than the current
			 * requested size, because in that case there's a
			 * good chance subsequent allocations will be
			 * contiguous.
			 */
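			/*
			 * Hypothetical numbers, purely for illustration: with
			 * args->minlen = 2, args->maxlen = 8, a free extent
			 * [0, 30) and a busy extent [10, 20), the left
			 * candidate [0, 10) is 10 blocks and satisfies
			 * maxlen, so the first test below picks it and trims
			 * fend down to bbno = 10.
			 */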
			if (bbno - fbno >= args->maxlen) {
				/* left candidate fits perfectly */
				fend = bbno;
			} else if (fend - bend >= args->maxlen * 4) {
				/* right candidate has enough free space */
				fbno = bend;
			} else if (bbno - fbno >= args->minlen) {
				/* left candidate fits minimum requirement */
				fend = bbno;
			} else {
				goto fail;
			}
		}

		flen = fend - fbno;
	}
out:

	if (fbno != *bno || flen != *len) {
		trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len,
					  fbno, flen);
		*bno = fbno;
		*len = flen;
		*busy_gen = args->pag->pagb_gen;
		ret = true;
	}
	spin_unlock(&args->pag->pagb_lock);
	return ret;
fail:
	/*
	 * Return a zero extent length as the failure indication.  All callers
	 * re-check whether the trimmed extent satisfies the minlen
	 * requirement.
	 */
	flen = 0;
	goto out;
}
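
/*
 * Hedged sketch of the expected calling pattern (the caller below is
 * hypothetical; the real allocation code is more involved): trim the
 * candidate extent, and if nothing large enough remains, use the returned
 * generation to wait for busy extents to clear before retrying:
 *
 *	unsigned	busy_gen;
 *
 *	if (xfs_extent_busy_trim(args, &bno, &len, &busy_gen) &&
 *	    len < args->minlen) {
 *		error = xfs_extent_busy_flush(tp, args->pag, busy_gen,
 *					      alloc_flags);
 *		// retry the search, or commit held busy extents on -EAGAIN
 *	}
 */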

STATIC void
xfs_extent_busy_clear_one(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	struct xfs_extent_busy	*busyp)
{
	if (busyp->length) {
		trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
						busyp->length);
		rb_erase(&busyp->rb_node, &pag->pagb_tree);
	}

	list_del_init(&busyp->list);
	kmem_free(busyp);
}

static void
xfs_extent_busy_put_pag(
	struct xfs_perag	*pag,
	bool			wakeup)
		__releases(pag->pagb_lock)
{
	if (wakeup) {
		pag->pagb_gen++;
		wake_up_all(&pag->pagb_wait);
	}

	spin_unlock(&pag->pagb_lock);
	xfs_perag_put(pag);
}

/*
 * Remove all extents on the passed-in list from the busy extents tree.
 * If do_discard is set, skip extents that need to be discarded and mark
 * these as undergoing a discard operation instead.
 */
void
xfs_extent_busy_clear(
	struct xfs_mount	*mp,
	struct list_head	*list,
	bool			do_discard)
{
	struct xfs_extent_busy	*busyp, *n;
	struct xfs_perag	*pag = NULL;
	xfs_agnumber_t		agno = NULLAGNUMBER;
	bool			wakeup = false;

	list_for_each_entry_safe(busyp, n, list, list) {
		if (busyp->agno != agno) {
			if (pag)
				xfs_extent_busy_put_pag(pag, wakeup);
			agno = busyp->agno;
			pag = xfs_perag_get(mp, agno);
			spin_lock(&pag->pagb_lock);
			wakeup = false;
		}

		if (do_discard && busyp->length &&
		    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
			busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
		} else {
			xfs_extent_busy_clear_one(mp, pag, busyp);
			wakeup = true;
		}
	}

	if (pag)
		xfs_extent_busy_put_pag(pag, wakeup);
}
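
/*
 * Note on the discard flow implied above (a hedged description inferred from
 * this file, not quoted from it): a first call with do_discard set leaves
 * discardable extents in the rbtree but flags them XFS_EXTENT_BUSY_DISCARDED;
 * xfs_extent_busy_update_extent() backs off from such extents while the
 * discard is in flight.  A later call on the same list with do_discard clear
 * removes them for good and wakes any waiters.
 */
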
/*
 * Flush out all busy extents for this AG.
 *
 * If the current transaction is holding busy extents, the caller may not want
 * to wait for committed busy extents to resolve. If we are being told just to
 * try a flush or progress has been made since we last skipped a busy extent,
 * return immediately to allow the caller to try again.
 *
 * If we are freeing extents, we might actually be holding the only free extents
 * in the transaction busy list and the log force won't resolve that situation.
 * In this case, we must return -EAGAIN to avoid a deadlock by informing the
 * caller it needs to commit the busy extents it holds before retrying the
 * extent free operation.
 */
int
xfs_extent_busy_flush(
	struct xfs_trans	*tp,
	struct xfs_perag	*pag,
	unsigned		busy_gen,
	uint32_t		alloc_flags)
{
	DEFINE_WAIT		(wait);
	int			error;

	error = xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
	if (error)
		return error;

	/* Avoid deadlocks on uncommitted busy extents. */
	if (!list_empty(&tp->t_busy)) {
		if (alloc_flags & XFS_ALLOC_FLAG_TRYFLUSH)
			return 0;

		if (busy_gen != READ_ONCE(pag->pagb_gen))
			return 0;

		if (alloc_flags & XFS_ALLOC_FLAG_FREEING)
			return -EAGAIN;
	}

	/* Wait for committed busy extents to resolve. */
	do {
		prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
		if (busy_gen != READ_ONCE(pag->pagb_gen))
			break;
		schedule();
	} while (1);

	finish_wait(&pag->pagb_wait, &wait);
	return 0;
}
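
/*
 * Illustrative handling of the return values above (hypothetical caller,
 * shown only as a sketch): 0 means the flush was either skipped or completed
 * and the caller should simply retry its search, while -EAGAIN tells a
 * caller that is freeing extents to commit the busy extents held in its own
 * transaction before retrying:
 *
 *	error = xfs_extent_busy_flush(tp, pag, busy_gen, alloc_flags);
 *	if (error == -EAGAIN) {
 *		// commit/roll the transaction to clear tp->t_busy, then retry
 *	} else if (error) {
 *		// log force failed, give up
 *	}
 */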

void
xfs_extent_busy_wait_all(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	DEFINE_WAIT		(wait);
	xfs_agnumber_t		agno;

	for_each_perag(mp, agno, pag) {
		do {
			prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
			if (RB_EMPTY_ROOT(&pag->pagb_tree))
				break;
			schedule();
		} while (1);
		finish_wait(&pag->pagb_wait, &wait);
	}
}

/*
 * Callback for list_sort to sort busy extents by the AG they reside in.
 */
int
xfs_extent_busy_ag_cmp(
	void			*priv,
	const struct list_head	*l1,
	const struct list_head	*l2)
{
	struct xfs_extent_busy	*b1 =
		container_of(l1, struct xfs_extent_busy, list);
	struct xfs_extent_busy	*b2 =
		container_of(l2, struct xfs_extent_busy, list);
	s32 diff;

	diff = b1->agno - b2->agno;
	if (!diff)
		diff = b1->bno - b2->bno;
	return diff;
}
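
/*
 * Example use of the comparator above (a hedged sketch; the variable name
 * "busy_list" is hypothetical): sort a list of busy extents so that all
 * extents belonging to the same AG are processed together, e.g. before
 * issuing discards:
 *
 *	list_sort(NULL, &busy_list, xfs_extent_busy_ag_cmp);
 */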