xref: /openbmc/linux/fs/xfs/libxfs/xfs_defer.c (revision 512edfac)
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  * Copyright (C) 2016 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <darrick.wong@oracle.com>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_defer.h"
14 #include "xfs_trans.h"
15 #include "xfs_buf_item.h"
16 #include "xfs_inode.h"
17 #include "xfs_inode_item.h"
18 #include "xfs_trace.h"
19 #include "xfs_icache.h"
20 #include "xfs_log.h"
21 
22 /*
23  * Deferred Operations in XFS
24  *
25  * Due to the way locking rules work in XFS, certain transactions (block
26  * mapping and unmapping, typically) have permanent reservations so that
27  * we can roll the transaction to adhere to AG locking order rules and
28  * to unlock buffers between metadata updates.  Prior to rmap/reflink,
29  * the mapping code had a mechanism to perform these deferrals for
30  * extents that were going to be freed; this code makes that facility
31  * more generic.
32  *
33  * When adding the reverse mapping and reflink features, it became
34  * necessary to perform complex multi-transaction remapping to comply
35  * with AG locking order rules, and to be able to spread a single
36  * refcount update operation (an operation on an n-block extent can
37  * update as many as n records!) among multiple transactions.  XFS can
38  * roll a transaction to facilitate this, but using this facility
39  * requires us to log "intent" items in case log recovery needs to
40  * redo the operation, and to log "done" items to indicate that redo
41  * is not necessary.
42  *
43  * Deferred work is tracked in xfs_defer_pending items.  Each pending
44  * item tracks one type of deferred work.  Incoming work items (which
45  * have not yet had an intent logged) are attached to a pending item
46  * on the dop_intake list, where they wait for the caller to finish
47  * the deferred operations.
48  *
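 * As a rough sketch (field names as used by the code below; the
 * authoritative definition lives in libxfs/xfs_defer.h), each pending
 * item carries:
 *
 *	struct xfs_defer_pending {
 *		struct list_head	dfp_list;	(intake/pending linkage)
 *		struct list_head	dfp_work;	(unfinished work items)
 *		struct xfs_log_item	*dfp_intent;	(logged intent item)
 *		struct xfs_log_item	*dfp_done;	(logged done item)
 *		unsigned int		dfp_count;	(number of work items)
 *		enum xfs_defer_ops_type	dfp_type;
 *	};
 *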
49  * Finishing a set of deferred operations is an involved process.  To
50  * start, we define "rolling a deferred-op transaction" as follows:
51  *
52  * > For each xfs_defer_pending item on the dop_intake list,
53  *   - Sort the work items in AG order.  XFS locking
54  *     order rules require us to lock buffers in AG order.
55  *   - Create a log intent item for that type.
56  *   - Attach it to the pending item.
57  *   - Move the pending item from the dop_intake list to the
58  *     dop_pending list.
59  * > Roll the transaction.
60  *
61  * NOTE: To avoid exceeding the transaction reservation, we limit the
62  * number of items that we attach to a given xfs_defer_pending.
63  *
64  * The actual finishing process looks like this:
65  *
66  * > For each xfs_defer_pending in the dop_pending list,
67  *   - Roll the deferred-op transaction as above.
68  *   - Create a log done item for that type, and attach it to the
69  *     log intent item.
70  *   - For each work item attached to the log intent item,
71  *     * Perform the described action.
72  *     * Attach the work item to the log done item.
73  *     * If the result of doing the work was -EAGAIN, ->finish work
74  *       wants a new transaction.  See the "Requesting a Fresh
75  *       Transaction while Finishing Deferred Work" section below for
76  *       details.
77  *
78  * The key here is that we must log an intent item for all pending
79  * work items every time we roll the transaction, and that we must log
80  * a done item as soon as the work is completed.  With this mechanism
81  * we can perform complex remapping operations, chaining intent items
82  * as needed.
83  *
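 * In outline, xfs_defer_finish_noroll() below implements this as a loop
 * that looks roughly like the following (error handling and tracing
 * elided):
 *
 *	while (intake or pending items remain) {
 *		xfs_defer_create_intents(tp);
 *		move the intake list onto the local dop_pending list;
 *		xfs_defer_trans_roll(&tp);
 *		xfs_defer_relog(&tp, &dop_pending);
 *		dfp = first xfs_defer_pending item on dop_pending;
 *		xfs_defer_finish_one(tp, dfp);	(-EAGAIN restarts the loop)
 *	}
 *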
84  * Requesting a Fresh Transaction while Finishing Deferred Work
85  *
86  * If ->finish_item decides that it needs a fresh transaction to
87  * finish the work, it must ask its caller (xfs_defer_finish) for a
88  * continuation.  The most likely cause of this circumstance is the
89  * refcount adjust functions deciding that they've logged enough items
90  * to be at risk of exceeding the transaction reservation.
91  *
92  * To get a fresh transaction, we want to log the existing log done
93  * item to prevent the log intent item from replaying, immediately log
94  * a new log intent item with the unfinished work items, roll the
95  * transaction, and re-call ->finish_item wherever it left off.  The
96  * log done item and the new log intent item must be in the same
97  * transaction or atomicity cannot be guaranteed; defer_finish ensures
98  * that this happens.
99  *
100  * This requires some coordination between ->finish_item and
101  * defer_finish.  Upon deciding to request a new transaction,
102  * ->finish_item should update the current work item to reflect the
103  * unfinished work.  Next, it should reset the log done item's list
104  * count to the number of items finished, and return -EAGAIN.
105  * defer_finish sees the -EAGAIN, logs the new log intent item
106  * with the remaining work items, and leaves the xfs_defer_pending
107  * item at the head of the dop_pending queue.  Then it rolls the
108  * transaction and picks up processing where it left off.  Note that
109  * ->finish_item must be careful to leave enough
110  * transaction reservation to fit the new log intent item.
111  *
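 * Sketched as (hypothetical) pseudocode, a ->finish_item implementation
 * that follows this protocol looks roughly like:
 *
 *	error = finish as much work as the reservation safely allows;
 *	if (!error && work remains) {
 *		update the work item to describe only the unfinished work;
 *		set the done item's count to the number of items finished;
 *		return -EAGAIN;
 *	}
 *	return error;
 *
 * xfs_defer_finish_one() then requeues the work item, creates a fresh
 * intent item for it, and lets its caller roll the transaction before
 * retrying.
 *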
112  * This is an example of remapping the extent (E, E+B) into file X at
113  * offset A and dealing with the extent (C, C+B) already being mapped
114  * there:
115  * +-------------------------------------------------+
116  * | Unmap file X startblock C offset A length B     | t0
117  * | Intent to reduce refcount for extent (C, B)     |
118  * | Intent to remove rmap (X, C, A, B)              |
119  * | Intent to free extent (D, 1) (bmbt block)       |
120  * | Intent to map (X, A, B) at startblock E         |
121  * +-------------------------------------------------+
122  * | Map file X startblock E offset A length B       | t1
123  * | Done mapping (X, E, A, B)                       |
124  * | Intent to increase refcount for extent (E, B)   |
125  * | Intent to add rmap (X, E, A, B)                 |
126  * +-------------------------------------------------+
127  * | Reduce refcount for extent (C, B)               | t2
128  * | Done reducing refcount for extent (C, 9)        |
129  * | Intent to reduce refcount for extent (C+9, B-9) |
130  * | (ran out of space after 9 refcount updates)     |
131  * +-------------------------------------------------+
132  * | Reduce refcount for extent (C+9, B-9)           | t3
133  * | Done reducing refcount for extent (C+9, B-9)    |
134  * | Increase refcount for extent (E, B)             |
135  * | Done increasing refcount for extent (E, B)      |
136  * | Intent to free extent (C, B)                    |
137  * | Intent to free extent (F, 1) (refcountbt block) |
138  * | Intent to remove rmap (F, 1, REFC)              |
139  * +-------------------------------------------------+
140  * | Remove rmap (X, C, A, B)                        | t4
141  * | Done removing rmap (X, C, A, B)                 |
142  * | Add rmap (X, E, A, B)                           |
143  * | Done adding rmap (X, E, A, B)                   |
144  * | Remove rmap (F, 1, REFC)                        |
145  * | Done removing rmap (F, 1, REFC)                 |
146  * +-------------------------------------------------+
147  * | Free extent (C, B)                              | t5
148  * | Done freeing extent (C, B)                      |
149  * | Free extent (D, 1)                              |
150  * | Done freeing extent (D, 1)                      |
151  * | Free extent (F, 1)                              |
152  * | Done freeing extent (F, 1)                      |
153  * +-------------------------------------------------+
154  *
155  * If we should crash before t2 commits, log recovery replays
156  * the following intent items:
157  *
158  * - Intent to reduce refcount for extent (C, B)
159  * - Intent to remove rmap (X, C, A, B)
160  * - Intent to free extent (D, 1) (bmbt block)
161  * - Intent to increase refcount for extent (E, B)
162  * - Intent to add rmap (X, E, A, B)
163  *
164  * In the process of recovering, it should also generate and take care
165  * of these intent items:
166  *
167  * - Intent to free extent (C, B)
168  * - Intent to free extent (F, 1) (refcountbt block)
169  * - Intent to remove rmap (F, 1, REFC)
170  *
171  * Note that the continuation requested between t2 and t3 is likely to
172  * reoccur.
173  */
174 
175 static const struct xfs_defer_op_type *defer_op_types[] = {
176 	[XFS_DEFER_OPS_TYPE_BMAP]	= &xfs_bmap_update_defer_type,
177 	[XFS_DEFER_OPS_TYPE_REFCOUNT]	= &xfs_refcount_update_defer_type,
178 	[XFS_DEFER_OPS_TYPE_RMAP]	= &xfs_rmap_update_defer_type,
179 	[XFS_DEFER_OPS_TYPE_FREE]	= &xfs_extent_free_defer_type,
180 	[XFS_DEFER_OPS_TYPE_AGFL_FREE]	= &xfs_agfl_free_defer_type,
181 };
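
/*
 * A rough sketch of the ops interface used throughout this file; the
 * authoritative declaration of struct xfs_defer_op_type lives in
 * xfs_defer.h:
 *
 *	struct xfs_defer_op_type {
 *		struct xfs_log_item *(*create_intent)(tp, items, count, sort);
 *		void (*abort_intent)(intent);
 *		struct xfs_log_item *(*create_done)(tp, intent, count);
 *		int (*finish_item)(tp, done, item, state);
 *		void (*finish_cleanup)(tp, state, error);
 *		void (*cancel_item)(item);
 *		unsigned int	max_items;
 *	};
 */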
182 
183 static void
184 xfs_defer_create_intent(
185 	struct xfs_trans		*tp,
186 	struct xfs_defer_pending	*dfp,
187 	bool				sort)
188 {
189 	const struct xfs_defer_op_type	*ops = defer_op_types[dfp->dfp_type];
190 
191 	if (!dfp->dfp_intent)
192 		dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work,
193 						     dfp->dfp_count, sort);
194 }
195 
196 /*
197  * For each pending item in the intake list, log its intent item and the
198  * associated extents.  The caller then moves the entire intake list to
199  * the end of the pending list.
200  */
201 STATIC void
202 xfs_defer_create_intents(
203 	struct xfs_trans		*tp)
204 {
205 	struct xfs_defer_pending	*dfp;
206 
207 	list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
208 		trace_xfs_defer_create_intent(tp->t_mountp, dfp);
209 		xfs_defer_create_intent(tp, dfp, true);
210 	}
211 }
212 
213 /* Abort all the intents that were committed. */
214 STATIC void
215 xfs_defer_trans_abort(
216 	struct xfs_trans		*tp,
217 	struct list_head		*dop_pending)
218 {
219 	struct xfs_defer_pending	*dfp;
220 	const struct xfs_defer_op_type	*ops;
221 
222 	trace_xfs_defer_trans_abort(tp, _RET_IP_);
223 
224 	/* Abort intent items that don't have a done item. */
225 	list_for_each_entry(dfp, dop_pending, dfp_list) {
226 		ops = defer_op_types[dfp->dfp_type];
227 		trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
228 		if (dfp->dfp_intent && !dfp->dfp_done) {
229 			ops->abort_intent(dfp->dfp_intent);
230 			dfp->dfp_intent = NULL;
231 		}
232 	}
233 }
234 
235 /*
236  * Capture resources that the caller said not to release ("held") when the
237  * transaction commits.  Caller is responsible for zero-initializing @dres.
238  */
239 static int
240 xfs_defer_save_resources(
241 	struct xfs_defer_resources	*dres,
242 	struct xfs_trans		*tp)
243 {
244 	struct xfs_buf_log_item		*bli;
245 	struct xfs_inode_log_item	*ili;
246 	struct xfs_log_item		*lip;
247 
248 	BUILD_BUG_ON(NBBY * sizeof(dres->dr_ordered) < XFS_DEFER_OPS_NR_BUFS);
249 
250 	list_for_each_entry(lip, &tp->t_items, li_trans) {
251 		switch (lip->li_type) {
252 		case XFS_LI_BUF:
253 			bli = container_of(lip, struct xfs_buf_log_item,
254 					   bli_item);
255 			if (bli->bli_flags & XFS_BLI_HOLD) {
256 				if (dres->dr_bufs >= XFS_DEFER_OPS_NR_BUFS) {
257 					ASSERT(0);
258 					return -EFSCORRUPTED;
259 				}
260 				if (bli->bli_flags & XFS_BLI_ORDERED)
261 					dres->dr_ordered |=
262 							(1U << dres->dr_bufs);
263 				else
264 					xfs_trans_dirty_buf(tp, bli->bli_buf);
265 				dres->dr_bp[dres->dr_bufs++] = bli->bli_buf;
266 			}
267 			break;
268 		case XFS_LI_INODE:
269 			ili = container_of(lip, struct xfs_inode_log_item,
270 					   ili_item);
271 			if (ili->ili_lock_flags == 0) {
272 				if (dres->dr_inos >= XFS_DEFER_OPS_NR_INODES) {
273 					ASSERT(0);
274 					return -EFSCORRUPTED;
275 				}
276 				xfs_trans_log_inode(tp, ili->ili_inode,
277 						    XFS_ILOG_CORE);
278 				dres->dr_ip[dres->dr_inos++] = ili->ili_inode;
279 			}
280 			break;
281 		default:
282 			break;
283 		}
284 	}
285 
286 	return 0;
287 }
288 
289 /* Attach the held resources to the transaction. */
290 static void
291 xfs_defer_restore_resources(
292 	struct xfs_trans		*tp,
293 	struct xfs_defer_resources	*dres)
294 {
295 	unsigned short			i;
296 
297 	/* Rejoin the joined inodes. */
298 	for (i = 0; i < dres->dr_inos; i++)
299 		xfs_trans_ijoin(tp, dres->dr_ip[i], 0);
300 
301 	/* Rejoin the buffers and dirty them so the log moves forward. */
302 	for (i = 0; i < dres->dr_bufs; i++) {
303 		xfs_trans_bjoin(tp, dres->dr_bp[i]);
304 		if (dres->dr_ordered & (1U << i))
305 			xfs_trans_ordered_buf(tp, dres->dr_bp[i]);
306 		xfs_trans_bhold(tp, dres->dr_bp[i]);
307 	}
308 }
309 
310 /* Roll a transaction so we can do some deferred op processing. */
311 STATIC int
312 xfs_defer_trans_roll(
313 	struct xfs_trans		**tpp)
314 {
315 	struct xfs_defer_resources	dres = { };
316 	int				error;
317 
318 	error = xfs_defer_save_resources(&dres, *tpp);
319 	if (error)
320 		return error;
321 
322 	trace_xfs_defer_trans_roll(*tpp, _RET_IP_);
323 
324 	/*
325 	 * Roll the transaction.  Rolling always gives us a new transaction (even
326 	 * if committing the old one fails!) to hand back to the caller, so we
327 	 * join the held resources to the new transaction so that we always
328 	 * return with the held resources joined to @tpp, no matter what
329 	 * happened.
330 	 */
331 	error = xfs_trans_roll(tpp);
332 
333 	xfs_defer_restore_resources(*tpp, &dres);
334 
335 	if (error)
336 		trace_xfs_defer_trans_roll_error(*tpp, error);
337 	return error;
338 }
339 
340 /*
341  * Free up any items left in the list.
342  */
343 static void
344 xfs_defer_cancel_list(
345 	struct xfs_mount		*mp,
346 	struct list_head		*dop_list)
347 {
348 	struct xfs_defer_pending	*dfp;
349 	struct xfs_defer_pending	*pli;
350 	struct list_head		*pwi;
351 	struct list_head		*n;
352 	const struct xfs_defer_op_type	*ops;
353 
354 	/*
355 	 * Free the pending items.  Caller should already have arranged
356 	 * for the intent items to be released.
357 	 */
358 	list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) {
359 		ops = defer_op_types[dfp->dfp_type];
360 		trace_xfs_defer_cancel_list(mp, dfp);
361 		list_del(&dfp->dfp_list);
362 		list_for_each_safe(pwi, n, &dfp->dfp_work) {
363 			list_del(pwi);
364 			dfp->dfp_count--;
365 			ops->cancel_item(pwi);
366 		}
367 		ASSERT(dfp->dfp_count == 0);
368 		kmem_free(dfp);
369 	}
370 }
371 
372 /*
373  * Prevent a log intent item from pinning the tail of the log by logging a
374  * done item to release the intent item; and then log a new intent item.
375  * The caller should provide a fresh transaction and roll it after we're done.
376  */
377 static int
378 xfs_defer_relog(
379 	struct xfs_trans		**tpp,
380 	struct list_head		*dfops)
381 {
382 	struct xlog			*log = (*tpp)->t_mountp->m_log;
383 	struct xfs_defer_pending	*dfp;
384 	xfs_lsn_t			threshold_lsn = NULLCOMMITLSN;
385 
387 	ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES);
388 
389 	list_for_each_entry(dfp, dfops, dfp_list) {
390 		/*
391 		 * If the log intent item for this deferred op is not a part of
392 		 * the current log checkpoint, relog the intent item to keep
393 		 * the log tail moving forward.  We're ok with this being racy
394 		 * because an incorrect decision means we'll be a little slower
395 		 * at pushing the tail.
396 		 */
397 		if (dfp->dfp_intent == NULL ||
398 		    xfs_log_item_in_current_chkpt(dfp->dfp_intent))
399 			continue;
400 
401 		/*
402 		 * Figure out where we need the tail to be in order to maintain
403 		 * the minimum required free space in the log.  Only sample
404 		 * the log threshold once per call.
405 		 */
406 		if (threshold_lsn == NULLCOMMITLSN) {
407 			threshold_lsn = xlog_grant_push_threshold(log, 0);
408 			if (threshold_lsn == NULLCOMMITLSN)
409 				break;
410 		}
411 		if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0)
412 			continue;
413 
414 		trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp);
415 		XFS_STATS_INC((*tpp)->t_mountp, defer_relog);
416 		dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp);
417 	}
418 
419 	if ((*tpp)->t_flags & XFS_TRANS_DIRTY)
420 		return xfs_defer_trans_roll(tpp);
421 	return 0;
422 }
423 
424 /*
425  * Log an intent-done item for the first pending intent, and finish the work
426  * items.
427  */
428 static int
429 xfs_defer_finish_one(
430 	struct xfs_trans		*tp,
431 	struct xfs_defer_pending	*dfp)
432 {
433 	const struct xfs_defer_op_type	*ops = defer_op_types[dfp->dfp_type];
434 	struct xfs_btree_cur		*state = NULL;
435 	struct list_head		*li, *n;
436 	int				error;
437 
438 	trace_xfs_defer_pending_finish(tp->t_mountp, dfp);
439 
440 	dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count);
441 	list_for_each_safe(li, n, &dfp->dfp_work) {
442 		list_del(li);
443 		dfp->dfp_count--;
444 		error = ops->finish_item(tp, dfp->dfp_done, li, &state);
445 		if (error == -EAGAIN) {
446 			/*
447 			 * Caller wants a fresh transaction; put the work item
448 			 * back on the list and log a new log intent item to
449 			 * replace the old one.  See "Requesting a Fresh
450 			 * Transaction while Finishing Deferred Work" above.
451 			 */
452 			list_add(li, &dfp->dfp_work);
453 			dfp->dfp_count++;
454 			dfp->dfp_done = NULL;
455 			dfp->dfp_intent = NULL;
456 			xfs_defer_create_intent(tp, dfp, false);
457 		}
458 
459 		if (error)
460 			goto out;
461 	}
462 
463 	/* Done with the dfp, free it. */
464 	list_del(&dfp->dfp_list);
465 	kmem_free(dfp);
466 out:
467 	if (ops->finish_cleanup)
468 		ops->finish_cleanup(tp, state, error);
469 	return error;
470 }
471 
472 /*
473  * Finish all the pending work.  This involves logging intent items for
474  * any work items that wandered in since the last transaction roll (if
475  * one has even happened), rolling the transaction, and finishing the
476  * work items in the first item on the logged-and-pending list.
477  *
478  * Held resources (buffers and inodes) are rejoined to each new transaction.
479  */
480 int
481 xfs_defer_finish_noroll(
482 	struct xfs_trans		**tp)
483 {
484 	struct xfs_defer_pending	*dfp;
485 	int				error = 0;
486 	LIST_HEAD(dop_pending);
487 
488 	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
489 
490 	trace_xfs_defer_finish(*tp, _RET_IP_);
491 
492 	/* Until we run out of pending work to finish... */
493 	while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {
494 		/*
495 		 * Deferred items that are created in the process of finishing
496 		 * other deferred work items should be queued at the head of
497 		 * the pending list, which puts them ahead of the deferred work
498 		 * that was created by the caller.  This keeps the number of
499 		 * pending work items to a minimum, which decreases the amount
500 		 * of time that any one intent item can stick around in memory,
501 		 * pinning the log tail.
502 		 */
503 		xfs_defer_create_intents(*tp);
504 		list_splice_init(&(*tp)->t_dfops, &dop_pending);
505 
506 		error = xfs_defer_trans_roll(tp);
507 		if (error)
508 			goto out_shutdown;
509 
510 		/* Possibly relog intent items to keep the log moving. */
511 		error = xfs_defer_relog(tp, &dop_pending);
512 		if (error)
513 			goto out_shutdown;
514 
515 		dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
516 				       dfp_list);
517 		error = xfs_defer_finish_one(*tp, dfp);
518 		if (error && error != -EAGAIN)
519 			goto out_shutdown;
520 	}
521 
522 	trace_xfs_defer_finish_done(*tp, _RET_IP_);
523 	return 0;
524 
525 out_shutdown:
526 	xfs_defer_trans_abort(*tp, &dop_pending);
527 	xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE);
528 	trace_xfs_defer_finish_error(*tp, error);
529 	xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending);
530 	xfs_defer_cancel(*tp);
531 	return error;
532 }
533 
534 int
535 xfs_defer_finish(
536 	struct xfs_trans	**tp)
537 {
538 	int			error;
539 
540 	/*
541 	 * Finish and roll the transaction once more to avoid returning to the
542 	 * caller with a dirty transaction.
543 	 */
544 	error = xfs_defer_finish_noroll(tp);
545 	if (error)
546 		return error;
547 	if ((*tp)->t_flags & XFS_TRANS_DIRTY) {
548 		error = xfs_defer_trans_roll(tp);
549 		if (error) {
550 			xfs_force_shutdown((*tp)->t_mountp,
551 					   SHUTDOWN_CORRUPT_INCORE);
552 			return error;
553 		}
554 	}
555 
556 	/* Reset LOWMODE now that we've finished all the dfops. */
557 	ASSERT(list_empty(&(*tp)->t_dfops));
558 	(*tp)->t_flags &= ~XFS_TRANS_LOWMODE;
559 	return 0;
560 }
561 
562 void
563 xfs_defer_cancel(
564 	struct xfs_trans	*tp)
565 {
566 	struct xfs_mount	*mp = tp->t_mountp;
567 
568 	trace_xfs_defer_cancel(tp, _RET_IP_);
569 	xfs_defer_cancel_list(mp, &tp->t_dfops);
570 }
571 
572 /* Add an item for later deferred processing. */
573 void
574 xfs_defer_add(
575 	struct xfs_trans		*tp,
576 	enum xfs_defer_ops_type		type,
577 	struct list_head		*li)
578 {
579 	struct xfs_defer_pending	*dfp = NULL;
580 	const struct xfs_defer_op_type	*ops;
581 
582 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
583 	BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX);
584 
585 	/*
586 	 * Add the item to a pending item at the end of the intake list.
587 	 * If the last pending item has the same type, reuse it.  Else,
588 	 * create a new pending item at the end of the intake list.
589 	 */
590 	if (!list_empty(&tp->t_dfops)) {
591 		dfp = list_last_entry(&tp->t_dfops,
592 				struct xfs_defer_pending, dfp_list);
593 		ops = defer_op_types[dfp->dfp_type];
594 		if (dfp->dfp_type != type ||
595 		    (ops->max_items && dfp->dfp_count >= ops->max_items))
596 			dfp = NULL;
597 	}
598 	if (!dfp) {
599 		dfp = kmem_alloc(sizeof(struct xfs_defer_pending),
600 				KM_NOFS);
601 		dfp->dfp_type = type;
602 		dfp->dfp_intent = NULL;
603 		dfp->dfp_done = NULL;
604 		dfp->dfp_count = 0;
605 		INIT_LIST_HEAD(&dfp->dfp_work);
606 		list_add_tail(&dfp->dfp_list, &tp->t_dfops);
607 	}
608 
609 	list_add_tail(li, &dfp->dfp_work);
610 	dfp->dfp_count++;
611 }
612 
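/*
 * A minimal usage sketch for xfs_defer_add().  The struct and field names
 * below are made up for illustration; real callers (the extent freeing
 * code, for example) follow the same shape, embedding a list_head in the
 * work item they queue:
 *
 *	struct xfs_fake_work_item {
 *		struct list_head	fwi_list;
 *		(payload describing the deferred work)
 *	};
 *
 *	item = kmem_alloc(sizeof(*item), KM_NOFS);
 *	(fill in the payload)
 *	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &item->fwi_list);
 *
 * From that point the work item belongs to the defer machinery; it is next
 * touched by the matching ops' ->create_intent, ->finish_item, or
 * ->cancel_item callbacks.
 */
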
613 /*
614  * Move deferred ops from one transaction to another and reset the source to
615  * initial state. This is primarily used to carry state forward across
616  * transaction rolls with pending dfops.
617  */
618 void
619 xfs_defer_move(
620 	struct xfs_trans	*dtp,
621 	struct xfs_trans	*stp)
622 {
623 	list_splice_init(&stp->t_dfops, &dtp->t_dfops);
624 
625 	/*
626 	 * Low free space mode was historically controlled by a dfops field.
627 	 * This meant that low mode state potentially carried across multiple
628 	 * transaction rolls. Transfer low mode on a dfops move to preserve
629 	 * that behavior.
630 	 */
631 	dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE);
632 	stp->t_flags &= ~XFS_TRANS_LOWMODE;
633 }
634 
635 /*
636  * Prepare a chain of fresh deferred ops work items to be completed later.  Log
637  * recovery requires the ability to put off until later the actual finishing
638  * work so that it can process unfinished items recovered from the log in
639  * correct order.
640  *
641  * Create and log intent items for all the work that we're capturing so that we
642  * can be assured that the items will get replayed if the system goes down
643  * before log recovery gets a chance to finish the work it put off.  The entire
644  * deferred ops state is transferred to the capture structure and the
645  * transaction is then ready for the caller to commit it.  If there are no
646  * intent items to capture, this function returns NULL.
647  *
648  * The capture structure obtains its own references to any inodes and
649  * buffers that were held by the transaction.
650  */
651 static struct xfs_defer_capture *
652 xfs_defer_ops_capture(
653 	struct xfs_trans		*tp)
654 {
655 	struct xfs_defer_capture	*dfc;
656 	unsigned short			i;
657 	int				error;
658 
659 	if (list_empty(&tp->t_dfops))
660 		return NULL;
661 
662 	/* Create an object to capture the defer ops. */
663 	dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS);
664 	INIT_LIST_HEAD(&dfc->dfc_list);
665 	INIT_LIST_HEAD(&dfc->dfc_dfops);
666 
667 	xfs_defer_create_intents(tp);
668 
669 	/* Move the dfops chain and transaction state to the capture struct. */
670 	list_splice_init(&tp->t_dfops, &dfc->dfc_dfops);
671 	dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE;
672 	tp->t_flags &= ~XFS_TRANS_LOWMODE;
673 
674 	/* Capture the remaining block reservations along with the dfops. */
675 	dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used;
676 	dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used;
677 
678 	/* Preserve the log reservation size. */
679 	dfc->dfc_logres = tp->t_log_res;
680 
681 	error = xfs_defer_save_resources(&dfc->dfc_held, tp);
682 	if (error) {
683 		/*
684 		 * Resource capture should never fail, but if it does, we
685 		 * still have to shut down the log and release things
686 		 * properly.
687 		 */
688 		xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE);
689 	}
690 
691 	/*
692 	 * Grab extra references to the inodes and buffers because callers are
693 	 * expected to release their held references after we commit the
694 	 * transaction.
695 	 */
696 	for (i = 0; i < dfc->dfc_held.dr_inos; i++) {
697 		ASSERT(xfs_isilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL));
698 		ihold(VFS_I(dfc->dfc_held.dr_ip[i]));
699 	}
700 
701 	for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
702 		xfs_buf_hold(dfc->dfc_held.dr_bp[i]);
703 
704 	return dfc;
705 }
706 
707 /* Release all resources that we used to capture deferred ops. */
708 void
709 xfs_defer_ops_capture_free(
710 	struct xfs_mount		*mp,
711 	struct xfs_defer_capture	*dfc)
712 {
713 	unsigned short			i;
714 
715 	xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
716 
717 	for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
718 		xfs_buf_relse(dfc->dfc_held.dr_bp[i]);
719 
720 	for (i = 0; i < dfc->dfc_held.dr_inos; i++)
721 		xfs_irele(dfc->dfc_held.dr_ip[i]);
722 
723 	kmem_free(dfc);
724 }
725 
726 /*
727  * Capture any deferred ops and commit the transaction.  This is the last step
728  * needed to finish a log intent item that we recovered from the log.  If any
729  * of the deferred ops operate on an inode, the caller must pass in that inode
730  * so that the reference can be transferred to the capture structure.  The
731  * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling
732  * xfs_defer_ops_continue.
733  */
734 int
735 xfs_defer_ops_capture_and_commit(
736 	struct xfs_trans		*tp,
737 	struct list_head		*capture_list)
738 {
739 	struct xfs_mount		*mp = tp->t_mountp;
740 	struct xfs_defer_capture	*dfc;
741 	int				error;
742 
743 	/* If we don't capture anything, commit the transaction and exit. */
744 	dfc = xfs_defer_ops_capture(tp);
745 	if (!dfc)
746 		return xfs_trans_commit(tp);
747 
748 	/* Commit the transaction and add the capture structure to the list. */
749 	error = xfs_trans_commit(tp);
750 	if (error) {
751 		xfs_defer_ops_capture_free(mp, dfc);
752 		return error;
753 	}
754 
755 	list_add_tail(&dfc->dfc_list, capture_list);
756 	return 0;
757 }
758 
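/*
 * A sketch of how log recovery is expected to drain the capture list (not
 * the verbatim recovery code, which lives in xfs_log_recover.c): each
 * recovered intent item commits its transaction through
 * xfs_defer_ops_capture_and_commit(), and once every intent has been
 * replayed, recovery walks the list roughly as follows:
 *
 *	list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
 *		list_del_init(&dfc->dfc_list);
 *		allocate a new permanent transaction;
 *		xfs_defer_ops_continue(dfc, tp, &dres);
 *		error = xfs_trans_commit(tp);
 *		xfs_defer_resources_rele(&dres);
 *	}
 */
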
759 /*
760  * Attach a chain of captured deferred ops to a new transaction and free the
761  * capture structure.  If an inode was captured, it will be passed back to the
762  * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0.
763  * The caller now owns the inode reference.
764  */
765 void
766 xfs_defer_ops_continue(
767 	struct xfs_defer_capture	*dfc,
768 	struct xfs_trans		*tp,
769 	struct xfs_defer_resources	*dres)
770 {
771 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
772 	ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
773 
774 	/* Lock and join the captured inode to the new transaction. */
775 	if (dfc->dfc_held.dr_inos == 2)
776 		xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL,
777 				    dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL);
778 	else if (dfc->dfc_held.dr_inos == 1)
779 		xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL);
780 	xfs_defer_restore_resources(tp, &dfc->dfc_held);
781 	memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources));
782 
783 	/* Move captured dfops chain and state to the transaction. */
784 	list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
785 	tp->t_flags |= dfc->dfc_tpflags;
786 
787 	kmem_free(dfc);
788 }
789 
790 /* Release the resources captured and continued during recovery. */
791 void
792 xfs_defer_resources_rele(
793 	struct xfs_defer_resources	*dres)
794 {
795 	unsigned short			i;
796 
797 	for (i = 0; i < dres->dr_inos; i++) {
798 		xfs_iunlock(dres->dr_ip[i], XFS_ILOCK_EXCL);
799 		xfs_irele(dres->dr_ip[i]);
800 		dres->dr_ip[i] = NULL;
801 	}
802 
803 	for (i = 0; i < dres->dr_bufs; i++) {
804 		xfs_buf_relse(dres->dr_bp[i]);
805 		dres->dr_bp[i] = NULL;
806 	}
807 
808 	dres->dr_inos = 0;
809 	dres->dr_bufs = 0;
810 	dres->dr_ordered = 0;
811 }
812