xref: /openbmc/linux/fs/xfs/xfs_inode_item.c (revision a266ef69b890f099069cf51bb40572611c435a54)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_inode_item.h"
16 #include "xfs_trace.h"
17 #include "xfs_trans_priv.h"
18 #include "xfs_buf_item.h"
19 #include "xfs_log.h"
20 #include "xfs_log_priv.h"
21 #include "xfs_error.h"
22 
23 #include <linux/iversion.h>
24 
/*
 * Slab cache backing inode log items: xfs_inode_item_init() allocates zeroed
 * items from it and xfs_inode_item_destroy() returns them to it.
 */
struct kmem_cache	*xfs_ili_cache;		/* inode log item */
26 
27 static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip)
28 {
29 	return container_of(lip, struct xfs_inode_log_item, ili_item);
30 }
31 
32 /*
33  * The logged size of an inode fork is always the current size of the inode
34  * fork. This means that when an inode fork is relogged, the size of the logged
35  * region is determined by the current state, not the combination of the
36  * previously logged state + the current state. This is different relogging
37  * behaviour to most other log items which will retain the size of the
38  * previously logged changes when smaller regions are relogged.
39  *
40  * Hence operations that remove data from the inode fork (e.g. shortform
41  * dir/attr remove, extent form extent removal, etc), the size of the relogged
42  * inode gets -smaller- rather than stays the same size as the previously logged
43  * size and this can result in the committing transaction reducing the amount of
44  * space being consumed by the CIL.
45  */
46 STATIC void
47 xfs_inode_item_data_fork_size(
48 	struct xfs_inode_log_item *iip,
49 	int			*nvecs,
50 	int			*nbytes)
51 {
52 	struct xfs_inode	*ip = iip->ili_inode;
53 
54 	switch (ip->i_df.if_format) {
55 	case XFS_DINODE_FMT_EXTENTS:
56 		if ((iip->ili_fields & XFS_ILOG_DEXT) &&
57 		    ip->i_df.if_nextents > 0 &&
58 		    ip->i_df.if_bytes > 0) {
59 			/* worst case, doesn't subtract delalloc extents */
60 			*nbytes += xfs_inode_data_fork_size(ip);
61 			*nvecs += 1;
62 		}
63 		break;
64 	case XFS_DINODE_FMT_BTREE:
65 		if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
66 		    ip->i_df.if_broot_bytes > 0) {
67 			*nbytes += ip->i_df.if_broot_bytes;
68 			*nvecs += 1;
69 		}
70 		break;
71 	case XFS_DINODE_FMT_LOCAL:
72 		if ((iip->ili_fields & XFS_ILOG_DDATA) &&
73 		    ip->i_df.if_bytes > 0) {
74 			*nbytes += xlog_calc_iovec_len(ip->i_df.if_bytes);
75 			*nvecs += 1;
76 		}
77 		break;
78 
79 	case XFS_DINODE_FMT_DEV:
80 		break;
81 	default:
82 		ASSERT(0);
83 		break;
84 	}
85 }
86 
87 STATIC void
88 xfs_inode_item_attr_fork_size(
89 	struct xfs_inode_log_item *iip,
90 	int			*nvecs,
91 	int			*nbytes)
92 {
93 	struct xfs_inode	*ip = iip->ili_inode;
94 
95 	switch (ip->i_af.if_format) {
96 	case XFS_DINODE_FMT_EXTENTS:
97 		if ((iip->ili_fields & XFS_ILOG_AEXT) &&
98 		    ip->i_af.if_nextents > 0 &&
99 		    ip->i_af.if_bytes > 0) {
100 			/* worst case, doesn't subtract unused space */
101 			*nbytes += xfs_inode_attr_fork_size(ip);
102 			*nvecs += 1;
103 		}
104 		break;
105 	case XFS_DINODE_FMT_BTREE:
106 		if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
107 		    ip->i_af.if_broot_bytes > 0) {
108 			*nbytes += ip->i_af.if_broot_bytes;
109 			*nvecs += 1;
110 		}
111 		break;
112 	case XFS_DINODE_FMT_LOCAL:
113 		if ((iip->ili_fields & XFS_ILOG_ADATA) &&
114 		    ip->i_af.if_bytes > 0) {
115 			*nbytes += xlog_calc_iovec_len(ip->i_af.if_bytes);
116 			*nvecs += 1;
117 		}
118 		break;
119 	default:
120 		ASSERT(0);
121 		break;
122 	}
123 }
124 
125 /*
126  * This returns the number of iovecs needed to log the given inode item.
127  *
128  * We need one iovec for the inode log format structure, one for the
129  * inode core, and possibly one for the inode data/extents/b-tree root
130  * and one for the inode attribute data/extents/b-tree root.
131  */
132 STATIC void
133 xfs_inode_item_size(
134 	struct xfs_log_item	*lip,
135 	int			*nvecs,
136 	int			*nbytes)
137 {
138 	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
139 	struct xfs_inode	*ip = iip->ili_inode;
140 
141 	*nvecs += 2;
142 	*nbytes += sizeof(struct xfs_inode_log_format) +
143 		   xfs_log_dinode_size(ip->i_mount);
144 
145 	xfs_inode_item_data_fork_size(iip, nvecs, nbytes);
146 	if (xfs_inode_has_attr_fork(ip))
147 		xfs_inode_item_attr_fork_size(iip, nvecs, nbytes);
148 }
149 
150 STATIC void
151 xfs_inode_item_format_data_fork(
152 	struct xfs_inode_log_item *iip,
153 	struct xfs_inode_log_format *ilf,
154 	struct xfs_log_vec	*lv,
155 	struct xfs_log_iovec	**vecp)
156 {
157 	struct xfs_inode	*ip = iip->ili_inode;
158 	size_t			data_bytes;
159 
160 	switch (ip->i_df.if_format) {
161 	case XFS_DINODE_FMT_EXTENTS:
162 		iip->ili_fields &=
163 			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
164 
165 		if ((iip->ili_fields & XFS_ILOG_DEXT) &&
166 		    ip->i_df.if_nextents > 0 &&
167 		    ip->i_df.if_bytes > 0) {
168 			struct xfs_bmbt_rec *p;
169 
170 			ASSERT(xfs_iext_count(&ip->i_df) > 0);
171 
172 			p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT);
173 			data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK);
174 			xlog_finish_iovec(lv, *vecp, data_bytes);
175 
176 			ASSERT(data_bytes <= ip->i_df.if_bytes);
177 
178 			ilf->ilf_dsize = data_bytes;
179 			ilf->ilf_size++;
180 		} else {
181 			iip->ili_fields &= ~XFS_ILOG_DEXT;
182 		}
183 		break;
184 	case XFS_DINODE_FMT_BTREE:
185 		iip->ili_fields &=
186 			~(XFS_ILOG_DDATA | XFS_ILOG_DEXT | XFS_ILOG_DEV);
187 
188 		if ((iip->ili_fields & XFS_ILOG_DBROOT) &&
189 		    ip->i_df.if_broot_bytes > 0) {
190 			ASSERT(ip->i_df.if_broot != NULL);
191 			xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IBROOT,
192 					ip->i_df.if_broot,
193 					ip->i_df.if_broot_bytes);
194 			ilf->ilf_dsize = ip->i_df.if_broot_bytes;
195 			ilf->ilf_size++;
196 		} else {
197 			ASSERT(!(iip->ili_fields &
198 				 XFS_ILOG_DBROOT));
199 			iip->ili_fields &= ~XFS_ILOG_DBROOT;
200 		}
201 		break;
202 	case XFS_DINODE_FMT_LOCAL:
203 		iip->ili_fields &=
204 			~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
205 		if ((iip->ili_fields & XFS_ILOG_DDATA) &&
206 		    ip->i_df.if_bytes > 0) {
207 			ASSERT(ip->i_df.if_u1.if_data != NULL);
208 			ASSERT(ip->i_disk_size > 0);
209 			xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
210 					ip->i_df.if_u1.if_data,
211 					ip->i_df.if_bytes);
212 			ilf->ilf_dsize = (unsigned)ip->i_df.if_bytes;
213 			ilf->ilf_size++;
214 		} else {
215 			iip->ili_fields &= ~XFS_ILOG_DDATA;
216 		}
217 		break;
218 	case XFS_DINODE_FMT_DEV:
219 		iip->ili_fields &=
220 			~(XFS_ILOG_DDATA | XFS_ILOG_DBROOT | XFS_ILOG_DEXT);
221 		if (iip->ili_fields & XFS_ILOG_DEV)
222 			ilf->ilf_u.ilfu_rdev = sysv_encode_dev(VFS_I(ip)->i_rdev);
223 		break;
224 	default:
225 		ASSERT(0);
226 		break;
227 	}
228 }
229 
230 STATIC void
231 xfs_inode_item_format_attr_fork(
232 	struct xfs_inode_log_item *iip,
233 	struct xfs_inode_log_format *ilf,
234 	struct xfs_log_vec	*lv,
235 	struct xfs_log_iovec	**vecp)
236 {
237 	struct xfs_inode	*ip = iip->ili_inode;
238 	size_t			data_bytes;
239 
240 	switch (ip->i_af.if_format) {
241 	case XFS_DINODE_FMT_EXTENTS:
242 		iip->ili_fields &=
243 			~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT);
244 
245 		if ((iip->ili_fields & XFS_ILOG_AEXT) &&
246 		    ip->i_af.if_nextents > 0 &&
247 		    ip->i_af.if_bytes > 0) {
248 			struct xfs_bmbt_rec *p;
249 
250 			ASSERT(xfs_iext_count(&ip->i_af) ==
251 				ip->i_af.if_nextents);
252 
253 			p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_EXT);
254 			data_bytes = xfs_iextents_copy(ip, p, XFS_ATTR_FORK);
255 			xlog_finish_iovec(lv, *vecp, data_bytes);
256 
257 			ilf->ilf_asize = data_bytes;
258 			ilf->ilf_size++;
259 		} else {
260 			iip->ili_fields &= ~XFS_ILOG_AEXT;
261 		}
262 		break;
263 	case XFS_DINODE_FMT_BTREE:
264 		iip->ili_fields &=
265 			~(XFS_ILOG_ADATA | XFS_ILOG_AEXT);
266 
267 		if ((iip->ili_fields & XFS_ILOG_ABROOT) &&
268 		    ip->i_af.if_broot_bytes > 0) {
269 			ASSERT(ip->i_af.if_broot != NULL);
270 
271 			xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_BROOT,
272 					ip->i_af.if_broot,
273 					ip->i_af.if_broot_bytes);
274 			ilf->ilf_asize = ip->i_af.if_broot_bytes;
275 			ilf->ilf_size++;
276 		} else {
277 			iip->ili_fields &= ~XFS_ILOG_ABROOT;
278 		}
279 		break;
280 	case XFS_DINODE_FMT_LOCAL:
281 		iip->ili_fields &=
282 			~(XFS_ILOG_AEXT | XFS_ILOG_ABROOT);
283 
284 		if ((iip->ili_fields & XFS_ILOG_ADATA) &&
285 		    ip->i_af.if_bytes > 0) {
286 			ASSERT(ip->i_af.if_u1.if_data != NULL);
287 			xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
288 					ip->i_af.if_u1.if_data,
289 					ip->i_af.if_bytes);
290 			ilf->ilf_asize = (unsigned)ip->i_af.if_bytes;
291 			ilf->ilf_size++;
292 		} else {
293 			iip->ili_fields &= ~XFS_ILOG_ADATA;
294 		}
295 		break;
296 	default:
297 		ASSERT(0);
298 		break;
299 	}
300 }
301 
302 /*
303  * Convert an incore timestamp to a log timestamp.  Note that the log format
304  * specifies host endian format!
305  */
306 static inline xfs_log_timestamp_t
307 xfs_inode_to_log_dinode_ts(
308 	struct xfs_inode		*ip,
309 	const struct timespec64		tv)
310 {
311 	struct xfs_log_legacy_timestamp	*lits;
312 	xfs_log_timestamp_t		its;
313 
314 	if (xfs_inode_has_bigtime(ip))
315 		return xfs_inode_encode_bigtime(tv);
316 
317 	lits = (struct xfs_log_legacy_timestamp *)&its;
318 	lits->t_sec = tv.tv_sec;
319 	lits->t_nsec = tv.tv_nsec;
320 
321 	return its;
322 }
323 
324 /*
325  * The legacy DMAPI fields are only present in the on-disk and in-log inodes,
326  * but not in the in-memory one.  But we are guaranteed to have an inode buffer
327  * in memory when logging an inode, so we can just copy it from the on-disk
328  * inode to the in-log inode here so that recovery of file system with these
329  * fields set to non-zero values doesn't lose them.  For all other cases we zero
330  * the fields.
331  */
332 static void
333 xfs_copy_dm_fields_to_log_dinode(
334 	struct xfs_inode	*ip,
335 	struct xfs_log_dinode	*to)
336 {
337 	struct xfs_dinode	*dip;
338 
339 	dip = xfs_buf_offset(ip->i_itemp->ili_item.li_buf,
340 			     ip->i_imap.im_boffset);
341 
342 	if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS)) {
343 		to->di_dmevmask = be32_to_cpu(dip->di_dmevmask);
344 		to->di_dmstate = be16_to_cpu(dip->di_dmstate);
345 	} else {
346 		to->di_dmevmask = 0;
347 		to->di_dmstate = 0;
348 	}
349 }
350 
351 static inline void
352 xfs_inode_to_log_dinode_iext_counters(
353 	struct xfs_inode	*ip,
354 	struct xfs_log_dinode	*to)
355 {
356 	if (xfs_inode_has_large_extent_counts(ip)) {
357 		to->di_big_nextents = xfs_ifork_nextents(&ip->i_df);
358 		to->di_big_anextents = xfs_ifork_nextents(&ip->i_af);
359 		to->di_nrext64_pad = 0;
360 	} else {
361 		to->di_nextents = xfs_ifork_nextents(&ip->i_df);
362 		to->di_anextents = xfs_ifork_nextents(&ip->i_af);
363 	}
364 }
365 
366 static void
367 xfs_inode_to_log_dinode(
368 	struct xfs_inode	*ip,
369 	struct xfs_log_dinode	*to,
370 	xfs_lsn_t		lsn)
371 {
372 	struct inode		*inode = VFS_I(ip);
373 
374 	to->di_magic = XFS_DINODE_MAGIC;
375 	to->di_format = xfs_ifork_format(&ip->i_df);
376 	to->di_uid = i_uid_read(inode);
377 	to->di_gid = i_gid_read(inode);
378 	to->di_projid_lo = ip->i_projid & 0xffff;
379 	to->di_projid_hi = ip->i_projid >> 16;
380 
381 	memset(to->di_pad3, 0, sizeof(to->di_pad3));
382 	to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime);
383 	to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime);
384 	to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode->i_ctime);
385 	to->di_nlink = inode->i_nlink;
386 	to->di_gen = inode->i_generation;
387 	to->di_mode = inode->i_mode;
388 
389 	to->di_size = ip->i_disk_size;
390 	to->di_nblocks = ip->i_nblocks;
391 	to->di_extsize = ip->i_extsize;
392 	to->di_forkoff = ip->i_forkoff;
393 	to->di_aformat = xfs_ifork_format(&ip->i_af);
394 	to->di_flags = ip->i_diflags;
395 
396 	xfs_copy_dm_fields_to_log_dinode(ip, to);
397 
398 	/* log a dummy value to ensure log structure is fully initialised */
399 	to->di_next_unlinked = NULLAGINO;
400 
401 	if (xfs_has_v3inodes(ip->i_mount)) {
402 		to->di_version = 3;
403 		to->di_changecount = inode_peek_iversion(inode);
404 		to->di_crtime = xfs_inode_to_log_dinode_ts(ip, ip->i_crtime);
405 		to->di_flags2 = ip->i_diflags2;
406 		to->di_cowextsize = ip->i_cowextsize;
407 		to->di_ino = ip->i_ino;
408 		to->di_lsn = lsn;
409 		memset(to->di_pad2, 0, sizeof(to->di_pad2));
410 		uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
411 		to->di_v3_pad = 0;
412 	} else {
413 		to->di_version = 2;
414 		to->di_flushiter = ip->i_flushiter;
415 		memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad));
416 	}
417 
418 	xfs_inode_to_log_dinode_iext_counters(ip, to);
419 }
420 
421 /*
422  * Format the inode core. Current timestamp data is only in the VFS inode
423  * fields, so we need to grab them from there. Hence rather than just copying
424  * the XFS inode core structure, format the fields directly into the iovec.
425  */
426 static void
427 xfs_inode_item_format_core(
428 	struct xfs_inode	*ip,
429 	struct xfs_log_vec	*lv,
430 	struct xfs_log_iovec	**vecp)
431 {
432 	struct xfs_log_dinode	*dic;
433 
434 	dic = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE);
435 	xfs_inode_to_log_dinode(ip, dic, ip->i_itemp->ili_item.li_lsn);
436 	xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_mount));
437 }
438 
439 /*
440  * This is called to fill in the vector of log iovecs for the given inode
441  * log item.  It fills the first item with an inode log format structure,
442  * the second with the on-disk inode structure, and a possible third and/or
443  * fourth with the inode data/extents/b-tree root and inode attributes
444  * data/extents/b-tree root.
445  *
446  * Note: Always use the 64 bit inode log format structure so we don't
447  * leave an uninitialised hole in the format item on 64 bit systems. Log
448  * recovery on 32 bit systems handles this just fine, so there's no reason
449  * for not using an initialising the properly padded structure all the time.
450  */
451 STATIC void
452 xfs_inode_item_format(
453 	struct xfs_log_item	*lip,
454 	struct xfs_log_vec	*lv)
455 {
456 	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
457 	struct xfs_inode	*ip = iip->ili_inode;
458 	struct xfs_log_iovec	*vecp = NULL;
459 	struct xfs_inode_log_format *ilf;
460 
461 	ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT);
462 	ilf->ilf_type = XFS_LI_INODE;
463 	ilf->ilf_ino = ip->i_ino;
464 	ilf->ilf_blkno = ip->i_imap.im_blkno;
465 	ilf->ilf_len = ip->i_imap.im_len;
466 	ilf->ilf_boffset = ip->i_imap.im_boffset;
467 	ilf->ilf_fields = XFS_ILOG_CORE;
468 	ilf->ilf_size = 2; /* format + core */
469 
470 	/*
471 	 * make sure we don't leak uninitialised data into the log in the case
472 	 * when we don't log every field in the inode.
473 	 */
474 	ilf->ilf_dsize = 0;
475 	ilf->ilf_asize = 0;
476 	ilf->ilf_pad = 0;
477 	memset(&ilf->ilf_u, 0, sizeof(ilf->ilf_u));
478 
479 	xlog_finish_iovec(lv, vecp, sizeof(*ilf));
480 
481 	xfs_inode_item_format_core(ip, lv, &vecp);
482 	xfs_inode_item_format_data_fork(iip, ilf, lv, &vecp);
483 	if (xfs_inode_has_attr_fork(ip)) {
484 		xfs_inode_item_format_attr_fork(iip, ilf, lv, &vecp);
485 	} else {
486 		iip->ili_fields &=
487 			~(XFS_ILOG_ADATA | XFS_ILOG_ABROOT | XFS_ILOG_AEXT);
488 	}
489 
490 	/* update the format with the exact fields we actually logged */
491 	ilf->ilf_fields |= (iip->ili_fields & ~XFS_ILOG_TIMESTAMP);
492 }
493 
494 /*
495  * This is called to pin the inode associated with the inode log
496  * item in memory so it cannot be written out.
497  */
498 STATIC void
499 xfs_inode_item_pin(
500 	struct xfs_log_item	*lip)
501 {
502 	struct xfs_inode	*ip = INODE_ITEM(lip)->ili_inode;
503 
504 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
505 	ASSERT(lip->li_buf);
506 
507 	trace_xfs_inode_pin(ip, _RET_IP_);
508 	atomic_inc(&ip->i_pincount);
509 }
510 
511 
512 /*
513  * This is called to unpin the inode associated with the inode log
514  * item which was previously pinned with a call to xfs_inode_item_pin().
515  *
516  * Also wake up anyone in xfs_iunpin_wait() if the count goes to 0.
517  *
518  * Note that unpin can race with inode cluster buffer freeing marking the buffer
519  * stale. In that case, flush completions are run from the buffer unpin call,
520  * which may happen before the inode is unpinned. If we lose the race, there
521  * will be no buffer attached to the log item, but the inode will be marked
522  * XFS_ISTALE.
523  */
524 STATIC void
525 xfs_inode_item_unpin(
526 	struct xfs_log_item	*lip,
527 	int			remove)
528 {
529 	struct xfs_inode	*ip = INODE_ITEM(lip)->ili_inode;
530 
531 	trace_xfs_inode_unpin(ip, _RET_IP_);
532 	ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE));
533 	ASSERT(atomic_read(&ip->i_pincount) > 0);
534 	if (atomic_dec_and_test(&ip->i_pincount))
535 		wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
536 }
537 
538 STATIC uint
539 xfs_inode_item_push(
540 	struct xfs_log_item	*lip,
541 	struct list_head	*buffer_list)
542 		__releases(&lip->li_ailp->ail_lock)
543 		__acquires(&lip->li_ailp->ail_lock)
544 {
545 	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
546 	struct xfs_inode	*ip = iip->ili_inode;
547 	struct xfs_buf		*bp = lip->li_buf;
548 	uint			rval = XFS_ITEM_SUCCESS;
549 	int			error;
550 
551 	if (!bp || (ip->i_flags & XFS_ISTALE)) {
552 		/*
553 		 * Inode item/buffer is being aborted due to cluster
554 		 * buffer deletion. Trigger a log force to have that operation
555 		 * completed and items removed from the AIL before the next push
556 		 * attempt.
557 		 */
558 		return XFS_ITEM_PINNED;
559 	}
560 
561 	if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp))
562 		return XFS_ITEM_PINNED;
563 
564 	if (xfs_iflags_test(ip, XFS_IFLUSHING))
565 		return XFS_ITEM_FLUSHING;
566 
567 	if (!xfs_buf_trylock(bp))
568 		return XFS_ITEM_LOCKED;
569 
570 	spin_unlock(&lip->li_ailp->ail_lock);
571 
572 	/*
573 	 * We need to hold a reference for flushing the cluster buffer as it may
574 	 * fail the buffer without IO submission. In which case, we better get a
575 	 * reference for that completion because otherwise we don't get a
576 	 * reference for IO until we queue the buffer for delwri submission.
577 	 */
578 	xfs_buf_hold(bp);
579 	error = xfs_iflush_cluster(bp);
580 	if (!error) {
581 		if (!xfs_buf_delwri_queue(bp, buffer_list))
582 			rval = XFS_ITEM_FLUSHING;
583 		xfs_buf_relse(bp);
584 	} else {
585 		/*
586 		 * Release the buffer if we were unable to flush anything. On
587 		 * any other error, the buffer has already been released.
588 		 */
589 		if (error == -EAGAIN)
590 			xfs_buf_relse(bp);
591 		rval = XFS_ITEM_LOCKED;
592 	}
593 
594 	spin_lock(&lip->li_ailp->ail_lock);
595 	return rval;
596 }
597 
598 /*
599  * Unlock the inode associated with the inode log item.
600  */
601 STATIC void
602 xfs_inode_item_release(
603 	struct xfs_log_item	*lip)
604 {
605 	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
606 	struct xfs_inode	*ip = iip->ili_inode;
607 	unsigned short		lock_flags;
608 
609 	ASSERT(ip->i_itemp != NULL);
610 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
611 
612 	lock_flags = iip->ili_lock_flags;
613 	iip->ili_lock_flags = 0;
614 	if (lock_flags)
615 		xfs_iunlock(ip, lock_flags);
616 }
617 
618 /*
619  * This is called to find out where the oldest active copy of the inode log
620  * item in the on disk log resides now that the last log write of it completed
621  * at the given lsn.  Since we always re-log all dirty data in an inode, the
622  * latest copy in the on disk log is the only one that matters.  Therefore,
623  * simply return the given lsn.
624  *
625  * If the inode has been marked stale because the cluster is being freed, we
626  * don't want to (re-)insert this inode into the AIL. There is a race condition
627  * where the cluster buffer may be unpinned before the inode is inserted into
628  * the AIL during transaction committed processing. If the buffer is unpinned
629  * before the inode item has been committed and inserted, then it is possible
630  * for the buffer to be written and IO completes before the inode is inserted
631  * into the AIL. In that case, we'd be inserting a clean, stale inode into the
632  * AIL which will never get removed. It will, however, get reclaimed which
633  * triggers an assert in xfs_inode_free() complaining about freein an inode
634  * still in the AIL.
635  *
636  * To avoid this, just unpin the inode directly and return a LSN of -1 so the
637  * transaction committed code knows that it does not need to do any further
638  * processing on the item.
639  */
640 STATIC xfs_lsn_t
641 xfs_inode_item_committed(
642 	struct xfs_log_item	*lip,
643 	xfs_lsn_t		lsn)
644 {
645 	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
646 	struct xfs_inode	*ip = iip->ili_inode;
647 
648 	if (xfs_iflags_test(ip, XFS_ISTALE)) {
649 		xfs_inode_item_unpin(lip, 0);
650 		return -1;
651 	}
652 	return lsn;
653 }
654 
655 STATIC void
656 xfs_inode_item_committing(
657 	struct xfs_log_item	*lip,
658 	xfs_csn_t		seq)
659 {
660 	INODE_ITEM(lip)->ili_commit_seq = seq;
661 	return xfs_inode_item_release(lip);
662 }
663 
664 static const struct xfs_item_ops xfs_inode_item_ops = {
665 	.iop_size	= xfs_inode_item_size,
666 	.iop_format	= xfs_inode_item_format,
667 	.iop_pin	= xfs_inode_item_pin,
668 	.iop_unpin	= xfs_inode_item_unpin,
669 	.iop_release	= xfs_inode_item_release,
670 	.iop_committed	= xfs_inode_item_committed,
671 	.iop_push	= xfs_inode_item_push,
672 	.iop_committing	= xfs_inode_item_committing,
673 };
674 
675 
676 /*
677  * Initialize the inode log item for a newly allocated (in-core) inode.
678  */
679 void
680 xfs_inode_item_init(
681 	struct xfs_inode	*ip,
682 	struct xfs_mount	*mp)
683 {
684 	struct xfs_inode_log_item *iip;
685 
686 	ASSERT(ip->i_itemp == NULL);
687 	iip = ip->i_itemp = kmem_cache_zalloc(xfs_ili_cache,
688 					      GFP_KERNEL | __GFP_NOFAIL);
689 
690 	iip->ili_inode = ip;
691 	spin_lock_init(&iip->ili_lock);
692 	xfs_log_item_init(mp, &iip->ili_item, XFS_LI_INODE,
693 						&xfs_inode_item_ops);
694 }
695 
696 /*
697  * Free the inode log item and any memory hanging off of it.
698  */
699 void
700 xfs_inode_item_destroy(
701 	struct xfs_inode	*ip)
702 {
703 	struct xfs_inode_log_item *iip = ip->i_itemp;
704 
705 	ASSERT(iip->ili_item.li_buf == NULL);
706 
707 	ip->i_itemp = NULL;
708 	kmem_free(iip->ili_item.li_lv_shadow);
709 	kmem_cache_free(xfs_ili_cache, iip);
710 }
711 
712 
713 /*
714  * We only want to pull the item from the AIL if it is actually there
715  * and its location in the log has not changed since we started the
716  * flush.  Thus, we only bother if the inode's lsn has not changed.
717  */
718 static void
719 xfs_iflush_ail_updates(
720 	struct xfs_ail		*ailp,
721 	struct list_head	*list)
722 {
723 	struct xfs_log_item	*lip;
724 	xfs_lsn_t		tail_lsn = 0;
725 
726 	/* this is an opencoded batch version of xfs_trans_ail_delete */
727 	spin_lock(&ailp->ail_lock);
728 	list_for_each_entry(lip, list, li_bio_list) {
729 		xfs_lsn_t	lsn;
730 
731 		clear_bit(XFS_LI_FAILED, &lip->li_flags);
732 		if (INODE_ITEM(lip)->ili_flush_lsn != lip->li_lsn)
733 			continue;
734 
735 		/*
736 		 * dgc: Not sure how this happens, but it happens very
737 		 * occassionaly via generic/388.  xfs_iflush_abort() also
738 		 * silently handles this same "under writeback but not in AIL at
739 		 * shutdown" condition via xfs_trans_ail_delete().
740 		 */
741 		if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
742 			ASSERT(xlog_is_shutdown(lip->li_log));
743 			continue;
744 		}
745 
746 		lsn = xfs_ail_delete_one(ailp, lip);
747 		if (!tail_lsn && lsn)
748 			tail_lsn = lsn;
749 	}
750 	xfs_ail_update_finish(ailp, tail_lsn);
751 }
752 
753 /*
754  * Walk the list of inodes that have completed their IOs. If they are clean
755  * remove them from the list and dissociate them from the buffer. Buffers that
756  * are still dirty remain linked to the buffer and on the list. Caller must
757  * handle them appropriately.
758  */
759 static void
760 xfs_iflush_finish(
761 	struct xfs_buf		*bp,
762 	struct list_head	*list)
763 {
764 	struct xfs_log_item	*lip, *n;
765 
766 	list_for_each_entry_safe(lip, n, list, li_bio_list) {
767 		struct xfs_inode_log_item *iip = INODE_ITEM(lip);
768 		bool	drop_buffer = false;
769 
770 		spin_lock(&iip->ili_lock);
771 
772 		/*
773 		 * Remove the reference to the cluster buffer if the inode is
774 		 * clean in memory and drop the buffer reference once we've
775 		 * dropped the locks we hold.
776 		 */
777 		ASSERT(iip->ili_item.li_buf == bp);
778 		if (!iip->ili_fields) {
779 			iip->ili_item.li_buf = NULL;
780 			list_del_init(&lip->li_bio_list);
781 			drop_buffer = true;
782 		}
783 		iip->ili_last_fields = 0;
784 		iip->ili_flush_lsn = 0;
785 		spin_unlock(&iip->ili_lock);
786 		xfs_iflags_clear(iip->ili_inode, XFS_IFLUSHING);
787 		if (drop_buffer)
788 			xfs_buf_rele(bp);
789 	}
790 }
791 
792 /*
793  * Inode buffer IO completion routine.  It is responsible for removing inodes
794  * attached to the buffer from the AIL if they have not been re-logged and
795  * completing the inode flush.
796  */
797 void
798 xfs_buf_inode_iodone(
799 	struct xfs_buf		*bp)
800 {
801 	struct xfs_log_item	*lip, *n;
802 	LIST_HEAD(flushed_inodes);
803 	LIST_HEAD(ail_updates);
804 
805 	/*
806 	 * Pull the attached inodes from the buffer one at a time and take the
807 	 * appropriate action on them.
808 	 */
809 	list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
810 		struct xfs_inode_log_item *iip = INODE_ITEM(lip);
811 
812 		if (xfs_iflags_test(iip->ili_inode, XFS_ISTALE)) {
813 			xfs_iflush_abort(iip->ili_inode);
814 			continue;
815 		}
816 		if (!iip->ili_last_fields)
817 			continue;
818 
819 		/* Do an unlocked check for needing the AIL lock. */
820 		if (iip->ili_flush_lsn == lip->li_lsn ||
821 		    test_bit(XFS_LI_FAILED, &lip->li_flags))
822 			list_move_tail(&lip->li_bio_list, &ail_updates);
823 		else
824 			list_move_tail(&lip->li_bio_list, &flushed_inodes);
825 	}
826 
827 	if (!list_empty(&ail_updates)) {
828 		xfs_iflush_ail_updates(bp->b_mount->m_ail, &ail_updates);
829 		list_splice_tail(&ail_updates, &flushed_inodes);
830 	}
831 
832 	xfs_iflush_finish(bp, &flushed_inodes);
833 	if (!list_empty(&flushed_inodes))
834 		list_splice_tail(&flushed_inodes, &bp->b_li_list);
835 }
836 
837 void
838 xfs_buf_inode_io_fail(
839 	struct xfs_buf		*bp)
840 {
841 	struct xfs_log_item	*lip;
842 
843 	list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
844 		set_bit(XFS_LI_FAILED, &lip->li_flags);
845 }
846 
847 /*
848  * Clear the inode logging fields so no more flushes are attempted.  If we are
849  * on a buffer list, it is now safe to remove it because the buffer is
850  * guaranteed to be locked. The caller will drop the reference to the buffer
851  * the log item held.
852  */
853 static void
854 xfs_iflush_abort_clean(
855 	struct xfs_inode_log_item *iip)
856 {
857 	iip->ili_last_fields = 0;
858 	iip->ili_fields = 0;
859 	iip->ili_fsync_fields = 0;
860 	iip->ili_flush_lsn = 0;
861 	iip->ili_item.li_buf = NULL;
862 	list_del_init(&iip->ili_item.li_bio_list);
863 }
864 
865 /*
866  * Abort flushing the inode from a context holding the cluster buffer locked.
867  *
868  * This is the normal runtime method of aborting writeback of an inode that is
869  * attached to a cluster buffer. It occurs when the inode and the backing
870  * cluster buffer have been freed (i.e. inode is XFS_ISTALE), or when cluster
871  * flushing or buffer IO completion encounters a log shutdown situation.
872  *
873  * If we need to abort inode writeback and we don't already hold the buffer
874  * locked, call xfs_iflush_shutdown_abort() instead as this should only ever be
875  * necessary in a shutdown situation.
876  */
877 void
878 xfs_iflush_abort(
879 	struct xfs_inode	*ip)
880 {
881 	struct xfs_inode_log_item *iip = ip->i_itemp;
882 	struct xfs_buf		*bp;
883 
884 	if (!iip) {
885 		/* clean inode, nothing to do */
886 		xfs_iflags_clear(ip, XFS_IFLUSHING);
887 		return;
888 	}
889 
890 	/*
891 	 * Remove the inode item from the AIL before we clear its internal
892 	 * state. Whilst the inode is in the AIL, it should have a valid buffer
893 	 * pointer for push operations to access - it is only safe to remove the
894 	 * inode from the buffer once it has been removed from the AIL.
895 	 *
896 	 * We also clear the failed bit before removing the item from the AIL
897 	 * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer
898 	 * references the inode item owns and needs to hold until we've fully
899 	 * aborted the inode log item and detached it from the buffer.
900 	 */
901 	clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags);
902 	xfs_trans_ail_delete(&iip->ili_item, 0);
903 
904 	/*
905 	 * Grab the inode buffer so can we release the reference the inode log
906 	 * item holds on it.
907 	 */
908 	spin_lock(&iip->ili_lock);
909 	bp = iip->ili_item.li_buf;
910 	xfs_iflush_abort_clean(iip);
911 	spin_unlock(&iip->ili_lock);
912 
913 	xfs_iflags_clear(ip, XFS_IFLUSHING);
914 	if (bp)
915 		xfs_buf_rele(bp);
916 }
917 
918 /*
919  * Abort an inode flush in the case of a shutdown filesystem. This can be called
920  * from anywhere with just an inode reference and does not require holding the
921  * inode cluster buffer locked. If the inode is attached to a cluster buffer,
922  * it will grab and lock it safely, then abort the inode flush.
923  */
924 void
925 xfs_iflush_shutdown_abort(
926 	struct xfs_inode	*ip)
927 {
928 	struct xfs_inode_log_item *iip = ip->i_itemp;
929 	struct xfs_buf		*bp;
930 
931 	if (!iip) {
932 		/* clean inode, nothing to do */
933 		xfs_iflags_clear(ip, XFS_IFLUSHING);
934 		return;
935 	}
936 
937 	spin_lock(&iip->ili_lock);
938 	bp = iip->ili_item.li_buf;
939 	if (!bp) {
940 		spin_unlock(&iip->ili_lock);
941 		xfs_iflush_abort(ip);
942 		return;
943 	}
944 
945 	/*
946 	 * We have to take a reference to the buffer so that it doesn't get
947 	 * freed when we drop the ili_lock and then wait to lock the buffer.
948 	 * We'll clean up the extra reference after we pick up the ili_lock
949 	 * again.
950 	 */
951 	xfs_buf_hold(bp);
952 	spin_unlock(&iip->ili_lock);
953 	xfs_buf_lock(bp);
954 
955 	spin_lock(&iip->ili_lock);
956 	if (!iip->ili_item.li_buf) {
957 		/*
958 		 * Raced with another removal, hold the only reference
959 		 * to bp now. Inode should not be in the AIL now, so just clean
960 		 * up and return;
961 		 */
962 		ASSERT(list_empty(&iip->ili_item.li_bio_list));
963 		ASSERT(!test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags));
964 		xfs_iflush_abort_clean(iip);
965 		spin_unlock(&iip->ili_lock);
966 		xfs_iflags_clear(ip, XFS_IFLUSHING);
967 		xfs_buf_relse(bp);
968 		return;
969 	}
970 
971 	/*
972 	 * Got two references to bp. The first will get dropped by
973 	 * xfs_iflush_abort() when the item is removed from the buffer list, but
974 	 * we can't drop our reference until _abort() returns because we have to
975 	 * unlock the buffer as well. Hence we abort and then unlock and release
976 	 * our reference to the buffer.
977 	 */
978 	ASSERT(iip->ili_item.li_buf == bp);
979 	spin_unlock(&iip->ili_lock);
980 	xfs_iflush_abort(ip);
981 	xfs_buf_relse(bp);
982 }
983 
984 
985 /*
986  * convert an xfs_inode_log_format struct from the old 32 bit version
987  * (which can have different field alignments) to the native 64 bit version
988  */
989 int
990 xfs_inode_item_format_convert(
991 	struct xfs_log_iovec		*buf,
992 	struct xfs_inode_log_format	*in_f)
993 {
994 	struct xfs_inode_log_format_32	*in_f32 = buf->i_addr;
995 
996 	if (buf->i_len != sizeof(*in_f32)) {
997 		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
998 		return -EFSCORRUPTED;
999 	}
1000 
1001 	in_f->ilf_type = in_f32->ilf_type;
1002 	in_f->ilf_size = in_f32->ilf_size;
1003 	in_f->ilf_fields = in_f32->ilf_fields;
1004 	in_f->ilf_asize = in_f32->ilf_asize;
1005 	in_f->ilf_dsize = in_f32->ilf_dsize;
1006 	in_f->ilf_ino = in_f32->ilf_ino;
1007 	memcpy(&in_f->ilf_u, &in_f32->ilf_u, sizeof(in_f->ilf_u));
1008 	in_f->ilf_blkno = in_f32->ilf_blkno;
1009 	in_f->ilf_len = in_f32->ilf_len;
1010 	in_f->ilf_boffset = in_f32->ilf_boffset;
1011 	return 0;
1012 }
1013