xref: /openbmc/linux/fs/jbd2/commit.c (revision 7fc96d71)
// SPDX-License-Identifier: GPL-2.0+
/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>

/*
 * IO end handler for temporary buffer_heads handling writes to the journal.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	struct buffer_head *orig_bh = bh->b_private;

	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	if (orig_bh) {
		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
		smp_mb__after_atomic();
		wake_up_bit(&orig_bh->b_state, BH_Shadow);
	}
	unlock_buffer(bh);
}
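
/*
 * A minimal sketch of the waiter side that pairs with the
 * clear_bit_unlock()/wake_up_bit() sequence above.  In this tree the
 * waiter lives in do_get_write_access() (fs/jbd2/transaction.c); the
 * shape is roughly:
 *
 *	if (buffer_shadow(bh)) {
 *		spin_unlock(&jh->b_state_lock);
 *		wait_on_bit_io(&bh->b_state, BH_Shadow,
 *			       TASK_UNINTERRUPTIBLE);
 *		goto repeat;
 *	}
 *
 * clear_bit_unlock() provides release semantics, and the
 * smp_mb__after_atomic() orders the bit clear against the waitqueue
 * check inside wake_up_bit(), so the waiter cannot miss the wakeup.
 */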

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct folio *folio;
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	folio = page_folio(page);
	if (folio->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!folio_trylock(folio))
		goto nope;

	folio_get(folio);
	__brelse(bh);
	try_to_free_buffers(folio);
	folio_unlock(folio);
	folio_put(folio);
	return;

nope:
	__brelse(bh);
}

static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
{
	struct commit_header *h;
	__u32 csum;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	h = (struct commit_header *)(bh->b_data);
	h->h_chksum_type = 0;
	h->h_chksum_size = 0;
	h->h_chksum[0] = 0;
	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
	h->h_chksum[0] = cpu_to_be32(csum);
}
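
/*
 * For illustration only: the recovery-side counterpart (in this tree,
 * jbd2_commit_block_csum_verify() in fs/jbd2/recovery.c) checks the value
 * set above by recomputing the checksum with the field zeroed out.  A
 * rough sketch, under that assumption:
 *
 *	provided = h->h_chksum[0];
 *	h->h_chksum[0] = 0;
 *	calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
 *	h->h_chksum[0] = provided;
 *	return provided == cpu_to_be32(calculated);
 */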

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	struct timespec64 now;

	*cbh = NULL;

	if (is_journal_aborted(journal))
		return 0;

	bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
						JBD2_COMMIT_BLOCK);
	if (!bh)
		return 1;

	tmp = (struct commit_header *)bh->b_data;
	ktime_get_coarse_real_ts64(&now);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (jbd2_has_feature_checksum(journal)) {
		tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
	}
	jbd2_commit_block_csum_set(journal, bh);

	BUFFER_TRACE(bh, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	if (journal->j_flags & JBD2_BARRIER &&
	    !jbd2_has_feature_async_commit(journal))
		ret = submit_bh(REQ_OP_WRITE,
			REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh);
	else
		ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);

	*cbh = bh;
	return ret;
}

/*
 * This function, along with journal_submit_commit_record(), allows the
 * commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);            /* One for getblk() */

	return ret;
}
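
/*
 * A sketch of how the two halves are used for an async commit (this is
 * the sequence jbd2_journal_commit_transaction() below follows; error
 * handling omitted):
 *
 *	if (jbd2_has_feature_async_commit(journal))
 *		journal_submit_commit_record(journal, txn, &cbh, crc32_sum);
 *	... wait for the metadata and control buffers to complete ...
 *	if (!jbd2_has_feature_async_commit(journal))
 *		journal_submit_commit_record(journal, txn, &cbh, crc32_sum);
 *	if (cbh)
 *		journal_wait_on_commit_record(journal, cbh);
 *
 * With async commit the record rides alongside the block writes, and the
 * crc32 carried in it lets recovery detect a commit block that reached
 * disk before the blocks it describes.
 */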

/*
 * Write the filemap data using the writepage() address_space_operation.
 * We don't do block allocation here, even for delalloc. We don't
 * use writepages() because with delayed allocation we may be doing
 * block allocation in writepages().
 */
int jbd2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
{
	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = jinode->i_dirty_start,
		.range_end = jinode->i_dirty_end,
	};

	/*
	 * Submit the inode data buffers. We use writepage
	 * instead of writepages because writepages can do
	 * block allocation with delalloc. We need to write
	 * only allocated blocks here.
	 */
	return generic_writepages(mapping, &wbc);
}

/* Send all the data buffers related to an inode */
int jbd2_submit_inode_data(struct jbd2_inode *jinode)
{
	if (!jinode || !(jinode->i_flags & JI_WRITE_DATA))
		return 0;

	trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
	return jbd2_journal_submit_inode_data_buffers(jinode);
}
EXPORT_SYMBOL(jbd2_submit_inode_data);

int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode)
{
	if (!jinode || !(jinode->i_flags & JI_WAIT_DATA) ||
		!jinode->i_vfs_inode || !jinode->i_vfs_inode->i_mapping)
		return 0;
	return filemap_fdatawait_range_keep_errors(
		jinode->i_vfs_inode->i_mapping, jinode->i_dirty_start,
		jinode->i_dirty_end);
}
EXPORT_SYMBOL(jbd2_wait_inode_data);

/*
 * Submit all the data buffers of the inodes associated with the
 * transaction to disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added
 * to our inode list. We use the JI_COMMIT_RUNNING flag to protect the
 * inode we are currently operating on from being released while we write
 * out its pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		if (!(jinode->i_flags & JI_WRITE_DATA))
			continue;
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		/* submit the inode data buffers. */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		if (journal->j_submit_inode_data_buffers) {
			err = journal->j_submit_inode_data_buffers(jinode);
			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}
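
/*
 * Illustrative only: the release side that JI_COMMIT_RUNNING guards
 * against sleeps on the same bit we wake above.  In this tree that is
 * jbd2_journal_release_jbd_inode() (fs/jbd2/journal.c); a rough sketch:
 *
 *	spin_lock(&journal->j_list_lock);
 *	if (jinode->i_flags & JI_COMMIT_RUNNING) {
 *		wait_queue_head_t *wq;
 *		DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
 *		wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
 *		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 *		spin_unlock(&journal->j_list_lock);
 *		schedule();
 *		finish_wait(wq, &wait.wq_entry);
 *		goto restart;
 *	}
 *
 * The smp_mb() above pairs with the flag test so the waiter cannot miss
 * the JI_COMMIT_RUNNING clear.
 */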

int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode)
{
	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;

	return filemap_fdatawait_range_keep_errors(mapping,
						   jinode->i_dirty_start,
						   jinode->i_dirty_end);
}

/*
 * Wait for data submitted for writeout, and refile inodes to the proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		if (!(jinode->i_flags & JI_WAIT_DATA))
			continue;
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		/* wait for the inode data buffers writeout. */
		if (journal->j_finish_inode_data_buffers) {
			err = journal->j_finish_inode_data_buffers(jinode);
			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
			jinode->i_dirty_start = 0;
			jinode->i_dirty_end = 0;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

	addr = kmap_atomic(page);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr);

	return checksum;
}

static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
				   unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (jbd2_has_feature_64bit(j))
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
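
/*
 * Worked example (illustrative): for block 0x123456789 on a 64-bit
 * journal, t_blocknr holds the low 32 bits (0x23456789) and
 * t_blocknr_high the high 32 bits (0x1).  The shift is written as
 * (block >> 31) >> 1 rather than block >> 32, presumably so the
 * expression stays well-defined even if the value is ever only 32 bits
 * wide (a shift by the full width of a type is undefined in C).
 */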

static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
				    struct buffer_head *bh, __u32 sequence)
{
	journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
	struct page *page = bh->b_page;
	__u8 *addr;
	__u32 csum32;
	__be32 seq;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	seq = cpu_to_be32(sequence);
	addr = kmap_atomic(page);
	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
	csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
			     bh->b_size);
	kunmap_atomic(addr);

	if (jbd2_has_feature_csum3(j))
		tag3->t_checksum = cpu_to_be32(csum32);
	else
		tag->t_checksum = cpu_to_be16(csum32);
}
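
/*
 * Note on the two layouts above: csum3 tags have room for the full
 * 32-bit checksum, while the older csum2 journal_block_tag_t only has a
 * 16-bit t_checksum field, so cpu_to_be16(csum32) deliberately keeps
 * just the low 16 bits.  Conceptually the checksum covers
 * (seed, sequence, block data), chained:
 *
 *	csum32 = jbd2_chksum(j, j->j_csum_seed, &seq, sizeof(seq));
 *	csum32 = jbd2_chksum(j, csum32, data, bh->b_size);
 */
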
/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh;
	struct buffer_head *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	struct blk_plug plug;
	/* Tail of the journal */
	unsigned long first_block;
	tid_t first_tid;
	int update_tail;
	int csum_size = 0;
	LIST_HEAD(io_bufs);
	LIST_HEAD(log_bufs);

	if (jbd2_journal_has_csum_v2or3(journal))
		csum_size = sizeof(struct jbd2_journal_block_tail);

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		mutex_lock_io(&journal->j_checkpoint_mutex);
		/*
		 * We hold j_checkpoint_mutex so tail cannot change under us.
		 * We don't need any special data guarantees for writing sb
		 * since journal is empty and it is ok for write to be
		 * flushed only with transaction commit.
		 */
		jbd2_journal_update_sb_log_tail(journal,
						journal->j_tail_sequence,
						journal->j_tail,
						REQ_SYNC);
		mutex_unlock(&journal->j_checkpoint_mutex);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	write_lock(&journal->j_state_lock);
	journal->j_flags |= JBD2_FULL_COMMIT_ONGOING;
	while (journal->j_flags & JBD2_FAST_COMMIT_ONGOING) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_fc_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		write_unlock(&journal->j_state_lock);
		schedule();
		write_lock(&journal->j_state_lock);
		finish_wait(&journal->j_fc_wait, &wait);
		/*
		 * TODO: by blocking fast commits here, we are increasing
		 * fsync() latency slightly. Strictly speaking, we don't need
		 * to block fast commits until the transaction enters T_FLUSH
		 * state. So an optimization is possible where we block new fast
		 * commits here and wait for existing ones to complete
		 * just before we enter T_FLUSH. That way, the existing fast
		 * commits and this full commit can proceed in parallel.
		 */
	}
	write_unlock(&journal->j_state_lock);

	commit_transaction = journal->j_running_transaction;

	trace_jbd2_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	write_lock(&journal->j_state_lock);
	journal->j_fc_off = 0;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd2_commit_locking(journal, commit_transaction);
	stats.run.rs_wait = commit_transaction->t_max_wait;
	stats.run.rs_request_delay = 0;
	stats.run.rs_locked = jiffies;
	if (commit_transaction->t_requested)
		stats.run.rs_request_delay =
			jbd2_time_diff(commit_transaction->t_requested,
				       stats.run.rs_locked);
	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
					      stats.run.rs_locked);

	/* waits for any t_updates to finish */
	jbd2_journal_wait_updates(journal);

	commit_transaction->t_state = T_SWITCH;

	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 * We use journal->j_state_lock here to serialize processing of
	 * t_reserved_list with eviction of buffers from journal_unmap_buffer().
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			spin_lock(&jh->b_state_lock);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			spin_unlock(&jh->b_state_lock);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	write_unlock(&journal->j_state_lock);
	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal, false);
	spin_unlock(&journal->j_list_lock);

	jbd_debug(3, "JBD2: commit phase 1\n");

	/*
	 * Clear the revoked flag to reflect that there are no revoked buffers
	 * in the next transaction which is going to be started.
	 */
	jbd2_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	/*
	 * Reserved credits cannot be claimed anymore, free them
	 */
	atomic_sub(atomic_read(&journal->j_reserved_credits),
		   &commit_transaction->t_outstanding_credits);

	write_lock(&journal->j_state_lock);
	trace_jbd2_commit_flushing(journal, commit_transaction);
	stats.run.rs_flushing = jiffies;
	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
					     stats.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	write_unlock(&journal->j_state_lock);

	jbd_debug(3, "JBD2: commit phase 2a\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	blk_start_plug(&plug);
	jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);

	jbd_debug(3, "JBD2: commit phase 2b\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	write_unlock(&journal->j_state_lock);

	trace_jbd2_commit_logging(journal, commit_transaction);
	stats.run.rs_logging = jiffies;
	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
					       stats.run.rs_logging);
	stats.run.rs_blocks = commit_transaction->t_nr_buffers;
	stats.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 atomic_read(&commit_transaction->t_outstanding_credits));

	err = 0;
	bufs = 0;
	descriptor = NULL;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD2: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(
							commit_transaction,
							JBD2_DESCRIPTOR_BLOCK);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
				(unsigned long long)descriptor->b_blocknr,
				descriptor->b_data);
			tagp = &descriptor->b_data[sizeof(journal_header_t)];
			space_left = descriptor->b_size -
						sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(descriptor);
			set_buffer_dirty(descriptor);
			wbuf[bufs++] = descriptor;

			/* Record it so that we can wait for IO
			   completion later */
			BUFFER_TRACE(descriptor, "ph3: file as descriptor");
			jbd2_file_log_bh(&log_bufs, descriptor);
		}
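
		/*
		 * Sketch of the on-disk descriptor block being filled in
		 * here (tag size depends on the 64-bit and csum3 features):
		 *
		 *	+-------------------------+  offset 0
		 *	| journal_header_t        |  magic, blocktype, sequence
		 *	+-------------------------+
		 *	| tag 0                   |  followed by 16-byte UUID
		 *	| tag 1 ... tag N         |  JBD2_FLAG_SAME_UUID set
		 *	+-------------------------+
		 *	| jbd2_journal_block_tail |  only with csum v2/v3
		 *	+-------------------------+
		 *
		 * The last tag written gets JBD2_FLAG_LAST_TAG before the
		 * block is submitted below.
		 */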

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log.
		 */
		atomic_dec(&commit_transaction->t_outstanding_credits);

		/* Bump b_count to prevent truncate from stumbling over
		   the shadowed buffer!  @@@ This can go if we ever get
		   rid of the shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/*
		 * Make a temporary IO buffer with which to write it out
		 * (this will requeue the metadata buffer to BJ_Shadow).
		 */
		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						jh, &wbuf[bufs], blocknr);
		if (flags < 0) {
			jbd2_journal_abort(journal, flags);
			continue;
		}
		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);

		/* Record the new block's tag in the current descriptor
		   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be16(tag_flag);
		jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
					commit_transaction->t_tid);
		tagp += tag_bytes;
		space_left -= tag_bytes;
		bufs++;

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16 + csum_size) {
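
			/*
			 * The space check above, spelled out: the next tag
			 * needs tag_bytes, plus 16 bytes in case a UUID would
			 * follow it, and csum_size keeps room for the
			 * jbd2_journal_block_tail checksum at the end of the
			 * block.  The 16 bytes are a conservative reserve:
			 * only the first tag in a descriptor is actually
			 * followed by the UUID.
			 */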

			jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
			   submitting the IOs.  "tag" still points to
			   the last tag we set up. */

			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
start_journal_io:
			if (descriptor)
				jbd2_descriptor_block_csum_set(journal,
							descriptor);

			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (jbd2_has_feature_checksum(journal)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
			}
			cond_resched();

			/* Force a new descriptor to be generated next
			   time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
			"JBD2: Detected IO errors while flushing file data "
		       "on %s\n", journal->j_devname);
		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
			jbd2_journal_abort(journal, err);
		err = 0;
	}

	/*
	 * Get current oldest transaction in the log before we issue flush
	 * to the filesystem device. After the flush we can be sure that
	 * blocks of all older transactions are checkpointed to persistent
	 * storage and we will be safe to update journal start in the
	 * superblock with the numbers we get here.
	 */
	update_tail =
		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);

	write_lock(&journal->j_state_lock);
	if (update_tail) {
		long freed = first_block - journal->j_tail;

		if (first_block < journal->j_tail)
			freed += journal->j_last - journal->j_first;
		/* Update tail only if we free significant amount of space */
		if (freed < jbd2_journal_get_max_txn_bufs(journal))
			update_tail = 0;
	}
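
	/*
	 * Illustrative numbers for the wraparound math above: with
	 * j_first = 1, j_last = 1000, j_tail = 900 and first_block = 100,
	 * freed = 100 - 900 = -800, then + (1000 - 1) = 199 blocks, i.e.
	 * the distance from the old tail to the new one around the end of
	 * the circular log.
	 */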
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_DFLUSH;
	write_unlock(&journal->j_state_lock);

	/*
	 * If the journal is not located on the file system device,
	 * then we must flush the file system device before we issue
	 * the commit record
	 */
	if (commit_transaction->t_need_data_flush &&
	    (journal->j_fs_dev != journal->j_dev) &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev);

	/* Done it all: now write the commit record asynchronously. */
	if (jbd2_has_feature_async_commit(journal)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						 &cbh, crc32_sum);
		if (err)
			jbd2_journal_abort(journal, err);
	}

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
	   the log.  Before we can commit it, wait for the IO so far to
	   complete.  Control buffers being written are on the
	   transaction's t_log_list queue, and metadata buffers are on
	   the io_bufs list.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD2: commit phase 3\n");

	while (!list_empty(&io_bufs)) {
		struct buffer_head *bh = list_entry(io_bufs.prev,
						    struct buffer_head,
						    b_assoc_buffers);

		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;
		jbd2_unfile_log_bh(bh);
		stats.run.rs_blocks_logged++;

		/*
		 * The list contains temporary buffer heads created by
		 * jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to refile the corresponding shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_buffer_jwrite(bh);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));
		J_ASSERT_BH(bh, !buffer_shadow(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD2: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
	while (!list_empty(&log_bufs)) {
		struct buffer_head *bh;

		bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_unfile_log_bh(bh);
		stats.run.rs_blocks_logged++;
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd_debug(3, "JBD2: commit phase 5\n");
	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
	commit_transaction->t_state = T_COMMIT_JFLUSH;
	write_unlock(&journal->j_state_lock);

	if (!jbd2_has_feature_async_commit(journal)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			jbd2_journal_abort(journal, err);
	}
	if (cbh)
		err = journal_wait_on_commit_record(journal, cbh);
	stats.run.rs_blocks_logged++;
	if (jbd2_has_feature_async_commit(journal) &&
	    journal->j_flags & JBD2_BARRIER) {
		blkdev_issue_flush(journal->j_dev);
	}

	if (err)
		jbd2_journal_abort(journal, err);

	WARN_ON_ONCE(
		atomic_read(&commit_transaction->t_outstanding_credits) < 0);

	/*
	 * Now disk caches for filesystem device are flushed so we are safe to
	 * erase checkpointed transactions from the log by updating journal
	 * superblock.
	 */
	if (update_tail)
		jbd2_update_log_tail(journal, first_tid, first_block);

	/* End of a transaction!  Finally, we can do checkpoint
	   processing: any buffers committed as a result of this
	   transaction can be removed from any checkpoint list it was on
	   before. */

	jbd_debug(3, "JBD2: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;
		int try_to_free = 0;
		bool drop_ref;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		/*
		 * Get a reference so that bh cannot be freed before we are
		 * done with it.
		 */
		get_bh(bh);
		spin_lock(&jh->b_state_lock);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/*
		 * A buffer which has been freed while still being journaled
		 * by a previous transaction is refiled to BJ_Forget of the
		 * running transaction. If the just committed transaction
		 * contains an "add to orphan" operation, we can completely
		 * invalidate the buffer now. We are rather thorough in that,
		 * since the buffer may still be accessible when blocksize <
		 * pagesize and it is attached to the last partial page.
		 */
		if (buffer_freed(bh) && !jh->b_next_transaction) {
			struct address_space *mapping;

			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);

			/*
			 * Block device buffers need to stay mapped all the
			 * time, so it is enough to clear buffer_jbddirty and
			 * buffer_freed bits. For the file mapping buffers (i.e.
			 * journalled data) we need to unmap buffer and clear
			 * more bits. We also need to be careful about the check
			 * because the data page mapping can get cleared under
			 * our hands. Note that if mapping == NULL, we don't
			 * need to make buffer unmapped because the page is
			 * already detached from the mapping and buffers cannot
			 * get reused.
			 */
			mapping = READ_ONCE(bh->b_page->mapping);
			if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
				clear_buffer_mapped(bh);
				clear_buffer_new(bh);
				clear_buffer_req(bh);
				bh->b_bdev = NULL;
			}
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/*
			 * The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list.
			 */
			if (!jh->b_next_transaction)
				try_to_free = 1;
		}
		JBUFFER_TRACE(jh, "refile or unfile buffer");
		drop_ref = __jbd2_journal_refile_buffer(jh);
		spin_unlock(&jh->b_state_lock);
		if (drop_ref)
			jbd2_journal_put_journal_head(jh);
		if (try_to_free)
			release_buffer_page(bh);	/* Drops bh reference */
		else
			__brelse(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Add the transaction to the checkpoint list
	 * __journal_remove_checkpoint() can not destroy transaction
	 * under us because it is not marked as T_FINISHED yet */
	if (journal->j_checkpoint_transactions == NULL) {
		journal->j_checkpoint_transactions = commit_transaction;
		commit_transaction->t_cpnext = commit_transaction;
		commit_transaction->t_cpprev = commit_transaction;
	} else {
		commit_transaction->t_cpnext =
			journal->j_checkpoint_transactions;
		commit_transaction->t_cpprev =
			commit_transaction->t_cpnext->t_cpprev;
		commit_transaction->t_cpnext->t_cpprev =
			commit_transaction;
		commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
	}
	spin_unlock(&journal->j_list_lock);

	/* Done with this transaction! */

	jbd_debug(3, "JBD2: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);
	stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;

	commit_transaction->t_state = T_COMMIT_CALLBACK;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;
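
	/*
	 * I.e. an exponentially weighted moving average,
	 * avg' = (commit_time + 3 * avg) / 4.  For example, with a running
	 * average of 8 ms, a single 40 ms commit only moves it to
	 * (40 + 3 * 8) / 4 = 16 ms.
	 */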

	write_unlock(&journal->j_state_lock);

	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);
	if (journal->j_fc_cleanup_callback)
		journal->j_fc_cleanup_callback(journal, 1, commit_transaction->t_tid);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);

	write_lock(&journal->j_state_lock);
	journal->j_flags &= ~JBD2_FULL_COMMIT_ONGOING;
	journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
	spin_lock(&journal->j_list_lock);
	commit_transaction->t_state = T_FINISHED;
	/* Check if the transaction can be dropped now that we are finished */
	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		jbd2_journal_free_transaction(commit_transaction);
	}
	spin_unlock(&journal->j_list_lock);
	write_unlock(&journal->j_state_lock);
	wake_up(&journal->j_wait_done_commit);
	wake_up(&journal->j_fc_wait);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.ts_requested += stats.ts_requested;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);
}