/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>

/*
 * IO end handler for temporary buffer_heads handling writes to the journal.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	struct buffer_head *orig_bh = bh->b_private;

	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
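	/*
	 * Writers of the original buffer block on its BH_Shadow bit (see
	 * do_get_write_access()); clearing the bit and waking the bit
	 * waitqueue here lets them proceed once the journal copy is on disk.
	 */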
	if (orig_bh) {
		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
		smp_mb__after_atomic();
		wake_up_bit(&orig_bh->b_state, BH_Shadow);
	}
	unlock_buffer(bh);
}

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	get_page(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	put_page(page);
	return;

nope:
	__brelse(bh);
}

static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
{
	struct commit_header *h;
	__u32 csum;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	h = (struct commit_header *)(bh->b_data);
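	/*
	 * Zero the checksum fields first so the checksum is computed over
	 * the block exactly as the verifier will see it, since verification
	 * redoes the computation with these fields zeroed.
	 */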
	h->h_chksum_type = 0;
	h->h_chksum_size = 0;
	h->h_chksum[0] = 0;
	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
	h->h_chksum[0] = cpu_to_be32(csum);
}

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	struct timespec now = current_kernel_time();

	*cbh = NULL;

	if (is_journal_aborted(journal))
		return 0;

	bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
						JBD2_COMMIT_BLOCK);
	if (!bh)
		return 1;

	tmp = (struct commit_header *)bh->b_data;
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (jbd2_has_feature_checksum(journal)) {
		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
	}
	jbd2_commit_block_csum_set(journal, bh);

	BUFFER_TRACE(bh, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

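	/*
	 * With barriers enabled and no async commit, WRITE_FLUSH_FUA makes
	 * the block layer flush the device cache before this block is
	 * written and forces the block itself to stable storage, which is
	 * what makes the commit record durable and correctly ordered.
	 */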
	if (journal->j_flags & JBD2_BARRIER &&
	    !jbd2_has_feature_async_commit(journal))
		ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
	else
		ret = submit_bh(WRITE_SYNC, bh);

	*cbh = bh;
	return ret;
}

/*
 * This function along with journal_submit_commit_record
 * allows writing the commit record asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);            /* One for getblk() */

	return ret;
}

/*
 * write the filemap data using writepage() address_space_operations.
 * We don't do block allocation here even for delalloc. We don't
 * use writepages() because with delayed allocation we may be doing
 * block allocation in writepages().
 */
static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode =  WB_SYNC_ALL,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = 0,
		.range_end = i_size_read(mapping->host),
	};
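
	/*
	 * nr_to_write is twice the number of cached pages, presumably to
	 * leave some headroom in case pages are dirtied while the
	 * writeout runs.
	 */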
	ret = generic_writepages(mapping, &wbc);
	return ret;
}

/*
 * Submit all the data buffers of inodes associated with the transaction
 * to disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added to
 * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we
 * currently operate on from being released while we write out pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		if (!(jinode->i_flags & JI_WRITE_DATA))
			continue;
		mapping = jinode->i_vfs_inode->i_mapping;
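		/*
		 * JI_COMMIT_RUNNING keeps this jbd2_inode alive while we drop
		 * j_list_lock to submit IO; jbd2_journal_release_jbd_inode()
		 * waits for the bit to clear before the structure can go away.
		 */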
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		/*
		 * Submit the inode data buffers. We use writepage
		 * instead of writepages because writepages can do
		 * block allocation with delalloc. We need to write
		 * only allocated blocks here.
		 */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		err = journal_submit_inode_data_buffers(mapping);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}

/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		if (!(jinode->i_flags & JI_WAIT_DATA))
			continue;
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		if (err) {
			/*
			 * Because AS_EIO is cleared by
			 * filemap_fdatawait_range(), set it again so
			 * that user process can get -EIO from fsync().
			 */
			set_bit(AS_EIO,
				&jinode->i_vfs_inode->i_mapping->flags);

			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

	addr = kmap_atomic(page);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr);

	return checksum;
}

static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
				   unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
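	/*
	 * (block >> 31) >> 1 is equivalent to block >> 32; it is presumably
	 * split this way so the shift stays well defined even if the block
	 * type were ever only 32 bits wide.
	 */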
	if (jbd2_has_feature_64bit(j))
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}

static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
				    struct buffer_head *bh, __u32 sequence)
{
	journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
	struct page *page = bh->b_page;
	__u8 *addr;
	__u32 csum32;
	__be32 seq;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	seq = cpu_to_be32(sequence);
	addr = kmap_atomic(page);
	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
	csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
			     bh->b_size);
	kunmap_atomic(addr);

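	/*
	 * The csum3 tag format has room for the full 32-bit checksum; the
	 * older csum2 format only has a 16-bit field, so the value is
	 * truncated to its low 16 bits there.
	 */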
	if (jbd2_has_feature_csum3(j))
		tag3->t_checksum = cpu_to_be32(csum32);
	else
		tag->t_checksum = cpu_to_be16(csum32);
}

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh;
	struct buffer_head *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	struct blk_plug plug;
	/* Tail of the journal */
	unsigned long first_block;
	tid_t first_tid;
	int update_tail;
	int csum_size = 0;
	LIST_HEAD(io_bufs);
	LIST_HEAD(log_bufs);

	if (jbd2_journal_has_csum_v2or3(journal))
		csum_size = sizeof(struct jbd2_journal_block_tail);

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		mutex_lock(&journal->j_checkpoint_mutex);
		/*
		 * We hold j_checkpoint_mutex so tail cannot change under us.
		 * We don't need any special data guarantees for writing sb
		 * since journal is empty and it is ok for write to be
		 * flushed only with transaction commit.
		 */
		jbd2_journal_update_sb_log_tail(journal,
						journal->j_tail_sequence,
						journal->j_tail,
						WRITE_SYNC);
		mutex_unlock(&journal->j_checkpoint_mutex);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;

	trace_jbd2_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_RUNNING);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd2_commit_locking(journal, commit_transaction);
	stats.run.rs_wait = commit_transaction->t_max_wait;
	stats.run.rs_request_delay = 0;
	stats.run.rs_locked = jiffies;
	if (commit_transaction->t_requested)
		stats.run.rs_request_delay =
			jbd2_time_diff(commit_transaction->t_requested,
				       stats.run.rs_locked);
	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
					      stats.run.rs_locked);

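	/*
	 * Standard prepare_to_wait() pattern: t_updates is re-checked after
	 * we are queued on j_wait_updates, so a handle completing in the
	 * window cannot cause a missed wakeup. Both locks must be dropped
	 * around schedule() so that running handles can finish.
	 */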
	spin_lock(&commit_transaction->t_handle_lock);
	while (atomic_read(&commit_transaction->t_updates)) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (atomic_read(&commit_transaction->t_updates)) {
			spin_unlock(&commit_transaction->t_handle_lock);
			write_unlock(&journal->j_state_lock);
			schedule();
			write_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);

	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory.
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal, false);
	spin_unlock(&journal->j_list_lock);

	jbd_debug(3, "JBD2: commit phase 1\n");

	/*
	 * Clear revoked flag to reflect that there are no revoked buffers
	 * in the next transaction which is going to be started.
	 */
	jbd2_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	/*
	 * Reserved credits cannot be claimed anymore, free them
	 */
	atomic_sub(atomic_read(&journal->j_reserved_credits),
		   &commit_transaction->t_outstanding_credits);

	trace_jbd2_commit_flushing(journal, commit_transaction);
	stats.run.rs_flushing = jiffies;
	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
					     stats.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	write_unlock(&journal->j_state_lock);

	jbd_debug(3, "JBD2: commit phase 2a\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	blk_start_plug(&plug);
	jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);

	jbd_debug(3, "JBD2: commit phase 2b\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	write_unlock(&journal->j_state_lock);

	trace_jbd2_commit_logging(journal, commit_transaction);
	stats.run.rs_logging = jiffies;
	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
					       stats.run.rs_logging);
	stats.run.rs_blocks =
		atomic_read(&commit_transaction->t_outstanding_credits);
	stats.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 atomic_read(&commit_transaction->t_outstanding_credits));

	err = 0;
	bufs = 0;
	descriptor = NULL;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD2: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(
							commit_transaction,
							JBD2_DESCRIPTOR_BLOCK);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
				(unsigned long long)descriptor->b_blocknr,
				descriptor->b_data);
			tagp = &descriptor->b_data[sizeof(journal_header_t)];
			space_left = descriptor->b_size -
						sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(descriptor);
			set_buffer_dirty(descriptor);
			wbuf[bufs++] = descriptor;

			/* Record it so that we can wait for IO
                           completion later */
			BUFFER_TRACE(descriptor, "ph3: file as descriptor");
			jbd2_file_log_bh(&log_bufs, descriptor);
		}

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by jbd2_journal_next_log_block() also.
		 */
		atomic_dec(&commit_transaction->t_outstanding_credits);

		/* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/*
		 * Make a temporary IO buffer with which to write it out
		 * (this will requeue the metadata buffer to BJ_Shadow).
		 */
		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						jh, &wbuf[bufs], blocknr);
		if (flags < 0) {
			jbd2_journal_abort(journal, flags);
			continue;
		}
		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);

		/* Record the new block's tag in the current descriptor
                   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be16(tag_flag);
		jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
					commit_transaction->t_tid);
		tagp += tag_bytes;
		space_left -= tag_bytes;
		bufs++;

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}
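
		/*
		 * The descriptor block now holds a journal header followed by
		 * one tag per logged block, with the 16-byte journal UUID
		 * spliced in after the first tag. The space check below
		 * reserves room for one more tag, a UUID and the optional
		 * checksum tail, so it errs on the conservative side.
		 */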

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16 + csum_size) {

			jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);

			jbd2_descriptor_block_csum_set(journal, descriptor);
start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (jbd2_has_feature_checksum(journal)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(WRITE_SYNC, bh);
			}
			cond_resched();
			stats.run.rs_blocks_logged += bufs;

			/* Force a new descriptor to be generated next
                           time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}
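
	/*
	 * All metadata and descriptor blocks have now been submitted. With
	 * the JBD2 checksum feature, crc32_sum accumulated above covers
	 * every journal block written so far and goes into the commit
	 * record, letting recovery validate an asynchronously written
	 * commit.
	 */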

	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
			"JBD2: Detected IO errors while flushing file data "
		       "on %s\n", journal->j_devname);
		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
			jbd2_journal_abort(journal, err);
		err = 0;
	}

	/*
	 * Get current oldest transaction in the log before we issue flush
	 * to the filesystem device. After the flush we can be sure that
	 * blocks of all older transactions are checkpointed to persistent
	 * storage and we will be safe to update journal start in the
	 * superblock with the numbers we get here.
	 */
	update_tail =
		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);

	write_lock(&journal->j_state_lock);
	if (update_tail) {
		long freed = first_block - journal->j_tail;

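		/*
		 * The log is circular, so the new tail may have wrapped past
		 * the old one; account for that when computing how much space
		 * the tail update would free.
		 */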
		if (first_block < journal->j_tail)
			freed += journal->j_last - journal->j_first;
		/* Update tail only if we free significant amount of space */
		if (freed < journal->j_maxlen / 4)
			update_tail = 0;
	}
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_DFLUSH;
	write_unlock(&journal->j_state_lock);

	/*
	 * If the journal is not located on the file system device,
	 * then we must flush the file system device before we issue
	 * the commit record
	 */
	if (commit_transaction->t_need_data_flush &&
	    (journal->j_fs_dev != journal->j_dev) &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);

	/* Done it all: now write the commit record asynchronously. */
	if (jbd2_has_feature_async_commit(journal)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						 &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the io_bufs list.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD2: commit phase 3\n");

	while (!list_empty(&io_bufs)) {
		struct buffer_head *bh = list_entry(io_bufs.prev,
						    struct buffer_head,
						    b_assoc_buffers);

		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;
		jbd2_unfile_log_bh(bh);

		/*
		 * The list contains temporary buffer heads created by
		 * jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to refile the corresponding shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_buffer_jwrite(bh);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));
		J_ASSERT_BH(bh, !buffer_shadow(bh));

		/* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD2: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
	while (!list_empty(&log_bufs)) {
		struct buffer_head *bh;

		bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_unfile_log_bh(bh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd_debug(3, "JBD2: commit phase 5\n");
	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
	commit_transaction->t_state = T_COMMIT_JFLUSH;
	write_unlock(&journal->j_state_lock);

	if (!jbd2_has_feature_async_commit(journal)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	if (cbh)
		err = journal_wait_on_commit_record(journal, cbh);
	if (jbd2_has_feature_async_commit(journal) &&
	    journal->j_flags & JBD2_BARRIER) {
		blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
	}
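
	/*
	 * With async commit the commit block may have reached disk before
	 * the metadata it describes, so an explicit cache flush is issued
	 * above and recovery relies on the commit block checksum to detect
	 * a partial transaction. Without async commit, the commit record
	 * was itself submitted with WRITE_FLUSH_FUA, which already provides
	 * the required ordering.
	 */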

	if (err)
		jbd2_journal_abort(journal, err);

	/*
	 * Now disk caches for filesystem device are flushed so we are safe to
	 * erase checkpointed transactions from the log by updating journal
	 * superblock.
	 */
	if (update_tail)
		jbd2_update_log_tail(journal, first_tid, first_block);

	/* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

	jbd_debug(3, "JBD2: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;
		int try_to_free = 0;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		/*
		 * Get a reference so that bh cannot be freed before we are
		 * done with it.
		 */
		get_bh(bh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/*
		 * A buffer which has been freed while still being journaled
		 * by a previous transaction.
		 */
		if (buffer_freed(bh)) {
			/*
			 * If the running transaction is the one containing
			 * "add to orphan" operation (b_next_transaction !=
			 * NULL), we have to wait for that transaction to
			 * commit before we can really get rid of the buffer.
			 * So just clear b_modified to not confuse transaction
			 * credit accounting and refile the buffer to
			 * BJ_Forget of the running transaction. If the just
			 * committed transaction contains "add to orphan"
			 * operation, we can completely invalidate the buffer
			 * now. We are rather thorough in that since the
			 * buffer may still be accessible when blocksize <
			 * pagesize and it is attached to the last partial
			 * page.
			 */
			jh->b_modified = 0;
			if (!jh->b_next_transaction) {
				clear_buffer_freed(bh);
				clear_buffer_jbddirty(bh);
				clear_buffer_mapped(bh);
				clear_buffer_new(bh);
				clear_buffer_req(bh);
				bh->b_bdev = NULL;
			}
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/*
			 * The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list.
			 */
			if (!jh->b_next_transaction)
				try_to_free = 1;
		}
		JBUFFER_TRACE(jh, "refile or unfile buffer");
		__jbd2_journal_refile_buffer(jh);
		jbd_unlock_bh_state(bh);
		if (try_to_free)
			release_buffer_page(bh);	/* Drops bh reference */
		else
			__brelse(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Add the transaction to the checkpoint list
	 * __journal_remove_checkpoint() can not destroy transaction
	 * under us because it is not marked as T_FINISHED yet */
	if (journal->j_checkpoint_transactions == NULL) {
		journal->j_checkpoint_transactions = commit_transaction;
		commit_transaction->t_cpnext = commit_transaction;
		commit_transaction->t_cpprev = commit_transaction;
	} else {
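		/*
		 * Link the transaction in just before the current head, i.e.
		 * at the tail of the circular checkpoint list.
		 */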
		commit_transaction->t_cpnext =
			journal->j_checkpoint_transactions;
		commit_transaction->t_cpprev =
			commit_transaction->t_cpnext->t_cpprev;
		commit_transaction->t_cpnext->t_cpprev =
			commit_transaction;
		commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
	}
	spin_unlock(&journal->j_list_lock);

	/* Done with this transaction! */

	jbd_debug(3, "JBD2: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);
	stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;

	commit_transaction->t_state = T_COMMIT_CALLBACK;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * Weight the existing average higher than the new commit time so we
	 * don't react too strongly to vast changes in the commit time.
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;

	write_unlock(&journal->j_state_lock);

	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);

	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	commit_transaction->t_state = T_FINISHED;
	/* Check if the transaction can be dropped now that we are finished */
	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		jbd2_journal_free_transaction(commit_transaction);
	}
	spin_unlock(&journal->j_list_lock);
	write_unlock(&journal->j_state_lock);
	wake_up(&journal->j_wait_done_commit);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.ts_requested += stats.ts_requested;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);
}