/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>

/*
 * IO end handler for temporary buffer_heads handling writes to the journal.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	struct buffer_head *orig_bh = bh->b_private;

	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	if (orig_bh) {
		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
		smp_mb__after_atomic();
		wake_up_bit(&orig_bh->b_state, BH_Shadow);
	}
	unlock_buffer(bh);
}
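
/*
 * Note on the BH_Shadow handshake above: clear_bit_unlock() provides the
 * release semantics and smp_mb__after_atomic() orders the flag clear
 * against the wake-up, pairing with the waiter in do_get_write_access()
 * in fs/jbd2/transaction.c.  A minimal sketch of the waiter side, shown
 * here only for illustration (not the literal code):
 *
 *	if (buffer_shadow(bh)) {
 *		...
 *		wait_on_bit_io(&bh->b_state, BH_Shadow,
 *			       TASK_UNINTERRUPTIBLE);
 *		goto repeat;
 *	}
 */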

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	get_page(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	put_page(page);
	return;

nope:
	__brelse(bh);
}

static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
{
	struct commit_header *h;
	__u32 csum;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	h = (struct commit_header *)(bh->b_data);
	h->h_chksum_type = 0;
	h->h_chksum_size = 0;
	h->h_chksum[0] = 0;
	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
	h->h_chksum[0] = cpu_to_be32(csum);
}
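
/*
 * The zero-then-checksum pattern above is what lets recovery verify the
 * block: the checksum field is zeroed before jbd2_chksum() runs over the
 * whole block, so the verifier can recompute it the same way.  A rough
 * sketch of the verify side (the real one lives in fs/jbd2/recovery.c):
 *
 *	provided = h->h_chksum[0];
 *	h->h_chksum[0] = 0;
 *	calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
 *	h->h_chksum[0] = provided;
 *	return provided == cpu_to_be32(calculated);
 */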

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	struct timespec now = current_kernel_time();

	*cbh = NULL;

	if (is_journal_aborted(journal))
		return 0;

	bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
						JBD2_COMMIT_BLOCK);
	if (!bh)
		return 1;

	tmp = (struct commit_header *)bh->b_data;
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (jbd2_has_feature_checksum(journal)) {
		tmp->h_chksum_type	= JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size	= JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0]	= cpu_to_be32(crc32_sum);
	}
	jbd2_commit_block_csum_set(journal, bh);

	BUFFER_TRACE(bh, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	if (journal->j_flags & JBD2_BARRIER &&
	    !jbd2_has_feature_async_commit(journal))
		ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
	else
		ret = submit_bh(WRITE_SYNC, bh);

	*cbh = bh;
	return ret;
}
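
/*
 * Why WRITE_FLUSH_FUA above: when barriers are enabled and the commit is
 * synchronous, the preflush pushes all previously submitted journal blocks
 * out of the device cache, and FUA makes the commit record itself durable
 * before the bio completes.  With async commit the record goes out as a
 * plain WRITE_SYNC and durability comes instead from the explicit
 * blkdev_issue_flush() later in jbd2_journal_commit_transaction().
 */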

/*
 * This function, along with journal_submit_commit_record(),
 * allows the commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);            /* One for getblk() */

	return ret;
}

/*
 * Write the filemap data using the writepage() address_space_operation.
 * We don't do block allocation here even for delalloc. We don't
 * use writepages() because with delayed allocation we may be doing
 * block allocation in writepages().
 */
static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = 0,
		.range_end = i_size_read(mapping->host),
	};

	ret = generic_writepages(mapping, &wbc);
	return ret;
}

/*
 * Submit all the data buffers of inodes associated with the transaction to
 * disk.
 *
 * We are in a committing transaction, so no new inode can be added to our
 * inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we are
 * currently operating on from being released while we write out pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		mapping = jinode->i_vfs_inode->i_mapping;
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		/*
		 * Submit the inode data buffers. We use writepage
		 * instead of writepages because writepages can do
		 * block allocation with delalloc, and we need to write
		 * only already-allocated blocks here.
		 */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		err = journal_submit_inode_data_buffers(mapping);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}
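
/*
 * The JI_COMMIT_RUNNING dance above is the standard "pin an object while
 * dropping its list lock" pattern: the flag is set under j_list_lock, the
 * lock is dropped for the (possibly blocking) writeback, and the flag is
 * cleared and waiters woken once the lock is retaken.  The waiting side,
 * roughly as in jbd2_journal_release_jbd_inode(), looks like this
 * (illustrative sketch, not the literal code):
 *
 *	spin_lock(&journal->j_list_lock);
 *	while (jinode->i_flags & JI_COMMIT_RUNNING) {
 *		spin_unlock(&journal->j_list_lock);
 *		wait_on_bit(&jinode->i_flags, __JI_COMMIT_RUNNING,
 *			    TASK_UNINTERRUPTIBLE);
 *		spin_lock(&journal->j_list_lock);
 *	}
 */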

/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		if (err) {
			/*
			 * Because AS_EIO is cleared by
			 * filemap_fdatawait_range(), set it again so
			 * that user process can get -EIO from fsync().
			 */
			set_bit(AS_EIO,
				&jinode->i_vfs_inode->i_mapping->flags);

			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		smp_mb();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

	addr = kmap_atomic(page);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr);

	return checksum;
}

static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
				   unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (jbd2_has_feature_64bit(j))
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
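
/*
 * Note the split above: the low 32 bits of the block number always go in
 * t_blocknr, and only journals with the 64-bit feature carry the high bits
 * in t_blocknr_high.  The two-step "(block >> 31) >> 1" appears to be a
 * defensive idiom for extracting the upper half without ever shifting by
 * the full word width (a single ">> 32" would be undefined if the operand
 * were ever a 32-bit type).
 */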

static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
				    struct buffer_head *bh, __u32 sequence)
{
	journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
	struct page *page = bh->b_page;
	__u8 *addr;
	__u32 csum32;
	__be32 seq;

	if (!jbd2_journal_has_csum_v2or3(j))
		return;

	seq = cpu_to_be32(sequence);
	addr = kmap_atomic(page);
	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
	csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
			     bh->b_size);
	kunmap_atomic(addr);

	if (jbd2_has_feature_csum3(j))
		tag3->t_checksum = cpu_to_be32(csum32);
	else
		tag->t_checksum = cpu_to_be16(csum32);
}
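
/*
 * The csum3 branch above stores the full 32-bit checksum; the older csum2
 * tag format only has a 16-bit field, so cpu_to_be16() deliberately
 * truncates csum32 to its low 16 bits there.  Chaining the tid into the
 * seed first binds each checksummed block to the transaction that logged
 * it, so a stale block from an earlier pass over the circular log cannot
 * verify against the wrong commit.
 */
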
/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh;
	struct buffer_head *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	struct blk_plug plug;
	/* Tail of the journal */
	unsigned long first_block;
	tid_t first_tid;
	int update_tail;
	int csum_size = 0;
	LIST_HEAD(io_bufs);
	LIST_HEAD(log_bufs);

	if (jbd2_journal_has_csum_v2or3(journal))
		csum_size = sizeof(struct jbd2_journal_block_tail);

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		mutex_lock(&journal->j_checkpoint_mutex);
		/*
		 * We hold j_checkpoint_mutex so tail cannot change under us.
		 * We don't need any special data guarantees for writing sb
		 * since journal is empty and it is ok for write to be
		 * flushed only with transaction commit.
		 */
		jbd2_journal_update_sb_log_tail(journal,
						journal->j_tail_sequence,
						journal->j_tail,
						WRITE_SYNC);
		mutex_unlock(&journal->j_checkpoint_mutex);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;

	trace_jbd2_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_RUNNING);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd2_commit_locking(journal, commit_transaction);
	stats.run.rs_wait = commit_transaction->t_max_wait;
	stats.run.rs_request_delay = 0;
	stats.run.rs_locked = jiffies;
	if (commit_transaction->t_requested)
		stats.run.rs_request_delay =
			jbd2_time_diff(commit_transaction->t_requested,
				       stats.run.rs_locked);
	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
					      stats.run.rs_locked);

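	/*
	 * Classic sleep/wakeup loop: check t_updates under the handle lock,
	 * register on j_wait_updates *before* re-checking, and only then drop
	 * the locks and schedule().  Re-checking after prepare_to_wait() is
	 * what makes the wakeup from the final jbd2_journal_stop() impossible
	 * to miss.
	 */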
	spin_lock(&commit_transaction->t_handle_lock);
	while (atomic_read(&commit_transaction->t_updates)) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (atomic_read(&commit_transaction->t_updates)) {
			spin_unlock(&commit_transaction->t_handle_lock);
			write_unlock(&journal->j_state_lock);
			schedule();
			write_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);

	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it
	 * potentially frees some memory.
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal, false);
	spin_unlock(&journal->j_list_lock);

	jbd_debug(3, "JBD2: commit phase 1\n");

	/*
	 * Clear the revoked flag to reflect that there are no revoked
	 * buffers in the next transaction which is going to be started.
	 */
	jbd2_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);
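
	/*
	 * From this point the old revoke table belongs to the committing
	 * transaction (its records are written to the log below by
	 * jbd2_journal_write_revoke_records()), while revokes issued by the
	 * next running transaction land in the freshly installed table.
	 */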

	/*
	 * Reserved credits cannot be claimed anymore, free them
	 */
	atomic_sub(atomic_read(&journal->j_reserved_credits),
		   &commit_transaction->t_outstanding_credits);

	trace_jbd2_commit_flushing(journal, commit_transaction);
	stats.run.rs_flushing = jiffies;
	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
					     stats.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	write_unlock(&journal->j_state_lock);

	jbd_debug(3, "JBD2: commit phase 2a\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	blk_start_plug(&plug);
	jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);

	jbd_debug(3, "JBD2: commit phase 2b\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	write_unlock(&journal->j_state_lock);

	trace_jbd2_commit_logging(journal, commit_transaction);
	stats.run.rs_logging = jiffies;
	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
					       stats.run.rs_logging);
	stats.run.rs_blocks =
		atomic_read(&commit_transaction->t_outstanding_credits);
	stats.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 atomic_read(&commit_transaction->t_outstanding_credits));

	err = 0;
	bufs = 0;
	descriptor = NULL;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD2: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(
							commit_transaction,
							JBD2_DESCRIPTOR_BLOCK);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
				(unsigned long long)descriptor->b_blocknr,
				descriptor->b_data);
			tagp = &descriptor->b_data[sizeof(journal_header_t)];
			space_left = descriptor->b_size -
						sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(descriptor);
			set_buffer_dirty(descriptor);
			wbuf[bufs++] = descriptor;

			/* Record it so that we can wait for IO
                           completion later */
			BUFFER_TRACE(descriptor, "ph3: file as descriptor");
			jbd2_file_log_bh(&log_bufs, descriptor);
		}

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is also changed
		 * by jbd2_journal_next_log_block().
		 */
		atomic_dec(&commit_transaction->t_outstanding_credits);

		/* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/*
		 * Make a temporary IO buffer with which to write it out
		 * (this will requeue the metadata buffer to BJ_Shadow).
		 */
		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						jh, &wbuf[bufs], blocknr);
		if (flags < 0) {
			jbd2_journal_abort(journal, flags);
			continue;
		}
		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);

		/* Record the new block's tag in the current descriptor
                   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be16(tag_flag);
		jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
					commit_transaction->t_tid);
		tagp += tag_bytes;
		space_left -= tag_bytes;
		bufs++;

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16 + csum_size) {

			jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);

			jbd2_descriptor_block_csum_set(journal, descriptor);
start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (jbd2_has_feature_checksum(journal)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(WRITE_SYNC, bh);
			}
			cond_resched();
			stats.run.rs_blocks_logged += bufs;

			/* Force a new descriptor to be generated next
                           time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}
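
	/*
	 * For reference, the on-disk descriptor block the loop above fills
	 * in looks roughly like this (exact sizes depend on the 64-bit and
	 * checksum features):
	 *
	 *	+-------------------------+
	 *	| journal_header_t        |  magic, JBD2_DESCRIPTOR_BLOCK, tid
	 *	+-------------------------+
	 *	| tag 0                   |  on-disk block nr, flags, csum
	 *	| uuid (16 bytes)         |  only after the first tag
	 *	| tag 1 ... tag N         |  JBD2_FLAG_LAST_TAG set on tag N
	 *	+-------------------------+
	 *	| jbd2_journal_block_tail |  descriptor checksum (csum v2/v3)
	 *	+-------------------------+
	 *
	 * which is why the loop flushes once space_left drops below
	 * tag_bytes + 16 + csum_size.
	 */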

	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
			"JBD2: Detected IO errors while flushing file data "
		       "on %s\n", journal->j_devname);
		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
			jbd2_journal_abort(journal, err);
		err = 0;
	}

	/*
	 * Get current oldest transaction in the log before we issue flush
	 * to the filesystem device. After the flush we can be sure that
	 * blocks of all older transactions are checkpointed to persistent
	 * storage and we will be safe to update journal start in the
	 * superblock with the numbers we get here.
	 */
	update_tail =
		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);

	write_lock(&journal->j_state_lock);
	if (update_tail) {
		long freed = first_block - journal->j_tail;

		if (first_block < journal->j_tail)
			freed += journal->j_last - journal->j_first;
		/* Update tail only if we free significant amount of space */
		if (freed < journal->j_maxlen / 4)
			update_tail = 0;
	}
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_DFLUSH;
	write_unlock(&journal->j_state_lock);

	/*
	 * If the journal is not located on the file system device,
	 * then we must flush the file system device before we issue
	 * the commit record
	 */
	if (commit_transaction->t_need_data_flush &&
	    (journal->j_fs_dev != journal->j_dev) &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);

	/* Done it all: now write the commit record asynchronously. */
	if (jbd2_has_feature_async_commit(journal)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						 &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the io_bufs list.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD2: commit phase 3\n");

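	/*
	 * The loop below relies on io_bufs and t_shadow_list having been
	 * built in the same order: taking io_bufs.prev together with
	 * t_shadow_list->b_tprev walks both lists backwards in lockstep, so
	 * each temporary buffer is matched with the shadowed metadata buffer
	 * it was copied from.
	 */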
	while (!list_empty(&io_bufs)) {
		struct buffer_head *bh = list_entry(io_bufs.prev,
						    struct buffer_head,
						    b_assoc_buffers);

		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;
		jbd2_unfile_log_bh(bh);

		/*
		 * The list contains temporary buffer heads created by
		 * jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to refile the corresponding shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_buffer_jwrite(bh);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));
		J_ASSERT_BH(bh, !buffer_shadow(bh));

		/* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD2: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
	while (!list_empty(&log_bufs)) {
		struct buffer_head *bh;

		bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
		wait_on_buffer(bh);
		cond_resched();

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_unfile_log_bh(bh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd_debug(3, "JBD2: commit phase 5\n");
	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
	commit_transaction->t_state = T_COMMIT_JFLUSH;
	write_unlock(&journal->j_state_lock);

	if (!jbd2_has_feature_async_commit(journal)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	if (cbh)
		err = journal_wait_on_commit_record(journal, cbh);
	if (jbd2_has_feature_async_commit(journal) &&
	    journal->j_flags & JBD2_BARRIER) {
		blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
	}

	if (err)
		jbd2_journal_abort(journal, err);

	/*
	 * Now disk caches for filesystem device are flushed so we are safe to
	 * erase checkpointed transactions from the log by updating journal
	 * superblock.
	 */
	if (update_tail)
		jbd2_update_log_tail(journal, first_tid, first_block);

	/* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

	jbd_debug(3, "JBD2: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;
		int try_to_free = 0;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		/*
		 * Get a reference so that bh cannot be freed before we are
		 * done with it.
		 */
		get_bh(bh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/*
		 * A buffer which has been freed while still being journaled
		 * by a previous transaction.
		 */
		if (buffer_freed(bh)) {
			/*
			 * If the running transaction is the one containing
			 * "add to orphan" operation (b_next_transaction !=
			 * NULL), we have to wait for that transaction to
			 * commit before we can really get rid of the buffer.
			 * So just clear b_modified to not confuse transaction
			 * credit accounting and refile the buffer to
			 * BJ_Forget of the running transaction. If the just
			 * committed transaction contains "add to orphan"
			 * operation, we can completely invalidate the buffer
			 * now. We are rather thorough in that, since the
			 * buffer may still be accessible when blocksize <
			 * pagesize and it is attached to the last partial
			 * page.
			 */
			jh->b_modified = 0;
			if (!jh->b_next_transaction) {
				clear_buffer_freed(bh);
				clear_buffer_jbddirty(bh);
				clear_buffer_mapped(bh);
				clear_buffer_new(bh);
				clear_buffer_req(bh);
				bh->b_bdev = NULL;
			}
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/*
			 * A buffer on the BJ_Forget list that is not jbddirty
			 * has been freed by this transaction and hence could
			 * not have been reallocated until this transaction
			 * committed. *BUT* it could be reallocated once we
			 * have written all the data to disk and before we
			 * process the buffer on the BJ_Forget list.
			 */
			if (!jh->b_next_transaction)
				try_to_free = 1;
		}
		JBUFFER_TRACE(jh, "refile or unfile buffer");
		__jbd2_journal_refile_buffer(jh);
		jbd_unlock_bh_state(bh);
		if (try_to_free)
			release_buffer_page(bh);	/* Drops bh reference */
		else
			__brelse(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Add the transaction to the checkpoint list
	 * __journal_remove_checkpoint() can not destroy transaction
	 * under us because it is not marked as T_FINISHED yet */
	if (journal->j_checkpoint_transactions == NULL) {
		journal->j_checkpoint_transactions = commit_transaction;
		commit_transaction->t_cpnext = commit_transaction;
		commit_transaction->t_cpprev = commit_transaction;
	} else {
		commit_transaction->t_cpnext =
			journal->j_checkpoint_transactions;
		commit_transaction->t_cpprev =
			commit_transaction->t_cpnext->t_cpprev;
		commit_transaction->t_cpnext->t_cpprev =
			commit_transaction;
		commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
	}
	spin_unlock(&journal->j_list_lock);

	/* Done with this transaction! */

	jbd_debug(3, "JBD2: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);
	stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;

	commit_transaction->t_state = T_COMMIT_CALLBACK;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;
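
	/*
	 * In other words, j_average_commit_time is an exponentially weighted
	 * moving average with a weight of 1/4 on the newest sample:
	 *
	 *	avg_new = (commit_time + 3 * avg_old) / 4
	 *
	 * so a single unusually slow or fast commit moves the estimate by at
	 * most a quarter of the difference.
	 */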

	write_unlock(&journal->j_state_lock);

	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);

	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	commit_transaction->t_state = T_FINISHED;
	/* Check if the transaction can be dropped now that we are finished */
	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		jbd2_journal_free_transaction(commit_transaction);
	}
	spin_unlock(&journal->j_list_lock);
	write_unlock(&journal->j_state_lock);
	wake_up(&journal->j_wait_done_commit);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.ts_requested += stats.ts_requested;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);
}