xref: /openbmc/linux/fs/jbd2/commit.c (revision f3a8b664)
1 /*
2  * linux/fs/jbd2/commit.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15 
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd2.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
21 #include <linux/mm.h>
22 #include <linux/pagemap.h>
23 #include <linux/jiffies.h>
24 #include <linux/crc32.h>
25 #include <linux/writeback.h>
26 #include <linux/backing-dev.h>
27 #include <linux/bio.h>
28 #include <linux/blkdev.h>
29 #include <linux/bitops.h>
30 #include <trace/events/jbd2.h>
31 
32 /*
33  * IO end handler for temporary buffer_heads handling writes to the journal.
34  */
35 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
36 {
37 	struct buffer_head *orig_bh = bh->b_private;
38 
39 	BUFFER_TRACE(bh, "");
40 	if (uptodate)
41 		set_buffer_uptodate(bh);
42 	else
43 		clear_buffer_uptodate(bh);
44 	if (orig_bh) {
45 		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
46 		smp_mb__after_atomic();
47 		wake_up_bit(&orig_bh->b_state, BH_Shadow);
48 	}
49 	unlock_buffer(bh);
50 }
51 
52 /*
53  * When an ext4 file is truncated, it is possible that some pages are not
54  * successfully freed, because they are attached to a committing transaction.
55  * After the transaction commits, these pages are left on the LRU, with no
56  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
57  * by the VM, but their apparent absence upsets the VM accounting, and it makes
58  * the numbers in /proc/meminfo look odd.
59  *
60  * So here, we have a buffer which has just come off the forget list.  Look to
61  * see if we can strip all buffers from the backing page.
62  *
63  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
64  * caller provided us with a ref against the buffer, and we drop that here.
65  */
66 static void release_buffer_page(struct buffer_head *bh)
67 {
68 	struct page *page;
69 
70 	if (buffer_dirty(bh))
71 		goto nope;
72 	if (atomic_read(&bh->b_count) != 1)
73 		goto nope;
74 	page = bh->b_page;
75 	if (!page)
76 		goto nope;
77 	if (page->mapping)
78 		goto nope;
79 
80 	/* OK, it's a truncated page */
81 	if (!trylock_page(page))
82 		goto nope;
83 
84 	get_page(page);
85 	__brelse(bh);
86 	try_to_free_buffers(page);
87 	unlock_page(page);
88 	put_page(page);
89 	return;
90 
91 nope:
92 	__brelse(bh);
93 }
94 
95 static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
96 {
97 	struct commit_header *h;
98 	__u32 csum;
99 
100 	if (!jbd2_journal_has_csum_v2or3(j))
101 		return;
102 
103 	h = (struct commit_header *)(bh->b_data);
104 	h->h_chksum_type = 0;
105 	h->h_chksum_size = 0;
106 	h->h_chksum[0] = 0;
107 	csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
108 	h->h_chksum[0] = cpu_to_be32(csum);
109 }
110 
111 /*
112  * Done it all: now submit the commit record.  We should have
113  * cleaned up our previous buffers by now, so if we are in abort
114  * mode we can now just skip the rest of the journal write
115  * entirely.
116  *
117  * Returns 1 if the journal needs to be aborted or 0 on success
118  */
119 static int journal_submit_commit_record(journal_t *journal,
120 					transaction_t *commit_transaction,
121 					struct buffer_head **cbh,
122 					__u32 crc32_sum)
123 {
124 	struct commit_header *tmp;
125 	struct buffer_head *bh;
126 	int ret;
127 	struct timespec64 now = current_kernel_time64();
128 
129 	*cbh = NULL;
130 
131 	if (is_journal_aborted(journal))
132 		return 0;
133 
134 	bh = jbd2_journal_get_descriptor_buffer(commit_transaction,
135 						JBD2_COMMIT_BLOCK);
136 	if (!bh)
137 		return 1;
138 
139 	tmp = (struct commit_header *)bh->b_data;
140 	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
141 	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
142 
143 	if (jbd2_has_feature_checksum(journal)) {
144 		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
145 		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
146 		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
147 	}
148 	jbd2_commit_block_csum_set(journal, bh);
149 
150 	BUFFER_TRACE(bh, "submit commit block");
151 	lock_buffer(bh);
152 	clear_buffer_dirty(bh);
153 	set_buffer_uptodate(bh);
154 	bh->b_end_io = journal_end_buffer_io_sync;
155 
156 	if (journal->j_flags & JBD2_BARRIER &&
157 	    !jbd2_has_feature_async_commit(journal))
158 		ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC | WRITE_FLUSH_FUA, bh);
159 	else
160 		ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh);
161 
162 	*cbh = bh;
163 	return ret;
164 }
165 
166 /*
167  * This function, together with journal_submit_commit_record(),
168  * allows the commit record to be written asynchronously.
169  */
170 static int journal_wait_on_commit_record(journal_t *journal,
171 					 struct buffer_head *bh)
172 {
173 	int ret = 0;
174 
175 	clear_buffer_dirty(bh);
176 	wait_on_buffer(bh);
177 
178 	if (unlikely(!buffer_uptodate(bh)))
179 		ret = -EIO;
180 	put_bh(bh);            /* One for getblk() */
181 
182 	return ret;
183 }
184 
185 /*
186  * write the filemap data using writepage() address_space_operations.
187  * We don't do block allocation here even for delalloc. We don't
188  * use writepages() because with delayed allocation we may be doing
189  * block allocation in writepages().
190  */
191 static int journal_submit_inode_data_buffers(struct address_space *mapping)
192 {
193 	int ret;
194 	struct writeback_control wbc = {
195 		.sync_mode =  WB_SYNC_ALL,
196 		.nr_to_write = mapping->nrpages * 2,
197 		.range_start = 0,
198 		.range_end = i_size_read(mapping->host),
199 	};
200 
201 	ret = generic_writepages(mapping, &wbc);
202 	return ret;
203 }
204 
205 /*
206  * Submit all the data buffers of inode associated with the transaction to
207  * disk.
208  *
209  * We are in a committing transaction. Therefore no new inode can be added to
210  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
211  * operate on from being released while we write out pages.
212  */
213 static int journal_submit_data_buffers(journal_t *journal,
214 		transaction_t *commit_transaction)
215 {
216 	struct jbd2_inode *jinode;
217 	int err, ret = 0;
218 	struct address_space *mapping;
219 
220 	spin_lock(&journal->j_list_lock);
221 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
222 		if (!(jinode->i_flags & JI_WRITE_DATA))
223 			continue;
224 		mapping = jinode->i_vfs_inode->i_mapping;
225 		jinode->i_flags |= JI_COMMIT_RUNNING;
226 		spin_unlock(&journal->j_list_lock);
227 		/*
228 		 * submit the inode data buffers. We use writepage
229 		 * instead of writepages. Because writepages can do
230 		 * block allocation  with delalloc. We need to write
231 		 * only allocated blocks here.
232 		 */
233 		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
234 		err = journal_submit_inode_data_buffers(mapping);
235 		if (!ret)
236 			ret = err;
237 		spin_lock(&journal->j_list_lock);
238 		J_ASSERT(jinode->i_transaction == commit_transaction);
239 		jinode->i_flags &= ~JI_COMMIT_RUNNING;
240 		smp_mb();
241 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
242 	}
243 	spin_unlock(&journal->j_list_lock);
244 	return ret;
245 }
246 
247 /*
248  * Wait for data submitted for writeout, refile inodes to proper
249  * transaction if needed.
250  *
251  */
252 static int journal_finish_inode_data_buffers(journal_t *journal,
253 		transaction_t *commit_transaction)
254 {
255 	struct jbd2_inode *jinode, *next_i;
256 	int err, ret = 0;
257 
258 	/* For locking, see the comment in journal_submit_data_buffers() */
259 	spin_lock(&journal->j_list_lock);
260 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
261 		if (!(jinode->i_flags & JI_WAIT_DATA))
262 			continue;
263 		jinode->i_flags |= JI_COMMIT_RUNNING;
264 		spin_unlock(&journal->j_list_lock);
265 		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
266 		if (err) {
267 			/*
268 			 * Because AS_EIO is cleared by
269 			 * filemap_fdatawait_range(), set it again so
270 			 * that user process can get -EIO from fsync().
271 			 */
272 			mapping_set_error(jinode->i_vfs_inode->i_mapping, -EIO);
273 
274 			if (!ret)
275 				ret = err;
276 		}
277 		spin_lock(&journal->j_list_lock);
278 		jinode->i_flags &= ~JI_COMMIT_RUNNING;
279 		smp_mb();
280 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
281 	}
282 
283 	/* Now refile inode to proper lists */
284 	list_for_each_entry_safe(jinode, next_i,
285 				 &commit_transaction->t_inode_list, i_list) {
286 		list_del(&jinode->i_list);
287 		if (jinode->i_next_transaction) {
288 			jinode->i_transaction = jinode->i_next_transaction;
289 			jinode->i_next_transaction = NULL;
290 			list_add(&jinode->i_list,
291 				&jinode->i_transaction->t_inode_list);
292 		} else {
293 			jinode->i_transaction = NULL;
294 		}
295 	}
296 	spin_unlock(&journal->j_list_lock);
297 
298 	return ret;
299 }
300 
301 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
302 {
303 	struct page *page = bh->b_page;
304 	char *addr;
305 	__u32 checksum;
306 
307 	addr = kmap_atomic(page);
308 	checksum = crc32_be(crc32_sum,
309 		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
310 	kunmap_atomic(addr);
311 
312 	return checksum;
313 }
314 
315 static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
316 				   unsigned long long block)
317 {
318 	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
319 	if (jbd2_has_feature_64bit(j))
320 		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
321 }
322 
323 static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
324 				    struct buffer_head *bh, __u32 sequence)
325 {
326 	journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
327 	struct page *page = bh->b_page;
328 	__u8 *addr;
329 	__u32 csum32;
330 	__be32 seq;
331 
332 	if (!jbd2_journal_has_csum_v2or3(j))
333 		return;
334 
335 	seq = cpu_to_be32(sequence);
336 	addr = kmap_atomic(page);
337 	csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
338 	csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
339 			     bh->b_size);
340 	kunmap_atomic(addr);
341 
342 	if (jbd2_has_feature_csum3(j))
343 		tag3->t_checksum = cpu_to_be32(csum32);
344 	else
345 		tag->t_checksum = cpu_to_be16(csum32);
346 }
347 /*
348  * jbd2_journal_commit_transaction
349  *
350  * The primary function for committing a transaction to the log.  This
351  * function is called by the journal thread to begin a complete commit.
352  */
353 void jbd2_journal_commit_transaction(journal_t *journal)
354 {
355 	struct transaction_stats_s stats;
356 	transaction_t *commit_transaction;
357 	struct journal_head *jh;
358 	struct buffer_head *descriptor;
359 	struct buffer_head **wbuf = journal->j_wbuf;
360 	int bufs;
361 	int flags;
362 	int err;
363 	unsigned long long blocknr;
364 	ktime_t start_time;
365 	u64 commit_time;
366 	char *tagp = NULL;
367 	journal_block_tag_t *tag = NULL;
368 	int space_left = 0;
369 	int first_tag = 0;
370 	int tag_flag;
371 	int i;
372 	int tag_bytes = journal_tag_bytes(journal);
373 	struct buffer_head *cbh = NULL; /* For transactional checksums */
374 	__u32 crc32_sum = ~0;
375 	struct blk_plug plug;
376 	/* Tail of the journal */
377 	unsigned long first_block;
378 	tid_t first_tid;
379 	int update_tail;
380 	int csum_size = 0;
381 	LIST_HEAD(io_bufs);
382 	LIST_HEAD(log_bufs);
383 
384 	if (jbd2_journal_has_csum_v2or3(journal))
385 		csum_size = sizeof(struct jbd2_journal_block_tail);
386 
387 	/*
388 	 * First job: lock down the current transaction and wait for
389 	 * all outstanding updates to complete.
390 	 */
391 
392 	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
393 	if (journal->j_flags & JBD2_FLUSHED) {
394 		jbd_debug(3, "super block updated\n");
395 		mutex_lock(&journal->j_checkpoint_mutex);
396 		/*
397 		 * We hold j_checkpoint_mutex so tail cannot change under us.
398 		 * We don't need any special data guarantees for writing sb
399 		 * since journal is empty and it is ok for write to be
400 		 * flushed only with transaction commit.
401 		 */
402 		jbd2_journal_update_sb_log_tail(journal,
403 						journal->j_tail_sequence,
404 						journal->j_tail,
405 						WRITE_SYNC);
406 		mutex_unlock(&journal->j_checkpoint_mutex);
407 	} else {
408 		jbd_debug(3, "superblock not updated\n");
409 	}
410 
411 	J_ASSERT(journal->j_running_transaction != NULL);
412 	J_ASSERT(journal->j_committing_transaction == NULL);
413 
414 	commit_transaction = journal->j_running_transaction;
415 
416 	trace_jbd2_start_commit(journal, commit_transaction);
417 	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
418 			commit_transaction->t_tid);
419 
420 	write_lock(&journal->j_state_lock);
421 	J_ASSERT(commit_transaction->t_state == T_RUNNING);
422 	commit_transaction->t_state = T_LOCKED;
423 
424 	trace_jbd2_commit_locking(journal, commit_transaction);
425 	stats.run.rs_wait = commit_transaction->t_max_wait;
426 	stats.run.rs_request_delay = 0;
427 	stats.run.rs_locked = jiffies;
428 	if (commit_transaction->t_requested)
429 		stats.run.rs_request_delay =
430 			jbd2_time_diff(commit_transaction->t_requested,
431 				       stats.run.rs_locked);
432 	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
433 					      stats.run.rs_locked);
434 
435 	spin_lock(&commit_transaction->t_handle_lock);
436 	while (atomic_read(&commit_transaction->t_updates)) {
437 		DEFINE_WAIT(wait);
438 
439 		prepare_to_wait(&journal->j_wait_updates, &wait,
440 					TASK_UNINTERRUPTIBLE);
441 		if (atomic_read(&commit_transaction->t_updates)) {
442 			spin_unlock(&commit_transaction->t_handle_lock);
443 			write_unlock(&journal->j_state_lock);
444 			schedule();
445 			write_lock(&journal->j_state_lock);
446 			spin_lock(&commit_transaction->t_handle_lock);
447 		}
448 		finish_wait(&journal->j_wait_updates, &wait);
449 	}
450 	spin_unlock(&commit_transaction->t_handle_lock);
451 
452 	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
453 			journal->j_max_transaction_buffers);
454 
455 	/*
456 	 * First thing we are allowed to do is to discard any remaining
457 	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
458 	 * that there are no such buffers: if a large filesystem
459 	 * operation like a truncate needs to split itself over multiple
460 	 * transactions, then it may try to do a jbd2_journal_restart() while
461 	 * there are still BJ_Reserved buffers outstanding.  These must
462 	 * be released cleanly from the current transaction.
463 	 *
464 	 * In this case, the filesystem must still reserve write access
465 	 * again before modifying the buffer in the new transaction, but
466 	 * we do not require it to remember exactly which old buffers it
467 	 * has reserved.  This is consistent with the existing behaviour
468 	 * that multiple jbd2_journal_get_write_access() calls to the same
469 	 * buffer are perfectly permissible.
470 	 */
471 	while (commit_transaction->t_reserved_list) {
472 		jh = commit_transaction->t_reserved_list;
473 		JBUFFER_TRACE(jh, "reserved, unused: refile");
474 		/*
475 		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
476 		 * leave undo-committed data.
477 		 */
478 		if (jh->b_committed_data) {
479 			struct buffer_head *bh = jh2bh(jh);
480 
481 			jbd_lock_bh_state(bh);
482 			jbd2_free(jh->b_committed_data, bh->b_size);
483 			jh->b_committed_data = NULL;
484 			jbd_unlock_bh_state(bh);
485 		}
486 		jbd2_journal_refile_buffer(journal, jh);
487 	}
488 
489 	/*
490 	 * Now try to drop any written-back buffers from the journal's
491 	 * checkpoint lists.  We do this *before* commit because it potentially
492 	 * frees some memory
493 	 */
494 	spin_lock(&journal->j_list_lock);
495 	__jbd2_journal_clean_checkpoint_list(journal, false);
496 	spin_unlock(&journal->j_list_lock);
497 
498 	jbd_debug(3, "JBD2: commit phase 1\n");
499 
500 	/*
501 	 * Clear revoked flag to reflect there is no revoked buffers
502 	 * in the next transaction which is going to be started.
503 	 */
504 	jbd2_clear_buffer_revoked_flags(journal);
505 
506 	/*
507 	 * Switch to a new revoke table.
508 	 */
509 	jbd2_journal_switch_revoke_table(journal);
510 
511 	/*
512 	 * Reserved credits cannot be claimed anymore, free them
513 	 */
514 	atomic_sub(atomic_read(&journal->j_reserved_credits),
515 		   &commit_transaction->t_outstanding_credits);
516 
517 	trace_jbd2_commit_flushing(journal, commit_transaction);
518 	stats.run.rs_flushing = jiffies;
519 	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
520 					     stats.run.rs_flushing);
521 
522 	commit_transaction->t_state = T_FLUSH;
523 	journal->j_committing_transaction = commit_transaction;
524 	journal->j_running_transaction = NULL;
525 	start_time = ktime_get();
526 	commit_transaction->t_log_start = journal->j_head;
527 	wake_up(&journal->j_wait_transaction_locked);
528 	write_unlock(&journal->j_state_lock);
529 
530 	jbd_debug(3, "JBD2: commit phase 2a\n");
531 
532 	/*
533 	 * Now start flushing things to disk, in the order they appear
534 	 * on the transaction lists.  Data blocks go first.
535 	 */
536 	err = journal_submit_data_buffers(journal, commit_transaction);
537 	if (err)
538 		jbd2_journal_abort(journal, err);
539 
540 	blk_start_plug(&plug);
541 	jbd2_journal_write_revoke_records(commit_transaction, &log_bufs);
542 
543 	jbd_debug(3, "JBD2: commit phase 2b\n");
544 
545 	/*
546 	 * Way to go: we have now written out all of the data for a
547 	 * transaction!  Now comes the tricky part: we need to write out
548 	 * metadata.  Loop over the transaction's entire buffer list:
549 	 */
550 	write_lock(&journal->j_state_lock);
551 	commit_transaction->t_state = T_COMMIT;
552 	write_unlock(&journal->j_state_lock);
553 
554 	trace_jbd2_commit_logging(journal, commit_transaction);
555 	stats.run.rs_logging = jiffies;
556 	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
557 					       stats.run.rs_logging);
558 	stats.run.rs_blocks =
559 		atomic_read(&commit_transaction->t_outstanding_credits);
560 	stats.run.rs_blocks_logged = 0;
561 
562 	J_ASSERT(commit_transaction->t_nr_buffers <=
563 		 atomic_read(&commit_transaction->t_outstanding_credits));
564 
565 	err = 0;
566 	bufs = 0;
567 	descriptor = NULL;
568 	while (commit_transaction->t_buffers) {
569 
570 		/* Find the next buffer to be journaled... */
571 
572 		jh = commit_transaction->t_buffers;
573 
574 		/* If we're in abort mode, we just un-journal the buffer and
575 		   release it. */
576 
577 		if (is_journal_aborted(journal)) {
578 			clear_buffer_jbddirty(jh2bh(jh));
579 			JBUFFER_TRACE(jh, "journal is aborting: refile");
580 			jbd2_buffer_abort_trigger(jh,
581 						  jh->b_frozen_data ?
582 						  jh->b_frozen_triggers :
583 						  jh->b_triggers);
584 			jbd2_journal_refile_buffer(journal, jh);
585 			/* If that was the last one, we need to clean up
586 			 * any descriptor buffers which may have been
587 			 * already allocated, even if we are now
588 			 * aborting. */
589 			if (!commit_transaction->t_buffers)
590 				goto start_journal_io;
591 			continue;
592 		}
593 
594 		/* Make sure we have a descriptor block in which to
595 		   record the metadata buffer. */
596 
597 		if (!descriptor) {
598 			J_ASSERT (bufs == 0);
599 
600 			jbd_debug(4, "JBD2: get descriptor\n");
601 
602 			descriptor = jbd2_journal_get_descriptor_buffer(
603 							commit_transaction,
604 							JBD2_DESCRIPTOR_BLOCK);
605 			if (!descriptor) {
606 				jbd2_journal_abort(journal, -EIO);
607 				continue;
608 			}
609 
610 			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
611 				(unsigned long long)descriptor->b_blocknr,
612 				descriptor->b_data);
613 			tagp = &descriptor->b_data[sizeof(journal_header_t)];
614 			space_left = descriptor->b_size -
615 						sizeof(journal_header_t);
616 			first_tag = 1;
617 			set_buffer_jwrite(descriptor);
618 			set_buffer_dirty(descriptor);
619 			wbuf[bufs++] = descriptor;
620 
621 			/* Record it so that we can wait for IO
622                            completion later */
623 			BUFFER_TRACE(descriptor, "ph3: file as descriptor");
624 			jbd2_file_log_bh(&log_bufs, descriptor);
625 		}
626 
627 		/* Where is the buffer to be written? */
628 
629 		err = jbd2_journal_next_log_block(journal, &blocknr);
630 		/* If the block mapping failed, just abandon the buffer
631 		   and repeat this loop: we'll fall into the
632 		   refile-on-abort condition above. */
633 		if (err) {
634 			jbd2_journal_abort(journal, err);
635 			continue;
636 		}
637 
638 		/*
639 		 * start_this_handle() uses t_outstanding_credits to determine
640 		 * the free space in the log, but this counter is changed
641 		 * by jbd2_journal_next_log_block() also.
642 		 */
643 		atomic_dec(&commit_transaction->t_outstanding_credits);
644 
645 		/* Bump b_count to prevent truncate from stumbling over
646                    the shadowed buffer!  @@@ This can go if we ever get
647                    rid of the shadow pairing of buffers. */
648 		atomic_inc(&jh2bh(jh)->b_count);
649 
650 		/*
651 		 * Make a temporary IO buffer with which to write it out
652 		 * (this will requeue the metadata buffer to BJ_Shadow).
653 		 */
654 		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
655 		JBUFFER_TRACE(jh, "ph3: write metadata");
656 		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
657 						jh, &wbuf[bufs], blocknr);
658 		if (flags < 0) {
659 			jbd2_journal_abort(journal, flags);
660 			continue;
661 		}
662 		jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
663 
664 		/* Record the new block's tag in the current descriptor
665                    buffer */
666 
667 		tag_flag = 0;
668 		if (flags & 1)
669 			tag_flag |= JBD2_FLAG_ESCAPE;
670 		if (!first_tag)
671 			tag_flag |= JBD2_FLAG_SAME_UUID;
672 
673 		tag = (journal_block_tag_t *) tagp;
674 		write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
675 		tag->t_flags = cpu_to_be16(tag_flag);
676 		jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
677 					commit_transaction->t_tid);
678 		tagp += tag_bytes;
679 		space_left -= tag_bytes;
680 		bufs++;
681 
682 		if (first_tag) {
683 			memcpy (tagp, journal->j_uuid, 16);
684 			tagp += 16;
685 			space_left -= 16;
686 			first_tag = 0;
687 		}
688 
689 		/* If there's no more to do, or if the descriptor is full,
690 		   let the IO rip! */
691 
692 		if (bufs == journal->j_wbufsize ||
693 		    commit_transaction->t_buffers == NULL ||
694 		    space_left < tag_bytes + 16 + csum_size) {
695 
696 			jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
697 
698 			/* Write an end-of-descriptor marker before
699                            submitting the IOs.  "tag" still points to
700                            the last tag we set up. */
701 
702 			tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
703 
704 			jbd2_descriptor_block_csum_set(journal, descriptor);
705 start_journal_io:
706 			for (i = 0; i < bufs; i++) {
707 				struct buffer_head *bh = wbuf[i];
708 				/*
709 				 * Compute checksum.
710 				 */
711 				if (jbd2_has_feature_checksum(journal)) {
712 					crc32_sum =
713 					    jbd2_checksum_data(crc32_sum, bh);
714 				}
715 
716 				lock_buffer(bh);
717 				clear_buffer_dirty(bh);
718 				set_buffer_uptodate(bh);
719 				bh->b_end_io = journal_end_buffer_io_sync;
720 				submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh);
721 			}
722 			cond_resched();
723 			stats.run.rs_blocks_logged += bufs;
724 
725 			/* Force a new descriptor to be generated next
726                            time round the loop. */
727 			descriptor = NULL;
728 			bufs = 0;
729 		}
730 	}
731 
732 	err = journal_finish_inode_data_buffers(journal, commit_transaction);
733 	if (err) {
734 		printk(KERN_WARNING
735 			"JBD2: Detected IO errors while flushing file data "
736 		       "on %s\n", journal->j_devname);
737 		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
738 			jbd2_journal_abort(journal, err);
739 		err = 0;
740 	}
741 
742 	/*
743 	 * Get current oldest transaction in the log before we issue flush
744 	 * to the filesystem device. After the flush we can be sure that
745 	 * blocks of all older transactions are checkpointed to persistent
746 	 * storage and we will be safe to update journal start in the
747 	 * superblock with the numbers we get here.
748 	 */
749 	update_tail =
750 		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
751 
752 	write_lock(&journal->j_state_lock);
753 	if (update_tail) {
754 		long freed = first_block - journal->j_tail;
755 
756 		if (first_block < journal->j_tail)
757 			freed += journal->j_last - journal->j_first;
758 		/* Update tail only if we free significant amount of space */
759 		if (freed < journal->j_maxlen / 4)
760 			update_tail = 0;
761 	}
762 	J_ASSERT(commit_transaction->t_state == T_COMMIT);
763 	commit_transaction->t_state = T_COMMIT_DFLUSH;
764 	write_unlock(&journal->j_state_lock);
765 
766 	/*
767 	 * If the journal is not located on the file system device,
768 	 * then we must flush the file system device before we issue
769 	 * the commit record
770 	 */
771 	if (commit_transaction->t_need_data_flush &&
772 	    (journal->j_fs_dev != journal->j_dev) &&
773 	    (journal->j_flags & JBD2_BARRIER))
774 		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
775 
776 	/* Done it all: now write the commit record asynchronously. */
777 	if (jbd2_has_feature_async_commit(journal)) {
778 		err = journal_submit_commit_record(journal, commit_transaction,
779 						 &cbh, crc32_sum);
780 		if (err)
781 			__jbd2_journal_abort_hard(journal);
782 	}
783 
784 	blk_finish_plug(&plug);
785 
786 	/* Lo and behold: we have just managed to send a transaction to
787            the log.  Before we can commit it, wait for the IO so far to
788            complete.  Control buffers being written are on the
789            transaction's t_log_list queue, and metadata buffers are on
790            the io_bufs list.
791 
792 	   Wait for the buffers in reverse order.  That way we are
793 	   less likely to be woken up until all IOs have completed, and
794 	   so we incur less scheduling load.
795 	*/
796 
797 	jbd_debug(3, "JBD2: commit phase 3\n");
798 
799 	while (!list_empty(&io_bufs)) {
800 		struct buffer_head *bh = list_entry(io_bufs.prev,
801 						    struct buffer_head,
802 						    b_assoc_buffers);
803 
804 		wait_on_buffer(bh);
805 		cond_resched();
806 
807 		if (unlikely(!buffer_uptodate(bh)))
808 			err = -EIO;
809 		jbd2_unfile_log_bh(bh);
810 
811 		/*
812 		 * The list contains temporary buffer heads created by
813 		 * jbd2_journal_write_metadata_buffer().
814 		 */
815 		BUFFER_TRACE(bh, "dumping temporary bh");
816 		__brelse(bh);
817 		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
818 		free_buffer_head(bh);
819 
820 		/* We also have to refile the corresponding shadowed buffer */
821 		jh = commit_transaction->t_shadow_list->b_tprev;
822 		bh = jh2bh(jh);
823 		clear_buffer_jwrite(bh);
824 		J_ASSERT_BH(bh, buffer_jbddirty(bh));
825 		J_ASSERT_BH(bh, !buffer_shadow(bh));
826 
827 		/* The metadata is now released for reuse, but we need
828                    to remember it against this transaction so that when
829                    we finally commit, we can do any checkpointing
830                    required. */
831 		JBUFFER_TRACE(jh, "file as BJ_Forget");
832 		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
833 		JBUFFER_TRACE(jh, "brelse shadowed buffer");
834 		__brelse(bh);
835 	}
836 
837 	J_ASSERT (commit_transaction->t_shadow_list == NULL);
838 
839 	jbd_debug(3, "JBD2: commit phase 4\n");
840 
841 	/* Here we wait for the revoke record and descriptor record buffers */
842 	while (!list_empty(&log_bufs)) {
843 		struct buffer_head *bh;
844 
845 		bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
846 		wait_on_buffer(bh);
847 		cond_resched();
848 
849 		if (unlikely(!buffer_uptodate(bh)))
850 			err = -EIO;
851 
852 		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
853 		clear_buffer_jwrite(bh);
854 		jbd2_unfile_log_bh(bh);
855 		__brelse(bh);		/* One for getblk */
856 		/* AKPM: bforget here */
857 	}
858 
859 	if (err)
860 		jbd2_journal_abort(journal, err);
861 
862 	jbd_debug(3, "JBD2: commit phase 5\n");
863 	write_lock(&journal->j_state_lock);
864 	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
865 	commit_transaction->t_state = T_COMMIT_JFLUSH;
866 	write_unlock(&journal->j_state_lock);
867 
868 	if (!jbd2_has_feature_async_commit(journal)) {
869 		err = journal_submit_commit_record(journal, commit_transaction,
870 						&cbh, crc32_sum);
871 		if (err)
872 			__jbd2_journal_abort_hard(journal);
873 	}
874 	if (cbh)
875 		err = journal_wait_on_commit_record(journal, cbh);
876 	if (jbd2_has_feature_async_commit(journal) &&
877 	    journal->j_flags & JBD2_BARRIER) {
878 		blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
879 	}
880 
881 	if (err)
882 		jbd2_journal_abort(journal, err);
883 
884 	/*
885 	 * Now disk caches for filesystem device are flushed so we are safe to
886 	 * erase checkpointed transactions from the log by updating journal
887 	 * superblock.
888 	 */
889 	if (update_tail)
890 		jbd2_update_log_tail(journal, first_tid, first_block);
891 
892 	/* End of a transaction!  Finally, we can do checkpoint
893            processing: any buffers committed as a result of this
894            transaction can be removed from any checkpoint list it was on
895            before. */
896 
897 	jbd_debug(3, "JBD2: commit phase 6\n");
898 
899 	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
900 	J_ASSERT(commit_transaction->t_buffers == NULL);
901 	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
902 	J_ASSERT(commit_transaction->t_shadow_list == NULL);
903 
904 restart_loop:
905 	/*
906 	 * As there are other places (journal_unmap_buffer()) adding buffers
907 	 * to this list we have to be careful and hold the j_list_lock.
908 	 */
909 	spin_lock(&journal->j_list_lock);
910 	while (commit_transaction->t_forget) {
911 		transaction_t *cp_transaction;
912 		struct buffer_head *bh;
913 		int try_to_free = 0;
914 
915 		jh = commit_transaction->t_forget;
916 		spin_unlock(&journal->j_list_lock);
917 		bh = jh2bh(jh);
918 		/*
919 		 * Get a reference so that bh cannot be freed before we are
920 		 * done with it.
921 		 */
922 		get_bh(bh);
923 		jbd_lock_bh_state(bh);
924 		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction);
925 
926 		/*
927 		 * If there is undo-protected committed data against
928 		 * this buffer, then we can remove it now.  If it is a
929 		 * buffer needing such protection, the old frozen_data
930 		 * field now points to a committed version of the
931 		 * buffer, so rotate that field to the new committed
932 		 * data.
933 		 *
934 		 * Otherwise, we can just throw away the frozen data now.
935 		 *
936 		 * We also know that the frozen data has already fired
937 		 * its triggers if they exist, so we can clear that too.
938 		 */
939 		if (jh->b_committed_data) {
940 			jbd2_free(jh->b_committed_data, bh->b_size);
941 			jh->b_committed_data = NULL;
942 			if (jh->b_frozen_data) {
943 				jh->b_committed_data = jh->b_frozen_data;
944 				jh->b_frozen_data = NULL;
945 				jh->b_frozen_triggers = NULL;
946 			}
947 		} else if (jh->b_frozen_data) {
948 			jbd2_free(jh->b_frozen_data, bh->b_size);
949 			jh->b_frozen_data = NULL;
950 			jh->b_frozen_triggers = NULL;
951 		}
952 
953 		spin_lock(&journal->j_list_lock);
954 		cp_transaction = jh->b_cp_transaction;
955 		if (cp_transaction) {
956 			JBUFFER_TRACE(jh, "remove from old cp transaction");
957 			cp_transaction->t_chp_stats.cs_dropped++;
958 			__jbd2_journal_remove_checkpoint(jh);
959 		}
960 
961 		/* Only re-checkpoint the buffer_head if it is marked
962 		 * dirty.  If the buffer was added to the BJ_Forget list
963 		 * by jbd2_journal_forget, it may no longer be dirty and
964 		 * there's no point in keeping a checkpoint record for
965 		 * it. */
966 
967 		/*
968 		* A buffer which has been freed while still being journaled by
969 		* a previous transaction.
970 		*/
971 		if (buffer_freed(bh)) {
972 			/*
973 			 * If the running transaction is the one containing
974 			 * "add to orphan" operation (b_next_transaction !=
975 			 * NULL), we have to wait for that transaction to
976 			 * commit before we can really get rid of the buffer.
977 			 * So just clear b_modified to not confuse transaction
978 			 * credit accounting and refile the buffer to
979 			 * BJ_Forget of the running transaction. If the just
980 			 * committed transaction contains "add to orphan"
981 			 * operation, we can completely invalidate the buffer
982 			 * now. We are rather thorough in that since
983 			 * buffer may be still accessible when blocksize <
984 			 * pagesize and it is attached to the last partial
985 			 * page.
986 			 */
987 			jh->b_modified = 0;
988 			if (!jh->b_next_transaction) {
989 				clear_buffer_freed(bh);
990 				clear_buffer_jbddirty(bh);
991 				clear_buffer_mapped(bh);
992 				clear_buffer_new(bh);
993 				clear_buffer_req(bh);
994 				bh->b_bdev = NULL;
995 			}
996 		}
997 
998 		if (buffer_jbddirty(bh)) {
999 			JBUFFER_TRACE(jh, "add to new checkpointing trans");
1000 			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
1001 			if (is_journal_aborted(journal))
1002 				clear_buffer_jbddirty(bh);
1003 		} else {
1004 			J_ASSERT_BH(bh, !buffer_dirty(bh));
1005 			/*
1006 			 * The buffer on BJ_Forget list and not jbddirty means
1007 			 * it has been freed by this transaction and hence it
1008 			 * could not have been reallocated until this
1009 			 * transaction has committed. *BUT* it could be
1010 			 * reallocated once we have written all the data to
1011 			 * disk and before we process the buffer on BJ_Forget
1012 			 * list.
1013 			 */
1014 			if (!jh->b_next_transaction)
1015 				try_to_free = 1;
1016 		}
1017 		JBUFFER_TRACE(jh, "refile or unfile buffer");
1018 		__jbd2_journal_refile_buffer(jh);
1019 		jbd_unlock_bh_state(bh);
1020 		if (try_to_free)
1021 			release_buffer_page(bh);	/* Drops bh reference */
1022 		else
1023 			__brelse(bh);
1024 		cond_resched_lock(&journal->j_list_lock);
1025 	}
1026 	spin_unlock(&journal->j_list_lock);
1027 	/*
1028 	 * This is a bit sleazy.  We use j_list_lock to protect transition
1029 	 * of a transaction into T_FINISHED state and calling
1030 	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
1031 	 * other checkpointing code processing the transaction...
1032 	 */
1033 	write_lock(&journal->j_state_lock);
1034 	spin_lock(&journal->j_list_lock);
1035 	/*
1036 	 * Now recheck if some buffers did not get attached to the transaction
1037 	 * while the lock was dropped...
1038 	 */
1039 	if (commit_transaction->t_forget) {
1040 		spin_unlock(&journal->j_list_lock);
1041 		write_unlock(&journal->j_state_lock);
1042 		goto restart_loop;
1043 	}
1044 
1045 	/* Add the transaction to the checkpoint list
1046 	 * __journal_remove_checkpoint() can not destroy transaction
1047 	 * under us because it is not marked as T_FINISHED yet */
1048 	if (journal->j_checkpoint_transactions == NULL) {
1049 		journal->j_checkpoint_transactions = commit_transaction;
1050 		commit_transaction->t_cpnext = commit_transaction;
1051 		commit_transaction->t_cpprev = commit_transaction;
1052 	} else {
1053 		commit_transaction->t_cpnext =
1054 			journal->j_checkpoint_transactions;
1055 		commit_transaction->t_cpprev =
1056 			commit_transaction->t_cpnext->t_cpprev;
1057 		commit_transaction->t_cpnext->t_cpprev =
1058 			commit_transaction;
1059 		commit_transaction->t_cpprev->t_cpnext =
1060 				commit_transaction;
1061 	}
1062 	spin_unlock(&journal->j_list_lock);
1063 
1064 	/* Done with this transaction! */
1065 
1066 	jbd_debug(3, "JBD2: commit phase 7\n");
1067 
1068 	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
1069 
1070 	commit_transaction->t_start = jiffies;
1071 	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
1072 					      commit_transaction->t_start);
1073 
1074 	/*
1075 	 * File the transaction statistics
1076 	 */
1077 	stats.ts_tid = commit_transaction->t_tid;
1078 	stats.run.rs_handle_count =
1079 		atomic_read(&commit_transaction->t_handle_count);
1080 	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1081 			     commit_transaction->t_tid, &stats.run);
1082 	stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
1083 
1084 	commit_transaction->t_state = T_COMMIT_CALLBACK;
1085 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
1086 	journal->j_commit_sequence = commit_transaction->t_tid;
1087 	journal->j_committing_transaction = NULL;
1088 	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1089 
1090 	/*
1091 	 * weight the commit time higher than the average time so we don't
1092 	 * react too strongly to vast changes in the commit time
1093 	 */
1094 	if (likely(journal->j_average_commit_time))
1095 		journal->j_average_commit_time = (commit_time +
1096 				journal->j_average_commit_time*3) / 4;
1097 	else
1098 		journal->j_average_commit_time = commit_time;
1099 
1100 	write_unlock(&journal->j_state_lock);
1101 
1102 	if (journal->j_commit_callback)
1103 		journal->j_commit_callback(journal, commit_transaction);
1104 
1105 	trace_jbd2_end_commit(journal, commit_transaction);
1106 	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1107 		  journal->j_commit_sequence, journal->j_tail_sequence);
1108 
1109 	write_lock(&journal->j_state_lock);
1110 	spin_lock(&journal->j_list_lock);
1111 	commit_transaction->t_state = T_FINISHED;
1112 	/* Check if the transaction can be dropped now that we are finished */
1113 	if (commit_transaction->t_checkpoint_list == NULL &&
1114 	    commit_transaction->t_checkpoint_io_list == NULL) {
1115 		__jbd2_journal_drop_transaction(journal, commit_transaction);
1116 		jbd2_journal_free_transaction(commit_transaction);
1117 	}
1118 	spin_unlock(&journal->j_list_lock);
1119 	write_unlock(&journal->j_state_lock);
1120 	wake_up(&journal->j_wait_done_commit);
1121 
1122 	/*
1123 	 * Calculate overall stats
1124 	 */
1125 	spin_lock(&journal->j_history_lock);
1126 	journal->j_stats.ts_tid++;
1127 	journal->j_stats.ts_requested += stats.ts_requested;
1128 	journal->j_stats.run.rs_wait += stats.run.rs_wait;
1129 	journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1130 	journal->j_stats.run.rs_running += stats.run.rs_running;
1131 	journal->j_stats.run.rs_locked += stats.run.rs_locked;
1132 	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1133 	journal->j_stats.run.rs_logging += stats.run.rs_logging;
1134 	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1135 	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1136 	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1137 	spin_unlock(&journal->j_history_lock);
1138 }
1139