/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <trace/events/jbd2.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	unlock_buffer(bh);
}

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct journal_head *descriptor;
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	int barrier_done = 0;
	struct timespec now = current_kernel_time();

	if (is_journal_aborted(journal))
		return 0;

	descriptor = jbd2_journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	tmp = (struct commit_header *)bh->b_data;
	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (JBD2_HAS_COMPAT_FEATURE(journal,
				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
		tmp->h_chksum_type	= JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size	= JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0]	= cpu_to_be32(crc32_sum);
	}

	JBUFFER_TRACE(descriptor, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	if (journal->j_flags & JBD2_BARRIER &&
	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		set_buffer_ordered(bh);
		barrier_done = 1;
	}
	ret = submit_bh(WRITE_SYNC_PLUG, bh);
	if (barrier_done)
		clear_buffer_ordered(bh);

	/* Is it possible for another commit to fail at roughly
	 * the same time as this one?  If so, we don't want to
	 * trust the barrier flag in the super, but instead want
	 * to remember if we sent a barrier request
	 */
	if (ret == -EOPNOTSUPP && barrier_done) {
		printk(KERN_WARNING
		       "JBD: barrier-based sync failed on %s - "
		       "disabling barriers\n", journal->j_devname);
		spin_lock(&journal->j_state_lock);
		journal->j_flags &= ~JBD2_BARRIER;
		spin_unlock(&journal->j_state_lock);

		/* And try again, without the barrier */
		lock_buffer(bh);
		set_buffer_uptodate(bh);
		clear_buffer_dirty(bh);
		ret = submit_bh(WRITE_SYNC_PLUG, bh);
	}
	*cbh = bh;
	return ret;
}
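
/*
 * A minimal sketch of how a reader (say, replay or an offline tool)
 * might recognise the commit block written above.  The helper below is
 * hypothetical, not part of this file; the field layout follows struct
 * commit_header in <linux/jbd2.h>, and "buf" is assumed to hold one
 * journal block:
 *
 *	static int is_commit_block(const char *buf, tid_t expected_tid)
 *	{
 *		const struct commit_header *h =
 *			(const struct commit_header *)buf;
 *
 *		return be32_to_cpu(h->h_magic) == JBD2_MAGIC_NUMBER &&
 *		       be32_to_cpu(h->h_blocktype) == JBD2_COMMIT_BLOCK &&
 *		       be32_to_cpu(h->h_sequence) == expected_tid;
 *	}
 */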

/*
 * This function, together with journal_submit_commit_record(), allows
 * the commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

retry:
	clear_buffer_dirty(bh);
	wait_on_buffer(bh);
	if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
		printk(KERN_WARNING
		       "JBD2: wait_on_commit_record: sync failed on %s - "
		       "disabling barriers\n", journal->j_devname);
		spin_lock(&journal->j_state_lock);
		journal->j_flags &= ~JBD2_BARRIER;
		spin_unlock(&journal->j_state_lock);

		lock_buffer(bh);
		clear_buffer_dirty(bh);
		set_buffer_uptodate(bh);
		bh->b_end_io = journal_end_buffer_io_sync;

		ret = submit_bh(WRITE_SYNC_PLUG, bh);
		if (ret) {
			unlock_buffer(bh);
			return ret;
		}
		goto retry;
	}

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);            /* One for getblk() */
	jbd2_journal_put_journal_head(bh2jh(bh));

	return ret;
}
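
/*
 * Typical pairing, as used by jbd2_journal_commit_transaction() below:
 * the commit record is submitted (with the running checksum) and only
 * waited upon once the rest of the journal IO has been issued:
 *
 *	err = journal_submit_commit_record(journal, commit_transaction,
 *					   &cbh, crc32_sum);
 *	...
 *	if (!err && !is_journal_aborted(journal))
 *		err = journal_wait_on_commit_record(journal, cbh);
 */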

/*
 * Write the filemap data using the ->writepage() address_space operation.
 * We don't do block allocation here, even for delalloc.  We don't use
 * writepages() because, with delayed allocation, writepages() may end up
 * doing block allocation.
 */
static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = 0,
		.range_end = i_size_read(mapping->host),
	};

	ret = generic_writepages(mapping, &wbc);
	return ret;
}
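
/*
 * Design note: going through filemap_fdatawrite() here would invoke the
 * filesystem's ->writepages(), which under delayed allocation may
 * allocate blocks during commit; generic_writepages() sticks to
 * ->writepage() and so only writes blocks that are already mapped.
 */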

/*
 * Submit all the data buffers of inodes associated with the transaction
 * to disk.
 *
 * We are in a committing transaction, so no new inode can be added to our
 * inode list.  We use the JI_COMMIT_RUNNING flag to protect the inode we
 * are currently operating on from being released while we write out its
 * pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		mapping = jinode->i_vfs_inode->i_mapping;
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		/*
		 * Submit the inode's data buffers.  We use ->writepage()
		 * instead of writepages() because writepages() can do
		 * block allocation with delalloc; we only want to write
		 * blocks that are already allocated.
		 */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		err = journal_submit_inode_data_buffers(mapping);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		commit_transaction->t_flushed_data_blocks = 1;
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}
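
/*
 * A rough sketch of the other side of the JI_COMMIT_RUNNING handshake
 * (cf. jbd2_journal_release_jbd_inode() in journal.c), using the
 * standard wait-bit helpers:
 *
 *	DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
 *	wait_queue_head_t *wq = bit_waitqueue(&jinode->i_flags,
 *					      __JI_COMMIT_RUNNING);
 *
 *	prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
 *	if (jinode->i_flags & JI_COMMIT_RUNNING)
 *		schedule();
 *	finish_wait(wq, &wait.wait);
 */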

/*
 * Wait for data submitted for writeout, and refile inodes to the proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		if (err) {
			/*
			 * Because AS_EIO is cleared by
			 * filemap_fdatawait_range(), set it again so
			 * that the user process can get -EIO from fsync().
			 */
			set_bit(AS_EIO,
				&jinode->i_vfs_inode->i_mapping->flags);

			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inodes to the proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

	addr = kmap_atomic(page, KM_USER0);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr, KM_USER0);

	return checksum;
}
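
/*
 * The checksum is chained block to block.  A minimal sketch of how the
 * commit path below folds every submitted journal block into a single
 * running value, seeded with ~0, which finally lands in the commit
 * record's h_chksum[0]:
 *
 *	__u32 sum = ~0;
 *
 *	for (i = 0; i < bufs; i++)
 *		sum = jbd2_checksum_data(sum, wbuf[i]);
 *	journal_submit_commit_record(journal, commit_transaction,
 *				     &cbh, sum);
 */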

static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
				   unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (tag_bytes > JBD2_TAG_SIZE32)
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
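
/*
 * Worked example: with 64-bit tags, block 0x123456789 is stored as
 * t_blocknr = 0x23456789 (the low 32 bits) and t_blocknr_high = 0x1.
 * The (block >> 31) >> 1 form shifts right by 32 in two steps, which
 * stays well defined even if the value were only 32 bits wide.
 */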

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i, to_free = 0;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	int write_op = WRITE;

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

#ifdef COMMIT_STATS
	spin_lock(&journal->j_list_lock);
	summarise_journal_usage(journal);
	spin_unlock(&journal->j_list_lock);
#endif

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		jbd2_journal_update_superblock(journal, 1);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	trace_jbd2_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

	/*
	 * Use plugged writes here, since we want to submit several before
	 * we unplug the device. We don't do explicit unplugging in here,
	 * instead we rely on sync_buffer() doing the unplug for us.
	 */
	if (commit_transaction->t_synchronous_commit)
		write_op = WRITE_SYNC_PLUG;
	trace_jbd2_commit_locking(journal, commit_transaction);
	stats.run.rs_wait = commit_transaction->t_max_wait;
	stats.run.rs_locked = jiffies;
	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
					      stats.run.rs_locked);

	spin_lock(&commit_transaction->t_handle_lock);
	while (commit_transaction->t_updates) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (commit_transaction->t_updates) {
			spin_unlock(&commit_transaction->t_handle_lock);
			spin_unlock(&journal->j_state_lock);
			schedule();
			spin_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);
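
	/*
	 * The matching wakeup lives in jbd2_journal_stop(); roughly:
	 *
	 *	spin_lock(&transaction->t_handle_lock);
	 *	transaction->t_updates--;
	 *	if (!transaction->t_updates)
	 *		wake_up(&journal->j_wait_updates);
	 *
	 * so the loop above can only exit once every open handle on this
	 * transaction has been stopped.
	 */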

	J_ASSERT (commit_transaction->t_outstanding_credits <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug (3, "JBD: commit phase 1\n");

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	trace_jbd2_commit_flushing(journal, commit_transaction);
	stats.run.rs_flushing = jiffies;
	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
					     stats.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	spin_unlock(&journal->j_state_lock);

	jbd_debug (3, "JBD: commit phase 2a\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	jbd2_journal_write_revoke_records(journal, commit_transaction,
					  write_op);

	jbd_debug(3, "JBD: commit phase 2b\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	spin_unlock(&journal->j_state_lock);

	trace_jbd2_commit_logging(journal, commit_transaction);
	stats.run.rs_logging = jiffies;
	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
					       stats.run.rs_logging);
	stats.run.rs_blocks = commit_transaction->t_outstanding_credits;
	stats.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 commit_transaction->t_outstanding_credits);

	err = 0;
	descriptor = NULL;
	bufs = 0;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				(unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
			   completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			jbd2_journal_file_buffer(descriptor, commit_transaction,
					BJ_LogCtl);
		}
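
		/*
		 * The descriptor block being filled is laid out as
		 * (sketch):
		 *
		 *	journal_header_t		magic, blocktype, tid
		 *	journal_block_tag_t tag0	+ 16-byte journal UUID
		 *					  after the first tag
		 *	journal_block_tag_t tag1 ...
		 *
		 * The final tag written below is flagged with
		 * JBD2_FLAG_LAST_TAG.
		 */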

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by jbd2_journal_next_log_block() also.
		 */
		commit_transaction->t_outstanding_credits--;

		/* Bump b_count to prevent truncate from stumbling over
		   the shadowed buffer!  @@@ This can go if we ever get
		   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/* Make a temporary IO buffer with which to write it out
		   (this will requeue both the metadata buffer and the
		   temporary IO buffer). new_bh goes on BJ_IO */

		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		/*
		 * akpm: jbd2_journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						      jh, &new_jh, blocknr);
		if (flags < 0) {
			jbd2_journal_abort(journal, flags);
			continue;
		}
		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
		   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be32(tag_flag);
		tagp += tag_bytes;
		space_left -= tag_bytes;

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16) {

			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
			   submitting the IOs.  "tag" still points to
			   the last tag we set up. */

			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);

start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (JBD2_HAS_COMPAT_FEATURE(journal,
					JBD2_FEATURE_COMPAT_CHECKSUM)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(write_op, bh);
			}
			cond_resched();
			stats.run.rs_blocks_logged += bufs;

			/* Force a new descriptor to be generated next
			   time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	/*
	 * If the journal is not located on the file system device,
	 * then we must flush the file system device before we issue
	 * the commit record
	 */
	if (commit_transaction->t_flushed_data_blocks &&
	    (journal->j_fs_dev != journal->j_dev) &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev, NULL);

	/* Done it all: now write the commit record asynchronously. */
	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						 &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
		if (journal->j_flags & JBD2_BARRIER)
			blkdev_issue_flush(journal->j_dev, NULL);
	}

	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
			"JBD2: Detected IO errors while flushing file data "
		       "on %s\n", journal->j_devname);
		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
			jbd2_journal_abort(journal, err);
		err = 0;
	}

	/* Lo and behold: we have just managed to send a transaction to
	   the log.  Before we can commit it, wait for the IO so far to
	   complete.  Control buffers being written are on the
	   transaction's t_log_list queue, and metadata buffers are on
	   the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD: commit phase 3\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		jbd2_journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
		   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_bit(BH_JWrite, &bh->b_state);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/* Wake up any transactions which were waiting for this
		   IO to complete */
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_journal_unfile_buffer(journal, jh);
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd_debug(3, "JBD: commit phase 5\n");

	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	if (!err && !is_journal_aborted(journal))
		err = journal_wait_on_commit_record(journal, cbh);

	if (err)
		jbd2_journal_abort(journal, err);

	/* End of a transaction!  Finally, we can do checkpoint
	   processing: any buffers committed as a result of this
	   transaction can be removed from any checkpoint list they
	   were on before. */

	jbd_debug(3, "JBD: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
			jh->b_transaction == journal->j_running_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/* A buffer which has been freed while still being
		 * journaled by a previous transaction may end up still
		 * being dirty here, but we want to avoid writing back
		 * that buffer in the future now that the last use has
		 * been committed.  That's not only a performance gain,
		 * it also stops aliasing problems if the buffer is left
		 * behind for writeback and gets reallocated for another
		 * use in a different page. */
		if (buffer_freed(bh)) {
			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
			__jbd2_journal_refile_buffer(jh);
			jbd_unlock_bh_state(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/* A buffer on the BJ_Forget list that is not
			 * jbddirty has been freed by this transaction,
			 * and hence it could not have been reallocated
			 * while the transaction was committing.  *BUT*
			 * it could be reallocated once we have written
			 * all the data to disk and before we process
			 * the buffer on the BJ_Forget list. */
			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
			__jbd2_journal_refile_buffer(jh);
			if (!jh->b_transaction) {
				jbd_unlock_bh_state(bh);
				 /* needs a brelse */
				jbd2_journal_remove_journal_head(bh);
				release_buffer_page(bh);
			} else
				jbd_unlock_bh_state(bh);
		}
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	spin_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		spin_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count = commit_transaction->t_handle_count;
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);

	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * Weight the existing average more heavily than the new commit
	 * time so we don't react too strongly to vast changes in the
	 * commit time.
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;
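
	/*
	 * For example, with a previous average of 8ms and a 4ms commit,
	 * the new average is (4 + 3*8) / 4 = 7ms: a 3:1 exponentially
	 * weighted moving average that damps sudden swings.
	 */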
	spin_unlock(&journal->j_state_lock);

	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		to_free = 1;
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);

	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);
	if (to_free)
		kfree(commit_transaction);

	wake_up(&journal->j_wait_done_commit);
}