1 /*
2  * linux/fs/jbd2/commit.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15 
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd2.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
21 #include <linux/mm.h>
22 #include <linux/pagemap.h>
23 
24 /*
25  * Default IO end handler for temporary BJ_IO buffer_heads.
26  */
27 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
28 {
29 	BUFFER_TRACE(bh, "");
30 	if (uptodate)
31 		set_buffer_uptodate(bh);
32 	else
33 		clear_buffer_uptodate(bh);
34 	unlock_buffer(bh);
35 }
36 
37 /*
38  * When an ext4-ordered file is truncated, it is possible that many pages are
39  * not successfully freed, because they are attached to a committing transaction.
40  * After the transaction commits, these pages are left on the LRU, with no
41  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
42  * by the VM, but their apparent absence upsets the VM accounting, and it makes
43  * the numbers in /proc/meminfo look odd.
44  *
45  * So here, we have a buffer which has just come off the forget list.  Look to
46  * see if we can strip all buffers from the backing page.
47  *
48  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
49  * caller provided us with a ref against the buffer, and we drop that here.
50  */
51 static void release_buffer_page(struct buffer_head *bh)
52 {
53 	struct page *page;
54 
55 	if (buffer_dirty(bh))
56 		goto nope;
57 	if (atomic_read(&bh->b_count) != 1)
58 		goto nope;
59 	page = bh->b_page;
60 	if (!page)
61 		goto nope;
62 	if (page->mapping)
63 		goto nope;
64 
65 	/* OK, it's a truncated page */
66 	if (TestSetPageLocked(page))
67 		goto nope;
68 
69 	page_cache_get(page);
70 	__brelse(bh);
71 	try_to_free_buffers(page);
72 	unlock_page(page);
73 	page_cache_release(page);
74 	return;
75 
76 nope:
77 	__brelse(bh);
78 }
79 
80 /*
81  * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
82  * held.  For ranking reasons we must trylock.  If we lose, schedule away and
83  * return 0.  j_list_lock is dropped in this case.
84  */
85 static int inverted_lock(journal_t *journal, struct buffer_head *bh)
86 {
87 	if (!jbd_trylock_bh_state(bh)) {
88 		spin_unlock(&journal->j_list_lock);
89 		schedule();
90 		return 0;
91 	}
92 	return 1;
93 }
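
/*
 * The ranking referred to above is: bh_state lock first, then j_list_lock.
 * A caller already holding j_list_lock may therefore only trylock the
 * bh_state lock, and on failure must back off and retake both locks in the
 * correct order.  An illustrative sketch of the pattern at the call sites
 * below (not a new interface):
 *
 *	if (!inverted_lock(journal, bh)) {
 *		jbd_lock_bh_state(bh);
 *		spin_lock(&journal->j_list_lock);
 *	}
 *	... both locks are held here ...
 */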
94 
95 /* Done it all: now write the commit record.  We should have
96  * cleaned up our previous buffers by now, so if we are in abort
97  * mode we can now just skip the rest of the journal write
98  * entirely.
99  *
100  * Returns 1 if the journal needs to be aborted or 0 on success
101  */
102 static int journal_write_commit_record(journal_t *journal,
103 					transaction_t *commit_transaction)
104 {
105 	struct journal_head *descriptor;
106 	struct buffer_head *bh;
107 	int i, ret;
108 	int barrier_done = 0;
109 
110 	if (is_journal_aborted(journal))
111 		return 0;
112 
113 	descriptor = jbd2_journal_get_descriptor_buffer(journal);
114 	if (!descriptor)
115 		return 1;
116 
117 	bh = jh2bh(descriptor);
118 
119 	/* Replicate the commit header into each 512-byte sector */
120 	for (i = 0; i < bh->b_size; i += 512) {
121 		journal_header_t *tmp = (journal_header_t *)(bh->b_data + i);
122 		tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
123 		tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
124 		tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
125 	}
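
	/*
	 * The loop above stamps an identical commit header at every 512-byte
	 * offset of the block, so e.g. a 4096-byte journal block carries
	 * eight copies.  The apparent intent (an inference; it is not stated
	 * in this file) is that recovery can recognise the commit record
	 * whichever sector of the block it happens to examine.
	 */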
126 
127 	JBUFFER_TRACE(descriptor, "write commit block");
128 	set_buffer_dirty(bh);
129 	if (journal->j_flags & JBD2_BARRIER) {
130 		set_buffer_ordered(bh);
131 		barrier_done = 1;
132 	}
133 	ret = sync_dirty_buffer(bh);
134 	/* is it possible for another commit to fail at roughly
135 	 * the same time as this one?  If so, we don't want to
136 	 * trust the barrier flag in the super, but instead want
137 	 * to remember if we sent a barrier request
138 	 */
139 	if (ret == -EOPNOTSUPP && barrier_done) {
140 		char b[BDEVNAME_SIZE];
141 
142 		printk(KERN_WARNING
143 			"JBD: barrier-based sync failed on %s - "
144 			"disabling barriers\n",
145 			bdevname(journal->j_dev, b));
146 		spin_lock(&journal->j_state_lock);
147 		journal->j_flags &= ~JBD2_BARRIER;
148 		spin_unlock(&journal->j_state_lock);
149 
150 		/* And try again, without the barrier */
151 		clear_buffer_ordered(bh);
152 		set_buffer_uptodate(bh);
153 		set_buffer_dirty(bh);
154 		ret = sync_dirty_buffer(bh);
155 	}
156 	put_bh(bh);		/* One for getblk() */
157 	jbd2_journal_put_journal_head(descriptor);
158 
159 	return (ret == -EIO);
160 }
161 
162 static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
163 {
164 	int i;
165 
166 	for (i = 0; i < bufs; i++) {
167 		wbuf[i]->b_end_io = end_buffer_write_sync;
168 		/* We use up our safety reference in submit_bh() */
169 		submit_bh(WRITE, wbuf[i]);
170 	}
171 }
172 
173 /*
174  *  Submit all the data buffers to disk
175  */
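
/*
 * A buffer on t_sync_datalist is handled in one of three main ways in the
 * loop below (buffers that someone else already cleaned up are simply
 * skipped): a dirty buffer is locked, queued for submission and refiled
 * onto BJ_Locked; a buffer already locked by someone else is refiled onto
 * BJ_Locked so that its IO can be waited on later; and a clean, unlocked
 * buffer has finished its writeout and is unfiled on the spot.
 */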
176 static void journal_submit_data_buffers(journal_t *journal,
177 				transaction_t *commit_transaction)
178 {
179 	struct journal_head *jh;
180 	struct buffer_head *bh;
181 	int locked;
182 	int bufs = 0;
183 	struct buffer_head **wbuf = journal->j_wbuf;
184 
185 	/*
186 	 * Whenever we unlock the journal and sleep, things can get added
187 	 * onto ->t_sync_datalist, so we have to keep looping back to
188 	 * write_out_data until we *know* that the list is empty.
189 	 *
190 	 * Cleanup any flushed data buffers from the data list.  Even in
191 	 * abort mode, we want to flush this out as soon as possible.
192 	 */
193 write_out_data:
194 	cond_resched();
195 	spin_lock(&journal->j_list_lock);
196 
197 	while (commit_transaction->t_sync_datalist) {
198 		jh = commit_transaction->t_sync_datalist;
199 		bh = jh2bh(jh);
200 		locked = 0;
201 
202 		/* Get a reference just to make sure the buffer does not
203 		 * disappear when we are forced to drop various locks */
204 		get_bh(bh);
205 		/* If the buffer is dirty, we need to submit IO and hence
206 		 * we need the buffer lock. We try to lock the buffer without
207 		 * blocking. If we fail, we need to drop j_list_lock and do
208 		 * blocking lock_buffer().
209 		 */
210 		if (buffer_dirty(bh)) {
211 			if (test_set_buffer_locked(bh)) {
212 				BUFFER_TRACE(bh, "needs blocking lock");
213 				spin_unlock(&journal->j_list_lock);
214 				/* Write out all data to prevent deadlocks */
215 				journal_do_submit_data(wbuf, bufs);
216 				bufs = 0;
217 				lock_buffer(bh);
218 				spin_lock(&journal->j_list_lock);
219 			}
220 			locked = 1;
221 		}
222 		/* We have to get bh_state lock. Again out of order, sigh. */
223 		if (!inverted_lock(journal, bh)) {
224 			jbd_lock_bh_state(bh);
225 			spin_lock(&journal->j_list_lock);
226 		}
227 		/* Someone already cleaned up the buffer? */
228 		if (!buffer_jbd(bh)
229 			|| jh->b_transaction != commit_transaction
230 			|| jh->b_jlist != BJ_SyncData) {
231 			jbd_unlock_bh_state(bh);
232 			if (locked)
233 				unlock_buffer(bh);
234 			BUFFER_TRACE(bh, "already cleaned up");
235 			put_bh(bh);
236 			continue;
237 		}
238 		if (locked && test_clear_buffer_dirty(bh)) {
239 			BUFFER_TRACE(bh, "needs writeout, adding to array");
240 			wbuf[bufs++] = bh;
241 			__jbd2_journal_file_buffer(jh, commit_transaction,
242 						BJ_Locked);
243 			jbd_unlock_bh_state(bh);
244 			if (bufs == journal->j_wbufsize) {
245 				spin_unlock(&journal->j_list_lock);
246 				journal_do_submit_data(wbuf, bufs);
247 				bufs = 0;
248 				goto write_out_data;
249 			}
250 		} else if (!locked && buffer_locked(bh)) {
251 			__jbd2_journal_file_buffer(jh, commit_transaction,
252 						BJ_Locked);
253 			jbd_unlock_bh_state(bh);
254 			put_bh(bh);
255 		} else {
256 			BUFFER_TRACE(bh, "writeout complete: unfile");
257 			__jbd2_journal_unfile_buffer(jh);
258 			jbd_unlock_bh_state(bh);
259 			if (locked)
260 				unlock_buffer(bh);
261 			jbd2_journal_remove_journal_head(bh);
262 			/* Once for our safety reference, once for
263 			 * jbd2_journal_remove_journal_head() */
264 			put_bh(bh);
265 			put_bh(bh);
266 		}
267 
268 		if (lock_need_resched(&journal->j_list_lock)) {
269 			spin_unlock(&journal->j_list_lock);
270 			goto write_out_data;
271 		}
272 	}
273 	spin_unlock(&journal->j_list_lock);
274 	journal_do_submit_data(wbuf, bufs);
275 }
276 
277 static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
278 				   unsigned long long block)
279 {
280 	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
281 	if (tag_bytes > JBD_TAG_SIZE32)
282 		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
283 }
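
/*
 * Worked example (values purely illustrative): for block 0x100000002 with
 * 64-bit tags, t_blocknr holds cpu_to_be32(0x00000002) and t_blocknr_high
 * holds cpu_to_be32(0x00000001).  The (block >> 31) >> 1 double shift is
 * assumed to follow the usual kernel idiom for extracting the high word
 * without ever shifting a 32-bit quantity by 32.
 */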
284 
285 /*
286  * jbd2_journal_commit_transaction
287  *
288  * The primary function for committing a transaction to the log.  This
289  * function is called by the journal thread to begin a complete commit.
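 *
 * The commit proceeds through the numbered phases traced by the jbd_debug()
 * calls below: drain all outstanding handles on the running transaction,
 * flush the ordered data buffers, write the revoke records, write the
 * metadata with its descriptor blocks, wait for that IO to complete, write
 * the commit record, and finally move the committed buffers onto the
 * checkpoint lists.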
290  */
291 void jbd2_journal_commit_transaction(journal_t *journal)
292 {
293 	transaction_t *commit_transaction;
294 	struct journal_head *jh, *new_jh, *descriptor;
295 	struct buffer_head **wbuf = journal->j_wbuf;
296 	int bufs;
297 	int flags;
298 	int err;
299 	unsigned long long blocknr;
300 	char *tagp = NULL;
301 	journal_header_t *header;
302 	journal_block_tag_t *tag = NULL;
303 	int space_left = 0;
304 	int first_tag = 0;
305 	int tag_flag;
306 	int i;
307 	int tag_bytes = journal_tag_bytes(journal);
308 
309 	/*
310 	 * First job: lock down the current transaction and wait for
311 	 * all outstanding updates to complete.
312 	 */
313 
314 #ifdef COMMIT_STATS
315 	spin_lock(&journal->j_list_lock);
316 	summarise_journal_usage(journal);
317 	spin_unlock(&journal->j_list_lock);
318 #endif
319 
320 	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
321 	if (journal->j_flags & JBD2_FLUSHED) {
322 		jbd_debug(3, "super block updated\n");
323 		jbd2_journal_update_superblock(journal, 1);
324 	} else {
325 		jbd_debug(3, "superblock not updated\n");
326 	}
327 
328 	J_ASSERT(journal->j_running_transaction != NULL);
329 	J_ASSERT(journal->j_committing_transaction == NULL);
330 
331 	commit_transaction = journal->j_running_transaction;
332 	J_ASSERT(commit_transaction->t_state == T_RUNNING);
333 
334 	jbd_debug(1, "JBD: starting commit of transaction %d\n",
335 			commit_transaction->t_tid);
336 
337 	spin_lock(&journal->j_state_lock);
338 	commit_transaction->t_state = T_LOCKED;
339 
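	/*
	 * Drain all outstanding handles.  Note the sleep pattern: t_updates
	 * is re-tested under t_handle_lock after prepare_to_wait(), and both
	 * spinlocks are dropped around schedule() so that the final
	 * jbd2_journal_stop() on this transaction can take them and wake
	 * j_wait_updates.
	 */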
340 	spin_lock(&commit_transaction->t_handle_lock);
341 	while (commit_transaction->t_updates) {
342 		DEFINE_WAIT(wait);
343 
344 		prepare_to_wait(&journal->j_wait_updates, &wait,
345 					TASK_UNINTERRUPTIBLE);
346 		if (commit_transaction->t_updates) {
347 			spin_unlock(&commit_transaction->t_handle_lock);
348 			spin_unlock(&journal->j_state_lock);
349 			schedule();
350 			spin_lock(&journal->j_state_lock);
351 			spin_lock(&commit_transaction->t_handle_lock);
352 		}
353 		finish_wait(&journal->j_wait_updates, &wait);
354 	}
355 	spin_unlock(&commit_transaction->t_handle_lock);
356 
357 	J_ASSERT (commit_transaction->t_outstanding_credits <=
358 			journal->j_max_transaction_buffers);
359 
360 	/*
361 	 * First thing we are allowed to do is to discard any remaining
362 	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
363 	 * that there are no such buffers: if a large filesystem
364 	 * operation like a truncate needs to split itself over multiple
365 	 * transactions, then it may try to do a jbd2_journal_restart() while
366 	 * there are still BJ_Reserved buffers outstanding.  These must
367 	 * be released cleanly from the current transaction.
368 	 *
369 	 * In this case, the filesystem must still reserve write access
370 	 * again before modifying the buffer in the new transaction, but
371 	 * we do not require it to remember exactly which old buffers it
372 	 * has reserved.  This is consistent with the existing behaviour
373 	 * that multiple jbd2_journal_get_write_access() calls to the same
374  * buffer are perfectly permissible.
375 	 */
376 	while (commit_transaction->t_reserved_list) {
377 		jh = commit_transaction->t_reserved_list;
378 		JBUFFER_TRACE(jh, "reserved, unused: refile");
379 		/*
380 		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
381 		 * leave undo-committed data.
382 		 */
383 		if (jh->b_committed_data) {
384 			struct buffer_head *bh = jh2bh(jh);
385 
386 			jbd_lock_bh_state(bh);
387 			jbd2_slab_free(jh->b_committed_data, bh->b_size);
388 			jh->b_committed_data = NULL;
389 			jbd_unlock_bh_state(bh);
390 		}
391 		jbd2_journal_refile_buffer(journal, jh);
392 	}
393 
394 	/*
395 	 * Now try to drop any written-back buffers from the journal's
396 	 * checkpoint lists.  We do this *before* commit because it potentially
397 	 * frees some memory
398 	 */
399 	spin_lock(&journal->j_list_lock);
400 	__jbd2_journal_clean_checkpoint_list(journal);
401 	spin_unlock(&journal->j_list_lock);
402 
403 	jbd_debug (3, "JBD: commit phase 1\n");
404 
405 	/*
406 	 * Switch to a new revoke table.
407 	 */
408 	jbd2_journal_switch_revoke_table(journal);
409 
410 	commit_transaction->t_state = T_FLUSH;
411 	journal->j_committing_transaction = commit_transaction;
412 	journal->j_running_transaction = NULL;
413 	commit_transaction->t_log_start = journal->j_head;
414 	wake_up(&journal->j_wait_transaction_locked);
415 	spin_unlock(&journal->j_state_lock);
416 
417 	jbd_debug (3, "JBD: commit phase 2\n");
418 
419 	/*
420 	 * First, drop modified flag: all accesses to the buffers
421 	 * will be tracked for a new transaction only -bzzz
422 	 */
423 	spin_lock(&journal->j_list_lock);
424 	if (commit_transaction->t_buffers) {
425 		new_jh = jh = commit_transaction->t_buffers->b_tnext;
426 		do {
427 			J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
428 					new_jh->b_modified == 0);
429 			new_jh->b_modified = 0;
430 			new_jh = new_jh->b_tnext;
431 		} while (new_jh != jh);
432 	}
433 	spin_unlock(&journal->j_list_lock);
434 
435 	/*
436 	 * Now start flushing things to disk, in the order they appear
437 	 * on the transaction lists.  Data blocks go first.
438 	 */
439 	err = 0;
440 	journal_submit_data_buffers(journal, commit_transaction);
441 
442 	/*
443 	 * Wait for all previously submitted IO to complete.
444 	 */
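	/*
	 * Buffers are taken from the tail of t_locked_list.  An IO error is
	 * only latched into err here; the journal is aborted once the whole
	 * list has been drained.
	 */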
445 	spin_lock(&journal->j_list_lock);
446 	while (commit_transaction->t_locked_list) {
447 		struct buffer_head *bh;
448 
449 		jh = commit_transaction->t_locked_list->b_tprev;
450 		bh = jh2bh(jh);
451 		get_bh(bh);
452 		if (buffer_locked(bh)) {
453 			spin_unlock(&journal->j_list_lock);
454 			wait_on_buffer(bh);
455 			if (unlikely(!buffer_uptodate(bh)))
456 				err = -EIO;
457 			spin_lock(&journal->j_list_lock);
458 		}
459 		if (!inverted_lock(journal, bh)) {
460 			put_bh(bh);
461 			spin_lock(&journal->j_list_lock);
462 			continue;
463 		}
464 		if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
465 			__jbd2_journal_unfile_buffer(jh);
466 			jbd_unlock_bh_state(bh);
467 			jbd2_journal_remove_journal_head(bh);
468 			put_bh(bh);
469 		} else {
470 			jbd_unlock_bh_state(bh);
471 		}
472 		put_bh(bh);
473 		cond_resched_lock(&journal->j_list_lock);
474 	}
475 	spin_unlock(&journal->j_list_lock);
476 
477 	if (err)
478 		__jbd2_journal_abort_hard(journal);
479 
480 	jbd2_journal_write_revoke_records(journal, commit_transaction);
481 
482 	jbd_debug(3, "JBD: commit phase 2\n");
483 
484 	/*
485 	 * If we found any dirty or locked buffers, then we should have
486 	 * looped back up to the write_out_data label.  If there weren't
487 	 * any then journal_clean_data_list should have wiped the list
488 	 * clean by now, so check that it is in fact empty.
489 	 */
490 	J_ASSERT (commit_transaction->t_sync_datalist == NULL);
491 
492 	jbd_debug (3, "JBD: commit phase 3\n");
493 
494 	/*
495 	 * Way to go: we have now written out all of the data for a
496 	 * transaction!  Now comes the tricky part: we need to write out
497 	 * metadata.  Loop over the transaction's entire buffer list:
498 	 */
499 	commit_transaction->t_state = T_COMMIT;
500 
501 	descriptor = NULL;
502 	bufs = 0;
503 	while (commit_transaction->t_buffers) {
504 
505 		/* Find the next buffer to be journaled... */
506 
507 		jh = commit_transaction->t_buffers;
508 
509 		/* If we're in abort mode, we just un-journal the buffer and
510 		   release it for background writing. */
511 
512 		if (is_journal_aborted(journal)) {
513 			JBUFFER_TRACE(jh, "journal is aborting: refile");
514 			jbd2_journal_refile_buffer(journal, jh);
515 			/* If that was the last one, we need to clean up
516 			 * any descriptor buffers which may have been
517 			 * already allocated, even if we are now
518 			 * aborting. */
519 			if (!commit_transaction->t_buffers)
520 				goto start_journal_io;
521 			continue;
522 		}
523 
524 		/* Make sure we have a descriptor block in which to
525 		   record the metadata buffer. */
526 
527 		if (!descriptor) {
528 			struct buffer_head *bh;
529 
530 			J_ASSERT (bufs == 0);
531 
532 			jbd_debug(4, "JBD: get descriptor\n");
533 
534 			descriptor = jbd2_journal_get_descriptor_buffer(journal);
535 			if (!descriptor) {
536 				__jbd2_journal_abort_hard(journal);
537 				continue;
538 			}
539 
540 			bh = jh2bh(descriptor);
541 			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
542 				(unsigned long long)bh->b_blocknr, bh->b_data);
543 			header = (journal_header_t *)&bh->b_data[0];
544 			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
545 			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
546 			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
547 
548 			tagp = &bh->b_data[sizeof(journal_header_t)];
549 			space_left = bh->b_size - sizeof(journal_header_t);
550 			first_tag = 1;
551 			set_buffer_jwrite(bh);
552 			set_buffer_dirty(bh);
553 			wbuf[bufs++] = bh;
554 
555 			/* Record it so that we can wait for IO
556                            completion later */
557 			BUFFER_TRACE(bh, "ph3: file as descriptor");
558 			jbd2_journal_file_buffer(descriptor, commit_transaction,
559 					BJ_LogCtl);
560 		}
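
		/*
		 * Descriptor block layout at this point (sketch):
		 *
		 *	b_data: journal_header_t (magic, DESCRIPTOR_BLOCK, tid)
		 *	tagp:   tag 0, 16-byte journal UUID,
		 *	        tag 1, tag 2, ...  (JBD2_FLAG_SAME_UUID set)
		 *
		 * space_left counts the bytes still free for tags; the final
		 * tag will have JBD2_FLAG_LAST_TAG set before the IO is
		 * submitted.
		 */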
561 
562 		/* Where is the buffer to be written? */
563 
564 		err = jbd2_journal_next_log_block(journal, &blocknr);
565 		/* If the block mapping failed, just abandon the buffer
566 		   and repeat this loop: we'll fall into the
567 		   refile-on-abort condition above. */
568 		if (err) {
569 			__jbd2_journal_abort_hard(journal);
570 			continue;
571 		}
572 
573 		/*
574 		 * start_this_handle() uses t_outstanding_credits to determine
575 		 * the free space in the log, but this counter is changed
576 		 * by jbd2_journal_next_log_block() also.
577 		 */
578 		commit_transaction->t_outstanding_credits--;
579 
580 		/* Bump b_count to prevent truncate from stumbling over
581                    the shadowed buffer!  @@@ This can go if we ever get
582                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
583 		atomic_inc(&jh2bh(jh)->b_count);
584 
585 		/* Make a temporary IO buffer with which to write it out
586                    (this will requeue both the metadata buffer and the
587                    temporary IO buffer). new_bh goes on BJ_IO */
588 
589 		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
590 		/*
591 		 * akpm: jbd2_journal_write_metadata_buffer() sets
592 		 * new_bh->b_transaction to commit_transaction.
593 		 * We need to clean this up before we release new_bh
594 		 * (which is of type BJ_IO)
595 		 */
596 		JBUFFER_TRACE(jh, "ph3: write metadata");
597 		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
598 						      jh, &new_jh, blocknr);
599 		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
600 		wbuf[bufs++] = jh2bh(new_jh);
601 
602 		/* Record the new block's tag in the current descriptor
603                    buffer */
604 
605 		tag_flag = 0;
606 		if (flags & 1)
607 			tag_flag |= JBD2_FLAG_ESCAPE;
608 		if (!first_tag)
609 			tag_flag |= JBD2_FLAG_SAME_UUID;
610 
611 		tag = (journal_block_tag_t *) tagp;
612 		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
613 		tag->t_flags = cpu_to_be32(tag_flag);
614 		tagp += tag_bytes;
615 		space_left -= tag_bytes;
616 
617 		if (first_tag) {
618 			memcpy (tagp, journal->j_uuid, 16);
619 			tagp += 16;
620 			space_left -= 16;
621 			first_tag = 0;
622 		}
623 
624 		/* If there's no more to do, or if the descriptor is full,
625 		   let the IO rip! */
626 
627 		if (bufs == journal->j_wbufsize ||
628 		    commit_transaction->t_buffers == NULL ||
629 		    space_left < tag_bytes + 16) {
630 
631 			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
632 
633 			/* Write an end-of-descriptor marker before
634                            submitting the IOs.  "tag" still points to
635                            the last tag we set up. */
636 
637 			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
638 
639 start_journal_io:
640 			for (i = 0; i < bufs; i++) {
641 				struct buffer_head *bh = wbuf[i];
642 				lock_buffer(bh);
643 				clear_buffer_dirty(bh);
644 				set_buffer_uptodate(bh);
645 				bh->b_end_io = journal_end_buffer_io_sync;
646 				submit_bh(WRITE, bh);
647 			}
648 			cond_resched();
649 
650 			/* Force a new descriptor to be generated next
651                            time round the loop. */
652 			descriptor = NULL;
653 			bufs = 0;
654 		}
655 	}
656 
657 	/* Lo and behold: we have just managed to send a transaction to
658            the log.  Before we can commit it, wait for the IO so far to
659            complete.  Control buffers being written are on the
660            transaction's t_log_list queue, and metadata buffers are on
661            the t_iobuf_list queue.
662 
663 	   Wait for the buffers in reverse order.  That way we are
664 	   less likely to be woken up until all IOs have completed, and
665 	   so we incur less scheduling load.
666 	*/
667 
668 	jbd_debug(3, "JBD: commit phase 4\n");
669 
670 	/*
671 	 * akpm: these are BJ_IO, and j_list_lock is not needed.
672 	 * See __journal_try_to_free_buffer.
673 	 */
674 wait_for_iobuf:
675 	while (commit_transaction->t_iobuf_list != NULL) {
676 		struct buffer_head *bh;
677 
678 		jh = commit_transaction->t_iobuf_list->b_tprev;
679 		bh = jh2bh(jh);
680 		if (buffer_locked(bh)) {
681 			wait_on_buffer(bh);
682 			goto wait_for_iobuf;
683 		}
684 		if (cond_resched())
685 			goto wait_for_iobuf;
686 
687 		if (unlikely(!buffer_uptodate(bh)))
688 			err = -EIO;
689 
690 		clear_buffer_jwrite(bh);
691 
692 		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
693 		jbd2_journal_unfile_buffer(journal, jh);
694 
695 		/*
696 		 * ->t_iobuf_list should contain only dummy buffer_heads
697 		 * which were created by jbd2_journal_write_metadata_buffer().
698 		 */
699 		BUFFER_TRACE(bh, "dumping temporary bh");
700 		jbd2_journal_put_journal_head(jh);
701 		__brelse(bh);
702 		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
703 		free_buffer_head(bh);
704 
705 		/* We also have to unlock and free the corresponding
706                    shadowed buffer */
707 		jh = commit_transaction->t_shadow_list->b_tprev;
708 		bh = jh2bh(jh);
709 		clear_bit(BH_JWrite, &bh->b_state);
710 		J_ASSERT_BH(bh, buffer_jbddirty(bh));
711 
712 		/* The metadata is now released for reuse, but we need
713                    to remember it against this transaction so that when
714                    we finally commit, we can do any checkpointing
715                    required. */
716 		JBUFFER_TRACE(jh, "file as BJ_Forget");
717 		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
718 		/* Wake up any transactions which were waiting for this
719 		   IO to complete */
720 		wake_up_bit(&bh->b_state, BH_Unshadow);
721 		JBUFFER_TRACE(jh, "brelse shadowed buffer");
722 		__brelse(bh);
723 	}
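
	/*
	 * Every temporary BJ_IO buffer head has now been freed and each
	 * shadowed metadata buffer refiled onto BJ_Forget to await the
	 * commit record; the assertion below checks that the shadow list
	 * is indeed empty.
	 */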
724 
725 	J_ASSERT (commit_transaction->t_shadow_list == NULL);
726 
727 	jbd_debug(3, "JBD: commit phase 5\n");
728 
729 	/* Here we wait for the revoke record and descriptor record buffers */
730  wait_for_ctlbuf:
731 	while (commit_transaction->t_log_list != NULL) {
732 		struct buffer_head *bh;
733 
734 		jh = commit_transaction->t_log_list->b_tprev;
735 		bh = jh2bh(jh);
736 		if (buffer_locked(bh)) {
737 			wait_on_buffer(bh);
738 			goto wait_for_ctlbuf;
739 		}
740 		if (cond_resched())
741 			goto wait_for_ctlbuf;
742 
743 		if (unlikely(!buffer_uptodate(bh)))
744 			err = -EIO;
745 
746 		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
747 		clear_buffer_jwrite(bh);
748 		jbd2_journal_unfile_buffer(journal, jh);
749 		jbd2_journal_put_journal_head(jh);
750 		__brelse(bh);		/* One for getblk */
751 		/* AKPM: bforget here */
752 	}
753 
754 	jbd_debug(3, "JBD: commit phase 6\n");
755 
756 	if (journal_write_commit_record(journal, commit_transaction))
757 		err = -EIO;
758 
759 	if (err)
760 		__jbd2_journal_abort_hard(journal);
761 
762 	/* End of a transaction!  Finally, we can do checkpoint
763            processing: any buffers committed as a result of this
764            transaction can be removed from any checkpoint lists they were
765            on before. */
766 
767 	jbd_debug(3, "JBD: commit phase 7\n");
768 
769 	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
770 	J_ASSERT(commit_transaction->t_buffers == NULL);
771 	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
772 	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
773 	J_ASSERT(commit_transaction->t_shadow_list == NULL);
774 	J_ASSERT(commit_transaction->t_log_list == NULL);
775 
776 restart_loop:
777 	/*
778 	 * As there are other places (journal_unmap_buffer()) adding buffers
779 	 * to this list we have to be careful and hold the j_list_lock.
780 	 */
781 	spin_lock(&journal->j_list_lock);
782 	while (commit_transaction->t_forget) {
783 		transaction_t *cp_transaction;
784 		struct buffer_head *bh;
785 
786 		jh = commit_transaction->t_forget;
787 		spin_unlock(&journal->j_list_lock);
788 		bh = jh2bh(jh);
789 		jbd_lock_bh_state(bh);
790 		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
791 			jh->b_transaction == journal->j_running_transaction);
792 
793 		/*
794 		 * If there is undo-protected committed data against
795 		 * this buffer, then we can remove it now.  If it is a
796 		 * buffer needing such protection, the old frozen_data
797 		 * field now points to a committed version of the
798 		 * buffer, so rotate that field to the new committed
799 		 * data.
800 		 *
801 		 * Otherwise, we can just throw away the frozen data now.
802 		 */
803 		if (jh->b_committed_data) {
804 			jbd2_slab_free(jh->b_committed_data, bh->b_size);
805 			jh->b_committed_data = NULL;
806 			if (jh->b_frozen_data) {
807 				jh->b_committed_data = jh->b_frozen_data;
808 				jh->b_frozen_data = NULL;
809 			}
810 		} else if (jh->b_frozen_data) {
811 			jbd2_slab_free(jh->b_frozen_data, bh->b_size);
812 			jh->b_frozen_data = NULL;
813 		}
814 
815 		spin_lock(&journal->j_list_lock);
816 		cp_transaction = jh->b_cp_transaction;
817 		if (cp_transaction) {
818 			JBUFFER_TRACE(jh, "remove from old cp transaction");
819 			__jbd2_journal_remove_checkpoint(jh);
820 		}
821 
822 		/* Only re-checkpoint the buffer_head if it is marked
823 		 * dirty.  If the buffer was added to the BJ_Forget list
824 		 * by jbd2_journal_forget, it may no longer be dirty and
825 		 * there's no point in keeping a checkpoint record for
826 		 * it. */
827 
828 		/* A buffer which has been freed while still being
829 		 * journaled by a previous transaction may end up still
830 		 * being dirty here, but we want to avoid writing back
831 		 * that buffer in the future now that the last use has
832 		 * been committed.  That's not only a performance gain,
833 		 * it also stops aliasing problems if the buffer is left
834 		 * behind for writeback and gets reallocated for another
835 		 * use in a different page. */
836 		if (buffer_freed(bh)) {
837 			clear_buffer_freed(bh);
838 			clear_buffer_jbddirty(bh);
839 		}
840 
841 		if (buffer_jbddirty(bh)) {
842 			JBUFFER_TRACE(jh, "add to new checkpointing trans");
843 			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
844 			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
845 			__jbd2_journal_refile_buffer(jh);
846 			jbd_unlock_bh_state(bh);
847 		} else {
848 			J_ASSERT_BH(bh, !buffer_dirty(bh));
849 			/* Being on the BJ_Forget list and not jbddirty means
850 			 * it has been freed by this transaction and hence it
851 			 * could not have been reallocated until this
852 			 * transaction has committed. *BUT* it could be
853 			 * reallocated once we have written all the data to
854 			 * disk and before we process the buffer on BJ_Forget
855 			 * list. */
856 			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
857 			__jbd2_journal_refile_buffer(jh);
858 			if (!jh->b_transaction) {
859 				jbd_unlock_bh_state(bh);
860 				 /* needs a brelse */
861 				jbd2_journal_remove_journal_head(bh);
862 				release_buffer_page(bh);
863 			} else
864 				jbd_unlock_bh_state(bh);
865 		}
866 		cond_resched_lock(&journal->j_list_lock);
867 	}
868 	spin_unlock(&journal->j_list_lock);
869 	/*
870 	 * This is a bit sleazy.  We borrow j_list_lock to protect
871 	 * journal->j_committing_transaction in __jbd2_journal_remove_checkpoint.
872 	 * Really, __jbd2_journal_remove_checkpoint should be using j_state_lock, but
873 	 * it is a bit of a hassle to hold that lock across __jbd2_journal_remove_checkpoint.
874 	 */
875 	spin_lock(&journal->j_state_lock);
876 	spin_lock(&journal->j_list_lock);
877 	/*
878 	 * Now recheck if some buffers did not get attached to the transaction
879 	 * while the lock was dropped...
880 	 */
881 	if (commit_transaction->t_forget) {
882 		spin_unlock(&journal->j_list_lock);
883 		spin_unlock(&journal->j_state_lock);
884 		goto restart_loop;
885 	}
886 
887 	/* Done with this transaction! */
888 
889 	jbd_debug(3, "JBD: commit phase 8\n");
890 
891 	J_ASSERT(commit_transaction->t_state == T_COMMIT);
892 
893 	commit_transaction->t_state = T_FINISHED;
894 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
895 	journal->j_commit_sequence = commit_transaction->t_tid;
896 	journal->j_committing_transaction = NULL;
897 	spin_unlock(&journal->j_state_lock);
898 
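	/*
	 * If anything remains to be checkpointed, splice the transaction into
	 * the circular ring headed by j_checkpoint_transactions (it becomes
	 * the head's new predecessor); a transaction with empty checkpoint
	 * lists can be dropped right away.
	 */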
899 	if (commit_transaction->t_checkpoint_list == NULL &&
900 	    commit_transaction->t_checkpoint_io_list == NULL) {
901 		__jbd2_journal_drop_transaction(journal, commit_transaction);
902 	} else {
903 		if (journal->j_checkpoint_transactions == NULL) {
904 			journal->j_checkpoint_transactions = commit_transaction;
905 			commit_transaction->t_cpnext = commit_transaction;
906 			commit_transaction->t_cpprev = commit_transaction;
907 		} else {
908 			commit_transaction->t_cpnext =
909 				journal->j_checkpoint_transactions;
910 			commit_transaction->t_cpprev =
911 				commit_transaction->t_cpnext->t_cpprev;
912 			commit_transaction->t_cpnext->t_cpprev =
913 				commit_transaction;
914 			commit_transaction->t_cpprev->t_cpnext =
915 				commit_transaction;
916 		}
917 	}
918 	spin_unlock(&journal->j_list_lock);
919 
920 	jbd_debug(1, "JBD: commit %d complete, head %d\n",
921 		  journal->j_commit_sequence, journal->j_tail_sequence);
922 
923 	wake_up(&journal->j_wait_done_commit);
924 }
925