xref: /openbmc/linux/fs/jbd2/commit.c (revision 5d4a2e29)
1 /*
2  * linux/fs/jbd2/commit.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15 
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd2.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
21 #include <linux/mm.h>
22 #include <linux/pagemap.h>
23 #include <linux/jiffies.h>
24 #include <linux/crc32.h>
25 #include <linux/writeback.h>
26 #include <linux/backing-dev.h>
27 #include <linux/bio.h>
28 #include <linux/blkdev.h>
29 #include <trace/events/jbd2.h>
30 
31 /*
32  * Default IO end handler for temporary BJ_IO buffer_heads.
33  */
34 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
35 {
36 	BUFFER_TRACE(bh, "");
37 	if (uptodate)
38 		set_buffer_uptodate(bh);
39 	else
40 		clear_buffer_uptodate(bh);
41 	unlock_buffer(bh);
42 }
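
/*
 * Note that unlock_buffer() also wakes anyone sleeping in
 * wait_on_buffer(), so the protocol for these temporary buffers is:
 * submit with this handler as b_end_io, wait for the buffer lock to
 * drop, then test buffer_uptodate() for the IO result -- see
 * journal_wait_on_commit_record() below for one user of this pattern.
 */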
43 
44 /*
45  * When an ext4 file is truncated, it is possible that some pages are not
46  * successfully freed, because they are attached to a committing transaction.
47  * After the transaction commits, these pages are left on the LRU, with no
48  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
49  * by the VM, but their apparent absence upsets the VM accounting, and it makes
50  * the numbers in /proc/meminfo look odd.
51  *
52  * So here, we have a buffer which has just come off the forget list.  Look to
53  * see if we can strip all buffers from the backing page.
54  *
55  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
56  * caller provided us with a ref against the buffer, and we drop that here.
57  */
58 static void release_buffer_page(struct buffer_head *bh)
59 {
60 	struct page *page;
61 
62 	if (buffer_dirty(bh))
63 		goto nope;
64 	if (atomic_read(&bh->b_count) != 1)
65 		goto nope;
66 	page = bh->b_page;
67 	if (!page)
68 		goto nope;
69 	if (page->mapping)
70 		goto nope;
71 
72 	/* OK, it's a truncated page */
73 	if (!trylock_page(page))
74 		goto nope;
75 
76 	page_cache_get(page);
77 	__brelse(bh);
78 	try_to_free_buffers(page);
79 	unlock_page(page);
80 	page_cache_release(page);
81 	return;
82 
83 nope:
84 	__brelse(bh);
85 }
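
/*
 * Note the refcount dance above: page_cache_get() pins the page before
 * __brelse() drops our buffer reference, since that buffer ref may be
 * the last thing keeping the page's private data alive while
 * try_to_free_buffers() runs; the page reference is then dropped only
 * after the page has been unlocked again.
 */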
86 
87 /*
88  * Done it all: now submit the commit record.  We should have
89  * cleaned up our previous buffers by now, so if we are in abort
90  * mode we can now just skip the rest of the journal write
91  * entirely.
92  *
93  * Returns 1 if the journal needs to be aborted, an error from submit_bh(), or 0 on success
94  */
95 static int journal_submit_commit_record(journal_t *journal,
96 					transaction_t *commit_transaction,
97 					struct buffer_head **cbh,
98 					__u32 crc32_sum)
99 {
100 	struct journal_head *descriptor;
101 	struct commit_header *tmp;
102 	struct buffer_head *bh;
103 	int ret;
104 	int barrier_done = 0;
105 	struct timespec now = current_kernel_time();
106 
107 	if (is_journal_aborted(journal))
108 		return 0;
109 
110 	descriptor = jbd2_journal_get_descriptor_buffer(journal);
111 	if (!descriptor)
112 		return 1;
113 
114 	bh = jh2bh(descriptor);
115 
116 	tmp = (struct commit_header *)bh->b_data;
117 	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
118 	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
119 	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
120 	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
121 	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
122 
123 	if (JBD2_HAS_COMPAT_FEATURE(journal,
124 				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
125 		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
126 		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
127 		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
128 	}
129 
130 	JBUFFER_TRACE(descriptor, "submit commit block");
131 	lock_buffer(bh);
132 	clear_buffer_dirty(bh);
133 	set_buffer_uptodate(bh);
134 	bh->b_end_io = journal_end_buffer_io_sync;
135 
136 	if (journal->j_flags & JBD2_BARRIER &&
137 	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
138 				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
139 		set_buffer_ordered(bh);
140 		barrier_done = 1;
141 	}
142 	ret = submit_bh(WRITE_SYNC_PLUG, bh);
143 	if (barrier_done)
144 		clear_buffer_ordered(bh);
145 
146 	/* is it possible for another commit to fail at roughly
147 	 * the same time as this one?  If so, we don't want to
148 	 * trust the barrier flag in the super, but instead want
149 	 * to remember if we sent a barrier request
150 	 */
151 	if (ret == -EOPNOTSUPP && barrier_done) {
152 		printk(KERN_WARNING
153 		       "JBD: barrier-based sync failed on %s - "
154 		       "disabling barriers\n", journal->j_devname);
155 		spin_lock(&journal->j_state_lock);
156 		journal->j_flags &= ~JBD2_BARRIER;
157 		spin_unlock(&journal->j_state_lock);
158 
159 		/* And try again, without the barrier */
160 		lock_buffer(bh);
161 		set_buffer_uptodate(bh);
162 		clear_buffer_dirty(bh);
163 		ret = submit_bh(WRITE_SYNC_PLUG, bh);
164 	}
165 	*cbh = bh;
166 	return ret;
167 }
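
/*
 * For reference, the commit block built above carries (widths implied
 * by the cpu_to_be*() conversions; see struct commit_header in
 * include/linux/jbd2.h for the authoritative layout and field order):
 *
 *	h_magic		__be32	JBD2_MAGIC_NUMBER
 *	h_blocktype	__be32	JBD2_COMMIT_BLOCK
 *	h_sequence	__be32	tid of the committing transaction
 *	h_chksum_type	__u8	JBD2_CRC32_CHKSUM (if COMPAT_CHECKSUM)
 *	h_chksum_size	__u8	JBD2_CRC32_CHKSUM_SIZE
 *	h_chksum[0]	__be32	running crc32 over the transaction's blocks
 *	h_commit_sec	__be64	commit time, seconds
 *	h_commit_nsec	__be32	commit time, nanoseconds
 */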
168 
169 /*
170  * This function, together with journal_submit_commit_record(),
171  * allows the commit record to be written asynchronously.
172  */
173 static int journal_wait_on_commit_record(journal_t *journal,
174 					 struct buffer_head *bh)
175 {
176 	int ret = 0;
177 
178 retry:
179 	clear_buffer_dirty(bh);
180 	wait_on_buffer(bh);
181 	if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
182 		printk(KERN_WARNING
183 		       "JBD2: wait_on_commit_record: sync failed on %s - "
184 		       "disabling barriers\n", journal->j_devname);
185 		spin_lock(&journal->j_state_lock);
186 		journal->j_flags &= ~JBD2_BARRIER;
187 		spin_unlock(&journal->j_state_lock);
188 
189 		lock_buffer(bh);
190 		clear_buffer_dirty(bh);
191 		set_buffer_uptodate(bh);
192 		bh->b_end_io = journal_end_buffer_io_sync;
193 
194 		ret = submit_bh(WRITE_SYNC_PLUG, bh);
195 		if (ret) {
196 			unlock_buffer(bh);
197 			return ret;
198 		}
199 		goto retry;
200 	}
201 
202 	if (unlikely(!buffer_uptodate(bh)))
203 		ret = -EIO;
204 	put_bh(bh);            /* One for getblk() */
205 	jbd2_journal_put_journal_head(bh2jh(bh));
206 
207 	return ret;
208 }
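
/*
 * buffer_eopnotsupp() tests BH_Eopnotsupp, which the buffer IO
 * completion path sets when the block layer fails a barrier request
 * with -EOPNOTSUPP; the retry above then resubmits the same commit
 * block as an ordinary write, with barriers disabled for good.
 */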
209 
210 /*
211  * Write the filemap data using the writepage() address_space_operation.
212  * We don't do block allocation here, even for delalloc. We don't
213  * use writepages() because with delayed allocation we may end up doing
214  * block allocation in writepages().
215  */
216 static int journal_submit_inode_data_buffers(struct address_space *mapping)
217 {
218 	int ret;
219 	struct writeback_control wbc = {
220 		.sync_mode =  WB_SYNC_ALL,
221 		.nr_to_write = mapping->nrpages * 2,
222 		.range_start = 0,
223 		.range_end = i_size_read(mapping->host),
224 	};
225 
226 	ret = generic_writepages(mapping, &wbc);
227 	return ret;
228 }
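
/*
 * A bounded nr_to_write (nrpages * 2 rather than LONG_MAX) keeps a huge
 * or continuously re-dirtied mapping from wedging the commit thread;
 * the factor of two is presumably headroom for pages dirtied while the
 * scan is in progress.
 */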
229 
230 /*
231  * Submit all the data buffers of inode associated with the transaction to
232  * disk.
233  *
234  * We are in a committing transaction. Therefore no new inode can be added to
235  * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode
236  * we currently operate on from being released while we write out pages.
237  */
238 static int journal_submit_data_buffers(journal_t *journal,
239 		transaction_t *commit_transaction)
240 {
241 	struct jbd2_inode *jinode;
242 	int err, ret = 0;
243 	struct address_space *mapping;
244 
245 	spin_lock(&journal->j_list_lock);
246 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
247 		mapping = jinode->i_vfs_inode->i_mapping;
248 		jinode->i_flags |= JI_COMMIT_RUNNING;
249 		spin_unlock(&journal->j_list_lock);
250 		/*
251 		 * Submit the inode data buffers. We use writepage
252 		 * instead of writepages because writepages can do
253 		 * block allocation with delalloc, and we need to write
254 		 * only already-allocated blocks here.
255 		 */
256 		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
257 		err = journal_submit_inode_data_buffers(mapping);
258 		if (!ret)
259 			ret = err;
260 		spin_lock(&journal->j_list_lock);
261 		J_ASSERT(jinode->i_transaction == commit_transaction);
262 		commit_transaction->t_flushed_data_blocks = 1;
263 		jinode->i_flags &= ~JI_COMMIT_RUNNING;
264 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
265 	}
266 	spin_unlock(&journal->j_list_lock);
267 	return ret;
268 }
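
/*
 * The JI_COMMIT_RUNNING handshake above pairs with the inode teardown
 * path, jbd2_journal_release_jbd_inode(), which (roughly) does
 *
 *	spin_lock(&journal->j_list_lock);
 *	while (jinode->i_flags & JI_COMMIT_RUNNING)
 *		wait on bit_waitqueue(&jinode->i_flags,
 *				      __JI_COMMIT_RUNNING);
 *
 * so the wake_up_bit() issued after clearing the flag is what lets a
 * dying inode leave t_inode_list safely.
 */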
269 
270 /*
271  * Wait for data submitted for writeout, refile inodes to proper
272  * transaction if needed.
273  *
274  */
275 static int journal_finish_inode_data_buffers(journal_t *journal,
276 		transaction_t *commit_transaction)
277 {
278 	struct jbd2_inode *jinode, *next_i;
279 	int err, ret = 0;
280 
281 	/* For locking, see the comment in journal_submit_data_buffers() */
282 	spin_lock(&journal->j_list_lock);
283 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
284 		jinode->i_flags |= JI_COMMIT_RUNNING;
285 		spin_unlock(&journal->j_list_lock);
286 		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
287 		if (err) {
288 			/*
289 			 * Because AS_EIO is cleared by
290 			 * filemap_fdatawait_range(), set it again so
291 			 * that user process can get -EIO from fsync().
292 			 */
293 			set_bit(AS_EIO,
294 				&jinode->i_vfs_inode->i_mapping->flags);
295 
296 			if (!ret)
297 				ret = err;
298 		}
299 		spin_lock(&journal->j_list_lock);
300 		jinode->i_flags &= ~JI_COMMIT_RUNNING;
301 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
302 	}
303 
304 	/* Now refile inode to proper lists */
305 	list_for_each_entry_safe(jinode, next_i,
306 				 &commit_transaction->t_inode_list, i_list) {
307 		list_del(&jinode->i_list);
308 		if (jinode->i_next_transaction) {
309 			jinode->i_transaction = jinode->i_next_transaction;
310 			jinode->i_next_transaction = NULL;
311 			list_add(&jinode->i_list,
312 				&jinode->i_transaction->t_inode_list);
313 		} else {
314 			jinode->i_transaction = NULL;
315 		}
316 	}
317 	spin_unlock(&journal->j_list_lock);
318 
319 	return ret;
320 }
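
/*
 * Note the two-pass structure above: the first loop must drop
 * j_list_lock around filemap_fdatawait(), so it cannot safely unlink
 * entries; the second loop runs under the lock throughout and uses
 * list_for_each_entry_safe() precisely because it removes inodes from
 * t_inode_list as it refiles them.
 */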
321 
322 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
323 {
324 	struct page *page = bh->b_page;
325 	char *addr;
326 	__u32 checksum;
327 
328 	addr = kmap_atomic(page, KM_USER0);
329 	checksum = crc32_be(crc32_sum,
330 		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
331 	kunmap_atomic(addr, KM_USER0);
332 
333 	return checksum;
334 }
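
/*
 * The commit path seeds crc32_sum with ~0 and folds every journal
 * block (descriptors and metadata alike) through this helper in
 * submission order, so the commit header ends up carrying one running
 * crc32 over the whole transaction image. kmap_atomic() is required
 * because bh->b_page may live in highmem.
 */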
335 
336 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
337 				   unsigned long long block)
338 {
339 	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
340 	if (tag_bytes > JBD2_TAG_SIZE32)
341 		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
342 }
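
/*
 * The low 32 bits of the block number always go in t_blocknr; only a
 * journal with 64-bit tags also gets t_blocknr_high. The double shift
 * (block >> 31) >> 1 is the usual idiom for ">> 32" that remains
 * well-defined even if the operand were a 32-bit type, where a single
 * shift by 32 would be undefined behaviour.
 */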
343 
344 /*
345  * jbd2_journal_commit_transaction
346  *
347  * The primary function for committing a transaction to the log.  This
348  * function is called by the journal thread to begin a complete commit.
349  */
350 void jbd2_journal_commit_transaction(journal_t *journal)
351 {
352 	struct transaction_stats_s stats;
353 	transaction_t *commit_transaction;
354 	struct journal_head *jh, *new_jh, *descriptor;
355 	struct buffer_head **wbuf = journal->j_wbuf;
356 	int bufs;
357 	int flags;
358 	int err;
359 	unsigned long long blocknr;
360 	ktime_t start_time;
361 	u64 commit_time;
362 	char *tagp = NULL;
363 	journal_header_t *header;
364 	journal_block_tag_t *tag = NULL;
365 	int space_left = 0;
366 	int first_tag = 0;
367 	int tag_flag;
368 	int i, to_free = 0;
369 	int tag_bytes = journal_tag_bytes(journal);
370 	struct buffer_head *cbh = NULL; /* For transactional checksums */
371 	__u32 crc32_sum = ~0;
372 	int write_op = WRITE;
373 
374 	/*
375 	 * First job: lock down the current transaction and wait for
376 	 * all outstanding updates to complete.
377 	 */
378 
379 #ifdef COMMIT_STATS
380 	spin_lock(&journal->j_list_lock);
381 	summarise_journal_usage(journal);
382 	spin_unlock(&journal->j_list_lock);
383 #endif
384 
385 	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
386 	if (journal->j_flags & JBD2_FLUSHED) {
387 		jbd_debug(3, "super block updated\n");
388 		jbd2_journal_update_superblock(journal, 1);
389 	} else {
390 		jbd_debug(3, "superblock not updated\n");
391 	}
392 
393 	J_ASSERT(journal->j_running_transaction != NULL);
394 	J_ASSERT(journal->j_committing_transaction == NULL);
395 
396 	commit_transaction = journal->j_running_transaction;
397 	J_ASSERT(commit_transaction->t_state == T_RUNNING);
398 
399 	trace_jbd2_start_commit(journal, commit_transaction);
400 	jbd_debug(1, "JBD: starting commit of transaction %d\n",
401 			commit_transaction->t_tid);
402 
403 	spin_lock(&journal->j_state_lock);
404 	commit_transaction->t_state = T_LOCKED;
405 
406 	/*
407 	 * Use plugged writes here, since we want to submit several before
408 	 * we unplug the device. We don't do explicit unplugging in here,
409 	 * instead we rely on sync_buffer() doing the unplug for us.
410 	 */
411 	if (commit_transaction->t_synchronous_commit)
412 		write_op = WRITE_SYNC_PLUG;
413 	trace_jbd2_commit_locking(journal, commit_transaction);
414 	stats.run.rs_wait = commit_transaction->t_max_wait;
415 	stats.run.rs_locked = jiffies;
416 	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
417 					      stats.run.rs_locked);
418 
419 	spin_lock(&commit_transaction->t_handle_lock);
420 	while (commit_transaction->t_updates) {
421 		DEFINE_WAIT(wait);
422 
423 		prepare_to_wait(&journal->j_wait_updates, &wait,
424 					TASK_UNINTERRUPTIBLE);
425 		if (commit_transaction->t_updates) {
426 			spin_unlock(&commit_transaction->t_handle_lock);
427 			spin_unlock(&journal->j_state_lock);
428 			schedule();
429 			spin_lock(&journal->j_state_lock);
430 			spin_lock(&commit_transaction->t_handle_lock);
431 		}
432 		finish_wait(&journal->j_wait_updates, &wait);
433 	}
434 	spin_unlock(&commit_transaction->t_handle_lock);
435 
436 	J_ASSERT (commit_transaction->t_outstanding_credits <=
437 			journal->j_max_transaction_buffers);
438 
439 	/*
440 	 * First thing we are allowed to do is to discard any remaining
441 	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
442 	 * that there are no such buffers: if a large filesystem
443 	 * operation like a truncate needs to split itself over multiple
444 	 * transactions, then it may try to do a jbd2_journal_restart() while
445 	 * there are still BJ_Reserved buffers outstanding.  These must
446 	 * be released cleanly from the current transaction.
447 	 *
448 	 * In this case, the filesystem must still reserve write access
449 	 * again before modifying the buffer in the new transaction, but
450 	 * we do not require it to remember exactly which old buffers it
451 	 * has reserved.  This is consistent with the existing behaviour
452 	 * that multiple jbd2_journal_get_write_access() calls to the same
453  * buffer are perfectly permissible.
454 	 */
455 	while (commit_transaction->t_reserved_list) {
456 		jh = commit_transaction->t_reserved_list;
457 		JBUFFER_TRACE(jh, "reserved, unused: refile");
458 		/*
459 		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
460 		 * leave undo-committed data.
461 		 */
462 		if (jh->b_committed_data) {
463 			struct buffer_head *bh = jh2bh(jh);
464 
465 			jbd_lock_bh_state(bh);
466 			jbd2_free(jh->b_committed_data, bh->b_size);
467 			jh->b_committed_data = NULL;
468 			jbd_unlock_bh_state(bh);
469 		}
470 		jbd2_journal_refile_buffer(journal, jh);
471 	}
472 
473 	/*
474 	 * Now try to drop any written-back buffers from the journal's
475 	 * checkpoint lists.  We do this *before* commit because it potentially
476 	 * frees some memory
477 	 */
478 	spin_lock(&journal->j_list_lock);
479 	__jbd2_journal_clean_checkpoint_list(journal);
480 	spin_unlock(&journal->j_list_lock);
481 
482 	jbd_debug(3, "JBD: commit phase 1\n");
483 
484 	/*
485 	 * Switch to a new revoke table.
486 	 */
487 	jbd2_journal_switch_revoke_table(journal);
488 
489 	trace_jbd2_commit_flushing(journal, commit_transaction);
490 	stats.run.rs_flushing = jiffies;
491 	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
492 					     stats.run.rs_flushing);
493 
494 	commit_transaction->t_state = T_FLUSH;
495 	journal->j_committing_transaction = commit_transaction;
496 	journal->j_running_transaction = NULL;
497 	start_time = ktime_get();
498 	commit_transaction->t_log_start = journal->j_head;
499 	wake_up(&journal->j_wait_transaction_locked);
500 	spin_unlock(&journal->j_state_lock);
501 
502 	jbd_debug(3, "JBD: commit phase 2a\n");
503 
504 	/*
505 	 * Now start flushing things to disk, in the order they appear
506 	 * on the transaction lists.  Data blocks go first.
507 	 */
508 	err = journal_submit_data_buffers(journal, commit_transaction);
509 	if (err)
510 		jbd2_journal_abort(journal, err);
511 
512 	jbd2_journal_write_revoke_records(journal, commit_transaction,
513 					  write_op);
514 
515 	jbd_debug(3, "JBD: commit phase 2b\n");
516 
517 	/*
518 	 * Way to go: we have now written out all of the data for a
519 	 * transaction!  Now comes the tricky part: we need to write out
520 	 * metadata.  Loop over the transaction's entire buffer list:
521 	 */
522 	spin_lock(&journal->j_state_lock);
523 	commit_transaction->t_state = T_COMMIT;
524 	spin_unlock(&journal->j_state_lock);
525 
526 	trace_jbd2_commit_logging(journal, commit_transaction);
527 	stats.run.rs_logging = jiffies;
528 	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
529 					       stats.run.rs_logging);
530 	stats.run.rs_blocks = commit_transaction->t_outstanding_credits;
531 	stats.run.rs_blocks_logged = 0;
532 
533 	J_ASSERT(commit_transaction->t_nr_buffers <=
534 		 commit_transaction->t_outstanding_credits);
535 
536 	err = 0;
537 	descriptor = NULL;
538 	bufs = 0;
539 	while (commit_transaction->t_buffers) {
540 
541 		/* Find the next buffer to be journaled... */
542 
543 		jh = commit_transaction->t_buffers;
544 
545 		/* If we're in abort mode, we just un-journal the buffer and
546 		   release it. */
547 
548 		if (is_journal_aborted(journal)) {
549 			clear_buffer_jbddirty(jh2bh(jh));
550 			JBUFFER_TRACE(jh, "journal is aborting: refile");
551 			jbd2_buffer_abort_trigger(jh,
552 						  jh->b_frozen_data ?
553 						  jh->b_frozen_triggers :
554 						  jh->b_triggers);
555 			jbd2_journal_refile_buffer(journal, jh);
556 			/* If that was the last one, we need to clean up
557 			 * any descriptor buffers which may have been
558 			 * already allocated, even if we are now
559 			 * aborting. */
560 			if (!commit_transaction->t_buffers)
561 				goto start_journal_io;
562 			continue;
563 		}
564 
565 		/* Make sure we have a descriptor block in which to
566 		   record the metadata buffer. */
567 
568 		if (!descriptor) {
569 			struct buffer_head *bh;
570 
571 			J_ASSERT (bufs == 0);
572 
573 			jbd_debug(4, "JBD: get descriptor\n");
574 
575 			descriptor = jbd2_journal_get_descriptor_buffer(journal);
576 			if (!descriptor) {
577 				jbd2_journal_abort(journal, -EIO);
578 				continue;
579 			}
580 
581 			bh = jh2bh(descriptor);
582 			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
583 				(unsigned long long)bh->b_blocknr, bh->b_data);
584 			header = (journal_header_t *)&bh->b_data[0];
585 			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
586 			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
587 			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
588 
589 			tagp = &bh->b_data[sizeof(journal_header_t)];
590 			space_left = bh->b_size - sizeof(journal_header_t);
591 			first_tag = 1;
592 			set_buffer_jwrite(bh);
593 			set_buffer_dirty(bh);
594 			wbuf[bufs++] = bh;
595 
596 			/* Record it so that we can wait for IO
597                            completion later */
598 			BUFFER_TRACE(bh, "ph3: file as descriptor");
599 			jbd2_journal_file_buffer(descriptor, commit_transaction,
600 					BJ_LogCtl);
601 		}
602 
603 		/* Where is the buffer to be written? */
604 
605 		err = jbd2_journal_next_log_block(journal, &blocknr);
606 		/* If the block mapping failed, just abandon the buffer
607 		   and repeat this loop: we'll fall into the
608 		   refile-on-abort condition above. */
609 		if (err) {
610 			jbd2_journal_abort(journal, err);
611 			continue;
612 		}
613 
614 		/*
615 		 * start_this_handle() uses t_outstanding_credits to determine
616 		 * the free space in the log, but this counter is changed
617 		 * by jbd2_journal_next_log_block() also.
618 		 */
619 		commit_transaction->t_outstanding_credits--;
620 
621 		/* Bump b_count to prevent truncate from stumbling over
622                    the shadowed buffer!  @@@ This can go if we ever get
623                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
624 		atomic_inc(&jh2bh(jh)->b_count);
625 
626 		/* Make a temporary IO buffer with which to write it out
627                    (this will requeue both the metadata buffer and the
628                    temporary IO buffer). new_bh goes on BJ_IO */
629 
630 		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
631 		/*
632 		 * akpm: jbd2_journal_write_metadata_buffer() sets
633 		 * new_bh->b_transaction to commit_transaction.
634 		 * We need to clean this up before we release new_bh
635 		 * (which is of type BJ_IO)
636 		 */
637 		JBUFFER_TRACE(jh, "ph3: write metadata");
638 		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
639 						      jh, &new_jh, blocknr);
640 		if (flags < 0) {
641 			jbd2_journal_abort(journal, flags);
642 			continue;
643 		}
644 		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
645 		wbuf[bufs++] = jh2bh(new_jh);
646 
647 		/* Record the new block's tag in the current descriptor
648                    buffer */
649 
650 		tag_flag = 0;
651 		if (flags & 1)
652 			tag_flag |= JBD2_FLAG_ESCAPE;
653 		if (!first_tag)
654 			tag_flag |= JBD2_FLAG_SAME_UUID;
655 
656 		tag = (journal_block_tag_t *) tagp;
657 		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
658 		tag->t_flags = cpu_to_be32(tag_flag);
659 		tagp += tag_bytes;
660 		space_left -= tag_bytes;
661 
662 		if (first_tag) {
663 			memcpy (tagp, journal->j_uuid, 16);
664 			tagp += 16;
665 			space_left -= 16;
666 			first_tag = 0;
667 		}
668 
669 		/* If there's no more to do, or if the descriptor is full,
670 		   let the IO rip! */
671 
672 		if (bufs == journal->j_wbufsize ||
673 		    commit_transaction->t_buffers == NULL ||
674 		    space_left < tag_bytes + 16) {
675 
676 			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
677 
678 			/* Write an end-of-descriptor marker before
679                            submitting the IOs.  "tag" still points to
680                            the last tag we set up. */
681 
682 			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
683 
684 start_journal_io:
685 			for (i = 0; i < bufs; i++) {
686 				struct buffer_head *bh = wbuf[i];
687 				/*
688 				 * Compute checksum.
689 				 */
690 				if (JBD2_HAS_COMPAT_FEATURE(journal,
691 					JBD2_FEATURE_COMPAT_CHECKSUM)) {
692 					crc32_sum =
693 					    jbd2_checksum_data(crc32_sum, bh);
694 				}
695 
696 				lock_buffer(bh);
697 				clear_buffer_dirty(bh);
698 				set_buffer_uptodate(bh);
699 				bh->b_end_io = journal_end_buffer_io_sync;
700 				submit_bh(write_op, bh);
701 			}
702 			cond_resched();
703 			stats.run.rs_blocks_logged += bufs;
704 
705 			/* Force a new descriptor to be generated next
706                            time round the loop. */
707 			descriptor = NULL;
708 			bufs = 0;
709 		}
710 	}
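
	/*
	 * At this point every descriptor block written above holds a
	 * journal_header_t followed by packed journal_block_tag_t
	 * entries, with the 16-byte journal UUID spliced in after the
	 * first tag. The "space_left < tag_bytes + 16" test flushed
	 * each descriptor while there was (conservatively) still room
	 * for one more tag plus a UUID, so a tag is never split across
	 * descriptor blocks.
	 */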
711 
712 	/*
713 	 * If the journal is not located on the file system device,
714 	 * then we must flush the file system device before we issue
715 	 * the commit record
716 	 */
717 	if (commit_transaction->t_flushed_data_blocks &&
718 	    (journal->j_fs_dev != journal->j_dev) &&
719 	    (journal->j_flags & JBD2_BARRIER))
720 		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
721 			BLKDEV_IFL_WAIT);
722 
723 	/* Done it all: now write the commit record asynchronously. */
724 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
725 				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
726 		err = journal_submit_commit_record(journal, commit_transaction,
727 						 &cbh, crc32_sum);
728 		if (err)
729 			__jbd2_journal_abort_hard(journal);
730 		if (journal->j_flags & JBD2_BARRIER)
731 			blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
732 				BLKDEV_IFL_WAIT);
733 	}
734 
735 	err = journal_finish_inode_data_buffers(journal, commit_transaction);
736 	if (err) {
737 		printk(KERN_WARNING
738 			"JBD2: Detected IO errors while flushing file data "
739 		       "on %s\n", journal->j_devname);
740 		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
741 			jbd2_journal_abort(journal, err);
742 		err = 0;
743 	}
744 
745 	/* Lo and behold: we have just managed to send a transaction to
746            the log.  Before we can commit it, wait for the IO so far to
747            complete.  Control buffers being written are on the
748            transaction's t_log_list queue, and metadata buffers are on
749            the t_iobuf_list queue.
750 
751 	   Wait for the buffers in reverse order.  That way we are
752 	   less likely to be woken up until all IOs have completed, and
753 	   so we incur less scheduling load.
754 	*/
755 
756 	jbd_debug(3, "JBD: commit phase 3\n");
757 
758 	/*
759 	 * akpm: these are BJ_IO, and j_list_lock is not needed.
760 	 * See __journal_try_to_free_buffer.
761 	 */
762 wait_for_iobuf:
763 	while (commit_transaction->t_iobuf_list != NULL) {
764 		struct buffer_head *bh;
765 
766 		jh = commit_transaction->t_iobuf_list->b_tprev;
767 		bh = jh2bh(jh);
768 		if (buffer_locked(bh)) {
769 			wait_on_buffer(bh);
770 			goto wait_for_iobuf;
771 		}
772 		if (cond_resched())
773 			goto wait_for_iobuf;
774 
775 		if (unlikely(!buffer_uptodate(bh)))
776 			err = -EIO;
777 
778 		clear_buffer_jwrite(bh);
779 
780 		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
781 		jbd2_journal_unfile_buffer(journal, jh);
782 
783 		/*
784 		 * ->t_iobuf_list should contain only dummy buffer_heads
785 		 * which were created by jbd2_journal_write_metadata_buffer().
786 		 */
787 		BUFFER_TRACE(bh, "dumping temporary bh");
788 		jbd2_journal_put_journal_head(jh);
789 		__brelse(bh);
790 		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
791 		free_buffer_head(bh);
792 
793 		/* We also have to unlock and free the corresponding
794                    shadowed buffer */
795 		jh = commit_transaction->t_shadow_list->b_tprev;
796 		bh = jh2bh(jh);
797 		clear_bit(BH_JWrite, &bh->b_state);
798 		J_ASSERT_BH(bh, buffer_jbddirty(bh));
799 
800 		/* The metadata is now released for reuse, but we need
801                    to remember it against this transaction so that when
802                    we finally commit, we can do any checkpointing
803                    required. */
804 		JBUFFER_TRACE(jh, "file as BJ_Forget");
805 		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
806 		/* Wake up any transactions which were waiting for this
807 		   IO to complete */
808 		wake_up_bit(&bh->b_state, BH_Unshadow);
809 		JBUFFER_TRACE(jh, "brelse shadowed buffer");
810 		__brelse(bh);
811 	}
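
	/*
	 * The loop above could pop b_tprev from both t_iobuf_list and
	 * t_shadow_list and treat the results as a pair because
	 * jbd2_journal_write_metadata_buffer() files the temporary
	 * buffer on BJ_IO and its backing metadata buffer on BJ_Shadow
	 * in lock-step, keeping the two lists in the same order.
	 */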
812 
813 	J_ASSERT (commit_transaction->t_shadow_list == NULL);
814 
815 	jbd_debug(3, "JBD: commit phase 4\n");
816 
817 	/* Here we wait for the revoke record and descriptor record buffers */
818  wait_for_ctlbuf:
819 	while (commit_transaction->t_log_list != NULL) {
820 		struct buffer_head *bh;
821 
822 		jh = commit_transaction->t_log_list->b_tprev;
823 		bh = jh2bh(jh);
824 		if (buffer_locked(bh)) {
825 			wait_on_buffer(bh);
826 			goto wait_for_ctlbuf;
827 		}
828 		if (cond_resched())
829 			goto wait_for_ctlbuf;
830 
831 		if (unlikely(!buffer_uptodate(bh)))
832 			err = -EIO;
833 
834 		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
835 		clear_buffer_jwrite(bh);
836 		jbd2_journal_unfile_buffer(journal, jh);
837 		jbd2_journal_put_journal_head(jh);
838 		__brelse(bh);		/* One for getblk */
839 		/* AKPM: bforget here */
840 	}
841 
842 	if (err)
843 		jbd2_journal_abort(journal, err);
844 
845 	jbd_debug(3, "JBD: commit phase 5\n");
846 
847 	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
848 				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
849 		err = journal_submit_commit_record(journal, commit_transaction,
850 						&cbh, crc32_sum);
851 		if (err)
852 			__jbd2_journal_abort_hard(journal);
853 	}
854 	if (!err && !is_journal_aborted(journal))
855 		err = journal_wait_on_commit_record(journal, cbh);
856 
857 	if (err)
858 		jbd2_journal_abort(journal, err);
859 
860 	/* End of a transaction!  Finally, we can do checkpoint
861            processing: any buffers committed as a result of this
862            transaction can be removed from any checkpoint list it was on
863            before. */
864 
865 	jbd_debug(3, "JBD: commit phase 6\n");
866 
867 	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
868 	J_ASSERT(commit_transaction->t_buffers == NULL);
869 	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
870 	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
871 	J_ASSERT(commit_transaction->t_shadow_list == NULL);
872 	J_ASSERT(commit_transaction->t_log_list == NULL);
873 
874 restart_loop:
875 	/*
876 	 * As there are other places (journal_unmap_buffer()) adding buffers
877 	 * to this list we have to be careful and hold the j_list_lock.
878 	 */
879 	spin_lock(&journal->j_list_lock);
880 	while (commit_transaction->t_forget) {
881 		transaction_t *cp_transaction;
882 		struct buffer_head *bh;
883 
884 		jh = commit_transaction->t_forget;
885 		spin_unlock(&journal->j_list_lock);
886 		bh = jh2bh(jh);
887 		jbd_lock_bh_state(bh);
888 		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction);
889 
890 		/*
891 		 * If there is undo-protected committed data against
892 		 * this buffer, then we can remove it now.  If it is a
893 		 * buffer needing such protection, the old frozen_data
894 		 * field now points to a committed version of the
895 		 * buffer, so rotate that field to the new committed
896 		 * data.
897 		 *
898 		 * Otherwise, we can just throw away the frozen data now.
899 		 *
900 		 * We also know that the frozen data has already fired
901 		 * its triggers if they exist, so we can clear that too.
902 		 */
903 		if (jh->b_committed_data) {
904 			jbd2_free(jh->b_committed_data, bh->b_size);
905 			jh->b_committed_data = NULL;
906 			if (jh->b_frozen_data) {
907 				jh->b_committed_data = jh->b_frozen_data;
908 				jh->b_frozen_data = NULL;
909 				jh->b_frozen_triggers = NULL;
910 			}
911 		} else if (jh->b_frozen_data) {
912 			jbd2_free(jh->b_frozen_data, bh->b_size);
913 			jh->b_frozen_data = NULL;
914 			jh->b_frozen_triggers = NULL;
915 		}
916 
917 		spin_lock(&journal->j_list_lock);
918 		cp_transaction = jh->b_cp_transaction;
919 		if (cp_transaction) {
920 			JBUFFER_TRACE(jh, "remove from old cp transaction");
921 			cp_transaction->t_chp_stats.cs_dropped++;
922 			__jbd2_journal_remove_checkpoint(jh);
923 		}
924 
925 		/* Only re-checkpoint the buffer_head if it is marked
926 		 * dirty.  If the buffer was added to the BJ_Forget list
927 		 * by jbd2_journal_forget, it may no longer be dirty and
928 		 * there's no point in keeping a checkpoint record for
929 		 * it. */
930 
931 		/* A buffer which has been freed while still being
932 		 * journaled by a previous transaction may end up still
933 		 * being dirty here, but we want to avoid writing back
934 		 * that buffer in the future after the "add to orphan"
935 		 * operation has been committed. That's not only a performance
936 		 * gain, it also stops aliasing problems if the buffer is
937 		 * left behind for writeback and gets reallocated for another
938 		 * use in a different page. */
939 		if (buffer_freed(bh) && !jh->b_next_transaction) {
940 			clear_buffer_freed(bh);
941 			clear_buffer_jbddirty(bh);
942 		}
943 
944 		if (buffer_jbddirty(bh)) {
945 			JBUFFER_TRACE(jh, "add to new checkpointing trans");
946 			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
947 			if (is_journal_aborted(journal))
948 				clear_buffer_jbddirty(bh);
949 			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
950 			__jbd2_journal_refile_buffer(jh);
951 			jbd_unlock_bh_state(bh);
952 		} else {
953 			J_ASSERT_BH(bh, !buffer_dirty(bh));
954 			/* Being on the BJ_Forget list and not jbddirty means
955 			 * it has been freed by this transaction and hence it
956 			 * could not have been reallocated until this
957 			 * transaction has committed. *BUT* it could be
958 			 * reallocated once we have written all the data to
959 			 * disk and before we process the buffer on BJ_Forget
960 			 * list. */
961 			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
962 			__jbd2_journal_refile_buffer(jh);
963 			if (!jh->b_transaction) {
964 				jbd_unlock_bh_state(bh);
965 				 /* needs a brelse */
966 				jbd2_journal_remove_journal_head(bh);
967 				release_buffer_page(bh);
968 			} else
969 				jbd_unlock_bh_state(bh);
970 		}
971 		cond_resched_lock(&journal->j_list_lock);
972 	}
973 	spin_unlock(&journal->j_list_lock);
974 	/*
975 	 * This is a bit sleazy.  We use j_list_lock to protect transition
976 	 * of a transaction into T_FINISHED state and calling
977 	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
978 	 * other checkpointing code processing the transaction...
979 	 */
980 	spin_lock(&journal->j_state_lock);
981 	spin_lock(&journal->j_list_lock);
982 	/*
983 	 * Now recheck if some buffers did not get attached to the transaction
984 	 * while the lock was dropped...
985 	 */
986 	if (commit_transaction->t_forget) {
987 		spin_unlock(&journal->j_list_lock);
988 		spin_unlock(&journal->j_state_lock);
989 		goto restart_loop;
990 	}
991 
992 	/* Done with this transaction! */
993 
994 	jbd_debug(3, "JBD: commit phase 7\n");
995 
996 	J_ASSERT(commit_transaction->t_state == T_COMMIT);
997 
998 	commit_transaction->t_start = jiffies;
999 	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
1000 					      commit_transaction->t_start);
1001 
1002 	/*
1003 	 * File the transaction statistics
1004 	 */
1005 	stats.ts_tid = commit_transaction->t_tid;
1006 	stats.run.rs_handle_count = commit_transaction->t_handle_count;
1007 	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1008 			     commit_transaction->t_tid, &stats.run);
1009 
1010 	/*
1011 	 * Calculate overall stats
1012 	 */
1013 	spin_lock(&journal->j_history_lock);
1014 	journal->j_stats.ts_tid++;
1015 	journal->j_stats.run.rs_wait += stats.run.rs_wait;
1016 	journal->j_stats.run.rs_running += stats.run.rs_running;
1017 	journal->j_stats.run.rs_locked += stats.run.rs_locked;
1018 	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1019 	journal->j_stats.run.rs_logging += stats.run.rs_logging;
1020 	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1021 	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1022 	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1023 	spin_unlock(&journal->j_history_lock);
1024 
1025 	commit_transaction->t_state = T_FINISHED;
1026 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
1027 	journal->j_commit_sequence = commit_transaction->t_tid;
1028 	journal->j_committing_transaction = NULL;
1029 	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1030 
1031 	/*
1032 	 * weight the existing average higher than the new commit time so we don't
1033 	 * react too strongly to vast changes in the commit time
1034 	 */
1035 	if (likely(journal->j_average_commit_time))
1036 		journal->j_average_commit_time = (commit_time +
1037 				journal->j_average_commit_time*3) / 4;
1038 	else
1039 		journal->j_average_commit_time = commit_time;
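	/*
	 * That is an exponentially weighted moving average with a 1/4
	 * weight on the newest sample: avg' = (commit_time + 3 * avg) / 4.
	 */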
1040 	spin_unlock(&journal->j_state_lock);
1041 
1042 	if (commit_transaction->t_checkpoint_list == NULL &&
1043 	    commit_transaction->t_checkpoint_io_list == NULL) {
1044 		__jbd2_journal_drop_transaction(journal, commit_transaction);
1045 		to_free = 1;
1046 	} else {
1047 		if (journal->j_checkpoint_transactions == NULL) {
1048 			journal->j_checkpoint_transactions = commit_transaction;
1049 			commit_transaction->t_cpnext = commit_transaction;
1050 			commit_transaction->t_cpprev = commit_transaction;
1051 		} else {
1052 			commit_transaction->t_cpnext =
1053 				journal->j_checkpoint_transactions;
1054 			commit_transaction->t_cpprev =
1055 				commit_transaction->t_cpnext->t_cpprev;
1056 			commit_transaction->t_cpnext->t_cpprev =
1057 				commit_transaction;
1058 			commit_transaction->t_cpprev->t_cpnext =
1059 				commit_transaction;
1060 		}
1061 	}
1062 	spin_unlock(&journal->j_list_lock);
1063 
1064 	if (journal->j_commit_callback)
1065 		journal->j_commit_callback(journal, commit_transaction);
1066 
1067 	trace_jbd2_end_commit(journal, commit_transaction);
1068 	jbd_debug(1, "JBD: commit %d complete, head %d\n",
1069 		  journal->j_commit_sequence, journal->j_tail_sequence);
1070 	if (to_free)
1071 		kfree(commit_transaction);
1072 
1073 	wake_up(&journal->j_wait_done_commit);
1074 }
1075