xref: /openbmc/linux/fs/jbd2/commit.c (revision b8bb76713ec50df2f11efee386e16f93d51e1076)
1 /*
2  * linux/fs/jbd2/commit.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15 
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd2.h>
19 #include <linux/marker.h>
20 #include <linux/errno.h>
21 #include <linux/slab.h>
22 #include <linux/mm.h>
23 #include <linux/pagemap.h>
24 #include <linux/jiffies.h>
25 #include <linux/crc32.h>
26 #include <linux/writeback.h>
27 #include <linux/backing-dev.h>
28 #include <linux/bio.h>
29 
/*
 * I/O completion handler for the temporary BJ_IO buffer_heads; this file
 * also installs it as b_end_io on the other journal buffers it submits.
 *
 * Records the outcome of the I/O in the buffer's uptodate bit and then
 * unlocks the buffer.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");

	if (!uptodate)
		clear_buffer_uptodate(bh);
	else
		set_buffer_uptodate(bh);

	unlock_buffer(bh);
}
42 
43 /*
44  * When an ext4 file is truncated, it is possible that some pages are not
45  * successfully freed, because they are attached to a committing transaction.
46  * After the transaction commits, these pages are left on the LRU, with no
47  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
48  * by the VM, but their apparent absence upsets the VM accounting, and it makes
49  * the numbers in /proc/meminfo look odd.
50  *
51  * So here, we have a buffer which has just come off the forget list.  Look to
52  * see if we can strip all buffers from the backing page.
53  *
54  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
55  * caller provided us with a ref against the buffer, and we drop that here.
56  */
57 static void release_buffer_page(struct buffer_head *bh)
58 {
59 	struct page *page;
60 
61 	if (buffer_dirty(bh))
62 		goto nope;
63 	if (atomic_read(&bh->b_count) != 1)
64 		goto nope;
65 	page = bh->b_page;
66 	if (!page)
67 		goto nope;
68 	if (page->mapping)
69 		goto nope;
70 
71 	/* OK, it's a truncated page */
72 	if (!trylock_page(page))
73 		goto nope;
74 
75 	page_cache_get(page);
76 	__brelse(bh);
77 	try_to_free_buffers(page);
78 	unlock_page(page);
79 	page_cache_release(page);
80 	return;
81 
82 nope:
83 	__brelse(bh);
84 }
85 
86 /*
87  * Done it all: now submit the commit record.  We should have
88  * cleaned up our previous buffers by now, so if we are in abort
89  * mode we can now just skip the rest of the journal write
90  * entirely.
91  *
92  * Returns 1 if the journal needs to be aborted or 0 on success
93  */
94 static int journal_submit_commit_record(journal_t *journal,
95 					transaction_t *commit_transaction,
96 					struct buffer_head **cbh,
97 					__u32 crc32_sum)
98 {
99 	struct journal_head *descriptor;
100 	struct commit_header *tmp;
101 	struct buffer_head *bh;
102 	int ret;
103 	int barrier_done = 0;
104 	struct timespec now = current_kernel_time();
105 
106 	if (is_journal_aborted(journal))
107 		return 0;
108 
109 	descriptor = jbd2_journal_get_descriptor_buffer(journal);
110 	if (!descriptor)
111 		return 1;
112 
113 	bh = jh2bh(descriptor);
114 
115 	tmp = (struct commit_header *)bh->b_data;
116 	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
117 	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
118 	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
119 	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
120 	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
121 
122 	if (JBD2_HAS_COMPAT_FEATURE(journal,
123 				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
124 		tmp->h_chksum_type 	= JBD2_CRC32_CHKSUM;
125 		tmp->h_chksum_size 	= JBD2_CRC32_CHKSUM_SIZE;
126 		tmp->h_chksum[0] 	= cpu_to_be32(crc32_sum);
127 	}
128 
129 	JBUFFER_TRACE(descriptor, "submit commit block");
130 	lock_buffer(bh);
131 	clear_buffer_dirty(bh);
132 	set_buffer_uptodate(bh);
133 	bh->b_end_io = journal_end_buffer_io_sync;
134 
135 	if (journal->j_flags & JBD2_BARRIER &&
136 		!JBD2_HAS_INCOMPAT_FEATURE(journal,
137 					 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
138 		set_buffer_ordered(bh);
139 		barrier_done = 1;
140 	}
141 	ret = submit_bh(WRITE_SYNC, bh);
142 	if (barrier_done)
143 		clear_buffer_ordered(bh);
144 
145 	/* is it possible for another commit to fail at roughly
146 	 * the same time as this one?  If so, we don't want to
147 	 * trust the barrier flag in the super, but instead want
148 	 * to remember if we sent a barrier request
149 	 */
150 	if (ret == -EOPNOTSUPP && barrier_done) {
151 		printk(KERN_WARNING
152 		       "JBD: barrier-based sync failed on %s - "
153 		       "disabling barriers\n", journal->j_devname);
154 		spin_lock(&journal->j_state_lock);
155 		journal->j_flags &= ~JBD2_BARRIER;
156 		spin_unlock(&journal->j_state_lock);
157 
158 		/* And try again, without the barrier */
159 		lock_buffer(bh);
160 		set_buffer_uptodate(bh);
161 		clear_buffer_dirty(bh);
162 		ret = submit_bh(WRITE_SYNC, bh);
163 	}
164 	*cbh = bh;
165 	return ret;
166 }
167 
168 /*
169  * This function along with journal_submit_commit_record
170  * allows to write the commit record asynchronously.
171  */
172 static int journal_wait_on_commit_record(journal_t *journal,
173 					 struct buffer_head *bh)
174 {
175 	int ret = 0;
176 
177 retry:
178 	clear_buffer_dirty(bh);
179 	wait_on_buffer(bh);
180 	if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) {
181 		printk(KERN_WARNING
182 		       "JBD2: wait_on_commit_record: sync failed on %s - "
183 		       "disabling barriers\n", journal->j_devname);
184 		spin_lock(&journal->j_state_lock);
185 		journal->j_flags &= ~JBD2_BARRIER;
186 		spin_unlock(&journal->j_state_lock);
187 
188 		lock_buffer(bh);
189 		clear_buffer_dirty(bh);
190 		set_buffer_uptodate(bh);
191 		bh->b_end_io = journal_end_buffer_io_sync;
192 
193 		ret = submit_bh(WRITE_SYNC, bh);
194 		if (ret) {
195 			unlock_buffer(bh);
196 			return ret;
197 		}
198 		goto retry;
199 	}
200 
201 	if (unlikely(!buffer_uptodate(bh)))
202 		ret = -EIO;
203 	put_bh(bh);            /* One for getblk() */
204 	jbd2_journal_put_journal_head(bh2jh(bh));
205 
206 	return ret;
207 }
208 
209 /*
210  * write the filemap data using writepage() address_space_operations.
211  * We don't do block allocation here even for delalloc. We don't
212  * use writepages() because with dealyed allocation we may be doing
213  * block allocation in writepages().
214  */
215 static int journal_submit_inode_data_buffers(struct address_space *mapping)
216 {
217 	int ret;
218 	struct writeback_control wbc = {
219 		.sync_mode =  WB_SYNC_ALL,
220 		.nr_to_write = mapping->nrpages * 2,
221 		.range_start = 0,
222 		.range_end = i_size_read(mapping->host),
223 		.for_writepages = 1,
224 	};
225 
226 	ret = generic_writepages(mapping, &wbc);
227 	return ret;
228 }
229 
230 /*
231  * Submit all the data buffers of inode associated with the transaction to
232  * disk.
233  *
234  * We are in a committing transaction. Therefore no new inode can be added to
235  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
236  * operate on from being released while we write out pages.
237  */
238 static int journal_submit_data_buffers(journal_t *journal,
239 		transaction_t *commit_transaction)
240 {
241 	struct jbd2_inode *jinode;
242 	int err, ret = 0;
243 	struct address_space *mapping;
244 
245 	spin_lock(&journal->j_list_lock);
246 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
247 		mapping = jinode->i_vfs_inode->i_mapping;
248 		jinode->i_flags |= JI_COMMIT_RUNNING;
249 		spin_unlock(&journal->j_list_lock);
250 		/*
251 		 * submit the inode data buffers. We use writepage
252 		 * instead of writepages. Because writepages can do
253 		 * block allocation  with delalloc. We need to write
254 		 * only allocated blocks here.
255 		 */
256 		err = journal_submit_inode_data_buffers(mapping);
257 		if (!ret)
258 			ret = err;
259 		spin_lock(&journal->j_list_lock);
260 		J_ASSERT(jinode->i_transaction == commit_transaction);
261 		jinode->i_flags &= ~JI_COMMIT_RUNNING;
262 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
263 	}
264 	spin_unlock(&journal->j_list_lock);
265 	return ret;
266 }
267 
268 /*
269  * Wait for data submitted for writeout, refile inodes to proper
270  * transaction if needed.
271  *
272  */
273 static int journal_finish_inode_data_buffers(journal_t *journal,
274 		transaction_t *commit_transaction)
275 {
276 	struct jbd2_inode *jinode, *next_i;
277 	int err, ret = 0;
278 
279 	/* For locking, see the comment in journal_submit_data_buffers() */
280 	spin_lock(&journal->j_list_lock);
281 	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
282 		jinode->i_flags |= JI_COMMIT_RUNNING;
283 		spin_unlock(&journal->j_list_lock);
284 		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
285 		if (err) {
286 			/*
287 			 * Because AS_EIO is cleared by
288 			 * wait_on_page_writeback_range(), set it again so
289 			 * that user process can get -EIO from fsync().
290 			 */
291 			set_bit(AS_EIO,
292 				&jinode->i_vfs_inode->i_mapping->flags);
293 
294 			if (!ret)
295 				ret = err;
296 		}
297 		spin_lock(&journal->j_list_lock);
298 		jinode->i_flags &= ~JI_COMMIT_RUNNING;
299 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
300 	}
301 
302 	/* Now refile inode to proper lists */
303 	list_for_each_entry_safe(jinode, next_i,
304 				 &commit_transaction->t_inode_list, i_list) {
305 		list_del(&jinode->i_list);
306 		if (jinode->i_next_transaction) {
307 			jinode->i_transaction = jinode->i_next_transaction;
308 			jinode->i_next_transaction = NULL;
309 			list_add(&jinode->i_list,
310 				&jinode->i_transaction->t_inode_list);
311 		} else {
312 			jinode->i_transaction = NULL;
313 		}
314 	}
315 	spin_unlock(&journal->j_list_lock);
316 
317 	return ret;
318 }
319 
320 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
321 {
322 	struct page *page = bh->b_page;
323 	char *addr;
324 	__u32 checksum;
325 
326 	addr = kmap_atomic(page, KM_USER0);
327 	checksum = crc32_be(crc32_sum,
328 		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
329 	kunmap_atomic(addr, KM_USER0);
330 
331 	return checksum;
332 }
333 
334 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
335 				   unsigned long long block)
336 {
337 	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
338 	if (tag_bytes > JBD2_TAG_SIZE32)
339 		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
340 }
341 
342 /*
343  * jbd2_journal_commit_transaction
344  *
345  * The primary function for committing a transaction to the log.  This
346  * function is called by the journal thread to begin a complete commit.
347  */
348 void jbd2_journal_commit_transaction(journal_t *journal)
349 {
350 	struct transaction_stats_s stats;
351 	transaction_t *commit_transaction;
352 	struct journal_head *jh, *new_jh, *descriptor;
353 	struct buffer_head **wbuf = journal->j_wbuf;
354 	int bufs;
355 	int flags;
356 	int err;
357 	unsigned long long blocknr;
358 	ktime_t start_time;
359 	u64 commit_time;
360 	char *tagp = NULL;
361 	journal_header_t *header;
362 	journal_block_tag_t *tag = NULL;
363 	int space_left = 0;
364 	int first_tag = 0;
365 	int tag_flag;
366 	int i, to_free = 0;
367 	int tag_bytes = journal_tag_bytes(journal);
368 	struct buffer_head *cbh = NULL; /* For transactional checksums */
369 	__u32 crc32_sum = ~0;
370 	int write_op = WRITE;
371 
372 	/*
373 	 * First job: lock down the current transaction and wait for
374 	 * all outstanding updates to complete.
375 	 */
376 
377 #ifdef COMMIT_STATS
378 	spin_lock(&journal->j_list_lock);
379 	summarise_journal_usage(journal);
380 	spin_unlock(&journal->j_list_lock);
381 #endif
382 
383 	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
384 	if (journal->j_flags & JBD2_FLUSHED) {
385 		jbd_debug(3, "super block updated\n");
386 		jbd2_journal_update_superblock(journal, 1);
387 	} else {
388 		jbd_debug(3, "superblock not updated\n");
389 	}
390 
391 	J_ASSERT(journal->j_running_transaction != NULL);
392 	J_ASSERT(journal->j_committing_transaction == NULL);
393 
394 	commit_transaction = journal->j_running_transaction;
395 	J_ASSERT(commit_transaction->t_state == T_RUNNING);
396 
397 	trace_mark(jbd2_start_commit, "dev %s transaction %d",
398 		   journal->j_devname, commit_transaction->t_tid);
399 	jbd_debug(1, "JBD: starting commit of transaction %d\n",
400 			commit_transaction->t_tid);
401 
402 	spin_lock(&journal->j_state_lock);
403 	commit_transaction->t_state = T_LOCKED;
404 
405 	if (commit_transaction->t_synchronous_commit)
406 		write_op = WRITE_SYNC;
407 	stats.u.run.rs_wait = commit_transaction->t_max_wait;
408 	stats.u.run.rs_locked = jiffies;
409 	stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
410 						stats.u.run.rs_locked);
411 
412 	spin_lock(&commit_transaction->t_handle_lock);
413 	while (commit_transaction->t_updates) {
414 		DEFINE_WAIT(wait);
415 
416 		prepare_to_wait(&journal->j_wait_updates, &wait,
417 					TASK_UNINTERRUPTIBLE);
418 		if (commit_transaction->t_updates) {
419 			spin_unlock(&commit_transaction->t_handle_lock);
420 			spin_unlock(&journal->j_state_lock);
421 			schedule();
422 			spin_lock(&journal->j_state_lock);
423 			spin_lock(&commit_transaction->t_handle_lock);
424 		}
425 		finish_wait(&journal->j_wait_updates, &wait);
426 	}
427 	spin_unlock(&commit_transaction->t_handle_lock);
428 
429 	J_ASSERT (commit_transaction->t_outstanding_credits <=
430 			journal->j_max_transaction_buffers);
431 
432 	/*
433 	 * First thing we are allowed to do is to discard any remaining
434 	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
435 	 * that there are no such buffers: if a large filesystem
436 	 * operation like a truncate needs to split itself over multiple
437 	 * transactions, then it may try to do a jbd2_journal_restart() while
438 	 * there are still BJ_Reserved buffers outstanding.  These must
439 	 * be released cleanly from the current transaction.
440 	 *
441 	 * In this case, the filesystem must still reserve write access
442 	 * again before modifying the buffer in the new transaction, but
443 	 * we do not require it to remember exactly which old buffers it
444 	 * has reserved.  This is consistent with the existing behaviour
445 	 * that multiple jbd2_journal_get_write_access() calls to the same
446 	 * buffer are perfectly permissable.
447 	 */
448 	while (commit_transaction->t_reserved_list) {
449 		jh = commit_transaction->t_reserved_list;
450 		JBUFFER_TRACE(jh, "reserved, unused: refile");
451 		/*
452 		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
453 		 * leave undo-committed data.
454 		 */
455 		if (jh->b_committed_data) {
456 			struct buffer_head *bh = jh2bh(jh);
457 
458 			jbd_lock_bh_state(bh);
459 			jbd2_free(jh->b_committed_data, bh->b_size);
460 			jh->b_committed_data = NULL;
461 			jbd_unlock_bh_state(bh);
462 		}
463 		jbd2_journal_refile_buffer(journal, jh);
464 	}
465 
466 	/*
467 	 * Now try to drop any written-back buffers from the journal's
468 	 * checkpoint lists.  We do this *before* commit because it potentially
469 	 * frees some memory
470 	 */
471 	spin_lock(&journal->j_list_lock);
472 	__jbd2_journal_clean_checkpoint_list(journal);
473 	spin_unlock(&journal->j_list_lock);
474 
475 	jbd_debug (3, "JBD: commit phase 1\n");
476 
477 	/*
478 	 * Switch to a new revoke table.
479 	 */
480 	jbd2_journal_switch_revoke_table(journal);
481 
482 	stats.u.run.rs_flushing = jiffies;
483 	stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
484 					       stats.u.run.rs_flushing);
485 
486 	commit_transaction->t_state = T_FLUSH;
487 	journal->j_committing_transaction = commit_transaction;
488 	journal->j_running_transaction = NULL;
489 	start_time = ktime_get();
490 	commit_transaction->t_log_start = journal->j_head;
491 	wake_up(&journal->j_wait_transaction_locked);
492 	spin_unlock(&journal->j_state_lock);
493 
494 	jbd_debug (3, "JBD: commit phase 2\n");
495 
496 	/*
497 	 * Now start flushing things to disk, in the order they appear
498 	 * on the transaction lists.  Data blocks go first.
499 	 */
500 	err = journal_submit_data_buffers(journal, commit_transaction);
501 	if (err)
502 		jbd2_journal_abort(journal, err);
503 
504 	jbd2_journal_write_revoke_records(journal, commit_transaction);
505 
506 	jbd_debug(3, "JBD: commit phase 2\n");
507 
508 	/*
509 	 * Way to go: we have now written out all of the data for a
510 	 * transaction!  Now comes the tricky part: we need to write out
511 	 * metadata.  Loop over the transaction's entire buffer list:
512 	 */
513 	spin_lock(&journal->j_state_lock);
514 	commit_transaction->t_state = T_COMMIT;
515 	spin_unlock(&journal->j_state_lock);
516 
517 	stats.u.run.rs_logging = jiffies;
518 	stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
519 						 stats.u.run.rs_logging);
520 	stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
521 	stats.u.run.rs_blocks_logged = 0;
522 
523 	J_ASSERT(commit_transaction->t_nr_buffers <=
524 		 commit_transaction->t_outstanding_credits);
525 
526 	err = 0;
527 	descriptor = NULL;
528 	bufs = 0;
529 	while (commit_transaction->t_buffers) {
530 
531 		/* Find the next buffer to be journaled... */
532 
533 		jh = commit_transaction->t_buffers;
534 
535 		/* If we're in abort mode, we just un-journal the buffer and
536 		   release it. */
537 
538 		if (is_journal_aborted(journal)) {
539 			clear_buffer_jbddirty(jh2bh(jh));
540 			JBUFFER_TRACE(jh, "journal is aborting: refile");
541 			jbd2_buffer_abort_trigger(jh,
542 						  jh->b_frozen_data ?
543 						  jh->b_frozen_triggers :
544 						  jh->b_triggers);
545 			jbd2_journal_refile_buffer(journal, jh);
546 			/* If that was the last one, we need to clean up
547 			 * any descriptor buffers which may have been
548 			 * already allocated, even if we are now
549 			 * aborting. */
550 			if (!commit_transaction->t_buffers)
551 				goto start_journal_io;
552 			continue;
553 		}
554 
555 		/* Make sure we have a descriptor block in which to
556 		   record the metadata buffer. */
557 
558 		if (!descriptor) {
559 			struct buffer_head *bh;
560 
561 			J_ASSERT (bufs == 0);
562 
563 			jbd_debug(4, "JBD: get descriptor\n");
564 
565 			descriptor = jbd2_journal_get_descriptor_buffer(journal);
566 			if (!descriptor) {
567 				jbd2_journal_abort(journal, -EIO);
568 				continue;
569 			}
570 
571 			bh = jh2bh(descriptor);
572 			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
573 				(unsigned long long)bh->b_blocknr, bh->b_data);
574 			header = (journal_header_t *)&bh->b_data[0];
575 			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
576 			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
577 			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
578 
579 			tagp = &bh->b_data[sizeof(journal_header_t)];
580 			space_left = bh->b_size - sizeof(journal_header_t);
581 			first_tag = 1;
582 			set_buffer_jwrite(bh);
583 			set_buffer_dirty(bh);
584 			wbuf[bufs++] = bh;
585 
586 			/* Record it so that we can wait for IO
587                            completion later */
588 			BUFFER_TRACE(bh, "ph3: file as descriptor");
589 			jbd2_journal_file_buffer(descriptor, commit_transaction,
590 					BJ_LogCtl);
591 		}
592 
593 		/* Where is the buffer to be written? */
594 
595 		err = jbd2_journal_next_log_block(journal, &blocknr);
596 		/* If the block mapping failed, just abandon the buffer
597 		   and repeat this loop: we'll fall into the
598 		   refile-on-abort condition above. */
599 		if (err) {
600 			jbd2_journal_abort(journal, err);
601 			continue;
602 		}
603 
604 		/*
605 		 * start_this_handle() uses t_outstanding_credits to determine
606 		 * the free space in the log, but this counter is changed
607 		 * by jbd2_journal_next_log_block() also.
608 		 */
609 		commit_transaction->t_outstanding_credits--;
610 
611 		/* Bump b_count to prevent truncate from stumbling over
612                    the shadowed buffer!  @@@ This can go if we ever get
613                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
614 		atomic_inc(&jh2bh(jh)->b_count);
615 
616 		/* Make a temporary IO buffer with which to write it out
617                    (this will requeue both the metadata buffer and the
618                    temporary IO buffer). new_bh goes on BJ_IO*/
619 
620 		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
621 		/*
622 		 * akpm: jbd2_journal_write_metadata_buffer() sets
623 		 * new_bh->b_transaction to commit_transaction.
624 		 * We need to clean this up before we release new_bh
625 		 * (which is of type BJ_IO)
626 		 */
627 		JBUFFER_TRACE(jh, "ph3: write metadata");
628 		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
629 						      jh, &new_jh, blocknr);
630 		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
631 		wbuf[bufs++] = jh2bh(new_jh);
632 
633 		/* Record the new block's tag in the current descriptor
634                    buffer */
635 
636 		tag_flag = 0;
637 		if (flags & 1)
638 			tag_flag |= JBD2_FLAG_ESCAPE;
639 		if (!first_tag)
640 			tag_flag |= JBD2_FLAG_SAME_UUID;
641 
642 		tag = (journal_block_tag_t *) tagp;
643 		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
644 		tag->t_flags = cpu_to_be32(tag_flag);
645 		tagp += tag_bytes;
646 		space_left -= tag_bytes;
647 
648 		if (first_tag) {
649 			memcpy (tagp, journal->j_uuid, 16);
650 			tagp += 16;
651 			space_left -= 16;
652 			first_tag = 0;
653 		}
654 
655 		/* If there's no more to do, or if the descriptor is full,
656 		   let the IO rip! */
657 
658 		if (bufs == journal->j_wbufsize ||
659 		    commit_transaction->t_buffers == NULL ||
660 		    space_left < tag_bytes + 16) {
661 
662 			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
663 
664 			/* Write an end-of-descriptor marker before
665                            submitting the IOs.  "tag" still points to
666                            the last tag we set up. */
667 
668 			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
669 
670 start_journal_io:
671 			for (i = 0; i < bufs; i++) {
672 				struct buffer_head *bh = wbuf[i];
673 				/*
674 				 * Compute checksum.
675 				 */
676 				if (JBD2_HAS_COMPAT_FEATURE(journal,
677 					JBD2_FEATURE_COMPAT_CHECKSUM)) {
678 					crc32_sum =
679 					    jbd2_checksum_data(crc32_sum, bh);
680 				}
681 
682 				lock_buffer(bh);
683 				clear_buffer_dirty(bh);
684 				set_buffer_uptodate(bh);
685 				bh->b_end_io = journal_end_buffer_io_sync;
686 				submit_bh(write_op, bh);
687 			}
688 			cond_resched();
689 			stats.u.run.rs_blocks_logged += bufs;
690 
691 			/* Force a new descriptor to be generated next
692                            time round the loop. */
693 			descriptor = NULL;
694 			bufs = 0;
695 		}
696 	}
697 
698 	/* Done it all: now write the commit record asynchronously. */
699 
700 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
701 		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
702 		err = journal_submit_commit_record(journal, commit_transaction,
703 						 &cbh, crc32_sum);
704 		if (err)
705 			__jbd2_journal_abort_hard(journal);
706 	}
707 
708 	/*
709 	 * This is the right place to wait for data buffers both for ASYNC
710 	 * and !ASYNC commit. If commit is ASYNC, we need to wait only after
711 	 * the commit block went to disk (which happens above). If commit is
712 	 * SYNC, we need to wait for data buffers before we start writing
713 	 * commit block, which happens below in such setting.
714 	 */
715 	err = journal_finish_inode_data_buffers(journal, commit_transaction);
716 	if (err) {
717 		printk(KERN_WARNING
718 			"JBD2: Detected IO errors while flushing file data "
719 		       "on %s\n", journal->j_devname);
720 		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
721 			jbd2_journal_abort(journal, err);
722 		err = 0;
723 	}
724 
725 	/* Lo and behold: we have just managed to send a transaction to
726            the log.  Before we can commit it, wait for the IO so far to
727            complete.  Control buffers being written are on the
728            transaction's t_log_list queue, and metadata buffers are on
729            the t_iobuf_list queue.
730 
731 	   Wait for the buffers in reverse order.  That way we are
732 	   less likely to be woken up until all IOs have completed, and
733 	   so we incur less scheduling load.
734 	*/
735 
736 	jbd_debug(3, "JBD: commit phase 3\n");
737 
738 	/*
739 	 * akpm: these are BJ_IO, and j_list_lock is not needed.
740 	 * See __journal_try_to_free_buffer.
741 	 */
742 wait_for_iobuf:
743 	while (commit_transaction->t_iobuf_list != NULL) {
744 		struct buffer_head *bh;
745 
746 		jh = commit_transaction->t_iobuf_list->b_tprev;
747 		bh = jh2bh(jh);
748 		if (buffer_locked(bh)) {
749 			wait_on_buffer(bh);
750 			goto wait_for_iobuf;
751 		}
752 		if (cond_resched())
753 			goto wait_for_iobuf;
754 
755 		if (unlikely(!buffer_uptodate(bh)))
756 			err = -EIO;
757 
758 		clear_buffer_jwrite(bh);
759 
760 		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
761 		jbd2_journal_unfile_buffer(journal, jh);
762 
763 		/*
764 		 * ->t_iobuf_list should contain only dummy buffer_heads
765 		 * which were created by jbd2_journal_write_metadata_buffer().
766 		 */
767 		BUFFER_TRACE(bh, "dumping temporary bh");
768 		jbd2_journal_put_journal_head(jh);
769 		__brelse(bh);
770 		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
771 		free_buffer_head(bh);
772 
773 		/* We also have to unlock and free the corresponding
774                    shadowed buffer */
775 		jh = commit_transaction->t_shadow_list->b_tprev;
776 		bh = jh2bh(jh);
777 		clear_bit(BH_JWrite, &bh->b_state);
778 		J_ASSERT_BH(bh, buffer_jbddirty(bh));
779 
780 		/* The metadata is now released for reuse, but we need
781                    to remember it against this transaction so that when
782                    we finally commit, we can do any checkpointing
783                    required. */
784 		JBUFFER_TRACE(jh, "file as BJ_Forget");
785 		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
786 		/* Wake up any transactions which were waiting for this
787 		   IO to complete */
788 		wake_up_bit(&bh->b_state, BH_Unshadow);
789 		JBUFFER_TRACE(jh, "brelse shadowed buffer");
790 		__brelse(bh);
791 	}
792 
793 	J_ASSERT (commit_transaction->t_shadow_list == NULL);
794 
795 	jbd_debug(3, "JBD: commit phase 4\n");
796 
797 	/* Here we wait for the revoke record and descriptor record buffers */
798  wait_for_ctlbuf:
799 	while (commit_transaction->t_log_list != NULL) {
800 		struct buffer_head *bh;
801 
802 		jh = commit_transaction->t_log_list->b_tprev;
803 		bh = jh2bh(jh);
804 		if (buffer_locked(bh)) {
805 			wait_on_buffer(bh);
806 			goto wait_for_ctlbuf;
807 		}
808 		if (cond_resched())
809 			goto wait_for_ctlbuf;
810 
811 		if (unlikely(!buffer_uptodate(bh)))
812 			err = -EIO;
813 
814 		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
815 		clear_buffer_jwrite(bh);
816 		jbd2_journal_unfile_buffer(journal, jh);
817 		jbd2_journal_put_journal_head(jh);
818 		__brelse(bh);		/* One for getblk */
819 		/* AKPM: bforget here */
820 	}
821 
822 	if (err)
823 		jbd2_journal_abort(journal, err);
824 
825 	jbd_debug(3, "JBD: commit phase 5\n");
826 
827 	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
828 		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
829 		err = journal_submit_commit_record(journal, commit_transaction,
830 						&cbh, crc32_sum);
831 		if (err)
832 			__jbd2_journal_abort_hard(journal);
833 	}
834 	if (!err && !is_journal_aborted(journal))
835 		err = journal_wait_on_commit_record(journal, cbh);
836 
837 	if (err)
838 		jbd2_journal_abort(journal, err);
839 
840 	/* End of a transaction!  Finally, we can do checkpoint
841            processing: any buffers committed as a result of this
842            transaction can be removed from any checkpoint list it was on
843            before. */
844 
845 	jbd_debug(3, "JBD: commit phase 6\n");
846 
847 	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
848 	J_ASSERT(commit_transaction->t_buffers == NULL);
849 	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
850 	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
851 	J_ASSERT(commit_transaction->t_shadow_list == NULL);
852 	J_ASSERT(commit_transaction->t_log_list == NULL);
853 
854 restart_loop:
855 	/*
856 	 * As there are other places (journal_unmap_buffer()) adding buffers
857 	 * to this list we have to be careful and hold the j_list_lock.
858 	 */
859 	spin_lock(&journal->j_list_lock);
860 	while (commit_transaction->t_forget) {
861 		transaction_t *cp_transaction;
862 		struct buffer_head *bh;
863 
864 		jh = commit_transaction->t_forget;
865 		spin_unlock(&journal->j_list_lock);
866 		bh = jh2bh(jh);
867 		jbd_lock_bh_state(bh);
868 		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
869 			jh->b_transaction == journal->j_running_transaction);
870 
871 		/*
872 		 * If there is undo-protected committed data against
873 		 * this buffer, then we can remove it now.  If it is a
874 		 * buffer needing such protection, the old frozen_data
875 		 * field now points to a committed version of the
876 		 * buffer, so rotate that field to the new committed
877 		 * data.
878 		 *
879 		 * Otherwise, we can just throw away the frozen data now.
880 		 *
881 		 * We also know that the frozen data has already fired
882 		 * its triggers if they exist, so we can clear that too.
883 		 */
884 		if (jh->b_committed_data) {
885 			jbd2_free(jh->b_committed_data, bh->b_size);
886 			jh->b_committed_data = NULL;
887 			if (jh->b_frozen_data) {
888 				jh->b_committed_data = jh->b_frozen_data;
889 				jh->b_frozen_data = NULL;
890 				jh->b_frozen_triggers = NULL;
891 			}
892 		} else if (jh->b_frozen_data) {
893 			jbd2_free(jh->b_frozen_data, bh->b_size);
894 			jh->b_frozen_data = NULL;
895 			jh->b_frozen_triggers = NULL;
896 		}
897 
898 		spin_lock(&journal->j_list_lock);
899 		cp_transaction = jh->b_cp_transaction;
900 		if (cp_transaction) {
901 			JBUFFER_TRACE(jh, "remove from old cp transaction");
902 			cp_transaction->t_chp_stats.cs_dropped++;
903 			__jbd2_journal_remove_checkpoint(jh);
904 		}
905 
906 		/* Only re-checkpoint the buffer_head if it is marked
907 		 * dirty.  If the buffer was added to the BJ_Forget list
908 		 * by jbd2_journal_forget, it may no longer be dirty and
909 		 * there's no point in keeping a checkpoint record for
910 		 * it. */
911 
912 		/* A buffer which has been freed while still being
913 		 * journaled by a previous transaction may end up still
914 		 * being dirty here, but we want to avoid writing back
915 		 * that buffer in the future now that the last use has
916 		 * been committed.  That's not only a performance gain,
917 		 * it also stops aliasing problems if the buffer is left
918 		 * behind for writeback and gets reallocated for another
919 		 * use in a different page. */
920 		if (buffer_freed(bh)) {
921 			clear_buffer_freed(bh);
922 			clear_buffer_jbddirty(bh);
923 		}
924 
925 		if (buffer_jbddirty(bh)) {
926 			JBUFFER_TRACE(jh, "add to new checkpointing trans");
927 			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
928 			if (is_journal_aborted(journal))
929 				clear_buffer_jbddirty(bh);
930 			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
931 			__jbd2_journal_refile_buffer(jh);
932 			jbd_unlock_bh_state(bh);
933 		} else {
934 			J_ASSERT_BH(bh, !buffer_dirty(bh));
935 			/* The buffer on BJ_Forget list and not jbddirty means
936 			 * it has been freed by this transaction and hence it
937 			 * could not have been reallocated until this
938 			 * transaction has committed. *BUT* it could be
939 			 * reallocated once we have written all the data to
940 			 * disk and before we process the buffer on BJ_Forget
941 			 * list. */
942 			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
943 			__jbd2_journal_refile_buffer(jh);
944 			if (!jh->b_transaction) {
945 				jbd_unlock_bh_state(bh);
946 				 /* needs a brelse */
947 				jbd2_journal_remove_journal_head(bh);
948 				release_buffer_page(bh);
949 			} else
950 				jbd_unlock_bh_state(bh);
951 		}
952 		cond_resched_lock(&journal->j_list_lock);
953 	}
954 	spin_unlock(&journal->j_list_lock);
955 	/*
956 	 * This is a bit sleazy.  We use j_list_lock to protect transition
957 	 * of a transaction into T_FINISHED state and calling
958 	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
959 	 * other checkpointing code processing the transaction...
960 	 */
961 	spin_lock(&journal->j_state_lock);
962 	spin_lock(&journal->j_list_lock);
963 	/*
964 	 * Now recheck whether any buffers got attached to the transaction's
965 	 * forget list while the lock was dropped...
966 	 */
967 	if (commit_transaction->t_forget) {
968 		spin_unlock(&journal->j_list_lock);
969 		spin_unlock(&journal->j_state_lock);
970 		goto restart_loop;
971 	}
972 
973 	/* Done with this transaction! */
974 
975 	jbd_debug(3, "JBD: commit phase 7\n");
976 
977 	J_ASSERT(commit_transaction->t_state == T_COMMIT);
978 
979 	commit_transaction->t_start = jiffies;
980 	stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
981 						commit_transaction->t_start);
982 
983 	/*
984 	 * File the transaction for history
985 	 */
986 	stats.ts_type = JBD2_STATS_RUN;
987 	stats.ts_tid = commit_transaction->t_tid;
988 	stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
989 	spin_lock(&journal->j_history_lock);
990 	memcpy(journal->j_history + journal->j_history_cur, &stats,
991 			sizeof(stats));
992 	if (++journal->j_history_cur == journal->j_history_max)
993 		journal->j_history_cur = 0;
994 
995 	/*
996 	 * Calculate overall stats
997 	 */
998 	journal->j_stats.ts_tid++;
999 	journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
1000 	journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
1001 	journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
1002 	journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
1003 	journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
1004 	journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
1005 	journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
1006 	journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
1007 	spin_unlock(&journal->j_history_lock);
1008 
1009 	commit_transaction->t_state = T_FINISHED;
1010 	J_ASSERT(commit_transaction == journal->j_committing_transaction);
1011 	journal->j_commit_sequence = commit_transaction->t_tid;
1012 	journal->j_committing_transaction = NULL;
1013 	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1014 
1015 	/*
1016 	 * weight the average time higher than the latest commit time so we
1017 	 * don't react too strongly to vast changes in the commit time
1018 	 */
1019 	if (likely(journal->j_average_commit_time))
1020 		journal->j_average_commit_time = (commit_time +
1021 				journal->j_average_commit_time*3) / 4;
1022 	else
1023 		journal->j_average_commit_time = commit_time;
1024 	spin_unlock(&journal->j_state_lock);
1025 
1026 	if (commit_transaction->t_checkpoint_list == NULL &&
1027 	    commit_transaction->t_checkpoint_io_list == NULL) {
1028 		__jbd2_journal_drop_transaction(journal, commit_transaction);
1029 		to_free = 1;
1030 	} else {
1031 		if (journal->j_checkpoint_transactions == NULL) {
1032 			journal->j_checkpoint_transactions = commit_transaction;
1033 			commit_transaction->t_cpnext = commit_transaction;
1034 			commit_transaction->t_cpprev = commit_transaction;
1035 		} else {
1036 			commit_transaction->t_cpnext =
1037 				journal->j_checkpoint_transactions;
1038 			commit_transaction->t_cpprev =
1039 				commit_transaction->t_cpnext->t_cpprev;
1040 			commit_transaction->t_cpnext->t_cpprev =
1041 				commit_transaction;
1042 			commit_transaction->t_cpprev->t_cpnext =
1043 				commit_transaction;
1044 		}
1045 	}
1046 	spin_unlock(&journal->j_list_lock);
1047 
1048 	if (journal->j_commit_callback)
1049 		journal->j_commit_callback(journal, commit_transaction);
1050 
1051 	trace_mark(jbd2_end_commit, "dev %s transaction %d head %d",
1052 		   journal->j_devname, commit_transaction->t_tid,
1053 		   journal->j_tail_sequence);
1054 	jbd_debug(1, "JBD: commit %d complete, head %d\n",
1055 		  journal->j_commit_sequence, journal->j_tail_sequence);
1056 	if (to_free)
1057 		kfree(commit_transaction);
1058 
1059 	wake_up(&journal->j_wait_done_commit);
1060 }
1061