xref: /openbmc/linux/fs/btrfs/transaction.c (revision e8e0929d)
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/sched.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "locking.h"
#include "tree-log.h"

#define BTRFS_ROOT_TRANS_TAG 0

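/*
 * drop a reference on a transaction.  When the count hits zero, the
 * transaction is removed from the global trans_list and freed.
 */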
static noinline void put_transaction(struct btrfs_transaction *transaction)
{
	WARN_ON(transaction->use_count == 0);
	transaction->use_count--;
	if (transaction->use_count == 0) {
		list_del_init(&transaction->list);
		memset(transaction, 0, sizeof(*transaction));
		kmem_cache_free(btrfs_transaction_cachep, transaction);
	}
}

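/*
 * make the current root node the new commit root for this tree,
 * dropping our reference on the old commit root.
 */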
static noinline void switch_commit_root(struct btrfs_root *root)
{
	free_extent_buffer(root->commit_root);
	root->commit_root = btrfs_root_node(root);
}

/*
 * either allocate a new transaction or hop into the existing one
 */
static noinline int join_transaction(struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans;
	cur_trans = root->fs_info->running_transaction;
	if (!cur_trans) {
		cur_trans = kmem_cache_alloc(btrfs_transaction_cachep,
					     GFP_NOFS);
		BUG_ON(!cur_trans);
		root->fs_info->generation++;
		cur_trans->num_writers = 1;
		cur_trans->num_joined = 0;
		cur_trans->transid = root->fs_info->generation;
		init_waitqueue_head(&cur_trans->writer_wait);
		init_waitqueue_head(&cur_trans->commit_wait);
		cur_trans->in_commit = 0;
		cur_trans->blocked = 0;
		cur_trans->use_count = 1;
		cur_trans->commit_done = 0;
		cur_trans->start_time = get_seconds();

		cur_trans->delayed_refs.root.rb_node = NULL;
		cur_trans->delayed_refs.num_entries = 0;
		cur_trans->delayed_refs.num_heads_ready = 0;
		cur_trans->delayed_refs.num_heads = 0;
		cur_trans->delayed_refs.flushing = 0;
		cur_trans->delayed_refs.run_delayed_start = 0;
		spin_lock_init(&cur_trans->delayed_refs.lock);

		INIT_LIST_HEAD(&cur_trans->pending_snapshots);
		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
		extent_io_tree_init(&cur_trans->dirty_pages,
				     root->fs_info->btree_inode->i_mapping,
				     GFP_NOFS);
		spin_lock(&root->fs_info->new_trans_lock);
		root->fs_info->running_transaction = cur_trans;
		spin_unlock(&root->fs_info->new_trans_lock);
	} else {
		cur_trans->num_writers++;
		cur_trans->num_joined++;
	}

	return 0;
}

/*
 * this does all the record keeping required to make sure that a reference
 * counted root is properly recorded in a given transaction.  This is required
 * to make sure the old root from before we joined the transaction is deleted
 * when the transaction commits
 */
static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root)
{
	if (root->ref_cows && root->last_trans < trans->transid) {
		WARN_ON(root == root->fs_info->extent_root);
		WARN_ON(root->commit_root != root->node);

		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
			   (unsigned long)root->root_key.objectid,
			   BTRFS_ROOT_TRANS_TAG);
		root->last_trans = trans->transid;
		btrfs_init_reloc_root(trans, root);
	}
	return 0;
}

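/*
 * exported wrapper around record_root_in_trans() that takes the
 * trans_mutex and skips roots that aren't reference counted.
 */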
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	if (!root->ref_cows)
		return 0;

	mutex_lock(&root->fs_info->trans_mutex);
	if (root->last_trans == trans->transid) {
		mutex_unlock(&root->fs_info->trans_mutex);
		return 0;
	}

	record_root_in_trans(trans, root);
	mutex_unlock(&root->fs_info->trans_mutex);
	return 0;
}

/* wait for commit against the current transaction to become unblocked.
 * When this is done, it is safe to start a new transaction, but the current
 * transaction might not be fully on disk.
 */
static void wait_current_trans(struct btrfs_root *root)
{
	struct btrfs_transaction *cur_trans;

	cur_trans = root->fs_info->running_transaction;
	if (cur_trans && cur_trans->blocked) {
		DEFINE_WAIT(wait);
		cur_trans->use_count++;
		while (1) {
			prepare_to_wait(&root->fs_info->transaction_wait, &wait,
					TASK_UNINTERRUPTIBLE);
			if (cur_trans->blocked) {
				mutex_unlock(&root->fs_info->trans_mutex);
				schedule();
				mutex_lock(&root->fs_info->trans_mutex);
				finish_wait(&root->fs_info->transaction_wait,
					    &wait);
			} else {
				finish_wait(&root->fs_info->transaction_wait,
					    &wait);
				break;
			}
		}
		put_transaction(cur_trans);
	}
}

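/*
 * common helper for starting or joining a transaction.  The wait argument
 * controls how a blocked (committing) transaction is treated: 0 joins it
 * immediately, 1 waits for it unless a userland ioctl transaction is open,
 * and 2 always waits.  Waiting is skipped entirely during log replay.
 */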
static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
					     int num_blocks, int wait)
{
	struct btrfs_trans_handle *h =
		kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
	int ret;

	mutex_lock(&root->fs_info->trans_mutex);
	if (!root->fs_info->log_root_recovering &&
	    ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2))
		wait_current_trans(root);
	ret = join_transaction(root);
	BUG_ON(ret);

	h->transid = root->fs_info->running_transaction->transid;
	h->transaction = root->fs_info->running_transaction;
	h->blocks_reserved = num_blocks;
	h->blocks_used = 0;
	h->block_group = 0;
	h->alloc_exclude_nr = 0;
	h->alloc_exclude_start = 0;
	h->delayed_ref_updates = 0;

	if (!current->journal_info)
		current->journal_info = h;

	root->fs_info->running_transaction->use_count++;
	record_root_in_trans(h, root);
	mutex_unlock(&root->fs_info->trans_mutex);
	return h;
}

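/*
 * btrfs_start_transaction normally waits for a committing transaction to
 * unblock (unless a userland ioctl transaction is open),
 * btrfs_join_transaction never waits, and btrfs_start_ioctl_transaction
 * always waits.  See start_transaction() above for the details.
 */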
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
						   int num_blocks)
{
	return start_transaction(root, num_blocks, 1);
}

struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
						   int num_blocks)
{
	return start_transaction(root, num_blocks, 0);
}

struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
							 int num_blocks)
{
	return start_transaction(r, num_blocks, 2);
}

/* wait for a transaction commit to be fully complete */
static noinline int wait_for_commit(struct btrfs_root *root,
				    struct btrfs_transaction *commit)
{
	DEFINE_WAIT(wait);
	mutex_lock(&root->fs_info->trans_mutex);
	while (!commit->commit_done) {
		prepare_to_wait(&commit->commit_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		if (commit->commit_done)
			break;
		mutex_unlock(&root->fs_info->trans_mutex);
		schedule();
		mutex_lock(&root->fs_info->trans_mutex);
	}
	mutex_unlock(&root->fs_info->trans_mutex);
	finish_wait(&commit->commit_wait, &wait);
	return 0;
}

#if 0
/*
 * rate limit against the drop_snapshot code.  This helps to slow down new
 * operations if the drop_snapshot code isn't able to keep up.
 */
static void throttle_on_drops(struct btrfs_root *root)
{
	struct btrfs_fs_info *info = root->fs_info;
	int harder_count = 0;

harder:
	if (atomic_read(&info->throttles)) {
		DEFINE_WAIT(wait);
		int thr;
		thr = atomic_read(&info->throttle_gen);

		do {
			prepare_to_wait(&info->transaction_throttle,
					&wait, TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&info->throttles)) {
				finish_wait(&info->transaction_throttle, &wait);
				break;
			}
			schedule();
			finish_wait(&info->transaction_throttle, &wait);
		} while (thr == atomic_read(&info->throttle_gen));
		harder_count++;

		if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 &&
		    harder_count < 2)
			goto harder;

		if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 &&
		    harder_count < 10)
			goto harder;

		if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 &&
		    harder_count < 20)
			goto harder;
	}
}
#endif

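/*
 * give other procs a chance by waiting for the currently committing
 * transaction to unblock, unless a userland ioctl transaction is open.
 */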
void btrfs_throttle(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->trans_mutex);
	if (!root->fs_info->open_ioctl_trans)
		wait_current_trans(root);
	mutex_unlock(&root->fs_info->trans_mutex);
}

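/*
 * end a transaction handle: run a few batches of delayed refs if many have
 * built up, then drop our writer count and wake anyone waiting to commit.
 */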
static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, int throttle)
{
	struct btrfs_transaction *cur_trans;
	struct btrfs_fs_info *info = root->fs_info;
	int count = 0;

	while (count < 4) {
		unsigned long cur = trans->delayed_ref_updates;
		trans->delayed_ref_updates = 0;
		if (cur &&
		    trans->transaction->delayed_refs.num_heads_ready > 64) {
			trans->delayed_ref_updates = 0;

			/*
			 * do a full flush if the transaction is trying
			 * to close
			 */
			if (trans->transaction->delayed_refs.flushing)
				cur = 0;
			btrfs_run_delayed_refs(trans, root, cur);
		} else {
			break;
		}
		count++;
	}

	mutex_lock(&info->trans_mutex);
	cur_trans = info->running_transaction;
	WARN_ON(cur_trans != trans->transaction);
	WARN_ON(cur_trans->num_writers < 1);
	cur_trans->num_writers--;

	if (waitqueue_active(&cur_trans->writer_wait))
		wake_up(&cur_trans->writer_wait);
	put_transaction(cur_trans);
	mutex_unlock(&info->trans_mutex);

	if (current->journal_info == trans)
		current->journal_info = NULL;
	memset(trans, 0, sizeof(*trans));
	kmem_cache_free(btrfs_trans_handle_cachep, trans);

	return 0;
}

int btrfs_end_transaction(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 0);
}

int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	return __btrfs_end_transaction(trans, root, 1);
}

/*
 * when btree blocks are allocated, they have some corresponding bits set for
 * them in one of two extent_io trees.  This is used to make sure all of
 * those extents are on disk for transaction or log commit
 */
int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
					struct extent_io_tree *dirty_pages)
{
	int ret;
	int err = 0;
	int werr = 0;
	struct page *page;
	struct inode *btree_inode = root->fs_info->btree_inode;
	u64 start = 0;
	u64 end;
	unsigned long index;

	while (1) {
		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
					    EXTENT_DIRTY);
		if (ret)
			break;
		while (start <= end) {
			cond_resched();

			index = start >> PAGE_CACHE_SHIFT;
			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
			page = find_get_page(btree_inode->i_mapping, index);
			if (!page)
				continue;

			btree_lock_page_hook(page);
			if (!page->mapping) {
				unlock_page(page);
				page_cache_release(page);
				continue;
			}

			if (PageWriteback(page)) {
				if (PageDirty(page))
					wait_on_page_writeback(page);
				else {
					unlock_page(page);
					page_cache_release(page);
					continue;
				}
			}
			err = write_one_page(page, 0);
			if (err)
				werr = err;
			page_cache_release(page);
		}
	}
	while (1) {
		ret = find_first_extent_bit(dirty_pages, 0, &start, &end,
					    EXTENT_DIRTY);
		if (ret)
			break;

		clear_extent_dirty(dirty_pages, start, end, GFP_NOFS);
		while (start <= end) {
			index = start >> PAGE_CACHE_SHIFT;
			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
			page = find_get_page(btree_inode->i_mapping, index);
			if (!page)
				continue;
			if (PageDirty(page)) {
				btree_lock_page_hook(page);
				wait_on_page_writeback(page);
				err = write_one_page(page, 0);
				if (err)
					werr = err;
			}
			wait_on_page_writeback(page);
			page_cache_release(page);
			cond_resched();
		}
	}
	if (err)
		werr = err;
	return werr;
}

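/*
 * write out and wait on all the dirty btree pages for this transaction,
 * or on the whole btree inode if we don't have a transaction handle.
 */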
int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root)
{
	if (!trans || !trans->transaction) {
		struct inode *btree_inode;
		btree_inode = root->fs_info->btree_inode;
		return filemap_write_and_wait(btree_inode->i_mapping);
	}
	return btrfs_write_and_wait_marked_extents(root,
					   &trans->transaction->dirty_pages);
}

/*
 * this is used to update the root pointer in the tree of tree roots.
 *
 * But, in the case of the extent allocation tree, updating the root
 * pointer may allocate blocks which may change the root of the extent
 * allocation tree.
 *
 * So, this loops and repeats and makes sure the cowonly root didn't
 * change while the root pointer was being updated in the metadata.
 */
static int update_cowonly_root(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	int ret;
	u64 old_root_bytenr;
	struct btrfs_root *tree_root = root->fs_info->tree_root;

	btrfs_write_dirty_block_groups(trans, root);

	while (1) {
		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
		if (old_root_bytenr == root->node->start)
			break;

		btrfs_set_root_node(&root->root_item, root->node);
		ret = btrfs_update_root(trans, tree_root,
					&root->root_key,
					&root->root_item);
		BUG_ON(ret);

		ret = btrfs_write_dirty_block_groups(trans, root);
		BUG_ON(ret);
	}

	if (root != root->fs_info->extent_root)
		switch_commit_root(root);

	return 0;
}

/*
 * update all the cowonly tree roots on disk
 */
static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct list_head *next;
	struct extent_buffer *eb;
	int ret;

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	eb = btrfs_lock_root_node(fs_info->tree_root);
	btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	while (!list_empty(&fs_info->dirty_cowonly_roots)) {
		next = fs_info->dirty_cowonly_roots.next;
		list_del_init(next);
		root = list_entry(next, struct btrfs_root, dirty_list);

		update_cowonly_root(trans, root);
	}

	down_write(&fs_info->extent_commit_sem);
	switch_commit_root(fs_info->extent_root);
	up_write(&fs_info->extent_commit_sem);

	return 0;
}

/*
 * dead roots are old snapshots that need to be deleted.  This adds the
 * given root to the list of dead roots that will be cleaned up later.
 */
int btrfs_add_dead_root(struct btrfs_root *root)
{
	mutex_lock(&root->fs_info->trans_mutex);
	list_add(&root->root_list, &root->fs_info->dead_roots);
	mutex_unlock(&root->fs_info->trans_mutex);
	return 0;
}

/*
 * write the updated root items for all the fs roots (subvolumes) that
 * changed in this transaction into the tree of tree roots
 */
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root)
{
	struct btrfs_root *gang[8];
	struct btrfs_fs_info *fs_info = root->fs_info;
	int i;
	int ret;
	int err = 0;

	while (1) {
		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
						 (void **)gang, 0,
						 ARRAY_SIZE(gang),
						 BTRFS_ROOT_TRANS_TAG);
		if (ret == 0)
			break;
		for (i = 0; i < ret; i++) {
			root = gang[i];
			radix_tree_tag_clear(&fs_info->fs_roots_radix,
					(unsigned long)root->root_key.objectid,
					BTRFS_ROOT_TRANS_TAG);

			btrfs_free_log(trans, root);
			btrfs_update_reloc_root(trans, root);

			if (root->commit_root != root->node) {
				switch_commit_root(root);
				btrfs_set_root_node(&root->root_item,
						    root->node);
			}

			err = btrfs_update_root(trans, fs_info->tree_root,
						&root->root_key,
						&root->root_item);
			if (err)
				break;
		}
	}
	return err;
}

/*
 * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
 * otherwise every leaf in the btree is read and defragged.
 */
int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
{
	struct btrfs_fs_info *info = root->fs_info;
	int ret;
	struct btrfs_trans_handle *trans;
	unsigned long nr;

	smp_mb();
	if (root->defrag_running)
		return 0;
	trans = btrfs_start_transaction(root, 1);
	while (1) {
		root->defrag_running = 1;
		ret = btrfs_defrag_leaves(trans, root, cacheonly);
		nr = trans->blocks_used;
		btrfs_end_transaction(trans, root);
		btrfs_btree_balance_dirty(info->tree_root, nr);
		cond_resched();

		trans = btrfs_start_transaction(root, 1);
		if (root->fs_info->closing || ret != -EAGAIN)
			break;
	}
	root->defrag_running = 0;
	smp_mb();
	btrfs_end_transaction(trans, root);
	return 0;
}

#if 0
/*
 * when dropping snapshots, we generate a ton of delayed refs, and it makes
 * sense not to join the transaction while it is trying to flush the current
 * queue of delayed refs out.
 *
 * This is used by the drop snapshot code only
 */
static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
{
	DEFINE_WAIT(wait);

	mutex_lock(&info->trans_mutex);
	while (info->running_transaction &&
	       info->running_transaction->delayed_refs.flushing) {
		prepare_to_wait(&info->transaction_wait, &wait,
				TASK_UNINTERRUPTIBLE);
		mutex_unlock(&info->trans_mutex);

		schedule();

		mutex_lock(&info->trans_mutex);
		finish_wait(&info->transaction_wait, &wait);
	}
	mutex_unlock(&info->trans_mutex);
	return 0;
}

/*
 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
 * all of them
 */
int btrfs_drop_dead_root(struct btrfs_root *root)
{
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = root->fs_info->tree_root;
	unsigned long nr;
	int ret;

	while (1) {
		/*
		 * we don't want to jump in and create a bunch of
		 * delayed refs if the transaction is starting to close
		 */
		wait_transaction_pre_flush(tree_root->fs_info);
		trans = btrfs_start_transaction(tree_root, 1);

		/*
		 * we've joined a transaction, make sure it isn't
		 * closing right now
		 */
		if (trans->transaction->delayed_refs.flushing) {
			btrfs_end_transaction(trans, tree_root);
			continue;
		}

		ret = btrfs_drop_snapshot(trans, root);
		if (ret != -EAGAIN)
			break;

		ret = btrfs_update_root(trans, tree_root,
					&root->root_key,
					&root->root_item);
		if (ret)
			break;

		nr = trans->blocks_used;
		ret = btrfs_end_transaction(trans, tree_root);
		BUG_ON(ret);

		btrfs_btree_balance_dirty(tree_root, nr);
		cond_resched();
	}
	BUG_ON(ret);

	ret = btrfs_del_root(trans, tree_root, &root->root_key);
	BUG_ON(ret);

	nr = trans->blocks_used;
	ret = btrfs_end_transaction(trans, tree_root);
	BUG_ON(ret);

	free_extent_buffer(root->node);
	free_extent_buffer(root->commit_root);
	kfree(root);

	btrfs_btree_balance_dirty(tree_root, nr);
	return ret;
}
#endif

/*
 * new snapshots need to be created at a very specific time in the
 * transaction commit.  This does the actual creation
 */
static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
				   struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	struct btrfs_key key;
	struct btrfs_root_item *new_root_item;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root = pending->root;
	struct extent_buffer *tmp;
	struct extent_buffer *old;
	int ret;
	u64 objectid;

	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
	if (!new_root_item) {
		ret = -ENOMEM;
		goto fail;
	}
	ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
	if (ret)
		goto fail;

	record_root_in_trans(trans, root);
	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));

	key.objectid = objectid;
	/* record when the snapshot was created in key.offset */
	key.offset = trans->transid;
	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);

	old = btrfs_lock_root_node(root);
	btrfs_cow_block(trans, root, old, NULL, 0, &old);
	btrfs_set_lock_blocking(old);

	btrfs_copy_root(trans, root, old, &tmp, objectid);
	btrfs_tree_unlock(old);
	free_extent_buffer(old);

	btrfs_set_root_node(new_root_item, tmp);
	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
				new_root_item);
	btrfs_tree_unlock(tmp);
	free_extent_buffer(tmp);
	if (ret)
		goto fail;

	key.offset = (u64)-1;
	memcpy(&pending->root_key, &key, sizeof(key));
fail:
	kfree(new_root_item);
	btrfs_unreserve_metadata_space(root, 6);
	return ret;
}

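/*
 * after the snapshot root has been committed, add the directory entry
 * and root backref that make it visible in the parent directory.
 */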
static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
				   struct btrfs_pending_snapshot *pending)
{
	int ret;
	int namelen;
	u64 index = 0;
	struct btrfs_trans_handle *trans;
	struct inode *parent_inode;
	struct inode *inode;
	struct btrfs_root *parent_root;

	parent_inode = pending->dentry->d_parent->d_inode;
	parent_root = BTRFS_I(parent_inode)->root;
	trans = btrfs_join_transaction(parent_root, 1);

	/*
	 * insert the directory item
	 */
	namelen = strlen(pending->name);
	ret = btrfs_set_inode_index(parent_inode, &index);
	ret = btrfs_insert_dir_item(trans, parent_root,
			    pending->name, namelen,
			    parent_inode->i_ino,
			    &pending->root_key, BTRFS_FT_DIR, index);

	if (ret)
		goto fail;

	btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
	ret = btrfs_update_inode(trans, parent_root, parent_inode);
	BUG_ON(ret);

	ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
				 pending->root_key.objectid,
				 parent_root->root_key.objectid,
				 parent_inode->i_ino, index, pending->name,
				 namelen);

	BUG_ON(ret);

	inode = btrfs_lookup_dentry(parent_inode, pending->dentry);
	d_instantiate(pending->dentry, inode);
fail:
	btrfs_end_transaction(trans, fs_info->fs_root);
	return ret;
}

/*
 * create all the snapshots we've scheduled for creation
 */
static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info)
{
	struct btrfs_pending_snapshot *pending;
	struct list_head *head = &trans->transaction->pending_snapshots;
	int ret;

	list_for_each_entry(pending, head, list) {
		ret = create_pending_snapshot(trans, fs_info, pending);
		BUG_ON(ret);
	}
	return 0;
}

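/*
 * run finish_pending_snapshot on everything that was scheduled and free
 * the pending list.  This runs after the super block has been written,
 * so the new snapshot roots are safely on disk.
 */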
static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
					     struct btrfs_fs_info *fs_info)
{
	struct btrfs_pending_snapshot *pending;
	struct list_head *head = &trans->transaction->pending_snapshots;
	int ret;

	while (!list_empty(head)) {
		pending = list_entry(head->next,
				     struct btrfs_pending_snapshot, list);
		ret = finish_pending_snapshot(fs_info, pending);
		BUG_ON(ret);
		list_del(&pending->list);
		kfree(pending->name);
		kfree(pending);
	}
	return 0;
}

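/*
 * copy the current tree root and chunk root pointers into the in-memory
 * copy of the super block so they land in the next super write.
 */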
static void update_super_roots(struct btrfs_root *root)
{
	struct btrfs_root_item *root_item;
	struct btrfs_super_block *super;

	super = &root->fs_info->super_copy;

	root_item = &root->fs_info->chunk_root->root_item;
	super->chunk_root = root_item->bytenr;
	super->chunk_root_generation = root_item->generation;
	super->chunk_root_level = root_item->level;

	root_item = &root->fs_info->tree_root->root_item;
	super->root = root_item->bytenr;
	super->generation = root_item->generation;
	super->root_level = root_item->level;
}

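/*
 * report whether the currently running transaction has entered its
 * commit phase, without taking the trans_mutex.
 */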
int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
{
	int ret = 0;
	spin_lock(&info->new_trans_lock);
	if (info->running_transaction)
		ret = info->running_transaction->in_commit;
	spin_unlock(&info->new_trans_lock);
	return ret;
}

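/*
 * commit the running transaction: flush delayed refs, wait for other
 * writers to finish or join, create pending snapshots, commit the fs and
 * cowonly roots, write out the dirty btree pages and finally the super
 * block.  If another commit is already in progress we just wait for it.
 */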
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root)
{
	unsigned long joined = 0;
	unsigned long timeout = 1;
	struct btrfs_transaction *cur_trans;
	struct btrfs_transaction *prev_trans = NULL;
	DEFINE_WAIT(wait);
	int ret;
	int should_grow = 0;
	unsigned long now = get_seconds();
	int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);

	btrfs_run_ordered_operations(root, 0);

	/* make a pass through all the delayed refs we have so far
	 * any running procs may add more while we are here
	 */
	ret = btrfs_run_delayed_refs(trans, root, 0);
	BUG_ON(ret);

	cur_trans = trans->transaction;
	/*
	 * set the flushing flag so procs in this transaction have to
	 * start sending their work down.
	 */
	cur_trans->delayed_refs.flushing = 1;

	ret = btrfs_run_delayed_refs(trans, root, 0);
	BUG_ON(ret);

	mutex_lock(&root->fs_info->trans_mutex);
	if (cur_trans->in_commit) {
		cur_trans->use_count++;
		mutex_unlock(&root->fs_info->trans_mutex);
		btrfs_end_transaction(trans, root);

		ret = wait_for_commit(root, cur_trans);
		BUG_ON(ret);

		mutex_lock(&root->fs_info->trans_mutex);
		put_transaction(cur_trans);
		mutex_unlock(&root->fs_info->trans_mutex);

		return 0;
	}

	trans->transaction->in_commit = 1;
	trans->transaction->blocked = 1;
	if (cur_trans->list.prev != &root->fs_info->trans_list) {
		prev_trans = list_entry(cur_trans->list.prev,
					struct btrfs_transaction, list);
		if (!prev_trans->commit_done) {
			prev_trans->use_count++;
			mutex_unlock(&root->fs_info->trans_mutex);

			wait_for_commit(root, prev_trans);

			mutex_lock(&root->fs_info->trans_mutex);
			put_transaction(prev_trans);
		}
	}

	if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
		should_grow = 1;

	do {
		int snap_pending = 0;
		joined = cur_trans->num_joined;
		if (!list_empty(&trans->transaction->pending_snapshots))
			snap_pending = 1;

		WARN_ON(cur_trans != trans->transaction);
		prepare_to_wait(&cur_trans->writer_wait, &wait,
				TASK_UNINTERRUPTIBLE);

		if (cur_trans->num_writers > 1)
			timeout = MAX_SCHEDULE_TIMEOUT;
		else if (should_grow)
			timeout = 1;

		mutex_unlock(&root->fs_info->trans_mutex);

		if (flush_on_commit) {
			btrfs_start_delalloc_inodes(root);
			ret = btrfs_wait_ordered_extents(root, 0);
			BUG_ON(ret);
		} else if (snap_pending) {
			ret = btrfs_wait_ordered_extents(root, 1);
			BUG_ON(ret);
		}

		/*
		 * rename doesn't use btrfs_join_transaction, so once we
		 * set the transaction to blocked above, we aren't going
		 * to get any new ordered operations.  We can safely run
		 * them here and know for sure that nothing new will be
		 * added to the list
		 */
		btrfs_run_ordered_operations(root, 1);

		smp_mb();
		if (cur_trans->num_writers > 1 || should_grow)
			schedule_timeout(timeout);

		mutex_lock(&root->fs_info->trans_mutex);
		finish_wait(&cur_trans->writer_wait, &wait);
	} while (cur_trans->num_writers > 1 ||
		 (should_grow && cur_trans->num_joined != joined));

	ret = create_pending_snapshots(trans, root->fs_info);
	BUG_ON(ret);

	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
	BUG_ON(ret);

	WARN_ON(cur_trans != trans->transaction);

	/* commit_fs_roots and commit_cowonly_roots below are responsible
	 * for getting the various roots consistent with each other.  Every
	 * pointer in the tree of tree roots has to point to the most up to
	 * date root for every subvolume and other tree.  So, we have to keep
	 * the tree logging code from jumping in and changing any
	 * of the trees.
	 *
	 * At this point in the commit, there can't be any tree-log
	 * writers, but a little lower down we drop the trans mutex
	 * and let new people in.  By holding the tree_log_mutex
	 * from now until after the super is written, we avoid races
	 * with the tree-log code.
	 */
	mutex_lock(&root->fs_info->tree_log_mutex);

	ret = commit_fs_roots(trans, root);
	BUG_ON(ret);

	/* commit_fs_roots gets rid of all the tree log roots, so it is now
	 * safe to free the root of the tree of log roots
	 */
	btrfs_free_log_root_tree(trans, root->fs_info);

	ret = commit_cowonly_roots(trans, root);
	BUG_ON(ret);

	btrfs_prepare_extent_commit(trans, root);

	cur_trans = root->fs_info->running_transaction;
	spin_lock(&root->fs_info->new_trans_lock);
	root->fs_info->running_transaction = NULL;
	spin_unlock(&root->fs_info->new_trans_lock);

	btrfs_set_root_node(&root->fs_info->tree_root->root_item,
			    root->fs_info->tree_root->node);
	switch_commit_root(root->fs_info->tree_root);

	btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
			    root->fs_info->chunk_root->node);
	switch_commit_root(root->fs_info->chunk_root);

	update_super_roots(root);

	if (!root->fs_info->log_root_recovering) {
		btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
		btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
	}

	memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
	       sizeof(root->fs_info->super_copy));

	trans->transaction->blocked = 0;

	wake_up(&root->fs_info->transaction_wait);

	mutex_unlock(&root->fs_info->trans_mutex);
	ret = btrfs_write_and_wait_transaction(trans, root);
	BUG_ON(ret);
	write_ctree_super(trans, root, 0);

	/*
	 * the super is written, we can safely allow the tree-loggers
	 * to go about their business
	 */
	mutex_unlock(&root->fs_info->tree_log_mutex);

	btrfs_finish_extent_commit(trans, root);

	/* do the directory inserts of any pending snapshot creations */
	finish_pending_snapshots(trans, root->fs_info);

	mutex_lock(&root->fs_info->trans_mutex);

	cur_trans->commit_done = 1;

	root->fs_info->last_trans_committed = cur_trans->transid;

	wake_up(&cur_trans->commit_wait);

	put_transaction(cur_trans);
	put_transaction(cur_trans);

	mutex_unlock(&root->fs_info->trans_mutex);

	if (current->journal_info == trans)
		current->journal_info = NULL;

	kmem_cache_free(btrfs_trans_handle_cachep, trans);
	return ret;
}

/*
 * interface function to delete all the snapshots we have scheduled for deletion
 */
int btrfs_clean_old_snapshots(struct btrfs_root *root)
{
	LIST_HEAD(list);
	struct btrfs_fs_info *fs_info = root->fs_info;

	mutex_lock(&fs_info->trans_mutex);
	list_splice_init(&fs_info->dead_roots, &list);
	mutex_unlock(&fs_info->trans_mutex);

	while (!list_empty(&list)) {
		root = list_entry(list.next, struct btrfs_root, root_list);
		list_del(&root->root_list);

		if (btrfs_header_backref_rev(root->node) <
		    BTRFS_MIXED_BACKREF_REV)
			btrfs_drop_snapshot(root, 0);
		else
			btrfs_drop_snapshot(root, 1);
	}
	return 0;
}