xref: /openbmc/linux/fs/btrfs/volumes.c (revision 9c1f8594)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/bio.h>
20 #include <linux/slab.h>
21 #include <linux/buffer_head.h>
22 #include <linux/blkdev.h>
23 #include <linux/random.h>
24 #include <linux/iocontext.h>
25 #include <linux/capability.h>
26 #include <asm/div64.h>
27 #include "compat.h"
28 #include "ctree.h"
29 #include "extent_map.h"
30 #include "disk-io.h"
31 #include "transaction.h"
32 #include "print-tree.h"
33 #include "volumes.h"
34 #include "async-thread.h"
35 
36 static int init_first_rw_device(struct btrfs_trans_handle *trans,
37 				struct btrfs_root *root,
38 				struct btrfs_device *device);
39 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
40 
41 static DEFINE_MUTEX(uuid_mutex);
42 static LIST_HEAD(fs_uuids);
43 
44 static void lock_chunks(struct btrfs_root *root)
45 {
46 	mutex_lock(&root->fs_info->chunk_mutex);
47 }
48 
49 static void unlock_chunks(struct btrfs_root *root)
50 {
51 	mutex_unlock(&root->fs_info->chunk_mutex);
52 }
53 
54 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
55 {
56 	struct btrfs_device *device;
57 	WARN_ON(fs_devices->opened);
58 	while (!list_empty(&fs_devices->devices)) {
59 		device = list_entry(fs_devices->devices.next,
60 				    struct btrfs_device, dev_list);
61 		list_del(&device->dev_list);
62 		kfree(device->name);
63 		kfree(device);
64 	}
65 	kfree(fs_devices);
66 }
67 
68 int btrfs_cleanup_fs_uuids(void)
69 {
70 	struct btrfs_fs_devices *fs_devices;
71 
72 	while (!list_empty(&fs_uuids)) {
73 		fs_devices = list_entry(fs_uuids.next,
74 					struct btrfs_fs_devices, list);
75 		list_del(&fs_devices->list);
76 		free_fs_devices(fs_devices);
77 	}
78 	return 0;
79 }
80 
81 static noinline struct btrfs_device *__find_device(struct list_head *head,
82 						   u64 devid, u8 *uuid)
83 {
84 	struct btrfs_device *dev;
85 
86 	list_for_each_entry(dev, head, dev_list) {
87 		if (dev->devid == devid &&
88 		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
89 			return dev;
90 		}
91 	}
92 	return NULL;
93 }
94 
95 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
96 {
97 	struct btrfs_fs_devices *fs_devices;
98 
99 	list_for_each_entry(fs_devices, &fs_uuids, list) {
100 		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
101 			return fs_devices;
102 	}
103 	return NULL;
104 }
105 
106 static void requeue_list(struct btrfs_pending_bios *pending_bios,
107 			struct bio *head, struct bio *tail)
108 {
109 
110 	struct bio *old_head;
111 
112 	old_head = pending_bios->head;
113 	pending_bios->head = head;
114 	if (pending_bios->tail)
115 		tail->bi_next = old_head;
116 	else
117 		pending_bios->tail = tail;
118 }
119 
120 /*
121  * we try to collect pending bios for a device so we don't get a large
122  * number of procs sending bios down to the same device.  This greatly
123  * improves the scheduler's ability to collect and merge the bios.
124  *
125  * But, it also turns into a long list of bios to process and that is sure
126  * to eventually make the worker thread block.  The solution here is to
127  * make some progress and then put this work struct back at the end of
128  * the list if the block device is congested.  This way, multiple devices
129  * can make progress from a single worker thread.
130  */
131 static noinline int run_scheduled_bios(struct btrfs_device *device)
132 {
133 	struct bio *pending;
134 	struct backing_dev_info *bdi;
135 	struct btrfs_fs_info *fs_info;
136 	struct btrfs_pending_bios *pending_bios;
137 	struct bio *tail;
138 	struct bio *cur;
139 	int again = 0;
140 	unsigned long num_run;
141 	unsigned long batch_run = 0;
142 	unsigned long limit;
143 	unsigned long last_waited = 0;
144 	int force_reg = 0;
145 	int sync_pending = 0;
146 	struct blk_plug plug;
147 
148 	/*
149 	 * this function runs all the bios we've collected for
150 	 * a particular device.  We don't want to wander off to
151 	 * another device without first sending all of these down.
152 	 * So, set up a plug here and finish it off before we return
153 	 */
154 	blk_start_plug(&plug);
155 
156 	bdi = blk_get_backing_dev_info(device->bdev);
157 	fs_info = device->dev_root->fs_info;
158 	limit = btrfs_async_submit_limit(fs_info);
159 	limit = limit * 2 / 3;
160 
161 loop:
162 	spin_lock(&device->io_lock);
163 
164 loop_lock:
165 	num_run = 0;
166 
167 	/* take all the bios off the list at once and process them
168 	 * later on (without the lock held).  But, remember the
169 	 * tail and other pointers so the bios can be properly reinserted
170 	 * into the list if we hit congestion
171 	 */
172 	if (!force_reg && device->pending_sync_bios.head) {
173 		pending_bios = &device->pending_sync_bios;
174 		force_reg = 1;
175 	} else {
176 		pending_bios = &device->pending_bios;
177 		force_reg = 0;
178 	}
179 
180 	pending = pending_bios->head;
181 	tail = pending_bios->tail;
182 	WARN_ON(pending && !tail);
183 
184 	/*
185 	 * if pending was null this time around, no bios need processing
186 	 * at all and we can stop.  Otherwise it'll loop back up again
187 	 * and do an additional check so no bios are missed.
188 	 *
189 	 * device->running_pending is used to synchronize with the
190 	 * schedule_bio code.
191 	 */
192 	if (device->pending_sync_bios.head == NULL &&
193 	    device->pending_bios.head == NULL) {
194 		again = 0;
195 		device->running_pending = 0;
196 	} else {
197 		again = 1;
198 		device->running_pending = 1;
199 	}
200 
201 	pending_bios->head = NULL;
202 	pending_bios->tail = NULL;
203 
204 	spin_unlock(&device->io_lock);
205 
206 	while (pending) {
207 
208 		rmb();
209 		/* we want to work on both lists, but do more bios on the
210 		 * sync list than the regular list
211 		 */
212 		if ((num_run > 32 &&
213 		    pending_bios != &device->pending_sync_bios &&
214 		    device->pending_sync_bios.head) ||
215 		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
216 		    device->pending_bios.head)) {
217 			spin_lock(&device->io_lock);
218 			requeue_list(pending_bios, pending, tail);
219 			goto loop_lock;
220 		}
221 
222 		cur = pending;
223 		pending = pending->bi_next;
224 		cur->bi_next = NULL;
225 		atomic_dec(&fs_info->nr_async_bios);
226 
227 		if (atomic_read(&fs_info->nr_async_bios) < limit &&
228 		    waitqueue_active(&fs_info->async_submit_wait))
229 			wake_up(&fs_info->async_submit_wait);
230 
231 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
232 
233 		/*
234 		 * if we're doing the sync list, record that our
235 		 * plug has some sync requests on it
236 		 *
237 		 * If we're doing the regular list and there are
238 		 * sync requests sitting around, unplug before
239 		 * we add more
240 		 */
241 		if (pending_bios == &device->pending_sync_bios) {
242 			sync_pending = 1;
243 		} else if (sync_pending) {
244 			blk_finish_plug(&plug);
245 			blk_start_plug(&plug);
246 			sync_pending = 0;
247 		}
248 
249 		submit_bio(cur->bi_rw, cur);
250 		num_run++;
251 		batch_run++;
252 		if (need_resched())
253 			cond_resched();
254 
255 		/*
256 		 * we made progress, there is more work to do and the bdi
257 		 * is now congested.  Back off and let other work structs
258 		 * run instead
259 		 */
260 		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
261 		    fs_info->fs_devices->open_devices > 1) {
262 			struct io_context *ioc;
263 
264 			ioc = current->io_context;
265 
266 			/*
267 			 * the main goal here is that we don't want to
268 			 * block if we're going to be able to submit
269 			 * more requests without blocking.
270 			 *
271 			 * This code does two great things: it pokes into
272 			 * the elevator code from a filesystem _and_
273 			 * it makes assumptions about how batching works.
274 			 */
275 			if (ioc && ioc->nr_batch_requests > 0 &&
276 			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
277 			    (last_waited == 0 ||
278 			     ioc->last_waited == last_waited)) {
279 				/*
280 				 * we want to go through our batch of
281 				 * requests and stop.  So, we copy out
282 				 * the ioc->last_waited time and test
283 				 * against it before looping
284 				 */
285 				last_waited = ioc->last_waited;
286 				if (need_resched())
287 					cond_resched();
288 				continue;
289 			}
290 			spin_lock(&device->io_lock);
291 			requeue_list(pending_bios, pending, tail);
292 			device->running_pending = 1;
293 
294 			spin_unlock(&device->io_lock);
295 			btrfs_requeue_work(&device->work);
296 			goto done;
297 		}
298 	}
299 
300 	cond_resched();
301 	if (again)
302 		goto loop;
303 
304 	spin_lock(&device->io_lock);
305 	if (device->pending_bios.head || device->pending_sync_bios.head)
306 		goto loop_lock;
307 	spin_unlock(&device->io_lock);
308 
309 done:
310 	blk_finish_plug(&plug);
311 	return 0;
312 }
313 
314 static void pending_bios_fn(struct btrfs_work *work)
315 {
316 	struct btrfs_device *device;
317 
318 	device = container_of(work, struct btrfs_device, work);
319 	run_scheduled_bios(device);
320 }
321 
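/*
 * Track a newly scanned device: find (or create) the btrfs_fs_devices entry
 * for the super block's fsid and add this device to it, or just refresh the
 * recorded path if the device is already known.
 */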
322 static noinline int device_list_add(const char *path,
323 			   struct btrfs_super_block *disk_super,
324 			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
325 {
326 	struct btrfs_device *device;
327 	struct btrfs_fs_devices *fs_devices;
328 	u64 found_transid = btrfs_super_generation(disk_super);
329 	char *name;
330 
331 	fs_devices = find_fsid(disk_super->fsid);
332 	if (!fs_devices) {
333 		fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
334 		if (!fs_devices)
335 			return -ENOMEM;
336 		INIT_LIST_HEAD(&fs_devices->devices);
337 		INIT_LIST_HEAD(&fs_devices->alloc_list);
338 		list_add(&fs_devices->list, &fs_uuids);
339 		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
340 		fs_devices->latest_devid = devid;
341 		fs_devices->latest_trans = found_transid;
342 		mutex_init(&fs_devices->device_list_mutex);
343 		device = NULL;
344 	} else {
345 		device = __find_device(&fs_devices->devices, devid,
346 				       disk_super->dev_item.uuid);
347 	}
348 	if (!device) {
349 		if (fs_devices->opened)
350 			return -EBUSY;
351 
352 		device = kzalloc(sizeof(*device), GFP_NOFS);
353 		if (!device) {
354 			/* we can safely leave the fs_devices entry around */
355 			return -ENOMEM;
356 		}
357 		device->devid = devid;
358 		device->work.func = pending_bios_fn;
359 		memcpy(device->uuid, disk_super->dev_item.uuid,
360 		       BTRFS_UUID_SIZE);
361 		spin_lock_init(&device->io_lock);
362 		device->name = kstrdup(path, GFP_NOFS);
363 		if (!device->name) {
364 			kfree(device);
365 			return -ENOMEM;
366 		}
367 		INIT_LIST_HEAD(&device->dev_alloc_list);
368 
369 		mutex_lock(&fs_devices->device_list_mutex);
370 		list_add_rcu(&device->dev_list, &fs_devices->devices);
371 		mutex_unlock(&fs_devices->device_list_mutex);
372 
373 		device->fs_devices = fs_devices;
374 		fs_devices->num_devices++;
375 	} else if (!device->name || strcmp(device->name, path)) {
376 		name = kstrdup(path, GFP_NOFS);
377 		if (!name)
378 			return -ENOMEM;
379 		kfree(device->name);
380 		device->name = name;
381 		if (device->missing) {
382 			fs_devices->missing_devices--;
383 			device->missing = 0;
384 		}
385 	}
386 
387 	if (found_transid > fs_devices->latest_trans) {
388 		fs_devices->latest_devid = devid;
389 		fs_devices->latest_trans = found_transid;
390 	}
391 	*fs_devices_ret = fs_devices;
392 	return 0;
393 }
394 
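/*
 * Make an in-memory copy of an fs_devices structure and of every device on
 * its list.  The copy is unopened and owns duplicated name strings, so it
 * can be torn down independently of the original.
 */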
395 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
396 {
397 	struct btrfs_fs_devices *fs_devices;
398 	struct btrfs_device *device;
399 	struct btrfs_device *orig_dev;
400 
401 	fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
402 	if (!fs_devices)
403 		return ERR_PTR(-ENOMEM);
404 
405 	INIT_LIST_HEAD(&fs_devices->devices);
406 	INIT_LIST_HEAD(&fs_devices->alloc_list);
407 	INIT_LIST_HEAD(&fs_devices->list);
408 	mutex_init(&fs_devices->device_list_mutex);
409 	fs_devices->latest_devid = orig->latest_devid;
410 	fs_devices->latest_trans = orig->latest_trans;
411 	memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
412 
413 	/* We hold the volume lock, so it is safe to clone the devices. */
414 	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
415 		device = kzalloc(sizeof(*device), GFP_NOFS);
416 		if (!device)
417 			goto error;
418 
419 		device->name = kstrdup(orig_dev->name, GFP_NOFS);
420 		if (!device->name) {
421 			kfree(device);
422 			goto error;
423 		}
424 
425 		device->devid = orig_dev->devid;
426 		device->work.func = pending_bios_fn;
427 		memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
428 		spin_lock_init(&device->io_lock);
429 		INIT_LIST_HEAD(&device->dev_list);
430 		INIT_LIST_HEAD(&device->dev_alloc_list);
431 
432 		list_add(&device->dev_list, &fs_devices->devices);
433 		device->fs_devices = fs_devices;
434 		fs_devices->num_devices++;
435 	}
436 	return fs_devices;
437 error:
438 	free_fs_devices(fs_devices);
439 	return ERR_PTR(-ENOMEM);
440 }
441 
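/*
 * After the filesystem metadata has been read, drop any scanned devices that
 * are not referenced by it: close their block devices and remove them from
 * the list, walking any seed filesystems as well.
 */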
442 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
443 {
444 	struct btrfs_device *device, *next;
445 
446 	mutex_lock(&uuid_mutex);
447 again:
448 	/* This is the initialized path, so it is safe to release the devices. */
449 	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
450 		if (device->in_fs_metadata)
451 			continue;
452 
453 		if (device->bdev) {
454 			blkdev_put(device->bdev, device->mode);
455 			device->bdev = NULL;
456 			fs_devices->open_devices--;
457 		}
458 		if (device->writeable) {
459 			list_del_init(&device->dev_alloc_list);
460 			device->writeable = 0;
461 			fs_devices->rw_devices--;
462 		}
463 		list_del_init(&device->dev_list);
464 		fs_devices->num_devices--;
465 		kfree(device->name);
466 		kfree(device);
467 	}
468 
469 	if (fs_devices->seed) {
470 		fs_devices = fs_devices->seed;
471 		goto again;
472 	}
473 
474 	mutex_unlock(&uuid_mutex);
475 	return 0;
476 }
477 
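/*
 * Device structures are freed in two stages: free_device() is the RCU
 * callback and only schedules __free_device() on a workqueue, so that
 * blkdev_put(), which may sleep, runs in process context rather than inside
 * an RCU callback.
 */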
478 static void __free_device(struct work_struct *work)
479 {
480 	struct btrfs_device *device;
481 
482 	device = container_of(work, struct btrfs_device, rcu_work);
483 
484 	if (device->bdev)
485 		blkdev_put(device->bdev, device->mode);
486 
487 	kfree(device->name);
488 	kfree(device);
489 }
490 
491 static void free_device(struct rcu_head *head)
492 {
493 	struct btrfs_device *device;
494 
495 	device = container_of(head, struct btrfs_device, rcu);
496 
497 	INIT_WORK(&device->rcu_work, __free_device);
498 	schedule_work(&device->rcu_work);
499 }
500 
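/*
 * Drop one open reference; when the last one goes away, close every device.
 * Each btrfs_device on the list is replaced by a stripped-down copy (no bdev,
 * not writeable) and the old structure is freed after an RCU grace period so
 * that lockless readers of the list stay safe.
 */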
501 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
502 {
503 	struct btrfs_device *device;
504 
505 	if (--fs_devices->opened > 0)
506 		return 0;
507 
508 	mutex_lock(&fs_devices->device_list_mutex);
509 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
510 		struct btrfs_device *new_device;
511 
512 		if (device->bdev)
513 			fs_devices->open_devices--;
514 
515 		if (device->writeable) {
516 			list_del_init(&device->dev_alloc_list);
517 			fs_devices->rw_devices--;
518 		}
519 
520 		if (device->can_discard)
521 			fs_devices->num_can_discard--;
522 
523 		new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
524 		BUG_ON(!new_device);
525 		memcpy(new_device, device, sizeof(*new_device));
526 		new_device->name = kstrdup(device->name, GFP_NOFS);
527 		BUG_ON(device->name && !new_device->name);
528 		new_device->bdev = NULL;
529 		new_device->writeable = 0;
530 		new_device->in_fs_metadata = 0;
531 		new_device->can_discard = 0;
532 		list_replace_rcu(&device->dev_list, &new_device->dev_list);
533 
534 		call_rcu(&device->rcu, free_device);
535 	}
536 	mutex_unlock(&fs_devices->device_list_mutex);
537 
538 	WARN_ON(fs_devices->open_devices);
539 	WARN_ON(fs_devices->rw_devices);
540 	fs_devices->opened = 0;
541 	fs_devices->seeding = 0;
542 
543 	return 0;
544 }
545 
546 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
547 {
548 	struct btrfs_fs_devices *seed_devices = NULL;
549 	int ret;
550 
551 	mutex_lock(&uuid_mutex);
552 	ret = __btrfs_close_devices(fs_devices);
553 	if (!fs_devices->opened) {
554 		seed_devices = fs_devices->seed;
555 		fs_devices->seed = NULL;
556 	}
557 	mutex_unlock(&uuid_mutex);
558 
559 	while (seed_devices) {
560 		fs_devices = seed_devices;
561 		seed_devices = fs_devices->seed;
562 		__btrfs_close_devices(fs_devices);
563 		free_fs_devices(fs_devices);
564 	}
565 	return ret;
566 }
567 
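/*
 * Open every device on the list, validate its super block, and remember the
 * device with the highest generation as latest_bdev.  Devices that fail to
 * open or don't match are skipped; -EIO is returned only when none of them
 * could be opened.
 */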
568 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
569 				fmode_t flags, void *holder)
570 {
571 	struct request_queue *q;
572 	struct block_device *bdev;
573 	struct list_head *head = &fs_devices->devices;
574 	struct btrfs_device *device;
575 	struct block_device *latest_bdev = NULL;
576 	struct buffer_head *bh;
577 	struct btrfs_super_block *disk_super;
578 	u64 latest_devid = 0;
579 	u64 latest_transid = 0;
580 	u64 devid;
581 	int seeding = 1;
582 	int ret = 0;
583 
584 	flags |= FMODE_EXCL;
585 
586 	list_for_each_entry(device, head, dev_list) {
587 		if (device->bdev)
588 			continue;
589 		if (!device->name)
590 			continue;
591 
592 		bdev = blkdev_get_by_path(device->name, flags, holder);
593 		if (IS_ERR(bdev)) {
594 			printk(KERN_INFO "open %s failed\n", device->name);
595 			goto error;
596 		}
597 		set_blocksize(bdev, 4096);
598 
599 		bh = btrfs_read_dev_super(bdev);
600 		if (!bh) {
601 			ret = -EINVAL;
602 			goto error_close;
603 		}
604 
605 		disk_super = (struct btrfs_super_block *)bh->b_data;
606 		devid = btrfs_stack_device_id(&disk_super->dev_item);
607 		if (devid != device->devid)
608 			goto error_brelse;
609 
610 		if (memcmp(device->uuid, disk_super->dev_item.uuid,
611 			   BTRFS_UUID_SIZE))
612 			goto error_brelse;
613 
614 		device->generation = btrfs_super_generation(disk_super);
615 		if (!latest_transid || device->generation > latest_transid) {
616 			latest_devid = devid;
617 			latest_transid = device->generation;
618 			latest_bdev = bdev;
619 		}
620 
621 		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
622 			device->writeable = 0;
623 		} else {
624 			device->writeable = !bdev_read_only(bdev);
625 			seeding = 0;
626 		}
627 
628 		q = bdev_get_queue(bdev);
629 		if (blk_queue_discard(q)) {
630 			device->can_discard = 1;
631 			fs_devices->num_can_discard++;
632 		}
633 
634 		device->bdev = bdev;
635 		device->in_fs_metadata = 0;
636 		device->mode = flags;
637 
638 		if (!blk_queue_nonrot(bdev_get_queue(bdev)))
639 			fs_devices->rotating = 1;
640 
641 		fs_devices->open_devices++;
642 		if (device->writeable) {
643 			fs_devices->rw_devices++;
644 			list_add(&device->dev_alloc_list,
645 				 &fs_devices->alloc_list);
646 		}
647 		brelse(bh);
648 		continue;
649 
650 error_brelse:
651 		brelse(bh);
652 error_close:
653 		blkdev_put(bdev, flags);
654 error:
655 		continue;
656 	}
657 	if (fs_devices->open_devices == 0) {
658 		ret = -EIO;
659 		goto out;
660 	}
661 	fs_devices->seeding = seeding;
662 	fs_devices->opened = 1;
663 	fs_devices->latest_bdev = latest_bdev;
664 	fs_devices->latest_devid = latest_devid;
665 	fs_devices->latest_trans = latest_transid;
666 	fs_devices->total_rw_bytes = 0;
667 out:
668 	return ret;
669 }
670 
671 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
672 		       fmode_t flags, void *holder)
673 {
674 	int ret;
675 
676 	mutex_lock(&uuid_mutex);
677 	if (fs_devices->opened) {
678 		fs_devices->opened++;
679 		ret = 0;
680 	} else {
681 		ret = __btrfs_open_devices(fs_devices, flags, holder);
682 	}
683 	mutex_unlock(&uuid_mutex);
684 	return ret;
685 }
686 
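/*
 * Read the super block from the device at @path, print its identity and
 * register it in the list of scanned filesystems via device_list_add().
 * The block device itself is released again before returning.
 */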
687 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
688 			  struct btrfs_fs_devices **fs_devices_ret)
689 {
690 	struct btrfs_super_block *disk_super;
691 	struct block_device *bdev;
692 	struct buffer_head *bh;
693 	int ret;
694 	u64 devid;
695 	u64 transid;
696 
697 	mutex_lock(&uuid_mutex);
698 
699 	flags |= FMODE_EXCL;
700 	bdev = blkdev_get_by_path(path, flags, holder);
701 
702 	if (IS_ERR(bdev)) {
703 		ret = PTR_ERR(bdev);
704 		goto error;
705 	}
706 
707 	ret = set_blocksize(bdev, 4096);
708 	if (ret)
709 		goto error_close;
710 	bh = btrfs_read_dev_super(bdev);
711 	if (!bh) {
712 		ret = -EINVAL;
713 		goto error_close;
714 	}
715 	disk_super = (struct btrfs_super_block *)bh->b_data;
716 	devid = btrfs_stack_device_id(&disk_super->dev_item);
717 	transid = btrfs_super_generation(disk_super);
718 	if (disk_super->label[0])
719 		printk(KERN_INFO "device label %s ", disk_super->label);
720 	else
721 		printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
722 	printk(KERN_CONT "devid %llu transid %llu %s\n",
723 	       (unsigned long long)devid, (unsigned long long)transid, path);
724 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
725 
726 	brelse(bh);
727 error_close:
728 	blkdev_put(bdev, flags);
729 error:
730 	mutex_unlock(&uuid_mutex);
731 	return ret;
732 }
733 
734 /* helper to account the used device space in the range */
735 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
736 				   u64 end, u64 *length)
737 {
738 	struct btrfs_key key;
739 	struct btrfs_root *root = device->dev_root;
740 	struct btrfs_dev_extent *dev_extent;
741 	struct btrfs_path *path;
742 	u64 extent_end;
743 	int ret;
744 	int slot;
745 	struct extent_buffer *l;
746 
747 	*length = 0;
748 
749 	if (start >= device->total_bytes)
750 		return 0;
751 
752 	path = btrfs_alloc_path();
753 	if (!path)
754 		return -ENOMEM;
755 	path->reada = 2;
756 
757 	key.objectid = device->devid;
758 	key.offset = start;
759 	key.type = BTRFS_DEV_EXTENT_KEY;
760 
761 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
762 	if (ret < 0)
763 		goto out;
764 	if (ret > 0) {
765 		ret = btrfs_previous_item(root, path, key.objectid, key.type);
766 		if (ret < 0)
767 			goto out;
768 	}
769 
770 	while (1) {
771 		l = path->nodes[0];
772 		slot = path->slots[0];
773 		if (slot >= btrfs_header_nritems(l)) {
774 			ret = btrfs_next_leaf(root, path);
775 			if (ret == 0)
776 				continue;
777 			if (ret < 0)
778 				goto out;
779 
780 			break;
781 		}
782 		btrfs_item_key_to_cpu(l, &key, slot);
783 
784 		if (key.objectid < device->devid)
785 			goto next;
786 
787 		if (key.objectid > device->devid)
788 			break;
789 
790 		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
791 			goto next;
792 
793 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
794 		extent_end = key.offset + btrfs_dev_extent_length(l,
795 								  dev_extent);
796 		if (key.offset <= start && extent_end > end) {
797 			*length = end - start + 1;
798 			break;
799 		} else if (key.offset <= start && extent_end > start)
800 			*length += extent_end - start;
801 		else if (key.offset > start && extent_end <= end)
802 			*length += extent_end - key.offset;
803 		else if (key.offset > start && key.offset <= end) {
804 			*length += end - key.offset + 1;
805 			break;
806 		} else if (key.offset > end)
807 			break;
808 
809 next:
810 		path->slots[0]++;
811 	}
812 	ret = 0;
813 out:
814 	btrfs_free_path(path);
815 	return ret;
816 }
817 
818 /*
819  * find_free_dev_extent - find free space in the specified device
820  * @trans:	transaction handle
821  * @device:	the device in which we search for the free space
822  * @num_bytes:	the size of the free space that we need
823  * @start:	stores the start of the free space that we find
824  * @len:	the size of the free space that we find, or the size of the
825  * 		max free space if we don't find suitable free space
826  *
827  * this uses a pretty simple search; the expectation is that it is
828  * called very infrequently and that a given device has a small number
829  * of extents
830  *
831  * @start is used to store the start of the free space if we find one. But
832  * if we don't find suitable free space, it is used to store the start
833  * position of the max free space instead.
834  *
835  * @len is used to store the size of the free space that we find.
836  * But if we don't find suitable free space, it is used to store the size of
837  * the max free space.
838  */
839 int find_free_dev_extent(struct btrfs_trans_handle *trans,
840 			 struct btrfs_device *device, u64 num_bytes,
841 			 u64 *start, u64 *len)
842 {
843 	struct btrfs_key key;
844 	struct btrfs_root *root = device->dev_root;
845 	struct btrfs_dev_extent *dev_extent;
846 	struct btrfs_path *path;
847 	u64 hole_size;
848 	u64 max_hole_start;
849 	u64 max_hole_size;
850 	u64 extent_end;
851 	u64 search_start;
852 	u64 search_end = device->total_bytes;
853 	int ret;
854 	int slot;
855 	struct extent_buffer *l;
856 
857 	/* FIXME use last free of some kind */
858 
859 	/* we don't want to overwrite the superblock on the drive,
860 	 * so we make sure to start at an offset of at least 1MB
861 	 */
862 	search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
863 
864 	max_hole_start = search_start;
865 	max_hole_size = 0;
866 	hole_size = 0;
867 
868 	if (search_start >= search_end) {
869 		ret = -ENOSPC;
870 		goto error;
871 	}
872 
873 	path = btrfs_alloc_path();
874 	if (!path) {
875 		ret = -ENOMEM;
876 		goto error;
877 	}
878 	path->reada = 2;
879 
880 	key.objectid = device->devid;
881 	key.offset = search_start;
882 	key.type = BTRFS_DEV_EXTENT_KEY;
883 
884 	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
885 	if (ret < 0)
886 		goto out;
887 	if (ret > 0) {
888 		ret = btrfs_previous_item(root, path, key.objectid, key.type);
889 		if (ret < 0)
890 			goto out;
891 	}
892 
893 	while (1) {
894 		l = path->nodes[0];
895 		slot = path->slots[0];
896 		if (slot >= btrfs_header_nritems(l)) {
897 			ret = btrfs_next_leaf(root, path);
898 			if (ret == 0)
899 				continue;
900 			if (ret < 0)
901 				goto out;
902 
903 			break;
904 		}
905 		btrfs_item_key_to_cpu(l, &key, slot);
906 
907 		if (key.objectid < device->devid)
908 			goto next;
909 
910 		if (key.objectid > device->devid)
911 			break;
912 
913 		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
914 			goto next;
915 
916 		if (key.offset > search_start) {
917 			hole_size = key.offset - search_start;
918 
919 			if (hole_size > max_hole_size) {
920 				max_hole_start = search_start;
921 				max_hole_size = hole_size;
922 			}
923 
924 			/*
925 			 * If this free space is greater than what we need,
926 			 * it must be the max free space that we have found
927 			 * until now, so max_hole_start must point to the start
928 			 * of this free space and the length of this free space
929 			 * is stored in max_hole_size. Thus, we return
930 			 * max_hole_start and max_hole_size and go back to the
931 			 * caller.
932 			 */
933 			if (hole_size >= num_bytes) {
934 				ret = 0;
935 				goto out;
936 			}
937 		}
938 
939 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
940 		extent_end = key.offset + btrfs_dev_extent_length(l,
941 								  dev_extent);
942 		if (extent_end > search_start)
943 			search_start = extent_end;
944 next:
945 		path->slots[0]++;
946 		cond_resched();
947 	}
948 
949 	/*
950 	 * At this point, search_start should be the end of
951 	 * allocated dev extents, and when shrinking the device,
952 	 * search_end may be smaller than search_start.
953 	 */
954 	if (search_end > search_start)
955 		hole_size = search_end - search_start;
956 
957 	if (hole_size > max_hole_size) {
958 		max_hole_start = search_start;
959 		max_hole_size = hole_size;
960 	}
961 
962 	/* See above. */
963 	if (hole_size < num_bytes)
964 		ret = -ENOSPC;
965 	else
966 		ret = 0;
967 
968 out:
969 	btrfs_free_path(path);
970 error:
971 	*start = max_hole_start;
972 	if (len)
973 		*len = max_hole_size;
974 	return ret;
975 }
976 
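/*
 * Find and delete the dev extent item that covers @start on @device, and
 * subtract its length from device->bytes_used.
 */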
977 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
978 			  struct btrfs_device *device,
979 			  u64 start)
980 {
981 	int ret;
982 	struct btrfs_path *path;
983 	struct btrfs_root *root = device->dev_root;
984 	struct btrfs_key key;
985 	struct btrfs_key found_key;
986 	struct extent_buffer *leaf = NULL;
987 	struct btrfs_dev_extent *extent = NULL;
988 
989 	path = btrfs_alloc_path();
990 	if (!path)
991 		return -ENOMEM;
992 
993 	key.objectid = device->devid;
994 	key.offset = start;
995 	key.type = BTRFS_DEV_EXTENT_KEY;
996 
997 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
998 	if (ret > 0) {
999 		ret = btrfs_previous_item(root, path, key.objectid,
1000 					  BTRFS_DEV_EXTENT_KEY);
1001 		if (ret)
1002 			goto out;
1003 		leaf = path->nodes[0];
1004 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1005 		extent = btrfs_item_ptr(leaf, path->slots[0],
1006 					struct btrfs_dev_extent);
1007 		BUG_ON(found_key.offset > start || found_key.offset +
1008 		       btrfs_dev_extent_length(leaf, extent) < start);
1009 	} else if (ret == 0) {
1010 		leaf = path->nodes[0];
1011 		extent = btrfs_item_ptr(leaf, path->slots[0],
1012 					struct btrfs_dev_extent);
1013 	}
1014 	BUG_ON(ret);
1015 
1016 	if (device->bytes_used > 0)
1017 		device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
1018 	ret = btrfs_del_item(trans, root, path);
1019 
1020 out:
1021 	btrfs_free_path(path);
1022 	return ret;
1023 }
1024 
1025 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1026 			   struct btrfs_device *device,
1027 			   u64 chunk_tree, u64 chunk_objectid,
1028 			   u64 chunk_offset, u64 start, u64 num_bytes)
1029 {
1030 	int ret;
1031 	struct btrfs_path *path;
1032 	struct btrfs_root *root = device->dev_root;
1033 	struct btrfs_dev_extent *extent;
1034 	struct extent_buffer *leaf;
1035 	struct btrfs_key key;
1036 
1037 	WARN_ON(!device->in_fs_metadata);
1038 	path = btrfs_alloc_path();
1039 	if (!path)
1040 		return -ENOMEM;
1041 
1042 	key.objectid = device->devid;
1043 	key.offset = start;
1044 	key.type = BTRFS_DEV_EXTENT_KEY;
1045 	ret = btrfs_insert_empty_item(trans, root, path, &key,
1046 				      sizeof(*extent));
1047 	BUG_ON(ret);
1048 
1049 	leaf = path->nodes[0];
1050 	extent = btrfs_item_ptr(leaf, path->slots[0],
1051 				struct btrfs_dev_extent);
1052 	btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
1053 	btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
1054 	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1055 
1056 	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
1057 		    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
1058 		    BTRFS_UUID_SIZE);
1059 
1060 	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1061 	btrfs_mark_buffer_dirty(leaf);
1062 	btrfs_free_path(path);
1063 	return ret;
1064 }
1065 
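/*
 * Return, in @offset, the logical address just past the last chunk owned by
 * @objectid, i.e. where the next chunk may be placed.
 */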
1066 static noinline int find_next_chunk(struct btrfs_root *root,
1067 				    u64 objectid, u64 *offset)
1068 {
1069 	struct btrfs_path *path;
1070 	int ret;
1071 	struct btrfs_key key;
1072 	struct btrfs_chunk *chunk;
1073 	struct btrfs_key found_key;
1074 
1075 	path = btrfs_alloc_path();
1076 	if (!path)
1077 		return -ENOMEM;
1078 
1079 	key.objectid = objectid;
1080 	key.offset = (u64)-1;
1081 	key.type = BTRFS_CHUNK_ITEM_KEY;
1082 
1083 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1084 	if (ret < 0)
1085 		goto error;
1086 
1087 	BUG_ON(ret == 0);
1088 
1089 	ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
1090 	if (ret) {
1091 		*offset = 0;
1092 	} else {
1093 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1094 				      path->slots[0]);
1095 		if (found_key.objectid != objectid)
1096 			*offset = 0;
1097 		else {
1098 			chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
1099 					       struct btrfs_chunk);
1100 			*offset = found_key.offset +
1101 				btrfs_chunk_length(path->nodes[0], chunk);
1102 		}
1103 	}
1104 	ret = 0;
1105 error:
1106 	btrfs_free_path(path);
1107 	return ret;
1108 }
1109 
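/*
 * Return the next unused device id: one past the highest DEV_ITEM offset in
 * the chunk tree, or 1 if there are no device items yet.
 */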
1110 static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
1111 {
1112 	int ret;
1113 	struct btrfs_key key;
1114 	struct btrfs_key found_key;
1115 	struct btrfs_path *path;
1116 
1117 	root = root->fs_info->chunk_root;
1118 
1119 	path = btrfs_alloc_path();
1120 	if (!path)
1121 		return -ENOMEM;
1122 
1123 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1124 	key.type = BTRFS_DEV_ITEM_KEY;
1125 	key.offset = (u64)-1;
1126 
1127 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1128 	if (ret < 0)
1129 		goto error;
1130 
1131 	BUG_ON(ret == 0);
1132 
1133 	ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
1134 				  BTRFS_DEV_ITEM_KEY);
1135 	if (ret) {
1136 		*objectid = 1;
1137 	} else {
1138 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1139 				      path->slots[0]);
1140 		*objectid = found_key.offset + 1;
1141 	}
1142 	ret = 0;
1143 error:
1144 	btrfs_free_path(path);
1145 	return ret;
1146 }
1147 
1148 /*
1149  * the device information is stored in the chunk root
1150  * the btrfs_device struct should be fully filled in
1151  */
1152 int btrfs_add_device(struct btrfs_trans_handle *trans,
1153 		     struct btrfs_root *root,
1154 		     struct btrfs_device *device)
1155 {
1156 	int ret;
1157 	struct btrfs_path *path;
1158 	struct btrfs_dev_item *dev_item;
1159 	struct extent_buffer *leaf;
1160 	struct btrfs_key key;
1161 	unsigned long ptr;
1162 
1163 	root = root->fs_info->chunk_root;
1164 
1165 	path = btrfs_alloc_path();
1166 	if (!path)
1167 		return -ENOMEM;
1168 
1169 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1170 	key.type = BTRFS_DEV_ITEM_KEY;
1171 	key.offset = device->devid;
1172 
1173 	ret = btrfs_insert_empty_item(trans, root, path, &key,
1174 				      sizeof(*dev_item));
1175 	if (ret)
1176 		goto out;
1177 
1178 	leaf = path->nodes[0];
1179 	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1180 
1181 	btrfs_set_device_id(leaf, dev_item, device->devid);
1182 	btrfs_set_device_generation(leaf, dev_item, 0);
1183 	btrfs_set_device_type(leaf, dev_item, device->type);
1184 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1185 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1186 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1187 	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
1188 	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1189 	btrfs_set_device_group(leaf, dev_item, 0);
1190 	btrfs_set_device_seek_speed(leaf, dev_item, 0);
1191 	btrfs_set_device_bandwidth(leaf, dev_item, 0);
1192 	btrfs_set_device_start_offset(leaf, dev_item, 0);
1193 
1194 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
1195 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1196 	ptr = (unsigned long)btrfs_device_fsid(dev_item);
1197 	write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
1198 	btrfs_mark_buffer_dirty(leaf);
1199 
1200 	ret = 0;
1201 out:
1202 	btrfs_free_path(path);
1203 	return ret;
1204 }
1205 
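/*
 * Delete the DEV_ITEM for @device from the chunk tree, inside its own
 * transaction.
 */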
1206 static int btrfs_rm_dev_item(struct btrfs_root *root,
1207 			     struct btrfs_device *device)
1208 {
1209 	int ret;
1210 	struct btrfs_path *path;
1211 	struct btrfs_key key;
1212 	struct btrfs_trans_handle *trans;
1213 
1214 	root = root->fs_info->chunk_root;
1215 
1216 	path = btrfs_alloc_path();
1217 	if (!path)
1218 		return -ENOMEM;
1219 
1220 	trans = btrfs_start_transaction(root, 0);
1221 	if (IS_ERR(trans)) {
1222 		btrfs_free_path(path);
1223 		return PTR_ERR(trans);
1224 	}
1225 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1226 	key.type = BTRFS_DEV_ITEM_KEY;
1227 	key.offset = device->devid;
1228 	lock_chunks(root);
1229 
1230 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1231 	if (ret < 0)
1232 		goto out;
1233 
1234 	if (ret > 0) {
1235 		ret = -ENOENT;
1236 		goto out;
1237 	}
1238 
1239 	ret = btrfs_del_item(trans, root, path);
1240 	if (ret)
1241 		goto out;
1242 out:
1243 	btrfs_free_path(path);
1244 	unlock_chunks(root);
1245 	btrfs_commit_transaction(trans, root);
1246 	return ret;
1247 }
1248 
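/*
 * Remove a device from a mounted filesystem.  @device_path is either a block
 * device path or the literal string "missing" for a device that exists in
 * the metadata but has no backing bdev.  The device is shrunk to zero, its
 * DEV_ITEM is deleted and, for a writeable device, its on-disk super block
 * magic is wiped so it is no longer scanned as part of this filesystem.
 */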
1249 int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1250 {
1251 	struct btrfs_device *device;
1252 	struct btrfs_device *next_device;
1253 	struct block_device *bdev;
1254 	struct buffer_head *bh = NULL;
1255 	struct btrfs_super_block *disk_super;
1256 	struct btrfs_fs_devices *cur_devices;
1257 	u64 all_avail;
1258 	u64 devid;
1259 	u64 num_devices;
1260 	u8 *dev_uuid;
1261 	int ret = 0;
1262 	bool clear_super = false;
1263 
1264 	mutex_lock(&uuid_mutex);
1265 	mutex_lock(&root->fs_info->volume_mutex);
1266 
1267 	all_avail = root->fs_info->avail_data_alloc_bits |
1268 		root->fs_info->avail_system_alloc_bits |
1269 		root->fs_info->avail_metadata_alloc_bits;
1270 
1271 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
1272 	    root->fs_info->fs_devices->num_devices <= 4) {
1273 		printk(KERN_ERR "btrfs: unable to go below four devices "
1274 		       "on raid10\n");
1275 		ret = -EINVAL;
1276 		goto out;
1277 	}
1278 
1279 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
1280 	    root->fs_info->fs_devices->num_devices <= 2) {
1281 		printk(KERN_ERR "btrfs: unable to go below two "
1282 		       "devices on raid1\n");
1283 		ret = -EINVAL;
1284 		goto out;
1285 	}
1286 
1287 	if (strcmp(device_path, "missing") == 0) {
1288 		struct list_head *devices;
1289 		struct btrfs_device *tmp;
1290 
1291 		device = NULL;
1292 		devices = &root->fs_info->fs_devices->devices;
1293 		/*
1294 		 * It is safe to read the devices since the volume_mutex
1295 		 * is held.
1296 		 */
1297 		list_for_each_entry(tmp, devices, dev_list) {
1298 			if (tmp->in_fs_metadata && !tmp->bdev) {
1299 				device = tmp;
1300 				break;
1301 			}
1302 		}
1303 		bdev = NULL;
1304 		bh = NULL;
1305 		disk_super = NULL;
1306 		if (!device) {
1307 			printk(KERN_ERR "btrfs: no missing devices found to "
1308 			       "remove\n");
1309 			goto out;
1310 		}
1311 	} else {
1312 		bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
1313 					  root->fs_info->bdev_holder);
1314 		if (IS_ERR(bdev)) {
1315 			ret = PTR_ERR(bdev);
1316 			goto out;
1317 		}
1318 
1319 		set_blocksize(bdev, 4096);
1320 		bh = btrfs_read_dev_super(bdev);
1321 		if (!bh) {
1322 			ret = -EINVAL;
1323 			goto error_close;
1324 		}
1325 		disk_super = (struct btrfs_super_block *)bh->b_data;
1326 		devid = btrfs_stack_device_id(&disk_super->dev_item);
1327 		dev_uuid = disk_super->dev_item.uuid;
1328 		device = btrfs_find_device(root, devid, dev_uuid,
1329 					   disk_super->fsid);
1330 		if (!device) {
1331 			ret = -ENOENT;
1332 			goto error_brelse;
1333 		}
1334 	}
1335 
1336 	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1337 		printk(KERN_ERR "btrfs: unable to remove the only writeable "
1338 		       "device\n");
1339 		ret = -EINVAL;
1340 		goto error_brelse;
1341 	}
1342 
1343 	if (device->writeable) {
1344 		lock_chunks(root);
1345 		list_del_init(&device->dev_alloc_list);
1346 		unlock_chunks(root);
1347 		root->fs_info->fs_devices->rw_devices--;
1348 		clear_super = true;
1349 	}
1350 
1351 	ret = btrfs_shrink_device(device, 0);
1352 	if (ret)
1353 		goto error_undo;
1354 
1355 	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1356 	if (ret)
1357 		goto error_undo;
1358 
1359 	device->in_fs_metadata = 0;
1360 	btrfs_scrub_cancel_dev(root, device);
1361 
1362 	/*
1363 	 * the device list mutex makes sure that we don't change
1364 	 * the device list while someone else is writing out all
1365 	 * the device supers.
1366 	 */
1367 
1368 	cur_devices = device->fs_devices;
1369 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1370 	list_del_rcu(&device->dev_list);
1371 
1372 	device->fs_devices->num_devices--;
1373 
1374 	if (device->missing)
1375 		root->fs_info->fs_devices->missing_devices--;
1376 
1377 	next_device = list_entry(root->fs_info->fs_devices->devices.next,
1378 				 struct btrfs_device, dev_list);
1379 	if (device->bdev == root->fs_info->sb->s_bdev)
1380 		root->fs_info->sb->s_bdev = next_device->bdev;
1381 	if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1382 		root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1383 
1384 	if (device->bdev)
1385 		device->fs_devices->open_devices--;
1386 
1387 	call_rcu(&device->rcu, free_device);
1388 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1389 
1390 	num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
1391 	btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
1392 
1393 	if (cur_devices->open_devices == 0) {
1394 		struct btrfs_fs_devices *fs_devices;
1395 		fs_devices = root->fs_info->fs_devices;
1396 		while (fs_devices) {
1397 			if (fs_devices->seed == cur_devices)
1398 				break;
1399 			fs_devices = fs_devices->seed;
1400 		}
1401 		fs_devices->seed = cur_devices->seed;
1402 		cur_devices->seed = NULL;
1403 		lock_chunks(root);
1404 		__btrfs_close_devices(cur_devices);
1405 		unlock_chunks(root);
1406 		free_fs_devices(cur_devices);
1407 	}
1408 
1409 	/*
1410 	 * at this point, the device is zero sized.  We want to
1411 	 * remove it from the devices list and zero out the old super
1412 	 */
1413 	if (clear_super) {
1414 		/* make sure this device isn't detected as part of
1415 		 * the FS anymore
1416 		 */
1417 		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1418 		set_buffer_dirty(bh);
1419 		sync_dirty_buffer(bh);
1420 	}
1421 
1422 	ret = 0;
1423 
1424 error_brelse:
1425 	brelse(bh);
1426 error_close:
1427 	if (bdev)
1428 		blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1429 out:
1430 	mutex_unlock(&root->fs_info->volume_mutex);
1431 	mutex_unlock(&uuid_mutex);
1432 	return ret;
1433 error_undo:
1434 	if (device->writeable) {
1435 		lock_chunks(root);
1436 		list_add(&device->dev_alloc_list,
1437 			 &root->fs_info->fs_devices->alloc_list);
1438 		unlock_chunks(root);
1439 		root->fs_info->fs_devices->rw_devices++;
1440 	}
1441 	goto error_brelse;
1442 }
1443 
1444 /*
1445  * does all the dirty work required for changing the file system's UUID.
1446  */
1447 static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1448 				struct btrfs_root *root)
1449 {
1450 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1451 	struct btrfs_fs_devices *old_devices;
1452 	struct btrfs_fs_devices *seed_devices;
1453 	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
1454 	struct btrfs_device *device;
1455 	u64 super_flags;
1456 
1457 	BUG_ON(!mutex_is_locked(&uuid_mutex));
1458 	if (!fs_devices->seeding)
1459 		return -EINVAL;
1460 
1461 	seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
1462 	if (!seed_devices)
1463 		return -ENOMEM;
1464 
1465 	old_devices = clone_fs_devices(fs_devices);
1466 	if (IS_ERR(old_devices)) {
1467 		kfree(seed_devices);
1468 		return PTR_ERR(old_devices);
1469 	}
1470 
1471 	list_add(&old_devices->list, &fs_uuids);
1472 
1473 	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1474 	seed_devices->opened = 1;
1475 	INIT_LIST_HEAD(&seed_devices->devices);
1476 	INIT_LIST_HEAD(&seed_devices->alloc_list);
1477 	mutex_init(&seed_devices->device_list_mutex);
1478 
1479 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1480 	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
1481 			      synchronize_rcu);
1482 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1483 
1484 	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1485 	list_for_each_entry(device, &seed_devices->devices, dev_list) {
1486 		device->fs_devices = seed_devices;
1487 	}
1488 
1489 	fs_devices->seeding = 0;
1490 	fs_devices->num_devices = 0;
1491 	fs_devices->open_devices = 0;
1492 	fs_devices->seed = seed_devices;
1493 
1494 	generate_random_uuid(fs_devices->fsid);
1495 	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1496 	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1497 	super_flags = btrfs_super_flags(disk_super) &
1498 		      ~BTRFS_SUPER_FLAG_SEEDING;
1499 	btrfs_set_super_flags(disk_super, super_flags);
1500 
1501 	return 0;
1502 }
1503 
1504 /*
1505  * store the expected generation for seed devices in device items.
1506  */
1507 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1508 			       struct btrfs_root *root)
1509 {
1510 	struct btrfs_path *path;
1511 	struct extent_buffer *leaf;
1512 	struct btrfs_dev_item *dev_item;
1513 	struct btrfs_device *device;
1514 	struct btrfs_key key;
1515 	u8 fs_uuid[BTRFS_UUID_SIZE];
1516 	u8 dev_uuid[BTRFS_UUID_SIZE];
1517 	u64 devid;
1518 	int ret;
1519 
1520 	path = btrfs_alloc_path();
1521 	if (!path)
1522 		return -ENOMEM;
1523 
1524 	root = root->fs_info->chunk_root;
1525 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1526 	key.offset = 0;
1527 	key.type = BTRFS_DEV_ITEM_KEY;
1528 
1529 	while (1) {
1530 		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1531 		if (ret < 0)
1532 			goto error;
1533 
1534 		leaf = path->nodes[0];
1535 next_slot:
1536 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1537 			ret = btrfs_next_leaf(root, path);
1538 			if (ret > 0)
1539 				break;
1540 			if (ret < 0)
1541 				goto error;
1542 			leaf = path->nodes[0];
1543 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1544 			btrfs_release_path(path);
1545 			continue;
1546 		}
1547 
1548 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1549 		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1550 		    key.type != BTRFS_DEV_ITEM_KEY)
1551 			break;
1552 
1553 		dev_item = btrfs_item_ptr(leaf, path->slots[0],
1554 					  struct btrfs_dev_item);
1555 		devid = btrfs_device_id(leaf, dev_item);
1556 		read_extent_buffer(leaf, dev_uuid,
1557 				   (unsigned long)btrfs_device_uuid(dev_item),
1558 				   BTRFS_UUID_SIZE);
1559 		read_extent_buffer(leaf, fs_uuid,
1560 				   (unsigned long)btrfs_device_fsid(dev_item),
1561 				   BTRFS_UUID_SIZE);
1562 		device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1563 		BUG_ON(!device);
1564 
1565 		if (device->fs_devices->seeding) {
1566 			btrfs_set_device_generation(leaf, dev_item,
1567 						    device->generation);
1568 			btrfs_mark_buffer_dirty(leaf);
1569 		}
1570 
1571 		path->slots[0]++;
1572 		goto next_slot;
1573 	}
1574 	ret = 0;
1575 error:
1576 	btrfs_free_path(path);
1577 	return ret;
1578 }
1579 
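/*
 * Add the device at @device_path to a mounted filesystem.  When the
 * filesystem is a read-only seed, this also sprouts a new writable
 * filesystem (with a fresh fsid) on top of it.
 */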
1580 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1581 {
1582 	struct request_queue *q;
1583 	struct btrfs_trans_handle *trans;
1584 	struct btrfs_device *device;
1585 	struct block_device *bdev;
1586 	struct list_head *devices;
1587 	struct super_block *sb = root->fs_info->sb;
1588 	u64 total_bytes;
1589 	int seeding_dev = 0;
1590 	int ret = 0;
1591 
1592 	if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1593 		return -EINVAL;
1594 
1595 	bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
1596 				  root->fs_info->bdev_holder);
1597 	if (IS_ERR(bdev))
1598 		return PTR_ERR(bdev);
1599 
1600 	if (root->fs_info->fs_devices->seeding) {
1601 		seeding_dev = 1;
1602 		down_write(&sb->s_umount);
1603 		mutex_lock(&uuid_mutex);
1604 	}
1605 
1606 	filemap_write_and_wait(bdev->bd_inode->i_mapping);
1607 	mutex_lock(&root->fs_info->volume_mutex);
1608 
1609 	devices = &root->fs_info->fs_devices->devices;
1610 	/*
1611 	 * we have the volume lock, so we don't need the extra
1612 	 * device list mutex while reading the list here.
1613 	 */
1614 	list_for_each_entry(device, devices, dev_list) {
1615 		if (device->bdev == bdev) {
1616 			ret = -EEXIST;
1617 			goto error;
1618 		}
1619 	}
1620 
1621 	device = kzalloc(sizeof(*device), GFP_NOFS);
1622 	if (!device) {
1623 		/* we can safely leave the fs_devices entry around */
1624 		ret = -ENOMEM;
1625 		goto error;
1626 	}
1627 
1628 	device->name = kstrdup(device_path, GFP_NOFS);
1629 	if (!device->name) {
1630 		kfree(device);
1631 		ret = -ENOMEM;
1632 		goto error;
1633 	}
1634 
1635 	ret = find_next_devid(root, &device->devid);
1636 	if (ret) {
1637 		kfree(device->name);
1638 		kfree(device);
1639 		goto error;
1640 	}
1641 
1642 	trans = btrfs_start_transaction(root, 0);
1643 	if (IS_ERR(trans)) {
1644 		kfree(device->name);
1645 		kfree(device);
1646 		ret = PTR_ERR(trans);
1647 		goto error;
1648 	}
1649 
1650 	lock_chunks(root);
1651 
1652 	q = bdev_get_queue(bdev);
1653 	if (blk_queue_discard(q))
1654 		device->can_discard = 1;
1655 	device->writeable = 1;
1656 	device->work.func = pending_bios_fn;
1657 	generate_random_uuid(device->uuid);
1658 	spin_lock_init(&device->io_lock);
1659 	device->generation = trans->transid;
1660 	device->io_width = root->sectorsize;
1661 	device->io_align = root->sectorsize;
1662 	device->sector_size = root->sectorsize;
1663 	device->total_bytes = i_size_read(bdev->bd_inode);
1664 	device->disk_total_bytes = device->total_bytes;
1665 	device->dev_root = root->fs_info->dev_root;
1666 	device->bdev = bdev;
1667 	device->in_fs_metadata = 1;
1668 	device->mode = FMODE_EXCL;
1669 	set_blocksize(device->bdev, 4096);
1670 
1671 	if (seeding_dev) {
1672 		sb->s_flags &= ~MS_RDONLY;
1673 		ret = btrfs_prepare_sprout(trans, root);
1674 		BUG_ON(ret);
1675 	}
1676 
1677 	device->fs_devices = root->fs_info->fs_devices;
1678 
1679 	/*
1680 	 * we don't want write_supers to jump in here with our device
1681 	 * half set up
1682 	 */
1683 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1684 	list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
1685 	list_add(&device->dev_alloc_list,
1686 		 &root->fs_info->fs_devices->alloc_list);
1687 	root->fs_info->fs_devices->num_devices++;
1688 	root->fs_info->fs_devices->open_devices++;
1689 	root->fs_info->fs_devices->rw_devices++;
1690 	if (device->can_discard)
1691 		root->fs_info->fs_devices->num_can_discard++;
1692 	root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1693 
1694 	if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1695 		root->fs_info->fs_devices->rotating = 1;
1696 
1697 	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
1698 	btrfs_set_super_total_bytes(&root->fs_info->super_copy,
1699 				    total_bytes + device->total_bytes);
1700 
1701 	total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
1702 	btrfs_set_super_num_devices(&root->fs_info->super_copy,
1703 				    total_bytes + 1);
1704 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1705 
1706 	if (seeding_dev) {
1707 		ret = init_first_rw_device(trans, root, device);
1708 		BUG_ON(ret);
1709 		ret = btrfs_finish_sprout(trans, root);
1710 		BUG_ON(ret);
1711 	} else {
1712 		ret = btrfs_add_device(trans, root, device);
1713 	}
1714 
1715 	/*
1716 	 * we've got more storage, so clear any full flags on the space
1717 	 * infos
1718 	 */
1719 	btrfs_clear_space_info_full(root->fs_info);
1720 
1721 	unlock_chunks(root);
1722 	btrfs_commit_transaction(trans, root);
1723 
1724 	if (seeding_dev) {
1725 		mutex_unlock(&uuid_mutex);
1726 		up_write(&sb->s_umount);
1727 
1728 		ret = btrfs_relocate_sys_chunks(root);
1729 		BUG_ON(ret);
1730 	}
1731 out:
1732 	mutex_unlock(&root->fs_info->volume_mutex);
1733 	return ret;
1734 error:
1735 	blkdev_put(bdev, FMODE_EXCL);
1736 	if (seeding_dev) {
1737 		mutex_unlock(&uuid_mutex);
1738 		up_write(&sb->s_umount);
1739 	}
1740 	goto out;
1741 }
1742 
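/*
 * Write the current in-memory fields of @device back into its DEV_ITEM in
 * the chunk tree.
 */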
1743 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1744 					struct btrfs_device *device)
1745 {
1746 	int ret;
1747 	struct btrfs_path *path;
1748 	struct btrfs_root *root;
1749 	struct btrfs_dev_item *dev_item;
1750 	struct extent_buffer *leaf;
1751 	struct btrfs_key key;
1752 
1753 	root = device->dev_root->fs_info->chunk_root;
1754 
1755 	path = btrfs_alloc_path();
1756 	if (!path)
1757 		return -ENOMEM;
1758 
1759 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1760 	key.type = BTRFS_DEV_ITEM_KEY;
1761 	key.offset = device->devid;
1762 
1763 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1764 	if (ret < 0)
1765 		goto out;
1766 
1767 	if (ret > 0) {
1768 		ret = -ENOENT;
1769 		goto out;
1770 	}
1771 
1772 	leaf = path->nodes[0];
1773 	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1774 
1775 	btrfs_set_device_id(leaf, dev_item, device->devid);
1776 	btrfs_set_device_type(leaf, dev_item, device->type);
1777 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1778 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1779 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1780 	btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
1781 	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1782 	btrfs_mark_buffer_dirty(leaf);
1783 
1784 out:
1785 	btrfs_free_path(path);
1786 	return ret;
1787 }
1788 
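/*
 * Grow @device to @new_size: bump the totals in the super block copy and in
 * fs_devices, then write the new size into the device's DEV_ITEM.  The chunk
 * mutex must be held; btrfs_grow_device() is the locked wrapper.
 */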
1789 static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1790 		      struct btrfs_device *device, u64 new_size)
1791 {
1792 	struct btrfs_super_block *super_copy =
1793 		&device->dev_root->fs_info->super_copy;
1794 	u64 old_total = btrfs_super_total_bytes(super_copy);
1795 	u64 diff = new_size - device->total_bytes;
1796 
1797 	if (!device->writeable)
1798 		return -EACCES;
1799 	if (new_size <= device->total_bytes)
1800 		return -EINVAL;
1801 
1802 	btrfs_set_super_total_bytes(super_copy, old_total + diff);
1803 	device->fs_devices->total_rw_bytes += diff;
1804 
1805 	device->total_bytes = new_size;
1806 	device->disk_total_bytes = new_size;
1807 	btrfs_clear_space_info_full(device->dev_root->fs_info);
1808 
1809 	return btrfs_update_device(trans, device);
1810 }
1811 
1812 int btrfs_grow_device(struct btrfs_trans_handle *trans,
1813 		      struct btrfs_device *device, u64 new_size)
1814 {
1815 	int ret;
1816 	lock_chunks(device->dev_root);
1817 	ret = __btrfs_grow_device(trans, device, new_size);
1818 	unlock_chunks(device->dev_root);
1819 	return ret;
1820 }
1821 
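/*
 * Delete the CHUNK_ITEM for @chunk_offset from the chunk tree.
 */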
1822 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1823 			    struct btrfs_root *root,
1824 			    u64 chunk_tree, u64 chunk_objectid,
1825 			    u64 chunk_offset)
1826 {
1827 	int ret;
1828 	struct btrfs_path *path;
1829 	struct btrfs_key key;
1830 
1831 	root = root->fs_info->chunk_root;
1832 	path = btrfs_alloc_path();
1833 	if (!path)
1834 		return -ENOMEM;
1835 
1836 	key.objectid = chunk_objectid;
1837 	key.offset = chunk_offset;
1838 	key.type = BTRFS_CHUNK_ITEM_KEY;
1839 
1840 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1841 	BUG_ON(ret);
1842 
1843 	ret = btrfs_del_item(trans, root, path);
1844 
1845 	btrfs_free_path(path);
1846 	return ret;
1847 }
1848 
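/*
 * Remove the matching chunk entry from the sys_chunk_array embedded in the
 * in-memory copy of the super block.
 */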
1849 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1850 			chunk_offset)
1851 {
1852 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1853 	struct btrfs_disk_key *disk_key;
1854 	struct btrfs_chunk *chunk;
1855 	u8 *ptr;
1856 	int ret = 0;
1857 	u32 num_stripes;
1858 	u32 array_size;
1859 	u32 len = 0;
1860 	u32 cur;
1861 	struct btrfs_key key;
1862 
1863 	array_size = btrfs_super_sys_array_size(super_copy);
1864 
1865 	ptr = super_copy->sys_chunk_array;
1866 	cur = 0;
1867 
1868 	while (cur < array_size) {
1869 		disk_key = (struct btrfs_disk_key *)ptr;
1870 		btrfs_disk_key_to_cpu(&key, disk_key);
1871 
1872 		len = sizeof(*disk_key);
1873 
1874 		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1875 			chunk = (struct btrfs_chunk *)(ptr + len);
1876 			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1877 			len += btrfs_chunk_item_size(num_stripes);
1878 		} else {
1879 			ret = -EIO;
1880 			break;
1881 		}
1882 		if (key.objectid == chunk_objectid &&
1883 		    key.offset == chunk_offset) {
1884 			memmove(ptr, ptr + len, array_size - (cur + len));
1885 			array_size -= len;
1886 			btrfs_set_super_sys_array_size(super_copy, array_size);
1887 		} else {
1888 			ptr += len;
1889 			cur += len;
1890 		}
1891 	}
1892 	return ret;
1893 }
1894 
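/*
 * Relocate everything out of a chunk and delete it: move the extents it
 * holds, then remove its device extents, its chunk tree item, its
 * sys_chunk_array entry (for SYSTEM chunks), its block group and its extent
 * mapping.
 */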
1895 static int btrfs_relocate_chunk(struct btrfs_root *root,
1896 			 u64 chunk_tree, u64 chunk_objectid,
1897 			 u64 chunk_offset)
1898 {
1899 	struct extent_map_tree *em_tree;
1900 	struct btrfs_root *extent_root;
1901 	struct btrfs_trans_handle *trans;
1902 	struct extent_map *em;
1903 	struct map_lookup *map;
1904 	int ret;
1905 	int i;
1906 
1907 	root = root->fs_info->chunk_root;
1908 	extent_root = root->fs_info->extent_root;
1909 	em_tree = &root->fs_info->mapping_tree.map_tree;
1910 
1911 	ret = btrfs_can_relocate(extent_root, chunk_offset);
1912 	if (ret)
1913 		return -ENOSPC;
1914 
1915 	/* step one, relocate all the extents inside this chunk */
1916 	ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1917 	if (ret)
1918 		return ret;
1919 
1920 	trans = btrfs_start_transaction(root, 0);
1921 	BUG_ON(IS_ERR(trans));
1922 
1923 	lock_chunks(root);
1924 
1925 	/*
1926 	 * step two, delete the device extents and the
1927 	 * chunk tree entries
1928 	 */
1929 	read_lock(&em_tree->lock);
1930 	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1931 	read_unlock(&em_tree->lock);
1932 
1933 	BUG_ON(em->start > chunk_offset ||
1934 	       em->start + em->len < chunk_offset);
1935 	map = (struct map_lookup *)em->bdev;
1936 
1937 	for (i = 0; i < map->num_stripes; i++) {
1938 		ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
1939 					    map->stripes[i].physical);
1940 		BUG_ON(ret);
1941 
1942 		if (map->stripes[i].dev) {
1943 			ret = btrfs_update_device(trans, map->stripes[i].dev);
1944 			BUG_ON(ret);
1945 		}
1946 	}
1947 	ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
1948 			       chunk_offset);
1949 
1950 	BUG_ON(ret);
1951 
1952 	trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
1953 
1954 	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1955 		ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1956 		BUG_ON(ret);
1957 	}
1958 
1959 	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1960 	BUG_ON(ret);
1961 
1962 	write_lock(&em_tree->lock);
1963 	remove_extent_mapping(em_tree, em);
1964 	write_unlock(&em_tree->lock);
1965 
1966 	kfree(map);
1967 	em->bdev = NULL;
1968 
1969 	/* once for the tree */
1970 	free_extent_map(em);
1971 	/* once for us */
1972 	free_extent_map(em);
1973 
1974 	unlock_chunks(root);
1975 	btrfs_end_transaction(trans, root);
1976 	return 0;
1977 }
1978 
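/*
 * Walk the chunk tree from the highest offset downwards and relocate every
 * SYSTEM chunk.  Chunks that fail with -ENOSPC are retried once; if any
 * still fail after the retry, -ENOSPC is returned.
 */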
1979 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
1980 {
1981 	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
1982 	struct btrfs_path *path;
1983 	struct extent_buffer *leaf;
1984 	struct btrfs_chunk *chunk;
1985 	struct btrfs_key key;
1986 	struct btrfs_key found_key;
1987 	u64 chunk_tree = chunk_root->root_key.objectid;
1988 	u64 chunk_type;
1989 	bool retried = false;
1990 	int failed = 0;
1991 	int ret;
1992 
1993 	path = btrfs_alloc_path();
1994 	if (!path)
1995 		return -ENOMEM;
1996 
1997 again:
1998 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1999 	key.offset = (u64)-1;
2000 	key.type = BTRFS_CHUNK_ITEM_KEY;
2001 
2002 	while (1) {
2003 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2004 		if (ret < 0)
2005 			goto error;
2006 		BUG_ON(ret == 0);
2007 
2008 		ret = btrfs_previous_item(chunk_root, path, key.objectid,
2009 					  key.type);
2010 		if (ret < 0)
2011 			goto error;
2012 		if (ret > 0)
2013 			break;
2014 
2015 		leaf = path->nodes[0];
2016 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2017 
2018 		chunk = btrfs_item_ptr(leaf, path->slots[0],
2019 				       struct btrfs_chunk);
2020 		chunk_type = btrfs_chunk_type(leaf, chunk);
2021 		btrfs_release_path(path);
2022 
2023 		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
2024 			ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
2025 						   found_key.objectid,
2026 						   found_key.offset);
2027 			if (ret == -ENOSPC)
2028 				failed++;
2029 			else if (ret)
2030 				BUG();
2031 		}
2032 
2033 		if (found_key.offset == 0)
2034 			break;
2035 		key.offset = found_key.offset - 1;
2036 	}
2037 	ret = 0;
2038 	if (failed && !retried) {
2039 		failed = 0;
2040 		retried = true;
2041 		goto again;
2042 	} else if (failed && retried) {
2043 		WARN_ON(1);
2044 		ret = -ENOSPC;
2045 	}
2046 error:
2047 	btrfs_free_path(path);
2048 	return ret;
2049 }
2050 
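/* scale num by factor, where factor is expressed in tenths */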
2051 static u64 div_factor(u64 num, int factor)
2052 {
2053 	if (factor == 10)
2054 		return num;
2055 	num *= factor;
2056 	do_div(num, 10);
2057 	return num;
2058 }
2059 
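/*
 * Balance runs in two steps: first shrink each writeable device a little
 * and grow it back, so every device ends up with some unallocated space;
 * then walk the chunk tree backwards and relocate every chunk (except
 * chunk zero), rewriting it through the normal allocator.
 */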
2060 int btrfs_balance(struct btrfs_root *dev_root)
2061 {
2062 	int ret;
2063 	struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
2064 	struct btrfs_device *device;
2065 	u64 old_size;
2066 	u64 size_to_free;
2067 	struct btrfs_path *path;
2068 	struct btrfs_key key;
2069 	struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
2070 	struct btrfs_trans_handle *trans;
2071 	struct btrfs_key found_key;
2072 
2073 	if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
2074 		return -EROFS;
2075 
2076 	if (!capable(CAP_SYS_ADMIN))
2077 		return -EPERM;
2078 
2079 	mutex_lock(&dev_root->fs_info->volume_mutex);
2080 	dev_root = dev_root->fs_info->dev_root;
2081 
2082 	/* step one make some room on all the devices */
2083 	list_for_each_entry(device, devices, dev_list) {
2084 		old_size = device->total_bytes;
2085 		size_to_free = div_factor(old_size, 1);
2086 		size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2087 		if (!device->writeable ||
2088 		    device->total_bytes - device->bytes_used > size_to_free)
2089 			continue;
2090 
2091 		ret = btrfs_shrink_device(device, old_size - size_to_free);
2092 		if (ret == -ENOSPC)
2093 			break;
2094 		BUG_ON(ret);
2095 
2096 		trans = btrfs_start_transaction(dev_root, 0);
2097 		BUG_ON(IS_ERR(trans));
2098 
2099 		ret = btrfs_grow_device(trans, device, old_size);
2100 		BUG_ON(ret);
2101 
2102 		btrfs_end_transaction(trans, dev_root);
2103 	}
2104 
2105 	/* step two, relocate all the chunks */
2106 	path = btrfs_alloc_path();
2107 	if (!path) {
2108 		ret = -ENOMEM;
2109 		goto error;
2110 	}
2111 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2112 	key.offset = (u64)-1;
2113 	key.type = BTRFS_CHUNK_ITEM_KEY;
2114 
2115 	while (1) {
2116 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2117 		if (ret < 0)
2118 			goto error;
2119 
2120 		/*
2121 		 * this shouldn't happen, it means the last relocate
2122 		 * failed
2123 		 */
2124 		if (ret == 0)
2125 			break;
2126 
2127 		ret = btrfs_previous_item(chunk_root, path, 0,
2128 					  BTRFS_CHUNK_ITEM_KEY);
2129 		if (ret)
2130 			break;
2131 
2132 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2133 				      path->slots[0]);
2134 		if (found_key.objectid != key.objectid)
2135 			break;
2136 
2137 		/* chunk zero is special */
2138 		if (found_key.offset == 0)
2139 			break;
2140 
2141 		btrfs_release_path(path);
2142 		ret = btrfs_relocate_chunk(chunk_root,
2143 					   chunk_root->root_key.objectid,
2144 					   found_key.objectid,
2145 					   found_key.offset);
2146 		if (ret && ret != -ENOSPC)
2147 			goto error;
2148 		key.offset = found_key.offset - 1;
2149 	}
2150 	ret = 0;
2151 error:
2152 	btrfs_free_path(path);
2153 	mutex_unlock(&dev_root->fs_info->volume_mutex);
2154 	return ret;
2155 }
2156 
2157 /*
2158  * shrinking a device means finding all of the device extents past
2159  * the new size, and then following the back refs to the chunks.
2160  * The chunk relocation code actually frees the device extent.
2161  */
2162 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2163 {
2164 	struct btrfs_trans_handle *trans;
2165 	struct btrfs_root *root = device->dev_root;
2166 	struct btrfs_dev_extent *dev_extent = NULL;
2167 	struct btrfs_path *path;
2168 	u64 length;
2169 	u64 chunk_tree;
2170 	u64 chunk_objectid;
2171 	u64 chunk_offset;
2172 	int ret;
2173 	int slot;
2174 	int failed = 0;
2175 	bool retried = false;
2176 	struct extent_buffer *l;
2177 	struct btrfs_key key;
2178 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
2179 	u64 old_total = btrfs_super_total_bytes(super_copy);
2180 	u64 old_size = device->total_bytes;
2181 	u64 diff = device->total_bytes - new_size;
2182 
2183 	if (new_size >= device->total_bytes)
2184 		return -EINVAL;
2185 
2186 	path = btrfs_alloc_path();
2187 	if (!path)
2188 		return -ENOMEM;
2189 
2190 	path->reada = 2;
2191 
2192 	lock_chunks(root);
2193 
2194 	device->total_bytes = new_size;
2195 	if (device->writeable)
2196 		device->fs_devices->total_rw_bytes -= diff;
2197 	unlock_chunks(root);
2198 
2199 again:
2200 	key.objectid = device->devid;
2201 	key.offset = (u64)-1;
2202 	key.type = BTRFS_DEV_EXTENT_KEY;
2203 
2204 	while (1) {
2205 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2206 		if (ret < 0)
2207 			goto done;
2208 
2209 		ret = btrfs_previous_item(root, path, 0, key.type);
2210 		if (ret < 0)
2211 			goto done;
2212 		if (ret) {
2213 			ret = 0;
2214 			btrfs_release_path(path);
2215 			break;
2216 		}
2217 
2218 		l = path->nodes[0];
2219 		slot = path->slots[0];
2220 		btrfs_item_key_to_cpu(l, &key, path->slots[0]);
2221 
2222 		if (key.objectid != device->devid) {
2223 			btrfs_release_path(path);
2224 			break;
2225 		}
2226 
2227 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2228 		length = btrfs_dev_extent_length(l, dev_extent);
2229 
2230 		if (key.offset + length <= new_size) {
2231 			btrfs_release_path(path);
2232 			break;
2233 		}
2234 
2235 		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2236 		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2237 		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2238 		btrfs_release_path(path);
2239 
2240 		ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
2241 					   chunk_offset);
2242 		if (ret && ret != -ENOSPC)
2243 			goto done;
2244 		if (ret == -ENOSPC)
2245 			failed++;
2246 		key.offset -= 1;
2247 	}
2248 
2249 	if (failed && !retried) {
2250 		failed = 0;
2251 		retried = true;
2252 		goto again;
2253 	} else if (failed && retried) {
2254 		ret = -ENOSPC;
2255 		lock_chunks(root);
2256 
2257 		device->total_bytes = old_size;
2258 		if (device->writeable)
2259 			device->fs_devices->total_rw_bytes += diff;
2260 		unlock_chunks(root);
2261 		goto done;
2262 	}
2263 
2264 	/* Shrinking succeeded, else we would be at "done". */
2265 	trans = btrfs_start_transaction(root, 0);
2266 	if (IS_ERR(trans)) {
2267 		ret = PTR_ERR(trans);
2268 		goto done;
2269 	}
2270 
2271 	lock_chunks(root);
2272 
2273 	device->disk_total_bytes = new_size;
2274 	/* Now btrfs_update_device() will change the on-disk size. */
2275 	ret = btrfs_update_device(trans, device);
2276 	if (ret) {
2277 		unlock_chunks(root);
2278 		btrfs_end_transaction(trans, root);
2279 		goto done;
2280 	}
2281 	WARN_ON(diff > old_total);
2282 	btrfs_set_super_total_bytes(super_copy, old_total - diff);
2283 	unlock_chunks(root);
2284 	btrfs_end_transaction(trans, root);
2285 done:
2286 	btrfs_free_path(path);
2287 	return ret;
2288 }
2289 
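/*
 * Append a (key, chunk item) pair to the superblock's sys_chunk_array,
 * failing with -EFBIG if the array would overflow.
 */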
2290 static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
2291 			   struct btrfs_root *root,
2292 			   struct btrfs_key *key,
2293 			   struct btrfs_chunk *chunk, int item_size)
2294 {
2295 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
2296 	struct btrfs_disk_key disk_key;
2297 	u32 array_size;
2298 	u8 *ptr;
2299 
2300 	array_size = btrfs_super_sys_array_size(super_copy);
2301 	if (array_size + item_size + sizeof(disk_key) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
2302 		return -EFBIG;
2303 
2304 	ptr = super_copy->sys_chunk_array + array_size;
2305 	btrfs_cpu_key_to_disk(&disk_key, key);
2306 	memcpy(ptr, &disk_key, sizeof(disk_key));
2307 	ptr += sizeof(disk_key);
2308 	memcpy(ptr, chunk, item_size);
2309 	item_size += sizeof(disk_key);
2310 	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
2311 	return 0;
2312 }
2313 
2314 /*
2315  * sort the devices in descending order by max_avail, total_avail
2316  */
2317 static int btrfs_cmp_device_info(const void *a, const void *b)
2318 {
2319 	const struct btrfs_device_info *di_a = a;
2320 	const struct btrfs_device_info *di_b = b;
2321 
2322 	if (di_a->max_avail > di_b->max_avail)
2323 		return -1;
2324 	if (di_a->max_avail < di_b->max_avail)
2325 		return 1;
2326 	if (di_a->total_avail > di_b->total_avail)
2327 		return -1;
2328 	if (di_a->total_avail < di_b->total_avail)
2329 		return 1;
2330 	return 0;
2331 }
2332 
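/*
 * First phase of chunk allocation: gather the largest free hole on every
 * writeable device, sort the devices by available space, size the stripes
 * according to the RAID profile limits, then build the in-memory map,
 * insert the extent mapping, create the block group and allocate one
 * device extent per stripe.  The chunk tree itself is not modified here.
 */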
2333 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2334 			       struct btrfs_root *extent_root,
2335 			       struct map_lookup **map_ret,
2336 			       u64 *num_bytes_out, u64 *stripe_size_out,
2337 			       u64 start, u64 type)
2338 {
2339 	struct btrfs_fs_info *info = extent_root->fs_info;
2340 	struct btrfs_fs_devices *fs_devices = info->fs_devices;
2341 	struct list_head *cur;
2342 	struct map_lookup *map = NULL;
2343 	struct extent_map_tree *em_tree;
2344 	struct extent_map *em;
2345 	struct btrfs_device_info *devices_info = NULL;
2346 	u64 total_avail;
2347 	int num_stripes;	/* total number of stripes to allocate */
2348 	int sub_stripes;	/* sub_stripes info for map */
2349 	int dev_stripes;	/* stripes per dev */
2350 	int devs_max;		/* max devs to use */
2351 	int devs_min;		/* min devs needed */
2352 	int devs_increment;	/* ndevs has to be a multiple of this */
2353 	int ncopies;		/* how many copies of the data there are */
2354 	int ret;
2355 	u64 max_stripe_size;
2356 	u64 max_chunk_size;
2357 	u64 stripe_size;
2358 	u64 num_bytes;
2359 	int ndevs;
2360 	int i;
2361 	int j;
2362 
2363 	if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
2364 	    (type & BTRFS_BLOCK_GROUP_DUP)) {
2365 		WARN_ON(1);
2366 		type &= ~BTRFS_BLOCK_GROUP_DUP;
2367 	}
2368 
2369 	if (list_empty(&fs_devices->alloc_list))
2370 		return -ENOSPC;
2371 
2372 	sub_stripes = 1;
2373 	dev_stripes = 1;
2374 	devs_increment = 1;
2375 	ncopies = 1;
2376 	devs_max = 0;	/* 0 == as many as possible */
2377 	devs_min = 1;
2378 
2379 	/*
2380 	 * define the properties of each RAID type.
2381 	 * FIXME: move this to a global table and use it in all RAID
2382 	 * calculation code
2383 	 */
2384 	if (type & (BTRFS_BLOCK_GROUP_DUP)) {
2385 		dev_stripes = 2;
2386 		ncopies = 2;
2387 		devs_max = 1;
2388 	} else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
2389 		devs_min = 2;
2390 	} else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2391 		devs_increment = 2;
2392 		ncopies = 2;
2393 		devs_max = 2;
2394 		devs_min = 2;
2395 	} else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2396 		sub_stripes = 2;
2397 		devs_increment = 2;
2398 		ncopies = 2;
2399 		devs_min = 4;
2400 	} else {
2401 		devs_max = 1;
2402 	}
2403 
2404 	if (type & BTRFS_BLOCK_GROUP_DATA) {
2405 		max_stripe_size = 1024 * 1024 * 1024;
2406 		max_chunk_size = 10 * max_stripe_size;
2407 	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
2408 		max_stripe_size = 256 * 1024 * 1024;
2409 		max_chunk_size = max_stripe_size;
2410 	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2411 		max_stripe_size = 8 * 1024 * 1024;
2412 		max_chunk_size = 2 * max_stripe_size;
2413 	} else {
2414 		printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
2415 		       type);
2416 		BUG_ON(1);
2417 	}
2418 
2419 	/* we don't want a chunk larger than 10% of writeable space */
2420 	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2421 			     max_chunk_size);
2422 
2423 	devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
2424 			       GFP_NOFS);
2425 	if (!devices_info)
2426 		return -ENOMEM;
2427 
2428 	cur = fs_devices->alloc_list.next;
2429 
2430 	/*
2431 	 * in the first pass through the devices list, we gather information
2432 	 * about the available holes on each device.
2433 	 */
2434 	ndevs = 0;
2435 	while (cur != &fs_devices->alloc_list) {
2436 		struct btrfs_device *device;
2437 		u64 max_avail;
2438 		u64 dev_offset;
2439 
2440 		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
2441 
2442 		cur = cur->next;
2443 
2444 		if (!device->writeable) {
2445 			printk(KERN_ERR
2446 			       "btrfs: read-only device in alloc_list\n");
2447 			WARN_ON(1);
2448 			continue;
2449 		}
2450 
2451 		if (!device->in_fs_metadata)
2452 			continue;
2453 
2454 		if (device->total_bytes > device->bytes_used)
2455 			total_avail = device->total_bytes - device->bytes_used;
2456 		else
2457 			total_avail = 0;
2458 
2459 		/* If there is no space on this device, skip it. */
2460 		if (total_avail == 0)
2461 			continue;
2462 
2463 		ret = find_free_dev_extent(trans, device,
2464 					   max_stripe_size * dev_stripes,
2465 					   &dev_offset, &max_avail);
2466 		if (ret && ret != -ENOSPC)
2467 			goto error;
2468 
2469 		if (ret == 0)
2470 			max_avail = max_stripe_size * dev_stripes;
2471 
2472 		if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
2473 			continue;
2474 
2475 		devices_info[ndevs].dev_offset = dev_offset;
2476 		devices_info[ndevs].max_avail = max_avail;
2477 		devices_info[ndevs].total_avail = total_avail;
2478 		devices_info[ndevs].dev = device;
2479 		++ndevs;
2480 	}
2481 
2482 	/*
2483 	 * now sort the devices by hole size / available space
2484 	 */
2485 	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
2486 	     btrfs_cmp_device_info, NULL);
2487 
2488 	/* round down to number of usable stripes */
2489 	ndevs -= ndevs % devs_increment;
2490 
2491 	if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
2492 		ret = -ENOSPC;
2493 		goto error;
2494 	}
2495 
2496 	if (devs_max && ndevs > devs_max)
2497 		ndevs = devs_max;
2498 	/*
2499 	 * the primary goal is to maximize the number of stripes, so use as many
2500 	 * devices as possible, even if the stripes are not maximum sized.
2501 	 */
2502 	stripe_size = devices_info[ndevs-1].max_avail;
2503 	num_stripes = ndevs * dev_stripes;
2504 
2505 	if (stripe_size * num_stripes > max_chunk_size * ncopies) {
2506 		stripe_size = max_chunk_size * ncopies;
2507 		do_div(stripe_size, num_stripes);
2508 	}
2509 
2510 	do_div(stripe_size, dev_stripes);
2511 	do_div(stripe_size, BTRFS_STRIPE_LEN);
2512 	stripe_size *= BTRFS_STRIPE_LEN;
2513 
2514 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2515 	if (!map) {
2516 		ret = -ENOMEM;
2517 		goto error;
2518 	}
2519 	map->num_stripes = num_stripes;
2520 
2521 	for (i = 0; i < ndevs; ++i) {
2522 		for (j = 0; j < dev_stripes; ++j) {
2523 			int s = i * dev_stripes + j;
2524 			map->stripes[s].dev = devices_info[i].dev;
2525 			map->stripes[s].physical = devices_info[i].dev_offset +
2526 						   j * stripe_size;
2527 		}
2528 	}
2529 	map->sector_size = extent_root->sectorsize;
2530 	map->stripe_len = BTRFS_STRIPE_LEN;
2531 	map->io_align = BTRFS_STRIPE_LEN;
2532 	map->io_width = BTRFS_STRIPE_LEN;
2533 	map->type = type;
2534 	map->sub_stripes = sub_stripes;
2535 
2536 	*map_ret = map;
2537 	num_bytes = stripe_size * (num_stripes / ncopies);
2538 
2539 	*stripe_size_out = stripe_size;
2540 	*num_bytes_out = num_bytes;
2541 
2542 	trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
2543 
2544 	em = alloc_extent_map();
2545 	if (!em) {
2546 		ret = -ENOMEM;
2547 		goto error;
2548 	}
2549 	em->bdev = (struct block_device *)map;
2550 	em->start = start;
2551 	em->len = num_bytes;
2552 	em->block_start = 0;
2553 	em->block_len = em->len;
2554 
2555 	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
2556 	write_lock(&em_tree->lock);
2557 	ret = add_extent_mapping(em_tree, em);
2558 	write_unlock(&em_tree->lock);
2559 	BUG_ON(ret);
2560 	free_extent_map(em);
2561 
2562 	ret = btrfs_make_block_group(trans, extent_root, 0, type,
2563 				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2564 				     start, num_bytes);
2565 	BUG_ON(ret);
2566 
2567 	for (i = 0; i < map->num_stripes; ++i) {
2568 		struct btrfs_device *device;
2569 		u64 dev_offset;
2570 
2571 		device = map->stripes[i].dev;
2572 		dev_offset = map->stripes[i].physical;
2573 
2574 		ret = btrfs_alloc_dev_extent(trans, device,
2575 				info->chunk_root->root_key.objectid,
2576 				BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2577 				start, dev_offset, stripe_size);
2578 		BUG_ON(ret);
2579 	}
2580 
2581 	kfree(devices_info);
2582 	return 0;
2583 
2584 error:
2585 	kfree(map);
2586 	kfree(devices_info);
2587 	return ret;
2588 }
2589 
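/*
 * Second phase of chunk allocation: account the new stripes in each
 * device's bytes_used, build the on-disk chunk item and insert it into the
 * chunk tree (and into the superblock sys_chunk_array for SYSTEM chunks).
 */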
2590 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2591 				struct btrfs_root *extent_root,
2592 				struct map_lookup *map, u64 chunk_offset,
2593 				u64 chunk_size, u64 stripe_size)
2594 {
2595 	u64 dev_offset;
2596 	struct btrfs_key key;
2597 	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2598 	struct btrfs_device *device;
2599 	struct btrfs_chunk *chunk;
2600 	struct btrfs_stripe *stripe;
2601 	size_t item_size = btrfs_chunk_item_size(map->num_stripes);
2602 	int index = 0;
2603 	int ret;
2604 
2605 	chunk = kzalloc(item_size, GFP_NOFS);
2606 	if (!chunk)
2607 		return -ENOMEM;
2608 
2609 	index = 0;
2610 	while (index < map->num_stripes) {
2611 		device = map->stripes[index].dev;
2612 		device->bytes_used += stripe_size;
2613 		ret = btrfs_update_device(trans, device);
2614 		BUG_ON(ret);
2615 		index++;
2616 	}
2617 
2618 	index = 0;
2619 	stripe = &chunk->stripe;
2620 	while (index < map->num_stripes) {
2621 		device = map->stripes[index].dev;
2622 		dev_offset = map->stripes[index].physical;
2623 
2624 		btrfs_set_stack_stripe_devid(stripe, device->devid);
2625 		btrfs_set_stack_stripe_offset(stripe, dev_offset);
2626 		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
2627 		stripe++;
2628 		index++;
2629 	}
2630 
2631 	btrfs_set_stack_chunk_length(chunk, chunk_size);
2632 	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
2633 	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
2634 	btrfs_set_stack_chunk_type(chunk, map->type);
2635 	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
2636 	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
2637 	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
2638 	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
2639 	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
2640 
2641 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2642 	key.type = BTRFS_CHUNK_ITEM_KEY;
2643 	key.offset = chunk_offset;
2644 
2645 	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
2646 	BUG_ON(ret);
2647 
2648 	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2649 		ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
2650 					     item_size);
2651 		BUG_ON(ret);
2652 	}
2653 
2654 	kfree(chunk);
2655 	return 0;
2656 }
2657 
2658 /*
2659  * Chunk allocation falls into two parts. The first part does the work
2660  * that makes the newly allocated chunk usable, but does not do any
2661  * operation that modifies the chunk tree. The second part does the work
2662  * that requires modifying the chunk tree. This division is important for the
2663  * bootstrap process of adding storage to a seed btrfs.
2664  */
2665 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2666 		      struct btrfs_root *extent_root, u64 type)
2667 {
2668 	u64 chunk_offset;
2669 	u64 chunk_size;
2670 	u64 stripe_size;
2671 	struct map_lookup *map;
2672 	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2673 	int ret;
2674 
2675 	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2676 			      &chunk_offset);
2677 	if (ret)
2678 		return ret;
2679 
2680 	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2681 				  &stripe_size, chunk_offset, type);
2682 	if (ret)
2683 		return ret;
2684 
2685 	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2686 				   chunk_size, stripe_size);
2687 	BUG_ON(ret);
2688 	return 0;
2689 }
2690 
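/*
 * Allocate the initial metadata and system chunks for the first writeable
 * device, as part of bootstrapping storage on a seed filesystem.
 */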
2691 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2692 					 struct btrfs_root *root,
2693 					 struct btrfs_device *device)
2694 {
2695 	u64 chunk_offset;
2696 	u64 sys_chunk_offset;
2697 	u64 chunk_size;
2698 	u64 sys_chunk_size;
2699 	u64 stripe_size;
2700 	u64 sys_stripe_size;
2701 	u64 alloc_profile;
2702 	struct map_lookup *map;
2703 	struct map_lookup *sys_map;
2704 	struct btrfs_fs_info *fs_info = root->fs_info;
2705 	struct btrfs_root *extent_root = fs_info->extent_root;
2706 	int ret;
2707 
2708 	ret = find_next_chunk(fs_info->chunk_root,
2709 			      BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
2710 	if (ret)
2711 		return ret;
2712 
2713 	alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2714 			(fs_info->metadata_alloc_profile &
2715 			 fs_info->avail_metadata_alloc_bits);
2716 	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2717 
2718 	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2719 				  &stripe_size, chunk_offset, alloc_profile);
2720 	BUG_ON(ret);
2721 
2722 	sys_chunk_offset = chunk_offset + chunk_size;
2723 
2724 	alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2725 			(fs_info->system_alloc_profile &
2726 			 fs_info->avail_system_alloc_bits);
2727 	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2728 
2729 	ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
2730 				  &sys_chunk_size, &sys_stripe_size,
2731 				  sys_chunk_offset, alloc_profile);
2732 	BUG_ON(ret);
2733 
2734 	ret = btrfs_add_device(trans, fs_info->chunk_root, device);
2735 	BUG_ON(ret);
2736 
2737 	/*
2738 	 * Modifying the chunk tree needs to allocate new blocks from both
2739 	 * the system block group and the metadata block group. So we can
2740 	 * only do operations that require modifying the chunk tree after
2741 	 * both block groups have been created.
2742 	 */
2743 	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2744 				   chunk_size, stripe_size);
2745 	BUG_ON(ret);
2746 
2747 	ret = __finish_chunk_alloc(trans, extent_root, sys_map,
2748 				   sys_chunk_offset, sys_chunk_size,
2749 				   sys_stripe_size);
2750 	BUG_ON(ret);
2751 	return 0;
2752 }
2753 
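/*
 * Return 1 if the chunk at chunk_offset has a stripe on a device that is
 * not writeable (or has no mapping at all), so any block group in it must
 * be read-only.  On a DEGRADED mount every chunk is treated as writeable.
 */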
2754 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
2755 {
2756 	struct extent_map *em;
2757 	struct map_lookup *map;
2758 	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2759 	int readonly = 0;
2760 	int i;
2761 
2762 	read_lock(&map_tree->map_tree.lock);
2763 	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2764 	read_unlock(&map_tree->map_tree.lock);
2765 	if (!em)
2766 		return 1;
2767 
2768 	if (btrfs_test_opt(root, DEGRADED)) {
2769 		free_extent_map(em);
2770 		return 0;
2771 	}
2772 
2773 	map = (struct map_lookup *)em->bdev;
2774 	for (i = 0; i < map->num_stripes; i++) {
2775 		if (!map->stripes[i].dev->writeable) {
2776 			readonly = 1;
2777 			break;
2778 		}
2779 	}
2780 	free_extent_map(em);
2781 	return readonly;
2782 }
2783 
2784 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
2785 {
2786 	extent_map_tree_init(&tree->map_tree);
2787 }
2788 
2789 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
2790 {
2791 	struct extent_map *em;
2792 
2793 	while (1) {
2794 		write_lock(&tree->map_tree.lock);
2795 		em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
2796 		if (em)
2797 			remove_extent_mapping(&tree->map_tree, em);
2798 		write_unlock(&tree->map_tree.lock);
2799 		if (!em)
2800 			break;
2801 		kfree(em->bdev);
2802 		/* once for us */
2803 		free_extent_map(em);
2804 		/* once for the tree */
2805 		free_extent_map(em);
2806 	}
2807 }
2808 
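/*
 * Return how many copies exist of the data in this logical range:
 * num_stripes for RAID1 and DUP, sub_stripes for RAID10, otherwise 1.
 */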
2809 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
2810 {
2811 	struct extent_map *em;
2812 	struct map_lookup *map;
2813 	struct extent_map_tree *em_tree = &map_tree->map_tree;
2814 	int ret;
2815 
2816 	read_lock(&em_tree->lock);
2817 	em = lookup_extent_mapping(em_tree, logical, len);
2818 	read_unlock(&em_tree->lock);
2819 	BUG_ON(!em);
2820 
2821 	BUG_ON(em->start > logical || em->start + em->len < logical);
2822 	map = (struct map_lookup *)em->bdev;
2823 	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
2824 		ret = map->num_stripes;
2825 	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2826 		ret = map->sub_stripes;
2827 	else
2828 		ret = 1;
2829 	free_extent_map(em);
2830 	return ret;
2831 }
2832 
2833 static int find_live_mirror(struct map_lookup *map, int first, int num,
2834 			    int optimal)
2835 {
2836 	int i;
2837 	if (map->stripes[optimal].dev->bdev)
2838 		return optimal;
2839 	for (i = first; i < first + num; i++) {
2840 		if (map->stripes[i].dev->bdev)
2841 			return i;
2842 	}
2843 	/* we couldn't find one that doesn't fail.  Just return something
2844 	 * and the io error handling code will clean up eventually
2845 	 */
2846 	return optimal;
2847 }
2848 
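/*
 * Map a logical range onto physical stripes.  *length is trimmed so a bio
 * never crosses a stripe boundary on striped profiles, a single mirror is
 * chosen for reads while writes and discards get every stripe, and the
 * resulting physical locations are returned in a btrfs_multi_bio.
 */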
2849 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2850 			     u64 logical, u64 *length,
2851 			     struct btrfs_multi_bio **multi_ret,
2852 			     int mirror_num)
2853 {
2854 	struct extent_map *em;
2855 	struct map_lookup *map;
2856 	struct extent_map_tree *em_tree = &map_tree->map_tree;
2857 	u64 offset;
2858 	u64 stripe_offset;
2859 	u64 stripe_end_offset;
2860 	u64 stripe_nr;
2861 	u64 stripe_nr_orig;
2862 	u64 stripe_nr_end;
2863 	int stripes_allocated = 8;
2864 	int stripes_required = 1;
2865 	int stripe_index;
2866 	int i;
2867 	int num_stripes;
2868 	int max_errors = 0;
2869 	struct btrfs_multi_bio *multi = NULL;
2870 
2871 	if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2872 		stripes_allocated = 1;
2873 again:
2874 	if (multi_ret) {
2875 		multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
2876 				GFP_NOFS);
2877 		if (!multi)
2878 			return -ENOMEM;
2879 
2880 		atomic_set(&multi->error, 0);
2881 	}
2882 
2883 	read_lock(&em_tree->lock);
2884 	em = lookup_extent_mapping(em_tree, logical, *length);
2885 	read_unlock(&em_tree->lock);
2886 
2887 	if (!em) {
2888 		printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2889 		       (unsigned long long)logical,
2890 		       (unsigned long long)*length);
2891 		BUG();
2892 	}
2893 
2894 	BUG_ON(em->start > logical || em->start + em->len < logical);
2895 	map = (struct map_lookup *)em->bdev;
2896 	offset = logical - em->start;
2897 
2898 	if (mirror_num > map->num_stripes)
2899 		mirror_num = 0;
2900 
2901 	/* if our multi bio struct is too small, back off and try again */
2902 	if (rw & REQ_WRITE) {
2903 		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2904 				 BTRFS_BLOCK_GROUP_DUP)) {
2905 			stripes_required = map->num_stripes;
2906 			max_errors = 1;
2907 		} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2908 			stripes_required = map->sub_stripes;
2909 			max_errors = 1;
2910 		}
2911 	}
2912 	if (rw & REQ_DISCARD) {
2913 		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2914 				 BTRFS_BLOCK_GROUP_RAID1 |
2915 				 BTRFS_BLOCK_GROUP_DUP |
2916 				 BTRFS_BLOCK_GROUP_RAID10)) {
2917 			stripes_required = map->num_stripes;
2918 		}
2919 	}
2920 	if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2921 	    stripes_allocated < stripes_required) {
2922 		stripes_allocated = map->num_stripes;
2923 		free_extent_map(em);
2924 		kfree(multi);
2925 		goto again;
2926 	}
2927 	stripe_nr = offset;
2928 	/*
2929 	 * stripe_nr counts the total number of stripes we have to stride
2930 	 * to get to this block
2931 	 */
2932 	do_div(stripe_nr, map->stripe_len);
2933 
2934 	stripe_offset = stripe_nr * map->stripe_len;
2935 	BUG_ON(offset < stripe_offset);
2936 
2937 	/* stripe_offset is the offset of this block in its stripe*/
2938 	/* stripe_offset is the offset of this block in its stripe */
2939 
2940 	if (rw & REQ_DISCARD)
2941 		*length = min_t(u64, em->len - offset, *length);
2942 	else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2943 			      BTRFS_BLOCK_GROUP_RAID1 |
2944 			      BTRFS_BLOCK_GROUP_RAID10 |
2945 			      BTRFS_BLOCK_GROUP_DUP)) {
2946 		/* we limit the length of each bio to what fits in a stripe */
2947 		*length = min_t(u64, em->len - offset,
2948 				map->stripe_len - stripe_offset);
2949 	} else {
2950 		*length = em->len - offset;
2951 	}
2952 
2953 	if (!multi_ret)
2954 		goto out;
2955 
2956 	num_stripes = 1;
2957 	stripe_index = 0;
2958 	stripe_nr_orig = stripe_nr;
2959 	stripe_nr_end = (offset + *length + map->stripe_len - 1) &
2960 			(~(map->stripe_len - 1));
2961 	do_div(stripe_nr_end, map->stripe_len);
2962 	stripe_end_offset = stripe_nr_end * map->stripe_len -
2963 			    (offset + *length);
2964 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2965 		if (rw & REQ_DISCARD)
2966 			num_stripes = min_t(u64, map->num_stripes,
2967 					    stripe_nr_end - stripe_nr_orig);
2968 		stripe_index = do_div(stripe_nr, map->num_stripes);
2969 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2970 		if (rw & (REQ_WRITE | REQ_DISCARD))
2971 			num_stripes = map->num_stripes;
2972 		else if (mirror_num)
2973 			stripe_index = mirror_num - 1;
2974 		else {
2975 			stripe_index = find_live_mirror(map, 0,
2976 					    map->num_stripes,
2977 					    current->pid % map->num_stripes);
2978 		}
2979 
2980 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2981 		if (rw & (REQ_WRITE | REQ_DISCARD))
2982 			num_stripes = map->num_stripes;
2983 		else if (mirror_num)
2984 			stripe_index = mirror_num - 1;
2985 
2986 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2987 		int factor = map->num_stripes / map->sub_stripes;
2988 
2989 		stripe_index = do_div(stripe_nr, factor);
2990 		stripe_index *= map->sub_stripes;
2991 
2992 		if (rw & REQ_WRITE)
2993 			num_stripes = map->sub_stripes;
2994 		else if (rw & REQ_DISCARD)
2995 			num_stripes = min_t(u64, map->sub_stripes *
2996 					    (stripe_nr_end - stripe_nr_orig),
2997 					    map->num_stripes);
2998 		else if (mirror_num)
2999 			stripe_index += mirror_num - 1;
3000 		else {
3001 			stripe_index = find_live_mirror(map, stripe_index,
3002 					      map->sub_stripes, stripe_index +
3003 					      current->pid % map->sub_stripes);
3004 		}
3005 	} else {
3006 		/*
3007 		 * after this do_div call, stripe_nr is the number of stripes
3008 		 * on this device we have to walk to find the data, and
3009 		 * stripe_index is the number of our device in the stripe array
3010 		 */
3011 		stripe_index = do_div(stripe_nr, map->num_stripes);
3012 	}
3013 	BUG_ON(stripe_index >= map->num_stripes);
3014 
3015 	if (rw & REQ_DISCARD) {
3016 		for (i = 0; i < num_stripes; i++) {
3017 			multi->stripes[i].physical =
3018 				map->stripes[stripe_index].physical +
3019 				stripe_offset + stripe_nr * map->stripe_len;
3020 			multi->stripes[i].dev = map->stripes[stripe_index].dev;
3021 
3022 			if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3023 				u64 stripes;
3024 				u32 last_stripe = 0;
3025 				int j;
3026 
3027 				div_u64_rem(stripe_nr_end - 1,
3028 					    map->num_stripes,
3029 					    &last_stripe);
3030 
3031 				for (j = 0; j < map->num_stripes; j++) {
3032 					u32 test;
3033 
3034 					div_u64_rem(stripe_nr_end - 1 - j,
3035 						    map->num_stripes, &test);
3036 					if (test == stripe_index)
3037 						break;
3038 				}
3039 				stripes = stripe_nr_end - 1 - j;
3040 				do_div(stripes, map->num_stripes);
3041 				multi->stripes[i].length = map->stripe_len *
3042 					(stripes - stripe_nr + 1);
3043 
3044 				if (i == 0) {
3045 					multi->stripes[i].length -=
3046 						stripe_offset;
3047 					stripe_offset = 0;
3048 				}
3049 				if (stripe_index == last_stripe)
3050 					multi->stripes[i].length -=
3051 						stripe_end_offset;
3052 			} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3053 				u64 stripes;
3054 				int j;
3055 				int factor = map->num_stripes /
3056 					     map->sub_stripes;
3057 				u32 last_stripe = 0;
3058 
3059 				div_u64_rem(stripe_nr_end - 1,
3060 					    factor, &last_stripe);
3061 				last_stripe *= map->sub_stripes;
3062 
3063 				for (j = 0; j < factor; j++) {
3064 					u32 test;
3065 
3066 					div_u64_rem(stripe_nr_end - 1 - j,
3067 						    factor, &test);
3068 
3069 					if (test ==
3070 					    stripe_index / map->sub_stripes)
3071 						break;
3072 				}
3073 				stripes = stripe_nr_end - 1 - j;
3074 				do_div(stripes, factor);
3075 				multi->stripes[i].length = map->stripe_len *
3076 					(stripes - stripe_nr + 1);
3077 
3078 				if (i < map->sub_stripes) {
3079 					multi->stripes[i].length -=
3080 						stripe_offset;
3081 					if (i == map->sub_stripes - 1)
3082 						stripe_offset = 0;
3083 				}
3084 				if (stripe_index >= last_stripe &&
3085 				    stripe_index <= (last_stripe +
3086 						     map->sub_stripes - 1)) {
3087 					multi->stripes[i].length -=
3088 						stripe_end_offset;
3089 				}
3090 			} else
3091 				multi->stripes[i].length = *length;
3092 
3093 			stripe_index++;
3094 			if (stripe_index == map->num_stripes) {
3095 				/* This could only happen for RAID0/10 */
3096 				stripe_index = 0;
3097 				stripe_nr++;
3098 			}
3099 		}
3100 	} else {
3101 		for (i = 0; i < num_stripes; i++) {
3102 			multi->stripes[i].physical =
3103 				map->stripes[stripe_index].physical +
3104 				stripe_offset +
3105 				stripe_nr * map->stripe_len;
3106 			multi->stripes[i].dev =
3107 				map->stripes[stripe_index].dev;
3108 			stripe_index++;
3109 		}
3110 	}
3111 	if (multi_ret) {
3112 		*multi_ret = multi;
3113 		multi->num_stripes = num_stripes;
3114 		multi->max_errors = max_errors;
3115 	}
3116 out:
3117 	free_extent_map(em);
3118 	return 0;
3119 }
3120 
3121 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3122 		      u64 logical, u64 *length,
3123 		      struct btrfs_multi_bio **multi_ret, int mirror_num)
3124 {
3125 	return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
3126 				 mirror_num);
3127 }
3128 
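/*
 * Reverse mapping: given a chunk start and a physical byte offset on a
 * device, return the logical addresses inside the chunk that map to it.
 */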
3129 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3130 		     u64 chunk_start, u64 physical, u64 devid,
3131 		     u64 **logical, int *naddrs, int *stripe_len)
3132 {
3133 	struct extent_map_tree *em_tree = &map_tree->map_tree;
3134 	struct extent_map *em;
3135 	struct map_lookup *map;
3136 	u64 *buf;
3137 	u64 bytenr;
3138 	u64 length;
3139 	u64 stripe_nr;
3140 	int i, j, nr = 0;
3141 
3142 	read_lock(&em_tree->lock);
3143 	em = lookup_extent_mapping(em_tree, chunk_start, 1);
3144 	read_unlock(&em_tree->lock);
3145 
3146 	BUG_ON(!em || em->start != chunk_start);
3147 	map = (struct map_lookup *)em->bdev;
3148 
3149 	length = em->len;
3150 	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
3151 		do_div(length, map->num_stripes / map->sub_stripes);
3152 	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
3153 		do_div(length, map->num_stripes);
3154 
3155 	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
3156 	BUG_ON(!buf);
3157 
3158 	for (i = 0; i < map->num_stripes; i++) {
3159 		if (devid && map->stripes[i].dev->devid != devid)
3160 			continue;
3161 		if (map->stripes[i].physical > physical ||
3162 		    map->stripes[i].physical + length <= physical)
3163 			continue;
3164 
3165 		stripe_nr = physical - map->stripes[i].physical;
3166 		do_div(stripe_nr, map->stripe_len);
3167 
3168 		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3169 			stripe_nr = stripe_nr * map->num_stripes + i;
3170 			do_div(stripe_nr, map->sub_stripes);
3171 		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3172 			stripe_nr = stripe_nr * map->num_stripes + i;
3173 		}
3174 		bytenr = chunk_start + stripe_nr * map->stripe_len;
3175 		WARN_ON(nr >= map->num_stripes);
3176 		for (j = 0; j < nr; j++) {
3177 			if (buf[j] == bytenr)
3178 				break;
3179 		}
3180 		if (j == nr) {
3181 			WARN_ON(nr >= map->num_stripes);
3182 			buf[nr++] = bytenr;
3183 		}
3184 	}
3185 
3186 	*logical = buf;
3187 	*naddrs = nr;
3188 	*stripe_len = map->stripe_len;
3189 
3190 	free_extent_map(em);
3191 	return 0;
3192 }
3193 
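/*
 * Completion handler for the bios cloned in btrfs_map_bio.  Errors are
 * counted per multi-bio, and when the last stripe bio finishes the
 * original bio is completed, reporting failure only if more stripes
 * failed than max_errors allows.
 */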
3194 static void end_bio_multi_stripe(struct bio *bio, int err)
3195 {
3196 	struct btrfs_multi_bio *multi = bio->bi_private;
3197 	int is_orig_bio = 0;
3198 
3199 	if (err)
3200 		atomic_inc(&multi->error);
3201 
3202 	if (bio == multi->orig_bio)
3203 		is_orig_bio = 1;
3204 
3205 	if (atomic_dec_and_test(&multi->stripes_pending)) {
3206 		if (!is_orig_bio) {
3207 			bio_put(bio);
3208 			bio = multi->orig_bio;
3209 		}
3210 		bio->bi_private = multi->private;
3211 		bio->bi_end_io = multi->end_io;
3212 		/* only send an error to the higher layers if it is
3213 		 * beyond the tolerance of the multi-bio
3214 		 */
3215 		if (atomic_read(&multi->error) > multi->max_errors) {
3216 			err = -EIO;
3217 		} else if (err) {
3218 			/*
3219 			 * this bio is actually up to date, we didn't
3220 			 * go over the max number of errors
3221 			 */
3222 			set_bit(BIO_UPTODATE, &bio->bi_flags);
3223 			err = 0;
3224 		}
3225 		kfree(multi);
3226 
3227 		bio_endio(bio, err);
3228 	} else if (!is_orig_bio) {
3229 		bio_put(bio);
3230 	}
3231 }
3232 
3233 struct async_sched {
3234 	struct bio *bio;
3235 	int rw;
3236 	struct btrfs_fs_info *info;
3237 	struct btrfs_work work;
3238 };
3239 
3240 /*
3241  * see run_scheduled_bios for a description of why bios are collected for
3242  * async submit.
3243  *
3244  * This will add one bio to the pending list for a device and make sure
3245  * the work struct is scheduled.
3246  */
3247 static noinline int schedule_bio(struct btrfs_root *root,
3248 				 struct btrfs_device *device,
3249 				 int rw, struct bio *bio)
3250 {
3251 	int should_queue = 1;
3252 	struct btrfs_pending_bios *pending_bios;
3253 
3254 	/* don't bother with additional async steps for reads, right now */
3255 	if (!(rw & REQ_WRITE)) {
3256 		bio_get(bio);
3257 		submit_bio(rw, bio);
3258 		bio_put(bio);
3259 		return 0;
3260 	}
3261 
3262 	/*
3263 	 * nr_async_bios allows us to reliably return congestion to the
3264 	 * higher layers.  Otherwise, the async bio makes it appear we have
3265 	 * made progress against dirty pages when we've really just put it
3266 	 * on a queue for later
3267 	 */
3268 	atomic_inc(&root->fs_info->nr_async_bios);
3269 	WARN_ON(bio->bi_next);
3270 	bio->bi_next = NULL;
3271 	bio->bi_rw |= rw;
3272 
3273 	spin_lock(&device->io_lock);
3274 	if (bio->bi_rw & REQ_SYNC)
3275 		pending_bios = &device->pending_sync_bios;
3276 	else
3277 		pending_bios = &device->pending_bios;
3278 
3279 	if (pending_bios->tail)
3280 		pending_bios->tail->bi_next = bio;
3281 
3282 	pending_bios->tail = bio;
3283 	if (!pending_bios->head)
3284 		pending_bios->head = bio;
3285 	if (device->running_pending)
3286 		should_queue = 0;
3287 
3288 	spin_unlock(&device->io_lock);
3289 
3290 	if (should_queue)
3291 		btrfs_queue_worker(&root->fs_info->submit_workers,
3292 				   &device->work);
3293 	return 0;
3294 }
3295 
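/*
 * Map a bio to its stripes and submit one bio per stripe returned by the
 * mapping; the last stripe reuses the original bio and all clones complete
 * through end_bio_multi_stripe.  Stripes on missing devices are failed
 * with -EIO.
 */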
3296 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3297 		  int mirror_num, int async_submit)
3298 {
3299 	struct btrfs_mapping_tree *map_tree;
3300 	struct btrfs_device *dev;
3301 	struct bio *first_bio = bio;
3302 	u64 logical = (u64)bio->bi_sector << 9;
3303 	u64 length = 0;
3304 	u64 map_length;
3305 	struct btrfs_multi_bio *multi = NULL;
3306 	int ret;
3307 	int dev_nr = 0;
3308 	int total_devs = 1;
3309 
3310 	length = bio->bi_size;
3311 	map_tree = &root->fs_info->mapping_tree;
3312 	map_length = length;
3313 
3314 	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
3315 			      mirror_num);
3316 	BUG_ON(ret);
3317 
3318 	total_devs = multi->num_stripes;
3319 	if (map_length < length) {
3320 		printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
3321 		       "len %llu\n", (unsigned long long)logical,
3322 		       (unsigned long long)length,
3323 		       (unsigned long long)map_length);
3324 		BUG();
3325 	}
3326 	multi->end_io = first_bio->bi_end_io;
3327 	multi->private = first_bio->bi_private;
3328 	multi->orig_bio = first_bio;
3329 	atomic_set(&multi->stripes_pending, multi->num_stripes);
3330 
3331 	while (dev_nr < total_devs) {
3332 		if (total_devs > 1) {
3333 			if (dev_nr < total_devs - 1) {
3334 				bio = bio_clone(first_bio, GFP_NOFS);
3335 				BUG_ON(!bio);
3336 			} else {
3337 				bio = first_bio;
3338 			}
3339 			bio->bi_private = multi;
3340 			bio->bi_end_io = end_bio_multi_stripe;
3341 		}
3342 		bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
3343 		dev = multi->stripes[dev_nr].dev;
3344 		if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
3345 			bio->bi_bdev = dev->bdev;
3346 			if (async_submit)
3347 				schedule_bio(root, dev, rw, bio);
3348 			else
3349 				submit_bio(rw, bio);
3350 		} else {
3351 			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
3352 			bio->bi_sector = logical >> 9;
3353 			bio_endio(bio, -EIO);
3354 		}
3355 		dev_nr++;
3356 	}
3357 	if (total_devs == 1)
3358 		kfree(multi);
3359 	return 0;
3360 }
3361 
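/*
 * Search the filesystem's device list, and any seed device lists chained
 * behind it, for the device with the given devid and uuid (optionally
 * restricted to a single fsid).
 */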
3362 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
3363 				       u8 *uuid, u8 *fsid)
3364 {
3365 	struct btrfs_device *device;
3366 	struct btrfs_fs_devices *cur_devices;
3367 
3368 	cur_devices = root->fs_info->fs_devices;
3369 	while (cur_devices) {
3370 		if (!fsid ||
3371 		    !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
3372 			device = __find_device(&cur_devices->devices,
3373 					       devid, uuid);
3374 			if (device)
3375 				return device;
3376 		}
3377 		cur_devices = cur_devices->seed;
3378 	}
3379 	return NULL;
3380 }
3381 
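/*
 * Allocate an in-memory placeholder for a device that is referenced by the
 * metadata but currently absent, and mark it missing.
 */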
3382 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
3383 					    u64 devid, u8 *dev_uuid)
3384 {
3385 	struct btrfs_device *device;
3386 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
3387 
3388 	device = kzalloc(sizeof(*device), GFP_NOFS);
3389 	if (!device)
3390 		return NULL;
3391 	list_add(&device->dev_list,
3392 		 &fs_devices->devices);
3393 	device->dev_root = root->fs_info->dev_root;
3394 	device->devid = devid;
3395 	device->work.func = pending_bios_fn;
3396 	device->fs_devices = fs_devices;
3397 	device->missing = 1;
3398 	fs_devices->num_devices++;
3399 	fs_devices->missing_devices++;
3400 	spin_lock_init(&device->io_lock);
3401 	INIT_LIST_HEAD(&device->dev_alloc_list);
3402 	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
3403 	return device;
3404 }
3405 
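/*
 * Turn an on-disk chunk item into an in-memory map_lookup and insert it
 * into the mapping tree.  A stripe on a missing device fails the read
 * unless DEGRADED is set, in which case a placeholder device is created.
 */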
3406 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
3407 			  struct extent_buffer *leaf,
3408 			  struct btrfs_chunk *chunk)
3409 {
3410 	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
3411 	struct map_lookup *map;
3412 	struct extent_map *em;
3413 	u64 logical;
3414 	u64 length;
3415 	u64 devid;
3416 	u8 uuid[BTRFS_UUID_SIZE];
3417 	int num_stripes;
3418 	int ret;
3419 	int i;
3420 
3421 	logical = key->offset;
3422 	length = btrfs_chunk_length(leaf, chunk);
3423 
3424 	read_lock(&map_tree->map_tree.lock);
3425 	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
3426 	read_unlock(&map_tree->map_tree.lock);
3427 
3428 	/* already mapped? */
3429 	if (em && em->start <= logical && em->start + em->len > logical) {
3430 		free_extent_map(em);
3431 		return 0;
3432 	} else if (em) {
3433 		free_extent_map(em);
3434 	}
3435 
3436 	em = alloc_extent_map();
3437 	if (!em)
3438 		return -ENOMEM;
3439 	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3440 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3441 	if (!map) {
3442 		free_extent_map(em);
3443 		return -ENOMEM;
3444 	}
3445 
3446 	em->bdev = (struct block_device *)map;
3447 	em->start = logical;
3448 	em->len = length;
3449 	em->block_start = 0;
3450 	em->block_len = em->len;
3451 
3452 	map->num_stripes = num_stripes;
3453 	map->io_width = btrfs_chunk_io_width(leaf, chunk);
3454 	map->io_align = btrfs_chunk_io_align(leaf, chunk);
3455 	map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
3456 	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
3457 	map->type = btrfs_chunk_type(leaf, chunk);
3458 	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
3459 	for (i = 0; i < num_stripes; i++) {
3460 		map->stripes[i].physical =
3461 			btrfs_stripe_offset_nr(leaf, chunk, i);
3462 		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
3463 		read_extent_buffer(leaf, uuid, (unsigned long)
3464 				   btrfs_stripe_dev_uuid_nr(chunk, i),
3465 				   BTRFS_UUID_SIZE);
3466 		map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
3467 							NULL);
3468 		if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
3469 			kfree(map);
3470 			free_extent_map(em);
3471 			return -EIO;
3472 		}
3473 		if (!map->stripes[i].dev) {
3474 			map->stripes[i].dev =
3475 				add_missing_dev(root, devid, uuid);
3476 			if (!map->stripes[i].dev) {
3477 				kfree(map);
3478 				free_extent_map(em);
3479 				return -EIO;
3480 			}
3481 		}
3482 		map->stripes[i].dev->in_fs_metadata = 1;
3483 	}
3484 
3485 	write_lock(&map_tree->map_tree.lock);
3486 	ret = add_extent_mapping(&map_tree->map_tree, em);
3487 	write_unlock(&map_tree->map_tree.lock);
3488 	BUG_ON(ret);
3489 	free_extent_map(em);
3490 
3491 	return 0;
3492 }
3493 
3494 static int fill_device_from_item(struct extent_buffer *leaf,
3495 				 struct btrfs_dev_item *dev_item,
3496 				 struct btrfs_device *device)
3497 {
3498 	unsigned long ptr;
3499 
3500 	device->devid = btrfs_device_id(leaf, dev_item);
3501 	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
3502 	device->total_bytes = device->disk_total_bytes;
3503 	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
3504 	device->type = btrfs_device_type(leaf, dev_item);
3505 	device->io_align = btrfs_device_io_align(leaf, dev_item);
3506 	device->io_width = btrfs_device_io_width(leaf, dev_item);
3507 	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
3508 
3509 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
3510 	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
3511 
3512 	return 0;
3513 }
3514 
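/*
 * Find (or clone and open) the fs_devices of the seed filesystem with the
 * given fsid and chain it onto the current filesystem's seed list.
 */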
3515 static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
3516 {
3517 	struct btrfs_fs_devices *fs_devices;
3518 	int ret;
3519 
3520 	mutex_lock(&uuid_mutex);
3521 
3522 	fs_devices = root->fs_info->fs_devices->seed;
3523 	while (fs_devices) {
3524 		if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
3525 			ret = 0;
3526 			goto out;
3527 		}
3528 		fs_devices = fs_devices->seed;
3529 	}
3530 
3531 	fs_devices = find_fsid(fsid);
3532 	if (!fs_devices) {
3533 		ret = -ENOENT;
3534 		goto out;
3535 	}
3536 
3537 	fs_devices = clone_fs_devices(fs_devices);
3538 	if (IS_ERR(fs_devices)) {
3539 		ret = PTR_ERR(fs_devices);
3540 		goto out;
3541 	}
3542 
3543 	ret = __btrfs_open_devices(fs_devices, FMODE_READ,
3544 				   root->fs_info->bdev_holder);
3545 	if (ret)
3546 		goto out;
3547 
3548 	if (!fs_devices->seeding) {
3549 		__btrfs_close_devices(fs_devices);
3550 		free_fs_devices(fs_devices);
3551 		ret = -EINVAL;
3552 		goto out;
3553 	}
3554 
3555 	fs_devices->seed = root->fs_info->fs_devices->seed;
3556 	root->fs_info->fs_devices->seed = fs_devices;
3557 out:
3558 	mutex_unlock(&uuid_mutex);
3559 	return ret;
3560 }
3561 
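/*
 * Read one device item from the chunk tree and attach it to the matching
 * in-memory device, opening seed devices or creating a missing placeholder
 * as needed, then fill in the device's sizes and uuid from the item.
 */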
3562 static int read_one_dev(struct btrfs_root *root,
3563 			struct extent_buffer *leaf,
3564 			struct btrfs_dev_item *dev_item)
3565 {
3566 	struct btrfs_device *device;
3567 	u64 devid;
3568 	int ret;
3569 	u8 fs_uuid[BTRFS_UUID_SIZE];
3570 	u8 dev_uuid[BTRFS_UUID_SIZE];
3571 
3572 	devid = btrfs_device_id(leaf, dev_item);
3573 	read_extent_buffer(leaf, dev_uuid,
3574 			   (unsigned long)btrfs_device_uuid(dev_item),
3575 			   BTRFS_UUID_SIZE);
3576 	read_extent_buffer(leaf, fs_uuid,
3577 			   (unsigned long)btrfs_device_fsid(dev_item),
3578 			   BTRFS_UUID_SIZE);
3579 
3580 	if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
3581 		ret = open_seed_devices(root, fs_uuid);
3582 		if (ret && !btrfs_test_opt(root, DEGRADED))
3583 			return ret;
3584 	}
3585 
3586 	device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
3587 	if (!device || !device->bdev) {
3588 		if (!btrfs_test_opt(root, DEGRADED))
3589 			return -EIO;
3590 
3591 		if (!device) {
3592 			printk(KERN_WARNING "warning devid %llu missing\n",
3593 			       (unsigned long long)devid);
3594 			device = add_missing_dev(root, devid, dev_uuid);
3595 			if (!device)
3596 				return -ENOMEM;
3597 		} else if (!device->missing) {
3598 			/*
3599 			 * this happens when a device that was properly setup
3600 			 * this happens when a device that was properly set up
3601 			 * device->bdev is NULL, and so we have to set
3602 			 * device->missing to one here
3603 			 */
3604 			root->fs_info->fs_devices->missing_devices++;
3605 			device->missing = 1;
3606 		}
3607 	}
3608 
3609 	if (device->fs_devices != root->fs_info->fs_devices) {
3610 		BUG_ON(device->writeable);
3611 		if (device->generation !=
3612 		    btrfs_device_generation(leaf, dev_item))
3613 			return -EINVAL;
3614 	}
3615 
3616 	fill_device_from_item(leaf, dev_item, device);
3617 	device->dev_root = root->fs_info->dev_root;
3618 	device->in_fs_metadata = 1;
3619 	if (device->writeable)
3620 		device->fs_devices->total_rw_bytes += device->total_bytes;
3621 	ret = 0;
3622 	return ret;
3623 }
3624 
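/*
 * Read the bootstrap chunks embedded in the superblock's sys_chunk_array;
 * these mappings are needed before the chunk tree itself can be read.
 */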
3625 int btrfs_read_sys_array(struct btrfs_root *root)
3626 {
3627 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
3628 	struct extent_buffer *sb;
3629 	struct btrfs_disk_key *disk_key;
3630 	struct btrfs_chunk *chunk;
3631 	u8 *ptr;
3632 	unsigned long sb_ptr;
3633 	int ret = 0;
3634 	u32 num_stripes;
3635 	u32 array_size;
3636 	u32 len = 0;
3637 	u32 cur;
3638 	struct btrfs_key key;
3639 
3640 	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
3641 					  BTRFS_SUPER_INFO_SIZE);
3642 	if (!sb)
3643 		return -ENOMEM;
3644 	btrfs_set_buffer_uptodate(sb);
3645 	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
3646 
3647 	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
3648 	array_size = btrfs_super_sys_array_size(super_copy);
3649 
3650 	ptr = super_copy->sys_chunk_array;
3651 	sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
3652 	cur = 0;
3653 
3654 	while (cur < array_size) {
3655 		disk_key = (struct btrfs_disk_key *)ptr;
3656 		btrfs_disk_key_to_cpu(&key, disk_key);
3657 
3658 		len = sizeof(*disk_key); ptr += len;
3659 		sb_ptr += len;
3660 		cur += len;
3661 
3662 		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
3663 			chunk = (struct btrfs_chunk *)sb_ptr;
3664 			ret = read_one_chunk(root, &key, sb, chunk);
3665 			if (ret)
3666 				break;
3667 			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
3668 			len = btrfs_chunk_item_size(num_stripes);
3669 		} else {
3670 			ret = -EIO;
3671 			break;
3672 		}
3673 		ptr += len;
3674 		sb_ptr += len;
3675 		cur += len;
3676 	}
3677 	free_extent_buffer(sb);
3678 	return ret;
3679 }
3680 
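/*
 * Read every device item and then every chunk item from the chunk tree so
 * that the full set of device and chunk mappings is available.
 */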
3681 int btrfs_read_chunk_tree(struct btrfs_root *root)
3682 {
3683 	struct btrfs_path *path;
3684 	struct extent_buffer *leaf;
3685 	struct btrfs_key key;
3686 	struct btrfs_key found_key;
3687 	int ret;
3688 	int slot;
3689 
3690 	root = root->fs_info->chunk_root;
3691 
3692 	path = btrfs_alloc_path();
3693 	if (!path)
3694 		return -ENOMEM;
3695 
3696 	/* first we search for all of the device items, and then we
3697 	 * read in all of the chunk items.  This way we can create chunk
3698 	 * mappings that reference all of the devices that are found
3699 	 */
3700 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
3701 	key.offset = 0;
3702 	key.type = 0;
3703 again:
3704 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3705 	if (ret < 0)
3706 		goto error;
3707 	while (1) {
3708 		leaf = path->nodes[0];
3709 		slot = path->slots[0];
3710 		if (slot >= btrfs_header_nritems(leaf)) {
3711 			ret = btrfs_next_leaf(root, path);
3712 			if (ret == 0)
3713 				continue;
3714 			if (ret < 0)
3715 				goto error;
3716 			break;
3717 		}
3718 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
3719 		if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3720 			if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
3721 				break;
3722 			if (found_key.type == BTRFS_DEV_ITEM_KEY) {
3723 				struct btrfs_dev_item *dev_item;
3724 				dev_item = btrfs_item_ptr(leaf, slot,
3725 						  struct btrfs_dev_item);
3726 				ret = read_one_dev(root, leaf, dev_item);
3727 				if (ret)
3728 					goto error;
3729 			}
3730 		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
3731 			struct btrfs_chunk *chunk;
3732 			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3733 			ret = read_one_chunk(root, &found_key, leaf, chunk);
3734 			if (ret)
3735 				goto error;
3736 		}
3737 		path->slots[0]++;
3738 	}
3739 	if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3740 		key.objectid = 0;
3741 		btrfs_release_path(path);
3742 		goto again;
3743 	}
3744 	ret = 0;
3745 error:
3746 	btrfs_free_path(path);
3747 	return ret;
3748 }
3749