xref: /openbmc/linux/fs/btrfs/volumes.c (revision 54525552)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/bio.h>
20 #include <linux/slab.h>
21 #include <linux/buffer_head.h>
22 #include <linux/blkdev.h>
23 #include <linux/random.h>
24 #include <linux/iocontext.h>
25 #include <linux/capability.h>
26 #include <asm/div64.h>
27 #include "compat.h"
28 #include "ctree.h"
29 #include "extent_map.h"
30 #include "disk-io.h"
31 #include "transaction.h"
32 #include "print-tree.h"
33 #include "volumes.h"
34 #include "async-thread.h"
35 
36 static int init_first_rw_device(struct btrfs_trans_handle *trans,
37 				struct btrfs_root *root,
38 				struct btrfs_device *device);
39 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
40 
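/* bytes needed for a map_lookup that describes n stripes */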
41 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
42 			    (sizeof(struct btrfs_bio_stripe) * (n)))
43 
44 static DEFINE_MUTEX(uuid_mutex);
45 static LIST_HEAD(fs_uuids);
46 
47 void btrfs_lock_volumes(void)
48 {
49 	mutex_lock(&uuid_mutex);
50 }
51 
52 void btrfs_unlock_volumes(void)
53 {
54 	mutex_unlock(&uuid_mutex);
55 }
56 
57 static void lock_chunks(struct btrfs_root *root)
58 {
59 	mutex_lock(&root->fs_info->chunk_mutex);
60 }
61 
62 static void unlock_chunks(struct btrfs_root *root)
63 {
64 	mutex_unlock(&root->fs_info->chunk_mutex);
65 }
66 
67 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
68 {
69 	struct btrfs_device *device;
70 	WARN_ON(fs_devices->opened);
71 	while (!list_empty(&fs_devices->devices)) {
72 		device = list_entry(fs_devices->devices.next,
73 				    struct btrfs_device, dev_list);
74 		list_del(&device->dev_list);
75 		kfree(device->name);
76 		kfree(device);
77 	}
78 	kfree(fs_devices);
79 }
80 
81 int btrfs_cleanup_fs_uuids(void)
82 {
83 	struct btrfs_fs_devices *fs_devices;
84 
85 	while (!list_empty(&fs_uuids)) {
86 		fs_devices = list_entry(fs_uuids.next,
87 					struct btrfs_fs_devices, list);
88 		list_del(&fs_devices->list);
89 		free_fs_devices(fs_devices);
90 	}
91 	return 0;
92 }
93 
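/* find the device with the given devid (and uuid, if one is supplied) in a device list */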
94 static noinline struct btrfs_device *__find_device(struct list_head *head,
95 						   u64 devid, u8 *uuid)
96 {
97 	struct btrfs_device *dev;
98 
99 	list_for_each_entry(dev, head, dev_list) {
100 		if (dev->devid == devid &&
101 		    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
102 			return dev;
103 		}
104 	}
105 	return NULL;
106 }
107 
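/* look up the fs_devices entry whose fsid matches, or return NULL */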
108 static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
109 {
110 	struct btrfs_fs_devices *fs_devices;
111 
112 	list_for_each_entry(fs_devices, &fs_uuids, list) {
113 		if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
114 			return fs_devices;
115 	}
116 	return NULL;
117 }
118 
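/*
 * put a chain of bios (head..tail) back at the front of the pending list
 * so they are submitted before anything queued after them
 */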
119 static void requeue_list(struct btrfs_pending_bios *pending_bios,
120 			struct bio *head, struct bio *tail)
121 {
122 
123 	struct bio *old_head;
124 
125 	old_head = pending_bios->head;
126 	pending_bios->head = head;
127 	if (pending_bios->tail)
128 		tail->bi_next = old_head;
129 	else
130 		pending_bios->tail = tail;
131 }
132 
133 /*
134  * we try to collect pending bios for a device so we don't get a large
135  * number of procs sending bios down to the same device.  This greatly
136  * improves the scheduler's ability to collect and merge the bios.
137  *
138  * But, it also turns into a long list of bios to process and that is sure
139  * to eventually make the worker thread block.  The solution here is to
140  * make some progress and then put this work struct back at the end of
141  * the list if the block device is congested.  This way, multiple devices
142  * can make progress from a single worker thread.
143  */
144 static noinline int run_scheduled_bios(struct btrfs_device *device)
145 {
146 	struct bio *pending;
147 	struct backing_dev_info *bdi;
148 	struct btrfs_fs_info *fs_info;
149 	struct btrfs_pending_bios *pending_bios;
150 	struct bio *tail;
151 	struct bio *cur;
152 	int again = 0;
153 	unsigned long num_run;
154 	unsigned long batch_run = 0;
155 	unsigned long limit;
156 	unsigned long last_waited = 0;
157 	int force_reg = 0;
158 	struct blk_plug plug;
159 
160 	/*
161 	 * this function runs all the bios we've collected for
162 	 * a particular device.  We don't want to wander off to
163 	 * another device without first sending all of these down.
164  * So, set up a plug here and finish it off before we return
165 	 */
166 	blk_start_plug(&plug);
167 
168 	bdi = blk_get_backing_dev_info(device->bdev);
169 	fs_info = device->dev_root->fs_info;
170 	limit = btrfs_async_submit_limit(fs_info);
171 	limit = limit * 2 / 3;
172 
173 loop:
174 	spin_lock(&device->io_lock);
175 
176 loop_lock:
177 	num_run = 0;
178 
179 	/* take all the bios off the list at once and process them
180 	 * later on (without the lock held).  But, remember the
181 	 * tail and other pointers so the bios can be properly reinserted
182 	 * into the list if we hit congestion
183 	 */
184 	if (!force_reg && device->pending_sync_bios.head) {
185 		pending_bios = &device->pending_sync_bios;
186 		force_reg = 1;
187 	} else {
188 		pending_bios = &device->pending_bios;
189 		force_reg = 0;
190 	}
191 
192 	pending = pending_bios->head;
193 	tail = pending_bios->tail;
194 	WARN_ON(pending && !tail);
195 
196 	/*
197 	 * if pending was null this time around, no bios need processing
198 	 * at all and we can stop.  Otherwise it'll loop back up again
199 	 * and do an additional check so no bios are missed.
200 	 *
201 	 * device->running_pending is used to synchronize with the
202 	 * schedule_bio code.
203 	 */
204 	if (device->pending_sync_bios.head == NULL &&
205 	    device->pending_bios.head == NULL) {
206 		again = 0;
207 		device->running_pending = 0;
208 	} else {
209 		again = 1;
210 		device->running_pending = 1;
211 	}
212 
213 	pending_bios->head = NULL;
214 	pending_bios->tail = NULL;
215 
216 	spin_unlock(&device->io_lock);
217 
218 	while (pending) {
219 
220 		rmb();
221 		/* we want to work on both lists, but do more bios on the
222 		 * sync list than the regular list
223 		 */
224 		if ((num_run > 32 &&
225 		    pending_bios != &device->pending_sync_bios &&
226 		    device->pending_sync_bios.head) ||
227 		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
228 		    device->pending_bios.head)) {
229 			spin_lock(&device->io_lock);
230 			requeue_list(pending_bios, pending, tail);
231 			goto loop_lock;
232 		}
233 
234 		cur = pending;
235 		pending = pending->bi_next;
236 		cur->bi_next = NULL;
237 		atomic_dec(&fs_info->nr_async_bios);
238 
239 		if (atomic_read(&fs_info->nr_async_bios) < limit &&
240 		    waitqueue_active(&fs_info->async_submit_wait))
241 			wake_up(&fs_info->async_submit_wait);
242 
243 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
244 
245 		submit_bio(cur->bi_rw, cur);
246 		num_run++;
247 		batch_run++;
248 		if (need_resched())
249 			cond_resched();
250 
251 		/*
252 		 * we made progress, there is more work to do and the bdi
253 		 * is now congested.  Back off and let other work structs
254 		 * run instead
255 		 */
256 		if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
257 		    fs_info->fs_devices->open_devices > 1) {
258 			struct io_context *ioc;
259 
260 			ioc = current->io_context;
261 
262 			/*
263 			 * the main goal here is that we don't want to
264 			 * block if we're going to be able to submit
265 			 * more requests without blocking.
266 			 *
267 			 * This code does two great things: it pokes into
268 			 * the elevator code from a filesystem _and_
269 			 * it makes assumptions about how batching works.
270 			 */
271 			if (ioc && ioc->nr_batch_requests > 0 &&
272 			    time_before(jiffies, ioc->last_waited + HZ/50UL) &&
273 			    (last_waited == 0 ||
274 			     ioc->last_waited == last_waited)) {
275 				/*
276 				 * we want to go through our batch of
277 				 * requests and stop.  So, we copy out
278 				 * the ioc->last_waited time and test
279 				 * against it before looping
280 				 */
281 				last_waited = ioc->last_waited;
282 				if (need_resched())
283 					cond_resched();
284 				continue;
285 			}
286 			spin_lock(&device->io_lock);
287 			requeue_list(pending_bios, pending, tail);
288 			device->running_pending = 1;
289 
290 			spin_unlock(&device->io_lock);
291 			btrfs_requeue_work(&device->work);
292 			goto done;
293 		}
294 	}
295 
296 	cond_resched();
297 	if (again)
298 		goto loop;
299 
300 	spin_lock(&device->io_lock);
301 	if (device->pending_bios.head || device->pending_sync_bios.head)
302 		goto loop_lock;
303 	spin_unlock(&device->io_lock);
304 
305 done:
306 	blk_finish_plug(&plug);
307 	return 0;
308 }
309 
310 static void pending_bios_fn(struct btrfs_work *work)
311 {
312 	struct btrfs_device *device;
313 
314 	device = container_of(work, struct btrfs_device, work);
315 	run_scheduled_bios(device);
316 }
317 
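/*
 * record the device found at @path in the in-memory list for its fsid,
 * allocating a new btrfs_fs_devices entry the first time an fsid is seen
 */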
318 static noinline int device_list_add(const char *path,
319 			   struct btrfs_super_block *disk_super,
320 			   u64 devid, struct btrfs_fs_devices **fs_devices_ret)
321 {
322 	struct btrfs_device *device;
323 	struct btrfs_fs_devices *fs_devices;
324 	u64 found_transid = btrfs_super_generation(disk_super);
325 	char *name;
326 
327 	fs_devices = find_fsid(disk_super->fsid);
328 	if (!fs_devices) {
329 		fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
330 		if (!fs_devices)
331 			return -ENOMEM;
332 		INIT_LIST_HEAD(&fs_devices->devices);
333 		INIT_LIST_HEAD(&fs_devices->alloc_list);
334 		list_add(&fs_devices->list, &fs_uuids);
335 		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
336 		fs_devices->latest_devid = devid;
337 		fs_devices->latest_trans = found_transid;
338 		mutex_init(&fs_devices->device_list_mutex);
339 		device = NULL;
340 	} else {
341 		device = __find_device(&fs_devices->devices, devid,
342 				       disk_super->dev_item.uuid);
343 	}
344 	if (!device) {
345 		if (fs_devices->opened)
346 			return -EBUSY;
347 
348 		device = kzalloc(sizeof(*device), GFP_NOFS);
349 		if (!device) {
350 			/* we can safely leave the fs_devices entry around */
351 			return -ENOMEM;
352 		}
353 		device->devid = devid;
354 		device->work.func = pending_bios_fn;
355 		memcpy(device->uuid, disk_super->dev_item.uuid,
356 		       BTRFS_UUID_SIZE);
357 		spin_lock_init(&device->io_lock);
358 		device->name = kstrdup(path, GFP_NOFS);
359 		if (!device->name) {
360 			kfree(device);
361 			return -ENOMEM;
362 		}
363 		INIT_LIST_HEAD(&device->dev_alloc_list);
364 
365 		mutex_lock(&fs_devices->device_list_mutex);
366 		list_add(&device->dev_list, &fs_devices->devices);
367 		mutex_unlock(&fs_devices->device_list_mutex);
368 
369 		device->fs_devices = fs_devices;
370 		fs_devices->num_devices++;
371 	} else if (!device->name || strcmp(device->name, path)) {
372 		name = kstrdup(path, GFP_NOFS);
373 		if (!name)
374 			return -ENOMEM;
375 		kfree(device->name);
376 		device->name = name;
377 		if (device->missing) {
378 			fs_devices->missing_devices--;
379 			device->missing = 0;
380 		}
381 	}
382 
383 	if (found_transid > fs_devices->latest_trans) {
384 		fs_devices->latest_devid = devid;
385 		fs_devices->latest_trans = found_transid;
386 	}
387 	*fs_devices_ret = fs_devices;
388 	return 0;
389 }
390 
391 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
392 {
393 	struct btrfs_fs_devices *fs_devices;
394 	struct btrfs_device *device;
395 	struct btrfs_device *orig_dev;
396 
397 	fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
398 	if (!fs_devices)
399 		return ERR_PTR(-ENOMEM);
400 
401 	INIT_LIST_HEAD(&fs_devices->devices);
402 	INIT_LIST_HEAD(&fs_devices->alloc_list);
403 	INIT_LIST_HEAD(&fs_devices->list);
404 	mutex_init(&fs_devices->device_list_mutex);
405 	fs_devices->latest_devid = orig->latest_devid;
406 	fs_devices->latest_trans = orig->latest_trans;
407 	memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
408 
409 	mutex_lock(&orig->device_list_mutex);
410 	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
411 		device = kzalloc(sizeof(*device), GFP_NOFS);
412 		if (!device)
413 			goto error;
414 
415 		device->name = kstrdup(orig_dev->name, GFP_NOFS);
416 		if (!device->name) {
417 			kfree(device);
418 			goto error;
419 		}
420 
421 		device->devid = orig_dev->devid;
422 		device->work.func = pending_bios_fn;
423 		memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
424 		spin_lock_init(&device->io_lock);
425 		INIT_LIST_HEAD(&device->dev_list);
426 		INIT_LIST_HEAD(&device->dev_alloc_list);
427 
428 		list_add(&device->dev_list, &fs_devices->devices);
429 		device->fs_devices = fs_devices;
430 		fs_devices->num_devices++;
431 	}
432 	mutex_unlock(&orig->device_list_mutex);
433 	return fs_devices;
434 error:
435 	mutex_unlock(&orig->device_list_mutex);
436 	free_fs_devices(fs_devices);
437 	return ERR_PTR(-ENOMEM);
438 }
439 
440 int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
441 {
442 	struct btrfs_device *device, *next;
443 
444 	mutex_lock(&uuid_mutex);
445 again:
446 	mutex_lock(&fs_devices->device_list_mutex);
447 	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
448 		if (device->in_fs_metadata)
449 			continue;
450 
451 		if (device->bdev) {
452 			blkdev_put(device->bdev, device->mode);
453 			device->bdev = NULL;
454 			fs_devices->open_devices--;
455 		}
456 		if (device->writeable) {
457 			list_del_init(&device->dev_alloc_list);
458 			device->writeable = 0;
459 			fs_devices->rw_devices--;
460 		}
461 		list_del_init(&device->dev_list);
462 		fs_devices->num_devices--;
463 		kfree(device->name);
464 		kfree(device);
465 	}
466 	mutex_unlock(&fs_devices->device_list_mutex);
467 
468 	if (fs_devices->seed) {
469 		fs_devices = fs_devices->seed;
470 		goto again;
471 	}
472 
473 	mutex_unlock(&uuid_mutex);
474 	return 0;
475 }
476 
477 static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
478 {
479 	struct btrfs_device *device;
480 
481 	if (--fs_devices->opened > 0)
482 		return 0;
483 
484 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
485 		if (device->bdev) {
486 			blkdev_put(device->bdev, device->mode);
487 			fs_devices->open_devices--;
488 		}
489 		if (device->writeable) {
490 			list_del_init(&device->dev_alloc_list);
491 			fs_devices->rw_devices--;
492 		}
493 
494 		device->bdev = NULL;
495 		device->writeable = 0;
496 		device->in_fs_metadata = 0;
497 	}
498 	WARN_ON(fs_devices->open_devices);
499 	WARN_ON(fs_devices->rw_devices);
500 	fs_devices->opened = 0;
501 	fs_devices->seeding = 0;
502 
503 	return 0;
504 }
505 
506 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
507 {
508 	struct btrfs_fs_devices *seed_devices = NULL;
509 	int ret;
510 
511 	mutex_lock(&uuid_mutex);
512 	ret = __btrfs_close_devices(fs_devices);
513 	if (!fs_devices->opened) {
514 		seed_devices = fs_devices->seed;
515 		fs_devices->seed = NULL;
516 	}
517 	mutex_unlock(&uuid_mutex);
518 
519 	while (seed_devices) {
520 		fs_devices = seed_devices;
521 		seed_devices = fs_devices->seed;
522 		__btrfs_close_devices(fs_devices);
523 		free_fs_devices(fs_devices);
524 	}
525 	return ret;
526 }
527 
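/*
 * open every device in the list and read its super block; the device with
 * the highest generation becomes latest_bdev.  Fails only if no device
 * could be opened at all.
 */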
528 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
529 				fmode_t flags, void *holder)
530 {
531 	struct block_device *bdev;
532 	struct list_head *head = &fs_devices->devices;
533 	struct btrfs_device *device;
534 	struct block_device *latest_bdev = NULL;
535 	struct buffer_head *bh;
536 	struct btrfs_super_block *disk_super;
537 	u64 latest_devid = 0;
538 	u64 latest_transid = 0;
539 	u64 devid;
540 	int seeding = 1;
541 	int ret = 0;
542 
543 	flags |= FMODE_EXCL;
544 
545 	list_for_each_entry(device, head, dev_list) {
546 		if (device->bdev)
547 			continue;
548 		if (!device->name)
549 			continue;
550 
551 		bdev = blkdev_get_by_path(device->name, flags, holder);
552 		if (IS_ERR(bdev)) {
553 			printk(KERN_INFO "open %s failed\n", device->name);
554 			goto error;
555 		}
556 		set_blocksize(bdev, 4096);
557 
558 		bh = btrfs_read_dev_super(bdev);
559 		if (!bh) {
560 			ret = -EINVAL;
561 			goto error_close;
562 		}
563 
564 		disk_super = (struct btrfs_super_block *)bh->b_data;
565 		devid = btrfs_stack_device_id(&disk_super->dev_item);
566 		if (devid != device->devid)
567 			goto error_brelse;
568 
569 		if (memcmp(device->uuid, disk_super->dev_item.uuid,
570 			   BTRFS_UUID_SIZE))
571 			goto error_brelse;
572 
573 		device->generation = btrfs_super_generation(disk_super);
574 		if (!latest_transid || device->generation > latest_transid) {
575 			latest_devid = devid;
576 			latest_transid = device->generation;
577 			latest_bdev = bdev;
578 		}
579 
580 		if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
581 			device->writeable = 0;
582 		} else {
583 			device->writeable = !bdev_read_only(bdev);
584 			seeding = 0;
585 		}
586 
587 		device->bdev = bdev;
588 		device->in_fs_metadata = 0;
589 		device->mode = flags;
590 
591 		if (!blk_queue_nonrot(bdev_get_queue(bdev)))
592 			fs_devices->rotating = 1;
593 
594 		fs_devices->open_devices++;
595 		if (device->writeable) {
596 			fs_devices->rw_devices++;
597 			list_add(&device->dev_alloc_list,
598 				 &fs_devices->alloc_list);
599 		}
600 		continue;
601 
602 error_brelse:
603 		brelse(bh);
604 error_close:
605 		blkdev_put(bdev, flags);
606 error:
607 		continue;
608 	}
609 	if (fs_devices->open_devices == 0) {
610 		ret = -EIO;
611 		goto out;
612 	}
613 	fs_devices->seeding = seeding;
614 	fs_devices->opened = 1;
615 	fs_devices->latest_bdev = latest_bdev;
616 	fs_devices->latest_devid = latest_devid;
617 	fs_devices->latest_trans = latest_transid;
618 	fs_devices->total_rw_bytes = 0;
619 out:
620 	return ret;
621 }
622 
623 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
624 		       fmode_t flags, void *holder)
625 {
626 	int ret;
627 
628 	mutex_lock(&uuid_mutex);
629 	if (fs_devices->opened) {
630 		fs_devices->opened++;
631 		ret = 0;
632 	} else {
633 		ret = __btrfs_open_devices(fs_devices, flags, holder);
634 	}
635 	mutex_unlock(&uuid_mutex);
636 	return ret;
637 }
638 
639 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
640 			  struct btrfs_fs_devices **fs_devices_ret)
641 {
642 	struct btrfs_super_block *disk_super;
643 	struct block_device *bdev;
644 	struct buffer_head *bh;
645 	int ret;
646 	u64 devid;
647 	u64 transid;
648 
649 	mutex_lock(&uuid_mutex);
650 
651 	flags |= FMODE_EXCL;
652 	bdev = blkdev_get_by_path(path, flags, holder);
653 
654 	if (IS_ERR(bdev)) {
655 		ret = PTR_ERR(bdev);
656 		goto error;
657 	}
658 
659 	ret = set_blocksize(bdev, 4096);
660 	if (ret)
661 		goto error_close;
662 	bh = btrfs_read_dev_super(bdev);
663 	if (!bh) {
664 		ret = -EINVAL;
665 		goto error_close;
666 	}
667 	disk_super = (struct btrfs_super_block *)bh->b_data;
668 	devid = btrfs_stack_device_id(&disk_super->dev_item);
669 	transid = btrfs_super_generation(disk_super);
670 	if (disk_super->label[0])
671 		printk(KERN_INFO "device label %s ", disk_super->label);
672 	else {
673 		/* FIXME, make a real uuid parser */
674 		printk(KERN_INFO "device fsid %llx-%llx ",
675 		       *(unsigned long long *)disk_super->fsid,
676 		       *(unsigned long long *)(disk_super->fsid + 8));
677 	}
678 	printk(KERN_CONT "devid %llu transid %llu %s\n",
679 	       (unsigned long long)devid, (unsigned long long)transid, path);
680 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
681 
682 	brelse(bh);
683 error_close:
684 	blkdev_put(bdev, flags);
685 error:
686 	mutex_unlock(&uuid_mutex);
687 	return ret;
688 }
689 
690 /* helper to account the used device space in the range */
691 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
692 				   u64 end, u64 *length)
693 {
694 	struct btrfs_key key;
695 	struct btrfs_root *root = device->dev_root;
696 	struct btrfs_dev_extent *dev_extent;
697 	struct btrfs_path *path;
698 	u64 extent_end;
699 	int ret;
700 	int slot;
701 	struct extent_buffer *l;
702 
703 	*length = 0;
704 
705 	if (start >= device->total_bytes)
706 		return 0;
707 
708 	path = btrfs_alloc_path();
709 	if (!path)
710 		return -ENOMEM;
711 	path->reada = 2;
712 
713 	key.objectid = device->devid;
714 	key.offset = start;
715 	key.type = BTRFS_DEV_EXTENT_KEY;
716 
717 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
718 	if (ret < 0)
719 		goto out;
720 	if (ret > 0) {
721 		ret = btrfs_previous_item(root, path, key.objectid, key.type);
722 		if (ret < 0)
723 			goto out;
724 	}
725 
726 	while (1) {
727 		l = path->nodes[0];
728 		slot = path->slots[0];
729 		if (slot >= btrfs_header_nritems(l)) {
730 			ret = btrfs_next_leaf(root, path);
731 			if (ret == 0)
732 				continue;
733 			if (ret < 0)
734 				goto out;
735 
736 			break;
737 		}
738 		btrfs_item_key_to_cpu(l, &key, slot);
739 
740 		if (key.objectid < device->devid)
741 			goto next;
742 
743 		if (key.objectid > device->devid)
744 			break;
745 
746 		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
747 			goto next;
748 
749 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
750 		extent_end = key.offset + btrfs_dev_extent_length(l,
751 								  dev_extent);
752 		if (key.offset <= start && extent_end > end) {
753 			*length = end - start + 1;
754 			break;
755 		} else if (key.offset <= start && extent_end > start)
756 			*length += extent_end - start;
757 		else if (key.offset > start && extent_end <= end)
758 			*length += extent_end - key.offset;
759 		else if (key.offset > start && key.offset <= end) {
760 			*length += end - key.offset + 1;
761 			break;
762 		} else if (key.offset > end)
763 			break;
764 
765 next:
766 		path->slots[0]++;
767 	}
768 	ret = 0;
769 out:
770 	btrfs_free_path(path);
771 	return ret;
772 }
773 
774 /*
775  * find_free_dev_extent - find free space in the specified device
776  * @trans:	transaction handle
777  * @device:	the device which we search the free space in
778  * @num_bytes:	the size of the free space that we need
779  * @start:	store the start of the free space.
780  * @len:	the size of the free space that we find, or the size of the max
781  * 		free space if we don't find suitable free space
782  *
783  * this uses a pretty simple search; the expectation is that it is
784  * called very infrequently and that a given device has a small number
785  * of extents
786  *
787  * @start is used to store the start of the free space if we find it. But if we
788  * don't find suitable free space, it will be used to store the start position
789  * of the max free space.
790  *
791  * @len is used to store the size of the free space that we find.
792  * But if we don't find suitable free space, it is used to store the size of
793  * the max free space.
794  */
795 int find_free_dev_extent(struct btrfs_trans_handle *trans,
796 			 struct btrfs_device *device, u64 num_bytes,
797 			 u64 *start, u64 *len)
798 {
799 	struct btrfs_key key;
800 	struct btrfs_root *root = device->dev_root;
801 	struct btrfs_dev_extent *dev_extent;
802 	struct btrfs_path *path;
803 	u64 hole_size;
804 	u64 max_hole_start;
805 	u64 max_hole_size;
806 	u64 extent_end;
807 	u64 search_start;
808 	u64 search_end = device->total_bytes;
809 	int ret;
810 	int slot;
811 	struct extent_buffer *l;
812 
813 	/* FIXME use last free of some kind */
814 
815 	/* we don't want to overwrite the superblock on the drive,
816 	 * so we make sure to start at an offset of at least 1MB
817 	 */
818 	search_start = 1024 * 1024;
819 
820 	if (root->fs_info->alloc_start + num_bytes <= search_end)
821 		search_start = max(root->fs_info->alloc_start, search_start);
822 
823 	max_hole_start = search_start;
824 	max_hole_size = 0;
825 
826 	if (search_start >= search_end) {
827 		ret = -ENOSPC;
828 		goto error;
829 	}
830 
831 	path = btrfs_alloc_path();
832 	if (!path) {
833 		ret = -ENOMEM;
834 		goto error;
835 	}
836 	path->reada = 2;
837 
838 	key.objectid = device->devid;
839 	key.offset = search_start;
840 	key.type = BTRFS_DEV_EXTENT_KEY;
841 
842 	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
843 	if (ret < 0)
844 		goto out;
845 	if (ret > 0) {
846 		ret = btrfs_previous_item(root, path, key.objectid, key.type);
847 		if (ret < 0)
848 			goto out;
849 	}
850 
851 	while (1) {
852 		l = path->nodes[0];
853 		slot = path->slots[0];
854 		if (slot >= btrfs_header_nritems(l)) {
855 			ret = btrfs_next_leaf(root, path);
856 			if (ret == 0)
857 				continue;
858 			if (ret < 0)
859 				goto out;
860 
861 			break;
862 		}
863 		btrfs_item_key_to_cpu(l, &key, slot);
864 
865 		if (key.objectid < device->devid)
866 			goto next;
867 
868 		if (key.objectid > device->devid)
869 			break;
870 
871 		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
872 			goto next;
873 
874 		if (key.offset > search_start) {
875 			hole_size = key.offset - search_start;
876 
877 			if (hole_size > max_hole_size) {
878 				max_hole_start = search_start;
879 				max_hole_size = hole_size;
880 			}
881 
882 			/*
883 			 * If this free space is greater than what we need,
884 			 * it must be the max free space that we have found
885 			 * until now, so max_hole_start must point to the start
886 			 * of this free space and the length of this free space
887 			 * is stored in max_hole_size. Thus, we return
888 			 * max_hole_start and max_hole_size and go back to the
889 			 * caller.
890 			 */
891 			if (hole_size >= num_bytes) {
892 				ret = 0;
893 				goto out;
894 			}
895 		}
896 
897 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
898 		extent_end = key.offset + btrfs_dev_extent_length(l,
899 								  dev_extent);
900 		if (extent_end > search_start)
901 			search_start = extent_end;
902 next:
903 		path->slots[0]++;
904 		cond_resched();
905 	}
906 
907 	hole_size = search_end - search_start;
908 	if (hole_size > max_hole_size) {
909 		max_hole_start = search_start;
910 		max_hole_size = hole_size;
911 	}
912 
913 	/* See above. */
914 	if (hole_size < num_bytes)
915 		ret = -ENOSPC;
916 	else
917 		ret = 0;
918 
919 out:
920 	btrfs_free_path(path);
921 error:
922 	*start = max_hole_start;
923 	if (len)
924 		*len = max_hole_size;
925 	return ret;
926 }
927 
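/* delete the dev extent item covering @start on @device and subtract its length from bytes_used */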
928 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
929 			  struct btrfs_device *device,
930 			  u64 start)
931 {
932 	int ret;
933 	struct btrfs_path *path;
934 	struct btrfs_root *root = device->dev_root;
935 	struct btrfs_key key;
936 	struct btrfs_key found_key;
937 	struct extent_buffer *leaf = NULL;
938 	struct btrfs_dev_extent *extent = NULL;
939 
940 	path = btrfs_alloc_path();
941 	if (!path)
942 		return -ENOMEM;
943 
944 	key.objectid = device->devid;
945 	key.offset = start;
946 	key.type = BTRFS_DEV_EXTENT_KEY;
947 
948 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
949 	if (ret > 0) {
950 		ret = btrfs_previous_item(root, path, key.objectid,
951 					  BTRFS_DEV_EXTENT_KEY);
952 		BUG_ON(ret);
953 		leaf = path->nodes[0];
954 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
955 		extent = btrfs_item_ptr(leaf, path->slots[0],
956 					struct btrfs_dev_extent);
957 		BUG_ON(found_key.offset > start || found_key.offset +
958 		       btrfs_dev_extent_length(leaf, extent) < start);
959 		ret = 0;
960 	} else if (ret == 0) {
961 		leaf = path->nodes[0];
962 		extent = btrfs_item_ptr(leaf, path->slots[0],
963 					struct btrfs_dev_extent);
964 	}
965 	BUG_ON(ret);
966 
967 	if (device->bytes_used > 0)
968 		device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
969 	ret = btrfs_del_item(trans, root, path);
970 	BUG_ON(ret);
971 
972 	btrfs_free_path(path);
973 	return ret;
974 }
975 
976 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
977 			   struct btrfs_device *device,
978 			   u64 chunk_tree, u64 chunk_objectid,
979 			   u64 chunk_offset, u64 start, u64 num_bytes)
980 {
981 	int ret;
982 	struct btrfs_path *path;
983 	struct btrfs_root *root = device->dev_root;
984 	struct btrfs_dev_extent *extent;
985 	struct extent_buffer *leaf;
986 	struct btrfs_key key;
987 
988 	WARN_ON(!device->in_fs_metadata);
989 	path = btrfs_alloc_path();
990 	if (!path)
991 		return -ENOMEM;
992 
993 	key.objectid = device->devid;
994 	key.offset = start;
995 	key.type = BTRFS_DEV_EXTENT_KEY;
996 	ret = btrfs_insert_empty_item(trans, root, path, &key,
997 				      sizeof(*extent));
998 	BUG_ON(ret);
999 
1000 	leaf = path->nodes[0];
1001 	extent = btrfs_item_ptr(leaf, path->slots[0],
1002 				struct btrfs_dev_extent);
1003 	btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
1004 	btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
1005 	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1006 
1007 	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
1008 		    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
1009 		    BTRFS_UUID_SIZE);
1010 
1011 	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1012 	btrfs_mark_buffer_dirty(leaf);
1013 	btrfs_free_path(path);
1014 	return ret;
1015 }
1016 
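/* return the offset just past the last chunk owned by @objectid, where the next chunk can go */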
1017 static noinline int find_next_chunk(struct btrfs_root *root,
1018 				    u64 objectid, u64 *offset)
1019 {
1020 	struct btrfs_path *path;
1021 	int ret;
1022 	struct btrfs_key key;
1023 	struct btrfs_chunk *chunk;
1024 	struct btrfs_key found_key;
1025 
1026 	path = btrfs_alloc_path();
1027 	BUG_ON(!path);
1028 
1029 	key.objectid = objectid;
1030 	key.offset = (u64)-1;
1031 	key.type = BTRFS_CHUNK_ITEM_KEY;
1032 
1033 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1034 	if (ret < 0)
1035 		goto error;
1036 
1037 	BUG_ON(ret == 0);
1038 
1039 	ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
1040 	if (ret) {
1041 		*offset = 0;
1042 	} else {
1043 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1044 				      path->slots[0]);
1045 		if (found_key.objectid != objectid)
1046 			*offset = 0;
1047 		else {
1048 			chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
1049 					       struct btrfs_chunk);
1050 			*offset = found_key.offset +
1051 				btrfs_chunk_length(path->nodes[0], chunk);
1052 		}
1053 	}
1054 	ret = 0;
1055 error:
1056 	btrfs_free_path(path);
1057 	return ret;
1058 }
1059 
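/* next unused device id: one more than the highest devid in the chunk tree */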
1060 static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
1061 {
1062 	int ret;
1063 	struct btrfs_key key;
1064 	struct btrfs_key found_key;
1065 	struct btrfs_path *path;
1066 
1067 	root = root->fs_info->chunk_root;
1068 
1069 	path = btrfs_alloc_path();
1070 	if (!path)
1071 		return -ENOMEM;
1072 
1073 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1074 	key.type = BTRFS_DEV_ITEM_KEY;
1075 	key.offset = (u64)-1;
1076 
1077 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1078 	if (ret < 0)
1079 		goto error;
1080 
1081 	BUG_ON(ret == 0);
1082 
1083 	ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
1084 				  BTRFS_DEV_ITEM_KEY);
1085 	if (ret) {
1086 		*objectid = 1;
1087 	} else {
1088 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1089 				      path->slots[0]);
1090 		*objectid = found_key.offset + 1;
1091 	}
1092 	ret = 0;
1093 error:
1094 	btrfs_free_path(path);
1095 	return ret;
1096 }
1097 
1098 /*
1099  * the device information is stored in the chunk root
1100  * the btrfs_device struct should be fully filled in
1101  */
1102 int btrfs_add_device(struct btrfs_trans_handle *trans,
1103 		     struct btrfs_root *root,
1104 		     struct btrfs_device *device)
1105 {
1106 	int ret;
1107 	struct btrfs_path *path;
1108 	struct btrfs_dev_item *dev_item;
1109 	struct extent_buffer *leaf;
1110 	struct btrfs_key key;
1111 	unsigned long ptr;
1112 
1113 	root = root->fs_info->chunk_root;
1114 
1115 	path = btrfs_alloc_path();
1116 	if (!path)
1117 		return -ENOMEM;
1118 
1119 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1120 	key.type = BTRFS_DEV_ITEM_KEY;
1121 	key.offset = device->devid;
1122 
1123 	ret = btrfs_insert_empty_item(trans, root, path, &key,
1124 				      sizeof(*dev_item));
1125 	if (ret)
1126 		goto out;
1127 
1128 	leaf = path->nodes[0];
1129 	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1130 
1131 	btrfs_set_device_id(leaf, dev_item, device->devid);
1132 	btrfs_set_device_generation(leaf, dev_item, 0);
1133 	btrfs_set_device_type(leaf, dev_item, device->type);
1134 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1135 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1136 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1137 	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
1138 	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1139 	btrfs_set_device_group(leaf, dev_item, 0);
1140 	btrfs_set_device_seek_speed(leaf, dev_item, 0);
1141 	btrfs_set_device_bandwidth(leaf, dev_item, 0);
1142 	btrfs_set_device_start_offset(leaf, dev_item, 0);
1143 
1144 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
1145 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1146 	ptr = (unsigned long)btrfs_device_fsid(dev_item);
1147 	write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
1148 	btrfs_mark_buffer_dirty(leaf);
1149 
1150 	ret = 0;
1151 out:
1152 	btrfs_free_path(path);
1153 	return ret;
1154 }
1155 
1156 static int btrfs_rm_dev_item(struct btrfs_root *root,
1157 			     struct btrfs_device *device)
1158 {
1159 	int ret;
1160 	struct btrfs_path *path;
1161 	struct btrfs_key key;
1162 	struct btrfs_trans_handle *trans;
1163 
1164 	root = root->fs_info->chunk_root;
1165 
1166 	path = btrfs_alloc_path();
1167 	if (!path)
1168 		return -ENOMEM;
1169 
1170 	trans = btrfs_start_transaction(root, 0);
1171 	if (IS_ERR(trans)) {
1172 		btrfs_free_path(path);
1173 		return PTR_ERR(trans);
1174 	}
1175 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1176 	key.type = BTRFS_DEV_ITEM_KEY;
1177 	key.offset = device->devid;
1178 	lock_chunks(root);
1179 
1180 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1181 	if (ret < 0)
1182 		goto out;
1183 
1184 	if (ret > 0) {
1185 		ret = -ENOENT;
1186 		goto out;
1187 	}
1188 
1189 	ret = btrfs_del_item(trans, root, path);
1190 	if (ret)
1191 		goto out;
1192 out:
1193 	btrfs_free_path(path);
1194 	unlock_chunks(root);
1195 	btrfs_commit_transaction(trans, root);
1196 	return ret;
1197 }
1198 
1199 int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1200 {
1201 	struct btrfs_device *device;
1202 	struct btrfs_device *next_device;
1203 	struct block_device *bdev;
1204 	struct buffer_head *bh = NULL;
1205 	struct btrfs_super_block *disk_super;
1206 	u64 all_avail;
1207 	u64 devid;
1208 	u64 num_devices;
1209 	u8 *dev_uuid;
1210 	int ret = 0;
1211 
1212 	mutex_lock(&uuid_mutex);
1213 	mutex_lock(&root->fs_info->volume_mutex);
1214 
1215 	all_avail = root->fs_info->avail_data_alloc_bits |
1216 		root->fs_info->avail_system_alloc_bits |
1217 		root->fs_info->avail_metadata_alloc_bits;
1218 
1219 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
1220 	    root->fs_info->fs_devices->num_devices <= 4) {
1221 		printk(KERN_ERR "btrfs: unable to go below four devices "
1222 		       "on raid10\n");
1223 		ret = -EINVAL;
1224 		goto out;
1225 	}
1226 
1227 	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
1228 	    root->fs_info->fs_devices->num_devices <= 2) {
1229 		printk(KERN_ERR "btrfs: unable to go below two "
1230 		       "devices on raid1\n");
1231 		ret = -EINVAL;
1232 		goto out;
1233 	}
1234 
1235 	if (strcmp(device_path, "missing") == 0) {
1236 		struct list_head *devices;
1237 		struct btrfs_device *tmp;
1238 
1239 		device = NULL;
1240 		devices = &root->fs_info->fs_devices->devices;
1241 		mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1242 		list_for_each_entry(tmp, devices, dev_list) {
1243 			if (tmp->in_fs_metadata && !tmp->bdev) {
1244 				device = tmp;
1245 				break;
1246 			}
1247 		}
1248 		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1249 		bdev = NULL;
1250 		bh = NULL;
1251 		disk_super = NULL;
1252 		if (!device) {
1253 			printk(KERN_ERR "btrfs: no missing devices found to "
1254 			       "remove\n");
1255 			goto out;
1256 		}
1257 	} else {
1258 		bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
1259 					  root->fs_info->bdev_holder);
1260 		if (IS_ERR(bdev)) {
1261 			ret = PTR_ERR(bdev);
1262 			goto out;
1263 		}
1264 
1265 		set_blocksize(bdev, 4096);
1266 		bh = btrfs_read_dev_super(bdev);
1267 		if (!bh) {
1268 			ret = -EINVAL;
1269 			goto error_close;
1270 		}
1271 		disk_super = (struct btrfs_super_block *)bh->b_data;
1272 		devid = btrfs_stack_device_id(&disk_super->dev_item);
1273 		dev_uuid = disk_super->dev_item.uuid;
1274 		device = btrfs_find_device(root, devid, dev_uuid,
1275 					   disk_super->fsid);
1276 		if (!device) {
1277 			ret = -ENOENT;
1278 			goto error_brelse;
1279 		}
1280 	}
1281 
1282 	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1283 		printk(KERN_ERR "btrfs: unable to remove the only writeable "
1284 		       "device\n");
1285 		ret = -EINVAL;
1286 		goto error_brelse;
1287 	}
1288 
1289 	if (device->writeable) {
1290 		list_del_init(&device->dev_alloc_list);
1291 		root->fs_info->fs_devices->rw_devices--;
1292 	}
1293 
1294 	ret = btrfs_shrink_device(device, 0);
1295 	if (ret)
1296 		goto error_undo;
1297 
1298 	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1299 	if (ret)
1300 		goto error_undo;
1301 
1302 	device->in_fs_metadata = 0;
1303 
1304 	/*
1305 	 * the device list mutex makes sure that we don't change
1306 	 * the device list while someone else is writing out all
1307 	 * the device supers.
1308 	 */
1309 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1310 	list_del_init(&device->dev_list);
1311 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1312 
1313 	device->fs_devices->num_devices--;
1314 
1315 	if (device->missing)
1316 		root->fs_info->fs_devices->missing_devices--;
1317 
1318 	next_device = list_entry(root->fs_info->fs_devices->devices.next,
1319 				 struct btrfs_device, dev_list);
1320 	if (device->bdev == root->fs_info->sb->s_bdev)
1321 		root->fs_info->sb->s_bdev = next_device->bdev;
1322 	if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1323 		root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1324 
1325 	if (device->bdev) {
1326 		blkdev_put(device->bdev, device->mode);
1327 		device->bdev = NULL;
1328 		device->fs_devices->open_devices--;
1329 	}
1330 
1331 	num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
1332 	btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
1333 
1334 	if (device->fs_devices->open_devices == 0) {
1335 		struct btrfs_fs_devices *fs_devices;
1336 		fs_devices = root->fs_info->fs_devices;
1337 		while (fs_devices) {
1338 			if (fs_devices->seed == device->fs_devices)
1339 				break;
1340 			fs_devices = fs_devices->seed;
1341 		}
1342 		fs_devices->seed = device->fs_devices->seed;
1343 		device->fs_devices->seed = NULL;
1344 		__btrfs_close_devices(device->fs_devices);
1345 		free_fs_devices(device->fs_devices);
1346 	}
1347 
1348 	/*
1349 	 * at this point, the device is zero sized.  We want to
1350 	 * remove it from the devices list and zero out the old super
1351 	 */
1352 	if (device->writeable) {
1353 		/* make sure this device isn't detected as part of
1354 		 * the FS anymore
1355 		 */
1356 		memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1357 		set_buffer_dirty(bh);
1358 		sync_dirty_buffer(bh);
1359 	}
1360 
1361 	kfree(device->name);
1362 	kfree(device);
1363 	ret = 0;
1364 
1365 error_brelse:
1366 	brelse(bh);
1367 error_close:
1368 	if (bdev)
1369 		blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1370 out:
1371 	mutex_unlock(&root->fs_info->volume_mutex);
1372 	mutex_unlock(&uuid_mutex);
1373 	return ret;
1374 error_undo:
1375 	if (device->writeable) {
1376 		list_add(&device->dev_alloc_list,
1377 			 &root->fs_info->fs_devices->alloc_list);
1378 		root->fs_info->fs_devices->rw_devices++;
1379 	}
1380 	goto error_brelse;
1381 }
1382 
1383 /*
1384  * does all the dirty work required for changing the file system's UUID.
1385  */
1386 static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1387 				struct btrfs_root *root)
1388 {
1389 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1390 	struct btrfs_fs_devices *old_devices;
1391 	struct btrfs_fs_devices *seed_devices;
1392 	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
1393 	struct btrfs_device *device;
1394 	u64 super_flags;
1395 
1396 	BUG_ON(!mutex_is_locked(&uuid_mutex));
1397 	if (!fs_devices->seeding)
1398 		return -EINVAL;
1399 
1400 	seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
1401 	if (!seed_devices)
1402 		return -ENOMEM;
1403 
1404 	old_devices = clone_fs_devices(fs_devices);
1405 	if (IS_ERR(old_devices)) {
1406 		kfree(seed_devices);
1407 		return PTR_ERR(old_devices);
1408 	}
1409 
1410 	list_add(&old_devices->list, &fs_uuids);
1411 
1412 	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1413 	seed_devices->opened = 1;
1414 	INIT_LIST_HEAD(&seed_devices->devices);
1415 	INIT_LIST_HEAD(&seed_devices->alloc_list);
1416 	mutex_init(&seed_devices->device_list_mutex);
1417 	list_splice_init(&fs_devices->devices, &seed_devices->devices);
1418 	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1419 	list_for_each_entry(device, &seed_devices->devices, dev_list) {
1420 		device->fs_devices = seed_devices;
1421 	}
1422 
1423 	fs_devices->seeding = 0;
1424 	fs_devices->num_devices = 0;
1425 	fs_devices->open_devices = 0;
1426 	fs_devices->seed = seed_devices;
1427 
1428 	generate_random_uuid(fs_devices->fsid);
1429 	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1430 	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1431 	super_flags = btrfs_super_flags(disk_super) &
1432 		      ~BTRFS_SUPER_FLAG_SEEDING;
1433 	btrfs_set_super_flags(disk_super, super_flags);
1434 
1435 	return 0;
1436 }
1437 
1438 /*
1439  * store the expected generation for seed devices in device items.
1440  */
1441 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
1442 			       struct btrfs_root *root)
1443 {
1444 	struct btrfs_path *path;
1445 	struct extent_buffer *leaf;
1446 	struct btrfs_dev_item *dev_item;
1447 	struct btrfs_device *device;
1448 	struct btrfs_key key;
1449 	u8 fs_uuid[BTRFS_UUID_SIZE];
1450 	u8 dev_uuid[BTRFS_UUID_SIZE];
1451 	u64 devid;
1452 	int ret;
1453 
1454 	path = btrfs_alloc_path();
1455 	if (!path)
1456 		return -ENOMEM;
1457 
1458 	root = root->fs_info->chunk_root;
1459 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1460 	key.offset = 0;
1461 	key.type = BTRFS_DEV_ITEM_KEY;
1462 
1463 	while (1) {
1464 		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1465 		if (ret < 0)
1466 			goto error;
1467 
1468 		leaf = path->nodes[0];
1469 next_slot:
1470 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1471 			ret = btrfs_next_leaf(root, path);
1472 			if (ret > 0)
1473 				break;
1474 			if (ret < 0)
1475 				goto error;
1476 			leaf = path->nodes[0];
1477 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1478 			btrfs_release_path(root, path);
1479 			continue;
1480 		}
1481 
1482 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1483 		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
1484 		    key.type != BTRFS_DEV_ITEM_KEY)
1485 			break;
1486 
1487 		dev_item = btrfs_item_ptr(leaf, path->slots[0],
1488 					  struct btrfs_dev_item);
1489 		devid = btrfs_device_id(leaf, dev_item);
1490 		read_extent_buffer(leaf, dev_uuid,
1491 				   (unsigned long)btrfs_device_uuid(dev_item),
1492 				   BTRFS_UUID_SIZE);
1493 		read_extent_buffer(leaf, fs_uuid,
1494 				   (unsigned long)btrfs_device_fsid(dev_item),
1495 				   BTRFS_UUID_SIZE);
1496 		device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
1497 		BUG_ON(!device);
1498 
1499 		if (device->fs_devices->seeding) {
1500 			btrfs_set_device_generation(leaf, dev_item,
1501 						    device->generation);
1502 			btrfs_mark_buffer_dirty(leaf);
1503 		}
1504 
1505 		path->slots[0]++;
1506 		goto next_slot;
1507 	}
1508 	ret = 0;
1509 error:
1510 	btrfs_free_path(path);
1511 	return ret;
1512 }
1513 
1514 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1515 {
1516 	struct btrfs_trans_handle *trans;
1517 	struct btrfs_device *device;
1518 	struct block_device *bdev;
1519 	struct list_head *devices;
1520 	struct super_block *sb = root->fs_info->sb;
1521 	u64 total_bytes;
1522 	int seeding_dev = 0;
1523 	int ret = 0;
1524 
1525 	if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1526 		return -EINVAL;
1527 
1528 	bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
1529 				  root->fs_info->bdev_holder);
1530 	if (IS_ERR(bdev))
1531 		return PTR_ERR(bdev);
1532 
1533 	if (root->fs_info->fs_devices->seeding) {
1534 		seeding_dev = 1;
1535 		down_write(&sb->s_umount);
1536 		mutex_lock(&uuid_mutex);
1537 	}
1538 
1539 	filemap_write_and_wait(bdev->bd_inode->i_mapping);
1540 	mutex_lock(&root->fs_info->volume_mutex);
1541 
1542 	devices = &root->fs_info->fs_devices->devices;
1543 	/*
1544 	 * we have the volume lock, so we don't need the extra
1545 	 * device list mutex while reading the list here.
1546 	 */
1547 	list_for_each_entry(device, devices, dev_list) {
1548 		if (device->bdev == bdev) {
1549 			ret = -EEXIST;
1550 			goto error;
1551 		}
1552 	}
1553 
1554 	device = kzalloc(sizeof(*device), GFP_NOFS);
1555 	if (!device) {
1556 		/* we can safely leave the fs_devices entry around */
1557 		ret = -ENOMEM;
1558 		goto error;
1559 	}
1560 
1561 	device->name = kstrdup(device_path, GFP_NOFS);
1562 	if (!device->name) {
1563 		kfree(device);
1564 		ret = -ENOMEM;
1565 		goto error;
1566 	}
1567 
1568 	ret = find_next_devid(root, &device->devid);
1569 	if (ret) {
1570 		kfree(device->name);
1571 		kfree(device);
1572 		goto error;
1573 	}
1574 
1575 	trans = btrfs_start_transaction(root, 0);
1576 	if (IS_ERR(trans)) {
1577 		kfree(device->name);
1578 		kfree(device);
1579 		ret = PTR_ERR(trans);
1580 		goto error;
1581 	}
1582 
1583 	lock_chunks(root);
1584 
1585 	device->writeable = 1;
1586 	device->work.func = pending_bios_fn;
1587 	generate_random_uuid(device->uuid);
1588 	spin_lock_init(&device->io_lock);
1589 	device->generation = trans->transid;
1590 	device->io_width = root->sectorsize;
1591 	device->io_align = root->sectorsize;
1592 	device->sector_size = root->sectorsize;
1593 	device->total_bytes = i_size_read(bdev->bd_inode);
1594 	device->disk_total_bytes = device->total_bytes;
1595 	device->dev_root = root->fs_info->dev_root;
1596 	device->bdev = bdev;
1597 	device->in_fs_metadata = 1;
1598 	device->mode = FMODE_EXCL;
1599 	set_blocksize(device->bdev, 4096);
1600 
1601 	if (seeding_dev) {
1602 		sb->s_flags &= ~MS_RDONLY;
1603 		ret = btrfs_prepare_sprout(trans, root);
1604 		BUG_ON(ret);
1605 	}
1606 
1607 	device->fs_devices = root->fs_info->fs_devices;
1608 
1609 	/*
1610 	 * we don't want write_supers to jump in here with our device
1611 	 * half setup
1612 	 */
1613 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1614 	list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
1615 	list_add(&device->dev_alloc_list,
1616 		 &root->fs_info->fs_devices->alloc_list);
1617 	root->fs_info->fs_devices->num_devices++;
1618 	root->fs_info->fs_devices->open_devices++;
1619 	root->fs_info->fs_devices->rw_devices++;
1620 	root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1621 
1622 	if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1623 		root->fs_info->fs_devices->rotating = 1;
1624 
1625 	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
1626 	btrfs_set_super_total_bytes(&root->fs_info->super_copy,
1627 				    total_bytes + device->total_bytes);
1628 
1629 	total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
1630 	btrfs_set_super_num_devices(&root->fs_info->super_copy,
1631 				    total_bytes + 1);
1632 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1633 
1634 	if (seeding_dev) {
1635 		ret = init_first_rw_device(trans, root, device);
1636 		BUG_ON(ret);
1637 		ret = btrfs_finish_sprout(trans, root);
1638 		BUG_ON(ret);
1639 	} else {
1640 		ret = btrfs_add_device(trans, root, device);
1641 	}
1642 
1643 	/*
1644 	 * we've got more storage, clear any full flags on the space
1645 	 * infos
1646 	 */
1647 	btrfs_clear_space_info_full(root->fs_info);
1648 
1649 	unlock_chunks(root);
1650 	btrfs_commit_transaction(trans, root);
1651 
1652 	if (seeding_dev) {
1653 		mutex_unlock(&uuid_mutex);
1654 		up_write(&sb->s_umount);
1655 
1656 		ret = btrfs_relocate_sys_chunks(root);
1657 		BUG_ON(ret);
1658 	}
1659 out:
1660 	mutex_unlock(&root->fs_info->volume_mutex);
1661 	return ret;
1662 error:
1663 	blkdev_put(bdev, FMODE_EXCL);
1664 	if (seeding_dev) {
1665 		mutex_unlock(&uuid_mutex);
1666 		up_write(&sb->s_umount);
1667 	}
1668 	goto out;
1669 }
1670 
1671 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
1672 					struct btrfs_device *device)
1673 {
1674 	int ret;
1675 	struct btrfs_path *path;
1676 	struct btrfs_root *root;
1677 	struct btrfs_dev_item *dev_item;
1678 	struct extent_buffer *leaf;
1679 	struct btrfs_key key;
1680 
1681 	root = device->dev_root->fs_info->chunk_root;
1682 
1683 	path = btrfs_alloc_path();
1684 	if (!path)
1685 		return -ENOMEM;
1686 
1687 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1688 	key.type = BTRFS_DEV_ITEM_KEY;
1689 	key.offset = device->devid;
1690 
1691 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
1692 	if (ret < 0)
1693 		goto out;
1694 
1695 	if (ret > 0) {
1696 		ret = -ENOENT;
1697 		goto out;
1698 	}
1699 
1700 	leaf = path->nodes[0];
1701 	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1702 
1703 	btrfs_set_device_id(leaf, dev_item, device->devid);
1704 	btrfs_set_device_type(leaf, dev_item, device->type);
1705 	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1706 	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1707 	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1708 	btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes);
1709 	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
1710 	btrfs_mark_buffer_dirty(leaf);
1711 
1712 out:
1713 	btrfs_free_path(path);
1714 	return ret;
1715 }
1716 
1717 static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1718 		      struct btrfs_device *device, u64 new_size)
1719 {
1720 	struct btrfs_super_block *super_copy =
1721 		&device->dev_root->fs_info->super_copy;
1722 	u64 old_total = btrfs_super_total_bytes(super_copy);
1723 	u64 diff = new_size - device->total_bytes;
1724 
1725 	if (!device->writeable)
1726 		return -EACCES;
1727 	if (new_size <= device->total_bytes)
1728 		return -EINVAL;
1729 
1730 	btrfs_set_super_total_bytes(super_copy, old_total + diff);
1731 	device->fs_devices->total_rw_bytes += diff;
1732 
1733 	device->total_bytes = new_size;
1734 	device->disk_total_bytes = new_size;
1735 	btrfs_clear_space_info_full(device->dev_root->fs_info);
1736 
1737 	return btrfs_update_device(trans, device);
1738 }
1739 
1740 int btrfs_grow_device(struct btrfs_trans_handle *trans,
1741 		      struct btrfs_device *device, u64 new_size)
1742 {
1743 	int ret;
1744 	lock_chunks(device->dev_root);
1745 	ret = __btrfs_grow_device(trans, device, new_size);
1746 	unlock_chunks(device->dev_root);
1747 	return ret;
1748 }
1749 
1750 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1751 			    struct btrfs_root *root,
1752 			    u64 chunk_tree, u64 chunk_objectid,
1753 			    u64 chunk_offset)
1754 {
1755 	int ret;
1756 	struct btrfs_path *path;
1757 	struct btrfs_key key;
1758 
1759 	root = root->fs_info->chunk_root;
1760 	path = btrfs_alloc_path();
1761 	if (!path)
1762 		return -ENOMEM;
1763 
1764 	key.objectid = chunk_objectid;
1765 	key.offset = chunk_offset;
1766 	key.type = BTRFS_CHUNK_ITEM_KEY;
1767 
1768 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1769 	BUG_ON(ret);
1770 
1771 	ret = btrfs_del_item(trans, root, path);
1772 	BUG_ON(ret);
1773 
1774 	btrfs_free_path(path);
1775 	return 0;
1776 }
1777 
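/* remove the matching chunk entry from the sys_chunk_array in the in-memory super block copy */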
1778 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1779 			chunk_offset)
1780 {
1781 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
1782 	struct btrfs_disk_key *disk_key;
1783 	struct btrfs_chunk *chunk;
1784 	u8 *ptr;
1785 	int ret = 0;
1786 	u32 num_stripes;
1787 	u32 array_size;
1788 	u32 len = 0;
1789 	u32 cur;
1790 	struct btrfs_key key;
1791 
1792 	array_size = btrfs_super_sys_array_size(super_copy);
1793 
1794 	ptr = super_copy->sys_chunk_array;
1795 	cur = 0;
1796 
1797 	while (cur < array_size) {
1798 		disk_key = (struct btrfs_disk_key *)ptr;
1799 		btrfs_disk_key_to_cpu(&key, disk_key);
1800 
1801 		len = sizeof(*disk_key);
1802 
1803 		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
1804 			chunk = (struct btrfs_chunk *)(ptr + len);
1805 			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
1806 			len += btrfs_chunk_item_size(num_stripes);
1807 		} else {
1808 			ret = -EIO;
1809 			break;
1810 		}
1811 		if (key.objectid == chunk_objectid &&
1812 		    key.offset == chunk_offset) {
1813 			memmove(ptr, ptr + len, array_size - (cur + len));
1814 			array_size -= len;
1815 			btrfs_set_super_sys_array_size(super_copy, array_size);
1816 		} else {
1817 			ptr += len;
1818 			cur += len;
1819 		}
1820 	}
1821 	return ret;
1822 }
1823 
1824 static int btrfs_relocate_chunk(struct btrfs_root *root,
1825 			 u64 chunk_tree, u64 chunk_objectid,
1826 			 u64 chunk_offset)
1827 {
1828 	struct extent_map_tree *em_tree;
1829 	struct btrfs_root *extent_root;
1830 	struct btrfs_trans_handle *trans;
1831 	struct extent_map *em;
1832 	struct map_lookup *map;
1833 	int ret;
1834 	int i;
1835 
1836 	root = root->fs_info->chunk_root;
1837 	extent_root = root->fs_info->extent_root;
1838 	em_tree = &root->fs_info->mapping_tree.map_tree;
1839 
1840 	ret = btrfs_can_relocate(extent_root, chunk_offset);
1841 	if (ret)
1842 		return -ENOSPC;
1843 
1844 	/* step one, relocate all the extents inside this chunk */
1845 	ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1846 	if (ret)
1847 		return ret;
1848 
1849 	trans = btrfs_start_transaction(root, 0);
1850 	BUG_ON(IS_ERR(trans));
1851 
1852 	lock_chunks(root);
1853 
1854 	/*
1855 	 * step two, delete the device extents and the
1856 	 * chunk tree entries
1857 	 */
1858 	read_lock(&em_tree->lock);
1859 	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
1860 	read_unlock(&em_tree->lock);
1861 
1862 	BUG_ON(em->start > chunk_offset ||
1863 	       em->start + em->len < chunk_offset);
1864 	map = (struct map_lookup *)em->bdev;
1865 
1866 	for (i = 0; i < map->num_stripes; i++) {
1867 		ret = btrfs_free_dev_extent(trans, map->stripes[i].dev,
1868 					    map->stripes[i].physical);
1869 		BUG_ON(ret);
1870 
1871 		if (map->stripes[i].dev) {
1872 			ret = btrfs_update_device(trans, map->stripes[i].dev);
1873 			BUG_ON(ret);
1874 		}
1875 	}
1876 	ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
1877 			       chunk_offset);
1878 
1879 	BUG_ON(ret);
1880 
1881 	trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
1882 
1883 	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1884 		ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1885 		BUG_ON(ret);
1886 	}
1887 
1888 	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset);
1889 	BUG_ON(ret);
1890 
1891 	write_lock(&em_tree->lock);
1892 	remove_extent_mapping(em_tree, em);
1893 	write_unlock(&em_tree->lock);
1894 
1895 	kfree(map);
1896 	em->bdev = NULL;
1897 
1898 	/* once for the tree */
1899 	free_extent_map(em);
1900 	/* once for us */
1901 	free_extent_map(em);
1902 
1903 	unlock_chunks(root);
1904 	btrfs_end_transaction(trans, root);
1905 	return 0;
1906 }
1907 
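/*
 * relocate every SYSTEM chunk; chunks that fail with -ENOSPC on the first
 * pass are retried once after the others have been moved
 */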
1908 static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
1909 {
1910 	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
1911 	struct btrfs_path *path;
1912 	struct extent_buffer *leaf;
1913 	struct btrfs_chunk *chunk;
1914 	struct btrfs_key key;
1915 	struct btrfs_key found_key;
1916 	u64 chunk_tree = chunk_root->root_key.objectid;
1917 	u64 chunk_type;
1918 	bool retried = false;
1919 	int failed = 0;
1920 	int ret;
1921 
1922 	path = btrfs_alloc_path();
1923 	if (!path)
1924 		return -ENOMEM;
1925 
1926 again:
1927 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
1928 	key.offset = (u64)-1;
1929 	key.type = BTRFS_CHUNK_ITEM_KEY;
1930 
1931 	while (1) {
1932 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
1933 		if (ret < 0)
1934 			goto error;
1935 		BUG_ON(ret == 0);
1936 
1937 		ret = btrfs_previous_item(chunk_root, path, key.objectid,
1938 					  key.type);
1939 		if (ret < 0)
1940 			goto error;
1941 		if (ret > 0)
1942 			break;
1943 
1944 		leaf = path->nodes[0];
1945 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1946 
1947 		chunk = btrfs_item_ptr(leaf, path->slots[0],
1948 				       struct btrfs_chunk);
1949 		chunk_type = btrfs_chunk_type(leaf, chunk);
1950 		btrfs_release_path(chunk_root, path);
1951 
1952 		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
1953 			ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
1954 						   found_key.objectid,
1955 						   found_key.offset);
1956 			if (ret == -ENOSPC)
1957 				failed++;
1958 			else if (ret)
1959 				BUG();
1960 		}
1961 
1962 		if (found_key.offset == 0)
1963 			break;
1964 		key.offset = found_key.offset - 1;
1965 	}
1966 	ret = 0;
1967 	if (failed && !retried) {
1968 		failed = 0;
1969 		retried = true;
1970 		goto again;
1971 	} else if (failed && retried) {
1972 		WARN_ON(1);
1973 		ret = -ENOSPC;
1974 	}
1975 error:
1976 	btrfs_free_path(path);
1977 	return ret;
1978 }
1979 
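/* factor is expressed in tenths: return num * factor / 10 */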
1980 static u64 div_factor(u64 num, int factor)
1981 {
1982 	if (factor == 10)
1983 		return num;
1984 	num *= factor;
1985 	do_div(num, 10);
1986 	return num;
1987 }
1988 
1989 int btrfs_balance(struct btrfs_root *dev_root)
1990 {
1991 	int ret;
1992 	struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
1993 	struct btrfs_device *device;
1994 	u64 old_size;
1995 	u64 size_to_free;
1996 	struct btrfs_path *path;
1997 	struct btrfs_key key;
1998 	struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
1999 	struct btrfs_trans_handle *trans;
2000 	struct btrfs_key found_key;
2001 
2002 	if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
2003 		return -EROFS;
2004 
2005 	if (!capable(CAP_SYS_ADMIN))
2006 		return -EPERM;
2007 
2008 	mutex_lock(&dev_root->fs_info->volume_mutex);
2009 	dev_root = dev_root->fs_info->dev_root;
2010 
2011 	/* step one, make some room on all the devices */
2012 	list_for_each_entry(device, devices, dev_list) {
2013 		old_size = device->total_bytes;
2014 		size_to_free = div_factor(old_size, 1);
2015 		size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
2016 		if (!device->writeable ||
2017 		    device->total_bytes - device->bytes_used > size_to_free)
2018 			continue;
2019 
2020 		ret = btrfs_shrink_device(device, old_size - size_to_free);
2021 		if (ret == -ENOSPC)
2022 			break;
2023 		BUG_ON(ret);
2024 
2025 		trans = btrfs_start_transaction(dev_root, 0);
2026 		BUG_ON(IS_ERR(trans));
2027 
2028 		ret = btrfs_grow_device(trans, device, old_size);
2029 		BUG_ON(ret);
2030 
2031 		btrfs_end_transaction(trans, dev_root);
2032 	}
2033 
2034 	/* step two, relocate all the chunks */
2035 	path = btrfs_alloc_path();
2036 	BUG_ON(!path);
2037 
2038 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2039 	key.offset = (u64)-1;
2040 	key.type = BTRFS_CHUNK_ITEM_KEY;
2041 
2042 	while (1) {
2043 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2044 		if (ret < 0)
2045 			goto error;
2046 
2047 		/*
2048 		 * this shouldn't happen, it means the last relocate
2049 		 * failed
2050 		 */
2051 		if (ret == 0)
2052 			break;
2053 
2054 		ret = btrfs_previous_item(chunk_root, path, 0,
2055 					  BTRFS_CHUNK_ITEM_KEY);
2056 		if (ret)
2057 			break;
2058 
2059 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2060 				      path->slots[0]);
2061 		if (found_key.objectid != key.objectid)
2062 			break;
2063 
2064 		/* chunk zero is special */
2065 		if (found_key.offset == 0)
2066 			break;
2067 
2068 		btrfs_release_path(chunk_root, path);
2069 		ret = btrfs_relocate_chunk(chunk_root,
2070 					   chunk_root->root_key.objectid,
2071 					   found_key.objectid,
2072 					   found_key.offset);
2073 		BUG_ON(ret && ret != -ENOSPC);
2074 		key.offset = found_key.offset - 1;
2075 	}
2076 	ret = 0;
2077 error:
2078 	btrfs_free_path(path);
2079 	mutex_unlock(&dev_root->fs_info->volume_mutex);
2080 	return ret;
2081 }
2082 
2083 /*
2084  * shrinking a device means finding all of the device extents past
2085  * the new size, and then following the back refs to the chunks.
2086  * The chunk relocation code actually frees the device extent
2087  */
2088 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2089 {
2090 	struct btrfs_trans_handle *trans;
2091 	struct btrfs_root *root = device->dev_root;
2092 	struct btrfs_dev_extent *dev_extent = NULL;
2093 	struct btrfs_path *path;
2094 	u64 length;
2095 	u64 chunk_tree;
2096 	u64 chunk_objectid;
2097 	u64 chunk_offset;
2098 	int ret;
2099 	int slot;
2100 	int failed = 0;
2101 	bool retried = false;
2102 	struct extent_buffer *l;
2103 	struct btrfs_key key;
2104 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
2105 	u64 old_total = btrfs_super_total_bytes(super_copy);
2106 	u64 old_size = device->total_bytes;
2107 	u64 diff = device->total_bytes - new_size;
2108 
2109 	if (new_size >= device->total_bytes)
2110 		return -EINVAL;
2111 
2112 	path = btrfs_alloc_path();
2113 	if (!path)
2114 		return -ENOMEM;
2115 
2116 	path->reada = 2;
2117 
2118 	lock_chunks(root);
2119 
2120 	device->total_bytes = new_size;
2121 	if (device->writeable)
2122 		device->fs_devices->total_rw_bytes -= diff;
2123 	unlock_chunks(root);
2124 
2125 again:
2126 	key.objectid = device->devid;
2127 	key.offset = (u64)-1;
2128 	key.type = BTRFS_DEV_EXTENT_KEY;
2129 
2130 	while (1) {
2131 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2132 		if (ret < 0)
2133 			goto done;
2134 
2135 		ret = btrfs_previous_item(root, path, 0, key.type);
2136 		if (ret < 0)
2137 			goto done;
2138 		if (ret) {
2139 			ret = 0;
2140 			btrfs_release_path(root, path);
2141 			break;
2142 		}
2143 
2144 		l = path->nodes[0];
2145 		slot = path->slots[0];
2146 		btrfs_item_key_to_cpu(l, &key, path->slots[0]);
2147 
2148 		if (key.objectid != device->devid) {
2149 			btrfs_release_path(root, path);
2150 			break;
2151 		}
2152 
2153 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2154 		length = btrfs_dev_extent_length(l, dev_extent);
2155 
2156 		if (key.offset + length <= new_size) {
2157 			btrfs_release_path(root, path);
2158 			break;
2159 		}
2160 
2161 		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2162 		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2163 		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2164 		btrfs_release_path(root, path);
2165 
2166 		ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
2167 					   chunk_offset);
2168 		if (ret && ret != -ENOSPC)
2169 			goto done;
2170 		if (ret == -ENOSPC)
2171 			failed++;
2172 		key.offset -= 1;
2173 	}
2174 
2175 	if (failed && !retried) {
2176 		failed = 0;
2177 		retried = true;
2178 		goto again;
2179 	} else if (failed && retried) {
2180 		ret = -ENOSPC;
2181 		lock_chunks(root);
2182 
2183 		device->total_bytes = old_size;
2184 		if (device->writeable)
2185 			device->fs_devices->total_rw_bytes += diff;
2186 		unlock_chunks(root);
2187 		goto done;
2188 	}
2189 
2190 	/* Shrinking succeeded, else we would be at "done". */
2191 	trans = btrfs_start_transaction(root, 0);
2192 	if (IS_ERR(trans)) {
2193 		ret = PTR_ERR(trans);
2194 		goto done;
2195 	}
2196 
2197 	lock_chunks(root);
2198 
2199 	device->disk_total_bytes = new_size;
2200 	/* Now btrfs_update_device() will change the on-disk size. */
2201 	ret = btrfs_update_device(trans, device);
2202 	if (ret) {
2203 		unlock_chunks(root);
2204 		btrfs_end_transaction(trans, root);
2205 		goto done;
2206 	}
2207 	WARN_ON(diff > old_total);
2208 	btrfs_set_super_total_bytes(super_copy, old_total - diff);
2209 	unlock_chunks(root);
2210 	btrfs_end_transaction(trans, root);
2211 done:
2212 	btrfs_free_path(path);
2213 	return ret;
2214 }
2215 
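/*
 * Append a chunk item, preceded by its disk key, to the in-memory copy of
 * the superblock's sys_chunk_array and bump the recorded array size.
 */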
2216 static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
2217 			   struct btrfs_root *root,
2218 			   struct btrfs_key *key,
2219 			   struct btrfs_chunk *chunk, int item_size)
2220 {
2221 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
2222 	struct btrfs_disk_key disk_key;
2223 	u32 array_size;
2224 	u8 *ptr;
2225 
2226 	array_size = btrfs_super_sys_array_size(super_copy);
2227 	if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
2228 		return -EFBIG;
2229 
2230 	ptr = super_copy->sys_chunk_array + array_size;
2231 	btrfs_cpu_key_to_disk(&disk_key, key);
2232 	memcpy(ptr, &disk_key, sizeof(disk_key));
2233 	ptr += sizeof(disk_key);
2234 	memcpy(ptr, chunk, item_size);
2235 	item_size += sizeof(disk_key);
2236 	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
2237 	return 0;
2238 }
2239 
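/*
 * Given the size of one stripe, return how much logical address space the
 * whole chunk covers for the given block group type.
 */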
2240 static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
2241 					int num_stripes, int sub_stripes)
2242 {
2243 	if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP))
2244 		return calc_size;
2245 	else if (type & BTRFS_BLOCK_GROUP_RAID10)
2246 		return calc_size * (num_stripes / sub_stripes);
2247 	else
2248 		return calc_size * num_stripes;
2249 }
2250 
2251 /* Used to sort the devices by max_avail (descending sort) */
2252 int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
2253 {
2254 	if (((struct btrfs_device_info *)dev_info1)->max_avail >
2255 	    ((struct btrfs_device_info *)dev_info2)->max_avail)
2256 		return -1;
2257 	else if (((struct btrfs_device_info *)dev_info1)->max_avail <
2258 		 ((struct btrfs_device_info *)dev_info2)->max_avail)
2259 		return 1;
2260 	else
2261 		return 0;
2262 }
2263 
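/*
 * Work out how many stripes (and sub-stripes) a new chunk of the given type
 * gets, based on the number of writeable devices, along with the minimum
 * number of stripes that type requires.
 */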
2264 static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
2265 				 int *num_stripes, int *min_stripes,
2266 				 int *sub_stripes)
2267 {
2268 	*num_stripes = 1;
2269 	*min_stripes = 1;
2270 	*sub_stripes = 0;
2271 
2272 	if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
2273 		*num_stripes = fs_devices->rw_devices;
2274 		*min_stripes = 2;
2275 	}
2276 	if (type & (BTRFS_BLOCK_GROUP_DUP)) {
2277 		*num_stripes = 2;
2278 		*min_stripes = 2;
2279 	}
2280 	if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2281 		if (fs_devices->rw_devices < 2)
2282 			return -ENOSPC;
2283 		*num_stripes = 2;
2284 		*min_stripes = 2;
2285 	}
2286 	if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2287 		*num_stripes = fs_devices->rw_devices;
2288 		if (*num_stripes < 4)
2289 			return -ENOSPC;
2290 		*num_stripes &= ~(u32)1;
2291 		*sub_stripes = 2;
2292 		*min_stripes = 4;
2293 	}
2294 
2295 	return 0;
2296 }
2297 
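/*
 * Clamp the proposed stripe size so the chunk stays within the per-type
 * maximum chunk size (and within 10% of the writeable space), rounding the
 * result down to a multiple of BTRFS_STRIPE_LEN.
 */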
2298 static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
2299 				    u64 proposed_size, u64 type,
2300 				    int num_stripes, int small_stripe)
2301 {
2302 	int min_stripe_size = 1 * 1024 * 1024;
2303 	u64 calc_size = proposed_size;
2304 	u64 max_chunk_size = calc_size;
2305 	int ncopies = 1;
2306 
2307 	if (type & (BTRFS_BLOCK_GROUP_RAID1 |
2308 		    BTRFS_BLOCK_GROUP_DUP |
2309 		    BTRFS_BLOCK_GROUP_RAID10))
2310 		ncopies = 2;
2311 
2312 	if (type & BTRFS_BLOCK_GROUP_DATA) {
2313 		max_chunk_size = 10 * calc_size;
2314 		min_stripe_size = 64 * 1024 * 1024;
2315 	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
2316 		max_chunk_size = 256 * 1024 * 1024;
2317 		min_stripe_size = 32 * 1024 * 1024;
2318 	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2319 		calc_size = 8 * 1024 * 1024;
2320 		max_chunk_size = calc_size * 2;
2321 		min_stripe_size = 1 * 1024 * 1024;
2322 	}
2323 
2324 	/* we don't want a chunk larger than 10% of writeable space */
2325 	max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2326 			     max_chunk_size);
2327 
2328 	if (calc_size * num_stripes > max_chunk_size * ncopies) {
2329 		calc_size = max_chunk_size * ncopies;
2330 		do_div(calc_size, num_stripes);
2331 		do_div(calc_size, BTRFS_STRIPE_LEN);
2332 		calc_size *= BTRFS_STRIPE_LEN;
2333 	}
2334 
2335 	/* we don't want tiny stripes */
2336 	if (!small_stripe)
2337 		calc_size = max_t(u64, min_stripe_size, calc_size);
2338 
2339 	/*
2340 	 * we're about to do_div by the BTRFS_STRIPE_LEN so let's make sure
2341 	 * we end up with something bigger than a stripe
2342 	 */
2343 	calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
2344 
2345 	do_div(calc_size, BTRFS_STRIPE_LEN);
2346 	calc_size *= BTRFS_STRIPE_LEN;
2347 
2348 	return calc_size;
2349 }
2350 
2351 static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
2352 						      int num_stripes)
2353 {
2354 	struct map_lookup *new;
2355 	size_t len = map_lookup_size(num_stripes);
2356 
2357 	BUG_ON(map->num_stripes < num_stripes);
2358 
2359 	if (map->num_stripes == num_stripes)
2360 		return map;
2361 
2362 	new = kmalloc(len, GFP_NOFS);
2363 	if (!new) {
2364 		/* just change map->num_stripes */
2365 		map->num_stripes = num_stripes;
2366 		return map;
2367 	}
2368 
2369 	memcpy(new, map, len);
2370 	new->num_stripes = num_stripes;
2371 	kfree(map);
2372 	return new;
2373 }
2374 
2375 /*
2376  * helper to allocate device space from btrfs_device_info, in which we have
2377  * stored the max free space information of every device. It is used when we
2378  * cannot allocate chunks of the default size.
2379  *
2380  * With this helper, we can allocate a new chunk that is as large as possible.
2381  */
2382 static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
2383 				    struct btrfs_fs_devices *fs_devices,
2384 				    struct btrfs_device_info *devices,
2385 				    int nr_device, u64 type,
2386 				    struct map_lookup **map_lookup,
2387 				    int min_stripes, u64 *stripe_size)
2388 {
2389 	int i, index, sort_again = 0;
2390 	int min_devices = min_stripes;
2391 	u64 max_avail, min_free;
2392 	struct map_lookup *map = *map_lookup;
2393 	int ret;
2394 
2395 	if (nr_device < min_stripes)
2396 		return -ENOSPC;
2397 
2398 	btrfs_descending_sort_devices(devices, nr_device);
2399 
2400 	max_avail = devices[0].max_avail;
2401 	if (!max_avail)
2402 		return -ENOSPC;
2403 
2404 	for (i = 0; i < nr_device; i++) {
2405 		/*
2406 		 * if dev_offset == 0, the free space of this device is less
2407 		 * than what we need and we have not yet searched for the max
2408 		 * avail extent on this device, so do it now.
2409 		 */
2410 		if (!devices[i].dev_offset) {
2411 			ret = find_free_dev_extent(trans, devices[i].dev,
2412 						   max_avail,
2413 						   &devices[i].dev_offset,
2414 						   &devices[i].max_avail);
2415 			if (ret != 0 && ret != -ENOSPC)
2416 				return ret;
2417 			sort_again = 1;
2418 		}
2419 	}
2420 
2421 	/* we updated the max avail free extent of each device, so sort again */
2422 	if (sort_again)
2423 		btrfs_descending_sort_devices(devices, nr_device);
2424 
2425 	if (type & BTRFS_BLOCK_GROUP_DUP)
2426 		min_devices = 1;
2427 
2428 	if (!devices[min_devices - 1].max_avail)
2429 		return -ENOSPC;
2430 
2431 	max_avail = devices[min_devices - 1].max_avail;
2432 	if (type & BTRFS_BLOCK_GROUP_DUP)
2433 		do_div(max_avail, 2);
2434 
2435 	max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
2436 					     min_stripes, 1);
2437 	if (type & BTRFS_BLOCK_GROUP_DUP)
2438 		min_free = max_avail * 2;
2439 	else
2440 		min_free = max_avail;
2441 
2442 	if (min_free > devices[min_devices - 1].max_avail)
2443 		return -ENOSPC;
2444 
2445 	map = __shrink_map_lookup_stripes(map, min_stripes);
2446 	*stripe_size = max_avail;
2447 
2448 	index = 0;
2449 	for (i = 0; i < min_stripes; i++) {
2450 		map->stripes[i].dev = devices[index].dev;
2451 		map->stripes[i].physical = devices[index].dev_offset;
2452 		if (type & BTRFS_BLOCK_GROUP_DUP) {
2453 			i++;
2454 			map->stripes[i].dev = devices[index].dev;
2455 			map->stripes[i].physical = devices[index].dev_offset +
2456 						   max_avail;
2457 		}
2458 		index++;
2459 	}
2460 	*map_lookup = map;
2461 
2462 	return 0;
2463 }
2464 
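/*
 * First phase of chunk allocation: pick the stripe layout for the requested
 * type, find free extents on the writeable devices, insert the extent
 * mapping, create the block group and allocate the device extents.  The
 * chunk item itself is inserted later by __finish_chunk_alloc().
 */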
2465 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2466 			       struct btrfs_root *extent_root,
2467 			       struct map_lookup **map_ret,
2468 			       u64 *num_bytes, u64 *stripe_size,
2469 			       u64 start, u64 type)
2470 {
2471 	struct btrfs_fs_info *info = extent_root->fs_info;
2472 	struct btrfs_device *device = NULL;
2473 	struct btrfs_fs_devices *fs_devices = info->fs_devices;
2474 	struct list_head *cur;
2475 	struct map_lookup *map;
2476 	struct extent_map_tree *em_tree;
2477 	struct extent_map *em;
2478 	struct btrfs_device_info *devices_info;
2479 	struct list_head private_devs;
2480 	u64 calc_size = 1024 * 1024 * 1024;
2481 	u64 min_free;
2482 	u64 avail;
2483 	u64 dev_offset;
2484 	int num_stripes;
2485 	int min_stripes;
2486 	int sub_stripes;
2487 	int min_devices;	/* the min number of devices we need */
2488 	int i;
2489 	int ret;
2490 	int index;
2491 
2492 	if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
2493 	    (type & BTRFS_BLOCK_GROUP_DUP)) {
2494 		WARN_ON(1);
2495 		type &= ~BTRFS_BLOCK_GROUP_DUP;
2496 	}
2497 	if (list_empty(&fs_devices->alloc_list))
2498 		return -ENOSPC;
2499 
2500 	ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
2501 				    &min_stripes, &sub_stripes);
2502 	if (ret)
2503 		return ret;
2504 
2505 	devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
2506 			       GFP_NOFS);
2507 	if (!devices_info)
2508 		return -ENOMEM;
2509 
2510 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2511 	if (!map) {
2512 		ret = -ENOMEM;
2513 		goto error;
2514 	}
2515 	map->num_stripes = num_stripes;
2516 
2517 	cur = fs_devices->alloc_list.next;
2518 	index = 0;
2519 	i = 0;
2520 
2521 	calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
2522 					     num_stripes, 0);
2523 
2524 	if (type & BTRFS_BLOCK_GROUP_DUP) {
2525 		min_free = calc_size * 2;
2526 		min_devices = 1;
2527 	} else {
2528 		min_free = calc_size;
2529 		min_devices = min_stripes;
2530 	}
2531 
2532 	INIT_LIST_HEAD(&private_devs);
2533 	while (index < num_stripes) {
2534 		device = list_entry(cur, struct btrfs_device, dev_alloc_list);
2535 		BUG_ON(!device->writeable);
2536 		if (device->total_bytes > device->bytes_used)
2537 			avail = device->total_bytes - device->bytes_used;
2538 		else
2539 			avail = 0;
2540 		cur = cur->next;
2541 
2542 		if (device->in_fs_metadata && avail >= min_free) {
2543 			ret = find_free_dev_extent(trans, device, min_free,
2544 						   &devices_info[i].dev_offset,
2545 						   &devices_info[i].max_avail);
2546 			if (ret == 0) {
2547 				list_move_tail(&device->dev_alloc_list,
2548 					       &private_devs);
2549 				map->stripes[index].dev = device;
2550 				map->stripes[index].physical =
2551 						devices_info[i].dev_offset;
2552 				index++;
2553 				if (type & BTRFS_BLOCK_GROUP_DUP) {
2554 					map->stripes[index].dev = device;
2555 					map->stripes[index].physical =
2556 						devices_info[i].dev_offset +
2557 						calc_size;
2558 					index++;
2559 				}
2560 			} else if (ret != -ENOSPC)
2561 				goto error;
2562 
2563 			devices_info[i].dev = device;
2564 			i++;
2565 		} else if (device->in_fs_metadata &&
2566 			   avail >= BTRFS_STRIPE_LEN) {
2567 			devices_info[i].dev = device;
2568 			devices_info[i].max_avail = avail;
2569 			i++;
2570 		}
2571 
2572 		if (cur == &fs_devices->alloc_list)
2573 			break;
2574 	}
2575 
2576 	list_splice(&private_devs, &fs_devices->alloc_list);
2577 	if (index < num_stripes) {
2578 		if (index >= min_stripes) {
2579 			num_stripes = index;
2580 			if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2581 				num_stripes /= sub_stripes;
2582 				num_stripes *= sub_stripes;
2583 			}
2584 
2585 			map = __shrink_map_lookup_stripes(map, num_stripes);
2586 		} else if (i >= min_devices) {
2587 			ret = __btrfs_alloc_tiny_space(trans, fs_devices,
2588 						       devices_info, i, type,
2589 						       &map, min_stripes,
2590 						       &calc_size);
2591 			if (ret)
2592 				goto error;
2593 		} else {
2594 			ret = -ENOSPC;
2595 			goto error;
2596 		}
2597 	}
2598 	map->sector_size = extent_root->sectorsize;
2599 	map->stripe_len = BTRFS_STRIPE_LEN;
2600 	map->io_align = BTRFS_STRIPE_LEN;
2601 	map->io_width = BTRFS_STRIPE_LEN;
2602 	map->type = type;
2603 	map->sub_stripes = sub_stripes;
2604 
2605 	*map_ret = map;
2606 	*stripe_size = calc_size;
2607 	*num_bytes = chunk_bytes_by_type(type, calc_size,
2608 					 map->num_stripes, sub_stripes);
2609 
2610 	trace_btrfs_chunk_alloc(info->chunk_root, map, start, *num_bytes);
2611 
2612 	em = alloc_extent_map(GFP_NOFS);
2613 	if (!em) {
2614 		ret = -ENOMEM;
2615 		goto error;
2616 	}
2617 	em->bdev = (struct block_device *)map;
2618 	em->start = start;
2619 	em->len = *num_bytes;
2620 	em->block_start = 0;
2621 	em->block_len = em->len;
2622 
2623 	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
2624 	write_lock(&em_tree->lock);
2625 	ret = add_extent_mapping(em_tree, em);
2626 	write_unlock(&em_tree->lock);
2627 	BUG_ON(ret);
2628 	free_extent_map(em);
2629 
2630 	ret = btrfs_make_block_group(trans, extent_root, 0, type,
2631 				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2632 				     start, *num_bytes);
2633 	BUG_ON(ret);
2634 
2635 	index = 0;
2636 	while (index < map->num_stripes) {
2637 		device = map->stripes[index].dev;
2638 		dev_offset = map->stripes[index].physical;
2639 
2640 		ret = btrfs_alloc_dev_extent(trans, device,
2641 				info->chunk_root->root_key.objectid,
2642 				BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2643 				start, dev_offset, calc_size);
2644 		BUG_ON(ret);
2645 		index++;
2646 	}
2647 
2648 	kfree(devices_info);
2649 	return 0;
2650 
2651 error:
2652 	kfree(map);
2653 	kfree(devices_info);
2654 	return ret;
2655 }
2656 
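/*
 * Second phase of chunk allocation: account the stripes against each
 * device's bytes_used and insert the chunk item into the chunk tree (and
 * into the superblock's sys_chunk_array for system chunks).
 */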
2657 static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2658 				struct btrfs_root *extent_root,
2659 				struct map_lookup *map, u64 chunk_offset,
2660 				u64 chunk_size, u64 stripe_size)
2661 {
2662 	u64 dev_offset;
2663 	struct btrfs_key key;
2664 	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2665 	struct btrfs_device *device;
2666 	struct btrfs_chunk *chunk;
2667 	struct btrfs_stripe *stripe;
2668 	size_t item_size = btrfs_chunk_item_size(map->num_stripes);
2669 	int index = 0;
2670 	int ret;
2671 
2672 	chunk = kzalloc(item_size, GFP_NOFS);
2673 	if (!chunk)
2674 		return -ENOMEM;
2675 
2676 	index = 0;
2677 	while (index < map->num_stripes) {
2678 		device = map->stripes[index].dev;
2679 		device->bytes_used += stripe_size;
2680 		ret = btrfs_update_device(trans, device);
2681 		BUG_ON(ret);
2682 		index++;
2683 	}
2684 
2685 	index = 0;
2686 	stripe = &chunk->stripe;
2687 	while (index < map->num_stripes) {
2688 		device = map->stripes[index].dev;
2689 		dev_offset = map->stripes[index].physical;
2690 
2691 		btrfs_set_stack_stripe_devid(stripe, device->devid);
2692 		btrfs_set_stack_stripe_offset(stripe, dev_offset);
2693 		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
2694 		stripe++;
2695 		index++;
2696 	}
2697 
2698 	btrfs_set_stack_chunk_length(chunk, chunk_size);
2699 	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
2700 	btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
2701 	btrfs_set_stack_chunk_type(chunk, map->type);
2702 	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
2703 	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
2704 	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
2705 	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
2706 	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
2707 
2708 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2709 	key.type = BTRFS_CHUNK_ITEM_KEY;
2710 	key.offset = chunk_offset;
2711 
2712 	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
2713 	BUG_ON(ret);
2714 
2715 	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2716 		ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
2717 					     item_size);
2718 		BUG_ON(ret);
2719 	}
2720 
2721 	kfree(chunk);
2722 	return 0;
2723 }
2724 
2725 /*
2726  * Chunk allocation falls into two parts. The first part does the work
2727  * that makes the newly allocated chunk usable, but does not do any
2728  * operation that modifies the chunk tree. The second part does the work
2729  * that requires modifying the chunk tree. This division is important for
2730  * the bootstrap process of adding storage to a seed btrfs.
2731  */
2732 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2733 		      struct btrfs_root *extent_root, u64 type)
2734 {
2735 	u64 chunk_offset;
2736 	u64 chunk_size;
2737 	u64 stripe_size;
2738 	struct map_lookup *map;
2739 	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
2740 	int ret;
2741 
2742 	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2743 			      &chunk_offset);
2744 	if (ret)
2745 		return ret;
2746 
2747 	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2748 				  &stripe_size, chunk_offset, type);
2749 	if (ret)
2750 		return ret;
2751 
2752 	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2753 				   chunk_size, stripe_size);
2754 	BUG_ON(ret);
2755 	return 0;
2756 }
2757 
2758 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2759 					 struct btrfs_root *root,
2760 					 struct btrfs_device *device)
2761 {
2762 	u64 chunk_offset;
2763 	u64 sys_chunk_offset;
2764 	u64 chunk_size;
2765 	u64 sys_chunk_size;
2766 	u64 stripe_size;
2767 	u64 sys_stripe_size;
2768 	u64 alloc_profile;
2769 	struct map_lookup *map;
2770 	struct map_lookup *sys_map;
2771 	struct btrfs_fs_info *fs_info = root->fs_info;
2772 	struct btrfs_root *extent_root = fs_info->extent_root;
2773 	int ret;
2774 
2775 	ret = find_next_chunk(fs_info->chunk_root,
2776 			      BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
2777 	BUG_ON(ret);
2778 
2779 	alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2780 			(fs_info->metadata_alloc_profile &
2781 			 fs_info->avail_metadata_alloc_bits);
2782 	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2783 
2784 	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
2785 				  &stripe_size, chunk_offset, alloc_profile);
2786 	BUG_ON(ret);
2787 
2788 	sys_chunk_offset = chunk_offset + chunk_size;
2789 
2790 	alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
2791 			(fs_info->system_alloc_profile &
2792 			 fs_info->avail_system_alloc_bits);
2793 	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
2794 
2795 	ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
2796 				  &sys_chunk_size, &sys_stripe_size,
2797 				  sys_chunk_offset, alloc_profile);
2798 	BUG_ON(ret);
2799 
2800 	ret = btrfs_add_device(trans, fs_info->chunk_root, device);
2801 	BUG_ON(ret);
2802 
2803 	/*
2804 	 * Modifying the chunk tree needs to allocate new blocks from both
2805 	 * the system block group and the metadata block group, so we can
2806 	 * only do operations that require modifying the chunk tree after
2807 	 * both block groups have been created.
2808 	 */
2809 	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
2810 				   chunk_size, stripe_size);
2811 	BUG_ON(ret);
2812 
2813 	ret = __finish_chunk_alloc(trans, extent_root, sys_map,
2814 				   sys_chunk_offset, sys_chunk_size,
2815 				   sys_stripe_size);
2816 	BUG_ON(ret);
2817 	return 0;
2818 }
2819 
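/*
 * A chunk is considered read-only if any device backing its stripes is not
 * writeable, unless the filesystem is mounted degraded.
 */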
2820 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
2821 {
2822 	struct extent_map *em;
2823 	struct map_lookup *map;
2824 	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
2825 	int readonly = 0;
2826 	int i;
2827 
2828 	read_lock(&map_tree->map_tree.lock);
2829 	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2830 	read_unlock(&map_tree->map_tree.lock);
2831 	if (!em)
2832 		return 1;
2833 
2834 	if (btrfs_test_opt(root, DEGRADED)) {
2835 		free_extent_map(em);
2836 		return 0;
2837 	}
2838 
2839 	map = (struct map_lookup *)em->bdev;
2840 	for (i = 0; i < map->num_stripes; i++) {
2841 		if (!map->stripes[i].dev->writeable) {
2842 			readonly = 1;
2843 			break;
2844 		}
2845 	}
2846 	free_extent_map(em);
2847 	return readonly;
2848 }
2849 
2850 void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
2851 {
2852 	extent_map_tree_init(&tree->map_tree, GFP_NOFS);
2853 }
2854 
2855 void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
2856 {
2857 	struct extent_map *em;
2858 
2859 	while (1) {
2860 		write_lock(&tree->map_tree.lock);
2861 		em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
2862 		if (em)
2863 			remove_extent_mapping(&tree->map_tree, em);
2864 		write_unlock(&tree->map_tree.lock);
2865 		if (!em)
2866 			break;
2867 		kfree(em->bdev);
2868 		/* once for us */
2869 		free_extent_map(em);
2870 		/* once for the tree */
2871 		free_extent_map(em);
2872 	}
2873 }
2874 
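/*
 * Return how many copies of the data at the given logical address exist,
 * based on the RAID type of the chunk that contains it.
 */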
2875 int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
2876 {
2877 	struct extent_map *em;
2878 	struct map_lookup *map;
2879 	struct extent_map_tree *em_tree = &map_tree->map_tree;
2880 	int ret;
2881 
2882 	read_lock(&em_tree->lock);
2883 	em = lookup_extent_mapping(em_tree, logical, len);
2884 	read_unlock(&em_tree->lock);
2885 	BUG_ON(!em);
2886 
2887 	BUG_ON(em->start > logical || em->start + em->len < logical);
2888 	map = (struct map_lookup *)em->bdev;
2889 	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
2890 		ret = map->num_stripes;
2891 	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
2892 		ret = map->sub_stripes;
2893 	else
2894 		ret = 1;
2895 	free_extent_map(em);
2896 	return ret;
2897 }
2898 
2899 static int find_live_mirror(struct map_lookup *map, int first, int num,
2900 			    int optimal)
2901 {
2902 	int i;
2903 	if (map->stripes[optimal].dev->bdev)
2904 		return optimal;
2905 	for (i = first; i < first + num; i++) {
2906 		if (map->stripes[i].dev->bdev)
2907 			return i;
2908 	}
2909 	/* we couldn't find one that doesn't fail.  Just return something
2910 	 * and the io error handling code will clean up eventually
2911 	 */
2912 	return optimal;
2913 }
2914 
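/*
 * Map a logical range onto the physical stripes that back it.  Writes and
 * discards get every stripe that covers the range in *multi_ret; reads get
 * a single, preferably live, mirror.
 */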
2915 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2916 			     u64 logical, u64 *length,
2917 			     struct btrfs_multi_bio **multi_ret,
2918 			     int mirror_num)
2919 {
2920 	struct extent_map *em;
2921 	struct map_lookup *map;
2922 	struct extent_map_tree *em_tree = &map_tree->map_tree;
2923 	u64 offset;
2924 	u64 stripe_offset;
2925 	u64 stripe_end_offset;
2926 	u64 stripe_nr;
2927 	u64 stripe_nr_orig;
2928 	u64 stripe_nr_end;
2929 	int stripes_allocated = 8;
2930 	int stripes_required = 1;
2931 	int stripe_index;
2932 	int i;
2933 	int num_stripes;
2934 	int max_errors = 0;
2935 	struct btrfs_multi_bio *multi = NULL;
2936 
2937 	if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2938 		stripes_allocated = 1;
2939 again:
2940 	if (multi_ret) {
2941 		multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
2942 				GFP_NOFS);
2943 		if (!multi)
2944 			return -ENOMEM;
2945 
2946 		atomic_set(&multi->error, 0);
2947 	}
2948 
2949 	read_lock(&em_tree->lock);
2950 	em = lookup_extent_mapping(em_tree, logical, *length);
2951 	read_unlock(&em_tree->lock);
2952 
2953 	if (!em) {
2954 		printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2955 		       (unsigned long long)logical,
2956 		       (unsigned long long)*length);
2957 		BUG();
2958 	}
2959 
2960 	BUG_ON(em->start > logical || em->start + em->len < logical);
2961 	map = (struct map_lookup *)em->bdev;
2962 	offset = logical - em->start;
2963 
2964 	if (mirror_num > map->num_stripes)
2965 		mirror_num = 0;
2966 
2967 	/* if our multi bio struct is too small, back off and try again */
2968 	if (rw & REQ_WRITE) {
2969 		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2970 				 BTRFS_BLOCK_GROUP_DUP)) {
2971 			stripes_required = map->num_stripes;
2972 			max_errors = 1;
2973 		} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2974 			stripes_required = map->sub_stripes;
2975 			max_errors = 1;
2976 		}
2977 	}
2978 	if (rw & REQ_DISCARD) {
2979 		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2980 				 BTRFS_BLOCK_GROUP_RAID1 |
2981 				 BTRFS_BLOCK_GROUP_DUP |
2982 				 BTRFS_BLOCK_GROUP_RAID10)) {
2983 			stripes_required = map->num_stripes;
2984 		}
2985 	}
2986 	if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2987 	    stripes_allocated < stripes_required) {
2988 		stripes_allocated = map->num_stripes;
2989 		free_extent_map(em);
2990 		kfree(multi);
2991 		goto again;
2992 	}
2993 	stripe_nr = offset;
2994 	/*
2995 	 * stripe_nr counts the total number of stripes we have to stride
2996 	 * to get to this block
2997 	 */
2998 	do_div(stripe_nr, map->stripe_len);
2999 
3000 	stripe_offset = stripe_nr * map->stripe_len;
3001 	BUG_ON(offset < stripe_offset);
3002 
3003 	/* stripe_offset is the offset of this block in its stripe */
3004 	stripe_offset = offset - stripe_offset;
3005 
3006 	if (rw & REQ_DISCARD)
3007 		*length = min_t(u64, em->len - offset, *length);
3008 	else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3009 			      BTRFS_BLOCK_GROUP_RAID1 |
3010 			      BTRFS_BLOCK_GROUP_RAID10 |
3011 			      BTRFS_BLOCK_GROUP_DUP)) {
3012 		/* we limit the length of each bio to what fits in a stripe */
3013 		*length = min_t(u64, em->len - offset,
3014 				map->stripe_len - stripe_offset);
3015 	} else {
3016 		*length = em->len - offset;
3017 	}
3018 
3019 	if (!multi_ret)
3020 		goto out;
3021 
3022 	num_stripes = 1;
3023 	stripe_index = 0;
3024 	stripe_nr_orig = stripe_nr;
3025 	stripe_nr_end = (offset + *length + map->stripe_len - 1) &
3026 			(~(map->stripe_len - 1));
3027 	do_div(stripe_nr_end, map->stripe_len);
3028 	stripe_end_offset = stripe_nr_end * map->stripe_len -
3029 			    (offset + *length);
3030 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3031 		if (rw & REQ_DISCARD)
3032 			num_stripes = min_t(u64, map->num_stripes,
3033 					    stripe_nr_end - stripe_nr_orig);
3034 		stripe_index = do_div(stripe_nr, map->num_stripes);
3035 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3036 		if (rw & (REQ_WRITE | REQ_DISCARD))
3037 			num_stripes = map->num_stripes;
3038 		else if (mirror_num)
3039 			stripe_index = mirror_num - 1;
3040 		else {
3041 			stripe_index = find_live_mirror(map, 0,
3042 					    map->num_stripes,
3043 					    current->pid % map->num_stripes);
3044 		}
3045 
3046 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3047 		if (rw & (REQ_WRITE | REQ_DISCARD))
3048 			num_stripes = map->num_stripes;
3049 		else if (mirror_num)
3050 			stripe_index = mirror_num - 1;
3051 
3052 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3053 		int factor = map->num_stripes / map->sub_stripes;
3054 
3055 		stripe_index = do_div(stripe_nr, factor);
3056 		stripe_index *= map->sub_stripes;
3057 
3058 		if (rw & REQ_WRITE)
3059 			num_stripes = map->sub_stripes;
3060 		else if (rw & REQ_DISCARD)
3061 			num_stripes = min_t(u64, map->sub_stripes *
3062 					    (stripe_nr_end - stripe_nr_orig),
3063 					    map->num_stripes);
3064 		else if (mirror_num)
3065 			stripe_index += mirror_num - 1;
3066 		else {
3067 			stripe_index = find_live_mirror(map, stripe_index,
3068 					      map->sub_stripes, stripe_index +
3069 					      current->pid % map->sub_stripes);
3070 		}
3071 	} else {
3072 		/*
3073 		 * after this do_div call, stripe_nr is the number of stripes
3074 		 * on this device we have to walk to find the data, and
3075 		 * stripe_index is the number of our device in the stripe array
3076 		 */
3077 		stripe_index = do_div(stripe_nr, map->num_stripes);
3078 	}
3079 	BUG_ON(stripe_index >= map->num_stripes);
3080 
3081 	if (rw & REQ_DISCARD) {
3082 		for (i = 0; i < num_stripes; i++) {
3083 			multi->stripes[i].physical =
3084 				map->stripes[stripe_index].physical +
3085 				stripe_offset + stripe_nr * map->stripe_len;
3086 			multi->stripes[i].dev = map->stripes[stripe_index].dev;
3087 
3088 			if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3089 				u64 stripes;
3090 				u32 last_stripe = 0;
3091 				int j;
3092 
3093 				div_u64_rem(stripe_nr_end - 1,
3094 					    map->num_stripes,
3095 					    &last_stripe);
3096 
3097 				for (j = 0; j < map->num_stripes; j++) {
3098 					u32 test;
3099 
3100 					div_u64_rem(stripe_nr_end - 1 - j,
3101 						    map->num_stripes, &test);
3102 					if (test == stripe_index)
3103 						break;
3104 				}
3105 				stripes = stripe_nr_end - 1 - j;
3106 				do_div(stripes, map->num_stripes);
3107 				multi->stripes[i].length = map->stripe_len *
3108 					(stripes - stripe_nr + 1);
3109 
3110 				if (i == 0) {
3111 					multi->stripes[i].length -=
3112 						stripe_offset;
3113 					stripe_offset = 0;
3114 				}
3115 				if (stripe_index == last_stripe)
3116 					multi->stripes[i].length -=
3117 						stripe_end_offset;
3118 			} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3119 				u64 stripes;
3120 				int j;
3121 				int factor = map->num_stripes /
3122 					     map->sub_stripes;
3123 				u32 last_stripe = 0;
3124 
3125 				div_u64_rem(stripe_nr_end - 1,
3126 					    factor, &last_stripe);
3127 				last_stripe *= map->sub_stripes;
3128 
3129 				for (j = 0; j < factor; j++) {
3130 					u32 test;
3131 
3132 					div_u64_rem(stripe_nr_end - 1 - j,
3133 						    factor, &test);
3134 
3135 					if (test ==
3136 					    stripe_index / map->sub_stripes)
3137 						break;
3138 				}
3139 				stripes = stripe_nr_end - 1 - j;
3140 				do_div(stripes, factor);
3141 				multi->stripes[i].length = map->stripe_len *
3142 					(stripes - stripe_nr + 1);
3143 
3144 				if (i < map->sub_stripes) {
3145 					multi->stripes[i].length -=
3146 						stripe_offset;
3147 					if (i == map->sub_stripes - 1)
3148 						stripe_offset = 0;
3149 				}
3150 				if (stripe_index >= last_stripe &&
3151 				    stripe_index <= (last_stripe +
3152 						     map->sub_stripes - 1)) {
3153 					multi->stripes[i].length -=
3154 						stripe_end_offset;
3155 				}
3156 			} else
3157 				multi->stripes[i].length = *length;
3158 
3159 			stripe_index++;
3160 			if (stripe_index == map->num_stripes) {
3161 				/* This could only happen for RAID0/10 */
3162 				stripe_index = 0;
3163 				stripe_nr++;
3164 			}
3165 		}
3166 	} else {
3167 		for (i = 0; i < num_stripes; i++) {
3168 			multi->stripes[i].physical =
3169 				map->stripes[stripe_index].physical +
3170 				stripe_offset +
3171 				stripe_nr * map->stripe_len;
3172 			multi->stripes[i].dev =
3173 				map->stripes[stripe_index].dev;
3174 			stripe_index++;
3175 		}
3176 	}
3177 	if (multi_ret) {
3178 		*multi_ret = multi;
3179 		multi->num_stripes = num_stripes;
3180 		multi->max_errors = max_errors;
3181 	}
3182 out:
3183 	free_extent_map(em);
3184 	return 0;
3185 }
3186 
3187 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3188 		      u64 logical, u64 *length,
3189 		      struct btrfs_multi_bio **multi_ret, int mirror_num)
3190 {
3191 	return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
3192 				 mirror_num);
3193 }
3194 
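/*
 * Reverse mapping: given a physical offset within a chunk, collect the
 * logical addresses that map onto it.
 */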
3195 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3196 		     u64 chunk_start, u64 physical, u64 devid,
3197 		     u64 **logical, int *naddrs, int *stripe_len)
3198 {
3199 	struct extent_map_tree *em_tree = &map_tree->map_tree;
3200 	struct extent_map *em;
3201 	struct map_lookup *map;
3202 	u64 *buf;
3203 	u64 bytenr;
3204 	u64 length;
3205 	u64 stripe_nr;
3206 	int i, j, nr = 0;
3207 
3208 	read_lock(&em_tree->lock);
3209 	em = lookup_extent_mapping(em_tree, chunk_start, 1);
3210 	read_unlock(&em_tree->lock);
3211 
3212 	BUG_ON(!em || em->start != chunk_start);
3213 	map = (struct map_lookup *)em->bdev;
3214 
3215 	length = em->len;
3216 	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
3217 		do_div(length, map->num_stripes / map->sub_stripes);
3218 	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
3219 		do_div(length, map->num_stripes);
3220 
3221 	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
3222 	BUG_ON(!buf);
3223 
3224 	for (i = 0; i < map->num_stripes; i++) {
3225 		if (devid && map->stripes[i].dev->devid != devid)
3226 			continue;
3227 		if (map->stripes[i].physical > physical ||
3228 		    map->stripes[i].physical + length <= physical)
3229 			continue;
3230 
3231 		stripe_nr = physical - map->stripes[i].physical;
3232 		do_div(stripe_nr, map->stripe_len);
3233 
3234 		if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3235 			stripe_nr = stripe_nr * map->num_stripes + i;
3236 			do_div(stripe_nr, map->sub_stripes);
3237 		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3238 			stripe_nr = stripe_nr * map->num_stripes + i;
3239 		}
3240 		bytenr = chunk_start + stripe_nr * map->stripe_len;
3241 		WARN_ON(nr >= map->num_stripes);
3242 		for (j = 0; j < nr; j++) {
3243 			if (buf[j] == bytenr)
3244 				break;
3245 		}
3246 		if (j == nr) {
3247 			WARN_ON(nr >= map->num_stripes);
3248 			buf[nr++] = bytenr;
3249 		}
3250 	}
3251 
3252 	*logical = buf;
3253 	*naddrs = nr;
3254 	*stripe_len = map->stripe_len;
3255 
3256 	free_extent_map(em);
3257 	return 0;
3258 }
3259 
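/*
 * Completion handler for the per-stripe bios of a multi-device submission.
 * The original bio only completes once every stripe has finished, and an
 * error is passed up only when more stripes failed than max_errors allows.
 */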
3260 static void end_bio_multi_stripe(struct bio *bio, int err)
3261 {
3262 	struct btrfs_multi_bio *multi = bio->bi_private;
3263 	int is_orig_bio = 0;
3264 
3265 	if (err)
3266 		atomic_inc(&multi->error);
3267 
3268 	if (bio == multi->orig_bio)
3269 		is_orig_bio = 1;
3270 
3271 	if (atomic_dec_and_test(&multi->stripes_pending)) {
3272 		if (!is_orig_bio) {
3273 			bio_put(bio);
3274 			bio = multi->orig_bio;
3275 		}
3276 		bio->bi_private = multi->private;
3277 		bio->bi_end_io = multi->end_io;
3278 		/* only send an error to the higher layers if it is
3279 		 * beyond the tolerance of the multi-bio
3280 		 */
3281 		if (atomic_read(&multi->error) > multi->max_errors) {
3282 			err = -EIO;
3283 		} else if (err) {
3284 			/*
3285 			 * this bio is actually up to date, we didn't
3286 			 * go over the max number of errors
3287 			 */
3288 			set_bit(BIO_UPTODATE, &bio->bi_flags);
3289 			err = 0;
3290 		}
3291 		kfree(multi);
3292 
3293 		bio_endio(bio, err);
3294 	} else if (!is_orig_bio) {
3295 		bio_put(bio);
3296 	}
3297 }
3298 
3299 struct async_sched {
3300 	struct bio *bio;
3301 	int rw;
3302 	struct btrfs_fs_info *info;
3303 	struct btrfs_work work;
3304 };
3305 
3306 /*
3307  * see run_scheduled_bios for a description of why bios are collected for
3308  * async submit.
3309  *
3310  * This will add one bio to the pending list for a device and make sure
3311  * the work struct is scheduled.
3312  */
3313 static noinline int schedule_bio(struct btrfs_root *root,
3314 				 struct btrfs_device *device,
3315 				 int rw, struct bio *bio)
3316 {
3317 	int should_queue = 1;
3318 	struct btrfs_pending_bios *pending_bios;
3319 
3320 	/* don't bother with additional async steps for reads, right now */
3321 	if (!(rw & REQ_WRITE)) {
3322 		bio_get(bio);
3323 		submit_bio(rw, bio);
3324 		bio_put(bio);
3325 		return 0;
3326 	}
3327 
3328 	/*
3329 	 * nr_async_bios allows us to reliably return congestion to the
3330 	 * higher layers.  Otherwise, the async bio makes it appear we have
3331 	 * made progress against dirty pages when we've really just put it
3332 	 * on a queue for later
3333 	 */
3334 	atomic_inc(&root->fs_info->nr_async_bios);
3335 	WARN_ON(bio->bi_next);
3336 	bio->bi_next = NULL;
3337 	bio->bi_rw |= rw;
3338 
3339 	spin_lock(&device->io_lock);
3340 	if (bio->bi_rw & REQ_SYNC)
3341 		pending_bios = &device->pending_sync_bios;
3342 	else
3343 		pending_bios = &device->pending_bios;
3344 
3345 	if (pending_bios->tail)
3346 		pending_bios->tail->bi_next = bio;
3347 
3348 	pending_bios->tail = bio;
3349 	if (!pending_bios->head)
3350 		pending_bios->head = bio;
3351 	if (device->running_pending)
3352 		should_queue = 0;
3353 
3354 	spin_unlock(&device->io_lock);
3355 
3356 	if (should_queue)
3357 		btrfs_queue_worker(&root->fs_info->submit_workers,
3358 				   &device->work);
3359 	return 0;
3360 }
3361 
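/*
 * Map a bio onto the devices backing its logical range and submit one clone
 * per stripe, either directly or through the async submission queue.
 */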
3362 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3363 		  int mirror_num, int async_submit)
3364 {
3365 	struct btrfs_mapping_tree *map_tree;
3366 	struct btrfs_device *dev;
3367 	struct bio *first_bio = bio;
3368 	u64 logical = (u64)bio->bi_sector << 9;
3369 	u64 length = 0;
3370 	u64 map_length;
3371 	struct btrfs_multi_bio *multi = NULL;
3372 	int ret;
3373 	int dev_nr = 0;
3374 	int total_devs = 1;
3375 
3376 	length = bio->bi_size;
3377 	map_tree = &root->fs_info->mapping_tree;
3378 	map_length = length;
3379 
3380 	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
3381 			      mirror_num);
3382 	BUG_ON(ret);
3383 
3384 	total_devs = multi->num_stripes;
3385 	if (map_length < length) {
3386 		printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
3387 		       "len %llu\n", (unsigned long long)logical,
3388 		       (unsigned long long)length,
3389 		       (unsigned long long)map_length);
3390 		BUG();
3391 	}
3392 	multi->end_io = first_bio->bi_end_io;
3393 	multi->private = first_bio->bi_private;
3394 	multi->orig_bio = first_bio;
3395 	atomic_set(&multi->stripes_pending, multi->num_stripes);
3396 
3397 	while (dev_nr < total_devs) {
3398 		if (total_devs > 1) {
3399 			if (dev_nr < total_devs - 1) {
3400 				bio = bio_clone(first_bio, GFP_NOFS);
3401 				BUG_ON(!bio);
3402 			} else {
3403 				bio = first_bio;
3404 			}
3405 			bio->bi_private = multi;
3406 			bio->bi_end_io = end_bio_multi_stripe;
3407 		}
3408 		bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
3409 		dev = multi->stripes[dev_nr].dev;
3410 		if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
3411 			bio->bi_bdev = dev->bdev;
3412 			if (async_submit)
3413 				schedule_bio(root, dev, rw, bio);
3414 			else
3415 				submit_bio(rw, bio);
3416 		} else {
3417 			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
3418 			bio->bi_sector = logical >> 9;
3419 			bio_endio(bio, -EIO);
3420 		}
3421 		dev_nr++;
3422 	}
3423 	if (total_devs == 1)
3424 		kfree(multi);
3425 	return 0;
3426 }
3427 
3428 struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
3429 				       u8 *uuid, u8 *fsid)
3430 {
3431 	struct btrfs_device *device;
3432 	struct btrfs_fs_devices *cur_devices;
3433 
3434 	cur_devices = root->fs_info->fs_devices;
3435 	while (cur_devices) {
3436 		if (!fsid ||
3437 		    !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
3438 			device = __find_device(&cur_devices->devices,
3439 					       devid, uuid);
3440 			if (device)
3441 				return device;
3442 		}
3443 		cur_devices = cur_devices->seed;
3444 	}
3445 	return NULL;
3446 }
3447 
3448 static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
3449 					    u64 devid, u8 *dev_uuid)
3450 {
3451 	struct btrfs_device *device;
3452 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
3453 
3454 	device = kzalloc(sizeof(*device), GFP_NOFS);
3455 	if (!device)
3456 		return NULL;
3457 	list_add(&device->dev_list,
3458 		 &fs_devices->devices);
3459 	device->dev_root = root->fs_info->dev_root;
3460 	device->devid = devid;
3461 	device->work.func = pending_bios_fn;
3462 	device->fs_devices = fs_devices;
3463 	device->missing = 1;
3464 	fs_devices->num_devices++;
3465 	fs_devices->missing_devices++;
3466 	spin_lock_init(&device->io_lock);
3467 	INIT_LIST_HEAD(&device->dev_alloc_list);
3468 	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
3469 	return device;
3470 }
3471 
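/*
 * Read one chunk item (from the chunk tree or the sys_chunk_array) and add
 * the corresponding mapping to the mapping tree.
 */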
3472 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
3473 			  struct extent_buffer *leaf,
3474 			  struct btrfs_chunk *chunk)
3475 {
3476 	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
3477 	struct map_lookup *map;
3478 	struct extent_map *em;
3479 	u64 logical;
3480 	u64 length;
3481 	u64 devid;
3482 	u8 uuid[BTRFS_UUID_SIZE];
3483 	int num_stripes;
3484 	int ret;
3485 	int i;
3486 
3487 	logical = key->offset;
3488 	length = btrfs_chunk_length(leaf, chunk);
3489 
3490 	read_lock(&map_tree->map_tree.lock);
3491 	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
3492 	read_unlock(&map_tree->map_tree.lock);
3493 
3494 	/* already mapped? */
3495 	if (em && em->start <= logical && em->start + em->len > logical) {
3496 		free_extent_map(em);
3497 		return 0;
3498 	} else if (em) {
3499 		free_extent_map(em);
3500 	}
3501 
3502 	em = alloc_extent_map(GFP_NOFS);
3503 	if (!em)
3504 		return -ENOMEM;
3505 	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3506 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
3507 	if (!map) {
3508 		free_extent_map(em);
3509 		return -ENOMEM;
3510 	}
3511 
3512 	em->bdev = (struct block_device *)map;
3513 	em->start = logical;
3514 	em->len = length;
3515 	em->block_start = 0;
3516 	em->block_len = em->len;
3517 
3518 	map->num_stripes = num_stripes;
3519 	map->io_width = btrfs_chunk_io_width(leaf, chunk);
3520 	map->io_align = btrfs_chunk_io_align(leaf, chunk);
3521 	map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
3522 	map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
3523 	map->type = btrfs_chunk_type(leaf, chunk);
3524 	map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
3525 	for (i = 0; i < num_stripes; i++) {
3526 		map->stripes[i].physical =
3527 			btrfs_stripe_offset_nr(leaf, chunk, i);
3528 		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
3529 		read_extent_buffer(leaf, uuid, (unsigned long)
3530 				   btrfs_stripe_dev_uuid_nr(chunk, i),
3531 				   BTRFS_UUID_SIZE);
3532 		map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
3533 							NULL);
3534 		if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
3535 			kfree(map);
3536 			free_extent_map(em);
3537 			return -EIO;
3538 		}
3539 		if (!map->stripes[i].dev) {
3540 			map->stripes[i].dev =
3541 				add_missing_dev(root, devid, uuid);
3542 			if (!map->stripes[i].dev) {
3543 				kfree(map);
3544 				free_extent_map(em);
3545 				return -EIO;
3546 			}
3547 		}
3548 		map->stripes[i].dev->in_fs_metadata = 1;
3549 	}
3550 
3551 	write_lock(&map_tree->map_tree.lock);
3552 	ret = add_extent_mapping(&map_tree->map_tree, em);
3553 	write_unlock(&map_tree->map_tree.lock);
3554 	BUG_ON(ret);
3555 	free_extent_map(em);
3556 
3557 	return 0;
3558 }
3559 
3560 static int fill_device_from_item(struct extent_buffer *leaf,
3561 				 struct btrfs_dev_item *dev_item,
3562 				 struct btrfs_device *device)
3563 {
3564 	unsigned long ptr;
3565 
3566 	device->devid = btrfs_device_id(leaf, dev_item);
3567 	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
3568 	device->total_bytes = device->disk_total_bytes;
3569 	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
3570 	device->type = btrfs_device_type(leaf, dev_item);
3571 	device->io_align = btrfs_device_io_align(leaf, dev_item);
3572 	device->io_width = btrfs_device_io_width(leaf, dev_item);
3573 	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
3574 
3575 	ptr = (unsigned long)btrfs_device_uuid(dev_item);
3576 	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
3577 
3578 	return 0;
3579 }
3580 
3581 static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
3582 {
3583 	struct btrfs_fs_devices *fs_devices;
3584 	int ret;
3585 
3586 	mutex_lock(&uuid_mutex);
3587 
3588 	fs_devices = root->fs_info->fs_devices->seed;
3589 	while (fs_devices) {
3590 		if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
3591 			ret = 0;
3592 			goto out;
3593 		}
3594 		fs_devices = fs_devices->seed;
3595 	}
3596 
3597 	fs_devices = find_fsid(fsid);
3598 	if (!fs_devices) {
3599 		ret = -ENOENT;
3600 		goto out;
3601 	}
3602 
3603 	fs_devices = clone_fs_devices(fs_devices);
3604 	if (IS_ERR(fs_devices)) {
3605 		ret = PTR_ERR(fs_devices);
3606 		goto out;
3607 	}
3608 
3609 	ret = __btrfs_open_devices(fs_devices, FMODE_READ,
3610 				   root->fs_info->bdev_holder);
3611 	if (ret)
3612 		goto out;
3613 
3614 	if (!fs_devices->seeding) {
3615 		__btrfs_close_devices(fs_devices);
3616 		free_fs_devices(fs_devices);
3617 		ret = -EINVAL;
3618 		goto out;
3619 	}
3620 
3621 	fs_devices->seed = root->fs_info->fs_devices->seed;
3622 	root->fs_info->fs_devices->seed = fs_devices;
3623 out:
3624 	mutex_unlock(&uuid_mutex);
3625 	return ret;
3626 }
3627 
3628 static int read_one_dev(struct btrfs_root *root,
3629 			struct extent_buffer *leaf,
3630 			struct btrfs_dev_item *dev_item)
3631 {
3632 	struct btrfs_device *device;
3633 	u64 devid;
3634 	int ret;
3635 	u8 fs_uuid[BTRFS_UUID_SIZE];
3636 	u8 dev_uuid[BTRFS_UUID_SIZE];
3637 
3638 	devid = btrfs_device_id(leaf, dev_item);
3639 	read_extent_buffer(leaf, dev_uuid,
3640 			   (unsigned long)btrfs_device_uuid(dev_item),
3641 			   BTRFS_UUID_SIZE);
3642 	read_extent_buffer(leaf, fs_uuid,
3643 			   (unsigned long)btrfs_device_fsid(dev_item),
3644 			   BTRFS_UUID_SIZE);
3645 
3646 	if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
3647 		ret = open_seed_devices(root, fs_uuid);
3648 		if (ret && !btrfs_test_opt(root, DEGRADED))
3649 			return ret;
3650 	}
3651 
3652 	device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
3653 	if (!device || !device->bdev) {
3654 		if (!btrfs_test_opt(root, DEGRADED))
3655 			return -EIO;
3656 
3657 		if (!device) {
3658 			printk(KERN_WARNING "warning devid %llu missing\n",
3659 			       (unsigned long long)devid);
3660 			device = add_missing_dev(root, devid, dev_uuid);
3661 			if (!device)
3662 				return -ENOMEM;
3663 		} else if (!device->missing) {
3664 			/*
3665 			 * this happens when a device that was properly setup
3666 			 * this happens when a device that was properly set up
3667 			 * device->bdev is NULL, and so we have to set
3668 			 * device->missing to one here
3669 			 */
3670 			root->fs_info->fs_devices->missing_devices++;
3671 			device->missing = 1;
3672 		}
3673 	}
3674 
3675 	if (device->fs_devices != root->fs_info->fs_devices) {
3676 		BUG_ON(device->writeable);
3677 		if (device->generation !=
3678 		    btrfs_device_generation(leaf, dev_item))
3679 			return -EINVAL;
3680 	}
3681 
3682 	fill_device_from_item(leaf, dev_item, device);
3683 	device->dev_root = root->fs_info->dev_root;
3684 	device->in_fs_metadata = 1;
3685 	if (device->writeable)
3686 		device->fs_devices->total_rw_bytes += device->total_bytes;
3687 	ret = 0;
3688 	return ret;
3689 }
3690 
3691 int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
3692 {
3693 	struct btrfs_dev_item *dev_item;
3694 
3695 	dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
3696 						     dev_item);
3697 	return read_one_dev(root, buf, dev_item);
3698 }
3699 
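/*
 * Read the chunk items embedded in the superblock's sys_chunk_array so the
 * system chunks can be mapped before the chunk tree itself is read.
 */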
3700 int btrfs_read_sys_array(struct btrfs_root *root)
3701 {
3702 	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
3703 	struct extent_buffer *sb;
3704 	struct btrfs_disk_key *disk_key;
3705 	struct btrfs_chunk *chunk;
3706 	u8 *ptr;
3707 	unsigned long sb_ptr;
3708 	int ret = 0;
3709 	u32 num_stripes;
3710 	u32 array_size;
3711 	u32 len = 0;
3712 	u32 cur;
3713 	struct btrfs_key key;
3714 
3715 	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
3716 					  BTRFS_SUPER_INFO_SIZE);
3717 	if (!sb)
3718 		return -ENOMEM;
3719 	btrfs_set_buffer_uptodate(sb);
3720 	btrfs_set_buffer_lockdep_class(sb, 0);
3721 
3722 	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
3723 	array_size = btrfs_super_sys_array_size(super_copy);
3724 
3725 	ptr = super_copy->sys_chunk_array;
3726 	sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
3727 	cur = 0;
3728 
3729 	while (cur < array_size) {
3730 		disk_key = (struct btrfs_disk_key *)ptr;
3731 		btrfs_disk_key_to_cpu(&key, disk_key);
3732 
3733 		len = sizeof(*disk_key); ptr += len;
3734 		sb_ptr += len;
3735 		cur += len;
3736 
3737 		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
3738 			chunk = (struct btrfs_chunk *)sb_ptr;
3739 			ret = read_one_chunk(root, &key, sb, chunk);
3740 			if (ret)
3741 				break;
3742 			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
3743 			len = btrfs_chunk_item_size(num_stripes);
3744 		} else {
3745 			ret = -EIO;
3746 			break;
3747 		}
3748 		ptr += len;
3749 		sb_ptr += len;
3750 		cur += len;
3751 	}
3752 	free_extent_buffer(sb);
3753 	return ret;
3754 }
3755 
3756 int btrfs_read_chunk_tree(struct btrfs_root *root)
3757 {
3758 	struct btrfs_path *path;
3759 	struct extent_buffer *leaf;
3760 	struct btrfs_key key;
3761 	struct btrfs_key found_key;
3762 	int ret;
3763 	int slot;
3764 
3765 	root = root->fs_info->chunk_root;
3766 
3767 	path = btrfs_alloc_path();
3768 	if (!path)
3769 		return -ENOMEM;
3770 
3771 	/* first we search for all of the device items, and then we
3772 	 * read in all of the chunk items.  This way we can create chunk
3773 	 * mappings that reference all of the devices that are found
3774 	 */
3775 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
3776 	key.offset = 0;
3777 	key.type = 0;
3778 again:
3779 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3780 	if (ret < 0)
3781 		goto error;
3782 	while (1) {
3783 		leaf = path->nodes[0];
3784 		slot = path->slots[0];
3785 		if (slot >= btrfs_header_nritems(leaf)) {
3786 			ret = btrfs_next_leaf(root, path);
3787 			if (ret == 0)
3788 				continue;
3789 			if (ret < 0)
3790 				goto error;
3791 			break;
3792 		}
3793 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
3794 		if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3795 			if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
3796 				break;
3797 			if (found_key.type == BTRFS_DEV_ITEM_KEY) {
3798 				struct btrfs_dev_item *dev_item;
3799 				dev_item = btrfs_item_ptr(leaf, slot,
3800 						  struct btrfs_dev_item);
3801 				ret = read_one_dev(root, leaf, dev_item);
3802 				if (ret)
3803 					goto error;
3804 			}
3805 		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
3806 			struct btrfs_chunk *chunk;
3807 			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3808 			ret = read_one_chunk(root, &found_key, leaf, chunk);
3809 			if (ret)
3810 				goto error;
3811 		}
3812 		path->slots[0]++;
3813 	}
3814 	if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3815 		key.objectid = 0;
3816 		btrfs_release_path(root, path);
3817 		goto again;
3818 	}
3819 	ret = 0;
3820 error:
3821 	btrfs_free_path(path);
3822 	return ret;
3823 }
3824