1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2011-2012 Red Hat, Inc.
4  *
5  * This file is released under the GPL.
6  */
7 
8 #include "dm-thin-metadata.h"
9 #include "persistent-data/dm-btree.h"
10 #include "persistent-data/dm-space-map.h"
11 #include "persistent-data/dm-space-map-disk.h"
12 #include "persistent-data/dm-transaction-manager.h"
13 
14 #include <linux/list.h>
15 #include <linux/device-mapper.h>
16 #include <linux/workqueue.h>
17 
18 /*--------------------------------------------------------------------------
19  * As far as the metadata goes, there is:
20  *
21  * - A superblock in block zero, taking up fewer than 512 bytes for
22  *   atomic writes.
23  *
24  * - A space map managing the metadata blocks.
25  *
26  * - A space map managing the data blocks.
27  *
28  * - A btree mapping our internal thin dev ids onto struct disk_device_details.
29  *
30  * - A hierarchical btree, with 2 levels which effectively maps (thin
31  *   dev id, virtual block) -> block_time.  Block time is a 64-bit
32  *   field holding the time in the low 24 bits, and block in the top 40
33  *   bits.
34  *
 35  * BTrees consist solely of btree_nodes, each of which fills a block.
 36  * Some are internal nodes, so their values are __le64s pointing to
 37  * other nodes.  Leaf nodes can store data of any reasonable size (i.e. much
38  * smaller than the block size).  The nodes consist of the header,
39  * followed by an array of keys, followed by an array of values.  We have
40  * to binary search on the keys so they're all held together to help the
41  * cpu cache.
42  *
43  * Space maps have 2 btrees:
44  *
 45  * - One maps a uint64_t onto a struct index_entry, which points to a
 46  *   bitmap block and holds some details, e.g. how many free entries
 47  *   there are.
48  *
 49  * - The bitmap blocks have a header (for the checksum).  The rest of
 50  *   the block is pairs of bits, with these meanings (see sketch below):
51  *
52  *   0 - ref count is 0
53  *   1 - ref count is 1
54  *   2 - ref count is 2
55  *   3 - ref count is higher than 2
56  *
57  * - If the count is higher than 2 then the ref count is entered in a
58  *   second btree that directly maps the block_address to a uint32_t ref
59  *   count.
60  *
 61  * The space map metadata variant doesn't have a bitmaps btree.  Instead
 62  * it has a single block's worth of index_entries.  This avoids
 63  * recursive issues with the bitmap btree needing to allocate space in
 64  * order to insert.  Even with a small data block size such as 64k, the
 65  * metadata supports data devices that are hundreds of terabytes.
66  *
67  * The space maps allocate space linearly from front to back.  Space that
68  * is freed in a transaction is never recycled within that transaction.
69  * To try and avoid fragmenting _free_ space the allocator always goes
70  * back and fills in gaps.
71  *
72  * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks
73  * from the block manager.
74  *--------------------------------------------------------------------------*/
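/*
 * For illustration only, a minimal sketch (never compiled) of the
 * block_time packing described above; the real helpers are
 * pack_block_time() and unpack_block_time() further down this file.
 * The 2-bit bitmap entries work the same way as described: values 0-2
 * are literal ref counts, 3 means "consult the overflow btree".
 */
#if 0
static void __example_block_time(void)
{
	uint64_t b = 123456;		/* a data block (fits in 40 bits) */
	uint32_t t = 42;		/* a creation time (fits in 24 bits) */
	uint64_t v = (b << 24) | t;	/* packed block_time */

	BUG_ON(v >> 24 != b);			/* top 40 bits: block */
	BUG_ON((v & ((1 << 24) - 1)) != t);	/* low 24 bits: time */
}
#endif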
75 
76 #define DM_MSG_PREFIX   "thin metadata"
77 
78 #define THIN_SUPERBLOCK_MAGIC 27022010
79 #define THIN_SUPERBLOCK_LOCATION 0
80 #define THIN_VERSION 2
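/*
 * Metadata blocks are 4KiB, i.e. 8 x 512-byte sectors, so converting a
 * sector count to a metadata block count is a right shift by 3.
 */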
81 #define SECTOR_TO_BLOCK_SHIFT 3
82 
 83 /*
 84  * For btree insert:
 85  *  3 for the btree insert +
 86  *  2 for the btree lookup used within the space map.
 87  * For btree remove:
 88  *  2 for the shadow spine +
 89  *  4 to rebalance 3 child nodes, i.e. 6 locks in the worst case.
 90  */
 91 #define THIN_MAX_CONCURRENT_LOCKS 6
92 
93 /* This should be plenty */
94 #define SPACE_MAP_ROOT_SIZE 128
95 
96 /*
97  * Little endian on-disk superblock and device details.
98  */
99 struct thin_disk_superblock {
100 	__le32 csum;	/* Checksum of superblock except for this field. */
101 	__le32 flags;
102 	__le64 blocknr;	/* This block number, dm_block_t. */
103 
104 	__u8 uuid[16];
105 	__le64 magic;
106 	__le32 version;
107 	__le32 time;
108 
109 	__le64 trans_id;
110 
111 	/*
112 	 * Root held by userspace transactions.
113 	 */
114 	__le64 held_root;
115 
116 	__u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
117 	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
118 
119 	/*
120 	 * 2-level btree mapping (dev_id, (dev block, time)) -> data block
121 	 */
122 	__le64 data_mapping_root;
123 
124 	/*
125 	 * Device detail root mapping dev_id -> device_details
126 	 */
127 	__le64 device_details_root;
128 
129 	__le32 data_block_size;		/* In 512-byte sectors. */
130 
131 	__le32 metadata_block_size;	/* In 512-byte sectors. */
132 	__le64 metadata_nr_blocks;
133 
134 	__le32 compat_flags;
135 	__le32 compat_ro_flags;
136 	__le32 incompat_flags;
137 } __packed;
138 
139 struct disk_device_details {
140 	__le64 mapped_blocks;
141 	__le64 transaction_id;		/* When created. */
142 	__le32 creation_time;
143 	__le32 snapshotted_time;
144 } __packed;
145 
146 struct dm_pool_metadata {
147 	struct hlist_node hash;
148 
149 	struct block_device *bdev;
150 	struct dm_block_manager *bm;
151 	struct dm_space_map *metadata_sm;
152 	struct dm_space_map *data_sm;
153 	struct dm_transaction_manager *tm;
154 	struct dm_transaction_manager *nb_tm;
155 
156 	/*
157 	 * Two-level btree.
158 	 * First level holds thin_dev_t.
159 	 * Second level holds mappings.
160 	 */
161 	struct dm_btree_info info;
162 
163 	/*
164 	 * Non-blocking version of the above.
165 	 */
166 	struct dm_btree_info nb_info;
167 
168 	/*
169 	 * Just the top level for deleting whole devices.
170 	 */
171 	struct dm_btree_info tl_info;
172 
173 	/*
174 	 * Just the bottom level for creating new devices.
175 	 */
176 	struct dm_btree_info bl_info;
177 
178 	/*
179 	 * Describes the device details btree.
180 	 */
181 	struct dm_btree_info details_info;
182 
183 	struct rw_semaphore root_lock;
184 	uint32_t time;
185 	dm_block_t root;
186 	dm_block_t details_root;
187 	struct list_head thin_devices;
188 	uint64_t trans_id;
189 	unsigned long flags;
190 	sector_t data_block_size;
191 
192 	/*
193 	 * Pre-commit callback.
194 	 *
195 	 * This allows the thin provisioning target to run a callback before
196 	 * the metadata are committed.
197 	 */
198 	dm_pool_pre_commit_fn pre_commit_fn;
199 	void *pre_commit_context;
200 
201 	/*
202 	 * We reserve a section of the metadata for commit overhead.
203 	 * All reported space does *not* include this.
204 	 */
205 	dm_block_t metadata_reserve;
206 
207 	/*
208 	 * Set if a transaction has to be aborted but the attempt to roll back
209 	 * to the previous (good) transaction failed.  The only pool metadata
210 	 * operation possible in this state is the closing of the device.
211 	 */
212 	bool fail_io:1;
213 
214 	/*
215 	 * Set once a thin-pool has been accessed through one of the interfaces
216 	 * that imply the pool is in-service (e.g. thin devices created/deleted,
217 	 * thin-pool message, metadata snapshots, etc).
218 	 */
219 	bool in_service:1;
220 
221 	/*
222 	 * Reading the space map roots can fail, so we read it into these
223 	 * buffers before the superblock is locked and updated.
224 	 */
225 	__u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
226 	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
227 };
228 
229 struct dm_thin_device {
230 	struct list_head list;
231 	struct dm_pool_metadata *pmd;
232 	dm_thin_id id;
233 
234 	int open_count;
235 	bool changed:1;
236 	bool aborted_with_changes:1;
237 	uint64_t mapped_blocks;
238 	uint64_t transaction_id;
239 	uint32_t creation_time;
240 	uint32_t snapshotted_time;
241 };
242 
243 /*----------------------------------------------------------------
244  * superblock validator
245  *--------------------------------------------------------------*/
246 
247 #define SUPERBLOCK_CSUM_XOR 160774
248 
249 static void sb_prepare_for_write(struct dm_block_validator *v,
250 				 struct dm_block *b,
251 				 size_t block_size)
252 {
253 	struct thin_disk_superblock *disk_super = dm_block_data(b);
254 
255 	disk_super->blocknr = cpu_to_le64(dm_block_location(b));
256 	disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
257 						      block_size - sizeof(__le32),
258 						      SUPERBLOCK_CSUM_XOR));
259 }
260 
261 static int sb_check(struct dm_block_validator *v,
262 		    struct dm_block *b,
263 		    size_t block_size)
264 {
265 	struct thin_disk_superblock *disk_super = dm_block_data(b);
266 	__le32 csum_le;
267 
268 	if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
269 		DMERR("sb_check failed: blocknr %llu: wanted %llu",
270 		      le64_to_cpu(disk_super->blocknr),
271 		      (unsigned long long)dm_block_location(b));
272 		return -ENOTBLK;
273 	}
274 
275 	if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) {
276 		DMERR("sb_check failed: magic %llu: wanted %llu",
277 		      le64_to_cpu(disk_super->magic),
278 		      (unsigned long long)THIN_SUPERBLOCK_MAGIC);
279 		return -EILSEQ;
280 	}
281 
282 	csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
283 					     block_size - sizeof(__le32),
284 					     SUPERBLOCK_CSUM_XOR));
285 	if (csum_le != disk_super->csum) {
286 		DMERR("sb_check failed: csum %u: wanted %u",
287 		      le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
288 		return -EILSEQ;
289 	}
290 
291 	return 0;
292 }
293 
294 static struct dm_block_validator sb_validator = {
295 	.name = "superblock",
296 	.prepare_for_write = sb_prepare_for_write,
297 	.check = sb_check
298 };
299 
300 /*----------------------------------------------------------------
301  * Methods for the btree value types
302  *--------------------------------------------------------------*/
303 
304 static uint64_t pack_block_time(dm_block_t b, uint32_t t)
305 {
306 	return (b << 24) | t;
307 }
308 
309 static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
310 {
311 	*b = v >> 24;
312 	*t = v & ((1 << 24) - 1);
313 }
314 
315 /*
316  * It's more efficient to call dm_sm_{inc,dec}_blocks as few times as
317  * possible.  'with_runs' reads contiguous runs of blocks, and calls the
318  * given sm function.
319  */
320 typedef int (*run_fn)(struct dm_space_map *, dm_block_t, dm_block_t);
321 
322 static void with_runs(struct dm_space_map *sm, const __le64 *value_le, unsigned count, run_fn fn)
323 {
324 	uint64_t b, begin, end;
325 	uint32_t t;
326 	bool in_run = false;
327 	unsigned i;
328 
329 	for (i = 0; i < count; i++, value_le++) {
330 		/* We know value_le is 8 byte aligned */
331 		unpack_block_time(le64_to_cpu(*value_le), &b, &t);
332 
333 		if (in_run) {
334 			if (b == end) {
335 				end++;
336 			} else {
337 				fn(sm, begin, end);
338 				begin = b;
339 				end = b + 1;
340 			}
341 		} else {
342 			in_run = true;
343 			begin = b;
344 			end = b + 1;
345 		}
346 	}
347 
348 	if (in_run)
349 		fn(sm, begin, end);
350 }
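/*
 * Example: if the packed values decode to blocks 100, 101, 102 and 200,
 * with_runs() makes exactly two calls: fn(sm, 100, 103) and
 * fn(sm, 200, 201), i.e. half-open [begin, end) runs.
 */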
351 
352 static void data_block_inc(void *context, const void *value_le, unsigned count)
353 {
354 	with_runs((struct dm_space_map *) context,
355 		  (const __le64 *) value_le, count, dm_sm_inc_blocks);
356 }
357 
358 static void data_block_dec(void *context, const void *value_le, unsigned count)
359 {
360 	with_runs((struct dm_space_map *) context,
361 		  (const __le64 *) value_le, count, dm_sm_dec_blocks);
362 }
363 
364 static int data_block_equal(void *context, const void *value1_le, const void *value2_le)
365 {
366 	__le64 v1_le, v2_le;
367 	uint64_t b1, b2;
368 	uint32_t t;
369 
370 	memcpy(&v1_le, value1_le, sizeof(v1_le));
371 	memcpy(&v2_le, value2_le, sizeof(v2_le));
372 	unpack_block_time(le64_to_cpu(v1_le), &b1, &t);
373 	unpack_block_time(le64_to_cpu(v2_le), &b2, &t);
374 
375 	return b1 == b2;
376 }
377 
378 static void subtree_inc(void *context, const void *value, unsigned count)
379 {
380 	struct dm_btree_info *info = context;
381 	const __le64 *root_le = value;
382 	unsigned i;
383 
384 	for (i = 0; i < count; i++, root_le++)
385 		dm_tm_inc(info->tm, le64_to_cpu(*root_le));
386 }
387 
388 static void subtree_dec(void *context, const void *value, unsigned count)
389 {
390 	struct dm_btree_info *info = context;
391 	const __le64 *root_le = value;
392 	unsigned i;
393 
394 	for (i = 0; i < count; i++, root_le++)
395 		if (dm_btree_del(info, le64_to_cpu(*root_le)))
396 			DMERR("btree delete failed");
397 }
398 
399 static int subtree_equal(void *context, const void *value1_le, const void *value2_le)
400 {
401 	__le64 v1_le, v2_le;
402 	memcpy(&v1_le, value1_le, sizeof(v1_le));
403 	memcpy(&v2_le, value2_le, sizeof(v2_le));
404 
405 	return v1_le == v2_le;
406 }
407 
408 /*----------------------------------------------------------------*/
409 
410 /*
411  * Variant that is used for in-core only changes or code that
412  * shouldn't put the pool in service on its own (e.g. commit).
413  */
414 static inline void pmd_write_lock_in_core(struct dm_pool_metadata *pmd)
415 	__acquires(pmd->root_lock)
416 {
417 	down_write(&pmd->root_lock);
418 }
419 
420 static inline void pmd_write_lock(struct dm_pool_metadata *pmd)
421 {
422 	pmd_write_lock_in_core(pmd);
423 	if (unlikely(!pmd->in_service))
424 		pmd->in_service = true;
425 }
426 
427 static inline void pmd_write_unlock(struct dm_pool_metadata *pmd)
428 	__releases(pmd->root_lock)
429 {
430 	up_write(&pmd->root_lock);
431 }
432 
433 /*----------------------------------------------------------------*/
434 
435 static int superblock_lock_zero(struct dm_pool_metadata *pmd,
436 				struct dm_block **sblock)
437 {
438 	return dm_bm_write_lock_zero(pmd->bm, THIN_SUPERBLOCK_LOCATION,
439 				     &sb_validator, sblock);
440 }
441 
442 static int superblock_lock(struct dm_pool_metadata *pmd,
443 			   struct dm_block **sblock)
444 {
445 	return dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
446 				&sb_validator, sblock);
447 }
448 
449 static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
450 {
451 	int r;
452 	unsigned i;
453 	struct dm_block *b;
454 	__le64 *data_le, zero = cpu_to_le64(0);
455 	unsigned block_size = dm_bm_block_size(bm) / sizeof(__le64);
456 
457 	/*
458 	 * We can't use a validator here - it may be all zeroes.
459 	 */
460 	r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b);
461 	if (r)
462 		return r;
463 
464 	data_le = dm_block_data(b);
465 	*result = 1;
466 	for (i = 0; i < block_size; i++) {
467 		if (data_le[i] != zero) {
468 			*result = 0;
469 			break;
470 		}
471 	}
472 
473 	dm_bm_unlock(b);
474 
475 	return 0;
476 }
477 
478 static void __setup_btree_details(struct dm_pool_metadata *pmd)
479 {
480 	pmd->info.tm = pmd->tm;
481 	pmd->info.levels = 2;
482 	pmd->info.value_type.context = pmd->data_sm;
483 	pmd->info.value_type.size = sizeof(__le64);
484 	pmd->info.value_type.inc = data_block_inc;
485 	pmd->info.value_type.dec = data_block_dec;
486 	pmd->info.value_type.equal = data_block_equal;
487 
488 	memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info));
489 	pmd->nb_info.tm = pmd->nb_tm;
490 
491 	pmd->tl_info.tm = pmd->tm;
492 	pmd->tl_info.levels = 1;
493 	pmd->tl_info.value_type.context = &pmd->bl_info;
494 	pmd->tl_info.value_type.size = sizeof(__le64);
495 	pmd->tl_info.value_type.inc = subtree_inc;
496 	pmd->tl_info.value_type.dec = subtree_dec;
497 	pmd->tl_info.value_type.equal = subtree_equal;
498 
499 	pmd->bl_info.tm = pmd->tm;
500 	pmd->bl_info.levels = 1;
501 	pmd->bl_info.value_type.context = pmd->data_sm;
502 	pmd->bl_info.value_type.size = sizeof(__le64);
503 	pmd->bl_info.value_type.inc = data_block_inc;
504 	pmd->bl_info.value_type.dec = data_block_dec;
505 	pmd->bl_info.value_type.equal = data_block_equal;
506 
507 	pmd->details_info.tm = pmd->tm;
508 	pmd->details_info.levels = 1;
509 	pmd->details_info.value_type.context = NULL;
510 	pmd->details_info.value_type.size = sizeof(struct disk_device_details);
511 	pmd->details_info.value_type.inc = NULL;
512 	pmd->details_info.value_type.dec = NULL;
513 	pmd->details_info.value_type.equal = NULL;
514 }
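/*
 * To summarise the btree infos configured above:
 *
 *   info         - the full 2-level (dev id, virtual block) -> block_time
 *                  mapping tree; its values ref count data blocks.
 *   nb_info      - the same tree, driven by the non-blocking clone tm.
 *   tl_info      - the top level only; its values are roots of per-device
 *                  subtrees, hence the subtree_inc/dec/equal callbacks.
 *   bl_info      - a single per-device (bottom level) mapping tree.
 *   details_info - dev id -> struct disk_device_details; plain values,
 *                  so no inc/dec/equal callbacks are needed.
 */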
515 
516 static int save_sm_roots(struct dm_pool_metadata *pmd)
517 {
518 	int r;
519 	size_t len;
520 
521 	r = dm_sm_root_size(pmd->metadata_sm, &len);
522 	if (r < 0)
523 		return r;
524 
525 	r = dm_sm_copy_root(pmd->metadata_sm, &pmd->metadata_space_map_root, len);
526 	if (r < 0)
527 		return r;
528 
529 	r = dm_sm_root_size(pmd->data_sm, &len);
530 	if (r < 0)
531 		return r;
532 
533 	return dm_sm_copy_root(pmd->data_sm, &pmd->data_space_map_root, len);
534 }
535 
536 static void copy_sm_roots(struct dm_pool_metadata *pmd,
537 			  struct thin_disk_superblock *disk)
538 {
539 	memcpy(&disk->metadata_space_map_root,
540 	       &pmd->metadata_space_map_root,
541 	       sizeof(pmd->metadata_space_map_root));
542 
543 	memcpy(&disk->data_space_map_root,
544 	       &pmd->data_space_map_root,
545 	       sizeof(pmd->data_space_map_root));
546 }
547 
548 static int __write_initial_superblock(struct dm_pool_metadata *pmd)
549 {
550 	int r;
551 	struct dm_block *sblock;
552 	struct thin_disk_superblock *disk_super;
553 	sector_t bdev_size = bdev_nr_sectors(pmd->bdev);
554 
555 	if (bdev_size > THIN_METADATA_MAX_SECTORS)
556 		bdev_size = THIN_METADATA_MAX_SECTORS;
557 
558 	r = dm_sm_commit(pmd->data_sm);
559 	if (r < 0)
560 		return r;
561 
562 	r = dm_tm_pre_commit(pmd->tm);
563 	if (r < 0)
564 		return r;
565 
566 	r = save_sm_roots(pmd);
567 	if (r < 0)
568 		return r;
569 
570 	r = superblock_lock_zero(pmd, &sblock);
571 	if (r)
572 		return r;
573 
574 	disk_super = dm_block_data(sblock);
575 	disk_super->flags = 0;
576 	memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
577 	disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
578 	disk_super->version = cpu_to_le32(THIN_VERSION);
579 	disk_super->time = 0;
580 	disk_super->trans_id = 0;
581 	disk_super->held_root = 0;
582 
583 	copy_sm_roots(pmd, disk_super);
584 
585 	disk_super->data_mapping_root = cpu_to_le64(pmd->root);
586 	disk_super->device_details_root = cpu_to_le64(pmd->details_root);
587 	disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE);
588 	disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
589 	disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);
590 
591 	return dm_tm_commit(pmd->tm, sblock);
592 }
593 
594 static int __format_metadata(struct dm_pool_metadata *pmd)
595 {
596 	int r;
597 
598 	r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
599 				 &pmd->tm, &pmd->metadata_sm);
600 	if (r < 0) {
601 		DMERR("tm_create_with_sm failed");
602 		return r;
603 	}
604 
605 	pmd->data_sm = dm_sm_disk_create(pmd->tm, 0);
606 	if (IS_ERR(pmd->data_sm)) {
607 		DMERR("sm_disk_create failed");
608 		r = PTR_ERR(pmd->data_sm);
609 		goto bad_cleanup_tm;
610 	}
611 
612 	pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
613 	if (!pmd->nb_tm) {
614 		DMERR("could not create non-blocking clone tm");
615 		r = -ENOMEM;
616 		goto bad_cleanup_data_sm;
617 	}
618 
619 	__setup_btree_details(pmd);
620 
621 	r = dm_btree_empty(&pmd->info, &pmd->root);
622 	if (r < 0)
623 		goto bad_cleanup_nb_tm;
624 
625 	r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
626 	if (r < 0) {
627 		DMERR("couldn't create devices root");
628 		goto bad_cleanup_nb_tm;
629 	}
630 
631 	r = __write_initial_superblock(pmd);
632 	if (r)
633 		goto bad_cleanup_nb_tm;
634 
635 	return 0;
636 
637 bad_cleanup_nb_tm:
638 	dm_tm_destroy(pmd->nb_tm);
639 bad_cleanup_data_sm:
640 	dm_sm_destroy(pmd->data_sm);
641 bad_cleanup_tm:
642 	dm_tm_destroy(pmd->tm);
643 	dm_sm_destroy(pmd->metadata_sm);
644 
645 	return r;
646 }
647 
648 static int __check_incompat_features(struct thin_disk_superblock *disk_super,
649 				     struct dm_pool_metadata *pmd)
650 {
651 	uint32_t features;
652 
653 	features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
654 	if (features) {
655 		DMERR("could not access metadata due to unsupported optional features (%lx).",
656 		      (unsigned long)features);
657 		return -EINVAL;
658 	}
659 
660 	/*
661 	 * Check for read-only metadata to skip the following RDWR checks.
662 	 */
663 	if (bdev_read_only(pmd->bdev))
664 		return 0;
665 
666 	features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
667 	if (features) {
668 		DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
669 		      (unsigned long)features);
670 		return -EINVAL;
671 	}
672 
673 	return 0;
674 }
675 
676 static int __open_metadata(struct dm_pool_metadata *pmd)
677 {
678 	int r;
679 	struct dm_block *sblock;
680 	struct thin_disk_superblock *disk_super;
681 
682 	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
683 			    &sb_validator, &sblock);
684 	if (r < 0) {
685 		DMERR("couldn't read superblock");
686 		return r;
687 	}
688 
689 	disk_super = dm_block_data(sblock);
690 
691 	/* Verify the data block size hasn't changed */
692 	if (le32_to_cpu(disk_super->data_block_size) != pmd->data_block_size) {
693 		DMERR("changing the data block size (from %u to %llu) is not supported",
694 		      le32_to_cpu(disk_super->data_block_size),
695 		      (unsigned long long)pmd->data_block_size);
696 		r = -EINVAL;
697 		goto bad_unlock_sblock;
698 	}
699 
700 	r = __check_incompat_features(disk_super, pmd);
701 	if (r < 0)
702 		goto bad_unlock_sblock;
703 
704 	r = dm_tm_open_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
705 			       disk_super->metadata_space_map_root,
706 			       sizeof(disk_super->metadata_space_map_root),
707 			       &pmd->tm, &pmd->metadata_sm);
708 	if (r < 0) {
709 		DMERR("tm_open_with_sm failed");
710 		goto bad_unlock_sblock;
711 	}
712 
713 	pmd->data_sm = dm_sm_disk_open(pmd->tm, disk_super->data_space_map_root,
714 				       sizeof(disk_super->data_space_map_root));
715 	if (IS_ERR(pmd->data_sm)) {
716 		DMERR("sm_disk_open failed");
717 		r = PTR_ERR(pmd->data_sm);
718 		goto bad_cleanup_tm;
719 	}
720 
721 	pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
722 	if (!pmd->nb_tm) {
723 		DMERR("could not create non-blocking clone tm");
724 		r = -ENOMEM;
725 		goto bad_cleanup_data_sm;
726 	}
727 
728 	/*
729 	 * When opening the pool metadata, setting these roots is redundant
730 	 * because they will be set again in __begin_transaction().  But the
731 	 * pool-abort path really does need the last transaction's roots, to
732 	 * avoid accessing a broken btree.
733 	 */
734 	pmd->root = le64_to_cpu(disk_super->data_mapping_root);
735 	pmd->details_root = le64_to_cpu(disk_super->device_details_root);
736 
737 	__setup_btree_details(pmd);
738 	dm_bm_unlock(sblock);
739 
740 	return 0;
741 
742 bad_cleanup_data_sm:
743 	dm_sm_destroy(pmd->data_sm);
744 bad_cleanup_tm:
745 	dm_tm_destroy(pmd->tm);
746 	dm_sm_destroy(pmd->metadata_sm);
747 bad_unlock_sblock:
748 	dm_bm_unlock(sblock);
749 
750 	return r;
751 }
752 
753 static int __open_or_format_metadata(struct dm_pool_metadata *pmd, bool format_device)
754 {
755 	int r, unformatted;
756 
757 	r = __superblock_all_zeroes(pmd->bm, &unformatted);
758 	if (r)
759 		return r;
760 
761 	if (unformatted)
762 		return format_device ? __format_metadata(pmd) : -EPERM;
763 
764 	return __open_metadata(pmd);
765 }
766 
767 static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool format_device)
768 {
769 	int r;
770 
771 	pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
772 					  THIN_MAX_CONCURRENT_LOCKS);
773 	if (IS_ERR(pmd->bm)) {
774 		DMERR("could not create block manager");
775 		r = PTR_ERR(pmd->bm);
776 		pmd->bm = NULL;
777 		return r;
778 	}
779 
780 	r = __open_or_format_metadata(pmd, format_device);
781 	if (r) {
782 		dm_block_manager_destroy(pmd->bm);
783 		pmd->bm = NULL;
784 	}
785 
786 	return r;
787 }
788 
789 static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd,
790 					      bool destroy_bm)
791 {
792 	dm_sm_destroy(pmd->data_sm);
793 	dm_sm_destroy(pmd->metadata_sm);
794 	dm_tm_destroy(pmd->nb_tm);
795 	dm_tm_destroy(pmd->tm);
796 	if (destroy_bm)
797 		dm_block_manager_destroy(pmd->bm);
798 }
799 
800 static int __begin_transaction(struct dm_pool_metadata *pmd)
801 {
802 	int r;
803 	struct thin_disk_superblock *disk_super;
804 	struct dm_block *sblock;
805 
806 	/*
807 	 * We re-read the superblock every time, though we shouldn't really
808 	 * need to.
809 	 */
810 	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
811 			    &sb_validator, &sblock);
812 	if (r)
813 		return r;
814 
815 	disk_super = dm_block_data(sblock);
816 	pmd->time = le32_to_cpu(disk_super->time);
817 	pmd->root = le64_to_cpu(disk_super->data_mapping_root);
818 	pmd->details_root = le64_to_cpu(disk_super->device_details_root);
819 	pmd->trans_id = le64_to_cpu(disk_super->trans_id);
820 	pmd->flags = le32_to_cpu(disk_super->flags);
821 	pmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
822 
823 	dm_bm_unlock(sblock);
824 	return 0;
825 }
826 
827 static int __write_changed_details(struct dm_pool_metadata *pmd)
828 {
829 	int r;
830 	struct dm_thin_device *td, *tmp;
831 	struct disk_device_details details;
832 	uint64_t key;
833 
834 	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
835 		if (!td->changed)
836 			continue;
837 
838 		key = td->id;
839 
840 		details.mapped_blocks = cpu_to_le64(td->mapped_blocks);
841 		details.transaction_id = cpu_to_le64(td->transaction_id);
842 		details.creation_time = cpu_to_le32(td->creation_time);
843 		details.snapshotted_time = cpu_to_le32(td->snapshotted_time);
844 		__dm_bless_for_disk(&details);
845 
846 		r = dm_btree_insert(&pmd->details_info, pmd->details_root,
847 				    &key, &details, &pmd->details_root);
848 		if (r)
849 			return r;
850 
851 		if (td->open_count) {
852 			td->changed = false;
853 		} else {
854 			list_del(&td->list);
855 			kfree(td);
856 		}
857 	}
858 
859 	return 0;
860 }
861 
862 static int __commit_transaction(struct dm_pool_metadata *pmd)
863 {
864 	int r;
865 	struct thin_disk_superblock *disk_super;
866 	struct dm_block *sblock;
867 
868 	/*
869 	 * The thin_disk_superblock must fit within a 512-byte sector.
870 	 */
871 	BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
872 	BUG_ON(!rwsem_is_locked(&pmd->root_lock));
873 
874 	if (unlikely(!pmd->in_service))
875 		return 0;
876 
877 	if (pmd->pre_commit_fn) {
878 		r = pmd->pre_commit_fn(pmd->pre_commit_context);
879 		if (r < 0) {
880 			DMERR("pre-commit callback failed");
881 			return r;
882 		}
883 	}
884 
885 	r = __write_changed_details(pmd);
886 	if (r < 0)
887 		return r;
888 
889 	r = dm_sm_commit(pmd->data_sm);
890 	if (r < 0)
891 		return r;
892 
893 	r = dm_tm_pre_commit(pmd->tm);
894 	if (r < 0)
895 		return r;
896 
897 	r = save_sm_roots(pmd);
898 	if (r < 0)
899 		return r;
900 
901 	r = superblock_lock(pmd, &sblock);
902 	if (r)
903 		return r;
904 
905 	disk_super = dm_block_data(sblock);
906 	disk_super->time = cpu_to_le32(pmd->time);
907 	disk_super->data_mapping_root = cpu_to_le64(pmd->root);
908 	disk_super->device_details_root = cpu_to_le64(pmd->details_root);
909 	disk_super->trans_id = cpu_to_le64(pmd->trans_id);
910 	disk_super->flags = cpu_to_le32(pmd->flags);
911 
912 	copy_sm_roots(pmd, disk_super);
913 
914 	return dm_tm_commit(pmd->tm, sblock);
915 }
916 
917 static void __set_metadata_reserve(struct dm_pool_metadata *pmd)
918 {
919 	int r;
920 	dm_block_t total;
921 	dm_block_t max_blocks = 4096; /* 16M */
922 
923 	r = dm_sm_get_nr_blocks(pmd->metadata_sm, &total);
924 	if (r) {
925 		DMERR("could not get size of metadata device");
926 		pmd->metadata_reserve = max_blocks;
927 	} else
928 		pmd->metadata_reserve = min(max_blocks, div_u64(total, 10));
929 }
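/*
 * Worked example: with 4KiB metadata blocks, a 1GiB metadata device has
 * 262144 blocks, so the reserve is min(4096, 262144 / 10) = 4096 blocks,
 * i.e. the full 16MiB.  Only devices below roughly 160MiB reserve less.
 */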
930 
931 struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
932 					       sector_t data_block_size,
933 					       bool format_device)
934 {
935 	int r;
936 	struct dm_pool_metadata *pmd;
937 
938 	pmd = kmalloc(sizeof(*pmd), GFP_KERNEL);
939 	if (!pmd) {
940 		DMERR("could not allocate metadata struct");
941 		return ERR_PTR(-ENOMEM);
942 	}
943 
944 	init_rwsem(&pmd->root_lock);
945 	pmd->time = 0;
946 	INIT_LIST_HEAD(&pmd->thin_devices);
947 	pmd->fail_io = false;
948 	pmd->in_service = false;
949 	pmd->bdev = bdev;
950 	pmd->data_block_size = data_block_size;
951 	pmd->pre_commit_fn = NULL;
952 	pmd->pre_commit_context = NULL;
953 
954 	r = __create_persistent_data_objects(pmd, format_device);
955 	if (r) {
956 		kfree(pmd);
957 		return ERR_PTR(r);
958 	}
959 
960 	r = __begin_transaction(pmd);
961 	if (r < 0) {
962 		if (dm_pool_metadata_close(pmd) < 0)
963 			DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
964 		return ERR_PTR(r);
965 	}
966 
967 	__set_metadata_reserve(pmd);
968 
969 	return pmd;
970 }
971 
972 int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
973 {
974 	int r;
975 	unsigned open_devices = 0;
976 	struct dm_thin_device *td, *tmp;
977 
978 	down_read(&pmd->root_lock);
979 	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
980 		if (td->open_count) {
981 			open_devices++;
982 		} else {
983 			list_del(&td->list);
984 			kfree(td);
985 		}
986 	}
987 	up_read(&pmd->root_lock);
988 
989 	if (open_devices) {
990 		DMERR("attempt to close pmd when %u device(s) are still open",
991 		       open_devices);
992 		return -EBUSY;
993 	}
994 
995 	pmd_write_lock_in_core(pmd);
996 	if (!pmd->fail_io && !dm_bm_is_read_only(pmd->bm)) {
997 		r = __commit_transaction(pmd);
998 		if (r < 0)
999 			DMWARN("%s: __commit_transaction() failed, error = %d",
1000 			       __func__, r);
1001 	}
1002 	pmd_write_unlock(pmd);
1003 	if (!pmd->fail_io)
1004 		__destroy_persistent_data_objects(pmd, true);
1005 
1006 	kfree(pmd);
1007 	return 0;
1008 }
1009 
1010 /*
1011  * __open_device: Returns @td corresponding to device with id @dev,
1012  * creating it if @create is set and incrementing @td->open_count.
1013  * On failure, @td is undefined.
1014  */
1015 static int __open_device(struct dm_pool_metadata *pmd,
1016 			 dm_thin_id dev, int create,
1017 			 struct dm_thin_device **td)
1018 {
1019 	int r, changed = 0;
1020 	struct dm_thin_device *td2;
1021 	uint64_t key = dev;
1022 	struct disk_device_details details_le;
1023 
1024 	/*
1025 	 * If the device is already open, return it.
1026 	 */
1027 	list_for_each_entry(td2, &pmd->thin_devices, list)
1028 		if (td2->id == dev) {
1029 			/*
1030 			 * May not create an already-open device.
1031 			 */
1032 			if (create)
1033 				return -EEXIST;
1034 
1035 			td2->open_count++;
1036 			*td = td2;
1037 			return 0;
1038 		}
1039 
1040 	/*
1041 	 * Check the device exists.
1042 	 */
1043 	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
1044 			    &key, &details_le);
1045 	if (r) {
1046 		if (r != -ENODATA || !create)
1047 			return r;
1048 
1049 		/*
1050 		 * Create new device.
1051 		 */
1052 		changed = 1;
1053 		details_le.mapped_blocks = 0;
1054 		details_le.transaction_id = cpu_to_le64(pmd->trans_id);
1055 		details_le.creation_time = cpu_to_le32(pmd->time);
1056 		details_le.snapshotted_time = cpu_to_le32(pmd->time);
1057 	}
1058 
1059 	*td = kmalloc(sizeof(**td), GFP_NOIO);
1060 	if (!*td)
1061 		return -ENOMEM;
1062 
1063 	(*td)->pmd = pmd;
1064 	(*td)->id = dev;
1065 	(*td)->open_count = 1;
1066 	(*td)->changed = changed;
1067 	(*td)->aborted_with_changes = false;
1068 	(*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks);
1069 	(*td)->transaction_id = le64_to_cpu(details_le.transaction_id);
1070 	(*td)->creation_time = le32_to_cpu(details_le.creation_time);
1071 	(*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time);
1072 
1073 	list_add(&(*td)->list, &pmd->thin_devices);
1074 
1075 	return 0;
1076 }
1077 
1078 static void __close_device(struct dm_thin_device *td)
1079 {
1080 	--td->open_count;
1081 }
1082 
1083 static int __create_thin(struct dm_pool_metadata *pmd,
1084 			 dm_thin_id dev)
1085 {
1086 	int r;
1087 	dm_block_t dev_root;
1088 	uint64_t key = dev;
1089 	struct dm_thin_device *td;
1090 	__le64 value;
1091 
1092 	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
1093 			    &key, NULL);
1094 	if (!r)
1095 		return -EEXIST;
1096 
1097 	/*
1098 	 * Create an empty btree for the mappings.
1099 	 */
1100 	r = dm_btree_empty(&pmd->bl_info, &dev_root);
1101 	if (r)
1102 		return r;
1103 
1104 	/*
1105 	 * Insert it into the main mapping tree.
1106 	 */
1107 	value = cpu_to_le64(dev_root);
1108 	__dm_bless_for_disk(&value);
1109 	r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
1110 	if (r) {
1111 		dm_btree_del(&pmd->bl_info, dev_root);
1112 		return r;
1113 	}
1114 
1115 	r = __open_device(pmd, dev, 1, &td);
1116 	if (r) {
1117 		dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
1118 		dm_btree_del(&pmd->bl_info, dev_root);
1119 		return r;
1120 	}
1121 	__close_device(td);
1122 
1123 	return r;
1124 }
1125 
1126 int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
1127 {
1128 	int r = -EINVAL;
1129 
1130 	pmd_write_lock(pmd);
1131 	if (!pmd->fail_io)
1132 		r = __create_thin(pmd, dev);
1133 	pmd_write_unlock(pmd);
1134 
1135 	return r;
1136 }
1137 
1138 static int __set_snapshot_details(struct dm_pool_metadata *pmd,
1139 				  struct dm_thin_device *snap,
1140 				  dm_thin_id origin, uint32_t time)
1141 {
1142 	int r;
1143 	struct dm_thin_device *td;
1144 
1145 	r = __open_device(pmd, origin, 0, &td);
1146 	if (r)
1147 		return r;
1148 
1149 	td->changed = true;
1150 	td->snapshotted_time = time;
1151 
1152 	snap->mapped_blocks = td->mapped_blocks;
1153 	snap->snapshotted_time = time;
1154 	__close_device(td);
1155 
1156 	return 0;
1157 }
1158 
1159 static int __create_snap(struct dm_pool_metadata *pmd,
1160 			 dm_thin_id dev, dm_thin_id origin)
1161 {
1162 	int r;
1163 	dm_block_t origin_root;
1164 	uint64_t key = origin, dev_key = dev;
1165 	struct dm_thin_device *td;
1166 	__le64 value;
1167 
1168 	/* check this device is unused */
1169 	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
1170 			    &dev_key, NULL);
1171 	if (!r)
1172 		return -EEXIST;
1173 
1174 	/* find the mapping tree for the origin */
1175 	r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value);
1176 	if (r)
1177 		return r;
1178 	origin_root = le64_to_cpu(value);
1179 
1180 	/* clone the origin, an inc will do */
1181 	dm_tm_inc(pmd->tm, origin_root);
1182 
1183 	/* insert into the main mapping tree */
1184 	value = cpu_to_le64(origin_root);
1185 	__dm_bless_for_disk(&value);
1186 	key = dev;
1187 	r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
1188 	if (r) {
1189 		dm_tm_dec(pmd->tm, origin_root);
1190 		return r;
1191 	}
1192 
1193 	pmd->time++;
1194 
1195 	r = __open_device(pmd, dev, 1, &td);
1196 	if (r)
1197 		goto bad;
1198 
1199 	r = __set_snapshot_details(pmd, td, origin, pmd->time);
1200 	__close_device(td);
1201 
1202 	if (r)
1203 		goto bad;
1204 
1205 	return 0;
1206 
1207 bad:
1208 	dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
1209 	dm_btree_remove(&pmd->details_info, pmd->details_root,
1210 			&key, &pmd->details_root);
1211 	return r;
1212 }
1213 
1214 int dm_pool_create_snap(struct dm_pool_metadata *pmd,
1215 				 dm_thin_id dev,
1216 				 dm_thin_id origin)
1217 {
1218 	int r = -EINVAL;
1219 
1220 	pmd_write_lock(pmd);
1221 	if (!pmd->fail_io)
1222 		r = __create_snap(pmd, dev, origin);
1223 	pmd_write_unlock(pmd);
1224 
1225 	return r;
1226 }
1227 
1228 static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev)
1229 {
1230 	int r;
1231 	uint64_t key = dev;
1232 	struct dm_thin_device *td;
1233 
1234 	/* TODO: failure should mark the transaction invalid */
1235 	r = __open_device(pmd, dev, 0, &td);
1236 	if (r)
1237 		return r;
1238 
1239 	if (td->open_count > 1) {
1240 		__close_device(td);
1241 		return -EBUSY;
1242 	}
1243 
1244 	list_del(&td->list);
1245 	kfree(td);
1246 	r = dm_btree_remove(&pmd->details_info, pmd->details_root,
1247 			    &key, &pmd->details_root);
1248 	if (r)
1249 		return r;
1250 
1251 	r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
1252 	if (r)
1253 		return r;
1254 
1255 	return 0;
1256 }
1257 
1258 int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
1259 			       dm_thin_id dev)
1260 {
1261 	int r = -EINVAL;
1262 
1263 	pmd_write_lock(pmd);
1264 	if (!pmd->fail_io)
1265 		r = __delete_device(pmd, dev);
1266 	pmd_write_unlock(pmd);
1267 
1268 	return r;
1269 }
1270 
1271 int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
1272 					uint64_t current_id,
1273 					uint64_t new_id)
1274 {
1275 	int r = -EINVAL;
1276 
1277 	pmd_write_lock(pmd);
1278 
1279 	if (pmd->fail_io)
1280 		goto out;
1281 
1282 	if (pmd->trans_id != current_id) {
1283 		DMERR("mismatched transaction id");
1284 		goto out;
1285 	}
1286 
1287 	pmd->trans_id = new_id;
1288 	r = 0;
1289 
1290 out:
1291 	pmd_write_unlock(pmd);
1292 
1293 	return r;
1294 }
1295 
1296 int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
1297 					uint64_t *result)
1298 {
1299 	int r = -EINVAL;
1300 
1301 	down_read(&pmd->root_lock);
1302 	if (!pmd->fail_io) {
1303 		*result = pmd->trans_id;
1304 		r = 0;
1305 	}
1306 	up_read(&pmd->root_lock);
1307 
1308 	return r;
1309 }
1310 
1311 static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
1312 {
1313 	int r, inc;
1314 	struct thin_disk_superblock *disk_super;
1315 	struct dm_block *copy, *sblock;
1316 	dm_block_t held_root;
1317 
1318 	/*
1319 	 * We commit to ensure the btree roots which we increment in a
1320 	 * moment are up to date.
1321 	 */
1322 	r = __commit_transaction(pmd);
1323 	if (r < 0) {
1324 		DMWARN("%s: __commit_transaction() failed, error = %d",
1325 		       __func__, r);
1326 		return r;
1327 	}
1328 
1329 	/*
1330 	 * Copy the superblock.
1331 	 */
1332 	dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION);
1333 	r = dm_tm_shadow_block(pmd->tm, THIN_SUPERBLOCK_LOCATION,
1334 			       &sb_validator, &copy, &inc);
1335 	if (r)
1336 		return r;
1337 
1338 	BUG_ON(!inc);
1339 
1340 	held_root = dm_block_location(copy);
1341 	disk_super = dm_block_data(copy);
1342 
1343 	if (le64_to_cpu(disk_super->held_root)) {
1344 		DMWARN("Pool metadata snapshot already exists: release this before taking another.");
1345 
1346 		dm_tm_dec(pmd->tm, held_root);
1347 		dm_tm_unlock(pmd->tm, copy);
1348 		return -EBUSY;
1349 	}
1350 
1351 	/*
1352 	 * Wipe the space map roots since we're not publishing this copy.
1353 	 */
1354 	memset(&disk_super->data_space_map_root, 0,
1355 	       sizeof(disk_super->data_space_map_root));
1356 	memset(&disk_super->metadata_space_map_root, 0,
1357 	       sizeof(disk_super->metadata_space_map_root));
1358 
1359 	/*
1360 	 * Increment the data structures that need to be preserved.
1361 	 */
1362 	dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root));
1363 	dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root));
1364 	dm_tm_unlock(pmd->tm, copy);
1365 
1366 	/*
1367 	 * Write the held root into the superblock.
1368 	 */
1369 	r = superblock_lock(pmd, &sblock);
1370 	if (r) {
1371 		dm_tm_dec(pmd->tm, held_root);
1372 		return r;
1373 	}
1374 
1375 	disk_super = dm_block_data(sblock);
1376 	disk_super->held_root = cpu_to_le64(held_root);
1377 	dm_bm_unlock(sblock);
1378 	return 0;
1379 }
1380 
1381 int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
1382 {
1383 	int r = -EINVAL;
1384 
1385 	pmd_write_lock(pmd);
1386 	if (!pmd->fail_io)
1387 		r = __reserve_metadata_snap(pmd);
1388 	pmd_write_unlock(pmd);
1389 
1390 	return r;
1391 }
1392 
1393 static int __release_metadata_snap(struct dm_pool_metadata *pmd)
1394 {
1395 	int r;
1396 	struct thin_disk_superblock *disk_super;
1397 	struct dm_block *sblock, *copy;
1398 	dm_block_t held_root;
1399 
1400 	r = superblock_lock(pmd, &sblock);
1401 	if (r)
1402 		return r;
1403 
1404 	disk_super = dm_block_data(sblock);
1405 	held_root = le64_to_cpu(disk_super->held_root);
1406 	disk_super->held_root = cpu_to_le64(0);
1407 
1408 	dm_bm_unlock(sblock);
1409 
1410 	if (!held_root) {
1411 		DMWARN("No pool metadata snapshot found: nothing to release.");
1412 		return -EINVAL;
1413 	}
1414 
1415 	r = dm_tm_read_lock(pmd->tm, held_root, &sb_validator, &copy);
1416 	if (r)
1417 		return r;
1418 
1419 	disk_super = dm_block_data(copy);
1420 	dm_btree_del(&pmd->info, le64_to_cpu(disk_super->data_mapping_root));
1421 	dm_btree_del(&pmd->details_info, le64_to_cpu(disk_super->device_details_root));
1422 	dm_sm_dec_block(pmd->metadata_sm, held_root);
1423 
1424 	dm_tm_unlock(pmd->tm, copy);
1425 
1426 	return 0;
1427 }
1428 
1429 int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
1430 {
1431 	int r = -EINVAL;
1432 
1433 	pmd_write_lock(pmd);
1434 	if (!pmd->fail_io)
1435 		r = __release_metadata_snap(pmd);
1436 	pmd_write_unlock(pmd);
1437 
1438 	return r;
1439 }
1440 
1441 static int __get_metadata_snap(struct dm_pool_metadata *pmd,
1442 			       dm_block_t *result)
1443 {
1444 	int r;
1445 	struct thin_disk_superblock *disk_super;
1446 	struct dm_block *sblock;
1447 
1448 	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
1449 			    &sb_validator, &sblock);
1450 	if (r)
1451 		return r;
1452 
1453 	disk_super = dm_block_data(sblock);
1454 	*result = le64_to_cpu(disk_super->held_root);
1455 
1456 	dm_bm_unlock(sblock);
1457 
1458 	return 0;
1459 }
1460 
1461 int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
1462 			      dm_block_t *result)
1463 {
1464 	int r = -EINVAL;
1465 
1466 	down_read(&pmd->root_lock);
1467 	if (!pmd->fail_io)
1468 		r = __get_metadata_snap(pmd, result);
1469 	up_read(&pmd->root_lock);
1470 
1471 	return r;
1472 }
1473 
1474 int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
1475 			     struct dm_thin_device **td)
1476 {
1477 	int r = -EINVAL;
1478 
1479 	pmd_write_lock_in_core(pmd);
1480 	if (!pmd->fail_io)
1481 		r = __open_device(pmd, dev, 0, td);
1482 	pmd_write_unlock(pmd);
1483 
1484 	return r;
1485 }
1486 
1487 int dm_pool_close_thin_device(struct dm_thin_device *td)
1488 {
1489 	pmd_write_lock_in_core(td->pmd);
1490 	__close_device(td);
1491 	pmd_write_unlock(td->pmd);
1492 
1493 	return 0;
1494 }
1495 
1496 dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
1497 {
1498 	return td->id;
1499 }
1500 
1501 /*
1502  * Check whether @time (of block creation) is older than @td's last snapshot.
1503  * If so then the associated block is shared with the last snapshot device.
1504  * Any block on a device created *after* the device last got snapshotted is
1505  * necessarily not shared.
1506  */
1507 static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
1508 {
1509 	return td->snapshotted_time > time;
1510 }
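/*
 * E.g. a block created at time 5 on a device last snapshotted at time 7
 * is shared, since __snapshotted_since(td, 5) is true (7 > 5).
 */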
1511 
1512 static void unpack_lookup_result(struct dm_thin_device *td, __le64 value,
1513 				 struct dm_thin_lookup_result *result)
1514 {
1515 	uint64_t block_time = 0;
1516 	dm_block_t exception_block;
1517 	uint32_t exception_time;
1518 
1519 	block_time = le64_to_cpu(value);
1520 	unpack_block_time(block_time, &exception_block, &exception_time);
1521 	result->block = exception_block;
1522 	result->shared = __snapshotted_since(td, exception_time);
1523 }
1524 
1525 static int __find_block(struct dm_thin_device *td, dm_block_t block,
1526 			int can_issue_io, struct dm_thin_lookup_result *result)
1527 {
1528 	int r;
1529 	__le64 value;
1530 	struct dm_pool_metadata *pmd = td->pmd;
1531 	dm_block_t keys[2] = { td->id, block };
1532 	struct dm_btree_info *info;
1533 
1534 	if (can_issue_io)
1535 		info = &pmd->info;
1536 	else
1537 		info = &pmd->nb_info;
1538 
1539 	r = dm_btree_lookup(info, pmd->root, keys, &value);
1540 	if (!r)
1541 		unpack_lookup_result(td, value, result);
1542 
1543 	return r;
1544 }
1545 
1546 int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
1547 		       int can_issue_io, struct dm_thin_lookup_result *result)
1548 {
1549 	int r;
1550 	struct dm_pool_metadata *pmd = td->pmd;
1551 
1552 	down_read(&pmd->root_lock);
1553 	if (pmd->fail_io) {
1554 		up_read(&pmd->root_lock);
1555 		return -EINVAL;
1556 	}
1557 
1558 	r = __find_block(td, block, can_issue_io, result);
1559 
1560 	up_read(&pmd->root_lock);
1561 	return r;
1562 }
1563 
1564 static int __find_next_mapped_block(struct dm_thin_device *td, dm_block_t block,
1565 					  dm_block_t *vblock,
1566 					  struct dm_thin_lookup_result *result)
1567 {
1568 	int r;
1569 	__le64 value;
1570 	struct dm_pool_metadata *pmd = td->pmd;
1571 	dm_block_t keys[2] = { td->id, block };
1572 
1573 	r = dm_btree_lookup_next(&pmd->info, pmd->root, keys, vblock, &value);
1574 	if (!r)
1575 		unpack_lookup_result(td, value, result);
1576 
1577 	return r;
1578 }
1579 
1580 static int __find_mapped_range(struct dm_thin_device *td,
1581 			       dm_block_t begin, dm_block_t end,
1582 			       dm_block_t *thin_begin, dm_block_t *thin_end,
1583 			       dm_block_t *pool_begin, bool *maybe_shared)
1584 {
1585 	int r;
1586 	dm_block_t pool_end;
1587 	struct dm_thin_lookup_result lookup;
1588 
1589 	if (end < begin)
1590 		return -ENODATA;
1591 
1592 	r = __find_next_mapped_block(td, begin, &begin, &lookup);
1593 	if (r)
1594 		return r;
1595 
1596 	if (begin >= end)
1597 		return -ENODATA;
1598 
1599 	*thin_begin = begin;
1600 	*pool_begin = lookup.block;
1601 	*maybe_shared = lookup.shared;
1602 
1603 	begin++;
1604 	pool_end = *pool_begin + 1;
1605 	while (begin != end) {
1606 		r = __find_block(td, begin, true, &lookup);
1607 		if (r) {
1608 			if (r == -ENODATA)
1609 				break;
1610 			else
1611 				return r;
1612 		}
1613 
1614 		if ((lookup.block != pool_end) ||
1615 		    (lookup.shared != *maybe_shared))
1616 			break;
1617 
1618 		pool_end++;
1619 		begin++;
1620 	}
1621 
1622 	*thin_end = begin;
1623 	return 0;
1624 }
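/*
 * Example: if thin blocks 5..9 map to pool blocks 100..104 with a uniform
 * shared flag, and thin block 10 is unmapped or breaks the run, then
 * __find_mapped_range(td, 5, 20, ...) returns *thin_begin = 5,
 * *thin_end = 10 and *pool_begin = 100.
 */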
1625 
1626 int dm_thin_find_mapped_range(struct dm_thin_device *td,
1627 			      dm_block_t begin, dm_block_t end,
1628 			      dm_block_t *thin_begin, dm_block_t *thin_end,
1629 			      dm_block_t *pool_begin, bool *maybe_shared)
1630 {
1631 	int r = -EINVAL;
1632 	struct dm_pool_metadata *pmd = td->pmd;
1633 
1634 	down_read(&pmd->root_lock);
1635 	if (!pmd->fail_io) {
1636 		r = __find_mapped_range(td, begin, end, thin_begin, thin_end,
1637 					pool_begin, maybe_shared);
1638 	}
1639 	up_read(&pmd->root_lock);
1640 
1641 	return r;
1642 }
1643 
1644 static int __insert(struct dm_thin_device *td, dm_block_t block,
1645 		    dm_block_t data_block)
1646 {
1647 	int r, inserted;
1648 	__le64 value;
1649 	struct dm_pool_metadata *pmd = td->pmd;
1650 	dm_block_t keys[2] = { td->id, block };
1651 
1652 	value = cpu_to_le64(pack_block_time(data_block, pmd->time));
1653 	__dm_bless_for_disk(&value);
1654 
1655 	r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value,
1656 				   &pmd->root, &inserted);
1657 	if (r)
1658 		return r;
1659 
1660 	td->changed = true;
1661 	if (inserted)
1662 		td->mapped_blocks++;
1663 
1664 	return 0;
1665 }
1666 
1667 int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
1668 			 dm_block_t data_block)
1669 {
1670 	int r = -EINVAL;
1671 
1672 	pmd_write_lock(td->pmd);
1673 	if (!td->pmd->fail_io)
1674 		r = __insert(td, block, data_block);
1675 	pmd_write_unlock(td->pmd);
1676 
1677 	return r;
1678 }
1679 
1680 static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_t end)
1681 {
1682 	int r;
1683 	unsigned count, total_count = 0;
1684 	struct dm_pool_metadata *pmd = td->pmd;
1685 	dm_block_t keys[1] = { td->id };
1686 	__le64 value;
1687 	dm_block_t mapping_root;
1688 
1689 	/*
1690 	 * Find the mapping tree
1691 	 */
1692 	r = dm_btree_lookup(&pmd->tl_info, pmd->root, keys, &value);
1693 	if (r)
1694 		return r;
1695 
1696 	/*
1697 	 * Remove from the mapping tree, taking care to inc the
1698 	 * ref count so it doesn't get deleted.
1699 	 */
1700 	mapping_root = le64_to_cpu(value);
1701 	dm_tm_inc(pmd->tm, mapping_root);
1702 	r = dm_btree_remove(&pmd->tl_info, pmd->root, keys, &pmd->root);
1703 	if (r)
1704 		return r;
1705 
1706 	/*
1707 	 * dm_btree_remove_leaves() stops at the first unmapped entry, so we
1708 	 * loop round finding mapped ranges.
1709 	 */
1710 	while (begin < end) {
1711 		r = dm_btree_lookup_next(&pmd->bl_info, mapping_root, &begin, &begin, &value);
1712 		if (r == -ENODATA)
1713 			break;
1714 
1715 		if (r)
1716 			return r;
1717 
1718 		if (begin >= end)
1719 			break;
1720 
1721 		r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count);
1722 		if (r)
1723 			return r;
1724 
1725 		total_count += count;
1726 	}
1727 
1728 	td->mapped_blocks -= total_count;
1729 	td->changed = true;
1730 
1731 	/*
1732 	 * Reinsert the mapping tree.
1733 	 */
1734 	value = cpu_to_le64(mapping_root);
1735 	__dm_bless_for_disk(&value);
1736 	return dm_btree_insert(&pmd->tl_info, pmd->root, keys, &value, &pmd->root);
1737 }
1738 
1739 int dm_thin_remove_range(struct dm_thin_device *td,
1740 			 dm_block_t begin, dm_block_t end)
1741 {
1742 	int r = -EINVAL;
1743 
1744 	pmd_write_lock(td->pmd);
1745 	if (!td->pmd->fail_io)
1746 		r = __remove_range(td, begin, end);
1747 	pmd_write_unlock(td->pmd);
1748 
1749 	return r;
1750 }
1751 
1752 int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
1753 {
1754 	int r;
1755 	uint32_t ref_count;
1756 
1757 	down_read(&pmd->root_lock);
1758 	r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
1759 	if (!r)
1760 		*result = (ref_count > 1);
1761 	up_read(&pmd->root_lock);
1762 
1763 	return r;
1764 }
1765 
1766 int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
1767 {
1768 	int r = 0;
1769 
1770 	pmd_write_lock(pmd);
1771 	r = dm_sm_inc_blocks(pmd->data_sm, b, e);
1772 	pmd_write_unlock(pmd);
1773 
1774 	return r;
1775 }
1776 
1777 int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
1778 {
1779 	int r = 0;
1780 
1781 	pmd_write_lock(pmd);
1782 	r = dm_sm_dec_blocks(pmd->data_sm, b, e);
1783 	pmd_write_unlock(pmd);
1784 
1785 	return r;
1786 }
1787 
1788 bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
1789 {
1790 	int r;
1791 
1792 	down_read(&td->pmd->root_lock);
1793 	r = td->changed;
1794 	up_read(&td->pmd->root_lock);
1795 
1796 	return r;
1797 }
1798 
1799 bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd)
1800 {
1801 	bool r = false;
1802 	struct dm_thin_device *td, *tmp;
1803 
1804 	down_read(&pmd->root_lock);
1805 	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
1806 		if (td->changed) {
1807 			r = td->changed;
1808 			break;
1809 		}
1810 	}
1811 	up_read(&pmd->root_lock);
1812 
1813 	return r;
1814 }
1815 
1816 bool dm_thin_aborted_changes(struct dm_thin_device *td)
1817 {
1818 	bool r;
1819 
1820 	down_read(&td->pmd->root_lock);
1821 	r = td->aborted_with_changes;
1822 	up_read(&td->pmd->root_lock);
1823 
1824 	return r;
1825 }
1826 
1827 int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
1828 {
1829 	int r = -EINVAL;
1830 
1831 	pmd_write_lock(pmd);
1832 	if (!pmd->fail_io)
1833 		r = dm_sm_new_block(pmd->data_sm, result);
1834 	pmd_write_unlock(pmd);
1835 
1836 	return r;
1837 }
1838 
1839 int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
1840 {
1841 	int r = -EINVAL;
1842 
1843 	/*
1844 	 * Care is taken to ensure that a commit is never
1845 	 * what puts the thin-pool in-service.
1846 	 */
1847 	pmd_write_lock_in_core(pmd);
1848 	if (pmd->fail_io)
1849 		goto out;
1850 
1851 	r = __commit_transaction(pmd);
1852 	if (r < 0)
1853 		goto out;
1854 
1855 	/*
1856 	 * Open the next transaction.
1857 	 */
1858 	r = __begin_transaction(pmd);
1859 out:
1860 	pmd_write_unlock(pmd);
1861 	return r;
1862 }
1863 
1864 static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd)
1865 {
1866 	struct dm_thin_device *td;
1867 
1868 	list_for_each_entry(td, &pmd->thin_devices, list)
1869 		td->aborted_with_changes = td->changed;
1870 }
1871 
1872 int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
1873 {
1874 	int r = -EINVAL;
1875 	struct dm_block_manager *old_bm = NULL, *new_bm = NULL;
1876 
1877 	/* fail_io is double-checked with pmd->root_lock held below */
1878 	if (unlikely(pmd->fail_io))
1879 		return r;
1880 
1881 	/*
1882 	 * Replacement block manager (new_bm) is created and old_bm destroyed outside of
1883 	 * pmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of
1884 	 * shrinker associated with the block manager's bufio client vs pmd root_lock).
1885 	 * - must take shrinker_rwsem without holding pmd->root_lock
1886 	 */
1887 	new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
1888 					 THIN_MAX_CONCURRENT_LOCKS);
1889 
1890 	pmd_write_lock(pmd);
1891 	if (pmd->fail_io) {
1892 		pmd_write_unlock(pmd);
1893 		goto out;
1894 	}
1895 
1896 	__set_abort_with_changes_flags(pmd);
1897 	__destroy_persistent_data_objects(pmd, false);
1898 	old_bm = pmd->bm;
1899 	if (IS_ERR(new_bm)) {
1900 		DMERR("could not create block manager during abort");
1901 		pmd->bm = NULL;
1902 		r = PTR_ERR(new_bm);
1903 		goto out_unlock;
1904 	}
1905 
1906 	pmd->bm = new_bm;
1907 	r = __open_or_format_metadata(pmd, false);
1908 	if (r) {
1909 		pmd->bm = NULL;
1910 		goto out_unlock;
1911 	}
1912 	new_bm = NULL;
1913 out_unlock:
1914 	if (r)
1915 		pmd->fail_io = true;
1916 	pmd_write_unlock(pmd);
1917 	dm_block_manager_destroy(old_bm);
1918 out:
1919 	if (new_bm && !IS_ERR(new_bm))
1920 		dm_block_manager_destroy(new_bm);
1921 
1922 	return r;
1923 }
1924 
1925 int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result)
1926 {
1927 	int r = -EINVAL;
1928 
1929 	down_read(&pmd->root_lock);
1930 	if (!pmd->fail_io)
1931 		r = dm_sm_get_nr_free(pmd->data_sm, result);
1932 	up_read(&pmd->root_lock);
1933 
1934 	return r;
1935 }
1936 
1937 int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
1938 					  dm_block_t *result)
1939 {
1940 	int r = -EINVAL;
1941 
1942 	down_read(&pmd->root_lock);
1943 	if (!pmd->fail_io)
1944 		r = dm_sm_get_nr_free(pmd->metadata_sm, result);
1945 
1946 	if (!r) {
1947 		if (*result < pmd->metadata_reserve)
1948 			*result = 0;
1949 		else
1950 			*result -= pmd->metadata_reserve;
1951 	}
1952 	up_read(&pmd->root_lock);
1953 
1954 	return r;
1955 }
1956 
1957 int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
1958 				  dm_block_t *result)
1959 {
1960 	int r = -EINVAL;
1961 
1962 	down_read(&pmd->root_lock);
1963 	if (!pmd->fail_io)
1964 		r = dm_sm_get_nr_blocks(pmd->metadata_sm, result);
1965 	up_read(&pmd->root_lock);
1966 
1967 	return r;
1968 }
1969 
1970 int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
1971 {
1972 	int r = -EINVAL;
1973 
1974 	down_read(&pmd->root_lock);
1975 	if (!pmd->fail_io)
1976 		r = dm_sm_get_nr_blocks(pmd->data_sm, result);
1977 	up_read(&pmd->root_lock);
1978 
1979 	return r;
1980 }
1981 
1982 int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result)
1983 {
1984 	int r = -EINVAL;
1985 	struct dm_pool_metadata *pmd = td->pmd;
1986 
1987 	down_read(&pmd->root_lock);
1988 	if (!pmd->fail_io) {
1989 		*result = td->mapped_blocks;
1990 		r = 0;
1991 	}
1992 	up_read(&pmd->root_lock);
1993 
1994 	return r;
1995 }
1996 
1997 static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
1998 {
1999 	int r;
2000 	__le64 value_le;
2001 	dm_block_t thin_root;
2002 	struct dm_pool_metadata *pmd = td->pmd;
2003 
2004 	r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le);
2005 	if (r)
2006 		return r;
2007 
2008 	thin_root = le64_to_cpu(value_le);
2009 
2010 	return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result);
2011 }
2012 
2013 int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
2014 				     dm_block_t *result)
2015 {
2016 	int r = -EINVAL;
2017 	struct dm_pool_metadata *pmd = td->pmd;
2018 
2019 	down_read(&pmd->root_lock);
2020 	if (!pmd->fail_io)
2021 		r = __highest_block(td, result);
2022 	up_read(&pmd->root_lock);
2023 
2024 	return r;
2025 }
2026 
2027 static int __resize_space_map(struct dm_space_map *sm, dm_block_t new_count)
2028 {
2029 	int r;
2030 	dm_block_t old_count;
2031 
2032 	r = dm_sm_get_nr_blocks(sm, &old_count);
2033 	if (r)
2034 		return r;
2035 
2036 	if (new_count == old_count)
2037 		return 0;
2038 
2039 	if (new_count < old_count) {
2040 		DMERR("cannot reduce size of space map");
2041 		return -EINVAL;
2042 	}
2043 
2044 	return dm_sm_extend(sm, new_count - old_count);
2045 }
2046 
2047 int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
2048 {
2049 	int r = -EINVAL;
2050 
2051 	pmd_write_lock(pmd);
2052 	if (!pmd->fail_io)
2053 		r = __resize_space_map(pmd->data_sm, new_count);
2054 	pmd_write_unlock(pmd);
2055 
2056 	return r;
2057 }
2058 
2059 int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
2060 {
2061 	int r = -EINVAL;
2062 
2063 	pmd_write_lock(pmd);
2064 	if (!pmd->fail_io) {
2065 		r = __resize_space_map(pmd->metadata_sm, new_count);
2066 		if (!r)
2067 			__set_metadata_reserve(pmd);
2068 	}
2069 	pmd_write_unlock(pmd);
2070 
2071 	return r;
2072 }
2073 
2074 void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
2075 {
2076 	pmd_write_lock_in_core(pmd);
2077 	dm_bm_set_read_only(pmd->bm);
2078 	pmd_write_unlock(pmd);
2079 }
2080 
2081 void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
2082 {
2083 	pmd_write_lock_in_core(pmd);
2084 	dm_bm_set_read_write(pmd->bm);
2085 	pmd_write_unlock(pmd);
2086 }
2087 
2088 int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
2089 					dm_block_t threshold,
2090 					dm_sm_threshold_fn fn,
2091 					void *context)
2092 {
2093 	int r = -EINVAL;
2094 
2095 	pmd_write_lock_in_core(pmd);
2096 	if (!pmd->fail_io) {
2097 		r = dm_sm_register_threshold_callback(pmd->metadata_sm,
2098 						      threshold, fn, context);
2099 	}
2100 	pmd_write_unlock(pmd);
2101 
2102 	return r;
2103 }
2104 
2105 void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd,
2106 					  dm_pool_pre_commit_fn fn,
2107 					  void *context)
2108 {
2109 	pmd_write_lock_in_core(pmd);
2110 	pmd->pre_commit_fn = fn;
2111 	pmd->pre_commit_context = context;
2112 	pmd_write_unlock(pmd);
2113 }
2114 
2115 int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
2116 {
2117 	int r = -EINVAL;
2118 	struct dm_block *sblock;
2119 	struct thin_disk_superblock *disk_super;
2120 
2121 	pmd_write_lock(pmd);
2122 	if (pmd->fail_io)
2123 		goto out;
2124 
2125 	pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
2126 
2127 	r = superblock_lock(pmd, &sblock);
2128 	if (r) {
2129 		DMERR("couldn't lock superblock");
2130 		goto out;
2131 	}
2132 
2133 	disk_super = dm_block_data(sblock);
2134 	disk_super->flags = cpu_to_le32(pmd->flags);
2135 
2136 	dm_bm_unlock(sblock);
2137 out:
2138 	pmd_write_unlock(pmd);
2139 	return r;
2140 }
2141 
2142 bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
2143 {
2144 	bool needs_check;
2145 
2146 	down_read(&pmd->root_lock);
2147 	needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
2148 	up_read(&pmd->root_lock);
2149 
2150 	return needs_check;
2151 }
2152 
2153 void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd)
2154 {
2155 	down_read(&pmd->root_lock);
2156 	if (!pmd->fail_io)
2157 		dm_tm_issue_prefetches(pmd->tm);
2158 	up_read(&pmd->root_lock);
2159 }
2160