xref: /openbmc/linux/drivers/md/dm-clone-target.c (revision dc6a81c3)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved.
4  */
5 
6 #include <linux/mm.h>
7 #include <linux/bio.h>
8 #include <linux/err.h>
9 #include <linux/hash.h>
10 #include <linux/list.h>
11 #include <linux/log2.h>
12 #include <linux/init.h>
13 #include <linux/slab.h>
14 #include <linux/wait.h>
15 #include <linux/dm-io.h>
16 #include <linux/mutex.h>
17 #include <linux/atomic.h>
18 #include <linux/bitops.h>
19 #include <linux/blkdev.h>
20 #include <linux/kdev_t.h>
21 #include <linux/kernel.h>
22 #include <linux/module.h>
23 #include <linux/jiffies.h>
24 #include <linux/mempool.h>
25 #include <linux/spinlock.h>
26 #include <linux/blk_types.h>
27 #include <linux/dm-kcopyd.h>
28 #include <linux/workqueue.h>
29 #include <linux/backing-dev.h>
30 #include <linux/device-mapper.h>
31 
32 #include "dm.h"
33 #include "dm-clone-metadata.h"
34 
35 #define DM_MSG_PREFIX "clone"
36 
/*
 * Minimum and maximum allowed region sizes.
 *
 * The values are expressed in 512-byte sectors (8 sectors == 4KB).
 */
#define MIN_REGION_SIZE (1 << 3)  /* 4KB */
#define MAX_REGION_SIZE (1 << 21) /* 1GB */

#define MIN_HYDRATIONS 256 /* Size of hydration mempool */
#define DEFAULT_HYDRATION_THRESHOLD 1 /* 1 region */
#define DEFAULT_HYDRATION_BATCH_SIZE 1 /* Hydrate in batches of 1 region */

/* Interval, in jiffies, used by need_commit_due_to_time() */
#define COMMIT_PERIOD HZ /* 1 sec */
48 
49 /*
50  * Hydration hash table size: 1 << HASH_TABLE_BITS
51  */
52 #define HASH_TABLE_BITS 15
53 
54 DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(clone_hydration_throttle,
55 	"A percentage of time allocated for hydrating regions");
56 
57 /* Slab cache for struct dm_clone_region_hydration */
58 static struct kmem_cache *_hydration_cache;
59 
/*
 * dm-clone metadata modes.
 *
 * The order of the enumerators matters: code in this file checks for
 * "read-only or fail" with `mode >= CM_READ_ONLY`, and __set_clone_mode()
 * uses the value as an index into its table of mode descriptions.
 */
enum clone_metadata_mode {
	CM_WRITE,		/* metadata may be changed */
	CM_READ_ONLY,		/* metadata may not be changed */
	CM_FAIL,		/* all metadata I/O fails */
};
66 
67 struct hash_table_bucket;
68 
/*
 * State of a dm-clone target instance.
 */
struct clone {
	struct dm_target *ti;
	struct dm_target_callbacks callbacks;

	struct dm_dev *metadata_dev;
	/* Device that bios are remapped to by remap_to_dest(); hydration target */
	struct dm_dev *dest_dev;
	/* Device that regions are copied from during hydration */
	struct dm_dev *source_dev;

	/* Number of regions; the last one may be smaller than region_size */
	unsigned long nr_regions;
	/* Region size in sectors */
	sector_t region_size;
	/* Shift converting between region numbers and sectors */
	unsigned int region_shift;

	/*
	 * A metadata commit and the actions taken in case it fails should run
	 * as a single atomic step.
	 */
	struct mutex commit_lock;

	struct dm_clone_metadata *cmd;

	/*
	 * bio used to flush the destination device, before committing the
	 * metadata.
	 */
	struct bio flush_bio;

	/* Region hydration hash table */
	struct hash_table_bucket *ht;

	/* Background hydration is only started while this reads zero */
	atomic_t ios_in_flight;

	/* Woken when hydrations_in_flight drops to zero */
	wait_queue_head_t hydration_stopped;

	/* Pool that hydration descriptors are allocated from */
	mempool_t hydration_pool;

	/* Compared against jiffies by need_commit_due_to_time() */
	unsigned long last_commit_jiffies;

	/*
	 * We defer incoming WRITE bios for regions that are not hydrated,
	 * until after these regions have been hydrated.
	 *
	 * Also, we defer REQ_FUA and REQ_PREFLUSH bios, until after the
	 * metadata have been committed.
	 */
	spinlock_t lock;
	struct bio_list deferred_bios;
	struct bio_list deferred_discard_bios;
	struct bio_list deferred_flush_bios;
	struct bio_list deferred_flush_completions;

	/* Maximum number of regions being copied during background hydration. */
	unsigned int hydration_threshold;

	/* Number of regions to batch together during background hydration. */
	unsigned int hydration_batch_size;

	/* Which region to hydrate next */
	unsigned long hydration_offset;

	/* Number of hydrations started but not yet completed */
	atomic_t hydrations_in_flight;

	/*
	 * Save a copy of the table line rather than reconstructing it for the
	 * status.
	 */
	unsigned int nr_ctr_args;
	const char **ctr_args;

	struct workqueue_struct *wq;
	struct work_struct worker;
	struct delayed_work waker;

	struct dm_kcopyd_client *kcopyd_client;

	/* Accessed with READ_ONCE()/WRITE_ONCE(); see get_clone_mode() */
	enum clone_metadata_mode mode;
	/* Bitmap of DM_CLONE_* flag bits */
	unsigned long flags;
};
146 
/*
 * dm-clone flags
 *
 * These are bit numbers within clone->flags, tested with test_bit().
 */
#define DM_CLONE_DISCARD_PASSDOWN 0
#define DM_CLONE_HYDRATION_ENABLED 1
#define DM_CLONE_HYDRATION_SUSPENDED 2
153 
154 /*---------------------------------------------------------------------------*/
155 
156 /*
157  * Metadata failure handling.
158  */
159 static enum clone_metadata_mode get_clone_mode(struct clone *clone)
160 {
161 	return READ_ONCE(clone->mode);
162 }
163 
164 static const char *clone_device_name(struct clone *clone)
165 {
166 	return dm_table_device_name(clone->ti->table);
167 }
168 
169 static void __set_clone_mode(struct clone *clone, enum clone_metadata_mode new_mode)
170 {
171 	const char *descs[] = {
172 		"read-write",
173 		"read-only",
174 		"fail"
175 	};
176 
177 	enum clone_metadata_mode old_mode = get_clone_mode(clone);
178 
179 	/* Never move out of fail mode */
180 	if (old_mode == CM_FAIL)
181 		new_mode = CM_FAIL;
182 
183 	switch (new_mode) {
184 	case CM_FAIL:
185 	case CM_READ_ONLY:
186 		dm_clone_metadata_set_read_only(clone->cmd);
187 		break;
188 
189 	case CM_WRITE:
190 		dm_clone_metadata_set_read_write(clone->cmd);
191 		break;
192 	}
193 
194 	WRITE_ONCE(clone->mode, new_mode);
195 
196 	if (new_mode != old_mode) {
197 		dm_table_event(clone->ti->table);
198 		DMINFO("%s: Switching to %s mode", clone_device_name(clone),
199 		       descs[(int)new_mode]);
200 	}
201 }
202 
203 static void __abort_transaction(struct clone *clone)
204 {
205 	const char *dev_name = clone_device_name(clone);
206 
207 	if (get_clone_mode(clone) >= CM_READ_ONLY)
208 		return;
209 
210 	DMERR("%s: Aborting current metadata transaction", dev_name);
211 	if (dm_clone_metadata_abort(clone->cmd)) {
212 		DMERR("%s: Failed to abort metadata transaction", dev_name);
213 		__set_clone_mode(clone, CM_FAIL);
214 	}
215 }
216 
217 static void __reload_in_core_bitset(struct clone *clone)
218 {
219 	const char *dev_name = clone_device_name(clone);
220 
221 	if (get_clone_mode(clone) == CM_FAIL)
222 		return;
223 
224 	/* Reload the on-disk bitset */
225 	DMINFO("%s: Reloading on-disk bitmap", dev_name);
226 	if (dm_clone_reload_in_core_bitset(clone->cmd)) {
227 		DMERR("%s: Failed to reload on-disk bitmap", dev_name);
228 		__set_clone_mode(clone, CM_FAIL);
229 	}
230 }
231 
232 static void __metadata_operation_failed(struct clone *clone, const char *op, int r)
233 {
234 	DMERR("%s: Metadata operation `%s' failed: error = %d",
235 	      clone_device_name(clone), op, r);
236 
237 	__abort_transaction(clone);
238 	__set_clone_mode(clone, CM_READ_ONLY);
239 
240 	/*
241 	 * dm_clone_reload_in_core_bitset() may run concurrently with either
242 	 * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), but
243 	 * it's safe as we have already set the metadata to read-only mode.
244 	 */
245 	__reload_in_core_bitset(clone);
246 }
247 
248 /*---------------------------------------------------------------------------*/
249 
250 /* Wake up anyone waiting for region hydrations to stop */
251 static inline void wakeup_hydration_waiters(struct clone *clone)
252 {
253 	wake_up_all(&clone->hydration_stopped);
254 }
255 
256 static inline void wake_worker(struct clone *clone)
257 {
258 	queue_work(clone->wq, &clone->worker);
259 }
260 
261 /*---------------------------------------------------------------------------*/
262 
263 /*
264  * bio helper functions.
265  */
266 static inline void remap_to_source(struct clone *clone, struct bio *bio)
267 {
268 	bio_set_dev(bio, clone->source_dev->bdev);
269 }
270 
271 static inline void remap_to_dest(struct clone *clone, struct bio *bio)
272 {
273 	bio_set_dev(bio, clone->dest_dev->bdev);
274 }
275 
276 static bool bio_triggers_commit(struct clone *clone, struct bio *bio)
277 {
278 	return op_is_flush(bio->bi_opf) &&
279 		dm_clone_changed_this_transaction(clone->cmd);
280 }
281 
282 /* Get the address of the region in sectors */
283 static inline sector_t region_to_sector(struct clone *clone, unsigned long region_nr)
284 {
285 	return (region_nr << clone->region_shift);
286 }
287 
288 /* Get the region number of the bio */
289 static inline unsigned long bio_to_region(struct clone *clone, struct bio *bio)
290 {
291 	return (bio->bi_iter.bi_sector >> clone->region_shift);
292 }
293 
294 /* Get the region range covered by the bio */
295 static void bio_region_range(struct clone *clone, struct bio *bio,
296 			     unsigned long *rs, unsigned long *re)
297 {
298 	*rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size);
299 	*re = bio_end_sector(bio) >> clone->region_shift;
300 }
301 
302 /* Check whether a bio overwrites a region */
303 static inline bool is_overwrite_bio(struct clone *clone, struct bio *bio)
304 {
305 	return (bio_data_dir(bio) == WRITE && bio_sectors(bio) == clone->region_size);
306 }
307 
308 static void fail_bios(struct bio_list *bios, blk_status_t status)
309 {
310 	struct bio *bio;
311 
312 	while ((bio = bio_list_pop(bios))) {
313 		bio->bi_status = status;
314 		bio_endio(bio);
315 	}
316 }
317 
318 static void submit_bios(struct bio_list *bios)
319 {
320 	struct bio *bio;
321 	struct blk_plug plug;
322 
323 	blk_start_plug(&plug);
324 
325 	while ((bio = bio_list_pop(bios)))
326 		generic_make_request(bio);
327 
328 	blk_finish_plug(&plug);
329 }
330 
331 /*
332  * Submit bio to the underlying device.
333  *
334  * If the bio triggers a commit, delay it, until after the metadata have been
335  * committed.
336  *
337  * NOTE: The bio remapping must be performed by the caller.
338  */
339 static void issue_bio(struct clone *clone, struct bio *bio)
340 {
341 	if (!bio_triggers_commit(clone, bio)) {
342 		generic_make_request(bio);
343 		return;
344 	}
345 
346 	/*
347 	 * If the metadata mode is RO or FAIL we won't be able to commit the
348 	 * metadata, so we complete the bio with an error.
349 	 */
350 	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
351 		bio_io_error(bio);
352 		return;
353 	}
354 
355 	/*
356 	 * Batch together any bios that trigger commits and then issue a single
357 	 * commit for them in process_deferred_flush_bios().
358 	 */
359 	spin_lock_irq(&clone->lock);
360 	bio_list_add(&clone->deferred_flush_bios, bio);
361 	spin_unlock_irq(&clone->lock);
362 
363 	wake_worker(clone);
364 }
365 
366 /*
367  * Remap bio to the destination device and submit it.
368  *
369  * If the bio triggers a commit, delay it, until after the metadata have been
370  * committed.
371  */
372 static void remap_and_issue(struct clone *clone, struct bio *bio)
373 {
374 	remap_to_dest(clone, bio);
375 	issue_bio(clone, bio);
376 }
377 
378 /*
379  * Issue bios that have been deferred until after their region has finished
380  * hydrating.
381  *
382  * We delegate the bio submission to the worker thread, so this is safe to call
383  * from interrupt context.
384  */
385 static void issue_deferred_bios(struct clone *clone, struct bio_list *bios)
386 {
387 	struct bio *bio;
388 	unsigned long flags;
389 	struct bio_list flush_bios = BIO_EMPTY_LIST;
390 	struct bio_list normal_bios = BIO_EMPTY_LIST;
391 
392 	if (bio_list_empty(bios))
393 		return;
394 
395 	while ((bio = bio_list_pop(bios))) {
396 		if (bio_triggers_commit(clone, bio))
397 			bio_list_add(&flush_bios, bio);
398 		else
399 			bio_list_add(&normal_bios, bio);
400 	}
401 
402 	spin_lock_irqsave(&clone->lock, flags);
403 	bio_list_merge(&clone->deferred_bios, &normal_bios);
404 	bio_list_merge(&clone->deferred_flush_bios, &flush_bios);
405 	spin_unlock_irqrestore(&clone->lock, flags);
406 
407 	wake_worker(clone);
408 }
409 
410 static void complete_overwrite_bio(struct clone *clone, struct bio *bio)
411 {
412 	unsigned long flags;
413 
414 	/*
415 	 * If the bio has the REQ_FUA flag set we must commit the metadata
416 	 * before signaling its completion.
417 	 *
418 	 * complete_overwrite_bio() is only called by hydration_complete(),
419 	 * after having successfully updated the metadata. This means we don't
420 	 * need to call dm_clone_changed_this_transaction() to check if the
421 	 * metadata has changed and thus we can avoid taking the metadata spin
422 	 * lock.
423 	 */
424 	if (!(bio->bi_opf & REQ_FUA)) {
425 		bio_endio(bio);
426 		return;
427 	}
428 
429 	/*
430 	 * If the metadata mode is RO or FAIL we won't be able to commit the
431 	 * metadata, so we complete the bio with an error.
432 	 */
433 	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
434 		bio_io_error(bio);
435 		return;
436 	}
437 
438 	/*
439 	 * Batch together any bios that trigger commits and then issue a single
440 	 * commit for them in process_deferred_flush_bios().
441 	 */
442 	spin_lock_irqsave(&clone->lock, flags);
443 	bio_list_add(&clone->deferred_flush_completions, bio);
444 	spin_unlock_irqrestore(&clone->lock, flags);
445 
446 	wake_worker(clone);
447 }
448 
449 static void trim_bio(struct bio *bio, sector_t sector, unsigned int len)
450 {
451 	bio->bi_iter.bi_sector = sector;
452 	bio->bi_iter.bi_size = to_bytes(len);
453 }
454 
455 static void complete_discard_bio(struct clone *clone, struct bio *bio, bool success)
456 {
457 	unsigned long rs, re;
458 
459 	/*
460 	 * If the destination device supports discards, remap and trim the
461 	 * discard bio and pass it down. Otherwise complete the bio
462 	 * immediately.
463 	 */
464 	if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) {
465 		remap_to_dest(clone, bio);
466 		bio_region_range(clone, bio, &rs, &re);
467 		trim_bio(bio, rs << clone->region_shift,
468 			 (re - rs) << clone->region_shift);
469 		generic_make_request(bio);
470 	} else
471 		bio_endio(bio);
472 }
473 
474 static void process_discard_bio(struct clone *clone, struct bio *bio)
475 {
476 	unsigned long rs, re;
477 
478 	bio_region_range(clone, bio, &rs, &re);
479 	BUG_ON(re > clone->nr_regions);
480 
481 	if (unlikely(rs == re)) {
482 		bio_endio(bio);
483 		return;
484 	}
485 
486 	/*
487 	 * The covered regions are already hydrated so we just need to pass
488 	 * down the discard.
489 	 */
490 	if (dm_clone_is_range_hydrated(clone->cmd, rs, re - rs)) {
491 		complete_discard_bio(clone, bio, true);
492 		return;
493 	}
494 
495 	/*
496 	 * If the metadata mode is RO or FAIL we won't be able to update the
497 	 * metadata for the regions covered by the discard so we just ignore
498 	 * it.
499 	 */
500 	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
501 		bio_endio(bio);
502 		return;
503 	}
504 
505 	/*
506 	 * Defer discard processing.
507 	 */
508 	spin_lock_irq(&clone->lock);
509 	bio_list_add(&clone->deferred_discard_bios, bio);
510 	spin_unlock_irq(&clone->lock);
511 
512 	wake_worker(clone);
513 }
514 
515 /*---------------------------------------------------------------------------*/
516 
517 /*
518  * dm-clone region hydrations.
519  */
/* Descriptor of one in-progress region hydration */
struct dm_clone_region_hydration {
	struct clone *clone;
	/* Region being hydrated; also the key in the hydration hash table */
	unsigned long region_nr;

	/*
	 * Set by hydration_overwrite() when the region is hydrated by a bio
	 * that overwrites it, together with the bio's original end_io handler.
	 * NULL otherwise.
	 */
	struct bio *overwrite_bio;
	bio_end_io_t *overwrite_bio_end_io;

	/* Bios held back until the hydration of this region completes */
	struct bio_list deferred_bios;

	/* Outcome of the hydration; BLK_STS_OK on success */
	blk_status_t status;

	/* Used by hydration batching */
	struct list_head list;

	/* Used by hydration hash table */
	struct hlist_node h;
};
537 
538 /*
539  * Hydration hash table implementation.
540  *
541  * Ideally we would like to use list_bl, which uses bit spin locks and employs
542  * the least significant bit of the list head to lock the corresponding bucket,
543  * reducing the memory overhead for the locks. But, currently, list_bl and bit
544  * spin locks don't support IRQ safe versions. Since we have to take the lock
545  * in both process and interrupt context, we must fall back to using regular
546  * spin locks; one per hash table bucket.
547  */
/* One bucket of the hydration hash table */
struct hash_table_bucket {
	/* Hydrations whose region number hashes to this bucket */
	struct hlist_head head;

	/* Spinlock protecting the bucket */
	spinlock_t lock;
};
554 
/*
 * Bucket locking helpers. The irqsave/irqrestore variants save and restore
 * the interrupt state in `flags`; the irq variants map to spin_lock_irq() /
 * spin_unlock_irq().
 */
#define bucket_lock_irqsave(bucket, flags) \
	spin_lock_irqsave(&(bucket)->lock, flags)

#define bucket_unlock_irqrestore(bucket, flags) \
	spin_unlock_irqrestore(&(bucket)->lock, flags)

#define bucket_lock_irq(bucket) \
	spin_lock_irq(&(bucket)->lock)

#define bucket_unlock_irq(bucket) \
	spin_unlock_irq(&(bucket)->lock)
566 
567 static int hash_table_init(struct clone *clone)
568 {
569 	unsigned int i, sz;
570 	struct hash_table_bucket *bucket;
571 
572 	sz = 1 << HASH_TABLE_BITS;
573 
574 	clone->ht = kvmalloc(sz * sizeof(struct hash_table_bucket), GFP_KERNEL);
575 	if (!clone->ht)
576 		return -ENOMEM;
577 
578 	for (i = 0; i < sz; i++) {
579 		bucket = clone->ht + i;
580 
581 		INIT_HLIST_HEAD(&bucket->head);
582 		spin_lock_init(&bucket->lock);
583 	}
584 
585 	return 0;
586 }
587 
588 static void hash_table_exit(struct clone *clone)
589 {
590 	kvfree(clone->ht);
591 }
592 
593 static struct hash_table_bucket *get_hash_table_bucket(struct clone *clone,
594 						       unsigned long region_nr)
595 {
596 	return &clone->ht[hash_long(region_nr, HASH_TABLE_BITS)];
597 }
598 
599 /*
600  * Search hash table for a hydration with hd->region_nr == region_nr
601  *
602  * NOTE: Must be called with the bucket lock held
603  */
604 static struct dm_clone_region_hydration *__hash_find(struct hash_table_bucket *bucket,
605 						     unsigned long region_nr)
606 {
607 	struct dm_clone_region_hydration *hd;
608 
609 	hlist_for_each_entry(hd, &bucket->head, h) {
610 		if (hd->region_nr == region_nr)
611 			return hd;
612 	}
613 
614 	return NULL;
615 }
616 
617 /*
618  * Insert a hydration into the hash table.
619  *
620  * NOTE: Must be called with the bucket lock held.
621  */
622 static inline void __insert_region_hydration(struct hash_table_bucket *bucket,
623 					     struct dm_clone_region_hydration *hd)
624 {
625 	hlist_add_head(&hd->h, &bucket->head);
626 }
627 
628 /*
629  * This function inserts a hydration into the hash table, unless someone else
630  * managed to insert a hydration for the same region first. In the latter case
631  * it returns the existing hydration descriptor for this region.
632  *
633  * NOTE: Must be called with the hydration hash table lock held.
634  */
635 static struct dm_clone_region_hydration *
636 __find_or_insert_region_hydration(struct hash_table_bucket *bucket,
637 				  struct dm_clone_region_hydration *hd)
638 {
639 	struct dm_clone_region_hydration *hd2;
640 
641 	hd2 = __hash_find(bucket, hd->region_nr);
642 	if (hd2)
643 		return hd2;
644 
645 	__insert_region_hydration(bucket, hd);
646 
647 	return hd;
648 }
649 
650 /*---------------------------------------------------------------------------*/
651 
652 /* Allocate a hydration */
653 static struct dm_clone_region_hydration *alloc_hydration(struct clone *clone)
654 {
655 	struct dm_clone_region_hydration *hd;
656 
657 	/*
658 	 * Allocate a hydration from the hydration mempool.
659 	 * This might block but it can't fail.
660 	 */
661 	hd = mempool_alloc(&clone->hydration_pool, GFP_NOIO);
662 	hd->clone = clone;
663 
664 	return hd;
665 }
666 
667 static inline void free_hydration(struct dm_clone_region_hydration *hd)
668 {
669 	mempool_free(hd, &hd->clone->hydration_pool);
670 }
671 
672 /* Initialize a hydration */
673 static void hydration_init(struct dm_clone_region_hydration *hd, unsigned long region_nr)
674 {
675 	hd->region_nr = region_nr;
676 	hd->overwrite_bio = NULL;
677 	bio_list_init(&hd->deferred_bios);
678 	hd->status = 0;
679 
680 	INIT_LIST_HEAD(&hd->list);
681 	INIT_HLIST_NODE(&hd->h);
682 }
683 
684 /*---------------------------------------------------------------------------*/
685 
686 /*
687  * Update dm-clone's metadata after a region has finished hydrating and remove
688  * hydration from the hash table.
689  */
690 static int hydration_update_metadata(struct dm_clone_region_hydration *hd)
691 {
692 	int r = 0;
693 	unsigned long flags;
694 	struct hash_table_bucket *bucket;
695 	struct clone *clone = hd->clone;
696 
697 	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
698 		r = -EPERM;
699 
700 	/* Update the metadata */
701 	if (likely(!r) && hd->status == BLK_STS_OK)
702 		r = dm_clone_set_region_hydrated(clone->cmd, hd->region_nr);
703 
704 	bucket = get_hash_table_bucket(clone, hd->region_nr);
705 
706 	/* Remove hydration from hash table */
707 	bucket_lock_irqsave(bucket, flags);
708 	hlist_del(&hd->h);
709 	bucket_unlock_irqrestore(bucket, flags);
710 
711 	return r;
712 }
713 
714 /*
715  * Complete a region's hydration:
716  *
717  *	1. Update dm-clone's metadata.
718  *	2. Remove hydration from hash table.
719  *	3. Complete overwrite bio.
720  *	4. Issue deferred bios.
721  *	5. If this was the last hydration, wake up anyone waiting for
722  *	   hydrations to finish.
723  */
724 static void hydration_complete(struct dm_clone_region_hydration *hd)
725 {
726 	int r;
727 	blk_status_t status;
728 	struct clone *clone = hd->clone;
729 
730 	r = hydration_update_metadata(hd);
731 
732 	if (hd->status == BLK_STS_OK && likely(!r)) {
733 		if (hd->overwrite_bio)
734 			complete_overwrite_bio(clone, hd->overwrite_bio);
735 
736 		issue_deferred_bios(clone, &hd->deferred_bios);
737 	} else {
738 		status = r ? BLK_STS_IOERR : hd->status;
739 
740 		if (hd->overwrite_bio)
741 			bio_list_add(&hd->deferred_bios, hd->overwrite_bio);
742 
743 		fail_bios(&hd->deferred_bios, status);
744 	}
745 
746 	free_hydration(hd);
747 
748 	if (atomic_dec_and_test(&clone->hydrations_in_flight))
749 		wakeup_hydration_waiters(clone);
750 }
751 
/*
 * Completion callback passed to dm_kcopyd_copy() by hydration_copy().
 *
 * @read_err:  non-zero if the copy failed while reading
 * @write_err: non-zero if the copy failed while writing
 * @context:   the hydration descriptor given to dm_kcopyd_copy(), i.e. the
 *             head of the batch
 *
 * The outcome of the copy is applied to the head hydration and to every
 * hydration batched behind it, and each of them is completed. Afterwards the
 * worker is woken to continue background hydration, if that is enabled and
 * there is no I/O in flight.
 */
static void hydration_kcopyd_callback(int read_err, unsigned long write_err, void *context)
{
	blk_status_t status;

	struct dm_clone_region_hydration *tmp, *hd = context;
	struct clone *clone = hd->clone;

	LIST_HEAD(batched_hydrations);

	if (read_err || write_err) {
		DMERR_LIMIT("%s: hydration failed", clone_device_name(clone));
		status = BLK_STS_IOERR;
	} else {
		status = BLK_STS_OK;
	}
	/*
	 * Move the hydrations that __batch_hydration() chained on the head's
	 * list to a local list, before the head is completed (and freed)
	 * below.
	 */
	list_splice_tail(&hd->list, &batched_hydrations);

	hd->status = status;
	hydration_complete(hd);

	/*
	 * Complete batched hydrations. The _safe iterator is required because
	 * hydration_complete() frees the entry.
	 */
	list_for_each_entry_safe(hd, tmp, &batched_hydrations, list) {
		hd->status = status;
		hydration_complete(hd);
	}

	/* Continue background hydration, if there is no I/O in-flight */
	if (test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
	    !atomic_read(&clone->ios_in_flight))
		wake_worker(clone);
}
783 
784 static void hydration_copy(struct dm_clone_region_hydration *hd, unsigned int nr_regions)
785 {
786 	unsigned long region_start, region_end;
787 	sector_t tail_size, region_size, total_size;
788 	struct dm_io_region from, to;
789 	struct clone *clone = hd->clone;
790 
791 	region_size = clone->region_size;
792 	region_start = hd->region_nr;
793 	region_end = region_start + nr_regions - 1;
794 
795 	total_size = (nr_regions - 1) << clone->region_shift;
796 
797 	if (region_end == clone->nr_regions - 1) {
798 		/*
799 		 * The last region of the target might be smaller than
800 		 * region_size.
801 		 */
802 		tail_size = clone->ti->len & (region_size - 1);
803 		if (!tail_size)
804 			tail_size = region_size;
805 	} else {
806 		tail_size = region_size;
807 	}
808 
809 	total_size += tail_size;
810 
811 	from.bdev = clone->source_dev->bdev;
812 	from.sector = region_to_sector(clone, region_start);
813 	from.count = total_size;
814 
815 	to.bdev = clone->dest_dev->bdev;
816 	to.sector = from.sector;
817 	to.count = from.count;
818 
819 	/* Issue copy */
820 	atomic_add(nr_regions, &clone->hydrations_in_flight);
821 	dm_kcopyd_copy(clone->kcopyd_client, &from, 1, &to, 0,
822 		       hydration_kcopyd_callback, hd);
823 }
824 
825 static void overwrite_endio(struct bio *bio)
826 {
827 	struct dm_clone_region_hydration *hd = bio->bi_private;
828 
829 	bio->bi_end_io = hd->overwrite_bio_end_io;
830 	hd->status = bio->bi_status;
831 
832 	hydration_complete(hd);
833 }
834 
835 static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio *bio)
836 {
837 	/*
838 	 * We don't need to save and restore bio->bi_private because device
839 	 * mapper core generates a new bio for us to use, with clean
840 	 * bi_private.
841 	 */
842 	hd->overwrite_bio = bio;
843 	hd->overwrite_bio_end_io = bio->bi_end_io;
844 
845 	bio->bi_end_io = overwrite_endio;
846 	bio->bi_private = hd;
847 
848 	atomic_inc(&hd->clone->hydrations_in_flight);
849 	generic_make_request(bio);
850 }
851 
/*
 * Hydrate bio's region.
 *
 * This function starts the hydration of the bio's region and puts the bio in
 * the list of deferred bios for this region. In case, by the time this
 * function is called, the region has finished hydrating it's submitted to the
 * destination device.
 *
 * Possible outcomes:
 *  - a hydration of the region is already in progress: the bio is added to
 *    that hydration's deferred bios;
 *  - the region is already hydrated: the bio is issued via issue_bio();
 *  - the metadata mode is RO or FAIL: the bio is failed;
 *  - otherwise a new hydration is started, either by submitting the bio
 *    itself (if it overwrites the whole region) or by a kcopyd copy.
 *
 * NOTE: The bio remapping must be performed by the caller.
 */
static void hydrate_bio_region(struct clone *clone, struct bio *bio)
{
	unsigned long region_nr;
	struct hash_table_bucket *bucket;
	struct dm_clone_region_hydration *hd, *hd2;

	region_nr = bio_to_region(clone, bio);
	bucket = get_hash_table_bucket(clone, region_nr);

	bucket_lock_irq(bucket);

	hd = __hash_find(bucket, region_nr);
	if (hd) {
		/* Someone else is hydrating the region */
		bio_list_add(&hd->deferred_bios, bio);
		bucket_unlock_irq(bucket);
		return;
	}

	if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
		/* The region has been hydrated */
		bucket_unlock_irq(bucket);
		issue_bio(clone, bio);
		return;
	}

	/*
	 * We must allocate a hydration descriptor and start the hydration of
	 * the corresponding region.
	 *
	 * The bucket lock is dropped first, because the allocation might
	 * block.
	 */
	bucket_unlock_irq(bucket);

	hd = alloc_hydration(clone);
	hydration_init(hd, region_nr);

	bucket_lock_irq(bucket);

	/* Check if the region has been hydrated in the meantime. */
	if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
		bucket_unlock_irq(bucket);
		free_hydration(hd);
		issue_bio(clone, bio);
		return;
	}

	hd2 = __find_or_insert_region_hydration(bucket, hd);
	if (hd2 != hd) {
		/* Someone else started the region's hydration. */
		bio_list_add(&hd2->deferred_bios, bio);
		bucket_unlock_irq(bucket);
		free_hydration(hd);
		return;
	}

	/*
	 * If the metadata mode is RO or FAIL then there is no point starting a
	 * hydration, since we will not be able to update the metadata when the
	 * hydration finishes.
	 */
	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
		/* Undo the insertion performed above */
		hlist_del(&hd->h);
		bucket_unlock_irq(bucket);
		free_hydration(hd);
		bio_io_error(bio);
		return;
	}

	/*
	 * Start region hydration.
	 *
	 * If a bio overwrites a region, i.e., its size is equal to the
	 * region's size, then we don't need to copy the region from the source
	 * to the destination device.
	 */
	if (is_overwrite_bio(clone, bio)) {
		bucket_unlock_irq(bucket);
		hydration_overwrite(hd, bio);
	} else {
		/* The bio is issued after the copy completes */
		bio_list_add(&hd->deferred_bios, bio);
		bucket_unlock_irq(bucket);
		hydration_copy(hd, 1);
	}
}
945 
946 /*---------------------------------------------------------------------------*/
947 
948 /*
949  * Background hydrations.
950  */
951 
952 /*
953  * Batch region hydrations.
954  *
955  * To better utilize device bandwidth we batch together the hydration of
956  * adjacent regions. This allows us to use small region sizes, e.g., 4KB, which
957  * is good for small, random write performance (because of the overwriting of
958  * un-hydrated regions) and at the same time issue big copy requests to kcopyd
959  * to achieve high hydration bandwidth.
960  */
/* State of the batch currently being assembled by __batch_hydration() */
struct batch_info {
	/* First hydration of the batch, or NULL if no batch is open */
	struct dm_clone_region_hydration *head;
	/* Number of regions in the batch, including the head */
	unsigned int nr_batched_regions;
};
965 
/*
 * Add a hydration to the current batch, issuing copies as needed.
 *
 * @batch: batch being assembled; updated in place
 * @hd:    hydration to add; it must already be in the hash table
 *
 * @hd extends the current batch if the batch is not full and @hd's region
 * directly follows the last batched region. The current batch is issued when
 * it becomes full or when @hd could not be added to it. In the latter case
 * @hd is either copied on its own (maximum batch size <= 1) or becomes the
 * head of a new batch.
 */
static void __batch_hydration(struct batch_info *batch,
			      struct dm_clone_region_hydration *hd)
{
	struct clone *clone = hd->clone;
	unsigned int max_batch_size = READ_ONCE(clone->hydration_batch_size);

	if (batch->head) {
		/* Try to extend the current batch */
		if (batch->nr_batched_regions < max_batch_size &&
		    (batch->head->region_nr + batch->nr_batched_regions) == hd->region_nr) {
			list_add_tail(&hd->list, &batch->head->list);
			batch->nr_batched_regions++;
			/* hd == NULL marks that hd has been consumed */
			hd = NULL;
		}

		/* Check if we should issue the current batch */
		if (batch->nr_batched_regions >= max_batch_size || hd) {
			hydration_copy(batch->head, batch->nr_batched_regions);
			batch->head = NULL;
			batch->nr_batched_regions = 0;
		}
	}

	if (!hd)
		return;

	/* We treat max batch sizes of zero and one equivalently */
	if (max_batch_size <= 1) {
		hydration_copy(hd, 1);
		return;
	}

	/* Start a new batch */
	BUG_ON(!list_empty(&hd->list));
	batch->head = hd;
	batch->nr_batched_regions = 1;
}
1003 
/*
 * Find the next region at or after @offset that is neither hydrated nor
 * being hydrated, and start its hydration through @batch.
 *
 * Returns the region number following the one whose hydration was started,
 * or the offset at which the search stopped (at least nr_regions) if no such
 * region was found.
 */
static unsigned long __start_next_hydration(struct clone *clone,
					    unsigned long offset,
					    struct batch_info *batch)
{
	struct hash_table_bucket *bucket;
	struct dm_clone_region_hydration *hd;
	unsigned long nr_regions = clone->nr_regions;

	/* Allocate up front, since the allocation can't be done under the bucket lock */
	hd = alloc_hydration(clone);

	/* Try to find a region to hydrate. */
	do {
		offset = dm_clone_find_next_unhydrated_region(clone->cmd, offset);
		if (offset == nr_regions)
			break;

		bucket = get_hash_table_bucket(clone, offset);
		bucket_lock_irq(bucket);

		/* Re-check under the bucket lock */
		if (!dm_clone_is_region_hydrated(clone->cmd, offset) &&
		    !__hash_find(bucket, offset)) {
			hydration_init(hd, offset);
			__insert_region_hydration(bucket, hd);
			bucket_unlock_irq(bucket);

			/* Batch hydration */
			__batch_hydration(batch, hd);

			return (offset + 1);
		}

		bucket_unlock_irq(bucket);

	} while (++offset < nr_regions);

	/* No region found: the pre-allocated descriptor is not needed */
	if (hd)
		free_hydration(hd);

	return offset;
}
1044 
/*
 * This function searches for regions that still reside in the source device
 * and starts their hydration.
 *
 * Nothing is done in RO or FAIL mode, or when hydration is already done.
 * Hydrations are started from clone->hydration_offset onwards for as long as
 * hydration is enabled and not suspended, there is no I/O in flight and the
 * hydration threshold is not exceeded. The offset reached is stored back in
 * clone->hydration_offset, wrapping to 0 at the end of the device.
 */
static void do_hydration(struct clone *clone)
{
	unsigned int current_volume;
	unsigned long offset, nr_regions = clone->nr_regions;

	struct batch_info batch = {
		.head = NULL,
		.nr_batched_regions = 0,
	};

	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
		return;

	if (dm_clone_is_hydration_done(clone->cmd))
		return;

	/*
	 * Avoid race with device suspension.
	 */
	atomic_inc(&clone->hydrations_in_flight);

	/*
	 * Make sure atomic_inc() is ordered before test_bit(), otherwise we
	 * might race with clone_postsuspend() and start a region hydration
	 * after the target has been suspended.
	 *
	 * This is paired with the smp_mb__after_atomic() in
	 * clone_postsuspend().
	 */
	smp_mb__after_atomic();

	offset = clone->hydration_offset;
	while (likely(!test_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags)) &&
	       !atomic_read(&clone->ios_in_flight) &&
	       test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
	       offset < nr_regions) {
		/*
		 * Regions in flight (this count includes the increment made
		 * above) plus regions batched but not yet issued.
		 */
		current_volume = atomic_read(&clone->hydrations_in_flight);
		current_volume += batch.nr_batched_regions;

		if (current_volume > READ_ONCE(clone->hydration_threshold))
			break;

		offset = __start_next_hydration(clone, offset, &batch);
	}

	/* Issue the batch that is still open, if any */
	if (batch.head)
		hydration_copy(batch.head, batch.nr_batched_regions);

	if (offset >= nr_regions)
		offset = 0;

	clone->hydration_offset = offset;

	/* Drop the count taken above */
	if (atomic_dec_and_test(&clone->hydrations_in_flight))
		wakeup_hydration_waiters(clone);
}
1105 
1106 /*---------------------------------------------------------------------------*/
1107 
1108 static bool need_commit_due_to_time(struct clone *clone)
1109 {
1110 	return !time_in_range(jiffies, clone->last_commit_jiffies,
1111 			      clone->last_commit_jiffies + COMMIT_PERIOD);
1112 }
1113 
/*
 * Commit the metadata, if it changed during the current transaction.
 *
 * @dest_dev_flushed: optional (may be NULL). It is cleared on entry and set
 *		      to true only after the flush of the destination device,
 *		      issued as part of the commit, has succeeded.
 *
 * The whole operation is serialized by clone->commit_lock.
 *
 * A non-zero return indicates read-only or fail mode.
 */
static int commit_metadata(struct clone *clone, bool *dest_dev_flushed)
{
	int r = 0;

	if (dest_dev_flushed)
		*dest_dev_flushed = false;

	mutex_lock(&clone->commit_lock);

	/* Nothing to commit: return success without touching the devices */
	if (!dm_clone_changed_this_transaction(clone->cmd))
		goto out;

	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
		r = -EPERM;
		goto out;
	}

	r = dm_clone_metadata_pre_commit(clone->cmd);
	if (unlikely(r)) {
		__metadata_operation_failed(clone, "dm_clone_metadata_pre_commit", r);
		goto out;
	}

	/*
	 * Flush the destination device between the pre-commit and the commit
	 * steps, using the preallocated flush bio.
	 * NOTE(review): presumably this makes the hydrated data durable before
	 * the metadata records the regions as hydrated -- confirm.
	 */
	bio_reset(&clone->flush_bio);
	bio_set_dev(&clone->flush_bio, clone->dest_dev->bdev);
	clone->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;

	r = submit_bio_wait(&clone->flush_bio);
	if (unlikely(r)) {
		__metadata_operation_failed(clone, "flush destination device", r);
		goto out;
	}

	if (dest_dev_flushed)
		*dest_dev_flushed = true;

	r = dm_clone_metadata_commit(clone->cmd);
	if (unlikely(r)) {
		__metadata_operation_failed(clone, "dm_clone_metadata_commit", r);
		goto out;
	}

	/* Raise a table event once the hydration is reported as done */
	if (dm_clone_is_hydration_done(clone->cmd))
		dm_table_event(clone->ti->table);
out:
	mutex_unlock(&clone->commit_lock);

	return r;
}
1166 
1167 static void process_deferred_discards(struct clone *clone)
1168 {
1169 	int r = -EPERM;
1170 	struct bio *bio;
1171 	struct blk_plug plug;
1172 	unsigned long rs, re;
1173 	struct bio_list discards = BIO_EMPTY_LIST;
1174 
1175 	spin_lock_irq(&clone->lock);
1176 	bio_list_merge(&discards, &clone->deferred_discard_bios);
1177 	bio_list_init(&clone->deferred_discard_bios);
1178 	spin_unlock_irq(&clone->lock);
1179 
1180 	if (bio_list_empty(&discards))
1181 		return;
1182 
1183 	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
1184 		goto out;
1185 
1186 	/* Update the metadata */
1187 	bio_list_for_each(bio, &discards) {
1188 		bio_region_range(clone, bio, &rs, &re);
1189 		/*
1190 		 * A discard request might cover regions that have been already
1191 		 * hydrated. There is no need to update the metadata for these
1192 		 * regions.
1193 		 */
1194 		r = dm_clone_cond_set_range(clone->cmd, rs, re - rs);
1195 
1196 		if (unlikely(r))
1197 			break;
1198 	}
1199 out:
1200 	blk_start_plug(&plug);
1201 	while ((bio = bio_list_pop(&discards)))
1202 		complete_discard_bio(clone, bio, r == 0);
1203 	blk_finish_plug(&plug);
1204 }
1205 
1206 static void process_deferred_bios(struct clone *clone)
1207 {
1208 	struct bio_list bios = BIO_EMPTY_LIST;
1209 
1210 	spin_lock_irq(&clone->lock);
1211 	bio_list_merge(&bios, &clone->deferred_bios);
1212 	bio_list_init(&clone->deferred_bios);
1213 	spin_unlock_irq(&clone->lock);
1214 
1215 	if (bio_list_empty(&bios))
1216 		return;
1217 
1218 	submit_bios(&bios);
1219 }
1220 
1221 static void process_deferred_flush_bios(struct clone *clone)
1222 {
1223 	struct bio *bio;
1224 	bool dest_dev_flushed;
1225 	struct bio_list bios = BIO_EMPTY_LIST;
1226 	struct bio_list bio_completions = BIO_EMPTY_LIST;
1227 
1228 	/*
1229 	 * If there are any deferred flush bios, we must commit the metadata
1230 	 * before issuing them or signaling their completion.
1231 	 */
1232 	spin_lock_irq(&clone->lock);
1233 	bio_list_merge(&bios, &clone->deferred_flush_bios);
1234 	bio_list_init(&clone->deferred_flush_bios);
1235 
1236 	bio_list_merge(&bio_completions, &clone->deferred_flush_completions);
1237 	bio_list_init(&clone->deferred_flush_completions);
1238 	spin_unlock_irq(&clone->lock);
1239 
1240 	if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
1241 	    !(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone)))
1242 		return;
1243 
1244 	if (commit_metadata(clone, &dest_dev_flushed)) {
1245 		bio_list_merge(&bios, &bio_completions);
1246 
1247 		while ((bio = bio_list_pop(&bios)))
1248 			bio_io_error(bio);
1249 
1250 		return;
1251 	}
1252 
1253 	clone->last_commit_jiffies = jiffies;
1254 
1255 	while ((bio = bio_list_pop(&bio_completions)))
1256 		bio_endio(bio);
1257 
1258 	while ((bio = bio_list_pop(&bios))) {
1259 		if ((bio->bi_opf & REQ_PREFLUSH) && dest_dev_flushed) {
1260 			/* We just flushed the destination device as part of
1261 			 * the metadata commit, so there is no reason to send
1262 			 * another flush.
1263 			 */
1264 			bio_endio(bio);
1265 		} else {
1266 			generic_make_request(bio);
1267 		}
1268 	}
1269 }
1270 
1271 static void do_worker(struct work_struct *work)
1272 {
1273 	struct clone *clone = container_of(work, typeof(*clone), worker);
1274 
1275 	process_deferred_bios(clone);
1276 	process_deferred_discards(clone);
1277 
1278 	/*
1279 	 * process_deferred_flush_bios():
1280 	 *
1281 	 *   - Commit metadata
1282 	 *
1283 	 *   - Process deferred REQ_FUA completions
1284 	 *
1285 	 *   - Process deferred REQ_PREFLUSH bios
1286 	 */
1287 	process_deferred_flush_bios(clone);
1288 
1289 	/* Background hydration */
1290 	do_hydration(clone);
1291 }
1292 
1293 /*
1294  * Commit periodically so that not too much unwritten data builds up.
1295  *
1296  * Also, restart background hydration, if it has been stopped by in-flight I/O.
1297  */
1298 static void do_waker(struct work_struct *work)
1299 {
1300 	struct clone *clone = container_of(to_delayed_work(work), struct clone, waker);
1301 
1302 	wake_worker(clone);
1303 	queue_delayed_work(clone->wq, &clone->waker, COMMIT_PERIOD);
1304 }
1305 
1306 /*---------------------------------------------------------------------------*/
1307 
1308 /*
1309  * Target methods
1310  */
1311 static int clone_map(struct dm_target *ti, struct bio *bio)
1312 {
1313 	struct clone *clone = ti->private;
1314 	unsigned long region_nr;
1315 
1316 	atomic_inc(&clone->ios_in_flight);
1317 
1318 	if (unlikely(get_clone_mode(clone) == CM_FAIL))
1319 		return DM_MAPIO_KILL;
1320 
1321 	/*
1322 	 * REQ_PREFLUSH bios carry no data:
1323 	 *
1324 	 * - Commit metadata, if changed
1325 	 *
1326 	 * - Pass down to destination device
1327 	 */
1328 	if (bio->bi_opf & REQ_PREFLUSH) {
1329 		remap_and_issue(clone, bio);
1330 		return DM_MAPIO_SUBMITTED;
1331 	}
1332 
1333 	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
1334 
1335 	/*
1336 	 * dm-clone interprets discards and performs a fast hydration of the
1337 	 * discarded regions, i.e., we skip the copy from the source device and
1338 	 * just mark the regions as hydrated.
1339 	 */
1340 	if (bio_op(bio) == REQ_OP_DISCARD) {
1341 		process_discard_bio(clone, bio);
1342 		return DM_MAPIO_SUBMITTED;
1343 	}
1344 
1345 	/*
1346 	 * If the bio's region is hydrated, redirect it to the destination
1347 	 * device.
1348 	 *
1349 	 * If the region is not hydrated and the bio is a READ, redirect it to
1350 	 * the source device.
1351 	 *
1352 	 * Else, defer WRITE bio until after its region has been hydrated and
1353 	 * start the region's hydration immediately.
1354 	 */
1355 	region_nr = bio_to_region(clone, bio);
1356 	if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
1357 		remap_and_issue(clone, bio);
1358 		return DM_MAPIO_SUBMITTED;
1359 	} else if (bio_data_dir(bio) == READ) {
1360 		remap_to_source(clone, bio);
1361 		return DM_MAPIO_REMAPPED;
1362 	}
1363 
1364 	remap_to_dest(clone, bio);
1365 	hydrate_bio_region(clone, bio);
1366 
1367 	return DM_MAPIO_SUBMITTED;
1368 }
1369 
1370 static int clone_endio(struct dm_target *ti, struct bio *bio, blk_status_t *error)
1371 {
1372 	struct clone *clone = ti->private;
1373 
1374 	atomic_dec(&clone->ios_in_flight);
1375 
1376 	return DM_ENDIO_DONE;
1377 }
1378 
1379 static void emit_flags(struct clone *clone, char *result, unsigned int maxlen,
1380 		       ssize_t *sz_ptr)
1381 {
1382 	ssize_t sz = *sz_ptr;
1383 	unsigned int count;
1384 
1385 	count = !test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
1386 	count += !test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
1387 
1388 	DMEMIT("%u ", count);
1389 
1390 	if (!test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags))
1391 		DMEMIT("no_hydration ");
1392 
1393 	if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags))
1394 		DMEMIT("no_discard_passdown ");
1395 
1396 	*sz_ptr = sz;
1397 }
1398 
1399 static void emit_core_args(struct clone *clone, char *result,
1400 			   unsigned int maxlen, ssize_t *sz_ptr)
1401 {
1402 	ssize_t sz = *sz_ptr;
1403 	unsigned int count = 4;
1404 
1405 	DMEMIT("%u hydration_threshold %u hydration_batch_size %u ", count,
1406 	       READ_ONCE(clone->hydration_threshold),
1407 	       READ_ONCE(clone->hydration_batch_size));
1408 
1409 	*sz_ptr = sz;
1410 }
1411 
/*
 * Status format:
 *
 * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
 * <clone region size> <#hydrated regions>/<#total regions> <#hydrating regions>
 * <#features> <features>* <#core args> <core args>* <clone metadata mode>
 *
 * For STATUSTYPE_INFO the line above is emitted, or just "Fail" when the
 * clone is in fail mode, or "Error" when the metadata block counts cannot be
 * read. For STATUSTYPE_TABLE the three devices are emitted followed by the
 * constructor arguments saved by clone_ctr().
 */
static void clone_status(struct dm_target *ti, status_type_t type,
			 unsigned int status_flags, char *result,
			 unsigned int maxlen)
{
	int r;
	unsigned int i;
	ssize_t sz = 0;
	dm_block_t nr_free_metadata_blocks = 0;
	dm_block_t nr_metadata_blocks = 0;
	char buf[BDEVNAME_SIZE];
	struct clone *clone = ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		if (get_clone_mode(clone) == CM_FAIL) {
			DMEMIT("Fail");
			break;
		}

		/* Commit to ensure statistics aren't out-of-date */
		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
			(void) commit_metadata(clone, NULL);

		r = dm_clone_get_free_metadata_block_count(clone->cmd, &nr_free_metadata_blocks);

		if (r) {
			DMERR("%s: dm_clone_get_free_metadata_block_count returned %d",
			      clone_device_name(clone), r);
			goto error;
		}

		r = dm_clone_get_metadata_dev_size(clone->cmd, &nr_metadata_blocks);

		if (r) {
			DMERR("%s: dm_clone_get_metadata_dev_size returned %d",
			      clone_device_name(clone), r);
			goto error;
		}

		/* Used metadata blocks are reported as total minus free */
		DMEMIT("%u %llu/%llu %llu %lu/%lu %u ",
		       DM_CLONE_METADATA_BLOCK_SIZE,
		       (unsigned long long)(nr_metadata_blocks - nr_free_metadata_blocks),
		       (unsigned long long)nr_metadata_blocks,
		       (unsigned long long)clone->region_size,
		       dm_clone_nr_of_hydrated_regions(clone->cmd),
		       clone->nr_regions,
		       atomic_read(&clone->hydrations_in_flight));

		emit_flags(clone, result, maxlen, &sz);
		emit_core_args(clone, result, maxlen, &sz);

		/* The mode is re-read here, so it may differ from the check above */
		switch (get_clone_mode(clone)) {
		case CM_WRITE:
			DMEMIT("rw");
			break;
		case CM_READ_ONLY:
			DMEMIT("ro");
			break;
		case CM_FAIL:
			DMEMIT("Fail");
		}

		break;

	case STATUSTYPE_TABLE:
		/* Devices are emitted in constructor order: metadata, dest, source */
		format_dev_t(buf, clone->metadata_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);

		format_dev_t(buf, clone->dest_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);

		format_dev_t(buf, clone->source_dev->bdev->bd_dev);
		DMEMIT("%s", buf);

		for (i = 0; i < clone->nr_ctr_args; i++)
			DMEMIT(" %s", clone->ctr_args[i]);
	}

	return;

error:
	DMEMIT("Error");
}
1502 
1503 static int clone_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1504 {
1505 	struct request_queue *dest_q, *source_q;
1506 	struct clone *clone = container_of(cb, struct clone, callbacks);
1507 
1508 	source_q = bdev_get_queue(clone->source_dev->bdev);
1509 	dest_q = bdev_get_queue(clone->dest_dev->bdev);
1510 
1511 	return (bdi_congested(dest_q->backing_dev_info, bdi_bits) |
1512 		bdi_congested(source_q->backing_dev_info, bdi_bits));
1513 }
1514 
1515 static sector_t get_dev_size(struct dm_dev *dev)
1516 {
1517 	return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1518 }
1519 
1520 /*---------------------------------------------------------------------------*/
1521 
1522 /*
1523  * Construct a clone device mapping:
1524  *
1525  * clone <metadata dev> <destination dev> <source dev> <region size>
1526  *	[<#feature args> [<feature arg>]* [<#core args> [key value]*]]
1527  *
1528  * metadata dev: Fast device holding the persistent metadata
1529  * destination dev: The destination device, which will become a clone of the
1530  *                  source device
1531  * source dev: The read-only source device that gets cloned
1532  * region size: dm-clone unit size in sectors
1533  *
1534  * #feature args: Number of feature arguments passed
1535  * feature args: E.g. no_hydration, no_discard_passdown
1536  *
1537  * #core arguments: An even number of core arguments
1538  * core arguments: Key/value pairs for tuning the core
1539  *		   E.g. 'hydration_threshold 256'
1540  */
1541 static int parse_feature_args(struct dm_arg_set *as, struct clone *clone)
1542 {
1543 	int r;
1544 	unsigned int argc;
1545 	const char *arg_name;
1546 	struct dm_target *ti = clone->ti;
1547 
1548 	const struct dm_arg args = {
1549 		.min = 0,
1550 		.max = 2,
1551 		.error = "Invalid number of feature arguments"
1552 	};
1553 
1554 	/* No feature arguments supplied */
1555 	if (!as->argc)
1556 		return 0;
1557 
1558 	r = dm_read_arg_group(&args, as, &argc, &ti->error);
1559 	if (r)
1560 		return r;
1561 
1562 	while (argc) {
1563 		arg_name = dm_shift_arg(as);
1564 		argc--;
1565 
1566 		if (!strcasecmp(arg_name, "no_hydration")) {
1567 			__clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
1568 		} else if (!strcasecmp(arg_name, "no_discard_passdown")) {
1569 			__clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
1570 		} else {
1571 			ti->error = "Invalid feature argument";
1572 			return -EINVAL;
1573 		}
1574 	}
1575 
1576 	return 0;
1577 }
1578 
1579 static int parse_core_args(struct dm_arg_set *as, struct clone *clone)
1580 {
1581 	int r;
1582 	unsigned int argc;
1583 	unsigned int value;
1584 	const char *arg_name;
1585 	struct dm_target *ti = clone->ti;
1586 
1587 	const struct dm_arg args = {
1588 		.min = 0,
1589 		.max = 4,
1590 		.error = "Invalid number of core arguments"
1591 	};
1592 
1593 	/* Initialize core arguments */
1594 	clone->hydration_batch_size = DEFAULT_HYDRATION_BATCH_SIZE;
1595 	clone->hydration_threshold = DEFAULT_HYDRATION_THRESHOLD;
1596 
1597 	/* No core arguments supplied */
1598 	if (!as->argc)
1599 		return 0;
1600 
1601 	r = dm_read_arg_group(&args, as, &argc, &ti->error);
1602 	if (r)
1603 		return r;
1604 
1605 	if (argc & 1) {
1606 		ti->error = "Number of core arguments must be even";
1607 		return -EINVAL;
1608 	}
1609 
1610 	while (argc) {
1611 		arg_name = dm_shift_arg(as);
1612 		argc -= 2;
1613 
1614 		if (!strcasecmp(arg_name, "hydration_threshold")) {
1615 			if (kstrtouint(dm_shift_arg(as), 10, &value)) {
1616 				ti->error = "Invalid value for argument `hydration_threshold'";
1617 				return -EINVAL;
1618 			}
1619 			clone->hydration_threshold = value;
1620 		} else if (!strcasecmp(arg_name, "hydration_batch_size")) {
1621 			if (kstrtouint(dm_shift_arg(as), 10, &value)) {
1622 				ti->error = "Invalid value for argument `hydration_batch_size'";
1623 				return -EINVAL;
1624 			}
1625 			clone->hydration_batch_size = value;
1626 		} else {
1627 			ti->error = "Invalid core argument";
1628 			return -EINVAL;
1629 		}
1630 	}
1631 
1632 	return 0;
1633 }
1634 
1635 static int parse_region_size(struct clone *clone, struct dm_arg_set *as, char **error)
1636 {
1637 	int r;
1638 	unsigned int region_size;
1639 	struct dm_arg arg;
1640 
1641 	arg.min = MIN_REGION_SIZE;
1642 	arg.max = MAX_REGION_SIZE;
1643 	arg.error = "Invalid region size";
1644 
1645 	r = dm_read_arg(&arg, as, &region_size, error);
1646 	if (r)
1647 		return r;
1648 
1649 	/* Check region size is a power of 2 */
1650 	if (!is_power_of_2(region_size)) {
1651 		*error = "Region size is not a power of 2";
1652 		return -EINVAL;
1653 	}
1654 
1655 	/* Validate the region size against the device logical block size */
1656 	if (region_size % (bdev_logical_block_size(clone->source_dev->bdev) >> 9) ||
1657 	    region_size % (bdev_logical_block_size(clone->dest_dev->bdev) >> 9)) {
1658 		*error = "Region size is not a multiple of device logical block size";
1659 		return -EINVAL;
1660 	}
1661 
1662 	clone->region_size = region_size;
1663 
1664 	return 0;
1665 }
1666 
1667 static int validate_nr_regions(unsigned long n, char **error)
1668 {
1669 	/*
1670 	 * dm_bitset restricts us to 2^32 regions. test_bit & co. restrict us
1671 	 * further to 2^31 regions.
1672 	 */
1673 	if (n > (1UL << 31)) {
1674 		*error = "Too many regions. Consider increasing the region size";
1675 		return -EINVAL;
1676 	}
1677 
1678 	return 0;
1679 }
1680 
1681 static int parse_metadata_dev(struct clone *clone, struct dm_arg_set *as, char **error)
1682 {
1683 	int r;
1684 	sector_t metadata_dev_size;
1685 	char b[BDEVNAME_SIZE];
1686 
1687 	r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1688 			  &clone->metadata_dev);
1689 	if (r) {
1690 		*error = "Error opening metadata device";
1691 		return r;
1692 	}
1693 
1694 	metadata_dev_size = get_dev_size(clone->metadata_dev);
1695 	if (metadata_dev_size > DM_CLONE_METADATA_MAX_SECTORS_WARNING)
1696 		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1697 		       bdevname(clone->metadata_dev->bdev, b), DM_CLONE_METADATA_MAX_SECTORS);
1698 
1699 	return 0;
1700 }
1701 
1702 static int parse_dest_dev(struct clone *clone, struct dm_arg_set *as, char **error)
1703 {
1704 	int r;
1705 	sector_t dest_dev_size;
1706 
1707 	r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1708 			  &clone->dest_dev);
1709 	if (r) {
1710 		*error = "Error opening destination device";
1711 		return r;
1712 	}
1713 
1714 	dest_dev_size = get_dev_size(clone->dest_dev);
1715 	if (dest_dev_size < clone->ti->len) {
1716 		dm_put_device(clone->ti, clone->dest_dev);
1717 		*error = "Device size larger than destination device";
1718 		return -EINVAL;
1719 	}
1720 
1721 	return 0;
1722 }
1723 
1724 static int parse_source_dev(struct clone *clone, struct dm_arg_set *as, char **error)
1725 {
1726 	int r;
1727 	sector_t source_dev_size;
1728 
1729 	r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ,
1730 			  &clone->source_dev);
1731 	if (r) {
1732 		*error = "Error opening source device";
1733 		return r;
1734 	}
1735 
1736 	source_dev_size = get_dev_size(clone->source_dev);
1737 	if (source_dev_size < clone->ti->len) {
1738 		dm_put_device(clone->ti, clone->source_dev);
1739 		*error = "Device size larger than source device";
1740 		return -EINVAL;
1741 	}
1742 
1743 	return 0;
1744 }
1745 
1746 static int copy_ctr_args(struct clone *clone, int argc, const char **argv, char **error)
1747 {
1748 	unsigned int i;
1749 	const char **copy;
1750 
1751 	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
1752 	if (!copy)
1753 		goto error;
1754 
1755 	for (i = 0; i < argc; i++) {
1756 		copy[i] = kstrdup(argv[i], GFP_KERNEL);
1757 
1758 		if (!copy[i]) {
1759 			while (i--)
1760 				kfree(copy[i]);
1761 			kfree(copy);
1762 			goto error;
1763 		}
1764 	}
1765 
1766 	clone->nr_ctr_args = argc;
1767 	clone->ctr_args = copy;
1768 	return 0;
1769 
1770 error:
1771 	*error = "Failed to allocate memory for table line";
1772 	return -ENOMEM;
1773 }
1774 
1775 static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1776 {
1777 	int r;
1778 	struct clone *clone;
1779 	struct dm_arg_set as;
1780 
1781 	if (argc < 4) {
1782 		ti->error = "Invalid number of arguments";
1783 		return -EINVAL;
1784 	}
1785 
1786 	as.argc = argc;
1787 	as.argv = argv;
1788 
1789 	clone = kzalloc(sizeof(*clone), GFP_KERNEL);
1790 	if (!clone) {
1791 		ti->error = "Failed to allocate clone structure";
1792 		return -ENOMEM;
1793 	}
1794 
1795 	clone->ti = ti;
1796 
1797 	/* Initialize dm-clone flags */
1798 	__set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
1799 	__set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
1800 	__set_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
1801 
1802 	r = parse_metadata_dev(clone, &as, &ti->error);
1803 	if (r)
1804 		goto out_with_clone;
1805 
1806 	r = parse_dest_dev(clone, &as, &ti->error);
1807 	if (r)
1808 		goto out_with_meta_dev;
1809 
1810 	r = parse_source_dev(clone, &as, &ti->error);
1811 	if (r)
1812 		goto out_with_dest_dev;
1813 
1814 	r = parse_region_size(clone, &as, &ti->error);
1815 	if (r)
1816 		goto out_with_source_dev;
1817 
1818 	clone->region_shift = __ffs(clone->region_size);
1819 	clone->nr_regions = dm_sector_div_up(ti->len, clone->region_size);
1820 
1821 	r = validate_nr_regions(clone->nr_regions, &ti->error);
1822 	if (r)
1823 		goto out_with_source_dev;
1824 
1825 	r = dm_set_target_max_io_len(ti, clone->region_size);
1826 	if (r) {
1827 		ti->error = "Failed to set max io len";
1828 		goto out_with_source_dev;
1829 	}
1830 
1831 	r = parse_feature_args(&as, clone);
1832 	if (r)
1833 		goto out_with_source_dev;
1834 
1835 	r = parse_core_args(&as, clone);
1836 	if (r)
1837 		goto out_with_source_dev;
1838 
1839 	/* Load metadata */
1840 	clone->cmd = dm_clone_metadata_open(clone->metadata_dev->bdev, ti->len,
1841 					    clone->region_size);
1842 	if (IS_ERR(clone->cmd)) {
1843 		ti->error = "Failed to load metadata";
1844 		r = PTR_ERR(clone->cmd);
1845 		goto out_with_source_dev;
1846 	}
1847 
1848 	__set_clone_mode(clone, CM_WRITE);
1849 
1850 	if (get_clone_mode(clone) != CM_WRITE) {
1851 		ti->error = "Unable to get write access to metadata, please check/repair metadata";
1852 		r = -EPERM;
1853 		goto out_with_metadata;
1854 	}
1855 
1856 	clone->last_commit_jiffies = jiffies;
1857 
1858 	/* Allocate hydration hash table */
1859 	r = hash_table_init(clone);
1860 	if (r) {
1861 		ti->error = "Failed to allocate hydration hash table";
1862 		goto out_with_metadata;
1863 	}
1864 
1865 	atomic_set(&clone->ios_in_flight, 0);
1866 	init_waitqueue_head(&clone->hydration_stopped);
1867 	spin_lock_init(&clone->lock);
1868 	bio_list_init(&clone->deferred_bios);
1869 	bio_list_init(&clone->deferred_discard_bios);
1870 	bio_list_init(&clone->deferred_flush_bios);
1871 	bio_list_init(&clone->deferred_flush_completions);
1872 	clone->hydration_offset = 0;
1873 	atomic_set(&clone->hydrations_in_flight, 0);
1874 	bio_init(&clone->flush_bio, NULL, 0);
1875 
1876 	clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
1877 	if (!clone->wq) {
1878 		ti->error = "Failed to allocate workqueue";
1879 		r = -ENOMEM;
1880 		goto out_with_ht;
1881 	}
1882 
1883 	INIT_WORK(&clone->worker, do_worker);
1884 	INIT_DELAYED_WORK(&clone->waker, do_waker);
1885 
1886 	clone->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
1887 	if (IS_ERR(clone->kcopyd_client)) {
1888 		r = PTR_ERR(clone->kcopyd_client);
1889 		goto out_with_wq;
1890 	}
1891 
1892 	r = mempool_init_slab_pool(&clone->hydration_pool, MIN_HYDRATIONS,
1893 				   _hydration_cache);
1894 	if (r) {
1895 		ti->error = "Failed to create dm_clone_region_hydration memory pool";
1896 		goto out_with_kcopyd;
1897 	}
1898 
1899 	/* Save a copy of the table line */
1900 	r = copy_ctr_args(clone, argc - 3, (const char **)argv + 3, &ti->error);
1901 	if (r)
1902 		goto out_with_mempool;
1903 
1904 	mutex_init(&clone->commit_lock);
1905 	clone->callbacks.congested_fn = clone_is_congested;
1906 	dm_table_add_target_callbacks(ti->table, &clone->callbacks);
1907 
1908 	/* Enable flushes */
1909 	ti->num_flush_bios = 1;
1910 	ti->flush_supported = true;
1911 
1912 	/* Enable discards */
1913 	ti->discards_supported = true;
1914 	ti->num_discard_bios = 1;
1915 
1916 	ti->private = clone;
1917 
1918 	return 0;
1919 
1920 out_with_mempool:
1921 	mempool_exit(&clone->hydration_pool);
1922 out_with_kcopyd:
1923 	dm_kcopyd_client_destroy(clone->kcopyd_client);
1924 out_with_wq:
1925 	destroy_workqueue(clone->wq);
1926 out_with_ht:
1927 	hash_table_exit(clone);
1928 out_with_metadata:
1929 	dm_clone_metadata_close(clone->cmd);
1930 out_with_source_dev:
1931 	dm_put_device(ti, clone->source_dev);
1932 out_with_dest_dev:
1933 	dm_put_device(ti, clone->dest_dev);
1934 out_with_meta_dev:
1935 	dm_put_device(ti, clone->metadata_dev);
1936 out_with_clone:
1937 	kfree(clone);
1938 
1939 	return r;
1940 }
1941 
1942 static void clone_dtr(struct dm_target *ti)
1943 {
1944 	unsigned int i;
1945 	struct clone *clone = ti->private;
1946 
1947 	mutex_destroy(&clone->commit_lock);
1948 	bio_uninit(&clone->flush_bio);
1949 
1950 	for (i = 0; i < clone->nr_ctr_args; i++)
1951 		kfree(clone->ctr_args[i]);
1952 	kfree(clone->ctr_args);
1953 
1954 	mempool_exit(&clone->hydration_pool);
1955 	dm_kcopyd_client_destroy(clone->kcopyd_client);
1956 	destroy_workqueue(clone->wq);
1957 	hash_table_exit(clone);
1958 	dm_clone_metadata_close(clone->cmd);
1959 	dm_put_device(ti, clone->source_dev);
1960 	dm_put_device(ti, clone->dest_dev);
1961 	dm_put_device(ti, clone->metadata_dev);
1962 
1963 	kfree(clone);
1964 }
1965 
1966 /*---------------------------------------------------------------------------*/
1967 
/*
 * Postsuspend hook: quiesce the background activity of the target and
 * commit the metadata. The result of the final commit is ignored.
 */
static void clone_postsuspend(struct dm_target *ti)
{
	struct clone *clone = ti->private;

	/*
	 * To successfully suspend the device:
	 *
	 *	- We cancel the delayed work for periodic commits and wait for
	 *	  it to finish.
	 *
	 *	- We stop the background hydration, i.e. we prevent new region
	 *	  hydrations from starting.
	 *
	 *	- We wait for any in-flight hydrations to finish.
	 *
	 *	- We flush the workqueue.
	 *
	 *	- We commit the metadata.
	 */
	cancel_delayed_work_sync(&clone->waker);

	/* do_hydration() stops starting new hydrations once it sees this bit */
	set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);

	/*
	 * Make sure set_bit() is ordered before atomic_read(), otherwise we
	 * might race with do_hydration() and miss some started region
	 * hydrations.
	 *
	 * This is paired with smp_mb__after_atomic() in do_hydration().
	 */
	smp_mb__after_atomic();

	/* Sleep until the in-flight hydration counter drops to zero */
	wait_event(clone->hydration_stopped, !atomic_read(&clone->hydrations_in_flight));
	flush_workqueue(clone->wq);

	(void) commit_metadata(clone, NULL);
}
2005 
2006 static void clone_resume(struct dm_target *ti)
2007 {
2008 	struct clone *clone = ti->private;
2009 
2010 	clear_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
2011 	do_waker(&clone->waker.work);
2012 }
2013 
2014 static bool bdev_supports_discards(struct block_device *bdev)
2015 {
2016 	struct request_queue *q = bdev_get_queue(bdev);
2017 
2018 	return (q && blk_queue_discard(q));
2019 }
2020 
2021 /*
2022  * If discard_passdown was enabled verify that the destination device supports
2023  * discards. Disable discard_passdown if not.
2024  */
2025 static void disable_passdown_if_not_supported(struct clone *clone)
2026 {
2027 	struct block_device *dest_dev = clone->dest_dev->bdev;
2028 	struct queue_limits *dest_limits = &bdev_get_queue(dest_dev)->limits;
2029 	const char *reason = NULL;
2030 	char buf[BDEVNAME_SIZE];
2031 
2032 	if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags))
2033 		return;
2034 
2035 	if (!bdev_supports_discards(dest_dev))
2036 		reason = "discard unsupported";
2037 	else if (dest_limits->max_discard_sectors < clone->region_size)
2038 		reason = "max discard sectors smaller than a region";
2039 
2040 	if (reason) {
2041 		DMWARN("Destination device (%s) %s: Disabling discard passdown.",
2042 		       bdevname(dest_dev, buf), reason);
2043 		clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
2044 	}
2045 }
2046 
2047 static void set_discard_limits(struct clone *clone, struct queue_limits *limits)
2048 {
2049 	struct block_device *dest_bdev = clone->dest_dev->bdev;
2050 	struct queue_limits *dest_limits = &bdev_get_queue(dest_bdev)->limits;
2051 
2052 	if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) {
2053 		/* No passdown is done so we set our own virtual limits */
2054 		limits->discard_granularity = clone->region_size << SECTOR_SHIFT;
2055 		limits->max_discard_sectors = round_down(UINT_MAX >> SECTOR_SHIFT, clone->region_size);
2056 		return;
2057 	}
2058 
2059 	/*
2060 	 * clone_iterate_devices() is stacking both the source and destination
2061 	 * device limits but discards aren't passed to the source device, so
2062 	 * inherit destination's limits.
2063 	 */
2064 	limits->max_discard_sectors = dest_limits->max_discard_sectors;
2065 	limits->max_hw_discard_sectors = dest_limits->max_hw_discard_sectors;
2066 	limits->discard_granularity = dest_limits->discard_granularity;
2067 	limits->discard_alignment = dest_limits->discard_alignment;
2068 	limits->discard_misaligned = dest_limits->discard_misaligned;
2069 	limits->max_discard_segments = dest_limits->max_discard_segments;
2070 }
2071 
2072 static void clone_io_hints(struct dm_target *ti, struct queue_limits *limits)
2073 {
2074 	struct clone *clone = ti->private;
2075 	u64 io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
2076 
2077 	/*
2078 	 * If the system-determined stacked limits are compatible with
2079 	 * dm-clone's region size (io_opt is a factor) do not override them.
2080 	 */
2081 	if (io_opt_sectors < clone->region_size ||
2082 	    do_div(io_opt_sectors, clone->region_size)) {
2083 		blk_limits_io_min(limits, clone->region_size << SECTOR_SHIFT);
2084 		blk_limits_io_opt(limits, clone->region_size << SECTOR_SHIFT);
2085 	}
2086 
2087 	disable_passdown_if_not_supported(clone);
2088 	set_discard_limits(clone, limits);
2089 }
2090 
2091 static int clone_iterate_devices(struct dm_target *ti,
2092 				 iterate_devices_callout_fn fn, void *data)
2093 {
2094 	int ret;
2095 	struct clone *clone = ti->private;
2096 	struct dm_dev *dest_dev = clone->dest_dev;
2097 	struct dm_dev *source_dev = clone->source_dev;
2098 
2099 	ret = fn(ti, source_dev, 0, ti->len, data);
2100 	if (!ret)
2101 		ret = fn(ti, dest_dev, 0, ti->len, data);
2102 	return ret;
2103 }
2104 
2105 /*
2106  * dm-clone message functions.
2107  */
2108 static void set_hydration_threshold(struct clone *clone, unsigned int nr_regions)
2109 {
2110 	WRITE_ONCE(clone->hydration_threshold, nr_regions);
2111 
2112 	/*
2113 	 * If user space sets hydration_threshold to zero then the hydration
2114 	 * will stop. If at a later time the hydration_threshold is increased
2115 	 * we must restart the hydration process by waking up the worker.
2116 	 */
2117 	wake_worker(clone);
2118 }
2119 
2120 static void set_hydration_batch_size(struct clone *clone, unsigned int nr_regions)
2121 {
2122 	WRITE_ONCE(clone->hydration_batch_size, nr_regions);
2123 }
2124 
2125 static void enable_hydration(struct clone *clone)
2126 {
2127 	if (!test_and_set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags))
2128 		wake_worker(clone);
2129 }
2130 
2131 static void disable_hydration(struct clone *clone)
2132 {
2133 	clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
2134 }
2135 
2136 static int clone_message(struct dm_target *ti, unsigned int argc, char **argv,
2137 			 char *result, unsigned int maxlen)
2138 {
2139 	struct clone *clone = ti->private;
2140 	unsigned int value;
2141 
2142 	if (!argc)
2143 		return -EINVAL;
2144 
2145 	if (!strcasecmp(argv[0], "enable_hydration")) {
2146 		enable_hydration(clone);
2147 		return 0;
2148 	}
2149 
2150 	if (!strcasecmp(argv[0], "disable_hydration")) {
2151 		disable_hydration(clone);
2152 		return 0;
2153 	}
2154 
2155 	if (argc != 2)
2156 		return -EINVAL;
2157 
2158 	if (!strcasecmp(argv[0], "hydration_threshold")) {
2159 		if (kstrtouint(argv[1], 10, &value))
2160 			return -EINVAL;
2161 
2162 		set_hydration_threshold(clone, value);
2163 
2164 		return 0;
2165 	}
2166 
2167 	if (!strcasecmp(argv[0], "hydration_batch_size")) {
2168 		if (kstrtouint(argv[1], 10, &value))
2169 			return -EINVAL;
2170 
2171 		set_hydration_batch_size(clone, value);
2172 
2173 		return 0;
2174 	}
2175 
2176 	DMERR("%s: Unsupported message `%s'", clone_device_name(clone), argv[0]);
2177 	return -EINVAL;
2178 }
2179 
2180 static struct target_type clone_target = {
2181 	.name = "clone",
2182 	.version = {1, 0, 0},
2183 	.module = THIS_MODULE,
2184 	.ctr = clone_ctr,
2185 	.dtr =  clone_dtr,
2186 	.map = clone_map,
2187 	.end_io = clone_endio,
2188 	.postsuspend = clone_postsuspend,
2189 	.resume = clone_resume,
2190 	.status = clone_status,
2191 	.message = clone_message,
2192 	.io_hints = clone_io_hints,
2193 	.iterate_devices = clone_iterate_devices,
2194 };
2195 
2196 /*---------------------------------------------------------------------------*/
2197 
2198 /* Module functions */
2199 static int __init dm_clone_init(void)
2200 {
2201 	int r;
2202 
2203 	_hydration_cache = KMEM_CACHE(dm_clone_region_hydration, 0);
2204 	if (!_hydration_cache)
2205 		return -ENOMEM;
2206 
2207 	r = dm_register_target(&clone_target);
2208 	if (r < 0) {
2209 		DMERR("Failed to register clone target");
2210 		return r;
2211 	}
2212 
2213 	return 0;
2214 }
2215 
2216 static void __exit dm_clone_exit(void)
2217 {
2218 	dm_unregister_target(&clone_target);
2219 
2220 	kmem_cache_destroy(_hydration_cache);
2221 	_hydration_cache = NULL;
2222 }
2223 
2224 /* Module hooks */
2225 module_init(dm_clone_init);
2226 module_exit(dm_clone_exit);
2227 
2228 MODULE_DESCRIPTION(DM_NAME " clone target");
2229 MODULE_AUTHOR("Nikos Tsironis <ntsironis@arrikto.com>");
2230 MODULE_LICENSE("GPL");
2231