xref: /openbmc/linux/drivers/md/dm-clone-target.c (revision b58c6630)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2019 Arrikto, Inc. All Rights Reserved.
4  */
5 
6 #include <linux/mm.h>
7 #include <linux/bio.h>
8 #include <linux/err.h>
9 #include <linux/hash.h>
10 #include <linux/list.h>
11 #include <linux/log2.h>
12 #include <linux/init.h>
13 #include <linux/slab.h>
14 #include <linux/wait.h>
15 #include <linux/dm-io.h>
16 #include <linux/mutex.h>
17 #include <linux/atomic.h>
18 #include <linux/bitops.h>
19 #include <linux/blkdev.h>
20 #include <linux/kdev_t.h>
21 #include <linux/kernel.h>
22 #include <linux/module.h>
23 #include <linux/jiffies.h>
24 #include <linux/mempool.h>
25 #include <linux/spinlock.h>
26 #include <linux/blk_types.h>
27 #include <linux/dm-kcopyd.h>
28 #include <linux/workqueue.h>
29 #include <linux/backing-dev.h>
30 #include <linux/device-mapper.h>
31 
32 #include "dm.h"
33 #include "dm-clone-metadata.h"
34 
/* Prefix used by the DMERR()/DMINFO() log macros in this file */
#define DM_MSG_PREFIX "clone"

/*
 * Minimum and maximum allowed region sizes.
 *
 * The sizes noted next to each value (4KB, 1GB) match a count of 512-byte
 * sectors: 1 << 3 sectors == 4KB and 1 << 21 sectors == 1GB.
 */
#define MIN_REGION_SIZE (1 << 3)  /* 4KB */
#define MAX_REGION_SIZE (1 << 21) /* 1GB */

#define MIN_HYDRATIONS 256 /* Size of hydration mempool */
#define DEFAULT_HYDRATION_THRESHOLD 1 /* 1 region */
#define DEFAULT_HYDRATION_BATCH_SIZE 1 /* Hydrate in batches of 1 region */

/* Interval checked by need_commit_due_to_time() */
#define COMMIT_PERIOD HZ /* 1 sec */

/*
 * Hydration hash table size: 1 << HASH_TABLE_BITS
 */
#define HASH_TABLE_BITS 15

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(clone_hydration_throttle,
	"A percentage of time allocated for hydrating regions");

/* Slab cache for struct dm_clone_region_hydration */
static struct kmem_cache *_hydration_cache;
59 
/*
 * dm-clone metadata modes.
 *
 * The order of the enumerators matters: code in this file tests for
 * "read-only or worse" with `mode >= CM_READ_ONLY`.
 */
enum clone_metadata_mode {
	CM_WRITE,		/* metadata may be changed */
	CM_READ_ONLY,		/* metadata may not be changed */
	CM_FAIL,		/* all metadata I/O fails */
};
66 
struct hash_table_bucket;

/*
 * Per-target state of a dm-clone device.
 */
struct clone {
	/* The device-mapper target this clone instance belongs to */
	struct dm_target *ti;
	/* NOTE(review): presumably registered with DM core by the constructor - confirm */
	struct dm_target_callbacks callbacks;

	struct dm_dev *metadata_dev;
	/* Device bios are remapped to by remap_to_dest(); hydration copy target */
	struct dm_dev *dest_dev;
	/* Device bios are remapped to by remap_to_source(); hydration copy source */
	struct dm_dev *source_dev;

	/* Total number of regions of the target */
	unsigned long nr_regions;
	/* Region size in sectors; region_shift is used to convert sectors <-> regions */
	sector_t region_size;
	unsigned int region_shift;

	/*
	 * A metadata commit and the actions taken in case it fails should run
	 * as a single atomic step.
	 */
	struct mutex commit_lock;

	struct dm_clone_metadata *cmd;

	/*
	 * bio used to flush the destination device, before committing the
	 * metadata.
	 */
	struct bio flush_bio;

	/* Region hydration hash table */
	struct hash_table_bucket *ht;

	/* Background hydration only proceeds while this reads as zero */
	atomic_t ios_in_flight;

	/* Woken when hydrations_in_flight drops to zero */
	wait_queue_head_t hydration_stopped;

	/* Pool that struct dm_clone_region_hydration descriptors come from */
	mempool_t hydration_pool;

	/* Compared against jiffies by need_commit_due_to_time() */
	unsigned long last_commit_jiffies;

	/*
	 * We defer incoming WRITE bios for regions that are not hydrated,
	 * until after these regions have been hydrated.
	 *
	 * Also, we defer REQ_FUA and REQ_PREFLUSH bios, until after the
	 * metadata have been committed.
	 */
	spinlock_t lock;
	struct bio_list deferred_bios;
	struct bio_list deferred_discard_bios;
	struct bio_list deferred_flush_bios;
	struct bio_list deferred_flush_completions;

	/* Maximum number of regions being copied during background hydration. */
	unsigned int hydration_threshold;

	/* Number of regions to batch together during background hydration. */
	unsigned int hydration_batch_size;

	/* Which region to hydrate next */
	unsigned long hydration_offset;

	/* Number of region hydrations currently in progress */
	atomic_t hydrations_in_flight;

	/*
	 * Save a copy of the table line rather than reconstructing it for the
	 * status.
	 */
	unsigned int nr_ctr_args;
	const char **ctr_args;

	/* Workqueue and work item queued by wake_worker() */
	struct workqueue_struct *wq;
	struct work_struct worker;
	/* NOTE(review): presumably re-queues the worker periodically - confirm */
	struct delayed_work waker;

	/* kcopyd client used to copy regions from source to destination */
	struct dm_kcopyd_client *kcopyd_client;

	/* Current metadata mode; access via get_clone_mode()/__set_clone_mode() */
	enum clone_metadata_mode mode;
	/* Bitmap of DM_CLONE_* flags, accessed with test_bit() */
	unsigned long flags;
};
146 
/*
 * dm-clone flags
 *
 * These are bit numbers within clone->flags and are tested with test_bit().
 */
/* When set, complete_discard_bio() passes discards down to the destination */
#define DM_CLONE_DISCARD_PASSDOWN 0
/* Background hydration only starts new work while this bit is set */
#define DM_CLONE_HYDRATION_ENABLED 1
/* When set, do_hydration() stops starting new region hydrations */
#define DM_CLONE_HYDRATION_SUSPENDED 2
153 
154 /*---------------------------------------------------------------------------*/
155 
156 /*
157  * Metadata failure handling.
158  */
159 static enum clone_metadata_mode get_clone_mode(struct clone *clone)
160 {
161 	return READ_ONCE(clone->mode);
162 }
163 
164 static const char *clone_device_name(struct clone *clone)
165 {
166 	return dm_table_device_name(clone->ti->table);
167 }
168 
169 static void __set_clone_mode(struct clone *clone, enum clone_metadata_mode new_mode)
170 {
171 	const char *descs[] = {
172 		"read-write",
173 		"read-only",
174 		"fail"
175 	};
176 
177 	enum clone_metadata_mode old_mode = get_clone_mode(clone);
178 
179 	/* Never move out of fail mode */
180 	if (old_mode == CM_FAIL)
181 		new_mode = CM_FAIL;
182 
183 	switch (new_mode) {
184 	case CM_FAIL:
185 	case CM_READ_ONLY:
186 		dm_clone_metadata_set_read_only(clone->cmd);
187 		break;
188 
189 	case CM_WRITE:
190 		dm_clone_metadata_set_read_write(clone->cmd);
191 		break;
192 	}
193 
194 	WRITE_ONCE(clone->mode, new_mode);
195 
196 	if (new_mode != old_mode) {
197 		dm_table_event(clone->ti->table);
198 		DMINFO("%s: Switching to %s mode", clone_device_name(clone),
199 		       descs[(int)new_mode]);
200 	}
201 }
202 
203 static void __abort_transaction(struct clone *clone)
204 {
205 	const char *dev_name = clone_device_name(clone);
206 
207 	if (get_clone_mode(clone) >= CM_READ_ONLY)
208 		return;
209 
210 	DMERR("%s: Aborting current metadata transaction", dev_name);
211 	if (dm_clone_metadata_abort(clone->cmd)) {
212 		DMERR("%s: Failed to abort metadata transaction", dev_name);
213 		__set_clone_mode(clone, CM_FAIL);
214 	}
215 }
216 
217 static void __reload_in_core_bitset(struct clone *clone)
218 {
219 	const char *dev_name = clone_device_name(clone);
220 
221 	if (get_clone_mode(clone) == CM_FAIL)
222 		return;
223 
224 	/* Reload the on-disk bitset */
225 	DMINFO("%s: Reloading on-disk bitmap", dev_name);
226 	if (dm_clone_reload_in_core_bitset(clone->cmd)) {
227 		DMERR("%s: Failed to reload on-disk bitmap", dev_name);
228 		__set_clone_mode(clone, CM_FAIL);
229 	}
230 }
231 
232 static void __metadata_operation_failed(struct clone *clone, const char *op, int r)
233 {
234 	DMERR("%s: Metadata operation `%s' failed: error = %d",
235 	      clone_device_name(clone), op, r);
236 
237 	__abort_transaction(clone);
238 	__set_clone_mode(clone, CM_READ_ONLY);
239 
240 	/*
241 	 * dm_clone_reload_in_core_bitset() may run concurrently with either
242 	 * dm_clone_set_region_hydrated() or dm_clone_cond_set_range(), but
243 	 * it's safe as we have already set the metadata to read-only mode.
244 	 */
245 	__reload_in_core_bitset(clone);
246 }
247 
248 /*---------------------------------------------------------------------------*/
249 
250 /* Wake up anyone waiting for region hydrations to stop */
251 static inline void wakeup_hydration_waiters(struct clone *clone)
252 {
253 	wake_up_all(&clone->hydration_stopped);
254 }
255 
256 static inline void wake_worker(struct clone *clone)
257 {
258 	queue_work(clone->wq, &clone->worker);
259 }
260 
261 /*---------------------------------------------------------------------------*/
262 
263 /*
264  * bio helper functions.
265  */
266 static inline void remap_to_source(struct clone *clone, struct bio *bio)
267 {
268 	bio_set_dev(bio, clone->source_dev->bdev);
269 }
270 
271 static inline void remap_to_dest(struct clone *clone, struct bio *bio)
272 {
273 	bio_set_dev(bio, clone->dest_dev->bdev);
274 }
275 
276 static bool bio_triggers_commit(struct clone *clone, struct bio *bio)
277 {
278 	return op_is_flush(bio->bi_opf) &&
279 		dm_clone_changed_this_transaction(clone->cmd);
280 }
281 
282 /* Get the address of the region in sectors */
283 static inline sector_t region_to_sector(struct clone *clone, unsigned long region_nr)
284 {
285 	return ((sector_t)region_nr << clone->region_shift);
286 }
287 
288 /* Get the region number of the bio */
289 static inline unsigned long bio_to_region(struct clone *clone, struct bio *bio)
290 {
291 	return (bio->bi_iter.bi_sector >> clone->region_shift);
292 }
293 
294 /* Get the region range covered by the bio */
295 static void bio_region_range(struct clone *clone, struct bio *bio,
296 			     unsigned long *rs, unsigned long *nr_regions)
297 {
298 	unsigned long end;
299 
300 	*rs = dm_sector_div_up(bio->bi_iter.bi_sector, clone->region_size);
301 	end = bio_end_sector(bio) >> clone->region_shift;
302 
303 	if (*rs >= end)
304 		*nr_regions = 0;
305 	else
306 		*nr_regions = end - *rs;
307 }
308 
309 /* Check whether a bio overwrites a region */
310 static inline bool is_overwrite_bio(struct clone *clone, struct bio *bio)
311 {
312 	return (bio_data_dir(bio) == WRITE && bio_sectors(bio) == clone->region_size);
313 }
314 
315 static void fail_bios(struct bio_list *bios, blk_status_t status)
316 {
317 	struct bio *bio;
318 
319 	while ((bio = bio_list_pop(bios))) {
320 		bio->bi_status = status;
321 		bio_endio(bio);
322 	}
323 }
324 
325 static void submit_bios(struct bio_list *bios)
326 {
327 	struct bio *bio;
328 	struct blk_plug plug;
329 
330 	blk_start_plug(&plug);
331 
332 	while ((bio = bio_list_pop(bios)))
333 		generic_make_request(bio);
334 
335 	blk_finish_plug(&plug);
336 }
337 
338 /*
339  * Submit bio to the underlying device.
340  *
341  * If the bio triggers a commit, delay it, until after the metadata have been
342  * committed.
343  *
344  * NOTE: The bio remapping must be performed by the caller.
345  */
346 static void issue_bio(struct clone *clone, struct bio *bio)
347 {
348 	if (!bio_triggers_commit(clone, bio)) {
349 		generic_make_request(bio);
350 		return;
351 	}
352 
353 	/*
354 	 * If the metadata mode is RO or FAIL we won't be able to commit the
355 	 * metadata, so we complete the bio with an error.
356 	 */
357 	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
358 		bio_io_error(bio);
359 		return;
360 	}
361 
362 	/*
363 	 * Batch together any bios that trigger commits and then issue a single
364 	 * commit for them in process_deferred_flush_bios().
365 	 */
366 	spin_lock_irq(&clone->lock);
367 	bio_list_add(&clone->deferred_flush_bios, bio);
368 	spin_unlock_irq(&clone->lock);
369 
370 	wake_worker(clone);
371 }
372 
373 /*
374  * Remap bio to the destination device and submit it.
375  *
376  * If the bio triggers a commit, delay it, until after the metadata have been
377  * committed.
378  */
379 static void remap_and_issue(struct clone *clone, struct bio *bio)
380 {
381 	remap_to_dest(clone, bio);
382 	issue_bio(clone, bio);
383 }
384 
385 /*
386  * Issue bios that have been deferred until after their region has finished
387  * hydrating.
388  *
389  * We delegate the bio submission to the worker thread, so this is safe to call
390  * from interrupt context.
391  */
392 static void issue_deferred_bios(struct clone *clone, struct bio_list *bios)
393 {
394 	struct bio *bio;
395 	unsigned long flags;
396 	struct bio_list flush_bios = BIO_EMPTY_LIST;
397 	struct bio_list normal_bios = BIO_EMPTY_LIST;
398 
399 	if (bio_list_empty(bios))
400 		return;
401 
402 	while ((bio = bio_list_pop(bios))) {
403 		if (bio_triggers_commit(clone, bio))
404 			bio_list_add(&flush_bios, bio);
405 		else
406 			bio_list_add(&normal_bios, bio);
407 	}
408 
409 	spin_lock_irqsave(&clone->lock, flags);
410 	bio_list_merge(&clone->deferred_bios, &normal_bios);
411 	bio_list_merge(&clone->deferred_flush_bios, &flush_bios);
412 	spin_unlock_irqrestore(&clone->lock, flags);
413 
414 	wake_worker(clone);
415 }
416 
417 static void complete_overwrite_bio(struct clone *clone, struct bio *bio)
418 {
419 	unsigned long flags;
420 
421 	/*
422 	 * If the bio has the REQ_FUA flag set we must commit the metadata
423 	 * before signaling its completion.
424 	 *
425 	 * complete_overwrite_bio() is only called by hydration_complete(),
426 	 * after having successfully updated the metadata. This means we don't
427 	 * need to call dm_clone_changed_this_transaction() to check if the
428 	 * metadata has changed and thus we can avoid taking the metadata spin
429 	 * lock.
430 	 */
431 	if (!(bio->bi_opf & REQ_FUA)) {
432 		bio_endio(bio);
433 		return;
434 	}
435 
436 	/*
437 	 * If the metadata mode is RO or FAIL we won't be able to commit the
438 	 * metadata, so we complete the bio with an error.
439 	 */
440 	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
441 		bio_io_error(bio);
442 		return;
443 	}
444 
445 	/*
446 	 * Batch together any bios that trigger commits and then issue a single
447 	 * commit for them in process_deferred_flush_bios().
448 	 */
449 	spin_lock_irqsave(&clone->lock, flags);
450 	bio_list_add(&clone->deferred_flush_completions, bio);
451 	spin_unlock_irqrestore(&clone->lock, flags);
452 
453 	wake_worker(clone);
454 }
455 
456 static void trim_bio(struct bio *bio, sector_t sector, unsigned int len)
457 {
458 	bio->bi_iter.bi_sector = sector;
459 	bio->bi_iter.bi_size = to_bytes(len);
460 }
461 
462 static void complete_discard_bio(struct clone *clone, struct bio *bio, bool success)
463 {
464 	unsigned long rs, nr_regions;
465 
466 	/*
467 	 * If the destination device supports discards, remap and trim the
468 	 * discard bio and pass it down. Otherwise complete the bio
469 	 * immediately.
470 	 */
471 	if (test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags) && success) {
472 		remap_to_dest(clone, bio);
473 		bio_region_range(clone, bio, &rs, &nr_regions);
474 		trim_bio(bio, region_to_sector(clone, rs),
475 			 nr_regions << clone->region_shift);
476 		generic_make_request(bio);
477 	} else
478 		bio_endio(bio);
479 }
480 
481 static void process_discard_bio(struct clone *clone, struct bio *bio)
482 {
483 	unsigned long rs, nr_regions;
484 
485 	bio_region_range(clone, bio, &rs, &nr_regions);
486 	if (!nr_regions) {
487 		bio_endio(bio);
488 		return;
489 	}
490 
491 	if (WARN_ON(rs >= clone->nr_regions || (rs + nr_regions) < rs ||
492 		    (rs + nr_regions) > clone->nr_regions)) {
493 		DMERR("%s: Invalid range (%lu + %lu, total regions %lu) for discard (%llu + %u)",
494 		      clone_device_name(clone), rs, nr_regions,
495 		      clone->nr_regions,
496 		      (unsigned long long)bio->bi_iter.bi_sector,
497 		      bio_sectors(bio));
498 		bio_endio(bio);
499 		return;
500 	}
501 
502 	/*
503 	 * The covered regions are already hydrated so we just need to pass
504 	 * down the discard.
505 	 */
506 	if (dm_clone_is_range_hydrated(clone->cmd, rs, nr_regions)) {
507 		complete_discard_bio(clone, bio, true);
508 		return;
509 	}
510 
511 	/*
512 	 * If the metadata mode is RO or FAIL we won't be able to update the
513 	 * metadata for the regions covered by the discard so we just ignore
514 	 * it.
515 	 */
516 	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
517 		bio_endio(bio);
518 		return;
519 	}
520 
521 	/*
522 	 * Defer discard processing.
523 	 */
524 	spin_lock_irq(&clone->lock);
525 	bio_list_add(&clone->deferred_discard_bios, bio);
526 	spin_unlock_irq(&clone->lock);
527 
528 	wake_worker(clone);
529 }
530 
531 /*---------------------------------------------------------------------------*/
532 
533 /*
534  * dm-clone region hydrations.
535  */
/*
 * Descriptor of a region hydration that is in progress.
 */
struct dm_clone_region_hydration {
	/* Owning clone; set by alloc_hydration() */
	struct clone *clone;
	/* Number of the region being hydrated */
	unsigned long region_nr;

	/*
	 * Set by hydration_overwrite() when the region is hydrated by a bio
	 * that overwrites it, together with the bio's original completion
	 * callback, which overwrite_endio() restores.
	 */
	struct bio *overwrite_bio;
	bio_end_io_t *overwrite_bio_end_io;

	/* bios waiting for this region's hydration to finish */
	struct bio_list deferred_bios;

	/* Outcome of the hydration (BLK_STS_OK on success) */
	blk_status_t status;

	/* Used by hydration batching */
	struct list_head list;

	/* Used by hydration hash table */
	struct hlist_node h;
};
553 
554 /*
555  * Hydration hash table implementation.
556  *
557  * Ideally we would like to use list_bl, which uses bit spin locks and employs
558  * the least significant bit of the list head to lock the corresponding bucket,
559  * reducing the memory overhead for the locks. But, currently, list_bl and bit
560  * spin locks don't support IRQ safe versions. Since we have to take the lock
561  * in both process and interrupt context, we must fall back to using regular
562  * spin locks; one per hash table bucket.
563  */
/* One bucket of the hydration hash table */
struct hash_table_bucket {
	/* Chain of struct dm_clone_region_hydration, linked through ->h */
	struct hlist_head head;

	/* Spinlock protecting the bucket */
	spinlock_t lock;
};

/* Bucket locking helpers: thin wrappers around the IRQ-safe spinlock API */
#define bucket_lock_irqsave(bucket, flags) \
	spin_lock_irqsave(&(bucket)->lock, flags)

#define bucket_unlock_irqrestore(bucket, flags) \
	spin_unlock_irqrestore(&(bucket)->lock, flags)

#define bucket_lock_irq(bucket) \
	spin_lock_irq(&(bucket)->lock)

#define bucket_unlock_irq(bucket) \
	spin_unlock_irq(&(bucket)->lock)
582 
583 static int hash_table_init(struct clone *clone)
584 {
585 	unsigned int i, sz;
586 	struct hash_table_bucket *bucket;
587 
588 	sz = 1 << HASH_TABLE_BITS;
589 
590 	clone->ht = kvmalloc(sz * sizeof(struct hash_table_bucket), GFP_KERNEL);
591 	if (!clone->ht)
592 		return -ENOMEM;
593 
594 	for (i = 0; i < sz; i++) {
595 		bucket = clone->ht + i;
596 
597 		INIT_HLIST_HEAD(&bucket->head);
598 		spin_lock_init(&bucket->lock);
599 	}
600 
601 	return 0;
602 }
603 
604 static void hash_table_exit(struct clone *clone)
605 {
606 	kvfree(clone->ht);
607 }
608 
609 static struct hash_table_bucket *get_hash_table_bucket(struct clone *clone,
610 						       unsigned long region_nr)
611 {
612 	return &clone->ht[hash_long(region_nr, HASH_TABLE_BITS)];
613 }
614 
615 /*
616  * Search hash table for a hydration with hd->region_nr == region_nr
617  *
618  * NOTE: Must be called with the bucket lock held
619  */
620 static struct dm_clone_region_hydration *__hash_find(struct hash_table_bucket *bucket,
621 						     unsigned long region_nr)
622 {
623 	struct dm_clone_region_hydration *hd;
624 
625 	hlist_for_each_entry(hd, &bucket->head, h) {
626 		if (hd->region_nr == region_nr)
627 			return hd;
628 	}
629 
630 	return NULL;
631 }
632 
633 /*
634  * Insert a hydration into the hash table.
635  *
636  * NOTE: Must be called with the bucket lock held.
637  */
638 static inline void __insert_region_hydration(struct hash_table_bucket *bucket,
639 					     struct dm_clone_region_hydration *hd)
640 {
641 	hlist_add_head(&hd->h, &bucket->head);
642 }
643 
644 /*
645  * This function inserts a hydration into the hash table, unless someone else
646  * managed to insert a hydration for the same region first. In the latter case
647  * it returns the existing hydration descriptor for this region.
648  *
649  * NOTE: Must be called with the hydration hash table lock held.
650  */
651 static struct dm_clone_region_hydration *
652 __find_or_insert_region_hydration(struct hash_table_bucket *bucket,
653 				  struct dm_clone_region_hydration *hd)
654 {
655 	struct dm_clone_region_hydration *hd2;
656 
657 	hd2 = __hash_find(bucket, hd->region_nr);
658 	if (hd2)
659 		return hd2;
660 
661 	__insert_region_hydration(bucket, hd);
662 
663 	return hd;
664 }
665 
666 /*---------------------------------------------------------------------------*/
667 
668 /* Allocate a hydration */
669 static struct dm_clone_region_hydration *alloc_hydration(struct clone *clone)
670 {
671 	struct dm_clone_region_hydration *hd;
672 
673 	/*
674 	 * Allocate a hydration from the hydration mempool.
675 	 * This might block but it can't fail.
676 	 */
677 	hd = mempool_alloc(&clone->hydration_pool, GFP_NOIO);
678 	hd->clone = clone;
679 
680 	return hd;
681 }
682 
683 static inline void free_hydration(struct dm_clone_region_hydration *hd)
684 {
685 	mempool_free(hd, &hd->clone->hydration_pool);
686 }
687 
688 /* Initialize a hydration */
689 static void hydration_init(struct dm_clone_region_hydration *hd, unsigned long region_nr)
690 {
691 	hd->region_nr = region_nr;
692 	hd->overwrite_bio = NULL;
693 	bio_list_init(&hd->deferred_bios);
694 	hd->status = 0;
695 
696 	INIT_LIST_HEAD(&hd->list);
697 	INIT_HLIST_NODE(&hd->h);
698 }
699 
700 /*---------------------------------------------------------------------------*/
701 
702 /*
703  * Update dm-clone's metadata after a region has finished hydrating and remove
704  * hydration from the hash table.
705  */
706 static int hydration_update_metadata(struct dm_clone_region_hydration *hd)
707 {
708 	int r = 0;
709 	unsigned long flags;
710 	struct hash_table_bucket *bucket;
711 	struct clone *clone = hd->clone;
712 
713 	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
714 		r = -EPERM;
715 
716 	/* Update the metadata */
717 	if (likely(!r) && hd->status == BLK_STS_OK)
718 		r = dm_clone_set_region_hydrated(clone->cmd, hd->region_nr);
719 
720 	bucket = get_hash_table_bucket(clone, hd->region_nr);
721 
722 	/* Remove hydration from hash table */
723 	bucket_lock_irqsave(bucket, flags);
724 	hlist_del(&hd->h);
725 	bucket_unlock_irqrestore(bucket, flags);
726 
727 	return r;
728 }
729 
/*
 * Complete a region's hydration:
 *
 *	1. Update dm-clone's metadata.
 *	2. Remove hydration from hash table.
 *	3. Complete overwrite bio.
 *	4. Issue deferred bios.
 *	5. If this was the last hydration, wake up anyone waiting for
 *	   hydrations to finish.
 *
 * If either the hydration or the metadata update failed, the overwrite bio
 * and all deferred bios are failed instead. The hydration descriptor is freed
 * here, so the caller must not touch @hd afterwards.
 */
static void hydration_complete(struct dm_clone_region_hydration *hd)
{
	int r;
	blk_status_t status;
	struct clone *clone = hd->clone;

	/* Steps 1 and 2 */
	r = hydration_update_metadata(hd);

	if (hd->status == BLK_STS_OK && likely(!r)) {
		if (hd->overwrite_bio)
			complete_overwrite_bio(clone, hd->overwrite_bio);

		issue_deferred_bios(clone, &hd->deferred_bios);
	} else {
		/* A metadata failure is reported as a generic I/O error */
		status = r ? BLK_STS_IOERR : hd->status;

		/* Fail the overwrite bio together with the deferred bios */
		if (hd->overwrite_bio)
			bio_list_add(&hd->deferred_bios, hd->overwrite_bio);

		fail_bios(&hd->deferred_bios, status);
	}

	/* clone was read from hd above, so it stays usable after the free */
	free_hydration(hd);

	if (atomic_dec_and_test(&clone->hydrations_in_flight))
		wakeup_hydration_waiters(clone);
}
767 
/*
 * Completion callback passed to dm_kcopyd_copy() by hydration_copy().
 *
 * @context is the hydration that heads the batch. The outcome of the copy is
 * applied to the head and to every hydration batched with it, and each of
 * them is completed. Finally the worker is woken to continue background
 * hydration, if it is enabled and no I/O is in flight.
 */
static void hydration_kcopyd_callback(int read_err, unsigned long write_err, void *context)
{
	blk_status_t status;

	struct dm_clone_region_hydration *tmp, *hd = context;
	struct clone *clone = hd->clone;

	LIST_HEAD(batched_hydrations);

	if (read_err || write_err) {
		DMERR_LIMIT("%s: hydration failed", clone_device_name(clone));
		status = BLK_STS_IOERR;
	} else {
		status = BLK_STS_OK;
	}
	/*
	 * Move the hydrations batched behind the head onto a local list
	 * before the head is completed (and thereby freed).
	 */
	list_splice_tail(&hd->list, &batched_hydrations);

	hd->status = status;
	hydration_complete(hd);

	/* Complete batched hydrations */
	list_for_each_entry_safe(hd, tmp, &batched_hydrations, list) {
		hd->status = status;
		hydration_complete(hd);
	}

	/* Continue background hydration, if there is no I/O in-flight */
	if (test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
	    !atomic_read(&clone->ios_in_flight))
		wake_worker(clone);
}
799 
800 static void hydration_copy(struct dm_clone_region_hydration *hd, unsigned int nr_regions)
801 {
802 	unsigned long region_start, region_end;
803 	sector_t tail_size, region_size, total_size;
804 	struct dm_io_region from, to;
805 	struct clone *clone = hd->clone;
806 
807 	if (WARN_ON(!nr_regions))
808 		return;
809 
810 	region_size = clone->region_size;
811 	region_start = hd->region_nr;
812 	region_end = region_start + nr_regions - 1;
813 
814 	total_size = region_to_sector(clone, nr_regions - 1);
815 
816 	if (region_end == clone->nr_regions - 1) {
817 		/*
818 		 * The last region of the target might be smaller than
819 		 * region_size.
820 		 */
821 		tail_size = clone->ti->len & (region_size - 1);
822 		if (!tail_size)
823 			tail_size = region_size;
824 	} else {
825 		tail_size = region_size;
826 	}
827 
828 	total_size += tail_size;
829 
830 	from.bdev = clone->source_dev->bdev;
831 	from.sector = region_to_sector(clone, region_start);
832 	from.count = total_size;
833 
834 	to.bdev = clone->dest_dev->bdev;
835 	to.sector = from.sector;
836 	to.count = from.count;
837 
838 	/* Issue copy */
839 	atomic_add(nr_regions, &clone->hydrations_in_flight);
840 	dm_kcopyd_copy(clone->kcopyd_client, &from, 1, &to, 0,
841 		       hydration_kcopyd_callback, hd);
842 }
843 
844 static void overwrite_endio(struct bio *bio)
845 {
846 	struct dm_clone_region_hydration *hd = bio->bi_private;
847 
848 	bio->bi_end_io = hd->overwrite_bio_end_io;
849 	hd->status = bio->bi_status;
850 
851 	hydration_complete(hd);
852 }
853 
854 static void hydration_overwrite(struct dm_clone_region_hydration *hd, struct bio *bio)
855 {
856 	/*
857 	 * We don't need to save and restore bio->bi_private because device
858 	 * mapper core generates a new bio for us to use, with clean
859 	 * bi_private.
860 	 */
861 	hd->overwrite_bio = bio;
862 	hd->overwrite_bio_end_io = bio->bi_end_io;
863 
864 	bio->bi_end_io = overwrite_endio;
865 	bio->bi_private = hd;
866 
867 	atomic_inc(&hd->clone->hydrations_in_flight);
868 	generic_make_request(bio);
869 }
870 
/*
 * Hydrate bio's region.
 *
 * This function starts the hydration of the bio's region and puts the bio in
 * the list of deferred bios for this region. If, by the time this function is
 * called, the region has already finished hydrating, the bio is issued right
 * away instead.
 *
 * If a hydration of the region is already in progress, the bio is simply
 * added to that hydration's deferred bios. If the metadata are in read-only
 * or fail mode, no hydration is started and the bio is failed.
 *
 * NOTE: The bio remapping must be performed by the caller.
 */
static void hydrate_bio_region(struct clone *clone, struct bio *bio)
{
	unsigned long region_nr;
	struct hash_table_bucket *bucket;
	struct dm_clone_region_hydration *hd, *hd2;

	region_nr = bio_to_region(clone, bio);
	bucket = get_hash_table_bucket(clone, region_nr);

	bucket_lock_irq(bucket);

	hd = __hash_find(bucket, region_nr);
	if (hd) {
		/* Someone else is hydrating the region */
		bio_list_add(&hd->deferred_bios, bio);
		bucket_unlock_irq(bucket);
		return;
	}

	if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
		/* The region has been hydrated */
		bucket_unlock_irq(bucket);
		issue_bio(clone, bio);
		return;
	}

	/*
	 * We must allocate a hydration descriptor and start the hydration of
	 * the corresponding region.
	 *
	 * The bucket lock is dropped first, because the allocation might
	 * block (see alloc_hydration()).
	 */
	bucket_unlock_irq(bucket);

	hd = alloc_hydration(clone);
	hydration_init(hd, region_nr);

	bucket_lock_irq(bucket);

	/* Check if the region has been hydrated in the meantime. */
	if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
		bucket_unlock_irq(bucket);
		free_hydration(hd);
		issue_bio(clone, bio);
		return;
	}

	hd2 = __find_or_insert_region_hydration(bucket, hd);
	if (hd2 != hd) {
		/* Someone else started the region's hydration. */
		bio_list_add(&hd2->deferred_bios, bio);
		bucket_unlock_irq(bucket);
		free_hydration(hd);
		return;
	}

	/*
	 * If the metadata mode is RO or FAIL then there is no point starting a
	 * hydration, since we will not be able to update the metadata when the
	 * hydration finishes.
	 */
	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
		/* Undo the insertion done just above */
		hlist_del(&hd->h);
		bucket_unlock_irq(bucket);
		free_hydration(hd);
		bio_io_error(bio);
		return;
	}

	/*
	 * Start region hydration.
	 *
	 * If a bio overwrites a region, i.e., its size is equal to the
	 * region's size, then we don't need to copy the region from the source
	 * to the destination device.
	 */
	if (is_overwrite_bio(clone, bio)) {
		bucket_unlock_irq(bucket);
		hydration_overwrite(hd, bio);
	} else {
		/* The bio is issued after the copy of its region completes */
		bio_list_add(&hd->deferred_bios, bio);
		bucket_unlock_irq(bucket);
		hydration_copy(hd, 1);
	}
}
964 
965 /*---------------------------------------------------------------------------*/
966 
967 /*
968  * Background hydrations.
969  */
970 
971 /*
972  * Batch region hydrations.
973  *
974  * To better utilize device bandwidth we batch together the hydration of
975  * adjacent regions. This allows us to use small region sizes, e.g., 4KB, which
976  * is good for small, random write performance (because of the overwriting of
977  * un-hydrated regions) and at the same time issue big copy requests to kcopyd
978  * to achieve high hydration bandwidth.
979  */
/* State of the batch of adjacent region hydrations being assembled */
struct batch_info {
	/* First hydration of the batch, or NULL if no batch is open */
	struct dm_clone_region_hydration *head;
	/* Number of regions in the batch, including the head */
	unsigned int nr_batched_regions;
};
984 
/*
 * Add hydration @hd to the batch described by @batch, issuing copies as
 * needed.
 *
 * If a batch is open and @hd's region directly follows the batch's last
 * region, @hd is appended to the batch, provided the batch is not full.
 * The open batch is issued when it becomes full or when @hd could not be
 * appended to it. A hydration that was not appended either starts a new
 * batch or, for maximum batch sizes of zero and one, is issued on its own.
 */
static void __batch_hydration(struct batch_info *batch,
			      struct dm_clone_region_hydration *hd)
{
	struct clone *clone = hd->clone;
	unsigned int max_batch_size = READ_ONCE(clone->hydration_batch_size);

	if (batch->head) {
		/* Try to extend the current batch */
		if (batch->nr_batched_regions < max_batch_size &&
		    (batch->head->region_nr + batch->nr_batched_regions) == hd->region_nr) {
			list_add_tail(&hd->list, &batch->head->list);
			batch->nr_batched_regions++;
			/* hd now belongs to the batch; NULL marks it as consumed */
			hd = NULL;
		}

		/* Check if we should issue the current batch */
		if (batch->nr_batched_regions >= max_batch_size || hd) {
			hydration_copy(batch->head, batch->nr_batched_regions);
			batch->head = NULL;
			batch->nr_batched_regions = 0;
		}
	}

	if (!hd)
		return;

	/* We treat max batch sizes of zero and one equivalently */
	if (max_batch_size <= 1) {
		hydration_copy(hd, 1);
		return;
	}

	/* Start a new batch */
	BUG_ON(!list_empty(&hd->list));
	batch->head = hd;
	batch->nr_batched_regions = 1;
}
1022 
1023 static unsigned long __start_next_hydration(struct clone *clone,
1024 					    unsigned long offset,
1025 					    struct batch_info *batch)
1026 {
1027 	struct hash_table_bucket *bucket;
1028 	struct dm_clone_region_hydration *hd;
1029 	unsigned long nr_regions = clone->nr_regions;
1030 
1031 	hd = alloc_hydration(clone);
1032 
1033 	/* Try to find a region to hydrate. */
1034 	do {
1035 		offset = dm_clone_find_next_unhydrated_region(clone->cmd, offset);
1036 		if (offset == nr_regions)
1037 			break;
1038 
1039 		bucket = get_hash_table_bucket(clone, offset);
1040 		bucket_lock_irq(bucket);
1041 
1042 		if (!dm_clone_is_region_hydrated(clone->cmd, offset) &&
1043 		    !__hash_find(bucket, offset)) {
1044 			hydration_init(hd, offset);
1045 			__insert_region_hydration(bucket, hd);
1046 			bucket_unlock_irq(bucket);
1047 
1048 			/* Batch hydration */
1049 			__batch_hydration(batch, hd);
1050 
1051 			return (offset + 1);
1052 		}
1053 
1054 		bucket_unlock_irq(bucket);
1055 
1056 	} while (++offset < nr_regions);
1057 
1058 	if (hd)
1059 		free_hydration(hd);
1060 
1061 	return offset;
1062 }
1063 
/*
 * This function searches for regions that still reside in the source device
 * and starts their hydration.
 *
 * It does nothing if the metadata are in read-only or fail mode, or if the
 * hydration is already done. New hydrations are started, from
 * clone->hydration_offset onwards, only while hydration is enabled and not
 * suspended, no I/O is in flight and the number of regions being hydrated
 * does not exceed clone->hydration_threshold.
 */
static void do_hydration(struct clone *clone)
{
	unsigned int current_volume;
	unsigned long offset, nr_regions = clone->nr_regions;

	struct batch_info batch = {
		.head = NULL,
		.nr_batched_regions = 0,
	};

	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
		return;

	if (dm_clone_is_hydration_done(clone->cmd))
		return;

	/*
	 * Avoid race with device suspension.
	 */
	atomic_inc(&clone->hydrations_in_flight);

	/*
	 * Make sure atomic_inc() is ordered before test_bit(), otherwise we
	 * might race with clone_postsuspend() and start a region hydration
	 * after the target has been suspended.
	 *
	 * This is paired with the smp_mb__after_atomic() in
	 * clone_postsuspend().
	 */
	smp_mb__after_atomic();

	offset = clone->hydration_offset;
	while (likely(!test_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags)) &&
	       !atomic_read(&clone->ios_in_flight) &&
	       test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags) &&
	       offset < nr_regions) {
		/* Count regions being copied plus those batched but not yet issued */
		current_volume = atomic_read(&clone->hydrations_in_flight);
		current_volume += batch.nr_batched_regions;

		if (current_volume > READ_ONCE(clone->hydration_threshold))
			break;

		offset = __start_next_hydration(clone, offset, &batch);
	}

	/* Issue the batch that is still open, if any */
	if (batch.head)
		hydration_copy(batch.head, batch.nr_batched_regions);

	/* Wrap around, so the next scan starts from the first region */
	if (offset >= nr_regions)
		offset = 0;

	clone->hydration_offset = offset;

	/* Drop the reference taken above to guard against suspension */
	if (atomic_dec_and_test(&clone->hydrations_in_flight))
		wakeup_hydration_waiters(clone);
}
1124 
1125 /*---------------------------------------------------------------------------*/
1126 
1127 static bool need_commit_due_to_time(struct clone *clone)
1128 {
1129 	return !time_in_range(jiffies, clone->last_commit_jiffies,
1130 			      clone->last_commit_jiffies + COMMIT_PERIOD);
1131 }
1132 
1133 /*
1134  * A non-zero return indicates read-only or fail mode.
1135  */
1136 static int commit_metadata(struct clone *clone, bool *dest_dev_flushed)
1137 {
1138 	int r = 0;
1139 
1140 	if (dest_dev_flushed)
1141 		*dest_dev_flushed = false;
1142 
1143 	mutex_lock(&clone->commit_lock);
1144 
1145 	if (!dm_clone_changed_this_transaction(clone->cmd))
1146 		goto out;
1147 
1148 	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY)) {
1149 		r = -EPERM;
1150 		goto out;
1151 	}
1152 
1153 	r = dm_clone_metadata_pre_commit(clone->cmd);
1154 	if (unlikely(r)) {
1155 		__metadata_operation_failed(clone, "dm_clone_metadata_pre_commit", r);
1156 		goto out;
1157 	}
1158 
1159 	bio_reset(&clone->flush_bio);
1160 	bio_set_dev(&clone->flush_bio, clone->dest_dev->bdev);
1161 	clone->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
1162 
1163 	r = submit_bio_wait(&clone->flush_bio);
1164 	if (unlikely(r)) {
1165 		__metadata_operation_failed(clone, "flush destination device", r);
1166 		goto out;
1167 	}
1168 
1169 	if (dest_dev_flushed)
1170 		*dest_dev_flushed = true;
1171 
1172 	r = dm_clone_metadata_commit(clone->cmd);
1173 	if (unlikely(r)) {
1174 		__metadata_operation_failed(clone, "dm_clone_metadata_commit", r);
1175 		goto out;
1176 	}
1177 
1178 	if (dm_clone_is_hydration_done(clone->cmd))
1179 		dm_table_event(clone->ti->table);
1180 out:
1181 	mutex_unlock(&clone->commit_lock);
1182 
1183 	return r;
1184 }
1185 
1186 static void process_deferred_discards(struct clone *clone)
1187 {
1188 	int r = -EPERM;
1189 	struct bio *bio;
1190 	struct blk_plug plug;
1191 	unsigned long rs, nr_regions;
1192 	struct bio_list discards = BIO_EMPTY_LIST;
1193 
1194 	spin_lock_irq(&clone->lock);
1195 	bio_list_merge(&discards, &clone->deferred_discard_bios);
1196 	bio_list_init(&clone->deferred_discard_bios);
1197 	spin_unlock_irq(&clone->lock);
1198 
1199 	if (bio_list_empty(&discards))
1200 		return;
1201 
1202 	if (unlikely(get_clone_mode(clone) >= CM_READ_ONLY))
1203 		goto out;
1204 
1205 	/* Update the metadata */
1206 	bio_list_for_each(bio, &discards) {
1207 		bio_region_range(clone, bio, &rs, &nr_regions);
1208 		/*
1209 		 * A discard request might cover regions that have been already
1210 		 * hydrated. There is no need to update the metadata for these
1211 		 * regions.
1212 		 */
1213 		r = dm_clone_cond_set_range(clone->cmd, rs, nr_regions);
1214 		if (unlikely(r))
1215 			break;
1216 	}
1217 out:
1218 	blk_start_plug(&plug);
1219 	while ((bio = bio_list_pop(&discards)))
1220 		complete_discard_bio(clone, bio, r == 0);
1221 	blk_finish_plug(&plug);
1222 }
1223 
1224 static void process_deferred_bios(struct clone *clone)
1225 {
1226 	struct bio_list bios = BIO_EMPTY_LIST;
1227 
1228 	spin_lock_irq(&clone->lock);
1229 	bio_list_merge(&bios, &clone->deferred_bios);
1230 	bio_list_init(&clone->deferred_bios);
1231 	spin_unlock_irq(&clone->lock);
1232 
1233 	if (bio_list_empty(&bios))
1234 		return;
1235 
1236 	submit_bios(&bios);
1237 }
1238 
1239 static void process_deferred_flush_bios(struct clone *clone)
1240 {
1241 	struct bio *bio;
1242 	bool dest_dev_flushed;
1243 	struct bio_list bios = BIO_EMPTY_LIST;
1244 	struct bio_list bio_completions = BIO_EMPTY_LIST;
1245 
1246 	/*
1247 	 * If there are any deferred flush bios, we must commit the metadata
1248 	 * before issuing them or signaling their completion.
1249 	 */
1250 	spin_lock_irq(&clone->lock);
1251 	bio_list_merge(&bios, &clone->deferred_flush_bios);
1252 	bio_list_init(&clone->deferred_flush_bios);
1253 
1254 	bio_list_merge(&bio_completions, &clone->deferred_flush_completions);
1255 	bio_list_init(&clone->deferred_flush_completions);
1256 	spin_unlock_irq(&clone->lock);
1257 
1258 	if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
1259 	    !(dm_clone_changed_this_transaction(clone->cmd) && need_commit_due_to_time(clone)))
1260 		return;
1261 
1262 	if (commit_metadata(clone, &dest_dev_flushed)) {
1263 		bio_list_merge(&bios, &bio_completions);
1264 
1265 		while ((bio = bio_list_pop(&bios)))
1266 			bio_io_error(bio);
1267 
1268 		return;
1269 	}
1270 
1271 	clone->last_commit_jiffies = jiffies;
1272 
1273 	while ((bio = bio_list_pop(&bio_completions)))
1274 		bio_endio(bio);
1275 
1276 	while ((bio = bio_list_pop(&bios))) {
1277 		if ((bio->bi_opf & REQ_PREFLUSH) && dest_dev_flushed) {
1278 			/* We just flushed the destination device as part of
1279 			 * the metadata commit, so there is no reason to send
1280 			 * another flush.
1281 			 */
1282 			bio_endio(bio);
1283 		} else {
1284 			generic_make_request(bio);
1285 		}
1286 	}
1287 }
1288 
1289 static void do_worker(struct work_struct *work)
1290 {
1291 	struct clone *clone = container_of(work, typeof(*clone), worker);
1292 
1293 	process_deferred_bios(clone);
1294 	process_deferred_discards(clone);
1295 
1296 	/*
1297 	 * process_deferred_flush_bios():
1298 	 *
1299 	 *   - Commit metadata
1300 	 *
1301 	 *   - Process deferred REQ_FUA completions
1302 	 *
1303 	 *   - Process deferred REQ_PREFLUSH bios
1304 	 */
1305 	process_deferred_flush_bios(clone);
1306 
1307 	/* Background hydration */
1308 	do_hydration(clone);
1309 }
1310 
1311 /*
1312  * Commit periodically so that not too much unwritten data builds up.
1313  *
1314  * Also, restart background hydration, if it has been stopped by in-flight I/O.
1315  */
1316 static void do_waker(struct work_struct *work)
1317 {
1318 	struct clone *clone = container_of(to_delayed_work(work), struct clone, waker);
1319 
1320 	wake_worker(clone);
1321 	queue_delayed_work(clone->wq, &clone->waker, COMMIT_PERIOD);
1322 }
1323 
1324 /*---------------------------------------------------------------------------*/
1325 
1326 /*
1327  * Target methods
1328  */
1329 static int clone_map(struct dm_target *ti, struct bio *bio)
1330 {
1331 	struct clone *clone = ti->private;
1332 	unsigned long region_nr;
1333 
1334 	atomic_inc(&clone->ios_in_flight);
1335 
1336 	if (unlikely(get_clone_mode(clone) == CM_FAIL))
1337 		return DM_MAPIO_KILL;
1338 
1339 	/*
1340 	 * REQ_PREFLUSH bios carry no data:
1341 	 *
1342 	 * - Commit metadata, if changed
1343 	 *
1344 	 * - Pass down to destination device
1345 	 */
1346 	if (bio->bi_opf & REQ_PREFLUSH) {
1347 		remap_and_issue(clone, bio);
1348 		return DM_MAPIO_SUBMITTED;
1349 	}
1350 
1351 	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
1352 
1353 	/*
1354 	 * dm-clone interprets discards and performs a fast hydration of the
1355 	 * discarded regions, i.e., we skip the copy from the source device and
1356 	 * just mark the regions as hydrated.
1357 	 */
1358 	if (bio_op(bio) == REQ_OP_DISCARD) {
1359 		process_discard_bio(clone, bio);
1360 		return DM_MAPIO_SUBMITTED;
1361 	}
1362 
1363 	/*
1364 	 * If the bio's region is hydrated, redirect it to the destination
1365 	 * device.
1366 	 *
1367 	 * If the region is not hydrated and the bio is a READ, redirect it to
1368 	 * the source device.
1369 	 *
1370 	 * Else, defer WRITE bio until after its region has been hydrated and
1371 	 * start the region's hydration immediately.
1372 	 */
1373 	region_nr = bio_to_region(clone, bio);
1374 	if (dm_clone_is_region_hydrated(clone->cmd, region_nr)) {
1375 		remap_and_issue(clone, bio);
1376 		return DM_MAPIO_SUBMITTED;
1377 	} else if (bio_data_dir(bio) == READ) {
1378 		remap_to_source(clone, bio);
1379 		return DM_MAPIO_REMAPPED;
1380 	}
1381 
1382 	remap_to_dest(clone, bio);
1383 	hydrate_bio_region(clone, bio);
1384 
1385 	return DM_MAPIO_SUBMITTED;
1386 }
1387 
1388 static int clone_endio(struct dm_target *ti, struct bio *bio, blk_status_t *error)
1389 {
1390 	struct clone *clone = ti->private;
1391 
1392 	atomic_dec(&clone->ios_in_flight);
1393 
1394 	return DM_ENDIO_DONE;
1395 }
1396 
1397 static void emit_flags(struct clone *clone, char *result, unsigned int maxlen,
1398 		       ssize_t *sz_ptr)
1399 {
1400 	ssize_t sz = *sz_ptr;
1401 	unsigned int count;
1402 
1403 	count = !test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
1404 	count += !test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
1405 
1406 	DMEMIT("%u ", count);
1407 
1408 	if (!test_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags))
1409 		DMEMIT("no_hydration ");
1410 
1411 	if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags))
1412 		DMEMIT("no_discard_passdown ");
1413 
1414 	*sz_ptr = sz;
1415 }
1416 
1417 static void emit_core_args(struct clone *clone, char *result,
1418 			   unsigned int maxlen, ssize_t *sz_ptr)
1419 {
1420 	ssize_t sz = *sz_ptr;
1421 	unsigned int count = 4;
1422 
1423 	DMEMIT("%u hydration_threshold %u hydration_batch_size %u ", count,
1424 	       READ_ONCE(clone->hydration_threshold),
1425 	       READ_ONCE(clone->hydration_batch_size));
1426 
1427 	*sz_ptr = sz;
1428 }
1429 
1430 /*
1431  * Status format:
1432  *
1433  * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
1434  * <clone region size> <#hydrated regions>/<#total regions> <#hydrating regions>
1435  * <#features> <features>* <#core args> <core args>* <clone metadata mode>
1436  */
1437 static void clone_status(struct dm_target *ti, status_type_t type,
1438 			 unsigned int status_flags, char *result,
1439 			 unsigned int maxlen)
1440 {
1441 	int r;
1442 	unsigned int i;
1443 	ssize_t sz = 0;
1444 	dm_block_t nr_free_metadata_blocks = 0;
1445 	dm_block_t nr_metadata_blocks = 0;
1446 	char buf[BDEVNAME_SIZE];
1447 	struct clone *clone = ti->private;
1448 
1449 	switch (type) {
1450 	case STATUSTYPE_INFO:
1451 		if (get_clone_mode(clone) == CM_FAIL) {
1452 			DMEMIT("Fail");
1453 			break;
1454 		}
1455 
1456 		/* Commit to ensure statistics aren't out-of-date */
1457 		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
1458 			(void) commit_metadata(clone, NULL);
1459 
1460 		r = dm_clone_get_free_metadata_block_count(clone->cmd, &nr_free_metadata_blocks);
1461 
1462 		if (r) {
1463 			DMERR("%s: dm_clone_get_free_metadata_block_count returned %d",
1464 			      clone_device_name(clone), r);
1465 			goto error;
1466 		}
1467 
1468 		r = dm_clone_get_metadata_dev_size(clone->cmd, &nr_metadata_blocks);
1469 
1470 		if (r) {
1471 			DMERR("%s: dm_clone_get_metadata_dev_size returned %d",
1472 			      clone_device_name(clone), r);
1473 			goto error;
1474 		}
1475 
1476 		DMEMIT("%u %llu/%llu %llu %u/%lu %u ",
1477 		       DM_CLONE_METADATA_BLOCK_SIZE,
1478 		       (unsigned long long)(nr_metadata_blocks - nr_free_metadata_blocks),
1479 		       (unsigned long long)nr_metadata_blocks,
1480 		       (unsigned long long)clone->region_size,
1481 		       dm_clone_nr_of_hydrated_regions(clone->cmd),
1482 		       clone->nr_regions,
1483 		       atomic_read(&clone->hydrations_in_flight));
1484 
1485 		emit_flags(clone, result, maxlen, &sz);
1486 		emit_core_args(clone, result, maxlen, &sz);
1487 
1488 		switch (get_clone_mode(clone)) {
1489 		case CM_WRITE:
1490 			DMEMIT("rw");
1491 			break;
1492 		case CM_READ_ONLY:
1493 			DMEMIT("ro");
1494 			break;
1495 		case CM_FAIL:
1496 			DMEMIT("Fail");
1497 		}
1498 
1499 		break;
1500 
1501 	case STATUSTYPE_TABLE:
1502 		format_dev_t(buf, clone->metadata_dev->bdev->bd_dev);
1503 		DMEMIT("%s ", buf);
1504 
1505 		format_dev_t(buf, clone->dest_dev->bdev->bd_dev);
1506 		DMEMIT("%s ", buf);
1507 
1508 		format_dev_t(buf, clone->source_dev->bdev->bd_dev);
1509 		DMEMIT("%s", buf);
1510 
1511 		for (i = 0; i < clone->nr_ctr_args; i++)
1512 			DMEMIT(" %s", clone->ctr_args[i]);
1513 	}
1514 
1515 	return;
1516 
1517 error:
1518 	DMEMIT("Error");
1519 }
1520 
1521 static int clone_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1522 {
1523 	struct request_queue *dest_q, *source_q;
1524 	struct clone *clone = container_of(cb, struct clone, callbacks);
1525 
1526 	source_q = bdev_get_queue(clone->source_dev->bdev);
1527 	dest_q = bdev_get_queue(clone->dest_dev->bdev);
1528 
1529 	return (bdi_congested(dest_q->backing_dev_info, bdi_bits) |
1530 		bdi_congested(source_q->backing_dev_info, bdi_bits));
1531 }
1532 
1533 static sector_t get_dev_size(struct dm_dev *dev)
1534 {
1535 	return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1536 }
1537 
1538 /*---------------------------------------------------------------------------*/
1539 
1540 /*
1541  * Construct a clone device mapping:
1542  *
1543  * clone <metadata dev> <destination dev> <source dev> <region size>
1544  *	[<#feature args> [<feature arg>]* [<#core args> [key value]*]]
1545  *
1546  * metadata dev: Fast device holding the persistent metadata
1547  * destination dev: The destination device, which will become a clone of the
1548  *                  source device
1549  * source dev: The read-only source device that gets cloned
1550  * region size: dm-clone unit size in sectors
1551  *
1552  * #feature args: Number of feature arguments passed
1553  * feature args: E.g. no_hydration, no_discard_passdown
1554  *
1555  * #core arguments: An even number of core arguments
1556  * core arguments: Key/value pairs for tuning the core
1557  *		   E.g. 'hydration_threshold 256'
1558  */
1559 static int parse_feature_args(struct dm_arg_set *as, struct clone *clone)
1560 {
1561 	int r;
1562 	unsigned int argc;
1563 	const char *arg_name;
1564 	struct dm_target *ti = clone->ti;
1565 
1566 	const struct dm_arg args = {
1567 		.min = 0,
1568 		.max = 2,
1569 		.error = "Invalid number of feature arguments"
1570 	};
1571 
1572 	/* No feature arguments supplied */
1573 	if (!as->argc)
1574 		return 0;
1575 
1576 	r = dm_read_arg_group(&args, as, &argc, &ti->error);
1577 	if (r)
1578 		return r;
1579 
1580 	while (argc) {
1581 		arg_name = dm_shift_arg(as);
1582 		argc--;
1583 
1584 		if (!strcasecmp(arg_name, "no_hydration")) {
1585 			__clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
1586 		} else if (!strcasecmp(arg_name, "no_discard_passdown")) {
1587 			__clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
1588 		} else {
1589 			ti->error = "Invalid feature argument";
1590 			return -EINVAL;
1591 		}
1592 	}
1593 
1594 	return 0;
1595 }
1596 
1597 static int parse_core_args(struct dm_arg_set *as, struct clone *clone)
1598 {
1599 	int r;
1600 	unsigned int argc;
1601 	unsigned int value;
1602 	const char *arg_name;
1603 	struct dm_target *ti = clone->ti;
1604 
1605 	const struct dm_arg args = {
1606 		.min = 0,
1607 		.max = 4,
1608 		.error = "Invalid number of core arguments"
1609 	};
1610 
1611 	/* Initialize core arguments */
1612 	clone->hydration_batch_size = DEFAULT_HYDRATION_BATCH_SIZE;
1613 	clone->hydration_threshold = DEFAULT_HYDRATION_THRESHOLD;
1614 
1615 	/* No core arguments supplied */
1616 	if (!as->argc)
1617 		return 0;
1618 
1619 	r = dm_read_arg_group(&args, as, &argc, &ti->error);
1620 	if (r)
1621 		return r;
1622 
1623 	if (argc & 1) {
1624 		ti->error = "Number of core arguments must be even";
1625 		return -EINVAL;
1626 	}
1627 
1628 	while (argc) {
1629 		arg_name = dm_shift_arg(as);
1630 		argc -= 2;
1631 
1632 		if (!strcasecmp(arg_name, "hydration_threshold")) {
1633 			if (kstrtouint(dm_shift_arg(as), 10, &value)) {
1634 				ti->error = "Invalid value for argument `hydration_threshold'";
1635 				return -EINVAL;
1636 			}
1637 			clone->hydration_threshold = value;
1638 		} else if (!strcasecmp(arg_name, "hydration_batch_size")) {
1639 			if (kstrtouint(dm_shift_arg(as), 10, &value)) {
1640 				ti->error = "Invalid value for argument `hydration_batch_size'";
1641 				return -EINVAL;
1642 			}
1643 			clone->hydration_batch_size = value;
1644 		} else {
1645 			ti->error = "Invalid core argument";
1646 			return -EINVAL;
1647 		}
1648 	}
1649 
1650 	return 0;
1651 }
1652 
1653 static int parse_region_size(struct clone *clone, struct dm_arg_set *as, char **error)
1654 {
1655 	int r;
1656 	unsigned int region_size;
1657 	struct dm_arg arg;
1658 
1659 	arg.min = MIN_REGION_SIZE;
1660 	arg.max = MAX_REGION_SIZE;
1661 	arg.error = "Invalid region size";
1662 
1663 	r = dm_read_arg(&arg, as, &region_size, error);
1664 	if (r)
1665 		return r;
1666 
1667 	/* Check region size is a power of 2 */
1668 	if (!is_power_of_2(region_size)) {
1669 		*error = "Region size is not a power of 2";
1670 		return -EINVAL;
1671 	}
1672 
1673 	/* Validate the region size against the device logical block size */
1674 	if (region_size % (bdev_logical_block_size(clone->source_dev->bdev) >> 9) ||
1675 	    region_size % (bdev_logical_block_size(clone->dest_dev->bdev) >> 9)) {
1676 		*error = "Region size is not a multiple of device logical block size";
1677 		return -EINVAL;
1678 	}
1679 
1680 	clone->region_size = region_size;
1681 
1682 	return 0;
1683 }
1684 
/*
 * Check that the number of regions, @n, does not exceed 2^31.
 *
 * Returns 0 if @n is acceptable. Otherwise sets *error and returns -EINVAL.
 */
static int validate_nr_regions(unsigned long n, char **error)
{
	/*
	 * dm_bitset restricts us to 2^32 regions. test_bit & co. restrict us
	 * further to 2^31 regions.
	 */
	const unsigned long max_regions = 1UL << 31;

	if (n <= max_regions)
		return 0;

	*error = "Too many regions. Consider increasing the region size";
	return -EINVAL;
}
1698 
1699 static int parse_metadata_dev(struct clone *clone, struct dm_arg_set *as, char **error)
1700 {
1701 	int r;
1702 	sector_t metadata_dev_size;
1703 	char b[BDEVNAME_SIZE];
1704 
1705 	r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1706 			  &clone->metadata_dev);
1707 	if (r) {
1708 		*error = "Error opening metadata device";
1709 		return r;
1710 	}
1711 
1712 	metadata_dev_size = get_dev_size(clone->metadata_dev);
1713 	if (metadata_dev_size > DM_CLONE_METADATA_MAX_SECTORS_WARNING)
1714 		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1715 		       bdevname(clone->metadata_dev->bdev, b), DM_CLONE_METADATA_MAX_SECTORS);
1716 
1717 	return 0;
1718 }
1719 
1720 static int parse_dest_dev(struct clone *clone, struct dm_arg_set *as, char **error)
1721 {
1722 	int r;
1723 	sector_t dest_dev_size;
1724 
1725 	r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1726 			  &clone->dest_dev);
1727 	if (r) {
1728 		*error = "Error opening destination device";
1729 		return r;
1730 	}
1731 
1732 	dest_dev_size = get_dev_size(clone->dest_dev);
1733 	if (dest_dev_size < clone->ti->len) {
1734 		dm_put_device(clone->ti, clone->dest_dev);
1735 		*error = "Device size larger than destination device";
1736 		return -EINVAL;
1737 	}
1738 
1739 	return 0;
1740 }
1741 
1742 static int parse_source_dev(struct clone *clone, struct dm_arg_set *as, char **error)
1743 {
1744 	int r;
1745 	sector_t source_dev_size;
1746 
1747 	r = dm_get_device(clone->ti, dm_shift_arg(as), FMODE_READ,
1748 			  &clone->source_dev);
1749 	if (r) {
1750 		*error = "Error opening source device";
1751 		return r;
1752 	}
1753 
1754 	source_dev_size = get_dev_size(clone->source_dev);
1755 	if (source_dev_size < clone->ti->len) {
1756 		dm_put_device(clone->ti, clone->source_dev);
1757 		*error = "Device size larger than source device";
1758 		return -EINVAL;
1759 	}
1760 
1761 	return 0;
1762 }
1763 
1764 static int copy_ctr_args(struct clone *clone, int argc, const char **argv, char **error)
1765 {
1766 	unsigned int i;
1767 	const char **copy;
1768 
1769 	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
1770 	if (!copy)
1771 		goto error;
1772 
1773 	for (i = 0; i < argc; i++) {
1774 		copy[i] = kstrdup(argv[i], GFP_KERNEL);
1775 
1776 		if (!copy[i]) {
1777 			while (i--)
1778 				kfree(copy[i]);
1779 			kfree(copy);
1780 			goto error;
1781 		}
1782 	}
1783 
1784 	clone->nr_ctr_args = argc;
1785 	clone->ctr_args = copy;
1786 	return 0;
1787 
1788 error:
1789 	*error = "Failed to allocate memory for table line";
1790 	return -ENOMEM;
1791 }
1792 
1793 static int clone_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1794 {
1795 	int r;
1796 	sector_t nr_regions;
1797 	struct clone *clone;
1798 	struct dm_arg_set as;
1799 
1800 	if (argc < 4) {
1801 		ti->error = "Invalid number of arguments";
1802 		return -EINVAL;
1803 	}
1804 
1805 	as.argc = argc;
1806 	as.argv = argv;
1807 
1808 	clone = kzalloc(sizeof(*clone), GFP_KERNEL);
1809 	if (!clone) {
1810 		ti->error = "Failed to allocate clone structure";
1811 		return -ENOMEM;
1812 	}
1813 
1814 	clone->ti = ti;
1815 
1816 	/* Initialize dm-clone flags */
1817 	__set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
1818 	__set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
1819 	__set_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
1820 
1821 	r = parse_metadata_dev(clone, &as, &ti->error);
1822 	if (r)
1823 		goto out_with_clone;
1824 
1825 	r = parse_dest_dev(clone, &as, &ti->error);
1826 	if (r)
1827 		goto out_with_meta_dev;
1828 
1829 	r = parse_source_dev(clone, &as, &ti->error);
1830 	if (r)
1831 		goto out_with_dest_dev;
1832 
1833 	r = parse_region_size(clone, &as, &ti->error);
1834 	if (r)
1835 		goto out_with_source_dev;
1836 
1837 	clone->region_shift = __ffs(clone->region_size);
1838 	nr_regions = dm_sector_div_up(ti->len, clone->region_size);
1839 
1840 	/* Check for overflow */
1841 	if (nr_regions != (unsigned long)nr_regions) {
1842 		ti->error = "Too many regions. Consider increasing the region size";
1843 		r = -EOVERFLOW;
1844 		goto out_with_source_dev;
1845 	}
1846 
1847 	clone->nr_regions = nr_regions;
1848 
1849 	r = validate_nr_regions(clone->nr_regions, &ti->error);
1850 	if (r)
1851 		goto out_with_source_dev;
1852 
1853 	r = dm_set_target_max_io_len(ti, clone->region_size);
1854 	if (r) {
1855 		ti->error = "Failed to set max io len";
1856 		goto out_with_source_dev;
1857 	}
1858 
1859 	r = parse_feature_args(&as, clone);
1860 	if (r)
1861 		goto out_with_source_dev;
1862 
1863 	r = parse_core_args(&as, clone);
1864 	if (r)
1865 		goto out_with_source_dev;
1866 
1867 	/* Load metadata */
1868 	clone->cmd = dm_clone_metadata_open(clone->metadata_dev->bdev, ti->len,
1869 					    clone->region_size);
1870 	if (IS_ERR(clone->cmd)) {
1871 		ti->error = "Failed to load metadata";
1872 		r = PTR_ERR(clone->cmd);
1873 		goto out_with_source_dev;
1874 	}
1875 
1876 	__set_clone_mode(clone, CM_WRITE);
1877 
1878 	if (get_clone_mode(clone) != CM_WRITE) {
1879 		ti->error = "Unable to get write access to metadata, please check/repair metadata";
1880 		r = -EPERM;
1881 		goto out_with_metadata;
1882 	}
1883 
1884 	clone->last_commit_jiffies = jiffies;
1885 
1886 	/* Allocate hydration hash table */
1887 	r = hash_table_init(clone);
1888 	if (r) {
1889 		ti->error = "Failed to allocate hydration hash table";
1890 		goto out_with_metadata;
1891 	}
1892 
1893 	atomic_set(&clone->ios_in_flight, 0);
1894 	init_waitqueue_head(&clone->hydration_stopped);
1895 	spin_lock_init(&clone->lock);
1896 	bio_list_init(&clone->deferred_bios);
1897 	bio_list_init(&clone->deferred_discard_bios);
1898 	bio_list_init(&clone->deferred_flush_bios);
1899 	bio_list_init(&clone->deferred_flush_completions);
1900 	clone->hydration_offset = 0;
1901 	atomic_set(&clone->hydrations_in_flight, 0);
1902 	bio_init(&clone->flush_bio, NULL, 0);
1903 
1904 	clone->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
1905 	if (!clone->wq) {
1906 		ti->error = "Failed to allocate workqueue";
1907 		r = -ENOMEM;
1908 		goto out_with_ht;
1909 	}
1910 
1911 	INIT_WORK(&clone->worker, do_worker);
1912 	INIT_DELAYED_WORK(&clone->waker, do_waker);
1913 
1914 	clone->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
1915 	if (IS_ERR(clone->kcopyd_client)) {
1916 		r = PTR_ERR(clone->kcopyd_client);
1917 		goto out_with_wq;
1918 	}
1919 
1920 	r = mempool_init_slab_pool(&clone->hydration_pool, MIN_HYDRATIONS,
1921 				   _hydration_cache);
1922 	if (r) {
1923 		ti->error = "Failed to create dm_clone_region_hydration memory pool";
1924 		goto out_with_kcopyd;
1925 	}
1926 
1927 	/* Save a copy of the table line */
1928 	r = copy_ctr_args(clone, argc - 3, (const char **)argv + 3, &ti->error);
1929 	if (r)
1930 		goto out_with_mempool;
1931 
1932 	mutex_init(&clone->commit_lock);
1933 	clone->callbacks.congested_fn = clone_is_congested;
1934 	dm_table_add_target_callbacks(ti->table, &clone->callbacks);
1935 
1936 	/* Enable flushes */
1937 	ti->num_flush_bios = 1;
1938 	ti->flush_supported = true;
1939 
1940 	/* Enable discards */
1941 	ti->discards_supported = true;
1942 	ti->num_discard_bios = 1;
1943 
1944 	ti->private = clone;
1945 
1946 	return 0;
1947 
1948 out_with_mempool:
1949 	mempool_exit(&clone->hydration_pool);
1950 out_with_kcopyd:
1951 	dm_kcopyd_client_destroy(clone->kcopyd_client);
1952 out_with_wq:
1953 	destroy_workqueue(clone->wq);
1954 out_with_ht:
1955 	hash_table_exit(clone);
1956 out_with_metadata:
1957 	dm_clone_metadata_close(clone->cmd);
1958 out_with_source_dev:
1959 	dm_put_device(ti, clone->source_dev);
1960 out_with_dest_dev:
1961 	dm_put_device(ti, clone->dest_dev);
1962 out_with_meta_dev:
1963 	dm_put_device(ti, clone->metadata_dev);
1964 out_with_clone:
1965 	kfree(clone);
1966 
1967 	return r;
1968 }
1969 
1970 static void clone_dtr(struct dm_target *ti)
1971 {
1972 	unsigned int i;
1973 	struct clone *clone = ti->private;
1974 
1975 	mutex_destroy(&clone->commit_lock);
1976 	bio_uninit(&clone->flush_bio);
1977 
1978 	for (i = 0; i < clone->nr_ctr_args; i++)
1979 		kfree(clone->ctr_args[i]);
1980 	kfree(clone->ctr_args);
1981 
1982 	mempool_exit(&clone->hydration_pool);
1983 	dm_kcopyd_client_destroy(clone->kcopyd_client);
1984 	destroy_workqueue(clone->wq);
1985 	hash_table_exit(clone);
1986 	dm_clone_metadata_close(clone->cmd);
1987 	dm_put_device(ti, clone->source_dev);
1988 	dm_put_device(ti, clone->dest_dev);
1989 	dm_put_device(ti, clone->metadata_dev);
1990 
1991 	kfree(clone);
1992 }
1993 
1994 /*---------------------------------------------------------------------------*/
1995 
1996 static void clone_postsuspend(struct dm_target *ti)
1997 {
1998 	struct clone *clone = ti->private;
1999 
2000 	/*
2001 	 * To successfully suspend the device:
2002 	 *
2003 	 *	- We cancel the delayed work for periodic commits and wait for
2004 	 *	  it to finish.
2005 	 *
2006 	 *	- We stop the background hydration, i.e. we prevent new region
2007 	 *	  hydrations from starting.
2008 	 *
2009 	 *	- We wait for any in-flight hydrations to finish.
2010 	 *
2011 	 *	- We flush the workqueue.
2012 	 *
2013 	 *	- We commit the metadata.
2014 	 */
2015 	cancel_delayed_work_sync(&clone->waker);
2016 
2017 	set_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
2018 
2019 	/*
2020 	 * Make sure set_bit() is ordered before atomic_read(), otherwise we
2021 	 * might race with do_hydration() and miss some started region
2022 	 * hydrations.
2023 	 *
2024 	 * This is paired with smp_mb__after_atomic() in do_hydration().
2025 	 */
2026 	smp_mb__after_atomic();
2027 
2028 	wait_event(clone->hydration_stopped, !atomic_read(&clone->hydrations_in_flight));
2029 	flush_workqueue(clone->wq);
2030 
2031 	(void) commit_metadata(clone, NULL);
2032 }
2033 
2034 static void clone_resume(struct dm_target *ti)
2035 {
2036 	struct clone *clone = ti->private;
2037 
2038 	clear_bit(DM_CLONE_HYDRATION_SUSPENDED, &clone->flags);
2039 	do_waker(&clone->waker.work);
2040 }
2041 
2042 static bool bdev_supports_discards(struct block_device *bdev)
2043 {
2044 	struct request_queue *q = bdev_get_queue(bdev);
2045 
2046 	return (q && blk_queue_discard(q));
2047 }
2048 
2049 /*
2050  * If discard_passdown was enabled verify that the destination device supports
2051  * discards. Disable discard_passdown if not.
2052  */
2053 static void disable_passdown_if_not_supported(struct clone *clone)
2054 {
2055 	struct block_device *dest_dev = clone->dest_dev->bdev;
2056 	struct queue_limits *dest_limits = &bdev_get_queue(dest_dev)->limits;
2057 	const char *reason = NULL;
2058 	char buf[BDEVNAME_SIZE];
2059 
2060 	if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags))
2061 		return;
2062 
2063 	if (!bdev_supports_discards(dest_dev))
2064 		reason = "discard unsupported";
2065 	else if (dest_limits->max_discard_sectors < clone->region_size)
2066 		reason = "max discard sectors smaller than a region";
2067 
2068 	if (reason) {
2069 		DMWARN("Destination device (%s) %s: Disabling discard passdown.",
2070 		       bdevname(dest_dev, buf), reason);
2071 		clear_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags);
2072 	}
2073 }
2074 
2075 static void set_discard_limits(struct clone *clone, struct queue_limits *limits)
2076 {
2077 	struct block_device *dest_bdev = clone->dest_dev->bdev;
2078 	struct queue_limits *dest_limits = &bdev_get_queue(dest_bdev)->limits;
2079 
2080 	if (!test_bit(DM_CLONE_DISCARD_PASSDOWN, &clone->flags)) {
2081 		/* No passdown is done so we set our own virtual limits */
2082 		limits->discard_granularity = clone->region_size << SECTOR_SHIFT;
2083 		limits->max_discard_sectors = round_down(UINT_MAX >> SECTOR_SHIFT, clone->region_size);
2084 		return;
2085 	}
2086 
2087 	/*
2088 	 * clone_iterate_devices() is stacking both the source and destination
2089 	 * device limits but discards aren't passed to the source device, so
2090 	 * inherit destination's limits.
2091 	 */
2092 	limits->max_discard_sectors = dest_limits->max_discard_sectors;
2093 	limits->max_hw_discard_sectors = dest_limits->max_hw_discard_sectors;
2094 	limits->discard_granularity = dest_limits->discard_granularity;
2095 	limits->discard_alignment = dest_limits->discard_alignment;
2096 	limits->discard_misaligned = dest_limits->discard_misaligned;
2097 	limits->max_discard_segments = dest_limits->max_discard_segments;
2098 }
2099 
2100 static void clone_io_hints(struct dm_target *ti, struct queue_limits *limits)
2101 {
2102 	struct clone *clone = ti->private;
2103 	u64 io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
2104 
2105 	/*
2106 	 * If the system-determined stacked limits are compatible with
2107 	 * dm-clone's region size (io_opt is a factor) do not override them.
2108 	 */
2109 	if (io_opt_sectors < clone->region_size ||
2110 	    do_div(io_opt_sectors, clone->region_size)) {
2111 		blk_limits_io_min(limits, clone->region_size << SECTOR_SHIFT);
2112 		blk_limits_io_opt(limits, clone->region_size << SECTOR_SHIFT);
2113 	}
2114 
2115 	disable_passdown_if_not_supported(clone);
2116 	set_discard_limits(clone, limits);
2117 }
2118 
2119 static int clone_iterate_devices(struct dm_target *ti,
2120 				 iterate_devices_callout_fn fn, void *data)
2121 {
2122 	int ret;
2123 	struct clone *clone = ti->private;
2124 	struct dm_dev *dest_dev = clone->dest_dev;
2125 	struct dm_dev *source_dev = clone->source_dev;
2126 
2127 	ret = fn(ti, source_dev, 0, ti->len, data);
2128 	if (!ret)
2129 		ret = fn(ti, dest_dev, 0, ti->len, data);
2130 	return ret;
2131 }
2132 
2133 /*
2134  * dm-clone message functions.
2135  */
2136 static void set_hydration_threshold(struct clone *clone, unsigned int nr_regions)
2137 {
2138 	WRITE_ONCE(clone->hydration_threshold, nr_regions);
2139 
2140 	/*
2141 	 * If user space sets hydration_threshold to zero then the hydration
2142 	 * will stop. If at a later time the hydration_threshold is increased
2143 	 * we must restart the hydration process by waking up the worker.
2144 	 */
2145 	wake_worker(clone);
2146 }
2147 
2148 static void set_hydration_batch_size(struct clone *clone, unsigned int nr_regions)
2149 {
2150 	WRITE_ONCE(clone->hydration_batch_size, nr_regions);
2151 }
2152 
2153 static void enable_hydration(struct clone *clone)
2154 {
2155 	if (!test_and_set_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags))
2156 		wake_worker(clone);
2157 }
2158 
2159 static void disable_hydration(struct clone *clone)
2160 {
2161 	clear_bit(DM_CLONE_HYDRATION_ENABLED, &clone->flags);
2162 }
2163 
2164 static int clone_message(struct dm_target *ti, unsigned int argc, char **argv,
2165 			 char *result, unsigned int maxlen)
2166 {
2167 	struct clone *clone = ti->private;
2168 	unsigned int value;
2169 
2170 	if (!argc)
2171 		return -EINVAL;
2172 
2173 	if (!strcasecmp(argv[0], "enable_hydration")) {
2174 		enable_hydration(clone);
2175 		return 0;
2176 	}
2177 
2178 	if (!strcasecmp(argv[0], "disable_hydration")) {
2179 		disable_hydration(clone);
2180 		return 0;
2181 	}
2182 
2183 	if (argc != 2)
2184 		return -EINVAL;
2185 
2186 	if (!strcasecmp(argv[0], "hydration_threshold")) {
2187 		if (kstrtouint(argv[1], 10, &value))
2188 			return -EINVAL;
2189 
2190 		set_hydration_threshold(clone, value);
2191 
2192 		return 0;
2193 	}
2194 
2195 	if (!strcasecmp(argv[0], "hydration_batch_size")) {
2196 		if (kstrtouint(argv[1], 10, &value))
2197 			return -EINVAL;
2198 
2199 		set_hydration_batch_size(clone, value);
2200 
2201 		return 0;
2202 	}
2203 
2204 	DMERR("%s: Unsupported message `%s'", clone_device_name(clone), argv[0]);
2205 	return -EINVAL;
2206 }
2207 
2208 static struct target_type clone_target = {
2209 	.name = "clone",
2210 	.version = {1, 0, 0},
2211 	.module = THIS_MODULE,
2212 	.ctr = clone_ctr,
2213 	.dtr =  clone_dtr,
2214 	.map = clone_map,
2215 	.end_io = clone_endio,
2216 	.postsuspend = clone_postsuspend,
2217 	.resume = clone_resume,
2218 	.status = clone_status,
2219 	.message = clone_message,
2220 	.io_hints = clone_io_hints,
2221 	.iterate_devices = clone_iterate_devices,
2222 };
2223 
2224 /*---------------------------------------------------------------------------*/
2225 
2226 /* Module functions */
2227 static int __init dm_clone_init(void)
2228 {
2229 	int r;
2230 
2231 	_hydration_cache = KMEM_CACHE(dm_clone_region_hydration, 0);
2232 	if (!_hydration_cache)
2233 		return -ENOMEM;
2234 
2235 	r = dm_register_target(&clone_target);
2236 	if (r < 0) {
2237 		DMERR("Failed to register clone target");
2238 		return r;
2239 	}
2240 
2241 	return 0;
2242 }
2243 
2244 static void __exit dm_clone_exit(void)
2245 {
2246 	dm_unregister_target(&clone_target);
2247 
2248 	kmem_cache_destroy(_hydration_cache);
2249 	_hydration_cache = NULL;
2250 }
2251 
2252 /* Module hooks */
2253 module_init(dm_clone_init);
2254 module_exit(dm_clone_exit);
2255 
2256 MODULE_DESCRIPTION(DM_NAME " clone target");
2257 MODULE_AUTHOR("Nikos Tsironis <ntsironis@arrikto.com>");
2258 MODULE_LICENSE("GPL");
2259