/*
 * dm-exception-store.c
 *
 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
 * Copyright (C) 2006 Red Hat GmbH
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-snap.h"

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>

#define DM_MSG_PREFIX "snapshots"
#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32	/* 16KB */

/*-----------------------------------------------------------------
 * Persistent snapshots: by "persistent" we mean that the snapshot
 * will survive a reboot.
 *---------------------------------------------------------------*/

/*
 * We need to store a record of which parts of the origin have
 * been copied to the snapshot device.  The snapshot code
 * requires that we copy exception chunks to chunk-aligned areas
 * of the COW store.  It therefore makes sense to store the
 * metadata in chunk-sized blocks.
 *
 * There is no backward or forward compatibility implemented;
 * snapshots with a different disk version than the kernel will
 * not be usable.  It is expected that "lvcreate" will blank out
 * the start of a fresh COW device before calling the snapshot
 * constructor.
 *
 * The first chunk of the COW device just contains the header.
 * After this there is a chunk filled with exception metadata,
 * followed by as many exception chunks as can be described by
 * that metadata area; the pattern then repeats.
 *
 * All on-disk structures are in little-endian format.  The end
 * of the exceptions info is indicated by an exception with a
 * new_chunk of 0, which is invalid since it would point to the
 * header chunk.
 */
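/*
 * A worked example of this layout, assuming the default chunk size
 * of 32 sectors (16KB) and the 16-byte on-disk exception record
 * defined below (other chunk sizes shift these numbers accordingly):
 *
 *	exceptions per metadata area = 16384 / 16 = 1024
 *
 *	chunk 0		header
 *	chunk 1		metadata area 0
 *	chunks 2-1025	exception data chunks described by area 0
 *	chunk 1026	metadata area 1
 *	...
 *
 * This is why a new_chunk of 0 can double as the end-of-list marker:
 * no exception can ever legitimately point at the header chunk.
 */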

/*
 * Magic for persistent snapshots: "SnAp" - feeble, isn't it?
 */
#define SNAP_MAGIC 0x70416e53

/*
 * The on-disk version of the metadata.
 */
#define SNAPSHOT_DISK_VERSION 1

struct disk_header {
	uint32_t magic;

	/*
	 * Is this snapshot valid?  There is no way of recovering
	 * an invalid snapshot.
	 */
	uint32_t valid;

	/*
	 * Simple, incrementing version; no backward
	 * compatibility.
	 */
	uint32_t version;

	/* In sectors */
	uint32_t chunk_size;
};

struct disk_exception {
	uint64_t old_chunk;
	uint64_t new_chunk;
};

struct commit_callback {
	void (*callback)(void *, int success);
	void *context;
};

/*
 * The top-level structure for a persistent exception store.
 */
struct pstore {
	struct dm_snapshot *snap;	/* up pointer to my snapshot */
	int version;
	int valid;
	uint32_t exceptions_per_area;

	/*
	 * Now that we have an asynchronous kcopyd there is no
	 * need for large chunk sizes, so it won't hurt to have a
	 * whole chunk's worth of metadata in memory at once.
	 */
	void *area;

	/*
	 * Used to keep track of which metadata area the data in
	 * 'area' refers to.
	 */
	chunk_t current_area;

	/*
	 * The next free chunk for an exception.
	 */
	chunk_t next_free;

	/*
	 * The index of the next free exception in the current
	 * metadata area.
	 */
	uint32_t current_committed;

	atomic_t pending_count;
	uint32_t callback_count;
	struct commit_callback *callbacks;
	struct dm_io_client *io_client;

	struct workqueue_struct *metadata_wq;
};

static unsigned sectors_to_pages(unsigned sectors)
{
	return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9);
}

static int alloc_area(struct pstore *ps)
{
	int r = -ENOMEM;
	size_t len;

	len = ps->snap->chunk_size << SECTOR_SHIFT;

	/*
	 * Allocate the chunk_size block of memory that will hold
	 * a single metadata area.
	 */
	ps->area = vmalloc(len);
	if (!ps->area)
		return r;

	return 0;
}

static void free_area(struct pstore *ps)
{
	vfree(ps->area);
	ps->area = NULL;
}

struct mdata_req {
	struct dm_io_region *where;
	struct dm_io_request *io_req;
	struct work_struct work;
	int result;
};

static void do_metadata(struct work_struct *work)
{
	struct mdata_req *req = container_of(work, struct mdata_req, work);

	req->result = dm_io(req->io_req, 1, req->where, NULL);
}

/*
 * Read or write a chunk-aligned and chunk-sized block of data from a device.
 */
static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata)
{
	struct dm_io_region where = {
		.bdev = ps->snap->cow->bdev,
		.sector = ps->snap->chunk_size * chunk,
		.count = ps->snap->chunk_size,
	};
	struct dm_io_request io_req = {
		.bi_rw = rw,
		.mem.type = DM_IO_VMA,
		.mem.ptr.vma = ps->area,
		.client = ps->io_client,
		.notify.fn = NULL,
	};
	struct mdata_req req;

	if (!metadata)
		return dm_io(&io_req, 1, &where, NULL);

	req.where = &where;
	req.io_req = &io_req;

	/*
	 * Issue the synchronous I/O from a different thread
	 * to avoid generic_make_request recursion.
	 */
	INIT_WORK(&req.work, do_metadata);
	queue_work(ps->metadata_wq, &req.work);
	flush_workqueue(ps->metadata_wq);

	return req.result;
}

/*
 * Convert a metadata area index to a chunk index.
 */
static chunk_t area_location(struct pstore *ps, chunk_t area)
{
	return 1 + ((ps->exceptions_per_area + 1) * area);
}
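
/*
 * For example, with 1024 exceptions per area (the figure implied by
 * the default 16KB chunk size), area 0 maps to chunk 1, area 1 to
 * chunk 1026 and area 2 to chunk 2051; chunk 0 always holds the
 * header.
 */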

/*
 * Read or write a metadata area, remembering to skip the first
 * chunk, which holds the header.
 */
static int area_io(struct pstore *ps, chunk_t area, int rw)
{
	int r;
	chunk_t chunk;

	chunk = area_location(ps, area);

	r = chunk_io(ps, chunk, rw, 0);
	if (r)
		return r;

	ps->current_area = area;
	return 0;
}

static int zero_area(struct pstore *ps, chunk_t area)
{
	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
	return area_io(ps, area, WRITE);
}

static int read_header(struct pstore *ps, int *new_snapshot)
{
	int r;
	struct disk_header *dh;
	chunk_t chunk_size;
	int chunk_size_supplied = 1;

	/*
	 * Use default chunk size (or hardsect_size, if larger) if none supplied
	 */
	if (!ps->snap->chunk_size) {
		ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
		    bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
		ps->snap->chunk_mask = ps->snap->chunk_size - 1;
		ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
		chunk_size_supplied = 0;
	}

	ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->chunk_size));
	if (IS_ERR(ps->io_client))
		return PTR_ERR(ps->io_client);

	r = alloc_area(ps);
	if (r)
		return r;

	r = chunk_io(ps, 0, READ, 1);
	if (r)
		goto bad;

	dh = (struct disk_header *) ps->area;

	if (le32_to_cpu(dh->magic) == 0) {
		*new_snapshot = 1;
		return 0;
	}

	if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
		DMWARN("Invalid or corrupt snapshot");
		r = -ENXIO;
		goto bad;
	}

	*new_snapshot = 0;
	ps->valid = le32_to_cpu(dh->valid);
	ps->version = le32_to_cpu(dh->version);
	chunk_size = le32_to_cpu(dh->chunk_size);

	if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
		return 0;

	DMWARN("chunk size %llu in device metadata overrides "
	       "table chunk size of %llu.",
	       (unsigned long long)chunk_size,
	       (unsigned long long)ps->snap->chunk_size);

	/* We had a bogus chunk_size. Fix stuff up. */
	free_area(ps);

	ps->snap->chunk_size = chunk_size;
	ps->snap->chunk_mask = chunk_size - 1;
	ps->snap->chunk_shift = ffs(chunk_size) - 1;

	r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
				ps->io_client);
	if (r)
		return r;

	r = alloc_area(ps);
	return r;

bad:
	free_area(ps);
	return r;
}

static int write_header(struct pstore *ps)
{
	struct disk_header *dh;

	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);

	dh = (struct disk_header *) ps->area;
	dh->magic = cpu_to_le32(SNAP_MAGIC);
	dh->valid = cpu_to_le32(ps->valid);
	dh->version = cpu_to_le32(ps->version);
	dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);

	return chunk_io(ps, 0, WRITE, 1);
}

/*
 * Access functions for the disk exceptions; these do the endian conversions.
 */
static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
{
	BUG_ON(index >= ps->exceptions_per_area);

	return ((struct disk_exception *) ps->area) + index;
}

static void read_exception(struct pstore *ps,
			   uint32_t index, struct disk_exception *result)
{
	struct disk_exception *e = get_exception(ps, index);

	/* copy it */
	result->old_chunk = le64_to_cpu(e->old_chunk);
	result->new_chunk = le64_to_cpu(e->new_chunk);
}

static void write_exception(struct pstore *ps,
			    uint32_t index, struct disk_exception *de)
{
	struct disk_exception *e = get_exception(ps, index);

	/* copy it */
	e->old_chunk = cpu_to_le64(de->old_chunk);
	e->new_chunk = cpu_to_le64(de->new_chunk);
}

/*
 * Registers the exceptions that are present in the current area.
 * 'full' is filled in to indicate whether the area has been
 * filled.
 */
static int insert_exceptions(struct pstore *ps, int *full)
{
	int r;
	unsigned int i;
	struct disk_exception de;

	/* presume the area is full */
	*full = 1;

	for (i = 0; i < ps->exceptions_per_area; i++) {
		read_exception(ps, i, &de);

		/*
		 * If the new_chunk is pointing at the start of
		 * the COW device, where the first metadata area
		 * is, we know that we've hit the end of the
		 * exceptions.  Therefore the area is not full.
		 */
		if (de.new_chunk == 0LL) {
			ps->current_committed = i;
			*full = 0;
			break;
		}

		/*
		 * Keep track of the start of the free chunks.
		 */
		if (ps->next_free <= de.new_chunk)
			ps->next_free = de.new_chunk + 1;

		/*
		 * Otherwise we add the exception to the snapshot.
		 */
		r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
		if (r)
			return r;
	}

	return 0;
}

static int read_exceptions(struct pstore *ps)
{
	chunk_t area;
	int r, full = 1;

	/*
	 * Keep reading chunks and inserting exceptions until
	 * we find a partially full area.
	 */
	for (area = 0; full; area++) {
		r = area_io(ps, area, READ);
		if (r)
			return r;

		r = insert_exceptions(ps, &full);
		if (r)
			return r;
	}

	return 0;
}

static struct pstore *get_info(struct exception_store *store)
{
	return (struct pstore *) store->context;
}

static void persistent_fraction_full(struct exception_store *store,
				     sector_t *numerator, sector_t *denominator)
{
	*numerator = get_info(store)->next_free * store->snap->chunk_size;
	*denominator = get_dev_size(store->snap->cow->bdev);
}

static void persistent_destroy(struct exception_store *store)
{
	struct pstore *ps = get_info(store);

	destroy_workqueue(ps->metadata_wq);
	dm_io_client_destroy(ps->io_client);
	vfree(ps->callbacks);
	free_area(ps);
	kfree(ps);
}

static int persistent_read_metadata(struct exception_store *store)
{
	int r, uninitialized_var(new_snapshot);
	struct pstore *ps = get_info(store);

	/*
	 * Read the snapshot header.
	 */
	r = read_header(ps, &new_snapshot);
	if (r)
		return r;

	/*
	 * Now we know the correct chunk_size, complete the initialisation.
	 */
	ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
				  sizeof(struct disk_exception);
	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
			sizeof(*ps->callbacks));
	if (!ps->callbacks)
		return -ENOMEM;

	/*
	 * Do we need to set up a new snapshot?
	 */
	if (new_snapshot) {
		r = write_header(ps);
		if (r) {
			DMWARN("write_header failed");
			return r;
		}

		r = zero_area(ps, 0);
		if (r) {
			DMWARN("zero_area(0) failed");
			return r;
		}

	} else {
		/*
		 * Sanity checks.
		 */
		if (ps->version != SNAPSHOT_DISK_VERSION) {
			DMWARN("unable to handle snapshot disk version %d",
			       ps->version);
			return -EINVAL;
		}

		/*
		 * The metadata is valid, but the snapshot has been
		 * invalidated.
		 */
		if (!ps->valid)
			return 1;

		/*
		 * Read the metadata.
		 */
		r = read_exceptions(ps);
		if (r)
			return r;
	}

	return 0;
}

static int persistent_prepare(struct exception_store *store,
			      struct dm_snap_exception *e)
{
	struct pstore *ps = get_info(store);
	uint32_t stride;
	chunk_t next_free;
	sector_t size = get_dev_size(store->snap->cow->bdev);

	/* Is there enough room? */
	if (size < ((ps->next_free + 1) * store->snap->chunk_size))
		return -ENOSPC;

	e->new_chunk = ps->next_free;

	/*
	 * Move on to the next free chunk, making sure to take
	 * into account the location of the metadata chunks
	 * (see the worked example after this function).
	 */
	stride = (ps->exceptions_per_area + 1);
	next_free = ++ps->next_free;
	if (sector_div(next_free, stride) == 1)
		ps->next_free++;

	atomic_inc(&ps->pending_count);
	return 0;
}
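
/*
 * A worked example of the skip above, assuming 1024 exceptions per
 * metadata area (so a stride of 1025): next_free advances through
 * chunks 2, 3, ..., 1025; the increment to 1026 leaves a remainder
 * of 1 modulo the stride, i.e. it would land on the chunk reserved
 * for metadata area 1, so next_free is bumped again to 1027, the
 * first data chunk of the second area.
 */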

static void persistent_commit(struct exception_store *store,
			      struct dm_snap_exception *e,
			      void (*callback) (void *, int success),
			      void *callback_context)
{
	int r;
	unsigned int i;
	struct pstore *ps = get_info(store);
	struct disk_exception de;
	struct commit_callback *cb;

	de.old_chunk = e->old_chunk;
	de.new_chunk = e->new_chunk;
	write_exception(ps, ps->current_committed++, &de);

	/*
	 * Add the callback to the back of the array.  This code
	 * is the only place where the callback array is
	 * manipulated, and we know that it will never be called
	 * multiple times concurrently.
	 */
	cb = ps->callbacks + ps->callback_count++;
	cb->callback = callback;
	cb->context = callback_context;

	/*
	 * If there are no more exceptions in flight, or we have
	 * filled this metadata area, we commit the exceptions to
	 * disk.
	 */
	if (atomic_dec_and_test(&ps->pending_count) ||
	    (ps->current_committed == ps->exceptions_per_area)) {
		r = area_io(ps, ps->current_area, WRITE);
		if (r)
			ps->valid = 0;

		/*
		 * Have we completely filled the current area?
		 */
		if (ps->current_committed == ps->exceptions_per_area) {
			ps->current_committed = 0;
			r = zero_area(ps, ps->current_area + 1);
			if (r)
				ps->valid = 0;
		}

		for (i = 0; i < ps->callback_count; i++) {
			cb = ps->callbacks + i;
			cb->callback(cb->context, r == 0 ? 1 : 0);
		}

		ps->callback_count = 0;
	}
}

static void persistent_drop(struct exception_store *store)
{
	struct pstore *ps = get_info(store);

	ps->valid = 0;
	if (write_header(ps))
		DMWARN("write header failed");
}

int dm_create_persistent(struct exception_store *store)
{
	struct pstore *ps;

	/* allocate the pstore */
	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
	if (!ps)
		return -ENOMEM;

	ps->snap = store->snap;
	ps->valid = 1;
	ps->version = SNAPSHOT_DISK_VERSION;
	ps->area = NULL;
	ps->next_free = 2;	/* skipping the header and first area */
	ps->current_committed = 0;

	ps->callback_count = 0;
	atomic_set(&ps->pending_count, 0);
	ps->callbacks = NULL;

	ps->metadata_wq = create_singlethread_workqueue("ksnaphd");
	if (!ps->metadata_wq) {
		kfree(ps);
		DMERR("couldn't start header metadata update thread");
		return -ENOMEM;
	}

	store->destroy = persistent_destroy;
	store->read_metadata = persistent_read_metadata;
	store->prepare_exception = persistent_prepare;
	store->commit_exception = persistent_commit;
	store->drop_snapshot = persistent_drop;
	store->fraction_full = persistent_fraction_full;
	store->context = ps;

	return 0;
}

/*-----------------------------------------------------------------
 * Implementation of the store for non-persistent snapshots.
 *---------------------------------------------------------------*/
struct transient_c {
	sector_t next_free;
};

static void transient_destroy(struct exception_store *store)
{
	kfree(store->context);
}

static int transient_read_metadata(struct exception_store *store)
{
	return 0;
}

static int transient_prepare(struct exception_store *store,
			     struct dm_snap_exception *e)
{
	struct transient_c *tc = (struct transient_c *) store->context;
	sector_t size = get_dev_size(store->snap->cow->bdev);

	if (size < (tc->next_free + store->snap->chunk_size))
		return -1;

	e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
	tc->next_free += store->snap->chunk_size;

	return 0;
}

static void transient_commit(struct exception_store *store,
			     struct dm_snap_exception *e,
			     void (*callback) (void *, int success),
			     void *callback_context)
{
	/* Just succeed */
	callback(callback_context, 1);
}

static void transient_fraction_full(struct exception_store *store,
				    sector_t *numerator, sector_t *denominator)
{
	*numerator = ((struct transient_c *) store->context)->next_free;
	*denominator = get_dev_size(store->snap->cow->bdev);
}

int dm_create_transient(struct exception_store *store)
{
	struct transient_c *tc;

	store->destroy = transient_destroy;
	store->read_metadata = transient_read_metadata;
	store->prepare_exception = transient_prepare;
	store->commit_exception = transient_commit;
	store->drop_snapshot = NULL;
	store->fraction_full = transient_fraction_full;

	tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
	if (!tc)
		return -ENOMEM;

	tc->next_free = 0;
	store->context = tc;

	return 0;
}