/*
 * dm-exception-store.c
 *
 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
 * Copyright (C) 2006 Red Hat GmbH
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-snap.h"

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>

#define DM_MSG_PREFIX "snapshots"
#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32	/* 16KB */

/*-----------------------------------------------------------------
 * Persistent snapshots: by "persistent" we mean that the snapshot
 * will survive a reboot.
 *---------------------------------------------------------------*/

/*
 * We need to store a record of which parts of the origin have
 * been copied to the snapshot device.  The snapshot code
 * requires that we copy exception chunks to chunk-aligned areas
 * of the COW store.  It makes sense, therefore, to store the
 * metadata in chunk-sized blocks.
 *
 * There is no backward or forward compatibility implemented;
 * snapshots with a different disk version from the kernel's will
 * not be usable.  It is expected that "lvcreate" will blank out
 * the start of a fresh COW device before calling the snapshot
 * constructor.
 *
 * The first chunk of the COW device just contains the header.
 * After this there is a chunk filled with exception metadata,
 * followed by as many exception data chunks as that metadata
 * area can describe; the pattern then repeats.
 *
 * All on-disk structures are in little-endian format.  The end
 * of the exceptions info is indicated by an exception with a
 * new_chunk of 0, which is invalid since it would point to the
 * header chunk.
 */
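
/*
 * The resulting on-disk layout (see area_io() below for the
 * arithmetic):
 *
 *	chunk 0:	header
 *	chunk 1:	metadata area 0
 *	chunks 2..N+1:	exception data for area 0
 *	chunk N+2:	metadata area 1
 *	...
 *
 * where N is the number of exceptions each metadata area can hold
 * (exceptions_per_area).
 */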

/*
 * Magic for persistent snapshots: "SnAp" - Feeble, isn't it.
 */
#define SNAP_MAGIC 0x70416e53

/*
 * The on-disk version of the metadata.
 */
#define SNAPSHOT_DISK_VERSION 1

struct disk_header {
	uint32_t magic;

	/*
	 * Is this snapshot valid?  There is no way of recovering
	 * an invalid snapshot.
	 */
	uint32_t valid;

	/*
	 * Simple, incrementing version; no backward
	 * compatibility.
	 */
	uint32_t version;

	/* In sectors */
	uint32_t chunk_size;
};

struct disk_exception {
	uint64_t old_chunk;
	uint64_t new_chunk;
};

struct commit_callback {
	void (*callback)(void *, int success);
	void *context;
};

/*
 * The top level structure for a persistent exception store.
 */
struct pstore {
	struct dm_snapshot *snap;	/* up pointer to my snapshot */
	int version;
	int valid;
	uint32_t exceptions_per_area;

	/*
	 * Now that we have an asynchronous kcopyd there is no
	 * need for large chunk sizes, so it won't hurt to have a
	 * whole chunk's worth of metadata in memory at once.
	 */
	void *area;

	/*
	 * Used to keep track of which metadata area the data in
	 * 'area' refers to.
	 */
	uint32_t current_area;

	/*
	 * The next free chunk for an exception.
	 */
	uint32_t next_free;

	/*
	 * The index of the next free exception in the current
	 * metadata area.
	 */
	uint32_t current_committed;

	atomic_t pending_count;
	uint32_t callback_count;
	struct commit_callback *callbacks;
	struct dm_io_client *io_client;

	struct workqueue_struct *metadata_wq;
};

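/* Number of pages needed to hold @sectors 512-byte sectors. */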
static unsigned sectors_to_pages(unsigned sectors)
{
	return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9);
}

static int alloc_area(struct pstore *ps)
{
	int r = -ENOMEM;
	size_t len;

	len = ps->snap->chunk_size << SECTOR_SHIFT;

	/*
	 * Allocate the chunk_size block of memory that will hold
	 * a single metadata area.
	 */
	ps->area = vmalloc(len);
	if (!ps->area)
		return r;

	return 0;
}

static void free_area(struct pstore *ps)
{
	vfree(ps->area);
	ps->area = NULL;
}

struct mdata_req {
	struct dm_io_region *where;
	struct dm_io_request *io_req;
	struct work_struct work;
	int result;
};

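/*
 * Run a queued metadata request.  The dm_io() call is synchronous
 * here because the request's notify.fn is NULL.
 */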
static void do_metadata(struct work_struct *work)
{
	struct mdata_req *req = container_of(work, struct mdata_req, work);

	req->result = dm_io(req->io_req, 1, req->where, NULL);
}

/*
 * Read or write a chunk aligned and sized block of data from a device.
 */
static int chunk_io(struct pstore *ps, uint32_t chunk, int rw, int metadata)
{
	struct dm_io_region where = {
		.bdev = ps->snap->cow->bdev,
		.sector = ps->snap->chunk_size * chunk,
		.count = ps->snap->chunk_size,
	};
	struct dm_io_request io_req = {
		.bi_rw = rw,
		.mem.type = DM_IO_VMA,
		.mem.ptr.vma = ps->area,
		.client = ps->io_client,
		.notify.fn = NULL,
	};
	struct mdata_req req;

	if (!metadata)
		return dm_io(&io_req, 1, &where, NULL);

	req.where = &where;
	req.io_req = &io_req;

	/*
	 * Issue the synchronous I/O from a different thread
	 * to avoid generic_make_request recursion.
	 */
	INIT_WORK(&req.work, do_metadata);
	queue_work(ps->metadata_wq, &req.work);
	flush_workqueue(ps->metadata_wq);

	return req.result;
}

/*
 * Read or write a metadata area, remembering to skip the first
 * chunk, which holds the header.
 */
static int area_io(struct pstore *ps, uint32_t area, int rw)
{
	int r;
	uint32_t chunk;

	/* convert a metadata area index to a chunk index */
	chunk = 1 + ((ps->exceptions_per_area + 1) * area);

	r = chunk_io(ps, chunk, rw, 0);
	if (r)
		return r;

	ps->current_area = area;
	return 0;
}

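/* Wipe a metadata area on disk by writing out a zero-filled chunk. */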
static int zero_area(struct pstore *ps, uint32_t area)
{
	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
	return area_io(ps, area, WRITE);
}

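/*
 * Read the header from chunk 0 and take over its parameters.  A
 * freshly zeroed COW device (magic == 0) indicates a new snapshot,
 * reported through *new_snapshot.
 */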
static int read_header(struct pstore *ps, int *new_snapshot)
{
	int r;
	struct disk_header *dh;
	chunk_t chunk_size;
	int chunk_size_supplied = 1;

	/*
	 * Use default chunk size (or hardsect_size, if larger) if none supplied
	 */
	if (!ps->snap->chunk_size) {
		ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
		    bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
		ps->snap->chunk_mask = ps->snap->chunk_size - 1;
		ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
		chunk_size_supplied = 0;
	}

	ps->io_client =
		dm_io_client_create(sectors_to_pages(ps->snap->chunk_size));
	if (IS_ERR(ps->io_client))
		return PTR_ERR(ps->io_client);

	r = alloc_area(ps);
	if (r)
		return r;

	r = chunk_io(ps, 0, READ, 1);
	if (r)
		goto bad;

	dh = (struct disk_header *) ps->area;

	if (le32_to_cpu(dh->magic) == 0) {
		*new_snapshot = 1;
		return 0;
	}

	if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
		DMWARN("Invalid or corrupt snapshot");
		r = -ENXIO;
		goto bad;
	}

	*new_snapshot = 0;
	ps->valid = le32_to_cpu(dh->valid);
	ps->version = le32_to_cpu(dh->version);
	chunk_size = le32_to_cpu(dh->chunk_size);

	if (ps->snap->chunk_size == chunk_size)
		return 0;

	/*
	 * The metadata's chunk size always wins; only warn if the
	 * table actually supplied a conflicting value.
	 */
	if (chunk_size_supplied)
		DMWARN("chunk size %llu in device metadata overrides "
		       "table chunk size of %llu.",
		       (unsigned long long)chunk_size,
		       (unsigned long long)ps->snap->chunk_size);

	/* We had a bogus chunk_size. Fix stuff up. */
	free_area(ps);

	ps->snap->chunk_size = chunk_size;
	ps->snap->chunk_mask = chunk_size - 1;
	ps->snap->chunk_shift = ffs(chunk_size) - 1;

	r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
				ps->io_client);
	if (r)
		return r;

	r = alloc_area(ps);
	return r;

bad:
	free_area(ps);
	return r;
}

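/* Write the in-core header fields back out to chunk 0. */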
static int write_header(struct pstore *ps)
{
	struct disk_header *dh;

	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);

	dh = (struct disk_header *) ps->area;
	dh->magic = cpu_to_le32(SNAP_MAGIC);
	dh->valid = cpu_to_le32(ps->valid);
	dh->version = cpu_to_le32(ps->version);
	dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);

	return chunk_io(ps, 0, WRITE, 1);
}

/*
 * Access functions for the disk exceptions; these do the endian conversions.
 */
static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
{
	BUG_ON(index >= ps->exceptions_per_area);

	return ((struct disk_exception *) ps->area) + index;
}

static void read_exception(struct pstore *ps,
			   uint32_t index, struct disk_exception *result)
{
	struct disk_exception *e = get_exception(ps, index);

	/* copy it */
	result->old_chunk = le64_to_cpu(e->old_chunk);
	result->new_chunk = le64_to_cpu(e->new_chunk);
}

static void write_exception(struct pstore *ps,
			    uint32_t index, struct disk_exception *de)
{
	struct disk_exception *e = get_exception(ps, index);

	/* copy it */
	e->old_chunk = cpu_to_le64(de->old_chunk);
	e->new_chunk = cpu_to_le64(de->new_chunk);
}

/*
 * Registers the exceptions that are present in the current area.
 * '*full' is set to indicate whether the area turned out to be full.
 */
static int insert_exceptions(struct pstore *ps, int *full)
{
	int r;
	unsigned int i;
	struct disk_exception de;

	/* presume the area is full */
	*full = 1;

	for (i = 0; i < ps->exceptions_per_area; i++) {
		read_exception(ps, i, &de);

		/*
		 * If new_chunk is 0 it would point at the header
		 * chunk at the start of the COW device, which is
		 * invalid, so we know we've hit the end of the
		 * exceptions.  Therefore the area is not full.
		 */
		if (de.new_chunk == 0LL) {
			ps->current_committed = i;
			*full = 0;
			break;
		}

		/*
		 * Keep track of the start of the free chunks.
		 */
		if (ps->next_free <= de.new_chunk)
			ps->next_free = de.new_chunk + 1;

		/*
		 * Otherwise we add the exception to the snapshot.
		 */
		r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
		if (r)
			return r;
	}

	return 0;
}

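/*
 * Load every complete metadata area, stopping after the first
 * partially filled one, which is where new exceptions will be
 * committed.
 */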
static int read_exceptions(struct pstore *ps)
{
	uint32_t area;
	int r, full = 1;

	/*
	 * Keep reading chunks and inserting exceptions until
	 * we find a partially full area.
	 */
	for (area = 0; full; area++) {
		r = area_io(ps, area, READ);
		if (r)
			return r;

		r = insert_exceptions(ps, &full);
		if (r)
			return r;
	}

	return 0;
}

static struct pstore *get_info(struct exception_store *store)
{
	return (struct pstore *) store->context;
}

static void persistent_fraction_full(struct exception_store *store,
				     sector_t *numerator, sector_t *denominator)
{
	*numerator = get_info(store)->next_free * store->snap->chunk_size;
	*denominator = get_dev_size(store->snap->cow->bdev);
}

static void persistent_destroy(struct exception_store *store)
{
	struct pstore *ps = get_info(store);

	destroy_workqueue(ps->metadata_wq);
	dm_io_client_destroy(ps->io_client);
	vfree(ps->callbacks);
	free_area(ps);
	kfree(ps);
}

static int persistent_read_metadata(struct exception_store *store)
{
	int r, uninitialized_var(new_snapshot);
	struct pstore *ps = get_info(store);

	/*
	 * Read the snapshot header.
	 */
	r = read_header(ps, &new_snapshot);
	if (r)
		return r;

	/*
	 * Now that we know the correct chunk_size, complete the
	 * initialisation.
	 */
	ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
				  sizeof(struct disk_exception);
	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
				   sizeof(*ps->callbacks));
	if (!ps->callbacks)
		return -ENOMEM;

	/*
	 * Do we need to set up a new snapshot?
	 */
	if (new_snapshot) {
		r = write_header(ps);
		if (r) {
			DMWARN("write_header failed");
			return r;
		}

		r = zero_area(ps, 0);
		if (r) {
			DMWARN("zero_area(0) failed");
			return r;
		}

	} else {
		/*
		 * Sanity checks.
		 */
		if (ps->version != SNAPSHOT_DISK_VERSION) {
			DMWARN("unable to handle snapshot disk version %d",
			       ps->version);
			return -EINVAL;
		}

		/*
		 * The metadata is valid, but the snapshot has been
		 * invalidated.
		 */
		if (!ps->valid)
			return 1;

		/*
		 * Read the metadata.
		 */
		r = read_exceptions(ps);
		if (r)
			return r;
	}

	return 0;
}

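/*
 * Reserve the next free chunk of the COW device for a pending
 * exception; the metadata entry itself is written out later by
 * persistent_commit().
 */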
static int persistent_prepare(struct exception_store *store,
			      struct dm_snap_exception *e)
{
	struct pstore *ps = get_info(store);
	uint32_t stride;
	sector_t size = get_dev_size(store->snap->cow->bdev);

	/* Is there enough room? */
	if (size < ((ps->next_free + 1) * store->snap->chunk_size))
		return -ENOSPC;

	e->new_chunk = ps->next_free;

	/*
	 * Move on to the next free chunk, making sure to skip over
	 * the metadata chunks.
	 */
	stride = (ps->exceptions_per_area + 1);
	if ((++ps->next_free % stride) == 1)
		ps->next_free++;

	atomic_inc(&ps->pending_count);
	return 0;
}

static void persistent_commit(struct exception_store *store,
			      struct dm_snap_exception *e,
			      void (*callback) (void *, int success),
			      void *callback_context)
{
	int r;
	unsigned int i;
	struct pstore *ps = get_info(store);
	struct disk_exception de;
	struct commit_callback *cb;

	de.old_chunk = e->old_chunk;
	de.new_chunk = e->new_chunk;
	write_exception(ps, ps->current_committed++, &de);

	/*
	 * Add the callback to the back of the array.  This code
	 * is the only place where the callback array is
	 * manipulated, and we know that it will never be called
	 * multiple times concurrently.
	 */
	cb = ps->callbacks + ps->callback_count++;
	cb->callback = callback;
	cb->context = callback_context;

	/*
	 * If there are no more exceptions in flight, or we have
	 * filled this metadata area, we commit the exceptions to
	 * disk.
	 */
	if (atomic_dec_and_test(&ps->pending_count) ||
	    (ps->current_committed == ps->exceptions_per_area)) {
		r = area_io(ps, ps->current_area, WRITE);
		if (r)
			ps->valid = 0;

		/*
		 * Have we completely filled the current area?
		 */
		if (ps->current_committed == ps->exceptions_per_area) {
			ps->current_committed = 0;
			r = zero_area(ps, ps->current_area + 1);
			if (r)
				ps->valid = 0;
		}

		for (i = 0; i < ps->callback_count; i++) {
			cb = ps->callbacks + i;
			cb->callback(cb->context, r == 0 ? 1 : 0);
		}

		ps->callback_count = 0;
	}
}

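/*
 * Mark the snapshot invalid on disk; per the header comment there
 * is no way to recover an invalid snapshot.
 */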
static void persistent_drop(struct exception_store *store)
{
	struct pstore *ps = get_info(store);

	ps->valid = 0;
	if (write_header(ps))
		DMWARN("write header failed");
}

int dm_create_persistent(struct exception_store *store)
{
	struct pstore *ps;

	/* allocate the pstore */
	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
	if (!ps)
		return -ENOMEM;

	ps->snap = store->snap;
	ps->valid = 1;
	ps->version = SNAPSHOT_DISK_VERSION;
	ps->area = NULL;
	ps->next_free = 2;	/* skipping the header and first area */
	ps->current_committed = 0;

	ps->callback_count = 0;
	atomic_set(&ps->pending_count, 0);
	ps->callbacks = NULL;

	ps->metadata_wq = create_singlethread_workqueue("ksnaphd");
	if (!ps->metadata_wq) {
		kfree(ps);
		DMERR("couldn't start header metadata update thread");
		return -ENOMEM;
	}

	store->destroy = persistent_destroy;
	store->read_metadata = persistent_read_metadata;
	store->prepare_exception = persistent_prepare;
	store->commit_exception = persistent_commit;
	store->drop_snapshot = persistent_drop;
	store->fraction_full = persistent_fraction_full;
	store->context = ps;

	return 0;
}

/*-----------------------------------------------------------------
 * Implementation of the store for non-persistent snapshots.
 *---------------------------------------------------------------*/
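/*
 * A transient store keeps no on-disk state: it only tracks the next
 * free sector of the COW device, so the snapshot does not survive
 * deactivation or a reboot.
 */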
struct transient_c {
	sector_t next_free;
};

static void transient_destroy(struct exception_store *store)
{
	kfree(store->context);
}

static int transient_read_metadata(struct exception_store *store)
{
	return 0;
}

static int transient_prepare(struct exception_store *store,
			     struct dm_snap_exception *e)
{
	struct transient_c *tc = (struct transient_c *) store->context;
	sector_t size = get_dev_size(store->snap->cow->bdev);

	if (size < (tc->next_free + store->snap->chunk_size))
		return -ENOSPC;

	e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
	tc->next_free += store->snap->chunk_size;

	return 0;
}

static void transient_commit(struct exception_store *store,
			     struct dm_snap_exception *e,
			     void (*callback) (void *, int success),
			     void *callback_context)
{
	/* Just succeed */
	callback(callback_context, 1);
}

static void transient_fraction_full(struct exception_store *store,
				    sector_t *numerator, sector_t *denominator)
{
	*numerator = ((struct transient_c *) store->context)->next_free;
	*denominator = get_dev_size(store->snap->cow->bdev);
}

int dm_create_transient(struct exception_store *store)
{
	struct transient_c *tc;

	store->destroy = transient_destroy;
	store->read_metadata = transient_read_metadata;
	store->prepare_exception = transient_prepare;
	store->commit_exception = transient_commit;
	store->drop_snapshot = NULL;
	store->fraction_full = transient_fraction_full;

	tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
	if (!tc)
		return -ENOMEM;

	tc->next_free = 0;
	store->context = tc;

	return 0;
}