/*
 * dm-exception-store.c
 *
 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-snap.h"
#include "dm-io.h"
#include "kcopyd.h"

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>

/*-----------------------------------------------------------------
 * Persistent snapshots: by "persistent" we mean that the snapshot
 * will survive a reboot.
 *---------------------------------------------------------------*/

/*
 * We need to store a record of which parts of the origin have
 * been copied to the snapshot device.  The snapshot code
 * requires that we copy exception chunks to chunk aligned areas
 * of the COW store.  It makes sense, therefore, to store the
 * metadata in chunk-sized blocks.
 *
 * There is no backward or forward compatibility implemented:
 * snapshots with an on-disk version different from the one the
 * kernel expects will not be usable.  It is expected that
 * "lvcreate" will blank out the start of a fresh COW device
 * before calling the snapshot constructor.
 *
 * The first chunk of the COW device just contains the header.
 * After this there is a chunk filled with exception metadata,
 * followed by as many exception chunks as that metadata area can
 * describe; the pattern then repeats.
 *
 * All on-disk structures are in little-endian format.  The end
 * of the exceptions info is indicated by an exception with a
 * new_chunk of 0, which is invalid since it would point to the
 * header chunk.
 */
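
/*
 * For example, with a chunk_size of 8 sectors (4KiB), each 16-byte
 * disk_exception gives 4096 / 16 = 256 exceptions per metadata area,
 * and the COW device is laid out as:
 *
 *	chunk 0:	header
 *	chunk 1:	metadata area 0
 *	chunks 2-257:	exception data
 *	chunk 258:	metadata area 1
 *	...
 */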

/*
 * Magic for persistent snapshots: "SnAp" - feeble, isn't it?
 */
#define SNAP_MAGIC 0x70416e53

/*
 * The on-disk version of the metadata.
 */
#define SNAPSHOT_DISK_VERSION 1

struct disk_header {
	uint32_t magic;

	/*
	 * Is this snapshot valid?  There is no way of recovering
	 * an invalid snapshot.
	 */
	uint32_t valid;

	/*
	 * Simple, incrementing version.  No backward
	 * compatibility.
	 */
	uint32_t version;

	/* In sectors */
	uint32_t chunk_size;
};

struct disk_exception {
	uint64_t old_chunk;
	uint64_t new_chunk;
};
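
/*
 * Both fields are held little-endian on disk, giving a fixed 16-byte
 * record; exceptions_per_area is derived from this size in
 * dm_create_persistent().
 */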

struct commit_callback {
	void (*callback)(void *, int success);
	void *context;
};

/*
 * The top level structure for a persistent exception store.
 */
struct pstore {
	struct dm_snapshot *snap;	/* up pointer to my snapshot */
	int version;
	int valid;
	uint32_t chunk_size;
	uint32_t exceptions_per_area;

	/*
	 * Now that we have an asynchronous kcopyd there is no
	 * need for large chunk sizes, so it won't hurt to have a
	 * whole chunk's worth of metadata in memory at once.
	 */
	void *area;

	/*
	 * Used to keep track of which metadata area the data in
	 * 'area' refers to.
	 */
	uint32_t current_area;

	/*
	 * The next free chunk for an exception.
	 */
	uint32_t next_free;

	/*
	 * The index of the next free exception in the current
	 * metadata area.
	 */
	uint32_t current_committed;

	atomic_t pending_count;
	uint32_t callback_count;
	struct commit_callback *callbacks;
};
125 
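/*
 * PAGE_SIZE >> SECTOR_SHIFT is the number of 512-byte sectors per
 * page (8 with 4KiB pages), so this converts a chunk's sector count
 * into the page count that dm_io_get()/dm_io_put() expect.
 */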
static inline unsigned int sectors_to_pages(unsigned int sectors)
{
	return sectors / (PAGE_SIZE >> SECTOR_SHIFT);
}

static int alloc_area(struct pstore *ps)
{
	int r = -ENOMEM;
	size_t len;

	len = ps->chunk_size << SECTOR_SHIFT;

	/*
	 * Allocate the chunk_size block of memory that will hold
	 * a single metadata area.
	 */
	ps->area = vmalloc(len);
	if (!ps->area)
		return r;

	return 0;
}

static void free_area(struct pstore *ps)
{
	vfree(ps->area);
}

/*
 * Read or write a chunk aligned and sized block of data from a device.
 */
static int chunk_io(struct pstore *ps, uint32_t chunk, int rw)
{
	struct io_region where;
	unsigned long bits;

	where.bdev = ps->snap->cow->bdev;
	where.sector = ps->chunk_size * chunk;
	where.count = ps->chunk_size;

	return dm_io_sync_vm(1, &where, rw, ps->area, &bits);
}

/*
 * Read or write a metadata area, remembering to skip the first
 * chunk, which holds the header.
 */
static int area_io(struct pstore *ps, uint32_t area, int rw)
{
	int r;
	uint32_t chunk;

	/* convert a metadata area index to a chunk index */
	chunk = 1 + ((ps->exceptions_per_area + 1) * area);

	r = chunk_io(ps, chunk, rw);
	if (r)
		return r;

	ps->current_area = area;
	return 0;
}
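
/*
 * Each metadata chunk is followed by the exceptions_per_area data
 * chunks it describes, hence the stride of exceptions_per_area + 1
 * above.  With 256 exceptions per area, area 0 lives in chunk 1,
 * area 1 in chunk 258, area 2 in chunk 515, and so on.
 */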

static int zero_area(struct pstore *ps, uint32_t area)
{
	memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);
	return area_io(ps, area, WRITE);
}

static int read_header(struct pstore *ps, int *new_snapshot)
{
	int r;
	struct disk_header *dh;

	r = chunk_io(ps, 0, READ);
	if (r)
		return r;

	dh = (struct disk_header *) ps->area;

	if (le32_to_cpu(dh->magic) == 0) {
		*new_snapshot = 1;

	} else if (le32_to_cpu(dh->magic) == SNAP_MAGIC) {
		*new_snapshot = 0;
		ps->valid = le32_to_cpu(dh->valid);
		ps->version = le32_to_cpu(dh->version);
		ps->chunk_size = le32_to_cpu(dh->chunk_size);

	} else {
		DMWARN("Invalid/corrupt snapshot");
		r = -ENXIO;
	}

	return r;
}

static int write_header(struct pstore *ps)
{
	struct disk_header *dh;

	memset(ps->area, 0, ps->chunk_size << SECTOR_SHIFT);

	dh = (struct disk_header *) ps->area;
	dh->magic = cpu_to_le32(SNAP_MAGIC);
	dh->valid = cpu_to_le32(ps->valid);
	dh->version = cpu_to_le32(ps->version);
	dh->chunk_size = cpu_to_le32(ps->chunk_size);

	return chunk_io(ps, 0, WRITE);
}

/*
 * Access functions for the disk exceptions; these do the endian conversions.
 */
static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
{
	if (index >= ps->exceptions_per_area)
		return NULL;

	return ((struct disk_exception *) ps->area) + index;
}

static int read_exception(struct pstore *ps,
			  uint32_t index, struct disk_exception *result)
{
	struct disk_exception *e;

	e = get_exception(ps, index);
	if (!e)
		return -EINVAL;

	/* copy it */
	result->old_chunk = le64_to_cpu(e->old_chunk);
	result->new_chunk = le64_to_cpu(e->new_chunk);

	return 0;
}

static int write_exception(struct pstore *ps,
			   uint32_t index, struct disk_exception *de)
{
	struct disk_exception *e;

	e = get_exception(ps, index);
	if (!e)
		return -EINVAL;

	/* copy it */
	e->old_chunk = cpu_to_le64(de->old_chunk);
	e->new_chunk = cpu_to_le64(de->new_chunk);

	return 0;
}

/*
 * Registers the exceptions that are present in the current area.
 * 'full' is filled in to indicate whether the area has been
 * completely filled.
 */
static int insert_exceptions(struct pstore *ps, int *full)
{
	int r;
	unsigned int i;
	struct disk_exception de;

	/* presume the area is full */
	*full = 1;

	for (i = 0; i < ps->exceptions_per_area; i++) {
		r = read_exception(ps, i, &de);

		if (r)
			return r;

		/*
		 * If the new_chunk is pointing at the start of
		 * the COW device, where the header chunk is, we
		 * know that we've hit the end of the exceptions.
		 * Therefore the area is not full.
		 */
		if (de.new_chunk == 0LL) {
			ps->current_committed = i;
			*full = 0;
			break;
		}

		/*
		 * Keep track of the start of the free chunks.
		 */
		if (ps->next_free <= de.new_chunk)
			ps->next_free = de.new_chunk + 1;

		/*
		 * Otherwise we add the exception to the snapshot.
		 */
		r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
		if (r)
			return r;
	}

	return 0;
}
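
/*
 * A side effect of the scan above is that ps->next_free ends up one
 * past the highest new_chunk found on disk, so exceptions prepared
 * after a reload continue where the existing data ends.
 */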

static int read_exceptions(struct pstore *ps)
{
	uint32_t area;
	int r, full = 1;

	/*
	 * Keep reading areas and inserting exceptions until
	 * we find a partially full area.
	 */
	for (area = 0; full; area++) {
		r = area_io(ps, area, READ);
		if (r)
			return r;

		r = insert_exceptions(ps, &full);
		if (r)
			return r;
	}

	return 0;
}

static inline struct pstore *get_info(struct exception_store *store)
{
	return (struct pstore *) store->context;
}

static void persistent_fraction_full(struct exception_store *store,
				     sector_t *numerator, sector_t *denominator)
{
	*numerator = get_info(store)->next_free * store->snap->chunk_size;
	*denominator = get_dev_size(store->snap->cow->bdev);
}

static void persistent_destroy(struct exception_store *store)
{
	struct pstore *ps = get_info(store);

	dm_io_put(sectors_to_pages(ps->chunk_size));
	vfree(ps->callbacks);
	free_area(ps);
	kfree(ps);
}

static int persistent_read_metadata(struct exception_store *store)
{
	int r, new_snapshot;
	struct pstore *ps = get_info(store);

	/*
	 * Read the snapshot header.
	 */
	r = read_header(ps, &new_snapshot);
	if (r)
		return r;

	/*
	 * Do we need to set up a new snapshot?
	 */
	if (new_snapshot) {
		r = write_header(ps);
		if (r) {
			DMWARN("write_header failed");
			return r;
		}

		r = zero_area(ps, 0);
		if (r) {
			DMWARN("zero_area(0) failed");
			return r;
		}

	} else {
		/*
		 * Sanity checks.
		 */
		if (!ps->valid) {
			DMWARN("snapshot is marked invalid");
			return -EINVAL;
		}

		if (ps->version != SNAPSHOT_DISK_VERSION) {
			DMWARN("unable to handle snapshot disk version %d",
			       ps->version);
			return -EINVAL;
		}

		/*
		 * Read the metadata.
		 */
		r = read_exceptions(ps);
		if (r)
			return r;
	}

	return 0;
}

static int persistent_prepare(struct exception_store *store,
			      struct exception *e)
{
	struct pstore *ps = get_info(store);
	uint32_t stride;
	sector_t size = get_dev_size(store->snap->cow->bdev);

	/* Is there enough room? */
	if (size < ((ps->next_free + 1) * store->snap->chunk_size))
		return -ENOSPC;

	e->new_chunk = ps->next_free;

	/*
	 * Move on to the next free chunk, making sure to take
	 * into account the location of the metadata chunks.
	 */
	stride = (ps->exceptions_per_area + 1);
	if ((++ps->next_free % stride) == 1)
		ps->next_free++;

	atomic_inc(&ps->pending_count);
	return 0;
}
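
/*
 * Chunks at offsets 1, 1 + stride, 1 + 2 * stride, ... hold metadata,
 * which is what the modulo test above skips.  With 256 exceptions per
 * area (stride 257), handing out chunk 257 bumps next_free to 259,
 * because chunk 258 is metadata area 1.
 */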

static void persistent_commit(struct exception_store *store,
			      struct exception *e,
			      void (*callback) (void *, int success),
			      void *callback_context)
{
	int r;
	unsigned int i;
	struct pstore *ps = get_info(store);
	struct disk_exception de;
	struct commit_callback *cb;

	de.old_chunk = e->old_chunk;
	de.new_chunk = e->new_chunk;
	write_exception(ps, ps->current_committed++, &de);

	/*
	 * Add the callback to the back of the array.  This code
	 * is the only place where the callback array is
	 * manipulated, and we know that it will never be called
	 * multiple times concurrently.
	 */
	cb = ps->callbacks + ps->callback_count++;
	cb->callback = callback;
	cb->context = callback_context;

	/*
	 * If there are no more exceptions in flight, or we have
	 * filled this metadata area, we commit the exceptions to
	 * disk.
	 */
	if (atomic_dec_and_test(&ps->pending_count) ||
	    (ps->current_committed == ps->exceptions_per_area)) {
		r = area_io(ps, ps->current_area, WRITE);
		if (r)
			ps->valid = 0;

		for (i = 0; i < ps->callback_count; i++) {
			cb = ps->callbacks + i;
			cb->callback(cb->context, r == 0 ? 1 : 0);
		}

		ps->callback_count = 0;
	}

	/*
	 * Have we completely filled the current area?
	 */
	if (ps->current_committed == ps->exceptions_per_area) {
		ps->current_committed = 0;
		r = zero_area(ps, ps->current_area + 1);
		if (r)
			ps->valid = 0;
	}
}
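
/*
 * Each callback registered above is invoked once, when the batch
 * containing its exception is flushed to disk.  A minimal sketch of
 * what a caller might supply (the names below are hypothetical, not
 * taken from this file):
 *
 *	static void copy_callback(void *context, int success)
 *	{
 *		struct pending_exception *pe = context;
 *
 *		if (!success)
 *			invalidate_snapshot(pe->snap);
 *		release_pending_bios(pe);
 *	}
 */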

static void persistent_drop(struct exception_store *store)
{
	struct pstore *ps = get_info(store);

	ps->valid = 0;
	if (write_header(ps))
		DMWARN("write header failed");
}

int dm_create_persistent(struct exception_store *store, uint32_t chunk_size)
{
	int r;
	struct pstore *ps;

	r = dm_io_get(sectors_to_pages(chunk_size));
	if (r)
		return r;

	/* allocate the pstore */
	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
	if (!ps) {
		r = -ENOMEM;
		goto bad;
	}

	ps->snap = store->snap;
	ps->valid = 1;
	ps->version = SNAPSHOT_DISK_VERSION;
	ps->chunk_size = chunk_size;
	ps->exceptions_per_area = (chunk_size << SECTOR_SHIFT) /
	    sizeof(struct disk_exception);
	ps->next_free = 2;	/* skipping the header and first area */
	ps->current_committed = 0;

	r = alloc_area(ps);
	if (r)
		goto bad;

	/*
	 * Allocate space for all the callbacks.
	 */
	ps->callback_count = 0;
	atomic_set(&ps->pending_count, 0);
	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
				   sizeof(*ps->callbacks));

	if (!ps->callbacks) {
		r = -ENOMEM;
		goto bad;
	}

	store->destroy = persistent_destroy;
	store->read_metadata = persistent_read_metadata;
	store->prepare_exception = persistent_prepare;
	store->commit_exception = persistent_commit;
	store->drop_snapshot = persistent_drop;
	store->fraction_full = persistent_fraction_full;
	store->context = ps;

	return 0;

bad:
	dm_io_put(sectors_to_pages(chunk_size));
	if (ps) {
		if (ps->area)
			free_area(ps);

		kfree(ps);
	}
	return r;
}
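
/*
 * A minimal usage sketch, assuming a caller shaped like the snapshot
 * target (the names 's' and 'chunk_size' here are hypothetical, not
 * taken from this file):
 *
 *	s->store.snap = s;
 *	r = dm_create_persistent(&s->store, s->chunk_size);
 *	if (!r)
 *		r = s->store.read_metadata(&s->store);
 *
 * Unlike dm_create_transient() below, this constructor does not zero
 * the store, and it reads store->snap, so the caller must fill that
 * in first.
 */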

/*-----------------------------------------------------------------
 * Implementation of the store for non-persistent snapshots.
 *---------------------------------------------------------------*/
struct transient_c {
	sector_t next_free;
};

static void transient_destroy(struct exception_store *store)
{
	kfree(store->context);
}

static int transient_read_metadata(struct exception_store *store)
{
	return 0;
}

static int transient_prepare(struct exception_store *store, struct exception *e)
{
	struct transient_c *tc = (struct transient_c *) store->context;
	sector_t size = get_dev_size(store->snap->cow->bdev);

	if (size < (tc->next_free + store->snap->chunk_size))
		return -ENOSPC;

	e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
	tc->next_free += store->snap->chunk_size;

	return 0;
}
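
/*
 * For example, with a chunk_size of 8 sectors, successive calls hand
 * out new_chunk values 0, 1, 2, ... as tc->next_free advances in
 * 8-sector steps; sector_to_chunk() (from dm-snap.h) converts the
 * sector offset into a chunk number.
 */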

static void transient_commit(struct exception_store *store,
			     struct exception *e,
			     void (*callback) (void *, int success),
			     void *callback_context)
{
	/* Just succeed */
	callback(callback_context, 1);
}

static void transient_fraction_full(struct exception_store *store,
				    sector_t *numerator, sector_t *denominator)
{
	*numerator = ((struct transient_c *) store->context)->next_free;
	*denominator = get_dev_size(store->snap->cow->bdev);
}

int dm_create_transient(struct exception_store *store,
			struct dm_snapshot *s, int blocksize)
{
	struct transient_c *tc;

	memset(store, 0, sizeof(*store));
	store->destroy = transient_destroy;
	store->read_metadata = transient_read_metadata;
	store->prepare_exception = transient_prepare;
	store->commit_exception = transient_commit;
	store->fraction_full = transient_fraction_full;
	store->snap = s;

	tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
	if (!tc)
		return -ENOMEM;

	tc->next_free = 0;
	store->context = tc;

	return 0;
}
649