/*
 * dm-exception-store.c
 *
 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
 * Copyright (C) 2006 Red Hat GmbH
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-snap.h"

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>

#define DM_MSG_PREFIX "snapshots"
#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32	/* 16KB */

/*-----------------------------------------------------------------
 * Persistent snapshots: by persistent we mean that the snapshot
 * will survive a reboot.
 *---------------------------------------------------------------*/

/*
 * We need to store a record of which parts of the origin have
 * been copied to the snapshot device.  The snapshot code
 * requires that we copy exception chunks to chunk-aligned areas
 * of the COW store.  It makes sense, therefore, to store the
 * metadata in chunk-size blocks.
 *
 * There is no backward or forward compatibility implemented;
 * snapshots with different disk versions than the kernel will
 * not be usable.  It is expected that "lvcreate" will blank out
 * the start of a fresh COW device before calling the snapshot
 * constructor.
 *
 * The first chunk of the COW device just contains the header.
 * After this there is a chunk filled with exception metadata,
 * followed by as many exception chunks as can fit in the
 * metadata areas.
 *
 * All on-disk structures are in little-endian format.  The end
 * of the exceptions info is indicated by an exception with a
 * new_chunk of 0, which is invalid since it would point to the
 * header chunk.
 */

/*
 * Magic for persistent snapshots: "SnAp" - feeble, isn't it?
 */
#define SNAP_MAGIC 0x70416e53

/*
 * The on-disk version of the metadata.
 */
#define SNAPSHOT_DISK_VERSION 1

struct disk_header {
	uint32_t magic;

	/*
	 * Is this snapshot valid?  There is no way of recovering
	 * an invalid snapshot.
	 */
	uint32_t valid;

	/*
	 * Simple, incrementing version.  No backward
	 * compatibility.
	 */
	uint32_t version;

	/* In sectors */
	uint32_t chunk_size;
};

struct disk_exception {
	uint64_t old_chunk;
	uint64_t new_chunk;
};

struct commit_callback {
	void (*callback)(void *, int success);
	void *context;
};

/*
 * The top level structure for a persistent exception store.
 */
struct pstore {
	struct dm_snapshot *snap;	/* up pointer to my snapshot */
	int version;
	int valid;
	uint32_t exceptions_per_area;

	/*
	 * Now that we have an asynchronous kcopyd there is no
	 * need for large chunk sizes, so it won't hurt to have a
	 * whole chunk's worth of metadata in memory at once.
	 */
	void *area;

	/*
	 * Used to keep track of which metadata area the data in
	 * 'chunk' refers to.
	 */
	chunk_t current_area;

	/*
	 * The next free chunk for an exception.
	 */
	chunk_t next_free;

	/*
	 * The index of the next free exception in the current
	 * metadata area.
	 */
	uint32_t current_committed;

	atomic_t pending_count;
	uint32_t callback_count;
	struct commit_callback *callbacks;
	struct dm_io_client *io_client;

	struct workqueue_struct *metadata_wq;
};

static unsigned sectors_to_pages(unsigned sectors)
{
	return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9);
}

static int alloc_area(struct pstore *ps)
{
	int r = -ENOMEM;
	size_t len;

	len = ps->snap->chunk_size << SECTOR_SHIFT;

	/*
	 * Allocate the chunk_size block of memory that will hold
	 * a single metadata area.
	 */
	ps->area = vmalloc(len);
	if (!ps->area)
		return r;

	return 0;
}

static void free_area(struct pstore *ps)
{
	vfree(ps->area);
	ps->area = NULL;
}

struct mdata_req {
	struct dm_io_region *where;
	struct dm_io_request *io_req;
	struct work_struct work;
	int result;
};

static void do_metadata(struct work_struct *work)
{
	struct mdata_req *req = container_of(work, struct mdata_req, work);

	req->result = dm_io(req->io_req, 1, req->where, NULL);
}

/*
 * Read or write a chunk aligned and sized block of data from a device.
 */
static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata)
{
	struct dm_io_region where = {
		.bdev = ps->snap->cow->bdev,
		.sector = ps->snap->chunk_size * chunk,
		.count = ps->snap->chunk_size,
	};
	struct dm_io_request io_req = {
		.bi_rw = rw,
		.mem.type = DM_IO_VMA,
		.mem.ptr.vma = ps->area,
		.client = ps->io_client,
		.notify.fn = NULL,
	};
	struct mdata_req req;

	if (!metadata)
		return dm_io(&io_req, 1, &where, NULL);

	req.where = &where;
	req.io_req = &io_req;

	/*
	 * Issue the synchronous I/O from a different thread
	 * to avoid generic_make_request recursion.
	 */
	INIT_WORK(&req.work, do_metadata);
	queue_work(ps->metadata_wq, &req.work);
	flush_workqueue(ps->metadata_wq);

	return req.result;
}

/*
 * Convert a metadata area index to a chunk index.
 */
static chunk_t area_location(struct pstore *ps, chunk_t area)
{
	return 1 + ((ps->exceptions_per_area + 1) * area);
}
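
/*
 * For illustration, the chunk layout this implies (assuming
 * exceptions_per_area == n):
 *
 *	chunk 0			header (struct disk_header)
 *	chunk 1			metadata area 0
 *	chunks 2 .. n + 1	data chunks allocated against area 0
 *	chunk n + 2		metadata area 1
 *	...
 *
 * i.e. metadata areas sit at every (n + 1)th chunk starting from
 * chunk 1, which is exactly what area_location() computes.
 */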

/*
 * Read or write a metadata area, remembering to skip the first
 * chunk, which holds the header.
 */
static int area_io(struct pstore *ps, chunk_t area, int rw)
{
	int r;
	chunk_t chunk;

	chunk = area_location(ps, area);

	r = chunk_io(ps, chunk, rw, 0);
	if (r)
		return r;

	ps->current_area = area;
	return 0;
}

static int zero_area(struct pstore *ps, chunk_t area)
{
	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
	return area_io(ps, area, WRITE);
}

static int read_header(struct pstore *ps, int *new_snapshot)
{
	int r;
	struct disk_header *dh;
	chunk_t chunk_size;
	int chunk_size_supplied = 1;

	/*
	 * Use the default chunk size (or hardsect_size, if larger) if none supplied.
	 */
	if (!ps->snap->chunk_size) {
		ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
		    bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
		ps->snap->chunk_mask = ps->snap->chunk_size - 1;
		ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
		chunk_size_supplied = 0;
	}

	ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->
							     chunk_size));
	if (IS_ERR(ps->io_client))
		return PTR_ERR(ps->io_client);

	r = alloc_area(ps);
	if (r)
		return r;

	r = chunk_io(ps, 0, READ, 1);
	if (r)
		goto bad;

	dh = (struct disk_header *) ps->area;

	if (le32_to_cpu(dh->magic) == 0) {
		*new_snapshot = 1;
		return 0;
	}

	if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
		DMWARN("Invalid or corrupt snapshot");
		r = -ENXIO;
		goto bad;
	}

	*new_snapshot = 0;
	ps->valid = le32_to_cpu(dh->valid);
	ps->version = le32_to_cpu(dh->version);
	chunk_size = le32_to_cpu(dh->chunk_size);

	if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
		return 0;

	DMWARN("chunk size %llu in device metadata overrides "
	       "table chunk size of %llu.",
	       (unsigned long long)chunk_size,
	       (unsigned long long)ps->snap->chunk_size);

	/* We had a bogus chunk_size. Fix stuff up. */
	free_area(ps);

	ps->snap->chunk_size = chunk_size;
	ps->snap->chunk_mask = chunk_size - 1;
	ps->snap->chunk_shift = ffs(chunk_size) - 1;

	r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
				ps->io_client);
	if (r)
		return r;

	r = alloc_area(ps);
	return r;

bad:
	free_area(ps);
	return r;
}

static int write_header(struct pstore *ps)
{
	struct disk_header *dh;

	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);

	dh = (struct disk_header *) ps->area;
	dh->magic = cpu_to_le32(SNAP_MAGIC);
	dh->valid = cpu_to_le32(ps->valid);
	dh->version = cpu_to_le32(ps->version);
	dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);

	return chunk_io(ps, 0, WRITE, 1);
}

/*
 * Access functions for the disk exceptions; these do the endian conversions.
 */
static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
{
	BUG_ON(index >= ps->exceptions_per_area);

	return ((struct disk_exception *) ps->area) + index;
}

static void read_exception(struct pstore *ps,
			   uint32_t index, struct disk_exception *result)
{
	struct disk_exception *e = get_exception(ps, index);

	/* copy it */
	result->old_chunk = le64_to_cpu(e->old_chunk);
	result->new_chunk = le64_to_cpu(e->new_chunk);
}

static void write_exception(struct pstore *ps,
			    uint32_t index, struct disk_exception *de)
{
	struct disk_exception *e = get_exception(ps, index);

	/* copy it */
	e->old_chunk = cpu_to_le64(de->old_chunk);
	e->new_chunk = cpu_to_le64(de->new_chunk);
}

/*
 * Registers the exceptions that are present in the current area.
 * 'full' is filled in to indicate whether the area has been
 * filled.
 */
static int insert_exceptions(struct pstore *ps, int *full)
{
	int r;
	unsigned int i;
	struct disk_exception de;

	/* presume the area is full */
	*full = 1;

	for (i = 0; i < ps->exceptions_per_area; i++) {
		read_exception(ps, i, &de);

		/*
		 * If the new_chunk is pointing at the start of
		 * the COW device, where the first metadata area
		 * is, we know that we've hit the end of the
		 * exceptions.  Therefore the area is not full.
		 */
		if (de.new_chunk == 0LL) {
			ps->current_committed = i;
			*full = 0;
			break;
		}

		/*
		 * Keep track of the start of the free chunks.
		 */
		if (ps->next_free <= de.new_chunk)
			ps->next_free = de.new_chunk + 1;

		/*
		 * Otherwise we add the exception to the snapshot.
		 */
		r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
		if (r)
			return r;
	}

	return 0;
}

static int read_exceptions(struct pstore *ps)
{
	chunk_t area;
	int r, full = 1;

	/*
	 * Keep reading chunks and inserting exceptions until
	 * we find a partially full area.
	 */
	for (area = 0; full; area++) {
		r = area_io(ps, area, READ);
		if (r)
			return r;

		r = insert_exceptions(ps, &full);
		if (r)
			return r;
	}

	return 0;
}

static struct pstore *get_info(struct exception_store *store)
{
	return (struct pstore *) store->context;
}

static void persistent_fraction_full(struct exception_store *store,
				     sector_t *numerator, sector_t *denominator)
{
	*numerator = get_info(store)->next_free * store->snap->chunk_size;
	*denominator = get_dev_size(store->snap->cow->bdev);
}

static void persistent_destroy(struct exception_store *store)
{
	struct pstore *ps = get_info(store);

	destroy_workqueue(ps->metadata_wq);
	dm_io_client_destroy(ps->io_client);
	vfree(ps->callbacks);
	free_area(ps);
	kfree(ps);
}

static int persistent_read_metadata(struct exception_store *store)
{
	int r, uninitialized_var(new_snapshot);
	struct pstore *ps = get_info(store);

	/*
	 * Read the snapshot header.
	 */
	r = read_header(ps, &new_snapshot);
	if (r)
		return r;

	/*
	 * Now we know the correct chunk_size, complete the initialisation.
	 */
	ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
				  sizeof(struct disk_exception);
	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
				   sizeof(*ps->callbacks));
	if (!ps->callbacks)
		return -ENOMEM;

	/*
	 * Do we need to set up a new snapshot?
	 */
	if (new_snapshot) {
		r = write_header(ps);
		if (r) {
			DMWARN("write_header failed");
			return r;
		}

		r = zero_area(ps, 0);
		if (r) {
			DMWARN("zero_area(0) failed");
			return r;
		}

	} else {
		/*
		 * Sanity checks.
		 */
		if (ps->version != SNAPSHOT_DISK_VERSION) {
			DMWARN("unable to handle snapshot disk version %d",
			       ps->version);
			return -EINVAL;
		}

		/*
		 * Metadata are valid, but the snapshot is invalidated.
		 */
		if (!ps->valid)
			return 1;

		/*
		 * Read the metadata.
		 */
		r = read_exceptions(ps);
		if (r)
			return r;
	}

	return 0;
}

static int persistent_prepare(struct exception_store *store,
			      struct dm_snap_exception *e)
{
	struct pstore *ps = get_info(store);
	uint32_t stride;
	chunk_t next_free;
	sector_t size = get_dev_size(store->snap->cow->bdev);

	/* Is there enough room? */
	if (size < ((ps->next_free + 1) * store->snap->chunk_size))
		return -ENOSPC;

	e->new_chunk = ps->next_free;

	/*
	 * Move onto the next free pending, making sure to take
	 * into account the location of the metadata chunks.
	 */
	stride = (ps->exceptions_per_area + 1);
	next_free = ++ps->next_free;
	if (sector_div(next_free, stride) == 1)
		ps->next_free++;

	atomic_inc(&ps->pending_count);
	return 0;
}
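
/*
 * For illustration only (the numbers below are assumed, not taken from
 * the code): with exceptions_per_area == 4 the stride is 5, so metadata
 * areas occupy chunks 1, 6, 11, ...  persistent_prepare() then hands out
 * data chunks 2, 3, 4, 5, skips chunk 6, continues with 7, and so on.
 */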

static void persistent_commit(struct exception_store *store,
			      struct dm_snap_exception *e,
			      void (*callback) (void *, int success),
			      void *callback_context)
{
	int r;
	unsigned int i;
	struct pstore *ps = get_info(store);
	struct disk_exception de;
	struct commit_callback *cb;

	de.old_chunk = e->old_chunk;
	de.new_chunk = e->new_chunk;
	write_exception(ps, ps->current_committed++, &de);

	/*
	 * Add the callback to the back of the array.  This code
	 * is the only place where the callback array is
	 * manipulated, and we know that it will never be called
	 * multiple times concurrently.
	 */
	cb = ps->callbacks + ps->callback_count++;
	cb->callback = callback;
	cb->context = callback_context;

	/*
	 * If there are no more exceptions in flight, or we have
	 * filled this metadata area, we commit the exceptions to
	 * disk.
	 */
	if (atomic_dec_and_test(&ps->pending_count) ||
	    (ps->current_committed == ps->exceptions_per_area)) {
		r = area_io(ps, ps->current_area, WRITE);
		if (r)
			ps->valid = 0;

		/*
		 * Have we completely filled the current area?
		 */
		if (ps->current_committed == ps->exceptions_per_area) {
			ps->current_committed = 0;
			r = zero_area(ps, ps->current_area + 1);
			if (r)
				ps->valid = 0;
		}

		for (i = 0; i < ps->callback_count; i++) {
			cb = ps->callbacks + i;
			cb->callback(cb->context, r == 0 ? 1 : 0);
		}

		ps->callback_count = 0;
	}
}

static void persistent_drop(struct exception_store *store)
{
	struct pstore *ps = get_info(store);

	ps->valid = 0;
	if (write_header(ps))
		DMWARN("write header failed");
}

int dm_create_persistent(struct exception_store *store)
{
	struct pstore *ps;

	/* allocate the pstore */
	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
	if (!ps)
		return -ENOMEM;

	ps->snap = store->snap;
	ps->valid = 1;
	ps->version = SNAPSHOT_DISK_VERSION;
	ps->area = NULL;
	ps->next_free = 2;	/* skipping the header and first area */
	ps->current_committed = 0;

	ps->callback_count = 0;
	atomic_set(&ps->pending_count, 0);
	ps->callbacks = NULL;

	ps->metadata_wq = create_singlethread_workqueue("ksnaphd");
	if (!ps->metadata_wq) {
		kfree(ps);
		DMERR("couldn't start header metadata update thread");
		return -ENOMEM;
	}

	store->destroy = persistent_destroy;
	store->read_metadata = persistent_read_metadata;
	store->prepare_exception = persistent_prepare;
	store->commit_exception = persistent_commit;
	store->drop_snapshot = persistent_drop;
	store->fraction_full = persistent_fraction_full;
	store->context = ps;

	return 0;
}

/*-----------------------------------------------------------------
 * Implementation of the store for non-persistent snapshots.
 *---------------------------------------------------------------*/
struct transient_c {
	sector_t next_free;
};

static void transient_destroy(struct exception_store *store)
{
	kfree(store->context);
}

static int transient_read_metadata(struct exception_store *store)
{
	return 0;
}

static int transient_prepare(struct exception_store *store,
			     struct dm_snap_exception *e)
{
	struct transient_c *tc = (struct transient_c *) store->context;
	sector_t size = get_dev_size(store->snap->cow->bdev);

	if (size < (tc->next_free + store->snap->chunk_size))
		return -1;

	e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
	tc->next_free += store->snap->chunk_size;

	return 0;
}

static void transient_commit(struct exception_store *store,
			     struct dm_snap_exception *e,
			     void (*callback) (void *, int success),
			     void *callback_context)
{
	/* Just succeed */
	callback(callback_context, 1);
}

static void transient_fraction_full(struct exception_store *store,
				    sector_t *numerator, sector_t *denominator)
{
	*numerator = ((struct transient_c *) store->context)->next_free;
	*denominator = get_dev_size(store->snap->cow->bdev);
}

int dm_create_transient(struct exception_store *store)
{
	struct transient_c *tc;

	store->destroy = transient_destroy;
	store->read_metadata = transient_read_metadata;
	store->prepare_exception = transient_prepare;
	store->commit_exception = transient_commit;
	store->drop_snapshot = NULL;
	store->fraction_full = transient_fraction_full;

	tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
	if (!tc)
		return -ENOMEM;

	tc->next_free = 0;
	store->context = tc;

	return 0;
}
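
/*
 * Usage sketch (illustrative only; the real caller is the snapshot
 * target in dm-snap.c, and the exact call sequence there may differ).
 * The variable names 's', 'e', 'commit_callback' and 'context' below
 * are placeholders:
 *
 *	struct exception_store store;
 *
 *	store.snap = s;					// owning dm_snapshot
 *	r = persistent ? dm_create_persistent(&store)
 *		       : dm_create_transient(&store);
 *	if (!r)
 *		r = store.read_metadata(&store);	// load or initialise the COW
 *
 *	// First write to an uncopied origin chunk:
 *	r = store.prepare_exception(&store, e);		// reserves e->new_chunk
 *	// ... copy the chunk (kcopyd), then ...
 *	store.commit_exception(&store, e, commit_callback, context);
 *
 *	store.destroy(&store);
 */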