1 /* 2 * Copyright (C) 2001-2002 Sistina Software (UK) Limited. 3 * Copyright (C) 2006-2008 Red Hat GmbH 4 * 5 * This file is released under the GPL. 6 */ 7 8 #include "dm-exception-store.h" 9 10 #include <linux/mm.h> 11 #include <linux/pagemap.h> 12 #include <linux/vmalloc.h> 13 #include <linux/slab.h> 14 #include <linux/dm-io.h> 15 16 #define DM_MSG_PREFIX "persistent snapshot" 17 #define DM_CHUNK_SIZE_DEFAULT_SECTORS 32 /* 16KB */ 18 19 /*----------------------------------------------------------------- 20 * Persistent snapshots, by persistent we mean that the snapshot 21 * will survive a reboot. 22 *---------------------------------------------------------------*/ 23 24 /* 25 * We need to store a record of which parts of the origin have 26 * been copied to the snapshot device. The snapshot code 27 * requires that we copy exception chunks to chunk aligned areas 28 * of the COW store. It makes sense therefore, to store the 29 * metadata in chunk size blocks. 30 * 31 * There is no backward or forward compatibility implemented, 32 * snapshots with different disk versions than the kernel will 33 * not be usable. It is expected that "lvcreate" will blank out 34 * the start of a fresh COW device before calling the snapshot 35 * constructor. 36 * 37 * The first chunk of the COW device just contains the header. 38 * After this there is a chunk filled with exception metadata, 39 * followed by as many exception chunks as can fit in the 40 * metadata areas. 41 * 42 * All on disk structures are in little-endian format. The end 43 * of the exceptions info is indicated by an exception with a 44 * new_chunk of 0, which is invalid since it would point to the 45 * header chunk. 46 */ 47 48 /* 49 * Magic for persistent snapshots: "SnAp" - Feeble isn't it. 50 */ 51 #define SNAP_MAGIC 0x70416e53 52 53 /* 54 * The on-disk version of the metadata. 55 */ 56 #define SNAPSHOT_DISK_VERSION 1 57 58 struct disk_header { 59 uint32_t magic; 60 61 /* 62 * Is this snapshot valid. There is no way of recovering 63 * an invalid snapshot. 64 */ 65 uint32_t valid; 66 67 /* 68 * Simple, incrementing version. no backward 69 * compatibility. 70 */ 71 uint32_t version; 72 73 /* In sectors */ 74 uint32_t chunk_size; 75 }; 76 77 struct disk_exception { 78 uint64_t old_chunk; 79 uint64_t new_chunk; 80 }; 81 82 struct commit_callback { 83 void (*callback)(void *, int success); 84 void *context; 85 }; 86 87 /* 88 * The top level structure for a persistent exception store. 89 */ 90 struct pstore { 91 struct dm_exception_store *store; 92 int version; 93 int valid; 94 uint32_t exceptions_per_area; 95 96 /* 97 * Now that we have an asynchronous kcopyd there is no 98 * need for large chunk sizes, so it wont hurt to have a 99 * whole chunks worth of metadata in memory at once. 100 */ 101 void *area; 102 103 /* 104 * An area of zeros used to clear the next area. 105 */ 106 void *zero_area; 107 108 /* 109 * Used to keep track of which metadata area the data in 110 * 'chunk' refers to. 111 */ 112 chunk_t current_area; 113 114 /* 115 * The next free chunk for an exception. 116 */ 117 chunk_t next_free; 118 119 /* 120 * The index of next free exception in the current 121 * metadata area. 122 */ 123 uint32_t current_committed; 124 125 atomic_t pending_count; 126 uint32_t callback_count; 127 struct commit_callback *callbacks; 128 struct dm_io_client *io_client; 129 130 struct workqueue_struct *metadata_wq; 131 }; 132 133 static unsigned sectors_to_pages(unsigned sectors) 134 { 135 return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9); 136 } 137 138 static int alloc_area(struct pstore *ps) 139 { 140 int r = -ENOMEM; 141 size_t len; 142 143 len = ps->store->chunk_size << SECTOR_SHIFT; 144 145 /* 146 * Allocate the chunk_size block of memory that will hold 147 * a single metadata area. 148 */ 149 ps->area = vmalloc(len); 150 if (!ps->area) 151 return r; 152 153 ps->zero_area = vmalloc(len); 154 if (!ps->zero_area) { 155 vfree(ps->area); 156 return r; 157 } 158 memset(ps->zero_area, 0, len); 159 160 return 0; 161 } 162 163 static void free_area(struct pstore *ps) 164 { 165 if (ps->area) 166 vfree(ps->area); 167 ps->area = NULL; 168 169 if (ps->zero_area) 170 vfree(ps->zero_area); 171 ps->zero_area = NULL; 172 } 173 174 struct mdata_req { 175 struct dm_io_region *where; 176 struct dm_io_request *io_req; 177 struct work_struct work; 178 int result; 179 }; 180 181 static void do_metadata(struct work_struct *work) 182 { 183 struct mdata_req *req = container_of(work, struct mdata_req, work); 184 185 req->result = dm_io(req->io_req, 1, req->where, NULL); 186 } 187 188 /* 189 * Read or write a chunk aligned and sized block of data from a device. 190 */ 191 static int chunk_io(struct pstore *ps, chunk_t chunk, int rw, int metadata) 192 { 193 struct dm_io_region where = { 194 .bdev = ps->store->cow->bdev, 195 .sector = ps->store->chunk_size * chunk, 196 .count = ps->store->chunk_size, 197 }; 198 struct dm_io_request io_req = { 199 .bi_rw = rw, 200 .mem.type = DM_IO_VMA, 201 .mem.ptr.vma = ps->area, 202 .client = ps->io_client, 203 .notify.fn = NULL, 204 }; 205 struct mdata_req req; 206 207 if (!metadata) 208 return dm_io(&io_req, 1, &where, NULL); 209 210 req.where = &where; 211 req.io_req = &io_req; 212 213 /* 214 * Issue the synchronous I/O from a different thread 215 * to avoid generic_make_request recursion. 216 */ 217 INIT_WORK(&req.work, do_metadata); 218 queue_work(ps->metadata_wq, &req.work); 219 flush_workqueue(ps->metadata_wq); 220 221 return req.result; 222 } 223 224 /* 225 * Convert a metadata area index to a chunk index. 226 */ 227 static chunk_t area_location(struct pstore *ps, chunk_t area) 228 { 229 return 1 + ((ps->exceptions_per_area + 1) * area); 230 } 231 232 /* 233 * Read or write a metadata area. Remembering to skip the first 234 * chunk which holds the header. 235 */ 236 static int area_io(struct pstore *ps, int rw) 237 { 238 int r; 239 chunk_t chunk; 240 241 chunk = area_location(ps, ps->current_area); 242 243 r = chunk_io(ps, chunk, rw, 0); 244 if (r) 245 return r; 246 247 return 0; 248 } 249 250 static void zero_memory_area(struct pstore *ps) 251 { 252 memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT); 253 } 254 255 static int zero_disk_area(struct pstore *ps, chunk_t area) 256 { 257 struct dm_io_region where = { 258 .bdev = ps->store->cow->bdev, 259 .sector = ps->store->chunk_size * area_location(ps, area), 260 .count = ps->store->chunk_size, 261 }; 262 struct dm_io_request io_req = { 263 .bi_rw = WRITE, 264 .mem.type = DM_IO_VMA, 265 .mem.ptr.vma = ps->zero_area, 266 .client = ps->io_client, 267 .notify.fn = NULL, 268 }; 269 270 return dm_io(&io_req, 1, &where, NULL); 271 } 272 273 static int read_header(struct pstore *ps, int *new_snapshot) 274 { 275 int r; 276 struct disk_header *dh; 277 chunk_t chunk_size; 278 int chunk_size_supplied = 1; 279 280 /* 281 * Use default chunk size (or hardsect_size, if larger) if none supplied 282 */ 283 if (!ps->store->chunk_size) { 284 ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, 285 bdev_logical_block_size(ps->store->cow->bdev) >> 9); 286 ps->store->chunk_mask = ps->store->chunk_size - 1; 287 ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; 288 chunk_size_supplied = 0; 289 } 290 291 ps->io_client = dm_io_client_create(sectors_to_pages(ps->store-> 292 chunk_size)); 293 if (IS_ERR(ps->io_client)) 294 return PTR_ERR(ps->io_client); 295 296 r = alloc_area(ps); 297 if (r) 298 return r; 299 300 r = chunk_io(ps, 0, READ, 1); 301 if (r) 302 goto bad; 303 304 dh = (struct disk_header *) ps->area; 305 306 if (le32_to_cpu(dh->magic) == 0) { 307 *new_snapshot = 1; 308 return 0; 309 } 310 311 if (le32_to_cpu(dh->magic) != SNAP_MAGIC) { 312 DMWARN("Invalid or corrupt snapshot"); 313 r = -ENXIO; 314 goto bad; 315 } 316 317 *new_snapshot = 0; 318 ps->valid = le32_to_cpu(dh->valid); 319 ps->version = le32_to_cpu(dh->version); 320 chunk_size = le32_to_cpu(dh->chunk_size); 321 322 if (!chunk_size_supplied || ps->store->chunk_size == chunk_size) 323 return 0; 324 325 DMWARN("chunk size %llu in device metadata overrides " 326 "table chunk size of %llu.", 327 (unsigned long long)chunk_size, 328 (unsigned long long)ps->store->chunk_size); 329 330 /* We had a bogus chunk_size. Fix stuff up. */ 331 free_area(ps); 332 333 ps->store->chunk_size = chunk_size; 334 ps->store->chunk_mask = chunk_size - 1; 335 ps->store->chunk_shift = ffs(chunk_size) - 1; 336 337 r = dm_io_client_resize(sectors_to_pages(ps->store->chunk_size), 338 ps->io_client); 339 if (r) 340 return r; 341 342 r = alloc_area(ps); 343 return r; 344 345 bad: 346 free_area(ps); 347 return r; 348 } 349 350 static int write_header(struct pstore *ps) 351 { 352 struct disk_header *dh; 353 354 memset(ps->area, 0, ps->store->chunk_size << SECTOR_SHIFT); 355 356 dh = (struct disk_header *) ps->area; 357 dh->magic = cpu_to_le32(SNAP_MAGIC); 358 dh->valid = cpu_to_le32(ps->valid); 359 dh->version = cpu_to_le32(ps->version); 360 dh->chunk_size = cpu_to_le32(ps->store->chunk_size); 361 362 return chunk_io(ps, 0, WRITE, 1); 363 } 364 365 /* 366 * Access functions for the disk exceptions, these do the endian conversions. 367 */ 368 static struct disk_exception *get_exception(struct pstore *ps, uint32_t index) 369 { 370 BUG_ON(index >= ps->exceptions_per_area); 371 372 return ((struct disk_exception *) ps->area) + index; 373 } 374 375 static void read_exception(struct pstore *ps, 376 uint32_t index, struct disk_exception *result) 377 { 378 struct disk_exception *e = get_exception(ps, index); 379 380 /* copy it */ 381 result->old_chunk = le64_to_cpu(e->old_chunk); 382 result->new_chunk = le64_to_cpu(e->new_chunk); 383 } 384 385 static void write_exception(struct pstore *ps, 386 uint32_t index, struct disk_exception *de) 387 { 388 struct disk_exception *e = get_exception(ps, index); 389 390 /* copy it */ 391 e->old_chunk = cpu_to_le64(de->old_chunk); 392 e->new_chunk = cpu_to_le64(de->new_chunk); 393 } 394 395 /* 396 * Registers the exceptions that are present in the current area. 397 * 'full' is filled in to indicate if the area has been 398 * filled. 399 */ 400 static int insert_exceptions(struct pstore *ps, 401 int (*callback)(void *callback_context, 402 chunk_t old, chunk_t new), 403 void *callback_context, 404 int *full) 405 { 406 int r; 407 unsigned int i; 408 struct disk_exception de; 409 410 /* presume the area is full */ 411 *full = 1; 412 413 for (i = 0; i < ps->exceptions_per_area; i++) { 414 read_exception(ps, i, &de); 415 416 /* 417 * If the new_chunk is pointing at the start of 418 * the COW device, where the first metadata area 419 * is we know that we've hit the end of the 420 * exceptions. Therefore the area is not full. 421 */ 422 if (de.new_chunk == 0LL) { 423 ps->current_committed = i; 424 *full = 0; 425 break; 426 } 427 428 /* 429 * Keep track of the start of the free chunks. 430 */ 431 if (ps->next_free <= de.new_chunk) 432 ps->next_free = de.new_chunk + 1; 433 434 /* 435 * Otherwise we add the exception to the snapshot. 436 */ 437 r = callback(callback_context, de.old_chunk, de.new_chunk); 438 if (r) 439 return r; 440 } 441 442 return 0; 443 } 444 445 static int read_exceptions(struct pstore *ps, 446 int (*callback)(void *callback_context, chunk_t old, 447 chunk_t new), 448 void *callback_context) 449 { 450 int r, full = 1; 451 452 /* 453 * Keeping reading chunks and inserting exceptions until 454 * we find a partially full area. 455 */ 456 for (ps->current_area = 0; full; ps->current_area++) { 457 r = area_io(ps, READ); 458 if (r) 459 return r; 460 461 r = insert_exceptions(ps, callback, callback_context, &full); 462 if (r) 463 return r; 464 } 465 466 ps->current_area--; 467 468 return 0; 469 } 470 471 static struct pstore *get_info(struct dm_exception_store *store) 472 { 473 return (struct pstore *) store->context; 474 } 475 476 static void persistent_fraction_full(struct dm_exception_store *store, 477 sector_t *numerator, sector_t *denominator) 478 { 479 *numerator = get_info(store)->next_free * store->chunk_size; 480 *denominator = get_dev_size(store->cow->bdev); 481 } 482 483 static void persistent_dtr(struct dm_exception_store *store) 484 { 485 struct pstore *ps = get_info(store); 486 487 destroy_workqueue(ps->metadata_wq); 488 489 /* Created in read_header */ 490 if (ps->io_client) 491 dm_io_client_destroy(ps->io_client); 492 free_area(ps); 493 494 /* Allocated in persistent_read_metadata */ 495 if (ps->callbacks) 496 vfree(ps->callbacks); 497 498 kfree(ps); 499 } 500 501 static int persistent_read_metadata(struct dm_exception_store *store, 502 int (*callback)(void *callback_context, 503 chunk_t old, chunk_t new), 504 void *callback_context) 505 { 506 int r, uninitialized_var(new_snapshot); 507 struct pstore *ps = get_info(store); 508 509 /* 510 * Read the snapshot header. 511 */ 512 r = read_header(ps, &new_snapshot); 513 if (r) 514 return r; 515 516 /* 517 * Now we know correct chunk_size, complete the initialisation. 518 */ 519 ps->exceptions_per_area = (ps->store->chunk_size << SECTOR_SHIFT) / 520 sizeof(struct disk_exception); 521 ps->callbacks = dm_vcalloc(ps->exceptions_per_area, 522 sizeof(*ps->callbacks)); 523 if (!ps->callbacks) 524 return -ENOMEM; 525 526 /* 527 * Do we need to setup a new snapshot ? 528 */ 529 if (new_snapshot) { 530 r = write_header(ps); 531 if (r) { 532 DMWARN("write_header failed"); 533 return r; 534 } 535 536 ps->current_area = 0; 537 zero_memory_area(ps); 538 r = zero_disk_area(ps, 0); 539 if (r) { 540 DMWARN("zero_disk_area(0) failed"); 541 return r; 542 } 543 } else { 544 /* 545 * Sanity checks. 546 */ 547 if (ps->version != SNAPSHOT_DISK_VERSION) { 548 DMWARN("unable to handle snapshot disk version %d", 549 ps->version); 550 return -EINVAL; 551 } 552 553 /* 554 * Metadata are valid, but snapshot is invalidated 555 */ 556 if (!ps->valid) 557 return 1; 558 559 /* 560 * Read the metadata. 561 */ 562 r = read_exceptions(ps, callback, callback_context); 563 if (r) 564 return r; 565 } 566 567 return 0; 568 } 569 570 static int persistent_prepare_exception(struct dm_exception_store *store, 571 struct dm_snap_exception *e) 572 { 573 struct pstore *ps = get_info(store); 574 uint32_t stride; 575 chunk_t next_free; 576 sector_t size = get_dev_size(store->cow->bdev); 577 578 /* Is there enough room ? */ 579 if (size < ((ps->next_free + 1) * store->chunk_size)) 580 return -ENOSPC; 581 582 e->new_chunk = ps->next_free; 583 584 /* 585 * Move onto the next free pending, making sure to take 586 * into account the location of the metadata chunks. 587 */ 588 stride = (ps->exceptions_per_area + 1); 589 next_free = ++ps->next_free; 590 if (sector_div(next_free, stride) == 1) 591 ps->next_free++; 592 593 atomic_inc(&ps->pending_count); 594 return 0; 595 } 596 597 static void persistent_commit_exception(struct dm_exception_store *store, 598 struct dm_snap_exception *e, 599 void (*callback) (void *, int success), 600 void *callback_context) 601 { 602 unsigned int i; 603 struct pstore *ps = get_info(store); 604 struct disk_exception de; 605 struct commit_callback *cb; 606 607 de.old_chunk = e->old_chunk; 608 de.new_chunk = e->new_chunk; 609 write_exception(ps, ps->current_committed++, &de); 610 611 /* 612 * Add the callback to the back of the array. This code 613 * is the only place where the callback array is 614 * manipulated, and we know that it will never be called 615 * multiple times concurrently. 616 */ 617 cb = ps->callbacks + ps->callback_count++; 618 cb->callback = callback; 619 cb->context = callback_context; 620 621 /* 622 * If there are exceptions in flight and we have not yet 623 * filled this metadata area there's nothing more to do. 624 */ 625 if (!atomic_dec_and_test(&ps->pending_count) && 626 (ps->current_committed != ps->exceptions_per_area)) 627 return; 628 629 /* 630 * If we completely filled the current area, then wipe the next one. 631 */ 632 if ((ps->current_committed == ps->exceptions_per_area) && 633 zero_disk_area(ps, ps->current_area + 1)) 634 ps->valid = 0; 635 636 /* 637 * Commit exceptions to disk. 638 */ 639 if (ps->valid && area_io(ps, WRITE)) 640 ps->valid = 0; 641 642 /* 643 * Advance to the next area if this one is full. 644 */ 645 if (ps->current_committed == ps->exceptions_per_area) { 646 ps->current_committed = 0; 647 ps->current_area++; 648 zero_memory_area(ps); 649 } 650 651 for (i = 0; i < ps->callback_count; i++) { 652 cb = ps->callbacks + i; 653 cb->callback(cb->context, ps->valid); 654 } 655 656 ps->callback_count = 0; 657 } 658 659 static void persistent_drop_snapshot(struct dm_exception_store *store) 660 { 661 struct pstore *ps = get_info(store); 662 663 ps->valid = 0; 664 if (write_header(ps)) 665 DMWARN("write header failed"); 666 } 667 668 static int persistent_ctr(struct dm_exception_store *store, 669 unsigned argc, char **argv) 670 { 671 struct pstore *ps; 672 673 /* allocate the pstore */ 674 ps = kzalloc(sizeof(*ps), GFP_KERNEL); 675 if (!ps) 676 return -ENOMEM; 677 678 ps->store = store; 679 ps->valid = 1; 680 ps->version = SNAPSHOT_DISK_VERSION; 681 ps->area = NULL; 682 ps->next_free = 2; /* skipping the header and first area */ 683 ps->current_committed = 0; 684 685 ps->callback_count = 0; 686 atomic_set(&ps->pending_count, 0); 687 ps->callbacks = NULL; 688 689 ps->metadata_wq = create_singlethread_workqueue("ksnaphd"); 690 if (!ps->metadata_wq) { 691 kfree(ps); 692 DMERR("couldn't start header metadata update thread"); 693 return -ENOMEM; 694 } 695 696 store->context = ps; 697 698 return 0; 699 } 700 701 static unsigned persistent_status(struct dm_exception_store *store, 702 status_type_t status, char *result, 703 unsigned maxlen) 704 { 705 unsigned sz = 0; 706 707 switch (status) { 708 case STATUSTYPE_INFO: 709 break; 710 case STATUSTYPE_TABLE: 711 DMEMIT(" %s P %llu", store->cow->name, 712 (unsigned long long)store->chunk_size); 713 } 714 715 return sz; 716 } 717 718 static struct dm_exception_store_type _persistent_type = { 719 .name = "persistent", 720 .module = THIS_MODULE, 721 .ctr = persistent_ctr, 722 .dtr = persistent_dtr, 723 .read_metadata = persistent_read_metadata, 724 .prepare_exception = persistent_prepare_exception, 725 .commit_exception = persistent_commit_exception, 726 .drop_snapshot = persistent_drop_snapshot, 727 .fraction_full = persistent_fraction_full, 728 .status = persistent_status, 729 }; 730 731 static struct dm_exception_store_type _persistent_compat_type = { 732 .name = "P", 733 .module = THIS_MODULE, 734 .ctr = persistent_ctr, 735 .dtr = persistent_dtr, 736 .read_metadata = persistent_read_metadata, 737 .prepare_exception = persistent_prepare_exception, 738 .commit_exception = persistent_commit_exception, 739 .drop_snapshot = persistent_drop_snapshot, 740 .fraction_full = persistent_fraction_full, 741 .status = persistent_status, 742 }; 743 744 int dm_persistent_snapshot_init(void) 745 { 746 int r; 747 748 r = dm_exception_store_type_register(&_persistent_type); 749 if (r) { 750 DMERR("Unable to register persistent exception store type"); 751 return r; 752 } 753 754 r = dm_exception_store_type_register(&_persistent_compat_type); 755 if (r) { 756 DMERR("Unable to register old-style persistent exception " 757 "store type"); 758 dm_exception_store_type_unregister(&_persistent_type); 759 return r; 760 } 761 762 return r; 763 } 764 765 void dm_persistent_snapshot_exit(void) 766 { 767 dm_exception_store_type_unregister(&_persistent_type); 768 dm_exception_store_type_unregister(&_persistent_compat_type); 769 } 770