/*
 * dm-exception-store.c
 *
 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
 * Copyright (C) 2006 Red Hat GmbH
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-snap.h"

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>

#define DM_MSG_PREFIX "snapshots"
#define DM_CHUNK_SIZE_DEFAULT_SECTORS 32	/* 16KB */

/*-----------------------------------------------------------------
 * Persistent snapshots: by "persistent" we mean that the snapshot
 * will survive a reboot.
 *---------------------------------------------------------------*/

/*
 * We need to store a record of which parts of the origin have
 * been copied to the snapshot device.  The snapshot code
 * requires that we copy exception chunks to chunk-aligned areas
 * of the COW store.  It makes sense, therefore, to store the
 * metadata in chunk-sized blocks.
 *
 * There is no backward or forward compatibility implemented:
 * snapshots with different disk versions than the kernel will
 * not be usable.  It is expected that "lvcreate" will blank out
 * the start of a fresh COW device before calling the snapshot
 * constructor.
 *
 * The first chunk of the COW device just contains the header.
 * After this there is a chunk filled with exception metadata,
 * followed by as many exception chunks as can fit in the
 * metadata areas.
 *
 * All on-disk structures are in little-endian format.  The end
 * of the exceptions info is indicated by an exception with a
 * new_chunk of 0, which is invalid since it would point to the
 * header chunk.
 */

/*
 * Magic for persistent snapshots: "SnAp" - Feeble, isn't it?
 */
#define SNAP_MAGIC 0x70416e53

/*
 * The on-disk version of the metadata.
 */
#define SNAPSHOT_DISK_VERSION 1

struct disk_header {
	uint32_t magic;

	/*
	 * Is this snapshot valid?  There is no way of recovering
	 * an invalid snapshot.
	 */
	uint32_t valid;

	/*
	 * Simple, incrementing version; no backward
	 * compatibility.
	 */
	uint32_t version;

	/* In sectors */
	uint32_t chunk_size;
};

struct disk_exception {
	uint64_t old_chunk;
	uint64_t new_chunk;
};

struct commit_callback {
	void (*callback)(void *, int success);
	void *context;
};

/*
 * The top level structure for a persistent exception store.
 */
struct pstore {
	struct dm_snapshot *snap;	/* up pointer to my snapshot */
	int version;
	int valid;
	uint32_t exceptions_per_area;

	/*
	 * Now that we have an asynchronous kcopyd there is no
	 * need for large chunk sizes, so it won't hurt to have a
	 * whole chunk's worth of metadata in memory at once.
	 */
	void *area;

	/*
	 * Used to keep track of which metadata area the data in
	 * 'area' refers to.
	 */
	uint32_t current_area;

	/*
	 * The next free chunk for an exception.
	 */
	uint32_t next_free;

	/*
	 * The index of the next free exception in the current
	 * metadata area.
	 */
	uint32_t current_committed;

	atomic_t pending_count;
	uint32_t callback_count;
	struct commit_callback *callbacks;
	struct dm_io_client *io_client;

	struct workqueue_struct *metadata_wq;
};
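
/*
 * Worked example of the resulting on-disk layout (illustrative note,
 * not from the original source; assumes the default 32-sector, 16KB
 * chunk size):
 *
 *	sizeof(struct disk_exception) = 16 bytes
 *	exceptions_per_area = 16384 / 16 = 1024
 *
 *	chunk 0		header
 *	chunk 1		metadata for area 0
 *	chunks 2-1025	exception copies for area 0
 *	chunk 1026	metadata for area 1
 *	...
 *
 * In general the metadata chunk for area N sits at chunk
 * 1 + (exceptions_per_area + 1) * N, which is exactly the
 * calculation area_io() performs below.
 */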

static unsigned sectors_to_pages(unsigned sectors)
{
	return DIV_ROUND_UP(sectors, PAGE_SIZE >> 9);
}

static int alloc_area(struct pstore *ps)
{
	int r = -ENOMEM;
	size_t len;

	len = ps->snap->chunk_size << SECTOR_SHIFT;

	/*
	 * Allocate the chunk_size block of memory that will hold
	 * a single metadata area.
	 */
	ps->area = vmalloc(len);
	if (!ps->area)
		return r;

	return 0;
}

static void free_area(struct pstore *ps)
{
	vfree(ps->area);
	ps->area = NULL;
}

struct mdata_req {
	struct dm_io_region *where;
	struct dm_io_request *io_req;
	struct work_struct work;
	int result;
};

static void do_metadata(struct work_struct *work)
{
	struct mdata_req *req = container_of(work, struct mdata_req, work);

	req->result = dm_io(req->io_req, 1, req->where, NULL);
}

/*
 * Read or write a chunk-aligned and chunk-sized block of data from a device.
 */
static int chunk_io(struct pstore *ps, uint32_t chunk, int rw, int metadata)
{
	struct dm_io_region where = {
		.bdev = ps->snap->cow->bdev,
		.sector = ps->snap->chunk_size * chunk,
		.count = ps->snap->chunk_size,
	};
	struct dm_io_request io_req = {
		.bi_rw = rw,
		.mem.type = DM_IO_VMA,
		.mem.ptr.vma = ps->area,
		.client = ps->io_client,
		.notify.fn = NULL,
	};
	struct mdata_req req;

	if (!metadata)
		return dm_io(&io_req, 1, &where, NULL);

	req.where = &where;
	req.io_req = &io_req;

	/*
	 * Issue the synchronous I/O from a different thread
	 * to avoid generic_make_request recursion.
	 */
	INIT_WORK(&req.work, do_metadata);
	queue_work(ps->metadata_wq, &req.work);
	flush_workqueue(ps->metadata_wq);

	return req.result;
}
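
/*
 * Usage sketch (illustrative): every header and metadata-area access
 * in this file funnels through chunk_io(), e.g.
 *
 *	r = chunk_io(ps, 0, READ, 1);	reads the header into ps->area
 *
 * For metadata (metadata = 1) the single-threaded metadata_wq plus
 * flush_workqueue() keeps the call synchronous for the caller while
 * running dm_io() on a fresh stack, avoiding the
 * generic_make_request() recursion noted above.
 */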

/*
 * Read or write a metadata area, remembering to skip the first
 * chunk, which holds the header.
 */
static int area_io(struct pstore *ps, uint32_t area, int rw)
{
	int r;
	uint32_t chunk;

	/* convert a metadata area index to a chunk index */
	chunk = 1 + ((ps->exceptions_per_area + 1) * area);

	r = chunk_io(ps, chunk, rw, 0);
	if (r)
		return r;

	ps->current_area = area;
	return 0;
}

static int zero_area(struct pstore *ps, uint32_t area)
{
	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);
	return area_io(ps, area, WRITE);
}

static int read_header(struct pstore *ps, int *new_snapshot)
{
	int r;
	struct disk_header *dh;
	chunk_t chunk_size;
	int chunk_size_supplied = 1;

	/*
	 * Use the default chunk size (or hardsect_size, if larger)
	 * if none was supplied.
	 */
	if (!ps->snap->chunk_size) {
		ps->snap->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
		    bdev_hardsect_size(ps->snap->cow->bdev) >> 9);
		ps->snap->chunk_mask = ps->snap->chunk_size - 1;
		ps->snap->chunk_shift = ffs(ps->snap->chunk_size) - 1;
		chunk_size_supplied = 0;
	}

	ps->io_client = dm_io_client_create(sectors_to_pages(ps->snap->
							     chunk_size));
	if (IS_ERR(ps->io_client))
		return PTR_ERR(ps->io_client);

	r = alloc_area(ps);
	if (r)
		return r;

	r = chunk_io(ps, 0, READ, 1);
	if (r)
		goto bad;

	dh = (struct disk_header *) ps->area;

	if (le32_to_cpu(dh->magic) == 0) {
		*new_snapshot = 1;
		return 0;
	}

	if (le32_to_cpu(dh->magic) != SNAP_MAGIC) {
		DMWARN("Invalid or corrupt snapshot");
		r = -ENXIO;
		goto bad;
	}

	*new_snapshot = 0;
	ps->valid = le32_to_cpu(dh->valid);
	ps->version = le32_to_cpu(dh->version);
	chunk_size = le32_to_cpu(dh->chunk_size);

	if (!chunk_size_supplied || ps->snap->chunk_size == chunk_size)
		return 0;

	DMWARN("chunk size %llu in device metadata overrides "
	       "table chunk size of %llu.",
	       (unsigned long long)chunk_size,
	       (unsigned long long)ps->snap->chunk_size);

	/* We had a bogus chunk_size. Fix stuff up. */
	free_area(ps);

	ps->snap->chunk_size = chunk_size;
	ps->snap->chunk_mask = chunk_size - 1;
	ps->snap->chunk_shift = ffs(chunk_size) - 1;

	r = dm_io_client_resize(sectors_to_pages(ps->snap->chunk_size),
				ps->io_client);
	if (r)
		return r;

	r = alloc_area(ps);
	return r;

bad:
	free_area(ps);
	return r;
}

static int write_header(struct pstore *ps)
{
	struct disk_header *dh;

	memset(ps->area, 0, ps->snap->chunk_size << SECTOR_SHIFT);

	dh = (struct disk_header *) ps->area;
	dh->magic = cpu_to_le32(SNAP_MAGIC);
	dh->valid = cpu_to_le32(ps->valid);
	dh->version = cpu_to_le32(ps->version);
	dh->chunk_size = cpu_to_le32(ps->snap->chunk_size);

	return chunk_io(ps, 0, WRITE, 1);
}
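
/*
 * Byte-order example (illustrative): SNAP_MAGIC is 0x70416e53, so
 * after cpu_to_le32() the header begins with the bytes
 *
 *	0x53 0x6e 0x41 0x70	i.e. "SnAp"
 *
 * on disk regardless of host endianness, which is why a freshly
 * blanked COW device (magic 0) and a valid one can both be
 * recognised with a plain hexdump of chunk 0.
 */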

/*
 * Access functions for the disk exceptions; these do the endian conversions.
 */
static struct disk_exception *get_exception(struct pstore *ps, uint32_t index)
{
	BUG_ON(index >= ps->exceptions_per_area);

	return ((struct disk_exception *) ps->area) + index;
}

static void read_exception(struct pstore *ps,
			   uint32_t index, struct disk_exception *result)
{
	struct disk_exception *e = get_exception(ps, index);

	/* copy it */
	result->old_chunk = le64_to_cpu(e->old_chunk);
	result->new_chunk = le64_to_cpu(e->new_chunk);
}

static void write_exception(struct pstore *ps,
			    uint32_t index, struct disk_exception *de)
{
	struct disk_exception *e = get_exception(ps, index);

	/* copy it */
	e->old_chunk = cpu_to_le64(de->old_chunk);
	e->new_chunk = cpu_to_le64(de->new_chunk);
}

/*
 * Registers the exceptions that are present in the current area.
 * 'full' is filled in to indicate whether the area has been
 * filled.
 */
static int insert_exceptions(struct pstore *ps, int *full)
{
	int r;
	unsigned int i;
	struct disk_exception de;

	/* presume the area is full */
	*full = 1;

	for (i = 0; i < ps->exceptions_per_area; i++) {
		read_exception(ps, i, &de);

		/*
		 * If the new_chunk is pointing at the start of
		 * the COW device, where the header is, we know
		 * that we've hit the end of the exceptions.
		 * Therefore the area is not full.
		 */
		if (de.new_chunk == 0LL) {
			ps->current_committed = i;
			*full = 0;
			break;
		}

		/*
		 * Keep track of the start of the free chunks.
		 */
		if (ps->next_free <= de.new_chunk)
			ps->next_free = de.new_chunk + 1;

		/*
		 * Otherwise we add the exception to the snapshot.
		 */
		r = dm_add_exception(ps->snap, de.old_chunk, de.new_chunk);
		if (r)
			return r;
	}

	return 0;
}

static int read_exceptions(struct pstore *ps)
{
	uint32_t area;
	int r, full = 1;

	/*
	 * Keep reading chunks and inserting exceptions until
	 * we find a partially full area.
	 */
	for (area = 0; full; area++) {
		r = area_io(ps, area, READ);
		if (r)
			return r;

		r = insert_exceptions(ps, &full);
		if (r)
			return r;
	}

	return 0;
}
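
/*
 * Termination note (illustrative): the unbounded-looking loop in
 * read_exceptions() relies on two invariants.  "lvcreate" blanks the
 * start of a fresh COW device, and zero_area() pre-zeroes the next
 * metadata area whenever one fills up (see persistent_commit()), so
 * the first area that is not completely full always contains an
 * entry with new_chunk == 0, at which point insert_exceptions()
 * clears 'full' and the scan stops.
 */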

static struct pstore *get_info(struct exception_store *store)
{
	return (struct pstore *) store->context;
}

static void persistent_fraction_full(struct exception_store *store,
				     sector_t *numerator, sector_t *denominator)
{
	*numerator = get_info(store)->next_free * store->snap->chunk_size;
	*denominator = get_dev_size(store->snap->cow->bdev);
}

static void persistent_destroy(struct exception_store *store)
{
	struct pstore *ps = get_info(store);

	destroy_workqueue(ps->metadata_wq);
	dm_io_client_destroy(ps->io_client);
	vfree(ps->callbacks);
	free_area(ps);
	kfree(ps);
}

static int persistent_read_metadata(struct exception_store *store)
{
	int r, uninitialized_var(new_snapshot);
	struct pstore *ps = get_info(store);

	/*
	 * Read the snapshot header.
	 */
	r = read_header(ps, &new_snapshot);
	if (r)
		return r;

	/*
	 * Now that we know the correct chunk_size, complete the
	 * initialisation.
	 */
	ps->exceptions_per_area = (ps->snap->chunk_size << SECTOR_SHIFT) /
				  sizeof(struct disk_exception);
	ps->callbacks = dm_vcalloc(ps->exceptions_per_area,
				   sizeof(*ps->callbacks));
	if (!ps->callbacks)
		return -ENOMEM;

	/*
	 * Do we need to set up a new snapshot?
	 */
	if (new_snapshot) {
		r = write_header(ps);
		if (r) {
			DMWARN("write_header failed");
			return r;
		}

		r = zero_area(ps, 0);
		if (r) {
			DMWARN("zero_area(0) failed");
			return r;
		}

	} else {
		/*
		 * Sanity checks.
		 */
		if (ps->version != SNAPSHOT_DISK_VERSION) {
			DMWARN("unable to handle snapshot disk version %d",
			       ps->version);
			return -EINVAL;
		}

		/*
		 * The metadata is valid, but the snapshot has been
		 * invalidated.
		 */
		if (!ps->valid)
			return 1;

		/*
		 * Read the metadata.
		 */
		r = read_exceptions(ps);
		if (r)
			return r;
	}

	return 0;
}

static int persistent_prepare(struct exception_store *store,
			      struct dm_snap_exception *e)
{
	struct pstore *ps = get_info(store);
	uint32_t stride;
	sector_t size = get_dev_size(store->snap->cow->bdev);

	/* Is there enough room? */
	if (size < ((ps->next_free + 1) * store->snap->chunk_size))
		return -ENOSPC;

	e->new_chunk = ps->next_free;

	/*
	 * Move on to the next free chunk, making sure to take
	 * into account the location of the metadata chunks.
	 */
	stride = (ps->exceptions_per_area + 1);
	if ((++ps->next_free % stride) == 1)
		ps->next_free++;

	atomic_inc(&ps->pending_count);
	return 0;
}

static void persistent_commit(struct exception_store *store,
			      struct dm_snap_exception *e,
			      void (*callback) (void *, int success),
			      void *callback_context)
{
	int r;
	unsigned int i;
	struct pstore *ps = get_info(store);
	struct disk_exception de;
	struct commit_callback *cb;

	de.old_chunk = e->old_chunk;
	de.new_chunk = e->new_chunk;
	write_exception(ps, ps->current_committed++, &de);

	/*
	 * Add the callback to the back of the array.  This code
	 * is the only place where the callback array is
	 * manipulated, and we know that it will never be called
	 * multiple times concurrently.
	 */
	cb = ps->callbacks + ps->callback_count++;
	cb->callback = callback;
	cb->context = callback_context;

	/*
	 * If there are no more exceptions in flight, or we have
	 * filled this metadata area, we commit the exceptions to
	 * disk.
	 */
	if (atomic_dec_and_test(&ps->pending_count) ||
	    (ps->current_committed == ps->exceptions_per_area)) {
		r = area_io(ps, ps->current_area, WRITE);
		if (r)
			ps->valid = 0;

		/*
		 * Have we completely filled the current area?
		 */
		if (ps->current_committed == ps->exceptions_per_area) {
			ps->current_committed = 0;
			r = zero_area(ps, ps->current_area + 1);
			if (r)
				ps->valid = 0;
		}

		for (i = 0; i < ps->callback_count; i++) {
			cb = ps->callbacks + i;
			cb->callback(cb->context, r == 0 ? 1 : 0);
		}

		ps->callback_count = 0;
	}
}
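
/*
 * Batching example (illustrative): persistent_commit() only writes
 * the metadata area out when pending_count drops to zero or the area
 * fills up.  If three exceptions are prepared and then committed in
 * quick succession, the first two commits merely queue their
 * callbacks; the third sees pending_count reach zero, issues one
 * area_io(WRITE) covering all three on-disk entries, and only then
 * runs every queued callback.
 */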

static void persistent_drop(struct exception_store *store)
{
	struct pstore *ps = get_info(store);

	ps->valid = 0;
	if (write_header(ps))
		DMWARN("write header failed");
}

int dm_create_persistent(struct exception_store *store)
{
	struct pstore *ps;

	/* allocate the pstore */
	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
	if (!ps)
		return -ENOMEM;

	ps->snap = store->snap;
	ps->valid = 1;
	ps->version = SNAPSHOT_DISK_VERSION;
	ps->area = NULL;
	ps->next_free = 2;	/* skipping the header and first area */
	ps->current_committed = 0;

	ps->callback_count = 0;
	atomic_set(&ps->pending_count, 0);
	ps->callbacks = NULL;

	ps->metadata_wq = create_singlethread_workqueue("ksnaphd");
	if (!ps->metadata_wq) {
		kfree(ps);
		DMERR("couldn't start header metadata update thread");
		return -ENOMEM;
	}

	store->destroy = persistent_destroy;
	store->read_metadata = persistent_read_metadata;
	store->prepare_exception = persistent_prepare;
	store->commit_exception = persistent_commit;
	store->drop_snapshot = persistent_drop;
	store->fraction_full = persistent_fraction_full;
	store->context = ps;

	return 0;
}

/*-----------------------------------------------------------------
 * Implementation of the store for non-persistent snapshots.
 *---------------------------------------------------------------*/
struct transient_c {
	sector_t next_free;
};

static void transient_destroy(struct exception_store *store)
{
	kfree(store->context);
}

static int transient_read_metadata(struct exception_store *store)
{
	return 0;
}

static int transient_prepare(struct exception_store *store,
			     struct dm_snap_exception *e)
{
	struct transient_c *tc = (struct transient_c *) store->context;
	sector_t size = get_dev_size(store->snap->cow->bdev);

	if (size < (tc->next_free + store->snap->chunk_size))
		return -1;

	e->new_chunk = sector_to_chunk(store->snap, tc->next_free);
	tc->next_free += store->snap->chunk_size;

	return 0;
}

static void transient_commit(struct exception_store *store,
			     struct dm_snap_exception *e,
			     void (*callback) (void *, int success),
			     void *callback_context)
{
	/* Just succeed */
	callback(callback_context, 1);
}

static void transient_fraction_full(struct exception_store *store,
				    sector_t *numerator, sector_t *denominator)
{
	*numerator = ((struct transient_c *) store->context)->next_free;
	*denominator = get_dev_size(store->snap->cow->bdev);
}

int dm_create_transient(struct exception_store *store)
{
	struct transient_c *tc;

	store->destroy = transient_destroy;
	store->read_metadata = transient_read_metadata;
	store->prepare_exception = transient_prepare;
	store->commit_exception = transient_commit;
	store->drop_snapshot = NULL;
	store->fraction_full = transient_fraction_full;

	tc = kmalloc(sizeof(struct transient_c), GFP_KERNEL);
	if (!tc)
		return -ENOMEM;

	tc->next_free = 0;
	store->context = tc;

	return 0;
}
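
/*
 * Usage sketch (illustrative, hypothetical caller): the snapshot
 * target is expected to pick a store type at constructor time and
 * then drive it through the method pointers filled in above,
 * roughly:
 *
 *	r = persistent ? dm_create_persistent(&s->store)
 *		       : dm_create_transient(&s->store);
 *	if (!r)
 *		r = s->store.read_metadata(&s->store);
 *
 * where 's' is the owning struct dm_snapshot with store.snap already
 * set.  A return of 1 from read_metadata signals metadata that is
 * valid but describes an invalidated snapshot (see
 * persistent_read_metadata()).
 */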