#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include "extent_map.h"

static struct kmem_cache *extent_map_cache;
static struct kmem_cache *extent_state_cache;

struct tree_entry {
	u64 start;
	u64 end;
	int in_tree;
	struct rb_node rb_node;
};

/* bits for the extent state */
#define EXTENT_DIRTY 1
#define EXTENT_WRITEBACK (1 << 1)
#define EXTENT_UPTODATE (1 << 2)
#define EXTENT_LOCKED (1 << 3)
#define EXTENT_NEW (1 << 4)
#define EXTENT_DELALLOC (1 << 5)

#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)

void __init extent_map_init(void)
{
	extent_map_cache = kmem_cache_create("extent_map",
					     sizeof(struct extent_map), 0,
					     SLAB_RECLAIM_ACCOUNT |
					     SLAB_DESTROY_BY_RCU,
					     NULL);
	extent_state_cache = kmem_cache_create("extent_state",
					       sizeof(struct extent_state), 0,
					       SLAB_RECLAIM_ACCOUNT |
					       SLAB_DESTROY_BY_RCU,
					       NULL);
}

void __exit extent_map_exit(void)
{
	if (extent_map_cache)
		kmem_cache_destroy(extent_map_cache);
	if (extent_state_cache)
		kmem_cache_destroy(extent_state_cache);
}

void extent_map_tree_init(struct extent_map_tree *tree,
			  struct address_space *mapping, gfp_t mask)
{
	tree->map.rb_node = NULL;
	tree->state.rb_node = NULL;
	tree->ops = NULL;
	rwlock_init(&tree->lock);
	tree->mapping = mapping;
}
EXPORT_SYMBOL(extent_map_tree_init);

struct extent_map *alloc_extent_map(gfp_t mask)
{
	struct extent_map *em;
	em = kmem_cache_alloc(extent_map_cache, mask);
	if (!em || IS_ERR(em))
		return em;
	em->in_tree = 0;
	atomic_set(&em->refs, 1);
	return em;
}
EXPORT_SYMBOL(alloc_extent_map);

void free_extent_map(struct extent_map *em)
{
	if (!em)
		return;
	if (atomic_dec_and_test(&em->refs)) {
		WARN_ON(em->in_tree);
		kmem_cache_free(extent_map_cache, em);
	}
}
EXPORT_SYMBOL(free_extent_map);

struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;
	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state || IS_ERR(state))
		return state;
	state->state = 0;
	state->in_tree = 0;
	state->private = 0;
	atomic_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	return state;
}
EXPORT_SYMBOL(alloc_extent_state);

void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (atomic_dec_and_test(&state->refs)) {
		WARN_ON(state->in_tree);
		kmem_cache_free(extent_state_cache, state);
	}
}
EXPORT_SYMBOL(free_extent_state);

static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

	entry = rb_entry(node, struct tree_entry, rb_node);
	entry->in_tree = 1;
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}
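/*
 * search the tree for an entry that contains 'offset'.  If no exact match
 * is found, NULL is returned and, when prev_ret is non-NULL, *prev_ret is
 * set to the first entry that ends at or after 'offset' (or NULL if the
 * search ran off the end of the tree).
 */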
static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
				     struct rb_node **prev_ret)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct tree_entry, rb_node);
		prev = n;
		prev_entry = entry;

		if (offset < entry->start)
			n = n->rb_left;
		else if (offset > entry->end)
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;
	while (prev && offset > prev_entry->end) {
		prev = rb_next(prev);
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
	}
	*prev_ret = prev;
	return NULL;
}

static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
{
	struct rb_node *prev;
	struct rb_node *ret;
	ret = __tree_search(root, offset, &prev);
	if (!ret)
		return prev;
	return ret;
}

static int tree_delete(struct rb_root *root, u64 offset)
{
	struct rb_node *node;
	struct tree_entry *entry;

	node = __tree_search(root, offset, NULL);
	if (!node)
		return -ENOENT;
	entry = rb_entry(node, struct tree_entry, rb_node);
	entry->in_tree = 0;
	rb_erase(node, root);
	return 0;
}

/*
 * add_extent_mapping tries a simple backward merge with existing
 * mappings.  The extent_map struct passed in will be inserted into
 * the tree directly (no copies made, just a reference taken).
 */
int add_extent_mapping(struct extent_map_tree *tree,
		       struct extent_map *em)
{
	int ret = 0;
	struct extent_map *prev = NULL;
	struct rb_node *rb;

	write_lock_irq(&tree->lock);
	rb = tree_insert(&tree->map, em->end, &em->rb_node);
	if (rb) {
		prev = rb_entry(rb, struct extent_map, rb_node);
		printk("found extent map %Lu %Lu on insert of %Lu %Lu\n",
		       prev->start, prev->end, em->start, em->end);
		ret = -EEXIST;
		goto out;
	}
	atomic_inc(&em->refs);
	if (em->start != 0) {
		rb = rb_prev(&em->rb_node);
		if (rb)
			prev = rb_entry(rb, struct extent_map, rb_node);
		if (prev && prev->end + 1 == em->start &&
		    ((em->block_start == 0 && prev->block_start == 0) ||
		     (em->block_start == prev->block_end + 1))) {
			em->start = prev->start;
			em->block_start = prev->block_start;
			rb_erase(&prev->rb_node, &tree->map);
			prev->in_tree = 0;
			free_extent_map(prev);
		}
	}
out:
	write_unlock_irq(&tree->lock);
	return ret;
}
EXPORT_SYMBOL(add_extent_mapping);

/*
 * lookup_extent_mapping returns the first extent_map struct in the
 * tree that intersects the [start, end] (inclusive) range.  There may
 * be additional objects in the tree that intersect, so check the object
 * returned carefully to make sure you don't need additional lookups.
 */
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
					 u64 start, u64 end)
{
	struct extent_map *em;
	struct rb_node *rb_node;

	read_lock_irq(&tree->lock);
	rb_node = tree_search(&tree->map, start);
	if (!rb_node) {
		em = NULL;
		goto out;
	}
	if (IS_ERR(rb_node)) {
		em = ERR_PTR(PTR_ERR(rb_node));
		goto out;
	}
	em = rb_entry(rb_node, struct extent_map, rb_node);
	if (em->end < start || em->start > end) {
		em = NULL;
		goto out;
	}
	atomic_inc(&em->refs);
out:
	read_unlock_irq(&tree->lock);
	return em;
}
EXPORT_SYMBOL(lookup_extent_mapping);
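/*
 * Sketch of a typical caller of the mapping API above (not taken from a
 * real call site; it assumes the caller fills in the extent fields from
 * its own metadata before inserting):
 *
 *	struct extent_map *em = alloc_extent_map(GFP_NOFS);
 *	em->start = start;
 *	em->end = end;
 *	em->block_start = block_start;
 *	em->block_end = block_end;
 *	em->bdev = bdev;
 *	if (add_extent_mapping(tree, em) == -EEXIST) {
 *		an overlapping mapping is already present; use
 *		lookup_extent_mapping(tree, start, end) to find it
 *	}
 *	free_extent_map(em);	drops the caller's reference, the tree
 *				keeps its own
 */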
/*
 * removes an extent_map struct from the tree.  No reference counts are
 * dropped, and no checks are done to see if the range is in use
 */
int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
	int ret;

	write_lock_irq(&tree->lock);
	ret = tree_delete(&tree->map, em->end);
	write_unlock_irq(&tree->lock);
	return ret;
}
EXPORT_SYMBOL(remove_extent_mapping);

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree.  Extents with EXTENT_IOBITS in their state field
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static int merge_state(struct extent_map_tree *tree,
		       struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & EXTENT_IOBITS)
		return 0;

	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			state->start = other->start;
			other->in_tree = 0;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			other->start = state->start;
			state->in_tree = 0;
			rb_erase(&state->rb_node, &tree->state);
			free_extent_state(state);
		}
	}
	return 0;
}

/*
 * insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_map_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			int bits)
{
	struct rb_node *node;

	if (end < start) {
		printk("end < start %Lu %Lu\n", end, start);
		WARN_ON(1);
	}
	state->state |= bits;
	state->start = start;
	state->end = end;
	if ((end & 4095) == 0) {
		printk("insert state %Lu %Lu strange end\n", start, end);
		WARN_ON(1);
	}
	node = tree_insert(&tree->state, end, &state->rb_node);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		printk("found node %Lu %Lu on insert of %Lu %Lu\n",
		       found->start, found->end, start, end);
		free_extent_state(state);
		return -EEXIST;
	}
	merge_state(tree, state);
	return 0;
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created first half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling, the tree has 'orig' at [orig->start, orig->end].  After
 * calling, there are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [split, orig->end]
 *
 * The tree locks are not taken by this function.  They need to be held
 * by the caller.
 */
static int split_state(struct extent_map_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;
	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;
	if ((prealloc->end & 4095) == 0) {
		printk("insert state %Lu %Lu strange end\n", prealloc->start,
		       prealloc->end);
		WARN_ON(1);
	}
	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		printk("found node %Lu %Lu on insert of %Lu %Lu\n",
		       found->start, found->end, prealloc->start,
		       prealloc->end);
		free_extent_state(prealloc);
		return -EEXIST;
	}
	return 0;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up any one waiting on this state (wake == 1), or
 * forcibly remove the state from the tree (delete == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static int clear_state_bit(struct extent_map_tree *tree,
			   struct extent_state *state, int bits, int wake,
			   int delete)
{
	int ret = state->state & bits;
	state->state &= ~bits;
	if (wake)
		wake_up(&state->wq);
	if (delete || state->state == 0) {
		if (state->in_tree) {
			rb_erase(&state->rb_node, &tree->state);
			state->in_tree = 0;
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
	}
	return ret;
}

/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
 * bits were already set, or zero if none of the bits were already set.
 */
int clear_extent_bit(struct extent_map_tree *tree, u64 start, u64 end,
		     int bits, int wake, int delete, gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	int err;
	int set = 0;

again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	write_lock_irq(&tree->lock);
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(&tree->state, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		err = split_state(tree, state, prealloc, start);
		BUG_ON(err == -EEXIST);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			start = state->end + 1;
			set |= clear_state_bit(tree, state, bits,
					       wake, delete);
		} else {
			start = state->start;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                       | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		err = split_state(tree, state, prealloc, end + 1);
		BUG_ON(err == -EEXIST);

		if (wake)
			wake_up(&state->wq);
		set |= clear_state_bit(tree, prealloc, bits,
				       wake, delete);
		prealloc = NULL;
		goto out;
	}

	start = state->end + 1;
	set |= clear_state_bit(tree, state, bits, wake, delete);
	goto search_again;

out:
	write_unlock_irq(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return set;

search_again:
	if (start >= end)
		goto out;
	write_unlock_irq(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}
EXPORT_SYMBOL(clear_extent_bit);

static int wait_on_state(struct extent_map_tree *tree,
			 struct extent_state *state)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	read_unlock_irq(&tree->lock);
	schedule();
	read_lock_irq(&tree->lock);
	finish_wait(&state->wq, &wait);
	return 0;
}

/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
int wait_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits)
{
	struct extent_state *state;
	struct rb_node *node;

	read_lock_irq(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(&tree->state, start);
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			atomic_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		if (need_resched()) {
			read_unlock_irq(&tree->lock);
			cond_resched();
			read_lock_irq(&tree->lock);
		}
	}
out:
	read_unlock_irq(&tree->lock);
	return 0;
}
EXPORT_SYMBOL(wait_extent_bit);

/*
 * set some bits on a range in the tree.  This may require allocations
 * or sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
 * range already has the desired bits set.  The start of the existing
 * range is returned in failed_start in this case.
 *
 * [start, end] is inclusive
 * This takes the tree lock.
 */
int set_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits,
		   int exclusive, u64 *failed_start, gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	int err = 0;
	int set;
	u64 last_start;
	u64 last_end;
again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	write_lock_irq(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(&tree->state, start);
	if (!node) {
		err = insert_state(tree, prealloc, start, end, bits);
		prealloc = NULL;
		BUG_ON(err == -EEXIST);
		goto out;
	}

	state = rb_entry(node, struct extent_state, rb_node);
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		set = state->state & bits;
		if (set && exclusive) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}
		state->state |= bits;
		start = state->end + 1;
		merge_state(tree, state);
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		set = state->state & bits;
		if (exclusive && set) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		BUG_ON(err == -EEXIST);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state->state |= bits;
			start = state->end + 1;
			merge_state(tree, state);
		} else {
			start = state->start;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                       | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		set = state->state & bits;
		if (exclusive && set) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}
		err = split_state(tree, state, prealloc, end + 1);
		BUG_ON(err == -EEXIST);

		prealloc->state |= bits;
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

	/*
	 * | ---- desired range ---- |
	 *     | state | or | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;
		err = insert_state(tree, prealloc, start, this_end,
				   bits);
		prealloc = NULL;
		BUG_ON(err == -EEXIST);
		if (err)
			goto out;
		start = this_end + 1;
		goto search_again;
	}
	goto search_again;

out:
	write_unlock_irq(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

search_again:
	if (start > end)
		goto out;
	write_unlock_irq(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}
EXPORT_SYMBOL(set_extent_bit);

/* wrappers around set/clear extent bit */
int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
		     gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
			      mask);
}
EXPORT_SYMBOL(set_extent_dirty);

int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end,
			gfp_t mask)
{
	return set_extent_bit(tree, start, end,
			      EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL,
			      mask);
}
EXPORT_SYMBOL(set_extent_delalloc);

int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
		       gfp_t mask)
{
	return clear_extent_bit(tree, start, end,
				EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
}
EXPORT_SYMBOL(clear_extent_dirty);

int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
		   gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
			      mask);
}
EXPORT_SYMBOL(set_extent_new);

int clear_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
		     gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
}
EXPORT_SYMBOL(clear_extent_new);

int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
			gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
			      mask);
}
EXPORT_SYMBOL(set_extent_uptodate);

int clear_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
			  gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
}
EXPORT_SYMBOL(clear_extent_uptodate);

int set_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
			 gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
			      0, NULL, mask);
}
EXPORT_SYMBOL(set_extent_writeback);

int clear_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
			   gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
}
EXPORT_SYMBOL(clear_extent_writeback);

int wait_on_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end)
{
	return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
}
EXPORT_SYMBOL(wait_on_extent_writeback);
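/*
 * Sketch of how the range helpers above are normally combined (not taken
 * from a real call site): a writer locks the byte range it is working on,
 * marks it delalloc or dirty, and unlocks it when done:
 *
 *	lock_extent(tree, start, end, GFP_NOFS);
 *	set_extent_delalloc(tree, start, end, GFP_NOFS);
 *	unlock_extent(tree, start, end, GFP_NOFS);
 *
 * Writeback later clears the dirty bits and waits for EXTENT_WRITEBACK to
 * clear via wait_on_extent_writeback().
 */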
/*
 * locks a range in ascending order, waiting for any locked regions
 * it hits on the way.  [start,end] are inclusive, and this will sleep.
 */
int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask)
{
	int err;
	u64 failed_start;
	while (1) {
		err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
				     &failed_start, mask);
		if (err == -EEXIST && (mask & __GFP_WAIT)) {
			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
			start = failed_start;
		} else {
			break;
		}
		WARN_ON(start > end);
	}
	return err;
}
EXPORT_SYMBOL(lock_extent);

int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end,
		  gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
}
EXPORT_SYMBOL(unlock_extent);

/*
 * helper function to set pages and extents in the tree dirty
 */
int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(tree->mapping, index);
		BUG_ON(!page);
		__set_page_dirty_nobuffers(page);
		page_cache_release(page);
		index++;
	}
	set_extent_dirty(tree, start, end, GFP_NOFS);
	return 0;
}
EXPORT_SYMBOL(set_range_dirty);

/*
 * helper function to set both pages and extents in the tree writeback
 */
int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(tree->mapping, index);
		BUG_ON(!page);
		set_page_writeback(page);
		page_cache_release(page);
		index++;
	}
	set_extent_writeback(tree, start, end, GFP_NOFS);
	return 0;
}
EXPORT_SYMBOL(set_range_writeback);

/*
 * find a contiguous run of delalloc extent state records starting at
 * 'start', locking any records at or beyond 'lock_start'.  Returns the
 * number of records found, sets *end to the end of the last one, and
 * stops once roughly max_bytes worth of extents have been covered.
 */
u64 find_lock_delalloc_range(struct extent_map_tree *tree,
			     u64 start, u64 lock_start, u64 *end,
			     u64 max_bytes)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = start;
	u64 found = 0;
	u64 total_bytes = 0;

	write_lock_irq(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
search_again:
	node = tree_search(&tree->state, cur_start);
	if (!node || IS_ERR(node)) {
		goto out;
	}

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->start != cur_start) {
			goto out;
		}
		if (!(state->state & EXTENT_DELALLOC)) {
			goto out;
		}
		if (state->start >= lock_start) {
			if (state->state & EXTENT_LOCKED) {
				/* wait for the lock bit to clear, then
				 * search again from the current start */
				DEFINE_WAIT(wait);
				atomic_inc(&state->refs);
				prepare_to_wait(&state->wq, &wait,
						TASK_UNINTERRUPTIBLE);
				write_unlock_irq(&tree->lock);
				schedule();
				write_lock_irq(&tree->lock);
				finish_wait(&state->wq, &wait);
				free_extent_state(state);
				goto search_again;
			}
			state->state |= EXTENT_LOCKED;
		}
		found++;
		*end = state->end;
		cur_start = state->end + 1;
		node = rb_next(node);
		if (!node)
			break;
		total_bytes = state->end - state->start + 1;
		if (total_bytes >= max_bytes)
			break;
	}
out:
	write_unlock_irq(&tree->lock);
	return found;
}

/*
 * helper function to lock both pages and extents in the tree.
 * pages must be locked first.
 */
int lock_range(struct extent_map_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;
	int err;

	while (index <= end_index) {
		page = grab_cache_page(tree->mapping, index);
		if (!page) {
			err = -ENOMEM;
			goto failed;
		}
		if (IS_ERR(page)) {
			err = PTR_ERR(page);
			goto failed;
		}
		index++;
	}
	lock_extent(tree, start, end, GFP_NOFS);
	return 0;

failed:
	/*
	 * we failed above in getting the page at 'index', so we undo here
	 * up to but not including the page at 'index'
	 */
	end_index = index;
	index = start >> PAGE_CACHE_SHIFT;
	while (index < end_index) {
		page = find_get_page(tree->mapping, index);
		unlock_page(page);
		page_cache_release(page);
		index++;
	}
	return err;
}
EXPORT_SYMBOL(lock_range);

/*
 * helper function to unlock both pages and extents in the tree.
 */
int unlock_range(struct extent_map_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(tree->mapping, index);
		unlock_page(page);
		page_cache_release(page);
		index++;
	}
	unlock_extent(tree, start, end, GFP_NOFS);
	return 0;
}
EXPORT_SYMBOL(unlock_range);

int set_state_private(struct extent_map_tree *tree, u64 start, u64 private)
{
	struct rb_node *node;
	struct extent_state *state;
	int ret = 0;

	write_lock_irq(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(&tree->state, start);
	if (!node || IS_ERR(node)) {
		ret = -ENOENT;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
	if (state->start != start) {
		ret = -ENOENT;
		goto out;
	}
	state->private = private;
out:
	write_unlock_irq(&tree->lock);
	return ret;
}

int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private)
{
	struct rb_node *node;
	struct extent_state *state;
	int ret = 0;

	read_lock_irq(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(&tree->state, start);
	if (!node || IS_ERR(node)) {
		ret = -ENOENT;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
	if (state->start != start) {
		ret = -ENOENT;
		goto out;
	}
	*private = state->private;
out:
	read_unlock_irq(&tree->lock);
	return ret;
}

/*
 * searches a range in the state tree for a given mask.
 * If 'filled' == 1, this returns 1 only if every extent in the range
 * has the bits set.  Otherwise, 1 is returned if any bit in the
 * range is found set.
 */
static int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
			  int bits, int filled)
{
	struct extent_state *state = NULL;
	struct rb_node *node;
	int bitset = 0;

	read_lock_irq(&tree->lock);
	node = tree_search(&tree->state, start);
	while (node && start <= end) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->start > end)
			break;

		if (filled && state->start > start) {
			bitset = 0;
			break;
		}
		if (state->state & bits) {
			bitset = 1;
			if (!filled)
				break;
		} else if (filled) {
			bitset = 0;
			break;
		}
		start = state->end + 1;
		if (start > end)
			break;
		node = rb_next(node);
	}
	read_unlock_irq(&tree->lock);
	return bitset;
}

/*
 * helper function to set a given page up to date if all the
 * extents in the tree for that page are up to date
 */
static int check_page_uptodate(struct extent_map_tree *tree,
			       struct page *page)
{
	u64 start = page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;
	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
		SetPageUptodate(page);
	return 0;
}

/*
 * helper function to unlock a page if all the extents in the tree
 * for that page are unlocked
 */
static int check_page_locked(struct extent_map_tree *tree,
			     struct page *page)
{
	u64 start = page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;
	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
		unlock_page(page);
	return 0;
}

/*
 * helper function to end page writeback if all the extents
 * in the tree for that page are done with writeback
 */
static int check_page_writeback(struct extent_map_tree *tree,
				struct page *page)
{
	u64 start = page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;
	if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
		end_page_writeback(page);
	return 0;
}

/* lots and lots of room for performance fixes in the end_bio funcs */

/*
 * after a writepage IO is done, we need to:
 * clear the uptodate bits on error
 * clear the writeback bits in the extent tree for this IO
 * end_page_writeback if the page has no more pending IO
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
static int end_bio_extent_writepage(struct bio *bio,
				    unsigned int bytes_done, int err)
{
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct extent_map_tree *tree = bio->bi_private;
	u64 start;
	u64 end;
	int whole_page;

	if (bio->bi_size)
		return 1;

	do {
		struct page *page = bvec->bv_page;
		start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
		end = start + bvec->bv_len - 1;

		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
			whole_page = 1;
		else
			whole_page = 0;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);

		if (!uptodate) {
			clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
			ClearPageUptodate(page);
			SetPageError(page);
		}
		clear_extent_writeback(tree, start, end, GFP_ATOMIC);

		if (whole_page)
			end_page_writeback(page);
		else
			check_page_writeback(tree, page);
	} while (bvec >= bio->bi_io_vec);

	bio_put(bio);
	return 0;
}

/*
 * after a readpage IO is done, we need to:
 * clear the uptodate bits on error
 * set the uptodate bits if things worked
 * set the page up to date if all extents in the tree are uptodate
 * clear the lock bit in the extent tree
 * unlock the page if there are no other extents locked for it
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
static int end_bio_extent_readpage(struct bio *bio,
				   unsigned int bytes_done, int err)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct extent_map_tree *tree = bio->bi_private;
	u64 start;
	u64 end;
	int whole_page;
	int ret;

	if (bio->bi_size)
		return 1;

	do {
		struct page *page = bvec->bv_page;
		start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
		end = start + bvec->bv_len - 1;

		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
			whole_page = 1;
		else
			whole_page = 0;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);

		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
			ret = tree->ops->readpage_end_io_hook(page, start, end);
			if (ret)
				uptodate = 0;
		}
		if (uptodate) {
			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
			if (whole_page)
				SetPageUptodate(page);
			else
				check_page_uptodate(tree, page);
		} else {
			ClearPageUptodate(page);
			SetPageError(page);
		}

		unlock_extent(tree, start, end, GFP_ATOMIC);

		if (whole_page)
			unlock_page(page);
		else
			check_page_locked(tree, page);
	} while (bvec >= bio->bi_io_vec);

	bio_put(bio);
	return 0;
}

/*
 * IO done from prepare_write is pretty simple, we just unlock
 * the structs in the extent tree when done, and set the uptodate bits
 * as appropriate.
 */
static int end_bio_extent_preparewrite(struct bio *bio,
				       unsigned int bytes_done, int err)
{
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct extent_map_tree *tree = bio->bi_private;
	u64 start;
	u64 end;

	if (bio->bi_size)
		return 1;

	do {
		struct page *page = bvec->bv_page;
		start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
		end = start + bvec->bv_len - 1;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);

		if (uptodate) {
			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
		} else {
			ClearPageUptodate(page);
			SetPageError(page);
		}

		unlock_extent(tree, start, end, GFP_ATOMIC);

	} while (bvec >= bio->bi_io_vec);

	bio_put(bio);
	return 0;
}

static int submit_extent_page(int rw, struct extent_map_tree *tree,
			      struct page *page, sector_t sector,
			      size_t size, unsigned long offset,
			      struct block_device *bdev,
			      bio_end_io_t end_io_func)
{
	struct bio *bio;
	int ret = 0;

	bio = bio_alloc(GFP_NOIO, 1);

	bio->bi_sector = sector;
	bio->bi_bdev = bdev;
	bio->bi_io_vec[0].bv_page = page;
	bio->bi_io_vec[0].bv_len = size;
	bio->bi_io_vec[0].bv_offset = offset;

	bio->bi_vcnt = 1;
	bio->bi_idx = 0;
	bio->bi_size = size;

	bio->bi_end_io = end_io_func;
	bio->bi_private = tree;

	bio_get(bio);
	submit_bio(rw, bio);

	if (bio_flagged(bio, BIO_EOPNOTSUPP))
		ret = -EOPNOTSUPP;

	bio_put(bio);
	return ret;
}

/*
 * basic readpage implementation.  Locked extent state structs are inserted
 * into the tree that are removed when the IO is done (by the end_io
 * handlers)
 */
int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
			  get_extent_t *get_extent)
{
	struct inode *inode = page->mapping->host;
	u64 start = page->index << PAGE_CACHE_SHIFT;
	u64 page_end = start + PAGE_CACHE_SIZE - 1;
	u64 end;
	u64 cur = start;
	u64 extent_offset;
	u64 last_byte = i_size_read(inode);
	u64 block_start;
	u64 cur_end;
	sector_t sector;
	struct extent_map *em;
	struct block_device *bdev;
	int ret;
	int nr = 0;
	size_t page_offset = 0;
	size_t iosize;
	size_t blocksize = inode->i_sb->s_blocksize;

	if (!PagePrivate(page)) {
		SetPagePrivate(page);
		WARN_ON(!page->mapping->a_ops->invalidatepage);
		set_page_private(page, 1);
		page_cache_get(page);
	}

	end = page_end;
	lock_extent(tree, start, end, GFP_NOFS);

	while (cur <= end) {
		if (cur >= last_byte) {
			iosize = PAGE_CACHE_SIZE - page_offset;
			zero_user_page(page, page_offset, iosize, KM_USER0);
			set_extent_uptodate(tree, cur, cur + iosize - 1,
					    GFP_NOFS);
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			break;
		}
		em = get_extent(inode, page, page_offset, cur, end, 0);
		if (IS_ERR(em) || !em) {
			SetPageError(page);
			unlock_extent(tree, cur, end, GFP_NOFS);
			break;
		}

		extent_offset = cur - em->start;
		BUG_ON(em->end < cur);
		BUG_ON(end < cur);

		iosize = min(em->end - cur, end - cur) + 1;
		cur_end = min(em->end, end);
		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
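		/*
		 * block_start and extent_offset are byte offsets; the bio
		 * layer wants 512-byte sector numbers, hence the >> 9.
		 */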
		sector = (em->block_start + extent_offset) >> 9;
		bdev = em->bdev;
		block_start = em->block_start;
		free_extent_map(em);
		em = NULL;

		/* we've found a hole, just zero and go on */
		if (block_start == 0) {
			zero_user_page(page, page_offset, iosize, KM_USER0);
			set_extent_uptodate(tree, cur, cur + iosize - 1,
					    GFP_NOFS);
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}
		/* the get_extent function already copied into the page */
		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}

		ret = 0;
		if (tree->ops && tree->ops->readpage_io_hook) {
			ret = tree->ops->readpage_io_hook(page, cur,
							  cur + iosize - 1);
		}
		if (!ret) {
			ret = submit_extent_page(READ, tree, page,
						 sector, iosize, page_offset,
						 bdev, end_bio_extent_readpage);
		}
		if (ret)
			SetPageError(page);
		cur = cur + iosize;
		page_offset += iosize;
		nr++;
	}
	if (!nr) {
		if (!PageError(page))
			SetPageUptodate(page);
		unlock_page(page);
	}
	return 0;
}
EXPORT_SYMBOL(extent_read_full_page);

/*
 * the writepage semantics are similar to regular writepage.  extent
 * records are inserted to lock ranges in the tree, and as dirty areas
 * are found, they are marked writeback.  Then the lock bits are removed
 * and the end_io handler clears the writeback ranges
 */
int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
			   get_extent_t *get_extent,
			   struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	u64 start = page->index << PAGE_CACHE_SHIFT;
	u64 page_end = start + PAGE_CACHE_SIZE - 1;
	u64 end;
	u64 cur = start;
	u64 extent_offset;
	u64 last_byte = i_size_read(inode);
	u64 block_start;
	sector_t sector;
	struct extent_map *em;
	struct block_device *bdev;
	int ret;
	int nr = 0;
	size_t page_offset = 0;
	size_t iosize;
	size_t blocksize;
	loff_t i_size = i_size_read(inode);
	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
	u64 nr_delalloc;
	u64 delalloc_end;

	WARN_ON(!PageLocked(page));
	if (page->index > end_index) {
		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
		unlock_page(page);
		return 0;
	}

	if (page->index == end_index) {
		size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
		zero_user_page(page, offset,
			       PAGE_CACHE_SIZE - offset, KM_USER0);
	}

	if (!PagePrivate(page)) {
		SetPagePrivate(page);
		set_page_private(page, 1);
		WARN_ON(!page->mapping->a_ops->invalidatepage);
		page_cache_get(page);
	}

	lock_extent(tree, start, page_end, GFP_NOFS);
	nr_delalloc = find_lock_delalloc_range(tree, start, page_end + 1,
					       &delalloc_end,
					       128 * 1024 * 1024);
	if (nr_delalloc) {
		tree->ops->fill_delalloc(inode, start, delalloc_end);
		if (delalloc_end >= page_end + 1) {
			clear_extent_bit(tree, page_end + 1, delalloc_end,
					 EXTENT_LOCKED | EXTENT_DELALLOC,
					 1, 0, GFP_NOFS);
		}
		clear_extent_bit(tree, start, page_end, EXTENT_DELALLOC,
				 0, 0, GFP_NOFS);
		if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
			printk("found delalloc bits after clear extent_bit\n");
		}
	} else if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
		printk("found delalloc bits after find_delalloc_range returns 0\n");
	}

	end = page_end;
	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
		printk("found delalloc bits after lock_extent\n");
	}

	if (last_byte <= start) {
		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
		goto done;
	}

	set_extent_uptodate(tree, start, page_end, GFP_NOFS);
	blocksize = inode->i_sb->s_blocksize;

	while (cur <= end) {
		if (cur >= last_byte) {
			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
			break;
		}
		em = get_extent(inode, page, page_offset, cur, end, 0);
		if (IS_ERR(em) || !em) {
			SetPageError(page);
			break;
		}

		extent_offset = cur - em->start;
		BUG_ON(em->end < cur);
		BUG_ON(end < cur);
		iosize = min(em->end - cur, end - cur) + 1;
		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
		sector = (em->block_start + extent_offset) >> 9;
		bdev = em->bdev;
		block_start = em->block_start;
		free_extent_map(em);
		em = NULL;

		if (block_start == 0 || block_start == EXTENT_MAP_INLINE) {
			clear_extent_dirty(tree, cur,
					   cur + iosize - 1, GFP_NOFS);
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}

		/* leave this out until we have a page_mkwrite call */
		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
					 EXTENT_DIRTY, 0)) {
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}
		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
		ret = tree->ops->writepage_io_hook(page, cur, cur + iosize - 1);
		if (ret)
			SetPageError(page);
		else {
			set_range_writeback(tree, cur, cur + iosize - 1);
			ret = submit_extent_page(WRITE, tree, page, sector,
						 iosize, page_offset, bdev,
						 end_bio_extent_writepage);
			if (ret)
				SetPageError(page);
		}
		cur = cur + iosize;
		page_offset += iosize;
		nr++;
	}
done:
	WARN_ON(test_range_bit(tree, start, page_end, EXTENT_DIRTY, 0));
	unlock_extent(tree, start, page_end, GFP_NOFS);
	unlock_page(page);
	return 0;
}
EXPORT_SYMBOL(extent_write_full_page);

/*
 * basic invalidatepage code, this waits on any locked or writeback
 * ranges corresponding to the page, and then deletes any extent state
 * records from the tree
 */
int extent_invalidatepage(struct extent_map_tree *tree,
			  struct page *page, unsigned long offset)
{
	u64 start = (page->index << PAGE_CACHE_SHIFT);
	u64 end = start + PAGE_CACHE_SIZE - 1;
	size_t blocksize = page->mapping->host->i_sb->s_blocksize;

	start += (offset + blocksize - 1) & ~(blocksize - 1);
	if (start > end)
		return 0;

	lock_extent(tree, start, end, GFP_NOFS);
	wait_on_extent_writeback(tree, start, end);
	clear_extent_bit(tree, start, end,
			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
			 1, 1, GFP_NOFS);
	return 0;
}
EXPORT_SYMBOL(extent_invalidatepage);

/*
 * simple commit_write call, set_range_dirty is used to mark both
 * the pages and the extent records as dirty
 */
int extent_commit_write(struct extent_map_tree *tree,
			struct inode *inode, struct page *page,
			unsigned from, unsigned to)
{
	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;

	if (!PagePrivate(page)) {
		SetPagePrivate(page);
		set_page_private(page, 1);
		WARN_ON(!page->mapping->a_ops->invalidatepage);
		page_cache_get(page);
	}

	set_page_dirty(page);

	if (pos > inode->i_size) {
		i_size_write(inode, pos);
		mark_inode_dirty(inode);
	}
	return 0;
}
EXPORT_SYMBOL(extent_commit_write);

/*
 * helper for prepare_write.  Newly allocated blocks are zeroed for the
 * parts that fall outside [from, to), blocks that are only partially
 * covered by the write and not already uptodate are read in, and the
 * rest of the range is marked uptodate in the extent tree.  The state
 * records stay locked until any reads issued here complete (the end_io
 * handler unlocks them).
 */
int extent_prepare_write(struct extent_map_tree *tree,
			 struct inode *inode, struct page *page,
			 unsigned from, unsigned to, get_extent_t *get_extent)
{
	u64 page_start = page->index << PAGE_CACHE_SHIFT;
	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
	u64 block_start;
	u64 orig_block_start;
	u64 block_end;
	u64 cur_end;
	struct extent_map *em;
	unsigned blocksize = 1 << inode->i_blkbits;
	size_t page_offset = 0;
	size_t block_off_start;
	size_t block_off_end;
	int err = 0;
	int iocount = 0;
	int ret = 0;
	int isnew;

	if (!PagePrivate(page)) {
		SetPagePrivate(page);
		set_page_private(page, 1);
		WARN_ON(!page->mapping->a_ops->invalidatepage);
		page_cache_get(page);
	}
	block_start = (page_start + from) & ~((u64)blocksize - 1);
	block_end = (page_start + to - 1) | (blocksize - 1);
	orig_block_start = block_start;

	lock_extent(tree, page_start, page_end, GFP_NOFS);
	while (block_start <= block_end) {
		em = get_extent(inode, page, page_offset, block_start,
				block_end, 1);
		if (IS_ERR(em) || !em) {
			goto err;
		}
		cur_end = min(block_end, em->end);
		block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
		block_off_end = block_off_start + blocksize;
		isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);

		if (!PageUptodate(page) && isnew &&
		    (block_off_end > to || block_off_start < from)) {
			void *kaddr;

			kaddr = kmap_atomic(page, KM_USER0);
			if (block_off_end > to)
				memset(kaddr + to, 0, block_off_end - to);
			if (block_off_start < from)
				memset(kaddr + block_off_start, 0,
				       from - block_off_start);
			flush_dcache_page(page);
			kunmap_atomic(kaddr, KM_USER0);
		}
		if (!isnew && !PageUptodate(page) &&
		    (block_off_end > to || block_off_start < from) &&
		    !test_range_bit(tree, block_start, cur_end,
				    EXTENT_UPTODATE, 1)) {
			u64 sector;
			u64 extent_offset = block_start - em->start;
			size_t iosize;
			sector = (em->block_start + extent_offset) >> 9;
			iosize = (cur_end - block_start + blocksize - 1) &
				 ~((u64)blocksize - 1);
			/*
			 * we've already got the extent locked, but we
			 * need to split the state such that our end_bio
			 * handler can clear the lock.
			 */
			set_extent_bit(tree, block_start,
				       block_start + iosize - 1,
				       EXTENT_LOCKED, 0, NULL, GFP_NOFS);
			ret = submit_extent_page(READ, tree, page,
						 sector, iosize, page_offset,
						 em->bdev,
						 end_bio_extent_preparewrite);
			iocount++;
			block_start = block_start + iosize;
		} else {
			set_extent_uptodate(tree, block_start, cur_end,
					    GFP_NOFS);
			unlock_extent(tree, block_start, cur_end, GFP_NOFS);
			block_start = cur_end + 1;
		}
		page_offset = block_start & (PAGE_CACHE_SIZE - 1);
		free_extent_map(em);
	}
	if (iocount) {
		wait_extent_bit(tree, orig_block_start,
				block_end, EXTENT_LOCKED);
	}
	check_page_uptodate(tree, page);
err:
	/* FIXME, zero out newly allocated blocks on error */
	return err;
}
EXPORT_SYMBOL(extent_prepare_write);

/*
 * a helper for releasepage.  As long as there are no locked extents
 * in the range corresponding to the page, both state records and extent
 * map records are removed
 */
int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page)
{
	struct extent_map *em;
	u64 start = page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;
	u64 orig_start = start;
	int ret = 1;

	while (start <= end) {
		em = lookup_extent_mapping(tree, start, end);
		if (!em || IS_ERR(em))
			break;
		if (!test_range_bit(tree, em->start, em->end,
				    EXTENT_LOCKED, 0)) {
			remove_extent_mapping(tree, em);
			/* once for the rb tree */
			free_extent_map(em);
		}
		start = em->end + 1;
		/* once for us */
		free_extent_map(em);
	}
	if (test_range_bit(tree, orig_start, end, EXTENT_LOCKED, 0))
		ret = 0;
	else
		clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
				 1, 1, GFP_NOFS);
	return ret;
}
EXPORT_SYMBOL(try_release_extent_mapping);