#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include "extent_map.h"

static struct kmem_cache *extent_map_cache;
static struct kmem_cache *extent_state_cache;

struct tree_entry {
	u64 start;
	u64 end;
	int in_tree;
	struct rb_node rb_node;
};

/* bits for the extent state */
#define EXTENT_DIRTY 1
#define EXTENT_WRITEBACK (1 << 1)
#define EXTENT_UPTODATE (1 << 2)
#define EXTENT_LOCKED (1 << 3)
#define EXTENT_NEW (1 << 4)
#define EXTENT_DELALLOC (1 << 5)

#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)

void __init extent_map_init(void)
{
	extent_map_cache = kmem_cache_create("extent_map",
					     sizeof(struct extent_map), 0,
					     SLAB_RECLAIM_ACCOUNT |
					     SLAB_DESTROY_BY_RCU,
					     NULL);
	extent_state_cache = kmem_cache_create("extent_state",
					       sizeof(struct extent_state), 0,
					       SLAB_RECLAIM_ACCOUNT |
					       SLAB_DESTROY_BY_RCU,
					       NULL);
}

void __exit extent_map_exit(void)
{
	if (extent_map_cache)
		kmem_cache_destroy(extent_map_cache);
	if (extent_state_cache)
		kmem_cache_destroy(extent_state_cache);
}

void extent_map_tree_init(struct extent_map_tree *tree,
			  struct address_space *mapping, gfp_t mask)
{
	tree->map.rb_node = NULL;
	tree->state.rb_node = NULL;
	tree->ops = NULL;
	rwlock_init(&tree->lock);
	tree->mapping = mapping;
}
EXPORT_SYMBOL(extent_map_tree_init);

struct extent_map *alloc_extent_map(gfp_t mask)
{
	struct extent_map *em;
	em = kmem_cache_alloc(extent_map_cache, mask);
	if (!em || IS_ERR(em))
		return em;
	em->in_tree = 0;
	atomic_set(&em->refs, 1);
	return em;
}
EXPORT_SYMBOL(alloc_extent_map);

void free_extent_map(struct extent_map *em)
{
	if (atomic_dec_and_test(&em->refs)) {
		WARN_ON(em->in_tree);
		kmem_cache_free(extent_map_cache, em);
	}
}
EXPORT_SYMBOL(free_extent_map);


struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;
	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state || IS_ERR(state))
		return state;
	state->state = 0;
	state->in_tree = 0;
	state->private = 0;
	atomic_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	return state;
}
EXPORT_SYMBOL(alloc_extent_state);

void free_extent_state(struct extent_state *state)
{
	if (atomic_dec_and_test(&state->refs)) {
		WARN_ON(state->in_tree);
		kmem_cache_free(extent_state_cache, state);
	}
}
EXPORT_SYMBOL(free_extent_state);

static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
				   struct rb_node *node)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct tree_entry *entry;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;
	}

	entry = rb_entry(node, struct tree_entry, rb_node);
	entry->in_tree = 1;
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
				     struct rb_node **prev_ret)
{
	struct rb_node *n = root->rb_node;
	struct rb_node *prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while (n) {
		entry = rb_entry(n, struct tree_entry, rb_node);
		prev = n;
		prev_entry = entry;

		if (offset < entry->start)
			n = n->rb_left;
		else if (offset > entry->end)
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;
	while (prev && offset > prev_entry->end) {
		prev = rb_next(prev);
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
	}
	*prev_ret = prev;
	return NULL;
}

static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
{
	struct rb_node *prev;
	struct rb_node *ret;
	ret = __tree_search(root, offset, &prev);
	if (!ret)
		return prev;
	return ret;
}

static int tree_delete(struct rb_root *root, u64 offset)
{
	struct rb_node *node;
	struct tree_entry *entry;

	node = __tree_search(root, offset, NULL);
	if (!node)
		return -ENOENT;
	entry = rb_entry(node, struct tree_entry, rb_node);
	entry->in_tree = 0;
	rb_erase(node, root);
	return 0;
}

/*
 * add_extent_mapping tries a simple backward merge with existing
 * mappings.  The extent_map struct passed in will be inserted into
 * the tree directly (no copies made, just a reference taken).
 */
int add_extent_mapping(struct extent_map_tree *tree,
		       struct extent_map *em)
{
	int ret = 0;
	struct extent_map *prev = NULL;
	struct rb_node *rb;

	write_lock_irq(&tree->lock);
	rb = tree_insert(&tree->map, em->end, &em->rb_node);
	if (rb) {
		prev = rb_entry(rb, struct extent_map, rb_node);
		printk("found extent map %Lu %Lu on insert of %Lu %Lu\n",
		       prev->start, prev->end, em->start, em->end);
		ret = -EEXIST;
		goto out;
	}
	atomic_inc(&em->refs);
	if (em->start != 0) {
		rb = rb_prev(&em->rb_node);
		if (rb)
			prev = rb_entry(rb, struct extent_map, rb_node);
		if (prev && prev->end + 1 == em->start &&
		    ((em->block_start == 0 && prev->block_start == 0) ||
		     (em->block_start == prev->block_end + 1))) {
			em->start = prev->start;
			em->block_start = prev->block_start;
			rb_erase(&prev->rb_node, &tree->map);
			prev->in_tree = 0;
			free_extent_map(prev);
		}
	}
out:
	write_unlock_irq(&tree->lock);
	return ret;
}
EXPORT_SYMBOL(add_extent_mapping);

/*
 * lookup_extent_mapping returns the first extent_map struct in the
 * tree that intersects the [start, end] (inclusive) range.  There may
 * be additional objects in the tree that intersect, so check the object
 * returned carefully to make sure you don't need additional lookups.
 */
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
					 u64 start, u64 end)
{
	struct extent_map *em;
	struct rb_node *rb_node;

	read_lock_irq(&tree->lock);
	rb_node = tree_search(&tree->map, start);
	if (!rb_node) {
		em = NULL;
		goto out;
	}
	if (IS_ERR(rb_node)) {
		em = ERR_PTR(PTR_ERR(rb_node));
		goto out;
	}
	em = rb_entry(rb_node, struct extent_map, rb_node);
	if (em->end < start || em->start > end) {
		em = NULL;
		goto out;
	}
	atomic_inc(&em->refs);
out:
	read_unlock_irq(&tree->lock);
	return em;
}
EXPORT_SYMBOL(lookup_extent_mapping);

/*
 * removes an extent_map struct from the tree.  No reference counts are
 * dropped, and no checks are done to see if the range is in use.
 */
int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
	int ret;

	write_lock_irq(&tree->lock);
	ret = tree_delete(&tree->map, em->end);
	write_unlock_irq(&tree->lock);
	return ret;
}
EXPORT_SYMBOL(remove_extent_mapping);

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree.  Extents with any of the EXTENT_IOBITS set in their
 * state field are not merged because the end_io handlers need to be able
 * to do operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static int merge_state(struct extent_map_tree *tree,
		       struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	if (state->state & EXTENT_IOBITS)
		return 0;

	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			state->start = other->start;
			other->in_tree = 0;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			other->start = state->start;
			state->in_tree = 0;
			rb_erase(&state->rb_node, &tree->state);
			free_extent_state(state);
		}
	}
	return 0;
}

/*
 * insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_map_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			int bits)
{
	struct rb_node *node;

	if (end < start) {
		printk("end < start %Lu %Lu\n", end, start);
		WARN_ON(1);
	}
	state->state |= bits;
	state->start = start;
	state->end = end;
	if ((end & 4095) == 0) {
		printk("insert state %Lu %Lu strange end\n", start, end);
		WARN_ON(1);
	}
	node = tree_insert(&tree->state, end, &state->rb_node);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		printk("found node %Lu %Lu on insert of %Lu %Lu\n",
		       found->start, found->end, start, end);
		free_extent_state(state);
		return -EEXIST;
	}
	merge_state(tree, state);
	return 0;
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling, the tree has 'orig' at [orig->start, orig->end].
 * After calling, there are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [split, orig->end]
 *
 * The tree locks are not taken by this function.  They need to be held
 * by the caller.
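 *
 * For example, splitting an 'orig' that covers [0, 8191] at split == 4096
 * leaves 'prealloc' covering [0, 4095] and 'orig' covering [4096, 8191].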
 */
static int split_state(struct extent_map_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;
	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;
	if ((prealloc->end & 4095) == 0) {
		printk("insert state %Lu %Lu strange end\n", prealloc->start,
		       prealloc->end);
		WARN_ON(1);
	}
	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		printk("found node %Lu %Lu on insert of %Lu %Lu\n",
		       found->start, found->end,
		       prealloc->start, prealloc->end);
		free_extent_state(prealloc);
		return -EEXIST;
	}
	return 0;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up anyone waiting on this state (wake == 1), or
 * forcibly remove the state from the tree (delete == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree.
 */
static int clear_state_bit(struct extent_map_tree *tree,
			   struct extent_state *state, int bits, int wake,
			   int delete)
{
	int ret = state->state & bits;
	state->state &= ~bits;
	if (wake)
		wake_up(&state->wq);
	if (delete || state->state == 0) {
		if (state->in_tree) {
			rb_erase(&state->rb_node, &tree->state);
			state->in_tree = 0;
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
	}
	return ret;
}

/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
 * bits were already set, or zero if none of the bits were already set.
 */
int clear_extent_bit(struct extent_map_tree *tree, u64 start, u64 end,
		     int bits, int wake, int delete, gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	int err;
	int set = 0;

again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	write_lock_irq(&tree->lock);
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(&tree->state, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);

	/*
	 * | ---- desired range ---- |
	 *  | state | or
	 *   | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
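	 *
	 * For example, clearing [4096, 8191] out of a single state covering
	 * [0, 16383] splits off [0, 4095] here, loops via search_again, then
	 * splits again at end + 1 so only the [4096, 8191] piece has its
	 * bits cleared.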
	 */

	if (state->start < start) {
		err = split_state(tree, state, prealloc, start);
		BUG_ON(err == -EEXIST);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			start = state->end + 1;
			set |= clear_state_bit(tree, state, bits,
					       wake, delete);
		} else {
			start = state->start;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		err = split_state(tree, state, prealloc, end + 1);
		BUG_ON(err == -EEXIST);

		if (wake)
			wake_up(&state->wq);
		set |= clear_state_bit(tree, prealloc, bits,
				       wake, delete);
		prealloc = NULL;
		goto out;
	}

	start = state->end + 1;
	set |= clear_state_bit(tree, state, bits, wake, delete);
	goto search_again;

out:
	write_unlock_irq(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return set;

search_again:
	if (start >= end)
		goto out;
	write_unlock_irq(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}
EXPORT_SYMBOL(clear_extent_bit);

static int wait_on_state(struct extent_map_tree *tree,
			 struct extent_state *state)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	read_unlock_irq(&tree->lock);
	schedule();
	read_lock_irq(&tree->lock);
	finish_wait(&state->wq, &wait);
	return 0;
}

/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function.
 */
int wait_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits)
{
	struct extent_state *state;
	struct rb_node *node;

	read_lock_irq(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(&tree->state, start);
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			start = state->start;
			atomic_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		if (need_resched()) {
			read_unlock_irq(&tree->lock);
			cond_resched();
			read_lock_irq(&tree->lock);
		}
	}
out:
	read_unlock_irq(&tree->lock);
	return 0;
}
EXPORT_SYMBOL(wait_extent_bit);

/*
 * set some bits on a range in the tree.  This may require allocations
 * or sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
 * range already has the desired bits set.  The start of the existing
 * range is returned in failed_start in this case.
 *
 * [start, end] is inclusive.
 * This takes the tree lock.
 */
int set_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits,
		   int exclusive, u64 *failed_start, gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	int err = 0;
	int set;
	u64 last_start;
	u64 last_end;
again:
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	write_lock_irq(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(&tree->state, start);
	if (!node) {
		err = insert_state(tree, prealloc, start, end, bits);
		prealloc = NULL;
		BUG_ON(err == -EEXIST);
		goto out;
	}

	state = rb_entry(node, struct extent_state, rb_node);
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		set = state->state & bits;
		if (set && exclusive) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}
		state->state |= bits;
		start = state->end + 1;
		merge_state(tree, state);
		goto search_again;
	}

	/*
	 * | ---- desired range ---- |
	 *  | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		set = state->state & bits;
		if (exclusive && set) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		BUG_ON(err == -EEXIST);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state->state |= bits;
			start = state->end + 1;
			merge_state(tree, state);
		} else {
			start = state->start;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		set = state->state & bits;
		if (exclusive && set) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}
		err = split_state(tree, state, prealloc, end + 1);
		BUG_ON(err == -EEXIST);

		prealloc->state |= bits;
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

	/*
	 * | ---- desired range ---- |
	 * | state | or | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
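	 *
	 * For example, setting bits on [0, 8191] when the only state in the
	 * tree covers [12288, 16383] inserts a brand new state for [0, 8191]
	 * (capped at last_start - 1 when the found state begins inside the
	 * desired range).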
	 */
	if (state->start > start) {
		u64 this_end;
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;
		err = insert_state(tree, prealloc, start, this_end,
				   bits);
		prealloc = NULL;
		BUG_ON(err == -EEXIST);
		if (err)
			goto out;
		start = this_end + 1;
		goto search_again;
	}
	goto search_again;

out:
	write_unlock_irq(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;

search_again:
	if (start > end)
		goto out;
	write_unlock_irq(&tree->lock);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}
EXPORT_SYMBOL(set_extent_bit);

/* wrappers around set/clear extent bit */
int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
		     gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
			      mask);
}
EXPORT_SYMBOL(set_extent_dirty);

int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end,
			gfp_t mask)
{
	return set_extent_bit(tree, start, end,
			      EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL,
			      mask);
}
EXPORT_SYMBOL(set_extent_delalloc);

int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
		       gfp_t mask)
{
	return clear_extent_bit(tree, start, end,
				EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
}
EXPORT_SYMBOL(clear_extent_dirty);

int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
		   gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
			      mask);
}
EXPORT_SYMBOL(set_extent_new);

int clear_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
		     gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
}
EXPORT_SYMBOL(clear_extent_new);

int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
			gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
			      mask);
}
EXPORT_SYMBOL(set_extent_uptodate);

int clear_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
			  gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
}
EXPORT_SYMBOL(clear_extent_uptodate);

int set_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
			 gfp_t mask)
{
	return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
			      0, NULL, mask);
}
EXPORT_SYMBOL(set_extent_writeback);

int clear_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
			   gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
}
EXPORT_SYMBOL(clear_extent_writeback);

int wait_on_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end)
{
	return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
}
EXPORT_SYMBOL(wait_on_extent_writeback);

/*
 * locks a range in ascending order, waiting for any locked regions
 * it hits on the way.  [start,end] are inclusive, and this will sleep.
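 *
 * For example, if [0, 8191] is requested while [4096, 8191] is already
 * locked, set_extent_bit() locks [0, 4095], fails with -EEXIST and
 * failed_start == 4096, and the loop below waits for that range to
 * unlock before retrying from 4096.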
 */
int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask)
{
	int err;
	u64 failed_start;
	while (1) {
		err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
				     &failed_start, mask);
		if (err == -EEXIST && (mask & __GFP_WAIT)) {
			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
			start = failed_start;
		} else {
			break;
		}
		WARN_ON(start > end);
	}
	return err;
}
EXPORT_SYMBOL(lock_extent);

int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end,
		  gfp_t mask)
{
	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
}
EXPORT_SYMBOL(unlock_extent);

/*
 * helper function to set pages and extents in the tree dirty
 */
int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(tree->mapping, index);
		BUG_ON(!page);
		__set_page_dirty_nobuffers(page);
		page_cache_release(page);
		index++;
	}
	set_extent_dirty(tree, start, end, GFP_NOFS);
	return 0;
}
EXPORT_SYMBOL(set_range_dirty);

/*
 * helper function to set both pages and extents in the tree writeback
 */
int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(tree->mapping, index);
		BUG_ON(!page);
		set_page_writeback(page);
		page_cache_release(page);
		index++;
	}
	set_extent_writeback(tree, start, end, GFP_NOFS);
	return 0;
}
EXPORT_SYMBOL(set_range_writeback);

u64 find_lock_delalloc_range(struct extent_map_tree *tree,
			     u64 start, u64 lock_start, u64 *end,
			     u64 max_bytes)
{
	struct rb_node *node;
	struct extent_state *state;
	u64 cur_start = start;
	u64 found = 0;
	u64 total_bytes = 0;

	write_lock_irq(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
search_again:
	node = tree_search(&tree->state, cur_start);
	if (!node || IS_ERR(node)) {
		goto out;
	}

	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->start != cur_start) {
			goto out;
		}
		if (!(state->state & EXTENT_DELALLOC)) {
			goto out;
		}
		if (state->start >= lock_start) {
			if (state->state & EXTENT_LOCKED) {
				DEFINE_WAIT(wait);
				atomic_inc(&state->refs);
				/* queue the wait before dropping the lock */
				prepare_to_wait(&state->wq, &wait,
						TASK_UNINTERRUPTIBLE);
				write_unlock_irq(&tree->lock);
				schedule();
				write_lock_irq(&tree->lock);
				finish_wait(&state->wq, &wait);
				free_extent_state(state);
				goto search_again;
			}
			state->state |= EXTENT_LOCKED;
		}
		found++;
		*end = state->end;
		cur_start = state->end + 1;
		node = rb_next(node);
		if (!node)
			break;
		total_bytes = state->end - state->start + 1;
		if (total_bytes >= max_bytes)
			break;
	}
out:
	write_unlock_irq(&tree->lock);
	return found;
}

/*
 * helper function to lock both pages and extents in the tree.
 * pages must be locked first.
 */
int lock_range(struct extent_map_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;
	int err;

	while (index <= end_index) {
		page = grab_cache_page(tree->mapping, index);
		if (!page) {
			err = -ENOMEM;
			goto failed;
		}
		if (IS_ERR(page)) {
			err = PTR_ERR(page);
			goto failed;
		}
		index++;
	}
	lock_extent(tree, start, end, GFP_NOFS);
	return 0;

failed:
	/*
	 * we failed above in getting the page at 'index', so we undo here
	 * up to but not including the page at 'index'
	 */
	end_index = index;
	index = start >> PAGE_CACHE_SHIFT;
	while (index < end_index) {
		page = find_get_page(tree->mapping, index);
		unlock_page(page);
		page_cache_release(page);
		index++;
	}
	return err;
}
EXPORT_SYMBOL(lock_range);

/*
 * helper function to unlock both pages and extents in the tree.
 */
int unlock_range(struct extent_map_tree *tree, u64 start, u64 end)
{
	unsigned long index = start >> PAGE_CACHE_SHIFT;
	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(tree->mapping, index);
		unlock_page(page);
		page_cache_release(page);
		index++;
	}
	unlock_extent(tree, start, end, GFP_NOFS);
	return 0;
}
EXPORT_SYMBOL(unlock_range);

int set_state_private(struct extent_map_tree *tree, u64 start, u64 private)
{
	struct rb_node *node;
	struct extent_state *state;
	int ret = 0;

	write_lock_irq(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(&tree->state, start);
	if (!node || IS_ERR(node)) {
		ret = -ENOENT;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
	if (state->start != start) {
		ret = -ENOENT;
		goto out;
	}
	state->private = private;
out:
	write_unlock_irq(&tree->lock);
	return ret;
}

int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private)
{
	struct rb_node *node;
	struct extent_state *state;
	int ret = 0;

	read_lock_irq(&tree->lock);
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search(&tree->state, start);
	if (!node || IS_ERR(node)) {
		ret = -ENOENT;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
	if (state->start != start) {
		ret = -ENOENT;
		goto out;
	}
	*private = state->private;
out:
	read_unlock_irq(&tree->lock);
	return ret;
}

/*
 * searches a range in the state tree for a given mask.
 * If 'filled' == 1, this returns 1 only if every extent in the tree
 * has the bits set.  Otherwise, 1 is returned if any bit in the
 * range is found set.
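 *
 * For example, if the state covering [0, 4095] has EXTENT_UPTODATE set but
 * the state covering [4096, 8191] does not, a call over [0, 8191] returns
 * 0 with filled == 1 and returns 1 with filled == 0.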
 */
static int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
			  int bits, int filled)
{
	struct extent_state *state = NULL;
	struct rb_node *node;
	int bitset = 0;

	read_lock_irq(&tree->lock);
	node = tree_search(&tree->state, start);
	while (node && start <= end) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->start > end)
			break;

		if (filled && state->start > start) {
			bitset = 0;
			break;
		}
		if (state->state & bits) {
			bitset = 1;
			if (!filled)
				break;
		} else if (filled) {
			bitset = 0;
			break;
		}
		start = state->end + 1;
		if (start > end)
			break;
		node = rb_next(node);
	}
	read_unlock_irq(&tree->lock);
	return bitset;
}

/*
 * helper function to set a given page up to date if all the
 * extents in the tree for that page are up to date
 */
static int check_page_uptodate(struct extent_map_tree *tree,
			       struct page *page)
{
	u64 start = page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;
	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
		SetPageUptodate(page);
	return 0;
}

/*
 * helper function to unlock a page if all the extents in the tree
 * for that page are unlocked
 */
static int check_page_locked(struct extent_map_tree *tree,
			     struct page *page)
{
	u64 start = page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;
	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
		unlock_page(page);
	return 0;
}

/*
 * helper function to end page writeback if all the extents
 * in the tree for that page are done with writeback
 */
static int check_page_writeback(struct extent_map_tree *tree,
				struct page *page)
{
	u64 start = page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;
	if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
		end_page_writeback(page);
	return 0;
}

/* lots and lots of room for performance fixes in the end_bio funcs */

/*
 * after a writepage IO is done, we need to:
 * clear the uptodate bits on error
 * clear the writeback bits in the extent tree for this IO
 * end_page_writeback if the page has no more pending IO
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
static int end_bio_extent_writepage(struct bio *bio,
				    unsigned int bytes_done, int err)
{
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct extent_map_tree *tree = bio->bi_private;
	u64 start;
	u64 end;
	int whole_page;

	if (bio->bi_size)
		return 1;

	do {
		struct page *page = bvec->bv_page;
		start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
		end = start + bvec->bv_len - 1;

		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
			whole_page = 1;
		else
			whole_page = 0;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);

		if (!uptodate) {
			clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
			ClearPageUptodate(page);
			SetPageError(page);
		}
		clear_extent_writeback(tree, start, end, GFP_ATOMIC);

		if (whole_page)
			end_page_writeback(page);
		else
			check_page_writeback(tree, page);
	} while (bvec >= bio->bi_io_vec);

	bio_put(bio);
	return 0;
}

/*
 * after a readpage IO is done, we need to:
 * clear the uptodate bits on error
 * set the uptodate bits if things worked
 * set the page up to date if all extents in the tree are uptodate
 * clear the lock bit in the extent tree
 * unlock the page if there are no other extents locked for it
 *
 * Scheduling is not allowed, so the extent state tree is expected
 * to have one and only one object corresponding to this IO.
 */
static int end_bio_extent_readpage(struct bio *bio,
				   unsigned int bytes_done, int err)
{
	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct extent_map_tree *tree = bio->bi_private;
	u64 start;
	u64 end;
	int whole_page;
	int ret;

	if (bio->bi_size)
		return 1;

	do {
		struct page *page = bvec->bv_page;
		start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
		end = start + bvec->bv_len - 1;

		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
			whole_page = 1;
		else
			whole_page = 0;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);

		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
			ret = tree->ops->readpage_end_io_hook(page, start, end);
			if (ret)
				uptodate = 0;
		}
		if (uptodate) {
			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
			if (whole_page)
				SetPageUptodate(page);
			else
				check_page_uptodate(tree, page);
		} else {
			ClearPageUptodate(page);
			SetPageError(page);
		}

		unlock_extent(tree, start, end, GFP_ATOMIC);

		if (whole_page)
			unlock_page(page);
		else
			check_page_locked(tree, page);
	} while (bvec >= bio->bi_io_vec);

	bio_put(bio);
	return 0;
}

/*
 * IO done from prepare_write is pretty simple, we just unlock
 * the structs in the extent tree when done, and set the uptodate bits
 * as appropriate.
 */
static int end_bio_extent_preparewrite(struct bio *bio,
				       unsigned int bytes_done, int err)
{
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct extent_map_tree *tree = bio->bi_private;
	u64 start;
	u64 end;

	if (bio->bi_size)
		return 1;

	do {
		struct page *page = bvec->bv_page;
		start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
		end = start + bvec->bv_len - 1;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);

		if (uptodate) {
			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
		} else {
			ClearPageUptodate(page);
			SetPageError(page);
		}

		unlock_extent(tree, start, end, GFP_ATOMIC);

	} while (bvec >= bio->bi_io_vec);

	bio_put(bio);
	return 0;
}

static int submit_extent_page(int rw, struct extent_map_tree *tree,
			      struct page *page, sector_t sector,
			      size_t size, unsigned long offset,
			      struct block_device *bdev,
			      bio_end_io_t end_io_func)
{
	struct bio *bio;
	int ret = 0;

	bio = bio_alloc(GFP_NOIO, 1);

	bio->bi_sector = sector;
	bio->bi_bdev = bdev;
	bio->bi_io_vec[0].bv_page = page;
	bio->bi_io_vec[0].bv_len = size;
	bio->bi_io_vec[0].bv_offset = offset;

	bio->bi_vcnt = 1;
	bio->bi_idx = 0;
	bio->bi_size = size;

	bio->bi_end_io = end_io_func;
	bio->bi_private = tree;

	bio_get(bio);
	submit_bio(rw, bio);

	if (bio_flagged(bio, BIO_EOPNOTSUPP))
		ret = -EOPNOTSUPP;

	bio_put(bio);
	return ret;
}

/*
 * basic readpage implementation.  Locked extent state structs are inserted
 * into the tree that are removed when the IO is done (by the end_io
 * handlers)
 */
int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
			  get_extent_t *get_extent)
{
	struct inode *inode = page->mapping->host;
	u64 start = page->index << PAGE_CACHE_SHIFT;
	u64 page_end = start + PAGE_CACHE_SIZE - 1;
	u64 end;
	u64 cur = start;
	u64 extent_offset;
	u64 last_byte = i_size_read(inode);
	u64 block_start;
	u64 cur_end;
	sector_t sector;
	struct extent_map *em;
	struct block_device *bdev;
	int ret;
	int nr = 0;
	size_t page_offset = 0;
	size_t iosize;
	size_t blocksize = inode->i_sb->s_blocksize;

	if (!PagePrivate(page)) {
		SetPagePrivate(page);
		set_page_private(page, 1);
		WARN_ON(!page->mapping->a_ops->invalidatepage);
		page_cache_get(page);
	}

	end = page_end;
	lock_extent(tree, start, end, GFP_NOFS);

	while (cur <= end) {
		if (cur >= last_byte) {
			iosize = PAGE_CACHE_SIZE - page_offset;
			zero_user_page(page, page_offset, iosize, KM_USER0);
			set_extent_uptodate(tree, cur, cur + iosize - 1,
					    GFP_NOFS);
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			break;
		}
		em = get_extent(inode, page, page_offset, cur, end, 0);
		if (IS_ERR(em) || !em) {
			SetPageError(page);
			unlock_extent(tree, cur, end, GFP_NOFS);
			break;
		}

		extent_offset = cur - em->start;
		BUG_ON(em->end < cur);
		BUG_ON(end < cur);

		iosize = min(em->end - cur, end - cur) + 1;
		cur_end = min(em->end, end);
		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
		sector = (em->block_start + extent_offset) >> 9;
		bdev = em->bdev;
		block_start = em->block_start;
		free_extent_map(em);
		em = NULL;

		/* we've found a hole, just zero and go on */
		if (block_start == 0) {
			zero_user_page(page, page_offset, iosize, KM_USER0);
			set_extent_uptodate(tree, cur, cur + iosize - 1,
					    GFP_NOFS);
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}
		/* the get_extent function already copied into the page */
		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}

		ret = 0;
		if (tree->ops && tree->ops->readpage_io_hook) {
			ret = tree->ops->readpage_io_hook(page, cur,
							  cur + iosize - 1);
		}
		if (!ret) {
			ret = submit_extent_page(READ, tree, page,
						 sector, iosize, page_offset,
						 bdev, end_bio_extent_readpage);
		}
		if (ret)
			SetPageError(page);
		cur = cur + iosize;
		page_offset += iosize;
		nr++;
	}
	if (!nr) {
		if (!PageError(page))
			SetPageUptodate(page);
		unlock_page(page);
	}
	return 0;
}
EXPORT_SYMBOL(extent_read_full_page);

/*
 * the writepage semantics are similar to regular writepage.  extent
 * records are inserted to lock ranges in the tree, and as dirty areas
 * are found, they are marked writeback.  Then the lock bits are removed
 * and the end_io handler clears the writeback ranges
 */
int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
			   get_extent_t *get_extent,
			   struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	u64 start = page->index << PAGE_CACHE_SHIFT;
	u64 page_end = start + PAGE_CACHE_SIZE - 1;
	u64 end;
	u64 cur = start;
	u64 extent_offset;
	u64 last_byte = i_size_read(inode);
	u64 block_start;
	sector_t sector;
	struct extent_map *em;
	struct block_device *bdev;
	int ret;
	int nr = 0;
	size_t page_offset = 0;
	size_t iosize;
	size_t blocksize;
	loff_t i_size = i_size_read(inode);
	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
	u64 nr_delalloc;
	u64 delalloc_end;

	WARN_ON(!PageLocked(page));
	if (page->index > end_index) {
		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
		unlock_page(page);
		return 0;
	}

	if (page->index == end_index) {
		size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
		zero_user_page(page, offset,
			       PAGE_CACHE_SIZE - offset, KM_USER0);
	}

	if (!PagePrivate(page)) {
		SetPagePrivate(page);
		set_page_private(page, 1);
		WARN_ON(!page->mapping->a_ops->invalidatepage);
		page_cache_get(page);
	}

	lock_extent(tree, start, page_end, GFP_NOFS);
	nr_delalloc = find_lock_delalloc_range(tree, start, page_end + 1,
					       &delalloc_end,
					       128 * 1024 * 1024);
	if (nr_delalloc) {
		tree->ops->fill_delalloc(inode, start, delalloc_end);
		if (delalloc_end >= page_end + 1) {
			clear_extent_bit(tree, page_end + 1, delalloc_end,
					 EXTENT_LOCKED | EXTENT_DELALLOC,
					 1, 0, GFP_NOFS);
		}
		clear_extent_bit(tree, start, page_end, EXTENT_DELALLOC,
				 0, 0, GFP_NOFS);
		if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
			printk("found delalloc bits after clear extent_bit\n");
		}
	} else if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
		printk("found delalloc bits after find_delalloc_range returns 0\n");
	}

	end = page_end;
	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
		printk("found delalloc bits after lock_extent\n");
	}

	if (last_byte <= start) {
		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
		goto done;
	}

	set_extent_uptodate(tree, start, page_end, GFP_NOFS);
	blocksize = inode->i_sb->s_blocksize;

	while (cur <= end) {
		if (cur >= last_byte) {
			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
			break;
		}
		em = get_extent(inode, page, page_offset, cur, end, 0);
		if (IS_ERR(em) || !em) {
			SetPageError(page);
			break;
		}

		extent_offset = cur - em->start;
		BUG_ON(em->end < cur);
		BUG_ON(end < cur);
		iosize = min(em->end - cur, end - cur) + 1;
		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
		sector = (em->block_start + extent_offset) >> 9;
		bdev = em->bdev;
		block_start = em->block_start;
		free_extent_map(em);
		em = NULL;

		if (block_start == 0 || block_start == EXTENT_MAP_INLINE) {
			clear_extent_dirty(tree, cur,
					   cur + iosize - 1, GFP_NOFS);
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}

		/* leave this out until we have a page_mkwrite call */
		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
					 EXTENT_DIRTY, 0)) {
			cur = cur + iosize;
			page_offset += iosize;
			continue;
		}
		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
		ret = tree->ops->writepage_io_hook(page, cur, cur + iosize - 1);
		if (ret)
			SetPageError(page);
		else {
			set_range_writeback(tree, cur, cur + iosize - 1);
			ret = submit_extent_page(WRITE, tree, page, sector,
						 iosize, page_offset, bdev,
						 end_bio_extent_writepage);
			if (ret)
				SetPageError(page);
		}
		cur = cur + iosize;
		page_offset += iosize;
		nr++;
	}
done:
	WARN_ON(test_range_bit(tree, start, page_end, EXTENT_DIRTY, 0));
	unlock_extent(tree, start, page_end, GFP_NOFS);
	unlock_page(page);
	return 0;
}
EXPORT_SYMBOL(extent_write_full_page);

/*
 * basic invalidatepage code, this waits on any locked or writeback
 * ranges corresponding to the page, and then deletes any extent state
 * records from the tree
 */
int extent_invalidatepage(struct extent_map_tree *tree,
			  struct page *page, unsigned long offset)
{
	u64 start = (page->index << PAGE_CACHE_SHIFT);
	u64 end = start + PAGE_CACHE_SIZE - 1;
	size_t blocksize = page->mapping->host->i_sb->s_blocksize;

	start += (offset + blocksize - 1) & ~(blocksize - 1);
	if (start > end)
		return 0;

	lock_extent(tree, start, end, GFP_NOFS);
	wait_on_extent_writeback(tree, start, end);
	clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY,
			 1, 1, GFP_NOFS);
	return 0;
}
EXPORT_SYMBOL(extent_invalidatepage);

/*
 * simple commit_write call, set_range_dirty is used to mark both
 * the pages and the extent records as dirty
 */
int extent_commit_write(struct extent_map_tree *tree,
			struct inode *inode, struct page *page,
			unsigned from, unsigned to)
{
	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;

	if (!PagePrivate(page)) {
		SetPagePrivate(page);
		set_page_private(page, 1);
		WARN_ON(!page->mapping->a_ops->invalidatepage);
		page_cache_get(page);
	}

	set_page_dirty(page);

	if (pos > inode->i_size) {
		i_size_write(inode, pos);
		mark_inode_dirty(inode);
	}
	return 0;
}
EXPORT_SYMBOL(extent_commit_write);

int extent_prepare_write(struct extent_map_tree *tree,
			 struct inode *inode, struct page *page,
			 unsigned from, unsigned to, get_extent_t *get_extent)
{
	u64 page_start = page->index << PAGE_CACHE_SHIFT;
	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
	u64 block_start;
	u64 orig_block_start;
	u64 block_end;
	u64 cur_end;
	struct extent_map *em;
	unsigned blocksize = 1 << inode->i_blkbits;
	size_t page_offset = 0;
	size_t block_off_start;
	size_t block_off_end;
	int err = 0;
	int iocount = 0;
	int ret = 0;
	int isnew;

	if (!PagePrivate(page)) {
		SetPagePrivate(page);
		set_page_private(page, 1);
		WARN_ON(!page->mapping->a_ops->invalidatepage);
		page_cache_get(page);
	}
	block_start = (page_start + from) & ~((u64)blocksize - 1);
	block_end = (page_start + to - 1) | (blocksize - 1);
	orig_block_start = block_start;

	lock_extent(tree, page_start, page_end, GFP_NOFS);
	while (block_start <= block_end) {
		em = get_extent(inode, page, page_offset, block_start,
				block_end, 1);
		if (IS_ERR(em) || !em) {
			goto err;
		}
		cur_end = min(block_end, em->end);
		block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
		block_off_end = block_off_start + blocksize;
		isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);

		if (!PageUptodate(page) && isnew &&
		    (block_off_end > to || block_off_start < from)) {
			void *kaddr;

			kaddr = kmap_atomic(page, KM_USER0);
			if (block_off_end > to)
				memset(kaddr + to, 0, block_off_end - to);
			if (block_off_start < from)
				memset(kaddr + block_off_start, 0,
				       from - block_off_start);
			flush_dcache_page(page);
			kunmap_atomic(kaddr, KM_USER0);
		}
		if (!isnew && !PageUptodate(page) &&
		    (block_off_end > to || block_off_start < from) &&
		    !test_range_bit(tree, block_start, cur_end,
				    EXTENT_UPTODATE, 1)) {
			u64 sector;
			u64 extent_offset = block_start - em->start;
			size_t iosize;
			sector = (em->block_start + extent_offset) >> 9;
			iosize = (cur_end - block_start + blocksize - 1) &
				 ~((u64)blocksize - 1);
			/*
			 * we've already got the extent locked, but we
			 * need to split the state such that our end_bio
			 * handler can clear the lock.
			 */
			set_extent_bit(tree, block_start,
				       block_start + iosize - 1,
				       EXTENT_LOCKED, 0, NULL, GFP_NOFS);
			ret = submit_extent_page(READ, tree, page,
						 sector, iosize, page_offset,
						 em->bdev,
						 end_bio_extent_preparewrite);
			iocount++;
			block_start = block_start + iosize;
		} else {
			set_extent_uptodate(tree, block_start, cur_end,
					    GFP_NOFS);
			unlock_extent(tree, block_start, cur_end, GFP_NOFS);
			block_start = cur_end + 1;
		}
		page_offset = block_start & (PAGE_CACHE_SIZE - 1);
		free_extent_map(em);
	}
	if (iocount) {
		wait_extent_bit(tree, orig_block_start,
				block_end, EXTENT_LOCKED);
	}
	check_page_uptodate(tree, page);
err:
	/* FIXME, zero out newly allocated blocks on error */
	return err;
}
EXPORT_SYMBOL(extent_prepare_write);

/*
 * a helper for releasepage.  As long as there are no locked extents
 * in the range corresponding to the page, both state records and extent
 * map records are removed
 */
int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page)
{
	struct extent_map *em;
	u64 start = page->index << PAGE_CACHE_SHIFT;
	u64 end = start + PAGE_CACHE_SIZE - 1;
	u64 orig_start = start;
	int ret = 1;

	while (start <= end) {
		em = lookup_extent_mapping(tree, start, end);
		if (!em || IS_ERR(em))
			break;
		if (!test_range_bit(tree, em->start, em->end,
				    EXTENT_LOCKED, 0)) {
			remove_extent_mapping(tree, em);
			/* once for the rb tree */
			free_extent_map(em);
		}
		start = em->end + 1;
		/* once for us */
		free_extent_map(em);
	}
	if (test_range_bit(tree, orig_start, end, EXTENT_LOCKED, 0))
		ret = 0;
	else
		clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
				 1, 1, GFP_NOFS);
	return ret;
}
EXPORT_SYMBOL(try_release_extent_mapping);
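
/*
 * Example usage (an illustrative sketch, not part of this file's code):
 * a caller embeds an extent_map_tree, typically one per inode, and drives
 * it roughly as below.  'disk_block' is a placeholder value supplied by
 * the caller.
 *
 *	struct extent_map_tree tree;
 *	struct extent_map *em;
 *	int ret;
 *
 *	extent_map_tree_init(&tree, inode->i_mapping, GFP_NOFS);
 *
 *	em = alloc_extent_map(GFP_NOFS);
 *	em->start = 0;
 *	em->end = 4095;
 *	em->block_start = disk_block;
 *	em->block_end = disk_block + 4095;
 *	em->bdev = inode->i_sb->s_bdev;
 *	ret = add_extent_mapping(&tree, em);
 *	free_extent_map(em);	(drop the allocation ref; on success the
 *				 tree took its own reference)
 *
 *	lock_extent(&tree, 0, 4095, GFP_NOFS);
 *	em = lookup_extent_mapping(&tree, 0, 4095);
 *	if (em)
 *		free_extent_map(em);	(lookup took a reference)
 *	unlock_extent(&tree, 0, 4095, GFP_NOFS);
 */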