#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
#include <linux/page-flags.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include "extent_map.h"

/* temporary define until extent_map moves out of btrfs */
struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
				      unsigned long extra_flags,
				      void (*ctor)(void *, struct kmem_cache *,
						   unsigned long));

/* slab caches backing extent_map and extent_state allocations */
static struct kmem_cache *extent_map_cache;
static struct kmem_cache *extent_state_cache;

/*
 * Common layout prefix shared by struct extent_map and struct
 * extent_state so the rb-tree helpers below can operate on either.
 * [start, end] is an inclusive byte range; in_tree tracks whether the
 * node is currently linked into an rb-tree.
 */
struct tree_entry {
	u64 start;
	u64 end;
	int in_tree;
	struct rb_node rb_node;
};

/* bits for the extent state */
#define EXTENT_DIRTY 1
#define EXTENT_WRITEBACK (1 << 1)
#define EXTENT_UPTODATE (1 << 2)
#define EXTENT_LOCKED (1 << 3)
#define EXTENT_NEW (1 << 4)
#define EXTENT_DELALLOC (1 << 5)

/* states carrying any of these bits have IO in flight; never merge/split them */
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)

/*
 * create the two slab caches used by this file.
 * NOTE(review): the results are not checked for failure here; presumably
 * btrfs_cache_create cannot fail during early init — confirm.
 */
void __init extent_map_init(void)
{
	extent_map_cache = btrfs_cache_create("extent_map",
					    sizeof(struct extent_map),
					    SLAB_DESTROY_BY_RCU,
					    NULL);
	extent_state_cache = btrfs_cache_create("extent_state",
					    sizeof(struct extent_state),
					    SLAB_DESTROY_BY_RCU,
					    NULL);
}

/* tear down the slab caches created in extent_map_init() */
void __exit extent_map_exit(void)
{
	if (extent_map_cache)
		kmem_cache_destroy(extent_map_cache);
	if (extent_state_cache)
		kmem_cache_destroy(extent_state_cache);
}

/*
 * initialize an extent_map_tree: empty map and state trees, no ops,
 * a single rwlock protecting both.  'mask' is currently unused.
 */
void extent_map_tree_init(struct extent_map_tree *tree,
			  struct address_space *mapping, gfp_t mask)
{
	tree->map.rb_node = NULL;
	tree->state.rb_node = NULL;
	tree->ops = NULL;
	rwlock_init(&tree->lock);
	tree->mapping = mapping;
}
EXPORT_SYMBOL(extent_map_tree_init);

/*
 * allocate an extent_map with a single reference held by the caller.
 * Returns NULL (or an ERR_PTR passed through from the allocator) on
 * failure.
 */
struct extent_map *alloc_extent_map(gfp_t mask)
{
	struct extent_map *em;
	em = kmem_cache_alloc(extent_map_cache, mask);
	if (!em || IS_ERR(em))
		return em;
	em->in_tree = 0;
	atomic_set(&em->refs, 1);
	return em;
}
EXPORT_SYMBOL(alloc_extent_map);

/*
 * drop one reference on 'em'; frees it when the count reaches zero.
 * Freeing an extent_map still linked into a tree is a bug (WARN_ON).
 */
void free_extent_map(struct extent_map *em)
{
	if (!em)
		return;
	if (atomic_dec_and_test(&em->refs)) {
		WARN_ON(em->in_tree);
		kmem_cache_free(extent_map_cache, em);
	}
}
EXPORT_SYMBOL(free_extent_map);


/*
 * allocate an extent_state with a single reference, cleared state bits,
 * cleared private word and an initialized wait queue.
 */
struct extent_state *alloc_extent_state(gfp_t mask)
{
	struct extent_state *state;
	state = kmem_cache_alloc(extent_state_cache, mask);
	if (!state || IS_ERR(state))
		return state;
	state->state = 0;
	state->in_tree = 0;
	state->private = 0;
	atomic_set(&state->refs, 1);
	init_waitqueue_head(&state->wq);
	return state;
}
EXPORT_SYMBOL(alloc_extent_state);

/*
 * drop one reference on 'state'; frees it when the count reaches zero.
 * Freeing a state still linked into a tree is a bug (WARN_ON).
 */
void free_extent_state(struct extent_state *state)
{
	if (!state)
		return;
	if (atomic_dec_and_test(&state->refs)) {
		WARN_ON(state->in_tree);
		kmem_cache_free(extent_state_cache, state);
	}
}
EXPORT_SYMBOL(free_extent_state);

/*
 * link 'node' into 'root'.  'offset' is the node's search key (the
 * inclusive range end is used by the callers).  If 'offset' already
 * falls inside an existing entry's [start, end] range, nothing is
 * inserted and that entry's node is returned; NULL on success.
 */
static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
				   struct rb_node *node)
{
	struct rb_node ** p = &root->rb_node;
	struct rb_node * parent = NULL;
	struct tree_entry *entry;

	while(*p) {
		parent = *p;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (offset < entry->start)
			p = &(*p)->rb_left;
		else if (offset > entry->end)
			p = &(*p)->rb_right;
		else
			return parent;	/* offset inside an existing range */
	}

	entry = rb_entry(node, struct tree_entry, rb_node);
	entry->in_tree = 1;
	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

/*
 * find the entry whose [start, end] range contains 'offset'; returns
 * its node, or NULL if none.  When prev_ret is non-NULL it is also
 * filled with the first entry ending at or after 'offset' (NULL when
 * 'offset' is past every entry in the tree).
 */
static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
				     struct rb_node **prev_ret)
{
	struct rb_node * n = root->rb_node;
	struct rb_node *prev = NULL;
	struct tree_entry *entry;
	struct tree_entry *prev_entry = NULL;

	while(n) {
		entry = rb_entry(n, struct tree_entry, rb_node);
		prev = n;
		prev_entry = entry;

		if (offset < entry->start)
			n = n->rb_left;
		else if (offset > entry->end)
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;
	/* walk forward from the last node visited to the first entry
	 * ending at or after 'offset'.  rb_entry() on a NULL 'prev' is
	 * never dereferenced thanks to the short-circuit test. */
	while(prev && offset > prev_entry->end) {
		prev = rb_next(prev);
		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
	}
	*prev_ret = prev;
	return NULL;
}

/*
 * like __tree_search(), but falls back to the first entry after
 * 'offset' when no entry contains it.
 */
static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
{
	struct rb_node *prev;
	struct rb_node *ret;
	ret = __tree_search(root, offset, &prev);
	if (!ret)
		return prev;
	return ret;
}

/*
 * unlink the entry whose range contains 'offset' from 'root'.
 * Returns -ENOENT if no entry covers that offset.  No references are
 * dropped; the entry itself is untouched apart from in_tree.
 */
static int tree_delete(struct rb_root *root, u64 offset)
{
	struct rb_node *node;
	struct tree_entry *entry;

	node = __tree_search(root, offset, NULL);
	if (!node)
		return -ENOENT;
	entry = rb_entry(node, struct tree_entry, rb_node);
	entry->in_tree = 0;
	rb_erase(node, root);
	return 0;
}

/*
 * add_extent_mapping tries a simple backward merge with existing
 * mappings.  The extent_map struct passed in will be inserted into
 * the tree directly (no copies made, just a reference taken).
 */
int add_extent_mapping(struct extent_map_tree *tree,
		       struct extent_map *em)
{
	int ret = 0;
	struct extent_map *prev = NULL;
	struct rb_node *rb;

	write_lock_irq(&tree->lock);
	/* keyed by end offset; a non-NULL return means the range overlaps
	 * an existing mapping */
	rb = tree_insert(&tree->map, em->end, &em->rb_node);
	if (rb) {
		prev = rb_entry(rb, struct extent_map, rb_node);
		/* NOTE(review): printk without a KERN_* level */
		printk("found extent map %Lu %Lu on insert of %Lu %Lu\n", prev->start, prev->end, em->start, em->end);
		ret = -EEXIST;
		goto out;
	}
	atomic_inc(&em->refs);	/* the tree now holds its own reference */
	if (em->start != 0) {
		rb = rb_prev(&em->rb_node);
		if (rb)
			prev = rb_entry(rb, struct extent_map, rb_node);
		/* merge backward when byte-contiguous and either both are
		 * holes (block_start == 0) or contiguous on disk too */
		if (prev && prev->end + 1 == em->start &&
		    ((em->block_start == 0 && prev->block_start == 0) ||
		     (em->block_start == prev->block_end + 1))) {
			em->start = prev->start;
			em->block_start = prev->block_start;
			rb_erase(&prev->rb_node, &tree->map);
			prev->in_tree = 0;
			free_extent_map(prev);
		}
	}
out:
	write_unlock_irq(&tree->lock);
	return ret;
}
EXPORT_SYMBOL(add_extent_mapping);

/*
 * lookup_extent_mapping returns the first extent_map struct in the
 * tree that intersects the [start, end] (inclusive) range.  There may
 * be additional objects in the tree that intersect, so check the object
 * returned carefully to make sure you don't need additional lookups.
 */
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
					 u64 start, u64 end)
{
	struct extent_map *em;
	struct rb_node *rb_node;

	read_lock_irq(&tree->lock);
	rb_node = tree_search(&tree->map, start);
	if (!rb_node) {
		em = NULL;
		goto out;
	}
	if (IS_ERR(rb_node)) {
		em = ERR_PTR(PTR_ERR(rb_node));
		goto out;
	}
	em = rb_entry(rb_node, struct extent_map, rb_node);
	/* tree_search may hand back a neighbour; reject non-intersecting hits */
	if (em->end < start || em->start > end) {
		em = NULL;
		goto out;
	}
	/* reference for the caller; released via free_extent_map() */
	atomic_inc(&em->refs);
out:
	read_unlock_irq(&tree->lock);
	return em;
}
EXPORT_SYMBOL(lookup_extent_mapping);

/*
 * removes an extent_map struct from the tree.  No reference counts are
 * dropped, and no checks are done to see if the range is in use
 */
int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
	int ret;

	write_lock_irq(&tree->lock);
	ret = tree_delete(&tree->map, em->end);
	write_unlock_irq(&tree->lock);
	return ret;
}
EXPORT_SYMBOL(remove_extent_mapping);

/*
 * utility function to look for merge candidates inside a given range.
 * Any extents with matching state are merged together into a single
 * extent in the tree.  Extents with EXTENT_IO in their state field
 * are not merged because the end_io handlers need to be able to do
 * operations on them without sleeping (or doing allocations/splits).
 *
 * This should be called with the tree lock held.
 */
static int merge_state(struct extent_map_tree *tree,
		       struct extent_state *state)
{
	struct extent_state *other;
	struct rb_node *other_node;

	/* never merge ranges with IO in flight (see EXTENT_IOBITS) */
	if (state->state & EXTENT_IOBITS)
		return 0;

	/* absorb the byte-adjacent neighbour on the left into 'state' */
	other_node = rb_prev(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->end == state->start - 1 &&
		    other->state == state->state) {
			state->start = other->start;
			other->in_tree = 0;
			rb_erase(&other->rb_node, &tree->state);
			free_extent_state(other);
		}
	}
	/* then fold 'state' itself into its right neighbour; 'state' may
	 * be freed here, so callers must not touch it afterwards */
	other_node = rb_next(&state->rb_node);
	if (other_node) {
		other = rb_entry(other_node, struct extent_state, rb_node);
		if (other->start == state->end + 1 &&
		    other->state == state->state) {
			other->start = state->start;
			state->in_tree = 0;
			rb_erase(&state->rb_node, &tree->state);
			free_extent_state(state);
		}
	}
	return 0;
}

/*
 * insert an extent_state struct into the tree.  'bits' are set on the
 * struct before it is inserted.
 *
 * This may return -EEXIST if the extent is already there, in which case the
 * state struct is freed.
 *
 * The tree lock is not taken internally.  This is a utility function and
 * probably isn't what you want to call (see set/clear_extent_bit).
 */
static int insert_state(struct extent_map_tree *tree,
			struct extent_state *state, u64 start, u64 end,
			int bits)
{
	struct rb_node *node;

	if (end < start) {
		printk("end < start %Lu %Lu\n", end, start);
		WARN_ON(1);
	}
	state->state |= bits;
	state->start = start;
	state->end = end;
	/* ranges are inclusive, so a page-aligned 'end' is suspicious.
	 * NOTE(review): the 4095 mask assumes 4k pages — confirm. */
	if ((end & 4095) == 0) {
		printk("insert state %Lu %Lu strange end\n", start, end);
		WARN_ON(1);
	}
	node = tree_insert(&tree->state, end, &state->rb_node);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
		free_extent_state(state);
		return -EEXIST;
	}
	merge_state(tree, state);
	return 0;
}

/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * The tree locks are not taken by this function.  They need to be held
 * by the caller.
 */
static int split_state(struct extent_map_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *node;
	/* first half goes into prealloc, orig keeps the tail */
	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;
	/* same page-aligned-end sanity check as insert_state() */
	if ((prealloc->end & 4095) == 0) {
		printk("insert state %Lu %Lu strange end\n", prealloc->start,
		       prealloc->end);
		WARN_ON(1);
	}
	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
	if (node) {
		struct extent_state *found;
		found = rb_entry(node, struct extent_state, rb_node);
		printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
		free_extent_state(prealloc);
		return -EEXIST;
	}
	return 0;
}

/*
 * utility function to clear some bits in an extent state struct.
 * it will optionally wake up any one waiting on this state (wake == 1), or
 * forcibly remove the state from the tree (delete == 1).
 *
 * If no bits are set on the state struct after clearing things, the
 * struct is freed and removed from the tree
 */
static int clear_state_bit(struct extent_map_tree *tree,
			   struct extent_state *state, int bits, int wake,
			   int delete)
{
	/* report which of 'bits' were actually set before clearing */
	int ret = state->state & bits;
	state->state &= ~bits;
	if (wake)
		wake_up(&state->wq);
	if (delete || state->state == 0) {
		if (state->in_tree) {
			rb_erase(&state->rb_node, &tree->state);
			state->in_tree = 0;
			free_extent_state(state);
		} else {
			WARN_ON(1);
		}
	} else {
		merge_state(tree, state);
	}
	return ret;
}

/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * This takes the tree lock, and returns < 0 on error, > 0 if any of the
 * bits were already set, or zero if none of the bits were already set.
 */
int clear_extent_bit(struct extent_map_tree *tree, u64 start, u64 end,
		     int bits, int wake, int delete, gfp_t mask)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	unsigned long flags;
	int err;
	int set = 0;

again:
	/* preallocate a state for a possible split while we may still sleep.
	 * NOTE(review): with a non-blocking mask prealloc stays NULL and the
	 * split paths below would dereference it — confirm all callers pass
	 * a __GFP_WAIT mask. */
	if (!prealloc && (mask & __GFP_WAIT)) {
		prealloc = alloc_extent_state(mask);
		if (!prealloc)
			return -ENOMEM;
	}

	write_lock_irqsave(&tree->lock, flags);
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(&tree->state, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);

	/*
	 * | ---- desired range ---- |
	 * | state | or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		err = split_state(tree, state, prealloc, start);
		BUG_ON(err == -EEXIST);
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			start = state->end + 1;
			set |= clear_state_bit(tree, state, bits,
					       wake, delete);
		} else {
			start = state->start;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 * | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		err = split_state(tree, state, prealloc, end + 1);
		BUG_ON(err == -EEXIST);

		if (wake)
			wake_up(&state->wq);
		set |= clear_state_bit(tree, prealloc, bits,
				       wake, delete);
		prealloc = NULL;
		goto out;
	}

	/* state lies entirely inside the range: clear and move past it */
	start = state->end + 1;
	set |= clear_state_bit(tree, state, bits, wake, delete);
	goto search_again;

out:
	write_unlock_irqrestore(&tree->lock, flags);
	if (prealloc)
		free_extent_state(prealloc);

	return set;

search_again:
	/* NOTE(review): set_extent_bit uses 'start > end' here; '>=' looks
	 * inconsistent for an inclusive range — confirm intent */
	if (start >= end)
		goto out;
	write_unlock_irqrestore(&tree->lock, flags);
	if (mask & __GFP_WAIT)
		cond_resched();
	goto again;
}
EXPORT_SYMBOL(clear_extent_bit);

/*
 * sleep until 'state' is woken.  Expects the tree lock read-held on
 * entry; drops it across schedule() and reacquires it before returning.
 */
static int wait_on_state(struct extent_map_tree *tree,
			 struct extent_state *state)
{
	DEFINE_WAIT(wait);
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	read_unlock_irq(&tree->lock);
	schedule();
	read_lock_irq(&tree->lock);
	finish_wait(&state->wq, &wait);
	return 0;
}

/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
int wait_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits)
{
	struct extent_state *state;
	struct rb_node *node;

	read_lock_irq(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(&tree->state, start);
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			/* hold a ref so the state survives while we sleep;
			 * restart the search afterwards since the tree may
			 * have changed */
			start = state->start;
			atomic_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		if (need_resched()) {
			read_unlock_irq(&tree->lock);
			cond_resched();
			read_lock_irq(&tree->lock);
		}
	}
out:
	read_unlock_irq(&tree->lock);
	return 0;
}
EXPORT_SYMBOL(wait_extent_bit);

/*
 * set some bits on a range in the tree.  This may require allocations
 * or sleeping, so the gfp mask is used to indicate what is allowed.
 *
 * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
 * range already has the desired bits set.  The start of the existing
 * range is returned in failed_start in this case.
 *
 * [start, end] is inclusive
 * This takes the tree lock.
625 */ 626 int set_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits, 627 int exclusive, u64 *failed_start, gfp_t mask) 628 { 629 struct extent_state *state; 630 struct extent_state *prealloc = NULL; 631 struct rb_node *node; 632 unsigned long flags; 633 int err = 0; 634 int set; 635 u64 last_start; 636 u64 last_end; 637 again: 638 if (!prealloc && (mask & __GFP_WAIT)) { 639 prealloc = alloc_extent_state(mask); 640 if (!prealloc) 641 return -ENOMEM; 642 } 643 644 write_lock_irqsave(&tree->lock, flags); 645 /* 646 * this search will find all the extents that end after 647 * our range starts. 648 */ 649 node = tree_search(&tree->state, start); 650 if (!node) { 651 err = insert_state(tree, prealloc, start, end, bits); 652 prealloc = NULL; 653 BUG_ON(err == -EEXIST); 654 goto out; 655 } 656 657 state = rb_entry(node, struct extent_state, rb_node); 658 last_start = state->start; 659 last_end = state->end; 660 661 /* 662 * | ---- desired range ---- | 663 * | state | 664 * 665 * Just lock what we found and keep going 666 */ 667 if (state->start == start && state->end <= end) { 668 set = state->state & bits; 669 if (set && exclusive) { 670 *failed_start = state->start; 671 err = -EEXIST; 672 goto out; 673 } 674 state->state |= bits; 675 start = state->end + 1; 676 merge_state(tree, state); 677 goto search_again; 678 } 679 680 /* 681 * | ---- desired range ---- | 682 * | state | 683 * or 684 * | ------------- state -------------- | 685 * 686 * We need to split the extent we found, and may flip bits on 687 * second half. 688 * 689 * If the extent we found extends past our 690 * range, we just split and search again. It'll get split 691 * again the next time though. 692 * 693 * If the extent we found is inside our range, we set the 694 * desired bit on it. 
695 */ 696 if (state->start < start) { 697 set = state->state & bits; 698 if (exclusive && set) { 699 *failed_start = start; 700 err = -EEXIST; 701 goto out; 702 } 703 err = split_state(tree, state, prealloc, start); 704 BUG_ON(err == -EEXIST); 705 prealloc = NULL; 706 if (err) 707 goto out; 708 if (state->end <= end) { 709 state->state |= bits; 710 start = state->end + 1; 711 merge_state(tree, state); 712 } else { 713 start = state->start; 714 } 715 goto search_again; 716 } 717 /* 718 * | ---- desired range ---- | 719 * | state | or | state | 720 * 721 * There's a hole, we need to insert something in it and 722 * ignore the extent we found. 723 */ 724 if (state->start > start) { 725 u64 this_end; 726 if (end < last_start) 727 this_end = end; 728 else 729 this_end = last_start -1; 730 err = insert_state(tree, prealloc, start, this_end, 731 bits); 732 prealloc = NULL; 733 BUG_ON(err == -EEXIST); 734 if (err) 735 goto out; 736 start = this_end + 1; 737 goto search_again; 738 } 739 /* 740 * | ---- desired range ---- | 741 * | state | 742 * We need to split the extent, and set the bit 743 * on the first half 744 */ 745 if (state->start <= end && state->end > end) { 746 set = state->state & bits; 747 if (exclusive && set) { 748 *failed_start = start; 749 err = -EEXIST; 750 goto out; 751 } 752 err = split_state(tree, state, prealloc, end + 1); 753 BUG_ON(err == -EEXIST); 754 755 prealloc->state |= bits; 756 merge_state(tree, prealloc); 757 prealloc = NULL; 758 goto out; 759 } 760 761 goto search_again; 762 763 out: 764 write_unlock_irqrestore(&tree->lock, flags); 765 if (prealloc) 766 free_extent_state(prealloc); 767 768 return err; 769 770 search_again: 771 if (start > end) 772 goto out; 773 write_unlock_irqrestore(&tree->lock, flags); 774 if (mask & __GFP_WAIT) 775 cond_resched(); 776 goto again; 777 } 778 EXPORT_SYMBOL(set_extent_bit); 779 780 /* wrappers around set/clear extent bit */ 781 int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end, 782 
gfp_t mask) 783 { 784 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, 785 mask); 786 } 787 EXPORT_SYMBOL(set_extent_dirty); 788 789 int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end, 790 gfp_t mask) 791 { 792 return set_extent_bit(tree, start, end, 793 EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL, 794 mask); 795 } 796 EXPORT_SYMBOL(set_extent_delalloc); 797 798 int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end, 799 gfp_t mask) 800 { 801 return clear_extent_bit(tree, start, end, 802 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask); 803 } 804 EXPORT_SYMBOL(clear_extent_dirty); 805 806 int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end, 807 gfp_t mask) 808 { 809 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, 810 mask); 811 } 812 EXPORT_SYMBOL(set_extent_new); 813 814 int clear_extent_new(struct extent_map_tree *tree, u64 start, u64 end, 815 gfp_t mask) 816 { 817 return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask); 818 } 819 EXPORT_SYMBOL(clear_extent_new); 820 821 int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end, 822 gfp_t mask) 823 { 824 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, 825 mask); 826 } 827 EXPORT_SYMBOL(set_extent_uptodate); 828 829 int clear_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end, 830 gfp_t mask) 831 { 832 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); 833 } 834 EXPORT_SYMBOL(clear_extent_uptodate); 835 836 int set_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end, 837 gfp_t mask) 838 { 839 return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, 840 0, NULL, mask); 841 } 842 EXPORT_SYMBOL(set_extent_writeback); 843 844 int clear_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end, 845 gfp_t mask) 846 { 847 return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask); 848 } 849 EXPORT_SYMBOL(clear_extent_writeback); 850 851 
int wait_on_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end) 852 { 853 return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK); 854 } 855 EXPORT_SYMBOL(wait_on_extent_writeback); 856 857 /* 858 * locks a range in ascending order, waiting for any locked regions 859 * it hits on the way. [start,end] are inclusive, and this will sleep. 860 */ 861 int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask) 862 { 863 int err; 864 u64 failed_start; 865 while (1) { 866 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 867 &failed_start, mask); 868 if (err == -EEXIST && (mask & __GFP_WAIT)) { 869 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 870 start = failed_start; 871 } else { 872 break; 873 } 874 WARN_ON(start > end); 875 } 876 return err; 877 } 878 EXPORT_SYMBOL(lock_extent); 879 880 int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end, 881 gfp_t mask) 882 { 883 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask); 884 } 885 EXPORT_SYMBOL(unlock_extent); 886 887 /* 888 * helper function to set pages and extents in the tree dirty 889 */ 890 int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end) 891 { 892 unsigned long index = start >> PAGE_CACHE_SHIFT; 893 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 894 struct page *page; 895 896 while (index <= end_index) { 897 page = find_get_page(tree->mapping, index); 898 BUG_ON(!page); 899 __set_page_dirty_nobuffers(page); 900 page_cache_release(page); 901 index++; 902 } 903 set_extent_dirty(tree, start, end, GFP_NOFS); 904 return 0; 905 } 906 EXPORT_SYMBOL(set_range_dirty); 907 908 /* 909 * helper function to set both pages and extents in the tree writeback 910 */ 911 int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end) 912 { 913 unsigned long index = start >> PAGE_CACHE_SHIFT; 914 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 915 struct page *page; 916 917 while (index <= end_index) { 918 page = 
find_get_page(tree->mapping, index); 919 BUG_ON(!page); 920 set_page_writeback(page); 921 page_cache_release(page); 922 index++; 923 } 924 set_extent_writeback(tree, start, end, GFP_NOFS); 925 return 0; 926 } 927 EXPORT_SYMBOL(set_range_writeback); 928 929 u64 find_lock_delalloc_range(struct extent_map_tree *tree, 930 u64 start, u64 lock_start, u64 *end, u64 max_bytes) 931 { 932 struct rb_node *node; 933 struct extent_state *state; 934 u64 cur_start = start; 935 u64 found = 0; 936 u64 total_bytes = 0; 937 938 write_lock_irq(&tree->lock); 939 /* 940 * this search will find all the extents that end after 941 * our range starts. 942 */ 943 search_again: 944 node = tree_search(&tree->state, cur_start); 945 if (!node || IS_ERR(node)) { 946 goto out; 947 } 948 949 while(1) { 950 state = rb_entry(node, struct extent_state, rb_node); 951 if (state->start != cur_start) { 952 goto out; 953 } 954 if (!(state->state & EXTENT_DELALLOC)) { 955 goto out; 956 } 957 if (state->start >= lock_start) { 958 if (state->state & EXTENT_LOCKED) { 959 DEFINE_WAIT(wait); 960 atomic_inc(&state->refs); 961 write_unlock_irq(&tree->lock); 962 schedule(); 963 write_lock_irq(&tree->lock); 964 finish_wait(&state->wq, &wait); 965 free_extent_state(state); 966 goto search_again; 967 } 968 state->state |= EXTENT_LOCKED; 969 } 970 found++; 971 *end = state->end; 972 cur_start = state->end + 1; 973 node = rb_next(node); 974 if (!node) 975 break; 976 total_bytes = state->end - state->start + 1; 977 if (total_bytes >= max_bytes) 978 break; 979 } 980 out: 981 write_unlock_irq(&tree->lock); 982 return found; 983 } 984 985 /* 986 * helper function to lock both pages and extents in the tree. 987 * pages must be locked first. 
988 */ 989 int lock_range(struct extent_map_tree *tree, u64 start, u64 end) 990 { 991 unsigned long index = start >> PAGE_CACHE_SHIFT; 992 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 993 struct page *page; 994 int err; 995 996 while (index <= end_index) { 997 page = grab_cache_page(tree->mapping, index); 998 if (!page) { 999 err = -ENOMEM; 1000 goto failed; 1001 } 1002 if (IS_ERR(page)) { 1003 err = PTR_ERR(page); 1004 goto failed; 1005 } 1006 index++; 1007 } 1008 lock_extent(tree, start, end, GFP_NOFS); 1009 return 0; 1010 1011 failed: 1012 /* 1013 * we failed above in getting the page at 'index', so we undo here 1014 * up to but not including the page at 'index' 1015 */ 1016 end_index = index; 1017 index = start >> PAGE_CACHE_SHIFT; 1018 while (index < end_index) { 1019 page = find_get_page(tree->mapping, index); 1020 unlock_page(page); 1021 page_cache_release(page); 1022 index++; 1023 } 1024 return err; 1025 } 1026 EXPORT_SYMBOL(lock_range); 1027 1028 /* 1029 * helper function to unlock both pages and extents in the tree. 1030 */ 1031 int unlock_range(struct extent_map_tree *tree, u64 start, u64 end) 1032 { 1033 unsigned long index = start >> PAGE_CACHE_SHIFT; 1034 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1035 struct page *page; 1036 1037 while (index <= end_index) { 1038 page = find_get_page(tree->mapping, index); 1039 unlock_page(page); 1040 page_cache_release(page); 1041 index++; 1042 } 1043 unlock_extent(tree, start, end, GFP_NOFS); 1044 return 0; 1045 } 1046 EXPORT_SYMBOL(unlock_range); 1047 1048 int set_state_private(struct extent_map_tree *tree, u64 start, u64 private) 1049 { 1050 struct rb_node *node; 1051 struct extent_state *state; 1052 int ret = 0; 1053 1054 write_lock_irq(&tree->lock); 1055 /* 1056 * this search will find all the extents that end after 1057 * our range starts. 
1058 */ 1059 node = tree_search(&tree->state, start); 1060 if (!node || IS_ERR(node)) { 1061 ret = -ENOENT; 1062 goto out; 1063 } 1064 state = rb_entry(node, struct extent_state, rb_node); 1065 if (state->start != start) { 1066 ret = -ENOENT; 1067 goto out; 1068 } 1069 state->private = private; 1070 out: 1071 write_unlock_irq(&tree->lock); 1072 return ret; 1073 1074 } 1075 1076 int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private) 1077 { 1078 struct rb_node *node; 1079 struct extent_state *state; 1080 int ret = 0; 1081 1082 read_lock_irq(&tree->lock); 1083 /* 1084 * this search will find all the extents that end after 1085 * our range starts. 1086 */ 1087 node = tree_search(&tree->state, start); 1088 if (!node || IS_ERR(node)) { 1089 ret = -ENOENT; 1090 goto out; 1091 } 1092 state = rb_entry(node, struct extent_state, rb_node); 1093 if (state->start != start) { 1094 ret = -ENOENT; 1095 goto out; 1096 } 1097 *private = state->private; 1098 out: 1099 read_unlock_irq(&tree->lock); 1100 return ret; 1101 } 1102 1103 /* 1104 * searches a range in the state tree for a given mask. 1105 * If 'filled' == 1, this returns 1 only if ever extent in the tree 1106 * has the bits set. Otherwise, 1 is returned if any bit in the 1107 * range is found set. 
1108 */ 1109 static int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end, 1110 int bits, int filled) 1111 { 1112 struct extent_state *state = NULL; 1113 struct rb_node *node; 1114 int bitset = 0; 1115 1116 read_lock_irq(&tree->lock); 1117 node = tree_search(&tree->state, start); 1118 while (node && start <= end) { 1119 state = rb_entry(node, struct extent_state, rb_node); 1120 if (state->start > end) 1121 break; 1122 1123 if (filled && state->start > start) { 1124 bitset = 0; 1125 break; 1126 } 1127 if (state->state & bits) { 1128 bitset = 1; 1129 if (!filled) 1130 break; 1131 } else if (filled) { 1132 bitset = 0; 1133 break; 1134 } 1135 start = state->end + 1; 1136 if (start > end) 1137 break; 1138 node = rb_next(node); 1139 } 1140 read_unlock_irq(&tree->lock); 1141 return bitset; 1142 } 1143 1144 /* 1145 * helper function to set a given page up to date if all the 1146 * extents in the tree for that page are up to date 1147 */ 1148 static int check_page_uptodate(struct extent_map_tree *tree, 1149 struct page *page) 1150 { 1151 u64 start = page->index << PAGE_CACHE_SHIFT; 1152 u64 end = start + PAGE_CACHE_SIZE - 1; 1153 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) 1154 SetPageUptodate(page); 1155 return 0; 1156 } 1157 1158 /* 1159 * helper function to unlock a page if all the extents in the tree 1160 * for that page are unlocked 1161 */ 1162 static int check_page_locked(struct extent_map_tree *tree, 1163 struct page *page) 1164 { 1165 u64 start = page->index << PAGE_CACHE_SHIFT; 1166 u64 end = start + PAGE_CACHE_SIZE - 1; 1167 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) 1168 unlock_page(page); 1169 return 0; 1170 } 1171 1172 /* 1173 * helper function to end page writeback if all the extents 1174 * in the tree for that page are done with writeback 1175 */ 1176 static int check_page_writeback(struct extent_map_tree *tree, 1177 struct page *page) 1178 { 1179 u64 start = page->index << PAGE_CACHE_SHIFT; 1180 u64 end = start + 
PAGE_CACHE_SIZE - 1; 1181 if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0)) 1182 end_page_writeback(page); 1183 return 0; 1184 } 1185 1186 /* lots and lots of room for performance fixes in the end_bio funcs */ 1187 1188 /* 1189 * after a writepage IO is done, we need to: 1190 * clear the uptodate bits on error 1191 * clear the writeback bits in the extent tree for this IO 1192 * end_page_writeback if the page has no more pending IO 1193 * 1194 * Scheduling is not allowed, so the extent state tree is expected 1195 * to have one and only one object corresponding to this IO. 1196 */ 1197 static int end_bio_extent_writepage(struct bio *bio, 1198 unsigned int bytes_done, int err) 1199 { 1200 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1201 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1202 struct extent_map_tree *tree = bio->bi_private; 1203 u64 start; 1204 u64 end; 1205 int whole_page; 1206 1207 if (bio->bi_size) 1208 return 1; 1209 1210 do { 1211 struct page *page = bvec->bv_page; 1212 start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 1213 end = start + bvec->bv_len - 1; 1214 1215 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 1216 whole_page = 1; 1217 else 1218 whole_page = 0; 1219 1220 if (--bvec >= bio->bi_io_vec) 1221 prefetchw(&bvec->bv_page->flags); 1222 1223 if (!uptodate) { 1224 clear_extent_uptodate(tree, start, end, GFP_ATOMIC); 1225 ClearPageUptodate(page); 1226 SetPageError(page); 1227 } 1228 clear_extent_writeback(tree, start, end, GFP_ATOMIC); 1229 1230 if (whole_page) 1231 end_page_writeback(page); 1232 else 1233 check_page_writeback(tree, page); 1234 if (tree->ops && tree->ops->writepage_end_io_hook) 1235 tree->ops->writepage_end_io_hook(page, start, end); 1236 } while (bvec >= bio->bi_io_vec); 1237 1238 bio_put(bio); 1239 return 0; 1240 } 1241 1242 /* 1243 * after a readpage IO is done, we need to: 1244 * clear the uptodate bits on error 1245 * set the uptodate bits if things worked 1246 
* set the page up to date if all extents in the tree are uptodate 1247 * clear the lock bit in the extent tree 1248 * unlock the page if there are no other extents locked for it 1249 * 1250 * Scheduling is not allowed, so the extent state tree is expected 1251 * to have one and only one object corresponding to this IO. 1252 */ 1253 static int end_bio_extent_readpage(struct bio *bio, 1254 unsigned int bytes_done, int err) 1255 { 1256 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1257 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1258 struct extent_map_tree *tree = bio->bi_private; 1259 u64 start; 1260 u64 end; 1261 int whole_page; 1262 int ret; 1263 1264 if (bio->bi_size) 1265 return 1; 1266 1267 do { 1268 struct page *page = bvec->bv_page; 1269 start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 1270 end = start + bvec->bv_len - 1; 1271 1272 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 1273 whole_page = 1; 1274 else 1275 whole_page = 0; 1276 1277 if (--bvec >= bio->bi_io_vec) 1278 prefetchw(&bvec->bv_page->flags); 1279 1280 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 1281 ret = tree->ops->readpage_end_io_hook(page, start, end); 1282 if (ret) 1283 uptodate = 0; 1284 } 1285 if (uptodate) { 1286 set_extent_uptodate(tree, start, end, GFP_ATOMIC); 1287 if (whole_page) 1288 SetPageUptodate(page); 1289 else 1290 check_page_uptodate(tree, page); 1291 } else { 1292 ClearPageUptodate(page); 1293 SetPageError(page); 1294 } 1295 1296 unlock_extent(tree, start, end, GFP_ATOMIC); 1297 1298 if (whole_page) 1299 unlock_page(page); 1300 else 1301 check_page_locked(tree, page); 1302 } while (bvec >= bio->bi_io_vec); 1303 1304 bio_put(bio); 1305 return 0; 1306 } 1307 1308 /* 1309 * IO done from prepare_write is pretty simple, we just unlock 1310 * the structs in the extent tree when done, and set the uptodate bits 1311 * as appropriate. 
1312 */ 1313 static int end_bio_extent_preparewrite(struct bio *bio, 1314 unsigned int bytes_done, int err) 1315 { 1316 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1317 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1318 struct extent_map_tree *tree = bio->bi_private; 1319 u64 start; 1320 u64 end; 1321 1322 if (bio->bi_size) 1323 return 1; 1324 1325 do { 1326 struct page *page = bvec->bv_page; 1327 start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 1328 end = start + bvec->bv_len - 1; 1329 1330 if (--bvec >= bio->bi_io_vec) 1331 prefetchw(&bvec->bv_page->flags); 1332 1333 if (uptodate) { 1334 set_extent_uptodate(tree, start, end, GFP_ATOMIC); 1335 } else { 1336 ClearPageUptodate(page); 1337 SetPageError(page); 1338 } 1339 1340 unlock_extent(tree, start, end, GFP_ATOMIC); 1341 1342 } while (bvec >= bio->bi_io_vec); 1343 1344 bio_put(bio); 1345 return 0; 1346 } 1347 1348 static int submit_extent_page(int rw, struct extent_map_tree *tree, 1349 struct page *page, sector_t sector, 1350 size_t size, unsigned long offset, 1351 struct block_device *bdev, 1352 bio_end_io_t end_io_func) 1353 { 1354 struct bio *bio; 1355 int ret = 0; 1356 1357 bio = bio_alloc(GFP_NOIO, 1); 1358 1359 bio->bi_sector = sector; 1360 bio->bi_bdev = bdev; 1361 bio->bi_io_vec[0].bv_page = page; 1362 bio->bi_io_vec[0].bv_len = size; 1363 bio->bi_io_vec[0].bv_offset = offset; 1364 1365 bio->bi_vcnt = 1; 1366 bio->bi_idx = 0; 1367 bio->bi_size = size; 1368 1369 bio->bi_end_io = end_io_func; 1370 bio->bi_private = tree; 1371 1372 bio_get(bio); 1373 submit_bio(rw, bio); 1374 1375 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1376 ret = -EOPNOTSUPP; 1377 1378 bio_put(bio); 1379 return ret; 1380 } 1381 1382 /* 1383 * basic readpage implementation. 
Locked extent state structs are inserted 1384 * into the tree that are removed when the IO is done (by the end_io 1385 * handlers) 1386 */ 1387 int extent_read_full_page(struct extent_map_tree *tree, struct page *page, 1388 get_extent_t *get_extent) 1389 { 1390 struct inode *inode = page->mapping->host; 1391 u64 start = page->index << PAGE_CACHE_SHIFT; 1392 u64 page_end = start + PAGE_CACHE_SIZE - 1; 1393 u64 end; 1394 u64 cur = start; 1395 u64 extent_offset; 1396 u64 last_byte = i_size_read(inode); 1397 u64 block_start; 1398 u64 cur_end; 1399 sector_t sector; 1400 struct extent_map *em; 1401 struct block_device *bdev; 1402 int ret; 1403 int nr = 0; 1404 size_t page_offset = 0; 1405 size_t iosize; 1406 size_t blocksize = inode->i_sb->s_blocksize; 1407 1408 if (!PagePrivate(page)) { 1409 SetPagePrivate(page); 1410 WARN_ON(!page->mapping->a_ops->invalidatepage); 1411 set_page_private(page, 1); 1412 page_cache_get(page); 1413 } 1414 1415 end = page_end; 1416 lock_extent(tree, start, end, GFP_NOFS); 1417 1418 while (cur <= end) { 1419 if (cur >= last_byte) { 1420 iosize = PAGE_CACHE_SIZE - page_offset; 1421 zero_user_page(page, page_offset, iosize, KM_USER0); 1422 set_extent_uptodate(tree, cur, cur + iosize - 1, 1423 GFP_NOFS); 1424 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 1425 break; 1426 } 1427 em = get_extent(inode, page, page_offset, cur, end, 0); 1428 if (IS_ERR(em) || !em) { 1429 SetPageError(page); 1430 unlock_extent(tree, cur, end, GFP_NOFS); 1431 break; 1432 } 1433 1434 extent_offset = cur - em->start; 1435 BUG_ON(em->end < cur); 1436 BUG_ON(end < cur); 1437 1438 iosize = min(em->end - cur, end - cur) + 1; 1439 cur_end = min(em->end, end); 1440 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 1441 sector = (em->block_start + extent_offset) >> 9; 1442 bdev = em->bdev; 1443 block_start = em->block_start; 1444 free_extent_map(em); 1445 em = NULL; 1446 1447 /* we've found a hole, just zero and go on */ 1448 if (block_start == 0) { 1449 
zero_user_page(page, page_offset, iosize, KM_USER0); 1450 set_extent_uptodate(tree, cur, cur + iosize - 1, 1451 GFP_NOFS); 1452 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 1453 cur = cur + iosize; 1454 page_offset += iosize; 1455 continue; 1456 } 1457 /* the get_extent function already copied into the page */ 1458 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) { 1459 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 1460 cur = cur + iosize; 1461 page_offset += iosize; 1462 continue; 1463 } 1464 1465 ret = 0; 1466 if (tree->ops && tree->ops->readpage_io_hook) { 1467 ret = tree->ops->readpage_io_hook(page, cur, 1468 cur + iosize - 1); 1469 } 1470 if (!ret) { 1471 ret = submit_extent_page(READ, tree, page, 1472 sector, iosize, page_offset, 1473 bdev, end_bio_extent_readpage); 1474 } 1475 if (ret) 1476 SetPageError(page); 1477 cur = cur + iosize; 1478 page_offset += iosize; 1479 nr++; 1480 } 1481 if (!nr) { 1482 if (!PageError(page)) 1483 SetPageUptodate(page); 1484 unlock_page(page); 1485 } 1486 return 0; 1487 } 1488 EXPORT_SYMBOL(extent_read_full_page); 1489 1490 /* 1491 * the writepage semantics are similar to regular writepage. extent 1492 * records are inserted to lock ranges in the tree, and as dirty areas 1493 * are found, they are marked writeback. 
Then the lock bits are removed 1494 * and the end_io handler clears the writeback ranges 1495 */ 1496 int extent_write_full_page(struct extent_map_tree *tree, struct page *page, 1497 get_extent_t *get_extent, 1498 struct writeback_control *wbc) 1499 { 1500 struct inode *inode = page->mapping->host; 1501 u64 start = page->index << PAGE_CACHE_SHIFT; 1502 u64 page_end = start + PAGE_CACHE_SIZE - 1; 1503 u64 end; 1504 u64 cur = start; 1505 u64 extent_offset; 1506 u64 last_byte = i_size_read(inode); 1507 u64 block_start; 1508 sector_t sector; 1509 struct extent_map *em; 1510 struct block_device *bdev; 1511 int ret; 1512 int nr = 0; 1513 size_t page_offset = 0; 1514 size_t iosize; 1515 size_t blocksize; 1516 loff_t i_size = i_size_read(inode); 1517 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; 1518 u64 nr_delalloc; 1519 u64 delalloc_end; 1520 1521 WARN_ON(!PageLocked(page)); 1522 if (page->index > end_index) { 1523 clear_extent_dirty(tree, start, page_end, GFP_NOFS); 1524 unlock_page(page); 1525 return 0; 1526 } 1527 1528 if (page->index == end_index) { 1529 size_t offset = i_size & (PAGE_CACHE_SIZE - 1); 1530 zero_user_page(page, offset, 1531 PAGE_CACHE_SIZE - offset, KM_USER0); 1532 } 1533 1534 if (!PagePrivate(page)) { 1535 SetPagePrivate(page); 1536 set_page_private(page, 1); 1537 WARN_ON(!page->mapping->a_ops->invalidatepage); 1538 page_cache_get(page); 1539 } 1540 1541 lock_extent(tree, start, page_end, GFP_NOFS); 1542 nr_delalloc = find_lock_delalloc_range(tree, start, page_end + 1, 1543 &delalloc_end, 1544 128 * 1024 * 1024); 1545 if (nr_delalloc) { 1546 tree->ops->fill_delalloc(inode, start, delalloc_end); 1547 if (delalloc_end >= page_end + 1) { 1548 clear_extent_bit(tree, page_end + 1, delalloc_end, 1549 EXTENT_LOCKED | EXTENT_DELALLOC, 1550 1, 0, GFP_NOFS); 1551 } 1552 clear_extent_bit(tree, start, page_end, EXTENT_DELALLOC, 1553 0, 0, GFP_NOFS); 1554 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) { 1555 printk("found delalloc bits 
after clear extent_bit\n"); 1556 } 1557 } else if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) { 1558 printk("found delalloc bits after find_delalloc_range returns 0\n"); 1559 } 1560 1561 end = page_end; 1562 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) { 1563 printk("found delalloc bits after lock_extent\n"); 1564 } 1565 1566 if (last_byte <= start) { 1567 clear_extent_dirty(tree, start, page_end, GFP_NOFS); 1568 goto done; 1569 } 1570 1571 set_extent_uptodate(tree, start, page_end, GFP_NOFS); 1572 blocksize = inode->i_sb->s_blocksize; 1573 1574 while (cur <= end) { 1575 if (cur >= last_byte) { 1576 clear_extent_dirty(tree, cur, page_end, GFP_NOFS); 1577 break; 1578 } 1579 em = get_extent(inode, page, page_offset, cur, end, 0); 1580 if (IS_ERR(em) || !em) { 1581 SetPageError(page); 1582 break; 1583 } 1584 1585 extent_offset = cur - em->start; 1586 BUG_ON(em->end < cur); 1587 BUG_ON(end < cur); 1588 iosize = min(em->end - cur, end - cur) + 1; 1589 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 1590 sector = (em->block_start + extent_offset) >> 9; 1591 bdev = em->bdev; 1592 block_start = em->block_start; 1593 free_extent_map(em); 1594 em = NULL; 1595 1596 if (block_start == 0 || block_start == EXTENT_MAP_INLINE) { 1597 clear_extent_dirty(tree, cur, 1598 cur + iosize - 1, GFP_NOFS); 1599 cur = cur + iosize; 1600 page_offset += iosize; 1601 continue; 1602 } 1603 1604 /* leave this out until we have a page_mkwrite call */ 1605 if (0 && !test_range_bit(tree, cur, cur + iosize - 1, 1606 EXTENT_DIRTY, 0)) { 1607 cur = cur + iosize; 1608 page_offset += iosize; 1609 continue; 1610 } 1611 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); 1612 if (tree->ops && tree->ops->writepage_io_hook) { 1613 ret = tree->ops->writepage_io_hook(page, cur, 1614 cur + iosize - 1); 1615 } else { 1616 ret = 0; 1617 } 1618 if (ret) 1619 SetPageError(page); 1620 else { 1621 set_range_writeback(tree, cur, cur + iosize - 1); 1622 ret = 
submit_extent_page(WRITE, tree, page, sector, 1623 iosize, page_offset, bdev, 1624 end_bio_extent_writepage); 1625 if (ret) 1626 SetPageError(page); 1627 } 1628 cur = cur + iosize; 1629 page_offset += iosize; 1630 nr++; 1631 } 1632 done: 1633 WARN_ON(test_range_bit(tree, start, page_end, EXTENT_DIRTY, 0)); 1634 unlock_extent(tree, start, page_end, GFP_NOFS); 1635 unlock_page(page); 1636 return 0; 1637 } 1638 EXPORT_SYMBOL(extent_write_full_page); 1639 1640 /* 1641 * basic invalidatepage code, this waits on any locked or writeback 1642 * ranges corresponding to the page, and then deletes any extent state 1643 * records from the tree 1644 */ 1645 int extent_invalidatepage(struct extent_map_tree *tree, 1646 struct page *page, unsigned long offset) 1647 { 1648 u64 start = (page->index << PAGE_CACHE_SHIFT); 1649 u64 end = start + PAGE_CACHE_SIZE - 1; 1650 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 1651 1652 start += (offset + blocksize -1) & ~(blocksize - 1); 1653 if (start > end) 1654 return 0; 1655 1656 lock_extent(tree, start, end, GFP_NOFS); 1657 wait_on_extent_writeback(tree, start, end); 1658 clear_extent_bit(tree, start, end, 1659 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, 1660 1, 1, GFP_NOFS); 1661 return 0; 1662 } 1663 EXPORT_SYMBOL(extent_invalidatepage); 1664 1665 /* 1666 * simple commit_write call, set_range_dirty is used to mark both 1667 * the pages and the extent records as dirty 1668 */ 1669 int extent_commit_write(struct extent_map_tree *tree, 1670 struct inode *inode, struct page *page, 1671 unsigned from, unsigned to) 1672 { 1673 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; 1674 1675 if (!PagePrivate(page)) { 1676 SetPagePrivate(page); 1677 set_page_private(page, 1); 1678 WARN_ON(!page->mapping->a_ops->invalidatepage); 1679 page_cache_get(page); 1680 } 1681 1682 set_page_dirty(page); 1683 1684 if (pos > inode->i_size) { 1685 i_size_write(inode, pos); 1686 mark_inode_dirty(inode); 1687 } 1688 return 0; 1689 } 1690 
EXPORT_SYMBOL(extent_commit_write); 1691 1692 int extent_prepare_write(struct extent_map_tree *tree, 1693 struct inode *inode, struct page *page, 1694 unsigned from, unsigned to, get_extent_t *get_extent) 1695 { 1696 u64 page_start = page->index << PAGE_CACHE_SHIFT; 1697 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 1698 u64 block_start; 1699 u64 orig_block_start; 1700 u64 block_end; 1701 u64 cur_end; 1702 struct extent_map *em; 1703 unsigned blocksize = 1 << inode->i_blkbits; 1704 size_t page_offset = 0; 1705 size_t block_off_start; 1706 size_t block_off_end; 1707 int err = 0; 1708 int iocount = 0; 1709 int ret = 0; 1710 int isnew; 1711 1712 if (!PagePrivate(page)) { 1713 SetPagePrivate(page); 1714 set_page_private(page, 1); 1715 WARN_ON(!page->mapping->a_ops->invalidatepage); 1716 page_cache_get(page); 1717 } 1718 block_start = (page_start + from) & ~((u64)blocksize - 1); 1719 block_end = (page_start + to - 1) | (blocksize - 1); 1720 orig_block_start = block_start; 1721 1722 lock_extent(tree, page_start, page_end, GFP_NOFS); 1723 while(block_start <= block_end) { 1724 em = get_extent(inode, page, page_offset, block_start, 1725 block_end, 1); 1726 if (IS_ERR(em) || !em) { 1727 goto err; 1728 } 1729 cur_end = min(block_end, em->end); 1730 block_off_start = block_start & (PAGE_CACHE_SIZE - 1); 1731 block_off_end = block_off_start + blocksize; 1732 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS); 1733 1734 if (!PageUptodate(page) && isnew && 1735 (block_off_end > to || block_off_start < from)) { 1736 void *kaddr; 1737 1738 kaddr = kmap_atomic(page, KM_USER0); 1739 if (block_off_end > to) 1740 memset(kaddr + to, 0, block_off_end - to); 1741 if (block_off_start < from) 1742 memset(kaddr + block_off_start, 0, 1743 from - block_off_start); 1744 flush_dcache_page(page); 1745 kunmap_atomic(kaddr, KM_USER0); 1746 } 1747 if (!isnew && !PageUptodate(page) && 1748 (block_off_end > to || block_off_start < from) && 1749 !test_range_bit(tree, block_start, 
cur_end, 1750 EXTENT_UPTODATE, 1)) { 1751 u64 sector; 1752 u64 extent_offset = block_start - em->start; 1753 size_t iosize; 1754 sector = (em->block_start + extent_offset) >> 9; 1755 iosize = (cur_end - block_start + blocksize - 1) & 1756 ~((u64)blocksize - 1); 1757 /* 1758 * we've already got the extent locked, but we 1759 * need to split the state such that our end_bio 1760 * handler can clear the lock. 1761 */ 1762 set_extent_bit(tree, block_start, 1763 block_start + iosize - 1, 1764 EXTENT_LOCKED, 0, NULL, GFP_NOFS); 1765 ret = submit_extent_page(READ, tree, page, 1766 sector, iosize, page_offset, em->bdev, 1767 end_bio_extent_preparewrite); 1768 iocount++; 1769 block_start = block_start + iosize; 1770 } else { 1771 set_extent_uptodate(tree, block_start, cur_end, 1772 GFP_NOFS); 1773 unlock_extent(tree, block_start, cur_end, GFP_NOFS); 1774 block_start = cur_end + 1; 1775 } 1776 page_offset = block_start & (PAGE_CACHE_SIZE - 1); 1777 free_extent_map(em); 1778 } 1779 if (iocount) { 1780 wait_extent_bit(tree, orig_block_start, 1781 block_end, EXTENT_LOCKED); 1782 } 1783 check_page_uptodate(tree, page); 1784 err: 1785 /* FIXME, zero out newly allocated blocks on error */ 1786 return err; 1787 } 1788 EXPORT_SYMBOL(extent_prepare_write); 1789 1790 /* 1791 * a helper for releasepage. 
As long as there are no locked extents 1792 * in the range corresponding to the page, both state records and extent 1793 * map records are removed 1794 */ 1795 int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page) 1796 { 1797 struct extent_map *em; 1798 u64 start = page->index << PAGE_CACHE_SHIFT; 1799 u64 end = start + PAGE_CACHE_SIZE - 1; 1800 u64 orig_start = start; 1801 int ret = 1; 1802 1803 while (start <= end) { 1804 em = lookup_extent_mapping(tree, start, end); 1805 if (!em || IS_ERR(em)) 1806 break; 1807 if (!test_range_bit(tree, em->start, em->end, 1808 EXTENT_LOCKED, 0)) { 1809 remove_extent_mapping(tree, em); 1810 /* once for the rb tree */ 1811 free_extent_map(em); 1812 } 1813 start = em->end + 1; 1814 /* once for us */ 1815 free_extent_map(em); 1816 } 1817 if (test_range_bit(tree, orig_start, end, EXTENT_LOCKED, 0)) 1818 ret = 0; 1819 else 1820 clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE, 1821 1, 1, GFP_NOFS); 1822 return ret; 1823 } 1824 EXPORT_SYMBOL(try_release_extent_mapping); 1825 1826 sector_t extent_bmap(struct address_space *mapping, sector_t iblock, 1827 get_extent_t *get_extent) 1828 { 1829 struct inode *inode = mapping->host; 1830 u64 start = iblock << inode->i_blkbits; 1831 u64 end = start + (1 << inode->i_blkbits) - 1; 1832 struct extent_map *em; 1833 1834 em = get_extent(inode, NULL, 0, start, end, 0); 1835 if (!em || IS_ERR(em)) 1836 return 0; 1837 1838 // XXX(hch): block 0 is valid in some cases, e.g. XFS RT device 1839 if (em->block_start == EXTENT_MAP_INLINE || 1840 em->block_start == 0) 1841 return 0; 1842 1843 return (em->block_start + start - em->start) >> inode->i_blkbits; 1844 } 1845