1 #include <linux/bitops.h> 2 #include <linux/slab.h> 3 #include <linux/bio.h> 4 #include <linux/mm.h> 5 #include <linux/gfp.h> 6 #include <linux/pagemap.h> 7 #include <linux/page-flags.h> 8 #include <linux/module.h> 9 #include <linux/spinlock.h> 10 #include <linux/blkdev.h> 11 #include <linux/swap.h> 12 #include <linux/version.h> 13 #include <linux/writeback.h> 14 #include <linux/pagevec.h> 15 #include "extent_map.h" 16 17 /* temporary define until extent_map moves out of btrfs */ 18 struct kmem_cache *btrfs_cache_create(const char *name, size_t size, 19 unsigned long extra_flags, 20 void (*ctor)(void *, struct kmem_cache *, 21 unsigned long)); 22 23 static struct kmem_cache *extent_map_cache; 24 static struct kmem_cache *extent_state_cache; 25 static struct kmem_cache *extent_buffer_cache; 26 27 static LIST_HEAD(buffers); 28 static LIST_HEAD(states); 29 30 static spinlock_t state_lock = SPIN_LOCK_UNLOCKED; 31 #define BUFFER_LRU_MAX 64 32 33 struct tree_entry { 34 u64 start; 35 u64 end; 36 int in_tree; 37 struct rb_node rb_node; 38 }; 39 40 struct extent_page_data { 41 struct bio *bio; 42 struct extent_map_tree *tree; 43 get_extent_t *get_extent; 44 }; 45 46 int __init extent_map_init(void) 47 { 48 extent_map_cache = btrfs_cache_create("extent_map", 49 sizeof(struct extent_map), 0, 50 NULL); 51 if (!extent_map_cache) 52 return -ENOMEM; 53 extent_state_cache = btrfs_cache_create("extent_state", 54 sizeof(struct extent_state), 0, 55 NULL); 56 if (!extent_state_cache) 57 goto free_map_cache; 58 extent_buffer_cache = btrfs_cache_create("extent_buffers", 59 sizeof(struct extent_buffer), 0, 60 NULL); 61 if (!extent_buffer_cache) 62 goto free_state_cache; 63 return 0; 64 65 free_state_cache: 66 kmem_cache_destroy(extent_state_cache); 67 free_map_cache: 68 kmem_cache_destroy(extent_map_cache); 69 return -ENOMEM; 70 } 71 72 void extent_map_exit(void) 73 { 74 struct extent_state *state; 75 76 while (!list_empty(&states)) { 77 state = list_entry(states.next, struct extent_state, list); 78 printk("state leak: start %Lu end %Lu state %lu in tree %d refs %d\n", state->start, state->end, state->state, state->in_tree, atomic_read(&state->refs)); 79 list_del(&state->list); 80 kmem_cache_free(extent_state_cache, state); 81 82 } 83 84 if (extent_map_cache) 85 kmem_cache_destroy(extent_map_cache); 86 if (extent_state_cache) 87 kmem_cache_destroy(extent_state_cache); 88 if (extent_buffer_cache) 89 kmem_cache_destroy(extent_buffer_cache); 90 } 91 92 void extent_map_tree_init(struct extent_map_tree *tree, 93 struct address_space *mapping, gfp_t mask) 94 { 95 tree->map.rb_node = NULL; 96 tree->state.rb_node = NULL; 97 tree->ops = NULL; 98 tree->dirty_bytes = 0; 99 rwlock_init(&tree->lock); 100 spin_lock_init(&tree->lru_lock); 101 tree->mapping = mapping; 102 INIT_LIST_HEAD(&tree->buffer_lru); 103 tree->lru_size = 0; 104 } 105 EXPORT_SYMBOL(extent_map_tree_init); 106 107 void extent_map_tree_empty_lru(struct extent_map_tree *tree) 108 { 109 struct extent_buffer *eb; 110 while(!list_empty(&tree->buffer_lru)) { 111 eb = list_entry(tree->buffer_lru.next, struct extent_buffer, 112 lru); 113 list_del_init(&eb->lru); 114 free_extent_buffer(eb); 115 } 116 } 117 EXPORT_SYMBOL(extent_map_tree_empty_lru); 118 119 struct extent_map *alloc_extent_map(gfp_t mask) 120 { 121 struct extent_map *em; 122 em = kmem_cache_alloc(extent_map_cache, mask); 123 if (!em || IS_ERR(em)) 124 return em; 125 em->in_tree = 0; 126 atomic_set(&em->refs, 1); 127 return em; 128 } 129 EXPORT_SYMBOL(alloc_extent_map); 130 131 void 
free_extent_map(struct extent_map *em) 132 { 133 if (!em) 134 return; 135 if (atomic_dec_and_test(&em->refs)) { 136 WARN_ON(em->in_tree); 137 kmem_cache_free(extent_map_cache, em); 138 } 139 } 140 EXPORT_SYMBOL(free_extent_map); 141 142 143 struct extent_state *alloc_extent_state(gfp_t mask) 144 { 145 struct extent_state *state; 146 unsigned long flags; 147 148 state = kmem_cache_alloc(extent_state_cache, mask); 149 if (!state || IS_ERR(state)) 150 return state; 151 state->state = 0; 152 state->in_tree = 0; 153 state->private = 0; 154 155 spin_lock_irqsave(&state_lock, flags); 156 list_add(&state->list, &states); 157 spin_unlock_irqrestore(&state_lock, flags); 158 159 atomic_set(&state->refs, 1); 160 init_waitqueue_head(&state->wq); 161 return state; 162 } 163 EXPORT_SYMBOL(alloc_extent_state); 164 165 void free_extent_state(struct extent_state *state) 166 { 167 unsigned long flags; 168 if (!state) 169 return; 170 if (atomic_dec_and_test(&state->refs)) { 171 WARN_ON(state->in_tree); 172 spin_lock_irqsave(&state_lock, flags); 173 list_del(&state->list); 174 spin_unlock_irqrestore(&state_lock, flags); 175 kmem_cache_free(extent_state_cache, state); 176 } 177 } 178 EXPORT_SYMBOL(free_extent_state); 179 180 static struct rb_node *tree_insert(struct rb_root *root, u64 offset, 181 struct rb_node *node) 182 { 183 struct rb_node ** p = &root->rb_node; 184 struct rb_node * parent = NULL; 185 struct tree_entry *entry; 186 187 while(*p) { 188 parent = *p; 189 entry = rb_entry(parent, struct tree_entry, rb_node); 190 191 if (offset < entry->start) 192 p = &(*p)->rb_left; 193 else if (offset > entry->end) 194 p = &(*p)->rb_right; 195 else 196 return parent; 197 } 198 199 entry = rb_entry(node, struct tree_entry, rb_node); 200 entry->in_tree = 1; 201 rb_link_node(node, parent, p); 202 rb_insert_color(node, root); 203 return NULL; 204 } 205 206 static struct rb_node *__tree_search(struct rb_root *root, u64 offset, 207 struct rb_node **prev_ret) 208 { 209 struct rb_node * n = root->rb_node; 210 struct rb_node *prev = NULL; 211 struct tree_entry *entry; 212 struct tree_entry *prev_entry = NULL; 213 214 while(n) { 215 entry = rb_entry(n, struct tree_entry, rb_node); 216 prev = n; 217 prev_entry = entry; 218 219 if (offset < entry->start) 220 n = n->rb_left; 221 else if (offset > entry->end) 222 n = n->rb_right; 223 else 224 return n; 225 } 226 if (!prev_ret) 227 return NULL; 228 while(prev && offset > prev_entry->end) { 229 prev = rb_next(prev); 230 prev_entry = rb_entry(prev, struct tree_entry, rb_node); 231 } 232 *prev_ret = prev; 233 return NULL; 234 } 235 236 static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) 237 { 238 struct rb_node *prev; 239 struct rb_node *ret; 240 ret = __tree_search(root, offset, &prev); 241 if (!ret) 242 return prev; 243 return ret; 244 } 245 246 static int tree_delete(struct rb_root *root, u64 offset) 247 { 248 struct rb_node *node; 249 struct tree_entry *entry; 250 251 node = __tree_search(root, offset, NULL); 252 if (!node) 253 return -ENOENT; 254 entry = rb_entry(node, struct tree_entry, rb_node); 255 entry->in_tree = 0; 256 rb_erase(node, root); 257 return 0; 258 } 259 260 /* 261 * add_extent_mapping tries a simple backward merge with existing 262 * mappings. The extent_map struct passed in will be inserted into 263 * the tree directly (no copies made, just a reference taken). 
264 */ 265 int add_extent_mapping(struct extent_map_tree *tree, 266 struct extent_map *em) 267 { 268 int ret = 0; 269 struct extent_map *prev = NULL; 270 struct rb_node *rb; 271 272 write_lock_irq(&tree->lock); 273 rb = tree_insert(&tree->map, em->end, &em->rb_node); 274 if (rb) { 275 prev = rb_entry(rb, struct extent_map, rb_node); 276 ret = -EEXIST; 277 goto out; 278 } 279 atomic_inc(&em->refs); 280 if (em->start != 0) { 281 rb = rb_prev(&em->rb_node); 282 if (rb) 283 prev = rb_entry(rb, struct extent_map, rb_node); 284 if (prev && prev->end + 1 == em->start && 285 ((em->block_start == EXTENT_MAP_HOLE && 286 prev->block_start == EXTENT_MAP_HOLE) || 287 (em->block_start == EXTENT_MAP_INLINE && 288 prev->block_start == EXTENT_MAP_INLINE) || 289 (em->block_start == EXTENT_MAP_DELALLOC && 290 prev->block_start == EXTENT_MAP_DELALLOC) || 291 (em->block_start < EXTENT_MAP_DELALLOC - 1 && 292 em->block_start == prev->block_end + 1))) { 293 em->start = prev->start; 294 em->block_start = prev->block_start; 295 rb_erase(&prev->rb_node, &tree->map); 296 prev->in_tree = 0; 297 free_extent_map(prev); 298 } 299 } 300 out: 301 write_unlock_irq(&tree->lock); 302 return ret; 303 } 304 EXPORT_SYMBOL(add_extent_mapping); 305 306 /* 307 * lookup_extent_mapping returns the first extent_map struct in the 308 * tree that intersects the [start, end] (inclusive) range. There may 309 * be additional objects in the tree that intersect, so check the object 310 * returned carefully to make sure you don't need additional lookups. 311 */ 312 struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, 313 u64 start, u64 end) 314 { 315 struct extent_map *em; 316 struct rb_node *rb_node; 317 318 read_lock_irq(&tree->lock); 319 rb_node = tree_search(&tree->map, start); 320 if (!rb_node) { 321 em = NULL; 322 goto out; 323 } 324 if (IS_ERR(rb_node)) { 325 em = ERR_PTR(PTR_ERR(rb_node)); 326 goto out; 327 } 328 em = rb_entry(rb_node, struct extent_map, rb_node); 329 if (em->end < start || em->start > end) { 330 em = NULL; 331 goto out; 332 } 333 atomic_inc(&em->refs); 334 out: 335 read_unlock_irq(&tree->lock); 336 return em; 337 } 338 EXPORT_SYMBOL(lookup_extent_mapping); 339 340 /* 341 * removes an extent_map struct from the tree. No reference counts are 342 * dropped, and no checks are done to see if the range is in use 343 */ 344 int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) 345 { 346 int ret; 347 348 write_lock_irq(&tree->lock); 349 ret = tree_delete(&tree->map, em->end); 350 write_unlock_irq(&tree->lock); 351 return ret; 352 } 353 EXPORT_SYMBOL(remove_extent_mapping); 354 355 /* 356 * utility function to look for merge candidates inside a given range. 357 * Any extents with matching state are merged together into a single 358 * extent in the tree. Extents with EXTENT_IO in their state field 359 * are not merged because the end_io handlers need to be able to do 360 * operations on them without sleeping (or doing allocations/splits). 361 * 362 * This should be called with the tree lock held. 
363 */ 364 static int merge_state(struct extent_map_tree *tree, 365 struct extent_state *state) 366 { 367 struct extent_state *other; 368 struct rb_node *other_node; 369 370 if (state->state & EXTENT_IOBITS) 371 return 0; 372 373 other_node = rb_prev(&state->rb_node); 374 if (other_node) { 375 other = rb_entry(other_node, struct extent_state, rb_node); 376 if (other->end == state->start - 1 && 377 other->state == state->state) { 378 state->start = other->start; 379 other->in_tree = 0; 380 rb_erase(&other->rb_node, &tree->state); 381 free_extent_state(other); 382 } 383 } 384 other_node = rb_next(&state->rb_node); 385 if (other_node) { 386 other = rb_entry(other_node, struct extent_state, rb_node); 387 if (other->start == state->end + 1 && 388 other->state == state->state) { 389 other->start = state->start; 390 state->in_tree = 0; 391 rb_erase(&state->rb_node, &tree->state); 392 free_extent_state(state); 393 } 394 } 395 return 0; 396 } 397 398 /* 399 * insert an extent_state struct into the tree. 'bits' are set on the 400 * struct before it is inserted. 401 * 402 * This may return -EEXIST if the extent is already there, in which case the 403 * state struct is freed. 404 * 405 * The tree lock is not taken internally. This is a utility function and 406 * probably isn't what you want to call (see set/clear_extent_bit). 407 */ 408 static int insert_state(struct extent_map_tree *tree, 409 struct extent_state *state, u64 start, u64 end, 410 int bits) 411 { 412 struct rb_node *node; 413 414 if (end < start) { 415 printk("end < start %Lu %Lu\n", end, start); 416 WARN_ON(1); 417 } 418 if (bits & EXTENT_DIRTY) 419 tree->dirty_bytes += end - start + 1; 420 state->state |= bits; 421 state->start = start; 422 state->end = end; 423 node = tree_insert(&tree->state, end, &state->rb_node); 424 if (node) { 425 struct extent_state *found; 426 found = rb_entry(node, struct extent_state, rb_node); 427 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end); 428 free_extent_state(state); 429 return -EEXIST; 430 } 431 merge_state(tree, state); 432 return 0; 433 } 434 435 /* 436 * split a given extent state struct in two, inserting the preallocated 437 * struct 'prealloc' as the newly created second half. 'split' indicates an 438 * offset inside 'orig' where it should be split. 439 * 440 * Before calling, 441 * the tree has 'orig' at [orig->start, orig->end]. After calling, there 442 * are two extent state structs in the tree: 443 * prealloc: [orig->start, split - 1] 444 * orig: [ split, orig->end ] 445 * 446 * The tree locks are not taken by this function. They need to be held 447 * by the caller. 448 */ 449 static int split_state(struct extent_map_tree *tree, struct extent_state *orig, 450 struct extent_state *prealloc, u64 split) 451 { 452 struct rb_node *node; 453 prealloc->start = orig->start; 454 prealloc->end = split - 1; 455 prealloc->state = orig->state; 456 orig->start = split; 457 458 node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); 459 if (node) { 460 struct extent_state *found; 461 found = rb_entry(node, struct extent_state, rb_node); 462 printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end); 463 free_extent_state(prealloc); 464 return -EEXIST; 465 } 466 return 0; 467 } 468 469 /* 470 * utility function to clear some bits in an extent state struct. 471 * it will optionally wake up any one waiting on this state (wake == 1), or 472 * forcibly remove the state from the tree (delete == 1). 
473 * 474 * If no bits are set on the state struct after clearing things, the 475 * struct is freed and removed from the tree 476 */ 477 static int clear_state_bit(struct extent_map_tree *tree, 478 struct extent_state *state, int bits, int wake, 479 int delete) 480 { 481 int ret = state->state & bits; 482 483 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 484 u64 range = state->end - state->start + 1; 485 WARN_ON(range > tree->dirty_bytes); 486 tree->dirty_bytes -= range; 487 } 488 state->state &= ~bits; 489 if (wake) 490 wake_up(&state->wq); 491 if (delete || state->state == 0) { 492 if (state->in_tree) { 493 rb_erase(&state->rb_node, &tree->state); 494 state->in_tree = 0; 495 free_extent_state(state); 496 } else { 497 WARN_ON(1); 498 } 499 } else { 500 merge_state(tree, state); 501 } 502 return ret; 503 } 504 505 /* 506 * clear some bits on a range in the tree. This may require splitting 507 * or inserting elements in the tree, so the gfp mask is used to 508 * indicate which allocations or sleeping are allowed. 509 * 510 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove 511 * the given range from the tree regardless of state (ie for truncate). 512 * 513 * the range [start, end] is inclusive. 514 * 515 * This takes the tree lock, and returns < 0 on error, > 0 if any of the 516 * bits were already set, or zero if none of the bits were already set. 517 */ 518 int clear_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, 519 int bits, int wake, int delete, gfp_t mask) 520 { 521 struct extent_state *state; 522 struct extent_state *prealloc = NULL; 523 struct rb_node *node; 524 unsigned long flags; 525 int err; 526 int set = 0; 527 528 again: 529 if (!prealloc && (mask & __GFP_WAIT)) { 530 prealloc = alloc_extent_state(mask); 531 if (!prealloc) 532 return -ENOMEM; 533 } 534 535 write_lock_irqsave(&tree->lock, flags); 536 /* 537 * this search will find the extents that end after 538 * our range starts 539 */ 540 node = tree_search(&tree->state, start); 541 if (!node) 542 goto out; 543 state = rb_entry(node, struct extent_state, rb_node); 544 if (state->start > end) 545 goto out; 546 WARN_ON(state->end < start); 547 548 /* 549 * | ---- desired range ---- | 550 * | state | or 551 * | ------------- state -------------- | 552 * 553 * We need to split the extent we found, and may flip 554 * bits on second half. 555 * 556 * If the extent we found extends past our range, we 557 * just split and search again. It'll get split again 558 * the next time though. 559 * 560 * If the extent we found is inside our range, we clear 561 * the desired bit on it. 
562 */ 563 564 if (state->start < start) { 565 err = split_state(tree, state, prealloc, start); 566 BUG_ON(err == -EEXIST); 567 prealloc = NULL; 568 if (err) 569 goto out; 570 if (state->end <= end) { 571 start = state->end + 1; 572 set |= clear_state_bit(tree, state, bits, 573 wake, delete); 574 } else { 575 start = state->start; 576 } 577 goto search_again; 578 } 579 /* 580 * | ---- desired range ---- | 581 * | state | 582 * We need to split the extent, and clear the bit 583 * on the first half 584 */ 585 if (state->start <= end && state->end > end) { 586 err = split_state(tree, state, prealloc, end + 1); 587 BUG_ON(err == -EEXIST); 588 589 if (wake) 590 wake_up(&state->wq); 591 set |= clear_state_bit(tree, prealloc, bits, 592 wake, delete); 593 prealloc = NULL; 594 goto out; 595 } 596 597 start = state->end + 1; 598 set |= clear_state_bit(tree, state, bits, wake, delete); 599 goto search_again; 600 601 out: 602 write_unlock_irqrestore(&tree->lock, flags); 603 if (prealloc) 604 free_extent_state(prealloc); 605 606 return set; 607 608 search_again: 609 if (start > end) 610 goto out; 611 write_unlock_irqrestore(&tree->lock, flags); 612 if (mask & __GFP_WAIT) 613 cond_resched(); 614 goto again; 615 } 616 EXPORT_SYMBOL(clear_extent_bit); 617 618 static int wait_on_state(struct extent_map_tree *tree, 619 struct extent_state *state) 620 { 621 DEFINE_WAIT(wait); 622 prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); 623 read_unlock_irq(&tree->lock); 624 schedule(); 625 read_lock_irq(&tree->lock); 626 finish_wait(&state->wq, &wait); 627 return 0; 628 } 629 630 /* 631 * waits for one or more bits to clear on a range in the state tree. 632 * The range [start, end] is inclusive. 633 * The tree lock is taken by this function 634 */ 635 int wait_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits) 636 { 637 struct extent_state *state; 638 struct rb_node *node; 639 640 read_lock_irq(&tree->lock); 641 again: 642 while (1) { 643 /* 644 * this search will find all the extents that end after 645 * our range starts 646 */ 647 node = tree_search(&tree->state, start); 648 if (!node) 649 break; 650 651 state = rb_entry(node, struct extent_state, rb_node); 652 653 if (state->start > end) 654 goto out; 655 656 if (state->state & bits) { 657 start = state->start; 658 atomic_inc(&state->refs); 659 wait_on_state(tree, state); 660 free_extent_state(state); 661 goto again; 662 } 663 start = state->end + 1; 664 665 if (start > end) 666 break; 667 668 if (need_resched()) { 669 read_unlock_irq(&tree->lock); 670 cond_resched(); 671 read_lock_irq(&tree->lock); 672 } 673 } 674 out: 675 read_unlock_irq(&tree->lock); 676 return 0; 677 } 678 EXPORT_SYMBOL(wait_extent_bit); 679 680 static void set_state_bits(struct extent_map_tree *tree, 681 struct extent_state *state, 682 int bits) 683 { 684 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 685 u64 range = state->end - state->start + 1; 686 tree->dirty_bytes += range; 687 } 688 state->state |= bits; 689 } 690 691 /* 692 * set some bits on a range in the tree. This may require allocations 693 * or sleeping, so the gfp mask is used to indicate what is allowed. 694 * 695 * If 'exclusive' == 1, this will fail with -EEXIST if some part of the 696 * range already has the desired bits set. The start of the existing 697 * range is returned in failed_start in this case. 698 * 699 * [start, end] is inclusive 700 * This takes the tree lock. 
701 */ 702 int set_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits, 703 int exclusive, u64 *failed_start, gfp_t mask) 704 { 705 struct extent_state *state; 706 struct extent_state *prealloc = NULL; 707 struct rb_node *node; 708 unsigned long flags; 709 int err = 0; 710 int set; 711 u64 last_start; 712 u64 last_end; 713 again: 714 if (!prealloc && (mask & __GFP_WAIT)) { 715 prealloc = alloc_extent_state(mask); 716 if (!prealloc) 717 return -ENOMEM; 718 } 719 720 write_lock_irqsave(&tree->lock, flags); 721 /* 722 * this search will find all the extents that end after 723 * our range starts. 724 */ 725 node = tree_search(&tree->state, start); 726 if (!node) { 727 err = insert_state(tree, prealloc, start, end, bits); 728 prealloc = NULL; 729 BUG_ON(err == -EEXIST); 730 goto out; 731 } 732 733 state = rb_entry(node, struct extent_state, rb_node); 734 last_start = state->start; 735 last_end = state->end; 736 737 /* 738 * | ---- desired range ---- | 739 * | state | 740 * 741 * Just lock what we found and keep going 742 */ 743 if (state->start == start && state->end <= end) { 744 set = state->state & bits; 745 if (set && exclusive) { 746 *failed_start = state->start; 747 err = -EEXIST; 748 goto out; 749 } 750 set_state_bits(tree, state, bits); 751 start = state->end + 1; 752 merge_state(tree, state); 753 goto search_again; 754 } 755 756 /* 757 * | ---- desired range ---- | 758 * | state | 759 * or 760 * | ------------- state -------------- | 761 * 762 * We need to split the extent we found, and may flip bits on 763 * second half. 764 * 765 * If the extent we found extends past our 766 * range, we just split and search again. It'll get split 767 * again the next time though. 768 * 769 * If the extent we found is inside our range, we set the 770 * desired bit on it. 771 */ 772 if (state->start < start) { 773 set = state->state & bits; 774 if (exclusive && set) { 775 *failed_start = start; 776 err = -EEXIST; 777 goto out; 778 } 779 err = split_state(tree, state, prealloc, start); 780 BUG_ON(err == -EEXIST); 781 prealloc = NULL; 782 if (err) 783 goto out; 784 if (state->end <= end) { 785 set_state_bits(tree, state, bits); 786 start = state->end + 1; 787 merge_state(tree, state); 788 } else { 789 start = state->start; 790 } 791 goto search_again; 792 } 793 /* 794 * | ---- desired range ---- | 795 * | state | or | state | 796 * 797 * There's a hole, we need to insert something in it and 798 * ignore the extent we found. 
799 */ 800 if (state->start > start) { 801 u64 this_end; 802 if (end < last_start) 803 this_end = end; 804 else 805 this_end = last_start -1; 806 err = insert_state(tree, prealloc, start, this_end, 807 bits); 808 prealloc = NULL; 809 BUG_ON(err == -EEXIST); 810 if (err) 811 goto out; 812 start = this_end + 1; 813 goto search_again; 814 } 815 /* 816 * | ---- desired range ---- | 817 * | state | 818 * We need to split the extent, and set the bit 819 * on the first half 820 */ 821 if (state->start <= end && state->end > end) { 822 set = state->state & bits; 823 if (exclusive && set) { 824 *failed_start = start; 825 err = -EEXIST; 826 goto out; 827 } 828 err = split_state(tree, state, prealloc, end + 1); 829 BUG_ON(err == -EEXIST); 830 831 set_state_bits(tree, prealloc, bits); 832 merge_state(tree, prealloc); 833 prealloc = NULL; 834 goto out; 835 } 836 837 goto search_again; 838 839 out: 840 write_unlock_irqrestore(&tree->lock, flags); 841 if (prealloc) 842 free_extent_state(prealloc); 843 844 return err; 845 846 search_again: 847 if (start > end) 848 goto out; 849 write_unlock_irqrestore(&tree->lock, flags); 850 if (mask & __GFP_WAIT) 851 cond_resched(); 852 goto again; 853 } 854 EXPORT_SYMBOL(set_extent_bit); 855 856 /* wrappers around set/clear extent bit */ 857 int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end, 858 gfp_t mask) 859 { 860 return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, 861 mask); 862 } 863 EXPORT_SYMBOL(set_extent_dirty); 864 865 int set_extent_bits(struct extent_map_tree *tree, u64 start, u64 end, 866 int bits, gfp_t mask) 867 { 868 return set_extent_bit(tree, start, end, bits, 0, NULL, 869 mask); 870 } 871 EXPORT_SYMBOL(set_extent_bits); 872 873 int clear_extent_bits(struct extent_map_tree *tree, u64 start, u64 end, 874 int bits, gfp_t mask) 875 { 876 return clear_extent_bit(tree, start, end, bits, 0, 0, mask); 877 } 878 EXPORT_SYMBOL(clear_extent_bits); 879 880 int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end, 881 gfp_t mask) 882 { 883 return set_extent_bit(tree, start, end, 884 EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL, 885 mask); 886 } 887 EXPORT_SYMBOL(set_extent_delalloc); 888 889 int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end, 890 gfp_t mask) 891 { 892 return clear_extent_bit(tree, start, end, 893 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask); 894 } 895 EXPORT_SYMBOL(clear_extent_dirty); 896 897 int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end, 898 gfp_t mask) 899 { 900 return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, 901 mask); 902 } 903 EXPORT_SYMBOL(set_extent_new); 904 905 int clear_extent_new(struct extent_map_tree *tree, u64 start, u64 end, 906 gfp_t mask) 907 { 908 return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask); 909 } 910 EXPORT_SYMBOL(clear_extent_new); 911 912 int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end, 913 gfp_t mask) 914 { 915 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, 916 mask); 917 } 918 EXPORT_SYMBOL(set_extent_uptodate); 919 920 int clear_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end, 921 gfp_t mask) 922 { 923 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); 924 } 925 EXPORT_SYMBOL(clear_extent_uptodate); 926 927 int set_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end, 928 gfp_t mask) 929 { 930 return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, 931 0, NULL, mask); 932 } 933 
EXPORT_SYMBOL(set_extent_writeback); 934 935 int clear_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end, 936 gfp_t mask) 937 { 938 return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask); 939 } 940 EXPORT_SYMBOL(clear_extent_writeback); 941 942 int wait_on_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end) 943 { 944 return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK); 945 } 946 EXPORT_SYMBOL(wait_on_extent_writeback); 947 948 /* 949 * locks a range in ascending order, waiting for any locked regions 950 * it hits on the way. [start,end] are inclusive, and this will sleep. 951 */ 952 int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask) 953 { 954 int err; 955 u64 failed_start; 956 while (1) { 957 err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 958 &failed_start, mask); 959 if (err == -EEXIST && (mask & __GFP_WAIT)) { 960 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 961 start = failed_start; 962 } else { 963 break; 964 } 965 WARN_ON(start > end); 966 } 967 return err; 968 } 969 EXPORT_SYMBOL(lock_extent); 970 971 int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end, 972 gfp_t mask) 973 { 974 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask); 975 } 976 EXPORT_SYMBOL(unlock_extent); 977 978 /* 979 * helper function to set pages and extents in the tree dirty 980 */ 981 int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end) 982 { 983 unsigned long index = start >> PAGE_CACHE_SHIFT; 984 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 985 struct page *page; 986 987 while (index <= end_index) { 988 page = find_get_page(tree->mapping, index); 989 BUG_ON(!page); 990 __set_page_dirty_nobuffers(page); 991 page_cache_release(page); 992 index++; 993 } 994 set_extent_dirty(tree, start, end, GFP_NOFS); 995 return 0; 996 } 997 EXPORT_SYMBOL(set_range_dirty); 998 999 /* 1000 * helper function to set both pages and extents in the tree writeback 1001 */ 1002 int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end) 1003 { 1004 unsigned long index = start >> PAGE_CACHE_SHIFT; 1005 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1006 struct page *page; 1007 1008 while (index <= end_index) { 1009 page = find_get_page(tree->mapping, index); 1010 BUG_ON(!page); 1011 set_page_writeback(page); 1012 page_cache_release(page); 1013 index++; 1014 } 1015 set_extent_writeback(tree, start, end, GFP_NOFS); 1016 return 0; 1017 } 1018 EXPORT_SYMBOL(set_range_writeback); 1019 1020 int find_first_extent_bit(struct extent_map_tree *tree, u64 start, 1021 u64 *start_ret, u64 *end_ret, int bits) 1022 { 1023 struct rb_node *node; 1024 struct extent_state *state; 1025 int ret = 1; 1026 1027 read_lock_irq(&tree->lock); 1028 /* 1029 * this search will find all the extents that end after 1030 * our range starts. 
1031 */ 1032 node = tree_search(&tree->state, start); 1033 if (!node || IS_ERR(node)) { 1034 goto out; 1035 } 1036 1037 while(1) { 1038 state = rb_entry(node, struct extent_state, rb_node); 1039 if (state->end >= start && (state->state & bits)) { 1040 *start_ret = state->start; 1041 *end_ret = state->end; 1042 ret = 0; 1043 break; 1044 } 1045 node = rb_next(node); 1046 if (!node) 1047 break; 1048 } 1049 out: 1050 read_unlock_irq(&tree->lock); 1051 return ret; 1052 } 1053 EXPORT_SYMBOL(find_first_extent_bit); 1054 1055 u64 find_lock_delalloc_range(struct extent_map_tree *tree, 1056 u64 *start, u64 *end, u64 max_bytes) 1057 { 1058 struct rb_node *node; 1059 struct extent_state *state; 1060 u64 cur_start = *start; 1061 u64 found = 0; 1062 u64 total_bytes = 0; 1063 1064 write_lock_irq(&tree->lock); 1065 /* 1066 * this search will find all the extents that end after 1067 * our range starts. 1068 */ 1069 search_again: 1070 node = tree_search(&tree->state, cur_start); 1071 if (!node || IS_ERR(node)) { 1072 *end = (u64)-1; 1073 goto out; 1074 } 1075 1076 while(1) { 1077 state = rb_entry(node, struct extent_state, rb_node); 1078 if (found && state->start != cur_start) { 1079 goto out; 1080 } 1081 if (!(state->state & EXTENT_DELALLOC)) { 1082 if (!found) 1083 *end = state->end; 1084 goto out; 1085 } 1086 if (!found) { 1087 struct extent_state *prev_state; 1088 struct rb_node *prev_node = node; 1089 while(1) { 1090 prev_node = rb_prev(prev_node); 1091 if (!prev_node) 1092 break; 1093 prev_state = rb_entry(prev_node, 1094 struct extent_state, 1095 rb_node); 1096 if (!(prev_state->state & EXTENT_DELALLOC)) 1097 break; 1098 state = prev_state; 1099 node = prev_node; 1100 } 1101 } 1102 if (state->state & EXTENT_LOCKED) { 1103 DEFINE_WAIT(wait); 1104 atomic_inc(&state->refs); 1105 prepare_to_wait(&state->wq, &wait, 1106 TASK_UNINTERRUPTIBLE); 1107 write_unlock_irq(&tree->lock); 1108 schedule(); 1109 write_lock_irq(&tree->lock); 1110 finish_wait(&state->wq, &wait); 1111 free_extent_state(state); 1112 goto search_again; 1113 } 1114 state->state |= EXTENT_LOCKED; 1115 if (!found) 1116 *start = state->start; 1117 found++; 1118 *end = state->end; 1119 cur_start = state->end + 1; 1120 node = rb_next(node); 1121 if (!node) 1122 break; 1123 total_bytes += state->end - state->start + 1; 1124 if (total_bytes >= max_bytes) 1125 break; 1126 } 1127 out: 1128 write_unlock_irq(&tree->lock); 1129 return found; 1130 } 1131 1132 u64 count_range_bits(struct extent_map_tree *tree, 1133 u64 *start, u64 search_end, u64 max_bytes, 1134 unsigned long bits) 1135 { 1136 struct rb_node *node; 1137 struct extent_state *state; 1138 u64 cur_start = *start; 1139 u64 total_bytes = 0; 1140 int found = 0; 1141 1142 if (search_end <= cur_start) { 1143 printk("search_end %Lu start %Lu\n", search_end, cur_start); 1144 WARN_ON(1); 1145 return 0; 1146 } 1147 1148 write_lock_irq(&tree->lock); 1149 if (cur_start == 0 && bits == EXTENT_DIRTY) { 1150 total_bytes = tree->dirty_bytes; 1151 goto out; 1152 } 1153 /* 1154 * this search will find all the extents that end after 1155 * our range starts. 
1156 */ 1157 node = tree_search(&tree->state, cur_start); 1158 if (!node || IS_ERR(node)) { 1159 goto out; 1160 } 1161 1162 while(1) { 1163 state = rb_entry(node, struct extent_state, rb_node); 1164 if (state->start > search_end) 1165 break; 1166 if (state->end >= cur_start && (state->state & bits)) { 1167 total_bytes += min(search_end, state->end) + 1 - 1168 max(cur_start, state->start); 1169 if (total_bytes >= max_bytes) 1170 break; 1171 if (!found) { 1172 *start = state->start; 1173 found = 1; 1174 } 1175 } 1176 node = rb_next(node); 1177 if (!node) 1178 break; 1179 } 1180 out: 1181 write_unlock_irq(&tree->lock); 1182 return total_bytes; 1183 } 1184 /* 1185 * helper function to lock both pages and extents in the tree. 1186 * pages must be locked first. 1187 */ 1188 int lock_range(struct extent_map_tree *tree, u64 start, u64 end) 1189 { 1190 unsigned long index = start >> PAGE_CACHE_SHIFT; 1191 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1192 struct page *page; 1193 int err; 1194 1195 while (index <= end_index) { 1196 page = grab_cache_page(tree->mapping, index); 1197 if (!page) { 1198 err = -ENOMEM; 1199 goto failed; 1200 } 1201 if (IS_ERR(page)) { 1202 err = PTR_ERR(page); 1203 goto failed; 1204 } 1205 index++; 1206 } 1207 lock_extent(tree, start, end, GFP_NOFS); 1208 return 0; 1209 1210 failed: 1211 /* 1212 * we failed above in getting the page at 'index', so we undo here 1213 * up to but not including the page at 'index' 1214 */ 1215 end_index = index; 1216 index = start >> PAGE_CACHE_SHIFT; 1217 while (index < end_index) { 1218 page = find_get_page(tree->mapping, index); 1219 unlock_page(page); 1220 page_cache_release(page); 1221 index++; 1222 } 1223 return err; 1224 } 1225 EXPORT_SYMBOL(lock_range); 1226 1227 /* 1228 * helper function to unlock both pages and extents in the tree. 1229 */ 1230 int unlock_range(struct extent_map_tree *tree, u64 start, u64 end) 1231 { 1232 unsigned long index = start >> PAGE_CACHE_SHIFT; 1233 unsigned long end_index = end >> PAGE_CACHE_SHIFT; 1234 struct page *page; 1235 1236 while (index <= end_index) { 1237 page = find_get_page(tree->mapping, index); 1238 unlock_page(page); 1239 page_cache_release(page); 1240 index++; 1241 } 1242 unlock_extent(tree, start, end, GFP_NOFS); 1243 return 0; 1244 } 1245 EXPORT_SYMBOL(unlock_range); 1246 1247 int set_state_private(struct extent_map_tree *tree, u64 start, u64 private) 1248 { 1249 struct rb_node *node; 1250 struct extent_state *state; 1251 int ret = 0; 1252 1253 write_lock_irq(&tree->lock); 1254 /* 1255 * this search will find all the extents that end after 1256 * our range starts. 1257 */ 1258 node = tree_search(&tree->state, start); 1259 if (!node || IS_ERR(node)) { 1260 ret = -ENOENT; 1261 goto out; 1262 } 1263 state = rb_entry(node, struct extent_state, rb_node); 1264 if (state->start != start) { 1265 ret = -ENOENT; 1266 goto out; 1267 } 1268 state->private = private; 1269 out: 1270 write_unlock_irq(&tree->lock); 1271 return ret; 1272 } 1273 1274 int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private) 1275 { 1276 struct rb_node *node; 1277 struct extent_state *state; 1278 int ret = 0; 1279 1280 read_lock_irq(&tree->lock); 1281 /* 1282 * this search will find all the extents that end after 1283 * our range starts. 
1284 */ 1285 node = tree_search(&tree->state, start); 1286 if (!node || IS_ERR(node)) { 1287 ret = -ENOENT; 1288 goto out; 1289 } 1290 state = rb_entry(node, struct extent_state, rb_node); 1291 if (state->start != start) { 1292 ret = -ENOENT; 1293 goto out; 1294 } 1295 *private = state->private; 1296 out: 1297 read_unlock_irq(&tree->lock); 1298 return ret; 1299 } 1300 1301 /* 1302 * searches a range in the state tree for a given mask. 1303 * If 'filled' == 1, this returns 1 only if ever extent in the tree 1304 * has the bits set. Otherwise, 1 is returned if any bit in the 1305 * range is found set. 1306 */ 1307 int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end, 1308 int bits, int filled) 1309 { 1310 struct extent_state *state = NULL; 1311 struct rb_node *node; 1312 int bitset = 0; 1313 1314 read_lock_irq(&tree->lock); 1315 node = tree_search(&tree->state, start); 1316 while (node && start <= end) { 1317 state = rb_entry(node, struct extent_state, rb_node); 1318 1319 if (filled && state->start > start) { 1320 bitset = 0; 1321 break; 1322 } 1323 1324 if (state->start > end) 1325 break; 1326 1327 if (state->state & bits) { 1328 bitset = 1; 1329 if (!filled) 1330 break; 1331 } else if (filled) { 1332 bitset = 0; 1333 break; 1334 } 1335 start = state->end + 1; 1336 if (start > end) 1337 break; 1338 node = rb_next(node); 1339 if (!node) { 1340 if (filled) 1341 bitset = 0; 1342 break; 1343 } 1344 } 1345 read_unlock_irq(&tree->lock); 1346 return bitset; 1347 } 1348 EXPORT_SYMBOL(test_range_bit); 1349 1350 /* 1351 * helper function to set a given page up to date if all the 1352 * extents in the tree for that page are up to date 1353 */ 1354 static int check_page_uptodate(struct extent_map_tree *tree, 1355 struct page *page) 1356 { 1357 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1358 u64 end = start + PAGE_CACHE_SIZE - 1; 1359 if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) 1360 SetPageUptodate(page); 1361 return 0; 1362 } 1363 1364 /* 1365 * helper function to unlock a page if all the extents in the tree 1366 * for that page are unlocked 1367 */ 1368 static int check_page_locked(struct extent_map_tree *tree, 1369 struct page *page) 1370 { 1371 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1372 u64 end = start + PAGE_CACHE_SIZE - 1; 1373 if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) 1374 unlock_page(page); 1375 return 0; 1376 } 1377 1378 /* 1379 * helper function to end page writeback if all the extents 1380 * in the tree for that page are done with writeback 1381 */ 1382 static int check_page_writeback(struct extent_map_tree *tree, 1383 struct page *page) 1384 { 1385 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1386 u64 end = start + PAGE_CACHE_SIZE - 1; 1387 if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0)) 1388 end_page_writeback(page); 1389 return 0; 1390 } 1391 1392 /* lots and lots of room for performance fixes in the end_bio funcs */ 1393 1394 /* 1395 * after a writepage IO is done, we need to: 1396 * clear the uptodate bits on error 1397 * clear the writeback bits in the extent tree for this IO 1398 * end_page_writeback if the page has no more pending IO 1399 * 1400 * Scheduling is not allowed, so the extent state tree is expected 1401 * to have one and only one object corresponding to this IO. 
1402 */ 1403 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) 1404 static void end_bio_extent_writepage(struct bio *bio, int err) 1405 #else 1406 static int end_bio_extent_writepage(struct bio *bio, 1407 unsigned int bytes_done, int err) 1408 #endif 1409 { 1410 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1411 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1412 struct extent_map_tree *tree = bio->bi_private; 1413 u64 start; 1414 u64 end; 1415 int whole_page; 1416 1417 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) 1418 if (bio->bi_size) 1419 return 1; 1420 #endif 1421 1422 do { 1423 struct page *page = bvec->bv_page; 1424 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1425 bvec->bv_offset; 1426 end = start + bvec->bv_len - 1; 1427 1428 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 1429 whole_page = 1; 1430 else 1431 whole_page = 0; 1432 1433 if (--bvec >= bio->bi_io_vec) 1434 prefetchw(&bvec->bv_page->flags); 1435 1436 if (!uptodate) { 1437 clear_extent_uptodate(tree, start, end, GFP_ATOMIC); 1438 ClearPageUptodate(page); 1439 SetPageError(page); 1440 } 1441 clear_extent_writeback(tree, start, end, GFP_ATOMIC); 1442 1443 if (whole_page) 1444 end_page_writeback(page); 1445 else 1446 check_page_writeback(tree, page); 1447 if (tree->ops && tree->ops->writepage_end_io_hook) 1448 tree->ops->writepage_end_io_hook(page, start, end); 1449 } while (bvec >= bio->bi_io_vec); 1450 1451 bio_put(bio); 1452 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) 1453 return 0; 1454 #endif 1455 } 1456 1457 /* 1458 * after a readpage IO is done, we need to: 1459 * clear the uptodate bits on error 1460 * set the uptodate bits if things worked 1461 * set the page up to date if all extents in the tree are uptodate 1462 * clear the lock bit in the extent tree 1463 * unlock the page if there are no other extents locked for it 1464 * 1465 * Scheduling is not allowed, so the extent state tree is expected 1466 * to have one and only one object corresponding to this IO. 
1467 */ 1468 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) 1469 static void end_bio_extent_readpage(struct bio *bio, int err) 1470 #else 1471 static int end_bio_extent_readpage(struct bio *bio, 1472 unsigned int bytes_done, int err) 1473 #endif 1474 { 1475 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1476 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1477 struct extent_map_tree *tree = bio->bi_private; 1478 u64 start; 1479 u64 end; 1480 int whole_page; 1481 int ret; 1482 1483 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) 1484 if (bio->bi_size) 1485 return 1; 1486 #endif 1487 1488 do { 1489 struct page *page = bvec->bv_page; 1490 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1491 bvec->bv_offset; 1492 end = start + bvec->bv_len - 1; 1493 1494 if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) 1495 whole_page = 1; 1496 else 1497 whole_page = 0; 1498 1499 if (--bvec >= bio->bi_io_vec) 1500 prefetchw(&bvec->bv_page->flags); 1501 1502 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 1503 ret = tree->ops->readpage_end_io_hook(page, start, end); 1504 if (ret) 1505 uptodate = 0; 1506 } 1507 if (uptodate) { 1508 set_extent_uptodate(tree, start, end, GFP_ATOMIC); 1509 if (whole_page) 1510 SetPageUptodate(page); 1511 else 1512 check_page_uptodate(tree, page); 1513 } else { 1514 ClearPageUptodate(page); 1515 SetPageError(page); 1516 } 1517 1518 unlock_extent(tree, start, end, GFP_ATOMIC); 1519 1520 if (whole_page) 1521 unlock_page(page); 1522 else 1523 check_page_locked(tree, page); 1524 } while (bvec >= bio->bi_io_vec); 1525 1526 bio_put(bio); 1527 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) 1528 return 0; 1529 #endif 1530 } 1531 1532 /* 1533 * IO done from prepare_write is pretty simple, we just unlock 1534 * the structs in the extent tree when done, and set the uptodate bits 1535 * as appropriate. 
1536 */ 1537 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23) 1538 static void end_bio_extent_preparewrite(struct bio *bio, int err) 1539 #else 1540 static int end_bio_extent_preparewrite(struct bio *bio, 1541 unsigned int bytes_done, int err) 1542 #endif 1543 { 1544 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1545 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1546 struct extent_map_tree *tree = bio->bi_private; 1547 u64 start; 1548 u64 end; 1549 1550 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) 1551 if (bio->bi_size) 1552 return 1; 1553 #endif 1554 1555 do { 1556 struct page *page = bvec->bv_page; 1557 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1558 bvec->bv_offset; 1559 end = start + bvec->bv_len - 1; 1560 1561 if (--bvec >= bio->bi_io_vec) 1562 prefetchw(&bvec->bv_page->flags); 1563 1564 if (uptodate) { 1565 set_extent_uptodate(tree, start, end, GFP_ATOMIC); 1566 } else { 1567 ClearPageUptodate(page); 1568 SetPageError(page); 1569 } 1570 1571 unlock_extent(tree, start, end, GFP_ATOMIC); 1572 1573 } while (bvec >= bio->bi_io_vec); 1574 1575 bio_put(bio); 1576 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23) 1577 return 0; 1578 #endif 1579 } 1580 1581 static struct bio * 1582 extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 1583 gfp_t gfp_flags) 1584 { 1585 struct bio *bio; 1586 1587 bio = bio_alloc(gfp_flags, nr_vecs); 1588 1589 if (bio == NULL && (current->flags & PF_MEMALLOC)) { 1590 while (!bio && (nr_vecs /= 2)) 1591 bio = bio_alloc(gfp_flags, nr_vecs); 1592 } 1593 1594 if (bio) { 1595 bio->bi_bdev = bdev; 1596 bio->bi_sector = first_sector; 1597 } 1598 return bio; 1599 } 1600 1601 static int submit_one_bio(int rw, struct bio *bio) 1602 { 1603 u64 maxsector; 1604 int ret = 0; 1605 1606 bio_get(bio); 1607 1608 maxsector = bio->bi_bdev->bd_inode->i_size >> 9; 1609 if (maxsector < bio->bi_sector) { 1610 printk("sector too large max %Lu got %llu\n", maxsector, 1611 (unsigned long long)bio->bi_sector); 1612 WARN_ON(1); 1613 } 1614 1615 submit_bio(rw, bio); 1616 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1617 ret = -EOPNOTSUPP; 1618 bio_put(bio); 1619 return ret; 1620 } 1621 1622 static int submit_extent_page(int rw, struct extent_map_tree *tree, 1623 struct page *page, sector_t sector, 1624 size_t size, unsigned long offset, 1625 struct block_device *bdev, 1626 struct bio **bio_ret, 1627 unsigned long max_pages, 1628 bio_end_io_t end_io_func) 1629 { 1630 int ret = 0; 1631 struct bio *bio; 1632 int nr; 1633 1634 if (bio_ret && *bio_ret) { 1635 bio = *bio_ret; 1636 if (bio->bi_sector + (bio->bi_size >> 9) != sector || 1637 bio_add_page(bio, page, size, offset) < size) { 1638 ret = submit_one_bio(rw, bio); 1639 bio = NULL; 1640 } else { 1641 return 0; 1642 } 1643 } 1644 nr = min_t(int, max_pages, bio_get_nr_vecs(bdev)); 1645 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 1646 if (!bio) { 1647 printk("failed to allocate bio nr %d\n", nr); 1648 } 1649 bio_add_page(bio, page, size, offset); 1650 bio->bi_end_io = end_io_func; 1651 bio->bi_private = tree; 1652 if (bio_ret) { 1653 *bio_ret = bio; 1654 } else { 1655 ret = submit_one_bio(rw, bio); 1656 } 1657 1658 return ret; 1659 } 1660 1661 void set_page_extent_mapped(struct page *page) 1662 { 1663 if (!PagePrivate(page)) { 1664 SetPagePrivate(page); 1665 WARN_ON(!page->mapping->a_ops->invalidatepage); 1666 set_page_private(page, EXTENT_PAGE_PRIVATE); 1667 page_cache_get(page); 1668 } 1669 } 1670 1671 void set_page_extent_head(struct page *page, unsigned long len) 1672 { 
1673 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); 1674 } 1675 1676 /* 1677 * basic readpage implementation. Locked extent state structs are inserted 1678 * into the tree that are removed when the IO is done (by the end_io 1679 * handlers) 1680 */ 1681 static int __extent_read_full_page(struct extent_map_tree *tree, 1682 struct page *page, 1683 get_extent_t *get_extent, 1684 struct bio **bio) 1685 { 1686 struct inode *inode = page->mapping->host; 1687 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1688 u64 page_end = start + PAGE_CACHE_SIZE - 1; 1689 u64 end; 1690 u64 cur = start; 1691 u64 extent_offset; 1692 u64 last_byte = i_size_read(inode); 1693 u64 block_start; 1694 u64 cur_end; 1695 sector_t sector; 1696 struct extent_map *em; 1697 struct block_device *bdev; 1698 int ret; 1699 int nr = 0; 1700 size_t page_offset = 0; 1701 size_t iosize; 1702 size_t blocksize = inode->i_sb->s_blocksize; 1703 1704 set_page_extent_mapped(page); 1705 1706 end = page_end; 1707 lock_extent(tree, start, end, GFP_NOFS); 1708 1709 while (cur <= end) { 1710 if (cur >= last_byte) { 1711 char *userpage; 1712 iosize = PAGE_CACHE_SIZE - page_offset; 1713 userpage = kmap_atomic(page, KM_USER0); 1714 memset(userpage + page_offset, 0, iosize); 1715 flush_dcache_page(page); 1716 kunmap_atomic(userpage, KM_USER0); 1717 set_extent_uptodate(tree, cur, cur + iosize - 1, 1718 GFP_NOFS); 1719 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 1720 break; 1721 } 1722 em = get_extent(inode, page, page_offset, cur, end, 0); 1723 if (IS_ERR(em) || !em) { 1724 SetPageError(page); 1725 unlock_extent(tree, cur, end, GFP_NOFS); 1726 break; 1727 } 1728 1729 extent_offset = cur - em->start; 1730 BUG_ON(em->end < cur); 1731 BUG_ON(end < cur); 1732 1733 iosize = min(em->end - cur, end - cur) + 1; 1734 cur_end = min(em->end, end); 1735 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 1736 sector = (em->block_start + extent_offset) >> 9; 1737 bdev = em->bdev; 1738 block_start = em->block_start; 1739 free_extent_map(em); 1740 em = NULL; 1741 1742 /* we've found a hole, just zero and go on */ 1743 if (block_start == EXTENT_MAP_HOLE) { 1744 char *userpage; 1745 userpage = kmap_atomic(page, KM_USER0); 1746 memset(userpage + page_offset, 0, iosize); 1747 flush_dcache_page(page); 1748 kunmap_atomic(userpage, KM_USER0); 1749 1750 set_extent_uptodate(tree, cur, cur + iosize - 1, 1751 GFP_NOFS); 1752 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 1753 cur = cur + iosize; 1754 page_offset += iosize; 1755 continue; 1756 } 1757 /* the get_extent function already copied into the page */ 1758 if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) { 1759 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 1760 cur = cur + iosize; 1761 page_offset += iosize; 1762 continue; 1763 } 1764 1765 ret = 0; 1766 if (tree->ops && tree->ops->readpage_io_hook) { 1767 ret = tree->ops->readpage_io_hook(page, cur, 1768 cur + iosize - 1); 1769 } 1770 if (!ret) { 1771 unsigned long nr = (last_byte >> PAGE_CACHE_SHIFT) + 1; 1772 nr -= page->index; 1773 ret = submit_extent_page(READ, tree, page, 1774 sector, iosize, page_offset, 1775 bdev, bio, nr, 1776 end_bio_extent_readpage); 1777 } 1778 if (ret) 1779 SetPageError(page); 1780 cur = cur + iosize; 1781 page_offset += iosize; 1782 nr++; 1783 } 1784 if (!nr) { 1785 if (!PageError(page)) 1786 SetPageUptodate(page); 1787 unlock_page(page); 1788 } 1789 return 0; 1790 } 1791 1792 int extent_read_full_page(struct extent_map_tree *tree, struct page *page, 1793 get_extent_t 
*get_extent) 1794 { 1795 struct bio *bio = NULL; 1796 int ret; 1797 1798 ret = __extent_read_full_page(tree, page, get_extent, &bio); 1799 if (bio) 1800 submit_one_bio(READ, bio); 1801 return ret; 1802 } 1803 EXPORT_SYMBOL(extent_read_full_page); 1804 1805 /* 1806 * the writepage semantics are similar to regular writepage. extent 1807 * records are inserted to lock ranges in the tree, and as dirty areas 1808 * are found, they are marked writeback. Then the lock bits are removed 1809 * and the end_io handler clears the writeback ranges 1810 */ 1811 static int __extent_writepage(struct page *page, struct writeback_control *wbc, 1812 void *data) 1813 { 1814 struct inode *inode = page->mapping->host; 1815 struct extent_page_data *epd = data; 1816 struct extent_map_tree *tree = epd->tree; 1817 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 1818 u64 delalloc_start; 1819 u64 page_end = start + PAGE_CACHE_SIZE - 1; 1820 u64 end; 1821 u64 cur = start; 1822 u64 extent_offset; 1823 u64 last_byte = i_size_read(inode); 1824 u64 block_start; 1825 u64 iosize; 1826 sector_t sector; 1827 struct extent_map *em; 1828 struct block_device *bdev; 1829 int ret; 1830 int nr = 0; 1831 size_t page_offset = 0; 1832 size_t blocksize; 1833 loff_t i_size = i_size_read(inode); 1834 unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; 1835 u64 nr_delalloc; 1836 u64 delalloc_end; 1837 1838 WARN_ON(!PageLocked(page)); 1839 if (page->index > end_index) { 1840 clear_extent_dirty(tree, start, page_end, GFP_NOFS); 1841 unlock_page(page); 1842 return 0; 1843 } 1844 1845 if (page->index == end_index) { 1846 char *userpage; 1847 1848 size_t offset = i_size & (PAGE_CACHE_SIZE - 1); 1849 1850 userpage = kmap_atomic(page, KM_USER0); 1851 memset(userpage + offset, 0, PAGE_CACHE_SIZE - offset); 1852 flush_dcache_page(page); 1853 kunmap_atomic(userpage, KM_USER0); 1854 } 1855 1856 set_page_extent_mapped(page); 1857 1858 delalloc_start = start; 1859 delalloc_end = 0; 1860 while(delalloc_end < page_end) { 1861 nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start, 1862 &delalloc_end, 1863 128 * 1024 * 1024); 1864 if (nr_delalloc == 0) { 1865 delalloc_start = delalloc_end + 1; 1866 continue; 1867 } 1868 tree->ops->fill_delalloc(inode, delalloc_start, 1869 delalloc_end); 1870 clear_extent_bit(tree, delalloc_start, 1871 delalloc_end, 1872 EXTENT_LOCKED | EXTENT_DELALLOC, 1873 1, 0, GFP_NOFS); 1874 delalloc_start = delalloc_end + 1; 1875 } 1876 lock_extent(tree, start, page_end, GFP_NOFS); 1877 1878 end = page_end; 1879 if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) { 1880 printk("found delalloc bits after lock_extent\n"); 1881 } 1882 1883 if (last_byte <= start) { 1884 clear_extent_dirty(tree, start, page_end, GFP_NOFS); 1885 goto done; 1886 } 1887 1888 set_extent_uptodate(tree, start, page_end, GFP_NOFS); 1889 blocksize = inode->i_sb->s_blocksize; 1890 1891 while (cur <= end) { 1892 if (cur >= last_byte) { 1893 clear_extent_dirty(tree, cur, page_end, GFP_NOFS); 1894 break; 1895 } 1896 em = epd->get_extent(inode, page, page_offset, cur, end, 1); 1897 if (IS_ERR(em) || !em) { 1898 SetPageError(page); 1899 break; 1900 } 1901 1902 extent_offset = cur - em->start; 1903 BUG_ON(em->end < cur); 1904 BUG_ON(end < cur); 1905 iosize = min(em->end - cur, end - cur) + 1; 1906 iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); 1907 sector = (em->block_start + extent_offset) >> 9; 1908 bdev = em->bdev; 1909 block_start = em->block_start; 1910 free_extent_map(em); 1911 em = NULL; 1912 1913 if (block_start == 
EXTENT_MAP_HOLE || 1914 block_start == EXTENT_MAP_INLINE) { 1915 clear_extent_dirty(tree, cur, 1916 cur + iosize - 1, GFP_NOFS); 1917 cur = cur + iosize; 1918 page_offset += iosize; 1919 continue; 1920 } 1921 1922 /* leave this out until we have a page_mkwrite call */ 1923 if (0 && !test_range_bit(tree, cur, cur + iosize - 1, 1924 EXTENT_DIRTY, 0)) { 1925 cur = cur + iosize; 1926 page_offset += iosize; 1927 continue; 1928 } 1929 clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); 1930 if (tree->ops && tree->ops->writepage_io_hook) { 1931 ret = tree->ops->writepage_io_hook(page, cur, 1932 cur + iosize - 1); 1933 } else { 1934 ret = 0; 1935 } 1936 if (ret) 1937 SetPageError(page); 1938 else { 1939 unsigned long max_nr = end_index + 1; 1940 set_range_writeback(tree, cur, cur + iosize - 1); 1941 if (!PageWriteback(page)) { 1942 printk("warning page %lu not writeback, " 1943 "cur %llu end %llu\n", page->index, 1944 (unsigned long long)cur, 1945 (unsigned long long)end); 1946 } 1947 1948 ret = submit_extent_page(WRITE, tree, page, sector, 1949 iosize, page_offset, bdev, 1950 &epd->bio, max_nr, 1951 end_bio_extent_writepage); 1952 if (ret) 1953 SetPageError(page); 1954 } 1955 cur = cur + iosize; 1956 page_offset += iosize; 1957 nr++; 1958 } 1959 done: 1960 if (nr == 0) { 1961 /* make sure the mapping tag for page dirty gets cleared */ 1962 set_page_writeback(page); 1963 end_page_writeback(page); 1964 } 1965 unlock_extent(tree, start, page_end, GFP_NOFS); 1966 unlock_page(page); 1967 return 0; 1968 } 1969 1970 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18) 1971 1972 /* Taken directly from 2.6.23 for 2.6.18 back port */ 1973 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, 1974 void *data); 1975 1976 /** 1977 * write_cache_pages - walk the list of dirty pages of the given address space 1978 * and write all of them. 1979 * @mapping: address space structure to write 1980 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 1981 * @writepage: function called for each page 1982 * @data: data passed to writepage function 1983 * 1984 * If a page is already under I/O, write_cache_pages() skips it, even 1985 * if it's dirty. This is desirable behaviour for memory-cleaning writeback, 1986 * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() 1987 * and msync() need to guarantee that all the data which was dirty at the time 1988 * the call was made get new I/O started against them. If wbc->sync_mode is 1989 * WB_SYNC_ALL then we were called for data integrity and we must wait for 1990 * existing IO to complete. 
1991 */ 1992 static int write_cache_pages(struct address_space *mapping, 1993 struct writeback_control *wbc, writepage_t writepage, 1994 void *data) 1995 { 1996 struct backing_dev_info *bdi = mapping->backing_dev_info; 1997 int ret = 0; 1998 int done = 0; 1999 struct pagevec pvec; 2000 int nr_pages; 2001 pgoff_t index; 2002 pgoff_t end; /* Inclusive */ 2003 int scanned = 0; 2004 int range_whole = 0; 2005 2006 if (wbc->nonblocking && bdi_write_congested(bdi)) { 2007 wbc->encountered_congestion = 1; 2008 return 0; 2009 } 2010 2011 pagevec_init(&pvec, 0); 2012 if (wbc->range_cyclic) { 2013 index = mapping->writeback_index; /* Start from prev offset */ 2014 end = -1; 2015 } else { 2016 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2017 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2018 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2019 range_whole = 1; 2020 scanned = 1; 2021 } 2022 retry: 2023 while (!done && (index <= end) && 2024 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 2025 PAGECACHE_TAG_DIRTY, 2026 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 2027 unsigned i; 2028 2029 scanned = 1; 2030 for (i = 0; i < nr_pages; i++) { 2031 struct page *page = pvec.pages[i]; 2032 2033 /* 2034 * At this point we hold neither mapping->tree_lock nor 2035 * lock on the page itself: the page may be truncated or 2036 * invalidated (changing page->mapping to NULL), or even 2037 * swizzled back from swapper_space to tmpfs file 2038 * mapping 2039 */ 2040 lock_page(page); 2041 2042 if (unlikely(page->mapping != mapping)) { 2043 unlock_page(page); 2044 continue; 2045 } 2046 2047 if (!wbc->range_cyclic && page->index > end) { 2048 done = 1; 2049 unlock_page(page); 2050 continue; 2051 } 2052 2053 if (wbc->sync_mode != WB_SYNC_NONE) 2054 wait_on_page_writeback(page); 2055 2056 if (PageWriteback(page) || 2057 !clear_page_dirty_for_io(page)) { 2058 unlock_page(page); 2059 continue; 2060 } 2061 2062 ret = (*writepage)(page, wbc, data); 2063 2064 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 2065 unlock_page(page); 2066 ret = 0; 2067 } 2068 if (ret || (--(wbc->nr_to_write) <= 0)) 2069 done = 1; 2070 if (wbc->nonblocking && bdi_write_congested(bdi)) { 2071 wbc->encountered_congestion = 1; 2072 done = 1; 2073 } 2074 } 2075 pagevec_release(&pvec); 2076 cond_resched(); 2077 } 2078 if (!scanned && !done) { 2079 /* 2080 * We hit the last page and there is more work to be done: wrap 2081 * back to the start of the file 2082 */ 2083 scanned = 1; 2084 index = 0; 2085 goto retry; 2086 } 2087 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 2088 mapping->writeback_index = index; 2089 return ret; 2090 } 2091 #endif 2092 2093 int extent_write_full_page(struct extent_map_tree *tree, struct page *page, 2094 get_extent_t *get_extent, 2095 struct writeback_control *wbc) 2096 { 2097 int ret; 2098 struct address_space *mapping = page->mapping; 2099 struct extent_page_data epd = { 2100 .bio = NULL, 2101 .tree = tree, 2102 .get_extent = get_extent, 2103 }; 2104 struct writeback_control wbc_writepages = { 2105 .bdi = wbc->bdi, 2106 .sync_mode = WB_SYNC_NONE, 2107 .older_than_this = NULL, 2108 .nr_to_write = 64, 2109 .range_start = page_offset(page) + PAGE_CACHE_SIZE, 2110 .range_end = (loff_t)-1, 2111 }; 2112 2113 2114 ret = __extent_writepage(page, wbc, &epd); 2115 2116 write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd); 2117 if (epd.bio) { 2118 submit_one_bio(WRITE, epd.bio); 2119 } 2120 return ret; 2121 } 2122 EXPORT_SYMBOL(extent_write_full_page); 2123 2124 2125 int 
extent_writepages(struct extent_map_tree *tree, 2126 struct address_space *mapping, 2127 get_extent_t *get_extent, 2128 struct writeback_control *wbc) 2129 { 2130 int ret = 0; 2131 struct extent_page_data epd = { 2132 .bio = NULL, 2133 .tree = tree, 2134 .get_extent = get_extent, 2135 }; 2136 2137 ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd); 2138 if (epd.bio) { 2139 submit_one_bio(WRITE, epd.bio); 2140 } 2141 return ret; 2142 } 2143 EXPORT_SYMBOL(extent_writepages); 2144 2145 int extent_readpages(struct extent_map_tree *tree, 2146 struct address_space *mapping, 2147 struct list_head *pages, unsigned nr_pages, 2148 get_extent_t get_extent) 2149 { 2150 struct bio *bio = NULL; 2151 unsigned page_idx; 2152 struct pagevec pvec; 2153 2154 pagevec_init(&pvec, 0); 2155 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 2156 struct page *page = list_entry(pages->prev, struct page, lru); 2157 2158 prefetchw(&page->flags); 2159 list_del(&page->lru); 2160 /* 2161 * what we want to do here is call add_to_page_cache_lru, 2162 * but that isn't exported, so we reproduce it here 2163 */ 2164 if (!add_to_page_cache(page, mapping, 2165 page->index, GFP_KERNEL)) { 2166 2167 /* open coding of lru_cache_add, also not exported */ 2168 page_cache_get(page); 2169 if (!pagevec_add(&pvec, page)) 2170 __pagevec_lru_add(&pvec); 2171 __extent_read_full_page(tree, page, get_extent, &bio); 2172 } 2173 page_cache_release(page); 2174 } 2175 if (pagevec_count(&pvec)) 2176 __pagevec_lru_add(&pvec); 2177 BUG_ON(!list_empty(pages)); 2178 if (bio) 2179 submit_one_bio(READ, bio); 2180 return 0; 2181 } 2182 EXPORT_SYMBOL(extent_readpages); 2183 2184 /* 2185 * basic invalidatepage code, this waits on any locked or writeback 2186 * ranges corresponding to the page, and then deletes any extent state 2187 * records from the tree 2188 */ 2189 int extent_invalidatepage(struct extent_map_tree *tree, 2190 struct page *page, unsigned long offset) 2191 { 2192 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); 2193 u64 end = start + PAGE_CACHE_SIZE - 1; 2194 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 2195 2196 start += (offset + blocksize -1) & ~(blocksize - 1); 2197 if (start > end) 2198 return 0; 2199 2200 lock_extent(tree, start, end, GFP_NOFS); 2201 wait_on_extent_writeback(tree, start, end); 2202 clear_extent_bit(tree, start, end, 2203 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, 2204 1, 1, GFP_NOFS); 2205 return 0; 2206 } 2207 EXPORT_SYMBOL(extent_invalidatepage); 2208 2209 /* 2210 * simple commit_write call, set_range_dirty is used to mark both 2211 * the pages and the extent records as dirty 2212 */ 2213 int extent_commit_write(struct extent_map_tree *tree, 2214 struct inode *inode, struct page *page, 2215 unsigned from, unsigned to) 2216 { 2217 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; 2218 2219 set_page_extent_mapped(page); 2220 set_page_dirty(page); 2221 2222 if (pos > inode->i_size) { 2223 i_size_write(inode, pos); 2224 mark_inode_dirty(inode); 2225 } 2226 return 0; 2227 } 2228 EXPORT_SYMBOL(extent_commit_write); 2229 2230 int extent_prepare_write(struct extent_map_tree *tree, 2231 struct inode *inode, struct page *page, 2232 unsigned from, unsigned to, get_extent_t *get_extent) 2233 { 2234 u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 2235 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 2236 u64 block_start; 2237 u64 orig_block_start; 2238 u64 block_end; 2239 u64 cur_end; 2240 struct extent_map *em; 2241 unsigned blocksize = 1 << inode->i_blkbits; 2242 
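	/*
	 * The loop below works on the byte range [from, to) of this page,
	 * expanded outward to block boundaries.  For example, with a
	 * 4096 byte blocksize, from = 100 and to = 300 on page index 0:
	 *
	 *	block_start = (page_start + from) & ~(blocksize - 1) = 0
	 *	block_end   = (page_start + to - 1) | (blocksize - 1) = 4095
	 *
	 * Newly allocated blocks in that range get the bytes outside
	 * [from, to) zeroed (when the page is not already uptodate), while
	 * existing, partially covered blocks are read in from disk; the
	 * reads are waited on before the function returns.
	 */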
size_t page_offset = 0; 2243 size_t block_off_start; 2244 size_t block_off_end; 2245 int err = 0; 2246 int iocount = 0; 2247 int ret = 0; 2248 int isnew; 2249 2250 set_page_extent_mapped(page); 2251 2252 block_start = (page_start + from) & ~((u64)blocksize - 1); 2253 block_end = (page_start + to - 1) | (blocksize - 1); 2254 orig_block_start = block_start; 2255 2256 lock_extent(tree, page_start, page_end, GFP_NOFS); 2257 while(block_start <= block_end) { 2258 em = get_extent(inode, page, page_offset, block_start, 2259 block_end, 1); 2260 if (IS_ERR(em) || !em) { 2261 goto err; 2262 } 2263 cur_end = min(block_end, em->end); 2264 block_off_start = block_start & (PAGE_CACHE_SIZE - 1); 2265 block_off_end = block_off_start + blocksize; 2266 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS); 2267 2268 if (!PageUptodate(page) && isnew && 2269 (block_off_end > to || block_off_start < from)) { 2270 void *kaddr; 2271 2272 kaddr = kmap_atomic(page, KM_USER0); 2273 if (block_off_end > to) 2274 memset(kaddr + to, 0, block_off_end - to); 2275 if (block_off_start < from) 2276 memset(kaddr + block_off_start, 0, 2277 from - block_off_start); 2278 flush_dcache_page(page); 2279 kunmap_atomic(kaddr, KM_USER0); 2280 } 2281 if ((em->block_start != EXTENT_MAP_HOLE && 2282 em->block_start != EXTENT_MAP_INLINE) && 2283 !isnew && !PageUptodate(page) && 2284 (block_off_end > to || block_off_start < from) && 2285 !test_range_bit(tree, block_start, cur_end, 2286 EXTENT_UPTODATE, 1)) { 2287 u64 sector; 2288 u64 extent_offset = block_start - em->start; 2289 size_t iosize; 2290 sector = (em->block_start + extent_offset) >> 9; 2291 iosize = (cur_end - block_start + blocksize) & 2292 ~((u64)blocksize - 1); 2293 /* 2294 * we've already got the extent locked, but we 2295 * need to split the state such that our end_bio 2296 * handler can clear the lock. 2297 */ 2298 set_extent_bit(tree, block_start, 2299 block_start + iosize - 1, 2300 EXTENT_LOCKED, 0, NULL, GFP_NOFS); 2301 ret = submit_extent_page(READ, tree, page, 2302 sector, iosize, page_offset, em->bdev, 2303 NULL, 1, 2304 end_bio_extent_preparewrite); 2305 iocount++; 2306 block_start = block_start + iosize; 2307 } else { 2308 set_extent_uptodate(tree, block_start, cur_end, 2309 GFP_NOFS); 2310 unlock_extent(tree, block_start, cur_end, GFP_NOFS); 2311 block_start = cur_end + 1; 2312 } 2313 page_offset = block_start & (PAGE_CACHE_SIZE - 1); 2314 free_extent_map(em); 2315 } 2316 if (iocount) { 2317 wait_extent_bit(tree, orig_block_start, 2318 block_end, EXTENT_LOCKED); 2319 } 2320 check_page_uptodate(tree, page); 2321 err: 2322 /* FIXME, zero out newly allocated blocks on error */ 2323 return err; 2324 } 2325 EXPORT_SYMBOL(extent_prepare_write); 2326 2327 /* 2328 * a helper for releasepage. 
As long as there are no locked extents 2329 * in the range corresponding to the page, both state records and extent 2330 * map records are removed 2331 */ 2332 int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page) 2333 { 2334 struct extent_map *em; 2335 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 2336 u64 end = start + PAGE_CACHE_SIZE - 1; 2337 u64 orig_start = start; 2338 int ret = 1; 2339 2340 while (start <= end) { 2341 em = lookup_extent_mapping(tree, start, end); 2342 if (!em || IS_ERR(em)) 2343 break; 2344 if (!test_range_bit(tree, em->start, em->end, 2345 EXTENT_LOCKED, 0)) { 2346 remove_extent_mapping(tree, em); 2347 /* once for the rb tree */ 2348 free_extent_map(em); 2349 } 2350 start = em->end + 1; 2351 /* once for us */ 2352 free_extent_map(em); 2353 } 2354 if (test_range_bit(tree, orig_start, end, EXTENT_LOCKED, 0)) 2355 ret = 0; 2356 else 2357 clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE, 2358 1, 1, GFP_NOFS); 2359 return ret; 2360 } 2361 EXPORT_SYMBOL(try_release_extent_mapping); 2362 2363 sector_t extent_bmap(struct address_space *mapping, sector_t iblock, 2364 get_extent_t *get_extent) 2365 { 2366 struct inode *inode = mapping->host; 2367 u64 start = iblock << inode->i_blkbits; 2368 u64 end = start + (1 << inode->i_blkbits) - 1; 2369 sector_t sector = 0; 2370 struct extent_map *em; 2371 2372 em = get_extent(inode, NULL, 0, start, end, 0); 2373 if (!em || IS_ERR(em)) 2374 return 0; 2375 2376 if (em->block_start == EXTENT_MAP_INLINE || 2377 em->block_start == EXTENT_MAP_HOLE) 2378 goto out; 2379 2380 sector = (em->block_start + start - em->start) >> inode->i_blkbits; 2381 out: 2382 free_extent_map(em); 2383 return sector; 2384 } 2385 2386 static int add_lru(struct extent_map_tree *tree, struct extent_buffer *eb) 2387 { 2388 if (list_empty(&eb->lru)) { 2389 extent_buffer_get(eb); 2390 list_add(&eb->lru, &tree->buffer_lru); 2391 tree->lru_size++; 2392 if (tree->lru_size >= BUFFER_LRU_MAX) { 2393 struct extent_buffer *rm; 2394 rm = list_entry(tree->buffer_lru.prev, 2395 struct extent_buffer, lru); 2396 tree->lru_size--; 2397 list_del_init(&rm->lru); 2398 free_extent_buffer(rm); 2399 } 2400 } else 2401 list_move(&eb->lru, &tree->buffer_lru); 2402 return 0; 2403 } 2404 static struct extent_buffer *find_lru(struct extent_map_tree *tree, 2405 u64 start, unsigned long len) 2406 { 2407 struct list_head *lru = &tree->buffer_lru; 2408 struct list_head *cur = lru->next; 2409 struct extent_buffer *eb; 2410 2411 if (list_empty(lru)) 2412 return NULL; 2413 2414 do { 2415 eb = list_entry(cur, struct extent_buffer, lru); 2416 if (eb->start == start && eb->len == len) { 2417 extent_buffer_get(eb); 2418 return eb; 2419 } 2420 cur = cur->next; 2421 } while (cur != lru); 2422 return NULL; 2423 } 2424 2425 static inline unsigned long num_extent_pages(u64 start, u64 len) 2426 { 2427 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 2428 (start >> PAGE_CACHE_SHIFT); 2429 } 2430 2431 static inline struct page *extent_buffer_page(struct extent_buffer *eb, 2432 unsigned long i) 2433 { 2434 struct page *p; 2435 struct address_space *mapping; 2436 2437 if (i == 0) 2438 return eb->first_page; 2439 i += eb->start >> PAGE_CACHE_SHIFT; 2440 mapping = eb->first_page->mapping; 2441 read_lock_irq(&mapping->tree_lock); 2442 p = radix_tree_lookup(&mapping->page_tree, i); 2443 read_unlock_irq(&mapping->tree_lock); 2444 return p; 2445 } 2446 2447 static struct extent_buffer *__alloc_extent_buffer(struct extent_map_tree *tree, 2448 u64 start, 2449 unsigned 
long len, 2450 gfp_t mask) 2451 { 2452 struct extent_buffer *eb = NULL; 2453 2454 spin_lock(&tree->lru_lock); 2455 eb = find_lru(tree, start, len); 2456 spin_unlock(&tree->lru_lock); 2457 if (eb) { 2458 return eb; 2459 } 2460 2461 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 2462 INIT_LIST_HEAD(&eb->lru); 2463 eb->start = start; 2464 eb->len = len; 2465 atomic_set(&eb->refs, 1); 2466 2467 return eb; 2468 } 2469 2470 static void __free_extent_buffer(struct extent_buffer *eb) 2471 { 2472 kmem_cache_free(extent_buffer_cache, eb); 2473 } 2474 2475 struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree, 2476 u64 start, unsigned long len, 2477 struct page *page0, 2478 gfp_t mask) 2479 { 2480 unsigned long num_pages = num_extent_pages(start, len); 2481 unsigned long i; 2482 unsigned long index = start >> PAGE_CACHE_SHIFT; 2483 struct extent_buffer *eb; 2484 struct page *p; 2485 struct address_space *mapping = tree->mapping; 2486 int uptodate = 1; 2487 2488 eb = __alloc_extent_buffer(tree, start, len, mask); 2489 if (!eb || IS_ERR(eb)) 2490 return NULL; 2491 2492 if (eb->flags & EXTENT_BUFFER_FILLED) 2493 goto lru_add; 2494 2495 if (page0) { 2496 eb->first_page = page0; 2497 i = 1; 2498 index++; 2499 page_cache_get(page0); 2500 mark_page_accessed(page0); 2501 set_page_extent_mapped(page0); 2502 WARN_ON(!PageUptodate(page0)); 2503 set_page_extent_head(page0, len); 2504 } else { 2505 i = 0; 2506 } 2507 for (; i < num_pages; i++, index++) { 2508 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); 2509 if (!p) { 2510 WARN_ON(1); 2511 goto fail; 2512 } 2513 set_page_extent_mapped(p); 2514 mark_page_accessed(p); 2515 if (i == 0) { 2516 eb->first_page = p; 2517 set_page_extent_head(p, len); 2518 } else { 2519 set_page_private(p, EXTENT_PAGE_PRIVATE); 2520 } 2521 if (!PageUptodate(p)) 2522 uptodate = 0; 2523 unlock_page(p); 2524 } 2525 if (uptodate) 2526 eb->flags |= EXTENT_UPTODATE; 2527 eb->flags |= EXTENT_BUFFER_FILLED; 2528 2529 lru_add: 2530 spin_lock(&tree->lru_lock); 2531 add_lru(tree, eb); 2532 spin_unlock(&tree->lru_lock); 2533 return eb; 2534 2535 fail: 2536 spin_lock(&tree->lru_lock); 2537 list_del_init(&eb->lru); 2538 spin_unlock(&tree->lru_lock); 2539 if (!atomic_dec_and_test(&eb->refs)) 2540 return NULL; 2541 for (index = 1; index < i; index++) { 2542 page_cache_release(extent_buffer_page(eb, index)); 2543 } 2544 if (i > 0) 2545 page_cache_release(extent_buffer_page(eb, 0)); 2546 __free_extent_buffer(eb); 2547 return NULL; 2548 } 2549 EXPORT_SYMBOL(alloc_extent_buffer); 2550 2551 struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree, 2552 u64 start, unsigned long len, 2553 gfp_t mask) 2554 { 2555 unsigned long num_pages = num_extent_pages(start, len); 2556 unsigned long i; 2557 unsigned long index = start >> PAGE_CACHE_SHIFT; 2558 struct extent_buffer *eb; 2559 struct page *p; 2560 struct address_space *mapping = tree->mapping; 2561 int uptodate = 1; 2562 2563 eb = __alloc_extent_buffer(tree, start, len, mask); 2564 if (!eb || IS_ERR(eb)) 2565 return NULL; 2566 2567 if (eb->flags & EXTENT_BUFFER_FILLED) 2568 goto lru_add; 2569 2570 for (i = 0; i < num_pages; i++, index++) { 2571 p = find_lock_page(mapping, index); 2572 if (!p) { 2573 goto fail; 2574 } 2575 set_page_extent_mapped(p); 2576 mark_page_accessed(p); 2577 2578 if (i == 0) { 2579 eb->first_page = p; 2580 set_page_extent_head(p, len); 2581 } else { 2582 set_page_private(p, EXTENT_PAGE_PRIVATE); 2583 } 2584 2585 if (!PageUptodate(p)) 2586 uptodate = 0; 2587 unlock_page(p); 2588 } 2589 
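	/*
	 * EXTENT_UPTODATE is only set below when every page backing the
	 * buffer was already uptodate; otherwise callers have to go
	 * through read_extent_buffer_pages() before using the buffer.
	 */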
if (uptodate) 2590 eb->flags |= EXTENT_UPTODATE; 2591 eb->flags |= EXTENT_BUFFER_FILLED; 2592 2593 lru_add: 2594 spin_lock(&tree->lru_lock); 2595 add_lru(tree, eb); 2596 spin_unlock(&tree->lru_lock); 2597 return eb; 2598 fail: 2599 spin_lock(&tree->lru_lock); 2600 list_del_init(&eb->lru); 2601 spin_unlock(&tree->lru_lock); 2602 if (!atomic_dec_and_test(&eb->refs)) 2603 return NULL; 2604 for (index = 1; index < i; index++) { 2605 page_cache_release(extent_buffer_page(eb, index)); 2606 } 2607 if (i > 0) 2608 page_cache_release(extent_buffer_page(eb, 0)); 2609 __free_extent_buffer(eb); 2610 return NULL; 2611 } 2612 EXPORT_SYMBOL(find_extent_buffer); 2613 2614 void free_extent_buffer(struct extent_buffer *eb) 2615 { 2616 unsigned long i; 2617 unsigned long num_pages; 2618 2619 if (!eb) 2620 return; 2621 2622 if (!atomic_dec_and_test(&eb->refs)) 2623 return; 2624 2625 WARN_ON(!list_empty(&eb->lru)); 2626 num_pages = num_extent_pages(eb->start, eb->len); 2627 2628 for (i = 1; i < num_pages; i++) { 2629 page_cache_release(extent_buffer_page(eb, i)); 2630 } 2631 page_cache_release(extent_buffer_page(eb, 0)); 2632 __free_extent_buffer(eb); 2633 } 2634 EXPORT_SYMBOL(free_extent_buffer); 2635 2636 int clear_extent_buffer_dirty(struct extent_map_tree *tree, 2637 struct extent_buffer *eb) 2638 { 2639 int set; 2640 unsigned long i; 2641 unsigned long num_pages; 2642 struct page *page; 2643 2644 u64 start = eb->start; 2645 u64 end = start + eb->len - 1; 2646 2647 set = clear_extent_dirty(tree, start, end, GFP_NOFS); 2648 num_pages = num_extent_pages(eb->start, eb->len); 2649 2650 for (i = 0; i < num_pages; i++) { 2651 page = extent_buffer_page(eb, i); 2652 lock_page(page); 2653 if (i == 0) 2654 set_page_extent_head(page, eb->len); 2655 else 2656 set_page_private(page, EXTENT_PAGE_PRIVATE); 2657 2658 /* 2659 * if we're on the last page or the first page and the 2660 * block isn't aligned on a page boundary, do extra checks 2661 * to make sure we don't clean page that is partially dirty 2662 */ 2663 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || 2664 ((i == num_pages - 1) && 2665 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { 2666 start = (u64)page->index << PAGE_CACHE_SHIFT; 2667 end = start + PAGE_CACHE_SIZE - 1; 2668 if (test_range_bit(tree, start, end, 2669 EXTENT_DIRTY, 0)) { 2670 unlock_page(page); 2671 continue; 2672 } 2673 } 2674 clear_page_dirty_for_io(page); 2675 write_lock_irq(&page->mapping->tree_lock); 2676 if (!PageDirty(page)) { 2677 radix_tree_tag_clear(&page->mapping->page_tree, 2678 page_index(page), 2679 PAGECACHE_TAG_DIRTY); 2680 } 2681 write_unlock_irq(&page->mapping->tree_lock); 2682 unlock_page(page); 2683 } 2684 return 0; 2685 } 2686 EXPORT_SYMBOL(clear_extent_buffer_dirty); 2687 2688 int wait_on_extent_buffer_writeback(struct extent_map_tree *tree, 2689 struct extent_buffer *eb) 2690 { 2691 return wait_on_extent_writeback(tree, eb->start, 2692 eb->start + eb->len - 1); 2693 } 2694 EXPORT_SYMBOL(wait_on_extent_buffer_writeback); 2695 2696 int set_extent_buffer_dirty(struct extent_map_tree *tree, 2697 struct extent_buffer *eb) 2698 { 2699 unsigned long i; 2700 unsigned long num_pages; 2701 2702 num_pages = num_extent_pages(eb->start, eb->len); 2703 for (i = 0; i < num_pages; i++) { 2704 struct page *page = extent_buffer_page(eb, i); 2705 /* writepage may need to do something special for the 2706 * first page, we have to make sure page->private is 2707 * properly set. releasepage may drop page->private 2708 * on us if the page isn't already dirty. 
2709 */ 2710 if (i == 0) { 2711 lock_page(page); 2712 set_page_extent_head(page, eb->len); 2713 } else if (PagePrivate(page) && 2714 page->private != EXTENT_PAGE_PRIVATE) { 2715 lock_page(page); 2716 set_page_extent_mapped(page); 2717 unlock_page(page); 2718 } 2719 __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); 2720 if (i == 0) 2721 unlock_page(page); 2722 } 2723 return set_extent_dirty(tree, eb->start, 2724 eb->start + eb->len - 1, GFP_NOFS); 2725 } 2726 EXPORT_SYMBOL(set_extent_buffer_dirty); 2727 2728 int set_extent_buffer_uptodate(struct extent_map_tree *tree, 2729 struct extent_buffer *eb) 2730 { 2731 unsigned long i; 2732 struct page *page; 2733 unsigned long num_pages; 2734 2735 num_pages = num_extent_pages(eb->start, eb->len); 2736 2737 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 2738 GFP_NOFS); 2739 for (i = 0; i < num_pages; i++) { 2740 page = extent_buffer_page(eb, i); 2741 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || 2742 ((i == num_pages - 1) && 2743 ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { 2744 check_page_uptodate(tree, page); 2745 continue; 2746 } 2747 SetPageUptodate(page); 2748 } 2749 return 0; 2750 } 2751 EXPORT_SYMBOL(set_extent_buffer_uptodate); 2752 2753 int extent_buffer_uptodate(struct extent_map_tree *tree, 2754 struct extent_buffer *eb) 2755 { 2756 if (eb->flags & EXTENT_UPTODATE) 2757 return 1; 2758 return test_range_bit(tree, eb->start, eb->start + eb->len - 1, 2759 EXTENT_UPTODATE, 1); 2760 } 2761 EXPORT_SYMBOL(extent_buffer_uptodate); 2762 2763 int read_extent_buffer_pages(struct extent_map_tree *tree, 2764 struct extent_buffer *eb, 2765 u64 start, 2766 int wait) 2767 { 2768 unsigned long i; 2769 unsigned long start_i; 2770 struct page *page; 2771 int err; 2772 int ret = 0; 2773 unsigned long num_pages; 2774 2775 if (eb->flags & EXTENT_UPTODATE) 2776 return 0; 2777 2778 if (0 && test_range_bit(tree, eb->start, eb->start + eb->len - 1, 2779 EXTENT_UPTODATE, 1)) { 2780 return 0; 2781 } 2782 2783 if (start) { 2784 WARN_ON(start < eb->start); 2785 start_i = (start >> PAGE_CACHE_SHIFT) - 2786 (eb->start >> PAGE_CACHE_SHIFT); 2787 } else { 2788 start_i = 0; 2789 } 2790 2791 num_pages = num_extent_pages(eb->start, eb->len); 2792 for (i = start_i; i < num_pages; i++) { 2793 page = extent_buffer_page(eb, i); 2794 if (PageUptodate(page)) { 2795 continue; 2796 } 2797 if (!wait) { 2798 if (TestSetPageLocked(page)) { 2799 continue; 2800 } 2801 } else { 2802 lock_page(page); 2803 } 2804 if (!PageUptodate(page)) { 2805 err = page->mapping->a_ops->readpage(NULL, page); 2806 if (err) { 2807 ret = err; 2808 } 2809 } else { 2810 unlock_page(page); 2811 } 2812 } 2813 2814 if (ret || !wait) { 2815 return ret; 2816 } 2817 2818 for (i = start_i; i < num_pages; i++) { 2819 page = extent_buffer_page(eb, i); 2820 wait_on_page_locked(page); 2821 if (!PageUptodate(page)) { 2822 ret = -EIO; 2823 } 2824 } 2825 if (!ret) 2826 eb->flags |= EXTENT_UPTODATE; 2827 return ret; 2828 } 2829 EXPORT_SYMBOL(read_extent_buffer_pages); 2830 2831 void read_extent_buffer(struct extent_buffer *eb, void *dstv, 2832 unsigned long start, 2833 unsigned long len) 2834 { 2835 size_t cur; 2836 size_t offset; 2837 struct page *page; 2838 char *kaddr; 2839 char *dst = (char *)dstv; 2840 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 2841 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 2842 unsigned long num_pages = num_extent_pages(eb->start, eb->len); 2843 2844 WARN_ON(start > eb->len); 2845 WARN_ON(start + len > eb->start + eb->len); 2846 
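	/*
	 * The buffer may begin part way into its first page, so byte
	 * offsets into the buffer are translated with:
	 *
	 *	start_offset = eb->start & (PAGE_CACHE_SIZE - 1)
	 *	page index   = (start_offset + start) >> PAGE_CACHE_SHIFT
	 *	page offset  = (start_offset + start) & (PAGE_CACHE_SIZE - 1)
	 *
	 * e.g. with 4K pages, eb->start = 13312 (page 3 plus 1024 bytes)
	 * and start = 3500 give start_offset = 1024, page index 1 within
	 * the buffer and an offset of 428 into that page.  The write,
	 * memcmp, memset and copy helpers below use the same translation.
	 */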
2847 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 2848 2849 while(len > 0) { 2850 page = extent_buffer_page(eb, i); 2851 if (!PageUptodate(page)) { 2852 printk("page %lu not up to date i %lu, total %lu, len %lu\n", page->index, i, num_pages, eb->len); 2853 WARN_ON(1); 2854 } 2855 WARN_ON(!PageUptodate(page)); 2856 2857 cur = min(len, (PAGE_CACHE_SIZE - offset)); 2858 kaddr = kmap_atomic(page, KM_USER1); 2859 memcpy(dst, kaddr + offset, cur); 2860 kunmap_atomic(kaddr, KM_USER1); 2861 2862 dst += cur; 2863 len -= cur; 2864 offset = 0; 2865 i++; 2866 } 2867 } 2868 EXPORT_SYMBOL(read_extent_buffer); 2869 2870 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, 2871 unsigned long min_len, char **token, char **map, 2872 unsigned long *map_start, 2873 unsigned long *map_len, int km) 2874 { 2875 size_t offset = start & (PAGE_CACHE_SIZE - 1); 2876 char *kaddr; 2877 struct page *p; 2878 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 2879 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 2880 unsigned long end_i = (start_offset + start + min_len - 1) >> 2881 PAGE_CACHE_SHIFT; 2882 2883 if (i != end_i) 2884 return -EINVAL; 2885 2886 if (i == 0) { 2887 offset = start_offset; 2888 *map_start = 0; 2889 } else { 2890 offset = 0; 2891 *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; 2892 } 2893 if (start + min_len > eb->len) { 2894 printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n", eb->start, eb->len, start, min_len); 2895 WARN_ON(1); 2896 } 2897 2898 p = extent_buffer_page(eb, i); 2899 WARN_ON(!PageUptodate(p)); 2900 kaddr = kmap_atomic(p, km); 2901 *token = kaddr; 2902 *map = kaddr + offset; 2903 *map_len = PAGE_CACHE_SIZE - offset; 2904 return 0; 2905 } 2906 EXPORT_SYMBOL(map_private_extent_buffer); 2907 2908 int map_extent_buffer(struct extent_buffer *eb, unsigned long start, 2909 unsigned long min_len, 2910 char **token, char **map, 2911 unsigned long *map_start, 2912 unsigned long *map_len, int km) 2913 { 2914 int err; 2915 int save = 0; 2916 if (eb->map_token) { 2917 unmap_extent_buffer(eb, eb->map_token, km); 2918 eb->map_token = NULL; 2919 save = 1; 2920 } 2921 err = map_private_extent_buffer(eb, start, min_len, token, map, 2922 map_start, map_len, km); 2923 if (!err && save) { 2924 eb->map_token = *token; 2925 eb->kaddr = *map; 2926 eb->map_start = *map_start; 2927 eb->map_len = *map_len; 2928 } 2929 return err; 2930 } 2931 EXPORT_SYMBOL(map_extent_buffer); 2932 2933 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) 2934 { 2935 kunmap_atomic(token, km); 2936 } 2937 EXPORT_SYMBOL(unmap_extent_buffer); 2938 2939 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, 2940 unsigned long start, 2941 unsigned long len) 2942 { 2943 size_t cur; 2944 size_t offset; 2945 struct page *page; 2946 char *kaddr; 2947 char *ptr = (char *)ptrv; 2948 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 2949 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 2950 int ret = 0; 2951 2952 WARN_ON(start > eb->len); 2953 WARN_ON(start + len > eb->start + eb->len); 2954 2955 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 2956 2957 while(len > 0) { 2958 page = extent_buffer_page(eb, i); 2959 WARN_ON(!PageUptodate(page)); 2960 2961 cur = min(len, (PAGE_CACHE_SIZE - offset)); 2962 2963 kaddr = kmap_atomic(page, KM_USER0); 2964 ret = memcmp(ptr, kaddr + offset, cur); 2965 kunmap_atomic(kaddr, KM_USER0); 2966 if (ret) 2967 break; 2968 2969 ptr += 
cur; 2970 len -= cur; 2971 offset = 0; 2972 i++; 2973 } 2974 return ret; 2975 } 2976 EXPORT_SYMBOL(memcmp_extent_buffer); 2977 2978 void write_extent_buffer(struct extent_buffer *eb, const void *srcv, 2979 unsigned long start, unsigned long len) 2980 { 2981 size_t cur; 2982 size_t offset; 2983 struct page *page; 2984 char *kaddr; 2985 char *src = (char *)srcv; 2986 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 2987 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 2988 2989 WARN_ON(start > eb->len); 2990 WARN_ON(start + len > eb->start + eb->len); 2991 2992 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 2993 2994 while(len > 0) { 2995 page = extent_buffer_page(eb, i); 2996 WARN_ON(!PageUptodate(page)); 2997 2998 cur = min(len, PAGE_CACHE_SIZE - offset); 2999 kaddr = kmap_atomic(page, KM_USER1); 3000 memcpy(kaddr + offset, src, cur); 3001 kunmap_atomic(kaddr, KM_USER1); 3002 3003 src += cur; 3004 len -= cur; 3005 offset = 0; 3006 i++; 3007 } 3008 } 3009 EXPORT_SYMBOL(write_extent_buffer); 3010 3011 void memset_extent_buffer(struct extent_buffer *eb, char c, 3012 unsigned long start, unsigned long len) 3013 { 3014 size_t cur; 3015 size_t offset; 3016 struct page *page; 3017 char *kaddr; 3018 size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); 3019 unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; 3020 3021 WARN_ON(start > eb->len); 3022 WARN_ON(start + len > eb->start + eb->len); 3023 3024 offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); 3025 3026 while(len > 0) { 3027 page = extent_buffer_page(eb, i); 3028 WARN_ON(!PageUptodate(page)); 3029 3030 cur = min(len, PAGE_CACHE_SIZE - offset); 3031 kaddr = kmap_atomic(page, KM_USER0); 3032 memset(kaddr + offset, c, cur); 3033 kunmap_atomic(kaddr, KM_USER0); 3034 3035 len -= cur; 3036 offset = 0; 3037 i++; 3038 } 3039 } 3040 EXPORT_SYMBOL(memset_extent_buffer); 3041 3042 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, 3043 unsigned long dst_offset, unsigned long src_offset, 3044 unsigned long len) 3045 { 3046 u64 dst_len = dst->len; 3047 size_t cur; 3048 size_t offset; 3049 struct page *page; 3050 char *kaddr; 3051 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3052 unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; 3053 3054 WARN_ON(src->len != dst_len); 3055 3056 offset = (start_offset + dst_offset) & 3057 ((unsigned long)PAGE_CACHE_SIZE - 1); 3058 3059 while(len > 0) { 3060 page = extent_buffer_page(dst, i); 3061 WARN_ON(!PageUptodate(page)); 3062 3063 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); 3064 3065 kaddr = kmap_atomic(page, KM_USER0); 3066 read_extent_buffer(src, kaddr + offset, src_offset, cur); 3067 kunmap_atomic(kaddr, KM_USER0); 3068 3069 src_offset += cur; 3070 len -= cur; 3071 offset = 0; 3072 i++; 3073 } 3074 } 3075 EXPORT_SYMBOL(copy_extent_buffer); 3076 3077 static void move_pages(struct page *dst_page, struct page *src_page, 3078 unsigned long dst_off, unsigned long src_off, 3079 unsigned long len) 3080 { 3081 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3082 if (dst_page == src_page) { 3083 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); 3084 } else { 3085 char *src_kaddr = kmap_atomic(src_page, KM_USER1); 3086 char *p = dst_kaddr + dst_off + len; 3087 char *s = src_kaddr + src_off + len; 3088 3089 while (len--) 3090 *--p = *--s; 3091 3092 kunmap_atomic(src_kaddr, KM_USER1); 3093 } 3094 kunmap_atomic(dst_kaddr, KM_USER0); 3095 } 3096 3097 
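/*
 * move_pages() above and copy_pages() below are the per-page building
 * blocks for memmove_extent_buffer() and memcpy_extent_buffer().  When
 * source and destination fall in the same page, move_pages() uses
 * memmove() so overlapping ranges stay safe; across different pages no
 * overlap is possible and either helper works.  memmove_extent_buffer()
 * walks the range from the tail whenever the destination sits above the
 * source, clamping each chunk so it never crosses a page boundary on
 * either side; memcpy_extent_buffer() walks forward.  For example, a
 * 6000 byte move from offset 100 to offset 2000 in a page-aligned
 * buffer (4K pages) is split into chunks of 2004, 1900 and 2096 bytes,
 * handled in that order from the end of the range.
 */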
static void copy_pages(struct page *dst_page, struct page *src_page, 3098 unsigned long dst_off, unsigned long src_off, 3099 unsigned long len) 3100 { 3101 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3102 char *src_kaddr; 3103 3104 if (dst_page != src_page) 3105 src_kaddr = kmap_atomic(src_page, KM_USER1); 3106 else 3107 src_kaddr = dst_kaddr; 3108 3109 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 3110 kunmap_atomic(dst_kaddr, KM_USER0); 3111 if (dst_page != src_page) 3112 kunmap_atomic(src_kaddr, KM_USER1); 3113 } 3114 3115 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 3116 unsigned long src_offset, unsigned long len) 3117 { 3118 size_t cur; 3119 size_t dst_off_in_page; 3120 size_t src_off_in_page; 3121 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3122 unsigned long dst_i; 3123 unsigned long src_i; 3124 3125 if (src_offset + len > dst->len) { 3126 printk("memmove bogus src_offset %lu move len %lu len %lu\n", 3127 src_offset, len, dst->len); 3128 BUG_ON(1); 3129 } 3130 if (dst_offset + len > dst->len) { 3131 printk("memmove bogus dst_offset %lu move len %lu len %lu\n", 3132 dst_offset, len, dst->len); 3133 BUG_ON(1); 3134 } 3135 3136 while(len > 0) { 3137 dst_off_in_page = (start_offset + dst_offset) & 3138 ((unsigned long)PAGE_CACHE_SIZE - 1); 3139 src_off_in_page = (start_offset + src_offset) & 3140 ((unsigned long)PAGE_CACHE_SIZE - 1); 3141 3142 dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; 3143 src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; 3144 3145 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - 3146 src_off_in_page)); 3147 cur = min_t(unsigned long, cur, 3148 (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); 3149 3150 copy_pages(extent_buffer_page(dst, dst_i), 3151 extent_buffer_page(dst, src_i), 3152 dst_off_in_page, src_off_in_page, cur); 3153 3154 src_offset += cur; 3155 dst_offset += cur; 3156 len -= cur; 3157 } 3158 } 3159 EXPORT_SYMBOL(memcpy_extent_buffer); 3160 3161 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 3162 unsigned long src_offset, unsigned long len) 3163 { 3164 size_t cur; 3165 size_t dst_off_in_page; 3166 size_t src_off_in_page; 3167 unsigned long dst_end = dst_offset + len - 1; 3168 unsigned long src_end = src_offset + len - 1; 3169 size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); 3170 unsigned long dst_i; 3171 unsigned long src_i; 3172 3173 if (src_offset + len > dst->len) { 3174 printk("memmove bogus src_offset %lu move len %lu len %lu\n", 3175 src_offset, len, dst->len); 3176 BUG_ON(1); 3177 } 3178 if (dst_offset + len > dst->len) { 3179 printk("memmove bogus dst_offset %lu move len %lu len %lu\n", 3180 dst_offset, len, dst->len); 3181 BUG_ON(1); 3182 } 3183 if (dst_offset < src_offset) { 3184 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 3185 return; 3186 } 3187 while(len > 0) { 3188 dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; 3189 src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; 3190 3191 dst_off_in_page = (start_offset + dst_end) & 3192 ((unsigned long)PAGE_CACHE_SIZE - 1); 3193 src_off_in_page = (start_offset + src_end) & 3194 ((unsigned long)PAGE_CACHE_SIZE - 1); 3195 3196 cur = min_t(unsigned long, len, src_off_in_page + 1); 3197 cur = min(cur, dst_off_in_page + 1); 3198 move_pages(extent_buffer_page(dst, dst_i), 3199 extent_buffer_page(dst, src_i), 3200 dst_off_in_page - cur + 1, 3201 src_off_in_page - cur + 1, cur); 3202 3203 dst_end -= cur; 3204 src_end -= cur; 3205 len -= cur; 3206 } 
3207 } 3208 EXPORT_SYMBOL(memmove_extent_buffer); 3209
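/*
 * Illustrative sketch only (not part of the interface above): one way a
 * caller might read a 64 bit field out of an extent_buffer, preferring
 * the cheap map_private_extent_buffer() path and falling back to
 * read_extent_buffer() when the field straddles a page boundary.  The
 * function name and the choice of a u64 field are made up for the
 * example; real users of these helpers live in the btrfs tree code.
 */
#if 0
static u64 example_read_u64(struct extent_buffer *eb, unsigned long offset)
{
	char *token;
	char *map;
	unsigned long map_start;
	unsigned long map_len;
	u64 val;

	if (map_private_extent_buffer(eb, offset, sizeof(val), &token, &map,
				      &map_start, &map_len, KM_USER0) == 0) {
		/* map points at buffer offset map_start within one page */
		memcpy(&val, map + (offset - map_start), sizeof(val));
		unmap_extent_buffer(eb, token, KM_USER0);
	} else {
		/* the range crosses a page, copy it out the slow way */
		read_extent_buffer(eb, &val, offset, sizeof(val));
	}
	return val;
}
#endif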