/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/vmalloc.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

/* max number of rbios kept on the stripe cache LRU before we prune */
#define RBIO_CACHE_SIZE 1024

struct btrfs_raid_bio {
	struct btrfs_fs_info *fs_info;
	struct btrfs_bio *bbio;

	/*
	 * logical block numbers for the start of each stripe
	 * The last one or two are p/q.  These are sorted,
	 * so raid_map[0] is the start of our full stripe
	 */
	u64 *raid_map;

	/* while we're doing rmw on a stripe
	 * we put it into a hash table so we can
	 * lock the stripe and merge more rbios
	 * into it.
	 */
	struct list_head hash_list;

	/*
	 * LRU list for the stripe cache
	 */
	struct list_head stripe_cache;

	/*
	 * for scheduling work in the helper threads
	 */
	struct btrfs_work work;

	/*
	 * bio list and bio_list_lock are used
	 * to add more bios into the stripe
	 * in hopes of avoiding the full rmw
	 */
	struct bio_list bio_list;
	spinlock_t bio_list_lock;

	/* also protected by the bio_list_lock, the
	 * plug list is used by the plugging code
	 * to collect partial bios while plugged.  The
	 * stripe locking code also uses it to hand off
	 * the stripe lock to the next pending IO
	 */
	struct list_head plug_list;

	/*
	 * flags that tell us if it is safe to
	 * merge with this bio
	 */
	unsigned long flags;

	/* size of each individual stripe on disk */
	int stripe_len;

	/* number of data stripes (no p/q) */
	int nr_data;

	/*
	 * set if we're doing a parity rebuild
	 * for a read from higher up, which is handled
	 * differently from a parity rebuild as part of
	 * rmw
	 */
	int read_rebuild;

	/* first bad stripe */
	int faila;

	/* second bad stripe (for raid6 use) */
	int failb;

	/*
	 * number of pages needed to represent the full
	 * stripe
	 */
	int nr_pages;

	/*
	 * size of all the bios in the bio_list.  This
	 * helps us decide if the rbio maps to a full
	 * stripe or not
	 */
	int bio_list_bytes;

	/* reference count; freed by __free_raid_bio when it hits zero */
	atomic_t refs;

	/*
	 * these are two arrays of pointers.  We allocate the
	 * rbio big enough to hold them both and setup their
	 * locations when the rbio is allocated
	 */

	/* pointers to pages that we allocated for
	 * reading/writing stripes directly from the disk (including P/Q)
	 */
	struct page **stripe_pages;

	/*
	 * pointers to the pages in the bio_list.  Stored
	 * here for faster lookup
	 */
	struct page **bio_pages;
};

static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct btrfs_work *work);
static void read_rebuild_work(struct btrfs_work *work);
static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
static void async_read_rebuild(struct btrfs_raid_bio *rbio);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 *
 * Returns 0 on success (or if the table already exists), -ENOMEM
 * if neither kzalloc nor vzalloc can satisfy the allocation.
 */
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;
	int table_size;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned on.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table_size = sizeof(*table) + sizeof(*h) * num_entries;
	table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!table) {
		table = vzalloc(table_size);
		if (!table)
			return -ENOMEM;
	}

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
		init_waitqueue_head(&cur->wait);
	}

	/*
	 * racing mounts may both get here; only one table wins the
	 * cmpxchg, the loser is freed with whichever allocator made it
	 */
	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	if (x) {
		if (is_vmalloc_addr(x))
			vfree(x);
		else
			kfree(x);
	}
	return 0;
}

/*
 * caching an rbio means to copy anything from the
 * bio_pages array into the stripe_pages array.  We
 * use the page uptodate bit in the stripe cache array
 * to indicate if it has valid data
 *
 * once the caching is done, we set the cache ready
 * bit.
 */
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	char *s;
	char *d;
	int ret;

	/*
	 * if we can't get pages, just skip caching entirely;
	 * RBIO_CACHE_READY_BIT stays clear so nobody trusts the pages
	 */
	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_pages; i++) {
		if (!rbio->bio_pages[i])
			continue;

		s = kmap(rbio->bio_pages[i]);
		d = kmap(rbio->stripe_pages[i]);

		memcpy(d, s, PAGE_CACHE_SIZE);

		kunmap(rbio->bio_pages[i]);
		kunmap(rbio->stripe_pages[i]);
		SetPageUptodate(rbio->stripe_pages[i]);
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}

/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->raid_map[0];

	/*
	 * we shift down quite a bit.  We're using byte
	 * addressing, and most of the lower bits are zeros.
	 * This tends to upset hash_64, and it consistently
	 * returns just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}

/*
 * stealing an rbio means taking all the uptodate pages from the stripe
 * array in the source rbio and putting them into the destination rbio
 */
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;
	struct page *s;
	struct page *d;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		s = src->stripe_pages[i];
		if (!s || !PageUptodate(s)) {
			continue;
		}

		/* move ownership of the page; free dest's old copy */
		d = dest->stripe_pages[i];
		if (d)
			__free_page(d);

		dest->stripe_pages[i] = s;
		src->stripe_pages[i] = NULL;
	}
}

/*
 * merging means we take the bio_list from the victim and
 * splice it into the destination.  The victim should
 * be discarded afterwards.
 *
 * must be called with dest->rbio_list_lock held
 */
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	bio_list_init(&victim->bio_list);
}

/*
 * used to prune items that are in the cache.  The caller
 * must hold the hash table lock.
 */
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/* hold the lock for the bucket because we may be
	 * removing it from the hash table
	 */
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/* if the bio list isn't empty, this rbio is
		 * still involved in an IO.  We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_table, and drop
		 * the corresponding ref
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				atomic_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	/* drop the ref held for the cache list, outside the locks */
	if (freeit)
		__free_raid_bio(rbio);
}

/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	__remove_rbio_from_cache(rbio);
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_entry(table->stripe_cache.next,
				  struct btrfs_raid_bio,
				  stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove all cached entries and free the hash table
 * used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	/* the table may have come from either allocator, see alloc above */
	if (is_vmalloc_addr(info->stripe_hash_table))
		vfree(info->stripe_hash_table);
	else
		kfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}

/*
 * insert an rbio into the stripe cache.  It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the lru.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		atomic_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)) {
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	/* over the limit: evict the LRU tail (but never ourselves) */
	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_entry(table->stripe_cache.prev,
				   struct btrfs_raid_bio,
				   stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock_irqrestore(&table->cache_lock, flags);
	return;
}

/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}

/*
 * returns true if the bio list inside this rbio
 * covers an entire stripe (no rmw required).
 * Must be called with the bio list lock held, or
 * at a time when you know it is impossible to add
 * new bios into the list
 */
static int __rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	if (size != rbio->nr_data * rbio->stripe_len)
		ret = 0;

	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
	return ret;
}

/* locked wrapper around __rbio_is_full */
static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	ret = __rbio_is_full(rbio);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
	return ret;
}

/*
 * returns 1 if it is safe to merge two rbios together.
 * The merging is safe if the two rbios correspond to
 * the same stripe and if they are both going in the same
 * direction (read vs write), and if neither one is
 * locked for final IO
 *
 * The caller is responsible for locking such that
 * rmw_locked is safe to test
 */
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	/* either one locked for final IO means no merging */
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us.  We can
	 * steal from cached rbio's though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	/* must be the same full stripe (raid_map[0] is the stripe start) */
	if (last->raid_map[0] !=
	    cur->raid_map[0])
		return 0;

	/* reads can't merge with writes */
	if (last->read_rebuild !=
	    cur->read_rebuild) {
		return 0;
	}

	return 1;
}

/*
 * helper to index into the pstripe
 */
static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	/* p stripe pages start right after the nr_data data stripes */
	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
	return rbio->stripe_pages[index];
}

/*
 * helper to index into the qstripe, returns null
 * if there is no qstripe
 */
static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	/* raid5: only one parity stripe, so there is no q */
	if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
		return NULL;

	index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
		PAGE_CACHE_SHIFT;
	return rbio->stripe_pages[index];
}

/*
 * The first stripe in the table for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * with the lock owner.  The rbio is freed and the IO will
 * start automatically along with the existing rbio.  1 is returned.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started automatically.
 * 1 is returned
 *
 * If we return 0, the caller still owns the rbio and must continue with
 * IO submission.  If we return 1, the caller must assume the rbio has
 * already been freed.
 */
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	unsigned long flags;
	DEFINE_WAIT(wait);
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;
	int walk = 0;

	spin_lock_irqsave(&h->lock, flags);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		walk++;
		if (cur->raid_map[0] == rbio->raid_map[0]) {
			spin_lock(&cur->bio_list_lock);

			/* can we steal this cached rbio's pages? */
			if (bio_list_empty(&cur->bio_list) &&
			    list_empty(&cur->plug_list) &&
			    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
			    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
				list_del_init(&cur->hash_list);
				atomic_dec(&cur->refs);

				steal_rbio(cur, rbio);
				/* drop cur from the cache after unlocking h */
				cache_drop = cur;
				spin_unlock(&cur->bio_list_lock);

				goto lockit;
			}

			/* can we merge into the lock owner? */
			if (rbio_can_merge(cur, rbio)) {
				merge_rbio(cur, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}


			/*
			 * we couldn't merge with the running
			 * rbio, see if we can merge with the
			 * pending ones.  We don't have to
			 * check for rmw_locked because there
			 * is no way they are inside finish_rmw
			 * right now
			 */
			list_for_each_entry(pending, &cur->plug_list,
					    plug_list) {
				if (rbio_can_merge(pending, rbio)) {
					merge_rbio(pending, rbio);
					spin_unlock(&cur->bio_list_lock);
					freeit = rbio;
					ret = 1;
					goto out;
				}
			}

			/* no merging, put us on the tail of the plug list,
			 * our rbio will be started with the currently
			 * running rbio unlocks
			 */
			list_add_tail(&rbio->plug_list, &cur->plug_list);
			spin_unlock(&cur->bio_list_lock);
			ret = 1;
			goto out;
		}
	}
lockit:
	/* we own the lock: the hash list holds a ref on us */
	atomic_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock_irqrestore(&h->lock, flags);
	/* deferred frees happen outside the bucket lock */
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		__free_raid_bio(freeit);
	return ret;
}

/*
 * called as rmw or parity rebuild is completed.
If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started
 */
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	unsigned long flags;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->fs_info->stripe_hash_table->table + bucket;

	/* only cache when nobody is queued behind us on the plug list */
	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock_irqsave(&h->lock, flags);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		atomic_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			/* next takes over the hash list entry and its ref */
			list_add(&next->hash_list, &h->hash_list);
			atomic_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);

			if (next->read_rebuild)
				async_read_rebuild(next);
			else {
				steal_rbio(rbio, next);
				async_rmw_stripe(next);
			}

			goto done_nolock;
		} else if (waitqueue_active(&h->wait)) {
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);
			wake_up(&h->wait);
			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock_irqrestore(&h->lock, flags);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}

/*
 * drop a reference on the rbio, freeing everything (pages, raid_map,
 * bbio and the rbio itself) once the last reference is gone
 */
static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	WARN_ON(atomic_read(&rbio->refs) < 0);
	if (!atomic_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->stripe_cache));
	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}
	kfree(rbio->raid_map);
	kfree(rbio->bbio);
	kfree(rbio);
}

/* release the stripe lock and drop our reference */
static void free_raid_bio(struct btrfs_raid_bio *rbio)
{
	unlock_stripe(rbio);
	__free_raid_bio(rbio);
}

/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
{
	/* detach the list first; the rbio may go away in free_raid_bio */
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *next;
	free_raid_bio(rbio);

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		if (uptodate)
			set_bit(BIO_UPTODATE, &cur->bi_flags);
		bio_endio(cur, err);
		cur = next;
	}
}

/*
 * end io function used by finish_rmw.  When we finally
 * get here, we've written a full stripe
 */
static void raid_write_end_io(struct bio *bio, int err)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (err)
		fail_bio_stripe(rbio, bio);

	bio_put(bio);

	/* wait until every stripe write has completed */
	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
		return;

	err = 0;

	/* OK, we have read all the stripes we need to. */
	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
		err = -EIO;

	rbio_orig_end_io(rbio, err, 0);
	return;
}

/*
 * the read/modify/write code wants to use the original bio for
 * any pages it included, and then use the rbio for everything
 * else.  This function decides if a given index (stripe number)
 * and page number in that stripe fall inside the original bio
 * or the rbio.
 *
 * if you set bio_list_only, you'll get a NULL back for any ranges
 * that are outside the bio_list
 *
 * This doesn't take any refs on anything, you get a bare page pointer
 * and the caller must bump refs as required.
 *
 * You must call index_rbio_pages once before you can trust
 * the answers from this function.
889 */ 890 static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, 891 int index, int pagenr, int bio_list_only) 892 { 893 int chunk_page; 894 struct page *p = NULL; 895 896 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; 897 898 spin_lock_irq(&rbio->bio_list_lock); 899 p = rbio->bio_pages[chunk_page]; 900 spin_unlock_irq(&rbio->bio_list_lock); 901 902 if (p || bio_list_only) 903 return p; 904 905 return rbio->stripe_pages[chunk_page]; 906 } 907 908 /* 909 * number of pages we need for the entire stripe across all the 910 * drives 911 */ 912 static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) 913 { 914 unsigned long nr = stripe_len * nr_stripes; 915 return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 916 } 917 918 /* 919 * allocation and initial setup for the btrfs_raid_bio. Not 920 * this does not allocate any pages for rbio->pages. 921 */ 922 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, 923 struct btrfs_bio *bbio, u64 *raid_map, 924 u64 stripe_len) 925 { 926 struct btrfs_raid_bio *rbio; 927 int nr_data = 0; 928 int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); 929 void *p; 930 931 rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, 932 GFP_NOFS); 933 if (!rbio) { 934 kfree(raid_map); 935 kfree(bbio); 936 return ERR_PTR(-ENOMEM); 937 } 938 939 bio_list_init(&rbio->bio_list); 940 INIT_LIST_HEAD(&rbio->plug_list); 941 spin_lock_init(&rbio->bio_list_lock); 942 INIT_LIST_HEAD(&rbio->stripe_cache); 943 INIT_LIST_HEAD(&rbio->hash_list); 944 rbio->bbio = bbio; 945 rbio->raid_map = raid_map; 946 rbio->fs_info = root->fs_info; 947 rbio->stripe_len = stripe_len; 948 rbio->nr_pages = num_pages; 949 rbio->faila = -1; 950 rbio->failb = -1; 951 atomic_set(&rbio->refs, 1); 952 953 /* 954 * the stripe_pages and bio_pages array point to the extra 955 * memory we allocated past the end of the rbio 956 */ 957 p = rbio + 1; 958 rbio->stripe_pages = p; 959 rbio->bio_pages = p + 
sizeof(struct page *) * num_pages; 960 961 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) 962 nr_data = bbio->num_stripes - 2; 963 else 964 nr_data = bbio->num_stripes - 1; 965 966 rbio->nr_data = nr_data; 967 return rbio; 968 } 969 970 /* allocate pages for all the stripes in the bio, including parity */ 971 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) 972 { 973 int i; 974 struct page *page; 975 976 for (i = 0; i < rbio->nr_pages; i++) { 977 if (rbio->stripe_pages[i]) 978 continue; 979 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 980 if (!page) 981 return -ENOMEM; 982 rbio->stripe_pages[i] = page; 983 ClearPageUptodate(page); 984 } 985 return 0; 986 } 987 988 /* allocate pages for just the p/q stripes */ 989 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) 990 { 991 int i; 992 struct page *page; 993 994 i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; 995 996 for (; i < rbio->nr_pages; i++) { 997 if (rbio->stripe_pages[i]) 998 continue; 999 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 1000 if (!page) 1001 return -ENOMEM; 1002 rbio->stripe_pages[i] = page; 1003 } 1004 return 0; 1005 } 1006 1007 /* 1008 * add a single page from a specific stripe into our list of bios for IO 1009 * this will try to merge into existing bios if possible, and returns 1010 * zero if all went well. 
 */
static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
			    struct bio_list *bio_list,
			    struct page *page,
			    int stripe_nr,
			    unsigned long page_index,
			    unsigned long bio_max_len)
{
	struct bio *last = bio_list->tail;
	u64 last_end = 0;
	int ret;
	struct bio *bio;
	struct btrfs_bio_stripe *stripe;
	u64 disk_start;

	stripe = &rbio->bbio->stripes[stripe_nr];
	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev)
		return fail_rbio_index(rbio, stripe_nr);

	/* see if we can add this page onto our existing bio */
	if (last) {
		/* end of the last bio, in bytes (bi_sector is in 512b units) */
		last_end = (u64)last->bi_iter.bi_sector << 9;
		last_end += last->bi_iter.bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && stripe->dev->bdev &&
		    test_bit(BIO_UPTODATE, &last->bi_flags) &&
		    last->bi_bdev == stripe->dev->bdev) {
			ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
			if (ret == PAGE_CACHE_SIZE)
				return 0;
		}
	}

	/* put a new bio on the list, sized for the rest of the stripe */
	bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT ?: 1);
	if (!bio)
		return -ENOMEM;

	bio->bi_iter.bi_size = 0;
	bio->bi_bdev = stripe->dev->bdev;
	bio->bi_iter.bi_sector = disk_start >> 9;
	set_bit(BIO_UPTODATE, &bio->bi_flags);

	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
	bio_list_add(bio_list, bio);
	return 0;
}

/*
 * while we're doing the read/modify/write cycle, we could
 * have errors in reading pages off the disk.  This checks
 * for errors and if we're not able to read the page it'll
 * trigger parity reconstruction.
The rmw will be finished 1071 * after we've reconstructed the failed stripes 1072 */ 1073 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) 1074 { 1075 if (rbio->faila >= 0 || rbio->failb >= 0) { 1076 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); 1077 __raid56_parity_recover(rbio); 1078 } else { 1079 finish_rmw(rbio); 1080 } 1081 } 1082 1083 /* 1084 * these are just the pages from the rbio array, not from anything 1085 * the FS sent down to us 1086 */ 1087 static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) 1088 { 1089 int index; 1090 index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); 1091 index += page; 1092 return rbio->stripe_pages[index]; 1093 } 1094 1095 /* 1096 * helper function to walk our bio list and populate the bio_pages array with 1097 * the result. This seems expensive, but it is faster than constantly 1098 * searching through the bio list as we setup the IO in finish_rmw or stripe 1099 * reconstruction. 1100 * 1101 * This must be called before you trust the answers from page_in_rbio 1102 */ 1103 static void index_rbio_pages(struct btrfs_raid_bio *rbio) 1104 { 1105 struct bio *bio; 1106 u64 start; 1107 unsigned long stripe_offset; 1108 unsigned long page_index; 1109 struct page *p; 1110 int i; 1111 1112 spin_lock_irq(&rbio->bio_list_lock); 1113 bio_list_for_each(bio, &rbio->bio_list) { 1114 start = (u64)bio->bi_iter.bi_sector << 9; 1115 stripe_offset = start - rbio->raid_map[0]; 1116 page_index = stripe_offset >> PAGE_CACHE_SHIFT; 1117 1118 for (i = 0; i < bio->bi_vcnt; i++) { 1119 p = bio->bi_io_vec[i].bv_page; 1120 rbio->bio_pages[page_index + i] = p; 1121 } 1122 } 1123 spin_unlock_irq(&rbio->bio_list_lock); 1124 } 1125 1126 /* 1127 * this is called from one of two situations. We either 1128 * have a full stripe from the higher layers, or we've read all 1129 * the missing bits off disk. 1130 * 1131 * This will calculate the parity and then send down any 1132 * changed blocks. 
 */
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
	struct btrfs_bio *bbio = rbio->bbio;
	/* one kmap'd page pointer per stripe (data + P [+ Q]) for the math */
	void *pointers[bbio->num_stripes];
	int stripe_len = rbio->stripe_len;
	int nr_data = rbio->nr_data;
	int stripe;
	int pagenr;
	int p_stripe = -1;
	int q_stripe = -1;
	struct bio_list bio_list;
	struct bio *bio;
	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
	int ret;

	bio_list_init(&bio_list);

	/* P is always the last stripe; Q (raid6 only) sits just before it */
	if (bbio->num_stripes - rbio->nr_data == 1) {
		p_stripe = bbio->num_stripes - 1;
	} else if (bbio->num_stripes - rbio->nr_data == 2) {
		p_stripe = bbio->num_stripes - 2;
		q_stripe = bbio->num_stripes - 1;
	} else {
		/* anything other than 1 or 2 parity stripes is a bug */
		BUG();
	}

	/* at this point we either have a full stripe,
	 * or we've read the full stripe from the drive.
	 * recalculate the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	spin_lock_irq(&rbio->bio_list_lock);
	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
	spin_unlock_irq(&rbio->bio_list_lock);

	atomic_set(&rbio->bbio->error, 0);

	/*
	 * now that we've set rmw_locked, run through the
	 * bio list one last time and map the page pointers
	 *
	 * We don't cache full rbios because we're assuming
	 * the higher layers are unlikely to use this area of
	 * the disk again soon.  If they do use it again,
	 * hopefully they will send another full bio.
	 */
	index_rbio_pages(rbio);
	if (!rbio_is_full(rbio))
		cache_rbio_pages(rbio);
	else
		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
		struct page *p;
		/* first collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			p = page_in_rbio(rbio, stripe, pagenr, 0);
			pointers[stripe] = kmap(p);
		}

		/* then add the parity stripe */
		p = rbio_pstripe_page(rbio, pagenr);
		SetPageUptodate(p);
		pointers[stripe++] = kmap(p);

		if (q_stripe != -1) {

			/*
			 * raid6, add the qstripe and call the
			 * library function to fill in our p/q
			 */
			p = rbio_qstripe_page(rbio, pagenr);
			SetPageUptodate(p);
			pointers[stripe++] = kmap(p);

			raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
						pointers);
		} else {
			/* raid5: P = xor of all data blocks */
			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
			run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
		}


		/*
		 * NOTE(review): assumes page_in_rbio(..., 0) hands back the
		 * same pages we kmap'd above for every stripe, including
		 * parity — confirm against page_in_rbio's definition.
		 */
		for (stripe = 0; stripe < bbio->num_stripes; stripe++)
			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
	}

	/*
	 * time to start writing.  Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
	 * everything else.
	 */
	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
		for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
			struct page *page;
			if (stripe < rbio->nr_data) {
				/* skip data pages the caller didn't touch */
				page = page_in_rbio(rbio, stripe, pagenr, 1);
				if (!page)
					continue;
			} else {
				/* parity always gets written */
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}

			ret = rbio_add_io_page(rbio, &bio_list,
				       page, stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
	BUG_ON(atomic_read(&bbio->stripes_pending) == 0);

	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid_write_end_io;
		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
		submit_bio(WRITE, bio);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, -EIO, 0);
}

/*
 * helper to find the stripe number for a given bio.  Used to figure out which
 * stripe has failed.  This expects the bio to correspond to a physical disk,
 * so it looks up based on physical sector numbers.
 */
static int find_bio_stripe(struct btrfs_raid_bio *rbio,
			   struct bio *bio)
{
	u64 physical = bio->bi_iter.bi_sector;
	u64 stripe_start;
	int i;
	struct btrfs_bio_stripe *stripe;

	/* bi_sector is in 512 byte units; convert to bytes */
	physical <<= 9;

	for (i = 0; i < rbio->bbio->num_stripes; i++) {
		stripe = &rbio->bbio->stripes[i];
		stripe_start = stripe->physical;
		if (physical >= stripe_start &&
		    physical < stripe_start + rbio->stripe_len) {
			return i;
		}
	}
	/* -1: no stripe on this rbio covers the bio's start sector */
	return -1;
}

/*
 * helper to find the stripe number for a given
 * bio (before mapping).  Used to figure out which stripe has
 * failed.  This looks up based on logical block numbers.
1297 */ 1298 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, 1299 struct bio *bio) 1300 { 1301 u64 logical = bio->bi_iter.bi_sector; 1302 u64 stripe_start; 1303 int i; 1304 1305 logical <<= 9; 1306 1307 for (i = 0; i < rbio->nr_data; i++) { 1308 stripe_start = rbio->raid_map[i]; 1309 if (logical >= stripe_start && 1310 logical < stripe_start + rbio->stripe_len) { 1311 return i; 1312 } 1313 } 1314 return -1; 1315 } 1316 1317 /* 1318 * returns -EIO if we had too many failures 1319 */ 1320 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) 1321 { 1322 unsigned long flags; 1323 int ret = 0; 1324 1325 spin_lock_irqsave(&rbio->bio_list_lock, flags); 1326 1327 /* we already know this stripe is bad, move on */ 1328 if (rbio->faila == failed || rbio->failb == failed) 1329 goto out; 1330 1331 if (rbio->faila == -1) { 1332 /* first failure on this rbio */ 1333 rbio->faila = failed; 1334 atomic_inc(&rbio->bbio->error); 1335 } else if (rbio->failb == -1) { 1336 /* second failure on this rbio */ 1337 rbio->failb = failed; 1338 atomic_inc(&rbio->bbio->error); 1339 } else { 1340 ret = -EIO; 1341 } 1342 out: 1343 spin_unlock_irqrestore(&rbio->bio_list_lock, flags); 1344 1345 return ret; 1346 } 1347 1348 /* 1349 * helper to fail a stripe based on a physical disk 1350 * bio. 1351 */ 1352 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, 1353 struct bio *bio) 1354 { 1355 int failed = find_bio_stripe(rbio, bio); 1356 1357 if (failed < 0) 1358 return -EIO; 1359 1360 return fail_rbio_index(rbio, failed); 1361 } 1362 1363 /* 1364 * this sets each page in the bio uptodate. 
It should only be used on private 1365 * rbio pages, nothing that comes in from the higher layers 1366 */ 1367 static void set_bio_pages_uptodate(struct bio *bio) 1368 { 1369 int i; 1370 struct page *p; 1371 1372 for (i = 0; i < bio->bi_vcnt; i++) { 1373 p = bio->bi_io_vec[i].bv_page; 1374 SetPageUptodate(p); 1375 } 1376 } 1377 1378 /* 1379 * end io for the read phase of the rmw cycle. All the bios here are physical 1380 * stripe bios we've read from the disk so we can recalculate the parity of the 1381 * stripe. 1382 * 1383 * This will usually kick off finish_rmw once all the bios are read in, but it 1384 * may trigger parity reconstruction if we had any errors along the way 1385 */ 1386 static void raid_rmw_end_io(struct bio *bio, int err) 1387 { 1388 struct btrfs_raid_bio *rbio = bio->bi_private; 1389 1390 if (err) 1391 fail_bio_stripe(rbio, bio); 1392 else 1393 set_bio_pages_uptodate(bio); 1394 1395 bio_put(bio); 1396 1397 if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) 1398 return; 1399 1400 err = 0; 1401 if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) 1402 goto cleanup; 1403 1404 /* 1405 * this will normally call finish_rmw to start our write 1406 * but if there are any failed stripes we'll reconstruct 1407 * from parity first 1408 */ 1409 validate_rbio_for_rmw(rbio); 1410 return; 1411 1412 cleanup: 1413 1414 rbio_orig_end_io(rbio, -EIO, 0); 1415 } 1416 1417 static void async_rmw_stripe(struct btrfs_raid_bio *rbio) 1418 { 1419 btrfs_init_work(&rbio->work, rmw_work, NULL, NULL); 1420 1421 btrfs_queue_work(rbio->fs_info->rmw_workers, 1422 &rbio->work); 1423 } 1424 1425 static void async_read_rebuild(struct btrfs_raid_bio *rbio) 1426 { 1427 btrfs_init_work(&rbio->work, read_rebuild_work, NULL, NULL); 1428 1429 btrfs_queue_work(rbio->fs_info->rmw_workers, 1430 &rbio->work); 1431 } 1432 1433 /* 1434 * the stripe must be locked by the caller. 
 * It will unlock after all the writes are done.
 */
static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct btrfs_bio *bbio = rbio->bbio;
	struct bio_list bio_list;
	int ret;
	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	int pagenr;
	int stripe;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_pages(rbio);
	if (ret)
		goto cleanup;

	index_rbio_pages(rbio);

	atomic_set(&rbio->bbio->error, 0);
	/*
	 * build a list of bios to read all the missing parts of this
	 * stripe
	 */
	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
			struct page *page;
			/*
			 * we want to find all the pages missing from
			 * the rbio and read them from the disk.  If
			 * page_in_rbio finds a page in the bio list
			 * we don't need to read it off the stripe.
			 */
			page = page_in_rbio(rbio, stripe, pagenr, 1);
			if (page)
				continue;

			page = rbio_stripe_page(rbio, stripe, pagenr);
			/*
			 * the bio cache may have handed us an uptodate
			 * page.  If so, be happy and use it
			 */
			if (PageUptodate(page))
				continue;

			ret = rbio_add_io_page(rbio, &bio_list, page,
				       stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * this can happen if others have merged with
		 * us, it means there is nothing left to read.
		 * But if there are missing devices it may not be
		 * safe to do the full stripe write yet.
		 */
		goto finish;
	}

	/*
	 * the bbio may be freed once we submit the last bio.  Make sure
	 * not to touch it after that
	 */
	atomic_set(&bbio->stripes_pending, bios_to_read);
	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid_rmw_end_io;

		btrfs_bio_wq_end_io(rbio->fs_info, bio,
				    BTRFS_WQ_ENDIO_RAID56);

		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
		submit_bio(READ, bio);
	}
	/* the actual write will happen once the reads are done */
	return 0;

cleanup:
	rbio_orig_end_io(rbio, -EIO, 0);
	return -EIO;

finish:
	validate_rbio_for_rmw(rbio);
	return 0;
}

/*
 * if the upper layers pass in a full stripe, we thank them by only allocating
 * enough pages to hold the parity, and sending it all down quickly.
 */
static int full_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = alloc_rbio_parity_pages(rbio);
	if (ret) {
		__free_raid_bio(rbio);
		return ret;
	}

	/*
	 * NOTE(review): presumably lock_stripe_add() returns 0 when we own
	 * the stripe lock, nonzero when the rbio was queued behind the
	 * current lock holder — confirm against its definition.
	 */
	ret = lock_stripe_add(rbio);
	if (ret == 0)
		finish_rmw(rbio);
	return 0;
}

/*
 * partial stripe writes get handed over to async helpers.
 * We're really hoping to merge a few more writes into this
 * rbio before calculating new parity
 */
static int partial_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = lock_stripe_add(rbio);
	if (ret == 0)
		async_rmw_stripe(rbio);
	return 0;
}

/*
 * sometimes while we were reading from the drive to
 * recalculate parity, enough new bios come into create
 * a full stripe.
So we do a check here to see if we can 1570 * go directly to finish_rmw 1571 */ 1572 static int __raid56_parity_write(struct btrfs_raid_bio *rbio) 1573 { 1574 /* head off into rmw land if we don't have a full stripe */ 1575 if (!rbio_is_full(rbio)) 1576 return partial_stripe_write(rbio); 1577 return full_stripe_write(rbio); 1578 } 1579 1580 /* 1581 * We use plugging call backs to collect full stripes. 1582 * Any time we get a partial stripe write while plugged 1583 * we collect it into a list. When the unplug comes down, 1584 * we sort the list by logical block number and merge 1585 * everything we can into the same rbios 1586 */ 1587 struct btrfs_plug_cb { 1588 struct blk_plug_cb cb; 1589 struct btrfs_fs_info *info; 1590 struct list_head rbio_list; 1591 struct btrfs_work work; 1592 }; 1593 1594 /* 1595 * rbios on the plug list are sorted for easier merging. 1596 */ 1597 static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) 1598 { 1599 struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, 1600 plug_list); 1601 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, 1602 plug_list); 1603 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; 1604 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; 1605 1606 if (a_sector < b_sector) 1607 return -1; 1608 if (a_sector > b_sector) 1609 return 1; 1610 return 0; 1611 } 1612 1613 static void run_plug(struct btrfs_plug_cb *plug) 1614 { 1615 struct btrfs_raid_bio *cur; 1616 struct btrfs_raid_bio *last = NULL; 1617 1618 /* 1619 * sort our plug list then try to merge 1620 * everything we can in hopes of creating full 1621 * stripes. 
1622 */ 1623 list_sort(NULL, &plug->rbio_list, plug_cmp); 1624 while (!list_empty(&plug->rbio_list)) { 1625 cur = list_entry(plug->rbio_list.next, 1626 struct btrfs_raid_bio, plug_list); 1627 list_del_init(&cur->plug_list); 1628 1629 if (rbio_is_full(cur)) { 1630 /* we have a full stripe, send it down */ 1631 full_stripe_write(cur); 1632 continue; 1633 } 1634 if (last) { 1635 if (rbio_can_merge(last, cur)) { 1636 merge_rbio(last, cur); 1637 __free_raid_bio(cur); 1638 continue; 1639 1640 } 1641 __raid56_parity_write(last); 1642 } 1643 last = cur; 1644 } 1645 if (last) { 1646 __raid56_parity_write(last); 1647 } 1648 kfree(plug); 1649 } 1650 1651 /* 1652 * if the unplug comes from schedule, we have to push the 1653 * work off to a helper thread 1654 */ 1655 static void unplug_work(struct btrfs_work *work) 1656 { 1657 struct btrfs_plug_cb *plug; 1658 plug = container_of(work, struct btrfs_plug_cb, work); 1659 run_plug(plug); 1660 } 1661 1662 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) 1663 { 1664 struct btrfs_plug_cb *plug; 1665 plug = container_of(cb, struct btrfs_plug_cb, cb); 1666 1667 if (from_schedule) { 1668 btrfs_init_work(&plug->work, unplug_work, NULL, NULL); 1669 btrfs_queue_work(plug->info->rmw_workers, 1670 &plug->work); 1671 return; 1672 } 1673 run_plug(plug); 1674 } 1675 1676 /* 1677 * our main entry point for writes from the rest of the FS. 
1678 */ 1679 int raid56_parity_write(struct btrfs_root *root, struct bio *bio, 1680 struct btrfs_bio *bbio, u64 *raid_map, 1681 u64 stripe_len) 1682 { 1683 struct btrfs_raid_bio *rbio; 1684 struct btrfs_plug_cb *plug = NULL; 1685 struct blk_plug_cb *cb; 1686 1687 rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 1688 if (IS_ERR(rbio)) 1689 return PTR_ERR(rbio); 1690 bio_list_add(&rbio->bio_list, bio); 1691 rbio->bio_list_bytes = bio->bi_iter.bi_size; 1692 1693 /* 1694 * don't plug on full rbios, just get them out the door 1695 * as quickly as we can 1696 */ 1697 if (rbio_is_full(rbio)) 1698 return full_stripe_write(rbio); 1699 1700 cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, 1701 sizeof(*plug)); 1702 if (cb) { 1703 plug = container_of(cb, struct btrfs_plug_cb, cb); 1704 if (!plug->info) { 1705 plug->info = root->fs_info; 1706 INIT_LIST_HEAD(&plug->rbio_list); 1707 } 1708 list_add_tail(&rbio->plug_list, &plug->rbio_list); 1709 } else { 1710 return __raid56_parity_write(rbio); 1711 } 1712 return 0; 1713 } 1714 1715 /* 1716 * all parity reconstruction happens here. We've read in everything 1717 * we can find from the drives and this does the heavy lifting of 1718 * sorting the good from the bad. 
 */
static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
	int pagenr, stripe;
	void **pointers;
	int faila = -1, failb = -1;
	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	struct page *page;
	int err;
	int i;

	/* one kmap'd page pointer per stripe for the recovery math */
	pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
			   GFP_NOFS);
	if (!pointers) {
		err = -ENOMEM;
		goto cleanup_io;
	}

	faila = rbio->faila;
	failb = rbio->failb;

	/* reads can't take new merges once we start reconstruction */
	if (rbio->read_rebuild) {
		spin_lock_irq(&rbio->bio_list_lock);
		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
		spin_unlock_irq(&rbio->bio_list_lock);
	}

	index_rbio_pages(rbio);

	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
		/* setup our array of pointers with pages
		 * from each stripe
		 */
		for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
			/*
			 * if we're rebuilding a read, we have to use
			 * pages from the bio list
			 */
			if (rbio->read_rebuild &&
			    (stripe == faila || stripe == failb)) {
				page = page_in_rbio(rbio, stripe, pagenr, 0);
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}
			pointers[stripe] = kmap(page);
		}

		/* all raid6 handling here */
		if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
		    RAID6_Q_STRIPE) {

			/*
			 * single failure, rebuild from parity raid5
			 * style
			 */
			if (failb < 0) {
				if (faila == rbio->nr_data) {
					/*
					 * Just the P stripe has failed, without
					 * a bad data or Q stripe.
					 * TODO, we should redo the xor here.
					 */
					err = -EIO;
					goto cleanup;
				}
				/*
				 * a single failure in raid6 is rebuilt
				 * in the pstripe code below
				 */
				goto pstripe;
			}

			/* make sure our ps and qs are in order */
			if (faila > failb) {
				int tmp = failb;
				failb = faila;
				faila = tmp;
			}

			/* if the q stripe is failed, do a pstripe reconstruction
			 * from the xors.
			 * If both the q stripe and the P stripe are failed, we're
			 * here due to a crc mismatch and we can't give them the
			 * data they want
			 */
			if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
				if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
					err = -EIO;
					goto cleanup;
				}
				/*
				 * otherwise we have one bad data stripe and
				 * a good P stripe.  raid5!
				 */
				goto pstripe;
			}

			if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
				/* one data stripe + P lost: recover via Q */
				raid6_datap_recov(rbio->bbio->num_stripes,
						  PAGE_SIZE, faila, pointers);
			} else {
				/* two data stripes lost: recover via P + Q */
				raid6_2data_recov(rbio->bbio->num_stripes,
						  PAGE_SIZE, faila, failb,
						  pointers);
			}
		} else {
			void *p;

			/* rebuild from P stripe here (raid5 or raid6) */
			BUG_ON(failb != -1);
pstripe:
			/* Copy parity block into failed block to start with */
			memcpy(pointers[faila],
			       pointers[rbio->nr_data],
			       PAGE_CACHE_SIZE);

			/* rearrange the pointer array */
			p = pointers[faila];
			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				pointers[stripe] = pointers[stripe + 1];
			pointers[rbio->nr_data - 1] = p;

			/* xor in the rest */
			run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
		}
		/* if we're doing this rebuild as part of an rmw, go through
		 * and set all of our private rbio pages in the
		 * failed stripes as uptodate.  This way finish_rmw will
		 * know they can be trusted.  If this was a read reconstruction,
		 * other endio functions will fiddle the uptodate bits
		 */
		if (!rbio->read_rebuild) {
			/*
			 * NOTE(review): this sweep over all nr_pages runs once
			 * per pagenr iteration, re-marking the same pages each
			 * time — redundant but harmless as written.
			 */
			for (i = 0; i < nr_pages; i++) {
				if (faila != -1) {
					page = rbio_stripe_page(rbio, faila, i);
					SetPageUptodate(page);
				}
				if (failb != -1) {
					page = rbio_stripe_page(rbio, failb, i);
					SetPageUptodate(page);
				}
			}
		}
		for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
			/*
			 * if we're rebuilding a read, we have to use
			 * pages from the bio list
			 */
			if (rbio->read_rebuild &&
			    (stripe == faila || stripe == failb)) {
				page = page_in_rbio(rbio, stripe, pagenr, 0);
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}
			kunmap(page);
		}
	}

	err = 0;
cleanup:
	kfree(pointers);

cleanup_io:

	if (rbio->read_rebuild) {
		if (err == 0)
			cache_rbio_pages(rbio);
		else
			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

		rbio_orig_end_io(rbio, err, err == 0);
	} else if (err == 0) {
		/* rebuilt in place; clear failures and finish the rmw */
		rbio->faila = -1;
		rbio->failb = -1;
		finish_rmw(rbio);
	} else {
		rbio_orig_end_io(rbio, err, 0);
	}
}

/*
 * This is called only for stripes we've read from disk to
 * reconstruct the parity.
 */
static void raid_recover_end_io(struct bio *bio, int err)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	/*
	 * we only read stripe pages off the disk, set them
	 * up to date if there were no errors
	 */
	if (err)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);
	bio_put(bio);

	/* only the last completing read triggers reconstruction */
	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
		return;

	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
		rbio_orig_end_io(rbio, -EIO, 0);
	else
		__raid_recover_end_io(rbio);
}

/*
 * reads everything we need off the disk to reconstruct
 * the parity. endio handlers trigger final reconstruction
 * when the IO is done.
 *
 * This is used both for reads from the higher layers and for
 * parity construction required to finish a rmw cycle.
 */
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct btrfs_bio *bbio = rbio->bbio;
	struct bio_list bio_list;
	int ret;
	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	int pagenr;
	int stripe;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_pages(rbio);
	if (ret)
		goto cleanup;

	atomic_set(&rbio->bbio->error, 0);

	/*
	 * read everything that hasn't failed.  Thanks to the
	 * stripe cache, it is possible that some or all of these
	 * pages are going to be uptodate.
	 */
	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
		/* count failed stripes as errors instead of reading them */
		if (rbio->faila == stripe || rbio->failb == stripe) {
			atomic_inc(&rbio->bbio->error);
			continue;
		}

		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
			struct page *p;

			/*
			 * the rmw code may have already read this
			 * page in
			 */
			p = rbio_stripe_page(rbio, stripe, pagenr);
			if (PageUptodate(p))
				continue;

			ret = rbio_add_io_page(rbio, &bio_list,
				       rbio_stripe_page(rbio, stripe, pagenr),
				       stripe, pagenr, rbio->stripe_len);
			if (ret < 0)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * we might have no bios to read just because the pages
		 * were up to date, or we might have no bios to read because
		 * the devices were gone.
		 */
		if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
			__raid_recover_end_io(rbio);
			goto out;
		} else {
			goto cleanup;
		}
	}

	/*
	 * the bbio may be freed once we submit the last bio.  Make sure
	 * not to touch it after that
	 */
	atomic_set(&bbio->stripes_pending, bios_to_read);
	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid_recover_end_io;

		btrfs_bio_wq_end_io(rbio->fs_info, bio,
				    BTRFS_WQ_ENDIO_RAID56);

		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
		submit_bio(READ, bio);
	}
out:
	return 0;

cleanup:
	/* rmw-path callers end their own bios; only end reads here */
	if (rbio->read_rebuild)
		rbio_orig_end_io(rbio, -EIO, 0);
	return -EIO;
}

/*
 * the main entry point for reads from the higher layers.  This
 * is really only called when the normal read path had a failure,
 * so we assume the bio they send down corresponds to a failed part
 * of the drive.
2031 */ 2032 int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, 2033 struct btrfs_bio *bbio, u64 *raid_map, 2034 u64 stripe_len, int mirror_num) 2035 { 2036 struct btrfs_raid_bio *rbio; 2037 int ret; 2038 2039 rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 2040 if (IS_ERR(rbio)) 2041 return PTR_ERR(rbio); 2042 2043 rbio->read_rebuild = 1; 2044 bio_list_add(&rbio->bio_list, bio); 2045 rbio->bio_list_bytes = bio->bi_iter.bi_size; 2046 2047 rbio->faila = find_logical_bio_stripe(rbio, bio); 2048 if (rbio->faila == -1) { 2049 BUG(); 2050 kfree(raid_map); 2051 kfree(bbio); 2052 kfree(rbio); 2053 return -EIO; 2054 } 2055 2056 /* 2057 * reconstruct from the q stripe if they are 2058 * asking for mirror 3 2059 */ 2060 if (mirror_num == 3) 2061 rbio->failb = bbio->num_stripes - 2; 2062 2063 ret = lock_stripe_add(rbio); 2064 2065 /* 2066 * __raid56_parity_recover will end the bio with 2067 * any errors it hits. We don't want to return 2068 * its error value up the stack because our caller 2069 * will end up calling bio_endio with any nonzero 2070 * return 2071 */ 2072 if (ret == 0) 2073 __raid56_parity_recover(rbio); 2074 /* 2075 * our rbio has been added to the list of 2076 * rbios that will be handled after the 2077 * currently lock owner is done 2078 */ 2079 return 0; 2080 2081 } 2082 2083 static void rmw_work(struct btrfs_work *work) 2084 { 2085 struct btrfs_raid_bio *rbio; 2086 2087 rbio = container_of(work, struct btrfs_raid_bio, work); 2088 raid56_rmw_stripe(rbio); 2089 } 2090 2091 static void read_rebuild_work(struct btrfs_work *work) 2092 { 2093 struct btrfs_raid_bio *rbio; 2094 2095 rbio = container_of(work, struct btrfs_raid_bio, work); 2096 __raid56_parity_recover(rbio); 2097 } 2098