/*
   drbd_actlog.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/slab.h>
#include <linux/drbd.h>
#include "drbd_int.h"
#include "drbd_wrappers.h"

/* We maintain a trivial checksum in our on disk activity log.
 * With that we can ensure correct operation even when the storage
 * device might do a partial (last) sector write while losing power.
 */
struct __packed al_transaction {
	u32 magic;
	u32 tr_number;
	struct __packed {
		u32 pos;
		u32 extent;
	} updates[1 + AL_EXTENTS_PT];
	u32 xor_sum;
};

struct update_odbm_work {
	struct drbd_work w;
	unsigned int enr;
};

struct update_al_work {
	struct drbd_work w;
	struct lc_element *al_ext;
	struct completion event;
	unsigned int enr;
	/* if old_enr != LC_FREE, write corresponding bitmap sector, too */
	unsigned int old_enr;
};

struct drbd_atodb_wait {
	atomic_t count;
	struct completion io_done;
	struct drbd_conf *mdev;
	int error;
};


int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);

static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
				 struct drbd_backing_dev *bdev,
				 struct page *page, sector_t sector,
				 int rw, int size)
{
	struct bio *bio;
	struct drbd_md_io md_io;
	int ok;

	md_io.mdev = mdev;
	init_completion(&md_io.event);
	md_io.error = 0;

	if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))
		rw |= REQ_FUA;
	rw |= REQ_UNPLUG | REQ_SYNC;

	bio = bio_alloc(GFP_NOIO, 1);
	bio->bi_bdev = bdev->md_bdev;
	bio->bi_sector = sector;
	ok = (bio_add_page(bio, page, size, 0) == size);
	if (!ok)
		goto out;
	bio->bi_private = &md_io;
	bio->bi_end_io = drbd_md_io_complete;
	bio->bi_rw = rw;

	if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
		bio_endio(bio, -EIO);
	else
		submit_bio(rw, bio);
	wait_for_completion(&md_io.event);
	ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;

 out:
	bio_put(bio);
	return ok;
}

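/**
 * drbd_md_sync_page_io() - Synchronously read or write one meta data sector
 * @mdev:	DRBD device.
 * @bdev:	Backing device holding the meta data.
 * @sector:	Sector to access.
 * @rw:		READ or WRITE.
 *
 * Uses mdev->md_io_page; the caller must hold md_io_mutex.  If the backing
 * device uses logical blocks larger than 512 bytes, the access is widened to
 * a full logical block via the pre-allocated md_io_tmpp bounce page
 * (read-modify-write for WRITE requests).
 */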
int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
			 sector_t sector, int rw)
{
	int logical_block_size, mask, ok;
	int offset = 0;
	struct page *iop = mdev->md_io_page;

	D_ASSERT(mutex_is_locked(&mdev->md_io_mutex));

	BUG_ON(!bdev->md_bdev);

	logical_block_size = bdev_logical_block_size(bdev->md_bdev);
	if (logical_block_size == 0)
		logical_block_size = MD_SECTOR_SIZE;

	/* in case logical_block_size != 512 [ s390 only? ] */
	if (logical_block_size != MD_SECTOR_SIZE) {
		mask = (logical_block_size / MD_SECTOR_SIZE) - 1;
		D_ASSERT(mask == 1 || mask == 3 || mask == 7);
		D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE);
		offset = sector & mask;
		sector = sector & ~mask;
		iop = mdev->md_io_tmpp;

		if (rw & WRITE) {
			/* these are GFP_KERNEL pages, pre-allocated
			 * on device initialization */
			void *p = page_address(mdev->md_io_page);
			void *hp = page_address(mdev->md_io_tmpp);

			ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector,
					READ, logical_block_size);

			if (unlikely(!ok)) {
				dev_err(DEV, "drbd_md_sync_page_io(,%llus,"
				    "READ [logical_block_size!=512]) failed!\n",
				    (unsigned long long)sector);
				return 0;
			}

			memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE);
		}
	}

	if (sector < drbd_md_first_sector(bdev) ||
	    sector > drbd_md_last_sector(bdev))
		dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
		     current->comm, current->pid, __func__,
		     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");

	ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size);
	if (unlikely(!ok)) {
		dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n",
		    (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
		return 0;
	}

	if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) {
		void *p = page_address(mdev->md_io_page);
		void *hp = page_address(mdev->md_io_tmpp);

		memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE);
	}

	return ok;
}

static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
{
	struct lc_element *al_ext;
	struct lc_element *tmp;
	unsigned long al_flags = 0;

	spin_lock_irq(&mdev->al_lock);
	tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT);
	if (unlikely(tmp != NULL)) {
		struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
		if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
			spin_unlock_irq(&mdev->al_lock);
			return NULL;
		}
	}
	al_ext = lc_get(mdev->act_log, enr);
	al_flags = mdev->act_log->flags;
	spin_unlock_irq(&mdev->al_lock);

	/*
	if (!al_ext) {
		if (al_flags & LC_STARVING)
			dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n");
		if (al_flags & LC_DIRTY)
			dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n");
	}
	*/

	return al_ext;
}

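/**
 * drbd_al_begin_io() - Mark the activity log extent covering @sector as active
 * @mdev:	DRBD device.
 * @sector:	Sector of the application write that is about to start.
 *
 * Waits until a reference on the corresponding AL extent could be obtained.
 * If that extent was not active yet, an activity log transaction is written
 * (delegated to the worker thread) before this function returns.
 */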
void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector)
{
	unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
	struct lc_element *al_ext;
	struct update_al_work al_work;

	D_ASSERT(atomic_read(&mdev->local_cnt) > 0);

	wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)));

	if (al_ext->lc_number != enr) {
		/* drbd_al_write_transaction(mdev,al_ext,enr);
		 * recurses into generic_make_request(), which
		 * disallows recursion, bios being serialized on the
		 * current->bio_tail list now.
		 * we have to delegate updates to the activity log
		 * to the worker thread. */
		init_completion(&al_work.event);
		al_work.al_ext = al_ext;
		al_work.enr = enr;
		al_work.old_enr = al_ext->lc_number;
		al_work.w.cb = w_al_write_transaction;
		drbd_queue_work_front(&mdev->data.work, &al_work.w);
		wait_for_completion(&al_work.event);

		mdev->al_writ_cnt++;

		spin_lock_irq(&mdev->al_lock);
		lc_changed(mdev->act_log, al_ext);
		spin_unlock_irq(&mdev->al_lock);
		wake_up(&mdev->al_wait);
	}
}

void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
{
	unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
	struct lc_element *extent;
	unsigned long flags;

	spin_lock_irqsave(&mdev->al_lock, flags);

	extent = lc_find(mdev->act_log, enr);

	if (!extent) {
		spin_unlock_irqrestore(&mdev->al_lock, flags);
		dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
		return;
	}

	if (lc_put(mdev->act_log, extent) == 0)
		wake_up(&mdev->al_wait);

	spin_unlock_irqrestore(&mdev->al_lock, flags);
}

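/* Worker callback for drbd_al_begin_io(): write one activity log transaction
 * to the meta data area.  If an extent is being evicted while we are not
 * connected, the on-disk bitmap sector covering that extent is written out
 * first. */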
int
w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	struct update_al_work *aw = container_of(w, struct update_al_work, w);
	struct lc_element *updated = aw->al_ext;
	const unsigned int new_enr = aw->enr;
	const unsigned int evicted = aw->old_enr;
	struct al_transaction *buffer;
	sector_t sector;
	int i, n, mx;
	unsigned int extent_nr;
	u32 xor_sum = 0;

	if (!get_ldev(mdev)) {
		dev_err(DEV,
			"disk is %s, cannot start al transaction (-%d +%d)\n",
			drbd_disk_str(mdev->state.disk), evicted, new_enr);
		complete(&((struct update_al_work *)w)->event);
		return 1;
	}
	/* do we have to do a bitmap write, first?
	 * TODO reduce maximum latency:
	 * submit both bios, then wait for both,
	 * instead of doing two synchronous sector writes.
	 * For now, we must not write the transaction,
	 * if we cannot write out the bitmap of the evicted extent. */
	if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
		drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT);

	/* The bitmap write may have failed, causing a state change. */
	if (mdev->state.disk < D_INCONSISTENT) {
		dev_err(DEV,
			"disk is %s, cannot write al transaction (-%d +%d)\n",
			drbd_disk_str(mdev->state.disk), evicted, new_enr);
		complete(&((struct update_al_work *)w)->event);
		put_ldev(mdev);
		return 1;
	}

	mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */
	buffer = (struct al_transaction *)page_address(mdev->md_io_page);

	buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
	buffer->tr_number = cpu_to_be32(mdev->al_tr_number);

	n = lc_index_of(mdev->act_log, updated);

	buffer->updates[0].pos = cpu_to_be32(n);
	buffer->updates[0].extent = cpu_to_be32(new_enr);

	xor_sum ^= new_enr;

	mx = min_t(int, AL_EXTENTS_PT,
		   mdev->act_log->nr_elements - mdev->al_tr_cycle);
	for (i = 0; i < mx; i++) {
		unsigned idx = mdev->al_tr_cycle + i;
		extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number;
		buffer->updates[i+1].pos = cpu_to_be32(idx);
		buffer->updates[i+1].extent = cpu_to_be32(extent_nr);
		xor_sum ^= extent_nr;
	}
	for (; i < AL_EXTENTS_PT; i++) {
		buffer->updates[i+1].pos = __constant_cpu_to_be32(-1);
		buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE);
		xor_sum ^= LC_FREE;
	}
	mdev->al_tr_cycle += AL_EXTENTS_PT;
	if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
		mdev->al_tr_cycle = 0;

	buffer->xor_sum = cpu_to_be32(xor_sum);

	sector =  mdev->ldev->md.md_offset
		+ mdev->ldev->md.al_offset + mdev->al_tr_pos;

	if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
		drbd_chk_io_error(mdev, 1, TRUE);

	if (++mdev->al_tr_pos >
	    div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
		mdev->al_tr_pos = 0;

	D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
	mdev->al_tr_number++;

	mutex_unlock(&mdev->md_io_mutex);

	complete(&((struct update_al_work *)w)->event);
	put_ldev(mdev);

	return 1;
}

/**
 * drbd_al_read_tr() - Read a single transaction from the on disk activity log
 * @mdev:	DRBD device.
 * @bdev:	Block device to read from.
 * @b:		pointer to an al_transaction.
 * @index:	On disk slot of the transaction to read.
 *
 * Returns -1 on IO error, 0 on checksum error and 1 upon success.
 */
static int drbd_al_read_tr(struct drbd_conf *mdev,
			   struct drbd_backing_dev *bdev,
			   struct al_transaction *b,
			   int index)
{
	sector_t sector;
	int rv, i;
	u32 xor_sum = 0;

	sector = bdev->md.md_offset + bdev->md.al_offset + index;

	/* Don't process errors normally,
	 * as this is done before disk is attached! */
	if (!drbd_md_sync_page_io(mdev, bdev, sector, READ))
		return -1;

	rv = (be32_to_cpu(b->magic) == DRBD_MAGIC);

	for (i = 0; i < AL_EXTENTS_PT + 1; i++)
		xor_sum ^= be32_to_cpu(b->updates[i].extent);
	rv &= (xor_sum == be32_to_cpu(b->xor_sum));

	return rv;
}

/**
 * drbd_al_read_log() - Restores the activity log from its on disk representation.
 * @mdev:	DRBD device.
 * @bdev:	Block device to read from.
 *
 * Returns 1 on success, returns 0 when reading the log failed due to IO errors.
 */
int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
{
	struct al_transaction *buffer;
	int i;
	int rv;
	int mx;
	int active_extents = 0;
	int transactions = 0;
	int found_valid = 0;
	int from = 0;
	int to = 0;
	u32 from_tnr = 0;
	u32 to_tnr = 0;
	u32 cnr;

	mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT);

	/* lock out all other meta data io for now,
	 * and make sure the page is mapped.
	 */
	mutex_lock(&mdev->md_io_mutex);
	buffer = page_address(mdev->md_io_page);

	/* Find the valid transaction in the log */
	for (i = 0; i <= mx; i++) {
		rv = drbd_al_read_tr(mdev, bdev, buffer, i);
		if (rv == 0)
			continue;
		if (rv == -1) {
			mutex_unlock(&mdev->md_io_mutex);
			return 0;
		}
		cnr = be32_to_cpu(buffer->tr_number);

		if (++found_valid == 1) {
			from = i;
			to = i;
			from_tnr = cnr;
			to_tnr = cnr;
			continue;
		}
		if ((int)cnr - (int)from_tnr < 0) {
			D_ASSERT(from_tnr - cnr + i - from == mx+1);
			from = i;
			from_tnr = cnr;
		}
		if ((int)cnr - (int)to_tnr > 0) {
			D_ASSERT(cnr - to_tnr == i - to);
			to = i;
			to_tnr = cnr;
		}
	}

	if (!found_valid) {
		dev_warn(DEV, "No usable activity log found.\n");
		mutex_unlock(&mdev->md_io_mutex);
		return 1;
	}

	/* Read the valid transactions.
	 * dev_info(DEV, "Reading from %d to %d.\n",from,to); */
	i = from;
	while (1) {
		int j, pos;
		unsigned int extent_nr;
		unsigned int trn;

		rv = drbd_al_read_tr(mdev, bdev, buffer, i);
		ERR_IF(rv == 0) goto cancel;
		if (rv == -1) {
			mutex_unlock(&mdev->md_io_mutex);
			return 0;
		}

		trn = be32_to_cpu(buffer->tr_number);

		spin_lock_irq(&mdev->al_lock);

		/* This loop runs backwards because in the cyclic
		   elements there might be an old version of the
		   updated element (in slot 0). So the element in slot 0
		   can overwrite old versions. */
		for (j = AL_EXTENTS_PT; j >= 0; j--) {
			pos = be32_to_cpu(buffer->updates[j].pos);
			extent_nr = be32_to_cpu(buffer->updates[j].extent);

			if (extent_nr == LC_FREE)
				continue;

			lc_set(mdev->act_log, extent_nr, pos);
			active_extents++;
		}
		spin_unlock_irq(&mdev->al_lock);

		transactions++;

cancel:
		if (i == to)
			break;
		i++;
		if (i > mx)
			i = 0;
	}

	mdev->al_tr_number = to_tnr+1;
	mdev->al_tr_pos = to;
	if (++mdev->al_tr_pos >
	    div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
		mdev->al_tr_pos = 0;

	/* ok, we are done with it */
	mutex_unlock(&mdev->md_io_mutex);

	dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
	     transactions, active_extents);

	return 1;
}

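/* bio completion callback used by drbd_al_to_on_disk_bm():
 * records IO errors in the shared drbd_atodb_wait and completes io_done
 * once the last pending bio has finished. */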
static void atodb_endio(struct bio *bio, int error)
{
	struct drbd_atodb_wait *wc = bio->bi_private;
	struct drbd_conf *mdev = wc->mdev;
	struct page *page;
	int uptodate = bio_flagged(bio, BIO_UPTODATE);

	/* strange behavior of some lower level drivers...
	 * fail the request by clearing the uptodate flag,
	 * but do not return any error?! */
	if (!error && !uptodate)
		error = -EIO;

	drbd_chk_io_error(mdev, error, TRUE);
	if (error && wc->error == 0)
		wc->error = error;

	if (atomic_dec_and_test(&wc->count))
		complete(&wc->io_done);

	page = bio->bi_io_vec[0].bv_page;
	put_page(page);
	bio_put(bio);
	mdev->bm_writ_cnt++;
	put_ldev(mdev);
}

/* sector to word */
#define S2W(s)	((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))

/* activity log to on disk bitmap -- prepare bio unless that sector
 * is already covered by previously prepared bios */
static int atodb_prepare_unless_covered(struct drbd_conf *mdev,
					struct bio **bios,
					unsigned int enr,
					struct drbd_atodb_wait *wc) __must_hold(local)
{
	struct bio *bio;
	struct page *page;
	sector_t on_disk_sector;
	unsigned int page_offset = PAGE_SIZE;
	int offset;
	int i = 0;
	int err = -ENOMEM;

	/* We always write aligned, full 4k blocks,
	 * so we can ignore the logical_block_size (for now) */
	enr &= ~7U;
	on_disk_sector = enr + mdev->ldev->md.md_offset
			     + mdev->ldev->md.bm_offset;

	D_ASSERT(!(on_disk_sector & 7U));

	/* Check if that enr is already covered by a previously created bio.
	 * Caution, bios[] is not NULL terminated,
	 * but only initialized to all NULL.
	 * For a completely scattered activity log,
	 * the last invocation iterates over all bios,
	 * and finds the last NULL entry.
	 */
	while ((bio = bios[i])) {
		if (bio->bi_sector == on_disk_sector)
			return 0;
		i++;
	}
	/* bios[i] == NULL, the next not yet used slot */

	/* GFP_KERNEL, we are not in the write-out path */
	bio = bio_alloc(GFP_KERNEL, 1);
	if (bio == NULL)
		return -ENOMEM;

	if (i > 0) {
		const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec;
		page_offset = prev_bv->bv_offset + prev_bv->bv_len;
		page = prev_bv->bv_page;
	}
	if (page_offset == PAGE_SIZE) {
		page = alloc_page(__GFP_HIGHMEM);
		if (page == NULL)
			goto out_bio_put;
		page_offset = 0;
	} else {
		get_page(page);
	}

	offset = S2W(enr);
	drbd_bm_get_lel(mdev, offset,
			min_t(size_t, S2W(8), drbd_bm_words(mdev) - offset),
			kmap(page) + page_offset);
	kunmap(page);

	bio->bi_private = wc;
	bio->bi_end_io = atodb_endio;
	bio->bi_bdev = mdev->ldev->md_bdev;
	bio->bi_sector = on_disk_sector;

	if (bio_add_page(bio, page, 4096, page_offset) != 4096)
		goto out_put_page;

	atomic_inc(&wc->count);
	/* we already know that we may do this...
	 * get_ldev_if_state(mdev,D_ATTACHING);
	 * just get the extra reference, so that the local_cnt reflects
	 * the number of pending IO requests DRBD has at its backing device.
	 */
	atomic_inc(&mdev->local_cnt);

	bios[i] = bio;

	return 0;

out_put_page:
	err = -EINVAL;
	put_page(page);
out_bio_put:
	bio_put(bio);
	return err;
}

/**
 * drbd_al_to_on_disk_bm() - Writes bitmap parts covered by active AL extents
 * @mdev:	DRBD device.
 *
 * Called when we detach (unconfigure) local storage,
 * or when we go from R_PRIMARY to R_SECONDARY role.
 */
void drbd_al_to_on_disk_bm(struct drbd_conf *mdev)
{
	int i, nr_elements;
	unsigned int enr;
	struct bio **bios;
	struct drbd_atodb_wait wc;

	ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING))
		return; /* sorry, I don't have any act_log etc... */

	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));

	nr_elements = mdev->act_log->nr_elements;

	/* GFP_KERNEL, we are not in anyone's write-out path */
	bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL);
	if (!bios)
		goto submit_one_by_one;

	atomic_set(&wc.count, 0);
	init_completion(&wc.io_done);
	wc.mdev = mdev;
	wc.error = 0;

	for (i = 0; i < nr_elements; i++) {
		enr = lc_element_by_index(mdev->act_log, i)->lc_number;
		if (enr == LC_FREE)
			continue;
		/* next statement also does atomic_inc wc.count and local_cnt */
		if (atodb_prepare_unless_covered(mdev, bios,
						enr/AL_EXT_PER_BM_SECT,
						&wc))
			goto free_bios_submit_one_by_one;
	}

	/* unnecessary optimization? */
	lc_unlock(mdev->act_log);
	wake_up(&mdev->al_wait);

	/* all prepared, submit them */
	for (i = 0; i < nr_elements; i++) {
		if (bios[i] == NULL)
			break;
		if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) {
			bios[i]->bi_rw = WRITE;
			bio_endio(bios[i], -EIO);
		} else {
			submit_bio(WRITE, bios[i]);
		}
	}

	drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev));

	/* always (try to) flush bitmap to stable storage */
	drbd_md_flush(mdev);

	/* In case we did not submit a single IO do not wait for
	 * them to complete. ( Because we would wait forever here. )
	 *
	 * In case we had IOs and they are already complete, there
	 * is no point in waiting anyway.
	 * Therefore this if () ... */
	if (atomic_read(&wc.count))
		wait_for_completion(&wc.io_done);

	put_ldev(mdev);

	kfree(bios);
	return;

free_bios_submit_one_by_one:
	/* free everything by calling the endio callback directly. */
	for (i = 0; i < nr_elements && bios[i]; i++)
		bio_endio(bios[i], 0);

	kfree(bios);

submit_one_by_one:
	dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n");

	for (i = 0; i < mdev->act_log->nr_elements; i++) {
		enr = lc_element_by_index(mdev->act_log, i)->lc_number;
		if (enr == LC_FREE)
			continue;
		/* Really slow: if we have al-extents 16..19 active,
		 * sector 4 will be written four times! Synchronous! */
		drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT);
	}

	lc_unlock(mdev->act_log);
	wake_up(&mdev->al_wait);
	put_ldev(mdev);
}

/**
 * drbd_al_apply_to_bm() - Sets the bitmap to dirty(1) where covered by active AL extents
 * @mdev:	DRBD device.
 */
void drbd_al_apply_to_bm(struct drbd_conf *mdev)
{
	unsigned int enr;
	unsigned long add = 0;
	char ppb[10];
	int i, tmp;

	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));

	for (i = 0; i < mdev->act_log->nr_elements; i++) {
		enr = lc_element_by_index(mdev->act_log, i)->lc_number;
		if (enr == LC_FREE)
			continue;
		tmp = drbd_bm_ALe_set_all(mdev, enr);
		dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr);
		add += tmp;
	}

	lc_unlock(mdev->act_log);
	wake_up(&mdev->al_wait);

	dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n",
	     ppsize(ppb, Bit2KB(add)));
}

static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
{
	int rv;

	spin_lock_irq(&mdev->al_lock);
	rv = (al_ext->refcnt == 0);
	if (likely(rv))
		lc_del(mdev->act_log, al_ext);
	spin_unlock_irq(&mdev->al_lock);

	return rv;
}

/**
 * drbd_al_shrink() - Removes all active extents from the activity log
 * @mdev:	DRBD device.
 *
 * Removes all active extents from the activity log, waiting until
 * the reference count of each entry dropped to 0 first, of course.
 *
 * You need to lock mdev->act_log with lc_try_lock() / lc_unlock()
 */
void drbd_al_shrink(struct drbd_conf *mdev)
{
	struct lc_element *al_ext;
	int i;

	D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags));

	for (i = 0; i < mdev->act_log->nr_elements; i++) {
		al_ext = lc_element_by_index(mdev->act_log, i);
		if (al_ext->lc_number == LC_FREE)
			continue;
		wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext));
	}

	wake_up(&mdev->al_wait);
}

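/* Worker callback: write out the on-disk bitmap sector of a resync extent
 * that just became clean, then check whether the resync as a whole has
 * finished. */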
static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);

	if (!get_ldev(mdev)) {
		if (__ratelimit(&drbd_ratelimit_state))
			dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n");
		kfree(udw);
		return 1;
	}

	drbd_bm_write_sect(mdev, udw->enr);
	put_ldev(mdev);

	kfree(udw);

	if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) {
		switch (mdev->state.conn) {
		case C_SYNC_SOURCE:  case C_SYNC_TARGET:
		case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
			drbd_resync_finished(mdev);
		default:
			/* nothing to do */
			break;
		}
	}
	drbd_bcast_sync_progress(mdev);

	return 1;
}


/* ATTENTION. The AL's extents are 4MB each, while the extents in the
 * resync LRU-cache are 16MB each.
 * The caller of this function has to hold a get_ldev() reference.
 *
 * TODO will be obsoleted once we have a caching lru of the on disk bitmap
 */
static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
				      int count, int success)
{
	struct lc_element *e;
	struct update_odbm_work *udw;

	unsigned int enr;

	D_ASSERT(atomic_read(&mdev->local_cnt));

	/* I simply assume that a sector/size pair never crosses
	 * a 16 MB extent border. (Currently this is true...) */
	enr = BM_SECT_TO_EXT(sector);

	e = lc_get(mdev->resync, enr);
	if (e) {
		struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
		if (ext->lce.lc_number == enr) {
			if (success)
				ext->rs_left -= count;
			else
				ext->rs_failed += count;
			if (ext->rs_left < ext->rs_failed) {
				dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
				    "rs_failed=%d count=%d\n",
				     (unsigned long long)sector,
				     ext->lce.lc_number, ext->rs_left,
				     ext->rs_failed, count);
				dump_stack();

				lc_put(mdev->resync, &ext->lce);
				drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
				return;
			}
		} else {
			/* Normally this element should be in the cache,
			 * since drbd_rs_begin_io() pulled it already in.
			 *
			 * But maybe an application write finished, and we set
			 * something outside the resync lru_cache in sync.
			 */
			int rs_left = drbd_bm_e_weight(mdev, enr);
			if (ext->flags != 0) {
				dev_warn(DEV, "changing resync lce: %d[%u;%02lx]"
				     " -> %d[%u;00]\n",
				     ext->lce.lc_number, ext->rs_left,
				     ext->flags, enr, rs_left);
				ext->flags = 0;
			}
			if (ext->rs_failed) {
				dev_warn(DEV, "Kicking resync_lru element enr=%u "
				     "out with rs_failed=%d\n",
				     ext->lce.lc_number, ext->rs_failed);
				set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
			}
			ext->rs_left = rs_left;
			ext->rs_failed = success ? 0 : count;
			lc_changed(mdev->resync, &ext->lce);
		}
		lc_put(mdev->resync, &ext->lce);
		/* no race, we are within the al_lock! */

		if (ext->rs_left == ext->rs_failed) {
			ext->rs_failed = 0;

			udw = kmalloc(sizeof(*udw), GFP_ATOMIC);
			if (udw) {
				udw->enr = ext->lce.lc_number;
				udw->w.cb = w_update_odbm;
				drbd_queue_work_front(&mdev->data.work, &udw->w);
			} else {
				dev_warn(DEV, "Could not kmalloc an udw\n");
				set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags);
			}
		}
	} else {
		dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n",
		    mdev->resync_locked,
		    mdev->resync->nr_elements,
		    mdev->resync->flags);
	}
}

971 */ 972 count = drbd_bm_clear_bits(mdev, sbnr, ebnr); 973 if (count && get_ldev(mdev)) { 974 unsigned long now = jiffies; 975 unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark]; 976 int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS; 977 if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) { 978 unsigned long tw = drbd_bm_total_weight(mdev); 979 if (mdev->rs_mark_left[mdev->rs_last_mark] != tw && 980 mdev->state.conn != C_PAUSED_SYNC_T && 981 mdev->state.conn != C_PAUSED_SYNC_S) { 982 mdev->rs_mark_time[next] = now; 983 mdev->rs_mark_left[next] = tw; 984 mdev->rs_last_mark = next; 985 } 986 } 987 spin_lock_irqsave(&mdev->al_lock, flags); 988 drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE); 989 spin_unlock_irqrestore(&mdev->al_lock, flags); 990 991 /* just wake_up unconditional now, various lc_chaged(), 992 * lc_put() in drbd_try_clear_on_disk_bm(). */ 993 wake_up = 1; 994 put_ldev(mdev); 995 } 996 if (wake_up) 997 wake_up(&mdev->al_wait); 998 } 999 1000 /* 1001 * this is intended to set one request worth of data out of sync. 1002 * affects at least 1 bit, 1003 * and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits. 1004 * 1005 * called by tl_clear and drbd_send_dblock (==drbd_make_request). 1006 * so this can be _any_ process. 1007 */ 1008 void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, 1009 const char *file, const unsigned int line) 1010 { 1011 unsigned long sbnr, ebnr, lbnr, flags; 1012 sector_t esector, nr_sectors; 1013 unsigned int enr, count; 1014 struct lc_element *e; 1015 1016 if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { 1017 dev_err(DEV, "sector: %llus, size: %d\n", 1018 (unsigned long long)sector, size); 1019 return; 1020 } 1021 1022 if (!get_ldev(mdev)) 1023 return; /* no disk, no metadata, no bitmap to set bits in */ 1024 1025 nr_sectors = drbd_get_capacity(mdev->this_bdev); 1026 esector = sector + (size >> 9) - 1; 1027 1028 ERR_IF(sector >= nr_sectors) 1029 goto out; 1030 ERR_IF(esector >= nr_sectors) 1031 esector = (nr_sectors-1); 1032 1033 lbnr = BM_SECT_TO_BIT(nr_sectors-1); 1034 1035 /* we set it out of sync, 1036 * we do not need to round anything here */ 1037 sbnr = BM_SECT_TO_BIT(sector); 1038 ebnr = BM_SECT_TO_BIT(esector); 1039 1040 /* ok, (capacity & 7) != 0 sometimes, but who cares... 1041 * we count rs_{total,left} in bits, not sectors. */ 1042 spin_lock_irqsave(&mdev->al_lock, flags); 1043 count = drbd_bm_set_bits(mdev, sbnr, ebnr); 1044 1045 enr = BM_SECT_TO_EXT(sector); 1046 e = lc_find(mdev->resync, enr); 1047 if (e) 1048 lc_entry(e, struct bm_extent, lce)->rs_left += count; 1049 spin_unlock_irqrestore(&mdev->al_lock, flags); 1050 1051 out: 1052 put_ldev(mdev); 1053 } 1054 1055 static 1056 struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr) 1057 { 1058 struct lc_element *e; 1059 struct bm_extent *bm_ext; 1060 int wakeup = 0; 1061 unsigned long rs_flags; 1062 1063 spin_lock_irq(&mdev->al_lock); 1064 if (mdev->resync_locked > mdev->resync->nr_elements/2) { 1065 spin_unlock_irq(&mdev->al_lock); 1066 return NULL; 1067 } 1068 e = lc_get(mdev->resync, enr); 1069 bm_ext = e ? 
static
struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr)
{
	struct lc_element *e;
	struct bm_extent *bm_ext;
	int wakeup = 0;
	unsigned long rs_flags;

	spin_lock_irq(&mdev->al_lock);
	if (mdev->resync_locked > mdev->resync->nr_elements/2) {
		spin_unlock_irq(&mdev->al_lock);
		return NULL;
	}
	e = lc_get(mdev->resync, enr);
	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
	if (bm_ext) {
		if (bm_ext->lce.lc_number != enr) {
			bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
			bm_ext->rs_failed = 0;
			lc_changed(mdev->resync, &bm_ext->lce);
			wakeup = 1;
		}
		if (bm_ext->lce.refcnt == 1)
			mdev->resync_locked++;
		set_bit(BME_NO_WRITES, &bm_ext->flags);
	}
	rs_flags = mdev->resync->flags;
	spin_unlock_irq(&mdev->al_lock);
	if (wakeup)
		wake_up(&mdev->al_wait);

	if (!bm_ext) {
		if (rs_flags & LC_STARVING)
			dev_warn(DEV, "Have to wait for element"
			     " (resync LRU too small?)\n");
		BUG_ON(rs_flags & LC_DIRTY);
	}

	return bm_ext;
}

static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
{
	struct lc_element *al_ext;
	int rv = 0;

	spin_lock_irq(&mdev->al_lock);
	if (unlikely(enr == mdev->act_log->new_number))
		rv = 1;
	else {
		al_ext = lc_find(mdev->act_log, enr);
		if (al_ext) {
			if (al_ext->refcnt)
				rv = 1;
		}
	}
	spin_unlock_irq(&mdev->al_lock);

	/*
	if (unlikely(rv)) {
		dev_info(DEV, "Delaying sync read until app's write is done\n");
	}
	*/
	return rv;
}

/**
 * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED
 * @mdev:	DRBD device.
 * @sector:	The sector number.
 *
 * This function sleeps on al_wait. Returns 0 on success, -EINTR if interrupted.
 */
int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
{
	unsigned int enr = BM_SECT_TO_EXT(sector);
	struct bm_extent *bm_ext;
	int i, sig;

	sig = wait_event_interruptible(mdev->al_wait,
			(bm_ext = _bme_get(mdev, enr)));
	if (sig)
		return -EINTR;

	if (test_bit(BME_LOCKED, &bm_ext->flags))
		return 0;

	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
		sig = wait_event_interruptible(mdev->al_wait,
				!_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i));
		if (sig) {
			spin_lock_irq(&mdev->al_lock);
			if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
				clear_bit(BME_NO_WRITES, &bm_ext->flags);
				mdev->resync_locked--;
				wake_up(&mdev->al_wait);
			}
			spin_unlock_irq(&mdev->al_lock);
			return -EINTR;
		}
	}
	set_bit(BME_LOCKED, &bm_ext->flags);
	return 0;
}

/**
 * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
 * @mdev:	DRBD device.
 * @sector:	The sector number.
 *
 * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
 * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
 * if there is still application IO going on in this area.
 */
int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
{
	unsigned int enr = BM_SECT_TO_EXT(sector);
	const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
	struct lc_element *e;
	struct bm_extent *bm_ext;
	int i;

	spin_lock_irq(&mdev->al_lock);
	if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) {
		/* in case you have very heavy scattered io, it may
		 * stall the syncer undefined if we give up the ref count
		 * when we try again and requeue.
		 *
		 * if we don't give up the refcount, but the next time
		 * we are scheduled this extent has been "synced" by new
		 * application writes, we'd miss the lc_put on the
		 * extent we keep the refcount on.
		 * so we remembered which extent we had to try again, and
		 * if the next requested one is something else, we do
		 * the lc_put here...
		 * we also have to wake_up
		 */
		e = lc_find(mdev->resync, mdev->resync_wenr);
		bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
		if (bm_ext) {
			D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
			D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
			clear_bit(BME_NO_WRITES, &bm_ext->flags);
			mdev->resync_wenr = LC_FREE;
			if (lc_put(mdev->resync, &bm_ext->lce) == 0)
				mdev->resync_locked--;
			wake_up(&mdev->al_wait);
		} else {
			dev_alert(DEV, "LOGIC BUG\n");
		}
	}
	/* TRY. */
	e = lc_try_get(mdev->resync, enr);
	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
	if (bm_ext) {
		if (test_bit(BME_LOCKED, &bm_ext->flags))
			goto proceed;
		if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
			mdev->resync_locked++;
		} else {
			/* we did set the BME_NO_WRITES,
			 * but then could not set BME_LOCKED,
			 * so we tried again.
			 * drop the extra reference. */
			bm_ext->lce.refcnt--;
			D_ASSERT(bm_ext->lce.refcnt > 0);
		}
		goto check_al;
	} else {
		/* do we rather want to try later? */
		if (mdev->resync_locked > mdev->resync->nr_elements-3)
			goto try_again;
		/* Do or do not. There is no try. -- Yoda */
		e = lc_get(mdev->resync, enr);
		bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
		if (!bm_ext) {
			const unsigned long rs_flags = mdev->resync->flags;
			if (rs_flags & LC_STARVING)
				dev_warn(DEV, "Have to wait for element"
				     " (resync LRU too small?)\n");
			BUG_ON(rs_flags & LC_DIRTY);
			goto try_again;
		}
		if (bm_ext->lce.lc_number != enr) {
			bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
			bm_ext->rs_failed = 0;
			lc_changed(mdev->resync, &bm_ext->lce);
			wake_up(&mdev->al_wait);
			D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);
		}
		set_bit(BME_NO_WRITES, &bm_ext->flags);
		D_ASSERT(bm_ext->lce.refcnt == 1);
		mdev->resync_locked++;
		goto check_al;
	}
check_al:
	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
		if (unlikely(al_enr+i == mdev->act_log->new_number))
			goto try_again;
		if (lc_is_used(mdev->act_log, al_enr+i))
			goto try_again;
	}
	set_bit(BME_LOCKED, &bm_ext->flags);
proceed:
	mdev->resync_wenr = LC_FREE;
	spin_unlock_irq(&mdev->al_lock);
	return 0;

try_again:
	if (bm_ext)
		mdev->resync_wenr = enr;
	spin_unlock_irq(&mdev->al_lock);
	return -EAGAIN;
}

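/**
 * drbd_rs_complete_io() - Drop the reference on an extent in the resync LRU cache
 * @mdev:	DRBD device.
 * @sector:	The sector number.
 *
 * Releases the reference taken by drbd_rs_begin_io() or drbd_try_rs_begin_io();
 * when the last reference is dropped, BME_LOCKED and BME_NO_WRITES are cleared
 * and waiters on al_wait are woken up.
 */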
void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector)
{
	unsigned int enr = BM_SECT_TO_EXT(sector);
	struct lc_element *e;
	struct bm_extent *bm_ext;
	unsigned long flags;

	spin_lock_irqsave(&mdev->al_lock, flags);
	e = lc_find(mdev->resync, enr);
	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
	if (!bm_ext) {
		spin_unlock_irqrestore(&mdev->al_lock, flags);
		if (__ratelimit(&drbd_ratelimit_state))
			dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n");
		return;
	}

	if (bm_ext->lce.refcnt == 0) {
		spin_unlock_irqrestore(&mdev->al_lock, flags);
		dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, "
		    "but refcnt is 0!?\n",
		    (unsigned long long)sector, enr);
		return;
	}

	if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
		clear_bit(BME_LOCKED, &bm_ext->flags);
		clear_bit(BME_NO_WRITES, &bm_ext->flags);
		mdev->resync_locked--;
		wake_up(&mdev->al_wait);
	}

	spin_unlock_irqrestore(&mdev->al_lock, flags);
}

/**
 * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)
 * @mdev:	DRBD device.
 */
void drbd_rs_cancel_all(struct drbd_conf *mdev)
{
	spin_lock_irq(&mdev->al_lock);

	if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */
		lc_reset(mdev->resync);
		put_ldev(mdev);
	}
	mdev->resync_locked = 0;
	mdev->resync_wenr = LC_FREE;
	spin_unlock_irq(&mdev->al_lock);
	wake_up(&mdev->al_wait);
}

/**
 * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU
 * @mdev:	DRBD device.
 *
 * Returns 0 upon success, -EAGAIN if at least one reference count was
 * not zero.
 */
int drbd_rs_del_all(struct drbd_conf *mdev)
{
	struct lc_element *e;
	struct bm_extent *bm_ext;
	int i;

	spin_lock_irq(&mdev->al_lock);

	if (get_ldev_if_state(mdev, D_FAILED)) {
		/* ok, ->resync is there. */
		for (i = 0; i < mdev->resync->nr_elements; i++) {
			e = lc_element_by_index(mdev->resync, i);
			bm_ext = lc_entry(e, struct bm_extent, lce);
			if (bm_ext->lce.lc_number == LC_FREE)
				continue;
			if (bm_ext->lce.lc_number == mdev->resync_wenr) {
				dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently"
				     " got 'synced' by application io\n",
				     mdev->resync_wenr);
				D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
				D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags));
				clear_bit(BME_NO_WRITES, &bm_ext->flags);
				mdev->resync_wenr = LC_FREE;
				lc_put(mdev->resync, &bm_ext->lce);
			}
			if (bm_ext->lce.refcnt != 0) {
				dev_info(DEV, "Retrying drbd_rs_del_all() later. "
				     "refcnt=%d\n", bm_ext->lce.refcnt);
				put_ldev(mdev);
				spin_unlock_irq(&mdev->al_lock);
				return -EAGAIN;
			}
			D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags));
			D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags));
			lc_del(mdev->resync, &bm_ext->lce);
		}
		D_ASSERT(mdev->resync->used == 0);
		put_ldev(mdev);
	}
	spin_unlock_irq(&mdev->al_lock);

	return 0;
}

/**
 * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
 * @mdev:	DRBD device.
 * @sector:	The sector number.
 * @size:	Size of failed IO operation, in bytes.
 */
void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size)
{
	/* Is called from worker and receiver context _only_ */
	unsigned long sbnr, ebnr, lbnr;
	unsigned long count;
	sector_t esector, nr_sectors;
	int wake_up = 0;

	if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) {
		dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
				(unsigned long long)sector, size);
		return;
	}
	nr_sectors = drbd_get_capacity(mdev->this_bdev);
	esector = sector + (size >> 9) - 1;

	ERR_IF(sector >= nr_sectors) return;
	ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);

	lbnr = BM_SECT_TO_BIT(nr_sectors-1);

	/*
	 * round up start sector, round down end sector.  we make sure we only
	 * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
	if (unlikely(esector < BM_SECT_PER_BIT-1))
		return;
	if (unlikely(esector == (nr_sectors-1)))
		ebnr = lbnr;
	else
		ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
	sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);

	if (sbnr > ebnr)
		return;

	/*
	 * ok, (capacity & 7) != 0 sometimes, but who cares...
	 * we count rs_{total,left} in bits, not sectors.
	 */
	spin_lock_irq(&mdev->al_lock);
	count = drbd_bm_count_bits(mdev, sbnr, ebnr);
	if (count) {
		mdev->rs_failed += count;

		if (get_ldev(mdev)) {
			drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE);
			put_ldev(mdev);
		}

		/* just wake_up unconditional now, various lc_changed(),
		 * lc_put() in drbd_try_clear_on_disk_bm(). */
		wake_up = 1;
	}
	spin_unlock_irq(&mdev->al_lock);
	if (wake_up)
		wake_up(&mdev->al_wait);
}