/*
   drbd_actlog.c

   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.

   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/slab.h>
#include <linux/crc32c.h>
#include <linux/drbd.h>
#include <linux/drbd_limits.h>
#include <linux/dynamic_debug.h>
#include "drbd_int.h"
#include "drbd_wrappers.h"


enum al_transaction_types {
	AL_TR_UPDATE = 0,
	AL_TR_INITIALIZED = 0xffff
};
/* all fields on disk in big endian */
struct __packed al_transaction_on_disk {
	/* don't we all like magic */
	__be32	magic;

	/* to identify the most recent transaction block
	 * in the on disk ring buffer */
	__be32	tr_number;

	/* checksum on the full 4k block, with this field set to 0. */
	__be32	crc32c;

	/* type of transaction, special transaction types like:
	 * purge-all, set-all-idle, set-all-active, ... to-be-defined
	 * see also enum al_transaction_types */
	__be16	transaction_type;

	/* we currently allow only a few thousand extents,
	 * so 16bit will be enough for the slot number. */

	/* how many updates in this transaction */
	__be16	n_updates;

	/* maximum slot number, "al-extents" in drbd.conf speak.
	 * Having this in each transaction should make reconfiguration
	 * of that parameter easier. */
	__be16	context_size;

	/* slot number the context starts with */
	__be16	context_start_slot_nr;

	/* Some reserved bytes.  Expected usage is a 64bit counter of
	 * sectors-written since device creation, and other data generation tag
	 * supporting usage */
	__be32	__reserved[4];

	/* --- 36 bytes used --- */

	/* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes
	 * in one transaction, then use the remaining bytes in the 4k block for
	 * context information.  A "flexible" number of updates per transaction
	 * does not help, as we have to account for the case when all update
	 * slots are used anyways, so it would only complicate code without
	 * additional benefit.
	 */
	__be16	update_slot_nr[AL_UPDATES_PER_TRANSACTION];

	/* but the extent number is 32bit, which at an extent size of 4 MiB
	 * allows covering device sizes of up to 2**54 bytes (16 PiB) */
	__be32	update_extent_nr[AL_UPDATES_PER_TRANSACTION];

	/* --- 420 bytes used (36 + 64*6) --- */

	/* 4096 - 420 = 3676 = 919 * 4 */
	__be32	context[AL_CONTEXT_PER_TRANSACTION];
};
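/* For orientation, the byte accounting implied by the sizes above: the
 * 36 byte header plus 64 * (2 + 4) bytes of update slot/extent pairs
 * uses 420 bytes, leaving 4096 - 420 = 3676 bytes = 919 context slots
 * per 4k transaction block.  On recovery, presumably the block whose
 * magic and crc32c check out and that carries the highest tr_number
 * (modulo ring buffer wrap) is the most recent transaction.
 */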
struct update_odbm_work {
	struct drbd_work w;
	struct drbd_device *device;
	unsigned int enr;
};

struct update_al_work {
	struct drbd_work w;
	struct drbd_device *device;
	struct completion event;
	int err;
};


void *drbd_md_get_buffer(struct drbd_device *device)
{
	int r;

	wait_event(device->misc_wait,
		   (r = atomic_cmpxchg(&device->md_io_in_use, 0, 1)) == 0 ||
		   device->state.disk <= D_FAILED);

	return r ? NULL : page_address(device->md_io_page);
}

void drbd_md_put_buffer(struct drbd_device *device)
{
	if (atomic_dec_and_test(&device->md_io_in_use))
		wake_up(&device->misc_wait);
}

void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_backing_dev *bdev,
				       unsigned int *done)
{
	long dt;

	rcu_read_lock();
	dt = rcu_dereference(bdev->disk_conf)->disk_timeout;
	rcu_read_unlock();
	dt = dt * HZ / 10;
	if (dt == 0)
		dt = MAX_SCHEDULE_TIMEOUT;

	dt = wait_event_timeout(device->misc_wait,
			*done || test_bit(FORCE_DETACH, &device->flags), dt);
	if (dt == 0) {
		drbd_err(device, "meta-data IO operation timed out\n");
		drbd_chk_io_error(device, 1, DRBD_FORCE_DETACH);
	}
}
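/* Typical usage of the single md_io buffer, sketched for illustration
 * (compare _al_write_transaction() below, which follows this pattern):
 *
 *	buffer = drbd_md_get_buffer(device);
 *	if (!buffer)
 *		return -ENODEV;		// disk failed while we waited
 *	...fill the aligned 4k buffer, write it out synchronously...
 *	drbd_md_put_buffer(device);	// wakes the next waiter
 */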
static int _drbd_md_sync_page_io(struct drbd_device *device,
				 struct drbd_backing_dev *bdev,
				 struct page *page, sector_t sector,
				 int rw, int size)
{
	struct bio *bio;
	int err;

	device->md_io.done = 0;
	device->md_io.error = -ENODEV;

	if ((rw & WRITE) && !test_bit(MD_NO_FUA, &device->flags))
		rw |= REQ_FUA | REQ_FLUSH;
	rw |= REQ_SYNC;

	bio = bio_alloc_drbd(GFP_NOIO);
	bio->bi_bdev = bdev->md_bdev;
	bio->bi_iter.bi_sector = sector;
	err = -EIO;
	if (bio_add_page(bio, page, size, 0) != size)
		goto out;
	bio->bi_private = &device->md_io;
	bio->bi_end_io = drbd_md_io_complete;
	bio->bi_rw = rw;

	if (!(rw & WRITE) && device->state.disk == D_DISKLESS && device->ldev == NULL)
		/* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */
		;
	else if (!get_ldev_if_state(device, D_ATTACHING)) {
		/* Corresponding put_ldev in drbd_md_io_complete() */
		drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
		err = -ENODEV;
		goto out;
	}

	bio_get(bio); /* one bio_put() is in the completion handler */
	atomic_inc(&device->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */
	if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
		bio_endio(bio, -EIO);
	else
		submit_bio(rw, bio);
	wait_until_done_or_force_detached(device, bdev, &device->md_io.done);
	if (bio_flagged(bio, BIO_UPTODATE))
		err = device->md_io.error;

 out:
	bio_put(bio);
	return err;
}

int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev,
			 sector_t sector, int rw)
{
	int err;
	struct page *iop = device->md_io_page;

	D_ASSERT(device, atomic_read(&device->md_io_in_use) == 1);

	BUG_ON(!bdev->md_bdev);

	drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
	     current->comm, current->pid, __func__,
	     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ",
	     (void*)_RET_IP_ );

	if (sector < drbd_md_first_sector(bdev) ||
	    sector + 7 > drbd_md_last_sector(bdev))
		drbd_alert(device, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
		     current->comm, current->pid, __func__,
		     (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");

	/* we do all our meta data IO in aligned 4k blocks. */
	err = _drbd_md_sync_page_io(device, bdev, iop, sector, rw, 4096);
	if (err) {
		drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
		    (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
	}
	return err;
}

static struct bm_extent *find_active_resync_extent(struct drbd_device *device, unsigned int enr)
{
	struct lc_element *tmp;
	tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT);
	if (unlikely(tmp != NULL)) {
		struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
		if (test_bit(BME_NO_WRITES, &bm_ext->flags))
			return bm_ext;
	}
	return NULL;
}

static struct lc_element *_al_get(struct drbd_device *device, unsigned int enr, bool nonblock)
{
	struct lc_element *al_ext;
	struct bm_extent *bm_ext;
	int wake;

	spin_lock_irq(&device->al_lock);
	bm_ext = find_active_resync_extent(device, enr);
	if (bm_ext) {
		wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
		spin_unlock_irq(&device->al_lock);
		if (wake)
			wake_up(&device->al_wait);
		return NULL;
	}
	if (nonblock)
		al_ext = lc_try_get(device->act_log, enr);
	else
		al_ext = lc_get(device->act_log, enr);
	spin_unlock_irq(&device->al_lock);
	return al_ext;
}

bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i)
{
	/* for bios crossing activity log extent boundaries,
	 * we may need to activate two extents in one go */
	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);

	D_ASSERT(device, (unsigned)(last - first) <= 1);
	D_ASSERT(device, atomic_read(&device->local_cnt) > 0);

	/* FIXME figure out a fast path for bios crossing AL extent boundaries */
	if (first != last)
		return false;

	return _al_get(device, first, true);
}
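/* Worked example for the extent arithmetic above, assuming
 * AL_EXTENT_SHIFT == 22 (4 MiB extents, as the format comments in this
 * file suggest): a 4 KiB request at sector 8191 yields
 *	first = 8191 >> 13 = 0
 *	last  = (8191 + 8 - 1) >> 13 = 1
 * so it straddles an extent boundary and the fast path refuses it.
 */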
static
bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i)
{
	/* for bios crossing activity log extent boundaries,
	 * we may need to activate two extents in one go */
	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
	unsigned enr;
	bool need_transaction = false;

	D_ASSERT(device, first <= last);
	D_ASSERT(device, atomic_read(&device->local_cnt) > 0);

	for (enr = first; enr <= last; enr++) {
		struct lc_element *al_ext;
		wait_event(device->al_wait,
				(al_ext = _al_get(device, enr, false)) != NULL);
		if (al_ext->lc_number != enr)
			need_transaction = true;
	}
	return need_transaction;
}

static int al_write_transaction(struct drbd_device *device, bool delegate);

/* When called through generic_make_request(), we must delegate
 * activity log I/O to the worker thread: a further request
 * submitted via generic_make_request() within the same task
 * would be queued on current->bio_list, and would only start
 * after this function returns (see generic_make_request()).
 *
 * However, if we *are* the worker, we must not delegate to ourselves.
 */

/*
 * @delegate:	delegate activity log I/O to the worker thread
 */
void drbd_al_begin_io_commit(struct drbd_device *device, bool delegate)
{
	bool locked = false;

	BUG_ON(delegate && current == first_peer_device(device)->connection->worker.task);

	/* Serialize multiple transactions.
	 * This uses test_and_set_bit, memory barrier is implicit.
	 */
	wait_event(device->al_wait,
			device->act_log->pending_changes == 0 ||
			(locked = lc_try_lock_for_transaction(device->act_log)));

	if (locked) {
		/* Double check: it may have been committed by someone else,
		 * while we have been waiting for the lock. */
		if (device->act_log->pending_changes) {
			bool write_al_updates;

			rcu_read_lock();
			write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
			rcu_read_unlock();

			if (write_al_updates)
				al_write_transaction(device, delegate);
			spin_lock_irq(&device->al_lock);
			/* FIXME
			if (err)
				we need an "lc_cancel" here;
			*/
			lc_committed(device->act_log);
			spin_unlock_irq(&device->al_lock);
		}
		lc_unlock(device->act_log);
		wake_up(&device->al_wait);
	}
}

/*
 * @delegate:	delegate activity log I/O to the worker thread
 */
void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i, bool delegate)
{
	BUG_ON(delegate && current == first_peer_device(device)->connection->worker.task);

	if (drbd_al_begin_io_prepare(device, i))
		drbd_al_begin_io_commit(device, delegate);
}
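/* Caller pattern, sketched for illustration (the lock-free path is
 * tried first; "req->i" here is hypothetical shorthand for the
 * request's struct drbd_interval, not a name defined in this file):
 *
 *	if (!drbd_al_begin_io_fastpath(device, &req->i))
 *		drbd_al_begin_io(device, &req->i, true); // delegate to worker
 */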
int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i)
{
	struct lru_cache *al = device->act_log;
	/* for bios crossing activity log extent boundaries,
	 * we may need to activate two extents in one go */
	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
	unsigned nr_al_extents;
	unsigned available_update_slots;
	unsigned enr;

	D_ASSERT(device, first <= last);

	nr_al_extents = 1 + last - first; /* worst case: all touched extents are cold. */
	available_update_slots = min(al->nr_elements - al->used,
				al->max_pending_changes - al->pending_changes);

	/* We want all necessary updates for a given request within the same transaction.
	 * We could first check how many updates are *actually* needed,
	 * and use that instead of the worst-case nr_al_extents */
	if (available_update_slots < nr_al_extents)
		return -EWOULDBLOCK;

	/* Is resync active in this area? */
	for (enr = first; enr <= last; enr++) {
		struct lc_element *tmp;
		tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT);
		if (unlikely(tmp != NULL)) {
			struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
			if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
				if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags))
					return -EBUSY;
				return -EWOULDBLOCK;
			}
		}
	}

	/* Check out the refcounts.
	 * Given that we checked for available elements and update slots above,
	 * this has to be successful. */
	for (enr = first; enr <= last; enr++) {
		struct lc_element *al_ext;
		al_ext = lc_get_cumulative(device->act_log, enr);
		if (!al_ext)
			drbd_info(device, "LOGIC BUG for enr=%u\n", enr);
	}
	return 0;
}

void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i)
{
	/* for bios crossing activity log extent boundaries,
	 * we may need to activate two extents in one go */
	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
	unsigned enr;
	struct lc_element *extent;
	unsigned long flags;

	D_ASSERT(device, first <= last);
	spin_lock_irqsave(&device->al_lock, flags);

	for (enr = first; enr <= last; enr++) {
		extent = lc_find(device->act_log, enr);
		if (!extent) {
			drbd_err(device, "al_complete_io() called on inactive extent %u\n", enr);
			continue;
		}
		lc_put(device->act_log, extent);
	}
	spin_unlock_irqrestore(&device->al_lock, flags);
	wake_up(&device->al_wait);
}

#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
 * are still coupled, or assume too much about their relation.
 * Code below will not work if this is violated.
 * Will be cleaned up with some followup patch.
 */
# error FIXME
#endif

static unsigned int al_extent_to_bm_page(unsigned int al_enr)
{
	return al_enr >>
		/* bit to page */
		((PAGE_SHIFT + 3) -
		/* al extent number to bit */
		 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
}

static unsigned int rs_extent_to_bm_page(unsigned int rs_enr)
{
	return rs_enr >>
		/* bit to page */
		((PAGE_SHIFT + 3) -
		/* resync extent number to bit */
		 (BM_EXT_SHIFT - BM_BLOCK_SHIFT));
}

static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
{
	const unsigned int stripes = device->ldev->md.al_stripes;
	const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k;

	/* transaction number, modulo on-disk ring buffer wrap around */
	unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k);

	/* ... to aligned 4k on disk block */
	t = ((t % stripes) * stripe_size_4kB) + t/stripes;

	/* ... to 512 byte sector in activity log */
	t *= 8;

	/* ... plus offset to the on disk position */
	return device->ldev->md.md_offset + device->ldev->md.al_offset + t;
}
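/* Worked example of the striping above (numbers chosen for
 * illustration, not defaults): with al_stripes == 4 and
 * al_stripe_size_4k == 8, the ring buffer holds 32 4k blocks, and
 * transaction number 13 maps to
 *	t = 13 % 32 = 13
 *	t = (13 % 4) * 8 + 13 / 4 = 8 + 3 = 11	(4k block within the AL)
 *	t * 8 = 88				(512 byte sectors)
 * which is then offset by md_offset + al_offset.
 */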
static int
_al_write_transaction(struct drbd_device *device)
{
	struct al_transaction_on_disk *buffer;
	struct lc_element *e;
	sector_t sector;
	int i, mx;
	unsigned extent_nr;
	unsigned crc = 0;
	int err = 0;

	if (!get_ldev(device)) {
		drbd_err(device, "disk is %s, cannot start al transaction\n",
			drbd_disk_str(device->state.disk));
		return -EIO;
	}

	/* The bitmap write may have failed, causing a state change. */
	if (device->state.disk < D_INCONSISTENT) {
		drbd_err(device,
			"disk is %s, cannot write al transaction\n",
			drbd_disk_str(device->state.disk));
		put_ldev(device);
		return -EIO;
	}

	buffer = drbd_md_get_buffer(device); /* protects md_io_buffer, al_tr_cycle, ... */
	if (!buffer) {
		drbd_err(device, "disk failed while waiting for md_io buffer\n");
		put_ldev(device);
		return -ENODEV;
	}

	memset(buffer, 0, sizeof(*buffer));
	buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
	buffer->tr_number = cpu_to_be32(device->al_tr_number);

	i = 0;

	/* Even though no one can start to change this list
	 * once we set the LC_LOCKED -- from drbd_al_begin_io(),
	 * lc_try_lock_for_transaction() --, someone may still
	 * be in the process of changing it. */
	spin_lock_irq(&device->al_lock);
	list_for_each_entry(e, &device->act_log->to_be_changed, list) {
		if (i == AL_UPDATES_PER_TRANSACTION) {
			i++;
			break;
		}
		buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
		buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
		if (e->lc_number != LC_FREE)
			drbd_bm_mark_for_writeout(device,
					al_extent_to_bm_page(e->lc_number));
		i++;
	}
	spin_unlock_irq(&device->al_lock);
	BUG_ON(i > AL_UPDATES_PER_TRANSACTION);

	buffer->n_updates = cpu_to_be16(i);
	for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {
		buffer->update_slot_nr[i] = cpu_to_be16(-1);
		buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
	}

	buffer->context_size = cpu_to_be16(device->act_log->nr_elements);
	buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle);

	mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
		   device->act_log->nr_elements - device->al_tr_cycle);
	for (i = 0; i < mx; i++) {
		unsigned idx = device->al_tr_cycle + i;
		extent_nr = lc_element_by_index(device->act_log, idx)->lc_number;
		buffer->context[i] = cpu_to_be32(extent_nr);
	}
	for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
		buffer->context[i] = cpu_to_be32(LC_FREE);

	device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;
	if (device->al_tr_cycle >= device->act_log->nr_elements)
		device->al_tr_cycle = 0;

	sector = al_tr_number_to_on_disk_sector(device);

	crc = crc32c(0, buffer, 4096);
	buffer->crc32c = cpu_to_be32(crc);

	if (drbd_bm_write_hinted(device))
		err = -EIO;
	else {
		bool write_al_updates;
		rcu_read_lock();
		write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
		rcu_read_unlock();
		if (write_al_updates) {
			if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) {
				err = -EIO;
				drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
			} else {
				device->al_tr_number++;
				device->al_writ_cnt++;
			}
		}
	}

	drbd_md_put_buffer(device);
	put_ldev(device);

	return err;
}
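/* Note on the checksum above: the crc is computed while the crc32c
 * field is still zero from the memset(), matching the on-disk format
 * comment ("with this field set to 0").  A reader would verify the
 * same way, sketched:
 *
 *	__be32 on_disk = buffer->crc32c;
 *	buffer->crc32c = 0;
 *	valid = on_disk == cpu_to_be32(crc32c(0, buffer, 4096));
 */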
static int
w_al_write_transaction(struct drbd_work *w, int unused)
{
	struct update_al_work *aw = container_of(w, struct update_al_work, w);
	struct drbd_device *device = aw->device;
	int err;

	err = _al_write_transaction(device);
	aw->err = err;
	complete(&aw->event);

	return err != -EIO ? err : 0;
}

/* Calls from worker context (see w_restart_disk_io()) need to write the
   transaction directly.  Others come through generic_make_request();
   those need to delegate it to the worker. */
static int al_write_transaction(struct drbd_device *device, bool delegate)
{
	if (delegate) {
		struct update_al_work al_work;
		init_completion(&al_work.event);
		al_work.w.cb = w_al_write_transaction;
		al_work.device = device;
		drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
				      &al_work.w);
		wait_for_completion(&al_work.event);
		return al_work.err;
	} else
		return _al_write_transaction(device);
}

static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext)
{
	int rv;

	spin_lock_irq(&device->al_lock);
	rv = (al_ext->refcnt == 0);
	if (likely(rv))
		lc_del(device->act_log, al_ext);
	spin_unlock_irq(&device->al_lock);

	return rv;
}

/**
 * drbd_al_shrink() - Removes all active extents from the activity log
 * @device:	DRBD device.
 *
 * Removes all active extents from the activity log, waiting until
 * the reference count of each entry has dropped to 0 first, of course.
 *
 * You need to lock device->act_log with lc_try_lock() / lc_unlock()
 */
void drbd_al_shrink(struct drbd_device *device)
{
	struct lc_element *al_ext;
	int i;

	D_ASSERT(device, test_bit(__LC_LOCKED, &device->act_log->flags));

	for (i = 0; i < device->act_log->nr_elements; i++) {
		al_ext = lc_element_by_index(device->act_log, i);
		if (al_ext->lc_number == LC_FREE)
			continue;
		wait_event(device->al_wait, _try_lc_del(device, al_ext));
	}

	wake_up(&device->al_wait);
}

int drbd_initialize_al(struct drbd_device *device, void *buffer)
{
	struct al_transaction_on_disk *al = buffer;
	struct drbd_md *md = &device->ldev->md;
	sector_t al_base = md->md_offset + md->al_offset;
	int al_size_4k = md->al_stripes * md->al_stripe_size_4k;
	int i;

	memset(al, 0, 4096);
	al->magic = cpu_to_be32(DRBD_AL_MAGIC);
	al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED);
	al->crc32c = cpu_to_be32(crc32c(0, al, 4096));

	for (i = 0; i < al_size_4k; i++) {
		int err = drbd_md_sync_page_io(device, device->ldev, al_base + i * 8, WRITE);
		if (err)
			return err;
	}
	return 0;
}
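/* Note for the initialization above: each 4k AL block occupies eight
 * 512 byte sectors, hence the i * 8 stride; every block of a freshly
 * created activity log is written as an AL_TR_INITIALIZED transaction
 * so that it carries a valid magic and crc32c from the start.
 */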
static int w_update_odbm(struct drbd_work *w, int unused)
{
	struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
	struct drbd_device *device = udw->device;
	struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };

	if (!get_ldev(device)) {
		if (__ratelimit(&drbd_ratelimit_state))
			drbd_warn(device, "Can not update on disk bitmap, local IO disabled.\n");
		kfree(udw);
		return 0;
	}

	drbd_bm_write_page(device, rs_extent_to_bm_page(udw->enr));
	put_ldev(device);

	kfree(udw);

	if (drbd_bm_total_weight(device) <= device->rs_failed) {
		switch (device->state.conn) {
		case C_SYNC_SOURCE:  case C_SYNC_TARGET:
		case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T:
			drbd_resync_finished(device);
		default:
			/* nothing to do */
			break;
		}
	}
	drbd_bcast_event(device, &sib);

	return 0;
}


/* ATTENTION. The AL's extents are 4MB each, while the extents in the
 * resync LRU-cache are 16MB each.
 * The caller of this function has to hold a get_ldev() reference.
 *
 * TODO will be obsoleted once we have a caching lru of the on disk bitmap
 */
static void drbd_try_clear_on_disk_bm(struct drbd_device *device, sector_t sector,
				      int count, int success)
{
	struct lc_element *e;
	struct update_odbm_work *udw;

	unsigned int enr;

	D_ASSERT(device, atomic_read(&device->local_cnt));

	/* I simply assume that a sector/size pair never crosses
	 * a 16 MB extent border. (Currently this is true...) */
	enr = BM_SECT_TO_EXT(sector);

	e = lc_get(device->resync, enr);
	if (e) {
		struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
		if (ext->lce.lc_number == enr) {
			if (success)
				ext->rs_left -= count;
			else
				ext->rs_failed += count;
			if (ext->rs_left < ext->rs_failed) {
				drbd_warn(device, "BAD! sector=%llus enr=%u rs_left=%d "
				    "rs_failed=%d count=%d cstate=%s\n",
				     (unsigned long long)sector,
				     ext->lce.lc_number, ext->rs_left,
				     ext->rs_failed, count,
				     drbd_conn_str(device->state.conn));

				/* We don't expect to be able to clear more bits
				 * than have been set when we originally counted
				 * the set bits to cache that value in ext->rs_left.
				 * Whatever the reason (disconnect during resync,
				 * delayed local completion of an application write),
				 * try to fix it up by recounting here. */
				ext->rs_left = drbd_bm_e_weight(device, enr);
			}
		} else {
			/* Normally this element should be in the cache,
			 * since drbd_rs_begin_io() pulled it already in.
			 *
			 * But maybe an application write finished, and we set
			 * something outside the resync lru_cache in sync.
			 */
			int rs_left = drbd_bm_e_weight(device, enr);
			if (ext->flags != 0) {
				drbd_warn(device, "changing resync lce: %d[%u;%02lx]"
				     " -> %d[%u;00]\n",
				     ext->lce.lc_number, ext->rs_left,
				     ext->flags, enr, rs_left);
				ext->flags = 0;
			}
			if (ext->rs_failed) {
				drbd_warn(device, "Kicking resync_lru element enr=%u "
				     "out with rs_failed=%d\n",
				     ext->lce.lc_number, ext->rs_failed);
			}
			ext->rs_left = rs_left;
			ext->rs_failed = success ? 0 : count;
			/* we don't keep a persistent log of the resync lru,
			 * we can commit any change right away. */
			lc_committed(device->resync);
		}
		lc_put(device->resync, &ext->lce);
		/* no race, we are within the al_lock! */

		if (ext->rs_left == ext->rs_failed) {
			ext->rs_failed = 0;

			udw = kmalloc(sizeof(*udw), GFP_ATOMIC);
			if (udw) {
				udw->enr = ext->lce.lc_number;
				udw->w.cb = w_update_odbm;
				udw->device = device;
				drbd_queue_work_front(&first_peer_device(device)->connection->sender_work,
						      &udw->w);
			} else {
				drbd_warn(device, "Could not kmalloc a udw\n");
			}
		}
	} else {
		drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n",
		    device->resync_locked,
		    device->resync->nr_elements,
		    device->resync->flags);
	}
}
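/* Illustration of the two granularities quoted above: with 4 MiB AL
 * extents and 16 MiB resync extents, AL extent numbers 0..3 all map to
 * resync extent 0, so the enr/AL_EXT_PER_BM_SECT conversions elsewhere
 * in this file effectively divide by 4.
 */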
void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go)
{
	unsigned long now = jiffies;
	unsigned long last = device->rs_mark_time[device->rs_last_mark];
	int next = (device->rs_last_mark + 1) % DRBD_SYNC_MARKS;
	if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
		if (device->rs_mark_left[device->rs_last_mark] != still_to_go &&
		    device->state.conn != C_PAUSED_SYNC_T &&
		    device->state.conn != C_PAUSED_SYNC_S) {
			device->rs_mark_time[next] = now;
			device->rs_mark_left[next] = still_to_go;
			device->rs_last_mark = next;
		}
	}
}

/* clear the bit corresponding to the piece of storage in question:
 * size bytes of data starting from sector.  Only clear bits of the
 * affected one or more _aligned_ BM_BLOCK_SIZE blocks.
 *
 * called by worker on C_SYNC_TARGET and receiver on SyncSource.
 *
 */
void __drbd_set_in_sync(struct drbd_device *device, sector_t sector, int size,
		       const char *file, const unsigned int line)
{
	/* Is called from worker and receiver context _only_ */
	unsigned long sbnr, ebnr, lbnr;
	unsigned long count = 0;
	sector_t esector, nr_sectors;
	int wake_up = 0;
	unsigned long flags;

	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
		drbd_err(device, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
				(unsigned long long)sector, size);
		return;
	}

	if (!get_ldev(device))
		return; /* no disk, no metadata, no bitmap to clear bits in */

	nr_sectors = drbd_get_capacity(device->this_bdev);
	esector = sector + (size >> 9) - 1;

	if (!expect(sector < nr_sectors))
		goto out;
	if (!expect(esector < nr_sectors))
		esector = nr_sectors - 1;

	lbnr = BM_SECT_TO_BIT(nr_sectors-1);

	/* we clear it (in sync).
	 * round up start sector, round down end sector.  we make sure we only
	 * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
	if (unlikely(esector < BM_SECT_PER_BIT-1))
		goto out;
	if (unlikely(esector == (nr_sectors-1)))
		ebnr = lbnr;
	else
		ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
	sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);

	if (sbnr > ebnr)
		goto out;

	/*
	 * ok, (capacity & 7) != 0 sometimes, but who cares...
	 * we count rs_{total,left} in bits, not sectors.
	 */
	count = drbd_bm_clear_bits(device, sbnr, ebnr);
	if (count) {
		drbd_advance_rs_marks(device, drbd_bm_total_weight(device));
		spin_lock_irqsave(&device->al_lock, flags);
		drbd_try_clear_on_disk_bm(device, sector, count, true);
		spin_unlock_irqrestore(&device->al_lock, flags);

		/* just wake_up unconditional now, various lc_changed(),
		 * lc_put() in drbd_try_clear_on_disk_bm(). */
		wake_up = 1;
	}
out:
	put_ldev(device);
	if (wake_up)
		wake_up(&device->al_wait);
}
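/* Rounding example for the above (BM_BLOCK_SIZE 4K, so BM_SECT_PER_BIT
 * is 8): an interval covering sectors 3..18 fully contains only bitmap
 * bit 1 (sectors 8..15); sbnr = (3+7)>>3 = 1 and ebnr = (18-7)>>3 = 1,
 * and the partially covered blocks at either end stay out of sync.
 */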
/*
 * this is intended to set one request worth of data out of sync.
 * affects at least 1 bit,
 * and at most 1+DRBD_MAX_BIO_SIZE/BM_BLOCK_SIZE bits.
 *
 * called by tl_clear and drbd_send_dblock (==drbd_make_request).
 * so this can be _any_ process.
 */
int __drbd_set_out_of_sync(struct drbd_device *device, sector_t sector, int size,
			    const char *file, const unsigned int line)
{
	unsigned long sbnr, ebnr, flags;
	sector_t esector, nr_sectors;
	unsigned int enr, count = 0;
	struct lc_element *e;

	/* this should be an empty REQ_FLUSH */
	if (size == 0)
		return 0;

	if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
		drbd_err(device, "sector: %llus, size: %d\n",
			(unsigned long long)sector, size);
		return 0;
	}

	if (!get_ldev(device))
		return 0; /* no disk, no metadata, no bitmap to set bits in */

	nr_sectors = drbd_get_capacity(device->this_bdev);
	esector = sector + (size >> 9) - 1;

	if (!expect(sector < nr_sectors))
		goto out;
	if (!expect(esector < nr_sectors))
		esector = nr_sectors - 1;

	/* we set it out of sync,
	 * we do not need to round anything here */
	sbnr = BM_SECT_TO_BIT(sector);
	ebnr = BM_SECT_TO_BIT(esector);

	/* ok, (capacity & 7) != 0 sometimes, but who cares...
	 * we count rs_{total,left} in bits, not sectors. */
	spin_lock_irqsave(&device->al_lock, flags);
	count = drbd_bm_set_bits(device, sbnr, ebnr);

	enr = BM_SECT_TO_EXT(sector);
	e = lc_find(device->resync, enr);
	if (e)
		lc_entry(e, struct bm_extent, lce)->rs_left += count;
	spin_unlock_irqrestore(&device->al_lock, flags);

out:
	put_ldev(device);

	return count;
}

static
struct bm_extent *_bme_get(struct drbd_device *device, unsigned int enr)
{
	struct lc_element *e;
	struct bm_extent *bm_ext;
	int wakeup = 0;
	unsigned long rs_flags;

	spin_lock_irq(&device->al_lock);
	if (device->resync_locked > device->resync->nr_elements/2) {
		spin_unlock_irq(&device->al_lock);
		return NULL;
	}
	e = lc_get(device->resync, enr);
	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
	if (bm_ext) {
		if (bm_ext->lce.lc_number != enr) {
			bm_ext->rs_left = drbd_bm_e_weight(device, enr);
			bm_ext->rs_failed = 0;
			lc_committed(device->resync);
			wakeup = 1;
		}
		if (bm_ext->lce.refcnt == 1)
			device->resync_locked++;
		set_bit(BME_NO_WRITES, &bm_ext->flags);
	}
	rs_flags = device->resync->flags;
	spin_unlock_irq(&device->al_lock);
	if (wakeup)
		wake_up(&device->al_wait);

	if (!bm_ext) {
		if (rs_flags & LC_STARVING)
			drbd_warn(device, "Have to wait for element"
			     " (resync LRU too small?)\n");
		BUG_ON(rs_flags & LC_LOCKED);
	}

	return bm_ext;
}

static int _is_in_al(struct drbd_device *device, unsigned int enr)
{
	int rv;

	spin_lock_irq(&device->al_lock);
	rv = lc_is_used(device->act_log, enr);
	spin_unlock_irq(&device->al_lock);

	return rv;
}
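/* Lifecycle of a resync extent's flags, sketched for orientation:
 *	_bme_get()/lc_get()	-> BME_NO_WRITES set, refcnt held
 *	app-IO drained		-> BME_LOCKED set, resync may write
 *	lc_put() drops to 0	-> flags cleared, resync_locked--
 * BME_PRIORITY is set by application IO that wants the extent back
 * (see _al_get() above), making resync step aside in
 * drbd_rs_begin_io() below.
 */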
/**
 * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED
 * @device:	DRBD device.
 * @sector:	The sector number.
 *
 * This function sleeps on al_wait.  Returns 0 on success, -EINTR if interrupted.
 */
int drbd_rs_begin_io(struct drbd_device *device, sector_t sector)
{
	unsigned int enr = BM_SECT_TO_EXT(sector);
	struct bm_extent *bm_ext;
	int i, sig;
	int sa = 200; /* Step aside 200 times, then grab the extent and let app-IO wait.
			 200 times -> 20 seconds. */

retry:
	sig = wait_event_interruptible(device->al_wait,
			(bm_ext = _bme_get(device, enr)));
	if (sig)
		return -EINTR;

	if (test_bit(BME_LOCKED, &bm_ext->flags))
		return 0;

	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
		sig = wait_event_interruptible(device->al_wait,
					       !_is_in_al(device, enr * AL_EXT_PER_BM_SECT + i) ||
					       test_bit(BME_PRIORITY, &bm_ext->flags));

		if (sig || (test_bit(BME_PRIORITY, &bm_ext->flags) && sa)) {
			spin_lock_irq(&device->al_lock);
			if (lc_put(device->resync, &bm_ext->lce) == 0) {
				bm_ext->flags = 0; /* clears BME_NO_WRITES and possibly BME_PRIORITY */
				device->resync_locked--;
				wake_up(&device->al_wait);
			}
			spin_unlock_irq(&device->al_lock);
			if (sig)
				return -EINTR;
			if (schedule_timeout_interruptible(HZ/10))
				return -EINTR;
			if (sa && --sa == 0)
				drbd_warn(device, "drbd_rs_begin_io() stepped aside for 20sec. "
					 "Resync stalled?\n");
			goto retry;
		}
	}
	set_bit(BME_LOCKED, &bm_ext->flags);
	return 0;
}
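/* Timing of the step-aside above: each pass sleeps HZ/10 jiffies
 * (100ms) and sa starts at 200, so the total back-off before resync
 * grabs the extent anyway and application IO has to wait is bounded at
 * roughly 20 seconds, matching the comment at the top of the function.
 */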
/**
 * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
 * @device:	DRBD device.
 * @sector:	The sector number.
 *
 * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
 * tries to set it to BME_LOCKED.  Returns 0 upon success, and -EAGAIN
 * if there is still application IO going on in this area.
 */
int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector)
{
	unsigned int enr = BM_SECT_TO_EXT(sector);
	const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
	struct lc_element *e;
	struct bm_extent *bm_ext;
	int i;

	spin_lock_irq(&device->al_lock);
	if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) {
		/* in case you have very heavy scattered io, it may
		 * stall the syncer for an undefined time if we give up
		 * the ref count when we try again and requeue.
		 *
		 * if we don't give up the refcount, but the next time
		 * we are scheduled this extent has been "synced" by new
		 * application writes, we'd miss the lc_put on the
		 * extent we keep the refcount on.
		 * so we remembered which extent we had to try again, and
		 * if the next requested one is something else, we do
		 * the lc_put here...
		 * we also have to wake_up
		 */
		e = lc_find(device->resync, device->resync_wenr);
		bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
		if (bm_ext) {
			D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
			D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
			clear_bit(BME_NO_WRITES, &bm_ext->flags);
			device->resync_wenr = LC_FREE;
			if (lc_put(device->resync, &bm_ext->lce) == 0)
				device->resync_locked--;
			wake_up(&device->al_wait);
		} else {
			drbd_alert(device, "LOGIC BUG\n");
		}
	}
	/* TRY. */
	e = lc_try_get(device->resync, enr);
	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
	if (bm_ext) {
		if (test_bit(BME_LOCKED, &bm_ext->flags))
			goto proceed;
		if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
			device->resync_locked++;
		} else {
			/* we did set the BME_NO_WRITES,
			 * but then could not set BME_LOCKED,
			 * so we tried again.
			 * drop the extra reference. */
			bm_ext->lce.refcnt--;
			D_ASSERT(device, bm_ext->lce.refcnt > 0);
		}
		goto check_al;
	} else {
		/* do we rather want to try later? */
		if (device->resync_locked > device->resync->nr_elements-3)
			goto try_again;
		/* Do or do not. There is no try. -- Yoda */
		e = lc_get(device->resync, enr);
		bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
		if (!bm_ext) {
			const unsigned long rs_flags = device->resync->flags;
			if (rs_flags & LC_STARVING)
				drbd_warn(device, "Have to wait for element"
				     " (resync LRU too small?)\n");
			BUG_ON(rs_flags & LC_LOCKED);
			goto try_again;
		}
		if (bm_ext->lce.lc_number != enr) {
			bm_ext->rs_left = drbd_bm_e_weight(device, enr);
			bm_ext->rs_failed = 0;
			lc_committed(device->resync);
			wake_up(&device->al_wait);
			D_ASSERT(device, test_bit(BME_LOCKED, &bm_ext->flags) == 0);
		}
		set_bit(BME_NO_WRITES, &bm_ext->flags);
		D_ASSERT(device, bm_ext->lce.refcnt == 1);
		device->resync_locked++;
		goto check_al;
	}
check_al:
	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
		if (lc_is_used(device->act_log, al_enr+i))
			goto try_again;
	}
	set_bit(BME_LOCKED, &bm_ext->flags);
proceed:
	device->resync_wenr = LC_FREE;
	spin_unlock_irq(&device->al_lock);
	return 0;

try_again:
	if (bm_ext)
		device->resync_wenr = enr;
	spin_unlock_irq(&device->al_lock);
	return -EAGAIN;
}

void drbd_rs_complete_io(struct drbd_device *device, sector_t sector)
{
	unsigned int enr = BM_SECT_TO_EXT(sector);
	struct lc_element *e;
	struct bm_extent *bm_ext;
	unsigned long flags;

	spin_lock_irqsave(&device->al_lock, flags);
	e = lc_find(device->resync, enr);
	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
	if (!bm_ext) {
		spin_unlock_irqrestore(&device->al_lock, flags);
		if (__ratelimit(&drbd_ratelimit_state))
			drbd_err(device, "drbd_rs_complete_io() called, but extent not found\n");
		return;
	}

	if (bm_ext->lce.refcnt == 0) {
		spin_unlock_irqrestore(&device->al_lock, flags);
		drbd_err(device, "drbd_rs_complete_io(,%llu [=%u]) called, "
		    "but refcnt is 0!?\n",
		    (unsigned long long)sector, enr);
		return;
	}

	if (lc_put(device->resync, &bm_ext->lce) == 0) {
		bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */
		device->resync_locked--;
		wake_up(&device->al_wait);
	}

	spin_unlock_irqrestore(&device->al_lock, flags);
}

/**
 * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)
 * @device:	DRBD device.
 */
void drbd_rs_cancel_all(struct drbd_device *device)
{
	spin_lock_irq(&device->al_lock);

	if (get_ldev_if_state(device, D_FAILED)) { /* Makes sure ->resync is there. */
		lc_reset(device->resync);
		put_ldev(device);
	}
	device->resync_locked = 0;
	device->resync_wenr = LC_FREE;
	spin_unlock_irq(&device->al_lock);
	wake_up(&device->al_wait);
}
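/* Contrast of the two removal paths: drbd_rs_cancel_all() above resets
 * the LRU without honouring reference counts, so callers must know that
 * no resync IO is in flight; drbd_rs_del_all() below is the graceful
 * variant that backs off with -EAGAIN while any extent is still
 * referenced.
 */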
1231 */ 1232 int drbd_rs_del_all(struct drbd_device *device) 1233 { 1234 struct lc_element *e; 1235 struct bm_extent *bm_ext; 1236 int i; 1237 1238 spin_lock_irq(&device->al_lock); 1239 1240 if (get_ldev_if_state(device, D_FAILED)) { 1241 /* ok, ->resync is there. */ 1242 for (i = 0; i < device->resync->nr_elements; i++) { 1243 e = lc_element_by_index(device->resync, i); 1244 bm_ext = lc_entry(e, struct bm_extent, lce); 1245 if (bm_ext->lce.lc_number == LC_FREE) 1246 continue; 1247 if (bm_ext->lce.lc_number == device->resync_wenr) { 1248 drbd_info(device, "dropping %u in drbd_rs_del_all, apparently" 1249 " got 'synced' by application io\n", 1250 device->resync_wenr); 1251 D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); 1252 D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); 1253 clear_bit(BME_NO_WRITES, &bm_ext->flags); 1254 device->resync_wenr = LC_FREE; 1255 lc_put(device->resync, &bm_ext->lce); 1256 } 1257 if (bm_ext->lce.refcnt != 0) { 1258 drbd_info(device, "Retrying drbd_rs_del_all() later. " 1259 "refcnt=%d\n", bm_ext->lce.refcnt); 1260 put_ldev(device); 1261 spin_unlock_irq(&device->al_lock); 1262 return -EAGAIN; 1263 } 1264 D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); 1265 D_ASSERT(device, !test_bit(BME_NO_WRITES, &bm_ext->flags)); 1266 lc_del(device->resync, &bm_ext->lce); 1267 } 1268 D_ASSERT(device, device->resync->used == 0); 1269 put_ldev(device); 1270 } 1271 spin_unlock_irq(&device->al_lock); 1272 wake_up(&device->al_wait); 1273 1274 return 0; 1275 } 1276 1277 /** 1278 * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks 1279 * @device: DRBD device. 1280 * @sector: The sector number. 1281 * @size: Size of failed IO operation, in byte. 1282 */ 1283 void drbd_rs_failed_io(struct drbd_device *device, sector_t sector, int size) 1284 { 1285 /* Is called from worker and receiver context _only_ */ 1286 unsigned long sbnr, ebnr, lbnr; 1287 unsigned long count; 1288 sector_t esector, nr_sectors; 1289 int wake_up = 0; 1290 1291 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { 1292 drbd_err(device, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", 1293 (unsigned long long)sector, size); 1294 return; 1295 } 1296 nr_sectors = drbd_get_capacity(device->this_bdev); 1297 esector = sector + (size >> 9) - 1; 1298 1299 if (!expect(sector < nr_sectors)) 1300 return; 1301 if (!expect(esector < nr_sectors)) 1302 esector = nr_sectors - 1; 1303 1304 lbnr = BM_SECT_TO_BIT(nr_sectors-1); 1305 1306 /* 1307 * round up start sector, round down end sector. we make sure we only 1308 * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */ 1309 if (unlikely(esector < BM_SECT_PER_BIT-1)) 1310 return; 1311 if (unlikely(esector == (nr_sectors-1))) 1312 ebnr = lbnr; 1313 else 1314 ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); 1315 sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); 1316 1317 if (sbnr > ebnr) 1318 return; 1319 1320 /* 1321 * ok, (capacity & 7) != 0 sometimes, but who cares... 1322 * we count rs_{total,left} in bits, not sectors. 1323 */ 1324 spin_lock_irq(&device->al_lock); 1325 count = drbd_bm_count_bits(device, sbnr, ebnr); 1326 if (count) { 1327 device->rs_failed += count; 1328 1329 if (get_ldev(device)) { 1330 drbd_try_clear_on_disk_bm(device, sector, count, false); 1331 put_ldev(device); 1332 } 1333 1334 /* just wake_up unconditional now, various lc_chaged(), 1335 * lc_put() in drbd_try_clear_on_disk_bm(). 
/**
 * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks
 * @device:	DRBD device.
 * @sector:	The sector number.
 * @size:	Size of failed IO operation, in bytes.
 */
void drbd_rs_failed_io(struct drbd_device *device, sector_t sector, int size)
{
	/* Is called from worker and receiver context _only_ */
	unsigned long sbnr, ebnr, lbnr;
	unsigned long count;
	sector_t esector, nr_sectors;
	int wake_up = 0;

	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
		drbd_err(device, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
				(unsigned long long)sector, size);
		return;
	}
	nr_sectors = drbd_get_capacity(device->this_bdev);
	esector = sector + (size >> 9) - 1;

	if (!expect(sector < nr_sectors))
		return;
	if (!expect(esector < nr_sectors))
		esector = nr_sectors - 1;

	lbnr = BM_SECT_TO_BIT(nr_sectors-1);

	/*
	 * round up start sector, round down end sector.  we make sure we only
	 * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */
	if (unlikely(esector < BM_SECT_PER_BIT-1))
		return;
	if (unlikely(esector == (nr_sectors-1)))
		ebnr = lbnr;
	else
		ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
	sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);

	if (sbnr > ebnr)
		return;

	/*
	 * ok, (capacity & 7) != 0 sometimes, but who cares...
	 * we count rs_{total,left} in bits, not sectors.
	 */
	spin_lock_irq(&device->al_lock);
	count = drbd_bm_count_bits(device, sbnr, ebnr);
	if (count) {
		device->rs_failed += count;

		if (get_ldev(device)) {
			drbd_try_clear_on_disk_bm(device, sector, count, false);
			put_ldev(device);
		}

		/* just wake_up unconditional now, various lc_changed(),
		 * lc_put() in drbd_try_clear_on_disk_bm(). */
		wake_up = 1;
	}
	spin_unlock_irq(&device->al_lock);
	if (wake_up)
		wake_up(&device->al_wait);
}