1 /* 2 drbd_actlog.c 3 4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 5 6 Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. 7 Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>. 8 Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 9 10 drbd is free software; you can redistribute it and/or modify 11 it under the terms of the GNU General Public License as published by 12 the Free Software Foundation; either version 2, or (at your option) 13 any later version. 14 15 drbd is distributed in the hope that it will be useful, 16 but WITHOUT ANY WARRANTY; without even the implied warranty of 17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 GNU General Public License for more details. 19 20 You should have received a copy of the GNU General Public License 21 along with drbd; see the file COPYING. If not, write to 22 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 23 24 */ 25 26 #include <linux/slab.h> 27 #include <linux/crc32c.h> 28 #include <linux/drbd.h> 29 #include <linux/drbd_limits.h> 30 #include <linux/dynamic_debug.h> 31 #include "drbd_int.h" 32 33 34 enum al_transaction_types { 35 AL_TR_UPDATE = 0, 36 AL_TR_INITIALIZED = 0xffff 37 }; 38 /* all fields on disc in big endian */ 39 struct __packed al_transaction_on_disk { 40 /* don't we all like magic */ 41 __be32 magic; 42 43 /* to identify the most recent transaction block 44 * in the on disk ring buffer */ 45 __be32 tr_number; 46 47 /* checksum on the full 4k block, with this field set to 0. */ 48 __be32 crc32c; 49 50 /* type of transaction, special transaction types like: 51 * purge-all, set-all-idle, set-all-active, ... to-be-defined 52 * see also enum al_transaction_types */ 53 __be16 transaction_type; 54 55 /* we currently allow only a few thousand extents, 56 * so 16bit will be enough for the slot number. */ 57 58 /* how many updates in this transaction */ 59 __be16 n_updates; 60 61 /* maximum slot number, "al-extents" in drbd.conf speak. 62 * Having this in each transaction should make reconfiguration 63 * of that parameter easier. */ 64 __be16 context_size; 65 66 /* slot number the context starts with */ 67 __be16 context_start_slot_nr; 68 69 /* Some reserved bytes. Expected usage is a 64bit counter of 70 * sectors-written since device creation, and other data generation tag 71 * supporting usage */ 72 __be32 __reserved[4]; 73 74 /* --- 36 byte used --- */ 75 76 /* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes 77 * in one transaction, then use the remaining byte in the 4k block for 78 * context information. "Flexible" number of updates per transaction 79 * does not help, as we have to account for the case when all update 80 * slots are used anyways, so it would only complicate code without 81 * additional benefit. 82 */ 83 __be16 update_slot_nr[AL_UPDATES_PER_TRANSACTION]; 84 85 /* but the extent number is 32bit, which at an extent size of 4 MiB 86 * allows to cover device sizes of up to 2**54 Byte (16 PiB) */ 87 __be32 update_extent_nr[AL_UPDATES_PER_TRANSACTION]; 88 89 /* --- 420 bytes used (36 + 64*6) --- */ 90 91 /* 4096 - 420 = 3676 = 919 * 4 */ 92 __be32 context[AL_CONTEXT_PER_TRANSACTION]; 93 }; 94 95 void *drbd_md_get_buffer(struct drbd_device *device, const char *intent) 96 { 97 int r; 98 99 wait_event(device->misc_wait, 100 (r = atomic_cmpxchg(&device->md_io.in_use, 0, 1)) == 0 || 101 device->state.disk <= D_FAILED); 102 103 if (r) 104 return NULL; 105 106 device->md_io.current_use = intent; 107 device->md_io.start_jif = jiffies; 108 device->md_io.submit_jif = device->md_io.start_jif - 1; 109 return page_address(device->md_io.page); 110 } 111 112 void drbd_md_put_buffer(struct drbd_device *device) 113 { 114 if (atomic_dec_and_test(&device->md_io.in_use)) 115 wake_up(&device->misc_wait); 116 } 117 118 void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_backing_dev *bdev, 119 unsigned int *done) 120 { 121 long dt; 122 123 rcu_read_lock(); 124 dt = rcu_dereference(bdev->disk_conf)->disk_timeout; 125 rcu_read_unlock(); 126 dt = dt * HZ / 10; 127 if (dt == 0) 128 dt = MAX_SCHEDULE_TIMEOUT; 129 130 dt = wait_event_timeout(device->misc_wait, 131 *done || test_bit(FORCE_DETACH, &device->flags), dt); 132 if (dt == 0) { 133 drbd_err(device, "meta-data IO operation timed out\n"); 134 drbd_chk_io_error(device, 1, DRBD_FORCE_DETACH); 135 } 136 } 137 138 static int _drbd_md_sync_page_io(struct drbd_device *device, 139 struct drbd_backing_dev *bdev, 140 struct page *page, sector_t sector, 141 int rw, int size) 142 { 143 struct bio *bio; 144 int err; 145 146 device->md_io.done = 0; 147 device->md_io.error = -ENODEV; 148 149 if ((rw & WRITE) && !test_bit(MD_NO_FUA, &device->flags)) 150 rw |= REQ_FUA | REQ_FLUSH; 151 rw |= REQ_SYNC | REQ_NOIDLE; 152 153 bio = bio_alloc_drbd(GFP_NOIO); 154 bio->bi_bdev = bdev->md_bdev; 155 bio->bi_iter.bi_sector = sector; 156 err = -EIO; 157 if (bio_add_page(bio, page, size, 0) != size) 158 goto out; 159 bio->bi_private = device; 160 bio->bi_end_io = drbd_md_io_complete; 161 bio->bi_rw = rw; 162 163 if (!(rw & WRITE) && device->state.disk == D_DISKLESS && device->ldev == NULL) 164 /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */ 165 ; 166 else if (!get_ldev_if_state(device, D_ATTACHING)) { 167 /* Corresponding put_ldev in drbd_md_io_complete() */ 168 drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); 169 err = -ENODEV; 170 goto out; 171 } 172 173 bio_get(bio); /* one bio_put() is in the completion handler */ 174 atomic_inc(&device->md_io.in_use); /* drbd_md_put_buffer() is in the completion handler */ 175 device->md_io.submit_jif = jiffies; 176 if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) 177 bio_endio(bio, -EIO); 178 else 179 submit_bio(rw, bio); 180 wait_until_done_or_force_detached(device, bdev, &device->md_io.done); 181 if (bio_flagged(bio, BIO_UPTODATE)) 182 err = device->md_io.error; 183 184 out: 185 bio_put(bio); 186 return err; 187 } 188 189 int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev, 190 sector_t sector, int rw) 191 { 192 int err; 193 struct page *iop = device->md_io.page; 194 195 D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1); 196 197 BUG_ON(!bdev->md_bdev); 198 199 dynamic_drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n", 200 current->comm, current->pid, __func__, 201 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", 202 (void*)_RET_IP_ ); 203 204 if (sector < drbd_md_first_sector(bdev) || 205 sector + 7 > drbd_md_last_sector(bdev)) 206 drbd_alert(device, "%s [%d]:%s(,%llus,%s) out of range md access!\n", 207 current->comm, current->pid, __func__, 208 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); 209 210 /* we do all our meta data IO in aligned 4k blocks. */ 211 err = _drbd_md_sync_page_io(device, bdev, iop, sector, rw, 4096); 212 if (err) { 213 drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", 214 (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); 215 } 216 return err; 217 } 218 219 static struct bm_extent *find_active_resync_extent(struct drbd_device *device, unsigned int enr) 220 { 221 struct lc_element *tmp; 222 tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT); 223 if (unlikely(tmp != NULL)) { 224 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); 225 if (test_bit(BME_NO_WRITES, &bm_ext->flags)) 226 return bm_ext; 227 } 228 return NULL; 229 } 230 231 static struct lc_element *_al_get(struct drbd_device *device, unsigned int enr, bool nonblock) 232 { 233 struct lc_element *al_ext; 234 struct bm_extent *bm_ext; 235 int wake; 236 237 spin_lock_irq(&device->al_lock); 238 bm_ext = find_active_resync_extent(device, enr); 239 if (bm_ext) { 240 wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags); 241 spin_unlock_irq(&device->al_lock); 242 if (wake) 243 wake_up(&device->al_wait); 244 return NULL; 245 } 246 if (nonblock) 247 al_ext = lc_try_get(device->act_log, enr); 248 else 249 al_ext = lc_get(device->act_log, enr); 250 spin_unlock_irq(&device->al_lock); 251 return al_ext; 252 } 253 254 bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i) 255 { 256 /* for bios crossing activity log extent boundaries, 257 * we may need to activate two extents in one go */ 258 unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); 259 unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); 260 261 D_ASSERT(device, (unsigned)(last - first) <= 1); 262 D_ASSERT(device, atomic_read(&device->local_cnt) > 0); 263 264 /* FIXME figure out a fast path for bios crossing AL extent boundaries */ 265 if (first != last) 266 return false; 267 268 return _al_get(device, first, true); 269 } 270 271 bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i) 272 { 273 /* for bios crossing activity log extent boundaries, 274 * we may need to activate two extents in one go */ 275 unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); 276 unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); 277 unsigned enr; 278 bool need_transaction = false; 279 280 D_ASSERT(device, first <= last); 281 D_ASSERT(device, atomic_read(&device->local_cnt) > 0); 282 283 for (enr = first; enr <= last; enr++) { 284 struct lc_element *al_ext; 285 wait_event(device->al_wait, 286 (al_ext = _al_get(device, enr, false)) != NULL); 287 if (al_ext->lc_number != enr) 288 need_transaction = true; 289 } 290 return need_transaction; 291 } 292 293 static int al_write_transaction(struct drbd_device *device); 294 295 void drbd_al_begin_io_commit(struct drbd_device *device) 296 { 297 bool locked = false; 298 299 /* Serialize multiple transactions. 300 * This uses test_and_set_bit, memory barrier is implicit. 301 */ 302 wait_event(device->al_wait, 303 device->act_log->pending_changes == 0 || 304 (locked = lc_try_lock_for_transaction(device->act_log))); 305 306 if (locked) { 307 /* Double check: it may have been committed by someone else, 308 * while we have been waiting for the lock. */ 309 if (device->act_log->pending_changes) { 310 bool write_al_updates; 311 312 rcu_read_lock(); 313 write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; 314 rcu_read_unlock(); 315 316 if (write_al_updates) 317 al_write_transaction(device); 318 spin_lock_irq(&device->al_lock); 319 /* FIXME 320 if (err) 321 we need an "lc_cancel" here; 322 */ 323 lc_committed(device->act_log); 324 spin_unlock_irq(&device->al_lock); 325 } 326 lc_unlock(device->act_log); 327 wake_up(&device->al_wait); 328 } 329 } 330 331 /* 332 * @delegate: delegate activity log I/O to the worker thread 333 */ 334 void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i) 335 { 336 if (drbd_al_begin_io_prepare(device, i)) 337 drbd_al_begin_io_commit(device); 338 } 339 340 int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i) 341 { 342 struct lru_cache *al = device->act_log; 343 /* for bios crossing activity log extent boundaries, 344 * we may need to activate two extents in one go */ 345 unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); 346 unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); 347 unsigned nr_al_extents; 348 unsigned available_update_slots; 349 unsigned enr; 350 351 D_ASSERT(device, first <= last); 352 353 nr_al_extents = 1 + last - first; /* worst case: all touched extends are cold. */ 354 available_update_slots = min(al->nr_elements - al->used, 355 al->max_pending_changes - al->pending_changes); 356 357 /* We want all necessary updates for a given request within the same transaction 358 * We could first check how many updates are *actually* needed, 359 * and use that instead of the worst-case nr_al_extents */ 360 if (available_update_slots < nr_al_extents) 361 return -EWOULDBLOCK; 362 363 /* Is resync active in this area? */ 364 for (enr = first; enr <= last; enr++) { 365 struct lc_element *tmp; 366 tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT); 367 if (unlikely(tmp != NULL)) { 368 struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); 369 if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { 370 if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags)) 371 return -EBUSY; 372 return -EWOULDBLOCK; 373 } 374 } 375 } 376 377 /* Checkout the refcounts. 378 * Given that we checked for available elements and update slots above, 379 * this has to be successful. */ 380 for (enr = first; enr <= last; enr++) { 381 struct lc_element *al_ext; 382 al_ext = lc_get_cumulative(device->act_log, enr); 383 if (!al_ext) 384 drbd_info(device, "LOGIC BUG for enr=%u\n", enr); 385 } 386 return 0; 387 } 388 389 void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i) 390 { 391 /* for bios crossing activity log extent boundaries, 392 * we may need to activate two extents in one go */ 393 unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); 394 unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); 395 unsigned enr; 396 struct lc_element *extent; 397 unsigned long flags; 398 399 D_ASSERT(device, first <= last); 400 spin_lock_irqsave(&device->al_lock, flags); 401 402 for (enr = first; enr <= last; enr++) { 403 extent = lc_find(device->act_log, enr); 404 if (!extent) { 405 drbd_err(device, "al_complete_io() called on inactive extent %u\n", enr); 406 continue; 407 } 408 lc_put(device->act_log, extent); 409 } 410 spin_unlock_irqrestore(&device->al_lock, flags); 411 wake_up(&device->al_wait); 412 } 413 414 #if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) 415 /* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT 416 * are still coupled, or assume too much about their relation. 417 * Code below will not work if this is violated. 418 * Will be cleaned up with some followup patch. 419 */ 420 # error FIXME 421 #endif 422 423 static unsigned int al_extent_to_bm_page(unsigned int al_enr) 424 { 425 return al_enr >> 426 /* bit to page */ 427 ((PAGE_SHIFT + 3) - 428 /* al extent number to bit */ 429 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); 430 } 431 432 static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device) 433 { 434 const unsigned int stripes = device->ldev->md.al_stripes; 435 const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k; 436 437 /* transaction number, modulo on-disk ring buffer wrap around */ 438 unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k); 439 440 /* ... to aligned 4k on disk block */ 441 t = ((t % stripes) * stripe_size_4kB) + t/stripes; 442 443 /* ... to 512 byte sector in activity log */ 444 t *= 8; 445 446 /* ... plus offset to the on disk position */ 447 return device->ldev->md.md_offset + device->ldev->md.al_offset + t; 448 } 449 450 int al_write_transaction(struct drbd_device *device) 451 { 452 struct al_transaction_on_disk *buffer; 453 struct lc_element *e; 454 sector_t sector; 455 int i, mx; 456 unsigned extent_nr; 457 unsigned crc = 0; 458 int err = 0; 459 460 if (!get_ldev(device)) { 461 drbd_err(device, "disk is %s, cannot start al transaction\n", 462 drbd_disk_str(device->state.disk)); 463 return -EIO; 464 } 465 466 /* The bitmap write may have failed, causing a state change. */ 467 if (device->state.disk < D_INCONSISTENT) { 468 drbd_err(device, 469 "disk is %s, cannot write al transaction\n", 470 drbd_disk_str(device->state.disk)); 471 put_ldev(device); 472 return -EIO; 473 } 474 475 /* protects md_io_buffer, al_tr_cycle, ... */ 476 buffer = drbd_md_get_buffer(device, __func__); 477 if (!buffer) { 478 drbd_err(device, "disk failed while waiting for md_io buffer\n"); 479 put_ldev(device); 480 return -ENODEV; 481 } 482 483 memset(buffer, 0, sizeof(*buffer)); 484 buffer->magic = cpu_to_be32(DRBD_AL_MAGIC); 485 buffer->tr_number = cpu_to_be32(device->al_tr_number); 486 487 i = 0; 488 489 /* Even though no one can start to change this list 490 * once we set the LC_LOCKED -- from drbd_al_begin_io(), 491 * lc_try_lock_for_transaction() --, someone may still 492 * be in the process of changing it. */ 493 spin_lock_irq(&device->al_lock); 494 list_for_each_entry(e, &device->act_log->to_be_changed, list) { 495 if (i == AL_UPDATES_PER_TRANSACTION) { 496 i++; 497 break; 498 } 499 buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index); 500 buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number); 501 if (e->lc_number != LC_FREE) 502 drbd_bm_mark_for_writeout(device, 503 al_extent_to_bm_page(e->lc_number)); 504 i++; 505 } 506 spin_unlock_irq(&device->al_lock); 507 BUG_ON(i > AL_UPDATES_PER_TRANSACTION); 508 509 buffer->n_updates = cpu_to_be16(i); 510 for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) { 511 buffer->update_slot_nr[i] = cpu_to_be16(-1); 512 buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE); 513 } 514 515 buffer->context_size = cpu_to_be16(device->act_log->nr_elements); 516 buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle); 517 518 mx = min_t(int, AL_CONTEXT_PER_TRANSACTION, 519 device->act_log->nr_elements - device->al_tr_cycle); 520 for (i = 0; i < mx; i++) { 521 unsigned idx = device->al_tr_cycle + i; 522 extent_nr = lc_element_by_index(device->act_log, idx)->lc_number; 523 buffer->context[i] = cpu_to_be32(extent_nr); 524 } 525 for (; i < AL_CONTEXT_PER_TRANSACTION; i++) 526 buffer->context[i] = cpu_to_be32(LC_FREE); 527 528 device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION; 529 if (device->al_tr_cycle >= device->act_log->nr_elements) 530 device->al_tr_cycle = 0; 531 532 sector = al_tr_number_to_on_disk_sector(device); 533 534 crc = crc32c(0, buffer, 4096); 535 buffer->crc32c = cpu_to_be32(crc); 536 537 if (drbd_bm_write_hinted(device)) 538 err = -EIO; 539 else { 540 bool write_al_updates; 541 rcu_read_lock(); 542 write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; 543 rcu_read_unlock(); 544 if (write_al_updates) { 545 if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) { 546 err = -EIO; 547 drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); 548 } else { 549 device->al_tr_number++; 550 device->al_writ_cnt++; 551 } 552 } 553 } 554 555 drbd_md_put_buffer(device); 556 put_ldev(device); 557 558 return err; 559 } 560 561 static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext) 562 { 563 int rv; 564 565 spin_lock_irq(&device->al_lock); 566 rv = (al_ext->refcnt == 0); 567 if (likely(rv)) 568 lc_del(device->act_log, al_ext); 569 spin_unlock_irq(&device->al_lock); 570 571 return rv; 572 } 573 574 /** 575 * drbd_al_shrink() - Removes all active extents form the activity log 576 * @device: DRBD device. 577 * 578 * Removes all active extents form the activity log, waiting until 579 * the reference count of each entry dropped to 0 first, of course. 580 * 581 * You need to lock device->act_log with lc_try_lock() / lc_unlock() 582 */ 583 void drbd_al_shrink(struct drbd_device *device) 584 { 585 struct lc_element *al_ext; 586 int i; 587 588 D_ASSERT(device, test_bit(__LC_LOCKED, &device->act_log->flags)); 589 590 for (i = 0; i < device->act_log->nr_elements; i++) { 591 al_ext = lc_element_by_index(device->act_log, i); 592 if (al_ext->lc_number == LC_FREE) 593 continue; 594 wait_event(device->al_wait, _try_lc_del(device, al_ext)); 595 } 596 597 wake_up(&device->al_wait); 598 } 599 600 int drbd_initialize_al(struct drbd_device *device, void *buffer) 601 { 602 struct al_transaction_on_disk *al = buffer; 603 struct drbd_md *md = &device->ldev->md; 604 sector_t al_base = md->md_offset + md->al_offset; 605 int al_size_4k = md->al_stripes * md->al_stripe_size_4k; 606 int i; 607 608 memset(al, 0, 4096); 609 al->magic = cpu_to_be32(DRBD_AL_MAGIC); 610 al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED); 611 al->crc32c = cpu_to_be32(crc32c(0, al, 4096)); 612 613 for (i = 0; i < al_size_4k; i++) { 614 int err = drbd_md_sync_page_io(device, device->ldev, al_base + i * 8, WRITE); 615 if (err) 616 return err; 617 } 618 return 0; 619 } 620 621 static const char *drbd_change_sync_fname[] = { 622 [RECORD_RS_FAILED] = "drbd_rs_failed_io", 623 [SET_IN_SYNC] = "drbd_set_in_sync", 624 [SET_OUT_OF_SYNC] = "drbd_set_out_of_sync" 625 }; 626 627 /* ATTENTION. The AL's extents are 4MB each, while the extents in the 628 * resync LRU-cache are 16MB each. 629 * The caller of this function has to hold an get_ldev() reference. 630 * 631 * Adjusts the caching members ->rs_left (success) or ->rs_failed (!success), 632 * potentially pulling in (and recounting the corresponding bits) 633 * this resync extent into the resync extent lru cache. 634 * 635 * Returns whether all bits have been cleared for this resync extent, 636 * precisely: (rs_left <= rs_failed) 637 * 638 * TODO will be obsoleted once we have a caching lru of the on disk bitmap 639 */ 640 static bool update_rs_extent(struct drbd_device *device, 641 unsigned int enr, int count, 642 enum update_sync_bits_mode mode) 643 { 644 struct lc_element *e; 645 646 D_ASSERT(device, atomic_read(&device->local_cnt)); 647 648 /* When setting out-of-sync bits, 649 * we don't need it cached (lc_find). 650 * But if it is present in the cache, 651 * we should update the cached bit count. 652 * Otherwise, that extent should be in the resync extent lru cache 653 * already -- or we want to pull it in if necessary -- (lc_get), 654 * then update and check rs_left and rs_failed. */ 655 if (mode == SET_OUT_OF_SYNC) 656 e = lc_find(device->resync, enr); 657 else 658 e = lc_get(device->resync, enr); 659 if (e) { 660 struct bm_extent *ext = lc_entry(e, struct bm_extent, lce); 661 if (ext->lce.lc_number == enr) { 662 if (mode == SET_IN_SYNC) 663 ext->rs_left -= count; 664 else if (mode == SET_OUT_OF_SYNC) 665 ext->rs_left += count; 666 else 667 ext->rs_failed += count; 668 if (ext->rs_left < ext->rs_failed) { 669 drbd_warn(device, "BAD! enr=%u rs_left=%d " 670 "rs_failed=%d count=%d cstate=%s\n", 671 ext->lce.lc_number, ext->rs_left, 672 ext->rs_failed, count, 673 drbd_conn_str(device->state.conn)); 674 675 /* We don't expect to be able to clear more bits 676 * than have been set when we originally counted 677 * the set bits to cache that value in ext->rs_left. 678 * Whatever the reason (disconnect during resync, 679 * delayed local completion of an application write), 680 * try to fix it up by recounting here. */ 681 ext->rs_left = drbd_bm_e_weight(device, enr); 682 } 683 } else { 684 /* Normally this element should be in the cache, 685 * since drbd_rs_begin_io() pulled it already in. 686 * 687 * But maybe an application write finished, and we set 688 * something outside the resync lru_cache in sync. 689 */ 690 int rs_left = drbd_bm_e_weight(device, enr); 691 if (ext->flags != 0) { 692 drbd_warn(device, "changing resync lce: %d[%u;%02lx]" 693 " -> %d[%u;00]\n", 694 ext->lce.lc_number, ext->rs_left, 695 ext->flags, enr, rs_left); 696 ext->flags = 0; 697 } 698 if (ext->rs_failed) { 699 drbd_warn(device, "Kicking resync_lru element enr=%u " 700 "out with rs_failed=%d\n", 701 ext->lce.lc_number, ext->rs_failed); 702 } 703 ext->rs_left = rs_left; 704 ext->rs_failed = (mode == RECORD_RS_FAILED) ? count : 0; 705 /* we don't keep a persistent log of the resync lru, 706 * we can commit any change right away. */ 707 lc_committed(device->resync); 708 } 709 if (mode != SET_OUT_OF_SYNC) 710 lc_put(device->resync, &ext->lce); 711 /* no race, we are within the al_lock! */ 712 713 if (ext->rs_left <= ext->rs_failed) { 714 ext->rs_failed = 0; 715 return true; 716 } 717 } else if (mode != SET_OUT_OF_SYNC) { 718 /* be quiet if lc_find() did not find it. */ 719 drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n", 720 device->resync_locked, 721 device->resync->nr_elements, 722 device->resync->flags); 723 } 724 return false; 725 } 726 727 void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go) 728 { 729 unsigned long now = jiffies; 730 unsigned long last = device->rs_mark_time[device->rs_last_mark]; 731 int next = (device->rs_last_mark + 1) % DRBD_SYNC_MARKS; 732 if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) { 733 if (device->rs_mark_left[device->rs_last_mark] != still_to_go && 734 device->state.conn != C_PAUSED_SYNC_T && 735 device->state.conn != C_PAUSED_SYNC_S) { 736 device->rs_mark_time[next] = now; 737 device->rs_mark_left[next] = still_to_go; 738 device->rs_last_mark = next; 739 } 740 } 741 } 742 743 /* It is called lazy update, so don't do write-out too often. */ 744 static bool lazy_bitmap_update_due(struct drbd_device *device) 745 { 746 return time_after(jiffies, device->rs_last_bcast + 2*HZ); 747 } 748 749 static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done) 750 { 751 if (rs_done) 752 set_bit(RS_DONE, &device->flags); 753 /* and also set RS_PROGRESS below */ 754 else if (!lazy_bitmap_update_due(device)) 755 return; 756 757 drbd_device_post_work(device, RS_PROGRESS); 758 } 759 760 static int update_sync_bits(struct drbd_device *device, 761 unsigned long sbnr, unsigned long ebnr, 762 enum update_sync_bits_mode mode) 763 { 764 /* 765 * We keep a count of set bits per resync-extent in the ->rs_left 766 * caching member, so we need to loop and work within the resync extent 767 * alignment. Typically this loop will execute exactly once. 768 */ 769 unsigned long flags; 770 unsigned long count = 0; 771 unsigned int cleared = 0; 772 while (sbnr <= ebnr) { 773 /* set temporary boundary bit number to last bit number within 774 * the resync extent of the current start bit number, 775 * but cap at provided end bit number */ 776 unsigned long tbnr = min(ebnr, sbnr | BM_BLOCKS_PER_BM_EXT_MASK); 777 unsigned long c; 778 779 if (mode == RECORD_RS_FAILED) 780 /* Only called from drbd_rs_failed_io(), bits 781 * supposedly still set. Recount, maybe some 782 * of the bits have been successfully cleared 783 * by application IO meanwhile. 784 */ 785 c = drbd_bm_count_bits(device, sbnr, tbnr); 786 else if (mode == SET_IN_SYNC) 787 c = drbd_bm_clear_bits(device, sbnr, tbnr); 788 else /* if (mode == SET_OUT_OF_SYNC) */ 789 c = drbd_bm_set_bits(device, sbnr, tbnr); 790 791 if (c) { 792 spin_lock_irqsave(&device->al_lock, flags); 793 cleared += update_rs_extent(device, BM_BIT_TO_EXT(sbnr), c, mode); 794 spin_unlock_irqrestore(&device->al_lock, flags); 795 count += c; 796 } 797 sbnr = tbnr + 1; 798 } 799 if (count) { 800 if (mode == SET_IN_SYNC) { 801 unsigned long still_to_go = drbd_bm_total_weight(device); 802 bool rs_is_done = (still_to_go <= device->rs_failed); 803 drbd_advance_rs_marks(device, still_to_go); 804 if (cleared || rs_is_done) 805 maybe_schedule_on_disk_bitmap_update(device, rs_is_done); 806 } else if (mode == RECORD_RS_FAILED) 807 device->rs_failed += count; 808 wake_up(&device->al_wait); 809 } 810 return count; 811 } 812 813 /* clear the bit corresponding to the piece of storage in question: 814 * size byte of data starting from sector. Only clear a bits of the affected 815 * one ore more _aligned_ BM_BLOCK_SIZE blocks. 816 * 817 * called by worker on C_SYNC_TARGET and receiver on SyncSource. 818 * 819 */ 820 int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, 821 enum update_sync_bits_mode mode, 822 const char *file, const unsigned int line) 823 { 824 /* Is called from worker and receiver context _only_ */ 825 unsigned long sbnr, ebnr, lbnr; 826 unsigned long count = 0; 827 sector_t esector, nr_sectors; 828 829 /* This would be an empty REQ_FLUSH, be silent. */ 830 if ((mode == SET_OUT_OF_SYNC) && size == 0) 831 return 0; 832 833 if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { 834 drbd_err(device, "%s: sector=%llus size=%d nonsense!\n", 835 drbd_change_sync_fname[mode], 836 (unsigned long long)sector, size); 837 return 0; 838 } 839 840 if (!get_ldev(device)) 841 return 0; /* no disk, no metadata, no bitmap to manipulate bits in */ 842 843 nr_sectors = drbd_get_capacity(device->this_bdev); 844 esector = sector + (size >> 9) - 1; 845 846 if (!expect(sector < nr_sectors)) 847 goto out; 848 if (!expect(esector < nr_sectors)) 849 esector = nr_sectors - 1; 850 851 lbnr = BM_SECT_TO_BIT(nr_sectors-1); 852 853 if (mode == SET_IN_SYNC) { 854 /* Round up start sector, round down end sector. We make sure 855 * we only clear full, aligned, BM_BLOCK_SIZE blocks. */ 856 if (unlikely(esector < BM_SECT_PER_BIT-1)) 857 goto out; 858 if (unlikely(esector == (nr_sectors-1))) 859 ebnr = lbnr; 860 else 861 ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); 862 sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); 863 } else { 864 /* We set it out of sync, or record resync failure. 865 * Should not round anything here. */ 866 sbnr = BM_SECT_TO_BIT(sector); 867 ebnr = BM_SECT_TO_BIT(esector); 868 } 869 870 count = update_sync_bits(device, sbnr, ebnr, mode); 871 out: 872 put_ldev(device); 873 return count; 874 } 875 876 static 877 struct bm_extent *_bme_get(struct drbd_device *device, unsigned int enr) 878 { 879 struct lc_element *e; 880 struct bm_extent *bm_ext; 881 int wakeup = 0; 882 unsigned long rs_flags; 883 884 spin_lock_irq(&device->al_lock); 885 if (device->resync_locked > device->resync->nr_elements/2) { 886 spin_unlock_irq(&device->al_lock); 887 return NULL; 888 } 889 e = lc_get(device->resync, enr); 890 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 891 if (bm_ext) { 892 if (bm_ext->lce.lc_number != enr) { 893 bm_ext->rs_left = drbd_bm_e_weight(device, enr); 894 bm_ext->rs_failed = 0; 895 lc_committed(device->resync); 896 wakeup = 1; 897 } 898 if (bm_ext->lce.refcnt == 1) 899 device->resync_locked++; 900 set_bit(BME_NO_WRITES, &bm_ext->flags); 901 } 902 rs_flags = device->resync->flags; 903 spin_unlock_irq(&device->al_lock); 904 if (wakeup) 905 wake_up(&device->al_wait); 906 907 if (!bm_ext) { 908 if (rs_flags & LC_STARVING) 909 drbd_warn(device, "Have to wait for element" 910 " (resync LRU too small?)\n"); 911 BUG_ON(rs_flags & LC_LOCKED); 912 } 913 914 return bm_ext; 915 } 916 917 static int _is_in_al(struct drbd_device *device, unsigned int enr) 918 { 919 int rv; 920 921 spin_lock_irq(&device->al_lock); 922 rv = lc_is_used(device->act_log, enr); 923 spin_unlock_irq(&device->al_lock); 924 925 return rv; 926 } 927 928 /** 929 * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED 930 * @device: DRBD device. 931 * @sector: The sector number. 932 * 933 * This functions sleeps on al_wait. Returns 0 on success, -EINTR if interrupted. 934 */ 935 int drbd_rs_begin_io(struct drbd_device *device, sector_t sector) 936 { 937 unsigned int enr = BM_SECT_TO_EXT(sector); 938 struct bm_extent *bm_ext; 939 int i, sig; 940 bool sa; 941 942 retry: 943 sig = wait_event_interruptible(device->al_wait, 944 (bm_ext = _bme_get(device, enr))); 945 if (sig) 946 return -EINTR; 947 948 if (test_bit(BME_LOCKED, &bm_ext->flags)) 949 return 0; 950 951 /* step aside only while we are above c-min-rate; unless disabled. */ 952 sa = drbd_rs_c_min_rate_throttle(device); 953 954 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { 955 sig = wait_event_interruptible(device->al_wait, 956 !_is_in_al(device, enr * AL_EXT_PER_BM_SECT + i) || 957 (sa && test_bit(BME_PRIORITY, &bm_ext->flags))); 958 959 if (sig || (sa && test_bit(BME_PRIORITY, &bm_ext->flags))) { 960 spin_lock_irq(&device->al_lock); 961 if (lc_put(device->resync, &bm_ext->lce) == 0) { 962 bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */ 963 device->resync_locked--; 964 wake_up(&device->al_wait); 965 } 966 spin_unlock_irq(&device->al_lock); 967 if (sig) 968 return -EINTR; 969 if (schedule_timeout_interruptible(HZ/10)) 970 return -EINTR; 971 goto retry; 972 } 973 } 974 set_bit(BME_LOCKED, &bm_ext->flags); 975 return 0; 976 } 977 978 /** 979 * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep 980 * @device: DRBD device. 981 * @sector: The sector number. 982 * 983 * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then 984 * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN 985 * if there is still application IO going on in this area. 986 */ 987 int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector) 988 { 989 unsigned int enr = BM_SECT_TO_EXT(sector); 990 const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT; 991 struct lc_element *e; 992 struct bm_extent *bm_ext; 993 int i; 994 995 spin_lock_irq(&device->al_lock); 996 if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) { 997 /* in case you have very heavy scattered io, it may 998 * stall the syncer undefined if we give up the ref count 999 * when we try again and requeue. 1000 * 1001 * if we don't give up the refcount, but the next time 1002 * we are scheduled this extent has been "synced" by new 1003 * application writes, we'd miss the lc_put on the 1004 * extent we keep the refcount on. 1005 * so we remembered which extent we had to try again, and 1006 * if the next requested one is something else, we do 1007 * the lc_put here... 1008 * we also have to wake_up 1009 */ 1010 e = lc_find(device->resync, device->resync_wenr); 1011 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 1012 if (bm_ext) { 1013 D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); 1014 D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); 1015 clear_bit(BME_NO_WRITES, &bm_ext->flags); 1016 device->resync_wenr = LC_FREE; 1017 if (lc_put(device->resync, &bm_ext->lce) == 0) 1018 device->resync_locked--; 1019 wake_up(&device->al_wait); 1020 } else { 1021 drbd_alert(device, "LOGIC BUG\n"); 1022 } 1023 } 1024 /* TRY. */ 1025 e = lc_try_get(device->resync, enr); 1026 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 1027 if (bm_ext) { 1028 if (test_bit(BME_LOCKED, &bm_ext->flags)) 1029 goto proceed; 1030 if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) { 1031 device->resync_locked++; 1032 } else { 1033 /* we did set the BME_NO_WRITES, 1034 * but then could not set BME_LOCKED, 1035 * so we tried again. 1036 * drop the extra reference. */ 1037 bm_ext->lce.refcnt--; 1038 D_ASSERT(device, bm_ext->lce.refcnt > 0); 1039 } 1040 goto check_al; 1041 } else { 1042 /* do we rather want to try later? */ 1043 if (device->resync_locked > device->resync->nr_elements-3) 1044 goto try_again; 1045 /* Do or do not. There is no try. -- Yoda */ 1046 e = lc_get(device->resync, enr); 1047 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 1048 if (!bm_ext) { 1049 const unsigned long rs_flags = device->resync->flags; 1050 if (rs_flags & LC_STARVING) 1051 drbd_warn(device, "Have to wait for element" 1052 " (resync LRU too small?)\n"); 1053 BUG_ON(rs_flags & LC_LOCKED); 1054 goto try_again; 1055 } 1056 if (bm_ext->lce.lc_number != enr) { 1057 bm_ext->rs_left = drbd_bm_e_weight(device, enr); 1058 bm_ext->rs_failed = 0; 1059 lc_committed(device->resync); 1060 wake_up(&device->al_wait); 1061 D_ASSERT(device, test_bit(BME_LOCKED, &bm_ext->flags) == 0); 1062 } 1063 set_bit(BME_NO_WRITES, &bm_ext->flags); 1064 D_ASSERT(device, bm_ext->lce.refcnt == 1); 1065 device->resync_locked++; 1066 goto check_al; 1067 } 1068 check_al: 1069 for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { 1070 if (lc_is_used(device->act_log, al_enr+i)) 1071 goto try_again; 1072 } 1073 set_bit(BME_LOCKED, &bm_ext->flags); 1074 proceed: 1075 device->resync_wenr = LC_FREE; 1076 spin_unlock_irq(&device->al_lock); 1077 return 0; 1078 1079 try_again: 1080 if (bm_ext) 1081 device->resync_wenr = enr; 1082 spin_unlock_irq(&device->al_lock); 1083 return -EAGAIN; 1084 } 1085 1086 void drbd_rs_complete_io(struct drbd_device *device, sector_t sector) 1087 { 1088 unsigned int enr = BM_SECT_TO_EXT(sector); 1089 struct lc_element *e; 1090 struct bm_extent *bm_ext; 1091 unsigned long flags; 1092 1093 spin_lock_irqsave(&device->al_lock, flags); 1094 e = lc_find(device->resync, enr); 1095 bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; 1096 if (!bm_ext) { 1097 spin_unlock_irqrestore(&device->al_lock, flags); 1098 if (__ratelimit(&drbd_ratelimit_state)) 1099 drbd_err(device, "drbd_rs_complete_io() called, but extent not found\n"); 1100 return; 1101 } 1102 1103 if (bm_ext->lce.refcnt == 0) { 1104 spin_unlock_irqrestore(&device->al_lock, flags); 1105 drbd_err(device, "drbd_rs_complete_io(,%llu [=%u]) called, " 1106 "but refcnt is 0!?\n", 1107 (unsigned long long)sector, enr); 1108 return; 1109 } 1110 1111 if (lc_put(device->resync, &bm_ext->lce) == 0) { 1112 bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */ 1113 device->resync_locked--; 1114 wake_up(&device->al_wait); 1115 } 1116 1117 spin_unlock_irqrestore(&device->al_lock, flags); 1118 } 1119 1120 /** 1121 * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED) 1122 * @device: DRBD device. 1123 */ 1124 void drbd_rs_cancel_all(struct drbd_device *device) 1125 { 1126 spin_lock_irq(&device->al_lock); 1127 1128 if (get_ldev_if_state(device, D_FAILED)) { /* Makes sure ->resync is there. */ 1129 lc_reset(device->resync); 1130 put_ldev(device); 1131 } 1132 device->resync_locked = 0; 1133 device->resync_wenr = LC_FREE; 1134 spin_unlock_irq(&device->al_lock); 1135 wake_up(&device->al_wait); 1136 } 1137 1138 /** 1139 * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU 1140 * @device: DRBD device. 1141 * 1142 * Returns 0 upon success, -EAGAIN if at least one reference count was 1143 * not zero. 1144 */ 1145 int drbd_rs_del_all(struct drbd_device *device) 1146 { 1147 struct lc_element *e; 1148 struct bm_extent *bm_ext; 1149 int i; 1150 1151 spin_lock_irq(&device->al_lock); 1152 1153 if (get_ldev_if_state(device, D_FAILED)) { 1154 /* ok, ->resync is there. */ 1155 for (i = 0; i < device->resync->nr_elements; i++) { 1156 e = lc_element_by_index(device->resync, i); 1157 bm_ext = lc_entry(e, struct bm_extent, lce); 1158 if (bm_ext->lce.lc_number == LC_FREE) 1159 continue; 1160 if (bm_ext->lce.lc_number == device->resync_wenr) { 1161 drbd_info(device, "dropping %u in drbd_rs_del_all, apparently" 1162 " got 'synced' by application io\n", 1163 device->resync_wenr); 1164 D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); 1165 D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags)); 1166 clear_bit(BME_NO_WRITES, &bm_ext->flags); 1167 device->resync_wenr = LC_FREE; 1168 lc_put(device->resync, &bm_ext->lce); 1169 } 1170 if (bm_ext->lce.refcnt != 0) { 1171 drbd_info(device, "Retrying drbd_rs_del_all() later. " 1172 "refcnt=%d\n", bm_ext->lce.refcnt); 1173 put_ldev(device); 1174 spin_unlock_irq(&device->al_lock); 1175 return -EAGAIN; 1176 } 1177 D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags)); 1178 D_ASSERT(device, !test_bit(BME_NO_WRITES, &bm_ext->flags)); 1179 lc_del(device->resync, &bm_ext->lce); 1180 } 1181 D_ASSERT(device, device->resync->used == 0); 1182 put_ldev(device); 1183 } 1184 spin_unlock_irq(&device->al_lock); 1185 wake_up(&device->al_wait); 1186 1187 return 0; 1188 } 1189