// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2017 Western Digital Corporation or its affiliates.
 *
 * This file is released under the GPL.
 */

#include "dm-zoned.h"

#include <linux/module.h>

#define DM_MSG_PREFIX		"zoned reclaim"

struct dmz_reclaim {
	struct dmz_metadata	*metadata;
	struct dmz_dev		*dev;

	struct delayed_work	work;
	struct workqueue_struct *wq;

	struct dm_kcopyd_client	*kc;
	struct dm_kcopyd_throttle kc_throttle;
	int			kc_err;

	unsigned long		flags;

	/* Last target access time */
	unsigned long		atime;
};

/*
 * Reclaim state flags.
 */
enum {
	DMZ_RECLAIM_KCOPY,
};

/*
 * Period of target BIO inactivity (10 seconds, expressed in jiffies) after
 * which the target is considered idle.
 */
#define DMZ_IDLE_PERIOD		(10UL * HZ)

/*
 * Percentage of unmapped (free) random zones below which reclaim starts
 * even if the target is busy.
 */
#define DMZ_RECLAIM_LOW_UNMAP_RND	30

/*
 * Percentage of unmapped (free) random zones above which reclaim will
 * stop if the target is busy.
 */
#define DMZ_RECLAIM_HIGH_UNMAP_RND	50
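
/*
 * Together, these thresholds drive dmz_should_reclaim() below: an idle
 * target is reclaimed as soon as any random zone is mapped, while a busy
 * target only triggers reclaim once the percentage of unmapped random
 * zones drops to DMZ_RECLAIM_LOW_UNMAP_RND or below. For example, with
 * 100 random zones, a busy target starts reclaiming when 30 or fewer of
 * them are unmapped, and stays quiet while more than that remain free.
 */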

/*
 * Align a sequential zone write pointer to the specified block.
 */
static int dmz_reclaim_align_wp(struct dmz_reclaim *zrc, struct dm_zone *zone,
				sector_t block)
{
	struct dmz_metadata *zmd = zrc->metadata;
	sector_t wp_block = zone->wp_block;
	unsigned int nr_blocks;
	int ret;

	if (wp_block == block)
		return 0;

	if (wp_block > block)
		return -EIO;

	/*
	 * Zero out the space between the write
	 * pointer and the requested position.
	 */
	nr_blocks = block - wp_block;
	ret = blkdev_issue_zeroout(zrc->dev->bdev,
				   dmz_start_sect(zmd, zone) + dmz_blk2sect(wp_block),
				   dmz_blk2sect(nr_blocks), GFP_NOIO, 0);
	if (ret) {
		dmz_dev_err(zrc->dev,
			    "Align zone %u wp %llu to %llu (wp+%u) blocks failed %d",
			    dmz_id(zmd, zone), (unsigned long long)wp_block,
			    (unsigned long long)block, nr_blocks, ret);
		dmz_check_bdev(zrc->dev);
		return ret;
	}

	zone->wp_block = block;

	return 0;
}

/*
 * dm_kcopyd_copy end notification.
 */
static void dmz_reclaim_kcopy_end(int read_err, unsigned long write_err,
				  void *context)
{
	struct dmz_reclaim *zrc = context;

	if (read_err || write_err)
		zrc->kc_err = -EIO;
	else
		zrc->kc_err = 0;

	clear_bit_unlock(DMZ_RECLAIM_KCOPY, &zrc->flags);
	smp_mb__after_atomic();
	wake_up_bit(&zrc->flags, DMZ_RECLAIM_KCOPY);
}

/*
 * Copy valid blocks of src_zone into dst_zone.
 */
static int dmz_reclaim_copy(struct dmz_reclaim *zrc,
			    struct dm_zone *src_zone, struct dm_zone *dst_zone)
{
	struct dmz_metadata *zmd = zrc->metadata;
	struct dmz_dev *dev = zrc->dev;
	struct dm_io_region src, dst;
	sector_t block = 0, end_block;
	sector_t nr_blocks;
	sector_t src_zone_block;
	sector_t dst_zone_block;
	unsigned long flags = 0;
	int ret;

	if (dmz_is_seq(src_zone))
		end_block = src_zone->wp_block;
	else
		end_block = dev->zone_nr_blocks;
	src_zone_block = dmz_start_block(zmd, src_zone);
	dst_zone_block = dmz_start_block(zmd, dst_zone);

	if (dmz_is_seq(dst_zone))
		set_bit(DM_KCOPYD_WRITE_SEQ, &flags);

	while (block < end_block) {
		if (dev->flags & DMZ_BDEV_DYING)
			return -EIO;

		/* Get a valid region from the source zone */
		ret = dmz_first_valid_block(zmd, src_zone, &block);
		if (ret <= 0)
			return ret;
		nr_blocks = ret;

		/*
		 * If we are writing in a sequential zone, we must make sure
		 * that writes are sequential. So zero out any hole between
		 * writes.
		 */
		if (dmz_is_seq(dst_zone)) {
			ret = dmz_reclaim_align_wp(zrc, dst_zone, block);
			if (ret)
				return ret;
		}

		src.bdev = dev->bdev;
		src.sector = dmz_blk2sect(src_zone_block + block);
		src.count = dmz_blk2sect(nr_blocks);

		dst.bdev = dev->bdev;
		dst.sector = dmz_blk2sect(dst_zone_block + block);
		dst.count = src.count;

		/* Copy the valid region */
		set_bit(DMZ_RECLAIM_KCOPY, &zrc->flags);
		dm_kcopyd_copy(zrc->kc, &src, 1, &dst, flags,
			       dmz_reclaim_kcopy_end, zrc);

		/* Wait for copy to complete */
		wait_on_bit_io(&zrc->flags, DMZ_RECLAIM_KCOPY,
			       TASK_UNINTERRUPTIBLE);
		if (zrc->kc_err)
			return zrc->kc_err;

		block += nr_blocks;
		if (dmz_is_seq(dst_zone))
			dst_zone->wp_block = block;
	}

	return 0;
}
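
/*
 * Note on the copy loop above: each valid extent reported by
 * dmz_first_valid_block() is handed to kcopyd, and the caller then sleeps
 * on the DMZ_RECLAIM_KCOPY flag bit until dmz_reclaim_kcopy_end() clears
 * it, so at most one copy operation is in flight per reclaim instance.
 * When the destination is a sequential zone, dmz_reclaim_align_wp()
 * zeroes any gap first so that the zone is only ever written at its
 * write pointer.
 */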

/*
 * Move the valid blocks of dzone's buffer zone into dzone (after its write
 * pointer) and free the buffer zone.
 */
static int dmz_reclaim_buf(struct dmz_reclaim *zrc, struct dm_zone *dzone)
{
	struct dm_zone *bzone = dzone->bzone;
	sector_t chunk_block = dzone->wp_block;
	struct dmz_metadata *zmd = zrc->metadata;
	int ret;

	dmz_dev_debug(zrc->dev,
		      "Chunk %u, move buf zone %u (weight %u) to data zone %u (weight %u)",
		      dzone->chunk, dmz_id(zmd, bzone), dmz_weight(bzone),
		      dmz_id(zmd, dzone), dmz_weight(dzone));

	/* Flush the buffer zone into the data zone */
	ret = dmz_reclaim_copy(zrc, bzone, dzone);
	if (ret < 0)
		return ret;

	dmz_lock_flush(zmd);

	/* Validate copied blocks */
	ret = dmz_merge_valid_blocks(zmd, bzone, dzone, chunk_block);
	if (ret == 0) {
		/* Free the buffer zone */
		dmz_invalidate_blocks(zmd, bzone, 0, zrc->dev->zone_nr_blocks);
		dmz_lock_map(zmd);
		dmz_unmap_zone(zmd, bzone);
		dmz_unlock_zone_reclaim(dzone);
		dmz_free_zone(zmd, bzone);
		dmz_unlock_map(zmd);
	}

	dmz_unlock_flush(zmd);

	return ret;
}

/*
 * Merge valid blocks of dzone into its buffer zone and free dzone.
 */
static int dmz_reclaim_seq_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
{
	unsigned int chunk = dzone->chunk;
	struct dm_zone *bzone = dzone->bzone;
	struct dmz_metadata *zmd = zrc->metadata;
	int ret = 0;

	dmz_dev_debug(zrc->dev,
		      "Chunk %u, move data zone %u (weight %u) to buf zone %u (weight %u)",
		      chunk, dmz_id(zmd, dzone), dmz_weight(dzone),
		      dmz_id(zmd, bzone), dmz_weight(bzone));

	/* Flush the data zone into the buffer zone */
	ret = dmz_reclaim_copy(zrc, dzone, bzone);
	if (ret < 0)
		return ret;

	dmz_lock_flush(zmd);

	/* Validate copied blocks */
	ret = dmz_merge_valid_blocks(zmd, dzone, bzone, 0);
	if (ret == 0) {
		/*
		 * Free the data zone and remap the chunk to
		 * the buffer zone.
		 */
		dmz_invalidate_blocks(zmd, dzone, 0, zrc->dev->zone_nr_blocks);
		dmz_lock_map(zmd);
		dmz_unmap_zone(zmd, bzone);
		dmz_unmap_zone(zmd, dzone);
		dmz_unlock_zone_reclaim(dzone);
		dmz_free_zone(zmd, dzone);
		dmz_map_zone(zmd, bzone, chunk);
		dmz_unlock_map(zmd);
	}

	dmz_unlock_flush(zmd);

	return ret;
}

/*
 * Move valid blocks of the random data zone dzone into a free sequential zone.
 * Once blocks are moved, remap the zone chunk to the sequential zone.
 */
static int dmz_reclaim_rnd_data(struct dmz_reclaim *zrc, struct dm_zone *dzone)
{
	unsigned int chunk = dzone->chunk;
	struct dm_zone *szone = NULL;
	struct dmz_metadata *zmd = zrc->metadata;
	int ret;

	/* Get a free sequential zone */
	dmz_lock_map(zmd);
	szone = dmz_alloc_zone(zmd, DMZ_ALLOC_RECLAIM);
	dmz_unlock_map(zmd);
	if (!szone)
		return -ENOSPC;

	dmz_dev_debug(zrc->dev,
		      "Chunk %u, move rnd zone %u (weight %u) to seq zone %u",
		      chunk, dmz_id(zmd, dzone), dmz_weight(dzone),
		      dmz_id(zmd, szone));

	/* Flush the random data zone into the sequential zone */
	ret = dmz_reclaim_copy(zrc, dzone, szone);

	dmz_lock_flush(zmd);

	if (ret == 0) {
		/* Validate copied blocks */
		ret = dmz_copy_valid_blocks(zmd, dzone, szone);
	}
	if (ret) {
		/* Free the sequential zone */
		dmz_lock_map(zmd);
		dmz_free_zone(zmd, szone);
		dmz_unlock_map(zmd);
	} else {
		/* Free the data zone and remap the chunk */
		dmz_invalidate_blocks(zmd, dzone, 0, zrc->dev->zone_nr_blocks);
		dmz_lock_map(zmd);
		dmz_unmap_zone(zmd, dzone);
		dmz_unlock_zone_reclaim(dzone);
		dmz_free_zone(zmd, dzone);
		dmz_map_zone(zmd, szone, chunk);
		dmz_unlock_map(zmd);
	}

	dmz_unlock_flush(zmd);

	return ret;
}

/*
 * Reclaim an empty zone.
 */
static void dmz_reclaim_empty(struct dmz_reclaim *zrc, struct dm_zone *dzone)
{
	struct dmz_metadata *zmd = zrc->metadata;

	dmz_lock_flush(zmd);
	dmz_lock_map(zmd);
	dmz_unmap_zone(zmd, dzone);
	dmz_unlock_zone_reclaim(dzone);
	dmz_free_zone(zmd, dzone);
	dmz_unlock_map(zmd);
	dmz_unlock_flush(zmd);
}
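
/*
 * Summary of the reclaim cases dispatched by dmz_do_reclaim() below:
 *   - an unused (zero weight) random data zone is simply unmapped and
 *     freed (dmz_reclaim_empty());
 *   - a random data zone with valid blocks is copied into a newly
 *     allocated sequential zone (dmz_reclaim_rnd_data());
 *   - a buffered sequential data zone is reclaimed either by flushing
 *     its buffer zone into it (dmz_reclaim_buf()) or, if the buffer
 *     holds valid blocks below the data zone write pointer, by merging
 *     the data zone into the buffer zone (dmz_reclaim_seq_data()).
 */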

/*
 * Find a candidate zone for reclaim and process it.
 */
static int dmz_do_reclaim(struct dmz_reclaim *zrc)
{
	struct dmz_metadata *zmd = zrc->metadata;
	struct dm_zone *dzone;
	struct dm_zone *rzone;
	unsigned long start;
	int ret;

	/* Get a data zone */
	dzone = dmz_get_zone_for_reclaim(zmd);
	if (IS_ERR(dzone))
		return PTR_ERR(dzone);

	start = jiffies;

	if (dmz_is_rnd(dzone)) {
		if (!dmz_weight(dzone)) {
			/* Empty zone */
			dmz_reclaim_empty(zrc, dzone);
			ret = 0;
		} else {
			/*
			 * Reclaim the random data zone by moving its
			 * valid data blocks to a free sequential zone.
			 */
			ret = dmz_reclaim_rnd_data(zrc, dzone);
		}
		rzone = dzone;

	} else {
		struct dm_zone *bzone = dzone->bzone;
		sector_t chunk_block = 0;

		ret = dmz_first_valid_block(zmd, bzone, &chunk_block);
		if (ret < 0)
			goto out;

		if (ret == 0 || chunk_block >= dzone->wp_block) {
			/*
			 * The buffer zone is empty or its valid blocks are
			 * after the data zone write pointer.
			 */
			ret = dmz_reclaim_buf(zrc, dzone);
			rzone = bzone;
		} else {
			/*
			 * Reclaim the data zone by merging it into the
			 * buffer zone so that the buffer zone itself can
			 * be later reclaimed.
			 */
			ret = dmz_reclaim_seq_data(zrc, dzone);
			rzone = dzone;
		}
	}
out:
	if (ret) {
		dmz_unlock_zone_reclaim(dzone);
		return ret;
	}

	ret = dmz_flush_metadata(zrc->metadata);
	if (ret) {
		dmz_dev_debug(zrc->dev,
			      "Metadata flush for zone %u failed, err %d",
			      dmz_id(zmd, rzone), ret);
		return ret;
	}

	dmz_dev_debug(zrc->dev, "Reclaimed zone %u in %u ms",
		      dmz_id(zmd, rzone), jiffies_to_msecs(jiffies - start));
	return 0;
}

/*
 * Test if the target device is idle.
 */
static inline int dmz_target_idle(struct dmz_reclaim *zrc)
{
	return time_is_before_jiffies(zrc->atime + DMZ_IDLE_PERIOD);
}

/*
 * Test if reclaim is necessary.
 */
static bool dmz_should_reclaim(struct dmz_reclaim *zrc)
{
	struct dmz_metadata *zmd = zrc->metadata;
	unsigned int nr_rnd = dmz_nr_rnd_zones(zmd);
	unsigned int nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd);
	unsigned int p_unmap_rnd = nr_unmap_rnd * 100 / nr_rnd;

	/* Reclaim when idle */
	if (dmz_target_idle(zrc) && nr_unmap_rnd < nr_rnd)
		return true;

	/* If there are still plenty of unmapped random zones, do not reclaim */
	if (p_unmap_rnd >= DMZ_RECLAIM_HIGH_UNMAP_RND)
		return false;

	/*
	 * If the percentage of unmapped random zones is low,
	 * reclaim even if the target is busy.
	 */
	return p_unmap_rnd <= DMZ_RECLAIM_LOW_UNMAP_RND;
}
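
/*
 * Reclaim throttling (see dmz_reclaim_work() below): when the target is
 * idle, or when fewer than DMZ_RECLAIM_LOW_UNMAP_RND / 2 percent of the
 * random zones are unmapped, kcopyd runs unthrottled (100%). Otherwise
 * the copy rate is capped at min(75, 100 - p_unmap_rnd / 2) percent; for
 * example, a busy target with 20% of its random zones unmapped gets a
 * throttle of min(75, 100 - 10) = 75%.
 */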

/*
 * Reclaim work function.
 */
static void dmz_reclaim_work(struct work_struct *work)
{
	struct dmz_reclaim *zrc = container_of(work, struct dmz_reclaim, work.work);
	struct dmz_metadata *zmd = zrc->metadata;
	unsigned int nr_rnd, nr_unmap_rnd;
	unsigned int p_unmap_rnd;
	int ret;

	if (dmz_bdev_is_dying(zrc->dev))
		return;

	if (!dmz_should_reclaim(zrc)) {
		mod_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
		return;
	}

	/*
	 * We need to start reclaiming random zones: set up zone copy
	 * throttling to either go fast if we are very low on free random
	 * zones, or slower if some free random zones remain, to limit the
	 * impact on the user workload as much as possible.
	 */
	nr_rnd = dmz_nr_rnd_zones(zmd);
	nr_unmap_rnd = dmz_nr_unmap_rnd_zones(zmd);
	p_unmap_rnd = nr_unmap_rnd * 100 / nr_rnd;
	if (dmz_target_idle(zrc) || p_unmap_rnd < DMZ_RECLAIM_LOW_UNMAP_RND / 2) {
		/* Idle or very low percentage: go fast */
		zrc->kc_throttle.throttle = 100;
	} else {
		/* Busy but some random zones are still free: throttle */
		zrc->kc_throttle.throttle = min(75U, 100U - p_unmap_rnd / 2);
	}

	dmz_dev_debug(zrc->dev,
		      "Reclaim (%u): %s, %u%% free rnd zones (%u/%u)",
		      zrc->kc_throttle.throttle,
		      (dmz_target_idle(zrc) ? "Idle" : "Busy"),
		      p_unmap_rnd, nr_unmap_rnd, nr_rnd);

	ret = dmz_do_reclaim(zrc);
	if (ret) {
		dmz_dev_debug(zrc->dev, "Reclaim error %d", ret);
		if (!dmz_check_bdev(zrc->dev))
			return;
	}

	dmz_schedule_reclaim(zrc);
}

/*
 * Initialize reclaim.
 */
int dmz_ctr_reclaim(struct dmz_dev *dev, struct dmz_metadata *zmd,
		    struct dmz_reclaim **reclaim)
{
	struct dmz_reclaim *zrc;
	int ret;

	zrc = kzalloc(sizeof(struct dmz_reclaim), GFP_KERNEL);
	if (!zrc)
		return -ENOMEM;

	zrc->dev = dev;
	zrc->metadata = zmd;
	zrc->atime = jiffies;

	/* Reclaim kcopyd client */
	zrc->kc = dm_kcopyd_client_create(&zrc->kc_throttle);
	if (IS_ERR(zrc->kc)) {
		ret = PTR_ERR(zrc->kc);
		zrc->kc = NULL;
		goto err;
	}

	/* Reclaim work */
	INIT_DELAYED_WORK(&zrc->work, dmz_reclaim_work);
	zrc->wq = alloc_ordered_workqueue("dmz_rwq_%s", WQ_MEM_RECLAIM,
					  dev->name);
	if (!zrc->wq) {
		ret = -ENOMEM;
		goto err;
	}

	*reclaim = zrc;
	queue_delayed_work(zrc->wq, &zrc->work, 0);

	return 0;
err:
	if (zrc->kc)
		dm_kcopyd_client_destroy(zrc->kc);
	kfree(zrc);

	return ret;
}

/*
 * Terminate reclaim.
 */
void dmz_dtr_reclaim(struct dmz_reclaim *zrc)
{
	cancel_delayed_work_sync(&zrc->work);
	destroy_workqueue(zrc->wq);
	dm_kcopyd_client_destroy(zrc->kc);
	kfree(zrc);
}

/*
 * Suspend reclaim.
 */
void dmz_suspend_reclaim(struct dmz_reclaim *zrc)
{
	cancel_delayed_work_sync(&zrc->work);
}

/*
 * Resume reclaim.
 */
void dmz_resume_reclaim(struct dmz_reclaim *zrc)
{
	queue_delayed_work(zrc->wq, &zrc->work, DMZ_IDLE_PERIOD);
}

/*
 * BIO accounting.
 */
void dmz_reclaim_bio_acc(struct dmz_reclaim *zrc)
{
	zrc->atime = jiffies;
}

/*
 * Start reclaim if necessary.
 */
void dmz_schedule_reclaim(struct dmz_reclaim *zrc)
{
	if (dmz_should_reclaim(zrc))
		mod_delayed_work(zrc->wq, &zrc->work, 0);
}
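
/*
 * Expected usage of this interface (a sketch only; the actual call sites
 * live in the dm-zoned target code, dm-zoned-target.c):
 *
 *	struct dmz_reclaim *zrc;
 *
 *	dmz_ctr_reclaim(dev, zmd, &zrc);	// at target construction
 *	dmz_reclaim_bio_acc(zrc);		// on user BIO activity
 *	dmz_schedule_reclaim(zrc);		// to kick reclaim when needed
 *	dmz_suspend_reclaim(zrc);		// around target suspend
 *	dmz_resume_reclaim(zrc);		// around target resume
 *	dmz_dtr_reclaim(zrc);			// at target destruction
 */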