/*
 * Copyright (C) 2009-2011 Red Hat, Inc.
 *
 * Author: Mikulas Patocka <mpatocka@redhat.com>
 *
 * This file is released under the GPL.
 */

#include "dm-bufio.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/shrinker.h>
#include <linux/module.h>

#define DM_MSG_PREFIX "bufio"

/*
 * Memory management policy:
 *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
 *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
 *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
 *	Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
 *	dirty buffers.
 */
#define DM_BUFIO_MIN_BUFFERS		8

#define DM_BUFIO_MEMORY_PERCENT		2
#define DM_BUFIO_VMALLOC_PERCENT	25
#define DM_BUFIO_WRITEBACK_PERCENT	75

/*
 * Check buffer ages in this interval (seconds)
 */
#define DM_BUFIO_WORK_TIMER_SECS	10

/*
 * Free buffers when they are older than this (seconds)
 */
#define DM_BUFIO_DEFAULT_AGE_SECS	60

/*
 * The number of bvec entries that are embedded directly in the buffer.
 * If the chunk size is larger, dm-io is used to do the io.
 */
#define DM_BUFIO_INLINE_VECS		16

/*
 * Buffer hash
 */
#define DM_BUFIO_HASH_BITS	20
#define DM_BUFIO_HASH(block) \
	((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
	 ((1 << DM_BUFIO_HASH_BITS) - 1))

/*
 * Don't try to use kmem_cache_alloc for blocks larger than this.
 * For explanation, see alloc_buffer_data below.
 */
#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT	(PAGE_SIZE >> 1)
#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT	(PAGE_SIZE << (MAX_ORDER - 1))

/*
 * dm_buffer->list_mode
 */
#define LIST_CLEAN	0
#define LIST_DIRTY	1
#define LIST_SIZE	2

/*
 * Linking of buffers:
 *	All buffers are linked to cache_hash with their hash_list field.
 *
 *	Clean buffers that are not being written (B_WRITING not set)
 *	are linked to lru[LIST_CLEAN] with their lru_list field.
 *
 *	Dirty and clean buffers that are being written are linked to
 *	lru[LIST_DIRTY] with their lru_list field. When the write
 *	finishes, the buffer cannot be relinked immediately (because we
 *	are in an interrupt context and relinking requires process
 *	context), so some clean-not-writing buffers can be held on
 *	dirty_lru too. They are later added to lru in the process
 *	context.
 */
struct dm_bufio_client {
	struct mutex lock;

	struct list_head lru[LIST_SIZE];
	unsigned long n_buffers[LIST_SIZE];

	struct block_device *bdev;
	unsigned block_size;
	unsigned char sectors_per_block_bits;
	unsigned char pages_per_block_bits;
	unsigned char blocks_per_page_bits;
	unsigned aux_size;
	void (*alloc_callback)(struct dm_buffer *);
	void (*write_callback)(struct dm_buffer *);

	struct dm_io_client *dm_io;

	struct list_head reserved_buffers;
	unsigned need_reserved_buffers;

	struct hlist_head *cache_hash;
	wait_queue_head_t free_buffer_wait;

	int async_write_error;

	struct list_head client_list;
	struct shrinker shrinker;
};

/*
 * Buffer state bits.
 */
#define B_READING	0
#define B_WRITING	1
#define B_DIRTY		2
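/*
 * B_READING: a read is in flight and the buffer data is not valid yet.
 * B_WRITING: a write is in flight; it is cleared from the write endio,
 *	which runs in interrupt context.
 * B_DIRTY: the buffer holds data that has not yet been written back.
 */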
/*
 * Describes how the block was allocated:
 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 * See the comment at alloc_buffer_data.
 */
enum data_mode {
	DATA_MODE_SLAB = 0,
	DATA_MODE_GET_FREE_PAGES = 1,
	DATA_MODE_VMALLOC = 2,
	DATA_MODE_LIMIT = 3
};

struct dm_buffer {
	struct hlist_node hash_list;
	struct list_head lru_list;
	sector_t block;
	void *data;
	enum data_mode data_mode;
	unsigned char list_mode;		/* LIST_* */
	unsigned hold_count;
	int read_error;
	int write_error;
	unsigned long state;
	unsigned long last_accessed;
	struct dm_bufio_client *c;
	struct list_head write_list;
	struct bio bio;
	struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
};

/*----------------------------------------------------------------*/

static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];

static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
{
	unsigned ret = c->blocks_per_page_bits - 1;

	BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches));

	return ret;
}

#define DM_BUFIO_CACHE(c)	(dm_bufio_caches[dm_bufio_cache_index(c)])
#define DM_BUFIO_CACHE_NAME(c)	(dm_bufio_cache_names[dm_bufio_cache_index(c)])

#define dm_bufio_in_request()	(!!current->bio_list)

static void dm_bufio_lock(struct dm_bufio_client *c)
{
	mutex_lock_nested(&c->lock, dm_bufio_in_request());
}

static int dm_bufio_trylock(struct dm_bufio_client *c)
{
	return mutex_trylock(&c->lock);
}

static void dm_bufio_unlock(struct dm_bufio_client *c)
{
	mutex_unlock(&c->lock);
}

/*
 * FIXME Move to sched.h?
 */
#ifdef CONFIG_PREEMPT_VOLUNTARY
#  define dm_bufio_cond_resched()		\
do {						\
	if (unlikely(need_resched()))		\
		_cond_resched();		\
} while (0)
#else
#  define dm_bufio_cond_resched()		do { } while (0)
#endif

/*----------------------------------------------------------------*/

/*
 * Default cache size: available memory divided by the ratio.
 */
static unsigned long dm_bufio_default_cache_size;

/*
 * Total cache size set by the user.
 */
static unsigned long dm_bufio_cache_size;

/*
 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
 * at any time. If it disagrees, the user has changed cache size.
 */
static unsigned long dm_bufio_cache_size_latch;

static DEFINE_SPINLOCK(param_spinlock);

/*
 * Buffers are freed after this timeout
 */
static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;

static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
static unsigned long dm_bufio_allocated_get_free_pages;
static unsigned long dm_bufio_allocated_vmalloc;
static unsigned long dm_bufio_current_allocated;

/*----------------------------------------------------------------*/

/*
 * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
 */
static unsigned long dm_bufio_cache_size_per_client;

/*
 * The current number of clients.
 */
static int dm_bufio_client_count;

/*
 * The list of all clients.
 */
static LIST_HEAD(dm_bufio_all_clients);

/*
 * This mutex protects dm_bufio_cache_size_latch,
 * dm_bufio_cache_size_per_client and dm_bufio_client_count
 */
static DEFINE_MUTEX(dm_bufio_clients_lock);

/*----------------------------------------------------------------*/

static void adjust_total_allocated(enum data_mode data_mode, long diff)
{
	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
		&dm_bufio_allocated_kmem_cache,
		&dm_bufio_allocated_get_free_pages,
		&dm_bufio_allocated_vmalloc,
	};

	spin_lock(&param_spinlock);

	*class_ptr[data_mode] += diff;

	dm_bufio_current_allocated += diff;

	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
		dm_bufio_peak_allocated = dm_bufio_current_allocated;

	spin_unlock(&param_spinlock);
}

/*
 * Change the number of clients and recalculate per-client limit.
 */
static void __cache_size_refresh(void)
{
	BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
	BUG_ON(dm_bufio_client_count < 0);

	dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size);

	/*
	 * Use default if set to 0 and report the actual cache size used.
	 */
	if (!dm_bufio_cache_size_latch) {
		(void)cmpxchg(&dm_bufio_cache_size, 0,
			      dm_bufio_default_cache_size);
		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
	}

	dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
					 (dm_bufio_client_count ? : 1);
}

/*
 * Allocating buffer data.
 *
 * Small buffers are allocated with kmem_cache, to use space optimally.
 *
 * For large buffers, we choose between get_free_pages and vmalloc.
 * Each has advantages and disadvantages.
 *
 * __get_free_pages can randomly fail if the memory is fragmented.
 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
 * as low as 128M) so using it for caching is not appropriate.
 *
 * If the allocation may fail we use __get_free_pages. Memory fragmentation
 * won't have a fatal effect here, but it just causes flushes of some other
 * buffers and more I/O will be performed. Don't use __get_free_pages if it
 * always fails (i.e. order >= MAX_ORDER).
 *
 * If the allocation shouldn't fail we use __vmalloc. This is only for the
 * initial reserve allocation, so there's no risk of wasting all vmalloc
 * space.
 */
static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
			       enum data_mode *data_mode)
{
	unsigned noio_flag;
	void *ptr;

	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
		*data_mode = DATA_MODE_SLAB;
		return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
	}

	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT &&
	    gfp_mask & __GFP_NORETRY) {
		*data_mode = DATA_MODE_GET_FREE_PAGES;
		return (void *)__get_free_pages(gfp_mask,
						c->pages_per_block_bits);
	}

	*data_mode = DATA_MODE_VMALLOC;

	/*
	 * __vmalloc allocates the data pages and auxiliary structures with
	 * gfp_flags that were specified, but pagetables are always allocated
	 * with GFP_KERNEL, no matter what was specified as gfp_mask.
	 *
	 * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
	 * all allocations done by this process (including pagetables) are done
	 * as if GFP_NOIO was specified.
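	 *
	 * In this file __GFP_NORETRY is only passed together with GFP_NOIO
	 * (the reserved buffers are allocated with plain GFP_KERNEL), so it
	 * is used below to decide when this protection is needed.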
	 */

	if (gfp_mask & __GFP_NORETRY)
		noio_flag = memalloc_noio_save();

	ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL);

	if (gfp_mask & __GFP_NORETRY)
		memalloc_noio_restore(noio_flag);

	return ptr;
}

/*
 * Free buffer's data.
 */
static void free_buffer_data(struct dm_bufio_client *c,
			     void *data, enum data_mode data_mode)
{
	switch (data_mode) {
	case DATA_MODE_SLAB:
		kmem_cache_free(DM_BUFIO_CACHE(c), data);
		break;

	case DATA_MODE_GET_FREE_PAGES:
		free_pages((unsigned long)data, c->pages_per_block_bits);
		break;

	case DATA_MODE_VMALLOC:
		vfree(data);
		break;

	default:
		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
		       data_mode);
		BUG();
	}
}

/*
 * Allocate buffer and its data.
 */
static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
{
	struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size,
				      gfp_mask);

	if (!b)
		return NULL;

	b->c = c;

	b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
	if (!b->data) {
		kfree(b);
		return NULL;
	}

	adjust_total_allocated(b->data_mode, (long)c->block_size);

	return b;
}

/*
 * Free buffer and its data.
 */
static void free_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	adjust_total_allocated(b->data_mode, -(long)c->block_size);

	free_buffer_data(c, b->data, b->data_mode);
	kfree(b);
}

/*
 * Link buffer to the hash list and clean or dirty queue.
 */
static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
{
	struct dm_bufio_client *c = b->c;

	c->n_buffers[dirty]++;
	b->block = block;
	b->list_mode = dirty;
	list_add(&b->lru_list, &c->lru[dirty]);
	hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
	b->last_accessed = jiffies;
}

/*
 * Unlink buffer from the hash list and dirty or clean queue.
 */
static void __unlink_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	hlist_del(&b->hash_list);
	list_del(&b->lru_list);
}

/*
 * Place the buffer to the head of dirty or clean LRU queue.
 */
static void __relink_lru(struct dm_buffer *b, int dirty)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	c->n_buffers[dirty]++;
	b->list_mode = dirty;
	list_move(&b->lru_list, &c->lru[dirty]);
}

/*----------------------------------------------------------------
 * Submit I/O on the buffer.
 *
 * Bio interface is faster but it has some problems:
 *	the vector list is limited (increasing this limit increases
 *	memory-consumption per buffer, so it is not viable);
 *
 *	the memory must be direct-mapped, not vmalloced;
 *
 *	the I/O driver can reject requests spuriously if it thinks that
 *	the requests are too big for the device or if they cross a
 *	controller-defined memory boundary.
 *
 * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
 * it is not vmalloced, try using the bio interface.
 *
 * If the buffer is big, if it is vmalloced or if the underlying device
 * rejects the bio because it is too large, use dm-io layer to do the I/O.
 * The dm-io layer splits the I/O into multiple requests, avoiding the above
 * shortcomings.
 *--------------------------------------------------------------*/

/*
 * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
 * that the request was handled directly with bio interface.
 */
static void dmio_complete(unsigned long error, void *context)
{
	struct dm_buffer *b = context;

	b->bio.bi_end_io(&b->bio, error ? -EIO : 0);
}

static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
		     bio_end_io_t *end_io)
{
	int r;
	struct dm_io_request io_req = {
		.bi_rw = rw,
		.notify.fn = dmio_complete,
		.notify.context = b,
		.client = b->c->dm_io,
	};
	struct dm_io_region region = {
		.bdev = b->c->bdev,
		.sector = block << b->c->sectors_per_block_bits,
		.count = b->c->block_size >> SECTOR_SHIFT,
	};

	if (b->data_mode != DATA_MODE_VMALLOC) {
		io_req.mem.type = DM_IO_KMEM;
		io_req.mem.ptr.addr = b->data;
	} else {
		io_req.mem.type = DM_IO_VMA;
		io_req.mem.ptr.vma = b->data;
	}

	b->bio.bi_end_io = end_io;

	r = dm_io(&io_req, 1, &region, NULL);
	if (r)
		end_io(&b->bio, r);
}

static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
			   bio_end_io_t *end_io)
{
	char *ptr;
	int len;

	bio_init(&b->bio);
	b->bio.bi_io_vec = b->bio_vec;
	b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
	b->bio.bi_sector = block << b->c->sectors_per_block_bits;
	b->bio.bi_bdev = b->c->bdev;
	b->bio.bi_end_io = end_io;

	/*
	 * We assume that if len >= PAGE_SIZE ptr is page-aligned.
	 * If len < PAGE_SIZE the buffer doesn't cross page boundary.
	 */
	ptr = b->data;
	len = b->c->block_size;

	if (len >= PAGE_SIZE)
		BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
	else
		BUG_ON((unsigned long)ptr & (len - 1));

	do {
		if (!bio_add_page(&b->bio, virt_to_page(ptr),
				  len < PAGE_SIZE ? len : PAGE_SIZE,
				  virt_to_phys(ptr) & (PAGE_SIZE - 1))) {
			BUG_ON(b->c->block_size <= PAGE_SIZE);
			use_dmio(b, rw, block, end_io);
			return;
		}

		len -= PAGE_SIZE;
		ptr += PAGE_SIZE;
	} while (len > 0);

	submit_bio(rw, &b->bio);
}

static void submit_io(struct dm_buffer *b, int rw, sector_t block,
		      bio_end_io_t *end_io)
{
	if (rw == WRITE && b->c->write_callback)
		b->c->write_callback(b);

	if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE &&
	    b->data_mode != DATA_MODE_VMALLOC)
		use_inline_bio(b, rw, block, end_io);
	else
		use_dmio(b, rw, block, end_io);
}

/*----------------------------------------------------------------
 * Writing dirty buffers
 *--------------------------------------------------------------*/

/*
 * The endio routine for write.
 *
 * Set the error, clear B_WRITING bit and wake anyone who was waiting on
 * it.
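 *
 * This runs in interrupt context, so the buffer cannot be relinked to the
 * clean list here; that happens later in process context (see
 * __write_dirty_buffers_async and dm_bufio_write_dirty_buffers).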
 */
static void write_endio(struct bio *bio, int error)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->write_error = error;
	if (unlikely(error)) {
		struct dm_bufio_client *c = b->c;
		(void)cmpxchg(&c->async_write_error, 0, error);
	}

	BUG_ON(!test_bit(B_WRITING, &b->state));

	smp_mb__before_clear_bit();
	clear_bit(B_WRITING, &b->state);
	smp_mb__after_clear_bit();

	wake_up_bit(&b->state, B_WRITING);
}

/*
 * This function is called when wait_on_bit is actually waiting.
 */
static int do_io_schedule(void *word)
{
	io_schedule();

	return 0;
}

/*
 * Initiate a write on a dirty buffer, but don't wait for it.
 *
 * - If the buffer is not dirty, exit.
 * - If there is some previous write going on, wait for it to finish (we can't
 *   have two writes on the same buffer simultaneously).
 * - Submit our write and don't wait on it. We set B_WRITING indicating
 *   that there is a write in progress.
 */
static void __write_dirty_buffer(struct dm_buffer *b,
				 struct list_head *write_list)
{
	if (!test_bit(B_DIRTY, &b->state))
		return;

	clear_bit(B_DIRTY, &b->state);
	wait_on_bit_lock(&b->state, B_WRITING,
			 do_io_schedule, TASK_UNINTERRUPTIBLE);

	if (!write_list)
		submit_io(b, WRITE, b->block, write_endio);
	else
		list_add_tail(&b->write_list, write_list);
}

static void __flush_write_list(struct list_head *write_list)
{
	struct blk_plug plug;
	blk_start_plug(&plug);
	while (!list_empty(write_list)) {
		struct dm_buffer *b =
			list_entry(write_list->next, struct dm_buffer, write_list);
		list_del(&b->write_list);
		submit_io(b, WRITE, b->block, write_endio);
		dm_bufio_cond_resched();
	}
	blk_finish_plug(&plug);
}

/*
 * Wait until any activity on the buffer finishes. Possibly write the
 * buffer if it is dirty. When this function finishes, there is no I/O
 * running on the buffer and the buffer is not dirty.
 */
static void __make_buffer_clean(struct dm_buffer *b)
{
	BUG_ON(b->hold_count);

	if (!b->state)	/* fast case */
		return;

	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
	__write_dirty_buffer(b, NULL);
	wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
}

/*
 * Find some buffer that is not held by anybody, clean it, unlink it and
 * return it.
 */
static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
{
	struct dm_buffer *b;

	list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
		BUG_ON(test_bit(B_WRITING, &b->state));
		BUG_ON(test_bit(B_DIRTY, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	return NULL;
}

/*
 * Wait until some other threads free some buffer or release hold count on
 * some buffer.
 *
 * This function is entered with c->lock held, drops it and regains it
 * before exiting.
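 *
 * The wake-up may not be for a buffer this caller can use, so the caller
 * is expected to recheck its condition and retry after this returns.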
 */
static void __wait_for_free_buffer(struct dm_bufio_client *c)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&c->free_buffer_wait, &wait);
	set_task_state(current, TASK_UNINTERRUPTIBLE);
	dm_bufio_unlock(c);

	io_schedule();

	set_task_state(current, TASK_RUNNING);
	remove_wait_queue(&c->free_buffer_wait, &wait);

	dm_bufio_lock(c);
}

enum new_flag {
	NF_FRESH = 0,
	NF_READ = 1,
	NF_GET = 2,
	NF_PREFETCH = 3
};

/*
 * Allocate a new buffer. If the allocation is not possible, wait until
 * some other thread frees a buffer.
 *
 * May drop the lock and regain it.
 */
static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
{
	struct dm_buffer *b;

	/*
	 * dm-bufio is resistant to allocation failures (it just keeps
	 * one buffer reserved in case all the allocations fail).
	 * So set flags to not try too hard:
	 *	GFP_NOIO: don't recurse into the I/O layer
	 *	__GFP_NORETRY: don't retry and rather return failure
	 *	__GFP_NOMEMALLOC: don't use emergency reserves
	 *	__GFP_NOWARN: don't print a warning in case of failure
	 *
	 * For debugging, if we set the cache size to 1, no new buffers will
	 * be allocated.
	 */
	while (1) {
		if (dm_bufio_cache_size_latch != 1) {
			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			if (b)
				return b;
		}

		if (nf == NF_PREFETCH)
			return NULL;

		if (!list_empty(&c->reserved_buffers)) {
			b = list_entry(c->reserved_buffers.next,
				       struct dm_buffer, lru_list);
			list_del(&b->lru_list);
			c->need_reserved_buffers++;

			return b;
		}

		b = __get_unclaimed_buffer(c);
		if (b)
			return b;

		__wait_for_free_buffer(c);
	}
}

static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
{
	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);

	if (!b)
		return NULL;

	if (c->alloc_callback)
		c->alloc_callback(b);

	return b;
}

/*
 * Free a buffer and wake other threads waiting for free buffers.
 */
static void __free_buffer_wake(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	if (!c->need_reserved_buffers)
		free_buffer(b);
	else {
		list_add(&b->lru_list, &c->reserved_buffers);
		c->need_reserved_buffers--;
	}

	wake_up(&c->free_buffer_wait);
}

static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
					struct list_head *write_list)
{
	struct dm_buffer *b, *tmp;

	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state)) {
			__relink_lru(b, LIST_CLEAN);
			continue;
		}

		if (no_wait && test_bit(B_WRITING, &b->state))
			return;

		__write_dirty_buffer(b, write_list);
		dm_bufio_cond_resched();
	}
}

/*
 * Get writeback threshold and buffer limit for a given client.
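 *
 * For example, with a 100 MB per-client share and 4 KB blocks the limit
 * is 25600 buffers and background writeback starts once 19200 of them
 * (DM_BUFIO_WRITEBACK_PERCENT = 75) are dirty.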
 */
static void __get_memory_limit(struct dm_bufio_client *c,
			       unsigned long *threshold_buffers,
			       unsigned long *limit_buffers)
{
	unsigned long buffers;

	if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) {
		mutex_lock(&dm_bufio_clients_lock);
		__cache_size_refresh();
		mutex_unlock(&dm_bufio_clients_lock);
	}

	buffers = dm_bufio_cache_size_per_client >>
		  (c->sectors_per_block_bits + SECTOR_SHIFT);

	if (buffers < DM_BUFIO_MIN_BUFFERS)
		buffers = DM_BUFIO_MIN_BUFFERS;

	*limit_buffers = buffers;
	*threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
}

/*
 * Check if we're over watermark.
 * If we are over threshold_buffers, start freeing buffers.
 * If we're over "limit_buffers", block until we get under the limit.
 */
static void __check_watermark(struct dm_bufio_client *c,
			      struct list_head *write_list)
{
	unsigned long threshold_buffers, limit_buffers;

	__get_memory_limit(c, &threshold_buffers, &limit_buffers);

	while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
	       limit_buffers) {

		struct dm_buffer *b = __get_unclaimed_buffer(c);

		if (!b)
			return;

		__free_buffer_wake(b);
		dm_bufio_cond_resched();
	}

	if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
		__write_dirty_buffers_async(c, 1, write_list);
}

/*
 * Find a buffer in the hash.
 */
static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
{
	struct dm_buffer *b;

	hlist_for_each_entry(b, &c->cache_hash[DM_BUFIO_HASH(block)],
			     hash_list) {
		dm_bufio_cond_resched();
		if (b->block == block)
			return b;
	}

	return NULL;
}

/*----------------------------------------------------------------
 * Getting a buffer
 *--------------------------------------------------------------*/

static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
				     enum new_flag nf, int *need_submit,
				     struct list_head *write_list)
{
	struct dm_buffer *b, *new_b = NULL;

	*need_submit = 0;

	b = __find(c, block);
	if (b)
		goto found_buffer;

	if (nf == NF_GET)
		return NULL;

	new_b = __alloc_buffer_wait(c, nf);
	if (!new_b)
		return NULL;

	/*
	 * We've had a period where the mutex was unlocked, so need to
	 * recheck the hash table.
	 */
	b = __find(c, block);
	if (b) {
		__free_buffer_wake(new_b);
		goto found_buffer;
	}

	__check_watermark(c, write_list);

	b = new_b;
	b->hold_count = 1;
	b->read_error = 0;
	b->write_error = 0;
	__link_buffer(b, block, LIST_CLEAN);

	if (nf == NF_FRESH) {
		b->state = 0;
		return b;
	}

	b->state = 1 << B_READING;
	*need_submit = 1;

	return b;

found_buffer:
	if (nf == NF_PREFETCH)
		return NULL;
	/*
	 * Note: it is essential that we don't wait for the buffer to be
	 * read if dm_bufio_get function is used. Both dm_bufio_get and
	 * dm_bufio_prefetch can be used in the driver request routine.
	 * If the user called both dm_bufio_prefetch and dm_bufio_get on
	 * the same buffer, it would deadlock if we waited.
	 */
	if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
		return NULL;

	b->hold_count++;
	__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
		     test_bit(B_WRITING, &b->state));
	return b;
}

/*
 * The endio routine for reading: set the error, clear the bit and wake up
 * anyone waiting on the buffer.
 */
static void read_endio(struct bio *bio, int error)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->read_error = error;

	BUG_ON(!test_bit(B_READING, &b->state));

	smp_mb__before_clear_bit();
	clear_bit(B_READING, &b->state);
	smp_mb__after_clear_bit();

	wake_up_bit(&b->state, B_READING);
}

/*
 * A common routine for dm_bufio_new and dm_bufio_read. Operation of these
 * functions is similar except that dm_bufio_new doesn't read the
 * buffer from the disk (assuming that the caller overwrites all the data
 * and uses dm_bufio_mark_buffer_dirty to write new data back).
 */
static void *new_read(struct dm_bufio_client *c, sector_t block,
		      enum new_flag nf, struct dm_buffer **bp)
{
	int need_submit;
	struct dm_buffer *b;

	LIST_HEAD(write_list);

	dm_bufio_lock(c);
	b = __bufio_new(c, block, nf, &need_submit, &write_list);
	dm_bufio_unlock(c);

	__flush_write_list(&write_list);

	if (!b)
		return b;

	if (need_submit)
		submit_io(b, READ, b->block, read_endio);

	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);

	if (b->read_error) {
		int error = b->read_error;

		dm_bufio_release(b);

		return ERR_PTR(error);
	}

	*bp = b;

	return b->data;
}

void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	return new_read(c, block, NF_GET, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_get);

void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
		    struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_READ, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_read);

void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_FRESH, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_new);

void dm_bufio_prefetch(struct dm_bufio_client *c,
		       sector_t block, unsigned n_blocks)
{
	struct blk_plug plug;

	LIST_HEAD(write_list);

	BUG_ON(dm_bufio_in_request());

	blk_start_plug(&plug);
	dm_bufio_lock(c);

	for (; n_blocks--; block++) {
		int need_submit;
		struct dm_buffer *b;
		b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
				&write_list);
		if (unlikely(!list_empty(&write_list))) {
			dm_bufio_unlock(c);
			blk_finish_plug(&plug);
			__flush_write_list(&write_list);
			blk_start_plug(&plug);
			dm_bufio_lock(c);
		}
		if (unlikely(b != NULL)) {
			dm_bufio_unlock(c);

			if (need_submit)
				submit_io(b, READ, b->block, read_endio);
			dm_bufio_release(b);

			dm_bufio_cond_resched();

			if (!n_blocks)
				goto flush_plug;
			dm_bufio_lock(c);
		}
	}

	dm_bufio_unlock(c);

flush_plug:
	blk_finish_plug(&plug);
}
EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
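/*
 * Release a reference obtained with dm_bufio_get, dm_bufio_read or
 * dm_bufio_new. The buffer stays in the cache; it is freed right away only
 * if it has an I/O error and is neither dirty nor under I/O.
 */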
void dm_bufio_release(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	BUG_ON(!b->hold_count);

	b->hold_count--;
	if (!b->hold_count) {
		wake_up(&c->free_buffer_wait);

		/*
		 * If there were errors on the buffer, and the buffer is not
		 * to be written, free the buffer. There is no point in
		 * caching an invalid buffer.
		 */
		if ((b->read_error || b->write_error) &&
		    !test_bit(B_READING, &b->state) &&
		    !test_bit(B_WRITING, &b->state) &&
		    !test_bit(B_DIRTY, &b->state)) {
			__unlink_buffer(b);
			__free_buffer_wake(b);
		}
	}

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_release);

void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	BUG_ON(test_bit(B_READING, &b->state));

	if (!test_and_set_bit(B_DIRTY, &b->state))
		__relink_lru(b, LIST_DIRTY);

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);

void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
{
	LIST_HEAD(write_list);

	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0, &write_list);
	dm_bufio_unlock(c);
	__flush_write_list(&write_list);
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);

/*
 * For performance, it is essential that the buffers are written asynchronously
 * and simultaneously (so that the block layer can merge the writes) and then
 * waited upon.
 *
 * Finally, we flush hardware disk cache.
 */
int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
{
	int a, f;
	unsigned long buffers_processed = 0;
	struct dm_buffer *b, *tmp;

	LIST_HEAD(write_list);

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0, &write_list);
	dm_bufio_unlock(c);
	__flush_write_list(&write_list);
	dm_bufio_lock(c);

again:
	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		int dropped_lock = 0;

		if (buffers_processed < c->n_buffers[LIST_DIRTY])
			buffers_processed++;

		BUG_ON(test_bit(B_READING, &b->state));

		if (test_bit(B_WRITING, &b->state)) {
			if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
				dropped_lock = 1;
				b->hold_count++;
				dm_bufio_unlock(c);
				wait_on_bit(&b->state, B_WRITING,
					    do_io_schedule,
					    TASK_UNINTERRUPTIBLE);
				dm_bufio_lock(c);
				b->hold_count--;
			} else
				wait_on_bit(&b->state, B_WRITING,
					    do_io_schedule,
					    TASK_UNINTERRUPTIBLE);
		}

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state))
			__relink_lru(b, LIST_CLEAN);

		dm_bufio_cond_resched();

		/*
		 * If we dropped the lock, the list is no longer consistent,
		 * so we must restart the search.
		 *
		 * In the most common case, the buffer just processed is
		 * relinked to the clean list, so we won't loop scanning the
		 * same buffer again and again.
		 *
		 * This may livelock if there is another thread simultaneously
		 * dirtying buffers, so we count the number of buffers walked
		 * and if it exceeds the total number of buffers, it means that
		 * someone is doing some writes simultaneously with us. In
		 * this case, stop, dropping the lock.
		 */
		if (dropped_lock)
			goto again;
	}
	wake_up(&c->free_buffer_wait);
	dm_bufio_unlock(c);

	a = xchg(&c->async_write_error, 0);
	f = dm_bufio_issue_flush(c);
	if (a)
		return a;

	return f;
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);

/*
 * Use dm-io to send an empty barrier and flush the device.
 */
int dm_bufio_issue_flush(struct dm_bufio_client *c)
{
	struct dm_io_request io_req = {
		.bi_rw = WRITE_FLUSH,
		.mem.type = DM_IO_KMEM,
		.mem.ptr.addr = NULL,
		.client = c->dm_io,
	};
	struct dm_io_region io_reg = {
		.bdev = c->bdev,
		.sector = 0,
		.count = 0,
	};

	BUG_ON(dm_bufio_in_request());

	return dm_io(&io_req, 1, &io_reg, NULL);
}
EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);

/*
 * We first delete any other buffer that may be at that new location.
 *
 * Then, we write the buffer to the original location if it was dirty.
 *
 * Then, if we are the only one who is holding the buffer, relink the buffer
 * in the hash queue for the new location.
 *
 * If there was someone else holding the buffer, we write it to the new
 * location but do not relink it, because that other user needs to have the
 * buffer at the same place.
 */
void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
{
	struct dm_bufio_client *c = b->c;
	struct dm_buffer *new;

	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);

retry:
	new = __find(c, new_block);
	if (new) {
		if (new->hold_count) {
			__wait_for_free_buffer(c);
			goto retry;
		}

		/*
		 * FIXME: Is there any point waiting for a write that's going
		 * to be overwritten in a bit?
		 */
		__make_buffer_clean(new);
		__unlink_buffer(new);
		__free_buffer_wake(new);
	}

	BUG_ON(!b->hold_count);
	BUG_ON(test_bit(B_READING, &b->state));

	__write_dirty_buffer(b, NULL);
	if (b->hold_count == 1) {
		wait_on_bit(&b->state, B_WRITING,
			    do_io_schedule, TASK_UNINTERRUPTIBLE);
		set_bit(B_DIRTY, &b->state);
		__unlink_buffer(b);
		__link_buffer(b, new_block, LIST_DIRTY);
	} else {
		sector_t old_block;
		wait_on_bit_lock(&b->state, B_WRITING,
				 do_io_schedule, TASK_UNINTERRUPTIBLE);
		/*
		 * Relink buffer to "new_block" so that write_callback
		 * sees "new_block" as a block number.
		 * After the write, link the buffer back to old_block.
		 * All this must be done in bufio lock, so that block number
		 * change isn't visible to other threads.
		 */
		old_block = b->block;
		__unlink_buffer(b);
		__link_buffer(b, new_block, b->list_mode);
		submit_io(b, WRITE, new_block, write_endio);
		wait_on_bit(&b->state, B_WRITING,
			    do_io_schedule, TASK_UNINTERRUPTIBLE);
		__unlink_buffer(b);
		__link_buffer(b, old_block, b->list_mode);
	}

	dm_bufio_unlock(c);
	dm_bufio_release(b);
}
EXPORT_SYMBOL_GPL(dm_bufio_release_move);

unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
{
	return c->block_size;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);

sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
{
	return i_size_read(c->bdev->bd_inode) >>
	       (SECTOR_SHIFT + c->sectors_per_block_bits);
}
EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);

sector_t dm_bufio_get_block_number(struct dm_buffer *b)
{
	return b->block;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);

void *dm_bufio_get_block_data(struct dm_buffer *b)
{
	return b->data;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);

void *dm_bufio_get_aux_data(struct dm_buffer *b)
{
	return b + 1;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);

struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
{
	return b->c;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_client);

static void drop_buffers(struct dm_bufio_client *c)
{
	struct dm_buffer *b;
	int i;

	BUG_ON(dm_bufio_in_request());

	/*
	 * An optimization so that the buffers are not written one-by-one.
	 */
	dm_bufio_write_dirty_buffers_async(c);

	dm_bufio_lock(c);

	while ((b = __get_unclaimed_buffer(c)))
		__free_buffer_wake(b);

	for (i = 0; i < LIST_SIZE; i++)
		list_for_each_entry(b, &c->lru[i], lru_list)
			DMERR("leaked buffer %llx, hold count %u, list %d",
			      (unsigned long long)b->block, b->hold_count, i);

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(!list_empty(&c->lru[i]));

	dm_bufio_unlock(c);
}

/*
 * Test if the buffer is unused and too old, and commit it.
 * And if the gfp mask does not allow I/O (__GFP_IO is not set), we must not
 * do any I/O because we hold dm_bufio_clients_lock and we would risk deadlock
 * if the I/O gets rerouted to a different bufio client.
 */
static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
				unsigned long max_jiffies)
{
	if (jiffies - b->last_accessed < max_jiffies)
		return 0;

	if (!(gfp & __GFP_IO)) {
		if (test_bit(B_READING, &b->state) ||
		    test_bit(B_WRITING, &b->state) ||
		    test_bit(B_DIRTY, &b->state))
			return 0;
	}

	if (b->hold_count)
		return 0;

	__make_buffer_clean(b);
	__unlink_buffer(b);
	__free_buffer_wake(b);

	return 1;
}

static long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
		   gfp_t gfp_mask)
{
	int l;
	struct dm_buffer *b, *tmp;
	long freed = 0;

	for (l = 0; l < LIST_SIZE; l++) {
		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
			freed += __cleanup_old_buffer(b, gfp_mask, 0);
			if (!--nr_to_scan)
				break;
		}
		dm_bufio_cond_resched();
	}
	return freed;
}

static unsigned long
dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	struct dm_bufio_client *c;
	unsigned long freed;

	c = container_of(shrink, struct dm_bufio_client, shrinker);
	if (sc->gfp_mask & __GFP_IO)
		dm_bufio_lock(c);
	else if (!dm_bufio_trylock(c))
		return SHRINK_STOP;

	freed = __scan(c, sc->nr_to_scan, sc->gfp_mask);
	dm_bufio_unlock(c);
	return freed;
}

static unsigned long
dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
	struct dm_bufio_client *c;
	unsigned long count;

	c = container_of(shrink, struct dm_bufio_client, shrinker);
	if (sc->gfp_mask & __GFP_IO)
		dm_bufio_lock(c);
	else if (!dm_bufio_trylock(c))
		return 0;

	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
	dm_bufio_unlock(c);
	return count;
}

/*
 * Create the buffering interface
 */
struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
					       unsigned reserved_buffers, unsigned aux_size,
					       void (*alloc_callback)(struct dm_buffer *),
					       void (*write_callback)(struct dm_buffer *))
{
	int r;
	struct dm_bufio_client *c;
	unsigned i;

	BUG_ON(block_size < 1 << SECTOR_SHIFT ||
	       (block_size & (block_size - 1)));

	c = kmalloc(sizeof(*c), GFP_KERNEL);
	if (!c) {
		r = -ENOMEM;
		goto bad_client;
	}
	c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS);
	if (!c->cache_hash) {
		r = -ENOMEM;
		goto bad_hash;
	}

	c->bdev = bdev;
	c->block_size = block_size;
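	/*
	 * block_size is a power of two (checked above), so
	 * ffs(block_size) - 1 == log2(block_size).
	 */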
	c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT;
	c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ?
				  ffs(block_size) - 1 - PAGE_SHIFT : 0;
	c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ?
				  PAGE_SHIFT - (ffs(block_size) - 1) : 0);

	c->aux_size = aux_size;
	c->alloc_callback = alloc_callback;
	c->write_callback = write_callback;

	for (i = 0; i < LIST_SIZE; i++) {
		INIT_LIST_HEAD(&c->lru[i]);
		c->n_buffers[i] = 0;
	}

	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
		INIT_HLIST_HEAD(&c->cache_hash[i]);

	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->reserved_buffers);
	c->need_reserved_buffers = reserved_buffers;

	init_waitqueue_head(&c->free_buffer_wait);
	c->async_write_error = 0;

	c->dm_io = dm_io_client_create();
	if (IS_ERR(c->dm_io)) {
		r = PTR_ERR(c->dm_io);
		goto bad_dm_io;
	}

	mutex_lock(&dm_bufio_clients_lock);
	if (c->blocks_per_page_bits) {
		if (!DM_BUFIO_CACHE_NAME(c)) {
			DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size);
			if (!DM_BUFIO_CACHE_NAME(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}

		if (!DM_BUFIO_CACHE(c)) {
			DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c),
							      c->block_size,
							      c->block_size, 0, NULL);
			if (!DM_BUFIO_CACHE(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}
	}
	mutex_unlock(&dm_bufio_clients_lock);

	while (c->need_reserved_buffers) {
		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);

		if (!b) {
			r = -ENOMEM;
			goto bad_buffer;
		}
		__free_buffer_wake(b);
	}

	mutex_lock(&dm_bufio_clients_lock);
	dm_bufio_client_count++;
	list_add(&c->client_list, &dm_bufio_all_clients);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	c->shrinker.count_objects = dm_bufio_shrink_count;
	c->shrinker.scan_objects = dm_bufio_shrink_scan;
	c->shrinker.seeks = 1;
	c->shrinker.batch = 0;
	register_shrinker(&c->shrinker);

	return c;

bad_buffer:
bad_cache:
	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}
	dm_io_client_destroy(c->dm_io);
bad_dm_io:
	vfree(c->cache_hash);
bad_hash:
	kfree(c);
bad_client:
	return ERR_PTR(r);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_create);
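/*
 * A minimal sketch of typical client usage (the 4096 block size, the
 * "my_block" variable and the surrounding error handling are illustrative
 * only):
 *
 *	struct dm_bufio_client *c;
 *	struct dm_buffer *bp;
 *	void *data;
 *
 *	c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL);
 *	if (IS_ERR(c))
 *		return PTR_ERR(c);
 *
 *	data = dm_bufio_read(c, my_block, &bp);
 *	if (!IS_ERR(data)) {
 *		... examine or modify data ...
 *		dm_bufio_mark_buffer_dirty(bp);
 *		dm_bufio_release(bp);
 *		dm_bufio_write_dirty_buffers(c);
 *	}
 *
 *	dm_bufio_client_destroy(c);
 */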
/*
 * Free the buffering interface.
 * It is required that there are no references on any buffers.
 */
void dm_bufio_client_destroy(struct dm_bufio_client *c)
{
	unsigned i;

	drop_buffers(c);

	unregister_shrinker(&c->shrinker);

	mutex_lock(&dm_bufio_clients_lock);

	list_del(&c->client_list);
	dm_bufio_client_count--;
	__cache_size_refresh();

	mutex_unlock(&dm_bufio_clients_lock);

	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
		BUG_ON(!hlist_empty(&c->cache_hash[i]));

	BUG_ON(c->need_reserved_buffers);

	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}

	for (i = 0; i < LIST_SIZE; i++)
		if (c->n_buffers[i])
			DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(c->n_buffers[i]);

	dm_io_client_destroy(c->dm_io);
	vfree(c->cache_hash);
	kfree(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);

static void cleanup_old_buffers(void)
{
	unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age);
	struct dm_bufio_client *c;

	if (max_age > ULONG_MAX / HZ)
		max_age = ULONG_MAX / HZ;

	mutex_lock(&dm_bufio_clients_lock);
	list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
		if (!dm_bufio_trylock(c))
			continue;

		while (!list_empty(&c->lru[LIST_CLEAN])) {
			struct dm_buffer *b;
			b = list_entry(c->lru[LIST_CLEAN].prev,
				       struct dm_buffer, lru_list);
			if (!__cleanup_old_buffer(b, 0, max_age * HZ))
				break;
			dm_bufio_cond_resched();
		}

		dm_bufio_unlock(c);
		dm_bufio_cond_resched();
	}
	mutex_unlock(&dm_bufio_clients_lock);
}

static struct workqueue_struct *dm_bufio_wq;
static struct delayed_work dm_bufio_work;

static void work_fn(struct work_struct *w)
{
	cleanup_old_buffers();

	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);
}

/*----------------------------------------------------------------
 * Module setup
 *--------------------------------------------------------------*/

/*
 * This is called only once for the whole dm_bufio module.
 * It initializes the memory limit.
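 *
 * The default is the smaller of DM_BUFIO_MEMORY_PERCENT of low memory and
 * DM_BUFIO_VMALLOC_PERCENT of the vmalloc space; on a 64-bit machine with
 * 8 GB of memory, for example, that works out to roughly 160 MB.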
 */
static int __init dm_bufio_init(void)
{
	__u64 mem;

	dm_bufio_allocated_kmem_cache = 0;
	dm_bufio_allocated_get_free_pages = 0;
	dm_bufio_allocated_vmalloc = 0;
	dm_bufio_current_allocated = 0;

	memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
	memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);

	mem = (__u64)((totalram_pages - totalhigh_pages) *
		      DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;

	if (mem > ULONG_MAX)
		mem = ULONG_MAX;

#ifdef CONFIG_MMU
	/*
	 * Get the size of vmalloc space the same way as VMALLOC_TOTAL
	 * in fs/proc/internal.h
	 */
	if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
		mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
#endif

	dm_bufio_default_cache_size = mem;

	mutex_lock(&dm_bufio_clients_lock);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache");
	if (!dm_bufio_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);

	return 0;
}

/*
 * This is called once when unloading the dm_bufio module.
 */
static void __exit dm_bufio_exit(void)
{
	int bug = 0;
	int i;

	cancel_delayed_work_sync(&dm_bufio_work);
	destroy_workqueue(dm_bufio_wq);

	for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) {
		struct kmem_cache *kc = dm_bufio_caches[i];

		if (kc)
			kmem_cache_destroy(kc);
	}

	for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
		kfree(dm_bufio_cache_names[i]);

	if (dm_bufio_client_count) {
		DMCRIT("%s: dm_bufio_client_count leaked: %d",
		       __func__, dm_bufio_client_count);
		bug = 1;
	}

	if (dm_bufio_current_allocated) {
		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
		       __func__, dm_bufio_current_allocated);
		bug = 1;
	}

	if (dm_bufio_allocated_get_free_pages) {
		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
		       __func__, dm_bufio_allocated_get_free_pages);
		bug = 1;
	}

	if (dm_bufio_allocated_vmalloc) {
		DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
		       __func__, dm_bufio_allocated_vmalloc);
		bug = 1;
	}

	if (bug)
		BUG();
}

module_init(dm_bufio_init)
module_exit(dm_bufio_exit)

module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");

module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");

module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");

module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");

module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");

module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");

module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");

MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
MODULE_LICENSE("GPL");