/*
 * Copyright (C) 2009-2011 Red Hat, Inc.
 *
 * Author: Mikulas Patocka <mpatocka@redhat.com>
 *
 * This file is released under the GPL.
 */

#include "dm-bufio.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/shrinker.h>
#include <linux/module.h>

#define DM_MSG_PREFIX "bufio"

/*
 * Memory management policy:
 * Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
 * or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
 * Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
 * Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
 * dirty buffers.
 */
#define DM_BUFIO_MIN_BUFFERS		8

#define DM_BUFIO_MEMORY_PERCENT		2
#define DM_BUFIO_VMALLOC_PERCENT	25
#define DM_BUFIO_WRITEBACK_PERCENT	75

/*
 * Check buffer ages in this interval (seconds)
 */
#define DM_BUFIO_WORK_TIMER_SECS	10

/*
 * Free buffers when they are older than this (seconds)
 */
#define DM_BUFIO_DEFAULT_AGE_SECS	60

/*
 * The number of bvec entries that are embedded directly in the buffer.
 * If the chunk size is larger, dm-io is used to do the io.
 */
#define DM_BUFIO_INLINE_VECS		16

/*
 * Buffer hash
 */
#define DM_BUFIO_HASH_BITS	20
#define DM_BUFIO_HASH(block) \
	((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
	 ((1 << DM_BUFIO_HASH_BITS) - 1))

/*
 * Don't try to use kmem_cache_alloc for blocks larger than this.
 * For explanation, see alloc_buffer_data below.
 */
#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT	(PAGE_SIZE >> 1)
#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT	(PAGE_SIZE << (MAX_ORDER - 1))

/*
 * dm_buffer->list_mode
 */
#define LIST_CLEAN	0
#define LIST_DIRTY	1
#define LIST_SIZE	2

/*
 * Linking of buffers:
 * All buffers are linked to cache_hash with their hash_list field.
 *
 * Clean buffers that are not being written (B_WRITING not set)
 * are linked to lru[LIST_CLEAN] with their lru_list field.
 *
 * Dirty and clean buffers that are being written are linked to
 * lru[LIST_DIRTY] with their lru_list field. When the write
 * finishes, the buffer cannot be relinked immediately (because we
 * are in an interrupt context and relinking requires process
 * context), so some clean-not-writing buffers can be held on
 * dirty_lru too. They are later added to lru in the process
 * context.
 */
struct dm_bufio_client {
	struct mutex lock;

	struct list_head lru[LIST_SIZE];
	unsigned long n_buffers[LIST_SIZE];

	struct block_device *bdev;
	unsigned block_size;
	unsigned char sectors_per_block_bits;
	unsigned char pages_per_block_bits;
	unsigned char blocks_per_page_bits;
	unsigned aux_size;
	void (*alloc_callback)(struct dm_buffer *);
	void (*write_callback)(struct dm_buffer *);

	struct dm_io_client *dm_io;

	struct list_head reserved_buffers;
	unsigned need_reserved_buffers;

	struct hlist_head *cache_hash;
	wait_queue_head_t free_buffer_wait;

	int async_write_error;

	struct list_head client_list;
	struct shrinker shrinker;
};

/*
 * Buffer state bits.
 */
#define B_READING	0
#define B_WRITING	1
#define B_DIRTY		2

/*
 * Describes how the block was allocated:
 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 * See the comment at alloc_buffer_data.
 */
enum data_mode {
	DATA_MODE_SLAB = 0,
	DATA_MODE_GET_FREE_PAGES = 1,
	DATA_MODE_VMALLOC = 2,
	DATA_MODE_LIMIT = 3
};

struct dm_buffer {
	struct hlist_node hash_list;
	struct list_head lru_list;
	sector_t block;
	void *data;
	enum data_mode data_mode;
	unsigned char list_mode;		/* LIST_* */
	unsigned hold_count;
	int read_error;
	int write_error;
	unsigned long state;
	unsigned long last_accessed;
	struct dm_bufio_client *c;
	struct bio bio;
	struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
};

/*----------------------------------------------------------------*/

static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];

static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
{
	unsigned ret = c->blocks_per_page_bits - 1;

	BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches));

	return ret;
}

#define DM_BUFIO_CACHE(c)	(dm_bufio_caches[dm_bufio_cache_index(c)])
#define DM_BUFIO_CACHE_NAME(c)	(dm_bufio_cache_names[dm_bufio_cache_index(c)])

#define dm_bufio_in_request()	(!!current->bio_list)

static void dm_bufio_lock(struct dm_bufio_client *c)
{
	mutex_lock_nested(&c->lock, dm_bufio_in_request());
}

static int dm_bufio_trylock(struct dm_bufio_client *c)
{
	return mutex_trylock(&c->lock);
}

static void dm_bufio_unlock(struct dm_bufio_client *c)
{
	mutex_unlock(&c->lock);
}

/*
 * FIXME Move to sched.h?
 */
#ifdef CONFIG_PREEMPT_VOLUNTARY
#  define dm_bufio_cond_resched()		\
do {						\
	if (unlikely(need_resched()))		\
		_cond_resched();		\
} while (0)
#else
#  define dm_bufio_cond_resched()	do { } while (0)
#endif

/*----------------------------------------------------------------*/

/*
 * Default cache size: available memory divided by the ratio.
 */
static unsigned long dm_bufio_default_cache_size;

/*
 * Total cache size set by the user.
 */
static unsigned long dm_bufio_cache_size;

/*
 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
 * at any time. If it disagrees, the user has changed cache size.
 */
static unsigned long dm_bufio_cache_size_latch;

static DEFINE_SPINLOCK(param_spinlock);

/*
 * Buffers are freed after this timeout
 */
static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;

static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
static unsigned long dm_bufio_allocated_get_free_pages;
static unsigned long dm_bufio_allocated_vmalloc;
static unsigned long dm_bufio_current_allocated;

/*----------------------------------------------------------------*/

/*
 * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
 */
static unsigned long dm_bufio_cache_size_per_client;

/*
 * The current number of clients.
 */
static int dm_bufio_client_count;

/*
 * The list of all clients.
 */
static LIST_HEAD(dm_bufio_all_clients);

/*
 * This mutex protects dm_bufio_cache_size_latch,
 * dm_bufio_cache_size_per_client and dm_bufio_client_count
 */
static DEFINE_MUTEX(dm_bufio_clients_lock);

/*----------------------------------------------------------------*/

static void adjust_total_allocated(enum data_mode data_mode, long diff)
{
	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
		&dm_bufio_allocated_kmem_cache,
		&dm_bufio_allocated_get_free_pages,
		&dm_bufio_allocated_vmalloc,
	};

	spin_lock(&param_spinlock);

	*class_ptr[data_mode] += diff;

	dm_bufio_current_allocated += diff;

	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
		dm_bufio_peak_allocated = dm_bufio_current_allocated;

	spin_unlock(&param_spinlock);
}

/*
 * Recalculate the per-client cache size limit; called whenever the number
 * of clients or the total cache size changes.
 */
static void __cache_size_refresh(void)
{
	BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
	BUG_ON(dm_bufio_client_count < 0);

	dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size);

	/*
	 * Use default if set to 0 and report the actual cache size used.
	 */
	if (!dm_bufio_cache_size_latch) {
		(void)cmpxchg(&dm_bufio_cache_size, 0,
			      dm_bufio_default_cache_size);
		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
	}

	dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
					 (dm_bufio_client_count ? : 1);
}

/*
 * Allocating buffer data.
 *
 * Small buffers are allocated with kmem_cache, to use space optimally.
 *
 * For large buffers, we choose between get_free_pages and vmalloc.
 * Each has advantages and disadvantages.
 *
 * __get_free_pages can randomly fail if the memory is fragmented.
 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
 * as low as 128M) so using it for caching is not appropriate.
 *
 * If the allocation may fail we use __get_free_pages. Memory fragmentation
 * won't have a fatal effect here, but it just causes flushes of some other
 * buffers and more I/O will be performed. Don't use __get_free_pages if it
 * always fails (i.e. order >= MAX_ORDER).
 *
 * If the allocation shouldn't fail we use __vmalloc. This is only for the
 * initial reserve allocation, so there's no risk of wasting all vmalloc
 * space.
 */
static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
			       enum data_mode *data_mode)
{
	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
		*data_mode = DATA_MODE_SLAB;
		return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
	}

	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT &&
	    gfp_mask & __GFP_NORETRY) {
		*data_mode = DATA_MODE_GET_FREE_PAGES;
		return (void *)__get_free_pages(gfp_mask,
						c->pages_per_block_bits);
	}

	*data_mode = DATA_MODE_VMALLOC;
	return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
}

/*
 * Free buffer's data.
 */
static void free_buffer_data(struct dm_bufio_client *c,
			     void *data, enum data_mode data_mode)
{
	switch (data_mode) {
	case DATA_MODE_SLAB:
		kmem_cache_free(DM_BUFIO_CACHE(c), data);
		break;

	case DATA_MODE_GET_FREE_PAGES:
		free_pages((unsigned long)data, c->pages_per_block_bits);
		break;

	case DATA_MODE_VMALLOC:
		vfree(data);
		break;

	default:
		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
		       data_mode);
		BUG();
	}
}

/*
 * Allocate buffer and its data.
 */
static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
{
	struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size,
				      gfp_mask);

	if (!b)
		return NULL;

	b->c = c;

	b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
	if (!b->data) {
		kfree(b);
		return NULL;
	}

	adjust_total_allocated(b->data_mode, (long)c->block_size);

	return b;
}

/*
 * Free buffer and its data.
 */
static void free_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	adjust_total_allocated(b->data_mode, -(long)c->block_size);

	free_buffer_data(c, b->data, b->data_mode);
	kfree(b);
}

/*
 * Link buffer to the hash list and clean or dirty queue.
 */
static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
{
	struct dm_bufio_client *c = b->c;

	c->n_buffers[dirty]++;
	b->block = block;
	b->list_mode = dirty;
	list_add(&b->lru_list, &c->lru[dirty]);
	hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
	b->last_accessed = jiffies;
}

/*
 * Unlink buffer from the hash list and dirty or clean queue.
 */
static void __unlink_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	hlist_del(&b->hash_list);
	list_del(&b->lru_list);
}

/*
 * Place the buffer at the head of the dirty or clean LRU queue.
 */
static void __relink_lru(struct dm_buffer *b, int dirty)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	c->n_buffers[dirty]++;
	b->list_mode = dirty;
	list_move(&b->lru_list, &c->lru[dirty]);
}

/*----------------------------------------------------------------
 * Submit I/O on the buffer.
 *
 * The bio interface is faster but it has some problems:
 * the vector list is limited (increasing this limit increases
 * memory-consumption per buffer, so it is not viable);
 *
 * the memory must be direct-mapped, not vmalloced;
 *
 * the I/O driver can reject requests spuriously if it thinks that
 * the requests are too big for the device or if they cross a
 * controller-defined memory boundary.
 *
 * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
 * it is not vmalloced, try using the bio interface.
 *
 * If the buffer is big, if it is vmalloced or if the underlying device
 * rejects the bio because it is too large, use the dm-io layer to do the
 * I/O.  The dm-io layer splits the I/O into multiple requests, avoiding
 * the above shortcomings.
 *--------------------------------------------------------------*/

/*
 * dm-io completion routine.
 * It just calls b->bio.bi_end_io, pretending that the request was handled
 * directly with the bio interface.
 */
static void dmio_complete(unsigned long error, void *context)
{
	struct dm_buffer *b = context;

	b->bio.bi_end_io(&b->bio, error ? -EIO : 0);
}

static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
		     bio_end_io_t *end_io)
{
	int r;
	struct dm_io_request io_req = {
		.bi_rw = rw,
		.notify.fn = dmio_complete,
		.notify.context = b,
		.client = b->c->dm_io,
	};
	struct dm_io_region region = {
		.bdev = b->c->bdev,
		.sector = block << b->c->sectors_per_block_bits,
		.count = b->c->block_size >> SECTOR_SHIFT,
	};

	if (b->data_mode != DATA_MODE_VMALLOC) {
		io_req.mem.type = DM_IO_KMEM;
		io_req.mem.ptr.addr = b->data;
	} else {
		io_req.mem.type = DM_IO_VMA;
		io_req.mem.ptr.vma = b->data;
	}

	b->bio.bi_end_io = end_io;

	r = dm_io(&io_req, 1, &region, NULL);
	if (r)
		end_io(&b->bio, r);
}

static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
			   bio_end_io_t *end_io)
{
	char *ptr;
	int len;

	bio_init(&b->bio);
	b->bio.bi_io_vec = b->bio_vec;
	b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
	b->bio.bi_sector = block << b->c->sectors_per_block_bits;
	b->bio.bi_bdev = b->c->bdev;
	b->bio.bi_end_io = end_io;

	/*
	 * We assume that if len >= PAGE_SIZE ptr is page-aligned.
	 * If len < PAGE_SIZE the buffer doesn't cross page boundary.
	 */
	ptr = b->data;
	len = b->c->block_size;

	if (len >= PAGE_SIZE)
		BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
	else
		BUG_ON((unsigned long)ptr & (len - 1));

	do {
		if (!bio_add_page(&b->bio, virt_to_page(ptr),
				  len < PAGE_SIZE ? len : PAGE_SIZE,
				  virt_to_phys(ptr) & (PAGE_SIZE - 1))) {
			BUG_ON(b->c->block_size <= PAGE_SIZE);
			use_dmio(b, rw, block, end_io);
			return;
		}

		len -= PAGE_SIZE;
		ptr += PAGE_SIZE;
	} while (len > 0);

	submit_bio(rw, &b->bio);
}

static void submit_io(struct dm_buffer *b, int rw, sector_t block,
		      bio_end_io_t *end_io)
{
	if (rw == WRITE && b->c->write_callback)
		b->c->write_callback(b);

	if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE &&
	    b->data_mode != DATA_MODE_VMALLOC)
		use_inline_bio(b, rw, block, end_io);
	else
		use_dmio(b, rw, block, end_io);
}

/*----------------------------------------------------------------
 * Writing dirty buffers
 *--------------------------------------------------------------*/

/*
 * The endio routine for write.
 *
 * Set the error, clear B_WRITING bit and wake anyone who was waiting on
 * it.
 */
static void write_endio(struct bio *bio, int error)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->write_error = error;
	if (unlikely(error)) {
		struct dm_bufio_client *c = b->c;
		(void)cmpxchg(&c->async_write_error, 0, error);
	}

	BUG_ON(!test_bit(B_WRITING, &b->state));

	smp_mb__before_clear_bit();
	clear_bit(B_WRITING, &b->state);
	smp_mb__after_clear_bit();

	wake_up_bit(&b->state, B_WRITING);
}

/*
 * This function is called when wait_on_bit is actually waiting.
 */
static int do_io_schedule(void *word)
{
	io_schedule();

	return 0;
}

/*
 * Initiate a write on a dirty buffer, but don't wait for it.
 *
 * - If the buffer is not dirty, exit.
 * - If there is some previous write going on, wait for it to finish (we
 *   can't have two writes on the same buffer simultaneously).
 * - Submit our write and don't wait on it. We set B_WRITING indicating
 *   that there is a write in progress.
 */
static void __write_dirty_buffer(struct dm_buffer *b)
{
	if (!test_bit(B_DIRTY, &b->state))
		return;

	clear_bit(B_DIRTY, &b->state);
	wait_on_bit_lock(&b->state, B_WRITING,
			 do_io_schedule, TASK_UNINTERRUPTIBLE);

	submit_io(b, WRITE, b->block, write_endio);
}

/*
 * Wait until any activity on the buffer finishes. Possibly write the
 * buffer if it is dirty. When this function finishes, there is no I/O
 * running on the buffer and the buffer is not dirty.
 */
static void __make_buffer_clean(struct dm_buffer *b)
{
	BUG_ON(b->hold_count);

	if (!b->state)	/* fast case */
		return;

	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
	__write_dirty_buffer(b);
	wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
}

/*
 * Find some buffer that is not held by anybody, clean it, unlink it and
 * return it.
 */
static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
{
	struct dm_buffer *b;

	list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
		BUG_ON(test_bit(B_WRITING, &b->state));
		BUG_ON(test_bit(B_DIRTY, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	return NULL;
}

/*
 * Wait until some other threads free some buffer or release hold count on
 * some buffer.
 *
 * This function is entered with c->lock held, drops it and regains it
 * before exiting.
 */
static void __wait_for_free_buffer(struct dm_bufio_client *c)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&c->free_buffer_wait, &wait);
	set_task_state(current, TASK_UNINTERRUPTIBLE);
	dm_bufio_unlock(c);

	io_schedule();

	set_task_state(current, TASK_RUNNING);
	remove_wait_queue(&c->free_buffer_wait, &wait);

	dm_bufio_lock(c);
}

enum new_flag {
	NF_FRESH = 0,
	NF_READ = 1,
	NF_GET = 2,
	NF_PREFETCH = 3
};

/*
 * Allocate a new buffer. If the allocation is not possible, wait until
 * some other thread frees a buffer.
 *
 * May drop the lock and regain it.
 */
static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
{
	struct dm_buffer *b;

	/*
	 * dm-bufio is resistant to allocation failures (it just keeps
	 * one buffer reserved in case all the allocations fail).
	 * So set flags to not try too hard:
	 *	GFP_NOIO: don't recurse into the I/O layer
	 *	__GFP_NORETRY: don't retry and rather return failure
	 *	__GFP_NOMEMALLOC: don't use emergency reserves
	 *	__GFP_NOWARN: don't print a warning in case of failure
	 *
	 * For debugging, if we set the cache size to 1, no new buffers will
	 * be allocated.
	 */
	while (1) {
		if (dm_bufio_cache_size_latch != 1) {
			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			if (b)
				return b;
		}

		if (nf == NF_PREFETCH)
			return NULL;

		if (!list_empty(&c->reserved_buffers)) {
			b = list_entry(c->reserved_buffers.next,
				       struct dm_buffer, lru_list);
			list_del(&b->lru_list);
			c->need_reserved_buffers++;

			return b;
		}

		b = __get_unclaimed_buffer(c);
		if (b)
			return b;

		__wait_for_free_buffer(c);
	}
}

static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
{
	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);

	if (!b)
		return NULL;

	if (c->alloc_callback)
		c->alloc_callback(b);

	return b;
}

/*
 * Free a buffer and wake other threads waiting for free buffers.
 */
static void __free_buffer_wake(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	if (!c->need_reserved_buffers)
		free_buffer(b);
	else {
		list_add(&b->lru_list, &c->reserved_buffers);
		c->need_reserved_buffers--;
	}

	wake_up(&c->free_buffer_wait);
}

static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait)
{
	struct dm_buffer *b, *tmp;

	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state)) {
			__relink_lru(b, LIST_CLEAN);
			continue;
		}

		if (no_wait && test_bit(B_WRITING, &b->state))
			return;

		__write_dirty_buffer(b);
		dm_bufio_cond_resched();
	}
}

/*
 * Get writeback threshold and buffer limit for a given client.
 */
static void __get_memory_limit(struct dm_bufio_client *c,
			       unsigned long *threshold_buffers,
			       unsigned long *limit_buffers)
{
	unsigned long buffers;

	if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) {
		mutex_lock(&dm_bufio_clients_lock);
		__cache_size_refresh();
		mutex_unlock(&dm_bufio_clients_lock);
	}

	buffers = dm_bufio_cache_size_per_client >>
		  (c->sectors_per_block_bits + SECTOR_SHIFT);

	if (buffers < DM_BUFIO_MIN_BUFFERS)
		buffers = DM_BUFIO_MIN_BUFFERS;

	*limit_buffers = buffers;
	*threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
}

/*
 * Check if we're over the watermark.
 * If we are over "limit_buffers", free unclaimed buffers until we get
 * below the limit.
 * If we are over "threshold_buffers", start background writeback of dirty
 * buffers.
 */
static void __check_watermark(struct dm_bufio_client *c)
{
	unsigned long threshold_buffers, limit_buffers;

	__get_memory_limit(c, &threshold_buffers, &limit_buffers);

	while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
	       limit_buffers) {

		struct dm_buffer *b = __get_unclaimed_buffer(c);

		if (!b)
			return;

		__free_buffer_wake(b);
		dm_bufio_cond_resched();
	}

	if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
		__write_dirty_buffers_async(c, 1);
}

/*
 * Find a buffer in the hash.
 */
static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
{
	struct dm_buffer *b;

	hlist_for_each_entry(b, &c->cache_hash[DM_BUFIO_HASH(block)],
			     hash_list) {
		dm_bufio_cond_resched();
		if (b->block == block)
			return b;
	}

	return NULL;
}

/*----------------------------------------------------------------
 * Getting a buffer
 *--------------------------------------------------------------*/

static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
				     enum new_flag nf, int *need_submit)
{
	struct dm_buffer *b, *new_b = NULL;

	*need_submit = 0;

	b = __find(c, block);
	if (b)
		goto found_buffer;

	if (nf == NF_GET)
		return NULL;

	new_b = __alloc_buffer_wait(c, nf);
	if (!new_b)
		return NULL;

	/*
	 * We've had a period where the mutex was unlocked, so we need to
	 * recheck the hash table.
	 */
	b = __find(c, block);
	if (b) {
		__free_buffer_wake(new_b);
		goto found_buffer;
	}

	__check_watermark(c);

	b = new_b;
	b->hold_count = 1;
	b->read_error = 0;
	b->write_error = 0;
	__link_buffer(b, block, LIST_CLEAN);

	if (nf == NF_FRESH) {
		b->state = 0;
		return b;
	}

	b->state = 1 << B_READING;
	*need_submit = 1;

	return b;

found_buffer:
	if (nf == NF_PREFETCH)
		return NULL;
	/*
	 * Note: it is essential that we don't wait for the buffer to be
	 * read if dm_bufio_get function is used. Both dm_bufio_get and
	 * dm_bufio_prefetch can be used in the driver request routine.
	 * If the user called both dm_bufio_prefetch and dm_bufio_get on
	 * the same buffer, it would deadlock if we waited.
	 */
	if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
		return NULL;

	b->hold_count++;
	__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
		     test_bit(B_WRITING, &b->state));
	return b;
}

/*
 * The endio routine for reading: set the error, clear the bit and wake up
 * anyone waiting on the buffer.
 */
static void read_endio(struct bio *bio, int error)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->read_error = error;

	BUG_ON(!test_bit(B_READING, &b->state));

	smp_mb__before_clear_bit();
	clear_bit(B_READING, &b->state);
	smp_mb__after_clear_bit();

	wake_up_bit(&b->state, B_READING);
}

/*
 * A common routine for dm_bufio_new and dm_bufio_read. Operation of these
 * functions is similar except that dm_bufio_new doesn't read the
 * buffer from the disk (assuming that the caller overwrites all the data
 * and uses dm_bufio_mark_buffer_dirty to write new data back).
 */
static void *new_read(struct dm_bufio_client *c, sector_t block,
		      enum new_flag nf, struct dm_buffer **bp)
{
	int need_submit;
	struct dm_buffer *b;

	dm_bufio_lock(c);
	b = __bufio_new(c, block, nf, &need_submit);
	dm_bufio_unlock(c);

	if (!b)
		return b;

	if (need_submit)
		submit_io(b, READ, b->block, read_endio);

	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);

	if (b->read_error) {
		int error = b->read_error;

		dm_bufio_release(b);

		return ERR_PTR(error);
	}

	*bp = b;

	return b->data;
}

void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	return new_read(c, block, NF_GET, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_get);

void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
		    struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_READ, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_read);

void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_FRESH, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_new);

void dm_bufio_prefetch(struct dm_bufio_client *c,
		       sector_t block, unsigned n_blocks)
{
	struct blk_plug plug;

	blk_start_plug(&plug);
	dm_bufio_lock(c);

	for (; n_blocks--; block++) {
		int need_submit;
		struct dm_buffer *b;
		b = __bufio_new(c, block, NF_PREFETCH, &need_submit);
		if (unlikely(b != NULL)) {
			dm_bufio_unlock(c);

			if (need_submit)
				submit_io(b, READ, b->block, read_endio);
			dm_bufio_release(b);

			dm_bufio_cond_resched();

			if (!n_blocks)
				goto flush_plug;
			dm_bufio_lock(c);
		}

	}

	dm_bufio_unlock(c);

flush_plug:
	blk_finish_plug(&plug);
}
EXPORT_SYMBOL_GPL(dm_bufio_prefetch);

void dm_bufio_release(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	BUG_ON(!b->hold_count);

	b->hold_count--;
	if (!b->hold_count) {
		wake_up(&c->free_buffer_wait);

		/*
		 * If there were errors on the buffer, and the buffer is not
		 * to be written, free the buffer. There is no point in
		 * caching an invalid buffer.
		 */
		if ((b->read_error || b->write_error) &&
		    !test_bit(B_READING, &b->state) &&
		    !test_bit(B_WRITING, &b->state) &&
		    !test_bit(B_DIRTY, &b->state)) {
			__unlink_buffer(b);
			__free_buffer_wake(b);
		}
	}

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_release);

void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	BUG_ON(test_bit(B_READING, &b->state));

	if (!test_and_set_bit(B_DIRTY, &b->state))
		__relink_lru(b, LIST_DIRTY);

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);

void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
{
	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0);
	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);

/*
 * For performance, it is essential that the buffers are written asynchronously
 * and simultaneously (so that the block layer can merge the writes) and then
 * waited upon.
 *
 * Finally, we flush the hardware disk cache.
 */
int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
{
	int a, f;
	unsigned long buffers_processed = 0;
	struct dm_buffer *b, *tmp;

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0);

again:
	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		int dropped_lock = 0;

		if (buffers_processed < c->n_buffers[LIST_DIRTY])
			buffers_processed++;

		BUG_ON(test_bit(B_READING, &b->state));

		if (test_bit(B_WRITING, &b->state)) {
			if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
				dropped_lock = 1;
				b->hold_count++;
				dm_bufio_unlock(c);
				wait_on_bit(&b->state, B_WRITING,
					    do_io_schedule,
					    TASK_UNINTERRUPTIBLE);
				dm_bufio_lock(c);
				b->hold_count--;
			} else
				wait_on_bit(&b->state, B_WRITING,
					    do_io_schedule,
					    TASK_UNINTERRUPTIBLE);
		}

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state))
			__relink_lru(b, LIST_CLEAN);

		dm_bufio_cond_resched();

		/*
		 * If we dropped the lock, the list is no longer consistent,
		 * so we must restart the search.
		 *
		 * In the most common case, the buffer just processed is
		 * relinked to the clean list, so we won't loop scanning the
		 * same buffer again and again.
		 *
		 * This may livelock if there is another thread simultaneously
		 * dirtying buffers, so we count the number of buffers walked
		 * and if it exceeds the total number of buffers, it means that
		 * someone is doing some writes simultaneously with us. In
		 * this case, stop, dropping the lock.
		 */
		if (dropped_lock)
			goto again;
	}
	wake_up(&c->free_buffer_wait);
	dm_bufio_unlock(c);

	a = xchg(&c->async_write_error, 0);
	f = dm_bufio_issue_flush(c);
	if (a)
		return a;

	return f;
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);

/*
 * Use dm-io to send an empty barrier and flush the device.
 */
int dm_bufio_issue_flush(struct dm_bufio_client *c)
{
	struct dm_io_request io_req = {
		.bi_rw = WRITE_FLUSH,
		.mem.type = DM_IO_KMEM,
		.mem.ptr.addr = NULL,
		.client = c->dm_io,
	};
	struct dm_io_region io_reg = {
		.bdev = c->bdev,
		.sector = 0,
		.count = 0,
	};

	BUG_ON(dm_bufio_in_request());

	return dm_io(&io_req, 1, &io_reg, NULL);
}
EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);

/*
 * We first delete any other buffer that may be at that new location.
 *
 * Then, we write the buffer to the original location if it was dirty.
 *
 * Then, if we are the only one who is holding the buffer, relink the buffer
 * in the hash queue for the new location.
 *
 * If there was someone else holding the buffer, we write it to the new
 * location but do not relink it, because that other user needs to have the
 * buffer at the same place.
 */
void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
{
	struct dm_bufio_client *c = b->c;
	struct dm_buffer *new;

	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);

retry:
	new = __find(c, new_block);
	if (new) {
		if (new->hold_count) {
			__wait_for_free_buffer(c);
			goto retry;
		}

		/*
		 * FIXME: Is there any point waiting for a write that's going
		 * to be overwritten in a bit?
		 */
		__make_buffer_clean(new);
		__unlink_buffer(new);
		__free_buffer_wake(new);
	}

	BUG_ON(!b->hold_count);
	BUG_ON(test_bit(B_READING, &b->state));

	__write_dirty_buffer(b);
	if (b->hold_count == 1) {
		wait_on_bit(&b->state, B_WRITING,
			    do_io_schedule, TASK_UNINTERRUPTIBLE);
		set_bit(B_DIRTY, &b->state);
		__unlink_buffer(b);
		__link_buffer(b, new_block, LIST_DIRTY);
	} else {
		sector_t old_block;
		wait_on_bit_lock(&b->state, B_WRITING,
				 do_io_schedule, TASK_UNINTERRUPTIBLE);
		/*
		 * Relink buffer to "new_block" so that write_callback
		 * sees "new_block" as a block number.
		 * After the write, link the buffer back to old_block.
		 * All this must be done under the bufio lock, so that the
		 * block number change isn't visible to other threads.
		 */
		old_block = b->block;
		__unlink_buffer(b);
		__link_buffer(b, new_block, b->list_mode);
		submit_io(b, WRITE, new_block, write_endio);
		wait_on_bit(&b->state, B_WRITING,
			    do_io_schedule, TASK_UNINTERRUPTIBLE);
		__unlink_buffer(b);
		__link_buffer(b, old_block, b->list_mode);
	}

	dm_bufio_unlock(c);
	dm_bufio_release(b);
}
EXPORT_SYMBOL_GPL(dm_bufio_release_move);

unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
{
	return c->block_size;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);

sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
{
	return i_size_read(c->bdev->bd_inode) >>
	       (SECTOR_SHIFT + c->sectors_per_block_bits);
}
EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);

sector_t dm_bufio_get_block_number(struct dm_buffer *b)
{
	return b->block;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);

void *dm_bufio_get_block_data(struct dm_buffer *b)
{
	return b->data;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);

void *dm_bufio_get_aux_data(struct dm_buffer *b)
{
	return b + 1;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);

struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
{
	return b->c;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_client);

static void drop_buffers(struct dm_bufio_client *c)
{
	struct dm_buffer *b;
	int i;

	BUG_ON(dm_bufio_in_request());

	/*
	 * An optimization so that the buffers are not written one-by-one.
	 */
	dm_bufio_write_dirty_buffers_async(c);

	dm_bufio_lock(c);

	while ((b = __get_unclaimed_buffer(c)))
		__free_buffer_wake(b);

	for (i = 0; i < LIST_SIZE; i++)
		list_for_each_entry(b, &c->lru[i], lru_list)
			DMERR("leaked buffer %llx, hold count %u, list %d",
			      (unsigned long long)b->block, b->hold_count, i);

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(!list_empty(&c->lru[i]));

	dm_bufio_unlock(c);
}

/*
 * Test if the buffer is unused and too old, and commit it.
 * And if noio is set, we must not do any I/O because we hold
 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
 * rerouted to a different bufio client.
 */
static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
				unsigned long max_jiffies)
{
	if (jiffies - b->last_accessed < max_jiffies)
		return 1;

	if (!(gfp & __GFP_IO)) {
		if (test_bit(B_READING, &b->state) ||
		    test_bit(B_WRITING, &b->state) ||
		    test_bit(B_DIRTY, &b->state))
			return 1;
	}

	if (b->hold_count)
		return 1;

	__make_buffer_clean(b);
	__unlink_buffer(b);
	__free_buffer_wake(b);

	return 0;
}

static void __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
		   struct shrink_control *sc)
{
	int l;
	struct dm_buffer *b, *tmp;

	for (l = 0; l < LIST_SIZE; l++) {
		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list)
			if (!__cleanup_old_buffer(b, sc->gfp_mask, 0) &&
			    !--nr_to_scan)
				return;
		dm_bufio_cond_resched();
	}
}

static int shrink(struct shrinker *shrinker, struct shrink_control *sc)
{
	struct dm_bufio_client *c =
	    container_of(shrinker, struct dm_bufio_client, shrinker);
	unsigned long r;
	unsigned long nr_to_scan = sc->nr_to_scan;

	if (sc->gfp_mask & __GFP_IO)
		dm_bufio_lock(c);
	else if (!dm_bufio_trylock(c))
		return !nr_to_scan ? 0 : -1;

	if (nr_to_scan)
		__scan(c, nr_to_scan, sc);

	r = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
	if (r > INT_MAX)
		r = INT_MAX;

	dm_bufio_unlock(c);

	return r;
}

/*
 * Create the buffering interface
 */
struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
					       unsigned reserved_buffers, unsigned aux_size,
					       void (*alloc_callback)(struct dm_buffer *),
					       void (*write_callback)(struct dm_buffer *))
{
	int r;
	struct dm_bufio_client *c;
	unsigned i;

	BUG_ON(block_size < 1 << SECTOR_SHIFT ||
	       (block_size & (block_size - 1)));

	c = kmalloc(sizeof(*c), GFP_KERNEL);
	if (!c) {
		r = -ENOMEM;
		goto bad_client;
	}
	c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS);
	if (!c->cache_hash) {
		r = -ENOMEM;
		goto bad_hash;
	}

	c->bdev = bdev;
	c->block_size = block_size;
	c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT;
	c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ?
				  ffs(block_size) - 1 - PAGE_SHIFT : 0;
	c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ?
				  PAGE_SHIFT - (ffs(block_size) - 1) : 0);

	c->aux_size = aux_size;
	c->alloc_callback = alloc_callback;
	c->write_callback = write_callback;

	for (i = 0; i < LIST_SIZE; i++) {
		INIT_LIST_HEAD(&c->lru[i]);
		c->n_buffers[i] = 0;
	}

	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
		INIT_HLIST_HEAD(&c->cache_hash[i]);

	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->reserved_buffers);
	c->need_reserved_buffers = reserved_buffers;

	init_waitqueue_head(&c->free_buffer_wait);
	c->async_write_error = 0;

	c->dm_io = dm_io_client_create();
	if (IS_ERR(c->dm_io)) {
		r = PTR_ERR(c->dm_io);
		goto bad_dm_io;
	}

	mutex_lock(&dm_bufio_clients_lock);
	if (c->blocks_per_page_bits) {
		if (!DM_BUFIO_CACHE_NAME(c)) {
			DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size);
			if (!DM_BUFIO_CACHE_NAME(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}

		if (!DM_BUFIO_CACHE(c)) {
			DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c),
							      c->block_size,
							      c->block_size, 0, NULL);
			if (!DM_BUFIO_CACHE(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}
	}
	mutex_unlock(&dm_bufio_clients_lock);

	while (c->need_reserved_buffers) {
		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);

		if (!b) {
			r = -ENOMEM;
			goto bad_buffer;
		}
		__free_buffer_wake(b);
	}

	mutex_lock(&dm_bufio_clients_lock);
	dm_bufio_client_count++;
	list_add(&c->client_list, &dm_bufio_all_clients);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	c->shrinker.shrink = shrink;
	c->shrinker.seeks = 1;
	c->shrinker.batch = 0;
	register_shrinker(&c->shrinker);

	return c;

bad_buffer:
bad_cache:
	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}
	dm_io_client_destroy(c->dm_io);
bad_dm_io:
	vfree(c->cache_hash);
bad_hash:
	kfree(c);
bad_client:
	return ERR_PTR(r);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_create);

/*
 * Free the buffering interface.
 * It is required that there are no references on any buffers.
 */
void dm_bufio_client_destroy(struct dm_bufio_client *c)
{
	unsigned i;

	drop_buffers(c);

	unregister_shrinker(&c->shrinker);

	mutex_lock(&dm_bufio_clients_lock);

	list_del(&c->client_list);
	dm_bufio_client_count--;
	__cache_size_refresh();

	mutex_unlock(&dm_bufio_clients_lock);

	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
		BUG_ON(!hlist_empty(&c->cache_hash[i]));

	BUG_ON(c->need_reserved_buffers);

	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}

	for (i = 0; i < LIST_SIZE; i++)
		if (c->n_buffers[i])
			DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(c->n_buffers[i]);

	dm_io_client_destroy(c->dm_io);
	vfree(c->cache_hash);
	kfree(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);

static void cleanup_old_buffers(void)
{
	unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age);
	struct dm_bufio_client *c;

	if (max_age > ULONG_MAX / HZ)
		max_age = ULONG_MAX / HZ;

	mutex_lock(&dm_bufio_clients_lock);
	list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
		if (!dm_bufio_trylock(c))
			continue;

		while (!list_empty(&c->lru[LIST_CLEAN])) {
			struct dm_buffer *b;
			b = list_entry(c->lru[LIST_CLEAN].prev,
				       struct dm_buffer, lru_list);
			if (__cleanup_old_buffer(b, 0, max_age * HZ))
				break;
			dm_bufio_cond_resched();
		}

		dm_bufio_unlock(c);
		dm_bufio_cond_resched();
	}
	mutex_unlock(&dm_bufio_clients_lock);
}

static struct workqueue_struct *dm_bufio_wq;
static struct delayed_work dm_bufio_work;

static void work_fn(struct work_struct *w)
{
	cleanup_old_buffers();

	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);
}

/*----------------------------------------------------------------
 * Module setup
 *--------------------------------------------------------------*/

/*
 * This is called only once for the whole dm_bufio module.
 * It initializes the memory limit.
 */
static int __init dm_bufio_init(void)
{
	__u64 mem;

	memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
	memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);

	mem = (__u64)((totalram_pages - totalhigh_pages) *
		      DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;

	if (mem > ULONG_MAX)
		mem = ULONG_MAX;

#ifdef CONFIG_MMU
	/*
	 * Get the size of vmalloc space the same way as VMALLOC_TOTAL
	 * in fs/proc/internal.h
	 */
	if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
		mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
#endif

	dm_bufio_default_cache_size = mem;

	mutex_lock(&dm_bufio_clients_lock);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache");
	if (!dm_bufio_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);

	return 0;
}

/*
 * This is called once when unloading the dm_bufio module.
 */
static void __exit dm_bufio_exit(void)
{
	int bug = 0;
	int i;

	cancel_delayed_work_sync(&dm_bufio_work);
	destroy_workqueue(dm_bufio_wq);

	for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) {
		struct kmem_cache *kc = dm_bufio_caches[i];

		if (kc)
			kmem_cache_destroy(kc);
	}

	for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
		kfree(dm_bufio_cache_names[i]);

	if (dm_bufio_client_count) {
		DMCRIT("%s: dm_bufio_client_count leaked: %d",
		       __func__, dm_bufio_client_count);
		bug = 1;
	}

	if (dm_bufio_current_allocated) {
		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
		       __func__, dm_bufio_current_allocated);
		bug = 1;
	}

	if (dm_bufio_allocated_get_free_pages) {
		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
		       __func__, dm_bufio_allocated_get_free_pages);
		bug = 1;
	}

	if (dm_bufio_allocated_vmalloc) {
		DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
		       __func__, dm_bufio_allocated_vmalloc);
		bug = 1;
	}

	if (bug)
		BUG();
}

module_init(dm_bufio_init)
module_exit(dm_bufio_exit)

module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");

module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");

module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");

module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");

module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");

module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");

module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");

MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
MODULE_LICENSE("GPL");
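
/*
 * Illustrative usage sketch (not part of the driver): roughly how a caller,
 * such as a device-mapper target, might use the client API above to read,
 * modify and persist one metadata block.  The function name, the 4096-byte
 * block size and the choice of block 0 are assumptions made only for this
 * example; error handling is kept minimal on purpose.
 */
static int __maybe_unused dm_bufio_usage_example(struct block_device *bdev)
{
	struct dm_bufio_client *c;
	struct dm_buffer *b;
	void *data;
	int r = 0;

	/* One client per metadata device; no aux data, no callbacks. */
	c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL);
	if (IS_ERR(c))
		return PTR_ERR(c);

	/* Read block 0; on success the buffer is held until released. */
	data = dm_bufio_read(c, 0, &b);
	if (IS_ERR(data)) {
		r = PTR_ERR(data);
		goto out;
	}

	/* Modify the data and mark the buffer dirty before releasing it. */
	((char *)data)[0] = 0;
	dm_bufio_mark_buffer_dirty(b);
	dm_bufio_release(b);

	/* Write all dirty buffers and flush the device cache. */
	r = dm_bufio_write_dirty_buffers(c);
out:
	dm_bufio_client_destroy(c);
	return r;
}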