/*
 * Copyright (C) 2009-2011 Red Hat, Inc.
 *
 * Author: Mikulas Patocka <mpatocka@redhat.com>
 *
 * This file is released under the GPL.
 */

#include "dm-bufio.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/shrinker.h>
#include <linux/module.h>

#define DM_MSG_PREFIX "bufio"

/*
 * Memory management policy:
 *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
 *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
 *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
 *	Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
 *	dirty buffers.
 */
#define DM_BUFIO_MIN_BUFFERS		8

#define DM_BUFIO_MEMORY_PERCENT		2
#define DM_BUFIO_VMALLOC_PERCENT	25
#define DM_BUFIO_WRITEBACK_PERCENT	75

/*
 * Check buffer ages in this interval (seconds)
 */
#define DM_BUFIO_WORK_TIMER_SECS	10

/*
 * Free buffers when they are older than this (seconds)
 */
#define DM_BUFIO_DEFAULT_AGE_SECS	60

/*
 * The number of bvec entries that are embedded directly in the buffer.
 * If the chunk size is larger, dm-io is used to do the I/O.
 */
#define DM_BUFIO_INLINE_VECS		16

/*
 * Buffer hash
 */
#define DM_BUFIO_HASH_BITS	20
#define DM_BUFIO_HASH(block) \
	((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
	 ((1 << DM_BUFIO_HASH_BITS) - 1))

/*
 * Don't try to use kmem_cache_alloc for blocks larger than this.
 * For explanation, see alloc_buffer_data below.
 */
#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT	(PAGE_SIZE >> 1)
#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT	(PAGE_SIZE << (MAX_ORDER - 1))

/*
 * dm_buffer->list_mode
 */
#define LIST_CLEAN	0
#define LIST_DIRTY	1
#define LIST_SIZE	2

/*
 * Linking of buffers:
 *	All buffers are linked to cache_hash with their hash_list field.
 *
 *	Clean buffers that are not being written (B_WRITING not set)
 *	are linked to lru[LIST_CLEAN] with their lru_list field.
 *
 *	Dirty and clean buffers that are being written are linked to
 *	lru[LIST_DIRTY] with their lru_list field. When the write
 *	finishes, the buffer cannot be relinked immediately (because we
 *	are in an interrupt context and relinking requires process
 *	context), so some clean-not-writing buffers can be held on
 *	dirty_lru too. They are later added to lru in the process
 *	context.
 */
struct dm_bufio_client {
	struct mutex lock;

	struct list_head lru[LIST_SIZE];
	unsigned long n_buffers[LIST_SIZE];

	struct block_device *bdev;
	unsigned block_size;
	unsigned char sectors_per_block_bits;
	unsigned char pages_per_block_bits;
	unsigned char blocks_per_page_bits;
	unsigned aux_size;
	void (*alloc_callback)(struct dm_buffer *);
	void (*write_callback)(struct dm_buffer *);

	struct dm_io_client *dm_io;

	struct list_head reserved_buffers;
	unsigned need_reserved_buffers;

	struct hlist_head *cache_hash;
	wait_queue_head_t free_buffer_wait;

	int async_write_error;

	struct list_head client_list;
	struct shrinker shrinker;
};

/*
 * Buffer state bits.
 */
#define B_READING	0
#define B_WRITING	1
#define B_DIRTY		2

/*
 * Describes how the block was allocated:
 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 * See the comment at alloc_buffer_data below.
 */
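/*
 * For illustration (a sketch, assuming 4KiB pages and MAX_ORDER == 11, so
 * DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT == 2KiB and
 * DM_BUFIO_BLOCK_SIZE_GFP_LIMIT == 4MiB), alloc_buffer_data below picks:
 *
 *	block_size <= 2KiB			-> DATA_MODE_SLAB
 *	block_size <= 4MiB, __GFP_NORETRY set	-> DATA_MODE_GET_FREE_PAGES
 *	everything else				-> DATA_MODE_VMALLOC
 */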
enum data_mode {
	DATA_MODE_SLAB = 0,
	DATA_MODE_GET_FREE_PAGES = 1,
	DATA_MODE_VMALLOC = 2,
	DATA_MODE_LIMIT = 3
};

struct dm_buffer {
	struct hlist_node hash_list;
	struct list_head lru_list;
	sector_t block;
	void *data;
	enum data_mode data_mode;
	unsigned char list_mode;		/* LIST_* */
	unsigned hold_count;
	int read_error;
	int write_error;
	unsigned long state;
	unsigned long last_accessed;
	struct dm_bufio_client *c;
	struct bio bio;
	struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
};

/*----------------------------------------------------------------*/

static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];

static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
{
	unsigned ret = c->blocks_per_page_bits - 1;

	BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches));

	return ret;
}

#define DM_BUFIO_CACHE(c)	(dm_bufio_caches[dm_bufio_cache_index(c)])
#define DM_BUFIO_CACHE_NAME(c)	(dm_bufio_cache_names[dm_bufio_cache_index(c)])

#define dm_bufio_in_request()	(!!current->bio_list)

static void dm_bufio_lock(struct dm_bufio_client *c)
{
	mutex_lock_nested(&c->lock, dm_bufio_in_request());
}

static int dm_bufio_trylock(struct dm_bufio_client *c)
{
	return mutex_trylock(&c->lock);
}

static void dm_bufio_unlock(struct dm_bufio_client *c)
{
	mutex_unlock(&c->lock);
}

/*
 * FIXME Move to sched.h?
 */
#ifdef CONFIG_PREEMPT_VOLUNTARY
# define dm_bufio_cond_resched()		\
do {						\
	if (unlikely(need_resched()))		\
		_cond_resched();		\
} while (0)
#else
# define dm_bufio_cond_resched()	do { } while (0)
#endif

/*----------------------------------------------------------------*/

/*
 * Default cache size: available memory divided by the ratio.
 */
static unsigned long dm_bufio_default_cache_size;

/*
 * Total cache size set by the user.
 */
static unsigned long dm_bufio_cache_size;

/*
 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
 * at any time. If it disagrees, the user has changed the cache size.
 */
static unsigned long dm_bufio_cache_size_latch;

static DEFINE_SPINLOCK(param_spinlock);

/*
 * Buffers are freed after this timeout
 */
static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;

static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
static unsigned long dm_bufio_allocated_get_free_pages;
static unsigned long dm_bufio_allocated_vmalloc;
static unsigned long dm_bufio_current_allocated;

/*----------------------------------------------------------------*/

/*
 * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
 */
static unsigned long dm_bufio_cache_size_per_client;

/*
 * The current number of clients.
 */
static int dm_bufio_client_count;

/*
 * The list of all clients.
 */
static LIST_HEAD(dm_bufio_all_clients);

/*
 * This mutex protects dm_bufio_cache_size_latch,
 * dm_bufio_cache_size_per_client and dm_bufio_client_count
 */
static DEFINE_MUTEX(dm_bufio_clients_lock);

/*----------------------------------------------------------------*/

static void adjust_total_allocated(enum data_mode data_mode, long diff)
{
	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
		&dm_bufio_allocated_kmem_cache,
		&dm_bufio_allocated_get_free_pages,
		&dm_bufio_allocated_vmalloc,
	};

	spin_lock(&param_spinlock);

	*class_ptr[data_mode] += diff;

	dm_bufio_current_allocated += diff;

	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
		dm_bufio_peak_allocated = dm_bufio_current_allocated;

	spin_unlock(&param_spinlock);
}

/*
 * Change the number of clients and recalculate per-client limit.
 */
static void __cache_size_refresh(void)
{
	BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
	BUG_ON(dm_bufio_client_count < 0);

	dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size);

	/*
	 * Use default if set to 0 and report the actual cache size used.
	 */
	if (!dm_bufio_cache_size_latch) {
		(void)cmpxchg(&dm_bufio_cache_size, 0,
			      dm_bufio_default_cache_size);
		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
	}

	dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
					 (dm_bufio_client_count ? : 1);
}

/*
 * Allocating buffer data.
 *
 * Small buffers are allocated with kmem_cache, to use space optimally.
 *
 * For large buffers, we choose between get_free_pages and vmalloc.
 * Each has advantages and disadvantages.
 *
 * __get_free_pages can randomly fail if the memory is fragmented.
 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
 * as low as 128M) so using it for caching is not appropriate.
 *
 * If the allocation may fail we use __get_free_pages. Memory fragmentation
 * won't have a fatal effect here, but it just causes flushes of some other
 * buffers and more I/O will be performed. Don't use __get_free_pages if it
 * always fails (i.e. order >= MAX_ORDER).
 *
 * If the allocation shouldn't fail we use __vmalloc. This is only for the
 * initial reserve allocation, so there's no risk of wasting all vmalloc
 * space.
 */
static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
			       enum data_mode *data_mode)
{
	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
		*data_mode = DATA_MODE_SLAB;
		return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
	}

	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT &&
	    gfp_mask & __GFP_NORETRY) {
		*data_mode = DATA_MODE_GET_FREE_PAGES;
		return (void *)__get_free_pages(gfp_mask,
						c->pages_per_block_bits);
	}

	*data_mode = DATA_MODE_VMALLOC;
	return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
}

/*
 * Free buffer's data.
 */
static void free_buffer_data(struct dm_bufio_client *c,
			     void *data, enum data_mode data_mode)
{
	switch (data_mode) {
	case DATA_MODE_SLAB:
		kmem_cache_free(DM_BUFIO_CACHE(c), data);
		break;

	case DATA_MODE_GET_FREE_PAGES:
		free_pages((unsigned long)data, c->pages_per_block_bits);
		break;

	case DATA_MODE_VMALLOC:
		vfree(data);
		break;

	default:
		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
		       data_mode);
		BUG();
	}
}

/*
 * Allocate buffer and its data.
 */
static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
{
	struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size,
				      gfp_mask);

	if (!b)
		return NULL;

	b->c = c;

	b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
	if (!b->data) {
		kfree(b);
		return NULL;
	}

	adjust_total_allocated(b->data_mode, (long)c->block_size);

	return b;
}

/*
 * Free buffer and its data.
 */
static void free_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	adjust_total_allocated(b->data_mode, -(long)c->block_size);

	free_buffer_data(c, b->data, b->data_mode);
	kfree(b);
}

/*
 * Link buffer to the hash list and clean or dirty queue.
 */
static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
{
	struct dm_bufio_client *c = b->c;

	c->n_buffers[dirty]++;
	b->block = block;
	b->list_mode = dirty;
	list_add(&b->lru_list, &c->lru[dirty]);
	hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
	b->last_accessed = jiffies;
}

/*
 * Unlink buffer from the hash list and dirty or clean queue.
 */
static void __unlink_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	hlist_del(&b->hash_list);
	list_del(&b->lru_list);
}

/*
 * Place the buffer at the head of the dirty or clean LRU queue.
 */
static void __relink_lru(struct dm_buffer *b, int dirty)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	c->n_buffers[dirty]++;
	b->list_mode = dirty;
	list_move(&b->lru_list, &c->lru[dirty]);
}

/*----------------------------------------------------------------
 * Submit I/O on the buffer.
 *
 * The bio interface is faster, but it has some problems:
 *	the vector list is limited (increasing this limit increases
 *	memory-consumption per buffer, so it is not viable);
 *
 *	the memory must be direct-mapped, not vmalloced;
 *
 *	the I/O driver can reject requests spuriously if it thinks that
 *	the requests are too big for the device or if they cross a
 *	controller-defined memory boundary.
 *
 * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
 * it is not vmalloced, try using the bio interface.
 *
 * If the buffer is big, if it is vmalloced or if the underlying device
 * rejects the bio because it is too large, use the dm-io layer to do the
 * I/O. The dm-io layer splits the I/O into multiple requests, avoiding
 * the above shortcomings.
 *--------------------------------------------------------------*/
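/*
 * Concretely (a sketch, assuming 4KiB pages): with DM_BUFIO_INLINE_VECS
 * == 16, a slab- or page-allocated buffer of up to 64KiB goes through the
 * inline bio in use_inline_bio; a larger or vmalloc'd buffer goes through
 * use_dmio. See submit_io below for the actual test.
 */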
/*
 * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
 * that the request was handled directly with the bio interface.
 */
static void dmio_complete(unsigned long error, void *context)
{
	struct dm_buffer *b = context;

	b->bio.bi_end_io(&b->bio, error ? -EIO : 0);
}

static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
		     bio_end_io_t *end_io)
{
	int r;
	struct dm_io_request io_req = {
		.bi_rw = rw,
		.notify.fn = dmio_complete,
		.notify.context = b,
		.client = b->c->dm_io,
	};
	struct dm_io_region region = {
		.bdev = b->c->bdev,
		.sector = block << b->c->sectors_per_block_bits,
		.count = b->c->block_size >> SECTOR_SHIFT,
	};

	if (b->data_mode != DATA_MODE_VMALLOC) {
		io_req.mem.type = DM_IO_KMEM;
		io_req.mem.ptr.addr = b->data;
	} else {
		io_req.mem.type = DM_IO_VMA;
		io_req.mem.ptr.vma = b->data;
	}

	b->bio.bi_end_io = end_io;

	r = dm_io(&io_req, 1, &region, NULL);
	if (r)
		end_io(&b->bio, r);
}

static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
			   bio_end_io_t *end_io)
{
	char *ptr;
	int len;

	bio_init(&b->bio);
	b->bio.bi_io_vec = b->bio_vec;
	b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
	b->bio.bi_sector = block << b->c->sectors_per_block_bits;
	b->bio.bi_bdev = b->c->bdev;
	b->bio.bi_end_io = end_io;

	/*
	 * We assume that if len >= PAGE_SIZE ptr is page-aligned.
	 * If len < PAGE_SIZE the buffer doesn't cross page boundary.
	 */
	ptr = b->data;
	len = b->c->block_size;

	if (len >= PAGE_SIZE)
		BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
	else
		BUG_ON((unsigned long)ptr & (len - 1));

	do {
		if (!bio_add_page(&b->bio, virt_to_page(ptr),
				  len < PAGE_SIZE ? len : PAGE_SIZE,
				  virt_to_phys(ptr) & (PAGE_SIZE - 1))) {
			BUG_ON(b->c->block_size <= PAGE_SIZE);
			use_dmio(b, rw, block, end_io);
			return;
		}

		len -= PAGE_SIZE;
		ptr += PAGE_SIZE;
	} while (len > 0);

	submit_bio(rw, &b->bio);
}

static void submit_io(struct dm_buffer *b, int rw, sector_t block,
		      bio_end_io_t *end_io)
{
	if (rw == WRITE && b->c->write_callback)
		b->c->write_callback(b);

	if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE &&
	    b->data_mode != DATA_MODE_VMALLOC)
		use_inline_bio(b, rw, block, end_io);
	else
		use_dmio(b, rw, block, end_io);
}

/*----------------------------------------------------------------
 * Writing dirty buffers
 *--------------------------------------------------------------*/

/*
 * The endio routine for write.
 *
 * Set the error, clear the B_WRITING bit and wake anyone who was waiting
 * on it.
 */
static void write_endio(struct bio *bio, int error)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->write_error = error;
	if (unlikely(error)) {
		struct dm_bufio_client *c = b->c;
		(void)cmpxchg(&c->async_write_error, 0, error);
	}

	BUG_ON(!test_bit(B_WRITING, &b->state));

	smp_mb__before_clear_bit();
	clear_bit(B_WRITING, &b->state);
	smp_mb__after_clear_bit();

	wake_up_bit(&b->state, B_WRITING);
}

/*
 * This function is called when wait_on_bit is actually waiting.
 */
static int do_io_schedule(void *word)
{
	io_schedule();

	return 0;
}
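/*
 * Usage pattern (a summary of what the callers below do, not new logic):
 * a waiter sleeps with
 *
 *	wait_on_bit(&b->state, B_WRITING, do_io_schedule,
 *		    TASK_UNINTERRUPTIBLE);
 *
 * and write_endio above pairs with it by clearing B_WRITING (with memory
 * barriers) and calling wake_up_bit(&b->state, B_WRITING).
 */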
/*
 * Initiate a write on a dirty buffer, but don't wait for it.
 *
 * - If the buffer is not dirty, exit.
 * - If there is some previous write going on, wait for it to finish (we
 *   can't have two writes on the same buffer simultaneously).
 * - Submit our write and don't wait on it. We set B_WRITING indicating
 *   that there is a write in progress.
 */
static void __write_dirty_buffer(struct dm_buffer *b)
{
	if (!test_bit(B_DIRTY, &b->state))
		return;

	clear_bit(B_DIRTY, &b->state);
	wait_on_bit_lock(&b->state, B_WRITING,
			 do_io_schedule, TASK_UNINTERRUPTIBLE);

	submit_io(b, WRITE, b->block, write_endio);
}

/*
 * Wait until any activity on the buffer finishes. Possibly write the
 * buffer if it is dirty. When this function finishes, there is no I/O
 * running on the buffer and the buffer is not dirty.
 */
static void __make_buffer_clean(struct dm_buffer *b)
{
	BUG_ON(b->hold_count);

	if (!b->state)	/* fast case */
		return;

	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
	__write_dirty_buffer(b);
	wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
}

/*
 * Find some buffer that is not held by anybody, clean it, unlink it and
 * return it.
 */
static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
{
	struct dm_buffer *b;

	list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
		BUG_ON(test_bit(B_WRITING, &b->state));
		BUG_ON(test_bit(B_DIRTY, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	return NULL;
}

/*
 * Wait until some other thread frees a buffer or releases the hold count
 * on some buffer.
 *
 * This function is entered with c->lock held, drops it and regains it
 * before exiting.
 */
static void __wait_for_free_buffer(struct dm_bufio_client *c)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&c->free_buffer_wait, &wait);
	set_task_state(current, TASK_UNINTERRUPTIBLE);
	dm_bufio_unlock(c);

	io_schedule();

	set_task_state(current, TASK_RUNNING);
	remove_wait_queue(&c->free_buffer_wait, &wait);

	dm_bufio_lock(c);
}

enum new_flag {
	NF_FRESH = 0,
	NF_READ = 1,
	NF_GET = 2,
	NF_PREFETCH = 3
};
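/*
 * How the flags above behave in __bufio_new (a summary of the code below,
 * for orientation):
 *
 *	NF_FRESH:    allocate the block without reading it from disk
 *	NF_READ:     read the block from disk if it is not cached
 *	NF_GET:      only return the block if it is already cached
 *	NF_PREFETCH: start a read, but do not wait for it to finish
 */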
/*
 * Allocate a new buffer. If the allocation is not possible, wait until
 * some other thread frees a buffer.
 *
 * May drop the lock and regain it.
 */
static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
{
	struct dm_buffer *b;

	/*
	 * dm-bufio is resistant to allocation failures (it just keeps
	 * one buffer reserved in case all the allocations fail).
	 * So set flags to not try too hard:
	 *	GFP_NOIO: don't recurse into the I/O layer
	 *	__GFP_NORETRY: don't retry and rather return failure
	 *	__GFP_NOMEMALLOC: don't use emergency reserves
	 *	__GFP_NOWARN: don't print a warning in case of failure
	 *
	 * For debugging, if we set the cache size to 1, no new buffers will
	 * be allocated.
	 */
	while (1) {
		if (dm_bufio_cache_size_latch != 1) {
			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			if (b)
				return b;
		}

		if (nf == NF_PREFETCH)
			return NULL;

		if (!list_empty(&c->reserved_buffers)) {
			b = list_entry(c->reserved_buffers.next,
				       struct dm_buffer, lru_list);
			list_del(&b->lru_list);
			c->need_reserved_buffers++;

			return b;
		}

		b = __get_unclaimed_buffer(c);
		if (b)
			return b;

		__wait_for_free_buffer(c);
	}
}

static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
{
	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);

	if (!b)
		return NULL;

	if (c->alloc_callback)
		c->alloc_callback(b);

	return b;
}

/*
 * Free a buffer and wake other threads waiting for free buffers.
 */
static void __free_buffer_wake(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	if (!c->need_reserved_buffers)
		free_buffer(b);
	else {
		list_add(&b->lru_list, &c->reserved_buffers);
		c->need_reserved_buffers--;
	}

	wake_up(&c->free_buffer_wait);
}

static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait)
{
	struct dm_buffer *b, *tmp;

	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state)) {
			__relink_lru(b, LIST_CLEAN);
			continue;
		}

		if (no_wait && test_bit(B_WRITING, &b->state))
			return;

		__write_dirty_buffer(b);
		dm_bufio_cond_resched();
	}
}

/*
 * Get writeback threshold and buffer limit for a given client.
 */
static void __get_memory_limit(struct dm_bufio_client *c,
			       unsigned long *threshold_buffers,
			       unsigned long *limit_buffers)
{
	unsigned long buffers;

	if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) {
		mutex_lock(&dm_bufio_clients_lock);
		__cache_size_refresh();
		mutex_unlock(&dm_bufio_clients_lock);
	}

	buffers = dm_bufio_cache_size_per_client >>
		  (c->sectors_per_block_bits + SECTOR_SHIFT);

	if (buffers < DM_BUFIO_MIN_BUFFERS)
		buffers = DM_BUFIO_MIN_BUFFERS;

	*limit_buffers = buffers;
	*threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
}

/*
 * Check if we're over the watermark.
 * If we are over limit_buffers, free unheld buffers until we get under
 * the limit. If we are over threshold_buffers, start background writeback
 * of dirty buffers.
 */
static void __check_watermark(struct dm_bufio_client *c)
{
	unsigned long threshold_buffers, limit_buffers;

	__get_memory_limit(c, &threshold_buffers, &limit_buffers);

	while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
	       limit_buffers) {

		struct dm_buffer *b = __get_unclaimed_buffer(c);

		if (!b)
			return;

		__free_buffer_wake(b);
		dm_bufio_cond_resched();
	}

	if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
		__write_dirty_buffers_async(c, 1);
}
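/*
 * Worked example (a sketch; the numbers are illustrative only): with
 * dm_bufio_cache_size_per_client == 64MiB and 4KiB blocks
 * (sectors_per_block_bits == 3, SECTOR_SHIFT == 9), __get_memory_limit
 * yields limit_buffers == 64Mi >> 12 == 16384 and threshold_buffers ==
 * 16384 * 75 / 100 == 12288 buffers.
 */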
/*
 * Find a buffer in the hash.
 */
static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
{
	struct dm_buffer *b;
	struct hlist_node *hn;

	hlist_for_each_entry(b, hn, &c->cache_hash[DM_BUFIO_HASH(block)],
			     hash_list) {
		dm_bufio_cond_resched();
		if (b->block == block)
			return b;
	}

	return NULL;
}

/*----------------------------------------------------------------
 * Getting a buffer
 *--------------------------------------------------------------*/

static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
				     enum new_flag nf, int *need_submit)
{
	struct dm_buffer *b, *new_b = NULL;

	*need_submit = 0;

	b = __find(c, block);
	if (b)
		goto found_buffer;

	if (nf == NF_GET)
		return NULL;

	new_b = __alloc_buffer_wait(c, nf);
	if (!new_b)
		return NULL;

	/*
	 * We've had a period where the mutex was unlocked, so we need to
	 * recheck the hash table.
	 */
	b = __find(c, block);
	if (b) {
		__free_buffer_wake(new_b);
		goto found_buffer;
	}

	__check_watermark(c);

	b = new_b;
	b->hold_count = 1;
	b->read_error = 0;
	b->write_error = 0;
	__link_buffer(b, block, LIST_CLEAN);

	if (nf == NF_FRESH) {
		b->state = 0;
		return b;
	}

	b->state = 1 << B_READING;
	*need_submit = 1;

	return b;

found_buffer:
	if (nf == NF_PREFETCH)
		return NULL;
	/*
	 * Note: it is essential that we don't wait for the buffer to be
	 * read if the dm_bufio_get function is used. Both dm_bufio_get and
	 * dm_bufio_prefetch can be used in the driver request routine.
	 * If the user called both dm_bufio_prefetch and dm_bufio_get on
	 * the same buffer, it would deadlock if we waited.
	 */
	if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
		return NULL;

	b->hold_count++;
	__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
		     test_bit(B_WRITING, &b->state));
	return b;
}

/*
 * The endio routine for reading: set the error, clear the bit and wake up
 * anyone waiting on the buffer.
 */
static void read_endio(struct bio *bio, int error)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->read_error = error;

	BUG_ON(!test_bit(B_READING, &b->state));

	smp_mb__before_clear_bit();
	clear_bit(B_READING, &b->state);
	smp_mb__after_clear_bit();

	wake_up_bit(&b->state, B_READING);
}
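/*
 * The read path in short (a summary of the functions above and below):
 * __bufio_new sets b->state = 1 << B_READING and *need_submit for an
 * uncached block; new_read then calls submit_io(b, READ, ..., read_endio)
 * and sleeps in wait_on_bit until read_endio clears B_READING.
 */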
/*
 * A common routine for dm_bufio_new and dm_bufio_read. Operation of these
 * functions is similar except that dm_bufio_new doesn't read the
 * buffer from the disk (assuming that the caller overwrites all the data
 * and uses dm_bufio_mark_buffer_dirty to write new data back).
 */
static void *new_read(struct dm_bufio_client *c, sector_t block,
		      enum new_flag nf, struct dm_buffer **bp)
{
	int need_submit;
	struct dm_buffer *b;

	dm_bufio_lock(c);
	b = __bufio_new(c, block, nf, &need_submit);
	dm_bufio_unlock(c);

	if (!b)
		return b;

	if (need_submit)
		submit_io(b, READ, b->block, read_endio);

	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);

	if (b->read_error) {
		int error = b->read_error;

		dm_bufio_release(b);

		return ERR_PTR(error);
	}

	*bp = b;

	return b->data;
}

void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	return new_read(c, block, NF_GET, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_get);

void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
		    struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_READ, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_read);

void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_FRESH, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_new);

void dm_bufio_prefetch(struct dm_bufio_client *c,
		       sector_t block, unsigned n_blocks)
{
	struct blk_plug plug;

	blk_start_plug(&plug);
	dm_bufio_lock(c);

	for (; n_blocks--; block++) {
		int need_submit;
		struct dm_buffer *b;
		b = __bufio_new(c, block, NF_PREFETCH, &need_submit);
		if (unlikely(b != NULL)) {
			dm_bufio_unlock(c);

			if (need_submit)
				submit_io(b, READ, b->block, read_endio);
			dm_bufio_release(b);

			dm_bufio_cond_resched();

			if (!n_blocks)
				goto flush_plug;
			dm_bufio_lock(c);
		}

	}

	dm_bufio_unlock(c);

flush_plug:
	blk_finish_plug(&plug);
}
EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
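/*
 * A typical non-blocking pattern in a driver request routine (a sketch
 * with a hypothetical caller; only the dm_bufio_* calls are real):
 *
 *	dm_bufio_prefetch(c, block, 1);		(start the read, don't wait)
 *	...
 *	data = dm_bufio_get(c, block, &b);	(NULL if not cached yet)
 *	if (data) {
 *		use_the_data(data);		(hypothetical helper)
 *		dm_bufio_release(b);
 *	}
 */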
void dm_bufio_release(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	BUG_ON(!b->hold_count);

	b->hold_count--;
	if (!b->hold_count) {
		wake_up(&c->free_buffer_wait);

		/*
		 * If there were errors on the buffer, and the buffer is not
		 * to be written, free the buffer. There is no point in
		 * caching an invalid buffer.
		 */
		if ((b->read_error || b->write_error) &&
		    !test_bit(B_READING, &b->state) &&
		    !test_bit(B_WRITING, &b->state) &&
		    !test_bit(B_DIRTY, &b->state)) {
			__unlink_buffer(b);
			__free_buffer_wake(b);
		}
	}

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_release);

void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	BUG_ON(test_bit(B_READING, &b->state));

	if (!test_and_set_bit(B_DIRTY, &b->state))
		__relink_lru(b, LIST_DIRTY);

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);

void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
{
	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0);
	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);

/*
 * For performance, it is essential that the buffers are written asynchronously
 * and simultaneously (so that the block layer can merge the writes) and then
 * waited upon.
 *
 * Finally, we flush the hardware disk cache.
 */
int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
{
	int a, f;
	unsigned long buffers_processed = 0;
	struct dm_buffer *b, *tmp;

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0);

again:
	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		int dropped_lock = 0;

		if (buffers_processed < c->n_buffers[LIST_DIRTY])
			buffers_processed++;

		BUG_ON(test_bit(B_READING, &b->state));

		if (test_bit(B_WRITING, &b->state)) {
			if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
				dropped_lock = 1;
				b->hold_count++;
				dm_bufio_unlock(c);
				wait_on_bit(&b->state, B_WRITING,
					    do_io_schedule,
					    TASK_UNINTERRUPTIBLE);
				dm_bufio_lock(c);
				b->hold_count--;
			} else
				wait_on_bit(&b->state, B_WRITING,
					    do_io_schedule,
					    TASK_UNINTERRUPTIBLE);
		}

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state))
			__relink_lru(b, LIST_CLEAN);

		dm_bufio_cond_resched();

		/*
		 * If we dropped the lock, the list is no longer consistent,
		 * so we must restart the search.
		 *
		 * In the most common case, the buffer just processed is
		 * relinked to the clean list, so we won't loop scanning the
		 * same buffer again and again.
		 *
		 * This may livelock if there is another thread simultaneously
		 * dirtying buffers, so we count the number of buffers walked
		 * and if it exceeds the total number of buffers, it means that
		 * someone is doing some writes simultaneously with us. In
		 * this case, stop, dropping the lock.
		 */
		if (dropped_lock)
			goto again;
	}
	wake_up(&c->free_buffer_wait);
	dm_bufio_unlock(c);

	a = xchg(&c->async_write_error, 0);
	f = dm_bufio_issue_flush(c);
	if (a)
		return a;

	return f;
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);

/*
 * Use dm-io to send an empty barrier to flush the device.
 */
int dm_bufio_issue_flush(struct dm_bufio_client *c)
{
	struct dm_io_request io_req = {
		.bi_rw = REQ_FLUSH,
		.mem.type = DM_IO_KMEM,
		.mem.ptr.addr = NULL,
		.client = c->dm_io,
	};
	struct dm_io_region io_reg = {
		.bdev = c->bdev,
		.sector = 0,
		.count = 0,
	};

	BUG_ON(dm_bufio_in_request());

	return dm_io(&io_req, 1, &io_reg, NULL);
}
EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);

/*
 * We first delete any other buffer that may be at that new location.
 *
 * Then, we write the buffer to the original location if it was dirty.
 *
 * Then, if we are the only one who is holding the buffer, relink the buffer
 * in the hash queue for the new location.
 *
 * If there was someone else holding the buffer, we write it to the new
 * location but do not relink it, because that other user needs to have
 * the buffer at the same place.
 */
void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
{
	struct dm_bufio_client *c = b->c;
	struct dm_buffer *new;

	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);

retry:
	new = __find(c, new_block);
	if (new) {
		if (new->hold_count) {
			__wait_for_free_buffer(c);
			goto retry;
		}

		/*
		 * FIXME: Is there any point waiting for a write that's going
		 * to be overwritten in a bit?
		 */
		__make_buffer_clean(new);
		__unlink_buffer(new);
		__free_buffer_wake(new);
	}

	BUG_ON(!b->hold_count);
	BUG_ON(test_bit(B_READING, &b->state));

	__write_dirty_buffer(b);
	if (b->hold_count == 1) {
		wait_on_bit(&b->state, B_WRITING,
			    do_io_schedule, TASK_UNINTERRUPTIBLE);
		set_bit(B_DIRTY, &b->state);
		__unlink_buffer(b);
		__link_buffer(b, new_block, LIST_DIRTY);
	} else {
		sector_t old_block;
		wait_on_bit_lock(&b->state, B_WRITING,
				 do_io_schedule, TASK_UNINTERRUPTIBLE);
		/*
		 * Relink buffer to "new_block" so that write_callback
		 * sees "new_block" as the block number.
		 * After the write, link the buffer back to old_block.
		 * All this must be done under the bufio lock, so that the
		 * block number change isn't visible to other threads.
		 */
		old_block = b->block;
		__unlink_buffer(b);
		__link_buffer(b, new_block, b->list_mode);
		submit_io(b, WRITE, new_block, write_endio);
		wait_on_bit(&b->state, B_WRITING,
			    do_io_schedule, TASK_UNINTERRUPTIBLE);
		__unlink_buffer(b);
		__link_buffer(b, old_block, b->list_mode);
	}

	dm_bufio_unlock(c);
	dm_bufio_release(b);
}
EXPORT_SYMBOL_GPL(dm_bufio_release_move);

unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
{
	return c->block_size;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);

sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
{
	return i_size_read(c->bdev->bd_inode) >>
	       (SECTOR_SHIFT + c->sectors_per_block_bits);
}
EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);

sector_t dm_bufio_get_block_number(struct dm_buffer *b)
{
	return b->block;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);

void *dm_bufio_get_block_data(struct dm_buffer *b)
{
	return b->data;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);

void *dm_bufio_get_aux_data(struct dm_buffer *b)
{
	return b + 1;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);

struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
{
	return b->c;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_client);

static void drop_buffers(struct dm_bufio_client *c)
{
	struct dm_buffer *b;
	int i;

	BUG_ON(dm_bufio_in_request());

	/*
	 * An optimization so that the buffers are not written one-by-one.
	 */
	dm_bufio_write_dirty_buffers_async(c);

	dm_bufio_lock(c);

	while ((b = __get_unclaimed_buffer(c)))
		__free_buffer_wake(b);

	for (i = 0; i < LIST_SIZE; i++)
		list_for_each_entry(b, &c->lru[i], lru_list)
			DMERR("leaked buffer %llx, hold count %u, list %d",
			      (unsigned long long)b->block, b->hold_count, i);

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(!list_empty(&c->lru[i]));

	dm_bufio_unlock(c);
}

/*
 * Test if the buffer is unused and too old, and commit it.
 * And if noio is set, we must not do any I/O because we hold
 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
 * rerouted to a different bufio client.
 */
static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
				unsigned long max_jiffies)
{
	if (jiffies - b->last_accessed < max_jiffies)
		return 1;

	if (!(gfp & __GFP_IO)) {
		if (test_bit(B_READING, &b->state) ||
		    test_bit(B_WRITING, &b->state) ||
		    test_bit(B_DIRTY, &b->state))
			return 1;
	}

	if (b->hold_count)
		return 1;

	__make_buffer_clean(b);
	__unlink_buffer(b);
	__free_buffer_wake(b);

	return 0;
}

static void __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
		   struct shrink_control *sc)
{
	int l;
	struct dm_buffer *b, *tmp;

	for (l = 0; l < LIST_SIZE; l++) {
		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list)
			if (!__cleanup_old_buffer(b, sc->gfp_mask, 0) &&
			    !--nr_to_scan)
				return;
		dm_bufio_cond_resched();
	}
}

static int shrink(struct shrinker *shrinker, struct shrink_control *sc)
{
	struct dm_bufio_client *c =
	    container_of(shrinker, struct dm_bufio_client, shrinker);
	unsigned long r;
	unsigned long nr_to_scan = sc->nr_to_scan;

	if (sc->gfp_mask & __GFP_IO)
		dm_bufio_lock(c);
	else if (!dm_bufio_trylock(c))
		return !nr_to_scan ? 0 : -1;

	if (nr_to_scan)
		__scan(c, nr_to_scan, sc);

	r = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
	if (r > INT_MAX)
		r = INT_MAX;

	dm_bufio_unlock(c);

	return r;
}
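/*
 * Note on the return convention above (as this kernel's shrinker API is
 * understood here): the core may call ->shrink with sc->nr_to_scan == 0
 * just to query the number of freeable objects, and a negative return
 * tells it to back off for now (e.g. when the trylock fails).
 */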
/*
 * Create the buffering interface
 */
struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
					       unsigned reserved_buffers, unsigned aux_size,
					       void (*alloc_callback)(struct dm_buffer *),
					       void (*write_callback)(struct dm_buffer *))
{
	int r;
	struct dm_bufio_client *c;
	unsigned i;

	BUG_ON(block_size < 1 << SECTOR_SHIFT ||
	       (block_size & (block_size - 1)));

	c = kmalloc(sizeof(*c), GFP_KERNEL);
	if (!c) {
		r = -ENOMEM;
		goto bad_client;
	}
	c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS);
	if (!c->cache_hash) {
		r = -ENOMEM;
		goto bad_hash;
	}

	c->bdev = bdev;
	c->block_size = block_size;
	c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT;
	c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ?
				  ffs(block_size) - 1 - PAGE_SHIFT : 0;
	c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ?
				  PAGE_SHIFT - (ffs(block_size) - 1) : 0);

	c->aux_size = aux_size;
	c->alloc_callback = alloc_callback;
	c->write_callback = write_callback;

	for (i = 0; i < LIST_SIZE; i++) {
		INIT_LIST_HEAD(&c->lru[i]);
		c->n_buffers[i] = 0;
	}

	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
		INIT_HLIST_HEAD(&c->cache_hash[i]);

	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->reserved_buffers);
	c->need_reserved_buffers = reserved_buffers;

	init_waitqueue_head(&c->free_buffer_wait);
	c->async_write_error = 0;

	c->dm_io = dm_io_client_create();
	if (IS_ERR(c->dm_io)) {
		r = PTR_ERR(c->dm_io);
		goto bad_dm_io;
	}

	mutex_lock(&dm_bufio_clients_lock);
	if (c->blocks_per_page_bits) {
		if (!DM_BUFIO_CACHE_NAME(c)) {
			DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size);
			if (!DM_BUFIO_CACHE_NAME(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}

		if (!DM_BUFIO_CACHE(c)) {
			DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c),
							      c->block_size,
							      c->block_size, 0, NULL);
			if (!DM_BUFIO_CACHE(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}
	}
	mutex_unlock(&dm_bufio_clients_lock);

	while (c->need_reserved_buffers) {
		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);

		if (!b) {
			r = -ENOMEM;
			goto bad_buffer;
		}
		__free_buffer_wake(b);
	}

	mutex_lock(&dm_bufio_clients_lock);
	dm_bufio_client_count++;
	list_add(&c->client_list, &dm_bufio_all_clients);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	c->shrinker.shrink = shrink;
	c->shrinker.seeks = 1;
	c->shrinker.batch = 0;
	register_shrinker(&c->shrinker);

	return c;

bad_buffer:
bad_cache:
	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}
	dm_io_client_destroy(c->dm_io);
bad_dm_io:
	vfree(c->cache_hash);
bad_hash:
	kfree(c);
bad_client:
	return ERR_PTR(r);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_create);
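/*
 * Typical client lifecycle (a sketch only; error handling and block
 * numbers are illustrative, not taken from any real caller):
 *
 *	struct dm_bufio_client *c;
 *	struct dm_buffer *b;
 *	void *data;
 *
 *	c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL);
 *	data = dm_bufio_read(c, 0, &b);
 *	if (!IS_ERR(data)) {
 *		memset(data, 0, dm_bufio_get_block_size(c));
 *		dm_bufio_mark_buffer_dirty(b);
 *		dm_bufio_release(b);
 *		dm_bufio_write_dirty_buffers(c);
 *	}
 *	dm_bufio_client_destroy(c);
 */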
/*
 * Free the buffering interface.
 * It is required that there are no references on any buffers.
 */
void dm_bufio_client_destroy(struct dm_bufio_client *c)
{
	unsigned i;

	drop_buffers(c);

	unregister_shrinker(&c->shrinker);

	mutex_lock(&dm_bufio_clients_lock);

	list_del(&c->client_list);
	dm_bufio_client_count--;
	__cache_size_refresh();

	mutex_unlock(&dm_bufio_clients_lock);

	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
		BUG_ON(!hlist_empty(&c->cache_hash[i]));

	BUG_ON(c->need_reserved_buffers);

	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}

	for (i = 0; i < LIST_SIZE; i++)
		if (c->n_buffers[i])
			DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(c->n_buffers[i]);

	dm_io_client_destroy(c->dm_io);
	vfree(c->cache_hash);
	kfree(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);

static void cleanup_old_buffers(void)
{
	unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age);
	struct dm_bufio_client *c;

	if (max_age > ULONG_MAX / HZ)
		max_age = ULONG_MAX / HZ;

	mutex_lock(&dm_bufio_clients_lock);
	list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
		if (!dm_bufio_trylock(c))
			continue;

		while (!list_empty(&c->lru[LIST_CLEAN])) {
			struct dm_buffer *b;
			b = list_entry(c->lru[LIST_CLEAN].prev,
				       struct dm_buffer, lru_list);
			if (__cleanup_old_buffer(b, 0, max_age * HZ))
				break;
			dm_bufio_cond_resched();
		}

		dm_bufio_unlock(c);
		dm_bufio_cond_resched();
	}
	mutex_unlock(&dm_bufio_clients_lock);
}

static struct workqueue_struct *dm_bufio_wq;
static struct delayed_work dm_bufio_work;

static void work_fn(struct work_struct *w)
{
	cleanup_old_buffers();

	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);
}
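/*
 * With the default tunables this means (a worked example, nothing more):
 * work_fn runs every DM_BUFIO_WORK_TIMER_SECS (10) seconds, and each pass
 * evicts clean, unheld buffers that have not been accessed for more than
 * dm_bufio_max_age (default DM_BUFIO_DEFAULT_AGE_SECS, 60) seconds.
 */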
/*----------------------------------------------------------------
 * Module setup
 *--------------------------------------------------------------*/

/*
 * This is called only once for the whole dm_bufio module.
 * It initializes the memory limit.
 */
static int __init dm_bufio_init(void)
{
	__u64 mem;

	memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
	memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);

	mem = (__u64)((totalram_pages - totalhigh_pages) *
		      DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;

	if (mem > ULONG_MAX)
		mem = ULONG_MAX;

#ifdef CONFIG_MMU
	/*
	 * Get the size of vmalloc space the same way as VMALLOC_TOTAL
	 * in fs/proc/internal.h
	 */
	if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
		mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
#endif

	dm_bufio_default_cache_size = mem;

	mutex_lock(&dm_bufio_clients_lock);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache");
	if (!dm_bufio_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);

	return 0;
}

/*
 * This is called once when unloading the dm_bufio module.
 */
static void __exit dm_bufio_exit(void)
{
	int bug = 0;
	int i;

	cancel_delayed_work_sync(&dm_bufio_work);
	destroy_workqueue(dm_bufio_wq);

	for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) {
		struct kmem_cache *kc = dm_bufio_caches[i];

		if (kc)
			kmem_cache_destroy(kc);
	}

	for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
		kfree(dm_bufio_cache_names[i]);

	if (dm_bufio_client_count) {
		DMCRIT("%s: dm_bufio_client_count leaked: %d",
		       __func__, dm_bufio_client_count);
		bug = 1;
	}

	if (dm_bufio_current_allocated) {
		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
		       __func__, dm_bufio_current_allocated);
		bug = 1;
	}

	if (dm_bufio_allocated_get_free_pages) {
		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
		       __func__, dm_bufio_allocated_get_free_pages);
		bug = 1;
	}

	if (dm_bufio_allocated_vmalloc) {
		DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
		       __func__, dm_bufio_allocated_vmalloc);
		bug = 1;
	}

	if (bug)
		BUG();
}

module_init(dm_bufio_init)
module_exit(dm_bufio_exit)

module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");

module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");

module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");

module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");

module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");

module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");

MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
MODULE_LICENSE("GPL");