/*
 * Copyright (C) 2009-2011 Red Hat, Inc.
 *
 * Author: Mikulas Patocka <mpatocka@redhat.com>
 *
 * This file is released under the GPL.
 */

#include "dm-bufio.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/shrinker.h>
#include <linux/module.h>

#define DM_MSG_PREFIX "bufio"

/*
 * Memory management policy:
 *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
 *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
 *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
 *	Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
 *	dirty buffers.
 */
#define DM_BUFIO_MIN_BUFFERS		8

#define DM_BUFIO_MEMORY_PERCENT		2
#define DM_BUFIO_VMALLOC_PERCENT	25
#define DM_BUFIO_WRITEBACK_PERCENT	75

/*
 * Check buffer ages in this interval (seconds)
 */
#define DM_BUFIO_WORK_TIMER_SECS	10

/*
 * Free buffers when they are older than this (seconds)
 */
#define DM_BUFIO_DEFAULT_AGE_SECS	60

/*
 * The number of bvec entries that are embedded directly in the buffer.
 * If the chunk size is larger, dm-io is used to do the io.
 */
#define DM_BUFIO_INLINE_VECS		16

/*
 * Buffer hash
 */
#define DM_BUFIO_HASH_BITS	20
#define DM_BUFIO_HASH(block) \
	((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
	 ((1 << DM_BUFIO_HASH_BITS) - 1))

/*
 * Don't try to use kmem_cache_alloc for blocks larger than this.
 * For explanation, see alloc_buffer_data below.
 */
#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT	(PAGE_SIZE >> 1)
#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT	(PAGE_SIZE << (MAX_ORDER - 1))

/*
 * dm_buffer->list_mode
 */
#define LIST_CLEAN	0
#define LIST_DIRTY	1
#define LIST_SIZE	2

/*
 * Linking of buffers:
 *	All buffers are linked to cache_hash with their hash_list field.
 *
 *	Clean buffers that are not being written (B_WRITING not set)
 *	are linked to lru[LIST_CLEAN] with their lru_list field.
 *
 *	Dirty and clean buffers that are being written are linked to
 *	lru[LIST_DIRTY] with their lru_list field. When the write
 *	finishes, the buffer cannot be relinked immediately (because we
 *	are in an interrupt context and relinking requires process
 *	context), so some clean-not-writing buffers can be held on
 *	dirty_lru too. They are later added to lru in the process
 *	context.
 */
struct dm_bufio_client {
	struct mutex lock;

	struct list_head lru[LIST_SIZE];
	unsigned long n_buffers[LIST_SIZE];

	struct block_device *bdev;
	unsigned block_size;
	unsigned char sectors_per_block_bits;
	unsigned char pages_per_block_bits;
	unsigned char blocks_per_page_bits;
	unsigned aux_size;
	void (*alloc_callback)(struct dm_buffer *);
	void (*write_callback)(struct dm_buffer *);

	struct dm_io_client *dm_io;

	struct list_head reserved_buffers;
	unsigned need_reserved_buffers;

	unsigned minimum_buffers;

	struct hlist_head *cache_hash;
	wait_queue_head_t free_buffer_wait;

	int async_write_error;

	struct list_head client_list;
	struct shrinker shrinker;
};

/*
 * Buffer state bits.
 */
#define B_READING	0
#define B_WRITING	1
#define B_DIRTY		2

/*
 * Describes how the block was allocated:
 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 * See the comment at alloc_buffer_data.
 */
enum data_mode {
	DATA_MODE_SLAB = 0,
	DATA_MODE_GET_FREE_PAGES = 1,
	DATA_MODE_VMALLOC = 2,
	DATA_MODE_LIMIT = 3
};

struct dm_buffer {
	struct hlist_node hash_list;
	struct list_head lru_list;
	sector_t block;
	void *data;
	enum data_mode data_mode;
	unsigned char list_mode;		/* LIST_* */
	unsigned hold_count;
	int read_error;
	int write_error;
	unsigned long state;
	unsigned long last_accessed;
	struct dm_bufio_client *c;
	struct list_head write_list;
	struct bio bio;
	struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
};

/*----------------------------------------------------------------*/

static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];

static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
{
	unsigned ret = c->blocks_per_page_bits - 1;

	BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches));

	return ret;
}

#define DM_BUFIO_CACHE(c)	(dm_bufio_caches[dm_bufio_cache_index(c)])
#define DM_BUFIO_CACHE_NAME(c)	(dm_bufio_cache_names[dm_bufio_cache_index(c)])

#define dm_bufio_in_request()	(!!current->bio_list)

static void dm_bufio_lock(struct dm_bufio_client *c)
{
	mutex_lock_nested(&c->lock, dm_bufio_in_request());
}

static int dm_bufio_trylock(struct dm_bufio_client *c)
{
	return mutex_trylock(&c->lock);
}

static void dm_bufio_unlock(struct dm_bufio_client *c)
{
	mutex_unlock(&c->lock);
}

/*
 * FIXME Move to sched.h?
 */
#ifdef CONFIG_PREEMPT_VOLUNTARY
# define dm_bufio_cond_resched()		\
do {						\
	if (unlikely(need_resched()))		\
		_cond_resched();		\
} while (0)
#else
# define dm_bufio_cond_resched()	do { } while (0)
#endif

/*----------------------------------------------------------------*/

/*
 * Default cache size: available memory divided by the ratio.
 */
static unsigned long dm_bufio_default_cache_size;

/*
 * Total cache size set by the user.
 */
static unsigned long dm_bufio_cache_size;

/*
 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
 * at any time. If it disagrees, the user has changed cache size.
 */
static unsigned long dm_bufio_cache_size_latch;

static DEFINE_SPINLOCK(param_spinlock);

/*
 * Buffers are freed after this timeout
 */
static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;

static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
static unsigned long dm_bufio_allocated_get_free_pages;
static unsigned long dm_bufio_allocated_vmalloc;
static unsigned long dm_bufio_current_allocated;

/*----------------------------------------------------------------*/

/*
 * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
 */
static unsigned long dm_bufio_cache_size_per_client;

/*
 * The current number of clients.
 */
static int dm_bufio_client_count;

/*
 * The list of all clients.
 */
static LIST_HEAD(dm_bufio_all_clients);

/*
 * This mutex protects dm_bufio_cache_size_latch,
 * dm_bufio_cache_size_per_client and dm_bufio_client_count
 */
static DEFINE_MUTEX(dm_bufio_clients_lock);

/*----------------------------------------------------------------*/

static void adjust_total_allocated(enum data_mode data_mode, long diff)
{
	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
		&dm_bufio_allocated_kmem_cache,
		&dm_bufio_allocated_get_free_pages,
		&dm_bufio_allocated_vmalloc,
	};

	spin_lock(&param_spinlock);

	*class_ptr[data_mode] += diff;

	dm_bufio_current_allocated += diff;

	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
		dm_bufio_peak_allocated = dm_bufio_current_allocated;

	spin_unlock(&param_spinlock);
}

/*
 * Change the number of clients and recalculate per-client limit.
 */
static void __cache_size_refresh(void)
{
	BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
	BUG_ON(dm_bufio_client_count < 0);

	dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size);

	/*
	 * Use default if set to 0 and report the actual cache size used.
	 */
	if (!dm_bufio_cache_size_latch) {
		(void)cmpxchg(&dm_bufio_cache_size, 0,
			      dm_bufio_default_cache_size);
		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
	}

	dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
					 (dm_bufio_client_count ? : 1);
}

/*
 * Allocating buffer data.
 *
 * Small buffers are allocated with kmem_cache, to use space optimally.
 *
 * For large buffers, we choose between get_free_pages and vmalloc.
 * Each has advantages and disadvantages.
 *
 * __get_free_pages can randomly fail if the memory is fragmented.
 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
 * as low as 128M) so using it for caching is not appropriate.
 *
 * If the allocation may fail we use __get_free_pages. Memory fragmentation
 * won't have a fatal effect here, but it just causes flushes of some other
 * buffers and more I/O will be performed. Don't use __get_free_pages if it
 * always fails (i.e. order >= MAX_ORDER).
 *
 * If the allocation shouldn't fail we use __vmalloc. This is only for the
 * initial reserve allocation, so there's no risk of wasting all vmalloc
 * space.
 */
static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
			       enum data_mode *data_mode)
{
	unsigned noio_flag;
	void *ptr;

	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
		*data_mode = DATA_MODE_SLAB;
		return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
	}

	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT &&
	    gfp_mask & __GFP_NORETRY) {
		*data_mode = DATA_MODE_GET_FREE_PAGES;
		return (void *)__get_free_pages(gfp_mask,
						c->pages_per_block_bits);
	}

	*data_mode = DATA_MODE_VMALLOC;

	/*
	 * __vmalloc allocates the data pages and auxiliary structures with
	 * gfp_flags that were specified, but pagetables are always allocated
	 * with GFP_KERNEL, no matter what was specified as gfp_mask.
	 *
	 * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
	 * all allocations done by this process (including pagetables) are done
	 * as if GFP_NOIO was specified.
	 */

	if (gfp_mask & __GFP_NORETRY)
		noio_flag = memalloc_noio_save();

	ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL);

	if (gfp_mask & __GFP_NORETRY)
		memalloc_noio_restore(noio_flag);

	return ptr;
}

/*
 * Free buffer's data.
 */
static void free_buffer_data(struct dm_bufio_client *c,
			     void *data, enum data_mode data_mode)
{
	switch (data_mode) {
	case DATA_MODE_SLAB:
		kmem_cache_free(DM_BUFIO_CACHE(c), data);
		break;

	case DATA_MODE_GET_FREE_PAGES:
		free_pages((unsigned long)data, c->pages_per_block_bits);
		break;

	case DATA_MODE_VMALLOC:
		vfree(data);
		break;

	default:
		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
		       data_mode);
		BUG();
	}
}

/*
 * Allocate buffer and its data.
 */
static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
{
	struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size,
				      gfp_mask);

	if (!b)
		return NULL;

	b->c = c;

	b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
	if (!b->data) {
		kfree(b);
		return NULL;
	}

	adjust_total_allocated(b->data_mode, (long)c->block_size);

	return b;
}

/*
 * Free buffer and its data.
 */
static void free_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	adjust_total_allocated(b->data_mode, -(long)c->block_size);

	free_buffer_data(c, b->data, b->data_mode);
	kfree(b);
}

/*
 * Link buffer to the hash list and clean or dirty queue.
 */
static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
{
	struct dm_bufio_client *c = b->c;

	c->n_buffers[dirty]++;
	b->block = block;
	b->list_mode = dirty;
	list_add(&b->lru_list, &c->lru[dirty]);
	hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
	b->last_accessed = jiffies;
}

/*
 * Unlink buffer from the hash list and dirty or clean queue.
 */
static void __unlink_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	hlist_del(&b->hash_list);
	list_del(&b->lru_list);
}

/*
 * Place the buffer at the head of the dirty or clean LRU queue.
 */
static void __relink_lru(struct dm_buffer *b, int dirty)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	c->n_buffers[dirty]++;
	b->list_mode = dirty;
	list_move(&b->lru_list, &c->lru[dirty]);
}

/*----------------------------------------------------------------
 * Submit I/O on the buffer.
 *
 * Bio interface is faster but it has some problems:
 *	the vector list is limited (increasing this limit increases
 *	memory-consumption per buffer, so it is not viable);
 *
 *	the memory must be direct-mapped, not vmalloced;
 *
 *	the I/O driver can reject requests spuriously if it thinks that
 *	the requests are too big for the device or if they cross a
 *	controller-defined memory boundary.
 *
 * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
 * it is not vmalloced, try using the bio interface.
 *
 * If the buffer is big, if it is vmalloced or if the underlying device
 * rejects the bio because it is too large, use the dm-io layer to do the I/O.
 * The dm-io layer splits the I/O into multiple requests, avoiding the above
 * shortcomings.
 *--------------------------------------------------------------*/

/*
 * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
 * that the request was handled directly with bio interface.
 */
static void dmio_complete(unsigned long error, void *context)
{
	struct dm_buffer *b = context;

	b->bio.bi_end_io(&b->bio, error ? -EIO : 0);
}

static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
		     bio_end_io_t *end_io)
{
	int r;
	struct dm_io_request io_req = {
		.bi_rw = rw,
		.notify.fn = dmio_complete,
		.notify.context = b,
		.client = b->c->dm_io,
	};
	struct dm_io_region region = {
		.bdev = b->c->bdev,
		.sector = block << b->c->sectors_per_block_bits,
		.count = b->c->block_size >> SECTOR_SHIFT,
	};

	if (b->data_mode != DATA_MODE_VMALLOC) {
		io_req.mem.type = DM_IO_KMEM;
		io_req.mem.ptr.addr = b->data;
	} else {
		io_req.mem.type = DM_IO_VMA;
		io_req.mem.ptr.vma = b->data;
	}

	b->bio.bi_end_io = end_io;

	r = dm_io(&io_req, 1, &region, NULL);
	if (r)
		end_io(&b->bio, r);
}

static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
			   bio_end_io_t *end_io)
{
	char *ptr;
	int len;

	bio_init(&b->bio);
	b->bio.bi_io_vec = b->bio_vec;
	b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
	b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits;
	b->bio.bi_bdev = b->c->bdev;
	b->bio.bi_end_io = end_io;

	/*
	 * We assume that if len >= PAGE_SIZE ptr is page-aligned.
	 * If len < PAGE_SIZE the buffer doesn't cross page boundary.
	 */
	ptr = b->data;
	len = b->c->block_size;

	if (len >= PAGE_SIZE)
		BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
	else
		BUG_ON((unsigned long)ptr & (len - 1));

	do {
		if (!bio_add_page(&b->bio, virt_to_page(ptr),
				  len < PAGE_SIZE ? len : PAGE_SIZE,
				  virt_to_phys(ptr) & (PAGE_SIZE - 1))) {
			BUG_ON(b->c->block_size <= PAGE_SIZE);
			use_dmio(b, rw, block, end_io);
			return;
		}

		len -= PAGE_SIZE;
		ptr += PAGE_SIZE;
	} while (len > 0);

	submit_bio(rw, &b->bio);
}

static void submit_io(struct dm_buffer *b, int rw, sector_t block,
		      bio_end_io_t *end_io)
{
	if (rw == WRITE && b->c->write_callback)
		b->c->write_callback(b);

	if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE &&
	    b->data_mode != DATA_MODE_VMALLOC)
		use_inline_bio(b, rw, block, end_io);
	else
		use_dmio(b, rw, block, end_io);
}

/*----------------------------------------------------------------
 * Writing dirty buffers
 *--------------------------------------------------------------*/

/*
 * The endio routine for write.
 *
 * Set the error, clear B_WRITING bit and wake anyone who was waiting on
 * it.
 */
static void write_endio(struct bio *bio, int error)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->write_error = error;
	if (unlikely(error)) {
		struct dm_bufio_client *c = b->c;
		(void)cmpxchg(&c->async_write_error, 0, error);
	}

	BUG_ON(!test_bit(B_WRITING, &b->state));

	smp_mb__before_atomic();
	clear_bit(B_WRITING, &b->state);
	smp_mb__after_atomic();

	wake_up_bit(&b->state, B_WRITING);
}

/*
 * Initiate a write on a dirty buffer, but don't wait for it.
 *
 * - If the buffer is not dirty, exit.
 * - If there is some previous write going on, wait for it to finish (we can't
 *   have two writes on the same buffer simultaneously).
 * - Submit our write and don't wait on it. We set B_WRITING indicating
 *   that there is a write in progress.
 */
static void __write_dirty_buffer(struct dm_buffer *b,
				 struct list_head *write_list)
{
	if (!test_bit(B_DIRTY, &b->state))
		return;

	clear_bit(B_DIRTY, &b->state);
	wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);

	if (!write_list)
		submit_io(b, WRITE, b->block, write_endio);
	else
		list_add_tail(&b->write_list, write_list);
}

static void __flush_write_list(struct list_head *write_list)
{
	struct blk_plug plug;
	blk_start_plug(&plug);
	while (!list_empty(write_list)) {
		struct dm_buffer *b =
			list_entry(write_list->next, struct dm_buffer, write_list);
		list_del(&b->write_list);
		submit_io(b, WRITE, b->block, write_endio);
		dm_bufio_cond_resched();
	}
	blk_finish_plug(&plug);
}

/*
 * Wait until any activity on the buffer finishes. Possibly write the
 * buffer if it is dirty. When this function finishes, there is no I/O
 * running on the buffer and the buffer is not dirty.
 */
static void __make_buffer_clean(struct dm_buffer *b)
{
	BUG_ON(b->hold_count);

	if (!b->state)	/* fast case */
		return;

	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
	__write_dirty_buffer(b, NULL);
	wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
}

/*
 * Find some buffer that is not held by anybody, clean it, unlink it and
 * return it.
 */
static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
{
	struct dm_buffer *b;

	list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
		BUG_ON(test_bit(B_WRITING, &b->state));
		BUG_ON(test_bit(B_DIRTY, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	return NULL;
}

/*
 * Wait until some other threads free some buffer or release hold count on
 * some buffer.
 *
 * This function is entered with c->lock held, drops it and regains it
 * before exiting.
 */
static void __wait_for_free_buffer(struct dm_bufio_client *c)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&c->free_buffer_wait, &wait);
	set_task_state(current, TASK_UNINTERRUPTIBLE);
	dm_bufio_unlock(c);

	io_schedule();

	set_task_state(current, TASK_RUNNING);
	remove_wait_queue(&c->free_buffer_wait, &wait);

	dm_bufio_lock(c);
}

enum new_flag {
	NF_FRESH = 0,
	NF_READ = 1,
	NF_GET = 2,
	NF_PREFETCH = 3
};

/*
 * Allocate a new buffer. If the allocation is not possible, wait until
 * some other thread frees a buffer.
 *
 * May drop the lock and regain it.
 */
static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
{
	struct dm_buffer *b;

	/*
	 * dm-bufio is resistant to allocation failures (it just keeps
	 * one buffer reserved in case all the allocations fail).
	 * So set flags to not try too hard:
	 *	GFP_NOIO: don't recurse into the I/O layer
	 *	__GFP_NORETRY: don't retry and rather return failure
	 *	__GFP_NOMEMALLOC: don't use emergency reserves
	 *	__GFP_NOWARN: don't print a warning in case of failure
	 *
	 * For debugging, if we set the cache size to 1, no new buffers will
	 * be allocated.
	 */
	while (1) {
		if (dm_bufio_cache_size_latch != 1) {
			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			if (b)
				return b;
		}

		if (nf == NF_PREFETCH)
			return NULL;

		if (!list_empty(&c->reserved_buffers)) {
			b = list_entry(c->reserved_buffers.next,
				       struct dm_buffer, lru_list);
			list_del(&b->lru_list);
			c->need_reserved_buffers++;

			return b;
		}

		b = __get_unclaimed_buffer(c);
		if (b)
			return b;

		__wait_for_free_buffer(c);
	}
}

static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
{
	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);

	if (!b)
		return NULL;

	if (c->alloc_callback)
		c->alloc_callback(b);

	return b;
}

/*
 * Free a buffer and wake other threads waiting for free buffers.
 */
static void __free_buffer_wake(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	if (!c->need_reserved_buffers)
		free_buffer(b);
	else {
		list_add(&b->lru_list, &c->reserved_buffers);
		c->need_reserved_buffers--;
	}

	wake_up(&c->free_buffer_wait);
}

static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
					struct list_head *write_list)
{
	struct dm_buffer *b, *tmp;

	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state)) {
			__relink_lru(b, LIST_CLEAN);
			continue;
		}

		if (no_wait && test_bit(B_WRITING, &b->state))
			return;

		__write_dirty_buffer(b, write_list);
		dm_bufio_cond_resched();
	}
}

/*
 * Get writeback threshold and buffer limit for a given client.
 */
static void __get_memory_limit(struct dm_bufio_client *c,
			       unsigned long *threshold_buffers,
			       unsigned long *limit_buffers)
{
	unsigned long buffers;

	if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) {
		mutex_lock(&dm_bufio_clients_lock);
		__cache_size_refresh();
		mutex_unlock(&dm_bufio_clients_lock);
	}

	buffers = dm_bufio_cache_size_per_client >>
		  (c->sectors_per_block_bits + SECTOR_SHIFT);

	if (buffers < c->minimum_buffers)
		buffers = c->minimum_buffers;

	*limit_buffers = buffers;
	*threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
}

/*
 * Check if we're over the watermark.
 * If we are over "limit_buffers", free unclaimed buffers until we get
 * under the limit.
 * If we are over "threshold_buffers", start background writeback of
 * dirty buffers.
 */
static void __check_watermark(struct dm_bufio_client *c,
			      struct list_head *write_list)
{
	unsigned long threshold_buffers, limit_buffers;

	__get_memory_limit(c, &threshold_buffers, &limit_buffers);

	while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
	       limit_buffers) {

		struct dm_buffer *b = __get_unclaimed_buffer(c);

		if (!b)
			return;

		__free_buffer_wake(b);
		dm_bufio_cond_resched();
	}

	if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
		__write_dirty_buffers_async(c, 1, write_list);
}

/*
 * Find a buffer in the hash.
 */
static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
{
	struct dm_buffer *b;

	hlist_for_each_entry(b, &c->cache_hash[DM_BUFIO_HASH(block)],
			     hash_list) {
		dm_bufio_cond_resched();
		if (b->block == block)
			return b;
	}

	return NULL;
}

/*----------------------------------------------------------------
 * Getting a buffer
 *--------------------------------------------------------------*/

static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
				     enum new_flag nf, int *need_submit,
				     struct list_head *write_list)
{
	struct dm_buffer *b, *new_b = NULL;

	*need_submit = 0;

	b = __find(c, block);
	if (b)
		goto found_buffer;

	if (nf == NF_GET)
		return NULL;

	new_b = __alloc_buffer_wait(c, nf);
	if (!new_b)
		return NULL;

	/*
	 * We've had a period where the mutex was unlocked, so need to
	 * recheck the hash table.
	 */
	b = __find(c, block);
	if (b) {
		__free_buffer_wake(new_b);
		goto found_buffer;
	}

	__check_watermark(c, write_list);

	b = new_b;
	b->hold_count = 1;
	b->read_error = 0;
	b->write_error = 0;
	__link_buffer(b, block, LIST_CLEAN);

	if (nf == NF_FRESH) {
		b->state = 0;
		return b;
	}

	b->state = 1 << B_READING;
	*need_submit = 1;

	return b;

found_buffer:
	if (nf == NF_PREFETCH)
		return NULL;
	/*
	 * Note: it is essential that we don't wait for the buffer to be
	 * read if dm_bufio_get function is used. Both dm_bufio_get and
	 * dm_bufio_prefetch can be used in the driver request routine.
	 * If the user called both dm_bufio_prefetch and dm_bufio_get on
	 * the same buffer, it would deadlock if we waited.
	 */
	if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
		return NULL;

	b->hold_count++;
	__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
		     test_bit(B_WRITING, &b->state));
	return b;
}

/*
 * The endio routine for reading: set the error, clear the bit and wake up
 * anyone waiting on the buffer.
 */
static void read_endio(struct bio *bio, int error)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->read_error = error;

	BUG_ON(!test_bit(B_READING, &b->state));

	smp_mb__before_atomic();
	clear_bit(B_READING, &b->state);
	smp_mb__after_atomic();

	wake_up_bit(&b->state, B_READING);
}

/*
 * A common routine for dm_bufio_new and dm_bufio_read. Operation of these
 * functions is similar except that dm_bufio_new doesn't read the
 * buffer from the disk (assuming that the caller overwrites all the data
 * and uses dm_bufio_mark_buffer_dirty to write new data back).
 */
static void *new_read(struct dm_bufio_client *c, sector_t block,
		      enum new_flag nf, struct dm_buffer **bp)
{
	int need_submit;
	struct dm_buffer *b;

	LIST_HEAD(write_list);

	dm_bufio_lock(c);
	b = __bufio_new(c, block, nf, &need_submit, &write_list);
	dm_bufio_unlock(c);

	__flush_write_list(&write_list);

	if (!b)
		return b;

	if (need_submit)
		submit_io(b, READ, b->block, read_endio);

	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);

	if (b->read_error) {
		int error = b->read_error;

		dm_bufio_release(b);

		return ERR_PTR(error);
	}

	*bp = b;

	return b->data;
}

void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	return new_read(c, block, NF_GET, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_get);

void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
		    struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_READ, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_read);

void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_FRESH, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_new);

void dm_bufio_prefetch(struct dm_bufio_client *c,
		       sector_t block, unsigned n_blocks)
{
	struct blk_plug plug;

	LIST_HEAD(write_list);

	BUG_ON(dm_bufio_in_request());

	blk_start_plug(&plug);
	dm_bufio_lock(c);

	for (; n_blocks--; block++) {
		int need_submit;
		struct dm_buffer *b;
		b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
				&write_list);
		if (unlikely(!list_empty(&write_list))) {
			dm_bufio_unlock(c);
			blk_finish_plug(&plug);
			__flush_write_list(&write_list);
			blk_start_plug(&plug);
			dm_bufio_lock(c);
		}
		if (unlikely(b != NULL)) {
			dm_bufio_unlock(c);

			if (need_submit)
				submit_io(b, READ, b->block, read_endio);
			dm_bufio_release(b);

			dm_bufio_cond_resched();

			if (!n_blocks)
				goto flush_plug;
			dm_bufio_lock(c);
		}
	}

	dm_bufio_unlock(c);

flush_plug:
	blk_finish_plug(&plug);
}
EXPORT_SYMBOL_GPL(dm_bufio_prefetch);

void dm_bufio_release(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	BUG_ON(!b->hold_count);

	b->hold_count--;
	if (!b->hold_count) {
		wake_up(&c->free_buffer_wait);

		/*
		 * If there were errors on the buffer, and the buffer is not
		 * to be written, free the buffer. There is no point in caching
		 * an invalid buffer.
		 */
		if ((b->read_error || b->write_error) &&
		    !test_bit(B_READING, &b->state) &&
		    !test_bit(B_WRITING, &b->state) &&
		    !test_bit(B_DIRTY, &b->state)) {
			__unlink_buffer(b);
			__free_buffer_wake(b);
		}
	}

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_release);

void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	BUG_ON(test_bit(B_READING, &b->state));

	if (!test_and_set_bit(B_DIRTY, &b->state))
		__relink_lru(b, LIST_DIRTY);

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);

void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
{
	LIST_HEAD(write_list);

	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0, &write_list);
	dm_bufio_unlock(c);
	__flush_write_list(&write_list);
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);

/*
 * For performance, it is essential that the buffers are written asynchronously
 * and simultaneously (so that the block layer can merge the writes) and then
 * waited upon.
 *
 * Finally, we flush hardware disk cache.
 */
int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
{
	int a, f;
	unsigned long buffers_processed = 0;
	struct dm_buffer *b, *tmp;

	LIST_HEAD(write_list);

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0, &write_list);
	dm_bufio_unlock(c);
	__flush_write_list(&write_list);
	dm_bufio_lock(c);

again:
	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		int dropped_lock = 0;

		if (buffers_processed < c->n_buffers[LIST_DIRTY])
			buffers_processed++;

		BUG_ON(test_bit(B_READING, &b->state));

		if (test_bit(B_WRITING, &b->state)) {
			if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
				dropped_lock = 1;
				b->hold_count++;
				dm_bufio_unlock(c);
				wait_on_bit_io(&b->state, B_WRITING,
					       TASK_UNINTERRUPTIBLE);
				dm_bufio_lock(c);
				b->hold_count--;
			} else
				wait_on_bit_io(&b->state, B_WRITING,
					       TASK_UNINTERRUPTIBLE);
		}

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state))
			__relink_lru(b, LIST_CLEAN);

		dm_bufio_cond_resched();

		/*
		 * If we dropped the lock, the list is no longer consistent,
		 * so we must restart the search.
		 *
		 * In the most common case, the buffer just processed is
		 * relinked to the clean list, so we won't loop scanning the
		 * same buffer again and again.
		 *
		 * This may livelock if there is another thread simultaneously
		 * dirtying buffers, so we count the number of buffers walked
		 * and if it exceeds the total number of buffers, it means that
		 * someone is doing some writes simultaneously with us. In
		 * this case, stop, dropping the lock.
		 */
		if (dropped_lock)
			goto again;
	}
	wake_up(&c->free_buffer_wait);
	dm_bufio_unlock(c);

	a = xchg(&c->async_write_error, 0);
	f = dm_bufio_issue_flush(c);
	if (a)
		return a;

	return f;
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);

/*
 * Use dm-io to send an empty barrier to flush the device.
 */
int dm_bufio_issue_flush(struct dm_bufio_client *c)
{
	struct dm_io_request io_req = {
		.bi_rw = WRITE_FLUSH,
		.mem.type = DM_IO_KMEM,
		.mem.ptr.addr = NULL,
		.client = c->dm_io,
	};
	struct dm_io_region io_reg = {
		.bdev = c->bdev,
		.sector = 0,
		.count = 0,
	};

	BUG_ON(dm_bufio_in_request());

	return dm_io(&io_req, 1, &io_reg, NULL);
}
EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);

/*
 * We first delete any other buffer that may be at that new location.
 *
 * Then, we write the buffer to the original location if it was dirty.
 *
 * Then, if we are the only one who is holding the buffer, relink the buffer
 * in the hash queue for the new location.
 *
 * If there was someone else holding the buffer, we write it to the new
 * location but do not relink it, because that other user needs to have the
 * buffer at the same place.
 */
void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
{
	struct dm_bufio_client *c = b->c;
	struct dm_buffer *new;

	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);

retry:
	new = __find(c, new_block);
	if (new) {
		if (new->hold_count) {
			__wait_for_free_buffer(c);
			goto retry;
		}

		/*
		 * FIXME: Is there any point waiting for a write that's going
		 * to be overwritten in a bit?
		 */
		__make_buffer_clean(new);
		__unlink_buffer(new);
		__free_buffer_wake(new);
	}

	BUG_ON(!b->hold_count);
	BUG_ON(test_bit(B_READING, &b->state));

	__write_dirty_buffer(b, NULL);
	if (b->hold_count == 1) {
		wait_on_bit_io(&b->state, B_WRITING,
			       TASK_UNINTERRUPTIBLE);
		set_bit(B_DIRTY, &b->state);
		__unlink_buffer(b);
		__link_buffer(b, new_block, LIST_DIRTY);
	} else {
		sector_t old_block;
		wait_on_bit_lock_io(&b->state, B_WRITING,
				    TASK_UNINTERRUPTIBLE);
		/*
		 * Relink buffer to "new_block" so that write_callback
		 * sees "new_block" as a block number.
		 * After the write, link the buffer back to old_block.
		 * All this must be done in bufio lock, so that block number
		 * change isn't visible to other threads.
		 */
		old_block = b->block;
		__unlink_buffer(b);
		__link_buffer(b, new_block, b->list_mode);
		submit_io(b, WRITE, new_block, write_endio);
		wait_on_bit_io(&b->state, B_WRITING,
			       TASK_UNINTERRUPTIBLE);
		__unlink_buffer(b);
		__link_buffer(b, old_block, b->list_mode);
	}

	dm_bufio_unlock(c);
	dm_bufio_release(b);
}
EXPORT_SYMBOL_GPL(dm_bufio_release_move);

/*
 * Free the given buffer.
 *
 * This is just a hint, if the buffer is in use or dirty, this function
 * does nothing.
 */
void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
{
	struct dm_buffer *b;

	dm_bufio_lock(c);

	b = __find(c, block);
	if (b && likely(!b->hold_count) && likely(!b->state)) {
		__unlink_buffer(b);
		__free_buffer_wake(b);
	}

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL(dm_bufio_forget);

void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
{
	c->minimum_buffers = n;
}
EXPORT_SYMBOL(dm_bufio_set_minimum_buffers);

unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
{
	return c->block_size;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);

sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
{
	return i_size_read(c->bdev->bd_inode) >>
	       (SECTOR_SHIFT + c->sectors_per_block_bits);
}
EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);

sector_t dm_bufio_get_block_number(struct dm_buffer *b)
{
	return b->block;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);

void *dm_bufio_get_block_data(struct dm_buffer *b)
{
	return b->data;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);

void *dm_bufio_get_aux_data(struct dm_buffer *b)
{
	return b + 1;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);

struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
{
	return b->c;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_client);

static void drop_buffers(struct dm_bufio_client *c)
{
	struct dm_buffer *b;
	int i;

	BUG_ON(dm_bufio_in_request());

	/*
	 * An optimization so that the buffers are not written one-by-one.
	 */
	dm_bufio_write_dirty_buffers_async(c);

	dm_bufio_lock(c);

	while ((b = __get_unclaimed_buffer(c)))
		__free_buffer_wake(b);

	for (i = 0; i < LIST_SIZE; i++)
		list_for_each_entry(b, &c->lru[i], lru_list)
			DMERR("leaked buffer %llx, hold count %u, list %d",
			      (unsigned long long)b->block, b->hold_count, i);

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(!list_empty(&c->lru[i]));

	dm_bufio_unlock(c);
}

/*
 * Test if the buffer is unused and too old, and commit it.
 * And if noio is set, we must not do any I/O because we hold
 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets rerouted
 * to a different bufio client.
 */
static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
				unsigned long max_jiffies)
{
	if (jiffies - b->last_accessed < max_jiffies)
		return 0;

	if (!(gfp & __GFP_IO)) {
		if (test_bit(B_READING, &b->state) ||
		    test_bit(B_WRITING, &b->state) ||
		    test_bit(B_DIRTY, &b->state))
			return 0;
	}

	if (b->hold_count)
		return 0;

	__make_buffer_clean(b);
	__unlink_buffer(b);
	__free_buffer_wake(b);

	return 1;
}

static long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
		   gfp_t gfp_mask)
{
	int l;
	struct dm_buffer *b, *tmp;
	long freed = 0;

	for (l = 0; l < LIST_SIZE; l++) {
		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
			freed += __cleanup_old_buffer(b, gfp_mask, 0);
			if (!--nr_to_scan)
				break;
		}
		dm_bufio_cond_resched();
	}
	return freed;
}

static unsigned long
dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	struct dm_bufio_client *c;
	unsigned long freed;

	c = container_of(shrink, struct dm_bufio_client, shrinker);
	if (sc->gfp_mask & __GFP_IO)
		dm_bufio_lock(c);
	else if (!dm_bufio_trylock(c))
		return SHRINK_STOP;

	freed = __scan(c, sc->nr_to_scan, sc->gfp_mask);
	dm_bufio_unlock(c);
	return freed;
}

static unsigned long
dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
	struct dm_bufio_client *c;
	unsigned long count;

	c = container_of(shrink, struct dm_bufio_client, shrinker);
	if (sc->gfp_mask & __GFP_IO)
		dm_bufio_lock(c);
	else if (!dm_bufio_trylock(c))
		return 0;

	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
	dm_bufio_unlock(c);
	return count;
}

/*
 * Create the buffering interface
 */
struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
					       unsigned reserved_buffers, unsigned aux_size,
					       void (*alloc_callback)(struct dm_buffer *),
					       void (*write_callback)(struct dm_buffer *))
{
	int r;
	struct dm_bufio_client *c;
	unsigned i;

	BUG_ON(block_size < 1 << SECTOR_SHIFT ||
	       (block_size & (block_size - 1)));

	c = kzalloc(sizeof(*c), GFP_KERNEL);
	if (!c) {
		r = -ENOMEM;
		goto bad_client;
	}
	c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS);
	if (!c->cache_hash) {
		r = -ENOMEM;
		goto bad_hash;
	}

	c->bdev = bdev;
	c->block_size = block_size;
	c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT;
	c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ?
				  ffs(block_size) - 1 - PAGE_SHIFT : 0;
	c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ?
				   PAGE_SHIFT - (ffs(block_size) - 1) : 0);

	c->aux_size = aux_size;
	c->alloc_callback = alloc_callback;
	c->write_callback = write_callback;

	for (i = 0; i < LIST_SIZE; i++) {
		INIT_LIST_HEAD(&c->lru[i]);
		c->n_buffers[i] = 0;
	}

	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
		INIT_HLIST_HEAD(&c->cache_hash[i]);

	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->reserved_buffers);
	c->need_reserved_buffers = reserved_buffers;

	c->minimum_buffers = DM_BUFIO_MIN_BUFFERS;

	init_waitqueue_head(&c->free_buffer_wait);
	c->async_write_error = 0;

	c->dm_io = dm_io_client_create();
	if (IS_ERR(c->dm_io)) {
		r = PTR_ERR(c->dm_io);
		goto bad_dm_io;
	}

	mutex_lock(&dm_bufio_clients_lock);
	if (c->blocks_per_page_bits) {
		if (!DM_BUFIO_CACHE_NAME(c)) {
			DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size);
			if (!DM_BUFIO_CACHE_NAME(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}

		if (!DM_BUFIO_CACHE(c)) {
			DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c),
							      c->block_size,
							      c->block_size, 0, NULL);
			if (!DM_BUFIO_CACHE(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}
	}
	mutex_unlock(&dm_bufio_clients_lock);

	while (c->need_reserved_buffers) {
		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);

		if (!b) {
			r = -ENOMEM;
			goto bad_buffer;
		}
		__free_buffer_wake(b);
	}

	mutex_lock(&dm_bufio_clients_lock);
	dm_bufio_client_count++;
	list_add(&c->client_list, &dm_bufio_all_clients);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	c->shrinker.count_objects = dm_bufio_shrink_count;
	c->shrinker.scan_objects = dm_bufio_shrink_scan;
	c->shrinker.seeks = 1;
	c->shrinker.batch = 0;
	register_shrinker(&c->shrinker);

	return c;

bad_buffer:
bad_cache:
	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}
	dm_io_client_destroy(c->dm_io);
bad_dm_io:
	vfree(c->cache_hash);
bad_hash:
	kfree(c);
bad_client:
	return ERR_PTR(r);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_create);

/*
 * Free the buffering interface.
 * It is required that there are no references on any buffers.
 */
void dm_bufio_client_destroy(struct dm_bufio_client *c)
{
	unsigned i;

	drop_buffers(c);

	unregister_shrinker(&c->shrinker);

	mutex_lock(&dm_bufio_clients_lock);

	list_del(&c->client_list);
	dm_bufio_client_count--;
	__cache_size_refresh();

	mutex_unlock(&dm_bufio_clients_lock);

	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
		BUG_ON(!hlist_empty(&c->cache_hash[i]));

	BUG_ON(c->need_reserved_buffers);

	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}

	for (i = 0; i < LIST_SIZE; i++)
		if (c->n_buffers[i])
			DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(c->n_buffers[i]);

	dm_io_client_destroy(c->dm_io);
	vfree(c->cache_hash);
	kfree(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);

static void cleanup_old_buffers(void)
{
	unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age);
	struct dm_bufio_client *c;

	if (max_age > ULONG_MAX / HZ)
		max_age = ULONG_MAX / HZ;

	mutex_lock(&dm_bufio_clients_lock);
	list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
		if (!dm_bufio_trylock(c))
			continue;

		while (!list_empty(&c->lru[LIST_CLEAN])) {
			struct dm_buffer *b;
			b = list_entry(c->lru[LIST_CLEAN].prev,
				       struct dm_buffer, lru_list);
			if (!__cleanup_old_buffer(b, 0, max_age * HZ))
				break;
			dm_bufio_cond_resched();
		}

		dm_bufio_unlock(c);
		dm_bufio_cond_resched();
	}
	mutex_unlock(&dm_bufio_clients_lock);
}

static struct workqueue_struct *dm_bufio_wq;
static struct delayed_work dm_bufio_work;

static void work_fn(struct work_struct *w)
{
	cleanup_old_buffers();

	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);
}

/*----------------------------------------------------------------
 * Module setup
 *--------------------------------------------------------------*/

/*
 * This is called only once for the whole dm_bufio module.
 * It initializes memory limit.
 */
static int __init dm_bufio_init(void)
{
	__u64 mem;

	dm_bufio_allocated_kmem_cache = 0;
	dm_bufio_allocated_get_free_pages = 0;
	dm_bufio_allocated_vmalloc = 0;
	dm_bufio_current_allocated = 0;

	memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
	memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);

	mem = (__u64)((totalram_pages - totalhigh_pages) *
		      DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;

	if (mem > ULONG_MAX)
		mem = ULONG_MAX;

#ifdef CONFIG_MMU
	/*
	 * Get the size of vmalloc space the same way as VMALLOC_TOTAL
	 * in fs/proc/internal.h
	 */
	if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
		mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
#endif

	dm_bufio_default_cache_size = mem;

	mutex_lock(&dm_bufio_clients_lock);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache");
	if (!dm_bufio_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);

	return 0;
}

/*
 * This is called once when unloading the dm_bufio module.
 */
static void __exit dm_bufio_exit(void)
{
	int bug = 0;
	int i;

	cancel_delayed_work_sync(&dm_bufio_work);
	destroy_workqueue(dm_bufio_wq);

	for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) {
		struct kmem_cache *kc = dm_bufio_caches[i];

		if (kc)
			kmem_cache_destroy(kc);
	}

	for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
		kfree(dm_bufio_cache_names[i]);

	if (dm_bufio_client_count) {
		DMCRIT("%s: dm_bufio_client_count leaked: %d",
		       __func__, dm_bufio_client_count);
		bug = 1;
	}

	if (dm_bufio_current_allocated) {
		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
		       __func__, dm_bufio_current_allocated);
		bug = 1;
	}

	if (dm_bufio_allocated_get_free_pages) {
		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
		       __func__, dm_bufio_allocated_get_free_pages);
		bug = 1;
	}

	if (dm_bufio_allocated_vmalloc) {
		DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
		       __func__, dm_bufio_allocated_vmalloc);
		bug = 1;
	}

	if (bug)
		BUG();
}

module_init(dm_bufio_init)
module_exit(dm_bufio_exit)

module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");

module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");

module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");

module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");

module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");

module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");

module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");

MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
MODULE_LICENSE("GPL");
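
/*
 * Illustrative usage sketch (not part of the driver): a minimal example of
 * how a hypothetical client might drive the exported API above, shown here
 * only to clarify the call sequence. The variables "my_bdev", the 4096-byte
 * block size, block number 0 and the single reserved buffer are assumptions
 * made up for this sketch; they do not come from this file.
 *
 *	struct dm_bufio_client *c;
 *	struct dm_buffer *bp;
 *	void *data;
 *
 *	// 4KiB blocks, 1 reserved buffer, no aux data, no callbacks
 *	c = dm_bufio_client_create(my_bdev, 4096, 1, 0, NULL, NULL);
 *	if (IS_ERR(c))
 *		return PTR_ERR(c);
 *
 *	// read block 0, modify it and mark it for writeback
 *	data = dm_bufio_read(c, 0, &bp);
 *	if (!IS_ERR(data)) {
 *		memset(data, 0, dm_bufio_get_block_size(c));
 *		dm_bufio_mark_buffer_dirty(bp);
 *		dm_bufio_release(bp);
 *	}
 *
 *	// write all dirty buffers and flush the device cache
 *	dm_bufio_write_dirty_buffers(c);
 *
 *	dm_bufio_client_destroy(c);
 */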