1 /* 2 * Copyright (C) 2009-2011 Red Hat, Inc. 3 * 4 * Author: Mikulas Patocka <mpatocka@redhat.com> 5 * 6 * This file is released under the GPL. 7 */ 8 9 #include "dm-bufio.h" 10 11 #include <linux/device-mapper.h> 12 #include <linux/dm-io.h> 13 #include <linux/slab.h> 14 #include <linux/vmalloc.h> 15 #include <linux/shrinker.h> 16 #include <linux/module.h> 17 18 #define DM_MSG_PREFIX "bufio" 19 20 /* 21 * Memory management policy: 22 * Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory 23 * or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower). 24 * Always allocate at least DM_BUFIO_MIN_BUFFERS buffers. 25 * Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT 26 * dirty buffers. 27 */ 28 #define DM_BUFIO_MIN_BUFFERS 8 29 30 #define DM_BUFIO_MEMORY_PERCENT 2 31 #define DM_BUFIO_VMALLOC_PERCENT 25 32 #define DM_BUFIO_WRITEBACK_PERCENT 75 33 34 /* 35 * Check buffer ages in this interval (seconds) 36 */ 37 #define DM_BUFIO_WORK_TIMER_SECS 10 38 39 /* 40 * Free buffers when they are older than this (seconds) 41 */ 42 #define DM_BUFIO_DEFAULT_AGE_SECS 60 43 44 /* 45 * The number of bvec entries that are embedded directly in the buffer. 46 * If the chunk size is larger, dm-io is used to do the io. 47 */ 48 #define DM_BUFIO_INLINE_VECS 16 49 50 /* 51 * Buffer hash 52 */ 53 #define DM_BUFIO_HASH_BITS 20 54 #define DM_BUFIO_HASH(block) \ 55 ((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \ 56 ((1 << DM_BUFIO_HASH_BITS) - 1)) 57 58 /* 59 * Don't try to use kmem_cache_alloc for blocks larger than this. 60 * For explanation, see alloc_buffer_data below. 61 */ 62 #define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT (PAGE_SIZE >> 1) 63 #define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT (PAGE_SIZE << (MAX_ORDER - 1)) 64 65 /* 66 * dm_buffer->list_mode 67 */ 68 #define LIST_CLEAN 0 69 #define LIST_DIRTY 1 70 #define LIST_SIZE 2 71 72 /* 73 * Linking of buffers: 74 * All buffers are linked to cache_hash with their hash_list field. 
75 * 76 * Clean buffers that are not being written (B_WRITING not set) 77 * are linked to lru[LIST_CLEAN] with their lru_list field. 78 * 79 * Dirty and clean buffers that are being written are linked to 80 * lru[LIST_DIRTY] with their lru_list field. When the write 81 * finishes, the buffer cannot be relinked immediately (because we 82 * are in an interrupt context and relinking requires process 83 * context), so some clean-not-writing buffers can be held on 84 * dirty_lru too. They are later added to lru in the process 85 * context. 86 */ 87 struct dm_bufio_client { 88 struct mutex lock; 89 90 struct list_head lru[LIST_SIZE]; 91 unsigned long n_buffers[LIST_SIZE]; 92 93 struct block_device *bdev; 94 unsigned block_size; 95 unsigned char sectors_per_block_bits; 96 unsigned char pages_per_block_bits; 97 unsigned char blocks_per_page_bits; 98 unsigned aux_size; 99 void (*alloc_callback)(struct dm_buffer *); 100 void (*write_callback)(struct dm_buffer *); 101 102 struct dm_io_client *dm_io; 103 104 struct list_head reserved_buffers; 105 unsigned need_reserved_buffers; 106 107 unsigned minimum_buffers; 108 109 struct hlist_head *cache_hash; 110 wait_queue_head_t free_buffer_wait; 111 112 int async_write_error; 113 114 struct list_head client_list; 115 struct shrinker shrinker; 116 }; 117 118 /* 119 * Buffer state bits. 120 */ 121 #define B_READING 0 122 #define B_WRITING 1 123 #define B_DIRTY 2 124 125 /* 126 * Describes how the block was allocated: 127 * kmem_cache_alloc(), __get_free_pages() or vmalloc(). 128 * See the comment at alloc_buffer_data. 
129 */ 130 enum data_mode { 131 DATA_MODE_SLAB = 0, 132 DATA_MODE_GET_FREE_PAGES = 1, 133 DATA_MODE_VMALLOC = 2, 134 DATA_MODE_LIMIT = 3 135 }; 136 137 struct dm_buffer { 138 struct hlist_node hash_list; 139 struct list_head lru_list; 140 sector_t block; 141 void *data; 142 enum data_mode data_mode; 143 unsigned char list_mode; /* LIST_* */ 144 unsigned hold_count; 145 int read_error; 146 int write_error; 147 unsigned long state; 148 unsigned long last_accessed; 149 struct dm_bufio_client *c; 150 struct list_head write_list; 151 struct bio bio; 152 struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS]; 153 }; 154 155 /*----------------------------------------------------------------*/ 156 157 static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT]; 158 static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT]; 159 160 static inline int dm_bufio_cache_index(struct dm_bufio_client *c) 161 { 162 unsigned ret = c->blocks_per_page_bits - 1; 163 164 BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches)); 165 166 return ret; 167 } 168 169 #define DM_BUFIO_CACHE(c) (dm_bufio_caches[dm_bufio_cache_index(c)]) 170 #define DM_BUFIO_CACHE_NAME(c) (dm_bufio_cache_names[dm_bufio_cache_index(c)]) 171 172 #define dm_bufio_in_request() (!!current->bio_list) 173 174 static void dm_bufio_lock(struct dm_bufio_client *c) 175 { 176 mutex_lock_nested(&c->lock, dm_bufio_in_request()); 177 } 178 179 static int dm_bufio_trylock(struct dm_bufio_client *c) 180 { 181 return mutex_trylock(&c->lock); 182 } 183 184 static void dm_bufio_unlock(struct dm_bufio_client *c) 185 { 186 mutex_unlock(&c->lock); 187 } 188 189 /* 190 * FIXME Move to sched.h? 
191 */ 192 #ifdef CONFIG_PREEMPT_VOLUNTARY 193 # define dm_bufio_cond_resched() \ 194 do { \ 195 if (unlikely(need_resched())) \ 196 _cond_resched(); \ 197 } while (0) 198 #else 199 # define dm_bufio_cond_resched() do { } while (0) 200 #endif 201 202 /*----------------------------------------------------------------*/ 203 204 /* 205 * Default cache size: available memory divided by the ratio. 206 */ 207 static unsigned long dm_bufio_default_cache_size; 208 209 /* 210 * Total cache size set by the user. 211 */ 212 static unsigned long dm_bufio_cache_size; 213 214 /* 215 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change 216 * at any time. If it disagrees, the user has changed cache size. 217 */ 218 static unsigned long dm_bufio_cache_size_latch; 219 220 static DEFINE_SPINLOCK(param_spinlock); 221 222 /* 223 * Buffers are freed after this timeout 224 */ 225 static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS; 226 227 static unsigned long dm_bufio_peak_allocated; 228 static unsigned long dm_bufio_allocated_kmem_cache; 229 static unsigned long dm_bufio_allocated_get_free_pages; 230 static unsigned long dm_bufio_allocated_vmalloc; 231 static unsigned long dm_bufio_current_allocated; 232 233 /*----------------------------------------------------------------*/ 234 235 /* 236 * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count 237 */ 238 static unsigned long dm_bufio_cache_size_per_client; 239 240 /* 241 * The current number of clients. 242 */ 243 static int dm_bufio_client_count; 244 245 /* 246 * The list of all clients. 
247 */ 248 static LIST_HEAD(dm_bufio_all_clients); 249 250 /* 251 * This mutex protects dm_bufio_cache_size_latch, 252 * dm_bufio_cache_size_per_client and dm_bufio_client_count 253 */ 254 static DEFINE_MUTEX(dm_bufio_clients_lock); 255 256 /*----------------------------------------------------------------*/ 257 258 static void adjust_total_allocated(enum data_mode data_mode, long diff) 259 { 260 static unsigned long * const class_ptr[DATA_MODE_LIMIT] = { 261 &dm_bufio_allocated_kmem_cache, 262 &dm_bufio_allocated_get_free_pages, 263 &dm_bufio_allocated_vmalloc, 264 }; 265 266 spin_lock(¶m_spinlock); 267 268 *class_ptr[data_mode] += diff; 269 270 dm_bufio_current_allocated += diff; 271 272 if (dm_bufio_current_allocated > dm_bufio_peak_allocated) 273 dm_bufio_peak_allocated = dm_bufio_current_allocated; 274 275 spin_unlock(¶m_spinlock); 276 } 277 278 /* 279 * Change the number of clients and recalculate per-client limit. 280 */ 281 static void __cache_size_refresh(void) 282 { 283 BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock)); 284 BUG_ON(dm_bufio_client_count < 0); 285 286 dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size); 287 288 /* 289 * Use default if set to 0 and report the actual cache size used. 290 */ 291 if (!dm_bufio_cache_size_latch) { 292 (void)cmpxchg(&dm_bufio_cache_size, 0, 293 dm_bufio_default_cache_size); 294 dm_bufio_cache_size_latch = dm_bufio_default_cache_size; 295 } 296 297 dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch / 298 (dm_bufio_client_count ? : 1); 299 } 300 301 /* 302 * Allocating buffer data. 303 * 304 * Small buffers are allocated with kmem_cache, to use space optimally. 305 * 306 * For large buffers, we choose between get_free_pages and vmalloc. 307 * Each has advantages and disadvantages. 308 * 309 * __get_free_pages can randomly fail if the memory is fragmented. 310 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be 311 * as low as 128M) so using it for caching is not appropriate. 
312 * 313 * If the allocation may fail we use __get_free_pages. Memory fragmentation 314 * won't have a fatal effect here, but it just causes flushes of some other 315 * buffers and more I/O will be performed. Don't use __get_free_pages if it 316 * always fails (i.e. order >= MAX_ORDER). 317 * 318 * If the allocation shouldn't fail we use __vmalloc. This is only for the 319 * initial reserve allocation, so there's no risk of wasting all vmalloc 320 * space. 321 */ 322 static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, 323 enum data_mode *data_mode) 324 { 325 unsigned noio_flag; 326 void *ptr; 327 328 if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) { 329 *data_mode = DATA_MODE_SLAB; 330 return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask); 331 } 332 333 if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT && 334 gfp_mask & __GFP_NORETRY) { 335 *data_mode = DATA_MODE_GET_FREE_PAGES; 336 return (void *)__get_free_pages(gfp_mask, 337 c->pages_per_block_bits); 338 } 339 340 *data_mode = DATA_MODE_VMALLOC; 341 342 /* 343 * __vmalloc allocates the data pages and auxiliary structures with 344 * gfp_flags that were specified, but pagetables are always allocated 345 * with GFP_KERNEL, no matter what was specified as gfp_mask. 346 * 347 * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that 348 * all allocations done by this process (including pagetables) are done 349 * as if GFP_NOIO was specified. 350 */ 351 352 if (gfp_mask & __GFP_NORETRY) 353 noio_flag = memalloc_noio_save(); 354 355 ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL); 356 357 if (gfp_mask & __GFP_NORETRY) 358 memalloc_noio_restore(noio_flag); 359 360 return ptr; 361 } 362 363 /* 364 * Free buffer's data. 
365 */ 366 static void free_buffer_data(struct dm_bufio_client *c, 367 void *data, enum data_mode data_mode) 368 { 369 switch (data_mode) { 370 case DATA_MODE_SLAB: 371 kmem_cache_free(DM_BUFIO_CACHE(c), data); 372 break; 373 374 case DATA_MODE_GET_FREE_PAGES: 375 free_pages((unsigned long)data, c->pages_per_block_bits); 376 break; 377 378 case DATA_MODE_VMALLOC: 379 vfree(data); 380 break; 381 382 default: 383 DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d", 384 data_mode); 385 BUG(); 386 } 387 } 388 389 /* 390 * Allocate buffer and its data. 391 */ 392 static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask) 393 { 394 struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size, 395 gfp_mask); 396 397 if (!b) 398 return NULL; 399 400 b->c = c; 401 402 b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode); 403 if (!b->data) { 404 kfree(b); 405 return NULL; 406 } 407 408 adjust_total_allocated(b->data_mode, (long)c->block_size); 409 410 return b; 411 } 412 413 /* 414 * Free buffer and its data. 415 */ 416 static void free_buffer(struct dm_buffer *b) 417 { 418 struct dm_bufio_client *c = b->c; 419 420 adjust_total_allocated(b->data_mode, -(long)c->block_size); 421 422 free_buffer_data(c, b->data, b->data_mode); 423 kfree(b); 424 } 425 426 /* 427 * Link buffer to the hash list and clean or dirty queue. 428 */ 429 static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty) 430 { 431 struct dm_bufio_client *c = b->c; 432 433 c->n_buffers[dirty]++; 434 b->block = block; 435 b->list_mode = dirty; 436 list_add(&b->lru_list, &c->lru[dirty]); 437 hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]); 438 b->last_accessed = jiffies; 439 } 440 441 /* 442 * Unlink buffer from the hash list and dirty or clean queue. 
443 */ 444 static void __unlink_buffer(struct dm_buffer *b) 445 { 446 struct dm_bufio_client *c = b->c; 447 448 BUG_ON(!c->n_buffers[b->list_mode]); 449 450 c->n_buffers[b->list_mode]--; 451 hlist_del(&b->hash_list); 452 list_del(&b->lru_list); 453 } 454 455 /* 456 * Place the buffer to the head of dirty or clean LRU queue. 457 */ 458 static void __relink_lru(struct dm_buffer *b, int dirty) 459 { 460 struct dm_bufio_client *c = b->c; 461 462 BUG_ON(!c->n_buffers[b->list_mode]); 463 464 c->n_buffers[b->list_mode]--; 465 c->n_buffers[dirty]++; 466 b->list_mode = dirty; 467 list_move(&b->lru_list, &c->lru[dirty]); 468 } 469 470 /*---------------------------------------------------------------- 471 * Submit I/O on the buffer. 472 * 473 * Bio interface is faster but it has some problems: 474 * the vector list is limited (increasing this limit increases 475 * memory-consumption per buffer, so it is not viable); 476 * 477 * the memory must be direct-mapped, not vmalloced; 478 * 479 * the I/O driver can reject requests spuriously if it thinks that 480 * the requests are too big for the device or if they cross a 481 * controller-defined memory boundary. 482 * 483 * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and 484 * it is not vmalloced, try using the bio interface. 485 * 486 * If the buffer is big, if it is vmalloced or if the underlying device 487 * rejects the bio because it is too large, use dm-io layer to do the I/O. 488 * The dm-io layer splits the I/O into multiple requests, avoiding the above 489 * shortcomings. 490 *--------------------------------------------------------------*/ 491 492 /* 493 * dm-io completion routine. It just calls b->bio.bi_end_io, pretending 494 * that the request was handled directly with bio interface. 495 */ 496 static void dmio_complete(unsigned long error, void *context) 497 { 498 struct dm_buffer *b = context; 499 500 b->bio.bi_end_io(&b->bio, error ? 
-EIO : 0); 501 } 502 503 static void use_dmio(struct dm_buffer *b, int rw, sector_t block, 504 bio_end_io_t *end_io) 505 { 506 int r; 507 struct dm_io_request io_req = { 508 .bi_rw = rw, 509 .notify.fn = dmio_complete, 510 .notify.context = b, 511 .client = b->c->dm_io, 512 }; 513 struct dm_io_region region = { 514 .bdev = b->c->bdev, 515 .sector = block << b->c->sectors_per_block_bits, 516 .count = b->c->block_size >> SECTOR_SHIFT, 517 }; 518 519 if (b->data_mode != DATA_MODE_VMALLOC) { 520 io_req.mem.type = DM_IO_KMEM; 521 io_req.mem.ptr.addr = b->data; 522 } else { 523 io_req.mem.type = DM_IO_VMA; 524 io_req.mem.ptr.vma = b->data; 525 } 526 527 b->bio.bi_end_io = end_io; 528 529 r = dm_io(&io_req, 1, ®ion, NULL); 530 if (r) 531 end_io(&b->bio, r); 532 } 533 534 static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, 535 bio_end_io_t *end_io) 536 { 537 char *ptr; 538 int len; 539 540 bio_init(&b->bio); 541 b->bio.bi_io_vec = b->bio_vec; 542 b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS; 543 b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits; 544 b->bio.bi_bdev = b->c->bdev; 545 b->bio.bi_end_io = end_io; 546 547 /* 548 * We assume that if len >= PAGE_SIZE ptr is page-aligned. 549 * If len < PAGE_SIZE the buffer doesn't cross page boundary. 550 */ 551 ptr = b->data; 552 len = b->c->block_size; 553 554 if (len >= PAGE_SIZE) 555 BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1)); 556 else 557 BUG_ON((unsigned long)ptr & (len - 1)); 558 559 do { 560 if (!bio_add_page(&b->bio, virt_to_page(ptr), 561 len < PAGE_SIZE ? 
len : PAGE_SIZE, 562 virt_to_phys(ptr) & (PAGE_SIZE - 1))) { 563 BUG_ON(b->c->block_size <= PAGE_SIZE); 564 use_dmio(b, rw, block, end_io); 565 return; 566 } 567 568 len -= PAGE_SIZE; 569 ptr += PAGE_SIZE; 570 } while (len > 0); 571 572 submit_bio(rw, &b->bio); 573 } 574 575 static void submit_io(struct dm_buffer *b, int rw, sector_t block, 576 bio_end_io_t *end_io) 577 { 578 if (rw == WRITE && b->c->write_callback) 579 b->c->write_callback(b); 580 581 if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE && 582 b->data_mode != DATA_MODE_VMALLOC) 583 use_inline_bio(b, rw, block, end_io); 584 else 585 use_dmio(b, rw, block, end_io); 586 } 587 588 /*---------------------------------------------------------------- 589 * Writing dirty buffers 590 *--------------------------------------------------------------*/ 591 592 /* 593 * The endio routine for write. 594 * 595 * Set the error, clear B_WRITING bit and wake anyone who was waiting on 596 * it. 597 */ 598 static void write_endio(struct bio *bio, int error) 599 { 600 struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); 601 602 b->write_error = error; 603 if (unlikely(error)) { 604 struct dm_bufio_client *c = b->c; 605 (void)cmpxchg(&c->async_write_error, 0, error); 606 } 607 608 BUG_ON(!test_bit(B_WRITING, &b->state)); 609 610 smp_mb__before_atomic(); 611 clear_bit(B_WRITING, &b->state); 612 smp_mb__after_atomic(); 613 614 wake_up_bit(&b->state, B_WRITING); 615 } 616 617 /* 618 * This function is called when wait_on_bit is actually waiting. 619 */ 620 static int do_io_schedule(void *word) 621 { 622 io_schedule(); 623 624 return 0; 625 } 626 627 /* 628 * Initiate a write on a dirty buffer, but don't wait for it. 629 * 630 * - If the buffer is not dirty, exit. 631 * - If there some previous write going on, wait for it to finish (we can't 632 * have two writes on the same buffer simultaneously). 633 * - Submit our write and don't wait on it. 
We set B_WRITING indicating 634 * that there is a write in progress. 635 */ 636 static void __write_dirty_buffer(struct dm_buffer *b, 637 struct list_head *write_list) 638 { 639 if (!test_bit(B_DIRTY, &b->state)) 640 return; 641 642 clear_bit(B_DIRTY, &b->state); 643 wait_on_bit_lock(&b->state, B_WRITING, 644 do_io_schedule, TASK_UNINTERRUPTIBLE); 645 646 if (!write_list) 647 submit_io(b, WRITE, b->block, write_endio); 648 else 649 list_add_tail(&b->write_list, write_list); 650 } 651 652 static void __flush_write_list(struct list_head *write_list) 653 { 654 struct blk_plug plug; 655 blk_start_plug(&plug); 656 while (!list_empty(write_list)) { 657 struct dm_buffer *b = 658 list_entry(write_list->next, struct dm_buffer, write_list); 659 list_del(&b->write_list); 660 submit_io(b, WRITE, b->block, write_endio); 661 dm_bufio_cond_resched(); 662 } 663 blk_finish_plug(&plug); 664 } 665 666 /* 667 * Wait until any activity on the buffer finishes. Possibly write the 668 * buffer if it is dirty. When this function finishes, there is no I/O 669 * running on the buffer and the buffer is not dirty. 670 */ 671 static void __make_buffer_clean(struct dm_buffer *b) 672 { 673 BUG_ON(b->hold_count); 674 675 if (!b->state) /* fast case */ 676 return; 677 678 wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE); 679 __write_dirty_buffer(b, NULL); 680 wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE); 681 } 682 683 /* 684 * Find some buffer that is not held by anybody, clean it, unlink it and 685 * return it. 
686 */ 687 static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c) 688 { 689 struct dm_buffer *b; 690 691 list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) { 692 BUG_ON(test_bit(B_WRITING, &b->state)); 693 BUG_ON(test_bit(B_DIRTY, &b->state)); 694 695 if (!b->hold_count) { 696 __make_buffer_clean(b); 697 __unlink_buffer(b); 698 return b; 699 } 700 dm_bufio_cond_resched(); 701 } 702 703 list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) { 704 BUG_ON(test_bit(B_READING, &b->state)); 705 706 if (!b->hold_count) { 707 __make_buffer_clean(b); 708 __unlink_buffer(b); 709 return b; 710 } 711 dm_bufio_cond_resched(); 712 } 713 714 return NULL; 715 } 716 717 /* 718 * Wait until some other threads free some buffer or release hold count on 719 * some buffer. 720 * 721 * This function is entered with c->lock held, drops it and regains it 722 * before exiting. 723 */ 724 static void __wait_for_free_buffer(struct dm_bufio_client *c) 725 { 726 DECLARE_WAITQUEUE(wait, current); 727 728 add_wait_queue(&c->free_buffer_wait, &wait); 729 set_task_state(current, TASK_UNINTERRUPTIBLE); 730 dm_bufio_unlock(c); 731 732 io_schedule(); 733 734 set_task_state(current, TASK_RUNNING); 735 remove_wait_queue(&c->free_buffer_wait, &wait); 736 737 dm_bufio_lock(c); 738 } 739 740 enum new_flag { 741 NF_FRESH = 0, 742 NF_READ = 1, 743 NF_GET = 2, 744 NF_PREFETCH = 3 745 }; 746 747 /* 748 * Allocate a new buffer. If the allocation is not possible, wait until 749 * some other thread frees a buffer. 750 * 751 * May drop the lock and regain it. 752 */ 753 static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf) 754 { 755 struct dm_buffer *b; 756 757 /* 758 * dm-bufio is resistant to allocation failures (it just keeps 759 * one buffer reserved in cases all the allocations fail). 
760 * So set flags to not try too hard: 761 * GFP_NOIO: don't recurse into the I/O layer 762 * __GFP_NORETRY: don't retry and rather return failure 763 * __GFP_NOMEMALLOC: don't use emergency reserves 764 * __GFP_NOWARN: don't print a warning in case of failure 765 * 766 * For debugging, if we set the cache size to 1, no new buffers will 767 * be allocated. 768 */ 769 while (1) { 770 if (dm_bufio_cache_size_latch != 1) { 771 b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN); 772 if (b) 773 return b; 774 } 775 776 if (nf == NF_PREFETCH) 777 return NULL; 778 779 if (!list_empty(&c->reserved_buffers)) { 780 b = list_entry(c->reserved_buffers.next, 781 struct dm_buffer, lru_list); 782 list_del(&b->lru_list); 783 c->need_reserved_buffers++; 784 785 return b; 786 } 787 788 b = __get_unclaimed_buffer(c); 789 if (b) 790 return b; 791 792 __wait_for_free_buffer(c); 793 } 794 } 795 796 static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf) 797 { 798 struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf); 799 800 if (!b) 801 return NULL; 802 803 if (c->alloc_callback) 804 c->alloc_callback(b); 805 806 return b; 807 } 808 809 /* 810 * Free a buffer and wake other threads waiting for free buffers. 
811 */ 812 static void __free_buffer_wake(struct dm_buffer *b) 813 { 814 struct dm_bufio_client *c = b->c; 815 816 if (!c->need_reserved_buffers) 817 free_buffer(b); 818 else { 819 list_add(&b->lru_list, &c->reserved_buffers); 820 c->need_reserved_buffers--; 821 } 822 823 wake_up(&c->free_buffer_wait); 824 } 825 826 static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait, 827 struct list_head *write_list) 828 { 829 struct dm_buffer *b, *tmp; 830 831 list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) { 832 BUG_ON(test_bit(B_READING, &b->state)); 833 834 if (!test_bit(B_DIRTY, &b->state) && 835 !test_bit(B_WRITING, &b->state)) { 836 __relink_lru(b, LIST_CLEAN); 837 continue; 838 } 839 840 if (no_wait && test_bit(B_WRITING, &b->state)) 841 return; 842 843 __write_dirty_buffer(b, write_list); 844 dm_bufio_cond_resched(); 845 } 846 } 847 848 /* 849 * Get writeback threshold and buffer limit for a given client. 850 */ 851 static void __get_memory_limit(struct dm_bufio_client *c, 852 unsigned long *threshold_buffers, 853 unsigned long *limit_buffers) 854 { 855 unsigned long buffers; 856 857 if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) { 858 mutex_lock(&dm_bufio_clients_lock); 859 __cache_size_refresh(); 860 mutex_unlock(&dm_bufio_clients_lock); 861 } 862 863 buffers = dm_bufio_cache_size_per_client >> 864 (c->sectors_per_block_bits + SECTOR_SHIFT); 865 866 if (buffers < c->minimum_buffers) 867 buffers = c->minimum_buffers; 868 869 *limit_buffers = buffers; 870 *threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100; 871 } 872 873 /* 874 * Check if we're over watermark. 875 * If we are over threshold_buffers, start freeing buffers. 876 * If we're over "limit_buffers", block until we get under the limit. 
877 */ 878 static void __check_watermark(struct dm_bufio_client *c, 879 struct list_head *write_list) 880 { 881 unsigned long threshold_buffers, limit_buffers; 882 883 __get_memory_limit(c, &threshold_buffers, &limit_buffers); 884 885 while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] > 886 limit_buffers) { 887 888 struct dm_buffer *b = __get_unclaimed_buffer(c); 889 890 if (!b) 891 return; 892 893 __free_buffer_wake(b); 894 dm_bufio_cond_resched(); 895 } 896 897 if (c->n_buffers[LIST_DIRTY] > threshold_buffers) 898 __write_dirty_buffers_async(c, 1, write_list); 899 } 900 901 /* 902 * Find a buffer in the hash. 903 */ 904 static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) 905 { 906 struct dm_buffer *b; 907 908 hlist_for_each_entry(b, &c->cache_hash[DM_BUFIO_HASH(block)], 909 hash_list) { 910 dm_bufio_cond_resched(); 911 if (b->block == block) 912 return b; 913 } 914 915 return NULL; 916 } 917 918 /*---------------------------------------------------------------- 919 * Getting a buffer 920 *--------------------------------------------------------------*/ 921 922 static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, 923 enum new_flag nf, int *need_submit, 924 struct list_head *write_list) 925 { 926 struct dm_buffer *b, *new_b = NULL; 927 928 *need_submit = 0; 929 930 b = __find(c, block); 931 if (b) 932 goto found_buffer; 933 934 if (nf == NF_GET) 935 return NULL; 936 937 new_b = __alloc_buffer_wait(c, nf); 938 if (!new_b) 939 return NULL; 940 941 /* 942 * We've had a period where the mutex was unlocked, so need to 943 * recheck the hash table. 
944 */ 945 b = __find(c, block); 946 if (b) { 947 __free_buffer_wake(new_b); 948 goto found_buffer; 949 } 950 951 __check_watermark(c, write_list); 952 953 b = new_b; 954 b->hold_count = 1; 955 b->read_error = 0; 956 b->write_error = 0; 957 __link_buffer(b, block, LIST_CLEAN); 958 959 if (nf == NF_FRESH) { 960 b->state = 0; 961 return b; 962 } 963 964 b->state = 1 << B_READING; 965 *need_submit = 1; 966 967 return b; 968 969 found_buffer: 970 if (nf == NF_PREFETCH) 971 return NULL; 972 /* 973 * Note: it is essential that we don't wait for the buffer to be 974 * read if dm_bufio_get function is used. Both dm_bufio_get and 975 * dm_bufio_prefetch can be used in the driver request routine. 976 * If the user called both dm_bufio_prefetch and dm_bufio_get on 977 * the same buffer, it would deadlock if we waited. 978 */ 979 if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state))) 980 return NULL; 981 982 b->hold_count++; 983 __relink_lru(b, test_bit(B_DIRTY, &b->state) || 984 test_bit(B_WRITING, &b->state)); 985 return b; 986 } 987 988 /* 989 * The endio routine for reading: set the error, clear the bit and wake up 990 * anyone waiting on the buffer. 991 */ 992 static void read_endio(struct bio *bio, int error) 993 { 994 struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); 995 996 b->read_error = error; 997 998 BUG_ON(!test_bit(B_READING, &b->state)); 999 1000 smp_mb__before_atomic(); 1001 clear_bit(B_READING, &b->state); 1002 smp_mb__after_atomic(); 1003 1004 wake_up_bit(&b->state, B_READING); 1005 } 1006 1007 /* 1008 * A common routine for dm_bufio_new and dm_bufio_read. Operation of these 1009 * functions is similar except that dm_bufio_new doesn't read the 1010 * buffer from the disk (assuming that the caller overwrites all the data 1011 * and uses dm_bufio_mark_buffer_dirty to write new data back). 
1012 */ 1013 static void *new_read(struct dm_bufio_client *c, sector_t block, 1014 enum new_flag nf, struct dm_buffer **bp) 1015 { 1016 int need_submit; 1017 struct dm_buffer *b; 1018 1019 LIST_HEAD(write_list); 1020 1021 dm_bufio_lock(c); 1022 b = __bufio_new(c, block, nf, &need_submit, &write_list); 1023 dm_bufio_unlock(c); 1024 1025 __flush_write_list(&write_list); 1026 1027 if (!b) 1028 return b; 1029 1030 if (need_submit) 1031 submit_io(b, READ, b->block, read_endio); 1032 1033 wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE); 1034 1035 if (b->read_error) { 1036 int error = b->read_error; 1037 1038 dm_bufio_release(b); 1039 1040 return ERR_PTR(error); 1041 } 1042 1043 *bp = b; 1044 1045 return b->data; 1046 } 1047 1048 void *dm_bufio_get(struct dm_bufio_client *c, sector_t block, 1049 struct dm_buffer **bp) 1050 { 1051 return new_read(c, block, NF_GET, bp); 1052 } 1053 EXPORT_SYMBOL_GPL(dm_bufio_get); 1054 1055 void *dm_bufio_read(struct dm_bufio_client *c, sector_t block, 1056 struct dm_buffer **bp) 1057 { 1058 BUG_ON(dm_bufio_in_request()); 1059 1060 return new_read(c, block, NF_READ, bp); 1061 } 1062 EXPORT_SYMBOL_GPL(dm_bufio_read); 1063 1064 void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, 1065 struct dm_buffer **bp) 1066 { 1067 BUG_ON(dm_bufio_in_request()); 1068 1069 return new_read(c, block, NF_FRESH, bp); 1070 } 1071 EXPORT_SYMBOL_GPL(dm_bufio_new); 1072 1073 void dm_bufio_prefetch(struct dm_bufio_client *c, 1074 sector_t block, unsigned n_blocks) 1075 { 1076 struct blk_plug plug; 1077 1078 LIST_HEAD(write_list); 1079 1080 BUG_ON(dm_bufio_in_request()); 1081 1082 blk_start_plug(&plug); 1083 dm_bufio_lock(c); 1084 1085 for (; n_blocks--; block++) { 1086 int need_submit; 1087 struct dm_buffer *b; 1088 b = __bufio_new(c, block, NF_PREFETCH, &need_submit, 1089 &write_list); 1090 if (unlikely(!list_empty(&write_list))) { 1091 dm_bufio_unlock(c); 1092 blk_finish_plug(&plug); 1093 __flush_write_list(&write_list); 1094 
blk_start_plug(&plug); 1095 dm_bufio_lock(c); 1096 } 1097 if (unlikely(b != NULL)) { 1098 dm_bufio_unlock(c); 1099 1100 if (need_submit) 1101 submit_io(b, READ, b->block, read_endio); 1102 dm_bufio_release(b); 1103 1104 dm_bufio_cond_resched(); 1105 1106 if (!n_blocks) 1107 goto flush_plug; 1108 dm_bufio_lock(c); 1109 } 1110 } 1111 1112 dm_bufio_unlock(c); 1113 1114 flush_plug: 1115 blk_finish_plug(&plug); 1116 } 1117 EXPORT_SYMBOL_GPL(dm_bufio_prefetch); 1118 1119 void dm_bufio_release(struct dm_buffer *b) 1120 { 1121 struct dm_bufio_client *c = b->c; 1122 1123 dm_bufio_lock(c); 1124 1125 BUG_ON(!b->hold_count); 1126 1127 b->hold_count--; 1128 if (!b->hold_count) { 1129 wake_up(&c->free_buffer_wait); 1130 1131 /* 1132 * If there were errors on the buffer, and the buffer is not 1133 * to be written, free the buffer. There is no point in caching 1134 * invalid buffer. 1135 */ 1136 if ((b->read_error || b->write_error) && 1137 !test_bit(B_READING, &b->state) && 1138 !test_bit(B_WRITING, &b->state) && 1139 !test_bit(B_DIRTY, &b->state)) { 1140 __unlink_buffer(b); 1141 __free_buffer_wake(b); 1142 } 1143 } 1144 1145 dm_bufio_unlock(c); 1146 } 1147 EXPORT_SYMBOL_GPL(dm_bufio_release); 1148 1149 void dm_bufio_mark_buffer_dirty(struct dm_buffer *b) 1150 { 1151 struct dm_bufio_client *c = b->c; 1152 1153 dm_bufio_lock(c); 1154 1155 BUG_ON(test_bit(B_READING, &b->state)); 1156 1157 if (!test_and_set_bit(B_DIRTY, &b->state)) 1158 __relink_lru(b, LIST_DIRTY); 1159 1160 dm_bufio_unlock(c); 1161 } 1162 EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty); 1163 1164 void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c) 1165 { 1166 LIST_HEAD(write_list); 1167 1168 BUG_ON(dm_bufio_in_request()); 1169 1170 dm_bufio_lock(c); 1171 __write_dirty_buffers_async(c, 0, &write_list); 1172 dm_bufio_unlock(c); 1173 __flush_write_list(&write_list); 1174 } 1175 EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async); 1176 1177 /* 1178 * For performance, it is essential that the buffers are 
written asynchronously
 * and simultaneously (so that the block layer can merge the writes) and then
 * waited upon.
 *
 * Finally, we flush hardware disk cache.
 *
 * Returns the asynchronous write error recorded on the client (which is
 * reset to 0 by this call) if there was one, otherwise the result of
 * dm_bufio_issue_flush().
 */
int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
{
	int a, f;
	unsigned long buffers_processed = 0;
	struct dm_buffer *b, *tmp;

	LIST_HEAD(write_list);

	/* Queue all dirty buffers under the lock, submit them without it. */
	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0, &write_list);
	dm_bufio_unlock(c);
	__flush_write_list(&write_list);
	dm_bufio_lock(c);

again:
	/* Walk the dirty list from its tail and wait for each write. */
	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		int dropped_lock = 0;

		/* Saturating counter used as the livelock guard described below. */
		if (buffers_processed < c->n_buffers[LIST_DIRTY])
			buffers_processed++;

		BUG_ON(test_bit(B_READING, &b->state));

		if (test_bit(B_WRITING, &b->state)) {
			if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
				/*
				 * Wait without the lock; the extra hold keeps
				 * the buffer from being freed meanwhile.
				 */
				dropped_lock = 1;
				b->hold_count++;
				dm_bufio_unlock(c);
				wait_on_bit(&b->state, B_WRITING,
					    do_io_schedule,
					    TASK_UNINTERRUPTIBLE);
				dm_bufio_lock(c);
				b->hold_count--;
			} else
				/*
				 * Guard limit reached: wait while keeping the
				 * lock so that the scan is not restarted.
				 */
				wait_on_bit(&b->state, B_WRITING,
					    do_io_schedule,
					    TASK_UNINTERRUPTIBLE);
		}

		/* Fully written and not re-dirtied: move to the clean list. */
		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state))
			__relink_lru(b, LIST_CLEAN);

		dm_bufio_cond_resched();

		/*
		 * If we dropped the lock, the list is no longer consistent,
		 * so we must restart the search.
		 *
		 * In the most common case, the buffer just processed is
		 * relinked to the clean list, so we won't loop scanning the
		 * same buffer again and again.
		 *
		 * This may livelock if there is another thread simultaneously
		 * dirtying buffers, so we count the number of buffers walked
		 * and if it exceeds the total number of buffers, it means that
		 * someone is doing some writes simultaneously with us.  In
		 * this case, stop dropping the lock.
		 */
		if (dropped_lock)
			goto again;
	}
	wake_up(&c->free_buffer_wait);
	dm_bufio_unlock(c);

	/* Fetch and clear the error recorded for asynchronous writes. */
	a = xchg(&c->async_write_error, 0);
	/* The flush is issued even when a write error was recorded. */
	f = dm_bufio_issue_flush(c);
	if (a)
		return a;

	return f;
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);

/*
 * Use dm-io to send an empty barrier to flush the device.
 *
 * A WRITE_FLUSH request covering zero sectors is submitted to the client's
 * block device.  Because the notify callback passed to dm_io() is NULL, the
 * call is presumably synchronous (verify against dm-io).  Returns the
 * result of dm_io().
 */
int dm_bufio_issue_flush(struct dm_bufio_client *c)
{
	struct dm_io_request io_req = {
		.bi_rw = WRITE_FLUSH,
		.mem.type = DM_IO_KMEM,
		.mem.ptr.addr = NULL,
		.client = c->dm_io,
	};
	struct dm_io_region io_reg = {
		.bdev = c->bdev,
		.sector = 0,
		.count = 0,
	};

	BUG_ON(dm_bufio_in_request());

	return dm_io(&io_req, 1, &io_reg, NULL);
}
EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);

/*
 * We first delete any other buffer that may be at that new location.
 *
 * Then, we write the buffer to the original location if it was dirty.
 *
 * Then, if we are the only one who is holding the buffer, relink the buffer
 * in the hash queue for the new location.
 *
 * If there was someone else holding the buffer, we write it to the new
 * location but not relink it, because that other user needs to have the buffer
 * at the same place.
 *
 * The caller's hold on "b" is released before returning.
 */
void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
{
	struct dm_bufio_client *c = b->c;
	struct dm_buffer *new;

	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);

retry:
	new = __find(c, new_block);
	if (new) {
		/* Somebody holds the buffer at the target: wait and look again. */
		if (new->hold_count) {
			__wait_for_free_buffer(c);
			goto retry;
		}

		/*
		 * FIXME: Is there any point waiting for a write that's going
		 * to be overwritten in a bit?
		 */
		__make_buffer_clean(new);
		__unlink_buffer(new);
		__free_buffer_wake(new);
	}

	BUG_ON(!b->hold_count);
	BUG_ON(test_bit(B_READING, &b->state));

	__write_dirty_buffer(b, NULL);
	if (b->hold_count == 1) {
		/*
		 * We are the only holder: wait for the write to finish, then
		 * move the buffer to new_block and leave it dirty so that it
		 * gets written there later.
		 */
		wait_on_bit(&b->state, B_WRITING,
			    do_io_schedule, TASK_UNINTERRUPTIBLE);
		set_bit(B_DIRTY, &b->state);
		__unlink_buffer(b);
		__link_buffer(b, new_block, LIST_DIRTY);
	} else {
		sector_t old_block;
		/* Take B_WRITING for ourselves before submitting the write. */
		wait_on_bit_lock(&b->state, B_WRITING,
				 do_io_schedule, TASK_UNINTERRUPTIBLE);
		/*
		 * Relink buffer to "new_block" so that write_callback
		 * sees "new_block" as a block number.
		 * After the write, link the buffer back to old_block.
		 * All this must be done in bufio lock, so that block number
		 * change isn't visible to other threads.
		 */
		old_block = b->block;
		__unlink_buffer(b);
		__link_buffer(b, new_block, b->list_mode);
		submit_io(b, WRITE, new_block, write_endio);
		wait_on_bit(&b->state, B_WRITING,
			    do_io_schedule, TASK_UNINTERRUPTIBLE);
		__unlink_buffer(b);
		__link_buffer(b, old_block, b->list_mode);
	}

	dm_bufio_unlock(c);
	dm_bufio_release(b);
}
EXPORT_SYMBOL_GPL(dm_bufio_release_move);

/*
 * Free the given buffer.
 *
 * This is just a hint, if the buffer is in use or dirty, this function
 * does nothing.
1360 */ 1361 void dm_bufio_forget(struct dm_bufio_client *c, sector_t block) 1362 { 1363 struct dm_buffer *b; 1364 1365 dm_bufio_lock(c); 1366 1367 b = __find(c, block); 1368 if (b && likely(!b->hold_count) && likely(!b->state)) { 1369 __unlink_buffer(b); 1370 __free_buffer_wake(b); 1371 } 1372 1373 dm_bufio_unlock(c); 1374 } 1375 EXPORT_SYMBOL(dm_bufio_forget); 1376 1377 void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n) 1378 { 1379 c->minimum_buffers = n; 1380 } 1381 EXPORT_SYMBOL(dm_bufio_set_minimum_buffers); 1382 1383 unsigned dm_bufio_get_block_size(struct dm_bufio_client *c) 1384 { 1385 return c->block_size; 1386 } 1387 EXPORT_SYMBOL_GPL(dm_bufio_get_block_size); 1388 1389 sector_t dm_bufio_get_device_size(struct dm_bufio_client *c) 1390 { 1391 return i_size_read(c->bdev->bd_inode) >> 1392 (SECTOR_SHIFT + c->sectors_per_block_bits); 1393 } 1394 EXPORT_SYMBOL_GPL(dm_bufio_get_device_size); 1395 1396 sector_t dm_bufio_get_block_number(struct dm_buffer *b) 1397 { 1398 return b->block; 1399 } 1400 EXPORT_SYMBOL_GPL(dm_bufio_get_block_number); 1401 1402 void *dm_bufio_get_block_data(struct dm_buffer *b) 1403 { 1404 return b->data; 1405 } 1406 EXPORT_SYMBOL_GPL(dm_bufio_get_block_data); 1407 1408 void *dm_bufio_get_aux_data(struct dm_buffer *b) 1409 { 1410 return b + 1; 1411 } 1412 EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data); 1413 1414 struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b) 1415 { 1416 return b->c; 1417 } 1418 EXPORT_SYMBOL_GPL(dm_bufio_get_client); 1419 1420 static void drop_buffers(struct dm_bufio_client *c) 1421 { 1422 struct dm_buffer *b; 1423 int i; 1424 1425 BUG_ON(dm_bufio_in_request()); 1426 1427 /* 1428 * An optimization so that the buffers are not written one-by-one. 
1429 */ 1430 dm_bufio_write_dirty_buffers_async(c); 1431 1432 dm_bufio_lock(c); 1433 1434 while ((b = __get_unclaimed_buffer(c))) 1435 __free_buffer_wake(b); 1436 1437 for (i = 0; i < LIST_SIZE; i++) 1438 list_for_each_entry(b, &c->lru[i], lru_list) 1439 DMERR("leaked buffer %llx, hold count %u, list %d", 1440 (unsigned long long)b->block, b->hold_count, i); 1441 1442 for (i = 0; i < LIST_SIZE; i++) 1443 BUG_ON(!list_empty(&c->lru[i])); 1444 1445 dm_bufio_unlock(c); 1446 } 1447 1448 /* 1449 * Test if the buffer is unused and too old, and commit it. 1450 * At if noio is set, we must not do any I/O because we hold 1451 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets rerouted to 1452 * different bufio client. 1453 */ 1454 static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp, 1455 unsigned long max_jiffies) 1456 { 1457 if (jiffies - b->last_accessed < max_jiffies) 1458 return 0; 1459 1460 if (!(gfp & __GFP_IO)) { 1461 if (test_bit(B_READING, &b->state) || 1462 test_bit(B_WRITING, &b->state) || 1463 test_bit(B_DIRTY, &b->state)) 1464 return 0; 1465 } 1466 1467 if (b->hold_count) 1468 return 0; 1469 1470 __make_buffer_clean(b); 1471 __unlink_buffer(b); 1472 __free_buffer_wake(b); 1473 1474 return 1; 1475 } 1476 1477 static long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, 1478 gfp_t gfp_mask) 1479 { 1480 int l; 1481 struct dm_buffer *b, *tmp; 1482 long freed = 0; 1483 1484 for (l = 0; l < LIST_SIZE; l++) { 1485 list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) { 1486 freed += __cleanup_old_buffer(b, gfp_mask, 0); 1487 if (!--nr_to_scan) 1488 break; 1489 } 1490 dm_bufio_cond_resched(); 1491 } 1492 return freed; 1493 } 1494 1495 static unsigned long 1496 dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) 1497 { 1498 struct dm_bufio_client *c; 1499 unsigned long freed; 1500 1501 c = container_of(shrink, struct dm_bufio_client, shrinker); 1502 if (sc->gfp_mask & __GFP_IO) 1503 dm_bufio_lock(c); 
1504 else if (!dm_bufio_trylock(c)) 1505 return SHRINK_STOP; 1506 1507 freed = __scan(c, sc->nr_to_scan, sc->gfp_mask); 1508 dm_bufio_unlock(c); 1509 return freed; 1510 } 1511 1512 static unsigned long 1513 dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc) 1514 { 1515 struct dm_bufio_client *c; 1516 unsigned long count; 1517 1518 c = container_of(shrink, struct dm_bufio_client, shrinker); 1519 if (sc->gfp_mask & __GFP_IO) 1520 dm_bufio_lock(c); 1521 else if (!dm_bufio_trylock(c)) 1522 return 0; 1523 1524 count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY]; 1525 dm_bufio_unlock(c); 1526 return count; 1527 } 1528 1529 /* 1530 * Create the buffering interface 1531 */ 1532 struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size, 1533 unsigned reserved_buffers, unsigned aux_size, 1534 void (*alloc_callback)(struct dm_buffer *), 1535 void (*write_callback)(struct dm_buffer *)) 1536 { 1537 int r; 1538 struct dm_bufio_client *c; 1539 unsigned i; 1540 1541 BUG_ON(block_size < 1 << SECTOR_SHIFT || 1542 (block_size & (block_size - 1))); 1543 1544 c = kmalloc(sizeof(*c), GFP_KERNEL); 1545 if (!c) { 1546 r = -ENOMEM; 1547 goto bad_client; 1548 } 1549 c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS); 1550 if (!c->cache_hash) { 1551 r = -ENOMEM; 1552 goto bad_hash; 1553 } 1554 1555 c->bdev = bdev; 1556 c->block_size = block_size; 1557 c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT; 1558 c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ? 1559 ffs(block_size) - 1 - PAGE_SHIFT : 0; 1560 c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ? 
1561 PAGE_SHIFT - (ffs(block_size) - 1) : 0); 1562 1563 c->aux_size = aux_size; 1564 c->alloc_callback = alloc_callback; 1565 c->write_callback = write_callback; 1566 1567 for (i = 0; i < LIST_SIZE; i++) { 1568 INIT_LIST_HEAD(&c->lru[i]); 1569 c->n_buffers[i] = 0; 1570 } 1571 1572 for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++) 1573 INIT_HLIST_HEAD(&c->cache_hash[i]); 1574 1575 mutex_init(&c->lock); 1576 INIT_LIST_HEAD(&c->reserved_buffers); 1577 c->need_reserved_buffers = reserved_buffers; 1578 1579 c->minimum_buffers = DM_BUFIO_MIN_BUFFERS; 1580 1581 init_waitqueue_head(&c->free_buffer_wait); 1582 c->async_write_error = 0; 1583 1584 c->dm_io = dm_io_client_create(); 1585 if (IS_ERR(c->dm_io)) { 1586 r = PTR_ERR(c->dm_io); 1587 goto bad_dm_io; 1588 } 1589 1590 mutex_lock(&dm_bufio_clients_lock); 1591 if (c->blocks_per_page_bits) { 1592 if (!DM_BUFIO_CACHE_NAME(c)) { 1593 DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size); 1594 if (!DM_BUFIO_CACHE_NAME(c)) { 1595 r = -ENOMEM; 1596 mutex_unlock(&dm_bufio_clients_lock); 1597 goto bad_cache; 1598 } 1599 } 1600 1601 if (!DM_BUFIO_CACHE(c)) { 1602 DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c), 1603 c->block_size, 1604 c->block_size, 0, NULL); 1605 if (!DM_BUFIO_CACHE(c)) { 1606 r = -ENOMEM; 1607 mutex_unlock(&dm_bufio_clients_lock); 1608 goto bad_cache; 1609 } 1610 } 1611 } 1612 mutex_unlock(&dm_bufio_clients_lock); 1613 1614 while (c->need_reserved_buffers) { 1615 struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL); 1616 1617 if (!b) { 1618 r = -ENOMEM; 1619 goto bad_buffer; 1620 } 1621 __free_buffer_wake(b); 1622 } 1623 1624 mutex_lock(&dm_bufio_clients_lock); 1625 dm_bufio_client_count++; 1626 list_add(&c->client_list, &dm_bufio_all_clients); 1627 __cache_size_refresh(); 1628 mutex_unlock(&dm_bufio_clients_lock); 1629 1630 c->shrinker.count_objects = dm_bufio_shrink_count; 1631 c->shrinker.scan_objects = dm_bufio_shrink_scan; 1632 c->shrinker.seeks = 1; 1633 
c->shrinker.batch = 0; 1634 register_shrinker(&c->shrinker); 1635 1636 return c; 1637 1638 bad_buffer: 1639 bad_cache: 1640 while (!list_empty(&c->reserved_buffers)) { 1641 struct dm_buffer *b = list_entry(c->reserved_buffers.next, 1642 struct dm_buffer, lru_list); 1643 list_del(&b->lru_list); 1644 free_buffer(b); 1645 } 1646 dm_io_client_destroy(c->dm_io); 1647 bad_dm_io: 1648 vfree(c->cache_hash); 1649 bad_hash: 1650 kfree(c); 1651 bad_client: 1652 return ERR_PTR(r); 1653 } 1654 EXPORT_SYMBOL_GPL(dm_bufio_client_create); 1655 1656 /* 1657 * Free the buffering interface. 1658 * It is required that there are no references on any buffers. 1659 */ 1660 void dm_bufio_client_destroy(struct dm_bufio_client *c) 1661 { 1662 unsigned i; 1663 1664 drop_buffers(c); 1665 1666 unregister_shrinker(&c->shrinker); 1667 1668 mutex_lock(&dm_bufio_clients_lock); 1669 1670 list_del(&c->client_list); 1671 dm_bufio_client_count--; 1672 __cache_size_refresh(); 1673 1674 mutex_unlock(&dm_bufio_clients_lock); 1675 1676 for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++) 1677 BUG_ON(!hlist_empty(&c->cache_hash[i])); 1678 1679 BUG_ON(c->need_reserved_buffers); 1680 1681 while (!list_empty(&c->reserved_buffers)) { 1682 struct dm_buffer *b = list_entry(c->reserved_buffers.next, 1683 struct dm_buffer, lru_list); 1684 list_del(&b->lru_list); 1685 free_buffer(b); 1686 } 1687 1688 for (i = 0; i < LIST_SIZE; i++) 1689 if (c->n_buffers[i]) 1690 DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]); 1691 1692 for (i = 0; i < LIST_SIZE; i++) 1693 BUG_ON(c->n_buffers[i]); 1694 1695 dm_io_client_destroy(c->dm_io); 1696 vfree(c->cache_hash); 1697 kfree(c); 1698 } 1699 EXPORT_SYMBOL_GPL(dm_bufio_client_destroy); 1700 1701 static void cleanup_old_buffers(void) 1702 { 1703 unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age); 1704 struct dm_bufio_client *c; 1705 1706 if (max_age > ULONG_MAX / HZ) 1707 max_age = ULONG_MAX / HZ; 1708 1709 mutex_lock(&dm_bufio_clients_lock); 1710 list_for_each_entry(c, 
&dm_bufio_all_clients, client_list) { 1711 if (!dm_bufio_trylock(c)) 1712 continue; 1713 1714 while (!list_empty(&c->lru[LIST_CLEAN])) { 1715 struct dm_buffer *b; 1716 b = list_entry(c->lru[LIST_CLEAN].prev, 1717 struct dm_buffer, lru_list); 1718 if (!__cleanup_old_buffer(b, 0, max_age * HZ)) 1719 break; 1720 dm_bufio_cond_resched(); 1721 } 1722 1723 dm_bufio_unlock(c); 1724 dm_bufio_cond_resched(); 1725 } 1726 mutex_unlock(&dm_bufio_clients_lock); 1727 } 1728 1729 static struct workqueue_struct *dm_bufio_wq; 1730 static struct delayed_work dm_bufio_work; 1731 1732 static void work_fn(struct work_struct *w) 1733 { 1734 cleanup_old_buffers(); 1735 1736 queue_delayed_work(dm_bufio_wq, &dm_bufio_work, 1737 DM_BUFIO_WORK_TIMER_SECS * HZ); 1738 } 1739 1740 /*---------------------------------------------------------------- 1741 * Module setup 1742 *--------------------------------------------------------------*/ 1743 1744 /* 1745 * This is called only once for the whole dm_bufio module. 1746 * It initializes memory limit. 
1747 */ 1748 static int __init dm_bufio_init(void) 1749 { 1750 __u64 mem; 1751 1752 dm_bufio_allocated_kmem_cache = 0; 1753 dm_bufio_allocated_get_free_pages = 0; 1754 dm_bufio_allocated_vmalloc = 0; 1755 dm_bufio_current_allocated = 0; 1756 1757 memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches); 1758 memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names); 1759 1760 mem = (__u64)((totalram_pages - totalhigh_pages) * 1761 DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT; 1762 1763 if (mem > ULONG_MAX) 1764 mem = ULONG_MAX; 1765 1766 #ifdef CONFIG_MMU 1767 /* 1768 * Get the size of vmalloc space the same way as VMALLOC_TOTAL 1769 * in fs/proc/internal.h 1770 */ 1771 if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100) 1772 mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100; 1773 #endif 1774 1775 dm_bufio_default_cache_size = mem; 1776 1777 mutex_lock(&dm_bufio_clients_lock); 1778 __cache_size_refresh(); 1779 mutex_unlock(&dm_bufio_clients_lock); 1780 1781 dm_bufio_wq = create_singlethread_workqueue("dm_bufio_cache"); 1782 if (!dm_bufio_wq) 1783 return -ENOMEM; 1784 1785 INIT_DELAYED_WORK(&dm_bufio_work, work_fn); 1786 queue_delayed_work(dm_bufio_wq, &dm_bufio_work, 1787 DM_BUFIO_WORK_TIMER_SECS * HZ); 1788 1789 return 0; 1790 } 1791 1792 /* 1793 * This is called once when unloading the dm_bufio module. 
1794 */ 1795 static void __exit dm_bufio_exit(void) 1796 { 1797 int bug = 0; 1798 int i; 1799 1800 cancel_delayed_work_sync(&dm_bufio_work); 1801 destroy_workqueue(dm_bufio_wq); 1802 1803 for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) { 1804 struct kmem_cache *kc = dm_bufio_caches[i]; 1805 1806 if (kc) 1807 kmem_cache_destroy(kc); 1808 } 1809 1810 for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++) 1811 kfree(dm_bufio_cache_names[i]); 1812 1813 if (dm_bufio_client_count) { 1814 DMCRIT("%s: dm_bufio_client_count leaked: %d", 1815 __func__, dm_bufio_client_count); 1816 bug = 1; 1817 } 1818 1819 if (dm_bufio_current_allocated) { 1820 DMCRIT("%s: dm_bufio_current_allocated leaked: %lu", 1821 __func__, dm_bufio_current_allocated); 1822 bug = 1; 1823 } 1824 1825 if (dm_bufio_allocated_get_free_pages) { 1826 DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu", 1827 __func__, dm_bufio_allocated_get_free_pages); 1828 bug = 1; 1829 } 1830 1831 if (dm_bufio_allocated_vmalloc) { 1832 DMCRIT("%s: dm_bufio_vmalloc leaked: %lu", 1833 __func__, dm_bufio_allocated_vmalloc); 1834 bug = 1; 1835 } 1836 1837 if (bug) 1838 BUG(); 1839 } 1840 1841 module_init(dm_bufio_init) 1842 module_exit(dm_bufio_exit) 1843 1844 module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR); 1845 MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache"); 1846 1847 module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR); 1848 MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds"); 1849 1850 module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR); 1851 MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory"); 1852 1853 module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO); 1854 MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc"); 1855 1856 module_param_named(allocated_get_free_pages_bytes, 
dm_bufio_allocated_get_free_pages, ulong, S_IRUGO); 1857 MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages"); 1858 1859 module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO); 1860 MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc"); 1861 1862 module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO); 1863 MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache"); 1864 1865 MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>"); 1866 MODULE_DESCRIPTION(DM_NAME " buffered I/O library"); 1867 MODULE_LICENSE("GPL"); 1868