/*
 * Copyright (C) 2009-2011 Red Hat, Inc.
 *
 * Author: Mikulas Patocka <mpatocka@redhat.com>
 *
 * This file is released under the GPL.
 */

#include "dm-bufio.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/slab.h>
#include <linux/jiffies.h>
#include <linux/vmalloc.h>
#include <linux/shrinker.h>
#include <linux/module.h>
#include <linux/rbtree.h>
#include <linux/stacktrace.h>

#define DM_MSG_PREFIX "bufio"

/*
 * Memory management policy:
 *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
 *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
 *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
 *	Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
 *	dirty buffers.
 */
#define DM_BUFIO_MIN_BUFFERS		8

#define DM_BUFIO_MEMORY_PERCENT		2
#define DM_BUFIO_VMALLOC_PERCENT	25
#define DM_BUFIO_WRITEBACK_PERCENT	75

/*
 * Check buffer ages in this interval (seconds)
 */
#define DM_BUFIO_WORK_TIMER_SECS	30

/*
 * Free buffers when they are older than this (seconds)
 */
#define DM_BUFIO_DEFAULT_AGE_SECS	300

/*
 * The nr of bytes of cached data to keep around.
 */
#define DM_BUFIO_DEFAULT_RETAIN_BYTES	(256 * 1024)

/*
 * The number of bvec entries that are embedded directly in the buffer.
 * If the chunk size is larger, dm-io is used to do the io.
 */
#define DM_BUFIO_INLINE_VECS		16

/*
 * Don't try to use kmem_cache_alloc for blocks larger than this.
 * For explanation, see alloc_buffer_data below.
 */
#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT	(PAGE_SIZE >> 1)
#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT	(PAGE_SIZE << (MAX_ORDER - 1))

/*
 * dm_buffer->list_mode
 */
#define LIST_CLEAN	0
#define LIST_DIRTY	1
#define LIST_SIZE	2

/*
 * Linking of buffers:
 *	All buffers are linked into the buffer_tree rb-tree with their
 *	node field.
 *
 *	Clean buffers that are not being written (B_WRITING not set)
 *	are linked to lru[LIST_CLEAN] with their lru_list field.
 *
 *	Dirty and clean buffers that are being written are linked to
 *	lru[LIST_DIRTY] with their lru_list field. When the write
 *	finishes, the buffer cannot be relinked immediately (because we
 *	are in an interrupt context and relinking requires process
 *	context), so some clean-not-writing buffers can be held on
 *	the dirty LRU too.  They are later moved to the clean LRU in
 *	process context.
 */
struct dm_bufio_client {
	struct mutex lock;

	struct list_head lru[LIST_SIZE];
	unsigned long n_buffers[LIST_SIZE];

	struct block_device *bdev;
	unsigned block_size;
	unsigned char sectors_per_block_bits;
	unsigned char pages_per_block_bits;
	unsigned char blocks_per_page_bits;
	unsigned aux_size;
	void (*alloc_callback)(struct dm_buffer *);
	void (*write_callback)(struct dm_buffer *);

	struct dm_io_client *dm_io;

	struct list_head reserved_buffers;
	unsigned need_reserved_buffers;

	unsigned minimum_buffers;

	struct rb_root buffer_tree;
	wait_queue_head_t free_buffer_wait;

	int async_write_error;

	struct list_head client_list;
	struct shrinker shrinker;
};

/*
 * Buffer state bits.
 */
#define B_READING	0
#define B_WRITING	1
#define B_DIRTY		2

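/*
 * Illustrative sketch (not part of the driver): the protocol used with the
 * state bits above.  The I/O completion side clears the bit and wakes any
 * waiters; the waiting side sleeps in wait_on_bit_io().  The example_*
 * function names are hypothetical and exist only to show the pattern that
 * write_endio()/read_endio() and their callers follow further down.
 */
#if 0
static void example_wait_for_write(struct dm_buffer *b)
{
	/* Sleep until the endio routine clears B_WRITING. */
	wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
}

static void example_complete_write(struct dm_buffer *b)
{
	/* Clear the bit with the barriers the waiter expects, then wake it. */
	smp_mb__before_atomic();
	clear_bit(B_WRITING, &b->state);
	smp_mb__after_atomic();
	wake_up_bit(&b->state, B_WRITING);
}
#endif
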
/*
 * Describes how the block was allocated:
 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 * See the comment at alloc_buffer_data.
 */
enum data_mode {
	DATA_MODE_SLAB = 0,
	DATA_MODE_GET_FREE_PAGES = 1,
	DATA_MODE_VMALLOC = 2,
	DATA_MODE_LIMIT = 3
};

struct dm_buffer {
	struct rb_node node;
	struct list_head lru_list;
	sector_t block;
	void *data;
	enum data_mode data_mode;
	unsigned char list_mode;		/* LIST_* */
	unsigned hold_count;
	int read_error;
	int write_error;
	unsigned long state;
	unsigned long last_accessed;
	struct dm_bufio_client *c;
	struct list_head write_list;
	struct bio bio;
	struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
#define MAX_STACK 10
	struct stack_trace stack_trace;
	unsigned long stack_entries[MAX_STACK];
#endif
};

/*----------------------------------------------------------------*/

static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];

static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
{
	unsigned ret = c->blocks_per_page_bits - 1;

	BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches));

	return ret;
}

#define DM_BUFIO_CACHE(c)	(dm_bufio_caches[dm_bufio_cache_index(c)])
#define DM_BUFIO_CACHE_NAME(c)	(dm_bufio_cache_names[dm_bufio_cache_index(c)])

#define dm_bufio_in_request()	(!!current->bio_list)

static void dm_bufio_lock(struct dm_bufio_client *c)
{
	mutex_lock_nested(&c->lock, dm_bufio_in_request());
}

static int dm_bufio_trylock(struct dm_bufio_client *c)
{
	return mutex_trylock(&c->lock);
}

static void dm_bufio_unlock(struct dm_bufio_client *c)
{
	mutex_unlock(&c->lock);
}

/*----------------------------------------------------------------*/

/*
 * Default cache size: available memory divided by the ratio.
 */
static unsigned long dm_bufio_default_cache_size;

/*
 * Total cache size set by the user.
 */
static unsigned long dm_bufio_cache_size;

/*
 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
 * at any time.  If it disagrees, the user has changed cache size.
 */
static unsigned long dm_bufio_cache_size_latch;

static DEFINE_SPINLOCK(param_spinlock);

/*
 * Buffers are freed after this timeout
 */
static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;

static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
static unsigned long dm_bufio_allocated_get_free_pages;
static unsigned long dm_bufio_allocated_vmalloc;
static unsigned long dm_bufio_current_allocated;

/*----------------------------------------------------------------*/

/*
 * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
 */
static unsigned long dm_bufio_cache_size_per_client;

/*
 * The current number of clients.
 */
static int dm_bufio_client_count;

/*
 * The list of all clients.
 */
static LIST_HEAD(dm_bufio_all_clients);

/*
 * This mutex protects dm_bufio_cache_size_latch,
 * dm_bufio_cache_size_per_client and dm_bufio_client_count
 */
static DEFINE_MUTEX(dm_bufio_clients_lock);

#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
static void buffer_record_stack(struct dm_buffer *b)
{
	b->stack_trace.nr_entries = 0;
	b->stack_trace.max_entries = MAX_STACK;
	b->stack_trace.entries = b->stack_entries;
	b->stack_trace.skip = 2;
	save_stack_trace(&b->stack_trace);
}
#endif

/*----------------------------------------------------------------
 * A red/black tree acts as an index for all the buffers.
 *--------------------------------------------------------------*/
static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
{
	struct rb_node *n = c->buffer_tree.rb_node;
	struct dm_buffer *b;

	while (n) {
		b = container_of(n, struct dm_buffer, node);

		if (b->block == block)
			return b;

		n = (b->block < block) ? n->rb_left : n->rb_right;
	}

	return NULL;
}

static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
{
	struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
	struct dm_buffer *found;

	while (*new) {
		found = container_of(*new, struct dm_buffer, node);

		if (found->block == b->block) {
			BUG_ON(found != b);
			return;
		}

		parent = *new;
		new = (found->block < b->block) ?
			&((*new)->rb_left) : &((*new)->rb_right);
	}

	rb_link_node(&b->node, parent, new);
	rb_insert_color(&b->node, &c->buffer_tree);
}

static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
{
	rb_erase(&b->node, &c->buffer_tree);
}

/*----------------------------------------------------------------*/

static void adjust_total_allocated(enum data_mode data_mode, long diff)
{
	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
		&dm_bufio_allocated_kmem_cache,
		&dm_bufio_allocated_get_free_pages,
		&dm_bufio_allocated_vmalloc,
	};

	spin_lock(&param_spinlock);

	*class_ptr[data_mode] += diff;

	dm_bufio_current_allocated += diff;

	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
		dm_bufio_peak_allocated = dm_bufio_current_allocated;

	spin_unlock(&param_spinlock);
}

/*
 * Change the number of clients and recalculate per-client limit.
 */
static void __cache_size_refresh(void)
{
	BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
	BUG_ON(dm_bufio_client_count < 0);

	dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size);

	/*
	 * Use default if set to 0 and report the actual cache size used.
	 */
	if (!dm_bufio_cache_size_latch) {
		(void)cmpxchg(&dm_bufio_cache_size, 0,
			      dm_bufio_default_cache_size);
		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
	}

	dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
					 (dm_bufio_client_count ? : 1);
}

/*
 * Allocating buffer data.
 *
 * Small buffers are allocated with kmem_cache, to use space optimally.
 *
 * For large buffers, we choose between get_free_pages and vmalloc.
 * Each has advantages and disadvantages.
 *
 * __get_free_pages can randomly fail if the memory is fragmented.
 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
 * as low as 128M) so using it for caching is not appropriate.
 *
 * If the allocation may fail we use __get_free_pages. Memory fragmentation
 * won't have a fatal effect here, but it just causes flushes of some other
 * buffers and more I/O will be performed. Don't use __get_free_pages if it
 * always fails (i.e. order >= MAX_ORDER).
 *
 * If the allocation shouldn't fail we use __vmalloc. This is only for the
 * initial reserve allocation, so there's no risk of wasting all vmalloc
 * space.
 */
static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
			       enum data_mode *data_mode)
{
	unsigned noio_flag;
	void *ptr;

	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
		*data_mode = DATA_MODE_SLAB;
		return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
	}

	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT &&
	    gfp_mask & __GFP_NORETRY) {
		*data_mode = DATA_MODE_GET_FREE_PAGES;
		return (void *)__get_free_pages(gfp_mask,
						c->pages_per_block_bits);
	}

	*data_mode = DATA_MODE_VMALLOC;

	/*
	 * __vmalloc allocates the data pages and auxiliary structures with
	 * gfp_flags that were specified, but pagetables are always allocated
	 * with GFP_KERNEL, no matter what was specified as gfp_mask.
	 *
	 * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
	 * all allocations done by this process (including pagetables) are done
	 * as if GFP_NOIO was specified.
	 */

	if (gfp_mask & __GFP_NORETRY)
		noio_flag = memalloc_noio_save();

	ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL);

	if (gfp_mask & __GFP_NORETRY)
		memalloc_noio_restore(noio_flag);

	return ptr;
}

/*
 * Free buffer's data.
 */
static void free_buffer_data(struct dm_bufio_client *c,
			     void *data, enum data_mode data_mode)
{
	switch (data_mode) {
	case DATA_MODE_SLAB:
		kmem_cache_free(DM_BUFIO_CACHE(c), data);
		break;

	case DATA_MODE_GET_FREE_PAGES:
		free_pages((unsigned long)data, c->pages_per_block_bits);
		break;

	case DATA_MODE_VMALLOC:
		vfree(data);
		break;

	default:
		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
		       data_mode);
		BUG();
	}
}

/*
 * Allocate buffer and its data.
 */
static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
{
	struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size,
				      gfp_mask);

	if (!b)
		return NULL;

	b->c = c;

	b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
	if (!b->data) {
		kfree(b);
		return NULL;
	}

	adjust_total_allocated(b->data_mode, (long)c->block_size);

#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
	memset(&b->stack_trace, 0, sizeof(b->stack_trace));
#endif
	return b;
}

/*
 * Free buffer and its data.
 */
static void free_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	adjust_total_allocated(b->data_mode, -(long)c->block_size);

	free_buffer_data(c, b->data, b->data_mode);
	kfree(b);
}

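/*
 * Worked example of the choice made in alloc_buffer_data() above (assuming
 * 4 KiB pages and MAX_ORDER == 11, i.e. common x86-64 defaults):
 *
 *   block_size <= 2 KiB (DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT)
 *	-> kmem_cache_alloc from the per-block-size slab cache
 *   block_size <= 4 MiB (DM_BUFIO_BLOCK_SIZE_GFP_LIMIT) and the caller
 *   passed __GFP_NORETRY (an allocation that is allowed to fail)
 *	-> __get_free_pages
 *   anything else, or a reserve allocation that must not fail
 *	-> __vmalloc
 */
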
/*
 * Link buffer to the buffer tree and the clean or dirty queue.
 */
static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
{
	struct dm_bufio_client *c = b->c;

	c->n_buffers[dirty]++;
	b->block = block;
	b->list_mode = dirty;
	list_add(&b->lru_list, &c->lru[dirty]);
	__insert(b->c, b);
	b->last_accessed = jiffies;
}

/*
 * Unlink buffer from the buffer tree and the dirty or clean queue.
 */
static void __unlink_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	__remove(b->c, b);
	list_del(&b->lru_list);
}

/*
 * Place the buffer at the head of the dirty or clean LRU queue.
 */
static void __relink_lru(struct dm_buffer *b, int dirty)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	c->n_buffers[dirty]++;
	b->list_mode = dirty;
	list_move(&b->lru_list, &c->lru[dirty]);
	b->last_accessed = jiffies;
}

/*----------------------------------------------------------------
 * Submit I/O on the buffer.
 *
 * Bio interface is faster but it has some problems:
 *	the vector list is limited (increasing this limit increases
 *	memory-consumption per buffer, so it is not viable);
 *
 *	the memory must be direct-mapped, not vmalloced;
 *
 *	the I/O driver can reject requests spuriously if it thinks that
 *	the requests are too big for the device or if they cross a
 *	controller-defined memory boundary.
 *
 * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
 * it is not vmalloced, try using the bio interface.
 *
 * If the buffer is big, if it is vmalloced or if the underlying device
 * rejects the bio because it is too large, use the dm-io layer to do the
 * I/O.  The dm-io layer splits the I/O into multiple requests, avoiding
 * the above shortcomings.
 *--------------------------------------------------------------*/

/*
 * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
 * that the request was handled directly with bio interface.
 */
static void dmio_complete(unsigned long error, void *context)
{
	struct dm_buffer *b = context;

	b->bio.bi_error = error ? -EIO : 0;
	b->bio.bi_end_io(&b->bio);
}

static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
		     bio_end_io_t *end_io)
{
	int r;
	struct dm_io_request io_req = {
		.bi_op = rw,
		.bi_op_flags = 0,
		.notify.fn = dmio_complete,
		.notify.context = b,
		.client = b->c->dm_io,
	};
	struct dm_io_region region = {
		.bdev = b->c->bdev,
		.sector = block << b->c->sectors_per_block_bits,
		.count = b->c->block_size >> SECTOR_SHIFT,
	};

	if (b->data_mode != DATA_MODE_VMALLOC) {
		io_req.mem.type = DM_IO_KMEM;
		io_req.mem.ptr.addr = b->data;
	} else {
		io_req.mem.type = DM_IO_VMA;
		io_req.mem.ptr.vma = b->data;
	}

	b->bio.bi_end_io = end_io;

	r = dm_io(&io_req, 1, &region, NULL);
	if (r) {
		b->bio.bi_error = r;
		end_io(&b->bio);
	}
}

static void inline_endio(struct bio *bio)
{
	bio_end_io_t *end_fn = bio->bi_private;
	int error = bio->bi_error;

	/*
	 * Reset the bio to free any attached resources
	 * (e.g. bio integrity profiles).
	 */
	bio_reset(bio);

	bio->bi_error = error;
	end_fn(bio);
}

static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
			   bio_end_io_t *end_io)
{
	char *ptr;
	int len;

	bio_init(&b->bio, b->bio_vec, DM_BUFIO_INLINE_VECS);
	b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits;
	b->bio.bi_bdev = b->c->bdev;
	b->bio.bi_end_io = inline_endio;
	/*
	 * Use of .bi_private isn't a problem here because
	 * the dm_buffer's inline bio is local to bufio.
	 */
	b->bio.bi_private = end_io;
	bio_set_op_attrs(&b->bio, rw, 0);

	/*
	 * We assume that if len >= PAGE_SIZE, ptr is page-aligned.
	 * If len < PAGE_SIZE, the buffer doesn't cross a page boundary.
	 */
	ptr = b->data;
	len = b->c->block_size;

	if (len >= PAGE_SIZE)
		BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
	else
		BUG_ON((unsigned long)ptr & (len - 1));

	do {
		if (!bio_add_page(&b->bio, virt_to_page(ptr),
				  len < PAGE_SIZE ? len : PAGE_SIZE,
				  offset_in_page(ptr))) {
			BUG_ON(b->c->block_size <= PAGE_SIZE);
			use_dmio(b, rw, block, end_io);
			return;
		}

		len -= PAGE_SIZE;
		ptr += PAGE_SIZE;
	} while (len > 0);

	submit_bio(&b->bio);
}

static void submit_io(struct dm_buffer *b, int rw, sector_t block,
		      bio_end_io_t *end_io)
{
	if (rw == WRITE && b->c->write_callback)
		b->c->write_callback(b);

	if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE &&
	    b->data_mode != DATA_MODE_VMALLOC)
		use_inline_bio(b, rw, block, end_io);
	else
		use_dmio(b, rw, block, end_io);
}

/*----------------------------------------------------------------
 * Writing dirty buffers
 *--------------------------------------------------------------*/

/*
 * The endio routine for write.
 *
 * Set the error, clear the B_WRITING bit and wake anyone who was waiting
 * on it.
 */
static void write_endio(struct bio *bio)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->write_error = bio->bi_error;
	if (unlikely(bio->bi_error)) {
		struct dm_bufio_client *c = b->c;
		int error = bio->bi_error;
		(void)cmpxchg(&c->async_write_error, 0, error);
	}

	BUG_ON(!test_bit(B_WRITING, &b->state));

	smp_mb__before_atomic();
	clear_bit(B_WRITING, &b->state);
	smp_mb__after_atomic();

	wake_up_bit(&b->state, B_WRITING);
}

/*
 * Initiate a write on a dirty buffer, but don't wait for it.
 *
 * - If the buffer is not dirty, exit.
 * - If there is a previous write in progress, wait for it to finish (we
 *   can't have two writes on the same buffer simultaneously).
 * - Submit our write and don't wait on it. We set B_WRITING indicating
 *   that there is a write in progress.
 */
static void __write_dirty_buffer(struct dm_buffer *b,
				 struct list_head *write_list)
{
	if (!test_bit(B_DIRTY, &b->state))
		return;

	clear_bit(B_DIRTY, &b->state);
	wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);

	if (!write_list)
		submit_io(b, WRITE, b->block, write_endio);
	else
		list_add_tail(&b->write_list, write_list);
}

static void __flush_write_list(struct list_head *write_list)
{
	struct blk_plug plug;
	blk_start_plug(&plug);
	while (!list_empty(write_list)) {
		struct dm_buffer *b =
			list_entry(write_list->next, struct dm_buffer, write_list);
		list_del(&b->write_list);
		submit_io(b, WRITE, b->block, write_endio);
		cond_resched();
	}
	blk_finish_plug(&plug);
}

/*
 * Wait until any activity on the buffer finishes.  Possibly write the
 * buffer if it is dirty.  When this function finishes, there is no I/O
 * running on the buffer and the buffer is not dirty.
 */
static void __make_buffer_clean(struct dm_buffer *b)
{
	BUG_ON(b->hold_count);

	if (!b->state)	/* fast case */
		return;

	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
	__write_dirty_buffer(b, NULL);
	wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
}

/*
 * Find some buffer that is not held by anybody, clean it, unlink it and
 * return it.
 */
static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
{
	struct dm_buffer *b;

	list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
		BUG_ON(test_bit(B_WRITING, &b->state));
		BUG_ON(test_bit(B_DIRTY, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		cond_resched();
	}

	list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		cond_resched();
	}

	return NULL;
}

/*
 * Wait until some other thread frees a buffer or releases its hold count
 * on one.
 *
 * This function is entered with c->lock held, drops it and regains it
 * before exiting.
 */
static void __wait_for_free_buffer(struct dm_bufio_client *c)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&c->free_buffer_wait, &wait);
	set_task_state(current, TASK_UNINTERRUPTIBLE);
	dm_bufio_unlock(c);

	io_schedule();

	remove_wait_queue(&c->free_buffer_wait, &wait);

	dm_bufio_lock(c);
}

enum new_flag {
	NF_FRESH = 0,
	NF_READ = 1,
	NF_GET = 2,
	NF_PREFETCH = 3
};

/*
 * Allocate a new buffer. If the allocation is not possible, wait until
 * some other thread frees a buffer.
 *
 * May drop the lock and regain it.
 */
static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
{
	struct dm_buffer *b;

	/*
	 * dm-bufio is resistant to allocation failures (it just keeps
	 * one buffer reserved in case all allocations fail).
	 * So set flags to not try too hard:
	 *	GFP_NOIO: don't recurse into the I/O layer
	 *	__GFP_NORETRY: don't retry; rather, return failure
	 *	__GFP_NOMEMALLOC: don't use emergency reserves
	 *	__GFP_NOWARN: don't print a warning in case of failure
	 *
	 * For debugging, if we set the cache size to 1, no new buffers will
	 * be allocated.
	 */
	while (1) {
		if (dm_bufio_cache_size_latch != 1) {
			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			if (b)
				return b;
		}

		if (nf == NF_PREFETCH)
			return NULL;

		if (!list_empty(&c->reserved_buffers)) {
			b = list_entry(c->reserved_buffers.next,
				       struct dm_buffer, lru_list);
			list_del(&b->lru_list);
			c->need_reserved_buffers++;

			return b;
		}

		b = __get_unclaimed_buffer(c);
		if (b)
			return b;

		__wait_for_free_buffer(c);
	}
}

static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
{
	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);

	if (!b)
		return NULL;

	if (c->alloc_callback)
		c->alloc_callback(b);

	return b;
}

/*
 * Free a buffer and wake other threads waiting for free buffers.
 */
static void __free_buffer_wake(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	if (!c->need_reserved_buffers)
		free_buffer(b);
	else {
		list_add(&b->lru_list, &c->reserved_buffers);
		c->need_reserved_buffers--;
	}

	wake_up(&c->free_buffer_wait);
}

static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
					struct list_head *write_list)
{
	struct dm_buffer *b, *tmp;

	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state)) {
			__relink_lru(b, LIST_CLEAN);
			continue;
		}

		if (no_wait && test_bit(B_WRITING, &b->state))
			return;

		__write_dirty_buffer(b, write_list);
		cond_resched();
	}
}

/*
 * Get writeback threshold and buffer limit for a given client.
 */
static void __get_memory_limit(struct dm_bufio_client *c,
			       unsigned long *threshold_buffers,
			       unsigned long *limit_buffers)
{
	unsigned long buffers;

	if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) {
		mutex_lock(&dm_bufio_clients_lock);
		__cache_size_refresh();
		mutex_unlock(&dm_bufio_clients_lock);
	}

	buffers = dm_bufio_cache_size_per_client >>
		  (c->sectors_per_block_bits + SECTOR_SHIFT);

	if (buffers < c->minimum_buffers)
		buffers = c->minimum_buffers;

	*limit_buffers = buffers;
	*threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
}

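/*
 * Worked example for __get_memory_limit() above (the numbers are purely
 * illustrative): with a per-client share of 25 MiB and a 4 KiB block size,
 * limit_buffers is 25 MiB / 4 KiB = 6400 buffers and threshold_buffers is
 * 75% of that, i.e. 4800.  Background writeback starts once more than 4800
 * buffers are dirty; above 6400 buffers in total, unheld buffers start to
 * be freed.
 */
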
/*
 * Check if we're over the watermark.
 * If we're over limit_buffers, free clean unheld buffers until we get
 * back under the limit.
 * If we're over threshold_buffers, start writing back dirty buffers.
 */
static void __check_watermark(struct dm_bufio_client *c,
			      struct list_head *write_list)
{
	unsigned long threshold_buffers, limit_buffers;

	__get_memory_limit(c, &threshold_buffers, &limit_buffers);

	while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
	       limit_buffers) {

		struct dm_buffer *b = __get_unclaimed_buffer(c);

		if (!b)
			return;

		__free_buffer_wake(b);
		cond_resched();
	}

	if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
		__write_dirty_buffers_async(c, 1, write_list);
}

/*----------------------------------------------------------------
 * Getting a buffer
 *--------------------------------------------------------------*/

static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
				     enum new_flag nf, int *need_submit,
				     struct list_head *write_list)
{
	struct dm_buffer *b, *new_b = NULL;

	*need_submit = 0;

	b = __find(c, block);
	if (b)
		goto found_buffer;

	if (nf == NF_GET)
		return NULL;

	new_b = __alloc_buffer_wait(c, nf);
	if (!new_b)
		return NULL;

	/*
	 * We've had a period where the mutex was unlocked, so need to
	 * recheck the buffer tree.
	 */
	b = __find(c, block);
	if (b) {
		__free_buffer_wake(new_b);
		goto found_buffer;
	}

	__check_watermark(c, write_list);

	b = new_b;
	b->hold_count = 1;
	b->read_error = 0;
	b->write_error = 0;
	__link_buffer(b, block, LIST_CLEAN);

	if (nf == NF_FRESH) {
		b->state = 0;
		return b;
	}

	b->state = 1 << B_READING;
	*need_submit = 1;

	return b;

found_buffer:
	if (nf == NF_PREFETCH)
		return NULL;
	/*
	 * Note: it is essential that we don't wait for the buffer to be
	 * read if the dm_bufio_get function is used. Both dm_bufio_get and
	 * dm_bufio_prefetch can be used in the driver request routine.
	 * If the user called both dm_bufio_prefetch and dm_bufio_get on
	 * the same buffer, it would deadlock if we waited.
	 */
	if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
		return NULL;

	b->hold_count++;
	__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
		     test_bit(B_WRITING, &b->state));
	return b;
}

/*
 * The endio routine for reading: set the error, clear the bit and wake up
 * anyone waiting on the buffer.
 */
static void read_endio(struct bio *bio)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->read_error = bio->bi_error;

	BUG_ON(!test_bit(B_READING, &b->state));

	smp_mb__before_atomic();
	clear_bit(B_READING, &b->state);
	smp_mb__after_atomic();

	wake_up_bit(&b->state, B_READING);
}

/*
 * A common routine for dm_bufio_new and dm_bufio_read.  Operation of these
 * functions is similar except that dm_bufio_new doesn't read the
 * buffer from the disk (assuming that the caller overwrites all the data
 * and uses dm_bufio_mark_buffer_dirty to write new data back).
 */
static void *new_read(struct dm_bufio_client *c, sector_t block,
		      enum new_flag nf, struct dm_buffer **bp)
{
	int need_submit;
	struct dm_buffer *b;

	LIST_HEAD(write_list);

	dm_bufio_lock(c);
	b = __bufio_new(c, block, nf, &need_submit, &write_list);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
	if (b && b->hold_count == 1)
		buffer_record_stack(b);
#endif
	dm_bufio_unlock(c);

	__flush_write_list(&write_list);

	if (!b)
		return NULL;

	if (need_submit)
		submit_io(b, READ, b->block, read_endio);

	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);

	if (b->read_error) {
		int error = b->read_error;

		dm_bufio_release(b);

		return ERR_PTR(error);
	}

	*bp = b;

	return b->data;
}

void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	return new_read(c, block, NF_GET, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_get);

void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
		    struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_READ, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_read);

void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_FRESH, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_new);

void dm_bufio_prefetch(struct dm_bufio_client *c,
		       sector_t block, unsigned n_blocks)
{
	struct blk_plug plug;

	LIST_HEAD(write_list);

	BUG_ON(dm_bufio_in_request());

	blk_start_plug(&plug);
	dm_bufio_lock(c);

	for (; n_blocks--; block++) {
		int need_submit;
		struct dm_buffer *b;
		b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
				&write_list);
		if (unlikely(!list_empty(&write_list))) {
			dm_bufio_unlock(c);
			blk_finish_plug(&plug);
			__flush_write_list(&write_list);
			blk_start_plug(&plug);
			dm_bufio_lock(c);
		}
		if (unlikely(b != NULL)) {
			dm_bufio_unlock(c);

			if (need_submit)
				submit_io(b, READ, b->block, read_endio);
			dm_bufio_release(b);

			cond_resched();

			if (!n_blocks)
				goto flush_plug;
			dm_bufio_lock(c);
		}
	}

	dm_bufio_unlock(c);

flush_plug:
	blk_finish_plug(&plug);
}
EXPORT_SYMBOL_GPL(dm_bufio_prefetch);

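/*
 * Illustrative sketch (not part of the driver): the usual read-modify-write
 * cycle a dm-bufio client goes through with the functions exported above
 * and below.  The example_update_block() name and its arguments are
 * hypothetical.
 */
#if 0
static int example_update_block(struct dm_bufio_client *c, sector_t block,
				unsigned offset, u8 value)
{
	struct dm_buffer *b;
	u8 *data;

	data = dm_bufio_read(c, block, &b);	/* reads and pins the buffer */
	if (IS_ERR(data))
		return PTR_ERR(data);

	data[offset] = value;
	dm_bufio_mark_buffer_dirty(b);		/* schedule it for writeback */
	dm_bufio_release(b);			/* drop the hold count */

	/* Write all dirty buffers and flush the device cache. */
	return dm_bufio_write_dirty_buffers(c);
}
#endif
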
void dm_bufio_release(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	BUG_ON(!b->hold_count);

	b->hold_count--;
	if (!b->hold_count) {
		wake_up(&c->free_buffer_wait);

		/*
		 * If there were errors on the buffer, and the buffer is not
		 * to be written, free the buffer. There is no point in
		 * caching an invalid buffer.
		 */
		if ((b->read_error || b->write_error) &&
		    !test_bit(B_READING, &b->state) &&
		    !test_bit(B_WRITING, &b->state) &&
		    !test_bit(B_DIRTY, &b->state)) {
			__unlink_buffer(b);
			__free_buffer_wake(b);
		}
	}

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_release);

void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	BUG_ON(test_bit(B_READING, &b->state));

	if (!test_and_set_bit(B_DIRTY, &b->state))
		__relink_lru(b, LIST_DIRTY);

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);

void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
{
	LIST_HEAD(write_list);

	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0, &write_list);
	dm_bufio_unlock(c);
	__flush_write_list(&write_list);
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);

/*
 * For performance, it is essential that the buffers are written asynchronously
 * and simultaneously (so that the block layer can merge the writes) and then
 * waited upon.
 *
 * Finally, we flush the hardware disk cache.
 */
int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
{
	int a, f;
	unsigned long buffers_processed = 0;
	struct dm_buffer *b, *tmp;

	LIST_HEAD(write_list);

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0, &write_list);
	dm_bufio_unlock(c);
	__flush_write_list(&write_list);
	dm_bufio_lock(c);

again:
	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		int dropped_lock = 0;

		if (buffers_processed < c->n_buffers[LIST_DIRTY])
			buffers_processed++;

		BUG_ON(test_bit(B_READING, &b->state));

		if (test_bit(B_WRITING, &b->state)) {
			if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
				dropped_lock = 1;
				b->hold_count++;
				dm_bufio_unlock(c);
				wait_on_bit_io(&b->state, B_WRITING,
					       TASK_UNINTERRUPTIBLE);
				dm_bufio_lock(c);
				b->hold_count--;
			} else
				wait_on_bit_io(&b->state, B_WRITING,
					       TASK_UNINTERRUPTIBLE);
		}

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state))
			__relink_lru(b, LIST_CLEAN);

		cond_resched();

		/*
		 * If we dropped the lock, the list is no longer consistent,
		 * so we must restart the search.
		 *
		 * In the most common case, the buffer just processed is
		 * relinked to the clean list, so we won't loop scanning the
		 * same buffer again and again.
		 *
		 * This may livelock if there is another thread simultaneously
		 * dirtying buffers, so we count the number of buffers walked
		 * and if it exceeds the total number of buffers, it means that
		 * someone is doing some writes simultaneously with us.  In
		 * this case, stop, dropping the lock.
		 */
		if (dropped_lock)
			goto again;
	}
	wake_up(&c->free_buffer_wait);
	dm_bufio_unlock(c);

	a = xchg(&c->async_write_error, 0);
	f = dm_bufio_issue_flush(c);
	if (a)
		return a;

	return f;
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);

/*
 * Use dm-io to send an empty barrier and flush the device.
 */
int dm_bufio_issue_flush(struct dm_bufio_client *c)
{
	struct dm_io_request io_req = {
		.bi_op = REQ_OP_WRITE,
		.bi_op_flags = REQ_PREFLUSH,
		.mem.type = DM_IO_KMEM,
		.mem.ptr.addr = NULL,
		.client = c->dm_io,
	};
	struct dm_io_region io_reg = {
		.bdev = c->bdev,
		.sector = 0,
		.count = 0,
	};

	BUG_ON(dm_bufio_in_request());

	return dm_io(&io_req, 1, &io_reg, NULL);
}
EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);

/*
 * We first delete any other buffer that may be at that new location.
 *
 * Then, we write the buffer to the original location if it was dirty.
 *
 * Then, if we are the only one who is holding the buffer, relink the buffer
 * in the buffer tree for the new location.
 *
 * If there was someone else holding the buffer, we write it to the new
 * location but don't relink it, because that other user needs to have the
 * buffer at the same place.
 */
void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
{
	struct dm_bufio_client *c = b->c;
	struct dm_buffer *new;

	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);

retry:
	new = __find(c, new_block);
	if (new) {
		if (new->hold_count) {
			__wait_for_free_buffer(c);
			goto retry;
		}

		/*
		 * FIXME: Is there any point waiting for a write that's going
		 * to be overwritten in a bit?
		 */
		__make_buffer_clean(new);
		__unlink_buffer(new);
		__free_buffer_wake(new);
	}

	BUG_ON(!b->hold_count);
	BUG_ON(test_bit(B_READING, &b->state));

	__write_dirty_buffer(b, NULL);
	if (b->hold_count == 1) {
		wait_on_bit_io(&b->state, B_WRITING,
			       TASK_UNINTERRUPTIBLE);
		set_bit(B_DIRTY, &b->state);
		__unlink_buffer(b);
		__link_buffer(b, new_block, LIST_DIRTY);
	} else {
		sector_t old_block;
		wait_on_bit_lock_io(&b->state, B_WRITING,
				    TASK_UNINTERRUPTIBLE);
		/*
		 * Relink buffer to "new_block" so that write_callback
		 * sees "new_block" as a block number.
		 * After the write, link the buffer back to old_block.
		 * All this must be done under the bufio lock, so that the
		 * block number change isn't visible to other threads.
		 */
		old_block = b->block;
		__unlink_buffer(b);
		__link_buffer(b, new_block, b->list_mode);
		submit_io(b, WRITE, new_block, write_endio);
		wait_on_bit_io(&b->state, B_WRITING,
			       TASK_UNINTERRUPTIBLE);
		__unlink_buffer(b);
		__link_buffer(b, old_block, b->list_mode);
	}

	dm_bufio_unlock(c);
	dm_bufio_release(b);
}
EXPORT_SYMBOL_GPL(dm_bufio_release_move);

/*
 * Free the given buffer.
 *
 * This is just a hint; if the buffer is in use or dirty, this function
 * does nothing.
 */
void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
{
	struct dm_buffer *b;

	dm_bufio_lock(c);

	b = __find(c, block);
	if (b && likely(!b->hold_count) && likely(!b->state)) {
		__unlink_buffer(b);
		__free_buffer_wake(b);
	}

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL(dm_bufio_forget);

void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
{
	c->minimum_buffers = n;
}
EXPORT_SYMBOL(dm_bufio_set_minimum_buffers);

unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
{
	return c->block_size;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);

sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
{
	return i_size_read(c->bdev->bd_inode) >>
	       (SECTOR_SHIFT + c->sectors_per_block_bits);
}
EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);

sector_t dm_bufio_get_block_number(struct dm_buffer *b)
{
	return b->block;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);

void *dm_bufio_get_block_data(struct dm_buffer *b)
{
	return b->data;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);

void *dm_bufio_get_aux_data(struct dm_buffer *b)
{
	return b + 1;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);

struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
{
	return b->c;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_client);

static void drop_buffers(struct dm_bufio_client *c)
{
	struct dm_buffer *b;
	int i;
	bool warned = false;

	BUG_ON(dm_bufio_in_request());

	/*
	 * An optimization so that the buffers are not written one-by-one.
	 */
	dm_bufio_write_dirty_buffers_async(c);

	dm_bufio_lock(c);

	while ((b = __get_unclaimed_buffer(c)))
		__free_buffer_wake(b);

	for (i = 0; i < LIST_SIZE; i++)
		list_for_each_entry(b, &c->lru[i], lru_list) {
			WARN_ON(!warned);
			warned = true;
			DMERR("leaked buffer %llx, hold count %u, list %d",
			      (unsigned long long)b->block, b->hold_count, i);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
			print_stack_trace(&b->stack_trace, 1);
			b->hold_count = 0; /* mark unclaimed to avoid BUG_ON below */
#endif
		}

#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
	while ((b = __get_unclaimed_buffer(c)))
		__free_buffer_wake(b);
#endif

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(!list_empty(&c->lru[i]));

	dm_bufio_unlock(c);
}

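/*
 * Worked example for get_retain_buffers() below: with the default
 * retain_bytes of 256 KiB and a 4 KiB block size the retain target is
 * 256 KiB / 4 KiB = 64 buffers; eviction by the shrinker and by the
 * periodic aging work backs off once a client gets near that count.
 */
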
/*
 * We may not be able to evict this buffer if IO is pending or the client
 * is still using it.  Caller is expected to know the buffer is too old.
 *
 * And if GFP_NOFS is used, we must not do any I/O because we hold
 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
 * rerouted to a different bufio client.
 */
static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
{
	if (!(gfp & __GFP_FS)) {
		if (test_bit(B_READING, &b->state) ||
		    test_bit(B_WRITING, &b->state) ||
		    test_bit(B_DIRTY, &b->state))
			return false;
	}

	if (b->hold_count)
		return false;

	__make_buffer_clean(b);
	__unlink_buffer(b);
	__free_buffer_wake(b);

	return true;
}

static unsigned get_retain_buffers(struct dm_bufio_client *c)
{
	unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);

	return retain_bytes / c->block_size;
}

static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
			    gfp_t gfp_mask)
{
	int l;
	struct dm_buffer *b, *tmp;
	unsigned long freed = 0;
	unsigned long count = nr_to_scan;
	unsigned retain_target = get_retain_buffers(c);

	for (l = 0; l < LIST_SIZE; l++) {
		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
			if (__try_evict_buffer(b, gfp_mask))
				freed++;
			if (!--nr_to_scan || ((count - freed) <= retain_target))
				return freed;
			cond_resched();
		}
	}
	return freed;
}

static unsigned long
dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	struct dm_bufio_client *c;
	unsigned long freed;

	c = container_of(shrink, struct dm_bufio_client, shrinker);
	if (sc->gfp_mask & __GFP_FS)
		dm_bufio_lock(c);
	else if (!dm_bufio_trylock(c))
		return SHRINK_STOP;

	freed = __scan(c, sc->nr_to_scan, sc->gfp_mask);
	dm_bufio_unlock(c);
	return freed;
}

static unsigned long
dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
	struct dm_bufio_client *c;
	unsigned long count;

	c = container_of(shrink, struct dm_bufio_client, shrinker);
	if (sc->gfp_mask & __GFP_FS)
		dm_bufio_lock(c);
	else if (!dm_bufio_trylock(c))
		return 0;

	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
	dm_bufio_unlock(c);
	return count;
}

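/*
 * Worked example of the shift bookkeeping done by dm_bufio_client_create()
 * below, assuming 4 KiB pages:
 *
 *   block_size = 4096:  sectors_per_block_bits = 3,
 *                       pages_per_block_bits = 0, blocks_per_page_bits = 0
 *   block_size =  512:  sectors_per_block_bits = 0,
 *                       pages_per_block_bits = 0, blocks_per_page_bits = 3
 *   block_size = 64 KiB: sectors_per_block_bits = 7,
 *                       pages_per_block_bits = 4, blocks_per_page_bits = 0
 *
 * A non-zero blocks_per_page_bits is what routes allocations to the
 * per-block-size kmem_cache (see dm_bufio_cache_index()).
 */
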
/*
 * Create the buffering interface
 */
struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
					       unsigned reserved_buffers, unsigned aux_size,
					       void (*alloc_callback)(struct dm_buffer *),
					       void (*write_callback)(struct dm_buffer *))
{
	int r;
	struct dm_bufio_client *c;
	unsigned i;

	BUG_ON(block_size < 1 << SECTOR_SHIFT ||
	       (block_size & (block_size - 1)));

	c = kzalloc(sizeof(*c), GFP_KERNEL);
	if (!c) {
		r = -ENOMEM;
		goto bad_client;
	}
	c->buffer_tree = RB_ROOT;

	c->bdev = bdev;
	c->block_size = block_size;
	c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
	c->pages_per_block_bits = (__ffs(block_size) >= PAGE_SHIFT) ?
				  __ffs(block_size) - PAGE_SHIFT : 0;
	c->blocks_per_page_bits = (__ffs(block_size) < PAGE_SHIFT ?
				  PAGE_SHIFT - __ffs(block_size) : 0);

	c->aux_size = aux_size;
	c->alloc_callback = alloc_callback;
	c->write_callback = write_callback;

	for (i = 0; i < LIST_SIZE; i++) {
		INIT_LIST_HEAD(&c->lru[i]);
		c->n_buffers[i] = 0;
	}

	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->reserved_buffers);
	c->need_reserved_buffers = reserved_buffers;

	c->minimum_buffers = DM_BUFIO_MIN_BUFFERS;

	init_waitqueue_head(&c->free_buffer_wait);
	c->async_write_error = 0;

	c->dm_io = dm_io_client_create();
	if (IS_ERR(c->dm_io)) {
		r = PTR_ERR(c->dm_io);
		goto bad_dm_io;
	}

	mutex_lock(&dm_bufio_clients_lock);
	if (c->blocks_per_page_bits) {
		if (!DM_BUFIO_CACHE_NAME(c)) {
			DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size);
			if (!DM_BUFIO_CACHE_NAME(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}

		if (!DM_BUFIO_CACHE(c)) {
			DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c),
							      c->block_size,
							      c->block_size, 0, NULL);
			if (!DM_BUFIO_CACHE(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}
	}
	mutex_unlock(&dm_bufio_clients_lock);

	while (c->need_reserved_buffers) {
		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);

		if (!b) {
			r = -ENOMEM;
			goto bad_buffer;
		}
		__free_buffer_wake(b);
	}

	mutex_lock(&dm_bufio_clients_lock);
	dm_bufio_client_count++;
	list_add(&c->client_list, &dm_bufio_all_clients);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	c->shrinker.count_objects = dm_bufio_shrink_count;
	c->shrinker.scan_objects = dm_bufio_shrink_scan;
	c->shrinker.seeks = 1;
	c->shrinker.batch = 0;
	register_shrinker(&c->shrinker);

	return c;

bad_buffer:
bad_cache:
	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}
	dm_io_client_destroy(c->dm_io);
bad_dm_io:
	kfree(c);
bad_client:
	return ERR_PTR(r);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_create);

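/*
 * Illustrative sketch (not part of the driver): how a hypothetical target
 * might pair dm_bufio_client_create() above with dm_bufio_client_destroy()
 * below.  The 4096-byte block size, the single reserved buffer and the
 * example_* names are assumptions made only for this sketch; the caller
 * must check the return value with IS_ERR().
 */
#if 0
static struct dm_bufio_client *example_open_metadata(struct block_device *bdev)
{
	/* 4 KiB blocks, one reserved buffer, no aux data, no callbacks. */
	return dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL);
}

static void example_close_metadata(struct dm_bufio_client *c)
{
	/* All buffers must have been released before this point. */
	dm_bufio_client_destroy(c);
}
#endif
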
/*
 * Free the buffering interface.
 * It is required that there are no references on any buffers.
 */
void dm_bufio_client_destroy(struct dm_bufio_client *c)
{
	unsigned i;

	drop_buffers(c);

	unregister_shrinker(&c->shrinker);

	mutex_lock(&dm_bufio_clients_lock);

	list_del(&c->client_list);
	dm_bufio_client_count--;
	__cache_size_refresh();

	mutex_unlock(&dm_bufio_clients_lock);

	BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
	BUG_ON(c->need_reserved_buffers);

	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}

	for (i = 0; i < LIST_SIZE; i++)
		if (c->n_buffers[i])
			DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(c->n_buffers[i]);

	dm_io_client_destroy(c->dm_io);
	kfree(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);

static unsigned get_max_age_hz(void)
{
	unsigned max_age = ACCESS_ONCE(dm_bufio_max_age);

	if (max_age > UINT_MAX / HZ)
		max_age = UINT_MAX / HZ;

	return max_age * HZ;
}

static bool older_than(struct dm_buffer *b, unsigned long age_hz)
{
	return time_after_eq(jiffies, b->last_accessed + age_hz);
}

static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
{
	struct dm_buffer *b, *tmp;
	unsigned retain_target = get_retain_buffers(c);
	unsigned count;

	dm_bufio_lock(c);

	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
		if (count <= retain_target)
			break;

		if (!older_than(b, age_hz))
			break;

		if (__try_evict_buffer(b, 0))
			count--;

		cond_resched();
	}

	dm_bufio_unlock(c);
}

static void cleanup_old_buffers(void)
{
	unsigned long max_age_hz = get_max_age_hz();
	struct dm_bufio_client *c;

	mutex_lock(&dm_bufio_clients_lock);

	list_for_each_entry(c, &dm_bufio_all_clients, client_list)
		__evict_old_buffers(c, max_age_hz);

	mutex_unlock(&dm_bufio_clients_lock);
}

static struct workqueue_struct *dm_bufio_wq;
static struct delayed_work dm_bufio_work;

static void work_fn(struct work_struct *w)
{
	cleanup_old_buffers();

	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);
}

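/*
 * Worked example of the default cache size computed in dm_bufio_init()
 * below (illustrative numbers): on a 64-bit machine with 8 GiB of RAM,
 * DM_BUFIO_MEMORY_PERCENT gives 2% of it, about 164 MiB, and the vmalloc
 * cap is far larger, so ~164 MiB wins.  On a 32-bit machine with 128 MiB
 * of vmalloc space, DM_BUFIO_VMALLOC_PERCENT caps the result at 25% of
 * that, i.e. 32 MiB, if that is smaller.  The result becomes
 * dm_bufio_default_cache_size and is split between clients by
 * __cache_size_refresh().
 */
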
/*----------------------------------------------------------------
 * Module setup
 *--------------------------------------------------------------*/

/*
 * This is called only once for the whole dm_bufio module.
 * It initializes the memory limit.
 */
static int __init dm_bufio_init(void)
{
	__u64 mem;

	dm_bufio_allocated_kmem_cache = 0;
	dm_bufio_allocated_get_free_pages = 0;
	dm_bufio_allocated_vmalloc = 0;
	dm_bufio_current_allocated = 0;

	memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
	memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);

	mem = (__u64)((totalram_pages - totalhigh_pages) *
		      DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;

	if (mem > ULONG_MAX)
		mem = ULONG_MAX;

#ifdef CONFIG_MMU
	/*
	 * Get the size of vmalloc space the same way as VMALLOC_TOTAL
	 * in fs/proc/internal.h
	 */
	if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
		mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
#endif

	dm_bufio_default_cache_size = mem;

	mutex_lock(&dm_bufio_clients_lock);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0);
	if (!dm_bufio_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);

	return 0;
}

/*
 * This is called once when unloading the dm_bufio module.
 */
static void __exit dm_bufio_exit(void)
{
	int bug = 0;
	int i;

	cancel_delayed_work_sync(&dm_bufio_work);
	destroy_workqueue(dm_bufio_wq);

	for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++)
		kmem_cache_destroy(dm_bufio_caches[i]);

	for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
		kfree(dm_bufio_cache_names[i]);

	if (dm_bufio_client_count) {
		DMCRIT("%s: dm_bufio_client_count leaked: %d",
		       __func__, dm_bufio_client_count);
		bug = 1;
	}

	if (dm_bufio_current_allocated) {
		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
		       __func__, dm_bufio_current_allocated);
		bug = 1;
	}

	if (dm_bufio_allocated_get_free_pages) {
		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
		       __func__, dm_bufio_allocated_get_free_pages);
		bug = 1;
	}

	if (dm_bufio_allocated_vmalloc) {
		DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
		       __func__, dm_bufio_allocated_vmalloc);
		bug = 1;
	}

	BUG_ON(bug);
}

module_init(dm_bufio_init)
module_exit(dm_bufio_exit)

module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");

module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");

module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");

module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");

module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");

module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");

module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");

module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");

MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
MODULE_LICENSE("GPL");