/*
 * Copyright (C) 2009-2011 Red Hat, Inc.
 *
 * Author: Mikulas Patocka <mpatocka@redhat.com>
 *
 * This file is released under the GPL.
 */

#include "dm-bufio.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/slab.h>
#include <linux/jiffies.h>
#include <linux/vmalloc.h>
#include <linux/shrinker.h>
#include <linux/module.h>
#include <linux/rbtree.h>
#include <linux/stacktrace.h>

#define DM_MSG_PREFIX "bufio"

/*
 * Memory management policy:
 *	Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
 *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
 *	Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
 *	Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
 *	dirty buffers.
 */
#define DM_BUFIO_MIN_BUFFERS		8

#define DM_BUFIO_MEMORY_PERCENT		2
#define DM_BUFIO_VMALLOC_PERCENT	25
#define DM_BUFIO_WRITEBACK_PERCENT	75

/*
 * Check buffer ages in this interval (seconds)
 */
#define DM_BUFIO_WORK_TIMER_SECS	30

/*
 * Free buffers when they are older than this (seconds)
 */
#define DM_BUFIO_DEFAULT_AGE_SECS	300

/*
 * The nr of bytes of cached data to keep around.
 */
#define DM_BUFIO_DEFAULT_RETAIN_BYTES	(256 * 1024)

/*
 * The number of bvec entries that are embedded directly in the buffer.
 * If the chunk size is larger, dm-io is used to do the io.
 */
#define DM_BUFIO_INLINE_VECS		16

/*
 * Don't try to use kmem_cache_alloc for blocks larger than this.
 * For explanation, see alloc_buffer_data below.
 */
#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT	(PAGE_SIZE >> 1)
#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT	(PAGE_SIZE << (MAX_ORDER - 1))

/*
 * dm_buffer->list_mode
 */
#define LIST_CLEAN	0
#define LIST_DIRTY	1
#define LIST_SIZE	2

/*
 * Linking of buffers:
 *	All buffers are indexed in the buffer_tree rb-tree with their
 *	node field.
 *
 *	Clean buffers that are not being written (B_WRITING not set)
 *	are linked to lru[LIST_CLEAN] with their lru_list field.
 *
 *	Dirty and clean buffers that are being written are linked to
 *	lru[LIST_DIRTY] with their lru_list field. When the write
 *	finishes, the buffer cannot be relinked immediately (because we
 *	are in an interrupt context and relinking requires process
 *	context), so some clean-not-writing buffers can be held on
 *	dirty_lru too. They are later added to lru in the process
 *	context.
 */
struct dm_bufio_client {
	struct mutex lock;

	struct list_head lru[LIST_SIZE];
	unsigned long n_buffers[LIST_SIZE];

	struct block_device *bdev;
	unsigned block_size;
	unsigned char sectors_per_block_bits;
	unsigned char pages_per_block_bits;
	unsigned char blocks_per_page_bits;
	unsigned aux_size;
	void (*alloc_callback)(struct dm_buffer *);
	void (*write_callback)(struct dm_buffer *);

	struct dm_io_client *dm_io;

	struct list_head reserved_buffers;
	unsigned need_reserved_buffers;

	unsigned minimum_buffers;

	struct rb_root buffer_tree;
	wait_queue_head_t free_buffer_wait;

	int async_write_error;

	struct list_head client_list;
	struct shrinker shrinker;
};

/*
 * Buffer state bits.
 */
#define B_READING	0
#define B_WRITING	1
#define B_DIRTY		2

/*
 * Describes how the block was allocated:
 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 * See the comment at alloc_buffer_data.
 */
enum data_mode {
	DATA_MODE_SLAB = 0,
	DATA_MODE_GET_FREE_PAGES = 1,
	DATA_MODE_VMALLOC = 2,
	DATA_MODE_LIMIT = 3
};

struct dm_buffer {
	struct rb_node node;
	struct list_head lru_list;
	sector_t block;
	void *data;
	enum data_mode data_mode;
	unsigned char list_mode;		/* LIST_* */
	unsigned hold_count;
	int read_error;
	int write_error;
	unsigned long state;
	unsigned long last_accessed;
	struct dm_bufio_client *c;
	struct list_head write_list;
	struct bio bio;
	struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
#define MAX_STACK 10
	struct stack_trace stack_trace;
	unsigned long stack_entries[MAX_STACK];
#endif
};

/*----------------------------------------------------------------*/

static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];

static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
{
	unsigned ret = c->blocks_per_page_bits - 1;

	BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches));

	return ret;
}

#define DM_BUFIO_CACHE(c)	(dm_bufio_caches[dm_bufio_cache_index(c)])
#define DM_BUFIO_CACHE_NAME(c)	(dm_bufio_cache_names[dm_bufio_cache_index(c)])

#define dm_bufio_in_request()	(!!current->bio_list)

static void dm_bufio_lock(struct dm_bufio_client *c)
{
	mutex_lock_nested(&c->lock, dm_bufio_in_request());
}

static int dm_bufio_trylock(struct dm_bufio_client *c)
{
	return mutex_trylock(&c->lock);
}

static void dm_bufio_unlock(struct dm_bufio_client *c)
{
	mutex_unlock(&c->lock);
}

/*
 * FIXME Move to sched.h?
 */
#ifdef CONFIG_PREEMPT_VOLUNTARY
# define dm_bufio_cond_resched()		\
do {						\
	if (unlikely(need_resched()))		\
		_cond_resched();		\
} while (0)
#else
# define dm_bufio_cond_resched()	do { } while (0)
#endif

/*----------------------------------------------------------------*/

/*
 * Default cache size: available memory divided by the ratio.
 */
static unsigned long dm_bufio_default_cache_size;

/*
 * Total cache size set by the user.
 */
static unsigned long dm_bufio_cache_size;

/*
 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
 * at any time. If it disagrees, the user has changed cache size.
 */
static unsigned long dm_bufio_cache_size_latch;

static DEFINE_SPINLOCK(param_spinlock);

/*
 * Buffers are freed after this timeout
 */
static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;

static unsigned long dm_bufio_peak_allocated;
static unsigned long dm_bufio_allocated_kmem_cache;
static unsigned long dm_bufio_allocated_get_free_pages;
static unsigned long dm_bufio_allocated_vmalloc;
static unsigned long dm_bufio_current_allocated;

/*----------------------------------------------------------------*/

/*
 * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
 */
static unsigned long dm_bufio_cache_size_per_client;

/*
 * The current number of clients.
 */
static int dm_bufio_client_count;

/*
 * The list of all clients.
 */
static LIST_HEAD(dm_bufio_all_clients);

/*
 * This mutex protects dm_bufio_cache_size_latch,
 * dm_bufio_cache_size_per_client and dm_bufio_client_count
 */
static DEFINE_MUTEX(dm_bufio_clients_lock);

#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
static void buffer_record_stack(struct dm_buffer *b)
{
	b->stack_trace.nr_entries = 0;
	b->stack_trace.max_entries = MAX_STACK;
	b->stack_trace.entries = b->stack_entries;
	b->stack_trace.skip = 2;
	save_stack_trace(&b->stack_trace);
}
#endif

/*----------------------------------------------------------------
 * A red/black tree acts as an index for all the buffers.
 *--------------------------------------------------------------*/
static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
{
	struct rb_node *n = c->buffer_tree.rb_node;
	struct dm_buffer *b;

	while (n) {
		b = container_of(n, struct dm_buffer, node);

		if (b->block == block)
			return b;

		n = (b->block < block) ? n->rb_right : n->rb_left;
	}

	return NULL;
}

static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
{
	struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
	struct dm_buffer *found;

	while (*new) {
		found = container_of(*new, struct dm_buffer, node);

		if (found->block == b->block) {
			BUG_ON(found != b);
			return;
		}

		parent = *new;
		new = (found->block < b->block) ?
			&((*new)->rb_right) : &((*new)->rb_left);
	}

	rb_link_node(&b->node, parent, new);
	rb_insert_color(&b->node, &c->buffer_tree);
}

static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
{
	rb_erase(&b->node, &c->buffer_tree);
}

/*----------------------------------------------------------------*/

static void adjust_total_allocated(enum data_mode data_mode, long diff)
{
	static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
		&dm_bufio_allocated_kmem_cache,
		&dm_bufio_allocated_get_free_pages,
		&dm_bufio_allocated_vmalloc,
	};

	spin_lock(&param_spinlock);

	*class_ptr[data_mode] += diff;

	dm_bufio_current_allocated += diff;

	if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
		dm_bufio_peak_allocated = dm_bufio_current_allocated;

	spin_unlock(&param_spinlock);
}

/*
 * Change the number of clients and recalculate per-client limit.
 */
static void __cache_size_refresh(void)
{
	BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
	BUG_ON(dm_bufio_client_count < 0);

	dm_bufio_cache_size_latch = ACCESS_ONCE(dm_bufio_cache_size);

	/*
	 * Use default if set to 0 and report the actual cache size used.
	 */
	if (!dm_bufio_cache_size_latch) {
		(void)cmpxchg(&dm_bufio_cache_size, 0,
			      dm_bufio_default_cache_size);
		dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
	}

	dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
					 (dm_bufio_client_count ? : 1);
}

/*
 * Allocating buffer data.
 *
 * Small buffers are allocated with kmem_cache, to use space optimally.
 *
 * For large buffers, we choose between get_free_pages and vmalloc.
 * Each has advantages and disadvantages.
 *
 * __get_free_pages can randomly fail if the memory is fragmented.
 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
 * as low as 128M) so using it for caching is not appropriate.
 *
 * If the allocation may fail we use __get_free_pages. Memory fragmentation
 * won't have a fatal effect here, but it just causes flushes of some other
 * buffers and more I/O will be performed. Don't use __get_free_pages if it
 * always fails (i.e. order >= MAX_ORDER).
 *
 * If the allocation shouldn't fail we use __vmalloc. This is only for the
 * initial reserve allocation, so there's no risk of wasting all vmalloc
 * space.
 */
static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
			       enum data_mode *data_mode)
{
	unsigned noio_flag;
	void *ptr;

	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
		*data_mode = DATA_MODE_SLAB;
		return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
	}

	if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT &&
	    gfp_mask & __GFP_NORETRY) {
		*data_mode = DATA_MODE_GET_FREE_PAGES;
		return (void *)__get_free_pages(gfp_mask,
						c->pages_per_block_bits);
	}

	*data_mode = DATA_MODE_VMALLOC;

	/*
	 * __vmalloc allocates the data pages and auxiliary structures with
	 * gfp_flags that were specified, but pagetables are always allocated
	 * with GFP_KERNEL, no matter what was specified as gfp_mask.
	 *
	 * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
	 * all allocations done by this process (including pagetables) are done
	 * as if GFP_NOIO was specified.
	 */

	if (gfp_mask & __GFP_NORETRY)
		noio_flag = memalloc_noio_save();

	ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL);

	if (gfp_mask & __GFP_NORETRY)
		memalloc_noio_restore(noio_flag);

	return ptr;
}

/*
 * Free buffer's data.
 */
static void free_buffer_data(struct dm_bufio_client *c,
			     void *data, enum data_mode data_mode)
{
	switch (data_mode) {
	case DATA_MODE_SLAB:
		kmem_cache_free(DM_BUFIO_CACHE(c), data);
		break;

	case DATA_MODE_GET_FREE_PAGES:
		free_pages((unsigned long)data, c->pages_per_block_bits);
		break;

	case DATA_MODE_VMALLOC:
		vfree(data);
		break;

	default:
		DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
		       data_mode);
		BUG();
	}
}

/*
 * Allocate buffer and its data.
 */
static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
{
	struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size,
				      gfp_mask);

	if (!b)
		return NULL;

	b->c = c;

	b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
	if (!b->data) {
		kfree(b);
		return NULL;
	}

	adjust_total_allocated(b->data_mode, (long)c->block_size);

#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
	memset(&b->stack_trace, 0, sizeof(b->stack_trace));
#endif
	return b;
}

/*
 * Free buffer and its data.
 */
static void free_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	adjust_total_allocated(b->data_mode, -(long)c->block_size);

	free_buffer_data(c, b->data, b->data_mode);
	kfree(b);
}

/*
 * Link buffer to the hash list and clean or dirty queue.
 */
static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
{
	struct dm_bufio_client *c = b->c;

	c->n_buffers[dirty]++;
	b->block = block;
	b->list_mode = dirty;
	list_add(&b->lru_list, &c->lru[dirty]);
	__insert(b->c, b);
	b->last_accessed = jiffies;
}

/*
 * Unlink buffer from the hash list and dirty or clean queue.
 */
static void __unlink_buffer(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	__remove(b->c, b);
	list_del(&b->lru_list);
}

/*
 * Place the buffer to the head of dirty or clean LRU queue.
 */
static void __relink_lru(struct dm_buffer *b, int dirty)
{
	struct dm_bufio_client *c = b->c;

	BUG_ON(!c->n_buffers[b->list_mode]);

	c->n_buffers[b->list_mode]--;
	c->n_buffers[dirty]++;
	b->list_mode = dirty;
	list_move(&b->lru_list, &c->lru[dirty]);
	b->last_accessed = jiffies;
}

/*----------------------------------------------------------------
 * Submit I/O on the buffer.
 *
 * Bio interface is faster but it has some problems:
 *	the vector list is limited (increasing this limit increases
 *	memory-consumption per buffer, so it is not viable);
 *
 *	the memory must be direct-mapped, not vmalloced;
 *
 *	the I/O driver can reject requests spuriously if it thinks that
 *	the requests are too big for the device or if they cross a
 *	controller-defined memory boundary.
 *
 * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
 * it is not vmalloced, try using the bio interface.
 *
 * If the buffer is big, if it is vmalloced or if the underlying device
 * rejects the bio because it is too large, use dm-io layer to do the I/O.
 * The dm-io layer splits the I/O into multiple requests, avoiding the above
 * shortcomings.
 *--------------------------------------------------------------*/

/*
 * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
 * that the request was handled directly with bio interface.
 */
static void dmio_complete(unsigned long error, void *context)
{
	struct dm_buffer *b = context;

	b->bio.bi_error = error ? -EIO : 0;
	b->bio.bi_end_io(&b->bio);
}

static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
		     bio_end_io_t *end_io)
{
	int r;
	struct dm_io_request io_req = {
		.bi_op = rw,
		.bi_op_flags = 0,
		.notify.fn = dmio_complete,
		.notify.context = b,
		.client = b->c->dm_io,
	};
	struct dm_io_region region = {
		.bdev = b->c->bdev,
		.sector = block << b->c->sectors_per_block_bits,
		.count = b->c->block_size >> SECTOR_SHIFT,
	};

	if (b->data_mode != DATA_MODE_VMALLOC) {
		io_req.mem.type = DM_IO_KMEM;
		io_req.mem.ptr.addr = b->data;
	} else {
		io_req.mem.type = DM_IO_VMA;
		io_req.mem.ptr.vma = b->data;
	}

	b->bio.bi_end_io = end_io;

	r = dm_io(&io_req, 1, &region, NULL);
	if (r) {
		b->bio.bi_error = r;
		end_io(&b->bio);
	}
}

static void inline_endio(struct bio *bio)
{
	bio_end_io_t *end_fn = bio->bi_private;
	int error = bio->bi_error;

	/*
	 * Reset the bio to free any attached resources
	 * (e.g. bio integrity profiles).
	 */
	bio_reset(bio);

	bio->bi_error = error;
	end_fn(bio);
}

static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
			   bio_end_io_t *end_io)
{
	char *ptr;
	int len;

	bio_init(&b->bio);
	b->bio.bi_io_vec = b->bio_vec;
	b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
	b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits;
	b->bio.bi_bdev = b->c->bdev;
	b->bio.bi_end_io = inline_endio;
	/*
	 * Use of .bi_private isn't a problem here because
	 * the dm_buffer's inline bio is local to bufio.
	 */
	b->bio.bi_private = end_io;
	bio_set_op_attrs(&b->bio, rw, 0);

	/*
	 * We assume that if len >= PAGE_SIZE ptr is page-aligned.
	 * If len < PAGE_SIZE the buffer doesn't cross page boundary.
	 */
	ptr = b->data;
	len = b->c->block_size;

	if (len >= PAGE_SIZE)
		BUG_ON((unsigned long)ptr & (PAGE_SIZE - 1));
	else
		BUG_ON((unsigned long)ptr & (len - 1));

	do {
		if (!bio_add_page(&b->bio, virt_to_page(ptr),
				  len < PAGE_SIZE ? len : PAGE_SIZE,
				  offset_in_page(ptr))) {
			BUG_ON(b->c->block_size <= PAGE_SIZE);
			use_dmio(b, rw, block, end_io);
			return;
		}

		len -= PAGE_SIZE;
		ptr += PAGE_SIZE;
	} while (len > 0);

	submit_bio(&b->bio);
}

static void submit_io(struct dm_buffer *b, int rw, sector_t block,
		      bio_end_io_t *end_io)
{
	if (rw == WRITE && b->c->write_callback)
		b->c->write_callback(b);

	if (b->c->block_size <= DM_BUFIO_INLINE_VECS * PAGE_SIZE &&
	    b->data_mode != DATA_MODE_VMALLOC)
		use_inline_bio(b, rw, block, end_io);
	else
		use_dmio(b, rw, block, end_io);
}

/*----------------------------------------------------------------
 * Writing dirty buffers
 *--------------------------------------------------------------*/

/*
 * The endio routine for write.
 *
 * Set the error, clear B_WRITING bit and wake anyone who was waiting on
 * it.
 */
static void write_endio(struct bio *bio)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->write_error = bio->bi_error;
	if (unlikely(bio->bi_error)) {
		struct dm_bufio_client *c = b->c;
		int error = bio->bi_error;
		(void)cmpxchg(&c->async_write_error, 0, error);
	}

	BUG_ON(!test_bit(B_WRITING, &b->state));

	smp_mb__before_atomic();
	clear_bit(B_WRITING, &b->state);
	smp_mb__after_atomic();

	wake_up_bit(&b->state, B_WRITING);
}

/*
 * Initiate a write on a dirty buffer, but don't wait for it.
 *
 * - If the buffer is not dirty, exit.
 * - If there is some previous write going on, wait for it to finish (we can't
 *   have two writes on the same buffer simultaneously).
 * - Submit our write and don't wait on it. We set B_WRITING indicating
 *   that there is a write in progress.
 */
static void __write_dirty_buffer(struct dm_buffer *b,
				 struct list_head *write_list)
{
	if (!test_bit(B_DIRTY, &b->state))
		return;

	clear_bit(B_DIRTY, &b->state);
	wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);

	if (!write_list)
		submit_io(b, WRITE, b->block, write_endio);
	else
		list_add_tail(&b->write_list, write_list);
}

static void __flush_write_list(struct list_head *write_list)
{
	struct blk_plug plug;
	blk_start_plug(&plug);
	while (!list_empty(write_list)) {
		struct dm_buffer *b =
			list_entry(write_list->next, struct dm_buffer, write_list);
		list_del(&b->write_list);
		submit_io(b, WRITE, b->block, write_endio);
		dm_bufio_cond_resched();
	}
	blk_finish_plug(&plug);
}

/*
 * Wait until any activity on the buffer finishes. Possibly write the
 * buffer if it is dirty. When this function finishes, there is no I/O
 * running on the buffer and the buffer is not dirty.
 */
static void __make_buffer_clean(struct dm_buffer *b)
{
	BUG_ON(b->hold_count);

	if (!b->state)	/* fast case */
		return;

	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
	__write_dirty_buffer(b, NULL);
	wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
}

/*
 * Find some buffer that is not held by anybody, clean it, unlink it and
 * return it.
 */
static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
{
	struct dm_buffer *b;

	list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
		BUG_ON(test_bit(B_WRITING, &b->state));
		BUG_ON(test_bit(B_DIRTY, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!b->hold_count) {
			__make_buffer_clean(b);
			__unlink_buffer(b);
			return b;
		}
		dm_bufio_cond_resched();
	}

	return NULL;
}

/*
 * Wait until some other threads free some buffer or release hold count on
 * some buffer.
 *
 * This function is entered with c->lock held, drops it and regains it
 * before exiting.
 */
static void __wait_for_free_buffer(struct dm_bufio_client *c)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&c->free_buffer_wait, &wait);
	set_task_state(current, TASK_UNINTERRUPTIBLE);
	dm_bufio_unlock(c);

	io_schedule();

	remove_wait_queue(&c->free_buffer_wait, &wait);

	dm_bufio_lock(c);
}

enum new_flag {
	NF_FRESH = 0,
	NF_READ = 1,
	NF_GET = 2,
	NF_PREFETCH = 3
};

/*
 * Allocate a new buffer. If the allocation is not possible, wait until
 * some other thread frees a buffer.
 *
 * May drop the lock and regain it.
 */
static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
{
	struct dm_buffer *b;

	/*
	 * dm-bufio is resistant to allocation failures (it just keeps
	 * one buffer reserved in case all the allocations fail).
	 * So set flags to not try too hard:
	 *	GFP_NOIO: don't recurse into the I/O layer
	 *	__GFP_NORETRY: don't retry and rather return failure
	 *	__GFP_NOMEMALLOC: don't use emergency reserves
	 *	__GFP_NOWARN: don't print a warning in case of failure
	 *
	 * For debugging, if we set the cache size to 1, no new buffers will
	 * be allocated.
	 */
	while (1) {
		if (dm_bufio_cache_size_latch != 1) {
			b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
			if (b)
				return b;
		}

		if (nf == NF_PREFETCH)
			return NULL;

		if (!list_empty(&c->reserved_buffers)) {
			b = list_entry(c->reserved_buffers.next,
				       struct dm_buffer, lru_list);
			list_del(&b->lru_list);
			c->need_reserved_buffers++;

			return b;
		}

		b = __get_unclaimed_buffer(c);
		if (b)
			return b;

		__wait_for_free_buffer(c);
	}
}

static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
{
	struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);

	if (!b)
		return NULL;

	if (c->alloc_callback)
		c->alloc_callback(b);

	return b;
}

/*
 * Free a buffer and wake other threads waiting for free buffers.
 */
static void __free_buffer_wake(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	if (!c->need_reserved_buffers)
		free_buffer(b);
	else {
		list_add(&b->lru_list, &c->reserved_buffers);
		c->need_reserved_buffers--;
	}

	wake_up(&c->free_buffer_wait);
}

static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
					struct list_head *write_list)
{
	struct dm_buffer *b, *tmp;

	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		BUG_ON(test_bit(B_READING, &b->state));

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state)) {
			__relink_lru(b, LIST_CLEAN);
			continue;
		}

		if (no_wait && test_bit(B_WRITING, &b->state))
			return;

		__write_dirty_buffer(b, write_list);
		dm_bufio_cond_resched();
	}
}

/*
 * Get writeback threshold and buffer limit for a given client.
 */
static void __get_memory_limit(struct dm_bufio_client *c,
			       unsigned long *threshold_buffers,
			       unsigned long *limit_buffers)
{
	unsigned long buffers;

	if (ACCESS_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch) {
		mutex_lock(&dm_bufio_clients_lock);
		__cache_size_refresh();
		mutex_unlock(&dm_bufio_clients_lock);
	}

	buffers = dm_bufio_cache_size_per_client >>
		  (c->sectors_per_block_bits + SECTOR_SHIFT);

	if (buffers < c->minimum_buffers)
		buffers = c->minimum_buffers;

	*limit_buffers = buffers;
	*threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
}

/*
 * Check if we're over watermark.
 * If we are over threshold_buffers, start freeing buffers.
 * If we're over "limit_buffers", block until we get under the limit.
 */
static void __check_watermark(struct dm_bufio_client *c,
			      struct list_head *write_list)
{
	unsigned long threshold_buffers, limit_buffers;

	__get_memory_limit(c, &threshold_buffers, &limit_buffers);

	while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
	       limit_buffers) {

		struct dm_buffer *b = __get_unclaimed_buffer(c);

		if (!b)
			return;

		__free_buffer_wake(b);
		dm_bufio_cond_resched();
	}

	if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
		__write_dirty_buffers_async(c, 1, write_list);
}

/*----------------------------------------------------------------
 * Getting a buffer
 *--------------------------------------------------------------*/

static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
				     enum new_flag nf, int *need_submit,
				     struct list_head *write_list)
{
	struct dm_buffer *b, *new_b = NULL;

	*need_submit = 0;

	b = __find(c, block);
	if (b)
		goto found_buffer;

	if (nf == NF_GET)
		return NULL;

	new_b = __alloc_buffer_wait(c, nf);
	if (!new_b)
		return NULL;

	/*
	 * We've had a period where the mutex was unlocked, so need to
	 * recheck the hash table.
	 */
	b = __find(c, block);
	if (b) {
		__free_buffer_wake(new_b);
		goto found_buffer;
	}

	__check_watermark(c, write_list);

	b = new_b;
	b->hold_count = 1;
	b->read_error = 0;
	b->write_error = 0;
	__link_buffer(b, block, LIST_CLEAN);

	if (nf == NF_FRESH) {
		b->state = 0;
		return b;
	}

	b->state = 1 << B_READING;
	*need_submit = 1;

	return b;

found_buffer:
	if (nf == NF_PREFETCH)
		return NULL;
	/*
	 * Note: it is essential that we don't wait for the buffer to be
	 * read if dm_bufio_get function is used. Both dm_bufio_get and
	 * dm_bufio_prefetch can be used in the driver request routine.
	 * If the user called both dm_bufio_prefetch and dm_bufio_get on
	 * the same buffer, it would deadlock if we waited.
	 */
	if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
		return NULL;

	b->hold_count++;
	__relink_lru(b, test_bit(B_DIRTY, &b->state) ||
		     test_bit(B_WRITING, &b->state));
	return b;
}

/*
 * The endio routine for reading: set the error, clear the bit and wake up
 * anyone waiting on the buffer.
 */
static void read_endio(struct bio *bio)
{
	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);

	b->read_error = bio->bi_error;

	BUG_ON(!test_bit(B_READING, &b->state));

	smp_mb__before_atomic();
	clear_bit(B_READING, &b->state);
	smp_mb__after_atomic();

	wake_up_bit(&b->state, B_READING);
}

/*
 * A common routine for dm_bufio_new and dm_bufio_read. Operation of these
 * functions is similar except that dm_bufio_new doesn't read the
 * buffer from the disk (assuming that the caller overwrites all the data
 * and uses dm_bufio_mark_buffer_dirty to write new data back).
 */
static void *new_read(struct dm_bufio_client *c, sector_t block,
		      enum new_flag nf, struct dm_buffer **bp)
{
	int need_submit;
	struct dm_buffer *b;

	LIST_HEAD(write_list);

	dm_bufio_lock(c);
	b = __bufio_new(c, block, nf, &need_submit, &write_list);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
	if (b && b->hold_count == 1)
		buffer_record_stack(b);
#endif
	dm_bufio_unlock(c);

	__flush_write_list(&write_list);

	if (!b)
		return NULL;

	if (need_submit)
		submit_io(b, READ, b->block, read_endio);

	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);

	if (b->read_error) {
		int error = b->read_error;

		dm_bufio_release(b);

		return ERR_PTR(error);
	}

	*bp = b;

	return b->data;
}

void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	return new_read(c, block, NF_GET, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_get);

void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
		    struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_READ, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_read);

void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
		   struct dm_buffer **bp)
{
	BUG_ON(dm_bufio_in_request());

	return new_read(c, block, NF_FRESH, bp);
}
EXPORT_SYMBOL_GPL(dm_bufio_new);

void dm_bufio_prefetch(struct dm_bufio_client *c,
		       sector_t block, unsigned n_blocks)
{
	struct blk_plug plug;

	LIST_HEAD(write_list);

	BUG_ON(dm_bufio_in_request());

	blk_start_plug(&plug);
	dm_bufio_lock(c);

	for (; n_blocks--; block++) {
		int need_submit;
		struct dm_buffer *b;
		b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
				&write_list);
		if (unlikely(!list_empty(&write_list))) {
			dm_bufio_unlock(c);
			blk_finish_plug(&plug);
			__flush_write_list(&write_list);
			blk_start_plug(&plug);
			dm_bufio_lock(c);
		}
		if (unlikely(b != NULL)) {
			dm_bufio_unlock(c);

			if (need_submit)
				submit_io(b, READ, b->block, read_endio);
			dm_bufio_release(b);

			dm_bufio_cond_resched();

			if (!n_blocks)
				goto flush_plug;
			dm_bufio_lock(c);
		}
	}

	dm_bufio_unlock(c);

flush_plug:
	blk_finish_plug(&plug);
}
EXPORT_SYMBOL_GPL(dm_bufio_prefetch);

void dm_bufio_release(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	BUG_ON(!b->hold_count);

	b->hold_count--;
	if (!b->hold_count) {
		wake_up(&c->free_buffer_wait);

		/*
		 * If there were errors on the buffer, and the buffer is not
		 * to be written, free the buffer. There is no point in caching
		 * an invalid buffer.
		 */
		if ((b->read_error || b->write_error) &&
		    !test_bit(B_READING, &b->state) &&
		    !test_bit(B_WRITING, &b->state) &&
		    !test_bit(B_DIRTY, &b->state)) {
			__unlink_buffer(b);
			__free_buffer_wake(b);
		}
	}

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_release);

void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
{
	struct dm_bufio_client *c = b->c;

	dm_bufio_lock(c);

	BUG_ON(test_bit(B_READING, &b->state));

	if (!test_and_set_bit(B_DIRTY, &b->state))
		__relink_lru(b, LIST_DIRTY);

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);

void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
{
	LIST_HEAD(write_list);

	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0, &write_list);
	dm_bufio_unlock(c);
	__flush_write_list(&write_list);
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);

/*
 * For performance, it is essential that the buffers are written asynchronously
 * and simultaneously (so that the block layer can merge the writes) and then
 * waited upon.
 *
 * Finally, we flush hardware disk cache.
 */
int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
{
	int a, f;
	unsigned long buffers_processed = 0;
	struct dm_buffer *b, *tmp;

	LIST_HEAD(write_list);

	dm_bufio_lock(c);
	__write_dirty_buffers_async(c, 0, &write_list);
	dm_bufio_unlock(c);
	__flush_write_list(&write_list);
	dm_bufio_lock(c);

again:
	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
		int dropped_lock = 0;

		if (buffers_processed < c->n_buffers[LIST_DIRTY])
			buffers_processed++;

		BUG_ON(test_bit(B_READING, &b->state));

		if (test_bit(B_WRITING, &b->state)) {
			if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
				dropped_lock = 1;
				b->hold_count++;
				dm_bufio_unlock(c);
				wait_on_bit_io(&b->state, B_WRITING,
					       TASK_UNINTERRUPTIBLE);
				dm_bufio_lock(c);
				b->hold_count--;
			} else
				wait_on_bit_io(&b->state, B_WRITING,
					       TASK_UNINTERRUPTIBLE);
		}

		if (!test_bit(B_DIRTY, &b->state) &&
		    !test_bit(B_WRITING, &b->state))
			__relink_lru(b, LIST_CLEAN);

		dm_bufio_cond_resched();

		/*
		 * If we dropped the lock, the list is no longer consistent,
		 * so we must restart the search.
		 *
		 * In the most common case, the buffer just processed is
		 * relinked to the clean list, so we won't loop scanning the
		 * same buffer again and again.
		 *
		 * This may livelock if there is another thread simultaneously
		 * dirtying buffers, so we count the number of buffers walked
		 * and if it exceeds the total number of buffers, it means that
		 * someone is doing some writes simultaneously with us. In
		 * this case, stop, dropping the lock.
		 */
		if (dropped_lock)
			goto again;
	}
	wake_up(&c->free_buffer_wait);
	dm_bufio_unlock(c);

	a = xchg(&c->async_write_error, 0);
	f = dm_bufio_issue_flush(c);
	if (a)
		return a;

	return f;
}
EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);

/*
 * Use dm-io to send an empty barrier to flush the device.
 */
int dm_bufio_issue_flush(struct dm_bufio_client *c)
{
	struct dm_io_request io_req = {
		.bi_op = REQ_OP_WRITE,
		.bi_op_flags = WRITE_FLUSH,
		.mem.type = DM_IO_KMEM,
		.mem.ptr.addr = NULL,
		.client = c->dm_io,
	};
	struct dm_io_region io_reg = {
		.bdev = c->bdev,
		.sector = 0,
		.count = 0,
	};

	BUG_ON(dm_bufio_in_request());

	return dm_io(&io_req, 1, &io_reg, NULL);
}
EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);

/*
 * We first delete any other buffer that may be at that new location.
 *
 * Then, we write the buffer to the original location if it was dirty.
 *
 * Then, if we are the only one who is holding the buffer, relink the buffer
 * in the hash queue for the new location.
 *
 * If there was someone else holding the buffer, we write it to the new
 * location but do not relink it, because that other user needs to have the
 * buffer at the same place.
 */
void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
{
	struct dm_bufio_client *c = b->c;
	struct dm_buffer *new;

	BUG_ON(dm_bufio_in_request());

	dm_bufio_lock(c);

retry:
	new = __find(c, new_block);
	if (new) {
		if (new->hold_count) {
			__wait_for_free_buffer(c);
			goto retry;
		}

		/*
		 * FIXME: Is there any point waiting for a write that's going
		 * to be overwritten in a bit?
		 */
		__make_buffer_clean(new);
		__unlink_buffer(new);
		__free_buffer_wake(new);
	}

	BUG_ON(!b->hold_count);
	BUG_ON(test_bit(B_READING, &b->state));

	__write_dirty_buffer(b, NULL);
	if (b->hold_count == 1) {
		wait_on_bit_io(&b->state, B_WRITING,
			       TASK_UNINTERRUPTIBLE);
		set_bit(B_DIRTY, &b->state);
		__unlink_buffer(b);
		__link_buffer(b, new_block, LIST_DIRTY);
	} else {
		sector_t old_block;
		wait_on_bit_lock_io(&b->state, B_WRITING,
				    TASK_UNINTERRUPTIBLE);
		/*
		 * Relink buffer to "new_block" so that write_callback
		 * sees "new_block" as a block number.
		 * After the write, link the buffer back to old_block.
		 * All this must be done under the bufio lock, so that the
		 * block number change isn't visible to other threads.
		 */
		old_block = b->block;
		__unlink_buffer(b);
		__link_buffer(b, new_block, b->list_mode);
		submit_io(b, WRITE, new_block, write_endio);
		wait_on_bit_io(&b->state, B_WRITING,
			       TASK_UNINTERRUPTIBLE);
		__unlink_buffer(b);
		__link_buffer(b, old_block, b->list_mode);
	}

	dm_bufio_unlock(c);
	dm_bufio_release(b);
}
EXPORT_SYMBOL_GPL(dm_bufio_release_move);

/*
 * Free the given buffer.
 *
 * This is just a hint; if the buffer is in use or dirty, this function
 * does nothing.
 */
void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
{
	struct dm_buffer *b;

	dm_bufio_lock(c);

	b = __find(c, block);
	if (b && likely(!b->hold_count) && likely(!b->state)) {
		__unlink_buffer(b);
		__free_buffer_wake(b);
	}

	dm_bufio_unlock(c);
}
EXPORT_SYMBOL(dm_bufio_forget);

void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
{
	c->minimum_buffers = n;
}
EXPORT_SYMBOL(dm_bufio_set_minimum_buffers);

unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
{
	return c->block_size;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);

sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
{
	return i_size_read(c->bdev->bd_inode) >>
	       (SECTOR_SHIFT + c->sectors_per_block_bits);
}
EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);

sector_t dm_bufio_get_block_number(struct dm_buffer *b)
{
	return b->block;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);

void *dm_bufio_get_block_data(struct dm_buffer *b)
{
	return b->data;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);

void *dm_bufio_get_aux_data(struct dm_buffer *b)
{
	return b + 1;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);

struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
{
	return b->c;
}
EXPORT_SYMBOL_GPL(dm_bufio_get_client);

static void drop_buffers(struct dm_bufio_client *c)
{
	struct dm_buffer *b;
	int i;
	bool warned = false;

	BUG_ON(dm_bufio_in_request());

	/*
	 * An optimization so that the buffers are not written one-by-one.
	 */
	dm_bufio_write_dirty_buffers_async(c);

	dm_bufio_lock(c);

	while ((b = __get_unclaimed_buffer(c)))
		__free_buffer_wake(b);

	for (i = 0; i < LIST_SIZE; i++)
		list_for_each_entry(b, &c->lru[i], lru_list) {
			WARN_ON(!warned);
			warned = true;
			DMERR("leaked buffer %llx, hold count %u, list %d",
			      (unsigned long long)b->block, b->hold_count, i);
#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
			print_stack_trace(&b->stack_trace, 1);
			b->hold_count = 0; /* mark unclaimed to avoid BUG_ON below */
#endif
		}

#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
	while ((b = __get_unclaimed_buffer(c)))
		__free_buffer_wake(b);
#endif

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(!list_empty(&c->lru[i]));

	dm_bufio_unlock(c);
}

/*
 * We may not be able to evict this buffer if I/O is pending or the client
 * is still using it. Caller is expected to know buffer is too old.
 *
 * And if GFP_NOFS is used, we must not do any I/O because we hold
 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
 * rerouted to a different bufio client.
 */
static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
{
	if (!(gfp & __GFP_FS)) {
		if (test_bit(B_READING, &b->state) ||
		    test_bit(B_WRITING, &b->state) ||
		    test_bit(B_DIRTY, &b->state))
			return false;
	}

	if (b->hold_count)
		return false;

	__make_buffer_clean(b);
	__unlink_buffer(b);
	__free_buffer_wake(b);

	return true;
}

static unsigned get_retain_buffers(struct dm_bufio_client *c)
{
	unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);
	return retain_bytes / c->block_size;
}

static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
			    gfp_t gfp_mask)
{
	int l;
	struct dm_buffer *b, *tmp;
	unsigned long freed = 0;
	unsigned long count = nr_to_scan;
	unsigned retain_target = get_retain_buffers(c);

	for (l = 0; l < LIST_SIZE; l++) {
		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
			if (__try_evict_buffer(b, gfp_mask))
				freed++;
			if (!--nr_to_scan || ((count - freed) <= retain_target))
				return freed;
			dm_bufio_cond_resched();
		}
	}
	return freed;
}

static unsigned long
dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	struct dm_bufio_client *c;
	unsigned long freed;

	c = container_of(shrink, struct dm_bufio_client, shrinker);
	if (sc->gfp_mask & __GFP_FS)
		dm_bufio_lock(c);
	else if (!dm_bufio_trylock(c))
		return SHRINK_STOP;

	freed = __scan(c, sc->nr_to_scan, sc->gfp_mask);
	dm_bufio_unlock(c);
	return freed;
}

static unsigned long
dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
{
	struct dm_bufio_client *c;
	unsigned long count;

	c = container_of(shrink, struct dm_bufio_client, shrinker);
	if (sc->gfp_mask & __GFP_FS)
		dm_bufio_lock(c);
	else if (!dm_bufio_trylock(c))
		return 0;

	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
	dm_bufio_unlock(c);
	return count;
}

/*
 * Create the buffering interface
 */
struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
					       unsigned reserved_buffers, unsigned aux_size,
					       void (*alloc_callback)(struct dm_buffer *),
					       void (*write_callback)(struct dm_buffer *))
{
	int r;
	struct dm_bufio_client *c;
	unsigned i;

	BUG_ON(block_size < 1 << SECTOR_SHIFT ||
	       (block_size & (block_size - 1)));

	c = kzalloc(sizeof(*c), GFP_KERNEL);
	if (!c) {
		r = -ENOMEM;
		goto bad_client;
	}
	c->buffer_tree = RB_ROOT;

	c->bdev = bdev;
	c->block_size = block_size;
	c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
	c->pages_per_block_bits = (__ffs(block_size) >= PAGE_SHIFT) ?
				  __ffs(block_size) - PAGE_SHIFT : 0;
	c->blocks_per_page_bits = (__ffs(block_size) < PAGE_SHIFT ?
				   PAGE_SHIFT - __ffs(block_size) : 0);

	c->aux_size = aux_size;
	c->alloc_callback = alloc_callback;
	c->write_callback = write_callback;

	for (i = 0; i < LIST_SIZE; i++) {
		INIT_LIST_HEAD(&c->lru[i]);
		c->n_buffers[i] = 0;
	}

	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->reserved_buffers);
	c->need_reserved_buffers = reserved_buffers;

	c->minimum_buffers = DM_BUFIO_MIN_BUFFERS;

	init_waitqueue_head(&c->free_buffer_wait);
	c->async_write_error = 0;

	c->dm_io = dm_io_client_create();
	if (IS_ERR(c->dm_io)) {
		r = PTR_ERR(c->dm_io);
		goto bad_dm_io;
	}

	mutex_lock(&dm_bufio_clients_lock);
	if (c->blocks_per_page_bits) {
		if (!DM_BUFIO_CACHE_NAME(c)) {
			DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size);
			if (!DM_BUFIO_CACHE_NAME(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}

		if (!DM_BUFIO_CACHE(c)) {
			DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c),
							      c->block_size,
							      c->block_size, 0, NULL);
			if (!DM_BUFIO_CACHE(c)) {
				r = -ENOMEM;
				mutex_unlock(&dm_bufio_clients_lock);
				goto bad_cache;
			}
		}
	}
	mutex_unlock(&dm_bufio_clients_lock);

	while (c->need_reserved_buffers) {
		struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);

		if (!b) {
			r = -ENOMEM;
			goto bad_buffer;
		}
		__free_buffer_wake(b);
	}

	mutex_lock(&dm_bufio_clients_lock);
	dm_bufio_client_count++;
	list_add(&c->client_list, &dm_bufio_all_clients);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	c->shrinker.count_objects = dm_bufio_shrink_count;
	c->shrinker.scan_objects = dm_bufio_shrink_scan;
	c->shrinker.seeks = 1;
	c->shrinker.batch = 0;
	register_shrinker(&c->shrinker);

	return c;

bad_buffer:
bad_cache:
	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}
	dm_io_client_destroy(c->dm_io);
bad_dm_io:
	kfree(c);
bad_client:
	return ERR_PTR(r);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_create);

/*
 * Free the buffering interface.
 * It is required that there are no references on any buffers.
 */
void dm_bufio_client_destroy(struct dm_bufio_client *c)
{
	unsigned i;

	drop_buffers(c);

	unregister_shrinker(&c->shrinker);

	mutex_lock(&dm_bufio_clients_lock);

	list_del(&c->client_list);
	dm_bufio_client_count--;
	__cache_size_refresh();

	mutex_unlock(&dm_bufio_clients_lock);

	BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
	BUG_ON(c->need_reserved_buffers);

	while (!list_empty(&c->reserved_buffers)) {
		struct dm_buffer *b = list_entry(c->reserved_buffers.next,
						 struct dm_buffer, lru_list);
		list_del(&b->lru_list);
		free_buffer(b);
	}

	for (i = 0; i < LIST_SIZE; i++)
		if (c->n_buffers[i])
			DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);

	for (i = 0; i < LIST_SIZE; i++)
		BUG_ON(c->n_buffers[i]);

	dm_io_client_destroy(c->dm_io);
	kfree(c);
}
EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);

static unsigned get_max_age_hz(void)
{
	unsigned max_age = ACCESS_ONCE(dm_bufio_max_age);

	if (max_age > UINT_MAX / HZ)
		max_age = UINT_MAX / HZ;

	return max_age * HZ;
}

static bool older_than(struct dm_buffer *b, unsigned long age_hz)
{
	return time_after_eq(jiffies, b->last_accessed + age_hz);
}

static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
{
	struct dm_buffer *b, *tmp;
	unsigned retain_target = get_retain_buffers(c);
	unsigned count;

	dm_bufio_lock(c);

	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
		if (count <= retain_target)
			break;

		if (!older_than(b, age_hz))
			break;

		if (__try_evict_buffer(b, 0))
			count--;

		dm_bufio_cond_resched();
	}

	dm_bufio_unlock(c);
}

static void cleanup_old_buffers(void)
{
	unsigned long max_age_hz = get_max_age_hz();
	struct dm_bufio_client *c;

	mutex_lock(&dm_bufio_clients_lock);

	list_for_each_entry(c, &dm_bufio_all_clients, client_list)
		__evict_old_buffers(c, max_age_hz);

	mutex_unlock(&dm_bufio_clients_lock);
}

static struct workqueue_struct *dm_bufio_wq;
static struct delayed_work dm_bufio_work;

static void work_fn(struct work_struct *w)
{
	cleanup_old_buffers();

	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);
}

/*----------------------------------------------------------------
 * Module setup
 *--------------------------------------------------------------*/

/*
 * This is called only once for the whole dm_bufio module.
 * It initializes the memory limit.
 */
static int __init dm_bufio_init(void)
{
	__u64 mem;

	dm_bufio_allocated_kmem_cache = 0;
	dm_bufio_allocated_get_free_pages = 0;
	dm_bufio_allocated_vmalloc = 0;
	dm_bufio_current_allocated = 0;

	memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
	memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);

	mem = (__u64)((totalram_pages - totalhigh_pages) *
		      DM_BUFIO_MEMORY_PERCENT / 100) << PAGE_SHIFT;

	if (mem > ULONG_MAX)
		mem = ULONG_MAX;

#ifdef CONFIG_MMU
	/*
	 * Get the size of vmalloc space the same way as VMALLOC_TOTAL
	 * in fs/proc/internal.h
	 */
	if (mem > (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100)
		mem = (VMALLOC_END - VMALLOC_START) * DM_BUFIO_VMALLOC_PERCENT / 100;
#endif

	dm_bufio_default_cache_size = mem;

	mutex_lock(&dm_bufio_clients_lock);
	__cache_size_refresh();
	mutex_unlock(&dm_bufio_clients_lock);

	dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0);
	if (!dm_bufio_wq)
		return -ENOMEM;

	INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
	queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
			   DM_BUFIO_WORK_TIMER_SECS * HZ);

	return 0;
}

/*
 * This is called once when unloading the dm_bufio module.
 */
static void __exit dm_bufio_exit(void)
{
	int bug = 0;
	int i;

	cancel_delayed_work_sync(&dm_bufio_work);
	destroy_workqueue(dm_bufio_wq);

	for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++)
		kmem_cache_destroy(dm_bufio_caches[i]);

	for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
		kfree(dm_bufio_cache_names[i]);

	if (dm_bufio_client_count) {
		DMCRIT("%s: dm_bufio_client_count leaked: %d",
		       __func__, dm_bufio_client_count);
		bug = 1;
	}

	if (dm_bufio_current_allocated) {
		DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
		       __func__, dm_bufio_current_allocated);
		bug = 1;
	}

	if (dm_bufio_allocated_get_free_pages) {
		DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
		       __func__, dm_bufio_allocated_get_free_pages);
		bug = 1;
	}

	if (dm_bufio_allocated_vmalloc) {
		DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
		       __func__, dm_bufio_allocated_vmalloc);
		bug = 1;
	}

	BUG_ON(bug);
}

module_init(dm_bufio_init)
module_exit(dm_bufio_exit)

module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");

module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");

module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");

module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");

module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");

module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");

module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");

module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");

MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
MODULE_LICENSE("GPL");
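
/*
 * Illustrative usage sketch (not part of the driver, compiled out): a rough
 * picture of how a caller might drive the exported dm-bufio API above from
 * process context. The function name, the 4 KiB block size and the choice of
 * block are hypothetical; only the dm-bufio calls and their signatures come
 * from this file.
 */
#if 0
static int example_rewrite_block(struct block_device *bdev, sector_t block)
{
	struct dm_bufio_client *c;
	struct dm_buffer *b;
	u8 *data;
	int r;

	/* 4 KiB blocks, one reserved buffer, no aux data, no callbacks. */
	c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL);
	if (IS_ERR(c))
		return PTR_ERR(c);

	/* Read the block (or get it from the cache) and hold it. */
	data = dm_bufio_read(c, block, &b);
	if (IS_ERR(data)) {
		r = PTR_ERR(data);
		goto out;
	}

	data[0] ^= 0xff;		/* modify the cached copy */
	dm_bufio_mark_buffer_dirty(b);	/* queue it for writeback */
	dm_bufio_release(b);		/* drop the hold count */

	/*
	 * Write all dirty buffers, wait for them and flush the device;
	 * returns an asynchronous write error, if any, else the flush result.
	 */
	r = dm_bufio_write_dirty_buffers(c);
out:
	dm_bufio_client_destroy(c);	/* requires no held buffers */
	return r;
}
#endif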