/*
 * z3fold.c
 *
 * Author: Vitaly Wool <vitaly.wool@konsulko.com>
 * Copyright (C) 2016, Sony Mobile Communications Inc.
 *
 * This implementation is based on zbud written by Seth Jennings.
 *
 * z3fold is a special purpose allocator for storing compressed pages. It
 * can store up to three compressed pages per page which improves the
 * compression ratio of zbud while retaining its main concepts (e.g. always
 * storing an integral number of objects per page) and simplicity.
 * It still has simple and deterministic reclaim properties that make it
 * preferable to a higher density approach (with no requirement on integral
 * number of objects per page) when reclaim is used.
 *
 * As in zbud, pages are divided into "chunks". The size of the chunks is
 * fixed at compile time and is determined by NCHUNKS_ORDER below.
 *
 * z3fold doesn't export any API and is meant to be used via zpool API.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/zpool.h>

/*****************
 * Structures
*****************/
struct z3fold_pool;
struct z3fold_ops {
	int (*evict)(struct z3fold_pool *pool, unsigned long handle);
};

enum buddy {
	HEADLESS = 0,
	FIRST,
	MIDDLE,
	LAST,
	BUDDIES_MAX
};

/**
 * struct z3fold_header - z3fold page metadata occupying first chunks of each
 *			z3fold page, except for HEADLESS pages
 * @buddy:		links the z3fold page into the relevant list in the
 *			pool
 * @page_lock:		per-page lock
 * @refcount:		reference count for the z3fold page
 * @work:		work_struct for page layout optimization
 * @pool:		pointer to the pool which this page belongs to
 * @cpu:		CPU which this page "belongs" to
 * @first_chunks:	the size of the first buddy in chunks, 0 if free
 * @middle_chunks:	the size of the middle buddy in chunks, 0 if free
 * @last_chunks:	the size of the last buddy in chunks, 0 if free
 * @start_middle:	the starting chunk of the middle buddy
 * @first_num:		the starting number (for the first handle)
 */
struct z3fold_header {
	struct list_head buddy;
	spinlock_t page_lock;
	struct kref refcount;
	struct work_struct work;
	struct z3fold_pool *pool;
	short cpu;
	unsigned short first_chunks;
	unsigned short middle_chunks;
	unsigned short last_chunks;
	unsigned short start_middle;
	unsigned short first_num:2;
};

/*
 * NCHUNKS_ORDER determines the internal allocation granularity, effectively
 * adjusting internal fragmentation. It also determines the number of
 * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
 * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
 * at the beginning of an allocated page are occupied by the z3fold header,
 * so NCHUNKS works out to 63 (or 62 when CONFIG_DEBUG_SPINLOCK=y), which is
 * the maximum number of free chunks in a z3fold page; there will likewise be
 * 63 (or 62, respectively) freelists per pool.
 */
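/*
 * For example, with 4K pages and NCHUNKS_ORDER == 6: CHUNK_SHIFT == 6, so
 * CHUNK_SIZE == 64 bytes and TOTAL_CHUNKS == 64. When struct z3fold_header
 * fits in a single chunk, ZHDR_SIZE_ALIGNED == 64 and ZHDR_CHUNKS == 1,
 * leaving NCHUNKS == 63 chunks for the buddies.
 */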
#define NCHUNKS_ORDER	6

#define CHUNK_SHIFT	(PAGE_SHIFT - NCHUNKS_ORDER)
#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
#define ZHDR_CHUNKS	(ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
#define TOTAL_CHUNKS	(PAGE_SIZE >> CHUNK_SHIFT)
#define NCHUNKS		((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)

#define BUDDY_MASK	(0x3)
#define BUDDY_SHIFT	2
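/*
 * Handle layout (see encode_handle()/handle_to_buddy() below): the upper
 * bits hold the z3fold page address (the header sits at the start of the
 * page, so handle & PAGE_MASK recovers it), bits 0-1 hold the buddy index
 * biased by first_num, and for LAST buddies the object size in chunks is
 * stored starting at bit BUDDY_SHIFT.
 */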
/**
 * struct z3fold_pool - stores metadata for each z3fold pool
 * @name:	pool name
 * @lock:	protects pool unbuddied/lru lists
 * @stale_lock:	protects pool stale page list
 * @unbuddied:	per-cpu array of lists tracking z3fold pages that contain at
 *		most two buddies; the list each z3fold page is added to
 *		depends on the size of its free region.
 * @lru:	list tracking the z3fold pages in LRU order by most recently
 *		added buddy.
 * @stale:	list of pages marked for freeing
 * @pages_nr:	number of z3fold pages in the pool.
 * @ops:	pointer to a structure of user defined operations specified at
 *		pool creation time.
 * @zpool:	zpool driver
 * @zpool_ops:	zpool operations structure with an evict callback
 * @compact_wq:	workqueue for page layout background optimization
 * @release_wq:	workqueue for safe page release
 * @work:	work_struct for safe page release
 *
 * This structure is allocated at pool creation time and maintains metadata
 * pertaining to a particular z3fold pool.
 */
struct z3fold_pool {
	const char *name;
	spinlock_t lock;
	spinlock_t stale_lock;
	struct list_head *unbuddied;
	struct list_head lru;
	struct list_head stale;
	atomic64_t pages_nr;
	const struct z3fold_ops *ops;
	struct zpool *zpool;
	const struct zpool_ops *zpool_ops;
	struct workqueue_struct *compact_wq;
	struct workqueue_struct *release_wq;
	struct work_struct work;
};

/*
 * Internal z3fold page flags
 */
enum z3fold_page_flags {
	PAGE_HEADLESS = 0,
	MIDDLE_CHUNK_MAPPED,
	NEEDS_COMPACTING,
	PAGE_STALE,
	PAGE_CLAIMED, /* by either reclaim or free */
};

/*****************
 * Helpers
*****************/

/* Converts an allocation size in bytes to size in z3fold chunks */
static int size_to_chunks(size_t size)
{
	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
}

#define for_each_unbuddied_list(_iter, _begin) \
	for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)

static void compact_page_work(struct work_struct *w);

/* Initializes the z3fold header of a newly allocated z3fold page */
static struct z3fold_header *init_z3fold_page(struct page *page,
					struct z3fold_pool *pool)
{
	struct z3fold_header *zhdr = page_address(page);

	INIT_LIST_HEAD(&page->lru);
	clear_bit(PAGE_HEADLESS, &page->private);
	clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
	clear_bit(NEEDS_COMPACTING, &page->private);
	clear_bit(PAGE_STALE, &page->private);
	clear_bit(PAGE_CLAIMED, &page->private);

	spin_lock_init(&zhdr->page_lock);
	kref_init(&zhdr->refcount);
	zhdr->first_chunks = 0;
	zhdr->middle_chunks = 0;
	zhdr->last_chunks = 0;
	zhdr->first_num = 0;
	zhdr->start_middle = 0;
	zhdr->cpu = -1;
	zhdr->pool = pool;
	INIT_LIST_HEAD(&zhdr->buddy);
	INIT_WORK(&zhdr->work, compact_page_work);
	return zhdr;
}

/* Resets the struct page fields and frees the page */
static void free_z3fold_page(struct page *page)
{
	__free_page(page);
}

/* Lock a z3fold page */
static inline void z3fold_page_lock(struct z3fold_header *zhdr)
{
	spin_lock(&zhdr->page_lock);
}

/* Try to lock a z3fold page */
static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
{
	return spin_trylock(&zhdr->page_lock);
}

/* Unlock a z3fold page */
static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
{
	spin_unlock(&zhdr->page_lock);
}
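/*
 * Lock ordering in this file: the per-page lock nests outside pool->lock.
 * Paths that already hold pool->lock and need the page lock (the lookup in
 * z3fold_alloc(), z3fold_reclaim_page()) therefore use z3fold_page_trylock()
 * and back off instead of blocking.
 */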
/*
 * Encodes the handle of a particular buddy within a z3fold page
 * Pool lock should be held as this function accesses first_num
 */
static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
{
	unsigned long handle;

	handle = (unsigned long)zhdr;
	if (bud != HEADLESS) {
		handle |= (bud + zhdr->first_num) & BUDDY_MASK;
		if (bud == LAST)
			handle |= (zhdr->last_chunks << BUDDY_SHIFT);
	}
	return handle;
}

/* Returns the z3fold page where a given handle is stored */
static struct z3fold_header *handle_to_z3fold_header(unsigned long handle)
{
	return (struct z3fold_header *)(handle & PAGE_MASK);
}

/* only for LAST bud, returns zero otherwise */
static unsigned short handle_to_chunks(unsigned long handle)
{
	return (handle & ~PAGE_MASK) >> BUDDY_SHIFT;
}

/*
 * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle
 * but that doesn't matter, because the masking will result in the
 * correct buddy number.
 */
static enum buddy handle_to_buddy(unsigned long handle)
{
	struct z3fold_header *zhdr = handle_to_z3fold_header(handle);
	return (handle - zhdr->first_num) & BUDDY_MASK;
}

static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
{
	struct page *page = virt_to_page(zhdr);
	struct z3fold_pool *pool = zhdr->pool;

	WARN_ON(!list_empty(&zhdr->buddy));
	set_bit(PAGE_STALE, &page->private);
	clear_bit(NEEDS_COMPACTING, &page->private);
	spin_lock(&pool->lock);
	if (!list_empty(&page->lru))
		list_del(&page->lru);
	spin_unlock(&pool->lock);
	if (locked)
		z3fold_page_unlock(zhdr);
	spin_lock(&pool->stale_lock);
	list_add(&zhdr->buddy, &pool->stale);
	queue_work(pool->release_wq, &pool->work);
	spin_unlock(&pool->stale_lock);
}

static void __attribute__((__unused__))
release_z3fold_page(struct kref *ref)
{
	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
						refcount);
	__release_z3fold_page(zhdr, false);
}

static void release_z3fold_page_locked(struct kref *ref)
{
	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
						refcount);
	WARN_ON(z3fold_page_trylock(zhdr));
	__release_z3fold_page(zhdr, true);
}

static void release_z3fold_page_locked_list(struct kref *ref)
{
	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
						refcount);
	spin_lock(&zhdr->pool->lock);
	list_del_init(&zhdr->buddy);
	spin_unlock(&zhdr->pool->lock);

	WARN_ON(z3fold_page_trylock(zhdr));
	__release_z3fold_page(zhdr, true);
}

static void free_pages_work(struct work_struct *w)
{
	struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work);

	spin_lock(&pool->stale_lock);
	while (!list_empty(&pool->stale)) {
		struct z3fold_header *zhdr = list_first_entry(&pool->stale,
						struct z3fold_header, buddy);
		struct page *page = virt_to_page(zhdr);

		list_del(&zhdr->buddy);
		if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
			continue;
		spin_unlock(&pool->stale_lock);
		cancel_work_sync(&zhdr->work);
		free_z3fold_page(page);
		cond_resched();
		spin_lock(&pool->stale_lock);
	}
	spin_unlock(&pool->stale_lock);
}

/*
 * Returns the number of free chunks in a z3fold page.
 * NB: can't be used with HEADLESS pages.
 */
static int num_free_chunks(struct z3fold_header *zhdr)
{
	int nfree;
	/*
	 * If there is a middle object, pick up the bigger free space
	 * either before or after it. Otherwise just subtract the number
	 * of chunks occupied by the first and the last objects.
	 */
	if (zhdr->middle_chunks != 0) {
		int nfree_before = zhdr->first_chunks ?
			0 : zhdr->start_middle - ZHDR_CHUNKS;
		int nfree_after = zhdr->last_chunks ?
			0 : TOTAL_CHUNKS -
				(zhdr->start_middle + zhdr->middle_chunks);
		nfree = max(nfree_before, nfree_after);
	} else
		nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
	return nfree;
}
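/*
 * E.g. with TOTAL_CHUNKS == 64 and ZHDR_CHUNKS == 1, a page with
 * first_chunks == 8, start_middle == 20, middle_chunks == 10 and
 * last_chunks == 0 has no free space before the middle object and
 * 64 - (20 + 10) == 34 free chunks after it, so num_free_chunks()
 * returns 34.
 */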
static inline void *mchunk_memmove(struct z3fold_header *zhdr,
				unsigned short dst_chunk)
{
	void *beg = zhdr;
	return memmove(beg + (dst_chunk << CHUNK_SHIFT),
		       beg + (zhdr->start_middle << CHUNK_SHIFT),
		       zhdr->middle_chunks << CHUNK_SHIFT);
}

#define BIG_CHUNK_GAP	3
/* Has to be called with lock held */
static int z3fold_compact_page(struct z3fold_header *zhdr)
{
	struct page *page = virt_to_page(zhdr);

	if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
		return 0; /* can't move middle chunk, it's used */

	if (zhdr->middle_chunks == 0)
		return 0; /* nothing to compact */

	if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
		/* move to the beginning */
		mchunk_memmove(zhdr, ZHDR_CHUNKS);
		zhdr->first_chunks = zhdr->middle_chunks;
		zhdr->middle_chunks = 0;
		zhdr->start_middle = 0;
		zhdr->first_num++;
		return 1;
	}

	/*
	 * moving data is expensive, so let's only do that if
	 * there's substantial gain (at least BIG_CHUNK_GAP chunks)
	 */
	if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 &&
	    zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >=
			BIG_CHUNK_GAP) {
		mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS);
		zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
		return 1;
	} else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 &&
		   TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle
					+ zhdr->middle_chunks) >=
			BIG_CHUNK_GAP) {
		unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks -
			zhdr->middle_chunks;
		mchunk_memmove(zhdr, new_start);
		zhdr->start_middle = new_start;
		return 1;
	}

	return 0;
}

static void do_compact_page(struct z3fold_header *zhdr, bool locked)
{
	struct z3fold_pool *pool = zhdr->pool;
	struct page *page;
	struct list_head *unbuddied;
	int fchunks;

	page = virt_to_page(zhdr);
	if (locked)
		WARN_ON(z3fold_page_trylock(zhdr));
	else
		z3fold_page_lock(zhdr);
	if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) {
		z3fold_page_unlock(zhdr);
		return;
	}
	spin_lock(&pool->lock);
	list_del_init(&zhdr->buddy);
	spin_unlock(&pool->lock);

	if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
		atomic64_dec(&pool->pages_nr);
		return;
	}

	z3fold_compact_page(zhdr);
	unbuddied = get_cpu_ptr(pool->unbuddied);
	fchunks = num_free_chunks(zhdr);
	if (fchunks < NCHUNKS &&
	    (!zhdr->first_chunks || !zhdr->middle_chunks ||
	     !zhdr->last_chunks)) {
		/* the page's not completely free and it's unbuddied */
		spin_lock(&pool->lock);
		list_add(&zhdr->buddy, &unbuddied[fchunks]);
		spin_unlock(&pool->lock);
		zhdr->cpu = smp_processor_id();
	}
	put_cpu_ptr(pool->unbuddied);
	z3fold_page_unlock(zhdr);
}

static void compact_page_work(struct work_struct *w)
{
	struct z3fold_header *zhdr = container_of(w, struct z3fold_header,
						work);

	do_compact_page(zhdr, false);
}
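/*
 * Compaction is normally deferred: z3fold_free() sets NEEDS_COMPACTING and
 * queues zhdr->work on the CPU the page "belongs" to; do_compact_page() then
 * tries to move the middle buddy and re-files the page on the current CPU's
 * unbuddied list. If the owning CPU is unknown or offline, the free path
 * calls do_compact_page() synchronously instead.
 */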
/*
 * API Functions
 */

/**
 * z3fold_create_pool() - create a new z3fold pool
 * @name:	pool name
 * @gfp:	gfp flags when allocating the z3fold pool structure
 * @ops:	user-defined operations for the z3fold pool
 *
 * Return: pointer to the new z3fold pool or NULL if the metadata allocation
 * failed.
 */
static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
		const struct z3fold_ops *ops)
{
	struct z3fold_pool *pool = NULL;
	int i, cpu;

	pool = kzalloc(sizeof(struct z3fold_pool), gfp);
	if (!pool)
		goto out;
	spin_lock_init(&pool->lock);
	spin_lock_init(&pool->stale_lock);
	pool->unbuddied = __alloc_percpu(sizeof(struct list_head) * NCHUNKS,
					 __alignof__(struct list_head));
	if (!pool->unbuddied)
		goto out_pool;
	for_each_possible_cpu(cpu) {
		struct list_head *unbuddied =
				per_cpu_ptr(pool->unbuddied, cpu);
		for_each_unbuddied_list(i, 0)
			INIT_LIST_HEAD(&unbuddied[i]);
	}
	INIT_LIST_HEAD(&pool->lru);
	INIT_LIST_HEAD(&pool->stale);
	atomic64_set(&pool->pages_nr, 0);
	pool->name = name;
	pool->compact_wq = create_singlethread_workqueue(pool->name);
	if (!pool->compact_wq)
		goto out_unbuddied;
	pool->release_wq = create_singlethread_workqueue(pool->name);
	if (!pool->release_wq)
		goto out_wq;
	INIT_WORK(&pool->work, free_pages_work);
	pool->ops = ops;
	return pool;

out_wq:
	destroy_workqueue(pool->compact_wq);
out_unbuddied:
	free_percpu(pool->unbuddied);
out_pool:
	kfree(pool);
out:
	return NULL;
}

/**
 * z3fold_destroy_pool() - destroys an existing z3fold pool
 * @pool:	the z3fold pool to be destroyed
 *
 * The pool should be emptied before this function is called.
 */
static void z3fold_destroy_pool(struct z3fold_pool *pool)
{
	/*
	 * Destroy compact_wq before release_wq: pending compaction work can
	 * still queue release work via __release_z3fold_page().
	 */
	destroy_workqueue(pool->compact_wq);
	destroy_workqueue(pool->release_wq);
	free_percpu(pool->unbuddied);
	kfree(pool);
}

/**
 * z3fold_alloc() - allocates a region of a given size
 * @pool:	z3fold pool from which to allocate
 * @size:	size in bytes of the desired allocation
 * @gfp:	gfp flags used if the pool needs to grow
 * @handle:	handle of the new allocation
 *
 * This function will attempt to find a free region in the pool large enough to
 * satisfy the allocation request. A search of the unbuddied lists is
 * performed first. If no suitable free region is found, then a new page is
 * allocated and added to the pool to satisfy the request.
 *
 * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
 * as z3fold pool pages.
 *
 * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
 * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
 * a new page.
 */
static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
			unsigned long *handle)
{
	int chunks = 0, i, freechunks;
	struct z3fold_header *zhdr = NULL;
	struct page *page = NULL;
	enum buddy bud;
	bool can_sleep = gfpflags_allow_blocking(gfp);

	if (!size || (gfp & __GFP_HIGHMEM))
		return -EINVAL;

	if (size > PAGE_SIZE)
		return -ENOSPC;

	if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
		bud = HEADLESS;
	else {
		struct list_head *unbuddied;
		chunks = size_to_chunks(size);

lookup:
		/* First, try to find an unbuddied z3fold page. */
		unbuddied = get_cpu_ptr(pool->unbuddied);
		for_each_unbuddied_list(i, chunks) {
			struct list_head *l = &unbuddied[i];

			zhdr = list_first_entry_or_null(READ_ONCE(l),
						struct z3fold_header, buddy);

			if (!zhdr)
				continue;

			/* Re-check under lock. */
			spin_lock(&pool->lock);
			l = &unbuddied[i];
			if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
					struct z3fold_header, buddy)) ||
			    !z3fold_page_trylock(zhdr)) {
				spin_unlock(&pool->lock);
				put_cpu_ptr(pool->unbuddied);
				goto lookup;
			}
			list_del_init(&zhdr->buddy);
			zhdr->cpu = -1;
			spin_unlock(&pool->lock);

			page = virt_to_page(zhdr);
			if (test_bit(NEEDS_COMPACTING, &page->private)) {
				z3fold_page_unlock(zhdr);
				zhdr = NULL;
				put_cpu_ptr(pool->unbuddied);
				if (can_sleep)
					cond_resched();
				goto lookup;
			}

			/*
			 * this page could not be removed from its unbuddied
			 * list while pool lock was held, and then we've taken
			 * page lock so kref_put could not be called before
			 * we got here, so it's safe to just call kref_get()
			 */
			kref_get(&zhdr->refcount);
			break;
		}
		put_cpu_ptr(pool->unbuddied);

		if (zhdr) {
			if (zhdr->first_chunks == 0) {
				if (zhdr->middle_chunks != 0 &&
				    chunks >= zhdr->start_middle)
					bud = LAST;
				else
					bud = FIRST;
			} else if (zhdr->last_chunks == 0)
				bud = LAST;
			else if (zhdr->middle_chunks == 0)
				bud = MIDDLE;
			else {
				if (kref_put(&zhdr->refcount,
					     release_z3fold_page_locked))
					atomic64_dec(&pool->pages_nr);
				else
					z3fold_page_unlock(zhdr);
				pr_err("No free chunks in unbuddied\n");
				WARN_ON(1);
				goto lookup;
			}
			goto found;
		}
		bud = FIRST;
	}

	page = NULL;
	if (can_sleep) {
		spin_lock(&pool->stale_lock);
		zhdr = list_first_entry_or_null(&pool->stale,
						struct z3fold_header, buddy);
		/*
		 * Before allocating a page, let's see if we can take one from
		 * the stale pages list. cancel_work_sync() can sleep so we
		 * limit this case to the contexts where we can sleep
		 */
		if (zhdr) {
			list_del(&zhdr->buddy);
			spin_unlock(&pool->stale_lock);
			cancel_work_sync(&zhdr->work);
			page = virt_to_page(zhdr);
		} else {
			spin_unlock(&pool->stale_lock);
		}
	}
	if (!page)
		page = alloc_page(gfp);

	if (!page)
		return -ENOMEM;

	atomic64_inc(&pool->pages_nr);
	zhdr = init_z3fold_page(page, pool);

	if (bud == HEADLESS) {
		set_bit(PAGE_HEADLESS, &page->private);
		goto headless;
	}
	z3fold_page_lock(zhdr);

found:
	if (bud == FIRST)
		zhdr->first_chunks = chunks;
	else if (bud == LAST)
		zhdr->last_chunks = chunks;
	else {
		zhdr->middle_chunks = chunks;
		zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
	}

	if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
	    zhdr->middle_chunks == 0) {
		struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);

		/* Add to unbuddied list */
		freechunks = num_free_chunks(zhdr);
		spin_lock(&pool->lock);
		list_add(&zhdr->buddy, &unbuddied[freechunks]);
		spin_unlock(&pool->lock);
		zhdr->cpu = smp_processor_id();
		put_cpu_ptr(pool->unbuddied);
	}

headless:
	spin_lock(&pool->lock);
	/* Add/move z3fold page to beginning of LRU */
	if (!list_empty(&page->lru))
		list_del(&page->lru);

	list_add(&page->lru, &pool->lru);

	*handle = encode_handle(zhdr, bud);
	spin_unlock(&pool->lock);
	if (bud != HEADLESS)
		z3fold_page_unlock(zhdr);

	return 0;
}
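/*
 * Allocation summary: requests larger than what fits next to the header
 * (PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) get a whole HEADLESS page of
 * their own. Smaller requests first search the per-CPU unbuddied lists for a
 * page with a big enough free slot, then (in sleepable context) try to reuse
 * a page from the stale list, and only then allocate a fresh page.
 */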
/**
 * z3fold_free() - frees the allocation associated with the given handle
 * @pool:	pool in which the allocation resided
 * @handle:	handle associated with the allocation returned by z3fold_alloc()
 *
 * In the case that the z3fold page in which the allocation resides is under
 * reclaim, as indicated by the PAGE_CLAIMED bit being set on the page, this
 * function only sets the first|middle|last_chunks to 0. The page is actually
 * freed once all buddies are evicted (see z3fold_reclaim_page() below).
 */
static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
{
	struct z3fold_header *zhdr;
	struct page *page;
	enum buddy bud;

	zhdr = handle_to_z3fold_header(handle);
	page = virt_to_page(zhdr);

	if (test_bit(PAGE_HEADLESS, &page->private)) {
		/* if a headless page is under reclaim, just leave.
		 * NB: we use test_and_set_bit for a reason: if the bit
		 * has not been set before, we release this page
		 * immediately so we don't care about its value any more.
		 */
		if (!test_and_set_bit(PAGE_CLAIMED, &page->private)) {
			spin_lock(&pool->lock);
			list_del(&page->lru);
			spin_unlock(&pool->lock);
			free_z3fold_page(page);
			atomic64_dec(&pool->pages_nr);
		}
		return;
	}

	/* Non-headless case */
	z3fold_page_lock(zhdr);
	bud = handle_to_buddy(handle);

	switch (bud) {
	case FIRST:
		zhdr->first_chunks = 0;
		break;
	case MIDDLE:
		zhdr->middle_chunks = 0;
		break;
	case LAST:
		zhdr->last_chunks = 0;
		break;
	default:
		pr_err("%s: unknown bud %d\n", __func__, bud);
		WARN_ON(1);
		z3fold_page_unlock(zhdr);
		return;
	}

	if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
		atomic64_dec(&pool->pages_nr);
		return;
	}
	if (test_bit(PAGE_CLAIMED, &page->private)) {
		z3fold_page_unlock(zhdr);
		return;
	}
	if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
		z3fold_page_unlock(zhdr);
		return;
	}
	if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
		spin_lock(&pool->lock);
		list_del_init(&zhdr->buddy);
		spin_unlock(&pool->lock);
		zhdr->cpu = -1;
		kref_get(&zhdr->refcount);
		do_compact_page(zhdr, true);
		return;
	}
	kref_get(&zhdr->refcount);
	queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
	z3fold_page_unlock(zhdr);
}
/**
 * z3fold_reclaim_page() - evicts allocations from a pool page and frees it
 * @pool:	pool from which a page will attempt to be evicted
 * @retries:	number of pages on the LRU list for which eviction will
 *		be attempted before failing
 *
 * z3fold reclaim is different from normal system reclaim in that it is done
 * from the bottom, up. This is because only the bottom layer, z3fold, has
 * information on how the allocations are organized within each z3fold page.
 * This has the potential to create interesting locking situations between
 * z3fold and the user, however.
 *
 * To avoid these, this is how z3fold_reclaim_page() should be called:
 *
 * The user detects a page should be reclaimed and calls z3fold_reclaim_page().
 * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and
 * call the user-defined eviction handler with the pool and handle as
 * arguments.
 *
 * If the handle can not be evicted, the eviction handler should return
 * non-zero. z3fold_reclaim_page() will add the z3fold page back to the
 * appropriate list and try the next z3fold page on the LRU up to
 * a user defined number of retries.
 *
 * If the handle is successfully evicted, the eviction handler should
 * return 0 _and_ should have called z3fold_free() on the handle. z3fold_free()
 * contains logic to delay freeing the page if the page is under reclaim,
 * as indicated by the PAGE_CLAIMED bit being set on the underlying page.
 *
 * If all buddies in the z3fold page are successfully evicted, then the
 * z3fold page can be freed.
 *
 * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
 * no pages to evict or an eviction handler is not registered, -EAGAIN if
 * the retry limit was hit.
 */
static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
{
	int i, ret = 0;
	struct z3fold_header *zhdr = NULL;
	struct page *page = NULL;
	struct list_head *pos;
	unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;

	spin_lock(&pool->lock);
	if (!pool->ops || !pool->ops->evict || retries == 0) {
		spin_unlock(&pool->lock);
		return -EINVAL;
	}
	for (i = 0; i < retries; i++) {
		if (list_empty(&pool->lru)) {
			spin_unlock(&pool->lock);
			return -EINVAL;
		}
		list_for_each_prev(pos, &pool->lru) {
			page = list_entry(pos, struct page, lru);

			/* this bit could have been set by free, in which case
			 * we pass over to the next page in the pool.
			 */
			if (test_and_set_bit(PAGE_CLAIMED, &page->private))
				continue;

			zhdr = page_address(page);
			if (test_bit(PAGE_HEADLESS, &page->private))
				break;

			if (!z3fold_page_trylock(zhdr)) {
				zhdr = NULL;
				continue; /* can't evict at this point */
			}
			kref_get(&zhdr->refcount);
			list_del_init(&zhdr->buddy);
			zhdr->cpu = -1;
			break;
		}

		if (!zhdr)
			break;

		list_del_init(&page->lru);
		spin_unlock(&pool->lock);

		if (!test_bit(PAGE_HEADLESS, &page->private)) {
			/*
			 * We need to encode the handles before unlocking,
			 * since we can race with free that will set
			 * (first|last)_chunks to 0
			 */
			first_handle = 0;
			last_handle = 0;
			middle_handle = 0;
			if (zhdr->first_chunks)
				first_handle = encode_handle(zhdr, FIRST);
			if (zhdr->middle_chunks)
				middle_handle = encode_handle(zhdr, MIDDLE);
			if (zhdr->last_chunks)
				last_handle = encode_handle(zhdr, LAST);
			/*
			 * it's safe to unlock here because we hold a
			 * reference to this page
			 */
			z3fold_page_unlock(zhdr);
		} else {
			first_handle = encode_handle(zhdr, HEADLESS);
			last_handle = middle_handle = 0;
		}

		/* Issue the eviction callback(s) */
		if (middle_handle) {
			ret = pool->ops->evict(pool, middle_handle);
			if (ret)
				goto next;
		}
		if (first_handle) {
			ret = pool->ops->evict(pool, first_handle);
			if (ret)
				goto next;
		}
		if (last_handle) {
			ret = pool->ops->evict(pool, last_handle);
			if (ret)
				goto next;
		}
next:
		if (test_bit(PAGE_HEADLESS, &page->private)) {
			if (ret == 0) {
				free_z3fold_page(page);
				atomic64_dec(&pool->pages_nr);
				return 0;
			}
			spin_lock(&pool->lock);
			list_add(&page->lru, &pool->lru);
			spin_unlock(&pool->lock);
		} else {
			z3fold_page_lock(zhdr);
			clear_bit(PAGE_CLAIMED, &page->private);
			if (kref_put(&zhdr->refcount,
					release_z3fold_page_locked)) {
				atomic64_dec(&pool->pages_nr);
				return 0;
			}
			/*
			 * if we are here, the page is still not completely
			 * free. Take the global pool lock then to be able
			 * to add it back to the lru list
			 */
			spin_lock(&pool->lock);
			list_add(&page->lru, &pool->lru);
			spin_unlock(&pool->lock);
			z3fold_page_unlock(zhdr);
		}

		/* We started off locked, so we need to lock the pool back */
		spin_lock(&pool->lock);
	}
	spin_unlock(&pool->lock);
	return -EAGAIN;
}
/**
 * z3fold_map() - maps the allocation associated with the given handle
 * @pool:	pool in which the allocation resides
 * @handle:	handle associated with the allocation to be mapped
 *
 * Extracts the buddy number from handle and constructs the pointer to the
 * correct starting chunk within the page.
 *
 * Returns: a pointer to the mapped allocation
 */
static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
{
	struct z3fold_header *zhdr;
	struct page *page;
	void *addr;
	enum buddy buddy;

	zhdr = handle_to_z3fold_header(handle);
	addr = zhdr;
	page = virt_to_page(zhdr);

	if (test_bit(PAGE_HEADLESS, &page->private))
		goto out;

	z3fold_page_lock(zhdr);
	buddy = handle_to_buddy(handle);
	switch (buddy) {
	case FIRST:
		addr += ZHDR_SIZE_ALIGNED;
		break;
	case MIDDLE:
		addr += zhdr->start_middle << CHUNK_SHIFT;
		set_bit(MIDDLE_CHUNK_MAPPED, &page->private);
		break;
	case LAST:
		addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT);
		break;
	default:
		pr_err("unknown buddy id %d\n", buddy);
		WARN_ON(1);
		addr = NULL;
		break;
	}

	z3fold_page_unlock(zhdr);
out:
	return addr;
}

/**
 * z3fold_unmap() - unmaps the allocation associated with the given handle
 * @pool:	pool in which the allocation resides
 * @handle:	handle associated with the allocation to be unmapped
 */
static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
{
	struct z3fold_header *zhdr;
	struct page *page;
	enum buddy buddy;

	zhdr = handle_to_z3fold_header(handle);
	page = virt_to_page(zhdr);

	if (test_bit(PAGE_HEADLESS, &page->private))
		return;

	z3fold_page_lock(zhdr);
	buddy = handle_to_buddy(handle);
	if (buddy == MIDDLE)
		clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
	z3fold_page_unlock(zhdr);
}

/**
 * z3fold_get_pool_size() - gets the z3fold pool size in pages
 * @pool:	pool whose size is being queried
 *
 * Returns: size in pages of the given pool.
 */
static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
{
	return atomic64_read(&pool->pages_nr);
}
/*****************
 * zpool
 ****************/

static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle)
{
	if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
		return pool->zpool_ops->evict(pool->zpool, handle);
	else
		return -ENOENT;
}

static const struct z3fold_ops z3fold_zpool_ops = {
	.evict =	z3fold_zpool_evict
};

static void *z3fold_zpool_create(const char *name, gfp_t gfp,
			       const struct zpool_ops *zpool_ops,
			       struct zpool *zpool)
{
	struct z3fold_pool *pool;

	pool = z3fold_create_pool(name, gfp,
				zpool_ops ? &z3fold_zpool_ops : NULL);
	if (pool) {
		pool->zpool = zpool;
		pool->zpool_ops = zpool_ops;
	}
	return pool;
}

static void z3fold_zpool_destroy(void *pool)
{
	z3fold_destroy_pool(pool);
}

static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp,
			unsigned long *handle)
{
	return z3fold_alloc(pool, size, gfp, handle);
}

static void z3fold_zpool_free(void *pool, unsigned long handle)
{
	z3fold_free(pool, handle);
}

static int z3fold_zpool_shrink(void *pool, unsigned int pages,
			unsigned int *reclaimed)
{
	unsigned int total = 0;
	int ret = -EINVAL;

	while (total < pages) {
		ret = z3fold_reclaim_page(pool, 8);
		if (ret < 0)
			break;
		total++;
	}

	if (reclaimed)
		*reclaimed = total;

	return ret;
}

static void *z3fold_zpool_map(void *pool, unsigned long handle,
			enum zpool_mapmode mm)
{
	return z3fold_map(pool, handle);
}

static void z3fold_zpool_unmap(void *pool, unsigned long handle)
{
	z3fold_unmap(pool, handle);
}

static u64 z3fold_zpool_total_size(void *pool)
{
	return z3fold_get_pool_size(pool) * PAGE_SIZE;
}
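/*
 * The driver below is how zpool users (zswap, for instance) reach z3fold:
 * creating a zpool of type "z3fold" (e.g. zpool_create_pool("z3fold", ...))
 * routes allocation, mapping and shrink requests through the callbacks above.
 */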
static struct zpool_driver z3fold_zpool_driver = {
	.type =		"z3fold",
	.owner =	THIS_MODULE,
	.create =	z3fold_zpool_create,
	.destroy =	z3fold_zpool_destroy,
	.malloc =	z3fold_zpool_malloc,
	.free =		z3fold_zpool_free,
	.shrink =	z3fold_zpool_shrink,
	.map =		z3fold_zpool_map,
	.unmap =	z3fold_zpool_unmap,
	.total_size =	z3fold_zpool_total_size,
};

MODULE_ALIAS("zpool-z3fold");

static int __init init_z3fold(void)
{
	/* Make sure the z3fold header is not larger than the page size */
	BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE);
	zpool_register_driver(&z3fold_zpool_driver);

	return 0;
}

static void __exit exit_z3fold(void)
{
	zpool_unregister_driver(&z3fold_zpool_driver);
}

module_init(init_z3fold);
module_exit(exit_z3fold);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Vitaly Wool <vitalywool@gmail.com>");
MODULE_DESCRIPTION("3-Fold Allocator for Compressed Pages");