// SPDX-License-Identifier: GPL-2.0-only
/*
 * z3fold.c
 *
 * Author: Vitaly Wool <vitaly.wool@konsulko.com>
 * Copyright (C) 2016, Sony Mobile Communications Inc.
 *
 * This implementation is based on zbud written by Seth Jennings.
 *
 * z3fold is a special purpose allocator for storing compressed pages. It
 * can store up to three compressed pages per page which improves the
 * compression ratio of zbud while retaining its main concepts (e.g. always
 * storing an integral number of objects per page) and simplicity.
 * It still has simple and deterministic reclaim properties that make it
 * preferable to a higher density approach (with no requirement on integral
 * number of objects per page) when reclaim is used.
 *
 * As in zbud, pages are divided into "chunks". The size of the chunks is
 * fixed at compile time and is determined by NCHUNKS_ORDER below.
 *
 * z3fold doesn't export any API and is meant to be used via zpool API.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/page-flags.h>
#include <linux/migrate.h>
#include <linux/node.h>
#include <linux/compaction.h>
#include <linux/percpu.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/fs.h>
#include <linux/preempt.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/zpool.h>
#include <linux/magic.h>
#include <linux/kmemleak.h>

/*
 * NCHUNKS_ORDER determines the internal allocation granularity, effectively
 * adjusting internal fragmentation. It also determines the number of
 * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
 * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
 * in the beginning of an allocated page are occupied by the z3fold header, so
 * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y),
 * which is the max number of free chunks in a z3fold page; there will also
 * be 63, or 62, respectively, freelists per pool.
 */
#define NCHUNKS_ORDER	6

#define CHUNK_SHIFT	(PAGE_SHIFT - NCHUNKS_ORDER)
#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
#define ZHDR_CHUNKS	(ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
#define TOTAL_CHUNKS	(PAGE_SIZE >> CHUNK_SHIFT)
#define NCHUNKS		((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)

#define BUDDY_MASK	(0x3)
#define BUDDY_SHIFT	2
#define SLOTS_ALIGN	(0x40)
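
/*
 * Worked example of the chunk arithmetic above (illustrative only, assuming
 * a 4 KiB page, i.e. PAGE_SHIFT == 12, and a z3fold_header that fits in a
 * single chunk):
 *
 *	CHUNK_SHIFT  = 12 - 6 = 6       -> CHUNK_SIZE = 64 bytes
 *	TOTAL_CHUNKS = 4096 >> 6        = 64 chunks per page
 *	ZHDR_SIZE_ALIGNED = 64          -> ZHDR_CHUNKS = 1
 *	NCHUNKS      = (4096 - 64) >> 6 = 63 free-list buckets
 *
 * With CONFIG_DEBUG_SPINLOCK=y the header grows past one chunk, so
 * ZHDR_SIZE_ALIGNED becomes 128, ZHDR_CHUNKS becomes 2 and NCHUNKS drops
 * to 62, matching the comment above.
 */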

/*****************
 * Structures
 *****************/
struct z3fold_pool;
struct z3fold_ops {
	int (*evict)(struct z3fold_pool *pool, unsigned long handle);
};

enum buddy {
	HEADLESS = 0,
	FIRST,
	MIDDLE,
	LAST,
	BUDDIES_MAX = LAST
};

struct z3fold_buddy_slots {
	/*
	 * we are using BUDDY_MASK in handle_to_buddy etc. so there should
	 * be enough slots to hold all possible variants
	 */
	unsigned long slot[BUDDY_MASK + 1];
	unsigned long pool; /* back link + flags */
	rwlock_t lock;
};
#define HANDLE_FLAG_MASK	(0x03)

/*
 * struct z3fold_header - z3fold page metadata occupying first chunks of each
 *			z3fold page, except for HEADLESS pages
 * @buddy:		links the z3fold page into the relevant list in the
 *			pool
 * @page_lock:		per-page lock
 * @refcount:		reference count for the z3fold page
 * @work:		work_struct for page layout optimization
 * @slots:		pointer to the structure holding buddy slots
 * @pool:		pointer to the containing pool
 * @cpu:		CPU which this page "belongs" to
 * @first_chunks:	the size of the first buddy in chunks, 0 if free
 * @middle_chunks:	the size of the middle buddy in chunks, 0 if free
 * @last_chunks:	the size of the last buddy in chunks, 0 if free
 * @start_middle:	index of the first chunk of the middle buddy
 * @first_num:		the starting number (for the first handle)
 * @mapped_count:	the number of objects currently mapped
 * @foreign_handles:	number of foreign handles pointing into this page
 */
struct z3fold_header {
	struct list_head buddy;
	spinlock_t page_lock;
	struct kref refcount;
	struct work_struct work;
	struct z3fold_buddy_slots *slots;
	struct z3fold_pool *pool;
	short cpu;
	unsigned short first_chunks;
	unsigned short middle_chunks;
	unsigned short last_chunks;
	unsigned short start_middle;
	unsigned short first_num:2;
	unsigned short mapped_count:2;
	unsigned short foreign_handles:2;
};
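
/*
 * Rough layout of a non-headless z3fold page (illustrative only, not to
 * scale):
 *
 *	+--------+-------+......+--------+......+------+
 *	| header | first | free | middle | free | last |
 *	+--------+-------+......+--------+......+------+
 *
 * The first buddy starts right after the header (ZHDR_CHUNKS chunks in),
 * the middle buddy starts at start_middle chunks, and the last buddy is
 * placed so that it ends exactly at the page boundary (see z3fold_map()
 * below).
 */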

/**
 * struct z3fold_pool - stores metadata for each z3fold pool
 * @name:	pool name
 * @lock:	protects pool unbuddied/lru lists
 * @stale_lock:	protects pool stale page list
 * @unbuddied:	per-cpu array of lists tracking z3fold pages that contain 2-
 *		buddies; the list each z3fold page is added to depends on
 *		the size of its free region.
 * @lru:	list tracking the z3fold pages in LRU order by most recently
 *		added buddy.
 * @stale:	list of pages marked for freeing
 * @pages_nr:	number of z3fold pages in the pool.
 * @c_handle:	cache for z3fold_buddy_slots allocation
 * @ops:	pointer to a structure of user defined operations specified at
 *		pool creation time.
 * @zpool:	zpool driver
 * @zpool_ops:	zpool operations structure with an evict callback
 * @compact_wq:	workqueue for page layout background optimization
 * @release_wq:	workqueue for safe page release
 * @work:	work_struct for safe page release
 * @inode:	inode for z3fold pseudo filesystem
 *
 * This structure is allocated at pool creation time and maintains metadata
 * pertaining to a particular z3fold pool.
 */
struct z3fold_pool {
	const char *name;
	spinlock_t lock;
	spinlock_t stale_lock;
	struct list_head *unbuddied;
	struct list_head lru;
	struct list_head stale;
	atomic64_t pages_nr;
	struct kmem_cache *c_handle;
	const struct z3fold_ops *ops;
	struct zpool *zpool;
	const struct zpool_ops *zpool_ops;
	struct workqueue_struct *compact_wq;
	struct workqueue_struct *release_wq;
	struct work_struct work;
	struct inode *inode;
};

/*
 * Internal z3fold page flags
 */
enum z3fold_page_flags {
	PAGE_HEADLESS = 0,
	MIDDLE_CHUNK_MAPPED,
	NEEDS_COMPACTING,
	PAGE_STALE,
	PAGE_CLAIMED, /* by either reclaim or free */
};

/*
 * handle flags, go under HANDLE_FLAG_MASK
 */
enum z3fold_handle_flags {
	HANDLES_ORPHANED = 0,
};

/*
 * Forward declarations
 */
static struct z3fold_header *__z3fold_alloc(struct z3fold_pool *, size_t, bool);
static void compact_page_work(struct work_struct *w);

/*****************
 * Helpers
 *****************/

/* Converts an allocation size in bytes to size in z3fold chunks */
static int size_to_chunks(size_t size)
{
	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
}

#define for_each_unbuddied_list(_iter, _begin) \
	for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)

static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
							gfp_t gfp)
{
	struct z3fold_buddy_slots *slots;

	slots = kmem_cache_zalloc(pool->c_handle,
				  (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE)));

	if (slots) {
		/* It will be freed separately in free_handle(). */
		kmemleak_not_leak(slots);
		slots->pool = (unsigned long)pool;
		rwlock_init(&slots->lock);
	}

	return slots;
}

static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s)
{
	return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK);
}

static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle)
{
	return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
}

/* Lock a z3fold page */
static inline void z3fold_page_lock(struct z3fold_header *zhdr)
{
	spin_lock(&zhdr->page_lock);
}

/* Try to lock a z3fold page */
static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
{
	return spin_trylock(&zhdr->page_lock);
}

/* Unlock a z3fold page */
static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
{
	spin_unlock(&zhdr->page_lock);
}


static inline struct z3fold_header *__get_z3fold_header(unsigned long handle,
							bool lock)
{
	struct z3fold_buddy_slots *slots;
	struct z3fold_header *zhdr;
	int locked = 0;

	if (!(handle & (1 << PAGE_HEADLESS))) {
		slots = handle_to_slots(handle);
		do {
			unsigned long addr;

			read_lock(&slots->lock);
			addr = *(unsigned long *)handle;
			zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
			if (lock)
				locked = z3fold_page_trylock(zhdr);
			read_unlock(&slots->lock);
			if (locked)
				break;
			cpu_relax();
		} while (lock);
	} else {
		zhdr = (struct z3fold_header *)(handle & PAGE_MASK);
	}

	return zhdr;
}

/* Returns the z3fold page where a given handle is stored */
static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
{
	return __get_z3fold_header(h, false);
}

/* return locked z3fold page if it's not headless */
static inline struct z3fold_header *get_z3fold_header(unsigned long h)
{
	return __get_z3fold_header(h, true);
}

static inline void put_z3fold_header(struct z3fold_header *zhdr)
{
	struct page *page = virt_to_page(zhdr);

	if (!test_bit(PAGE_HEADLESS, &page->private))
		z3fold_page_unlock(zhdr);
}

static inline void free_handle(unsigned long handle)
{
	struct z3fold_buddy_slots *slots;
	struct z3fold_header *zhdr;
	int i;
	bool is_free;

	if (handle & (1 << PAGE_HEADLESS))
		return;

	if (WARN_ON(*(unsigned long *)handle == 0))
		return;

	zhdr = handle_to_z3fold_header(handle);
	slots = handle_to_slots(handle);
	write_lock(&slots->lock);
	*(unsigned long *)handle = 0;
	if (zhdr->slots == slots) {
		write_unlock(&slots->lock);
		return; /* simple case, nothing else to do */
	}

	/* we are freeing a foreign handle if we are here */
	zhdr->foreign_handles--;
	is_free = true;
	if (!test_bit(HANDLES_ORPHANED, &slots->pool)) {
		write_unlock(&slots->lock);
		return;
	}
	for (i = 0; i <= BUDDY_MASK; i++) {
		if (slots->slot[i]) {
			is_free = false;
			break;
		}
	}
	write_unlock(&slots->lock);

	if (is_free) {
		struct z3fold_pool *pool = slots_to_pool(slots);

		kmem_cache_free(pool->c_handle, slots);
	}
}

static int z3fold_init_fs_context(struct fs_context *fc)
{
	return init_pseudo(fc, Z3FOLD_MAGIC) ? 0 : -ENOMEM;
}

static struct file_system_type z3fold_fs = {
	.name		= "z3fold",
	.init_fs_context = z3fold_init_fs_context,
	.kill_sb	= kill_anon_super,
};

static struct vfsmount *z3fold_mnt;
static int z3fold_mount(void)
{
	int ret = 0;

	z3fold_mnt = kern_mount(&z3fold_fs);
	if (IS_ERR(z3fold_mnt))
		ret = PTR_ERR(z3fold_mnt);

	return ret;
}

static void z3fold_unmount(void)
{
	kern_unmount(z3fold_mnt);
}

static const struct address_space_operations z3fold_aops;
static int z3fold_register_migration(struct z3fold_pool *pool)
{
	pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb);
	if (IS_ERR(pool->inode)) {
		pool->inode = NULL;
		return 1;
	}

	pool->inode->i_mapping->private_data = pool;
	pool->inode->i_mapping->a_ops = &z3fold_aops;
	return 0;
}

static void z3fold_unregister_migration(struct z3fold_pool *pool)
{
	if (pool->inode)
		iput(pool->inode);
}

/* Initializes the z3fold header of a newly allocated z3fold page */
static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
					struct z3fold_pool *pool, gfp_t gfp)
{
	struct z3fold_header *zhdr = page_address(page);
	struct z3fold_buddy_slots *slots;

	INIT_LIST_HEAD(&page->lru);
	clear_bit(PAGE_HEADLESS, &page->private);
	clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
	clear_bit(NEEDS_COMPACTING, &page->private);
	clear_bit(PAGE_STALE, &page->private);
	clear_bit(PAGE_CLAIMED, &page->private);
	if (headless)
		return zhdr;

	slots = alloc_slots(pool, gfp);
	if (!slots)
		return NULL;

	spin_lock_init(&zhdr->page_lock);
	kref_init(&zhdr->refcount);
	zhdr->first_chunks = 0;
	zhdr->middle_chunks = 0;
	zhdr->last_chunks = 0;
	zhdr->first_num = 0;
	zhdr->start_middle = 0;
	zhdr->cpu = -1;
	zhdr->foreign_handles = 0;
	zhdr->mapped_count = 0;
	zhdr->slots = slots;
	zhdr->pool = pool;
	INIT_LIST_HEAD(&zhdr->buddy);
	INIT_WORK(&zhdr->work, compact_page_work);
	return zhdr;
}

/* Resets the struct page fields and frees the page */
static void free_z3fold_page(struct page *page, bool headless)
{
	if (!headless) {
		lock_page(page);
		__ClearPageMovable(page);
		unlock_page(page);
	}
	ClearPagePrivate(page);
	__free_page(page);
}

/* Helper function to build the index */
static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
{
	return (bud + zhdr->first_num) & BUDDY_MASK;
}

/*
 * Encodes the handle of a particular buddy within a z3fold page
 * Pool lock should be held as this function accesses first_num
 */
static unsigned long __encode_handle(struct z3fold_header *zhdr,
				struct z3fold_buddy_slots *slots,
				enum buddy bud)
{
	unsigned long h = (unsigned long)zhdr;
	int idx = 0;

	/*
	 * For a headless page, its handle is its pointer with the extra
	 * PAGE_HEADLESS bit set
	 */
	if (bud == HEADLESS)
		return h | (1 << PAGE_HEADLESS);

	/* otherwise, return pointer to encoded handle */
	idx = __idx(zhdr, bud);
	h += idx;
	if (bud == LAST)
		h |= (zhdr->last_chunks << BUDDY_SHIFT);

	write_lock(&slots->lock);
	slots->slot[idx] = h;
	write_unlock(&slots->lock);
	return (unsigned long)&slots->slot[idx];
}

static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
{
	return __encode_handle(zhdr, zhdr->slots, bud);
}

/* only for LAST bud, returns zero otherwise */
static unsigned short handle_to_chunks(unsigned long handle)
{
	struct z3fold_buddy_slots *slots = handle_to_slots(handle);
	unsigned long addr;

	read_lock(&slots->lock);
	addr = *(unsigned long *)handle;
	read_unlock(&slots->lock);
	return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
}
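
/*
 * Worked example of the handle encoding above (illustrative only; values
 * assume a page-aligned zhdr and first_num == 1):
 *
 *	encode_handle(zhdr, LAST) with last_chunks == 5:
 *	  idx    = (LAST + first_num) & BUDDY_MASK = (3 + 1) & 3 = 0
 *	  slot   = (unsigned long)zhdr + idx + (5 << BUDDY_SHIFT)
 *	  handle = (unsigned long)&slots->slot[0]
 *
 *	handle_to_chunks(handle) = (slot & ~PAGE_MASK) >> BUDDY_SHIFT = 5
 *	handle_to_buddy(handle)  = (slot - first_num) & BUDDY_MASK
 *				 = (0 + (5 << 2) - 1) & 3 = 3 = LAST
 */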

/*
 * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle
 * but that doesn't matter, because the masking will result in the
 * correct buddy number.
 */
static enum buddy handle_to_buddy(unsigned long handle)
{
	struct z3fold_header *zhdr;
	struct z3fold_buddy_slots *slots = handle_to_slots(handle);
	unsigned long addr;

	read_lock(&slots->lock);
	WARN_ON(handle & (1 << PAGE_HEADLESS));
	addr = *(unsigned long *)handle;
	read_unlock(&slots->lock);
	zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
	return (addr - zhdr->first_num) & BUDDY_MASK;
}

static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
{
	return zhdr->pool;
}

static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
{
	struct page *page = virt_to_page(zhdr);
	struct z3fold_pool *pool = zhdr_to_pool(zhdr);
	bool is_free = true;
	int i;

	WARN_ON(!list_empty(&zhdr->buddy));
	set_bit(PAGE_STALE, &page->private);
	clear_bit(NEEDS_COMPACTING, &page->private);
	spin_lock(&pool->lock);
	if (!list_empty(&page->lru))
		list_del_init(&page->lru);
	spin_unlock(&pool->lock);

	/* If there are no foreign handles, free the handles array */
	read_lock(&zhdr->slots->lock);
	for (i = 0; i <= BUDDY_MASK; i++) {
		if (zhdr->slots->slot[i]) {
			is_free = false;
			break;
		}
	}
	if (!is_free)
		set_bit(HANDLES_ORPHANED, &zhdr->slots->pool);
	read_unlock(&zhdr->slots->lock);

	if (is_free)
		kmem_cache_free(pool->c_handle, zhdr->slots);

	if (locked)
		z3fold_page_unlock(zhdr);

	spin_lock(&pool->stale_lock);
	list_add(&zhdr->buddy, &pool->stale);
	queue_work(pool->release_wq, &pool->work);
	spin_unlock(&pool->stale_lock);
}

static void __attribute__((__unused__))
			release_z3fold_page(struct kref *ref)
{
	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
						refcount);
	__release_z3fold_page(zhdr, false);
}

static void release_z3fold_page_locked(struct kref *ref)
{
	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
						refcount);
	WARN_ON(z3fold_page_trylock(zhdr));
	__release_z3fold_page(zhdr, true);
}

static void release_z3fold_page_locked_list(struct kref *ref)
{
	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
						refcount);
	struct z3fold_pool *pool = zhdr_to_pool(zhdr);

	spin_lock(&pool->lock);
	list_del_init(&zhdr->buddy);
	spin_unlock(&pool->lock);

	WARN_ON(z3fold_page_trylock(zhdr));
	__release_z3fold_page(zhdr, true);
}

static void free_pages_work(struct work_struct *w)
{
	struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work);

	spin_lock(&pool->stale_lock);
	while (!list_empty(&pool->stale)) {
		struct z3fold_header *zhdr = list_first_entry(&pool->stale,
						struct z3fold_header, buddy);
		struct page *page = virt_to_page(zhdr);

		list_del(&zhdr->buddy);
		if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
			continue;
		spin_unlock(&pool->stale_lock);
		cancel_work_sync(&zhdr->work);
		free_z3fold_page(page, false);
		cond_resched();
		spin_lock(&pool->stale_lock);
	}
	spin_unlock(&pool->stale_lock);
}

/*
 * Returns the number of free chunks in a z3fold page.
 * NB: can't be used with HEADLESS pages.
 */
static int num_free_chunks(struct z3fold_header *zhdr)
{
	int nfree;
	/*
	 * If there is a middle object, pick up the bigger free space
	 * either before or after it. Otherwise just subtract the number
	 * of chunks occupied by the first and the last objects.
	 */
	if (zhdr->middle_chunks != 0) {
		int nfree_before = zhdr->first_chunks ?
			0 : zhdr->start_middle - ZHDR_CHUNKS;
		int nfree_after = zhdr->last_chunks ?
			0 : TOTAL_CHUNKS -
				(zhdr->start_middle + zhdr->middle_chunks);
		nfree = max(nfree_before, nfree_after);
	} else
		nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
	return nfree;
}
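
/*
 * Example (illustrative only, assuming 4 KiB pages so TOTAL_CHUNKS == 64
 * and ZHDR_CHUNKS == 1): a page holding only a 10-chunk middle buddy at
 * start_middle == 20 has nfree_before = 20 - 1 = 19 and
 * nfree_after = 64 - (20 + 10) = 34, so num_free_chunks() returns 34;
 * only the larger of the two gaps is usable for a new buddy.
 */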

/* Add to the appropriate unbuddied list */
static inline void add_to_unbuddied(struct z3fold_pool *pool,
				struct z3fold_header *zhdr)
{
	if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
			zhdr->middle_chunks == 0) {
		struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);

		int freechunks = num_free_chunks(zhdr);
		spin_lock(&pool->lock);
		list_add(&zhdr->buddy, &unbuddied[freechunks]);
		spin_unlock(&pool->lock);
		zhdr->cpu = smp_processor_id();
		put_cpu_ptr(pool->unbuddied);
	}
}

static inline void *mchunk_memmove(struct z3fold_header *zhdr,
				unsigned short dst_chunk)
{
	void *beg = zhdr;
	return memmove(beg + (dst_chunk << CHUNK_SHIFT),
		       beg + (zhdr->start_middle << CHUNK_SHIFT),
		       zhdr->middle_chunks << CHUNK_SHIFT);
}

static inline bool buddy_single(struct z3fold_header *zhdr)
{
	return !((zhdr->first_chunks && zhdr->middle_chunks) ||
			(zhdr->first_chunks && zhdr->last_chunks) ||
			(zhdr->middle_chunks && zhdr->last_chunks));
}

static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr)
{
	struct z3fold_pool *pool = zhdr_to_pool(zhdr);
	void *p = zhdr;
	unsigned long old_handle = 0;
	size_t sz = 0;
	struct z3fold_header *new_zhdr = NULL;
	int first_idx = __idx(zhdr, FIRST);
	int middle_idx = __idx(zhdr, MIDDLE);
	int last_idx = __idx(zhdr, LAST);
	unsigned short *moved_chunks = NULL;

	/*
	 * No need to protect slots here -- all the slots are "local" and
	 * the page lock is already taken
	 */
	if (zhdr->first_chunks && zhdr->slots->slot[first_idx]) {
		p += ZHDR_SIZE_ALIGNED;
		sz = zhdr->first_chunks << CHUNK_SHIFT;
		old_handle = (unsigned long)&zhdr->slots->slot[first_idx];
		moved_chunks = &zhdr->first_chunks;
	} else if (zhdr->middle_chunks && zhdr->slots->slot[middle_idx]) {
		p += zhdr->start_middle << CHUNK_SHIFT;
		sz = zhdr->middle_chunks << CHUNK_SHIFT;
		old_handle = (unsigned long)&zhdr->slots->slot[middle_idx];
		moved_chunks = &zhdr->middle_chunks;
	} else if (zhdr->last_chunks && zhdr->slots->slot[last_idx]) {
		p += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
		sz = zhdr->last_chunks << CHUNK_SHIFT;
		old_handle = (unsigned long)&zhdr->slots->slot[last_idx];
		moved_chunks = &zhdr->last_chunks;
	}

	if (sz > 0) {
		enum buddy new_bud = HEADLESS;
		short chunks = size_to_chunks(sz);
		void *q;

		new_zhdr = __z3fold_alloc(pool, sz, false);
		if (!new_zhdr)
			return NULL;

		if (WARN_ON(new_zhdr == zhdr))
			goto out_fail;

		if (new_zhdr->first_chunks == 0) {
			if (new_zhdr->middle_chunks != 0 &&
					chunks >= new_zhdr->start_middle) {
				new_bud = LAST;
			} else {
				new_bud = FIRST;
			}
		} else if (new_zhdr->last_chunks == 0) {
			new_bud = LAST;
		} else if (new_zhdr->middle_chunks == 0) {
			new_bud = MIDDLE;
		}
		q = new_zhdr;
		switch (new_bud) {
		case FIRST:
			new_zhdr->first_chunks = chunks;
			q += ZHDR_SIZE_ALIGNED;
			break;
		case MIDDLE:
			new_zhdr->middle_chunks = chunks;
			new_zhdr->start_middle =
				new_zhdr->first_chunks + ZHDR_CHUNKS;
			q += new_zhdr->start_middle << CHUNK_SHIFT;
			break;
		case LAST:
			new_zhdr->last_chunks = chunks;
			q += PAGE_SIZE - (new_zhdr->last_chunks << CHUNK_SHIFT);
			break;
		default:
			goto out_fail;
		}
		new_zhdr->foreign_handles++;
		memcpy(q, p, sz);
		write_lock(&zhdr->slots->lock);
		*(unsigned long *)old_handle = (unsigned long)new_zhdr +
			__idx(new_zhdr, new_bud);
		if (new_bud == LAST)
			*(unsigned long *)old_handle |=
					(new_zhdr->last_chunks << BUDDY_SHIFT);
		write_unlock(&zhdr->slots->lock);
		add_to_unbuddied(pool, new_zhdr);
		z3fold_page_unlock(new_zhdr);

		*moved_chunks = 0;
	}

	return new_zhdr;

out_fail:
	if (new_zhdr) {
		if (kref_put(&new_zhdr->refcount, release_z3fold_page_locked))
			atomic64_dec(&pool->pages_nr);
		else {
			add_to_unbuddied(pool, new_zhdr);
			z3fold_page_unlock(new_zhdr);
		}
	}
	return NULL;

}

#define BIG_CHUNK_GAP	3
/* Has to be called with lock held */
static int z3fold_compact_page(struct z3fold_header *zhdr)
{
	struct page *page = virt_to_page(zhdr);

	if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
		return 0; /* can't move middle chunk, it's used */

	if (unlikely(PageIsolated(page)))
		return 0;

	if (zhdr->middle_chunks == 0)
		return 0; /* nothing to compact */

	if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
		/* move to the beginning */
		mchunk_memmove(zhdr, ZHDR_CHUNKS);
		zhdr->first_chunks = zhdr->middle_chunks;
		zhdr->middle_chunks = 0;
		zhdr->start_middle = 0;
		zhdr->first_num++;
		return 1;
	}

	/*
	 * moving data is expensive, so let's only do that if
	 * there's substantial gain (at least BIG_CHUNK_GAP chunks)
	 */
	if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 &&
	    zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >=
			BIG_CHUNK_GAP) {
		mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS);
		zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
		return 1;
	} else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 &&
		   TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle
					+ zhdr->middle_chunks) >=
			BIG_CHUNK_GAP) {
		unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks -
			zhdr->middle_chunks;
		mchunk_memmove(zhdr, new_start);
		zhdr->start_middle = new_start;
		return 1;
	}

	return 0;
}
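
/*
 * Illustrative example of the BIG_CHUNK_GAP heuristic above (assuming
 * 4 KiB pages, so ZHDR_CHUNKS == 1): with a 4-chunk first buddy, no last
 * buddy, and a middle buddy at start_middle == 12, the gap below the
 * middle buddy is 12 - (4 + 1) = 7 chunks >= BIG_CHUNK_GAP, so the middle
 * buddy is moved down to chunk 5 and all remaining free space ends up
 * after it.
 */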

static void do_compact_page(struct z3fold_header *zhdr, bool locked)
{
	struct z3fold_pool *pool = zhdr_to_pool(zhdr);
	struct page *page;

	page = virt_to_page(zhdr);
	if (locked)
		WARN_ON(z3fold_page_trylock(zhdr));
	else
		z3fold_page_lock(zhdr);
	if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) {
		z3fold_page_unlock(zhdr);
		return;
	}
	spin_lock(&pool->lock);
	list_del_init(&zhdr->buddy);
	spin_unlock(&pool->lock);

	if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
		atomic64_dec(&pool->pages_nr);
		return;
	}

	if (unlikely(PageIsolated(page) ||
		     test_bit(PAGE_CLAIMED, &page->private) ||
		     test_bit(PAGE_STALE, &page->private))) {
		z3fold_page_unlock(zhdr);
		return;
	}

	if (!zhdr->foreign_handles && buddy_single(zhdr) &&
	    zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) {
		if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
			atomic64_dec(&pool->pages_nr);
		else
			z3fold_page_unlock(zhdr);
		return;
	}

	z3fold_compact_page(zhdr);
	add_to_unbuddied(pool, zhdr);
	z3fold_page_unlock(zhdr);
}

static void compact_page_work(struct work_struct *w)
{
	struct z3fold_header *zhdr = container_of(w, struct z3fold_header,
						work);

	do_compact_page(zhdr, false);
}

/* returns _locked_ z3fold page header or NULL */
static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
						size_t size, bool can_sleep)
{
	struct z3fold_header *zhdr = NULL;
	struct page *page;
	struct list_head *unbuddied;
	int chunks = size_to_chunks(size), i;

lookup:
	/* First, try to find an unbuddied z3fold page. */
	unbuddied = get_cpu_ptr(pool->unbuddied);
	for_each_unbuddied_list(i, chunks) {
		struct list_head *l = &unbuddied[i];

		zhdr = list_first_entry_or_null(READ_ONCE(l),
					struct z3fold_header, buddy);

		if (!zhdr)
			continue;

		/* Re-check under lock. */
		spin_lock(&pool->lock);
		l = &unbuddied[i];
		if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
						struct z3fold_header, buddy)) ||
		    !z3fold_page_trylock(zhdr)) {
			spin_unlock(&pool->lock);
			zhdr = NULL;
			put_cpu_ptr(pool->unbuddied);
			if (can_sleep)
				cond_resched();
			goto lookup;
		}
		list_del_init(&zhdr->buddy);
		zhdr->cpu = -1;
		spin_unlock(&pool->lock);

		page = virt_to_page(zhdr);
		if (test_bit(NEEDS_COMPACTING, &page->private) ||
		    test_bit(PAGE_CLAIMED, &page->private)) {
			z3fold_page_unlock(zhdr);
			zhdr = NULL;
			put_cpu_ptr(pool->unbuddied);
			if (can_sleep)
				cond_resched();
			goto lookup;
		}

		/*
		 * this page could not be removed from its unbuddied
		 * list while pool lock was held, and then we've taken
		 * page lock so kref_put could not be called before
		 * we got here, so it's safe to just call kref_get()
		 */
		kref_get(&zhdr->refcount);
		break;
	}
	put_cpu_ptr(pool->unbuddied);

	if (!zhdr) {
		int cpu;

		/* look for _exact_ match on other cpus' lists */
		for_each_online_cpu(cpu) {
			struct list_head *l;

			unbuddied = per_cpu_ptr(pool->unbuddied, cpu);
			spin_lock(&pool->lock);
			l = &unbuddied[chunks];

			zhdr = list_first_entry_or_null(READ_ONCE(l),
						struct z3fold_header, buddy);

			if (!zhdr || !z3fold_page_trylock(zhdr)) {
				spin_unlock(&pool->lock);
				zhdr = NULL;
				continue;
			}
			list_del_init(&zhdr->buddy);
			zhdr->cpu = -1;
			spin_unlock(&pool->lock);

			page = virt_to_page(zhdr);
			if (test_bit(NEEDS_COMPACTING, &page->private) ||
			    test_bit(PAGE_CLAIMED, &page->private)) {
				z3fold_page_unlock(zhdr);
				zhdr = NULL;
				if (can_sleep)
					cond_resched();
				continue;
			}
			kref_get(&zhdr->refcount);
			break;
		}
	}

	return zhdr;
}

/*
 * API Functions
 */

/**
 * z3fold_create_pool() - create a new z3fold pool
 * @name:	pool name
 * @gfp:	gfp flags when allocating the z3fold pool structure
 * @ops:	user-defined operations for the z3fold pool
 *
 * Return: pointer to the new z3fold pool or NULL if the metadata allocation
 * failed.
 */
static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
		const struct z3fold_ops *ops)
{
	struct z3fold_pool *pool = NULL;
	int i, cpu;

	pool = kzalloc(sizeof(struct z3fold_pool), gfp);
	if (!pool)
		goto out;
	pool->c_handle = kmem_cache_create("z3fold_handle",
				sizeof(struct z3fold_buddy_slots),
				SLOTS_ALIGN, 0, NULL);
	if (!pool->c_handle)
		goto out_c;
	spin_lock_init(&pool->lock);
	spin_lock_init(&pool->stale_lock);
	pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
	if (!pool->unbuddied)
		goto out_pool;
	for_each_possible_cpu(cpu) {
		struct list_head *unbuddied =
				per_cpu_ptr(pool->unbuddied, cpu);
		for_each_unbuddied_list(i, 0)
			INIT_LIST_HEAD(&unbuddied[i]);
	}
	INIT_LIST_HEAD(&pool->lru);
	INIT_LIST_HEAD(&pool->stale);
	atomic64_set(&pool->pages_nr, 0);
	pool->name = name;
	pool->compact_wq = create_singlethread_workqueue(pool->name);
	if (!pool->compact_wq)
		goto out_unbuddied;
	pool->release_wq = create_singlethread_workqueue(pool->name);
	if (!pool->release_wq)
		goto out_wq;
	if (z3fold_register_migration(pool))
		goto out_rwq;
	INIT_WORK(&pool->work, free_pages_work);
	pool->ops = ops;
	return pool;

out_rwq:
	destroy_workqueue(pool->release_wq);
out_wq:
	destroy_workqueue(pool->compact_wq);
out_unbuddied:
	free_percpu(pool->unbuddied);
out_pool:
	kmem_cache_destroy(pool->c_handle);
out_c:
	kfree(pool);
out:
	return NULL;
}

/**
 * z3fold_destroy_pool() - destroys an existing z3fold pool
 * @pool:	the z3fold pool to be destroyed
 *
 * The pool should be emptied before this function is called.
 */
static void z3fold_destroy_pool(struct z3fold_pool *pool)
{
	kmem_cache_destroy(pool->c_handle);

	/*
	 * We need to destroy pool->compact_wq before pool->release_wq,
	 * as any pending work on pool->compact_wq will call
	 * queue_work(pool->release_wq, &pool->work).
	 *
	 * There are still outstanding pages until both workqueues are drained,
	 * so we cannot unregister migration until then.
	 */

	destroy_workqueue(pool->compact_wq);
	destroy_workqueue(pool->release_wq);
	z3fold_unregister_migration(pool);
	kfree(pool);
}

/**
 * z3fold_alloc() - allocates a region of a given size
 * @pool:	z3fold pool from which to allocate
 * @size:	size in bytes of the desired allocation
 * @gfp:	gfp flags used if the pool needs to grow
 * @handle:	handle of the new allocation
 *
 * This function will attempt to find a free region in the pool large enough
 * to satisfy the allocation request. A search of the unbuddied lists is
 * performed first. If no suitable free region is found, then a new page is
 * allocated and added to the pool to satisfy the request.
 *
 * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
 * as z3fold pool pages.
 *
 * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
 * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
 * a new page.
 */
static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
			unsigned long *handle)
{
	int chunks = size_to_chunks(size);
	struct z3fold_header *zhdr = NULL;
	struct page *page = NULL;
	enum buddy bud;
	bool can_sleep = gfpflags_allow_blocking(gfp);

	if (!size)
		return -EINVAL;

	if (size > PAGE_SIZE)
		return -ENOSPC;

	if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
		bud = HEADLESS;
	else {
retry:
		zhdr = __z3fold_alloc(pool, size, can_sleep);
		if (zhdr) {
			if (zhdr->first_chunks == 0) {
				if (zhdr->middle_chunks != 0 &&
				    chunks >= zhdr->start_middle)
					bud = LAST;
				else
					bud = FIRST;
			} else if (zhdr->last_chunks == 0)
				bud = LAST;
			else if (zhdr->middle_chunks == 0)
				bud = MIDDLE;
			else {
				if (kref_put(&zhdr->refcount,
					     release_z3fold_page_locked))
					atomic64_dec(&pool->pages_nr);
				else
					z3fold_page_unlock(zhdr);
				pr_err("No free chunks in unbuddied\n");
				WARN_ON(1);
				goto retry;
			}
			page = virt_to_page(zhdr);
			goto found;
		}
		bud = FIRST;
	}

	page = NULL;
	if (can_sleep) {
		spin_lock(&pool->stale_lock);
		zhdr = list_first_entry_or_null(&pool->stale,
						struct z3fold_header, buddy);
		/*
		 * Before allocating a page, let's see if we can take one from
		 * the stale pages list. cancel_work_sync() can sleep so we
		 * limit this case to the contexts where we can sleep
		 */
		if (zhdr) {
			list_del(&zhdr->buddy);
			spin_unlock(&pool->stale_lock);
			cancel_work_sync(&zhdr->work);
			page = virt_to_page(zhdr);
		} else {
			spin_unlock(&pool->stale_lock);
		}
	}
	if (!page)
		page = alloc_page(gfp);

	if (!page)
		return -ENOMEM;

	zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp);
	if (!zhdr) {
		__free_page(page);
		return -ENOMEM;
	}
	atomic64_inc(&pool->pages_nr);

	if (bud == HEADLESS) {
		set_bit(PAGE_HEADLESS, &page->private);
		goto headless;
	}
	if (can_sleep) {
		lock_page(page);
		__SetPageMovable(page, pool->inode->i_mapping);
		unlock_page(page);
	} else {
		if (trylock_page(page)) {
			__SetPageMovable(page, pool->inode->i_mapping);
			unlock_page(page);
		}
	}
	z3fold_page_lock(zhdr);

found:
	if (bud == FIRST)
		zhdr->first_chunks = chunks;
	else if (bud == LAST)
		zhdr->last_chunks = chunks;
	else {
		zhdr->middle_chunks = chunks;
		zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
	}
	add_to_unbuddied(pool, zhdr);

headless:
	spin_lock(&pool->lock);
	/* Add/move z3fold page to beginning of LRU */
	if (!list_empty(&page->lru))
		list_del(&page->lru);

	list_add(&page->lru, &pool->lru);

	*handle = encode_handle(zhdr, bud);
	spin_unlock(&pool->lock);
	if (bud != HEADLESS)
		z3fold_page_unlock(zhdr);

	return 0;
}

/**
 * z3fold_free() - frees the allocation associated with the given handle
 * @pool:	pool in which the allocation resided
 * @handle:	handle associated with the allocation returned by z3fold_alloc()
 *
 * In the case that the z3fold page in which the allocation resides is under
 * reclaim, as indicated by the PG_reclaim flag being set, this function
 * only sets the first|last_chunks to 0.
 * The page is actually freed once both buddies are evicted (see
 * z3fold_reclaim_page() below).
 */
static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
{
	struct z3fold_header *zhdr;
	struct page *page;
	enum buddy bud;
	bool page_claimed;

	zhdr = get_z3fold_header(handle);
	page = virt_to_page(zhdr);
	page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);

	if (test_bit(PAGE_HEADLESS, &page->private)) {
		/* if a headless page is under reclaim, just leave.
		 * NB: we use test_and_set_bit for a reason: if the bit
		 * has not been set before, we release this page
		 * immediately so we don't care about its value any more.
		 */
		if (!page_claimed) {
			spin_lock(&pool->lock);
			list_del(&page->lru);
			spin_unlock(&pool->lock);
			put_z3fold_header(zhdr);
			free_z3fold_page(page, true);
			atomic64_dec(&pool->pages_nr);
		}
		return;
	}

	/* Non-headless case */
	bud = handle_to_buddy(handle);

	switch (bud) {
	case FIRST:
		zhdr->first_chunks = 0;
		break;
	case MIDDLE:
		zhdr->middle_chunks = 0;
		break;
	case LAST:
		zhdr->last_chunks = 0;
		break;
	default:
		pr_err("%s: unknown bud %d\n", __func__, bud);
		WARN_ON(1);
		put_z3fold_header(zhdr);
		clear_bit(PAGE_CLAIMED, &page->private);
		return;
	}

	if (!page_claimed)
		free_handle(handle);
	if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
		atomic64_dec(&pool->pages_nr);
		return;
	}
	if (page_claimed) {
		/* the page has not been claimed by us */
		z3fold_page_unlock(zhdr);
		return;
	}
	if (unlikely(PageIsolated(page)) ||
	    test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
		put_z3fold_header(zhdr);
		clear_bit(PAGE_CLAIMED, &page->private);
		return;
	}
	if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
		spin_lock(&pool->lock);
		list_del_init(&zhdr->buddy);
		spin_unlock(&pool->lock);
		zhdr->cpu = -1;
		kref_get(&zhdr->refcount);
		clear_bit(PAGE_CLAIMED, &page->private);
		do_compact_page(zhdr, true);
		return;
	}
	kref_get(&zhdr->refcount);
	clear_bit(PAGE_CLAIMED, &page->private);
	queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
	put_z3fold_header(zhdr);
}

/**
 * z3fold_reclaim_page() - evicts allocations from a pool page and frees it
 * @pool:	pool from which a page will attempt to be evicted
 * @retries:	number of pages on the LRU list for which eviction will
 *		be attempted before failing
 *
 * z3fold reclaim is different from normal system reclaim in that it is done
 * from the bottom, up. This is because only the bottom layer, z3fold, has
 * information on how the allocations are organized within each z3fold page.
 * This has the potential to create interesting locking situations between
 * z3fold and the user, however.
 *
 * To avoid these, this is how z3fold_reclaim_page() should be called:
 *
 * The user detects a page should be reclaimed and calls z3fold_reclaim_page().
 * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and
 * call the user-defined eviction handler with the pool and handle as
 * arguments.
 *
 * If the handle can not be evicted, the eviction handler should return
 * non-zero.
 * z3fold_reclaim_page() will add the z3fold page back to the
 * appropriate list and try the next z3fold page on the LRU up to
 * a user defined number of retries.
 *
 * If the handle is successfully evicted, the eviction handler should
 * return 0 _and_ should have called z3fold_free() on the handle. z3fold_free()
 * contains logic to delay freeing the page if the page is under reclaim,
 * as indicated by the setting of the PG_reclaim flag on the underlying page.
 *
 * If all buddies in the z3fold page are successfully evicted, then the
 * z3fold page can be freed.
 *
 * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
 * no pages to evict or an eviction handler is not registered, -EAGAIN if
 * the retry limit was hit.
 */
static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
{
	int i, ret = -1;
	struct z3fold_header *zhdr = NULL;
	struct page *page = NULL;
	struct list_head *pos;
	unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;

	spin_lock(&pool->lock);
	if (!pool->ops || !pool->ops->evict || retries == 0) {
		spin_unlock(&pool->lock);
		return -EINVAL;
	}
	for (i = 0; i < retries; i++) {
		if (list_empty(&pool->lru)) {
			spin_unlock(&pool->lock);
			return -EINVAL;
		}
		list_for_each_prev(pos, &pool->lru) {
			page = list_entry(pos, struct page, lru);

			/* this bit could have been set by free, in which case
			 * we pass over to the next page in the pool.
			 */
			if (test_and_set_bit(PAGE_CLAIMED, &page->private)) {
				page = NULL;
				continue;
			}

			if (unlikely(PageIsolated(page))) {
				clear_bit(PAGE_CLAIMED, &page->private);
				page = NULL;
				continue;
			}
			zhdr = page_address(page);
			if (test_bit(PAGE_HEADLESS, &page->private))
				break;

			if (!z3fold_page_trylock(zhdr)) {
				clear_bit(PAGE_CLAIMED, &page->private);
				zhdr = NULL;
				continue; /* can't evict at this point */
			}
			if (zhdr->foreign_handles) {
				clear_bit(PAGE_CLAIMED, &page->private);
				z3fold_page_unlock(zhdr);
				zhdr = NULL;
				continue; /* can't evict such page */
			}
			kref_get(&zhdr->refcount);
			list_del_init(&zhdr->buddy);
			zhdr->cpu = -1;
			break;
		}

		if (!zhdr)
			break;

		list_del_init(&page->lru);
		spin_unlock(&pool->lock);

		if (!test_bit(PAGE_HEADLESS, &page->private)) {
			/*
			 * We need to encode the handles before unlocking, and
			 * use our local slots structure because z3fold_free
			 * can zero out zhdr->slots and we can't do much
			 * about that
			 */
			first_handle = 0;
			last_handle = 0;
			middle_handle = 0;
			if (zhdr->first_chunks)
				first_handle = encode_handle(zhdr, FIRST);
			if (zhdr->middle_chunks)
				middle_handle = encode_handle(zhdr, MIDDLE);
			if (zhdr->last_chunks)
				last_handle = encode_handle(zhdr, LAST);
			/*
			 * it's safe to unlock here because we hold a
			 * reference to this page
			 */
			z3fold_page_unlock(zhdr);
		} else {
			first_handle = encode_handle(zhdr, HEADLESS);
			last_handle = middle_handle = 0;
		}
		/* Issue the eviction callback(s) */
		if (middle_handle) {
			ret = pool->ops->evict(pool, middle_handle);
			if (ret)
				goto next;
			free_handle(middle_handle);
		}
		if (first_handle) {
			ret = pool->ops->evict(pool, first_handle);
			if (ret)
				goto next;
			free_handle(first_handle);
		}
		if (last_handle) {
			ret = pool->ops->evict(pool, last_handle);
			if (ret)
				goto next;
			free_handle(last_handle);
		}
next:
		if (test_bit(PAGE_HEADLESS, &page->private)) {
			if (ret == 0) {
				free_z3fold_page(page, true);
				atomic64_dec(&pool->pages_nr);
				return 0;
			}
			spin_lock(&pool->lock);
			list_add(&page->lru, &pool->lru);
			spin_unlock(&pool->lock);
			clear_bit(PAGE_CLAIMED, &page->private);
		} else {
			z3fold_page_lock(zhdr);
			if (kref_put(&zhdr->refcount,
					release_z3fold_page_locked)) {
				atomic64_dec(&pool->pages_nr);
				return 0;
			}
			/*
			 * if we are here, the page is still not completely
			 * free. Take the global pool lock then to be able
			 * to add it back to the lru list
			 */
			spin_lock(&pool->lock);
			list_add(&page->lru, &pool->lru);
			spin_unlock(&pool->lock);
			z3fold_page_unlock(zhdr);
			clear_bit(PAGE_CLAIMED, &page->private);
		}

		/* We started off locked so we need to lock the pool back */
		spin_lock(&pool->lock);
	}
	spin_unlock(&pool->lock);
	return -EAGAIN;
}
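
/*
 * Illustrative only: a minimal eviction handler following the contract
 * documented above. The names my_evict() and my_writeback() are
 * hypothetical; the real user of this allocator hooks in through the
 * zpool ops instead (see z3fold_zpool_evict() below).
 *
 *	static int my_evict(struct z3fold_pool *pool, unsigned long handle)
 *	{
 *		void *data = z3fold_map(pool, handle);
 *		int err = my_writeback(data);	// store the object elsewhere
 *
 *		z3fold_unmap(pool, handle);
 *		if (err)
 *			return err;	// non-zero: page goes back on the LRU
 *		z3fold_free(pool, handle);	// required before returning 0
 *		return 0;
 *	}
 */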

/**
 * z3fold_map() - maps the allocation associated with the given handle
 * @pool:	pool in which the allocation resides
 * @handle:	handle associated with the allocation to be mapped
 *
 * Extracts the buddy number from handle and constructs the pointer to the
 * correct starting chunk within the page.
 *
 * Returns: a pointer to the mapped allocation
 */
static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
{
	struct z3fold_header *zhdr;
	struct page *page;
	void *addr;
	enum buddy buddy;

	zhdr = get_z3fold_header(handle);
	addr = zhdr;
	page = virt_to_page(zhdr);

	if (test_bit(PAGE_HEADLESS, &page->private))
		goto out;

	buddy = handle_to_buddy(handle);
	switch (buddy) {
	case FIRST:
		addr += ZHDR_SIZE_ALIGNED;
		break;
	case MIDDLE:
		addr += zhdr->start_middle << CHUNK_SHIFT;
		set_bit(MIDDLE_CHUNK_MAPPED, &page->private);
		break;
	case LAST:
		addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT);
		break;
	default:
		pr_err("unknown buddy id %d\n", buddy);
		WARN_ON(1);
		addr = NULL;
		break;
	}

	if (addr)
		zhdr->mapped_count++;
out:
	put_z3fold_header(zhdr);
	return addr;
}

/**
 * z3fold_unmap() - unmaps the allocation associated with the given handle
 * @pool:	pool in which the allocation resides
 * @handle:	handle associated with the allocation to be unmapped
 */
static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
{
	struct z3fold_header *zhdr;
	struct page *page;
	enum buddy buddy;

	zhdr = get_z3fold_header(handle);
	page = virt_to_page(zhdr);

	if (test_bit(PAGE_HEADLESS, &page->private))
		return;

	buddy = handle_to_buddy(handle);
	if (buddy == MIDDLE)
		clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
	zhdr->mapped_count--;
	put_z3fold_header(zhdr);
}

/**
 * z3fold_get_pool_size() - gets the z3fold pool size in pages
 * @pool:	pool whose size is being queried
 *
 * Returns: size in pages of the given pool.
 */
static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
{
	return atomic64_read(&pool->pages_nr);
}

static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
{
	struct z3fold_header *zhdr;
	struct z3fold_pool *pool;

	VM_BUG_ON_PAGE(!PageMovable(page), page);
	VM_BUG_ON_PAGE(PageIsolated(page), page);

	if (test_bit(PAGE_HEADLESS, &page->private) ||
	    test_bit(PAGE_CLAIMED, &page->private))
		return false;

	zhdr = page_address(page);
	z3fold_page_lock(zhdr);
	if (test_bit(NEEDS_COMPACTING, &page->private) ||
	    test_bit(PAGE_STALE, &page->private))
		goto out;

	if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0)
		goto out;

	pool = zhdr_to_pool(zhdr);
	spin_lock(&pool->lock);
	if (!list_empty(&zhdr->buddy))
		list_del_init(&zhdr->buddy);
	if (!list_empty(&page->lru))
		list_del_init(&page->lru);
	spin_unlock(&pool->lock);

	kref_get(&zhdr->refcount);
	z3fold_page_unlock(zhdr);
	return true;

out:
	z3fold_page_unlock(zhdr);
	return false;
}

static int z3fold_page_migrate(struct address_space *mapping,
			       struct page *newpage, struct page *page,
			       enum migrate_mode mode)
{
	struct z3fold_header *zhdr, *new_zhdr;
	struct z3fold_pool *pool;
	struct address_space *new_mapping;

	VM_BUG_ON_PAGE(!PageMovable(page), page);
	VM_BUG_ON_PAGE(!PageIsolated(page), page);
	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);

	zhdr = page_address(page);
	pool = zhdr_to_pool(zhdr);

	if (!z3fold_page_trylock(zhdr))
		return -EAGAIN;
	if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) {
		z3fold_page_unlock(zhdr);
		return -EBUSY;
	}
	if (work_pending(&zhdr->work)) {
		z3fold_page_unlock(zhdr);
		return -EAGAIN;
	}
	new_zhdr = page_address(newpage);
	memcpy(new_zhdr, zhdr, PAGE_SIZE);
	newpage->private = page->private;
	page->private = 0;
	z3fold_page_unlock(zhdr);
	spin_lock_init(&new_zhdr->page_lock);
	INIT_WORK(&new_zhdr->work, compact_page_work);
	/*
	 * z3fold_page_isolate() ensures that new_zhdr->buddy is empty,
	 * so we only have to reinitialize it.
	 */
	INIT_LIST_HEAD(&new_zhdr->buddy);
	new_mapping = page_mapping(page);
	__ClearPageMovable(page);
	ClearPagePrivate(page);

	get_page(newpage);
	z3fold_page_lock(new_zhdr);
	if (new_zhdr->first_chunks)
		encode_handle(new_zhdr, FIRST);
	if (new_zhdr->last_chunks)
		encode_handle(new_zhdr, LAST);
	if (new_zhdr->middle_chunks)
		encode_handle(new_zhdr, MIDDLE);
	set_bit(NEEDS_COMPACTING, &newpage->private);
	new_zhdr->cpu = smp_processor_id();
	spin_lock(&pool->lock);
	list_add(&newpage->lru, &pool->lru);
	spin_unlock(&pool->lock);
	__SetPageMovable(newpage, new_mapping);
	z3fold_page_unlock(new_zhdr);

	queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);

	page_mapcount_reset(page);
	put_page(page);
	return 0;
}

static void z3fold_page_putback(struct page *page)
{
	struct z3fold_header *zhdr;
	struct z3fold_pool *pool;

	zhdr = page_address(page);
	pool = zhdr_to_pool(zhdr);

	z3fold_page_lock(zhdr);
	if (!list_empty(&zhdr->buddy))
		list_del_init(&zhdr->buddy);
	INIT_LIST_HEAD(&page->lru);
	if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
		atomic64_dec(&pool->pages_nr);
		return;
	}
	spin_lock(&pool->lock);
	list_add(&page->lru, &pool->lru);
	spin_unlock(&pool->lock);
	z3fold_page_unlock(zhdr);
}

static const struct address_space_operations z3fold_aops = {
	.isolate_page = z3fold_page_isolate,
	.migratepage = z3fold_page_migrate,
	.putback_page = z3fold_page_putback,
};

/*****************
 * zpool
 ****************/

static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle)
{
	if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
		return pool->zpool_ops->evict(pool->zpool, handle);
	else
		return -ENOENT;
}

static const struct z3fold_ops z3fold_zpool_ops = {
	.evict =	z3fold_zpool_evict
};
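
/*
 * Sketch of how a zpool client would reach the callbacks below
 * (illustrative only; the zpool API lives in include/linux/zpool.h and
 * may differ between kernel versions, and my_zpool_ops is a hypothetical
 * client-supplied ops structure):
 *
 *	struct zpool *zp = zpool_create_pool("z3fold", "mypool",
 *					     GFP_KERNEL, &my_zpool_ops);
 *	unsigned long handle;
 *	void *dst;
 *
 *	zpool_malloc(zp, size, GFP_KERNEL, &handle);	// -> z3fold_alloc()
 *	dst = zpool_map_handle(zp, handle, ZPOOL_MM_WO);// -> z3fold_map()
 *	memcpy(dst, src, size);
 *	zpool_unmap_handle(zp, handle);			// -> z3fold_unmap()
 *	...
 *	zpool_free(zp, handle);				// -> z3fold_free()
 *	zpool_destroy_pool(zp);				// -> z3fold_destroy_pool()
 */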

static void *z3fold_zpool_create(const char *name, gfp_t gfp,
			       const struct zpool_ops *zpool_ops,
			       struct zpool *zpool)
{
	struct z3fold_pool *pool;

	pool = z3fold_create_pool(name, gfp,
				  zpool_ops ? &z3fold_zpool_ops : NULL);
	if (pool) {
		pool->zpool = zpool;
		pool->zpool_ops = zpool_ops;
	}
	return pool;
}

static void z3fold_zpool_destroy(void *pool)
{
	z3fold_destroy_pool(pool);
}

static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp,
			unsigned long *handle)
{
	return z3fold_alloc(pool, size, gfp, handle);
}
static void z3fold_zpool_free(void *pool, unsigned long handle)
{
	z3fold_free(pool, handle);
}

static int z3fold_zpool_shrink(void *pool, unsigned int pages,
			unsigned int *reclaimed)
{
	unsigned int total = 0;
	int ret = -EINVAL;

	while (total < pages) {
		ret = z3fold_reclaim_page(pool, 8);
		if (ret < 0)
			break;
		total++;
	}

	if (reclaimed)
		*reclaimed = total;

	return ret;
}

static void *z3fold_zpool_map(void *pool, unsigned long handle,
			enum zpool_mapmode mm)
{
	return z3fold_map(pool, handle);
}
static void z3fold_zpool_unmap(void *pool, unsigned long handle)
{
	z3fold_unmap(pool, handle);
}

static u64 z3fold_zpool_total_size(void *pool)
{
	return z3fold_get_pool_size(pool) * PAGE_SIZE;
}

static struct zpool_driver z3fold_zpool_driver = {
	.type =		"z3fold",
	.owner =	THIS_MODULE,
	.create =	z3fold_zpool_create,
	.destroy =	z3fold_zpool_destroy,
	.malloc =	z3fold_zpool_malloc,
	.free =		z3fold_zpool_free,
	.shrink =	z3fold_zpool_shrink,
	.map =		z3fold_zpool_map,
	.unmap =	z3fold_zpool_unmap,
	.total_size =	z3fold_zpool_total_size,
};

MODULE_ALIAS("zpool-z3fold");

static int __init init_z3fold(void)
{
	int ret;

	/* Make sure the z3fold header is not larger than the page size */
	BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE);
	ret = z3fold_mount();
	if (ret)
		return ret;

	zpool_register_driver(&z3fold_zpool_driver);

	return 0;
}

static void __exit exit_z3fold(void)
{
	z3fold_unmount();
	zpool_unregister_driver(&z3fold_zpool_driver);
}

module_init(init_z3fold);
module_exit(exit_z3fold);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Vitaly Wool <vitalywool@gmail.com>");
MODULE_DESCRIPTION("3-Fold Allocator for Compressed Pages");