/*
 * zsmalloc memory allocator
 *
 * Copyright (C) 2011  Nitin Gupta
 * Copyright (C) 2012, 2013 Minchan Kim
 *
 * This code is released using a dual license strategy: BSD/GPL
 * You can choose the license that better fits your requirements.
 *
 * Released under the terms of 3-clause BSD License
 * Released under the terms of GNU General Public License Version 2.0
 */

/*
 * Following is how we use various fields and flags of underlying
 * struct page(s) to form a zspage.
 *
 * Usage of struct page fields:
 *	page->private: points to the first component (0-order) page
 *	page->index (union with page->freelist): offset of the first object
 *		starting in this page. For the first page, this is
 *		always 0, so we use this field (aka freelist) to point
 *		to the first free object in zspage.
 *	page->lru: links together all component pages (except the first page)
 *		of a zspage
 *
 * For _first_ page only:
 *
 *	page->private: refers to the component page after the first page
 *		If the page is first_page for huge object, it stores handle.
 *		Look at size_class->huge.
 *	page->freelist: points to the first free object in zspage.
 *		Free objects are linked together using in-place
 *		metadata.
 *	page->objects: maximum number of objects we can store in this
 *		zspage (class->zspage_order * PAGE_SIZE / class->size)
 *	page->lru: links together first pages of various zspages.
 *		Basically forming list of zspages in a fullness group.
 *	page->mapping: class index and fullness group of the zspage
 *	page->inuse: the number of objects that are used in this zspage
 *
 * Usage of struct page flags:
 *	PG_private: identifies the first component page
 *	PG_private2: identifies the last component page
 *
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/vmalloc.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>

/*
 * This must be power of 2 and greater than or equal to sizeof(link_free).
 * These two conditions ensure that any 'struct link_free' itself doesn't
 * span more than 1 page which avoids complex case of mapping 2 pages simply
 * to restore link_free pointer values.
 */
#define ZS_ALIGN		8

/*
 * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
 * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
 */
#define ZS_MAX_ZSPAGE_ORDER 2
#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)

#define ZS_HANDLE_SIZE (sizeof(unsigned long))

/*
 * Object location (<PFN>, <obj_idx>) is encoded as
 * a single (unsigned long) handle value.
 *
 * Note that object index <obj_idx> is relative to system
 * page <PFN> it is stored in, so for each sub-page belonging
 * to a zspage, obj_idx starts with 0.
 *
 * This is made more complicated by various memory models and PAE.
 */
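
/*
 * Illustrative layout (not part of the original comment; assumes a
 * 64-bit build with 4K pages and MAX_PHYSMEM_BITS == BITS_PER_LONG):
 *
 *	_PFN_BITS      = 64 - 12 = 52
 *	OBJ_INDEX_BITS = 64 - 52 - 1 = 11
 *
 *	bit  63 ......... 12 | 11 ....... 1 | 0
 *	    +----------------+--------------+---+
 *	    |      PFN       |   obj_idx    | T |
 *	    +----------------+--------------+---+
 *
 * so location_to_obj() encodes, say, obj_idx 5 in the page with PFN
 * 0x1000 as ((0x1000 << 11) | 5) << 1. The exact widths depend on
 * MAX_PHYSMEM_BITS and PAGE_SHIFT of the configuration at hand.
 */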

#ifndef MAX_PHYSMEM_BITS
#ifdef CONFIG_HIGHMEM64G
#define MAX_PHYSMEM_BITS 36
#else /* !CONFIG_HIGHMEM64G */
/*
 * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
 * be PAGE_SHIFT - OBJ_TAG_BITS
 */
#define MAX_PHYSMEM_BITS BITS_PER_LONG
#endif
#endif
#define _PFN_BITS		(MAX_PHYSMEM_BITS - PAGE_SHIFT)

/*
 * Memory allocated for a handle keeps the object position by
 * encoding <page, obj_idx>, and the encoded value has room in its
 * least significant bit (ie, look at obj_to_location).
 * We use that bit to synchronize between object access by the
 * user and migration.
 */
#define HANDLE_PIN_BIT	0

/*
 * The head of an allocated object should have OBJ_ALLOCATED_TAG set
 * to identify whether the object is allocated or not.
 * It's okay to add the status bit in the least significant bit because
 * the header keeps the handle, which is a 4-byte-aligned address, so we
 * have room for at least two bits.
 */
#define OBJ_ALLOCATED_TAG 1
#define OBJ_TAG_BITS 1
#define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)

#define MAX(a, b) ((a) >= (b) ? (a) : (b))
/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
#define ZS_MIN_ALLOC_SIZE \
	MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
/* each chunk includes extra space to keep handle */
#define ZS_MAX_ALLOC_SIZE	PAGE_SIZE

/*
 * On systems with 4K page size, this gives 255 size classes! There is a
 * trade-off here:
 *  - Large number of size classes is potentially wasteful as free pages are
 *    spread across these classes
 *  - Small number of size classes causes large internal fragmentation
 *  - Probably it's better to use specific size classes (empirically
 *    determined). NOTE: all those class sizes must be set as multiple of
 *    ZS_ALIGN to make sure link_free itself never has to span 2 pages.
 *
 * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
 * (reason above)
 */
#define ZS_SIZE_CLASS_DELTA	(PAGE_SIZE >> 8)

/*
 * We do not maintain any list for completely empty or full pages
 */
enum fullness_group {
	ZS_ALMOST_FULL,
	ZS_ALMOST_EMPTY,
	_ZS_NR_FULLNESS_GROUPS,

	ZS_EMPTY,
	ZS_FULL
};

enum zs_stat_type {
	OBJ_ALLOCATED,
	OBJ_USED,
	CLASS_ALMOST_FULL,
	CLASS_ALMOST_EMPTY,
};

#ifdef CONFIG_ZSMALLOC_STAT
#define NR_ZS_STAT_TYPE	(CLASS_ALMOST_EMPTY + 1)
#else
#define NR_ZS_STAT_TYPE	(OBJ_USED + 1)
#endif

struct zs_size_stat {
	unsigned long objs[NR_ZS_STAT_TYPE];
};

#ifdef CONFIG_ZSMALLOC_STAT
static struct dentry *zs_stat_root;
#endif

/*
 * number of size_classes
 */
static int zs_size_classes;

/*
 * We assign a zspage to ZS_ALMOST_EMPTY fullness group when:
 *	n <= 3 * N / f, where
 * n = number of allocated objects
 * N = total number of objects zspage can store
 * f = fullness_threshold_frac
 *
 * Similarly, we assign zspage to:
 *	ZS_ALMOST_FULL	when n > 3 * N / f
 *	ZS_EMPTY	when n == 0
 *	ZS_FULL		when n == N
 *
 * (see: fix_fullness_group())
 */
static const int fullness_threshold_frac = 4;

struct size_class {
	spinlock_t lock;
	struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
	/*
	 * Size of objects stored in this class. Must be multiple
	 * of ZS_ALIGN.
	 */
	int size;
	unsigned int index;

	struct zs_size_stat stats;

	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
	int pages_per_zspage;
	/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
	bool huge;
};
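
/*
 * Worked example (illustrative; assumes a 64-bit build with 4K pages):
 * ZS_MIN_ALLOC_SIZE evaluates to 32 and ZS_SIZE_CLASS_DELTA to 16, so the
 * candidate class sizes are 32, 48, 64, ..., 4096 bytes, i.e. the
 * (4096 - 32) / 16 + 1 = 255 classes mentioned above. Many of them are
 * later merged by can_merge() in zs_create_pool().
 */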

/*
 * Placed within free objects to form a singly linked list.
 * For every zspage, first_page->freelist gives head of this list.
 *
 * This must be power of 2 and less than or equal to ZS_ALIGN
 */
struct link_free {
	union {
		/*
		 * Position of next free chunk (encodes <PFN, obj_idx>)
		 * It's valid for non-allocated object
		 */
		void *next;
		/*
		 * Handle of allocated object.
		 */
		unsigned long handle;
	};
};

struct zs_pool {
	const char *name;

	struct size_class **size_class;
	struct kmem_cache *handle_cachep;

	gfp_t flags;	/* allocation flags used when growing pool */
	atomic_long_t pages_allocated;

	struct zs_pool_stats stats;

	/* Compact classes */
	struct shrinker shrinker;
	/*
	 * To signify that register_shrinker() was successful
	 * and unregister_shrinker() will not Oops.
	 */
	bool shrinker_enabled;
#ifdef CONFIG_ZSMALLOC_STAT
	struct dentry *stat_dentry;
#endif
};

/*
 * A zspage's class index and fullness group
 * are encoded in its (first)page->mapping
 */
#define CLASS_IDX_BITS	28
#define FULLNESS_BITS	4
#define CLASS_IDX_MASK	((1 << CLASS_IDX_BITS) - 1)
#define FULLNESS_MASK	((1 << FULLNESS_BITS) - 1)

struct mapping_area {
#ifdef CONFIG_PGTABLE_MAPPING
	struct vm_struct *vm; /* vm area for mapping object that span pages */
#else
	char *vm_buf; /* copy buffer for objects that span pages */
#endif
	char *vm_addr; /* address of kmap_atomic()'ed pages */
	enum zs_mapmode vm_mm; /* mapping mode */
};

static int create_handle_cache(struct zs_pool *pool)
{
	pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
					0, 0, NULL);
	return pool->handle_cachep ? 0 : 1;
}

static void destroy_handle_cache(struct zs_pool *pool)
{
	kmem_cache_destroy(pool->handle_cachep);
}

static unsigned long alloc_handle(struct zs_pool *pool)
{
	return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
		pool->flags & ~__GFP_HIGHMEM);
}

static void free_handle(struct zs_pool *pool, unsigned long handle)
{
	kmem_cache_free(pool->handle_cachep, (void *)handle);
}

static void record_obj(unsigned long handle, unsigned long obj)
{
	/*
	 * lsb of @obj represents handle lock while other bits
	 * represent object value the handle is pointing to, so
	 * updating shouldn't do store tearing.
	 */
	WRITE_ONCE(*(unsigned long *)handle, obj);
}
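
/*
 * Illustrative note on the handle indirection (sketch, not from the
 * original source): a handle is the address of a slab object which in
 * turn holds the encoded <PFN, obj_idx> value, so there are two levels:
 *
 *	handle (address of a zs_handle slab slot)
 *	   -> *handle = encoded object location; bit 0 doubles as
 *	      HANDLE_PIN_BIT while the object is pinned
 *	object header inside the zspage (or first_page->private for
 *	huge classes)
 *	   -> handle | OBJ_ALLOCATED_TAG, see obj_malloc()/obj_to_head()
 *
 * record_obj() overwrites the slot with a single WRITE_ONCE() store, so
 * a concurrent reader sees either the old or the new location, never a
 * torn mix of both.
 */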

/* zpool driver */

#ifdef CONFIG_ZPOOL

static void *zs_zpool_create(const char *name, gfp_t gfp,
			     const struct zpool_ops *zpool_ops,
			     struct zpool *zpool)
{
	return zs_create_pool(name, gfp);
}

static void zs_zpool_destroy(void *pool)
{
	zs_destroy_pool(pool);
}

static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
			unsigned long *handle)
{
	*handle = zs_malloc(pool, size);
	return *handle ? 0 : -1;
}

static void zs_zpool_free(void *pool, unsigned long handle)
{
	zs_free(pool, handle);
}

static int zs_zpool_shrink(void *pool, unsigned int pages,
			unsigned int *reclaimed)
{
	return -EINVAL;
}

static void *zs_zpool_map(void *pool, unsigned long handle,
			enum zpool_mapmode mm)
{
	enum zs_mapmode zs_mm;

	switch (mm) {
	case ZPOOL_MM_RO:
		zs_mm = ZS_MM_RO;
		break;
	case ZPOOL_MM_WO:
		zs_mm = ZS_MM_WO;
		break;
	case ZPOOL_MM_RW: /* fallthru */
	default:
		zs_mm = ZS_MM_RW;
		break;
	}

	return zs_map_object(pool, handle, zs_mm);
}

static void zs_zpool_unmap(void *pool, unsigned long handle)
{
	zs_unmap_object(pool, handle);
}

static u64 zs_zpool_total_size(void *pool)
{
	return zs_get_total_pages(pool) << PAGE_SHIFT;
}

static struct zpool_driver zs_zpool_driver = {
	.type =		"zsmalloc",
	.owner =	THIS_MODULE,
	.create =	zs_zpool_create,
	.destroy =	zs_zpool_destroy,
	.malloc =	zs_zpool_malloc,
	.free =		zs_zpool_free,
	.shrink =	zs_zpool_shrink,
	.map =		zs_zpool_map,
	.unmap =	zs_zpool_unmap,
	.total_size =	zs_zpool_total_size,
};

MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */

static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
{
	return pages_per_zspage * PAGE_SIZE / size;
}

/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area);

static int is_first_page(struct page *page)
{
	return PagePrivate(page);
}

static int is_last_page(struct page *page)
{
	return PagePrivate2(page);
}

static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
				enum fullness_group *fullness)
{
	unsigned long m;
	BUG_ON(!is_first_page(page));

	m = (unsigned long)page->mapping;
	*fullness = m & FULLNESS_MASK;
	*class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
}

static void set_zspage_mapping(struct page *page, unsigned int class_idx,
				enum fullness_group fullness)
{
	unsigned long m;
	BUG_ON(!is_first_page(page));

	m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
			(fullness & FULLNESS_MASK);
	page->mapping = (struct address_space *)m;
}

/*
 * zsmalloc divides the pool into various size classes where each
 * class maintains a list of zspages where each zspage is divided
 * into equal sized chunks. Each allocation falls into one of these
 * classes depending on its size. This function returns index of the
 * size class which has chunk size big enough to hold the given size.
 */
static int get_size_class_index(int size)
{
	int idx = 0;

	if (likely(size > ZS_MIN_ALLOC_SIZE))
		idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
				ZS_SIZE_CLASS_DELTA);

	return min(zs_size_classes - 1, idx);
}
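
/*
 * Worked example (illustrative; 4K pages and 64-bit assumed, so
 * ZS_MIN_ALLOC_SIZE == 32 and ZS_SIZE_CLASS_DELTA == 16): a request of
 * 300 bytes gives idx = DIV_ROUND_UP(300 - 32, 16) = 17, i.e. the class
 * of size 32 + 17 * 16 = 304 bytes, the smallest class that can hold it.
 */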

static inline void zs_stat_inc(struct size_class *class,
				enum zs_stat_type type, unsigned long cnt)
{
	if (type < NR_ZS_STAT_TYPE)
		class->stats.objs[type] += cnt;
}

static inline void zs_stat_dec(struct size_class *class,
				enum zs_stat_type type, unsigned long cnt)
{
	if (type < NR_ZS_STAT_TYPE)
		class->stats.objs[type] -= cnt;
}

static inline unsigned long zs_stat_get(struct size_class *class,
				enum zs_stat_type type)
{
	if (type < NR_ZS_STAT_TYPE)
		return class->stats.objs[type];
	return 0;
}

#ifdef CONFIG_ZSMALLOC_STAT

static int __init zs_stat_init(void)
{
	if (!debugfs_initialized())
		return -ENODEV;

	zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
	if (!zs_stat_root)
		return -ENOMEM;

	return 0;
}

static void __exit zs_stat_exit(void)
{
	debugfs_remove_recursive(zs_stat_root);
}

static unsigned long zs_can_compact(struct size_class *class);

static int zs_stats_size_show(struct seq_file *s, void *v)
{
	int i;
	struct zs_pool *pool = s->private;
	struct size_class *class;
	int objs_per_zspage;
	unsigned long class_almost_full, class_almost_empty;
	unsigned long obj_allocated, obj_used, pages_used, freeable;
	unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
	unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
	unsigned long total_freeable = 0;

	seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s %8s\n",
			"class", "size", "almost_full", "almost_empty",
			"obj_allocated", "obj_used", "pages_used",
			"pages_per_zspage", "freeable");

	for (i = 0; i < zs_size_classes; i++) {
		class = pool->size_class[i];

		if (class->index != i)
			continue;

		spin_lock(&class->lock);
		class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
		class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
		obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
		obj_used = zs_stat_get(class, OBJ_USED);
		freeable = zs_can_compact(class);
		spin_unlock(&class->lock);

		objs_per_zspage = get_maxobj_per_zspage(class->size,
				class->pages_per_zspage);
		pages_used = obj_allocated / objs_per_zspage *
				class->pages_per_zspage;

		seq_printf(s, " %5u %5u %11lu %12lu %13lu"
				" %10lu %10lu %16d %8lu\n",
			i, class->size, class_almost_full, class_almost_empty,
			obj_allocated, obj_used, pages_used,
			class->pages_per_zspage, freeable);

		total_class_almost_full += class_almost_full;
		total_class_almost_empty += class_almost_empty;
		total_objs += obj_allocated;
		total_used_objs += obj_used;
		total_pages += pages_used;
		total_freeable += freeable;
	}

	seq_puts(s, "\n");
	seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu %16s %8lu\n",
			"Total", "", total_class_almost_full,
			total_class_almost_empty, total_objs,
			total_used_objs, total_pages, "", total_freeable);

	return 0;
}

static int zs_stats_size_open(struct inode *inode, struct file *file)
{
	return single_open(file, zs_stats_size_show, inode->i_private);
}

static const struct file_operations zs_stat_size_ops = {
	.open           = zs_stats_size_open,
	.read           = seq_read,
	.llseek         = seq_lseek,
	.release        = single_release,
};

static int zs_pool_stat_create(const char *name, struct zs_pool *pool)
{
	struct dentry *entry;

	if (!zs_stat_root)
		return -ENODEV;

	entry = debugfs_create_dir(name, zs_stat_root);
	if (!entry) {
		pr_warn("debugfs dir <%s> creation failed\n", name);
		return -ENOMEM;
	}
	pool->stat_dentry = entry;

	entry = debugfs_create_file("classes", S_IFREG | S_IRUGO,
			pool->stat_dentry, pool, &zs_stat_size_ops);
	if (!entry) {
		pr_warn("%s: debugfs file entry <%s> creation failed\n",
				name, "classes");
		return -ENOMEM;
	}

	return 0;
}

static void zs_pool_stat_destroy(struct zs_pool *pool)
{
	debugfs_remove_recursive(pool->stat_dentry);
}

#else /* CONFIG_ZSMALLOC_STAT */
static int __init zs_stat_init(void)
{
	return 0;
}

static void __exit zs_stat_exit(void)
{
}

static inline int zs_pool_stat_create(const char *name, struct zs_pool *pool)
{
	return 0;
}

static inline void zs_pool_stat_destroy(struct zs_pool *pool)
{
}
#endif


/*
 * For each size class, zspages are divided into different groups
 * depending on how "full" they are. This was done so that we could
 * easily find empty or nearly empty zspages when we try to shrink
 * the pool (not yet implemented). This function returns fullness
 * status of the given page.
 */
static enum fullness_group get_fullness_group(struct page *page)
{
	int inuse, max_objects;
	enum fullness_group fg;
	BUG_ON(!is_first_page(page));

	inuse = page->inuse;
	max_objects = page->objects;

	if (inuse == 0)
		fg = ZS_EMPTY;
	else if (inuse == max_objects)
		fg = ZS_FULL;
	else if (inuse <= 3 * max_objects / fullness_threshold_frac)
		fg = ZS_ALMOST_EMPTY;
	else
		fg = ZS_ALMOST_FULL;

	return fg;
}
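
/*
 * Example (illustrative numbers): with fullness_threshold_frac == 4 and
 * a zspage that can hold 8 objects, get_fullness_group() works out to
 *
 *	inuse == 0       -> ZS_EMPTY
 *	1 <= inuse <= 6  -> ZS_ALMOST_EMPTY	(inuse <= 3 * 8 / 4)
 *	inuse == 7       -> ZS_ALMOST_FULL
 *	inuse == 8       -> ZS_FULL
 */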

/*
 * Each size class maintains various freelists and zspages are assigned
 * to one of these freelists based on the number of live objects they
 * have. This function inserts the given zspage into the freelist
 * identified by <class, fullness_group>.
 */
static void insert_zspage(struct page *page, struct size_class *class,
				enum fullness_group fullness)
{
	struct page **head;

	BUG_ON(!is_first_page(page));

	if (fullness >= _ZS_NR_FULLNESS_GROUPS)
		return;

	zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
			CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);

	head = &class->fullness_list[fullness];
	if (!*head) {
		*head = page;
		return;
	}

	/*
	 * We want to see more ZS_FULL pages and fewer almost empty/full
	 * ones. Put pages with higher ->inuse first.
	 */
	list_add_tail(&page->lru, &(*head)->lru);
	if (page->inuse >= (*head)->inuse)
		*head = page;
}

/*
 * This function removes the given zspage from the freelist identified
 * by <class, fullness_group>.
 */
static void remove_zspage(struct page *page, struct size_class *class,
				enum fullness_group fullness)
{
	struct page **head;

	BUG_ON(!is_first_page(page));

	if (fullness >= _ZS_NR_FULLNESS_GROUPS)
		return;

	head = &class->fullness_list[fullness];
	BUG_ON(!*head);
	if (list_empty(&(*head)->lru))
		*head = NULL;
	else if (*head == page)
		*head = (struct page *)list_entry((*head)->lru.next,
					struct page, lru);

	list_del_init(&page->lru);
	zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?
			CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
}

/*
 * Each size class maintains zspages in different fullness groups depending
 * on the number of live objects they contain. When allocating or freeing
 * objects, the fullness status of the page can change, say, from ALMOST_FULL
 * to ALMOST_EMPTY when freeing an object. This function checks if such
 * a status change has occurred for the given page and accordingly moves the
 * page from the freelist of the old fullness group to that of the new
 * fullness group.
 */
static enum fullness_group fix_fullness_group(struct size_class *class,
						struct page *page)
{
	int class_idx;
	enum fullness_group currfg, newfg;

	BUG_ON(!is_first_page(page));

	get_zspage_mapping(page, &class_idx, &currfg);
	newfg = get_fullness_group(page);
	if (newfg == currfg)
		goto out;

	remove_zspage(page, class, currfg);
	insert_zspage(page, class, newfg);
	set_zspage_mapping(page, class_idx, newfg);

out:
	return newfg;
}

/*
 * We have to decide on how many pages to link together
 * to form a zspage for each size class. This is important
 * to reduce wastage due to unusable space left at end of
 * each zspage which is given as:
 *	wastage = Zp % class_size
 *	usage = Zp - wastage
 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
 *
 * For example, for size class of 3/8 * PAGE_SIZE, we should
 * link together 3 PAGE_SIZE sized pages to form a zspage
 * since then we can perfectly fit in 8 such objects.
 */
static int get_pages_per_zspage(int class_size)
{
	int i, max_usedpc = 0;
	/* zspage order which gives maximum used size per KB */
	int max_usedpc_order = 1;

	for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
		int zspage_size;
		int waste, usedpc;

		zspage_size = i * PAGE_SIZE;
		waste = zspage_size % class_size;
		usedpc = (zspage_size - waste) * 100 / zspage_size;

		if (usedpc > max_usedpc) {
			max_usedpc = usedpc;
			max_usedpc_order = i;
		}
	}

	return max_usedpc_order;
}
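
/*
 * Worked example (illustrative; 4K pages assumed): for a 208-byte class
 * the loop above evaluates
 *
 *	1 page :  4096 % 208 = 144 wasted, ~96% used
 *	2 pages:  8192 % 208 =  80 wasted, ~99% used  <- picked
 *	3 pages: 12288 % 208 =  16 wasted, ~99% used (not strictly better)
 *	4 pages: 16384 % 208 = 160 wasted, ~99% used
 *
 * so get_pages_per_zspage(208) returns 2 and each such zspage holds
 * 8192 / 208 = 39 objects.
 */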

/*
 * A single 'zspage' is composed of many system pages which are
 * linked together using fields in struct page. This function finds
 * the first/head page, given any component page of a zspage.
 */
static struct page *get_first_page(struct page *page)
{
	if (is_first_page(page))
		return page;
	else
		return (struct page *)page_private(page);
}

static struct page *get_next_page(struct page *page)
{
	struct page *next;

	if (is_last_page(page))
		next = NULL;
	else if (is_first_page(page))
		next = (struct page *)page_private(page);
	else
		next = list_entry(page->lru.next, struct page, lru);

	return next;
}

/*
 * Encode <page, obj_idx> as a single handle value.
 * We use the least bit of handle for tagging.
 */
static void *location_to_obj(struct page *page, unsigned long obj_idx)
{
	unsigned long obj;

	if (!page) {
		BUG_ON(obj_idx);
		return NULL;
	}

	obj = page_to_pfn(page) << OBJ_INDEX_BITS;
	obj |= ((obj_idx) & OBJ_INDEX_MASK);
	obj <<= OBJ_TAG_BITS;

	return (void *)obj;
}

/*
 * Decode <page, obj_idx> pair from the given object handle. We adjust the
 * decoded obj_idx back to its original value since it was adjusted in
 * location_to_obj().
 */
static void obj_to_location(unsigned long obj, struct page **page,
				unsigned long *obj_idx)
{
	obj >>= OBJ_TAG_BITS;
	*page = pfn_to_page(obj >> OBJ_INDEX_BITS);
	*obj_idx = (obj & OBJ_INDEX_MASK);
}

static unsigned long handle_to_obj(unsigned long handle)
{
	return *(unsigned long *)handle;
}

static unsigned long obj_to_head(struct size_class *class, struct page *page,
			void *obj)
{
	if (class->huge) {
		VM_BUG_ON(!is_first_page(page));
		return page_private(page);
	} else
		return *(unsigned long *)obj;
}

static unsigned long obj_idx_to_offset(struct page *page,
				unsigned long obj_idx, int class_size)
{
	unsigned long off = 0;

	if (!is_first_page(page))
		off = page->index;

	return off + obj_idx * class_size;
}
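
/*
 * Worked example (illustrative; 208-byte class, 4K pages): in the first
 * page, objects start at offsets 0, 208, ..., 3952. The object at 3952
 * spills 64 bytes into the next page, so that page's ->index is set to
 * 64 by init_zspage() and obj_idx_to_offset(page, idx, 208) evaluates
 * to 64 + idx * 208 there.
 */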

static inline int trypin_tag(unsigned long handle)
{
	unsigned long *ptr = (unsigned long *)handle;

	return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr);
}

static void pin_tag(unsigned long handle)
{
	while (!trypin_tag(handle));
}

static void unpin_tag(unsigned long handle)
{
	unsigned long *ptr = (unsigned long *)handle;

	clear_bit_unlock(HANDLE_PIN_BIT, ptr);
}

static void reset_page(struct page *page)
{
	clear_bit(PG_private, &page->flags);
	clear_bit(PG_private_2, &page->flags);
	set_page_private(page, 0);
	page->mapping = NULL;
	page->freelist = NULL;
	page_mapcount_reset(page);
}

static void free_zspage(struct page *first_page)
{
	struct page *nextp, *tmp, *head_extra;

	BUG_ON(!is_first_page(first_page));
	BUG_ON(first_page->inuse);

	head_extra = (struct page *)page_private(first_page);

	reset_page(first_page);
	__free_page(first_page);

	/* zspage with only 1 system page */
	if (!head_extra)
		return;

	list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
		list_del(&nextp->lru);
		reset_page(nextp);
		__free_page(nextp);
	}
	reset_page(head_extra);
	__free_page(head_extra);
}

/* Initialize a newly allocated zspage */
static void init_zspage(struct page *first_page, struct size_class *class)
{
	unsigned long off = 0;
	struct page *page = first_page;

	BUG_ON(!is_first_page(first_page));
	while (page) {
		struct page *next_page;
		struct link_free *link;
		unsigned int i = 1;
		void *vaddr;

		/*
		 * page->index stores offset of first object starting
		 * in the page. For the first page, this is always 0,
		 * so we use first_page->index (aka ->freelist) to store
		 * head of corresponding zspage's freelist.
		 */
		if (page != first_page)
			page->index = off;

		vaddr = kmap_atomic(page);
		link = (struct link_free *)vaddr + off / sizeof(*link);

		while ((off += class->size) < PAGE_SIZE) {
			link->next = location_to_obj(page, i++);
			link += class->size / sizeof(*link);
		}

		/*
		 * We now come to the last (full or partial) object on this
		 * page, which must point to the first object on the next
		 * page (if present)
		 */
		next_page = get_next_page(page);
		link->next = location_to_obj(next_page, 0);
		kunmap_atomic(vaddr);
		page = next_page;
		off %= PAGE_SIZE;
	}
}

/*
 * Allocate a zspage for the given size class
 */
static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
{
	int i, error;
	struct page *first_page = NULL, *uninitialized_var(prev_page);

	/*
	 * Allocate individual pages and link them together as:
	 * 1. first page->private = first sub-page
	 * 2. all sub-pages are linked together using page->lru
	 * 3. each sub-page is linked to the first page using page->private
	 *
	 * For each size class, First/Head pages are linked together using
	 * page->lru. Also, we set PG_private to identify the first page
	 * (i.e. no other sub-page has this flag set) and PG_private_2 to
	 * identify the last page.
	 */
	error = -ENOMEM;
	for (i = 0; i < class->pages_per_zspage; i++) {
		struct page *page;

		page = alloc_page(flags);
		if (!page)
			goto cleanup;

		INIT_LIST_HEAD(&page->lru);
		if (i == 0) {	/* first page */
			SetPagePrivate(page);
			set_page_private(page, 0);
			first_page = page;
			first_page->inuse = 0;
		}
		if (i == 1)
			set_page_private(first_page, (unsigned long)page);
		if (i >= 1)
			set_page_private(page, (unsigned long)first_page);
		if (i >= 2)
			list_add(&page->lru, &prev_page->lru);
		if (i == class->pages_per_zspage - 1)	/* last page */
			SetPagePrivate2(page);
		prev_page = page;
	}

	init_zspage(first_page, class);

	first_page->freelist = location_to_obj(first_page, 0);
	/* Maximum number of objects we can store in this zspage */
	first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;

	error = 0; /* Success */

cleanup:
	if (unlikely(error) && first_page) {
		free_zspage(first_page);
		first_page = NULL;
	}

	return first_page;
}

static struct page *find_get_zspage(struct size_class *class)
{
	int i;
	struct page *page;

	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
		page = class->fullness_list[i];
		if (page)
			break;
	}

	return page;
}

#ifdef CONFIG_PGTABLE_MAPPING
static inline int __zs_cpu_up(struct mapping_area *area)
{
	/*
	 * Make sure we don't leak memory if a cpu UP notification
	 * and zs_init() race and both call zs_cpu_up() on the same cpu
	 */
	if (area->vm)
		return 0;
	area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
	if (!area->vm)
		return -ENOMEM;
	return 0;
}

static inline void __zs_cpu_down(struct mapping_area *area)
{
	if (area->vm)
		free_vm_area(area->vm);
	area->vm = NULL;
}

static inline void *__zs_map_object(struct mapping_area *area,
				struct page *pages[2], int off, int size)
{
	BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages));
	area->vm_addr = area->vm->addr;
	return area->vm_addr + off;
}
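
/*
 * Sketch of why the two-page handling exists (illustrative offsets): an
 * object of a 208-byte class that starts at offset 3952 continues into
 * the next page, so zs_map_object() cannot hand out a single
 * kmap_atomic() address for it. With CONFIG_PGTABLE_MAPPING both
 * physical pages are mapped into the 2-page VM area above; in the #else
 * variant below, the two pieces are instead copied into the per-cpu
 * vm_buf and copied back on unmap if the mapping was writable.
 */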

static inline void __zs_unmap_object(struct mapping_area *area,
				struct page *pages[2], int off, int size)
{
	unsigned long addr = (unsigned long)area->vm_addr;

	unmap_kernel_range(addr, PAGE_SIZE * 2);
}

#else /* CONFIG_PGTABLE_MAPPING */

static inline int __zs_cpu_up(struct mapping_area *area)
{
	/*
	 * Make sure we don't leak memory if a cpu UP notification
	 * and zs_init() race and both call zs_cpu_up() on the same cpu
	 */
	if (area->vm_buf)
		return 0;
	area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL);
	if (!area->vm_buf)
		return -ENOMEM;
	return 0;
}

static inline void __zs_cpu_down(struct mapping_area *area)
{
	kfree(area->vm_buf);
	area->vm_buf = NULL;
}

static void *__zs_map_object(struct mapping_area *area,
			struct page *pages[2], int off, int size)
{
	int sizes[2];
	void *addr;
	char *buf = area->vm_buf;

	/* disable page faults to match kmap_atomic() return conditions */
	pagefault_disable();

	/* no read fastpath */
	if (area->vm_mm == ZS_MM_WO)
		goto out;

	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	/* copy object to per-cpu buffer */
	addr = kmap_atomic(pages[0]);
	memcpy(buf, addr + off, sizes[0]);
	kunmap_atomic(addr);
	addr = kmap_atomic(pages[1]);
	memcpy(buf + sizes[0], addr, sizes[1]);
	kunmap_atomic(addr);
out:
	return area->vm_buf;
}

static void __zs_unmap_object(struct mapping_area *area,
			struct page *pages[2], int off, int size)
{
	int sizes[2];
	void *addr;
	char *buf;

	/* no write fastpath */
	if (area->vm_mm == ZS_MM_RO)
		goto out;

	buf = area->vm_buf;
	buf = buf + ZS_HANDLE_SIZE;
	size -= ZS_HANDLE_SIZE;
	off += ZS_HANDLE_SIZE;

	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	/* copy per-cpu buffer to object */
	addr = kmap_atomic(pages[0]);
	memcpy(addr + off, buf, sizes[0]);
	kunmap_atomic(addr);
	addr = kmap_atomic(pages[1]);
	memcpy(addr, buf + sizes[0], sizes[1]);
	kunmap_atomic(addr);

out:
	/* enable page faults to match kunmap_atomic() return conditions */
	pagefault_enable();
}

#endif /* CONFIG_PGTABLE_MAPPING */

static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action,
				void *pcpu)
{
	int ret, cpu = (long)pcpu;
	struct mapping_area *area;

	switch (action) {
	case CPU_UP_PREPARE:
		area = &per_cpu(zs_map_area, cpu);
		ret = __zs_cpu_up(area);
		if (ret)
			return notifier_from_errno(ret);
		break;
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		area = &per_cpu(zs_map_area, cpu);
		__zs_cpu_down(area);
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block zs_cpu_nb = {
	.notifier_call = zs_cpu_notifier
};

static int zs_register_cpu_notifier(void)
{
	int cpu, uninitialized_var(ret);

	cpu_notifier_register_begin();

	__register_cpu_notifier(&zs_cpu_nb);
	for_each_online_cpu(cpu) {
		ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
		if (notifier_to_errno(ret))
			break;
	}

	cpu_notifier_register_done();
	return notifier_to_errno(ret);
}

static void zs_unregister_cpu_notifier(void)
{
	int cpu;

	cpu_notifier_register_begin();

	for_each_online_cpu(cpu)
		zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
	__unregister_cpu_notifier(&zs_cpu_nb);

	cpu_notifier_register_done();
}

static void init_zs_size_classes(void)
{
	int nr;

	nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1;
	if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA)
		nr += 1;

	zs_size_classes = nr;
}

static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
{
	if (prev->pages_per_zspage != pages_per_zspage)
		return false;

	if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage)
		!= get_maxobj_per_zspage(size, pages_per_zspage))
		return false;

	return true;
}

static bool zspage_full(struct page *page)
{
	BUG_ON(!is_first_page(page));

	return page->inuse == page->objects;
}

unsigned long zs_get_total_pages(struct zs_pool *pool)
{
	return atomic_long_read(&pool->pages_allocated);
}
EXPORT_SYMBOL_GPL(zs_get_total_pages);

/**
 * zs_map_object - get address of allocated object from handle.
 * @pool: pool from which the object was allocated
 * @handle: handle returned from zs_malloc
 *
 * Before using an object allocated from zs_malloc, it must be mapped using
 * this function. When done with the object, it must be unmapped using
 * zs_unmap_object.
 *
 * Only one object can be mapped per cpu at a time. There is no protection
 * against nested mappings.
 *
 * This function returns with preemption and page faults disabled.
 */
void *zs_map_object(struct zs_pool *pool, unsigned long handle,
			enum zs_mapmode mm)
{
	struct page *page;
	unsigned long obj, obj_idx, off;

	unsigned int class_idx;
	enum fullness_group fg;
	struct size_class *class;
	struct mapping_area *area;
	struct page *pages[2];
	void *ret;

	BUG_ON(!handle);

	/*
	 * Because we use per-cpu mapping areas shared among the
	 * pools/users, we can't allow mapping in interrupt context
	 * because it can corrupt another user's mappings.
	 */
	BUG_ON(in_interrupt());

	/* From now on, migration cannot move the object */
	pin_tag(handle);

	obj = handle_to_obj(handle);
	obj_to_location(obj, &page, &obj_idx);
	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
	class = pool->size_class[class_idx];
	off = obj_idx_to_offset(page, obj_idx, class->size);

	area = &get_cpu_var(zs_map_area);
	area->vm_mm = mm;
	if (off + class->size <= PAGE_SIZE) {
		/* this object is contained entirely within a page */
		area->vm_addr = kmap_atomic(page);
		ret = area->vm_addr + off;
		goto out;
	}

	/* this object spans two pages */
	pages[0] = page;
	pages[1] = get_next_page(page);
	BUG_ON(!pages[1]);

	ret = __zs_map_object(area, pages, off, class->size);
out:
	if (!class->huge)
		ret += ZS_HANDLE_SIZE;

	return ret;
}
EXPORT_SYMBOL_GPL(zs_map_object);

void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
{
	struct page *page;
	unsigned long obj, obj_idx, off;

	unsigned int class_idx;
	enum fullness_group fg;
	struct size_class *class;
	struct mapping_area *area;

	BUG_ON(!handle);

	obj = handle_to_obj(handle);
	obj_to_location(obj, &page, &obj_idx);
	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
	class = pool->size_class[class_idx];
	off = obj_idx_to_offset(page, obj_idx, class->size);

	area = this_cpu_ptr(&zs_map_area);
	if (off + class->size <= PAGE_SIZE)
		kunmap_atomic(area->vm_addr);
	else {
		struct page *pages[2];

		pages[0] = page;
		pages[1] = get_next_page(page);
		BUG_ON(!pages[1]);

		__zs_unmap_object(area, pages, off, class->size);
	}
	put_cpu_var(zs_map_area);
	unpin_tag(handle);
}
EXPORT_SYMBOL_GPL(zs_unmap_object);

static unsigned long obj_malloc(struct page *first_page,
		struct size_class *class, unsigned long handle)
{
	unsigned long obj;
	struct link_free *link;

	struct page *m_page;
	unsigned long m_objidx, m_offset;
	void *vaddr;

	handle |= OBJ_ALLOCATED_TAG;
	obj = (unsigned long)first_page->freelist;
	obj_to_location(obj, &m_page, &m_objidx);
	m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);

	vaddr = kmap_atomic(m_page);
	link = (struct link_free *)vaddr + m_offset / sizeof(*link);
	first_page->freelist = link->next;
	if (!class->huge)
		/* record handle in the header of allocated chunk */
		link->handle = handle;
	else
		/* record handle in first_page->private */
		set_page_private(first_page, handle);
	kunmap_atomic(vaddr);
	first_page->inuse++;
	zs_stat_inc(class, OBJ_USED, 1);

	return obj;
}


/**
 * zs_malloc - Allocate block of given size from pool.
 * @pool: pool to allocate from
 * @size: size of block to allocate
 *
 * On success, handle to the allocated object is returned,
 * otherwise 0.
 * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
 */
unsigned long zs_malloc(struct zs_pool *pool, size_t size)
{
	unsigned long handle, obj;
	struct size_class *class;
	struct page *first_page;

	if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
		return 0;

	handle = alloc_handle(pool);
	if (!handle)
		return 0;

	/* extra space in chunk to keep the handle */
	size += ZS_HANDLE_SIZE;
	class = pool->size_class[get_size_class_index(size)];

	spin_lock(&class->lock);
	first_page = find_get_zspage(class);

	if (!first_page) {
		spin_unlock(&class->lock);
		first_page = alloc_zspage(class, pool->flags);
		if (unlikely(!first_page)) {
			free_handle(pool, handle);
			return 0;
		}

		set_zspage_mapping(first_page, class->index, ZS_EMPTY);
		atomic_long_add(class->pages_per_zspage,
					&pool->pages_allocated);

		spin_lock(&class->lock);
		zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
				class->size, class->pages_per_zspage));
	}

	obj = obj_malloc(first_page, class, handle);
	/* Now move the zspage to another fullness group, if required */
	fix_fullness_group(class, first_page);
	record_obj(handle, obj);
	spin_unlock(&class->lock);

	return handle;
}
EXPORT_SYMBOL_GPL(zs_malloc);

static void obj_free(struct zs_pool *pool, struct size_class *class,
			unsigned long obj)
{
	struct link_free *link;
	struct page *first_page, *f_page;
	unsigned long f_objidx, f_offset;
	void *vaddr;

	BUG_ON(!obj);

	obj &= ~OBJ_ALLOCATED_TAG;
	obj_to_location(obj, &f_page, &f_objidx);
	first_page = get_first_page(f_page);

	f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);

	vaddr = kmap_atomic(f_page);

	/* Insert this object in containing zspage's freelist */
	link = (struct link_free *)(vaddr + f_offset);
	link->next = first_page->freelist;
	if (class->huge)
		set_page_private(first_page, 0);
	kunmap_atomic(vaddr);
	first_page->freelist = (void *)obj;
	first_page->inuse--;
	zs_stat_dec(class, OBJ_USED, 1);
}

void zs_free(struct zs_pool *pool, unsigned long handle)
{
	struct page *first_page, *f_page;
	unsigned long obj, f_objidx;
	int class_idx;
	struct size_class *class;
	enum fullness_group fullness;

	if (unlikely(!handle))
		return;

	pin_tag(handle);
	obj = handle_to_obj(handle);
	obj_to_location(obj, &f_page, &f_objidx);
	first_page = get_first_page(f_page);

	get_zspage_mapping(first_page, &class_idx, &fullness);
	class = pool->size_class[class_idx];

	spin_lock(&class->lock);
	obj_free(pool, class, obj);
	fullness = fix_fullness_group(class, first_page);
	if (fullness == ZS_EMPTY) {
		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
				class->size, class->pages_per_zspage));
		atomic_long_sub(class->pages_per_zspage,
				&pool->pages_allocated);
		free_zspage(first_page);
	}
	spin_unlock(&class->lock);
	unpin_tag(handle);

	free_handle(pool, handle);
}
EXPORT_SYMBOL_GPL(zs_free);
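
/*
 * Typical call sequence for the exported API (illustrative sketch only;
 * error handling is omitted and "len"/"src" are made-up names):
 *
 *	struct zs_pool *pool = zs_create_pool("my_pool", GFP_KERNEL);
 *	unsigned long handle = zs_malloc(pool, len);
 *	void *dst = zs_map_object(pool, handle, ZS_MM_WO);
 *
 *	memcpy(dst, src, len);
 *	zs_unmap_object(pool, handle);
 *	...
 *	zs_free(pool, handle);
 *	zs_destroy_pool(pool);
 *
 * The mapping must stay short-lived: zs_map_object() returns with
 * preemption (and possibly page faults) disabled and only one object
 * may be mapped per cpu at a time.
 */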

static void zs_object_copy(unsigned long dst, unsigned long src,
				struct size_class *class)
{
	struct page *s_page, *d_page;
	unsigned long s_objidx, d_objidx;
	unsigned long s_off, d_off;
	void *s_addr, *d_addr;
	int s_size, d_size, size;
	int written = 0;

	s_size = d_size = class->size;

	obj_to_location(src, &s_page, &s_objidx);
	obj_to_location(dst, &d_page, &d_objidx);

	s_off = obj_idx_to_offset(s_page, s_objidx, class->size);
	d_off = obj_idx_to_offset(d_page, d_objidx, class->size);

	if (s_off + class->size > PAGE_SIZE)
		s_size = PAGE_SIZE - s_off;

	if (d_off + class->size > PAGE_SIZE)
		d_size = PAGE_SIZE - d_off;

	s_addr = kmap_atomic(s_page);
	d_addr = kmap_atomic(d_page);

	while (1) {
		size = min(s_size, d_size);
		memcpy(d_addr + d_off, s_addr + s_off, size);
		written += size;

		if (written == class->size)
			break;

		s_off += size;
		s_size -= size;
		d_off += size;
		d_size -= size;

		if (s_off >= PAGE_SIZE) {
			kunmap_atomic(d_addr);
			kunmap_atomic(s_addr);
			s_page = get_next_page(s_page);
			BUG_ON(!s_page);
			s_addr = kmap_atomic(s_page);
			d_addr = kmap_atomic(d_page);
			s_size = class->size - written;
			s_off = 0;
		}

		if (d_off >= PAGE_SIZE) {
			kunmap_atomic(d_addr);
			d_page = get_next_page(d_page);
			BUG_ON(!d_page);
			d_addr = kmap_atomic(d_page);
			d_size = class->size - written;
			d_off = 0;
		}
	}

	kunmap_atomic(d_addr);
	kunmap_atomic(s_addr);
}

/*
 * Find an allocated object in the zspage, starting from the given index,
 * and return its handle.
 */
static unsigned long find_alloced_obj(struct page *page, int index,
					struct size_class *class)
{
	unsigned long head;
	int offset = 0;
	unsigned long handle = 0;
	void *addr = kmap_atomic(page);

	if (!is_first_page(page))
		offset = page->index;
	offset += class->size * index;

	while (offset < PAGE_SIZE) {
		head = obj_to_head(class, page, addr + offset);
		if (head & OBJ_ALLOCATED_TAG) {
			handle = head & ~OBJ_ALLOCATED_TAG;
			if (trypin_tag(handle))
				break;
			handle = 0;
		}

		offset += class->size;
		index++;
	}

	kunmap_atomic(addr);
	return handle;
}

struct zs_compact_control {
	/* Source page for migration which could be a subpage of zspage. */
	struct page *s_page;
	/* Destination page for migration which should be a first page
	 * of zspage. */
	struct page *d_page;
	/* Starting object index within @s_page which is used for scanning
	 * live objects in the subpage. */
	int index;
};

static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
				struct zs_compact_control *cc)
{
	unsigned long used_obj, free_obj;
	unsigned long handle;
	struct page *s_page = cc->s_page;
	struct page *d_page = cc->d_page;
	unsigned long index = cc->index;
	int ret = 0;

	while (1) {
		handle = find_alloced_obj(s_page, index, class);
		if (!handle) {
			s_page = get_next_page(s_page);
			if (!s_page)
				break;
			index = 0;
			continue;
		}

		/* Stop if there is no more space */
		if (zspage_full(d_page)) {
			unpin_tag(handle);
			ret = -ENOMEM;
			break;
		}

		used_obj = handle_to_obj(handle);
		free_obj = obj_malloc(d_page, class, handle);
		zs_object_copy(free_obj, used_obj, class);
		index++;
		/*
		 * record_obj updates handle's value to free_obj and it will
		 * invalidate the lock bit (ie, HANDLE_PIN_BIT) of handle,
		 * which breaks synchronization using pin_tag() (e.g. in
		 * zs_free()), so let's keep the lock bit set.
		 */
		free_obj |= BIT(HANDLE_PIN_BIT);
		record_obj(handle, free_obj);
		unpin_tag(handle);
		obj_free(pool, class, used_obj);
	}

	/* Remember last position in this iteration */
	cc->s_page = s_page;
	cc->index = index;

	return ret;
}

static struct page *isolate_target_page(struct size_class *class)
{
	int i;
	struct page *page;

	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
		page = class->fullness_list[i];
		if (page) {
			remove_zspage(page, class, i);
			break;
		}
	}

	return page;
}

/*
 * putback_zspage - add @first_page into right class's fullness list
 * @pool: target pool
 * @class: destination class
 * @first_page: target page
 *
 * Return @first_page's fullness_group
 */
static enum fullness_group putback_zspage(struct zs_pool *pool,
				struct size_class *class,
				struct page *first_page)
{
	enum fullness_group fullness;

	BUG_ON(!is_first_page(first_page));

	fullness = get_fullness_group(first_page);
	insert_zspage(first_page, class, fullness);
	set_zspage_mapping(first_page, class->index, fullness);

	if (fullness == ZS_EMPTY) {
		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
			class->size, class->pages_per_zspage));
		atomic_long_sub(class->pages_per_zspage,
				&pool->pages_allocated);

		free_zspage(first_page);
	}

	return fullness;
}

static struct page *isolate_source_page(struct size_class *class)
{
	int i;
	struct page *page = NULL;

	for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
		page = class->fullness_list[i];
		if (!page)
			continue;

		remove_zspage(page, class, i);
		break;
	}

	return page;
}

/*
 * Based on the number of unused allocated objects calculate
 * and return the number of pages that we can free.
 */
static unsigned long zs_can_compact(struct size_class *class)
{
	unsigned long obj_wasted;

	obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) -
		zs_stat_get(class, OBJ_USED);

	obj_wasted /= get_maxobj_per_zspage(class->size,
			class->pages_per_zspage);

	return obj_wasted * class->pages_per_zspage;
}
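
/*
 * Worked example (made-up numbers; 208-byte class from the earlier
 * examples, i.e. 39 objects and 2 pages per zspage): with
 * OBJ_ALLOCATED == 100 and OBJ_USED == 60, obj_wasted is 40, which is
 * 40 / 39 = 1 whole zspage worth of slack, so zs_can_compact() reports
 * 1 * 2 = 2 freeable pages.
 */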

static void __zs_compact(struct zs_pool *pool, struct size_class *class)
{
	struct zs_compact_control cc;
	struct page *src_page;
	struct page *dst_page = NULL;

	spin_lock(&class->lock);
	while ((src_page = isolate_source_page(class))) {

		BUG_ON(!is_first_page(src_page));

		if (!zs_can_compact(class))
			break;

		cc.index = 0;
		cc.s_page = src_page;

		while ((dst_page = isolate_target_page(class))) {
			cc.d_page = dst_page;
			/*
			 * If there is no more space in dst_page, resched
			 * and see if anyone had allocated another zspage.
			 */
			if (!migrate_zspage(pool, class, &cc))
				break;

			putback_zspage(pool, class, dst_page);
		}

		/* Stop if we couldn't find slot */
		if (dst_page == NULL)
			break;

		putback_zspage(pool, class, dst_page);
		if (putback_zspage(pool, class, src_page) == ZS_EMPTY)
			pool->stats.pages_compacted += class->pages_per_zspage;
		spin_unlock(&class->lock);
		cond_resched();
		spin_lock(&class->lock);
	}

	if (src_page)
		putback_zspage(pool, class, src_page);

	spin_unlock(&class->lock);
}

unsigned long zs_compact(struct zs_pool *pool)
{
	int i;
	struct size_class *class;

	for (i = zs_size_classes - 1; i >= 0; i--) {
		class = pool->size_class[i];
		if (!class)
			continue;
		if (class->index != i)
			continue;
		__zs_compact(pool, class);
	}

	return pool->stats.pages_compacted;
}
EXPORT_SYMBOL_GPL(zs_compact);

void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats)
{
	memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats));
}
EXPORT_SYMBOL_GPL(zs_pool_stats);

static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
		struct shrink_control *sc)
{
	unsigned long pages_freed;
	struct zs_pool *pool = container_of(shrinker, struct zs_pool,
			shrinker);

	pages_freed = pool->stats.pages_compacted;
	/*
	 * Compact classes and calculate compaction delta.
	 * Can run concurrently with a manually triggered
	 * (by user) compaction.
	 */
	pages_freed = zs_compact(pool) - pages_freed;

	return pages_freed ? pages_freed : SHRINK_STOP;
}

static unsigned long zs_shrinker_count(struct shrinker *shrinker,
		struct shrink_control *sc)
{
	int i;
	struct size_class *class;
	unsigned long pages_to_free = 0;
	struct zs_pool *pool = container_of(shrinker, struct zs_pool,
			shrinker);

	for (i = zs_size_classes - 1; i >= 0; i--) {
		class = pool->size_class[i];
		if (!class)
			continue;
		if (class->index != i)
			continue;

		pages_to_free += zs_can_compact(class);
	}

	return pages_to_free;
}

static void zs_unregister_shrinker(struct zs_pool *pool)
{
	if (pool->shrinker_enabled) {
		unregister_shrinker(&pool->shrinker);
		pool->shrinker_enabled = false;
	}
}

static int zs_register_shrinker(struct zs_pool *pool)
{
	pool->shrinker.scan_objects = zs_shrinker_scan;
	pool->shrinker.count_objects = zs_shrinker_count;
	pool->shrinker.batch = 0;
	pool->shrinker.seeks = DEFAULT_SEEKS;

	return register_shrinker(&pool->shrinker);
}

/**
 * zs_create_pool - Creates an allocation pool to work from.
 * @name: pool name
 * @flags: allocation flags used to allocate pool metadata
 *
 * This function must be called before anything when using
 * the zsmalloc allocator.
 *
 * On success, a pointer to the newly created pool is returned,
 * otherwise NULL.
 */
struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
{
	int i;
	struct zs_pool *pool;
	struct size_class *prev_class = NULL;

	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool)
		return NULL;

	pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
			GFP_KERNEL);
	if (!pool->size_class) {
		kfree(pool);
		return NULL;
	}

	pool->name = kstrdup(name, GFP_KERNEL);
	if (!pool->name)
		goto err;

	if (create_handle_cache(pool))
		goto err;

	/*
	 * Iterate in reverse, because the size of the size_class that we
	 * want to use for merging should be larger than or equal to the
	 * current size.
	 */
	for (i = zs_size_classes - 1; i >= 0; i--) {
		int size;
		int pages_per_zspage;
		struct size_class *class;

		size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
		if (size > ZS_MAX_ALLOC_SIZE)
			size = ZS_MAX_ALLOC_SIZE;
		pages_per_zspage = get_pages_per_zspage(size);

		/*
		 * size_class is used for normal zsmalloc operation such
		 * as alloc/free for that size. Although it is natural that we
		 * have one size_class for each size, there is a chance that we
		 * can get more memory utilization if we use one size_class for
		 * many different sizes whose size_classes have the same
		 * characteristics. So, we make size_class point to the
		 * previous size_class if possible.
		 */
		if (prev_class) {
			if (can_merge(prev_class, size, pages_per_zspage)) {
				pool->size_class[i] = prev_class;
				continue;
			}
		}

		class = kzalloc(sizeof(struct size_class), GFP_KERNEL);
		if (!class)
			goto err;

		class->size = size;
		class->index = i;
		class->pages_per_zspage = pages_per_zspage;
		if (pages_per_zspage == 1 &&
			get_maxobj_per_zspage(size, pages_per_zspage) == 1)
			class->huge = true;
		spin_lock_init(&class->lock);
		pool->size_class[i] = class;

		prev_class = class;
	}
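
	/*
	 * Illustrative note (rough numbers, assuming 4K pages): many of
	 * the 255 candidate classes end up merged by the loop above
	 * because they share pages_per_zspage and objects-per-zspage,
	 * and classes close to PAGE_SIZE (roughly 3280 bytes and up, by
	 * this calculation) get pages_per_zspage == 1 with a single
	 * object per zspage, i.e. ->huge set, so their handle is kept in
	 * first_page->private rather than in an in-chunk header.
	 */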

	pool->flags = flags;

	if (zs_pool_stat_create(name, pool))
		goto err;

	/*
	 * Not critical, we still can use the pool
	 * and user can trigger compaction manually.
	 */
	if (zs_register_shrinker(pool) == 0)
		pool->shrinker_enabled = true;
	return pool;

err:
	zs_destroy_pool(pool);
	return NULL;
}
EXPORT_SYMBOL_GPL(zs_create_pool);

void zs_destroy_pool(struct zs_pool *pool)
{
	int i;

	zs_unregister_shrinker(pool);
	zs_pool_stat_destroy(pool);

	for (i = 0; i < zs_size_classes; i++) {
		int fg;
		struct size_class *class = pool->size_class[i];

		if (!class)
			continue;

		if (class->index != i)
			continue;

		for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
			if (class->fullness_list[fg]) {
				pr_info("Freeing non-empty class with size %db, fullness group %d\n",
					class->size, fg);
			}
		}
		kfree(class);
	}

	destroy_handle_cache(pool);
	kfree(pool->size_class);
	kfree(pool->name);
	kfree(pool);
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);

static int __init zs_init(void)
{
	int ret = zs_register_cpu_notifier();

	if (ret)
		goto notifier_fail;

	init_zs_size_classes();

#ifdef CONFIG_ZPOOL
	zpool_register_driver(&zs_zpool_driver);
#endif

	ret = zs_stat_init();
	if (ret) {
		pr_err("zs stat initialization failed\n");
		goto stat_fail;
	}
	return 0;

stat_fail:
#ifdef CONFIG_ZPOOL
	zpool_unregister_driver(&zs_zpool_driver);
#endif
notifier_fail:
	zs_unregister_cpu_notifier();

	return ret;
}

static void __exit zs_exit(void)
{
#ifdef CONFIG_ZPOOL
	zpool_unregister_driver(&zs_zpool_driver);
#endif
	zs_unregister_cpu_notifier();

	zs_stat_exit();
}

module_init(zs_init);
module_exit(zs_exit);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");