/*
 * zsmalloc memory allocator
 *
 * Copyright (C) 2011 Nitin Gupta
 * Copyright (C) 2012, 2013 Minchan Kim
 *
 * This code is released using a dual license strategy: BSD/GPL
 * You can choose the license that better fits your requirements.
 *
 * Released under the terms of 3-clause BSD License
 * Released under the terms of GNU General Public License Version 2.0
 */

/*
 * Following is how we use various fields and flags of underlying
 * struct page(s) to form a zspage.
 *
 * Usage of struct page fields:
 *	page->private: points to the first component (0-order) page
 *	page->index (union with page->freelist): offset of the first object
 *		starting in this page. For the first page, this is
 *		always 0, so we use this field (aka freelist) to point
 *		to the first free object in zspage.
 *	page->lru: links together all component pages (except the first page)
 *		of a zspage
 *
 * For _first_ page only:
 *
 *	page->private: refers to the component page after the first page
 *		If the page is first_page for huge object, it stores handle.
 *		Look at size_class->huge.
 *	page->freelist: points to the first free object in zspage.
 *		Free objects are linked together using in-place
 *		metadata.
 *	page->objects: maximum number of objects we can store in this
 *		zspage (class->zspage_order * PAGE_SIZE / class->size)
 *	page->lru: links together first pages of various zspages.
 *		Basically forming list of zspages in a fullness group.
 *	page->mapping: class index and fullness group of the zspage
 *	page->inuse: the number of objects that are used in this zspage
 *
 * Usage of struct page flags:
 *	PG_private: identifies the first component page
 *	PG_private2: identifies the last component page
 *
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/vmalloc.h>
#include <linux/preempt.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>

/*
 * This must be power of 2 and greater than or equal to sizeof(link_free).
 * These two conditions ensure that any 'struct link_free' itself doesn't
 * span more than 1 page which avoids complex case of mapping 2 pages simply
 * to restore link_free pointer values.
 */
#define ZS_ALIGN		8

/*
 * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
 * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
 */
#define ZS_MAX_ZSPAGE_ORDER 2
#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)

#define ZS_HANDLE_SIZE (sizeof(unsigned long))

/*
 * Object location (<PFN>, <obj_idx>) is encoded as
 * a single (unsigned long) handle value.
 *
 * Note that object index <obj_idx> is relative to system
 * page <PFN> it is stored in, so for each sub-page belonging
 * to a zspage, obj_idx starts with 0.
 *
 * This is made more complicated by various memory models and PAE.
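 *
 * As a rough illustration (an assumption for the sake of example: a 64-bit
 * system with 4K pages where MAX_PHYSMEM_BITS falls back to BITS_PER_LONG):
 * _PFN_BITS = 64 - 12 = 52 and OBJ_INDEX_BITS = 64 - 52 - 1 = 11, so a
 * handle value is laid out as <PFN:52><obj_idx:11><tag:1>, matching
 * location_to_obj() and obj_to_location() below.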
 */

#ifndef MAX_PHYSMEM_BITS
#ifdef CONFIG_HIGHMEM64G
#define MAX_PHYSMEM_BITS 36
#else /* !CONFIG_HIGHMEM64G */
/*
 * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
 * be PAGE_SHIFT - OBJ_TAG_BITS
 */
#define MAX_PHYSMEM_BITS BITS_PER_LONG
#endif
#endif
#define _PFN_BITS		(MAX_PHYSMEM_BITS - PAGE_SHIFT)

/*
 * The memory allocated for a handle keeps the object position by
 * encoding <page, obj_idx>, and the encoded value has room in its
 * least significant bit (see obj_to_location).
 * We use that bit to synchronize between object access by the
 * user and migration.
 */
#define HANDLE_PIN_BIT	0

/*
 * The head of an allocated object carries OBJ_ALLOCATED_TAG to
 * identify whether the object is allocated or not.
 * It's okay to add the status bit in the least significant bit because
 * the header keeps a handle, which is a 4-byte-aligned address, so we
 * have room for at least two bits.
 */
#define OBJ_ALLOCATED_TAG 1
#define OBJ_TAG_BITS 1
#define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)

#define MAX(a, b) ((a) >= (b) ? (a) : (b))
/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
#define ZS_MIN_ALLOC_SIZE \
	MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
/* each chunk includes extra space to keep handle */
#define ZS_MAX_ALLOC_SIZE	PAGE_SIZE

/*
 * On systems with 4K page size, this gives 255 size classes! There is a
 * trade-off here:
 *  - Large number of size classes is potentially wasteful as free pages are
 *    spread across these classes
 *  - Small number of size classes causes large internal fragmentation
 *  - Probably it's better to use specific size classes (empirically
 *    determined). NOTE: all those class sizes must be set as multiple of
 *    ZS_ALIGN to make sure link_free itself never has to span 2 pages.
 *
 *  ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
 *  (reason above)
 */
#define ZS_SIZE_CLASS_DELTA	(PAGE_SIZE >> 8)

/*
 * We do not maintain any list for completely empty or full pages
 */
enum fullness_group {
	ZS_ALMOST_FULL,
	ZS_ALMOST_EMPTY,
	_ZS_NR_FULLNESS_GROUPS,

	ZS_EMPTY,
	ZS_FULL
};

enum zs_stat_type {
	OBJ_ALLOCATED,
	OBJ_USED,
	CLASS_ALMOST_FULL,
	CLASS_ALMOST_EMPTY,
};

#ifdef CONFIG_ZSMALLOC_STAT
#define NR_ZS_STAT_TYPE	(CLASS_ALMOST_EMPTY + 1)
#else
#define NR_ZS_STAT_TYPE	(OBJ_USED + 1)
#endif

struct zs_size_stat {
	unsigned long objs[NR_ZS_STAT_TYPE];
};

#ifdef CONFIG_ZSMALLOC_STAT
static struct dentry *zs_stat_root;
#endif

/*
 * number of size_classes
 */
static int zs_size_classes;

/*
 * We assign a page to ZS_ALMOST_EMPTY fullness group when:
 *	n <= N / f, where
 * n = number of allocated objects
 * N = total number of objects zspage can store
 * f = fullness_threshold_frac
 *
 * Similarly, we assign zspage to:
 *	ZS_ALMOST_FULL	when n > N / f
 *	ZS_EMPTY	when n == 0
 *	ZS_FULL		when n == N
 *
 * (see: fix_fullness_group())
 */
static const int fullness_threshold_frac = 4;

struct size_class {
	spinlock_t lock;
	struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
	/*
	 * Size of objects stored in this class. Must be multiple
	 * of ZS_ALIGN.
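	 * For example, on a typical 64-bit configuration with 4K pages,
	 * class sizes run from ZS_MIN_ALLOC_SIZE (32) up to
	 * ZS_MAX_ALLOC_SIZE (PAGE_SIZE) in steps of ZS_SIZE_CLASS_DELTA (16).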
	 */
	int size;
	unsigned int index;

	struct zs_size_stat stats;

	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
	int pages_per_zspage;
	/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
	bool huge;
};

/*
 * Placed within free objects to form a singly linked list.
 * For every zspage, first_page->freelist gives head of this list.
 *
 * This must be power of 2 and less than or equal to ZS_ALIGN
 */
struct link_free {
	union {
		/*
		 * Position of next free chunk (encodes <PFN, obj_idx>)
		 * It's valid for non-allocated object
		 */
		void *next;
		/*
		 * Handle of allocated object.
		 */
		unsigned long handle;
	};
};

struct zs_pool {
	const char *name;

	struct size_class **size_class;
	struct kmem_cache *handle_cachep;

	gfp_t flags;	/* allocation flags used when growing pool */
	atomic_long_t pages_allocated;

	struct zs_pool_stats stats;

	/* Compact classes */
	struct shrinker shrinker;
	/*
	 * To signify that register_shrinker() was successful
	 * and unregister_shrinker() will not Oops.
	 */
	bool shrinker_enabled;
#ifdef CONFIG_ZSMALLOC_STAT
	struct dentry *stat_dentry;
#endif
};

/*
 * A zspage's class index and fullness group
 * are encoded in its (first)page->mapping
 */
#define CLASS_IDX_BITS	28
#define FULLNESS_BITS	4
#define CLASS_IDX_MASK	((1 << CLASS_IDX_BITS) - 1)
#define FULLNESS_MASK	((1 << FULLNESS_BITS) - 1)

struct mapping_area {
#ifdef CONFIG_PGTABLE_MAPPING
	struct vm_struct *vm; /* vm area for mapping object that span pages */
#else
	char *vm_buf; /* copy buffer for objects that span pages */
#endif
	char *vm_addr; /* address of kmap_atomic()'ed pages */
	enum zs_mapmode vm_mm; /* mapping mode */
	bool huge;
};

static int create_handle_cache(struct zs_pool *pool)
{
	pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
					0, 0, NULL);
	return pool->handle_cachep ? 0 : 1;
}

static void destroy_handle_cache(struct zs_pool *pool)
{
	kmem_cache_destroy(pool->handle_cachep);
}

static unsigned long alloc_handle(struct zs_pool *pool)
{
	return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
		pool->flags & ~__GFP_HIGHMEM);
}

static void free_handle(struct zs_pool *pool, unsigned long handle)
{
	kmem_cache_free(pool->handle_cachep, (void *)handle);
}

static void record_obj(unsigned long handle, unsigned long obj)
{
	/*
	 * lsb of @obj represents handle lock while other bits
	 * represent object value the handle is pointing to, so
	 * updating shouldn't do store tearing.
	 */
	WRITE_ONCE(*(unsigned long *)handle, obj);
}

/* zpool driver */

#ifdef CONFIG_ZPOOL

static void *zs_zpool_create(const char *name, gfp_t gfp,
			     const struct zpool_ops *zpool_ops,
			     struct zpool *zpool)
{
	return zs_create_pool(name, gfp);
}

static void zs_zpool_destroy(void *pool)
{
	zs_destroy_pool(pool);
}

static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
			unsigned long *handle)
{
	*handle = zs_malloc(pool, size);
	return *handle ? 0 : -1;
}
static void zs_zpool_free(void *pool, unsigned long handle)
{
	zs_free(pool, handle);
}

static int zs_zpool_shrink(void *pool, unsigned int pages,
			unsigned int *reclaimed)
{
	return -EINVAL;
}

static void *zs_zpool_map(void *pool, unsigned long handle,
			enum zpool_mapmode mm)
{
	enum zs_mapmode zs_mm;

	switch (mm) {
	case ZPOOL_MM_RO:
		zs_mm = ZS_MM_RO;
		break;
	case ZPOOL_MM_WO:
		zs_mm = ZS_MM_WO;
		break;
	case ZPOOL_MM_RW: /* fallthru */
	default:
		zs_mm = ZS_MM_RW;
		break;
	}

	return zs_map_object(pool, handle, zs_mm);
}
static void zs_zpool_unmap(void *pool, unsigned long handle)
{
	zs_unmap_object(pool, handle);
}

static u64 zs_zpool_total_size(void *pool)
{
	return zs_get_total_pages(pool) << PAGE_SHIFT;
}

static struct zpool_driver zs_zpool_driver = {
	.type =		"zsmalloc",
	.owner =	THIS_MODULE,
	.create =	zs_zpool_create,
	.destroy =	zs_zpool_destroy,
	.malloc =	zs_zpool_malloc,
	.free =		zs_zpool_free,
	.shrink =	zs_zpool_shrink,
	.map =		zs_zpool_map,
	.unmap =	zs_zpool_unmap,
	.total_size =	zs_zpool_total_size,
};

MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */

static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
{
	return pages_per_zspage * PAGE_SIZE / size;
}

/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area);

static int is_first_page(struct page *page)
{
	return PagePrivate(page);
}

static int is_last_page(struct page *page)
{
	return PagePrivate2(page);
}

static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
				enum fullness_group *fullness)
{
	unsigned long m;
	BUG_ON(!is_first_page(page));

	m = (unsigned long)page->mapping;
	*fullness = m & FULLNESS_MASK;
	*class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
}

static void set_zspage_mapping(struct page *page, unsigned int class_idx,
				enum fullness_group fullness)
{
	unsigned long m;
	BUG_ON(!is_first_page(page));

	m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
			(fullness & FULLNESS_MASK);
	page->mapping = (struct address_space *)m;
}

/*
 * zsmalloc divides the pool into various size classes where each
 * class maintains a list of zspages where each zspage is divided
 * into equal sized chunks. Each allocation falls into one of these
 * classes depending on its size. This function returns index of the
 * size class which has chunk size big enough to hold the given size.
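 *
 * For example, with 4K pages on a typical 64-bit configuration
 * (ZS_MIN_ALLOC_SIZE == 32, ZS_SIZE_CLASS_DELTA == 16), a request of
 * 100 bytes maps to index DIV_ROUND_UP(100 - 32, 16) == 5, whose chunk
 * size of 32 + 5 * 16 == 112 bytes is the smallest that still fits it.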
 */
static int get_size_class_index(int size)
{
	int idx = 0;

	if (likely(size > ZS_MIN_ALLOC_SIZE))
		idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
				ZS_SIZE_CLASS_DELTA);

	return min(zs_size_classes - 1, idx);
}

static inline void zs_stat_inc(struct size_class *class,
				enum zs_stat_type type, unsigned long cnt)
{
	if (type < NR_ZS_STAT_TYPE)
		class->stats.objs[type] += cnt;
}

static inline void zs_stat_dec(struct size_class *class,
				enum zs_stat_type type, unsigned long cnt)
{
	if (type < NR_ZS_STAT_TYPE)
		class->stats.objs[type] -= cnt;
}

static inline unsigned long zs_stat_get(struct size_class *class,
				enum zs_stat_type type)
{
	if (type < NR_ZS_STAT_TYPE)
		return class->stats.objs[type];
	return 0;
}

#ifdef CONFIG_ZSMALLOC_STAT

static int __init zs_stat_init(void)
{
	if (!debugfs_initialized())
		return -ENODEV;

	zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
	if (!zs_stat_root)
		return -ENOMEM;

	return 0;
}

static void __exit zs_stat_exit(void)
{
	debugfs_remove_recursive(zs_stat_root);
}

static int zs_stats_size_show(struct seq_file *s, void *v)
{
	int i;
	struct zs_pool *pool = s->private;
	struct size_class *class;
	int objs_per_zspage;
	unsigned long class_almost_full, class_almost_empty;
	unsigned long obj_allocated, obj_used, pages_used;
	unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
	unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;

	seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n",
			"class", "size", "almost_full", "almost_empty",
			"obj_allocated", "obj_used", "pages_used",
			"pages_per_zspage");

	for (i = 0; i < zs_size_classes; i++) {
		class = pool->size_class[i];

		if (class->index != i)
			continue;

		spin_lock(&class->lock);
		class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
		class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
		obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
		obj_used = zs_stat_get(class, OBJ_USED);
		spin_unlock(&class->lock);

		objs_per_zspage = get_maxobj_per_zspage(class->size,
				class->pages_per_zspage);
		pages_used = obj_allocated / objs_per_zspage *
				class->pages_per_zspage;

		seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n",
			i, class->size, class_almost_full, class_almost_empty,
			obj_allocated, obj_used, pages_used,
			class->pages_per_zspage);

		total_class_almost_full += class_almost_full;
		total_class_almost_empty += class_almost_empty;
		total_objs += obj_allocated;
		total_used_objs += obj_used;
		total_pages += pages_used;
	}

	seq_puts(s, "\n");
	seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n",
			"Total", "", total_class_almost_full,
			total_class_almost_empty, total_objs,
			total_used_objs, total_pages);

	return 0;
}

static int zs_stats_size_open(struct inode *inode, struct file *file)
{
	return single_open(file, zs_stats_size_show, inode->i_private);
}

static const struct file_operations zs_stat_size_ops = {
	.open           = zs_stats_size_open,
	.read           = seq_read,
	.llseek         = seq_lseek,
	.release        = single_release,
};

static int zs_pool_stat_create(const char *name, struct zs_pool *pool)
{
	struct dentry *entry;

	if (!zs_stat_root)
		return -ENODEV;

	entry = debugfs_create_dir(name, zs_stat_root);
	if (!entry) {
		pr_warn("debugfs dir <%s> creation failed\n", name);
		return -ENOMEM;
	}
	pool->stat_dentry = entry;

	entry = debugfs_create_file("classes", S_IFREG | S_IRUGO,
			pool->stat_dentry, pool, &zs_stat_size_ops);
	if (!entry) {
		pr_warn("%s: debugfs file entry <%s> creation failed\n",
				name, "classes");
		return -ENOMEM;
	}

	return 0;
}

static void zs_pool_stat_destroy(struct zs_pool *pool)
{
	debugfs_remove_recursive(pool->stat_dentry);
}

#else /* CONFIG_ZSMALLOC_STAT */
static int __init zs_stat_init(void)
{
	return 0;
}

static void __exit zs_stat_exit(void)
{
}

static inline int zs_pool_stat_create(const char *name, struct zs_pool *pool)
{
	return 0;
}

static inline void zs_pool_stat_destroy(struct zs_pool *pool)
{
}
#endif


/*
 * For each size class, zspages are divided into different groups
 * depending on how "full" they are. This was done so that we could
 * easily find empty or nearly empty zspages when we try to shrink
 * the pool (not yet implemented). This function returns fullness
 * status of the given page.
 */
static enum fullness_group get_fullness_group(struct page *page)
{
	int inuse, max_objects;
	enum fullness_group fg;
	BUG_ON(!is_first_page(page));

	inuse = page->inuse;
	max_objects = page->objects;

	if (inuse == 0)
		fg = ZS_EMPTY;
	else if (inuse == max_objects)
		fg = ZS_FULL;
	else if (inuse <= 3 * max_objects / fullness_threshold_frac)
		fg = ZS_ALMOST_EMPTY;
	else
		fg = ZS_ALMOST_FULL;

	return fg;
}

/*
 * Each size class maintains various freelists and zspages are assigned
 * to one of these freelists based on the number of live objects they
 * have. This function inserts the given zspage into the freelist
 * identified by <class, fullness_group>.
 */
static void insert_zspage(struct page *page, struct size_class *class,
				enum fullness_group fullness)
{
	struct page **head;

	BUG_ON(!is_first_page(page));

	if (fullness >= _ZS_NR_FULLNESS_GROUPS)
		return;

	zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
			CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);

	head = &class->fullness_list[fullness];
	if (!*head) {
		*head = page;
		return;
	}

	/*
	 * We want to see more ZS_FULL pages and fewer almost
	 * empty/full ones. Put pages with higher ->inuse first.
	 */
	list_add_tail(&page->lru, &(*head)->lru);
	if (page->inuse >= (*head)->inuse)
		*head = page;
}

/*
 * This function removes the given zspage from the freelist identified
 * by <class, fullness_group>.
 */
static void remove_zspage(struct page *page, struct size_class *class,
				enum fullness_group fullness)
{
	struct page **head;

	BUG_ON(!is_first_page(page));

	if (fullness >= _ZS_NR_FULLNESS_GROUPS)
		return;

	head = &class->fullness_list[fullness];
	BUG_ON(!*head);
	if (list_empty(&(*head)->lru))
		*head = NULL;
	else if (*head == page)
		*head = (struct page *)list_entry((*head)->lru.next,
					struct page, lru);

	list_del_init(&page->lru);
	zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?
			CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
}

/*
 * Each size class maintains zspages in different fullness groups depending
 * on the number of live objects they contain. When allocating or freeing
 * objects, the fullness status of the page can change, say, from ALMOST_FULL
 * to ALMOST_EMPTY when freeing an object. This function checks if such
 * a status change has occurred for the given page and accordingly moves the
 * page from the freelist of the old fullness group to that of the new
 * fullness group.
 */
static enum fullness_group fix_fullness_group(struct size_class *class,
						struct page *page)
{
	int class_idx;
	enum fullness_group currfg, newfg;

	BUG_ON(!is_first_page(page));

	get_zspage_mapping(page, &class_idx, &currfg);
	newfg = get_fullness_group(page);
	if (newfg == currfg)
		goto out;

	remove_zspage(page, class, currfg);
	insert_zspage(page, class, newfg);
	set_zspage_mapping(page, class_idx, newfg);

out:
	return newfg;
}

/*
 * We have to decide on how many pages to link together
 * to form a zspage for each size class. This is important
 * to reduce wastage due to unusable space left at end of
 * each zspage which is given as:
 *	wastage = Zp % class_size
 *	usage = Zp - wastage
 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
 *
 * For example, for size class of 3/8 * PAGE_SIZE, we should
 * link together 3 PAGE_SIZE sized pages to form a zspage
 * since then we can perfectly fit in 8 such objects.
 */
static int get_pages_per_zspage(int class_size)
{
	int i, max_usedpc = 0;
	/* zspage order which gives maximum used size per KB */
	int max_usedpc_order = 1;

	for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
		int zspage_size;
		int waste, usedpc;

		zspage_size = i * PAGE_SIZE;
		waste = zspage_size % class_size;
		usedpc = (zspage_size - waste) * 100 / zspage_size;

		if (usedpc > max_usedpc) {
			max_usedpc = usedpc;
			max_usedpc_order = i;
		}
	}

	return max_usedpc_order;
}

/*
 * A single 'zspage' is composed of many system pages which are
 * linked together using fields in struct page. This function finds
 * the first/head page, given any component page of a zspage.
 */
static struct page *get_first_page(struct page *page)
{
	if (is_first_page(page))
		return page;
	else
		return (struct page *)page_private(page);
}

static struct page *get_next_page(struct page *page)
{
	struct page *next;

	if (is_last_page(page))
		next = NULL;
	else if (is_first_page(page))
		next = (struct page *)page_private(page);
	else
		next = list_entry(page->lru.next, struct page, lru);

	return next;
}

/*
 * Encode <page, obj_idx> as a single handle value.
 * We use the least bit of handle for tagging.
 */
static void *location_to_obj(struct page *page, unsigned long obj_idx)
{
	unsigned long obj;

	if (!page) {
		BUG_ON(obj_idx);
		return NULL;
	}

	obj = page_to_pfn(page) << OBJ_INDEX_BITS;
	obj |= ((obj_idx) & OBJ_INDEX_MASK);
	obj <<= OBJ_TAG_BITS;

	return (void *)obj;
}

/*
 * Decode <page, obj_idx> pair from the given object handle. We adjust the
 * decoded obj_idx back to its original value since it was adjusted in
 * location_to_obj().
 */
static void obj_to_location(unsigned long obj, struct page **page,
				unsigned long *obj_idx)
{
	obj >>= OBJ_TAG_BITS;
	*page = pfn_to_page(obj >> OBJ_INDEX_BITS);
	*obj_idx = (obj & OBJ_INDEX_MASK);
}

static unsigned long handle_to_obj(unsigned long handle)
{
	return *(unsigned long *)handle;
}

static unsigned long obj_to_head(struct size_class *class, struct page *page,
			void *obj)
{
	if (class->huge) {
		VM_BUG_ON(!is_first_page(page));
		return page_private(page);
	} else
		return *(unsigned long *)obj;
}

static unsigned long obj_idx_to_offset(struct page *page,
				unsigned long obj_idx, int class_size)
{
	unsigned long off = 0;

	if (!is_first_page(page))
		off = page->index;

	return off + obj_idx * class_size;
}

static inline int trypin_tag(unsigned long handle)
{
	unsigned long *ptr = (unsigned long *)handle;

	return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr);
}

static void pin_tag(unsigned long handle)
{
	while (!trypin_tag(handle));
}

static void unpin_tag(unsigned long handle)
{
	unsigned long *ptr = (unsigned long *)handle;

	clear_bit_unlock(HANDLE_PIN_BIT, ptr);
}

static void reset_page(struct page *page)
{
	clear_bit(PG_private, &page->flags);
	clear_bit(PG_private_2, &page->flags);
	set_page_private(page, 0);
	page->mapping = NULL;
	page->freelist = NULL;
	page_mapcount_reset(page);
}

static void free_zspage(struct page *first_page)
{
	struct page *nextp, *tmp, *head_extra;

	BUG_ON(!is_first_page(first_page));
	BUG_ON(first_page->inuse);

	head_extra = (struct page *)page_private(first_page);

	reset_page(first_page);
	__free_page(first_page);

	/* zspage with only 1 system page */
	if (!head_extra)
		return;

	list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
		list_del(&nextp->lru);
		reset_page(nextp);
		__free_page(nextp);
	}
	reset_page(head_extra);
	__free_page(head_extra);
}

/* Initialize a newly allocated zspage */
static void init_zspage(struct page *first_page, struct size_class *class)
{
	unsigned long off = 0;
	struct page *page = first_page;

	BUG_ON(!is_first_page(first_page));
	while (page) {
		struct page *next_page;
		struct link_free *link;
		unsigned int i = 1;
		void *vaddr;

		/*
		 * page->index stores offset of first object starting
		 * in the page. For the first page, this is always 0,
		 * so we use first_page->index (aka ->freelist) to store
		 * head of corresponding zspage's freelist.
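		 *
		 * As an illustration (assuming class->size == 1024 and 4K
		 * pages): free objects in a page sit at offsets 0, 1024,
		 * 2048 and 3072; each link_free points to the next one, and
		 * the last link points to object 0 of the next component
		 * page, if there is one.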
		 */
		if (page != first_page)
			page->index = off;

		vaddr = kmap_atomic(page);
		link = (struct link_free *)vaddr + off / sizeof(*link);

		while ((off += class->size) < PAGE_SIZE) {
			link->next = location_to_obj(page, i++);
			link += class->size / sizeof(*link);
		}

		/*
		 * We now come to the last (full or partial) object on this
		 * page, which must point to the first object on the next
		 * page (if present)
		 */
		next_page = get_next_page(page);
		link->next = location_to_obj(next_page, 0);
		kunmap_atomic(vaddr);
		page = next_page;
		off %= PAGE_SIZE;
	}
}

/*
 * Allocate a zspage for the given size class
 */
static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
{
	int i, error;
	struct page *first_page = NULL, *uninitialized_var(prev_page);

	/*
	 * Allocate individual pages and link them together as:
	 * 1. first page->private = first sub-page
	 * 2. all sub-pages are linked together using page->lru
	 * 3. each sub-page is linked to the first page using page->private
	 *
	 * For each size class, First/Head pages are linked together using
	 * page->lru. Also, we set PG_private to identify the first page
	 * (i.e. no other sub-page has this flag set) and PG_private_2 to
	 * identify the last page.
	 */
	error = -ENOMEM;
	for (i = 0; i < class->pages_per_zspage; i++) {
		struct page *page;

		page = alloc_page(flags);
		if (!page)
			goto cleanup;

		INIT_LIST_HEAD(&page->lru);
		if (i == 0) {	/* first page */
			SetPagePrivate(page);
			set_page_private(page, 0);
			first_page = page;
			first_page->inuse = 0;
		}
		if (i == 1)
			set_page_private(first_page, (unsigned long)page);
		if (i >= 1)
			set_page_private(page, (unsigned long)first_page);
		if (i >= 2)
			list_add(&page->lru, &prev_page->lru);
		if (i == class->pages_per_zspage - 1)	/* last page */
			SetPagePrivate2(page);
		prev_page = page;
	}

	init_zspage(first_page, class);

	first_page->freelist = location_to_obj(first_page, 0);
	/* Maximum number of objects we can store in this zspage */
	first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;

	error = 0; /* Success */

cleanup:
	if (unlikely(error) && first_page) {
		free_zspage(first_page);
		first_page = NULL;
	}

	return first_page;
}

static struct page *find_get_zspage(struct size_class *class)
{
	int i;
	struct page *page;

	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
		page = class->fullness_list[i];
		if (page)
			break;
	}

	return page;
}

#ifdef CONFIG_PGTABLE_MAPPING
static inline int __zs_cpu_up(struct mapping_area *area)
{
	/*
	 * Make sure we don't leak memory if a cpu UP notification
	 * and zs_init() race and both call zs_cpu_up() on the same cpu
	 */
	if (area->vm)
		return 0;
	area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
	if (!area->vm)
		return -ENOMEM;
	return 0;
}

static inline void __zs_cpu_down(struct mapping_area *area)
{
	if (area->vm)
		free_vm_area(area->vm);
	area->vm = NULL;
}

static inline void *__zs_map_object(struct mapping_area *area,
				struct page *pages[2], int off, int size)
{
	BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages));
	area->vm_addr = area->vm->addr;
	return area->vm_addr + off;
}

static inline void __zs_unmap_object(struct mapping_area *area,
				struct page *pages[2], int off, int size)
{
	unsigned long addr = (unsigned long)area->vm_addr;

	unmap_kernel_range(addr, PAGE_SIZE * 2);
}

#else /* CONFIG_PGTABLE_MAPPING */

static inline int __zs_cpu_up(struct mapping_area *area)
{
	/*
	 * Make sure we don't leak memory if a cpu UP notification
	 * and zs_init() race and both call zs_cpu_up() on the same cpu
	 */
	if (area->vm_buf)
		return 0;
	area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL);
	if (!area->vm_buf)
		return -ENOMEM;
	return 0;
}

static inline void __zs_cpu_down(struct mapping_area *area)
{
	kfree(area->vm_buf);
	area->vm_buf = NULL;
}

static void *__zs_map_object(struct mapping_area *area,
			struct page *pages[2], int off, int size)
{
	int sizes[2];
	void *addr;
	char *buf = area->vm_buf;

	/* disable page faults to match kmap_atomic() return conditions */
	pagefault_disable();

	/* no read fastpath */
	if (area->vm_mm == ZS_MM_WO)
		goto out;

	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	/* copy object to per-cpu buffer */
	addr = kmap_atomic(pages[0]);
	memcpy(buf, addr + off, sizes[0]);
	kunmap_atomic(addr);
	addr = kmap_atomic(pages[1]);
	memcpy(buf + sizes[0], addr, sizes[1]);
	kunmap_atomic(addr);
out:
	return area->vm_buf;
}

static void __zs_unmap_object(struct mapping_area *area,
			struct page *pages[2], int off, int size)
{
	int sizes[2];
	void *addr;
	char *buf;

	/* no write fastpath */
	if (area->vm_mm == ZS_MM_RO)
		goto out;

	buf = area->vm_buf;
	if (!area->huge) {
		buf = buf + ZS_HANDLE_SIZE;
		size -= ZS_HANDLE_SIZE;
		off += ZS_HANDLE_SIZE;
	}

	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	/* copy per-cpu buffer to object */
	addr = kmap_atomic(pages[0]);
	memcpy(addr + off, buf, sizes[0]);
	kunmap_atomic(addr);
	addr = kmap_atomic(pages[1]);
	memcpy(addr, buf + sizes[0], sizes[1]);
	kunmap_atomic(addr);

out:
	/* enable page faults to match kunmap_atomic() return conditions */
	pagefault_enable();
}

#endif /* CONFIG_PGTABLE_MAPPING */

static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action,
				void *pcpu)
{
	int ret, cpu = (long)pcpu;
	struct mapping_area *area;

	switch (action) {
	case CPU_UP_PREPARE:
		area = &per_cpu(zs_map_area, cpu);
		ret = __zs_cpu_up(area);
		if (ret)
			return notifier_from_errno(ret);
		break;
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		area = &per_cpu(zs_map_area, cpu);
		__zs_cpu_down(area);
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block zs_cpu_nb = {
	.notifier_call = zs_cpu_notifier
};

static int zs_register_cpu_notifier(void)
{
	int cpu, uninitialized_var(ret);

	cpu_notifier_register_begin();

	__register_cpu_notifier(&zs_cpu_nb);
	for_each_online_cpu(cpu) {
		ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
		if (notifier_to_errno(ret))
			break;
	}

	cpu_notifier_register_done();
	return notifier_to_errno(ret);
}

static void zs_unregister_cpu_notifier(void)
{
	int cpu;

	cpu_notifier_register_begin();

	for_each_online_cpu(cpu)
		zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
	__unregister_cpu_notifier(&zs_cpu_nb);

	cpu_notifier_register_done();
}

static void init_zs_size_classes(void)
{
	int nr;

	nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1;
	if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA)
		nr += 1;

	zs_size_classes = nr;
}

static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
{
	if (prev->pages_per_zspage != pages_per_zspage)
		return false;

	if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage)
		!= get_maxobj_per_zspage(size, pages_per_zspage))
		return false;

	return true;
}

static bool zspage_full(struct page *page)
{
	BUG_ON(!is_first_page(page));

	return page->inuse == page->objects;
}

unsigned long zs_get_total_pages(struct zs_pool *pool)
{
	return atomic_long_read(&pool->pages_allocated);
}
EXPORT_SYMBOL_GPL(zs_get_total_pages);

/**
 * zs_map_object - get address of allocated object from handle.
 * @pool: pool from which the object was allocated
 * @handle: handle returned from zs_malloc
 *
 * Before using an object allocated from zs_malloc, it must be mapped using
 * this function. When done with the object, it must be unmapped using
 * zs_unmap_object.
 *
 * Only one object can be mapped per cpu at a time. There is no protection
 * against nested mappings.
 *
 * This function returns with preemption and page faults disabled.
 */
void *zs_map_object(struct zs_pool *pool, unsigned long handle,
			enum zs_mapmode mm)
{
	struct page *page;
	unsigned long obj, obj_idx, off;

	unsigned int class_idx;
	enum fullness_group fg;
	struct size_class *class;
	struct mapping_area *area;
	struct page *pages[2];
	void *ret;

	BUG_ON(!handle);

	/*
	 * Because we use per-cpu mapping areas shared among the
	 * pools/users, we can't allow mapping in interrupt context
	 * because it can corrupt another user's mappings.
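	 * Callers are expected to pair each zs_map_object() with a
	 * zs_unmap_object() from the same context.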
	 */
	BUG_ON(in_interrupt());

	/* From now on, migration cannot move the object */
	pin_tag(handle);

	obj = handle_to_obj(handle);
	obj_to_location(obj, &page, &obj_idx);
	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
	class = pool->size_class[class_idx];
	off = obj_idx_to_offset(page, obj_idx, class->size);

	area = &get_cpu_var(zs_map_area);
	area->vm_mm = mm;
	if (off + class->size <= PAGE_SIZE) {
		/* this object is contained entirely within a page */
		area->vm_addr = kmap_atomic(page);
		ret = area->vm_addr + off;
		goto out;
	}

	/* this object spans two pages */
	pages[0] = page;
	pages[1] = get_next_page(page);
	BUG_ON(!pages[1]);

	ret = __zs_map_object(area, pages, off, class->size);
out:
	if (!class->huge)
		ret += ZS_HANDLE_SIZE;

	return ret;
}
EXPORT_SYMBOL_GPL(zs_map_object);

void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
{
	struct page *page;
	unsigned long obj, obj_idx, off;

	unsigned int class_idx;
	enum fullness_group fg;
	struct size_class *class;
	struct mapping_area *area;

	BUG_ON(!handle);

	obj = handle_to_obj(handle);
	obj_to_location(obj, &page, &obj_idx);
	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
	class = pool->size_class[class_idx];
	off = obj_idx_to_offset(page, obj_idx, class->size);

	area = this_cpu_ptr(&zs_map_area);
	if (off + class->size <= PAGE_SIZE)
		kunmap_atomic(area->vm_addr);
	else {
		struct page *pages[2];

		pages[0] = page;
		pages[1] = get_next_page(page);
		BUG_ON(!pages[1]);

		__zs_unmap_object(area, pages, off, class->size);
	}
	put_cpu_var(zs_map_area);
	unpin_tag(handle);
}
EXPORT_SYMBOL_GPL(zs_unmap_object);

static unsigned long obj_malloc(struct page *first_page,
		struct size_class *class, unsigned long handle)
{
	unsigned long obj;
	struct link_free *link;

	struct page *m_page;
	unsigned long m_objidx, m_offset;
	void *vaddr;

	handle |= OBJ_ALLOCATED_TAG;
	obj = (unsigned long)first_page->freelist;
	obj_to_location(obj, &m_page, &m_objidx);
	m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);

	vaddr = kmap_atomic(m_page);
	link = (struct link_free *)vaddr + m_offset / sizeof(*link);
	first_page->freelist = link->next;
	if (!class->huge)
		/* record handle in the header of allocated chunk */
		link->handle = handle;
	else
		/* record handle in first_page->private */
		set_page_private(first_page, handle);
	kunmap_atomic(vaddr);
	first_page->inuse++;
	zs_stat_inc(class, OBJ_USED, 1);

	return obj;
}


/**
 * zs_malloc - Allocate block of given size from pool.
 * @pool: pool to allocate from
 * @size: size of block to allocate
 *
 * On success, handle to the allocated object is returned,
 * otherwise 0.
 * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
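 *
 * A minimal usage sketch (error handling elided; buf and len are
 * placeholders):
 *
 *	handle = zs_malloc(pool, len);
 *	ptr = zs_map_object(pool, handle, ZS_MM_WO);
 *	memcpy(ptr, buf, len);
 *	zs_unmap_object(pool, handle);
 *	...
 *	zs_free(pool, handle);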
 */
unsigned long zs_malloc(struct zs_pool *pool, size_t size)
{
	unsigned long handle, obj;
	struct size_class *class;
	struct page *first_page;

	if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
		return 0;

	handle = alloc_handle(pool);
	if (!handle)
		return 0;

	/* extra space in chunk to keep the handle */
	size += ZS_HANDLE_SIZE;
	class = pool->size_class[get_size_class_index(size)];

	spin_lock(&class->lock);
	first_page = find_get_zspage(class);

	if (!first_page) {
		spin_unlock(&class->lock);
		first_page = alloc_zspage(class, pool->flags);
		if (unlikely(!first_page)) {
			free_handle(pool, handle);
			return 0;
		}

		set_zspage_mapping(first_page, class->index, ZS_EMPTY);
		atomic_long_add(class->pages_per_zspage,
					&pool->pages_allocated);

		spin_lock(&class->lock);
		zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
				class->size, class->pages_per_zspage));
	}

	obj = obj_malloc(first_page, class, handle);
	/* Now move the zspage to another fullness group, if required */
	fix_fullness_group(class, first_page);
	record_obj(handle, obj);
	spin_unlock(&class->lock);

	return handle;
}
EXPORT_SYMBOL_GPL(zs_malloc);

static void obj_free(struct zs_pool *pool, struct size_class *class,
			unsigned long obj)
{
	struct link_free *link;
	struct page *first_page, *f_page;
	unsigned long f_objidx, f_offset;
	void *vaddr;

	BUG_ON(!obj);

	obj &= ~OBJ_ALLOCATED_TAG;
	obj_to_location(obj, &f_page, &f_objidx);
	first_page = get_first_page(f_page);

	f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);

	vaddr = kmap_atomic(f_page);

	/* Insert this object in containing zspage's freelist */
	link = (struct link_free *)(vaddr + f_offset);
	link->next = first_page->freelist;
	if (class->huge)
		set_page_private(first_page, 0);
	kunmap_atomic(vaddr);
	first_page->freelist = (void *)obj;
	first_page->inuse--;
	zs_stat_dec(class, OBJ_USED, 1);
}

void zs_free(struct zs_pool *pool, unsigned long handle)
{
	struct page *first_page, *f_page;
	unsigned long obj, f_objidx;
	int class_idx;
	struct size_class *class;
	enum fullness_group fullness;

	if (unlikely(!handle))
		return;

	pin_tag(handle);
	obj = handle_to_obj(handle);
	obj_to_location(obj, &f_page, &f_objidx);
	first_page = get_first_page(f_page);

	get_zspage_mapping(first_page, &class_idx, &fullness);
	class = pool->size_class[class_idx];

	spin_lock(&class->lock);
	obj_free(pool, class, obj);
	fullness = fix_fullness_group(class, first_page);
	if (fullness == ZS_EMPTY) {
		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
				class->size, class->pages_per_zspage));
		atomic_long_sub(class->pages_per_zspage,
				&pool->pages_allocated);
		free_zspage(first_page);
	}
	spin_unlock(&class->lock);
	unpin_tag(handle);

	free_handle(pool, handle);
}
EXPORT_SYMBOL_GPL(zs_free);

static void zs_object_copy(unsigned long dst, unsigned long src,
				struct size_class *class)
{
	struct page *s_page, *d_page;
	unsigned long s_objidx, d_objidx;
	unsigned long s_off, d_off;
	void *s_addr, *d_addr;
	int s_size, d_size, size;
	int written = 0;

	s_size = d_size = class->size;
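
	/*
	 * Either object may straddle a page boundary within its zspage, so
	 * copy class->size bytes in chunks, remapping the source and/or
	 * destination page whenever an offset crosses PAGE_SIZE.
	 */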

	obj_to_location(src, &s_page, &s_objidx);
	obj_to_location(dst, &d_page, &d_objidx);

	s_off = obj_idx_to_offset(s_page, s_objidx, class->size);
	d_off = obj_idx_to_offset(d_page, d_objidx, class->size);

	if (s_off + class->size > PAGE_SIZE)
		s_size = PAGE_SIZE - s_off;

	if (d_off + class->size > PAGE_SIZE)
		d_size = PAGE_SIZE - d_off;

	s_addr = kmap_atomic(s_page);
	d_addr = kmap_atomic(d_page);

	while (1) {
		size = min(s_size, d_size);
		memcpy(d_addr + d_off, s_addr + s_off, size);
		written += size;

		if (written == class->size)
			break;

		s_off += size;
		s_size -= size;
		d_off += size;
		d_size -= size;

		if (s_off >= PAGE_SIZE) {
			kunmap_atomic(d_addr);
			kunmap_atomic(s_addr);
			s_page = get_next_page(s_page);
			BUG_ON(!s_page);
			s_addr = kmap_atomic(s_page);
			d_addr = kmap_atomic(d_page);
			s_size = class->size - written;
			s_off = 0;
		}

		if (d_off >= PAGE_SIZE) {
			kunmap_atomic(d_addr);
			d_page = get_next_page(d_page);
			BUG_ON(!d_page);
			d_addr = kmap_atomic(d_page);
			d_size = class->size - written;
			d_off = 0;
		}
	}

	kunmap_atomic(d_addr);
	kunmap_atomic(s_addr);
}

/*
 * Find alloced object in zspage from index object and
 * return handle.
 */
static unsigned long find_alloced_obj(struct page *page, int index,
					struct size_class *class)
{
	unsigned long head;
	int offset = 0;
	unsigned long handle = 0;
	void *addr = kmap_atomic(page);

	if (!is_first_page(page))
		offset = page->index;
	offset += class->size * index;

	while (offset < PAGE_SIZE) {
		head = obj_to_head(class, page, addr + offset);
		if (head & OBJ_ALLOCATED_TAG) {
			handle = head & ~OBJ_ALLOCATED_TAG;
			if (trypin_tag(handle))
				break;
			handle = 0;
		}

		offset += class->size;
		index++;
	}

	kunmap_atomic(addr);
	return handle;
}

struct zs_compact_control {
	/* Source page for migration which could be a subpage of zspage. */
	struct page *s_page;
	/* Destination page for migration which should be a first page
	 * of zspage. */
	struct page *d_page;
	/* Starting object index within @s_page which is used for live object
	 * in the subpage. */
	int index;
};

static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
				struct zs_compact_control *cc)
{
	unsigned long used_obj, free_obj;
	unsigned long handle;
	struct page *s_page = cc->s_page;
	struct page *d_page = cc->d_page;
	unsigned long index = cc->index;
	int ret = 0;

	while (1) {
		handle = find_alloced_obj(s_page, index, class);
		if (!handle) {
			s_page = get_next_page(s_page);
			if (!s_page)
				break;
			index = 0;
			continue;
		}

		/* Stop if there is no more space */
		if (zspage_full(d_page)) {
			unpin_tag(handle);
			ret = -ENOMEM;
			break;
		}

		used_obj = handle_to_obj(handle);
		free_obj = obj_malloc(d_page, class, handle);
		zs_object_copy(free_obj, used_obj, class);
		index++;
		/*
		 * record_obj updates handle's value to free_obj and it will
		 * invalidate lock bit (i.e. HANDLE_PIN_BIT) of handle, which
		 * breaks synchronization using pin_tag (e.g. zs_free), so
		 * let's keep the lock bit.
		 */
		free_obj |= BIT(HANDLE_PIN_BIT);
		record_obj(handle, free_obj);
		unpin_tag(handle);
		obj_free(pool, class, used_obj);
	}

	/* Remember last position in this iteration */
	cc->s_page = s_page;
	cc->index = index;

	return ret;
}

static struct page *isolate_target_page(struct size_class *class)
{
	int i;
	struct page *page;

	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
		page = class->fullness_list[i];
		if (page) {
			remove_zspage(page, class, i);
			break;
		}
	}

	return page;
}

/*
 * putback_zspage - add @first_page into right class's fullness list
 * @pool: target pool
 * @class: destination class
 * @first_page: target page
 *
 * Return @first_page's fullness_group
 */
static enum fullness_group putback_zspage(struct zs_pool *pool,
				struct size_class *class,
				struct page *first_page)
{
	enum fullness_group fullness;

	BUG_ON(!is_first_page(first_page));

	fullness = get_fullness_group(first_page);
	insert_zspage(first_page, class, fullness);
	set_zspage_mapping(first_page, class->index, fullness);

	if (fullness == ZS_EMPTY) {
		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
			class->size, class->pages_per_zspage));
		atomic_long_sub(class->pages_per_zspage,
				&pool->pages_allocated);

		free_zspage(first_page);
	}

	return fullness;
}

static struct page *isolate_source_page(struct size_class *class)
{
	int i;
	struct page *page = NULL;

	for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
		page = class->fullness_list[i];
		if (!page)
			continue;

		remove_zspage(page, class, i);
		break;
	}

	return page;
}

/*
 * Based on the number of unused allocated objects, calculate
 * and return the number of pages that we can free.
 */
static unsigned long zs_can_compact(struct size_class *class)
{
	unsigned long obj_wasted;

	obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) -
		zs_stat_get(class, OBJ_USED);

	obj_wasted /= get_maxobj_per_zspage(class->size,
			class->pages_per_zspage);

	return obj_wasted * class->pages_per_zspage;
}

static void __zs_compact(struct zs_pool *pool, struct size_class *class)
{
	struct zs_compact_control cc;
	struct page *src_page;
	struct page *dst_page = NULL;

	spin_lock(&class->lock);
	while ((src_page = isolate_source_page(class))) {

		BUG_ON(!is_first_page(src_page));

		if (!zs_can_compact(class))
			break;

		cc.index = 0;
		cc.s_page = src_page;

		while ((dst_page = isolate_target_page(class))) {
			cc.d_page = dst_page;
			/*
			 * If there is no more space in dst_page, resched
			 * and see if anyone had allocated another zspage.
			 */
			if (!migrate_zspage(pool, class, &cc))
				break;

			putback_zspage(pool, class, dst_page);
		}

		/* Stop if we couldn't find slot */
		if (dst_page == NULL)
			break;

		putback_zspage(pool, class, dst_page);
		if (putback_zspage(pool, class, src_page) == ZS_EMPTY)
			pool->stats.pages_compacted += class->pages_per_zspage;
		spin_unlock(&class->lock);
		cond_resched();
		spin_lock(&class->lock);
	}

	if (src_page)
		putback_zspage(pool, class, src_page);

	spin_unlock(&class->lock);
}

unsigned long zs_compact(struct zs_pool *pool)
{
	int i;
	struct size_class *class;

	for (i = zs_size_classes - 1; i >= 0; i--) {
		class = pool->size_class[i];
		if (!class)
			continue;
		if (class->index != i)
			continue;
		__zs_compact(pool, class);
	}

	return pool->stats.pages_compacted;
}
EXPORT_SYMBOL_GPL(zs_compact);

void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats)
{
	memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats));
}
EXPORT_SYMBOL_GPL(zs_pool_stats);

static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
		struct shrink_control *sc)
{
	unsigned long pages_freed;
	struct zs_pool *pool = container_of(shrinker, struct zs_pool,
			shrinker);

	pages_freed = pool->stats.pages_compacted;
	/*
	 * Compact classes and calculate compaction delta.
	 * Can run concurrently with a manually triggered
	 * (by user) compaction.
	 */
	pages_freed = zs_compact(pool) - pages_freed;

	return pages_freed ? pages_freed : SHRINK_STOP;
}

static unsigned long zs_shrinker_count(struct shrinker *shrinker,
		struct shrink_control *sc)
{
	int i;
	struct size_class *class;
	unsigned long pages_to_free = 0;
	struct zs_pool *pool = container_of(shrinker, struct zs_pool,
			shrinker);

	for (i = zs_size_classes - 1; i >= 0; i--) {
		class = pool->size_class[i];
		if (!class)
			continue;
		if (class->index != i)
			continue;

		pages_to_free += zs_can_compact(class);
	}

	return pages_to_free;
}

static void zs_unregister_shrinker(struct zs_pool *pool)
{
	if (pool->shrinker_enabled) {
		unregister_shrinker(&pool->shrinker);
		pool->shrinker_enabled = false;
	}
}

static int zs_register_shrinker(struct zs_pool *pool)
{
	pool->shrinker.scan_objects = zs_shrinker_scan;
	pool->shrinker.count_objects = zs_shrinker_count;
	pool->shrinker.batch = 0;
	pool->shrinker.seeks = DEFAULT_SEEKS;

	return register_shrinker(&pool->shrinker);
}

/**
 * zs_create_pool - Creates an allocation pool to work from.
 * @name: pool name
 * @flags: allocation flags used to allocate pool metadata
 *
 * This function must be called before anything when using
 * the zsmalloc allocator.
 *
 * On success, a pointer to the newly created pool is returned,
 * otherwise NULL.
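 *
 * For example (a sketch; the name and GFP flags here are only illustrative):
 *
 *	pool = zs_create_pool("my_pool", GFP_KERNEL | __GFP_HIGHMEM);
 *	if (!pool)
 *		return -ENOMEM;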
 */
struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
{
	int i;
	struct zs_pool *pool;
	struct size_class *prev_class = NULL;

	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool)
		return NULL;

	pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
			GFP_KERNEL);
	if (!pool->size_class) {
		kfree(pool);
		return NULL;
	}

	pool->name = kstrdup(name, GFP_KERNEL);
	if (!pool->name)
		goto err;

	if (create_handle_cache(pool))
		goto err;

	/*
	 * Iterate in reverse, because the size of a size_class that we want
	 * to use for merging should be larger than or equal to the current
	 * size.
	 */
	for (i = zs_size_classes - 1; i >= 0; i--) {
		int size;
		int pages_per_zspage;
		struct size_class *class;

		size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
		if (size > ZS_MAX_ALLOC_SIZE)
			size = ZS_MAX_ALLOC_SIZE;
		pages_per_zspage = get_pages_per_zspage(size);

		/*
		 * A size_class is used for normal zsmalloc operation such
		 * as alloc/free for that size. Although it is natural that we
		 * have one size_class for each size, there is a chance that we
		 * can get more memory utilization if we use one size_class for
		 * many different sizes whose size_classes have the same
		 * characteristics. So, we make size_class point to the
		 * previous size_class if possible.
		 */
		if (prev_class) {
			if (can_merge(prev_class, size, pages_per_zspage)) {
				pool->size_class[i] = prev_class;
				continue;
			}
		}

		class = kzalloc(sizeof(struct size_class), GFP_KERNEL);
		if (!class)
			goto err;

		class->size = size;
		class->index = i;
		class->pages_per_zspage = pages_per_zspage;
		if (pages_per_zspage == 1 &&
			get_maxobj_per_zspage(size, pages_per_zspage) == 1)
			class->huge = true;
		spin_lock_init(&class->lock);
		pool->size_class[i] = class;

		prev_class = class;
	}

	pool->flags = flags;

	if (zs_pool_stat_create(name, pool))
		goto err;

	/*
	 * Not critical, we still can use the pool
	 * and user can trigger compaction manually.
	 */
	if (zs_register_shrinker(pool) == 0)
		pool->shrinker_enabled = true;
	return pool;

err:
	zs_destroy_pool(pool);
	return NULL;
}
EXPORT_SYMBOL_GPL(zs_create_pool);

void zs_destroy_pool(struct zs_pool *pool)
{
	int i;

	zs_unregister_shrinker(pool);
	zs_pool_stat_destroy(pool);

	for (i = 0; i < zs_size_classes; i++) {
		int fg;
		struct size_class *class = pool->size_class[i];

		if (!class)
			continue;

		if (class->index != i)
			continue;

		for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
			if (class->fullness_list[fg]) {
				pr_info("Freeing non-empty class with size %db, fullness group %d\n",
					class->size, fg);
			}
		}
		kfree(class);
	}

	destroy_handle_cache(pool);
	kfree(pool->size_class);
	kfree(pool->name);
	kfree(pool);
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);

static int __init zs_init(void)
{
	int ret = zs_register_cpu_notifier();

	if (ret)
		goto notifier_fail;

	init_zs_size_classes();

#ifdef CONFIG_ZPOOL
	zpool_register_driver(&zs_zpool_driver);
#endif

	ret = zs_stat_init();
	if (ret) {
		pr_err("zs stat initialization failed\n");
		goto stat_fail;
	}
	return 0;

stat_fail:
#ifdef CONFIG_ZPOOL
	zpool_unregister_driver(&zs_zpool_driver);
#endif
notifier_fail:
	zs_unregister_cpu_notifier();

	return ret;
}

static void __exit zs_exit(void)
{
#ifdef CONFIG_ZPOOL
	zpool_unregister_driver(&zs_zpool_driver);
#endif
	zs_unregister_cpu_notifier();

	zs_stat_exit();
}

module_init(zs_init);
module_exit(zs_exit);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");