1 /* 2 * zsmalloc memory allocator 3 * 4 * Copyright (C) 2011 Nitin Gupta 5 * Copyright (C) 2012, 2013 Minchan Kim 6 * 7 * This code is released using a dual license strategy: BSD/GPL 8 * You can choose the license that better fits your requirements. 9 * 10 * Released under the terms of 3-clause BSD License 11 * Released under the terms of GNU General Public License Version 2.0 12 */ 13 14 /* 15 * Following is how we use various fields and flags of underlying 16 * struct page(s) to form a zspage. 17 * 18 * Usage of struct page fields: 19 * page->private: points to the first component (0-order) page 20 * page->index (union with page->freelist): offset of the first object 21 * starting in this page. For the first page, this is 22 * always 0, so we use this field (aka freelist) to point 23 * to the first free object in zspage. 24 * page->lru: links together all component pages (except the first page) 25 * of a zspage 26 * 27 * For _first_ page only: 28 * 29 * page->private: refers to the component page after the first page 30 * If the page is first_page for huge object, it stores handle. 31 * Look at size_class->huge. 32 * page->freelist: points to the first free object in zspage. 33 * Free objects are linked together using in-place 34 * metadata. 35 * page->objects: maximum number of objects we can store in this 36 * zspage (class->zspage_order * PAGE_SIZE / class->size) 37 * page->lru: links together first pages of various zspages. 38 * Basically forming list of zspages in a fullness group. 
39 * page->mapping: class index and fullness group of the zspage 40 * page->inuse: the number of objects that are used in this zspage 41 * 42 * Usage of struct page flags: 43 * PG_private: identifies the first component page 44 * PG_private2: identifies the last component page 45 * 46 */ 47 48 #include <linux/module.h> 49 #include <linux/kernel.h> 50 #include <linux/sched.h> 51 #include <linux/bitops.h> 52 #include <linux/errno.h> 53 #include <linux/highmem.h> 54 #include <linux/string.h> 55 #include <linux/slab.h> 56 #include <asm/tlbflush.h> 57 #include <asm/pgtable.h> 58 #include <linux/cpumask.h> 59 #include <linux/cpu.h> 60 #include <linux/vmalloc.h> 61 #include <linux/preempt.h> 62 #include <linux/spinlock.h> 63 #include <linux/types.h> 64 #include <linux/debugfs.h> 65 #include <linux/zsmalloc.h> 66 #include <linux/zpool.h> 67 68 /* 69 * This must be power of 2 and greater than of equal to sizeof(link_free). 70 * These two conditions ensure that any 'struct link_free' itself doesn't 71 * span more than 1 page which avoids complex case of mapping 2 pages simply 72 * to restore link_free pointer values. 73 */ 74 #define ZS_ALIGN 8 75 76 /* 77 * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single) 78 * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N. 79 */ 80 #define ZS_MAX_ZSPAGE_ORDER 2 81 #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) 82 83 #define ZS_HANDLE_SIZE (sizeof(unsigned long)) 84 85 /* 86 * Object location (<PFN>, <obj_idx>) is encoded as 87 * as single (unsigned long) handle value. 88 * 89 * Note that object index <obj_idx> is relative to system 90 * page <PFN> it is stored in, so for each sub-page belonging 91 * to a zspage, obj_idx starts with 0. 92 * 93 * This is made more complicated by various memory models and PAE. 
94 */ 95 96 #ifndef MAX_PHYSMEM_BITS 97 #ifdef CONFIG_HIGHMEM64G 98 #define MAX_PHYSMEM_BITS 36 99 #else /* !CONFIG_HIGHMEM64G */ 100 /* 101 * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just 102 * be PAGE_SHIFT 103 */ 104 #define MAX_PHYSMEM_BITS BITS_PER_LONG 105 #endif 106 #endif 107 #define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) 108 109 /* 110 * Memory for allocating for handle keeps object position by 111 * encoding <page, obj_idx> and the encoded value has a room 112 * in least bit(ie, look at obj_to_location). 113 * We use the bit to synchronize between object access by 114 * user and migration. 115 */ 116 #define HANDLE_PIN_BIT 0 117 118 /* 119 * Head in allocated object should have OBJ_ALLOCATED_TAG 120 * to identify the object was allocated or not. 121 * It's okay to add the status bit in the least bit because 122 * header keeps handle which is 4byte-aligned address so we 123 * have room for two bit at least. 124 */ 125 #define OBJ_ALLOCATED_TAG 1 126 #define OBJ_TAG_BITS 1 127 #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) 128 #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) 129 130 #define MAX(a, b) ((a) >= (b) ? (a) : (b)) 131 /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ 132 #define ZS_MIN_ALLOC_SIZE \ 133 MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) 134 /* each chunk includes extra space to keep handle */ 135 #define ZS_MAX_ALLOC_SIZE PAGE_SIZE 136 137 /* 138 * On systems with 4K page size, this gives 255 size classes! There is a 139 * trader-off here: 140 * - Large number of size classes is potentially wasteful as free page are 141 * spread across these classes 142 * - Small number of size classes causes large internal fragmentation 143 * - Probably its better to use specific size classes (empirically 144 * determined). NOTE: all those class sizes must be set as multiple of 145 * ZS_ALIGN to make sure link_free itself never has to span 2 pages. 
146 * 147 * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN 148 * (reason above) 149 */ 150 #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> 8) 151 152 /* 153 * We do not maintain any list for completely empty or full pages 154 */ 155 enum fullness_group { 156 ZS_ALMOST_FULL, 157 ZS_ALMOST_EMPTY, 158 _ZS_NR_FULLNESS_GROUPS, 159 160 ZS_EMPTY, 161 ZS_FULL 162 }; 163 164 enum zs_stat_type { 165 OBJ_ALLOCATED, 166 OBJ_USED, 167 CLASS_ALMOST_FULL, 168 CLASS_ALMOST_EMPTY, 169 }; 170 171 #ifdef CONFIG_ZSMALLOC_STAT 172 #define NR_ZS_STAT_TYPE (CLASS_ALMOST_EMPTY + 1) 173 #else 174 #define NR_ZS_STAT_TYPE (OBJ_USED + 1) 175 #endif 176 177 struct zs_size_stat { 178 unsigned long objs[NR_ZS_STAT_TYPE]; 179 }; 180 181 #ifdef CONFIG_ZSMALLOC_STAT 182 static struct dentry *zs_stat_root; 183 #endif 184 185 /* 186 * number of size_classes 187 */ 188 static int zs_size_classes; 189 190 /* 191 * We assign a page to ZS_ALMOST_EMPTY fullness group when: 192 * n <= N / f, where 193 * n = number of allocated objects 194 * N = total number of objects zspage can store 195 * f = fullness_threshold_frac 196 * 197 * Similarly, we assign zspage to: 198 * ZS_ALMOST_FULL when n > N / f 199 * ZS_EMPTY when n == 0 200 * ZS_FULL when n == N 201 * 202 * (see: fix_fullness_group()) 203 */ 204 static const int fullness_threshold_frac = 4; 205 206 struct size_class { 207 spinlock_t lock; 208 struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; 209 /* 210 * Size of objects stored in this class. Must be multiple 211 * of ZS_ALIGN. 212 */ 213 int size; 214 unsigned int index; 215 216 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ 217 int pages_per_zspage; 218 struct zs_size_stat stats; 219 220 /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ 221 bool huge; 222 }; 223 224 /* 225 * Placed within free objects to form a singly linked list. 226 * For every zspage, first_page->freelist gives head of this list. 
227 * 228 * This must be power of 2 and less than or equal to ZS_ALIGN 229 */ 230 struct link_free { 231 union { 232 /* 233 * Position of next free chunk (encodes <PFN, obj_idx>) 234 * It's valid for non-allocated object 235 */ 236 void *next; 237 /* 238 * Handle of allocated object. 239 */ 240 unsigned long handle; 241 }; 242 }; 243 244 struct zs_pool { 245 const char *name; 246 247 struct size_class **size_class; 248 struct kmem_cache *handle_cachep; 249 250 gfp_t flags; /* allocation flags used when growing pool */ 251 atomic_long_t pages_allocated; 252 253 struct zs_pool_stats stats; 254 255 /* Compact classes */ 256 struct shrinker shrinker; 257 /* 258 * To signify that register_shrinker() was successful 259 * and unregister_shrinker() will not Oops. 260 */ 261 bool shrinker_enabled; 262 #ifdef CONFIG_ZSMALLOC_STAT 263 struct dentry *stat_dentry; 264 #endif 265 }; 266 267 /* 268 * A zspage's class index and fullness group 269 * are encoded in its (first)page->mapping 270 */ 271 #define CLASS_IDX_BITS 28 272 #define FULLNESS_BITS 4 273 #define CLASS_IDX_MASK ((1 << CLASS_IDX_BITS) - 1) 274 #define FULLNESS_MASK ((1 << FULLNESS_BITS) - 1) 275 276 struct mapping_area { 277 #ifdef CONFIG_PGTABLE_MAPPING 278 struct vm_struct *vm; /* vm area for mapping object that span pages */ 279 #else 280 char *vm_buf; /* copy buffer for objects that span pages */ 281 #endif 282 char *vm_addr; /* address of kmap_atomic()'ed pages */ 283 enum zs_mapmode vm_mm; /* mapping mode */ 284 bool huge; 285 }; 286 287 static int create_handle_cache(struct zs_pool *pool) 288 { 289 pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, 290 0, 0, NULL); 291 return pool->handle_cachep ? 
0 : 1; 292 } 293 294 static void destroy_handle_cache(struct zs_pool *pool) 295 { 296 kmem_cache_destroy(pool->handle_cachep); 297 } 298 299 static unsigned long alloc_handle(struct zs_pool *pool) 300 { 301 return (unsigned long)kmem_cache_alloc(pool->handle_cachep, 302 pool->flags & ~__GFP_HIGHMEM); 303 } 304 305 static void free_handle(struct zs_pool *pool, unsigned long handle) 306 { 307 kmem_cache_free(pool->handle_cachep, (void *)handle); 308 } 309 310 static void record_obj(unsigned long handle, unsigned long obj) 311 { 312 *(unsigned long *)handle = obj; 313 } 314 315 /* zpool driver */ 316 317 #ifdef CONFIG_ZPOOL 318 319 static void *zs_zpool_create(const char *name, gfp_t gfp, 320 const struct zpool_ops *zpool_ops, 321 struct zpool *zpool) 322 { 323 return zs_create_pool(name, gfp); 324 } 325 326 static void zs_zpool_destroy(void *pool) 327 { 328 zs_destroy_pool(pool); 329 } 330 331 static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, 332 unsigned long *handle) 333 { 334 *handle = zs_malloc(pool, size); 335 return *handle ? 
0 : -1; 336 } 337 static void zs_zpool_free(void *pool, unsigned long handle) 338 { 339 zs_free(pool, handle); 340 } 341 342 static int zs_zpool_shrink(void *pool, unsigned int pages, 343 unsigned int *reclaimed) 344 { 345 return -EINVAL; 346 } 347 348 static void *zs_zpool_map(void *pool, unsigned long handle, 349 enum zpool_mapmode mm) 350 { 351 enum zs_mapmode zs_mm; 352 353 switch (mm) { 354 case ZPOOL_MM_RO: 355 zs_mm = ZS_MM_RO; 356 break; 357 case ZPOOL_MM_WO: 358 zs_mm = ZS_MM_WO; 359 break; 360 case ZPOOL_MM_RW: /* fallthru */ 361 default: 362 zs_mm = ZS_MM_RW; 363 break; 364 } 365 366 return zs_map_object(pool, handle, zs_mm); 367 } 368 static void zs_zpool_unmap(void *pool, unsigned long handle) 369 { 370 zs_unmap_object(pool, handle); 371 } 372 373 static u64 zs_zpool_total_size(void *pool) 374 { 375 return zs_get_total_pages(pool) << PAGE_SHIFT; 376 } 377 378 static struct zpool_driver zs_zpool_driver = { 379 .type = "zsmalloc", 380 .owner = THIS_MODULE, 381 .create = zs_zpool_create, 382 .destroy = zs_zpool_destroy, 383 .malloc = zs_zpool_malloc, 384 .free = zs_zpool_free, 385 .shrink = zs_zpool_shrink, 386 .map = zs_zpool_map, 387 .unmap = zs_zpool_unmap, 388 .total_size = zs_zpool_total_size, 389 }; 390 391 MODULE_ALIAS("zpool-zsmalloc"); 392 #endif /* CONFIG_ZPOOL */ 393 394 static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) 395 { 396 return pages_per_zspage * PAGE_SIZE / size; 397 } 398 399 /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ 400 static DEFINE_PER_CPU(struct mapping_area, zs_map_area); 401 402 static int is_first_page(struct page *page) 403 { 404 return PagePrivate(page); 405 } 406 407 static int is_last_page(struct page *page) 408 { 409 return PagePrivate2(page); 410 } 411 412 static void get_zspage_mapping(struct page *page, unsigned int *class_idx, 413 enum fullness_group *fullness) 414 { 415 unsigned long m; 416 BUG_ON(!is_first_page(page)); 417 418 m = (unsigned 
long)page->mapping; 419 *fullness = m & FULLNESS_MASK; 420 *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK; 421 } 422 423 static void set_zspage_mapping(struct page *page, unsigned int class_idx, 424 enum fullness_group fullness) 425 { 426 unsigned long m; 427 BUG_ON(!is_first_page(page)); 428 429 m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) | 430 (fullness & FULLNESS_MASK); 431 page->mapping = (struct address_space *)m; 432 } 433 434 /* 435 * zsmalloc divides the pool into various size classes where each 436 * class maintains a list of zspages where each zspage is divided 437 * into equal sized chunks. Each allocation falls into one of these 438 * classes depending on its size. This function returns index of the 439 * size class which has chunk size big enough to hold the give size. 440 */ 441 static int get_size_class_index(int size) 442 { 443 int idx = 0; 444 445 if (likely(size > ZS_MIN_ALLOC_SIZE)) 446 idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, 447 ZS_SIZE_CLASS_DELTA); 448 449 return min(zs_size_classes - 1, idx); 450 } 451 452 static inline void zs_stat_inc(struct size_class *class, 453 enum zs_stat_type type, unsigned long cnt) 454 { 455 if (type < NR_ZS_STAT_TYPE) 456 class->stats.objs[type] += cnt; 457 } 458 459 static inline void zs_stat_dec(struct size_class *class, 460 enum zs_stat_type type, unsigned long cnt) 461 { 462 if (type < NR_ZS_STAT_TYPE) 463 class->stats.objs[type] -= cnt; 464 } 465 466 static inline unsigned long zs_stat_get(struct size_class *class, 467 enum zs_stat_type type) 468 { 469 if (type < NR_ZS_STAT_TYPE) 470 return class->stats.objs[type]; 471 return 0; 472 } 473 474 #ifdef CONFIG_ZSMALLOC_STAT 475 476 static int __init zs_stat_init(void) 477 { 478 if (!debugfs_initialized()) 479 return -ENODEV; 480 481 zs_stat_root = debugfs_create_dir("zsmalloc", NULL); 482 if (!zs_stat_root) 483 return -ENOMEM; 484 485 return 0; 486 } 487 488 static void __exit zs_stat_exit(void) 489 { 490 debugfs_remove_recursive(zs_stat_root); 
491 } 492 493 static int zs_stats_size_show(struct seq_file *s, void *v) 494 { 495 int i; 496 struct zs_pool *pool = s->private; 497 struct size_class *class; 498 int objs_per_zspage; 499 unsigned long class_almost_full, class_almost_empty; 500 unsigned long obj_allocated, obj_used, pages_used; 501 unsigned long total_class_almost_full = 0, total_class_almost_empty = 0; 502 unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; 503 504 seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n", 505 "class", "size", "almost_full", "almost_empty", 506 "obj_allocated", "obj_used", "pages_used", 507 "pages_per_zspage"); 508 509 for (i = 0; i < zs_size_classes; i++) { 510 class = pool->size_class[i]; 511 512 if (class->index != i) 513 continue; 514 515 spin_lock(&class->lock); 516 class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL); 517 class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY); 518 obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); 519 obj_used = zs_stat_get(class, OBJ_USED); 520 spin_unlock(&class->lock); 521 522 objs_per_zspage = get_maxobj_per_zspage(class->size, 523 class->pages_per_zspage); 524 pages_used = obj_allocated / objs_per_zspage * 525 class->pages_per_zspage; 526 527 seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n", 528 i, class->size, class_almost_full, class_almost_empty, 529 obj_allocated, obj_used, pages_used, 530 class->pages_per_zspage); 531 532 total_class_almost_full += class_almost_full; 533 total_class_almost_empty += class_almost_empty; 534 total_objs += obj_allocated; 535 total_used_objs += obj_used; 536 total_pages += pages_used; 537 } 538 539 seq_puts(s, "\n"); 540 seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n", 541 "Total", "", total_class_almost_full, 542 total_class_almost_empty, total_objs, 543 total_used_objs, total_pages); 544 545 return 0; 546 } 547 548 static int zs_stats_size_open(struct inode *inode, struct file *file) 549 { 550 return single_open(file, zs_stats_size_show, 
inode->i_private); 551 } 552 553 static const struct file_operations zs_stat_size_ops = { 554 .open = zs_stats_size_open, 555 .read = seq_read, 556 .llseek = seq_lseek, 557 .release = single_release, 558 }; 559 560 static int zs_pool_stat_create(const char *name, struct zs_pool *pool) 561 { 562 struct dentry *entry; 563 564 if (!zs_stat_root) 565 return -ENODEV; 566 567 entry = debugfs_create_dir(name, zs_stat_root); 568 if (!entry) { 569 pr_warn("debugfs dir <%s> creation failed\n", name); 570 return -ENOMEM; 571 } 572 pool->stat_dentry = entry; 573 574 entry = debugfs_create_file("classes", S_IFREG | S_IRUGO, 575 pool->stat_dentry, pool, &zs_stat_size_ops); 576 if (!entry) { 577 pr_warn("%s: debugfs file entry <%s> creation failed\n", 578 name, "classes"); 579 return -ENOMEM; 580 } 581 582 return 0; 583 } 584 585 static void zs_pool_stat_destroy(struct zs_pool *pool) 586 { 587 debugfs_remove_recursive(pool->stat_dentry); 588 } 589 590 #else /* CONFIG_ZSMALLOC_STAT */ 591 static int __init zs_stat_init(void) 592 { 593 return 0; 594 } 595 596 static void __exit zs_stat_exit(void) 597 { 598 } 599 600 static inline int zs_pool_stat_create(const char *name, struct zs_pool *pool) 601 { 602 return 0; 603 } 604 605 static inline void zs_pool_stat_destroy(struct zs_pool *pool) 606 { 607 } 608 #endif 609 610 611 /* 612 * For each size class, zspages are divided into different groups 613 * depending on how "full" they are. This was done so that we could 614 * easily find empty or nearly empty zspages when we try to shrink 615 * the pool (not yet implemented). This function returns fullness 616 * status of the given page. 
617 */ 618 static enum fullness_group get_fullness_group(struct page *page) 619 { 620 int inuse, max_objects; 621 enum fullness_group fg; 622 BUG_ON(!is_first_page(page)); 623 624 inuse = page->inuse; 625 max_objects = page->objects; 626 627 if (inuse == 0) 628 fg = ZS_EMPTY; 629 else if (inuse == max_objects) 630 fg = ZS_FULL; 631 else if (inuse <= 3 * max_objects / fullness_threshold_frac) 632 fg = ZS_ALMOST_EMPTY; 633 else 634 fg = ZS_ALMOST_FULL; 635 636 return fg; 637 } 638 639 /* 640 * Each size class maintains various freelists and zspages are assigned 641 * to one of these freelists based on the number of live objects they 642 * have. This functions inserts the given zspage into the freelist 643 * identified by <class, fullness_group>. 644 */ 645 static void insert_zspage(struct page *page, struct size_class *class, 646 enum fullness_group fullness) 647 { 648 struct page **head; 649 650 BUG_ON(!is_first_page(page)); 651 652 if (fullness >= _ZS_NR_FULLNESS_GROUPS) 653 return; 654 655 zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ? 656 CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); 657 658 head = &class->fullness_list[fullness]; 659 if (!*head) { 660 *head = page; 661 return; 662 } 663 664 /* 665 * We want to see more ZS_FULL pages and less almost 666 * empty/full. Put pages with higher ->inuse first. 667 */ 668 list_add_tail(&page->lru, &(*head)->lru); 669 if (page->inuse >= (*head)->inuse) 670 *head = page; 671 } 672 673 /* 674 * This function removes the given zspage from the freelist identified 675 * by <class, fullness_group>. 
676 */ 677 static void remove_zspage(struct page *page, struct size_class *class, 678 enum fullness_group fullness) 679 { 680 struct page **head; 681 682 BUG_ON(!is_first_page(page)); 683 684 if (fullness >= _ZS_NR_FULLNESS_GROUPS) 685 return; 686 687 head = &class->fullness_list[fullness]; 688 BUG_ON(!*head); 689 if (list_empty(&(*head)->lru)) 690 *head = NULL; 691 else if (*head == page) 692 *head = (struct page *)list_entry((*head)->lru.next, 693 struct page, lru); 694 695 list_del_init(&page->lru); 696 zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ? 697 CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); 698 } 699 700 /* 701 * Each size class maintains zspages in different fullness groups depending 702 * on the number of live objects they contain. When allocating or freeing 703 * objects, the fullness status of the page can change, say, from ALMOST_FULL 704 * to ALMOST_EMPTY when freeing an object. This function checks if such 705 * a status change has occurred for the given page and accordingly moves the 706 * page from the freelist of the old fullness group to that of the new 707 * fullness group. 708 */ 709 static enum fullness_group fix_fullness_group(struct size_class *class, 710 struct page *page) 711 { 712 int class_idx; 713 enum fullness_group currfg, newfg; 714 715 BUG_ON(!is_first_page(page)); 716 717 get_zspage_mapping(page, &class_idx, &currfg); 718 newfg = get_fullness_group(page); 719 if (newfg == currfg) 720 goto out; 721 722 remove_zspage(page, class, currfg); 723 insert_zspage(page, class, newfg); 724 set_zspage_mapping(page, class_idx, newfg); 725 726 out: 727 return newfg; 728 } 729 730 /* 731 * We have to decide on how many pages to link together 732 * to form a zspage for each size class. This is important 733 * to reduce wastage due to unusable space left at end of 734 * each zspage which is given as: 735 * wastage = Zp % class_size 736 * usage = Zp - wastage 737 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... 
738 * 739 * For example, for size class of 3/8 * PAGE_SIZE, we should 740 * link together 3 PAGE_SIZE sized pages to form a zspage 741 * since then we can perfectly fit in 8 such objects. 742 */ 743 static int get_pages_per_zspage(int class_size) 744 { 745 int i, max_usedpc = 0; 746 /* zspage order which gives maximum used size per KB */ 747 int max_usedpc_order = 1; 748 749 for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) { 750 int zspage_size; 751 int waste, usedpc; 752 753 zspage_size = i * PAGE_SIZE; 754 waste = zspage_size % class_size; 755 usedpc = (zspage_size - waste) * 100 / zspage_size; 756 757 if (usedpc > max_usedpc) { 758 max_usedpc = usedpc; 759 max_usedpc_order = i; 760 } 761 } 762 763 return max_usedpc_order; 764 } 765 766 /* 767 * A single 'zspage' is composed of many system pages which are 768 * linked together using fields in struct page. This function finds 769 * the first/head page, given any component page of a zspage. 770 */ 771 static struct page *get_first_page(struct page *page) 772 { 773 if (is_first_page(page)) 774 return page; 775 else 776 return (struct page *)page_private(page); 777 } 778 779 static struct page *get_next_page(struct page *page) 780 { 781 struct page *next; 782 783 if (is_last_page(page)) 784 next = NULL; 785 else if (is_first_page(page)) 786 next = (struct page *)page_private(page); 787 else 788 next = list_entry(page->lru.next, struct page, lru); 789 790 return next; 791 } 792 793 /* 794 * Encode <page, obj_idx> as a single handle value. 795 * We use the least bit of handle for tagging. 796 */ 797 static void *location_to_obj(struct page *page, unsigned long obj_idx) 798 { 799 unsigned long obj; 800 801 if (!page) { 802 BUG_ON(obj_idx); 803 return NULL; 804 } 805 806 obj = page_to_pfn(page) << OBJ_INDEX_BITS; 807 obj |= ((obj_idx) & OBJ_INDEX_MASK); 808 obj <<= OBJ_TAG_BITS; 809 810 return (void *)obj; 811 } 812 813 /* 814 * Decode <page, obj_idx> pair from the given object handle. 
We adjust the 815 * decoded obj_idx back to its original value since it was adjusted in 816 * location_to_obj(). 817 */ 818 static void obj_to_location(unsigned long obj, struct page **page, 819 unsigned long *obj_idx) 820 { 821 obj >>= OBJ_TAG_BITS; 822 *page = pfn_to_page(obj >> OBJ_INDEX_BITS); 823 *obj_idx = (obj & OBJ_INDEX_MASK); 824 } 825 826 static unsigned long handle_to_obj(unsigned long handle) 827 { 828 return *(unsigned long *)handle; 829 } 830 831 static unsigned long obj_to_head(struct size_class *class, struct page *page, 832 void *obj) 833 { 834 if (class->huge) { 835 VM_BUG_ON(!is_first_page(page)); 836 return page_private(page); 837 } else 838 return *(unsigned long *)obj; 839 } 840 841 static unsigned long obj_idx_to_offset(struct page *page, 842 unsigned long obj_idx, int class_size) 843 { 844 unsigned long off = 0; 845 846 if (!is_first_page(page)) 847 off = page->index; 848 849 return off + obj_idx * class_size; 850 } 851 852 static inline int trypin_tag(unsigned long handle) 853 { 854 unsigned long *ptr = (unsigned long *)handle; 855 856 return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr); 857 } 858 859 static void pin_tag(unsigned long handle) 860 { 861 while (!trypin_tag(handle)); 862 } 863 864 static void unpin_tag(unsigned long handle) 865 { 866 unsigned long *ptr = (unsigned long *)handle; 867 868 clear_bit_unlock(HANDLE_PIN_BIT, ptr); 869 } 870 871 static void reset_page(struct page *page) 872 { 873 clear_bit(PG_private, &page->flags); 874 clear_bit(PG_private_2, &page->flags); 875 set_page_private(page, 0); 876 page->mapping = NULL; 877 page->freelist = NULL; 878 page_mapcount_reset(page); 879 } 880 881 static void free_zspage(struct page *first_page) 882 { 883 struct page *nextp, *tmp, *head_extra; 884 885 BUG_ON(!is_first_page(first_page)); 886 BUG_ON(first_page->inuse); 887 888 head_extra = (struct page *)page_private(first_page); 889 890 reset_page(first_page); 891 __free_page(first_page); 892 893 /* zspage with only 1 system page 
*/ 894 if (!head_extra) 895 return; 896 897 list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) { 898 list_del(&nextp->lru); 899 reset_page(nextp); 900 __free_page(nextp); 901 } 902 reset_page(head_extra); 903 __free_page(head_extra); 904 } 905 906 /* Initialize a newly allocated zspage */ 907 static void init_zspage(struct page *first_page, struct size_class *class) 908 { 909 unsigned long off = 0; 910 struct page *page = first_page; 911 912 BUG_ON(!is_first_page(first_page)); 913 while (page) { 914 struct page *next_page; 915 struct link_free *link; 916 unsigned int i = 1; 917 void *vaddr; 918 919 /* 920 * page->index stores offset of first object starting 921 * in the page. For the first page, this is always 0, 922 * so we use first_page->index (aka ->freelist) to store 923 * head of corresponding zspage's freelist. 924 */ 925 if (page != first_page) 926 page->index = off; 927 928 vaddr = kmap_atomic(page); 929 link = (struct link_free *)vaddr + off / sizeof(*link); 930 931 while ((off += class->size) < PAGE_SIZE) { 932 link->next = location_to_obj(page, i++); 933 link += class->size / sizeof(*link); 934 } 935 936 /* 937 * We now come to the last (full or partial) object on this 938 * page, which must point to the first object on the next 939 * page (if present) 940 */ 941 next_page = get_next_page(page); 942 link->next = location_to_obj(next_page, 0); 943 kunmap_atomic(vaddr); 944 page = next_page; 945 off %= PAGE_SIZE; 946 } 947 } 948 949 /* 950 * Allocate a zspage for the given size class 951 */ 952 static struct page *alloc_zspage(struct size_class *class, gfp_t flags) 953 { 954 int i, error; 955 struct page *first_page = NULL, *uninitialized_var(prev_page); 956 957 /* 958 * Allocate individual pages and link them together as: 959 * 1. first page->private = first sub-page 960 * 2. all sub-pages are linked together using page->lru 961 * 3. 
each sub-page is linked to the first page using page->private 962 * 963 * For each size class, First/Head pages are linked together using 964 * page->lru. Also, we set PG_private to identify the first page 965 * (i.e. no other sub-page has this flag set) and PG_private_2 to 966 * identify the last page. 967 */ 968 error = -ENOMEM; 969 for (i = 0; i < class->pages_per_zspage; i++) { 970 struct page *page; 971 972 page = alloc_page(flags); 973 if (!page) 974 goto cleanup; 975 976 INIT_LIST_HEAD(&page->lru); 977 if (i == 0) { /* first page */ 978 SetPagePrivate(page); 979 set_page_private(page, 0); 980 first_page = page; 981 first_page->inuse = 0; 982 } 983 if (i == 1) 984 set_page_private(first_page, (unsigned long)page); 985 if (i >= 1) 986 set_page_private(page, (unsigned long)first_page); 987 if (i >= 2) 988 list_add(&page->lru, &prev_page->lru); 989 if (i == class->pages_per_zspage - 1) /* last page */ 990 SetPagePrivate2(page); 991 prev_page = page; 992 } 993 994 init_zspage(first_page, class); 995 996 first_page->freelist = location_to_obj(first_page, 0); 997 /* Maximum number of objects we can store in this zspage */ 998 first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; 999 1000 error = 0; /* Success */ 1001 1002 cleanup: 1003 if (unlikely(error) && first_page) { 1004 free_zspage(first_page); 1005 first_page = NULL; 1006 } 1007 1008 return first_page; 1009 } 1010 1011 static struct page *find_get_zspage(struct size_class *class) 1012 { 1013 int i; 1014 struct page *page; 1015 1016 for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { 1017 page = class->fullness_list[i]; 1018 if (page) 1019 break; 1020 } 1021 1022 return page; 1023 } 1024 1025 #ifdef CONFIG_PGTABLE_MAPPING 1026 static inline int __zs_cpu_up(struct mapping_area *area) 1027 { 1028 /* 1029 * Make sure we don't leak memory if a cpu UP notification 1030 * and zs_init() race and both call zs_cpu_up() on the same cpu 1031 */ 1032 if (area->vm) 1033 return 0; 1034 area->vm = 
alloc_vm_area(PAGE_SIZE * 2, NULL);
	if (!area->vm)
		return -ENOMEM;
	return 0;
}

/*
 * Release the per-cpu vm area, if one was allocated, and clear the
 * pointer so a later __zs_cpu_up() allocates a fresh one.
 */
static inline void __zs_cpu_down(struct mapping_area *area)
{
	if (area->vm)
		free_vm_area(area->vm);
	area->vm = NULL;
}

/*
 * Map the two pages of a page-spanning object into the per-cpu vm area
 * and return the address of the object (area base + @off).
 * A mapping failure is fatal (BUG_ON). @size is unused in this variant.
 */
static inline void *__zs_map_object(struct mapping_area *area,
				struct page *pages[2], int off, int size)
{
	BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages));
	area->vm_addr = area->vm->addr;
	return area->vm_addr + off;
}

/*
 * Tear down the two-page mapping that starts at area->vm_addr.
 * @pages, @off and @size are unused in this variant.
 */
static inline void __zs_unmap_object(struct mapping_area *area,
				struct page *pages[2], int off, int size)
{
	unsigned long addr = (unsigned long)area->vm_addr;

	unmap_kernel_range(addr, PAGE_SIZE * 2);
}

#else /* CONFIG_PGTABLE_MAPPING */

/*
 * Allocate the per-cpu bounce buffer (ZS_MAX_ALLOC_SIZE bytes) used to
 * present page-spanning objects as one contiguous region.
 *
 * Returns 0 on success (or if the buffer already exists), -ENOMEM otherwise.
 */
static inline int __zs_cpu_up(struct mapping_area *area)
{
	/*
	 * Make sure we don't leak memory if a cpu UP notification
	 * and zs_init() race and both call __zs_cpu_up() on the same cpu
	 */
	if (area->vm_buf)
		return 0;
	area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL);
	if (!area->vm_buf)
		return -ENOMEM;
	return 0;
}

/* Free the per-cpu bounce buffer and clear the pointer. */
static inline void __zs_cpu_down(struct mapping_area *area)
{
	kfree(area->vm_buf);
	area->vm_buf = NULL;
}

/*
 * Copy-based mapping of an object that spans @pages[0] and @pages[1].
 *
 * Unless the mapping mode is ZS_MM_WO, the @size bytes starting at @off in
 * @pages[0] (and continuing at the start of @pages[1]) are copied into the
 * per-cpu buffer. Page faults are disabled on return; the matching
 * pagefault_enable() is in __zs_unmap_object().
 *
 * Returns the start of the per-cpu buffer.
 */
static void *__zs_map_object(struct mapping_area *area,
			struct page *pages[2], int off, int size)
{
	int sizes[2];
	void *addr;
	char *buf = area->vm_buf;

	/* disable page faults to match kmap_atomic() return conditions */
	pagefault_disable();

	/* no read fastpath */
	if (area->vm_mm == ZS_MM_WO)
		goto out;

	/* sizes[0]: bytes in the first page, sizes[1]: remainder in the second */
	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	/* copy object to per-cpu buffer */
	addr = kmap_atomic(pages[0]);
	memcpy(buf, addr + off, sizes[0]);
	kunmap_atomic(addr);
	addr = kmap_atomic(pages[1]);
	memcpy(buf + sizes[0], addr, sizes[1]);
	kunmap_atomic(addr);
out:
	return area->vm_buf;
}

/*
 * Counterpart of the copy-based __zs_map_object().
 *
 * Unless the mapping mode is ZS_MM_RO, the per-cpu buffer is copied back
 * into @pages[0]/@pages[1]. Page faults are re-enabled in all cases.
 */
static void __zs_unmap_object(struct mapping_area *area,
			struct page *pages[2], int off, int size)
{
	int sizes[2];
	void *addr;
	char *buf;

	/* no write fastpath */
	if (area->vm_mm == ZS_MM_RO)
		goto out;

	buf = area->vm_buf;
	/*
	 * Skip the leading ZS_HANDLE_SIZE bytes so the handle header is not
	 * written back.
	 * NOTE(review): area->huge is not assigned anywhere in the visible
	 * part of this file - confirm where it is set.
	 */
	if (!area->huge) {
		buf = buf + ZS_HANDLE_SIZE;
		size -= ZS_HANDLE_SIZE;
		off += ZS_HANDLE_SIZE;
	}

	/* sizes[0]: bytes in the first page, sizes[1]: remainder in the second */
	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	/* copy per-cpu buffer to object */
	addr = kmap_atomic(pages[0]);
	memcpy(addr + off, buf, sizes[0]);
	kunmap_atomic(addr);
	addr = kmap_atomic(pages[1]);
	memcpy(addr, buf + sizes[0], sizes[1]);
	kunmap_atomic(addr);

out:
	/* enable page faults to match kunmap_atomic() return conditions */
	pagefault_enable();
}

#endif /* CONFIG_PGTABLE_MAPPING */

/*
 * CPU hotplug callback: set up the per-cpu mapping area when a cpu is being
 * prepared, and tear it down when the cpu is dead or its bring-up was
 * cancelled. Other actions are ignored.
 *
 * Returns NOTIFY_OK, or the encoded errno if the area could not be set up.
 */
static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action,
				void *pcpu)
{
	int ret, cpu = (long)pcpu;
	struct mapping_area *area;

	switch (action) {
	case CPU_UP_PREPARE:
		area = &per_cpu(zs_map_area, cpu);
		ret = __zs_cpu_up(area);
		if (ret)
			return notifier_from_errno(ret);
		break;
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		area = &per_cpu(zs_map_area, cpu);
		__zs_cpu_down(area);
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block zs_cpu_nb = {
	.notifier_call = zs_cpu_notifier
};

/*
 * Register the hotplug notifier and set up the mapping area of every cpu
 * that is already online. Stops at the first cpu whose setup fails.
 *
 * Returns 0 on success or a negative errno.
 */
static int zs_register_cpu_notifier(void)
{
	int cpu, uninitialized_var(ret);

	cpu_notifier_register_begin();

	__register_cpu_notifier(&zs_cpu_nb);
	for_each_online_cpu(cpu) {
		/* drive the callback by hand for cpus that are already up */
		ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
		if (notifier_to_errno(ret))
			break;
	}

	cpu_notifier_register_done();
	return notifier_to_errno(ret);
}

/*
 * Tear down the mapping area of every online cpu and unregister the
 * hotplug notifier.
 */
static void zs_unregister_cpu_notifier(void)
{
	int cpu;

	cpu_notifier_register_begin();

	for_each_online_cpu(cpu)
		zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
	__unregister_cpu_notifier(&zs_cpu_nb);

	cpu_notifier_register_done();
}

/*
 * Compute the number of size classes: one per ZS_SIZE_CLASS_DELTA step
 * between ZS_MIN_ALLOC_SIZE and ZS_MAX_ALLOC_SIZE, plus one more when the
 * range is not an exact multiple of the step. Stored in zs_size_classes.
 */
static void init_zs_size_classes(void)
{
	int nr;

	nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1;
	if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA)
		nr += 1;

	zs_size_classes = nr;
}

/*
 * Tell whether a class of @size with @pages_per_zspage can share @prev:
 * both must use the same number of pages per zspage and hold the same
 * maximum number of objects per zspage.
 */
static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
{
	if (prev->pages_per_zspage != pages_per_zspage)
		return false;

	if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage)
		!= get_maxobj_per_zspage(size, pages_per_zspage))
		return false;

	return true;
}

/*
 * True when every object slot of the zspage is in use.
 * @page must be the first page of the zspage.
 */
static bool zspage_full(struct page *page)
{
	BUG_ON(!is_first_page(page));

	return page->inuse == page->objects;
}

/* Return the pool's current pages_allocated counter. */
unsigned long zs_get_total_pages(struct zs_pool *pool)
{
	return atomic_long_read(&pool->pages_allocated);
}
EXPORT_SYMBOL_GPL(zs_get_total_pages);

/**
 * zs_map_object - get address of allocated object from handle.
 * @pool: pool from which the object was allocated
 * @handle: handle returned from zs_malloc
 * @mm: mapping mode; with ZS_MM_WO a page-spanning object is not copied in
 *	on map, with ZS_MM_RO it is not copied back on unmap
 *
 * Before using an object allocated from zs_malloc, it must be mapped using
 * this function. When done with the object, it must be unmapped using
 * zs_unmap_object.
 *
 * Only one object can be mapped per cpu at a time. There is no protection
 * against nested mappings.
 *
 * This function returns with preemption and page faults disabled.
1255 */ 1256 void *zs_map_object(struct zs_pool *pool, unsigned long handle, 1257 enum zs_mapmode mm) 1258 { 1259 struct page *page; 1260 unsigned long obj, obj_idx, off; 1261 1262 unsigned int class_idx; 1263 enum fullness_group fg; 1264 struct size_class *class; 1265 struct mapping_area *area; 1266 struct page *pages[2]; 1267 void *ret; 1268 1269 BUG_ON(!handle); 1270 1271 /* 1272 * Because we use per-cpu mapping areas shared among the 1273 * pools/users, we can't allow mapping in interrupt context 1274 * because it can corrupt another users mappings. 1275 */ 1276 BUG_ON(in_interrupt()); 1277 1278 /* From now on, migration cannot move the object */ 1279 pin_tag(handle); 1280 1281 obj = handle_to_obj(handle); 1282 obj_to_location(obj, &page, &obj_idx); 1283 get_zspage_mapping(get_first_page(page), &class_idx, &fg); 1284 class = pool->size_class[class_idx]; 1285 off = obj_idx_to_offset(page, obj_idx, class->size); 1286 1287 area = &get_cpu_var(zs_map_area); 1288 area->vm_mm = mm; 1289 if (off + class->size <= PAGE_SIZE) { 1290 /* this object is contained entirely within a page */ 1291 area->vm_addr = kmap_atomic(page); 1292 ret = area->vm_addr + off; 1293 goto out; 1294 } 1295 1296 /* this object spans two pages */ 1297 pages[0] = page; 1298 pages[1] = get_next_page(page); 1299 BUG_ON(!pages[1]); 1300 1301 ret = __zs_map_object(area, pages, off, class->size); 1302 out: 1303 if (!class->huge) 1304 ret += ZS_HANDLE_SIZE; 1305 1306 return ret; 1307 } 1308 EXPORT_SYMBOL_GPL(zs_map_object); 1309 1310 void zs_unmap_object(struct zs_pool *pool, unsigned long handle) 1311 { 1312 struct page *page; 1313 unsigned long obj, obj_idx, off; 1314 1315 unsigned int class_idx; 1316 enum fullness_group fg; 1317 struct size_class *class; 1318 struct mapping_area *area; 1319 1320 BUG_ON(!handle); 1321 1322 obj = handle_to_obj(handle); 1323 obj_to_location(obj, &page, &obj_idx); 1324 get_zspage_mapping(get_first_page(page), &class_idx, &fg); 1325 class = pool->size_class[class_idx]; 
1326 off = obj_idx_to_offset(page, obj_idx, class->size); 1327 1328 area = this_cpu_ptr(&zs_map_area); 1329 if (off + class->size <= PAGE_SIZE) 1330 kunmap_atomic(area->vm_addr); 1331 else { 1332 struct page *pages[2]; 1333 1334 pages[0] = page; 1335 pages[1] = get_next_page(page); 1336 BUG_ON(!pages[1]); 1337 1338 __zs_unmap_object(area, pages, off, class->size); 1339 } 1340 put_cpu_var(zs_map_area); 1341 unpin_tag(handle); 1342 } 1343 EXPORT_SYMBOL_GPL(zs_unmap_object); 1344 1345 static unsigned long obj_malloc(struct page *first_page, 1346 struct size_class *class, unsigned long handle) 1347 { 1348 unsigned long obj; 1349 struct link_free *link; 1350 1351 struct page *m_page; 1352 unsigned long m_objidx, m_offset; 1353 void *vaddr; 1354 1355 handle |= OBJ_ALLOCATED_TAG; 1356 obj = (unsigned long)first_page->freelist; 1357 obj_to_location(obj, &m_page, &m_objidx); 1358 m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); 1359 1360 vaddr = kmap_atomic(m_page); 1361 link = (struct link_free *)vaddr + m_offset / sizeof(*link); 1362 first_page->freelist = link->next; 1363 if (!class->huge) 1364 /* record handle in the header of allocated chunk */ 1365 link->handle = handle; 1366 else 1367 /* record handle in first_page->private */ 1368 set_page_private(first_page, handle); 1369 kunmap_atomic(vaddr); 1370 first_page->inuse++; 1371 zs_stat_inc(class, OBJ_USED, 1); 1372 1373 return obj; 1374 } 1375 1376 1377 /** 1378 * zs_malloc - Allocate block of given size from pool. 1379 * @pool: pool to allocate from 1380 * @size: size of block to allocate 1381 * 1382 * On success, handle to the allocated object is returned, 1383 * otherwise 0. 1384 * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. 
1385 */ 1386 unsigned long zs_malloc(struct zs_pool *pool, size_t size) 1387 { 1388 unsigned long handle, obj; 1389 struct size_class *class; 1390 struct page *first_page; 1391 1392 if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) 1393 return 0; 1394 1395 handle = alloc_handle(pool); 1396 if (!handle) 1397 return 0; 1398 1399 /* extra space in chunk to keep the handle */ 1400 size += ZS_HANDLE_SIZE; 1401 class = pool->size_class[get_size_class_index(size)]; 1402 1403 spin_lock(&class->lock); 1404 first_page = find_get_zspage(class); 1405 1406 if (!first_page) { 1407 spin_unlock(&class->lock); 1408 first_page = alloc_zspage(class, pool->flags); 1409 if (unlikely(!first_page)) { 1410 free_handle(pool, handle); 1411 return 0; 1412 } 1413 1414 set_zspage_mapping(first_page, class->index, ZS_EMPTY); 1415 atomic_long_add(class->pages_per_zspage, 1416 &pool->pages_allocated); 1417 1418 spin_lock(&class->lock); 1419 zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( 1420 class->size, class->pages_per_zspage)); 1421 } 1422 1423 obj = obj_malloc(first_page, class, handle); 1424 /* Now move the zspage to another fullness group, if required */ 1425 fix_fullness_group(class, first_page); 1426 record_obj(handle, obj); 1427 spin_unlock(&class->lock); 1428 1429 return handle; 1430 } 1431 EXPORT_SYMBOL_GPL(zs_malloc); 1432 1433 static void obj_free(struct zs_pool *pool, struct size_class *class, 1434 unsigned long obj) 1435 { 1436 struct link_free *link; 1437 struct page *first_page, *f_page; 1438 unsigned long f_objidx, f_offset; 1439 void *vaddr; 1440 1441 BUG_ON(!obj); 1442 1443 obj &= ~OBJ_ALLOCATED_TAG; 1444 obj_to_location(obj, &f_page, &f_objidx); 1445 first_page = get_first_page(f_page); 1446 1447 f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); 1448 1449 vaddr = kmap_atomic(f_page); 1450 1451 /* Insert this object in containing zspage's freelist */ 1452 link = (struct link_free *)(vaddr + f_offset); 1453 link->next = first_page->freelist; 1454 if 
(class->huge) 1455 set_page_private(first_page, 0); 1456 kunmap_atomic(vaddr); 1457 first_page->freelist = (void *)obj; 1458 first_page->inuse--; 1459 zs_stat_dec(class, OBJ_USED, 1); 1460 } 1461 1462 void zs_free(struct zs_pool *pool, unsigned long handle) 1463 { 1464 struct page *first_page, *f_page; 1465 unsigned long obj, f_objidx; 1466 int class_idx; 1467 struct size_class *class; 1468 enum fullness_group fullness; 1469 1470 if (unlikely(!handle)) 1471 return; 1472 1473 pin_tag(handle); 1474 obj = handle_to_obj(handle); 1475 obj_to_location(obj, &f_page, &f_objidx); 1476 first_page = get_first_page(f_page); 1477 1478 get_zspage_mapping(first_page, &class_idx, &fullness); 1479 class = pool->size_class[class_idx]; 1480 1481 spin_lock(&class->lock); 1482 obj_free(pool, class, obj); 1483 fullness = fix_fullness_group(class, first_page); 1484 if (fullness == ZS_EMPTY) { 1485 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( 1486 class->size, class->pages_per_zspage)); 1487 atomic_long_sub(class->pages_per_zspage, 1488 &pool->pages_allocated); 1489 free_zspage(first_page); 1490 } 1491 spin_unlock(&class->lock); 1492 unpin_tag(handle); 1493 1494 free_handle(pool, handle); 1495 } 1496 EXPORT_SYMBOL_GPL(zs_free); 1497 1498 static void zs_object_copy(unsigned long dst, unsigned long src, 1499 struct size_class *class) 1500 { 1501 struct page *s_page, *d_page; 1502 unsigned long s_objidx, d_objidx; 1503 unsigned long s_off, d_off; 1504 void *s_addr, *d_addr; 1505 int s_size, d_size, size; 1506 int written = 0; 1507 1508 s_size = d_size = class->size; 1509 1510 obj_to_location(src, &s_page, &s_objidx); 1511 obj_to_location(dst, &d_page, &d_objidx); 1512 1513 s_off = obj_idx_to_offset(s_page, s_objidx, class->size); 1514 d_off = obj_idx_to_offset(d_page, d_objidx, class->size); 1515 1516 if (s_off + class->size > PAGE_SIZE) 1517 s_size = PAGE_SIZE - s_off; 1518 1519 if (d_off + class->size > PAGE_SIZE) 1520 d_size = PAGE_SIZE - d_off; 1521 1522 s_addr = 
kmap_atomic(s_page); 1523 d_addr = kmap_atomic(d_page); 1524 1525 while (1) { 1526 size = min(s_size, d_size); 1527 memcpy(d_addr + d_off, s_addr + s_off, size); 1528 written += size; 1529 1530 if (written == class->size) 1531 break; 1532 1533 s_off += size; 1534 s_size -= size; 1535 d_off += size; 1536 d_size -= size; 1537 1538 if (s_off >= PAGE_SIZE) { 1539 kunmap_atomic(d_addr); 1540 kunmap_atomic(s_addr); 1541 s_page = get_next_page(s_page); 1542 BUG_ON(!s_page); 1543 s_addr = kmap_atomic(s_page); 1544 d_addr = kmap_atomic(d_page); 1545 s_size = class->size - written; 1546 s_off = 0; 1547 } 1548 1549 if (d_off >= PAGE_SIZE) { 1550 kunmap_atomic(d_addr); 1551 d_page = get_next_page(d_page); 1552 BUG_ON(!d_page); 1553 d_addr = kmap_atomic(d_page); 1554 d_size = class->size - written; 1555 d_off = 0; 1556 } 1557 } 1558 1559 kunmap_atomic(d_addr); 1560 kunmap_atomic(s_addr); 1561 } 1562 1563 /* 1564 * Find alloced object in zspage from index object and 1565 * return handle. 1566 */ 1567 static unsigned long find_alloced_obj(struct page *page, int index, 1568 struct size_class *class) 1569 { 1570 unsigned long head; 1571 int offset = 0; 1572 unsigned long handle = 0; 1573 void *addr = kmap_atomic(page); 1574 1575 if (!is_first_page(page)) 1576 offset = page->index; 1577 offset += class->size * index; 1578 1579 while (offset < PAGE_SIZE) { 1580 head = obj_to_head(class, page, addr + offset); 1581 if (head & OBJ_ALLOCATED_TAG) { 1582 handle = head & ~OBJ_ALLOCATED_TAG; 1583 if (trypin_tag(handle)) 1584 break; 1585 handle = 0; 1586 } 1587 1588 offset += class->size; 1589 index++; 1590 } 1591 1592 kunmap_atomic(addr); 1593 return handle; 1594 } 1595 1596 struct zs_compact_control { 1597 /* Source page for migration which could be a subpage of zspage. */ 1598 struct page *s_page; 1599 /* Destination page for migration which should be a first page 1600 * of zspage. 
*/ 1601 struct page *d_page; 1602 /* Starting object index within @s_page which used for live object 1603 * in the subpage. */ 1604 int index; 1605 }; 1606 1607 static int migrate_zspage(struct zs_pool *pool, struct size_class *class, 1608 struct zs_compact_control *cc) 1609 { 1610 unsigned long used_obj, free_obj; 1611 unsigned long handle; 1612 struct page *s_page = cc->s_page; 1613 struct page *d_page = cc->d_page; 1614 unsigned long index = cc->index; 1615 int ret = 0; 1616 1617 while (1) { 1618 handle = find_alloced_obj(s_page, index, class); 1619 if (!handle) { 1620 s_page = get_next_page(s_page); 1621 if (!s_page) 1622 break; 1623 index = 0; 1624 continue; 1625 } 1626 1627 /* Stop if there is no more space */ 1628 if (zspage_full(d_page)) { 1629 unpin_tag(handle); 1630 ret = -ENOMEM; 1631 break; 1632 } 1633 1634 used_obj = handle_to_obj(handle); 1635 free_obj = obj_malloc(d_page, class, handle); 1636 zs_object_copy(free_obj, used_obj, class); 1637 index++; 1638 record_obj(handle, free_obj); 1639 unpin_tag(handle); 1640 obj_free(pool, class, used_obj); 1641 } 1642 1643 /* Remember last position in this iteration */ 1644 cc->s_page = s_page; 1645 cc->index = index; 1646 1647 return ret; 1648 } 1649 1650 static struct page *isolate_target_page(struct size_class *class) 1651 { 1652 int i; 1653 struct page *page; 1654 1655 for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { 1656 page = class->fullness_list[i]; 1657 if (page) { 1658 remove_zspage(page, class, i); 1659 break; 1660 } 1661 } 1662 1663 return page; 1664 } 1665 1666 /* 1667 * putback_zspage - add @first_page into right class's fullness list 1668 * @pool: target pool 1669 * @class: destination class 1670 * @first_page: target page 1671 * 1672 * Return @fist_page's fullness_group 1673 */ 1674 static enum fullness_group putback_zspage(struct zs_pool *pool, 1675 struct size_class *class, 1676 struct page *first_page) 1677 { 1678 enum fullness_group fullness; 1679 1680 BUG_ON(!is_first_page(first_page)); 1681 
1682 fullness = get_fullness_group(first_page); 1683 insert_zspage(first_page, class, fullness); 1684 set_zspage_mapping(first_page, class->index, fullness); 1685 1686 if (fullness == ZS_EMPTY) { 1687 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( 1688 class->size, class->pages_per_zspage)); 1689 atomic_long_sub(class->pages_per_zspage, 1690 &pool->pages_allocated); 1691 1692 free_zspage(first_page); 1693 } 1694 1695 return fullness; 1696 } 1697 1698 static struct page *isolate_source_page(struct size_class *class) 1699 { 1700 int i; 1701 struct page *page = NULL; 1702 1703 for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) { 1704 page = class->fullness_list[i]; 1705 if (!page) 1706 continue; 1707 1708 remove_zspage(page, class, i); 1709 break; 1710 } 1711 1712 return page; 1713 } 1714 1715 /* 1716 * 1717 * Based on the number of unused allocated objects calculate 1718 * and return the number of pages that we can free. 1719 */ 1720 static unsigned long zs_can_compact(struct size_class *class) 1721 { 1722 unsigned long obj_wasted; 1723 1724 obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) - 1725 zs_stat_get(class, OBJ_USED); 1726 1727 obj_wasted /= get_maxobj_per_zspage(class->size, 1728 class->pages_per_zspage); 1729 1730 return obj_wasted * class->pages_per_zspage; 1731 } 1732 1733 static void __zs_compact(struct zs_pool *pool, struct size_class *class) 1734 { 1735 struct zs_compact_control cc; 1736 struct page *src_page; 1737 struct page *dst_page = NULL; 1738 1739 spin_lock(&class->lock); 1740 while ((src_page = isolate_source_page(class))) { 1741 1742 BUG_ON(!is_first_page(src_page)); 1743 1744 if (!zs_can_compact(class)) 1745 break; 1746 1747 cc.index = 0; 1748 cc.s_page = src_page; 1749 1750 while ((dst_page = isolate_target_page(class))) { 1751 cc.d_page = dst_page; 1752 /* 1753 * If there is no more space in dst_page, resched 1754 * and see if anyone had allocated another zspage. 
1755 */ 1756 if (!migrate_zspage(pool, class, &cc)) 1757 break; 1758 1759 putback_zspage(pool, class, dst_page); 1760 } 1761 1762 /* Stop if we couldn't find slot */ 1763 if (dst_page == NULL) 1764 break; 1765 1766 putback_zspage(pool, class, dst_page); 1767 if (putback_zspage(pool, class, src_page) == ZS_EMPTY) 1768 pool->stats.pages_compacted += class->pages_per_zspage; 1769 spin_unlock(&class->lock); 1770 cond_resched(); 1771 spin_lock(&class->lock); 1772 } 1773 1774 if (src_page) 1775 putback_zspage(pool, class, src_page); 1776 1777 spin_unlock(&class->lock); 1778 } 1779 1780 unsigned long zs_compact(struct zs_pool *pool) 1781 { 1782 int i; 1783 struct size_class *class; 1784 1785 for (i = zs_size_classes - 1; i >= 0; i--) { 1786 class = pool->size_class[i]; 1787 if (!class) 1788 continue; 1789 if (class->index != i) 1790 continue; 1791 __zs_compact(pool, class); 1792 } 1793 1794 return pool->stats.pages_compacted; 1795 } 1796 EXPORT_SYMBOL_GPL(zs_compact); 1797 1798 void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats) 1799 { 1800 memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats)); 1801 } 1802 EXPORT_SYMBOL_GPL(zs_pool_stats); 1803 1804 static unsigned long zs_shrinker_scan(struct shrinker *shrinker, 1805 struct shrink_control *sc) 1806 { 1807 unsigned long pages_freed; 1808 struct zs_pool *pool = container_of(shrinker, struct zs_pool, 1809 shrinker); 1810 1811 pages_freed = pool->stats.pages_compacted; 1812 /* 1813 * Compact classes and calculate compaction delta. 1814 * Can run concurrently with a manually triggered 1815 * (by user) compaction. 1816 */ 1817 pages_freed = zs_compact(pool) - pages_freed; 1818 1819 return pages_freed ? 
pages_freed : SHRINK_STOP; 1820 } 1821 1822 static unsigned long zs_shrinker_count(struct shrinker *shrinker, 1823 struct shrink_control *sc) 1824 { 1825 int i; 1826 struct size_class *class; 1827 unsigned long pages_to_free = 0; 1828 struct zs_pool *pool = container_of(shrinker, struct zs_pool, 1829 shrinker); 1830 1831 for (i = zs_size_classes - 1; i >= 0; i--) { 1832 class = pool->size_class[i]; 1833 if (!class) 1834 continue; 1835 if (class->index != i) 1836 continue; 1837 1838 pages_to_free += zs_can_compact(class); 1839 } 1840 1841 return pages_to_free; 1842 } 1843 1844 static void zs_unregister_shrinker(struct zs_pool *pool) 1845 { 1846 if (pool->shrinker_enabled) { 1847 unregister_shrinker(&pool->shrinker); 1848 pool->shrinker_enabled = false; 1849 } 1850 } 1851 1852 static int zs_register_shrinker(struct zs_pool *pool) 1853 { 1854 pool->shrinker.scan_objects = zs_shrinker_scan; 1855 pool->shrinker.count_objects = zs_shrinker_count; 1856 pool->shrinker.batch = 0; 1857 pool->shrinker.seeks = DEFAULT_SEEKS; 1858 1859 return register_shrinker(&pool->shrinker); 1860 } 1861 1862 /** 1863 * zs_create_pool - Creates an allocation pool to work from. 1864 * @flags: allocation flags used to allocate pool metadata 1865 * 1866 * This function must be called before anything when using 1867 * the zsmalloc allocator. 1868 * 1869 * On success, a pointer to the newly created pool is returned, 1870 * otherwise NULL. 
1871 */ 1872 struct zs_pool *zs_create_pool(const char *name, gfp_t flags) 1873 { 1874 int i; 1875 struct zs_pool *pool; 1876 struct size_class *prev_class = NULL; 1877 1878 pool = kzalloc(sizeof(*pool), GFP_KERNEL); 1879 if (!pool) 1880 return NULL; 1881 1882 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), 1883 GFP_KERNEL); 1884 if (!pool->size_class) { 1885 kfree(pool); 1886 return NULL; 1887 } 1888 1889 pool->name = kstrdup(name, GFP_KERNEL); 1890 if (!pool->name) 1891 goto err; 1892 1893 if (create_handle_cache(pool)) 1894 goto err; 1895 1896 /* 1897 * Iterate reversly, because, size of size_class that we want to use 1898 * for merging should be larger or equal to current size. 1899 */ 1900 for (i = zs_size_classes - 1; i >= 0; i--) { 1901 int size; 1902 int pages_per_zspage; 1903 struct size_class *class; 1904 1905 size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; 1906 if (size > ZS_MAX_ALLOC_SIZE) 1907 size = ZS_MAX_ALLOC_SIZE; 1908 pages_per_zspage = get_pages_per_zspage(size); 1909 1910 /* 1911 * size_class is used for normal zsmalloc operation such 1912 * as alloc/free for that size. Although it is natural that we 1913 * have one size_class for each size, there is a chance that we 1914 * can get more memory utilization if we use one size_class for 1915 * many different sizes whose size_class have same 1916 * characteristics. So, we makes size_class point to 1917 * previous size_class if possible. 
1918 */ 1919 if (prev_class) { 1920 if (can_merge(prev_class, size, pages_per_zspage)) { 1921 pool->size_class[i] = prev_class; 1922 continue; 1923 } 1924 } 1925 1926 class = kzalloc(sizeof(struct size_class), GFP_KERNEL); 1927 if (!class) 1928 goto err; 1929 1930 class->size = size; 1931 class->index = i; 1932 class->pages_per_zspage = pages_per_zspage; 1933 if (pages_per_zspage == 1 && 1934 get_maxobj_per_zspage(size, pages_per_zspage) == 1) 1935 class->huge = true; 1936 spin_lock_init(&class->lock); 1937 pool->size_class[i] = class; 1938 1939 prev_class = class; 1940 } 1941 1942 pool->flags = flags; 1943 1944 if (zs_pool_stat_create(name, pool)) 1945 goto err; 1946 1947 /* 1948 * Not critical, we still can use the pool 1949 * and user can trigger compaction manually. 1950 */ 1951 if (zs_register_shrinker(pool) == 0) 1952 pool->shrinker_enabled = true; 1953 return pool; 1954 1955 err: 1956 zs_destroy_pool(pool); 1957 return NULL; 1958 } 1959 EXPORT_SYMBOL_GPL(zs_create_pool); 1960 1961 void zs_destroy_pool(struct zs_pool *pool) 1962 { 1963 int i; 1964 1965 zs_unregister_shrinker(pool); 1966 zs_pool_stat_destroy(pool); 1967 1968 for (i = 0; i < zs_size_classes; i++) { 1969 int fg; 1970 struct size_class *class = pool->size_class[i]; 1971 1972 if (!class) 1973 continue; 1974 1975 if (class->index != i) 1976 continue; 1977 1978 for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) { 1979 if (class->fullness_list[fg]) { 1980 pr_info("Freeing non-empty class with size %db, fullness group %d\n", 1981 class->size, fg); 1982 } 1983 } 1984 kfree(class); 1985 } 1986 1987 destroy_handle_cache(pool); 1988 kfree(pool->size_class); 1989 kfree(pool->name); 1990 kfree(pool); 1991 } 1992 EXPORT_SYMBOL_GPL(zs_destroy_pool); 1993 1994 static int __init zs_init(void) 1995 { 1996 int ret = zs_register_cpu_notifier(); 1997 1998 if (ret) 1999 goto notifier_fail; 2000 2001 init_zs_size_classes(); 2002 2003 #ifdef CONFIG_ZPOOL 2004 zpool_register_driver(&zs_zpool_driver); 2005 #endif 2006 2007 
ret = zs_stat_init(); 2008 if (ret) { 2009 pr_err("zs stat initialization failed\n"); 2010 goto stat_fail; 2011 } 2012 return 0; 2013 2014 stat_fail: 2015 #ifdef CONFIG_ZPOOL 2016 zpool_unregister_driver(&zs_zpool_driver); 2017 #endif 2018 notifier_fail: 2019 zs_unregister_cpu_notifier(); 2020 2021 return ret; 2022 } 2023 2024 static void __exit zs_exit(void) 2025 { 2026 #ifdef CONFIG_ZPOOL 2027 zpool_unregister_driver(&zs_zpool_driver); 2028 #endif 2029 zs_unregister_cpu_notifier(); 2030 2031 zs_stat_exit(); 2032 } 2033 2034 module_init(zs_init); 2035 module_exit(zs_exit); 2036 2037 MODULE_LICENSE("Dual BSD/GPL"); 2038 MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>"); 2039