/*
 * zsmalloc memory allocator
 *
 * Copyright (C) 2011 Nitin Gupta
 * Copyright (C) 2012, 2013 Minchan Kim
 *
 * This code is released using a dual license strategy: BSD/GPL
 * You can choose the license that better fits your requirements.
 *
 * Released under the terms of 3-clause BSD License
 * Released under the terms of GNU General Public License Version 2.0
 */

/*
 * Following is how we use various fields and flags of underlying
 * struct page(s) to form a zspage.
 *
 * Usage of struct page fields:
 *	page->first_page: points to the first component (0-order) page
 *	page->index (union with page->freelist): offset of the first object
 *		starting in this page. For the first page, this is
 *		always 0, so we use this field (aka freelist) to point
 *		to the first free object in zspage.
 *	page->lru: links together all component pages (except the first page)
 *		of a zspage
 *
 *	For _first_ page only:
 *
 *	page->private (union with page->first_page): refers to the
 *		component page after the first page
 *		If the page is first_page for huge object, it stores handle.
 *		Look at size_class->huge.
 *	page->freelist: points to the first free object in zspage.
 *		Free objects are linked together using in-place
 *		metadata.
 *	page->objects: maximum number of objects we can store in this
 *		zspage (class->zspage_order * PAGE_SIZE / class->size)
 *	page->lru: links together first pages of various zspages.
 *		Basically forming list of zspages in a fullness group.
 *	page->mapping: class index and fullness group of the zspage
 *
 * Usage of struct page flags:
 *	PG_private: identifies the first component page
 *	PG_private2: identifies the last component page
 *
 */

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>

/*
 * This must be power of 2 and greater than or equal to sizeof(link_free).
 * These two conditions ensure that any 'struct link_free' itself doesn't
 * span more than 1 page which avoids complex case of mapping 2 pages simply
 * to restore link_free pointer values.
 */
#define ZS_ALIGN		8

/*
 * A single 'zspage' is composed of up to 2^N discontiguous 0-order (single)
 * pages. ZS_MAX_ZSPAGE_ORDER defines upper limit on N.
 */
#define ZS_MAX_ZSPAGE_ORDER 2
#define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER)

#define ZS_HANDLE_SIZE (sizeof(unsigned long))

/*
 * Object location (<PFN>, <obj_idx>) is encoded as
 * a single (unsigned long) handle value.
 *
 * Note that object index <obj_idx> is relative to system
 * page <PFN> it is stored in, so for each sub-page belonging
 * to a zspage, obj_idx starts with 0.
 *
 * This is made more complicated by various memory models and PAE.
 */
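
/*
 * Illustrative layout (the exact widths depend on BITS_PER_LONG,
 * MAX_PHYSMEM_BITS and PAGE_SHIFT, see the definitions below): on a
 * 64-bit system with 4K pages and MAX_PHYSMEM_BITS == 46, the encoded
 * value looks like
 *
 *	[ PFN: 34 bits | obj_idx: 29 bits | tag: 1 bit ]
 *
 * i.e. _PFN_BITS == 34 and OBJ_INDEX_BITS == 64 - 34 - 1 == 29, with the
 * lowest OBJ_TAG_BITS bit left free for tagging.
 */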

#ifndef MAX_PHYSMEM_BITS
#ifdef CONFIG_HIGHMEM64G
#define MAX_PHYSMEM_BITS 36
#else /* !CONFIG_HIGHMEM64G */
/*
 * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just
 * be PAGE_SHIFT - OBJ_TAG_BITS
 */
#define MAX_PHYSMEM_BITS BITS_PER_LONG
#endif
#endif
#define _PFN_BITS		(MAX_PHYSMEM_BITS - PAGE_SHIFT)

/*
 * The memory allocated for a handle keeps the object position by
 * encoding <page, obj_idx>, and the encoded value has room in its
 * least significant bit (ie, look at obj_to_location).
 * We use that bit to synchronize between object access by the
 * user and migration.
 */
#define HANDLE_PIN_BIT	0

/*
 * The head of an allocated object carries OBJ_ALLOCATED_TAG
 * to identify whether the object was allocated or not.
 * It's okay to keep the status bit in the least significant bit because
 * the header keeps a handle, which is a 4byte-aligned address, so we
 * have room for at least two bits.
 */
#define OBJ_ALLOCATED_TAG 1
#define OBJ_TAG_BITS 1
#define OBJ_INDEX_BITS	(BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK	((_AC(1, UL) << OBJ_INDEX_BITS) - 1)

#define MAX(a, b) ((a) >= (b) ? (a) : (b))
/* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */
#define ZS_MIN_ALLOC_SIZE \
	MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS))
/* each chunk includes extra space to keep handle */
#define ZS_MAX_ALLOC_SIZE	PAGE_SIZE

/*
 * On systems with 4K page size, this gives 255 size classes! There is a
 * trade-off here:
 *  - Large number of size classes is potentially wasteful as free pages are
 *    spread across these classes
 *  - Small number of size classes causes large internal fragmentation
 *  - Probably it's better to use specific size classes (empirically
 *    determined). NOTE: all those class sizes must be set as multiple of
 *    ZS_ALIGN to make sure link_free itself never has to span 2 pages.
 *
 *  ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN
 *  (reason above)
 */
#define ZS_SIZE_CLASS_DELTA	(PAGE_SIZE >> 8)
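
/*
 * For example, with PAGE_SIZE == 4096 this delta is 16 bytes and the class
 * sizes run 32, 48, 64, ..., 4096, i.e.
 * (4096 - 32) / 16 + 1 == 255 classes (see init_zs_size_classes()).
 */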

/*
 * We do not maintain any list for completely empty or full pages
 */
enum fullness_group {
	ZS_ALMOST_FULL,
	ZS_ALMOST_EMPTY,
	_ZS_NR_FULLNESS_GROUPS,

	ZS_EMPTY,
	ZS_FULL
};

enum zs_stat_type {
	OBJ_ALLOCATED,
	OBJ_USED,
	CLASS_ALMOST_FULL,
	CLASS_ALMOST_EMPTY,
	NR_ZS_STAT_TYPE,
};

struct zs_size_stat {
	unsigned long objs[NR_ZS_STAT_TYPE];
};

#ifdef CONFIG_ZSMALLOC_STAT
static struct dentry *zs_stat_root;
#endif

/*
 * number of size_classes
 */
static int zs_size_classes;

/*
 * We assign a page to ZS_ALMOST_EMPTY fullness group when:
 *	n <= N / f, where
 * n = number of allocated objects
 * N = total number of objects zspage can store
 * f = fullness_threshold_frac
 *
 * Similarly, we assign zspage to:
 *	ZS_ALMOST_FULL	when n > N / f
 *	ZS_EMPTY	when n == 0
 *	ZS_FULL		when n == N
 *
 * (see: fix_fullness_group())
 */
static const int fullness_threshold_frac = 4;

struct size_class {
	spinlock_t lock;
	struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
	/*
	 * Size of objects stored in this class. Must be multiple
	 * of ZS_ALIGN.
	 */
	int size;
	unsigned int index;

	/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
	int pages_per_zspage;
	struct zs_size_stat stats;

	/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
	bool huge;
};

/*
 * Placed within free objects to form a singly linked list.
 * For every zspage, first_page->freelist gives head of this list.
 *
 * This must be power of 2 and less than or equal to ZS_ALIGN
 */
struct link_free {
	union {
		/*
		 * Position of next free chunk (encodes <PFN, obj_idx>)
		 * It's valid for non-allocated object
		 */
		void *next;
		/*
		 * Handle of allocated object.
		 */
		unsigned long handle;
	};
};

struct zs_pool {
	char *name;

	struct size_class **size_class;
	struct kmem_cache *handle_cachep;

	gfp_t flags;	/* allocation flags used when growing pool */
	atomic_long_t pages_allocated;

	struct zs_pool_stats stats;

	/* Compact classes */
	struct shrinker shrinker;
	/*
	 * To signify that register_shrinker() was successful
	 * and unregister_shrinker() will not Oops.
	 */
	bool shrinker_enabled;
#ifdef CONFIG_ZSMALLOC_STAT
	struct dentry *stat_dentry;
#endif
};

/*
 * A zspage's class index and fullness group
 * are encoded in its (first)page->mapping
 */
#define CLASS_IDX_BITS	28
#define FULLNESS_BITS	4
#define CLASS_IDX_MASK	((1 << CLASS_IDX_BITS) - 1)
#define FULLNESS_MASK	((1 << FULLNESS_BITS) - 1)

struct mapping_area {
#ifdef CONFIG_PGTABLE_MAPPING
	struct vm_struct *vm; /* vm area for mapping object that span pages */
#else
	char *vm_buf; /* copy buffer for objects that span pages */
#endif
	char *vm_addr; /* address of kmap_atomic()'ed pages */
	enum zs_mapmode vm_mm; /* mapping mode */
	bool huge;
};

static int create_handle_cache(struct zs_pool *pool)
{
	pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE,
					0, 0, NULL);
	return pool->handle_cachep ? 0 : 1;
}

static void destroy_handle_cache(struct zs_pool *pool)
{
	kmem_cache_destroy(pool->handle_cachep);
}

static unsigned long alloc_handle(struct zs_pool *pool)
{
	return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
		pool->flags & ~__GFP_HIGHMEM);
}

static void free_handle(struct zs_pool *pool, unsigned long handle)
{
	kmem_cache_free(pool->handle_cachep, (void *)handle);
}

static void record_obj(unsigned long handle, unsigned long obj)
{
	*(unsigned long *)handle = obj;
}
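
/*
 * Note (clarifying the handle scheme described above): a handle is the
 * address of a slab object from handle_cachep which in turn stores the
 * encoded <PFN, obj_idx> value. This extra level of indirection lets
 * compaction move an object and update its location via record_obj()
 * while the handle seen by the user stays unchanged.
 */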

/* zpool driver */

#ifdef CONFIG_ZPOOL

static void *zs_zpool_create(char *name, gfp_t gfp,
			     const struct zpool_ops *zpool_ops,
			     struct zpool *zpool)
{
	return zs_create_pool(name, gfp);
}

static void zs_zpool_destroy(void *pool)
{
	zs_destroy_pool(pool);
}

static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
			unsigned long *handle)
{
	*handle = zs_malloc(pool, size);
	return *handle ? 0 : -1;
}
static void zs_zpool_free(void *pool, unsigned long handle)
{
	zs_free(pool, handle);
}

static int zs_zpool_shrink(void *pool, unsigned int pages,
			unsigned int *reclaimed)
{
	return -EINVAL;
}

static void *zs_zpool_map(void *pool, unsigned long handle,
			enum zpool_mapmode mm)
{
	enum zs_mapmode zs_mm;

	switch (mm) {
	case ZPOOL_MM_RO:
		zs_mm = ZS_MM_RO;
		break;
	case ZPOOL_MM_WO:
		zs_mm = ZS_MM_WO;
		break;
	case ZPOOL_MM_RW: /* fallthru */
	default:
		zs_mm = ZS_MM_RW;
		break;
	}

	return zs_map_object(pool, handle, zs_mm);
}
static void zs_zpool_unmap(void *pool, unsigned long handle)
{
	zs_unmap_object(pool, handle);
}

static u64 zs_zpool_total_size(void *pool)
{
	return zs_get_total_pages(pool) << PAGE_SHIFT;
}

static struct zpool_driver zs_zpool_driver = {
	.type =		"zsmalloc",
	.owner =	THIS_MODULE,
	.create =	zs_zpool_create,
	.destroy =	zs_zpool_destroy,
	.malloc =	zs_zpool_malloc,
	.free =		zs_zpool_free,
	.shrink =	zs_zpool_shrink,
	.map =		zs_zpool_map,
	.unmap =	zs_zpool_unmap,
	.total_size =	zs_zpool_total_size,
};

MODULE_ALIAS("zpool-zsmalloc");
#endif /* CONFIG_ZPOOL */

static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
{
	return pages_per_zspage * PAGE_SIZE / size;
}

/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
static DEFINE_PER_CPU(struct mapping_area, zs_map_area);

static int is_first_page(struct page *page)
{
	return PagePrivate(page);
}

static int is_last_page(struct page *page)
{
	return PagePrivate2(page);
}

static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
				enum fullness_group *fullness)
{
	unsigned long m;
	BUG_ON(!is_first_page(page));

	m = (unsigned long)page->mapping;
	*fullness = m & FULLNESS_MASK;
	*class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
}

static void set_zspage_mapping(struct page *page, unsigned int class_idx,
				enum fullness_group fullness)
{
	unsigned long m;
	BUG_ON(!is_first_page(page));

	m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
			(fullness & FULLNESS_MASK);
	page->mapping = (struct address_space *)m;
}

/*
 * zsmalloc divides the pool into various size classes where each
 * class maintains a list of zspages where each zspage is divided
 * into equal sized chunks. Each allocation falls into one of these
 * classes depending on its size. This function returns index of the
 * size class which has chunk size big enough to hold the given size.
 */
static int get_size_class_index(int size)
{
	int idx = 0;

	if (likely(size > ZS_MIN_ALLOC_SIZE))
		idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE,
				ZS_SIZE_CLASS_DELTA);

	return min(zs_size_classes - 1, idx);
}
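
/*
 * For example, with 4K pages (ZS_MIN_ALLOC_SIZE == 32, ZS_SIZE_CLASS_DELTA
 * == 16) a request for 300 bytes maps to index
 * DIV_ROUND_UP(300 - 32, 16) == 17, i.e. the 32 + 17 * 16 == 304 byte class.
 */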

static inline void zs_stat_inc(struct size_class *class,
				enum zs_stat_type type, unsigned long cnt)
{
	class->stats.objs[type] += cnt;
}

static inline void zs_stat_dec(struct size_class *class,
				enum zs_stat_type type, unsigned long cnt)
{
	class->stats.objs[type] -= cnt;
}

static inline unsigned long zs_stat_get(struct size_class *class,
				enum zs_stat_type type)
{
	return class->stats.objs[type];
}

#ifdef CONFIG_ZSMALLOC_STAT

static int __init zs_stat_init(void)
{
	if (!debugfs_initialized())
		return -ENODEV;

	zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
	if (!zs_stat_root)
		return -ENOMEM;

	return 0;
}

static void __exit zs_stat_exit(void)
{
	debugfs_remove_recursive(zs_stat_root);
}

static int zs_stats_size_show(struct seq_file *s, void *v)
{
	int i;
	struct zs_pool *pool = s->private;
	struct size_class *class;
	int objs_per_zspage;
	unsigned long class_almost_full, class_almost_empty;
	unsigned long obj_allocated, obj_used, pages_used;
	unsigned long total_class_almost_full = 0, total_class_almost_empty = 0;
	unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;

	seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n",
			"class", "size", "almost_full", "almost_empty",
			"obj_allocated", "obj_used", "pages_used",
			"pages_per_zspage");

	for (i = 0; i < zs_size_classes; i++) {
		class = pool->size_class[i];

		if (class->index != i)
			continue;

		spin_lock(&class->lock);
		class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL);
		class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY);
		obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
		obj_used = zs_stat_get(class, OBJ_USED);
		spin_unlock(&class->lock);

		objs_per_zspage = get_maxobj_per_zspage(class->size,
				class->pages_per_zspage);
		pages_used = obj_allocated / objs_per_zspage *
				class->pages_per_zspage;

		seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n",
			i, class->size, class_almost_full, class_almost_empty,
			obj_allocated, obj_used, pages_used,
			class->pages_per_zspage);

		total_class_almost_full += class_almost_full;
		total_class_almost_empty += class_almost_empty;
		total_objs += obj_allocated;
		total_used_objs += obj_used;
		total_pages += pages_used;
	}

	seq_puts(s, "\n");
	seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n",
			"Total", "", total_class_almost_full,
			total_class_almost_empty, total_objs,
			total_used_objs, total_pages);

	return 0;
}

static int zs_stats_size_open(struct inode *inode, struct file *file)
{
	return single_open(file, zs_stats_size_show, inode->i_private);
}

static const struct file_operations zs_stat_size_ops = {
	.open           = zs_stats_size_open,
	.read           = seq_read,
	.llseek         = seq_lseek,
	.release        = single_release,
};

static int zs_pool_stat_create(char *name, struct zs_pool *pool)
{
	struct dentry *entry;

	if (!zs_stat_root)
		return -ENODEV;

	entry = debugfs_create_dir(name, zs_stat_root);
	if (!entry) {
		pr_warn("debugfs dir <%s> creation failed\n", name);
		return -ENOMEM;
	}
	pool->stat_dentry = entry;

	entry = debugfs_create_file("classes", S_IFREG | S_IRUGO,
			pool->stat_dentry, pool, &zs_stat_size_ops);
	if (!entry) {
		pr_warn("%s: debugfs file entry <%s> creation failed\n",
				name, "classes");
		return -ENOMEM;
	}

	return 0;
}

static void zs_pool_stat_destroy(struct zs_pool *pool)
{
	debugfs_remove_recursive(pool->stat_dentry);
}

#else /* CONFIG_ZSMALLOC_STAT */
static int __init zs_stat_init(void)
{
	return 0;
}

static void __exit zs_stat_exit(void)
{
}

static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
{
	return 0;
}

static inline void zs_pool_stat_destroy(struct zs_pool *pool)
{
}
#endif


/*
 * For each size class, zspages are divided into different groups
 * depending on how "full" they are. This was done so that we could
 * easily find empty or nearly empty zspages when we try to shrink
 * the pool (see zs_compact()). This function returns fullness
 * status of the given page.
 */
static enum fullness_group get_fullness_group(struct page *page)
{
	int inuse, max_objects;
	enum fullness_group fg;
	BUG_ON(!is_first_page(page));

	inuse = page->inuse;
	max_objects = page->objects;

	if (inuse == 0)
		fg = ZS_EMPTY;
	else if (inuse == max_objects)
		fg = ZS_FULL;
	else if (inuse <= 3 * max_objects / fullness_threshold_frac)
		fg = ZS_ALMOST_EMPTY;
	else
		fg = ZS_ALMOST_FULL;

	return fg;
}
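
/*
 * For example, a zspage with page->objects == 8 is ZS_ALMOST_EMPTY while
 * 1..6 objects are in use (inuse <= 3 * 8 / 4) and ZS_ALMOST_FULL once
 * 7 are; 0 and 8 map to ZS_EMPTY and ZS_FULL respectively.
 */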

/*
 * Each size class maintains various freelists and zspages are assigned
 * to one of these freelists based on the number of live objects they
 * have. This function inserts the given zspage into the freelist
 * identified by <class, fullness_group>.
 */
static void insert_zspage(struct page *page, struct size_class *class,
				enum fullness_group fullness)
{
	struct page **head;

	BUG_ON(!is_first_page(page));

	if (fullness >= _ZS_NR_FULLNESS_GROUPS)
		return;

	zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ?
			CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);

	head = &class->fullness_list[fullness];
	if (!*head) {
		*head = page;
		return;
	}

	/*
	 * We want to see more ZS_FULL pages and fewer almost
	 * empty/full ones. Put pages with higher ->inuse first.
	 */
	list_add_tail(&page->lru, &(*head)->lru);
	if (page->inuse >= (*head)->inuse)
		*head = page;
}

/*
 * This function removes the given zspage from the freelist identified
 * by <class, fullness_group>.
 */
static void remove_zspage(struct page *page, struct size_class *class,
				enum fullness_group fullness)
{
	struct page **head;

	BUG_ON(!is_first_page(page));

	if (fullness >= _ZS_NR_FULLNESS_GROUPS)
		return;

	head = &class->fullness_list[fullness];
	BUG_ON(!*head);
	if (list_empty(&(*head)->lru))
		*head = NULL;
	else if (*head == page)
		*head = (struct page *)list_entry((*head)->lru.next,
					struct page, lru);

	list_del_init(&page->lru);
	zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?
			CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
}

/*
 * Each size class maintains zspages in different fullness groups depending
 * on the number of live objects they contain. When allocating or freeing
 * objects, the fullness status of the page can change, say, from ALMOST_FULL
 * to ALMOST_EMPTY when freeing an object. This function checks if such
 * a status change has occurred for the given page and accordingly moves the
 * page from the freelist of the old fullness group to that of the new
 * fullness group.
 */
static enum fullness_group fix_fullness_group(struct size_class *class,
						struct page *page)
{
	int class_idx;
	enum fullness_group currfg, newfg;

	BUG_ON(!is_first_page(page));

	get_zspage_mapping(page, &class_idx, &currfg);
	newfg = get_fullness_group(page);
	if (newfg == currfg)
		goto out;

	remove_zspage(page, class, currfg);
	insert_zspage(page, class, newfg);
	set_zspage_mapping(page, class_idx, newfg);

out:
	return newfg;
}

/*
 * We have to decide on how many pages to link together
 * to form a zspage for each size class. This is important
 * to reduce wastage due to unusable space left at end of
 * each zspage which is given as:
 *	wastage = Zp % class_size
 *	usage = Zp - wastage
 * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ...
 *
 * For example, for size class of 3/8 * PAGE_SIZE, we should
 * link together 3 PAGE_SIZE sized pages to form a zspage
 * since then we can perfectly fit in 8 such objects.
 */
static int get_pages_per_zspage(int class_size)
{
	int i, max_usedpc = 0;
	/* zspage order which gives maximum used size per KB */
	int max_usedpc_order = 1;

	for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) {
		int zspage_size;
		int waste, usedpc;

		zspage_size = i * PAGE_SIZE;
		waste = zspage_size % class_size;
		usedpc = (zspage_size - waste) * 100 / zspage_size;

		if (usedpc > max_usedpc) {
			max_usedpc = usedpc;
			max_usedpc_order = i;
		}
	}

	return max_usedpc_order;
}
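
/*
 * Another worked example with 4K pages and ZS_MAX_PAGES_PER_ZSPAGE == 4:
 * for a 720-byte class the used percentages are 87% (1 page), 96% (2),
 * 99% (3) and 96% (4), so get_pages_per_zspage() picks 3 pages per zspage.
 */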

/*
 * A single 'zspage' is composed of many system pages which are
 * linked together using fields in struct page. This function finds
 * the first/head page, given any component page of a zspage.
 */
static struct page *get_first_page(struct page *page)
{
	if (is_first_page(page))
		return page;
	else
		return page->first_page;
}

static struct page *get_next_page(struct page *page)
{
	struct page *next;

	if (is_last_page(page))
		next = NULL;
	else if (is_first_page(page))
		next = (struct page *)page_private(page);
	else
		next = list_entry(page->lru.next, struct page, lru);

	return next;
}

/*
 * Encode <page, obj_idx> as a single handle value.
 * We use the least bit of handle for tagging.
 */
static void *location_to_obj(struct page *page, unsigned long obj_idx)
{
	unsigned long obj;

	if (!page) {
		BUG_ON(obj_idx);
		return NULL;
	}

	obj = page_to_pfn(page) << OBJ_INDEX_BITS;
	obj |= ((obj_idx) & OBJ_INDEX_MASK);
	obj <<= OBJ_TAG_BITS;

	return (void *)obj;
}

/*
 * Decode <page, obj_idx> pair from the given object value, undoing the
 * shifts applied by location_to_obj().
 */
static void obj_to_location(unsigned long obj, struct page **page,
				unsigned long *obj_idx)
{
	obj >>= OBJ_TAG_BITS;
	*page = pfn_to_page(obj >> OBJ_INDEX_BITS);
	*obj_idx = (obj & OBJ_INDEX_MASK);
}

static unsigned long handle_to_obj(unsigned long handle)
{
	return *(unsigned long *)handle;
}

static unsigned long obj_to_head(struct size_class *class, struct page *page,
			void *obj)
{
	if (class->huge) {
		VM_BUG_ON(!is_first_page(page));
		return *(unsigned long *)page_private(page);
	} else
		return *(unsigned long *)obj;
}

static unsigned long obj_idx_to_offset(struct page *page,
				unsigned long obj_idx, int class_size)
{
	unsigned long off = 0;

	if (!is_first_page(page))
		off = page->index;

	return off + obj_idx * class_size;
}
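
/*
 * For example, in a 304-byte class whose previous page's last object spills
 * 160 bytes into this sub-page, page->index == 160 and the object with
 * obj_idx 2 starts at offset 160 + 2 * 304 == 768 within the sub-page.
 */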

static inline int trypin_tag(unsigned long handle)
{
	unsigned long *ptr = (unsigned long *)handle;

	return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr);
}

static void pin_tag(unsigned long handle)
{
	while (!trypin_tag(handle));
}

static void unpin_tag(unsigned long handle)
{
	unsigned long *ptr = (unsigned long *)handle;

	clear_bit_unlock(HANDLE_PIN_BIT, ptr);
}

static void reset_page(struct page *page)
{
	clear_bit(PG_private, &page->flags);
	clear_bit(PG_private_2, &page->flags);
	set_page_private(page, 0);
	page->mapping = NULL;
	page->freelist = NULL;
	page_mapcount_reset(page);
}

static void free_zspage(struct page *first_page)
{
	struct page *nextp, *tmp, *head_extra;

	BUG_ON(!is_first_page(first_page));
	BUG_ON(first_page->inuse);

	head_extra = (struct page *)page_private(first_page);

	reset_page(first_page);
	__free_page(first_page);

	/* zspage with only 1 system page */
	if (!head_extra)
		return;

	list_for_each_entry_safe(nextp, tmp, &head_extra->lru, lru) {
		list_del(&nextp->lru);
		reset_page(nextp);
		__free_page(nextp);
	}
	reset_page(head_extra);
	__free_page(head_extra);
}

/* Initialize a newly allocated zspage */
static void init_zspage(struct page *first_page, struct size_class *class)
{
	unsigned long off = 0;
	struct page *page = first_page;

	BUG_ON(!is_first_page(first_page));
	while (page) {
		struct page *next_page;
		struct link_free *link;
		unsigned int i = 1;
		void *vaddr;

		/*
		 * page->index stores offset of first object starting
		 * in the page. For the first page, this is always 0,
		 * so we use first_page->index (aka ->freelist) to store
		 * head of corresponding zspage's freelist.
		 */
		if (page != first_page)
			page->index = off;

		vaddr = kmap_atomic(page);
		link = (struct link_free *)vaddr + off / sizeof(*link);

		while ((off += class->size) < PAGE_SIZE) {
			link->next = location_to_obj(page, i++);
			link += class->size / sizeof(*link);
		}

		/*
		 * We now come to the last (full or partial) object on this
		 * page, which must point to the first object on the next
		 * page (if present)
		 */
		next_page = get_next_page(page);
		link->next = location_to_obj(next_page, 0);
		kunmap_atomic(vaddr);
		page = next_page;
		off %= PAGE_SIZE;
	}
}
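
/*
 * After init_zspage() (and once alloc_zspage() below points
 * first_page->freelist at <first_page, 0>), the free objects form a single
 * chain threaded through the objects themselves:
 * <page0, 0> -> <page0, 1> -> ... -> <page1, 0> -> ..., and the final
 * link->next encodes <NULL, 0>, which location_to_obj() returns as NULL.
 */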

/*
 * Allocate a zspage for the given size class
 */
static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
{
	int i, error;
	struct page *first_page = NULL, *uninitialized_var(prev_page);

	/*
	 * Allocate individual pages and link them together as:
	 * 1. first page->private = first sub-page
	 * 2. all sub-pages are linked together using page->lru
	 * 3. each sub-page is linked to the first page using page->first_page
	 *
	 * For each size class, First/Head pages are linked together using
	 * page->lru. Also, we set PG_private to identify the first page
	 * (i.e. no other sub-page has this flag set) and PG_private_2 to
	 * identify the last page.
	 */
	error = -ENOMEM;
	for (i = 0; i < class->pages_per_zspage; i++) {
		struct page *page;

		page = alloc_page(flags);
		if (!page)
			goto cleanup;

		INIT_LIST_HEAD(&page->lru);
		if (i == 0) {	/* first page */
			SetPagePrivate(page);
			set_page_private(page, 0);
			first_page = page;
			first_page->inuse = 0;
		}
		if (i == 1)
			set_page_private(first_page, (unsigned long)page);
		if (i >= 1)
			page->first_page = first_page;
		if (i >= 2)
			list_add(&page->lru, &prev_page->lru);
		if (i == class->pages_per_zspage - 1)	/* last page */
			SetPagePrivate2(page);
		prev_page = page;
	}

	init_zspage(first_page, class);

	first_page->freelist = location_to_obj(first_page, 0);
	/* Maximum number of objects we can store in this zspage */
	first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size;

	error = 0; /* Success */

cleanup:
	if (unlikely(error) && first_page) {
		free_zspage(first_page);
		first_page = NULL;
	}

	return first_page;
}

static struct page *find_get_zspage(struct size_class *class)
{
	int i;
	struct page *page;

	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
		page = class->fullness_list[i];
		if (page)
			break;
	}

	return page;
}

#ifdef CONFIG_PGTABLE_MAPPING
static inline int __zs_cpu_up(struct mapping_area *area)
{
	/*
	 * Make sure we don't leak memory if a cpu UP notification
	 * and zs_init() race and both call zs_cpu_up() on the same cpu
	 */
	if (area->vm)
		return 0;
	area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
	if (!area->vm)
		return -ENOMEM;
	return 0;
}

static inline void __zs_cpu_down(struct mapping_area *area)
{
	if (area->vm)
		free_vm_area(area->vm);
	area->vm = NULL;
}

static inline void *__zs_map_object(struct mapping_area *area,
				struct page *pages[2], int off, int size)
{
	BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages));
	area->vm_addr = area->vm->addr;
	return area->vm_addr + off;
}

static inline void __zs_unmap_object(struct mapping_area *area,
				struct page *pages[2], int off, int size)
{
	unsigned long addr = (unsigned long)area->vm_addr;

	unmap_kernel_range(addr, PAGE_SIZE * 2);
}

#else /* CONFIG_PGTABLE_MAPPING */

static inline int __zs_cpu_up(struct mapping_area *area)
{
	/*
	 * Make sure we don't leak memory if a cpu UP notification
	 * and zs_init() race and both call zs_cpu_up() on the same cpu
	 */
	if (area->vm_buf)
		return 0;
	area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL);
	if (!area->vm_buf)
		return -ENOMEM;
	return 0;
}

static inline void __zs_cpu_down(struct mapping_area *area)
{
	kfree(area->vm_buf);
	area->vm_buf = NULL;
}

static void *__zs_map_object(struct mapping_area *area,
			struct page *pages[2], int off, int size)
{
	int sizes[2];
	void *addr;
	char *buf = area->vm_buf;

	/* disable page faults to match kmap_atomic() return conditions */
	pagefault_disable();

	/* no read fastpath */
	if (area->vm_mm == ZS_MM_WO)
		goto out;

	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	/* copy object to per-cpu buffer */
	addr = kmap_atomic(pages[0]);
	memcpy(buf, addr + off, sizes[0]);
	kunmap_atomic(addr);
	addr = kmap_atomic(pages[1]);
	memcpy(buf + sizes[0], addr, sizes[1]);
	kunmap_atomic(addr);
out:
	return area->vm_buf;
}

static void __zs_unmap_object(struct mapping_area *area,
			struct page *pages[2], int off, int size)
{
	int sizes[2];
	void *addr;
	char *buf;

	/* no write fastpath */
	if (area->vm_mm == ZS_MM_RO)
		goto out;

	buf = area->vm_buf;
	if (!area->huge) {
		buf = buf + ZS_HANDLE_SIZE;
		size -= ZS_HANDLE_SIZE;
		off += ZS_HANDLE_SIZE;
	}

	sizes[0] = PAGE_SIZE - off;
	sizes[1] = size - sizes[0];

	/* copy per-cpu buffer to object */
	addr = kmap_atomic(pages[0]);
	memcpy(addr + off, buf, sizes[0]);
	kunmap_atomic(addr);
	addr = kmap_atomic(pages[1]);
	memcpy(addr, buf + sizes[0], sizes[1]);
	kunmap_atomic(addr);

out:
	/* enable page faults to match kunmap_atomic() return conditions */
	pagefault_enable();
}

#endif /* CONFIG_PGTABLE_MAPPING */

static int zs_cpu_notifier(struct notifier_block *nb, unsigned long action,
				void *pcpu)
{
	int ret, cpu = (long)pcpu;
	struct mapping_area *area;

	switch (action) {
	case CPU_UP_PREPARE:
		area = &per_cpu(zs_map_area, cpu);
		ret = __zs_cpu_up(area);
		if (ret)
			return notifier_from_errno(ret);
		break;
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		area = &per_cpu(zs_map_area, cpu);
		__zs_cpu_down(area);
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block zs_cpu_nb = {
	.notifier_call = zs_cpu_notifier
};

static int zs_register_cpu_notifier(void)
{
	int cpu, uninitialized_var(ret);

	cpu_notifier_register_begin();

	__register_cpu_notifier(&zs_cpu_nb);
	for_each_online_cpu(cpu) {
		ret = zs_cpu_notifier(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
		if (notifier_to_errno(ret))
			break;
	}

	cpu_notifier_register_done();
	return notifier_to_errno(ret);
}

static void zs_unregister_cpu_notifier(void)
{
	int cpu;

	cpu_notifier_register_begin();

	for_each_online_cpu(cpu)
		zs_cpu_notifier(NULL, CPU_DEAD, (void *)(long)cpu);
	__unregister_cpu_notifier(&zs_cpu_nb);

	cpu_notifier_register_done();
}

static void init_zs_size_classes(void)
{
	int nr;

	nr = (ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) / ZS_SIZE_CLASS_DELTA + 1;
	if ((ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE) % ZS_SIZE_CLASS_DELTA)
		nr += 1;

	zs_size_classes = nr;
}

static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
{
	if (prev->pages_per_zspage != pages_per_zspage)
		return false;

	if (get_maxobj_per_zspage(prev->size, prev->pages_per_zspage)
		!= get_maxobj_per_zspage(size, pages_per_zspage))
		return false;

	return true;
}

static bool zspage_full(struct page *page)
{
	BUG_ON(!is_first_page(page));

	return page->inuse == page->objects;
}

unsigned long zs_get_total_pages(struct zs_pool *pool)
{
	return atomic_long_read(&pool->pages_allocated);
}
EXPORT_SYMBOL_GPL(zs_get_total_pages);

/**
 * zs_map_object - get address of allocated object from handle.
 * @pool: pool from which the object was allocated
 * @handle: handle returned from zs_malloc
 *
 * Before using an object allocated from zs_malloc, it must be mapped using
 * this function. When done with the object, it must be unmapped using
 * zs_unmap_object.
 *
 * Only one object can be mapped per cpu at a time. There is no protection
 * against nested mappings.
 *
 * This function returns with preemption and page faults disabled.
 */
void *zs_map_object(struct zs_pool *pool, unsigned long handle,
			enum zs_mapmode mm)
{
	struct page *page;
	unsigned long obj, obj_idx, off;

	unsigned int class_idx;
	enum fullness_group fg;
	struct size_class *class;
	struct mapping_area *area;
	struct page *pages[2];
	void *ret;

	BUG_ON(!handle);

	/*
	 * Because we use per-cpu mapping areas shared among the
	 * pools/users, we can't allow mapping in interrupt context
	 * because it can corrupt another user's mappings.
	 */
	BUG_ON(in_interrupt());

	/* From now on, migration cannot move the object */
	pin_tag(handle);

	obj = handle_to_obj(handle);
	obj_to_location(obj, &page, &obj_idx);
	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
	class = pool->size_class[class_idx];
	off = obj_idx_to_offset(page, obj_idx, class->size);

	area = &get_cpu_var(zs_map_area);
	area->vm_mm = mm;
	if (off + class->size <= PAGE_SIZE) {
		/* this object is contained entirely within a page */
		area->vm_addr = kmap_atomic(page);
		ret = area->vm_addr + off;
		goto out;
	}

	/* this object spans two pages */
	pages[0] = page;
	pages[1] = get_next_page(page);
	BUG_ON(!pages[1]);

	ret = __zs_map_object(area, pages, off, class->size);
out:
	if (!class->huge)
		ret += ZS_HANDLE_SIZE;

	return ret;
}
EXPORT_SYMBOL_GPL(zs_map_object);

void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
{
	struct page *page;
	unsigned long obj, obj_idx, off;

	unsigned int class_idx;
	enum fullness_group fg;
	struct size_class *class;
	struct mapping_area *area;

	BUG_ON(!handle);

	obj = handle_to_obj(handle);
	obj_to_location(obj, &page, &obj_idx);
	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
	class = pool->size_class[class_idx];
	off = obj_idx_to_offset(page, obj_idx, class->size);

	area = this_cpu_ptr(&zs_map_area);
	if (off + class->size <= PAGE_SIZE)
		kunmap_atomic(area->vm_addr);
	else {
		struct page *pages[2];

		pages[0] = page;
		pages[1] = get_next_page(page);
		BUG_ON(!pages[1]);

		__zs_unmap_object(area, pages, off, class->size);
	}
	put_cpu_var(zs_map_area);
	unpin_tag(handle);
}
EXPORT_SYMBOL_GPL(zs_unmap_object);
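
/*
 * Illustrative usage of this allocation/mapping API (not taken from any
 * particular caller; error handling omitted):
 *
 *	handle = zs_malloc(pool, len);
 *	dst = zs_map_object(pool, handle, ZS_MM_WO);
 *	memcpy(dst, src, len);
 *	zs_unmap_object(pool, handle);
 *
 * The object must be unmapped before the caller sleeps or maps another
 * object on this cpu, and may only be accessed again through a fresh
 * mapping.
 */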

static unsigned long obj_malloc(struct page *first_page,
		struct size_class *class, unsigned long handle)
{
	unsigned long obj;
	struct link_free *link;

	struct page *m_page;
	unsigned long m_objidx, m_offset;
	void *vaddr;

	handle |= OBJ_ALLOCATED_TAG;
	obj = (unsigned long)first_page->freelist;
	obj_to_location(obj, &m_page, &m_objidx);
	m_offset = obj_idx_to_offset(m_page, m_objidx, class->size);

	vaddr = kmap_atomic(m_page);
	link = (struct link_free *)vaddr + m_offset / sizeof(*link);
	first_page->freelist = link->next;
	if (!class->huge)
		/* record handle in the header of allocated chunk */
		link->handle = handle;
	else
		/* record handle in first_page->private */
		set_page_private(first_page, handle);
	kunmap_atomic(vaddr);
	first_page->inuse++;
	zs_stat_inc(class, OBJ_USED, 1);

	return obj;
}


/**
 * zs_malloc - Allocate block of given size from pool.
 * @pool: pool to allocate from
 * @size: size of block to allocate
 *
 * On success, handle to the allocated object is returned,
 * otherwise 0.
 * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
 */
unsigned long zs_malloc(struct zs_pool *pool, size_t size)
{
	unsigned long handle, obj;
	struct size_class *class;
	struct page *first_page;

	if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
		return 0;

	handle = alloc_handle(pool);
	if (!handle)
		return 0;

	/* extra space in chunk to keep the handle */
	size += ZS_HANDLE_SIZE;
	class = pool->size_class[get_size_class_index(size)];

	spin_lock(&class->lock);
	first_page = find_get_zspage(class);

	if (!first_page) {
		spin_unlock(&class->lock);
		first_page = alloc_zspage(class, pool->flags);
		if (unlikely(!first_page)) {
			free_handle(pool, handle);
			return 0;
		}

		set_zspage_mapping(first_page, class->index, ZS_EMPTY);
		atomic_long_add(class->pages_per_zspage,
					&pool->pages_allocated);

		spin_lock(&class->lock);
		zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
				class->size, class->pages_per_zspage));
	}

	obj = obj_malloc(first_page, class, handle);
	/* Now move the zspage to another fullness group, if required */
	fix_fullness_group(class, first_page);
	record_obj(handle, obj);
	spin_unlock(&class->lock);

	return handle;
}
EXPORT_SYMBOL_GPL(zs_malloc);

static void obj_free(struct zs_pool *pool, struct size_class *class,
			unsigned long obj)
{
	struct link_free *link;
	struct page *first_page, *f_page;
	unsigned long f_objidx, f_offset;
	void *vaddr;
	int class_idx;
	enum fullness_group fullness;

	BUG_ON(!obj);

	obj &= ~OBJ_ALLOCATED_TAG;
	obj_to_location(obj, &f_page, &f_objidx);
	first_page = get_first_page(f_page);

	get_zspage_mapping(first_page, &class_idx, &fullness);
	f_offset = obj_idx_to_offset(f_page, f_objidx, class->size);

	vaddr = kmap_atomic(f_page);

	/* Insert this object in containing zspage's freelist */
	link = (struct link_free *)(vaddr + f_offset);
	link->next = first_page->freelist;
	if (class->huge)
		set_page_private(first_page, 0);
	kunmap_atomic(vaddr);
	first_page->freelist = (void *)obj;
	first_page->inuse--;
	zs_stat_dec(class, OBJ_USED, 1);
}

void zs_free(struct zs_pool *pool, unsigned long handle)
{
	struct page *first_page, *f_page;
	unsigned long obj, f_objidx;
	int class_idx;
	struct size_class *class;
	enum fullness_group fullness;

	if (unlikely(!handle))
		return;

	pin_tag(handle);
	obj = handle_to_obj(handle);
	obj_to_location(obj, &f_page, &f_objidx);
	first_page = get_first_page(f_page);

	get_zspage_mapping(first_page, &class_idx, &fullness);
	class = pool->size_class[class_idx];

	spin_lock(&class->lock);
	obj_free(pool, class, obj);
	fullness = fix_fullness_group(class, first_page);
	if (fullness == ZS_EMPTY) {
		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
				class->size, class->pages_per_zspage));
		atomic_long_sub(class->pages_per_zspage,
				&pool->pages_allocated);
		free_zspage(first_page);
	}
	spin_unlock(&class->lock);
	unpin_tag(handle);

	free_handle(pool, handle);
}
EXPORT_SYMBOL_GPL(zs_free);

static void zs_object_copy(unsigned long dst, unsigned long src,
				struct size_class *class)
{
	struct page *s_page, *d_page;
	unsigned long s_objidx, d_objidx;
	unsigned long s_off, d_off;
	void *s_addr, *d_addr;
	int s_size, d_size, size;
	int written = 0;

	s_size = d_size = class->size;

	obj_to_location(src, &s_page, &s_objidx);
	obj_to_location(dst, &d_page, &d_objidx);

	s_off = obj_idx_to_offset(s_page, s_objidx, class->size);
	d_off = obj_idx_to_offset(d_page, d_objidx, class->size);

	if (s_off + class->size > PAGE_SIZE)
		s_size = PAGE_SIZE - s_off;

	if (d_off + class->size > PAGE_SIZE)
		d_size = PAGE_SIZE - d_off;

	s_addr = kmap_atomic(s_page);
	d_addr = kmap_atomic(d_page);

	while (1) {
		size = min(s_size, d_size);
		memcpy(d_addr + d_off, s_addr + s_off, size);
		written += size;

		if (written == class->size)
			break;

		s_off += size;
		s_size -= size;
		d_off += size;
		d_size -= size;

		if (s_off >= PAGE_SIZE) {
			kunmap_atomic(d_addr);
			kunmap_atomic(s_addr);
			s_page = get_next_page(s_page);
			BUG_ON(!s_page);
			s_addr = kmap_atomic(s_page);
			d_addr = kmap_atomic(d_page);
			s_size = class->size - written;
			s_off = 0;
		}

		if (d_off >= PAGE_SIZE) {
			kunmap_atomic(d_addr);
			d_page = get_next_page(d_page);
			BUG_ON(!d_page);
			d_addr = kmap_atomic(d_page);
			d_size = class->size - written;
			d_off = 0;
		}
	}

	kunmap_atomic(d_addr);
	kunmap_atomic(s_addr);
}

/*
 * Find an allocated object in the zspage, starting the search at the given
 * object index, and return its handle (already pinned via trypin_tag()).
 */
static unsigned long find_alloced_obj(struct page *page, int index,
					struct size_class *class)
{
	unsigned long head;
	int offset = 0;
	unsigned long handle = 0;
	void *addr = kmap_atomic(page);

	if (!is_first_page(page))
		offset = page->index;
	offset += class->size * index;

	while (offset < PAGE_SIZE) {
		head = obj_to_head(class, page, addr + offset);
		if (head & OBJ_ALLOCATED_TAG) {
			handle = head & ~OBJ_ALLOCATED_TAG;
			if (trypin_tag(handle))
				break;
			handle = 0;
		}

		offset += class->size;
		index++;
	}

	kunmap_atomic(addr);
	return handle;
}

struct zs_compact_control {
	/* Source page for migration which could be a subpage of zspage. */
	struct page *s_page;
	/* Destination page for migration which should be a first page
	 * of zspage. */
	struct page *d_page;
	/* Starting object index within @s_page, used to scan for live
	 * objects in the subpage. */
	int index;
};

static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
				struct zs_compact_control *cc)
{
	unsigned long used_obj, free_obj;
	unsigned long handle;
	struct page *s_page = cc->s_page;
	struct page *d_page = cc->d_page;
	unsigned long index = cc->index;
	int ret = 0;

	while (1) {
		handle = find_alloced_obj(s_page, index, class);
		if (!handle) {
			s_page = get_next_page(s_page);
			if (!s_page)
				break;
			index = 0;
			continue;
		}

		/* Stop if there is no more space */
		if (zspage_full(d_page)) {
			unpin_tag(handle);
			ret = -ENOMEM;
			break;
		}

		used_obj = handle_to_obj(handle);
		free_obj = obj_malloc(d_page, class, handle);
		zs_object_copy(free_obj, used_obj, class);
		index++;
		record_obj(handle, free_obj);
		unpin_tag(handle);
		obj_free(pool, class, used_obj);
	}

	/* Remember last position in this iteration */
	cc->s_page = s_page;
	cc->index = index;

	return ret;
}

static struct page *isolate_target_page(struct size_class *class)
{
	int i;
	struct page *page;

	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
		page = class->fullness_list[i];
		if (page) {
			remove_zspage(page, class, i);
			break;
		}
	}

	return page;
}

/*
 * putback_zspage - add @first_page into right class's fullness list
 * @pool: target pool
 * @class: destination class
 * @first_page: target page
 *
 * Return @first_page's fullness_group
 */
static enum fullness_group putback_zspage(struct zs_pool *pool,
				struct size_class *class,
				struct page *first_page)
{
	enum fullness_group fullness;

	BUG_ON(!is_first_page(first_page));

	fullness = get_fullness_group(first_page);
	insert_zspage(first_page, class, fullness);
	set_zspage_mapping(first_page, class->index, fullness);

	if (fullness == ZS_EMPTY) {
		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
			class->size, class->pages_per_zspage));
		atomic_long_sub(class->pages_per_zspage,
				&pool->pages_allocated);

		free_zspage(first_page);
	}

	return fullness;
}

static struct page *isolate_source_page(struct size_class *class)
{
	int i;
	struct page *page = NULL;

	for (i = ZS_ALMOST_EMPTY; i >= ZS_ALMOST_FULL; i--) {
		page = class->fullness_list[i];
		if (!page)
			continue;

		remove_zspage(page, class, i);
		break;
	}

	return page;
}

/*
 * Based on the number of unused allocated objects, calculate
 * and return the number of pages that we can free.
 */
static unsigned long zs_can_compact(struct size_class *class)
{
	unsigned long obj_wasted;

	obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) -
		zs_stat_get(class, OBJ_USED);

	obj_wasted /= get_maxobj_per_zspage(class->size,
			class->pages_per_zspage);

	return obj_wasted * class->pages_per_zspage;
}
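
/*
 * For example, a class with 100 objects allocated, 60 in use, 10 objects
 * per zspage and 2 pages per zspage has (100 - 60) / 10 == 4 reclaimable
 * zspages, i.e. zs_can_compact() reports 4 * 2 == 8 freeable pages.
 */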

static void __zs_compact(struct zs_pool *pool, struct size_class *class)
{
	struct zs_compact_control cc;
	struct page *src_page;
	struct page *dst_page = NULL;

	spin_lock(&class->lock);
	while ((src_page = isolate_source_page(class))) {

		BUG_ON(!is_first_page(src_page));

		if (!zs_can_compact(class))
			break;

		cc.index = 0;
		cc.s_page = src_page;

		while ((dst_page = isolate_target_page(class))) {
			cc.d_page = dst_page;
			/*
			 * If there is no more space in dst_page, resched
			 * and see if anyone had allocated another zspage.
			 */
			if (!migrate_zspage(pool, class, &cc))
				break;

			putback_zspage(pool, class, dst_page);
		}

		/* Stop if we couldn't find a slot */
		if (dst_page == NULL)
			break;

		putback_zspage(pool, class, dst_page);
		if (putback_zspage(pool, class, src_page) == ZS_EMPTY)
			pool->stats.pages_compacted += class->pages_per_zspage;
		spin_unlock(&class->lock);
		cond_resched();
		spin_lock(&class->lock);
	}

	if (src_page)
		putback_zspage(pool, class, src_page);

	spin_unlock(&class->lock);
}

unsigned long zs_compact(struct zs_pool *pool)
{
	int i;
	struct size_class *class;

	for (i = zs_size_classes - 1; i >= 0; i--) {
		class = pool->size_class[i];
		if (!class)
			continue;
		if (class->index != i)
			continue;
		__zs_compact(pool, class);
	}

	return pool->stats.pages_compacted;
}
EXPORT_SYMBOL_GPL(zs_compact);

void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats)
{
	memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats));
}
EXPORT_SYMBOL_GPL(zs_pool_stats);

static unsigned long zs_shrinker_scan(struct shrinker *shrinker,
		struct shrink_control *sc)
{
	unsigned long pages_freed;
	struct zs_pool *pool = container_of(shrinker, struct zs_pool,
			shrinker);

	pages_freed = pool->stats.pages_compacted;
	/*
	 * Compact classes and calculate compaction delta.
	 * Can run concurrently with a manually triggered
	 * (by user) compaction.
	 */
	pages_freed = zs_compact(pool) - pages_freed;

	return pages_freed ? pages_freed : SHRINK_STOP;
}

static unsigned long zs_shrinker_count(struct shrinker *shrinker,
		struct shrink_control *sc)
{
	int i;
	struct size_class *class;
	unsigned long pages_to_free = 0;
	struct zs_pool *pool = container_of(shrinker, struct zs_pool,
			shrinker);

	if (!pool->shrinker_enabled)
		return 0;

	for (i = zs_size_classes - 1; i >= 0; i--) {
		class = pool->size_class[i];
		if (!class)
			continue;
		if (class->index != i)
			continue;

		pages_to_free += zs_can_compact(class);
	}

	return pages_to_free;
}

static void zs_unregister_shrinker(struct zs_pool *pool)
{
	if (pool->shrinker_enabled) {
		unregister_shrinker(&pool->shrinker);
		pool->shrinker_enabled = false;
	}
}

static int zs_register_shrinker(struct zs_pool *pool)
{
	pool->shrinker.scan_objects = zs_shrinker_scan;
	pool->shrinker.count_objects = zs_shrinker_count;
	pool->shrinker.batch = 0;
	pool->shrinker.seeks = DEFAULT_SEEKS;

	return register_shrinker(&pool->shrinker);
}

/**
 * zs_create_pool - Creates an allocation pool to work from.
 * @name: pool name
 * @flags: allocation flags used to allocate pool metadata
 *
 * This function must be called before anything when using
 * the zsmalloc allocator.
 *
 * On success, a pointer to the newly created pool is returned,
 * otherwise NULL.
 */
struct zs_pool *zs_create_pool(char *name, gfp_t flags)
{
	int i;
	struct zs_pool *pool;
	struct size_class *prev_class = NULL;

	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool)
		return NULL;

	pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
			GFP_KERNEL);
	if (!pool->size_class) {
		kfree(pool);
		return NULL;
	}

	pool->name = kstrdup(name, GFP_KERNEL);
	if (!pool->name)
		goto err;

	if (create_handle_cache(pool))
		goto err;

	/*
	 * Iterate in reverse order, because the size of a size_class that we
	 * want to use for merging should be larger than or equal to the
	 * current size.
	 */
	for (i = zs_size_classes - 1; i >= 0; i--) {
		int size;
		int pages_per_zspage;
		struct size_class *class;

		size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA;
		if (size > ZS_MAX_ALLOC_SIZE)
			size = ZS_MAX_ALLOC_SIZE;
		pages_per_zspage = get_pages_per_zspage(size);

		/*
		 * A size_class is used for normal zsmalloc operation such
		 * as alloc/free for that size. Although it is natural that we
		 * have one size_class for each size, there is a chance that we
		 * can get more memory utilization if we use one size_class for
		 * many different sizes whose size_classes have the same
		 * characteristics. So, we make size_class point to the
		 * previous size_class if possible.
		 */
		if (prev_class) {
			if (can_merge(prev_class, size, pages_per_zspage)) {
				pool->size_class[i] = prev_class;
				continue;
			}
		}

		class = kzalloc(sizeof(struct size_class), GFP_KERNEL);
		if (!class)
			goto err;

		class->size = size;
		class->index = i;
		class->pages_per_zspage = pages_per_zspage;
		if (pages_per_zspage == 1 &&
			get_maxobj_per_zspage(size, pages_per_zspage) == 1)
			class->huge = true;
		spin_lock_init(&class->lock);
		pool->size_class[i] = class;

		prev_class = class;
	}

	pool->flags = flags;

	if (zs_pool_stat_create(name, pool))
		goto err;

	/*
	 * Not critical, we still can use the pool
	 * and user can trigger compaction manually.
	 */
	if (zs_register_shrinker(pool) == 0)
		pool->shrinker_enabled = true;
	return pool;

err:
	zs_destroy_pool(pool);
	return NULL;
}
EXPORT_SYMBOL_GPL(zs_create_pool);

void zs_destroy_pool(struct zs_pool *pool)
{
	int i;

	zs_unregister_shrinker(pool);
	zs_pool_stat_destroy(pool);

	for (i = 0; i < zs_size_classes; i++) {
		int fg;
		struct size_class *class = pool->size_class[i];

		if (!class)
			continue;

		if (class->index != i)
			continue;

		for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
			if (class->fullness_list[fg]) {
				pr_info("Freeing non-empty class with size %db, fullness group %d\n",
					class->size, fg);
			}
		}
		kfree(class);
	}

	destroy_handle_cache(pool);
	kfree(pool->size_class);
	kfree(pool->name);
	kfree(pool);
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);

static int __init zs_init(void)
{
	int ret = zs_register_cpu_notifier();

	if (ret)
		goto notifier_fail;

	init_zs_size_classes();

#ifdef CONFIG_ZPOOL
	zpool_register_driver(&zs_zpool_driver);
#endif

	ret = zs_stat_init();
	if (ret) {
		pr_err("zs stat initialization failed\n");
		goto stat_fail;
	}
	return 0;

stat_fail:
#ifdef CONFIG_ZPOOL
	zpool_unregister_driver(&zs_zpool_driver);
#endif
notifier_fail:
	zs_unregister_cpu_notifier();

	return ret;
}

static void __exit zs_exit(void)
{
#ifdef CONFIG_ZPOOL
	zpool_unregister_driver(&zs_zpool_driver);
#endif
	zs_unregister_cpu_notifier();

	zs_stat_exit();
}

module_init(zs_init);
module_exit(zs_exit);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");