// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2018 HUAWEI, Inc.
 *             https://www.huawei.com/
 * Copyright (C) 2022 Alibaba Cloud
 */
#include "compress.h"
#include <linux/psi.h>
#include <linux/cpuhotplug.h>
#include <trace/events/erofs.h>

#define Z_EROFS_PCLUSTER_MAX_PAGES	(Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
#define Z_EROFS_INLINE_BVECS		2

/*
 * let's leave a type here in case of introducing
 * another tagged pointer later.
 */
typedef void *z_erofs_next_pcluster_t;

struct z_erofs_bvec {
	struct page *page;
	int offset;
	unsigned int end;
};
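
/*
 * Note that ->offset is signed: it is the byte offset of ->page relative to
 * the start of the decompressed extent and may become negative when the
 * extent begins in the middle of the page (see z_erofs_fill_other_copies());
 * ->end is the end offset of the valid data within ->page.
 */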

#define __Z_EROFS_BVSET(name, total) \
struct name { \
	/* point to the next page which contains the following bvecs */ \
	struct page *nextpage; \
	struct z_erofs_bvec bvec[total]; \
}
__Z_EROFS_BVSET(z_erofs_bvset,);
__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);

/*
 * Structure fields follow one of the following exclusion rules.
 *
 * I: Modifiable by initialization/destruction paths and read-only
 *    for everyone else;
 *
 * L: Field should be protected by the pcluster lock;
 *
 * A: Field should be accessed / updated in atomic for parallelized code.
 */
struct z_erofs_pcluster {
	struct erofs_workgroup obj;
	struct mutex lock;

	/* A: point to next chained pcluster or TAILs */
	z_erofs_next_pcluster_t next;

	/* L: the maximum decompression size of this round */
	unsigned int length;

	/* L: total number of bvecs */
	unsigned int vcnt;

	/* I: page offset of start position of decompression */
	unsigned short pageofs_out;

	/* I: page offset of inline compressed data */
	unsigned short pageofs_in;

	union {
		/* L: inline a certain number of bvec for bootstrap */
		struct z_erofs_bvset_inline bvset;

		/* I: can be used to free the pcluster by RCU. */
		struct rcu_head rcu;
	};

	union {
		/* I: physical cluster size in pages */
		unsigned short pclusterpages;

		/* I: tailpacking inline compressed size */
		unsigned short tailpacking_size;
	};

	/* I: compression algorithm format */
	unsigned char algorithmformat;

	/* L: whether partial decompression or not */
	bool partial;

	/* L: indicate several pageofs_outs or not */
	bool multibases;

	/* A: compressed bvecs (can be cached or inplaced pages) */
	struct z_erofs_bvec compressed_bvecs[];
};

/* the end of a chain of pclusters */
#define Z_EROFS_PCLUSTER_TAIL		((void *) 0x700 + POISON_POINTER_DELTA)
#define Z_EROFS_PCLUSTER_NIL		(NULL)

struct z_erofs_decompressqueue {
	struct super_block *sb;
	atomic_t pending_bios;
	z_erofs_next_pcluster_t head;

	union {
		struct completion done;
		struct work_struct work;
		struct kthread_work kthread_work;
	} u;
	bool eio, sync;
};

static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
{
	return !pcl->obj.index;
}

static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
{
	if (z_erofs_is_inline_pcluster(pcl))
		return 1;
	return pcl->pclusterpages;
}

/*
 * bit 30: I/O error occurred on this page
 * bit 0 - 29: remaining parts to complete this page
 */
#define Z_EROFS_PAGE_EIO			(1 << 30)

static inline void z_erofs_onlinepage_init(struct page *page)
{
	union {
		atomic_t o;
		unsigned long v;
	} u = { .o = ATOMIC_INIT(1) };

	set_page_private(page, u.v);
	smp_wmb();
	SetPagePrivate(page);
}

static inline void z_erofs_onlinepage_split(struct page *page)
{
	atomic_inc((atomic_t *)&page->private);
}

static void z_erofs_onlinepage_endio(struct page *page, int err)
{
	int orig, v;

	DBG_BUGON(!PagePrivate(page));

	do {
		orig = atomic_read((atomic_t *)&page->private);
		v = (orig - 1) | (err ? Z_EROFS_PAGE_EIO : 0);
	} while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig);

	if (!(v & ~Z_EROFS_PAGE_EIO)) {
		set_page_private(page, 0);
		ClearPagePrivate(page);
		if (!(v & Z_EROFS_PAGE_EIO))
			SetPageUptodate(page);
		unlock_page(page);
	}
}
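
/*
 * Lifecycle of an "online" (file-backed) page under decompression:
 * z_erofs_onlinepage_init() sets the private counter to 1, every sub-extent
 * sharing the page bumps it via z_erofs_onlinepage_split(), and each
 * completed part (as well as the submitting context itself) drops it via
 * z_erofs_onlinepage_endio().  Only the final drop marks the page
 * up-to-date (unless Z_EROFS_PAGE_EIO has been set) and unlocks it.
 */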

#define Z_EROFS_ONSTACK_PAGES		32

/*
 * since pclustersize is variable for the big pcluster feature, introduce
 * slab pools for the different pcluster sizes.
 */
struct z_erofs_pcluster_slab {
	struct kmem_cache *slab;
	unsigned int maxpages;
	char name[48];
};

#define _PCLP(n) { .maxpages = n }

static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
	_PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
	_PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
};

struct z_erofs_bvec_iter {
	struct page *bvpage;
	struct z_erofs_bvset *bvset;
	unsigned int nr, cur;
};

static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter)
{
	if (iter->bvpage)
		kunmap_local(iter->bvset);
	return iter->bvpage;
}

static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter)
{
	unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec;
	/* have to access nextpage in advance, otherwise it will be unmapped */
	struct page *nextpage = iter->bvset->nextpage;
	struct page *oldpage;

	DBG_BUGON(!nextpage);
	oldpage = z_erofs_bvec_iter_end(iter);
	iter->bvpage = nextpage;
	iter->bvset = kmap_local_page(nextpage);
	iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec);
	iter->cur = 0;
	return oldpage;
}

static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter,
				    struct z_erofs_bvset_inline *bvset,
				    unsigned int bootstrap_nr,
				    unsigned int cur)
{
	*iter = (struct z_erofs_bvec_iter) {
		.nr = bootstrap_nr,
		.bvset = (struct z_erofs_bvset *)bvset,
	};

	while (cur > iter->nr) {
		cur -= iter->nr;
		z_erofs_bvset_flip(iter);
	}
	iter->cur = cur;
}

static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
				struct z_erofs_bvec *bvec,
				struct page **candidate_bvpage,
				struct page **pagepool)
{
	if (iter->cur >= iter->nr) {
		struct page *nextpage = *candidate_bvpage;

		if (!nextpage) {
			nextpage = erofs_allocpage(pagepool, GFP_NOFS);
			if (!nextpage)
				return -ENOMEM;
			set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE);
		}
		DBG_BUGON(iter->bvset->nextpage);
		iter->bvset->nextpage = nextpage;
		z_erofs_bvset_flip(iter);

		iter->bvset->nextpage = NULL;
		*candidate_bvpage = NULL;
	}
	iter->bvset->bvec[iter->cur++] = *bvec;
	return 0;
}

static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter,
				 struct z_erofs_bvec *bvec,
				 struct page **old_bvpage)
{
	if (iter->cur == iter->nr)
		*old_bvpage = z_erofs_bvset_flip(iter);
	else
		*old_bvpage = NULL;
	*bvec = iter->bvset->bvec[iter->cur++];
}
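
/*
 * The iterator above pages through a pcluster's bvec records: it starts with
 * the inline bootstrap array embedded in the pcluster (Z_EROFS_INLINE_BVECS
 * entries) and, once an array fills up, chains an extra short-lived page via
 * ->nextpage, kmapping each array locally while it is being walked.
 */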

static void z_erofs_destroy_pcluster_pool(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
		if (!pcluster_pool[i].slab)
			continue;
		kmem_cache_destroy(pcluster_pool[i].slab);
		pcluster_pool[i].slab = NULL;
	}
}

static int z_erofs_create_pcluster_pool(void)
{
	struct z_erofs_pcluster_slab *pcs;
	struct z_erofs_pcluster *a;
	unsigned int size;

	for (pcs = pcluster_pool;
	     pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
		size = struct_size(a, compressed_bvecs, pcs->maxpages);

		sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages);
		pcs->slab = kmem_cache_create(pcs->name, size, 0,
					      SLAB_RECLAIM_ACCOUNT, NULL);
		if (pcs->slab)
			continue;

		z_erofs_destroy_pcluster_pool();
		return -ENOMEM;
	}
	return 0;
}

static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
		struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
		struct z_erofs_pcluster *pcl;

		if (nrpages > pcs->maxpages)
			continue;

		pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS);
		if (!pcl)
			return ERR_PTR(-ENOMEM);
		pcl->pclusterpages = nrpages;
		return pcl;
	}
	return ERR_PTR(-EINVAL);
}

static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
{
	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
	int i;

	for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
		struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;

		if (pclusterpages > pcs->maxpages)
			continue;

		kmem_cache_free(pcs->slab, pcl);
		return;
	}
	DBG_BUGON(1);
}

static struct workqueue_struct *z_erofs_workqueue __read_mostly;

#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
static struct kthread_worker __rcu **z_erofs_pcpu_workers;

static void erofs_destroy_percpu_workers(void)
{
	struct kthread_worker *worker;
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		worker = rcu_dereference_protected(
					z_erofs_pcpu_workers[cpu], 1);
		rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL);
		if (worker)
			kthread_destroy_worker(worker);
	}
	kfree(z_erofs_pcpu_workers);
}

static struct kthread_worker *erofs_init_percpu_worker(int cpu)
{
	struct kthread_worker *worker =
		kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu);

	if (IS_ERR(worker))
		return worker;
	if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI))
		sched_set_fifo_low(worker->task);
	return worker;
}
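
/*
 * z_erofs_pcpu_workers[] is an RCU-protected table of per-CPU kthread
 * workers: readers dereference it under rcu_read_lock() in
 * z_erofs_decompress_kickoff() and fall back to the shared workqueue
 * whenever no worker is installed for the current CPU.
 */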
static int erofs_init_percpu_workers(void)
{
	struct kthread_worker *worker;
	unsigned int cpu;

	z_erofs_pcpu_workers = kcalloc(num_possible_cpus(),
			sizeof(struct kthread_worker *), GFP_ATOMIC);
	if (!z_erofs_pcpu_workers)
		return -ENOMEM;

	for_each_online_cpu(cpu) {	/* could miss cpu{off,on}line? */
		worker = erofs_init_percpu_worker(cpu);
		if (!IS_ERR(worker))
			rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker);
	}
	return 0;
}
#else
static inline void erofs_destroy_percpu_workers(void) {}
static inline int erofs_init_percpu_workers(void) { return 0; }
#endif

#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD)
static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock);
static enum cpuhp_state erofs_cpuhp_state;

static int erofs_cpu_online(unsigned int cpu)
{
	struct kthread_worker *worker, *old;

	worker = erofs_init_percpu_worker(cpu);
	if (IS_ERR(worker))
		return PTR_ERR(worker);

	spin_lock(&z_erofs_pcpu_worker_lock);
	old = rcu_dereference_protected(z_erofs_pcpu_workers[cpu],
			lockdep_is_held(&z_erofs_pcpu_worker_lock));
	if (!old)
		rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker);
	spin_unlock(&z_erofs_pcpu_worker_lock);
	if (old)
		kthread_destroy_worker(worker);
	return 0;
}

static int erofs_cpu_offline(unsigned int cpu)
{
	struct kthread_worker *worker;

	spin_lock(&z_erofs_pcpu_worker_lock);
	worker = rcu_dereference_protected(z_erofs_pcpu_workers[cpu],
			lockdep_is_held(&z_erofs_pcpu_worker_lock));
	rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL);
	spin_unlock(&z_erofs_pcpu_worker_lock);

	synchronize_rcu();
	if (worker)
		kthread_destroy_worker(worker);
	return 0;
}

static int erofs_cpu_hotplug_init(void)
{
	int state;

	state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
			"fs/erofs:online", erofs_cpu_online, erofs_cpu_offline);
	if (state < 0)
		return state;

	erofs_cpuhp_state = state;
	return 0;
}

static void erofs_cpu_hotplug_destroy(void)
{
	if (erofs_cpuhp_state)
		cpuhp_remove_state_nocalls(erofs_cpuhp_state);
}
#else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */
static inline int erofs_cpu_hotplug_init(void) { return 0; }
static inline void erofs_cpu_hotplug_destroy(void) {}
#endif

void z_erofs_exit_zip_subsystem(void)
{
	erofs_cpu_hotplug_destroy();
	erofs_destroy_percpu_workers();
	destroy_workqueue(z_erofs_workqueue);
	z_erofs_destroy_pcluster_pool();
}

int __init z_erofs_init_zip_subsystem(void)
{
	int err = z_erofs_create_pcluster_pool();

	if (err)
		goto out_error_pcluster_pool;

	z_erofs_workqueue = alloc_workqueue("erofs_worker",
			WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus());
	if (!z_erofs_workqueue) {
		err = -ENOMEM;
		goto out_error_workqueue_init;
	}

	err = erofs_init_percpu_workers();
	if (err)
		goto out_error_pcpu_worker;

	err = erofs_cpu_hotplug_init();
	if (err < 0)
		goto out_error_cpuhp_init;
	return err;

out_error_cpuhp_init:
	erofs_destroy_percpu_workers();
out_error_pcpu_worker:
	destroy_workqueue(z_erofs_workqueue);
out_error_workqueue_init:
	z_erofs_destroy_pcluster_pool();
out_error_pcluster_pool:
	return err;
}
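
/*
 * Note that the pcluster modes below are ordered: several places compare
 * fe->mode numerically (e.g. "fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED"), so
 * stronger forms of ownership must keep larger values.
 */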
502 */ 503 Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, 504 /* 505 * The pcluster was just linked to a decompression chain by us. It can 506 * also be linked with the remaining pclusters, which means if the 507 * processing page is the tail page of a pcluster, this pcluster can 508 * safely use the whole page (since the previous pcluster is within the 509 * same chain) for in-place I/O, as illustrated below: 510 * ___________________________________________________ 511 * | tail (partial) page | head (partial) page | 512 * | (of the current pcl) | (of the previous pcl) | 513 * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____| 514 * 515 * [ (*) the page above can be used as inplace I/O. ] 516 */ 517 Z_EROFS_PCLUSTER_FOLLOWED, 518 }; 519 520 struct z_erofs_decompress_frontend { 521 struct inode *const inode; 522 struct erofs_map_blocks map; 523 struct z_erofs_bvec_iter biter; 524 525 struct page *pagepool; 526 struct page *candidate_bvpage; 527 struct z_erofs_pcluster *pcl; 528 z_erofs_next_pcluster_t owned_head; 529 enum z_erofs_pclustermode mode; 530 531 erofs_off_t headoffset; 532 533 /* a pointer used to pick up inplace I/O pages */ 534 unsigned int icur; 535 }; 536 537 #define DECOMPRESS_FRONTEND_INIT(__i) { \ 538 .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ 539 .mode = Z_EROFS_PCLUSTER_FOLLOWED } 540 541 static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) 542 { 543 unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy; 544 545 if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) 546 return false; 547 548 if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED)) 549 return true; 550 551 if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND && 552 fe->map.m_la < fe->headoffset) 553 return true; 554 555 return false; 556 } 557 558 static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) 559 { 560 struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); 561 struct z_erofs_pcluster *pcl = fe->pcl; 562 bool shouldalloc = z_erofs_should_alloc_cache(fe); 563 bool standalone = true; 564 /* 565 * optimistic allocation without direct reclaim since inplace I/O 566 * can be used if low memory otherwise. 567 */ 568 gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | 569 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; 570 unsigned int i; 571 572 if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) 573 return; 574 575 for (i = 0; i < pcl->pclusterpages; ++i) { 576 struct page *page; 577 void *t; /* mark pages just found for debugging */ 578 struct page *newpage = NULL; 579 580 /* the compressed page was loaded before */ 581 if (READ_ONCE(pcl->compressed_bvecs[i].page)) 582 continue; 583 584 page = find_get_page(mc, pcl->obj.index + i); 585 586 if (page) { 587 t = (void *)((unsigned long)page | 1); 588 } else { 589 /* I/O is needed, no possible to decompress directly */ 590 standalone = false; 591 if (!shouldalloc) 592 continue; 593 594 /* 595 * try to use cached I/O if page allocation 596 * succeeds or fallback to in-place I/O instead 597 * to avoid any direct reclaim. 
598 */ 599 newpage = erofs_allocpage(&fe->pagepool, gfp); 600 if (!newpage) 601 continue; 602 set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); 603 t = (void *)((unsigned long)newpage | 1); 604 } 605 606 if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t)) 607 continue; 608 609 if (page) 610 put_page(page); 611 else if (newpage) 612 erofs_pagepool_add(&fe->pagepool, newpage); 613 } 614 615 /* 616 * don't do inplace I/O if all compressed pages are available in 617 * managed cache since it can be moved to the bypass queue instead. 618 */ 619 if (standalone) 620 fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; 621 } 622 623 /* called by erofs_shrinker to get rid of all compressed_pages */ 624 int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, 625 struct erofs_workgroup *grp) 626 { 627 struct z_erofs_pcluster *const pcl = 628 container_of(grp, struct z_erofs_pcluster, obj); 629 int i; 630 631 DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); 632 /* 633 * refcount of workgroup is now freezed as 0, 634 * therefore no need to worry about available decompression users. 635 */ 636 for (i = 0; i < pcl->pclusterpages; ++i) { 637 struct page *page = pcl->compressed_bvecs[i].page; 638 639 if (!page) 640 continue; 641 642 /* block other users from reclaiming or migrating the page */ 643 if (!trylock_page(page)) 644 return -EBUSY; 645 646 if (!erofs_page_is_managed(sbi, page)) 647 continue; 648 649 /* barrier is implied in the following 'unlock_page' */ 650 WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); 651 detach_page_private(page); 652 unlock_page(page); 653 } 654 return 0; 655 } 656 657 static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) 658 { 659 struct z_erofs_pcluster *pcl = folio_get_private(folio); 660 bool ret; 661 int i; 662 663 if (!folio_test_private(folio)) 664 return true; 665 666 ret = false; 667 spin_lock(&pcl->obj.lockref.lock); 668 if (pcl->obj.lockref.count > 0) 669 goto out; 670 671 DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); 672 for (i = 0; i < pcl->pclusterpages; ++i) { 673 if (pcl->compressed_bvecs[i].page == &folio->page) { 674 WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); 675 ret = true; 676 break; 677 } 678 } 679 if (ret) 680 folio_detach_private(folio); 681 out: 682 spin_unlock(&pcl->obj.lockref.lock); 683 return ret; 684 } 685 686 /* 687 * It will be called only on inode eviction. In case that there are still some 688 * decompression requests in progress, wait with rescheduling for a bit here. 689 * An extra lock could be introduced instead but it seems unnecessary. 
690 */ 691 static void z_erofs_cache_invalidate_folio(struct folio *folio, 692 size_t offset, size_t length) 693 { 694 const size_t stop = length + offset; 695 696 /* Check for potential overflow in debug mode */ 697 DBG_BUGON(stop > folio_size(folio) || stop < length); 698 699 if (offset == 0 && stop == folio_size(folio)) 700 while (!z_erofs_cache_release_folio(folio, GFP_NOFS)) 701 cond_resched(); 702 } 703 704 static const struct address_space_operations z_erofs_cache_aops = { 705 .release_folio = z_erofs_cache_release_folio, 706 .invalidate_folio = z_erofs_cache_invalidate_folio, 707 }; 708 709 int erofs_init_managed_cache(struct super_block *sb) 710 { 711 struct inode *const inode = new_inode(sb); 712 713 if (!inode) 714 return -ENOMEM; 715 716 set_nlink(inode, 1); 717 inode->i_size = OFFSET_MAX; 718 inode->i_mapping->a_ops = &z_erofs_cache_aops; 719 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); 720 EROFS_SB(sb)->managed_cache = inode; 721 return 0; 722 } 723 724 static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, 725 struct z_erofs_bvec *bvec) 726 { 727 struct z_erofs_pcluster *const pcl = fe->pcl; 728 729 while (fe->icur > 0) { 730 if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, 731 NULL, bvec->page)) { 732 pcl->compressed_bvecs[fe->icur] = *bvec; 733 return true; 734 } 735 } 736 return false; 737 } 738 739 /* callers must be with pcluster lock held */ 740 static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, 741 struct z_erofs_bvec *bvec, bool exclusive) 742 { 743 int ret; 744 745 if (exclusive) { 746 /* give priority for inplaceio to use file pages first */ 747 if (z_erofs_try_inplace_io(fe, bvec)) 748 return 0; 749 /* otherwise, check if it can be used as a bvpage */ 750 if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && 751 !fe->candidate_bvpage) 752 fe->candidate_bvpage = bvec->page; 753 } 754 ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage, 755 &fe->pagepool); 756 fe->pcl->vcnt += (ret >= 0); 757 return ret; 758 } 759 760 static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) 761 { 762 struct z_erofs_pcluster *pcl = f->pcl; 763 z_erofs_next_pcluster_t *owned_head = &f->owned_head; 764 765 /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */ 766 if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL, 767 *owned_head) == Z_EROFS_PCLUSTER_NIL) { 768 *owned_head = &pcl->next; 769 /* so we can attach this pcluster to our submission chain. */ 770 f->mode = Z_EROFS_PCLUSTER_FOLLOWED; 771 return; 772 } 773 774 /* type 2, it belongs to an ongoing chain */ 775 f->mode = Z_EROFS_PCLUSTER_INFLIGHT; 776 } 777 778 static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) 779 { 780 struct erofs_map_blocks *map = &fe->map; 781 bool ztailpacking = map->m_flags & EROFS_MAP_META; 782 struct z_erofs_pcluster *pcl; 783 struct erofs_workgroup *grp; 784 int err; 785 786 if (!(map->m_flags & EROFS_MAP_ENCODED) || 787 (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) { 788 DBG_BUGON(1); 789 return -EFSCORRUPTED; 790 } 791 792 /* no available pcluster, let's allocate one */ 793 pcl = z_erofs_alloc_pcluster(ztailpacking ? 
static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
{
	struct erofs_map_blocks *map = &fe->map;
	bool ztailpacking = map->m_flags & EROFS_MAP_META;
	struct z_erofs_pcluster *pcl;
	struct erofs_workgroup *grp;
	int err;

	if (!(map->m_flags & EROFS_MAP_ENCODED) ||
	    (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) {
		DBG_BUGON(1);
		return -EFSCORRUPTED;
	}

	/* no available pcluster, let's allocate one */
	pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 :
				     map->m_plen >> PAGE_SHIFT);
	if (IS_ERR(pcl))
		return PTR_ERR(pcl);

	spin_lock_init(&pcl->obj.lockref.lock);
	pcl->obj.lockref.count = 1;	/* one ref for this request */
	pcl->algorithmformat = map->m_algorithmformat;
	pcl->length = 0;
	pcl->partial = true;

	/* new pclusters should be claimed as type 1, primary and followed */
	pcl->next = fe->owned_head;
	pcl->pageofs_out = map->m_la & ~PAGE_MASK;
	fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;

	/*
	 * lock all primary followed works before they are visible to others
	 * and mutex_trylock *never* fails for a new pcluster.
	 */
	mutex_init(&pcl->lock);
	DBG_BUGON(!mutex_trylock(&pcl->lock));

	if (ztailpacking) {
		pcl->obj.index = 0;	/* which indicates ztailpacking */
		pcl->pageofs_in = erofs_blkoff(fe->inode->i_sb, map->m_pa);
		pcl->tailpacking_size = map->m_plen;
	} else {
		pcl->obj.index = map->m_pa >> PAGE_SHIFT;

		grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
		if (IS_ERR(grp)) {
			err = PTR_ERR(grp);
			goto err_out;
		}

		if (grp != &pcl->obj) {
			fe->pcl = container_of(grp,
					struct z_erofs_pcluster, obj);
			err = -EEXIST;
			goto err_out;
		}
	}
	fe->owned_head = &pcl->next;
	fe->pcl = pcl;
	return 0;

err_out:
	mutex_unlock(&pcl->lock);
	z_erofs_free_pcluster(pcl);
	return err;
}

static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
{
	struct erofs_map_blocks *map = &fe->map;
	struct super_block *sb = fe->inode->i_sb;
	erofs_blk_t blknr = erofs_blknr(sb, map->m_pa);
	struct erofs_workgroup *grp = NULL;
	int ret;

	DBG_BUGON(fe->pcl);

	/* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
	DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);

	if (!(map->m_flags & EROFS_MAP_META)) {
		grp = erofs_find_workgroup(sb, blknr);
	} else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
		DBG_BUGON(1);
		return -EFSCORRUPTED;
	}

	if (grp) {
		fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
		ret = -EEXIST;
	} else {
		ret = z_erofs_register_pcluster(fe);
	}

	if (ret == -EEXIST) {
		mutex_lock(&fe->pcl->lock);
		z_erofs_try_to_claim_pcluster(fe);
	} else if (ret) {
		return ret;
	}

	z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
				Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
	if (!z_erofs_is_inline_pcluster(fe->pcl)) {
		/* bind cache first when cached decompression is preferred */
		z_erofs_bind_cache(fe);
	} else {
		void *mptr;

		mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP);
		if (IS_ERR(mptr)) {
			ret = PTR_ERR(mptr);
			erofs_err(sb, "failed to get inline data %d", ret);
			return ret;
		}
		get_page(map->buf.page);
		WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page);
		fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
	}
	/* file-backed inplace I/O pages are traversed in reverse order */
	fe->icur = z_erofs_pclusterpages(fe->pcl);
	return 0;
}
906 */ 907 static void z_erofs_rcu_callback(struct rcu_head *head) 908 { 909 z_erofs_free_pcluster(container_of(head, 910 struct z_erofs_pcluster, rcu)); 911 } 912 913 void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) 914 { 915 struct z_erofs_pcluster *const pcl = 916 container_of(grp, struct z_erofs_pcluster, obj); 917 918 call_rcu(&pcl->rcu, z_erofs_rcu_callback); 919 } 920 921 static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe) 922 { 923 struct z_erofs_pcluster *pcl = fe->pcl; 924 925 if (!pcl) 926 return; 927 928 z_erofs_bvec_iter_end(&fe->biter); 929 mutex_unlock(&pcl->lock); 930 931 if (fe->candidate_bvpage) 932 fe->candidate_bvpage = NULL; 933 934 /* 935 * if all pending pages are added, don't hold its reference 936 * any longer if the pcluster isn't hosted by ourselves. 937 */ 938 if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE) 939 erofs_workgroup_put(&pcl->obj); 940 941 fe->pcl = NULL; 942 } 943 944 static int z_erofs_read_fragment(struct super_block *sb, struct page *page, 945 unsigned int cur, unsigned int end, erofs_off_t pos) 946 { 947 struct inode *packed_inode = EROFS_SB(sb)->packed_inode; 948 struct erofs_buf buf = __EROFS_BUF_INITIALIZER; 949 unsigned int cnt; 950 u8 *src; 951 952 if (!packed_inode) 953 return -EFSCORRUPTED; 954 955 buf.inode = packed_inode; 956 for (; cur < end; cur += cnt, pos += cnt) { 957 cnt = min_t(unsigned int, end - cur, 958 sb->s_blocksize - erofs_blkoff(sb, pos)); 959 src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP); 960 if (IS_ERR(src)) { 961 erofs_put_metabuf(&buf); 962 return PTR_ERR(src); 963 } 964 memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt); 965 } 966 erofs_put_metabuf(&buf); 967 return 0; 968 } 969 970 static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, 971 struct page *page) 972 { 973 struct inode *const inode = fe->inode; 974 struct erofs_map_blocks *const map = &fe->map; 975 const loff_t offset = page_offset(page); 976 bool tight = true, exclusive; 977 unsigned int cur, end, len, split; 978 int err = 0; 979 980 z_erofs_onlinepage_init(page); 981 982 split = 0; 983 end = PAGE_SIZE; 984 repeat: 985 if (offset + end - 1 < map->m_la || 986 offset + end - 1 >= map->m_la + map->m_llen) { 987 z_erofs_pcluster_end(fe); 988 map->m_la = offset + end - 1; 989 map->m_llen = 0; 990 err = z_erofs_map_blocks_iter(inode, map, 0); 991 if (err) 992 goto out; 993 } 994 995 cur = offset > map->m_la ? 0 : map->m_la - offset; 996 /* bump split parts first to avoid several separate cases */ 997 ++split; 998 999 if (!(map->m_flags & EROFS_MAP_MAPPED)) { 1000 zero_user_segment(page, cur, end); 1001 tight = false; 1002 goto next_part; 1003 } 1004 1005 if (map->m_flags & EROFS_MAP_FRAGMENT) { 1006 erofs_off_t fpos = offset + cur - map->m_la; 1007 1008 len = min_t(unsigned int, map->m_llen - fpos, end - cur); 1009 err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len, 1010 EROFS_I(inode)->z_fragmentoff + fpos); 1011 if (err) 1012 goto out; 1013 tight = false; 1014 goto next_part; 1015 } 1016 1017 if (!fe->pcl) { 1018 err = z_erofs_pcluster_begin(fe); 1019 if (err) 1020 goto out; 1021 } 1022 1023 /* 1024 * Ensure the current partial page belongs to this submit chain rather 1025 * than other concurrent submit chains or the noio(bypass) chain since 1026 * those chains are handled asynchronously thus the page cannot be used 1027 * for inplace I/O or bvpage (should be processed in a strict order.) 
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
				struct page *page)
{
	struct inode *const inode = fe->inode;
	struct erofs_map_blocks *const map = &fe->map;
	const loff_t offset = page_offset(page);
	bool tight = true, exclusive;
	unsigned int cur, end, len, split;
	int err = 0;

	z_erofs_onlinepage_init(page);

	split = 0;
	end = PAGE_SIZE;
repeat:
	if (offset + end - 1 < map->m_la ||
	    offset + end - 1 >= map->m_la + map->m_llen) {
		z_erofs_pcluster_end(fe);
		map->m_la = offset + end - 1;
		map->m_llen = 0;
		err = z_erofs_map_blocks_iter(inode, map, 0);
		if (err)
			goto out;
	}

	cur = offset > map->m_la ? 0 : map->m_la - offset;
	/* bump split parts first to avoid several separate cases */
	++split;

	if (!(map->m_flags & EROFS_MAP_MAPPED)) {
		zero_user_segment(page, cur, end);
		tight = false;
		goto next_part;
	}

	if (map->m_flags & EROFS_MAP_FRAGMENT) {
		erofs_off_t fpos = offset + cur - map->m_la;

		len = min_t(unsigned int, map->m_llen - fpos, end - cur);
		err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len,
				EROFS_I(inode)->z_fragmentoff + fpos);
		if (err)
			goto out;
		tight = false;
		goto next_part;
	}

	if (!fe->pcl) {
		err = z_erofs_pcluster_begin(fe);
		if (err)
			goto out;
	}

	/*
	 * Ensure the current partial page belongs to this submit chain rather
	 * than other concurrent submit chains or the noio(bypass) chain since
	 * those chains are handled asynchronously thus the page cannot be used
	 * for inplace I/O or bvpage (should be processed in a strict order.)
	 */
	tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
	exclusive = (!cur && ((split <= 1) || tight));
	if (cur)
		tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);

	err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) {
					.page = page,
					.offset = offset - map->m_la,
					.end = end,
				  }), exclusive);
	if (err)
		goto out;

	z_erofs_onlinepage_split(page);
	if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
		fe->pcl->multibases = true;
	if (fe->pcl->length < offset + end - map->m_la) {
		fe->pcl->length = offset + end - map->m_la;
		fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
	}
	if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
	    !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
	    fe->pcl->length == map->m_llen)
		fe->pcl->partial = false;
next_part:
	/* shorten the remaining extent to update progress */
	map->m_llen = offset + cur - map->m_la;
	map->m_flags &= ~EROFS_MAP_FULL_MAPPED;

	end = cur;
	if (end > 0)
		goto repeat;

out:
	z_erofs_onlinepage_endio(page, err);
	return err;
}

static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi,
				       unsigned int readahead_pages)
{
	/* auto: enable for read_folio, disable for readahead */
	if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) &&
	    !readahead_pages)
		return true;

	if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) &&
	    (readahead_pages <= sbi->opt.max_sync_decompress_pages))
		return true;

	return false;
}

static bool z_erofs_page_is_invalidated(struct page *page)
{
	return !page->mapping && !z_erofs_is_shortlived_page(page);
}

struct z_erofs_decompress_backend {
	struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES];
	struct super_block *sb;
	struct z_erofs_pcluster *pcl;

	/* pages with the longest decompressed length for deduplication */
	struct page **decompressed_pages;
	/* pages to keep the compressed data */
	struct page **compressed_pages;

	struct list_head decompressed_secondary_bvecs;
	struct page **pagepool;
	unsigned int onstack_used, nr_pages;
};

struct z_erofs_bvec_item {
	struct z_erofs_bvec bvec;
	struct list_head list;
};

static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be,
					 struct z_erofs_bvec *bvec)
{
	struct z_erofs_bvec_item *item;
	unsigned int pgnr;

	if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK) &&
	    (bvec->end == PAGE_SIZE ||
	     bvec->offset + bvec->end == be->pcl->length)) {
		pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT;
		DBG_BUGON(pgnr >= be->nr_pages);
		if (!be->decompressed_pages[pgnr]) {
			be->decompressed_pages[pgnr] = bvec->page;
			return;
		}
	}

	/* (cold path) one pcluster is requested multiple times */
	item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL);
	item->bvec = *bvec;
	list_add(&item->list, &be->decompressed_secondary_bvecs);
}

static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
				      int err)
{
	unsigned int off0 = be->pcl->pageofs_out;
	struct list_head *p, *n;

	list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) {
		struct z_erofs_bvec_item *bvi;
		unsigned int end, cur;
		void *dst, *src;

		bvi = container_of(p, struct z_erofs_bvec_item, list);
		cur = bvi->bvec.offset < 0 ? -bvi->bvec.offset : 0;
		end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset,
			    bvi->bvec.end);
		dst = kmap_local_page(bvi->bvec.page);
		while (cur < end) {
			unsigned int pgnr, scur, len;

			pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT;
			DBG_BUGON(pgnr >= be->nr_pages);

			scur = bvi->bvec.offset + cur -
					((pgnr << PAGE_SHIFT) - off0);
			len = min_t(unsigned int, end - cur, PAGE_SIZE - scur);
			if (!be->decompressed_pages[pgnr]) {
				err = -EFSCORRUPTED;
				cur += len;
				continue;
			}
			src = kmap_local_page(be->decompressed_pages[pgnr]);
			memcpy(dst + cur, src + scur, len);
			kunmap_local(src);
			cur += len;
		}
		kunmap_local(dst);
		z_erofs_onlinepage_endio(bvi->bvec.page, err);
		list_del(p);
		kfree(bvi);
	}
}

static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be)
{
	struct z_erofs_pcluster *pcl = be->pcl;
	struct z_erofs_bvec_iter biter;
	struct page *old_bvpage;
	int i;

	z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0);
	for (i = 0; i < pcl->vcnt; ++i) {
		struct z_erofs_bvec bvec;

		z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage);

		if (old_bvpage)
			z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);

		DBG_BUGON(z_erofs_page_is_invalidated(bvec.page));
		z_erofs_do_decompressed_bvec(be, &bvec);
	}

	old_bvpage = z_erofs_bvec_iter_end(&biter);
	if (old_bvpage)
		z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
}

static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
				  bool *overlapped)
{
	struct z_erofs_pcluster *pcl = be->pcl;
	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
	int i, err = 0;

	*overlapped = false;
	for (i = 0; i < pclusterpages; ++i) {
		struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i];
		struct page *page = bvec->page;

		/* compressed pages ought to be present before decompressing */
		if (!page) {
			DBG_BUGON(1);
			continue;
		}
		be->compressed_pages[i] = page;

		if (z_erofs_is_inline_pcluster(pcl)) {
			if (!PageUptodate(page))
				err = -EIO;
			continue;
		}

		DBG_BUGON(z_erofs_page_is_invalidated(page));
		if (!z_erofs_is_shortlived_page(page)) {
			if (erofs_page_is_managed(EROFS_SB(be->sb), page)) {
				if (!PageUptodate(page))
					err = -EIO;
				continue;
			}
			z_erofs_do_decompressed_bvec(be, bvec);
			*overlapped = true;
		}
	}

	if (err)
		return err;
	return 0;
}
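
/*
 * Decompress a single pcluster: gather the output pages recorded in the
 * bvecs (plus secondary copies), collect the compressed input pages, invoke
 * the per-algorithm decompressor, then release the compressed pages, finish
 * all online pages and reset the pcluster for the next round.
 */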
static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
				       int err)
{
	struct erofs_sb_info *const sbi = EROFS_SB(be->sb);
	struct z_erofs_pcluster *pcl = be->pcl;
	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
	const struct z_erofs_decompressor *decompressor =
				&erofs_decompressors[pcl->algorithmformat];
	unsigned int i, inputsize;
	int err2;
	struct page *page;
	bool overlapped;

	mutex_lock(&pcl->lock);
	be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;

	/* allocate (de)compressed page arrays if cannot be kept on stack */
	be->decompressed_pages = NULL;
	be->compressed_pages = NULL;
	be->onstack_used = 0;
	if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) {
		be->decompressed_pages = be->onstack_pages;
		be->onstack_used = be->nr_pages;
		memset(be->decompressed_pages, 0,
		       sizeof(struct page *) * be->nr_pages);
	}

	if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES)
		be->compressed_pages = be->onstack_pages + be->onstack_used;

	if (!be->decompressed_pages)
		be->decompressed_pages =
			kvcalloc(be->nr_pages, sizeof(struct page *),
				 GFP_KERNEL | __GFP_NOFAIL);
	if (!be->compressed_pages)
		be->compressed_pages =
			kvcalloc(pclusterpages, sizeof(struct page *),
				 GFP_KERNEL | __GFP_NOFAIL);

	z_erofs_parse_out_bvecs(be);
	err2 = z_erofs_parse_in_bvecs(be, &overlapped);
	if (err2)
		err = err2;
	if (err)
		goto out;

	if (z_erofs_is_inline_pcluster(pcl))
		inputsize = pcl->tailpacking_size;
	else
		inputsize = pclusterpages * PAGE_SIZE;

	err = decompressor->decompress(&(struct z_erofs_decompress_req) {
					.sb = be->sb,
					.in = be->compressed_pages,
					.out = be->decompressed_pages,
					.pageofs_in = pcl->pageofs_in,
					.pageofs_out = pcl->pageofs_out,
					.inputsize = inputsize,
					.outputsize = pcl->length,
					.alg = pcl->algorithmformat,
					.inplace_io = overlapped,
					.partial_decoding = pcl->partial,
					.fillgaps = pcl->multibases,
				 }, be->pagepool);

out:
	/* must handle all compressed pages before actual file pages */
	if (z_erofs_is_inline_pcluster(pcl)) {
		page = pcl->compressed_bvecs[0].page;
		WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
		put_page(page);
	} else {
		for (i = 0; i < pclusterpages; ++i) {
			/* consider shortlived pages added when decompressing */
			page = be->compressed_pages[i];

			if (erofs_page_is_managed(sbi, page))
				continue;
			(void)z_erofs_put_shortlivedpage(be->pagepool, page);
			WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
		}
	}
	if (be->compressed_pages < be->onstack_pages ||
	    be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES)
		kvfree(be->compressed_pages);
	z_erofs_fill_other_copies(be, err);

	for (i = 0; i < be->nr_pages; ++i) {
		page = be->decompressed_pages[i];
		if (!page)
			continue;

		DBG_BUGON(z_erofs_page_is_invalidated(page));

		/* recycle all individual short-lived pages */
		if (z_erofs_put_shortlivedpage(be->pagepool, page))
			continue;
		z_erofs_onlinepage_endio(page, err);
	}

	if (be->decompressed_pages != be->onstack_pages)
		kvfree(be->decompressed_pages);

	pcl->length = 0;
	pcl->partial = true;
	pcl->multibases = false;
	pcl->bvset.nextpage = NULL;
	pcl->vcnt = 0;

	/* pcluster lock MUST be taken before the following line */
	WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
	mutex_unlock(&pcl->lock);
	return err;
}
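
/*
 * Walk a decompression queue: pclusters are chained through ->next starting
 * from io->head; each one is decompressed in turn and then released (freed
 * directly for inline pclusters, or dropped via erofs_workgroup_put()).
 */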
static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
				     struct page **pagepool)
{
	struct z_erofs_decompress_backend be = {
		.sb = io->sb,
		.pagepool = pagepool,
		.decompressed_secondary_bvecs =
			LIST_HEAD_INIT(be.decompressed_secondary_bvecs),
	};
	z_erofs_next_pcluster_t owned = io->head;

	while (owned != Z_EROFS_PCLUSTER_TAIL) {
		DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);

		be.pcl = container_of(owned, struct z_erofs_pcluster, next);
		owned = READ_ONCE(be.pcl->next);

		z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0);
		if (z_erofs_is_inline_pcluster(be.pcl))
			z_erofs_free_pcluster(be.pcl);
		else
			erofs_workgroup_put(&be.pcl->obj);
	}
}

static void z_erofs_decompressqueue_work(struct work_struct *work)
{
	struct z_erofs_decompressqueue *bgq =
		container_of(work, struct z_erofs_decompressqueue, u.work);
	struct page *pagepool = NULL;

	DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL);
	z_erofs_decompress_queue(bgq, &pagepool);
	erofs_release_pages(&pagepool);
	kvfree(bgq);
}

#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work)
{
	z_erofs_decompressqueue_work((struct work_struct *)work);
}
#endif

static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
				       int bios)
{
	struct erofs_sb_info *const sbi = EROFS_SB(io->sb);

	/* wake up the caller thread for sync decompression */
	if (io->sync) {
		if (!atomic_add_return(bios, &io->pending_bios))
			complete(&io->u.done);
		return;
	}

	if (atomic_add_return(bios, &io->pending_bios))
		return;
	/* Use (kthread_)work and sync decompression for atomic contexts only */
	if (!in_task() || irqs_disabled() || rcu_read_lock_any_held()) {
#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
		struct kthread_worker *worker;

		rcu_read_lock();
		worker = rcu_dereference(
				z_erofs_pcpu_workers[raw_smp_processor_id()]);
		if (!worker) {
			INIT_WORK(&io->u.work, z_erofs_decompressqueue_work);
			queue_work(z_erofs_workqueue, &io->u.work);
		} else {
			kthread_queue_work(worker, &io->u.kthread_work);
		}
		rcu_read_unlock();
#else
		queue_work(z_erofs_workqueue, &io->u.work);
#endif
		/* enable sync decompression for readahead */
		if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO)
			sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON;
		return;
	}
	z_erofs_decompressqueue_work(&io->u.work);
}
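
/*
 * Pick up the compressed page for slot @nr of a pcluster before I/O
 * submission: the slot may already hold an inplace file page or short-lived
 * page, a preallocated page (promoted to the managed cache here), or a
 * managed-cache page that is reused, re-attached or replaced if it has been
 * truncated.  A NULL return means the cached page is already up-to-date and
 * no I/O is needed for this slot.
 */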
1476 */ 1477 if (mapping && mapping != mc) 1478 /* ought to be unmanaged pages */ 1479 goto out; 1480 1481 /* directly return for shortlived page as well */ 1482 if (z_erofs_is_shortlived_page(page)) 1483 goto out; 1484 1485 lock_page(page); 1486 1487 /* only true if page reclaim goes wrong, should never happen */ 1488 DBG_BUGON(justfound && PagePrivate(page)); 1489 1490 /* the page is still in manage cache */ 1491 if (page->mapping == mc) { 1492 WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); 1493 1494 if (!PagePrivate(page)) { 1495 /* 1496 * impossible to be !PagePrivate(page) for 1497 * the current restriction as well if 1498 * the page is already in compressed_bvecs[]. 1499 */ 1500 DBG_BUGON(!justfound); 1501 1502 justfound = 0; 1503 set_page_private(page, (unsigned long)pcl); 1504 SetPagePrivate(page); 1505 } 1506 1507 /* no need to submit io if it is already up-to-date */ 1508 if (PageUptodate(page)) { 1509 unlock_page(page); 1510 page = NULL; 1511 } 1512 goto out; 1513 } 1514 1515 /* 1516 * the managed page has been truncated, it's unsafe to 1517 * reuse this one, let's allocate a new cache-managed page. 1518 */ 1519 DBG_BUGON(page->mapping); 1520 DBG_BUGON(!justfound); 1521 1522 tocache = true; 1523 unlock_page(page); 1524 put_page(page); 1525 out_allocpage: 1526 page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); 1527 if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page, 1528 oldpage, page)) { 1529 erofs_pagepool_add(pagepool, page); 1530 cond_resched(); 1531 goto repeat; 1532 } 1533 out_tocache: 1534 if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { 1535 /* turn into temporary page if fails (1 ref) */ 1536 set_page_private(page, Z_EROFS_SHORTLIVED_PAGE); 1537 goto out; 1538 } 1539 attach_page_private(page, pcl); 1540 /* drop a refcount added by allocpage (then we have 2 refs here) */ 1541 put_page(page); 1542 1543 out: /* the only exit (for tracing and debugging) */ 1544 return page; 1545 } 1546 1547 static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb, 1548 struct z_erofs_decompressqueue *fgq, bool *fg) 1549 { 1550 struct z_erofs_decompressqueue *q; 1551 1552 if (fg && !*fg) { 1553 q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN); 1554 if (!q) { 1555 *fg = true; 1556 goto fg_out; 1557 } 1558 #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD 1559 kthread_init_work(&q->u.kthread_work, 1560 z_erofs_decompressqueue_kthread_work); 1561 #else 1562 INIT_WORK(&q->u.work, z_erofs_decompressqueue_work); 1563 #endif 1564 } else { 1565 fg_out: 1566 q = fgq; 1567 init_completion(&fgq->u.done); 1568 atomic_set(&fgq->pending_bios, 0); 1569 q->eio = false; 1570 q->sync = true; 1571 } 1572 q->sb = sb; 1573 q->head = Z_EROFS_PCLUSTER_TAIL; 1574 return q; 1575 } 1576 1577 /* define decompression jobqueue types */ 1578 enum { 1579 JQ_BYPASS, 1580 JQ_SUBMIT, 1581 NR_JOBQUEUES, 1582 }; 1583 1584 static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, 1585 z_erofs_next_pcluster_t qtail[], 1586 z_erofs_next_pcluster_t owned_head) 1587 { 1588 z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT]; 1589 z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS]; 1590 1591 WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL); 1592 1593 WRITE_ONCE(*submit_qtail, owned_head); 1594 WRITE_ONCE(*bypass_qtail, &pcl->next); 1595 1596 qtail[JQ_BYPASS] = &pcl->next; 1597 } 1598 1599 static void z_erofs_decompressqueue_endio(struct bio *bio) 1600 { 1601 struct z_erofs_decompressqueue *q = bio->bi_private; 1602 blk_status_t err = bio->bi_status; 1603 struct bio_vec 
static void z_erofs_decompressqueue_endio(struct bio *bio)
{
	struct z_erofs_decompressqueue *q = bio->bi_private;
	blk_status_t err = bio->bi_status;
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct page *page = bvec->bv_page;

		DBG_BUGON(PageUptodate(page));
		DBG_BUGON(z_erofs_page_is_invalidated(page));

		if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
			if (!err)
				SetPageUptodate(page);
			unlock_page(page);
		}
	}
	if (err)
		q->eio = true;
	z_erofs_decompress_kickoff(q, -1);
	bio_put(bio);
}

static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
				 struct z_erofs_decompressqueue *fgq,
				 bool *force_fg, bool readahead)
{
	struct super_block *sb = f->inode->i_sb;
	struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb));
	z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
	struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
	z_erofs_next_pcluster_t owned_head = f->owned_head;
	/* bio is NULL initially, so no need to initialize last_{index,bdev} */
	pgoff_t last_index;
	struct block_device *last_bdev;
	unsigned int nr_bios = 0;
	struct bio *bio = NULL;
	unsigned long pflags;
	int memstall = 0;

	/*
	 * if managed cache is enabled, bypass jobqueue is needed,
	 * no need to read from device for all pclusters in this queue.
	 */
	q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
	q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg);

	qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
	qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;

	/* by default, all need io submission */
	q[JQ_SUBMIT]->head = owned_head;
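
	/*
	 * walk the submission chain: pick up (or allocate) the compressed
	 * page for every slot of each pcluster and batch physically
	 * contiguous blocks into bios; pclusters needing no device I/O are
	 * moved to the bypass queue instead.
	 */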
	do {
		struct erofs_map_dev mdev;
		struct z_erofs_pcluster *pcl;
		pgoff_t cur, end;
		unsigned int i = 0;
		bool bypass = true;

		DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
		pcl = container_of(owned_head, struct z_erofs_pcluster, next);
		owned_head = READ_ONCE(pcl->next);

		if (z_erofs_is_inline_pcluster(pcl)) {
			move_to_bypass_jobqueue(pcl, qtail, owned_head);
			continue;
		}

		/* no device id here, thus it will always succeed */
		mdev = (struct erofs_map_dev) {
			.m_pa = erofs_pos(sb, pcl->obj.index),
		};
		(void)erofs_map_dev(sb, &mdev);

		cur = erofs_blknr(sb, mdev.m_pa);
		end = cur + pcl->pclusterpages;

		do {
			struct page *page;

			page = pickup_page_for_submission(pcl, i++,
					&f->pagepool, mc);
			if (!page)
				continue;

			if (bio && (cur != last_index + 1 ||
				    last_bdev != mdev.m_bdev)) {
submit_bio_retry:
				submit_bio(bio);
				if (memstall) {
					psi_memstall_leave(&pflags);
					memstall = 0;
				}
				bio = NULL;
			}

			if (unlikely(PageWorkingset(page)) && !memstall) {
				psi_memstall_enter(&pflags);
				memstall = 1;
			}

			if (!bio) {
				bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
						REQ_OP_READ, GFP_NOIO);
				bio->bi_end_io = z_erofs_decompressqueue_endio;

				last_bdev = mdev.m_bdev;
				bio->bi_iter.bi_sector = (sector_t)cur <<
					(sb->s_blocksize_bits - 9);
				bio->bi_private = q[JQ_SUBMIT];
				if (readahead)
					bio->bi_opf |= REQ_RAHEAD;
				++nr_bios;
			}

			if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
				goto submit_bio_retry;

			last_index = cur;
			bypass = false;
		} while (++cur < end);

		if (!bypass)
			qtail[JQ_SUBMIT] = &pcl->next;
		else
			move_to_bypass_jobqueue(pcl, qtail, owned_head);
	} while (owned_head != Z_EROFS_PCLUSTER_TAIL);

	if (bio) {
		submit_bio(bio);
		if (memstall)
			psi_memstall_leave(&pflags);
	}

	/*
	 * although background decompression is preferred, no bio is pending
	 * for submission; don't kick off decompression but drop it directly.
	 */
	if (!*force_fg && !nr_bios) {
		kvfree(q[JQ_SUBMIT]);
		return;
	}
	z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios);
}

static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
			     bool force_fg, bool ra)
{
	struct z_erofs_decompressqueue io[NR_JOBQUEUES];

	if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
		return;
	z_erofs_submit_queue(f, io, &force_fg, ra);

	/* handle bypass queue (no i/o pclusters) immediately */
	z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool);

	if (!force_fg)
		return;

	/* wait until all bios are completed */
	wait_for_completion_io(&io[JQ_SUBMIT].u.done);

	/* handle synchronous decompress queue in the caller context */
	z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool);
}

/*
 * Since partial uptodate is still unimplemented for now, we have to use
 * approximate readmore strategies as a start.
 */
static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
		struct readahead_control *rac, bool backmost)
{
	struct inode *inode = f->inode;
	struct erofs_map_blocks *map = &f->map;
	erofs_off_t cur, end, headoffset = f->headoffset;
	int err;

	if (backmost) {
		if (rac)
			end = headoffset + readahead_length(rac) - 1;
		else
			end = headoffset + PAGE_SIZE - 1;
		map->m_la = end;
		err = z_erofs_map_blocks_iter(inode, map,
					      EROFS_GET_BLOCKS_READMORE);
		if (err)
			return;

		/* expand ra for the trailing edge if readahead */
		if (rac) {
			cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
			readahead_expand(rac, headoffset, cur - headoffset);
			return;
		}
		end = round_up(end, PAGE_SIZE);
	} else {
		end = round_up(map->m_la, PAGE_SIZE);

		if (!map->m_llen)
			return;
	}

	cur = map->m_la + map->m_llen - 1;
	while ((cur >= end) && (cur < i_size_read(inode))) {
		pgoff_t index = cur >> PAGE_SHIFT;
		struct page *page;

		page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
		if (page) {
			if (PageUptodate(page))
				unlock_page(page);
			else
				(void)z_erofs_do_read_page(f, page);
			put_page(page);
		}

		if (cur < PAGE_SIZE)
			break;
		cur = (index << PAGE_SHIFT) - 1;
	}
}

static int z_erofs_read_folio(struct file *file, struct folio *folio)
{
	struct inode *const inode = folio->mapping->host;
	struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
	int err;

	trace_erofs_read_folio(folio, false);
	f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;

	z_erofs_pcluster_readmore(&f, NULL, true);
	err = z_erofs_do_read_page(&f, &folio->page);
	z_erofs_pcluster_readmore(&f, NULL, false);
	z_erofs_pcluster_end(&f);

	/* if some compressed clusters are ready, submit them anyway */
	z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false);

	if (err && err != -EINTR)
		erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu",
			  err, folio->index, EROFS_I(inode)->nid);

	erofs_put_metabuf(&f.map.buf);
	erofs_release_pages(&f.pagepool);
	return err;
}

static void z_erofs_readahead(struct readahead_control *rac)
{
	struct inode *const inode = rac->mapping->host;
	struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
	struct folio *head = NULL, *folio;
	unsigned int nr_folios;
	int err;

	f.headoffset = readahead_pos(rac);

	z_erofs_pcluster_readmore(&f, rac, true);
	nr_folios = readahead_count(rac);
	trace_erofs_readpages(inode, readahead_index(rac), nr_folios, false);

	while ((folio = readahead_folio(rac))) {
		folio->private = head;
		head = folio;
	}

	/* traverse in reverse order for best metadata I/O performance */
	while (head) {
		folio = head;
		head = folio_get_private(folio);

		err = z_erofs_do_read_page(&f, &folio->page);
		if (err && err != -EINTR)
			erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
				  folio->index, EROFS_I(inode)->nid);
	}
	z_erofs_pcluster_readmore(&f, rac, false);
	z_erofs_pcluster_end(&f);

	z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true);
	erofs_put_metabuf(&f.map.buf);
	erofs_release_pages(&f.pagepool);
}

const struct address_space_operations z_erofs_aops = {
	.read_folio = z_erofs_read_folio,
	.readahead = z_erofs_readahead,
};