// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2018 HUAWEI, Inc.
 *             https://www.huawei.com/
 * Copyright (C) 2022 Alibaba Cloud
 */
#include "zdata.h"
#include "compress.h"
#include <linux/prefetch.h>

#include <trace/events/erofs.h>

/*
 * Since pclustersize is variable for the big pcluster feature, introduce
 * slab pools for different pcluster sizes.
 */
struct z_erofs_pcluster_slab {
	struct kmem_cache *slab;
	unsigned int maxpages;
	char name[48];
};

#define _PCLP(n) { .maxpages = n }

static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
	_PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
	_PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
};

/* (obsolete) page type for online pages */
enum z_erofs_page_type {
	/* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */
	Z_EROFS_PAGE_TYPE_EXCLUSIVE,

	Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED,

	Z_EROFS_VLE_PAGE_TYPE_HEAD,
	Z_EROFS_VLE_PAGE_TYPE_MAX
};

struct z_erofs_bvec_iter {
	struct page *bvpage;
	struct z_erofs_bvset *bvset;
	unsigned int nr, cur;
};

static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter)
{
	if (iter->bvpage)
		kunmap_local(iter->bvset);
	return iter->bvpage;
}

static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter)
{
	unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec;
	/* have to access nextpage in advance, otherwise it will be unmapped */
	struct page *nextpage = iter->bvset->nextpage;
	struct page *oldpage;

	DBG_BUGON(!nextpage);
	oldpage = z_erofs_bvec_iter_end(iter);
	iter->bvpage = nextpage;
	iter->bvset = kmap_local_page(nextpage);
	iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec);
	iter->cur = 0;
	return oldpage;
}

static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter,
				    struct z_erofs_bvset_inline *bvset,
				    unsigned int bootstrap_nr,
				    unsigned int cur)
{
	*iter = (struct z_erofs_bvec_iter) {
		.nr = bootstrap_nr,
		.bvset = (struct z_erofs_bvset *)bvset,
	};

	while (cur > iter->nr) {
		cur -= iter->nr;
		z_erofs_bvset_flip(iter);
	}
	iter->cur = cur;
}

static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
				struct z_erofs_bvec *bvec,
				struct page **candidate_bvpage)
{
	if (iter->cur == iter->nr) {
		if (!*candidate_bvpage)
			return -EAGAIN;

		DBG_BUGON(iter->bvset->nextpage);
		iter->bvset->nextpage = *candidate_bvpage;
		z_erofs_bvset_flip(iter);

		iter->bvset->nextpage = NULL;
		*candidate_bvpage = NULL;
	}
	iter->bvset->bvec[iter->cur++] = *bvec;
	return 0;
}

static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter,
				 struct z_erofs_bvec *bvec,
				 struct page **old_bvpage)
{
	if (iter->cur == iter->nr)
		*old_bvpage = z_erofs_bvset_flip(iter);
	else
		*old_bvpage = NULL;
	*bvec = iter->bvset->bvec[iter->cur++];
}
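
/*
 * Illustrative sketch (not a real caller): with the inline bvset as
 * bootstrap storage, a producer records bvecs roughly as follows, where
 * `bvecs[]' and `n' are hypothetical locals:
 *
 *	struct z_erofs_bvec_iter it;
 *	struct page *bvpage = NULL;
 *
 *	z_erofs_bvec_iter_begin(&it, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0);
 *	for (i = 0; i < n; ++i)
 *		if (z_erofs_bvec_enqueue(&it, &bvecs[i], &bvpage) == -EAGAIN)
 *			break;	// a fresh short-lived bvpage is needed
 *	z_erofs_bvec_iter_end(&it);
 *
 * Once the inline slots are exhausted, enqueue consumes the staged
 * `bvpage' and chains it through bvset->nextpage.
 */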

static void z_erofs_destroy_pcluster_pool(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
		if (!pcluster_pool[i].slab)
			continue;
		kmem_cache_destroy(pcluster_pool[i].slab);
		pcluster_pool[i].slab = NULL;
	}
}

static int z_erofs_create_pcluster_pool(void)
{
	struct z_erofs_pcluster_slab *pcs;
	struct z_erofs_pcluster *a;
	unsigned int size;

	for (pcs = pcluster_pool;
	     pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
		size = struct_size(a, compressed_pages, pcs->maxpages);

		sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages);
		pcs->slab = kmem_cache_create(pcs->name, size, 0,
					      SLAB_RECLAIM_ACCOUNT, NULL);
		if (pcs->slab)
			continue;

		z_erofs_destroy_pcluster_pool();
		return -ENOMEM;
	}
	return 0;
}

static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
		struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
		struct z_erofs_pcluster *pcl;

		if (nrpages > pcs->maxpages)
			continue;

		pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS);
		if (!pcl)
			return ERR_PTR(-ENOMEM);
		pcl->pclusterpages = nrpages;
		return pcl;
	}
	return ERR_PTR(-EINVAL);
}

static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
{
	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
	int i;

	for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
		struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;

		if (pclusterpages > pcs->maxpages)
			continue;

		kmem_cache_free(pcs->slab, pcl);
		return;
	}
	DBG_BUGON(1);
}
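
/*
 * Example (illustrative numbers): with the pool sizes defined above, a
 * pcluster spanning 3 compressed pages is served from the
 * "erofs_pcluster-4" cache since 4 is the smallest maxpages >= 3; a
 * request beyond Z_EROFS_PCLUSTER_MAX_PAGES makes
 * z_erofs_alloc_pcluster() return ERR_PTR(-EINVAL).
 */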

/* how to allocate cached pages for a pcluster */
enum z_erofs_cache_alloctype {
	DONTALLOC,	/* don't allocate any cached pages */
	/*
	 * try to use cached I/O if page allocation succeeds or fallback
	 * to in-place I/O instead to avoid any direct reclaim.
	 */
	TRYALLOC,
};

/*
 * tagged pointer with 1-bit tag for all compressed pages
 * tag 1 - the page is just found with an extra page reference
 */
typedef tagptr1_t compressed_page_t;

#define tag_compressed_page_justfound(page) \
	tagptr_fold(compressed_page_t, page, 1)

static struct workqueue_struct *z_erofs_workqueue __read_mostly;

void z_erofs_exit_zip_subsystem(void)
{
	destroy_workqueue(z_erofs_workqueue);
	z_erofs_destroy_pcluster_pool();
}

static inline int z_erofs_init_workqueue(void)
{
	const unsigned int onlinecpus = num_possible_cpus();

	/*
	 * No need to spawn too many threads; limiting the thread count
	 * minimizes scheduling overhead. Perhaps per-CPU threads would be
	 * better?
	 */
	z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
					    WQ_UNBOUND | WQ_HIGHPRI,
					    onlinecpus + onlinecpus / 4);
	return z_erofs_workqueue ? 0 : -ENOMEM;
}

int __init z_erofs_init_zip_subsystem(void)
{
	int err = z_erofs_create_pcluster_pool();

	if (err)
		return err;
	err = z_erofs_init_workqueue();
	if (err)
		z_erofs_destroy_pcluster_pool();
	return err;
}

enum z_erofs_collectmode {
	COLLECT_SECONDARY,
	COLLECT_PRIMARY,
	/*
	 * The current collection is the tail of an existing chain, and all
	 * previously processed chained collections have been decided to
	 * hook up to it.
	 * A new chain will be created for the remaining collections which
	 * are not processed yet; therefore, different from
	 * COLLECT_PRIMARY_FOLLOWED, the next collection cannot reuse the
	 * whole page safely in the following scenario:
	 *  ________________________________________________________________
	 * |     tail (partial) page      |      head (partial) page        |
	 * |  (belongs to the next cl)    |  (belongs to the current cl)    |
	 * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________|
	 */
	COLLECT_PRIMARY_HOOKED,
	/*
	 * a weak form of COLLECT_PRIMARY_FOLLOWED, the difference is that it
	 * could be dispatched into the bypass queue later due to up-to-date
	 * managed pages. All related online pages cannot be reused for
	 * in-place I/O (or bvpage) since the pcluster can be decoded directly
	 * without I/O submission.
	 */
	COLLECT_PRIMARY_FOLLOWED_NOINPLACE,
	/*
	 * The current collection has been linked with the owned chain, and
	 * could also be linked with the remaining collections, which means
	 * that if the processing page is the tail page of the collection,
	 * the current collection can safely use the whole page (since
	 * the previous collection is under control) for in-place I/O, as
	 * illustrated below:
	 *  ________________________________________________________________
	 * |  tail (partial) page |          head (partial) page           |
	 * |  (of the current cl) |      (of the previous collection)      |
	 * |  PRIMARY_FOLLOWED or |                                        |
	 * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________|
	 *
	 * [  (*) the above page can be used as inplace I/O.               ]
	 */
	COLLECT_PRIMARY_FOLLOWED,
};

struct z_erofs_decompress_frontend {
	struct inode *const inode;
	struct erofs_map_blocks map;
	struct z_erofs_bvec_iter biter;

	struct page *candidate_bvpage;
	struct z_erofs_pcluster *pcl, *tailpcl;
	/* a pointer used to pick up inplace I/O pages */
	struct page **icpage_ptr;
	z_erofs_next_pcluster_t owned_head;

	enum z_erofs_collectmode mode;

	bool readahead;
	/* used for applying cache strategy on the fly */
	bool backmost;
	erofs_off_t headoffset;
};

#define DECOMPRESS_FRONTEND_INIT(__i) { \
	.inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
	.mode = COLLECT_PRIMARY_FOLLOWED, .backmost = true }

static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES];
static DEFINE_MUTEX(z_pagemap_global_lock);
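
/*
 * Chain-state note (explanatory): pcl->next walks through three special
 * values besides a real successor pointer: Z_EROFS_PCLUSTER_NIL (not on
 * any chain yet), Z_EROFS_PCLUSTER_TAIL (open tail of a chain, still
 * extendable) and Z_EROFS_PCLUSTER_TAIL_CLOSED (tail sealed for
 * submission); the collectmodes above describe how a frontend may claim
 * a pcluster in each of these states.
 */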

static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
			       enum z_erofs_cache_alloctype type,
			       struct page **pagepool)
{
	struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
	struct z_erofs_pcluster *pcl = fe->pcl;
	bool standalone = true;
	/*
	 * optimistic allocation without direct reclaim since in-place I/O
	 * can be used under low memory instead.
	 */
	gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
			__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
	struct page **pages;
	pgoff_t index;

	if (fe->mode < COLLECT_PRIMARY_FOLLOWED)
		return;

	pages = pcl->compressed_pages;
	index = pcl->obj.index;
	for (; index < pcl->obj.index + pcl->pclusterpages; ++index, ++pages) {
		struct page *page;
		compressed_page_t t;
		struct page *newpage = NULL;

		/* the compressed page was loaded before */
		if (READ_ONCE(*pages))
			continue;

		page = find_get_page(mc, index);

		if (page) {
			t = tag_compressed_page_justfound(page);
		} else {
			/* I/O is needed, impossible to decompress directly */
			standalone = false;
			switch (type) {
			case TRYALLOC:
				newpage = erofs_allocpage(pagepool, gfp);
				if (!newpage)
					continue;
				set_page_private(newpage,
						 Z_EROFS_PREALLOCATED_PAGE);
				t = tag_compressed_page_justfound(newpage);
				break;
			default:	/* DONTALLOC */
				continue;
			}
		}

		if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
			continue;

		if (page)
			put_page(page);
		else if (newpage)
			erofs_pagepool_add(pagepool, newpage);
	}

	/*
	 * don't do inplace I/O if all compressed pages are available in
	 * managed cache since it can be moved to the bypass queue instead.
	 */
	if (standalone)
		fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
}
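
/*
 * Note on the cmpxchg_relaxed() above (explanatory, no functional
 * change): slot publication is lock-free. If the compare-and-swap
 * observes a non-NULL slot, another path has already supplied the
 * compressed page, so the local reference is dropped instead: put_page()
 * for a page found in the managed cache, or back to the pagepool for a
 * freshly allocated one.
 */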

/* called by erofs_shrinker to get rid of all compressed_pages */
int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
				       struct erofs_workgroup *grp)
{
	struct z_erofs_pcluster *const pcl =
		container_of(grp, struct z_erofs_pcluster, obj);
	int i;

	DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
	/*
	 * refcount of the workgroup is now frozen at 1,
	 * therefore no need to worry about available decompression users.
	 */
	for (i = 0; i < pcl->pclusterpages; ++i) {
		struct page *page = pcl->compressed_pages[i];

		if (!page)
			continue;

		/* block other users from reclaiming or migrating the page */
		if (!trylock_page(page))
			return -EBUSY;

		if (!erofs_page_is_managed(sbi, page))
			continue;

		/* barrier is implied in the following 'unlock_page' */
		WRITE_ONCE(pcl->compressed_pages[i], NULL);
		detach_page_private(page);
		unlock_page(page);
	}
	return 0;
}

int erofs_try_to_free_cached_page(struct page *page)
{
	struct z_erofs_pcluster *const pcl = (void *)page_private(page);
	int ret = 0;	/* 0 - busy */

	if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) {
		unsigned int i;

		DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
		for (i = 0; i < pcl->pclusterpages; ++i) {
			if (pcl->compressed_pages[i] == page) {
				WRITE_ONCE(pcl->compressed_pages[i], NULL);
				ret = 1;
				break;
			}
		}
		erofs_workgroup_unfreeze(&pcl->obj, 1);

		if (ret)
			detach_page_private(page);
	}
	return ret;
}

/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
				   struct page *page)
{
	struct z_erofs_pcluster *const pcl = fe->pcl;

	while (fe->icpage_ptr > pcl->compressed_pages)
		if (!cmpxchg(--fe->icpage_ptr, NULL, page))
			return true;
	return false;
}

/* callers must hold the pcluster lock */
static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
			       struct z_erofs_bvec *bvec,
			       enum z_erofs_page_type type)
{
	int ret;

	if (fe->mode >= COLLECT_PRIMARY &&
	    type == Z_EROFS_PAGE_TYPE_EXCLUSIVE) {
		/* prefer using file pages for in-place I/O first */
		if (z_erofs_try_inplace_io(fe, bvec->page))
			return 0;
		/* otherwise, check if it can be used as a bvpage */
		if (fe->mode >= COLLECT_PRIMARY_FOLLOWED &&
		    !fe->candidate_bvpage)
			fe->candidate_bvpage = bvec->page;
	}
	ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage);
	fe->pcl->vcnt += (ret >= 0);
	return ret;
}
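
/*
 * Contract note (explanatory): z_erofs_bvec_enqueue() returns -EAGAIN
 * only when the current bvset page is full and no candidate_bvpage has
 * been staged; the caller then allocates a short-lived page and retries,
 * as done in z_erofs_do_read_page() below.
 */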

static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
{
	struct z_erofs_pcluster *pcl = f->pcl;
	z_erofs_next_pcluster_t *owned_head = &f->owned_head;

	/* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */
	if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
		    *owned_head) == Z_EROFS_PCLUSTER_NIL) {
		*owned_head = &pcl->next;
		/* so we can attach this pcluster to our submission chain. */
		f->mode = COLLECT_PRIMARY_FOLLOWED;
		return;
	}

	/*
	 * type 2, link to the end of an existing open chain, be careful
	 * that its submission is controlled by the original attached chain.
	 */
	if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
		    *owned_head) == Z_EROFS_PCLUSTER_TAIL) {
		*owned_head = Z_EROFS_PCLUSTER_TAIL;
		f->mode = COLLECT_PRIMARY_HOOKED;
		f->tailpcl = NULL;
		return;
	}
	/* type 3, it belongs to a chain, but it isn't the end of the chain */
	f->mode = COLLECT_PRIMARY;
}

static int z_erofs_lookup_pcluster(struct z_erofs_decompress_frontend *fe)
{
	struct erofs_map_blocks *map = &fe->map;
	struct z_erofs_pcluster *pcl = fe->pcl;
	unsigned int length;

	/* to avoid unexpected loop formed by corrupted images */
	if (fe->owned_head == &pcl->next || pcl == fe->tailpcl) {
		DBG_BUGON(1);
		return -EFSCORRUPTED;
	}

	if (pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) {
		DBG_BUGON(1);
		return -EFSCORRUPTED;
	}

	length = READ_ONCE(pcl->length);
	if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) {
		if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) {
			DBG_BUGON(1);
			return -EFSCORRUPTED;
		}
	} else {
		unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT;

		if (map->m_flags & EROFS_MAP_FULL_MAPPED)
			llen |= Z_EROFS_PCLUSTER_FULL_LENGTH;

		while (llen > length &&
		       length != cmpxchg_relaxed(&pcl->length, length, llen)) {
			cpu_relax();
			length = READ_ONCE(pcl->length);
		}
	}
	mutex_lock(&pcl->lock);
	/* used to check tail merging loop due to corrupted images */
	if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
		fe->tailpcl = pcl;

	z_erofs_try_to_claim_pcluster(fe);
	return 0;
}
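
/*
 * Explanatory note on the length update above: pcl->length holds the
 * largest logical length seen so far (shifted, with the low bit reserved
 * for Z_EROFS_PCLUSTER_FULL_LENGTH). The cmpxchg_relaxed() loop retries
 * until either the stored value already covers this thread's llen or the
 * swap succeeds; in effect, a lock-free "max" update.
 */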

static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
{
	struct erofs_map_blocks *map = &fe->map;
	bool ztailpacking = map->m_flags & EROFS_MAP_META;
	struct z_erofs_pcluster *pcl;
	struct erofs_workgroup *grp;
	int err;

	if (!(map->m_flags & EROFS_MAP_ENCODED)) {
		DBG_BUGON(1);
		return -EFSCORRUPTED;
	}

	/* no available pcluster, let's allocate one */
	pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 :
				     map->m_plen >> PAGE_SHIFT);
	if (IS_ERR(pcl))
		return PTR_ERR(pcl);

	atomic_set(&pcl->obj.refcount, 1);
	pcl->algorithmformat = map->m_algorithmformat;
	pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
		(map->m_flags & EROFS_MAP_FULL_MAPPED ?
			Z_EROFS_PCLUSTER_FULL_LENGTH : 0);

	/* new pclusters should be claimed as type 1, primary and followed */
	pcl->next = fe->owned_head;
	pcl->pageofs_out = map->m_la & ~PAGE_MASK;
	fe->mode = COLLECT_PRIMARY_FOLLOWED;

	/*
	 * lock all primary followed works before they are visible to
	 * others; mutex_trylock *never* fails for a new pcluster.
	 */
	mutex_init(&pcl->lock);
	DBG_BUGON(!mutex_trylock(&pcl->lock));

	if (ztailpacking) {
		pcl->obj.index = 0;	/* which indicates ztailpacking */
		pcl->pageofs_in = erofs_blkoff(map->m_pa);
		pcl->tailpacking_size = map->m_plen;
	} else {
		pcl->obj.index = map->m_pa >> PAGE_SHIFT;

		grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
		if (IS_ERR(grp)) {
			err = PTR_ERR(grp);
			goto err_out;
		}

		if (grp != &pcl->obj) {
			fe->pcl = container_of(grp,
					struct z_erofs_pcluster, obj);
			err = -EEXIST;
			goto err_out;
		}
	}
	/* used to check tail merging loop due to corrupted images */
	if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
		fe->tailpcl = pcl;
	fe->owned_head = &pcl->next;
	fe->pcl = pcl;
	return 0;

err_out:
	mutex_unlock(&pcl->lock);
	z_erofs_free_pcluster(pcl);
	return err;
}

static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
{
	struct erofs_map_blocks *map = &fe->map;
	struct erofs_workgroup *grp = NULL;
	int ret;

	DBG_BUGON(fe->pcl);

	/* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
	DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
	DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);

	if (!(map->m_flags & EROFS_MAP_META)) {
		grp = erofs_find_workgroup(fe->inode->i_sb,
					   map->m_pa >> PAGE_SHIFT);
	} else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
		DBG_BUGON(1);
		return -EFSCORRUPTED;
	}

	if (grp) {
		fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
		ret = -EEXIST;
	} else {
		ret = z_erofs_register_pcluster(fe);
	}

	if (ret == -EEXIST) {
		ret = z_erofs_lookup_pcluster(fe);
		if (ret) {
			erofs_workgroup_put(&fe->pcl->obj);
			return ret;
		}
	} else if (ret) {
		return ret;
	}
	z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
				Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
	/* since file-backed online pages are traversed in reverse order */
	fe->icpage_ptr = fe->pcl->compressed_pages +
			z_erofs_pclusterpages(fe->pcl);
	return 0;
}
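
/*
 * Explanatory note: -EEXIST above is not an error but a signal that the
 * pcluster already exists, either found up front via
 * erofs_find_workgroup() or raced in during erofs_insert_workgroup();
 * in both cases the existing pcluster is then validated and claimed
 * through z_erofs_lookup_pcluster().
 */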

/*
 * Keep in mind that referenced pclusters are only freed after an RCU
 * grace period.
 */
static void z_erofs_rcu_callback(struct rcu_head *head)
{
	z_erofs_free_pcluster(container_of(head,
			struct z_erofs_pcluster, rcu));
}

void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
{
	struct z_erofs_pcluster *const pcl =
		container_of(grp, struct z_erofs_pcluster, obj);

	call_rcu(&pcl->rcu, z_erofs_rcu_callback);
}

static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
{
	struct z_erofs_pcluster *pcl = fe->pcl;

	if (!pcl)
		return false;

	z_erofs_bvec_iter_end(&fe->biter);
	mutex_unlock(&pcl->lock);

	if (fe->candidate_bvpage) {
		DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage));
		fe->candidate_bvpage = NULL;
	}

	/*
	 * once all pending pages are added, don't hold the pcluster
	 * reference any longer if it isn't hosted by ourselves.
	 */
	if (fe->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE)
		erofs_workgroup_put(&pcl->obj);

	fe->pcl = NULL;
	return true;
}

static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
				       unsigned int cachestrategy,
				       erofs_off_t la)
{
	if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
		return false;

	if (fe->backmost)
		return true;

	return cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
		la < fe->headoffset;
}
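
/*
 * Policy example (illustrative): with any strategy above
 * EROFS_ZIP_CACHE_DISABLED, the backmost (currently accessed) pcluster
 * always gets cached pages; EROFS_ZIP_CACHE_READAROUND additionally
 * caches pclusters at logical offsets before the requested start
 * (la < fe->headoffset).
 */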

static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
				struct page *page, struct page **pagepool)
{
	struct inode *const inode = fe->inode;
	struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
	struct erofs_map_blocks *const map = &fe->map;
	const loff_t offset = page_offset(page);
	bool tight = true;

	enum z_erofs_cache_alloctype cache_strategy;
	enum z_erofs_page_type page_type;
	unsigned int cur, end, split, index;
	int err = 0;

	/* register locked file pages as online pages in pack */
	z_erofs_onlinepage_init(page);

	split = 0;
	end = PAGE_SIZE;
repeat:
	cur = end - 1;

	if (offset + cur < map->m_la ||
	    offset + cur >= map->m_la + map->m_llen) {
		erofs_dbg("out-of-range map @ pos %llu", offset + cur);

		if (z_erofs_collector_end(fe))
			fe->backmost = false;
		map->m_la = offset + cur;
		map->m_llen = 0;
		err = z_erofs_map_blocks_iter(inode, map, 0);
		if (err)
			goto err_out;
	} else {
		if (fe->pcl)
			goto hitted;
		/* didn't get a valid pcluster previously (very rare) */
	}

	if (!(map->m_flags & EROFS_MAP_MAPPED))
		goto hitted;

	err = z_erofs_collector_begin(fe);
	if (err)
		goto err_out;

	if (z_erofs_is_inline_pcluster(fe->pcl)) {
		void *mp;

		mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb,
					erofs_blknr(map->m_pa), EROFS_NO_KMAP);
		if (IS_ERR(mp)) {
			err = PTR_ERR(mp);
			erofs_err(inode->i_sb,
				  "failed to get inline page, err %d", err);
			goto err_out;
		}
		get_page(fe->map.buf.page);
		WRITE_ONCE(fe->pcl->compressed_pages[0], fe->map.buf.page);
		fe->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
	} else {
		/* bind cache first when cached decompression is preferred */
		if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy,
					       map->m_la))
			cache_strategy = TRYALLOC;
		else
			cache_strategy = DONTALLOC;

		z_erofs_bind_cache(fe, cache_strategy, pagepool);
	}
hitted:
	/*
	 * Ensure the current partial page belongs to this submit chain rather
	 * than other concurrent submit chains or the noio(bypass) chain since
	 * those chains are handled asynchronously thus the page cannot be used
	 * for inplace I/O or bvpage (should be processed in a strict order.)
	 */
	tight &= (fe->mode >= COLLECT_PRIMARY_HOOKED &&
		  fe->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE);

	cur = end - min_t(unsigned int, offset + end - map->m_la, end);
	if (!(map->m_flags & EROFS_MAP_MAPPED)) {
		zero_user_segment(page, cur, end);
		goto next_part;
	}

	/* let's derive page type */
	page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
		(!split ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
			(tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
				Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));

	if (cur)
		tight &= (fe->mode >= COLLECT_PRIMARY_FOLLOWED);

retry:
	err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) {
					.page = page,
					.offset = offset - map->m_la,
					.end = end,
				  }), page_type);
	/* should allocate an additional short-lived page for bvset */
	if (err == -EAGAIN && !fe->candidate_bvpage) {
		fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL);
		set_page_private(fe->candidate_bvpage,
				 Z_EROFS_SHORTLIVED_PAGE);
		goto retry;
	}

	if (err) {
		DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage);
		goto err_out;
	}

	index = page->index - (map->m_la >> PAGE_SHIFT);

	z_erofs_onlinepage_fixup(page, index, true);

	/* bump up the number of split parts of a page */
	++split;
	/* also update nr_pages */
	fe->pcl->nr_pages = max_t(pgoff_t, fe->pcl->nr_pages, index + 1);
next_part:
	/* can be used for verification */
	map->m_llen = offset + cur - map->m_la;

	end = cur;
	if (end > 0)
		goto repeat;

out:
	z_erofs_onlinepage_endio(page);

	erofs_dbg("%s, finish page: %pK split: %u map->m_llen %llu",
		  __func__, page, split, map->m_llen);
	return err;

	/* if some error occurred while processing this page */
err_out:
	SetPageError(page);
	goto out;
}
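
/*
 * Walkthrough (explanatory): a file page is consumed back to front. Each
 * `repeat' pass maps the extent covering the page tail, attaches the
 * covered [cur, end) part to its pcluster, then shrinks `end' down to
 * `cur' until the whole page is covered, counting one `split' part per
 * extent the page straddles.
 */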

static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi,
					       unsigned int readahead_pages)
{
	/* auto: enable for read_folio, disable for readahead */
	if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) &&
	    !readahead_pages)
		return true;

	if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) &&
	    (readahead_pages <= sbi->opt.max_sync_decompress_pages))
		return true;

	return false;
}

static bool z_erofs_page_is_invalidated(struct page *page)
{
	return !page->mapping && !z_erofs_is_shortlived_page(page);
}

static int z_erofs_parse_out_bvecs(struct z_erofs_pcluster *pcl,
				   struct page **pages, struct page **pagepool)
{
	struct z_erofs_bvec_iter biter;
	struct page *old_bvpage;
	int i, err = 0;

	z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0);
	for (i = 0; i < pcl->vcnt; ++i) {
		struct z_erofs_bvec bvec;
		unsigned int pagenr;

		z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage);

		if (old_bvpage)
			z_erofs_put_shortlivedpage(pagepool, old_bvpage);

		pagenr = (bvec.offset + pcl->pageofs_out) >> PAGE_SHIFT;
		DBG_BUGON(pagenr >= pcl->nr_pages);
		DBG_BUGON(z_erofs_page_is_invalidated(bvec.page));
		/*
		 * currently EROFS doesn't support multiref (dedupe),
		 * so error out on any multiref page here.
		 */
		if (pages[pagenr]) {
			DBG_BUGON(1);
			SetPageError(pages[pagenr]);
			z_erofs_onlinepage_endio(pages[pagenr]);
			err = -EFSCORRUPTED;
		}
		pages[pagenr] = bvec.page;
	}

	old_bvpage = z_erofs_bvec_iter_end(&biter);
	if (old_bvpage)
		z_erofs_put_shortlivedpage(pagepool, old_bvpage);
	return err;
}

static struct page **z_erofs_parse_in_bvecs(struct erofs_sb_info *sbi,
			struct z_erofs_pcluster *pcl, struct page **pages,
			struct page **pagepool, bool *overlapped)
{
	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
	struct page **compressed_pages;
	int i, err = 0;

	/* XXX: will have a better approach in the following commits */
	compressed_pages = kmalloc_array(pclusterpages, sizeof(struct page *),
					 GFP_KERNEL | __GFP_NOFAIL);
	*overlapped = false;

	for (i = 0; i < pclusterpages; ++i) {
		unsigned int pagenr;
		struct page *page = pcl->compressed_pages[i];

		/* compressed pages ought to be present before decompressing */
		if (!page) {
			DBG_BUGON(1);
			continue;
		}
		compressed_pages[i] = page;

		if (z_erofs_is_inline_pcluster(pcl)) {
			if (!PageUptodate(page))
				err = -EIO;
			continue;
		}

		DBG_BUGON(z_erofs_page_is_invalidated(page));
		if (!z_erofs_is_shortlived_page(page)) {
			if (erofs_page_is_managed(sbi, page)) {
				if (!PageUptodate(page))
					err = -EIO;
				continue;
			}

			/*
			 * only non-head pages can be selected
			 * for in-place decompression
			 */
			pagenr = z_erofs_onlinepage_index(page);

			DBG_BUGON(pagenr >= pcl->nr_pages);
			if (pages[pagenr]) {
				DBG_BUGON(1);
				SetPageError(pages[pagenr]);
				z_erofs_onlinepage_endio(pages[pagenr]);
				err = -EFSCORRUPTED;
			}
			pages[pagenr] = page;

			*overlapped = true;
		}

		/* PG_error needs checking for all non-managed pages */
		if (PageError(page)) {
			DBG_BUGON(PageUptodate(page));
			err = -EIO;
		}
	}

	if (err) {
		kfree(compressed_pages);
		return ERR_PTR(err);
	}
	return compressed_pages;
}

static int z_erofs_decompress_pcluster(struct super_block *sb,
				       struct z_erofs_pcluster *pcl,
				       struct page **pagepool)
{
	struct erofs_sb_info *const sbi = EROFS_SB(sb);
	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
	unsigned int i, inputsize, outputsize, llen, nr_pages;
	struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES];
	struct page **pages, **compressed_pages, *page;

	bool overlapped, partial;
	int err;

	might_sleep();
	DBG_BUGON(!READ_ONCE(pcl->nr_pages));

	mutex_lock(&pcl->lock);
	nr_pages = pcl->nr_pages;

	if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) {
		pages = pages_onstack;
	} else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES &&
		   mutex_trylock(&z_pagemap_global_lock)) {
		pages = z_pagemap_global;
	} else {
		gfp_t gfp_flags = GFP_KERNEL;

		if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES)
			gfp_flags |= __GFP_NOFAIL;

		pages = kvmalloc_array(nr_pages, sizeof(struct page *),
				       gfp_flags);

		/* fallback to global pagemap for the lowmem scenario */
		if (!pages) {
			mutex_lock(&z_pagemap_global_lock);
			pages = z_pagemap_global;
		}
	}

	for (i = 0; i < nr_pages; ++i)
		pages[i] = NULL;

	err = z_erofs_parse_out_bvecs(pcl, pages, pagepool);
	compressed_pages = z_erofs_parse_in_bvecs(sbi, pcl, pages,
						  pagepool, &overlapped);
	if (IS_ERR(compressed_pages)) {
		err = PTR_ERR(compressed_pages);
		compressed_pages = NULL;
	}

	if (err)
		goto out;

	llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT;
	if (nr_pages << PAGE_SHIFT >= pcl->pageofs_out + llen) {
		outputsize = llen;
		partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH);
	} else {
		outputsize = (nr_pages << PAGE_SHIFT) - pcl->pageofs_out;
		partial = true;
	}

	if (z_erofs_is_inline_pcluster(pcl))
		inputsize = pcl->tailpacking_size;
	else
		inputsize = pclusterpages * PAGE_SIZE;

	err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
					.sb = sb,
					.in = compressed_pages,
					.out = pages,
					.pageofs_in = pcl->pageofs_in,
					.pageofs_out = pcl->pageofs_out,
					.inputsize = inputsize,
					.outputsize = outputsize,
					.alg = pcl->algorithmformat,
					.inplace_io = overlapped,
					.partial_decoding = partial
				 }, pagepool);

out:
	/* must handle all compressed pages before actual file pages */
	if (z_erofs_is_inline_pcluster(pcl)) {
		page = pcl->compressed_pages[0];
		WRITE_ONCE(pcl->compressed_pages[0], NULL);
		put_page(page);
	} else {
		for (i = 0; i < pclusterpages; ++i) {
			page = pcl->compressed_pages[i];

			if (erofs_page_is_managed(sbi, page))
				continue;

			/* recycle all individual short-lived pages */
			(void)z_erofs_put_shortlivedpage(pagepool, page);
			WRITE_ONCE(pcl->compressed_pages[i], NULL);
		}
	}
	kfree(compressed_pages);

	for (i = 0; i < nr_pages; ++i) {
		page = pages[i];
		if (!page)
			continue;

		DBG_BUGON(z_erofs_page_is_invalidated(page));

		/* recycle all individual short-lived pages */
		if (z_erofs_put_shortlivedpage(pagepool, page))
			continue;

		if (err < 0)
			SetPageError(page);

		z_erofs_onlinepage_endio(page);
	}

	if (pages == z_pagemap_global)
		mutex_unlock(&z_pagemap_global_lock);
	else if (pages != pages_onstack)
		kvfree(pages);

	pcl->nr_pages = 0;
	pcl->bvset.nextpage = NULL;
	pcl->vcnt = 0;

	/* pcluster lock MUST be taken before the following line */
	WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
	mutex_unlock(&pcl->lock);
	return err;
}
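
/*
 * Worked example for the outputsize computation above (illustrative
 * numbers): with PAGE_SIZE = 4096, nr_pages = 3, pageofs_out = 512 and
 * llen = 12000, the output window is 3 * 4096 = 12288 < 512 + 12000, so
 * outputsize is clamped to 12288 - 512 = 11776 and partial decoding is
 * requested; had llen fit entirely, outputsize would simply be llen.
 */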

static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
				     struct page **pagepool)
{
	z_erofs_next_pcluster_t owned = io->head;

	while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
		struct z_erofs_pcluster *pcl;

		/* impossible that 'owned' equals Z_EROFS_PCLUSTER_TAIL */
		DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);

		/* impossible that 'owned' equals NULL */
		DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);

		pcl = container_of(owned, struct z_erofs_pcluster, next);
		owned = READ_ONCE(pcl->next);

		z_erofs_decompress_pcluster(io->sb, pcl, pagepool);
		erofs_workgroup_put(&pcl->obj);
	}
}

static void z_erofs_decompressqueue_work(struct work_struct *work)
{
	struct z_erofs_decompressqueue *bgq =
		container_of(work, struct z_erofs_decompressqueue, u.work);
	struct page *pagepool = NULL;

	DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
	z_erofs_decompress_queue(bgq, &pagepool);

	erofs_release_pages(&pagepool);
	kvfree(bgq);
}

static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
				       bool sync, int bios)
{
	struct erofs_sb_info *const sbi = EROFS_SB(io->sb);

	/* wake up the caller thread for sync decompression */
	if (sync) {
		if (!atomic_add_return(bios, &io->pending_bios))
			complete(&io->u.done);

		return;
	}

	if (atomic_add_return(bios, &io->pending_bios))
		return;
	/* Use workqueue and sync decompression for atomic contexts only */
	if (in_atomic() || irqs_disabled()) {
		queue_work(z_erofs_workqueue, &io->u.work);
		/* enable sync decompression for readahead */
		if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO)
			sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON;
		return;
	}
	z_erofs_decompressqueue_work(&io->u.work);
}
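
/*
 * Protocol note (explanatory): pending_bios starts at 0; the submitter
 * finally adds nr_bios while each completed bio contributes -1, so
 * whichever side brings the sum back to zero last kicks off
 * decompression (or completes io->u.done in the synchronous case).
 */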

static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
					       unsigned int nr,
					       struct page **pagepool,
					       struct address_space *mc)
{
	const pgoff_t index = pcl->obj.index;
	gfp_t gfp = mapping_gfp_mask(mc);
	bool tocache = false;

	struct address_space *mapping;
	struct page *oldpage, *page;

	compressed_page_t t;
	int justfound;

repeat:
	page = READ_ONCE(pcl->compressed_pages[nr]);
	oldpage = page;

	if (!page)
		goto out_allocpage;

	/* process the target tagged pointer */
	t = tagptr_init(compressed_page_t, page);
	justfound = tagptr_unfold_tags(t);
	page = tagptr_unfold_ptr(t);

	/*
	 * preallocated cached pages are used to avoid direct reclaim;
	 * otherwise, the in-place I/O path is taken instead.
	 */
	if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
		WRITE_ONCE(pcl->compressed_pages[nr], page);
		set_page_private(page, 0);
		tocache = true;
		goto out_tocache;
	}
	mapping = READ_ONCE(page->mapping);

	/*
	 * file-backed online pages in the pcluster are all locked and
	 * stable, therefore it is impossible for `mapping' to be NULL.
	 */
	if (mapping && mapping != mc)
		/* ought to be unmanaged pages */
		goto out;

	/* directly return for shortlived page as well */
	if (z_erofs_is_shortlived_page(page))
		goto out;

	lock_page(page);

	/* only true if page reclaim goes wrong, should never happen */
	DBG_BUGON(justfound && PagePrivate(page));

	/* the page is still in the managed cache */
	if (page->mapping == mc) {
		WRITE_ONCE(pcl->compressed_pages[nr], page);

		ClearPageError(page);
		if (!PagePrivate(page)) {
			/*
			 * it's impossible for the page to be
			 * !PagePrivate(page) under the current restriction
			 * if it is already in compressed_pages[].
			 */
			DBG_BUGON(!justfound);

			justfound = 0;
			set_page_private(page, (unsigned long)pcl);
			SetPagePrivate(page);
		}

		/* no need to submit io if it is already up-to-date */
		if (PageUptodate(page)) {
			unlock_page(page);
			page = NULL;
		}
		goto out;
	}

	/*
	 * the managed page has been truncated, it's unsafe to
	 * reuse this one, let's allocate a new cache-managed page.
	 */
	DBG_BUGON(page->mapping);
	DBG_BUGON(!justfound);

	tocache = true;
	unlock_page(page);
	put_page(page);
out_allocpage:
	page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
	if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
		erofs_pagepool_add(pagepool, page);
		cond_resched();
		goto repeat;
	}
out_tocache:
	if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) {
		/* turn it into a short-lived page on failure (1 ref) */
		set_page_private(page, Z_EROFS_SHORTLIVED_PAGE);
		goto out;
	}
	attach_page_private(page, pcl);
	/* drop a refcount added by allocpage (then we have 2 refs here) */
	put_page(page);

out:	/* the only exit (for tracing and debugging) */
	return page;
}
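
/*
 * Summary (explanatory): pickup_page_for_submission() returns either a
 * page that still needs I/O (a locked managed-cache page or a
 * short-lived page) or NULL when the cached copy is already up-to-date,
 * in which case no submission is required for this slot.
 */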

static struct z_erofs_decompressqueue *
jobqueue_init(struct super_block *sb,
	      struct z_erofs_decompressqueue *fgq, bool *fg)
{
	struct z_erofs_decompressqueue *q;

	if (fg && !*fg) {
		q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN);
		if (!q) {
			*fg = true;
			goto fg_out;
		}
		INIT_WORK(&q->u.work, z_erofs_decompressqueue_work);
	} else {
fg_out:
		q = fgq;
		init_completion(&fgq->u.done);
		atomic_set(&fgq->pending_bios, 0);
	}
	q->sb = sb;
	q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
	return q;
}

/* define decompression jobqueue types */
enum {
	JQ_BYPASS,
	JQ_SUBMIT,
	NR_JOBQUEUES,
};

static void *jobqueueset_init(struct super_block *sb,
			      struct z_erofs_decompressqueue *q[],
			      struct z_erofs_decompressqueue *fgq, bool *fg)
{
	/*
	 * if managed cache is enabled, a bypass jobqueue is needed:
	 * no device reads are required for the pclusters in that queue.
	 */
	q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
	q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, fg);

	return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], *fg));
}

static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
				    z_erofs_next_pcluster_t qtail[],
				    z_erofs_next_pcluster_t owned_head)
{
	z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
	z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];

	DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
	if (owned_head == Z_EROFS_PCLUSTER_TAIL)
		owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED;

	WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED);

	WRITE_ONCE(*submit_qtail, owned_head);
	WRITE_ONCE(*bypass_qtail, &pcl->next);

	qtail[JQ_BYPASS] = &pcl->next;
}
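
/*
 * Splice note (explanatory): move_to_bypass_jobqueue() unlinks the
 * pcluster from the submission chain by pointing the submit tail at its
 * successor (owned_head) and appends the pcluster itself to the bypass
 * chain, so both chains remain well-formed after the move.
 */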

static void z_erofs_decompressqueue_endio(struct bio *bio)
{
	tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private);
	struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t);
	blk_status_t err = bio->bi_status;
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct page *page = bvec->bv_page;

		DBG_BUGON(PageUptodate(page));
		DBG_BUGON(z_erofs_page_is_invalidated(page));

		if (err)
			SetPageError(page);

		if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
			if (!err)
				SetPageUptodate(page);
			unlock_page(page);
		}
	}
	z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1);
	bio_put(bio);
}

static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
				 struct page **pagepool,
				 struct z_erofs_decompressqueue *fgq,
				 bool *force_fg)
{
	struct super_block *sb = f->inode->i_sb;
	struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb));
	z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
	struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
	void *bi_private;
	z_erofs_next_pcluster_t owned_head = f->owned_head;
	/* bio is NULL initially, so no need to initialize last_{index,bdev} */
	pgoff_t last_index;
	struct block_device *last_bdev;
	unsigned int nr_bios = 0;
	struct bio *bio = NULL;

	bi_private = jobqueueset_init(sb, q, fgq, force_fg);
	qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
	qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;

	/* by default, all need io submission */
	q[JQ_SUBMIT]->head = owned_head;

	do {
		struct erofs_map_dev mdev;
		struct z_erofs_pcluster *pcl;
		pgoff_t cur, end;
		unsigned int i = 0;
		bool bypass = true;

		/* 'owned_head' can never equal either of the following */
		DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
		DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);

		pcl = container_of(owned_head, struct z_erofs_pcluster, next);

		/* close the main owned chain at first */
		owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
				     Z_EROFS_PCLUSTER_TAIL_CLOSED);
		if (z_erofs_is_inline_pcluster(pcl)) {
			move_to_bypass_jobqueue(pcl, qtail, owned_head);
			continue;
		}

		/* no device id here, thus it will always succeed */
		mdev = (struct erofs_map_dev) {
			.m_pa = blknr_to_addr(pcl->obj.index),
		};
		(void)erofs_map_dev(sb, &mdev);

		cur = erofs_blknr(mdev.m_pa);
		end = cur + pcl->pclusterpages;

		do {
			struct page *page;

			page = pickup_page_for_submission(pcl, i++, pagepool,
							  mc);
			if (!page)
				continue;

			if (bio && (cur != last_index + 1 ||
				    last_bdev != mdev.m_bdev)) {
submit_bio_retry:
				submit_bio(bio);
				bio = NULL;
			}

			if (!bio) {
				bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
						REQ_OP_READ, GFP_NOIO);
				bio->bi_end_io = z_erofs_decompressqueue_endio;

				last_bdev = mdev.m_bdev;
				bio->bi_iter.bi_sector = (sector_t)cur <<
					LOG_SECTORS_PER_BLOCK;
				bio->bi_private = bi_private;
				if (f->readahead)
					bio->bi_opf |= REQ_RAHEAD;
				++nr_bios;
			}

			if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
				goto submit_bio_retry;

			last_index = cur;
			bypass = false;
		} while (++cur < end);

		if (!bypass)
			qtail[JQ_SUBMIT] = &pcl->next;
		else
			move_to_bypass_jobqueue(pcl, qtail, owned_head);
	} while (owned_head != Z_EROFS_PCLUSTER_TAIL);

	if (bio)
		submit_bio(bio);

	/*
	 * although background decompression is preferred, nothing is
	 * pending for submission; don't queue decompression work but
	 * drop the queue directly instead.
	 */
	if (!*force_fg && !nr_bios) {
		kvfree(q[JQ_SUBMIT]);
		return;
	}
	z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios);
}
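
/*
 * Mode note (explanatory): *force_fg selects foreground mode, where the
 * caller waits on fgq's completion instead of deferring decompression to
 * the workqueue; jobqueueset_init() also flips to foreground when the
 * background queue allocation fails.
 */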

static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
			     struct page **pagepool, bool force_fg)
{
	struct z_erofs_decompressqueue io[NR_JOBQUEUES];

	if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
		return;
	z_erofs_submit_queue(f, pagepool, io, &force_fg);

	/* handle bypass queue (no i/o pclusters) immediately */
	z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool);

	if (!force_fg)
		return;

	/* wait until all bios are completed */
	wait_for_completion_io(&io[JQ_SUBMIT].u.done);

	/* handle synchronous decompress queue in the caller context */
	z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
}

/*
 * Since partial uptodate is still unimplemented for now, we have to use
 * approximate readmore strategies as a start.
 */
static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
				      struct readahead_control *rac,
				      erofs_off_t end,
				      struct page **pagepool,
				      bool backmost)
{
	struct inode *inode = f->inode;
	struct erofs_map_blocks *map = &f->map;
	erofs_off_t cur;
	int err;

	if (backmost) {
		map->m_la = end;
		err = z_erofs_map_blocks_iter(inode, map,
					      EROFS_GET_BLOCKS_READMORE);
		if (err)
			return;

		/* expand ra for the trailing edge if readahead */
		if (rac) {
			loff_t newstart = readahead_pos(rac);

			cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
			readahead_expand(rac, newstart, cur - newstart);
			return;
		}
		end = round_up(end, PAGE_SIZE);
	} else {
		end = round_up(map->m_la, PAGE_SIZE);

		if (!map->m_llen)
			return;
	}

	cur = map->m_la + map->m_llen - 1;
	while (cur >= end) {
		pgoff_t index = cur >> PAGE_SHIFT;
		struct page *page;

		page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
		if (page) {
			if (PageUptodate(page)) {
				unlock_page(page);
			} else {
				err = z_erofs_do_read_page(f, page, pagepool);
				if (err)
					erofs_err(inode->i_sb,
						  "readmore error at page %lu @ nid %llu",
						  index, EROFS_I(inode)->nid);
			}
			put_page(page);
		}

		if (cur < PAGE_SIZE)
			break;
		cur = (index << PAGE_SHIFT) - 1;
	}
}

static int z_erofs_read_folio(struct file *file, struct folio *folio)
{
	struct page *page = &folio->page;
	struct inode *const inode = page->mapping->host;
	struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
	struct page *pagepool = NULL;
	int err;

	trace_erofs_readpage(page, false);
	f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;

	z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1,
				  &pagepool, true);
	err = z_erofs_do_read_page(&f, page, &pagepool);
	z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false);

	(void)z_erofs_collector_end(&f);

	/* if some compressed clusters are ready, submit them anyway */
	z_erofs_runqueue(&f, &pagepool,
			 z_erofs_get_sync_decompress_policy(sbi, 0));

	if (err)
		erofs_err(inode->i_sb, "failed to read, err [%d]", err);

	erofs_put_metabuf(&f.map.buf);
	erofs_release_pages(&pagepool);
	return err;
}
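
/*
 * Illustrative sketch of readmore: if a folio read lands mid-extent, the
 * backmost pass extends reading up to the end of the trailing extent (or
 * expands the readahead window), while the second pass walks backwards
 * from map->m_la over the leading extent, so whole pclusters tend to be
 * decompressed in one go rather than re-read page by page.
 */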

static void z_erofs_readahead(struct readahead_control *rac)
{
	struct inode *const inode = rac->mapping->host;
	struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
	struct page *pagepool = NULL, *head = NULL, *page;
	unsigned int nr_pages;

	f.readahead = true;
	f.headoffset = readahead_pos(rac);

	z_erofs_pcluster_readmore(&f, rac, f.headoffset +
				  readahead_length(rac) - 1, &pagepool, true);
	nr_pages = readahead_count(rac);
	trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);

	while ((page = readahead_page(rac))) {
		set_page_private(page, (unsigned long)head);
		head = page;
	}

	while (head) {
		struct page *page = head;
		int err;

		/* traversal in reverse order */
		head = (void *)page_private(page);

		err = z_erofs_do_read_page(&f, page, &pagepool);
		if (err)
			erofs_err(inode->i_sb,
				  "readahead error at page %lu @ nid %llu",
				  page->index, EROFS_I(inode)->nid);
		put_page(page);
	}
	z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false);
	(void)z_erofs_collector_end(&f);

	z_erofs_runqueue(&f, &pagepool,
			 z_erofs_get_sync_decompress_policy(sbi, nr_pages));
	erofs_put_metabuf(&f.map.buf);
	erofs_release_pages(&pagepool);
}

const struct address_space_operations z_erofs_aops = {
	.read_folio = z_erofs_read_folio,
	.readahead = z_erofs_readahead,
};