1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (C) 2018 HUAWEI, Inc.
4 * https://www.huawei.com/
5 * Copyright (C) 2022 Alibaba Cloud
6 */
7 #include "compress.h"
8 #include <linux/psi.h>
9 #include <linux/cpuhotplug.h>
10 #include <trace/events/erofs.h>
11
/* Largest pcluster size expressed in pages. */
#define Z_EROFS_PCLUSTER_MAX_PAGES	(Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
/* Number of bvecs stored inline in a bvset_inline. */
#define Z_EROFS_INLINE_BVECS		2

/*
 * Keep a dedicated type for chain links in case another tagged
 * pointer gets introduced later.
 */
typedef void *z_erofs_next_pcluster_t;
20
/* A page together with the offset/end pair recorded when it was queued. */
struct z_erofs_bvec {
	struct page *page;
	/* signed: the read path stores (page offset - extent start) here */
	int offset;
	/* the read path stores the end position within the page here */
	unsigned int end;
};
26
27 #define __Z_EROFS_BVSET(name, total) \
28 struct name { \
29 /* point to the next page which contains the following bvecs */ \
30 struct page *nextpage; \
31 struct z_erofs_bvec bvec[total]; \
32 }
33 __Z_EROFS_BVSET(z_erofs_bvset,);
34 __Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
35
36 /*
37 * Structure fields follow one of the following exclusion rules.
38 *
39 * I: Modifiable by initialization/destruction paths and read-only
40 * for everyone else;
41 *
42 * L: Field should be protected by the pcluster lock;
43 *
44 * A: Field should be accessed / updated in atomic for parallelized code.
45 */
46 struct z_erofs_pcluster {
47 struct erofs_workgroup obj;
48 struct mutex lock;
49
50 /* A: point to next chained pcluster or TAILs */
51 z_erofs_next_pcluster_t next;
52
53 /* L: the maximum decompression size of this round */
54 unsigned int length;
55
56 /* L: total number of bvecs */
57 unsigned int vcnt;
58
59 /* I: page offset of start position of decompression */
60 unsigned short pageofs_out;
61
62 /* I: page offset of inline compressed data */
63 unsigned short pageofs_in;
64
65 union {
66 /* L: inline a certain number of bvec for bootstrap */
67 struct z_erofs_bvset_inline bvset;
68
69 /* I: can be used to free the pcluster by RCU. */
70 struct rcu_head rcu;
71 };
72
73 union {
74 /* I: physical cluster size in pages */
75 unsigned short pclusterpages;
76
77 /* I: tailpacking inline compressed size */
78 unsigned short tailpacking_size;
79 };
80
81 /* I: compression algorithm format */
82 unsigned char algorithmformat;
83
84 /* L: whether partial decompression or not */
85 bool partial;
86
87 /* L: indicate several pageofs_outs or not */
88 bool multibases;
89
90 /* A: compressed bvecs (can be cached or inplaced pages) */
91 struct z_erofs_bvec compressed_bvecs[];
92 };
93
/* marker stored in a chain link: the end of a chain of pclusters */
#define Z_EROFS_PCLUSTER_TAIL		((void *) 0x700 + POISON_POINTER_DELTA)
/* marker stored in a chain link: the pcluster is not on any chain */
#define Z_EROFS_PCLUSTER_NIL		(NULL)
97
98 struct z_erofs_decompressqueue {
99 struct super_block *sb;
100 atomic_t pending_bios;
101 z_erofs_next_pcluster_t head;
102
103 union {
104 struct completion done;
105 struct work_struct work;
106 struct kthread_work kthread_work;
107 } u;
108 bool eio, sync;
109 };
110
z_erofs_is_inline_pcluster(struct z_erofs_pcluster * pcl)111 static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
112 {
113 return !pcl->obj.index;
114 }
115
z_erofs_pclusterpages(struct z_erofs_pcluster * pcl)116 static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
117 {
118 if (z_erofs_is_inline_pcluster(pcl))
119 return 1;
120 return pcl->pclusterpages;
121 }
122
/*
 * Layout of the online page state kept in page->private:
 * bit 30: I/O error occurred on this page
 * bit 0 - 29: remaining parts to complete this page
 */
#define Z_EROFS_PAGE_EIO		(1 << 30)
128
z_erofs_onlinepage_init(struct page * page)129 static inline void z_erofs_onlinepage_init(struct page *page)
130 {
131 union {
132 atomic_t o;
133 unsigned long v;
134 } u = { .o = ATOMIC_INIT(1) };
135
136 set_page_private(page, u.v);
137 smp_wmb();
138 SetPagePrivate(page);
139 }
140
z_erofs_onlinepage_split(struct page * page)141 static inline void z_erofs_onlinepage_split(struct page *page)
142 {
143 atomic_inc((atomic_t *)&page->private);
144 }
145
z_erofs_onlinepage_endio(struct page * page,int err)146 static void z_erofs_onlinepage_endio(struct page *page, int err)
147 {
148 int orig, v;
149
150 DBG_BUGON(!PagePrivate(page));
151
152 do {
153 orig = atomic_read((atomic_t *)&page->private);
154 v = (orig - 1) | (err ? Z_EROFS_PAGE_EIO : 0);
155 } while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig);
156
157 if (!(v & ~Z_EROFS_PAGE_EIO)) {
158 set_page_private(page, 0);
159 ClearPagePrivate(page);
160 if (!(v & Z_EROFS_PAGE_EIO))
161 SetPageUptodate(page);
162 unlock_page(page);
163 }
164 }
165
166 #define Z_EROFS_ONSTACK_PAGES 32
167
168 /*
169 * since pclustersize is variable for big pcluster feature, introduce slab
170 * pools implementation for different pcluster sizes.
171 */
172 struct z_erofs_pcluster_slab {
173 struct kmem_cache *slab;
174 unsigned int maxpages;
175 char name[48];
176 };
177
178 #define _PCLP(n) { .maxpages = n }
179
180 static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
181 _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
182 _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
183 };
184
/* Cursor over a chain of bvec sets. */
struct z_erofs_bvec_iter {
	/* page backing the current set; NULL while on the inline set */
	struct page *bvpage;
	/* current set (kmapped whenever bvpage is non-NULL) */
	struct z_erofs_bvset *bvset;
	/* capacity of the current set */
	unsigned int nr;
	/* next index within the current set */
	unsigned int cur;
};
190
z_erofs_bvec_iter_end(struct z_erofs_bvec_iter * iter)191 static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter)
192 {
193 if (iter->bvpage)
194 kunmap_local(iter->bvset);
195 return iter->bvpage;
196 }
197
z_erofs_bvset_flip(struct z_erofs_bvec_iter * iter)198 static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter)
199 {
200 unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec;
201 /* have to access nextpage in advance, otherwise it will be unmapped */
202 struct page *nextpage = iter->bvset->nextpage;
203 struct page *oldpage;
204
205 DBG_BUGON(!nextpage);
206 oldpage = z_erofs_bvec_iter_end(iter);
207 iter->bvpage = nextpage;
208 iter->bvset = kmap_local_page(nextpage);
209 iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec);
210 iter->cur = 0;
211 return oldpage;
212 }
213
z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter * iter,struct z_erofs_bvset_inline * bvset,unsigned int bootstrap_nr,unsigned int cur)214 static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter,
215 struct z_erofs_bvset_inline *bvset,
216 unsigned int bootstrap_nr,
217 unsigned int cur)
218 {
219 *iter = (struct z_erofs_bvec_iter) {
220 .nr = bootstrap_nr,
221 .bvset = (struct z_erofs_bvset *)bvset,
222 };
223
224 while (cur > iter->nr) {
225 cur -= iter->nr;
226 z_erofs_bvset_flip(iter);
227 }
228 iter->cur = cur;
229 }
230
z_erofs_bvec_enqueue(struct z_erofs_bvec_iter * iter,struct z_erofs_bvec * bvec,struct page ** candidate_bvpage,struct page ** pagepool)231 static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
232 struct z_erofs_bvec *bvec,
233 struct page **candidate_bvpage,
234 struct page **pagepool)
235 {
236 if (iter->cur >= iter->nr) {
237 struct page *nextpage = *candidate_bvpage;
238
239 if (!nextpage) {
240 nextpage = erofs_allocpage(pagepool, GFP_NOFS);
241 if (!nextpage)
242 return -ENOMEM;
243 set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE);
244 }
245 DBG_BUGON(iter->bvset->nextpage);
246 iter->bvset->nextpage = nextpage;
247 z_erofs_bvset_flip(iter);
248
249 iter->bvset->nextpage = NULL;
250 *candidate_bvpage = NULL;
251 }
252 iter->bvset->bvec[iter->cur++] = *bvec;
253 return 0;
254 }
255
z_erofs_bvec_dequeue(struct z_erofs_bvec_iter * iter,struct z_erofs_bvec * bvec,struct page ** old_bvpage)256 static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter,
257 struct z_erofs_bvec *bvec,
258 struct page **old_bvpage)
259 {
260 if (iter->cur == iter->nr)
261 *old_bvpage = z_erofs_bvset_flip(iter);
262 else
263 *old_bvpage = NULL;
264 *bvec = iter->bvset->bvec[iter->cur++];
265 }
266
z_erofs_destroy_pcluster_pool(void)267 static void z_erofs_destroy_pcluster_pool(void)
268 {
269 int i;
270
271 for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
272 if (!pcluster_pool[i].slab)
273 continue;
274 kmem_cache_destroy(pcluster_pool[i].slab);
275 pcluster_pool[i].slab = NULL;
276 }
277 }
278
z_erofs_create_pcluster_pool(void)279 static int z_erofs_create_pcluster_pool(void)
280 {
281 struct z_erofs_pcluster_slab *pcs;
282 struct z_erofs_pcluster *a;
283 unsigned int size;
284
285 for (pcs = pcluster_pool;
286 pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
287 size = struct_size(a, compressed_bvecs, pcs->maxpages);
288
289 sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages);
290 pcs->slab = kmem_cache_create(pcs->name, size, 0,
291 SLAB_RECLAIM_ACCOUNT, NULL);
292 if (pcs->slab)
293 continue;
294
295 z_erofs_destroy_pcluster_pool();
296 return -ENOMEM;
297 }
298 return 0;
299 }
300
z_erofs_alloc_pcluster(unsigned int nrpages)301 static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages)
302 {
303 int i;
304
305 for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
306 struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
307 struct z_erofs_pcluster *pcl;
308
309 if (nrpages > pcs->maxpages)
310 continue;
311
312 pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS);
313 if (!pcl)
314 return ERR_PTR(-ENOMEM);
315 pcl->pclusterpages = nrpages;
316 return pcl;
317 }
318 return ERR_PTR(-EINVAL);
319 }
320
z_erofs_free_pcluster(struct z_erofs_pcluster * pcl)321 static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
322 {
323 unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
324 int i;
325
326 for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
327 struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
328
329 if (pclusterpages > pcs->maxpages)
330 continue;
331
332 kmem_cache_free(pcs->slab, pcl);
333 return;
334 }
335 DBG_BUGON(1);
336 }
337
338 static struct workqueue_struct *z_erofs_workqueue __read_mostly;
339
340 #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
341 static struct kthread_worker __rcu **z_erofs_pcpu_workers;
342
erofs_destroy_percpu_workers(void)343 static void erofs_destroy_percpu_workers(void)
344 {
345 struct kthread_worker *worker;
346 unsigned int cpu;
347
348 for_each_possible_cpu(cpu) {
349 worker = rcu_dereference_protected(
350 z_erofs_pcpu_workers[cpu], 1);
351 rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL);
352 if (worker)
353 kthread_destroy_worker(worker);
354 }
355 kfree(z_erofs_pcpu_workers);
356 }
357
erofs_init_percpu_worker(int cpu)358 static struct kthread_worker *erofs_init_percpu_worker(int cpu)
359 {
360 struct kthread_worker *worker =
361 kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu);
362
363 if (IS_ERR(worker))
364 return worker;
365 if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI))
366 sched_set_fifo_low(worker->task);
367 return worker;
368 }
369
erofs_init_percpu_workers(void)370 static int erofs_init_percpu_workers(void)
371 {
372 struct kthread_worker *worker;
373 unsigned int cpu;
374
375 z_erofs_pcpu_workers = kcalloc(num_possible_cpus(),
376 sizeof(struct kthread_worker *), GFP_ATOMIC);
377 if (!z_erofs_pcpu_workers)
378 return -ENOMEM;
379
380 for_each_online_cpu(cpu) { /* could miss cpu{off,on}line? */
381 worker = erofs_init_percpu_worker(cpu);
382 if (!IS_ERR(worker))
383 rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker);
384 }
385 return 0;
386 }
387 #else
/* no per-CPU kthread workers configured: nothing to set up or tear down */
static inline void erofs_destroy_percpu_workers(void)
{
}

static inline int erofs_init_percpu_workers(void)
{
	return 0;
}
390 #endif
391
392 #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD)
393 static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock);
394 static enum cpuhp_state erofs_cpuhp_state;
395
erofs_cpu_online(unsigned int cpu)396 static int erofs_cpu_online(unsigned int cpu)
397 {
398 struct kthread_worker *worker, *old;
399
400 worker = erofs_init_percpu_worker(cpu);
401 if (IS_ERR(worker))
402 return PTR_ERR(worker);
403
404 spin_lock(&z_erofs_pcpu_worker_lock);
405 old = rcu_dereference_protected(z_erofs_pcpu_workers[cpu],
406 lockdep_is_held(&z_erofs_pcpu_worker_lock));
407 if (!old)
408 rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker);
409 spin_unlock(&z_erofs_pcpu_worker_lock);
410 if (old)
411 kthread_destroy_worker(worker);
412 return 0;
413 }
414
erofs_cpu_offline(unsigned int cpu)415 static int erofs_cpu_offline(unsigned int cpu)
416 {
417 struct kthread_worker *worker;
418
419 spin_lock(&z_erofs_pcpu_worker_lock);
420 worker = rcu_dereference_protected(z_erofs_pcpu_workers[cpu],
421 lockdep_is_held(&z_erofs_pcpu_worker_lock));
422 rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL);
423 spin_unlock(&z_erofs_pcpu_worker_lock);
424
425 synchronize_rcu();
426 if (worker)
427 kthread_destroy_worker(worker);
428 return 0;
429 }
430
erofs_cpu_hotplug_init(void)431 static int erofs_cpu_hotplug_init(void)
432 {
433 int state;
434
435 state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
436 "fs/erofs:online", erofs_cpu_online, erofs_cpu_offline);
437 if (state < 0)
438 return state;
439
440 erofs_cpuhp_state = state;
441 return 0;
442 }
443
erofs_cpu_hotplug_destroy(void)444 static void erofs_cpu_hotplug_destroy(void)
445 {
446 if (erofs_cpuhp_state)
447 cpuhp_remove_state_nocalls(erofs_cpuhp_state);
448 }
449 #else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */
/* no hotplug handling in this configuration */
static inline int erofs_cpu_hotplug_init(void)
{
	return 0;
}

static inline void erofs_cpu_hotplug_destroy(void)
{
}
452 #endif
453
z_erofs_exit_zip_subsystem(void)454 void z_erofs_exit_zip_subsystem(void)
455 {
456 erofs_cpu_hotplug_destroy();
457 erofs_destroy_percpu_workers();
458 destroy_workqueue(z_erofs_workqueue);
459 z_erofs_destroy_pcluster_pool();
460 }
461
z_erofs_init_zip_subsystem(void)462 int __init z_erofs_init_zip_subsystem(void)
463 {
464 int err = z_erofs_create_pcluster_pool();
465
466 if (err)
467 goto out_error_pcluster_pool;
468
469 z_erofs_workqueue = alloc_workqueue("erofs_worker",
470 WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus());
471 if (!z_erofs_workqueue) {
472 err = -ENOMEM;
473 goto out_error_workqueue_init;
474 }
475
476 err = erofs_init_percpu_workers();
477 if (err)
478 goto out_error_pcpu_worker;
479
480 err = erofs_cpu_hotplug_init();
481 if (err < 0)
482 goto out_error_cpuhp_init;
483 return err;
484
485 out_error_cpuhp_init:
486 erofs_destroy_percpu_workers();
487 out_error_pcpu_worker:
488 destroy_workqueue(z_erofs_workqueue);
489 out_error_workqueue_init:
490 z_erofs_destroy_pcluster_pool();
491 out_error_pcluster_pool:
492 return err;
493 }
494
/*
 * How the frontend relates to the pcluster it is working on. The order of
 * the enumerators matters: the code compares modes with < and >=.
 */
enum z_erofs_pclustermode {
	/* the pcluster already belongs to an ongoing chain */
	Z_EROFS_PCLUSTER_INFLIGHT,
	/*
	 * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it
	 * could be dispatched into bypass queue later due to uptodated managed
	 * pages. All related online pages cannot be reused for inplace I/O (or
	 * bvpage) since it can be directly decoded without I/O submission.
	 */
	Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE,
	/*
	 * The pcluster was just linked to a decompression chain by us. It can
	 * also be linked with the remaining pclusters, which means if the
	 * processing page is the tail page of a pcluster, this pcluster can
	 * safely use the whole page (since the previous pcluster is within the
	 * same chain) for in-place I/O, as illustrated below:
	 *  ___________________________________________________
	 * |  tail (partial) page  |    head (partial) page    |
	 * |  (of the current pcl) |   (of the previous pcl)   |
	 * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____|
	 *
	 * [  (*) the page above can be used as inplace I/O.   ]
	 */
	Z_EROFS_PCLUSTER_FOLLOWED,
};
519
520 struct z_erofs_decompress_frontend {
521 struct inode *const inode;
522 struct erofs_map_blocks map;
523 struct z_erofs_bvec_iter biter;
524
525 struct page *pagepool;
526 struct page *candidate_bvpage;
527 struct z_erofs_pcluster *pcl;
528 z_erofs_next_pcluster_t owned_head;
529 enum z_erofs_pclustermode mode;
530
531 erofs_off_t headoffset;
532
533 /* a pointer used to pick up inplace I/O pages */
534 unsigned int icur;
535 };
536
/* initializer: empty chain (TAIL) for inode @__i, starting in FOLLOWED mode */
#define DECOMPRESS_FRONTEND_INIT(__i) { \
	.inode = __i, \
	.owned_head = Z_EROFS_PCLUSTER_TAIL, \
	.mode = Z_EROFS_PCLUSTER_FOLLOWED }
540
z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend * fe)541 static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
542 {
543 unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy;
544
545 if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
546 return false;
547
548 if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED))
549 return true;
550
551 if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
552 fe->map.m_la < fe->headoffset)
553 return true;
554
555 return false;
556 }
557
z_erofs_bind_cache(struct z_erofs_decompress_frontend * fe)558 static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
559 {
560 struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
561 struct z_erofs_pcluster *pcl = fe->pcl;
562 bool shouldalloc = z_erofs_should_alloc_cache(fe);
563 bool standalone = true;
564 /*
565 * optimistic allocation without direct reclaim since inplace I/O
566 * can be used if low memory otherwise.
567 */
568 gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
569 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
570 unsigned int i;
571
572 if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
573 return;
574
575 for (i = 0; i < pcl->pclusterpages; ++i) {
576 struct page *page;
577 void *t; /* mark pages just found for debugging */
578 struct page *newpage = NULL;
579
580 /* the compressed page was loaded before */
581 if (READ_ONCE(pcl->compressed_bvecs[i].page))
582 continue;
583
584 page = find_get_page(mc, pcl->obj.index + i);
585
586 if (page) {
587 t = (void *)((unsigned long)page | 1);
588 } else {
589 /* I/O is needed, no possible to decompress directly */
590 standalone = false;
591 if (!shouldalloc)
592 continue;
593
594 /*
595 * try to use cached I/O if page allocation
596 * succeeds or fallback to in-place I/O instead
597 * to avoid any direct reclaim.
598 */
599 newpage = erofs_allocpage(&fe->pagepool, gfp);
600 if (!newpage)
601 continue;
602 set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
603 t = (void *)((unsigned long)newpage | 1);
604 }
605
606 if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
607 continue;
608
609 if (page)
610 put_page(page);
611 else if (newpage)
612 erofs_pagepool_add(&fe->pagepool, newpage);
613 }
614
615 /*
616 * don't do inplace I/O if all compressed pages are available in
617 * managed cache since it can be moved to the bypass queue instead.
618 */
619 if (standalone)
620 fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
621 }
622
623 /* called by erofs_shrinker to get rid of all compressed_pages */
erofs_try_to_free_all_cached_pages(struct erofs_sb_info * sbi,struct erofs_workgroup * grp)624 int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
625 struct erofs_workgroup *grp)
626 {
627 struct z_erofs_pcluster *const pcl =
628 container_of(grp, struct z_erofs_pcluster, obj);
629 int i;
630
631 DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
632 /*
633 * refcount of workgroup is now freezed as 0,
634 * therefore no need to worry about available decompression users.
635 */
636 for (i = 0; i < pcl->pclusterpages; ++i) {
637 struct page *page = pcl->compressed_bvecs[i].page;
638
639 if (!page)
640 continue;
641
642 /* block other users from reclaiming or migrating the page */
643 if (!trylock_page(page))
644 return -EBUSY;
645
646 if (!erofs_page_is_managed(sbi, page))
647 continue;
648
649 /* barrier is implied in the following 'unlock_page' */
650 WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
651 detach_page_private(page);
652 unlock_page(page);
653 }
654 return 0;
655 }
656
z_erofs_cache_release_folio(struct folio * folio,gfp_t gfp)657 static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
658 {
659 struct z_erofs_pcluster *pcl = folio_get_private(folio);
660 bool ret;
661 int i;
662
663 if (!folio_test_private(folio))
664 return true;
665
666 ret = false;
667 spin_lock(&pcl->obj.lockref.lock);
668 if (pcl->obj.lockref.count > 0)
669 goto out;
670
671 DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
672 for (i = 0; i < pcl->pclusterpages; ++i) {
673 if (pcl->compressed_bvecs[i].page == &folio->page) {
674 WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
675 ret = true;
676 break;
677 }
678 }
679 if (ret)
680 folio_detach_private(folio);
681 out:
682 spin_unlock(&pcl->obj.lockref.lock);
683 return ret;
684 }
685
686 /*
687 * It will be called only on inode eviction. In case that there are still some
688 * decompression requests in progress, wait with rescheduling for a bit here.
689 * An extra lock could be introduced instead but it seems unnecessary.
690 */
z_erofs_cache_invalidate_folio(struct folio * folio,size_t offset,size_t length)691 static void z_erofs_cache_invalidate_folio(struct folio *folio,
692 size_t offset, size_t length)
693 {
694 const size_t stop = length + offset;
695
696 /* Check for potential overflow in debug mode */
697 DBG_BUGON(stop > folio_size(folio) || stop < length);
698
699 if (offset == 0 && stop == folio_size(folio))
700 while (!z_erofs_cache_release_folio(folio, GFP_NOFS))
701 cond_resched();
702 }
703
704 static const struct address_space_operations z_erofs_cache_aops = {
705 .release_folio = z_erofs_cache_release_folio,
706 .invalidate_folio = z_erofs_cache_invalidate_folio,
707 };
708
erofs_init_managed_cache(struct super_block * sb)709 int erofs_init_managed_cache(struct super_block *sb)
710 {
711 struct inode *const inode = new_inode(sb);
712
713 if (!inode)
714 return -ENOMEM;
715
716 set_nlink(inode, 1);
717 inode->i_size = OFFSET_MAX;
718 inode->i_mapping->a_ops = &z_erofs_cache_aops;
719 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
720 EROFS_SB(sb)->managed_cache = inode;
721 return 0;
722 }
723
z_erofs_try_inplace_io(struct z_erofs_decompress_frontend * fe,struct z_erofs_bvec * bvec)724 static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
725 struct z_erofs_bvec *bvec)
726 {
727 struct z_erofs_pcluster *const pcl = fe->pcl;
728
729 while (fe->icur > 0) {
730 if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
731 NULL, bvec->page)) {
732 pcl->compressed_bvecs[fe->icur] = *bvec;
733 return true;
734 }
735 }
736 return false;
737 }
738
739 /* callers must be with pcluster lock held */
z_erofs_attach_page(struct z_erofs_decompress_frontend * fe,struct z_erofs_bvec * bvec,bool exclusive)740 static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
741 struct z_erofs_bvec *bvec, bool exclusive)
742 {
743 int ret;
744
745 if (exclusive) {
746 /* give priority for inplaceio to use file pages first */
747 if (z_erofs_try_inplace_io(fe, bvec))
748 return 0;
749 /* otherwise, check if it can be used as a bvpage */
750 if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
751 !fe->candidate_bvpage)
752 fe->candidate_bvpage = bvec->page;
753 }
754 ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage,
755 &fe->pagepool);
756 fe->pcl->vcnt += (ret >= 0);
757 return ret;
758 }
759
z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend * f)760 static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
761 {
762 struct z_erofs_pcluster *pcl = f->pcl;
763 z_erofs_next_pcluster_t *owned_head = &f->owned_head;
764
765 /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */
766 if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
767 *owned_head) == Z_EROFS_PCLUSTER_NIL) {
768 *owned_head = &pcl->next;
769 /* so we can attach this pcluster to our submission chain. */
770 f->mode = Z_EROFS_PCLUSTER_FOLLOWED;
771 return;
772 }
773
774 /* type 2, it belongs to an ongoing chain */
775 f->mode = Z_EROFS_PCLUSTER_INFLIGHT;
776 }
777
z_erofs_register_pcluster(struct z_erofs_decompress_frontend * fe)778 static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
779 {
780 struct erofs_map_blocks *map = &fe->map;
781 bool ztailpacking = map->m_flags & EROFS_MAP_META;
782 struct z_erofs_pcluster *pcl;
783 struct erofs_workgroup *grp;
784 int err;
785
786 if (!(map->m_flags & EROFS_MAP_ENCODED) ||
787 (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) {
788 DBG_BUGON(1);
789 return -EFSCORRUPTED;
790 }
791
792 /* no available pcluster, let's allocate one */
793 pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 :
794 map->m_plen >> PAGE_SHIFT);
795 if (IS_ERR(pcl))
796 return PTR_ERR(pcl);
797
798 spin_lock_init(&pcl->obj.lockref.lock);
799 pcl->obj.lockref.count = 1; /* one ref for this request */
800 pcl->algorithmformat = map->m_algorithmformat;
801 pcl->length = 0;
802 pcl->partial = true;
803
804 /* new pclusters should be claimed as type 1, primary and followed */
805 pcl->next = fe->owned_head;
806 pcl->pageofs_out = map->m_la & ~PAGE_MASK;
807 fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
808
809 /*
810 * lock all primary followed works before visible to others
811 * and mutex_trylock *never* fails for a new pcluster.
812 */
813 mutex_init(&pcl->lock);
814 DBG_BUGON(!mutex_trylock(&pcl->lock));
815
816 if (ztailpacking) {
817 pcl->obj.index = 0; /* which indicates ztailpacking */
818 pcl->tailpacking_size = map->m_plen;
819 } else {
820 pcl->obj.index = map->m_pa >> PAGE_SHIFT;
821
822 grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
823 if (IS_ERR(grp)) {
824 err = PTR_ERR(grp);
825 goto err_out;
826 }
827
828 if (grp != &pcl->obj) {
829 fe->pcl = container_of(grp,
830 struct z_erofs_pcluster, obj);
831 err = -EEXIST;
832 goto err_out;
833 }
834 }
835 fe->owned_head = &pcl->next;
836 fe->pcl = pcl;
837 return 0;
838
839 err_out:
840 mutex_unlock(&pcl->lock);
841 z_erofs_free_pcluster(pcl);
842 return err;
843 }
844
z_erofs_pcluster_begin(struct z_erofs_decompress_frontend * fe)845 static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
846 {
847 struct erofs_map_blocks *map = &fe->map;
848 struct super_block *sb = fe->inode->i_sb;
849 erofs_blk_t blknr = erofs_blknr(sb, map->m_pa);
850 struct erofs_workgroup *grp = NULL;
851 int ret;
852
853 DBG_BUGON(fe->pcl);
854
855 /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
856 DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
857
858 if (!(map->m_flags & EROFS_MAP_META)) {
859 grp = erofs_find_workgroup(sb, blknr);
860 } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
861 DBG_BUGON(1);
862 return -EFSCORRUPTED;
863 }
864
865 if (grp) {
866 fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
867 ret = -EEXIST;
868 } else {
869 ret = z_erofs_register_pcluster(fe);
870 }
871
872 if (ret == -EEXIST) {
873 mutex_lock(&fe->pcl->lock);
874 z_erofs_try_to_claim_pcluster(fe);
875 } else if (ret) {
876 return ret;
877 }
878
879 z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
880 Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
881 if (!z_erofs_is_inline_pcluster(fe->pcl)) {
882 /* bind cache first when cached decompression is preferred */
883 z_erofs_bind_cache(fe);
884 } else {
885 void *mptr;
886
887 mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP);
888 if (IS_ERR(mptr)) {
889 ret = PTR_ERR(mptr);
890 erofs_err(sb, "failed to get inline data %d", ret);
891 return ret;
892 }
893 get_page(map->buf.page);
894 WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page);
895 fe->pcl->pageofs_in = map->m_pa & ~PAGE_MASK;
896 fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
897 }
898 /* file-backed inplace I/O pages are traversed in reverse order */
899 fe->icur = z_erofs_pclusterpages(fe->pcl);
900 return 0;
901 }
902
903 /*
904 * keep in mind that no referenced pclusters will be freed
905 * only after a RCU grace period.
906 */
z_erofs_rcu_callback(struct rcu_head * head)907 static void z_erofs_rcu_callback(struct rcu_head *head)
908 {
909 z_erofs_free_pcluster(container_of(head,
910 struct z_erofs_pcluster, rcu));
911 }
912
erofs_workgroup_free_rcu(struct erofs_workgroup * grp)913 void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
914 {
915 struct z_erofs_pcluster *const pcl =
916 container_of(grp, struct z_erofs_pcluster, obj);
917
918 call_rcu(&pcl->rcu, z_erofs_rcu_callback);
919 }
920
z_erofs_pcluster_end(struct z_erofs_decompress_frontend * fe)921 static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
922 {
923 struct z_erofs_pcluster *pcl = fe->pcl;
924
925 if (!pcl)
926 return;
927
928 z_erofs_bvec_iter_end(&fe->biter);
929 mutex_unlock(&pcl->lock);
930
931 if (fe->candidate_bvpage)
932 fe->candidate_bvpage = NULL;
933
934 /*
935 * if all pending pages are added, don't hold its reference
936 * any longer if the pcluster isn't hosted by ourselves.
937 */
938 if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
939 erofs_workgroup_put(&pcl->obj);
940
941 fe->pcl = NULL;
942 }
943
/*
 * Copy fragment data from the packed inode, starting at byte position
 * @pos, into @page over the range [@cur, @end). The copy proceeds block
 * by block.
 *
 * Returns 0 on success, -EFSCORRUPTED if there is no packed inode, or the
 * error from reading a block.
 */
static int z_erofs_read_fragment(struct super_block *sb, struct page *page,
			unsigned int cur, unsigned int end, erofs_off_t pos)
{
	struct inode *packed_inode = EROFS_SB(sb)->packed_inode;
	struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
	unsigned int cnt;
	int err = 0;
	u8 *src;

	if (!packed_inode)
		return -EFSCORRUPTED;

	buf.inode = packed_inode;
	while (cur < end) {
		/* never read past the end of the current block */
		cnt = min_t(unsigned int, end - cur,
			    sb->s_blocksize - erofs_blkoff(sb, pos));
		src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP);
		if (IS_ERR(src)) {
			err = PTR_ERR(src);
			break;
		}
		memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt);
		cur += cnt;
		pos += cnt;
	}
	erofs_put_metabuf(&buf);
	return err;
}
969
z_erofs_do_read_page(struct z_erofs_decompress_frontend * fe,struct page * page)970 static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
971 struct page *page)
972 {
973 struct inode *const inode = fe->inode;
974 struct erofs_map_blocks *const map = &fe->map;
975 const loff_t offset = page_offset(page);
976 bool tight = true, exclusive;
977 unsigned int cur, end, len, split;
978 int err = 0;
979
980 z_erofs_onlinepage_init(page);
981
982 split = 0;
983 end = PAGE_SIZE;
984 repeat:
985 if (offset + end - 1 < map->m_la ||
986 offset + end - 1 >= map->m_la + map->m_llen) {
987 z_erofs_pcluster_end(fe);
988 map->m_la = offset + end - 1;
989 map->m_llen = 0;
990 err = z_erofs_map_blocks_iter(inode, map, 0);
991 if (err)
992 goto out;
993 }
994
995 cur = offset > map->m_la ? 0 : map->m_la - offset;
996 /* bump split parts first to avoid several separate cases */
997 ++split;
998
999 if (!(map->m_flags & EROFS_MAP_MAPPED)) {
1000 zero_user_segment(page, cur, end);
1001 tight = false;
1002 goto next_part;
1003 }
1004
1005 if (map->m_flags & EROFS_MAP_FRAGMENT) {
1006 erofs_off_t fpos = offset + cur - map->m_la;
1007
1008 len = min_t(unsigned int, map->m_llen - fpos, end - cur);
1009 err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len,
1010 EROFS_I(inode)->z_fragmentoff + fpos);
1011 if (err)
1012 goto out;
1013 tight = false;
1014 goto next_part;
1015 }
1016
1017 if (!fe->pcl) {
1018 err = z_erofs_pcluster_begin(fe);
1019 if (err)
1020 goto out;
1021 }
1022
1023 /*
1024 * Ensure the current partial page belongs to this submit chain rather
1025 * than other concurrent submit chains or the noio(bypass) chain since
1026 * those chains are handled asynchronously thus the page cannot be used
1027 * for inplace I/O or bvpage (should be processed in a strict order.)
1028 */
1029 tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
1030 exclusive = (!cur && ((split <= 1) || tight));
1031 if (cur)
1032 tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
1033
1034 err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) {
1035 .page = page,
1036 .offset = offset - map->m_la,
1037 .end = end,
1038 }), exclusive);
1039 if (err)
1040 goto out;
1041
1042 z_erofs_onlinepage_split(page);
1043 if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
1044 fe->pcl->multibases = true;
1045 if (fe->pcl->length < offset + end - map->m_la) {
1046 fe->pcl->length = offset + end - map->m_la;
1047 fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
1048 }
1049 if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
1050 !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
1051 fe->pcl->length == map->m_llen)
1052 fe->pcl->partial = false;
1053 next_part:
1054 /* shorten the remaining extent to update progress */
1055 map->m_llen = offset + cur - map->m_la;
1056 map->m_flags &= ~EROFS_MAP_FULL_MAPPED;
1057
1058 end = cur;
1059 if (end > 0)
1060 goto repeat;
1061
1062 out:
1063 z_erofs_onlinepage_endio(page, err);
1064 return err;
1065 }
1066
z_erofs_is_sync_decompress(struct erofs_sb_info * sbi,unsigned int readahead_pages)1067 static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi,
1068 unsigned int readahead_pages)
1069 {
1070 /* auto: enable for read_folio, disable for readahead */
1071 if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) &&
1072 !readahead_pages)
1073 return true;
1074
1075 if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) &&
1076 (readahead_pages <= sbi->opt.max_sync_decompress_pages))
1077 return true;
1078
1079 return false;
1080 }
1081
z_erofs_page_is_invalidated(struct page * page)1082 static bool z_erofs_page_is_invalidated(struct page *page)
1083 {
1084 return !page->mapping && !z_erofs_is_shortlived_page(page);
1085 }
1086
1087 struct z_erofs_decompress_backend {
1088 struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES];
1089 struct super_block *sb;
1090 struct z_erofs_pcluster *pcl;
1091
1092 /* pages with the longest decompressed length for deduplication */
1093 struct page **decompressed_pages;
1094 /* pages to keep the compressed data */
1095 struct page **compressed_pages;
1096
1097 struct list_head decompressed_secondary_bvecs;
1098 struct page **pagepool;
1099 unsigned int onstack_used, nr_pages;
1100 };
1101
1102 struct z_erofs_bvec_item {
1103 struct z_erofs_bvec bvec;
1104 struct list_head list;
1105 };
1106
z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend * be,struct z_erofs_bvec * bvec)1107 static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be,
1108 struct z_erofs_bvec *bvec)
1109 {
1110 struct z_erofs_bvec_item *item;
1111 unsigned int pgnr;
1112
1113 if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK) &&
1114 (bvec->end == PAGE_SIZE ||
1115 bvec->offset + bvec->end == be->pcl->length)) {
1116 pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT;
1117 DBG_BUGON(pgnr >= be->nr_pages);
1118 if (!be->decompressed_pages[pgnr]) {
1119 be->decompressed_pages[pgnr] = bvec->page;
1120 return;
1121 }
1122 }
1123
1124 /* (cold path) one pcluster is requested multiple times */
1125 item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL);
1126 item->bvec = *bvec;
1127 list_add(&item->list, &be->decompressed_secondary_bvecs);
1128 }
1129
z_erofs_fill_other_copies(struct z_erofs_decompress_backend * be,int err)1130 static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
1131 int err)
1132 {
1133 unsigned int off0 = be->pcl->pageofs_out;
1134 struct list_head *p, *n;
1135
1136 list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) {
1137 struct z_erofs_bvec_item *bvi;
1138 unsigned int end, cur;
1139 void *dst, *src;
1140
1141 bvi = container_of(p, struct z_erofs_bvec_item, list);
1142 cur = bvi->bvec.offset < 0 ? -bvi->bvec.offset : 0;
1143 end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset,
1144 bvi->bvec.end);
1145 dst = kmap_local_page(bvi->bvec.page);
1146 while (cur < end) {
1147 unsigned int pgnr, scur, len;
1148
1149 pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT;
1150 DBG_BUGON(pgnr >= be->nr_pages);
1151
1152 scur = bvi->bvec.offset + cur -
1153 ((pgnr << PAGE_SHIFT) - off0);
1154 len = min_t(unsigned int, end - cur, PAGE_SIZE - scur);
1155 if (!be->decompressed_pages[pgnr]) {
1156 err = -EFSCORRUPTED;
1157 cur += len;
1158 continue;
1159 }
1160 src = kmap_local_page(be->decompressed_pages[pgnr]);
1161 memcpy(dst + cur, src + scur, len);
1162 kunmap_local(src);
1163 cur += len;
1164 }
1165 kunmap_local(dst);
1166 z_erofs_onlinepage_endio(bvi->bvec.page, err);
1167 list_del(p);
1168 kfree(bvi);
1169 }
1170 }
1171
z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend * be)1172 static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be)
1173 {
1174 struct z_erofs_pcluster *pcl = be->pcl;
1175 struct z_erofs_bvec_iter biter;
1176 struct page *old_bvpage;
1177 int i;
1178
1179 z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0);
1180 for (i = 0; i < pcl->vcnt; ++i) {
1181 struct z_erofs_bvec bvec;
1182
1183 z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage);
1184
1185 if (old_bvpage)
1186 z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
1187
1188 DBG_BUGON(z_erofs_page_is_invalidated(bvec.page));
1189 z_erofs_do_decompressed_bvec(be, &bvec);
1190 }
1191
1192 old_bvpage = z_erofs_bvec_iter_end(&biter);
1193 if (old_bvpage)
1194 z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
1195 }
1196
/*
 * Collect the compressed (input) pages of the pcluster into
 * be->compressed_pages[].
 *
 * @be:         decompression backend; be->pcl is the pcluster to parse
 * @overlapped: set to true if at least one input page is neither a
 *              short-lived page nor a managed cache page, i.e. it was also
 *              registered as an output page (inplace I/O)
 *
 * Returns 0 on success or -EIO if an input page is missing or is not
 * uptodate.  All slots are visited even after an error has been noticed.
 */
static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
				  bool *overlapped)
{
	struct z_erofs_pcluster *pcl = be->pcl;
	unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
	unsigned int i;
	int err = 0;

	*overlapped = false;
	for (i = 0; i < pclusterpages; ++i) {
		struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i];
		struct page *page = bvec->page;

		/*
		 * Always fill the slot (even with NULL) so that nobody reads a
		 * stale or uninitialized entry of be->compressed_pages[] later.
		 */
		be->compressed_pages[i] = page;

		/* compressed pages ought to be present before decompressing */
		if (!page) {
			DBG_BUGON(1);
			/* don't decompress with a hole in the input */
			err = -EIO;
			continue;
		}

		if (z_erofs_is_inline_pcluster(pcl)) {
			if (!PageUptodate(page))
				err = -EIO;
			continue;
		}

		DBG_BUGON(z_erofs_page_is_invalidated(page));
		if (!z_erofs_is_shortlived_page(page)) {
			if (erofs_page_is_managed(EROFS_SB(be->sb), page)) {
				if (!PageUptodate(page))
					err = -EIO;
				continue;
			}
			/* a file page used for inplace I/O is an output too */
			z_erofs_do_decompressed_bvec(be, bvec);
			*overlapped = true;
		}
	}
	return err;
}
1238
z_erofs_decompress_pcluster(struct z_erofs_decompress_backend * be,int err)1239 static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
1240 int err)
1241 {
1242 struct erofs_sb_info *const sbi = EROFS_SB(be->sb);
1243 struct z_erofs_pcluster *pcl = be->pcl;
1244 unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
1245 const struct z_erofs_decompressor *decompressor =
1246 &erofs_decompressors[pcl->algorithmformat];
1247 unsigned int i, inputsize;
1248 int err2;
1249 struct page *page;
1250 bool overlapped;
1251
1252 mutex_lock(&pcl->lock);
1253 be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;
1254
1255 /* allocate (de)compressed page arrays if cannot be kept on stack */
1256 be->decompressed_pages = NULL;
1257 be->compressed_pages = NULL;
1258 be->onstack_used = 0;
1259 if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) {
1260 be->decompressed_pages = be->onstack_pages;
1261 be->onstack_used = be->nr_pages;
1262 memset(be->decompressed_pages, 0,
1263 sizeof(struct page *) * be->nr_pages);
1264 }
1265
1266 if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES)
1267 be->compressed_pages = be->onstack_pages + be->onstack_used;
1268
1269 if (!be->decompressed_pages)
1270 be->decompressed_pages =
1271 kvcalloc(be->nr_pages, sizeof(struct page *),
1272 GFP_KERNEL | __GFP_NOFAIL);
1273 if (!be->compressed_pages)
1274 be->compressed_pages =
1275 kvcalloc(pclusterpages, sizeof(struct page *),
1276 GFP_KERNEL | __GFP_NOFAIL);
1277
1278 z_erofs_parse_out_bvecs(be);
1279 err2 = z_erofs_parse_in_bvecs(be, &overlapped);
1280 if (err2)
1281 err = err2;
1282 if (err)
1283 goto out;
1284
1285 if (z_erofs_is_inline_pcluster(pcl))
1286 inputsize = pcl->tailpacking_size;
1287 else
1288 inputsize = pclusterpages * PAGE_SIZE;
1289
1290 err = decompressor->decompress(&(struct z_erofs_decompress_req) {
1291 .sb = be->sb,
1292 .in = be->compressed_pages,
1293 .out = be->decompressed_pages,
1294 .pageofs_in = pcl->pageofs_in,
1295 .pageofs_out = pcl->pageofs_out,
1296 .inputsize = inputsize,
1297 .outputsize = pcl->length,
1298 .alg = pcl->algorithmformat,
1299 .inplace_io = overlapped,
1300 .partial_decoding = pcl->partial,
1301 .fillgaps = pcl->multibases,
1302 }, be->pagepool);
1303
1304 out:
1305 /* must handle all compressed pages before actual file pages */
1306 if (z_erofs_is_inline_pcluster(pcl)) {
1307 page = pcl->compressed_bvecs[0].page;
1308 WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
1309 put_page(page);
1310 } else {
1311 for (i = 0; i < pclusterpages; ++i) {
1312 /* consider shortlived pages added when decompressing */
1313 page = be->compressed_pages[i];
1314
1315 if (erofs_page_is_managed(sbi, page))
1316 continue;
1317 (void)z_erofs_put_shortlivedpage(be->pagepool, page);
1318 WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
1319 }
1320 }
1321 if (be->compressed_pages < be->onstack_pages ||
1322 be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES)
1323 kvfree(be->compressed_pages);
1324 z_erofs_fill_other_copies(be, err);
1325
1326 for (i = 0; i < be->nr_pages; ++i) {
1327 page = be->decompressed_pages[i];
1328 if (!page)
1329 continue;
1330
1331 DBG_BUGON(z_erofs_page_is_invalidated(page));
1332
1333 /* recycle all individual short-lived pages */
1334 if (z_erofs_put_shortlivedpage(be->pagepool, page))
1335 continue;
1336 z_erofs_onlinepage_endio(page, err);
1337 }
1338
1339 if (be->decompressed_pages != be->onstack_pages)
1340 kvfree(be->decompressed_pages);
1341
1342 pcl->length = 0;
1343 pcl->partial = true;
1344 pcl->multibases = false;
1345 pcl->bvset.nextpage = NULL;
1346 pcl->vcnt = 0;
1347
1348 /* pcluster lock MUST be taken before the following line */
1349 WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
1350 mutex_unlock(&pcl->lock);
1351 return err;
1352 }
1353
z_erofs_decompress_queue(const struct z_erofs_decompressqueue * io,struct page ** pagepool)1354 static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
1355 struct page **pagepool)
1356 {
1357 struct z_erofs_decompress_backend be = {
1358 .sb = io->sb,
1359 .pagepool = pagepool,
1360 .decompressed_secondary_bvecs =
1361 LIST_HEAD_INIT(be.decompressed_secondary_bvecs),
1362 };
1363 z_erofs_next_pcluster_t owned = io->head;
1364
1365 while (owned != Z_EROFS_PCLUSTER_TAIL) {
1366 DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
1367
1368 be.pcl = container_of(owned, struct z_erofs_pcluster, next);
1369 owned = READ_ONCE(be.pcl->next);
1370
1371 z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0);
1372 if (z_erofs_is_inline_pcluster(be.pcl))
1373 z_erofs_free_pcluster(be.pcl);
1374 else
1375 erofs_workgroup_put(&be.pcl->obj);
1376 }
1377 }
1378
z_erofs_decompressqueue_work(struct work_struct * work)1379 static void z_erofs_decompressqueue_work(struct work_struct *work)
1380 {
1381 struct z_erofs_decompressqueue *bgq =
1382 container_of(work, struct z_erofs_decompressqueue, u.work);
1383 struct page *pagepool = NULL;
1384
1385 DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL);
1386 z_erofs_decompress_queue(bgq, &pagepool);
1387 erofs_release_pages(&pagepool);
1388 kvfree(bgq);
1389 }
1390
#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
/* kthread_work flavour of the handler; forwards to the workqueue handler */
static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work)
{
	struct work_struct *as_work = (struct work_struct *)work;

	z_erofs_decompressqueue_work(as_work);
}
#endif
1397
z_erofs_decompress_kickoff(struct z_erofs_decompressqueue * io,int bios)1398 static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
1399 int bios)
1400 {
1401 struct erofs_sb_info *const sbi = EROFS_SB(io->sb);
1402
1403 /* wake up the caller thread for sync decompression */
1404 if (io->sync) {
1405 if (!atomic_add_return(bios, &io->pending_bios))
1406 complete(&io->u.done);
1407 return;
1408 }
1409
1410 if (atomic_add_return(bios, &io->pending_bios))
1411 return;
1412 /* Use (kthread_)work and sync decompression for atomic contexts only */
1413 if (!in_task() || irqs_disabled() || rcu_read_lock_any_held()) {
1414 #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
1415 struct kthread_worker *worker;
1416
1417 rcu_read_lock();
1418 worker = rcu_dereference(
1419 z_erofs_pcpu_workers[raw_smp_processor_id()]);
1420 if (!worker) {
1421 INIT_WORK(&io->u.work, z_erofs_decompressqueue_work);
1422 queue_work(z_erofs_workqueue, &io->u.work);
1423 } else {
1424 kthread_queue_work(worker, &io->u.kthread_work);
1425 }
1426 rcu_read_unlock();
1427 #else
1428 queue_work(z_erofs_workqueue, &io->u.work);
1429 #endif
1430 /* enable sync decompression for readahead */
1431 if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO)
1432 sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON;
1433 return;
1434 }
1435 z_erofs_decompressqueue_work(&io->u.work);
1436 }
1437
pickup_page_for_submission(struct z_erofs_pcluster * pcl,unsigned int nr,struct page ** pagepool,struct address_space * mc)1438 static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
1439 unsigned int nr,
1440 struct page **pagepool,
1441 struct address_space *mc)
1442 {
1443 const pgoff_t index = pcl->obj.index;
1444 gfp_t gfp = mapping_gfp_mask(mc);
1445 bool tocache = false;
1446
1447 struct address_space *mapping;
1448 struct page *oldpage, *page;
1449 int justfound;
1450
1451 repeat:
1452 page = READ_ONCE(pcl->compressed_bvecs[nr].page);
1453 oldpage = page;
1454
1455 if (!page)
1456 goto out_allocpage;
1457
1458 justfound = (unsigned long)page & 1UL;
1459 page = (struct page *)((unsigned long)page & ~1UL);
1460
1461 /*
1462 * preallocated cached pages, which is used to avoid direct reclaim
1463 * otherwise, it will go inplace I/O path instead.
1464 */
1465 if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
1466 WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
1467 set_page_private(page, 0);
1468 tocache = true;
1469 goto out_tocache;
1470 }
1471 mapping = READ_ONCE(page->mapping);
1472
1473 /*
1474 * file-backed online pages in plcuster are all locked steady,
1475 * therefore it is impossible for `mapping' to be NULL.
1476 */
1477 if (mapping && mapping != mc)
1478 /* ought to be unmanaged pages */
1479 goto out;
1480
1481 /* directly return for shortlived page as well */
1482 if (z_erofs_is_shortlived_page(page))
1483 goto out;
1484
1485 lock_page(page);
1486 if (likely(page->mapping == mc)) {
1487 WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
1488
1489 /*
1490 * The cached folio is still in managed cache but without
1491 * a valid `->private` pcluster hint. Let's reconnect them.
1492 */
1493 if (!PagePrivate(page)) {
1494 /*
1495 * impossible to be !PagePrivate(page) for
1496 * the current restriction as well if
1497 * the page is already in compressed_bvecs[].
1498 */
1499 DBG_BUGON(!justfound);
1500
1501 justfound = 0;
1502 set_page_private(page, (unsigned long)pcl);
1503 SetPagePrivate(page);
1504 }
1505
1506 if (likely(page->private == (unsigned long)pcl)) {
1507 /* don't submit cache I/Os again if already uptodate */
1508 if (PageUptodate(page)) {
1509 unlock_page(page);
1510 page = NULL;
1511
1512 }
1513 goto out;
1514 }
1515 /*
1516 * Already linked with another pcluster, which only appears in
1517 * crafted images by fuzzers for now. But handle this anyway.
1518 */
1519 tocache = false; /* use temporary short-lived pages */
1520 } else {
1521 DBG_BUGON(1); /* referenced managed folios can't be truncated */
1522 tocache = true;
1523 }
1524 unlock_page(page);
1525 put_page(page);
1526 out_allocpage:
1527 page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
1528 if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page,
1529 oldpage, page)) {
1530 erofs_pagepool_add(pagepool, page);
1531 cond_resched();
1532 goto repeat;
1533 }
1534 out_tocache:
1535 if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) {
1536 /* turn into temporary page if fails (1 ref) */
1537 set_page_private(page, Z_EROFS_SHORTLIVED_PAGE);
1538 goto out;
1539 }
1540 attach_page_private(page, pcl);
1541 /* drop a refcount added by allocpage (then we have 2 refs here) */
1542 put_page(page);
1543
1544 out: /* the only exit (for tracing and debugging) */
1545 return page;
1546 }
1547
/*
 * Set up a decompress queue for @sb with an empty pcluster chain.
 *
 * @fgq: caller-provided queue used for foreground (sync) operation
 * @fg:  NULL to always use @fgq; otherwise, if *fg is false a background
 *       queue is allocated, and *fg is switched to true when that
 *       allocation fails so that the caller falls back to @fgq
 *
 * Returns the queue to use.
 */
static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb,
			      struct z_erofs_decompressqueue *fgq, bool *fg)
{
	struct z_erofs_decompressqueue *q = NULL;

	if (fg && !*fg) {
		q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN);
		if (q) {
#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
			kthread_init_work(&q->u.kthread_work,
					  z_erofs_decompressqueue_kthread_work);
#else
			INIT_WORK(&q->u.work, z_erofs_decompressqueue_work);
#endif
		} else {
			/* no memory for a background queue: go foreground */
			*fg = true;
		}
	}

	if (!q) {
		q = fgq;
		init_completion(&fgq->u.done);
		atomic_set(&fgq->pending_bios, 0);
		q->eio = false;
		q->sync = true;
	}

	q->sb = sb;
	q->head = Z_EROFS_PCLUSTER_TAIL;
	return q;
}
1577
/* decompression jobqueue types; NR_JOBQUEUES is the number of queues */
enum {
	JQ_BYPASS = 0,	/* pclusters which need no device I/O */
	JQ_SUBMIT,	/* pclusters which wait for submitted bios */
	NR_JOBQUEUES,
};
1584
/*
 * Unlink @pcl from the submit chain and append it to the bypass chain.
 *
 * @qtail:      per-queue pointers to the link where the next pcluster of
 *              that queue gets chained
 * @owned_head: the pcluster which followed @pcl in the submit chain
 */
static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
				    z_erofs_next_pcluster_t qtail[],
				    z_erofs_next_pcluster_t owned_head)
{
	z_erofs_next_pcluster_t *const submit_link = qtail[JQ_SUBMIT];
	z_erofs_next_pcluster_t *const bypass_link = qtail[JQ_BYPASS];
	z_erofs_next_pcluster_t *const own_link = &pcl->next;

	/* @pcl becomes the new end of the bypass chain */
	WRITE_ONCE(*own_link, Z_EROFS_PCLUSTER_TAIL);

	/* the submit chain skips over @pcl, the bypass chain gains it */
	WRITE_ONCE(*submit_link, owned_head);
	WRITE_ONCE(*bypass_link, own_link);

	qtail[JQ_BYPASS] = own_link;
}
1599
z_erofs_decompressqueue_endio(struct bio * bio)1600 static void z_erofs_decompressqueue_endio(struct bio *bio)
1601 {
1602 struct z_erofs_decompressqueue *q = bio->bi_private;
1603 blk_status_t err = bio->bi_status;
1604 struct bio_vec *bvec;
1605 struct bvec_iter_all iter_all;
1606
1607 bio_for_each_segment_all(bvec, bio, iter_all) {
1608 struct page *page = bvec->bv_page;
1609
1610 DBG_BUGON(PageUptodate(page));
1611 DBG_BUGON(z_erofs_page_is_invalidated(page));
1612
1613 if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
1614 if (!err)
1615 SetPageUptodate(page);
1616 unlock_page(page);
1617 }
1618 }
1619 if (err)
1620 q->eio = true;
1621 z_erofs_decompress_kickoff(q, -1);
1622 bio_put(bio);
1623 }
1624
/*
 * Submit read bios for all pclusters chained on f->owned_head.
 *
 * @f:         decompress frontend owning the pcluster chain
 * @fgq:       array of NR_JOBQUEUES caller-provided foreground queues
 * @force_fg:  in: whether foreground decompression is required; may be
 *             switched to true by jobqueue_init()
 * @readahead: mark the bios with REQ_RAHEAD
 *
 * Pclusters which need no I/O at all are moved to the bypass queue, the
 * others stay on the submit queue, which is kicked off at the end.
 */
static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
				 struct z_erofs_decompressqueue *fgq,
				 bool *force_fg, bool readahead)
{
	struct super_block *sb = f->inode->i_sb;
	struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb));
	z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
	struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
	z_erofs_next_pcluster_t owned_head = f->owned_head;
	/* bio is NULL initially, so no need to initialize last_{index,bdev} */
	pgoff_t last_index;
	struct block_device *last_bdev;
	unsigned int nr_bios = 0;
	struct bio *bio = NULL;
	unsigned long pflags;
	bool in_memstall = false;

	/*
	 * if managed cache is enabled, bypass jobqueue is needed,
	 * no need to read from device for all pclusters in this queue.
	 */
	q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
	q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg);

	qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
	qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;

	/* by default, all need io submission */
	q[JQ_SUBMIT]->head = owned_head;

	do {
		struct erofs_map_dev mdev;
		struct z_erofs_pcluster *pcl;
		pgoff_t cur, end;
		unsigned int i = 0;
		bool bypass = true;

		DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
		pcl = container_of(owned_head, struct z_erofs_pcluster, next);
		owned_head = READ_ONCE(pcl->next);

		if (z_erofs_is_inline_pcluster(pcl)) {
			move_to_bypass_jobqueue(pcl, qtail, owned_head);
			continue;
		}

		/* no device id here, thus it will always succeed */
		mdev = (struct erofs_map_dev) {
			.m_pa = erofs_pos(sb, pcl->obj.index),
		};
		(void)erofs_map_dev(sb, &mdev);

		cur = erofs_blknr(sb, mdev.m_pa);
		end = cur + pcl->pclusterpages;

		do {
			struct page *page;

			/* a discontiguous block or another device ends the bio */
			if (bio && (cur != last_index + 1 ||
				    last_bdev != mdev.m_bdev)) {
				submit_bio(bio);
				if (in_memstall) {
					psi_memstall_leave(&pflags);
					in_memstall = false;
				}
				bio = NULL;
			}

			page = pickup_page_for_submission(pcl, i++,
							  &f->pagepool, mc);
			if (!page)
				continue;

			/* retried with a fresh bio if the current one is full */
			for (;;) {
				if (unlikely(PageWorkingset(page)) &&
				    !in_memstall) {
					psi_memstall_enter(&pflags);
					in_memstall = true;
				}

				if (!bio) {
					bio = bio_alloc(mdev.m_bdev,
							BIO_MAX_VECS,
							REQ_OP_READ, GFP_NOIO);
					bio->bi_end_io =
						z_erofs_decompressqueue_endio;

					last_bdev = mdev.m_bdev;
					bio->bi_iter.bi_sector = (sector_t)cur <<
						(sb->s_blocksize_bits - 9);
					bio->bi_private = q[JQ_SUBMIT];
					if (readahead)
						bio->bi_opf |= REQ_RAHEAD;
					++nr_bios;
				}

				if (bio_add_page(bio, page, PAGE_SIZE, 0) >=
				    PAGE_SIZE)
					break;

				/* drain the full bio and try the page again */
				submit_bio(bio);
				if (in_memstall) {
					psi_memstall_leave(&pflags);
					in_memstall = false;
				}
				bio = NULL;
			}

			last_index = cur;
			bypass = false;
		} while (++cur < end);

		if (bypass)
			move_to_bypass_jobqueue(pcl, qtail, owned_head);
		else
			qtail[JQ_SUBMIT] = &pcl->next;
	} while (owned_head != Z_EROFS_PCLUSTER_TAIL);

	if (bio)
		submit_bio(bio);
	if (in_memstall)
		psi_memstall_leave(&pflags);

	/*
	 * although background is preferred, no one is pending for submission.
	 * don't issue decompression but drop it directly instead.
	 */
	if (!*force_fg && !nr_bios) {
		kvfree(q[JQ_SUBMIT]);
		return;
	}
	z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios);
}
1748
/*
 * Submit the pclusters collected by @f and decompress what can be
 * decompressed by the caller.
 *
 * @force_fg: wait for the I/O and decompress the submit queue here
 * @ra:       the request comes from readahead
 */
static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
			     bool force_fg, bool ra)
{
	struct z_erofs_decompressqueue io[NR_JOBQUEUES];

	/* nothing has been collected */
	if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
		return;

	/* note that force_fg may be turned on by the submission */
	z_erofs_submit_queue(f, io, &force_fg, ra);

	/* handle bypass queue (no i/o pclusters) immediately */
	z_erofs_decompress_queue(io + JQ_BYPASS, &f->pagepool);

	if (force_fg) {
		/* wait until all bios are completed */
		wait_for_completion_io(&io[JQ_SUBMIT].u.done);

		/* handle synchronous decompress queue in the caller context */
		z_erofs_decompress_queue(io + JQ_SUBMIT, &f->pagepool);
	}
}
1770
1771 /*
1772 * Since partial uptodate is still unimplemented for now, we have to use
1773 * approximate readmore strategies as a start.
1774 */
z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend * f,struct readahead_control * rac,bool backmost)1775 static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
1776 struct readahead_control *rac, bool backmost)
1777 {
1778 struct inode *inode = f->inode;
1779 struct erofs_map_blocks *map = &f->map;
1780 erofs_off_t cur, end, headoffset = f->headoffset;
1781 int err;
1782
1783 if (backmost) {
1784 if (rac)
1785 end = headoffset + readahead_length(rac) - 1;
1786 else
1787 end = headoffset + PAGE_SIZE - 1;
1788 map->m_la = end;
1789 err = z_erofs_map_blocks_iter(inode, map,
1790 EROFS_GET_BLOCKS_READMORE);
1791 if (err)
1792 return;
1793
1794 /* expand ra for the trailing edge if readahead */
1795 if (rac) {
1796 cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
1797 readahead_expand(rac, headoffset, cur - headoffset);
1798 return;
1799 }
1800 end = round_up(end, PAGE_SIZE);
1801 } else {
1802 end = round_up(map->m_la, PAGE_SIZE);
1803
1804 if (!map->m_llen)
1805 return;
1806 }
1807
1808 cur = map->m_la + map->m_llen - 1;
1809 while ((cur >= end) && (cur < i_size_read(inode))) {
1810 pgoff_t index = cur >> PAGE_SHIFT;
1811 struct page *page;
1812
1813 page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
1814 if (page) {
1815 if (PageUptodate(page))
1816 unlock_page(page);
1817 else
1818 (void)z_erofs_do_read_page(f, page);
1819 put_page(page);
1820 }
1821
1822 if (cur < PAGE_SIZE)
1823 break;
1824 cur = (index << PAGE_SHIFT) - 1;
1825 }
1826 }
1827
z_erofs_read_folio(struct file * file,struct folio * folio)1828 static int z_erofs_read_folio(struct file *file, struct folio *folio)
1829 {
1830 struct inode *const inode = folio->mapping->host;
1831 struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
1832 struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
1833 int err;
1834
1835 trace_erofs_read_folio(folio, false);
1836 f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
1837
1838 z_erofs_pcluster_readmore(&f, NULL, true);
1839 err = z_erofs_do_read_page(&f, &folio->page);
1840 z_erofs_pcluster_readmore(&f, NULL, false);
1841 z_erofs_pcluster_end(&f);
1842
1843 /* if some compressed cluster ready, need submit them anyway */
1844 z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false);
1845
1846 if (err && err != -EINTR)
1847 erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu",
1848 err, folio->index, EROFS_I(inode)->nid);
1849
1850 erofs_put_metabuf(&f.map.buf);
1851 erofs_release_pages(&f.pagepool);
1852 return err;
1853 }
1854
z_erofs_readahead(struct readahead_control * rac)1855 static void z_erofs_readahead(struct readahead_control *rac)
1856 {
1857 struct inode *const inode = rac->mapping->host;
1858 struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
1859 struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
1860 struct folio *head = NULL, *folio;
1861 unsigned int nr_folios;
1862 int err;
1863
1864 f.headoffset = readahead_pos(rac);
1865
1866 z_erofs_pcluster_readmore(&f, rac, true);
1867 nr_folios = readahead_count(rac);
1868 trace_erofs_readpages(inode, readahead_index(rac), nr_folios, false);
1869
1870 while ((folio = readahead_folio(rac))) {
1871 folio->private = head;
1872 head = folio;
1873 }
1874
1875 /* traverse in reverse order for best metadata I/O performance */
1876 while (head) {
1877 folio = head;
1878 head = folio_get_private(folio);
1879
1880 err = z_erofs_do_read_page(&f, &folio->page);
1881 if (err && err != -EINTR)
1882 erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
1883 folio->index, EROFS_I(inode)->nid);
1884 }
1885 z_erofs_pcluster_readmore(&f, rac, false);
1886 z_erofs_pcluster_end(&f);
1887
1888 z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true);
1889 erofs_put_metabuf(&f.map.buf);
1890 erofs_release_pages(&f.pagepool);
1891 }
1892
1893 const struct address_space_operations z_erofs_aops = {
1894 .read_folio = z_erofs_read_folio,
1895 .readahead = z_erofs_readahead,
1896 };
1897