1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (C) 2018 HUAWEI, Inc.
4 * https://www.huawei.com/
5 * Copyright (C) 2022 Alibaba Cloud
6 */
7 #include "compress.h"
8 #include <linux/psi.h>
9 #include <linux/cpuhotplug.h>
10 #include <trace/events/erofs.h>
11
12 #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
13 #define Z_EROFS_INLINE_BVECS 2
14
15 /*
16 * Let's leave a type here in case another tagged
17 * pointer is introduced later.
18 */
19 typedef void *z_erofs_next_pcluster_t;
20
21 struct z_erofs_bvec {
22 struct page *page;
23 int offset;
24 unsigned int end;
25 };
26
27 #define __Z_EROFS_BVSET(name, total) \
28 struct name { \
29 /* point to the next page which contains the following bvecs */ \
30 struct page *nextpage; \
31 struct z_erofs_bvec bvec[total]; \
32 }
33 __Z_EROFS_BVSET(z_erofs_bvset,);
34 __Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
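/*
 * Both variants can chain further bvec pages via ->nextpage:
 * z_erofs_bvset describes a full page of bvecs, while
 * z_erofs_bvset_inline embeds Z_EROFS_INLINE_BVECS entries directly
 * in the pcluster for bootstrap (see z_erofs_bvec_iter_begin() below).
 */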
35
36 /*
37 * Structure fields follow one of the following exclusion rules.
38 *
39 * I: Modifiable by initialization/destruction paths and read-only
40 * for everyone else;
41 *
42 * L: Field should be protected by the pcluster lock;
43 *
44 * A: Field should be accessed / updated in atomic for parallelized code.
45 */
46 struct z_erofs_pcluster {
47 struct erofs_workgroup obj;
48 struct mutex lock;
49
50 /* A: point to next chained pcluster or TAILs */
51 z_erofs_next_pcluster_t next;
52
53 /* L: the maximum decompression size of this round */
54 unsigned int length;
55
56 /* L: total number of bvecs */
57 unsigned int vcnt;
58
59 /* I: page offset of start position of decompression */
60 unsigned short pageofs_out;
61
62 /* I: page offset of inline compressed data */
63 unsigned short pageofs_in;
64
65 union {
66 /* L: inline a certain number of bvec for bootstrap */
67 struct z_erofs_bvset_inline bvset;
68
69 /* I: can be used to free the pcluster by RCU. */
70 struct rcu_head rcu;
71 };
72
73 union {
74 /* I: physical cluster size in pages */
75 unsigned short pclusterpages;
76
77 /* I: tailpacking inline compressed size */
78 unsigned short tailpacking_size;
79 };
80
81 /* I: compression algorithm format */
82 unsigned char algorithmformat;
83
84 /* L: whether partial decompression or not */
85 bool partial;
86
87 /* L: indicate several pageofs_outs or not */
88 bool multibases;
89
90 /* A: compressed bvecs (can be cached or inplaced pages) */
91 struct z_erofs_bvec compressed_bvecs[];
92 };
93
94 /* the end of a chain of pclusters */
95 #define Z_EROFS_PCLUSTER_TAIL ((void *) 0x700 + POISON_POINTER_DELTA)
96 #define Z_EROFS_PCLUSTER_NIL (NULL)
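/*
 * Chain markers: Z_EROFS_PCLUSTER_NIL means the pcluster isn't linked into
 * any chain yet, while Z_EROFS_PCLUSTER_TAIL (a poisoned non-NULL value)
 * terminates a submission chain; see z_erofs_try_to_claim_pcluster().
 */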
97
98 struct z_erofs_decompressqueue {
99 struct super_block *sb;
100 atomic_t pending_bios;
101 z_erofs_next_pcluster_t head;
102
103 union {
104 struct completion done;
105 struct work_struct work;
106 struct kthread_work kthread_work;
107 } u;
108 bool eio, sync;
109 };
110
111 static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
112 {
113 return !pcl->obj.index;
114 }
115
116 static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
117 {
118 if (z_erofs_is_inline_pcluster(pcl))
119 return 1;
120 return pcl->pclusterpages;
121 }
122
123 /*
124 * bit 30: I/O error occurred on this page
125 * bit 29: CPU has dirty data in D-cache (needs aliasing handling);
126 * bit 0 - 28: remaining parts to complete this page
127 */
128 #define Z_EROFS_ONLINEPAGE_EIO 30
129 #define Z_EROFS_ONLINEPAGE_DIRTY 29
130
131 static inline void z_erofs_onlinepage_init(struct page *page)
132 {
133 union {
134 atomic_t o;
135 unsigned long v;
136 } u = { .o = ATOMIC_INIT(1) };
137
138 set_page_private(page, u.v);
139 smp_wmb();
140 SetPagePrivate(page);
141 }
142
143 static inline void z_erofs_onlinepage_split(struct page *page)
144 {
145 atomic_inc((atomic_t *)&page->private);
146 }
147
148 static void z_erofs_onlinepage_end(struct page *page, int err, bool dirty)
149 {
150 int orig, v;
151
152 DBG_BUGON(!PagePrivate(page));
153
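/*
 * Lock-free completion: drop one part reference and fold in the error
 * and dirty bits; retry until the cmpxchg on page->private wins.
 */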
154 do {
155 orig = atomic_read((atomic_t *)&page->private);
156 DBG_BUGON(orig <= 0);
157 v = dirty << Z_EROFS_ONLINEPAGE_DIRTY;
158 v |= (orig - 1) | (!!err << Z_EROFS_ONLINEPAGE_EIO);
159 } while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig);
160
161 if (v & (BIT(Z_EROFS_ONLINEPAGE_DIRTY) - 1))
162 return;
163 set_page_private(page, 0);
164 ClearPagePrivate(page);
165 if (v & BIT(Z_EROFS_ONLINEPAGE_DIRTY))
166 flush_dcache_page(page);
167 if (!(v & BIT(Z_EROFS_ONLINEPAGE_EIO)))
168 SetPageUptodate(page);
169 unlock_page(page);
170 }
171
172 #define Z_EROFS_ONSTACK_PAGES 32
173
174 /*
175 * Since the pcluster size is variable due to the big pcluster feature,
176 * introduce slab pools for the different pcluster sizes.
177 */
178 struct z_erofs_pcluster_slab {
179 struct kmem_cache *slab;
180 unsigned int maxpages;
181 char name[48];
182 };
183
184 #define _PCLP(n) { .maxpages = n }
185
186 static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
187 _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
188 _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
189 };
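/*
 * z_erofs_alloc_pcluster() below picks the first (smallest) slab whose
 * maxpages can hold the requested number of compressed pages.
 */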
190
191 struct z_erofs_bvec_iter {
192 struct page *bvpage;
193 struct z_erofs_bvset *bvset;
194 unsigned int nr, cur;
195 };
196
197 static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter)
198 {
199 if (iter->bvpage)
200 kunmap_local(iter->bvset);
201 return iter->bvpage;
202 }
203
204 static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter)
205 {
206 unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec;
207 /* have to access nextpage in advance, otherwise it will be unmapped */
208 struct page *nextpage = iter->bvset->nextpage;
209 struct page *oldpage;
210
211 DBG_BUGON(!nextpage);
212 oldpage = z_erofs_bvec_iter_end(iter);
213 iter->bvpage = nextpage;
214 iter->bvset = kmap_local_page(nextpage);
215 iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec);
216 iter->cur = 0;
217 return oldpage;
218 }
219
220 static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter,
221 struct z_erofs_bvset_inline *bvset,
222 unsigned int bootstrap_nr,
223 unsigned int cur)
224 {
225 *iter = (struct z_erofs_bvec_iter) {
226 .nr = bootstrap_nr,
227 .bvset = (struct z_erofs_bvset *)bvset,
228 };
229
230 while (cur > iter->nr) {
231 cur -= iter->nr;
232 z_erofs_bvset_flip(iter);
233 }
234 iter->cur = cur;
235 }
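/*
 * The enqueue/dequeue helpers below append to or consume from the bvec set
 * through this iterator, flipping to the next bvec page whenever the current
 * set is full (enqueue) or exhausted (dequeue).
 */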
236
237 static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
238 struct z_erofs_bvec *bvec,
239 struct page **candidate_bvpage,
240 struct page **pagepool)
241 {
242 if (iter->cur >= iter->nr) {
243 struct page *nextpage = *candidate_bvpage;
244
245 if (!nextpage) {
246 nextpage = erofs_allocpage(pagepool, GFP_NOFS);
247 if (!nextpage)
248 return -ENOMEM;
249 set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE);
250 }
251 DBG_BUGON(iter->bvset->nextpage);
252 iter->bvset->nextpage = nextpage;
253 z_erofs_bvset_flip(iter);
254
255 iter->bvset->nextpage = NULL;
256 *candidate_bvpage = NULL;
257 }
258 iter->bvset->bvec[iter->cur++] = *bvec;
259 return 0;
260 }
261
262 static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter,
263 struct z_erofs_bvec *bvec,
264 struct page **old_bvpage)
265 {
266 if (iter->cur == iter->nr)
267 *old_bvpage = z_erofs_bvset_flip(iter);
268 else
269 *old_bvpage = NULL;
270 *bvec = iter->bvset->bvec[iter->cur++];
271 }
272
273 static void z_erofs_destroy_pcluster_pool(void)
274 {
275 int i;
276
277 for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
278 if (!pcluster_pool[i].slab)
279 continue;
280 kmem_cache_destroy(pcluster_pool[i].slab);
281 pcluster_pool[i].slab = NULL;
282 }
283 }
284
285 static int z_erofs_create_pcluster_pool(void)
286 {
287 struct z_erofs_pcluster_slab *pcs;
288 struct z_erofs_pcluster *a;
289 unsigned int size;
290
291 for (pcs = pcluster_pool;
292 pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
293 size = struct_size(a, compressed_bvecs, pcs->maxpages);
294
295 sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages);
296 pcs->slab = kmem_cache_create(pcs->name, size, 0,
297 SLAB_RECLAIM_ACCOUNT, NULL);
298 if (pcs->slab)
299 continue;
300
301 z_erofs_destroy_pcluster_pool();
302 return -ENOMEM;
303 }
304 return 0;
305 }
306
307 static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages)
308 {
309 int i;
310
311 for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
312 struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
313 struct z_erofs_pcluster *pcl;
314
315 if (nrpages > pcs->maxpages)
316 continue;
317
318 pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS);
319 if (!pcl)
320 return ERR_PTR(-ENOMEM);
321 pcl->pclusterpages = nrpages;
322 return pcl;
323 }
324 return ERR_PTR(-EINVAL);
325 }
326
327 static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
328 {
329 unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
330 int i;
331
332 for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
333 struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
334
335 if (pclusterpages > pcs->maxpages)
336 continue;
337
338 kmem_cache_free(pcs->slab, pcl);
339 return;
340 }
341 DBG_BUGON(1);
342 }
343
344 static struct workqueue_struct *z_erofs_workqueue __read_mostly;
345
346 #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
347 static struct kthread_worker __rcu **z_erofs_pcpu_workers;
348
349 static void erofs_destroy_percpu_workers(void)
350 {
351 struct kthread_worker *worker;
352 unsigned int cpu;
353
354 for_each_possible_cpu(cpu) {
355 worker = rcu_dereference_protected(
356 z_erofs_pcpu_workers[cpu], 1);
357 rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL);
358 if (worker)
359 kthread_destroy_worker(worker);
360 }
361 kfree(z_erofs_pcpu_workers);
362 }
363
364 static struct kthread_worker *erofs_init_percpu_worker(int cpu)
365 {
366 struct kthread_worker *worker =
367 kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu);
368
369 if (IS_ERR(worker))
370 return worker;
371 if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI))
372 sched_set_fifo_low(worker->task);
373 return worker;
374 }
375
376 static int erofs_init_percpu_workers(void)
377 {
378 struct kthread_worker *worker;
379 unsigned int cpu;
380
381 z_erofs_pcpu_workers = kcalloc(num_possible_cpus(),
382 sizeof(struct kthread_worker *), GFP_ATOMIC);
383 if (!z_erofs_pcpu_workers)
384 return -ENOMEM;
385
386 for_each_online_cpu(cpu) { /* could miss cpu{off,on}line? */
387 worker = erofs_init_percpu_worker(cpu);
388 if (!IS_ERR(worker))
389 rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker);
390 }
391 return 0;
392 }
393 #else
394 static inline void erofs_destroy_percpu_workers(void) {}
395 static inline int erofs_init_percpu_workers(void) { return 0; }
396 #endif
397
398 #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD)
399 static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock);
400 static enum cpuhp_state erofs_cpuhp_state;
401
402 static int erofs_cpu_online(unsigned int cpu)
403 {
404 struct kthread_worker *worker, *old;
405
406 worker = erofs_init_percpu_worker(cpu);
407 if (IS_ERR(worker))
408 return PTR_ERR(worker);
409
410 spin_lock(&z_erofs_pcpu_worker_lock);
411 old = rcu_dereference_protected(z_erofs_pcpu_workers[cpu],
412 lockdep_is_held(&z_erofs_pcpu_worker_lock));
413 if (!old)
414 rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker);
415 spin_unlock(&z_erofs_pcpu_worker_lock);
416 if (old)
417 kthread_destroy_worker(worker);
418 return 0;
419 }
420
421 static int erofs_cpu_offline(unsigned int cpu)
422 {
423 struct kthread_worker *worker;
424
425 spin_lock(&z_erofs_pcpu_worker_lock);
426 worker = rcu_dereference_protected(z_erofs_pcpu_workers[cpu],
427 lockdep_is_held(&z_erofs_pcpu_worker_lock));
428 rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL);
429 spin_unlock(&z_erofs_pcpu_worker_lock);
430
431 synchronize_rcu();
432 if (worker)
433 kthread_destroy_worker(worker);
434 return 0;
435 }
436
437 static int erofs_cpu_hotplug_init(void)
438 {
439 int state;
440
441 state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
442 "fs/erofs:online", erofs_cpu_online, erofs_cpu_offline);
443 if (state < 0)
444 return state;
445
446 erofs_cpuhp_state = state;
447 return 0;
448 }
449
450 static void erofs_cpu_hotplug_destroy(void)
451 {
452 if (erofs_cpuhp_state)
453 cpuhp_remove_state_nocalls(erofs_cpuhp_state);
454 }
455 #else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */
456 static inline int erofs_cpu_hotplug_init(void) { return 0; }
457 static inline void erofs_cpu_hotplug_destroy(void) {}
458 #endif
459
460 void z_erofs_exit_zip_subsystem(void)
461 {
462 erofs_cpu_hotplug_destroy();
463 erofs_destroy_percpu_workers();
464 destroy_workqueue(z_erofs_workqueue);
465 z_erofs_destroy_pcluster_pool();
466 }
467
468 int __init z_erofs_init_zip_subsystem(void)
469 {
470 int err = z_erofs_create_pcluster_pool();
471
472 if (err)
473 goto out_error_pcluster_pool;
474
475 z_erofs_workqueue = alloc_workqueue("erofs_worker",
476 WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus());
477 if (!z_erofs_workqueue) {
478 err = -ENOMEM;
479 goto out_error_workqueue_init;
480 }
481
482 err = erofs_init_percpu_workers();
483 if (err)
484 goto out_error_pcpu_worker;
485
486 err = erofs_cpu_hotplug_init();
487 if (err < 0)
488 goto out_error_cpuhp_init;
489 return err;
490
491 out_error_cpuhp_init:
492 erofs_destroy_percpu_workers();
493 out_error_pcpu_worker:
494 destroy_workqueue(z_erofs_workqueue);
495 out_error_workqueue_init:
496 z_erofs_destroy_pcluster_pool();
497 out_error_pcluster_pool:
498 return err;
499 }
500
501 enum z_erofs_pclustermode {
502 Z_EROFS_PCLUSTER_INFLIGHT,
503 /*
504 * A weak form of Z_EROFS_PCLUSTER_FOLLOWED; the difference is that it
505 * could be dispatched into the bypass queue later due to up-to-date managed
506 * pages. All related online pages cannot be reused for in-place I/O (or
507 * bvpage) since the pcluster can be decoded directly without I/O submission.
508 */
509 Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE,
510 /*
511 * The pcluster was just linked to a decompression chain by us. It can
512 * also be linked with the remaining pclusters, which means if the
513 * processing page is the tail page of a pcluster, this pcluster can
514 * safely use the whole page (since the previous pcluster is within the
515 * same chain) for in-place I/O, as illustrated below:
516 * ___________________________________________________
517 * | tail (partial) page | head (partial) page |
518 * | (of the current pcl) | (of the previous pcl) |
519 * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____|
520 *
521 * [ (*) the page above can be used as inplace I/O. ]
522 */
523 Z_EROFS_PCLUSTER_FOLLOWED,
524 };
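/*
 * Note: the enum values are ordered deliberately; comparisons such as
 * "fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED" below rely on
 * INFLIGHT < FOLLOWED_NOINPLACE < FOLLOWED.
 */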
525
526 struct z_erofs_decompress_frontend {
527 struct inode *const inode;
528 struct erofs_map_blocks map;
529 struct z_erofs_bvec_iter biter;
530
531 struct page *pagepool;
532 struct page *candidate_bvpage;
533 struct z_erofs_pcluster *pcl;
534 z_erofs_next_pcluster_t owned_head;
535 enum z_erofs_pclustermode mode;
536
537 erofs_off_t headoffset;
538
539 /* a pointer used to pick up inplace I/O pages */
540 unsigned int icur;
541 };
542
543 #define DECOMPRESS_FRONTEND_INIT(__i) { \
544 .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
545 .mode = Z_EROFS_PCLUSTER_FOLLOWED }
546
547 static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe)
548 {
549 unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy;
550
551 if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
552 return false;
553
554 if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED))
555 return true;
556
557 if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
558 fe->map.m_la < fe->headoffset)
559 return true;
560
561 return false;
562 }
563
564 static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
565 {
566 struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
567 struct z_erofs_pcluster *pcl = fe->pcl;
568 bool shouldalloc = z_erofs_should_alloc_cache(fe);
569 bool standalone = true;
570 /*
571 * Optimistic allocation without direct reclaim, since in-place I/O
572 * can be used instead under memory pressure.
573 */
574 gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
575 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
576 unsigned int i;
577
578 if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
579 return;
580
581 for (i = 0; i < pcl->pclusterpages; ++i) {
582 struct page *page;
583 void *t; /* mark pages just found for debugging */
584 struct page *newpage = NULL;
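/*
 * Bit 0 of the pointer stored below tags pages that were just found
 * or preallocated here; pickup_page_for_submission() strips and checks
 * this "justfound" tag at submission time.
 */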
585
586 /* the compressed page was loaded before */
587 if (READ_ONCE(pcl->compressed_bvecs[i].page))
588 continue;
589
590 page = find_get_page(mc, pcl->obj.index + i);
591
592 if (page) {
593 t = (void *)((unsigned long)page | 1);
594 } else {
595 /* I/O is needed, not possible to decompress directly */
596 standalone = false;
597 if (!shouldalloc)
598 continue;
599
600 /*
601 * try to use cached I/O if page allocation
602 * succeeds, or fall back to in-place I/O instead
603 * to avoid any direct reclaim.
604 */
605 newpage = erofs_allocpage(&fe->pagepool, gfp);
606 if (!newpage)
607 continue;
608 set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
609 t = (void *)((unsigned long)newpage | 1);
610 }
611
612 if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
613 continue;
614
615 if (page)
616 put_page(page);
617 else if (newpage)
618 erofs_pagepool_add(&fe->pagepool, newpage);
619 }
620
621 /*
622 * don't do inplace I/O if all compressed pages are available in
623 * managed cache since it can be moved to the bypass queue instead.
624 */
625 if (standalone)
626 fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
627 }
628
629 /* called by erofs_shrinker to get rid of all compressed_pages */
630 int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
631 struct erofs_workgroup *grp)
632 {
633 struct z_erofs_pcluster *const pcl =
634 container_of(grp, struct z_erofs_pcluster, obj);
635 int i;
636
637 DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
638 /*
639 * The refcount of the workgroup is now frozen at 0,
640 * therefore no need to worry about available decompression users.
641 */
642 for (i = 0; i < pcl->pclusterpages; ++i) {
643 struct page *page = pcl->compressed_bvecs[i].page;
644
645 if (!page)
646 continue;
647
648 /* block other users from reclaiming or migrating the page */
649 if (!trylock_page(page))
650 return -EBUSY;
651
652 if (!erofs_page_is_managed(sbi, page))
653 continue;
654
655 /* barrier is implied in the following 'unlock_page' */
656 WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
657 detach_page_private(page);
658 unlock_page(page);
659 }
660 return 0;
661 }
662
663 static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
664 {
665 struct z_erofs_pcluster *pcl = folio_get_private(folio);
666 bool ret;
667 int i;
668
669 if (!folio_test_private(folio))
670 return true;
671
672 ret = false;
673 spin_lock(&pcl->obj.lockref.lock);
674 if (pcl->obj.lockref.count > 0)
675 goto out;
676
677 DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
678 for (i = 0; i < pcl->pclusterpages; ++i) {
679 if (pcl->compressed_bvecs[i].page == &folio->page) {
680 WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
681 ret = true;
682 break;
683 }
684 }
685 if (ret)
686 folio_detach_private(folio);
687 out:
688 spin_unlock(&pcl->obj.lockref.lock);
689 return ret;
690 }
691
692 /*
693 * It will be called only on inode eviction. In case there are still some
694 * decompression requests in progress, wait with rescheduling for a bit here.
695 * An extra lock could be introduced instead but it seems unnecessary.
696 */
697 static void z_erofs_cache_invalidate_folio(struct folio *folio,
698 size_t offset, size_t length)
699 {
700 const size_t stop = length + offset;
701
702 /* Check for potential overflow in debug mode */
703 DBG_BUGON(stop > folio_size(folio) || stop < length);
704
705 if (offset == 0 && stop == folio_size(folio))
706 while (!z_erofs_cache_release_folio(folio, GFP_NOFS))
707 cond_resched();
708 }
709
710 static const struct address_space_operations z_erofs_cache_aops = {
711 .release_folio = z_erofs_cache_release_folio,
712 .invalidate_folio = z_erofs_cache_invalidate_folio,
713 };
714
715 int erofs_init_managed_cache(struct super_block *sb)
716 {
717 struct inode *const inode = new_inode(sb);
718
719 if (!inode)
720 return -ENOMEM;
721
722 set_nlink(inode, 1);
723 inode->i_size = OFFSET_MAX;
724 inode->i_mapping->a_ops = &z_erofs_cache_aops;
725 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
726 EROFS_SB(sb)->managed_cache = inode;
727 return 0;
728 }
729
730 static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
731 struct z_erofs_bvec *bvec)
732 {
733 struct z_erofs_pcluster *const pcl = fe->pcl;
734
735 while (fe->icur > 0) {
736 if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
737 NULL, bvec->page)) {
738 pcl->compressed_bvecs[fe->icur] = *bvec;
739 return true;
740 }
741 }
742 return false;
743 }
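/*
 * In-place slots are claimed from the end of compressed_bvecs[] backwards
 * (fe->icur counts down), matching the reverse traversal noted in
 * z_erofs_pcluster_begin().
 */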
744
745 /* callers must hold the pcluster lock */
746 static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
747 struct z_erofs_bvec *bvec, bool exclusive)
748 {
749 int ret;
750
751 if (exclusive) {
752 /* give priority for inplaceio to use file pages first */
753 if (z_erofs_try_inplace_io(fe, bvec))
754 return 0;
755 /* otherwise, check if it can be used as a bvpage */
756 if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
757 !fe->candidate_bvpage)
758 fe->candidate_bvpage = bvec->page;
759 }
760 ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage,
761 &fe->pagepool);
762 fe->pcl->vcnt += (ret >= 0);
763 return ret;
764 }
765
766 static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
767 {
768 struct z_erofs_pcluster *pcl = f->pcl;
769 z_erofs_next_pcluster_t *owned_head = &f->owned_head;
770
771 /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */
772 if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
773 *owned_head) == Z_EROFS_PCLUSTER_NIL) {
774 *owned_head = &pcl->next;
775 /* so we can attach this pcluster to our submission chain. */
776 f->mode = Z_EROFS_PCLUSTER_FOLLOWED;
777 return;
778 }
779
780 /* type 2, it belongs to an ongoing chain */
781 f->mode = Z_EROFS_PCLUSTER_INFLIGHT;
782 }
783
784 static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
785 {
786 struct erofs_map_blocks *map = &fe->map;
787 bool ztailpacking = map->m_flags & EROFS_MAP_META;
788 struct z_erofs_pcluster *pcl;
789 struct erofs_workgroup *grp;
790 int err;
791
792 if (!(map->m_flags & EROFS_MAP_ENCODED) ||
793 (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) {
794 DBG_BUGON(1);
795 return -EFSCORRUPTED;
796 }
797
798 /* no available pcluster, let's allocate one */
799 pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 :
800 map->m_plen >> PAGE_SHIFT);
801 if (IS_ERR(pcl))
802 return PTR_ERR(pcl);
803
804 spin_lock_init(&pcl->obj.lockref.lock);
805 pcl->obj.lockref.count = 1; /* one ref for this request */
806 pcl->algorithmformat = map->m_algorithmformat;
807 pcl->length = 0;
808 pcl->partial = true;
809
810 /* new pclusters should be claimed as type 1, primary and followed */
811 pcl->next = fe->owned_head;
812 pcl->pageofs_out = map->m_la & ~PAGE_MASK;
813 fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
814
815 /*
816 * Lock all primary followed pclusters before they become visible to
817 * others; mutex_trylock *never* fails for a new pcluster.
818 */
819 mutex_init(&pcl->lock);
820 DBG_BUGON(!mutex_trylock(&pcl->lock));
821
822 if (ztailpacking) {
823 pcl->obj.index = 0; /* which indicates ztailpacking */
824 pcl->tailpacking_size = map->m_plen;
825 } else {
826 pcl->obj.index = map->m_pa >> PAGE_SHIFT;
827
828 grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
829 if (IS_ERR(grp)) {
830 err = PTR_ERR(grp);
831 goto err_out;
832 }
833
834 if (grp != &pcl->obj) {
835 fe->pcl = container_of(grp,
836 struct z_erofs_pcluster, obj);
837 err = -EEXIST;
838 goto err_out;
839 }
840 }
841 fe->owned_head = &pcl->next;
842 fe->pcl = pcl;
843 return 0;
844
845 err_out:
846 mutex_unlock(&pcl->lock);
847 z_erofs_free_pcluster(pcl);
848 return err;
849 }
850
851 static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
852 {
853 struct erofs_map_blocks *map = &fe->map;
854 struct super_block *sb = fe->inode->i_sb;
855 erofs_blk_t blknr = erofs_blknr(sb, map->m_pa);
856 struct erofs_workgroup *grp = NULL;
857 int ret;
858
859 DBG_BUGON(fe->pcl);
860
861 /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
862 DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
863
864 if (!(map->m_flags & EROFS_MAP_META)) {
865 grp = erofs_find_workgroup(sb, blknr);
866 } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
867 DBG_BUGON(1);
868 return -EFSCORRUPTED;
869 }
870
871 if (grp) {
872 fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
873 ret = -EEXIST;
874 } else {
875 ret = z_erofs_register_pcluster(fe);
876 }
877
878 if (ret == -EEXIST) {
879 mutex_lock(&fe->pcl->lock);
880 z_erofs_try_to_claim_pcluster(fe);
881 } else if (ret) {
882 return ret;
883 }
884
885 z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
886 Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
887 if (!z_erofs_is_inline_pcluster(fe->pcl)) {
888 /* bind cache first when cached decompression is preferred */
889 z_erofs_bind_cache(fe);
890 } else {
891 void *mptr;
892
893 mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP);
894 if (IS_ERR(mptr)) {
895 ret = PTR_ERR(mptr);
896 erofs_err(sb, "failed to get inline data %d", ret);
897 return ret;
898 }
899 get_page(map->buf.page);
900 WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page);
901 fe->pcl->pageofs_in = map->m_pa & ~PAGE_MASK;
902 fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
903 }
904 /* file-backed inplace I/O pages are traversed in reverse order */
905 fe->icur = z_erofs_pclusterpages(fe->pcl);
906 return 0;
907 }
908
909 /*
910 * Keep in mind that pclusters are freed only after an RCU grace
911 * period; no pcluster that is still referenced will be freed.
912 */
913 static void z_erofs_rcu_callback(struct rcu_head *head)
914 {
915 z_erofs_free_pcluster(container_of(head,
916 struct z_erofs_pcluster, rcu));
917 }
918
919 void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
920 {
921 struct z_erofs_pcluster *const pcl =
922 container_of(grp, struct z_erofs_pcluster, obj);
923
924 call_rcu(&pcl->rcu, z_erofs_rcu_callback);
925 }
926
927 static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe)
928 {
929 struct z_erofs_pcluster *pcl = fe->pcl;
930
931 if (!pcl)
932 return;
933
934 z_erofs_bvec_iter_end(&fe->biter);
935 mutex_unlock(&pcl->lock);
936
937 if (fe->candidate_bvpage)
938 fe->candidate_bvpage = NULL;
939
940 /*
941 * Once all pending pages have been added, don't hold the pcluster
942 * reference any longer if the pcluster isn't hosted by ourselves.
943 */
944 if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
945 erofs_workgroup_put(&pcl->obj);
946
947 fe->pcl = NULL;
948 }
949
950 static int z_erofs_read_fragment(struct super_block *sb, struct page *page,
951 unsigned int cur, unsigned int end, erofs_off_t pos)
952 {
953 struct inode *packed_inode = EROFS_SB(sb)->packed_inode;
954 struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
955 unsigned int cnt;
956 u8 *src;
957
958 if (!packed_inode)
959 return -EFSCORRUPTED;
960
961 buf.inode = packed_inode;
962 for (; cur < end; cur += cnt, pos += cnt) {
963 cnt = min_t(unsigned int, end - cur,
964 sb->s_blocksize - erofs_blkoff(sb, pos));
965 src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP);
966 if (IS_ERR(src)) {
967 erofs_put_metabuf(&buf);
968 return PTR_ERR(src);
969 }
970 memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt);
971 }
972 erofs_put_metabuf(&buf);
973 return 0;
974 }
975
976 static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
977 struct page *page)
978 {
979 struct inode *const inode = fe->inode;
980 struct erofs_map_blocks *const map = &fe->map;
981 const loff_t offset = page_offset(page);
982 bool tight = true, exclusive;
983 unsigned int cur, end, len, split;
984 int err = 0;
985
986 z_erofs_onlinepage_init(page);
987
988 split = 0;
989 end = PAGE_SIZE;
990 repeat:
991 if (offset + end - 1 < map->m_la ||
992 offset + end - 1 >= map->m_la + map->m_llen) {
993 z_erofs_pcluster_end(fe);
994 map->m_la = offset + end - 1;
995 map->m_llen = 0;
996 err = z_erofs_map_blocks_iter(inode, map, 0);
997 if (err)
998 goto out;
999 }
1000
1001 cur = offset > map->m_la ? 0 : map->m_la - offset;
1002 /* bump split parts first to avoid several separate cases */
1003 ++split;
1004
1005 if (!(map->m_flags & EROFS_MAP_MAPPED)) {
1006 zero_user_segment(page, cur, end);
1007 tight = false;
1008 goto next_part;
1009 }
1010
1011 if (map->m_flags & EROFS_MAP_FRAGMENT) {
1012 erofs_off_t fpos = offset + cur - map->m_la;
1013
1014 len = min_t(unsigned int, map->m_llen - fpos, end - cur);
1015 err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len,
1016 EROFS_I(inode)->z_fragmentoff + fpos);
1017 if (err)
1018 goto out;
1019 tight = false;
1020 goto next_part;
1021 }
1022
1023 if (!fe->pcl) {
1024 err = z_erofs_pcluster_begin(fe);
1025 if (err)
1026 goto out;
1027 }
1028
1029 /*
1030 * Ensure the current partial page belongs to this submit chain rather
1031 * than other concurrent submit chains or the noio(bypass) chain since
1032 * those chains are handled asynchronously thus the page cannot be used
1033 * for inplace I/O or bvpage (should be processed in a strict order.)
1034 */
1035 tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
1036 exclusive = (!cur && ((split <= 1) || tight));
1037 if (cur)
1038 tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
1039
1040 err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) {
1041 .page = page,
1042 .offset = offset - map->m_la,
1043 .end = end,
1044 }), exclusive);
1045 if (err)
1046 goto out;
1047
1048 z_erofs_onlinepage_split(page);
1049 if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
1050 fe->pcl->multibases = true;
1051 if (fe->pcl->length < offset + end - map->m_la) {
1052 fe->pcl->length = offset + end - map->m_la;
1053 fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
1054 }
1055 if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
1056 !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
1057 fe->pcl->length == map->m_llen)
1058 fe->pcl->partial = false;
1059 next_part:
1060 /* shorten the remaining extent to update progress */
1061 map->m_llen = offset + cur - map->m_la;
1062 map->m_flags &= ~EROFS_MAP_FULL_MAPPED;
1063
1064 end = cur;
1065 if (end > 0)
1066 goto repeat;
1067
1068 out:
1069 z_erofs_onlinepage_end(page, err, false);
1070 return err;
1071 }
1072
1073 static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi,
1074 unsigned int readahead_pages)
1075 {
1076 /* auto: enable for read_folio, disable for readahead */
1077 if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) &&
1078 !readahead_pages)
1079 return true;
1080
1081 if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) &&
1082 (readahead_pages <= sbi->opt.max_sync_decompress_pages))
1083 return true;
1084
1085 return false;
1086 }
1087
1088 static bool z_erofs_page_is_invalidated(struct page *page)
1089 {
1090 return !page->mapping && !z_erofs_is_shortlived_page(page);
1091 }
1092
1093 struct z_erofs_decompress_backend {
1094 struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES];
1095 struct super_block *sb;
1096 struct z_erofs_pcluster *pcl;
1097
1098 /* pages with the longest decompressed length for deduplication */
1099 struct page **decompressed_pages;
1100 /* pages to keep the compressed data */
1101 struct page **compressed_pages;
1102
1103 struct list_head decompressed_secondary_bvecs;
1104 struct page **pagepool;
1105 unsigned int onstack_used, nr_pages;
1106 };
1107
1108 struct z_erofs_bvec_item {
1109 struct z_erofs_bvec bvec;
1110 struct list_head list;
1111 };
1112
1113 static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be,
1114 struct z_erofs_bvec *bvec)
1115 {
1116 struct z_erofs_bvec_item *item;
1117 unsigned int pgnr;
1118
1119 if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK) &&
1120 (bvec->end == PAGE_SIZE ||
1121 bvec->offset + bvec->end == be->pcl->length)) {
1122 pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT;
1123 DBG_BUGON(pgnr >= be->nr_pages);
1124 if (!be->decompressed_pages[pgnr]) {
1125 be->decompressed_pages[pgnr] = bvec->page;
1126 return;
1127 }
1128 }
1129
1130 /* (cold path) one pcluster is requested multiple times */
1131 item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL);
1132 item->bvec = *bvec;
1133 list_add(&item->list, &be->decompressed_secondary_bvecs);
1134 }
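/*
 * Aligned bvecs that cover a whole output page (or reach the end of the
 * decompressed data) become primary decompressed_pages[] entries; the rest
 * are queued as secondary copies and filled via memcpy in
 * z_erofs_fill_other_copies().
 */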
1135
1136 static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
1137 int err)
1138 {
1139 unsigned int off0 = be->pcl->pageofs_out;
1140 struct list_head *p, *n;
1141
1142 list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) {
1143 struct z_erofs_bvec_item *bvi;
1144 unsigned int end, cur;
1145 void *dst, *src;
1146
1147 bvi = container_of(p, struct z_erofs_bvec_item, list);
1148 cur = bvi->bvec.offset < 0 ? -bvi->bvec.offset : 0;
1149 end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset,
1150 bvi->bvec.end);
1151 dst = kmap_local_page(bvi->bvec.page);
1152 while (cur < end) {
1153 unsigned int pgnr, scur, len;
1154
1155 pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT;
1156 DBG_BUGON(pgnr >= be->nr_pages);
1157
1158 scur = bvi->bvec.offset + cur -
1159 ((pgnr << PAGE_SHIFT) - off0);
1160 len = min_t(unsigned int, end - cur, PAGE_SIZE - scur);
1161 if (!be->decompressed_pages[pgnr]) {
1162 err = -EFSCORRUPTED;
1163 cur += len;
1164 continue;
1165 }
1166 src = kmap_local_page(be->decompressed_pages[pgnr]);
1167 memcpy(dst + cur, src + scur, len);
1168 kunmap_local(src);
1169 cur += len;
1170 }
1171 kunmap_local(dst);
1172 z_erofs_onlinepage_end(bvi->bvec.page, err, true);
1173 list_del(p);
1174 kfree(bvi);
1175 }
1176 }
1177
1178 static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be)
1179 {
1180 struct z_erofs_pcluster *pcl = be->pcl;
1181 struct z_erofs_bvec_iter biter;
1182 struct page *old_bvpage;
1183 int i;
1184
1185 z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0);
1186 for (i = 0; i < pcl->vcnt; ++i) {
1187 struct z_erofs_bvec bvec;
1188
1189 z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage);
1190
1191 if (old_bvpage)
1192 z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
1193
1194 DBG_BUGON(z_erofs_page_is_invalidated(bvec.page));
1195 z_erofs_do_decompressed_bvec(be, &bvec);
1196 }
1197
1198 old_bvpage = z_erofs_bvec_iter_end(&biter);
1199 if (old_bvpage)
1200 z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
1201 }
1202
1203 static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
1204 bool *overlapped)
1205 {
1206 struct z_erofs_pcluster *pcl = be->pcl;
1207 unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
1208 int i, err = 0;
1209
1210 *overlapped = false;
1211 for (i = 0; i < pclusterpages; ++i) {
1212 struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i];
1213 struct page *page = bvec->page;
1214
1215 /* compressed pages ought to be present before decompressing */
1216 if (!page) {
1217 DBG_BUGON(1);
1218 continue;
1219 }
1220 be->compressed_pages[i] = page;
1221
1222 if (z_erofs_is_inline_pcluster(pcl)) {
1223 if (!PageUptodate(page))
1224 err = -EIO;
1225 continue;
1226 }
1227
1228 DBG_BUGON(z_erofs_page_is_invalidated(page));
1229 if (!z_erofs_is_shortlived_page(page)) {
1230 if (erofs_page_is_managed(EROFS_SB(be->sb), page)) {
1231 if (!PageUptodate(page))
1232 err = -EIO;
1233 continue;
1234 }
1235 z_erofs_do_decompressed_bvec(be, bvec);
1236 *overlapped = true;
1237 }
1238 }
1239
1240 if (err)
1241 return err;
1242 return 0;
1243 }
1244
1245 static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
1246 int err)
1247 {
1248 struct erofs_sb_info *const sbi = EROFS_SB(be->sb);
1249 struct z_erofs_pcluster *pcl = be->pcl;
1250 unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
1251 const struct z_erofs_decompressor *decompressor =
1252 &erofs_decompressors[pcl->algorithmformat];
1253 unsigned int i, inputsize;
1254 int err2;
1255 struct page *page;
1256 bool overlapped;
1257
1258 mutex_lock(&pcl->lock);
1259 be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;
1260
1261 /* allocate (de)compressed page arrays if cannot be kept on stack */
1262 be->decompressed_pages = NULL;
1263 be->compressed_pages = NULL;
1264 be->onstack_used = 0;
1265 if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) {
1266 be->decompressed_pages = be->onstack_pages;
1267 be->onstack_used = be->nr_pages;
1268 memset(be->decompressed_pages, 0,
1269 sizeof(struct page *) * be->nr_pages);
1270 }
1271
1272 if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES)
1273 be->compressed_pages = be->onstack_pages + be->onstack_used;
1274
1275 if (!be->decompressed_pages)
1276 be->decompressed_pages =
1277 kvcalloc(be->nr_pages, sizeof(struct page *),
1278 GFP_KERNEL | __GFP_NOFAIL);
1279 if (!be->compressed_pages)
1280 be->compressed_pages =
1281 kvcalloc(pclusterpages, sizeof(struct page *),
1282 GFP_KERNEL | __GFP_NOFAIL);
1283
1284 z_erofs_parse_out_bvecs(be);
1285 err2 = z_erofs_parse_in_bvecs(be, &overlapped);
1286 if (err2)
1287 err = err2;
1288 if (err)
1289 goto out;
1290
1291 if (z_erofs_is_inline_pcluster(pcl))
1292 inputsize = pcl->tailpacking_size;
1293 else
1294 inputsize = pclusterpages * PAGE_SIZE;
1295
1296 err = decompressor->decompress(&(struct z_erofs_decompress_req) {
1297 .sb = be->sb,
1298 .in = be->compressed_pages,
1299 .out = be->decompressed_pages,
1300 .pageofs_in = pcl->pageofs_in,
1301 .pageofs_out = pcl->pageofs_out,
1302 .inputsize = inputsize,
1303 .outputsize = pcl->length,
1304 .alg = pcl->algorithmformat,
1305 .inplace_io = overlapped,
1306 .partial_decoding = pcl->partial,
1307 .fillgaps = pcl->multibases,
1308 }, be->pagepool);
1309
1310 out:
1311 /* must handle all compressed pages before actual file pages */
1312 if (z_erofs_is_inline_pcluster(pcl)) {
1313 page = pcl->compressed_bvecs[0].page;
1314 WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
1315 put_page(page);
1316 } else {
1317 for (i = 0; i < pclusterpages; ++i) {
1318 /* consider shortlived pages added when decompressing */
1319 page = be->compressed_pages[i];
1320
1321 if (erofs_page_is_managed(sbi, page))
1322 continue;
1323 (void)z_erofs_put_shortlivedpage(be->pagepool, page);
1324 WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
1325 }
1326 }
1327 if (be->compressed_pages < be->onstack_pages ||
1328 be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES)
1329 kvfree(be->compressed_pages);
1330 z_erofs_fill_other_copies(be, err);
1331
1332 for (i = 0; i < be->nr_pages; ++i) {
1333 page = be->decompressed_pages[i];
1334 if (!page)
1335 continue;
1336
1337 DBG_BUGON(z_erofs_page_is_invalidated(page));
1338
1339 /* recycle all individual short-lived pages */
1340 if (z_erofs_put_shortlivedpage(be->pagepool, page))
1341 continue;
1342 z_erofs_onlinepage_end(page, err, true);
1343 }
1344
1345 if (be->decompressed_pages != be->onstack_pages)
1346 kvfree(be->decompressed_pages);
1347
1348 pcl->length = 0;
1349 pcl->partial = true;
1350 pcl->multibases = false;
1351 pcl->bvset.nextpage = NULL;
1352 pcl->vcnt = 0;
1353
1354 /* pcluster lock MUST be taken before the following line */
1355 WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
1356 mutex_unlock(&pcl->lock);
1357 return err;
1358 }
1359
1360 static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
1361 struct page **pagepool)
1362 {
1363 struct z_erofs_decompress_backend be = {
1364 .sb = io->sb,
1365 .pagepool = pagepool,
1366 .decompressed_secondary_bvecs =
1367 LIST_HEAD_INIT(be.decompressed_secondary_bvecs),
1368 };
1369 z_erofs_next_pcluster_t owned = io->head;
1370
1371 while (owned != Z_EROFS_PCLUSTER_TAIL) {
1372 DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
1373
1374 be.pcl = container_of(owned, struct z_erofs_pcluster, next);
1375 owned = READ_ONCE(be.pcl->next);
1376
1377 z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0);
1378 if (z_erofs_is_inline_pcluster(be.pcl))
1379 z_erofs_free_pcluster(be.pcl);
1380 else
1381 erofs_workgroup_put(&be.pcl->obj);
1382 }
1383 }
1384
1385 static void z_erofs_decompressqueue_work(struct work_struct *work)
1386 {
1387 struct z_erofs_decompressqueue *bgq =
1388 container_of(work, struct z_erofs_decompressqueue, u.work);
1389 struct page *pagepool = NULL;
1390
1391 DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL);
1392 z_erofs_decompress_queue(bgq, &pagepool);
1393 erofs_release_pages(&pagepool);
1394 kvfree(bgq);
1395 }
1396
1397 #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
1398 static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work)
1399 {
1400 z_erofs_decompressqueue_work((struct work_struct *)work);
1401 }
1402 #endif
1403
1404 static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
1405 int bios)
1406 {
1407 struct erofs_sb_info *const sbi = EROFS_SB(io->sb);
1408
1409 /* wake up the caller thread for sync decompression */
1410 if (io->sync) {
1411 if (!atomic_add_return(bios, &io->pending_bios))
1412 complete(&io->u.done);
1413 return;
1414 }
1415
1416 if (atomic_add_return(bios, &io->pending_bios))
1417 return;
1418 /* Use (kthread_)work and sync decompression for atomic contexts only */
1419 if (!in_task() || irqs_disabled() || rcu_read_lock_any_held()) {
1420 #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
1421 struct kthread_worker *worker;
1422
1423 rcu_read_lock();
1424 worker = rcu_dereference(
1425 z_erofs_pcpu_workers[raw_smp_processor_id()]);
1426 if (!worker) {
1427 INIT_WORK(&io->u.work, z_erofs_decompressqueue_work);
1428 queue_work(z_erofs_workqueue, &io->u.work);
1429 } else {
1430 kthread_queue_work(worker, &io->u.kthread_work);
1431 }
1432 rcu_read_unlock();
1433 #else
1434 queue_work(z_erofs_workqueue, &io->u.work);
1435 #endif
1436 /* enable sync decompression for readahead */
1437 if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO)
1438 sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON;
1439 return;
1440 }
1441 z_erofs_decompressqueue_work(&io->u.work);
1442 }
1443
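/*
 * Pick the page used to read compressed data at slot @nr: reuse a cached
 * or in-place page when possible, otherwise allocate one (optionally
 * attaching it to the managed cache); returns NULL if the cached page is
 * already up to date and no I/O is needed for this slot.
 */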
1444 static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
1445 unsigned int nr,
1446 struct page **pagepool,
1447 struct address_space *mc)
1448 {
1449 const pgoff_t index = pcl->obj.index;
1450 gfp_t gfp = mapping_gfp_mask(mc);
1451 bool tocache = false;
1452
1453 struct address_space *mapping;
1454 struct page *oldpage, *page;
1455 int justfound;
1456
1457 repeat:
1458 page = READ_ONCE(pcl->compressed_bvecs[nr].page);
1459 oldpage = page;
1460
1461 if (!page)
1462 goto out_allocpage;
1463
1464 justfound = (unsigned long)page & 1UL;
1465 page = (struct page *)((unsigned long)page & ~1UL);
1466
1467 /*
1468 * preallocated cached pages, which are used to avoid direct reclaim;
1469 * otherwise, the in-place I/O path will be taken instead.
1470 */
1471 if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
1472 WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
1473 set_page_private(page, 0);
1474 tocache = true;
1475 goto out_tocache;
1476 }
1477 mapping = READ_ONCE(page->mapping);
1478
1479 /*
1480 * file-backed online pages in the pcluster are all steadily locked,
1481 * therefore it is impossible for `mapping' to be NULL.
1482 */
1483 if (mapping && mapping != mc)
1484 /* ought to be unmanaged pages */
1485 goto out;
1486
1487 /* directly return for shortlived page as well */
1488 if (z_erofs_is_shortlived_page(page))
1489 goto out;
1490
1491 lock_page(page);
1492 if (likely(page->mapping == mc)) {
1493 WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
1494
1495 /*
1496 * The cached folio is still in managed cache but without
1497 * a valid `->private` pcluster hint. Let's reconnect them.
1498 */
1499 if (!PagePrivate(page)) {
1500 /*
1501 * It's impossible for the page to be
1502 * !PagePrivate(page) under the current restriction
1503 * if it is already in compressed_bvecs[].
1504 */
1505 DBG_BUGON(!justfound);
1506
1507 justfound = 0;
1508 set_page_private(page, (unsigned long)pcl);
1509 SetPagePrivate(page);
1510 }
1511
1512 if (likely(page->private == (unsigned long)pcl)) {
1513 /* don't submit cache I/Os again if already uptodate */
1514 if (PageUptodate(page)) {
1515 unlock_page(page);
1516 page = NULL;
1517
1518 }
1519 goto out;
1520 }
1521 /*
1522 * Already linked with another pcluster, which only appears in
1523 * crafted images by fuzzers for now. But handle this anyway.
1524 */
1525 tocache = false; /* use temporary short-lived pages */
1526 } else {
1527 DBG_BUGON(1); /* referenced managed folios can't be truncated */
1528 tocache = true;
1529 }
1530 unlock_page(page);
1531 put_page(page);
1532 out_allocpage:
1533 page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
1534 if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page,
1535 oldpage, page)) {
1536 erofs_pagepool_add(pagepool, page);
1537 cond_resched();
1538 goto repeat;
1539 }
1540 out_tocache:
1541 if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) {
1542 /* turn into temporary page if fails (1 ref) */
1543 set_page_private(page, Z_EROFS_SHORTLIVED_PAGE);
1544 goto out;
1545 }
1546 attach_page_private(page, pcl);
1547 /* drop a refcount added by allocpage (then we have 2 refs here) */
1548 put_page(page);
1549
1550 out: /* the only exit (for tracing and debugging) */
1551 return page;
1552 }
1553
1554 static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb,
1555 struct z_erofs_decompressqueue *fgq, bool *fg)
1556 {
1557 struct z_erofs_decompressqueue *q;
1558
1559 if (fg && !*fg) {
1560 q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN);
1561 if (!q) {
1562 *fg = true;
1563 goto fg_out;
1564 }
1565 #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD
1566 kthread_init_work(&q->u.kthread_work,
1567 z_erofs_decompressqueue_kthread_work);
1568 #else
1569 INIT_WORK(&q->u.work, z_erofs_decompressqueue_work);
1570 #endif
1571 } else {
1572 fg_out:
1573 q = fgq;
1574 init_completion(&fgq->u.done);
1575 atomic_set(&fgq->pending_bios, 0);
1576 q->eio = false;
1577 q->sync = true;
1578 }
1579 q->sb = sb;
1580 q->head = Z_EROFS_PCLUSTER_TAIL;
1581 return q;
1582 }
1583
1584 /* define decompression jobqueue types */
1585 enum {
1586 JQ_BYPASS,
1587 JQ_SUBMIT,
1588 NR_JOBQUEUES,
1589 };
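/*
 * JQ_BYPASS holds pclusters that need no I/O submission (inline pclusters or
 * ones fully backed by up-to-date cached pages), while JQ_SUBMIT holds
 * pclusters that still require bio submission; see z_erofs_submit_queue().
 */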
1590
1591 static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
1592 z_erofs_next_pcluster_t qtail[],
1593 z_erofs_next_pcluster_t owned_head)
1594 {
1595 z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
1596 z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];
1597
1598 WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL);
1599
1600 WRITE_ONCE(*submit_qtail, owned_head);
1601 WRITE_ONCE(*bypass_qtail, &pcl->next);
1602
1603 qtail[JQ_BYPASS] = &pcl->next;
1604 }
1605
1606 static void z_erofs_decompressqueue_endio(struct bio *bio)
1607 {
1608 struct z_erofs_decompressqueue *q = bio->bi_private;
1609 blk_status_t err = bio->bi_status;
1610 struct bio_vec *bvec;
1611 struct bvec_iter_all iter_all;
1612
1613 bio_for_each_segment_all(bvec, bio, iter_all) {
1614 struct page *page = bvec->bv_page;
1615
1616 DBG_BUGON(PageUptodate(page));
1617 DBG_BUGON(z_erofs_page_is_invalidated(page));
1618
1619 if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
1620 if (!err)
1621 SetPageUptodate(page);
1622 unlock_page(page);
1623 }
1624 }
1625 if (err)
1626 q->eio = true;
1627 z_erofs_decompress_kickoff(q, -1);
1628 bio_put(bio);
1629 }
1630
1631 static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
1632 struct z_erofs_decompressqueue *fgq,
1633 bool *force_fg, bool readahead)
1634 {
1635 struct super_block *sb = f->inode->i_sb;
1636 struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb));
1637 z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
1638 struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
1639 z_erofs_next_pcluster_t owned_head = f->owned_head;
1640 /* bio is NULL initially, so no need to initialize last_{index,bdev} */
1641 pgoff_t last_index;
1642 struct block_device *last_bdev;
1643 unsigned int nr_bios = 0;
1644 struct bio *bio = NULL;
1645 unsigned long pflags;
1646 int memstall = 0;
1647
1648 /*
1649 * If managed cache is enabled, a bypass jobqueue is needed:
1650 * there is no need to read from the device for pclusters in that queue.
1651 */
1652 q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
1653 q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg);
1654
1655 qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
1656 qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;
1657
1658 /* by default, all need io submission */
1659 q[JQ_SUBMIT]->head = owned_head;
1660
1661 do {
1662 struct erofs_map_dev mdev;
1663 struct z_erofs_pcluster *pcl;
1664 pgoff_t cur, end;
1665 unsigned int i = 0;
1666 bool bypass = true;
1667
1668 DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
1669 pcl = container_of(owned_head, struct z_erofs_pcluster, next);
1670 owned_head = READ_ONCE(pcl->next);
1671
1672 if (z_erofs_is_inline_pcluster(pcl)) {
1673 move_to_bypass_jobqueue(pcl, qtail, owned_head);
1674 continue;
1675 }
1676
1677 /* no device id here, thus it will always succeed */
1678 mdev = (struct erofs_map_dev) {
1679 .m_pa = erofs_pos(sb, pcl->obj.index),
1680 };
1681 (void)erofs_map_dev(sb, &mdev);
1682
1683 cur = erofs_blknr(sb, mdev.m_pa);
1684 end = cur + pcl->pclusterpages;
1685
1686 do {
1687 struct page *page = NULL;
1688
1689 if (bio && (cur != last_index + 1 ||
1690 last_bdev != mdev.m_bdev)) {
1691 drain_io:
1692 submit_bio(bio);
1693 if (memstall) {
1694 psi_memstall_leave(&pflags);
1695 memstall = 0;
1696 }
1697 bio = NULL;
1698 }
1699
1700 if (!page) {
1701 page = pickup_page_for_submission(pcl, i++,
1702 &f->pagepool, mc);
1703 if (!page)
1704 continue;
1705 }
1706
1707 if (unlikely(PageWorkingset(page)) && !memstall) {
1708 psi_memstall_enter(&pflags);
1709 memstall = 1;
1710 }
1711
1712 if (!bio) {
1713 bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
1714 REQ_OP_READ, GFP_NOIO);
1715 bio->bi_end_io = z_erofs_decompressqueue_endio;
1716
1717 last_bdev = mdev.m_bdev;
1718 bio->bi_iter.bi_sector = (sector_t)cur <<
1719 (sb->s_blocksize_bits - 9);
1720 bio->bi_private = q[JQ_SUBMIT];
1721 if (readahead)
1722 bio->bi_opf |= REQ_RAHEAD;
1723 ++nr_bios;
1724 }
1725
1726 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
1727 goto drain_io;
1728
1729 last_index = cur;
1730 bypass = false;
1731 } while (++cur < end);
1732
1733 if (!bypass)
1734 qtail[JQ_SUBMIT] = &pcl->next;
1735 else
1736 move_to_bypass_jobqueue(pcl, qtail, owned_head);
1737 } while (owned_head != Z_EROFS_PCLUSTER_TAIL);
1738
1739 if (bio)
1740 submit_bio(bio);
1741 if (memstall)
1742 psi_memstall_leave(&pflags);
1743
1744 /*
1745 * Although background mode is preferred, nothing is pending for submission,
1746 * so don't issue decompression; just drop the queue directly instead.
1747 */
1748 if (!*force_fg && !nr_bios) {
1749 kvfree(q[JQ_SUBMIT]);
1750 return;
1751 }
1752 z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios);
1753 }
1754
1755 static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
1756 bool force_fg, bool ra)
1757 {
1758 struct z_erofs_decompressqueue io[NR_JOBQUEUES];
1759
1760 if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
1761 return;
1762 z_erofs_submit_queue(f, io, &force_fg, ra);
1763
1764 /* handle bypass queue (no i/o pclusters) immediately */
1765 z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool);
1766
1767 if (!force_fg)
1768 return;
1769
1770 /* wait until all bios are completed */
1771 wait_for_completion_io(&io[JQ_SUBMIT].u.done);
1772
1773 /* handle synchronous decompress queue in the caller context */
1774 z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool);
1775 }
1776
1777 /*
1778 * Since partial uptodate is still unimplemented for now, we have to use
1779 * approximate readmore strategies as a start.
1780 */
1781 static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
1782 struct readahead_control *rac, bool backmost)
1783 {
1784 struct inode *inode = f->inode;
1785 struct erofs_map_blocks *map = &f->map;
1786 erofs_off_t cur, end, headoffset = f->headoffset;
1787 int err;
1788
1789 if (backmost) {
1790 if (rac)
1791 end = headoffset + readahead_length(rac) - 1;
1792 else
1793 end = headoffset + PAGE_SIZE - 1;
1794 map->m_la = end;
1795 err = z_erofs_map_blocks_iter(inode, map,
1796 EROFS_GET_BLOCKS_READMORE);
1797 if (err)
1798 return;
1799
1800 /* expand ra for the trailing edge if readahead */
1801 if (rac) {
1802 cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
1803 readahead_expand(rac, headoffset, cur - headoffset);
1804 return;
1805 }
1806 end = round_up(end, PAGE_SIZE);
1807 } else {
1808 end = round_up(map->m_la, PAGE_SIZE);
1809
1810 if (!map->m_llen)
1811 return;
1812 }
1813
1814 cur = map->m_la + map->m_llen - 1;
1815 while ((cur >= end) && (cur < i_size_read(inode))) {
1816 pgoff_t index = cur >> PAGE_SHIFT;
1817 struct page *page;
1818
1819 page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
1820 if (page) {
1821 if (PageUptodate(page))
1822 unlock_page(page);
1823 else
1824 (void)z_erofs_do_read_page(f, page);
1825 put_page(page);
1826 }
1827
1828 if (cur < PAGE_SIZE)
1829 break;
1830 cur = (index << PAGE_SHIFT) - 1;
1831 }
1832 }
1833
1834 static int z_erofs_read_folio(struct file *file, struct folio *folio)
1835 {
1836 struct inode *const inode = folio->mapping->host;
1837 struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
1838 struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
1839 int err;
1840
1841 trace_erofs_read_folio(folio, false);
1842 f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
1843
1844 z_erofs_pcluster_readmore(&f, NULL, true);
1845 err = z_erofs_do_read_page(&f, &folio->page);
1846 z_erofs_pcluster_readmore(&f, NULL, false);
1847 z_erofs_pcluster_end(&f);
1848
1849 /* if some compressed clusters are ready, we need to submit them anyway */
1850 z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false);
1851
1852 if (err && err != -EINTR)
1853 erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu",
1854 err, folio->index, EROFS_I(inode)->nid);
1855
1856 erofs_put_metabuf(&f.map.buf);
1857 erofs_release_pages(&f.pagepool);
1858 return err;
1859 }
1860
1861 static void z_erofs_readahead(struct readahead_control *rac)
1862 {
1863 struct inode *const inode = rac->mapping->host;
1864 struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
1865 struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
1866 struct folio *head = NULL, *folio;
1867 unsigned int nr_folios;
1868 int err;
1869
1870 f.headoffset = readahead_pos(rac);
1871
1872 z_erofs_pcluster_readmore(&f, rac, true);
1873 nr_folios = readahead_count(rac);
1874 trace_erofs_readpages(inode, readahead_index(rac), nr_folios, false);
1875
1876 while ((folio = readahead_folio(rac))) {
1877 folio->private = head;
1878 head = folio;
1879 }
1880
1881 /* traverse in reverse order for best metadata I/O performance */
1882 while (head) {
1883 folio = head;
1884 head = folio_get_private(folio);
1885
1886 err = z_erofs_do_read_page(&f, &folio->page);
1887 if (err && err != -EINTR)
1888 erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
1889 folio->index, EROFS_I(inode)->nid);
1890 }
1891 z_erofs_pcluster_readmore(&f, rac, false);
1892 z_erofs_pcluster_end(&f);
1893
1894 z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true);
1895 erofs_put_metabuf(&f.map.buf);
1896 erofs_release_pages(&f.pagepool);
1897 }
1898
1899 const struct address_space_operations z_erofs_aops = {
1900 .read_folio = z_erofs_read_folio,
1901 .readahead = z_erofs_readahead,
1902 };
1903