// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle. All rights reserved.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/psi.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/log2.h>
#include <crypto/hash.h>
#include "misc.h"
#include "ctree.h"
#include "fs.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "bio.h"
#include "ordered-data.h"
#include "compression.h"
#include "extent_io.h"
#include "extent_map.h"
#include "subpage.h"
#include "zoned.h"
#include "file-item.h"
#include "super.h"

static struct bio_set btrfs_compressed_bioset;

static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };

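/*
 * Return the user visible name of a compression type ("zlib", "lzo", "zstd",
 * or "" for none), or NULL if the type is not known.
 */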
const char* btrfs_compress_type2str(enum btrfs_compression_type type)
{
	switch (type) {
	case BTRFS_COMPRESS_ZLIB:
	case BTRFS_COMPRESS_LZO:
	case BTRFS_COMPRESS_ZSTD:
	case BTRFS_COMPRESS_NONE:
		return btrfs_compress_types[type];
	default:
		break;
	}

	return NULL;
}

static inline struct compressed_bio *to_compressed_bio(struct btrfs_bio *bbio)
{
	return container_of(bbio, struct compressed_bio, bbio);
}

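/*
 * Allocate a compressed_bio for I/O against @inode starting at file offset
 * @start.  The embedded bio is backed by btrfs_compressed_bioset and sized
 * for BTRFS_MAX_COMPRESSED_PAGES, enough for the largest compressed extent.
 */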
static struct compressed_bio *alloc_compressed_bio(struct btrfs_inode *inode,
						   u64 start, blk_opf_t op,
						   btrfs_bio_end_io_t end_io)
{
	struct btrfs_bio *bbio;

	bbio = btrfs_bio(bio_alloc_bioset(NULL, BTRFS_MAX_COMPRESSED_PAGES, op,
					  GFP_NOFS, &btrfs_compressed_bioset));
	btrfs_bio_init(bbio, inode->root->fs_info, end_io, NULL);
	bbio->inode = inode;
	bbio->file_offset = start;
	return to_compressed_bio(bbio);
}

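/*
 * Check if @str (@len bytes, not necessarily NUL terminated) starts with the
 * name of a supported compression algorithm.  Only the prefix is compared,
 * so strings carrying a level such as "zstd:3" are accepted as well.
 */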
bool btrfs_compress_is_valid_type(const char *str, size_t len)
{
	int i;

	for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) {
		size_t comp_len = strlen(btrfs_compress_types[i]);

		if (len < comp_len)
			continue;

		if (!strncmp(btrfs_compress_types[i], str, comp_len))
			return true;
	}
	return false;
}

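/*
 * Type based dispatch to the per-algorithm compression implementation.
 */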
static int compression_compress_pages(int type, struct list_head *ws,
		struct address_space *mapping, u64 start, struct page **pages,
		unsigned long *out_pages, unsigned long *total_in,
		unsigned long *total_out)
{
	switch (type) {
	case BTRFS_COMPRESS_ZLIB:
		return zlib_compress_pages(ws, mapping, start, pages,
				out_pages, total_in, total_out);
	case BTRFS_COMPRESS_LZO:
		return lzo_compress_pages(ws, mapping, start, pages,
				out_pages, total_in, total_out);
	case BTRFS_COMPRESS_ZSTD:
		return zstd_compress_pages(ws, mapping, start, pages,
				out_pages, total_in, total_out);
	case BTRFS_COMPRESS_NONE:
	default:
		/*
		 * This can happen when compression races with a remount that
		 * sets 'no compress', while the caller doesn't call
		 * inode_need_compress() to check whether we really need to
		 * compress.
		 *
		 * Not a big deal, we just need to inform the caller that we
		 * haven't allocated any pages yet.
		 */
		*out_pages = 0;
		return -E2BIG;
	}
}

static int compression_decompress_bio(struct list_head *ws,
				      struct compressed_bio *cb)
{
	switch (cb->compress_type) {
	case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb);
	case BTRFS_COMPRESS_LZO: return lzo_decompress_bio(ws, cb);
	case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb);
	case BTRFS_COMPRESS_NONE:
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

static int compression_decompress(int type, struct list_head *ws,
		const u8 *data_in, struct page *dest_page,
		unsigned long dest_pgoff, size_t srclen, size_t destlen)
{
	switch (type) {
	case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
						dest_pgoff, srclen, destlen);
	case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page,
						dest_pgoff, srclen, destlen);
	case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
						dest_pgoff, srclen, destlen);
	case BTRFS_COMPRESS_NONE:
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

static void btrfs_free_compressed_pages(struct compressed_bio *cb)
{
	for (unsigned int i = 0; i < cb->nr_pages; i++)
		put_page(cb->compressed_pages[i]);
	kfree(cb->compressed_pages);
}

static int btrfs_decompress_bio(struct compressed_bio *cb);

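/*
 * Read end io handler: decompress into the pages of the original bio, drop
 * the compressed pages and complete the original bio with the result.
 */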
static void end_compressed_bio_read(struct btrfs_bio *bbio)
{
	struct compressed_bio *cb = to_compressed_bio(bbio);
	blk_status_t status = bbio->bio.bi_status;

	if (!status)
		status = errno_to_blk_status(btrfs_decompress_bio(cb));

	btrfs_free_compressed_pages(cb);
	btrfs_bio_end_io(cb->orig_bbio, status);
	bio_put(&bbio->bio);
}

/*
 * Clear the writeback bits on all of the file
 * pages for a compressed write
 */
static noinline void end_compressed_writeback(const struct compressed_bio *cb)
{
	struct inode *inode = &cb->bbio.inode->vfs_inode;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	unsigned long index = cb->start >> PAGE_SHIFT;
	unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
	struct folio_batch fbatch;
	const int errno = blk_status_to_errno(cb->bbio.bio.bi_status);
	int i;
	int ret;

	if (errno)
		mapping_set_error(inode->i_mapping, errno);

	folio_batch_init(&fbatch);
	while (index <= end_index) {
		ret = filemap_get_folios(inode->i_mapping, &index, end_index,
				&fbatch);

		if (ret == 0)
			return;

		for (i = 0; i < ret; i++) {
			struct folio *folio = fbatch.folios[i];

			btrfs_page_clamp_clear_writeback(fs_info, &folio->page,
							 cb->start, cb->len);
		}
		folio_batch_release(&fbatch);
	}
	/* the inode may be gone now */
}

static void btrfs_finish_compressed_write_work(struct work_struct *work)
{
	struct compressed_bio *cb =
		container_of(work, struct compressed_bio, write_end_work);

	btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len,
				    cb->bbio.bio.bi_status == BLK_STS_OK);

	if (cb->writeback)
		end_compressed_writeback(cb);
	/* Note, our inode could be gone now */

	btrfs_free_compressed_pages(cb);
	bio_put(&cb->bbio.bio);
}

/*
 * Do the cleanup once all the compressed pages hit the disk. This will clear
 * writeback on the file pages and free the compressed pages.
 *
 * This also calls the writeback end hooks for the file pages so that metadata
 * and checksums can be updated in the file.
 */
static void end_compressed_bio_write(struct btrfs_bio *bbio)
{
	struct compressed_bio *cb = to_compressed_bio(bbio);
	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;

	queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
}

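/*
 * Attach all compressed pages to the bio embedded in @cb, a full page (or
 * the remaining tail of the compressed data) at a time.
 */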
static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb)
{
	struct bio *bio = &cb->bbio.bio;
	u32 offset = 0;

	while (offset < cb->compressed_len) {
		u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE);

		/* Maximum compressed extent is smaller than bio size limit. */
		__bio_add_page(bio, cb->compressed_pages[offset >> PAGE_SHIFT],
			       len, 0);
		offset += len;
	}
}

/*
 * Worker function to build and submit bios for previously compressed pages.
 * The corresponding pages in the inode should be marked for writeback
 * and the compressed pages should have a reference on them for dropping
 * when the IO is complete.
 *
 * This also checksums the file bytes and gets things ready for
 * the end io hooks.
 */
void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
				   struct page **compressed_pages,
				   unsigned int nr_pages,
				   blk_opf_t write_flags,
				   bool writeback)
{
	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct compressed_bio *cb;

	ASSERT(IS_ALIGNED(ordered->file_offset, fs_info->sectorsize));
	ASSERT(IS_ALIGNED(ordered->num_bytes, fs_info->sectorsize));

	cb = alloc_compressed_bio(inode, ordered->file_offset,
				  REQ_OP_WRITE | write_flags,
				  end_compressed_bio_write);
	cb->start = ordered->file_offset;
	cb->len = ordered->num_bytes;
	cb->compressed_pages = compressed_pages;
	cb->compressed_len = ordered->disk_num_bytes;
	cb->writeback = writeback;
	INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
	cb->nr_pages = nr_pages;
	cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT;
	cb->bbio.ordered = ordered;
	btrfs_add_compressed_bio_pages(cb);

	btrfs_submit_bio(&cb->bbio, 0);
}


/*
 * Add extra pages in the same compressed file extent so that we don't need to
 * re-read the same extent again and again.
 *
 * NOTE: this won't work well for subpage, as for subpage read, we lock the
 * full page then submit bio for each compressed/regular extents.
 *
 * This means that if several sectors in the same page point to the same
 * on-disk compressed data, we will re-read the same extent many times, and
 * this function can only help for the next page.
 */
static noinline int add_ra_bio_pages(struct inode *inode,
				     u64 compressed_end,
				     struct compressed_bio *cb,
				     int *memstall, unsigned long *pflags)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	unsigned long end_index;
	struct bio *orig_bio = &cb->orig_bbio->bio;
	u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size;
	u64 isize = i_size_read(inode);
	int ret;
	struct page *page;
	struct extent_map *em;
	struct address_space *mapping = inode->i_mapping;
	struct extent_map_tree *em_tree;
	struct extent_io_tree *tree;
	int sectors_missed = 0;

	em_tree = &BTRFS_I(inode)->extent_tree;
	tree = &BTRFS_I(inode)->io_tree;

	if (isize == 0)
		return 0;

	/*
	 * For current subpage support, we only support 64K page size,
	 * which means maximum compressed extent size (128K) is just 2x page
	 * size.
	 * This makes readahead less effective, so here disable readahead for
	 * subpage for now, until full compressed write is supported.
	 */
	if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE)
		return 0;

	end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	while (cur < compressed_end) {
		u64 page_end;
		u64 pg_index = cur >> PAGE_SHIFT;
		u32 add_size;

		if (pg_index > end_index)
			break;

		page = xa_load(&mapping->i_pages, pg_index);
		if (page && !xa_is_value(page)) {
			sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >>
					  fs_info->sectorsize_bits;

			/* Beyond threshold, no need to continue */
			if (sectors_missed > 4)
				break;

			/*
			 * Jump to next page start as we already have page for
			 * current offset.
			 */
			cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
			continue;
		}

		page = __page_cache_alloc(mapping_gfp_constraint(mapping,
								 ~__GFP_FS));
		if (!page)
			break;

		if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
			put_page(page);
			/* There is already a page, skip to page end */
			cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
			continue;
		}

		if (!*memstall && PageWorkingset(page)) {
			psi_memstall_enter(pflags);
			*memstall = 1;
		}

		ret = set_page_extent_mapped(page);
		if (ret < 0) {
			unlock_page(page);
			put_page(page);
			break;
		}

		page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
		lock_extent(tree, cur, page_end, NULL);
		read_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
		read_unlock(&em_tree->lock);

		/*
		 * At this point, we have a locked page in the page cache for
		 * these bytes in the file.  But, we have to make sure they map
		 * to this compressed extent on disk.
		 */
		if (!em || cur < em->start ||
		    (cur + fs_info->sectorsize > extent_map_end(em)) ||
		    (em->block_start >> SECTOR_SHIFT) != orig_bio->bi_iter.bi_sector) {
			free_extent_map(em);
			unlock_extent(tree, cur, page_end, NULL);
			unlock_page(page);
			put_page(page);
			break;
		}
		add_size = min(em->start + em->len, page_end + 1) - cur;
		free_extent_map(em);

		if (page->index == end_index) {
			size_t zero_offset = offset_in_page(isize);

			if (zero_offset) {
				int zeros;
				zeros = PAGE_SIZE - zero_offset;
				memzero_page(page, zero_offset, zeros);
			}
		}

		ret = bio_add_page(orig_bio, page, add_size, offset_in_page(cur));
		if (ret != add_size) {
			unlock_extent(tree, cur, page_end, NULL);
			unlock_page(page);
			put_page(page);
			break;
		}
		/*
		 * If it's subpage, we also need to increase its
		 * subpage::readers number, as at endio we will decrease
		 * subpage::readers and unlock the page.
		 */
		if (fs_info->sectorsize < PAGE_SIZE)
			btrfs_subpage_start_reader(fs_info, page, cur, add_size);
		put_page(page);
		cur += add_size;
	}
	return 0;
}

/*
 * For a compressed read, the bio we get passed has all the inode pages
 * in it.  We don't actually do IO on those pages but allocate new ones
 * to hold the compressed pages on disk.
 *
 * bio->bi_iter.bi_sector points to the compressed extent on disk
 * bio->bi_io_vec points to all of the inode pages
 *
 * After the compressed pages are read, we copy the bytes into the
 * bio we were passed and then call the original bio's end_io hook.
 */
void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
{
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_map_tree *em_tree = &inode->extent_tree;
	struct compressed_bio *cb;
	unsigned int compressed_len;
	u64 file_offset = bbio->file_offset;
	u64 em_len;
	u64 em_start;
	struct extent_map *em;
	unsigned long pflags;
	int memstall = 0;
	blk_status_t ret;
	int ret2;

	/* we need the actual starting offset of this extent in the file */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
	read_unlock(&em_tree->lock);
	if (!em) {
		ret = BLK_STS_IOERR;
		goto out;
	}

	ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
	compressed_len = em->block_len;

	cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ,
				  end_compressed_bio_read);

	cb->start = em->orig_start;
	em_len = em->len;
	em_start = em->start;

	cb->len = bbio->bio.bi_iter.bi_size;
	cb->compressed_len = compressed_len;
	cb->compress_type = em->compress_type;
	cb->orig_bbio = bbio;

	free_extent_map(em);

	cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
	cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS);
	if (!cb->compressed_pages) {
		ret = BLK_STS_RESOURCE;
		goto out_free_bio;
	}

	ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages);
	if (ret2) {
		ret = BLK_STS_RESOURCE;
		goto out_free_compressed_pages;
	}

	add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall,
			 &pflags);

	/* include any pages we added in add_ra_bio_pages */
	cb->len = bbio->bio.bi_iter.bi_size;
	cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector;
	btrfs_add_compressed_bio_pages(cb);

	if (memstall)
		psi_memstall_leave(&pflags);

	btrfs_submit_bio(&cb->bbio, 0);
	return;

out_free_compressed_pages:
	kfree(cb->compressed_pages);
out_free_bio:
	bio_put(&cb->bbio.bio);
out:
	btrfs_bio_end_io(bbio, ret);
}

/*
 * Heuristic uses systematic sampling to collect data from the input data
 * range, the logic can be tuned by the following constants:
 *
 * @SAMPLING_READ_SIZE - how many bytes will be copied for each sample
 * @SAMPLING_INTERVAL  - distance between the starting offsets of consecutive
 *			 samples
 */
#define SAMPLING_READ_SIZE	(16)
#define SAMPLING_INTERVAL	(256)

/*
 * For statistical analysis of the input data we consider bytes that form a
 * Galois Field of 256 objects. Each object has an attribute count, ie. how
 * many times the object appeared in the sample.
 */
#define BUCKET_SIZE		(256)

/*
 * The size of the sample is based on a statistical sampling rule of thumb.
 * The common way is to perform sampling tests as long as the number of
 * elements in each cell is at least 5.
 *
 * Instead of 5, we choose 32 to obtain more accurate results.
 * If the data contain the maximum number of symbols, which is 256, we obtain a
 * sample size bound by 8192.
 *
 * For a sample of at most 8KB of data per data range: 16 consecutive bytes
 * from up to 512 locations.
 */
#define MAX_SAMPLE_SIZE		(BTRFS_MAX_UNCOMPRESSED * \
				 SAMPLING_READ_SIZE / SAMPLING_INTERVAL)
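/*
 * Worked out for the current constants: with BTRFS_MAX_UNCOMPRESSED of 128K,
 * 128K / SAMPLING_INTERVAL = 512 sample positions and
 * 512 * SAMPLING_READ_SIZE = 8192 sampled bytes, matching the 8KB bound above.
 */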

struct bucket_item {
	u32 count;
};

struct heuristic_ws {
	/* Partial copy of input data */
	u8 *sample;
	u32 sample_size;
	/* Buckets store counters for each byte value */
	struct bucket_item *bucket;
	/* Sorting buffer */
	struct bucket_item *bucket_b;
	struct list_head list;
};

static struct workspace_manager heuristic_wsm;

static void free_heuristic_ws(struct list_head *ws)
{
	struct heuristic_ws *workspace;

	workspace = list_entry(ws, struct heuristic_ws, list);

	kvfree(workspace->sample);
	kfree(workspace->bucket);
	kfree(workspace->bucket_b);
	kfree(workspace);
}

static struct list_head *alloc_heuristic_ws(unsigned int level)
{
	struct heuristic_ws *ws;

	ws = kzalloc(sizeof(*ws), GFP_KERNEL);
	if (!ws)
		return ERR_PTR(-ENOMEM);

	ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL);
	if (!ws->sample)
		goto fail;

	ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL);
	if (!ws->bucket)
		goto fail;

	ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL);
	if (!ws->bucket_b)
		goto fail;

	INIT_LIST_HEAD(&ws->list);
	return &ws->list;
fail:
	free_heuristic_ws(&ws->list);
	return ERR_PTR(-ENOMEM);
}

const struct btrfs_compress_op btrfs_heuristic_compress = {
	.workspace_manager = &heuristic_wsm,
};

static const struct btrfs_compress_op * const btrfs_compress_op[] = {
	/* The heuristic is represented as compression type 0 */
	&btrfs_heuristic_compress,
	&btrfs_zlib_compress,
	&btrfs_lzo_compress,
	&btrfs_zstd_compress,
};

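/*
 * Type based dispatchers for workspace management.  Compression type 0
 * (BTRFS_COMPRESS_NONE) maps to the heuristic workspaces, the other types to
 * their algorithm specific implementations.
 */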
static struct list_head *alloc_workspace(int type, unsigned int level)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level);
	case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
	case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(level);
	case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

static void free_workspace(int type, struct list_head *ws)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return free_heuristic_ws(ws);
	case BTRFS_COMPRESS_ZLIB: return zlib_free_workspace(ws);
	case BTRFS_COMPRESS_LZO: return lzo_free_workspace(ws);
	case BTRFS_COMPRESS_ZSTD: return zstd_free_workspace(ws);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

static void btrfs_init_workspace_manager(int type)
{
	struct workspace_manager *wsm;
	struct list_head *workspace;

	wsm = btrfs_compress_op[type]->workspace_manager;
	INIT_LIST_HEAD(&wsm->idle_ws);
	spin_lock_init(&wsm->ws_lock);
	atomic_set(&wsm->total_ws, 0);
	init_waitqueue_head(&wsm->ws_wait);

	/*
	 * Preallocate one workspace for each compression type so we can
	 * guarantee forward progress in the worst case
	 */
	workspace = alloc_workspace(type, 0);
	if (IS_ERR(workspace)) {
		pr_warn(
	"BTRFS: cannot preallocate compression workspace, will try later\n");
	} else {
		atomic_set(&wsm->total_ws, 1);
		wsm->free_ws = 1;
		list_add(workspace, &wsm->idle_ws);
	}
}

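/*
 * Free every workspace left on the idle list of the given type; used from
 * btrfs_exit_compress().
 */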
static void btrfs_cleanup_workspace_manager(int type)
{
	struct workspace_manager *wsman;
	struct list_head *ws;

	wsman = btrfs_compress_op[type]->workspace_manager;
	while (!list_empty(&wsman->idle_ws)) {
		ws = wsman->idle_ws.next;
		list_del(ws);
		free_workspace(type, ws);
		atomic_dec(&wsman->total_ws);
	}
}

/*
 * This finds an available workspace or allocates a new one.
 * If it's not possible to allocate a new one, waits until there's one.
 * Preallocation provides a forward progress guarantee and we do not return
 * errors.
 */
struct list_head *btrfs_get_workspace(int type, unsigned int level)
{
	struct workspace_manager *wsm;
	struct list_head *workspace;
	int cpus = num_online_cpus();
	unsigned nofs_flag;
	struct list_head *idle_ws;
	spinlock_t *ws_lock;
	atomic_t *total_ws;
	wait_queue_head_t *ws_wait;
	int *free_ws;

	wsm = btrfs_compress_op[type]->workspace_manager;
	idle_ws = &wsm->idle_ws;
	ws_lock = &wsm->ws_lock;
	total_ws = &wsm->total_ws;
	ws_wait = &wsm->ws_wait;
	free_ws = &wsm->free_ws;

again:
	spin_lock(ws_lock);
	if (!list_empty(idle_ws)) {
		workspace = idle_ws->next;
		list_del(workspace);
		(*free_ws)--;
		spin_unlock(ws_lock);
		return workspace;
	}
	if (atomic_read(total_ws) > cpus) {
		DEFINE_WAIT(wait);

		spin_unlock(ws_lock);
		prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
		if (atomic_read(total_ws) > cpus && !*free_ws)
			schedule();
		finish_wait(ws_wait, &wait);
		goto again;
	}
	atomic_inc(total_ws);
	spin_unlock(ws_lock);

	/*
	 * Allocation helpers call vmalloc that can't use GFP_NOFS, so we have
	 * to turn it off here because we might get called from the restricted
	 * context of btrfs_compress_bio/btrfs_compress_pages
	 */
	nofs_flag = memalloc_nofs_save();
	workspace = alloc_workspace(type, level);
	memalloc_nofs_restore(nofs_flag);

	if (IS_ERR(workspace)) {
		atomic_dec(total_ws);
		wake_up(ws_wait);

		/*
		 * Do not return the error but go back to waiting. There's a
		 * workspace preallocated for each type and the compression
		 * time is bounded so we get to a workspace eventually. This
		 * makes our caller's life easier.
		 *
		 * To prevent silent and low-probability deadlocks (when the
		 * initial preallocation fails), check if there are any
		 * workspaces at all.
		 */
		if (atomic_read(total_ws) == 0) {
			static DEFINE_RATELIMIT_STATE(_rs,
					/* once per minute */ 60 * HZ,
					/* no burst */ 1);

			if (__ratelimit(&_rs)) {
				pr_warn("BTRFS: no compression workspaces, low memory, retrying\n");
			}
		}
		goto again;
	}
	return workspace;
}

static struct list_head *get_workspace(int type, int level)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level);
	case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level);
	case BTRFS_COMPRESS_LZO: return btrfs_get_workspace(type, level);
	case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

/*
 * Put a workspace struct back on the list or free it if we have enough
 * idle ones sitting around.
 */
void btrfs_put_workspace(int type, struct list_head *ws)
{
	struct workspace_manager *wsm;
	struct list_head *idle_ws;
	spinlock_t *ws_lock;
	atomic_t *total_ws;
	wait_queue_head_t *ws_wait;
	int *free_ws;

	wsm = btrfs_compress_op[type]->workspace_manager;
	idle_ws = &wsm->idle_ws;
	ws_lock = &wsm->ws_lock;
	total_ws = &wsm->total_ws;
	ws_wait = &wsm->ws_wait;
	free_ws = &wsm->free_ws;

	spin_lock(ws_lock);
	if (*free_ws <= num_online_cpus()) {
		list_add(ws, idle_ws);
		(*free_ws)++;
		spin_unlock(ws_lock);
		goto wake;
	}
	spin_unlock(ws_lock);

	free_workspace(type, ws);
	atomic_dec(total_ws);
wake:
	cond_wake_up(ws_wait);
}

static void put_workspace(int type, struct list_head *ws)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws);
	case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws);
	case BTRFS_COMPRESS_LZO: return btrfs_put_workspace(type, ws);
	case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

/*
 * Adjust @level according to the limits of the compression algorithm or
 * fall back to the default.
 */
static unsigned int btrfs_compress_set_level(int type, unsigned level)
{
	const struct btrfs_compress_op *ops = btrfs_compress_op[type];

	if (level == 0)
		level = ops->default_level;
	else
		level = min(level, ops->max_level);

	return level;
}

/*
 * Given an address space and start and length, compress the bytes into @pages
 * that are allocated on demand.
 *
 * @type_level is encoded algorithm and level, where level 0 means whatever
 * default the algorithm chooses and is opaque here;
 * - the compression algorithm is stored in bits 0-3
 * - the level is stored in bits 4-7 (see the example below)
 *
 * @out_pages is an in/out parameter, holds maximum number of pages to allocate
 * and returns number of actually allocated pages
 *
 * @total_in is used to return the number of bytes actually read.  It
 * may be smaller than the input length if we had to exit early because we
 * ran out of room in the pages array or because we crossed the
 * max_out threshold.
 *
 * @total_out is an in/out parameter, must be set to the input length and will
 * be also used to return the total number of compressed bytes
 */
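/*
 * For example (assuming the mount option "compress=zstd:3"), zstd at level 3
 * arrives here as type_level == (3 << 4) | BTRFS_COMPRESS_ZSTD == 0x33.
 */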
int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
			 u64 start, struct page **pages,
			 unsigned long *out_pages,
			 unsigned long *total_in,
			 unsigned long *total_out)
{
	int type = btrfs_compress_type(type_level);
	int level = btrfs_compress_level(type_level);
	struct list_head *workspace;
	int ret;

	level = btrfs_compress_set_level(type, level);
	workspace = get_workspace(type, level);
	ret = compression_compress_pages(type, workspace, mapping, start, pages,
					 out_pages, total_in, total_out);
	put_workspace(type, workspace);
	return ret;
}

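/*
 * Decompress the whole compressed extent described by @cb into the pages of
 * the original read bio.  On success, whatever part of the original bio was
 * not covered by decompressed data is zero filled.
 */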
static int btrfs_decompress_bio(struct compressed_bio *cb)
{
	struct list_head *workspace;
	int ret;
	int type = cb->compress_type;

	workspace = get_workspace(type, 0);
	ret = compression_decompress_bio(workspace, cb);
	put_workspace(type, workspace);

	if (!ret)
		zero_fill_bio(&cb->orig_bbio->bio);
	return ret;
}

/*
 * A less complex decompression routine.  Our compressed data fits in a
 * single page, and we want to read a single page out of it.
 * @dest_pgoff tells us the offset inside @dest_page where the decompressed
 * data should be copied to.
 */
int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
		     unsigned long dest_pgoff, size_t srclen, size_t destlen)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb);
	struct list_head *workspace;
	const u32 sectorsize = fs_info->sectorsize;
	int ret;

	/*
	 * The full destination page range should not exceed the page size.
	 * And the @destlen should not exceed sectorsize, as this is only called for
	 * inline file extents, which should not exceed sectorsize.
	 */
	ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize);

	workspace = get_workspace(type, 0);
	ret = compression_decompress(type, workspace, data_in, dest_page,
				     dest_pgoff, srclen, destlen);
	put_workspace(type, workspace);

	return ret;
}

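/*
 * Module init time setup: the bioset backing all compressed bios plus one
 * preallocated workspace per compression type (zstd manages its own set of
 * per-level workspaces in zstd_init_workspace_manager()).
 */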
int __init btrfs_init_compress(void)
{
	if (bioset_init(&btrfs_compressed_bioset, BIO_POOL_SIZE,
			offsetof(struct compressed_bio, bbio.bio),
			BIOSET_NEED_BVECS))
		return -ENOMEM;
	btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
	btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
	btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
	zstd_init_workspace_manager();
	return 0;
}

void __cold btrfs_exit_compress(void)
{
	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
	zstd_cleanup_workspace_manager();
	bioset_exit(&btrfs_compressed_bioset);
}

/*
 * Copy decompressed data from working buffer to pages.
 *
 * @buf:		The decompressed data buffer
 * @buf_len:		The decompressed data length
 * @decompressed:	Number of bytes that are already decompressed inside the
 *			compressed extent
 * @cb:			The compressed extent descriptor
 * @orig_bio:		The original bio that the caller wants to read for
 *
 * An easier to understand graph is like below:
 *
 *        |<- orig_bio ->|     |<- orig_bio->|
 *  |<-------      full decompressed extent      ----->|
 *  |<-----------    @cb range   ---->|
 *  |                       |<-- @buf_len -->|
 *  |<--- @decompressed --->|
 *
 * Note that, @cb can be a subpage of the full decompressed extent, but
 * @cb->start always matches the file offset of the full decompressed extent.
 *
 * When reading a compressed extent, we have to read the full compressed
 * extent, while @orig_bio may only want part of the range.
 * Thus this function will ensure only data covered by @orig_bio will be copied
 * to it.
 *
 * Return 0 if we have copied all needed contents for @orig_bio.
 * Return >0 if we need to continue decompressing.
 */
btrfs_decompress_buf2page(const char * buf,u32 buf_len,struct compressed_bio * cb,u32 decompressed)10181c3dc173SQu Wenruo int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
10191c3dc173SQu Wenruo struct compressed_bio *cb, u32 decompressed)
10203a39c18dSLi Zefan {
1021b7d463a1SChristoph Hellwig struct bio *orig_bio = &cb->orig_bbio->bio;
10221c3dc173SQu Wenruo /* Offset inside the full decompressed extent */
10231c3dc173SQu Wenruo u32 cur_offset;
10243a39c18dSLi Zefan
10251c3dc173SQu Wenruo cur_offset = decompressed;
10261c3dc173SQu Wenruo /* The main loop to do the copy */
10271c3dc173SQu Wenruo while (cur_offset < decompressed + buf_len) {
10281c3dc173SQu Wenruo struct bio_vec bvec;
10291c3dc173SQu Wenruo size_t copy_len;
10301c3dc173SQu Wenruo u32 copy_start;
10311c3dc173SQu Wenruo /* Offset inside the full decompressed extent */
10321c3dc173SQu Wenruo u32 bvec_offset;
10331c3dc173SQu Wenruo
10341c3dc173SQu Wenruo bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter);
10353a39c18dSLi Zefan /*
10361c3dc173SQu Wenruo * cb->start may underflow, but subtracting that value can still
10371c3dc173SQu Wenruo * give us correct offset inside the full decompressed extent.
10383a39c18dSLi Zefan */
10391c3dc173SQu Wenruo bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start;
10403a39c18dSLi Zefan
10411c3dc173SQu Wenruo /* Haven't reached the bvec range, exit */
10421c3dc173SQu Wenruo if (decompressed + buf_len <= bvec_offset)
10433a39c18dSLi Zefan return 1;
10443a39c18dSLi Zefan
10451c3dc173SQu Wenruo copy_start = max(cur_offset, bvec_offset);
10461c3dc173SQu Wenruo copy_len = min(bvec_offset + bvec.bv_len,
10471c3dc173SQu Wenruo decompressed + buf_len) - copy_start;
10481c3dc173SQu Wenruo ASSERT(copy_len);
10491c3dc173SQu Wenruo
10503a39c18dSLi Zefan /*
10511c3dc173SQu Wenruo * Extra range check to ensure we didn't go beyond
10521c3dc173SQu Wenruo * @buf + @buf_len.
10533a39c18dSLi Zefan */
10541c3dc173SQu Wenruo ASSERT(copy_start - decompressed < buf_len);
10551c3dc173SQu Wenruo memcpy_to_page(bvec.bv_page, bvec.bv_offset,
10561c3dc173SQu Wenruo buf + copy_start - decompressed, copy_len);
10571c3dc173SQu Wenruo cur_offset += copy_len;
1058974b1adcSChristoph Hellwig
10591c3dc173SQu Wenruo bio_advance(orig_bio, copy_len);
10601c3dc173SQu Wenruo /* Finished the bio */
10611c3dc173SQu Wenruo if (!orig_bio->bi_iter.bi_size)
10623a39c18dSLi Zefan return 0;
10633a39c18dSLi Zefan }
10643a39c18dSLi Zefan return 1;
10653a39c18dSLi Zefan }
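
/*
 * Illustrative sketch (hypothetical helper, not part of btrfs): the copy
 * window computed inside the loop above, isolated from the bio handling.
 * All offsets are relative to the start of the full decompressed extent.
 *
 * Example, matching the diagram in the comment above: with cur_offset =
 * @decompressed = 0, @buf_len = 64K and a bvec that maps the extent range
 * [32K, 96K), bvec_offset = 32K, so copy_start = 32K and
 * copy_len = min(96K, 64K) - 32K = 32K; the rest of the bvec is filled
 * once the next decompressed chunk arrives.
 */
static inline u32 demo_copy_window(u32 cur_offset, u32 bvec_offset,
				   u32 bvec_len, u32 decompressed,
				   u32 buf_len, u32 *copy_start)
{
	/* Demo only: mirrors the copy_start/copy_len math of the loop above. */
	*copy_start = max(cur_offset, bvec_offset);
	return min(bvec_offset + bvec_len, decompressed + buf_len) - *copy_start;
}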
1066c2fcdcdfSTimofey Titovets
106719562430STimofey Titovets /*
106819562430STimofey Titovets * Shannon Entropy calculation
106919562430STimofey Titovets *
107052042d8eSAndrea Gelmini * Pure byte distribution analysis fails to determine compressibility of data.
107119562430STimofey Titovets * Try calculating entropy to estimate the average minimum number of bits
107219562430STimofey Titovets * needed to encode the sampled data.
107319562430STimofey Titovets *
107419562430STimofey Titovets * For convenience, return the percentage of needed bits instead of the raw
107519562430STimofey Titovets * number of bits.
107619562430STimofey Titovets *
107719562430STimofey Titovets * @ENTROPY_LVL_ACEPTABLE - at or below this threshold the sample has low byte
107819562430STimofey Titovets * entropy and is most likely compressible
107919562430STimofey Titovets *
108019562430STimofey Titovets * @ENTROPY_LVL_HIGH - at or above this threshold the data are most likely not compressible
108119562430STimofey Titovets *
108219562430STimofey Titovets * Use of ilog2() decreases precision, so the levels are lowered by 5 to compensate.
108319562430STimofey Titovets */
108419562430STimofey Titovets #define ENTROPY_LVL_ACEPTABLE (65)
108519562430STimofey Titovets #define ENTROPY_LVL_HIGH (80)
108619562430STimofey Titovets
108719562430STimofey Titovets /*
108819562430STimofey Titovets * For increased precision in the shannon_entropy() calculation,
108919562430STimofey Titovets * do pow(n, M) to keep more digits after the decimal point:
109019562430STimofey Titovets *
109119562430STimofey Titovets * - maximum int bit length is 64
109219562430STimofey Titovets * - ilog2(MAX_SAMPLE_SIZE) -> 13
109319562430STimofey Titovets * - 13 * 4 = 52 < 64 -> M = 4
109419562430STimofey Titovets *
109519562430STimofey Titovets * So use pow(n, 4).
109619562430STimofey Titovets */
109719562430STimofey Titovets static inline u32 ilog2_w(u64 n)
109819562430STimofey Titovets {
109919562430STimofey Titovets return ilog2(n * n * n * n);
110019562430STimofey Titovets }
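
/*
 * Worked example for the fixed point trick above: ilog2_w(n) = ilog2(n^4)
 * = floor(4 * log2(n)), i.e. log2(n) in units of quarter bits.  For n = 6,
 * n^4 = 1296 and ilog2(1296) = 10, which encodes log2(6) ~= 2.58 as
 * 10/4 = 2.5.  The same scale is used for entropy_max below:
 * 8 * ilog2_w(2) = 8 * 4 = 32, i.e. the 8 bits of a byte in quarter bits.
 */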
110119562430STimofey Titovets
110219562430STimofey Titovets static u32 shannon_entropy(struct heuristic_ws *ws)
110319562430STimofey Titovets {
110419562430STimofey Titovets const u32 entropy_max = 8 * ilog2_w(2);
110519562430STimofey Titovets u32 entropy_sum = 0;
110619562430STimofey Titovets u32 p, p_base, sz_base;
110719562430STimofey Titovets u32 i;
110819562430STimofey Titovets
110919562430STimofey Titovets sz_base = ilog2_w(ws->sample_size);
111019562430STimofey Titovets for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) {
111119562430STimofey Titovets p = ws->bucket[i].count;
111219562430STimofey Titovets p_base = ilog2_w(p);
111319562430STimofey Titovets entropy_sum += p * (sz_base - p_base);
111419562430STimofey Titovets }
111519562430STimofey Titovets
111619562430STimofey Titovets entropy_sum /= ws->sample_size;
111719562430STimofey Titovets return entropy_sum * 100 / entropy_max;
111819562430STimofey Titovets }
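
/*
 * Worked example (hypothetical numbers): a 16 byte sample made of four
 * distinct byte values, each seen 4 times.  sz_base = ilog2_w(16) = 16,
 * p_base = ilog2_w(4) = 8, so entropy_sum = 4 * (4 * (16 - 8)) = 128,
 * 128 / 16 = 8 and the result is 8 * 100 / 32 = 25%, matching the exact
 * Shannon entropy of 2 bits out of 8.  A sample uniform over all 256 byte
 * values approaches 100%, a constant sample gives 0%.
 */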
111919562430STimofey Titovets
1120440c840cSTimofey Titovets #define RADIX_BASE 4U
1121440c840cSTimofey Titovets #define COUNTERS_SIZE (1U << RADIX_BASE)
1122858177d3STimofey Titovets
1123440c840cSTimofey Titovets static u8 get4bits(u64 num, int shift) {
1124440c840cSTimofey Titovets u8 low4bits;
1125440c840cSTimofey Titovets
1126440c840cSTimofey Titovets num >>= shift;
1127440c840cSTimofey Titovets /* Reverse order */
1128440c840cSTimofey Titovets low4bits = (COUNTERS_SIZE - 1) - (num % COUNTERS_SIZE);
1129440c840cSTimofey Titovets return low4bits;
1130440c840cSTimofey Titovets }
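
/*
 * Example of the reversed digit mapping above: for shift = 0,
 * get4bits(5, 0) = 15 - 5 = 10 and get4bits(9, 0) = 15 - 9 = 6.  Sorting
 * the transformed digits in ascending order therefore places 9 before 5,
 * which is how radix_sort() below produces a descending order without a
 * separate reversing pass.
 */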
1131440c840cSTimofey Titovets
1132440c840cSTimofey Titovets /*
1133440c840cSTimofey Titovets * Use 4 bits as the radix base, with 16 u32 counters for calculating the
113452042d8eSAndrea Gelmini * new position in the buf array.
1135440c840cSTimofey Titovets *
1136440c840cSTimofey Titovets * @array - array that will be sorted
1137440c840cSTimofey Titovets * @array_buf - buffer array to store sorting results
1138440c840cSTimofey Titovets * must be equal in size to @array
1139440c840cSTimofey Titovets * @num - array size
1140440c840cSTimofey Titovets */
114123ae8c63SDavid Sterba static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf,
114236243c91SDavid Sterba int num)
1143440c840cSTimofey Titovets {
1144440c840cSTimofey Titovets u64 max_num;
1145440c840cSTimofey Titovets u64 buf_num;
1146440c840cSTimofey Titovets u32 counters[COUNTERS_SIZE];
1147440c840cSTimofey Titovets u32 new_addr;
1148440c840cSTimofey Titovets u32 addr;
1149440c840cSTimofey Titovets int bitlen;
1150440c840cSTimofey Titovets int shift;
1151440c840cSTimofey Titovets int i;
1152440c840cSTimofey Titovets
1153440c840cSTimofey Titovets /*
1154440c840cSTimofey Titovets * Try to avoid useless loop iterations for small numbers stored in big
1155440c840cSTimofey Titovets * counters. Example: 48 33 4 ... in a 64bit array
1156440c840cSTimofey Titovets */
115723ae8c63SDavid Sterba max_num = array[0].count;
1158440c840cSTimofey Titovets for (i = 1; i < num; i++) {
115923ae8c63SDavid Sterba buf_num = array[i].count;
1160440c840cSTimofey Titovets if (buf_num > max_num)
1161440c840cSTimofey Titovets max_num = buf_num;
1162440c840cSTimofey Titovets }
1163440c840cSTimofey Titovets
1164440c840cSTimofey Titovets buf_num = ilog2(max_num);
1165440c840cSTimofey Titovets bitlen = ALIGN(buf_num, RADIX_BASE * 2);
1166440c840cSTimofey Titovets
1167440c840cSTimofey Titovets shift = 0;
1168440c840cSTimofey Titovets while (shift < bitlen) {
1169440c840cSTimofey Titovets memset(counters, 0, sizeof(counters));
1170440c840cSTimofey Titovets
1171440c840cSTimofey Titovets for (i = 0; i < num; i++) {
117223ae8c63SDavid Sterba buf_num = array[i].count;
1173440c840cSTimofey Titovets addr = get4bits(buf_num, shift);
1174440c840cSTimofey Titovets counters[addr]++;
1175440c840cSTimofey Titovets }
1176440c840cSTimofey Titovets
1177440c840cSTimofey Titovets for (i = 1; i < COUNTERS_SIZE; i++)
1178440c840cSTimofey Titovets counters[i] += counters[i - 1];
1179440c840cSTimofey Titovets
1180440c840cSTimofey Titovets for (i = num - 1; i >= 0; i--) {
118123ae8c63SDavid Sterba buf_num = array[i].count;
1182440c840cSTimofey Titovets addr = get4bits(buf_num, shift);
1183440c840cSTimofey Titovets counters[addr]--;
1184440c840cSTimofey Titovets new_addr = counters[addr];
11857add17beSDavid Sterba array_buf[new_addr] = array[i];
1186440c840cSTimofey Titovets }
1187440c840cSTimofey Titovets
1188440c840cSTimofey Titovets shift += RADIX_BASE;
1189440c840cSTimofey Titovets
1190440c840cSTimofey Titovets /*
1191440c840cSTimofey Titovets * A normal radix sort would copy the data back from the temporary
1192440c840cSTimofey Titovets * array to the main one, which costs some CPU time. Avoid that by
1193440c840cSTimofey Titovets * doing another sort iteration back into the original array instead
1194440c840cSTimofey Titovets * of a memcpy().
1195440c840cSTimofey Titovets */
1196440c840cSTimofey Titovets memset(counters, 0, sizeof(counters));
1197440c840cSTimofey Titovets
1198440c840cSTimofey Titovets for (i = 0; i < num; i ++) {
119923ae8c63SDavid Sterba buf_num = array_buf[i].count;
1200440c840cSTimofey Titovets addr = get4bits(buf_num, shift);
1201440c840cSTimofey Titovets counters[addr]++;
1202440c840cSTimofey Titovets }
1203440c840cSTimofey Titovets
1204440c840cSTimofey Titovets for (i = 1; i < COUNTERS_SIZE; i++)
1205440c840cSTimofey Titovets counters[i] += counters[i - 1];
1206440c840cSTimofey Titovets
1207440c840cSTimofey Titovets for (i = num - 1; i >= 0; i--) {
120823ae8c63SDavid Sterba buf_num = array_buf[i].count;
1209440c840cSTimofey Titovets addr = get4bits(buf_num, shift);
1210440c840cSTimofey Titovets counters[addr]--;
1211440c840cSTimofey Titovets new_addr = counters[addr];
12127add17beSDavid Sterba array[new_addr] = array_buf[i];
1213440c840cSTimofey Titovets }
1214440c840cSTimofey Titovets
1215440c840cSTimofey Titovets shift += RADIX_BASE;
1216440c840cSTimofey Titovets }
1217858177d3STimofey Titovets }
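
/*
 * Illustrative usage sketch (hypothetical, not called anywhere): sorts a
 * small bucket array in descending order of .count, the way
 * byte_core_set_size() below uses it.  Assumes struct bucket_item carries
 * the u32 count member used throughout this file.
 */
static void __maybe_unused demo_radix_sort_usage(void)
{
	struct bucket_item array[4] = { { 3 }, { 7 }, { 1 }, { 7 } };
	struct bucket_item tmp[4];

	radix_sort(array, tmp, 4);
	/* array[].count is now 7, 7, 3, 1 - descending order. */
}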
1218858177d3STimofey Titovets
1219858177d3STimofey Titovets /*
1220858177d3STimofey Titovets * Size of the core byte set - how many bytes cover 90% of the sample
1221858177d3STimofey Titovets *
1222858177d3STimofey Titovets * There are several types of structured binary data that use nearly all byte
1223858177d3STimofey Titovets * values. The distribution can be uniform and counts in all buckets will be
1224858177d3STimofey Titovets * nearly the same (e.g. encrypted data), which is unlikely to be compressible.
1225858177d3STimofey Titovets *
1226858177d3STimofey Titovets * Another possibility is a normal (Gaussian) distribution, where the data
1227858177d3STimofey Titovets * could potentially be compressible, but we have to take a few more steps to
1228858177d3STimofey Titovets * decide how much.
1229858177d3STimofey Titovets *
1230858177d3STimofey Titovets * @BYTE_CORE_SET_LOW - most of the data consists of a few frequently
1231858177d3STimofey Titovets * repeated byte values, which compression algorithms handle easily
1232858177d3STimofey Titovets * @BYTE_CORE_SET_HIGH - the data have a uniform distribution and are with
1233858177d3STimofey Titovets * high probability not compressible
1234858177d3STimofey Titovets */
1235858177d3STimofey Titovets #define BYTE_CORE_SET_LOW (64)
1236858177d3STimofey Titovets #define BYTE_CORE_SET_HIGH (200)
1237858177d3STimofey Titovets
1238858177d3STimofey Titovets static int byte_core_set_size(struct heuristic_ws *ws)
1239858177d3STimofey Titovets {
1240858177d3STimofey Titovets u32 i;
1241858177d3STimofey Titovets u32 coreset_sum = 0;
1242858177d3STimofey Titovets const u32 core_set_threshold = ws->sample_size * 90 / 100;
1243858177d3STimofey Titovets struct bucket_item *bucket = ws->bucket;
1244858177d3STimofey Titovets
1245858177d3STimofey Titovets /* Sort in reverse order */
124636243c91SDavid Sterba radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE);
1247858177d3STimofey Titovets
1248858177d3STimofey Titovets for (i = 0; i < BYTE_CORE_SET_LOW; i++)
1249858177d3STimofey Titovets coreset_sum += bucket[i].count;
1250858177d3STimofey Titovets
1251858177d3STimofey Titovets if (coreset_sum > core_set_threshold)
1252858177d3STimofey Titovets return i;
1253858177d3STimofey Titovets
1254858177d3STimofey Titovets for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) {
1255858177d3STimofey Titovets coreset_sum += bucket[i].count;
1256858177d3STimofey Titovets if (coreset_sum > core_set_threshold)
1257858177d3STimofey Titovets break;
1258858177d3STimofey Titovets }
1259858177d3STimofey Titovets
1260858177d3STimofey Titovets return i;
1261858177d3STimofey Titovets }
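
/*
 * Worked example (hypothetical numbers): for a text-like sample where the
 * 64 most frequent byte values already cover more than 90% of the bytes,
 * the first loop crosses the threshold and the function returns 64, which
 * the caller treats as "low core set, compress".  For an encrypted-looking
 * sample with a nearly uniform distribution, roughly 230 of the 256 buckets
 * would be needed to reach 90%, so the second loop stops at
 * BYTE_CORE_SET_HIGH and the caller gives up on compression.
 */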
1262858177d3STimofey Titovets
1263a288e92cSTimofey Titovets /*
1264a288e92cSTimofey Titovets * Count byte values in buckets.
1265a288e92cSTimofey Titovets * This heuristic can detect textual data (configs, xml, json, html, etc),
1266a288e92cSTimofey Titovets * because in most text-like data the byte set is restricted to a limited
1267a288e92cSTimofey Titovets * number of possible characters, and that restriction in most cases makes
1268a288e92cSTimofey Titovets * the data easy to compress.
1269a288e92cSTimofey Titovets *
1270a288e92cSTimofey Titovets * @BYTE_SET_THRESHOLD - consider all data within this byte set size:
1271a288e92cSTimofey Titovets * less - compressible
1272a288e92cSTimofey Titovets * more - need additional analysis
1273a288e92cSTimofey Titovets */
1274a288e92cSTimofey Titovets #define BYTE_SET_THRESHOLD (64)
1275a288e92cSTimofey Titovets
1276a288e92cSTimofey Titovets static u32 byte_set_size(const struct heuristic_ws *ws)
1277a288e92cSTimofey Titovets {
1278a288e92cSTimofey Titovets u32 i;
1279a288e92cSTimofey Titovets u32 byte_set_size = 0;
1280a288e92cSTimofey Titovets
1281a288e92cSTimofey Titovets for (i = 0; i < BYTE_SET_THRESHOLD; i++) {
1282a288e92cSTimofey Titovets if (ws->bucket[i].count > 0)
1283a288e92cSTimofey Titovets byte_set_size++;
1284a288e92cSTimofey Titovets }
1285a288e92cSTimofey Titovets
1286a288e92cSTimofey Titovets /*
1287a288e92cSTimofey Titovets * Continue collecting the count of byte values in buckets. If the byte
1288a288e92cSTimofey Titovets * set size is bigger than the threshold, it's pointless to continue:
1289a288e92cSTimofey Titovets * the detection technique would fail for this type of data.
1290a288e92cSTimofey Titovets */
1291a288e92cSTimofey Titovets for (; i < BUCKET_SIZE; i++) {
1292a288e92cSTimofey Titovets if (ws->bucket[i].count > 0) {
1293a288e92cSTimofey Titovets byte_set_size++;
1294a288e92cSTimofey Titovets if (byte_set_size > BYTE_SET_THRESHOLD)
1295a288e92cSTimofey Titovets return byte_set_size;
1296a288e92cSTimofey Titovets }
1297a288e92cSTimofey Titovets }
1298a288e92cSTimofey Titovets
1299a288e92cSTimofey Titovets return byte_set_size;
1300a288e92cSTimofey Titovets }
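
/*
 * Worked example (hypothetical numbers): plain ASCII configs or JSON
 * typically use well under 64 distinct byte values, so byte_set_size()
 * stays below BYTE_SET_THRESHOLD and the heuristic declares the data
 * compressible without any further analysis.  Random or encrypted data
 * quickly exceeds the threshold and the second loop bails out early,
 * leaving the decision to the core set and entropy checks.
 */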
1301a288e92cSTimofey Titovets
13021fe4f6faSTimofey Titovets static bool sample_repeated_patterns(struct heuristic_ws *ws)
13031fe4f6faSTimofey Titovets {
13041fe4f6faSTimofey Titovets const u32 half_of_sample = ws->sample_size / 2;
13051fe4f6faSTimofey Titovets const u8 *data = ws->sample;
13061fe4f6faSTimofey Titovets
13071fe4f6faSTimofey Titovets return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0;
13081fe4f6faSTimofey Titovets }
13091fe4f6faSTimofey Titovets
1310a440d48cSTimofey Titovets static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
1311a440d48cSTimofey Titovets struct heuristic_ws *ws)
1312a440d48cSTimofey Titovets {
1313a440d48cSTimofey Titovets struct page *page;
1314a440d48cSTimofey Titovets u64 index, index_end;
1315a440d48cSTimofey Titovets u32 i, curr_sample_pos;
1316a440d48cSTimofey Titovets u8 *in_data;
1317a440d48cSTimofey Titovets
1318a440d48cSTimofey Titovets /*
1319a440d48cSTimofey Titovets * Compression handles the input data by chunks of 128KiB
1320a440d48cSTimofey Titovets * (defined by BTRFS_MAX_UNCOMPRESSED)
1321a440d48cSTimofey Titovets *
1322a440d48cSTimofey Titovets * We do the same for the heuristic and loop over the whole range.
1323a440d48cSTimofey Titovets *
1324a440d48cSTimofey Titovets * MAX_SAMPLE_SIZE - calculated under the assumption that the heuristic
1325a440d48cSTimofey Titovets * will process no more than BTRFS_MAX_UNCOMPRESSED at a time.
1326a440d48cSTimofey Titovets */
1327a440d48cSTimofey Titovets if (end - start > BTRFS_MAX_UNCOMPRESSED)
1328a440d48cSTimofey Titovets end = start + BTRFS_MAX_UNCOMPRESSED;
1329a440d48cSTimofey Titovets
1330a440d48cSTimofey Titovets index = start >> PAGE_SHIFT;
1331a440d48cSTimofey Titovets index_end = end >> PAGE_SHIFT;
1332a440d48cSTimofey Titovets
1333a440d48cSTimofey Titovets /* Don't miss unaligned end */
1334ce394a7fSYushan Zhou if (!PAGE_ALIGNED(end))
1335a440d48cSTimofey Titovets index_end++;
1336a440d48cSTimofey Titovets
1337a440d48cSTimofey Titovets curr_sample_pos = 0;
1338a440d48cSTimofey Titovets while (index < index_end) {
1339a440d48cSTimofey Titovets page = find_get_page(inode->i_mapping, index);
134058c1a35cSIra Weiny in_data = kmap_local_page(page);
1341a440d48cSTimofey Titovets /* Handle case where the start is not aligned to PAGE_SIZE */
1342a440d48cSTimofey Titovets i = start % PAGE_SIZE;
1343a440d48cSTimofey Titovets while (i < PAGE_SIZE - SAMPLING_READ_SIZE) {
1344a440d48cSTimofey Titovets /* Don't sample any garbage from the last page */
1345a440d48cSTimofey Titovets if (start > end - SAMPLING_READ_SIZE)
1346a440d48cSTimofey Titovets break;
1347a440d48cSTimofey Titovets memcpy(&ws->sample[curr_sample_pos], &in_data[i],
1348a440d48cSTimofey Titovets SAMPLING_READ_SIZE);
1349a440d48cSTimofey Titovets i += SAMPLING_INTERVAL;
1350a440d48cSTimofey Titovets start += SAMPLING_INTERVAL;
1351a440d48cSTimofey Titovets curr_sample_pos += SAMPLING_READ_SIZE;
1352a440d48cSTimofey Titovets }
135358c1a35cSIra Weiny kunmap_local(in_data);
1354a440d48cSTimofey Titovets put_page(page);
1355a440d48cSTimofey Titovets
1356a440d48cSTimofey Titovets index++;
1357a440d48cSTimofey Titovets }
1358a440d48cSTimofey Titovets
1359a440d48cSTimofey Titovets ws->sample_size = curr_sample_pos;
1360a440d48cSTimofey Titovets }
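
/*
 * Sketch of the resulting sample (exact numbers depend on
 * SAMPLING_READ_SIZE and SAMPLING_INTERVAL defined earlier in this file):
 * from every SAMPLING_INTERVAL-sized stride of the range, SAMPLING_READ_SIZE
 * bytes are copied into ws->sample, so the sample holds roughly
 * SAMPLING_READ_SIZE/SAMPLING_INTERVAL of the scanned data, capped by the
 * 128KiB limit above and hence by MAX_SAMPLE_SIZE.
 */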
1361a440d48cSTimofey Titovets
1362c2fcdcdfSTimofey Titovets /*
1363c2fcdcdfSTimofey Titovets * Compression heuristic.
1364c2fcdcdfSTimofey Titovets *
1365c2fcdcdfSTimofey Titovets * For now it's a naive and optimistic 'return true'; we'll extend the logic to
1366c2fcdcdfSTimofey Titovets * quickly (compared to direct compression) detect data characteristics
136767da05b3SColin Ian King * (compressible/incompressible) to avoid wasting CPU time on incompressible
1368c2fcdcdfSTimofey Titovets * data.
1369c2fcdcdfSTimofey Titovets *
1370c2fcdcdfSTimofey Titovets * The following types of analysis can be performed:
1371c2fcdcdfSTimofey Titovets * - detect mostly zero data
1372c2fcdcdfSTimofey Titovets * - detect data with low "byte set" size (text, etc)
1373c2fcdcdfSTimofey Titovets * - detect data with low/high "core byte" set
1374c2fcdcdfSTimofey Titovets *
1375c2fcdcdfSTimofey Titovets * Return non-zero if the compression should be done, 0 otherwise.
1376c2fcdcdfSTimofey Titovets */
1377c2fcdcdfSTimofey Titovets int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
1378c2fcdcdfSTimofey Titovets {
13797bf49943SDennis Zhou struct list_head *ws_list = get_workspace(0, 0);
13804e439a0bSTimofey Titovets struct heuristic_ws *ws;
1381a440d48cSTimofey Titovets u32 i;
1382a440d48cSTimofey Titovets u8 byte;
138319562430STimofey Titovets int ret = 0;
1384c2fcdcdfSTimofey Titovets
13854e439a0bSTimofey Titovets ws = list_entry(ws_list, struct heuristic_ws, list);
13864e439a0bSTimofey Titovets
1387a440d48cSTimofey Titovets heuristic_collect_sample(inode, start, end, ws);
1388a440d48cSTimofey Titovets
13891fe4f6faSTimofey Titovets if (sample_repeated_patterns(ws)) {
13901fe4f6faSTimofey Titovets ret = 1;
13911fe4f6faSTimofey Titovets goto out;
13921fe4f6faSTimofey Titovets }
13931fe4f6faSTimofey Titovets
1394a440d48cSTimofey Titovets memset(ws->bucket, 0, sizeof(*ws->bucket)*BUCKET_SIZE);
1395a440d48cSTimofey Titovets
1396a440d48cSTimofey Titovets for (i = 0; i < ws->sample_size; i++) {
1397a440d48cSTimofey Titovets byte = ws->sample[i];
1398a440d48cSTimofey Titovets ws->bucket[byte].count++;
1399c2fcdcdfSTimofey Titovets }
1400c2fcdcdfSTimofey Titovets
1401a288e92cSTimofey Titovets i = byte_set_size(ws);
1402a288e92cSTimofey Titovets if (i < BYTE_SET_THRESHOLD) {
1403a288e92cSTimofey Titovets ret = 2;
1404a288e92cSTimofey Titovets goto out;
1405a288e92cSTimofey Titovets }
1406a288e92cSTimofey Titovets
1407858177d3STimofey Titovets i = byte_core_set_size(ws);
1408858177d3STimofey Titovets if (i <= BYTE_CORE_SET_LOW) {
1409858177d3STimofey Titovets ret = 3;
1410858177d3STimofey Titovets goto out;
1411858177d3STimofey Titovets }
1412858177d3STimofey Titovets
1413858177d3STimofey Titovets if (i >= BYTE_CORE_SET_HIGH) {
1414858177d3STimofey Titovets ret = 0;
1415858177d3STimofey Titovets goto out;
1416858177d3STimofey Titovets }
1417858177d3STimofey Titovets
141819562430STimofey Titovets i = shannon_entropy(ws);
141919562430STimofey Titovets if (i <= ENTROPY_LVL_ACEPTABLE) {
142019562430STimofey Titovets ret = 4;
142119562430STimofey Titovets goto out;
142219562430STimofey Titovets }
142319562430STimofey Titovets
142419562430STimofey Titovets /*
142519562430STimofey Titovets * For the levels below ENTROPY_LVL_HIGH, additional analysis would be
142619562430STimofey Titovets * needed to give green light to compression.
142719562430STimofey Titovets *
142819562430STimofey Titovets * For now just assume that compression at that level is not worth the
142919562430STimofey Titovets * resources because:
143019562430STimofey Titovets *
143119562430STimofey Titovets * 1. it is possible to defrag the data later
143219562430STimofey Titovets *
143319562430STimofey Titovets * 2. the data could turn out to be hardly compressible, e.g. 150 distinct
143419562430STimofey Titovets * byte values with every bucket counting around ~54. The heuristic would
143519562430STimofey Titovets * be confused. This can happen when the data have some internal repeated
143619562430STimofey Titovets * patterns like "abbacbbc...". This could be detected by analyzing
143719562430STimofey Titovets * pairs of bytes, which is too costly.
143819562430STimofey Titovets */
143919562430STimofey Titovets if (i < ENTROPY_LVL_HIGH) {
144019562430STimofey Titovets ret = 5;
144119562430STimofey Titovets goto out;
144219562430STimofey Titovets } else {
144319562430STimofey Titovets ret = 0;
144419562430STimofey Titovets goto out;
144519562430STimofey Titovets }
144619562430STimofey Titovets
14471fe4f6faSTimofey Titovets out:
1448929f4bafSDennis Zhou put_workspace(0, ws_list);
1449c2fcdcdfSTimofey Titovets return ret;
1450c2fcdcdfSTimofey Titovets }
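
/*
 * Illustrative caller sketch (hypothetical, not how btrfs wires it up):
 * the specific non-zero return values only record which test made the
 * decision, so a caller can treat the result as a boolean "worth trying
 * compression".
 */
static bool __maybe_unused demo_should_compress(struct inode *inode,
						u64 start, u64 end)
{
	return btrfs_compress_heuristic(inode, start, end) != 0;
}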
1451f51d2b59SDavid Sterba
1452d0ab62ceSDennis Zhou /*
1453d0ab62ceSDennis Zhou * Convert the compression suffix (e.g. the part after "zlib" starting with
1454d0ab62ceSDennis Zhou * ":") to a level; an unrecognized string sets the default level.
1455d0ab62ceSDennis Zhou */
1456d0ab62ceSDennis Zhou unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
1457f51d2b59SDavid Sterba {
1458d0ab62ceSDennis Zhou unsigned int level = 0;
1459d0ab62ceSDennis Zhou int ret;
1460d0ab62ceSDennis Zhou
1461d0ab62ceSDennis Zhou if (!type)
1462f51d2b59SDavid Sterba return 0;
1463f51d2b59SDavid Sterba
1464d0ab62ceSDennis Zhou if (str[0] == ':') {
1465d0ab62ceSDennis Zhou ret = kstrtouint(str + 1, 10, &level);
1466d0ab62ceSDennis Zhou if (ret)
1467d0ab62ceSDennis Zhou level = 0;
1468d0ab62ceSDennis Zhou }
1469f51d2b59SDavid Sterba
1470b0c1fe1eSDavid Sterba level = btrfs_compress_set_level(type, level);
1471b0c1fe1eSDavid Sterba
1472b0c1fe1eSDavid Sterba return level;
1473b0c1fe1eSDavid Sterba }
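
/*
 * Illustrative usage sketch (hypothetical, not part of btrfs): for a mount
 * option such as "compress=zstd:3" the caller passes the suffix starting
 * at ':'.  A missing or malformed level falls back to the default chosen
 * by btrfs_compress_set_level().
 */
static unsigned int __maybe_unused demo_parse_level(void)
{
	/* ":3" -> 3, while ":notanumber" or "" -> default level for zstd. */
	return btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, ":3");
}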
1474