// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/log2.h>
#include <crypto/hash.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "ordered-data.h"
#include "compression.h"
#include "extent_io.h"
#include "extent_map.h"
#include "subpage.h"
#include "zoned.h"

static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };

const char* btrfs_compress_type2str(enum btrfs_compression_type type)
{
	switch (type) {
	case BTRFS_COMPRESS_ZLIB:
	case BTRFS_COMPRESS_LZO:
	case BTRFS_COMPRESS_ZSTD:
	case BTRFS_COMPRESS_NONE:
		return btrfs_compress_types[type];
	default:
		break;
	}

	return NULL;
}

bool btrfs_compress_is_valid_type(const char *str, size_t len)
{
	int i;

	for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) {
		size_t comp_len = strlen(btrfs_compress_types[i]);

		if (len < comp_len)
			continue;

		if (!strncmp(btrfs_compress_types[i], str, comp_len))
			return true;
	}
	return false;
}

static int compression_compress_pages(int type, struct list_head *ws,
		struct address_space *mapping, u64 start, struct page **pages,
		unsigned long *out_pages, unsigned long *total_in,
		unsigned long *total_out)
{
	switch (type) {
	case BTRFS_COMPRESS_ZLIB:
		return zlib_compress_pages(ws, mapping, start, pages,
				out_pages, total_in, total_out);
	case BTRFS_COMPRESS_LZO:
		return lzo_compress_pages(ws, mapping, start, pages,
				out_pages, total_in, total_out);
	case BTRFS_COMPRESS_ZSTD:
		return zstd_compress_pages(ws, mapping, start, pages,
				out_pages, total_in, total_out);
	case BTRFS_COMPRESS_NONE:
	default:
		/*
		 * This can happen when compression races with a remount
		 * setting it to 'no compress', while the caller doesn't call
		 * inode_need_compress() to check if we really need to
		 * compress.
		 *
		 * Not a big deal, just need to inform the caller that we
		 * haven't allocated any pages yet.
		 */
		*out_pages = 0;
		return -E2BIG;
	}
}

static int compression_decompress_bio(int type, struct list_head *ws,
				      struct compressed_bio *cb)
{
	switch (type) {
	case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb);
	case BTRFS_COMPRESS_LZO:  return lzo_decompress_bio(ws, cb);
	case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb);
	case BTRFS_COMPRESS_NONE:
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

static int compression_decompress(int type, struct list_head *ws,
		unsigned char *data_in, struct page *dest_page,
		unsigned long start_byte, size_t srclen, size_t destlen)
{
	switch (type) {
	case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
						start_byte, srclen, destlen);
	case BTRFS_COMPRESS_LZO:  return lzo_decompress(ws, data_in, dest_page,
						start_byte, srclen, destlen);
	case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
						start_byte, srclen, destlen);
	case BTRFS_COMPRESS_NONE:
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

static int btrfs_decompress_bio(struct compressed_bio *cb);

static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
				      unsigned long disk_size)
{
	return sizeof(struct compressed_bio) +
		(DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * fs_info->csum_size;
}

static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
				 u64 disk_start)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	const u32 csum_size = fs_info->csum_size;
	const u32 sectorsize = fs_info->sectorsize;
	struct page *page;
	unsigned int i;
	char *kaddr;
	u8 csum[BTRFS_CSUM_SIZE];
	struct compressed_bio *cb = bio->bi_private;
	u8 *cb_sum = cb->sums;

	if (!fs_info->csum_root || (inode->flags & BTRFS_INODE_NODATASUM))
		return 0;

	shash->tfm = fs_info->csum_shash;

	for (i = 0; i < cb->nr_pages; i++) {
		u32 pg_offset;
		u32 bytes_left = PAGE_SIZE;
		page = cb->compressed_pages[i];

		/* Determine the remaining bytes inside the page first */
		if (i == cb->nr_pages - 1)
			bytes_left = cb->compressed_len - i * PAGE_SIZE;

		/* Hash through the page sector by sector */
		for (pg_offset = 0; pg_offset < bytes_left;
		     pg_offset += sectorsize) {
			kaddr = kmap_atomic(page);
			crypto_shash_digest(shash, kaddr + pg_offset,
					    sectorsize, csum);
			kunmap_atomic(kaddr);

			if (memcmp(&csum, cb_sum, csum_size) != 0) {
				btrfs_print_data_csum_error(inode, disk_start,
						csum, cb_sum, cb->mirror_num);
				if (btrfs_bio(bio)->device)
					btrfs_dev_stat_inc_and_print(
						btrfs_bio(bio)->device,
						BTRFS_DEV_STAT_CORRUPTION_ERRS);
				return -EIO;
			}
			cb_sum += csum_size;
			disk_start += sectorsize;
		}
	}
	return 0;
}

/*
 * Reduce bio and io accounting for a compressed_bio with its corresponding bio.
 *
 * Return true if there is no pending bio nor io.
 * Return false otherwise.
 */
static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
	unsigned int bi_size = 0;
	bool last_io = false;
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	/*
	 * At endio time, bi_iter.bi_size doesn't represent the real bio size.
	 * Thus here we have to iterate through all segments to grab the
	 * correct bio size.
	 */
	bio_for_each_segment_all(bvec, bio, iter_all)
		bi_size += bvec->bv_len;

	if (bio->bi_status)
		cb->errors = 1;

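	/*
	 * cb->pending_sectors was initialized to cover the whole compressed
	 * extent (compressed_len >> sectorsize_bits), so subtracting the
	 * sectors covered by this bio means only the final completion sees
	 * the count drop to zero.
	 */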
	ASSERT(bi_size && bi_size <= cb->compressed_len);
	last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
					&cb->pending_sectors);
	/*
	 * Here we must wake up the possible error handler after all other
	 * operations on @cb finished, or we can race with
	 * finish_compressed_bio_*() which may free @cb.
	 */
	wake_up_var(cb);

	return last_io;
}

static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio)
{
	unsigned int index;
	struct page *page;

	/* Release the compressed pages */
	for (index = 0; index < cb->nr_pages; index++) {
		page = cb->compressed_pages[index];
		page->mapping = NULL;
		put_page(page);
	}

	/* Do io completion on the original bio */
	if (cb->errors) {
		bio_io_error(cb->orig_bio);
	} else {
		struct bio_vec *bvec;
		struct bvec_iter_all iter_all;

		ASSERT(bio);
		ASSERT(!bio->bi_status);
		/*
		 * We have verified the checksum already, set page checked so
		 * the end_io handlers know about it
		 */
		ASSERT(!bio_flagged(bio, BIO_CLONED));
		bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) {
			u64 bvec_start = page_offset(bvec->bv_page) +
					 bvec->bv_offset;

			btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb),
					bvec->bv_page, bvec_start,
					bvec->bv_len);
		}

		bio_endio(cb->orig_bio);
	}

	/* Finally free the cb struct */
	kfree(cb->compressed_pages);
	kfree(cb);
}

/*
 * When we finish reading compressed pages from the disk, we decompress
 * them and then run the bio end_io routines on the decompressed pages
 * (in the inode address space).
 *
 * This allows the checksumming and other IO error handling routines
 * to work normally.
 *
 * The compressed pages are freed here, and it must be run
 * in process context.
 */
static void end_compressed_bio_read(struct bio *bio)
{
	struct compressed_bio *cb = bio->bi_private;
	struct inode *inode;
	unsigned int mirror = btrfs_bio(bio)->mirror_num;
	int ret = 0;

	if (!dec_and_test_compressed_bio(cb, bio))
		goto out;

	/*
	 * Record the correct mirror_num in cb->orig_bio so that
	 * read-repair can work properly.
	 */
	btrfs_bio(cb->orig_bio)->mirror_num = mirror;
	cb->mirror_num = mirror;

	/*
	 * Some IO in this cb has failed, just skip the checksum as there
	 * is no way it could be correct.
	 */
	if (cb->errors == 1)
		goto csum_failed;

	inode = cb->inode;
	ret = check_compressed_csum(BTRFS_I(inode), bio,
				    bio->bi_iter.bi_sector << 9);
	if (ret)
		goto csum_failed;

	/*
	 * Ok, we're the last bio for this extent, let's start
	 * the decompression.
	 */
	ret = btrfs_decompress_bio(cb);

csum_failed:
	if (ret)
		cb->errors = 1;
	finish_compressed_bio_read(cb, bio);
out:
	bio_put(bio);
}

/*
 * Clear the writeback bits on all of the file
 * pages for a compressed write
 */
static noinline void end_compressed_writeback(struct inode *inode,
					      const struct compressed_bio *cb)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	unsigned long index = cb->start >> PAGE_SHIFT;
	unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
	struct page *pages[16];
	unsigned long nr_pages = end_index - index + 1;
	int i;
	int ret;

	if (cb->errors)
		mapping_set_error(inode->i_mapping, -EIO);

	while (nr_pages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long,
				     nr_pages, ARRAY_SIZE(pages)), pages);
		if (ret == 0) {
			nr_pages -= 1;
			index += 1;
			continue;
		}
		for (i = 0; i < ret; i++) {
			if (cb->errors)
				SetPageError(pages[i]);
			btrfs_page_clamp_clear_writeback(fs_info, pages[i],
							 cb->start, cb->len);
			put_page(pages[i]);
		}
		nr_pages -= ret;
		index += ret;
	}
	/* the inode may be gone now */
}

static void finish_compressed_bio_write(struct compressed_bio *cb)
{
	struct inode *inode = cb->inode;
	unsigned int index;

	/*
	 * Ok, we're the last bio for this extent, step one is to call back
	 * into the FS and do all the end_io operations.
	 */
	btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
			cb->start, cb->start + cb->len - 1,
			!cb->errors);

	end_compressed_writeback(inode, cb);
	/* Note, our inode could be gone now */

	/*
	 * Release the compressed pages, these came from alloc_page and
	 * are not attached to the inode at all
	 */
	for (index = 0; index < cb->nr_pages; index++) {
		struct page *page = cb->compressed_pages[index];

		page->mapping = NULL;
		put_page(page);
	}

	/* Finally free the cb struct */
	kfree(cb->compressed_pages);
	kfree(cb);
}

/*
 * Do the cleanup once all the compressed pages hit the disk. This will clear
 * writeback on the file pages and free the compressed pages.
 *
 * This also calls the writeback end hooks for the file pages so that metadata
 * and checksums can be updated in the file.
 */
static void end_compressed_bio_write(struct bio *bio)
{
	struct compressed_bio *cb = bio->bi_private;

	if (!dec_and_test_compressed_bio(cb, bio))
		goto out;

	btrfs_record_physical_zoned(cb->inode, cb->start, bio);

	finish_compressed_bio_write(cb);
out:
	bio_put(bio);
}

static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
					  struct compressed_bio *cb,
					  struct bio *bio, int mirror_num)
{
	blk_status_t ret;

	ASSERT(bio->bi_iter.bi_size);
	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
	if (ret)
		return ret;
	ret = btrfs_map_bio(fs_info, bio, mirror_num);
	return ret;
}

/*
 * Allocate a compressed_bio, which will be used to read/write on-disk
 * (aka, compressed) data.
 *
 * @cb:			The compressed_bio structure, which records all the needed
 *			information to bind the compressed data to the uncompressed
 *			page cache.
 * @disk_bytenr:	The logical bytenr where the compressed data will be read
 *			from or written to.
 * @endio_func:		The endio function to call after the IO for compressed data
 *			is finished.
 * @next_stripe_start:	Return value of logical bytenr of where next stripe starts.
 *			Let the caller know to only fill the bio up to the stripe
 *			boundary.
 */
static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr,
					unsigned int opf, bio_end_io_t endio_func,
					u64 *next_stripe_start)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
	struct btrfs_io_geometry geom;
	struct extent_map *em;
	struct bio *bio;
	int ret;

	bio = btrfs_bio_alloc(BIO_MAX_VECS);

	bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
	bio->bi_opf = opf;
	bio->bi_private = cb;
	bio->bi_end_io = endio_func;

	em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize);
	if (IS_ERR(em)) {
		bio_put(bio);
		return ERR_CAST(em);
	}

	if (bio_op(bio) == REQ_OP_ZONE_APPEND)
		bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev);

	ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom);
	free_extent_map(em);
	if (ret < 0) {
		bio_put(bio);
		return ERR_PTR(ret);
	}
	*next_stripe_start = disk_bytenr + geom.len;

	return bio;
}

/*
 * worker function to build and submit bios for previously compressed pages.
 * The corresponding pages in the inode should be marked for writeback
 * and the compressed pages should have a reference on them for dropping
 * when the IO is complete.
 *
 * This also checksums the file bytes and gets things ready for
 * the end io hooks.
 */
blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
				 unsigned int len, u64 disk_start,
				 unsigned int compressed_len,
				 struct page **compressed_pages,
				 unsigned int nr_pages,
				 unsigned int write_flags,
				 struct cgroup_subsys_state *blkcg_css)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct bio *bio = NULL;
	struct compressed_bio *cb;
	u64 cur_disk_bytenr = disk_start;
	u64 next_stripe_start;
	blk_status_t ret;
	int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
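	/*
	 * On zoned filesystems the compressed data is written with a zone
	 * append operation instead of a regular write; the physical location
	 * chosen by the device is recorded at endio time (see
	 * end_compressed_bio_write() -> btrfs_record_physical_zoned()).
	 */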
	const bool use_append = btrfs_use_zone_append(inode, disk_start);
	const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND :
						 REQ_OP_WRITE;

	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
	       IS_ALIGNED(len, fs_info->sectorsize));
	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
	if (!cb)
		return BLK_STS_RESOURCE;
	refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
	cb->errors = 0;
	cb->inode = &inode->vfs_inode;
	cb->start = start;
	cb->len = len;
	cb->mirror_num = 0;
	cb->compressed_pages = compressed_pages;
	cb->compressed_len = compressed_len;
	cb->orig_bio = NULL;
	cb->nr_pages = nr_pages;

	while (cur_disk_bytenr < disk_start + compressed_len) {
		u64 offset = cur_disk_bytenr - disk_start;
		unsigned int index = offset >> PAGE_SHIFT;
		unsigned int real_size;
		unsigned int added;
		struct page *page = compressed_pages[index];
		bool submit = false;

		/* Allocate new bio if submitted or not yet allocated */
		if (!bio) {
			bio = alloc_compressed_bio(cb, cur_disk_bytenr,
				bio_op | write_flags, end_compressed_bio_write,
				&next_stripe_start);
			if (IS_ERR(bio)) {
				ret = errno_to_blk_status(PTR_ERR(bio));
				bio = NULL;
				goto finish_cb;
			}
		}
		/*
		 * We should never reach next_stripe_start, as we will submit
		 * the bio immediately when we reach the boundary.
		 */
		ASSERT(cur_disk_bytenr != next_stripe_start);

		/*
		 * We have various limits on the real write size:
		 * - stripe boundary
		 * - page boundary
		 * - compressed length boundary
		 */
		real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr);
		real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
		real_size = min_t(u64, real_size, compressed_len - offset);
		ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));

		if (use_append)
			added = bio_add_zone_append_page(bio, page, real_size,
					offset_in_page(offset));
		else
			added = bio_add_page(bio, page, real_size,
					offset_in_page(offset));
		/* Reached zoned boundary */
		if (added == 0)
			submit = true;

		cur_disk_bytenr += added;
		/* Reached stripe boundary */
		if (cur_disk_bytenr == next_stripe_start)
			submit = true;

		/* Finished the range */
		if (cur_disk_bytenr == disk_start + compressed_len)
			submit = true;

		if (submit) {
			if (!skip_sum) {
				ret = btrfs_csum_one_bio(inode, bio, start, 1);
				if (ret)
					goto finish_cb;
			}

			ret = submit_compressed_bio(fs_info, cb, bio, 0);
			if (ret)
				goto finish_cb;
			bio = NULL;
		}
		cond_resched();
	}
	if (blkcg_css)
		kthread_associate_blkcg(NULL);

	return 0;

finish_cb:
	if (bio) {
		bio->bi_status = ret;
		bio_endio(bio);
	}
	/* Last byte of @cb is submitted, endio will free @cb */
	if (cur_disk_bytenr == disk_start + compressed_len)
		return ret;

	wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
			   (disk_start + compressed_len - cur_disk_bytenr) >>
			   fs_info->sectorsize_bits);
	/*
	 * Even with the previous bio ended, we should still have IO not yet
	 * submitted, thus need to finish manually.
	 */
	ASSERT(refcount_read(&cb->pending_sectors));
	/* Now we are the only one referring to @cb, we can finish it safely. */
	finish_compressed_bio_write(cb);
	return ret;
}

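/* Return the file offset of the first byte after the last bvec in @bio. */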
static u64 bio_end_offset(struct bio *bio)
{
	struct bio_vec *last = bio_last_bvec_all(bio);

	return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
}

/*
 * Add extra pages in the same compressed file extent so that we don't need to
 * re-read the same extent again and again.
 *
 * NOTE: this won't work well for subpage, as for subpage read, we lock the
 * full page then submit a bio for each compressed/regular extent.
 *
 * This means that if we have several sectors in the same page pointing to the
 * same on-disk compressed data, we will re-read the same extent many times and
 * this function can only help for the next page.
 */
static noinline int add_ra_bio_pages(struct inode *inode,
				     u64 compressed_end,
				     struct compressed_bio *cb)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	unsigned long end_index;
	u64 cur = bio_end_offset(cb->orig_bio);
	u64 isize = i_size_read(inode);
	int ret;
	struct page *page;
	struct extent_map *em;
	struct address_space *mapping = inode->i_mapping;
	struct extent_map_tree *em_tree;
	struct extent_io_tree *tree;
	int sectors_missed = 0;

	em_tree = &BTRFS_I(inode)->extent_tree;
	tree = &BTRFS_I(inode)->io_tree;

	if (isize == 0)
		return 0;

	/*
	 * For current subpage support, we only support 64K page size,
	 * which means maximum compressed extent size (128K) is just 2x page
	 * size.
	 * This makes readahead less effective, so here we disable readahead
	 * for subpage for now, until full compressed write is supported.
	 */
	if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE)
		return 0;

	end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	while (cur < compressed_end) {
		u64 page_end;
		u64 pg_index = cur >> PAGE_SHIFT;
		u32 add_size;

		if (pg_index > end_index)
			break;

		page = xa_load(&mapping->i_pages, pg_index);
		if (page && !xa_is_value(page)) {
			sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >>
					  fs_info->sectorsize_bits;

			/* Beyond threshold, no need to continue */
			if (sectors_missed > 4)
				break;

			/*
			 * Jump to next page start as we already have page for
			 * current offset.
			 */
			cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
			continue;
		}

		page = __page_cache_alloc(mapping_gfp_constraint(mapping,
								 ~__GFP_FS));
		if (!page)
			break;

		if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
			put_page(page);
			/* There is already a page, skip to page end */
			cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE;
			continue;
		}

		ret = set_page_extent_mapped(page);
		if (ret < 0) {
			unlock_page(page);
			put_page(page);
			break;
		}

		page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
		lock_extent(tree, cur, page_end);
		read_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
		read_unlock(&em_tree->lock);

		/*
		 * At this point, we have a locked page in the page cache for
		 * these bytes in the file.  But, we have to make sure they map
		 * to this compressed extent on disk.
		 */
		if (!em || cur < em->start ||
		    (cur + fs_info->sectorsize > extent_map_end(em)) ||
		    (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
			free_extent_map(em);
			unlock_extent(tree, cur, page_end);
			unlock_page(page);
			put_page(page);
			break;
		}
		free_extent_map(em);

		if (page->index == end_index) {
			size_t zero_offset = offset_in_page(isize);

			if (zero_offset) {
				int zeros;
				zeros = PAGE_SIZE - zero_offset;
				memzero_page(page, zero_offset, zeros);
				flush_dcache_page(page);
			}
		}

		add_size = min(em->start + em->len, page_end + 1) - cur;
		ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
		if (ret != add_size) {
			unlock_extent(tree, cur, page_end);
			unlock_page(page);
			put_page(page);
			break;
		}
		/*
		 * If it's subpage, we also need to increase its
		 * subpage::readers number, as at endio time we will decrease
		 * subpage::readers to unlock the page.
		 */
		if (fs_info->sectorsize < PAGE_SIZE)
			btrfs_subpage_start_reader(fs_info, page, cur, add_size);
		put_page(page);
		cur += add_size;
	}
	return 0;
}

/*
 * For a compressed read, the bio we get passed has all the inode pages
 * in it.  We don't actually do IO on those pages but allocate new ones
 * to hold the compressed pages on disk.
 *
 * bio->bi_iter.bi_sector points to the compressed extent on disk
 * bio->bi_io_vec points to all of the inode pages
 *
 * After the compressed pages are read, we copy the bytes into the
 * bio we were passed and then run the bio end_io handlers.
 */
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
					  int mirror_num, unsigned long bio_flags)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map_tree *em_tree;
	struct compressed_bio *cb;
	unsigned int compressed_len;
	unsigned int nr_pages;
	unsigned int pg_index;
	struct bio *comp_bio = NULL;
	const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
	u64 cur_disk_byte = disk_bytenr;
	u64 next_stripe_start;
	u64 file_offset;
	u64 em_len;
	u64 em_start;
	struct extent_map *em;
	blk_status_t ret = BLK_STS_RESOURCE;
	int faili = 0;
	u8 *sums;

	em_tree = &BTRFS_I(inode)->extent_tree;

	file_offset = bio_first_bvec_all(bio)->bv_offset +
		      page_offset(bio_first_page_all(bio));

	/* We need the actual starting offset of this extent in the file */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
	read_unlock(&em_tree->lock);
	if (!em)
		return BLK_STS_IOERR;

	ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
	compressed_len = em->block_len;
	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
	if (!cb)
		goto out;

	refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
	cb->errors = 0;
	cb->inode = inode;
	cb->mirror_num = mirror_num;
	sums = cb->sums;

	cb->start = em->orig_start;
	em_len = em->len;
	em_start = em->start;

	free_extent_map(em);
	em = NULL;

	cb->len = bio->bi_iter.bi_size;
	cb->compressed_len = compressed_len;
	cb->compress_type = extent_compress_type(bio_flags);
	cb->orig_bio = bio;

	nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
	cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
				       GFP_NOFS);
	if (!cb->compressed_pages)
		goto fail1;

	for (pg_index = 0; pg_index < nr_pages; pg_index++) {
		cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS);
		if (!cb->compressed_pages[pg_index]) {
			faili = pg_index - 1;
			ret = BLK_STS_RESOURCE;
			goto fail2;
		}
	}
	faili = nr_pages - 1;
	cb->nr_pages = nr_pages;

	add_ra_bio_pages(inode, em_start + em_len, cb);

	/* include any pages we added in add_ra_bio_pages */
	cb->len = bio->bi_iter.bi_size;

	while (cur_disk_byte < disk_bytenr + compressed_len) {
		u64 offset = cur_disk_byte - disk_bytenr;
		unsigned int index = offset >> PAGE_SHIFT;
		unsigned int real_size;
		unsigned int added;
		struct page *page = cb->compressed_pages[index];
		bool submit = false;

		/* Allocate new bio if submitted or not yet allocated */
		if (!comp_bio) {
			comp_bio = alloc_compressed_bio(cb, cur_disk_byte,
					REQ_OP_READ, end_compressed_bio_read,
					&next_stripe_start);
			if (IS_ERR(comp_bio)) {
				ret = errno_to_blk_status(PTR_ERR(comp_bio));
				comp_bio = NULL;
				goto finish_cb;
			}
		}
		/*
		 * We should never reach next_stripe_start, as we will submit
		 * comp_bio immediately when we reach the boundary.
		 */
		ASSERT(cur_disk_byte != next_stripe_start);
		/*
		 * We have various limits on the real read size:
		 * - stripe boundary
		 * - page boundary
		 * - compressed length boundary
		 */
		real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte);
		real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset));
		real_size = min_t(u64, real_size, compressed_len - offset);
		ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize));

		added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset));
		/*
		 * Maximum compressed extent is smaller than bio size limit,
		 * thus bio_add_page() should always succeed.
		 */
		ASSERT(added == real_size);
		cur_disk_byte += added;

		/* Reached stripe boundary, need to submit */
		if (cur_disk_byte == next_stripe_start)
			submit = true;

		/* Finished the range, need to submit */
		if (cur_disk_byte == disk_bytenr + compressed_len)
			submit = true;

		if (submit) {
			unsigned int nr_sectors;

			ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
			if (ret)
				goto finish_cb;

			nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
						  fs_info->sectorsize);
			sums += fs_info->csum_size * nr_sectors;

			ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num);
			if (ret)
				goto finish_cb;
			comp_bio = NULL;
		}
	}
	return 0;

fail2:
	while (faili >= 0) {
		__free_page(cb->compressed_pages[faili]);
		faili--;
	}

	kfree(cb->compressed_pages);
fail1:
	kfree(cb);
out:
	free_extent_map(em);
	return ret;
finish_cb:
	if (comp_bio) {
		comp_bio->bi_status = ret;
		bio_endio(comp_bio);
	}
	/* All bytes of @cb are submitted, endio will free @cb */
	if (cur_disk_byte == disk_bytenr + compressed_len)
		return ret;

	wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
			   (disk_bytenr + compressed_len - cur_disk_byte) >>
			   fs_info->sectorsize_bits);
	/*
	 * Even with the previous bio ended, we should still have IO not yet
	 * submitted, thus need to finish @cb manually.
	 */
	ASSERT(refcount_read(&cb->pending_sectors));
	/*
	 * Now we are the only one referring to @cb, we can finish it safely.
	 */
	finish_compressed_bio_read(cb, NULL);
	return ret;
}

/*
 * The heuristic uses systematic sampling to collect data from the input data
 * range, the logic can be tuned by the following constants:
 *
 * @SAMPLING_READ_SIZE - how many bytes will be copied for each sample
 * @SAMPLING_INTERVAL  - range from which the sampled data can be collected
 */
#define SAMPLING_READ_SIZE	(16)
#define SAMPLING_INTERVAL	(256)

/*
 * For statistical analysis of the input data we consider bytes that form a
 * Galois Field of 256 objects. Each object has an attribute count, ie. how
 * many times the object appeared in the sample.
 */
#define BUCKET_SIZE		(256)

/*
 * The size of the sample is based on a statistical sampling rule of thumb.
 * The common way is to perform sampling tests as long as the number of
 * elements in each cell is at least 5.
 *
 * Instead of 5, we choose 32 to obtain more accurate results.
 * If the data contain the maximum number of symbols, which is 256, we obtain a
 * sample size bound by 8192.
 *
 * For a sample of at most 8KB of data per data range: 16 consecutive bytes
 * from up to 512 locations.
 */
#define MAX_SAMPLE_SIZE		(BTRFS_MAX_UNCOMPRESSED *		\
				 SAMPLING_READ_SIZE / SAMPLING_INTERVAL)

struct bucket_item {
	u32 count;
};

struct heuristic_ws {
	/* Partial copy of input data */
	u8 *sample;
	u32 sample_size;
	/* Buckets store counters for each byte value */
	struct bucket_item *bucket;
	/* Sorting buffer */
	struct bucket_item *bucket_b;
	struct list_head list;
};

static struct workspace_manager heuristic_wsm;

static void free_heuristic_ws(struct list_head *ws)
{
	struct heuristic_ws *workspace;

	workspace = list_entry(ws, struct heuristic_ws, list);

	kvfree(workspace->sample);
	kfree(workspace->bucket);
	kfree(workspace->bucket_b);
	kfree(workspace);
}

static struct list_head *alloc_heuristic_ws(unsigned int level)
{
	struct heuristic_ws *ws;

	ws = kzalloc(sizeof(*ws), GFP_KERNEL);
	if (!ws)
		return ERR_PTR(-ENOMEM);

	ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL);
	if (!ws->sample)
		goto fail;

	ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL);
	if (!ws->bucket)
		goto fail;

	ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL);
	if (!ws->bucket_b)
		goto fail;

	INIT_LIST_HEAD(&ws->list);
	return &ws->list;
fail:
	free_heuristic_ws(&ws->list);
	return ERR_PTR(-ENOMEM);
}

const struct btrfs_compress_op btrfs_heuristic_compress = {
	.workspace_manager = &heuristic_wsm,
};

static const struct btrfs_compress_op * const btrfs_compress_op[] = {
	/* The heuristic is represented as compression type 0 */
	&btrfs_heuristic_compress,
	&btrfs_zlib_compress,
	&btrfs_lzo_compress,
	&btrfs_zstd_compress,
};

static struct list_head *alloc_workspace(int type, unsigned int level)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level);
	case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
	case BTRFS_COMPRESS_LZO:  return lzo_alloc_workspace(level);
	case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

static void free_workspace(int type, struct list_head *ws)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return free_heuristic_ws(ws);
	case BTRFS_COMPRESS_ZLIB: return zlib_free_workspace(ws);
	case BTRFS_COMPRESS_LZO:  return lzo_free_workspace(ws);
	case BTRFS_COMPRESS_ZSTD: return zstd_free_workspace(ws);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

static void btrfs_init_workspace_manager(int type)
{
	struct workspace_manager *wsm;
	struct list_head *workspace;

	wsm = btrfs_compress_op[type]->workspace_manager;
	INIT_LIST_HEAD(&wsm->idle_ws);
	spin_lock_init(&wsm->ws_lock);
	atomic_set(&wsm->total_ws, 0);
	init_waitqueue_head(&wsm->ws_wait);

	/*
	 * Preallocate one workspace for each compression type so we can
	 * guarantee forward progress in the worst case
	 */
	workspace = alloc_workspace(type, 0);
	if (IS_ERR(workspace)) {
		pr_warn(
	"BTRFS: cannot preallocate compression workspace, will try later\n");
	} else {
		atomic_set(&wsm->total_ws, 1);
		wsm->free_ws = 1;
		list_add(workspace, &wsm->idle_ws);
	}
}

static void btrfs_cleanup_workspace_manager(int type)
{
	struct workspace_manager *wsman;
	struct list_head *ws;

	wsman = btrfs_compress_op[type]->workspace_manager;
	while (!list_empty(&wsman->idle_ws)) {
		ws = wsman->idle_ws.next;
		list_del(ws);
		free_workspace(type, ws);
		atomic_dec(&wsman->total_ws);
	}
}

/*
 * This finds an available workspace or allocates a new one.
 * If it's not possible to allocate a new one, waits until there's one.
 * Preallocation provides a forward progress guarantee and we do not return
 * errors.
 */
struct list_head *btrfs_get_workspace(int type, unsigned int level)
{
	struct workspace_manager *wsm;
	struct list_head *workspace;
	int cpus = num_online_cpus();
	unsigned nofs_flag;
	struct list_head *idle_ws;
	spinlock_t *ws_lock;
	atomic_t *total_ws;
	wait_queue_head_t *ws_wait;
	int *free_ws;

	wsm = btrfs_compress_op[type]->workspace_manager;
	idle_ws = &wsm->idle_ws;
	ws_lock = &wsm->ws_lock;
	total_ws = &wsm->total_ws;
	ws_wait = &wsm->ws_wait;
	free_ws = &wsm->free_ws;

again:
	spin_lock(ws_lock);
	if (!list_empty(idle_ws)) {
		workspace = idle_ws->next;
		list_del(workspace);
		(*free_ws)--;
		spin_unlock(ws_lock);
		return workspace;
	}
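	/*
	 * Limit the number of concurrently allocated workspaces to roughly
	 * the number of online CPUs: beyond that, wait for one to be
	 * released instead of allocating yet another one.
	 */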
	if (atomic_read(total_ws) > cpus) {
		DEFINE_WAIT(wait);

		spin_unlock(ws_lock);
		prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
		if (atomic_read(total_ws) > cpus && !*free_ws)
			schedule();
		finish_wait(ws_wait, &wait);
		goto again;
	}
	atomic_inc(total_ws);
	spin_unlock(ws_lock);

	/*
	 * Allocation helpers call vmalloc that can't use GFP_NOFS, so we have
	 * to turn it off here because we might get called from the restricted
	 * context of btrfs_compress_bio/btrfs_compress_pages
	 */
	nofs_flag = memalloc_nofs_save();
	workspace = alloc_workspace(type, level);
	memalloc_nofs_restore(nofs_flag);

	if (IS_ERR(workspace)) {
		atomic_dec(total_ws);
		wake_up(ws_wait);

		/*
		 * Do not return the error but go back to waiting. There's a
		 * workspace preallocated for each type and the compression
		 * time is bounded so we get to a workspace eventually. This
		 * makes our caller's life easier.
		 *
		 * To prevent silent and low-probability deadlocks (when the
		 * initial preallocation fails), check if there are any
		 * workspaces at all.
		 */
		if (atomic_read(total_ws) == 0) {
			static DEFINE_RATELIMIT_STATE(_rs,
					/* once per minute */ 60 * HZ,
					/* no burst */ 1);

			if (__ratelimit(&_rs)) {
				pr_warn("BTRFS: no compression workspaces, low memory, retrying\n");
			}
		}
		goto again;
	}
	return workspace;
}

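/*
 * zlib and zstd route through their own helpers (presumably so they can hand
 * out a workspace matching the requested level), while the heuristic and lzo
 * share the generic per-type manager above.
 */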
static struct list_head *get_workspace(int type, int level)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level);
	case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level);
	case BTRFS_COMPRESS_LZO:  return btrfs_get_workspace(type, level);
	case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

/*
 * Put a workspace struct back on the list or free it if we have enough
 * idle ones sitting around.
 */
void btrfs_put_workspace(int type, struct list_head *ws)
{
	struct workspace_manager *wsm;
	struct list_head *idle_ws;
	spinlock_t *ws_lock;
	atomic_t *total_ws;
	wait_queue_head_t *ws_wait;
	int *free_ws;

	wsm = btrfs_compress_op[type]->workspace_manager;
	idle_ws = &wsm->idle_ws;
	ws_lock = &wsm->ws_lock;
	total_ws = &wsm->total_ws;
	ws_wait = &wsm->ws_wait;
	free_ws = &wsm->free_ws;

	spin_lock(ws_lock);
	if (*free_ws <= num_online_cpus()) {
		list_add(ws, idle_ws);
		(*free_ws)++;
		spin_unlock(ws_lock);
		goto wake;
	}
	spin_unlock(ws_lock);

	free_workspace(type, ws);
	atomic_dec(total_ws);
wake:
	cond_wake_up(ws_wait);
}

static void put_workspace(int type, struct list_head *ws)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws);
	case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws);
	case BTRFS_COMPRESS_LZO:  return btrfs_put_workspace(type, ws);
	case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

/*
 * Adjust @level according to the limits of the compression algorithm or
 * fall back to the default.
 */
static unsigned int btrfs_compress_set_level(int type, unsigned level)
{
	const struct btrfs_compress_op *ops = btrfs_compress_op[type];

	if (level == 0)
		level = ops->default_level;
	else
		level = min(level, ops->max_level);

	return level;
}

/*
 * Given an address space and start and length, compress the bytes into @pages
 * that are allocated on demand.
 *
 * @type_level is encoded algorithm and level, where level 0 means whatever
 * default the algorithm chooses and is opaque here;
 * - compression algos are 0-3
 * - the level is in bits 4-7
 *
 * @out_pages is an in/out parameter, holds maximum number of pages to allocate
 * and returns number of actually allocated pages
 *
 * @total_in is used to return the number of bytes actually read.  It
 * may be smaller than the input length if we had to exit early because we
 * ran out of room in the pages array or because we crossed the
 * max_out threshold.
 *
 * @total_out is an in/out parameter, must be set to the input length and will
 * also be used to return the total number of compressed bytes.
 */
int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
			 u64 start, struct page **pages,
			 unsigned long *out_pages,
			 unsigned long *total_in,
			 unsigned long *total_out)
{
	int type = btrfs_compress_type(type_level);
	int level = btrfs_compress_level(type_level);
	struct list_head *workspace;
	int ret;

	level = btrfs_compress_set_level(type, level);
	workspace = get_workspace(type, level);
	ret = compression_compress_pages(type, workspace, mapping, start, pages,
					 out_pages, total_in, total_out);
	put_workspace(type, workspace);
	return ret;
}

static int btrfs_decompress_bio(struct compressed_bio *cb)
{
	struct list_head *workspace;
	int ret;
	int type = cb->compress_type;

	workspace = get_workspace(type, 0);
	ret = compression_decompress_bio(type, workspace, cb);
	put_workspace(type, workspace);

	return ret;
}

/*
 * A less complex decompression routine.  Our compressed data fits in a
 * single page, and we want to read a single page out of it.
 * start_byte tells us the offset into the compressed data we're interested in.
 */
int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
		     unsigned long start_byte, size_t srclen, size_t destlen)
{
	struct list_head *workspace;
	int ret;

	workspace = get_workspace(type, 0);
	ret = compression_decompress(type, workspace, data_in, dest_page,
				     start_byte, srclen, destlen);
	put_workspace(type, workspace);

	return ret;
}

void __init btrfs_init_compress(void)
{
	btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
	btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
	btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
	zstd_init_workspace_manager();
}

void __cold btrfs_exit_compress(void)
{
	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
	zstd_cleanup_workspace_manager();
}

/*
 * Copy decompressed data from working buffer to pages.
 *
 * @buf:		The decompressed data buffer
 * @buf_len:		The decompressed data length
 * @decompressed:	Number of bytes that are already decompressed inside the
 *			compressed extent
 * @cb:			The compressed extent descriptor
 * @orig_bio:		The original bio that the caller wants to read for
 *
 * An easier to understand graph is like below:
 *
 *		|<- orig_bio ->|     |<- orig_bio ->|
 *	|<-------      full decompressed extent      ----->|
 *	|<-----------    @cb range   ---->|
 *	|			|<-- @buf_len -->|
 *	|<--- @decompressed --->|
 *
 * Note that @cb can be a subpage of the full decompressed extent, but
 * @cb->start always has the same value as the orig_file_offset of the full
 * decompressed extent.
 *
 * When reading a compressed extent, we have to read the full compressed
 * extent, while @orig_bio may only want part of the range.
 * Thus this function will ensure only data covered by @orig_bio will be
 * copied.
 *
 * Return 0 if we have copied all needed contents for @orig_bio.
 * Return >0 if we need to continue decompressing.
 */
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
			      struct compressed_bio *cb, u32 decompressed)
{
	struct bio *orig_bio = cb->orig_bio;
	/* Offset inside the full decompressed extent */
	u32 cur_offset;

	cur_offset = decompressed;
	/* The main loop to do the copy */
	while (cur_offset < decompressed + buf_len) {
		struct bio_vec bvec;
		size_t copy_len;
		u32 copy_start;
		/* Offset inside the full decompressed extent */
		u32 bvec_offset;

		bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter);
		/*
		 * cb->start may underflow, but subtracting that value can
		 * still give us the correct offset inside the full
		 * decompressed extent.
		 */
		bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start;

		/* Haven't reached the bvec range, exit */
		if (decompressed + buf_len <= bvec_offset)
			return 1;

		copy_start = max(cur_offset, bvec_offset);
		copy_len = min(bvec_offset + bvec.bv_len,
			       decompressed + buf_len) - copy_start;
		ASSERT(copy_len);

		/*
		 * Extra range check to ensure we didn't go beyond
		 * @buf + @buf_len.
		 */
		ASSERT(copy_start - decompressed < buf_len);
		memcpy_to_page(bvec.bv_page, bvec.bv_offset,
			       buf + copy_start - decompressed, copy_len);
		flush_dcache_page(bvec.bv_page);
		cur_offset += copy_len;

		bio_advance(orig_bio, copy_len);
		/* Finished the bio */
		if (!orig_bio->bi_iter.bi_size)
			return 0;
	}
	return 1;
}

/*
 * Shannon Entropy calculation
 *
 * Pure byte distribution analysis fails to determine compressibility of data.
 * Try calculating entropy to estimate the average minimum number of bits
 * needed to encode the sampled data.
 *
 * For convenience, return the percentage of needed bits, instead of amount of
 * bits directly.
 *
 * @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy
 *			    and can be compressible with high probability
 *
 * @ENTROPY_LVL_HIGH - data are not compressible with high probability
 *
 * Use of ilog2() decreases precision, we lower the LVL to 5 to compensate.
 */
#define ENTROPY_LVL_ACEPTABLE		(65)
#define ENTROPY_LVL_HIGH		(80)

/*
 * For increased precision in shannon_entropy calculation,
 * let's do pow(n, M) to preserve more digits after the decimal point:
 *
 * - maximum int bit length is 64
 * - ilog2(MAX_SAMPLE_SIZE)	-> 13
 * - 13 * 4 = 52 < 64		-> M = 4
 *
 * So use pow(n, 4).
 */
static inline u32 ilog2_w(u64 n)
{
	return ilog2(n * n * n * n);
}

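/*
 * Since ilog2_w(n) is roughly 4 * log2(n), the sum below accumulates about
 * 4 * count * (log2(sample_size) - log2(count)) per bucket.  Dividing by the
 * sample size and scaling by 100 / entropy_max (with entropy_max being
 * 8 * ilog2_w(2) == 32) therefore returns the Shannon entropy as a percentage
 * of the 8 bits/byte maximum.  In the heuristic's call path the buckets have
 * already been sorted in descending order by byte_core_set_size(), so
 * stopping at the first empty bucket is safe.
 */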
static u32 shannon_entropy(struct heuristic_ws *ws)
{
	const u32 entropy_max = 8 * ilog2_w(2);
	u32 entropy_sum = 0;
	u32 p, p_base, sz_base;
	u32 i;

	sz_base = ilog2_w(ws->sample_size);
	for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) {
		p = ws->bucket[i].count;
		p_base = ilog2_w(p);
		entropy_sum += p * (sz_base - p_base);
	}

	entropy_sum /= ws->sample_size;
	return entropy_sum * 100 / entropy_max;
}

#define RADIX_BASE		4U
#define COUNTERS_SIZE		(1U << RADIX_BASE)

static u8 get4bits(u64 num, int shift)
{
	u8 low4bits;

	num >>= shift;
	/* Reverse order */
	low4bits = (COUNTERS_SIZE - 1) - (num % COUNTERS_SIZE);
	return low4bits;
}

/*
 * Use 4 bits as radix base
 * Use 16 u32 counters for calculating new position in buf array
 *
 * @array     - array that will be sorted
 * @array_buf - buffer array to store sorting results
 *              must be equal in size to @array
 * @num       - array size
 */
static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf,
		       int num)
{
	u64 max_num;
	u64 buf_num;
	u32 counters[COUNTERS_SIZE];
	u32 new_addr;
	u32 addr;
	int bitlen;
	int shift;
	int i;

	/*
	 * Try to avoid useless loop iterations for small numbers stored in big
	 * counters.  Example: 48 33 4 ... in a 64bit array
	 */
	max_num = array[0].count;
	for (i = 1; i < num; i++) {
		buf_num = array[i].count;
		if (buf_num > max_num)
			max_num = buf_num;
	}

	buf_num = ilog2(max_num);
	bitlen = ALIGN(buf_num, RADIX_BASE * 2);

	shift = 0;
	while (shift < bitlen) {
		memset(counters, 0, sizeof(counters));

		for (i = 0; i < num; i++) {
			buf_num = array[i].count;
			addr = get4bits(buf_num, shift);
			counters[addr]++;
		}

		for (i = 1; i < COUNTERS_SIZE; i++)
			counters[i] += counters[i - 1];

		for (i = num - 1; i >= 0; i--) {
			buf_num = array[i].count;
			addr = get4bits(buf_num, shift);
			counters[addr]--;
			new_addr = counters[addr];
			array_buf[new_addr] = array[i];
		}

		shift += RADIX_BASE;

		/*
		 * Normal radix expects to move data from a temporary array to
		 * the main one.  But that requires some CPU time.  Avoid that
		 * by doing another sort iteration on the original array
		 * instead of a memcpy().
		 */
		memset(counters, 0, sizeof(counters));

		for (i = 0; i < num; i++) {
			buf_num = array_buf[i].count;
			addr = get4bits(buf_num, shift);
			counters[addr]++;
		}

		for (i = 1; i < COUNTERS_SIZE; i++)
			counters[i] += counters[i - 1];

		for (i = num - 1; i >= 0; i--) {
			buf_num = array_buf[i].count;
			addr = get4bits(buf_num, shift);
			counters[addr]--;
			new_addr = counters[addr];
			array[new_addr] = array_buf[i];
		}

		shift += RADIX_BASE;
	}
}

/*
 * Size of the core byte set - how many bytes cover 90% of the sample.
 *
 * There are several types of structured binary data that use nearly all byte
 * values. The distribution can be uniform and counts in all buckets will be
 * nearly the same (eg. encrypted data). Unlikely to be compressible.
 *
 * Other possibility is normal (Gaussian) distribution, where the data could
 * be potentially compressible, but we have to take a few more steps to decide
 * how much.
 *
 * @BYTE_CORE_SET_LOW  - main part of byte values repeated frequently,
 *			 the compression algorithm can easily fix that
 * @BYTE_CORE_SET_HIGH - data have uniform distribution and with high
 *			 probability are not compressible
 */
#define BYTE_CORE_SET_LOW		(64)
#define BYTE_CORE_SET_HIGH		(200)

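/*
 * In other words: if at most 64 of the 256 possible byte values (25%) cover
 * 90% of the sample, the data is considered compressible; if it takes 200 or
 * more values (roughly 78%), the distribution is close to uniform and the
 * data is most likely not compressible.
 */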
static int byte_core_set_size(struct heuristic_ws *ws)
{
	u32 i;
	u32 coreset_sum = 0;
	const u32 core_set_threshold = ws->sample_size * 90 / 100;
	struct bucket_item *bucket = ws->bucket;

	/* Sort in reverse order */
	radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE);

	for (i = 0; i < BYTE_CORE_SET_LOW; i++)
		coreset_sum += bucket[i].count;

	if (coreset_sum > core_set_threshold)
		return i;

	for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) {
		coreset_sum += bucket[i].count;
		if (coreset_sum > core_set_threshold)
			break;
	}

	return i;
}

/*
 * Count byte values in buckets.
 * This heuristic can detect textual data (configs, xml, json, html, etc).
 * Because in most text-like data the byte set is restricted to a limited
 * number of possible characters, and that restriction in most cases makes
 * the data easy to compress.
 *
 * @BYTE_SET_THRESHOLD - consider all data within this byte set size:
 *	less - compressible
 *	more - need additional analysis
 */
#define BYTE_SET_THRESHOLD		(64)

static u32 byte_set_size(const struct heuristic_ws *ws)
{
	u32 i;
	u32 byte_set_size = 0;

	for (i = 0; i < BYTE_SET_THRESHOLD; i++) {
		if (ws->bucket[i].count > 0)
			byte_set_size++;
	}

	/*
	 * Continue collecting count of byte values in buckets.  If the byte
	 * set size is bigger than the threshold, it's pointless to continue,
	 * the detection technique would fail for this type of data.
	 */
	for (; i < BUCKET_SIZE; i++) {
		if (ws->bucket[i].count > 0) {
			byte_set_size++;
			if (byte_set_size > BYTE_SET_THRESHOLD)
				return byte_set_size;
		}
	}

	return byte_set_size;
}

static bool sample_repeated_patterns(struct heuristic_ws *ws)
{
	const u32 half_of_sample = ws->sample_size / 2;
	const u8 *data = ws->sample;

	return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0;
}

static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
				     struct heuristic_ws *ws)
{
	struct page *page;
	u64 index, index_end;
	u32 i, curr_sample_pos;
	u8 *in_data;

	/*
	 * Compression handles the input data by chunks of 128KiB
	 * (defined by BTRFS_MAX_UNCOMPRESSED)
	 *
	 * We do the same for the heuristic and loop over the whole range.
	 *
	 * MAX_SAMPLE_SIZE - calculated under the assumption that the heuristic
	 * will process no more than BTRFS_MAX_UNCOMPRESSED at a time.
	 */
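	/*
	 * Worked example with the constants above: a full 128KiB range gives
	 * 128KiB / SAMPLING_INTERVAL = 512 sampling positions, each copying
	 * SAMPLING_READ_SIZE = 16 bytes, i.e. 8KiB of sample data, which is
	 * exactly MAX_SAMPLE_SIZE.
	 */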
	if (end - start > BTRFS_MAX_UNCOMPRESSED)
		end = start + BTRFS_MAX_UNCOMPRESSED;

	index = start >> PAGE_SHIFT;
	index_end = end >> PAGE_SHIFT;

	/* Don't miss unaligned end */
	if (!IS_ALIGNED(end, PAGE_SIZE))
		index_end++;

	curr_sample_pos = 0;
	while (index < index_end) {
		page = find_get_page(inode->i_mapping, index);
		in_data = kmap_local_page(page);
		/* Handle case where the start is not aligned to PAGE_SIZE */
		i = start % PAGE_SIZE;
		while (i < PAGE_SIZE - SAMPLING_READ_SIZE) {
			/* Don't sample any garbage from the last page */
			if (start > end - SAMPLING_READ_SIZE)
				break;
			memcpy(&ws->sample[curr_sample_pos], &in_data[i],
			       SAMPLING_READ_SIZE);
			i += SAMPLING_INTERVAL;
			start += SAMPLING_INTERVAL;
			curr_sample_pos += SAMPLING_READ_SIZE;
		}
		kunmap_local(in_data);
		put_page(page);

		index++;
	}

	ws->sample_size = curr_sample_pos;
}

/*
 * Compression heuristic.
 *
 * For now it's a naive and optimistic 'return true'; we'll extend the logic
 * to quickly (compared to direct compression) detect data characteristics
 * (compressible/uncompressible) to avoid wasting CPU time on uncompressible
 * data.
 *
 * The following types of analysis can be performed:
 * - detect mostly zero data
 * - detect data with low "byte set" size (text, etc)
 * - detect data with low/high "core byte" set
 *
 * Return non-zero if the compression should be done, 0 otherwise.
 */
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
{
	struct list_head *ws_list = get_workspace(0, 0);
	struct heuristic_ws *ws;
	u32 i;
	u8 byte;
	int ret = 0;

	ws = list_entry(ws_list, struct heuristic_ws, list);

	heuristic_collect_sample(inode, start, end, ws);

	if (sample_repeated_patterns(ws)) {
		ret = 1;
		goto out;
	}

	memset(ws->bucket, 0, sizeof(*ws->bucket)*BUCKET_SIZE);

	for (i = 0; i < ws->sample_size; i++) {
		byte = ws->sample[i];
		ws->bucket[byte].count++;
	}

	i = byte_set_size(ws);
	if (i < BYTE_SET_THRESHOLD) {
		ret = 2;
		goto out;
	}

	i = byte_core_set_size(ws);
	if (i <= BYTE_CORE_SET_LOW) {
		ret = 3;
		goto out;
	}

	if (i >= BYTE_CORE_SET_HIGH) {
		ret = 0;
		goto out;
	}

	i = shannon_entropy(ws);
	if (i <= ENTROPY_LVL_ACEPTABLE) {
		ret = 4;
		goto out;
	}

	/*
	 * For the levels below ENTROPY_LVL_HIGH, additional analysis would be
	 * needed to give green light to compression.
	 *
	 * For now just assume that compression at that level is not worth the
	 * resources because:
	 *
	 * 1. it is possible to defrag the data later
	 *
	 * 2. the data would turn out to be hardly compressible, eg. 150 byte
	 * values, every bucket has a counter at level ~54. The heuristic would
	 * be confused. This can happen when data have some internal repeated
	 * patterns like "abbacbbc...". This can be detected by analyzing
	 * pairs of bytes, which is too costly.
	 */
	if (i < ENTROPY_LVL_HIGH) {
		ret = 5;
		goto out;
	} else {
		ret = 0;
		goto out;
	}

out:
	put_workspace(0, ws_list);
	return ret;
}

/*
 * Convert the compression suffix (eg. after "zlib", starting with ":") to
 * a level; an unrecognized string will set the default level.
 */
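/*
 * For example, the mount option "compress=zlib:3" ends up passing the suffix
 * ":3" here, which yields level 3, while a bare "zlib" (or an unparsable
 * suffix) falls back to the algorithm's default level.
 */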
unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
{
	unsigned int level = 0;
	int ret;

	if (!type)
		return 0;

	if (str[0] == ':') {
		ret = kstrtouint(str + 1, 10, &level);
		if (ret)
			level = 0;
	}

	level = btrfs_compress_set_level(type, level);

	return level;
}