1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (C) 2008 Oracle. All rights reserved. 4 */ 5 6 #include <linux/kernel.h> 7 #include <linux/bio.h> 8 #include <linux/file.h> 9 #include <linux/fs.h> 10 #include <linux/pagemap.h> 11 #include <linux/highmem.h> 12 #include <linux/time.h> 13 #include <linux/init.h> 14 #include <linux/string.h> 15 #include <linux/backing-dev.h> 16 #include <linux/writeback.h> 17 #include <linux/slab.h> 18 #include <linux/sched/mm.h> 19 #include <linux/log2.h> 20 #include <crypto/hash.h> 21 #include "ctree.h" 22 #include "disk-io.h" 23 #include "transaction.h" 24 #include "btrfs_inode.h" 25 #include "volumes.h" 26 #include "ordered-data.h" 27 #include "compression.h" 28 #include "extent_io.h" 29 #include "extent_map.h" 30 31 static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; 32 33 const char* btrfs_compress_type2str(enum btrfs_compression_type type) 34 { 35 switch (type) { 36 case BTRFS_COMPRESS_ZLIB: 37 case BTRFS_COMPRESS_LZO: 38 case BTRFS_COMPRESS_ZSTD: 39 case BTRFS_COMPRESS_NONE: 40 return btrfs_compress_types[type]; 41 } 42 43 return NULL; 44 } 45 46 bool btrfs_compress_is_valid_type(const char *str, size_t len) 47 { 48 int i; 49 50 for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) { 51 size_t comp_len = strlen(btrfs_compress_types[i]); 52 53 if (len < comp_len) 54 continue; 55 56 if (!strncmp(btrfs_compress_types[i], str, comp_len)) 57 return true; 58 } 59 return false; 60 } 61 62 static int btrfs_decompress_bio(struct compressed_bio *cb); 63 64 static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, 65 unsigned long disk_size) 66 { 67 u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); 68 69 return sizeof(struct compressed_bio) + 70 (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size; 71 } 72 73 static int check_compressed_csum(struct btrfs_inode *inode, 74 struct compressed_bio *cb, 75 u64 disk_start) 76 { 77 struct btrfs_fs_info *fs_info = inode->root->fs_info; 78 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 79 const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); 80 int ret; 81 struct page *page; 82 unsigned long i; 83 char *kaddr; 84 u8 csum[BTRFS_CSUM_SIZE]; 85 u8 *cb_sum = cb->sums; 86 87 if (inode->flags & BTRFS_INODE_NODATASUM) 88 return 0; 89 90 shash->tfm = fs_info->csum_shash; 91 92 for (i = 0; i < cb->nr_pages; i++) { 93 page = cb->compressed_pages[i]; 94 95 crypto_shash_init(shash); 96 kaddr = kmap_atomic(page); 97 crypto_shash_update(shash, kaddr, PAGE_SIZE); 98 kunmap_atomic(kaddr); 99 crypto_shash_final(shash, (u8 *)&csum); 100 101 if (memcmp(&csum, cb_sum, csum_size)) { 102 btrfs_print_data_csum_error(inode, disk_start, 103 csum, cb_sum, cb->mirror_num); 104 ret = -EIO; 105 goto fail; 106 } 107 cb_sum += csum_size; 108 109 } 110 ret = 0; 111 fail: 112 return ret; 113 } 114 115 /* when we finish reading compressed pages from the disk, we 116 * decompress them and then run the bio end_io routines on the 117 * decompressed pages (in the inode address space). 118 * 119 * This allows the checksumming and other IO error handling routines 120 * to work normally 121 * 122 * The compressed pages are freed here, and it must be run 123 * in process context 124 */ 125 static void end_compressed_bio_read(struct bio *bio) 126 { 127 struct compressed_bio *cb = bio->bi_private; 128 struct inode *inode; 129 struct page *page; 130 unsigned long index; 131 unsigned int mirror = btrfs_io_bio(bio)->mirror_num; 132 int ret = 0; 133 134 if (bio->bi_status) 135 cb->errors = 1; 136 137 /* if there are more bios still pending for this compressed 138 * extent, just exit 139 */ 140 if (!refcount_dec_and_test(&cb->pending_bios)) 141 goto out; 142 143 /* 144 * Record the correct mirror_num in cb->orig_bio so that 145 * read-repair can work properly. 146 */ 147 ASSERT(btrfs_io_bio(cb->orig_bio)); 148 btrfs_io_bio(cb->orig_bio)->mirror_num = mirror; 149 cb->mirror_num = mirror; 150 151 /* 152 * Some IO in this cb have failed, just skip checksum as there 153 * is no way it could be correct. 154 */ 155 if (cb->errors == 1) 156 goto csum_failed; 157 158 inode = cb->inode; 159 ret = check_compressed_csum(BTRFS_I(inode), cb, 160 (u64)bio->bi_iter.bi_sector << 9); 161 if (ret) 162 goto csum_failed; 163 164 /* ok, we're the last bio for this extent, lets start 165 * the decompression. 166 */ 167 ret = btrfs_decompress_bio(cb); 168 169 csum_failed: 170 if (ret) 171 cb->errors = 1; 172 173 /* release the compressed pages */ 174 index = 0; 175 for (index = 0; index < cb->nr_pages; index++) { 176 page = cb->compressed_pages[index]; 177 page->mapping = NULL; 178 put_page(page); 179 } 180 181 /* do io completion on the original bio */ 182 if (cb->errors) { 183 bio_io_error(cb->orig_bio); 184 } else { 185 struct bio_vec *bvec; 186 struct bvec_iter_all iter_all; 187 188 /* 189 * we have verified the checksum already, set page 190 * checked so the end_io handlers know about it 191 */ 192 ASSERT(!bio_flagged(bio, BIO_CLONED)); 193 bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) 194 SetPageChecked(bvec->bv_page); 195 196 bio_endio(cb->orig_bio); 197 } 198 199 /* finally free the cb struct */ 200 kfree(cb->compressed_pages); 201 kfree(cb); 202 out: 203 bio_put(bio); 204 } 205 206 /* 207 * Clear the writeback bits on all of the file 208 * pages for a compressed write 209 */ 210 static noinline void end_compressed_writeback(struct inode *inode, 211 const struct compressed_bio *cb) 212 { 213 unsigned long index = cb->start >> PAGE_SHIFT; 214 unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; 215 struct page *pages[16]; 216 unsigned long nr_pages = end_index - index + 1; 217 int i; 218 int ret; 219 220 if (cb->errors) 221 mapping_set_error(inode->i_mapping, -EIO); 222 223 while (nr_pages > 0) { 224 ret = find_get_pages_contig(inode->i_mapping, index, 225 min_t(unsigned long, 226 nr_pages, ARRAY_SIZE(pages)), pages); 227 if (ret == 0) { 228 nr_pages -= 1; 229 index += 1; 230 continue; 231 } 232 for (i = 0; i < ret; i++) { 233 if (cb->errors) 234 SetPageError(pages[i]); 235 end_page_writeback(pages[i]); 236 put_page(pages[i]); 237 } 238 nr_pages -= ret; 239 index += ret; 240 } 241 /* the inode may be gone now */ 242 } 243 244 /* 245 * do the cleanup once all the compressed pages hit the disk. 246 * This will clear writeback on the file pages and free the compressed 247 * pages. 248 * 249 * This also calls the writeback end hooks for the file pages so that 250 * metadata and checksums can be updated in the file. 251 */ 252 static void end_compressed_bio_write(struct bio *bio) 253 { 254 struct compressed_bio *cb = bio->bi_private; 255 struct inode *inode; 256 struct page *page; 257 unsigned long index; 258 259 if (bio->bi_status) 260 cb->errors = 1; 261 262 /* if there are more bios still pending for this compressed 263 * extent, just exit 264 */ 265 if (!refcount_dec_and_test(&cb->pending_bios)) 266 goto out; 267 268 /* ok, we're the last bio for this extent, step one is to 269 * call back into the FS and do all the end_io operations 270 */ 271 inode = cb->inode; 272 cb->compressed_pages[0]->mapping = cb->inode->i_mapping; 273 btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0], 274 cb->start, cb->start + cb->len - 1, 275 bio->bi_status == BLK_STS_OK); 276 cb->compressed_pages[0]->mapping = NULL; 277 278 end_compressed_writeback(inode, cb); 279 /* note, our inode could be gone now */ 280 281 /* 282 * release the compressed pages, these came from alloc_page and 283 * are not attached to the inode at all 284 */ 285 index = 0; 286 for (index = 0; index < cb->nr_pages; index++) { 287 page = cb->compressed_pages[index]; 288 page->mapping = NULL; 289 put_page(page); 290 } 291 292 /* finally free the cb struct */ 293 kfree(cb->compressed_pages); 294 kfree(cb); 295 out: 296 bio_put(bio); 297 } 298 299 /* 300 * worker function to build and submit bios for previously compressed pages. 301 * The corresponding pages in the inode should be marked for writeback 302 * and the compressed pages should have a reference on them for dropping 303 * when the IO is complete. 304 * 305 * This also checksums the file bytes and gets things ready for 306 * the end io hooks. 307 */ 308 blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, 309 unsigned long len, u64 disk_start, 310 unsigned long compressed_len, 311 struct page **compressed_pages, 312 unsigned long nr_pages, 313 unsigned int write_flags) 314 { 315 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 316 struct bio *bio = NULL; 317 struct compressed_bio *cb; 318 unsigned long bytes_left; 319 int pg_index = 0; 320 struct page *page; 321 u64 first_byte = disk_start; 322 struct block_device *bdev; 323 blk_status_t ret; 324 int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 325 326 WARN_ON(!PAGE_ALIGNED(start)); 327 cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); 328 if (!cb) 329 return BLK_STS_RESOURCE; 330 refcount_set(&cb->pending_bios, 0); 331 cb->errors = 0; 332 cb->inode = inode; 333 cb->start = start; 334 cb->len = len; 335 cb->mirror_num = 0; 336 cb->compressed_pages = compressed_pages; 337 cb->compressed_len = compressed_len; 338 cb->orig_bio = NULL; 339 cb->nr_pages = nr_pages; 340 341 bdev = fs_info->fs_devices->latest_bdev; 342 343 bio = btrfs_bio_alloc(first_byte); 344 bio_set_dev(bio, bdev); 345 bio->bi_opf = REQ_OP_WRITE | write_flags; 346 bio->bi_private = cb; 347 bio->bi_end_io = end_compressed_bio_write; 348 refcount_set(&cb->pending_bios, 1); 349 350 /* create and submit bios for the compressed pages */ 351 bytes_left = compressed_len; 352 for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { 353 int submit = 0; 354 355 page = compressed_pages[pg_index]; 356 page->mapping = inode->i_mapping; 357 if (bio->bi_iter.bi_size) 358 submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio, 359 0); 360 361 page->mapping = NULL; 362 if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) < 363 PAGE_SIZE) { 364 /* 365 * inc the count before we submit the bio so 366 * we know the end IO handler won't happen before 367 * we inc the count. Otherwise, the cb might get 368 * freed before we're done setting it up 369 */ 370 refcount_inc(&cb->pending_bios); 371 ret = btrfs_bio_wq_end_io(fs_info, bio, 372 BTRFS_WQ_ENDIO_DATA); 373 BUG_ON(ret); /* -ENOMEM */ 374 375 if (!skip_sum) { 376 ret = btrfs_csum_one_bio(inode, bio, start, 1); 377 BUG_ON(ret); /* -ENOMEM */ 378 } 379 380 ret = btrfs_map_bio(fs_info, bio, 0, 1); 381 if (ret) { 382 bio->bi_status = ret; 383 bio_endio(bio); 384 } 385 386 bio = btrfs_bio_alloc(first_byte); 387 bio_set_dev(bio, bdev); 388 bio->bi_opf = REQ_OP_WRITE | write_flags; 389 bio->bi_private = cb; 390 bio->bi_end_io = end_compressed_bio_write; 391 bio_add_page(bio, page, PAGE_SIZE, 0); 392 } 393 if (bytes_left < PAGE_SIZE) { 394 btrfs_info(fs_info, 395 "bytes left %lu compress len %lu nr %lu", 396 bytes_left, cb->compressed_len, cb->nr_pages); 397 } 398 bytes_left -= PAGE_SIZE; 399 first_byte += PAGE_SIZE; 400 cond_resched(); 401 } 402 403 ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); 404 BUG_ON(ret); /* -ENOMEM */ 405 406 if (!skip_sum) { 407 ret = btrfs_csum_one_bio(inode, bio, start, 1); 408 BUG_ON(ret); /* -ENOMEM */ 409 } 410 411 ret = btrfs_map_bio(fs_info, bio, 0, 1); 412 if (ret) { 413 bio->bi_status = ret; 414 bio_endio(bio); 415 } 416 417 return 0; 418 } 419 420 static u64 bio_end_offset(struct bio *bio) 421 { 422 struct bio_vec *last = bio_last_bvec_all(bio); 423 424 return page_offset(last->bv_page) + last->bv_len + last->bv_offset; 425 } 426 427 static noinline int add_ra_bio_pages(struct inode *inode, 428 u64 compressed_end, 429 struct compressed_bio *cb) 430 { 431 unsigned long end_index; 432 unsigned long pg_index; 433 u64 last_offset; 434 u64 isize = i_size_read(inode); 435 int ret; 436 struct page *page; 437 unsigned long nr_pages = 0; 438 struct extent_map *em; 439 struct address_space *mapping = inode->i_mapping; 440 struct extent_map_tree *em_tree; 441 struct extent_io_tree *tree; 442 u64 end; 443 int misses = 0; 444 445 last_offset = bio_end_offset(cb->orig_bio); 446 em_tree = &BTRFS_I(inode)->extent_tree; 447 tree = &BTRFS_I(inode)->io_tree; 448 449 if (isize == 0) 450 return 0; 451 452 end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; 453 454 while (last_offset < compressed_end) { 455 pg_index = last_offset >> PAGE_SHIFT; 456 457 if (pg_index > end_index) 458 break; 459 460 page = xa_load(&mapping->i_pages, pg_index); 461 if (page && !xa_is_value(page)) { 462 misses++; 463 if (misses > 4) 464 break; 465 goto next; 466 } 467 468 page = __page_cache_alloc(mapping_gfp_constraint(mapping, 469 ~__GFP_FS)); 470 if (!page) 471 break; 472 473 if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) { 474 put_page(page); 475 goto next; 476 } 477 478 end = last_offset + PAGE_SIZE - 1; 479 /* 480 * at this point, we have a locked page in the page cache 481 * for these bytes in the file. But, we have to make 482 * sure they map to this compressed extent on disk. 483 */ 484 set_page_extent_mapped(page); 485 lock_extent(tree, last_offset, end); 486 read_lock(&em_tree->lock); 487 em = lookup_extent_mapping(em_tree, last_offset, 488 PAGE_SIZE); 489 read_unlock(&em_tree->lock); 490 491 if (!em || last_offset < em->start || 492 (last_offset + PAGE_SIZE > extent_map_end(em)) || 493 (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) { 494 free_extent_map(em); 495 unlock_extent(tree, last_offset, end); 496 unlock_page(page); 497 put_page(page); 498 break; 499 } 500 free_extent_map(em); 501 502 if (page->index == end_index) { 503 char *userpage; 504 size_t zero_offset = offset_in_page(isize); 505 506 if (zero_offset) { 507 int zeros; 508 zeros = PAGE_SIZE - zero_offset; 509 userpage = kmap_atomic(page); 510 memset(userpage + zero_offset, 0, zeros); 511 flush_dcache_page(page); 512 kunmap_atomic(userpage); 513 } 514 } 515 516 ret = bio_add_page(cb->orig_bio, page, 517 PAGE_SIZE, 0); 518 519 if (ret == PAGE_SIZE) { 520 nr_pages++; 521 put_page(page); 522 } else { 523 unlock_extent(tree, last_offset, end); 524 unlock_page(page); 525 put_page(page); 526 break; 527 } 528 next: 529 last_offset += PAGE_SIZE; 530 } 531 return 0; 532 } 533 534 /* 535 * for a compressed read, the bio we get passed has all the inode pages 536 * in it. We don't actually do IO on those pages but allocate new ones 537 * to hold the compressed pages on disk. 538 * 539 * bio->bi_iter.bi_sector points to the compressed extent on disk 540 * bio->bi_io_vec points to all of the inode pages 541 * 542 * After the compressed pages are read, we copy the bytes into the 543 * bio we were passed and then call the bio end_io calls 544 */ 545 blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, 546 int mirror_num, unsigned long bio_flags) 547 { 548 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); 549 struct extent_map_tree *em_tree; 550 struct compressed_bio *cb; 551 unsigned long compressed_len; 552 unsigned long nr_pages; 553 unsigned long pg_index; 554 struct page *page; 555 struct block_device *bdev; 556 struct bio *comp_bio; 557 u64 cur_disk_byte = (u64)bio->bi_iter.bi_sector << 9; 558 u64 em_len; 559 u64 em_start; 560 struct extent_map *em; 561 blk_status_t ret = BLK_STS_RESOURCE; 562 int faili = 0; 563 const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); 564 u8 *sums; 565 566 em_tree = &BTRFS_I(inode)->extent_tree; 567 568 /* we need the actual starting offset of this extent in the file */ 569 read_lock(&em_tree->lock); 570 em = lookup_extent_mapping(em_tree, 571 page_offset(bio_first_page_all(bio)), 572 PAGE_SIZE); 573 read_unlock(&em_tree->lock); 574 if (!em) 575 return BLK_STS_IOERR; 576 577 compressed_len = em->block_len; 578 cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); 579 if (!cb) 580 goto out; 581 582 refcount_set(&cb->pending_bios, 0); 583 cb->errors = 0; 584 cb->inode = inode; 585 cb->mirror_num = mirror_num; 586 sums = cb->sums; 587 588 cb->start = em->orig_start; 589 em_len = em->len; 590 em_start = em->start; 591 592 free_extent_map(em); 593 em = NULL; 594 595 cb->len = bio->bi_iter.bi_size; 596 cb->compressed_len = compressed_len; 597 cb->compress_type = extent_compress_type(bio_flags); 598 cb->orig_bio = bio; 599 600 nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); 601 cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *), 602 GFP_NOFS); 603 if (!cb->compressed_pages) 604 goto fail1; 605 606 bdev = fs_info->fs_devices->latest_bdev; 607 608 for (pg_index = 0; pg_index < nr_pages; pg_index++) { 609 cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS | 610 __GFP_HIGHMEM); 611 if (!cb->compressed_pages[pg_index]) { 612 faili = pg_index - 1; 613 ret = BLK_STS_RESOURCE; 614 goto fail2; 615 } 616 } 617 faili = nr_pages - 1; 618 cb->nr_pages = nr_pages; 619 620 add_ra_bio_pages(inode, em_start + em_len, cb); 621 622 /* include any pages we added in add_ra-bio_pages */ 623 cb->len = bio->bi_iter.bi_size; 624 625 comp_bio = btrfs_bio_alloc(cur_disk_byte); 626 bio_set_dev(comp_bio, bdev); 627 comp_bio->bi_opf = REQ_OP_READ; 628 comp_bio->bi_private = cb; 629 comp_bio->bi_end_io = end_compressed_bio_read; 630 refcount_set(&cb->pending_bios, 1); 631 632 for (pg_index = 0; pg_index < nr_pages; pg_index++) { 633 int submit = 0; 634 635 page = cb->compressed_pages[pg_index]; 636 page->mapping = inode->i_mapping; 637 page->index = em_start >> PAGE_SHIFT; 638 639 if (comp_bio->bi_iter.bi_size) 640 submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, 641 comp_bio, 0); 642 643 page->mapping = NULL; 644 if (submit || bio_add_page(comp_bio, page, PAGE_SIZE, 0) < 645 PAGE_SIZE) { 646 unsigned int nr_sectors; 647 648 ret = btrfs_bio_wq_end_io(fs_info, comp_bio, 649 BTRFS_WQ_ENDIO_DATA); 650 BUG_ON(ret); /* -ENOMEM */ 651 652 /* 653 * inc the count before we submit the bio so 654 * we know the end IO handler won't happen before 655 * we inc the count. Otherwise, the cb might get 656 * freed before we're done setting it up 657 */ 658 refcount_inc(&cb->pending_bios); 659 660 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 661 ret = btrfs_lookup_bio_sums(inode, comp_bio, 662 sums); 663 BUG_ON(ret); /* -ENOMEM */ 664 } 665 666 nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size, 667 fs_info->sectorsize); 668 sums += csum_size * nr_sectors; 669 670 ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); 671 if (ret) { 672 comp_bio->bi_status = ret; 673 bio_endio(comp_bio); 674 } 675 676 comp_bio = btrfs_bio_alloc(cur_disk_byte); 677 bio_set_dev(comp_bio, bdev); 678 comp_bio->bi_opf = REQ_OP_READ; 679 comp_bio->bi_private = cb; 680 comp_bio->bi_end_io = end_compressed_bio_read; 681 682 bio_add_page(comp_bio, page, PAGE_SIZE, 0); 683 } 684 cur_disk_byte += PAGE_SIZE; 685 } 686 687 ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA); 688 BUG_ON(ret); /* -ENOMEM */ 689 690 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 691 ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); 692 BUG_ON(ret); /* -ENOMEM */ 693 } 694 695 ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); 696 if (ret) { 697 comp_bio->bi_status = ret; 698 bio_endio(comp_bio); 699 } 700 701 return 0; 702 703 fail2: 704 while (faili >= 0) { 705 __free_page(cb->compressed_pages[faili]); 706 faili--; 707 } 708 709 kfree(cb->compressed_pages); 710 fail1: 711 kfree(cb); 712 out: 713 free_extent_map(em); 714 return ret; 715 } 716 717 /* 718 * Heuristic uses systematic sampling to collect data from the input data 719 * range, the logic can be tuned by the following constants: 720 * 721 * @SAMPLING_READ_SIZE - how many bytes will be copied from for each sample 722 * @SAMPLING_INTERVAL - range from which the sampled data can be collected 723 */ 724 #define SAMPLING_READ_SIZE (16) 725 #define SAMPLING_INTERVAL (256) 726 727 /* 728 * For statistical analysis of the input data we consider bytes that form a 729 * Galois Field of 256 objects. Each object has an attribute count, ie. how 730 * many times the object appeared in the sample. 731 */ 732 #define BUCKET_SIZE (256) 733 734 /* 735 * The size of the sample is based on a statistical sampling rule of thumb. 736 * The common way is to perform sampling tests as long as the number of 737 * elements in each cell is at least 5. 738 * 739 * Instead of 5, we choose 32 to obtain more accurate results. 740 * If the data contain the maximum number of symbols, which is 256, we obtain a 741 * sample size bound by 8192. 742 * 743 * For a sample of at most 8KB of data per data range: 16 consecutive bytes 744 * from up to 512 locations. 745 */ 746 #define MAX_SAMPLE_SIZE (BTRFS_MAX_UNCOMPRESSED * \ 747 SAMPLING_READ_SIZE / SAMPLING_INTERVAL) 748 749 struct bucket_item { 750 u32 count; 751 }; 752 753 struct heuristic_ws { 754 /* Partial copy of input data */ 755 u8 *sample; 756 u32 sample_size; 757 /* Buckets store counters for each byte value */ 758 struct bucket_item *bucket; 759 /* Sorting buffer */ 760 struct bucket_item *bucket_b; 761 struct list_head list; 762 }; 763 764 static struct workspace_manager heuristic_wsm; 765 766 static void heuristic_init_workspace_manager(void) 767 { 768 btrfs_init_workspace_manager(&heuristic_wsm, &btrfs_heuristic_compress); 769 } 770 771 static void heuristic_cleanup_workspace_manager(void) 772 { 773 btrfs_cleanup_workspace_manager(&heuristic_wsm); 774 } 775 776 static struct list_head *heuristic_get_workspace(unsigned int level) 777 { 778 return btrfs_get_workspace(&heuristic_wsm, level); 779 } 780 781 static void heuristic_put_workspace(struct list_head *ws) 782 { 783 btrfs_put_workspace(&heuristic_wsm, ws); 784 } 785 786 static void free_heuristic_ws(struct list_head *ws) 787 { 788 struct heuristic_ws *workspace; 789 790 workspace = list_entry(ws, struct heuristic_ws, list); 791 792 kvfree(workspace->sample); 793 kfree(workspace->bucket); 794 kfree(workspace->bucket_b); 795 kfree(workspace); 796 } 797 798 static struct list_head *alloc_heuristic_ws(unsigned int level) 799 { 800 struct heuristic_ws *ws; 801 802 ws = kzalloc(sizeof(*ws), GFP_KERNEL); 803 if (!ws) 804 return ERR_PTR(-ENOMEM); 805 806 ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL); 807 if (!ws->sample) 808 goto fail; 809 810 ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL); 811 if (!ws->bucket) 812 goto fail; 813 814 ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL); 815 if (!ws->bucket_b) 816 goto fail; 817 818 INIT_LIST_HEAD(&ws->list); 819 return &ws->list; 820 fail: 821 free_heuristic_ws(&ws->list); 822 return ERR_PTR(-ENOMEM); 823 } 824 825 const struct btrfs_compress_op btrfs_heuristic_compress = { 826 .init_workspace_manager = heuristic_init_workspace_manager, 827 .cleanup_workspace_manager = heuristic_cleanup_workspace_manager, 828 .get_workspace = heuristic_get_workspace, 829 .put_workspace = heuristic_put_workspace, 830 .alloc_workspace = alloc_heuristic_ws, 831 .free_workspace = free_heuristic_ws, 832 }; 833 834 static const struct btrfs_compress_op * const btrfs_compress_op[] = { 835 /* The heuristic is represented as compression type 0 */ 836 &btrfs_heuristic_compress, 837 &btrfs_zlib_compress, 838 &btrfs_lzo_compress, 839 &btrfs_zstd_compress, 840 }; 841 842 void btrfs_init_workspace_manager(struct workspace_manager *wsm, 843 const struct btrfs_compress_op *ops) 844 { 845 struct list_head *workspace; 846 847 wsm->ops = ops; 848 849 INIT_LIST_HEAD(&wsm->idle_ws); 850 spin_lock_init(&wsm->ws_lock); 851 atomic_set(&wsm->total_ws, 0); 852 init_waitqueue_head(&wsm->ws_wait); 853 854 /* 855 * Preallocate one workspace for each compression type so we can 856 * guarantee forward progress in the worst case 857 */ 858 workspace = wsm->ops->alloc_workspace(0); 859 if (IS_ERR(workspace)) { 860 pr_warn( 861 "BTRFS: cannot preallocate compression workspace, will try later\n"); 862 } else { 863 atomic_set(&wsm->total_ws, 1); 864 wsm->free_ws = 1; 865 list_add(workspace, &wsm->idle_ws); 866 } 867 } 868 869 void btrfs_cleanup_workspace_manager(struct workspace_manager *wsman) 870 { 871 struct list_head *ws; 872 873 while (!list_empty(&wsman->idle_ws)) { 874 ws = wsman->idle_ws.next; 875 list_del(ws); 876 wsman->ops->free_workspace(ws); 877 atomic_dec(&wsman->total_ws); 878 } 879 } 880 881 /* 882 * This finds an available workspace or allocates a new one. 883 * If it's not possible to allocate a new one, waits until there's one. 884 * Preallocation makes a forward progress guarantees and we do not return 885 * errors. 886 */ 887 struct list_head *btrfs_get_workspace(struct workspace_manager *wsm, 888 unsigned int level) 889 { 890 struct list_head *workspace; 891 int cpus = num_online_cpus(); 892 unsigned nofs_flag; 893 struct list_head *idle_ws; 894 spinlock_t *ws_lock; 895 atomic_t *total_ws; 896 wait_queue_head_t *ws_wait; 897 int *free_ws; 898 899 idle_ws = &wsm->idle_ws; 900 ws_lock = &wsm->ws_lock; 901 total_ws = &wsm->total_ws; 902 ws_wait = &wsm->ws_wait; 903 free_ws = &wsm->free_ws; 904 905 again: 906 spin_lock(ws_lock); 907 if (!list_empty(idle_ws)) { 908 workspace = idle_ws->next; 909 list_del(workspace); 910 (*free_ws)--; 911 spin_unlock(ws_lock); 912 return workspace; 913 914 } 915 if (atomic_read(total_ws) > cpus) { 916 DEFINE_WAIT(wait); 917 918 spin_unlock(ws_lock); 919 prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE); 920 if (atomic_read(total_ws) > cpus && !*free_ws) 921 schedule(); 922 finish_wait(ws_wait, &wait); 923 goto again; 924 } 925 atomic_inc(total_ws); 926 spin_unlock(ws_lock); 927 928 /* 929 * Allocation helpers call vmalloc that can't use GFP_NOFS, so we have 930 * to turn it off here because we might get called from the restricted 931 * context of btrfs_compress_bio/btrfs_compress_pages 932 */ 933 nofs_flag = memalloc_nofs_save(); 934 workspace = wsm->ops->alloc_workspace(level); 935 memalloc_nofs_restore(nofs_flag); 936 937 if (IS_ERR(workspace)) { 938 atomic_dec(total_ws); 939 wake_up(ws_wait); 940 941 /* 942 * Do not return the error but go back to waiting. There's a 943 * workspace preallocated for each type and the compression 944 * time is bounded so we get to a workspace eventually. This 945 * makes our caller's life easier. 946 * 947 * To prevent silent and low-probability deadlocks (when the 948 * initial preallocation fails), check if there are any 949 * workspaces at all. 950 */ 951 if (atomic_read(total_ws) == 0) { 952 static DEFINE_RATELIMIT_STATE(_rs, 953 /* once per minute */ 60 * HZ, 954 /* no burst */ 1); 955 956 if (__ratelimit(&_rs)) { 957 pr_warn("BTRFS: no compression workspaces, low memory, retrying\n"); 958 } 959 } 960 goto again; 961 } 962 return workspace; 963 } 964 965 static struct list_head *get_workspace(int type, int level) 966 { 967 return btrfs_compress_op[type]->get_workspace(level); 968 } 969 970 /* 971 * put a workspace struct back on the list or free it if we have enough 972 * idle ones sitting around 973 */ 974 void btrfs_put_workspace(struct workspace_manager *wsm, struct list_head *ws) 975 { 976 struct list_head *idle_ws; 977 spinlock_t *ws_lock; 978 atomic_t *total_ws; 979 wait_queue_head_t *ws_wait; 980 int *free_ws; 981 982 idle_ws = &wsm->idle_ws; 983 ws_lock = &wsm->ws_lock; 984 total_ws = &wsm->total_ws; 985 ws_wait = &wsm->ws_wait; 986 free_ws = &wsm->free_ws; 987 988 spin_lock(ws_lock); 989 if (*free_ws <= num_online_cpus()) { 990 list_add(ws, idle_ws); 991 (*free_ws)++; 992 spin_unlock(ws_lock); 993 goto wake; 994 } 995 spin_unlock(ws_lock); 996 997 wsm->ops->free_workspace(ws); 998 atomic_dec(total_ws); 999 wake: 1000 cond_wake_up(ws_wait); 1001 } 1002 1003 static void put_workspace(int type, struct list_head *ws) 1004 { 1005 return btrfs_compress_op[type]->put_workspace(ws); 1006 } 1007 1008 /* 1009 * Given an address space and start and length, compress the bytes into @pages 1010 * that are allocated on demand. 1011 * 1012 * @type_level is encoded algorithm and level, where level 0 means whatever 1013 * default the algorithm chooses and is opaque here; 1014 * - compression algo are 0-3 1015 * - the level are bits 4-7 1016 * 1017 * @out_pages is an in/out parameter, holds maximum number of pages to allocate 1018 * and returns number of actually allocated pages 1019 * 1020 * @total_in is used to return the number of bytes actually read. It 1021 * may be smaller than the input length if we had to exit early because we 1022 * ran out of room in the pages array or because we cross the 1023 * max_out threshold. 1024 * 1025 * @total_out is an in/out parameter, must be set to the input length and will 1026 * be also used to return the total number of compressed bytes 1027 * 1028 * @max_out tells us the max number of bytes that we're allowed to 1029 * stuff into pages 1030 */ 1031 int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, 1032 u64 start, struct page **pages, 1033 unsigned long *out_pages, 1034 unsigned long *total_in, 1035 unsigned long *total_out) 1036 { 1037 int type = btrfs_compress_type(type_level); 1038 int level = btrfs_compress_level(type_level); 1039 struct list_head *workspace; 1040 int ret; 1041 1042 level = btrfs_compress_op[type]->set_level(level); 1043 workspace = get_workspace(type, level); 1044 ret = btrfs_compress_op[type]->compress_pages(workspace, mapping, 1045 start, pages, 1046 out_pages, 1047 total_in, total_out); 1048 put_workspace(type, workspace); 1049 return ret; 1050 } 1051 1052 /* 1053 * pages_in is an array of pages with compressed data. 1054 * 1055 * disk_start is the starting logical offset of this array in the file 1056 * 1057 * orig_bio contains the pages from the file that we want to decompress into 1058 * 1059 * srclen is the number of bytes in pages_in 1060 * 1061 * The basic idea is that we have a bio that was created by readpages. 1062 * The pages in the bio are for the uncompressed data, and they may not 1063 * be contiguous. They all correspond to the range of bytes covered by 1064 * the compressed extent. 1065 */ 1066 static int btrfs_decompress_bio(struct compressed_bio *cb) 1067 { 1068 struct list_head *workspace; 1069 int ret; 1070 int type = cb->compress_type; 1071 1072 workspace = get_workspace(type, 0); 1073 ret = btrfs_compress_op[type]->decompress_bio(workspace, cb); 1074 put_workspace(type, workspace); 1075 1076 return ret; 1077 } 1078 1079 /* 1080 * a less complex decompression routine. Our compressed data fits in a 1081 * single page, and we want to read a single page out of it. 1082 * start_byte tells us the offset into the compressed data we're interested in 1083 */ 1084 int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, 1085 unsigned long start_byte, size_t srclen, size_t destlen) 1086 { 1087 struct list_head *workspace; 1088 int ret; 1089 1090 workspace = get_workspace(type, 0); 1091 ret = btrfs_compress_op[type]->decompress(workspace, data_in, 1092 dest_page, start_byte, 1093 srclen, destlen); 1094 put_workspace(type, workspace); 1095 1096 return ret; 1097 } 1098 1099 void __init btrfs_init_compress(void) 1100 { 1101 int i; 1102 1103 for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++) 1104 btrfs_compress_op[i]->init_workspace_manager(); 1105 } 1106 1107 void __cold btrfs_exit_compress(void) 1108 { 1109 int i; 1110 1111 for (i = 0; i < BTRFS_NR_WORKSPACE_MANAGERS; i++) 1112 btrfs_compress_op[i]->cleanup_workspace_manager(); 1113 } 1114 1115 /* 1116 * Copy uncompressed data from working buffer to pages. 1117 * 1118 * buf_start is the byte offset we're of the start of our workspace buffer. 1119 * 1120 * total_out is the last byte of the buffer 1121 */ 1122 int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, 1123 unsigned long total_out, u64 disk_start, 1124 struct bio *bio) 1125 { 1126 unsigned long buf_offset; 1127 unsigned long current_buf_start; 1128 unsigned long start_byte; 1129 unsigned long prev_start_byte; 1130 unsigned long working_bytes = total_out - buf_start; 1131 unsigned long bytes; 1132 char *kaddr; 1133 struct bio_vec bvec = bio_iter_iovec(bio, bio->bi_iter); 1134 1135 /* 1136 * start byte is the first byte of the page we're currently 1137 * copying into relative to the start of the compressed data. 1138 */ 1139 start_byte = page_offset(bvec.bv_page) - disk_start; 1140 1141 /* we haven't yet hit data corresponding to this page */ 1142 if (total_out <= start_byte) 1143 return 1; 1144 1145 /* 1146 * the start of the data we care about is offset into 1147 * the middle of our working buffer 1148 */ 1149 if (total_out > start_byte && buf_start < start_byte) { 1150 buf_offset = start_byte - buf_start; 1151 working_bytes -= buf_offset; 1152 } else { 1153 buf_offset = 0; 1154 } 1155 current_buf_start = buf_start; 1156 1157 /* copy bytes from the working buffer into the pages */ 1158 while (working_bytes > 0) { 1159 bytes = min_t(unsigned long, bvec.bv_len, 1160 PAGE_SIZE - buf_offset); 1161 bytes = min(bytes, working_bytes); 1162 1163 kaddr = kmap_atomic(bvec.bv_page); 1164 memcpy(kaddr + bvec.bv_offset, buf + buf_offset, bytes); 1165 kunmap_atomic(kaddr); 1166 flush_dcache_page(bvec.bv_page); 1167 1168 buf_offset += bytes; 1169 working_bytes -= bytes; 1170 current_buf_start += bytes; 1171 1172 /* check if we need to pick another page */ 1173 bio_advance(bio, bytes); 1174 if (!bio->bi_iter.bi_size) 1175 return 0; 1176 bvec = bio_iter_iovec(bio, bio->bi_iter); 1177 prev_start_byte = start_byte; 1178 start_byte = page_offset(bvec.bv_page) - disk_start; 1179 1180 /* 1181 * We need to make sure we're only adjusting 1182 * our offset into compression working buffer when 1183 * we're switching pages. Otherwise we can incorrectly 1184 * keep copying when we were actually done. 1185 */ 1186 if (start_byte != prev_start_byte) { 1187 /* 1188 * make sure our new page is covered by this 1189 * working buffer 1190 */ 1191 if (total_out <= start_byte) 1192 return 1; 1193 1194 /* 1195 * the next page in the biovec might not be adjacent 1196 * to the last page, but it might still be found 1197 * inside this working buffer. bump our offset pointer 1198 */ 1199 if (total_out > start_byte && 1200 current_buf_start < start_byte) { 1201 buf_offset = start_byte - buf_start; 1202 working_bytes = total_out - start_byte; 1203 current_buf_start = buf_start + buf_offset; 1204 } 1205 } 1206 } 1207 1208 return 1; 1209 } 1210 1211 /* 1212 * Shannon Entropy calculation 1213 * 1214 * Pure byte distribution analysis fails to determine compressibility of data. 1215 * Try calculating entropy to estimate the average minimum number of bits 1216 * needed to encode the sampled data. 1217 * 1218 * For convenience, return the percentage of needed bits, instead of amount of 1219 * bits directly. 1220 * 1221 * @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy 1222 * and can be compressible with high probability 1223 * 1224 * @ENTROPY_LVL_HIGH - data are not compressible with high probability 1225 * 1226 * Use of ilog2() decreases precision, we lower the LVL to 5 to compensate. 1227 */ 1228 #define ENTROPY_LVL_ACEPTABLE (65) 1229 #define ENTROPY_LVL_HIGH (80) 1230 1231 /* 1232 * For increasead precision in shannon_entropy calculation, 1233 * let's do pow(n, M) to save more digits after comma: 1234 * 1235 * - maximum int bit length is 64 1236 * - ilog2(MAX_SAMPLE_SIZE) -> 13 1237 * - 13 * 4 = 52 < 64 -> M = 4 1238 * 1239 * So use pow(n, 4). 1240 */ 1241 static inline u32 ilog2_w(u64 n) 1242 { 1243 return ilog2(n * n * n * n); 1244 } 1245 1246 static u32 shannon_entropy(struct heuristic_ws *ws) 1247 { 1248 const u32 entropy_max = 8 * ilog2_w(2); 1249 u32 entropy_sum = 0; 1250 u32 p, p_base, sz_base; 1251 u32 i; 1252 1253 sz_base = ilog2_w(ws->sample_size); 1254 for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) { 1255 p = ws->bucket[i].count; 1256 p_base = ilog2_w(p); 1257 entropy_sum += p * (sz_base - p_base); 1258 } 1259 1260 entropy_sum /= ws->sample_size; 1261 return entropy_sum * 100 / entropy_max; 1262 } 1263 1264 #define RADIX_BASE 4U 1265 #define COUNTERS_SIZE (1U << RADIX_BASE) 1266 1267 static u8 get4bits(u64 num, int shift) { 1268 u8 low4bits; 1269 1270 num >>= shift; 1271 /* Reverse order */ 1272 low4bits = (COUNTERS_SIZE - 1) - (num % COUNTERS_SIZE); 1273 return low4bits; 1274 } 1275 1276 /* 1277 * Use 4 bits as radix base 1278 * Use 16 u32 counters for calculating new position in buf array 1279 * 1280 * @array - array that will be sorted 1281 * @array_buf - buffer array to store sorting results 1282 * must be equal in size to @array 1283 * @num - array size 1284 */ 1285 static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf, 1286 int num) 1287 { 1288 u64 max_num; 1289 u64 buf_num; 1290 u32 counters[COUNTERS_SIZE]; 1291 u32 new_addr; 1292 u32 addr; 1293 int bitlen; 1294 int shift; 1295 int i; 1296 1297 /* 1298 * Try avoid useless loop iterations for small numbers stored in big 1299 * counters. Example: 48 33 4 ... in 64bit array 1300 */ 1301 max_num = array[0].count; 1302 for (i = 1; i < num; i++) { 1303 buf_num = array[i].count; 1304 if (buf_num > max_num) 1305 max_num = buf_num; 1306 } 1307 1308 buf_num = ilog2(max_num); 1309 bitlen = ALIGN(buf_num, RADIX_BASE * 2); 1310 1311 shift = 0; 1312 while (shift < bitlen) { 1313 memset(counters, 0, sizeof(counters)); 1314 1315 for (i = 0; i < num; i++) { 1316 buf_num = array[i].count; 1317 addr = get4bits(buf_num, shift); 1318 counters[addr]++; 1319 } 1320 1321 for (i = 1; i < COUNTERS_SIZE; i++) 1322 counters[i] += counters[i - 1]; 1323 1324 for (i = num - 1; i >= 0; i--) { 1325 buf_num = array[i].count; 1326 addr = get4bits(buf_num, shift); 1327 counters[addr]--; 1328 new_addr = counters[addr]; 1329 array_buf[new_addr] = array[i]; 1330 } 1331 1332 shift += RADIX_BASE; 1333 1334 /* 1335 * Normal radix expects to move data from a temporary array, to 1336 * the main one. But that requires some CPU time. Avoid that 1337 * by doing another sort iteration to original array instead of 1338 * memcpy() 1339 */ 1340 memset(counters, 0, sizeof(counters)); 1341 1342 for (i = 0; i < num; i ++) { 1343 buf_num = array_buf[i].count; 1344 addr = get4bits(buf_num, shift); 1345 counters[addr]++; 1346 } 1347 1348 for (i = 1; i < COUNTERS_SIZE; i++) 1349 counters[i] += counters[i - 1]; 1350 1351 for (i = num - 1; i >= 0; i--) { 1352 buf_num = array_buf[i].count; 1353 addr = get4bits(buf_num, shift); 1354 counters[addr]--; 1355 new_addr = counters[addr]; 1356 array[new_addr] = array_buf[i]; 1357 } 1358 1359 shift += RADIX_BASE; 1360 } 1361 } 1362 1363 /* 1364 * Size of the core byte set - how many bytes cover 90% of the sample 1365 * 1366 * There are several types of structured binary data that use nearly all byte 1367 * values. The distribution can be uniform and counts in all buckets will be 1368 * nearly the same (eg. encrypted data). Unlikely to be compressible. 1369 * 1370 * Other possibility is normal (Gaussian) distribution, where the data could 1371 * be potentially compressible, but we have to take a few more steps to decide 1372 * how much. 1373 * 1374 * @BYTE_CORE_SET_LOW - main part of byte values repeated frequently, 1375 * compression algo can easy fix that 1376 * @BYTE_CORE_SET_HIGH - data have uniform distribution and with high 1377 * probability is not compressible 1378 */ 1379 #define BYTE_CORE_SET_LOW (64) 1380 #define BYTE_CORE_SET_HIGH (200) 1381 1382 static int byte_core_set_size(struct heuristic_ws *ws) 1383 { 1384 u32 i; 1385 u32 coreset_sum = 0; 1386 const u32 core_set_threshold = ws->sample_size * 90 / 100; 1387 struct bucket_item *bucket = ws->bucket; 1388 1389 /* Sort in reverse order */ 1390 radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE); 1391 1392 for (i = 0; i < BYTE_CORE_SET_LOW; i++) 1393 coreset_sum += bucket[i].count; 1394 1395 if (coreset_sum > core_set_threshold) 1396 return i; 1397 1398 for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) { 1399 coreset_sum += bucket[i].count; 1400 if (coreset_sum > core_set_threshold) 1401 break; 1402 } 1403 1404 return i; 1405 } 1406 1407 /* 1408 * Count byte values in buckets. 1409 * This heuristic can detect textual data (configs, xml, json, html, etc). 1410 * Because in most text-like data byte set is restricted to limited number of 1411 * possible characters, and that restriction in most cases makes data easy to 1412 * compress. 1413 * 1414 * @BYTE_SET_THRESHOLD - consider all data within this byte set size: 1415 * less - compressible 1416 * more - need additional analysis 1417 */ 1418 #define BYTE_SET_THRESHOLD (64) 1419 1420 static u32 byte_set_size(const struct heuristic_ws *ws) 1421 { 1422 u32 i; 1423 u32 byte_set_size = 0; 1424 1425 for (i = 0; i < BYTE_SET_THRESHOLD; i++) { 1426 if (ws->bucket[i].count > 0) 1427 byte_set_size++; 1428 } 1429 1430 /* 1431 * Continue collecting count of byte values in buckets. If the byte 1432 * set size is bigger then the threshold, it's pointless to continue, 1433 * the detection technique would fail for this type of data. 1434 */ 1435 for (; i < BUCKET_SIZE; i++) { 1436 if (ws->bucket[i].count > 0) { 1437 byte_set_size++; 1438 if (byte_set_size > BYTE_SET_THRESHOLD) 1439 return byte_set_size; 1440 } 1441 } 1442 1443 return byte_set_size; 1444 } 1445 1446 static bool sample_repeated_patterns(struct heuristic_ws *ws) 1447 { 1448 const u32 half_of_sample = ws->sample_size / 2; 1449 const u8 *data = ws->sample; 1450 1451 return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0; 1452 } 1453 1454 static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, 1455 struct heuristic_ws *ws) 1456 { 1457 struct page *page; 1458 u64 index, index_end; 1459 u32 i, curr_sample_pos; 1460 u8 *in_data; 1461 1462 /* 1463 * Compression handles the input data by chunks of 128KiB 1464 * (defined by BTRFS_MAX_UNCOMPRESSED) 1465 * 1466 * We do the same for the heuristic and loop over the whole range. 1467 * 1468 * MAX_SAMPLE_SIZE - calculated under assumption that heuristic will 1469 * process no more than BTRFS_MAX_UNCOMPRESSED at a time. 1470 */ 1471 if (end - start > BTRFS_MAX_UNCOMPRESSED) 1472 end = start + BTRFS_MAX_UNCOMPRESSED; 1473 1474 index = start >> PAGE_SHIFT; 1475 index_end = end >> PAGE_SHIFT; 1476 1477 /* Don't miss unaligned end */ 1478 if (!IS_ALIGNED(end, PAGE_SIZE)) 1479 index_end++; 1480 1481 curr_sample_pos = 0; 1482 while (index < index_end) { 1483 page = find_get_page(inode->i_mapping, index); 1484 in_data = kmap(page); 1485 /* Handle case where the start is not aligned to PAGE_SIZE */ 1486 i = start % PAGE_SIZE; 1487 while (i < PAGE_SIZE - SAMPLING_READ_SIZE) { 1488 /* Don't sample any garbage from the last page */ 1489 if (start > end - SAMPLING_READ_SIZE) 1490 break; 1491 memcpy(&ws->sample[curr_sample_pos], &in_data[i], 1492 SAMPLING_READ_SIZE); 1493 i += SAMPLING_INTERVAL; 1494 start += SAMPLING_INTERVAL; 1495 curr_sample_pos += SAMPLING_READ_SIZE; 1496 } 1497 kunmap(page); 1498 put_page(page); 1499 1500 index++; 1501 } 1502 1503 ws->sample_size = curr_sample_pos; 1504 } 1505 1506 /* 1507 * Compression heuristic. 1508 * 1509 * For now is's a naive and optimistic 'return true', we'll extend the logic to 1510 * quickly (compared to direct compression) detect data characteristics 1511 * (compressible/uncompressible) to avoid wasting CPU time on uncompressible 1512 * data. 1513 * 1514 * The following types of analysis can be performed: 1515 * - detect mostly zero data 1516 * - detect data with low "byte set" size (text, etc) 1517 * - detect data with low/high "core byte" set 1518 * 1519 * Return non-zero if the compression should be done, 0 otherwise. 1520 */ 1521 int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) 1522 { 1523 struct list_head *ws_list = get_workspace(0, 0); 1524 struct heuristic_ws *ws; 1525 u32 i; 1526 u8 byte; 1527 int ret = 0; 1528 1529 ws = list_entry(ws_list, struct heuristic_ws, list); 1530 1531 heuristic_collect_sample(inode, start, end, ws); 1532 1533 if (sample_repeated_patterns(ws)) { 1534 ret = 1; 1535 goto out; 1536 } 1537 1538 memset(ws->bucket, 0, sizeof(*ws->bucket)*BUCKET_SIZE); 1539 1540 for (i = 0; i < ws->sample_size; i++) { 1541 byte = ws->sample[i]; 1542 ws->bucket[byte].count++; 1543 } 1544 1545 i = byte_set_size(ws); 1546 if (i < BYTE_SET_THRESHOLD) { 1547 ret = 2; 1548 goto out; 1549 } 1550 1551 i = byte_core_set_size(ws); 1552 if (i <= BYTE_CORE_SET_LOW) { 1553 ret = 3; 1554 goto out; 1555 } 1556 1557 if (i >= BYTE_CORE_SET_HIGH) { 1558 ret = 0; 1559 goto out; 1560 } 1561 1562 i = shannon_entropy(ws); 1563 if (i <= ENTROPY_LVL_ACEPTABLE) { 1564 ret = 4; 1565 goto out; 1566 } 1567 1568 /* 1569 * For the levels below ENTROPY_LVL_HIGH, additional analysis would be 1570 * needed to give green light to compression. 1571 * 1572 * For now just assume that compression at that level is not worth the 1573 * resources because: 1574 * 1575 * 1. it is possible to defrag the data later 1576 * 1577 * 2. the data would turn out to be hardly compressible, eg. 150 byte 1578 * values, every bucket has counter at level ~54. The heuristic would 1579 * be confused. This can happen when data have some internal repeated 1580 * patterns like "abbacbbc...". This can be detected by analyzing 1581 * pairs of bytes, which is too costly. 1582 */ 1583 if (i < ENTROPY_LVL_HIGH) { 1584 ret = 5; 1585 goto out; 1586 } else { 1587 ret = 0; 1588 goto out; 1589 } 1590 1591 out: 1592 put_workspace(0, ws_list); 1593 return ret; 1594 } 1595 1596 /* 1597 * Convert the compression suffix (eg. after "zlib" starting with ":") to 1598 * level, unrecognized string will set the default level 1599 */ 1600 unsigned int btrfs_compress_str2level(unsigned int type, const char *str) 1601 { 1602 unsigned int level = 0; 1603 int ret; 1604 1605 if (!type) 1606 return 0; 1607 1608 if (str[0] == ':') { 1609 ret = kstrtouint(str + 1, 10, &level); 1610 if (ret) 1611 level = 0; 1612 } 1613 1614 level = btrfs_compress_op[type]->set_level(level); 1615 1616 return level; 1617 } 1618