// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 * Copyright (C) 2022 Christoph Hellwig.
 */

#include <linux/bio.h>
#include "bio.h"
#include "ctree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "dev-replace.h"
#include "rcu-string.h"
#include "zoned.h"
#include "file-item.h"

static struct bio_set btrfs_bioset;

/*
 * Initialize a btrfs_bio structure.  This skips the embedded bio itself as it
 * is already initialized by the block layer.
 */
static inline void btrfs_bio_init(struct btrfs_bio *bbio,
				  struct btrfs_inode *inode,
				  btrfs_bio_end_io_t end_io, void *private)
{
	memset(bbio, 0, offsetof(struct btrfs_bio, bio));
	bbio->inode = inode;
	bbio->end_io = end_io;
	bbio->private = private;
}

/*
 * Allocate a btrfs_bio structure.  The btrfs_bio is the main I/O container for
 * btrfs, and is used for all I/O submitted through btrfs_submit_bio.
 *
 * Just like the underlying bio_alloc_bioset, it will not fail as it is backed
 * by a mempool.
 */
struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
			    struct btrfs_inode *inode,
			    btrfs_bio_end_io_t end_io, void *private)
{
	struct bio *bio;

	bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
	btrfs_bio_init(btrfs_bio(bio), inode, end_io, private);
	return bio;
}

struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
				    struct btrfs_inode *inode,
				    btrfs_bio_end_io_t end_io, void *private)
{
	struct bio *bio;
	struct btrfs_bio *bbio;

	ASSERT(offset <= UINT_MAX && size <= UINT_MAX);

	bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
	bbio = btrfs_bio(bio);
	btrfs_bio_init(bbio, inode, end_io, private);

	bio_trim(bio, offset >> 9, size >> 9);
	return bio;
}

static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
{
	if (!dev || !dev->bdev)
		return;
	if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
		return;

	if (btrfs_op(bio) == BTRFS_MAP_WRITE)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
	else if (!(bio->bi_opf & REQ_RAHEAD))
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
	if (bio->bi_opf & REQ_PREFLUSH)
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
}

static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
						struct bio *bio)
{
	if (bio->bi_opf & REQ_META)
		return fs_info->endio_meta_workers;
	return fs_info->endio_workers;
}

static void btrfs_end_bio_work(struct work_struct *work)
{
	struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);

	bbio->end_io(bbio);
}

static void btrfs_simple_end_io(struct bio *bio)
{
	struct btrfs_fs_info *fs_info = bio->bi_private;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	btrfs_bio_counter_dec(fs_info);

	if (bio->bi_status)
		btrfs_log_dev_io_error(bio, bbio->device);

	if (bio_op(bio) == REQ_OP_READ) {
		INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
		queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
	} else {
		bbio->end_io(bbio);
	}
}

static void btrfs_raid56_end_io(struct bio *bio)
{
	struct btrfs_io_context *bioc = bio->bi_private;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	btrfs_bio_counter_dec(bioc->fs_info);
	bbio->mirror_num = bioc->mirror_num;
	bbio->end_io(bbio);

	btrfs_put_bioc(bioc);
}

static void btrfs_orig_write_end_io(struct bio *bio)
{
	struct btrfs_io_stripe *stripe = bio->bi_private;
	struct btrfs_io_context *bioc = stripe->bioc;
	struct btrfs_bio *bbio = btrfs_bio(bio);

	btrfs_bio_counter_dec(bioc->fs_info);

	if (bio->bi_status) {
		atomic_inc(&bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	}

	/*
	 * Only send an error to the higher layers if it is beyond the tolerance
	 * threshold.
	 */
	if (atomic_read(&bioc->error) > bioc->max_errors)
		bio->bi_status = BLK_STS_IOERR;
	else
		bio->bi_status = BLK_STS_OK;

	bbio->end_io(bbio);
	btrfs_put_bioc(bioc);
}

static void btrfs_clone_write_end_io(struct bio *bio)
{
	struct btrfs_io_stripe *stripe = bio->bi_private;

	if (bio->bi_status) {
		atomic_inc(&stripe->bioc->error);
		btrfs_log_dev_io_error(bio, stripe->dev);
	}

	/* Pass on control to the original bio this one was cloned from */
	bio_endio(stripe->bioc->orig_bio);
	bio_put(bio);
}

static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
{
	if (!dev || !dev->bdev ||
	    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
	    (btrfs_op(bio) == BTRFS_MAP_WRITE &&
	     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
		bio_io_error(bio);
		return;
	}

	bio_set_dev(bio, dev->bdev);

	/*
	 * For zone append writing, bi_sector must point to the beginning of
	 * the zone.
	 */
	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

		if (btrfs_dev_is_sequential(dev, physical)) {
			u64 zone_start = round_down(physical,
						    dev->fs_info->zone_size);

			bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
		} else {
			bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
			bio->bi_opf |= REQ_OP_WRITE;
		}
	}
	btrfs_debug_in_rcu(dev->fs_info,
	"%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
		__func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
		(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
		dev->devid, bio->bi_iter.bi_size);

	btrfsic_check_bio(bio);
	submit_bio(bio);
}

static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{
	struct bio *orig_bio = bioc->orig_bio, *bio;

	ASSERT(bio_op(orig_bio) != REQ_OP_READ);

	/* Reuse the bio embedded into the btrfs_bio for the last mirror */
	if (dev_nr == bioc->num_stripes - 1) {
		bio = orig_bio;
		bio->bi_end_io = btrfs_orig_write_end_io;
	} else {
		bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
		bio_inc_remaining(orig_bio);
		bio->bi_end_io = btrfs_clone_write_end_io;
	}

	bio->bi_private = &bioc->stripes[dev_nr];
	bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
	bioc->stripes[dev_nr].bioc = bioc;
	btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}

void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num)
{
	struct btrfs_bio *bbio = btrfs_bio(bio);
	u64 logical = bio->bi_iter.bi_sector << 9;
	u64 length = bio->bi_iter.bi_size;
	u64 map_length = length;
	struct btrfs_io_context *bioc = NULL;
	struct btrfs_io_stripe smap;
	blk_status_t ret;
	int error;

	btrfs_bio_counter_inc_blocked(fs_info);
	error = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
				  &bioc, &smap, &mirror_num, 1);
	if (error) {
		ret = errno_to_blk_status(error);
		goto fail;
	}

	if (map_length < length) {
		btrfs_crit(fs_info,
			   "mapping failed logical %llu bio len %llu len %llu",
			   logical, length, map_length);
		BUG();
	}

	/*
	 * Save the iter for the end_io handler and preload the checksums for
	 * data reads.
	 */
	if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) {
		bbio->iter = bio->bi_iter;
		ret = btrfs_lookup_bio_sums(bbio);
		if (ret)
			goto fail;
	}

	if (!bioc) {
		/* Single mirror read/write fast path */
		bbio->mirror_num = mirror_num;
		bbio->device = smap.dev;
		bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
		bio->bi_private = fs_info;
		bio->bi_end_io = btrfs_simple_end_io;
		btrfs_submit_dev_bio(smap.dev, bio);
	} else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		/* Parity RAID write or read recovery */
		bio->bi_private = bioc;
		bio->bi_end_io = btrfs_raid56_end_io;
		if (bio_op(bio) == REQ_OP_READ)
			raid56_parity_recover(bio, bioc, mirror_num);
		else
			raid56_parity_write(bio, bioc);
	} else {
		/* Write to multiple mirrors */
		int total_devs = bioc->num_stripes;
		int dev_nr;

		bioc->orig_bio = bio;
		for (dev_nr = 0; dev_nr < total_devs; dev_nr++)
			btrfs_submit_mirrored_bio(bioc, dev_nr);
	}
	return;

fail:
	btrfs_bio_counter_dec(fs_info);
	btrfs_bio_end_io(bbio, ret);
}
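
/*
 * Usage sketch (illustrative only, not a caller that exists in this file):
 * submitters allocate the bio with btrfs_bio_alloc(), point bi_sector at the
 * btrfs logical address, add their pages and hand the bio to
 * btrfs_submit_bio(); completion is then delivered through the btrfs_bio
 * end_io callback rather than through bio->bi_end_io.  The names my_ctx and
 * my_read_end_io below are hypothetical.
 *
 *	static void my_read_end_io(struct btrfs_bio *bbio)
 *	{
 *		struct my_ctx *ctx = bbio->private;
 *
 *		ctx->status = bbio->bio.bi_status;
 *		complete(&ctx->done);
 *		bio_put(&bbio->bio);
 *	}
 *
 *	...
 *	bio = btrfs_bio_alloc(1, REQ_OP_READ, inode, my_read_end_io, ctx);
 *	bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT;
 *	__bio_add_page(bio, page, fs_info->sectorsize, 0);
 *	btrfs_submit_bio(fs_info, bio, 0);
 */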

/*
 * Submit a repair write.
 *
 * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a
 * RAID setup.  Here we only want to write the one bad copy, so we do the
 * mapping ourselves and submit the bio directly.
 *
 * The I/O is issued synchronously to block the repair read completion from
 * freeing the bio.
 */
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
			    u64 length, u64 logical, struct page *page,
			    unsigned int pg_offset, int mirror_num)
{
	struct btrfs_device *dev;
	struct bio_vec bvec;
	struct bio bio;
	u64 map_length = 0;
	u64 sector;
	struct btrfs_io_context *bioc = NULL;
	int ret = 0;

	ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
	BUG_ON(!mirror_num);

	if (btrfs_repair_one_zone(fs_info, logical))
		return 0;

	map_length = length;

	/*
	 * Avoid races with device replace and make sure our bioc has devices
	 * associated with its stripes that don't go away while we are doing
	 * the read repair operation.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	if (btrfs_is_parity_mirror(fs_info, logical, length)) {
		/*
		 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
		 * to update all raid stripes, but here we just want to correct
		 * the bad stripe, thus BTRFS_MAP_READ is abused to only get
		 * the bad stripe's dev and sector.
		 */
		ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
				      &map_length, &bioc, 0);
		if (ret)
			goto out_counter_dec;
		ASSERT(bioc->mirror_num == 1);
	} else {
		ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
				      &map_length, &bioc, mirror_num);
		if (ret)
			goto out_counter_dec;
		/*
		 * This happens when dev-replace is also running, and the
		 * mirror_num indicates the dev-replace target.
		 *
		 * In this case, we don't need to do anything, as the read
		 * error just means the replace progress hasn't reached our
		 * read range, and the replace routine will handle it later.
		 */
		if (mirror_num != bioc->mirror_num)
			goto out_counter_dec;
	}

	sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
	dev = bioc->stripes[bioc->mirror_num - 1].dev;
	btrfs_put_bioc(bioc);

	if (!dev || !dev->bdev ||
	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
		ret = -EIO;
		goto out_counter_dec;
	}

	bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
	bio.bi_iter.bi_sector = sector;
	__bio_add_page(&bio, page, length, pg_offset);

	btrfsic_check_bio(&bio);
	ret = submit_bio_wait(&bio);
	if (ret) {
		/* try to remap that extent elsewhere? */
		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
		goto out_bio_uninit;
	}

	btrfs_info_rl_in_rcu(fs_info,
		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
			     ino, start, btrfs_dev_name(dev), sector);
	ret = 0;

out_bio_uninit:
	bio_uninit(&bio);
out_counter_dec:
	btrfs_bio_counter_dec(fs_info);
	return ret;
}
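
/*
 * Repair usage sketch (hypothetical caller, for illustration only): once a
 * read from mirror @bad_mirror failed verification and a re-read from another
 * mirror returned good data in @page, the good copy can be written back over
 * the bad mirror.  The variable names below are assumptions, not code from
 * this file.
 *
 *	ret = btrfs_repair_io_failure(fs_info, btrfs_ino(inode), file_offset,
 *				      fs_info->sectorsize, logical, page,
 *				      pgoff, bad_mirror);
 */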

int __init btrfs_bioset_init(void)
{
	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
			offsetof(struct btrfs_bio, bio),
			BIOSET_NEED_BVECS))
		return -ENOMEM;
	return 0;
}

void __cold btrfs_bioset_exit(void)
{
	bioset_exit(&btrfs_bioset);
}
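
/*
 * Note on the bioset above: offsetof(struct btrfs_bio, bio) is passed as the
 * front_pad, so every bio allocated from btrfs_bioset is preceded in memory
 * by the remaining fields of struct btrfs_bio.  A sketch of the conversion in
 * both directions, assuming the btrfs_bio() helper in bio.h is the usual
 * container_of() wrapper around the embedded bio:
 *
 *	struct btrfs_bio *bbio = btrfs_bio(bio);	// bio -> btrfs_bio
 *	struct bio *embedded = &bbio->bio;		// btrfs_bio -> bio
 */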