// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 * Copyright (C) 2022 Christoph Hellwig.
 */

#include <linux/bio.h>
#include "bio.h"
#include "ctree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "dev-replace.h"
#include "rcu-string.h"
#include "zoned.h"

static struct bio_set btrfs_bioset;

/*
 * Initialize a btrfs_bio structure. This skips the embedded bio itself as it
 * is already initialized by the block layer.
 */
static inline void btrfs_bio_init(struct btrfs_bio *bbio,
                                  struct btrfs_inode *inode,
                                  btrfs_bio_end_io_t end_io, void *private)
{
        memset(bbio, 0, offsetof(struct btrfs_bio, bio));
        bbio->inode = inode;
        bbio->end_io = end_io;
        bbio->private = private;
}

/*
 * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
 * btrfs, and is used for all I/O submitted through btrfs_submit_bio.
 *
 * Just like the underlying bio_alloc_bioset it will not fail as it is backed
 * by a mempool.
 */
struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
                            struct btrfs_inode *inode,
                            btrfs_bio_end_io_t end_io, void *private)
{
        struct bio *bio;

        bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
        btrfs_bio_init(btrfs_bio(bio), inode, end_io, private);
        return bio;
}
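/*
 * A btrfs_bio allocated here is normally driven through btrfs_submit_bio()
 * below: the caller puts the logical address in bi_sector, attaches the
 * payload, and the end_io callback runs once the mapped physical I/O has
 * completed. A rough sketch of such a caller (the callback name and the
 * page/context variables are hypothetical, not part of this file):
 *
 *	bio = btrfs_bio_alloc(1, REQ_OP_READ, inode, my_read_end_io, ctx);
 *	bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT;
 *	__bio_add_page(bio, page, PAGE_SIZE, 0);
 *	btrfs_submit_bio(fs_info, bio, 0);
 *
 * Passing mirror_num == 0 lets the mapping code pick a copy; a non-zero
 * value forces a specific mirror, which read-repair uses to avoid the copy
 * that just failed.
 */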
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
                                    struct btrfs_inode *inode,
                                    btrfs_bio_end_io_t end_io, void *private)
{
        struct bio *bio;
        struct btrfs_bio *bbio;

        ASSERT(offset <= UINT_MAX && size <= UINT_MAX);

        bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
        bbio = btrfs_bio(bio);
        btrfs_bio_init(bbio, inode, end_io, private);

        bio_trim(bio, offset >> 9, size >> 9);
        return bio;
}

static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
{
        if (!dev || !dev->bdev)
                return;
        if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
                return;

        if (btrfs_op(bio) == BTRFS_MAP_WRITE)
                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
        else if (!(bio->bi_opf & REQ_RAHEAD))
                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
        if (bio->bi_opf & REQ_PREFLUSH)
                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
}

static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
                                                struct bio *bio)
{
        if (bio->bi_opf & REQ_META)
                return fs_info->endio_meta_workers;
        return fs_info->endio_workers;
}

static void btrfs_end_bio_work(struct work_struct *work)
{
        struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);

        bbio->end_io(bbio);
}

static void btrfs_simple_end_io(struct bio *bio)
{
        struct btrfs_fs_info *fs_info = bio->bi_private;
        struct btrfs_bio *bbio = btrfs_bio(bio);

        btrfs_bio_counter_dec(fs_info);

        if (bio->bi_status)
                btrfs_log_dev_io_error(bio, bbio->device);

        if (bio_op(bio) == REQ_OP_READ) {
                INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
                queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
        } else {
                bbio->end_io(bbio);
        }
}

static void btrfs_raid56_end_io(struct bio *bio)
{
        struct btrfs_io_context *bioc = bio->bi_private;
        struct btrfs_bio *bbio = btrfs_bio(bio);

        btrfs_bio_counter_dec(bioc->fs_info);
        bbio->mirror_num = bioc->mirror_num;
        bbio->end_io(bbio);

        btrfs_put_bioc(bioc);
}

static void btrfs_orig_write_end_io(struct bio *bio)
{
        struct btrfs_io_stripe *stripe = bio->bi_private;
        struct btrfs_io_context *bioc = stripe->bioc;
        struct btrfs_bio *bbio = btrfs_bio(bio);

        btrfs_bio_counter_dec(bioc->fs_info);

        if (bio->bi_status) {
                atomic_inc(&bioc->error);
                btrfs_log_dev_io_error(bio, stripe->dev);
        }

        /*
         * Only send an error to the higher layers if it is beyond the
         * tolerance threshold.
         */
        if (atomic_read(&bioc->error) > bioc->max_errors)
                bio->bi_status = BLK_STS_IOERR;
        else
                bio->bi_status = BLK_STS_OK;

        bbio->end_io(bbio);
        btrfs_put_bioc(bioc);
}

static void btrfs_clone_write_end_io(struct bio *bio)
{
        struct btrfs_io_stripe *stripe = bio->bi_private;

        if (bio->bi_status) {
                atomic_inc(&stripe->bioc->error);
                btrfs_log_dev_io_error(bio, stripe->dev);
        }

        /* Pass on control to the original bio this one was cloned from */
        bio_endio(stripe->bioc->orig_bio);
        bio_put(bio);
}

static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
{
        if (!dev || !dev->bdev ||
            test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
            (btrfs_op(bio) == BTRFS_MAP_WRITE &&
             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
                bio_io_error(bio);
                return;
        }

        bio_set_dev(bio, dev->bdev);

        /*
         * For zone append writes, bi_sector must point to the beginning of
         * the zone.
         */
        if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
                u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

                if (btrfs_dev_is_sequential(dev, physical)) {
                        u64 zone_start = round_down(physical,
                                                    dev->fs_info->zone_size);

                        bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
                } else {
                        bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
                        bio->bi_opf |= REQ_OP_WRITE;
                }
        }
        btrfs_debug_in_rcu(dev->fs_info,
                "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
                __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
                (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
                dev->devid, bio->bi_iter.bi_size);

        btrfsic_check_bio(bio);
        submit_bio(bio);
}
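/*
 * Writes to profiles that keep multiple copies fan out here: each stripe in
 * the btrfs_io_context gets one bio. All but the last mirror get a clone of
 * the original bio, and every clone bumps the original's remaining count via
 * bio_inc_remaining(), which btrfs_clone_write_end_io drops again with
 * bio_endio(). The original bio itself goes to the last mirror, so it only
 * completes through btrfs_orig_write_end_io, which applies the
 * bioc->max_errors tolerance, once every copy has finished.
 */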
static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{
        struct bio *orig_bio = bioc->orig_bio, *bio;

        ASSERT(bio_op(orig_bio) != REQ_OP_READ);

        /* Reuse the bio embedded into the btrfs_bio for the last mirror */
        if (dev_nr == bioc->num_stripes - 1) {
                bio = orig_bio;
                bio->bi_end_io = btrfs_orig_write_end_io;
        } else {
                bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
                bio_inc_remaining(orig_bio);
                bio->bi_end_io = btrfs_clone_write_end_io;
        }

        bio->bi_private = &bioc->stripes[dev_nr];
        bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
        bioc->stripes[dev_nr].bioc = bioc;
        btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}

void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num)
{
        struct btrfs_bio *bbio = btrfs_bio(bio);
        u64 logical = bio->bi_iter.bi_sector << 9;
        u64 length = bio->bi_iter.bi_size;
        u64 map_length = length;
        struct btrfs_io_context *bioc = NULL;
        struct btrfs_io_stripe smap;
        blk_status_t ret;
        int error;

        btrfs_bio_counter_inc_blocked(fs_info);
        error = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
                                  &bioc, &smap, &mirror_num, 1);
        if (error) {
                ret = errno_to_blk_status(error);
                goto fail;
        }

        if (map_length < length) {
                btrfs_crit(fs_info,
                           "mapping failed logical %llu bio len %llu len %llu",
                           logical, length, map_length);
                BUG();
        }

        /* Save the iter for the end_io handler for data reads. */
        if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META))
                bbio->iter = bio->bi_iter;

        if (!bioc) {
                /* Single mirror read/write fast path */
                bbio->mirror_num = mirror_num;
                bbio->device = smap.dev;
                bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
                bio->bi_private = fs_info;
                bio->bi_end_io = btrfs_simple_end_io;
                btrfs_submit_dev_bio(smap.dev, bio);
        } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                /* Parity RAID write or read recovery */
                bio->bi_private = bioc;
                bio->bi_end_io = btrfs_raid56_end_io;
                if (bio_op(bio) == REQ_OP_READ)
                        raid56_parity_recover(bio, bioc, mirror_num);
                else
                        raid56_parity_write(bio, bioc);
        } else {
                /* Write to multiple mirrors */
                int total_devs = bioc->num_stripes;
                int dev_nr;

                bioc->orig_bio = bio;
                for (dev_nr = 0; dev_nr < total_devs; dev_nr++)
                        btrfs_submit_mirrored_bio(bioc, dev_nr);
        }
        return;

fail:
        btrfs_bio_counter_dec(fs_info);
        btrfs_bio_end_io(bbio, ret);
}
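/*
 * Whichever path above handles the bio, completion eventually reaches the
 * end_io callback that was passed to btrfs_bio_alloc. A minimal sketch of
 * such a callback (hypothetical, for illustration only):
 *
 *	static void my_read_end_io(struct btrfs_bio *bbio)
 *	{
 *		if (bbio->bio.bi_status)
 *			pr_debug("read from mirror %d failed\n", bbio->mirror_num);
 *		bio_put(&bbio->bio);
 *	}
 *
 * For data reads the saved bbio->iter still describes the original logical
 * range even after the lower layers have advanced the bio's own iterator.
 */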
/*
 * Submit a repair write.
 *
 * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a
 * RAID setup. Here we only want to write the one bad copy, so we do the
 * mapping ourselves and submit the bio directly.
 *
 * The I/O is issued synchronously to block the repair read completion from
 * freeing the bio.
 */
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
                            u64 length, u64 logical, struct page *page,
                            unsigned int pg_offset, int mirror_num)
{
        struct btrfs_device *dev;
        struct bio_vec bvec;
        struct bio bio;
        u64 map_length = 0;
        u64 sector;
        struct btrfs_io_context *bioc = NULL;
        int ret = 0;

        ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
        BUG_ON(!mirror_num);

        if (btrfs_repair_one_zone(fs_info, logical))
                return 0;

        map_length = length;

        /*
         * Avoid races with device replace and make sure our bioc has devices
         * associated to its stripes that don't go away while we are doing the
         * read repair operation.
         */
        btrfs_bio_counter_inc_blocked(fs_info);
        if (btrfs_is_parity_mirror(fs_info, logical, length)) {
                /*
                 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
                 * to update all raid stripes, but here we just want to correct
                 * the bad stripe, thus BTRFS_MAP_READ is abused to only get
                 * the bad stripe's dev and sector.
                 */
                ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
                                      &map_length, &bioc, 0);
                if (ret)
                        goto out_counter_dec;
                ASSERT(bioc->mirror_num == 1);
        } else {
                ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
                                      &map_length, &bioc, mirror_num);
                if (ret)
                        goto out_counter_dec;
                /*
                 * This happens when dev-replace is also running, and the
                 * mirror_num indicates the dev-replace target.
                 *
                 * In this case we don't need to do anything: the read error
                 * just means the replace progress hasn't reached our read
                 * range yet, and the replace routine will handle it later.
                 */
                if (mirror_num != bioc->mirror_num)
                        goto out_counter_dec;
        }

        sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
        dev = bioc->stripes[bioc->mirror_num - 1].dev;
        btrfs_put_bioc(bioc);

        if (!dev || !dev->bdev ||
            !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
                ret = -EIO;
                goto out_counter_dec;
        }

        bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
        bio.bi_iter.bi_sector = sector;
        __bio_add_page(&bio, page, length, pg_offset);

        btrfsic_check_bio(&bio);
        ret = submit_bio_wait(&bio);
        if (ret) {
                /* try to remap that extent elsewhere? */
                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
                goto out_bio_uninit;
        }

        btrfs_info_rl_in_rcu(fs_info,
                "read error corrected: ino %llu off %llu (dev %s sector %llu)",
                ino, start, btrfs_dev_name(dev), sector);
        ret = 0;

out_bio_uninit:
        bio_uninit(&bio);
out_counter_dec:
        btrfs_bio_counter_dec(fs_info);
        return ret;
}

int __init btrfs_bioset_init(void)
{
        if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
                        offsetof(struct btrfs_bio, bio),
                        BIOSET_NEED_BVECS))
                return -ENOMEM;
        return 0;
}

void __cold btrfs_bioset_exit(void)
{
        bioset_exit(&btrfs_bioset);
}
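/*
 * btrfs_bioset_init() and btrfs_bioset_exit() are meant to be called exactly
 * once, from the filesystem's module init and exit paths, before the first
 * btrfs_bio is allocated and after the last one has completed. Roughly
 * (illustrative only; the actual call sites live outside this file):
 *
 *	if (btrfs_bioset_init())
 *		return -ENOMEM;
 *	...
 *	btrfs_bioset_exit();
 */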