// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 * Copyright (C) 2022 Christoph Hellwig.
 */

#include <linux/bio.h>
#include "bio.h"
#include "ctree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "dev-replace.h"
#include "rcu-string.h"
#include "zoned.h"

static struct bio_set btrfs_bioset;

/*
 * Initialize a btrfs_bio structure.  This skips the embedded bio itself as it
 * is already initialized by the block layer.
 */
static inline void btrfs_bio_init(struct btrfs_bio *bbio,
                                  struct btrfs_inode *inode,
                                  btrfs_bio_end_io_t end_io, void *private)
{
        memset(bbio, 0, offsetof(struct btrfs_bio, bio));
        bbio->inode = inode;
        bbio->end_io = end_io;
        bbio->private = private;
}

/*
 * Allocate a btrfs_bio structure.  The btrfs_bio is the main I/O container for
 * btrfs, and is used for all I/O submitted through btrfs_submit_bio.
 *
 * Just like the underlying bio_alloc_bioset it will not fail as it is backed
 * by a mempool.
 */
struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
                            struct btrfs_inode *inode,
                            btrfs_bio_end_io_t end_io, void *private)
{
        struct bio *bio;

        bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
        btrfs_bio_init(btrfs_bio(bio), inode, end_io, private);
        return bio;
}

struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
                                    struct btrfs_inode *inode,
                                    btrfs_bio_end_io_t end_io, void *private)
{
        struct bio *bio;
        struct btrfs_bio *bbio;

        ASSERT(offset <= UINT_MAX && size <= UINT_MAX);

        bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
        bbio = btrfs_bio(bio);
        btrfs_bio_init(bbio, inode, end_io, private);

        bio_trim(bio, offset >> 9, size >> 9);
        bbio->iter = bio->bi_iter;
        return bio;
}
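/*
 * Illustrative sketch only, not code used by this file: a typical caller of
 * the helpers above allocates a btrfs_bio, points it at a logical address,
 * adds pages and hands it to btrfs_submit_bio().  The callback name and the
 * surrounding variables (inode, page, fs_info, logical) are hypothetical.
 *
 *	static void my_read_end_io(struct btrfs_bio *bbio)
 *	{
 *		// bbio->bio.bi_status holds the final status
 *		bio_put(&bbio->bio);
 *	}
 *
 *	bio = btrfs_bio_alloc(1, REQ_OP_READ, inode, my_read_end_io, NULL);
 *	bio->bi_iter.bi_sector = logical >> SECTOR_SHIFT;
 *	__bio_add_page(bio, page, PAGE_SIZE, 0);
 *	btrfs_submit_bio(fs_info, bio, 0);
 */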
static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
{
        if (!dev || !dev->bdev)
                return;
        if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
                return;

        if (btrfs_op(bio) == BTRFS_MAP_WRITE)
                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
        else if (!(bio->bi_opf & REQ_RAHEAD))
                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
        if (bio->bi_opf & REQ_PREFLUSH)
                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
}

static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
                                                struct bio *bio)
{
        if (bio->bi_opf & REQ_META)
                return fs_info->endio_meta_workers;
        return fs_info->endio_workers;
}

static void btrfs_end_bio_work(struct work_struct *work)
{
        struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);

        bbio->end_io(bbio);
}

static void btrfs_simple_end_io(struct bio *bio)
{
        struct btrfs_fs_info *fs_info = bio->bi_private;
        struct btrfs_bio *bbio = btrfs_bio(bio);

        btrfs_bio_counter_dec(fs_info);

        if (bio->bi_status)
                btrfs_log_dev_io_error(bio, bbio->device);

        if (bio_op(bio) == REQ_OP_READ) {
                INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
                queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
        } else {
                bbio->end_io(bbio);
        }
}

static void btrfs_raid56_end_io(struct bio *bio)
{
        struct btrfs_io_context *bioc = bio->bi_private;
        struct btrfs_bio *bbio = btrfs_bio(bio);

        btrfs_bio_counter_dec(bioc->fs_info);
        bbio->mirror_num = bioc->mirror_num;
        bbio->end_io(bbio);

        btrfs_put_bioc(bioc);
}

static void btrfs_orig_write_end_io(struct bio *bio)
{
        struct btrfs_io_stripe *stripe = bio->bi_private;
        struct btrfs_io_context *bioc = stripe->bioc;
        struct btrfs_bio *bbio = btrfs_bio(bio);

        btrfs_bio_counter_dec(bioc->fs_info);

        if (bio->bi_status) {
                atomic_inc(&bioc->error);
                btrfs_log_dev_io_error(bio, stripe->dev);
        }

        /*
         * Only send an error to the higher layers if it is beyond the
         * tolerance threshold.
         */
        if (atomic_read(&bioc->error) > bioc->max_errors)
                bio->bi_status = BLK_STS_IOERR;
        else
                bio->bi_status = BLK_STS_OK;

        bbio->end_io(bbio);
        btrfs_put_bioc(bioc);
}

static void btrfs_clone_write_end_io(struct bio *bio)
{
        struct btrfs_io_stripe *stripe = bio->bi_private;

        if (bio->bi_status) {
                atomic_inc(&stripe->bioc->error);
                btrfs_log_dev_io_error(bio, stripe->dev);
        }

        /* Pass on control to the original bio this one was cloned from */
        bio_endio(stripe->bioc->orig_bio);
        bio_put(bio);
}

static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
{
        if (!dev || !dev->bdev ||
            test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
            (btrfs_op(bio) == BTRFS_MAP_WRITE &&
             !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
                bio_io_error(bio);
                return;
        }

        bio_set_dev(bio, dev->bdev);

        /*
         * For zone append writing, bi_sector must point to the beginning of
         * the zone.
         */
        if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
                u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

                if (btrfs_dev_is_sequential(dev, physical)) {
                        u64 zone_start = round_down(physical,
                                                    dev->fs_info->zone_size);

                        bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
                } else {
                        bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
                        bio->bi_opf |= REQ_OP_WRITE;
                }
        }
        btrfs_debug_in_rcu(dev->fs_info,
        "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
                __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
                (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
                dev->devid, bio->bi_iter.bi_size);

        btrfsic_check_bio(bio);
        submit_bio(bio);
}

static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{
        struct bio *orig_bio = bioc->orig_bio, *bio;

        ASSERT(bio_op(orig_bio) != REQ_OP_READ);

        /* Reuse the bio embedded into the btrfs_bio for the last mirror */
        if (dev_nr == bioc->num_stripes - 1) {
                bio = orig_bio;
                bio->bi_end_io = btrfs_orig_write_end_io;
        } else {
                bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
                bio_inc_remaining(orig_bio);
                bio->bi_end_io = btrfs_clone_write_end_io;
        }

        bio->bi_private = &bioc->stripes[dev_nr];
        bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
        bioc->stripes[dev_nr].bioc = bioc;
        btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}

void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num)
{
        u64 logical = bio->bi_iter.bi_sector << 9;
        u64 length = bio->bi_iter.bi_size;
        u64 map_length = length;
        struct btrfs_io_context *bioc = NULL;
        struct btrfs_io_stripe smap;
        int ret;

        btrfs_bio_counter_inc_blocked(fs_info);
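        /*
         * Map the logical range to its physical location(s).  For the common
         * single-mirror case no btrfs_io_context is allocated and the single
         * target is returned in smap instead (see the fast path below);
         * mirror_num may be updated to the mirror actually chosen.
         */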
        ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
                                &bioc, &smap, &mirror_num, 1);
        if (ret) {
                btrfs_bio_counter_dec(fs_info);
                btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
                return;
        }

        if (map_length < length) {
                btrfs_crit(fs_info,
                           "mapping failed logical %llu bio len %llu len %llu",
                           logical, length, map_length);
                BUG();
        }

        if (!bioc) {
                /* Single mirror read/write fast path */
                btrfs_bio(bio)->mirror_num = mirror_num;
                btrfs_bio(bio)->device = smap.dev;
                bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
                bio->bi_private = fs_info;
                bio->bi_end_io = btrfs_simple_end_io;
                btrfs_submit_dev_bio(smap.dev, bio);
        } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
                /* Parity RAID write or read recovery */
                bio->bi_private = bioc;
                bio->bi_end_io = btrfs_raid56_end_io;
                if (bio_op(bio) == REQ_OP_READ)
                        raid56_parity_recover(bio, bioc, mirror_num);
                else
                        raid56_parity_write(bio, bioc);
        } else {
                /* Write to multiple mirrors */
                int total_devs = bioc->num_stripes;
                int dev_nr;

                bioc->orig_bio = bio;
                for (dev_nr = 0; dev_nr < total_devs; dev_nr++)
                        btrfs_submit_mirrored_bio(bioc, dev_nr);
        }
}

/*
 * Submit a repair write.
 *
 * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a
 * RAID setup.  Here we only want to write the one bad copy, so we do the
 * mapping ourselves and submit the bio directly.
 *
 * The I/O is issued synchronously to block the repair read completion from
 * freeing the bio.
 */
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
                            u64 length, u64 logical, struct page *page,
                            unsigned int pg_offset, int mirror_num)
{
        struct btrfs_device *dev;
        struct bio_vec bvec;
        struct bio bio;
        u64 map_length = 0;
        u64 sector;
        struct btrfs_io_context *bioc = NULL;
        int ret = 0;

        ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
        BUG_ON(!mirror_num);

        if (btrfs_repair_one_zone(fs_info, logical))
                return 0;

        map_length = length;

        /*
         * Avoid races with device replace and make sure our bioc has devices
         * associated to its stripes that don't go away while we are doing the
         * read repair operation.
         */
        btrfs_bio_counter_inc_blocked(fs_info);
        if (btrfs_is_parity_mirror(fs_info, logical, length)) {
                /*
                 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
                 * to update all raid stripes, but here we just want to correct
                 * the bad stripe, thus BTRFS_MAP_READ is abused to only get
                 * the bad stripe's dev and sector.
                 */
                ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
                                      &map_length, &bioc, 0);
                if (ret)
                        goto out_counter_dec;
                ASSERT(bioc->mirror_num == 1);
        } else {
                ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
                                      &map_length, &bioc, mirror_num);
                if (ret)
                        goto out_counter_dec;
                /*
                 * This happens when dev-replace is also running, and the
                 * mirror_num indicates the dev-replace target.
                 *
                 * In this case, we don't need to do anything, as the read
                 * error just means the replace progress hasn't reached our
                 * read range yet, and the replace routine will handle it
                 * later.
                 */
                if (mirror_num != bioc->mirror_num)
                        goto out_counter_dec;
        }
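        /*
         * bioc->mirror_num is 1-based while the stripes[] array is 0-based,
         * hence the - 1 below when picking the stripe to rewrite.
         */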
        sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
        dev = bioc->stripes[bioc->mirror_num - 1].dev;
        btrfs_put_bioc(bioc);

        if (!dev || !dev->bdev ||
            !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
                ret = -EIO;
                goto out_counter_dec;
        }

        bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
        bio.bi_iter.bi_sector = sector;
        __bio_add_page(&bio, page, length, pg_offset);

        btrfsic_check_bio(&bio);
        ret = submit_bio_wait(&bio);
        if (ret) {
                /* try to remap that extent elsewhere? */
                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
                goto out_bio_uninit;
        }

        btrfs_info_rl_in_rcu(fs_info,
                "read error corrected: ino %llu off %llu (dev %s sector %llu)",
                             ino, start, btrfs_dev_name(dev), sector);
        ret = 0;

out_bio_uninit:
        bio_uninit(&bio);
out_counter_dec:
        btrfs_bio_counter_dec(fs_info);
        return ret;
}

int __init btrfs_bioset_init(void)
{
        if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
                        offsetof(struct btrfs_bio, bio),
                        BIOSET_NEED_BVECS))
                return -ENOMEM;
        return 0;
}

void __cold btrfs_bioset_exit(void)
{
        bioset_exit(&btrfs_bioset);
}
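/*
 * Layout note: the offsetof(struct btrfs_bio, bio) front_pad passed to
 * bioset_init() above means every bio allocated from btrfs_bioset sits at
 * the tail of a struct btrfs_bio.  The btrfs_bio() accessor used throughout
 * this file is effectively the following container_of (see bio.h):
 *
 *	static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
 *	{
 *		return container_of(bio, struct btrfs_bio, bio);
 *	}
 */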