1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/fs/ext4/page-io.c 4 * 5 * This contains the new page_io functions for ext4 6 * 7 * Written by Theodore Ts'o, 2010. 8 */ 9 10 #include <linux/fs.h> 11 #include <linux/time.h> 12 #include <linux/highuid.h> 13 #include <linux/pagemap.h> 14 #include <linux/quotaops.h> 15 #include <linux/string.h> 16 #include <linux/buffer_head.h> 17 #include <linux/writeback.h> 18 #include <linux/pagevec.h> 19 #include <linux/mpage.h> 20 #include <linux/namei.h> 21 #include <linux/uio.h> 22 #include <linux/bio.h> 23 #include <linux/workqueue.h> 24 #include <linux/kernel.h> 25 #include <linux/slab.h> 26 #include <linux/mm.h> 27 #include <linux/backing-dev.h> 28 29 #include "ext4_jbd2.h" 30 #include "xattr.h" 31 #include "acl.h" 32 33 static struct kmem_cache *io_end_cachep; 34 35 int __init ext4_init_pageio(void) 36 { 37 io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); 38 if (io_end_cachep == NULL) 39 return -ENOMEM; 40 return 0; 41 } 42 43 void ext4_exit_pageio(void) 44 { 45 kmem_cache_destroy(io_end_cachep); 46 } 47 48 /* 49 * Print an buffer I/O error compatible with the fs/buffer.c. This 50 * provides compatibility with dmesg scrapers that look for a specific 51 * buffer I/O error message. We really need a unified error reporting 52 * structure to userspace ala Digital Unix's uerf system, but it's 53 * probably not going to happen in my lifetime, due to LKML politics... 54 */ 55 static void buffer_io_error(struct buffer_head *bh) 56 { 57 printk_ratelimited(KERN_ERR "Buffer I/O error on device %pg, logical block %llu\n", 58 bh->b_bdev, 59 (unsigned long long)bh->b_blocknr); 60 } 61 62 static void ext4_finish_bio(struct bio *bio) 63 { 64 int i; 65 struct bio_vec *bvec; 66 struct bvec_iter_all iter_all; 67 68 bio_for_each_segment_all(bvec, bio, i, iter_all) { 69 struct page *page = bvec->bv_page; 70 #ifdef CONFIG_FS_ENCRYPTION 71 struct page *data_page = NULL; 72 #endif 73 struct buffer_head *bh, *head; 74 unsigned bio_start = bvec->bv_offset; 75 unsigned bio_end = bio_start + bvec->bv_len; 76 unsigned under_io = 0; 77 unsigned long flags; 78 79 if (!page) 80 continue; 81 82 #ifdef CONFIG_FS_ENCRYPTION 83 if (!page->mapping) { 84 /* The bounce data pages are unmapped. */ 85 data_page = page; 86 fscrypt_pullback_bio_page(&page, false); 87 } 88 #endif 89 90 if (bio->bi_status) { 91 SetPageError(page); 92 mapping_set_error(page->mapping, -EIO); 93 } 94 bh = head = page_buffers(page); 95 /* 96 * We check all buffers in the page under BH_Uptodate_Lock 97 * to avoid races with other end io clearing async_write flags 98 */ 99 local_irq_save(flags); 100 bit_spin_lock(BH_Uptodate_Lock, &head->b_state); 101 do { 102 if (bh_offset(bh) < bio_start || 103 bh_offset(bh) + bh->b_size > bio_end) { 104 if (buffer_async_write(bh)) 105 under_io++; 106 continue; 107 } 108 clear_buffer_async_write(bh); 109 if (bio->bi_status) 110 buffer_io_error(bh); 111 } while ((bh = bh->b_this_page) != head); 112 bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); 113 local_irq_restore(flags); 114 if (!under_io) { 115 #ifdef CONFIG_FS_ENCRYPTION 116 if (data_page) 117 fscrypt_restore_control_page(data_page); 118 #endif 119 end_page_writeback(page); 120 } 121 } 122 } 123 124 static void ext4_release_io_end(ext4_io_end_t *io_end) 125 { 126 struct bio *bio, *next_bio; 127 128 BUG_ON(!list_empty(&io_end->list)); 129 BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); 130 WARN_ON(io_end->handle); 131 132 for (bio = io_end->bio; bio; bio = next_bio) { 133 next_bio = bio->bi_private; 134 ext4_finish_bio(bio); 135 bio_put(bio); 136 } 137 kmem_cache_free(io_end_cachep, io_end); 138 } 139 140 /* 141 * Check a range of space and convert unwritten extents to written. Note that 142 * we are protected from truncate touching same part of extent tree by the 143 * fact that truncate code waits for all DIO to finish (thus exclusion from 144 * direct IO is achieved) and also waits for PageWriteback bits. Thus we 145 * cannot get to ext4_ext_truncate() before all IOs overlapping that range are 146 * completed (happens from ext4_free_ioend()). 147 */ 148 static int ext4_end_io(ext4_io_end_t *io) 149 { 150 struct inode *inode = io->inode; 151 loff_t offset = io->offset; 152 ssize_t size = io->size; 153 handle_t *handle = io->handle; 154 int ret = 0; 155 156 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," 157 "list->prev 0x%p\n", 158 io, inode->i_ino, io->list.next, io->list.prev); 159 160 io->handle = NULL; /* Following call will use up the handle */ 161 ret = ext4_convert_unwritten_extents(handle, inode, offset, size); 162 if (ret < 0 && !ext4_forced_shutdown(EXT4_SB(inode->i_sb))) { 163 ext4_msg(inode->i_sb, KERN_EMERG, 164 "failed to convert unwritten extents to written " 165 "extents -- potential data loss! " 166 "(inode %lu, offset %llu, size %zd, error %d)", 167 inode->i_ino, offset, size, ret); 168 } 169 ext4_clear_io_unwritten_flag(io); 170 ext4_release_io_end(io); 171 return ret; 172 } 173 174 static void dump_completed_IO(struct inode *inode, struct list_head *head) 175 { 176 #ifdef EXT4FS_DEBUG 177 struct list_head *cur, *before, *after; 178 ext4_io_end_t *io, *io0, *io1; 179 180 if (list_empty(head)) 181 return; 182 183 ext4_debug("Dump inode %lu completed io list\n", inode->i_ino); 184 list_for_each_entry(io, head, list) { 185 cur = &io->list; 186 before = cur->prev; 187 io0 = container_of(before, ext4_io_end_t, list); 188 after = cur->next; 189 io1 = container_of(after, ext4_io_end_t, list); 190 191 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", 192 io, inode->i_ino, io0, io1); 193 } 194 #endif 195 } 196 197 /* Add the io_end to per-inode completed end_io list. */ 198 static void ext4_add_complete_io(ext4_io_end_t *io_end) 199 { 200 struct ext4_inode_info *ei = EXT4_I(io_end->inode); 201 struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb); 202 struct workqueue_struct *wq; 203 unsigned long flags; 204 205 /* Only reserved conversions from writeback should enter here */ 206 WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); 207 WARN_ON(!io_end->handle && sbi->s_journal); 208 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 209 wq = sbi->rsv_conversion_wq; 210 if (list_empty(&ei->i_rsv_conversion_list)) 211 queue_work(wq, &ei->i_rsv_conversion_work); 212 list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); 213 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 214 } 215 216 static int ext4_do_flush_completed_IO(struct inode *inode, 217 struct list_head *head) 218 { 219 ext4_io_end_t *io; 220 struct list_head unwritten; 221 unsigned long flags; 222 struct ext4_inode_info *ei = EXT4_I(inode); 223 int err, ret = 0; 224 225 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 226 dump_completed_IO(inode, head); 227 list_replace_init(head, &unwritten); 228 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 229 230 while (!list_empty(&unwritten)) { 231 io = list_entry(unwritten.next, ext4_io_end_t, list); 232 BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN)); 233 list_del_init(&io->list); 234 235 err = ext4_end_io(io); 236 if (unlikely(!ret && err)) 237 ret = err; 238 } 239 return ret; 240 } 241 242 /* 243 * work on completed IO, to convert unwritten extents to extents 244 */ 245 void ext4_end_io_rsv_work(struct work_struct *work) 246 { 247 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, 248 i_rsv_conversion_work); 249 ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list); 250 } 251 252 ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) 253 { 254 ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); 255 if (io) { 256 io->inode = inode; 257 INIT_LIST_HEAD(&io->list); 258 atomic_set(&io->count, 1); 259 } 260 return io; 261 } 262 263 void ext4_put_io_end_defer(ext4_io_end_t *io_end) 264 { 265 if (atomic_dec_and_test(&io_end->count)) { 266 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { 267 ext4_release_io_end(io_end); 268 return; 269 } 270 ext4_add_complete_io(io_end); 271 } 272 } 273 274 int ext4_put_io_end(ext4_io_end_t *io_end) 275 { 276 int err = 0; 277 278 if (atomic_dec_and_test(&io_end->count)) { 279 if (io_end->flag & EXT4_IO_END_UNWRITTEN) { 280 err = ext4_convert_unwritten_extents(io_end->handle, 281 io_end->inode, io_end->offset, 282 io_end->size); 283 io_end->handle = NULL; 284 ext4_clear_io_unwritten_flag(io_end); 285 } 286 ext4_release_io_end(io_end); 287 } 288 return err; 289 } 290 291 ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) 292 { 293 atomic_inc(&io_end->count); 294 return io_end; 295 } 296 297 /* BIO completion function for page writeback */ 298 static void ext4_end_bio(struct bio *bio) 299 { 300 ext4_io_end_t *io_end = bio->bi_private; 301 sector_t bi_sector = bio->bi_iter.bi_sector; 302 char b[BDEVNAME_SIZE]; 303 304 if (WARN_ONCE(!io_end, "io_end is NULL: %s: sector %Lu len %u err %d\n", 305 bio_devname(bio, b), 306 (long long) bio->bi_iter.bi_sector, 307 (unsigned) bio_sectors(bio), 308 bio->bi_status)) { 309 ext4_finish_bio(bio); 310 bio_put(bio); 311 return; 312 } 313 bio->bi_end_io = NULL; 314 315 if (bio->bi_status) { 316 struct inode *inode = io_end->inode; 317 318 ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " 319 "(offset %llu size %ld starting block %llu)", 320 bio->bi_status, inode->i_ino, 321 (unsigned long long) io_end->offset, 322 (long) io_end->size, 323 (unsigned long long) 324 bi_sector >> (inode->i_blkbits - 9)); 325 mapping_set_error(inode->i_mapping, 326 blk_status_to_errno(bio->bi_status)); 327 } 328 329 if (io_end->flag & EXT4_IO_END_UNWRITTEN) { 330 /* 331 * Link bio into list hanging from io_end. We have to do it 332 * atomically as bio completions can be racing against each 333 * other. 334 */ 335 bio->bi_private = xchg(&io_end->bio, bio); 336 ext4_put_io_end_defer(io_end); 337 } else { 338 /* 339 * Drop io_end reference early. Inode can get freed once 340 * we finish the bio. 341 */ 342 ext4_put_io_end_defer(io_end); 343 ext4_finish_bio(bio); 344 bio_put(bio); 345 } 346 } 347 348 void ext4_io_submit(struct ext4_io_submit *io) 349 { 350 struct bio *bio = io->io_bio; 351 352 if (bio) { 353 int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ? 354 REQ_SYNC : 0; 355 io->io_bio->bi_write_hint = io->io_end->inode->i_write_hint; 356 bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags); 357 submit_bio(io->io_bio); 358 } 359 io->io_bio = NULL; 360 } 361 362 void ext4_io_submit_init(struct ext4_io_submit *io, 363 struct writeback_control *wbc) 364 { 365 io->io_wbc = wbc; 366 io->io_bio = NULL; 367 io->io_end = NULL; 368 } 369 370 static int io_submit_init_bio(struct ext4_io_submit *io, 371 struct buffer_head *bh) 372 { 373 struct bio *bio; 374 375 bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); 376 if (!bio) 377 return -ENOMEM; 378 bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); 379 bio_set_dev(bio, bh->b_bdev); 380 bio->bi_end_io = ext4_end_bio; 381 bio->bi_private = ext4_get_io_end(io->io_end); 382 io->io_bio = bio; 383 io->io_next_block = bh->b_blocknr; 384 wbc_init_bio(io->io_wbc, bio); 385 return 0; 386 } 387 388 static int io_submit_add_bh(struct ext4_io_submit *io, 389 struct inode *inode, 390 struct page *page, 391 struct buffer_head *bh) 392 { 393 int ret; 394 395 if (io->io_bio && bh->b_blocknr != io->io_next_block) { 396 submit_and_retry: 397 ext4_io_submit(io); 398 } 399 if (io->io_bio == NULL) { 400 ret = io_submit_init_bio(io, bh); 401 if (ret) 402 return ret; 403 io->io_bio->bi_write_hint = inode->i_write_hint; 404 } 405 ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); 406 if (ret != bh->b_size) 407 goto submit_and_retry; 408 wbc_account_io(io->io_wbc, page, bh->b_size); 409 io->io_next_block++; 410 return 0; 411 } 412 413 int ext4_bio_write_page(struct ext4_io_submit *io, 414 struct page *page, 415 int len, 416 struct writeback_control *wbc, 417 bool keep_towrite) 418 { 419 struct page *data_page = NULL; 420 struct inode *inode = page->mapping->host; 421 unsigned block_start; 422 struct buffer_head *bh, *head; 423 int ret = 0; 424 int nr_submitted = 0; 425 int nr_to_submit = 0; 426 427 BUG_ON(!PageLocked(page)); 428 BUG_ON(PageWriteback(page)); 429 430 if (keep_towrite) 431 set_page_writeback_keepwrite(page); 432 else 433 set_page_writeback(page); 434 ClearPageError(page); 435 436 /* 437 * Comments copied from block_write_full_page: 438 * 439 * The page straddles i_size. It must be zeroed out on each and every 440 * writepage invocation because it may be mmapped. "A file is mapped 441 * in multiples of the page size. For a file that is not a multiple of 442 * the page size, the remaining memory is zeroed when mapped, and 443 * writes to that region are not written out to the file." 444 */ 445 if (len < PAGE_SIZE) 446 zero_user_segment(page, len, PAGE_SIZE); 447 /* 448 * In the first loop we prepare and mark buffers to submit. We have to 449 * mark all buffers in the page before submitting so that 450 * end_page_writeback() cannot be called from ext4_bio_end_io() when IO 451 * on the first buffer finishes and we are still working on submitting 452 * the second buffer. 453 */ 454 bh = head = page_buffers(page); 455 do { 456 block_start = bh_offset(bh); 457 if (block_start >= len) { 458 clear_buffer_dirty(bh); 459 set_buffer_uptodate(bh); 460 continue; 461 } 462 if (!buffer_dirty(bh) || buffer_delay(bh) || 463 !buffer_mapped(bh) || buffer_unwritten(bh)) { 464 /* A hole? We can safely clear the dirty bit */ 465 if (!buffer_mapped(bh)) 466 clear_buffer_dirty(bh); 467 if (io->io_bio) 468 ext4_io_submit(io); 469 continue; 470 } 471 if (buffer_new(bh)) 472 clear_buffer_new(bh); 473 set_buffer_async_write(bh); 474 nr_to_submit++; 475 } while ((bh = bh->b_this_page) != head); 476 477 bh = head = page_buffers(page); 478 479 if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode) && nr_to_submit) { 480 gfp_t gfp_flags = GFP_NOFS; 481 482 retry_encrypt: 483 data_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0, 484 page->index, gfp_flags); 485 if (IS_ERR(data_page)) { 486 ret = PTR_ERR(data_page); 487 if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) { 488 if (io->io_bio) { 489 ext4_io_submit(io); 490 congestion_wait(BLK_RW_ASYNC, HZ/50); 491 } 492 gfp_flags |= __GFP_NOFAIL; 493 goto retry_encrypt; 494 } 495 data_page = NULL; 496 goto out; 497 } 498 } 499 500 /* Now submit buffers to write */ 501 do { 502 if (!buffer_async_write(bh)) 503 continue; 504 ret = io_submit_add_bh(io, inode, 505 data_page ? data_page : page, bh); 506 if (ret) { 507 /* 508 * We only get here on ENOMEM. Not much else 509 * we can do but mark the page as dirty, and 510 * better luck next time. 511 */ 512 break; 513 } 514 nr_submitted++; 515 clear_buffer_dirty(bh); 516 } while ((bh = bh->b_this_page) != head); 517 518 /* Error stopped previous loop? Clean up buffers... */ 519 if (ret) { 520 out: 521 if (data_page) 522 fscrypt_restore_control_page(data_page); 523 printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); 524 redirty_page_for_writepage(wbc, page); 525 do { 526 clear_buffer_async_write(bh); 527 bh = bh->b_this_page; 528 } while (bh != head); 529 } 530 unlock_page(page); 531 /* Nothing submitted - we have to end page writeback */ 532 if (!nr_submitted) 533 end_page_writeback(page); 534 return ret; 535 } 536