1 /* 2 * linux/fs/ext4/page-io.c 3 * 4 * This contains the new page_io functions for ext4 5 * 6 * Written by Theodore Ts'o, 2010. 7 */ 8 9 #include <linux/fs.h> 10 #include <linux/time.h> 11 #include <linux/highuid.h> 12 #include <linux/pagemap.h> 13 #include <linux/quotaops.h> 14 #include <linux/string.h> 15 #include <linux/buffer_head.h> 16 #include <linux/writeback.h> 17 #include <linux/pagevec.h> 18 #include <linux/mpage.h> 19 #include <linux/namei.h> 20 #include <linux/uio.h> 21 #include <linux/bio.h> 22 #include <linux/workqueue.h> 23 #include <linux/kernel.h> 24 #include <linux/slab.h> 25 #include <linux/mm.h> 26 #include <linux/backing-dev.h> 27 28 #include "ext4_jbd2.h" 29 #include "xattr.h" 30 #include "acl.h" 31 32 static struct kmem_cache *io_end_cachep; 33 34 int __init ext4_init_pageio(void) 35 { 36 io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); 37 if (io_end_cachep == NULL) 38 return -ENOMEM; 39 return 0; 40 } 41 42 void ext4_exit_pageio(void) 43 { 44 kmem_cache_destroy(io_end_cachep); 45 } 46 47 /* 48 * Print an buffer I/O error compatible with the fs/buffer.c. This 49 * provides compatibility with dmesg scrapers that look for a specific 50 * buffer I/O error message. We really need a unified error reporting 51 * structure to userspace ala Digital Unix's uerf system, but it's 52 * probably not going to happen in my lifetime, due to LKML politics... 53 */ 54 static void buffer_io_error(struct buffer_head *bh) 55 { 56 printk_ratelimited(KERN_ERR "Buffer I/O error on device %pg, logical block %llu\n", 57 bh->b_bdev, 58 (unsigned long long)bh->b_blocknr); 59 } 60 61 static void ext4_finish_bio(struct bio *bio) 62 { 63 int i; 64 struct bio_vec *bvec; 65 66 bio_for_each_segment_all(bvec, bio, i) { 67 struct page *page = bvec->bv_page; 68 #ifdef CONFIG_EXT4_FS_ENCRYPTION 69 struct page *data_page = NULL; 70 struct ext4_crypto_ctx *ctx = NULL; 71 #endif 72 struct buffer_head *bh, *head; 73 unsigned bio_start = bvec->bv_offset; 74 unsigned bio_end = bio_start + bvec->bv_len; 75 unsigned under_io = 0; 76 unsigned long flags; 77 78 if (!page) 79 continue; 80 81 #ifdef CONFIG_EXT4_FS_ENCRYPTION 82 if (!page->mapping) { 83 /* The bounce data pages are unmapped. */ 84 data_page = page; 85 ctx = (struct ext4_crypto_ctx *)page_private(data_page); 86 page = ctx->w.control_page; 87 } 88 #endif 89 90 if (bio->bi_error) { 91 SetPageError(page); 92 set_bit(AS_EIO, &page->mapping->flags); 93 } 94 bh = head = page_buffers(page); 95 /* 96 * We check all buffers in the page under BH_Uptodate_Lock 97 * to avoid races with other end io clearing async_write flags 98 */ 99 local_irq_save(flags); 100 bit_spin_lock(BH_Uptodate_Lock, &head->b_state); 101 do { 102 if (bh_offset(bh) < bio_start || 103 bh_offset(bh) + bh->b_size > bio_end) { 104 if (buffer_async_write(bh)) 105 under_io++; 106 continue; 107 } 108 clear_buffer_async_write(bh); 109 if (bio->bi_error) 110 buffer_io_error(bh); 111 } while ((bh = bh->b_this_page) != head); 112 bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); 113 local_irq_restore(flags); 114 if (!under_io) { 115 #ifdef CONFIG_EXT4_FS_ENCRYPTION 116 if (ctx) 117 ext4_restore_control_page(data_page); 118 #endif 119 end_page_writeback(page); 120 } 121 } 122 } 123 124 static void ext4_release_io_end(ext4_io_end_t *io_end) 125 { 126 struct bio *bio, *next_bio; 127 128 BUG_ON(!list_empty(&io_end->list)); 129 BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); 130 WARN_ON(io_end->handle); 131 132 for (bio = io_end->bio; bio; bio = next_bio) { 133 next_bio = bio->bi_private; 134 ext4_finish_bio(bio); 135 bio_put(bio); 136 } 137 kmem_cache_free(io_end_cachep, io_end); 138 } 139 140 /* 141 * Check a range of space and convert unwritten extents to written. Note that 142 * we are protected from truncate touching same part of extent tree by the 143 * fact that truncate code waits for all DIO to finish (thus exclusion from 144 * direct IO is achieved) and also waits for PageWriteback bits. Thus we 145 * cannot get to ext4_ext_truncate() before all IOs overlapping that range are 146 * completed (happens from ext4_free_ioend()). 147 */ 148 static int ext4_end_io(ext4_io_end_t *io) 149 { 150 struct inode *inode = io->inode; 151 loff_t offset = io->offset; 152 ssize_t size = io->size; 153 handle_t *handle = io->handle; 154 int ret = 0; 155 156 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," 157 "list->prev 0x%p\n", 158 io, inode->i_ino, io->list.next, io->list.prev); 159 160 io->handle = NULL; /* Following call will use up the handle */ 161 ret = ext4_convert_unwritten_extents(handle, inode, offset, size); 162 if (ret < 0) { 163 ext4_msg(inode->i_sb, KERN_EMERG, 164 "failed to convert unwritten extents to written " 165 "extents -- potential data loss! " 166 "(inode %lu, offset %llu, size %zd, error %d)", 167 inode->i_ino, offset, size, ret); 168 } 169 ext4_clear_io_unwritten_flag(io); 170 ext4_release_io_end(io); 171 return ret; 172 } 173 174 static void dump_completed_IO(struct inode *inode, struct list_head *head) 175 { 176 #ifdef EXT4FS_DEBUG 177 struct list_head *cur, *before, *after; 178 ext4_io_end_t *io, *io0, *io1; 179 180 if (list_empty(head)) 181 return; 182 183 ext4_debug("Dump inode %lu completed io list\n", inode->i_ino); 184 list_for_each_entry(io, head, list) { 185 cur = &io->list; 186 before = cur->prev; 187 io0 = container_of(before, ext4_io_end_t, list); 188 after = cur->next; 189 io1 = container_of(after, ext4_io_end_t, list); 190 191 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", 192 io, inode->i_ino, io0, io1); 193 } 194 #endif 195 } 196 197 /* Add the io_end to per-inode completed end_io list. */ 198 static void ext4_add_complete_io(ext4_io_end_t *io_end) 199 { 200 struct ext4_inode_info *ei = EXT4_I(io_end->inode); 201 struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb); 202 struct workqueue_struct *wq; 203 unsigned long flags; 204 205 /* Only reserved conversions from writeback should enter here */ 206 WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); 207 WARN_ON(!io_end->handle && sbi->s_journal); 208 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 209 wq = sbi->rsv_conversion_wq; 210 if (list_empty(&ei->i_rsv_conversion_list)) 211 queue_work(wq, &ei->i_rsv_conversion_work); 212 list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); 213 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 214 } 215 216 static int ext4_do_flush_completed_IO(struct inode *inode, 217 struct list_head *head) 218 { 219 ext4_io_end_t *io; 220 struct list_head unwritten; 221 unsigned long flags; 222 struct ext4_inode_info *ei = EXT4_I(inode); 223 int err, ret = 0; 224 225 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 226 dump_completed_IO(inode, head); 227 list_replace_init(head, &unwritten); 228 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 229 230 while (!list_empty(&unwritten)) { 231 io = list_entry(unwritten.next, ext4_io_end_t, list); 232 BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN)); 233 list_del_init(&io->list); 234 235 err = ext4_end_io(io); 236 if (unlikely(!ret && err)) 237 ret = err; 238 } 239 return ret; 240 } 241 242 /* 243 * work on completed IO, to convert unwritten extents to extents 244 */ 245 void ext4_end_io_rsv_work(struct work_struct *work) 246 { 247 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, 248 i_rsv_conversion_work); 249 ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list); 250 } 251 252 ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) 253 { 254 ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); 255 if (io) { 256 io->inode = inode; 257 INIT_LIST_HEAD(&io->list); 258 atomic_set(&io->count, 1); 259 } 260 return io; 261 } 262 263 void ext4_put_io_end_defer(ext4_io_end_t *io_end) 264 { 265 if (atomic_dec_and_test(&io_end->count)) { 266 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { 267 ext4_release_io_end(io_end); 268 return; 269 } 270 ext4_add_complete_io(io_end); 271 } 272 } 273 274 int ext4_put_io_end(ext4_io_end_t *io_end) 275 { 276 int err = 0; 277 278 if (atomic_dec_and_test(&io_end->count)) { 279 if (io_end->flag & EXT4_IO_END_UNWRITTEN) { 280 err = ext4_convert_unwritten_extents(io_end->handle, 281 io_end->inode, io_end->offset, 282 io_end->size); 283 io_end->handle = NULL; 284 ext4_clear_io_unwritten_flag(io_end); 285 } 286 ext4_release_io_end(io_end); 287 } 288 return err; 289 } 290 291 ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) 292 { 293 atomic_inc(&io_end->count); 294 return io_end; 295 } 296 297 /* BIO completion function for page writeback */ 298 static void ext4_end_bio(struct bio *bio) 299 { 300 ext4_io_end_t *io_end = bio->bi_private; 301 sector_t bi_sector = bio->bi_iter.bi_sector; 302 303 BUG_ON(!io_end); 304 bio->bi_end_io = NULL; 305 306 if (bio->bi_error) { 307 struct inode *inode = io_end->inode; 308 309 ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " 310 "(offset %llu size %ld starting block %llu)", 311 bio->bi_error, inode->i_ino, 312 (unsigned long long) io_end->offset, 313 (long) io_end->size, 314 (unsigned long long) 315 bi_sector >> (inode->i_blkbits - 9)); 316 mapping_set_error(inode->i_mapping, bio->bi_error); 317 } 318 319 if (io_end->flag & EXT4_IO_END_UNWRITTEN) { 320 /* 321 * Link bio into list hanging from io_end. We have to do it 322 * atomically as bio completions can be racing against each 323 * other. 324 */ 325 bio->bi_private = xchg(&io_end->bio, bio); 326 ext4_put_io_end_defer(io_end); 327 } else { 328 /* 329 * Drop io_end reference early. Inode can get freed once 330 * we finish the bio. 331 */ 332 ext4_put_io_end_defer(io_end); 333 ext4_finish_bio(bio); 334 bio_put(bio); 335 } 336 } 337 338 void ext4_io_submit(struct ext4_io_submit *io) 339 { 340 struct bio *bio = io->io_bio; 341 342 if (bio) { 343 int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ? 344 WRITE_SYNC : WRITE; 345 submit_bio(io_op, io->io_bio); 346 } 347 io->io_bio = NULL; 348 } 349 350 void ext4_io_submit_init(struct ext4_io_submit *io, 351 struct writeback_control *wbc) 352 { 353 io->io_wbc = wbc; 354 io->io_bio = NULL; 355 io->io_end = NULL; 356 } 357 358 static int io_submit_init_bio(struct ext4_io_submit *io, 359 struct buffer_head *bh) 360 { 361 struct bio *bio; 362 363 bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); 364 if (!bio) 365 return -ENOMEM; 366 wbc_init_bio(io->io_wbc, bio); 367 bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); 368 bio->bi_bdev = bh->b_bdev; 369 bio->bi_end_io = ext4_end_bio; 370 bio->bi_private = ext4_get_io_end(io->io_end); 371 io->io_bio = bio; 372 io->io_next_block = bh->b_blocknr; 373 return 0; 374 } 375 376 static int io_submit_add_bh(struct ext4_io_submit *io, 377 struct inode *inode, 378 struct page *page, 379 struct buffer_head *bh) 380 { 381 int ret; 382 383 if (io->io_bio && bh->b_blocknr != io->io_next_block) { 384 submit_and_retry: 385 ext4_io_submit(io); 386 } 387 if (io->io_bio == NULL) { 388 ret = io_submit_init_bio(io, bh); 389 if (ret) 390 return ret; 391 } 392 ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); 393 if (ret != bh->b_size) 394 goto submit_and_retry; 395 wbc_account_io(io->io_wbc, page, bh->b_size); 396 io->io_next_block++; 397 return 0; 398 } 399 400 int ext4_bio_write_page(struct ext4_io_submit *io, 401 struct page *page, 402 int len, 403 struct writeback_control *wbc, 404 bool keep_towrite) 405 { 406 struct page *data_page = NULL; 407 struct inode *inode = page->mapping->host; 408 unsigned block_start, blocksize; 409 struct buffer_head *bh, *head; 410 int ret = 0; 411 int nr_submitted = 0; 412 int nr_to_submit = 0; 413 414 blocksize = 1 << inode->i_blkbits; 415 416 BUG_ON(!PageLocked(page)); 417 BUG_ON(PageWriteback(page)); 418 419 if (keep_towrite) 420 set_page_writeback_keepwrite(page); 421 else 422 set_page_writeback(page); 423 ClearPageError(page); 424 425 /* 426 * Comments copied from block_write_full_page: 427 * 428 * The page straddles i_size. It must be zeroed out on each and every 429 * writepage invocation because it may be mmapped. "A file is mapped 430 * in multiples of the page size. For a file that is not a multiple of 431 * the page size, the remaining memory is zeroed when mapped, and 432 * writes to that region are not written out to the file." 433 */ 434 if (len < PAGE_SIZE) 435 zero_user_segment(page, len, PAGE_SIZE); 436 /* 437 * In the first loop we prepare and mark buffers to submit. We have to 438 * mark all buffers in the page before submitting so that 439 * end_page_writeback() cannot be called from ext4_bio_end_io() when IO 440 * on the first buffer finishes and we are still working on submitting 441 * the second buffer. 442 */ 443 bh = head = page_buffers(page); 444 do { 445 block_start = bh_offset(bh); 446 if (block_start >= len) { 447 clear_buffer_dirty(bh); 448 set_buffer_uptodate(bh); 449 continue; 450 } 451 if (!buffer_dirty(bh) || buffer_delay(bh) || 452 !buffer_mapped(bh) || buffer_unwritten(bh)) { 453 /* A hole? We can safely clear the dirty bit */ 454 if (!buffer_mapped(bh)) 455 clear_buffer_dirty(bh); 456 if (io->io_bio) 457 ext4_io_submit(io); 458 continue; 459 } 460 if (buffer_new(bh)) { 461 clear_buffer_new(bh); 462 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); 463 } 464 set_buffer_async_write(bh); 465 nr_to_submit++; 466 } while ((bh = bh->b_this_page) != head); 467 468 bh = head = page_buffers(page); 469 470 if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) && 471 nr_to_submit) { 472 gfp_t gfp_flags = GFP_NOFS; 473 474 retry_encrypt: 475 data_page = ext4_encrypt(inode, page, gfp_flags); 476 if (IS_ERR(data_page)) { 477 ret = PTR_ERR(data_page); 478 if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) { 479 if (io->io_bio) { 480 ext4_io_submit(io); 481 congestion_wait(BLK_RW_ASYNC, HZ/50); 482 } 483 gfp_flags |= __GFP_NOFAIL; 484 goto retry_encrypt; 485 } 486 data_page = NULL; 487 goto out; 488 } 489 } 490 491 /* Now submit buffers to write */ 492 do { 493 if (!buffer_async_write(bh)) 494 continue; 495 ret = io_submit_add_bh(io, inode, 496 data_page ? data_page : page, bh); 497 if (ret) { 498 /* 499 * We only get here on ENOMEM. Not much else 500 * we can do but mark the page as dirty, and 501 * better luck next time. 502 */ 503 break; 504 } 505 nr_submitted++; 506 clear_buffer_dirty(bh); 507 } while ((bh = bh->b_this_page) != head); 508 509 /* Error stopped previous loop? Clean up buffers... */ 510 if (ret) { 511 out: 512 if (data_page) 513 ext4_restore_control_page(data_page); 514 printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); 515 redirty_page_for_writepage(wbc, page); 516 do { 517 clear_buffer_async_write(bh); 518 bh = bh->b_this_page; 519 } while (bh != head); 520 } 521 unlock_page(page); 522 /* Nothing submitted - we have to end page writeback */ 523 if (!nr_submitted) 524 end_page_writeback(page); 525 return ret; 526 } 527