1 /* 2 * linux/fs/ext4/page-io.c 3 * 4 * This contains the new page_io functions for ext4 5 * 6 * Written by Theodore Ts'o, 2010. 7 */ 8 9 #include <linux/fs.h> 10 #include <linux/time.h> 11 #include <linux/highuid.h> 12 #include <linux/pagemap.h> 13 #include <linux/quotaops.h> 14 #include <linux/string.h> 15 #include <linux/buffer_head.h> 16 #include <linux/writeback.h> 17 #include <linux/pagevec.h> 18 #include <linux/mpage.h> 19 #include <linux/namei.h> 20 #include <linux/uio.h> 21 #include <linux/bio.h> 22 #include <linux/workqueue.h> 23 #include <linux/kernel.h> 24 #include <linux/slab.h> 25 #include <linux/mm.h> 26 27 #include "ext4_jbd2.h" 28 #include "xattr.h" 29 #include "acl.h" 30 31 static struct kmem_cache *io_end_cachep; 32 33 int __init ext4_init_pageio(void) 34 { 35 io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); 36 if (io_end_cachep == NULL) 37 return -ENOMEM; 38 return 0; 39 } 40 41 void ext4_exit_pageio(void) 42 { 43 kmem_cache_destroy(io_end_cachep); 44 } 45 46 /* 47 * Print an buffer I/O error compatible with the fs/buffer.c. This 48 * provides compatibility with dmesg scrapers that look for a specific 49 * buffer I/O error message. We really need a unified error reporting 50 * structure to userspace ala Digital Unix's uerf system, but it's 51 * probably not going to happen in my lifetime, due to LKML politics... 52 */ 53 static void buffer_io_error(struct buffer_head *bh) 54 { 55 char b[BDEVNAME_SIZE]; 56 printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", 57 bdevname(bh->b_bdev, b), 58 (unsigned long long)bh->b_blocknr); 59 } 60 61 static void ext4_finish_bio(struct bio *bio) 62 { 63 int i; 64 struct bio_vec *bvec; 65 66 bio_for_each_segment_all(bvec, bio, i) { 67 struct page *page = bvec->bv_page; 68 #ifdef CONFIG_EXT4_FS_ENCRYPTION 69 struct page *data_page = NULL; 70 struct ext4_crypto_ctx *ctx = NULL; 71 #endif 72 struct buffer_head *bh, *head; 73 unsigned bio_start = bvec->bv_offset; 74 unsigned bio_end = bio_start + bvec->bv_len; 75 unsigned under_io = 0; 76 unsigned long flags; 77 78 if (!page) 79 continue; 80 81 #ifdef CONFIG_EXT4_FS_ENCRYPTION 82 if (!page->mapping) { 83 /* The bounce data pages are unmapped. */ 84 data_page = page; 85 ctx = (struct ext4_crypto_ctx *)page_private(data_page); 86 page = ctx->w.control_page; 87 } 88 #endif 89 90 if (bio->bi_error) { 91 SetPageError(page); 92 set_bit(AS_EIO, &page->mapping->flags); 93 } 94 bh = head = page_buffers(page); 95 /* 96 * We check all buffers in the page under BH_Uptodate_Lock 97 * to avoid races with other end io clearing async_write flags 98 */ 99 local_irq_save(flags); 100 bit_spin_lock(BH_Uptodate_Lock, &head->b_state); 101 do { 102 if (bh_offset(bh) < bio_start || 103 bh_offset(bh) + bh->b_size > bio_end) { 104 if (buffer_async_write(bh)) 105 under_io++; 106 continue; 107 } 108 clear_buffer_async_write(bh); 109 if (bio->bi_error) 110 buffer_io_error(bh); 111 } while ((bh = bh->b_this_page) != head); 112 bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); 113 local_irq_restore(flags); 114 if (!under_io) { 115 #ifdef CONFIG_EXT4_FS_ENCRYPTION 116 if (ctx) 117 ext4_restore_control_page(data_page); 118 #endif 119 end_page_writeback(page); 120 } 121 } 122 } 123 124 static void ext4_release_io_end(ext4_io_end_t *io_end) 125 { 126 struct bio *bio, *next_bio; 127 128 BUG_ON(!list_empty(&io_end->list)); 129 BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN); 130 WARN_ON(io_end->handle); 131 132 if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count)) 133 wake_up_all(ext4_ioend_wq(io_end->inode)); 134 135 for (bio = io_end->bio; bio; bio = next_bio) { 136 next_bio = bio->bi_private; 137 ext4_finish_bio(bio); 138 bio_put(bio); 139 } 140 kmem_cache_free(io_end_cachep, io_end); 141 } 142 143 static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) 144 { 145 struct inode *inode = io_end->inode; 146 147 io_end->flag &= ~EXT4_IO_END_UNWRITTEN; 148 /* Wake up anyone waiting on unwritten extent conversion */ 149 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) 150 wake_up_all(ext4_ioend_wq(inode)); 151 } 152 153 /* 154 * Check a range of space and convert unwritten extents to written. Note that 155 * we are protected from truncate touching same part of extent tree by the 156 * fact that truncate code waits for all DIO to finish (thus exclusion from 157 * direct IO is achieved) and also waits for PageWriteback bits. Thus we 158 * cannot get to ext4_ext_truncate() before all IOs overlapping that range are 159 * completed (happens from ext4_free_ioend()). 160 */ 161 static int ext4_end_io(ext4_io_end_t *io) 162 { 163 struct inode *inode = io->inode; 164 loff_t offset = io->offset; 165 ssize_t size = io->size; 166 handle_t *handle = io->handle; 167 int ret = 0; 168 169 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," 170 "list->prev 0x%p\n", 171 io, inode->i_ino, io->list.next, io->list.prev); 172 173 io->handle = NULL; /* Following call will use up the handle */ 174 ret = ext4_convert_unwritten_extents(handle, inode, offset, size); 175 if (ret < 0) { 176 ext4_msg(inode->i_sb, KERN_EMERG, 177 "failed to convert unwritten extents to written " 178 "extents -- potential data loss! " 179 "(inode %lu, offset %llu, size %zd, error %d)", 180 inode->i_ino, offset, size, ret); 181 } 182 ext4_clear_io_unwritten_flag(io); 183 ext4_release_io_end(io); 184 return ret; 185 } 186 187 static void dump_completed_IO(struct inode *inode, struct list_head *head) 188 { 189 #ifdef EXT4FS_DEBUG 190 struct list_head *cur, *before, *after; 191 ext4_io_end_t *io, *io0, *io1; 192 193 if (list_empty(head)) 194 return; 195 196 ext4_debug("Dump inode %lu completed io list\n", inode->i_ino); 197 list_for_each_entry(io, head, list) { 198 cur = &io->list; 199 before = cur->prev; 200 io0 = container_of(before, ext4_io_end_t, list); 201 after = cur->next; 202 io1 = container_of(after, ext4_io_end_t, list); 203 204 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", 205 io, inode->i_ino, io0, io1); 206 } 207 #endif 208 } 209 210 /* Add the io_end to per-inode completed end_io list. */ 211 static void ext4_add_complete_io(ext4_io_end_t *io_end) 212 { 213 struct ext4_inode_info *ei = EXT4_I(io_end->inode); 214 struct ext4_sb_info *sbi = EXT4_SB(io_end->inode->i_sb); 215 struct workqueue_struct *wq; 216 unsigned long flags; 217 218 /* Only reserved conversions from writeback should enter here */ 219 WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); 220 WARN_ON(!io_end->handle && sbi->s_journal); 221 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 222 wq = sbi->rsv_conversion_wq; 223 if (list_empty(&ei->i_rsv_conversion_list)) 224 queue_work(wq, &ei->i_rsv_conversion_work); 225 list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); 226 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 227 } 228 229 static int ext4_do_flush_completed_IO(struct inode *inode, 230 struct list_head *head) 231 { 232 ext4_io_end_t *io; 233 struct list_head unwritten; 234 unsigned long flags; 235 struct ext4_inode_info *ei = EXT4_I(inode); 236 int err, ret = 0; 237 238 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 239 dump_completed_IO(inode, head); 240 list_replace_init(head, &unwritten); 241 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 242 243 while (!list_empty(&unwritten)) { 244 io = list_entry(unwritten.next, ext4_io_end_t, list); 245 BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN)); 246 list_del_init(&io->list); 247 248 err = ext4_end_io(io); 249 if (unlikely(!ret && err)) 250 ret = err; 251 } 252 return ret; 253 } 254 255 /* 256 * work on completed IO, to convert unwritten extents to extents 257 */ 258 void ext4_end_io_rsv_work(struct work_struct *work) 259 { 260 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, 261 i_rsv_conversion_work); 262 ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list); 263 } 264 265 ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) 266 { 267 ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); 268 if (io) { 269 atomic_inc(&EXT4_I(inode)->i_ioend_count); 270 io->inode = inode; 271 INIT_LIST_HEAD(&io->list); 272 atomic_set(&io->count, 1); 273 } 274 return io; 275 } 276 277 void ext4_put_io_end_defer(ext4_io_end_t *io_end) 278 { 279 if (atomic_dec_and_test(&io_end->count)) { 280 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) { 281 ext4_release_io_end(io_end); 282 return; 283 } 284 ext4_add_complete_io(io_end); 285 } 286 } 287 288 int ext4_put_io_end(ext4_io_end_t *io_end) 289 { 290 int err = 0; 291 292 if (atomic_dec_and_test(&io_end->count)) { 293 if (io_end->flag & EXT4_IO_END_UNWRITTEN) { 294 err = ext4_convert_unwritten_extents(io_end->handle, 295 io_end->inode, io_end->offset, 296 io_end->size); 297 io_end->handle = NULL; 298 ext4_clear_io_unwritten_flag(io_end); 299 } 300 ext4_release_io_end(io_end); 301 } 302 return err; 303 } 304 305 ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) 306 { 307 atomic_inc(&io_end->count); 308 return io_end; 309 } 310 311 /* BIO completion function for page writeback */ 312 static void ext4_end_bio(struct bio *bio) 313 { 314 ext4_io_end_t *io_end = bio->bi_private; 315 sector_t bi_sector = bio->bi_iter.bi_sector; 316 317 BUG_ON(!io_end); 318 bio->bi_end_io = NULL; 319 320 if (bio->bi_error) { 321 struct inode *inode = io_end->inode; 322 323 ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " 324 "(offset %llu size %ld starting block %llu)", 325 bio->bi_error, inode->i_ino, 326 (unsigned long long) io_end->offset, 327 (long) io_end->size, 328 (unsigned long long) 329 bi_sector >> (inode->i_blkbits - 9)); 330 mapping_set_error(inode->i_mapping, bio->bi_error); 331 } 332 333 if (io_end->flag & EXT4_IO_END_UNWRITTEN) { 334 /* 335 * Link bio into list hanging from io_end. We have to do it 336 * atomically as bio completions can be racing against each 337 * other. 338 */ 339 bio->bi_private = xchg(&io_end->bio, bio); 340 ext4_put_io_end_defer(io_end); 341 } else { 342 /* 343 * Drop io_end reference early. Inode can get freed once 344 * we finish the bio. 345 */ 346 ext4_put_io_end_defer(io_end); 347 ext4_finish_bio(bio); 348 bio_put(bio); 349 } 350 } 351 352 void ext4_io_submit(struct ext4_io_submit *io) 353 { 354 struct bio *bio = io->io_bio; 355 356 if (bio) { 357 int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ? 358 WRITE_SYNC : WRITE; 359 bio_get(io->io_bio); 360 submit_bio(io_op, io->io_bio); 361 bio_put(io->io_bio); 362 } 363 io->io_bio = NULL; 364 } 365 366 void ext4_io_submit_init(struct ext4_io_submit *io, 367 struct writeback_control *wbc) 368 { 369 io->io_wbc = wbc; 370 io->io_bio = NULL; 371 io->io_end = NULL; 372 } 373 374 static int io_submit_init_bio(struct ext4_io_submit *io, 375 struct buffer_head *bh) 376 { 377 struct bio *bio; 378 379 bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); 380 if (!bio) 381 return -ENOMEM; 382 wbc_init_bio(io->io_wbc, bio); 383 bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); 384 bio->bi_bdev = bh->b_bdev; 385 bio->bi_end_io = ext4_end_bio; 386 bio->bi_private = ext4_get_io_end(io->io_end); 387 io->io_bio = bio; 388 io->io_next_block = bh->b_blocknr; 389 return 0; 390 } 391 392 static int io_submit_add_bh(struct ext4_io_submit *io, 393 struct inode *inode, 394 struct page *page, 395 struct buffer_head *bh) 396 { 397 int ret; 398 399 if (io->io_bio && bh->b_blocknr != io->io_next_block) { 400 submit_and_retry: 401 ext4_io_submit(io); 402 } 403 if (io->io_bio == NULL) { 404 ret = io_submit_init_bio(io, bh); 405 if (ret) 406 return ret; 407 } 408 ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); 409 if (ret != bh->b_size) 410 goto submit_and_retry; 411 wbc_account_io(io->io_wbc, page, bh->b_size); 412 io->io_next_block++; 413 return 0; 414 } 415 416 int ext4_bio_write_page(struct ext4_io_submit *io, 417 struct page *page, 418 int len, 419 struct writeback_control *wbc, 420 bool keep_towrite) 421 { 422 struct page *data_page = NULL; 423 struct inode *inode = page->mapping->host; 424 unsigned block_start, blocksize; 425 struct buffer_head *bh, *head; 426 int ret = 0; 427 int nr_submitted = 0; 428 429 blocksize = 1 << inode->i_blkbits; 430 431 BUG_ON(!PageLocked(page)); 432 BUG_ON(PageWriteback(page)); 433 434 if (keep_towrite) 435 set_page_writeback_keepwrite(page); 436 else 437 set_page_writeback(page); 438 ClearPageError(page); 439 440 /* 441 * Comments copied from block_write_full_page: 442 * 443 * The page straddles i_size. It must be zeroed out on each and every 444 * writepage invocation because it may be mmapped. "A file is mapped 445 * in multiples of the page size. For a file that is not a multiple of 446 * the page size, the remaining memory is zeroed when mapped, and 447 * writes to that region are not written out to the file." 448 */ 449 if (len < PAGE_CACHE_SIZE) 450 zero_user_segment(page, len, PAGE_CACHE_SIZE); 451 /* 452 * In the first loop we prepare and mark buffers to submit. We have to 453 * mark all buffers in the page before submitting so that 454 * end_page_writeback() cannot be called from ext4_bio_end_io() when IO 455 * on the first buffer finishes and we are still working on submitting 456 * the second buffer. 457 */ 458 bh = head = page_buffers(page); 459 do { 460 block_start = bh_offset(bh); 461 if (block_start >= len) { 462 clear_buffer_dirty(bh); 463 set_buffer_uptodate(bh); 464 continue; 465 } 466 if (!buffer_dirty(bh) || buffer_delay(bh) || 467 !buffer_mapped(bh) || buffer_unwritten(bh)) { 468 /* A hole? We can safely clear the dirty bit */ 469 if (!buffer_mapped(bh)) 470 clear_buffer_dirty(bh); 471 if (io->io_bio) 472 ext4_io_submit(io); 473 continue; 474 } 475 if (buffer_new(bh)) { 476 clear_buffer_new(bh); 477 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); 478 } 479 set_buffer_async_write(bh); 480 } while ((bh = bh->b_this_page) != head); 481 482 bh = head = page_buffers(page); 483 484 if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { 485 data_page = ext4_encrypt(inode, page); 486 if (IS_ERR(data_page)) { 487 ret = PTR_ERR(data_page); 488 data_page = NULL; 489 goto out; 490 } 491 } 492 493 /* Now submit buffers to write */ 494 do { 495 if (!buffer_async_write(bh)) 496 continue; 497 ret = io_submit_add_bh(io, inode, 498 data_page ? data_page : page, bh); 499 if (ret) { 500 /* 501 * We only get here on ENOMEM. Not much else 502 * we can do but mark the page as dirty, and 503 * better luck next time. 504 */ 505 break; 506 } 507 nr_submitted++; 508 clear_buffer_dirty(bh); 509 } while ((bh = bh->b_this_page) != head); 510 511 /* Error stopped previous loop? Clean up buffers... */ 512 if (ret) { 513 out: 514 if (data_page) 515 ext4_restore_control_page(data_page); 516 printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); 517 redirty_page_for_writepage(wbc, page); 518 do { 519 clear_buffer_async_write(bh); 520 bh = bh->b_this_page; 521 } while (bh != head); 522 } 523 unlock_page(page); 524 /* Nothing submitted - we have to end page writeback */ 525 if (!nr_submitted) 526 end_page_writeback(page); 527 return ret; 528 } 529