1 /* 2 * linux/fs/ext4/page-io.c 3 * 4 * This contains the new page_io functions for ext4 5 * 6 * Written by Theodore Ts'o, 2010. 7 */ 8 9 #include <linux/fs.h> 10 #include <linux/time.h> 11 #include <linux/jbd2.h> 12 #include <linux/highuid.h> 13 #include <linux/pagemap.h> 14 #include <linux/quotaops.h> 15 #include <linux/string.h> 16 #include <linux/buffer_head.h> 17 #include <linux/writeback.h> 18 #include <linux/pagevec.h> 19 #include <linux/mpage.h> 20 #include <linux/namei.h> 21 #include <linux/aio.h> 22 #include <linux/uio.h> 23 #include <linux/bio.h> 24 #include <linux/workqueue.h> 25 #include <linux/kernel.h> 26 #include <linux/slab.h> 27 #include <linux/mm.h> 28 29 #include "ext4_jbd2.h" 30 #include "xattr.h" 31 #include "acl.h" 32 33 static struct kmem_cache *io_end_cachep; 34 35 int __init ext4_init_pageio(void) 36 { 37 io_end_cachep = KMEM_CACHE(ext4_io_end, SLAB_RECLAIM_ACCOUNT); 38 if (io_end_cachep == NULL) 39 return -ENOMEM; 40 return 0; 41 } 42 43 void ext4_exit_pageio(void) 44 { 45 kmem_cache_destroy(io_end_cachep); 46 } 47 48 /* 49 * This function is called by ext4_evict_inode() to make sure there is 50 * no more pending I/O completion work left to do. 51 */ 52 void ext4_ioend_shutdown(struct inode *inode) 53 { 54 wait_queue_head_t *wq = ext4_ioend_wq(inode); 55 56 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); 57 /* 58 * We need to make sure the work structure is finished being 59 * used before we let the inode get destroyed. 60 */ 61 if (work_pending(&EXT4_I(inode)->i_unwritten_work)) 62 cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); 63 } 64 65 void ext4_free_io_end(ext4_io_end_t *io) 66 { 67 BUG_ON(!io); 68 BUG_ON(!list_empty(&io->list)); 69 BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); 70 71 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) 72 wake_up_all(ext4_ioend_wq(io->inode)); 73 kmem_cache_free(io_end_cachep, io); 74 } 75 76 /* check a range of space and convert unwritten extents to written. */ 77 static int ext4_end_io(ext4_io_end_t *io) 78 { 79 struct inode *inode = io->inode; 80 loff_t offset = io->offset; 81 ssize_t size = io->size; 82 int ret = 0; 83 84 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," 85 "list->prev 0x%p\n", 86 io, inode->i_ino, io->list.next, io->list.prev); 87 88 ret = ext4_convert_unwritten_extents(inode, offset, size); 89 if (ret < 0) { 90 ext4_msg(inode->i_sb, KERN_EMERG, 91 "failed to convert unwritten extents to written " 92 "extents -- potential data loss! " 93 "(inode %lu, offset %llu, size %zd, error %d)", 94 inode->i_ino, offset, size, ret); 95 } 96 /* Wake up anyone waiting on unwritten extent conversion */ 97 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) 98 wake_up_all(ext4_ioend_wq(inode)); 99 if (io->flag & EXT4_IO_END_DIRECT) 100 inode_dio_done(inode); 101 if (io->iocb) 102 aio_complete(io->iocb, io->result, 0); 103 return ret; 104 } 105 106 static void dump_completed_IO(struct inode *inode) 107 { 108 #ifdef EXT4FS_DEBUG 109 struct list_head *cur, *before, *after; 110 ext4_io_end_t *io, *io0, *io1; 111 112 if (list_empty(&EXT4_I(inode)->i_completed_io_list)) { 113 ext4_debug("inode %lu completed_io list is empty\n", 114 inode->i_ino); 115 return; 116 } 117 118 ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino); 119 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) { 120 cur = &io->list; 121 before = cur->prev; 122 io0 = container_of(before, ext4_io_end_t, list); 123 after = cur->next; 124 io1 = container_of(after, ext4_io_end_t, list); 125 126 ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", 127 io, inode->i_ino, io0, io1); 128 } 129 #endif 130 } 131 132 /* Add the io_end to per-inode completed end_io list. */ 133 void ext4_add_complete_io(ext4_io_end_t *io_end) 134 { 135 struct ext4_inode_info *ei = EXT4_I(io_end->inode); 136 struct workqueue_struct *wq; 137 unsigned long flags; 138 139 BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); 140 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; 141 142 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 143 if (list_empty(&ei->i_completed_io_list)) 144 queue_work(wq, &ei->i_unwritten_work); 145 list_add_tail(&io_end->list, &ei->i_completed_io_list); 146 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 147 } 148 149 static int ext4_do_flush_completed_IO(struct inode *inode) 150 { 151 ext4_io_end_t *io; 152 struct list_head unwritten; 153 unsigned long flags; 154 struct ext4_inode_info *ei = EXT4_I(inode); 155 int err, ret = 0; 156 157 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 158 dump_completed_IO(inode); 159 list_replace_init(&ei->i_completed_io_list, &unwritten); 160 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 161 162 while (!list_empty(&unwritten)) { 163 io = list_entry(unwritten.next, ext4_io_end_t, list); 164 BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN)); 165 list_del_init(&io->list); 166 167 err = ext4_end_io(io); 168 if (unlikely(!ret && err)) 169 ret = err; 170 io->flag &= ~EXT4_IO_END_UNWRITTEN; 171 ext4_free_io_end(io); 172 } 173 return ret; 174 } 175 176 /* 177 * work on completed aio dio IO, to convert unwritten extents to extents 178 */ 179 void ext4_end_io_work(struct work_struct *work) 180 { 181 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, 182 i_unwritten_work); 183 ext4_do_flush_completed_IO(&ei->vfs_inode); 184 } 185 186 int ext4_flush_unwritten_io(struct inode *inode) 187 { 188 int ret; 189 WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) && 190 !(inode->i_state & I_FREEING)); 191 ret = ext4_do_flush_completed_IO(inode); 192 ext4_unwritten_wait(inode); 193 return ret; 194 } 195 196 ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) 197 { 198 ext4_io_end_t *io = kmem_cache_zalloc(io_end_cachep, flags); 199 if (io) { 200 atomic_inc(&EXT4_I(inode)->i_ioend_count); 201 io->inode = inode; 202 INIT_LIST_HEAD(&io->list); 203 } 204 return io; 205 } 206 207 /* 208 * Print an buffer I/O error compatible with the fs/buffer.c. This 209 * provides compatibility with dmesg scrapers that look for a specific 210 * buffer I/O error message. We really need a unified error reporting 211 * structure to userspace ala Digital Unix's uerf system, but it's 212 * probably not going to happen in my lifetime, due to LKML politics... 213 */ 214 static void buffer_io_error(struct buffer_head *bh) 215 { 216 char b[BDEVNAME_SIZE]; 217 printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", 218 bdevname(bh->b_bdev, b), 219 (unsigned long long)bh->b_blocknr); 220 } 221 222 static void ext4_end_bio(struct bio *bio, int error) 223 { 224 ext4_io_end_t *io_end = bio->bi_private; 225 struct inode *inode; 226 int i; 227 int blocksize; 228 sector_t bi_sector = bio->bi_sector; 229 230 BUG_ON(!io_end); 231 inode = io_end->inode; 232 blocksize = 1 << inode->i_blkbits; 233 bio->bi_private = NULL; 234 bio->bi_end_io = NULL; 235 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 236 error = 0; 237 for (i = 0; i < bio->bi_vcnt; i++) { 238 struct bio_vec *bvec = &bio->bi_io_vec[i]; 239 struct page *page = bvec->bv_page; 240 struct buffer_head *bh, *head; 241 unsigned bio_start = bvec->bv_offset; 242 unsigned bio_end = bio_start + bvec->bv_len; 243 unsigned under_io = 0; 244 unsigned long flags; 245 246 if (!page) 247 continue; 248 249 if (error) { 250 SetPageError(page); 251 set_bit(AS_EIO, &page->mapping->flags); 252 } 253 bh = head = page_buffers(page); 254 /* 255 * We check all buffers in the page under BH_Uptodate_Lock 256 * to avoid races with other end io clearing async_write flags 257 */ 258 local_irq_save(flags); 259 bit_spin_lock(BH_Uptodate_Lock, &head->b_state); 260 do { 261 if (bh_offset(bh) < bio_start || 262 bh_offset(bh) + blocksize > bio_end) { 263 if (buffer_async_write(bh)) 264 under_io++; 265 continue; 266 } 267 clear_buffer_async_write(bh); 268 if (error) 269 buffer_io_error(bh); 270 } while ((bh = bh->b_this_page) != head); 271 bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); 272 local_irq_restore(flags); 273 if (!under_io) 274 end_page_writeback(page); 275 } 276 bio_put(bio); 277 278 if (error) { 279 io_end->flag |= EXT4_IO_END_ERROR; 280 ext4_warning(inode->i_sb, "I/O error writing to inode %lu " 281 "(offset %llu size %ld starting block %llu)", 282 inode->i_ino, 283 (unsigned long long) io_end->offset, 284 (long) io_end->size, 285 (unsigned long long) 286 bi_sector >> (inode->i_blkbits - 9)); 287 } 288 289 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 290 ext4_free_io_end(io_end); 291 return; 292 } 293 294 ext4_add_complete_io(io_end); 295 } 296 297 void ext4_io_submit(struct ext4_io_submit *io) 298 { 299 struct bio *bio = io->io_bio; 300 301 if (bio) { 302 bio_get(io->io_bio); 303 submit_bio(io->io_op, io->io_bio); 304 BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); 305 bio_put(io->io_bio); 306 } 307 io->io_bio = NULL; 308 io->io_op = 0; 309 io->io_end = NULL; 310 } 311 312 static int io_submit_init(struct ext4_io_submit *io, 313 struct inode *inode, 314 struct writeback_control *wbc, 315 struct buffer_head *bh) 316 { 317 ext4_io_end_t *io_end; 318 struct page *page = bh->b_page; 319 int nvecs = bio_get_nr_vecs(bh->b_bdev); 320 struct bio *bio; 321 322 io_end = ext4_init_io_end(inode, GFP_NOFS); 323 if (!io_end) 324 return -ENOMEM; 325 bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); 326 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 327 bio->bi_bdev = bh->b_bdev; 328 bio->bi_private = io->io_end = io_end; 329 bio->bi_end_io = ext4_end_bio; 330 331 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh); 332 333 io->io_bio = bio; 334 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); 335 io->io_next_block = bh->b_blocknr; 336 return 0; 337 } 338 339 static int io_submit_add_bh(struct ext4_io_submit *io, 340 struct inode *inode, 341 struct writeback_control *wbc, 342 struct buffer_head *bh) 343 { 344 ext4_io_end_t *io_end; 345 int ret; 346 347 if (io->io_bio && bh->b_blocknr != io->io_next_block) { 348 submit_and_retry: 349 ext4_io_submit(io); 350 } 351 if (io->io_bio == NULL) { 352 ret = io_submit_init(io, inode, wbc, bh); 353 if (ret) 354 return ret; 355 } 356 io_end = io->io_end; 357 if (test_clear_buffer_uninit(bh)) 358 ext4_set_io_unwritten_flag(inode, io_end); 359 io->io_end->size += bh->b_size; 360 io->io_next_block++; 361 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); 362 if (ret != bh->b_size) 363 goto submit_and_retry; 364 return 0; 365 } 366 367 int ext4_bio_write_page(struct ext4_io_submit *io, 368 struct page *page, 369 int len, 370 struct writeback_control *wbc) 371 { 372 struct inode *inode = page->mapping->host; 373 unsigned block_start, blocksize; 374 struct buffer_head *bh, *head; 375 int ret = 0; 376 int nr_submitted = 0; 377 378 blocksize = 1 << inode->i_blkbits; 379 380 BUG_ON(!PageLocked(page)); 381 BUG_ON(PageWriteback(page)); 382 383 set_page_writeback(page); 384 ClearPageError(page); 385 386 /* 387 * In the first loop we prepare and mark buffers to submit. We have to 388 * mark all buffers in the page before submitting so that 389 * end_page_writeback() cannot be called from ext4_bio_end_io() when IO 390 * on the first buffer finishes and we are still working on submitting 391 * the second buffer. 392 */ 393 bh = head = page_buffers(page); 394 do { 395 block_start = bh_offset(bh); 396 if (block_start >= len) { 397 /* 398 * Comments copied from block_write_full_page_endio: 399 * 400 * The page straddles i_size. It must be zeroed out on 401 * each and every writepage invocation because it may 402 * be mmapped. "A file is mapped in multiples of the 403 * page size. For a file that is not a multiple of 404 * the page size, the remaining memory is zeroed when 405 * mapped, and writes to that region are not written 406 * out to the file." 407 */ 408 zero_user_segment(page, block_start, 409 block_start + blocksize); 410 clear_buffer_dirty(bh); 411 set_buffer_uptodate(bh); 412 continue; 413 } 414 if (!buffer_dirty(bh) || buffer_delay(bh) || 415 !buffer_mapped(bh) || buffer_unwritten(bh)) { 416 /* A hole? We can safely clear the dirty bit */ 417 if (!buffer_mapped(bh)) 418 clear_buffer_dirty(bh); 419 if (io->io_bio) 420 ext4_io_submit(io); 421 continue; 422 } 423 if (buffer_new(bh)) { 424 clear_buffer_new(bh); 425 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); 426 } 427 set_buffer_async_write(bh); 428 } while ((bh = bh->b_this_page) != head); 429 430 /* Now submit buffers to write */ 431 bh = head = page_buffers(page); 432 do { 433 if (!buffer_async_write(bh)) 434 continue; 435 ret = io_submit_add_bh(io, inode, wbc, bh); 436 if (ret) { 437 /* 438 * We only get here on ENOMEM. Not much else 439 * we can do but mark the page as dirty, and 440 * better luck next time. 441 */ 442 redirty_page_for_writepage(wbc, page); 443 break; 444 } 445 nr_submitted++; 446 clear_buffer_dirty(bh); 447 } while ((bh = bh->b_this_page) != head); 448 449 /* Error stopped previous loop? Clean up buffers... */ 450 if (ret) { 451 do { 452 clear_buffer_async_write(bh); 453 bh = bh->b_this_page; 454 } while (bh != head); 455 } 456 unlock_page(page); 457 /* Nothing submitted - we have to end page writeback */ 458 if (!nr_submitted) 459 end_page_writeback(page); 460 return ret; 461 } 462